Example #1
from mmdata import MOSI  # CMU Multimodal Data SDK (import path assumed)


# Class header restored: this __init__ is the same one that appears in
# UnimodalData in Example #6.
class UnimodalData():
    def __init__(self, dataset=None):
        if dataset is None:
            self.dataset = MOSI()
        else:
            self.dataset = dataset
        self.train_ids = self.dataset.train()
        self.valid_ids = self.dataset.valid()
        self.test_ids = self.dataset.test()
        self.sentiments = self.dataset.sentiments()

Example #2
import os
import sys

import numpy as np

from mmdata import MOSI, Dataset  # import path assumed

seed = 0  # value assumed; the original fragment uses `seed` without defining it
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = '0'

import tensorflow as tf

tf.set_random_seed(seed)

from joblib import Parallel, delayed
import multiprocessing

num_cores = multiprocessing.cpu_count()

mode = sys.argv[1]  # which modality combination to use, e.g. "all", "AV", "AT"
task = sys.argv[2]
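
# The joblib imports above are used by code beyond this fragment; for
# reference, the usual joblib pattern is a parallel map over work items
# (process_segment and segment_ids are hypothetical names here):
#
#     results = Parallel(n_jobs=num_cores)(
#         delayed(process_segment)(sid) for sid in segment_ids)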

# Download the data if not present
mosei = MOSI()
embeddings = mosei.embeddings()
if mode == "all" or mode == "AV" or mode == "VT" or mode == "V":
    facet = mosei.facet()
if mode == "all" or mode == "AT" or mode == "AV" or mode == "A":
    covarep = mosei.covarep()
sentiments = mosei.sentiments()
emotions = mosei.emotions()
train_ids = mosei.train()
valid_ids = mosei.valid()
test_ids = mosei.test()

# Merge different features and do word level feature alignment (align according to timestamps of embeddings)
if mode == "all" or mode == "AV":
    bimodal = Dataset.merge(embeddings, facet)
    trimodal = Dataset.merge(bimodal, covarep)

Example #3
from keras.optimizers import Adamax
from keras.callbacks import EarlyStopping
from mmdata import MOSI  # import path assumed

# meta parameters
maxlen = 15  # each utterance will be truncated/padded to 15 words
batch_size = 128
nb_epoch = 1000 # number of total epochs to train the model
# if the validation loss isn't decreasing for a number of epochs, stop training to prevent over-fitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

opt_func = Adamax(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-08) # optimization function
loss_func = 'mae' # loss function
metr = 'mae' # evaluation metric
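
# For reference, a minimal sketch of how these settings are wired together
# (the toy architecture and the 300-d embedding size are assumptions; the
# data arrays are prepared below):
#
#     model = Sequential()
#     model.add(LSTM(64, input_shape=(maxlen, 300)))
#     model.add(Dense(1))
#     model.compile(optimizer=opt_func, loss=loss_func, metrics=[metr])
#     model.fit(x_train, y_train,
#               batch_size=batch_size, epochs=nb_epoch,
#               validation_data=(x_valid, y_valid),
#               callbacks=[early_stopping])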

# Download the data if not present
mosi = MOSI()
embeddings = mosi.embeddings() # features
sentiments = mosi.sentiments() # Valence labels
train_ids = mosi.train()
valid_ids = mosi.valid()
test_ids = mosi.test()

# Some data preprocessing
x_train = []
y_train = []
x_valid = []
y_valid = []
x_test = []
y_test = []

print("Preparing train and test data...")
Example #4
import numpy as np

from mmdata import MOSI, Dataset  # import path assumed


def pad(data, max_len):
    # Function header restored; Example #7 calls this helper as `pad`.
    # Zero-pads (at the front) or truncates (keeping the last timesteps)
    # a segment's feature sequence to exactly max_len timesteps.
    data = np.array([feature[2] for feature in data])
    n_rows = data.shape[0]
    dim = data.shape[1]
    if max_len >= n_rows:
        diff = max_len - n_rows
        padding = np.zeros((diff, dim))
        padded = np.concatenate((padding, data))
        return padded
    else:
        return data[-max_len:]
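
# Quick sanity check of both branches (toy data; each element mimics the
# SDK's (start_time, end_time, feature_vector) triples):
#
#     seg = [(0.0, 0.5, np.array([1., 2.])),
#            (0.5, 1.0, np.array([3., 4.])),
#            (1.0, 1.5, np.array([5., 6.]))]
#     pad(seg, 5).shape   # (5, 2): zero rows are prepended
#     pad(seg, 2).shape   # (2, 2): only the last 2 timesteps survive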


if __name__ == "__main__":
    # Download the data if not present
    max_len = 20
    mosi = MOSI()
    embeddings = mosi.embeddings()
    facet = mosi.facet()
    covarep = mosi.covarep()
    # sentiment labels are real-valued; for this tutorial we'll binarize them
    sentiments = mosi.sentiments()
    train_ids = mosi.train()
    valid_ids = mosi.valid()
    test_ids = mosi.test()

    # Merge different features and do word level feature alignment (align according to timestamps of embeddings)
    bimodal = Dataset.merge(embeddings, facet)
    trimodal = Dataset.merge(bimodal, covarep)
    dataset = trimodal.align('embeddings')

    # sort through all the video ID, segment ID pairs
Example #5
from keras.optimizers import Adamax
from mmdata import MOSI, Dataset  # import path assumed

opt_func = Adamax(lr=0.0005, beta_1=0.9, beta_2=0.999,
                  epsilon=1e-08)  # optimization function
# for real-valued Valence regression (main task)
loss_func_main = 'mae'  # loss function
metr_main = 'mae'  # evaluation metric
weight_main = 1.0  # weight for multitask learning
# for Valence polarity classification
loss_func_aux1 = 'binary_crossentropy'  # loss function
metr_aux1 = 'binary_accuracy'  # evaluation metric
weight_aux1 = 0.5  # weight for multitask learning
# for Valence intensity classification
loss_func_aux2 = 'categorical_crossentropy'  # loss function
metr_aux2 = 'accuracy'  # evaluation metric
weight_aux2 = 0.5  # weight for multitask learning
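
# These per-output settings plug into a multi-output Keras compile; a
# minimal sketch, assuming a functional-API model whose three output
# layers are named 'main', 'polarity' and 'intensity' (names hypothetical):
#
#     model.compile(optimizer=opt_func,
#                   loss={'main': loss_func_main,
#                         'polarity': loss_func_aux1,
#                         'intensity': loss_func_aux2},
#                   loss_weights={'main': weight_main,
#                                 'polarity': weight_aux1,
#                                 'intensity': weight_aux2},
#                   metrics={'main': metr_main,
#                            'polarity': metr_aux1,
#                            'intensity': metr_aux2})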

# Download the data if not present
mosi = MOSI()
covarep = mosi.covarep()  # features
facet = mosi.facet()  # features
embeddings = mosi.embeddings()  # features
sentiments = mosi.sentiments()  # Valence labels
train_ids = mosi.train()
valid_ids = mosi.valid()
test_ids = mosi.test()

bimodal = Dataset.merge(embeddings, covarep)
dataset = bimodal.align('embeddings')

# Some data preprocessing
print("Preparing train and test data...")
# sort through all the video ID, segment ID pairs
train_set_ids = []
Example #6
import numpy as np

from mmdata import MOSI  # import path assumed


class UnimodalData():
    def __init__(self, dataset=None):
        if dataset is None:
            self.dataset = MOSI()
        else:
            self.dataset = dataset
        self.train_ids = self.dataset.train()
        self.valid_ids = self.dataset.valid()
        self.test_ids = self.dataset.test()
        self.sentiments = self.dataset.sentiments()

    def get_data(self, data, max_len):

        x_train = []
        y_train = []
        x_test = []
        y_test = []
        x_val = []
        y_val = []
        # note: even a Dataset with a single feature requires explicit
        # indexing of features
        for vid, vdata in data.items():
            for sid, sdata in vdata.items():
                if sdata == []:
                    continue
                example = []
                for i, time_step in enumerate(sdata):
                    if i == max_len:  # truncate to max_len timesteps
                        break
                    # the first two entries of each timestep are timestamps
                    # and are not used
                    example.append(time_step[2])

                # zero-pad each example up to max_len
                for i in range(max_len - len(sdata)):
                    example.append(np.zeros(sdata[0][2].shape))
                example = np.asarray(example)
                label = 1 if self.sentiments[vid][sid] >= 0 else 0  # binarize the labels
                # here we just use everything except training set as the test set
                if vid in self.train_ids:
                    x_train.append(example)
                    y_train.append(label)
                elif vid in self.valid_ids:
                    x_val.append(example)
                    y_val.append(label)
                elif vid in self.test_ids:
                    x_test.append(example)
                    y_test.append(label)
        # Prepare the final inputs as numpy arrays
        x_train = np.asarray(x_train)
        x_val = np.asarray(x_val)
        x_test = np.asarray(x_test)
        y_train = np.asarray(y_train)
        y_val = np.asarray(y_val)
        y_test = np.asarray(y_test)

        return x_train, x_val, x_test, y_train, y_val, y_test

    def get_text(self, max_len=20):

        embeddings = self.dataset.embeddings()
        return self.get_data(embeddings["embeddings"], max_len)

    def get_words(self):

        words = self.dataset.words()
        x_train = []
        y_train = []
        x_test = []
        y_test = []
        x_val = []
        y_val = []
        # note: even a Dataset with a single feature requires explicit
        # indexing of features
        for vid, vdata in words["words"].items():
            for sid, sdata in vdata.items():
                if sdata == []:
                    continue
                example = []
                for i, time_step in enumerate(sdata):
                    example.append(time_step[2])
                example = np.asarray(example)
                label = 1 if self.sentiments[vid][sid] >= 0 else 0  # binarize the labels
                # here we just use everything except training set as the test set
                if vid in self.train_ids:
                    x_train.append(example)
                    y_train.append(label)
                elif vid in self.valid_ids:
                    x_val.append(example)
                    y_val.append(label)
                elif vid in self.test_ids:
                    x_test.append(example)
                    y_test.append(label)
        # Prepare the final inputs as numpy arrays
        x_train = np.asarray(x_train)
        x_val = np.asarray(x_val)
        x_test = np.asarray(x_test)
        y_train = np.asarray(y_train)
        y_val = np.asarray(y_val)
        y_test = np.asarray(y_test)

        return x_train, x_val, x_test, y_train, y_val, y_test

    def get_audio(self, max_len=20):

        covarep = self.dataset.covarep()
        train_set_audio, valid_set_audio, test_set_audio, y_train, y_val, y_test = self.get_data(
            covarep["covarep"], max_len)

        audio_max = np.max(np.max(np.abs(train_set_audio), axis=0), axis=0)
        audio_max[audio_max == 0] = 1
        train_set_audio = train_set_audio / audio_max
        valid_set_audio = valid_set_audio / audio_max
        test_set_audio = test_set_audio / audio_max

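        # x != x is true only for NaN, so these lines zero out any NaNs
        # left in the normalized audio features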
        train_set_audio[train_set_audio != train_set_audio] = 0
        valid_set_audio[valid_set_audio != valid_set_audio] = 0
        test_set_audio[test_set_audio != test_set_audio] = 0

        return train_set_audio, valid_set_audio, test_set_audio, y_train, y_val, y_test

    def get_video(self, max_len=20):

        facet = self.dataset.facet()
        train_set_visual, valid_set_visual, test_set_visual, y_train, y_val, y_test = self.get_data(
            facet["facet"], max_len)
        visual_max = np.max(np.max(np.abs(train_set_visual), axis=0), axis=0)
        # if the maximum is 0 we don't normalize this dimension
        visual_max[visual_max == 0] = 1
        train_set_visual = train_set_visual / visual_max
        valid_set_visual = valid_set_visual / visual_max
        test_set_visual = test_set_visual / visual_max
        train_set_visual[train_set_visual != train_set_visual] = 0
        valid_set_visual[valid_set_visual != valid_set_visual] = 0
        test_set_visual[test_set_visual != test_set_visual] = 0

        return train_set_visual, valid_set_visual, test_set_visual, y_train, y_val, y_test
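
# Hypothetical driver for the class above:
#
#     loader = UnimodalData()
#     x_train, x_val, x_test, y_train, y_val, y_test = loader.get_text(max_len=20)
#     x_train.shape   # (num_train_segments, 20, embedding_dim)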
Example #7
import numpy as np

from mmdata import MOSI  # import path assumed


def get_data(max_len_audio=20, max_len_text=15, max_len_visual=20):
    # relies on the pad() helper defined in Example #4
    mosi = MOSI()
    embeddings = mosi.embeddings()
    facet = mosi.facet()
    covarep = mosi.covarep()
    # sentiment labels are real-valued; for this tutorial we'll binarize them
    sentiments = mosi.sentiments()
    train_ids = mosi.train()
    valid_ids = mosi.valid()
    test_ids = mosi.test()
    # sort through all the video ID, segment ID pairs
    train_set_ids = []
    for vid in train_ids:
        for sid in embeddings['embeddings'][vid].keys():
            if (embeddings['embeddings'][vid][sid]
                    and facet['facet'][vid][sid]
                    and covarep['covarep'][vid][sid]):
                train_set_ids.append((vid, sid))

    valid_set_ids = []
    for vid in valid_ids:
        for sid in embeddings['embeddings'][vid].keys():
            if (embeddings['embeddings'][vid][sid]
                    and facet['facet'][vid][sid]
                    and covarep['covarep'][vid][sid]):
                valid_set_ids.append((vid, sid))

    test_set_ids = []
    for vid in test_ids:
        for sid in embeddings['embeddings'][vid].keys():
            if (embeddings['embeddings'][vid][sid]
                    and facet['facet'][vid][sid]
                    and covarep['covarep'][vid][sid]):
                test_set_ids.append((vid, sid))

    # partition the train/valid/test sequences; each will be padded or
    # truncated to max_len steps, giving data of shape
    # (dataset_size, max_len, feature_dim)
    max_len = max_len_audio
    train_set_audio = np.stack(
        [pad(covarep['covarep'][vid][sid], max_len)
         for (vid, sid) in train_set_ids if covarep['covarep'][vid][sid]],
        axis=0)
    valid_set_audio = np.stack(
        [pad(covarep['covarep'][vid][sid], max_len)
         for (vid, sid) in valid_set_ids if covarep['covarep'][vid][sid]],
        axis=0)
    test_set_audio = np.stack(
        [pad(covarep['covarep'][vid][sid], max_len)
         for (vid, sid) in test_set_ids if covarep['covarep'][vid][sid]],
        axis=0)

    max_len = max_len_visual
    train_set_visual = np.stack(
        [pad(facet['facet'][vid][sid], max_len)
         for (vid, sid) in train_set_ids],
        axis=0)
    valid_set_visual = np.stack(
        [pad(facet['facet'][vid][sid], max_len)
         for (vid, sid) in valid_set_ids],
        axis=0)
    test_set_visual = np.stack(
        [pad(facet['facet'][vid][sid], max_len)
         for (vid, sid) in test_set_ids],
        axis=0)

    max_len = max_len_text
    train_set_text = np.stack(
        [pad(embeddings['embeddings'][vid][sid], max_len)
         for (vid, sid) in train_set_ids],
        axis=0)
    valid_set_text = np.stack(
        [pad(embeddings['embeddings'][vid][sid], max_len)
         for (vid, sid) in valid_set_ids],
        axis=0)
    test_set_text = np.stack(
        [pad(embeddings['embeddings'][vid][sid], max_len)
         for (vid, sid) in test_set_ids],
        axis=0)
    # binarize the sentiment scores for binary classification task
    y_train = np.array([sentiments[vid][sid]
                        for (vid, sid) in train_set_ids]) > 0
    y_valid = np.array([sentiments[vid][sid]
                        for (vid, sid) in valid_set_ids]) > 0
    y_test = np.array([sentiments[vid][sid]
                       for (vid, sid) in test_set_ids]) > 0

    # train_set_audio = train_set_audio[:,:,1:35]
    # valid_set_audio = valid_set_audio[:,:,1:35]
    # test_set_audio = test_set_audio[:,:,1:35]

    visual_max = np.max(np.max(np.abs(train_set_visual), axis=0), axis=0)
    # if the maximum is 0 we don't normalize this dimension
    visual_max[visual_max == 0] = 1
    train_set_visual = train_set_visual / visual_max
    valid_set_visual = valid_set_visual / visual_max
    test_set_visual = test_set_visual / visual_max

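    # x != x is true only for NaN; zero out any NaNs left in the normalized
    # visual features (the same trick is applied to audio below)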
    train_set_visual[train_set_visual != train_set_visual] = 0
    valid_set_visual[valid_set_visual != valid_set_visual] = 0
    test_set_visual[test_set_visual != test_set_visual] = 0

    audio_max = np.max(np.max(np.abs(train_set_audio), axis=0), axis=0)
    audio_max[audio_max == 0] = 1
    train_set_audio = train_set_audio / audio_max
    valid_set_audio = valid_set_audio / audio_max
    test_set_audio = test_set_audio / audio_max

    train_set_audio[train_set_audio != train_set_audio] = 0
    valid_set_audio[valid_set_audio != valid_set_audio] = 0
    test_set_audio[test_set_audio != test_set_audio] = 0

    return train_set_audio, valid_set_audio, test_set_audio, train_set_text, valid_set_text, test_set_text, train_set_visual, valid_set_visual, test_set_visual, y_train, y_valid, y_test
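
# Hypothetical call site for get_data(), which returns twelve arrays:
#
#     (train_a, valid_a, test_a,
#      train_t, valid_t, test_t,
#      train_v, valid_v, test_v,
#      y_train, y_valid, y_test) = get_data()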
Example #8
import random

import numpy as np

seed = 0  # value assumed; the original fragment uses `seed` without defining it

import tensorflow as tf
tf.set_random_seed(seed)
# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.
np.random.seed(seed)
# The below is necessary for starting core Python generated random numbers
# in a well-defined state.
random.seed(seed)

from mmdata import MOSI  # import path assumed
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Conv1D, MaxPooling1D, Conv2D, Flatten
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard

print("Preparing train and test data...")
# Download the data if not present
mosei = MOSI()
embeddings = mosei.embeddings()
sentiments = mosei.sentiments()
train_ids = mosei.train()
valid_ids = mosei.valid()
#test_ids = mosei.test()
max_len = 15
x_train = []
y_train = []
x_val = []
y_val = []
# note: even a Dataset with a single feature requires explicit indexing of features
for vid, vdata in embeddings['embeddings'].items():
    for sid, sdata in vdata.items():
        if sdata == []:
            continue