def __init__(self, dataset=None):
    """Bind a dataset and cache its split ids and sentiment labels.

    Args:
        dataset: Object providing ``train()``, ``valid()``, ``test()`` and
            ``sentiments()``. When omitted (``None``), a new ``MOSI()``
            instance is created.
    """
    # Identity check: `== None` would call a custom `__eq__` on the dataset
    # object and could wrongly discard a dataset the caller supplied.
    if dataset is None:
        dataset = MOSI()
    self.dataset = dataset
    self.train_ids = self.dataset.train()
    self.valid_ids = self.dataset.valid()
    self.test_ids = self.dataset.test()
    self.sentiments = self.dataset.sentiments()
# Seed NumPy, the Python hash seed and TensorFlow before anything else runs.
# `seed`, `np` and `sys` are expected to be defined earlier in the file.
np.random.seed(seed)
import os
os.environ['PYTHONHASHSEED'] = '0'
import tensorflow as tf
tf.set_random_seed(seed)

from joblib import Parallel, delayed
import multiprocessing
num_cores = multiprocessing.cpu_count()

# Command line: argv[1] selects the modality combination, argv[2] is kept as `task`.
mode = sys.argv[1]
task = sys.argv[2]

# Download the data if not present
mosei = MOSI()
embeddings = mosei.embeddings()
# Visual (facet) features are only loaded for modes that include them.
if mode in ("all", "AV", "VT", "V"):
    facet = mosei.facet()
# Acoustic (covarep) features are only loaded for modes that include them.
if mode in ("all", "AT", "AV", "A"):
    covarep = mosei.covarep()
sentiments = mosei.sentiments()
emotions = mosei.emotions()
train_ids = mosei.train()
valid_ids = mosei.valid()
test_ids = mosei.test()

# Merge different features and do word level feature alignment
# (align according to timestamps of embeddings)
if mode in ("all", "AV"):
    bimodal = Dataset.merge(embeddings, facet)
    trimodal = Dataset.merge(bimodal, covarep)
# NOTE: whitespace-collapsed fragment. The leading `return` is the last
# statement of a function whose `def` line lies above this view; it returns
# K.sum(K.mean(fsp*fst, axis=0) / (devP*devT)).
# The statements after it are script-level setup: sequence length, batch size
# and epoch count, an EarlyStopping callback on 'val_loss' with patience 5, an
# Adamax optimizer, 'mae' as both loss and metric, then loading the MOSI
# embeddings, sentiment labels and train/valid/test ids, and creating empty
# x_/y_ lists for the three splits.
return K.sum(K.mean(fsp*fst,axis=0)/(devP*devT)) # meta parameters maxlen = 15 # Each utterance will be truncated/padded to 15 words batch_size = 128 nb_epoch = 1000 # number of total epochs to train the model # if the validation loss isn't decreasing for a number of epochs, stop training to prevent over-fitting early_stopping = EarlyStopping(monitor='val_loss', patience=5) opt_func = Adamax(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-08) # optimization function loss_func = 'mae' # loss function metr = 'mae' # evaluation metric # Download the data if not present mosi = MOSI() embeddings = mosi.embeddings() # features sentiments = mosi.sentiments() # Valence labels train_ids = mosi.train() valid_ids = mosi.valid() test_ids = mosi.test() # Some data preprocessing x_train = [] y_train = [] x_valid = [] y_valid = [] x_test = [] y_test = [] print("Preparing train and test data...")
# NOTE: whitespace-collapsed fragment. The statements up to the second
# `return` are the body of a helper whose `def` line lies above this view
# (it reads `data` and `max_len`): it keeps element [2] of every item in
# `data`, prepends zero rows when there are at most `max_len` rows, and
# otherwise keeps only the last `max_len` rows.
# NOTE(review): with max_len == 0 and non-empty data the else branch evaluates
# data[-0:], which is the whole array rather than an empty one - confirm that
# max_len is always positive.
# The `if __name__ == "__main__":` part loads the MOSI embeddings, facet and
# covarep features plus labels and split ids, merges the three feature sets
# and aligns them on 'embeddings'. The script continues past this view.
data = np.array([feature[2] for feature in data]) n_rows = data.shape[0] dim = data.shape[1] if max_len >= n_rows: diff = max_len - n_rows padding = np.zeros((diff, dim)) padded = np.concatenate((padding, data)) return padded else: return data[-max_len:] if __name__ == "__main__": # Download the data if not present max_len = 20 mosi = MOSI() embeddings = mosi.embeddings() facet = mosi.facet() covarep = mosi.covarep() sentiments = mosi.sentiments( ) # sentiment labels, real-valued. for this tutorial we'll binarize them train_ids = mosi.train() valid_ids = mosi.valid() test_ids = mosi.test() # Merge different features and do word level feature alignment (align according to timestamps of embeddings) bimodal = Dataset.merge(embeddings, facet) trimodal = Dataset.merge(bimodal, covarep) dataset = trimodal.align('embeddings') # sort through all the video ID, segment ID pairs
# Optimizer shared by all outputs.
opt_func = Adamax(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

# Main output: loss, evaluation metric and multitask loss weight.
loss_func_main = 'mae'
metr_main = 'mae'
weight_main = 1.0

# Auxiliary output 1 - Valence polarity classification.
loss_func_aux1 = 'binary_crossentropy'
metr_aux1 = 'binary_accuracy'
weight_aux1 = 0.5

# Auxiliary output 2 - Valence intensity classification.
loss_func_aux2 = 'categorical_crossentropy'
metr_aux2 = 'accuracy'
weight_aux2 = 0.5

# Download the data if not present
mosi = MOSI()
covarep = mosi.covarep()        # features
facet = mosi.facet()            # features
embeddings = mosi.embeddings()  # features
sentiments = mosi.sentiments()  # Valence labels
train_ids = mosi.train()
valid_ids = mosi.valid()
test_ids = mosi.test()

# Only embeddings and covarep are merged here; the merged set is aligned
# on the 'embeddings' feature.
bimodal = Dataset.merge(embeddings, covarep)
dataset = bimodal.align('embeddings')

# Some data preprocessing
print("Preparing train and test data...")

# sort through all the video ID, segment ID pairs
train_set_ids = []
class UnimodalData():
    """Build per-modality train/validation/test arrays from a dataset object.

    The wrapped dataset must provide ``train()``, ``valid()``, ``test()`` and
    ``sentiments()``, plus whichever of ``embeddings()``, ``words()``,
    ``covarep()`` and ``facet()`` the called getter needs.  Sentiment scores
    are binarized: ``1`` when the score is ``>= 0``, otherwise ``0``.
    """

    def __init__(self, dataset=None):
        """Bind a dataset and cache its split ids and sentiment labels.

        Args:
            dataset: Dataset object to wrap. When omitted (``None``), a new
                ``MOSI()`` instance is created.
        """
        # Identity check: `== None` would call a custom `__eq__` on the
        # dataset object and could wrongly discard a caller-supplied dataset.
        if dataset is None:
            dataset = MOSI()
        self.dataset = dataset
        self.train_ids = self.dataset.train()
        self.valid_ids = self.dataset.valid()
        self.test_ids = self.dataset.test()
        self.sentiments = self.dataset.sentiments()

    @staticmethod
    def _as_array(items):
        """Convert a list of examples to an array, tolerating ragged input."""
        try:
            return np.asarray(items)
        except ValueError:
            # Recent NumPy refuses to build ragged arrays implicitly; fall
            # back to the 1-D object array that older versions produced.
            ragged = np.empty(len(items), dtype=object)
            for index, item in enumerate(items):
                ragged[index] = item
            return ragged

    def _partition(self, data, make_example):
        """Turn every non-empty segment into an example and bucket it by split."""
        x_train, y_train = [], []
        x_val, y_val = [], []
        x_test, y_test = [], []

        for vid, vdata in data.items():
            for sid, sdata in vdata.items():
                # Skip empty segments; a length test also works when the
                # segment is not a plain list.
                if len(sdata) == 0:
                    continue
                example = make_example(sdata)
                # binarize the labels
                label = 1 if self.sentiments[vid][sid] >= 0 else 0

                # Videos that belong to none of the three splits are dropped.
                if vid in self.train_ids:
                    x_train.append(example)
                    y_train.append(label)
                elif vid in self.valid_ids:
                    x_val.append(example)
                    y_val.append(label)
                elif vid in self.test_ids:
                    x_test.append(example)
                    y_test.append(label)

        # Prepare the final inputs as numpy arrays
        return (self._as_array(x_train), self._as_array(x_val),
                self._as_array(x_test), np.asarray(y_train),
                np.asarray(y_val), np.asarray(y_test))

    @staticmethod
    def _scale_by_train_max(train, valid, test):
        """Divide all splits by the per-dimension max |value| of the train split."""
        scale = np.max(np.max(np.abs(train), axis=0), axis=0)
        # if the maximum is 0 we don't normalize this dimension
        scale[scale == 0] = 1
        scaled = []
        for split in (train, valid, test):
            split = split / scale
            split[np.isnan(split)] = 0  # replace NaN entries with 0
            scaled.append(split)
        return tuple(scaled)

    def get_data(self, data, max_len):
        """Build fixed-length examples from ``data[vid][sid]`` time-step lists.

        Each time step is indexed as ``step[2]`` for its feature vector (the
        first two entries are timestamps and are not used).  Segments longer
        than ``max_len`` are truncated; shorter ones are padded at the end
        with zero vectors shaped like the segment's first feature vector.

        Returns:
            ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
        """
        def make_example(sdata):
            steps = []
            for i, time_step in enumerate(sdata):
                if i == max_len:
                    break
                steps.append(time_step[2])
            missing = max_len - len(sdata)
            if missing > 0:
                # np.asarray copies, so one shared zero vector is enough.
                filler = np.zeros(sdata[0][2].shape)
                steps.extend([filler] * missing)
            return np.asarray(steps)

        return self._partition(data, make_example)

    def get_text(self, max_len=20):
        """Return the padded splits of the ``"embeddings"`` feature."""
        embeddings = self.dataset.embeddings()
        return self.get_data(embeddings["embeddings"], max_len)

    def get_words(self):
        """Return the splits of the ``"words"`` feature without padding.

        Examples keep their own length, so a split whose segments differ in
        length is returned as a 1-D object array of per-segment arrays.
        """
        words = self.dataset.words()

        def make_example(sdata):
            return np.asarray([time_step[2] for time_step in sdata])

        return self._partition(words["words"], make_example)

    def get_audio(self, max_len=20):
        """Return the padded, train-max-normalized ``"covarep"`` splits and labels."""
        covarep = self.dataset.covarep()
        train, valid, test, y_train, y_val, y_test = self.get_data(
            covarep["covarep"], max_len)
        train, valid, test = self._scale_by_train_max(train, valid, test)
        return train, valid, test, y_train, y_val, y_test

    def get_video(self, max_len=20):
        """Return the padded, train-max-normalized ``"facet"`` splits.

        NOTE(review): unlike ``get_audio`` this returns only the three feature
        arrays, without labels. The source may have been truncated here -
        confirm against callers before changing the return value.
        """
        facet = self.dataset.facet()
        train, valid, test, _, _, _ = self.get_data(facet["facet"], max_len)
        train, valid, test = self._scale_by_train_max(train, valid, test)
        return train, valid, test
def get_data(max_len_audio=20, max_len_text=15, max_len_visual=20):
    """Load MOSI and return padded, normalized audio/text/visual splits.

    Only (video, segment) pairs whose embeddings, facet and covarep entries
    are all non-empty are used.  Every segment is run through ``pad`` with
    the modality's length, the audio and visual splits are divided by the
    per-dimension maximum absolute value of their training split (NaN
    entries become 0), and the sentiment scores are binarized with ``> 0``.

    Returns:
        A 12-tuple: train/valid/test audio, train/valid/test text,
        train/valid/test visual, then ``y_train, y_valid, y_test``.
    """
    mosi = MOSI()
    text_feats = mosi.embeddings()['embeddings']
    visual_feats = mosi.facet()['facet']
    audio_feats = mosi.covarep()['covarep']
    # sentiment labels, real-valued; binarized below
    sentiments = mosi.sentiments()
    train_ids = mosi.train()
    valid_ids = mosi.valid()
    test_ids = mosi.test()

    def usable_segments(video_ids):
        # Keep the (vid, sid) pairs that have data in all three modalities.
        return [
            (vid, sid)
            for vid in video_ids
            for sid in text_feats[vid].keys()
            if text_feats[vid][sid] and visual_feats[vid][sid]
            and audio_feats[vid][sid]
        ]

    def stack_split(features, segment_ids, length):
        # Result has shape (dataset_size, length, feature_dim).
        return np.stack(
            [pad(features[vid][sid], length) for (vid, sid) in segment_ids],
            axis=0)

    def stack_all(features, length):
        return tuple(
            stack_split(features, ids, length)
            for ids in (train_set_ids, valid_set_ids, test_set_ids))

    def binarize(segment_ids):
        scores = np.array([sentiments[vid][sid] for (vid, sid) in segment_ids])
        return scores > 0

    def normalize(train, valid, test):
        peak = np.max(np.max(np.abs(train), axis=0), axis=0)
        # if the maximum is 0 we don't normalize this dimension
        peak[peak == 0] = 1
        result = []
        for split in (train, valid, test):
            scaled = split / peak
            scaled[scaled != scaled] = 0  # NaN is the only value != itself
            result.append(scaled)
        return tuple(result)

    # sort through all the video ID, segment ID pairs
    train_set_ids = usable_segments(train_ids)
    valid_set_ids = usable_segments(valid_ids)
    test_set_ids = usable_segments(test_ids)

    # Partition into train, valid and test sets. The segment ids were already
    # filtered on non-empty covarep data, so no extra filter is needed here.
    train_set_audio, valid_set_audio, test_set_audio = stack_all(
        audio_feats, max_len_audio)
    train_set_visual, valid_set_visual, test_set_visual = stack_all(
        visual_feats, max_len_visual)
    train_set_text, valid_set_text, test_set_text = stack_all(
        text_feats, max_len_text)

    # binarize the sentiment scores for binary classification task
    y_train = binarize(train_set_ids)
    y_valid = binarize(valid_set_ids)
    y_test = binarize(test_set_ids)

    train_set_visual, valid_set_visual, test_set_visual = normalize(
        train_set_visual, valid_set_visual, test_set_visual)
    train_set_audio, valid_set_audio, test_set_audio = normalize(
        train_set_audio, valid_set_audio, test_set_audio)

    return (train_set_audio, valid_set_audio, test_set_audio,
            train_set_text, valid_set_text, test_set_text,
            train_set_visual, valid_set_visual, test_set_visual,
            y_train, y_valid, y_test)
# NOTE: whitespace-collapsed script fragment. It sets the TensorFlow random
# seed from `seed` (defined above this view), imports the Keras model,
# optimizer, layer and callback classes, loads the MOSI embeddings, sentiment
# labels and train/valid ids (the test ids are commented out), sets
# max_len = 15, creates empty train/validation lists and starts iterating
# over embeddings['embeddings'], skipping empty segments. The loop body
# continues past the end of this view.
import tensorflow as tf tf.set_random_seed(seed) # The below is necessary for starting Numpy generated random numbers # in a well-defined initial state. # The below is necessary for starting core Python generated random numbers # in a well-defined state. from keras.models import Sequential from keras.optimizers import Adam from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Conv1D, MaxPooling1D, Conv2D, Flatten from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard print("Preparing train and test data...") # Download the data if not present mosei = MOSI() embeddings = mosei.embeddings() sentiments = mosei.sentiments() train_ids = mosei.train() valid_ids = mosei.valid() #test_ids = mosei.test() max_len = 15 x_train = [] y_train = [] x_val = [] y_val = [] for vid, vdata in embeddings['embeddings'].items( ): # note that even Dataset with one feature will require explicit indexing of features for sid, sdata in vdata.items(): if sdata == []: continue