os.environ['PYTHONHASHSEED'] = '0' import tensorflow as tf tf.set_random_seed(seed) from joblib import Parallel, delayed import multiprocessing num_cores = multiprocessing.cpu_count() mode = sys.argv[1] task = sys.argv[2] # Download the data if not present mosei = MOSI() embeddings = mosei.embeddings() if mode == "all" or mode == "AV" or mode == "VT" or mode == "V": facet = mosei.facet() if mode == "all" or mode == "AT" or mode == "AV" or mode == "A": covarep = mosei.covarep() sentiments = mosei.sentiments() emotions = mosei.emotions() train_ids = mosei.train() valid_ids = mosei.valid() test_ids = mosei.test() # Merge different features and do word level feature alignment (align according to timestamps of embeddings) if mode == "all" or mode == "AV": bimodal = Dataset.merge(embeddings, facet) trimodal = Dataset.merge(bimodal, covarep) dataset = trimodal.align('embeddings') if mode == "AT": bimodal = Dataset.merge(embeddings, covarep)
dim = data.shape[1] if max_len >= n_rows: diff = max_len - n_rows padding = np.zeros((diff, dim)) padded = np.concatenate((padding, data)) return padded else: return data[-max_len:] if __name__ == "__main__": # Download the data if not present max_len = 20 mosi = MOSI() embeddings = mosi.embeddings() facet = mosi.facet() covarep = mosi.covarep() sentiments = mosi.sentiments( ) # sentiment labels, real-valued. for this tutorial we'll binarize them train_ids = mosi.train() valid_ids = mosi.valid() test_ids = mosi.test() # Merge different features and do word level feature alignment (align according to timestamps of embeddings) bimodal = Dataset.merge(embeddings, facet) trimodal = Dataset.merge(bimodal, covarep) dataset = trimodal.align('embeddings') # sort through all the video ID, segment ID pairs train_set_ids = [] for vid in train_ids:
class UnimodalData():
    """Per-modality train/validation/test arrays with binary sentiment labels.

    Wraps a dataset object that exposes ``train()``, ``valid()``, ``test()``
    and ``sentiments()`` plus the feature accessors ``embeddings()``,
    ``words()``, ``covarep()`` and ``facet()``. Each feature accessor is
    expected to return a mapping ``{feature_name: {video_id: {segment_id:
    [time_step, ...]}}}`` in which ``time_step[2]`` holds the feature value
    (entries 0 and 1 are skipped; they are the timestamps).

    A segment is labelled 1 when its sentiment score is >= 0 and 0 otherwise.
    Videos that belong to none of the three splits are ignored.
    """

    def __init__(self, dataset=None):
        """Store ``dataset`` (a fresh ``MOSI()`` when omitted) and cache its
        split id collections and sentiment labels."""
        # Identity check rather than ``== None``: a dataset object that
        # overrides ``__eq__`` must not be mistaken for "nothing supplied".
        self.dataset = MOSI() if dataset is None else dataset
        self.train_ids = self.dataset.train()
        self.valid_ids = self.dataset.valid()
        self.test_ids = self.dataset.test()
        self.sentiments = self.dataset.sentiments()

    def _collect(self, data, build_example):
        """Bucket the segments of ``data`` by split and attach binary labels.

        Args:
            data: mapping ``{video_id: {segment_id: sequence_of_time_steps}}``.
            build_example: callable turning one non-empty segment into an
                example.

        Returns:
            Six plain lists: ``x_train, x_val, x_test, y_train, y_val, y_test``.
        """
        # Build the membership sets once instead of scanning the id
        # collections again for every segment.
        train_ids = set(self.train_ids)
        valid_ids = set(self.valid_ids)
        test_ids = set(self.test_ids)

        x_train, y_train = [], []
        x_val, y_val = [], []
        x_test, y_test = [], []

        for vid, vdata in data.items():
            # Precedence when an id appears in several splits: train, valid, test.
            if vid in train_ids:
                xs, ys = x_train, y_train
            elif vid in valid_ids:
                xs, ys = x_val, y_val
            elif vid in test_ids:
                xs, ys = x_test, y_test
            else:
                continue  # video is not part of any split
            for sid, sdata in vdata.items():
                # ``len`` works for lists, tuples and arrays alike, unlike
                # a comparison against ``[]``.
                if len(sdata) == 0:
                    continue
                xs.append(build_example(sdata))
                # binarize the label: non-negative sentiment -> 1, negative -> 0
                ys.append(1 if self.sentiments[vid][sid] >= 0 else 0)

        return x_train, x_val, x_test, y_train, y_val, y_test

    @staticmethod
    def _to_array(examples):
        """Convert a list of examples to an array, tolerating ragged input.

        Variable-length sequences cannot form a rectangular array; recent
        NumPy versions raise ``ValueError`` for them instead of silently
        producing an object array, so that case is handled explicitly.
        """
        try:
            return np.asarray(examples)
        except ValueError:
            ragged = np.empty(len(examples), dtype=object)
            for i, example in enumerate(examples):
                ragged[i] = example
            return ragged

    @staticmethod
    def _normalize_by_train_max(train, valid, test):
        """Scale all three splits by the training split's per-feature max |x|.

        The maximum is taken over the first two axes of ``train``. Features
        whose maximum is 0 are left unscaled, and NaNs remaining after the
        division are replaced by 0.

        Returns:
            Tuple ``(train, valid, test)`` of new, normalized arrays.
        """
        scale = np.max(np.max(np.abs(train), axis=0), axis=0)
        scale[scale == 0] = 1  # if the maximum is 0 we don't normalize this dimension
        normalized = []
        for split in (train, valid, test):
            split = split / scale
            split[np.isnan(split)] = 0
            normalized.append(split)
        return tuple(normalized)

    def get_data(self, data, max_len):
        """Build fixed-length examples and binary labels for every split.

        Each segment keeps at most its first ``max_len`` time steps and is
        padded at the end with zero vectors (shaped like its first feature
        vector) up to ``max_len`` steps. Empty segments are skipped.

        Args:
            data: mapping ``{video_id: {segment_id: [time_step, ...]}}``; note
                that even a dataset with one feature requires explicit
                indexing of that feature before it is passed in.
            max_len: number of time steps per example.

        Returns:
            ``x_train, x_val, x_test, y_train, y_val, y_test`` as numpy arrays.
        """
        def build_example(sdata):
            steps = []
            for i, time_step in enumerate(sdata):
                if i == max_len:  # truncate to the first max_len steps
                    break
                # the first 2 entries (timestamps) are not used
                steps.append(time_step[2])
            missing = max_len - len(sdata)
            if missing > 0:
                # pad each example to max_len with zero vectors
                feature_shape = sdata[0][2].shape
                steps.extend(np.zeros(feature_shape) for _ in range(missing))
            return np.asarray(steps)

        # Prepare the final inputs as numpy arrays
        return tuple(np.asarray(part) for part in self._collect(data, build_example))

    def get_text(self, max_len=20):
        """Return the ``get_data`` splits for the word-embedding features."""
        embeddings = self.dataset.embeddings()
        return self.get_data(embeddings["embeddings"], max_len)

    def get_words(self):
        """Return per-segment word sequences and binary labels for every split.

        Sequences are neither truncated nor padded, so segments may differ in
        length; a split holding sequences of different lengths is returned as
        a 1-D object array with one sequence array per segment.

        Returns:
            ``x_train, x_val, x_test, y_train, y_val, y_test`` as numpy arrays.
        """
        words = self.dataset.words()

        def build_example(sdata):
            return np.asarray([time_step[2] for time_step in sdata])

        x_train, x_val, x_test, y_train, y_val, y_test = self._collect(
            words["words"], build_example)
        # Prepare the final inputs as numpy arrays
        return (self._to_array(x_train), self._to_array(x_val),
                self._to_array(x_test), np.asarray(y_train),
                np.asarray(y_val), np.asarray(y_test))

    def get_audio(self, max_len=20):
        """Return the normalized ``covarep`` splits together with their labels.

        Features are scaled by the training split's per-feature maximum
        absolute value; NaNs are replaced by 0.

        Returns:
            ``train, valid, test, y_train, y_val, y_test`` as numpy arrays.
        """
        covarep = self.dataset.covarep()
        train_set_audio, valid_set_audio, test_set_audio, y_train, y_val, y_test = self.get_data(
            covarep["covarep"], max_len)
        train_set_audio, valid_set_audio, test_set_audio = self._normalize_by_train_max(
            train_set_audio, valid_set_audio, test_set_audio)
        return train_set_audio, valid_set_audio, test_set_audio, y_train, y_val, y_test

    def get_video(self, max_len=20):
        """Return the normalized ``facet`` splits.

        Features are scaled by the training split's per-feature maximum
        absolute value; NaNs are replaced by 0.

        Returns:
            ``train, valid, test`` feature arrays (no labels).
        """
        facet = self.dataset.facet()
        train_set_visual, valid_set_visual, test_set_visual, _, _, _ = self.get_data(
            facet["facet"], max_len)
        # NOTE(review): unlike get_audio, the labels are not returned here.
        # Kept as-is so existing callers keep working; confirm whether this
        # method should return y_train, y_val and y_test as well.
        return self._normalize_by_train_max(
            train_set_visual, valid_set_visual, test_set_visual)
loss_func_main = 'mae' # loss function metr_main = 'mae' # evaluation metric weight_main = 1.0 # weight for multitask learning # for Valence polarity classification loss_func_aux1 = 'binary_crossentropy' # loss function metr_aux1 = 'binary_accuracy' # evaluation metric weight_aux1 = 0.5 # weight for multitask learning # for Valence intensity classification loss_func_aux2 = 'categorical_crossentropy' # loss function metr_aux2 = 'accuracy' # evaluation metric weight_aux2 = 0.5 # weight for multitask learning # Download the data if not present mosi = MOSI() covarep = mosi.covarep() # features facet = mosi.facet() # features embeddings = mosi.embeddings() # features sentiments = mosi.sentiments() # Valence labels train_ids = mosi.train() valid_ids = mosi.valid() test_ids = mosi.test() bimodal = Dataset.merge(embeddings, covarep) dataset = bimodal.align('embeddings') # Some data preprocessing print("Preparing train and test data...") # sort through all the video ID, segment ID pairs train_set_ids = [] for vid in train_ids: for sid in dataset['embeddings'][vid].keys():
def get_data(max_len_audio=20, max_len_text=15, max_len_visual=20):
    """Load MOSI and return padded audio/text/visual splits with binary labels.

    Only segments for which the embeddings, facet and covarep features are
    all non-empty are used. Every sequence is passed through ``pad`` with the
    length given for its modality. Visual and audio features are divided by
    the training split's per-feature maximum absolute value (features whose
    maximum is 0 are left unscaled) and NaNs are replaced by 0. Text features
    are not normalized.

    Args:
        max_len_audio: length handed to ``pad`` for covarep sequences.
        max_len_text: length handed to ``pad`` for embedding sequences.
        max_len_visual: length handed to ``pad`` for facet sequences.

    Returns:
        A 12-tuple: audio (train, valid, test), text (train, valid, test),
        visual (train, valid, test), then the boolean labels
        (train, valid, test), where a label is ``sentiment > 0``.
    """
    mosi = MOSI()
    embeddings = mosi.embeddings()
    facet = mosi.facet()
    covarep = mosi.covarep()
    # sentiment labels are real-valued; they are binarized below
    sentiments = mosi.sentiments()
    split_video_ids = (mosi.train(), mosi.valid(), mosi.test())

    text_feats = embeddings['embeddings']
    visual_feats = facet['facet']
    audio_feats = covarep['covarep']

    def usable_segments(video_ids):
        # (video ID, segment ID) pairs that have data in all three modalities
        return [
            (vid, sid)
            for vid in video_ids
            for sid in text_feats[vid].keys()
            if text_feats[vid][sid] and visual_feats[vid][sid] and audio_feats[vid][sid]
        ]

    def stack_splits(features, max_len):
        # one array per split, sequences padded/truncated by ``pad``;
        # stacked along a new leading (segment) axis
        return [
            np.stack([pad(features[vid][sid], max_len) for (vid, sid) in ids], axis=0)
            for ids in split_segments
        ]

    def scale_by_train_max(train, valid, test):
        peak = np.max(np.max(np.abs(train), axis=0), axis=0)
        peak[peak == 0] = 1  # a zero maximum means: leave this dimension unscaled
        scaled = []
        for split in (train, valid, test):
            split = split / peak
            split[split != split] = 0  # NaN is the only value unequal to itself
            scaled.append(split)
        return scaled

    split_segments = [usable_segments(ids) for ids in split_video_ids]

    # The ids above already guarantee non-empty covarep entries, so the audio
    # stacks need no further filtering.
    train_set_audio, valid_set_audio, test_set_audio = stack_splits(audio_feats, max_len_audio)
    train_set_visual, valid_set_visual, test_set_visual = stack_splits(visual_feats, max_len_visual)
    train_set_text, valid_set_text, test_set_text = stack_splits(text_feats, max_len_text)

    # binarize the sentiment scores for the binary classification task
    y_train, y_valid, y_test = [
        np.array([sentiments[vid][sid] for (vid, sid) in ids]) > 0
        for ids in split_segments
    ]

    train_set_visual, valid_set_visual, test_set_visual = scale_by_train_max(
        train_set_visual, valid_set_visual, test_set_visual)
    train_set_audio, valid_set_audio, test_set_audio = scale_by_train_max(
        train_set_audio, valid_set_audio, test_set_audio)

    return (train_set_audio, valid_set_audio, test_set_audio,
            train_set_text, valid_set_text, test_set_text,
            train_set_visual, valid_set_visual, test_set_visual,
            y_train, y_valid, y_test)