import sys
import multiprocessing

# MOSI and Dataset are assumed to be imported from the multimodal data SDK elsewhere in this file
num_cores = multiprocessing.cpu_count()
mode = sys.argv[1]   # which modalities to load, e.g. "all", "AV", "AT", "VT", "A", "V"
task = sys.argv[2]

# Download the data if not present
mosei = MOSI()
embeddings = mosei.embeddings()
if mode in ("all", "AV", "VT", "V"):
    facet = mosei.facet()
if mode in ("all", "AT", "AV", "A"):
    covarep = mosei.covarep()
sentiments = mosei.sentiments()
emotions = mosei.emotions()
train_ids = mosei.train()
valid_ids = mosei.valid()
test_ids = mosei.test()

# Merge the different features and do word-level feature alignment
# (align according to the timestamps of the word embeddings)
if mode in ("all", "AV"):
    bimodal = Dataset.merge(embeddings, facet)
    trimodal = Dataset.merge(bimodal, covarep)
    dataset = trimodal.align('embeddings')
if mode == "AT":
    bimodal = Dataset.merge(embeddings, covarep)
    dataset = bimodal.align('embeddings')
if mode == "VT":
    bimodal = Dataset.merge(embeddings, facet)
    dataset = bimodal.align('embeddings')
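# A quick sanity check on the downloaded features. This is a minimal sketch and not part
# of the pipeline above: it assumes the feature dicts returned by the SDK are nested as
# features['embeddings'][video_id][segment_id], the same access pattern used by get_data
# later in this file.
for vid, vdata in embeddings['embeddings'].items():
    for sid, sdata in vdata.items():
        if not sdata:
            continue
        # each time step is (start_time, end_time, feature_vector)
        print(vid, sid, len(sdata), sdata[0][2].shape)
    break  # only inspect the first video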
import numpy as np
# MOSI is assumed to be imported from the multimodal data SDK elsewhere in this file


class UnimodalData():
    def __init__(self, dataset=None):
        if dataset is None:
            self.dataset = MOSI()
        else:
            self.dataset = dataset
        self.train_ids = self.dataset.train()
        self.valid_ids = self.dataset.valid()
        self.test_ids = self.dataset.test()
        self.sentiments = self.dataset.sentiments()

    def get_data(self, data, max_len):
        x_train, y_train = [], []
        x_val, y_val = [], []
        x_test, y_test = [], []
        # note that even a Dataset with one feature requires explicit indexing of features
        for vid, vdata in data.items():
            for sid, sdata in vdata.items():
                if not sdata:
                    continue
                example = []
                for i, time_step in enumerate(sdata):
                    if i == max_len:  # truncate each segment to max_len time steps
                        break
                    # the first 2 dims are timestamps and are not used
                    example.append(time_step[2])
                for i in range(max_len - len(sdata)):
                    # zero-pad each example up to max_len
                    example.append(np.zeros(sdata[0][2].shape))
                example = np.asarray(example)
                label = 1 if self.sentiments[vid][sid] >= 0 else 0  # binarize the labels
                # split examples by the official train/valid/test video IDs
                if vid in self.train_ids:
                    x_train.append(example)
                    y_train.append(label)
                elif vid in self.valid_ids:
                    x_val.append(example)
                    y_val.append(label)
                elif vid in self.test_ids:
                    x_test.append(example)
                    y_test.append(label)

        # Prepare the final inputs as numpy arrays
        x_train = np.asarray(x_train)
        x_val = np.asarray(x_val)
        x_test = np.asarray(x_test)
        y_train = np.asarray(y_train)
        y_val = np.asarray(y_val)
        y_test = np.asarray(y_test)
        return x_train, x_val, x_test, y_train, y_val, y_test

    def get_text(self, max_len=20):
        embeddings = self.dataset.embeddings()
        return self.get_data(embeddings["embeddings"], max_len)

    def get_words(self):
        words = self.dataset.words()
        x_train, y_train = [], []
        x_val, y_val = [], []
        x_test, y_test = [], []
        # note that even a Dataset with one feature requires explicit indexing of features
        for vid, vdata in words["words"].items():
            for sid, sdata in vdata.items():
                if not sdata:
                    continue
                # words are kept at their natural length (no truncation or padding)
                example = np.asarray([time_step[2] for time_step in sdata])
                label = 1 if self.sentiments[vid][sid] >= 0 else 0  # binarize the labels
                if vid in self.train_ids:
                    x_train.append(example)
                    y_train.append(label)
                elif vid in self.valid_ids:
                    x_val.append(example)
                    y_val.append(label)
                elif vid in self.test_ids:
                    x_test.append(example)
                    y_test.append(label)

        # Prepare the final inputs as numpy arrays
        x_train = np.asarray(x_train)
        x_val = np.asarray(x_val)
        x_test = np.asarray(x_test)
        y_train = np.asarray(y_train)
        y_val = np.asarray(y_val)
        y_test = np.asarray(y_test)
        return x_train, x_val, x_test, y_train, y_val, y_test

    def get_audio(self, max_len=20):
        covarep = self.dataset.covarep()
        train_set_audio, valid_set_audio, test_set_audio, y_train, y_val, y_test = self.get_data(
            covarep["covarep"], max_len)
        # normalize each audio dimension by its maximum absolute value on the training set
        audio_max = np.max(np.max(np.abs(train_set_audio), axis=0), axis=0)
        audio_max[audio_max == 0] = 1  # if the maximum is 0 we don't normalize this dimension
        train_set_audio = train_set_audio / audio_max
        valid_set_audio = valid_set_audio / audio_max
        test_set_audio = test_set_audio / audio_max
        # replace NaNs (x != x is only true for NaN) with zeros
        train_set_audio[train_set_audio != train_set_audio] = 0
        valid_set_audio[valid_set_audio != valid_set_audio] = 0
        test_set_audio[test_set_audio != test_set_audio] = 0
        return train_set_audio, valid_set_audio, test_set_audio, y_train, y_val, y_test

    def get_video(self, max_len=20):
        facet = self.dataset.facet()
        train_set_visual, valid_set_visual, test_set_visual, y_train, y_val, y_test = self.get_data(
            facet["facet"], max_len)
        # normalize each visual dimension by its maximum absolute value on the training set
        visual_max = np.max(np.max(np.abs(train_set_visual), axis=0), axis=0)
        visual_max[visual_max == 0] = 1  # if the maximum is 0 we don't normalize this dimension
        train_set_visual = train_set_visual / visual_max
        valid_set_visual = valid_set_visual / visual_max
        test_set_visual = test_set_visual / visual_max
        # replace NaNs with zeros
        train_set_visual[train_set_visual != train_set_visual] = 0
        valid_set_visual[valid_set_visual != valid_set_visual] = 0
        test_set_visual[test_set_visual != test_set_visual] = 0
        return train_set_visual, valid_set_visual, test_set_visual
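# A minimal usage sketch for the class above (variable names here are illustrative):
# fetch padded word-embedding sequences and binary sentiment labels for the standard
# MOSI train/valid/test splits.
loader = UnimodalData()
x_train, x_val, x_test, y_train, y_val, y_test = loader.get_text(max_len=20)
print(x_train.shape)  # -> (num_train_segments, 20, embedding_dim)
print(y_train[:10])   # -> binary labels (1 = non-negative sentiment)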
# meta parameters
maxlen = 15          # each utterance will be truncated/padded to 15 words
batch_size = 128
nb_epoch = 1000      # number of total epochs to train the model
# if the validation loss isn't decreasing for a number of epochs, stop training to prevent over-fitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
# optimizer; EarlyStopping and Adamax are assumed to be imported from Keras
# (keras.callbacks / keras.optimizers) earlier in the file
opt_func = Adamax(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
loss_func = 'mae'    # loss function
metr = 'mae'         # evaluation metric

# Download the data if not present
mosi = MOSI()
embeddings = mosi.embeddings()   # word-embedding features
sentiments = mosi.sentiments()   # valence labels
train_ids = mosi.train()
valid_ids = mosi.valid()
test_ids = mosi.test()

# Some data preprocessing
x_train = []
y_train = []
x_valid = []
y_valid = []
x_test = []
y_test = []

print("Preparing train and test data...")
# note that even a Dataset with one feature requires explicit indexing of features
for vid, vdata in embeddings['embeddings'].items():
    for sid, sdata in vdata.items():
        if not sdata:
            continue  # skip empty segments
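# The hyperparameters above are not tied to a model in this excerpt. The sketch below
# shows one way they could be wired into a Keras text model; the architecture (a single
# masked LSTM followed by a linear output for the real-valued valence score) and the
# layer sizes are illustrative assumptions, not taken from the source.
from keras.models import Sequential
from keras.layers import Masking, LSTM, Dense

model = Sequential()
model.add(Masking(mask_value=0., input_shape=(maxlen, 300)))  # 300 = assumed embedding dim
model.add(LSTM(64))
model.add(Dense(1, activation='linear'))
model.compile(optimizer=opt_func, loss=loss_func, metrics=[metr])

# the fit call assumes x_train/x_valid and y_train/y_valid have been filled
# by the preprocessing loop above
# model.fit(x_train, y_train,
#           batch_size=batch_size, epochs=nb_epoch,
#           validation_data=(x_valid, y_valid),
#           callbacks=[early_stopping])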
def get_data(max_len_audio=20, max_len_text=15, max_len_visual=20):
    mosi = MOSI()
    embeddings = mosi.embeddings()
    facet = mosi.facet()
    covarep = mosi.covarep()
    # sentiment labels are real-valued; for this tutorial we'll binarize them
    sentiments = mosi.sentiments()
    train_ids = mosi.train()
    valid_ids = mosi.valid()
    test_ids = mosi.test()

    # sort through all the (video ID, segment ID) pairs, keeping only segments
    # that have all three modalities available
    train_set_ids = []
    for vid in train_ids:
        for sid in embeddings['embeddings'][vid].keys():
            if embeddings['embeddings'][vid][sid] and facet['facet'][vid][sid] \
                    and covarep['covarep'][vid][sid]:
                train_set_ids.append((vid, sid))

    valid_set_ids = []
    for vid in valid_ids:
        for sid in embeddings['embeddings'][vid].keys():
            if embeddings['embeddings'][vid][sid] and facet['facet'][vid][sid] \
                    and covarep['covarep'][vid][sid]:
                valid_set_ids.append((vid, sid))

    test_set_ids = []
    for vid in test_ids:
        for sid in embeddings['embeddings'][vid].keys():
            if embeddings['embeddings'][vid][sid] and facet['facet'][vid][sid] \
                    and covarep['covarep'][vid][sid]:
                test_set_ids.append((vid, sid))

    # partition the training, validation and test sets; all sequences are
    # padded/truncated to max_len steps, so each array has shape
    # (dataset_size, max_len, feature_dim)
    max_len = max_len_audio
    train_set_audio = np.stack(
        [pad(covarep['covarep'][vid][sid], max_len)
         for (vid, sid) in train_set_ids if covarep['covarep'][vid][sid]], axis=0)
    valid_set_audio = np.stack(
        [pad(covarep['covarep'][vid][sid], max_len)
         for (vid, sid) in valid_set_ids if covarep['covarep'][vid][sid]], axis=0)
    test_set_audio = np.stack(
        [pad(covarep['covarep'][vid][sid], max_len)
         for (vid, sid) in test_set_ids if covarep['covarep'][vid][sid]], axis=0)

    max_len = max_len_visual
    train_set_visual = np.stack(
        [pad(facet['facet'][vid][sid], max_len) for (vid, sid) in train_set_ids], axis=0)
    valid_set_visual = np.stack(
        [pad(facet['facet'][vid][sid], max_len) for (vid, sid) in valid_set_ids], axis=0)
    test_set_visual = np.stack(
        [pad(facet['facet'][vid][sid], max_len) for (vid, sid) in test_set_ids], axis=0)

    max_len = max_len_text
    train_set_text = np.stack(
        [pad(embeddings['embeddings'][vid][sid], max_len) for (vid, sid) in train_set_ids], axis=0)
    valid_set_text = np.stack(
        [pad(embeddings['embeddings'][vid][sid], max_len) for (vid, sid) in valid_set_ids], axis=0)
    test_set_text = np.stack(
        [pad(embeddings['embeddings'][vid][sid], max_len) for (vid, sid) in test_set_ids], axis=0)

    # binarize the sentiment scores for the binary classification task
    y_train = np.array([sentiments[vid][sid] for (vid, sid) in train_set_ids]) > 0
    y_valid = np.array([sentiments[vid][sid] for (vid, sid) in valid_set_ids]) > 0
    y_test = np.array([sentiments[vid][sid] for (vid, sid) in test_set_ids]) > 0

    # train_set_audio = train_set_audio[:, :, 1:35]
    # valid_set_audio = valid_set_audio[:, :, 1:35]
    # test_set_audio = test_set_audio[:, :, 1:35]

    # normalize each visual dimension by its maximum absolute value on the training set
    visual_max = np.max(np.max(np.abs(train_set_visual), axis=0), axis=0)
    visual_max[visual_max == 0] = 1  # if the maximum is 0 we don't normalize this dimension
    train_set_visual = train_set_visual / visual_max
    valid_set_visual = valid_set_visual / visual_max
    test_set_visual = test_set_visual / visual_max
    # replace NaNs (x != x is only true for NaN) with zeros
    train_set_visual[train_set_visual != train_set_visual] = 0
    valid_set_visual[valid_set_visual != valid_set_visual] = 0
    test_set_visual[test_set_visual != test_set_visual] = 0

    # normalize the audio features the same way
    audio_max = np.max(np.max(np.abs(train_set_audio), axis=0), axis=0)
    audio_max[audio_max == 0] = 1
    train_set_audio = train_set_audio / audio_max
    valid_set_audio = valid_set_audio / audio_max
    test_set_audio = test_set_audio / audio_max
    train_set_audio[train_set_audio != train_set_audio] = 0
    valid_set_audio[valid_set_audio != valid_set_audio] = 0
    test_set_audio[test_set_audio != test_set_audio] = 0

    return (train_set_audio, valid_set_audio, test_set_audio,
            train_set_text, valid_set_text, test_set_text,
            train_set_visual, valid_set_visual, test_set_visual,
            y_train, y_valid, y_test)
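# get_data above relies on a `pad` helper that is not shown in this excerpt. Below is a
# minimal sketch consistent with how it is used here: each segment is a sequence of
# (start_time, end_time, feature_vector) steps, and pad keeps only the feature vectors,
# zero-padding or truncating so every segment ends up with exactly max_len steps.
# Whether padding/truncation happens at the front or the back is a design choice that is
# not specified in this excerpt; this sketch pads at the front and keeps the last steps.
def pad(data, max_len):
    data = np.array([time_step[2] for time_step in data])  # drop the timestamps
    n_rows, dim = data.shape
    if max_len >= n_rows:
        padding = np.zeros((max_len - n_rows, dim))
        return np.concatenate((padding, data), axis=0)
    # keep the last max_len steps when the segment is too long
    return data[-max_len:]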