# Shared imports for the snippets below. BertTextEncoder and AuViSubNet are
# repo-internal modules that are referenced but not defined in this file.
import os
import pickle

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
class SELF_MM(nn.Module):

    def __init__(self, args):
        super(SELF_MM, self).__init__()
        # text subnets
        self.aligned = args.aligned
        self.text_model = BertTextEncoder(language=args.language, use_finetune=args.use_finetune)

        # audio-vision subnets
        audio_in, video_in = args.feature_dims[1:]
        self.audio_model = AuViSubNet(audio_in, args.a_lstm_hidden_size, args.audio_out,
                                      num_layers=args.a_lstm_layers, dropout=args.a_lstm_dropout)
        self.video_model = AuViSubNet(video_in, args.v_lstm_hidden_size, args.video_out,
                                      num_layers=args.v_lstm_layers, dropout=args.v_lstm_dropout)

        # the post-fusion layers
        self.post_fusion_dropout = nn.Dropout(p=args.post_fusion_dropout)
        self.post_fusion_layer_1 = nn.Linear(
            args.text_out + args.video_out + args.audio_out, args.post_fusion_dim)
        self.post_fusion_layer_2 = nn.Linear(args.post_fusion_dim, args.post_fusion_dim)
        self.post_fusion_layer_3 = nn.Linear(args.post_fusion_dim, 1)

        # the classification layers for text
        self.post_text_dropout = nn.Dropout(p=args.post_text_dropout)
        self.post_text_layer_1 = nn.Linear(args.text_out, args.post_text_dim)
        self.post_text_layer_2 = nn.Linear(args.post_text_dim, args.post_text_dim)
        self.post_text_layer_3 = nn.Linear(args.post_text_dim, 1)

        # the classification layers for audio
        self.post_audio_dropout = nn.Dropout(p=args.post_audio_dropout)
        self.post_audio_layer_1 = nn.Linear(args.audio_out, args.post_audio_dim)
        self.post_audio_layer_2 = nn.Linear(args.post_audio_dim, args.post_audio_dim)
        self.post_audio_layer_3 = nn.Linear(args.post_audio_dim, 1)
        # test
        self.audio_classifier = nn.Linear(args.audio_out, 1)

        # the classification layers for video
        self.post_video_dropout = nn.Dropout(p=args.post_video_dropout)
        self.post_video_layer_1 = nn.Linear(args.video_out, args.post_video_dim)
        self.post_video_layer_2 = nn.Linear(args.post_video_dim, args.post_video_dim)
        self.post_video_layer_3 = nn.Linear(args.post_video_dim, 1)
        # test
        self.video_classifier = nn.Linear(args.video_out, 1)
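# ---------------------------------------------------------------------------
# AuViSubNet is used above but not defined in this snippet. Below is a minimal
# sketch of the LSTM-subnet pattern SELF-MM follows for audio/vision (pack the
# padded sequence, take the final hidden state, apply dropout, project with a
# linear layer). The class name and default values here are assumptions for
# illustration, not the repo's definitive code.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence


class AuViSubNetSketch(nn.Module):
    def __init__(self, in_size, hidden_size, out_size, num_layers=1, dropout=0.2):
        super(AuViSubNetSketch, self).__init__()
        self.rnn = nn.LSTM(in_size, hidden_size, num_layers=num_layers,
                           dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear_1 = nn.Linear(hidden_size, out_size)

    def forward(self, x, lengths):
        # x: (batch, seq_len, in_size); lengths: true sequence lengths (CPU ints)
        packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.rnn(packed)
        h = self.dropout(h_n[-1])   # final hidden state of the last LSTM layer
        return self.linear_1(h)     # (batch, out_size)


# usage sketch with made-up dimensions:
#   audio = torch.randn(8, 40, 5)
#   out = AuViSubNetSketch(5, 16, 16)(audio, torch.full((8,), 40, dtype=torch.long))
# ---------------------------------------------------------------------------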
class MULT(nn.Module):

    def __init__(self, args):
        super(MULT, self).__init__()
        if args.use_bert:
            # text subnets
            self.text_model = BertTextEncoder(language=args.language, use_finetune=args.use_finetune)
        self.use_bert = args.use_bert

        # MulT model initialization.
        dst_feature_dims, nheads = args.dst_feature_dim_nheads_1, args.dst_feature_dim_nheads_2
        self.orig_d_l, self.orig_d_a, self.orig_d_v = args.feature_dims
        self.d_l = self.d_a = self.d_v = dst_feature_dims
        self.num_heads = nheads
        self.layers = args.nlevels
        self.attn_dropout = args.attn_dropout
        self.attn_dropout_a = args.attn_dropout_a
        self.attn_dropout_v = args.attn_dropout_v
        self.relu_dropout = args.relu_dropout
        self.embed_dropout = args.embed_dropout
        self.res_dropout = args.res_dropout
        self.output_dropout = args.output_dropout
        self.text_dropout = args.text_dropout
        self.attn_mask = args.attn_mask

        combined_dim = 2 * (self.d_l + self.d_a + self.d_v)
        output_dim = args.num_classes  # this is actually not a hyperparameter :-)

        # 1. Temporal convolutional layers
        self.proj_l = nn.Conv1d(self.orig_d_l, self.d_l, kernel_size=args.conv1d_kernel_size_l, padding=0, bias=False)
        self.proj_a = nn.Conv1d(self.orig_d_a, self.d_a, kernel_size=args.conv1d_kernel_size_a, padding=0, bias=False)
        self.proj_v = nn.Conv1d(self.orig_d_v, self.d_v, kernel_size=args.conv1d_kernel_size_v, padding=0, bias=False)

        # 2. Crossmodal attentions
        self.trans_l_with_a = self.get_network(self_type='la')
        self.trans_l_with_v = self.get_network(self_type='lv')
        self.trans_a_with_l = self.get_network(self_type='al')
        self.trans_a_with_v = self.get_network(self_type='av')
        self.trans_v_with_l = self.get_network(self_type='vl')
        self.trans_v_with_a = self.get_network(self_type='va')

        # 3. Self attentions (could be replaced by LSTMs, GRUs, etc.),
        #    e.g., self.trans_x_mem = nn.LSTM(self.d_x, self.d_x, 1)
        self.trans_l_mem = self.get_network(self_type='l_mem', layers=3)
        self.trans_a_mem = self.get_network(self_type='a_mem', layers=3)
        self.trans_v_mem = self.get_network(self_type='v_mem', layers=3)

        # Projection layers
        self.proj1 = nn.Linear(combined_dim, combined_dim)
        self.proj2 = nn.Linear(combined_dim, combined_dim)
        self.out_layer = nn.Linear(combined_dim, output_dim)
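# ---------------------------------------------------------------------------
# The Conv1d projections above expect channels-first input: MulT features
# arrive as (batch, seq_len, feature_dim), so they are transposed before
# projection and then permuted to (seq_len, batch, dim) for the transformer
# blocks. A minimal shape sketch; batch=8, seq=50, 768 -> 40 channels, and
# kernel_size=1 are made-up values, not repo defaults.
import torch
import torch.nn as nn

x_l = torch.randn(8, 50, 768)                    # (batch, seq_len, orig_d_l)
proj_l = nn.Conv1d(768, 40, kernel_size=1, padding=0, bias=False)
h_l = proj_l(x_l.transpose(1, 2))                # (batch, d_l, seq_len)
h_l = h_l.permute(2, 0, 1)                       # (seq_len, batch, d_l)
print(h_l.shape)                                 # torch.Size([50, 8, 40])
# ---------------------------------------------------------------------------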
class MISA(nn.Module):

    def __init__(self, config):
        super(MISA, self).__init__()
        assert config.use_bert == True

        self.config = config
        self.text_size = config.feature_dims[0]
        self.visual_size = config.feature_dims[2]
        self.acoustic_size = config.feature_dims[1]

        self.input_sizes = input_sizes = [self.text_size, self.visual_size, self.acoustic_size]
        self.hidden_sizes = hidden_sizes = [int(self.text_size), int(self.visual_size), int(self.acoustic_size)]
        self.output_size = output_size = config.num_classes if config.train_mode == "classification" else 1
        self.dropout_rate = dropout_rate = config.dropout
        self.activation = nn.ReLU()
        self.tanh = nn.Tanh()

        rnn = nn.LSTM if self.config.rnncell == "lstm" else nn.GRU

        # defining modules - two-layer bidirectional RNN with layer norm in between
        if config.use_bert:
            # text subnets
            self.bertmodel = BertTextEncoder(language=config.language, use_finetune=config.use_finetune)

        self.vrnn1 = rnn(input_sizes[1], hidden_sizes[1], bidirectional=True)
        self.vrnn2 = rnn(2 * hidden_sizes[1], hidden_sizes[1], bidirectional=True)

        self.arnn1 = rnn(input_sizes[2], hidden_sizes[2], bidirectional=True)
        self.arnn2 = rnn(2 * hidden_sizes[2], hidden_sizes[2], bidirectional=True)

        ##########################################
        # mapping modalities to same sized space
        ##########################################
        if self.config.use_bert:
            self.project_t = nn.Sequential()
            self.project_t.add_module('project_t', nn.Linear(in_features=768, out_features=config.hidden_size))
            self.project_t.add_module('project_t_activation', self.activation)
            self.project_t.add_module('project_t_layer_norm', nn.LayerNorm(config.hidden_size))
        else:
            self.project_t = nn.Sequential()
            self.project_t.add_module('project_t', nn.Linear(in_features=hidden_sizes[0] * 4, out_features=config.hidden_size))
            self.project_t.add_module('project_t_activation', self.activation)
            self.project_t.add_module('project_t_layer_norm', nn.LayerNorm(config.hidden_size))

        self.project_v = nn.Sequential()
        self.project_v.add_module('project_v', nn.Linear(in_features=hidden_sizes[1] * 4, out_features=config.hidden_size))
        self.project_v.add_module('project_v_activation', self.activation)
        self.project_v.add_module('project_v_layer_norm', nn.LayerNorm(config.hidden_size))

        self.project_a = nn.Sequential()
        self.project_a.add_module('project_a', nn.Linear(in_features=hidden_sizes[2] * 4, out_features=config.hidden_size))
        self.project_a.add_module('project_a_activation', self.activation)
        self.project_a.add_module('project_a_layer_norm', nn.LayerNorm(config.hidden_size))

        ##########################################
        # private encoders
        ##########################################
        self.private_t = nn.Sequential()
        self.private_t.add_module('private_t_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size))
        self.private_t.add_module('private_t_activation_1', nn.Sigmoid())

        self.private_v = nn.Sequential()
        self.private_v.add_module('private_v_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size))
        self.private_v.add_module('private_v_activation_1', nn.Sigmoid())

        self.private_a = nn.Sequential()
        self.private_a.add_module('private_a_3', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size))
        self.private_a.add_module('private_a_activation_3', nn.Sigmoid())

        ##########################################
        # shared encoder
        ##########################################
        self.shared = nn.Sequential()
        self.shared.add_module('shared_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size))
        self.shared.add_module('shared_activation_1', nn.Sigmoid())

        ##########################################
        # reconstruct
        ##########################################
        self.recon_t = nn.Sequential()
        self.recon_t.add_module('recon_t_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size))
        self.recon_v = nn.Sequential()
        self.recon_v.add_module('recon_v_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size))
        self.recon_a = nn.Sequential()
        self.recon_a.add_module('recon_a_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size))

        ##########################################
        # shared space adversarial discriminator
        ##########################################
        if not self.config.use_cmd_sim:
            self.discriminator = nn.Sequential()
            self.discriminator.add_module('discriminator_layer_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size))
            self.discriminator.add_module('discriminator_layer_1_activation', self.activation)
            self.discriminator.add_module('discriminator_layer_1_dropout', nn.Dropout(dropout_rate))
            self.discriminator.add_module('discriminator_layer_2', nn.Linear(in_features=config.hidden_size, out_features=len(hidden_sizes)))

        ##########################################
        # shared-private collaborative discriminator
        ##########################################
        self.sp_discriminator = nn.Sequential()
        self.sp_discriminator.add_module('sp_discriminator_layer_1', nn.Linear(in_features=config.hidden_size, out_features=4))

        self.fusion = nn.Sequential()
        self.fusion.add_module('fusion_layer_1', nn.Linear(in_features=self.config.hidden_size * 6, out_features=self.config.hidden_size * 3))
        self.fusion.add_module('fusion_layer_1_dropout', nn.Dropout(dropout_rate))
        self.fusion.add_module('fusion_layer_1_activation', self.activation)
        self.fusion.add_module('fusion_layer_3', nn.Linear(in_features=self.config.hidden_size * 3, out_features=output_size))

        self.tlayer_norm = nn.LayerNorm((hidden_sizes[0] * 2,))
        self.vlayer_norm = nn.LayerNorm((hidden_sizes[1] * 2,))
        self.alayer_norm = nn.LayerNorm((hidden_sizes[2] * 2,))

        encoder_layer = nn.TransformerEncoderLayer(d_model=self.config.hidden_size, nhead=2)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=1)
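# ---------------------------------------------------------------------------
# The fusion head above consumes hidden_size * 6 features: one private and one
# shared code per modality. MISA's forward pass is not part of this snippet;
# the sketch below only illustrates the tensor plumbing implied by the layer
# shapes (hidden_size=128, batch=8, and the random codes are assumptions).
import torch
import torch.nn as nn

hidden_size, batch = 128, 8
# hypothetical private/shared codes for text, video, audio (6 in total)
h = torch.stack([torch.randn(batch, hidden_size) for _ in range(6)], dim=0)  # (6, batch, hidden)

encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=2)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=1)
fusion = nn.Linear(hidden_size * 6, 1)           # output_size=1 assumed (regression)

h = transformer_encoder(h)                       # self-attention across the 6 codes
h = torch.cat([h[i] for i in range(6)], dim=1)   # (batch, hidden_size * 6)
out = fusion(h)                                  # (batch, output_size)
# ---------------------------------------------------------------------------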
class TextPre(object):
    """A single set of features of data."""

    def __init__(self, args):
        self.device = torch.device('cuda:0')
        self.args = args
        self.loadTextMap = {
            'mosi': self.__load_data_mosi,
            'mosei': self.__load_data_mosei
        }
        self.bert = BertTextEncoder(language=args.language).to(self.device)

    def textConvertID(self, data, tokenizer):
        features = {}
        Input_ids, Input_mask, Segment_ids = [], [], []
        Raw_text, Visual, Audio = [], [], []
        Label, ids = [], []
        max_seq_length = self.args.max_seq_length
        for i in tqdm(range(len(data['raw_text']))):
            raw_text = data['raw_text'][i]
            visual = data['vision'][i]
            audio = data['audio'][i]
            tokens_a, inversions_a = tokenizer.tokenize(raw_text, invertable=True)
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:max_seq_length - 2]
                inversions_a = inversions_a[:max_seq_length - 2]
            tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
            segment_ids = [0] * len(tokens)
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            padding = [0] * (max_seq_length - len(input_ids))
            if self.args.aligned:
                # re-index audio/vision so they line up with the word pieces
                text_len = min(len(raw_text.split()), max_seq_length)
                new_visual = [visual[len(visual) - text_len + inv_id] for inv_id in inversions_a]
                new_audio = [audio[len(audio) - text_len + inv_id] for inv_id in inversions_a]
                visual = np.array(new_visual)
                audio = np.array(new_audio)
                # add "start" and "end" for audio and vision
                audio_zero = np.zeros((1, audio.shape[1]))
                audio = np.concatenate((audio_zero, audio, audio_zero))
                visual_zero = np.zeros((1, visual.shape[1]))
                visual = np.concatenate((visual_zero, visual, visual_zero))
                audio_padding = np.zeros((max_seq_length - len(input_ids), audio.shape[1]))
                audio = np.concatenate((audio, audio_padding))
                video_padding = np.zeros((max_seq_length - len(input_ids), visual.shape[1]))
                visual = np.concatenate((visual, video_padding))
                assert audio.shape[0] == max_seq_length
                assert visual.shape[0] == max_seq_length
            input_ids += padding
            input_mask += padding
            segment_ids += padding
            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            label = float(data['labels'][i])

            Input_ids.append(input_ids)
            Visual.append(visual)
            Audio.append(audio)
            Input_mask.append(input_mask)
            Segment_ids.append(segment_ids)
            Label.append(label)
            Raw_text.append(raw_text)
            ids.append(data['id'][i])

        features['raw_text'] = np.array(Raw_text)
        features['audio'] = np.array(Audio)
        features['vision'] = np.array(Visual)
        features['labels'] = np.array(Label)
        features['id'] = np.array(ids)
        Input_ids = np.expand_dims(Input_ids, 1)
        Input_mask = np.expand_dims(Input_mask, 1)
        Segment_ids = np.expand_dims(Segment_ids, 1)
        text_bert = np.concatenate((Input_ids, Input_mask, Segment_ids), axis=1)
        features['text_bert'] = text_bert
        features['text'] = self.__convertID2Vector(text_bert)
        return features

    def __convertID2Vector(self, ids, batch_size=64):
        results = []
        ids = torch.Tensor(ids)
        for left in tqdm(range(0, ids.size(0), batch_size)):
            right = min(left + batch_size, ids.size(0))
            c_ids = ids[left:right].to(self.device)
            c_vector = self.bert(c_ids).detach().cpu().numpy()
            results.append(c_vector)
        results = np.concatenate(results, axis=0)
        return results

    def __load_data_mosi(self):
        # get text data
        link = os.path.join(self.args.data_dir, 'Raw/Transcript/Segmented')
        text_data = {}
        for file in os.listdir(link):
            name = file.split('.')[0]
            for line in open(os.path.join(link, file), "r"):
                num_id, cur_t = line.split('_DELIM_')
                name_id = name + '_' + num_id.strip()
                text_data[name_id] = cur_t.strip()

        # get data
        def matchData(mode='train'):
            r_text = []
            for cur_id in data[mode]['id']:
                r_text.append(text_data[cur_id[0]])
            data[mode]['raw_text'] = r_text

        # NOTE: this loader reads a pickle named after MOSEI even though it
        # handles MOSI; verify the file name against your data layout.
        with open(os.path.join(self.args.data_dir, 'Processed/mosei_senti_data_noalign.pkl'), 'rb') as lf:
            data = pickle.load(lf)
        matchData(mode='train')
        matchData(mode='valid')
        matchData(mode='test')
        return data

    def __load_data_mosei(self):
        def convert0(s):
            if s == '0':
                return '0.0'
            return s

        # get text data
        link = os.path.join(self.args.data_dir, 'Raw/Transcript/Segmented')
        text_data = {}
        for file in os.listdir(link):
            name = file.split('.')[0]
            for line in open(os.path.join(link, file), "r"):
                items = line.split('___')
                name_id = items[0] + '_' + convert0(items[2]) + '_' + convert0(items[3])
                text_data[name_id.strip()] = items[-1].strip()

        # get data
        def matchData(mode='train'):
            r_text = []
            for cur_id in data[mode]['id']:
                name = '_'.join(cur_id)
                r_text.append(text_data[name])
            data[mode]['raw_text'] = r_text

        with open(os.path.join(self.args.data_dir, 'Processed/mosei_senti_data_noalign.pkl'), 'rb') as lf:
            data = pickle.load(lf)
        matchData(mode='train')
        matchData(mode='valid')
        matchData(mode='test')
        return data

    def run(self):
        data = self.loadTextMap[self.args.datasetName]()
        train_list = data['train']
        valid_list = data['valid']
        test_list = data['test']
        tokenizer = self.bert.get_tokenizer()
        save_data = {}
        save_data['train'] = self.textConvertID(train_list, tokenizer)
        save_data['valid'] = self.textConvertID(valid_list, tokenizer)
        save_data['test'] = self.textConvertID(test_list, tokenizer)
        if self.args.aligned:
            saved_path = os.path.join(self.args.save_dir, 'aligned_' + str(self.args.max_seq_length) + '.pkl')
        else:
            saved_path = os.path.join(self.args.save_dir, 'unaligned_' + str(self.args.max_seq_length) + '.pkl')
        if not os.path.exists(os.path.dirname(saved_path)):
            os.makedirs(os.path.dirname(saved_path))
        with open(saved_path, 'wb') as file:
            pickle.dump(save_data, file, protocol=4)
        print('Save Successful!')
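# ---------------------------------------------------------------------------
# Quick sanity check of the pickle written by TextPre.run(). The path and
# max_seq_length=50 are assumptions; the key/shape contract follows the
# feature dict built in textConvertID above, assuming BertTextEncoder returns
# last hidden states of size 768.
import pickle

with open('aligned_50.pkl', 'rb') as f:  # hypothetical output path
    saved = pickle.load(f)

train = saved['train']
print(train['text_bert'].shape)   # (N, 3, 50): input_ids, input_mask, segment_ids
print(train['text'].shape)        # (N, 50, 768): frozen-BERT token embeddings (assumed)
print(train['vision'].shape)      # (N, 50, visual_dim) when aligned
print(train['audio'].shape)       # (N, 50, audio_dim) when aligned
print(train['labels'].shape)      # (N,)
# ---------------------------------------------------------------------------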