Example #1
    def __init__(self, args):
        self.device = torch.device('cuda:0')
        self.args = args
        # dispatch table: dataset name -> loader method
        self.loadTextMap = {
            'mosi': self.__load_data_mosi,
            'mosei': self.__load_data_mosei
        }
        self.bert = BertTextEncoder(language=args.language).to(self.device)
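The loadTextMap dict is a small dispatch table: the dataset name selects the loader method without an if/elif chain (run() in Example #5 calls self.loadTextMap[self.args.datasetName]()). A minimal, self-contained sketch of the same pattern, with plain functions standing in for the private loader methods and a hypothetical dataset name:

def load_mosi():
    return 'mosi data'

def load_mosei():
    return 'mosei data'

load_text_map = {'mosi': load_mosi, 'mosei': load_mosei}
dataset_name = 'mosi'                   # hypothetical value of args.datasetName
data = load_text_map[dataset_name]()    # dispatches to load_mosi()
print(data)                             # mosi data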
Example #2
    def __init__(self, args):
        super(SELF_MM, self).__init__()
        # text subnets
        self.aligned = args.aligned
        self.text_model = BertTextEncoder(language=args.language,
                                          use_finetune=args.use_finetune)

        # audio-vision subnets
        audio_in, video_in = args.feature_dims[1:]
        self.audio_model = AuViSubNet(audio_in, args.a_lstm_hidden_size, args.audio_out,
                                      num_layers=args.a_lstm_layers, dropout=args.a_lstm_dropout)
        self.video_model = AuViSubNet(video_in, args.v_lstm_hidden_size, args.video_out,
                                      num_layers=args.v_lstm_layers, dropout=args.v_lstm_dropout)

        # the post_fusion layers
        self.post_fusion_dropout = nn.Dropout(p=args.post_fusion_dropout)
        self.post_fusion_layer_1 = nn.Linear(
            args.text_out + args.video_out + args.audio_out,
            args.post_fusion_dim)
        self.post_fusion_layer_2 = nn.Linear(args.post_fusion_dim,
                                             args.post_fusion_dim)
        self.post_fusion_layer_3 = nn.Linear(args.post_fusion_dim, 1)

        # the classify layer for text
        self.post_text_dropout = nn.Dropout(p=args.post_text_dropout)
        self.post_text_layer_1 = nn.Linear(args.text_out, args.post_text_dim)
        self.post_text_layer_2 = nn.Linear(args.post_text_dim,
                                           args.post_text_dim)
        self.post_text_layer_3 = nn.Linear(args.post_text_dim, 1)

        # the classify layer for audio
        self.post_audio_dropout = nn.Dropout(p=args.post_audio_dropout)
        self.post_audio_layer_1 = nn.Linear(args.audio_out,
                                            args.post_audio_dim)
        self.post_audio_layer_2 = nn.Linear(args.post_audio_dim,
                                            args.post_audio_dim)
        self.post_audio_layer_3 = nn.Linear(args.post_audio_dim, 1)
        # test
        self.audio_classifier = nn.Linear(args.audio_out, 1)

        # the classify layer for video
        self.post_video_dropout = nn.Dropout(p=args.post_video_dropout)
        self.post_video_layer_1 = nn.Linear(args.video_out,
                                            args.post_video_dim)
        self.post_video_layer_2 = nn.Linear(args.post_video_dim,
                                            args.post_video_dim)
        self.post_video_layer_3 = nn.Linear(args.post_video_dim, 1)
        # test
        self.video_classifier = nn.Linear(args.video_out, 1)
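The forward pass is not part of this snippet; the sketch below shows, under stand-in values for args.text_out, args.audio_out, args.video_out and args.post_fusion_dim, how a fusion head with these layer shapes would consume the concatenated unimodal representations (late fusion followed by a single-value regression head). It is a minimal illustration, not the model's actual forward method.

import torch
import torch.nn as nn
import torch.nn.functional as F

# hypothetical dimensions standing in for the args.* hyperparameters
text_out, audio_out, video_out, post_fusion_dim = 768, 16, 32, 128
batch = 4

post_fusion_dropout = nn.Dropout(p=0.1)
post_fusion_layer_1 = nn.Linear(text_out + audio_out + video_out, post_fusion_dim)
post_fusion_layer_2 = nn.Linear(post_fusion_dim, post_fusion_dim)
post_fusion_layer_3 = nn.Linear(post_fusion_dim, 1)

text_h = torch.randn(batch, text_out)
audio_h = torch.randn(batch, audio_out)
video_h = torch.randn(batch, video_out)

# concatenate the pooled unimodal representations, then regress a single score
fusion_h = torch.cat([text_h, audio_h, video_h], dim=-1)
fusion_h = post_fusion_dropout(fusion_h)
fusion_h = F.relu(post_fusion_layer_1(fusion_h))
fusion_h = F.relu(post_fusion_layer_2(fusion_h))
output_fusion = post_fusion_layer_3(fusion_h)
print(output_fusion.shape)  # torch.Size([4, 1])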
Example #3
    def __init__(self, args):
        super(MULT, self).__init__()
        if args.use_bert:
            # text subnets
            self.text_model = BertTextEncoder(language=args.language,
                                              use_finetune=args.use_finetune)
        self.use_bert = args.use_bert
        # Mult Model Initialization.
        dst_feature_dims, nheads = args.dst_feature_dim_nheads_1, args.dst_feature_dim_nheads_2
        self.orig_d_l, self.orig_d_a, self.orig_d_v = args.feature_dims
        self.d_l = self.d_a = self.d_v = dst_feature_dims
        self.num_heads = nheads
        self.layers = args.nlevels
        self.attn_dropout = args.attn_dropout
        self.attn_dropout_a = args.attn_dropout_a
        self.attn_dropout_v = args.attn_dropout_v
        self.relu_dropout = args.relu_dropout
        self.embed_dropout = args.embed_dropout
        self.res_dropout = args.res_dropout
        self.output_dropout = args.output_dropout
        self.text_dropout = args.text_dropout
        self.attn_mask = args.attn_mask

        combined_dim = 2 * (self.d_l + self.d_a + self.d_v)

        output_dim = args.num_classes  # This is actually not a hyperparameter :-)

        # 1. Temporal convolutional layers
        self.proj_l = nn.Conv1d(self.orig_d_l,
                                self.d_l,
                                kernel_size=args.conv1d_kernel_size_l,
                                padding=0,
                                bias=False)
        self.proj_a = nn.Conv1d(self.orig_d_a,
                                self.d_a,
                                kernel_size=args.conv1d_kernel_size_a,
                                padding=0,
                                bias=False)
        self.proj_v = nn.Conv1d(self.orig_d_v,
                                self.d_v,
                                kernel_size=args.conv1d_kernel_size_v,
                                padding=0,
                                bias=False)

        # 2. Crossmodal Attentions
        self.trans_l_with_a = self.get_network(self_type='la')
        self.trans_l_with_v = self.get_network(self_type='lv')

        self.trans_a_with_l = self.get_network(self_type='al')
        self.trans_a_with_v = self.get_network(self_type='av')

        self.trans_v_with_l = self.get_network(self_type='vl')
        self.trans_v_with_a = self.get_network(self_type='va')

        # 3. Self Attentions (Could be replaced by LSTMs, GRUs, etc.)
        #    [e.g., self.trans_x_mem = nn.LSTM(self.d_x, self.d_x, 1)]
        self.trans_l_mem = self.get_network(self_type='l_mem', layers=3)
        self.trans_a_mem = self.get_network(self_type='a_mem', layers=3)
        self.trans_v_mem = self.get_network(self_type='v_mem', layers=3)

        # Projection layers
        # print("combined_dim: shape")
        # print(combined_dim)
        self.proj1 = nn.Linear(combined_dim, combined_dim)
        self.proj2 = nn.Linear(combined_dim, combined_dim)
        self.out_layer = nn.Linear(combined_dim, output_dim)
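nn.Conv1d expects channel-first input, so features arriving as (batch, seq_len, feature_dim) have to be transposed before the projection. A small stand-alone sketch with placeholder sizes (not the model's actual configuration):

import torch
import torch.nn as nn

orig_d_l, d_l, kernel_size = 768, 30, 1       # placeholders for args.feature_dims[0] etc.
batch, seq_len = 4, 50

proj_l = nn.Conv1d(orig_d_l, d_l, kernel_size=kernel_size, padding=0, bias=False)

x_l = torch.randn(batch, seq_len, orig_d_l)   # (batch, seq, feature)
proj_x_l = proj_l(x_l.transpose(1, 2))        # Conv1d wants (batch, channels, seq)
print(proj_x_l.shape)                         # torch.Size([4, 30, 50]) when kernel_size=1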
Example #4
    def __init__(self, config):
        super(MISA, self).__init__()

        assert config.use_bert

        self.config = config
        self.text_size = config.feature_dims[0]
        self.visual_size = config.feature_dims[2]
        self.acoustic_size = config.feature_dims[1]

        self.input_sizes = input_sizes = [
            self.text_size, self.visual_size, self.acoustic_size
        ]
        self.hidden_sizes = hidden_sizes = [
            int(self.text_size),
            int(self.visual_size),
            int(self.acoustic_size)
        ]
        self.output_size = output_size = config.num_classes if config.train_mode == "classification" else 1
        self.dropout_rate = dropout_rate = config.dropout
        self.activation = nn.ReLU()
        self.tanh = nn.Tanh()

        rnn = nn.LSTM if self.config.rnncell == "lstm" else nn.GRU
        # defining modules - two layer bidirectional LSTM with layer norm in between

        if config.use_bert:
            # text subnets
            self.bertmodel = BertTextEncoder(language=config.language,
                                             use_finetune=config.use_finetune)

        self.vrnn1 = rnn(input_sizes[1], hidden_sizes[1], bidirectional=True)
        self.vrnn2 = rnn(2 * hidden_sizes[1],
                         hidden_sizes[1],
                         bidirectional=True)

        self.arnn1 = rnn(input_sizes[2], hidden_sizes[2], bidirectional=True)
        self.arnn2 = rnn(2 * hidden_sizes[2],
                         hidden_sizes[2],
                         bidirectional=True)

        ##########################################
        # mapping modalities to same sized space
        ##########################################
        if self.config.use_bert:
            self.project_t = nn.Sequential()
            self.project_t.add_module(
                'project_t',
                nn.Linear(in_features=768, out_features=config.hidden_size))
            self.project_t.add_module('project_t_activation', self.activation)
            self.project_t.add_module('project_t_layer_norm',
                                      nn.LayerNorm(config.hidden_size))
        else:
            self.project_t = nn.Sequential()
            self.project_t.add_module(
                'project_t',
                nn.Linear(in_features=hidden_sizes[0] * 4,
                          out_features=config.hidden_size))
            self.project_t.add_module('project_t_activation', self.activation)
            self.project_t.add_module('project_t_layer_norm',
                                      nn.LayerNorm(config.hidden_size))

        self.project_v = nn.Sequential()
        self.project_v.add_module(
            'project_v',
            nn.Linear(in_features=hidden_sizes[1] * 4,
                      out_features=config.hidden_size))
        self.project_v.add_module('project_v_activation', self.activation)
        self.project_v.add_module('project_v_layer_norm',
                                  nn.LayerNorm(config.hidden_size))

        self.project_a = nn.Sequential()
        self.project_a.add_module(
            'project_a',
            nn.Linear(in_features=hidden_sizes[2] * 4,
                      out_features=config.hidden_size))
        self.project_a.add_module('project_a_activation', self.activation)
        self.project_a.add_module('project_a_layer_norm',
                                  nn.LayerNorm(config.hidden_size))

        ##########################################
        # private encoders
        ##########################################
        self.private_t = nn.Sequential()
        self.private_t.add_module(
            'private_t_1',
            nn.Linear(in_features=config.hidden_size,
                      out_features=config.hidden_size))
        self.private_t.add_module('private_t_activation_1', nn.Sigmoid())

        self.private_v = nn.Sequential()
        self.private_v.add_module(
            'private_v_1',
            nn.Linear(in_features=config.hidden_size,
                      out_features=config.hidden_size))
        self.private_v.add_module('private_v_activation_1', nn.Sigmoid())

        self.private_a = nn.Sequential()
        self.private_a.add_module(
            'private_a_3',
            nn.Linear(in_features=config.hidden_size,
                      out_features=config.hidden_size))
        self.private_a.add_module('private_a_activation_3', nn.Sigmoid())

        ##########################################
        # shared encoder
        ##########################################
        self.shared = nn.Sequential()
        self.shared.add_module(
            'shared_1',
            nn.Linear(in_features=config.hidden_size,
                      out_features=config.hidden_size))
        self.shared.add_module('shared_activation_1', nn.Sigmoid())

        ##########################################
        # reconstruct
        ##########################################
        self.recon_t = nn.Sequential()
        self.recon_t.add_module(
            'recon_t_1',
            nn.Linear(in_features=config.hidden_size,
                      out_features=config.hidden_size))
        self.recon_v = nn.Sequential()
        self.recon_v.add_module(
            'recon_v_1',
            nn.Linear(in_features=config.hidden_size,
                      out_features=config.hidden_size))
        self.recon_a = nn.Sequential()
        self.recon_a.add_module(
            'recon_a_1',
            nn.Linear(in_features=config.hidden_size,
                      out_features=config.hidden_size))

        ##########################################
        # shared space adversarial discriminator
        ##########################################
        if not self.config.use_cmd_sim:
            self.discriminator = nn.Sequential()
            self.discriminator.add_module(
                'discriminator_layer_1',
                nn.Linear(in_features=config.hidden_size,
                          out_features=config.hidden_size))
            self.discriminator.add_module('discriminator_layer_1_activation',
                                          self.activation)
            self.discriminator.add_module('discriminator_layer_1_dropout',
                                          nn.Dropout(dropout_rate))
            self.discriminator.add_module(
                'discriminator_layer_2',
                nn.Linear(in_features=config.hidden_size,
                          out_features=len(hidden_sizes)))

        ##########################################
        # shared-private collaborative discriminator
        ##########################################

        self.sp_discriminator = nn.Sequential()
        self.sp_discriminator.add_module(
            'sp_discriminator_layer_1',
            nn.Linear(in_features=config.hidden_size, out_features=4))

        self.fusion = nn.Sequential()
        self.fusion.add_module(
            'fusion_layer_1',
            nn.Linear(in_features=self.config.hidden_size * 6,
                      out_features=self.config.hidden_size * 3))
        self.fusion.add_module('fusion_layer_1_dropout',
                               nn.Dropout(dropout_rate))
        self.fusion.add_module('fusion_layer_1_activation', self.activation)
        self.fusion.add_module(
            'fusion_layer_3',
            nn.Linear(in_features=self.config.hidden_size * 3,
                      out_features=output_size))

        self.tlayer_norm = nn.LayerNorm((hidden_sizes[0] * 2, ))
        self.vlayer_norm = nn.LayerNorm((hidden_sizes[1] * 2, ))
        self.alayer_norm = nn.LayerNorm((hidden_sizes[2] * 2, ))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.config.hidden_size, nhead=2)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer,
                                                         num_layers=1)
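The modules built above split each modality into a projection into a common space, a private (modality-specific) encoder and a shared encoder. A minimal sketch of that flow for the text branch alone, with hypothetical sizes in place of config.hidden_size and the 768-dimensional pooled BERT vector:

import torch
import torch.nn as nn

hidden_size, batch = 128, 4                  # placeholder for config.hidden_size

project_t = nn.Sequential(
    nn.Linear(768, hidden_size),
    nn.ReLU(),
    nn.LayerNorm(hidden_size),
)
private_t = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.Sigmoid())
shared = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.Sigmoid())

utterance_t = torch.randn(batch, 768)        # e.g. a pooled BERT sentence embedding
h_t = project_t(utterance_t)                 # map the modality into the common space
h_t_private = private_t(h_t)                 # modality-specific representation
h_t_shared = shared(h_t)                     # modality-invariant representation
print(h_t_private.shape, h_t_shared.shape)   # torch.Size([4, 128]) twice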
Example #5
class TextPre(object):
    """A single set of features of data."""

    def __init__(self, args):
        self.device = torch.device('cuda:0')
        self.args = args
        self.loadTextMap = {
            'mosi': self.__load_data_mosi,
            'mosei': self.__load_data_mosei
        }
        self.bert = BertTextEncoder(language=args.language).to(self.device)
    
    def textConvertID(self, data, tokenizer):
        features = {}
        Input_ids, Input_mask, Segment_ids = [], [], []
        Raw_text, Visual, Audio = [], [], []
        Label, ids = [], []
        max_seq_length = self.args.max_seq_length
        for i in tqdm(range(len(data['raw_text']))):
            raw_text = data['raw_text'][i]
            visual = data['vision'][i]
            audio = data['audio'][i]
            tokens_a, inversions_a = tokenizer.tokenize(raw_text, invertable=True)
            
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:max_seq_length - 2]
                inversions_a = inversions_a[:max_seq_length - 2]
            
            tokens = ["[CLS]"] + tokens_a + ["[SEP]"]

            segment_ids = [0] * len(tokens)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            input_mask = [1] * len(input_ids)
            padding = [0] * (max_seq_length - len(input_ids))

            if self.args.aligned:
                text_len = min(len(raw_text.split()), max_seq_length)
                new_visual = [visual[len(visual) - text_len + inv_id] for inv_id in inversions_a]
                new_audio = [audio[len(audio) - text_len + inv_id] for inv_id in inversions_a]

                visual = np.array(new_visual)
                audio = np.array(new_audio)

                # add "start" and "end" for audio and vision
                audio_zero = np.zeros((1, audio.shape[1]))
                audio = np.concatenate((audio_zero, audio, audio_zero))

                visual_zero = np.zeros((1, visual.shape[1]))
                visual = np.concatenate((visual_zero, visual, visual_zero))

                audio_padding = np.zeros((max_seq_length - len(input_ids), audio.shape[1]))
                audio = np.concatenate((audio, audio_padding))

                video_padding = np.zeros((max_seq_length - len(input_ids), visual.shape[1]))
                visual = np.concatenate((visual, video_padding))

                assert audio.shape[0] == max_seq_length
                assert visual.shape[0] == max_seq_length

            input_ids += padding
            input_mask += padding
            segment_ids += padding

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            label = float(data['labels'][i])

            Input_ids.append(input_ids)
            Visual.append(visual)
            Audio.append(audio)
            Input_mask.append(input_mask)
            Segment_ids.append(segment_ids)
            Label.append(label)
            Raw_text.append(raw_text)
            ids.append(data['id'][i])

        features['raw_text'] = np.array(Raw_text)
        features['audio'] = np.array(Audio)
        features['vision'] = np.array(Visual)
        features['labels'] = np.array(Label)
        features['id'] = np.array(ids)
        # stack ids / mask / segment ids along a new axis -> (num_samples, 3, max_seq_length)
        Input_ids = np.expand_dims(Input_ids, 1)
        Input_mask = np.expand_dims(Input_mask, 1)
        Segment_ids = np.expand_dims(Segment_ids, 1)
        text_bert = np.concatenate((Input_ids, Input_mask, Segment_ids), axis=1)
        features['text_bert'] = text_bert
        features['text'] = self.__convertID2Vector(text_bert)
        return features
    
    def __convertID2Vector(self, ids, batch_size=64):
        # run BERT over the id/mask/segment triples in mini-batches to bound GPU memory
        results = []
        ids = torch.Tensor(ids)
        for left in tqdm(range(0, ids.size(0), batch_size)):
            right = min(left + batch_size, ids.size(0))
            c_ids = ids[left:right].to(self.device)
            c_vector = self.bert(c_ids).detach().cpu().numpy()
            results.append(c_vector)
        results = np.concatenate(results, axis=0)
        return results
    
    def __load_data_mosi(self):
        # get text data
        link = os.path.join(self.args.data_dir, 'Raw/Transcript/Segmented')
        text_data = {}
        for file in os.listdir(link):
            name = file.split('.')[0]
            for line in open(os.path.join(link, file), "r"):
                num_id, cur_t = line.split('_DELIM_')
                name_id = name + '_' + num_id.strip()
                text_data[name_id] = cur_t.strip()
        # get data
        def matchData(mode='train'):
            r_text = []
            for cur_id in data[mode]['id']:
                r_text.append(text_data[cur_id[0]])
            data[mode]['raw_text'] = r_text
        
        with open(os.path.join(self.args.data_dir, 'Processed/mosei_senti_data_noalign.pkl'), 'rb') as lf:
            data = pickle.load(lf)
        
        matchData(mode='train')
        matchData(mode='valid')
        matchData(mode='test')

        return data
    
    def __load_data_mosei(self):
        def convert0(s):
            if s == '0':
                return '0.0'
            return s
        # get text data
        link = os.path.join(self.args.data_dir, 'Raw/Transcript/Segmented')
        text_data = {}
        for file in os.listdir(link):
            name = file.split('.')[0]
            for line in open(os.path.join(link, file), "r"):
                items = line.split('___')
                # rebuild the segment id from the clip name and the two numeric fields;
                # convert0 keeps a bare '0' as '0.0' so the key matches data[mode]['id']
                name_id = items[0] + '_' + convert0(items[2]) + '_' + convert0(items[3])
                text_data[name_id.strip()] = items[-1].strip()
        # get data
        def matchData(mode='train'):
            r_text = []
            for cur_id in data[mode]['id']:
                name = '_'.join(cur_id)
                r_text.append(text_data[name])
            data[mode]['raw_text'] = r_text
        
        with open(os.path.join(self.args.data_dir, 'Processed/mosei_senti_data_noalign.pkl'), 'rb') as lf:
            data = pickle.load(lf)
        
        matchData(mode='train')
        matchData(mode='valid')
        matchData(mode='test')

        return data

    def run(self):
        data = self.loadTextMap[self.args.datasetName]()

        train_list = data['train']
        valid_list = data['valid']
        test_list = data['test']

        tokenizer = self.bert.get_tokenizer()

        save_data = {}
        save_data['train'] = self.textConvertID(train_list, tokenizer)
        save_data['valid'] = self.textConvertID(valid_list, tokenizer)
        save_data['test'] = self.textConvertID(test_list, tokenizer)

        if self.args.aligned:
            saved_path = os.path.join(self.args.save_dir, 'aligned_' + str(self.args.max_seq_length) + '.pkl')
        else:
            saved_path = os.path.join(self.args.save_dir, 'unaligned_' + str(self.args.max_seq_length) + '.pkl')
        
        if not os.path.exists(os.path.dirname(saved_path)):
            os.makedirs(os.path.dirname(saved_path))

        with open(saved_path, 'wb') as file:
            pickle.dump(save_data, file, protocol=4)
            print('Save Successful!')
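A hypothetical driver for this class, assuming TextPre and BertTextEncoder are importable from the project and that the raw CMU-MOSI/MOSEI transcripts and the processed pickle are laid out under data_dir as the loaders above expect; every path and value below is a placeholder:

from types import SimpleNamespace

args = SimpleNamespace(
    datasetName='mosi',            # selects the loader via loadTextMap
    language='en',                 # forwarded to BertTextEncoder
    data_dir='/path/to/MOSI',      # must contain Raw/Transcript/Segmented and Processed/
    save_dir='/path/to/output',
    max_seq_length=50,
    aligned=True,
)
TextPre(args).run()                # writes aligned_50.pkl into save_dir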