Example No. 1
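# Assumed context (not shown in these snippets): numpy as np, torch,
# torch.autograd as autograd, plus project helpers such as
# get_minibatches_idx, get_mask_test, recover_pytorch_idmatrix_2_text,
# sent_to_embedding_last4, and globals like config, use_bert,
# bert_tokenizer and bert_model.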
def test(training_data_index_seq, training_masks_seq, test_lines_seq, model,
         label_sent, label_mask, id2word):
    # model.load_state_dict(torch.load(save_model_path))
    model.eval()
    '''minibatches_idx here already accounts for the remainder samples'''
    data_id = 0
    for training_data_index, training_masks, test_lines in zip(
            training_data_index_seq, training_masks_seq, test_lines_seq):

        minibatches_idx = get_minibatches_idx(
            len(training_data_index),
            minibatch_size=config['batch_size'],
            shuffle=False)
        n_test_remain = len(training_data_index) % config['batch_size']  # real samples in the final (padded) minibatch
        pred_types = []
        pred_confs = []
        pred_others = []
        # Text_Lines = []
        with torch.no_grad():
            for i, minibatch in enumerate(minibatches_idx):
                '''Padding is done only after each minibatch is formed, which seems a bit inefficient'''
                sentence_batch, mask_batch = get_mask_test(
                    training_data_index, training_masks, minibatch)
                sentence_batch = autograd.Variable(
                    torch.cuda.LongTensor(sentence_batch))
                mask_batch = autograd.Variable(
                    torch.cuda.FloatTensor(mask_batch))
                '''Sort the samples within a minibatch by length in descending order'''
                '''dim=-1 refers to the last dimension'''
                lengths_batch = mask_batch.sum(dim=-1)  # 1-D tensor of lengths
                seq_lengths, seq_idx = lengths_batch.sort(
                    0, descending=True)  # sorted lengths and the sort permutation
                seq_lengths = seq_lengths.int().data.tolist()

                sentence_batch = sentence_batch[seq_idx]
                mask_batch = mask_batch[seq_idx]
                '''map the reordered seq_idx back to the original order'''
                seq_idx_2_list = seq_idx.int().data.tolist()
                return_map = {val: i for i, val in enumerate(seq_idx_2_list)}
                recover_seq_idx = [
                    return_map[i] for i in range(len(seq_idx_2_list))
                ]
                recover_seq_idx = autograd.Variable(
                    torch.cuda.LongTensor(np.array(recover_seq_idx)))

                # reordered_text_lines = [text_lines_batch[id] for id in seq_idx.int().data.tolist()]
                '''targets_batch is array'''
                # targets_batch = targets_batch[list(seq_idx.cpu().numpy())]
                sent_list = recover_pytorch_idmatrix_2_text(
                    sentence_batch, id2word)
                bert_rep_batch = []
                if use_bert:
                    for sent in sent_list:
                        bert_rep = sent_to_embedding_last4(
                            sent, bert_tokenizer, bert_model, True)
                        bert_rep_batch.append(bert_rep.reshape(1, -1))
                    bert_rep_batch = torch.cat(bert_rep_batch,
                                               0)  #(batch, 768)

                tag_scores, tag_scores_task2 = model(sentence_batch,
                                                     seq_lengths, mask_batch,
                                                     label_sent, label_mask,
                                                     bert_rep_batch)
                '''recover the order'''
                tag_scores = tag_scores[recover_seq_idx]
                tag_scores_task2 = (tag_scores_task2.reshape(
                    len(minibatch), 4, 4))[recover_seq_idx]

                tag_scores_2_array = tag_scores.cpu().numpy()
                mean = np.mean(tag_scores_2_array)
                pred_labels = np.where(tag_scores_2_array > mean, 1,
                                       0)  # 17.10/ 33.5
                pred_conf = tag_scores_2_array
                '''scores are already back in the original order'''
                pred_other = tag_scores_task2.cpu().numpy()

                if i < len(minibatches_idx) - 1:
                    pred_types.append(pred_labels)
                    pred_confs.append(pred_conf)
                    pred_others.append(pred_other)
                    # Text_Lines+=text_lines_batch
                else:
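                    # when n_test_remain == 0 the slice [-0:] keeps the whole
                    # (full) final batch, which is the correct behaviour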
                    pred_types.append(pred_labels[-n_test_remain:])
                    pred_confs.append(pred_conf[-n_test_remain:])
                    pred_others.append(pred_other[-n_test_remain:])
                    # Text_Lines+=text_lines_batch[-n_test_remain:]

        pred_types = np.concatenate(pred_types, axis=0)
        pred_confs = np.concatenate(pred_confs, axis=0)
        pred_others = np.concatenate(pred_others, axis=0)

        # test_mean_f1, test_weight_f1 =average_f1_two_array_by_col(pred_types, np.array(testing_labels))
        # print('test over, test_mean_f1:', test_mean_f1, 'test_weight_f1:', test_weight_f1)
        '''starting generate official output'''

        min_mean_frame = 100.0  # note: re-initialised for every dataset, so this effectively tracks only the current mean_frame
        output_file_path = output_file_head + output_file_path_codes[
            data_id] + '.json'
        print('generating ...', output_file_path)
        mean_frame = generate_2019_official_output(test_lines,
                                                   output_file_path,
                                                   pred_types, pred_confs,
                                                   pred_others)
        if mean_frame < min_mean_frame:
            min_mean_frame = mean_frame
        print('\t\t\t test  over, min_mean_frame:', min_mean_frame)
        validate_output_schema(output_file_path, 'LoReHLT19-schema_V1.json')
        data_id += 1
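
A minimal, self-contained sketch of the sort-by-length / recover-order pattern the snippet above relies on (the tensor values and the helper name are illustrative, not from the source):

import torch

def sort_and_recover(mask_batch):
    # lengths from the 0/1 mask; sorted descending, as pack_padded_sequence expects
    lengths = mask_batch.sum(dim=-1)
    seq_lengths, seq_idx = lengths.sort(0, descending=True)
    # argsort of a permutation yields the indices that undo it, which is what
    # the return_map / recover_seq_idx construction above computes by hand
    recover_seq_idx = seq_idx.argsort(0)
    return seq_lengths.int().tolist(), seq_idx, recover_seq_idx

mask = torch.tensor([[1., 1., 0.], [1., 1., 1.], [1., 0., 0.]])
lens, idx, rec = sort_and_recover(mask)
assert torch.equal(mask[idx][rec], mask)  # original order recovered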
Example No. 2
def test(training_data_index, training_masks, testing_labels, model,
         label_sent, label_mask, id2word):

    model.eval()
    '''minibatches_idx here already accounts for the remainder samples'''
    # output_file_path = '/scratch/wyin3/dickens_save_dataset/LORELEI/il3_Uyghur/il3_system_output.json'
    output_file_path = 'il9_system_output.json'
    minibatches_idx = get_minibatches_idx(len(training_data_index),
                                          minibatch_size=config['batch_size'],
                                          shuffle=False)
    n_test_remain = len(training_data_index) % config['batch_size']  # real samples in the final (padded) minibatch
    pred_types = []
    pred_confs = []
    pred_others = []
    # Text_Lines = []
    with torch.no_grad():
        for i, minibatch in enumerate(minibatches_idx):
            '''Padding is done only after each minibatch is formed, which seems a bit inefficient'''
            sentence_batch, mask_batch, label_batch = get_mask(
                training_data_index, training_masks, testing_labels, minibatch)
            sentence_batch = autograd.Variable(
                torch.cuda.LongTensor(sentence_batch))
            mask_batch = autograd.Variable(torch.cuda.FloatTensor(mask_batch))
            '''Sort the samples within a minibatch by length in descending order'''
            '''dim=-1 refers to the last dimension'''
            lengths_batch = mask_batch.sum(dim=-1)  # 1-D tensor of lengths
            seq_lengths, seq_idx = lengths_batch.sort(
                0, descending=True)  # sorted lengths and the sort permutation
            seq_lengths = seq_lengths.int().data.tolist()

            sentence_batch = sentence_batch[seq_idx]
            mask_batch = mask_batch[seq_idx]
            '''map the reordered seq_idx back to the original order'''
            seq_idx_2_list = seq_idx.int().data.tolist()
            return_map = {val: i for i, val in enumerate(seq_idx_2_list)}
            recover_seq_idx = [
                return_map[i] for i in range(len(seq_idx_2_list))
            ]
            recover_seq_idx = autograd.Variable(
                torch.cuda.LongTensor(np.array(recover_seq_idx)))

            # reordered_text_lines = [text_lines_batch[id] for id in seq_idx.int().data.tolist()]
            '''targets_batch is array'''
            # targets_batch = targets_batch[list(seq_idx.cpu().numpy())]
            sent_list = recover_pytorch_idmatrix_2_text(
                sentence_batch, id2word)
            bert_rep_batch = []
            bert2_rep_batch = []
            if use_bert:
                for sent in sent_list:
                    bert_rep = sent_to_embedding_last4(sent, bert_tokenizer,
                                                       bert_model, True)
                    bert_rep_batch.append(bert_rep.reshape(1, -1))
                    bert2_rep = sent_to_embedding_last4(
                        sent, bert2_tokenizer, bert2_model, True)
                    bert2_rep_batch.append(bert2_rep.reshape(1, -1))
                bert_rep_batch = torch.cat(bert_rep_batch, 0)  #(batch, 768)
                bert2_rep_batch = torch.cat(bert2_rep_batch, 0)  #(batch, 768)

            tag_scores, tag_scores_task2 = model(sentence_batch, seq_lengths,
                                                 mask_batch, label_sent,
                                                 label_mask, bert_rep_batch,
                                                 bert2_rep_batch)
            '''recover the order'''
            tag_scores = tag_scores[recover_seq_idx]
            tag_scores_task2 = (tag_scores_task2.reshape(len(minibatch), 4,
                                                         4))[recover_seq_idx]

            tag_scores_2_array = tag_scores.cpu().numpy()
            mean = np.mean(tag_scores_2_array)
            pred_labels = np.where(tag_scores_2_array > mean, 1,
                                   0)  # 17.10/ 33.5
            pred_conf = tag_scores_2_array
            '''scores are already back in the original order'''
            pred_other = tag_scores_task2.cpu().numpy()

            if i < len(minibatches_idx) - 1:
                pred_types.append(pred_labels)
                pred_confs.append(pred_conf)
                pred_others.append(pred_other)
                # Text_Lines+=text_lines_batch
            else:
                pred_types.append(pred_labels[-n_test_remain:])
                pred_confs.append(pred_conf[-n_test_remain:])
                pred_others.append(pred_other[-n_test_remain:])
                # Text_Lines+=text_lines_batch[-n_test_remain:]

    pred_types = np.concatenate(pred_types, axis=0)
    pred_confs = np.concatenate(pred_confs, axis=0)
    pred_others = np.concatenate(pred_others, axis=0)

    test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
        pred_types, np.array(testing_labels))
    print('test over, test_mean_f1:', test_mean_f1, 'test_weight_f1:',
          test_weight_f1)
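
A hedged sketch of the prediction-collection scheme shared by both test functions above: scores above the batch mean become positive labels, and only the final n_remain rows of the last (padded) minibatch are kept. Sizes and data are made up for illustration:

import numpy as np

batch_size = 4
n_total = 10
n_remain = n_total % batch_size            # 2 real samples in the last batch

scores_per_batch = [np.random.rand(batch_size, 3) for _ in range(3)]
pred_types = []
for i, scores in enumerate(scores_per_batch):
    mean = scores.mean()                   # one scalar threshold per batch
    labels = np.where(scores > mean, 1, 0)
    if i < len(scores_per_batch) - 1:
        pred_types.append(labels)
    else:
        pred_types.append(labels[-n_remain:])   # drop the padded rows

pred_types = np.concatenate(pred_types, axis=0)
assert pred_types.shape[0] == n_total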
Example No. 3
def train(task1_data, task2_data, test_data, label_sent, label_mask, id2word,
          epoch_num, model, loss_function, optimizer):
    '''Combine train set and dev set.

    Unpacks task1_data, task2_data and test_data below.
    '''
    training_data_index, training_masks, training_labels = task1_data
    training_data_task2_index, training_task2_masks, training_task2_labels, train_task2_other_labels = task2_data
    testing_data_index, testing_masks, test_lines = test_data

    label_sent = autograd.Variable(torch.cuda.LongTensor(label_sent))
    label_mask = autograd.Variable(torch.cuda.FloatTensor(label_mask))

    print("training...")
    n_iter = 0  # global step counter across epochs (renamed from iter, which shadows the builtin)
    for epoch in range(epoch_num):

        print('current epoch: ', epoch)
        minibatches_idx = get_minibatches_idx(
            len(training_data_index),
            minibatch_size=config['batch_size'],
            shuffle=True)
        minibatches_idx_task2 = get_minibatches_idx(
            len(training_data_task2_index),
            minibatch_size=config['batch_size'],
            shuffle=True)
        for i, minibatch in enumerate(minibatches_idx):
            model.train()
            '''Padding is done only after each minibatch is formed, which seems a bit inefficient'''
            sentence_batch, mask_batch, targets_batch = get_mask(
                training_data_index, training_masks, training_labels,
                minibatch)
            sentence_batch = autograd.Variable(
                torch.cuda.LongTensor(sentence_batch))
            targets_batch = autograd.Variable(
                torch.cuda.FloatTensor(targets_batch))
            mask_batch = autograd.Variable(torch.cuda.FloatTensor(mask_batch))
            '''dim=-1 refers to the last dimension'''
            lengths_batch = mask_batch.sum(dim=-1)  # 1-D tensor of lengths
            seq_lengths, seq_idx = lengths_batch.sort(
                0, descending=True)  # sorted lengths and the sort permutation
            seq_lengths = seq_lengths.int().data.tolist()

            sentence_batch = sentence_batch[seq_idx]
            targets_batch = targets_batch[seq_idx]
            mask_batch = mask_batch[seq_idx]
            model.zero_grad()
            '''Bert'''

            sent_list = recover_pytorch_idmatrix_2_text(
                sentence_batch, id2word)
            bert_rep_batch = []
            if use_bert:
                for sent in sent_list:
                    bert_rep = sent_to_embedding_last4(sent, bert_tokenizer,
                                                       bert_model, True)
                    bert_rep_batch.append(bert_rep.reshape(1, -1))
                bert_rep_batch = torch.cat(bert_rep_batch, 0)  #(batch, 768)

            tag_scores, _ = model(sentence_batch, seq_lengths, mask_batch,
                                  label_sent, label_mask, bert_rep_batch)
            '''Binary Cross Entropy'''
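            # torch_where (presumably a torch.where wrapper) picks 1 - score for
            # negative targets and score for positive ones, so the line below is
            # a hand-rolled binary cross-entropy over all labels but the last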

            temp_loss_matrix = torch_where(
                targets_batch[:, :-1].reshape(-1) < 1,
                1.0 - tag_scores[:, :-1].reshape(-1),
                tag_scores[:, :-1].reshape(-1))
            loss = -torch.mean(torch.log(temp_loss_matrix))
            loss.backward()
            optimizer.step()
            '''task2'''
            if i < len(minibatches_idx_task2):
                model.train()
                minibatch_task2 = minibatches_idx_task2[i]
                '''Padding is done only after each minibatch is formed, which seems a bit inefficient'''
                sentence_batch, mask_batch, targets_batch, others_batch = get_mask_task2(
                    training_data_task2_index, training_task2_masks,
                    training_task2_labels, train_task2_other_labels,
                    minibatch_task2)
                sentence_batch = autograd.Variable(
                    torch.cuda.LongTensor(sentence_batch))
                targets_batch = autograd.Variable(
                    torch.cuda.FloatTensor(targets_batch))
                mask_batch = autograd.Variable(
                    torch.cuda.FloatTensor(mask_batch))
                others_batch = autograd.Variable(
                    torch.cuda.LongTensor(others_batch))
                '''dim=-1 refers to the last dimension'''
                lengths_batch = mask_batch.sum(dim=-1)  # 1-D tensor of lengths
                seq_lengths, seq_idx = lengths_batch.sort(
                    0, descending=True)  # sorted lengths and the sort permutation
                seq_lengths = seq_lengths.int().data.tolist()

                sentence_batch = sentence_batch[seq_idx]
                targets_batch = targets_batch[seq_idx]
                mask_batch = mask_batch[seq_idx]
                others_batch = others_batch[seq_idx]
                model.zero_grad()

                sent_list = recover_pytorch_idmatrix_2_text(
                    sentence_batch, id2word)
                bert_rep_batch = []
                if use_bert:
                    for sent in sent_list:
                        bert_rep = sent_to_embedding_last4(
                            sent, bert_tokenizer, bert_model, True)
                        bert_rep_batch.append(bert_rep.reshape(1, -1))
                    bert_rep_batch = torch.cat(bert_rep_batch,
                                               0)  #(batch, 768)
                tag_scores, tag_scores_task2 = model(sentence_batch,
                                                     seq_lengths, mask_batch,
                                                     label_sent, label_mask,
                                                     bert_rep_batch)
                '''Binary Cross Entropy'''
                temp_loss_matrix = torch_where(
                    targets_batch[:, :-1].reshape(-1) < 1,
                    1.0 - tag_scores[:, :-1].reshape(-1),
                    tag_scores[:, :-1].reshape(-1))
                loss_task1 = -torch.mean(torch.log(temp_loss_matrix))
                '''task2 loss'''
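                # index_select over dim 1 gathers, for every row, the score
                # columns indexed by the flattened gold "other" labels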
                other_label_scores = tag_scores_task2.index_select(
                    1, others_batch.view(-1))
                loss_task2 = -torch.mean(torch.log(other_label_scores))
                loss = loss_task1 + loss_task2
                loss.backward()
                optimizer.step()

            n_iter += 1
            if n_iter % 20 == 0:
                print(n_iter, ' loss: ', loss.item())
                # if epoch == 3:
                #     torch.save(model.state_dict(), 'models_' + str(n_iter) + '.pt')

        # torch.save(model.state_dict(), save_model_path)
        # print('model saved succeed. train over')
        if epoch > 18 and (epoch + 1) % 10 == 0:
            print('testing....')
            test(testing_data_index, testing_masks, test_lines, model,
                 label_sent, label_mask, id2word)
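
A sketch of the hand-rolled binary cross-entropy used in train above, assuming tag_scores are already probabilities in (0, 1) and that torch_where wraps torch.where; the tensors are illustrative:

import torch

scores = torch.tensor([0.9, 0.2, 0.7])
targets = torch.tensor([1.0, 0.0, 1.0])

# probability assigned to the correct decision for each label
picked = torch.where(targets < 1, 1.0 - scores, scores)
loss = -torch.mean(torch.log(picked))

# agrees with the library implementation
expected = torch.nn.functional.binary_cross_entropy(scores, targets)
assert torch.allclose(loss, expected)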
Example No. 4
def test(training_data_index, training_masks, model, label_sent, label_mask, id2word):

    model.eval()
    '''minibatches_idx here already accounts for the remainder samples'''
    # output_file_path = '/scratch/wyin3/dickens_save_dataset/LORELEI/il3_Uyghur/il3_system_output.json'
    output_file_path = 'il3_uyghur_system_output.json'
    # minibatches_idx = get_minibatches_idx(len(training_data_index), minibatch_size=config['batch_size'], shuffle=False)
    # n_test_remain = len(training_data_index)%config['batch_size']
    pred_types = []
    pred_confs = []
    pred_others = []
    # Text_Lines = []
    with torch.no_grad():

        '''Padding is done only after each minibatch is formed, which seems a bit inefficient'''
        sentence_batch, mask_batch= get_mask_demo(training_data_index, training_masks, config['batch_size'])
        sentence_batch = autograd.Variable(torch.cuda.LongTensor(sentence_batch))
        mask_batch = autograd.Variable(torch.cuda.FloatTensor(mask_batch))

        '''Sort the samples within a minibatch by length in descending order'''
        '''dim=-1 refers to the last dimension'''
        lengths_batch = mask_batch.sum(dim=-1)  # 1-D tensor of lengths
        seq_lengths, seq_idx = lengths_batch.sort(0, descending=True)  # sorted lengths and permutation
        seq_lengths = seq_lengths.int().data.tolist()

        sentence_batch = sentence_batch[seq_idx]
        mask_batch = mask_batch[seq_idx]
        '''map the reordered seq_idx back to the original order'''
        seq_idx_2_list = seq_idx.int().data.tolist()
        return_map = {val: i for i, val in enumerate(seq_idx_2_list)}
        recover_seq_idx = [return_map[i] for i in range(len(seq_idx_2_list))]
        recover_seq_idx = autograd.Variable(
            torch.cuda.LongTensor(np.array(recover_seq_idx)))

        # reordered_text_lines = [text_lines_batch[id] for id in seq_idx.int().data.tolist()]
        '''targets_batch is array'''
        # targets_batch = targets_batch[list(seq_idx.cpu().numpy())]
        sent_list = recover_pytorch_idmatrix_2_text(sentence_batch, id2word)
        bert_rep_batch = []
        for sent in sent_list:
            bert_rep = sent_to_embedding_last4(sent, bert_tokenizer, bert_model, True)
            bert_rep_batch.append(bert_rep.reshape(1,-1))
        bert_rep_batch = torch.cat(bert_rep_batch, 0) #(batch, 768)

        tag_scores, _ = model(sentence_batch, seq_lengths, mask_batch, label_sent, label_mask, bert_rep_batch)
        '''recover the order'''
        tag_scores = tag_scores[recover_seq_idx]

        tag_scores_2_array = tag_scores.cpu().numpy() #(batch, 12)
        '''
            type2label_id = {'crimeviolence':8, 'med':3, 'search':4, 'food':1, 'out-of-domain':11, 'infra':2,
            'water':7, 'shelter':5, 'regimechange':9, 'evac':0, 'terrorism':10, 'utils':6}
        '''
        typ_2_score = {}
        for id, typ in enumerate([
                'evac', 'food', 'infra', 'med', 'search', 'shelter', 'utils',
                'water', 'crimeviolence', 'regimechange', 'terrorism'
        ]):
            typ_2_score[typ] = tag_scores_2_array[0][id]

        return typ_2_score
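
All four snippets share the same inference recipe: switch the model to eval mode and run the forward pass without gradient tracking. A minimal template, with model and batch as placeholders:

import torch

def predict(model, batch):
    model.eval()           # disables dropout, uses running BatchNorm stats
    with torch.no_grad():  # no autograd bookkeeping during inference
        return model(batch)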