padded_train_sequences, padded_dev_sequences, padded_test_sequences = util.get_train_dev_test_padded_sequences( maxlen=constant.MAX_LEN, train_sequences=train_sequences, dev_sequences=dev_sequences, test_sequences=test_sequences) # 2. embedding_matrix from config import glove_embedding_data_path num_words = constant.MAX_NUM_WORDS embedding_matrix = util.get_embedding_matrix( embedding_data_path=glove_embedding_data_path, embed_size=constant.EMBED_SIZE, tokenizer=tokenizer, num_words=constant.MAX_NUM_WORDS) # 3. model cnn_model = model.get_cnn_model(max_num_words=constant.MAX_NUM_WORDS, max_len=constant.MAX_LEN, embedding_dim_size=constant.EMBED_SIZE, embedding_matrix=embedding_matrix) # 4. train filepath = config.model_path + "cnn-ep-{epoch:02d}-val_acc-{val_acc:.4f}.hdf5" checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
# Fragment of the VUA metaphor-detection data-preparation script
# (continues beyond this chunk).
print('VUA dataset division: ', len(raw_train_vua), len(raw_test_vua))
""" 2. Data preparation """
''' 2. 1 get vocabulary and glove embeddings in raw dataset '''
# vocab is a set of words drawn from both the train and test splits.
vocab = get_vocab(raw_train_vua + raw_test_vua)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings: an nn.Embedding built from pretrained GloVe vectors
# (normalization disabled).
glove_embeddings = get_embedding_matrix(word2idx, idx2word,
                                        normalization=False)
# elmo_embeddings: precomputed ELMo vectors for the VUA train split (HDF5).
elmos_train_vua = h5py.File('../elmo/VUA_train2.hdf5', 'r')
# suffix_embeddings: number of suffix tag is 2, and the suffix embedding
# dimension is 50
suffix_embeddings = nn.Embedding(2, 50)
''' 2. 2 embed the datasets '''
# Fixed seed so the train-set shuffle is reproducible across runs.
random.seed(0)
random.shuffle(raw_train_vua)
# The HDF5 file stores a stringified dict literal mapping sentence text to
# its row index; ast.literal_eval parses it back into a Python dict.
sentence_to_index_train = ast.literal_eval(
    elmos_train_vua['sentence_to_index'][0])
# Statement truncated here — the list comprehension body is outside this chunk.
sentences = [
def main():
    """Command-line entry point for training / testing the RNN sequence classifier.

    Parses all CLI options, builds the vocabulary, embeddings, attention query
    matrix and model, then either trains (checkpointing the best model by dev
    macro-F1, with early stopping) or loads a saved checkpoint and evaluates
    it on the test set.

    Side effects: reads the dataset/vocab/ELMo files given on the command
    line; writes ``process.txt`` / ``result.txt`` / ``result.pkl`` under
    ``--output_result_path`` and appends to ``summary_result.txt`` under
    ``--summary_result_path``. Requires a CUDA device.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_w2v_model_path",
                        required=True,
                        type=str,
                        help="Path of the tence w2v pretrained model.")
    parser.add_argument("--query_matrix_path",
                        required=True,
                        type=str,
                        help="Path of the query matrix.")
    parser.add_argument("--summary_result_path",
                        required=True,
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--output_result_path",
                        required=True,
                        type=str,
                        help="Path of the output result.")
    parser.add_argument("--train_path",
                        type=str,
                        required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path",
                        type=str,
                        required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path",
                        type=str,
                        required=True,
                        help="Path of the testset.")
    parser.add_argument("--vocab_path",
                        type=str,
                        required=True,
                        help="Path of the vocab.")
    parser.add_argument("--elmo_path",
                        type=str,
                        required=True,
                        help="Path of the elmo features.")

    # Model options.
    # BUG FIX: help text was a copy-paste of --num_classes ("Num of the classes.").
    parser.add_argument("--language_type",
                        type=str,
                        choices=["en", "zh"],
                        required=True,
                        help="Language of the dataset.")
    parser.add_argument("--num_classes",
                        type=int,
                        default=3,
                        help="Num of the classes.")
    parser.add_argument("--batch_size",
                        type=int,
                        default=64,
                        help="Batch size.")
    parser.add_argument("--require_improvement",
                        type=int,
                        default=5,
                        help="Require improvement.")
    parser.add_argument("--epochs_num",
                        type=int,
                        default=100,
                        help="Number of epochs.")
    parser.add_argument("--w2v_embedding_dim",
                        type=int,
                        required=True,
                        help="w2v embedding dim.")
    parser.add_argument("--elmo_embedding_dim",
                        type=int,
                        default=1024,
                        help="elmo embedding dim.")
    parser.add_argument("--input_dim",
                        type=int,
                        required=True,
                        help="input embedding dim.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=128,
                        help="Sequence length.")
    parser.add_argument("--hidden_size",
                        type=int,
                        default=200,
                        help="hidden size.")
    parser.add_argument("--layers_num",
                        type=int,
                        default=2,
                        help="Number of layers.")
    parser.add_argument("--attention_query_size",
                        type=int,
                        default=200,
                        help="Size of attention query matrix.")
    parser.add_argument("--attention_layer",
                        choices=[
                            "att", "m_a", "m_pre_orl_a", "m_pre_orl_pun_a",
                            "m_pol_untrain_a", "mpa", "mpoa"
                        ],
                        required=True,
                        help="attention type.")
    parser.add_argument("--pretrain_model_type",
                        choices=["w2v", "elmo", "none"],
                        required=True,
                        help="pretrain model type.")

    # Optimizer options.
    parser.add_argument("--learning_rate",
                        type=float,
                        default=0.1,
                        help="Learning rate.")
    parser.add_argument("--momentum",
                        type=float,
                        default=0.9,
                        help="momentum.")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.2, help="Dropout.")
    parser.add_argument("--is_bidir",
                        type=int,
                        default=2,
                        help="bidir or only one.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")
    parser.add_argument("--run_type",
                        type=str,
                        required=True,
                        help="usage: python main_vua.py [train / test]")

    args = parser.parse_args()

    # Seed numpy / random / torch etc. for reproducibility.
    set_seed(args.seed)

    # Vocabulary and the label column set (taken from the dev file header).
    vocab = Vocab()
    vocab.load(args.vocab_path)
    label_columns = read_cataloge(args.dev_path)

    # Pretrained embeddings, precomputed ELMo features and attention queries.
    embeddings = get_embedding_matrix(args, vocab, normalization=False)
    elmo_embedding = h5py.File(args.elmo_path, 'r')
    query_matrix = get_query_matrix(args)

    model = RNNSequenceClassifier(args, embeddings, query_matrix)
    model = model.cuda()

    # Mutable scoreboard shared by the nested train()/evaluate() closures.
    best_josn = {
        'F_macro': 0,
        'P_macro': 0,
        'R_macro': 0,
        'Best_F_macro': 0,
        'ACC': 0,
        'F_negative': 0,
        'F_positive': 0,
        'Predict': [],
        'Label': [],
        'Weights': [],
        'Last_up_epoch': 0,
        'Total_batch_loss': 0,
        'F_nuetral': 0,
        'Time': 0,
        'Total_orthogonal_loss': 0,
        'train_num': 0,
        'test_num': 0,
        'dev_num': 0
    }

    def evaluate(args, is_test):
        """Evaluate on the dev set (is_test=False) or the test set (is_test=True).

        Updates the metric fields of best_josn in place; in test mode also
        writes per-example results and appends a summary report.
        """
        model.eval()
        # BUG FIX: these accumulators were never cleared, so every dev
        # evaluation after the first computed its metrics over the
        # concatenation of ALL previous epochs' predictions.
        best_josn['Predict'] = []
        best_josn['Label'] = []
        best_josn['Weights'] = []
        if is_test:
            print("Start testing.")
            dataset = read_dataset(args, args.test_path, label_columns, vocab)
            best_josn['test_num'] = len(dataset)
            writer_result = open(os.path.join(args.output_result_path,
                                              'result.txt'),
                                 encoding='utf-8',
                                 mode='w')
            writer_summary_result = open(os.path.join(args.summary_result_path,
                                                      'summary_result.txt'),
                                         mode='a')
        else:
            dataset = read_dataset(args, args.dev_path, label_columns, vocab)
            best_josn['dev_num'] = len(dataset)
        random.shuffle(dataset)
        input_ids = torch.LongTensor([example[0] for example in dataset])
        label_ids = torch.LongTensor([example[1] for example in dataset])
        length_ids = torch.LongTensor([example[2] for example in dataset])
        # Raw text of each example, kept for the per-sentence result dump.
        raw_inputs = [example[3] for example in dataset]
        batch_size = 1 if is_test else args.batch_size
        # Inference only — no gradients needed (saves memory and time).
        with torch.no_grad():
            for input_ids_batch, label_ids_batch, length_ids_batch in \
                    batch_loader(batch_size, input_ids, label_ids, length_ids):
                input_ids_batch = input_ids_batch.cuda()
                label_ids_batch = label_ids_batch.cuda()
                length_ids_batch = length_ids_batch.cuda()
                # 'att' models return (logits, weights); the multi-query
                # variants additionally return an orthogonality penalty,
                # which is irrelevant at evaluation time.
                if args.attention_layer == 'att':
                    predicted, weight = model(input_ids_batch,
                                              length_ids_batch, elmo_embedding)
                else:
                    predicted, weight, _ = model(input_ids_batch,
                                                 length_ids_batch,
                                                 elmo_embedding)
                best_josn['Weights'] += weight.squeeze(
                    dim=1).cpu().detach().numpy().tolist()
                _, predicted_labels = torch.max(predicted.data, 1)
                best_josn['Predict'] += predicted_labels.cpu().numpy().tolist()
                best_josn['Label'] += label_ids_batch.data.cpu().numpy().tolist()
        # Macro-averaged P/R/F1 and accuracy over the whole split.
        (best_josn['P_macro'], best_josn['R_macro'], best_josn['F_macro'],
         _) = metrics.precision_recall_fscore_support(best_josn['Label'],
                                                      best_josn['Predict'],
                                                      average="macro")
        # BUG FIX: `metrics.classification.accuracy_score` relied on the
        # private sklearn.metrics.classification module, removed in
        # scikit-learn 0.24; `metrics.accuracy_score` is the public API.
        best_josn['ACC'] = metrics.accuracy_score(best_josn['Label'],
                                                  best_josn['Predict'])
        if is_test:
            details_result = metrics.classification_report(
                best_josn['Label'], best_josn['Predict'])
            saveSenResult(raw_inputs, best_josn['Label'], best_josn['Predict'],
                          args, best_josn['Weights'])
            writer_result.writelines(details_result)
            test_msg = (
                "Testing Acc: {:.4f}, F_macro: {:.4f}, P_macro: {:.4f}, R_macro: {:.4f}"
                .format(best_josn['ACC'], best_josn['F_macro'],
                        best_josn['P_macro'], best_josn['R_macro']))
            print(test_msg)
            writer_result.writelines(test_msg)
            writer_summary_result.writelines('保存路径' +
                                             args.output_result_path + '\n')
            writer_summary_result.writelines(test_msg + "\n\n")
            writer_summary_result.writelines(details_result)
            # BUG FIX: close the handles so the reports are flushed to disk
            # (the original leaked both files).
            writer_result.close()
            writer_summary_result.close()

    def train():
        """SGD training loop: evaluates on dev after every epoch, checkpoints
        on macro-F1 improvement (> +0.001) and early-stops after
        args.require_improvement epochs without improvement."""
        print("Start training.")
        mkdir(args.output_result_path)
        writer_process = open(os.path.join(args.output_result_path,
                                           'process.txt'),
                              mode='w')
        writer_process.writelines("Start training.")
        trainset = read_dataset(args, args.train_path, label_columns, vocab)
        random.shuffle(trainset)
        best_josn['train_num'] = len(trainset)
        input_ids = torch.LongTensor([example[0] for example in trainset])
        label_ids = torch.LongTensor([example[1] for example in trainset])
        length_ids = torch.LongTensor([example[2] for example in trainset])
        print("Batch size: ", args.batch_size)
        print("The number of training instances:", best_josn['train_num'])
        start_time = time.time()
        best_josn['Time'] = get_time_dif(start_time)
        print("Time usage:", best_josn['Time'])
        param_optimizer = list(model.named_parameters())
        nll_criterion = nn.NLLLoss()
        if args.attention_layer == 'm_pol_untrain_a':
            # Keep the polarity query embedding frozen by excluding it
            # from the optimizer's parameter group.
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if ('query_embedding.weight' not in n)
                ],
                'weight_decay_rate': 0.01
            }]
        else:
            optimizer_grouped_parameters = [{
                'params': [p for n, p in param_optimizer],
                'weight_decay_rate': 0.01
            }]
        optimizer = optim.SGD(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              momentum=args.momentum)
        for epoch in range(1, args.epochs_num + 1):
            model.train()
            for i, (input_ids_batch, label_ids_batch,
                    length_ids_batch) in enumerate(
                        batch_loader(args.batch_size, input_ids, label_ids,
                                     length_ids)):
                model.zero_grad()
                input_ids_batch = input_ids_batch.cuda()
                label_ids_batch = label_ids_batch.cuda()
                length_ids_batch = length_ids_batch.cuda()
                orthogonal_loss = None
                if args.attention_layer == 'att':
                    predicted_ids_batch, _ = model(input_ids_batch,
                                                   length_ids_batch,
                                                   elmo_embedding)
                else:
                    predicted_ids_batch, _, orthogonal_loss = model(
                        input_ids_batch, length_ids_batch, elmo_embedding)
                    # BUG FIX: accumulate a detached float via .item();
                    # summing the loss tensors themselves kept every
                    # batch's autograd graph alive (GPU memory leak).
                    best_josn['Total_orthogonal_loss'] += orthogonal_loss.item()
                batch_loss = nll_criterion(predicted_ids_batch,
                                           label_ids_batch)
                best_josn['Total_batch_loss'] += batch_loss.item()
                if args.attention_layer != 'm_pre_orl_pun_a' and \
                        args.attention_layer != 'mpoa':
                    optimizer.zero_grad()
                    batch_loss.backward()
                    optimizer.step()
                else:
                    # Punished variants: weighted sum of the orthogonality
                    # penalty and the NLL loss; retain_graph because the two
                    # backward passes share the same forward graph.
                    optimizer.zero_grad()
                    (0.1 * orthogonal_loss).backward(retain_graph=True)
                    (0.9 * batch_loss).backward()
                    optimizer.step()
                best_josn['Time'] = get_time_dif(start_time)
                if (i + 1) % args.report_steps == 0:
                    if args.attention_layer == 'att':
                        report_msg = (
                            "Epoch id: {}, Training steps: {}, Avg batch loss: {:.4f}, Time: {}"
                            .format(
                                epoch, i + 1,
                                best_josn['Total_batch_loss'] /
                                args.report_steps, best_josn['Time']))
                    else:
                        report_msg = (
                            "Epoch id: {}, Training steps: {}, Avg batch loss: {:.4f}, Avg orthogonal loss: {:.4f}, Time: {}"
                            .format(
                                epoch, i + 1,
                                best_josn['Total_batch_loss'] /
                                args.report_steps,
                                best_josn['Total_orthogonal_loss'] /
                                args.report_steps, best_josn['Time']))
                    print(report_msg)
                    writer_process.writelines(report_msg)
                    best_josn['Total_batch_loss'] = 0
                    best_josn['Total_orthogonal_loss'] = 0
            # Evaluate on the dev set after each epoch.
            evaluate(args, False)
            best_josn['Time'] = get_time_dif(start_time)
            if best_josn['F_macro'] > best_josn['Best_F_macro'] + 0.001:
                best_josn['Best_F_macro'] = best_josn['F_macro']
                best_josn['Last_up_epoch'] = epoch
                torch.save(model,
                           os.path.join(args.output_result_path,
                                        'result.pkl'))
                dev_msg = "Deving Acc: {:.4f}, F_macro: {:.4f}, Time: {} *".format(
                    best_josn['ACC'], best_josn['F_macro'], best_josn['Time'])
                print(dev_msg)
                writer_process.writelines(dev_msg)
            elif epoch - best_josn['Last_up_epoch'] == args.require_improvement:
                print("No optimization for a long time, auto-stopping...")
                writer_process.writelines(
                    "No optimization for a long time, auto-stopping...")
                break
            else:
                dev_msg = "Deving Acc: {:.4f}, F_macro: {:.4f}, Time: {} ".format(
                    best_josn['ACC'], best_josn['F_macro'], best_josn['Time'])
                print(dev_msg)
                writer_process.writelines(dev_msg)
        # BUG FIX: flush and release the process log (original leaked it).
        writer_process.close()

    if args.run_type == 'train':
        train()
    else:
        # Test mode: load the best checkpoint saved during training; the
        # rebound `model` is the one evaluate() closes over.
        model = torch.load(os.path.join(args.output_result_path, 'result.pkl'))
        evaluate(args, True)