import argparse
import logging
import random
import sys

import numpy as np
import torch
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

# Project-local names used below (Data, ModelFactory, TreeDataset, MyDataset,
# collate_batch, evaluate, tree_evaluate, data_initialization, train,
# load_data_setting, load_model_decode) are assumed to be importable from this
# repository's own modules.


def main():
    cmd = argparse.ArgumentParser("sentence_representation_library")
    cmd.add_argument("--train", help='train data path', type=str,
                     default='../debugdata/newtrain-mul.toks')
    cmd.add_argument("--dev", help='dev data path', type=str,
                     default='../debugdata/newdev-mul.toks')
    cmd.add_argument("--test", help='test data path', type=str,
                     default='../debugdata/newtest-mul.toks')
    # cmd.add_argument("--train", help='train data path', type=str,
    #                  default='/users4/bbcai/data/amazon/books/1600_train.review.tok')
    # cmd.add_argument("--dev", help='dev data path', type=str,
    #                  default='/users4/bbcai/data/amazon/books/400_test.review.tok')
    # cmd.add_argument("--test", help='test data path', type=str,
    #                  default='/users4/bbcai/data/amazon/books/400_test.review.tok')
    cmd.add_argument("--train_tree", help='train tree data path', type=str,
                     default='../debugdata/train/newa.parents')
    cmd.add_argument("--dev_tree", help='dev tree data path', type=str,
                     default='../debugdata/dev/newa.parents')
    cmd.add_argument("--test_tree", help='test tree data path', type=str,
                     default='../debugdata/test/newa.parents')
    cmd.add_argument("--number_normalized", help='number normalized', action="store_true")
    cmd.add_argument("--batch_size", help='batch size', type=int, default=10)
    cmd.add_argument("--max_epoch", help='max epoch', type=int, default=6)
    cmd.add_argument("--hidden_size", help='hidden size', type=int, default=200)
    cmd.add_argument("--embedding_size", help='embedding size', type=int, default=300)
    cmd.add_argument("--embedding_path", default="../data/glove.840B.300d.txt",
                     help="pre-trained embedding path")
    # cmd.add_argument("--lr", help='lr', type=float, default=0.01)
    cmd.add_argument("--seed", help='random seed', type=int, default=1)
    cmd.add_argument("--dropout", help="dropout", type=float, default=0.2)
    cmd.add_argument("--kernel_size",
                     help="'*'-separated CNN kernel sizes (note: each kernel size must "
                          "be smaller than the padded input length)",
                     type=str, default="3*4*5")
    cmd.add_argument("--kernel_num", help="'*'-separated CNN kernel numbers",
                     type=str, default="100*100*100")
    cmd.add_argument("--l2", help="l2 norm", type=int, default=3)
    cmd.add_argument("--encoder", help="options: [lstm, bilstm, gru, cnn, treelstm, sum]",
                     type=str, default='treelstm')
    cmd.add_argument("--gpu", action="store_true", help="use gpu")
    cmd.add_argument("--model_name", default="sr", help="model name")
    cmd.add_argument("--optim", default="Adam", help="options: [Adam, SGD]")
    cmd.add_argument("--load_model", default="", help="model path")
    # character-level options
    cmd.add_argument("--char_encoder", help="options: [bilstm, cnn]", type=str, default='')
    cmd.add_argument("--char_hidden_dim", help="char hidden dim", type=int, default=50)
    cmd.add_argument("--char_embedding_path", help='char embedding path', default="")
    cmd.add_argument("--char_embedding_size", help='char embedding size', type=int, default=50)
    cmd.add_argument("--char_dropout", help="char dropout", type=float, default=0.1)
    args = cmd.parse_args()

    # fix the random seeds for reproducibility
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    data = Data(args)
    # use the GPU only when it is requested and actually available
    data.HP_gpu = args.gpu and torch.cuda.is_available()

    # build the word, character, and label alphabets
    data.build_alphabet(args.train)
    data.build_alphabet(args.dev)
    data.build_alphabet(args.test)
    data.fix_alphabet()

    # prepare data
    data.generate_instance(args.train, 'train')
    data.generate_instance(args.dev, 'dev')
    data.generate_instance(args.test, 'test')

    # load pre-trained embeddings (otherwise nn.Embedding() is randomly initialized)
    if args.embedding_path:
        data.build_word_pretrain_emb(args.embedding_path)
    if args.char_embedding_path:
        data.build_char_pretrain_emb(args.char_embedding_path)

    # create the visdom environment and check the connection
    # vis = Visdom(env=data.HP_model_name)
    # vis_use = vis.check_connection()

    # the model input is the word embedding, optionally concatenated with the
    # character-level representation
    if data.HP_use_char:
        if data.HP_char_features == "bilstm":
            data.input_size = data.HP_word_emb_dim + 2 * data.HP_char_hidden_dim
        elif data.HP_char_features == "cnn":
            data.input_size = data.HP_word_emb_dim + data.HP_char_hidden_dim
    else:
        data.input_size = data.HP_word_emb_dim

    # the factory creates the model that matches the chosen encoder
    factory = ModelFactory()
    model = factory.get_model(data)
    # load a saved model if requested
    if args.load_model:
        model.load_state_dict(torch.load(args.load_model))
    if data.HP_gpu:
        model = model.cuda()

    # Dataset / DataLoader for batching
    if data.HP_encoder_type == 'treelstm':
        dataset = TreeDataset(args.train_tree, data.train_Ids)
    else:
        dataset = MyDataset(data.train_Ids)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=data.HP_batch_size,
                            shuffle=True,
                            collate_fn=collate_batch)
    best_valid_acc = 0.0

    # optimizer
    if data.HP_optim.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr)
    elif data.HP_optim.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr)

    model.train()
    data.show_data_summary()  # print the hyper-parameters and dataset statistics

    for epoch in range(data.HP_iteration):
        round_loss = 0
        logging.info("epoch:{0} begins!".format(epoch))
        if data.HP_encoder_type == 'treelstm':
            # trees cannot be batched directly, so gradients are accumulated over
            # HP_batch_size samples before each optimizer step
            optimizer.zero_grad()
            indices = torch.randperm(len(dataset))
            model.train()
            for idx in range(len(dataset)):
                tree, sent, label = dataset[indices[idx]]
                if data.HP_gpu:
                    sent = Variable(torch.LongTensor(sent).cuda())
                    label = Variable(torch.LongTensor(np.array(label, dtype=np.int64)).cuda())
                else:
                    sent = Variable(torch.LongTensor(sent))
                    label = Variable(torch.LongTensor(np.array(label, dtype=np.int64)))
                loss = model(tree, sent, label)
                loss.backward()
                round_loss += loss.item()
                if idx % data.HP_batch_size == 0 and idx > 0:
                    optimizer.step()
                    optimizer.zero_grad()
            # apply any gradients accumulated since the last optimizer step
            if (len(dataset) - 1) % data.HP_batch_size != 0:
                optimizer.step()
                optimizer.zero_grad()
        else:
            for batch_words, batch_chars, batch_label in dataloader:
                model.train()
                optimizer.zero_grad()
                loss = model(batch_words, batch_chars, batch_label)
                loss.backward()   # back propagation
                optimizer.step()  # update parameters
                round_loss += loss.item()  # sum of this epoch's losses
        logging.info("epoch:{0} loss:{1}".format(epoch, round_loss))

        # draw the loss curve
        # if vis_use:
        #     vis.line(X=torch.FloatTensor([epoch]), Y=torch.FloatTensor([round_loss]),
        #              win='loss', update='append' if epoch > 0 else None)

        # evaluate the current model on the dev set
        if data.HP_encoder_type == 'treelstm':
            valid_acc = tree_evaluate(data.dev_Ids, args.dev_tree, model, data)
        else:
            valid_acc = evaluate(data.dev_Ids, model, data.HP_batch_size)
        logging.info("valid_acc = {0}".format(valid_acc))

        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            # evaluate on the test set
            if data.HP_encoder_type == 'treelstm':
                test_acc = tree_evaluate(data.test_Ids, args.test_tree, model, data)
            else:
                test_acc = evaluate(data.test_Ids, model, data.HP_batch_size)
            # draw the test accuracy curve
            # if vis_use:
            #     vis.line(X=torch.FloatTensor([epoch]), Y=torch.FloatTensor([test_acc]),
            #              win='test_acc', update='append' if epoch > 0 else None)
            # save the model
            torch.save(model.state_dict(), "../model/" + data.HP_model_name + ".model")
            logging.info("epoch:{0} New Record! valid_accuracy:{1}, test_accuracy:{2}".format(
                epoch, valid_acc, test_acc))

    # finally, evaluate dev and test accuracy
    if data.HP_encoder_type == 'treelstm':
        valid_acc = tree_evaluate(data.dev_Ids, args.dev_tree, model, data)
        test_acc = tree_evaluate(data.test_Ids, args.test_tree, model, data)
    else:
        valid_acc = evaluate(data.dev_Ids, model, data.HP_batch_size)
        test_acc = evaluate(data.test_Ids, model, data.HP_batch_size)
    logging.info("Train finished! saved model valid acc:{0}, test acc: {1}".format(
        valid_acc, test_acc))
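
# Hypothetical invocations of main() (the script name "main.py" is a placeholder;
# the flags and defaults are the ones defined above):
#
#   python main.py --encoder treelstm --train ../debugdata/newtrain-mul.toks \
#       --train_tree ../debugdata/train/newa.parents --gpu
#   python main.py --encoder cnn --kernel_size 3*4*5 --kernel_num 100*100*100
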
def run(status='decode'):
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CRF')
    parser.add_argument('--embedding', help='Embedding for words', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'],
                        help='running status', default=status)
    parser.add_argument('--savemodel',
                        default=r"C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\data/saved_model.lstmcrf")
    parser.add_argument('--savedset', help='Dir of saved data setting',
                        default=r"C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\data/saved_model.lstmcrf.dset")
    parser.add_argument('--train',
                        default=r"C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\train.char.bmes")
    parser.add_argument('--dev',
                        default=r"C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\dev.char.bmes")
    parser.add_argument('--test',
                        default=r"C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\test.char.bmes")
    parser.add_argument('--seg', default="True")
    parser.add_argument('--extendalphabet', default="True")
    parser.add_argument('--raw',
                        default=r'C:\Users\DELL\PycharmProjects\research\named_entity_recognition-master\ResumeNER\input.char.bmes')
    parser.add_argument('--loadmodel',
                        default=r'C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\data/saved_model.lstmcrf.21.model')
    parser.add_argument('--output',
                        default=r'C:\Users\DELL\PycharmProjects\research\named_entity_recognition-master\ResumeNER\lattice_output.char.bmes')
    args = parser.parse_args()

    latticelstm_pred = []
    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    seg = args.seg.lower() == "true"
    status = args.status.lower()
    save_model_dir = args.savemodel
    gpu = torch.cuda.is_available()

    char_emb = "data/gigaword_chn.all.a2b.uni.ite50.vec"
    bichar_emb = None
    gaz_file = "data/ctb.50d.vec"
    # gaz_file = None
    # char_emb = None
    # bichar_emb = None

    print("CuDNN:", torch.backends.cudnn.enabled)
    # gpu = False
    print("GPU available:", gpu)
    print("Status:", status)
    print("Seg:", seg)
    print("Train file:", train_file)
    print("Dev file:", dev_file)
    print("Test file:", test_file)
    print("Raw file:", raw_file)
    print("Char emb:", char_emb)
    print("Bichar emb:", bichar_emb)
    print("Gaz file:", gaz_file)
    if status == 'train':
        print("Model saved to:", save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        data = Data()
        data.HP_gpu = gpu
        data.HP_use_char = False
        data.HP_batch_size = 1
        data.use_bigram = False
        data.gaz_dropout = 0.05
        data.norm_gaz_emb = False
        data.HP_fix_gaz_emb = False
        data_initialization(data, gaz_file, train_file, dev_file, test_file)
        data.generate_instance_with_gaz(train_file, 'train')
        data.generate_instance_with_gaz(dev_file, 'dev')
        data.generate_instance_with_gaz(test_file, 'test')
        data.build_word_pretrain_emb(char_emb)
        data.build_biword_pretrain_emb(bichar_emb)
        data.build_gaz_pretrain_emb(gaz_file)
        best_acc, best_p, best_r, best_test, latticelstm_pred = train(data, save_model_dir, seg)
        print("accuracy =", best_acc, " precision =", best_p,
              " recall =", best_r, " f_measure =", best_test)
        return best_acc, best_p, best_r, best_test, latticelstm_pred
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance_with_gaz(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance_with_gaz(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance_with_gaz(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
        return decode_results
    else:
        print("Invalid status! Please use one of: train / test / decode.")
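
# Minimal, hypothetical entry point: calling main() rather than run() is an
# assumption, and logging.basicConfig is included because the logging.info calls
# in main() are silent under Python's default WARNING log level.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s: %(message)s")
    main()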