Example #1
import argparse
import logging
import random

import numpy as np
import torch
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

# project-local helpers used below (assumed importable from this repository):
# Data, ModelFactory, TreeDataset, MyDataset, collate_batch, evaluate, tree_evaluate

def main():
    cmd = argparse.ArgumentParser("sentence_representation_library")
    cmd.add_argument("--train",
                     help='train data_path',
                     type=str,
                     default='../debugdata/newtrain-mul.toks')
    cmd.add_argument("--dev",
                     help='dev data_path',
                     type=str,
                     default='../debugdata/newdev-mul.toks')
    cmd.add_argument("--test",
                     help='test data_path',
                     type=str,
                     default='../debugdata/newtest-mul.toks')
    # cmd.add_argument("--train", help='train data_path', type=str, default='/users4/bbcai/data/amazon/books/1600_train.review.tok')
    # cmd.add_argument("--dev", help='dev data_path', type=str, default='/users4/bbcai/data/amazon/books/400_test.review.tok')
    # cmd.add_argument("--test", help='test data_path', type=str, default='/users4/bbcai/data/amazon/books/400_test.review.tok')
    cmd.add_argument("--train_tree",
                     help='train tree data_path',
                     type=str,
                     default='../debugdata/train/newa.parents')
    cmd.add_argument("--dev_tree",
                     help='dev tree data_path',
                     type=str,
                     default='../debugdata/dev/newa.parents')
    cmd.add_argument("--test_tree",
                     help='test tree data_path',
                     type=str,
                     default='../debugdata/test/newa.parents')
    cmd.add_argument("--number_normalized",
                     help='number_normalized',
                     action="store_true")
    cmd.add_argument("--batch_size", help='batch_size', type=int, default=10)
    cmd.add_argument("--max_epoch", help='max_epoch', type=int, default=6)
    cmd.add_argument("--hidden_size",
                     help='hidden_size',
                     type=int,
                     default=200)
    cmd.add_argument("--embedding_size",
                     help='embedding_size',
                     type=int,
                     default=300)
    cmd.add_argument("--embedding_path",
                     default="../data/glove.840B.300d.txt",
                     help="pre-trained embedding path")  #
    cmd.add_argument("--lr", help='lr', type=float, default=0.01)
    cmd.add_argument("--seed", help='seed', type=int, default=1)
    cmd.add_argument("--dropout", help="dropout", type=float, default=0.2)
    cmd.add_argument(
        "--kernel_size",
        help="kernel sizes, '*'-separated (note: each kernel size should be "
        "smaller than the input length after padding)",
        type=str,
        default="3*4*5")
    cmd.add_argument("--kernel_num",
                     help="kernel_num",
                     type=str,
                     default="100*100*100")
    cmd.add_argument("--l2", help="l2 norm", type=int, default=3)
    cmd.add_argument("--encoder",
                     help="options:[lstm, bilstm, gru, cnn, treelstm, sum]",
                     type=str,
                     default='treelstm')
    cmd.add_argument("--gpu",
                     action="store_true",
                     help="use gpu",
                     default=True)
    cmd.add_argument("--model_name", default="sr", help="model name")
    cmd.add_argument("--optim", default="Adam", help="options:[Adam,SGD]")
    cmd.add_argument("--load_model", default="", help="model path")
    # character
    cmd.add_argument("--char_encoder",
                     help="options:[bilstm, cnn]",
                     type=str,
                     default='')
    cmd.add_argument("--char_hidden_dim",
                     help="char_hidden_dim",
                     type=int,
                     default=50)
    cmd.add_argument("--char_embedding_path",
                     help='char_embedding_path',
                     default="")
    cmd.add_argument("--char_embedding_size",
                     help='char_embedding_size',
                     type=int,
                     default=50)
    cmd.add_argument("--char_dropout",
                     help="char_dropout",
                     type=float,
                     default=0.1)

    args = cmd.parse_args()
    # fix the random seeds for reproducibility
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    data = Data(args)
    data.HP_gpu = True  # hard-coded; overrides the --gpu flag above
    # build the word/character/label alphabets
    data.build_alphabet(args.train)

    data.build_alphabet(args.dev)
    data.build_alphabet(args.test)
    data.fix_alphabet()

    # prepare data
    data.generate_instance(args.train, 'train')
    data.generate_instance(args.dev, 'dev')
    data.generate_instance(args.test, 'test')

    # load pre-trained embeddings (if none are given, embeddings are randomly initialized via nn.Embedding())
    if args.embedding_path:
        data.build_word_pretrain_emb(args.embedding_path)
    if args.char_embedding_path:
        data.build_char_pretrain_emb(args.char_embedding_path)

    # create visdom environment
    #vis = Visdom(env=data.HP_model_name)
    # check visdom connection
    #vis_use = vis.check_connection()

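    # encoder input size: the word embedding, plus char-level features if enabled
    # (a BiLSTM char encoder contributes both directions: with the defaults this
    # is 300 + 2 * 50 = 400, while a CNN char encoder gives 300 + 50 = 350)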
    if data.HP_use_char:
        if data.HP_char_features == "bilstm":
            data.input_size = data.HP_word_emb_dim + 2 * data.HP_char_hidden_dim
        elif data.HP_char_features == "cnn":
            data.input_size = data.HP_word_emb_dim + data.HP_char_hidden_dim
    else:
        data.input_size = data.HP_word_emb_dim

    # create the model via the factory, according to the encoder type
    factory = ModelFactory()
    model = factory.get_model(data)

    # load model
    if args.load_model:
        model.load_state_dict(torch.load(args.load_model))

    if data.HP_gpu:
        model = model.cuda()

    # Dataset / DataLoader for mini-batching; trees are consumed one at a time
    # below, so only the non-tree encoders need a DataLoader
    if data.HP_encoder_type == 'treelstm':
        dataset = TreeDataset(args.train_tree, data.train_Ids)
    else:
        dataset = MyDataset(data.train_Ids)
        dataloader = DataLoader(dataset=dataset,
                                batch_size=data.HP_batch_size,
                                shuffle=True,
                                collate_fn=collate_batch)

    best_valid_acc = 0.0

    # optimizer
    if data.HP_optim.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr)
    elif data.HP_optim.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr)
    else:
        raise ValueError("unsupported optimizer: " + data.HP_optim)

    model.train()
    # print the hyper-parameters and dataset statistics
    data.show_data_summary()
    for epoch in range(data.HP_iteration):
        round_loss = 0
        logging.info("epoch:{0} begins!".format(epoch))
        if data.HP_encoder_type == 'treelstm':
            optimizer.zero_grad()
            total_loss = 0.0
            indices = torch.randperm(len(dataset))
            model.train()
            for idx in range(len(dataset)):
                tree, sent, label = dataset[indices[idx]]
                if data.HP_gpu:
                    sent = Variable(torch.LongTensor(sent).cuda())
                    label = Variable(
                        torch.LongTensor(np.array(label,
                                                  dtype=np.int64)).cuda())
                else:
                    sent = Variable(torch.LongTensor(sent))
                    label = Variable(
                        torch.LongTensor(np.array(label, dtype=np.int64)))
                loss = model(tree, sent, label)
                loss.backward()
                round_loss += loss.item()  # .item() replaces the deprecated loss.data[0]
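                # trees cannot be padded into one tensor batch, so gradients are
                # accumulated over HP_batch_size single-tree losses before each
                # optimizer step, emulating a mini-batch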
                if idx % data.HP_batch_size == 0 and idx > 0:
                    optimizer.step()
                    optimizer.zero_grad()
        else:
            for step, (batch_words, batch_chars,
                       batch_label) in enumerate(dataloader):
                model.train()
                optimizer.zero_grad()  # zero grad
                loss = model.forward(batch_words, batch_chars, batch_label)
                loss.backward()  # back propagation
                optimizer.step()  # update parameters
                round_loss += loss.item()  # accumulate this epoch's loss
        logging.info("epoch:{0} loss:{1}".format(epoch, round_loss))

        # draw loss
        #if vis_use:
        #vis.line(X=torch.FloatTensor([epoch]), Y=torch.FloatTensor(round_loss), win='loss',
        #update='append' if epoch > 0 else None)

        # evaluate the current model on the dev set

        if data.HP_encoder_type == 'treelstm':
            valid_acc = tree_evaluate(data.dev_Ids, args.dev_tree, model, data)
        else:
            valid_acc = evaluate(data.dev_Ids, model, data.HP_batch_size)
        logging.info("valid_acc = {0}".format(valid_acc))

        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            # test on the test set
            if data.HP_encoder_type == 'treelstm':
                test_acc = tree_evaluate(data.test_Ids, args.test_tree, model,
                                         data)
            else:
                test_acc = evaluate(data.test_Ids, model, data.HP_batch_size)

            # draw test acc
            #if vis_use:
            #vis.line(X=torch.FloatTensor([epoch]), Y=torch.FloatTensor([test_acc]), win='test_acc',
            #update='append' if epoch > 0 else None)

            # save the model whenever the dev accuracy improves
            torch.save(model.state_dict(),
                       "../model/" + data.HP_model_name + ".model")
            logging.info(
                "epoch:{0} New Record! valid_accuracy:{1}, test_accuracy:{2}".
                format(epoch, valid_acc, test_acc))

    # finally, evaluate accuracy on the dev and test sets
    if data.HP_encoder_type == 'treelstm':
        valid_acc = tree_evaluate(data.dev_Ids, args.dev_tree, model, data)
        test_acc = tree_evaluate(data.test_Ids, args.test_tree, model, data)
    else:
        valid_acc = evaluate(data.dev_Ids, model, data.HP_batch_size)
        test_acc = evaluate(data.test_Ids, model, data.HP_batch_size)

    logging.info(
        "Train finished! saved model valid acc:{0}, test acc: {1}".format(
            valid_acc, test_acc))
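
The non-tree branch builds its mini-batches through collate_fn=collate_batch, whose implementation is not shown here. Below is a minimal sketch of what such a padding collate function could look like, assuming each instance is a (word_ids, char_ids, label) triple and that id 0 is the padding index; the function name and padding scheme are assumptions, not the repository's actual code:

import torch

def collate_batch_sketch(batch):
    # sort longest-first, as packed RNN inputs typically expect
    batch = sorted(batch, key=lambda inst: len(inst[0]), reverse=True)
    max_len = len(batch[0][0])
    word_batch, label_batch = [], []
    for word_ids, _char_ids, label in batch:
        # right-pad each sentence with the assumed padding id 0
        word_batch.append(list(word_ids) + [0] * (max_len - len(word_ids)))
        label_batch.append(int(label))
    # char ids omitted here for brevity
    return torch.LongTensor(word_batch), None, torch.LongTensor(label_batch)

The real collate_batch presumably also pads the character sequences and may return masks or lengths alongside the tensors.
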
Example #2
import argparse
import sys

import torch

# project-local helpers used below (assumed importable from this repository):
# Data, data_initialization, load_data_setting, load_model_decode, train

def run(status='decode'):
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional LSTM-CRF')
    parser.add_argument('--embedding',
                        help='Embedding for words',
                        default='None')
    parser.add_argument('--status',
                        choices=['train', 'test', 'decode'],
                        help='running mode',
                        default=status)
    parser.add_argument(
        '--savemodel',
        default=
        r"C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\data/saved_model.lstmcrf"
    )
    parser.add_argument(
        '--savedset',
        help='Dir of saved data setting',
        default=
        r"C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\data/saved_model.lstmcrf.dset"
    )
    parser.add_argument(
        '--train',
        default=
        r"C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\train.char.bmes"
    )
    parser.add_argument(
        '--dev',
        default=
        r"C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\dev.char.bmes"
    )
    parser.add_argument(
        '--test',
        default=
        r"C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\test.char.bmes"
    )
    parser.add_argument('--seg', default="True")
    parser.add_argument('--extendalphabet', default="True")
    parser.add_argument(
        '--raw',
        default=
        r'C:\Users\DELL\PycharmProjects\research\named_entity_recognition-master\ResumeNER\input.char.bmes'
    )
    parser.add_argument(
        '--loadmodel',
        default=
        r'C:\Users\DELL\PycharmProjects\research\LatticeLSTM-master\data\saved_model.lstmcrf.21.model'
    )
    parser.add_argument(
        '--output',
        default=
        r'C:\Users\DELL\PycharmProjects\research\named_entity_recognition-master\ResumeNER\lattice_output.char.bmes'
    )
    args = parser.parse_args()

    latticelstm_pred = []
    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    seg = args.seg.lower() == "true"
    status = args.status.lower()

    save_model_dir = args.savemodel
    gpu = torch.cuda.is_available()

    char_emb = "data/gigaword_chn.all.a2b.uni.ite50.vec"
    bichar_emb = None
    gaz_file = "data/ctb.50d.vec"
    # gaz_file = None
    # char_emb = None
    # bichar_emb = None

    print("CuDNN:", torch.backends.cudnn.enabled)
    # gpu = False
    print("GPU available:", gpu)
    print("Status:", status)
    print("Seg: ", seg)
    print("Train file:", train_file)
    print("Dev file:", dev_file)
    print("Test file:", test_file)
    print("Raw file:", raw_file)
    print("Char emb:", char_emb)
    print("Bichar emb:", bichar_emb)
    print("Gaz file:", gaz_file)
    if status == 'train':
        print("Model saved to:", save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        data = Data()
        data.HP_gpu = gpu
        data.HP_use_char = False
        data.HP_batch_size = 1
        data.use_bigram = False
        data.gaz_dropout = 0.05
        data.norm_gaz_emb = False
        data.HP_fix_gaz_emb = False
        data_initialization(data, gaz_file, train_file, dev_file, test_file)
        data.generate_instance_with_gaz(train_file, 'train')
        data.generate_instance_with_gaz(dev_file, 'dev')
        data.generate_instance_with_gaz(test_file, 'test')
        data.build_word_pretrain_emb(char_emb)
        data.build_biword_pretrain_emb(bichar_emb)
        data.build_gaz_pretrain_emb(gaz_file)
        best_acc, best_p, best_r, best_test, latticelstm_pred = train(
            data, save_model_dir, seg)
        print("accuracy = ", best_acc, " precision = ", best_p, " recall = ",
              best_r, " f_measure = ", best_test)
        return best_acc, best_p, best_r, best_test, latticelstm_pred
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance_with_gaz(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance_with_gaz(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance_with_gaz(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
        return decode_results
    else:
        print("Invalid status! Please choose one of: train / test / decode.")
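
For reference, run() falls back to its status='decode' default when called with no arguments; a typical entry point (an assumption, not part of the original listing) would be:

if __name__ == '__main__':
    run()  # defaults to status='decode': tag the raw file and write the output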