Example #1
def get_data(self):
    '''
    Read the training data and the dev-set data.
    :return:
    '''
    train_path = os.path.join('.', args.train_data, 'dev.txt')
    train_data = read_corpus(train_path)
    dev_path = os.path.join('.', args.dev_data, 'dev.txt')
    dev_data = read_corpus(dev_path)
    return train_data, dev_data
Example #2
def train():
    train_path = os.path.join('./data_path/', 'train_data')
    test_path = os.path.join('./data_path/', 'test_data')

    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
    model.build_graph()
    print("train data: {}".format(len(train_data)))
    model.train(
        train_data, test_data
    )  # we could use test_data as the dev_data to observe the overfitting phenomenon
Example #3
def test(opt):
    log = helpers.Logger(opt.verbose)
    timer = helpers.Timer()
    # Load data =========================================================
    log.info('Reading corpora')
    # Read vocabs
    widss, ids2ws, widst, ids2wt = helpers.get_dictionaries(opt, test=True)
    # Read test
    tests_data = np.asarray(data.read_corpus(opt.test_src, widss), dtype=list)
    # Test output
    if not opt.test_out:
        opt.test_out = helpers.exp_filename(opt, 'test.out')
    # Get target language model
    lang_model = helpers.get_language_model(opt, None, widst, test=True)
    # Create model ======================================================
    log.info('Creating model')
    s2s = helpers.build_model(opt, widss, widst, lang_model, test=True)
    # Print configuration ===============================================
    if opt.verbose:
        options.print_config(opt,
                             src_dict_size=len(widss),
                             trg_dict_size=len(widst))
    # Start testing =====================================================
    log.info('Start running on test set, buckle up!')
    timer.restart()
    translations = []
    s2s.set_test_mode()
    for i, x in enumerate(tests_data):
        y = s2s.translate(x, beam_size=opt.beam_size)
        translations.append(' '.join([ids2wt[w] for w in y[1:-1]]))
    np.savetxt(opt.test_out, translations, fmt='%s')
    translations = np.asarray(translations, dtype=str)
    BLEU, details = evaluation.bleu_score(opt.test_dst, opt.test_out)
    log.info('Finished running on test set, %.2fs elapsed.' % timer.tick())
    log.info(details)
Example #4
def data_concat(base_path):
    dev_data = []
    test_data = []

    dev_data_ori = read_corpus(base_path+'/dev_data')
    test_data_ori = read_corpus(base_path+'/test_data')

    dev_data_predicted = read_corpus_3(base_path+'/label_dev')
    test_data_predicted = read_corpus_3(base_path + '/label_test')

    for sent_, sent_predicted_ in zip(dev_data_ori, dev_data_predicted):
        dev_data.append([sent_predicted_[0], sent_predicted_[1], sent_predicted_[2], sent_[2], sent_[3]])

    for sent_, sent_predicted_ in zip(test_data_ori, test_data_predicted):
        test_data.append([sent_predicted_[0], sent_predicted_[1], sent_predicted_[2], sent_[2], sent_[3]])

    return dev_data, test_data
Example #5
def get_data(self):
    '''
    Read the test set.
    :return:
    '''
    test_path = os.path.join('.', args.test_data, 'test.txt')
    test_data = read_corpus(test_path)
    return test_data
Example #6
def data_format(base_path):
    train_data = read_corpus(train_file_path)
    dev_data_bieo = read_predicted_corpus(dev_file_path)
    test_data_bieo = read_predicted_corpus(test_file_path)


    write_data(train_data, base_path+'/train.txt')
    write_data(dev_data_bieo, base_path + '/dev.txt')
    write_data(test_data_bieo, base_path+'/test.txt')
Example #7
def test(opt):
    # Load data =========================================================
    if opt.verbose:
        print('Reading corpora')
    # Read vocabs
    if opt.dic_src:
        widss, ids2ws = data.load_dic(opt.dic_src)
    else:
        widss, ids2ws = data.read_dic(opt.train_src, max_size=opt.src_vocab_size)
        data.save_dic(opt.exp_name + '_src_dic.txt', widss)

    if opt.dic_dst:
        widst, ids2wt = data.load_dic(opt.dic_dst)
    else:
        widst, ids2wt = data.read_dic(opt.train_dst, max_size=opt.trg_vocab_size)
        data.save_dic(opt.exp_name + '_trg_dic.txt', widst)
    # Read test
    tests_data = data.read_corpus(opt.test_src, widss)
    # Create model ======================================================
    if opt.verbose:
        print('Creating model')
        sys.stdout.flush()
    s2s = seq2seq.Seq2SeqModel(opt.emb_dim,
                               opt.hidden_dim,
                               opt.att_dim,
                               widss,
                               widst,
                               model_file=opt.model,
                               bidir=opt.bidir,
                               word_emb=opt.word_emb,
                               dropout=opt.dropout_rate,
                               max_len=opt.max_len)

    if s2s.model_file is not None:
        s2s.load()
    s2s.model_file = opt.exp_name + '_model'
    # Print configuration ===============================================
    if opt.verbose:
        options.print_config(opt, src_dict_size=len(widss), trg_dict_size=len(widst))
        sys.stdout.flush()
    # Start testing =====================================================
    print('Start running on test set, buckle up!')
    sys.stdout.flush()
    test_start = time.time()
    with open(opt.test_out, 'w+') as of:
        for x in tests_data:
            y = s2s.translate(x, beam_size=opt.beam_size)
            translation = ' '.join([ids2wt[w] for w in y[1:-1]])
            of.write(translation+'\n')
    _, details = evaluation.bleu_score(opt.test_dst, opt.test_out)
    test_elapsed = time.time()-test_start
    print('Finished running on test set', test_elapsed, 'elapsed.')
    print(details)
    sys.stdout.flush()
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--epoch', '-e', default=400, type=int,
                        help='number of epochs to learn')
    parser.add_argument('--unit', '-u', default=30, type=int,
                        help='number of units')
    parser.add_argument('--batchsize', '-b', type=int, default=25,
                        help='learning minibatch size')
    parser.add_argument('--label', '-l', type=int, default=5,
                        help='number of labels')
    parser.add_argument('--epocheval', '-p', type=int, default=5,
                        help='number of epochs per evaluation')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    vocab = {}
    max_size = None
    train_trees = data.read_corpus('trees/train.txt', max_size)
    test_trees = data.read_corpus('trees/test.txt', max_size)

    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = numpy

    train_data = [linearize_tree(vocab, t, xp) for t in train_trees]
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_data = [linearize_tree(vocab, t, xp) for t in test_trees]
    test_iter = chainer.iterators.SerialIterator(
        test_data, args.batchsize, repeat=False, shuffle=False)

    model = ThinStackRecursiveNet(len(vocab), args.unit, args.label)

    if args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.AdaGrad(0.1)
    optimizer.setup(model)

    updater = training.StandardUpdater(
        train_iter, optimizer, device=None, converter=convert)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'))
    trainer.extend(
        extensions.Evaluator(test_iter, model, converter=convert, device=None),
        trigger=(args.epocheval, 'epoch'))
    trainer.extend(extensions.LogReport())

    trainer.extend(extensions.MicroAverage(
        'main/correct', 'main/total', 'main/accuracy'))
    trainer.extend(extensions.MicroAverage(
        'validation/main/correct', 'validation/main/total',
        'validation/main/accuracy'))

    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    trainer.run()
Example #9
    acc_node = 100.0 * result['correct_node'] / result['total_node']
    acc_root = 100.0 * result['correct_root'] / result['total_root']
    print(' Node accuracy: {0:.2f} % ({1:,d}/{2:,d})'.format(
        acc_node, result['correct_node'], result['total_node']))
    print(' Root accuracy: {0:.2f} % ({1:,d}/{2:,d})'.format(
        acc_root, result['correct_root'], result['total_root']))


vocab = {}
if args.test:
    max_size = 10
else:
    max_size = None
train_trees = [convert_tree(vocab, tree)
               for tree in data.read_corpus('trees/train.txt', max_size)]
test_trees = [convert_tree(vocab, tree)
              for tree in data.read_corpus('trees/test.txt', max_size)]
develop_trees = [convert_tree(vocab, tree)
                 for tree in data.read_corpus('trees/dev.txt', max_size)]

model = RecursiveNet(len(vocab), n_units)

if args.gpu >= 0:
    model.to_gpu()

# Setup optimizer
optimizer = optimizers.AdaGrad(lr=0.1)
optimizer.setup(model)
optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(0.0001))
Example #10
## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

if args.mode != 'demo':
    train_path_text = os.path.join('.', args.train_data, 'train_data_text')
    train_path_tag = os.path.join('.', args.train_data, 'train_data_tag')

    test_path_text = os.path.join('.', args.test_data, 'test_data_text')
    test_path_tag = os.path.join('.', args.test_data, 'test_data_tag')

    train_data = read_corpus(train_path_text, train_path_tag)

    test_data = read_corpus(test_path_text, test_path_tag)
    test_size = len(test_data)
else:
    demo_tag = os.path.join('.', args.demo_data, 'demo_tag')

## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data + "_save", timestamp)
if not os.path.exists(output_path): os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path): os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
Example #11
args = parser.parse_args()

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    #train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    #train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)

## paths setting
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data + "_save", timestamp)
if not os.path.exists(output_path): os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
if not os.path.exists(summary_path): os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path): os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
result_path = os.path.join(output_path, "results")
if not os.path.exists(result_path): os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
get_logger(log_path).info(str(args))
Example #12
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')


## read corpus and get training data
if args.mode != 'demo':
    
    f_source_data = os.path.join('.', args.train_data, 'source_data.txt') 
    f_source_label = os.path.join('.', args.train_data, 'source_label.txt')
    f_test_data = os.path.join('.', args.train_data, 'test_data.txt') 
    f_test_label = os.path.join('.', args.train_data, 'test_label.txt')
    train_data = read_corpus(f_source_data, f_source_label)
    test_data = read_corpus(f_test_data, f_test_label); test_size = len(test_data)


## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data+"_save", timestamp)
if not os.path.exists(output_path): os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path): os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path): os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
Example #13
def main():
    # if args.mode == 'train'
    ap = []
    with open('../../../china_medical_char_data_cleaned/vocab.tags.txt',
              'r') as fin:
        for line in fin:
            ap.append(line.strip())
        fin.close()
    length = len(ap)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.625)
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf.ConfigProto(
        #         device_count={ "CPU": 48 },
        #         inter_op_parallelism_threads=10,
        allow_soft_placement=True,
        #         intra_op_parallelism_threads=20,
        gpu_options=gpu_options))

    generator = Generator_BiLSTM_CRF(0.5, 1, batch_size, params, filter_sizes,
                                     num_filters, 0.75, length)
    generator.build_graph()

    tvars = tf.trainable_variables()
    (assignment_map,
     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
         tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    # finally, initialize the variables
    # sess.run(tf.global_variables_initializer())

    sess.run(generator.init_op)
    sess.run(generator.table_op)
    sess.run(generator.init_op_1)
    saver = tf.train.Saver(tf.global_variables())

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        print("  name = %s, shape = %s%s", var.name, var.shape, init_string)

    train_path = os.path.join('.', args.train_data, 'train_data1')
    train_unlabel_path = os.path.join('.', args.train_data_unlabel,
                                      'train_unlabel')
    train_unlabel_path_1 = os.path.join('.', args.train_data_unlabel,
                                        'train_unlabel1')
    test_path = os.path.join('.', args.test_data, 'test_data1')
    sub_test_path = os.path.join('.', args.sub_test_data, 'sub_test_data')
    train_data = read_corpus(train_path)
    train_data_unlabel = read_corpus_unlabel(train_unlabel_path)
    train_data_unlabel_1 = read_corpus_unlabel(train_unlabel_path_1)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
    sub_test_data = read_corpus(sub_test_path)

    batches_labeled = batch_yield(train_data, batch_size, shuffle=True)
    batches_labeled = list(batches_labeled)
    # print(len(batches_labeled))
    num_batches = (len(train_data) + batch_size - 1) // batch_size
    batches_unlabeled = batch_yield_for_unla_da(train_data_unlabel,
                                                batch_size,
                                                shuffle=True)
    batches_unlabeled = list(batches_unlabeled)
    # print(len(batches_unlabeled))
    batches_labeled_for_dis = batch_yield_for_discri(train_data,
                                                     batch_size,
                                                     shuffle=True)
    batches_labeled_for_dis = list(batches_labeled_for_dis)
    batches_unlabeled_for_dis = batch_yield_for_discri_unlabeled(
        train_data_unlabel, batch_size, shuffle=True)
    batches_unlabeled_for_dis = list(batches_unlabeled_for_dis)
    dev = batch_yield(test_data, batch_size, shuffle=True)
    #    num_batches = min(len(batches_labeled),len(batches_unlabeled))
    num_batches_unlabel = (len(train_data_unlabel) + batch_size -
                           1) // batch_size
    num_batches_1 = min(len(batches_labeled_for_dis),
                        len(batches_unlabeled_for_dis))
    index = 0
    if args.mode == 'train':
        for epoch_total in range(30):

            print('epoch_total and index are {} and {}'.format(
                epoch_total + 1, index))
            medi_lis = get_metrics(sess,
                                   generator,
                                   dev,
                                   test_size,
                                   batch_size,
                                   flag=0)

            for ele in medi_lis:
                print('entity recognition:', ele)
            print('the whole epoch training accuracy finished!!!!!!!!!!!!')

            for i, (words, labels) in enumerate(batches_labeled):
                run_one_epoch(sess,
                              words,
                              labels,
                              tags=[],
                              dev=test_data,
                              epoch=epoch_total,
                              gen=generator,
                              num_batches=num_batches,
                              batch=i,
                              label=0,
                              it=0,
                              iteration=0,
                              saver=saver)

            dev1 = batch_yield(test_data, batch_size, shuffle=True)

            medi_lis_from_cross_entropy_training = get_metrics(sess,
                                                               generator,
                                                               dev1,
                                                               test_size,
                                                               batch_size,
                                                               flag=0)

            for ele in medi_lis_from_cross_entropy_training:
                print('first pass:', ele)

            print(
                'the accuracy after cross entropy training finished!!!!!!!!!!!!!!!!!!'
            )

            # if epoch_total > 3:
            #     #     batches_labeled_for_dis = batches_labeled_for_dis[0: len(batches_labeled_for_dis)-5]
            #     batch_dis_for_label = len(batches_labeled_for_dis)
            #     batch_dis_for_unlabel = len(batches_unlabeled_for_dis)
            #     for (ele, ele2) in zip(enumerate(batches_labeled_for_dis), enumerate(batches_unlabeled_for_dis)):
            #         index += 1
            #         #               if index > 70:
            #         #                    break
            #         run_one_epoch(sess, ele[1][0], ele[1][1], ele[1][2], dev=test_data, epoch=epoch_total,
            #                       gen=generator,
            #                       num_batches=batch_dis_for_label, batch=index, label=2, it=0, iteration=0, saver=saver)
            #         run_one_epoch(sess, ele2[1][0], ele2[1][1], ele2[1][2], dev=test_data, epoch=epoch_total,
            #                       gen=generator,
            #                       num_batches=batch_dis_for_unlabel, batch=index, label=3, it=0, iteration=0,
            #                       saver=saver)
            #     index = 0
            #
            #     print('the whole dis phaseI finished')
            #     #    index += 1
            #     for it in range(5):
            #         for i, (words, labels, tags) in enumerate(batches_unlabeled):
            #             #          print(i)
            #             run_one_epoch(sess, words, labels, tags=tags, dev=test_data, epoch=epoch_total, gen=generator,
            #                           num_batches=num_batches_unlabel, batch=i, label=1, it=it, iteration=i,
            #                           saver=saver)
            #
            #     dev2 = batch_yield(test_data, batch_size, shuffle=True)
            #
            #     medi_lis_from_adversarial_training = get_metrics(sess, generator, dev2, test_size, batch_size, flag=0)
            #
            #     for ele in medi_lis_from_adversarial_training:
            #     for ele in medi_lis_from_adversarial_training:
            #         print('second pass:', ele)
            #
            # print('the accuracy after adversarial training of generator finished!!!!!!!!!!!!!!')
            #
            # print('epoch {} finished!'.format(epoch_total))

    if args.mode == 'test':
        sub_dev = batch_yield_for_discri_unlabeled(sub_test_data,
                                                   batch_size,
                                                   shuffle=True)
        #          print(list(sub_dev))
        ckpt_file = tf.train.latest_checkpoint(model_path)

        generator = Generator_BiLSTM_CRF(0.5,
                                         batch_size,
                                         params,
                                         filter_sizes,
                                         num_filters,
                                         0.75,
                                         length,
                                         is_training=False)
        generator.build_graph()
        generator.test(sess, sub_dev, test_size, 20)
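The helpers batch_yield, batch_yield_for_unla_da and the read_corpus variants used above are not shown in this listing. Purely as an illustration of the batching pattern these training loops rely on, a generic hypothetical generator that yields (sentences, labels) batches from a list of (sentence, labels) pairs might look like the sketch below; the real helper in this repository may also map characters and tags to ids before yielding them.

import random

def batch_yield_sketch(data, batch_size, shuffle=True):
    # Hypothetical helper: yield successive (sentences, labels) batches from a
    # list of (sentence, labels) pairs, optionally shuffling once per pass.
    data = list(data)
    if shuffle:
        random.shuffle(data)
    sents, labels = [], []
    for sent, label in data:
        sents.append(sent)
        labels.append(label)
        if len(sents) == batch_size:
            yield sents, labels
            sents, labels = [], []
    if sents:
        yield sents, labels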
Example #14
## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)  #(3905,300)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'train':
    # train_path = os.path.join('.', args.train_data, 'train_data')
    # test_path = os.path.join('.', args.test_data, 'test_data')
    train_path = os.path.join('.', args.train_data, 'processed_downloadfile3')
    test_path = os.path.join('.', args.test_data, 'processed_downloadfile4')
    train_data = read_corpus(train_path)  # list of (sentence, label) pairs
    test_data = read_corpus(test_path)
    test_size = len(test_data)  # number of sentences in the test set

## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
print(timestamp)
output_path = os.path.join(
    '.', args.train_data + "_save",
    timestamp)  #output_path:.\\data_path_save\\timestamp
if not os.path.exists(output_path): os.makedirs(output_path)
summary_path = os.path.join(
    output_path,
    "summaries")  #summary_path:./data_path_save/timestamp/summaries
paths['summary_path'] = summary_path
Example #15
# -*- coding: utf-8 -*-
"""
Created on Wed Nov  6 11:09:20 2019

@author: 37112
"""

import time
import data
import utils
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
start = time.time()
# load the data
data_path = "data/train-v2.0.json"
qlist, alist = data.read_corpus(data_path)
# show the words with the highest frequency
#word_dic = Counter([q for l in utils.cut(qlist) for q in l])
#utils.show_most_word_freq(word_dic, 50)
# load the preprocessed questions
qlist_new = utils.load_qlist('data/q_prepro.txt')
#question = input("What question do you want to ask?")
question = "When did Beyonce start become popular"
# use the tf-idf method
idx = utils.find_top_similar_ask1(question, qlist_new)
alist = np.array(alist)
print(alist[idx])
end = time.time()
print(end - start, "s")
Example #16
def eval_user_adaptation(opt):
    log = utils.Logger(opt.verbose)
    timer = utils.Timer()
    # Read vocabs
    lexicon = helpers.get_lexicon(opt)
    # Read data
    filepairs = load_user_filepairs(opt.usr_file_list)
    # Get target language model
    lang_model = None
    # Load model
    s2s = helpers.build_model(opt, lexicon, lang_model, test=True)
    if opt.update_mode == 'mixture_weights' and not opt.user_recognizer == 'fact_voc':
        log.info('Updating only the mixture weights doesn\'t make sense here')
        exit()
    s2s.lm = lexicon.trg_unigrams
    #    s2s.freeze_parameters()
    # Trainer
    trainer = helpers.get_trainer(opt, s2s)
    # print config
    if opt.verbose:
        options.print_config(opt,
                             src_dict_size=len(lexicon.w2ids),
                             trg_dict_size=len(lexicon.w2idt))
    # This will store translations and gold sentences
    base_translations = []
    adapt_translations = []
    gold = []
    # Run training
    for usr_id, (src_file, trg_file) in enumerate(filepairs):
        log.info('Evaluating on files %s' %
                 os.path.basename(src_file).split()[0])
        # Load file pair
        src_data = data.read_corpus(src_file, lexicon.w2ids, raw=True)
        trg_data = data.read_corpus(trg_file, lexicon.w2idt, raw=True)
        # split train/test
        train_src, test_src, train_trg, test_trg, order = split_user_data(
            src_data, trg_data, n_test=opt.n_test)
        # Convert train data to indices
        train_src = lexicon.sents_to_ids(train_src)
        train_trg = lexicon.sents_to_ids(train_trg, trg=True)
        # Save test data
        for s in test_trg:
            gold.append(' '.join(s))
        # Reset model
        s2s.load()
        s2s.reset_usr_vec()
        # Translate with baseline model
        base_translations.extend(evaluate_model(s2s, test_src, opt.beam_size))
        # Start loop
        n_train = opt.max_n_train
        adapt_translations.extend(
            adapt_user(s2s, trainer, train_src[:n_train], train_trg[:n_train],
                       test_src, opt))

    # Temp files
    temp_gold = utils.exp_temp_filename(opt, 'gold.txt')
    temp_base = utils.exp_temp_filename(opt, '%s_base.txt' % opt.update_mode)
    temp_adapt = utils.exp_temp_filename(opt, '%s_adapt.txt' % opt.update_mode)
    utils.savetxt(temp_gold, gold)
    utils.savetxt(temp_base, base_translations)
    utils.savetxt(temp_adapt, adapt_translations)
    # Evaluate base translations
    bleu, details = evaluation.bleu_score(temp_gold, temp_base)
    log.info('Base BLEU score: %.2f' % bleu)
    # Evaluate adapted translations
    bleu, details = evaluation.bleu_score(temp_gold, temp_adapt)
    log.info('Adaptation BLEU score: %.2f' % bleu)
    # Compare both
    temp_bootstrap_gold = utils.exp_temp_filename(opt, 'bootstrap_gold.txt')
    temp_bootstrap_base = utils.exp_temp_filename(opt, 'bootstrap_base.txt')
    temp_bootstrap_adapt = utils.exp_temp_filename(opt, 'bootstrap_adapt.txt')
    bleus = evaluation.paired_bootstrap_resampling(
        temp_gold, temp_base, temp_adapt, opt.bootstrap_num_samples,
        opt.bootstrap_sample_size, temp_bootstrap_gold, temp_bootstrap_base,
        temp_bootstrap_adapt)
    evaluation.print_paired_stats(bleus)
    os.remove(temp_bootstrap_gold)
    os.remove(temp_bootstrap_base)
    os.remove(temp_bootstrap_adapt)
Example #17
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.optim as optim

from data import read_corpus, build_dict, TAG_MAP, NER_DataSet, condtraints
from bi_lstm_crf import BiLSTM_CRF
from trainer import train, evaluate, load_model

train_corpus_path = './datasets/train_data'
test_corpus_path = './datasets/test_data'

if __name__ == '__main__':

    # prepare data
    corpus = read_corpus(train_corpus_path)
    dct = build_dict(corpus)

    # build dataloader
    np.random.shuffle(corpus)
    train_ds = NER_DataSet(corpus[:-5000], dct)
    val_ds = NER_DataSet(corpus[-5000:], dct)

    train_dl = DataLoader(train_ds,
                          batch_size=32,
                          shuffle=True,
                          drop_last=True,
                          num_workers=0)
    val_dl = DataLoader(val_ds,
                        batch_size=32,
                        shuffle=False,
Example #18
label2id = ner_cfg.generate_tag_to_label()

logger = logging.getLogger(__name__)
current_dir = os.path.dirname(os.path.abspath(__file__))

## get char embeddings
word2id_pos2id = read_dictionary('word2id_pos2id_new.pkl')
word2id = word2id_pos2id['word2id']
pos2id = word2id_pos2id['pos2id']
word_embedding = np.array(np.load('word2vec.npy'), dtype=np.float32)
pos_embedding = np.array(np.load('pos2vec.npy'), dtype=np.float32)

config = Config(word2id,
                pos2id,
                label2id,
                batch_size=128,
                n_epochs=200,
                n_neurons=60)
config.word_embedding = word_embedding
config.pos_embedding = pos_embedding

## read corpus and get training data
train_data, test_data = read_corpus('train_data')
# test_data = read_corpus('test_data')
# test_size = len(test_data)

model = BiLSTM_CRF(is_training=True, config=config)
model.build_graph()
model.train(train_data=train_data, valid_data=test_data)
# model.test(test_data)
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu',
                        '-g',
                        default=-1,
                        type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        type=str,
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        help='Resume the training from snapshot')
    parser.add_argument('--epoch',
                        '-e',
                        default=400,
                        type=int,
                        help='number of epochs to learn')
    parser.add_argument('--unit',
                        '-u',
                        default=30,
                        type=int,
                        help='number of units')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=25,
                        help='learning minibatch size')
    parser.add_argument('--label',
                        '-l',
                        type=int,
                        default=5,
                        help='number of labels')
    parser.add_argument('--epocheval',
                        '-p',
                        type=int,
                        default=5,
                        help='number of epochs per evaluation')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    n_epoch = args.epoch  # number of epochs
    n_units = args.unit  # number of units per layer
    batchsize = args.batchsize  # minibatch size
    n_label = args.label  # number of labels
    epoch_per_eval = args.epocheval  # number of epochs per evaluation

    if args.test:
        max_size = 10
    else:
        max_size = None

    vocab = {}
    train_data = [
        convert_tree(vocab, tree)
        for tree in data.read_corpus('trees/train.txt', max_size)
    ]
    train_iter = chainer.iterators.SerialIterator(train_data, batchsize)
    validation_data = [
        convert_tree(vocab, tree)
        for tree in data.read_corpus('trees/dev.txt', max_size)
    ]
    validation_iter = chainer.iterators.SerialIterator(validation_data,
                                                       batchsize,
                                                       repeat=False,
                                                       shuffle=False)
    test_data = [
        convert_tree(vocab, tree)
        for tree in data.read_corpus('trees/test.txt', max_size)
    ]

    model = RecursiveNet(len(vocab), n_units, n_label)

    if args.gpu >= 0:
        cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Setup optimizer
    optimizer = optimizers.AdaGrad(lr=0.1)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(0.0001))

    def _convert(batch, _):
        return batch

    # Setup updater
    updater = chainer.training.StandardUpdater(train_iter,
                                               optimizer,
                                               device=args.gpu,
                                               converter=_convert)

    # Setup trainer and run
    trainer = chainer.training.Trainer(updater, (n_epoch, 'epoch'), args.out)
    trainer.extend(extensions.Evaluator(validation_iter,
                                        model,
                                        device=args.gpu,
                                        converter=_convert),
                   trigger=(epoch_per_eval, 'epoch'))
    trainer.extend(extensions.LogReport())

    trainer.extend(
        extensions.MicroAverage('main/correct', 'main/total', 'main/accuracy'))
    trainer.extend(
        extensions.MicroAverage('validation/main/correct',
                                'validation/main/total',
                                'validation/main/accuracy'))

    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}'),
        trigger=(epoch_per_eval, 'epoch'))

    if args.resume is not None:
        chainer.serializers.load_npz(args.resume, trainer)
    trainer.run()

    print('Test evaluation')
    evaluate(model, test_data)
Example #20
def train(opt):
    # Load data =========================================================
    if opt.verbose:
        print('Reading corpora')
    # Read vocabs
    if opt.dic_src:
        widss, ids2ws = data.load_dic(opt.dic_src)
    else:
        widss, ids2ws = data.read_dic(opt.train_src, max_size=opt.src_vocab_size)
        data.save_dic(opt.exp_name + '_src_dic.txt', widss)

    if opt.dic_dst:
        widst, ids2wt = data.load_dic(opt.dic_dst)
    else:
        widst, ids2wt = data.read_dic(opt.train_dst, max_size=opt.trg_vocab_size)
        data.save_dic(opt.exp_name + '_trg_dic.txt', widst)

    # Read training
    trainings_data = data.read_corpus(opt.train_src, widss)
    trainingt_data = data.read_corpus(opt.train_dst, widst)
    # Read validation
    valids_data = data.read_corpus(opt.valid_src, widss)
    validt_data = data.read_corpus(opt.valid_dst, widst)

    # Create model ======================================================
    if opt.verbose:
        print('Creating model')
        sys.stdout.flush()
    s2s = seq2seq.Seq2SeqModel(opt.emb_dim,
                               opt.hidden_dim,
                               opt.att_dim,
                               widss,
                               widst,
                               model_file=opt.model,
                               bidir=opt.bidir,
                               word_emb=opt.word_emb,
                               dropout=opt.dropout_rate,
                               max_len=opt.max_len)

    if s2s.model_file is not None:
        s2s.load()
    s2s.model_file = opt.exp_name+'_model.txt'
    # Trainer ==========================================================
    if opt.trainer == 'sgd':
        trainer = dy.SimpleSGDTrainer(
            s2s.model, e0=opt.learning_rate, edecay=opt.learning_rate_decay)
    if opt.trainer == 'clr':
        trainer = dy.CyclicalSGDTrainer(s2s.model, e0_min=opt.learning_rate / 10,
                                        e0_max=opt.learning_rate, edecay=opt.learning_rate_decay)
    elif opt.trainer == 'momentum':
        trainer = dy.MomentumSGDTrainer(
            s2s.model, e0=opt.learning_rate, edecay=opt.learning_rate_decay)
    elif opt.trainer == 'rmsprop':
        trainer = dy.RMSPropTrainer(s2s.model, e0=opt.learning_rate,
                                    edecay=opt.learning_rate_decay)
    elif opt.trainer == 'adam':
        trainer = dy.AdamTrainer(s2s.model, opt.learning_rate, edecay=opt.learning_rate_decay)
    else:
        print('Trainer name invalid or not provided, using SGD', file=sys.stderr)
        trainer = dy.SimpleSGDTrainer(
            s2s.model, e0=opt.learning_rate, edecay=opt.learning_rate_decay)
    if opt.verbose:
        print('Using '+opt.trainer+' optimizer')
    trainer.set_clip_threshold(opt.gradient_clip)
    # Print configuration ===============================================
    if opt.verbose:
        options.print_config(opt, src_dict_size=len(widss), trg_dict_size=len(widst))
        sys.stdout.flush()
    # Creat batch loaders ===============================================
    if opt.verbose:
        print('Creating batch loaders')
        sys.stdout.flush()
    trainbatchloader = data.BatchLoader(trainings_data, trainingt_data, opt.batch_size)
    devbatchloader = data.BatchLoader(valids_data, validt_data, opt.dev_batch_size)
    # Start training ====================================================
    if opt.verbose:
        print('starting training')
        sys.stdout.flush()
    start = time.time()
    train_loss = 0
    processed = 0
    best_bleu = 0
    i = 0
    for epoch in range(opt.num_epochs):
        for x, y in trainbatchloader:
            processed += sum(map(len, y))
            bsize = len(y)
            # Compute loss
            loss = s2s.calculate_loss(x, y)
            # Backward pass and parameter update
            loss.backward()
            trainer.update()
            train_loss += loss.scalar_value() * bsize
            if (i+1) % opt.check_train_error_every == 0:
                # Check average training error from time to time
                logloss = train_loss / processed
                ppl = np.exp(logloss)
                elapsed = time.time()-start
                trainer.status()
                print(" Training_loss=%f, ppl=%f, time=%f s, tokens processed=%d" %
                      (logloss, ppl, elapsed, processed))
                start = time.time()
                train_loss = 0
                processed = 0
                sys.stdout.flush()
            if (i+1) % opt.check_valid_error_every == 0:
                # Check generalization error on the validation set from time to time
                dev_loss = 0
                dev_processed = 0
                dev_start = time.time()
                for x, y in devbatchloader:
                    dev_processed += sum(map(len, y))
                    bsize = len(y)
                    loss = s2s.calculate_loss(x, y, test=True)
                    dev_loss += loss.scalar_value() * bsize
                dev_logloss = dev_loss/dev_processed
                dev_ppl = np.exp(dev_logloss)
                dev_elapsed = time.time()-dev_start
                print("[epoch %d] Dev loss=%f, ppl=%f, time=%f s, tokens processed=%d" %
                      (epoch, dev_logloss, dev_ppl, dev_elapsed, dev_processed))
                sys.stdout.flush()
                start = time.time()

            if (i+1) % opt.valid_bleu_every == 0:
                # Check BLEU score on the validation set from time to time
                print('Start translating validation set, buckle up!')
                sys.stdout.flush()
                bleu_start = time.time()
                with open(opt.valid_out, 'w+') as f:
                    for x in valids_data:
                        y_hat = s2s.translate(x, beam_size=opt.beam_size)
                        translation = [ids2wt[w] for w in y_hat[1:-1]]
                        print(' '.join(translation), file=f)
                bleu, details = evaluation.bleu_score(opt.valid_dst, opt.valid_out)
                bleu_elapsed = time.time()-bleu_start
                print('Finished translating validation set', bleu_elapsed, 'elapsed.')
                print(details)
                # Early stopping : save the latest best model
                if bleu > best_bleu:
                    best_bleu = bleu
                    print('Best BLEU score up to date, saving model to', s2s.model_file)
                    s2s.save()
                sys.stdout.flush()
                start = time.time()
            i = i+1
        trainer.update_epoch()
Example #21
        return {
            "muc": muc_score,
            "b3": b3_score,
            "ceafe": ceaf_score,
            "avg": avg_score
        }


if __name__ == "__main__":
    args = parser.parse_args()

    if args.random_seed:
        torch.random.manual_seed(args.random_seed)
        np.random.seed(args.random_seed)

    documents = read_corpus(args.dataset)

    def create_model_instance(model_name, **override_kwargs):
        return BaselineController(
            MentionPairFeatures.num_features(),
            model_name=model_name,
            learning_rate=override_kwargs.get("learning_rate",
                                              args.learning_rate),
            dataset_name=override_kwargs.get("dataset", args.dataset))

    # Train model
    if args.dataset == "coref149":
        INNER_K, OUTER_K = 3, 10
        logging.info(
            f"Performing {OUTER_K}-fold (outer) and {INNER_K}-fold (inner) CV..."
        )
Example #22
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result', type=str,
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', type=str,
                        help='Resume the training from snapshot')
    parser.add_argument('--epoch', '-e', default=400, type=int,
                        help='number of epochs to learn')
    parser.add_argument('--unit', '-u', default=30, type=int,
                        help='number of units')
    parser.add_argument('--batchsize', '-b', type=int, default=25,
                        help='learning minibatch size')
    parser.add_argument('--label', '-l', type=int, default=5,
                        help='number of labels')
    parser.add_argument('--epocheval', '-p', type=int, default=5,
                        help='number of epochs per evaluation')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    n_epoch = args.epoch       # number of epochs
    n_units = args.unit        # number of units per layer
    batchsize = args.batchsize      # minibatch size
    n_label = args.label         # number of labels
    epoch_per_eval = args.epocheval  # number of epochs per evaluation

    if args.test:
        max_size = 10
    else:
        max_size = None

    vocab = {}
    train_data = [convert_tree(vocab, tree)
                  for tree in data.read_corpus('trees/train.txt', max_size)]
    train_iter = chainer.iterators.SerialIterator(train_data, batchsize)
    validation_data = [convert_tree(vocab, tree)
                       for tree in data.read_corpus('trees/dev.txt', max_size)]
    validation_iter = chainer.iterators.SerialIterator(
        validation_data, batchsize, repeat=False, shuffle=False)
    test_data = [convert_tree(vocab, tree)
                 for tree in data.read_corpus('trees/test.txt', max_size)]

    model = RecursiveNet(len(vocab), n_units, n_label)

    if args.gpu >= 0:
        cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Setup optimizer
    optimizer = optimizers.AdaGrad(lr=0.1)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(0.0001))

    def _convert(batch, _):
        return batch

    # Setup updater
    updater = chainer.training.StandardUpdater(
        train_iter, optimizer, device=args.gpu, converter=_convert)

    # Setup trainer and run
    trainer = chainer.training.Trainer(updater, (n_epoch, 'epoch'), args.out)
    trainer.extend(
        extensions.Evaluator(validation_iter, model, device=args.gpu,
                             converter=_convert),
        trigger=(epoch_per_eval, 'epoch'))
    trainer.extend(extensions.LogReport())

    trainer.extend(extensions.MicroAverage(
        'main/correct', 'main/total', 'main/accuracy'))
    trainer.extend(extensions.MicroAverage(
        'validation/main/correct', 'validation/main/total',
        'validation/main/accuracy'))

    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}'),
        trigger=(epoch_per_eval, 'epoch'))

    if args.resume is not None:
        chainer.serializers.load_npz(args.resume, trainer)
    trainer.run()

    print('Test evaluation')
    evaluate(model, test_data)
Example #23
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu',
                        '-g',
                        default=-1,
                        type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--epoch',
                        '-e',
                        default=400,
                        type=int,
                        help='number of epochs to learn')
    parser.add_argument('--unit',
                        '-u',
                        default=30,
                        type=int,
                        help='number of units')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=25,
                        help='learning minibatch size')
    parser.add_argument('--label',
                        '-l',
                        type=int,
                        default=5,
                        help='number of labels')
    parser.add_argument('--epocheval',
                        '-p',
                        type=int,
                        default=5,
                        help='number of epochs per evaluation')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    vocab = {}
    max_size = None
    train_trees = data.read_corpus('trees/train.txt', max_size)
    test_trees = data.read_corpus('trees/test.txt', max_size)

    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = numpy

    train_data = [linearize_tree(vocab, t, xp) for t in train_trees]
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_data = [linearize_tree(vocab, t, xp) for t in test_trees]
    test_iter = chainer.iterators.SerialIterator(test_data,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    model = ThinStackRecursiveNet(len(vocab), args.unit, args.label)

    if args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.AdaGrad(0.1)
    optimizer.setup(model)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=None,
                                       converter=convert)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'))
    trainer.extend(extensions.Evaluator(test_iter,
                                        model,
                                        converter=convert,
                                        device=None),
                   trigger=(args.epocheval, 'epoch'))
    trainer.extend(extensions.LogReport())

    trainer.extend(
        extensions.MicroAverage('main/correct', 'main/total', 'main/accuracy'))
    trainer.extend(
        extensions.MicroAverage('validation/main/correct',
                                'validation/main/total',
                                'validation/main/accuracy'))

    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    trainer.run()
Example #24
#import numpy as np
#import os, argparse, time, random
from BiLSTMmodel import bilstm_model
from data import read_corpus, read_dictionary, random_embedding
from config import config

## get char embeddings
word2id = read_dictionary('vocab')
## randomly initialize the embeddings
embeddings = random_embedding(word2id, config.embedding_size)

paths = {'log_path': 'logger//', 'model_path': './model2/', 'result_path': 'result//'}

# TODO note: be careful with model_path, it is easy to get wrong!

model = bilstm_model(embeddings, paths, word2id, config=config)
model.build_graph()



## train model on the whole training data
train_data = read_corpus('pku_training.utf8')
print("train data: {}".format(len(train_data)))
model.train(train_data=train_data) 

##test model
#test_data = read_corpus('pku_test_gold.utf8')
#print("test data: {}".format(len(test_data)))
#model.test(test_data=test_data) 

Example #25
if args.embedding_type == 'random':
    # randomly generate the embedding matrix (3905 characters, 300 features by default, shape 3905*300)
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embeddings = load_embeddings(args.embedding_dim, word2id,
                                 args.embedding_type)
    # Chinese word vectors trained with gensim (word2vec) on a Chinese Wikipedia corpus

print("\n=========embeddings==========\n", embeddings, "\ndim(embeddings)=",
      embeddings.shape)

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'ner_train_data')
    test_path = os.path.join('.', args.test_data, 'ner_test_data')
    train_data = read_corpus(train_path)  # read the training set
    test_data = read_corpus(test_path)  # read the test set
    test_size = len(test_data)
    print('train_data=\n', train_data)
    #print("\n==========train_data================\n",train_data)
    #print("\n==========test_data================\n",test_data)

## paths setting: create the corresponding directories

paths = {}
# A timestamp is just a point in time; it is commonly used to tell whether something
# has changed (an unmodified file keeps its timestamp, so it need not be written back).
# Here it simply gives each training run a unique output directory name.
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
# output_path is a timestamp-named directory under <train_data>_save
output_path = os.path.join('.', args.train_data + "_save", timestamp)
Example #26
parser.add_argument("--source_dataset", type=str, default="senticoref")
parser.add_argument("--target_dataset", type=str, default="coref149")
parser.add_argument("--kfold_state_cache_path", type=str, default=None)

if __name__ == "__main__":
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.StreamHandler(sys.stdout))
    args = parser.parse_args()

    if args.random_seed:
        torch.random.manual_seed(args.random_seed)
        np.random.seed(args.random_seed)

    src_docs = read_corpus(args.source_dataset)
    tgt_docs = read_corpus(args.target_dataset)

    all_tok2id, _ = extract_vocab(src_docs + tgt_docs, lowercase=True, top_n=10**9)
    logging.info(f"Total vocabulary size: {len(all_tok2id)} tokens")

    pretrained_embs = None
    embedding_size = args.embedding_size

    if args.use_pretrained_embs == "word2vec":
        # Note: pretrained word2vec embeddings we use are uncased
        logging.info("Loading pretrained Slovene word2vec embeddings")
        with codecs.open(args.embedding_path, "r", encoding="utf-8", errors="ignore") as f:
            num_tokens, embedding_size = list(map(int, f.readline().split(" ")))
            embs = {}
            for line in f:
Example #27
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

## training model
if args.mode == 'train':
    model = BiLSTM_CRF(args,
                       embeddings,
                       dictname2id,
                       word2id,
                       paths,
                       config=config)
    model.build_graph()

    train_path = os.path.join('.', args.train_data, 'train.txt')
    train_data = read_corpus(train_path, word2id, word2dictname, dictname2id)
    test_path = os.path.join('.', args.test_data, 'test.txt')
    test_data = read_corpus(test_path, word2id, word2dictname, dictname2id)
    test_size = len(test_data)

    ## train model on the whole training data
    print("train data: {}".format(len(train_data)))
    model.train(train=train_data, dev=test_data
                )  # use test_data as the dev_data to observe the overfitting phenomenon

## testing model
elif args.mode == 'test':
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args,
Example #28
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
if not os.path.exists(result_path): os.makedirs(result_path)

log_path = os.path.join(result_path, args.dataset_name + log_pre + "_log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

# read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('data_path', args.dataset_name, train_file)
    paths['train_path'] = train_path
    test_path = os.path.join('data_path', args.dataset_name, test_file)
    paths['test_path'] = test_path
    train_data = read_corpus(train_path)[:100]
    test_data = read_corpus(test_path)
    test_size = len(test_data)
    print("train data: {}".format(len(train_data)))
    print("test data: {}".format(test_size))

# training model
if args.mode == 'train':
    model = BiLSTM_CRF(args,
                       embeddings,
                       tag2label,
                       word2id,
                       paths,
                       config=config)
    model.build_graph()
Example #29

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')


## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path); test_size = len(test_data)


## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data+"_save", timestamp)
if not os.path.exists(output_path): os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path): os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path): os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
Example #30

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')


## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path); test_size = len(test_data)


## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data+"_save", timestamp)
if not os.path.exists(output_path): os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path): os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path): os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
Example #31
args = parser.parse_args()

import torch

torch.manual_seed(args.seed)
args.use_cuda = True

# load data
from data_loader import DataLoader
from data import read_corpus, tag2label
import os
from eval import conlleval

sents_train, labels_train, args.word_size, _ = read_corpus(
    os.path.join('.', args.data, 'source_data.txt'),
    os.path.join('.', args.data, 'source_label.txt'))
sents_test, labels_test, _, data_origin = read_corpus(
    os.path.join('.', args.data, 'test_data.txt'),
    os.path.join('.', args.data, 'test_label.txt'),
    is_train=False)
args.label_size = len(tag2label)

train_data = DataLoader(sents_train,
                        labels_train,
                        cuda=args.use_cuda,
                        batch_size=args.batch_size)
test_data = DataLoader(sents_test,
                       labels_test,
                       cuda=args.use_cuda,
                       shuffle=False,
Example #32
config = tf.ConfigProto()



## hyperparameters
embedding_dim = 128

tag2label = {"N": 0,
             "解剖部位": 1, "手术": 2,    # anatomical site, operation
             "药物": 3, "独立症状": 4,    # drug, independent symptom
             "症状描述": 5}               # symptom description
## get char embeddings
word2id = read_dictionary('./vocab.pkl')
embeddings = random_embedding(word2id, embedding_dim)

train_data = read_corpus('./c.txt')


# constructor args: embeddings, tag2label, vocab, batch_size, epoch, hidden_dim, CRF, update_embedding, shuffle
## training model
if __name__ == '__main__':
    model = BiLSTM_CRF(embeddings, tag2label, word2id, 4, 80, 128, False, True, True)
    model.build_graph()
    test_report = open('test_report.txt', 'w', encoding='utf-8')

    print("train data: {}".format(len(train_data)))
    model.test(test_report)
    # model.train(train=train_data)  # use test_data as the dev_data to observe the overfitting phenomenon


Example #33
args = parser.parse_args()

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    pre_train_path = os.path.join('.', args.train_data, 'resume_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    pre_train_data = read_pre_train_data(pre_train_path, args.seq_length)

    test_data = read_corpus(test_path)
    test_size = len(test_data)

## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data + "_save", timestamp)
if not os.path.exists(output_path): os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path): os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path): os.makedirs(model_path)
Example #34
def getTrainData(filename):
    return read_corpus(filename)
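Example #34 is just a thin wrapper: every example in this listing depends on some read_corpus, but the implementations differ between repositories and none is reproduced here. For orientation only, a minimal hypothetical reader for the common one-token-per-line "char<TAB>tag" NER format (blank lines separating sentences), as consumed by several of the BiLSTM-CRF examples above, might look like the following sketch.

def read_corpus_sketch(corpus_path):
    """Hypothetical reader for a 'char<TAB>tag' file, one token per line, with a
    blank line between sentences. Returns a list of (chars, tags) pairs. This is
    an illustrative sketch, not the exact read_corpus used by any example above."""
    data = []
    chars, tags = [], []
    with open(corpus_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                char, tag = line.split()
                chars.append(char)
                tags.append(tag)
            elif chars:
                data.append((chars, tags))
                chars, tags = [], []
    if chars:  # flush the last sentence if the file has no trailing blank line
        data.append((chars, tags))
    return data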