Code example #1
def getRest(input):
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args,
                       embeddings,
                       tag2label,
                       word2id,
                       paths,
                       config=config)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        print('============= demo =============')
        saver.restore(sess, ckpt_file)
        demo_sent = input
        if demo_sent == '' or demo_sent.isspace():
            return {'status': 'fail'}
        else:
            demo_sent = list(demo_sent.strip())
            demo_data = [(demo_sent, ['O'] * len(demo_sent))]
            tag = model.demo_one(sess, demo_data)
            PER, LOC, ORG = get_entity(tag, demo_sent)
            result = {'status': 'success', 'PER': PER, 'LOC': LOC, 'ORG': ORG}
            return result
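
A minimal usage sketch for the function above, assuming the module-level model_path, paths, args, embeddings, tag2label, word2id and config objects are already set up as in the surrounding project (the call below is hypothetical):

# Hypothetical driver code; getRest returns a status dict on both branches.
result = getRest('我在北京上北京大学')
if result['status'] == 'success':
    print('PER:', result['PER'])
    print('LOC:', result['LOC'])
    print('ORG:', result['ORG'])
else:
    print('empty input')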
Code example #2
def demotest(sentence):
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args,
                       embeddings,
                       tag2label,
                       word2id,
                       paths,
                       config=config)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        print('============= demo =============')
        saver.restore(sess, ckpt_file)
        while True:
            #print('Please input your sentence:')
            demo_sent = sentence
            if demo_sent == '' or demo_sent.isspace():
                print('Sentence is empty')
                PER = ['']
                LOC = ['']
                ORG = ['']
                return (PER, LOC, ORG)
            else:
                demo_sent = list(demo_sent.strip())
                demo_data = [(demo_sent, ['O'] * len(demo_sent))]
                tag = model.demo_one(sess, demo_data)
                PER, LOC, ORG = get_entity(tag, demo_sent)
                print('PER: {}\nLOC: {}\nORG: {}'.format(PER, LOC, ORG))
                return (PER, LOC, ORG)
Code example #3
def _main():
    data_manager = DataManager()
    vocab_size = len(data_manager.word2ix)
    model = BiLSTM_CRF(device, vocab_size, data_manager.tag2ix, EMBEDDING_DIM, HIDDEN_DIM)
    model = model.to(device)

    train_set = NerDataset(data_manager.train_sents, data_manager.train_tags)
    dev_set = NerDataset(data_manager.dev_sents, data_manager.dev_tags)
    train_loader = DataLoader(train_set, batch_size=BATCH_SZ, shuffle=True)
    dev_loader = DataLoader(dev_set, batch_size=BATCH_SZ, shuffle=True)

    optimizer = optim.Adam(model.parameters(), lr=0.01)
    epoch_loss = []

    '''with torch.no_grad():
        precheck_sent = to_tensor(train_loader[0])
        precheck_tag = to_tensor(dataset.train_tags[0])
        print(precheck_tag)
        print(model(precheck_sent))'''

    for epoch in range(EPOCH_NUM):
        epoch_loss = []  # reset each epoch so the printed mean is this epoch's average loss
        for sents, tags, lengths in tqdm(train_loader):
            sents = sents.to(device)
            tags = tags.to(device)
            lengths = lengths.to(device)
            # print(lengths, sents.size(), tags.size())
            loss = model.neg_log_likelihood(sents, tags, lengths)

            epoch_loss.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(epoch, ' epoch loss: ', sum(epoch_loss)/len(epoch_loss))
        save_model(model, epoch)
        eval(model, dev_loader)
Code example #4
 def demo_one(self, model_path):
     '''
     Tag a single input sentence interactively.
     :param model_path: directory containing the trained checkpoint
     input: 武三思與韋後日夜譖敬暉等不已
     :return: [[0, 2, 'PER'], [4, 5, 'PER'], [9, 10, 'PER']]
     '''
     ckpt_file = tf.train.latest_checkpoint(model_path)
     print(ckpt_file)
     self.paths['model_path'] = ckpt_file
     model = BiLSTM_CRF(args,
                        self.embedding,
                        self.tag2id,
                        self.word2id,
                        self.paths,
                        config=config)
     model.build_graph()
     saver = tf.train.Saver()
     with tf.Session(config=config) as sess:
         print('begin to demo one sentence!')
         saver.restore(sess, ckpt_file)
         while True:
             print('Please input your sentence:')
             demo_sent = input()
             if demo_sent == '' or demo_sent.isspace(
             ) or demo_sent == 'end':
                 print('See you next time!')
                 break
             else:
                 demo_sent = list(demo_sent.strip())
                 demo_data = [(demo_sent, ['O'] * len(demo_sent))]
                 tag = model.demo_one(sess, demo_data)
                 print(get_ner_demo(tag))
Code example #5
File: main.py  Project: wemiam/tgenerator
def evaluate_words(lines):
    print("start evaluate_words")
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args,
                       embeddings,
                       tag2label,
                       word2id,
                       paths,
                       config=config)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        print('============= demo =============')
        saver.restore(sess, ckpt_file)

        demo_sent = lines
        print(demo_sent)
        demo_sent = list(demo_sent.strip())
        print(demo_sent)
        demo_data = [(demo_sent, ['O'] * len(demo_sent))]
        tag = model.demo_one(sess, demo_data)
        PER, LOC, ORG = get_entity(tag, demo_sent)
        print('PER: {}\nLOC: {}\nORG: {}'.format(PER, LOC, ORG))
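
For reference, a self-contained sketch of the input structure these demo functions build before calling model.demo_one: the sentence is split into characters and paired with placeholder 'O' tags:

demo_sent = list('北京大学'.strip())
demo_data = [(demo_sent, ['O'] * len(demo_sent))]
# demo_data == [(['北', '京', '大', '学'], ['O', 'O', 'O', 'O'])]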
Code example #6
def predict_random(demo_sent):
    word2id, embeddings = getDicEmbed()
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    model = BiLSTM_CRF(batch_size=args.batch_size,
                       epoch_num=args.epoch,
                       hidden_dim=args.hidden_dim,
                       embeddings=embeddings,
                       dropout_keep=args.dropout,
                       optimizer=args.optimizer,
                       lr=args.lr,
                       clip_grad=args.clip,
                       tag2label=tag2label,
                       vocab=word2id,
                       shuffle=args.shuffle,
                       model_path=ckpt_file,
                       summary_path=summary_path,
                       log_path=log_path,
                       result_path=result_path,
                       CRF=args.CRF,
                       update_embedding=args.update_embedding)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        print('============= demo =============')
        saver.restore(sess, ckpt_file)
        demo_sent = list(demo_sent.strip())
        demo_data = [(demo_sent, ['M'] * len(demo_sent))]
        tag = model.demo_one(sess, demo_data)
    res = segment(demo_sent, tag)  # segment the tokenized input with the predicted tags
    print(res)
Code example #7
class NER_DEMO(object):
    def __init__(self, args):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = 0.2
        paths, model_path = get_paths(args)
        ckpt_file = tf.train.latest_checkpoint(model_path)

        paths['model_path'] = ckpt_file
        word2id = read_dictionary(
            os.path.join('.', args.train_data, 'word2id.pkl'))
        embeddings = random_embedding(word2id, args.embedding_dim)
        self.model = BiLSTM_CRF(args,
                                embeddings,
                                tag2label,
                                word2id,
                                paths,
                                config=config)
        self.model.build_graph()
        self.saver = tf.train.Saver()
        self.sess = tf.Session(config=config)
        self.saver.restore(self.sess, ckpt_file)

    def predict(self, demo_sent):
        if demo_sent == '' or demo_sent.isspace():
            print('See you next time!')
            return {}
        else:
            demo_sent = list(demo_sent.strip())
            demo_data = [(demo_sent, ['O'] * len(demo_sent))]
            tag = self.model.demo_one(self.sess, demo_data)
            entities = get_entity(tag, demo_sent)
            return entities
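
A usage sketch for the class above, assuming args is the same argparse namespace the original project builds (fields such as train_data and embedding_dim); the driver code below is hypothetical:

ner = NER_DEMO(args)
for sent in ['我在北京上北京大学', '']:
    print(ner.predict(sent))  # {} for empty input, otherwise the entities from get_entity()
ner.sess.close()  # the class keeps one session open across predict() calls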
Code example #8
def main(args):
    labels = [
        'O', 'B-LOC', 'B-ORG', 'B-T', 'I-LOC', 'I-PER', 'B-PER', 'I-ORG', 'I-T'
    ]
    # labels = ['O', 'I-PER', 'B-PER', 'I-LOC', 'I-ORG', 'B-ORG', 'B-LOC']
    args.num_labels = len(labels)

    tokenizer = None
    word2id = None
    if args.model == 'bert':
        is_BERT = True
        # use 'bert-base-chinese' model
        pretrained_model_name = 'bert-base-chinese'
        tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
        config = BertConfig.from_pretrained(
            pretrained_model_name,
            num_labels=args.num_labels,
            hidden_dropout_prob=args.hidden_dropout_prob)
        model = BERTforNER_CRF.from_pretrained(pretrained_model_name,
                                               config=config,
                                               use_crf=args.crf)
    else:
        is_BERT = False
        word2id = json.load(open(args.word2id_file, "r", encoding="utf8"))
        model = BiLSTM_CRF(len(word2id), args.embedding_dim, args.hidden_dim,
                           args.num_labels, args.hidden_dropout_prob, args.crf)

    framework = Framework(args)

    if args.mode == "train":
        print("loading training dataset...")
        train_dataset = NERDataset(file_path=args.train_file,
                                   labels=labels,
                                   word2id=word2id,
                                   tokenizer=tokenizer,
                                   max_len=args.max_len,
                                   is_BERT=is_BERT)

        print("loading dev datasets...")
        dev_dataset = NERDataset(file_path=args.dev_file,
                                 labels=labels,
                                 word2id=word2id,
                                 tokenizer=tokenizer,
                                 max_len=args.max_len,
                                 is_BERT=is_BERT)

        framework.train(train_dataset, dev_dataset, model, labels)

    print("\Testing ...")
    print("loading dev datasets...")
    test_dataset = NERDataset(file_path=args.test_file,
                              labels=labels,
                              word2id=word2id,
                              tokenizer=tokenizer,
                              max_len=args.max_len,
                              is_BERT=is_BERT)

    model.load_state_dict(torch.load(args.save_model))
    framework.test(test_dataset, model, labels)
Code example #9
def train_and_val():
    embedding_dim = 100
    hidden_dim = 100
    model_load_path = None
    best_model_save_path = 'model/model_100_best_0223.pth'
    max_score = 0
    stop_epoch = 30
    unimprove_time = 0
    val_json_path = '/home/agwave/Data/resume/val_0222.json'
    val_pdf_dir = '/home/agwave/Data/resume/val_0222/'

    training_data = get_data_from_data_txt(TRAIN_WORD_TO_TAG_PATH)
    with open('supporting_document/train_word_to_tag_0223.json', 'r') as j:
        word_to_ix = json.load(j)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38}
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    start_epoch = 0
    if model_load_path is not None:
        print('load model...')
        checkpoint = torch.load(model_load_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    preliminary_score = get_score_by_model(model, val_json_path, val_pdf_dir)
    print('preliminary score:', preliminary_score)

    for epoch in range(start_epoch, stop_epoch):
        print("---------------------")
        print("running epoch : ", epoch)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
        cur_epoch_score = get_score_by_model(model, val_json_path, val_pdf_dir)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        if cur_epoch_score > max_score:
            unimprove_time = 0
            max_score = cur_epoch_score
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch
            }, best_model_save_path)
            print('save best model successfully.')
        else:
            break
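
The prepare_sequence helper used above is not shown in this snippet; a common definition, matching the PyTorch BiLSTM-CRF tutorial this code follows and assumed here, is:

import torch

def prepare_sequence(seq, to_ix):
    # Map each token to its vocabulary index and return a LongTensor.
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)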
Code example #10
File: predict.py  Project: ljx02/NER
def run():
    embedding_mat = np.random.uniform(
        -0.25, 0.25,
        (len(read_dictionary(params.vocab_path)), params.embedding_dim))
    embedding_mat = np.float32(embedding_mat)
    embeddings = embedding_mat
    num_tags = len(params.tag2label)
    summary_path = "logs"
    model = BiLSTM_CRF(embeddings, params.update_embedding, params.hidden_dim,
                       num_tags, params.clip, summary_path, params.optimizer)
    model.build_graph()
    predict(model, params.batch_size, read_dictionary(params.vocab_path),
            params.tag2label)
Code example #11
File: predict.py  Project: jma-code/CH-NER
def run(demo_sent, flag=False):
    embedding_mat = np.random.uniform(-0.25, 0.25, (len(read_dictionary(params.vocab_path)), params.embedding_dim))
    embedding_mat = np.float32(embedding_mat)
    embeddings = embedding_mat
    num_tags = len(params.tag2label)
    summary_path = "logs"
    model = BiLSTM_CRF(embeddings, params.update_embedding, params.hidden_dim, num_tags, params.clip, summary_path,
                       params.optimizer)
    model.build_graph()
    PER_mess, LOC_mess, ORG_mess = predict(model, params.batch_size, read_dictionary(params.vocab_path), params.tag2label, demo_sent)
    if flag:
        return PER_mess, LOC_mess, ORG_mess

#run('我在北京上北京大学,周恩来是中国总理,我喜欢北京。我在清华大学,毛泽东是中国主席,他去过苏联。')
Code example #12
File: main.py  Project: ryan147k/NER-pytorch
def test():
    """
    模型测试
    """
    # 模型
    bilstm_crf = BiLSTM_CRF(opt.vocab_size, opt.emb_dim, opt.emb_dim//2, opt.tag_num, dropout=opt.dropout)
    if opt.load_model_path:
        bilstm_crf.load(opt.load_model_path)

    # data
    test_dataset = RmrbDataset(train=False)
    test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset))
    for i, (x_batch, y_batch) in enumerate(test_dataloader):
        y_hat = bilstm_crf(x_batch)
        print(classification_report(t.flatten(y_batch), t.flatten(y_hat)))
Code example #13
def test(data, file):
    """
    created by jma
    模型测试
    :param data:测试数据
    :param file:模型
    """
    model = BiLSTM_CRF(embeddings, args.update_embedding, args.hidden_dim, len(tag2label), args.clip,
                       params.summary_path, args.optimizer)
    model.build_graph()
    testsaver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        testsaver.restore(sess, file)
        label_list, seq_len_list = dev_one_epoch(model, sess, data)
        evaluate(label_list, data)
Code example #14
def run(word_train,
        label_train,
        word_dev,
        label_dev,
        vocab,
        device,
        kf_index=0):
    # build dataset
    train_dataset = SegDataset(word_train, label_train, vocab, config.label2id)
    dev_dataset = SegDataset(word_dev, label_dev, vocab, config.label2id)
    # build data_loader
    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True,
                              collate_fn=train_dataset.collate_fn)
    dev_loader = DataLoader(dev_dataset,
                            batch_size=config.batch_size,
                            shuffle=True,
                            collate_fn=dev_dataset.collate_fn)
    # model
    model = BiLSTM_CRF(embedding_size=config.embedding_size,
                       hidden_size=config.hidden_size,
                       vocab_size=vocab.vocab_size(),
                       target_size=vocab.label_size(),
                       num_layers=config.lstm_layers,
                       lstm_drop_out=config.lstm_drop_out,
                       nn_drop_out=config.nn_drop_out)
    model.to(device)
    # optimizer
    optimizer = optim.Adam(model.parameters(),
                           lr=config.lr,
                           betas=config.betas)
    scheduler = StepLR(optimizer,
                       step_size=config.lr_step,
                       gamma=config.lr_gamma)
    # how to initialize these parameters elegantly
    for p in model.crf.parameters():
        _ = torch.nn.init.uniform_(p, -1, 1)
    # train and test
    # train(train_loader, dev_loader, vocab, model, optimizer, scheduler, device, kf_index)
    with torch.no_grad():
        # test on the final test set
        test_loss, f1 = test(config.test_dir, vocab, device, kf_index)
    return test_loss, f1
Code example #15
def train(args):
    train_data = args.train_data
    train_eval_split = args.train_eval_split
    min_count = args.min_count
    vocab_file = args.vocab_file
    max_step = args.max_step
    model_path = args.model_path
    data_fold = args.data_fold
    train_sents, train_sent_labels, eval_sents, eval_sent_labels = preprocess(
        train_data, train_eval_split, data_fold)
    vocab = build_vocab(train_sents, min_count, vocab_file)
    bilstm_crf = BiLSTM_CRF(args.batch_size,
                            args.embedding_size,
                            args.hidden_size,
                            args.lr,
                            len(list(vocab.keys())),
                            tag_num=7)
    print("bilstm_crf object created.")
    sess = tf.InteractiveSession()
    saver = tf.train.Saver(max_to_keep=3)
    sess.run(tf.global_variables_initializer())
    batched_train_sents = batch_generate(train_sents, args.batch_size)
    batched_train_labels = batch_generate(train_sent_labels, args.batch_size)
    print("Batch generator created.")
    while True:
        for batch in zip(batched_train_sents, batched_train_labels):
            #print(batch[0][:10])
            #print(batch[1][:10])
            batch_sents, batch_labels, seq_len = batch_preprocess(batch, vocab)
            loss_value, _, global_step_value = sess.run(
                (bilstm_crf.loss, bilstm_crf.train_step,
                 bilstm_crf.global_step),
                feed_dict={
                    bilstm_crf.input_sents: batch_sents,
                    bilstm_crf.input_labels: batch_labels,
                    bilstm_crf.sequence_lengths: seq_len
                })
            print("%d step finished." % (global_step_value))
            if (global_step_value % 10 == 0) or (global_step_value == 1):
                f_score, right_ner, recog_ner, all_ner, shape = sess.run(
                    (bilstm_crf.f_score, bilstm_crf.right_ner,
                     bilstm_crf.recog_ner, bilstm_crf.all_ner,
                     bilstm_crf.shape),
                    feed_dict={
                        bilstm_crf.input_sents: batch_sents,
                        bilstm_crf.input_labels: batch_labels,
                        bilstm_crf.sequence_lengths: seq_len
                    })
                print("%d step, loss is %s, f_score is %s" %
                      (global_step_value, str(loss_value), str(f_score)))
                print(
                    "length is %s, right_ner is %d, recog_ner is %d, all_ner is %d"
                    % (str(shape), right_ner, recog_ner, all_ner))
                saver.save(sess, os.path.join(model_path, "model.ckpt"))
            if global_step_value >= max_step:
                # stop once max_step training steps have been run
                saver.save(sess, os.path.join(model_path, "model.ckpt"))
                return
Code example #16
    def __init__(self, args):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = 0.2
        paths, model_path = get_paths(args)
        ckpt_file = tf.train.latest_checkpoint(model_path)

        paths['model_path'] = ckpt_file
        word2id = read_dictionary(
            os.path.join('.', args.train_data, 'word2id.pkl'))
        embeddings = random_embedding(word2id, args.embedding_dim)
        self.model = BiLSTM_CRF(args,
                                embeddings,
                                tag2label,
                                word2id,
                                paths,
                                config=config)
        self.model.build_graph()
        self.saver = tf.train.Saver()
        self.sess = tf.Session(config=config)
        self.saver.restore(self.sess, ckpt_file)
Code example #17
def Train(trainfile):
    word2id, embeddings = getDicEmbed()
    traindata = getTrainData(trainfile)

    model = BiLSTM_CRF(batch_size=args.batch_size,
                       epoch_num=args.epoch,
                       hidden_dim=args.hidden_dim,
                       embeddings=embeddings,
                       dropout_keep=args.dropout,
                       optimizer=args.optimizer,
                       lr=args.lr,
                       clip_grad=args.clip,
                       tag2label=tag2label,
                       vocab=word2id,
                       shuffle=args.shuffle,
                       model_path=ckpt_prefix,
                       summary_path=summary_path,
                       log_path=log_path,
                       result_path=result_path,
                       CRF=args.CRF,
                       update_embedding=args.update_embedding)
    model.build_graph()

    dev_data = traindata[:5000]
    dev_size = len(dev_data)
    train_data = traindata[5000:]
    train_size = len(train_data)
    print("train data: {0}\n dev data: {1}".format(train_size, dev_size))
    model.train(train_data, dev_data)  # use the train/dev split created above
Code example #18
def train_all_data():
    embedding_dim = 100
    hidden_dim = 100
    stop_epoch = 1
    model_1_epoch = 'model/model_1_epoch_lr0001.pth'

    training_data = get_data_from_data_txt(DATA_PERFECT_PATH)
    word_to_ix = get_word_to_ix(training_data, min_word_freq=1)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38, 'c-live': 39, 'c-proj': 40, 'c-woti': 41,
                 'c-post': 42, 'c-unv': 43, 'c-nati': 44, 'c-poli': 45, 'c-prti':46, 'c-comp': 47}

    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Make sure prepare_sequence from earlier in the LSTM section is loaded
    for epoch in range(
            stop_epoch):  # again, normally you would NOT do 300 epochs, it is toy data
        print("---------------------")
        print("running epon : ", epoch + 1)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 15)
            optimizer.step()
        cur_epoch_score = get_score_by_model(model, TRAIN_JSON_PATH, TRAIN_PDF_DIR)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        print()
        if epoch == stop_epoch - 1:  # save after the final epoch; range() never yields stop_epoch itself
            torch.save({
                'model_state_dict': model.state_dict()
            }, model_1_epoch)
Code example #19
def init_model(char_to_ix, tag_to_ix, START_CHAR_ID, STOP_CHAR_ID,
               START_TAG_ID, STOP_TAG_ID):
    if args.old_model is not None:
        model = torch.load(args.old_model)

    else:
        if args.char_embeddings is not None:
            char_embeddings = utils.read_pretrained_embeddings(
                args.char_embeddings, char_to_ix)
            EMBEDDING_DIM = char_embeddings.shape[1]
        else:
            char_embeddings = None
            EMBEDDING_DIM = args.char_embeddings_dim
        model = BiLSTM_CRF(len(char_to_ix), len(tag_to_ix), START_CHAR_ID,
                           STOP_CHAR_ID, START_TAG_ID, STOP_TAG_ID,
                           args.use_bigram, args.hidden_dim, args.dropout,
                           EMBEDDING_DIM, char_embeddings)

    return processor.to_cuda_if_available(model)
Code example #20
 def train_model(self):
     '''
     Start training.
     :return:
     '''
     model = BiLSTM_CRF(args,
                        self.embedding,
                        self.tag2id,
                        self.word2id,
                        self.paths,
                        config=config)
     model.build_graph()
     print("train data: {}".format(len(self.train_data)))
     print("dev data: {}".format(len(self.dev_data)))
     model.train(self.train_data, self.dev_data, args)
Code example #21
 def test_model(self, model_path):
     '''
     Start testing; the model path must be passed in.
     :param model_path:
     :return:
     '''
     ckpt_file = tf.train.latest_checkpoint(model_path)
     print(ckpt_file)
     self.paths['model_path'] = ckpt_file
     model = BiLSTM_CRF(args,
                        self.embedding,
                        self.tag2id,
                        self.word2id,
                        self.paths,
                        config=config)
     model.build_graph()
     print("test data size: {}".format(len(self.test_data)))
     model.test(self.test_data, args)
Code example #22
def train(train_corpus, test_corpus):
    """
    create by ljx
    进行模型训练
    :param train_corpus: 训练数据
    :param test_corpus: 测试数据
    :return: 
    """
    # model.train
    model = BiLSTM_CRF(embeddings, args.update_embedding, args.hidden_dim, len(tag2label), args.clip,
                       params.summary_path, args.optimizer)
    model.build_graph()

    saver = tf.train.Saver(tf.global_variables())
    with tf.Session(config=config) as sess:
        # tf.global_variables_initializer()  # initialize model parameters
        sess.run(model.init_op)
        model.add_summary(sess)

        for epoch in range(args.epoch):
            run_one_epoch(model, sess, train_corpus, test_corpus, tag2label, epoch, saver)
Code example #23
File: main.py  Project: ryan147k/NER-pytorch
def predict(sentence, print_entity=False):
    """
    模型预测
    """
    # 模型
    bilstm_crf = BiLSTM_CRF(opt.vocab_size, opt.emb_dim, opt.emb_dim//2, opt.tag_num, dropout=opt.dropout)
    if opt.load_model_path:
        bilstm_crf.load(opt.load_model_path)
    bilstm_crf.eval()

    # data
    x = word2idx(sentence)
    x = t.LongTensor(x).unsqueeze(dim=0)

    tag_idx = bilstm_crf(x).squeeze(dim=0)
    tag_idx = tag_idx.numpy().tolist()

    length = min(opt.max_length, len(sentence))
    entity_list = []
    i = 0
    while i < length:
        if tag_idx[i] == 1:
            entity = sentence[i]
            j = i + 1
            for j in range(i+1, length):
                if tag_idx[j] == 2:
                    entity += sentence[j]
                else:
                    break
            i = j
            entity_list.append(entity)
        else:
            i += 1

    if print_entity:
        print(entity_list)
        print('\n')

    return idx2tag(tag_idx)
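
A brief usage sketch for predict, assuming opt.load_model_path points at a trained checkpoint and word2idx / idx2tag come from the same project (the call is hypothetical):

tags = predict('我在北京上北京大学', print_entity=True)
print(tags)  # per-character tag names produced by idx2tag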
Code example #24
def Test(testfile):
    word2id, embeddings = getDicEmbed()
    testdata = getTrainData(testfile)
    ckpt_file = tf.train.latest_checkpoint(model_path)
    model = BiLSTM_CRF(batch_size=args.batch_size,
                       epoch_num=args.epoch,
                       hidden_dim=args.hidden_dim,
                       embeddings=embeddings,
                       dropout_keep=args.dropout,
                       optimizer=args.optimizer,
                       lr=args.lr,
                       clip_grad=args.clip,
                       tag2label=tag2label,
                       vocab=word2id,
                       shuffle=args.shuffle,
                       model_path=ckpt_file,
                       summary_path=summary_path,
                       log_path=log_path,
                       result_path=result_path,
                       CRF=args.CRF,
                       update_embedding=args.update_embedding)
    model.build_graph()

    model.test(testdata)
Code example #25
File: main.py  Project: GhibliField/zh-NER-TF
if not os.path.exists(summary_path): os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path): os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
if not os.path.exists(result_path): os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))


## training model
if args.mode == 'train':
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()

    ## hyperparameters-tuning, split train/dev
    # dev_data = train_data[:5000]; dev_size = len(dev_data)
    # train_data = train_data[5000:]; train_size = len(train_data)
    # print("train data: {0}\ndev data: {1}".format(train_size, dev_size))
    # model.train(train=train_data, dev=dev_data)

    ## train model on the whole training data
    print("train data: {}".format(len(train_data)))
    model.train(train=train_data, dev=test_data)  # use test_data as the dev_data to see overfitting phenomena

## testing model
elif args.mode == 'test':
    ckpt_file = tf.train.latest_checkpoint(model_path)
Code example #26
import torch
import torch.optim as optim
from dataset import Dataset
from model import BiLSTM_CRF

# torch.set_default_tensor_type('torch.cuda.FloatTensor')

epochs = 100
dataset = Dataset()
train_loader = dataset.get_train_loader(1)
model = BiLSTM_CRF(dataset.get_vocab_size(), dataset.get_label_index_dict(),
                   128, 128)

optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

model.train()
for epoch in range(epochs):
    for iter, batch in enumerate(train_loader):
        sentence_in, targets = batch.line, batch.label

        sentence_in = sentence_in.permute([1, 0]).reshape(-1).contiguous()
        targets = targets.permute([1, 0]).reshape(-1).contiguous()

        model.zero_grad()
        loss = model.neg_log_likelihood(sentence_in.squeeze(-1),
                                        targets.squeeze(-1)) / len(sentence_in)

        loss.backward()
        optimizer.step()

        print("{}-{}: {:.5f}".format(epoch, iter, loss.item()))
Code example #27
File: train.py  Project: msps9341012/NER-pytorch
#     mappings = {
#         'word_to_id': word_to_id,
#         'tag_to_id': tag_to_id,
#         'char_to_id': char_to_id,
#         'parameters': parameters,
#         'word_embeds': word_embeds
#     }
#     cPickle.dump(mappings, f)

print('word_to_id: ', len(word_to_id))
model = BiLSTM_CRF(vocab_size=len(word_to_id),
                   tag_to_ix=tag_to_id,
                   embedding_dim=parameters['word_dim'],
                   hidden_dim=parameters['word_lstm_dim'],
                   use_gpu=use_gpu,
                   char_to_ix=char_to_id,
                   pre_word_embeds=word_embeds,
                   use_crf=parameters['crf'],
                   char_mode=parameters['char_mode'],
                   char_embedding_dim=parameters['char_dim'],
                   char_lstm_dim=parameters['char_lstm_dim'],
                   alpha=parameters['alpha'])
# n_cap=4,
# cap_embedding_dim=10)

if use_gpu:
    model.cuda()

learning_rate = 0.015
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
losses = []
best_dev_F = -1.0
Code example #28
File: main.py  Project: LeiChen9/zh-NER-TF
if not os.path.exists(model_path): os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
if not os.path.exists(result_path): os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

## Pretrain language model
if args.mode == 'pre_train':
    args.CRF = False
    model = BiLSTM_CRF(args,
                       embeddings,
                       tag2label,
                       word2id,
                       paths,
                       config=config)
    model.num_tags = len(word2id)
    model.build_graph()

    ## hyperparameters-tuning, split train/dev
    # dev_data = train_data[:5000]; dev_size = len(dev_data)
    # train_data = train_data[5000:]; train_size = len(train_data)
    # print("train data: {0}\ndev data: {1}".format(train_size, dev_size))
    # model.train(train=train_data, dev=dev_data)

    ## train model on the whole training data
    print("train data: {}".format(len(pre_train_data)))
    model.train(train=pre_train_data, dev=test_data)  # use test_data as the dev set to observe overfitting
Code example #29
    train_data = read_train_corpus(file_path=train_path, maxlen=args.max_len)
    print(args.data_augment)
    if args.data_augment:
        train_data = data_augmentation(train_data, maxlen=args.max_len)
    print("loading valid data...")
    valid_path = file_path = os.path.join('./data', args.valid_data)
    valid_data = read_train_corpus(file_path=valid_path, maxlen=args.max_len)

    print("building model...")
    result_path = os.path.join('./result', args.result_path)
    valid_result_path = os.path.join('./result', args.valid_result)
    model_path = os.path.join(args.model_path, 'model.ckpt')

    model = BiLSTM_CRF(args,
                       embeddings,
                       model_path=model_path,
                       result_path=result_path,
                       valid_result=valid_result_path,
                       config=config)
    model.build_graph()

    ## train model on the whole training data
    print("train data: {}".format(len(train_data)))
    print("start trainging...")
    model.train(train=train_data, dev=valid_data)  # use valid_data as the dev set during training

## testing model
elif args.mode == 'test':
    print("loading testing data...")
    test_path = os.path.join('./data', args.test_data)
    test_data = read_test_corpus(file_path=test_path, maxlen=args.max_len)
Code example #30
def sort_batch_data(sentences, lengths):
    lengths_sort, idx_sort = lengths.sort(0, descending=True)
    sentences_sort = sentences[idx_sort]
    _, idx_unsort = idx_sort.sort(0, descending=False)

    return sentences_sort, lengths_sort, idx_unsort


char2idx = pickle.load(open('char2idx.pkl', 'rb'))
data = pickle.load(open('predict_data.pkl', 'rb'))

predict_data = PredData(data, char2idx)
dataloader = DataLoader(predict_data, batch_size=32, drop_last=False)

model = BiLSTM_CRF(len(char2idx), len(Config.tagert2idx), Config.embedding_dim,
                   Config.hidden_dim)

model.load_state_dict(torch.load('model_best.pth'))
if Config.use_gpu:
    model.to('cuda')
model.eval()

predict_result = []
with torch.no_grad():
    for batch_sentences, batch_lengths in dataloader:
        sentences, lengths, idx_unsort = sort_batch_data(
            batch_sentences, batch_lengths)
        if Config.use_gpu:
            sentences = sentences.cuda()
        pred = model(sentences, lengths)
        pred = pred[idx_unsort]
Code example #31
if not os.path.exists(model_path):
    os.makedirs(model_path)

ckpt_prefix = os.path.join(model_path, "model")
result_path = os.path.join(output_path, "results")

if not os.path.exists(result_path):
    os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
get_logger(log_path).info(str(args))

# training model
if args.mode == 'train':
    model = BiLSTM_CRF(batch_size=args.batch_size, epoch_num=args.epoch, hidden_dim=args.hidden_dim, embeddings=embeddings,
                       dropout_keep=args.dropout, optimizer=args.optimizer, lr=args.lr, clip_grad=args.clip,
                       tag2label=tag2label, vocab=word2id, shuffle=args.shuffle,
                       model_path=ckpt_prefix, summary_path=summary_path, log_path=log_path, result_path=result_path,
                       CRF=args.CRF, update_embedding=args.update_embedding)
    model.build_graph()
    # hyperparameters-tuning, split train/dev
    # train model on the whole training raw_data
    print("train raw_data: {}".format(len(train_data)))
    model.train(train_data, test_data)  # we could use test_data as the dev_data to see the overfitting phenomena

# testing model
elif args.mode == 'test':
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    model = BiLSTM_CRF(batch_size=args.batch_size, epoch_num=args.epoch, hidden_dim=args.hidden_dim, embeddings=embeddings,
                       dropout_keep=args.dropout, optimizer=args.optimizer, lr=args.lr, clip_grad=args.clip,
                       tag2label=tag2label, vocab=word2id, shuffle=args.shuffle,