Exemple #1
0
def train_ner():
    """Override the parsed arguments with project paths and start NER training.

    The default arguments come from ``get_args_parser``; the data and
    checkpoint locations are then rooted at ``prefix`` (a name defined
    outside this function), while ``output_dir`` is a relative path.
    The effective configuration is printed before ``train`` is called.
    """
    import os
    import sys
    from bert_base.train.train_helper import get_args_parser
    from bert_base.train.bert_lstm_ner import train

    args = get_args_parser()

    # Dataset and label files.
    args.data_dir = prefix + 'data_dir/'
    args.label_list = prefix + 'data_dir/labels.txt'

    # Pretrained BERT checkpoint, config and vocabulary.
    args.init_checkpoint = prefix + 'init_checkpoint/bert_model.ckpt'
    args.bert_config_file = prefix + 'init_checkpoint/bert_config.json'
    args.vocab_file = prefix + 'init_checkpoint/vocab.txt'

    # Run-time switches; checkpoint/summary intervals are not overridden here.
    args.output_dir = 'out_dir/'
    args.verbose = True
    args.gpu_memory_fraction = 1.0
    args.do_predict = False
    args.clean = True

    # Dump every argument, sorted by name, as an aligned two-column table.
    rows = ['%20s = %s' % pair for pair in sorted(vars(args).items())]
    banner = (' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, '\n'.join(rows))
    print('usage: %s\n%20s   %s\n%s\n%s\n' % banner)
    print(args)

    # Restrict TensorFlow to the configured GPUs before training starts.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    tf.logging.set_verbosity(tf.logging.INFO)
    train(args=args)
def train_ner():
    """Parse the arguments, print them, and launch NER training.

    ``os``, ``get_args_parser`` and ``train`` are expected to be available
    from the enclosing module; only ``sys`` is imported locally.
    """
    import sys

    args = get_args_parser()

    # Aligned "name = value" listing of every parsed argument, sorted by name.
    listing = '\n'.join(
        '%20s = %s' % pair for pair in sorted(vars(args).items()))
    command_line = ' '.join(sys.argv)
    print('usage: %s\n%20s   %s\n%s\n%s\n' %
          (command_line, 'ARG', 'VALUE', '_' * 50, listing))
    print(args)

    # Select the visible GPUs before handing over to the trainer.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    train(args=args)
Exemple #3
0
def train_bert_class():
    """Parse the arguments, print them, and launch BERT classifier training."""
    import os
    import sys
    from bert_base.train.train_helper import get_args_parser
    from bert_base.train.bert_classifier import train

    args = get_args_parser()

    # Show the invocation followed by a sorted, aligned table of arguments.
    table = '\n'.join(
        '%20s = %s' % pair for pair in sorted(vars(args).items()))
    divider = '_' * 50
    print('usage: %s\n%20s   %s\n%s\n%s\n'
          % (' '.join(sys.argv), 'ARG', 'VALUE', divider, table))
    print(args)

    # Select the visible GPUs before the trainer initialises TensorFlow.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    train(args)
Exemple #4
0
def train_ner():
    """Parse the arguments, print them, and launch NER training.

    Uses the ``bert_lstm_ner_train_inpend_eval`` trainer, in which training
    and evaluation can be run separately; ``bert_base.train.bert_lstm_ner``
    is the alternative that runs training and evaluation together.
    """
    import os
    import sys
    from bert_base.train.train_helper import get_args_parser
    from bert_base.train.bert_lstm_ner_train_inpend_eval import train

    args = get_args_parser()

    # Sorted, aligned "name = value" table of the parsed arguments.
    sorted_items = sorted(vars(args).items())
    param_table = '\n'.join('%20s = %s' % pair for pair in sorted_items)
    print('usage: %s\n%20s   %s\n%s\n%s\n'
          % (' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, param_table))

    # Select the visible GPUs before handing over to the trainer.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    train(args=args)
Exemple #5
0
def train_ner():
    """Parse the arguments, print them, and launch NER training.

    The trainer is imported from the top-level ``bert_lstm_ner`` module
    rather than from the ``bert_base`` package.
    """
    import os
    import sys
    from bert_base.train.train_helper import get_args_parser
    from bert_lstm_ner import train

    args = get_args_parser()

    # One right-aligned "name = value" row per argument, sorted by name.
    rows = []
    for name, value in sorted(vars(args).items()):
        rows.append('%20s = %s' % (name, value))
    print('usage: %s\n%20s   %s\n%s\n%s\n'
          % (' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, '\n'.join(rows)))

    # Select the visible GPUs before handing over to the trainer.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    train(args=args)

# if __name__ == '__main__':
#     # start_server()
#     train_ner()
def get_ner_list(sentences):
    """Return one NER result per input sentence.

    The sentences are encoded through a ``BertClient`` opened in ``'NER'``
    mode; each raw result is then paired with the tokenised sentence and
    converted by ``NER_Result.result_to_json``.

    Args:
        sentences: Sequence of sentence strings to analyse.

    Returns:
        A list with one ``result_to_json`` output per sentence, in input
        order.
    """
    args = get_args_parser()

    # The tokenizer must use the same vocabulary as the served BERT model.
    bert_dir = r'NER_model/chinese_L-12_H-768_A-12'
    vocab_path = os.path.join(bert_dir, 'vocab.txt')
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path,
                                           do_lower_case=args.do_lower_case)

    client = BertClient(show_server_config=False,
                        check_version=False,
                        check_length=False,
                        mode='NER')
    predictions = client.encode(sentences)

    formatter = NER_Result()
    return [
        formatter.result_to_json(tokenizer.tokenize(sentence), prediction)
        for sentence, prediction in zip(sentences, predictions)
    ]
Exemple #7
0
def train_ner():
    """Print the parsed arguments, wipe the output directory, and train.

    Any existing ``args.output_dir`` is deleted recursively before training
    so the run starts from an empty directory.
    """
    import os
    import shutil  # needed for rmtree below; the original never imported it
    import sys
    from bert_base.train.train_helper import get_args_parser
    from bert_base.train.bert_lstm_ner import train

    args = get_args_parser()

    # Sorted, aligned "name = value" table of the parsed arguments.
    param_str = '\n'.join(
        '%20s = %s' % (k, v) for k, v in sorted(vars(args).items()))
    print('usage: %s\n%20s   %s\n%s\n%s\n' %
          (' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, param_str))
    print(args)

    # Select the visible GPUs before handing over to the trainer.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map

    # Remove results of a previous run; this is destructive and unconditional.
    if os.path.isdir(args.output_dir):
        shutil.rmtree(args.output_dir)

    train(args=args)
Exemple #8
0
def train_ner():
    """Override selected hyper-parameters, print the arguments, and train.

    The output directory is placed under a hard-coded Windows project root.
    This function does not override the BERT checkpoint, config or vocabulary
    paths, so those keep whatever ``get_args_parser`` returns.
    """
    import os
    import sys
    from bert_base.train.train_helper import get_args_parser
    from bert_base.train.bert_lstm_ner import train

    # NOTE(review): machine-specific absolute path; adjust per environment.
    root_path = r'D:\localE\code\daguang_extract\BERT-BiLSTM-CRF-NER-tjl'

    args = get_args_parser()
    args.clean = True
    args.max_seq_length = 128
    args.do_train = True
    args.output_dir = os.path.join(root_path, 'output')
    args.num_train_epochs = 30
    args.learning_rate = 1e-4
    args.warmup_proportion = 0.1

    # Sorted, aligned "name = value" table of the effective arguments.
    param_str = '\n'.join(
        '%20s = %s' % (k, v) for k, v in sorted(vars(args).items()))
    print('usage: %s\n%20s   %s\n%s\n%s\n' %
          (' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, param_str))
    print(args)

    # Select the visible GPUs before handing over to the trainer.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map

    train(args=args)
Exemple #9
0
基于terminal_predict.py进行的修改,用于处理资讯新闻的人名和机构名等的抽取
@Author: geodgechen
"""

import tensorflow as tf
import numpy as np
import codecs
import pickle
import os

from datetime import datetime

from bert_base.train.models import create_model, InputFeatures
from bert_base.bert import tokenization, modeling
from bert_base.train.train_helper import get_args_parser
# Parse the shared arguments once, at import time.
args = get_args_parser()

# Hard-coded absolute paths.
# NOTE(review): presumably the fine-tuned NER output directory and the
# pretrained Chinese BERT directory — confirm against the deployment layout.
model_dir = '/data/label/ner_extract/output'
bert_dir = '/data/label/ner_extract/chinese_L-12_H-768_A-12'

# Module-level settings; their consumers are not visible in this fragment.
is_training = False
use_one_hot_embeddings = False
batch_size = 1

# Create a TensorFlow session at import time, with GPU memory allocated on
# demand (allow_growth) rather than reserved up front.
gpu_config = tf.ConfigProto()
gpu_config.gpu_options.allow_growth = True
sess = tf.Session(config=gpu_config)
model = None

# A `global` statement at module scope has no effect; `graph` is not assigned
# anywhere in this fragment.
global graph
# Placeholders for the model inputs, initialised to None here.
input_ids_p, input_mask_p, label_ids_p, segment_ids_p = None, None, None, None
Exemple #10
0
def zjb_eval(output_dir, data_dir, max_seq_length, vocab_file, batch_size,
             learning_rate, init_checkpoint, bert_config_file,
             num_train_epochs):
    """Predict on the test split and score the result with conlleval.

    Builds a ``tf.estimator.Estimator`` over ``output_dir``, runs prediction
    on the examples returned by ``NerProcessor.get_test_examples``, writes
    ``token gold predicted`` triples to ``label_test.txt``, prints the
    conlleval report and appends it to ``predict_score.txt``.

    Args:
        output_dir: Model directory; must contain ``label2id.pkl``. All
            output files are written here.
        data_dir: Directory passed to the processor to load the examples.
        max_seq_length: Sequence length used for feature conversion and input.
        vocab_file: Vocabulary file for the tokenizer.
        batch_size: Batch size used for the step computation and for the
            estimator's ``params``.
        learning_rate: Forwarded to ``model_fn_builder``.
        init_checkpoint: Forwarded to ``model_fn_builder``.
        bert_config_file: JSON file describing the BERT configuration.
        num_train_epochs: Used only to derive the train/warm-up step counts.

    Raises:
        AttributeError: If the derived number of training steps is below 1.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    from bert_base.train.train_helper import get_args_parser
    args = get_args_parser()

    # Drop any token file left behind by an earlier run.
    token_path = os.path.join(output_dir, "token_test.txt")
    if os.path.exists(token_path):
        os.remove(token_path)

    # SECURITY: pickle.load can execute arbitrary code; label2id.pkl must come
    # from a trusted training run.
    with codecs.open(os.path.join(output_dir, 'label2id.pkl'), 'rb') as rf:
        label2id = pickle.load(rf)
    id2label = {value: key for key, value in label2id.items()}

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=args.do_lower_case)
    processors = {"ner": NerProcessor}
    processor = processors[args.ner](output_dir)
    label_list = processor.get_labels()
    predict_examples = processor.get_test_examples(data_dir)
    predict_file = os.path.join(output_dir, "predict.tf_record")
    filed_based_convert_examples_to_features(predict_examples,
                                             label_list,
                                             max_seq_length,
                                             tokenizer,
                                             predict_file,
                                             output_dir,
                                             mode="test")

    tf.logging.info("***** Running prediction*****")
    tf.logging.info("  Num examples = %d", len(predict_examples))
    tf.logging.info("  Batch size = %d", batch_size)

    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=max_seq_length,
        is_training=False,
        drop_remainder=False)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)

    run_config = tf.estimator.RunConfig(model_dir=output_dir,
                                        save_summary_steps=500,
                                        save_checkpoints_steps=500,
                                        session_config=session_config)

    # The step counts are only needed to construct model_fn.
    train_examples = processor.get_train_examples(data_dir)
    num_train_steps = int(
        len(train_examples) * 1.0 / batch_size * num_train_epochs)
    if num_train_steps < 1:
        raise AttributeError('training data is so small...')
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)

    bert_config = modeling.BertConfig.from_json_file(bert_config_file)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=init_checkpoint,
                                learning_rate=learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    # Use the caller-supplied batch size, matching the value logged above and
    # used for the step computation (previously args.batch_size).
    params = {'batch_size': batch_size}

    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    result = estimator.predict(input_fn=predict_input_fn)
    output_predict_file = os.path.join(output_dir, "label_test.txt")

    def result_to_pair(writer):
        """Write one 'token gold predicted' line per token, per example."""
        for predict_line, prediction in zip(predict_examples, result):
            idx = 0
            line = ''
            line_token = str(predict_line.text).split(' ')
            label_token = str(predict_line.label).split(' ')
            len_seq = len(label_token)
            if len(line_token) != len(label_token):
                tf.logging.info(predict_line.text)
                tf.logging.info(predict_line.label)
                # NOTE(review): this aborts ALL remaining examples, not just
                # the malformed one — confirm that is intended.
                break
            for label_id in prediction:
                if idx >= len_seq:
                    break
                # Id 0 is skipped without advancing the token index.
                if label_id == 0:
                    continue
                curr_labels = id2label[label_id]
                if curr_labels in ['[CLS]', '[SEP]']:
                    continue
                try:
                    line += line_token[idx] + ' ' + label_token[
                        idx] + ' ' + curr_labels + '\n'
                except Exception as e:
                    tf.logging.info(e)
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                    line = ''
                    break
                idx += 1
            # A blank line separates consecutive examples.
            writer.write(line + '\n')

    with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
        result_to_pair(writer)
    from bert_base.train import conlleval
    eval_result = conlleval.return_report(output_predict_file)
    print(''.join(eval_result))
    # Append the report to the score file.
    with codecs.open(os.path.join(output_dir, 'predict_score.txt'),
                     'a',
                     encoding='utf-8') as fd:
        fd.write(''.join(eval_result))
    # Optionally filter the saved model.
    if args.filter_adam_var:
        adam_filter(output_dir)
Exemple #11
0
def Bilstm_CRF_save_model(save_path):
    """Export the NER estimator under ``<save_path>/saved_model``.

    Rebuilds the estimator from the hard-coded ``output/result_dir`` model
    directory and the ``./checkpoint1`` BERT files, then calls
    ``export_saved_model`` with a serving receiver that exposes only
    ``input_ids``.

    Args:
        save_path: Directory under which ``saved_model`` is created.
    """

    def serving_input_receiver_fn(max_seq_length=128):
        """Build the serving input: an int64 [batch, max_seq_length] tensor."""
        input_ids = tf.placeholder(dtype=tf.int64,
                                   shape=[None, max_seq_length],
                                   name='input_ids')
        # Only input_ids is exposed as both the feature and receiver tensor.
        receive_tensors = {'input_ids': input_ids}
        features = {'input_ids': input_ids}
        return tf.estimator.export.ServingInputReceiver(
            features, receive_tensors)

    from bert_base.bert import modeling
    from bert_base.train.bert_lstm_ner import NerProcessor
    from bert_base.train.bert_lstm_ner import model_fn_builder
    from bert_base.train.train_helper import get_args_parser

    # Directory holding the trained model; used by the label processor and
    # as the estimator's model_dir.
    model_dir = 'output/result_dir'

    args = get_args_parser()
    bert_config = modeling.BertConfig.from_json_file(
        './checkpoint1/bert_config.json')
    processor = NerProcessor(model_dir)
    label_list = processor.get_labels()

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)
    run_config = tf.estimator.RunConfig(model_dir=model_dir,
                                        save_summary_steps=500,
                                        save_checkpoints_steps=500,
                                        session_config=session_config)
    print('len label list is {}'.format(len(label_list)))

    # NOTE(review): the learning rate and step counts look like placeholders
    # needed only to construct model_fn for export — confirm.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint='./checkpoint1/bert_model.ckpt',
        learning_rate=2e-5,
        num_train_steps=10,
        num_warmup_steps=10,
        args=args)

    # params is a dict: keys are the parameter names used inside model_fn,
    # values are the corresponding data.
    params = {
        'batch_size': 32,
    }
    estimator = tf.estimator.Estimator(model_fn,
                                       model_dir,
                                       params=params,
                                       config=run_config)
    estimator.export_saved_model('{}/saved_model'.format(save_path),
                                 serving_input_receiver_fn)