Example #1
 def get_voc(self):
     '''
     Load the vocabulary.
     :return:
     '''
     word2id = read_dictionary(
         os.path.join('.', args.train_data, 'word2id.pkl'))
     print("word vocab size: {}".format(len(word2id)))
     return word2id
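For reference, read_dictionary in these snippets typically just unpickles the vocabulary file produced by vocab_build; a minimal sketch, assuming word2id.pkl stores a plain dict mapping characters to integer ids (the repo's actual implementation may differ):

import pickle


def read_dictionary(vocab_path):
    """Load a pickled char-to-id vocabulary (assumed to be a plain dict)."""
    with open(vocab_path, 'rb') as fr:
        word2id = pickle.load(fr)
    return word2id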
Example #2
def getDicEmbed():
    word2id = read_dictionary(os.path.join('.', args.train_data,
                                           'word2id.pkl'))
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(word2id, args.embedding_dim)
    else:
        embedding_path = 'pretrain_embedding.npy'
        embeddings = np.array(np.load(embedding_path), dtype='float32')

    return word2id, embeddings
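random_embedding is conventionally a uniform initialization with one row per vocabulary entry; a minimal sketch under that assumption (the value range and dtype are guesses at the usual defaults):

import numpy as np


def random_embedding(vocab, embedding_dim):
    """Uniformly initialize a (len(vocab), embedding_dim) float32 embedding matrix."""
    embedding_mat = np.random.uniform(-0.25, 0.25, (len(vocab), embedding_dim))
    return np.float32(embedding_mat)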
Example #3
def test101(**kwargs):
    import os
    import argparse
    from utils import str2bool
    from data import read_dictionary, tag2label

    print('test101', kwargs)

    ##
    parser = argparse.ArgumentParser(
        description='BiLSTM-CRF for Chinese NER task')
    parser.add_argument('--train_data',
                        type=str,
                        default='data_path',
                        help='train data source')
    parser.add_argument('--demo_model',
                        type=str,
                        default='1521112368',
                        help='model for test and demo')
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        help='#sample of each minibatch')
    args = parser.parse_args([])

    ##
    word2id = read_dictionary(os.path.join('.', args.train_data,
                                           'word2id.pkl'))

    client = BiLSTM_CRF_Client(args, tag2label, word2id)

    demo_sent = kwargs.get("demo_sent")
    demo_sent = list(demo_sent.strip())
    print('demo_sent', len(demo_sent))
    demo_data = [(demo_sent, ['O'] * len(demo_sent))]

    ret1 = client.demo_one(kwargs.get("server"), demo_data, verbose=False)

    print('result-1', ret1)

    from utils import get_entity

    PER, LOC, ORG = get_entity(ret1, demo_sent)
    print('PER: {}\nLOC: {}\nORG: {}'.format(PER, LOC, ORG))
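get_entity pairs the predicted tag sequence with the input characters and collects the spans per entity type. A simplified BIO-style sketch of that idea (the real utils.get_entity may differ in tag names and return shape):

def get_entity_spans(tag_seq, char_seq, entity_type='PER'):
    """Collect entity strings of one type from aligned BIO tags and characters."""
    spans, current = [], ''
    for char, tag in zip(char_seq, tag_seq):
        if tag == 'B-' + entity_type:        # a new entity starts
            if current:
                spans.append(current)
            current = char
        elif tag == 'I-' + entity_type and current:
            current += char                  # extend the current entity
        else:                                # O tag or a different entity type
            if current:
                spans.append(current)
            current = ''
    if current:
        spans.append(current)
    return spans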
Example #4
    def __init__(self, args):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = 0.2
        paths, model_path = get_paths(args)
        ckpt_file = tf.train.latest_checkpoint(model_path)

        paths['model_path'] = ckpt_file
        word2id = read_dictionary(
            os.path.join('.', args.train_data, 'word2id.pkl'))
        embeddings = random_embedding(word2id, args.embedding_dim)
        self.model = BiLSTM_CRF(args,
                                embeddings,
                                tag2label,
                                word2id,
                                paths,
                                config=config)
        self.model.build_graph()
        self.saver = tf.train.Saver()
        self.sess = tf.Session(config=config)
        self.saver.restore(self.sess, ckpt_file)
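get_paths(args) is not shown in this snippet; judging from the path-setting blocks in later examples, it assembles the summary/checkpoint/result/log locations for a saved demo model. A hedged sketch under that assumption (the directory layout and key names are guesses consistent with the call paths, model_path = get_paths(args)):

import os


def get_paths(args):
    """Assemble the paths dict and checkpoint dir for a saved demo model (assumed layout)."""
    output_path = os.path.join('.', args.train_data + '_save', args.demo_model)
    model_path = os.path.join(output_path, 'checkpoints')
    result_path = os.path.join(output_path, 'results')
    paths = {
        'summary_path': os.path.join(output_path, 'summaries'),
        'model_path': os.path.join(model_path, 'model'),
        'result_path': result_path,
        'log_path': os.path.join(result_path, 'log.txt'),
    }
    return paths, model_path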
Example #5
def ner(sent):
    config = tf.ConfigProto()
    parser = argparse.ArgumentParser(
        description='BiLSTM-CRF for Chinese NER task')
    parser.add_argument('--train_data',
                        type=str,
                        default='data_path',
                        help='train data source')
    parser.add_argument('--test_data',
                        type=str,
                        default='data_path',
                        help='test data source')
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        help='#sample of each minibatch')
    parser.add_argument('--epoch',
                        type=int,
                        default=10,
                        help='#epoch of training')
    parser.add_argument('--hidden_dim',
                        type=int,
                        default=300,
                        help='#dim of hidden state')
    parser.add_argument('--optimizer',
                        type=str,
                        default='Adam',
                        help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
    parser.add_argument('--CRF',
                        type=str2bool,
                        default=True,
                        help='use CRF at the top layer. if False, use Softmax')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='learning rate')
    parser.add_argument('--clip',
                        type=float,
                        default=5.0,
                        help='gradient clipping')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.5,
                        help='dropout keep_prob')
    parser.add_argument('--update_embedding',
                        type=str2bool,
                        default=True,
                        help='update embedding during training')
    parser.add_argument(
        '--pretrain_embedding',
        type=str,
        default='random',
        help='use pretrained char embedding or init it randomly')
    parser.add_argument('--embedding_dim',
                        type=int,
                        default=300,
                        help='random init char embedding_dim')
    parser.add_argument('--shuffle',
                        type=str2bool,
                        default=False,
                        help='shuffle training data before each epoch')
    parser.add_argument('--mode',
                        type=str,
                        default='demo',
                        help='train/test/demo')
    parser.add_argument('--demo_model',
                        type=str,
                        default='1563773712',
                        help='model for test and demo')
    args = parser.parse_args()

    ## get char embeddings
    word2id = read_dictionary(os.path.join('.', args.train_data,
                                           'word2id.pkl'))
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(word2id, args.embedding_dim)
    else:
        embedding_path = 'pretrain_embedding.npy'
        embeddings = np.array(np.load(embedding_path), dtype='float32')

    ## paths setting
    paths = {}
    paths['summary_path'] = './'
    model_path = r'C:\Users\Houking\Desktop\web_api\ner\checkpoint'
    paths['model_path'] = os.path.join(model_path, "model")
    paths['result_path'] = './'
    paths['log_path'] = './'

    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args,
                       embeddings,
                       tag2label,
                       word2id,
                       paths,
                       config=config)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, ckpt_file)
        while (1):
            print('Please input your sentence:')
            demo_sent = input()
            if demo_sent == '' or demo_sent.isspace():
                print('See you next time!')
                break
            else:
                demo_sent = list(demo_sent.strip())
                data = [(demo_sent, ['O'] * len(demo_sent))]
                tag = model.demo_one(sess, data)
                PER, SEX, TIT, REA = get_entity(tag, demo_sent)
                print('PER: {}\nSEX: {}\nTIT: {}\nREA: {}'.format(
                    PER, SEX, TIT, REA))
Example #6
parser.add_argument('--hidden_dim', type=int, default=300, help='#dim of hidden state')
parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')  # mitigate exploding gradients
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='embedding_mat.npy', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=200, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training raw_data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1586501733', help='model for test and demo')  # random_char_300, 1524919794
parser.add_argument('--embedding_dir', type=str, default='word2vector', help='embedding files dir')
args = parser.parse_args()
# get char embeddings
word2id = read_dictionary(os.path.join('.', args.data_dir, args.dictionary))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = os.path.join(os.path.curdir, args.embedding_dir, args.pretrain_embedding)
    embeddings = np.array(np.load(embedding_path), dtype='float32')

# read corpus and get training raw_data
if args.mode != 'demo':
    train_path = os.path.join('.', args.data_dir, args.train_data)
    test_path = os.path.join('.', args.data_dir, args.test_data)
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
#paths setting
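Several of these argparse setups pass type=str2bool; the helper is conventionally defined along these lines (a sketch; the repo's utils.str2bool may differ):

import argparse


def str2bool(v):
    """Parse the usual boolean spellings for argparse flags."""
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')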
Example #7
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='154138625', help='model for test and demo')
args = parser.parse_args()

def file_name(file_dir):
    for root, dirs, files in os.walk(file_dir):
        return files
		
## get char embeddings
word2id = read_dictionary('../code/word2id.pkl')
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')


## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data)
    test_path = os.path.join('.', args.test_data)
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path); test_size = len(test_data)

Example #8
if not args.use_pre_emb:
    # vocabulary build
    if not os.path.exists(
            os.path.join('data_path', args.dataset_name, 'word2id.pkl')):
        # Raw dataset: a txt file with one "char\ttag" pair per line, e.g.
        #     char1\ttag1\n
        #     char2\ttag2\n
        #     ...
        #     charN\ttagN\n
        # Note: each sentence block ("line" in the original comment) is separated from the next by a blank line.
        vocab_build(
            os.path.join('data_path', args.dataset_name, 'word2id.pkl'),
            os.path.join('data_path', args.dataset_name, train_file))

    # get word dictionary
    word2id = read_dictionary(
        os.path.join('data_path', args.dataset_name, 'word2id.pkl'))
    embeddings = random_embedding(word2id, args.embedding_dim)
    log_pre = 'not_use_pretrained_embeddings'
else:
    with open('data_path//DaGuang//dr_d_td_all.pkl', 'rb') as f:
        id2word = pickle.load(f)
        word2id = pickle.load(f)
        print('word2id length:', len(word2id))
        _ = pickle.load(f)
    embeddings_path = os.path.join('data_path', args.dataset_name,
                                   'pretrain_embedding.npy')
    if not os.path.exists(embeddings_path):
        build_character_embeddings(args.pretrained_emb_path, embeddings_path)
    embeddings = np.array(np.load(embeddings_path), dtype='float32')
    log_pre = 'use_pretrained_embeddings'
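vocab_build appears above only as a call; based on the corpus format described in the comment (one char/tag pair per line, sentences separated by a blank line), a hedged sketch of what it typically does: count characters, drop rare ones, assign ids, and pickle the mapping. The special tokens and threshold below are assumptions:

import pickle


def vocab_build(vocab_path, corpus_path, min_count=3):
    """Build a char-to-id vocabulary from a character-tagged corpus and pickle it (assumed format)."""
    counts = {}
    with open(corpus_path, encoding='utf-8') as fr:
        for line in fr:
            line = line.rstrip('\n')
            if not line:                      # blank line separates sentences
                continue
            char = line.split('\t')[0]
            counts[char] = counts.get(char, 0) + 1

    word2id = {'<PAD>': 0}                    # special tokens are assumptions
    for char, freq in counts.items():
        if freq >= min_count:
            word2id[char] = len(word2id)
    word2id['<UNK>'] = len(word2id)

    with open(vocab_path, 'wb') as fw:
        pickle.dump(word2id, fw)
    return word2id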
Example #9
parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
args = parser.parse_args()  # parse command-line arguments


## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))  # vocabulary: char -> id (low-frequency chars removed; digits, letters and other non-Chinese chars normalized)
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)  # uniformly initialize the char embeddings
else:
    embedding_path = 'pretrain_embedding.npy'  # otherwise, load pretrained embeddings
    embeddings = np.array(np.load(embedding_path), dtype='float32')


## read corpus and get training data
if args.mode != 'demo':  # load the training and test sets; each sentence is a sequence of char ids plus a tag sequence
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path); test_size = len(test_data)

Example #10
                    "Comma-separated list of hostname:port pairs")
flags.DEFINE_string(
    "worker_hosts",
    "172.16.23.5:2226,172.16.23.5:2227,172.16.23.5:2228,172.16.23.5:2229",
    "Comma-separated list of hostname:port pairs")

# flags.DEFINE_string("worker_hosts",
#                     "172.16.23.5:2223,172.16.23.5:2224,172.16.23.5:2225,172.16.23.5:2226,"
#                     "172.16.23.11:2223,172.16.23.11:2224,172.16.23.11:2225,172.16.23.11:2226",
#                     "Comma-separated list of hostname:port pairs")

flags.DEFINE_string("job_name", None, "job name: worker or ps")
FLAGS = flags.FLAGS

# get word embeddings
word2id = read_dictionary(os.path.join('./', FLAGS.word2id, 'word2id.pkl'))
if FLAGS.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, FLAGS.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

# read corpus and get training data
if FLAGS.mode != 'demo':
    train_path = os.path.join('.', FLAGS.train_data_path, 'train_data')
    train_data_len = get_train_data_len(train_path)
#     test_path = os.path.join('.', FLAGS.test_data_path, 'test_data')
#     train_data = read_corpus(train_path)
#     test_data = read_corpus(test_path)

# path setting
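With ps_hosts/worker_hosts/job_name flags like the ones above, a TF 1.x distributed job usually turns them into a cluster spec and a server roughly as follows (a sketch that continues the snippet; a task_index flag is assumed to exist):

import tensorflow as tf

# hypothetical continuation of the flag setup above; FLAGS.task_index is an assumed flag
ps_hosts = FLAGS.ps_hosts.split(',')
worker_hosts = FLAGS.worker_hosts.split(',')
cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
server = tf.train.Server(cluster,
                         job_name=FLAGS.job_name,
                         task_index=FLAGS.task_index)
if FLAGS.job_name == 'ps':
    server.join()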
Example #11
args = {
    'batch_size': 128,
    'epoch': 20,
    'hidden_dim': 300,
    'optimizer': 'Adam',
    'CRF': True,
    'lr': 0.001,
    'clip': 5.0,
    'dropout': 0.8,
    'update_embedding': True,
    'shuffle': True
}

## get char embeddings
#word2id = read_dictionary(os.path.join(os.environ['DMPPATH'],'gz_case_address/data_path/word2id.pkl'))
word2id = read_dictionary("./gz_case_address/data_path/word2id.pkl")
embeddings = random_embedding(word2id, 300)

## paths setting
#output_path = os.path.join(os.environ['DMPPATH'],'dmp/gongan/gz_case_address/mode_save')
output_path = os.path.join("./gz_case_address/mode_save")
# output_path = ('./mode_save')

if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints")
if not os.path.exists(model_path):
    os.makedirs(model_path)
Example #12
                    type=str,
                    default='demo',
                    help='train/test/demo/text')
# parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
parser.add_argument('--demo_model',
                    type=str,
                    default='1550144205',
                    help='model for test and demo')
parser.add_argument('--text_file',
                    type=str,
                    default='my.txt',
                    help='text file for demo')
args = parser.parse_args()

## get char embeddings
word2id = read_dictionary('./data_path/word2id.pkl')
embeddings = random_embedding(word2id, 300)
output_path = './data_path_save/1577156952'
model_path = os.path.join(output_path, "checkpoints/")
ckpt_prefix = os.path.join(model_path, "model")
ckpt_file = tf.train.latest_checkpoint(model_path)

## paths setting
paths = {}
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
Example #13
                    default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode',
                    type=str,
                    default='train',
                    help='train/test/demo')
parser.add_argument('--demo_model',
                    type=str,
                    default='1536659706',
                    help='model for test and demo')
args = parser.parse_args()  # set command-line arguments in PyCharm via Run - Edit Configurations - Script Parameters

## get char embeddings

word2id = read_dictionary(
    r"D:\data\100dim\word2id_100.pkl")  #data.py会生成pkl文件,是词和词向量的对应关系
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(
        word2id,
        args.embedding_dim)  # randomly generate a len(word2id) x args.embedding_dim array
else:
    embedding_path = 'D:\\data\\100dim\\np_100.npy'  # binary file of the pretrained array on disk
    embeddings = np.array(np.load(embedding_path), dtype='float32')  # load the word vectors into memory

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train1.txt')  # build the training set file path
    test_path = os.path.join('.', args.test_data, 'test1.txt')  # build the test set file path
    train_data = read_corpus(train_path)  # read the training set with a user-defined helper; returns a list
    test_data = read_corpus(test_path)
    test_size = len(test_data)
Example #14
def infer():
    # Load arguments
    args = parse_args()
    # args.batch_size = 1

    word2id = data.read_dictionary("data/pre_trained_word2id.pkl")
    embeddings = np.load("data/pre_trained_embeddings.npy")
    # word2id = data.read_dictionary("data/pre_trained_copy_mini_word2id.pkl")
    # embeddings = np.load("data/pre_trained_copy_mini_embeddings.npy")

    # word2id_output_mini = {}
    # for i, k in enumerate(word2id):
    #     word2id_output_mini[k] = i
    #     if i > 9100:
    #         break
    # word2id_output_mini["<S>"] = 1
    # word2id_output_mini["<E>"] = 2
    # word2id = word2id_output_mini

    word2id_output = word2id.copy()
    word_ori_size = len(word2id)
    # word_mini_size = len(word2id_output)
    # word_size = word_ori_size
    # word_size = word_mini_size

    word_size = 0
    tag_size = 0
    for k in tag2label:
        if tag2label[k] > tag_size:
            tag_size = tag2label[k]
        tag2label[k] += args.max_length
        if tag2label[k] > word_size:
            word_size = tag2label[k]
    # word2id_output.update(tag2label)
    word2id_output = tag2label
    word2id_output["<S>"] = word_size + 1
    word2id_output["<E>"] = word_size + 2
    word_size += 3
    tag_size += 3
    print("output size", word_size, tag_size)

    # # Dictrionaries init
    # word2id = data.read_dictionary("data/pre_trained_word2id.pkl")
    # embeddings = np.load("data/pre_trained_embeddings.npy")
    # word2id_output = word2id.copy()
    # word_mini_size = len(word2id)
    # word_size = 0
    # for k in tag2label:
    #     tag2label[k] += word_mini_size
    #     if tag2label[k] > word_size:
    #         word_size = tag2label[k]
    # tag2label["<S>"] = word_size + 1
    # tag2label["<E>"] = word_size + 2
    # word_size += 3
    # word2id_output.update(tag2label)
    # # print(type(word2id), len(word2id))
    # # print(type(entity2id), len(entity2id))
    # # print(type(pos2id), len(pos2id))
    # # print(type(word2id_output), len(word2id_output))
    id2entity = {}
    for k in entity2id:
        id2entity[entity2id[k]] = k
    id2word = {}
    for k in word2id:
        id2word[word2id[k]] = k
    id2word_output = {}
    for k in word2id_output:
        id2word_output[word2id_output[k]] = k
    src_dict, trg_dict = id2word, id2word_output

    # Load data
    # data_train = data_load("data/train_pos.txt",
    #         data=data, word2id=word2id, entity2id=entity2id,
    #         pos2id=pos2id, word2id_output=word2id_output,
    #         event_args=event_args)
    data_train = data_load("data/ace_data/train.txt",
                           data=data,
                           word2id=word2id,
                           entity2id=entity2id,
                           pos2id=pos2id,
                           word2id_output=word2id_output,
                           event_args=event_args,
                           generate=True)
    data_dev = data_load("data/ace_data/dev.txt",
                         data=data,
                         word2id=word2id,
                         entity2id=entity2id,
                         pos2id=pos2id,
                         word2id_output=word2id_output,
                         event_args=event_args,
                         generate=True)
    data_test = data_load("data/ace_data/test.txt",
                          data=data,
                          word2id=word2id,
                          entity2id=entity2id,
                          pos2id=pos2id,
                          word2id_output=word2id_output,
                          event_args=event_args,
                          generate=True)
    # data_test = data_train

    print("=====Init scores")
    scores = generate_pr(word_dict=id2word_output)
    scores.append_label(data_test)

    # Inference
    net = model.net(
        args.embedding_dim,
        args.encoder_size,
        args.decoder_size,
        word_ori_size,
        word_size,
        tag_size,
        True,
        # False,
        beam_size=args.beam_size,
        max_length=args.max_length,
        source_entity_dim=len(entity2id),
        source_pos_dim=len(pos2id),
        embedding_entity_dim=args.embedding_entity_dim,
        embedding_pos_dim=args.embedding_pos_dim,
        end_id=word2id_output["<E>"])

    # test_batch_generator = paddle.batch(
    #     paddle.reader.shuffle(
    #         paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
    #     batch_size=args.batch_size,
    #     drop_last=False)

    dev_batch_generator = paddle.batch(paddle.reader.buffered(data_dev,
                                                              size=1000),
                                       batch_size=args.batch_size,
                                       drop_last=False)
    test_batch_generator = paddle.batch(paddle.reader.buffered(data_test,
                                                               size=1000),
                                        batch_size=args.batch_size,
                                        drop_last=False)

    print("begin memory optimization ...")
    # fluid.memory_optimize(train_program)
    fluid.memory_optimize(framework.default_main_program())
    print("end memory optimization ...")

    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())
    # # exe = fluid.ParallelExecutor(use_cuda=args.use_gpu)
    # os.environ['CPU_NUM'] = "2"
    # exe = fluid.parallel_executor.ParallelExecutor(
    #         use_cuda=args.use_gpu, num_trainers=2,
    #         # loss_name=avg_cost.name,
    #         main_program=fluid.default_main_program())

    # LOAD Model
    model_path = os.path.join(args.save_dir, str(args.load_pass_num))
    fluid.io.load_persistables(executor=exe,
                               dirname=model_path,
                               main_program=framework.default_main_program())
    print("==Model loaded", args.save_dir)

    translation_ids = net.translation_ids
    translation_scores = net.translation_scores
    feed_order = net.feeding_list

    feed_list = [
        framework.default_main_program().global_block().var(var_name)
        for var_name in feed_order
    ]
    # print(feed_list)
    feeder = fluid.DataFeeder(feed_list, place)
    scores.reset()
    for batch_id, _data in enumerate(test_batch_generator()):
        print("=====", batch_id, len(_data))
        # The value of batch_size may vary in the last batch
        batch_size = len(_data)

        # Setup initial ids and scores lod tensor
        # init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
        init_ids_data = np.array(
            [word2id_output["<S>"] for _ in range(batch_size)], dtype='int64')
        init_scores_data = np.array([1. for _ in range(batch_size)],
                                    dtype='float32')
        init_ids_data = init_ids_data.reshape((batch_size, 1))
        init_scores_data = init_scores_data.reshape((batch_size, 1))
        init_recursive_seq_lens = [1] * batch_size
        init_recursive_seq_lens = [
            init_recursive_seq_lens, init_recursive_seq_lens
        ]
        init_ids = fluid.create_lod_tensor(init_ids_data,
                                           init_recursive_seq_lens, place)
        init_scores = fluid.create_lod_tensor(init_scores_data,
                                              init_recursive_seq_lens, place)
        # print(init_ids_data.shape)
        # print(init_recursive_seq_lens)
        # print(init_ids.lod())
        # print(init_scores.lod())

        # Feed dict for inference
        feed_dict = feeder.feed([x for x in _data])
        feed_dict['init_ids'] = init_ids
        feed_dict['init_scores'] = init_scores

        print("=====")
        fetch_outs = exe.run(
            framework.default_main_program(),
            feed=feed_dict,
            fetch_list=[translation_ids, translation_scores],
            # fetch_list=[translation_ids],
            return_numpy=False)
        # print(np.array(fetch_outs[0]))
        # print(np.array(fetch_outs[0]).shape)
        print("=====Update scores")
        scores.update(preds=fetch_outs[0],
                      labels=[_[-1] for _ in _data],
                      words_list=[_[0] for _ in _data],
                      for_generate=True)
        # Split the output words by lod levels
        end_id = word2id_output["<E>"]
        result = []
        paragraphs = []
        for ids in np.array(fetch_outs[0]):
            # print("##", ids.shape)
            # print("##", ids)
            new_ids = []
            new_words = []
            pre_id = -1
            for _id in ids:
                if _id == end_id or \
                        _id == pre_id:
                    break
                pre_id = _id
                new_ids.append(_id)
                if _id < args.max_length:
                    new_words.append(str(_id))
                else:
                    new_words.append(trg_dict[_id])
            result.append(new_ids)
            paragraphs.append(new_words)

        # lod_level_1 = fetch_outs[0].lod()[1]
        # token_array = np.array(fetch_outs[0])
        # result = []
        # for i in six.moves.xrange(len(lod_level_1) - 1):
        #     sentence_list = [
        #         trg_dict[token]
        #         for token in token_array[lod_level_1[i]:lod_level_1[i + 1]]
        #     ]
        #     sentence = " ".join(sentence_list[1:-1])
        #     result.append(sentence)
        # lod_level_0 = fetch_outs[0].lod()[0]
        # paragraphs = [
        #     result[lod_level_0[i]:lod_level_0[i + 1]]
        #     for i in six.moves.xrange(len(lod_level_0) - 1)
        # ]

        # target_sentence_list = [" ".join(
        #         [trg_dict[__]
        #         for __ in _[-1]])
        #         for _ in _data]
        target_sentence_list = []
        for item in _data:
            target_words = []
            for _id in item[-1]:
                if _id < args.max_length:
                    target_words.append(str(_id))
                else:
                    target_words.append(trg_dict[_id])
            target_sentence_list.append(" ".join(target_words))
        source_sentence_list = []
        source_entity_list = []
        for item in _data:
            target_words = []
            for _id in item[0]:
                target_words.append(src_dict[_id])
            source_sentence_list.append(target_words)
            entity_tag = []
            for _id in item[1]:
                entity_tag.append(id2entity[_id])
            source_entity_list.append(entity_tag)

        print("=====Print text")
        for paragraph, sentence, source, entities in \
                zip(paragraphs, target_sentence_list, \
                source_sentence_list, source_entity_list):
            print("-----")
            new_words = []
            indexes = range(len(source))
            for i, word, entity in zip(indexes, source, entities):
                new_words.append(word + "(" + str(i) + " " + entity + ")")
            print(" ".join(new_words))
            print("=Predict:", " ".join(paragraph[1:]))
            print("=Label:", sentence)

    scores.eval_show()
Example #15
                    default=512,
                    help='random init char embedding_dim')
parser.add_argument('--shuffle',
                    type=str2bool,
                    default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model',
                    type=str,
                    default='1521112368',
                    help='model for test and demo')
args = parser.parse_args()

## get char embeddings
# word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
word2id = read_dictionary('word.pkl')
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
    # embeddings = BERTEmbedding("chinese", sequence_length=50, task=kashgari.LABELING)
    print('embeddings')
    print(len(embeddings))
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    # train_path = os.path.join('.', args.train_data, 'train_data')
    # test_path = os.path.join('.', args.test_data, 'test_data')
    train_path = 'sample3.data'
    test_path = 'train.data'
Example #16
    transition_params_proto = result.outputs['transition_params']
    # transition_params_shape = [transition_params_proto.tensor_shape.dim[i].size
    #                            for i in range(len(transition_params_proto.tensor_shape.dim))]
    # transition_params = numpy.array(transition_params_proto.float_val).reshape(transition_params_shape)
    transition_params = tf.contrib.util.make_ndarray(transition_params_proto)

    label_list = []
    for logit, seq_len in zip(logits, seq_len_list):
        viterbi, viterbi_score = viterbi_decode(logit[:seq_len],
                                                transition_params)
        label_list.append(viterbi)
    return label_list, seq_len_list


word2id = read_dictionary(os.path.join('.', 'data_path', 'word2id.pkl'))


def main(test_sent):
    start_time = time.time()
    channel = implementations.insecure_channel('192.168.1.210', 5075)
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

    test_sent = list(test_sent.strip())
    test_data = [(test_sent, ['O'] * len(test_sent))]
    label_list = []
    for seqs, labels in batch_yield(test_data,
                                    batch_size=64,
                                    vocab=word2id,
                                    tag2label=tag2label,
                                    shuffle=False):
Example #17
                    help='shuffle training data before each epoch')
parser.add_argument('--mode',
                    type=str,
                    default='train',
                    help='train/test/demo')
parser.add_argument('--demo_model',
                    type=str,
                    default='1521112368',
                    help='model for test and demo')
args = parser.parse_args()

# Creating .pkl file
vocab_build(Path + '\\word2id.pkl', Path + '\\vocab.txt', 3)

# get char embeddings
word2id = read_dictionary(Path + '\\word2id.pkl')
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = 'D:\\resource\\general_hypernym_extraction\\data\\train.txt'
    test_path = 'D:\\resource\\general_hypernym_extraction\\data\\valid.txt'
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)

## paths setting
Example #18
def train(data=data):
    # Load arguments
    args = parse_args()
    options = vars(args)
    print(json.dumps(options, ensure_ascii=False, indent=4))

    # if not conf.pre_train_word_embedding:
    #     word2id = data.read_dictionary("train_data/word2id.pkl")
    #     embeddings = data.random_embedding(word2id, conf.embedding_dim)
    # else:

    # Dictrionaries init
    word2id = data.read_dictionary("train_data/pre_trained_word2id.pkl")
    # embeddings = np.load("train_data/pre_trained_embeddings.npy")
    # word2id = data.read_dictionary("train_data/pre_trained_mini_word2id.pkl")
    # embeddings = np.load("train_data/pre_trained_mini_embeddings.npy")
    # word2id = data.read_dictionary("train_data/pre_trained_copy_mini_word2id.pkl")
    # embeddings = np.load("train_data/pre_trained_copy_mini_embeddings.npy")

    # word2id_output_mini = {}
    # for i, k in enumerate(word2id):
    #     word2id_output_mini[k] = i
    #     if i > 9100:
    #         break
    # word2id_output_mini["<S>"] = 1
    # word2id_output_mini["<E>"] = 2
    # word2id = word2id_output_mini

    word2id_output = word2id.copy()
    word_ori_size = len(word2id)
    # word_mini_size = len(word2id_output)
    # word_size = word_ori_size
    # word_size = word_mini_size

    word_size = 0
    tag_size = 0
    for k in tag2label:
        if tag2label[k] > tag_size:
            tag_size = tag2label[k]
        tag2label[k] += args.max_length
        if tag2label[k] > word_size:
            word_size = tag2label[k]
    # word2id_output.update(tag2label)
    word2id_output = tag2label
    word2id_output["<S>"] = word_size + 1
    word2id_output["<E>"] = word_size + 2
    word_size += 3
    tag_size += 3
    print("output size", word_size, tag_size)

    # print(type(word2id), len(word2id))
    # print(type(entity2id), len(entity2id))
    # print(type(pos2id), len(pos2id))
    # print(type(word2id_output), len(word2id_output))

    # Load data
    data_train = data_load("train_data/ace_data/train.txt",
                           data=data,
                           word2id=word2id,
                           entity2id=entity2id,
                           pos2id=pos2id,
                           word2id_output=word2id_output,
                           event_args=event_args)
    data_dev = data_load("train_data/ace_data/dev.txt",
                         data=data,
                         word2id=word2id,
                         entity2id=entity2id,
                         pos2id=pos2id,
                         word2id_output=word2id_output,
                         event_args=event_args,
                         generate=True)
    data_test = data_load("train_data/ace_data/test.txt",
                          data=data,
                          word2id=word2id,
                          entity2id=entity2id,
                          pos2id=pos2id,
                          word2id_output=word2id_output,
                          event_args=event_args,
                          generate=True)

    if args.enable_ce:
        framework.default_startup_program().random_seed = 111

    # # Training process
    # net = model.net(
    #     args.embedding_dim,
    #     args.encoder_size,
    #     args.decoder_size,
    #     word_ori_size,
    #     word_size,
    #     tag_size,
    #     False,
    #     beam_size=args.beam_size,
    #     max_length=args.max_length,
    #     source_entity_dim=len(entity2id),
    #     source_pos_dim=len(pos2id),
    #     embedding_entity_dim=args.embedding_entity_dim,
    #     embedding_pos_dim=args.embedding_pos_dim,
    #     end_id=word2id_output["<E>"])
    # avg_cost = net.avg_cost
    # feed_order = net.feeding_list
    # # Test net
    # net_test = model.net(
    #     args.embedding_dim,
    #     args.encoder_size,
    #     args.decoder_size,
    #     word_mini_size,
    #     word_size,
    #     True,
    #     beam_size=args.beam_size,
    #     max_length=args.max_length,
    #     source_entity_dim=len(entity2id),
    #     source_pos_dim=len(pos2id),
    #     embedding_entity_dim=args.embedding_entity_dim,
    #     embedding_pos_dim=args.embedding_pos_dim,
    #     end_id=word2id_output["<E>"])

    # # # clone from default main program and use it as the validation program
    # main_program = fluid.default_main_program()
    # inference_program = fluid.default_main_program().clone(for_test=True)

    # optimizer = fluid.optimizer.Adam(
    #     learning_rate=args.learning_rate,
    #     regularization=fluid.regularizer.L2DecayRegularizer(
    #         regularization_coeff=1e-5))

    # optimizer.minimize(avg_cost, no_grad_set=net.no_grad_set)

    # print("begin memory optimization ...")
    # # fluid.memory_optimize(train_program)
    # fluid.memory_optimize(main_program)
    # print("end memory optimization ...")

    # loss = avg_cost
    train_program = fluid.Program()
    train_startup = fluid.Program()
    # if "CE_MODE_X" in os.environ:
    #     train_program.random_seed = 110
    #     train_startup.random_seed = 110
    with fluid.program_guard(train_program, train_startup):
        with fluid.unique_name.guard():
            # Training process
            net = model.net(args.embedding_dim,
                            args.encoder_size,
                            args.decoder_size,
                            word_ori_size,
                            word_size,
                            tag_size,
                            False,
                            beam_size=args.beam_size,
                            max_length=args.max_length,
                            source_entity_dim=len(entity2id),
                            source_pos_dim=len(pos2id),
                            embedding_entity_dim=args.embedding_entity_dim,
                            embedding_pos_dim=args.embedding_pos_dim,
                            end_id=word2id_output["<E>"])
            loss = net.avg_cost
            feed_order = net.feeding_list
            # gradient clipping
            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByValue(max=1.0, min=-1.0))

            optimizer = fluid.optimizer.Adam(
                learning_rate=args.learning_rate,
                regularization=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-5))
            # optimizer = fluid.optimizer.Adam(
            #     learning_rate=fluid.layers.exponential_decay(
            #         learning_rate=args.learning_rate,
            #         decay_steps=400,
            #         decay_rate=0.9,
            #         staircase=True))
            optimizer.minimize(loss)
            avg_cost = loss
            # print("begin memory optimization ...")
            # fluid.memory_optimize(train_program)
            # print("end memory optimization ...")

    test_program = fluid.Program()
    test_startup = fluid.Program()
    # if "CE_MODE_X" in os.environ:
    #     test_program.random_seed = 110
    #     test_startup.random_seed = 110
    with fluid.program_guard(test_program, test_startup):
        with fluid.unique_name.guard():
            # Test net
            net_test = model.net(
                args.embedding_dim,
                args.encoder_size,
                args.decoder_size,
                word_ori_size,
                word_size,
                tag_size,
                True,
                beam_size=args.beam_size,
                max_length=args.max_length,
                source_entity_dim=len(entity2id),
                source_pos_dim=len(pos2id),
                embedding_entity_dim=args.embedding_entity_dim,
                embedding_pos_dim=args.embedding_pos_dim,
                end_id=word2id_output["<E>"])

    test_program = test_program.clone(for_test=True)
    main_program = train_program
    inference_program = test_program

    # print(type(paddle.dataset.wmt14.train(args.dict_size)))
    # print(type(paddle.reader.shuffle(
    #             data_train, buf_size=1000)))
    # print(args.enable_ce)
    # for batch_id, data in enumerate(paddle.batch(
    #         paddle.reader.shuffle(
    #             paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
    #         batch_size=args.batch_size,
    #         drop_last=False)()):
    #     print(data)
    #     break

    # Disable shuffle for Continuous Evaluation only
    if not args.enable_ce:
        train_batch_generator = paddle.batch(paddle.reader.shuffle(
            data_train, buf_size=1000),
                                             batch_size=args.batch_size,
                                             drop_last=False)
    else:
        train_batch_generator = paddle.batch(data_train,
                                             batch_size=args.batch_size,
                                             drop_last=False)
    dev_batch_generator = paddle.batch(paddle.reader.buffered(data_dev,
                                                              size=1000),
                                       batch_size=args.batch_size,
                                       drop_last=False)
    test_batch_generator = paddle.batch(paddle.reader.buffered(data_test,
                                                               size=1000),
                                        batch_size=args.batch_size,
                                        drop_last=False)
    # print (type(train_batch_generator))

    # Init model
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    print("device count %d" % dev_count)
    # print("theoretical memory usage: ")
    # print(fluid.contrib.memory_usage(
    #         program=main_program, batch_size=args.batch_size))

    # print("=====Init Main program")
    # exe = Executor(place)
    # # Init para
    # exe.run(framework.default_startup_program())
    # # exe = fluid.ParallelExecutor(use_cuda=args.use_gpu)
    # # os.environ['CPU_NUM'] = "2"
    # # exe = fluid.parallel_executor.ParallelExecutor(
    # #         use_cuda=args.use_gpu, num_trainers=2,
    # #         loss_name=avg_cost.name,
    # #         main_program=fluid.default_main_program())

    exe = fluid.Executor(place)
    print("=====Init train program")
    exe.run(train_startup)
    print("=====Init test program")
    exe.run(test_startup)

    # print("=====Init train exe")
    # train_exe = fluid.ParallelExecutor(
    #     use_cuda=args.use_gpu, loss_name=loss.name, main_program=train_program)

    # print("=====Init test exe")
    # test_exe = fluid.ParallelExecutor(
    #     use_cuda=args.use_gpu,
    #     main_program=test_program,
    #     share_vars_from=train_exe)

    ## Set word emb
    #print("=====Set word embedding")
    #embeddings = embeddings.astype("float32")
    #word_emb_param = fluid.global_scope().find_var(
    #    "emb").get_tensor()
    #word_emb_param.set(embeddings, place)

    print("=====Init Feeder")
    feed_list = [
        main_program.global_block().var(var_name) for var_name in feed_order
    ]
    feed_list_test = [
        inference_program.global_block().var(var_name)
        for var_name in net_test.feeding_list
    ]
    # print(feed_list)
    feeder = fluid.DataFeeder(feed_list, place)
    feeder_test = fluid.DataFeeder(feed_list_test, place)

    # return

    def validation(generater, test_scores):
        # Use test set as validation each pass
        test_scores.reset()
        total_loss = 0.0
        count = 0
        # val_feed_list = [
        #     inference_program.global_block().var(var_name)
        #     for var_name in net_test.feeding_list
        # ]
        # val_feeder = fluid.DataFeeder(val_feed_list, place)

        for batch_id, data in enumerate(generater()):
            # The value of batch_size may vary in the last batch
            batch_size = len(data)

            # Setup initial ids and scores lod tensor
            init_ids_data = np.array(
                [word2id_output["<S>"] for _ in range(batch_size)],
                dtype='int64')
            init_scores_data = np.array([1. for _ in range(batch_size)],
                                        dtype='float32')
            init_ids_data = init_ids_data.reshape((batch_size, 1))
            init_scores_data = init_scores_data.reshape((batch_size, 1))
            init_recursive_seq_lens = [1] * batch_size
            init_recursive_seq_lens = [
                init_recursive_seq_lens, init_recursive_seq_lens
            ]
            init_ids = fluid.create_lod_tensor(init_ids_data,
                                               init_recursive_seq_lens, place)
            init_scores = fluid.create_lod_tensor(init_scores_data,
                                                  init_recursive_seq_lens,
                                                  place)

            # Feed dict for inference
            # feed_dict = feeder.feed([[x[0]] for x in data])
            feed_dict = feeder_test.feed(data)
            feed_dict['init_ids'] = init_ids
            feed_dict['init_scores'] = init_scores

            val_fetch_outs = exe.run(
                inference_program,
                # test_program(),
                feed=feed_dict,
                fetch_list=[net_test.translation_ids],
                return_numpy=False)
            # test_scores.update(
            #         preds=val_fetch_outs[0],
            #         labels=[_[-1] for _ in data])
            # print("=====Update scores")
            test_scores.update(preds=val_fetch_outs[0],
                               labels=[_[-1] for _ in data],
                               words_list=[_[0] for _ in data],
                               for_generate=True)

            # val_fetch_outs = exe.run(inference_program,
            #                          feed=val_feeder.feed(data),
            #                          fetch_list=[avg_cost, net.label],
            #                          return_numpy=False)
            # test_scores.update(
            #         preds=val_fetch_outs[1],
            #         labels=[_[-1] for _ in data],
            #         words_list=[_[0] for _ in data])

            total_loss += 1.0
            count += 1
        # if batch_id > 0:
        #     break
        values = test_scores.eval()
        test_scores.eval_show()

        return total_loss / count, values

    print("=====Init scores")
    id2word_output = {}
    for k in word2id_output:
        id2word_output[word2id_output[k]] = k
    scores_train = generate_pr(word_dict=id2word_output)
    scores_train.append_label(data_train)
    scores_test = generate_pr(word_dict=id2word_output)
    scores_test.append_label(data_test)
    scores_dev = generate_pr(word_dict=id2word_output)
    scores_dev.append_label(data_dev)
    max_tri_f1 = 0.0
    max_tri_pass = -1.0
    max_arg_f1 = 0.0
    max_arg_pass = -1.0
    print("=====Start training")
    for pass_id in range(1, args.pass_num + 1):
        scores_train.reset()
        pass_start_time = time.time()
        words_seen = 0
        for batch_id, _data in enumerate(train_batch_generator()):
            batch_size = len(_data)
            words_seen += len(_data) * 2
            # print(_data)
            # print(len(_data))
            # print(sum([len(_[0]) for _ in _data]))

            # # Setup initial ids and scores lod tensor
            # init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
            # init_scores_data = np.array(
            # [1. for _ in range(batch_size)], dtype='float32')
            # init_ids_data = init_ids_data.reshape((batch_size, 1))
            # init_scores_data = init_scores_data.reshape((batch_size, 1))
            # init_recursive_seq_lens = [1] * batch_size
            # init_recursive_seq_lens = [
            # init_recursive_seq_lens, init_recursive_seq_lens
            # ]
            # init_ids = fluid.create_lod_tensor(init_ids_data,
            # init_recursive_seq_lens, place)
            # init_scores = fluid.create_lod_tensor(init_scores_data,
            # init_recursive_seq_lens, place)

            # # Feed dict for inference
            # # feed_dict = feeder.feed([[x[0]] for x in _data])
            # feed_dict = feeder.feed(_data)
            # feed_dict['init_ids'] = init_ids
            # feed_dict['init_scores'] = init_scores

            # avg_cost_train, preds = exe.run(
            # framework.default_main_program(),
            # # test_program(),
            # feed=feed_dict,
            # fetch_list=[avg_cost, net.predict],
            # return_numpy=False)

            avg_cost_train, preds = exe.run(
                main_program,
                # train_program(),
                feed=feeder.feed(_data),
                fetch_list=[avg_cost, net.label],
                return_numpy=False)
            # print(np.array(labels).shape)
            # print(np.array(preds).tolist())
            # print([_[-1] for _ in _data])
            #print([_[0] for _ in _data])
            avg_cost_train = np.array(avg_cost_train)
            if batch_id % 10 == 0:
                print('pass_id=%d, batch_id=%d, train_loss: %f' %
                      (pass_id, batch_id, avg_cost_train))

            scores_train.update(preds=preds,
                                labels=[_[-1] for _ in _data],
                                words_list=[_[0] for _ in _data])
        # This is for continuous evaluation only
        # if args.enable_ce and batch_id >= 100:
        # if batch_id > 0:
        #     break
        scores_train.eval_show()

        pass_end_time = time.time()
        new_max_dev = False
Example #19
parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
args = parser.parse_args()


## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')


## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path); test_size = len(test_data)

Example #20
                    default=300,
                    help='random init char embedding_dim')
parser.add_argument('--shuffle',
                    type=str2bool,
                    default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model',
                    type=str,
                    default='1521112368',
                    help='model for test and demo')
args = parser.parse_args()

## load char vocabulary list
vocab_path = os.path.join('.', args.train_data, 'word2id.pkl')
word2id = read_dictionary(vocab_path)

# get char embeddings
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode == 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
Example #21
## Session configuration: entry point for training and testing the network; the main routine here controls training and model saving, as well as testing (including loading the saved model)
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # default: 0
config = tf.ConfigProto()



## hyperparameters
embedding_dim = 128

tag2label = {"N": 0,
             "解剖部位": 1, "手术": 2,
             "药物": 3, "独立症状": 4,
             "症状描述": 5}
## get char embeddings
word2id = read_dictionary('./vocab.pkl')
embeddings = random_embedding(word2id, embedding_dim)

train_data = read_corpus('./c.txt')


# embeddings, tag2label, vocab, batch_size, epoch, hidden_dim, CRF, update_embedding, shuffle
## training model
if __name__ == '__main__':
    model = BiLSTM_CRF(embeddings, tag2label, word2id, 4, 80, 128, False, True, True)
    model.build_graph()
    test_report = open('test_report.txt', 'w', encoding='utf-8')

    print("train data: {}".format(len(train_data)))
    model.test(test_report)
    # model.train(train=train_data)  # use test_data as the dev_data to see overfitting phenomena
Example #22
import numpy as np
import pandas as pd
import string
import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

from data import read_dictionary, build_char_dict, prepare_data


# Each line is formatted as: "word {gender}"
word_list = read_dictionary('./wiktionary_nouns_with_gender.txt')
char2idx, idx2char = build_char_dict()

vocab_size = len(char2idx)

embedding_weights = []
embedding_weights.append(np.zeros(vocab_size))

for char, idx in char2idx.items():
    onehot = np.zeros(vocab_size)
    onehot[idx-1] = 1
    embedding_weights.append(onehot)

embedding_weights = np.array(embedding_weights)
embedding_size = embedding_weights.shape[1]
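A hedged sketch of how these one-hot character weights might be wired into the Keras layers imported above; max_word_len and the three-class output (e.g. der/die/das) are assumptions, not taken from the original snippet:

max_word_len = 20  # hypothetical maximum word length in characters

inputs = Input(shape=(max_word_len,))
x = Embedding(input_dim=vocab_size + 1,        # +1 for the all-zero padding row 0
              output_dim=embedding_size,
              weights=[embedding_weights],
              input_length=max_word_len,
              trainable=False)(inputs)
x = Conv1D(64, 3, activation='relu')(x)
x = MaxPooling1D(2)(x)
x = Flatten()(x)
x = Dropout(0.5)(x)
outputs = Dense(3, activation='softmax')(x)    # assumed: one class per grammatical gender
model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])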
Beispiel #23
0
    def get_model(self):
        config = tf.ConfigProto()

        parser = argparse.ArgumentParser(
            description='BiLSTM-CRF for Chinese NER task')
        parser.add_argument('--train_data',
                            type=str,
                            default='data_path',
                            help='train data source')
        parser.add_argument('--test_data',
                            type=str,
                            default='data_path',
                            help='test data source')
        parser.add_argument('--batch_size',
                            type=int,
                            default=64,
                            help='#sample of each minibatch')
        parser.add_argument('--epoch',
                            type=int,
                            default=40,
                            help='#epoch of training')
        parser.add_argument('--hidden_dim',
                            type=int,
                            default=300,
                            help='#dim of hidden state')
        parser.add_argument('--optimizer',
                            type=str,
                            default='Adam',
                            help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
        parser.add_argument(
            '--CRF',
            type=str2bool,
            default=True,
            help='use CRF at the top layer. if False, use Softmax')
        parser.add_argument('--lr',
                            type=float,
                            default=0.001,
                            help='learning rate')
        parser.add_argument('--clip',
                            type=float,
                            default=5.0,
                            help='gradient clipping')
        parser.add_argument('--dropout',
                            type=float,
                            default=0.5,
                            help='dropout keep_prob')
        parser.add_argument('--update_embedding',
                            type=str2bool,
                            default=True,
                            help='update embedding during training')
        parser.add_argument(
            '--pretrain_embedding',
            type=str,
            default='random',
            help='use pretrained char embedding or init it randomly')
        parser.add_argument('--embedding_dim',
                            type=int,
                            default=300,
                            help='random init char embedding_dim')
        parser.add_argument('--shuffle',
                            type=str2bool,
                            default=True,
                            help='shuffle training data before each epoch')
        parser.add_argument('--mode',
                            type=str,
                            default='demo',
                            help='train/test/demo')
        parser.add_argument('--demo_model',
                            type=str,
                            default='1521112368',
                            help='model for test and demo')
        args = parser.parse_args()

        ## get char embeddings
        word2id = read_dictionary(
            os.path.join('.', args.train_data, 'word2id.pkl'))
        if args.pretrain_embedding == 'random':
            embeddings = random_embedding(word2id, args.embedding_dim)
        else:
            embedding_path = 'pretrain_embedding.npy'
            embeddings = np.array(np.load(embedding_path), dtype='float32')

        paths = {}
        timestamp = str(int(
            time.time())) if args.mode == 'train' else args.demo_model
        output_path = os.path.join('.', args.train_data + "_save", timestamp)
        if not os.path.exists(output_path): os.makedirs(output_path)
        summary_path = os.path.join(output_path, "summaries")
        paths['summary_path'] = summary_path
        if not os.path.exists(summary_path): os.makedirs(summary_path)
        model_path = os.path.join(output_path, "checkpoints/")
        if not os.path.exists(model_path): os.makedirs(model_path)
        ckpt_prefix = os.path.join(model_path, "model")
        paths['model_path'] = ckpt_prefix
        result_path = os.path.join(output_path, "results")
        paths['result_path'] = result_path
        if not os.path.exists(result_path): os.makedirs(result_path)
        log_path = os.path.join(result_path, "log.txt")
        paths['log_path'] = log_path
        get_logger(log_path).info(str(args))

        ckpt_file = tf.train.latest_checkpoint(model_path)
        print(ckpt_file)
        paths['model_path'] = ckpt_file
        model = BiLSTM_CRF(args,
                           embeddings,
                           tag2label,
                           word2id,
                           paths,
                           config=config)
        model.build_graph()
        saver = tf.train.Saver()
        return model, saver, ckpt_file
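A hedged usage sketch for the method above: restoring the returned checkpoint into a session and tagging one sentence. demo_one follows the signature seen in Beispiel #30; the instance name client and the input sentence are hypothetical.

model, saver, ckpt_file = client.get_model()
with tf.Session(config=tf.ConfigProto()) as sess:
    saver.restore(sess, ckpt_file)
    demo_sent = list(u'some sentence'.strip())
    demo_data = [(demo_sent, ['O'] * len(demo_sent))]
    tag = model.demo_one(sess, demo_data)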
Beispiel #24
0
    ckpt_prefix = os.path.join(model_path, "model")
    paths['model_path'] = ckpt_prefix
    result_path = os.path.join(output_path, "results")
    paths['result_path'] = result_path
    if not os.path.exists(result_path): os.makedirs(result_path)
    log_path = os.path.join(result_path, "log.txt")
    paths['log_path'] = log_path
    get_logger(log_path).info(str(args))
    return paths

args = para_set()
paths = path_set()
config = tf.ConfigProto()
print("loading data")
train_data, test_data = auas_read_corpus("/home/jinsh/wiki_model/data/extraction.corpus_all.json")
print("{0} training data \n{1} test data".format(len(train_data), len(test_data)))
# always use random embedding

word2id = read_dictionary(word2id_path)
embeddings = random_embedding(word2id, args.embedding_dim)
model_path = root_dir + "key_word/best_model/checkpoints/"

ckpt_file = tf.train.latest_checkpoint(model_path)
# print(ckpt_file)
print(ckpt_file)
# exit
paths['model_path'] = ckpt_file
model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
model.build_graph()
# print("test data: {}".format(len(test_data)))
model.test(test_data)
Beispiel #25
0
    words_count_list = sorted(words_count_list, key=lambda x: x[0])
    word2id_mini = {}
    for i, item in enumerate(words_count_list):
        key = id2word[item[0]]
        word2id_mini[key] = i
    print(len(word2id_mini))
    with open(save_prefix + "word2id.pkl", 'wb') as fw:  # pickle requires a binary-mode handle
        pickle.dump(word2id_mini, fw)


if __name__ == "__main__":
    import data
    from Constant import pos2id, entity2id, event_args, label2idx, tag2label
    # word2id = data.read_dictionary("data/pre_trained_word2id.pkl")
    # embeddings = np.load("data/pre_trained_embeddings.npy")
    word2id = data.read_dictionary("data/pre_trained_mini_word2id.pkl")
    embeddings = np.load("data/pre_trained_mini_embeddings.npy")

    word2id_output = word2id.copy()
    word_ori_size = len(word2id)
    # word_mini_size = len(word2id_output)
    # word_size = word_ori_size
    # word_size = word_mini_size
    word_size = 0
    for k in tag2label:
        tag2label[k] += args.max_length
        if tag2label[k] > word_size:
            word_size = tag2label[k]
    # word2id_output.update(tag2label)
    word2id_output = tag2label
    word2id_output["<S>"] = word_size + 1
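A hedged worked example of the tag-id offsetting above, with a hypothetical args.max_length of 100 and a toy tag set, to make the resulting output vocabulary explicit:

# toy values; the real tag2label and args.max_length come from Constant.py / argparse
toy_tag2label = {"O": 0, "B-PER": 1, "I-PER": 2}
max_length = 100

word_size_demo = 0
for k in toy_tag2label:
    toy_tag2label[k] += max_length          # -> {"O": 100, "B-PER": 101, "I-PER": 102}
    if toy_tag2label[k] > word_size_demo:
        word_size_demo = toy_tag2label[k]   # ends up as 102, the largest shifted id

word2id_output_demo = toy_tag2label
word2id_output_demo["<S>"] = word_size_demo + 1   # the start symbol gets the next free id (103)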
Beispiel #26
0
lr_pl = float(conf.get('train_arg', 'lr_pl'))
# graph parameter configuration
uniDocModel_wordEmbedSize = int(
    conf.get('graph_arg', 'uniDocModel_wordEmbedSize'))
uniDocModel_hiddenSize = int(conf.get('graph_arg', 'uniDocModel_hiddenSize'))
classModel_hiddenSize = int(conf.get('graph_arg', 'classModel_hiddenSize'))
# padding parameter configuration
train_max_sent_len = int(conf.get('pad_arg', 'train_max_sent_len'))
train_max_sent_num = int(conf.get('pad_arg', 'train_max_sent_num'))
test_max_sent_len = int(conf.get('pad_arg', 'test_max_sent_len'))
test_max_sent_num = int(conf.get('pad_arg', 'test_max_sent_num'))

if __name__ == "__main__":
    mode = "train"
    if os.path.exists(os.path.join("data_path", 'word2id.pkl')):
        word2id = read_dictionary(os.path.join("data_path", 'word2id.pkl'))
    else:
        build_vocab_doc(os.path.join("data_path", 'word2id.pkl'),
                        train_data_path)
        word2id = read_dictionary(os.path.join("data_path", 'word2id.pkl'))
    vocab_size = len(word2id)
    num_tags = len(tag2label)
    if mode == "train":
        timestamp = str(int(time.time()))
    else:
        timestamp = conf.get('path_arg', 'test_time')
    output_path = os.path.join(output_path, timestamp)
    if not os.path.exists(output_path): os.makedirs(output_path)
    model_path = os.path.join(output_path, conf.get('path_arg', 'model_path'))
    summary_path = os.path.join(output_path,
                                conf.get('path_arg', 'summary_path'))
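For orientation, a hedged sketch of the sections and keys that the conf.get(...) calls in this example imply; every value below is a hypothetical placeholder:

import configparser

conf_sketch = configparser.ConfigParser()
conf_sketch.read_string("""
[train_arg]
lr_pl = 0.001

[graph_arg]
uniDocModel_wordEmbedSize = 300
uniDocModel_hiddenSize = 300
classModel_hiddenSize = 128

[pad_arg]
train_max_sent_len = 100
train_max_sent_num = 30
test_max_sent_len = 100
test_max_sent_num = 30

[path_arg]
test_time = 1521112368
model_path = checkpoints/
summary_path = summaries/
""")
assert conf_sketch.get('pad_arg', 'train_max_sent_len') == '100'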
Beispiel #27
0
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 27 14:47:03 2019

@author: Administrator
"""

#import tensorflow as tf
#import numpy as np
#import os, argparse, time, random
from BiLSTMmodel import bilstm_model
from data import read_corpus, read_dictionary, random_embedding
from config import config

## get char embeddings
word2id = read_dictionary('vocab')
## randomly initialize the embeddings
embeddings = random_embedding(word2id, config.embedding_size)

paths = {'log_path': 'logger//', 'model_path': './model2/', 'result_path': 'result//'}

# TODO note: model_path!! This one is a real gotcha!!

model = bilstm_model(embeddings, paths, word2id, config=config)
model.build_graph()



## train model on the whole training data
train_data = read_corpus('pku_training.utf8')
print("train data: {}".format(len(train_data)))
Beispiel #28
0
import logging
import os

import numpy as np

from data import read_corpus, read_dictionary
from model import BiLSTM_CRF, Config
from utils import NerCfgData

ner_cfg = NerCfgData()
label2id = ner_cfg.generate_tag_to_label()

logger = logging.getLogger(__name__)
current_dir = os.path.dirname(os.path.abspath(__file__))

## get char embeddings
word2id_pos2id = read_dictionary('word2id_pos2id_new.pkl')
word2id = word2id_pos2id['word2id']
pos2id = word2id_pos2id['pos2id']
word_embedding = np.array(np.load('word2vec.npy'), dtype=np.float32)
pos_embedding = np.array(np.load('pos2vec.npy'), dtype=np.float32)

config = Config(word2id,
                pos2id,
                label2id,
                batch_size=128,
                n_epochs=200,
                n_neurons=60)
config.word_embedding = word_embedding
config.pos_embedding = pos_embedding

## read corpus and get training data
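A hedged continuation sketch: loading the corpora the trailing comment refers to with the read_corpus helper imported above. The directory and file names are assumptions modelled on the other examples.

train_data = read_corpus(os.path.join(current_dir, 'data_path', 'train_data'))
test_data = read_corpus(os.path.join(current_dir, 'data_path', 'test_data'))
logger.info('train sentences: %d, test sentences: %d', len(train_data), len(test_data))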
Beispiel #29
0
                    type=str2bool,
                    default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model',
                    type=str,
                    default='1521112368',
                    help='model for test and demo')
parser.add_argument('--seq_length',
                    type=int,
                    default=20,
                    help='Pretrain language model seq length')
args = parser.parse_args()

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    pre_train_path = os.path.join('.', args.train_data, 'resume_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    pre_train_data = read_pre_train_data(pre_train_path, args.seq_length)

    test_data = read_corpus(test_path)
Beispiel #30
0
def run(sentences):
    # session configuration
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # use GPU 0
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # TensorFlow log level
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.2  # need ~700MB GPU memory

    # hyperparameter setup
    # create a parser object and declare the arguments it should expect,
    # so that it can process the command-line arguments when the program runs
    parser = argparse.ArgumentParser(
        description='BiLSTM-CRF for Chinese NER task')
    parser.add_argument('--train_data',
                        type=str,
                        default='data_path',
                        help='train data source')
    parser.add_argument('--test_data',
                        type=str,
                        default='data_path',
                        help='test data source')
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        help='#sample of each minibatch')
    # batch: batch size; deep-learning training usually uses SGD, i.e. each step
    # trains on batch_size samples drawn from the training set
    # iteration: one iteration = training once on batch_size samples
    # (one forward pass + one backward pass)
    parser.add_argument('--epoch',
                        type=int,
                        default=40,
                        help='#epoch of training')
    # epoch: one epoch = training once on every sample in the training set
    # one epoch = one forward and one backward pass over all training samples; e.g. with
    # 1000 training samples and batch_size=10, a full pass takes 100 iterations = 1 epoch
    parser.add_argument('--hidden_dim',
                        type=int,
                        default=300,
                        help='#dim of hidden state')
    # dimensionality of the hidden-state output vector: 300
    parser.add_argument('--optimizer',
                        type=str,
                        default='Adam',
                        help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
    # the optimizer used is Adam
    parser.add_argument('--CRF',
                        type=str2bool,
                        default=True,
                        help='use CRF at the top layer. if False, use Softmax')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='learning rate')
    parser.add_argument('--clip',
                        type=float,
                        default=5.0,
                        help='gradient clipping')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.5,
                        help='dropout keep_prob')
    # dropout: during training, network units are temporarily dropped from the network with a given probability
    parser.add_argument('--update_embedding',
                        type=str2bool,
                        default=True,
                        help='update embedding during training')
    parser.add_argument(
        '--pretrain_embedding',
        type=str,
        default='random',
        help='use pretrained char embedding or init it randomly')
    parser.add_argument('--embedding_dim',
                        type=int,
                        default=300,
                        help='random init char embedding_dim')
    parser.add_argument('--shuffle',
                        type=str2bool,
                        default=True,
                        help='shuffle training data before each epoch')
    parser.add_argument('--mode',
                        type=str,
                        default='demo',
                        help='train/test/demo')
    parser.add_argument('--demo_model',
                        type=str,
                        default='1559398699',
                        help='model for test and demo')
    # parse the arguments to be passed into the model
    args = parser.parse_args()

    # initialize the embedding matrix: load the vocabulary
    word2id = read_dictionary(os.path.join('.', args.train_data,
                                           'word2id.pkl'))
    # calling random_embedding returns a len(vocab) * embedding_dim = 3905 * 300 matrix (values in [-0.25, 0.25]) as the initial embeddings
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(word2id, args.embedding_dim)
    else:
        embedding_path = 'pretrain_embedding.npy'
        embeddings = np.array(np.load(embedding_path), dtype='float32')

    # load the training and test sets
    if args.mode != 'demo':
        train_path = os.path.join('.', args.train_data, 'train_data')
        test_path = os.path.join('.', args.test_data, 'test_data')
        train_data = read_corpus(train_path)
        test_data = read_corpus(test_path)
        test_size = len(test_data)

    # set up output paths
    paths = {}
    timestamp = str(int(
        time.time())) if args.mode == 'train' else args.demo_model
    output_path = os.path.join('.', args.train_data + "_save", timestamp)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    summary_path = os.path.join(output_path, "summaries")
    paths['summary_path'] = summary_path
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    model_path = os.path.join(output_path, "checkpoints/")
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    ckpt_prefix = os.path.join(model_path, "model")
    paths['model_path'] = ckpt_prefix
    result_path = os.path.join(output_path, "results")
    paths['result_path'] = result_path
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    log_path = os.path.join(result_path, "log.txt")
    paths['log_path'] = log_path
    get_logger(log_path).info(str(args))  # write the arguments to the log file

    if args.mode == 'train':  # train the model
        model = BiLSTM_CRF(args,
                           embeddings,
                           tag2label,
                           word2id,
                           paths,
                           config=config)
        model.build_graph()
        model.train(train=train_data, dev=test_data)

    elif args.mode == 'test':  # test the model
        ckpt_file = tf.train.latest_checkpoint(model_path)
        print(ckpt_file)
        paths['model_path'] = ckpt_file
        model = BiLSTM_CRF(args,
                           embeddings,
                           tag2label,
                           word2id,
                           paths,
                           config=config)
        model.build_graph()
        print("test data: {}".format(test_size))
        model.test(test_data)

    # demo
    elif args.mode == 'demo':
        location = []
        ckpt_file = tf.train.latest_checkpoint(model_path)
        print("model path: ", ckpt_file)
        paths['model_path'] = ckpt_file  # set the model (checkpoint) path
        model = BiLSTM_CRF(args,
                           embeddings,
                           tag2label,
                           word2id,
                           paths,
                           config=config)
        model.build_graph()
        saver = tf.train.Saver()
        with tf.Session(config=config) as sess:
            saver.restore(sess, ckpt_file)
            for sentence in sentences:
                demo_sent = sentence
                demo_sent = list(demo_sent.strip())  # strip surrounding whitespace
                demo_data = [(demo_sent, ['O'] * len(demo_sent))]
                tag = model.demo_one(sess, demo_data)
                PER, LOC, ORG = get_entity(tag, demo_sent)  # recover the entity strings from the tag sequence
                new_LOC = list(set(LOC))  # deduplicate
                loc = ' '.join(new_LOC)
                location.append(loc)
            return location
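A hedged usage sketch for run(): the input sentences below are hypothetical, and the returned list holds one space-joined string of unique LOC entities per sentence:

if __name__ == '__main__':
    # hypothetical inputs: "I work in Beijing." / "He went from Shanghai to Shenzhen."
    sentences = [u'我在北京工作。', u'他从上海去了深圳。']
    locations = run(sentences)
    for sent, loc in zip(sentences, locations):
        print(sent + ' -> ' + loc)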