def load_parameters():
    """load parameters from config file"""

    with open('./config.yml') as file_config:
        config = yaml.load(file_config)

    feature_names = ['f1']
    use_char_feature = config['model_params']['use_char_feature']

    # load vocs
    path_vocs = []

    for feature_name in feature_names:
        path_vocs.append(
            config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # load data
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']

    return feature_names, sep, vocs, max_len, use_char_feature, word_len
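
Every snippet in this listing reads the same config.yml, which the page does not show; the snippets also assume the project's own helpers (load_vocs, init_data, SequenceLabelingModel, ...) and standard imports (yaml, pickle, codecs, tensorflow as tf) that the listing omits. The block below is only a minimal sketch of the config shape implied by the keys these examples access; all values are placeholder assumptions, not the original project's settings.

import yaml

# hypothetical, minimal config.yml matching the lookups in these snippets
CONFIG_SKETCH = """
model_params:
  feature_names: [f1]
  use_char_feature: false
  sequence_length: 200
  word_length: 10
  nb_classes: 7
  nb_epoch: 100
  batch_size: 64
  max_patience: 5
  dropout_rate: 0.5
  l2_rate: 0.001
  rnn_unit: lstm
  learning_rate: 0.001
  clip: 5
  dev_size: 0.1
  use_crf: true
  path_model: ./model/best_model
  bilstm_params:
    num_units: 256
    num_layers: 1
    rnn_dropout: 0.2
  embed_params:
    f1:
      shape: [10000, 128]
      dropout_rate: 0.3
      path: null
data_params:
  sep: table
  path_train: ./data/train.txt
  path_test: ./data/test.txt
  path_result: ./data/result.txt
  voc_params:
    f1: {path: ./voc/f1.voc}
    label: {path: ./voc/label.voc}
"""

config = yaml.safe_load(CONFIG_SKETCH)
assert config['data_params']['sep'] in ['table', 'space']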
Code example #2
File: train.py, Project: xyangk/NER-LSTM-CRF
def main():
    # load the config file
    with open('./config.yml') as file_config:
        config = yaml.load(file_config, Loader=yaml.SafeLoader)

    feature_names = config['model_params']['feature_names']

    # initialize embedding shapes and dropout rates (pre-trained embeddings are also loaded here)
    feature_weight_shape_dict, feature_weight_dropout_dict, \
        feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name]['path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # load data

    # load vocs
    path_vocs = []
    for feature_name in feature_names:
        path_vocs.append(config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # load the training data
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    data_dict = init_data(
        path=config['data_params']['path_train'], feature_names=feature_names, sep=sep,
        vocs=vocs, max_len=config['model_params']['sequence_length'], model='train')

    # train the model
    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'], feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        clip=config['model_params']['clip'],
        path_model=config['model_params']['path_model'])

    model.fit(
        data_dict=data_dict, dev_size=config['model_params']['dev_size'])
Code example #3
File: test.py, Project: zyfnhct/NER-LSTM-CRF
def main():
    # load the config file
    with open('./config.yml') as file_config:
        config = yaml.load(file_config, Loader=yaml.SafeLoader)

    feature_names = config['model_params']['feature_names']
    use_char_feature = config['model_params']['use_char_feature']

    # initialize embedding shapes and dropout rates (pre-trained embeddings are also loaded here)
    feature_weight_shape_dict, feature_weight_dropout_dict, \
        feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name]['path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)
    # char embedding shape
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            config['model_params']['embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None
    # load data

    # load vocs
    path_vocs = []
    if use_char_feature:
        path_vocs.append(config['data_params']['voc_params']['char']['path'])
    for feature_name in feature_names:
        path_vocs.append(config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # load the test data
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']
    data_dict = init_data(
        path=config['data_params']['path_test'], feature_names=feature_names, sep=sep,
        vocs=vocs, max_len=max_len, model='test', use_char_feature=use_char_feature,
        word_len=word_len)

    # load the model
    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'], feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=config['model_params']['path_model'])
    saver = tf.train.Saver()
    saver.restore(model.sess, config['model_params']['path_model'])

    # tagging
    viterbi_sequences = model.predict(data_dict)

    # write results to file
    label_voc = dict()
    for key in vocs[-1]:
        label_voc[vocs[-1][key]] = key
    with codecs.open(config['data_params']['path_test'], 'r', encoding='utf-8') as file_r:
        sentences = file_r.read().strip().split('\n\n')
    file_result = codecs.open(
        config['data_params']['path_result'], 'w', encoding='utf-8')
    for i, sentence in enumerate(sentences):
        for j, item in enumerate(sentence.split('\n')):
            if j < len(viterbi_sequences[i]):
                file_result.write('%s\t%s\n' % (item, label_voc[viterbi_sequences[i][j]]))
            else:
                file_result.write('%s\tO\n' % item)
        file_result.write('\n')

    file_result.close()
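
load_vocs is not shown anywhere on this page. Given how it is used above (each voc maps a token to an integer id, with the label voc appended last and inverted for decoding), a plausible stand-in is a one-token-per-line reader like the sketch below; this is an assumption, not the project's actual implementation.

import codecs

def load_vocs_sketch(path_vocs):
    """Hypothetical stand-in for load_vocs: one token per line per file,
    ids starting at 2 (rows 0 and 1 of the embedding matrix are reserved,
    per the comments in code example #9)."""
    vocs = []
    for path in path_vocs:
        voc = dict()
        with codecs.open(path, 'r', encoding='utf-8') as file_r:
            for index, line in enumerate(file_r):
                token = line.rstrip('\n')
                if token:
                    voc[token] = index + 2  # offset for the two reserved rows
        vocs.append(voc)
    return vocs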
Code example #4
File: test.py, Project: mayuchen/slot-filling
def main():
    # load the config file
    with open('./config.yml') as file_config:
        config = yaml.load(file_config, Loader=yaml.SafeLoader)


    feature_names = config['model_params']['feature_names'][:2]
    for domain_id in range(83):
        feature_names.append('domain' + str(domain_id))

    use_char_feature = config['model_params']['use_char_feature']

    # initialize embedding shapes and dropout rates (pre-trained embeddings are also loaded here)

    feature_weight_shape_dict, feature_weight_dropout_dict, \
        feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names[:2]:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name]['path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    for feature_name in feature_names[2:]:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params']['default']['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params']['default']['dropout_rate']
        path_pre_train = config['model_params']['embed_params']['default']['path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # char embedding shape
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            config['model_params']['embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None
    # load data

    # load vocs
    path_vocs = []
    if use_char_feature:
        path_vocs.append(config['data_params']['voc_params']['char']['path'])
    for feature_name in config['model_params']['feature_names']:
        path_vocs.append(config['data_params']['voc_params'][feature_name]['path'])
    # for feature_name in feature_names[2:]:
    #     path_vocs.append(config['data_params']['voc_params']['default']['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)


    # load the test data
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']
    session_data_dict = load_session_data_domain(
        config['data_params']['path_test'],
        config['model_params']['feature_names'], vocs, max_len, model='test')
    # print(session_data_dict)
    # load the model
    model = SequenceLabelingModel2(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'], feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=config['model_params']['path_model'])

    saver = tf.train.Saver()
    saver.restore(model.sess, config['model_params']['path_model'])

    # tagging
    infer_results = model.predict(session_data_dict)
    # print(infer_results)
    # write results to file
    slot_voc = dict()
    for key in vocs[-1]:
        slot_voc[vocs[-1][key]] = key
    intent_voc = dict()
    for key in INTENT_DIC:
        intent_voc[INTENT_DIC[key]] = key

    with codecs.open(config['data_params']['path_test'], 'r', encoding='utf-8') as file_r:
        sentences = file_r.readlines()
    file_result = codecs.open(
        config['data_params']['path_result'], 'w', encoding='utf-8')
    infer_sentence_results = []
    # for sid, pred_intents in infer_results:
    #     for x in pred_intents:
    #         infer_sentence_results += [(sid, x)]
    # ['10000', [2], [[1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 5, 1, 1, 1, 1, 1, 22, 59]]]
    for sid, pred_intents, pre_slots in infer_results:
        pre_label = []
        pre_label.append(pred_intents)
        for slot in pre_slots:
            pre_label.append(slot)
        infer_sentence_results.append((sid, pre_label))
    print('predict sentences count', len(infer_sentence_results), len(sentences))
    # for i, sentence in enumerate(sentences):
    #     sid, pred_intent = infer_sentence_results[i]
    #     for j, item in enumerate(sentence.split('\n')):
    #         if j == 0:
    #             for key in INTENT_DIC.keys():
    #                 if pred_intent == INTENT_DIC.get(key):
    #                     file_result.write('%s\t%s\n' % (item, key))
    #             continue
    #         else:
    #             file_result.write('%s\tO\n' % item)
    #     file_result.write('\n')
    infer_session = []
    session_temp = []
    for ss in sentences:
        if ss.strip():
            session_temp.append(ss.strip('\n'))
        else:
            infer_session.append(session_temp)
            session_temp = []
    if session_temp:
        infer_session.append(session_temp)
    assert len(infer_session) == len(infer_sentence_results)
    for sessions, results in zip(infer_session, infer_sentence_results):
        labels = results[1]
        # print(len(sessions) , len(labels))
        # print(sessions)
        assert len(sessions) == len(labels)
        first = True
        for sess, lab in zip(sessions, labels):
            if first:
                file_result.write(sess.strip() + '\t' + intent_voc[lab] + '\n')
                first = False
            else:
                file_result.write(sess.strip() + '\t' + slot_voc[lab] + '\n')
        file_result.write('\n')
    file_result.close()
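
The blank-line grouping into infer_session above is the subtle part of this writer; the same logic can be checked standalone with made-up input lines:

# group lines into sessions on blank lines, mirroring the loop above
sentences = ['sent-1a', 'sent-1b', '', 'sent-2a']
infer_session, session_temp = [], []
for ss in sentences:
    if ss.strip():
        session_temp.append(ss.strip('\n'))
    else:
        infer_session.append(session_temp)
        session_temp = []
if session_temp:  # flush the trailing session
    infer_session.append(session_temp)
assert infer_session == [['sent-1a', 'sent-1b'], ['sent-2a']]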
Code example #5
    def __init__(self):
        with open('./config.yml') as file_config:
            config = yaml.load(file_config, Loader=yaml.SafeLoader)

        self.max_len = config['model_params']['sequence_length']
        feature_names = config['model_params']['feature_names']
        use_char_feature = config['model_params']['use_char_feature']

        # initialize embedding shapes and dropout rates (pre-trained embeddings are also loaded here)
        feature_weight_shape_dict, feature_weight_dropout_dict, \
            feature_init_weight_dict = dict(), dict(), dict()
        for feature_name in feature_names:
            feature_weight_shape_dict[feature_name] = \
                config['model_params']['embed_params'][feature_name]['shape']
            feature_weight_dropout_dict[feature_name] = \
                config['model_params']['embed_params'][feature_name]['dropout_rate']
            path_pre_train = config['model_params']['embed_params'][
                feature_name]['path']
            if path_pre_train:
                with open(path_pre_train, 'rb') as file_r:
                    feature_init_weight_dict[feature_name] = pickle.load(
                        file_r)
        # char embedding shape
        if use_char_feature:
            feature_weight_shape_dict['char'] = \
                config['model_params']['embed_params']['char']['shape']
            conv_filter_len_list = config['model_params'][
                'conv_filter_len_list']
            conv_filter_size_list = config['model_params'][
                'conv_filter_size_list']
        else:
            conv_filter_len_list = None
            conv_filter_size_list = None

        # load vocs
        path_vocs = []
        if use_char_feature:
            path_vocs.append(
                config['data_params']['voc_params']['char']['path'])
        for feature_name in feature_names:
            path_vocs.append(
                config['data_params']['voc_params'][feature_name]['path'])
        path_vocs.append(config['data_params']['voc_params']['label']['path'])
        self.vocs = load_vocs(path_vocs)

        # load the model
        self.model = ClassficationModel(
            sequence_length=config['model_params']['sequence_length'],
            nb_classes=config['model_params']['nb_classes'],
            nb_hidden=config['model_params']['bilstm_params']['num_units'],
            num_layers=config['model_params']['bilstm_params']['num_layers'],
            feature_weight_shape_dict=feature_weight_shape_dict,
            feature_init_weight_dict=feature_init_weight_dict,
            feature_weight_dropout_dict=feature_weight_dropout_dict,
            dropout_rate=config['model_params']['dropout_rate'],
            nb_epoch=config['model_params']['nb_epoch'],
            feature_names=feature_names,
            batch_size=config['model_params']['batch_size'],
            train_max_patience=config['model_params']['max_patience'],
            use_crf=config['model_params']['use_crf'],
            l2_rate=config['model_params']['l2_rate'],
            rnn_unit=config['model_params']['rnn_unit'],
            learning_rate=config['model_params']['learning_rate'],
            use_char_feature=use_char_feature,
            conv_filter_size_list=conv_filter_size_list,
            conv_filter_len_list=conv_filter_len_list,
            word_length=config['model_params']['word_length'],
            path_model=config['model_params']['path_model'])
        saver = tf.train.Saver()
        saver.restore(self.model.sess, config['model_params']['path_model'])

        self.label_voc = {}
        for key, value in self.vocs[-1].items():
            self.label_voc[value] = key
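
The closing loop builds an id-to-label lookup; the same inversion (also built from vocs[-1] in examples #3, #4 and #10) collapses to a dict comprehension, shown here on a toy voc:

vocs = [{'B-LOC': 2, 'I-LOC': 3, 'O': 4}]  # toy label voc
label_voc = {value: key for key, value in vocs[-1].items()}
assert label_voc[2] == 'B-LOC'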
Code example #6
File: train.py, Project: xiaopangxia/DS_CTT
def main():
    # load the config file
    print("config5")
    with open('./train_config/config_b2b_tag_5_only_jieba.yml') as file_config:
        config = yaml.load(file_config, Loader=yaml.SafeLoader)

    feature_names = config['model_params']['feature_names']
    use_char_feature = config['model_params']['use_char_feature']

    # initialize embedding shapes and dropout rates (pre-trained embeddings are also loaded here)
    feature_weight_shape_dict, feature_weight_dropout_dict, \
        feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name][
            'path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)
    # char embedding shape
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            config['model_params']['embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # load data

    # load vocs
    path_vocs = []
    if use_char_feature:
        path_vocs.append(config['data_params']['voc_params']['char']['path'])
    for feature_name in feature_names:
        path_vocs.append(
            config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # load the training data
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']
    data_dict = init_data(path=config['data_params']['path_train'],
                          feature_names=feature_names,
                          sep=sep,
                          vocs=vocs,
                          max_len=max_len,
                          model='train',
                          use_char_feature=use_char_feature,
                          word_len=word_len)

    # train the model
    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        rnn_dropout=config['model_params']['bilstm_params']['rnn_dropout'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'],
        feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        clip=config['model_params']['clip'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        cnn_dropout_rate=config['model_params']['conv_dropout'],
        word_length=word_len,
        path_model=config['model_params']['path_model'])

    model.fit(data_dict=data_dict, dev_size=config['model_params']['dev_size'])
Code example #7
for feature_name in feature_names:
    feature_weight_shape_dict[feature_name] = \
        config['model_params']['embed_params'][feature_name]['shape']
    feature_weight_dropout_dict[feature_name] = \
        config['model_params']['embed_params'][feature_name]['dropout_rate']
    path_pre_train = config['model_params']['embed_params'][feature_name][
        'path']
    if path_pre_train:
        with open(path_pre_train, 'rb') as file_r:
            feature_init_weight_dict[feature_name] = pickle.load(file_r)

# load vocs
path_vocs = []
for feature_name in feature_names:
    path_vocs.append(config['data_params']['voc_params'][feature_name]['path'])
path_vocs.append(config['data_params']['voc_params']['label']['path'])
vocs = load_vocs(path_vocs)
print(vocs[-1])
print(len(vocs))
# load the model
model = SequenceLabelingModel(
    sequence_length=config['model_params']['sequence_length'],
    nb_classes=config['model_params']['nb_classes'],
    nb_hidden=config['model_params']['bilstm_params']['num_units'],
    feature_weight_shape_dict=feature_weight_shape_dict,
    feature_init_weight_dict=feature_init_weight_dict,
    feature_weight_dropout_dict=feature_weight_dropout_dict,
    dropout_rate=config['model_params']['dropout_rate'],
    nb_epoch=config['model_params']['nb_epoch'],
    feature_names=feature_names,
    batch_size=config['model_params']['batch_size'],
    train_max_patience=config['model_params']['max_patience'],
Code example #8
File: train.py, Project: mayuchen/slot-filling
def main():
    # load the config file
    with open('./config.yml') as file_config:
        config = yaml.load(file_config, Loader=yaml.SafeLoader)

    feature_names = config['model_params']['feature_names'][:2]
    for domain_id in range(83):
        feature_names.append('domain' + str(domain_id))
    use_char_feature = config['model_params']['use_char_feature']

    # initialize embedding shapes and dropout rates (pre-trained embeddings are also loaded here)
    """
    f1: character
    f2: part of speech
    f3: slot name
    """
    feature_weight_shape_dict, feature_weight_dropout_dict, \
        feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names[:2]:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name][
            'path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    for feature_name in feature_names[2:]:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params']['default']['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params']['default']['dropout_rate']
        path_pre_train = config['model_params']['embed_params']['default'][
            'path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)
    # char embedding shape
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            config['model_params']['embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # load data

    # load vocs
    path_vocs = []
    if use_char_feature:
        path_vocs.append(config['data_params']['voc_params']['char']['path'])
    for feature_name in config['model_params']['feature_names']:
        path_vocs.append(
            config['data_params']['voc_params'][feature_name]['path'])
    # for feature_name in feature_names[2:]:
    #     path_vocs.append(config['data_params']['voc_params']['default']['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # load the training data
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']
    nb_classes = config['model_params']['nb_classes']

    session_data_dict = load_session_data_domain(
        config['data_params']['path_train'],
        config['model_params']['feature_names'], vocs, max_len)

    print(len(feature_names))
    model = SequenceLabelingModel2(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        rnn_dropout=config['model_params']['bilstm_params']['rnn_dropout'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'],
        feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        clip=config['model_params']['clip'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        cnn_dropout_rate=config['model_params']['conv_dropout'],
        word_length=word_len,
        path_model=config['model_params']['path_model'])

    model.fit(data_dict=session_data_dict,
              dev_size=config['model_params']['dev_size'])
Code example #9
def main():
    # load the config file
    with open('./config.yml') as file_config:
        config = yaml.load(file_config, Loader=yaml.SafeLoader)

    feature_names = config['model_params']['feature_names']
    logger.info(feature_names)
    use_char_feature = config['model_params']['use_char_feature']
    logger.info(use_char_feature)
    # initialize embedding shapes and dropout rates (pre-trained embeddings are also loaded here)
    feature_weight_shape_dict = dict()
    feature_weight_dropout_dict = dict()
    feature_init_weight_dict = dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = config['model_params'][
            'embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = config['model_params'][
            'embed_params'][feature_name]['dropout_rate']
        # the embedding matrix has two more rows than the voc: voc ids start
        # at 2, and rows 0 and 1 are zero-filled
        path_pre_train = config['model_params']['embed_params'][feature_name][
            'path']  # path to the pre-trained embedding matrix
        # logger.info("%s init mat path: %s" % (feature_name, path_pre_train))
        with open(path_pre_train, 'rb') as file_r:
            feature_init_weight_dict[feature_name] = pickle.load(file_r)
    logger.info(feature_weight_dropout_dict)
    logger.info(feature_weight_shape_dict)
    logger.info(feature_init_weight_dict)

    # char embedding shape
    if use_char_feature:
        # not handled for now
        feature_weight_shape_dict['char'] = config['model_params'][
            'embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        # (when enabled, conv layers extract the char-level information)
        conv_filter_len_list = None
        conv_filter_size_list = None

    # 加载vocs
    path_vocs = []
    if use_char_feature:
        path_vocs.append(config['data_params']['voc_params']['char']
                         ['path'])  # vocs map tokens to integer ids
    for feature_name in feature_names:
        path_vocs.append(
            config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # load the training data
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']  # how the data columns are separated
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']

    # use the vocs to turn the f1 inputs and label outputs into id sequences, giving the training inputs and targets
    # data_dict = None
    data_dict = init_data(path=config['data_params']['path_train'],
                          feature_names=feature_names,
                          sep=sep,
                          vocs=vocs,
                          max_len=max_len,
                          model='train',
                          use_char_feature=use_char_feature,
                          word_len=word_len)
    logger.info(data_dict)  # the id-encoded data for each feature
    # train the model
    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],  # sentences are padded/truncated to this length
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        rnn_dropout=config['model_params']['bilstm_params']['rnn_dropout'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'],
        feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        clip=config['model_params']['clip'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        cnn_dropout_rate=config['model_params']['conv_dropout'],
        word_length=word_len,
        path_model=config['model_params']['path_model'],
        last_train_sess_path=None,  # optionally warm-start from previously trained parameters to speed up training
        transfer=False)  # if True, transfer-learn on the loaded parameters and reset the LSTM output layer

    model.fit(data_dict=data_dict, dev_size=config['model_params']['dev_size'])
    """
Code example #10
def predict(testlist):
    # load the config file
    with open('./config.yml') as file_config:
        config = yaml.load(file_config, Loader=yaml.SafeLoader)

    feature_names = config['model_params']['feature_names']
    use_char_feature = config['model_params']['use_char_feature']

    # initialize embedding shapes and dropout rates (pre-trained embeddings are also loaded here)
    feature_weight_shape_dict, feature_weight_dropout_dict, \
        feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name][
            'path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)
    # char embedding shape
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            config['model_params']['embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # load vocs
    path_vocs = []
    if use_char_feature:
        path_vocs.append(config['data_params']['voc_params']['char']['path'])
    for feature_name in feature_names:
        path_vocs.append(
            config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # load the test data
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']
    data_dict = init_data(path=config['data_params']['path_test'],
                          feature_names=feature_names,
                          sep=sep,
                          test_sens=testlist,
                          vocs=vocs,
                          max_len=max_len,
                          model='test',
                          use_char_feature=use_char_feature,
                          word_len=word_len)

    # load the model
    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'],
        feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=config['model_params']['path_model'])
    saver = tf.train.Saver()
    saver.restore(model.sess, config['model_params']['path_model'])

    # print('data_dict', data_dict)
    # tagging
    result_sequences = model.predict(data_dict)

    # print('result_sequences', result_sequences)

    # build the output
    label_voc = dict()
    for key in vocs[-1]:
        label_voc[vocs[-1][key]] = key

    outlist = []
    for i, sentence in enumerate(testlist):
        templist = []
        for j, item in enumerate(sentence):
            # char = recheck_char(item[0])
            char = item[0]
            if j < len(result_sequences[i]):
                out = [char, label_voc[result_sequences[i][j]]]
            else:
                out = [char, 'O']
            templist.append(out)
        outlist.append(templist)
    return outlist
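
Judging from the output loop above, testlist is a list of sentences whose per-token items carry the character at position 0, and each returned item pairs the character with its predicted label. A hypothetical call (sentence content invented; a trained model and config.yml are required) would look like:

testlist = [
    [('我',), ('在',), ('北',), ('京',)],
    [('你',), ('好',)],
]
outlist = predict(testlist)
for sentence in outlist:
    # each entry is [char, predicted_label]
    print(' '.join('%s/%s' % (char, label) for char, label in sentence))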
Code example #11
def export_serving_model():
    """输出tensorserving model"""
    with open('./config.yml') as file_config:
        config = yaml.load(file_config)

    feature_names = config['model_params']['feature_names']
    use_char_feature = config['model_params']['use_char_feature']

    # initialize embedding shapes and dropout rates (pre-trained embeddings are also loaded here)
    feature_weight_shape_dict, feature_weight_dropout_dict, \
        feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name][
            'path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)
    # char embedding shape
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            config['model_params']['embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # load vocs
    path_vocs = []

    for feature_name in feature_names:
        path_vocs.append(
            config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # load data parameters
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']

    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'],
        feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=config['model_params']['path_model'])

    session = model.sess
    saver = tf.train.Saver()

    saver.restore(session, config['model_params']['path_model'])

    # export the TensorFlow Serving model
    model_version = 1
    work_dir = './Model/ner_model'

    export_path_base = work_dir
    export_path = os.path.join(tf.compat.as_bytes(export_path_base),
                               tf.compat.as_bytes(str(model_version)))

    print('Exporting trained model to', export_path)
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    # define the input tensors
    tensor_info_input_x_f1 = tf.saved_model.utils.build_tensor_info(
        model.input_feature_ph_dict['f1'])
    tensor_info_weight_dropout_ph_dict_f1 = tf.saved_model.utils.build_tensor_info(
        model.weight_dropout_ph_dict['f1'])
    tensor_info_dropout_rate_ph = tf.saved_model.utils.build_tensor_info(
        model.dropout_rate_ph)
    tensor_info_rnn_dropout_rate_ph = tf.saved_model.utils.build_tensor_info(
        model.rnn_dropout_rate_ph)

    tensor_info_logits = tf.saved_model.utils.build_tensor_info(model.logits)
    tensor_info_actual_length = tf.saved_model.utils.build_tensor_info(
        model.sequence_actual_length)
    tensor_info_transition_params = tf.saved_model.utils.build_tensor_info(
        model.transition_params)

    # build the prediction signature
    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                'input_x_f1': tensor_info_input_x_f1,
                'weight_dropout_ph_dict_f1':
                tensor_info_weight_dropout_ph_dict_f1,
                'dropout_rate_ph': tensor_info_dropout_rate_ph,
                'rnn_dropout_rate_ph': tensor_info_rnn_dropout_rate_ph
            },
            outputs={
                'transition_params': tensor_info_transition_params,
                'logits': tensor_info_logits,
                'sequence_actual_length': tensor_info_actual_length,
            },
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
    )

    builder.add_meta_graph_and_variables(
        session, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={'ner_predict': prediction_signature},
        main_op=tf.tables_initializer(),
        strip_default_attrs=True)

    builder.save()

    print('Done exporting!')
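
As a quick sanity check, the exported SavedModel can be loaded back with the same TF1 API and its 'ner_predict' signature inspected. This sketch only lists the recorded input and output names, since real feeds would need id-encoded sentences:

import os
import tensorflow as tf

export_path = os.path.join('./Model/ner_model', '1')
with tf.Session(graph=tf.Graph()) as sess:
    # load the SERVING graph exported by the builder above
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_path)
    signature = meta_graph.signature_def['ner_predict']
    print(sorted(signature.inputs.keys()))   # input_x_f1, dropout placeholders, ...
    print(sorted(signature.outputs.keys()))  # logits, sequence_actual_length, transition_params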