Example 1
from numpy import dot
from gensim import matutils


def get_similar_index(vec1, vec_list, topn=10):
    # Return (index, similarity) tuples for the topn most similar titles (default 10)
    try:
        dists = dot(vec_list, vec1)
        topn_idx = matutils.argsort(dists, topn=topn, reverse=True)
        topn_tuple = [(idx, dists[idx]) for idx in topn_idx]
        return topn_tuple
    except Exception as e:
        print('calculate dot error!', e)


if __name__ == "__main__":
    # load word2vec models
    model_zh = load_word2vec_model('models/wiki.zh.word_200v.model')
    model_en = load_word2vec_model('models/wiki.en.word_200v.model')
    # model_cha = load_word2vec_model('models/wiki.en.char_200v.model')
    # models = [model_zh, model_en, model_cha]
    models = [model_zh, model_en]
    model_size = 200

    responses = get_libs('data/tianlong_libs.xlsx')
    responses_vec_list = get_vec_sen_list(responses, models, model_size)
    while True:
        query = input('you:')
        query_vec = get_vec_sen(query, models, model_size)

        topn_tuple = get_similar_index(query_vec, responses_vec_list, 10)
        topn_responses = [(responses[index], score)
                          for index, score in topn_tuple]

        print(topn_responses)
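
The helpers load_word2vec_model, get_libs and get_vec_sen are defined elsewhere in the project. As a rough sketch only, get_vec_sen presumably averages a sentence's word vectors across the loaded models; the version below is an assumption (including the jieba tokenizer), not the original implementation. Unit-normalizing the result makes the dot product in get_similar_index behave as a cosine similarity.

import numpy as np
import jieba  # assumed tokenizer; the original may segment differently

def get_vec_sen(sentence, models, model_size):
    # Average the vectors of every word any model knows; zero vector if none match.
    vec = np.zeros(model_size, dtype=np.float32)
    count = 0
    for word in jieba.lcut(sentence):
        for model in models:
            if word in model.wv:
                vec += model.wv[word]
                count += 1
    if count:
        vec /= count
        norm = np.linalg.norm(vec)
        if norm:
            vec /= norm  # unit length, so dot products act as cosine similarity
    return vec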
Example 2
import os
import pickle

import tensorflow as tf

# Config, Model, TextConverter, get_excel_QAs and get_libs are project-local imports


def main(_):
    model_path = os.path.join('models', Config.file_name)

    input_file = 'data/去除2和null.xlsx'
    # input_file = 'data/30w_.xlsx'
    vocab_file = os.path.join(model_path, 'vocab_label.pkl')

    # Data processing
    converter = TextConverter(None,
                              vocab_file,
                              max_vocab=Config.vocab_max_size,
                              seq_length=Config.seq_length)
    print('vocab size:', converter.vocab_size)

    # Load the model checkpoint saved last time
    model = Model(Config, converter.vocab_size)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    # Load the test-set data
    test_QAs = get_excel_QAs(input_file, 0)
    test_libs = get_libs('data/tianlong_libs.xlsx')  # use the full library (30k+ entries)
    # test_libs = [r for q, r, y in test_QAs]  # or use the QA pairs instead

    test_libs_arrs = converter.libs_to_arrs(test_libs)

    # Build the response matching vectors, or load them from cache
    save_file = checkpoint_path + '_matul_state_QAs.pkl'
    if not os.path.exists(save_file):
        response_matul_state = model.test_to_matul(test_libs_arrs)
        with open(save_file, 'wb') as f:
            pickle.dump(response_matul_state, f)
    else:
        with open(save_file, 'rb') as f:
            response_matul_state = pickle.load(f)

    # Run the test loop
    print('start testing...')
    QAY = []
    k, n = 0, 0
    for query, y_response, label in test_QAs:
        input_arr, input_len = converter.text_to_arr(query)
        indexs = model.test(input_arr, input_len, response_matul_state)
        if len(indexs) > 10:
            indexs = indexs[:10]  # keep only the top 10 candidates
        responses = converter.index_to_response(indexs, test_libs)

        QAY.append((query, y_response, responses, ['']))
        if responses[0] == y_response:  # top-1 response matches the reference
            k += 1
            print(k, '/', n)
        n += 1
    print('accuracy:', k / float(n))
    result_xls = checkpoint_path + '_Q_for_libs.xls'
    converter.save_to_excel(QAY, result_xls)

    # Append the accuracy and the config used to a running log file
    configs = [(name, getattr(Config, name)) for name in dir(Config) if not name.startswith('_')]
    with open('models/acc.txt', 'a', encoding='utf8') as w_acc:
        w_acc.write('\n\n---accuracy:"%.3f" ----------config:-----------\n' %
                    (k / float(n)))
        for name, value in configs:
            w_acc.write(name + ' = ' + str(value) + '\n')
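
The compute-then-cache block around response_matul_state above is a common pattern and can be factored into a small helper. This is a refactoring sketch under that reading (the helper name is ours, not the project's):

import os
import pickle

def cached(path, compute):
    # Load a pickled result if present; otherwise compute it, save it, and return it.
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    result = compute()
    with open(path, 'wb') as f:
        pickle.dump(result, f)
    return result

# Usage matching the example above:
# response_matul_state = cached(checkpoint_path + '_matul_state_QAs.pkl',
#                               lambda: model.test_to_matul(test_libs_arrs))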