def doc_distance():
    """Handle a document-distance request.

    Reads form fields 'category' and 'type'. When type == 'doc' the two
    documents arrive as file uploads 'text1'/'text2'; otherwise they are
    plain form fields. Returns a JSON string with the Jensen-Shannon
    divergence and Hellinger distance between the two documents.
    """
    category = request.form['category']
    in_type = request.form['type']
    if in_type == 'doc':
        f1 = request.files['text1']
        f2 = request.files['text2']
        # Bug fix: the original fell through with f_text1/f_text2 unbound
        # (NameError) when either upload could not be saved; fail explicitly.
        if not (save_file(f1) and save_file(f2)):
            return json.dumps({"error": "failed to save uploaded files"},
                              ensure_ascii=False)
        f_text1 = read_file(f1)
        f_text2 = read_file(f2)
    else:
        f_text1 = request.form['text1'].encode('utf-8').strip()
        f_text2 = request.form['text2'].encode('utf-8').strip()

    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    doc1_seg = inference_engine_wrapper.tokenize(f_text1)
    doc2_seg = inference_engine_wrapper.tokenize(f_text2)
    distances = inference_engine_wrapper.cal_doc_distance(doc1_seg, doc2_seg)

    return json.dumps(
        {
            "Jensen-Shannon Divergence": distances[0],
            "Hellinger Distance": distances[1]
        },
        ensure_ascii=False)
def query_doc_sim():
    """Handle a query-to-document similarity request.

    Reads form fields 'category' and 'type'. When type == 'doc' the query
    and document arrive as file uploads 'text1'/'text2'; otherwise they are
    plain form fields. Returns a JSON string with the LDA and TWE
    similarity scores.
    """
    category = request.form['category']
    in_type = request.form['type']
    if in_type == 'doc':
        f1 = request.files['text1']
        f2 = request.files['text2']
        # Bug fix: the original fell through with f_text1/f_text2 unbound
        # (NameError) when either upload could not be saved; fail explicitly.
        if not (save_file(f1) and save_file(f2)):
            return json.dumps({"error": "failed to save uploaded files"},
                              ensure_ascii=False)
        f_text1 = read_file(f1)
        f_text2 = read_file(f2)
    else:
        f_text1 = request.form['text1'].encode('utf-8').strip()
        f_text2 = request.form['text2'].encode('utf-8').strip()

    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf(),
                                                      get_emb_file(category))
    doc1_seg = inference_engine_wrapper.tokenize(f_text1)
    doc2_seg = inference_engine_wrapper.tokenize(f_text2)
    distances = inference_engine_wrapper.cal_query_doc_similarity(
        doc1_seg, doc2_seg)

    return json.dumps(
        {
            "LDA Similarity": distances[0],
            "TWE Similarity": distances[1]
        },
        ensure_ascii=False)
def lda_infer():
    """Run LDA inference on the request document and return the topic
    distribution via ``json_format``."""
    category = request.form['category']
    in_type = request.form['type']
    doc_text = input_doc_str(in_type)
    engine = InferenceEngineWrapper(get_model_dir(category), get_lda_conf())
    tokens = engine.tokenize(doc_text)
    return json_format(engine.lda_infer(tokens))
def doc_keywords():
    """Score the request's keyword(s) against the request document and
    return the similarity items via ``json_format``."""
    category = request.form['category']
    word = request.form['word'].encode('utf-8').strip()
    in_type = request.form['type']
    doc_text = input_doc_str(in_type)
    engine = InferenceEngineWrapper(get_model_dir(category), get_lda_conf())
    # The engine expects the document as one whitespace-joined token string.
    joined_doc = ' '.join(engine.tokenize(doc_text))
    return json_format(engine.cal_keywords_similarity(word, joined_doc))
def slda_infer():
    """Run SentenceLDA inference on the request document and return the
    topic distribution via ``json_format``."""
    category = request.form['category']
    in_type = request.form['type']
    doc_text = input_doc_str(in_type)
    engine = InferenceEngineWrapper(get_model_dir(category), get_slda_conf())
    tokens = engine.tokenize(doc_text)
    # SLDA consumes sentences; chunk the token stream five words at a time.
    sentences = [tokens[i:i + 5] for i in range(0, len(tokens), 5)]
    return json_format(engine.slda_infer(sentences))
def doc_keywords_plus():
    """Extract keywords from the request document with jieba, score each
    against the document, and return the merged results via ``json_format``."""
    category = request.form['category']
    in_type = request.form['type']
    doc_text = input_doc_str(in_type)
    engine = InferenceEngineWrapper(get_model_dir(category), get_lda_conf())
    # The joined token string is loop-invariant, so build it once.
    joined_doc = ' '.join(engine.tokenize(doc_text))
    merged = {}
    for tag, _weight in jieba.analyse.extract_tags(doc_text, withWeight=True):
        scores = engine.cal_keywords_similarity(
            tag.encode('utf-8').strip(), joined_doc)
        merged.update(scores)

    return json_format(merged)
def doc_topic_word_lda():
    """Infer the LDA topic distribution for the request document, then map
    each topic's weight to that topic's nearest words.

    Returns a JSON string: {topic_weight: {word: similarity, ...}, ...}.
    """
    category = request.form['category']
    in_type = request.form['type']
    f_text = input_doc_str(in_type)
    inference_engine_wrapper = InferenceEngineWrapper(get_model_dir(category),
                                                      get_lda_conf())
    seg_list = inference_engine_wrapper.tokenize(f_text)
    topic_dist = inference_engine_wrapper.lda_infer(seg_list)

    # Perf fix: the TWE wrapper is loop-invariant; the original re-created
    # it (reloading the embedding model) once per topic in the loop below.
    twe_wrapper = TopicalWordEmbeddingsWrapper(get_model_dir(category),
                                               get_emb_file(category))
    result = {}
    for key, value in dict(topic_dist).items():
        result[value] = dict(
            twe_wrapper.nearest_words_around_topic(int(key), get_count()))

    return json.dumps(result)
Beispiel #8
0
def recommend_cal(text):
    """Recommend jobs for *text*: shortlist candidates with the TWE engine,
    then re-rank the top 200 with the plain LDA engine and return the top 3."""
    twe_engine = InferenceEngineWrapper(
        '/root/Familia/model/webpage', 'lda.conf', 'webpage_twe_lda.model')
    lda_engine = InferenceEngineWrapper('/root/Familia/model/webpage',
                                        'lda.conf')

    short_tokens = twe_engine.tokenize(text)
    long_tokens = lda_engine.tokenize(text)

    # Coarse pass: short-vs-long similarity yields 200 candidates.
    candidates = short_long_cal(twe_engine, short_tokens)
    # Fine pass: long-vs-long similarity picks the final three.
    return long_long_cal(lda_engine, long_tokens, candidates)
Beispiel #9
0
# found in the LICENSE file.
#
# Author: [email protected]

import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3,0):
    input = raw_input

if __name__ == '__main__':
    # Interactive demo: compute query-document similarity with a TWE-enabled
    # inference engine. Requires model_dir, conf_file and emb_file arguments.
    if len(sys.argv) < 4:
        sys.stderr.write("Usage:python {} {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file", "emb_file"))
        exit(-1)

    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    emb_file = sys.argv[3]
    # Create the InferenceEngineWrapper object
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file, emb_file)
    while True:
        # Read a short query text and a long document text
        query = input("Enter Query: ").strip()
        doc = input("Enter Document: ").strip()
        # NOTE(review): raw strings are passed here, while a sibling example
        # tokenizes both texts first — confirm the wrapper accepts raw text.
        distances = inference_engine_wrapper.cal_query_doc_similarity(query, doc)
        # Print results
        print("LDA Similarity = {}".format(distances[0]))
        print("TWE similarity = {}".format(distances[1]))
Beispiel #10
0
import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    # Interactive demo: tokenize a query and a document, then report their
    # LDA and TWE similarity scores.
    if len(sys.argv) < 4:
        sys.stderr.write("Usage:python {} {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file", "emb_file"))
        exit(-1)

    # Unpack command-line arguments.
    model_dir, conf_file, emb_file = sys.argv[1], sys.argv[2], sys.argv[3]
    # One TWE-enabled engine instance serves the whole session.
    engine = InferenceEngineWrapper(model_dir, conf_file, emb_file)
    while True:
        # Read and tokenize a short query and a long document.
        query_tokens = engine.tokenize(input("Enter Query: ").strip())
        doc_tokens = engine.tokenize(input("Enter Document: ").strip())
        sims = engine.cal_query_doc_similarity(query_tokens, doc_tokens)
        # Report both similarity measures.
        print("LDA Similarity = {}".format(sims[0]))
        print("TWE similarity = {}".format(sims[1]))
Beispiel #11
0
    result.append(ent)
    return result


if __name__ == '__main__':
    # Batch job: infer an LDA topic for every document in the validation CSV
    # and write <key, topic, weight> rows to a sibling *_topic.csv file.
    path = '/media/iiip/数据/duanduan/data/validation.csv'
    documents = read_whole_file(path)
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)
    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    # Create the InferenceEngineWrapper object
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    topic_result = {}
    for key in documents:
        # Bug fix: `print key` is Python 2-only syntax; the call form is
        # valid on both Python 2 and 3.
        print(key)
        seg_list = inference_engine_wrapper.tokenize(documents[key])
        # Run LDA inference
        topic_dist = inference_engine_wrapper.lda_infer(seg_list)
        topic_result[key] = cal_topic(topic_dist)
    # `with` guarantees the output file is closed even on error; also avoids
    # shadowing the `file` builtin.
    with open(path.replace(".csv", "_topic.csv"), 'w') as out_file:
        writer = csv.writer(out_file)
        for each in topic_result:
            writer.writerow([each, topic_result[each][0], topic_result[each][1]])

Beispiel #12
0
# Sanic application serving the Familia topic-modeling API, with Swagger docs.
app = Sanic("Familia", strict_slashes=True)
app.blueprint(swagger_blueprint)
app.config.API_TITLE = 'Familia API'
app.config.API_DESCRIPTION = 'A Toolkit for Industrial Topic Modeling'
app.config.API_PRODUCES_CONTENT_TYPES = ['application/json']

# NOTE: "\b" in a non-raw string is the literal backspace character (\x08),
# so this matches runs of backspaces — consistent with the name, but confirm
# a regex word boundary (r"\b") was not intended.
RE_BACKSPACES = re.compile("\b+")

# Model selection and worker count come from the environment, with defaults.
model_name = os.environ.get("MODEL_NAME", 'news').lower()
n_workers = int(os.environ.get('WORKERS', multiprocessing.cpu_count()))

model_dir = f"/familia/model/{model_name}"
emb_file = f"{model_name}_twe_lda.model"

# Engines are created once at import time and shared by all request handlers.
inference_engine_lda = InferenceEngineWrapper(model_dir, 'lda.conf', emb_file)
inference_engine_slda = InferenceEngineWrapper(model_dir, 'slda.conf')
twe = TopicalWordEmbeddingsWrapper(model_dir, emb_file)


def read_topic_words_from_file(topic_words_file_name='topic_words.lda.txt'):
    logger.info(f"reading topic_words from file: {topic_words_file_name}")
    topic_words = defaultdict(list)
    file_path = os.path.join(model_dir, topic_words_file_name)
    if not os.path.exists(file_path):
        logger.warn(f"topic_words file not found: {file_path}")
        return topic_words
    with open(file_path, 'r') as f:
        line = f.readline()
        while line:
            pos = line.find('=')
Beispiel #13
0
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# Author: [email protected]

import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3,0):
    input = raw_input

if __name__ == '__main__':
    # Interactive demo: compute distance measures between two documents.
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    # Create the InferenceEngineWrapper object
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        # Read two long documents
        doc1 = input("Enter Document1: ").strip()
        doc2 = input("Enter Document2: ").strip()
        # NOTE(review): raw strings are passed here, while a sibling example
        # tokenizes both documents first — confirm the wrapper accepts raw text.
        distances = inference_engine_wrapper.cal_doc_distance(doc1, doc2)
        # Print results
        print("Jensen-Shannon Divergence = {}".format(distances[0]))
        print("Hellinger Distance = {}".format(distances[1]))
Beispiel #14
0
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# Author: [email protected]

import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3,0):
    input = raw_input

if __name__ == '__main__':
    # Interactive demo: print the LDA topic distribution of a document.
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)
    # Unpack command-line arguments.
    model_dir, conf_file = sys.argv[1], sys.argv[2]
    # One engine instance serves the whole interactive session.
    engine = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        raw_doc = input("Enter Document: ")
        # Tokenize, then infer the topic distribution.
        tokens = engine.tokenize(raw_doc)
        distribution = engine.lda_infer(tokens)
        # Report the result.
        print("Document Topic Distribution:")
        print(distribution)
Beispiel #15
0
# found in the LICENSE file.

import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    # Interactive demo: score keywords against a tokenized document.
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    # Create the InferenceEngineWrapper object
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        # Read keywords and a document from stdin
        words = input("Enter Keywords: ").strip()
        doc = input("Enter Document: ").strip()
        # The engine expects the document as one whitespace-joined token string.
        seg_list = inference_engine_wrapper.tokenize(doc)
        items = inference_engine_wrapper.cal_keywords_similarity(
            words, ' '.join(seg_list))
        # Print results: one "keyword<TAB>score" line per item
        print('----------------------------')
        for item in items:
            print(item[0] + '\t' + str(item[1]))
Beispiel #16
0
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3,0):
    input = raw_input

if __name__ == '__main__':
    # Interactive demo: tokenize two documents and report their distance
    # measures (Jensen-Shannon divergence and Hellinger distance).
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Unpack command-line arguments.
    model_dir, conf_file = sys.argv[1], sys.argv[2]
    # One engine instance serves the whole interactive session.
    engine = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        # Read and tokenize two long documents.
        seg_a = engine.tokenize(input("Enter Document1: ").strip())
        seg_b = engine.tokenize(input("Enter Document2: ").strip())
        metrics = engine.cal_doc_distance(seg_a, seg_b)
        # Report both distance measures.
        print("Jensen-Shannon Divergence = {}".format(metrics[0]))
        print("Hellinger Distance = {}".format(metrics[1]))
Beispiel #17
0
# Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    # Interactive demo: score keywords against a raw document.
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)

    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    # Create the InferenceEngineWrapper object
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        # Read keywords and a document from stdin
        words = input("Enter Keywords: ").strip()
        doc = input("Enter Document: ").strip()
        items = inference_engine_wrapper.cal_keywords_similarity(words, doc)
        # Print results: one "keyword<TAB>score" line per item
        print('----------------------------')
        for item in items:
            # Bug fix: the original used a Python 2-only print statement
            # here (a SyntaxError on Python 3); the call form works on both
            # and matches the print() call above.
            print(item[0] + '\t' + str(item[1]))
Beispiel #18
0
import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3,0):
    input = raw_input

if __name__ == '__main__':
    # Interactive demo: print the SentenceLDA topic distribution of a document.
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)
    # Unpack command-line arguments.
    model_dir, conf_file = sys.argv[1], sys.argv[2]
    # One SLDA engine instance serves the whole interactive session.
    engine = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        text = input("Enter Document: ")
        # Tokenize the document.
        tokens = engine.tokenize(text.strip())
        # SLDA infers over sentences, so group the tokens into
        # pseudo-sentences of five words each.
        sentences = [tokens[i:i + 5] for i in range(0, len(tokens), 5)]
        topic_dist = engine.slda_infer(sentences)
        # Report the result.
        print("Document Topic Distribution:")
        print(topic_dist)
Beispiel #19
0
# Author: [email protected]

import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    # Interactive demo: compute query-document similarity with a TWE-enabled
    # inference engine. Requires model_dir, conf_file and emb_file arguments.
    if len(sys.argv) < 4:
        sys.stderr.write("Usage:python {} {} {} {}.\n".format(
            sys.argv[0], "model_dir", "conf_file", "emb_file"))
        exit(-1)

    # Get command-line arguments
    model_dir = sys.argv[1]
    conf_file = sys.argv[2]
    emb_file = sys.argv[3]
    # Create the InferenceEngineWrapper object
    inference_engine_wrapper = InferenceEngineWrapper(model_dir, conf_file,
                                                      emb_file)
    while True:
        # Read a short query text and a long document text
        query = input("Enter Query: ").strip()
        doc = input("Enter Document: ").strip()
        # NOTE(review): raw strings are passed here, while a sibling example
        # tokenizes both texts first — confirm the wrapper accepts raw text.
        distances = inference_engine_wrapper.cal_query_doc_similarity(
            query, doc)
        # Print results
        print("LDA Similarity = {}".format(distances[0]))
        print("TWE similarity = {}".format(distances[1]))
Beispiel #20
0
import sys
from familia_wrapper import InferenceEngineWrapper

if sys.version_info < (3, 0):
    input = raw_input

if __name__ == '__main__':
    # Interactive demo: print the SentenceLDA topic distribution of a document.
    if len(sys.argv) < 3:
        sys.stderr.write("Usage:python {} {} {}\n".format(
            sys.argv[0], "model_dir", "conf_file"))
        exit(-1)
    # Unpack command-line arguments.
    model_dir, conf_file = sys.argv[1], sys.argv[2]
    # One SLDA engine instance serves the whole interactive session.
    engine = InferenceEngineWrapper(model_dir, conf_file)
    while True:
        text = input("Enter Document: ")
        # Tokenize the document.
        tokens = engine.tokenize(text.strip())
        # SLDA infers over sentences, so group the tokens into
        # pseudo-sentences of five words each.
        sentences = [tokens[i:i + 5] for i in range(0, len(tokens), 5)]
        topic_dist = engine.slda_infer(sentences)
        # Report the result.
        print("Document Topic Distribution:")
        print(topic_dist)