コード例 #1
0
ファイル: bm25util.py プロジェクト: yang0/textutils
def buildModel(jsonFile, fieldNames, query_str):
    """Build a BM25 model over ``jsonFile`` and print the best matches.

    The function tokenises the requested fields of every record, builds and
    saves a gensim dictionary to ``DICTIONARY_PATH``, converts the records to
    bag-of-words vectors, scores ``query_str`` against them with BM25 and
    prints the first field of the (at most) five highest-scoring records.

    :param jsonFile: path of the input file; presumably one JSON document per
        line, since each loaded line is parsed with ``json.loads``.
    :param fieldNames: field names whose text is indexed; the first one is
        the field printed for every hit.
    :param query_str: raw query text; it is segmented before scoring.
    :return: None; results are written to stdout.
    """
    # The tokenised-record iterable cannot be consumed twice, so create one
    # instance for each pass over the data.
    t1 = jsonutil.iterCutFieldList(jsonFile, fieldNames)
    t2 = jsonutil.iterCutFieldList(jsonFile, fieldNames)

    # Build the word -> id dictionary and persist it.
    dictionary = corpora.Dictionary(t1)
    dictionary.save(DICTIONARY_PATH)

    # Bag-of-words model: represent every tokenised text by word ids.
    corpus = [dictionary.doc2bow(text) for text in t2]
    print("词袋: %i " % len(corpus))

    bm25Model = bm25.BM25(corpus)

    idf = bm25Model.idf
    average_idf = sum(float(value) for value in idf.values()) / len(idf)

    query = jiebautil.cutWords(query_str).split()
    query_bow = dictionary.doc2bow(query)

    scores = bm25Model.get_scores(query_bow, average_idf)

    # Rank the document indices once instead of repeatedly deleting the
    # maximum from ``scores``: deleting shifts every later index, so the
    # second and following hits used to point at the wrong file lines, and
    # ``max`` raised on an empty list when fewer than five records existed.
    # The sort is stable, so equal scores keep ascending index order.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    lineRead = LineReader(jsonFile)
    for docIndex in ranked[:5]:
        # Score index i belongs to line i + 1 of the file (lines are 1-based).
        s = lineRead.load(docIndex + 1)
        j = json.loads(s)
        print(jsonutil.recursive_get(j, fieldNames[0]))
コード例 #2
0
def search(words):
    """
    Print every record whose segmented text contains all given keywords.

    For each matching record the 1-based position of the record is printed,
    followed by each of its values with all whitespace removed, followed by
    a blank separator.

    :param words: iterable of keywords that must all occur among the tokens
    :return: None; matches are written to stdout
    """
    records = jsonutil.iterJsonValue(JSON_FILE, [QUESTION_FIELD, ANSWER_FIELD])
    for position, record in enumerate(records, 1):
        joined = " ".join(record.values())
        tokens = jiebautil.cutWords(joined).split()
        if not all(word in tokens for word in words):
            continue
        print(position)
        for key in record:
            # Strip every whitespace character from the stored value.
            print("".join(record[key].split()))
        print("\n")
コード例 #3
0
    def getValley(self, str):
        """Analyse one JSON record and emit the result via ``valleyRecieved``.

        The record is parsed, its question and answer fields are looked up,
        the question is segmented and run through the two tag extractors,
        and the results are emitted together with the reader's current line.
        Finally the run point is saved.

        :param str: JSON text of a single record. The name shadows the
            builtin but is kept so existing callers are unaffected.
        :return: None
        """
        print(str)
        data = json.loads(str)

        question = jsonutil.recursive_get(data, self.questionField)
        answer = jsonutil.recursive_get(data, self.answerField)

        cutStr = cutWords(question)

        # First pass extracts tags from the raw question; the second pass
        # feeds those tags to the concepts extractor, which returns the
        # refined tags plus a string describing the concepts used.
        reservedWords = self.valleyDict.extractTags(question)
        reservedWords, conceptsUsedStr = self.concepts.extractTags(
            reservedWords)

        self.valleyRecieved.emit(question + answer, cutStr,
                                 " ".join(reservedWords), conceptsUsedStr,
                                 self.lineReader.currentLine)

        self.saveRunPoint()
コード例 #4
0
ファイル: tfidfutil.py プロジェクト: yang0/textutils
    def extract_keywords(self, sentence, topK=5):
        """Return the tokens of ``sentence`` ranked by TF-IDF weight.

        :param sentence: text to segment and score
        :param topK: number of keywords to return; a falsy value returns
            the complete ranking
        :return: list of tokens, highest weight first
        """
        # Term counts, kept as floats, in first-occurrence order.
        counts = {}
        for token in jiebautil.cutWords(sentence).split():
            counts[token] = counts.get(token, 0.0) + 1.0
        counts.pop('', None)

        n_tokens = sum(counts.values())

        # TF-IDF: count * (idf / total); tokens without a known IDF fall
        # back to the mean IDF.
        weights = {
            token: count * (self.idf_freq.get(token, self.mean_idf) / n_tokens)
            for token, count in counts.items()
        }

        ranked = sorted(weights, key=lambda token: weights[token],
                        reverse=True)
        return ranked[:topK] if topK else ranked
コード例 #5
0
ファイル: simutil.py プロジェクト: yang0/textutils
def querySimString(jsonFile, fieldName, sentence):
    """
    Print the five records most similar to ``sentence``.

    The saved dictionary, LSI model and similarity index are loaded from
    their module-level paths, the sentence is projected into LSI space and
    the best-matching lines of ``jsonFile`` are printed.

    :param jsonFile: file whose lines correspond to the indexed documents
    :param fieldName: field printed for every matching record
    :param sentence: query text
    :return: None; results are written to stdout
    """
    dictionary = corpora.Dictionary.load(DICTIONARY_PATH)
    lsi = models.LsiModel.load(LSI_MODEL, mmap='r')

    # Segment the query and map its words to dictionary ids.
    tokens = jiebautil.cutWords(sentence).split()
    query_lsi = lsi[dictionary.doc2bow(tokens)]

    index = similarities.MatrixSimilarity.load(INDEX_PATH)
    # Pair every similarity with its document position, best match first.
    ranked = sorted(enumerate(index[query_lsi]), key=lambda pair: -pair[1])

    for hit in ranked[:5]:
        reader = LineReader(jsonFile)
        # Document positions are 0-based, file lines are 1-based.
        record = json.loads(reader.load(hit[0] + 1))
        print(jsonutil.recursive_get(record, fieldName))
コード例 #6
0
def cutWords(sentence):
    """Segment ``sentence`` with ``jiebautil.cutWords`` and print the result."""
    segmented = jiebautil.cutWords(sentence)
    print(segmented)