Example #1
import jieba
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity


def getSparseMatrixSimilarity(keyword, texts):

    # 1. Tokenize each document in the text collection into a word list
    texts = [jieba.lcut(text) for text in texts]

    # 2. Build a dictionary from the texts and get the number of features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)

    # 3.1 Using the dictionary, convert the token lists into sparse vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # 3.2 Likewise, convert the search keyword into a sparse vector
    kw_vector = dictionary.doc2bow(jieba.lcut(keyword))

    # 4. Create a TF-IDF model, trained on the corpus
    tfidf = TfidfModel(corpus)
    # 5. Run the retrieval texts and the keyword through the trained model
    tf_texts = tfidf[corpus]  # here the corpus itself is the retrieval text set
    tf_kw = tfidf[kw_vector]
    # 6. Similarity computation
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    for e, s in enumerate(similarities, 1):
        print('Similarity between kw and text%d: %.2f' % (e, s))

    print(sparse_matrix)
    print(similarities)
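A minimal driver for the function above, in case you want to run it: the sample texts are borrowed from the commented-out snippet in Example #5 and are purely illustrative, and jieba plus gensim must be installed. It assumes getSparseMatrixSimilarity from Example #1 is defined in the same module.

texts = ['吃鸡这里所谓的吃鸡并不是真的吃鸡,也不是谐音词刺激的意思',
         '而是出自策略射击游戏《绝地求生:大逃杀》里的台词',
         '我吃鸡翅,你吃鸡腿']
keyword = '吃鸡'
getSparseMatrixSimilarity(keyword, texts)  # prints one similarity score per text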
Example #2
def samilarRate(texts, keyword):
    # texts: the document collection; keyword: the search term

    # 1. Tokenize each document in the text collection into a word list
    texts = [lcut(text) for text in texts]

    # 2. Build a dictionary from the texts and get the number of features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)

    # 3.1 Using the dictionary, convert the token lists into sparse vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # 3.2 Likewise, convert the search keyword into a sparse vector
    kw_vector = dictionary.doc2bow(lcut(keyword))

    # 4. Create a TF-IDF model, trained on the corpus
    tfidf = TfidfModel(corpus)

    # 5. Run the retrieval texts and the keyword through the trained model
    tf_texts = tfidf[corpus]  # here the corpus itself is the retrieval text set
    tf_kw = tfidf[kw_vector]

    # 6. Similarity computation
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    result = []
    scores = []
    for e, s in enumerate(similarities, 1):
        result.append('Similarity between kw and text%d: %.2f' % (e, s))
        scores.append(s)
    return result, scores
Example #3
def mergeTags():
    res = {}  # create an empty dict (merged tag text -> similarity score)
    for i in range(len(displayArr)):
        texts = default_tags
        keyword = displayArr[i]
        # 1. Tokenize each document in the text collection into a word list
        texts = [lcut(text) for text in texts]
        # 2. Build a dictionary from the texts and get the number of features
        dictionary = Dictionary(texts)
        num_features = len(dictionary.token2id)
        # 3.1 Using the dictionary, convert the token lists into sparse vectors (the corpus)
        corpus = [dictionary.doc2bow(text) for text in texts]
        # 3.2 Likewise, convert the search keyword into a sparse vector
        kw_vector = dictionary.doc2bow(lcut(keyword))
        # 4. Create a TF-IDF model, trained on the corpus
        tfidf = TfidfModel(corpus)
        # 5. Run the retrieval texts and the keyword through the trained model
        tf_texts = tfidf[corpus]  # here the corpus itself is the retrieval text set
        tf_kw = tfidf[kw_vector]
        # 6. Similarity computation
        sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
        similarities = sparse_matrix.get_similarities(tf_kw)
        for e, s in enumerate(similarities, 1):
            if s > 0.5:
                # print(keyword, 'vs', ''.join(texts[e - 1]), 'similarity:', s)
                key = ''.join(texts[e - 1]).strip()
                res[key] = s
        arrSorted = sorted(res.items(), key=lambda item: item[1], reverse=True)
        if arrSorted:  # append the best-matching tag to the original line
            k = arrSorted[0][0]
            ids = textsOld[i].strip().split('.')[0]  # numeric prefix, kept for the variant below
            textsOld[i] = textsOld[i] + '----------' + k
            # textsOld[i] = ids + '.' + k
        res = {}  # reset the dict for the next keyword
    return textsOld
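This variant depends on three module-level names the excerpt never defines: displayArr (the keywords to match), default_tags (the tag texts searched against), and textsOld (the numbered lines being annotated). A hypothetical setup, with made-up values, just to show the shapes involved:

displayArr = ['搜索词一', '搜索词二']        # one keyword per line to annotate
default_tags = ['候选标签甲', '候选标签乙']  # tag texts searched against
textsOld = ['1.原始行一', '2.原始行二']      # numbered originals, same length as displayArr
merged = mergeTags()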
Example #4
def check(news):
    """Check whether the news item is a duplicate."""
    dictionary, corpus, num_features = Similar.dictionary()
    kw_vector = dictionary.doc2bow(lcut(news))
    tfidf = TfidfModel(corpus)
    tf_texts = tfidf[corpus]
    tf_kw = tfidf[kw_vector]
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    for e, s in enumerate(similarities, 1):
        if 0.6 < s < 0.98:  # near-duplicate; a score of ~1.0 would be the item matching itself
            return
    return news
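check() relies on a Similar.dictionary() helper that is not part of this excerpt. Judging only from how its three return values are used, it presumably tokenizes a stored text collection and returns the gensim dictionary, the bag-of-words corpus, and the feature count. The class name comes from the excerpt, but everything inside this sketch (the texts attribute in particular) is an assumption; it reuses the lcut and Dictionary imports the excerpt already presumes.

class Similar:
    texts = []  # hypothetical store of previously seen news strings

    @staticmethod
    def dictionary():
        # Tokenize the stored texts, build the Dictionary, and return it
        # together with the bag-of-words corpus and the feature count.
        tokenized = [lcut(text) for text in Similar.texts]
        dictionary = Dictionary(tokenized)
        corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
        return dictionary, corpus, len(dictionary.token2id)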
Example #5
def mergeTags(textArr):
    res = []
    for i in range(len(displayArr)):
        try:
            exampleArr = list(textArr)  # work on a copy so removals below don't mutate the caller's list
            if i == 0:
                texts = textArr
            else:
                for item in res:
                    if item in exampleArr:
                        exampleArr.remove(item)
                texts = exampleArr
                res = []

            # print(exampleArr)
            # document collection and search term
            # texts = ['吃鸡这里所谓的吃鸡并不是真的吃鸡,也不是谐音词刺激的意思',
            #          '而是出自策略射击游戏《绝地求生:大逃杀》里的台词',
            #          '我吃鸡翅,你吃鸡腿']
            keyword = texts[i]
            # 1. Tokenize each document in the text collection into a word list
            texts = [lcut(text) for text in texts]
            # 2. Build a dictionary from the texts and get the number of features
            dictionary = Dictionary(texts)
            num_features = len(dictionary.token2id)
            # 3.1 Using the dictionary, convert the token lists into sparse vectors (the corpus)
            corpus = [dictionary.doc2bow(text) for text in texts]
            # 3.2 Likewise, convert the search keyword into a sparse vector
            kw_vector = dictionary.doc2bow(lcut(keyword))
            # 4. Create a TF-IDF model, trained on the corpus
            tfidf = TfidfModel(corpus)
            # 5. Run the retrieval texts and the keyword through the trained model
            tf_texts = tfidf[corpus]  # here the corpus itself is the retrieval text set
            tf_kw = tfidf[kw_vector]
            # 6. Similarity computation
            sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
            similarities = sparse_matrix.get_similarities(tf_kw)
            for e, s in enumerate(similarities, 1):
                if s > 0.5:
                    res.append(exampleArr[e - 1])
                    print(keyword + ' vs ' + exampleArr[e - 1] + ' similarity:', s)
            print('---------------------------------------------------')
        except Exception:  # skip entries that fail to tokenize or index
            pass
    print('Merging complete!')
Example #6
def similar(aim):
    aim_text = aim.title + aim.abstract
    simple = [x.title + x.abstract for x in ret[0:-10]]
    # tokenize each document; Dictionary expects a list of per-document token
    # lists, not one flat set of tokens, so keep the word lists separate
    text = [[w.word for w in posseg.lcut(x)] for x in simple]
    dictionary = Dictionary(text)
    length = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(lcut(src)) for src in simple]
    tfidf = TfidfModel(corpus)
    tf_texts = tfidf[corpus]
    sparse_matrix = SparseMatrixSimilarity(tf_texts, length)

    vector = dictionary.doc2bow(lcut(aim_text))
    tf_kw = tfidf[vector]
    similarities = sparse_matrix.get_similarities(tf_kw)

    print(aim.title)
    for e, s in enumerate(similarities, 1):
        if s > 0.1:
            print(s, ret[e - 1].title)
            """建立词典  获得特征数"""
            dictionary = corpora.Dictionary(diff_word_list)
            feature_cnt = len(dictionary.token2id.keys())
            """基于词典  分词列表转稀疏向量集"""
            corpus = [dictionary.doc2bow(codes) for codes in diff_word_list]
            # print("key")
            # print([x for x in word_list if x not in stopwords])
            kw_vector = dictionary.doc2bow([x for x in word_list if x not in stopwords])
            """创建tf-idf模型   传入语料库训练"""
            tfidf = TfidfModel(corpus)
            """训练好的tf-idf模型处理检索文本和搜索词"""
            tf_texts = tfidf[corpus]
            tf_kw = tfidf[kw_vector]
            """相似度计算"""
            sparse_matrix = SparseMatrixSimilarity(tf_texts, feature_cnt)
            similarities = sparse_matrix.get_similarities(tf_kw)
            # print("similarities")
            # print(similarities)
            # for e, s in enumerate(similarities, 1):
            #     print('kw 与 text%d 相似度为:%.2f' % (e, s))
            conceptualSimilarity.append(max(similarities))

        """key word ratio"""
        keywordsInComments = [x for x in word_list if x in languageKeyWords]
        stopKeyRatio.append(keywordsInComments.__len__() / word_list.__len__())


    print(readable)
    print(max(readable), min(readable))

    fig = plt.figure()
Example #8
async def create_file(keyword: str,
                      threshold: float,
                      file: UploadFile = File(...)):
    contents = file.file.read()
    now = time.time()
    with open("./cache_file/" + str(now) + file.filename, "w+") as f:
        f.write(contents.decode("utf-8"))
    with open("./cache_file/" + str(now) + file.filename, "r") as f_read:
        data = f_read.readlines()
    # 1. Tokenize each document in the text collection into a word list
    texts = [lcut(text.strip("\n")) for text in tqdm(data)]
    # 2. Build a dictionary from the texts and get the number of features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)
    # 3.1 Using the dictionary, convert the token lists into sparse vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in tqdm(texts)]
    # 3.2 Likewise, convert the search keyword into a sparse vector
    kw_vector = dictionary.doc2bow(lcut(keyword))
    # 4. Create a TF-IDF model, trained on the corpus
    tfidf = TfidfModel(corpus)
    # 5. Run the retrieval texts and the keyword through the trained model
    tf_texts = tfidf[corpus]  # here the corpus itself is the retrieval text set
    tf_kw = tfidf[kw_vector]
    # 6. Similarity computation
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    # print(similarities)
    new_now = datetime.datetime.now()
    # simple file/directory bookkeeping
    os.makedirs("./static/" + keyword + str(new_now))
    db.insert({"name": keyword + str(new_now), "type": "dir"})
    f = open("./static/" + keyword + str(new_now) + "/result.txt", "w")
    db.insert({
        "name": keyword + str(new_now) + "/result.txt",
        "type": "file",
        "dir": keyword + str(new_now)
    })
    f1 = open("./static/" + keyword + str(new_now) + "/result_er.txt", "w")
    db.insert({
        "name": keyword + str(new_now) + "/result_er.txt",
        "type": "file",
        "dir": keyword + str(new_now)
    })
    #end
    Semantic_list = []

    for e, s in enumerate(similarities, 1):
        su = (e, s)
        Semantic_list.append(su)
        try:
            if s >= threshold:
                f.write(data[e - 1].strip("\n") + str(s) + "\n")
            else:
                f1.write(data[e - 1].strip("\n") + str(s) + "\n")
        except Exception:  # don't shadow the loop index e; skip lines that fail to write
            pass
    f.close()
    f1.close()
    Semantic_list.sort(key=takeSecond, reverse=True)
    rs_list = []
    for item in Semantic_list[0:101]:
        rs_dic = {"msg": data[item[0] - 1], "Similaritydegree": str(item[1])}
        rs_list.append(rs_dic)
    # Semantic_list

    os.remove("./cache_file/" + str(now) + file.filename)
    return {"semantic": rs_list}
Example #9
                with open(file_path, encoding='UTF-8') as text:
                    # read the whole file at once instead of concatenating line by line
                    text_as_string = text.read()
                    text = preprocess(text_as_string)
                    texts.append(text)
        return texts

    texts = get_texts()
    texts = get_ngrams(texts)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    index = SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

    # Not 100% sure how to interpret these results (see the note after the loop)
    for i in range(0, len(corpus)):
        print(index.get_similarities(corpus[i]))
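One likely source of that confusion: the index is built from TF-IDF-weighted vectors, but the loop queries it with raw bag-of-words vectors, so the scores mix two weighting schemes. A sketch of the arguably more consistent query, reusing the variables above, would be:

    for i in range(len(corpus)):
        # weight the query the same way the index was built
        print(index.get_similarities(tfidf[corpus[i]]))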
