Example #1
import codecs
import re

import jieba.posseg as psg
from cilin import CilinSimilarity

# `mydir` (data directory holding stopword.txt) is assumed to be defined elsewhere in the module.
def segment_word(data_list):
    result_list = {}
    stopwords = {}.fromkeys([
        line_content.strip()
        for line_content in codecs.open(mydir + 'stopword.txt')
    ])  # stop-word table
    # regex pattern that matches Chinese characters
    zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
    cl = CilinSimilarity()
    for li in data_list:
        # seglist = jieba.cut(data_list[li], cut_all=False)  # accurate mode
        seglist = psg.cut(data_list[li])
        output = ''
        for segs, flag in seglist:
            segs = segs.rstrip('\n')
            segs = segs.strip()
            seg = segs.lower()  # lowercase any English letters
            if seg == 'wyy':
                # expand the abbreviation 'wyy' to the full name
                output += '王远阳'
                output += ' '
            elif (seg not in stopwords) and flag.startswith(
                    ('a', 'f', 'j', 'l', 'm', 'n', 'q', 't', 'v')):
                # drop stop words and keep only the listed POS tags
                # if seg not in stopwords:  # drop stop words only
                if (not re.search('^[a-zA-Z-0-9]+$', seg) and len(seg) > 1
                        and zhPattern.search(seg)):  # drop single-character and pure ASCII tokens
                    output += seg
                    output += ' '
        # print output
        result_list["%s" % li] = output
    return result_list
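
A minimal usage sketch for segment_word (hypothetical input text; assumes the module context noted above and a stopword.txt under mydir):

if __name__ == '__main__':
    docs = {'post_1': '我们在北京测试中文分词和停用词过滤的效果'}
    segmented = segment_word(docs)
    print(segmented['post_1'])  # space-joined tokens that survived the stop-word and POS filters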
Example #2
# Assumed module-level context (as in Example #1) plus: pymysql, the project `setting`
# module, and TextRank4Sentence from the textrank4zh package.
def get_database_list(sql, bbses):
    """Fetch text data from the forum databases.

    Runs the given SQL statement against each forum database and keeps the
    TextRank key sentence of every record.

    Args:
        sql: SQL statement to execute.
        bbses: forum-name suffixes; each maps to a database named
            `<BBS_DBNAME>_<bbs>`.

    Returns:
        tuple of dict: the key sentence per record and the segmented,
        filtered words per record, both keyed by `<bbs>_<row[0]>_<row[1]>`.
    """
    data_list = {}
    tr4s = TextRank4Sentence()
    # truncate the aid_bid result file before per-forum results are appended
    f2 = open(mydir + 'result_file/aid_bid', 'w', encoding='UTF-8')
    f2.close()
    for bbs in bbses:
        database = setting.BBS_DBNAME + '_' + bbs
        connect = pymysql.connect(host=setting.BBS_HOST, db=database, user=setting.BBS_USER,
                                  passwd=setting.BBS_PASSWD, charset='utf8', use_unicode=True)
        cursor = connect.cursor()
        cursor.execute(sql)
        k = cursor.fetchall()
        f2 = open(mydir + 'result_file/aid_bid', 'a', encoding='UTF-8')
        dir = ''
        for li in k:
            dir = bbs + "_" + li[0] + "_" + li[1]
            f2.write(dir + "\n")
            tr4s.analyze(text=(li[5].strip('\n') + " " + li[6].strip('\n')), lower=True, source='all_filters')
            output = ''
            for item in tr4s.get_key_sentences(num=1):
                output += item.sentence
                output += ' '
            data_list["%s" % dir] = output
            # save the raw text before word segmentation
            f3 = open(mydir + 'texts/' + "%s" % dir, 'w', encoding='UTF-8')
            f3.write(li[5].strip('\n') + "\n" + li[6].strip('\n'))
            f3.close()
        connect.commit()
        connect.close()
        f2.close()
    result_list = {}
    stopwords = {}.fromkeys([line_content.strip() for line_content in codecs.open(mydir + 'stopword.txt')])  # stop-word table
    # regex pattern that matches Chinese characters
    zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
    cl = CilinSimilarity()
    for li in data_list:
        # seglist = jieba.cut(data_list[li], cut_all=False)  # 精确模式
        seglist = psg.cut(data_list[li])
        output = ''
        for segs, flag in seglist:
            segs = segs.rstrip('\n')
            segs = segs.strip()
            seg = segs.lower()  # 英文字母小写
            if (seg not in stopwords) and (flag.startswith(('a', 'f', 'j', 'l', 'm', 'n', 'q', 't', 'v'))):  # drop stop words and keep only the listed POS tags
            # if seg not in stopwords:  # drop stop words only
                if not re.search('^[a-zA-Z-0-9]+$', seg) and len(seg) > 1 and zhPattern.search(seg):  # drop single-character and pure ASCII tokens
                    output += seg
                    output += ' '
        # print output
        result_list["%s" % li] = output
    return data_list, result_list
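
A hypothetical call sketch (the real SQL and forum names depend on the project's `setting` module; the query must return rows whose indexes 0 and 1 form the record key and whose indexes 5 and 6 hold the text fields used above):

# raw_texts, segmented = get_database_list(
#     "SELECT ...",        # placeholder query returning at least 7 columns per row
#     ['bbs_a', 'bbs_b'])  # hypothetical forum-name suffixes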
Example #3
from queue import Queue, LifoQueue
from baidu_link_evidence import get_baike_page_citiao
from cilin import CilinSimilarity
import collections
import json
import re
import copy
import Levenshtein  # string edit distance
import synonyms
import os
import sys
import jieba
import xiaoqi

sys.setrecursionlimit(999)  # cap the recursion depth at 999
cs = CilinSimilarity()


# Align/merge location info; the arguments are the two address strings (from 360 and Baidu).
def merge_local(fang_360, fang_baidu):
    location = []
    res_list = []  # the return value is a list; its first number indicates the status of the result
    # handle the empty cases first
    if fang_360 != '' and fang_baidu == '':
        res_list = [0, fang_360]
    elif fang_baidu != '' and fang_360 == '':
        res_list = [1, fang_baidu]
    elif fang_360 == '' and fang_baidu == '':
        res_list = [4]
    else:
        if fang_360 == fang_baidu:
Example #4
import os
import pickle

from cilin import CilinSimilarity

def loadCilin():
    # `cilin_path` (cache file location) is assumed to be defined elsewhere.
    if os.path.exists(cilin_path):
        return pickle.load(open(cilin_path, "rb"))  # pickle files need binary mode
    cs = CilinSimilarity()
    pickle.dump(cs, open(cilin_path, "wb"))
    return cs
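
A minimal usage sketch: build the CilinSimilarity object once and reuse the pickled copy on later runs (assumes cilin_path points to a writable cache file; sim2016 is the method used in Example #7):

if __name__ == '__main__':
    cs = loadCilin()
    print(cs.sim2016('起重机', '器械'))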
Example #5
    count = execute_sql(count_sql)
    count = int(round(len(count) * 10))
    print(origin_texts)
    print("预处理完毕!")
    data_list = []
    temp_list = []

    # build the set of all words
    all_words = set()
    for key in texts:
        for word in texts[key].split():
            all_words.add(word)

    print(all_words)
    # de-duplicate and expand with synonyms
    cl = CilinSimilarity()
    my_output = ''
    for key in texts:
        temp_list.clear()
        my_output = ''
        for word in texts[key].split():
            if word not in temp_list:
                temp_list.append(word)
                my_output += word
                my_output += ' '
                try:
                    codes = cl.get_code(word)
                    for eachcode in codes:
                        if eachcode.endswith('='):
                            for eachword in cl.code_word.get(eachcode):
                                if len(eachword) > 1 and eachword in all_words and not eachword in temp_list:
Example #6
import os
import jieba
import jieba.analyse
import jieba.posseg as pseg
from cilin import CilinSimilarity
from openpyxl import workbook

f = open('./static/词组.txt')
word_list = []

wb = workbook.Workbook()
ws = wb.active
ws.append(['单词'])  # header row ('单词' = word)

for each_line in f:
    #print(each_line)
    each_line = each_line.replace('/', '').replace('、', '').replace(
        '(', '').replace(')', '').split('\n')  # strip the trailing newline from each line
    cut_words = jieba.lcut(each_line[0],
                           cut_all=False)  # .cut() returns a generator; .lcut() returns a list.
    #print(cut_words)
tongyici = CilinSimilarity()  # instantiate the class from cilin
#tongyi_words = tongyici.return_tongyici(cut_words[0])
#print(tongyi_words)
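
The snippet above stops before writing any rows or saving the workbook. A hedged continuation sketch (note that after the reading loop cut_words only holds the tokens of the last line; the output path is a placeholder):

for w in cut_words:
    ws.append([w])
wb.save('./static/词组_segmented.xlsx')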
Example #7
# -*- coding: utf-8 -*-
'''
@author: [email protected]
@license: (C) Copyright 2017
@desc:    Combine the Cilin and HowNet similarity measures to obtain
          similarity values that better match human intuition.
@DateTime: Created on 2017/12/28, at 18:28 by PyCharm
'''

from howNet import WordSimilarity
from cilin import CilinSimilarity




if __name__ == '__main__':
    cs = CilinSimilarity()  # compute with Cilin first
    w1, w2 = '起重机', '器械'
    print(w1, w2)
    print('2016 revised Cilin similarity:', cs.sim2016(w1, w2))
    # To check whether a word is covered by Cilin, just test membership in cs.vocab.
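    # For example, a minimal coverage check via cs.vocab:
    if w1 in cs.vocab and w2 in cs.vocab:
        print(w1, 'and', w2, 'are both covered by Cilin')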

    obj = WordSimilarity()  # instantiate a similarity calculator (HowNet)
    ci_sim = obj.calc(w1, w2)
    print(ci_sim)