Example #1
    def __init__(self,
                 paramAlpha=0.1,
                 paramBeta=0.8,
                 paramGamma=0.1,
                 linkBack=False):
        '''Initialize the attributes of the parent class'''
        super().__init__()
        # Influence factor for word coverage
        self.paramAlpha = paramAlpha
        # Influence factor for word position
        self.paramBeta = paramBeta
        # Influence factor for word frequency
        self.paramGamma = paramGamma

        ## Whether later words add links pointing back to earlier words (WordGraph, inherited)
        self.linkBack = linkBack

        # Word distribution
        self._wordLocal = {
            'q': 1.06,
            'z': 1.04,
            'h': 1.05,
            'qz': 1.15,
            'zh': 1.10,
            'qzh': 1.60
        }

        self._logger = get_logger("MyWPWordGraph")
        # Logging is disabled by default
        self._logger.disabled = True
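The constructor above only stores the three influence factors; the scoring formula that uses them is not part of this snippet. As a heavily hedged illustration only (an assumption, not MyWPWordGraph's actual rule), factors like these are commonly combined as a weighted sum of normalised per-word sub-scores:

def combined_word_score(coverage, position, frequency,
                        paramAlpha=0.1, paramBeta=0.8, paramGamma=0.1):
    """Hypothetical weighted combination of per-word sub-scores.

    coverage, position and frequency are assumed to be normalised to [0, 1];
    this only illustrates how alpha/beta/gamma style factors are typically
    used, not the formula implemented by MyWPWordGraph.
    """
    return paramAlpha * coverage + paramBeta * position + paramGamma * frequency

print(combined_word_score(0.5, 0.9, 0.2))  # 0.1*0.5 + 0.8*0.9 + 0.1*0.2 = 0.79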
def addUserWordDict(pathList=[]):
    logger = log.get_logger("fenchi")
    for path in pathList:
        if os.path.isfile(path):
            jieba.load_userdict(path)
            logger.info("加载用户词典:%s" % (path))
    logger.info("加载用户词典结束:%s" % (pathList))
def fenchiToFile(src_file, output_file):
    logger = log.get_logger("fenchi")
    if not os.path.exists(src_file):
        logger.info("源文件不存在:%s" % (src_file))
        exit(0)
    if not os.path.exists(os.path.split(output_file)[0]):
        logger.info("Output directory does not exist")
        exit(0)

    f = codecs.open(src_file, 'r', encoding="utf-8")
    target = codecs.open(output_file, 'w', encoding="utf-8")
    logger.info('open files %s to %s' % (src_file, output_file))
    line_num = 1
    line = f.readline()
    while line:
        # print(line)
        if line_num % 10000 == 0:
            logger.info('---- processing %d article----------------' %
                        (line_num))
        line_seg = ' '.join(jieba.cut(line, cut_all=True))  # full mode
        target.writelines(line_seg)
        line_num = line_num + 1
        line = f.readline()
    f.close()
    target.close()
    logger.info('---- processed %d lines in total ----------------' %
                (line_num - 1))
    logger.info("分词完成")
    def __init__(self):
        # Whether later words add links pointing back to earlier words
        self.linkBack = True
        # If the number of words read exceeds this value, stop processing the rest to avoid overly long texts slowing the computation down (currently has no effect)
        self.maxReadableWordCount = 500000
        # Number of words read
        self.readWordCount = 0
        self.__wordNodeMap = {}

        self._logger = get_logger(' ')
        pass
Example #5
    def __init__(self):
        self.alpha_f = 0.1
        self.beta_f = 0.9
        self.gamma_f = 0.0
        self.lambda_f = 30.0
        self.maxReadWordCount_int = 2000
        self.mergeNeighbor_bool = False

        self.graphType = GraphType.TF_IDF
        self.logger = get_logger('TR_E')

        self.logger.info("使用 %s 提取包" % (self.graphType.name))
    def __init__(self, alpha=0.33, beta=0.34, gamma=0.33, maxK=5, linkBack=False):
        super().__init__()
        # Influence factor for word coverage
        self.paramAlpha = alpha

        # Influence factor for word position
        self.paramBeta = beta

        # Influence factor for word frequency
        self.paramGamma = gamma


        self.linkBack = linkBack
        self.maxK = maxK

        self._logger = get_logger("w2v")
Example #7
def addStopKeywords(pathList):
    """文本,参数是一个list []"""
    logger = log.get_logger("StopKeyword")

    global __global_stopKeywordsDict
    """添加换行,停用词,文本中不好配置,加在这里"""
    __global_stopKeywordsDict.append('\n')
    __global_stopKeywordsDict.append(' ')
    for path in pathList:
        if os.path.isfile(path):
            f = codecs.open(path, 'r', encoding='utf8')
            text = f.read()
            __global_stopKeywordsDict = __global_stopKeywordsDict + list(
                text.split())
            # 去重
            __global_stopKeywordsDict = list(set(__global_stopKeywordsDict))
            f.close()
            logger.info("加载用户停用词:%s" % (path))
        else:
            logger.info("加载用户停用词失败:%s" % (path))
    logger.info("加载用户停用词完成:%s" % (pathList))
Example #8
def w2v_tarin(inFile, outFile, size=50, window=5):
    logger = log.get_logger("wordVector")
    logger.info("running ")

    inp = inFile
    outp1 = outFile + "_pBin_size" + str(size) + "_win" + str(window)
    outp2 = outFile + "_cVec_size" + str(size) + "_win" + str(window)
    outp3 = outFile + "_cBin_size" + str(size) + "_win" + str(window)

    model = Word2Vec(LineSentence(inp),
                     size=size,
                     window=window,
                     min_count=5,
                     workers=multiprocessing.cpu_count())

    # trim unneeded model memory = use(much) less RAM
    # model.init_sims(replace=True)
    '''Save in binary format'''
    model.save(outp1)
    '''Save in C text format, one word vector per line'''
    model.wv.save_word2vec_format(outp2, binary=False)
    '''Save in C binary format'''
    model.wv.save_word2vec_format(outp3, binary=True)
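w2v_tarin uses the gensim < 4.0 signature, where the vector dimensionality is passed as size; in gensim >= 4.0 the same parameter is named vector_size. A minimal usage sketch with placeholder paths, assuming w2v_tarin and its imports are in scope, loading the C text-format vectors the function writes:

from gensim.models import KeyedVectors

# Train on a pre-segmented corpus (one sentence per line); paths are placeholders.
w2v_tarin("data/corpus_seg.txt", "data/w2v/model", size=50, window=5)

# The "_cVec_..." suffix mirrors the naming scheme used inside w2v_tarin above.
wv = KeyedVectors.load_word2vec_format("data/w2v/model_cVec_size50_win5", binary=False)
print(wv.most_similar("关键词", topn=5))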
Example #9
def mode手动():
    logger = log.get_logger("mode自动")

    '''Other parameters self.alpha_f, self.beta_f, self.gamma_f need to be hard-coded first'''
    extractor = KeyExtractor.TextRankExtractor()
    evaRes = EvalResult.EvalResult()



    while True:
        print("输入测试文件(默认data/test.txt)")
        print("模型选择:", list(KeyExtractor.GraphType))
        print("提取关键词个数:(默认为5个)\n输出关键词信息[Y/N](默认为N)")
        filePath = input("input file(输入end时结束程序):").strip()
        if filePath == "end":
            break
        if len(filePath)<=0:
            filePath = 'data/test.txt'
        if not os.path.exists(filePath):
            logger.info("文件[%s]不存在,重新输入"%(filePath))
            continue
        # _---------------------------------------------
        mode = input("模型选择:")
        if len(mode) <= 0:
            mode = 0
        else:
            mode = int(mode)
        # _---------------------------------------------
        topN = input("input topN:")
        if len(topN)==0:
            topN= 5
        else:
            topN = int(topN)
        # _---------------------------------------------

        logger.info("提取关键词开始:%s mode=%d topN=%d "%(filePath, mode,topN))

        if not extractor.setExtractMode(mode):
            logger.info("mode=%d 不存在" % ( mode))
            continue
        # Reset the evaluation object
        evaRes.reSet()
        # End of parameter validation +++++++++++++++++++++++++++++++++++++++++
        xml = xml2dict.XML2Dict()
        r = xml.parse(filePath)

        for id, article in enumerate(r.articles.article):
            # topN = len(list(article.tags.split(',')))
            logger.info("文本id:%d"%(id))
            keywordsList = extractor.extractAsList(article.title, article.content, topN)
            srcKey = list(article.tags.split(','))
            logger.info("抽取的关键词:%s\t源关键词:%s \n"%(str(keywordsList) , str(srcKey)))

            for i in reversed(range(len(srcKey))):
                srcKey[i] = SegmentFactory.get去中文标点符号(srcKey[i])
            # Save to file
            print(id, '\t', ','.join(keywordsList), '\t', ','.join(srcKey))
            evaRes.addKeyList(srcKey, keywordsList)

        logger.info(evaRes.getRes_String())
        # Write statistics to file
        res = evaRes.getRes_PRF()
        print("topN=", topN, '\t', res[0], '\t', res[1], '\t', res[2])

    pass
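EvalResult.getRes_PRF() is not shown in these snippets; as a hedged sketch of the usual computation it stands for, precision/recall/F1 over one article can be computed from the two keyword lists with exact string matching:

def prf(src_keys, extracted_keys):
    """Precision/recall/F1 for one article, assuming exact string matches."""
    hits = len(set(src_keys) & set(extracted_keys))
    p = hits / len(extracted_keys) if extracted_keys else 0.0
    r = hits / len(src_keys) if src_keys else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0.0
    return p, r, f1

print(prf(["机器学习", "关键词"], ["关键词", "文本", "图模型"]))  # (0.333..., 0.5, 0.4)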
Example #10
def mode自动():
    logger = log.get_logger("mode自动")

    while True:

        logger.info("可选择算法模型,以空格分割 [%s]"%(list(KeyExtractor.GraphType)))
        while True:
            modeList = input("输入选择模型id list:").strip()
            modeList = set(modeList.split())
            modeList = list(map(int, modeList))
            if len(modeList)!=0:
                break
        logger.info("选择算法模型 %s"%(modeList))

        while True:
            filePath = input("input file(输入提取关键词的文本文件):")
            if len(filePath)!=0 and os.path.exists(filePath):
                break
            logger.info("文件[%s]不存在,重新输入" % (filePath))
        pass

        logger.info("选择topN范围(不分前后,只取最大和最小,默认为1),以空格分割。" )
        topNlist = input("选择topN范围:")
        topNlist = list(set(topNlist.split()))
        topNlist = list(map(int, topNlist))
        minTopN = min(topNlist)
        if minTopN<1:
            minTopN=1
        maxTopN = max(topNlist)

        logger.info("\nmodeList:%s \ntopN范围:[%d %d] \n原文件:%s"%(str(modeList), minTopN, maxTopN, filePath))
        ok = input("运行结果会保存在当前目录下data/res/time,[Y/N](默认Y)。如果有问题,请查看日志文件进行程序调试:")
        if len(ok) == 0 or ok.lower() == 'y':
            break
    pass

    resDir = 'data/res/'+time.strftime("%Y%m%d%H%M%S", time.localtime())
    # Create the results directory
    if not os.path.exists(resDir):
        logger.info("Creating results directory %s" % (resDir))
        os.makedirs(resDir, exist_ok=True)

    # Parse the text
    xml = xml2dict.XML2Dict()
    r = xml.parse(filePath)


    for modeId in modeList:
    # Start with this model
        extractor = KeyExtractor.TextRankExtractor()
        extractor.setExtractMode(modeId)

        keyResFile = os.path.join(resDir, KeyExtractor.GraphType(modeId).name + "_maxTop.%d" % (maxTopN))
        f = open(keyResFile, mode='w', encoding='utf8')
        print("#id\t提取出来的关键词\t原来的关键词", file=f)
        # 取出maxTopN个词
        for id, article in enumerate(r.articles.article):
            startTime = time.time()
            # logger.info("start id=%d %s"%(id, time.strftime("%H%M%S", time.localtime())))
            keywordsList = extractor.extractAsList(article.title, article.content, maxTopN)
            # Save to file
            srcKey = list(article.tags.split(','))
            for i in reversed(range(len(srcKey))):
                srcKey[i] = SegmentFactory.get去中文标点符号(srcKey[i])
            print(id, '\t', ','.join(keywordsList), '\t', ','.join(srcKey), file=f)
            f.flush()
            logger.info("id=%d 运行时间=%d" % (id, time.time()-startTime))
        pass
        f.close()
    pass
    '''======================================'''
    # Analyse the results (PRF) and write them to file
    EvalResult.result分析(resDir, maxTopN)
    # Other statistical analyses of the results
    其他结果统计.分析(resDir, maxTopN)
Example #11
        logger.info(evaRes.getRes_String())
        # Write statistics to file
        res = evaRes.getRes_PRF()
        print("topN=", topN, '\t', res[0], '\t', res[1], '\t', res[2])

    pass


if __name__ == '__main__':

    '''About configuration: if you are not sure what a configuration option does, delete it from the configuration file and run the program; an explanation will be written to the log file'''
    '''Other parameters self.alpha_f, self.beta_f, self.gamma_f need to be hard-coded first'''


    logger = log.get_logger("main")

    # Initial loading steps ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    if g_config.segmentEnable or g_config.w2v_enable:
        if g_config.segmentEnable:
            logger.info("开始分词")
            SegmentFactory.fenchiToFile(g_config.segmentfenchiInput, g_config.segmentfenchiOutput)
            logger.info("分词结束")
        if g_config.w2v_enable:
            logger.info("开始词向量训练")
            wordVector.w2v_tarin(g_config.w2v_inFile, g_config.w2v_outFile,g_config.w2v_size, g_config.w2v_window)
            logger.info("词向量训练结束")
        exit(0)
    logger.info("开始加载词典")
    SegmentFactory.addUserWordDict(list(g_config.segmentuserDictFile.split()))
    ChineseStopKeywords.addStopKeywords(list(g_config.segmentstopDictFile.split()))