Example #1

import pandas as pd
import numpy as np
from utils import inout
import persistent_relation_object  # assumed: local module providing getRelationShipDic()


def getLabelWeightOutputStr(labelWordWeightList):
    '''
    Concatenate a (label, weight) pair list into the string
    "(label,weight),(label,weight),...".
    '''
    resultStr = ''
    for item in labelWordWeightList:
        resultStr = resultStr + '(' + str(item[0]) + ',' + str(
            item[1]) + ')' + ','
    return resultStr
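
A quick usage sketch (the pairs below are made-up values, not from the original data):

labelWordWeightList = [('birthplace', 0.82), ('spouse', 0.41)]
print getLabelWeightOutputStr(labelWordWeightList)
# prints: (birthplace,0.82),(spouse,0.41),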


if __name__ == '__main__':

    analysisPath = inout.getDataAnalysisPath('analysis.txt')

    ## Load the stop word list
    stopWordPath = inout.getResourcePath('stopWordList.txt')
    stopWordList = inout.readListFromTxt(stopWordPath)
    # Temporary logic here; the stop word lists can be merged later
    stopWordList = list(set(stopWordList))

    ## Load the relation dictionary
    relationDic = persistent_relation_object.getRelationShipDic()

    pd.set_option('display.width', 300)
    np.set_printoptions(linewidth=300, suppress=True)

    # corpusPath = inout.getDataOriginPath('special_corpus_copy.txt')
    corpusPath = inout.getDataOriginPath('special_corpus.txt')
Example #2

                articleTime = mildTime
            # finally:
            #     articleTime = mildTime
        else:
            articleTime = coldTime

        weight = (articleTime - basicTime) / (systemTime - basicTime)
    return weight
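
The surviving tail above ends in a linear time-decay weight: articleTime is rescaled into [0, 1] between basicTime and systemTime. A minimal sketch with made-up Unix timestamps:

basicTime = 1400000000.0   # earliest time considered
systemTime = 1500000000.0  # "now"
articleTime = 1475000000.0
weight = (articleTime - basicTime) / (systemTime - basicTime)
print weight  # 0.75 -- more recent articles score closer to 1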


if __name__ == '__main__':

    ## Load sentences with index information into memory
    # sentenceList,sentenceFeatureList = loadIndexSentenceList()

    sentenceFilePath = inout.getDataAnalysisPath('sentenceList.txt')
    sentenceList = inout.readListFromTxt(sentenceFilePath)

    # for item in sentenceList:
    #     print item
    # exit(0)

    inFilePath = inout.getDataAnalysisPath('vote_classify_module_result_fnlp_150w-2100w.txt')
    # inFilePath = '/data/wangtd/workspace/re/vote_classify_module_result_fnlp_150w-2100w.txt'

    outFilePath = inout.getDataAnalysisPath('vote_relation_weight_result_fnlp_150w-2000w.txt')
    # outFilePath = '/data/wangtd/workspace/re/vote_relation_weight_result_fnlp_150w-2000w.txt'

    infoList = inout.readListFromTxt(inFilePath)

    ## Start processing
Example #3

            if 'time' in lineList[3]:
                innerParts = lineList[3].split('#INNER#')
                if len(innerParts) == 2 and innerParts[1] != '':
                    time = innerParts[1]
            comSentence = time + '#INNER#' + sentenceStr

            if nePairList:
                sentenceFeatureList.append([nePairList, otherWordList])
                sentenceList.append(comSentence)
    return sentenceList, sentenceFeatureList
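
Each stored sentence is the time field and the sentence text joined by the '#INNER#' marker, so a consumer can split them apart again. A small sketch with an illustrative value:

comSentence = '2017-06-15#INNER#some sentence text'  # made-up example
timeField, sentenceStr = comSentence.split('#INNER#', 1)
print timeField, '|', sentenceStr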


if __name__ == '__main__':

    # Output path
    outputPath = inout.getDataAnalysisPath(
        'analysis_vote_sentence_fnlp_150w-2100w.txt')
    # outputPath = inout.getDataAnalysisPath('analysis_vote_sentence_0615.txt')
    # outputPath = inout.getDataAnalysisPath('analysis_test.txt')

    ## Configuration
    pd.set_option('display.width', 300)
    np.set_printoptions(linewidth=300, suppress=True)

    ## Load the stop word list
    stopWordPath = inout.getResourcePath('stopWordList.txt')
    stopWordList = inout.readListFromTxt(stopWordPath)
    # Temporary logic here; the stop word lists can be merged later
    stopWordList = list(set(stopWordList))

    ## Load the relation dictionary
    relationDic = persistent_relation_object.getRelationShipDic()
Example #4

                splitList[3]
            # print resultStr
        return resultStr


def handleSentenceTwo(sentenceTwo):
    # Debug helper: parse and inspect the serialized relation list, then stop.
    print sentenceTwo
    sortedRelationList = eval(sentenceTwo)
    print type(sortedRelationList)
    print sortedRelationList
    exit(0)
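
handleSentenceTwo parses the serialized list with eval, which will execute arbitrary expressions. If the input is only ever a Python literal, ast.literal_eval from the standard library is a safer drop-in; a sketch of that variant (not what the original uses):

import ast

def handleSentenceTwoSafe(sentenceTwo):
    # Restricted to literals: raises ValueError on anything executable.
    return ast.literal_eval(sentenceTwo)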


if __name__ == '__main__':

    inFilePath = inout.getDataAnalysisPath(
        'vote_relation_ordered_result_fnlp_150w-2000w.txt')

    outFilePath = inout.getDataAnalysisPath(
        'vote_relation_ordered_result_fnlp_150w-2000w_handled.txt')

    infoList = inout.readListFromTxt(inFilePath)

    print 'info list len:', len(infoList)

    allSentenceList = []
    itemSentenceList = []
    for item in infoList:
        item = item.strip()
        if item != '':
            # print item
            itemSentenceList.append(item)
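
The loop is cut off here; only the branch that accumulates non-empty lines survives. A minimal sketch of the blank-line grouping it appears to implement (the flush branch is an assumption, not the original code):

allSentenceList = []
itemSentenceList = []
for item in infoList:
    item = item.strip()
    if item != '':
        itemSentenceList.append(item)
    elif itemSentenceList:
        # Assumed: a blank line closes the current block.
        allSentenceList.append(itemSentenceList)
        itemSentenceList = []
if itemSentenceList:
    allSentenceList.append(itemSentenceList)
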
Example #5

# -*- coding:utf-8 -*-

import pandas as pd
import numpy as np
from utils import inout
from utils.inout import printEscapeStr
from collections import OrderedDict
import time
import codecs

if __name__ == '__main__':

    filePath = inout.getDataAnalysisPath('test.txt')

    fr = codecs.open(filePath, 'rb')

    while True:
        line = fr.readline()
        if line:
            reTuple = eval(line.split('DIV')[1].split('INNER')[1])
            print type(reTuple), reTuple, len(reTuple)
            if len(reTuple) == 1:
                print reTuple[0]
        else:
            break
    fr.close()
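
The reader above assumes each line looks like '<prefix>DIV<prefix>INNER<tuple literal>'. A made-up line showing the expected shape:

line = 'id123DIVheadINNER("Beijing", "capital_of", "China")'  # illustrative only
reTuple = eval(line.split('DIV')[1].split('INNER')[1])
print type(reTuple), reTuple, len(reTuple)  # <type 'tuple'> ... 3
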
Example #6

from utils import inout
from tqdm import tqdm


def getSentenceStrList(sentenceIndexList, sentenceList):
    '''
    Join the sentences at the given indices into one newline-separated string.
    '''
    outputStr = ''
    for index in sentenceIndexList:
        outputStr = outputStr + sentenceList[int(index)].strip() + '\n'
    return outputStr
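
A quick usage sketch (note the int() cast above: indices may arrive as strings):

sentences = ['first sentence', 'second sentence', 'third sentence']
print getSentenceStrList(['0', '2'], sentences)
# prints 'first sentence' and 'third sentence' on separate lines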


if __name__ == '__main__':

    # inFilePath = inout.getDataAnalysisPath('vote_relation_weight_result_fnlp_150w-2000w.txt')
    inFilePath = inout.getDataAnalysisPath('test.txt')
    infoList = inout.readListFromTxt(inFilePath)
    print 'relation weight list length:', len(infoList)

    outFilePath = inout.getDataAnalysisPath(
        'map_vote_relation_ordered_result_fnlp_150w-2000w.txt')

    sentenceFilePath = inout.getDataAnalysisPath('sentenceList.txt')
    sentenceList = inout.readListFromTxt(sentenceFilePath)
    print 'sentence list length:', len(sentenceList)

    # Initialize the dictionary
    classifyDic = dict()

    print 'classifying:'
    for i in tqdm(range(len(infoList))):
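
The loop body is truncated here. One plausible shape of the grouping step, assuming each info line carries a relation key before an '#INNER#' marker (purely illustrative; the key layout is a guess):

for i in range(len(infoList)):
    item = infoList[i].strip()
    if not item:
        continue
    relationKey = item.split('#INNER#')[0]  # assumed key position
    classifyDic.setdefault(relationKey, []).append(item)
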
Example #7
import pandas as pd
import numpy as np
from utils import inout
import persistent_relation_object  # assumed: local module providing getRelationShipDic()


# Same helper as in Example #1.
def getLabelWeightOutputStr(labelWordWeightList):
    '''
    Concatenate a (label, weight) pair list into the string
    "(label,weight),(label,weight),...".
    '''
    resultStr = ''
    for item in labelWordWeightList:
        resultStr = resultStr + '(' + str(item[0]) + ',' + str(
            item[1]) + ')' + ','
    return resultStr


if __name__ == '__main__':

    ## Input parameters
    n_cluster = 300
    # n_cluster = 15000

    corpusNum = 500

    analysisPath = inout.getDataAnalysisPath('analysis_cluster_sentence.txt')

    ## Configuration
    pd.set_option('display.width', 300)
    np.set_printoptions(linewidth=300, suppress=True)

    ## Load the stop word list
    stopWordPath = inout.getResourcePath('stopWordList.txt')
    stopWordList = inout.readListFromTxt(stopWordPath)
    # Temporary logic here; the stop word lists can be merged later
    stopWordList = list(set(stopWordList))

    ## Load the relation dictionary
    relationDic = persistent_relation_object.getRelationShipDic()

    ## Module entry point: load the objects
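
The example ends before the clustering itself. Given n_cluster and corpusNum above, a minimal sketch of an n_cluster-way KMeans over sentence vectors (scikit-learn API; the vectors matrix is assumed to be built elsewhere in the module):

from sklearn.cluster import KMeans

km = KMeans(n_clusters=n_cluster)
labels = km.fit_predict(vectors)  # one cluster id per sentence vector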