Ejemplo n.º 1
0
def analyseTag():
    """Count and print how many PingWest articles have an empty 'tag' field.

    Reads the tagged-article dumps for both 36kr and PingWest; only the
    PingWest list is actually analysed.
    """
    krTagFilePath = io.getSourceFilePath('topic_article_kr.txt')
    pwTagFilePath = io.getSourceFilePath('topic_article_pw.txt')

    # NOTE(review): the kr list is loaded but never used below — kept to
    # preserve the original call sequence (presumably loadData2Json only
    # reads the file; confirm it has no other side effects).
    krArticleList = io.loadData2Json(krTagFilePath)
    pwArticleList = io.loadData2Json(pwTagFilePath)

    # Bug fix: the counter previously started at 1, so the printed number
    # of untagged articles was over-reported by one.
    untaggedCount = sum(1 for item in pwArticleList if not item['tag'])
    print(untaggedCount)
Ejemplo n.º 2
0
def getCompanyNameList(filePath):
    '''
        dataSource: investEvents_20161227144154.txt (the latest merged
        "NewSeed" + "ITjuzi" data).
        CompanyName covers both the "vc_company" and "product_company" names.
        Why note the dataSource?  If the file changes, its structure may
        change too, and this function may no longer fit it.
    '''
    infoList = io.loadData2Json(filePath)
    uniqueNames = set()
    for record in infoList:
        startupName = record['startup']['name']
        if startupName:
            uniqueNames.add(startupName)
        vcName = record['vc']['name']
        if vcName:
            uniqueNames.add(vcName)
    return list(uniqueNames)
        This code runs after full-text tagging of "PingWest" and "36kr"
        articles has finished.
        It reads the articles directly, filters out the target articles
        by the given tag list, formats them, and persists the result.
    """
    fieldTagList = index.ARTIFICIAL_INTELLIGENCE_TAG_LIST

    outputFilePath = io.getProcessedFilePath('topic_ai_theme.csv')
    # outputFilePath = io.getProcessedFilePath('topic_transit_theme.csv')

    # file paths
    krTagFilePath = io.getSourceFilePath('topic_article_kr.txt')
    pwTagFilePath = io.getSourceFilePath('topic_article_pw.txt')

    # load the article lists
    krArticleList = io.loadData2Json(krTagFilePath)
    pwArticleList = io.loadData2Json(pwTagFilePath)
    infoList = krArticleList + pwArticleList

    # NOTE(review): file handle is never closed in the visible code — consider `with`.
    fw = open(outputFilePath,'w',encoding='utf-8')
    # fw.write('title,theme,originTag' + '\n')

    for item in infoList:
        if item['tag']:
            if isThemeInFieldTagList(item['tag'],fieldTagList):
                # filter the tag list
                cleanedTagList = filterTag(item['tag'])
                title = item['title']
                tagList = cleanedTagList
                originalTagList = item['originalTag']
                # strip commas from the title so the CSV columns stay aligned
                outputLine = title.replace(',','') + ',' + ' '.join(tagList) + ',' + ' '.join(originalTagList)
Ejemplo n.º 4
0
                # collect the assembled dict (enclosing loop and function
                # header lie outside this fragment)
                resultDicList.append(dic)
    # NOTE(review): returns the accumulated list of dicts; the function
    # signature is not visible in this chunk.
    return resultDicList


if __name__ == '__main__':
    # input paths
    classifyTagInputFilePath = io.getProcessedFilePath('ad_分类标签.csv')
    nodeInputFilePath = io.getProcessedFilePath('ad_节点.csv')
    frontJsonInputFilePath = io.getUnprocessedFilePath('ad.json')

    # output path
    adOutputFilePath = io.getProcessedFilePath('ad_final.json')

    classifyTagList = io.readListFromCSV(classifyTagInputFilePath)
    nodeList = io.readListFromCSV(nodeInputFilePath)
    frontDicList = io.loadData2Json(frontJsonInputFilePath)

    # build the classify-tag dictionary
    classifyTagDic = getClassifyTagDic(classifyTagList)
    # build the mapping from product companies to classify tags
    mappingDic = getNodeTagMappingDic(classifyTagDic, nodeList)
    # initialize frontFinalDic
    finalFrontDic = initFrontFinalDic(frontDicList)
    # mark each node with its classify tag
    markedNodeDicList = markClassifyTag(mappingDic, frontDicList[0]['nodes'])
    # attach the marked nodes
    finalFrontDic['nodes'] = markedNodeDicList

    # dump the final dict as JSON
    # NOTE(review): fw is never closed in the visible code — consider `with`.
    fw = open(adOutputFilePath, 'w', encoding='utf-8')
    fw.write(json.dumps(finalFrontDic, ensure_ascii=False))
    tagbaseNameList = ['industry_tags']
    # tagbaseDic init
    initDic = {}
    # newseed taged info list
    newseedInfoDic = {}
    # file path
    inputFilePath = io.getSourceFilePath('investEvents_20161227144154.txt')
    # NOTE(review): outputFilePath is re-bound here; the earlier fw still
    # points at the ad_final.json handle.
    outputFilePath = io.getSourceFilePath(
        'investEvents_taged_20161227144154.txt')
    tagbaseFilePath = io.getSourceFilePath(
        'tagbase_iron_tag_all_product_company.txt')
    newseedInfoOutputFilePath = io.getProcessedFilePath(
        'newseed_taged_info.csv')
    # get infoList
    infoList = io.loadData2Json(inputFilePath)
    # persist tagbase from redis
    tagbaseDic = util.getTagbaseDicFromRedis(initDic, tagbaseNameList)
    util.persistentTagbase(tagbaseDic, tagbaseFilePath)
    # load cut word user dict
    jieba.load_userdict(tagbaseFilePath)
    # get tagbaseList
    tagbaseList = io.readListFromTxt(tagbaseFilePath)
    # prepare for output
    fw = open(outputFilePath, 'w', encoding='utf-8')
    i = 1
    j = 0
    # traverse infoList
    for item in infoList:
        if item['startup']['productDesc']:
            productDesc = item['startup']['productDesc']
Ejemplo n.º 6
0
    return resultList




if __name__ == '__main__':

    tagbaseNameList = ['industry_tags']
    initDic = {}

    # resolve file paths
    itjzFilePath = io.getSourceFilePath('investEvents_20161019121629.txt')
    itjzOutputFilePath = io.getProcessedFilePath('itjz_trial_company_tag.txt')
    itjzTagbaseFilePath = io.getProcessedFilePath('itjz_extract_theme_tagbase.txt')
    # load data
    itjzInfoList = io.loadData2Json(itjzFilePath)

    # NOTE(review): fw is never closed in the visible code — consider `with`.
    fw = open(itjzOutputFilePath,'w',encoding='utf-8')
    fw.write('name,desc,originalTag,cutWordTag,stringTag,scanTagCutword,scanTagString' + '\n')

    # 1. extract product-company entities
    productCompanyDicList = getProductCompanyList(itjzInfoList)
    print('productCompany字典长度:',len(productCompanyDicList))

    # compute the average length of productDesc
    # length = getDescAverageLen(productCompanyDicList)

    # 2. fetch tagbase tags from redis and generate the tagbase text
    tagbaseDic = util.getTagbaseDicFromRedis(initDic,tagbaseNameList)
    # write the tagbase file
    util.persistentTagbase(tagbaseDic,itjzTagbaseFilePath)
Ejemplo n.º 7
0
"""
    Target:找出投资事件中所有的轮次,生成轮次数据集set
    Source:investEvents_20161019121629.txt
    Instruction:
    1)修正词库中轮次的表示
    2)确定轮次的顺序
"""

if __name__ == '__main__':
    duringPath = 'data\\unprocessed'
    roundSet = set()

    filePath = os.path.join(index.ROOTPATH, duringPath,
                            'investEvents_20161019121629.txt')
    # 加载数据
    jsonList = io.loadData2Json(filePath)

    # # 显示所有的投资轮次
    # for i in range(len(jsonList)):
    #     if 'event' in jsonList[i].keys():
    #         roundSet.add(jsonList[i]['event']['investRound'])
    # print(type(roundSet),str(roundSet))

    # investTimeList = []
    j = 0
    for i in range(len(jsonList)):
        if 'event' in jsonList[i].keys():
            if not jsonList[i]['event']['investTime']:
                print(i, 'investTime为空')
            if jsonList[i]['event']['investTime']:
                if len(jsonList[i]['event']['investTime']) == 10:
Ejemplo n.º 8
0
    #             initDic['author'] = report[0]['name']
    #
    #             # write directly to the file
    #             jsonDic = json.dumps(initDic, ensure_ascii=False)
    #             fw.write(jsonDic + '\n')
    #             print(i)
    #             i += 1
    #         except Exception as ex:
    #             print(ex)



    # 36kr articles

    # krInfoList = io.readListFromTxt(krFilePath)
    krInfoList = io.loadData2Json(krFilePath)

    i = 1
    for report in krInfoList:
        if report:
            # reportList = report.split('\t')
            # if len(reportList) == 4:
                # NOTE(review): the extra indent survives from the
                # commented-out `if` above; `try` is the first statement
                # of the `if report:` body.
                try:
                    # content = reportList[3].strip()
                    content = report['data']['content']
                    # 0. strip html tags
                    content = webpage.extractContentBetweenTags(content)
                    # 1. apply the custom stop-word correction
                    cutOne = cutWord.cutStopWord(content)
                    # 2. filter out punctuation, tag symbols, etc.
                    cutTwo = cutWord.cutNoiseWord(cutOne)