def extractTheme(tagList, tagbaseFilePath):
    """Return the tags from ``tagList`` that also appear in the tag base.

    Args:
        tagList: Iterable of candidate tags. Tags must be hashable
            (presumably strings -- confirm against callers).
        tagbaseFilePath: Path handed to ``io.readListFromTxt`` to load the
            tag base entries.

    Returns:
        A new list holding the tags that are present in the tag base, in the
        order they occur in ``tagList``. Duplicates in ``tagList`` are kept.
    """
    # Build the lookup set once: testing membership against the raw list
    # rescans the whole tag base for every candidate tag.
    tagbaseSet = set(io.readListFromTxt(tagbaseFilePath))
    return [tag for tag in tagList if tag in tagbaseSet]
def filterTagFromTagbase(content, tagbaseFilePath):
    """Collect the tag base entries that are found in ``content``.

    Args:
        content: Object each tag base entry is tested against with the
            ``in`` operator (presumably a text string -- confirm against
            callers).
        tagbaseFilePath: Path handed to ``io.readListFromTxt`` to load the
            tag base entries.

    Returns:
        A new list of the matching entries, in tag base order.
    """
    # Load the tag base list
    candidates = io.readListFromTxt(tagbaseFilePath)
    return [entry for entry in candidates if entry in content]
def extractTheme(tagList, tagbaseFilePath):
    """Return the non-stop-word tags from ``tagList`` found in the tag base.

    A tag is kept when it is not contained in
    ``index.TAGBASE_STOP_WORD_LIST`` and is present in the tag base loaded
    from ``tagbaseFilePath``.

    Args:
        tagList: Iterable of candidate tags. Tags must be hashable
            (presumably strings -- confirm against callers).
        tagbaseFilePath: Path handed to ``io.readListFromTxt`` to load the
            tag base entries.

    Returns:
        A new list of the surviving tags, in the order they occur in
        ``tagList``. Duplicates in ``tagList`` are kept.
    """
    # Build the lookup set once: testing membership against the raw list
    # rescans the whole tag base for every candidate tag.
    tagbaseSet = set(io.readListFromTxt(tagbaseFilePath))
    stopWords = index.TAGBASE_STOP_WORD_LIST
    # The stop-word check comes first so stop words never reach the tag
    # base lookup.
    return [
        tag for tag in tagList
        if tag not in stopWords and tag in tagbaseSet
    ]
def cleanTheme(tagList):
    """Return the tags from ``tagList`` that exist in ``tagbase.txt``.

    Args:
        tagList: Iterable of candidate tags. Tags must be hashable
            (presumably strings -- confirm against callers).

    Returns:
        A new list holding the tags that are present in the tag base, in the
        order they occur in ``tagList``. Duplicates in ``tagList`` are kept.
    """
    # Fetch the tags stored in the tag base
    filePath = io.getSourceFilePath('tagbase.txt')
    # Build the lookup set once: testing membership against the raw list
    # rescans the whole tag base for every candidate tag.
    tagbaseSet = set(io.readListFromTxt(filePath))
    return [tag for tag in tagList if tag in tagbaseSet]
def updateTagbase():
    '''
    Standalone maintenance step that adjusts tagbase.txt.

    Loads the tag base entries, removes duplicates while keeping the first
    occurrence of each entry in its original position, and writes the
    result out through ``io.writeList2Txt``.
    '''
    # De-duplicate the tag base
    tagbaseFilePath = io.getSourceFilePath('tagbase.txt')
    # Original author's note: 68638 (presumably the entry count before
    # de-duplication).
    tagbaseList = io.readListFromTxt(tagbaseFilePath)
    # dict.fromkeys drops duplicates but preserves insertion order, so the
    # rewritten file is stable between runs (a plain set() would reorder it
    # arbitrarily). Original author's note: 67523 (presumably the count
    # after de-duplication).
    cleanTagbaseList = list(dict.fromkeys(tagbaseList))
    # NOTE(review): the list is read from getSourceFilePath('tagbase.txt')
    # but written using the bare name 'tagbase.txt' -- confirm that
    # writeList2Txt resolves the name to the same location.
    io.writeList2Txt('tagbase.txt', cleanTagbaseList)
def scanTheme2Tag(themeList, tagbaseFilePath):
    '''
    Select the tags from ``themeList`` that exist in the tag base.

    Args:
        themeList: Iterable of candidate themes. Items must be hashable
            (presumably strings -- confirm against callers).
        tagbaseFilePath: Path handed to ``io.readListFromTxt`` to load the
            tag base entries.

    Returns:
        A new list holding the themes that are present in the tag base, in
        the order they occur in ``themeList``. Duplicates are kept.
    '''
    # Build the lookup set once: testing membership against the raw list
    # rescans the whole tag base for every candidate theme.
    tagbaseSet = set(io.readListFromTxt(tagbaseFilePath))
    return [theme for theme in themeList if theme in tagbaseSet]
# Script fragment: for every record in the invest-events input that has a
# product description, clean the description, cut it into words and extract
# the tags found in the tag base.
# NOTE(review): the fragment appears to be truncated -- `fw`, `i`, `j`,
# `outputFilePath` (beyond opening it) and `newseedInfoOutputFilePath` are
# set up here but not used further in the visible portion.
inputFilePath = io.getSourceFilePath('investEvents_20161227144154.txt')
outputFilePath = io.getSourceFilePath(
    'investEvents_taged_20161227144154.txt')
tagbaseFilePath = io.getSourceFilePath(
    'tagbase_iron_tag_all_product_company.txt')
newseedInfoOutputFilePath = io.getProcessedFilePath(
    'newseed_taged_info.csv')

# get infoList
infoList = io.loadData2Json(inputFilePath)

# persist tagbase from redis
# (`initDic` and `tagbaseNameList` are defined outside this fragment)
tagbaseDic = util.getTagbaseDicFromRedis(initDic, tagbaseNameList)
util.persistentTagbase(tagbaseDic, tagbaseFilePath)

# load cut word user dict
# (the same tag base file doubles as the jieba user dictionary)
jieba.load_userdict(tagbaseFilePath)

# get tagbaseList
tagbaseList = io.readListFromTxt(tagbaseFilePath)

# prepare for output
# NOTE(review): the handle is opened without a `with` block and is not
# closed in the visible portion -- confirm it is closed later.
fw = open(outputFilePath, 'w', encoding='utf-8')
# Counters; neither is updated in the visible portion, so the `print` below
# shows the same value of `i` for every record as far as this fragment goes.
i = 1
j = 0

# traverse infoList
for item in infoList:
    # Skip records whose product description is missing or empty.
    if item['startup']['productDesc']:
        productDesc = item['startup']['productDesc']
        # get cleaned desc
        cleanedDesc = getCleanedDesc(productDesc)
        # get cut word list
        cutWordList = getCutWordList(cleanedDesc)
        # extract tag
        ironTagList = extractTag(cutWordList, tagbaseList)
        print(i, 'extracted tag:', ironTagList)