def createPattern(match_words):
    """Build an OR-ed regex pattern from Chinese and English match words.

    Parameters
    ----------
    match_words : sequence of two lists
        match_words[0] holds Chinese keywords, match_words[1] English ones
        (one word per entry, as produced by u.create_match_words).
        NOTE(review): the def line was reconstructed from the call site
        `createPattern(match_words)` below — confirm against the full file.

    Returns
    -------
    str
        Alternation pattern '.*w1.*|.*w2.*|...' covering both word lists;
        empty string when both lists are empty.
    """
    # Wrap every stripped word as '.*word.*' so that pattern.match() acts
    # like a substring search on the whole cell.
    cmatch_words = ['.*' + line.strip() + '.*' for line in match_words[0]]
    ematch_words = ['.*' + line.strip() + '.*' for line in match_words[1]]
    # Join only the non-empty groups.  The old code produced a leading '|'
    # (an empty alternative that matches EVERYTHING) whenever the Chinese
    # list was empty but the English one was not.
    parts = [group for group in (cmatch_words, ematch_words) if group]
    return '|'.join('|'.join(group) for group in parts)


if __name__ == '__main__':
    source_file_body = u.create_file_body(SOURCE_FILE)
    source_file_head = u.create_file_head(SOURCE_FILE)
    match_words = u.create_match_words(MATCH_FILE)
    pattern = createPattern(match_words)
    p = re.compile(pattern)
    # Route every data row into one of two buckets depending on whether
    # the match column hits the keyword pattern.
    for line in source_file_body:
        content = u.create_content(line, COLUMN)
        if p.match(content):
            resultFile.append(line)
        else:
            removeFile.append(line)
    resultFileName = getFileName(SOURCE_FILE, '-含关键词.csv')
    removeFileName = getFileName(SOURCE_FILE, '-不含关键词.csv')
    u.create_result_file(resultFileName, source_file_head, resultFile)
    u.create_result_file(removeFileName, source_file_head, removeFile)
# NOTE(review): this chunk ends in a bare `except:`; the matching `try:` lies
# before the visible region, so a minimal wrapper is reconstructed here.
try:
    labelType, labelNum = createMoreMatch(LABEL_PATH)
    matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
    print(u.utf8_2_gbk('标签个数:' + str(labelNum) + '个'))
    for key, value in labelType.items():
        count += 1
        print(u.utf8_2_gbk('当前执行到第' + str(count) + '个'))
        # A label value is either "include-words" or "include|exclude".
        words = value.strip().split('|')
        if len(words) not in (1, 2):
            continue
        include = re.compile(createPattern(words[0], ACCURATE))
        exclude = re.compile(createPattern(words[1], ACCURATE)) if len(words) == 2 else None
        for idx, row in enumerate(source_file_body):
            cell = u.create_content(row, COLUNM)
            if not include.match(cell):
                continue
            if exclude is not None and exclude.match(cell):
                continue
            # Append the label (pipe-terminated) to the matching row and
            # count the hit.
            source_file_body[idx] = source_file_body[idx].strip() + key + '|' + '\n'
            keyWordCount[key] = keyWordCount.get(key, 0) + 1
    u.create_result_file(u.setFileName(SOURCE_FILE, LABEL_FILE), matchHead, source_file_body)
    u.writeDictFile(u.changeFileName(combinefileName(SOURCE_FILE, LABEL_FILE), '统计.txt'), keyWordCount, 1)
except:
    # Errors are dumped to error.txt so the console user can inspect them.
    traceback.print_exc(file=open('error.txt', 'w+'))
# Number of columns in the source CSV, taken from its header line.
head = linecache.getline(u.utf8_2_gbk(SOURCE_FILE), 1).strip()
TOTALCOLUNM = len(head.split(','))
print(u.utf8_2_gbk('标签词个数:') + u.printDictLen(labelWordp))
source_file_body = u.create_file_body(SOURCE_FILE)
for key, value in labelWordp.items():
    i += 1
    print(u.utf8_2_gbk('当前执行到{0}个'.format(i)))
    # Compile once per label instead of once per data row — the pattern
    # is invariant inside the inner loop (the old code recompiled it for
    # every row).
    p = re.compile(value)
    for num, line in enumerate(source_file_body):
        data = line.strip().split(',')
        # Rows labelled by an earlier key already carry the extra column;
        # skip them so each row gets at most one label.
        if len(data) == TOTALCOLUNM + 1:
            continue
        content = data[COLUNM - 1]
        if p.match(content):
            source_file_body[num] = source_file_body[num].strip() + ',' + key + '\n'
            keyWordCount[key] = keyWordCount.get(key, 0) + 1
# Pad rows that never matched so every row ends up with the extra column.
for num, line in enumerate(source_file_body):
    data = line.strip().split(',')
    if len(data) == TOTALCOLUNM + 1:
        continue
    source_file_body[num] = source_file_body[num].strip() + ',' + '' + '\n'
result_file_head = u.create_file_head(SOURCE_FILE, 'right', [u.gbk_2_utf8(columnName)])
u.create_result_file(RESULT_FILE, result_file_head, source_file_body)
# Write the per-label hit counts next to the label-word file.
KEYWORD_FILE = LABELWORD.split('.')[0] + '统计.txt'
u.writeDictFile(KEYWORD_FILE, keyWordCount)
FILE_NAME, FILE_PATH = u.getFirstFile('csv')


def create_file(fileName):
    """Read a CSV file and split it into header, body and body length.

    Parameters
    ----------
    fileName : str
        Path to the source file (utf-8 name converted to gbk for the OS).

    Returns
    -------
    list
        [header line, list of body lines, number of body lines].
    """
    fileList = linecache.getlines(u.utf8_2_gbk(fileName))
    fileHead = fileList[0]
    fileBody = fileList[1:]
    return [fileHead, fileBody, len(fileBody)]


if __name__ == '__main__':
    try:
        fileHead, fileBody, fileBLen = create_file(FILE_PATH)
        # Ceiling division.  The old "(fileBLen / COUNT) + 1" over-sized
        # every chunk and emitted a trailing header-only file whenever
        # COUNT divided the body length exactly.
        middle = -(-fileBLen // COUNT)
        for num in range(COUNT):
            left = num * middle
            right = (num + 1) * middle
            u.create_result_file(
                u.changeFileName(FILE_NAME, '-' + str(num) + '.csv'),
                fileHead, fileBody[left:right])
    except:
        traceback.print_exc()
        print('==============================================================')
        print(u.utf8_2_gbk('运行出错'))
        print(u.utf8_2_gbk('常见错误'))
        print(u.utf8_2_gbk('IndexError: list index out of range'))
        print(u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空'))
        print('==============================================================')
    # Keep the console window open; collapsed source does not show whether
    # this sat inside the except block — placed after it (always pause).
    raw_input('Press Enter to exit...')
if __name__ == "__main__":
    try:
        print(u.utf8_2_gbk('开始执行'))
        result_file_head = u.create_file_head(SOURCE_FILE)  # file header
        result_file_body = u.create_file_body(SOURCE_FILE)  # file body rows
        # Presumably factory() fills save_list / remove_list with 1-based
        # row numbers — TODO confirm against its definition.
        factory(result_file_body)
        save_file_list.extend(result_file_body[int(n) - 1] for n in save_list)
        remove_file_list.extend(result_file_body[int(n) - 1] for n in remove_list)
        print(u.utf8_2_gbk(SAVE_FILE + '行数:' + str(len(save_file_list))))
        print(u.utf8_2_gbk(REMOVE_FILE + '行数:' + str(len(remove_file_list))))
        # NOTE(review): the original comments labelled the REMOVE output as
        # the "matching (>=101)" bucket and SAVE as the non-matching one —
        # confirm which bucket is which against factory().
        u.create_result_file(REMOVE_FILE, result_file_head, remove_file_list)
        u.create_result_file(SAVE_FILE, result_file_head, save_file_list)
    except:
        traceback.print_exc()
        print('==============================================================')
        print(u.utf8_2_gbk('运行出错'))
        print(u.utf8_2_gbk('常见错误'))
        print(u.utf8_2_gbk('IndexError: list index out of range'))
        print(u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空'))
        print('==============================================================')
    raw_input('Press Enter to exit...')
def createPattern(fileName):
    """Build a lower-cased alternation regex from a keyword file.

    Every non-blank line of *fileName* becomes an '.*word.*' alternative,
    so that pattern.match() behaves like a substring test.

    Parameters
    ----------
    fileName : str
        Keyword file path (utf-8 name converted to gbk; per the console
        hint below the file content is expected to be gbk-encoded).

    Returns
    -------
    str
        'p1|p2|...' pattern, lower-cased; empty string for an empty file.
    """
    parts = []
    # `with` closes the file even when an exception is raised — the
    # original called f.close() outside any finally and leaked on error.
    with open(u.utf8_2_gbk(fileName)) as f:
        for line in f:
            # The original blank-line test was `line[:-1].strip()`, which
            # chops a real character off the final line when the file has
            # no trailing newline — silently dropping a one-character
            # last keyword.  `line.strip()` tests the intact content.
            if line.strip():
                parts.append('.*' + line.strip() + '.*')
    return '|'.join(parts).lower()


if __name__ == '__main__':
    source_file_body = u.create_file_body(SOURCE_FILE)
    source_file_head = u.create_file_head(SOURCE_FILE)
    m = createPattern(MATCH_FILE)
    print(m + '===>>' + u.utf8_2_gbk('若乱码,匹配词文件请使用gbk编码'))
    p = re.compile(m)
    print(u.utf8_2_gbk('数据源文件行数:') + str(len(source_file_body)))
    # Case-insensitive match: pattern and cell are both lower-cased.
    for line in source_file_body:
        content = u.create_content(line, COLUMN).lower()
        if p.match(content):
            result_list.append(line)
        else:
            remove_list.append(line)
    print(u.utf8_2_gbk('不包含关键词行数:') + str(len(remove_list)))
    print(u.utf8_2_gbk('包含关键词行数:') + str(len(result_list)))
    u.create_result_file(u.changeFileName(SOURCE_FILE, '-含关键词.csv'),
                         source_file_head, result_list)
    u.create_result_file(u.changeFileName(SOURCE_FILE, '-不含关键词.csv'),
                         source_file_head, remove_list)
    raw_input('Press Enter to exit...')
# NOTE(review): this chunk ends in a bare `except:`; the matching `try:`
# lies before the visible region, so a minimal wrapper is reconstructed.
try:
    print(u.utf8_2_gbk('开始执行聚类'))
    for num, line in enumerate(source_file_body):
        # Strip noise characters before keyword extraction.
        content = re.sub(pattern, '', u.create_content(line, COLUMN))
        # Short texts get only 2 tags, longer ones TOPKET.
        top_k = 2 if len(content) <= 20 else TOPKET
        keywords = jieba.analyse.extract_tags(content, topK=top_k)
        keywords.sort()
        key = ','.join(keywords)
        # Each bucket value accumulates "0,<row1>,<row2>,..."; the "0,"
        # seed comes from the .get default and is stripped below.
        cluster[key] = str(cluster.get(key, 0)) + "," + str(num + 1)
    print(u.utf8_2_gbk('聚类完成,生成输出文件'))
    for num, value in enumerate(cluster.itervalues()):
        cluster_list = value[2:].split(',')  # drop the "0," seed
        count_file_dict[num] = len(cluster_list)
        for n in cluster_list:
            # n is a 1-based row number recorded above.
            result_file_body.append(str(num) + ',' + source_file_body[int(n) - 1])
    u.create_result_file(u.changeFileName(SOURCENAME, '-聚类.csv'),
                         source_file_head, result_file_body)
    u.writeDictFile(u.changeFileName(SOURCENAME, '-聚类统计.txt'),
                    count_file_dict, 1)
except:
    traceback.print_exc()
    print('==============================================================')
    print(u.utf8_2_gbk('运行出错'))
    print(u.utf8_2_gbk('常见错误'))
    print(u.utf8_2_gbk('IndexError: list index out of range'))
    print(u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空'))
    print('==============================================================')
raw_input('Press Enter to exit...')
REMOVE_FILE = "remove.csv"    # output: rows that DO contain a filter word
FILETER_FILE = "filter.txt"   # filter-word list (name kept as-is; used elsewhere)
################################################
result_list = []
remove_list = []

if __name__ == "__main__":
    result_file_head = u.create_file_head(SOURCE_FILE)
    result_file_body = u.create_file_body(SOURCE_FILE)
    # create_match_words yields [chinese_words, english_words].
    words_file = u.create_match_words(FILETER_FILE)
    pattern = u.build_pattern(words_file[0], words_file[1])
    print('start')
    # Matching rows go to remove_list, the rest to result_list.
    for row in result_file_body:
        cell = u.create_content(row, COLUMN)
        (remove_list if pattern.match(cell) else result_list).append(row)
    print('end')
    u.create_result_file(RESULT_FILE, result_file_head, result_list)
    u.create_result_file(REMOVE_FILE, result_file_head, remove_list)
cluster = {}
result_file_body = []
# Characters stripped from the text before keyword extraction: word
# characters plus common punctuation.  Raw string: in a plain literal
# "\w" is an invalid escape sequence (SyntaxWarning since CPython 3.12,
# slated to become an error); r"\w" is byte-identical at runtime.
pattern = re.compile(r"\w|[/.,/#@$%^& ]")
count_file_list = []

if __name__ == '__main__':
    source_file_head = u.create_file_head(SOURCE_FILE, 'left', ['类型'])
    source_file_body = u.create_file_body(SOURCE_FILE)
    for num, line in enumerate(source_file_body):
        content = re.sub(pattern, '', u.create_content(line, COLUMN))
        # Short texts get only 2 tags, longer ones TOPKET.
        if len(content) <= 20:
            keywords = jieba.analyse.extract_tags(content, topK=2)
        else:
            keywords = jieba.analyse.extract_tags(content, topK=TOPKET)
        keywords.sort()
        key = ','.join(keywords)
        # Each bucket value accumulates "0,<row1>,<row2>,..." — the "0,"
        # seed comes from the .get default and is stripped with [2:] below.
        cluster[key] = str(cluster.get(key, 0)) + "," + str(num + 1)
    for num, value in enumerate(cluster.itervalues()):
        cluster_list = value[2:].split(',')  # drop the "0," seed
        count_file_list.append(str(num) + '\t' + str(len(cluster_list)) + '\n')
        for n in cluster_list:
            # n is a 1-based row number recorded above.
            result_file_body.append(
                str(num) + ',' + source_file_body[int(n) - 1])
    u.create_result_file(RESULT_FILE, source_file_head, result_file_body)
    u.create_result_file(COUNT_FILE, ['type\tcount\n'], count_file_list)