# Tag every source row with each label whose pattern matches the row's
# content, then write the tagged rows and the per-label hit counts.
#
# Each labelType value is split on '|':
#   * one term  -> the content must match that pattern;
#   * two terms -> the content must match the first pattern and must NOT
#                  match the second;
#   * any other number of terms tags nothing.
#
# NOTE(review): count, keyWordCount and source_file_body are initialised
# outside this fragment. The statement nesting (and the position of the
# try: header) was recovered from a whitespace-collapsed line -- confirm
# against the original script.
try:
    labelType, labelNum = createMoreMatch(LABEL_PATH)
    matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
    # Single-argument print(...) behaves the same under the Python 2 print
    # statement this script is written for.
    print(u.utf8_2_gbk('标签个数:' + str(labelNum) + '个'))
    for key, value in labelType.items():
        count += 1
        print(u.utf8_2_gbk('当前执行到第' + str(count) + '个'))
        words = value.strip().split('|')
        if len(words) not in (1, 2):
            continue

        # Build both pattern strings before compiling either, as before.
        include_src = createPattern(words[0], ACCURATE)
        has_exclude = len(words) == 2
        if has_exclude:
            exclude_src = createPattern(words[1], ACCURATE)
        include_re = re.compile(include_src)
        exclude_re = re.compile(exclude_src) if has_exclude else None

        for num, line in enumerate(source_file_body):
            content = u.create_content(line, COLUNM)
            if not include_re.match(content):
                continue
            if exclude_re is not None and exclude_re.match(content):
                continue
            # Append "<label>|" to the row; a row can collect several labels.
            source_file_body[num] = line.strip() + key + '|' + '\n'
            keyWordCount[key] = keyWordCount.get(key, 0) + 1

    u.create_result_file(u.setFileName(SOURCE_FILE, LABEL_FILE), matchHead, source_file_body)
    u.writeDictFile(u.changeFileName(combinefileName(SOURCE_FILE, LABEL_FILE), '统计.txt'), keyWordCount, 1)
except Exception:
    # Persist the traceback for later inspection; the with-block makes sure
    # the log file is flushed and closed.
    with open('error.txt', 'w+') as error_log:
        traceback.print_exc(file=error_log)
# Tag every source row with each label whose pattern matches the raw row,
# then write the tagged rows and the per-label hit counts.
#
# A labelType value holds one '|'-separated term (pattern the row must match)
# or two terms (row must match the first and must not match the second).
# Values with any other number of terms tag nothing.
#
# NOTE(review): labelType, labelNum, count, keyWordCount and source_file_body
# come from outside this fragment; statement nesting was recovered from a
# whitespace-collapsed line.
matchHead = u.utf8_2_gbk('内容' + ',' + LABEL_FILE.split('.')[0] + '\n')
print(u.utf8_2_gbk('标签个数:' + str(labelNum) + '个'))
for key, value in labelType.items():
    count += 1
    print(u.utf8_2_gbk('当前执行到第' + str(count) + '个'))
    terms = value.strip().split('|')
    if len(terms) not in (1, 2):
        continue

    # Pattern strings are built first, then compiled, in the original order.
    wanted_src = createPattern(terms[0])
    has_unwanted = len(terms) == 2
    if has_unwanted:
        unwanted_src = createPattern(terms[1])
    wanted = re.compile(wanted_src)
    unwanted = re.compile(unwanted_src) if has_unwanted else None

    for idx, row in enumerate(source_file_body):
        if not wanted.match(row):
            continue
        if unwanted is not None and unwanted.match(row):
            continue
        # Append "<label>|" to the row and bump that label's hit count.
        source_file_body[idx] = row.strip() + key + '|' + '\n'
        keyWordCount[key] = keyWordCount.get(key, 0) + 1

u.create_result_file(u.setFileName(SOURCE_FILE, LABEL_FILE), matchHead, source_file_body)
u.writeDictFile(u.changeFileName(LABEL_FILE, '统计.csv'), keyWordCount, 1)
# Collect the rows of every file found under FILE_PATH (each file's body is
# passed through rm_repeat), dump them into a scratch file total.csv, count
# how often each value returned by u.create_content(line, 1) occurs, and
# write those counts to RESULT_FILE.
#
# NOTE(review): file_dict and count_dict are initialised outside this
# fragment. Statement nesting was recovered from a whitespace-collapsed
# line; in particular it is not recoverable whether the raw_input() pause
# originally ran only on failure -- it is kept inside the handler here.
try:
    print(u.utf8_2_gbk('开始执行'))
    file_list = u.GetFileList(FILE_PATH, [])
    for f in file_list:
        file_dict[f.encode('gbk')] = rm_repeat(u.create_file_body(f.encode('utf-8')))

    # with-blocks close the scratch file even if a write or read fails.
    with open('total.csv', 'w+') as total_file:
        for rows in file_dict.values():
            total_file.writelines(rows)

    with open('total.csv', 'rb') as count_file:
        for line in count_file:
            content = u.create_content(line, 1)
            count_dict[content] = count_dict.get(content, 0) + 1

    u.writeDictFile(RESULT_FILE, count_dict, 1)
    print(u.utf8_2_gbk('执行完毕'))
    print(u.utf8_2_gbk('输出文件路径:') + sys.path[0] + u.utf8_2_gbk('\\' + RESULT_FILE))
except Exception:
    traceback.print_exc()
    print('==============================================================')
    print(u.utf8_2_gbk('运行出错'))
    print(u.utf8_2_gbk('常见错误'))
    print(u.utf8_2_gbk('IndexError: list index out of range'))
    print(u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空'))
    print('==============================================================')
    raw_input('Press Enter to exit...')
finally:
    # total.csv is scratch space only. It may not exist if the failure
    # happened before it was created, so check before removing it.
    if os.path.exists('total.csv'):
        os.remove('total.csv')
# Append one label column to every source row: rows whose COLUNM-th
# comma-separated field matches a label regex get that label's key, all other
# rows get an empty field. Then write the labelled rows and the per-label
# hit counts.
#
# NOTE(review): labelWordp, i, keyWordCount, columnName, COLUNM, LABELWORD and
# RESULT_FILE come from outside this fragment; statement nesting was
# recovered from a whitespace-collapsed line.
head = linecache.getline(u.utf8_2_gbk(SOURCE_FILE), 1).strip()
# Number of comma-separated fields in the first line of the source file.
# A row with exactly one more field than this already carries a label.
TOTALCOLUNM = len(head.split(','))
print(u.utf8_2_gbk('标签词个数:') + u.printDictLen(labelWordp))
source_file_body = u.create_file_body(SOURCE_FILE)

for key, value in labelWordp.items():
    i += 1
    print(u.utf8_2_gbk('当前执行到{0}个'.format(i)))
    # Compile once per label instead of once per row.
    label_re = re.compile(value)
    for num, line in enumerate(source_file_body):
        data = line.strip().split(',')
        if len(data) == TOTALCOLUNM + 1:
            # Already labelled by an earlier key; the first match wins.
            continue
        if label_re.match(data[COLUNM - 1]):
            source_file_body[num] = line.strip() + ',' + key + '\n'
            keyWordCount[key] = keyWordCount.get(key, 0) + 1

# Pad the format: give every still-unlabelled row an empty label field so
# all rows end up with the same number of columns.
for num, line in enumerate(source_file_body):
    if len(line.strip().split(',')) == TOTALCOLUNM + 1:
        continue
    source_file_body[num] = line.strip() + ',' + '\n'

result_file_head = u.create_file_head(SOURCE_FILE, 'right', [u.gbk_2_utf8(columnName)])
u.create_result_file(RESULT_FILE, result_file_head, source_file_body)
KEYWORD_FILE = LABELWORD.split('.')[0] + '统计.txt'
u.writeDictFile(KEYWORD_FILE, keyWordCount)  # write out the statistics
# Group source rows by their extracted keywords and write the grouped rows
# plus the size of every group.
#
# For each row, `pattern` is stripped from the content returned by
# u.create_content(line, COLUMN) and jieba extracts keywords from the rest
# (topK=2 when the content is at most 20 long, TOPKET otherwise). The sorted
# keywords joined by ',' form the cluster key.
#
# NOTE(review): cluster, count_file_dict, result_file_body, source_file_body,
# source_file_head and pattern come from outside this fragment. Statement
# nesting was recovered from a whitespace-collapsed line; it is not
# recoverable whether the raw_input() pause originally ran only on failure --
# it is kept inside the handler here.
try:
    print(u.utf8_2_gbk('开始执行聚类'))
    for num, line in enumerate(source_file_body):
        content = re.sub(pattern, '', u.create_content(line, COLUMN))
        top_k = 2 if len(content) <= 20 else TOPKET
        keywords = jieba.analyse.extract_tags(content, topK=top_k)
        keywords.sort()
        key = ','.join(keywords)
        # Each cluster value is a string "0,<row>,<row>,...": the leading 0
        # is the .get() default and the rows are 1-based row numbers.
        cluster[key] = str(cluster.get(key, 0)) + ',' + str(num + 1)

    print(u.utf8_2_gbk('聚类完成,生成输出文件'))
    for num, value in enumerate(cluster.itervalues()):
        # value[2:] drops the leading "0," seed described above.
        cluster_list = value[2:].split(',')
        count_file_dict[num] = len(cluster_list)
        for n in cluster_list:
            # Prefix every member row with its cluster number.
            result_file_body.append(str(num) + ',' + source_file_body[int(n) - 1])

    u.create_result_file(u.changeFileName(SOURCENAME, '-聚类.csv'), source_file_head, result_file_body)
    u.writeDictFile(u.changeFileName(SOURCENAME, '-聚类统计.txt'), count_file_dict, 1)
except Exception:
    traceback.print_exc()
    print('==============================================================')
    print(u.utf8_2_gbk('运行出错'))
    print(u.utf8_2_gbk('常见错误'))
    print(u.utf8_2_gbk('IndexError: list index out of range'))
    print(u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空'))
    print('==============================================================')
    raw_input('Press Enter to exit...')