def createContent(fileName, rows):
    """Read *fileName* (name converted to GBK), skip the header line, and
    collect the `rows`-th column of every data line.

    Returns [list_of_values, data_line_count]; each value is lower-cased
    with ',\n' appended (matching the file's CSV-row convention).
    """
    result = []
    count = 0
    # Fix: `with` guarantees the handle is closed even if a line fails to
    # parse (the original leaked it on exception); `next(f)` replaces the
    # Python-2-only `f.next()` spelling while behaving identically here.
    with open(u.utf8_2_gbk(fileName), 'rb') as f:
        next(f)  # skip the header row
        for line in f:
            count += 1
            result.append(u.create_content(line, rows).lower() + ',' + '\n')
    return [result, count]
def factory(body_list):
    """Group the rows of *body_list* by their COLUMN-th field and sort the
    1-based row numbers into the module-level lists: groups with at least
    NUMBER members go to `remove_list`, smaller groups to `save_list`.

    Accumulates into the module-level `res` dict; returns nothing.
    """
    count = 0
    # BUG FIX: the original iterated the global `result_file_body` and
    # silently ignored its `body_list` parameter.
    for line in body_list:
        count += 1
        content = u.create_content(line, COLUMN)
        # res[content] grows as "0,<row>,<row>,..." — a crude CSV of row
        # numbers seeded by the default 0 on first sight of the key.
        res[content] = str(res.get(content, 0)) + "," + str(count)
    for key, value in res.items():
        rowNum_list = value[2:].split(",")  # drop the leading "0," seed
        if len(rowNum_list) >= NUMBER:
            for num in rowNum_list:
                remove_list.append(num)
        else:
            for num in rowNum_list:
                save_list.append(num)
# Announce the source data file (console output is GBK, hence the conversion).
print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE)
source_file_body = u.create_file_body(SOURCE_PATH)
# Normalise every row in place: strip whitespace, lower-case, re-append ',\n'.
for num, line in enumerate(source_file_body):
    source_file_body[num] = line.strip().lower() + ',' + '\n'
# labelType maps label -> pattern spec ('word' or 'include|exclude');
# labelNum is the label count — presumably; createMoreMatch is defined
# elsewhere in this file (TODO confirm).
labelType, labelNum = createMoreMatch(LABEL_PATH)
matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')
for key, value in labelType.items():
    # NOTE(review): `count`, `keyWordCount`, `COLUNM` and `ACCURATE` are
    # assumed to be initialised elsewhere in the file (not visible here).
    count += 1
    print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
    words = value.strip().split('|')
    if len(words) == 1:
        # Single pattern: tag every row whose COLUNM field matches it.
        c = createPattern(words[0], ACCURATE)
        p = re.compile(c)
        for num, line in enumerate(source_file_body):
            content = u.create_content(line, COLUNM)
            if p.match(content):
                # Append the 'key|' tag to the matched row and count the hit.
                source_file_body[num] = source_file_body[num].strip() + key + '|' + '\n'
                keyWordCount[key] = keyWordCount.get(key, 0) + 1
    if len(words) == 2:
        # Two patterns: tag rows matching the first but NOT the second
        # (include/exclude pair).
        c = createPattern(words[0], ACCURATE)
        f = createPattern(words[1], ACCURATE)
        cp = re.compile(c)
        fp = re.compile(f)
        for num, line in enumerate(source_file_body):
            content = u.create_content(line, COLUNM)
            if cp.match(content) and not fp.match(content):
                source_file_body[num] = source_file_body[num].strip() + key + '|' + '\n'
                keyWordCount[key] = keyWordCount.get(key, 0) + 1
# Write the tagged body out under a name derived from source + label file.
u.create_result_file(u.setFileName(SOURCE_FILE, LABEL_FILE), matchHead, source_file_body)
#!/usr/bin/env python
# coding=utf-8
# NOTE(review): the original shebang was "#!/bin/bash", which would make
# the OS hand this Python script to bash — fixed to invoke Python.
import re
import sys
import os

import util as u

'''
Purpose: group rows by one column, count the size of each group, and
write one output file with the counts.
'''

# ############## Parameters ###############
SOURCE_FILE = 'hebing.csv'  # input file
RESULT_FILE = 'result.csv'  # output file
COLUMN = 1                  # column to group by and count
# #########################################

count_dict = {}

if __name__ == '__main__':
    source_file_head = u.create_file_head(SOURCE_FILE, 'right', ['次数'])
    source_file_body = u.create_file_body(SOURCE_FILE)
    # Tally occurrences of each distinct value in the chosen column.
    for line in source_file_body:
        content = u.create_content(line, COLUMN)
        count_dict[content] = count_dict.get(content, 0) + 1
    # Fix: open() instead of the Py2-only file() builtin, and a with-block
    # so the handle is flushed/closed even on error.
    with open(RESULT_FILE, 'w+') as result_file:
        result_file.write(source_file_head)
        for key, value in count_dict.items():
            result_file.write(key + ',' + str(value) + '\n')
def rm_repeat(file_list):
    """Return the distinct column-1 values of *file_list*, each with
    double quotes removed and a trailing newline appended."""
    unique_rows = {
        u.create_content(row, 1).replace('"', '') + '\n'
        for row in file_list
    }
    return list(unique_rows)
def rm_repeat(file_list):
    """Return the distinct column-1 values of *file_list*, quotes removed
    and a newline appended to each."""
    rm_set = set()
    for line in file_list:
        content = u.create_content(line, 1)
        rm_set.add(content.replace('"', '') + '\n')
    return list(rm_set)


if __name__ == '__main__':
    file_list = u.GetFileList(u.utf8_2_gbk(FILE_PATH), [])
    for f in file_list:
        # NOTE(review): key is GBK-encoded while the body is read via the
        # UTF-8 name — presumably matching what util expects for each;
        # confirm against util.GetFileList / util.create_file_body.
        file_dict[f.encode('gbk')] = rm_repeat(
            u.create_file_body(f.encode('utf-8')))
    # Fix: open() instead of the Py2-only file() builtin, and with-blocks
    # so every handle is closed. The original never closed `count_file`,
    # which makes the os.remove() below fail on Windows.
    with open('total.csv', 'w+') as total_file:
        for key, value in file_dict.items():
            total_file.writelines(value)
    with open('total.csv', 'rb') as count_file:
        for line in count_file:
            content = u.create_content(line, 1)
            count_dict[content] = count_dict.get(content, 0) + 1
    with open(RESULT_FILE, 'w+') as result_file:
        for key, value in count_dict.items():
            result_file.write(key + '\t' + str(value) + '\n')
    os.remove('total.csv')
def createPattern(fileName): content = '' f = open(u.utf8_2_gbk(fileName)) for line in f: if line[:-1].strip(): content += '|' + '.*' + line.strip() + '.*' f.close() return content[1:].lower() if __name__ == '__main__': source_file_body = u.create_file_body(SOURCE_FILE) source_file_head = u.create_file_head(SOURCE_FILE) m = createPattern(MATCH_FILE) print m + '===>>' + u.utf8_2_gbk('若乱码,匹配词文件请使用gbk编码') p = re.compile(m) print u.utf8_2_gbk('数据源文件行数:') + str(len(source_file_body)) for line in source_file_body: content = u.create_content(line, COLUMN).lower() if p.match(content): result_list.append(line) else: remove_list.append(line) print u.utf8_2_gbk('不包含关键词行数:') + str(len(remove_list)) print u.utf8_2_gbk('包含关键词行数:') + str(len(result_list)) u.create_result_file(u.changeFileName(SOURCE_FILE, '-含关键词.csv'), source_file_head, result_list) u.create_result_file(u.changeFileName(SOURCE_FILE, '-不含关键词.csv'), source_file_head, remove_list) raw_input('Press Enter to exit...')
__author__ = "liangzhicheng" SOURCENAME, SOURCEPATH = u.getFirstFile('csv') cluster = {} result_file_body = [] pattern = re.compile("\w|[/.,/#@$%^& ]") count_file_dict = {} if __name__ == '__main__': try: source_file_head = u.create_file_head(SOURCEPATH, 'left', ['类型']) source_file_body = u.create_file_body(SOURCEPATH) print u.utf8_2_gbk('开始执行聚类') for num, line in enumerate(source_file_body): content = re.sub(pattern, '', u.create_content(line, COLUMN)) if len(content) <= 20: keywords = jieba.analyse.extract_tags(content, topK=2) else: keywords = jieba.analyse.extract_tags(content, topK=TOPKET) keywords.sort() key = ','.join(keywords) cluster[key] = str(cluster.get(key, 0)) + "," + str(num + 1) print u.utf8_2_gbk('聚类完成,生成输出文件') for num, value in enumerate(cluster.itervalues()): cluster_list = value[2:].split(',') count_file_dict[num] = len(cluster_list) for n in cluster_list: result_file_body.append( str(num) + ',' + source_file_body[int(n) - 1]) u.create_result_file(u.changeFileName(SOURCENAME, '-聚类.csv'),