def createPattern(str, accurate):
    # Build the regex matching rule.
    #
    # `str` is treated as a comma-separated list of alternatives:
    #   * accurate truthy  -> every alternative is wrapped as ^alt$
    #   * accurate falsy   -> every alternative is wrapped as .*alt.*
    # The alternatives are joined with '|'.  The text is inserted into the
    # pattern unescaped, so regex metacharacters in it keep their regex
    # meaning.  Returns the pattern as a string (not compiled).
    if accurate:
        result = '^' + str.replace(',', '$|^') + '$'
    else:
        result = '.*' + str.replace(',', '.*|.*') + '.*'
    return result


if __name__ == '__main__':
    try:
        # Pick the label file (txt) and the data source file (csv).
        LABEL_FILE, LABEL_PATH = u.getFirstFile('txt')
        SOURCE_FILE, SOURCE_PATH = u.getFirstFile('csv')
        print u.utf8_2_gbk('打标签文件:' + LABEL_FILE)
        print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE)
        source_file_body = u.create_file_body(SOURCE_PATH)
        # Normalise every row in place: strip surrounding whitespace,
        # lower-case it, then append a trailing comma and a newline.
        for num, line in enumerate(source_file_body):
            source_file_body[num] = line.strip().lower() + ',' + '\n'
        labelType, labelNum = createMoreMatch(LABEL_PATH)
        # The header call is given the label file name without its extension.
        matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
        print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')
        for key, value in labelType.items():
            # NOTE(review): `count` is not initialised anywhere in the visible
            # code - presumably a module-level counter defined above; confirm.
            count += 1
            print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
            words = value.strip().split('|')
            # A label whose value has a single '|'-separated part.
            if len(words) == 1:
                c = createPattern(words[0], ACCURATE)
                p = re.compile(c)
                for num, line in enumerate(source_file_body):
                    # NOTE(review): `COLUNM` (sic) is not defined in the
                    # visible code - presumably a module constant; confirm.
                    content = u.create_content(line, COLUNM)
                    # The visible chunk ends here; the body of this `if`
                    # lies outside the reviewed source.
                    if p.match(content):
#!/usr/bin/env python
# coding=utf-8
"""Group the rows of a CSV file by one column and count each group.

The header produced by ``u.create_file_head`` is written first, followed by
one ``<group value>,<count>`` line per distinct value of the grouped column.
"""
import re
import sys
import os
import util as u

############## Parameters ###############
SOURCE_FILE = 'hebing.csv'  # input file
RESULT_FILE = 'result.csv'  # output file
COLUMN = 1  # column to group by and count
#########################################

# Maps each distinct value of the grouped column to its number of rows.
count_dict = {}

if __name__ == '__main__':
    source_file_head = u.create_file_head(SOURCE_FILE, 'right', ['次数'])
    source_file_body = u.create_file_body(SOURCE_FILE)

    # Tally how many rows share each value of the grouped column.
    for line in source_file_body:
        content = u.create_content(line, COLUMN)
        count_dict[content] = count_dict.get(content, 0) + 1

    # `with` closes the output even if a write fails; `open` replaces the
    # Python-2-only `file` builtin.
    with open(RESULT_FILE, 'w+') as result_file:
        result_file.write(source_file_head)
        for key, value in count_dict.items():
            result_file.write(key + ',' + str(value) + '\n')
def rm_repeat(file_list):
    """Return the distinct ``u.create_content(line, 1)`` values of *file_list*.

    Double quotes are removed from every value and a newline is appended, so
    the result can be passed straight to ``writelines``.  The values pass
    through a set, so their order in the returned list is not defined.
    """
    rm_set = set()
    for line in file_list:
        content = u.create_content(line, 1)
        rm_set.add(content.replace('"', '') + '\n')
    return list(rm_set)


if __name__ == '__main__':
    file_list = u.GetFileList(u.utf8_2_gbk(FILE_PATH), [])
    for f in file_list:
        file_dict[f.encode('gbk')] = rm_repeat(
            u.create_file_body(f.encode('utf-8')))

    # Stage 1: dump every file's de-duplicated values into one scratch file.
    # `with` closes each handle even when an error occurs.
    with open('total.csv', 'w+') as total_file:
        for value in file_dict.values():
            total_file.writelines(value)

    # Stage 2: tally the scratch file.  Each input file contributes a value
    # at most once, so the tally is presumably the number of files that
    # contain the value.  This handle was never closed originally.
    with open('total.csv', 'rb') as count_file:
        for line in count_file:
            content = u.create_content(line, 1)
            count_dict[content] = count_dict.get(content, 0) + 1

    # Stage 3: write one "<value>\t<count>" line per distinct value.
    with open(RESULT_FILE, 'w+') as result_file:
        for key, value in count_dict.items():
            result_file.write(key + '\t' + str(value) + '\n')
# NOTE(review): this loop opens the visible chunk and reads `res`, which is
# not defined in the visible code.  It is presumably the tail of a function
# (likely `factory`, called below) whose header lies outside this view -
# confirm the enclosing scope and indentation before relying on this layout.
for key, value in res.items():
    # Drop the first two characters of the value, then split the rest on
    # commas into row numbers (kept as strings).
    rowNum_list = value[2:].split(",")
    # Groups with at least NUMBER row numbers go to remove_list,
    # all other groups go to save_list.
    if len(rowNum_list) >= NUMBER:
        for num in rowNum_list:
            remove_list.append(num)
    else:
        for num in rowNum_list:
            save_list.append(num)


if __name__ == "__main__":
    try:
        print u.utf8_2_gbk('开始执行')
        result_file_head = u.create_file_head(SOURCE_FILE)  # file header
        result_file_body = u.create_file_body(SOURCE_FILE)  # file content
        factory(result_file_body)
        # Build the output files.  Row numbers are 1-based, hence the "- 1"
        # when indexing into the body list.
        for num in save_list:
            save_file_list.append(result_file_body[int(num) - 1])
        for num in remove_list:
            remove_file_list.append(result_file_body[int(num) - 1])
        print u.utf8_2_gbk(SAVE_FILE + '行数:' + str(len(save_file_list)))
        print u.utf8_2_gbk(REMOVE_FILE + '行数:' + str(len(remove_file_list)))
        # Output file for rows that meet the condition (the original note
        # says ">= 101 occurrences"; the threshold actually used is NUMBER).
        u.create_result_file(REMOVE_FILE, result_file_head, remove_file_list)
        # Output file for rows that do not meet the condition.
        u.create_result_file(SAVE_FILE, result_file_head, save_file_list)
    except:
        # Any failure is reported as a traceback; the script does not re-raise.
        traceback.print_exc()
def rm_repeat(file_list):
    # Return the distinct u.create_content(line, 1) values of file_list.
    # Double quotes are removed from each value and '\n' is appended.  The
    # values pass through a set, so the order of the returned list is not
    # defined.
    # NOTE(review): create_content(line, 1) presumably extracts column 1 of
    # the row - confirm in util.
    rm_set = set()
    for line in file_list:
        content = u.create_content(line, 1)
        rm_set.add(content.replace('"', '') + '\n')
    return list(rm_set)


if __name__ == '__main__':
    try:
        print u.utf8_2_gbk('开始执行')
        file_list = u.GetFileList(FILE_PATH, [])
        # De-duplicate each file on its own, keyed by the gbk-encoded path.
        for f in file_list:
            file_dict[f.encode('gbk')] = rm_repeat(u.create_file_body(f.encode('utf-8')))
        # Dump every file's de-duplicated values into one scratch file.
        total_file = file('total.csv', 'w+')
        for key, value in file_dict.items():
            total_file.writelines(value)
        total_file.close()
        # Re-read the scratch file and tally how often each value occurs.
        count_file = open('total.csv', 'rb')
        for line in count_file:
            content = u.create_content(line, 1)
            count_dict[content] = count_dict.get(content, 0) + 1
        count_file.close()
        u.writeDictFile(RESULT_FILE, count_dict, 1)
        print u.utf8_2_gbk('执行完毕')
        print u.utf8_2_gbk('输出文件路径:') + sys.path[0] + u.utf8_2_gbk('\\' + RESULT_FILE)
        # The visible chunk ends here; the except/finally clause that closes
        # this `try` lies outside the reviewed source.
# coding=utf-8
"""Merge the files found under a given path into a single CSV file.

The header is taken from one designated file; the bodies of all files
returned by ``u.GetFileList`` are appended after it.
"""
import sys
import re
import util as u
import os

################## Parameters #################
FILE_PATH = r'G:\merge_n_file\data'  # path of the files to merge
RESULT_FILE = 'result.csv'  # merged output file
RESULT_HEAD = r'G:\merge_n_file\data\3779_利鑫-999道私房菜.csv'  # file whose header is used for the merged file
###############################################

# Maps each (gbk-encoded) file path to the list of body lines read from it.
file_dict = {}

if __name__ == '__main__':
    file_list = u.GetFileList(u.utf8_2_gbk(FILE_PATH), [])
    for f in file_list:
        file_dict[f.encode('gbk')] = u.create_file_body(f.encode('utf-8'))

    # Read the header before opening the output, so a failure here does not
    # leave behind a truncated result file.
    result_file_head = u.create_file_head(RESULT_HEAD)

    # `with` closes the output even if a write fails; `open` replaces the
    # Python-2-only `file` builtin.
    with open(RESULT_FILE, 'w+') as result_file:
        result_file.write(result_file_head)
        for value in file_dict.values():
            result_file.writelines(value)