def autoExtractSheetHead(self, configuremodel): storepath = configuremodel.storepath # 所有需要合并的表格数据 exceldatalist = JSExcelHandler.getPathFromRootFolder(storepath) excelhandler = JSExcelController() excelhandler.getPathFromRootFolder() print("开始搜索表头")
def __init__(self, master, controller): my_path = os.path.dirname(os.path.realpath(sys.argv[0])) my_path = os.path.join(my_path, 'ETLFile') self.result = os.path.join(my_path, 'result') self.delegate = controller self.master = master # self.master.config(bg='blue') self.face = tkinter.Frame(self.master, ) self.face.pack() # 主窗口 # 创建根据标准表头进行合并数据的按钮 self.ETLButton = tkinter.Button(self.face, command=self.ETLButtonHandler, text="执行校验") self.ETLButton.grid(row=5, column=2) self.datatext = tkinter.Label(self.face, text='校验文件路径', bd=4) self.datatext.grid(row=0, column=1) # self.datalabel = tkinter.Entry(self.face, width=80) # self.datalabel.grid(row=0, column=2) # self.datalabel.insert(END, result) self.headstext = tkinter.Label(self.face, text='身份证校验字段', bd=4) self.headstext.grid(row=1, column=1) self.headslabel = tkinter.Entry(self.face, width=80) self.headslabel.grid(row=1, column=2) self.headslabel.insert(END, '身份证') self.validrangetext = tkinter.Label(self.face, text='空值校验字段', bd=4) self.validrangetext.grid(row=3, column=1) self.validrangelabel = tkinter.Entry(self.face, width=80) self.validrangelabel.grid(row=3, column=2) self.validrangelabel.insert(END, '单位,金额') self.datetext = tkinter.Label(self.face, text='统一时间列', bd=4) self.datetext.grid(row=4, column=1) self.datelabel = tkinter.Entry(self.face, width=80) self.datelabel.grid(row=4, column=2) self.datelabel.insert(END, '入职时间') self.btn_back = tkinter.Button(self.face, text='返回批处理界面', command=self.back) self.btn_back.grid(row=6, column=2) self.comboxlist = ttk.Combobox(self.face, width= 75, postcommand = self.pathCallback) # 初始化 self.comboxlist["values"] = JSExcelHandler.getPathFromRootFolder(self.result) # self.comboxlist.current(0) # 选择第一个 # self.comboxlist.bind("<<ComboboxSelect>>",callbackFunc) # 绑定事件,(下拉列表框被选中时,绑定go()函数) self.comboxlist.grid(row=0, column=2)
def ReadDataThenCompareAndExtract(self, configuremodel): # 存放表头的路径 datapath = configuremodel.datapath # 读取所有数据文件 datafilelist = JSExcelHandler.getPathFromRootFolder(datapath) # 类型是 map headsmaplist = self.readStandardHeadFromFolder(configuremodel) for path in datafilelist: for df in headsmaplist.values(): # 获取模板表头的行数,用于数据表中获取表头范围 totalrows = len(df.index) # 根据标准表头获取数据里的表头进行比对 datadf = pd.read_excel(path, nrows=totalrows) if datadf.equals(df): print('is the same')
def readStandardHeadFromFolder(self, configuremodel): headspath = configuremodel.headspath excellist = JSExcelHandler.getPathFromRootFolder(headspath) dfmaplist = {} # 从标准头路径中读取标准表头的格式,为比对做准备 for index, excelpath in enumerate(excellist): newdf = pd.read_excel(str(excelpath)) # '/Users/sun/Desktop/heads/head3.xlsx' if len(dfmaplist) > 0: for olddf in dfmaplist.values(): if newdf.equals(olddf) is True: break else: # 以路径来作为唯一标识,因为路径是唯一的 dfmaplist[excelpath] = newdf return dfmaplist
def readStandardHeadFromFolder(self, configuremodel): """ 从标准表头存放路径遍历出所有的模板,生成 文件名:模板 dataframe 的 map :param configuremodel: view 层的 model :return: 标准表头文件名 : 标准表头 dataframe """ headspath = configuremodel.headspath headlist = [] headlist = JSExcelHandler.getPathFromRootFolder(headspath, headlist) dfmaplist = {} dflist = list(dfmaplist.keys()) # 从标准头路径中读取标准表头的格式,为比对做准备 for excelpath in headlist: filename = os.path.split(excelpath)[-1] startrow = 0 if filename in configuremodel.validrange: startrow = int(configuremodel.validrange[filename]) # try: # path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow, header=None # titledf = pd.read_excel(path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow, header=None) newdf = pd.read_excel(str(excelpath), skiprows=startrow, header=None) # newdf.fillna(method='pad', inplace=True) # except Exception as e: # JSExcelHandler.errorlog("获取标准表头-{}".format(excelpath)) # pass if len(dflist) > 0: for olddf in dflist: if newdf.equals(olddf) == True: break else: dfmaplist[excelpath] = newdf dflist = list(dfmaplist.keys()) else: dfmaplist[excelpath] = newdf dflist = list(dfmaplist.values()) return dfmaplist
#!/usr/bin/env python # coding=utf-8 # ======================================== # # Time: 3/26/21 12:31 PM # Author: Sun # Software: PyCharm # Description: # # # ======================================== from Tools.JSExcelHandler import JSExcelHandler import pandas as pd if __name__ == '__main__': list = JSExcelHandler.getPathFromRootFolder(r'C:\Users\sjj-001\Desktop\2021年处理数据源集合\未处理\2020年市级收支月报') reslist = [] for path in list: filename, suffix = JSExcelHandler.SplitPathReturnNameAndSuffix(path) df = pd.read_excel(path, sheet_name='YB01', skiprows=3) df = df.fillna('') sum = [] for index , row in df.iterrows(): str = row[1 list = [row[0], str, row[2], filename] sum.append(list) for index , row in df.iterrows(): str = row[4] list = [row[3], str, row[5], filename] sum.append(list) ttt = pd.DataFrame(sum, columns=['科目编码', '科目名称', '金额', '来源'])
def autoExtractSheetHead(self, configuremodel): """ 自动根据数据提取表头 :param configuremodel: 读取 view 层的 model """ print("开始抽取降范式的模板表头") datapath = configuremodel.datapath # 所有需要合并的表格数据 exceldatalist = JSExcelHandler.getPathFromRootFolder(datapath) dflist = [] for path in exceldatalist: try: readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path) for sheetname in sheetnames: rSheet = readOpenXlsx.sheet_by_name(sheetname) # mergecells = rSheet.merged_cells resultCells = rSheet.merged_cells mergecells = [] # 添加扫描的行范围,默认为前 8 行进行表头的搜寻 rangerow = 8 # 根据原表中的 cellrange 确定合并单元格的范围, 筛选出 header 部分有合并单元格的情况 for index, cell in enumerate(resultCells): crlow, crhign, cclow, cchigh = cell if crlow < rangerow: mergecells.append(cell) # 默认第一行为表头 没有合并单元格的情况 if len(mergecells) == 0: headrow = 1 else: levelmap = self.createLevelMap(mergecells, rSheet) # print(path) dummy = self.createLevelTree(levelmap, rSheet) # 根据层级来生成最终的 valuelist, keywords # self.travel(dummy, valuelist, keywords, maxlevel) # print(keywords) maxlevel = max(levelmap.keys()) # 这里应该比较所有叶子节点的下边界 lowestnodes = levelmap[maxlevel] # print(lowestnodes) headrow = 0 for node in lowestnodes: if len(node.child) > 0: headrow = maxlevel + 2 # break rlow, rhign, clow, chigh = node.cellrange if (rhign > headrow): headrow = rhign # 找到每个 sheet 的表头范围进行和之前的对比 TODO 这里可以对提取出来的表头进行处理 curdf = pd.read_excel(path, sheetname, nrows=headrow, header=None) # 每个进行遍历对比,完全不同则添加到 list 中 flag = True for olddf in dflist: if curdf.equals(olddf): # JSExcelHandler.pathlog(path) flag = False pass # 第一次 if len(dflist) == 0: flag = True if flag: dflist.append(curdf) filename, suffix = JSExcelHandler.SplitPathReturnNameAndSuffix(path) # 这里要保存为 xlsx 为了兼容合并单元格的功能 objectpath = "{}/{}_{}.{}".format(configuremodel.headspath, filename, sheetname, 'xlsx') self.restoreHead(path, sheetname, objectpath, curdf, headrow) except Exception as e: print(str(e)) JSExcelHandler.errorlog("自动根据数据提取表头,原数据文件有问题-{}".format(path)) pass print("抽取模板表头完成")
def ReadDataThenCompareAndExtract(self, configuremodel): print("开始抽取合并数据 %s" % datetime.now()) # 存放表头的路径 datapath = configuremodel.datapath # 读取所有数据文件 datafilelist = [] datafilelist = JSExcelHandler.getPathFromRootFolder(datapath, datafilelist) if len(datafilelist) == 0: messagebox.showinfo("message", "未找到原始数据") else: # 存放合并结果的路径 resultfilepath = configuremodel.storepath # 类型是 map {路径 : df}, 这里是按照设置的起始行读取标准模板表头,默认是 0 headsmaplist = self.readStandardHeadFromFolder(configuremodel) if headsmaplist == {}: messagebox.showinfo("message", "未找到模板表头") else: for dfpath in headsmaplist.keys(): sum = [] # 拼接结果文件路径, 用 os.path win 和 linux 会有自己判断 filename = os.path.split(dfpath)[-1] startrow = 0 rangemap = configuremodel.validrange # 配置模板的起始降范式位置 if rangemap != {}: if filename in configuremodel.validrange: startrow = int(configuremodel.validrange[filename]) newfile = os.path.join(resultfilepath, filename) ''' 这里是为所有的标准模板预先建立数据抽取的结果文件 ''' try: # 降低表头的范式 valuelist, lastrows, keywords = self.lowerDimensionOfTitle(dfpath, startrow) # 创建文件及表头文件到result文件夹下为抽取该模板做准备 newfile, newsheetname = self.newfilesave(dfpath, newfile, valuelist, startrow) except Exception as e: JSExcelHandler.errorlog("降范式并且新建表头文件未抽取数据做准备-{}".format(dfpath)) pass # 遍历目标数据文件下所有 sheet 是否与标准模板匹配,如果匹配则进行数据抽取合并操作 # 对文件进行分组合并 # def add(work, lock): # for path in work: # print(path) # try: # readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path) # for sheetname in sheetnames: # # print("校验模板表:" + str(dfpath) + "\n与数据表:" + path + '下工作表sheetname:' + sheetname + "对比") # # 这里不能单纯的将标题范围下的数据一并写入,要重新建立按 keywords 排列顺序的 dataframe # # 获取模板表头的行数,用于数据表中获取表头范围 # totalrows = headsmaplist[dfpath].shape[0] # # 根据标准表头获取数据里的表头进行比对 # titledf = pd.read_excel(path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow, # header=None) # # 这里要进行最下层 node 的比对 # # if titledf.equals(headsmaplist[dfpath]): # # 抽取模块headsmaplist, 并对匹配上的表进行记录 # JSExcelHandler.excuteCheck("{} ---- {}".format(dfpath, path)) # # 抽取数据 dafaframe # df = pd.read_excel(path, sheetname, skiprows=lastrows, header=None, dtype='str') # filename = os.path.split(path)[-1] # df[r'来源文件'] = filename # lock.acquire() # # self.apendDataFrame(df, newfile, newsheetname, False) # sum.append(df) # lock.release() # # except Exception as e: # print(str(e)) # JSExcelHandler.errorlog(r"遍历目标数据文件下所有 sheet 与标准模板匹配后追加数据-{}".format(path)) # # threads = [] # lock = threading.Lock() # for i in range(0, len(datafilelist), 5): # work = datafilelist[i:i + 5] # # results.append(None) # enlarge the results list, so we have room for this thread's result # t = threading.Thread(target=add, args=(work, lock )) # t.start() # threads.append(t) # # for t in threads: # t.join() for path in datafilelist: print(path) try: readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path) for sheetname in sheetnames: # print("校验模板表:" + str(dfpath) + "\n与数据表:" + path + '下工作表sheetname:' + sheetname + "对比") # 这里不能单纯的将标题范围下的数据一并写入,要重新建立按 keywords 排列顺序的 dataframe # 获取模板表头的行数,用于数据表中获取表头范围 totalrows = headsmaplist[dfpath].shape[0] # 根据标准表头获取数据里的表头进行比对 titledf = pd.read_excel(path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow, header=None) # titledf.fillna(method='pad', inplace=True) # 这里要进行最下层 node 的比对 if titledf.equals(headsmaplist[dfpath]): # 抽取模块headsmaplist, 并对匹配上的表进行记录 JSExcelHandler.excuteCheck("{} ---- {}".format(dfpath, path)) # 抽取数据 dafafram df = pd.read_excel(path, sheetname, skiprows=lastrows, header=None) filename = os.path.split(path)[-1] df[r'来源文件'] = '''%s|%s''' % (filename, sheetname) sum.append(df) except Exception as e: JSExcelHandler.errorlog("降范式并且新建表头文件未抽取数据做准备-{}".format(dfpath)) pass res = pd.DataFrame([]) for tmpdf in sum: res = res.append(tmpdf) self.apendDataFrame(res, newfile, newsheetname, False) print("合并数据完成 %s" % datetime.now()) messagebox.showinfo("message", "合并完成\n输出路径为:%s" % (configuremodel.storepath))
def pathCallback(self): datalist = JSExcelHandler.getPathFromRootFolder(self.result) self.comboxlist["values"] = datalist
dfmaplist[excelpath] = newdf return dfmaplist if __name__ == '__main__': # # 配置文件路径 # path = os.path.abspath('..') + "/configureFile.txt" # 表示当前所处的文件夹上一级文件夹的绝对路径 # controller = JSETLController() # # View = JSConfigureView(path) # # view 层读取配置文件 # View.readConfigureFile() # # # Controller 调度执行读取模板表头文件 | 根据数据文件自动识别表头 # controller = JSETLController() # # 根据标准表头进行比对和抽取合并数据 # controller.ReadDataThenCompareAndExtract(View.configuremodel) path = '/Users/sun/Desktop/test/' list = JSExcelHandler.getPathFromRootFolder(path) controller = JSETLController() for path in list: readOpenXls, sheetnames, workpath = JSExcelHandler.OpenXls(path) for sheetname in sheetnames: rSheet = readOpenXls.sheet_by_name(sheetname) controller.findMinMergeCellRange(rSheet)
#!/usr/bin/env python # coding=utf-8 # ======================================== # # Time: 3/26/21 12:31 PM # Author: Sun # Software: PyCharm # Description: # # # ======================================== from Tools.JSExcelHandler import JSExcelHandler import pandas as pd if __name__ == '__main__': list = JSExcelHandler.getPathFromRootFolder(r'C:\Users\sjj-001\Desktop\AUTOETLTEST\data') reslist = [] for path in list: df = pd.read_excel(path, sheet_name='YB01',skiprows=3) df = df.fillna('') sum = [] for index , row in df.iterrows(): str = row[1] list = [str, row[2], str.count(' ')] sum.append(list) for index , row in df.iterrows(): str = row[4] list = [str, row[5], str.count(' ')] sum.append(list) ttt = pd.DataFrame(sum, columns=['科目名称', '数值', 'count']) reslist.append(ttt)