# # Time: 3/26/21 12:31 PM # Author: Sun # Software: PyCharm # Description: # # # ======================================== from Tools.JSExcelHandler import JSExcelHandler import pandas as pd if __name__ == '__main__': list = JSExcelHandler.getPathFromRootFolder(r'C:\Users\sjj-001\Desktop\2021年处理数据源集合\未处理\2020年市级收支月报') reslist = [] for path in list: filename, suffix = JSExcelHandler.SplitPathReturnNameAndSuffix(path) df = pd.read_excel(path, sheet_name='YB01', skiprows=3) df = df.fillna('') sum = [] for index , row in df.iterrows(): str = row[1 list = [row[0], str, row[2], filename] sum.append(list) for index , row in df.iterrows(): str = row[4] list = [row[3], str, row[5], filename] sum.append(list) ttt = pd.DataFrame(sum, columns=['科目编码', '科目名称', '金额', '来源']) reslist.append(ttt)
def autoExtractSheetHead(self, configuremodel): """ 自动根据数据提取表头 :param configuremodel: 读取 view 层的 model """ print("开始抽取降范式的模板表头") datapath = configuremodel.datapath # 所有需要合并的表格数据 exceldatalist = JSExcelHandler.getPathFromRootFolder(datapath) dflist = [] for path in exceldatalist: try: readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path) for sheetname in sheetnames: rSheet = readOpenXlsx.sheet_by_name(sheetname) # mergecells = rSheet.merged_cells resultCells = rSheet.merged_cells mergecells = [] # 添加扫描的行范围,默认为前 8 行进行表头的搜寻 rangerow = 8 # 根据原表中的 cellrange 确定合并单元格的范围, 筛选出 header 部分有合并单元格的情况 for index, cell in enumerate(resultCells): crlow, crhign, cclow, cchigh = cell if crlow < rangerow: mergecells.append(cell) # 默认第一行为表头 没有合并单元格的情况 if len(mergecells) == 0: headrow = 1 else: levelmap = self.createLevelMap(mergecells, rSheet) # print(path) dummy = self.createLevelTree(levelmap, rSheet) # 根据层级来生成最终的 valuelist, keywords # self.travel(dummy, valuelist, keywords, maxlevel) # print(keywords) maxlevel = max(levelmap.keys()) # 这里应该比较所有叶子节点的下边界 lowestnodes = levelmap[maxlevel] # print(lowestnodes) headrow = 0 for node in lowestnodes: if len(node.child) > 0: headrow = maxlevel + 2 # break rlow, rhign, clow, chigh = node.cellrange if (rhign > headrow): headrow = rhign # 找到每个 sheet 的表头范围进行和之前的对比 TODO 这里可以对提取出来的表头进行处理 curdf = pd.read_excel(path, sheetname, nrows=headrow, header=None) # 每个进行遍历对比,完全不同则添加到 list 中 flag = True for olddf in dflist: if curdf.equals(olddf): # JSExcelHandler.pathlog(path) flag = False pass # 第一次 if len(dflist) == 0: flag = True if flag: dflist.append(curdf) filename, suffix = JSExcelHandler.SplitPathReturnNameAndSuffix(path) # 这里要保存为 xlsx 为了兼容合并单元格的功能 objectpath = "{}/{}_{}.{}".format(configuremodel.headspath, filename, sheetname, 'xlsx') self.restoreHead(path, sheetname, objectpath, curdf, headrow) except Exception as e: print(str(e)) JSExcelHandler.errorlog("自动根据数据提取表头,原数据文件有问题-{}".format(path)) pass print("抽取模板表头完成")