コード例 #1
0
ファイル: lowerTitle.py プロジェクト: Paprika0120/AUTOETL
#
# Time: 3/26/21 12:31 PM
# Author: Sun
# Software: PyCharm
# Description:
#
#
# ========================================
from Tools.JSExcelHandler import JSExcelHandler
import pandas as pd

if __name__ == '__main__':
    list = JSExcelHandler.getPathFromRootFolder(r'C:\Users\sjj-001\Desktop\2021年处理数据源集合\未处理\2020年市级收支月报')
    reslist = []
    for path in list:
        filename, suffix = JSExcelHandler.SplitPathReturnNameAndSuffix(path)
        df = pd.read_excel(path, sheet_name='YB01', skiprows=3)
        df = df.fillna('')
        sum = []
        for index , row in df.iterrows():
            str = row[1
            list = [row[0], str, row[2], filename]
            sum.append(list)
        for index , row in df.iterrows():
            str = row[4]
            list = [row[3], str, row[5], filename]
            sum.append(list)
        ttt = pd.DataFrame(sum, columns=['科目编码', '科目名称', '金额', '来源'])
        reslist.append(ttt)

コード例 #2
0
    def autoExtractSheetHead(self, configuremodel):
        """
        自动根据数据提取表头
        :param configuremodel: 读取 view 层的 model
        """
        print("开始抽取降范式的模板表头")
        datapath = configuremodel.datapath
        # 所有需要合并的表格数据
        exceldatalist = JSExcelHandler.getPathFromRootFolder(datapath)
        dflist = []
        for path in exceldatalist:
            try:
                readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path)
                for sheetname in sheetnames:
                    rSheet = readOpenXlsx.sheet_by_name(sheetname)
                    # mergecells = rSheet.merged_cells
                    resultCells = rSheet.merged_cells
                    mergecells = []
                    # 添加扫描的行范围,默认为前 8 行进行表头的搜寻
                    rangerow = 8
                    # 根据原表中的 cellrange 确定合并单元格的范围, 筛选出 header 部分有合并单元格的情况
                    for index, cell in enumerate(resultCells):
                        crlow, crhign, cclow, cchigh = cell
                        if crlow < rangerow:
                            mergecells.append(cell)
                    # 默认第一行为表头 没有合并单元格的情况
                    if len(mergecells) == 0:
                        headrow = 1
                    else:
                        levelmap = self.createLevelMap(mergecells, rSheet)
                        # print(path)
                        dummy = self.createLevelTree(levelmap, rSheet)
                        # 根据层级来生成最终的 valuelist, keywords
                        # self.travel(dummy, valuelist, keywords, maxlevel)
                        # print(keywords)
                        maxlevel = max(levelmap.keys())
                        # 这里应该比较所有叶子节点的下边界
                        lowestnodes = levelmap[maxlevel]
                        # print(lowestnodes)
                        headrow = 0
                        for node in lowestnodes:
                            if len(node.child) > 0:
                                headrow = maxlevel + 2
                                # break
                            rlow, rhign, clow, chigh = node.cellrange
                            if (rhign > headrow):
                                headrow = rhign


                    # 找到每个 sheet 的表头范围进行和之前的对比 TODO 这里可以对提取出来的表头进行处理
                    curdf = pd.read_excel(path, sheetname, nrows=headrow, header=None)
                    # 每个进行遍历对比,完全不同则添加到 list 中
                    flag = True
                    for olddf in dflist:
                        if curdf.equals(olddf):
                            # JSExcelHandler.pathlog(path)
                            flag = False
                            pass
                    # 第一次
                    if len(dflist) == 0:
                        flag = True
                    if flag:
                        dflist.append(curdf)
                        filename, suffix = JSExcelHandler.SplitPathReturnNameAndSuffix(path)
                        # 这里要保存为 xlsx 为了兼容合并单元格的功能
                        objectpath = "{}/{}_{}.{}".format(configuremodel.headspath, filename, sheetname, 'xlsx')
                        self.restoreHead(path, sheetname, objectpath, curdf, headrow)
            except Exception as e:
                print(str(e))
                JSExcelHandler.errorlog("自动根据数据提取表头,原数据文件有问题-{}".format(path))
                pass
        print("抽取模板表头完成")