コード例 #1
0
 def autoExtractSheetHead(self, configuremodel):
     storepath = configuremodel.storepath
     # 所有需要合并的表格数据
     exceldatalist = JSExcelHandler.getPathFromRootFolder(storepath)
     excelhandler = JSExcelController()
     excelhandler.getPathFromRootFolder()
     print("开始搜索表头")
コード例 #2
0
ファイル: JSETLWindow.py プロジェクト: Paprika0120/AUTOETL
    def __init__(self, master, controller):
        my_path = os.path.dirname(os.path.realpath(sys.argv[0]))
        my_path = os.path.join(my_path, 'ETLFile')
        self.result = os.path.join(my_path, 'result')

        self.delegate = controller
        self.master = master
        # self.master.config(bg='blue')
        self.face = tkinter.Frame(self.master, )
        self.face.pack()
        # 主窗口
        # 创建根据标准表头进行合并数据的按钮
        self.ETLButton = tkinter.Button(self.face, command=self.ETLButtonHandler, text="执行校验")
        self.ETLButton.grid(row=5, column=2)

        self.datatext = tkinter.Label(self.face, text='校验文件路径', bd=4)
        self.datatext.grid(row=0, column=1)

        # self.datalabel = tkinter.Entry(self.face, width=80)
        # self.datalabel.grid(row=0, column=2)
        # self.datalabel.insert(END, result)

        self.headstext = tkinter.Label(self.face, text='身份证校验字段', bd=4)
        self.headstext.grid(row=1, column=1)
        self.headslabel = tkinter.Entry(self.face, width=80)
        self.headslabel.grid(row=1, column=2)
        self.headslabel.insert(END, '身份证')

        self.validrangetext = tkinter.Label(self.face, text='空值校验字段', bd=4)
        self.validrangetext.grid(row=3, column=1)
        self.validrangelabel = tkinter.Entry(self.face, width=80)
        self.validrangelabel.grid(row=3, column=2)
        self.validrangelabel.insert(END, '单位,金额')

        self.datetext = tkinter.Label(self.face, text='统一时间列', bd=4)
        self.datetext.grid(row=4, column=1)
        self.datelabel = tkinter.Entry(self.face, width=80)
        self.datelabel.grid(row=4, column=2)
        self.datelabel.insert(END, '入职时间')

        self.btn_back = tkinter.Button(self.face, text='返回批处理界面', command=self.back)
        self.btn_back.grid(row=6, column=2)



        self.comboxlist = ttk.Combobox(self.face, width= 75, postcommand = self.pathCallback)  # 初始化
        self.comboxlist["values"] = JSExcelHandler.getPathFromRootFolder(self.result)
        # self.comboxlist.current(0)  # 选择第一个
        # self.comboxlist.bind("<<ComboboxSelect>>",callbackFunc)  # 绑定事件,(下拉列表框被选中时,绑定go()函数)
        self.comboxlist.grid(row=0, column=2)
コード例 #3
0
 def ReadDataThenCompareAndExtract(self, configuremodel):
     # 存放表头的路径
     datapath = configuremodel.datapath
     # 读取所有数据文件
     datafilelist = JSExcelHandler.getPathFromRootFolder(datapath)
     # 类型是 map
     headsmaplist = self.readStandardHeadFromFolder(configuremodel)
     for path in datafilelist:
         for df in headsmaplist.values():
             # 获取模板表头的行数,用于数据表中获取表头范围
             totalrows = len(df.index)
             # 根据标准表头获取数据里的表头进行比对
             datadf = pd.read_excel(path, nrows=totalrows)
             if datadf.equals(df):
                 print('is the same')
コード例 #4
0
 def readStandardHeadFromFolder(self, configuremodel):
     headspath = configuremodel.headspath
     excellist = JSExcelHandler.getPathFromRootFolder(headspath)
     dfmaplist = {}
     # 从标准头路径中读取标准表头的格式,为比对做准备
     for index, excelpath in enumerate(excellist):
         newdf = pd.read_excel(str(excelpath))
         # '/Users/sun/Desktop/heads/head3.xlsx'
         if len(dfmaplist) > 0:
             for olddf in dfmaplist.values():
                 if newdf.equals(olddf) is True:
                     break
         else:
             # 以路径来作为唯一标识,因为路径是唯一的
             dfmaplist[excelpath] = newdf
     return dfmaplist
コード例 #5
0
 def readStandardHeadFromFolder(self, configuremodel):
     """
     从标准表头存放路径遍历出所有的模板,生成 文件名:模板 dataframe 的 map
     :param configuremodel: view 层的 model
     :return: 标准表头文件名 : 标准表头 dataframe
     """
     headspath = configuremodel.headspath
     headlist = []
     headlist = JSExcelHandler.getPathFromRootFolder(headspath, headlist)
     dfmaplist = {}
     dflist = list(dfmaplist.keys())
     # 从标准头路径中读取标准表头的格式,为比对做准备
     for excelpath in headlist:
         filename = os.path.split(excelpath)[-1]
         startrow = 0
         if filename in configuremodel.validrange:
             startrow = int(configuremodel.validrange[filename])
         # try:
         # path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow, header=None
         # titledf = pd.read_excel(path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow, header=None)
         newdf = pd.read_excel(str(excelpath), skiprows=startrow, header=None)
         # newdf.fillna(method='pad', inplace=True)
         # except Exception as e:
         #     JSExcelHandler.errorlog("获取标准表头-{}".format(excelpath))
         #     pass
         if len(dflist) > 0:
             for olddf in dflist:
                 if newdf.equals(olddf) == True:
                     break
                 else:
                     dfmaplist[excelpath] = newdf
                     dflist = list(dfmaplist.keys())
         else:
             dfmaplist[excelpath] = newdf
             dflist = list(dfmaplist.values())
     return dfmaplist
コード例 #6
0
ファイル: lowerTitle.py プロジェクト: Paprika0120/AUTOETL
#!/usr/bin/env python
# coding=utf-8
# ========================================
#
# Time: 3/26/21 12:31 PM
# Author: Sun
# Software: PyCharm
# Description:
#
#
# ========================================
from Tools.JSExcelHandler import JSExcelHandler
import pandas as pd

if __name__ == '__main__':
    list = JSExcelHandler.getPathFromRootFolder(r'C:\Users\sjj-001\Desktop\2021年处理数据源集合\未处理\2020年市级收支月报')
    reslist = []
    for path in list:
        filename, suffix = JSExcelHandler.SplitPathReturnNameAndSuffix(path)
        df = pd.read_excel(path, sheet_name='YB01', skiprows=3)
        df = df.fillna('')
        sum = []
        for index , row in df.iterrows():
            str = row[1
            list = [row[0], str, row[2], filename]
            sum.append(list)
        for index , row in df.iterrows():
            str = row[4]
            list = [row[3], str, row[5], filename]
            sum.append(list)
        ttt = pd.DataFrame(sum, columns=['科目编码', '科目名称', '金额', '来源'])
コード例 #7
0
    def autoExtractSheetHead(self, configuremodel):
        """
        自动根据数据提取表头
        :param configuremodel: 读取 view 层的 model
        """
        print("开始抽取降范式的模板表头")
        datapath = configuremodel.datapath
        # 所有需要合并的表格数据
        exceldatalist = JSExcelHandler.getPathFromRootFolder(datapath)
        dflist = []
        for path in exceldatalist:
            try:
                readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path)
                for sheetname in sheetnames:
                    rSheet = readOpenXlsx.sheet_by_name(sheetname)
                    # mergecells = rSheet.merged_cells
                    resultCells = rSheet.merged_cells
                    mergecells = []
                    # 添加扫描的行范围,默认为前 8 行进行表头的搜寻
                    rangerow = 8
                    # 根据原表中的 cellrange 确定合并单元格的范围, 筛选出 header 部分有合并单元格的情况
                    for index, cell in enumerate(resultCells):
                        crlow, crhign, cclow, cchigh = cell
                        if crlow < rangerow:
                            mergecells.append(cell)
                    # 默认第一行为表头 没有合并单元格的情况
                    if len(mergecells) == 0:
                        headrow = 1
                    else:
                        levelmap = self.createLevelMap(mergecells, rSheet)
                        # print(path)
                        dummy = self.createLevelTree(levelmap, rSheet)
                        # 根据层级来生成最终的 valuelist, keywords
                        # self.travel(dummy, valuelist, keywords, maxlevel)
                        # print(keywords)
                        maxlevel = max(levelmap.keys())
                        # 这里应该比较所有叶子节点的下边界
                        lowestnodes = levelmap[maxlevel]
                        # print(lowestnodes)
                        headrow = 0
                        for node in lowestnodes:
                            if len(node.child) > 0:
                                headrow = maxlevel + 2
                                # break
                            rlow, rhign, clow, chigh = node.cellrange
                            if (rhign > headrow):
                                headrow = rhign


                    # 找到每个 sheet 的表头范围进行和之前的对比 TODO 这里可以对提取出来的表头进行处理
                    curdf = pd.read_excel(path, sheetname, nrows=headrow, header=None)
                    # 每个进行遍历对比,完全不同则添加到 list 中
                    flag = True
                    for olddf in dflist:
                        if curdf.equals(olddf):
                            # JSExcelHandler.pathlog(path)
                            flag = False
                            pass
                    # 第一次
                    if len(dflist) == 0:
                        flag = True
                    if flag:
                        dflist.append(curdf)
                        filename, suffix = JSExcelHandler.SplitPathReturnNameAndSuffix(path)
                        # 这里要保存为 xlsx 为了兼容合并单元格的功能
                        objectpath = "{}/{}_{}.{}".format(configuremodel.headspath, filename, sheetname, 'xlsx')
                        self.restoreHead(path, sheetname, objectpath, curdf, headrow)
            except Exception as e:
                print(str(e))
                JSExcelHandler.errorlog("自动根据数据提取表头,原数据文件有问题-{}".format(path))
                pass
        print("抽取模板表头完成")
コード例 #8
0
    def ReadDataThenCompareAndExtract(self, configuremodel):
        print("开始抽取合并数据 %s" % datetime.now())
        # 存放表头的路径
        datapath = configuremodel.datapath
        # 读取所有数据文件
        datafilelist = []
        datafilelist = JSExcelHandler.getPathFromRootFolder(datapath, datafilelist)
        if len(datafilelist) == 0:
            messagebox.showinfo("message", "未找到原始数据")
        else:
            # 存放合并结果的路径
            resultfilepath = configuremodel.storepath
            # 类型是 map {路径 : df}, 这里是按照设置的起始行读取标准模板表头,默认是 0
            headsmaplist = self.readStandardHeadFromFolder(configuremodel)
            if headsmaplist == {}:
                messagebox.showinfo("message", "未找到模板表头")
            else:
                for dfpath in headsmaplist.keys():
                    sum = []
                    # 拼接结果文件路径, 用 os.path win 和 linux 会有自己判断
                    filename = os.path.split(dfpath)[-1]
                    startrow = 0
                    rangemap = configuremodel.validrange

                    # 配置模板的起始降范式位置
                    if rangemap != {}:
                        if filename in configuremodel.validrange:
                            startrow = int(configuremodel.validrange[filename])
                    newfile = os.path.join(resultfilepath, filename)
                    '''
                    这里是为所有的标准模板预先建立数据抽取的结果文件
                    '''
                    try:
                        # 降低表头的范式
                        valuelist, lastrows, keywords = self.lowerDimensionOfTitle(dfpath, startrow)
                        # 创建文件及表头文件到result文件夹下为抽取该模板做准备
                        newfile, newsheetname = self.newfilesave(dfpath, newfile, valuelist, startrow)
                    except Exception as e:
                        JSExcelHandler.errorlog("降范式并且新建表头文件未抽取数据做准备-{}".format(dfpath))
                        pass
                    # 遍历目标数据文件下所有 sheet 是否与标准模板匹配,如果匹配则进行数据抽取合并操作
                    # 对文件进行分组合并
                    # def add(work, lock):
                    #     for path in work:
                    #         print(path)
                    #         try:
                    #             readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path)
                    #             for sheetname in sheetnames:
                    #                 # print("校验模板表:" + str(dfpath) + "\n与数据表:" + path + '下工作表sheetname:' + sheetname + "对比")
                    #                 # 这里不能单纯的将标题范围下的数据一并写入,要重新建立按 keywords 排列顺序的 dataframe
                    #                 # 获取模板表头的行数,用于数据表中获取表头范围
                    #                 totalrows = headsmaplist[dfpath].shape[0]
                    #                 # 根据标准表头获取数据里的表头进行比对
                    #                 titledf = pd.read_excel(path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow,
                    #                                         header=None)
                    #                 # 这里要进行最下层 node 的比对
                    #
                    #                 if titledf.equals(headsmaplist[dfpath]):
                    #                     # 抽取模块headsmaplist, 并对匹配上的表进行记录
                    #                     JSExcelHandler.excuteCheck("{} ---- {}".format(dfpath, path))
                    #                     # 抽取数据 dafaframe
                    #                     df = pd.read_excel(path, sheetname, skiprows=lastrows, header=None, dtype='str')
                    #                     filename = os.path.split(path)[-1]
                    #                     df[r'来源文件'] = filename
                    #                     lock.acquire()
                    #                     # self.apendDataFrame(df, newfile, newsheetname, False)
                    #                     sum.append(df)
                    #                     lock.release()
                    #
                    #         except Exception as e:
                    #             print(str(e))
                    #             JSExcelHandler.errorlog(r"遍历目标数据文件下所有 sheet 与标准模板匹配后追加数据-{}".format(path))
                    #
                    # threads = []
                    # lock = threading.Lock()
                    # for i in range(0, len(datafilelist), 5):
                    #     work = datafilelist[i:i + 5]
                    #     # results.append(None) # enlarge the results list, so we have room for this thread's result
                    #     t = threading.Thread(target=add, args=(work, lock ))
                    #     t.start()
                    #     threads.append(t)
                    #
                    # for t in threads:
                    #     t.join()

                    for path in datafilelist:
                        print(path)
                        try:
                            readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path)
                            for sheetname in sheetnames:
                                # print("校验模板表:" + str(dfpath) + "\n与数据表:" + path + '下工作表sheetname:' + sheetname + "对比")
                                # 这里不能单纯的将标题范围下的数据一并写入,要重新建立按 keywords 排列顺序的 dataframe
                                # 获取模板表头的行数,用于数据表中获取表头范围
                                totalrows = headsmaplist[dfpath].shape[0]
                                # 根据标准表头获取数据里的表头进行比对
                                titledf = pd.read_excel(path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow, header=None)
                                # titledf.fillna(method='pad', inplace=True)
                                # 这里要进行最下层 node 的比对
                                if titledf.equals(headsmaplist[dfpath]):
                                    # 抽取模块headsmaplist, 并对匹配上的表进行记录
                                    JSExcelHandler.excuteCheck("{} ---- {}".format(dfpath, path))
                                    # 抽取数据 dafafram
                                    df = pd.read_excel(path, sheetname, skiprows=lastrows, header=None)
                                    filename = os.path.split(path)[-1]
                                    df[r'来源文件'] = '''%s|%s''' % (filename, sheetname)
                                    sum.append(df)
                        except Exception as e:
                            JSExcelHandler.errorlog("降范式并且新建表头文件未抽取数据做准备-{}".format(dfpath))
                            pass
                    res = pd.DataFrame([])
                    for tmpdf in sum:
                       res = res.append(tmpdf)
                    self.apendDataFrame(res, newfile, newsheetname, False)
                print("合并数据完成 %s" % datetime.now())
                messagebox.showinfo("message", "合并完成\n输出路径为:%s" % (configuremodel.storepath))
コード例 #9
0
ファイル: JSETLWindow.py プロジェクト: Paprika0120/AUTOETL
 def pathCallback(self):
     datalist = JSExcelHandler.getPathFromRootFolder(self.result)
     self.comboxlist["values"] = datalist
コード例 #10
0
                dfmaplist[excelpath] = newdf
        return dfmaplist


if __name__ == '__main__':
    # # 配置文件路径
    # path = os.path.abspath('..') + "/configureFile.txt"  # 表示当前所处的文件夹上一级文件夹的绝对路径
    # controller = JSETLController()
    #
    # View = JSConfigureView(path)
    # # view 层读取配置文件
    # View.readConfigureFile()
    #
    # # Controller 调度执行读取模板表头文件 | 根据数据文件自动识别表头
    # controller = JSETLController()
    # # 根据标准表头进行比对和抽取合并数据
    # controller.ReadDataThenCompareAndExtract(View.configuremodel)
    path = '/Users/sun/Desktop/test/'
    list = JSExcelHandler.getPathFromRootFolder(path)
    controller = JSETLController()

    for path in list:
        readOpenXls, sheetnames, workpath = JSExcelHandler.OpenXls(path)
        for sheetname in sheetnames:
            rSheet = readOpenXls.sheet_by_name(sheetname)
            controller.findMinMergeCellRange(rSheet)




コード例 #11
0
#!/usr/bin/env python
# coding=utf-8
# ========================================
#
# Time: 3/26/21 12:31 PM
# Author: Sun
# Software: PyCharm
# Description:
#
#
# ========================================
from Tools.JSExcelHandler import JSExcelHandler
import pandas as pd

if __name__ == '__main__':
    list = JSExcelHandler.getPathFromRootFolder(r'C:\Users\sjj-001\Desktop\AUTOETLTEST\data')
    reslist = []
    for path in list:
        df = pd.read_excel(path, sheet_name='YB01',skiprows=3)
        df = df.fillna('')
        sum = []
        for index , row in df.iterrows():
            str = row[1]
            list = [str, row[2], str.count(' ')]
            sum.append(list)
        for index , row in df.iterrows():
            str = row[4]
            list = [str, row[5], str.count(' ')]
            sum.append(list)
        ttt = pd.DataFrame(sum, columns=['科目名称', '数值', 'count'])
        reslist.append(ttt)