def process_pdf(self, pdffile, skippage):
        """Turn one PDF into a CSV file and run the data-quality check on it.

        ``preprocess_pdf`` returns a list of intermediate files whose base
        names have the form ``page<N>``. Every file with ``N`` greater than
        ``skippage`` is converted to its own temporary CSV by
        ``process_pdf_tab`` and then appended to one merged temporary CSV by
        ``cp.arrange_csv``. The merged file is passed to ``_process_csv``
        to produce the final output, which is then handed to ``dq_check``.

        :param pdffile: path of the PDF file to process
        :param skippage: files whose page number is less than or equal to
            this value are skipped
        """
        csv_template = "{}/{}.csv"
        stem = fo.get_base_filename(pdffile)
        outfile = csv_template.format(self.outdir, stem)
        merged_path = csv_template.format(self.tempdir, stem)
        dq_report = "{}/dq.xlsx".format(self.dqoutdir)
        page_files = self.preprocess_pdf(pdffile=pdffile, html_dir=self.htmldir)

        # Number of pages merged so far; arrange_csv receives it 1-based.
        pages_merged = 0
        # The context manager closes the merged file even if a page fails;
        # the exception itself still propagates to the caller.
        with open(merged_path, 'wt', encoding='utf-8', newline="") as merged:
            merged_writer = csv.writer(merged, delimiter=",")
            for htmlfile in page_files:
                page_label = str(fo.get_base_filename(htmlfile))
                page_number = int(page_label.replace("page", ""))
                if page_number <= skippage:
                    continue

                page_csv = "{0}/{1}_temp.csv".format(
                    self.tempdir, fo.get_base_filename(htmlfile))
                self.process_pdf_tab(
                    pdffile=pdffile,
                    htmlfile=htmlfile,
                    outfile=page_csv,
                    begin=self.begin,
                    end=self.end,
                    runtype=self.runtype,
                )

                pages_merged += 1
                cp.arrange_csv(
                    page_csv,
                    outfile=merged_writer,
                    pivotcol=self.pivotcolumn,
                    pivotregexp=self.regexppivot,
                    ignorecounter=0,
                    multipage=True,
                    fileno=pages_merged,
                    headercol=2,
                )

        self._process_csv(merged_path, outfile, self.addressindex)
        self.dq_check(
            auditfile=self.auditfile,
            pdffile=pdffile,
            outfile=outfile,
            auditout=dq_report,
        )


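# - ビル名称 (building name)
# - 竣工 (completion date)
# - 所在地 (address)
# - 最寄駅 (nearest station)
# - フロア (floor)
# - 面積 (m2) (floor area in square metres)
# - 面積 (坪) (floor area in tsubo)
# - 入居時期 (available-from date)
# - 備考 (remarks)
# - 担当者 (contact person)
# - 連絡先TEL (contact phone number)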
# Example #2
    def process_pdf(self, pdffile, skippage):
        """Turn one PDF into a CSV file, leaving out a single page.

        ``preprocess_pdf`` returns a list of intermediate files. Each one,
        except the file whose base name equals ``'page' + str(skippage)``,
        is converted to its own temporary CSV by ``process_pdf_tab`` and
        appended to one merged temporary CSV by ``cp.arrange_csv``. The
        merged file is then passed to ``_process_csv`` to write the output.

        :param pdffile: path of the PDF file to process
        :param skippage: number of the one page that must not be processed
        """
        csv_template = "{}/{}.csv"
        stem = fo.get_base_filename(pdffile)
        outfile = csv_template.format(self.outdir, stem)
        merged_path = csv_template.format(self.tempdir, stem)
        page_files = self.preprocess_pdf(pdffile=pdffile, html_dir=self.htmldir)
        print(page_files)

        # Number of pages merged so far; arrange_csv receives it 1-based.
        pages_merged = 0
        # The context manager closes the merged file even if a page fails;
        # the exception itself still propagates to the caller.
        with open(merged_path, 'wt', encoding='utf-8', newline="") as merged:
            merged_writer = csv.writer(merged, delimiter=",")
            for htmlfile in page_files:
                skipped_name = 'page' + str(skippage)
                if fo.get_base_filename(htmlfile) == skipped_name:
                    continue

                page_csv = "{0}/{1}_temp.csv".format(
                    self.tempdir, fo.get_base_filename(htmlfile))
                self.process_pdf_tab(
                    pdffile=pdffile,
                    htmlfile=htmlfile,
                    outfile=page_csv,
                    begin=self.begin,
                    end=self.end,
                    runtype=self.runtype,
                )

                pages_merged += 1
                cp.arrange_csv(
                    page_csv,
                    outfile=merged_writer,
                    pivotcol=self.pivotcolumn,
                    pivotregexp=self.regexppivot,
                    ignorecounter=0,
                    multipage=True,
                    fileno=pages_merged,
                    headercol=4,
                )

        self._process_csv(merged_path, outfile)
        # Disabled call carried over from the original source:
        # self.refill_csv(infile=tempout, outfile=outfile, ignorefirst=False)


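# - ビル名称 (building name)
# - 竣工 (completion date)
# - 所在地 (address)
# - 最寄駅 (nearest station)
# - フロア (floor)
# - 面積 (m2) (floor area in square metres)
# - 面積 (坪) (floor area in tsubo)
# - 入居時期 (available-from date)
# - 備考 (remarks)
# - 担当者 (contact person)
# - 連絡先TEL (contact phone number)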
# Example #3
 def import_audit_sheet(auditfile, pdffile):
     """
     Load the audit workbook and keep only the rows that belong to one PDF.

     Rows are selected by comparing the ``FileName`` column with the part of
     the PDF's base file name that precedes the first underscore.

     :param auditfile: location of the audit file (read with ``pd.read_excel``)
     :param pdffile: file name of the PDF file with path
     :return: tuple ``(audit, basefile)``: the filtered dataframe and the
         base file name derived from ``pdffile``
     """
     basefile = fo.get_base_filename(pdffile)
     sheet = pd.read_excel(auditfile)
     # Only the text before the first "_" identifies the file in the sheet.
     name_prefix, _, _ = basefile.partition('_')
     belongs_to_pdf = sheet["FileName"] == name_prefix
     return sheet[belongs_to_pdf], basefile
# Example #4
 def process_pdf(self, pdffile):
     """Turn one PDF into a CSV file and run two data-quality checks.

     Each file returned by ``preprocess_pdf`` is converted to its own
     temporary CSV by ``process_pdf_tab`` and appended to one merged
     temporary CSV by ``cp.arrange_csv``. The merged file is passed to
     ``_process_csv`` to write the output. ``dq_check`` is then called
     twice, with ``"Apple"`` in both the PDF path and the output path
     replaced first by ``"AppleResi"`` and then by ``"AppleRetail"``.

     :param pdffile: path of the PDF file to process
     """
     csv_template = "{}/{}.csv"
     stem = fo.get_base_filename(pdffile)
     outfile = csv_template.format(self.outdir, stem)
     merged_path = csv_template.format(self.tempdir, stem)
     dq_report = "{}/dq.xlsx".format(self.dqoutdir)
     page_files = self.preprocess_pdf(pdffile=pdffile, html_dir=self.htmldir)

     # The context manager closes the merged file even if a page fails;
     # the exception itself still propagates to the caller.
     with open(merged_path, 'wt', encoding='utf-8', newline="") as merged:
         merged_writer = csv.writer(merged, delimiter=",")
         # Pages are numbered from 1 for arrange_csv.
         for page_index, htmlfile in enumerate(page_files, start=1):
             page_csv = "{0}/{1}_temp.csv".format(
                 self.tempdir, fo.get_base_filename(htmlfile))
             self.process_pdf_tab(
                 pdffile=pdffile,
                 htmlfile=htmlfile,
                 outfile=page_csv,
                 begin=self.begin,
                 end=self.end,
                 runtype=self.runtype,
             )
             cp.arrange_csv(
                 page_csv,
                 outfile=merged_writer,
                 pivotcol=self.pivotcolumn,
                 pivotregexp=self.regexppivot,
                 ignorecounter=0,
                 multipage=True,
                 fileno=page_index,
                 headercol=0,
             )

     self._process_csv(merged_path, outfile)

     # One audit pass per variant name, in this fixed order.
     for variant in ("AppleResi", "AppleRetail"):
         self.dq_check(
             auditfile=self.auditfile,
             pdffile=pdffile.replace("Apple", variant),
             outfile=outfile.replace("Apple", variant),
             auditout=dq_report,
         )