def process_pdf(self, pdffile, skippage):
    """Process the HTML pages generated for ``pdffile`` into one CSV.

    Each page returned by ``self.preprocess_pdf`` is handed to
    ``self.process_pdf_tab`` (writing a per-page temp CSV) and then to
    ``cp.arrange_csv``, which receives a shared ``csv.writer`` on the merged
    temp file. Afterwards ``self._process_csv`` and ``self.dq_check`` are
    called on the result.

    :param pdffile: path of the PDF file to process
    :param skippage: pages whose number (the ``page<N>`` base name of the
        HTML file with ``page`` stripped) is less than or equal to this
        value are skipped
    """
    pdf_name = fo.get_base_filename(pdffile)
    outfile = f"{self.outdir}/{pdf_name}.csv"
    merged_path = f"{self.tempdir}/{pdf_name}.csv"
    dq_report = f"{self.dqoutdir}/dq.xlsx"
    html_pages = self.preprocess_pdf(pdffile=pdffile, html_dir=self.htmldir)

    # Counts only the pages that are actually processed (skipped pages do
    # not consume a number); passed to arrange_csv as ``fileno``.
    page_seq = 0
    with open(merged_path, "wt", encoding="utf-8", newline="") as merged:
        writer = csv.writer(merged, delimiter=",")
        for htmlfile in html_pages:
            page_name = fo.get_base_filename(htmlfile)
            page_no = int(str(page_name).replace("page", ""))
            if page_no <= skippage:
                continue

            page_seq += 1
            page_csv = f"{self.tempdir}/{page_name}_temp.csv"
            self.process_pdf_tab(
                pdffile=pdffile,
                htmlfile=htmlfile,
                outfile=page_csv,
                begin=self.begin,
                end=self.end,
                runtype=self.runtype,
            )
            cp.arrange_csv(
                page_csv,
                outfile=writer,
                pivotcol=self.pivotcolumn,
                pivotregexp=self.regexppivot,
                ignorecounter=0,
                multipage=True,
                fileno=page_seq,
                headercol=2,
            )

    # The merged file is closed at this point; post-process it.
    self._process_csv(merged_path, outfile, self.addressindex)
    self.dq_check(
        auditfile=self.auditfile,
        pdffile=pdffile,
        outfile=outfile,
        auditout=dq_report,
    )


# Column list (translated from the original Japanese comment); presumably the
# columns of the sheet handled by the code that follows -- TODO confirm.
# - Building name
# - Completion date
# - Location
# - Nearest station
# - Floor
# - Area (m2)
# - Area (tsubo)
# - Move-in date
# - Remarks
# - Person in charge
# - Contact TEL
def process_pdf(self, pdffile, skippage):
    """Process the HTML pages generated for ``pdffile`` into one CSV.

    Each page returned by ``self.preprocess_pdf`` is handed to
    ``self.process_pdf_tab`` (writing a per-page temp CSV) and then to
    ``cp.arrange_csv``, which receives a shared ``csv.writer`` on the merged
    temp file. Afterwards ``self._process_csv`` is called on the result.

    :param pdffile: path of the PDF file to process
    :param skippage: number of the single page to skip; the HTML file whose
        base name equals ``page<skippage>`` is ignored
    """
    pdf_name = fo.get_base_filename(pdffile)
    outfile = f"{self.outdir}/{pdf_name}.csv"
    merged_path = f"{self.tempdir}/{pdf_name}.csv"
    html_pages = self.preprocess_pdf(pdffile=pdffile, html_dir=self.htmldir)
    print(html_pages)

    skipped_name = f"page{skippage}"
    # Counts only the pages that are actually processed; passed to
    # arrange_csv as ``fileno``.
    page_seq = 0
    with open(merged_path, "wt", encoding="utf-8", newline="") as merged:
        writer = csv.writer(merged, delimiter=",")
        for htmlfile in html_pages:
            page_name = fo.get_base_filename(htmlfile)
            if page_name == skipped_name:
                continue

            page_seq += 1
            page_csv = f"{self.tempdir}/{page_name}_temp.csv"
            self.process_pdf_tab(
                pdffile=pdffile,
                htmlfile=htmlfile,
                outfile=page_csv,
                begin=self.begin,
                end=self.end,
                runtype=self.runtype,
            )
            cp.arrange_csv(
                page_csv,
                outfile=writer,
                pivotcol=self.pivotcolumn,
                pivotregexp=self.regexppivot,
                ignorecounter=0,
                multipage=True,
                fileno=page_seq,
                headercol=4,
            )

    # The merged file is closed at this point; post-process it.
    self._process_csv(merged_path, outfile)
    # NOTE: an alternative step was left disabled in the original:
    # self.refill_csv(infile=tempout, outfile=outfile, ignorefirst=False)


# Column list (translated from the original Japanese comment); presumably the
# columns of the sheet handled by the code that follows -- TODO confirm.
# - Building name
# - Completion date
# - Location
# - Nearest station
# - Floor
# - Area (m2)
# - Area (tsubo)
# - Move-in date
# - Remarks
# - Person in charge
# - Contact TEL
def import_audit_sheet(auditfile, pdffile):
    """Load the audit sheet and keep only the rows for the given PDF.

    The rows are matched on the ``FileName`` column against the part of the
    PDF's base file name that precedes the first underscore.

    :param auditfile: location of the audit file (read with ``pd.read_excel``)
    :param pdffile: file name of the PDF file with path
    :return: tuple ``(audit, basefile)`` -- the filtered audit dataframe and
        the base file name of ``pdffile``
    """
    basefile = fo.get_base_filename(pdffile)
    sheet = pd.read_excel(auditfile)

    # Only the prefix before the first "_" identifies the file in the sheet.
    file_key = basefile.split("_", 1)[0]
    matches = sheet["FileName"] == file_key
    return sheet[matches], basefile
def process_pdf(self, pdffile):
    """Process the HTML pages generated for ``pdffile`` into one CSV.

    Each page returned by ``self.preprocess_pdf`` is handed to
    ``self.process_pdf_tab`` (writing a per-page temp CSV) and then to
    ``cp.arrange_csv``, which receives a shared ``csv.writer`` on the merged
    temp file. Afterwards ``self._process_csv`` is called, followed by two
    ``self.dq_check`` calls in which ``"Apple"`` in both the PDF path and the
    output path is replaced by ``"AppleResi"`` and then ``"AppleRetail"``.

    :param pdffile: path of the PDF file to process
    """
    pdf_name = fo.get_base_filename(pdffile)
    outfile = f"{self.outdir}/{pdf_name}.csv"
    merged_path = f"{self.tempdir}/{pdf_name}.csv"
    dq_report = f"{self.dqoutdir}/dq.xlsx"
    html_pages = self.preprocess_pdf(pdffile=pdffile, html_dir=self.htmldir)

    with open(merged_path, "wt", encoding="utf-8", newline="") as merged:
        writer = csv.writer(merged, delimiter=",")
        # Pages are numbered from 1; the number is passed on as ``fileno``.
        for page_seq, htmlfile in enumerate(html_pages, start=1):
            page_name = fo.get_base_filename(htmlfile)
            page_csv = f"{self.tempdir}/{page_name}_temp.csv"
            self.process_pdf_tab(
                pdffile=pdffile,
                htmlfile=htmlfile,
                outfile=page_csv,
                begin=self.begin,
                end=self.end,
                runtype=self.runtype,
            )
            cp.arrange_csv(
                page_csv,
                outfile=writer,
                pivotcol=self.pivotcolumn,
                pivotregexp=self.regexppivot,
                ignorecounter=0,
                multipage=True,
                fileno=page_seq,
                headercol=0,
            )

    # The merged file is closed at this point; post-process it.
    self._process_csv(merged_path, outfile)

    # Run the check once per variant name substituted for "Apple".
    for variant in ("AppleResi", "AppleRetail"):
        self.dq_check(
            auditfile=self.auditfile,
            pdffile=pdffile.replace("Apple", variant),
            outfile=outfile.replace("Apple", variant),
            auditout=dq_report,
        )