def merge_metadata(src, dst):
    '''
    Copy the metadata of the pdf file src onto the pdf file dst.

    The metadata dump of ``src`` is piped straight into a second pdftk
    process that applies it to ``dst``.  The result is written to a
    temporary file in the current directory, which is then renamed over
    ``dst``.

    .. note:: pdftk is used for extracting and updating the metadata.

    :param src: path of the pdf file whose metadata is read
    :param dst: path of the pdf file whose metadata is replaced
    '''
    # Only the name of the temporary file is needed: pdftk writes the file
    # itself.  Close our own handle right away so no descriptor is leaked and
    # the file is not held open while pdftk writes it and while it is renamed.
    tmp = NamedTemporaryFile(dir=".", delete=False)
    tmp.close()

    srcm = _(["pdftk", src, "dump_data"], stdout=PIPE)
    dstm = _(["pdftk", dst, "update_info", "-", "output", tmp.name],
             stdin=srcm.stdout)
    srcm.stdout.close()  # for pipe to work correctly (SIGPIPE)
    dstm.communicate()
    # Reap the producer as well so it does not linger as a zombie process.
    # NOTE(review): assumes `_` returns a subprocess.Popen-like object.
    srcm.wait()

    # move temporary file to the actual file
    os.rename(tmp.name, dst)
def merge_metadata(src, dst):
    '''
    Copy the metadata of the pdf file src onto the pdf file dst.

    The metadata dump of ``src`` is piped straight into a second pdftk
    process that applies it to ``dst``.  The result is written to a
    temporary file in the current directory, which is then renamed over
    ``dst``.

    .. note:: pdftk is used for extracting and updating the metadata.

    :param src: path of the pdf file whose metadata is read
    :param dst: path of the pdf file whose metadata is replaced
    '''
    # Only the name of the temporary file is needed: pdftk writes the file
    # itself.  Close our own handle right away so no descriptor is leaked and
    # the file is not held open while pdftk writes it and while it is renamed.
    tmp = NamedTemporaryFile(dir=".", delete=False)
    tmp.close()

    srcm = _(["pdftk", src, "dump_data"], stdout=PIPE)
    dstm = _(["pdftk", dst, "update_info", "-", "output", tmp.name],
             stdin=srcm.stdout)
    srcm.stdout.close()  # for pipe to work correctly (SIGPIPE)
    dstm.communicate()
    # Reap the producer as well so it does not linger as a zombie process.
    # NOTE(review): assumes `_` returns a subprocess.Popen-like object.
    srcm.wait()

    # move temporary file to the actual file
    os.rename(tmp.name, dst)
def call_tesseract(file):
    '''
    Run tesseract with the "hocr" config on a single page image.

    The image path is handed to tesseract twice: once as the input and once
    as the output base name.  The extension of the produced file is chosen
    by tesseract itself.

    :param file: the path to the page image
    '''
    command = ["tesseract", file, file, "hocr"]
    # Start the process and block until it has finished.
    _(command).communicate()
def call_tesseract(file):
    '''
    Run tesseract with the "hocr" config on a single page image.

    The image path is handed to tesseract twice: once as the input and once
    as the output base name.  The extension of the produced file is chosen
    by tesseract itself.

    :param file: the path to the page image
    '''
    command = ["tesseract", file, file, "hocr"]
    # Start the process and block until it has finished.
    _(command).communicate()
def call_hocr(pdffile, imagefile, hocrfile):
    '''
    Call hocr2pdf to build a pdf from one page image and its hocr file.

    The hocr file is supplied to hocr2pdf on its standard input.

    :param pdffile: the output pdf filename
    :param imagefile: the input image filename
    :param hocrfile: the input hocr filename
    '''
    args = ["hocr2pdf", "-i", imagefile, "-o", pdffile]
    # Hand the hocr file to the child as its stdin instead of reading it as
    # text and pushing it through a pipe: the bytes reach hocr2pdf untouched
    # (no locale-dependent decoding, no str-vs-bytes mismatch on the pipe)
    # and the file never has to be loaded into memory.
    with open(hocrfile, "rb") as hocr:
        p = _(args, stdin=hocr)
        p.communicate()
def call_hocr(pdffile, imagefile, hocrfile):
    '''
    Call hocr2pdf to build a pdf from one page image and its hocr file.

    The hocr file is supplied to hocr2pdf on its standard input.

    :param pdffile: the output pdf filename
    :param imagefile: the input image filename
    :param hocrfile: the input hocr filename
    '''
    args = ["hocr2pdf", "-i", imagefile, "-o", pdffile]
    # Hand the hocr file to the child as its stdin instead of reading it as
    # text and pushing it through a pipe: the bytes reach hocr2pdf untouched
    # (no locale-dependent decoding, no str-vs-bytes mismatch on the pipe)
    # and the file never has to be loaded into memory.
    with open(hocrfile, "rb") as hocr:
        p = _(args, stdin=hocr)
        p.communicate()
def merge_pdfs(pdflist, output):
    '''
    Concatenate several pdf files into one.

    Pages appear in the result in the order in which the files are listed.

    .. note:: ghostscript with the pdfwrite device does the merging.

    :param pdflist: list of pdf filenames
    :param output: the name of the output pdf
    '''
    thread_option = "-dNumRenderingThreads=%d" % THREADS
    target_option = "-sOutputFile=" + output
    gs_options = [
        "gs", "-dBATCH", "-dNOPAUSE", "-q", "-sDEVICE=pdfwrite",
        thread_option,
        target_option,
        "-c", "30000000 setvmthreshold",
        "-f",
    ]
    # The input files follow "-f" in list order.
    command = gs_options + pdflist
    _(command).communicate()
def merge_pdfs(pdflist, output):
    '''
    Concatenate several pdf files into one.

    Pages appear in the result in the order in which the files are listed.

    .. note:: ghostscript with the pdfwrite device does the merging.

    :param pdflist: list of pdf filenames
    :param output: the name of the output pdf
    '''
    thread_option = "-dNumRenderingThreads=%d" % THREADS
    target_option = "-sOutputFile=" + output
    gs_options = [
        "gs", "-dBATCH", "-dNOPAUSE", "-q", "-sDEVICE=pdfwrite",
        thread_option,
        target_option,
        "-c", "30000000 setvmthreshold",
        "-f",
    ]
    # The input files follow "-f" in list order.
    command = gs_options + pdflist
    _(command).communicate()
def create_tiffs(pdf):
    '''
    Render every page of a pdf to its own tiff image.

    The images are named "image<page-number>.tiff".

    .. note:: The images are generated in the current directory.

    .. note:: ghostscript with tiffg4 device is used.

    :param pdf: the path to the pdf file.
    '''
    thread_option = "-dNumRenderingThreads=%d" % THREADS
    resolution_option = "-r%d" % RESOLUTION
    command = [
        "gs", "-dNOPAUSE", "-sDEVICE=tiffg4",
        thread_option,
        "-dFirstPage=1",
        # "%d" is left in the argument as-is; it is not formatted here.
        "-sOutputFile=image%d.tiff",
        resolution_option, "-q",
        "-c", "30000000 setvmthreshold",
        "-f", pdf,
        "-c", "quit",
    ]
    _(command).communicate()
def create_tiffs(pdf):
    '''
    Render every page of a pdf to its own tiff image.

    The images are named "image<page-number>.tiff".

    .. note:: The images are generated in the current directory.

    .. note:: ghostscript with tiffg4 device is used.

    :param pdf: the path to the pdf file.
    '''
    thread_option = "-dNumRenderingThreads=%d" % THREADS
    resolution_option = "-r%d" % RESOLUTION
    command = [
        "gs", "-dNOPAUSE", "-sDEVICE=tiffg4",
        thread_option,
        "-dFirstPage=1",
        # "%d" is left in the argument as-is; it is not formatted here.
        "-sOutputFile=image%d.tiff",
        resolution_option, "-q",
        "-c", "30000000 setvmthreshold",
        "-f", pdf,
        "-c", "quit",
    ]
    _(command).communicate()
def which(exe):
    '''
    Locate an executable by asking the ``where.exe`` command line tool.

    ``where.exe`` prints one line per match, so each reported line is
    checked in turn instead of treating the whole output as a single path.

    :param exe: name of the executable to look up
    :return: the resolved path (as a string) of the first reported match
        that is an existing file, or ``None`` when there is none
    '''
    # NOTE(review): assumes `_` behaves like subprocess.run and `_p` like
    # pathlib.Path, as the attribute usage below suggests -- confirm.
    output = _(["where.exe", exe], capture_output=True).stdout.decode()
    for line in output.splitlines():
        line = line.strip()
        if not line:
            continue
        t = _p(line).resolve()
        if t.is_file():
            return str(t)
    return None