def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
             imagewriter=None, stripcontrol=False):
    """Set up converter state and load the model and tokenizer from disk.

    ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` are
    forwarded to ``PDFConverter.__init__``; ``imagewriter`` and
    ``stripcontrol`` are stored on the instance.  The constructor then
    creates the empty per-document collections and the document tree,
    calls ``write_header()``, and finally loads:

    * the model architecture from ``data/model.json`` and its weights
      from ``data/model.h5``;
    * the tokenizer from ``data/tokenizer.pickle``.

    All three paths are relative, so they are resolved against the
    current working directory.

    Raises:
        OSError: if ``data/model.json`` or ``data/tokenizer.pickle``
            cannot be opened.
    """
    PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
                          laparams=laparams)
    self.imagewriter = imagewriter
    self.stripcontrol = stripcontrol

    # Per-document accumulators, all starting empty.
    self.textboxes = []
    self.page_width = []
    self.page_height = []
    self.classified = []
    self.classified_header = []
    self.classified_paragraph = []
    self.classified_section = []
    self.classified_subsection = []

    # Document tree with a single root node.
    self.tree = Tree()
    self.tree.create_node("Documents", 'documents')
    self.num_tabs = 0

    self.write_header()
    # NOTE(review): these flags are (re)set only after write_header() has
    # run; confirm write_header() is not expected to leave headerExist set.
    self.headerExist = False
    self.in_li = False

    # Context manager guarantees the handle is closed even if read() fails.
    with open('data/model.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    self.model = model_from_json(loaded_model_json)
    self.model.load_weights("data/model.h5")

    # SECURITY: pickle.load can execute arbitrary code; this file must only
    # ever come from a trusted source.
    with open('data/tokenizer.pickle', 'rb') as handle:
        self.tokenizer = pickle.load(handle)
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
             showpageno=False, imagewriter=None):
    """Initialise the base converter and start a blotter document pass.

    ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` are
    handed to ``PDFConverter.__init__`` unchanged; ``showpageno`` and
    ``imagewriter`` are kept on the instance.  A ``BlotterProcessor`` is
    built around ``outfp`` and whatever its ``processDocument()`` returns
    is stored in ``self.coro``.
    """
    base_options = {'codec': codec, 'pageno': pageno, 'laparams': laparams}
    PDFConverter.__init__(self, rsrcmgr, outfp, **base_options)

    self.showpageno = showpageno
    self.imagewriter = imagewriter

    processor = BlotterProcessor(outfp)
    self.blotterProcessor = processor
    # presumably a coroutine/generator, going by the attribute name — confirm
    self.coro = processor.processDocument()
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
             showpageno=False, imagewriter=None):
    """Initialise the base converter and zero the running height counter.

    The first five arguments are forwarded to ``PDFConverter.__init__``;
    ``showpageno`` and ``imagewriter`` are stored as given, and
    ``current_total_height`` starts at ``0``.
    """
    PDFConverter.__init__(
        self,
        rsrcmgr,
        outfp,
        codec=codec,
        pageno=pageno,
        laparams=laparams,
    )
    self.showpageno, self.imagewriter = showpageno, imagewriter
    self.current_total_height = 0
def __init__(self, rsrcmgr, doc, codec='utf-8', pageno=1, laparams=None,
             imagewriter=None):
    """Initialise a converter that has no output stream.

    ``PDFConverter.__init__`` is called with ``None`` in the output-file
    position; ``codec``, ``pageno`` and ``laparams`` are forwarded.  The
    instance then records ``imagewriter``, ``laparams`` and ``doc`` and
    starts with an empty ``sizes`` list.
    """
    base_options = dict(codec=codec, pageno=pageno, laparams=laparams)
    PDFConverter.__init__(self, rsrcmgr, None, **base_options)

    # Assigned in the same order as before, after the base initialiser.
    for attr_name, attr_value in (
        ('imagewriter', imagewriter),
        ('laparams', laparams),
        ('doc', doc),
    ):
        setattr(self, attr_name, attr_value)
    self.sizes = []
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
             imagewriter=None, stripcontrol=False, document=None):
    """Initialise the converter, remember its collaborators, write the header.

    ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` go to
    ``PDFConverter.__init__``.  ``imagewriter``, ``stripcontrol``,
    ``document`` and ``rsrcmgr`` are then stored on the instance, and
    ``write_header()`` is called last, once every attribute is in place.
    """
    PDFConverter.__init__(
        self, rsrcmgr, outfp,
        codec=codec, pageno=pageno, laparams=laparams,
    )
    self.imagewriter, self.stripcontrol = imagewriter, stripcontrol
    self.document, self.rsrcmgr = document, rsrcmgr
    self.write_header()
def render_image(self, name, stream):
    """Render an image only when an image writer is configured.

    With ``self.imagewriter`` set to ``None`` this is a no-op, which skips
    the image handling entirely when only text is wanted.  Otherwise the
    call is delegated to ``PDFConverter.render_image``.  The return value
    is always ``None``.
    """
    if self.imagewriter is not None:
        PDFConverter.render_image(self, name, stream)
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
             imagewriter=None, stripcontrol=False):
    """Initialise the converter, its page collections, and write the header.

    The first five arguments are forwarded to ``PDFConverter.__init__``.
    ``imagewriter`` and ``stripcontrol`` are stored, ``textboxes``,
    ``page_width`` and ``page_height`` start as separate empty lists, and
    ``write_header()`` runs last.
    """
    base_options = {'codec': codec, 'pageno': pageno, 'laparams': laparams}
    PDFConverter.__init__(self, rsrcmgr, outfp, **base_options)

    self.imagewriter = imagewriter
    self.stripcontrol = stripcontrol

    # Each attribute gets its own fresh list; they must not alias.
    for list_attr in ('textboxes', 'page_width', 'page_height'):
        setattr(self, list_attr, [])

    self.write_header()
def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None, showpageno=False):
    """Initialise the base converter and remember the page-number flag.

    ``rsrcmgr``, ``outfp``, ``pageno`` and ``laparams`` are forwarded to
    ``PDFConverter.__init__`` (no codec is passed); ``showpageno`` is
    stored on the instance.
    """
    PDFConverter.__init__(
        self,
        rsrcmgr,
        outfp,
        pageno=pageno,
        laparams=laparams,
    )
    self.showpageno = showpageno
def __init__(self, rsrcmgr, outfp):
    """Initialise a CSV-emitting converter and write the CSV header row.

    A fresh ``LAParams`` with ``char_margin`` set to ``0.1`` is passed to
    ``PDFConverter.__init__`` together with a fixed ``'utf-8'`` codec.
    ``lines`` and ``boxes`` start empty, a ``csv.writer`` using ``'\\n'``
    as line terminator is attached to ``outfp``, and the six-column header
    row is written immediately.
    """
    layout_params = LAParams()
    layout_params.char_margin = 0.1
    PDFConverter.__init__(
        self, rsrcmgr, outfp, codec='utf-8', laparams=layout_params,
    )

    self.lines = []
    self.boxes = []

    # Column titles are runtime output and must stay exactly as they are.
    header_row = [
        "企業・事業場名称",
        "所在地",
        "公表日",
        "違反法条",
        "事案概要",
        "その他参考事項",
    ]
    self.writer = csv.writer(outfp, lineterminator='\n')
    self.writer.writerow(header_row)
def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None, showpageno=False,
             imagewriter=None):
    """Initialise the base converter with an empty text buffer.

    ``rsrcmgr``, ``outfp``, ``pageno`` and ``laparams`` are forwarded to
    ``PDFConverter.__init__`` (no codec is passed).  ``showpageno`` and
    ``imagewriter`` are stored, and ``outtext`` starts as an empty string.
    """
    base_options = {'pageno': pageno, 'laparams': laparams}
    PDFConverter.__init__(self, rsrcmgr, outfp, **base_options)
    self.showpageno, self.imagewriter = showpageno, imagewriter
    self.outtext = ''
def __init__(self, rsrcmgr, recorder, codec='utf-8', pageno=1, laparams=None,
             imagewriter=None, pages='all'):
    """Initialise a converter that reports to ``recorder``.

    ``PDFConverter.__init__`` is called with ``sys.stderr`` as the output
    stream; ``codec``, ``pageno`` and ``laparams`` are forwarded.

    Args:
        rsrcmgr: resource manager passed to the base converter.
        recorder: custom recorder object, stored as ``self.recorder``.
        codec: forwarded to the base converter.
        pageno: forwarded to the base converter.
        laparams: forwarded to the base converter.
        imagewriter: stored as ``self.imagewriter``.  Previously this
            argument was accepted and then silently discarded.
        pages: stored as ``self.pages``; defaults to ``'all'``.
    """
    PDFConverter.__init__(self, rsrcmgr, outfp=sys.stderr, codec=codec,
                          pageno=pageno, laparams=laparams)
    self.recorder = recorder  # custom class which is defined next
    # Keep the caller's image writer instead of dropping it on the floor.
    self.imagewriter = imagewriter
    self.pages = pages
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
    """Initialise the base converter and the layout bookkeeping fields.

    All five arguments are forwarded to ``PDFConverter.__init__``.  The
    instance then starts in ``'normal'`` layout mode with a ``_yoffset``
    of ``50``, no current font, and empty font, position and text lists.
    """
    PDFConverter.__init__(
        self, rsrcmgr, outfp,
        codec=codec, pageno=pageno, laparams=laparams,
    )
    self.layoutmode, self._yoffset = 'normal', 50
    self._font = None

    # Three independent lists; each needs its own object.
    self._fontstack = []
    self._posstack = []
    self._texts = []
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
             imagewriter=None, stripcontrol=False):
    """Initialise the base converter and create the root content node.

    The first five arguments are forwarded to ``PDFConverter.__init__``;
    ``imagewriter`` and ``stripcontrol`` are stored, and ``self.root`` is a
    new ``ContentNode`` of type ``"pages"``.
    """
    base_options = dict(codec=codec, pageno=pageno, laparams=laparams)
    PDFConverter.__init__(self, rsrcmgr, outfp, **base_options)
    self.imagewriter, self.stripcontrol = imagewriter, stripcontrol
    self.root = ContentNode(type="pages")
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
    """Initialise the converter around an encoding-aware output stream.

    ``outfp`` is wrapped in a ``codecs`` stream writer for the locale's
    preferred encoding before being handed to ``PDFConverter.__init__``
    along with ``codec``, ``pageno`` and ``laparams``.  The constructor
    then prepares the document root, an empty element stack, one empty
    list per known tag name, and the list of attribute names of interest,
    and finally calls ``self.open()``.
    """
    writer_factory = codecs.getwriter(locale.getpreferredencoding())
    wrapped_out = writer_factory(outfp)
    PDFConverter.__init__(
        self, rsrcmgr, wrapped_out,
        codec=codec, pageno=pageno, laparams=laparams,
    )

    self.document_root = {'tag': 'pages'}
    self.stack = []

    # Repeated names in the string collapse to a single key each.
    tag_names = "char pages page textbox textline textbox page rect polygon line figure curve textgrouplrtb textgrouptbrl".split()
    self.taglists = {tag_name: [] for tag_name in tag_names}

    self.hasrun = False
    self.interesting_attributes = 'tag index id bbox rotate pageid text size sizes orientation fontname fontnames fontstyles fonts bboxes length height width'.split(' ')
    self.open()
def pdf_to_file(file_path):
    """Run every page of the PDF at ``file_path`` through a converter.

    Args:
        file_path: path of the PDF file to read.

    Returns:
        The text accumulated in the in-memory output buffer, or ``None``
        when the buffer is empty.

    Raises:
        OSError: if ``file_path`` cannot be opened.

    NOTE(review): the device is the ``PDFConverter`` base class.  Confirm it
    actually writes text to ``outfp``; a text-producing subclass may be what
    was intended here.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = PDFConverter(resource_manager, fake_file_handle)
    try:
        # PDFPageInterpreter takes the resource manager first, then the device.
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        # Open the path that was passed in (was an undefined ``filename``).
        with open(file_path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, caching=True,
                                          check_extractable=True):
                # Process the page itself (was the constant ``int(0)``).
                page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
    finally:
        # Release the converter and buffer even when parsing fails.
        converter.close()
        fake_file_handle.close()
    return text if text else None
def __init__(
    self,
    rsrcmgr,
    codec="utf-8",
    pageno=1,
    laparams=None,
    imagewriter=None,
    stripcontrol=False,
):
    """Initialise a converter that writes to ``sys.stdout``.

    ``PDFConverter.__init__`` receives ``rsrcmgr``, ``sys.stdout`` as the
    output stream, and ``codec``, ``pageno`` and ``laparams``.  The
    instance gets a new ``RpaPdfDocument``, no current figure or page, the
    given ``imagewriter`` and ``stripcontrol``, and ``write_header()`` is
    called last.
    """
    base_options = {"codec": codec, "pageno": pageno, "laparams": laparams}
    PDFConverter.__init__(self, rsrcmgr, sys.stdout, **base_options)

    self.rpa_pdf_document = RpaPdfDocument()
    self.figure, self.current_page = None, None
    self.imagewriter, self.stripcontrol = imagewriter, stripcontrol
    self.write_header()
def __init__(self, rsrcmgr):
    """Initialise a converter with fixed defaults and an empty page map.

    ``PDFConverter.__init__`` is called with no output stream (``None``),
    codec ``'utf-8'``, page number ``1`` and no layout parameters;
    ``self.pages`` starts as an empty dict.
    """
    fixed_options = {'codec': 'utf-8', 'pageno': 1, 'laparams': None}
    PDFConverter.__init__(self, rsrcmgr, None, **fixed_options)
    self.pages = {}
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
    """Initialise the base converter and an empty ``textlines`` list.

    All five arguments are forwarded to ``PDFConverter.__init__``.
    """
    PDFConverter.__init__(
        self,
        rsrcmgr,
        outfp,
        codec=codec,
        pageno=pageno,
        laparams=laparams,
    )
    self.textlines = []
def render_image(self, name, stream):
    """Delegate to the base renderer unless no image writer is set.

    Does nothing when ``self.imagewriter`` is ``None``; otherwise calls
    ``PDFConverter.render_image`` with the same arguments.  Always returns
    ``None``.
    """
    writer_configured = self.imagewriter is not None
    if writer_configured:
        PDFConverter.render_image(self, name, stream)
    return None