Beispiel #1
0
    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 imagewriter=None, stripcontrol=False):
        """Initialise the converter, its bookkeeping state and the model.

        Args:
            rsrcmgr: Forwarded to ``PDFConverter.__init__``.
            outfp: Forwarded to ``PDFConverter.__init__``.
            codec: Forwarded to ``PDFConverter.__init__``.
            pageno: Forwarded to ``PDFConverter.__init__``.
            laparams: Forwarded to ``PDFConverter.__init__``.
            imagewriter: Stored as ``self.imagewriter``.
            stripcontrol: Stored as ``self.stripcontrol``.

        Raises:
            OSError: If ``data/model.json`` or ``data/tokenizer.pickle``
                cannot be opened (paths are relative to the working
                directory).
        """
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
                              laparams=laparams)
        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.textboxes = []
        self.page_width = []
        self.page_height = []
        self.classified = []
        self.classified_header = []
        self.classified_paragraph = []
        self.classified_section = []
        self.classified_subsection = []
        self.tree = Tree()
        self.tree.create_node("Documents", 'documents')
        self.num_tabs = 0
        self.write_header()

        self.headerExist = False
        self.in_li = False

        # Read the model architecture with a context manager so the file
        # handle is released even if reading fails.
        with open('data/model.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        self.model = model_from_json(loaded_model_json)
        self.model.load_weights("data/model.h5")

        # NOTE(review): pickle.load can execute arbitrary code; this is only
        # safe while data/tokenizer.pickle comes from a trusted source.
        with open('data/tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
              showpageno=False, imagewriter=None):
     """Initialise the converter and its blotter processor.

     ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` are
     forwarded to ``PDFConverter.__init__``; ``showpageno`` and
     ``imagewriter`` are stored on the instance.
     """
     PDFConverter.__init__(
         self,
         rsrcmgr,
         outfp,
         codec=codec,
         pageno=pageno,
         laparams=laparams,
     )
     self.showpageno = showpageno
     self.imagewriter = imagewriter
     # The processor is built on the same output stream; the value returned
     # by processDocument() is kept for later use (presumably a coroutine,
     # judging by the attribute name -- confirm against BlotterProcessor).
     processor = BlotterProcessor(outfp)
     self.blotterProcessor = processor
     self.coro = processor.processDocument()
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
              showpageno=False, imagewriter=None):
     """Initialise the converter with a zeroed running height.

     The first five arguments are forwarded to ``PDFConverter.__init__``;
     ``showpageno`` and ``imagewriter`` are stored on the instance.
     """
     PDFConverter.__init__(
         self,
         rsrcmgr,
         outfp,
         codec=codec,
         pageno=pageno,
         laparams=laparams,
     )
     self.showpageno, self.imagewriter = showpageno, imagewriter
     # Starts at zero; nothing in this method updates it.
     self.current_total_height = 0
Beispiel #4
0
 def __init__(self, rsrcmgr, doc, codec='utf-8', pageno=1,
              laparams=None, imagewriter=None):
     """Initialise the converter without an output stream.

     ``None`` is passed to ``PDFConverter.__init__`` in the output-file
     position; ``doc`` is kept on the instance instead, together with
     ``imagewriter``, ``laparams`` and an empty ``sizes`` list.
     """
     PDFConverter.__init__(
         self,
         rsrcmgr,
         None,
         codec=codec,
         pageno=pageno,
         laparams=laparams,
     )
     self.imagewriter = imagewriter
     self.laparams = laparams
     self.doc = doc
     self.sizes = []
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
              laparams=None, imagewriter=None, stripcontrol=False, document=None):
     """Initialise the converter, store its options and write the header.

     ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` are
     forwarded to ``PDFConverter.__init__``; the remaining arguments, and
     ``rsrcmgr`` itself, are stored on the instance.
     """
     PDFConverter.__init__(
         self,
         rsrcmgr,
         outfp,
         codec=codec,
         pageno=pageno,
         laparams=laparams,
     )
     self.imagewriter = imagewriter
     self.stripcontrol = stripcontrol
     self.document = document
     self.rsrcmgr = rsrcmgr
     # All attributes are set before the header is written.
     self.write_header()
 def render_image(self, name, stream):
     """Render an image only when an image writer is configured.

     With ``self.imagewriter`` set to ``None`` the call is a no-op, so
     image output is not recorded when only text is wanted. Otherwise the
     call is delegated to ``PDFConverter.render_image``.
     """
     if self.imagewriter is not None:
         PDFConverter.render_image(self, name, stream)
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
              imagewriter=None, stripcontrol=False):
     """Initialise the converter, its per-page lists and write the header.

     The first five arguments are forwarded to ``PDFConverter.__init__``;
     ``imagewriter`` and ``stripcontrol`` are stored on the instance.
     """
     PDFConverter.__init__(
         self,
         rsrcmgr,
         outfp,
         codec=codec,
         pageno=pageno,
         laparams=laparams,
     )
     self.imagewriter, self.stripcontrol = imagewriter, stripcontrol
     # Empty collections; nothing in this method fills them.
     self.textboxes = []
     self.page_width = []
     self.page_height = []
     self.write_header()
Beispiel #8
0
 def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None, showpageno=False):
     """Initialise the converter and remember the ``showpageno`` flag.

     ``rsrcmgr``, ``outfp``, ``pageno`` and ``laparams`` are forwarded to
     ``PDFConverter.__init__``; no codec is passed.
     """
     PDFConverter.__init__(self, rsrcmgr, outfp, pageno=pageno, laparams=laparams)
     self.showpageno = showpageno
Beispiel #9
0
 def __init__(self, rsrcmgr, outfp):
     """Initialise the converter and emit the CSV header row to ``outfp``.

     Layout parameters are created here with ``char_margin`` set to 0.1,
     and the codec is fixed to UTF-8.
     """
     layout = LAParams()
     layout.char_margin = 0.1
     PDFConverter.__init__(self, rsrcmgr, outfp, codec='utf-8', laparams=layout)
     self.lines = []
     self.boxes = []
     # Column titles of the CSV output, written once as the first row.
     header = [
         "企業・事業場名称",
         "所在地",
         "公表日",
         "違反法条",
         "事案概要",
         "その他参考事項",
     ]
     self.writer = csv.writer(outfp, lineterminator='\n')
     self.writer.writerow(header)
Beispiel #10
0
 def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None,
              showpageno=False, imagewriter=None):
     """Initialise the converter with an empty text accumulator.

     ``rsrcmgr``, ``outfp``, ``pageno`` and ``laparams`` are forwarded to
     ``PDFConverter.__init__``; ``showpageno`` and ``imagewriter`` are
     stored on the instance.
     """
     PDFConverter.__init__(self, rsrcmgr, outfp, pageno=pageno, laparams=laparams)
     self.showpageno, self.imagewriter = showpageno, imagewriter
     # Starts as an empty string; nothing in this method appends to it.
     self.outtext = ''
Beispiel #11
0
 def __init__(self, rsrcmgr, recorder, codec='utf-8', pageno=1,
              laparams=None, imagewriter=None, pages='all'):
     """Initialise the converter with a recorder instead of an output file.

     ``sys.stderr`` is handed to ``PDFConverter.__init__`` as ``outfp``.
     ``recorder`` and ``pages`` are stored on the instance;
     ``imagewriter`` is accepted but not used in this method.
     """
     PDFConverter.__init__(self, rsrcmgr, outfp=sys.stderr, codec=codec,
                           pageno=pageno, laparams=laparams)
     self.recorder = recorder  # custom class which is defined next
     self.pages = pages
Beispiel #12
0
    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
        """Initialise the converter and its layout, font and text state.

        All arguments are forwarded to ``PDFConverter.__init__``.
        """
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec,
                              pageno=pageno, laparams=laparams)

        # Layout defaults.
        self.layoutmode = 'normal'
        self._yoffset = 50

        # No current font yet, and an empty font stack.
        self._font = None
        self._fontstack = []

        # Empty position stack and text list.
        self._posstack = []
        self._texts = []
Beispiel #13
0
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
              imagewriter=None, stripcontrol=False):
     """Initialise the converter and create the root content node.

     The first five arguments are forwarded to ``PDFConverter.__init__``;
     ``imagewriter`` and ``stripcontrol`` are stored on the instance.
     """
     PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec,
                           pageno=pageno, laparams=laparams)
     self.imagewriter, self.stripcontrol = imagewriter, stripcontrol
     # Root node of type "pages".
     self.root = ContentNode(type="pages")
Beispiel #14
0
 def __init__(self, rsrcmgr, doc, codec='utf-8', pageno=1, laparams=None,
              imagewriter=None):
     """Initialise the converter without an output stream.

     ``None`` is passed to ``PDFConverter.__init__`` in the output-file
     position; ``doc``, ``imagewriter`` and ``laparams`` are stored on the
     instance together with an empty ``sizes`` list.
     """
     PDFConverter.__init__(self, rsrcmgr, None, codec=codec,
                           pageno=pageno, laparams=laparams)
     self.imagewriter = imagewriter
     self.laparams = laparams
     self.doc = doc
     self.sizes = []
Beispiel #15
0
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
     """Initialise the converter on an encoding writer and open it.

     ``outfp`` is wrapped in a ``codecs`` stream writer for the locale's
     preferred encoding before being handed to ``PDFConverter.__init__``.
     The method then sets up the document root, an empty stack, one empty
     list per known tag name and the list of attribute names, and
     finally calls ``self.open()``.
     """
     writer_factory = codecs.getwriter(locale.getpreferredencoding())
     wrapped_out = writer_factory(outfp)
     PDFConverter.__init__(self, rsrcmgr, wrapped_out, codec=codec,
                           pageno=pageno, laparams=laparams)
     self.document_root = {'tag': 'pages'}
     self.stack = []
     # Repeated names in the string collapse into a single dictionary key.
     tag_names = "char pages page textbox textline textbox page rect polygon line figure curve textgrouplrtb textgrouptbrl".split()
     self.taglists = {tag: [] for tag in tag_names}
     self.hasrun = False
     attribute_names = 'tag index id bbox rotate pageid text size sizes orientation fontname fontnames fontstyles fonts bboxes length height width'
     self.interesting_attributes = attribute_names.split(' ')
     self.open()
Beispiel #16
0
def pdf_to_file(file_path):
    """Run every page of a PDF through a converter and return its output.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The contents of the in-memory buffer after all pages have been
        processed, or ``None`` if nothing was written to it.

    Raises:
        OSError: If ``file_path`` cannot be opened.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = PDFConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(converter, resource_manager)

    try:
        # Open the path given by the caller; the original referenced an
        # undefined name here.
        with open(file_path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, caching=True,
                                          check_extractable=True):
                # Process the page yielded by the iterator, not a constant.
                page_interpreter.process_page(page)

        text = fake_file_handle.getvalue()
    finally:
        # Release the converter and the buffer even if processing fails.
        converter.close()
        fake_file_handle.close()

    return text or None
Beispiel #17
0
 def __init__(self, rsrcmgr, codec="utf-8", pageno=1, laparams=None,
              imagewriter=None, stripcontrol=False):
     """Initialise the converter on ``sys.stdout`` and write the header.

     ``rsrcmgr``, ``codec``, ``pageno`` and ``laparams`` are forwarded to
     ``PDFConverter.__init__`` with ``sys.stdout`` as the output file.
     A new ``RpaPdfDocument`` is created, and ``imagewriter`` and
     ``stripcontrol`` are stored on the instance.
     """
     PDFConverter.__init__(self, rsrcmgr, sys.stdout, codec=codec,
                           pageno=pageno, laparams=laparams)
     self.rpa_pdf_document = RpaPdfDocument()
     # No figure or page is current yet.
     self.figure = None
     self.current_page = None
     self.imagewriter, self.stripcontrol = imagewriter, stripcontrol
     self.write_header()
Beispiel #18
0
 def __init__(self, rsrcmgr):
     """Initialise the converter with fixed defaults and an empty page map.

     ``PDFConverter.__init__`` receives no output file (``None``), the
     UTF-8 codec, page number 1 and no layout parameters.
     """
     PDFConverter.__init__(
         self,
         rsrcmgr,
         None,
         codec='utf-8',
         pageno=1,
         laparams=None,
     )
     self.pages = {}
 def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
     """Initialise the converter with an empty ``textlines`` list.

     All arguments are forwarded to ``PDFConverter.__init__``.
     """
     PDFConverter.__init__(
         self,
         rsrcmgr,
         outfp,
         codec=codec,
         pageno=pageno,
         laparams=laparams,
     )
     self.textlines = []
Beispiel #20
0
 def render_image(self, name, stream):
     """Delegate to ``PDFConverter.render_image`` if an image writer is set.

     Does nothing when ``self.imagewriter`` is ``None``.
     """
     if self.imagewriter is not None:
         PDFConverter.render_image(self, name, stream)