Ejemplo n.º 1
0
def pdf_to_text(pdf):
    """Extract the plain text of a PDF supplied as a bytestring.

    Warning: extraction can be slow (up to 300ms per page).
    No optical character recognition is performed.

    Args:
      pdf: bytestring holding the contents of a PDF file.
    Returns:
      str with the text extracted from `pdf`.
    """
    # Wire the parser and the document to each other, then initialize the
    # document with an empty password.
    source = StringIO.StringIO(pdf)
    pdf_parser = pdfparser.PDFParser(source)
    document = pdfparser.PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(password='')

    # Everything the text converter renders is written into `sink`.
    sink = StringIO.StringIO()
    manager = pdfinterp.PDFResourceManager()
    text_device = converter.TextConverter(
        manager, outfp=sink, codec='utf-8', laparams=layout.LAParams())
    page_interpreter = pdfinterp.PDFPageInterpreter(manager, text_device)

    for current_page in document.get_pages():
        page_interpreter.process_page(current_page)

    return sink.getvalue()
Ejemplo n.º 2
0
    def _parse_pdf(self, stream):
        """Run every field extractor over one PDF stream.

        Returns a dict mapping each field's name to its value. The same
        dict (with whatever values the fields currently hold) is returned
        when pdfminer setup fails or when a short-circuiting field is
        still empty once its `maxpage` has been reached.
        """
        LOG.info("Parsing accident report data from %s" % stream.name)
        fields = self._get_fields()

        def current_values():
            # Snapshot of name -> value for all fields at this moment.
            return {f.name: f.value for f in fields}

        try:
            # pdfminer setup: document, resource manager, layout
            # aggregator and the interpreter that feeds it.
            parser = pdfparser.PDFParser(stream)
            document = pdfdocument.PDFDocument(parser)
            manager = pdfinterp.PDFResourceManager()
            aggregator = pdfconverter.PDFPageAggregator(
                manager, laparams=pdflayout.LAParams())
            interpreter = pdfinterp.PDFPageInterpreter(manager, aggregator)
        except psparser.PSException as err:
            LOG.warn("Parsing %s failed, skipping: %s" % (stream.name, err))
            return current_values()

        pages = pdfpage.PDFPage.create_pages(document)
        for page_num, page in enumerate(pages, 1):
            LOG.debug("Parsing page %s" % page_num)

            interpreter.process_page(page)
            page_layout = aggregator.get_result()

            for field in fields:
                field.update(page_layout, page=page_num)
                # Give up on the whole document as soon as a
                # short-circuiting field is still empty at its page limit.
                exhausted = (field.short_circuit
                             and page_num >= field.maxpage)
                if not field.value and exhausted:
                    LOG.warn("No %s found in %s, aborting parsing" %
                             (field.name, stream.name))
                    return current_values()

        return current_values()
Ejemplo n.º 3
0
def converte_pdf(nome_arquivo, semaforo=None, profundidade=None, pagina=None):
    """Convert a PDF file to cleaned-up text via pdfminer's HTML converter.

    Args:
        nome_arquivo: path of the PDF file to read.
        semaforo: not used by this function; kept so existing callers
            keep working.
        profundidade: not used by this function; kept so existing callers
            keep working.
        pagina: optional page number, treated as 1-based. When given, the
            slice ``[pagina - 1:pagina + 1]`` of the page list is
            converted (that page and the following one). When None,
            every page is converted.

    Returns:
        A ``(tipo, texto)`` tuple. ``tipo`` is ``'pdf'`` (whole document)
        or ``'pdf_parte'`` (page range), with ``'_defeito'`` appended if
        at least one page failed to render, or ``'pdf_longo'`` when the
        document has more than ``MAX_PAGINAS_PDF`` pages. ``texto`` is the
        cleaned text, or None when the document is too long or could not
        be parsed.
    """
    parametros = pdf_layout.LAParams(word_margin=100)
    gerenciador = pdf_interp.PDFResourceManager()
    str_saida = cStringIO.StringIO()

    # open() works on both Python 2 and 3; file() is a Python 2-only builtin.
    arquivo_pdf = open(nome_arquivo, 'rb')

    dispositivo = pdf_converter.HTMLConverter(
        gerenciador, str_saida, codec='utf-8', laparams=parametros)
    interpretador = pdf_interp.PDFPageInterpreter(gerenciador, dispositivo)

    if pagina is None:
        tipo_aux = 'pdf'
        # The end bound must be None, not -1: a slice ending at -1 drops
        # the last page (and yields nothing for a one-page document).
        pg_inicio, pg_fim = 0, None
    else:
        tipo_aux = 'pdf_parte'
        pg_inicio, pg_fim = pagina - 1, pagina + 1

    tipo = tipo_aux

    try:
        parser = pdf_parser.PDFParser(arquivo_pdf)
        documento = pdf_parser.PDFDocument()
        parser.set_document(documento)
        documento.set_parser(parser)

        paginas = list(documento.get_pages())

        if len(paginas) > MAX_PAGINAS_PDF:
            return 'pdf_longo', None

        for pagina_atual in paginas[pg_inicio:pg_fim]:
            try:
                interpretador.process_page(pagina_atual)
            except Exception:
                # Best effort: keep converting the remaining pages, but
                # tell the caller that the output is incomplete.
                tipo = tipo_aux + '_defeito'

        # Read the buffer before the cleanup below closes it.
        dados_html = str_saida.getvalue()
    except (AssertionError, pdf_parser.PDFSyntaxError):
        return tipo, None
    finally:
        arquivo_pdf.close()
        dispositivo.close()
        str_saida.close()

    removido_tags = _converte_html(dados_html, 'div')
    # Raw strings keep the patterns identical while avoiding the invalid
    # "\ " escape sequences of the non-raw literals.
    removido_espacos_desnecessarios = re.sub(
        r"\ +ç", "ç", removido_tags, flags=re.IGNORECASE)
    # Drop the "Page N" / page-number line pairs.
    sem_numero_pagina = re.sub(
        r"\nPage\ [0-9]+\ *\n[0-9]+\ *\n", "\n",
        removido_espacos_desnecessarios)
    # Collapse runs of spaces/tabs, then runs of (possibly padded) newlines.
    espacos_colapsados = re.sub(r'[\ \t]+', r' ', sem_numero_pagina)
    texto_final = re.sub(r'(\ *\n)+', '\n', espacos_colapsados)

    return tipo, texto_final
Ejemplo n.º 4
0
def parseXFA(path):
	"""Dump the XFA entries of the PDF at `path` as filtered text.

	Each (key, raw stream data) pair from the document's XFA list is
	stringified and JSON-dumped; only ASCII letters, digits, spaces and
	'/' are kept from that dump, and the result is passed through
	stripHTML.
	"""
	with open(path, "rb") as handle:
		document = pdfdocument.PDFDocument(pdfparser.PDFParser(handle))
		entries = []
		for key, value in xfa_alist(xfa(acroform(document))):
			entries.append(str((key, stream_raw_data(value))))
		dumped = json.dumps(entries, indent=4)
		# Whitelist of characters that survive the filtering step.
		allowed = ascii_letters + digits + ' ' + '/'
		filtered = ''.join(ch for ch in dumped if ch in allowed)

	return stripHTML(filtered)
Ejemplo n.º 5
0
def pdf_metadata(fname):
    """Collect the info-dictionary entries of the PDF file `fname`.

    Returns a dict keyed by the lower-cased entry name. Values that
    offer a `resolve()` method are resolved; anything else is stored
    as its `str()` form. Later info dictionaries overwrite earlier
    ones on key collisions.
    """
    metadata = {}
    with open(fname, "rb") as handle:
        document = pdfdocument.PDFDocument(pdfparser.PDFParser(handle))

        for entry in document.info:
            for key in entry:
                raw = entry[key]
                try:
                    value = raw.resolve()
                except AttributeError:
                    # No resolve(): fall back to the string form.
                    value = str(raw)
                metadata[key.lower()] = value
    return metadata
Ejemplo n.º 6
0
def get_identifier(stream):
    """Return the literature identifier found in a PDF.

    If the document catalog has a 'Metadata' entry, a DOI is looked up in
    it with `get_doi`. Otherwise the page text is extracted and searched
    with `vixra_regex` for an arXiv identifier.

    :param stream: file-like object holding the PDF. The text-extraction
        path reads `stream._file`, so the object must expose that
        attribute (NOTE(review): confirm which wrapper type callers pass).
    :return: dict with the identifier type and value, e.g.
        ``{'arXiv': '1805.03977'}``,
        ``{'doi': '10.1016/j.rser.2016.06.056'}`` or ``{'None': ''}``.
        An empty string is returned instead when text extraction fails;
        that legacy return value is kept for existing callers.
    """
    identifier = {}
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    device = TextConverter(rsrcmgr, sio, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    doc = pdfdocument.PDFDocument(pdfparser.PDFParser(stream), caching=True)

    if 'Metadata' in dict(doc.catalog):
        metadata = pdftypes.resolve1(
            doc.catalog['Metadata']).get_data().decode()
        # Look the DOI up once instead of calling get_doi twice.
        doi = get_doi(metadata)
        if doi:
            identifier['doi'] = doi
        else:
            identifier['None'] = ""
        return identifier

    try:
        stream = BufferedReader(stream._file)
        for page in PDFPage.get_pages(stream):
            interpreter.process_page(page)
        # Strip spaces and newlines so the pattern can match across
        # layout breaks.
        text = sio.getvalue().replace(' ', '').replace('\n', '')
        res = re.findall(vixra_regex, text, re.IGNORECASE)
        if res:
            # Matches are reversed back with [::-1]; presumably the stamp
            # is extracted back-to-front - TODO confirm against
            # vixra_regex. With several distinct matches the one picked
            # from the set is arbitrary.
            arxiv_id = list(set([r.strip(".") for r in res]))[0][::-1]
            # Remove "v" followed by a digit (looks like the version
            # marker).
            arxiv_id = re.sub(r'v([0-9])', '', arxiv_id)
            identifier['arXiv'] = arxiv_id
        else:
            identifier['None'] = ""
    except Exception:
        # Best effort: any extraction failure yields the legacy "" result.
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        return ""
    return identifier
Ejemplo n.º 7
0
    def scan(self, file_object, options):
        """Scan a PDF: flag risky keys, collect URIs, extract streams and text.

        Every object id listed in the document's xrefs is visited. Dict
        objects are checked for auto-action and JavaScript keys and for
        annotated URIs; stream objects are appended to `self.children`
        as child files until `limit` streams have been extracted. When
        `extract_text` is set, the page text is appended as one more
        child file.

        Args:
            file_object: file being scanned. Its `data` is parsed, `flags`
                receives the scanner flags, and `depth`, `uid`, `root_uid`,
                `hash` and `root_hash` are copied onto the children.
            options: dict of scanner options. "extract_text" (default
                False) enables text extraction; "limit" (default 2000)
                caps the number of extracted stream objects.

        Results are written to `self.metadata`, `self.children` and
        `file_object.flags`; nothing is returned.
        """
        extract_text = options.get("extract_text", False)
        file_limit = options.get("limit", 2000)

        self.metadata["total"] = {"objects": 0, "extracted": 0}
        extracted_objects = set()

        try:
            with io.BytesIO(file_object.data) as pdf_object:
                parsed_pdf = pdfparser.PDFParser(pdf_object)
                pdf_document = pdfdocument.PDFDocument(parsed_pdf)

                self.metadata.setdefault("annotatedUris", [])
                for xref in pdf_document.xrefs:
                    for object_id in xref.get_objids():
                        self.metadata["total"]["objects"] += 1

                        try:
                            # Named pdf_obj to avoid shadowing the builtin
                            # `object`.
                            pdf_obj = pdf_document.getobj(object_id)
                            if isinstance(pdf_obj, dict):
                                for (key, value) in pdf_obj.items():
                                    if key in ["AA", "OpenAction"]:
                                        file_object.flags.append(f"{self.scanner_name}::auto_action")
                                    if key in ["JS", "Javascript"]:
                                        file_object.flags.append(f"{self.scanner_name}::javascript_embedded")

                                    try:
                                        if key == "A":
                                            uri = value.get("URI")
                                            if (uri is not None and
                                                    uri not in self.metadata["annotatedUris"]):
                                                self.metadata["annotatedUris"].append(uri)

                                    except AttributeError:
                                        # `value` has no .get(); nothing to
                                        # record for this key.
                                        pass

                            # Keep counting objects after the limit is hit,
                            # but stop extracting streams.
                            if self.metadata["total"]["extracted"] >= file_limit:
                                continue
                            if isinstance(pdf_obj, pdftypes.PDFStream):
                                try:
                                    child_filename = f"{self.scanner_name}::object_{object_id}"
                                    child_fo = objects.StrelkaFile(data=pdf_obj.get_data(),
                                                                   filename=child_filename,
                                                                   depth=file_object.depth + 1,
                                                                   parent_uid=file_object.uid,
                                                                   root_uid=file_object.root_uid,
                                                                   parent_hash=file_object.hash,
                                                                   root_hash=file_object.root_hash,
                                                                   source=self.scanner_name)
                                    # The same id can show up in more than
                                    # one xref; extract it only once.
                                    if object_id not in extracted_objects:
                                        self.children.append(child_fo)
                                        extracted_objects.add(object_id)
                                        self.metadata["total"]["extracted"] += 1

                                except TypeError:
                                    file_object.flags.append(f"{self.scanner_name}::type_error_{object_id}")
                                except struct.error:
                                    file_object.flags.append(f"{self.scanner_name}::struct_error_{object_id}")

                        except ValueError:
                            file_object.flags.append(f"{self.scanner_name}::value_error_{object_id}")
                        except pdftypes.PDFObjectNotFound:
                            file_object.flags.append(f"{self.scanner_name}::object_not_found_{object_id}")
                        except pdftypes.PDFNotImplementedError:
                            file_object.flags.append(f"{self.scanner_name}::not_implemented_error_{object_id}")
                        # PSSyntaxError is taken from psparser, matching the
                        # document-level handler at the end of this method.
                        except psparser.PSSyntaxError:
                            file_object.flags.append(f"{self.scanner_name}::ps_syntax_error_{object_id}")

                if extract_text:
                    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                    retstr = io.StringIO()
                    la_params = layout.LAParams(detect_vertical=True,
                                                char_margin=1.0,
                                                line_margin=0.3,
                                                word_margin=0.3)
                    device = converter.TextConverter(rsrcmgr, retstr,
                                                     codec="utf-8",
                                                     laparams=la_params)
                    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                    for page in pdfpage.PDFPage.get_pages(pdf_object, set()):
                        try:
                            interpreter.process_page(page)

                        except struct.error:
                            file_object.flags.append(f"{self.scanner_name}::text_struct_error")

                    pdf_object_text = retstr.getvalue()
                    child_filename = f"{self.scanner_name}::text"
                    child_fo = objects.StrelkaFile(data=pdf_object_text,
                                                   filename=child_filename,
                                                   depth=file_object.depth + 1,
                                                   parent_uid=file_object.uid,
                                                   root_uid=file_object.root_uid,
                                                   parent_hash=file_object.hash,
                                                   root_hash=file_object.root_hash,
                                                   source=self.scanner_name)
                    self.children.append(child_fo)
                    file_object.flags.append(f"{self.scanner_name}::extracted_text")
                    device.close()
                    retstr.close()

        # Document-level failures end the scan and are recorded as flags.
        except IndexError:
            file_object.flags.append(f"{self.scanner_name}::index_error")
        except pdfdocument.PDFEncryptionError:
            file_object.flags.append(f"{self.scanner_name}::encrypted_pdf")
        except pdfparser.PDFSyntaxError:
            file_object.flags.append(f"{self.scanner_name}::pdf_syntax_error")
        except psparser.PSEOF:
            file_object.flags.append(f"{self.scanner_name}::ps_eof")
        except psparser.PSSyntaxError:
            file_object.flags.append(f"{self.scanner_name}::ps_syntax_error")
Ejemplo n.º 8
0
    def scan(self, data, file, options, expire_at):
        """Scan a PDF: flag risky keys, collect URIs, extract streams and text.

        Every object id listed in the document's xrefs is visited. Dict
        objects are checked for auto-action and JavaScript keys and for
        annotated URIs; stream objects are uploaded to the cache and
        appended to `self.files` until `limit` streams have been
        extracted. When `extract_text` is set, the page text is uploaded
        as one more file.

        Args:
            data: raw bytes of the PDF.
            file: not used by this method.
            options: dict of scanner options. 'extract_text' (default
                False) enables text extraction; 'limit' (default 2000)
                caps the number of extracted stream objects.
            expire_at: passed through to `self.upload_to_cache` for every
                uploaded chunk.

        Results are written to `self.event`, `self.files` and
        `self.flags`; nothing is returned.
        """
        extract_text = options.get('extract_text', False)
        file_limit = options.get('limit', 2000)

        self.event['total'] = {'objects': 0, 'extracted': 0}
        extracted_objects = set()

        try:
            with io.BytesIO(data) as pdf_io:
                parsed = pdfparser.PDFParser(pdf_io)
                pdf = pdfdocument.PDFDocument(parsed)

                self.event.setdefault('annotated_uris', [])
                for xref in pdf.xrefs:
                    for object_id in xref.get_objids():
                        self.event['total']['objects'] += 1

                        try:
                            # Named pdf_obj to avoid shadowing the builtin
                            # `object`.
                            pdf_obj = pdf.getobj(object_id)
                            if isinstance(pdf_obj, dict):
                                for (key, value) in pdf_obj.items():
                                    if key in ['AA', 'OpenAction']:
                                        self.flags.append('auto_action')
                                    if key in ['JS', 'Javascript']:
                                        self.flags.append(
                                            'javascript_embedded')

                                    try:
                                        if key == 'A':
                                            uri = value.get('URI')
                                            uris = self.event['annotated_uris']
                                            # Skip actions without a URI
                                            # so None is never recorded.
                                            if (uri is not None
                                                    and uri not in uris):
                                                uris.append(uri)

                                    except AttributeError:
                                        # `value` has no .get(); nothing to
                                        # record for this key.
                                        pass

                            # Keep counting objects after the limit is hit,
                            # but stop extracting streams.
                            if self.event['total']['extracted'] >= file_limit:
                                continue
                            if isinstance(pdf_obj, pdftypes.PDFStream):
                                try:
                                    # The same id can show up in more than
                                    # one xref; extract it only once.
                                    if object_id not in extracted_objects:
                                        extract_file = strelka.File(
                                            name=f'object_{object_id}',
                                            source=self.name,
                                        )

                                        for c in strelka.chunk_string(
                                                pdf_obj.get_data()):
                                            self.upload_to_cache(
                                                extract_file.pointer,
                                                c,
                                                expire_at,
                                            )

                                        self.files.append(extract_file)
                                        self.event['total']['extracted'] += 1
                                        extracted_objects.add(object_id)

                                # The per-object flags below are f-strings
                                # so the flag carries the actual object id.
                                except TypeError:
                                    self.flags.append(
                                        f'type_error_{object_id}')
                                except struct.error:
                                    self.flags.append(
                                        f'struct_error_{object_id}')

                        except ValueError:
                            self.flags.append(f'value_error_{object_id}')
                        except pdftypes.PDFObjectNotFound:
                            self.flags.append(f'object_not_found_{object_id}')
                        except pdftypes.PDFNotImplementedError:
                            self.flags.append(
                                f'not_implemented_error_{object_id}')
                        except psparser.PSSyntaxError:
                            self.flags.append(f'ps_syntax_error_{object_id}')

                if extract_text:
                    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                    retstr = io.StringIO()
                    la_params = layout.LAParams(
                        detect_vertical=True,
                        char_margin=1.0,
                        line_margin=0.3,
                        word_margin=0.3,
                    )
                    device = converter.TextConverter(
                        rsrcmgr,
                        retstr,
                        codec='utf-8',
                        laparams=la_params,
                    )
                    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                    # get_pages needs the file-like object, not the raw
                    # bytes in `data`.
                    for page in pdfpage.PDFPage.get_pages(pdf_io, set()):
                        try:
                            interpreter.process_page(page)

                        except struct.error:
                            self.flags.append('text_struct_error')

                    extract_file = strelka.File(
                        name='text',
                        source=self.name,
                    )
                    for c in strelka.chunk_string(retstr.getvalue()):
                        self.upload_to_cache(
                            extract_file.pointer,
                            c,
                            expire_at,
                        )
                    self.files.append(extract_file)

                    self.flags.append('extracted_text')
                    device.close()
                    retstr.close()

        # Document-level failures end the scan and are recorded as flags.
        except IndexError:
            self.flags.append('index_error')
        except pdfdocument.PDFEncryptionError:
            self.flags.append('encrypted_pdf')
        except pdfparser.PDFSyntaxError:
            self.flags.append('pdf_syntax_error')
        except psparser.PSEOF:
            self.flags.append('ps_eof')
        except psparser.PSSyntaxError:
            self.flags.append('ps_syntax_error')
Ejemplo n.º 9
0
 def _get_document(self, my_file):
     """Open `my_file` in binary mode and return a PDFDocument parsed from it.

     The file handle is not closed here.
     """
     return pdfdocument.PDFDocument(pdfparser.PDFParser(open(my_file, "rb")))