def compare(file1, file2, **kwargs):
    """Extract text from two PDFs and return their diff.

    Args:
        file1, file2: paths of the PDF files to compare.
        **kwargs: forwarded to ``high_level.extract_text_to_fp``; also read
            here: the LAParams group arguments, ``outfile`` (an ``.htm*``
            extension selects an HTML diff) and ``context_lines``
            (required -- unified-diff context size).

    Returns:
        An HTML diff string when ``outfile`` ends in ``.htm*``, otherwise a
        unified-diff generator of the two extracted texts.
    """
    # If any LAParams group arguments were passed, create an LAParams
    # object and populate it with the given args. Otherwise leave the
    # caller-supplied laparams untouched.
    if kwargs.get('laparams', None) is None:
        laparams = layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            paramv = kwargs.get(param, None)
            if paramv is not None:
                # BUG FIX: LAParams does not support item assignment
                # (laparams[param] = ... raises TypeError); its options are
                # plain attributes, so use setattr().
                setattr(laparams, param, paramv)
        kwargs['laparams'] = laparams

    s1 = io.StringIO()
    with open(file1, "rb") as fp:
        high_level.extract_text_to_fp(fp, s1, **kwargs)
    s2 = io.StringIO()
    with open(file2, "rb") as fp:
        high_level.extract_text_to_fp(fp, s2, **kwargs)

    import difflib
    s1.seek(0)
    s2.seek(0)
    s1, s2 = s1.readlines(), s2.readlines()

    import os.path
    try:
        # First three letters of the extension cover 'htm' and 'html'.
        extension = os.path.splitext(kwargs['outfile'])[1][1:4]
        if extension.lower() == 'htm':
            return difflib.HtmlDiff().make_file(s1, s2)
    except KeyError:
        # No outfile given -- fall through to the plain unified diff.
        pass
    return difflib.unified_diff(s1, s2, n=kwargs['context_lines'])
def pdf_to_text(pdf):
    """Return extracted text from PDF.

    Warning: This function can be slow... up to 300ms per page
    This function does not perform optical character recognition.

    Args:
        pdf: bytestring of PDF file
    Returns:
        str of text extracted from `pdf` contents.
    """
    # One buffer feeds the parser, the other collects converter output.
    src = StringIO.StringIO(pdf)
    sink = StringIO.StringIO()

    # Wire parser and document together (legacy pdfminer API).
    parser = pdfparser.PDFParser(src)
    doc = pdfparser.PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password='')

    # Render every page as plain text into the output buffer.
    manager = pdfinterp.PDFResourceManager()
    device = converter.TextConverter(manager, outfp=sink, codec='utf-8',
                                     laparams=layout.LAParams())
    interpreter = pdfinterp.PDFPageInterpreter(manager, device)
    for page in doc.get_pages():
        interpreter.process_page(page)
    return sink.getvalue()
def reorder_objects(objects, method='LRTB'):
    """Sort layout objects in reading order by their bounding boxes.

    Parameters
    ----------
    objects : list
        A list of any type of objects defined in `pdfminer.layout`.
    method : str, optional
        Reading order to apply:
        - 'LRTB': top-left towards bottom-right.
        - 'TBRL': top-right towards bottom-left.

    Returns
    -------
    objects : list
        The same list, sorted in place and returned.

    See also:
    https://github.com/pdfminer/pdfminer.six/blob/7254530/pdfminer/layout.py#L573-L590
    """
    if method not in ['LRTB', 'TBRL']:
        raise ValueError(f'`method` should be one of {["LRTB", "TBRL"]}')
    # boxes_flow weights the horizontal vs. vertical position exactly as
    # pdfminer's own layout analysis does (see link above).
    flow = pmla.LAParams().boxes_flow
    if method == 'LRTB':
        sort_key = lambda obj: ((1 - flow) * obj.x0
                                - (1 + flow) * (obj.y0 + obj.y1))
    else:
        sort_key = lambda obj: (-(1 + flow) * (obj.x0 + obj.x1)
                                - (1 - flow) * obj.y1)
    objects.sort(key=sort_key)
    return objects
def _parse_pdf(self, stream):
    """Parse a single PDF and return the date and description.

    Args:
        stream: an open binary file-like object with a ``name`` attribute.

    Returns:
        dict mapping field name to extracted value (value may be falsy
        when the field was never found).
    """
    LOG.info("Parsing accident report data from %s" % stream.name)
    fields = self._get_fields()
    try:
        # so much pdfminer boilerplate....
        document = pdfdocument.PDFDocument(pdfparser.PDFParser(stream))
        rsrcmgr = pdfinterp.PDFResourceManager()
        device = pdfconverter.PDFPageAggregator(
            rsrcmgr, laparams=pdflayout.LAParams())
        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
    except psparser.PSException as err:
        # Logger.warn is a deprecated alias of Logger.warning.
        LOG.warning("Parsing %s failed, skipping: %s" % (stream.name, err))
        return {f.name: f.value for f in fields}
    # enumerate replaces the hand-maintained page_num counter.
    for page_num, page in enumerate(
            pdfpage.PDFPage.create_pages(document), start=1):
        LOG.debug("Parsing page %s" % page_num)
        interpreter.process_page(page)
        layout = device.get_result()
        for field in fields:
            field.update(layout, page=page_num)
            # Give up early when a mandatory field is still missing past
            # its last plausible page.
            if (not field.value and field.short_circuit
                    and page_num >= field.maxpage):
                LOG.warning("No %s found in %s, aborting parsing" %
                            (field.name, stream.name))
                return {f.name: f.value for f in fields}
    return {f.name: f.value for f in fields}
def check_first_page_is_cover(pdf: bytes) -> bool:
    """Return True if the first page of *pdf* looks like a cover page.

    Heuristic: a cover page carries at most 100 whitespace-separated
    words of extractable text.

    Args:
        pdf: raw bytes of the PDF document.
    """
    with io.StringIO() as test_string:
        params = layout.LAParams(line_margin=2)
        # BUG FIX: extract_text_to_fp expects a binary file-like object as
        # its first argument, not raw bytes -- wrap the bytestring.
        extract_text_to_fp(io.BytesIO(pdf), test_string,
                           page_numbers=[0], laparams=params)
        first_page = test_string.getvalue()
        return len(first_page.split()) <= 100
def extract(pdf_path):
    """Pick a parser for the PDF at *pdf_path* based on needles in its text."""
    # A huge char_margin keeps each physical line as a single text chunk.
    text = pdf.extract_text(
        pdf_path, laparams=pdflayout.LAParams(char_margin=1000.0))
    # Keep the needles (parser keys) that actually occur in the text.
    matching = [needle for needle in parsers.keys() if needle in text]
    if not matching:
        print("Pdf", pdf_path, "ignored, not parser for it.")
        # NOTE(review): this branch returns a 2-tuple while the success
        # branch returns a 3-tuple -- callers must handle both shapes.
        return text, None
    parser = parsers[matching[0]]
    print("Loaded", pdf_path, "as", parser.__module__.split(".")[-1])
    return parser, pdf_path, text
def converte_pdf(nome_arquivo, semaforo=None, profundidade=None, pagina=None):
    # Convert a PDF file to cleaned plain text via pdfminer's HTMLConverter.
    # Returns (tipo, texto): tipo tags the outcome ('pdf', 'pdf_parte',
    # '*_defeito', 'pdf_longo') and texto is the cleaned text, or None on
    # failure.  `semaforo` and `profundidade` are accepted but unused here.
    # NOTE(review): uses Python 2 builtins (file(), cStringIO) -- this
    # module presumably targets Python 2; confirm before porting.
    parametros = pdf_layout.LAParams(word_margin=100)
    gerenciador = pdf_interp.PDFResourceManager()
    str_saida = cStringIO.StringIO()
    arquivo_pdf = file(nome_arquivo, 'rb')
    dispositivo = pdf_converter.HTMLConverter(gerenciador, str_saida, \
        codec='utf-8', laparams=parametros)
    interpretador = pdf_interp.PDFPageInterpreter(gerenciador, dispositivo)
    # pagina == None -> whole document; otherwise a one-page window around
    # the requested (1-based) page.
    if pagina == None:
        tipo_aux = 'pdf'
        pg_inicio, pg_fim = 0, -1
    else:
        tipo_aux = 'pdf_parte'
        pg_inicio, pg_fim = pagina - 1, pagina + 1
    tipo = tipo_aux
    try:
        parser = pdf_parser.PDFParser(arquivo_pdf)
        documento = pdf_parser.PDFDocument()
        parser.set_document(documento)
        documento.set_parser(parser)
        paginas = [p for p in documento.get_pages()]
        # Refuse very long documents outright.
        if len(paginas) > MAX_PAGINAS_PDF:
            return 'pdf_longo', None
        # NOTE(review): with pg_fim == -1 the slice [0:-1] drops the final
        # page of the document -- confirm this is intended.
        for pagina_atual in paginas[pg_inicio:pg_fim]:
            try:
                interpretador.process_page(pagina_atual)
            except Exception:
                # A broken page downgrades the result tag but does not
                # abort the remaining pages.
                tipo = tipo_aux + '_defeito'
        dados_html = str_saida.getvalue()
    except (AssertionError, pdf_parser.PDFSyntaxError):
        return tipo, None
    finally:
        # Handles are released on every path, including early returns.
        arquivo_pdf.close()
        dispositivo.close()
        str_saida.close()
    # Strip markup, then normalise whitespace: rejoin broken cedillas,
    # drop "Page N" page-number lines, collapse runs of spaces/newlines.
    removido_tags = _converte_html(dados_html, 'div')
    removido_espacos_desnecessarios = re.sub("\ +ç", "ç", removido_tags, \
        flags=re.IGNORECASE)
    sem_numero_pagina = re.sub("\nPage\ [0-9]+\ *\n[0-9]+\ *\n", "\n", \
        removido_espacos_desnecessarios)
    texto_final = re.sub('(\ *\n)+', '\n', re.sub('[\ \t]+', r' ', \
        sem_numero_pagina))
    return tipo, texto_final
def get_ltpages(infile, caching=True):
    """Return the aggregated LTPage layout for every page of *infile*."""
    manager = pdfinterp.PDFResourceManager(caching=caching)
    device = converter.PDFPageAggregator(
        manager, laparams=layout.LAParams(detect_vertical=False))
    interpreter = pdfinterp.PDFPageInterpreter(manager, device)
    results = []
    for page in PDFPage.get_pages(infile, caching=caching):
        interpreter.process_page(page)
        results.append(device.get_result())
    device.close()
    return results
def _parse_pages(self, document):
    """Return the info extracted for the PDF BORME pages."""
    resource_manager = pdfinterp.PDFResourceManager()
    # char_margin is expressed as a proportion of the character size, not
    # as an absolute length: chunks closer than it are merged into one;
    # word_margin inserts blanks where words are only separated by
    # positioning; line_margin groups nearby lines into one text box.
    # 6.0 --> all without one
    # params = layout.LAParams(char_margin=8.0)
    params = layout.LAParams(char_margin=14.0)
    device = converter.PDFPageAggregator(resource_manager, laparams=params)
    interpreter = pdfinterp.PDFPageInterpreter(resource_manager, device)

    # First pass: raw acts per page (first page is parsed specially).
    raw_pages = []
    for page_no, page in enumerate(pdfpage.PDFPage.create_pages(document)):
        interpreter.process_page(page)
        my_layout = device.get_result()
        acts = self._parse_raw_acts(my_layout, page_no == 0)
        raw_pages.append(acts)
        self._log.debug(
            "Page number: %i Acts: %i" % (len(raw_pages), len(acts)))

    # Second pass: resolve each page against the pages that follow it and
    # the last act carried over from the previous page.
    pages = []
    total = len(raw_pages)
    for idx, raw_page in enumerate(raw_pages):
        next_pages = raw_pages[idx + 1:] if idx < total - 1 else []
        last_page_act = None
        if pages and pages[-1]:
            last_page_act = pages[-1][-1]
        acts = self._parse_acts(raw_page, next_pages, last_page_act)
        pages.append(acts)
    return pages
def pdf_miner(from_file, to_txt):
    """Extract the text of *from_file* into *to_txt* using pdfminer.

    Best-effort: any extraction failure is logged as critical and the
    function simply returns, leaving the caller to fall back.

    Args:
        from_file: object with a ``.path`` to the source PDF.
        to_txt: object with a ``.path`` for the text output.
    """
    log.debug('trying with pdfminer')
    pdf = codecs.open(from_file.path, mode='rb')
    output = codecs.open(to_txt.path, mode='wb')
    device = None
    try:
        resourceman = pdfinterp.PDFResourceManager()
        device = converter.TextConverter(
            resourceman, output, laparams=layout.LAParams())
        interpreter = pdfinterp.PDFPageInterpreter(resourceman, device)
        for page in pdfpage.PDFPage.get_pages(pdf):
            interpreter.process_page(page)
    # BUG FIX: "except Exception, e" is Python-2-only syntax; "as e" works
    # on both 2.6+ and 3.x.  Deliberately broad: any pdfminer failure is
    # swallowed so the caller can try another extractor.
    except Exception as e:
        log.critical(e)
    finally:
        # BUG FIX: the original leaked all three handles whenever an
        # exception was raised; close them on every path.
        if device is not None:
            device.close()
        output.close()
        pdf.close()
    return
def get_layouts(self):
    """
    get layouts from raw pdf file.
    :return: (page idx, layout obj) generator
    """
    with open(self.file_path, 'rb') as fp:
        parser = PDFParser(fp)
        # PDFDocument holds the parsed document structure.
        document = PDFDocument(parser, password='')
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        resources = PDFResourceManager()  # shared resources (fonts, images)
        aggregator = PDFPageAggregator(resources, laparams=LT.LAParams())
        interpreter = PDFPageInterpreter(resources, aggregator)
        for page_idx, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            # Yield the aggregated layout for the page just processed.
            yield page_idx, aggregator.get_result()
def create_html(pdf_file_path, newfile, password):
    """Render every page of the PDF at *pdf_file_path* as HTML into *newfile*."""
    doc = doc_parser(pdf_file_path, password)
    # Resource manager caches shared assets such as fonts and images.
    resources = PDFResourceManager()
    # Layout analysis yields an LTPage tree (text boxes, figures, curves,
    # text lines); analyse vertical text and text inside figures too.
    params = Layout.LAParams(all_texts=True, detect_vertical=True)
    with open(newfile, 'w+') as out:
        # The device translates page content into HTML written to `out`.
        renderer = HTMLConverter(resources, out, laparams=params,
                                 layoutmode='loose', showpageno=False,
                                 rect_colors={'curve': None})
        # The interpreter processes page contents and drives the device.
        interpreter = PDFPageInterpreter(resources, renderer)
        for page in doc.get_pages():
            interpreter.process_page(page)
def scan(self, file_object, options):
    # Scan a PDF: count its objects, flag risky dictionary keys (auto
    # actions, embedded JavaScript, annotated URIs), extract stream
    # objects as child files and optionally extract the document text.
    #
    # options:
    #   extract_text (bool, default False): also emit the full text as a
    #     child file.
    #   limit (int, default 2000): max number of stream objects to extract.
    extract_text = options.get("extract_text", False)
    file_limit = options.get("limit", 2000)
    self.metadata["total"] = {"objects": 0, "extracted": 0}
    extracted_objects = set()
    try:
        with io.BytesIO(file_object.data) as pdf_object:
            parsed_pdf = pdfparser.PDFParser(pdf_object)
            pdf_document = pdfdocument.PDFDocument(parsed_pdf)
            self.metadata.setdefault("annotatedUris", [])
            # Walk every object referenced from every xref table.
            for xref in pdf_document.xrefs:
                for object_id in xref.get_objids():
                    self.metadata["total"]["objects"] += 1
                    try:
                        object = pdf_document.getobj(object_id)
                        if isinstance(object, dict):
                            for (key, value) in object.items():
                                # Automatic actions and embedded JavaScript
                                # are common malicious-PDF indicators.
                                if key in ["AA", "OpenAction"]:
                                    file_object.flags.append(
                                        f"{self.scanner_name}::auto_action")
                                if key in ["JS", "Javascript"]:
                                    file_object.flags.append(
                                        f"{self.scanner_name}::javascript_embedded")
                                try:
                                    # "A" (action) entries may carry a URI.
                                    if key == "A":
                                        uri = value.get("URI")
                                        if (uri is not None and
                                                uri not in self.metadata["annotatedUris"]):
                                            self.metadata["annotatedUris"].append(uri)
                                except AttributeError:
                                    # value was not dict-like; ignore.
                                    pass
                        # Respect the extraction cap but keep counting.
                        if self.metadata["total"]["extracted"] >= file_limit:
                            continue
                        if isinstance(object, pdftypes.PDFStream):
                            try:
                                child_filename = f"{self.scanner_name}::object_{object_id}"
                                child_fo = objects.StrelkaFile(
                                    data=object.get_data(),
                                    filename=child_filename,
                                    depth=file_object.depth + 1,
                                    parent_uid=file_object.uid,
                                    root_uid=file_object.root_uid,
                                    parent_hash=file_object.hash,
                                    root_hash=file_object.root_hash,
                                    source=self.scanner_name)
                                # Each object id is extracted at most once
                                # even if referenced from several xrefs.
                                if object_id not in extracted_objects:
                                    self.children.append(child_fo)
                                    extracted_objects.add(object_id)
                                    self.metadata["total"]["extracted"] += 1
                            except TypeError:
                                file_object.flags.append(
                                    f"{self.scanner_name}::type_error_{object_id}")
                            except struct.error:
                                file_object.flags.append(
                                    f"{self.scanner_name}::struct_error_{object_id}")
                    except ValueError:
                        file_object.flags.append(
                            f"{self.scanner_name}::value_error_{object_id}")
                    except pdftypes.PDFObjectNotFound:
                        file_object.flags.append(
                            f"{self.scanner_name}::object_not_found_{object_id}")
                    except pdftypes.PDFNotImplementedError:
                        file_object.flags.append(
                            f"{self.scanner_name}::not_implemented_error_{object_id}")
                    except pdftypes.PSSyntaxError:
                        file_object.flags.append(
                            f"{self.scanner_name}::ps_syntax_error_{object_id}")
            if extract_text:
                # Second pass over the same buffer: render all text.
                rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                retstr = io.StringIO()
                la_params = layout.LAParams(detect_vertical=True,
                                            char_margin=1.0,
                                            line_margin=0.3,
                                            word_margin=0.3)
                device = converter.TextConverter(rsrcmgr, retstr,
                                                 codec="utf-8",
                                                 laparams=la_params)
                interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                for page in pdfpage.PDFPage.get_pages(pdf_object, set()):
                    try:
                        interpreter.process_page(page)
                    except struct.error:
                        file_object.flags.append(
                            f"{self.scanner_name}::text_struct_error")
                pdf_object_text = retstr.getvalue()
                child_filename = f"{self.scanner_name}::text"
                child_fo = objects.StrelkaFile(
                    data=pdf_object_text,
                    filename=child_filename,
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
                file_object.flags.append(
                    f"{self.scanner_name}::extracted_text")
                device.close()
                retstr.close()
    # Document-level failures: flag and stop, never raise to the caller.
    except IndexError:
        file_object.flags.append(f"{self.scanner_name}::index_error")
    except pdfdocument.PDFEncryptionError:
        file_object.flags.append(f"{self.scanner_name}::encrypted_pdf")
    except pdfparser.PDFSyntaxError:
        file_object.flags.append(f"{self.scanner_name}::pdf_syntax_error")
    except psparser.PSEOF:
        file_object.flags.append(f"{self.scanner_name}::ps_eof")
    except psparser.PSSyntaxError:
        file_object.flags.append(f"{self.scanner_name}::ps_syntax_error")
def read_file(self):
    """Extract the text of the PDF at ``self.path`` into
    ``self.roster_as_string`` as a list of lines."""
    buffer = io.StringIO()
    with open(self.path, "rb") as pdffile:
        high_level.extract_text_to_fp(pdffile, buffer,
                                      laparams=layout.LAParams())
    # Despite the attribute name, this stores a list of lines.
    self.roster_as_string = buffer.getvalue().split('\n')
def scrape_pdf(path_to_pdf):
    """Convert PDF to text and scrape data from text.

    Args:
        path_to_pdf (str): path to folder, containing PDFs.

    Returns:
        List of lists. Each nested list contains class objects as elements.
    """
    # making parameters for PDFminer for this specific PDFs
    la_params = layout.LAParams(
        line_overlap=0.4,
        char_margin=3.0,
        line_margin=1.0,
        word_margin=0.15,
        boxes_flow=0.3,
        detect_vertical=False,
        all_texts=False,
    )
    # defining fields (in the order they appear in each PDF)
    fields = [
        "Chief of State",
        "Head of Government",
        "Government Type",
        "Capital",
        "Legislature",
        "Judiciary",
        "Ambassador to US",
        "US Ambassador",
        "Area",
        "Climate",
        "Natural Resources",
        "Economic Overview",
        "GDP (Purchasing Power Parity)",
        "GDP per capita (Purchasing Power Parity)",
        "Exports",
        "Imports",
        "Population",
        "Population Growth",
        "Ethnicity",
        "Language",
        "Religion",
        "Urbanization",
        "Literacy",
    ]
    # accumulators, one per output table
    country_general = []
    country_natural_resources = []
    country_language = []
    country_religion = []
    country_ethnicity = []
    country_import_partners = []
    country_export_partners = []
    for name in os.listdir(path_to_pdf):
        temp_general = []
        filepath = os.path.join(path_to_pdf, name)
        # scraping unformatted text using pdfminer.six
        text = high_level.extract_text(filepath, laparams=la_params)
        # here we extract country name from text (the first line)
        text = text.split("\n", 1)
        country_id = text.pop(0)
        # known extraction glitch in one source PDF
        if country_id == "SAO TOMEAND PRINCIPE":
            country_id = "SAO TOME AND PRINCIPE"
        text = text[0].split()
        # searching for the date of last update for PDF ("as of <date>")
        # and removing it from the word list
        last_update_index = [
            (i, i + 2) for i in range(len(text)) if text[i: i + 2] == ["as", "of"]
        ]
        last_update = " ".join(
            text[last_update_index[0][1]: last_update_index[0][1] + 2]
        )
        last_update = last_update.strip()
        del text[last_update_index[0][0]: last_update_index[0][0] + 4]
        temp_general.append(last_update)
        # We index text with the create_index function
        index_dict = create_index(fields, text, country_id)
        # we work with fieldnames in reverse order, since we need to parse
        # text from end to begining (each field consumes the tail of text)
        for field_name in fields[::-1]:
            field_func = format_field_data(field_name)
            # this handles some expections for Sudan and Chad, where
            # fields Chief of State and Head of Government are joined
            if index_dict[field_name] == [] and field_name == "Chief of State":
                field_data = " ".join(text[5:])
            # some countries don't have some fields, like Literacy
            # so with such countries we set value for this fields to None
            elif index_dict[field_name] == []:
                field_data = None
            # This is main part, that works with most of the text.
            # It starts with the end of text and finds the last field
            # based on index that we got from create_index; the consumed
            # tail is deleted so the next (earlier) field ends here.
            else:
                start_field = index_dict[field_name][0]
                start_content = index_dict[field_name][1]
                field_data = " ".join(text[start_content:])
                del text[start_field:]
            if field_data in ("NA", "N/A"):
                field_data = None
            # dispatch: formatter arity depends on the field kind
            three_arg_fields = ["Imports", "Exports", "Religion",
                                "Ethnicity", "Language"]
            two_arg_fields = ["Population", "Area", "Natural Resources"]
            if field_name in three_arg_fields:
                temp = field_func(country_id, field_data, field_name)
                if field_name == "Imports":
                    temp_general.extend(temp[0])
                    temp_list = [sc.CountryImportPartners(
                        *item) for item in temp[1]]
                    country_import_partners.extend(temp_list)
                elif field_name == "Exports":
                    temp_general.extend(temp[0])
                    temp_list = [sc.CountryExportPartners(
                        *item) for item in temp[1]]
                    country_export_partners.extend(temp_list)
                elif field_name == "Ethnicity":
                    temp_list = [sc.CountryEthnicity(*item) for item in temp]
                    country_ethnicity.extend(temp_list)
                elif field_name == "Religion":
                    temp_list = [sc.CountryReligion(*item) for item in temp]
                    country_religion.extend(temp_list)
                elif field_name == 'Language':
                    temp_list = [sc.CountryLanguage(*item) for item in temp]
                    country_language.extend(temp_list)
            elif field_name in two_arg_fields:
                temp = field_func(country_id, field_data)
                if field_name == "Natural Resources":
                    temp_list = [sc.CountryNaturalResources(
                        *item) for item in temp]
                    country_natural_resources.extend(temp_list)
                else:
                    temp_general.extend(temp)
            else:
                temp = field_func(field_data)
                temp_general.extend(temp)
        # fields were collected tail-first, so reverse before building
        # the record
        temp_general.append(country_id)
        temp_general = sc.CountryGeneral(*temp_general[::-1])
        country_general.append(temp_general)
    print("Finished scraping PDF")
    return [country_general, country_natural_resources,
            country_export_partners, country_import_partners,
            country_ethnicity, country_language, country_religion]
def scan(self, data, file, options, expire_at):
    """Scan a PDF: count objects, flag risky keys, extract streams/text.

    Args:
        data: raw bytes of the PDF.
        file: the strelka file record being scanned (interface-mandated;
            unused directly here).
        options: 'extract_text' (bool, default False) and 'limit'
            (int, default 2000 -- max stream objects to extract).
        expire_at: cache expiry passed through to upload_to_cache.
    """
    extract_text = options.get('extract_text', False)
    file_limit = options.get('limit', 2000)
    self.event['total'] = {'objects': 0, 'extracted': 0}
    extracted_objects = set()
    try:
        with io.BytesIO(data) as pdf_io:
            parsed = pdfparser.PDFParser(pdf_io)
            pdf = pdfdocument.PDFDocument(parsed)
            self.event.setdefault('annotated_uris', [])
            # Walk every object referenced from every xref table.
            for xref in pdf.xrefs:
                for object_id in xref.get_objids():
                    self.event['total']['objects'] += 1
                    try:
                        object = pdf.getobj(object_id)
                        if isinstance(object, dict):
                            for (key, value) in object.items():
                                # Automatic actions and embedded JavaScript
                                # are common malicious-PDF indicators.
                                if key in ['AA', 'OpenAction']:
                                    self.flags.append('auto_action')
                                if key in ['JS', 'Javascript']:
                                    self.flags.append(
                                        'javascript_embedded')
                                try:
                                    if key == 'A':
                                        uri = value.get('URI')
                                        if uri not in self.event[
                                                'annotated_uris']:
                                            self.event[
                                                'annotated_uris'].append(
                                                    uri)
                                except AttributeError:
                                    # value was not dict-like; ignore.
                                    pass
                        # Respect the extraction cap but keep counting.
                        if self.event['total']['extracted'] >= file_limit:
                            continue
                        if isinstance(object, pdftypes.PDFStream):
                            try:
                                if object_id not in extracted_objects:
                                    extract_file = strelka.File(
                                        name=f'object_{object_id}',
                                        source=self.name,
                                    )
                                    for c in strelka.chunk_string(
                                            object.get_data()):
                                        self.upload_to_cache(
                                            extract_file.pointer,
                                            c,
                                            expire_at,
                                        )
                                    self.files.append(extract_file)
                                    self.event['total']['extracted'] += 1
                                    extracted_objects.add(object_id)
                            # BUG FIX: these six flag strings were plain
                            # literals, so the text '{object_id}' was
                            # recorded verbatim; they are now f-strings.
                            except TypeError:
                                self.flags.append(
                                    f'type_error_{object_id}')
                            except struct.error:
                                self.flags.append(
                                    f'struct_error_{object_id}')
                            except ValueError:
                                self.flags.append(
                                    f'value_error_{object_id}')
                    except pdftypes.PDFObjectNotFound:
                        self.flags.append(f'object_not_found_{object_id}')
                    except pdftypes.PDFNotImplementedError:
                        self.flags.append(
                            f'not_implemented_error_{object_id}')
                    except psparser.PSSyntaxError:
                        self.flags.append(f'ps_syntax_error_{object_id}')
            if extract_text:
                # Second pass over the same buffer: render all text.
                rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                retstr = io.StringIO()
                la_params = layout.LAParams(
                    detect_vertical=True,
                    char_margin=1.0,
                    line_margin=0.3,
                    word_margin=0.3,
                )
                device = converter.TextConverter(
                    rsrcmgr,
                    retstr,
                    codec='utf-8',
                    laparams=la_params,
                )
                interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                # BUG FIX: get_pages() needs a file-like object, not raw
                # bytes; rewind the buffer the parser already consumed and
                # reuse it (matches the sibling scanner above).
                pdf_io.seek(0)
                for page in pdfpage.PDFPage.get_pages(pdf_io, set()):
                    try:
                        interpreter.process_page(page)
                    except struct.error:
                        self.flags.append('text_struct_error')
                extract_file = strelka.File(
                    name='text',
                    source=self.name,
                )
                for c in strelka.chunk_string(retstr.getvalue()):
                    self.upload_to_cache(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
                self.files.append(extract_file)
                self.flags.append('extracted_text')
                device.close()
                retstr.close()
    # Document-level failures: flag and stop, never raise to the caller.
    except IndexError:
        self.flags.append('index_error')
    except pdfdocument.PDFEncryptionError:
        self.flags.append('encrypted_pdf')
    except pdfparser.PDFSyntaxError:
        self.flags.append('pdf_syntax_error')
    except psparser.PSEOF:
        self.flags.append('ps_eof')
    except psparser.PSSyntaxError:
        self.flags.append('ps_syntax_error')