def mine_text(pdf_path):
    """Extract a PDF's content as HTML, save a copy to temp.html, and return
    the HTML parsed as a BeautifulSoup tree.

    :param pdf_path: path to the PDF file to mine
    :return: BeautifulSoup object built from the extracted HTML
    """
    # pdfminer.six is Python-3-only, so the old Python-2 BytesIO fallback
    # (and the sys.version_info check) is no longer needed.
    from io import StringIO
    from pdfminer.layout import LAParams

    output_string = StringIO()
    with open(pdf_path, 'rb') as fin:
        extract_text_to_fp(fin, output_string, laparams=LAParams(),
                           output_type='html', codec=None)
    str_html = output_string.getvalue().strip()

    # Persist a debugging copy; set the encoding explicitly so non-ASCII
    # characters in the PDF text survive the round trip.
    with open('temp.html', 'w', encoding='utf-8') as fh:
        fh.write(str_html)

    soup = BeautifulSoup(str_html, "lxml")
    # The original built the soup but never returned it; returning it makes
    # the function usable while leaving callers that ignored the (None)
    # return value unaffected.
    return soup
def _parse_calendar(self, response):
    """Parse dates and details from schedule PDF"""
    text_buf = StringIO()
    extract_text_to_fp(BytesIO(response.body), text_buf,
                       laparams=LAParams(line_margin=0.1))
    # Collapse whitespace and re-attach commas that got separated by layout
    flattened = re.sub(r"\s+", " ", text_buf.getvalue()).replace(" ,", ",")

    date_matches = re.findall(r"[a-zA-Z]{3,10} \d{1,2}, \d{4}", flattened)
    for idx, date_str in enumerate(date_matches):
        # Only every other date in the PDF is a meeting date; skip odd indices
        if idx % 2:
            continue
        meeting = Meeting(
            title="Urban Design and Historic Preservation Commission",
            description="",
            classification=COMMISSION,
            start=self._parse_start(date_str),
            end=None,
            all_day=False,
            time_notes="Confirm details with agency",
            location=self.location,
            links=[],
            source=self.start_urls[0],
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def _parse_calendar(self, response):
    """Parse Board of Trustees meetings from a calendar PDF.

    Splits the extracted text on "Month D, YYYY" headings, groups the text
    following each heading with that date, and yields one Meeting per group.
    """
    lp = LAParams(line_margin=5.0)
    out_str = StringIO()
    extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
    pdf_text = out_str.getvalue()
    # NOTE(review): inside a character class `$` is a literal dollar sign,
    # not an end-of-string anchor — presumably an anchor was intended; confirm
    # against sample PDFs before changing.
    split_dates = re.split(r"([A-Z][a-z]{2,8}\s+\d{1,2}, \d{4}[ \n$])", pdf_text, flags=re.M)
    # Seed with the first captured date; assumes at least one date matched
    # (split_dates[1] raises IndexError otherwise — TODO confirm upstream
    # guarantees a date is always present).
    date_groups = [split_dates[1]]
    for split_str in split_dates[2:]:
        # Captured date separators start a new group; everything else is
        # appended to the most recent group's text.
        if re.search(r"([A-Z][a-z]{2,8}\s+\d{1,2}, \d{4}[ \n$])", split_str):
            date_groups.append(split_str)
        else:
            date_groups[-1] = date_groups[-1] + split_str
    for date_item_str in date_groups:
        # Collapse runs of spaces before parsing the group
        item = re.sub(r" +", " ", date_item_str).strip()
        start = self._parse_start(item)
        if not start:
            continue
        meeting = Meeting(
            title="Board of Trustees",
            description="",
            classification=BOARD,
            start=start,
            end=None,
            all_day=False,
            time_notes="",
            location=self._parse_location(item),
            links=self.agenda_map.get(start.date(), []),
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting, text=item)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def check_first_page_is_cover(pdf: bytes) -> bool:
    """Reads pdf and returns True if its first page looks like a cover page.

    A page with at most 100 whitespace-separated tokens is treated as a cover.

    :param pdf: raw bytes of a PDF document
    :return: True if the first page contains 100 words or fewer
    """
    with io.StringIO() as test_string:
        params = layout.LAParams(line_margin=2)
        # extract_text_to_fp needs a seekable binary file object, not raw
        # bytes, so wrap the payload in BytesIO before handing it over.
        extract_text_to_fp(io.BytesIO(pdf), test_string, page_numbers=[0], laparams=params)
        first_page = test_string.getvalue()
        return len(first_page.split()) <= 100
def get_pdf_text(update, context, is_file):
    """Extract text from the user's stored PDF and send it back as a message
    or as a .txt file.

    :param update: Telegram update that triggered this handler
    :param context: callback context holding user_data and the bot instance
    :param is_file: whether to send the result as a text file
    :return: ConversationHandler.END to finish the conversation
    """
    if not check_user_data(update, context, PDF_INFO):
        return ConversationHandler.END

    _ = set_lang(update, context)
    update.effective_message.reply_text(
        _("Extracting text from your PDF file"), reply_markup=ReplyKeyboardRemove()
    )

    with tempfile.NamedTemporaryFile() as tf:
        user_data = context.user_data
        file_id, file_name = user_data[PDF_INFO]
        pdf_file = context.bot.get_file(file_id)
        pdf_file.download(custom_path=tf.name)

        with tempfile.TemporaryDirectory() as dir_name:
            # Context manager guarantees the temp file is closed even if
            # extraction fails (the original leaked the handle).
            with tempfile.TemporaryFile() as tmp_text:
                with open(tf.name, "rb") as f:
                    extract_text_to_fp(f, tmp_text)
                tmp_text.seek(0)
                pdf_texts = textwrap.wrap(tmp_text.read().decode("utf-8").strip())
            out_fn = os.path.join(dir_name, f"{os.path.splitext(file_name)[0]}.txt")
            send_pdf_text(update, context, pdf_texts, is_file, out_fn)

    # Clean up memory
    if user_data[PDF_INFO] == file_id:
        del user_data[PDF_INFO]

    return ConversationHandler.END
def _parse_notice(self, response):
    """Parse a meeting from the notice PDF's embedded text when present,
    otherwise fall back to the text stashed in response.meta."""
    notice_buf = StringIO()
    extract_text_to_fp(BytesIO(response.body), notice_buf,
                       laparams=LAParams(line_margin=0.1))
    notice_text = notice_buf.getvalue()

    if not notice_text.strip():
        # No embedded text in the PDF — use the meta fallback
        yield self._parse_meeting_text(response.meta["meeting_text"], response.url)
        return

    found_date = re.search(r"[A-Z][a-z]{2,8} \d{1,2},? \d{4}",
                           response.meta["meeting_text"])
    if found_date:
        parsed_date = datetime.strptime(
            found_date.group().replace(",", ""), "%B %d %Y").date()
        existing_titles = [
            link["title"] for link in self.link_date_map[parsed_date]
        ]
        # Register the notice link once per meeting date
        if "Notice" not in existing_titles:
            self.link_date_map[parsed_date].append({
                "title": "Notice",
                "href": response.url
            })
    yield self._parse_meeting_text(re.sub(r"\s+", " ", notice_text),
                                   response.meta["source"])
def pdf_to_text(html=False):
    """Return a dictionary mapping each PDF's base name to its content.

    :param html: when True, extract each PDF as HTML markup; otherwise
        extract plain text with newlines converted to <br /> tags
    :return: dict of {base filename without extension: extracted content}
    """
    pdfs = get_pdfs()
    rename_files()
    all_tasks = {}
    for pdf in pdfs:
        # str.strip(".pdf") removes ANY of the characters '.', 'p', 'd', 'f'
        # from both ends (e.g. "pdfdoc.pdf" -> "oc"), so trim the extension
        # explicitly instead.
        name = pdf[:-len(".pdf")] if pdf.endswith(".pdf") else pdf
        pdf_path = Path(PDF_FOLDER) / Path(name + ".pdf")
        if html:
            output_string = StringIO()
            with open(pdf_path, "rb") as fin:
                extract_text_to_fp(
                    fin,
                    output_string,
                    output_type="html",
                    codec=None,
                )
            all_tasks[name] = output_string.getvalue()
        else:
            text = extract_text(pdf_path)
            # Replace \n to html linebreaks:
            text = text.replace("\n", "<br />\n").strip()
            all_tasks[name] = text
    return all_tasks
def convert_pdf_to_text(path):
    """Return the plain text extracted from the PDF at *path*."""
    text_sink = StringIO()
    with open(path, "rb") as pdf_file:
        extract_text_to_fp(pdf_file, text_sink)
    return text_sink.getvalue()
def compare(file1, file2, **kwargs):
    """Extract both PDFs to text with identical settings and diff the results.

    If no pre-built LAParams object was passed, build one from whichever
    individual layout keyword arguments were supplied.

    :param file1: path of the first PDF
    :param file2: path of the second PDF
    :param kwargs: extraction options forwarded to extract_text_to_fp;
        'outfile' (with a .htm* extension) selects HTML diff output and
        'context_lines' sets the unified-diff context size
    :return: an HTML diff string when outfile ends in .htm*, otherwise a
        unified-diff line generator
    :raises KeyError: when 'context_lines' is missing and a unified diff
        is produced
    """
    import difflib
    import os.path

    if kwargs.get('laparams', None) is None:
        laparams = layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            paramv = kwargs.get(param, None)
            if paramv is not None:
                # LAParams does not support item assignment
                # (laparams[param] = ... raised TypeError); set the attribute.
                setattr(laparams, param, paramv)
        kwargs['laparams'] = laparams

    s1 = io.StringIO()
    with open(file1, "rb") as fp:
        high_level.extract_text_to_fp(fp, s1, **kwargs)
    s2 = io.StringIO()
    with open(file2, "rb") as fp:
        high_level.extract_text_to_fp(fp, s2, **kwargs)

    s1.seek(0)
    s2.seek(0)
    lines1, lines2 = s1.readlines(), s2.readlines()

    try:
        extension = os.path.splitext(kwargs['outfile'])[1][1:4]
        if extension.lower() == 'htm':
            return difflib.HtmlDiff().make_file(lines1, lines2)
    except KeyError:
        # No outfile requested: fall through to a unified diff.
        pass
    return difflib.unified_diff(lines1, lines2, n=kwargs['context_lines'])
def get_decision_citation_item(source: PDPCDecisionItem, options: Options = None) -> (str, str):
    """
    Gets the citation and case number for a PDPCDecisionItem.

    :param source: The PDPCDecisionItem to get the citation and case number.
    :param options: Options affecting how the decision PDF is retrieved.
    :return: A tuple consisting of (citation, case_number); either element
        is an empty string when not found.
    """
    from pdfminer.high_level import extract_text_to_fp
    import io
    import re
    citation = ''
    case_number = ''
    if check_pdf(source.download_url):
        with PDFFile(source, options) as pdf, io.StringIO() as output_string:
            # Only the first two pages are needed to locate the citation
            # and case number.
            extract_text_to_fp(pdf, output_string, page_numbers=[0, 1])
            contents = output_string.getvalue()
            summary_match = re.search(r'SUMMARY OF THE DECISION', contents)
            if not summary_match:
                citation_match = re.search(
                    r'(\[\d{4}])\s+((?:\d\s+)?[A-Z|()]+)\s+\[?(\d+)\]?', contents)
                if citation_match:
                    citation = citation_match.expand(r'\1 \2 \3')
                else:
                    logger.warning(f'No citation found for {source}')
            else:
                # Fixed duplicated word in the original log message ("a a").
                logger.info(f'Decision <{source}> is a summary and does not have a citation.')
            case_match = re.search(r'DP-\s*(\w*)-\s*(\w*)', contents)
            if case_match:
                case_number = case_match.expand(r'DP-\1-\2')
            else:
                logger.warning(f'No case number found for {source}')
    return citation, case_number
def get_contents_with_attributes(path):
    """Extract the PDF at *path* as HTML and return a list of div texts,
    with bold spans wrapped in <b> tags.

    :param path: path to the PDF file
    :return: list of strings, one per non-empty div
    """
    # Close the buffer automatically instead of the original manual close().
    with io.StringIO() as output_io:
        with open(path, 'rb') as pdf_file:  # renamed: 'input' shadowed the builtin
            extract_text_to_fp(pdf_file, output_io,
                               laparams=LAParams(line_margin=0.21, line_overlap=0.4,
                                                 all_texts=False),
                               output_type='html', codec=None)
        html = BeautifulSoup(output_io.getvalue(), 'html.parser')

    final_content = []
    for div in html.find_all("div"):
        temp_div = []
        for span in div.find_all("span"):
            span_text = span.text.strip()
            if not span_text:
                continue
            # The original tested 'bold' twice; a plain if/else is equivalent.
            if 'bold' in span['style'].lower():
                temp_div.append(f'<b>{span_text}</b>')
            else:
                temp_div.append(span_text)
        if temp_div:
            final_content.append(" ".join(temp_div))
    return final_content
def convert_file(filepath: Path) -> None:
    """Extract the text of *filepath* and write it beside the PDF as a .txt file."""
    target = filepath.with_suffix(".txt")
    text_buffer = StringIO()
    with filepath.open("rb") as pdf_file:
        extract_text_to_fp(pdf_file, outfp=text_buffer,
                           laparams=LAParams(), output_type="text")
    with target.open("w") as txt_file:
        txt_file.write(text_buffer.getvalue().strip())
def attribute_checking(self, input_pdf, text, encoding):
    """Return the last div matching *text*'s first three words, with bold
    spans wrapped in <b> tags, or None when no such bold div exists.

    The extracted HTML is cached in self.output_io so the PDF is only
    converted once per instance.

    :param input_pdf: original PDF path; a leading backslash selects
        self.input_file as the extraction source instead of self.flat_pdf
    :param text: text whose first three words identify the target div
    :param encoding: unused here — kept for interface compatibility
    :return: space-joined div text with <b> markup, or None
    """
    text_out = []
    la = LAParams(line_margin=0.18, line_overlap=0.4, all_texts=False)
    if input_pdf.startswith('\\'):
        if not self.output_io.getvalue():
            extract_text_to_fp(self.input_file, self.output_io, laparams=la,
                               output_type='html', codec=None)
    else:
        if not self.output_io.getvalue():
            # renamed: 'input' shadowed the builtin
            with open(self.flat_pdf, 'rb') as pdf_fh:
                extract_text_to_fp(pdf_fh, self.output_io, laparams=la,
                                   output_type='html', codec=None)
    html = BeautifulSoup(self.output_io.getvalue(), 'html.parser')
    results = html.find_all(
        lambda tag: tag.name == "div"
        and ' '.join(text.replace('\n', '').split()[:3]) in tag.text.replace('\n', ''))
    # The original's nested if/else returned None in both failure branches.
    if not results or 'bold' not in str(results[-1]).lower():
        return None
    for span in results[-1]:
        if 'bold' in span['style'].lower():
            text_out.append(f'<b>{span.text}</b>')
        else:
            text_out.append(span.text)
    return ' '.join(text_out)
def attribute(input_pdf, pages, text):
    """Return the last div on page *pages* that fuzzy-matches *text*, with
    bold spans wrapped in <b> tags.

    Returns None when a match is found but contains no bold styling, and
    (implicitly) None when no div matches at all.

    :param input_pdf: path to the PDF file
    :param pages: 1-based page number (converted to pdfminer's 0-based index)
    :param text: text to fuzzy-match (ratio > 70) against div contents
    """
    text_out = []
    output_io = io.StringIO()
    # NOTE(review): 'input' shadows the builtin of the same name.
    with open(input_pdf, 'rb') as input:
        extract_text_to_fp(input, output_io, page_numbers=[int(pages) - 1], laparams=LAParams(line_margin=0.18, line_overlap=0.4, all_texts=False), output_type='html', codec=None)
    html = BeautifulSoup(output_io.getvalue(), 'html.parser')
    # NOTE(review): '/n' looks like a typo for the newline '\n' — as written
    # it removes literal "/n" sequences; confirm before changing.
    results = html.find_all(lambda tag: tag.name == "div" and fuzz.ratio(
        text.lower(), tag.text.lower().replace('/n', '')) > 70)
    # print(html)
    if results:
        if 'bold' in str(results[-1]).lower():
            for span in results[-1]:
                if 'bold' in span['style'].lower():
                    # Keep only the first line of the span's text
                    new_text = span.text.split('\n')
                    text_out.append(f'<b>{new_text[0]}</b>')
                if 'bold' not in span['style'].lower():
                    # print('yes')
                    new_text = span.text.split('\n')
                    text_out.append(new_text[0])
            # print(' '.join(text_out))
            return ' '.join(text_out)
        else:
            return None
def Cognitive_PDF(PATH_PDF):
    """Return the plain text content of the PDF at PATH_PDF."""
    from io import StringIO
    from pdfminer.high_level import extract_text_to_fp

    collected = StringIO()
    with open(PATH_PDF, 'rb') as source:
        extract_text_to_fp(source, collected)
    return collected.getvalue()
def extracttextfp(i):
    """Extract the text of PDF *i* and write it to MomentText/Round 2/file.txt.

    :param i: path to the input PDF file
    """
    output = StringIO()
    # with-blocks guarantee both handles are closed even on error
    # (the original never closed the input file).
    with open(i, 'rb') as fr:
        extract_text_to_fp(fr, output, output_type='text', laparams=LAParams())
    with open('MomentText/Round 2/file.txt', 'w', encoding='utf-8') as fw:
        fw.write(output.getvalue())
    return
def parse_file(self):
    """Extract the PDF at self.filepath as HTML and store the parsed
    element tree on self.tree."""
    html_buffer = StringIO()
    with open(self.filepath, 'rb') as pdf_file:
        extract_text_to_fp(pdf_file, html_buffer, laparams=LAParams(),
                           output_type='html', codec=None)
    self.tree = etree.parse(StringIO(html_buffer.getvalue()), etree.HTMLParser())
def read(self, path, html=False):
    """Return the PDF at *path* as HTML markup when *html* is true,
    otherwise as plain text."""
    if not html:
        return extract_text(path)
    markup = StringIO()
    with open(path, "rb") as pdf_file:
        extract_text_to_fp(pdf_file, markup, laparams=LAParams(),
                           output_type="html", codec=None)
    return markup.getvalue()
def _parse_agenda(self, response):
    """Record the agenda link under the meeting date found in the PDF."""
    text_buf = StringIO()
    extract_text_to_fp(BytesIO(response.body), text_buf,
                       laparams=LAParams(line_margin=5.0))
    found = re.search(r"[A-Z][a-z]{2,8} \d{1,2},? \d{4}", text_buf.getvalue())
    if not found:
        return
    meeting_date = datetime.strptime(
        found.group().replace(",", ""), "%B %d %Y").date()
    self.agenda_map[meeting_date] = [{"title": "Agenda", "href": response.url}]
def extract_pdf_pdfminer_format_without_output(pdf_path):
    """Return the PDF's content rendered as HTML markup."""
    html_sink = StringIO()
    with open(pdf_path, 'rb') as pdf_file:
        extract_text_to_fp(pdf_file, html_sink, laparams=LAParams(),
                           output_type='html', codec=None)
    return html_sink.getvalue()
def convert_pdf_to_xml(path):
    '''get all pdf data as xml file format'''
    xml_sink = StringIO()
    with open(path, 'rb') as pdf_file:
        extract_text_to_fp(pdf_file, xml_sink, laparams=LAParams(),
                           output_type='xml', codec=None)
    return xml_sink.getvalue()
def text_pdf_fp(self, document: str, n_page: int = None, max_pages: int = 0) -> str:
    """Extract text from *document* and return it cleaned via self.clean_text.

    :param document: path of the PDF file to read
    :param n_page: forwarded as pdfminer's ``page_numbers`` argument, which
        expects a container of zero-based page indices — the ``int``
        annotation looks wrong (an optional collection was probably meant);
        TODO confirm with callers before changing it
    :param max_pages: maximum number of pages to process (0 = no limit)
    :return: the cleaned extracted text
    """
    output_string = StringIO()
    with open(document, 'rb') as f:
        extract_text_to_fp(f, output_string, maxpages=max_pages, page_numbers=n_page)
    return self.clean_text(output_string.getvalue())
def parse_pdf(self):
    """
    The meat of the SportingCode instance. This attempts to take the
    downloaded PDF contents and convert it into Python objects that can be
    more easily parsed, read, and manipulated.

    Idempotent: returns immediately when self.parsed is already set.
    """
    if self.parsed:
        return
    # Download the PDF bytes into the raw_content buffer
    self.raw_content.write(urlopen(self.url).read())
    out_io = StringIO()
    extract_text_to_fp(
        self.raw_content,
        out_io,
        laparams=LAParams(),
        output_type='text',
        strip_control=True,
        codec=None
    )
    # Saved directly as it was parsed by pdfminer.six
    # NOTE(review): the replace() below appears to have lost a control
    # character during a copy/paste (as written, .replace("", "") is a
    # no-op) — confirm against the original source.
    self.raw_parsed_content = out_io.getvalue().strip().replace("", "")
    # We then run the content through a bunch of custom filters that will
    # massage the PDF contents into something more friendly to parse
    page_splitter = '\n\n'+self.PAGE_BREAK_INDICATOR
    # 1. Since each page has the same footer, we can use that to replace
    # the hard to read page breaks with something that says `<! -- PAGE BREAK -->`
    self.parsed_content = re.sub(
        r'[\n ]*Version - 2018.09[\n ]*\d+[\n ]*',
        page_splitter,
        self.raw_parsed_content
    )
    # 2. Remove the first couple pages (title and table of contents) since we
    # don't really care to parse these
    self.parsed_content = self.parsed_content.split(
        page_splitter, self.start_page_parsing_at-1
    )[self.start_page_parsing_at-1]
    # 3. Iterate through the sporting code and if the line contains a section ID or index
    # then we will create a new section, or else we will keep appending to the current.
    self.parse_content_into_sections()
    # 4. Try to build a section hierarchy, meaning 1.1. is a child of 1.
    # This will allow us to easily grab all sections including children
    self.build_section_hierarchy()
    # Uncomment this if you want to write the sporting code to a file to check it
    # with open('sporting_code.md', 'w') as f:
    #     f.write(self.markdown())
    self.parsed = True
def extracttexthtml(i):
    """Extract PDF *i* as HTML and write the markup to K3407623935.html.

    :param i: path to the input PDF file
    """
    output = StringIO()
    # with-blocks guarantee both handles are closed even on error
    # (the original never closed the input file).
    with open(i, 'rb') as fr:
        extract_text_to_fp(fr, output, output_type='html',
                           laparams=LAParams(), codec=None)
    with open('K3407623935.html', 'w', encoding='utf-8') as fw:
        fw.write(output.getvalue())
    return
def extract_text_from_pdf_bio(pdf_fo: BinaryIO) -> str:
    """
    Extracts text from a PDF

    :param pdf_fo: a byte file object representing a PDF file
    :return: extracted text
    :raises pdfminer.pdftypes.PDFException: on invalid PDF
    """
    # Local renamed from 'layout' so it no longer shadows the pdfminer
    # module name used elsewhere in this file.
    text_sink = StringIO()
    la_params = LAParams(all_texts=True)
    extract_text_to_fp(pdf_fo, text_sink, laparams=la_params)
    return text_sink.getvalue()
def _parse_pdf(self, response):
    """Append an agenda/minutes link under the meeting date found in the PDF."""
    text_buf = StringIO()
    extract_text_to_fp(BytesIO(response.body), text_buf,
                       laparams=LAParams(line_margin=5.0))
    normalized = re.sub(r"\s+", " ", text_buf.getvalue()).strip()
    found = re.search(r"[A-Z][a-z]{2,8} \d{1,2},? \d{4}", normalized)
    if not found:
        return
    meeting_date = datetime.strptime(
        found.group().replace(",", ""), "%B %d %Y").date()
    link_title = "Agenda" if "agenda" in response.url.lower() else "Minutes"
    self.link_date_map[meeting_date].append({
        "title": link_title,
        "href": response.url,
    })
def _parse_pdf(self, response):
    """Parse data from PDF file of schedule

    Splits the PDF text on "Month D" headings, groups trailing text with
    each heading, and yields one or two Meetings per group (a public
    hearing plus a board meeting when "Hearing" appears in the group).
    """
    lp = LAParams(line_margin=5.0)
    out_str = StringIO()
    extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
    pdf_text = out_str.getvalue()
    # NOTE(review): inside a character class `$` is a literal dollar sign,
    # not an anchor — presumably an end-of-string anchor was intended; confirm.
    split_dates = re.split(r"([A-Z][a-z]{2,8}\s+\d{1,2}[ \n$])", pdf_text)
    # Text before the first date heading holds the location/year description
    desc_str = split_dates[0]
    self._validate_location(desc_str)
    # Assumes at least one date matched (split_dates[1] raises IndexError otherwise)
    date_groups = [split_dates[1]]
    for split_line in split_dates[2:]:
        # Captured date separators start a new group; other text is appended
        # to the most recent group.
        if re.search(r"([A-Z][a-z]{2,8}\s+\d{1,2}[ \n$])", split_line):
            date_groups.append(split_line)
        else:
            date_groups[-1] = date_groups[-1] + split_line
    year_str = re.search(r"\d{4}", desc_str).group()
    for date_group in date_groups:
        item = date_group.strip()
        # NOTE(review): this requires a 2-digit day (\d{2}) while the split
        # above accepted 1-2 digits; a single-digit day would make search()
        # return None and raise AttributeError — confirm against real PDFs.
        date_str = re.search(r"^[A-Z][a-z]{2,8} \d{2}", item).group()
        if "Hearing" in item:
            # Hearing groups list two times: the hearing first, the board
            # meeting second (positional assumption on time_strs).
            time_strs = [
                t[0] for t in re.findall(r"(\d{1,2}(:\d{2})? [APM]{2})", item)
            ]
            details = [
                ("Public Hearing", time_strs[0].lower()),
                ("Board", time_strs[1].lower()),
            ]
        else:
            # Default board meeting time when no explicit time is present
            details = [("Board", "5:30 pm")]
        for title, start_str in details:
            meeting = Meeting(
                title=title,
                description="",
                classification=self._parse_classification(title),
                start=self._parse_start(date_str, start_str, year_str),
                end=None,
                all_day=False,
                time_notes="",
                location=self.location,
                links=[],
                source=response.url,
            )
            meeting["status"] = self._get_status(meeting, text=item)
            meeting["id"] = self._get_id(meeting)
            yield meeting
def _parse_schedule_pdf(self, response):
    """Parse dates and details from schedule PDF"""
    raw_buf = StringIO()
    extract_text_to_fp(BytesIO(response.body), raw_buf,
                       laparams=LAParams(line_margin=0.1))
    joined_text = raw_buf.getvalue().replace("\n", "")
    # Remove duplicate characters not followed by lowercase (as in 5:00pm)
    cleaned = re.sub(r"([A-Z0-9:])\1(?![a-z])", r"\1", joined_text, flags=re.M)
    # Remove duplicate spaces
    cleaned = re.sub(r"\s+", " ", cleaned)
    year_str = re.search(r"\d{4}", cleaned).group()
    self._validate_location(cleaned)
    for date_str in re.findall(r"[A-Z]{3,10}\s+\d{1,2}(?!\d)", cleaned):
        self.meeting_starts.append(self._parse_start(date_str, year_str))
def parse(self, fname):
    """Extract the text of the PDF at *fname* and return it with CID
    artifacts replaced.

    Assumes the input file [fname] is small enough to read in its entirety
    into memory. This should be fixed to use a temporary file otherwise.

    :param fname: path to the PDF file
    :return: extracted text post-processed by PdfReader._replace_cids_
    :raises ReaderException: when extraction is forbidden or the PDF is malformed
    """
    outfp = io.StringIO()
    with open(fname, "rb") as fp:
        try:
            # The original passed **locals(), which handed extract_text_to_fp
            # a duplicate 'fp' plus stray 'self'/'fname' keywords and only
            # worked because the function swallows unknown **kwargs; pass the
            # two arguments it actually needs explicitly.
            high_level.extract_text_to_fp(fp, outfp)
        except pdfdocument.PDFTextExtractionNotAllowed as e:
            raise ReaderException(e)
        except pdfparser.PDFSyntaxError as e:
            raise ReaderException(e)
    outfp.seek(0)
    contents = outfp.read()
    return PdfReader._replace_cids_(contents)
def download_pdf_url(url):
    """Download the PDF at *url* to temp.pdf and return its content as HTML.

    Returns whatever was extracted (possibly empty) even when the PDF
    cannot be parsed.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    }
    # Renamed from 'html': the response body here is raw PDF bytes.
    pdf_bytes = requests.get(url, headers=headers, timeout=10).content
    with open('temp.pdf', 'wb') as f:
        f.write(pdf_bytes)
    output_string = StringIO()
    with open('temp.pdf', 'rb') as f:
        try:
            extract_text_to_fp(f, output_string, laparams=LAParams(),
                               output_type='html', codec=None)
        except PDFSyntaxError:
            print('Could not read this pdf')
    return output_string.getvalue().strip()
def pdf_text(pdf_data: bytes) -> str:
    """Return the plain text extracted from raw PDF bytes."""
    sink = StringIO()
    high_level.extract_text_to_fp(
        BytesIO(pdf_data), sink, laparams=pdfminer.layout.LAParams())
    return sink.getvalue()