def mine_text(pdf_path):
    """Extract a PDF's content as HTML, save a copy to temp.html, and return
    the HTML parsed as a BeautifulSoup tree.

    :param pdf_path: path to the PDF file to mine
    :return: BeautifulSoup object built from the extracted HTML
    """
    # pdfminer.six is Python-3-only, so the old Python-2 BytesIO fallback
    # (and the sys.version_info check) is no longer needed.
    from io import StringIO
    from pdfminer.layout import LAParams

    output_string = StringIO()
    with open(pdf_path, 'rb') as fin:
        extract_text_to_fp(fin, output_string, laparams=LAParams(),
                           output_type='html', codec=None)
    str_html = output_string.getvalue().strip()

    # Persist a debugging copy; set the encoding explicitly so non-ASCII
    # characters in the PDF text survive the round trip.
    with open('temp.html', 'w', encoding='utf-8') as fh:
        fh.write(str_html)

    soup = BeautifulSoup(str_html, "lxml")
    # The original built the soup but never returned it; returning it makes
    # the function usable while leaving callers that ignored the (None)
    # return value unaffected.
    return soup
def _parse_calendar(self, response):
    """Parse dates and details from schedule PDF"""
    text_buf = StringIO()
    extract_text_to_fp(BytesIO(response.body), text_buf,
                       laparams=LAParams(line_margin=0.1))
    # Collapse whitespace and re-attach commas that got separated by layout
    flattened = re.sub(r"\s+", " ", text_buf.getvalue()).replace(" ,", ",")

    date_matches = re.findall(r"[a-zA-Z]{3,10} \d{1,2}, \d{4}", flattened)
    for idx, date_str in enumerate(date_matches):
        # Only every other date in the PDF is a meeting date; skip odd indices
        if idx % 2:
            continue
        meeting = Meeting(
            title="Urban Design and Historic Preservation Commission",
            description="",
            classification=COMMISSION,
            start=self._parse_start(date_str),
            end=None,
            all_day=False,
            time_notes="Confirm details with agency",
            location=self.location,
            links=[],
            source=self.start_urls[0],
        )
        meeting["status"] = self._get_status(meeting)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def _parse_calendar(self, response):
    """Parse Board of Trustees meetings from a calendar PDF.

    Splits the extracted text on "Month D, YYYY" headings, groups the text
    following each heading with that date, and yields one Meeting per group.
    """
    lp = LAParams(line_margin=5.0)
    out_str = StringIO()
    extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
    pdf_text = out_str.getvalue()
    # NOTE(review): inside a character class `$` is a literal dollar sign,
    # not an end-of-string anchor — presumably an anchor was intended; confirm
    # against sample PDFs before changing.
    split_dates = re.split(r"([A-Z][a-z]{2,8}\s+\d{1,2}, \d{4}[ \n$])", pdf_text, flags=re.M)
    # Seed with the first captured date; assumes at least one date matched
    # (split_dates[1] raises IndexError otherwise — TODO confirm upstream
    # guarantees a date is always present).
    date_groups = [split_dates[1]]
    for split_str in split_dates[2:]:
        # Captured date separators start a new group; everything else is
        # appended to the most recent group's text.
        if re.search(r"([A-Z][a-z]{2,8}\s+\d{1,2}, \d{4}[ \n$])", split_str):
            date_groups.append(split_str)
        else:
            date_groups[-1] = date_groups[-1] + split_str
    for date_item_str in date_groups:
        # Collapse runs of spaces before parsing the group
        item = re.sub(r" +", " ", date_item_str).strip()
        start = self._parse_start(item)
        if not start:
            continue
        meeting = Meeting(
            title="Board of Trustees",
            description="",
            classification=BOARD,
            start=start,
            end=None,
            all_day=False,
            time_notes="",
            location=self._parse_location(item),
            links=self.agenda_map.get(start.date(), []),
            source=response.url,
        )
        meeting["status"] = self._get_status(meeting, text=item)
        meeting["id"] = self._get_id(meeting)
        yield meeting
def check_first_page_is_cover(pdf: bytes) -> bool:
    """Reads pdf and returns True if its first page looks like a cover page.

    A page with at most 100 whitespace-separated tokens is treated as a cover.

    :param pdf: raw bytes of a PDF document
    :return: True if the first page contains 100 words or fewer
    """
    with io.StringIO() as test_string:
        params = layout.LAParams(line_margin=2)
        # extract_text_to_fp needs a seekable binary file object, not raw
        # bytes, so wrap the payload in BytesIO before handing it over.
        extract_text_to_fp(io.BytesIO(pdf), test_string, page_numbers=[0], laparams=params)
        first_page = test_string.getvalue()
        return len(first_page.split()) <= 100
def get_pdf_text(update, context, is_file):
    """Extract text from the user's stored PDF and send it back as a message
    or as a .txt file.

    :param update: Telegram update that triggered this handler
    :param context: callback context holding user_data and the bot instance
    :param is_file: whether to send the result as a text file
    :return: ConversationHandler.END to finish the conversation
    """
    if not check_user_data(update, context, PDF_INFO):
        return ConversationHandler.END

    _ = set_lang(update, context)
    update.effective_message.reply_text(
        _("Extracting text from your PDF file"), reply_markup=ReplyKeyboardRemove()
    )

    with tempfile.NamedTemporaryFile() as tf:
        user_data = context.user_data
        file_id, file_name = user_data[PDF_INFO]
        pdf_file = context.bot.get_file(file_id)
        pdf_file.download(custom_path=tf.name)

        with tempfile.TemporaryDirectory() as dir_name:
            # Context manager guarantees the temp file is closed even if
            # extraction fails (the original leaked the handle).
            with tempfile.TemporaryFile() as tmp_text:
                with open(tf.name, "rb") as f:
                    extract_text_to_fp(f, tmp_text)
                tmp_text.seek(0)
                pdf_texts = textwrap.wrap(tmp_text.read().decode("utf-8").strip())
            out_fn = os.path.join(dir_name, f"{os.path.splitext(file_name)[0]}.txt")
            send_pdf_text(update, context, pdf_texts, is_file, out_fn)

    # Clean up memory
    if user_data[PDF_INFO] == file_id:
        del user_data[PDF_INFO]

    return ConversationHandler.END
def _parse_notice(self, response):
    """Parse a meeting from the notice PDF's embedded text when present,
    otherwise fall back to the text stashed in response.meta."""
    notice_buf = StringIO()
    extract_text_to_fp(BytesIO(response.body), notice_buf,
                       laparams=LAParams(line_margin=0.1))
    notice_text = notice_buf.getvalue()

    if not notice_text.strip():
        # No embedded text in the PDF — use the meta fallback
        yield self._parse_meeting_text(response.meta["meeting_text"], response.url)
        return

    found_date = re.search(r"[A-Z][a-z]{2,8} \d{1,2},? \d{4}",
                           response.meta["meeting_text"])
    if found_date:
        parsed_date = datetime.strptime(
            found_date.group().replace(",", ""), "%B %d %Y").date()
        existing_titles = [
            link["title"] for link in self.link_date_map[parsed_date]
        ]
        # Register the notice link once per meeting date
        if "Notice" not in existing_titles:
            self.link_date_map[parsed_date].append({
                "title": "Notice",
                "href": response.url
            })
    yield self._parse_meeting_text(re.sub(r"\s+", " ", notice_text),
                                   response.meta["source"])
def pdf_to_text(html=False):
    """Return a dictionary mapping each PDF's base name to its content.

    :param html: when True, extract each PDF as HTML markup; otherwise
        extract plain text with newlines converted to <br /> tags
    :return: dict of {base filename without extension: extracted content}
    """
    pdfs = get_pdfs()
    rename_files()
    all_tasks = {}
    for pdf in pdfs:
        # str.strip(".pdf") removes ANY of the characters '.', 'p', 'd', 'f'
        # from both ends (e.g. "pdfdoc.pdf" -> "oc"), so trim the extension
        # explicitly instead.
        name = pdf[:-len(".pdf")] if pdf.endswith(".pdf") else pdf
        pdf_path = Path(PDF_FOLDER) / Path(name + ".pdf")
        if html:
            output_string = StringIO()
            with open(pdf_path, "rb") as fin:
                extract_text_to_fp(
                    fin,
                    output_string,
                    output_type="html",
                    codec=None,
                )
            all_tasks[name] = output_string.getvalue()
        else:
            text = extract_text(pdf_path)
            # Replace \n to html linebreaks:
            text = text.replace("\n", "<br />\n").strip()
            all_tasks[name] = text
    return all_tasks
def convert_pdf_to_text(path):
    """Return the plain text extracted from the PDF at *path*."""
    text_sink = StringIO()
    with open(path, "rb") as pdf_file:
        extract_text_to_fp(pdf_file, text_sink)
    return text_sink.getvalue()
def compare(file1, file2, **kwargs):
    """Extract both PDFs to text with identical settings and diff the results.

    If no pre-built LAParams object was passed, build one from whichever
    individual layout keyword arguments were supplied.

    :param file1: path of the first PDF
    :param file2: path of the second PDF
    :param kwargs: extraction options forwarded to extract_text_to_fp;
        'outfile' (with a .htm* extension) selects HTML diff output and
        'context_lines' sets the unified-diff context size
    :return: an HTML diff string when outfile ends in .htm*, otherwise a
        unified-diff line generator
    :raises KeyError: when 'context_lines' is missing and a unified diff
        is produced
    """
    import difflib
    import os.path

    if kwargs.get('laparams', None) is None:
        laparams = layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            paramv = kwargs.get(param, None)
            if paramv is not None:
                # LAParams does not support item assignment
                # (laparams[param] = ... raised TypeError); set the attribute.
                setattr(laparams, param, paramv)
        kwargs['laparams'] = laparams

    s1 = io.StringIO()
    with open(file1, "rb") as fp:
        high_level.extract_text_to_fp(fp, s1, **kwargs)
    s2 = io.StringIO()
    with open(file2, "rb") as fp:
        high_level.extract_text_to_fp(fp, s2, **kwargs)

    s1.seek(0)
    s2.seek(0)
    lines1, lines2 = s1.readlines(), s2.readlines()

    try:
        extension = os.path.splitext(kwargs['outfile'])[1][1:4]
        if extension.lower() == 'htm':
            return difflib.HtmlDiff().make_file(lines1, lines2)
    except KeyError:
        # No outfile requested: fall through to a unified diff.
        pass
    return difflib.unified_diff(lines1, lines2, n=kwargs['context_lines'])
def get_decision_citation_item(source: PDPCDecisionItem, options: Options = None) -> (str, str):
    """
    Gets the citation and case number for a PDPCDecisionItem.

    :param source: The PDPCDecisionItem to get the citation and case number.
    :param options: Options affecting how the decision PDF is retrieved.
    :return: A tuple consisting of (citation, case_number); either element
        is an empty string when not found.
    """
    from pdfminer.high_level import extract_text_to_fp
    import io
    import re
    citation = ''
    case_number = ''
    if check_pdf(source.download_url):
        with PDFFile(source, options) as pdf, io.StringIO() as output_string:
            # Only the first two pages are needed to locate the citation
            # and case number.
            extract_text_to_fp(pdf, output_string, page_numbers=[0, 1])
            contents = output_string.getvalue()
            summary_match = re.search(r'SUMMARY OF THE DECISION', contents)
            if not summary_match:
                citation_match = re.search(
                    r'(\[\d{4}])\s+((?:\d\s+)?[A-Z|()]+)\s+\[?(\d+)\]?', contents)
                if citation_match:
                    citation = citation_match.expand(r'\1 \2 \3')
                else:
                    logger.warning(f'No citation found for {source}')
            else:
                # Fixed duplicated word in the original log message ("a a").
                logger.info(f'Decision <{source}> is a summary and does not have a citation.')
            case_match = re.search(r'DP-\s*(\w*)-\s*(\w*)', contents)
            if case_match:
                case_number = case_match.expand(r'DP-\1-\2')
            else:
                logger.warning(f'No case number found for {source}')
    return citation, case_number
def get_contents_with_attributes(path):
    """Extract the PDF at *path* as HTML and return a list of div texts,
    with bold spans wrapped in <b> tags.

    :param path: path to the PDF file
    :return: list of strings, one per non-empty div
    """
    # Close the buffer automatically instead of the original manual close().
    with io.StringIO() as output_io:
        with open(path, 'rb') as pdf_file:  # renamed: 'input' shadowed the builtin
            extract_text_to_fp(pdf_file, output_io,
                               laparams=LAParams(line_margin=0.21, line_overlap=0.4,
                                                 all_texts=False),
                               output_type='html', codec=None)
        html = BeautifulSoup(output_io.getvalue(), 'html.parser')

    final_content = []
    for div in html.find_all("div"):
        temp_div = []
        for span in div.find_all("span"):
            span_text = span.text.strip()
            if not span_text:
                continue
            # The original tested 'bold' twice; a plain if/else is equivalent.
            if 'bold' in span['style'].lower():
                temp_div.append(f'<b>{span_text}</b>')
            else:
                temp_div.append(span_text)
        if temp_div:
            final_content.append(" ".join(temp_div))
    return final_content
def convert_file(filepath: Path) -> None:
    """Extract the text of *filepath* and write it beside the PDF as a .txt file."""
    target = filepath.with_suffix(".txt")
    text_buffer = StringIO()
    with filepath.open("rb") as pdf_file:
        extract_text_to_fp(pdf_file, outfp=text_buffer,
                           laparams=LAParams(), output_type="text")
    with target.open("w") as txt_file:
        txt_file.write(text_buffer.getvalue().strip())
def attribute_checking(self, input_pdf, text, encoding):
    """Return the last div matching *text*'s first three words, with bold
    spans wrapped in <b> tags, or None when no such bold div exists.

    The extracted HTML is cached in self.output_io so the PDF is only
    converted once per instance.

    :param input_pdf: original PDF path; a leading backslash selects
        self.input_file as the extraction source instead of self.flat_pdf
    :param text: text whose first three words identify the target div
    :param encoding: unused here — kept for interface compatibility
    :return: space-joined div text with <b> markup, or None
    """
    text_out = []
    la = LAParams(line_margin=0.18, line_overlap=0.4, all_texts=False)
    if input_pdf.startswith('\\'):
        if not self.output_io.getvalue():
            extract_text_to_fp(self.input_file, self.output_io, laparams=la,
                               output_type='html', codec=None)
    else:
        if not self.output_io.getvalue():
            # renamed: 'input' shadowed the builtin
            with open(self.flat_pdf, 'rb') as pdf_fh:
                extract_text_to_fp(pdf_fh, self.output_io, laparams=la,
                                   output_type='html', codec=None)
    html = BeautifulSoup(self.output_io.getvalue(), 'html.parser')
    results = html.find_all(
        lambda tag: tag.name == "div"
        and ' '.join(text.replace('\n', '').split()[:3]) in tag.text.replace('\n', ''))
    # The original's nested if/else returned None in both failure branches.
    if not results or 'bold' not in str(results[-1]).lower():
        return None
    for span in results[-1]:
        if 'bold' in span['style'].lower():
            text_out.append(f'<b>{span.text}</b>')
        else:
            text_out.append(span.text)
    return ' '.join(text_out)
def attribute(input_pdf, pages, text):
    """Return the last div on page *pages* that fuzzy-matches *text*, with
    bold spans wrapped in <b> tags.

    Returns None when a match is found but contains no bold styling, and
    (implicitly) None when no div matches at all.

    :param input_pdf: path to the PDF file
    :param pages: 1-based page number (converted to pdfminer's 0-based index)
    :param text: text to fuzzy-match (ratio > 70) against div contents
    """
    text_out = []
    output_io = io.StringIO()
    # NOTE(review): 'input' shadows the builtin of the same name.
    with open(input_pdf, 'rb') as input:
        extract_text_to_fp(input, output_io, page_numbers=[int(pages) - 1], laparams=LAParams(line_margin=0.18, line_overlap=0.4, all_texts=False), output_type='html', codec=None)
    html = BeautifulSoup(output_io.getvalue(), 'html.parser')
    # NOTE(review): '/n' looks like a typo for the newline '\n' — as written
    # it removes literal "/n" sequences; confirm before changing.
    results = html.find_all(lambda tag: tag.name == "div" and fuzz.ratio(
        text.lower(), tag.text.lower().replace('/n', '')) > 70)
    # print(html)
    if results:
        if 'bold' in str(results[-1]).lower():
            for span in results[-1]:
                if 'bold' in span['style'].lower():
                    # Keep only the first line of the span's text
                    new_text = span.text.split('\n')
                    text_out.append(f'<b>{new_text[0]}</b>')
                if 'bold' not in span['style'].lower():
                    # print('yes')
                    new_text = span.text.split('\n')
                    text_out.append(new_text[0])
            # print(' '.join(text_out))
            return ' '.join(text_out)
        else:
            return None
def Cognitive_PDF(PATH_PDF):
    """Return the plain text content of the PDF at PATH_PDF."""
    from io import StringIO
    from pdfminer.high_level import extract_text_to_fp

    collected = StringIO()
    with open(PATH_PDF, 'rb') as source:
        extract_text_to_fp(source, collected)
    return collected.getvalue()
def extracttextfp(i):
    """Extract the text of PDF *i* and write it to MomentText/Round 2/file.txt.

    :param i: path to the input PDF file
    """
    output = StringIO()
    # with-blocks guarantee both handles are closed even on error
    # (the original never closed the input file).
    with open(i, 'rb') as fr:
        extract_text_to_fp(fr, output, output_type='text', laparams=LAParams())
    with open('MomentText/Round 2/file.txt', 'w', encoding='utf-8') as fw:
        fw.write(output.getvalue())
    return
def parse_file(self):
    """Extract the PDF at self.filepath as HTML and store the parsed
    element tree on self.tree."""
    html_buffer = StringIO()
    with open(self.filepath, 'rb') as pdf_file:
        extract_text_to_fp(pdf_file, html_buffer, laparams=LAParams(),
                           output_type='html', codec=None)
    self.tree = etree.parse(StringIO(html_buffer.getvalue()), etree.HTMLParser())
def read(self, path, html=False):
    """Return the PDF at *path* as HTML markup when *html* is true,
    otherwise as plain text."""
    if not html:
        return extract_text(path)
    markup = StringIO()
    with open(path, "rb") as pdf_file:
        extract_text_to_fp(pdf_file, markup, laparams=LAParams(),
                           output_type="html", codec=None)
    return markup.getvalue()
def _parse_agenda(self, response):
    """Record the agenda link under the meeting date found in the PDF."""
    text_buf = StringIO()
    extract_text_to_fp(BytesIO(response.body), text_buf,
                       laparams=LAParams(line_margin=5.0))
    found = re.search(r"[A-Z][a-z]{2,8} \d{1,2},? \d{4}", text_buf.getvalue())
    if not found:
        return
    meeting_date = datetime.strptime(
        found.group().replace(",", ""), "%B %d %Y").date()
    self.agenda_map[meeting_date] = [{"title": "Agenda", "href": response.url}]
def extract_pdf_pdfminer_format_without_output(pdf_path):
    """Return the PDF's content rendered as HTML markup."""
    html_sink = StringIO()
    with open(pdf_path, 'rb') as pdf_file:
        extract_text_to_fp(pdf_file, html_sink, laparams=LAParams(),
                           output_type='html', codec=None)
    return html_sink.getvalue()
def convert_pdf_to_xml(path):
    '''get all pdf data as xml file format'''
    xml_sink = StringIO()
    with open(path, 'rb') as pdf_file:
        extract_text_to_fp(pdf_file, xml_sink, laparams=LAParams(),
                           output_type='xml', codec=None)
    return xml_sink.getvalue()
def text_pdf_fp(self, document: str, n_page: int = None, max_pages: int = 0) -> str:
    """Extract text from *document* and return it cleaned via self.clean_text.

    :param document: path of the PDF file to read
    :param n_page: forwarded as pdfminer's ``page_numbers`` argument, which
        expects a container of zero-based page indices — the ``int``
        annotation looks wrong (an optional collection was probably meant);
        TODO confirm with callers before changing it
    :param max_pages: maximum number of pages to process (0 = no limit)
    :return: the cleaned extracted text
    """
    output_string = StringIO()
    with open(document, 'rb') as f:
        extract_text_to_fp(f, output_string, maxpages=max_pages, page_numbers=n_page)
    return self.clean_text(output_string.getvalue())
def parse_pdf(self):
    """
    The meat of the SportingCode instance. This attempts to take the
    downloaded PDF contents and convert it into Python objects that can be
    more easily parsed, read, and manipulated.

    Idempotent: returns immediately when self.parsed is already set.
    """
    if self.parsed:
        return
    # Download the PDF bytes into the raw_content buffer
    self.raw_content.write(urlopen(self.url).read())
    out_io = StringIO()
    extract_text_to_fp(
        self.raw_content,
        out_io,
        laparams=LAParams(),
        output_type='text',
        strip_control=True,
        codec=None
    )
    # Saved directly as it was parsed by pdfminer.six
    # NOTE(review): the replace() below appears to have lost a control
    # character during a copy/paste (as written, .replace("", "") is a
    # no-op) — confirm against the original source.
    self.raw_parsed_content = out_io.getvalue().strip().replace("", "")
    # We then run the content through a bunch of custom filters that will
    # massage the PDF contents into something more friendly to parse
    page_splitter = '\n\n'+self.PAGE_BREAK_INDICATOR
    # 1. Since each page has the same footer, we can use that to replace
    # the hard to read page breaks with something that says `<! -- PAGE BREAK -->`
    self.parsed_content = re.sub(
        r'[\n ]*Version - 2018.09[\n ]*\d+[\n ]*',
        page_splitter,
        self.raw_parsed_content
    )
    # 2. Remove the first couple pages (title and table of contents) since we
    # don't really care to parse these
    self.parsed_content = self.parsed_content.split(
        page_splitter, self.start_page_parsing_at-1
    )[self.start_page_parsing_at-1]
    # 3. Iterate through the sporting code and if the line contains a section ID or index
    # then we will create a new section, or else we will keep appending to the current.
    self.parse_content_into_sections()
    # 4. Try to build a section hierarchy, meaning 1.1. is a child of 1.
    # This will allow us to easily grab all sections including children
    self.build_section_hierarchy()
    # Uncomment this if you want to write the sporting code to a file to check it
    # with open('sporting_code.md', 'w') as f:
    #     f.write(self.markdown())
    self.parsed = True
def extracttexthtml(i):
    """Extract PDF *i* as HTML and write the markup to K3407623935.html.

    :param i: path to the input PDF file
    """
    output = StringIO()
    # with-blocks guarantee both handles are closed even on error
    # (the original never closed the input file).
    with open(i, 'rb') as fr:
        extract_text_to_fp(fr, output, output_type='html',
                           laparams=LAParams(), codec=None)
    with open('K3407623935.html', 'w', encoding='utf-8') as fw:
        fw.write(output.getvalue())
    return
def extract_text_from_pdf_bio(pdf_fo: BinaryIO) -> str:
    """
    Extracts text from a PDF

    :param pdf_fo: a byte file object representing a PDF file
    :return: extracted text
    :raises pdfminer.pdftypes.PDFException: on invalid PDF
    """
    # Local renamed from 'layout' so it no longer shadows the pdfminer
    # module name used elsewhere in this file.
    text_sink = StringIO()
    la_params = LAParams(all_texts=True)
    extract_text_to_fp(pdf_fo, text_sink, laparams=la_params)
    return text_sink.getvalue()
def _parse_pdf(self, response):
    """Append an agenda/minutes link under the meeting date found in the PDF."""
    text_buf = StringIO()
    extract_text_to_fp(BytesIO(response.body), text_buf,
                       laparams=LAParams(line_margin=5.0))
    normalized = re.sub(r"\s+", " ", text_buf.getvalue()).strip()
    found = re.search(r"[A-Z][a-z]{2,8} \d{1,2},? \d{4}", normalized)
    if not found:
        return
    meeting_date = datetime.strptime(
        found.group().replace(",", ""), "%B %d %Y").date()
    link_title = "Agenda" if "agenda" in response.url.lower() else "Minutes"
    self.link_date_map[meeting_date].append({
        "title": link_title,
        "href": response.url,
    })
def _parse_pdf(self, response):
    """Parse data from PDF file of schedule

    Splits the PDF text on "Month D" headings, groups trailing text with
    each heading, and yields one or two Meetings per group (a public
    hearing plus a board meeting when "Hearing" appears in the group).
    """
    lp = LAParams(line_margin=5.0)
    out_str = StringIO()
    extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
    pdf_text = out_str.getvalue()
    # NOTE(review): inside a character class `$` is a literal dollar sign,
    # not an anchor — presumably an end-of-string anchor was intended; confirm.
    split_dates = re.split(r"([A-Z][a-z]{2,8}\s+\d{1,2}[ \n$])", pdf_text)
    # Text before the first date heading holds the location/year description
    desc_str = split_dates[0]
    self._validate_location(desc_str)
    # Assumes at least one date matched (split_dates[1] raises IndexError otherwise)
    date_groups = [split_dates[1]]
    for split_line in split_dates[2:]:
        # Captured date separators start a new group; other text is appended
        # to the most recent group.
        if re.search(r"([A-Z][a-z]{2,8}\s+\d{1,2}[ \n$])", split_line):
            date_groups.append(split_line)
        else:
            date_groups[-1] = date_groups[-1] + split_line
    year_str = re.search(r"\d{4}", desc_str).group()
    for date_group in date_groups:
        item = date_group.strip()
        # NOTE(review): this requires a 2-digit day (\d{2}) while the split
        # above accepted 1-2 digits; a single-digit day would make search()
        # return None and raise AttributeError — confirm against real PDFs.
        date_str = re.search(r"^[A-Z][a-z]{2,8} \d{2}", item).group()
        if "Hearing" in item:
            # Hearing groups list two times: the hearing first, the board
            # meeting second (positional assumption on time_strs).
            time_strs = [
                t[0] for t in re.findall(r"(\d{1,2}(:\d{2})? [APM]{2})", item)
            ]
            details = [
                ("Public Hearing", time_strs[0].lower()),
                ("Board", time_strs[1].lower()),
            ]
        else:
            # Default board meeting time when no explicit time is present
            details = [("Board", "5:30 pm")]
        for title, start_str in details:
            meeting = Meeting(
                title=title,
                description="",
                classification=self._parse_classification(title),
                start=self._parse_start(date_str, start_str, year_str),
                end=None,
                all_day=False,
                time_notes="",
                location=self.location,
                links=[],
                source=response.url,
            )
            meeting["status"] = self._get_status(meeting, text=item)
            meeting["id"] = self._get_id(meeting)
            yield meeting
def _parse_schedule_pdf(self, response):
    """Parse dates and details from schedule PDF"""
    raw_buf = StringIO()
    extract_text_to_fp(BytesIO(response.body), raw_buf,
                       laparams=LAParams(line_margin=0.1))
    joined_text = raw_buf.getvalue().replace("\n", "")
    # Remove duplicate characters not followed by lowercase (as in 5:00pm)
    cleaned = re.sub(r"([A-Z0-9:])\1(?![a-z])", r"\1", joined_text, flags=re.M)
    # Remove duplicate spaces
    cleaned = re.sub(r"\s+", " ", cleaned)
    year_str = re.search(r"\d{4}", cleaned).group()
    self._validate_location(cleaned)
    for date_str in re.findall(r"[A-Z]{3,10}\s+\d{1,2}(?!\d)", cleaned):
        self.meeting_starts.append(self._parse_start(date_str, year_str))
def parse(self, fname):
    """Extract the text of the PDF at *fname* and return it with CID
    artifacts replaced.

    Assumes the input file [fname] is small enough to read in its entirety
    into memory. This should be fixed to use a temporary file otherwise.

    :param fname: path to the PDF file
    :return: extracted text post-processed by PdfReader._replace_cids_
    :raises ReaderException: when extraction is forbidden or the PDF is malformed
    """
    outfp = io.StringIO()
    with open(fname, "rb") as fp:
        try:
            # The original passed **locals(), which handed extract_text_to_fp
            # a duplicate 'fp' plus stray 'self'/'fname' keywords and only
            # worked because the function swallows unknown **kwargs; pass the
            # two arguments it actually needs explicitly.
            high_level.extract_text_to_fp(fp, outfp)
        except pdfdocument.PDFTextExtractionNotAllowed as e:
            raise ReaderException(e)
        except pdfparser.PDFSyntaxError as e:
            raise ReaderException(e)
    outfp.seek(0)
    contents = outfp.read()
    return PdfReader._replace_cids_(contents)
def download_pdf_url(url):
    """Download the PDF at *url* to temp.pdf and return its content as HTML.

    Returns whatever was extracted (possibly empty) even when the PDF
    cannot be parsed.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    }
    # Renamed from 'html': the response body here is raw PDF bytes.
    pdf_bytes = requests.get(url, headers=headers, timeout=10).content
    with open('temp.pdf', 'wb') as f:
        f.write(pdf_bytes)
    output_string = StringIO()
    with open('temp.pdf', 'rb') as f:
        try:
            extract_text_to_fp(f, output_string, laparams=LAParams(),
                               output_type='html', codec=None)
        except PDFSyntaxError:
            print('Could not read this pdf')
    return output_string.getvalue().strip()
def pdf_text(pdf_data: bytes) -> str:
    """Return the plain text extracted from raw PDF bytes."""
    sink = StringIO()
    high_level.extract_text_to_fp(
        BytesIO(pdf_data), sink, laparams=pdfminer.layout.LAParams())
    return sink.getvalue()