Beispiel #1
0
def pdfminer_parse(file_path, file_hash):
    """Extract text and metadata from a PDF via pdfminer.

    :param file_path: path to the PDF file on disk.
    :param file_hash: identifier used only in the not-a-PDF log message.
    :return: dict with 'text' (full extracted text) and 'meta'
        (flattened document info), or None when the file is not a PDF.
    """
    output_string = StringIO()
    try:
        with open(file_path, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)

            laparams = LAParams()
            rsrcmgr = PDFResourceManager(caching=True)
            txt_device = TextConverter(rsrcmgr, output_string, codec='utf-8', laparams=laparams)
            aggr_device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            txt_interpreter = PDFPageInterpreter(rsrcmgr, txt_device)
            aggr_interpreter = PDFPageInterpreter(rsrcmgr, aggr_device)
            # (removed: a stray debug call that re-parsed a hard-coded
            # "test.pdf" on every invocation and discarded the result)
            for page in PDFPage.get_pages(fp, caching=True):
                txt_interpreter.process_page(page)
                aggr_interpreter.process_page(page)
                layout = aggr_device.get_result()
                # TODO Complete: walk `layout` for LTImage / LTFigure elements.
    except pdfparser.PDFSyntaxError:
        print(f"{file_hash} is not a PDF")
        return None

    return {
        'text': output_string.getvalue(),
        'meta': flatten_and_unicode(doc.info)
    }
Beispiel #2
0
def test_get_common_font_from_pages(get_test_decision_standard_pdf_path,
                                    get_test_decision_v1_pdf_path):
    """The dominant font of each fixture PDF matches the expected face."""
    # Open via context managers so the handles are closed (the originals
    # were leaked); the lazy extract_pages() generator is fully consumed
    # inside each `with` block.
    with get_test_decision_standard_pdf_path.open('rb') as pdf_1:
        pages_1 = list(extract_pages(pdf_1))
    assert common.get_common_font_from_pages(pages_1) == 'TimesNewRomanPSMT'
    with get_test_decision_v1_pdf_path.open('rb') as pdf_2:
        pages_2 = list(extract_pages(pdf_2))
    assert common.get_common_font_from_pages(pages_2) == 'Arial'
    def test_line_margin(self):
        """Line margin just below/above the 0.2 line gap splits/merges boxes."""

        def _text_elements(line_margin):
            # Run layout analysis with the given margin and return the
            # text containers of the (single) page.
            pages = list(
                extract_pages(self._get_test_file_path(),
                              laparams=LAParams(line_margin=line_margin)))
            self.assertEqual(len(pages), 1)
            return [el for el in pages[0] if isinstance(el, LTTextContainer)]

        # The lines have margin 0.2 relative to the height, so extracting
        # with line_margin 0.19 breaks them into 3 separate textboxes.
        separate = _text_elements(0.19)
        self.assertEqual(len(separate), 3)
        self.assertEqual(separate[0].get_text(), "Text1\n")
        self.assertEqual(separate[1].get_text(), "Text2\n")
        self.assertEqual(separate[2].get_text(), "Text3\n")

        # Extracting with line_margin 0.21 merges everything into one box.
        merged = _text_elements(0.21)
        self.assertEqual(len(merged), 1)
        self.assertEqual(merged[0].get_text(), "Text1\nText2\nText3\n")
Beispiel #4
0
def test_extract_text_containers(get_test_decision_standard_pdf_path):
    """Pages 1 and 2 of the standard decision yield 31 and 33 containers."""
    # Close the file handles (the originals were leaked); the generator is
    # consumed by extract_text_containers() before the handle closes.
    with get_test_decision_standard_pdf_path.open('rb') as pdf:
        page = extract_pages(pdf, page_numbers=[1])
        text_containers = common.extract_text_containers(page)
    assert len(text_containers) == 31
    with get_test_decision_standard_pdf_path.open('rb') as pdf:
        page = extract_pages(pdf, page_numbers=[2])
        text_containers = common.extract_text_containers(page)
    assert len(text_containers) == 33
Beispiel #5
0
    def load(
        cls, fn, page_numbers=None, preload=False, reader_kwargs=None,
        **kwargs
    ):
        """Load PDF file.

        Parameters
        ----------
        fn : str
            Filename of PDF.
        page_numbers : list
            Pages to be loaded. (0-indexed)
        preload : bool
            Preload content from all pages.
        reader_kwargs : dict
            Other kwargs for `pdfminer.high_level.extract_pages()`.
        """
        # Work on a shallow copy so the caller's dict is not mutated when
        # we drop 'page_numbers' (the explicit argument always wins).
        reader_kwargs = dict(reader_kwargs) if reader_kwargs else {}
        reader_kwargs.pop('page_numbers', None)

        pages = pmhl.extract_pages(
            fn, page_numbers=page_numbers, **reader_kwargs
        )
        if preload:
            # Materialize the lazy generator up front.
            pages = list(pages)
        return cls(pages)
Beispiel #6
0
def element_generator(
        file_path: str,
        page_numbers=None) -> Generator[LTTextContainer, None, None]:
    """Yield the text containers of a document as one flat stream.

    Each yielded element gets a ``meta`` dict carrying its 0-based page
    number.

    :param file_path: path of the PDF to analyse.
    :param page_numbers: optional page selection forwarded to pdfminer.
    """
    # boxes_flow is disabled: the style-based hierarchy detection downstream
    # expects a purely flat list of paragraphs. (For column layouts,
    # LAParams(boxes_flow=0.5, detect_vertical=True) was noted to work
    # better; pre-analysis of box placement is still a TODO.)
    layout_params = LAParams(boxes_flow=None, detect_vertical=False)
    for page_index, page_layout in enumerate(
            extract_pages(file_path,
                          laparams=layout_params,
                          page_numbers=page_numbers)):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            element.meta = {"page": page_index}
            yield element
Beispiel #7
0
def parse():
    """Build an inverted index over the pages of all uploaded PDFs.

    Reads every file from the configured upload folder, indexes the text
    of each page under its page id, and prints the resulting index.
    NOTE(review): the trailing ''' opens a string literal that continues
    beyond this excerpt.
    """
    files = os.listdir(bp.config['UPLOAD_FOLDER'])
    filepaths = [bp.config['UPLOAD_FOLDER'] + '/' + x for x in files]
    # Map a running enumeration id to each file path.
    d_files = {k: v for k, v in enumerate(filepaths)}


    print(d_files)

    VocabularyIndex = InvertedIndex()

    for k, v in d_files.items():
        # iterate through document pages
        for page_layout in extract_pages(v):

            # compile all text on page
            page_text = ""
            for (count, element) in enumerate(page_layout, 1):
                if isinstance(element, LTTextContainer):
                    page_text += element.get_text()

            #print(page_text)

            # add page to inverted index, keyed by the pdfminer page id
            page_no = int(page_layout.pageid)
            VocabularyIndex.index_document(page_no, page_text)

            print(f"processed {page_no}")

    data = VocabularyIndex.get_index()
    print(data)

    '''
Beispiel #8
0
def findMaxFontSize(pdf):
    """Return the largest font size found on the first page of *pdf*.

    Only the first character of each text element's first line is sampled,
    and at most LIMIT elements are inspected.

    :param pdf: path/URL of the PDF to process.
    :return: the maximum font size seen (0 when nothing qualifies).
    """
    maxFontSize = 0
    count = 0
    LIMIT = 4  # inspect at most this many elements

    # page_numbers expects a container of 0-based indices; the original
    # passed the bare int 0, which pdfminer treats as falsy (= all pages)
    # and only the outer `break` limited the work. Request page 0 directly.
    for page_layout in extract_pages(pdf, page_numbers=[0]):
        for element in page_layout:
            count += 1
            if not isinstance(element, LTTextContainer): continue

            # The first character of the element's first line is taken as
            # representative of the whole element.
            first_line = next(iter(element))
            first_ch = next(iter(first_line))
            if not isinstance(first_ch, LTChar): continue

            size = first_ch.size
            if (size > maxFontSize):
                maxFontSize = size

            if count == LIMIT: break  # end for element
        break  # end for page_layout
    return maxFontSize
def get_layout(file_path):
    """Append the repr of every layout element of *file_path* to a dump file.

    :param file_path: PDF to analyse.
    """
    # Open the dump file once instead of re-opening and closing it for
    # every single element (the original `with` sat in the innermost loop).
    with open('../data/xml_files/adho_conferences/testlayout.txt', 'a',
              encoding="utf-8") as fd:
        for page_layout in extract_pages(file_path):
            for element in page_layout:
                fd.write(str(element))
Beispiel #10
0
def pdf_to_text(fp, parse_page=None):
    '''
    Convert a PDF to plain text.

    Input parameters
    ------
    fp: file path of pdf
    parse_page: page numbers (starting at 0) to convert to text.
        All pages if None or empty.

    Output returns
    ------
    string_output: String output of contents in pdf
    '''
    # None replaces the original mutable default `[]` (same semantics:
    # an empty selection means every page).
    if parse_page is None:
        parse_page = []

    output_string = StringIO()
    with open(fp, 'rb') as in_file:
        # (removed: a full extract_pages() pass that counted pages solely
        # for a commented-out print -- it parsed the whole document twice.)
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(doc)):
            # Skip pages that were not requested.
            if len(parse_page) > 0 and i not in parse_page:
                continue
            interpreter.process_page(page)

    string_output = output_string.getvalue()
    # Close open handles
    device.close()
    output_string.close()

    return string_output
Beispiel #11
0
def get_aadhar_details(path, password):
    """Extract (name, mobile number, aadhar number) from an Aadhaar PDF.

    Relies on the 2nd horizontal textbox holding the address block and the
    24th holding the aadhar number -- assumed fixed layout, TODO confirm
    against sample documents.

    :param path: path of the (possibly password-protected) PDF.
    :param password: PDF password, forwarded to pdfminer.
    :return: tuple (name, mobile, aadhar) on success, False on any failure.
    """
    i = 0
    try:
        for page_layout in extract_pages(path, password):
            for element in page_layout:
                if isinstance(element, LTTextBoxHorizontal):
                    i += 1
                    if i == 2:
                        addr = element.get_text()
                    elif i == 24:
                        aadhar_no = element.get_text()

        addr_list = addr.splitlines()
        num_list = aadhar_no.splitlines()
        # num_list[0] is the aadhar number with embedded spaces; strip them.
        aadhar = num_list[0]
        splitted_aadhar_with_space = aadhar.split(' ')
        aadhar = "".join(splitted_aadhar_with_space)
        # addr_list[2] = name, addr_list[-1] = mobile number
        return (addr_list[2], addr_list[-1], aadhar)
    except Exception:
        # Narrowed from a bare `except:`; any parse/layout failure (incl.
        # the expected textboxes being absent) still yields False, but
        # KeyboardInterrupt / SystemExit now propagate.
        return False
Beispiel #12
0
 def identify_features(self,pdf_path,doc2vec):
     """Rank the paragraphs of *pdf_path* by similarity to feature examples.

     Returns (paragraphs, features_similarities) where the latter maps each
     feature name to [paragraph_index, similarity] pairs sorted ascending
     by similarity.
     """
     # SECURITY NOTE(review): pickle.load on "features.p" executes arbitrary
     # code if the file is untrusted -- confirm its provenance.
     features=pickle.load(open("features.p","rb"))
     # transform all the feature example texts to doc2vec embeddings
     f_vecs=doc2vec.transform([v for k,v in features.items()])
     paragraphs=[]
     # walk the pages of the pdf document
     for page_layout in extract_pages(pdf_path):
         # and all the elements in each page layout
         for i, element in enumerate(page_layout):
             # only keep text containers of a minimum length whose first
             # character is a letter (filters page furniture / numbering)
             if isinstance(element, LTTextContainer) and len(element.get_text()) > 20 and re.match('[a-zA-Z]+',element.get_text()[0]) != None:
                 paragraphs.append(element.get_text())
     # transform all paragraphs into doc2vec embeddings
     paragraphs_d2v=doc2vec.transform(paragraphs)
     # ordered dict keeping, per feature, which paragraphs are most
     # representative -- the insertion order is what lets the similarity
     # rows below be addressed by index
     features_similarities=OrderedDict({k:[] for k in features.keys()})
     for i,p in enumerate(paragraphs_d2v):
         # similarity of one paragraph against all the feature examples
         sims=doc2vec.model.wv.cosine_similarities(p.toarray().transpose(),f_vecs.toarray())
         # append each feature's corresponding similarity
         for ix,k in enumerate(features_similarities.keys()):
             # NOTE(review): sims[ix][ix] reads the diagonal entry -- verify
             # this is the intended paragraph-vs-feature pairing.
             features_similarities[k].append([i,sims[ix][ix]])
     # sort each feature's matches by ascending similarity
     for k in features_similarities.keys():
         features_similarities[k]=sorted(features_similarities[k],key=lambda x: x[1])
     return paragraphs,features_similarities
 def read(
         self,
         override_la_params=None,
         override_page_numbers=None
 ) -> Generator[LTTextContainer, Any, None]:
     """Yield text containers (and figure contents) of the document.

     Each element is tagged with its 0-based page number via
     ``element.page``. Optional overrides replace the instance's LAParams
     and page selection for this call only.
     """
     la_params = override_la_params if override_la_params else self.la_params
     page_numbers = (override_page_numbers
                     if override_page_numbers else self.page_numbers)
     # boxes_flow stays disabled in the default params: the style-based
     # hierarchy detection downstream expects a flat list of paragraphs.
     page_iter = extract_pages(self.uri,
                               laparams=la_params,
                               page_numbers=page_numbers)
     for page_number, page_layout in enumerate(page_iter):
         for element in page_layout:
             element.page = page_number
             if isinstance(element, LTTextContainer):
                 yield from self.split_boxes_by_style(element)
             elif isinstance(element, LTFigure):
                 yield from self.__handle_lt_figure(element)
Beispiel #14
0
def layout():
    """Dump the UTF-8 encoded text of every text container in FILE_PATH."""
    writeText()
    # Negative boxes_flow biases the layout analysis toward horizontal order.
    for page in extract_pages(FILE_PATH, laparams=LAParams(boxes_flow=-0.5)):
        for el in page:
            if isinstance(el, LTTextContainer):
                print(el.get_text().encode('utf8'))
Beispiel #15
0
 def mock_download(self):
     """Return the page layouts of the locally cached sections PDF."""
     pdf_path = join(DOWNLOADING_PATH, f'{product}_sections_{language}.pdf')
     layout_params = LAParams(boxes_flow=BOXES_FLOW, char_margin=CHAR_MARGIN)
     return list(extract_pages(pdf_path, laparams=layout_params))
Beispiel #16
0
 def _extract_pages_and_text_containers(self, pdf):
     """Cache page layouts (minus the first page) and, per page, the list
     of text containers that are neither empty nor whitespace-only."""
     self._pages = list(
         extract_pages(pdf, laparams=self.data.get('laparams', None)))[1:]
     for page in self._pages:
         containers = []
         for element in page:
             if not isinstance(element, LTTextContainer):
                 continue
             text = element.get_text()
             # Skip blank containers.
             if text != '' and not re.search(r'^\s+$', text):
                 containers.append(element)
         self._text_containers.append(containers)
Beispiel #17
0
def _collect_author_chars(text_line):
    """Collect the characters of one text line that match the line's leading
    font size; superscript markers (†, *) and spaces become single spaces."""
    size = next(iter(text_line)).size
    fragment = ''
    for character in text_line:
        ch = character.get_text()
        # Drop the special superscript markers but keep word separation.
        if ch == ' ' or ch == '†' or ch == '*':
            fragment += ' '
        elif not isinstance(character, LTChar):
            continue
        elif str(character.size) == str(size):
            fragment += ch
    return fragment


def extractAuthor(pdf, elem, line):
    """Extract the author names from the first page of *pdf*.

    :param pdf: path/URL of the PDF to process.
    :param elem: index of the text element that contains the author block.
    :param line: offset of the first author line inside that element;
        0 means the whole element holds author text.
    :return: gbk-encoded author string ('&' and 'and ' become commas).
    """
    author_str = ''

    # page_numbers expects a container of indices; the bare int 0 used
    # before is falsy and meant "all pages" (the outer break limited the
    # work). Request page 0 explicitly.
    for page_layout in extract_pages(pdf, page_numbers=[0]):
        elem_count = 0
        for element in page_layout:
            if not isinstance(element, LTTextContainer): continue

            # Count text containers until the requested one is reached.
            if elem_count != elem:
                elem_count += 1
                continue

            if line == 0:  # read the whole target element
                for text_line in element:
                    author_str += _collect_author_chars(text_line)
            else:  # read from line offset `line` to the element's end
                for line_count, text_line in enumerate(element):
                    if line_count < line:
                        continue
                    author_str += _collect_author_chars(text_line)
            break  # end for element
        break  # end for page_layout

    author_str = author_str.replace(u'\xa0',
                                    u'').replace('&',
                                                 ',').replace('and ', ',')
    return author_str.encode('gbk', 'ignore')
Beispiel #18
0
 def _extract_pages_and_text_containers(self, pdf):
     """Cache all page layouts and, per page, the non-blank text containers."""
     self._pages = list(
         extract_pages(pdf, laparams=self.data.get('laparams', None)))
     for page in self._pages:
         # The original wrapped a single condition in `all([...])`;
         # the plain comparison is equivalent.
         self._text_containers.append([
             element for element in page
             if isinstance(element, LTTextContainer)
             and element.get_text().strip() != ''
         ])
def extractpages(i):
    """Append the text of every text container in *i* to the module-level
    list ``c`` and return it.

    NOTE(review): ``c`` is a global accumulator, so repeated calls keep
    appending -- confirm callers expect that.
    """
    for page_layout in extract_pages(i):
        for container in page_layout:
            if isinstance(container, LTTextContainer):
                c.append(container.get_text())
    return c
Beispiel #20
0
 def _extract_pages_and_text_containers(self, pdf):
     """Method called to prepare the pages and containers from pdf for processing during pre-process."""
     self._pages = list(
         extract_pages(pdf, laparams=self.data.get('laparams', None)))
     for page in self._pages:
         kept = []
         for element in page:
             # Keep only text containers that are neither empty nor
             # whitespace-only.
             if (isinstance(element, LTTextContainer)
                     and element.get_text() != ''
                     and not re.search(r'^\s+$', element.get_text())):
                 kept.append(element)
         self._text_containers.append(kept)
 def download(self):
     """Download EPAR pdf and return its page layouts."""
     # urlretrieve returns (local_filename, headers); keep the filename.
     local_path, _ = urlretrieve(self.download_url_)
     layout_params = LAParams(boxes_flow=self.BOXES_FLOW_,
                              char_margin=self.CHAR_MARGIN_)
     return list(extract_pages(local_path, laparams=layout_params))
Beispiel #22
0
def page():
    """Print the text, font name and size of every character in file ``f``."""
    layout = extract_pages(f)
    # extract_pages yields page layouts (LTPage), never LTTextContainer
    # directly, so the original isinstance check on the page object itself
    # never matched and nothing was printed. The text containers live one
    # level down, inside each page layout.
    for page_layout in layout:
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                print(element.get_text())
                for text_line in element:
                    for character in text_line:
                        if isinstance(character, LTChar):
                            print(character.fontname)
                            print(character.size)
Beispiel #23
0
 def check_decision(cls,
                    item: Optional[PDPCDecisionItem] = None,
                    options: Optional[Options] = None) -> bool:
     """Return True when the first page of the decision PDF is a summary page."""
     with pdpc_decisions.classes.PDFFile(item, options) as pdf:
         first_page = extract_pages(pdf, page_numbers=[0])
         # any() short-circuits on the first matching container, just like
         # the early return it replaces.
         return any(
             container.get_text().strip() == 'SUMMARY OF THE DECISION'
             for container in common.extract_text_containers(first_page))
Beispiel #24
0
def load(
    pdf_file: IO,
    pdf_file_path: Optional[str] = None,
    la_params: Optional[Dict] = None,
    **kwargs,
) -> PDFDocument:
    """
    Loads the pdf file into a PDFDocument.

    Args:
        pdf_file (io): The PDF file.
        la_params (dict): The layout parameters passed to PDF Miner for analysis. See
            the PDFMiner documentation here:
            https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams.
            Note that py_pdf_parser will re-order the elements it receives from PDFMiner
            so options relating to element ordering will have no effect.
        pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation
            for `PDFDocument`.
        kwargs: Passed to `PDFDocument`. See the documentation for `PDFDocument`.

    Returns:
        PDFDocument: A PDFDocument with the file loaded.
    """
    # Caller-supplied params win over the defaults.
    la_params = {**DEFAULT_LA_PARAMS, **(la_params or {})}

    pages: Dict[int, Page] = {}
    for page in extract_pages(pdf_file, laparams=LAParams(**la_params)):
        elements = [el for el in page if isinstance(el, LTTextBox)]

        # With all_texts=True text may also appear inside figures; pull
        # those text boxes in as well.
        if la_params.get("all_texts"):
            for figure in (el for el in page if isinstance(el, LTFigure)):
                elements.extend(
                    el for el in figure if isinstance(el, LTTextBox))

        if not elements:
            logger.warning(
                f"No elements detected on page {page.pageid}, skipping this page."
            )
            continue

        pages[page.pageid] = Page(width=page.width,
                                  height=page.height,
                                  elements=elements)

    return PDFDocument(pages=pages, pdf_file_path=pdf_file_path, **kwargs)
Beispiel #25
0
def from_pdf_to_txt(read_file):
    """Return the text of *read_file* with colons and spaces stripped.

    Only horizontal text boxes are read.
    """
    # Collect fragments and join once instead of quadratic `+=` growth.
    parts = []
    for page_layout in extract_pages(read_file):
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal):
                text = element.get_text()
                # NOTE(review): the original stripped ':' on two consecutive
                # lines; one occurrence may have been meant as a full-width
                # colon -- confirm. Both calls preserved as-is.
                text = text.replace(":", '')
                text = text.replace(":", '')
                parts.append(text.replace(' ', ''))
    return ''.join(parts)
    def test_no_boxes_flow(self):
        """With boxes_flow disabled, the three lines merge into one box."""
        pages = list(
            extract_pages(self._get_test_file_path(),
                          laparams=LAParams(boxes_flow=None)))
        self.assertEqual(len(pages), 1)

        text_boxes = [
            el for el in pages[0] if isinstance(el, LTTextContainer)
        ]
        self.assertEqual(len(text_boxes), 1)
        self.assertEqual(text_boxes[0].get_text(), "Text1\nText2\nText3\n")
def extract_PDF_textbox(pdf_name=PDF_NAME):
    """Store each textbox's lines in the module-level RESULT dict, keyed by
    a running 1-based group index, and report how many were converted."""
    text_group_index = 0
    la_params = LAParams(line_margin=LINE_MARGIN)
    for page_layout in extract_pages(pdf_name, laparams=la_params):
        for element in page_layout:
            if not isinstance(element, LTTextBox):
                continue
            text_group_index += 1
            RESULT[text_group_index] = [
                text_line.get_text() for text_line in element
            ]

    return f"Converted {text_group_index} group of texts from PDF"
Beispiel #28
0
 def _extract_pages_and_text_containers(self, pdf):
     """Cache page layouts (dropping a detected cover page) plus their text
     containers in reading order: top-to-bottom, ties left-to-right."""
     self._pages = list(extract_pages(pdf, laparams=self.data.get('laparams', None)))
     if common.check_first_page_is_cover(pdf):
         self._pages = self._pages[1:]
     for page in self._pages:
         containers = [element for element in page if isinstance(element, LTTextContainer) and
                       element.get_text() != '' and not
                       re.search(r'^\s+$', element.get_text())]
         containers = common.split_joined_text_containers(containers)
         # Single sort with a compound key: primary y0 descending,
         # secondary x0 ascending -- equivalent to the original two
         # stable sort passes.
         containers = sorted(containers, key=lambda item: (-item.y0, item.x0))
         self._text_containers.append(containers)
Beispiel #29
0
def process_args(inputfile: str, outputfile: str):
    """Read *inputfile*, keep text set in the document's most common font
    size, and synthesize it into *outputfile* as audio.

    :param inputfile: path of the source PDF.
    :param outputfile: path of the audio file to write.
    """
    print('{0} is the input and {1} is output'.format(inputfile, outputfile))
    common_font_size = get_common_font_size(inputfile)
    # Collect fragments and join once (the original grew a string with `+=`).
    parts = ["Created with Ardio by Bell Eapen at nuchange.com. "]
    for page_layout in extract_pages(inputfile):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                # Only body text (set in the common font size) is narrated.
                if(get_common_font_size_of_element(element) == common_font_size):
                    parts.append(remove_all_but_alpabets(element.get_text()))
    FINAL_OUTPUT = "".join(parts)
    print(FINAL_OUTPUT)
    write_audio_file(FINAL_OUTPUT, outputfile)
Beispiel #30
0
def get_common_font_size(inputfile: str):
    """Return the most frequent character font size in *inputfile*.

    :param inputfile: path of the PDF to analyse.
    :raises ValueError: (from ``max``) when the document has no characters.
    """
    SIZE_COUNT = {}
    for page_layout in extract_pages(inputfile):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                for text_line in element:
                    for character in text_line:
                        if isinstance(character, LTChar):
                            # dict.get collapses the original if/else upsert.
                            SIZE_COUNT[character.size] = SIZE_COUNT.get(
                                character.size, 0) + 1
    return max(SIZE_COUNT.items(), key=operator.itemgetter(1))[0]