Beispiel #1
0
    def raw_auditor(self):
        """Return the auditor names found on the audit report's closing pages.

        For every page range returned by ``get_outline_pageRange`` the last
        page of the range is searched with ``AnnualReport.auditor_regex``.
        When nothing matches and at least one of those pages is flagged by
        ``abnormal_page``, each page is swapped for a substitute chosen by
        ``turn_n_page`` (landscape pages with an offset of 2, pages flagged by
        ``is_full_cn`` with an offset of -1, other pages unchanged) and the
        search is repeated once.

        Returns:
            tuple: the distinct values of the ``auditor`` match group. They
            are gathered in a set, so their order is not meaningful.
        """
        def run_search(pages):
            # One search result per page; a result may be falsy, a single
            # match, or a tuple of matches.
            return [
                search_pattern_from_page_or_cols(
                    page=candidate, pattern=AnnualReport.auditor_regex)
                for candidate in pages
            ]

        def substitute(pdf, candidate):
            # Landscape takes precedence over the is_full_cn check.
            if is_landscape(candidate):
                return turn_n_page(pdf, candidate, 2)
            if is_full_cn(candidate):
                return turn_n_page(pdf, candidate, -1)
            return candidate

        with _by_pdfplumber(self.pdf_obj) as pdf:
            closing_pages = [
                pdf.pages[span[-1]] for span in self.get_outline_pageRange()
            ]
            found = run_search(closing_pages)

            nothing_found = not any(found)
            if nothing_found and any(
                    abnormal_page(candidate) for candidate in closing_pages):
                closing_pages = [
                    substitute(pdf, candidate) for candidate in closing_pages
                ]
                found = run_search(closing_pages)

            names = set()
            for hit in found:
                if not hit:
                    continue
                # Normalise a single match to a one-element tuple so both
                # result shapes go through the same loop.
                group = hit if type(hit) is tuple else (hit,)
                for match in group:
                    if match:
                        names.add(match.group('auditor'))
            return tuple(names)
Beispiel #2
0
    def search_outline_in_pages(self, pattern, page_range=None, size='fontname', verbose=False, show_matched=False) -> list:
        '''
        Scan the title-like text of pages for ``pattern``.

        pattern      -- passed unchanged to ``search_pattern_from_txt``.
        page_range   -- iterable of indices into ``pdf.pages``; a falsy value
                        means every page is scanned.
        size         -- forwarded to ``get_title_liked_txt``.
        verbose      -- print a line for every matching text.
        show_matched -- also return the matching texts.

        Returns a list of tuples built from the groups that
        ``consecutive_int_list`` produces over the matching page indices
        (``page_number - 1``). With ``show_matched`` the return value is the
        2-tuple ``(that list, matched texts)`` rather than a bare list.

        Pages for which ``get_title_liked_txt`` raises ``KeyError`` are
        logged as non textual and skipped.
        '''
        print('search by page!')
        hit_pages = set()
        hit_texts = []
        with _by_pdfplumber(self.pdf_obj) as pdf:
            if page_range:
                candidates = [pdf.pages[index] for index in page_range]
            else:
                candidates = pdf.pages

            for candidate in candidates:
                p = candidate.page_number - 1

                try:
                    titles = get_title_liked_txt(candidate, size=size)
                except KeyError:
                    logging.warning('Non textual page')
                    continue

                for txt in titles:
                    if not search_pattern_from_txt(txt, pattern):
                        continue
                    hit_pages.add(p)
                    hit_texts.append(txt)
                    if verbose:
                        print(f'with pattern: found {txt} on p.{p}!')

            page_runs = [
                tuple(run)
                for run in consecutive_int_list(unique(hit_pages))
            ]
            return (page_runs, hit_texts) if show_matched else page_runs
Beispiel #3
0
 def get_outline_pageRange(self, outline_pattern=None):
     """Look up the pages of the section whose outline matches a pattern.

     ``outline_pattern`` defaults to ``AnnualReport.audit_report_regex``.
     The result of ``_get_page_by_outline`` on ``self.toc`` is returned when
     it is truthy; otherwise the result of
     ``_get_page_by_page_title_search`` on the opened PDF is returned.
     """
     pattern = (AnnualReport.audit_report_regex
                if outline_pattern is None else outline_pattern)
     # The PDF is opened before the TOC lookup, even if the fallback search
     # ends up not being needed.
     with _by_pdfplumber(self.pdf_obj) as pdf:
         from_toc = _get_page_by_outline(self.toc, pattern)
         if from_toc:
             return from_toc
         return _get_page_by_page_title_search(pdf, pattern)
 def target_section(self, p):
     """Crop page ``p`` (an index into ``pdf.pages``) to the target box.

     The page's characters are loaded into a DataFrame and every row whose
     ``text`` contains a non-ASCII character is dropped. The remaining rows
     are handed to ``self.target_x0``, ``self.target_x1``,
     ``self.target_top`` and ``self.target_bottom`` to obtain the box edges;
     a falsy bottom edge falls back to the page height.

     Returns the result of ``page.crop`` for that box with ``relative=True``.
     """
     with _by_pdfplumber(self.pdf_obj) as pdf:
         page = pdf.pages[p]
         chars = pd.DataFrame(page.chars)
         has_non_ascii = chars.text.str.contains(r'[^\x00-\x7F]+')
         ascii_chars = chars[~has_non_ascii]

         left = float(self.target_x0(ascii_chars))
         right = float(self.target_x1(ascii_chars))
         upper = float(self.target_top(ascii_chars))
         lower = float(self.target_bottom(ascii_chars) or page.height)

         bounding_box = (left, upper, right, lower)
         return page.crop(bounding_box, relative=True)