def run_paragraph_test(cls, text: str, expected_paragraphs, window_pre=3, window_post=3):
    """
    Base test method to run against text with given results.

    Splits ``text`` with ``get_paragraphs`` (spans disabled) and asserts that
    the resulting paragraphs equal ``expected_paragraphs``. Both sides are
    whitespace-normalized first, so differences in line breaks, tabs or
    repeated spaces do not fail the comparison.

    :param cls: unused by this helper; kept so existing callers still work.
    :param text: source text to split into paragraphs.
    :param expected_paragraphs: iterable of expected paragraph strings.
    :param window_pre: forwarded unchanged to ``get_paragraphs``.
    :param window_post: forwarded unchanged to ``get_paragraphs``.
    """

    def remove_blankspace(r: str):
        # Line breaks and tabs count as plain spaces for comparison purposes.
        r = r.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        # Splitting on a single space and dropping the empty pieces squeezes
        # a run of spaces of any length in one pass and always terminates.
        squeezed = ' '.join(piece for piece in r.split(' ') if piece)
        return squeezed.strip()

    # Get list from text
    actual_paragraphs = [
        remove_blankspace(p)
        for p in get_paragraphs(text,
                                window_pre=window_pre,
                                window_post=window_post,
                                return_spans=False)
    ]
    expected_paragraphs = [remove_blankspace(p) for p in expected_paragraphs]
    assert_list_equal(actual_paragraphs, expected_paragraphs)
def test_get_paragraphs_too_small_text_with_spans():
    """
    With ``return_spans=True`` the first item produced for a very short text
    must be the whole text together with offsets ``0`` and ``len(text)``.
    """
    text = '\nToo small text\n'
    found = list(get_paragraphs(text=text, return_spans=True))
    whole_text_span = (text, 0, len(text))
    assert_tuple_equal(whole_text_span, found[0])
def get_paragraphs(self, text=None):
    """
    Return the paragraphs of ``text`` as a list.

    When ``text`` is falsy (``None`` or empty) the instance's own
    ``self.text`` is split instead.
    """
    source = text or self.text
    paragraphs = lex_paragraphs.get_paragraphs(source)
    return list(paragraphs)
def extract_text_and_structure(pdf_fn: str,
                               pdf_password: str = None,
                               timeout_sec: int = 3600,
                               language: str = "",
                               correct_pdf: bool = False,
                               render_coords_debug: bool = False) \
        -> Tuple[str, TextAndPDFCoordinates, str, Dict[int, float]]:
    """
    Extract plain text, its structure and character coordinates from a PDF.

    Runs the Java class ``com.lexpredict.textextraction.GetTextFromPDF`` in a
    subprocess, reads the msgpack file it writes into a temporary directory
    and builds pages, sentences, paragraphs, sections and a title from the
    extracted text.

    This is a generator that yields exactly one tuple
    ``(text, structure, corrected_pdf_fn, page_rotate_angles)``. The temporary
    directory is removed when the generator is finalized, so when
    ``correct_pdf`` is enabled the yielded PDF path is only valid while the
    generator is suspended at the yield.
    NOTE(review): presumably wrapped as a context manager by a decorator
    outside this block - confirm against the caller.
    NOTE(review): the last tuple element is annotated ``Dict[int, float]`` but
    is built as a list of per-page ``deskewAngle`` values, and is ``None``
    when the extracted text is empty.

    :param pdf_fn: path of the PDF handed to the Java tool.
    :param pdf_password: passed to the tool with ``-p`` when truthy.
    :param timeout_sec: timeout for the subprocess, in seconds.
    :param language: converted with ``LanguageConverter``; when the converted
        value is falsy the language is predicted per text segment instead.
    :param correct_pdf: ask the tool for a corrected PDF written into the
        temporary directory; otherwise ``pdf_fn`` itself is yielded.
    :param render_coords_debug: adds ``-render_char_rects`` and forces
        ``correct_pdf``.
    :raises subprocess.TimeoutExpired: when the tool exceeds ``timeout_sec``.
    """
    # Debug rendering always implies producing the corrected PDF.
    correct_pdf = correct_pdf or render_coords_debug

    modules_dir = get_settings().java_modules_path

    # Convert language to language code
    language, _locale_code = LanguageConverter().get_language_and_locale_code(language)

    temp_dir = mkdtemp(prefix='pdf_text_')
    pdf_stem = os.path.splitext(os.path.basename(pdf_fn))[0]
    out_fn = os.path.join(temp_dir, pdf_stem + '.msgpack')
    out_pdf_fn = pdf_fn
    try:
        cmd = ['java', '-cp', f'{modules_dir}/*',
               'com.lexpredict.textextraction.GetTextFromPDF',
               pdf_fn, out_fn,
               '-f', 'pages_msgpack']
        if pdf_password:
            cmd += ['-p', pdf_password]
        if correct_pdf:
            out_pdf_fn = os.path.join(temp_dir, pdf_stem + '_corr.pdf')
            cmd += ['-corrected_output', out_pdf_fn]
        if render_coords_debug:
            cmd.append('-render_char_rects')

        completed_process: CompletedProcess = subprocess.run(
            cmd,
            check=False,
            timeout=timeout_sec,
            universal_newlines=True,
            stderr=PIPE,
            stdout=PIPE)
        raise_from_process(
            log,
            completed_process,
            process_title=lambda: f'Extract text and structure from {pdf_fn}')
        raise_from_pdfbox_error_messages(completed_process)

        with open(out_fn, 'rb') as pages_f:
            # see object structure in com.lexpredict.textextraction.dto.PDFPlainText
            pdfbox_res: Dict[str, Any] = msgpack.unpack(pages_f, raw=False)

        # Remove Null characters because of incompatibility with PostgreSQL
        text = pdfbox_res['text'].replace("\x00", "")

        if not text:
            # Nothing to segment: yield an empty structure with the coordinates only.
            pdf_coordinates = PDFCoordinates(char_bboxes=pdfbox_res['charBBoxes'])
            empty_struct = PlainTextStructure(
                title='',
                language=language or 'en',  # FastText returns English for empty strings
                pages=[],
                sentences=[],
                paragraphs=[],
                sections=[])
            empty_result = TextAndPDFCoordinates(text_structure=empty_struct,
                                                 pdf_coordinates=pdf_coordinates)
            yield text, empty_result, out_pdf_fn, None
            return

        pdf_pages = pdfbox_res['pages']
        page_rotate_angles: List[float] = [page['deskewAngle'] for page in pdf_pages]
        pages = [
            PlainTextPage(number=index,
                          start=page['location'][0],
                          end=page['location'][1],
                          bbox=page['bbox'])
            for index, page in enumerate(pdf_pages)
        ]

        sentence_spans = get_sentence_span_list(text)
        lang = get_lang_detector()

        def language_of(fragment):
            # An explicitly requested language wins over per-fragment detection.
            return language or lang.predict_lang(fragment)

        sentences = [
            PlainTextSentence(start=start, end=end, language=language_of(segment))
            for start, end, segment in sentence_spans
        ]

        # There was a try-except in Contraxsuite catching some lexnlp exception.
        # Not putting it here because it should be solved on lexnlp side.
        paragraphs = [
            PlainTextParagraph(start=start, end=end, language=language_of(segment))
            for segment, start, end in get_paragraphs(text, return_spans=True)
        ]

        sections = [
            PlainTextSection(title=sect.title,
                             start=sect.start,
                             end=sect.end,
                             title_start=sect.title_start,
                             title_end=sect.title_end,
                             level=sect.level,
                             abs_level=sect.abs_level)
            for sect in get_document_sections_with_titles(text, sentence_list=sentence_spans)
        ]

        try:
            title = next(get_titles(text))
        except StopIteration:
            title = None

        text_struct = PlainTextStructure(title=title,
                                         language=language_of(text),
                                         pages=pages,
                                         sentences=sentences,
                                         paragraphs=paragraphs,
                                         sections=sections)
        pdf_coordinates = PDFCoordinates(char_bboxes=pdfbox_res['charBBoxes'])
        result = TextAndPDFCoordinates(text_structure=text_struct,
                                       pdf_coordinates=pdf_coordinates)
        yield text, result, out_pdf_fn, page_rotate_angles
    finally:
        # Removes the msgpack output and any corrected PDF along with the directory.
        shutil.rmtree(temp_dir, ignore_errors=True)
def test_date_text(self):
    """
    A text consisting only of an ISO-style timestamp must come back as the
    first paragraph, equal to the input.
    """
    timestamp = '2021-01-20T10:32:31.938706'
    paragraphs = list(get_paragraphs(text=timestamp, return_spans=False))
    first_paragraph = paragraphs[0]
    self.assertEqual(timestamp, first_paragraph)