Ejemplo n.º 1
0
    def run_paragraph_test(cls,
                           text: str,
                           expected_paragraphs,
                           window_pre=3,
                           window_post=3):
        """
        Split ``text`` into paragraphs and check them against the expectation.

        Actual and expected paragraphs are both whitespace-normalized before
        the comparison: newlines, carriage returns and tabs count as spaces,
        runs of spaces collapse to a single one and the ends are stripped.

        :param text: text handed to ``get_paragraphs``
        :param expected_paragraphs: iterable of paragraph strings expected back
        :param window_pre: forwarded to ``get_paragraphs`` unchanged
        :param window_post: forwarded to ``get_paragraphs`` unchanged
        """
        # Maps each of '\n', '\r', '\t' onto a plain space in a single pass.
        blank_table = str.maketrans('\n\r\t', '   ')

        def normalize(raw: str):
            spaced = raw.translate(blank_table)
            # Splitting on a literal space leaves empty chunks wherever spaces
            # were adjacent; dropping them collapses each run to one space.
            chunks = [chunk for chunk in spaced.split(' ') if chunk]
            return ' '.join(chunks).strip()

        # Materialize the paragraphs first, then normalize both sides.
        raw_paragraphs = list(
            get_paragraphs(text,
                           window_pre=window_pre,
                           window_post=window_post,
                           return_spans=False))

        actual_paragraphs = list(map(normalize, raw_paragraphs))
        expected_paragraphs = list(map(normalize, expected_paragraphs))

        assert_list_equal(actual_paragraphs, expected_paragraphs)
Ejemplo n.º 2
0
def test_get_paragraphs_too_small_text_with_spans():
    """The first span of a very short text is the whole text at offsets 0..len."""
    text = '\nToo small text\n'
    expected_span = (text, 0, len(text))
    found_spans = [*get_paragraphs(text=text, return_spans=True)]
    assert_tuple_equal(expected_span, found_spans[0])
Ejemplo n.º 3
0
 def get_paragraphs(self, text=None):
     """
     Return the paragraphs of ``text`` as a list.

     When ``text`` is omitted or empty, ``self.text`` is used instead.
     """
     source_text = text or self.text
     return [*lex_paragraphs.get_paragraphs(source_text)]
Ejemplo n.º 4
0
def extract_text_and_structure(pdf_fn: str,
                               pdf_password: str = None,
                               timeout_sec: int = 3600,
                               language: str = "",
                               correct_pdf: bool = False,
                               render_coords_debug: bool = False) \
        -> Tuple[
            str, TextAndPDFCoordinates, str, Dict[int, float]]:  # text, structure, corrected_pdf_fn, page_rotate_angles
    """
    Extract plain text, its structure and character coordinates from a PDF.

    The work is done by the Java class
    ``com.lexpredict.textextraction.GetTextFromPDF``, run as a subprocess that
    writes a msgpack file into a fresh temporary directory. The unpacked
    result is then split into pages, sentences, paragraphs and sections.

    This is a generator that yields exactly once. The yielded value is the
    tuple ``(text, TextAndPDFCoordinates, out_pdf_fn, page_rotate_angles)``.
    The temporary directory is removed in the ``finally`` clause, so a
    corrected PDF stored there only exists while the generator is suspended
    at the ``yield``.
    NOTE(review): presumably wrapped as a context manager by a decorator that
    is not visible here - confirm. The return annotation describes the yielded
    tuple, not the generator itself.

    :param pdf_fn: path of the PDF file to process
    :param pdf_password: passed to the Java tool with ``-p`` when truthy
    :param timeout_sec: timeout for the Java subprocess, in seconds
    :param language: converted by ``LanguageConverter``; when the converted
        value is falsy the language is predicted per text fragment
    :param correct_pdf: ask the Java tool for a corrected PDF, written to
        the temporary directory (``-corrected_output``)
    :param render_coords_debug: adds ``-render_char_rects`` and forces
        ``correct_pdf`` on
    :return: yields ``text`` with NUL characters removed, the structure plus
        PDF coordinates, the corrected PDF path (``pdf_fn`` itself when no
        correction was requested) and the per-page ``deskewAngle`` list
        (``None`` when the extracted text is empty)
    :raises subprocess.TimeoutExpired: when the Java process exceeds
        ``timeout_sec``
    """

    # Rendering the debug rectangles goes into the corrected PDF, so asking
    # for it implies producing that PDF.
    make_corrected_pdf = True if render_coords_debug else correct_pdf

    modules_dir = get_settings().java_modules_path

    # Convert language to language code
    language, _locale_code = LanguageConverter().get_language_and_locale_code(
        language)

    temp_dir = mkdtemp(prefix='pdf_text_')
    pdf_stem = os.path.splitext(os.path.basename(pdf_fn))[0]
    out_fn = os.path.join(temp_dir, pdf_stem + '.msgpack')
    out_pdf_fn = pdf_fn
    try:
        cmd = [
            'java', '-cp', f'{modules_dir}/*',
            'com.lexpredict.textextraction.GetTextFromPDF', pdf_fn, out_fn,
            '-f', 'pages_msgpack'
        ]

        if pdf_password:
            cmd += ['-p', pdf_password]

        if make_corrected_pdf:
            out_pdf_fn = os.path.join(temp_dir, pdf_stem + '_corr.pdf')
            cmd += ['-corrected_output', out_pdf_fn]

            if render_coords_debug:
                cmd.append('-render_char_rects')

        proc: CompletedProcess = subprocess.run(
            cmd,
            check=False,
            timeout=timeout_sec,
            universal_newlines=True,
            stderr=PIPE,
            stdout=PIPE)
        # Both helpers inspect the finished process; presumably they raise
        # when the run failed or PDFBox reported errors.
        raise_from_process(
            log,
            proc,
            process_title=lambda: f'Extract text and structure from {pdf_fn}')

        raise_from_pdfbox_error_messages(proc)

        with open(out_fn, 'rb') as packed_f:
            # see object structure in com.lexpredict.textextraction.dto.PDFPlainText
            pdfbox_res: Dict[str, Any] = msgpack.unpack(packed_f, raw=False)

        # Remove Null characters because of incompatibility with PostgreSQL
        text = pdfbox_res['text'].replace("\x00", "")
        if not text:
            # Nothing was extracted: yield an empty structure and no angles.
            empty_coordinates = PDFCoordinates(
                char_bboxes=pdfbox_res['charBBoxes'])
            empty_struct = PlainTextStructure(
                title='',
                # FastText returns English for empty strings
                language=language or 'en',
                pages=[],
                sentences=[],
                paragraphs=[],
                sections=[])
            yield text, \
                  TextAndPDFCoordinates(text_structure=empty_struct,
                                        pdf_coordinates=empty_coordinates), \
                  out_pdf_fn, \
                  None

            return

        page_rotate_angles: List[float] = [
            page_info['deskewAngle'] for page_info in pdfbox_res['pages']
        ]

        # Pages are numbered from zero in the order the Java tool reports them.
        pages = [
            PlainTextPage(number=page_no,
                          start=page_info['location'][0],
                          end=page_info['location'][1],
                          bbox=page_info['bbox'])
            for page_no, page_info in enumerate(pdfbox_res['pages'])
        ]

        sentence_spans = get_sentence_span_list(text)

        detector = get_lang_detector()

        def language_of(fragment):
            # An explicitly requested language wins over detection.
            return language or detector.predict_lang(fragment)

        sentences = [
            PlainTextSentence(start=sent_start,
                              end=sent_end,
                              language=language_of(sent_text))
            for sent_start, sent_end, sent_text in sentence_spans
        ]

        # There was a try-except in Contraxsuite catching some lexnlp exception.
        # Not putting it here because it should be solved on lexnlp side.
        paragraphs = [
            PlainTextParagraph(start=par_start,
                               end=par_end,
                               language=language_of(par_text))
            for par_text, par_start, par_end
            in get_paragraphs(text, return_spans=True)
        ]

        sections = [
            PlainTextSection(title=section.title,
                             start=section.start,
                             end=section.end,
                             title_start=section.title_start,
                             title_end=section.title_end,
                             level=section.level,
                             abs_level=section.abs_level)
            for section in get_document_sections_with_titles(
                text, sentence_list=sentence_spans)
        ]

        # The first detected title is used; None when there is none.
        try:
            title = next(get_titles(text))
        except StopIteration:
            title = None

        text_struct = PlainTextStructure(title=title,
                                         language=language_of(text),
                                         pages=pages,
                                         sentences=sentences,
                                         paragraphs=paragraphs,
                                         sections=sections)

        pdf_coordinates = PDFCoordinates(char_bboxes=pdfbox_res['charBBoxes'])
        result = TextAndPDFCoordinates(text_structure=text_struct,
                                       pdf_coordinates=pdf_coordinates)
        yield text, result, out_pdf_fn, page_rotate_angles

    finally:
        # Always drop the temporary directory, including the msgpack output
        # and the corrected PDF if one was produced.
        shutil.rmtree(temp_dir, ignore_errors=True)
Ejemplo n.º 5
0
 def test_date_text(self):
     """A text holding only a timestamp string comes back as the first paragraph."""
     date_text = '2021-01-20T10:32:31.938706'
     found = [*get_paragraphs(text=date_text, return_spans=False)]
     self.assertEqual(date_text, found[0])