def test_estimate_dense_text(self):
        """Line-break-probability estimate separates the two PDF parse variants."""
        # Default-parsed PDF text: should be flagged as full of extra breaks.
        raw = load_resource_document(
            "lexnlp/utils/parsing/pdf_malformat_parsed_default.txt", 'utf-8')
        quality_estimator = ParsedTextQualityEstimator()
        score = quality_estimator.estimate_text(raw)
        self.assertGreater(score.extra_line_breaks_prob, 50)

        # "Stripper"-parsed variant of the same PDF: should look much cleaner.
        raw = load_resource_document(
            'lexnlp/utils/parsing/pdf_malformat_parsed_stripper.txt', 'utf-8')
        score = quality_estimator.estimate_text(raw)
        self.assertLess(score.extra_line_breaks_prob, 30)
Example #2
0
 def test_definitions_in_sentences_text(self):
     """Definition count on the sentence-level CSV sample stays within (16, 25)."""
     doc = load_resource_document(
         'lexnlp/extract/en/tests/test_definitions/test_definition_in_sentences.csv',
         'utf-8')
     found = list(get_definition_annotations(doc))
     self.assertGreater(len(found), 16)
     self.assertLess(len(found), 25)
 def test_estimate_text_abusing_headers(self):
     """Pre-processed header-heavy text should not look break-polluted."""
     doc = pre_process_document(load_resource_document(
         'lexnlp/utils/parsing/text_abusing_headers.txt', 'utf-8'))
     quality = ParsedTextQualityEstimator().estimate_text(doc)
     self.assertLess(quality.extra_line_breaks_prob, 50)
 def test_parse_large_text(self):
     """The ES regulations sample yields many matches and renders to HTML."""
     doc = load_resource_document(
         'lexnlp/extract/es/sample_es_regulations.txt', 'utf-8')
     matches = parser.parse(doc)
     self.assertGreater(len(matches), 100)
     # Render and persist the annotated document for manual inspection.
     save_test_document('sample_es_regulations.html',
                        annotate_text(doc, matches))
    def test_definitions_sample_doc(self):
        """Parse two EN definition fixtures and write annotated HTML for each."""
        doc = load_resource_document(
            'lexnlp/extract/en/definitions/en_definitions_sample_doc.txt',
            'utf-8')
        found = self.parse(doc)
        self.assertGreater(len(found), 2)  # threshold was 10 at some point
        self.annotate_document(doc, found,
                               'output/en_definitions_sample_doc.html')

        doc = load_resource_document(
            'lexnlp/extract/en/definitions/pure_definitions.txt', 'utf-8')
        # Fixture is blank-line separated; expect more definitions than paragraphs.
        paragraph_count = doc.count('\n\n') + 1
        found = self.parse(doc)
        self.assertGreater(len(found), paragraph_count)
        self.annotate_document(doc, found,
                               'output/pure_definitions.html')
 def test_parse_de_definitions_simple(self):
     """ES EULA fixture produces more than four definitions; HTML is rendered."""
     # NOTE(review): method name says "de" but both the parser factory and the
     # fixture are Spanish ("es") — likely a copy-pasted name; confirm before
     # renaming (the name is part of the discovered-test interface).
     es_parser = make_es_definitions_parser()
     doc = load_resource_document('lexnlp/extract/es/definitions/eula.txt',
                                  'utf-8')
     found = es_parser.parse(doc)
     self.assertGreater(len(found), 4)
     annotate_definitions_text(doc, found, 'output/es_definitions_01.html')
Example #7
0
 def test_load_courts_with_toponims(self):
     """Two courts in the DE sample; the first is federal-level jurisdiction."""
     # NOTE(review): "toponims" in the name is a typo for "toponyms"; left
     # untouched since the method name is the test's public identifier.
     doc = load_resource_document(
         'lexnlp/extract/de/sample_de_courts02.txt', 'utf-8')
     courts = list(get_courts(doc))
     self.assertEqual(2, len(courts))
     self.assertEqual(
         "Federal",
         courts[0]["tags"]["Extracted Entity Court Jurisdiction"])
 def test_hit_or_miss_samples(self):
     """At least one definition is found in the hit-or-miss fixture."""
     doc = load_resource_document(
         'lexnlp/extract/en/definitions/definitions_hit_or_miss.txt',
         'utf-8')
     found = self.parse(doc)
     self.assertGreater(len(found), 0)
     self.annotate_document(doc, found,
                            'output/definitions_hit_or_miss.html')
Example #9
0
 def test_legacy_parse_court_annotations(self):
     """Custom-config EN court parsing yields three 'court' annotations."""
     court_config = self.load_en_courts()
     doc = load_resource_document(
         'lexnlp/extract/en/courts/courts_sample_01.txt', 'utf-8')
     annotations = list(
         get_court_annotations_custom('en', doc, court_config))
     self.assertEqual(3, len(annotations))
     self.assertEqual('court', annotations[0].record_type)
 def process_big_document_with_false_positives(self):
     # Parses a large fixture known to contain false-positive triggers and
     # writes an annotated HTML report for manual review.
     # NOTE(review): no "test_" prefix, so unittest discovery skips this
     # method — presumably disabled on purpose; confirm before renaming.
     text = load_resource_document(
         'lexnlp/extract/en/definitions/definitions_fp_collections.txt',
         'utf-8')
     definitions = self.parse(text)
     self.assertGreater(len(definitions), 0)
     self.annotate_document(text, definitions,
                            'output/definitions_fp_collections.html')
Example #11
0
    def test_parse_de_definitions_simple(self):
        """DE definitions: the first result's coords must bracket its name."""
        doc = load_resource_document('lexnlp/extract/de/sample_de_definitions01.txt', 'utf-8')
        found = get_definition_list(doc)
        self.assertGreater(len(found), 5)

        first = found[0]
        span_start, span_end = first.coords[0], first.coords[1]
        self.assertTrue("Diensteanbieter" in first.name)
        # The text slice addressed by coords should contain the defined term.
        self.assertTrue(first.name in doc[span_start:span_end])
        annotate_definitions_text(doc, found, 'output/de_definitions_01.html')
Example #12
0
    def test_check_match_attrs(self):
        """Each parsed court match has a consistent span and a non-empty type tag."""
        en_parser = self.make_en_parser()
        doc = load_resource_document(
            'lexnlp/extract/en/courts/courts_sample_01.txt', 'utf-8')
        matches = en_parser.parse(doc)
        self.assertEqual(4, len(matches))

        for record in matches:
            as_dict = record.to_dictionary()
            self.assertGreater(as_dict["attrs"]["end"], as_dict["attrs"]["start"])
            self.assertGreater(as_dict["attrs"]["end"], 0)
            self.assertGreater(len(as_dict["tags"]["Extracted Entity Type"]), 0)
            # Slicing must not raise; the sliced value itself is not inspected.
            _ = doc[as_dict["attrs"]["start"]:as_dict["attrs"]["end"]]
Example #13
0
 def test_definition_fixed(self):
     """Annotation count for the fixed CSV sample stays within (12, 25).

     Every extracted definition name must remain non-empty after common
     wrapper punctuation and whitespace are stripped.
     """
     text = load_resource_document(
         'lexnlp/extract/en/tests/test_definitions/test_definition_fixed.csv',
         'utf-8')
     defs = list(get_definition_annotations(text))
     self.assertGreater(len(defs), 12)
     self.assertLess(len(defs), 25)
     for df in defs:
         # Fix: the original stripped and asserted on df.name twice with
         # byte-identical code (copy-paste duplication); one check suffices.
         txt = df.name.strip('''"[]'{}.\t ''')
         self.assertGreater(len(txt), 0)
 def test_definition_fixed(self):
     """Dict-based parse of the fixed CSV: count in range, both tags non-empty."""
     doc = load_resource_document(
         'lexnlp/extract/en/tests/test_definitions/test_definition_fixed.csv',
         'utf-8')
     found = self.parse(doc)
     self.assertGreater(len(found), 12)
     self.assertLess(len(found), 25)
     wrapper_chars = '''"[]'{}.\t '''
     for record in found:
         # Both the definition name and the raw text tag must survive stripping.
         for tag in ("Extracted Entity Definition Name",
                     "Extracted Entity Text"):
             self.assertGreater(
                 len(record["tags"][tag].strip(wrapper_chars)), 0)
Example #15
0
    def test_compare_to_legacy_parser(self):
        """New parser finds four courts where the legacy function finds three.

        Wall-clock timings are taken around both parses but deliberately
        discarded — they exist only for ad-hoc profiling under a debugger.
        """
        en_parser = self.make_en_parser()
        text = load_resource_document(
            'lexnlp/extract/en/courts/courts_sample_01.txt', 'utf-8')

        start = time.time()
        ret_n = en_parser.parse(text)
        _ = (time.time() - start)
        self.assertEqual(4, len(ret_n))

        start = time.time()
        # Fix: list(...) instead of a pass-through comprehension (ruff PERF402).
        ret_l = list(self.parse_courts_legacy_function(text))
        __ = (time.time() - start)
        self.assertEqual(3, len(ret_l))
 def test_overlapping_defs(self):
     """The deliberately messy fixture still yields more than 12 definitions."""
     doc = load_resource_document(
         'lexnlp/extract/en/tests/test_definitions/bad_def.txt', 'utf-8')
     found = list(get_definitions(doc))
     self.assertGreater(len(found), 12)
 def test_estimate_dense_text(self):
     """Correcting line breaks in the malformed PDF text must shrink it."""
     original = load_resource_document(
         'lexnlp/utils/parsing/pdf_malformat_parsed_default.txt', 'utf-8')
     corrected = ParsedTextCorrector().correct_line_breaks(original)
     self.assertLess(len(corrected), len(original))
 def test_long_doc(self):
     """Two court citations are found, each tagged with the requested locale."""
     doc = load_resource_document(
         'lexnlp/extract/de/sample_de_court_citations01.txt', 'utf-8')
     citations = get_court_citation_list(doc, "xz")
     self.assertEqual(2, len(citations))
     self.assertEqual("xz", citations[0].locale)
Example #19
0
 def test_load_courts(self):
     """Four courts in the DE sample, each carrying the passed-in locale."""
     doc = load_resource_document(
         'lexnlp/extract/de/sample_de_courts01.txt', 'utf-8')
     courts = get_court_list(doc, "y")
     self.assertEqual(4, len(courts))
     self.assertEqual("y", courts[0].locale)