Ejemplo n.º 1
0
 def init_parser():
     """Install a German-configured line processor on ``CopyrightDeParser``.

     The processor splits on newlines and on '.', ';', '!' and '?', and is
     given the abbreviation set from ``DeLanguageTokens`` with
     ``abbr_ignore_case`` switched on.
     """
     de_params = LineSplitParams()
     de_params.line_breaks = {'\n', '.', ';', '!', '?'}
     de_params.abbreviations = DeLanguageTokens.abbreviations
     de_params.abbr_ignore_case = True

     # Stored on the class itself, so it is shared by every user of the class.
     processor = LineProcessor(line_split_params=de_params)
     CopyrightDeParser.line_processor = processor
Ejemplo n.º 2
0
 def test_line_processor_phrases_de(self):
     """Split a German legal paragraph on newlines, '.' and ';'."""
     paragraph = """
     (2) Vermögenswerte im Sinne dieses Gesetzes sind bebaute und unbebaute Grundstücke sowie rechtlich selbständige Gebäude und Baulichkeiten (im folgenden Grundstücke und Gebäude genannt), Nutzungsrechte und dingliche Rechte an Grundstücken oder Gebäuden, bewegliche Sachen sowie gewerbliche Schutzrechte, Urheberrechte und verwandte Schutzrechte. Vermögenswerte im Sinne dieses Gesetzes sind auch Kontoguthaben und sonstige auf Geldzahlungen gerichtete Forderungen sowie Eigentum/Beteiligungen an Unternehmen oder an Betriebsstätten/Zweigniederlassungen von Unternehmen mit Sitz außerhalb der Deutschen Demokratischen Republik.
     """
     split_params = LineSplitParams()
     split_params.line_breaks = {'\n', '.', ';'}
     processor = LineProcessor(line_split_params=split_params)

     pieces = list(processor.split_text_on_line_with_endings(paragraph))
     # The paragraph holds two sentences; plus one for an empty line.
     assert len(pieces) == 3
Ejemplo n.º 3
0
 def test_de_linebreaks(self):
     """A newline inside the text must not split it when only
     '.', ';', '!' and '?' are configured as line breaks."""
     params = LineSplitParams()
     params.line_breaks = {'.', ';', '!', '?'}
     params.abbreviations = {
         'nr.', 'abs.', 'no.', 'act.', 'inc.', 'p.'
     }
     params.abbr_ignore_case = True

     processor = LineProcessor(line_split_params=params)
     sample = 'Nach der Allgemeine\nGebührenverordnung'
     sentences = [*processor.split_text_on_line_with_endings(sample)]

     self.assertEqual(1, len(sentences))
Ejemplo n.º 4
0
    def test_line_processor_phrases(self):
        text = """
Once upon a midnight dreary

While I pounded, weak and weary. Over many a quaint and curious volume of forgotten lore,
While I nodded, nearly napping; suddenly there came a tapping,
As of some one gently rapping, rapping at my chamber door."""
        ptrs = LineSplitParams()
        ptrs.line_breaks = {'\n', '.', ';'}
        proc = LineProcessor(line_split_params=ptrs)
        lines = [line for line in proc.split_text_on_line_with_endings(text)]
        assert len(lines) == 6
Ejemplo n.º 5
0
def make_es_definitions_parser():
    """Build a ``UniversalDefinitionsParser`` set up for Spanish text.

    Returns:
        A parser constructed from three definition-matching functions and
        line-split parameters that break on newlines, '.', ';', '!' and '?'
        and use the ``EsLanguageTokens`` abbreviations with
        ``abbr_ignore_case`` enabled.
    """
    es_params = LineSplitParams()
    es_params.line_breaks = {'\n', '.', ';', '!', '?'}
    es_params.abbreviations = EsLanguageTokens.abbreviations
    es_params.abbr_ignore_case = True

    matchers = [
        CommonDefinitionPatterns.match_es_def_by_semicolon,
        SpanishParsingMethods.match_es_def_by_hereafter,
        SpanishParsingMethods.match_es_def_by_reffered,
    ]
    return UniversalDefinitionsParser(matchers, es_params)
Ejemplo n.º 6
0
def make_de_definitions_parser():
    """Build a ``UniversalDefinitionsParser`` set up for German text.

    Returns:
        A parser constructed from three definition-matching functions and
        line-split parameters that break on newlines, '.', ';', '!' and '?'
        and use the ``DeLanguageTokens`` abbreviations with
        ``abbr_ignore_case`` enabled. Its ``prohibited_words`` is the set of
        German articles and conjunctions from ``DeLanguageTokens``.
    """
    de_params = LineSplitParams()
    de_params.line_breaks = {'\n', '.', ';', '!', '?'}
    de_params.abbreviations = DeLanguageTokens.abbreviations
    de_params.abbr_ignore_case = True

    matchers = [
        CommonDefinitionPatterns.match_es_def_by_semicolon,
        DeutscheParsingMethods.match_ist_jeder,
        DeutscheParsingMethods.match_im_sinne,
    ]

    de_parser = UniversalDefinitionsParser(matchers, de_params)
    stop_words = DeLanguageTokens.articles + DeLanguageTokens.conjunctions
    de_parser.prohibited_words = set(stop_words)
    return de_parser
Ejemplo n.º 7
0
    def test_line_processor_phrases_abbr(self):
        """Registering 'nr.' as an abbreviation removes one split point."""
        sample = 'Articolul saisprezece (16) nr. 2. Textul:'
        split_params = LineSplitParams()
        split_params.line_breaks = {'\n', '.', ';'}

        # No abbreviations configured: three pieces are expected.
        processor = LineProcessor(line_split_params=split_params)
        pieces = list(processor.split_text_on_line_with_endings(sample))
        assert len(pieces) == 3

        # With 'nr.' listed as an abbreviation, two pieces are expected.
        split_params.abbreviations = {'nr.', 'abs.'}
        split_params.abbr_ignore_case = True
        processor = LineProcessor(line_split_params=split_params)
        pieces = list(processor.split_text_on_line_with_endings(sample))
        assert len(pieces) == 2
Ejemplo n.º 8
0
    def test_de_abbrs(self):
        """'a.D.' is kept whole as an abbreviation; 'A.d.' is expected to split."""
        params = LineSplitParams()
        params.line_breaks = {'.', ';', '!', '?'}
        params.abbreviations = {'nr.', 'abs.', 'no.', 'act.', 'a.D.'}
        params.abbr_ignore_case = True
        processor = LineProcessor(line_split_params=params)

        # Exact-case match of the listed abbreviation: a single sentence.
        exact = '1000 a.D. und drang'
        sentences = [*processor.split_text_on_line_with_endings(exact)]
        self.assertEqual(1, len(sentences))

        # Different casing: the test expects more than one sentence here.
        other_case = '1000 A.d. und drang'
        sentences = [*processor.split_text_on_line_with_endings(other_case)]
        self.assertGreater(len(sentences), 1)
Ejemplo n.º 9
0
def make_es_copyrights_parser():
    """Build a ``CopyrightParser`` set up for Spanish text.

    Returns:
        A parser constructed from the two matching methods of a
        ``SpanishCopyrightParsingMethods`` instance and line-split parameters
        that break on newlines, '.', ';', '!' and '?' and use the
        ``EsLanguageTokens`` abbreviations with ``abbr_ignore_case`` enabled.
        Its ``prohibited_words`` is the set of Spanish articles and
        conjunctions from ``EsLanguageTokens``.
    """
    es_params = LineSplitParams()
    es_params.line_breaks = {'\n', '.', ';', '!', '?'}
    es_params.abbreviations = EsLanguageTokens.abbreviations
    es_params.abbr_ignore_case = True

    matcher_source = SpanishCopyrightParsingMethods()
    matchers = [
        matcher_source.match_word_c_years,
        matcher_source.match_c_years_word,
    ]

    es_parser = CopyrightParser(matchers, es_params)
    stop_words = EsLanguageTokens.articles + EsLanguageTokens.conjunctions
    es_parser.prohibited_words = set(stop_words)
    return es_parser
Ejemplo n.º 10
0
    def __init__(self, gesetze_df: pd.DataFrame, verordnungen_df: pd.DataFrame,
                 concept_df: pd.DataFrame):
        """Create the three dataframe-backed entity parsers.

        Args:
            gesetze_df: data for ``self.gesetze_parser``; the columns
                'Kurztitel', 'Titel' and 'Abkürzung' are passed as the
                columns to parse.
            verordnungen_df: data for ``self.verordnungen_parser``; same
                column configuration as ``gesetze_df``.
            concept_df: data for ``self.concept_parser``; column 'b' is
                passed as the column to parse and column 'a' is mapped to
                'External Reference Type'.
        """
        self.locale = ''

        # One line processor is shared by all three parsers. Newlines are
        # deliberately not in the break set here.
        split_params = LineSplitParams()
        split_params.line_breaks = {'.', ';', '!', '?'}
        split_params.abbreviations = DeLanguageTokens.abbreviations
        split_params.abbr_ignore_case = True
        proc = LineProcessor(line_split_params=split_params)

        law_parse_columns = ('Kurztitel', 'Titel', 'Abkürzung')
        law_result_columns = {'Titel': 'External Reference Normalized'}
        law_entity = {
            'External Reference Type': 'Laws and Rules',
            'External Reference Source': 'BaFin',
            'External Reference Issuing Country': 'Germany'
        }

        # Each parser gets its own dict. The previous code popped a key from
        # the dict already handed to the first two parsers; if
        # DataframeEntityParser keeps a reference to it (not visible here),
        # that would strip 'External Reference Type' from them as well.
        self.gesetze_parser = DataframeEntityParser(
            gesetze_df,
            law_parse_columns,
            result_columns=law_result_columns,
            preformed_entity=dict(law_entity),
            line_processor=proc)

        self.verordnungen_parser = DataframeEntityParser(
            verordnungen_df,
            law_parse_columns,
            result_columns=law_result_columns,
            preformed_entity=dict(law_entity),
            line_processor=proc)

        concept_parse_columns = ('b', )
        concept_result_columns = {
            'b': 'External Reference Normalized',
            'a': 'External Reference Type'
        }
        # For concepts the reference type is taken from column 'a', so the
        # fixed 'External Reference Type' entry is left out.
        concept_entity = {
            key: value
            for key, value in law_entity.items()
            if key != 'External Reference Type'
        }

        self.concept_parser = DataframeEntityParser(
            concept_df,
            concept_parse_columns,
            result_columns=concept_result_columns,
            preformed_entity=concept_entity,
            line_processor=proc)