Marker("label") + Suppress(":") +
    atomic.part +
    Suppress("-") +
    (atomic.section | atomic.appendix) +
    ZeroOrMore(Suppress("-") + _label_part) +
    Suppress("]")
).setParseAction(tokenize_override_ps)

def _subject_group_to_context(match):
    """Build the Context token for a parsed subject-group marker.

    The text captured under ``subgroup`` is handed to ``subjgrp_label``
    together with an empty list, and the result is prefixed with
    ``Subjgrp:``. The ``certain`` result (presumably set by
    ``context_certainty`` -- confirm there) is coerced to a bool.
    """
    label = 'Subjgrp:' + subjgrp_label(match.subgroup, [])
    return tokens.Context([None, label], bool(match.certain))


# Looks like: [subject-group(Some text Goes Here)]
subject_group = (
    context_certainty
    + Suppress("[subject-group")
    + QuotedString(quoteChar='(', endQuoteChar=')').setResultsName("subgroup")
    + Suppress("]")
)
subject_group.setParseAction(_subject_group_to_context)

def _quoted_phrase_to_label(match):
    """Return ``p`` followed by ``hash_for_paragraph`` of the quoted text."""
    return "p{}".format(hash_for_paragraph(match[0]))


# Phrases like '“Nonimmigrant visa”' become 'p12345678'
_double_quote_label = QuotedString(quoteChar=u'“', endQuoteChar=u'”')
_double_quote_label.setParseAction(_quoted_phrase_to_label)
def _definition_to_paragraph(match):
    """Wrap the label captured under ``paragraph`` in a Paragraph token."""
    return tokens.Paragraph(paragraphs=[match.paragraph])


# Phrases like "definition for the term “Nonimmigrant visa”" become a
# paragraph token with the appropriate paragraph label set
definition = (
    Marker("definition")
    + (Marker("of") | Marker("for"))
    + Optional(Marker("the") + Marker("term"))
    # Copy so naming the result does not touch the shared label parser.
    + _double_quote_label.copy().setResultsName("paragraph")
)
definition.setParseAction(_definition_to_paragraph)

#   grammar which captures all of these possibilities
    Marker("label") + Suppress(":") +
    atomic.part +
    Suppress("-") +
    (atomic.section | atomic.appendix) +
    ZeroOrMore(Suppress("-") + _label_part) +
    Suppress("]")
).setParseAction(tokenize_override_ps)

def _subject_group_to_context(match):
    """Build the Context token for a parsed subject-group marker.

    The text captured under ``subgroup`` is handed to ``subjgrp_label``
    together with an empty list, and the result is prefixed with
    ``Subjgrp:``. The ``certain`` result (presumably set by
    ``context_certainty`` -- confirm there) is coerced to a bool.
    """
    label = 'Subjgrp:' + subjgrp_label(match.subgroup, [])
    return tokens.Context([None, label], bool(match.certain))


# Looks like: [subject-group(Some text Goes Here)]
subject_group = (
    context_certainty
    + Suppress("[subject-group")
    + QuotedString(quoteChar='(', endQuoteChar=')').setResultsName("subgroup")
    + Suppress("]")
)
subject_group.setParseAction(_subject_group_to_context)

def _quoted_phrase_to_label(match):
    """Return ``p`` followed by ``hash_for_paragraph`` of the quoted text."""
    return "p{0}".format(hash_for_paragraph(match[0]))


# Phrases like '“Nonimmigrant visa”' become 'p12345678'
_double_quote_label = QuotedString(quoteChar=u'“', endQuoteChar=u'”')
_double_quote_label.setParseAction(_quoted_phrase_to_label)
def _definition_to_paragraph(match):
    """Build a Paragraph token (via ``make``) from the ``paragraph`` label."""
    return tokens.Paragraph.make(paragraphs=[match.paragraph])


# Phrases like "definition for the term “Nonimmigrant visa”" become a
# paragraph token with the appropriate paragraph label set
definition = (
    Marker("definition")
    + (Marker("of") | Marker("for"))
    + Optional(Marker("the") + Marker("term"))
    # Copy so naming the result does not touch the shared label parser.
    + _double_quote_label.copy().setResultsName("paragraph")
)
definition.setParseAction(_definition_to_paragraph)

#   grammar which captures all of these possibilities
def test_subjgrp_label(text, existing, expected):
    """Check that subjgrp_label(text, existing) yields the expected label."""
    actual = reg_text.subjgrp_label(text, existing)
    assert actual == expected
def test_subjgrp_label(text, existing, expected):
    """Check that subjgrp_label(text, existing) yields the expected label."""
    actual = reg_text.subjgrp_label(text, existing)
    assert actual == expected
Beispiel #5
0
    def test_subjgrp_label(self):
        """Check subjgrp_label against a table of expected labels.

        Each case is (title, list passed as the second argument, expected
        label); cases run in order and the first mismatch fails the test.
        """
        ownership = 'Change of Ownership'
        cases = [
            # Single words:
            ('Penalties', [], 'Pe'),
            ('Penalties', ['Pe'], 'Pe.'),
            ('Penalties', ['Pe', 'Pe.'], 'Pen'),
            ('Penalties', ['Pe', 'Pe.', 'Pen'], 'Pen.'),
            ('Pe', ['Pe', 'Pe.'], 'Pe-a'),
            ('Pe', ['Pe', 'Pe.', 'Pe-a'], 'Pe.-a'),
            ('Pe', ['Pe', 'Pe.', 'Pe-a', 'Pe.-a'], 'Pe-b'),

            # Multiple words:
            (ownership, [], 'CoO'),
            (ownership, ['CoO'], 'C.o.O.'),
            (ownership, ['CoO', 'C.o.O.'], 'C_o_O'),
            # NOTE(review): 'C-o-O' appears only in this case's list.
            (ownership, ['CoO', 'C.o.O.', 'C-o-O', 'C_o_O'], 'ChofOw'),
            (ownership, ['CoO', 'C.o.O.', 'C_o_O', 'ChofOw'], 'Ch.of.Ow.'),
            (ownership,
             ['CoO', 'C.o.O.', 'C_o_O', 'ChofOw', 'Ch.of.Ow.'],
             'Ch_of_Ow'),
            ('C o O', ['CoO', 'C.o.O.', 'C_o_O'], 'CoO-a'),
        ]
        for title, taken, expected in cases:
            self.assertEqual(expected, reg_text.subjgrp_label(title, taken))