Example #1
0
 def test_multiple_keyterms(self):
     """With several overlapping keyterms, the ones present in the
     string match and the remainder stays as a plain token."""
     result = list(tokenize('Decreto-Barros',
                            {'Decreto', 'Decreto-Lei', '-Barro'}))
     expected = [Token('Decreto'), Token('-Barro'), Token('s')]
     self.assertEqual(result, expected)
Example #2
0
    def test_keyterm_in_begin(self):
        """When one keyterm is a prefix of another, the longer match is
        preferred both at the start and in the middle of the string."""
        keyterms = ('pre', 'pre-foo-suf')

        self.assertEqual([Token('pre-foo-suf')],
                         list(tokenize('pre-foo-suf', keyterms)))

        self.assertEqual([Token('d-'), Token('pre-foo-suf')],
                         list(tokenize('d-pre-foo-suf', keyterms)))
Example #3
0
 def test_shifted_keyterms(self):
     """Of two overlapping candidate keyterms, the one actually present
     in the string wins."""
     tokens = list(tokenize('the foo is bad',
                            (' ', 'the foo stay', 'foo is bad')))
     expected = [Token('the'), Token(' '), Token('foo is bad')]
     self.assertEqual(tokens, expected)
Example #4
0
    def observe(self, index, token, caught):
        """Claim *token* as a document number when it matches and no other
        observer caught it; a '.' or newline token finishes this observer.

        Returns True only when the token was claimed.
        """
        text = token.as_str()
        if not caught and re.match(DOCUMENT_NUMBER_REGEX, text):
            self._numbers[index] = token
            return True

        if token in (Token('.'), Token('\n')):
            self.finish()
        return False
Example #5
0
    def test_keyterm_in_end(self):
        """A keyterm that is a suffix of another keyterm (or of a word)
        must not split the longer match."""
        self.assertEqual([Token('pre-foo-suf')],
                         list(tokenize('pre-foo-suf', ('pre-foo-suf', 'suf'))))

        tokens = list(tokenize('n.º 2', (' ', 'n.º', '.º')))
        self.assertEqual(tokens, [Token('n.º'), Token(' '), Token('2')])

        self.assertEqual([Token('foo-bar')],
                         list(tokenize('foo-bar', ('foo-bar', 'bar tal'))))
Example #6
0
    def test_single(self):
        """A single Decreto-Lei reference is caught in several number
        formats (plain, with region suffix, with letter suffix)."""
        managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver})]

        cases = [
            ('Decreto-Lei nº 2/2013.', '2/2013'),
            ('Decreto-Lei nº 2/2013/A,', '2/2013/A'),
            ('Decreto-Lei nº 2-A/2013,', '2-A/2013'),
        ]
        for string, number in cases:
            reference = DocumentReference(number, Token('Decreto-Lei'))
            self._test(string, managers, [(reference, 4)])
Example #7
0
    def test_single(self):
        """Single EU-law references are caught after the keyterm.

        Fix: the manager previously mapped 'Decisão de Execução' to the
        result type ``EULawReference`` instead of an observer class; every
        other ObserverManager in this suite maps keyterm -> observer class,
        so it is corrected to ``EULawRefObserver``.
        """
        managers = [ObserverManager({'Diretiva': EULawRefObserver,
                                     'Decisão de Execução': EULawRefObserver})]

        self._test('Diretiva nº 2011/778/UE ', managers,
                   [(EULawReference('2011/778/UE', Token('Diretiva')), 4)])
        self._test('Diretiva nº 2000/29/CE,', managers,
                   [(EULawReference('2000/29/CE', Token('Diretiva')), 4)])
        self._test('Diretiva nº 2000/778/UE.', managers,
                   [(EULawReference('2000/778/UE', Token('Diretiva')), 4)])

        # A keyterm followed by a non-number must not yield a reference.
        self._test('Diretiva False.', managers, ())
Example #8
0
    def test_keyterm_subset_of_keyterm(self):
        """
        When one keyterm is a prefix of another, the longest match wins.
        """
        self.assertEqual([Token('Decreto'), Token('-Lei')],
                         list(tokenize('Decreto-Lei', {'Decreto'})))

        self.assertEqual([Token('Decreto-Lei')],
                         list(tokenize('Decreto-Lei',
                                       {'Decreto', 'Decreto-Lei'})))

        self.assertEqual([Token('Decreto'), Token('-Barro')],
                         list(tokenize('Decreto-Barro',
                                       {'Decreto', 'Decreto-Lei'})))
Example #9
0
    def test_many_separated(self):
        """Two references for distinct keyterms, separated either by a
        period or by a plain space."""
        managers = [ObserverManager({'foo': DocumentRefObserver,
                                     'bar': DocumentRefObserver})]

        for string, (pos_foo, pos_bar) in (
                ('foo 1/2000. bar 2/2000', (2, 7)),
                ('foo 1/2000 bar 2/2000', (2, 6))):
            expected = [
                (DocumentReference('1/2000', Token('foo')), pos_foo),
                (DocumentReference('2/2000', Token('bar')), pos_bar),
            ]
            self._test(string, managers, expected)
Example #10
0
def analyse(tokens):
    """Assemble a Document tree from a flat token stream.

    Tokens accumulate into paragraphs; a paragraph ends on a newline token
    or when an Anchor starts a new section.  Text delimited by '«' and '»'
    (seen at a paragraph boundary) is parsed by a secondary parser into a
    QuotationSection, attached to the main tree when the quote closes.

    Returns the root Document.
    """
    root = Document()
    root_parser = HierarchyParser(root)

    paragraph = Paragraph()
    block_mode = False  # True while inside a «...» quotation block
    for token in tokens:
        # skip empty placeholder tokens entirely
        if token.as_str() == '':
            continue
        # start of quote: switch to a dedicated parser for the quoted block
        if token == Token('«') and len(paragraph) == 0:
            block_mode = True
            block_parser = HierarchyParser(QuotationSection(), add_links=False)
        # end of quote: attach the quoted section back to the main tree.
        # NOTE(review): a '»' without a preceding '«' would hit an unbound
        # block_parser here — presumably the input guarantees balanced
        # quotes; confirm against the tokenizer/callers.
        elif token == Token('»') and len(paragraph) == 0:
            block_mode = False
            root_parser.add(block_parser.root)
            paragraph = Paragraph()
        # construct the paragraphs
        # paragraph can end by '\n' or by starting a new section.
        elif isinstance(token, Anchor) or token.string == '\n':
            # it is end of paragraph; complete it if it ends by a normal \n.
            if token.string == '\n':
                paragraph.append(token)

            # select current parser (quotation parser while inside a block)
            p = root_parser
            if block_mode:
                p = block_parser

            # add paragraph to the current parser if it is not empty
            if len(paragraph):
                p.add(paragraph)

            # start a new paragraph
            paragraph = Paragraph()
            if isinstance(token, Anchor):
                # create a new section
                section = p.new_section(token)

                # if the new section is inline, subsequent text goes into
                # an inline paragraph instead of a regular one
                if isinstance(section, InlineDocumentSection):
                    paragraph = InlineParagraph()
        else:
            paragraph.append(token)

    return root
Example #11
0
    def test_keyterm_in_word(self):
        """
        `262.º` with keyterm `.º` must split into 262 and .º.
        """
        keyterms = (' ', ',', '.º')

        self.assertEqual([Token('262'), Token('.º')],
                         list(tokenize('262.º', keyterms)))

        self.assertEqual([Token('262'), Token('.º'), Token('-A')],
                         list(tokenize('262.º-A', keyterms)))
Example #12
0
 def test_keyterm_not_found(self):
     """A keyterm absent from the string leaves it split on separators."""
     expected = [Token(part) for part in
                 ('this', ' ', 'is', ' ', 'the', ' ', 'foo', ' ', 'of')]
     self.assertEqual(
         list(tokenize('this is the foo of', (' ', 'the end'))),
         expected)
Example #13
0
    def test_with_document(self):
        """A number reference nests into an article reference, which in
        turn nests into the trailing document reference."""
        managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver}),
                    ObserverManager({'artigo': ArticleRefObserver}),
                    ObserverManager({'nº': NumberRefObserver,
                                     'n.os': NumberRefObserver})]

        doc = DocumentReference('2/2013', Token('Decreto-Lei'))
        art = ArticleReference('26º', doc)
        num = NumberReference('2', art)

        self._test('no nº 2 do artigo 26º do Decreto-Lei 2/2013,',
                   managers, [(doc, 16), (art, 10), (num, 4)])
Example #14
0
    def observe(self, index, token, caught):
        """Watch the stream for line identifiers until a parent is found.

        Uncaught tokens matching LINE_REGEX are recorded in self._numbers;
        a token matching NUMBER_REGEX or ARTICLE_NUMBER_REGEX marks the
        parent the collected lines belong to, after which collection stops.
        A '.' or newline token terminates the observation via finish().

        Returns True only when the token was claimed as a line number.
        """
        # End of sentence/line: close this observer without claiming.
        if token in (Token('.'), Token('\n')):
            self.finish()
            return False

        # A parent reference was already located: stop collecting.
        if self._parent:
            return False

        if not caught and re.match(LINE_REGEX, token.as_str()):
            self._numbers[index] = token
            return True
        elif re.match(NUMBER_REGEX, token.as_str()):
            # the enclosing number (e.g. "nº 4") owning the collected lines
            self._parent = index
        elif re.match(ARTICLE_NUMBER_REGEX, token.as_str()):
            # lines may also hang directly off an article number
            self._parent = index
        # never found such case in the corpus; kept for reference:
        # elif re.match(DOCUMENT_NUMBER_REGEX, token.as_str()):
        #    self._parent = index

        return False
Example #15
0
    def test_with_document(self):
        """An enumeration of articles all bound to the trailing document."""
        managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver}),
                    ObserverManager({'artigos': ArticleRefObserver})]

        doc = DocumentReference('2/2013', Token('Decreto-Lei'))
        articles = (('3º', 4), ('4º-A', 7), ('7º', 10), ('25º', 14))
        expected = [(ArticleReference(number, doc), position)
                    for number, position in articles]
        expected.append((doc, 20))

        self._test('Os artigos 3º, 4º-A, 7º e 25º do Decreto-Lei 2/2013,',
                   managers, expected)
Example #16
0
    def test_with_document(self):
        """Full chain: lines -> number -> article -> document."""
        managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver}),
                    ObserverManager({'artigo': ArticleRefObserver}),
                    ObserverManager({'nº': NumberRefObserver}),
                    ObserverManager({'alíneas': LineRefObserver})]

        document = DocumentReference('2/2013', Token('Decreto-Lei'))
        article = ArticleReference('26º', document)
        number = NumberReference('4', article)

        expected = [(LineReference('f)', number), 4),
                    (LineReference('g)', number), 8),
                    (number, 14),
                    (article, 20),
                    (document, 28)]
        self._test('nas alíneas f) e g) do nº 4 do artigo 26º '
                   'do Decreto-Lei nº 2/2013', managers, expected)
Example #17
0
    def test_tokenize(self):
        """Basic splitting on a separator and on a multi-word keyterm."""
        tokens = list(tokenize('the end', ' '))
        self.assertEqual(tokens, [Token('the'), Token(' '), Token('end')])

        tokens = list(tokenize('the end is', (' ', 'the end')))
        self.assertEqual(tokens,
                         [Token('the end'), Token(' '), Token('is')])
Example #18
0
    def test_similar_keyterms(self):
        """Keyterms sharing words with the text must not match partially."""
        expected = [Token(part) for part in
                    ('this', ' ', 'is', ' ', 'the', ' ', 'foo', ' ', 'of')]

        self.assertEqual(
            list(tokenize('this is the foo of', (' ', 'the end', 'the bar'))),
            expected)

        self.assertEqual(
            list(tokenize('this is the foo of',
                          (' ', 'the foo is', 'foo is bad'))),
            expected)
Example #19
0
    def test_many(self):
        """Enumerations under 'n.os' yield one reference per number."""
        managers = [ObserverManager({'Decretos-Leis': DocumentRefObserver})]

        def ref(number):
            # every reference in these cases shares the same keyterm token
            return DocumentReference(number, Token('Decretos-Leis'))

        self._test('Decretos-Leis n.os 1/2006, 2/2006, e 3/2006', managers,
                   [(ref('1/2006'), 4), (ref('2/2006'), 7),
                    (ref('3/2006'), 12)])

        self._test('Decretos-Leis n.os 1/2006, e 2/2006', managers,
                   [(ref('1/2006'), 4), (ref('2/2006'), 9)])

        self._test('Decretos-Leis n.os 64/2006, de 21 de março, '
                   '88/2006, de 23 de maio, e '
                   '196/2006, de 10 de outubro', managers,
                   [(ref('64/2006'), 4), (ref('88/2006'), 16),
                    (ref('196/2006'), 30)])
Example #20
0
    def test_fail(self):
        """A line that is not a clause stays as plain tokens."""
        managers = [ObserverManager({'\n': ClauseObserver})]
        expected = [Token('\n'), Token('Imagina'), Token('\n')]

        self.assertEqual(expected,
                         parser.parse('\nImagina\n', managers, {'\n'}))
Example #21
0
def tokenize(string, keyterms=()):
    """Tokenize *string* and wrap each raw token in a Token instance."""
    return list(map(Token, _tokenizer.tokenize(string, keyterms)))
Example #22
0
    def test_old_notation(self):
        """Old-style two-digit-year numbers like 2-A/90 are recognised."""
        managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver})]
        expected = [(DocumentReference('2-A/90', Token('Decreto-Lei')), 4)]

        self._test('Decreto-Lei nº 2-A/90,', managers, expected)
Example #23
0
 def replace_in(self, result):
     """Splice the observed match into *result* in place.

     Blanks the token two positions after the observed index and turns
     the token immediately after it into an empty (unnumbered) Annex
     anchor.  Assumes self._index points at the triggering keyterm —
     presumably the newline preceding the annex line; TODO confirm
     against the observer that sets _index.
     """
     result[self._index + 2] = Token('')
     result[self._index + 1] = Annex('')
Example #24
0
 def _replace_in(self, result):
     """Replace the token at the matched index with an EULawReference
     built from the captured number and the keyterm string that
     triggered this observer."""
     result[self._index] = EULawReference(self._number, Token(self._string))
Example #25
0
    def test_simple(self):
        """A roman-numeral line becomes a Clause anchor."""
        managers = [ObserverManager({'\n': ClauseObserver})]
        result = parser.parse('\nIV\n', managers, {'\n'})

        self.assertEqual(result, [Token('\n'), Clause('IV'), Token('')])
Example #26
0
 def test_real(self):
     """A realistic legal sentence tokenizes into the expected stream."""
     keyterms = (' ', '.', ',', 'Decreto-Lei', 'Decretos-Leis', 'n.º',
                 '.º', 'n.os')
     parts = ('no', ' ', 'n.º', ' ', '2', ' ', 'do', ' ', 'artigo', ' ',
              '26', '.º', ' ', 'do', ' ', 'Decreto-Lei', ' ', '2/2013',
              ',')
     self.assertEqual(
         list(tokenize('no n.º 2 do artigo 26.º do Decreto-Lei 2/2013,',
                       keyterms)),
         [Token(part) for part in parts])
Example #27
0
 def replace_in(self, result):
     """Collapse the observed anchor sequence in *result* in place.

     Blanks every token from index+2 through index+take_up_to (inclusive)
     and replaces the token at index+1 with a new anchor of
     self.anchor_klass carrying the captured number.
     """
     # invariant: the captured number sits exactly number_at tokens
     # after the observed index
     assert (self._number_index == self._index + self.number_at)
     for i in reversed(range(2, self.take_up_to + 1)):
         result[self._index + i] = Token('')  # '\n'
     result[self._index + 1] = self.anchor_klass(self._number)
Example #28
0
    def test_simple(self):
        """An unnumbered 'Anexo' line becomes an empty Annex anchor."""
        # document 455149 contains this example
        managers = parser.common_managers
        result = parser.parse('\nAnexo\n', managers, {'\n'})

        self.assertEqual(result, [Token('\n'), Annex(''), Token('')])
Example #29
0
    def test_fails(self):
        """A plain title line must not be turned into an annex."""
        managers = [ObserverManager({'\n': UnnumberedAnnexObserver})]
        result = parser.parse('\nTítulo\n', managers, {'\n'})

        self.assertEqual(result,
                         [Token('\n'), Token('Título'), Token('\n')])
Example #30
0
 def test_basic(self):
     """Repeated words split correctly on the separator keyterm."""
     tokens = list(tokenize('the the', ' '))
     self.assertEqual(tokens, [Token('the'), Token(' '), Token('the')])