Ejemplo n.º 1
0
 def test_multiple_keyterms(self):
     """Longest applicable keyterm wins; leftover characters become
     their own token ('s' after '-Barro')."""
     result = list(
         tokenize('Decreto-Barros', {'Decreto', 'Decreto-Lei', '-Barro'}))
     self.assertEqual(
         result, [Token('Decreto'), Token('-Barro'), Token('s')])
Ejemplo n.º 2
0
    def test_keyterm_in_begin(self):
        """A keyterm at the start of the string is emitted whole, even
        when a shorter keyterm is its prefix."""
        keyterms = ('pre', 'pre-foo-suf')

        result = list(tokenize('pre-foo-suf', keyterms))
        self.assertEqual(result, [Token('pre-foo-suf')])

        result = list(tokenize('d-pre-foo-suf', keyterms))
        self.assertEqual(result, [Token('d-'), Token('pre-foo-suf')])
Ejemplo n.º 3
0
 def test_shifted_keyterms(self):
     """When two keyterm candidates overlap, the abandoned prefix match
     ('the foo stay') does not block the real one ('foo is bad')."""
     result = list(
         tokenize('the foo is bad', (' ', 'the foo stay', 'foo is bad')))
     self.assertEqual(
         result, [Token('the'), Token(' '), Token('foo is bad')])
Ejemplo n.º 4
0
    def observe(self, index, token, caught):
        """Inspect one token for a document-number reference.

        Captures *token* (keyed by *index*) and returns True when it
        matches DOCUMENT_NUMBER_REGEX and no other observer caught it.
        A '.' or newline token closes the pending reference via
        finish(); in that case, and for any other token, returns False.
        """
        looks_like_number = re.match(DOCUMENT_NUMBER_REGEX, token.as_str())
        if looks_like_number and not caught:
            self._numbers[index] = token
            return True

        if token == Token('.') or token == Token('\n'):
            self.finish()

        return False
Ejemplo n.º 5
0
    def test_keyterm_in_end(self):
        """Keyterms that overlap the tail of the string match correctly."""
        result = list(tokenize('pre-foo-suf', ('pre-foo-suf', 'suf')))
        self.assertEqual(result, [Token('pre-foo-suf')])

        result = list(tokenize('n.º 2', (' ', 'n.º', '.º')))
        self.assertEqual(result, [Token('n.º'), Token(' '), Token('2')])

        result = list(tokenize('foo-bar', ('foo-bar', 'bar tal')))
        self.assertEqual(result, [Token('foo-bar')])
Ejemplo n.º 6
0
    def test_single(self):
        """A 'Decreto-Lei' reference is caught in several number formats."""
        managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver})]

        # (input string, expected captured number)
        cases = [
            ('Decreto-Lei nº 2/2013.', '2/2013'),
            ('Decreto-Lei nº 2/2013/A,', '2/2013/A'),
            ('Decreto-Lei nº 2-A/2013,', '2-A/2013'),
        ]
        for text, number in cases:
            self._test(text, managers,
                       [(DocumentReference(number, Token('Decreto-Lei')), 4)])
Ejemplo n.º 7
0
    def test_single(self):
        """EU-law references following 'Diretiva' are caught.

        Fix: the manager dict previously mapped 'Decisão de Execução' to
        EULawReference (a reference class), while every other manager in
        this suite maps keyterms to *Observer* classes — map it to
        EULawRefObserver instead.  The keyterm never occurs in the
        tested strings, so expected outcomes are unchanged.
        """
        managers = [ObserverManager({'Diretiva': EULawRefObserver,
                                     'Decisão de Execução': EULawRefObserver})]

        self._test('Diretiva nº 2011/778/UE ', managers,
                   [(EULawReference('2011/778/UE', Token('Diretiva')), 4)])
        self._test('Diretiva nº 2000/29/CE,', managers,
                   [(EULawReference('2000/29/CE', Token('Diretiva')), 4)])
        self._test('Diretiva nº 2000/778/UE.', managers,
                   [(EULawReference('2000/778/UE', Token('Diretiva')), 4)])

        # no valid number after the keyterm: nothing is caught
        self._test('Diretiva False.', managers, ())
Ejemplo n.º 8
0
    def test_keyterm_subset_of_keyterm(self):
        """
        When keyterm is a subset of the other, return other.
        """
        result = list(tokenize('Decreto-Lei', {'Decreto'}))
        self.assertEqual(result, [Token('Decreto'), Token('-Lei')])

        result = list(tokenize('Decreto-Lei', {'Decreto', 'Decreto-Lei'}))
        self.assertEqual(result, [Token('Decreto-Lei')])

        result = list(tokenize('Decreto-Barro', {'Decreto', 'Decreto-Lei'}))
        self.assertEqual(result, [Token('Decreto'), Token('-Barro')])
Ejemplo n.º 9
0
    def test_many_separated(self):
        """Two references with distinct keyterms are both caught, with
        and without a separating full stop."""
        managers = [ObserverManager({'foo': DocumentRefObserver,
                                     'bar': DocumentRefObserver})]

        # '.' between the references: second keyterm is one index later
        self._test('foo 1/2000. bar 2/2000', managers,
                   [(DocumentReference('1/2000', Token('foo')), 2),
                    (DocumentReference('2/2000', Token('bar')), 7)])

        # no '.' separator
        self._test('foo 1/2000 bar 2/2000', managers,
                   [(DocumentReference('1/2000', Token('foo')), 2),
                    (DocumentReference('2/2000', Token('bar')), 6)])
Ejemplo n.º 10
0
def analyse(tokens):
    """Build a hierarchical Document from a flat token stream.

    Tokens are accumulated into paragraphs; a paragraph ends at a
    newline token or at an Anchor (which also opens a new section in
    the current parser).  Text between '«' and '»' tokens is parsed by
    a separate HierarchyParser into a QuotationSection, attached to the
    root when the quote closes.

    NOTE(review): a '»' token with no preceding '«' would reference an
    unbound ``block_parser`` — presumably the input is always
    well-formed; confirm upstream.
    """
    root = Document()
    root_parser = HierarchyParser(root)

    paragraph = Paragraph()
    block_mode = False  # True while inside a «...» quotation block
    for token in tokens:
        # ignore empty tokens entirely
        if token.as_str() == '':
            continue
        # start of quote (only recognized at the start of a paragraph)
        if token == Token('«') and len(paragraph) == 0:
            block_mode = True
            block_parser = HierarchyParser(QuotationSection(), add_links=False)
        # end of quote: attach the quoted sub-document to the root
        elif token == Token('»') and len(paragraph) == 0:
            block_mode = False
            root_parser.add(block_parser.root)
            paragraph = Paragraph()
        # construct the paragraphs
        # paragraph can end by '\n' or by starting a new section.
        elif isinstance(token, Anchor) or token.string == '\n':
            # it is end of paragraph; complete it if it ends by a normal \n.
            if token.string == '\n':
                paragraph.append(token)

            # select current parser (quote parser while inside a quote)
            p = root_parser
            if block_mode:
                p = block_parser

            # add paragraph to the current parser if it is not empty
            if len(paragraph):
                p.add(paragraph)

            # start a new paragraph
            paragraph = Paragraph()
            if isinstance(token, Anchor):
                # create a new section
                section = p.new_section(token)

                # if the new section is inline, switch to an inline paragraph
                if isinstance(section, InlineDocumentSection):
                    paragraph = InlineParagraph()
        else:
            paragraph.append(token)

    return root
Ejemplo n.º 11
0
    def test_keyterm_in_word(self):
        """
        262.º with keyterm `.º` must return 262 and .º
        """
        keyterms = (' ', ',', '.º')

        result = list(tokenize('262.º', keyterms))
        self.assertEqual(result, [Token('262'), Token('.º')])

        result = list(tokenize('262.º-A', keyterms))
        self.assertEqual(result, [Token('262'), Token('.º'), Token('-A')])
Ejemplo n.º 12
0
 def test_keyterm_not_found(self):
     """With no keyterm present, the string splits on separators only."""
     pieces = ['this', ' ', 'is', ' ', 'the', ' ', 'foo', ' ', 'of']
     self.assertEqual(
         list(tokenize('this is the foo of', (' ', 'the end'))),
         [Token(piece) for piece in pieces])
Ejemplo n.º 13
0
    def test_with_document(self):
        """References nest: number → article → document."""
        managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver}),
                    ObserverManager({'artigo': ArticleRefObserver}),
                    ObserverManager({'nº': NumberRefObserver,
                                     'n.os': NumberRefObserver})]

        doc = DocumentReference('2/2013', Token('Decreto-Lei'))
        art = ArticleReference('26º', doc)
        num = NumberReference('2', art)
        self._test('no nº 2 do artigo 26º do Decreto-Lei 2/2013,',
                   managers, [(doc, 16), (art, 10), (num, 4)])
Ejemplo n.º 14
0
    def observe(self, index, token, caught):
        """Inspect one token for a line reference and track its parent.

        Returns True when *token* matches LINE_REGEX, was not caught by
        another observer, and no parent has been located yet; the token
        is then recorded in ``self._numbers`` keyed by *index*.  A '.'
        or newline token closes the pending reference via finish().
        Once a number or article-number token is seen it is remembered
        as the parent, and no further lines are captured.
        """
        if token in (Token('.'), Token('\n')):
            self.finish()
            return False

        # a parent was already found: stop capturing lines
        if self._parent:
            return False

        if not caught and re.match(LINE_REGEX, token.as_str()):
            self._numbers[index] = token
            return True
        elif re.match(NUMBER_REGEX, token.as_str()):
            # the number ('nº 4') is the parent of the captured lines
            self._parent = index
        elif re.match(ARTICLE_NUMBER_REGEX, token.as_str()):
            # an article number can also act as the parent
            self._parent = index
        # never found such case
        # elif re.match(DOCUMENT_NUMBER_REGEX, token.as_str()):
        #    self._parent = index

        return False
Ejemplo n.º 15
0
    def test_with_document(self):
        """Every article in an enumeration links to the later document."""
        managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver}),
                    ObserverManager({'artigos': ArticleRefObserver})]

        doc = DocumentReference('2/2013', Token('Decreto-Lei'))
        expected = [(ArticleReference('3º', doc), 4),
                    (ArticleReference('4º-A', doc), 7),
                    (ArticleReference('7º', doc), 10),
                    (ArticleReference('25º', doc), 14),
                    (doc, 20)]
        self._test('Os artigos 3º, 4º-A, 7º e 25º do Decreto-Lei 2/2013,',
                   managers, expected)
Ejemplo n.º 16
0
    def test_with_document(self):
        """Full nesting chain: lines → number → article → document."""
        managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver}),
                    ObserverManager({'artigo': ArticleRefObserver}),
                    ObserverManager({'nº': NumberRefObserver}),
                    ObserverManager({'alíneas': LineRefObserver})]

        document = DocumentReference('2/2013', Token('Decreto-Lei'))
        article = ArticleReference('26º', document)
        number = NumberReference('4', article)

        expected = [(LineReference('f)', number), 4),
                    (LineReference('g)', number), 8),
                    (number, 14),
                    (article, 20),
                    (document, 28)]
        self._test('nas alíneas f) e g) do nº 4 do artigo 26º '
                   'do Decreto-Lei nº 2/2013', managers, expected)
Ejemplo n.º 17
0
    def test_tokenize(self):
        """Splitting on a single separator and on a multi-word keyterm."""
        result = list(tokenize('the end', ' '))
        self.assertEqual(result, [Token('the'), Token(' '), Token('end')])

        result = list(tokenize('the end is', (' ', 'the end')))
        self.assertEqual(result,
                         [Token('the end'), Token(' '), Token('is')])
Ejemplo n.º 18
0
    def test_similar_keyterms(self):
        """Keyterms that only partially match never swallow normal words."""
        pieces = ['this', ' ', 'is', ' ', 'the', ' ', 'foo', ' ', 'of']
        expected = [Token(piece) for piece in pieces]

        self.assertEqual(
            list(tokenize('this is the foo of', (' ', 'the end', 'the bar'))),
            expected)

        self.assertEqual(
            list(tokenize('this is the foo of',
                          (' ', 'the foo is', 'foo is bad'))),
            expected)
Ejemplo n.º 19
0
    def test_many(self):
        """The plural keyterm 'Decretos-Leis' followed by an enumeration
        of numbers yields one document reference per number."""
        managers = [ObserverManager({'Decretos-Leis': DocumentRefObserver})]

        def ref(number):
            # every reference shares the same plural keyterm token
            return DocumentReference(number, Token('Decretos-Leis'))

        self._test('Decretos-Leis n.os 1/2006, 2/2006, e 3/2006', managers,
                   [(ref('1/2006'), 4), (ref('2/2006'), 7),
                    (ref('3/2006'), 12)])

        self._test('Decretos-Leis n.os 1/2006, e 2/2006', managers,
                   [(ref('1/2006'), 4), (ref('2/2006'), 9)])

        self._test('Decretos-Leis n.os 64/2006, de 21 de março, '
                   '88/2006, de 23 de maio, e '
                   '196/2006, de 10 de outubro', managers,
                   [(ref('64/2006'), 4), (ref('88/2006'), 16),
                    (ref('196/2006'), 30)])
Ejemplo n.º 20
0
    def test_fail(self):
        """A line that is not a clause passes through unchanged."""
        managers = [ObserverManager({'\n': ClauseObserver})]
        result = parser.parse('\nImagina\n', managers, {'\n'})

        expected = [Token('\n'), Token('Imagina'), Token('\n')]
        self.assertEqual(expected, result)
Ejemplo n.º 21
0
def tokenize(string, keyterms=()):
    """Wrap each raw token from the shared tokenizer in a Token."""
    return list(map(Token, _tokenizer.tokenize(string, keyterms)))
Ejemplo n.º 22
0
    def test_old_notation(self):
        """Old two-digit-year numbers with a letter suffix ('2-A/90')."""
        managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver})]

        expected = [(DocumentReference('2-A/90', Token('Decreto-Lei')), 4)]
        self._test('Decreto-Lei nº 2-A/90,', managers, expected)
Ejemplo n.º 23
0
 def replace_in(self, result):
     """Rewrite the matched span in-place: the token after the match
     becomes an empty Annex and the one after that is blanked out."""
     base = self._index
     # the two writes target different slots, so order is irrelevant
     result[base + 1] = Annex('')
     result[base + 2] = Token('')
Ejemplo n.º 24
0
 def _replace_in(self, result):
     """Replace the matched token with the constructed EU-law reference."""
     reference = EULawReference(self._number, Token(self._string))
     result[self._index] = reference
Ejemplo n.º 25
0
    def test_simple(self):
        """A roman-numeral line becomes a Clause and its trailing
        newline token is blanked out."""
        managers = [ObserverManager({'\n': ClauseObserver})]
        result = parser.parse('\nIV\n', managers, {'\n'})

        expected = [Token('\n'), Clause('IV'), Token('')]
        self.assertEqual(expected, result)
Ejemplo n.º 26
0
 def test_real(self):
     """A realistic legal sentence splits on all configured keyterms."""
     keyterms = (' ', '.', ',', 'Decreto-Lei', 'Decretos-Leis', 'n.º',
                 '.º', 'n.os')
     pieces = ['no', ' ', 'n.º', ' ', '2', ' ', 'do', ' ', 'artigo', ' ',
               '26', '.º', ' ', 'do', ' ', 'Decreto-Lei', ' ', '2/2013',
               ',']
     self.assertEqual(
         list(tokenize('no n.º 2 do artigo 26.º do Decreto-Lei 2/2013,',
                       keyterms)),
         [Token(piece) for piece in pieces])
Ejemplo n.º 27
0
 def replace_in(self, result):
     """Blank the consumed tokens and install the anchor after the match."""
     assert (self._number_index == self._index + self.number_at)
     # blank offsets take_up_to .. 2 (same slots as before, order is
     # irrelevant since each write targets a distinct index)
     for offset in range(self.take_up_to, 1, -1):
         result[self._index + offset] = Token('')  # '\n'
     result[self._index + 1] = self.anchor_klass(self._number)
Ejemplo n.º 28
0
    def test_simple(self):
        """An unnumbered 'Anexo' line becomes an empty Annex anchor."""
        # document 455149 contains this example
        managers = parser.common_managers
        result = parser.parse('\nAnexo\n', managers, {'\n'})

        expected = [Token('\n'), Annex(''), Token('')]
        self.assertEqual(expected, result)
Ejemplo n.º 29
0
    def test_fails(self):
        """A generic title line is not an annex; tokens pass through."""
        managers = [ObserverManager({'\n': UnnumberedAnnexObserver})]
        result = parser.parse('\nTítulo\n', managers, {'\n'})

        expected = [Token('\n'), Token('Título'), Token('\n')]
        self.assertEqual(expected, result)
Ejemplo n.º 30
0
 def test_basic(self):
     """Repeated words each become their own token."""
     result = list(tokenize('the the', ' '))
     self.assertEqual(result, [Token('the'), Token(' '), Token('the')])