def test_multiple_keyterms(self):
    """With overlapping keyterms, matching starts with the earliest viable
    keyterm and the remainder of the string is tokenized after it."""
    keyterms = {'Decreto', 'Decreto-Lei', '-Barro'}
    tokens = list(tokenize('Decreto-Barros', keyterms))
    self.assertEqual(tokens, [Token('Decreto'), Token('-Barro'), Token('s')])
def test_keyterm_in_begin(self):
    """A keyterm anchored at the very start of the string is matched whole,
    and still matched when preceded by other text."""
    keyterms = ('pre', 'pre-foo-suf')
    self.assertEqual(list(tokenize('pre-foo-suf', keyterms)),
                     [Token('pre-foo-suf')])
    self.assertEqual(list(tokenize('d-pre-foo-suf', keyterms)),
                     [Token('d-'), Token('pre-foo-suf')])
def test_shifted_keyterms(self):
    """When two keyterms overlap in the string, the one that actually
    completes ('foo is bad') wins over the one that does not."""
    keyterms = (' ', 'the foo stay', 'foo is bad')
    tokens = list(tokenize('the foo is bad', keyterms))
    self.assertEqual(tokens, [Token('the'), Token(' '), Token('foo is bad')])
def observe(self, index, token, caught):
    """Record a document-number token at *index*; finish on '.' or newline.

    Returns True only when the token was captured as a number so the
    caller knows it is now owned by this observer.
    """
    text = token.as_str()
    if not caught and re.match(DOCUMENT_NUMBER_REGEX, text):
        self._numbers[index] = token
        return True
    # a sentence end or a line break terminates the current reference
    if token == Token('.') or token == Token('\n'):
        self.finish()
    return False
def test_keyterm_in_end(self):
    """A keyterm ending the string is matched whole; a longer keyterm is
    preferred over a suffix of it."""
    self.assertEqual(list(tokenize('pre-foo-suf', ('pre-foo-suf', 'suf'))),
                     [Token('pre-foo-suf')])
    self.assertEqual(list(tokenize('n.º 2', (' ', 'n.º', '.º'))),
                     [Token('n.º'), Token(' '), Token('2')])
    self.assertEqual(list(tokenize('foo-bar', ('foo-bar', 'bar tal'))),
                     [Token('foo-bar')])
def test_single(self):
    """A single document reference is caught in several number formats."""
    managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver})]
    cases = (('2/2013', '.'), ('2/2013/A', ','), ('2-A/2013', ','))
    for number, terminator in cases:
        string = 'Decreto-Lei nº %s%s' % (number, terminator)
        expected = [(DocumentReference(number, Token('Decreto-Lei')), 4)]
        self._test(string, managers, expected)
def test_single(self):
    """EU-law references are caught after each mapped keyterm.

    Fix: 'Decisão de Execução' was mapped to the result class
    ``EULawReference`` instead of an observer; ``ObserverManager`` maps
    keyterms to observer classes everywhere else in this suite, so it is
    mapped to ``EULawRefObserver`` here as well.
    """
    managers = [ObserverManager({'Diretiva': EULawRefObserver,
                                 'Decisão de Execução': EULawRefObserver})]
    self._test('Diretiva nº 2011/778/UE ', managers,
               [(EULawReference('2011/778/UE', Token('Diretiva')), 4)])
    self._test('Diretiva nº 2000/29/CE,', managers,
               [(EULawReference('2000/29/CE', Token('Diretiva')), 4)])
    self._test('Diretiva nº 2000/778/UE.', managers,
               [(EULawReference('2000/778/UE', Token('Diretiva')), 4)])
    # no number follows the keyterm -> nothing is caught
    self._test('Diretiva False.', managers, ())
def test_keyterm_subset_of_keyterm(self):
    """When one keyterm is a prefix of another, the longer match wins."""
    self.assertEqual(list(tokenize('Decreto-Lei', {'Decreto'})),
                     [Token('Decreto'), Token('-Lei')])
    self.assertEqual(list(tokenize('Decreto-Lei', {'Decreto', 'Decreto-Lei'})),
                     [Token('Decreto-Lei')])
    # the longer keyterm fails to complete, so the shorter one is used
    self.assertEqual(list(tokenize('Decreto-Barro', {'Decreto', 'Decreto-Lei'})),
                     [Token('Decreto'), Token('-Barro')])
def test_many_separated(self):
    """Two references separated by a sentence end — or just a space —
    are both caught, at the expected token indexes."""
    managers = [ObserverManager({'foo': DocumentRefObserver,
                                 'bar': DocumentRefObserver})]
    # with a '.' between the two references
    self._test('foo 1/2000. bar 2/2000', managers,
               [(DocumentReference('1/2000', Token('foo')), 2),
                (DocumentReference('2/2000', Token('bar')), 7)])
    # without the '.', the second reference shifts one token left
    self._test('foo 1/2000 bar 2/2000', managers,
               [(DocumentReference('1/2000', Token('foo')), 2),
                (DocumentReference('2/2000', Token('bar')), 6)])
def analyse(tokens):
    """Build a Document tree out of a flat token stream.

    Tokens are accumulated into paragraphs; '«'/'»' pairs switch the target
    parser to a quoted sub-document, and Anchor tokens open new sections.
    Returns the root Document.
    """
    root = Document()
    root_parser = HierarchyParser(root)
    paragraph = Paragraph()
    # True while inside a «...» quotation; tokens then go to block_parser.
    block_mode = False
    for token in tokens:
        if token.as_str() == '':
            continue
        # start of quote: only honoured at a paragraph boundary
        if token == Token('«') and len(paragraph) == 0:
            block_mode = True
            block_parser = HierarchyParser(QuotationSection(), add_links=False)
        # end of quote: attach the quoted sub-tree to the root parser
        elif token == Token('»') and len(paragraph) == 0:
            block_mode = False
            root_parser.add(block_parser.root)
            paragraph = Paragraph()
        # construct the paragraphs
        # paragraph can end by '\n' or by starting a new section.
        elif isinstance(token, Anchor) or token.string == '\n':
            # it is end of paragraph; complete it if it ends by a normal \n.
            if token.string == '\n':
                paragraph.append(token)
            # select current parser (quoted block vs. root)
            p = root_parser
            if block_mode:
                p = block_parser
            # add paragraph to the current parser if it is not empty
            if len(paragraph):
                p.add(paragraph)
            # start a new paragraph
            paragraph = Paragraph()
            if isinstance(token, Anchor):
                # create a new section
                section = p.new_section(token)
                # if the new section is inline, change to inline paragraph
                if isinstance(section, InlineDocumentSection):
                    paragraph = InlineParagraph()
        else:
            paragraph.append(token)
    return root
def test_keyterm_in_word(self):
    """'262.º' with keyterm '.º' must split into '262' and '.º'."""
    keyterms = (' ', ',', '.º')
    self.assertEqual(list(tokenize('262.º', keyterms)),
                     [Token('262'), Token('.º')])
    self.assertEqual(list(tokenize('262.º-A', keyterms)),
                     [Token('262'), Token('.º'), Token('-A')])
def test_keyterm_not_found(self):
    """A keyterm absent from the string leaves plain word/space tokens."""
    words = ['this', 'is', 'the', 'foo', 'of']
    expected = []
    for word in words:
        expected.extend([Token(word), Token(' ')])
    expected.pop()  # no trailing space token
    self.assertEqual(list(tokenize('this is the foo of', (' ', 'the end'))),
                     expected)
def test_with_document(self):
    """A number inside an article inside a document chains its parents."""
    managers = [
        ObserverManager({'Decreto-Lei': DocumentRefObserver}),
        ObserverManager({'artigo': ArticleRefObserver}),
        ObserverManager({'nº': NumberRefObserver, 'n.os': NumberRefObserver}),
    ]
    doc = DocumentReference('2/2013', Token('Decreto-Lei'))
    art = ArticleReference('26º', doc)
    num = NumberReference('2', art)
    self._test('no nº 2 do artigo 26º do Decreto-Lei 2/2013,', managers,
               [(doc, 16), (art, 10), (num, 4)])
def observe(self, index, token, caught):
    """Collect line tokens ('f)', 'g)', ...) until a parent reference shows up.

    Returns True only when the token was captured as a line number.
    """
    # a sentence end or a line break closes the observation window
    if token == Token('.') or token == Token('\n'):
        self.finish()
        return False
    # once the parent reference was located, nothing more is collected
    if self._parent:
        return False
    text = token.as_str()
    if not caught and re.match(LINE_REGEX, text):
        self._numbers[index] = token
        return True
    if re.match(NUMBER_REGEX, text) or re.match(ARTICLE_NUMBER_REGEX, text):
        # remember where the parent (number or article) reference starts;
        # a DOCUMENT_NUMBER parent was never observed in practice.
        self._parent = index
    return False
def test_with_document(self):
    """An enumeration of articles all point to the same trailing document."""
    managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver}),
                ObserverManager({'artigos': ArticleRefObserver})]
    doc = DocumentReference('2/2013', Token('Decreto-Lei'))
    articles = (('3º', 4), ('4º-A', 7), ('7º', 10), ('25º', 14))
    expected = [(ArticleReference(number, doc), position)
                for number, position in articles]
    expected.append((doc, 20))
    self._test('Os artigos 3º, 4º-A, 7º e 25º do Decreto-Lei 2/2013,',
               managers, expected)
def test_with_document(self):
    """Lines chain to number -> article -> document parents."""
    managers = [
        ObserverManager({'Decreto-Lei': DocumentRefObserver}),
        ObserverManager({'artigo': ArticleRefObserver}),
        ObserverManager({'nº': NumberRefObserver}),
        ObserverManager({'alíneas': LineRefObserver}),
    ]
    document = DocumentReference('2/2013', Token('Decreto-Lei'))
    article = ArticleReference('26º', document)
    number = NumberReference('4', article)
    string = ('nas alíneas f) e g) do nº 4 do artigo 26º '
              'do Decreto-Lei nº 2/2013')
    self._test(string, managers,
               [(LineReference('f)', number), 4),
                (LineReference('g)', number), 8),
                (number, 14), (article, 20), (document, 28)])
def test_tokenize(self):
    """Basic split on spaces; a multi-word keyterm becomes one token."""
    self.assertEqual(list(tokenize('the end', ' ')),
                     [Token('the'), Token(' '), Token('end')])
    self.assertEqual(list(tokenize('the end is', (' ', 'the end'))),
                     [Token('the end'), Token(' '), Token('is')])
def test_similar_keyterms(self):
    """Keyterms sharing a prefix with the text but never completing must
    leave the plain tokenization untouched."""
    expected = []
    for word in ('this', 'is', 'the', 'foo', 'of'):
        expected.extend([Token(word), Token(' ')])
    expected.pop()  # drop the trailing separator
    self.assertEqual(
        list(tokenize('this is the foo of', (' ', 'the end', 'the bar'))),
        expected)
    self.assertEqual(
        list(tokenize('this is the foo of',
                      (' ', 'the foo is', 'foo is bad'))),
        expected)
def test_many(self):
    """Plural keyterm ('Decretos-Leis') with 'n.os' catches every number
    of the enumeration, with or without date interjections."""
    managers = [ObserverManager({'Decretos-Leis': DocumentRefObserver})]

    def ref(number):
        # all references share the same plural keyterm token
        return DocumentReference(number, Token('Decretos-Leis'))

    self._test('Decretos-Leis n.os 1/2006, 2/2006, e 3/2006', managers,
               [(ref('1/2006'), 4), (ref('2/2006'), 7), (ref('3/2006'), 12)])
    self._test('Decretos-Leis n.os 1/2006, e 2/2006', managers,
               [(ref('1/2006'), 4), (ref('2/2006'), 9)])
    self._test('Decretos-Leis n.os 64/2006, de 21 de março, '
               '88/2006, de 23 de maio, e '
               '196/2006, de 10 de outubro', managers,
               [(ref('64/2006'), 4), (ref('88/2006'), 16),
                (ref('196/2006'), 30)])
def test_fail(self):
    """A line that is not a clause is left as plain tokens."""
    managers = [ObserverManager({'\n': ClauseObserver})]
    result = parser.parse('\nImagina\n', managers, {'\n'})
    expected = [Token('\n'), Token('Imagina'), Token('\n')]
    self.assertEqual(expected, result)
def tokenize(string, keyterms=()):
    """Run the module tokenizer and wrap each raw token in a Token."""
    return list(map(Token, _tokenizer.tokenize(string, keyterms)))
def test_old_notation(self):
    """Two-digit-year numbers ('2-A/90') are still recognized."""
    managers = [ObserverManager({'Decreto-Lei': DocumentRefObserver})]
    expected = [(DocumentReference('2-A/90', Token('Decreto-Lei')), 4)]
    self._test('Decreto-Lei nº 2-A/90,', managers, expected)
def replace_in(self, result):
    """Blank the token two past the match and drop an empty Annex after it."""
    annex_at = self._index + 1
    result[annex_at + 1] = Token('')
    result[annex_at] = Annex('')
def _replace_in(self, result):
    """Swap the matched token for the assembled EU-law reference."""
    reference = EULawReference(self._number, Token(self._string))
    result[self._index] = reference
def test_simple(self):
    """A roman-numeral line becomes a Clause; its trailing '\\n' is blanked."""
    managers = [ObserverManager({'\n': ClauseObserver})]
    result = parser.parse('\nIV\n', managers, {'\n'})
    expected = [Token('\n'), Clause('IV'), Token('')]
    self.assertEqual(expected, result)
def test_real(self):
    """A realistic legal sentence tokenizes with all production keyterms."""
    keyterms = (' ', '.', ',', 'Decreto-Lei', 'Decretos-Leis',
                'n.º', '.º', 'n.os')
    expected = [
        Token('no'), Token(' '), Token('n.º'), Token(' '), Token('2'),
        Token(' '), Token('do'), Token(' '), Token('artigo'), Token(' '),
        Token('26'), Token('.º'), Token(' '), Token('do'), Token(' '),
        Token('Decreto-Lei'), Token(' '), Token('2/2013'), Token(','),
    ]
    self.assertEqual(
        list(tokenize('no n.º 2 do artigo 26.º do Decreto-Lei 2/2013,',
                      keyterms)),
        expected)
def replace_in(self, result):
    """Collapse the matched span into a single anchor right after the match."""
    assert (self._number_index == self._index + self.number_at)
    # blank the tail of the span, back to front
    for offset in reversed(range(2, self.take_up_to + 1)):
        result[self._index + offset] = Token('')  # '\n'
    result[self._index + 1] = self.anchor_klass(self._number)
def test_simple(self):
    """An unnumbered 'Anexo' line becomes an empty Annex.

    Document 455149 contains this example.
    """
    managers = parser.common_managers
    result = parser.parse('\nAnexo\n', managers, {'\n'})
    expected = [Token('\n'), Annex(''), Token('')]
    self.assertEqual(expected, result)
def test_fails(self):
    """A line that is not an annex header stays as plain tokens."""
    managers = [ObserverManager({'\n': UnnumberedAnnexObserver})]
    result = parser.parse('\nTítulo\n', managers, {'\n'})
    expected = [Token('\n'), Token('Título'), Token('\n')]
    self.assertEqual(expected, result)
def test_basic(self):
    """Splitting on a single space separator keeps the separator tokens."""
    tokens = list(tokenize('the the', ' '))
    self.assertEqual(tokens, [Token('the'), Token(' '), Token('the')])