def test_last_modified(self):
    """Discarding a token clears its gold value and bumps last_modified."""
    tokenizer = Tokenizer.for_extension('.txt')(language=MockLang('english'))
    tokens = tokenizer.tokenize(MockCorpusFile('test'), MockConfig(type='fs'))
    token = tokens[0]
    # Mirror the original form into gold so we can observe it being cleared.
    token.gold = token.original
    self.assertEqual(
        token.gold,
        token.original,
        f'Resulting token.gold should be identical with original: {vars(token)}'
    )
    self.assertFalse(
        token.is_discarded,
        f'Resulting token should not be discarded: {vars(token)}')
    previous_stamp = token.last_modified
    token.is_discarded = True
    self.assertTrue(
        token.is_discarded,
        f'Resulting token should be discarded: {vars(token)}')
    self.assertEqual(
        token.gold,
        '',
        f'Resulting token.gold should be cleared: {vars(token)}')
    # Discarding counts as a modification, so the timestamp must advance.
    self.assertTrue(
        token.last_modified > previous_stamp,
        f'Resulting token should have updated last_modified (was {previous_stamp}): {vars(token)}'
    )
def __init__(self, root, docid, contents):
    """Build an in-memory mock corpus holding a single tokenized document."""
    self.root = root
    self.docid = docid
    tokenizer = Tokenizer.for_extension('.txt')(language=MockLang('english'))
    # type='mem' keeps the tokenizer off the filesystem for this mock.
    tokens = tokenizer.tokenize(
        MockCorpusFile(contents, self.docid), MockConfig(type='mem'))
    self.doc = MockDocument(docid, tokens)
    self.docs = {docid: self.doc}
def test_pdf_tokenization(self):
    """Tokenizing the bundled test.pdf yields the expected extracted text.

    'upen' and 'ti- me' are deliberate OCR-style artifacts baked into the
    fixture PDF, so the expected string reproduces them verbatim.
    """
    t = Tokenizer.for_extension('.pdf')(language=MockLang('english'))
    f = pathlib.Path(__file__).parent.joinpath('test.pdf')
    tokens = t.tokenize(f, MockConfig(type='fs'))
    # Fix: message had an f-prefix with no placeholders (ruff F541);
    # the runtime text is unchanged.
    self.assertEqual(
        str(tokens),
        'Once upen a ti- me.',
        'Resulting string does not contain expected tokens')
def test_auto_dehyphenation_soft(self):
    """dehyphenate() joins a token split by soft hyphen + space while a
    word-internal soft hyphen ('Te\xadst') is left untouched."""
    tokenizer = Tokenizer.for_extension('.txt')(language=MockLang('english'))
    source = MockCorpusFile('Str\xad ing Te\xadst')
    tokens = tokenizer.tokenize(source, MockConfig(type='fs'))
    tokens.dehyphenate()
    self.assertEqual(
        str(tokens),
        'String Te\xadst',
        f'Resulting string should be dehyphenated in {tokens}.')
def test_manual_dehyphenation(self):
    """Setting is_hyphenated on a token manually joins it with its successor.

    A plain '-' is not auto-dehyphenated (unlike the soft-hyphen case), so
    the join only happens after the flag is set explicitly.
    """
    t = Tokenizer.for_extension('.txt')(language=MockLang('english'))
    f = MockCorpusFile('Str- ing')
    tokens = t.tokenize(f, MockConfig(type='fs'))
    # Fix: both messages had f-prefixes with no placeholders (ruff F541);
    # the runtime text is unchanged.
    self.assertEqual(str(tokens), 'Str- ing',
                     'Resulting string should not be dehyphenated.')
    tokens[0].is_hyphenated = True
    self.assertEqual(str(tokens), 'String',
                     'Resulting string should be dehyphenated.')
def test_auto_dehyphenation(self):
    """Heuristics.bin_tokens places a token with a k-best match in bin 1.

    NOTE(review): despite its name, this test exercises heuristic binning,
    not dehyphenation — consider renaming (e.g. test_heuristics_binning).
    The name is kept here so the test runner's discovery is unaffected.
    """
    t = Tokenizer.for_extension('.txt')(language=MockLang('english'))
    f = MockCorpusFile('String')
    tokens = t.tokenize(f, MockConfig(type='fs'))
    token = tokens[0]
    # Fix: messages had f-prefixes with no placeholders (ruff F541);
    # set(["String"]) replaced with a set literal (flake8-comprehensions C405).
    self.assertIsNone(token.bin, 'Token should not be in any bin.')
    token.kbest = {
        1: KBestItem("String", 1.0),
    }
    dictionary = {"String"}
    # Setting "o" presumably selects the 'original' heuristic for bin 1 —
    # TODO confirm against the Heuristics settings documentation.
    settings = {
        1: "o",
    }
    heuristics = Heuristics(settings, dictionary)
    heuristics.bin_tokens(tokens)
    self.assertEqual(token.bin.number, 1, 'Token should be in bin 1.')