def test_udpipe(self):
    """Test UDPipe token lemmatization on a pre-tokenized corpus."""
    normalizer = preprocess.UDPipeLemmatizer('Slovenian')
    # Corpus tables must be unlocked before in-place modification
    # (consistent with the other tests in this file that mutate metas).
    with self.corpus.unlocked():
        self.corpus.metas[0, 0] = 'sem'
    corpus = normalizer(self.corpus)
    self.assertListEqual(list(corpus.tokens[0]), ['biti'])
    # default tokenizer + the lemmatizer itself
    self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
def test_udpipe_doc(self):
    """Test UDPipe lemmatization using its built-in tokenizer."""
    lemmatizer = preprocess.UDPipeLemmatizer()
    lemmatizer.language = 'Slovenian'
    lemmatizer.use_tokenizer = True
    lemmas = lemmatizer.normalize_doc('Gori na gori hiša gori')
    self.assertListEqual(lemmas, ['gora', 'na', 'gora', 'hiša', 'goreti'])
def test_udpipe_doc(self):
    """Test UDPipe lemmatization with its own tokenization."""
    normalizer = preprocess.UDPipeLemmatizer('Slovenian', True)
    # Corpus tables must be unlocked before in-place modification
    # (consistent with the other tests in this file that mutate metas).
    with self.corpus.unlocked():
        self.corpus.metas[0, 0] = 'Gori na gori hiša gori'
    corpus = normalizer(self.corpus)
    self.assertListEqual(list(corpus.tokens[0]),
                         ['gora', 'na', 'gora', 'hiša', 'goreti'])
    # UDPipe tokenizes internally, so only one preprocessor is recorded
    self.assertEqual(len(corpus.used_preprocessor.preprocessors), 1)
def test_udpipe_pickle(self):
    """A pickled-and-restored lemmatizer keeps its language and still works."""
    original = preprocess.UDPipeLemmatizer()
    original.language = 'English'
    restored = pickle.loads(pickle.dumps(original))
    self.assertEqual(original.language, restored.language)
    self.assertEqual(
        restored.normalize_doc('peter piper pickled'),
        ['peter', 'piper', 'pickle'],
    )
def test_udpipe_deepcopy(self):
    """Deep-copying preserves the private settings and yields a working copy."""
    normalizer = preprocess.UDPipeLemmatizer('Slovenian', True)
    copied = copy.deepcopy(normalizer)
    # name-mangled private attributes must survive the copy
    self.assertEqual(normalizer._UDPipeLemmatizer__language,
                     copied._UDPipeLemmatizer__language)
    self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer,
                     copied._UDPipeLemmatizer__use_tokenizer)
    # Corpus tables must be unlocked before in-place modification
    # (consistent with the other tests in this file that mutate metas).
    with self.corpus.unlocked():
        self.corpus.metas[0, 0] = 'Gori na gori hiša gori'
    self.assertEqual(list(copied(self.corpus).tokens[0]),
                     ['gora', 'na', 'gora', 'hiša', 'goreti'])
def test_udpipe_pickle(self):
    """A pickled-and-restored lemmatizer keeps its settings and still works."""
    normalizer = preprocess.UDPipeLemmatizer('Slovenian', True)
    loaded = pickle.loads(pickle.dumps(normalizer))
    # name-mangled private attributes must survive the round trip
    self.assertEqual(normalizer._UDPipeLemmatizer__language,
                     loaded._UDPipeLemmatizer__language)
    self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer,
                     loaded._UDPipeLemmatizer__use_tokenizer)
    # Corpus tables must be unlocked before in-place modification
    # (consistent with the other tests in this file that mutate metas).
    with self.corpus.unlocked():
        self.corpus.metas[0, 0] = 'Gori na gori hiša gori'
    self.assertEqual(list(loaded(self.corpus).tokens[0]),
                     ['gora', 'na', 'gora', 'hiša', 'goreti'])
def test_cache(self):
    """Normalization results are cached, and the cache is not pickled."""
    lemmatizer = preprocess.UDPipeLemmatizer('Slovenian')
    with self.corpus.unlocked():
        self.corpus.metas[0, 0] = 'sem'
    lemmatizer(self.corpus)
    self.assertEqual(lemmatizer._normalization_cache['sem'], 'biti')
    self.assertEqual(40, len(lemmatizer._normalization_cache))
    # cache should not be pickled
    unpickled = pickle.loads(pickle.dumps(lemmatizer))
    self.assertEqual(0, len(unpickled._normalization_cache))
def test_udpipe_pickle(self):
    """Pickling works even after the unpicklable UDPipe model is loaded."""
    normalizer = preprocess.UDPipeLemmatizer('Slovenian', True)
    # udpipe store model after first call - model is not picklable
    normalizer(self.corpus)
    restored = pickle.loads(pickle.dumps(normalizer))
    self.assertEqual(normalizer._UDPipeLemmatizer__language,
                     restored._UDPipeLemmatizer__language)
    self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer,
                     restored._UDPipeLemmatizer__use_tokenizer)
    with self.corpus.unlocked():
        self.corpus.metas[0, 0] = 'Gori na gori hiša gori'
    tokens = list(restored(self.corpus).tokens[0])
    self.assertEqual(tokens, ['gora', 'na', 'gora', 'hiša', 'goreti'])
def test_call_UDPipe(self):
    """Calling the lemmatizer on a corpus produces tokens."""
    lemmatizer = preprocess.UDPipeLemmatizer()
    self.assertFalse(self.corpus.has_tokens())
    processed = lemmatizer(self.corpus)
    self.assertTrue(processed.has_tokens())
    # default tokenizer + the lemmatizer itself
    self.assertEqual(len(processed.used_preprocessor.preprocessors), 2)
def test_udpipe(self):
    """Test UDPipe lemmatization of a single token."""
    lemmatizer = preprocess.UDPipeLemmatizer()
    lemmatizer.language = 'Slovenian'
    self.assertEqual('biti', lemmatizer('sem'))