class TestThirdParty(unittest.TestCase):
    """Exercise third-party (stemmer/lemmatizer) token normalizers."""

    def setUp(self):
        self.config = AppConfig('stemmer')
        self.fac = ImportConfigFactory(self.config, shared=False)

    def test_stemmer(self):
        """Parse one sentence three ways: raw norms, lemmas, then stems."""
        norm_fac = ImportConfigFactory(self.config)
        text = ('Bobby is fast and runs with dogs, armies, and sheep '
                'from the police.')
        # first pass: no normalization, tokens come back verbatim
        parser = self.fac.instance(
            'doc_parser',
            token_normalizer=norm_fac.instance('nonorm_token_normalizer'))
        doc = parser.parse(text)
        self.assertEqual(
            ('Bobby', 'is', 'fast', 'and', 'runs', 'with', 'dogs', ',',
             'armies', ',', 'and', 'sheep', 'from', 'the', 'police', '.'),
            tuple(doc.norm_token_iter()))
        # the lemmas differ from the norms (runs -> run, armies -> army)
        self.assertEqual(
            ('Bobby', 'be', 'fast', 'and', 'run', 'with', 'dog', ',',
             'army', ',', 'and', 'sheep', 'from', 'the', 'police', '.'),
            tuple(tok.lemma_ for tok in doc.token_iter()))
        # second pass: the stemmer normalizer produces Porter-style stems
        parser = self.fac.instance(
            'doc_parser',
            token_normalizer=norm_fac.instance('stemmer_token_normalizer'))
        doc = parser.parse(text)
        self.assertEqual(
            ('bobbi', 'is', 'fast', 'and', 'run', 'with', 'dog', ',',
             'armi', ',', 'and', 'sheep', 'from', 'the', 'polic', '.'),
            tuple(doc.norm_token_iter()))
def test_filter_features(self):
    """Verify each feature-filtering token normalizer drops the right tokens."""
    tnfac = ImportConfigFactory(self.config)
    text = 'I am a citizen of the United States of America.'
    # no filtering: every token survives, including the determiner and period
    parser = self.fac('default_doc_parser',
                      token_normalizer=tnfac.instance(
                          'feature_no_filter_token_normalizer'))
    norms = tuple(tok.norm for tok in parser(text).token_iter())
    self.assertEqual(('I', 'am', 'a', 'citizen', 'of',
                      'the United States of America', '.'), norms)
    # default filter: drops the determiner 'a' and the final period
    parser = self.fac('default_doc_parser',
                      token_normalizer=tnfac.instance(
                          'feature_default_filter_token_normalizer'))
    norms = tuple(tok.norm for tok in parser.parse(text).token_iter())
    self.assertEqual(('I', 'am', 'citizen', 'of',
                      'the United States of America'), norms)
    # stop-word filter: only content tokens remain
    parser = self.fac('default_doc_parser',
                      token_normalizer=tnfac.instance(
                          'feature_stop_filter_token_normalizer'))
    norms = tuple(tok.norm for tok in parser.parse(text).token_iter())
    self.assertEqual(('citizen', 'the United States of America'), norms)
def create_facade(self) -> ModelFacade:
    """Create a new instance of the facade.

    A fresh (non-shared) instance is created on every call since the facade
    is deallocated after processing completes.
    """
    config = self.config
    model_path = self.model_path
    if self.config_overwrites is not None:
        # copy before merging so the shared application config isn't mutated
        config = cp.deepcopy(config)
        config.merge(self.config_overwrites)
    if model_path is None:
        # no saved model: instantiate the facade from configuration; both
        # the factory and the facade are deallocated later
        factory = ImportConfigFactory(config, **self.config_factory_args)
        facade: ModelFacade = factory.instance(self.facade_name)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'created facade: {facade}')
        self.dealloc_resources.extend((factory, facade))
    else:
        # saved model: the factory is only needed to resolve the facade
        # class, so it is deallocated as soon as the model is loaded
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'loading model from {model_path}')
        with dealloc(ImportConfigFactory(
                config, **self.config_factory_args)) as factory:
            facade_cls: Type[ModelFacade] = factory.get_class(self.facade_name)
            facade = facade_cls.load_from_path(model_path)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'created facade: {type(facade)} '
                             f'from path: {model_path}')
            self.dealloc_resources.append(facade)
    return facade
class TestFeatureVectorization(unittest.TestCase):
    """Base fixture for feature vectorization tests.

    Subclasses may set ``CONF_FILE`` to override the configuration file
    and ``NO_VECTORIZER`` to skip creating the vectorizer manager.
    """

    def setUp(self):
        conf_path = getattr(self.__class__, 'CONF_FILE',
                            'test-resources/features.conf')
        config = AppConfig(conf_path)
        self.fac = ImportConfigFactory(config, shared=True)
        self.sent_text = 'I am a citizen of the United States of America.'
        self.def_parse = ('I', 'am', 'a', 'citizen', 'of',
                          'the United States of America', '.')
        if not hasattr(self.__class__, 'NO_VECTORIZER'):
            self.vmng = self.fac.instance('feature_vectorizer_manager')
        self.sent_text2 = self.sent_text + " My name is Paul Landes."

    def assertTensorEquals(self, should, tensor):
        """Assert ``tensor`` matches ``should`` in both shape and contents."""
        self.assertEqual(should.shape, tensor.shape)
        try:
            is_eq = TorchConfig.equal(should, tensor)
        except RuntimeError as e:
            # log the operands before re-raising to aid debugging
            logger.error(f'error comparing {should} with {tensor}')
            raise e
        if not is_eq:
            logger.error(f'tensor {should} does not equal {tensor}')
        self.assertTrue(is_eq)

    def _to_sparse(self, arr: Tensor):
        # unwrap the first payload of the sparse context created from ``arr``
        return SparseTensorFeatureContext.to_sparse(arr)[0][0]
def test_feature(self):
    """Compare a parsed document against the recorded JSON feature fixture."""
    tnfac = ImportConfigFactory(self.config, shared=False)
    normalizer = tnfac.instance('default_token_normalizer')
    parser = self.fac('default_doc_parser', token_normalizer=normalizer)
    self.assertEqual(
        'MapTokenNormalizer: embed=True, reload=False, lemma_token_mapper',
        str(normalizer))
    doc = parser(self.sent)
    res = doc.asdict()
    # dev toggle: flip to 1 to regenerate the expected-output fixture file
    if 0:
        with open(self.config.feature_path, 'w') as f:
            f.write(doc.asjson(indent=4))
    with open(self.config.feature_path) as f:
        expected = json.load(f)
    # rec_sort makes the comparison insensitive to key/element ordering
    self.assertEqual(rec_sort(expected), rec_sort(res))
    # with no normalization the raw token norms come back unchanged
    normalizer = tnfac.instance('nonorm_token_normalizer')
    parser = self.fac('default_doc_parser', token_normalizer=normalizer)
    norms = tuple(t.norm for t in parser(self.sent).token_iter())
    self.assertEqual(('Dan', 'throws', 'the', 'ball', '.'), norms)
class SqliteTestCase(unittest.TestCase):
    """Test CRUD operations of the SQLite instance (bean) persister."""

    def setUp(self):
        self.config = AppConfig.instance()
        self.target_path = Path('./target')
        # start from a clean slate so DB files from prior runs don't leak in
        if self.target_path.exists():
            shutil.rmtree(self.target_path)
        self.fac = ImportConfigFactory(self.config)

    @staticmethod
    def init_logging():
        """Enable verbose logging (invoked manually while debugging)."""
        logging.basicConfig(level=logging.INFO)
        logger.setLevel(logging.DEBUG)

    def _test_inst_persister(self):
        """Run a full CRUD cycle against the ``Person`` bean persister and
        return the persister for use by further tests.

        NOTE(review): ``persister.get()`` appears to return rows ordered by
        name -- every positional assertion below is consistent with that
        ordering; verify against the persister's SQL if it changes.
        """
        persister = self.fac.instance('inst_db_persister', row_factory=Person)
        db_path = Path(self.target_path, 'sql-test2.db')
        # the database file is created lazily on first write
        self.assertFalse(db_path.exists())
        self.assertEqual(0, persister.get_count())
        # insert_row returns the auto-increment row id
        self.assertEqual(1, persister.insert_row('paul', 23))
        self.assertEqual(2, persister.insert_row('sue', 33))
        self.assertTrue(db_path.exists())
        peeps = persister.get()
        # fixed: was assertTrue(2, len(peeps)), which always passed because
        # the truthy first argument (2) was the assertion target and the
        # length was silently consumed as the ``msg`` parameter
        self.assertEqual(2, len(peeps))
        self.assertEqual({'id': 1, 'name': 'paul', 'age': 23},
                         peeps[0].get_attrs())
        self.assertEqual({'id': 2, 'name': 'sue', 'age': 33},
                         peeps[1].get_attrs())
        peeps = persister.get()
        self.assertEqual((1, 'paul', 23), peeps[0].get_row())
        self.assertEqual(('paul', 23), peeps[0].get_insert_row())
        peeps = persister.get()
        self.assertEqual('id: 1, name: paul, age: 23', str(peeps[0]))
        self.assertEqual('id: 2, name: sue, age: 33', str(peeps[1]))
        peeps = persister.get()
        self.assertEqual('id: 1, name: paul, age: 23', str(peeps[0]))
        self.assertEqual('id: 2, name: sue, age: 33', str(peeps[1]))
        # bulk insert of raw rows; returns the last inserted row id
        new_peeps = (('bob', 42), ('jane', 90),)
        self.assertEqual(4, persister.insert_rows(new_peeps))
        peeps = persister.get()
        # 'bob' and 'jane' sort (by name) before 'paul' and 'sue'
        self.assertEqual({'id': 3, 'name': 'bob', 'age': 42},
                         peeps[0].get_attrs())
        self.assertEqual({'id': 4, 'name': 'jane', 'age': 90},
                         peeps[1].get_attrs())
        # inserting a bean back-populates its ``id`` attribute
        bean = Person('kyle', 52)
        self.assertEqual(None, bean.id)
        self.assertEqual(5, persister.insert(bean))
        self.assertEqual(5, bean.id)
        self.assertEqual(((5,),), persister.execute_by_name(
            'people_count', row_factory='tuple'))
        peep = persister.get_by_id(2)
        self.assertEqual('id: 2, name: sue, age: 33', str(peep))
        peep = persister.get_by_id(5)
        self.assertEqual('id: 5, name: kyle, age: 52', str(peep))
        self.assertEqual(None, persister.get_by_id(100))
        self.assertTrue(persister.exists(1))
        self.assertTrue(persister.exists(5))
        self.assertFalse(persister.exists(100))
        peep = persister.get_by_id(2)
        peep.age = 41
        # fixed: was assertTrue(2, persister.update(peep)) -- vacuous for the
        # same msg-parameter reason; the update's effect is asserted by the
        # re-fetch immediately below, so a plain call suffices here
        persister.update(peep)
        peep = persister.get_by_id(2)
        self.assertEqual('id: 2, name: sue, age: 41', str(peep))
        self.assertTrue(persister.exists(2))
        # fixed: was assertTrue(2, persister.delete(2)) -- also vacuous; the
        # deletion is asserted by the exists/count checks below
        persister.delete(2)
        self.assertFalse(persister.exists(2))
        self.assertEqual(((4,),), persister.execute_by_name(
            'people_count', row_factory='tuple'))
        self.assertEqual(4, persister.get_count())
        self.assertEqual((1, 3, 4, 5), tuple(persister.get_keys()))
        # bulk insert of beans; returns the last inserted row id
        new_peeps = (Person('jake', 62), Person('christina', 22),)
        self.assertEqual(7, persister.insert_beans(new_peeps))
        peeps = persister.get()
        # by-name order: bob, christina, jake, jane, kyle, paul
        self.assertEqual({'id': 6, 'name': 'jake', 'age': 62},
                         peeps[2].get_attrs())
        self.assertEqual({'id': 7, 'name': 'christina', 'age': 22},
                         peeps[1].get_attrs())
        return persister
class TestWordPieceTokenization(unittest.TestCase):
    """Test mapping word pieces back to linguistic tokens across
    transformer models (BERT, RoBERTa, DistilBERT)."""

    def setUp(self):
        config = AppConfig('test-resources/transformer.conf')
        self.fac = ImportConfigFactory(config)
        self.vmng = self.fac.instance('feature_vectorizer_manager')

    def _test_tok(self, vec_name: str, sent: str, should_tok_len: int,
                  should: Tuple[Tuple[str, Tuple[str]]]):
        """Tokenize ``sent`` with the ``vec_name`` vectorizer, then check the
        word-piece-to-token mapping and the embedding shape against
        ``should`` and ``should_tok_len``."""
        doc: FeatureDocument = self.vmng.parse(sent)
        vec = self.vmng[vec_name]
        tdoc: TokenizedFeatureDocument = vec.tokenize(doc)
        self.assertEqual(TokenizedFeatureDocument, type(tdoc))
        mappings = tdoc.map_word_pieces_to_tokens()
        self.assertEqual(len(should), len(mappings))
        for sent_map, should_sent in zip(mappings, should):
            feat_sent: FeatureSentence = sent_map['sent']
            tmap: Tuple[FeatureToken, Tuple[str]] = sent_map['map']
            for (tok, pieces), (s_norm, s_pieces) in zip(tmap, should_sent):
                self.assertEqual(FeatureToken, type(tok))
                self.assertEqual(str, type(pieces[0]))
                self.assertEqual(tok.norm, s_norm)
                self.assertEqual(pieces, s_pieces)
        # the embedding is (sentences, word pieces, hidden dimension)
        arr = vec.transform(doc)
        self.assertEqual((len(should), should_tok_len, 768), tuple(arr.shape))

    def _test_sent_1(self, vec_name: str):
        """Two sentences; 'gunships' splits into two word pieces."""
        sent = ('The gunships are nearer than you think. '
                'Their heading is changing.')
        should = (
            (('The', ('The',)), ('gunships', ('guns', 'hips')),
             ('are', ('are',)), ('nearer', ('nearer',)),
             ('than', ('than',)), ('you', ('you',)),
             ('think', ('think',)), ('.', ('.',))),
            (('Their', ('Their',)), ('heading', ('heading',)),
             ('is', ('is',)), ('changing', ('changing',)), ('.', ('.',))))
        self._test_tok(vec_name, sent, 11, should)

    def _test_sent_2(self, vec_name: str):
        """Same vocabulary as sentence 1 with 'gunships' moved."""
        sent = 'The guns are near. Their heading is changing to the gunships.'
        should = (
            (('The', ('The',)), ('guns', ('guns',)), ('are', ('are',)),
             ('near', ('near',)), ('.', ('.',))),
            (('Their', ('Their',)), ('heading', ('heading',)),
             ('is', ('is',)), ('changing', ('changing',)), ('to', ('to',)),
             ('the', ('the',)), ('gunships', ('guns', 'hips')),
             ('.', ('.',))))
        self._test_tok(vec_name, sent, 11, should)

    def _test_sent_3(self, vec_name: str):
        """Single-sentence document."""
        sent = 'Their heading is changing to the gunships.'
        should = (
            (('Their', ('Their',)), ('heading', ('heading',)),
             ('is', ('is',)), ('changing', ('changing',)), ('to', ('to',)),
             ('the', ('the',)), ('gunships', ('guns', 'hips')),
             ('.', ('.',))),)
        self._test_tok(vec_name, sent, 11, should)

    def _test_sent_4(self, vec_name: str):
        """Three sentences with model-dependent word-piece splits."""
        sent = ('The guns are near. Their heading is changing to the '
                'gunships. The United States schooner created a '
                'gridlocking situation.')
        # RoBERTa's BPE vocabulary splits these two words differently
        is_roberta = vec_name == 'transformer_roberta'
        schooner = (('schooner', ('sch', 'oon', 'er'))
                    if is_roberta else ('schooner', ('schooner',)))
        gridlocking = (('gridlocking', ('grid', 'locking'))
                       if is_roberta
                       else ('gridlocking', ('grid', 'lock', 'ing')))
        should = (
            (('The', ('The',)), ('guns', ('guns',)), ('are', ('are',)),
             ('near', ('near',)), ('.', ('.',))),
            (('Their', ('Their',)), ('heading', ('heading',)),
             ('is', ('is',)), ('changing', ('changing',)), ('to', ('to',)),
             ('the', ('the',)), ('gunships', ('guns', 'hips')),
             ('.', ('.',))),
            (('The', ('The',)), ('United States', ('United', 'States')),
             schooner, ('created', ('created',)), ('a', ('a',)),
             gridlocking, ('situation', ('situation',)), ('.', ('.',))))
        self._test_tok(vec_name, sent, 14 if is_roberta else 13, should)

    def _test_all(self, vec_name: str):
        """Run every sentence fixture against the ``vec_name`` vectorizer."""
        self._test_sent_1(vec_name)
        self._test_sent_2(vec_name)
        self._test_sent_3(vec_name)
        self._test_sent_4(vec_name)

    def test_bert(self):
        self._test_all('transformer_bert')

    def test_roberta(self):
        self._test_all('transformer_roberta')

    def test_distilbert(self):
        self._test_all('transformer_distilbert')