def test_skip_certain_docs(self):
    """Items that have none of the limited features should be omitted.

    The build is expected to emit a WARNING-level log record whose text
    contains "Skipping item".
    """
    settings = CrossmapSettings(config_plain, create_dir=True)
    settings.tokens.k = 10
    # set up features from the limited set before creating the indexer
    CrossmapFeatures(settings, features=self.limited_features)
    indexer = CrossmapIndexer(settings)
    with self.assertLogs(level="WARNING") as cm:
        indexer.build()
    # assertIn shows the captured log output if the check fails
    self.assertIn("Skipping item", str(cm.output))
def test_indexer_build_rebuild(self):
    """Run a build when indexes already exist.

    A second indexer built with the same settings is expected to log a
    WARNING containing "Skip" and still end up with usable data and
    index files.
    """
    self.assertFalse(exists(self.index_file))
    self.indexer.build()
    # the second indexer is created from scratch, with the same settings
    # build should detect presence of indexes and load instead
    with self.assertLogs(level="WARNING") as cm:
        newindexer = CrossmapIndexer(self.indexer.settings)
        newindexer.build()
    # assertIn shows the captured log output if the check fails
    self.assertIn("Skip", str(cm.output))
    # after build, the indexer should be ready to use
    ids_targets = newindexer.db.all_ids("targets")
    ids_docs = newindexer.db.all_ids("documents")
    self.assertEqual(len(ids_targets), 6, "dataset still has six items")
    # message refers to documents, matching the collection being checked
    self.assertGreater(len(ids_docs), 6,
                       "documents still have many items")
    self.assertTrue(exists(newindexer.index_files["targets"]))
    self.assertTrue(exists(newindexer.index_files["documents"]))
class CrossmapIndexerBuildTests(unittest.TestCase):
    """Creating nearest neighbor indexes from documents and text tokens"""

    def setUp(self):
        """Create settings, custom features, and a fresh indexer."""
        settings = CrossmapSettings(config_plain, create_dir=True)
        settings.tokens.k = 10
        # initialize the db with custom features
        CrossmapFeatures(settings, features=test_features)
        self.indexer = CrossmapIndexer(settings)
        self.index_file = settings.index_file("targets")

    def tearDown(self):
        """Remove cached crossmap files so each test starts clean."""
        remove_crossmap_cache(data_dir, "crossmap_simple")

    def test_indexer_build(self):
        """Build indexes from a simple configuration."""
        self.assertFalse(exists(self.index_file))
        self.indexer.build()
        ids_targets = self.indexer.db.all_ids("targets")
        ids_docs = self.indexer.db.all_ids("documents")
        self.assertEqual(len(ids_targets), 6, "dataset has six items")
        self.assertGreater(len(ids_docs), 6, "documents have several items")
        self.assertEqual(len(self.indexer.index_files), 2,
                         "one index for targets, one for documents")
        self.assertTrue(exists(self.indexer.index_files["targets"]))
        self.assertTrue(exists(self.indexer.index_files["documents"]))

    def test_indexer_load(self):
        """Load prepared indexes from disk."""
        indexer = self.indexer
        self.assertFalse(exists(self.index_file))
        indexer.build()
        # drop the in-memory indexes so that load() has to repopulate them
        indexer.indexes = []
        indexer.load()
        self.assertEqual(len(indexer.indexes), 2)
        # both index and data db should record items
        self.assertEqual(len(indexer.db.all_ids("targets")), 6)
        self.assertGreater(len(indexer.db.all_ids("documents")), 6)

    def test_indexer_build_rebuild(self):
        """Run a build when indexes already exist."""
        self.assertFalse(exists(self.index_file))
        self.indexer.build()
        # the second indexer is created from scratch, with the same settings
        # build should detect presence of indexes and load instead
        with self.assertLogs(level="WARNING") as cm:
            newindexer = CrossmapIndexer(self.indexer.settings)
            newindexer.build()
        # assertIn shows the captured log output if the check fails
        self.assertIn("Skip", str(cm.output))
        # after build, the indexer should be ready to use
        ids_targets = newindexer.db.all_ids("targets")
        ids_docs = newindexer.db.all_ids("documents")
        self.assertEqual(len(ids_targets), 6, "dataset still has six items")
        # message refers to documents, matching the collection being checked
        self.assertGreater(len(ids_docs), 6,
                           "documents still have many items")
        self.assertTrue(exists(newindexer.index_files["targets"]))
        self.assertTrue(exists(newindexer.index_files["documents"]))

    def test_indexer_str(self):
        """str() summarizes main properties, including the index count."""
        self.assertIn("Indexes:\t0", str(self.indexer))
        self.indexer.build()
        self.assertIn("Indexes:\t2", str(self.indexer))