Exemple #1
0
 def test_skip_certain_docs(self):
     """Docs that have none of the limited features should be omitted.

     Builds indexes using the feature set in self.limited_features and
     checks that the build emits a log record (WARNING level or above)
     whose text contains "Skipping item".
     """
     settings = CrossmapSettings(config_plain, create_dir=True)
     settings.tokens.k = 10
     # set up the limited feature set before the indexer is created
     CrossmapFeatures(settings, features=self.limited_features)
     indexer = CrossmapIndexer(settings)
     with self.assertLogs(level="WARNING") as cm:
         indexer.build()
     # assertIn shows the captured output when the message is missing,
     # whereas assertTrue would only report "False is not true"
     self.assertIn("Skipping item", str(cm.output))
Exemple #2
0
    def test_indexer_build_rebuild(self):
        """Run a build when indexes already exist.

        A first build creates the index files. A second indexer, created
        with the same settings, is then asked to build again; this is
        expected to log a message containing "Skip" and still leave the
        second indexer with populated data and index files on disk.
        """

        self.assertFalse(exists(self.index_file))
        self.indexer.build()
        # the second indexer is created from scratch, with the same settings
        # build should detect presence of indexes and load instead
        with self.assertLogs(level="WARNING") as cm:
            newindexer = CrossmapIndexer(self.indexer.settings)
            newindexer.build()
        # assertIn shows the captured log output if the message is missing
        self.assertIn("Skip", str(cm.output))
        # after build, the indexer should be ready to use
        ids_targets = newindexer.db.all_ids("targets")
        ids_docs = newindexer.db.all_ids("documents")
        self.assertEqual(len(ids_targets), 6, "dataset still has six items")
        self.assertGreater(len(ids_docs), 6, "documents have many items")
        self.assertTrue(exists(newindexer.index_files["targets"]))
        self.assertTrue(exists(newindexer.index_files["documents"]))
Exemple #3
0
class CrossmapIndexerBuildTests(unittest.TestCase):
    """Creating nearest neighbor indexes from documents and text tokens"""

    def setUp(self):
        """Create a fresh indexer (not yet built) for each test."""
        settings = CrossmapSettings(config_plain, create_dir=True)
        settings.tokens.k = 10
        # initialize the db with custom features
        CrossmapFeatures(settings, features=test_features)
        self.indexer = CrossmapIndexer(settings)
        # path used by tests to confirm that no index exists before a build
        self.index_file = settings.index_file("targets")

    def tearDown(self):
        """Clean up after each test via remove_crossmap_cache."""
        remove_crossmap_cache(data_dir, "crossmap_simple")

    def test_indexer_build(self):
        """build indexes from a simple configuration"""

        self.assertFalse(exists(self.index_file))
        self.indexer.build()
        ids_targets = self.indexer.db.all_ids("targets")
        ids_docs = self.indexer.db.all_ids("documents")
        self.assertEqual(len(ids_targets), 6, "dataset has six items")
        self.assertGreater(len(ids_docs), 6, "documents have several items")
        self.assertEqual(len(self.indexer.index_files), 2,
                         "one index for targets, one for documents")
        self.assertTrue(exists(self.indexer.index_files["targets"]))
        self.assertTrue(exists(self.indexer.index_files["documents"]))

    def test_indexer_load(self):
        """prepared indexes from disk"""

        indexer = self.indexer
        self.assertFalse(exists(self.index_file))
        indexer.build()
        # discard the in-memory indexes so that load() must restore them
        indexer.indexes = []
        indexer.load()
        self.assertEqual(len(indexer.indexes), 2)
        # both index and data db should record items
        self.assertEqual(len(indexer.db.all_ids("targets")), 6)
        self.assertGreater(len(indexer.db.all_ids("documents")), 6)

    def test_indexer_build_rebuild(self):
        """run a build when indexes already exist"""

        self.assertFalse(exists(self.index_file))
        self.indexer.build()
        # the second indexer is created from scratch, with the same settings
        # build should detect presence of indexes and load instead
        with self.assertLogs(level="WARNING") as cm:
            newindexer = CrossmapIndexer(self.indexer.settings)
            newindexer.build()
        # assertIn shows the captured log output if the message is missing
        self.assertIn("Skip", str(cm.output))
        # after build, the indexer should be ready to use
        ids_targets = newindexer.db.all_ids("targets")
        ids_docs = newindexer.db.all_ids("documents")
        self.assertEqual(len(ids_targets), 6, "dataset still has six items")
        self.assertGreater(len(ids_docs), 6, "documents have many items")
        self.assertTrue(exists(newindexer.index_files["targets"]))
        self.assertTrue(exists(newindexer.index_files["documents"]))

    def test_indexer_str(self):
        """str summarizes main properties"""

        # the summary reports the number of indexes before and after a build
        self.assertIn("Indexes:\t0", str(self.indexer))
        self.indexer.build()
        self.assertIn("Indexes:\t2", str(self.indexer))