Exemple #1
0
 def setUp(self):
     """Prepare a fresh, unbuilt indexer before each test.

     The custom test features are registered through CrossmapFeatures
     before the indexer is constructed; the indexer is not built here.
     """
     config = CrossmapSettings(config_plain, create_dir=True)
     config.tokens.k = 10
     # seed the db with the custom test features
     CrossmapFeatures(config, features=test_features)
     self.indexer = CrossmapIndexer(config)
     # path of the "targets" index file, as reported by the settings
     self.index_file = config.index_file("targets")
Exemple #2
0
 def test_skip_certain_docs(self):
     """docs that lack all of the limited features should be omitted"""
     settings = CrossmapSettings(config_plain, create_dir=True)
     settings.tokens.k = 10
     # restrict the feature set to the limited features held by the test
     CrossmapFeatures(settings, features=self.limited_features)
     indexer = CrossmapIndexer(settings)
     # the build must emit at least one log record at WARNING or above
     with self.assertLogs(level="WARNING") as cm:
         indexer.build()
     # assertIn reports both operands on failure, unlike assertTrue(x in y)
     self.assertIn("Skipping item", str(cm.output))
Exemple #3
0
    def setUpClass(cls):
        """build an indexer using a fixed featuremap"""

        config = CrossmapSettings(config_featuremap, create_dir=True)
        config.tokens.k = 20
        indexer = CrossmapIndexer(config)
        cls.indexer = indexer
        indexer.build()
        # keep the encoder's feature map at hand for the tests
        cls.feature_map = indexer.encoder.feature_map
Exemple #4
0
    def setUpClass(cls):
        """build an indexer using target documents only"""

        config = CrossmapSettings(config_single, create_dir=True)
        config.tokens.k = 10
        # register the custom test features before creating the indexer
        CrossmapFeatures(config, features=test_features)
        indexer = CrossmapIndexer(config)
        cls.indexer = indexer
        indexer.build()
Exemple #5
0
 def setUpClass(cls):
     """Build an indexer and a diffuser from the plain configuration.

     The diffuser's feature map and db, and the indexer's encoder, are
     exposed as class attributes.
     """
     config = CrossmapSettings(config_plain, create_dir=True)
     cls.indexer = indexer = CrossmapIndexer(config)
     indexer.build()
     cls.diffuser = diffuser = CrossmapDiffuser(config)
     diffuser.build()
     # shortcuts to the components shared by the tests
     cls.feature_map = diffuser.feature_map
     cls.db = diffuser.db
     cls.encoder = indexer.encoder
Exemple #6
0
    def test_indexer_build_rebuild(self):
        """run a build when indexes already exist"""

        # the test starts from a state without an index file
        self.assertFalse(exists(self.index_file))
        self.indexer.build()
        # the second indexer is created from scratch, with the same settings
        # build should detect presence of indexes and load instead
        with self.assertLogs(level="WARNING") as cm:
            newindexer = CrossmapIndexer(self.indexer.settings)
            newindexer.build()
        # assertIn reports both operands on failure, unlike assertTrue(x in y)
        self.assertIn("Skip", str(cm.output))
        # after build, the indexer should be ready to use
        ids_targets = newindexer.db.all_ids("targets")
        ids_docs = newindexer.db.all_ids("documents")
        self.assertEqual(len(ids_targets), 6, "dataset still has six items")
        # this count refers to the "documents" dataset, not the targets
        self.assertGreater(len(ids_docs), 6,
                           "documents still have several items")
        self.assertTrue(exists(newindexer.index_files["targets"]))
        self.assertTrue(exists(newindexer.index_files["documents"]))
Exemple #7
0
 def setUpClass(cls):
     """Build an indexer and a diffuser and cache dense target vectors.

     Uses the longword configuration. Besides the indexer, diffuser,
     feature map, db, encoder and the two tokenizers, the class gets a
     ``data`` dict mapping item id to the dense form of the item's data
     for the targets "L0" through "L4".
     """
     settings = CrossmapSettings(config_longword, create_dir=True)
     cls.indexer = CrossmapIndexer(settings)
     cls.indexer.build()
     cls.diffuser = CrossmapDiffuser(settings)
     cls.diffuser.build()
     cls.feature_map = cls.diffuser.feature_map
     cls.db = cls.diffuser.db
     cls.encoder = cls.indexer.encoder
     cls.plain_tokenizer = CrossmapTokenizer(settings)
     cls.diff_tokenizer = CrossmapDiffusionTokenizer(settings)
     # extract data vectors, keyed by item id
     target_ids = ["L0", "L1", "L2", "L3", "L4"]
     rows = cls.db.get_data(dataset="targets", ids=target_ids)
     cls.data = {row["id"]: sparse_to_dense(row["data"]) for row in rows}
Exemple #8
0
 def setUpClass(cls):
     """Build both an indexer and a diffuser from the plain configuration."""
     config = CrossmapSettings(config_plain, create_dir=True)
     cls.indexer = indexer = CrossmapIndexer(config)
     indexer.build()
     cls.diffuser = diffuser = CrossmapDiffuser(config)
     diffuser.build()
Exemple #9
0
class CrossmapIndexerBuildTests(unittest.TestCase):
    """Creating nearest neighbor indexes from documents and text tokens"""

    def setUp(self):
        """Create an unbuilt indexer; each test triggers the build itself."""
        settings = CrossmapSettings(config_plain, create_dir=True)
        settings.tokens.k = 10
        # initialize the db with custom features
        CrossmapFeatures(settings, features=test_features)
        self.indexer = CrossmapIndexer(settings)
        # path of the "targets" index file, as reported by the settings
        self.index_file = settings.index_file("targets")

    def tearDown(self):
        """Remove the "crossmap_simple" cache after each test."""
        # the tests assert that no index file exists before they build
        remove_crossmap_cache(data_dir, "crossmap_simple")

    def test_indexer_build(self):
        """build indexes from a simple configuration"""

        self.assertFalse(exists(self.index_file))
        self.indexer.build()
        ids_targets = self.indexer.db.all_ids("targets")
        ids_docs = self.indexer.db.all_ids("documents")
        self.assertEqual(len(ids_targets), 6, "dataset has six items")
        self.assertGreater(len(ids_docs), 6, "documents have several items")
        self.assertEqual(len(self.indexer.index_files), 2,
                         "one index for targets, one for documents")
        self.assertTrue(exists(self.indexer.index_files["targets"]))
        self.assertTrue(exists(self.indexer.index_files["documents"]))

    def test_indexer_load(self):
        """load prepared indexes from disk"""

        indexer = self.indexer
        self.assertFalse(exists(self.index_file))
        indexer.build()
        # discard the in-memory indexes so that load() has to restore them
        indexer.indexes = []
        indexer.load()
        self.assertEqual(len(indexer.indexes), 2)
        # both index and data db should record items
        self.assertEqual(len(indexer.db.all_ids("targets")), 6)
        self.assertGreater(len(indexer.db.all_ids("documents")), 6)

    def test_indexer_build_rebuild(self):
        """run a build when indexes already exist"""

        self.assertFalse(exists(self.index_file))
        self.indexer.build()
        # the second indexer is created from scratch, with the same settings
        # build should detect presence of indexes and load instead
        with self.assertLogs(level="WARNING") as cm:
            newindexer = CrossmapIndexer(self.indexer.settings)
            newindexer.build()
        # assertIn reports both operands on failure, unlike assertTrue(x in y)
        self.assertIn("Skip", str(cm.output))
        # after build, the indexer should be ready to use
        ids_targets = newindexer.db.all_ids("targets")
        ids_docs = newindexer.db.all_ids("documents")
        self.assertEqual(len(ids_targets), 6, "dataset still has six items")
        # this count refers to the "documents" dataset, not the targets
        self.assertGreater(len(ids_docs), 6,
                           "documents still have several items")
        self.assertTrue(exists(newindexer.index_files["targets"]))
        self.assertTrue(exists(newindexer.index_files["documents"]))

    def test_indexer_str(self):
        """str summarizes main properties"""

        # the reported index count goes from 0 to 2 once the build has run
        self.assertIn("Indexes:\t0", str(self.indexer))
        self.indexer.build()
        self.assertIn("Indexes:\t2", str(self.indexer))
Exemple #10
0
 def setUpClass(cls):
     """Build an indexer from the plain configuration and test features."""
     config = CrossmapSettings(config_plain, create_dir=True)
     config.tokens.k = 10
     # register the custom test features before creating the indexer
     CrossmapFeatures(config, features=test_features)
     cls.indexer = indexer = CrossmapIndexer(config)
     indexer.build()