def test_save_and_load(encoder):
    encoder.save_config()
    assert os.path.exists(encoder.config_abspath)
    test_data = np.array(['a', 'b', 'c', 'x', '!'])
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    encoder_loaded = BaseExecutor.load(encoder.save_abspath)
    encoded_data_test = encoder_loaded.encode(test_data)
    assert encoder_loaded.max_length == encoder.max_length
    np.testing.assert_array_equal(encoded_data_control, encoded_data_test)

def test_save_and_load(mocker):
    metas = get_metas()
    encoder = FarmTextEncoder(metas=metas)
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    encoder_loaded = BaseExecutor.load(encoder.save_abspath)
    encoded_data_test = encoder_loaded.encode(test_data)
    assert encoder_loaded.pretrained_model_name_or_path == encoder.pretrained_model_name_or_path
    np.testing.assert_array_equal(encoded_data_control, encoded_data_test)
    rm_files([encoder.save_abspath])

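# Several snippets in this section call `rm_files` for cleanup; its definition
# is not part of this section. A minimal sketch of what such a helper might
# look like (the body below is an assumption, not the original code):
import os
import shutil


def rm_files(file_paths):
    # best-effort removal of test artifacts; directories are removed recursively
    for path in file_paths:
        if os.path.isdir(path):
            shutil.rmtree(path, ignore_errors=True)
        elif os.path.exists(path):
            os.remove(path)
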
def test_save_and_load(self):
    encoder = self.get_encoder()
    if encoder is None:
        return
    test_data = np.random.rand(10, self.input_dim)
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    self.assertTrue(os.path.exists(encoder.save_abspath))
    encoder_loaded = BaseExecutor.load(encoder.save_abspath)
    encoded_data_test = encoder_loaded.encode(test_data)
    np.testing.assert_array_equal(encoded_data_test, encoded_data_control)

def test_save_and_load(self):
    encoder = self.get_encoder()
    test_data = np.array(['it is a good day!', 'the dog sits on the floor.'])
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    self.assertTrue(os.path.exists(encoder.save_abspath))
    encoder_loaded = BaseExecutor.load(encoder.save_abspath)
    encoded_data_test = encoder_loaded.encode(test_data)
    self.assertEqual(encoder_loaded.max_length, encoder.max_length)
    np.testing.assert_array_equal(encoded_data_control, encoded_data_test)

def test_incremental_indexing_parallel_indexers(random_workspace):
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'uniq_vectorindexer.yml'), name='inc_vec')
         .add(uses=os.path.join(cur_dir, 'uniq_docindexer.yml'), name='inc_doc', needs=['gateway'])
         .add(needs=['inc_vec', 'inc_doc']))

    with f:
        f.index(duplicate_docs[:500])
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'vec_idx.bin') as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer._size == num_uniq_docs

    with BaseExecutor.load(random_workspace / 'doc_idx.bin') as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer._size == num_uniq_docs

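# `get_duplicate_docs` is a shared helper that the indexing tests in this
# section rely on; its definition is not shown here. A minimal sketch, assuming
# it reuses the same `jina_pb2`/`array2pb`/`np` names as the tests above and
# that consecutive document pairs share content (all of this is an assumption):
def get_duplicate_docs(num_docs=10):
    result = []
    for idx in range(num_docs):
        doc = jina_pb2.Document()
        content = idx // 2  # each content value appears twice in a row
        doc.embedding.CopyFrom(array2pb(np.array([content])))
        doc.text = f'I am doc{content}'
        result.append(doc)
    # the number of unique documents is the number of distinct contents
    num_uniq_docs = len({doc.text for doc in result})
    return result, num_uniq_docs
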
def test_save_and_load():
    input_dim = 224
    encoder = get_encoder()
    test_data = np.random.rand(2, 3, input_dim, input_dim)
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    encoder_loaded = BaseExecutor.load(encoder.save_abspath)
    encoded_data_test = encoder_loaded.encode(test_data)
    assert encoder_loaded.channel_axis == encoder.channel_axis
    np.testing.assert_array_equal(encoded_data_control, encoded_data_test)

def test_save_and_load(mocker):
    metas = get_metas()
    encoder = UniversalSentenceEncoder(metas=metas)
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    encoder_loaded = BaseExecutor.load(encoder.save_abspath)
    encoded_data_test = encoder_loaded.encode(test_data)
    assert encoder_loaded.model_url == encoder.model_url
    np.testing.assert_array_equal(encoded_data_control, encoded_data_test)
    rm_files([encoder.save_abspath])

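# `get_metas` is another assumed fixture: the tests above pass its return value
# as `metas` so each executor writes into a disposable workspace. A minimal
# sketch under that assumption:
import tempfile


def get_metas():
    # point the executor's workspace at a throwaway directory so saved
    # artifacts never collide between test runs
    return {'workspace': tempfile.mkdtemp()}
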
def test_close_and_load_executor():
    with Flow().add(uses=os.path.join(cur_dir, 'yaml/slowexecutor.yml')).build() as f:
        pass
    # renamed from `exec` to avoid shadowing the Python builtin
    executor = BaseExecutor.load(save_abs_path)
    assert isinstance(executor, SlowSaveExecutor)
    assert hasattr(executor, 'test')
    assert executor.test == 10
    assert executor.save_abspath == save_abs_path
    os.remove(save_abs_path)

def test_save_and_load():
    encoder = get_encoder(model_path)
    test_data = np.random.rand(num_samples, 3, input_dim, input_dim)
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    encoder_loaded = BaseExecutor.load(encoder.save_abspath)
    encoded_data_test = encoder_loaded.encode(test_data)
    assert encoder_loaded.raw_model_path == encoder.raw_model_path
    np.testing.assert_array_equal(encoded_data_control, encoded_data_test)
    rm_files([encoder.save_abspath, encoder.config_abspath])

def test_save_and_load(*args, **kwargs):
    encoder = get_encoder()
    test_data = np.random.rand(batch_size, num_frames, channel, input_dim, input_dim)
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    encoder_loaded = BaseExecutor.load(encoder.save_abspath)
    encoded_data_test = encoder_loaded.encode(test_data)
    assert encoder_loaded.channel_axis == encoder.channel_axis
    np.testing.assert_array_equal(encoded_data_control, encoded_data_test)
    rm_files([encoder.save_abspath, encoder.config_abspath])

def test_save_and_load():
    encoder = TSNEEncoder(output_dim=target_output_dim)
    assert encoder is not None
    test_data = np.random.rand(10, input_dim)
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    encoder_loaded = BaseExecutor.load(encoder.save_abspath)
    encoded_data_test = encoder_loaded.encode(test_data)
    np.testing.assert_array_equal(encoded_data_test, encoded_data_control)
    rm_files([encoder.save_abspath])

def test_incremental_indexing_parallel_indexers(tmpdir):
    os.environ['JINA_TEST_INCREMENTAL_INDEX_WORKSPACE'] = str(tmpdir)

    doc0 = jina_pb2.Document()
    doc0.embedding.CopyFrom(array2pb(np.array([0])))
    doc0.text = 'I am doc0'
    # doc1 and doc2 carry identical text and embeddings: deliberate duplicates
    # that only differ in document identity
    doc1 = jina_pb2.Document()
    doc1.embedding.CopyFrom(array2pb(np.array([2])))
    doc1.text = 'I am doc2'
    doc2 = jina_pb2.Document()
    doc2.embedding.CopyFrom(array2pb(np.array([2])))
    doc2.text = 'I am doc2'

    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'vectorindexer.yml'), shards=1, name='vec_idx')
         .add(uses=os.path.join(cur_dir, 'docindexer.yml'), shards=1, name='doc_idx', needs=['gateway'])
         .add(uses='_merge', needs=['vec_idx', 'doc_idx'], name='join_all'))

    with f:
        f.index([doc0, doc1])

    with BaseExecutor.load(os.path.join(tmpdir, 'vec_idx.bin')) as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer._size == 2
    with BaseExecutor.load(os.path.join(tmpdir, 'doc_idx.bin')) as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer._size == 2

    with f:
        f.index([doc0, doc2])

    with BaseExecutor.load(os.path.join(tmpdir, 'vec_idx.bin')) as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer._size == 3
    with BaseExecutor.load(os.path.join(tmpdir, 'doc_idx.bin')) as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer._size == 3

    del os.environ['JINA_TEST_INCREMENTAL_INDEX_WORKSPACE']

def test_incremental_indexing_parallel_indexers_with_shards(random_workspace):
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)
    num_shards = 4

    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'vectorindexer.yml'),
              uses_before='_unique',
              shards=num_shards,
              name='inc_vec',
              separated_workspace=True)
         .add(uses=os.path.join(cur_dir, 'docindexer.yml'),
              uses_before='_unique',
              shards=num_shards,
              name='inc_doc',
              needs=['gateway'],
              separated_workspace=True)
         .add(uses='_merge', needs=['inc_vec', 'inc_doc']))

    with f:
        f.index(duplicate_docs[:500])
        f.index(duplicate_docs)

    vect_idx_size = 0
    for shard_idx in range(num_shards):
        save_abspath = random_workspace / f'vec_idx-{shard_idx + 1}' / 'vec_idx.bin'
        with BaseExecutor.load(save_abspath) as vector_indexer:
            assert isinstance(vector_indexer, NumpyIndexer)
            vect_idx_size += vector_indexer._size
    assert vect_idx_size == num_uniq_docs

    doc_idx_size = 0
    for shard_idx in range(num_shards):
        save_abspath = random_workspace / f'doc_idx-{shard_idx + 1}' / 'doc_idx.bin'
        with BaseExecutor.load(save_abspath) as doc_indexer:
            assert isinstance(doc_indexer, BinaryPbIndexer)
            doc_idx_size += doc_indexer._size
    assert doc_idx_size == num_uniq_docs

def test_incremental_indexing_sequential_indexers_with_shards(random_workspace, restful):
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)
    num_shards = 4

    # can't use plain _unique in uses_before because its workspace would
    # conflict with other tests
    f = (Flow(restful=restful)
         .add(uses=os.path.join(cur_dir, 'vectorindexer.yml'),
              uses_before=os.path.join(cur_dir, '_unique_vec.yml'),
              shards=num_shards,
              separated_workspace=True)
         .add(uses=os.path.join(cur_dir, 'docindexer.yml'),
              uses_before=os.path.join(cur_dir, '_unique_doc.yml'),
              shards=num_shards,
              separated_workspace=True))

    with f:
        f.index(duplicate_docs[:500])
    with f:
        f.index(duplicate_docs)

    vect_idx_size = 0
    for shard_idx in range(num_shards):
        save_abspath = random_workspace / f'vec_idx-{shard_idx + 1}' / 'vec_idx.bin'
        with BaseExecutor.load(save_abspath) as vector_indexer:
            assert isinstance(vector_indexer, NumpyIndexer)
            vect_idx_size += vector_indexer._size
    assert vect_idx_size == num_uniq_docs

    doc_idx_size = 0
    for shard_idx in range(num_shards):
        save_abspath = random_workspace / f'doc_idx-{shard_idx + 1}' / 'doc_idx.bin'
        with BaseExecutor.load(save_abspath) as doc_indexer:
            assert isinstance(doc_indexer, BinaryPbIndexer)
            doc_idx_size += doc_indexer._size
    assert doc_idx_size == num_uniq_docs

def test_unique_indexing_docindexers_before(random_workspace):
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'docindexer.yml'), uses_before='_unique'))

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'doc_idx.bin') as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer.size == num_uniq_docs

def test_unique_indexing_docindexers(random_workspace, restful, separated_workspace):
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    f = (Flow(restful=restful)
         .add(uses=os.path.join(cur_dir, 'uniq_docindexer.yml'),
              shards=1,
              separated_workspace=separated_workspace))

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'doc_idx.bin') as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer.size == num_uniq_docs

def test_unique_indexing_vecindexers(random_workspace, restful):
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    f = (Flow(restful=restful)
         .add(uses=os.path.join(cur_dir, 'uniq_vectorindexer.yml'), name='vec_idx'))

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'vec_idx.bin') as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer.size == num_uniq_docs

def test_save_and_load(self):
    encoder = self.get_encoder()
    if encoder is None:
        return
    test_data = np.random.rand(2, 3, 3, 224, 224)
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    self.assertTrue(os.path.exists(encoder.save_abspath))
    encoder_loaded = BaseExecutor.load(encoder.save_abspath)
    encoded_data_test = encoder_loaded.encode(test_data)
    self.assertEqual(encoder_loaded.model_name, encoder.model_name)
    np.testing.assert_array_equal(encoded_data_control, encoded_data_test)

def test_save_and_load(encoder):
    input_dim = 224
    test_data = np.random.rand(2, 3, input_dim, input_dim)
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    encoder_loaded = BaseExecutor.load(encoder.save_abspath)
    encoded_data_test = encoder_loaded.encode(test_data)
    assert encoder_loaded.channel_axis == encoder.channel_axis
    assert encoder_loaded.pool_strategy == encoder.pool_strategy
    assert encoder_loaded.layer_name == encoder.layer_name
    np.testing.assert_array_equal(encoded_data_control, encoded_data_test)

def test_load_yaml2(test_metas):
    a = BaseExecutor.load_config(os.path.join(cur_dir, 'yaml/test-exec-with-driver.yml'))
    assert len(a._drivers) == 2
    # should be able to auto fill in ControlRequest
    assert 'ControlRequest' in a._drivers
    a.save_config()
    p = a.config_abspath
    b = BaseExecutor.load_config(p)
    assert a._drivers == b._drivers
    a.touch()
    a.save()
    c = BaseExecutor.load(a.save_abspath)
    assert a._drivers == c._drivers

def test_dump_executor_with_drivers(tmpdir):
    a = BaseExecutor.load_config(f'{cur_dir}/yaml/route.yml')
    a.touch()
    a._drivers['ControlRequest'][0].idle_dealer_ids = ('hello', 'there')
    a.save(str(tmpdir / 'a.bin'))
    print(a._drivers)

    b = BaseExecutor.load(str(tmpdir / 'a.bin'))
    print(b._drivers)
    # the loaded executor gets its own driver objects, and runtime-only state
    # such as idle_dealer_ids is not persisted across the dump/load round-trip
    assert id(b._drivers['ControlRequest'][0]) != id(a._drivers['ControlRequest'][0])
    assert not b._drivers['ControlRequest'][0].idle_dealer_ids

def save_and_load(encoder, requires_train_after_load):
    test_data = np.random.rand(10, input_dim)
    encoded_data_control = encoder.encode(test_data)
    encoder.touch()
    encoder.save()
    assert os.path.exists(encoder.save_abspath)
    if not requires_train_after_load:
        # some models are not deterministic when training, so even with the same
        # training data we cannot ensure the same encoding results
        encoder_loaded = BaseExecutor.load(encoder.save_abspath)
        encoded_data_test = encoder_loaded.encode(test_data)
        np.testing.assert_array_equal(encoded_data_test, encoded_data_control)

def test_incremental_indexing_sequential_indexers(random_workspace):
    total_docs = 20
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'uniq_vectorindexer.yml'))
         .add(uses=os.path.join(cur_dir, 'uniq_docindexer.yml')))

    PyClient.check_input(duplicate_docs[:10])
    PyClient.check_input(duplicate_docs)

    with f:
        f.index(duplicate_docs[:10])
    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'vec_idx.bin') as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer._size == num_uniq_docs

    with BaseExecutor.load(random_workspace / 'doc_idx.bin') as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer._size == num_uniq_docs

def test_incremental_indexing_parallel_indexers_with_shards(tmpdir):
    os.environ['JINA_TEST_INCREMENTAL_INDEX_WORKSPACE'] = str(tmpdir)

    docs = []
    for i in range(1000):
        doc = jina_pb2.Document()
        doc.embedding.CopyFrom(array2pb(np.array([i])))
        doc.text = f'I am doc{i}'
        docs.append(doc)

    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'vectorindexer.yml'), shards=3, name='vec_idx')
         .add(uses=os.path.join(cur_dir, 'docindexer.yml'), shards=3, name='doc_idx', needs=['gateway'])
         .add(uses='_merge', needs=['vec_idx', 'doc_idx'], name='join_all'))

    with f:
        f.index(docs[:900])

    with BaseExecutor.load(os.path.join(tmpdir, 'vec_idx.bin')) as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer._size == 900
    with BaseExecutor.load(os.path.join(tmpdir, 'doc_idx.bin')) as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer._size == 900

    with f:
        f.index(docs[:950])

    with BaseExecutor.load(os.path.join(tmpdir, 'vec_idx.bin')) as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer._size == 950
    with BaseExecutor.load(os.path.join(tmpdir, 'doc_idx.bin')) as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer._size == 950

    del os.environ['JINA_TEST_INCREMENTAL_INDEX_WORKSPACE']

def test_incremental_indexing_sequential_indexers_with_shards(tmpdir):
    os.environ['JINA_TEST_INCREMENTAL_INDEX_WORKSPACE'] = str(tmpdir)
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)
    num_shards = 4

    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'vectorindexer.yml'),
              uses_before='_unique',
              shards=num_shards,
              separated_workspace=True)
         .add(uses=os.path.join(cur_dir, 'docindexer.yml'),
              uses_before='_unique',
              shards=num_shards,
              separated_workspace=True))

    with f:
        f.index(duplicate_docs[:500])
        f.index(duplicate_docs)

    vect_idx_size = 0
    for shard_idx in range(num_shards):
        save_abspath = os.path.join(tmpdir, f'vec_idx-{shard_idx + 1}', 'vec_idx.bin')
        with BaseExecutor.load(save_abspath) as vector_indexer:
            assert isinstance(vector_indexer, NumpyIndexer)
            vect_idx_size += vector_indexer._size
    assert vect_idx_size == num_uniq_docs

    doc_idx_size = 0
    for shard_idx in range(num_shards):
        save_abspath = os.path.join(tmpdir, f'doc_idx-{shard_idx + 1}', 'doc_idx.bin')
        with BaseExecutor.load(save_abspath) as doc_indexer:
            assert isinstance(doc_indexer, BinaryPbIndexer)
            doc_idx_size += doc_indexer._size
    assert doc_idx_size == num_uniq_docs

    del os.environ['JINA_TEST_INCREMENTAL_INDEX_WORKSPACE']

def test_load_yaml2(tmpdir):
    os.environ['JINA_TEST_EXEC_WITH_DRIVER'] = str(tmpdir)
    a = BaseExecutor.load_config(os.path.join(cur_dir, 'yaml/test-exec-with-driver.yml'))
    assert len(a._drivers) == 2
    # should be able to auto fill in ControlRequest
    assert 'ControlRequest' in a._drivers
    a.save_config()
    p = a.config_abspath
    b = BaseExecutor.load_config(p)
    assert a._drivers == b._drivers
    a.touch()
    a.save()
    c = BaseExecutor.load(a.save_abspath)
    assert a._drivers == c._drivers
    del os.environ['JINA_TEST_EXEC_WITH_DRIVER']

def test_unique_indexing_docindexers_before(random_workspace, restful):
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    # can't use plain _unique because workspace will conflict with other tests
    f = (Flow(restful=restful)
         .add(uses=os.path.join(cur_dir, 'docindexer.yml'),
              uses_before=os.path.join(cur_dir, '_unique_doc.yml')))

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'doc_idx.bin') as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer.size == num_uniq_docs

def test_load_yaml2(self):
    a = BaseExecutor.load_config('yaml/test-exec-with-driver.yml')
    assert len(a._drivers) == 2
    # should be able to auto fill in ControlRequest
    self.assertTrue('ControlRequest' in a._drivers)
    a.save_config()
    p = a.config_abspath
    b = BaseExecutor.load_config(p)
    assert a._drivers == b._drivers
    self.add_tmpfile(p)
    a.touch()
    a.save()
    c = BaseExecutor.load(a.save_abspath)
    assert a._drivers == c._drivers
    self.add_tmpfile(a.save_abspath)

def test_unique_indexing_vecindexers_before(random_workspace):
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    # can't use plain _unique because workspace will conflict with other tests
    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'vectorindexer.yml'),
              uses_before=os.path.join(cur_dir, '_unique_vec.yml')))

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'vec_idx.bin') as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer.size == num_uniq_docs

def test_load_yaml2(self):
    a = BaseExecutor.load_config(os.path.join(cur_dir, 'yaml/test-exec-with-driver.yml'))
    self.assertEqual(len(a._drivers), 2)
    # should be able to auto fill in ControlRequest
    self.assertTrue('ControlRequest' in a._drivers)
    a.save_config()
    p = a.config_abspath
    b = BaseExecutor.load_config(p)
    self.assertEqual(a._drivers, b._drivers)
    self.add_tmpfile(p)
    a.touch()
    a.save()
    c = BaseExecutor.load(a.save_abspath)
    self.assertEqual(a._drivers, c._drivers)
    self.add_tmpfile(a.save_abspath)