def index_generator(db_file_path: str):
    """Yield one ``Document`` per vector stored in an fvecs file.

    The vectors are loaded with ``fvecs_read``. Each yielded document has
    the vector as its ``content`` and the vector's position in the file
    (as produced by ``enumerate``) under ``tags['id']``.

    :param db_file_path: path handed to ``fvecs_read``.
    :yield: a populated ``Document`` for every vector, in file order.
    """
    vectors = fvecs_read(db_file_path)
    for position, vector in enumerate(vectors):
        with Document() as document:
            document.content = vector
            document.tags['id'] = position
        yield document
def evaluate_generator(db_file_path: str, groundtruth_path: str):
    """Yield ``(query, groundtruth)`` document pairs for evaluation.

    Query vectors are loaded with ``fvecs_read`` and the ground-truth index
    lists with ``ivecs_read``; the two are paired positionally with ``zip``,
    so iteration stops at the shorter of the two.

    :param db_file_path: path handed to ``fvecs_read`` (query vectors).
    :param groundtruth_path: path handed to ``ivecs_read`` (expected ids).
    :yield: tuple ``(doc, groundtruth)`` where ``doc.content`` is the query
        vector and ``groundtruth.matches`` holds one ``Document`` per
        expected index, carrying that index as an ``int`` in ``tags['id']``.
    """
    queries = fvecs_read(db_file_path)
    expected_ids = ivecs_read(groundtruth_path)

    for query_vector, neighbour_ids in zip(queries, expected_ids):
        with Document() as query_doc:
            query_doc.content = query_vector

        with Document() as truth_doc:
            for neighbour in neighbour_ids:
                with Document() as expected_match:
                    # .item() unwraps the array scalar before the int() cast.
                    expected_match.tags['id'] = int(neighbour.item())
                truth_doc.matches.add(expected_match)

        yield query_doc, truth_doc
def read_data(db_file_path: str, batch_size: int, max_docs: int = None):
    """Yield ``(keys, vectors)`` batches from an fvecs file.

    The whole file is loaded with ``fvecs_read`` and then sliced into
    consecutive batches. The final batch may be shorter than ``batch_size``
    when the number of vectors is not an exact multiple of it, so no
    trailing vectors are dropped.

    :param db_file_path: path handed to ``fvecs_read``.
    :param batch_size: number of vectors per batch; ``-1`` is treated as
        ``1``. Other non-positive values are not validated (``0`` raises
        ``ZeroDivisionError`` when ``max_docs`` is ``None``).
    :param max_docs: when not ``None``, ``batch_size`` is ignored and a
        single batch with at most the first ``max_docs`` vectors is yielded.
    :yield: tuple ``(keys, vectors)`` where ``keys`` is an ``(n, 1)`` array
        of the row indices in the batch and ``vectors`` is the matching
        slice of the loaded array.
    """
    vectors = fvecs_read(db_file_path)
    num_vectors = vectors.shape[0]

    if max_docs is not None:
        # A document cap overrides batching: one batch of up to max_docs rows.
        batch_size = max_docs
        num_batches = 1
    else:
        batch_size = 1 if batch_size == -1 else batch_size
        # Ceiling division so a trailing partial batch is still emitted
        # (flooring here would silently drop the remainder).
        num_batches = (num_vectors + batch_size - 1) // batch_size

    for batch_idx in range(num_batches):
        start_batch = batch_idx * batch_size
        # Clamp the end so the last batch never runs past the data.
        end_batch = min(start_batch + batch_size, num_vectors)
        keys = np.arange(start_batch, end_batch).reshape(end_batch - start_batch, 1)
        yield keys, vectors[start_batch:end_batch]
def read_data(db_file_path: str):
    """Load the vectors stored at ``db_file_path``.

    Thin wrapper: the path is handed to ``fvecs_read`` and its result is
    returned unchanged.

    :param db_file_path: path handed to ``fvecs_read``.
    :return: whatever ``fvecs_read`` returns for that path.
    """
    vectors = fvecs_read(db_file_path)
    return vectors
__copyright__ = "Copyright (c) 2020 Jina AI Limited. All rights reserved."
__license__ = "Apache-2.0"

import gzip
import os

from read_vectors_files import fvecs_read

# Script: read the training vectors from the data directory and dump their
# raw bytes into a gzip-compressed file.
os.environ['TMP_DATA_DIR'] = '/tmp/jina/faiss/siftsmall'

# NOTE: despite the ``.tgz`` suffix, the output is a plain gzip stream of the
# array's raw bytes (``tobytes()``), not a tar archive.
train_filepath = 'workspace/train.tgz'
train_fvecs_path = os.path.join(os.environ['TMP_DATA_DIR'], 'siftsmall_learn.fvecs')
train_data = fvecs_read(train_fvecs_path)

# Make sure the output directory exists; gzip.open() would otherwise fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(train_filepath), exist_ok=True)

# compresslevel=1 selects the fastest (least aggressive) gzip compression.
with gzip.open(train_filepath, 'wb', compresslevel=1) as f:
    f.write(train_data.tobytes())