def swap_index(self, location):
    logging.debug("Swapping index to FAISS")
    faiss_location = location + "/h_faiss"
    # init faiss index
    self.index = hfaiss.Faiss(faiss_location)
    # train faiss index
    self.index.init_faiss(self.training_data)
    # migrate data
    for idx_ in range(int(self.KV_store.get(byt(-1)))):
        value = self.KV_store.get(byt(idx_))
        if value:
            cid_len_ = int(value[:2]) + 2
            self.index.add_vectors([{
                "_id": int(idx_),
                "code": CID.bson2doc(value[cid_len_:])["code"]
            }])
    # set active index
    self.active_index = INDEX_LABEL[1]
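# A minimal sketch (not in the original source) of the KV record layout that
# swap_index() and get_nearest() both assume: each stored value is the
# 2-character ASCII length of the CID, then the CID bytes, then the
# BSON-encoded document. decode_record() is a hypothetical helper for
# illustration; CID.bson2doc is the project's own function.
def decode_record(value):
    """Split a raw KV-store value into (cid_bytes, document)."""
    cid_len = int(value[:2])                    # first 2 bytes: CID length
    cid = value[2:2 + cid_len]                  # next cid_len bytes: the CID
    doc = CID.bson2doc(value[2 + cid_len:])     # remaining bytes: BSON document
    return cid, doc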
def test_2_db_exist_create(self):
    schema_def1 = {
        "description": "this is my database",
        "unique": "r8and0mseEd90",
        "encoder": "example.com/autoencoder/API",
        "codelen": 3,
        "metadata": {
            "name": "string",
            "age": "number"
        }
    }
    schema_def2 = {
        "description": "this is my database",
        "unique": "r8and0mseEd90",
        "encoder": "example.com/autoencoder/API",
        "codelen": 3,
        "metadata": {
            "name": "string",
            "age": "number"
        }
    }
    database_name = router.create_database(schema_def1)
    schema_def = schema.generate_schema(schema_def2)
    database_name_ = CID.doc2CID(schema_def)
    self.assertEqual(database_name, database_name_, "DB name doesn't match")
def get_nearest(self, qmatrix, k, rad):
    ids = []
    dists = []
    qmatrix = self.resize_matrix(qmatrix,
                                 int(os.environ["FIXED_VEC_DIMENSION"]))
    if rad is not None:
        # radius defined: range search, optionally capped at k per query
        ids, dists = self.index.get_nearest_rad(qmatrix, rad)
        if k is not None:
            ids = [row[:k] for row in ids]
            dists = [row[:k] for row in dists]
    else:
        ids, dists = self.index.get_nearest_k(qmatrix, k)
    # get docs: replace each internal id with its stored document
    for idx_, idb in enumerate(ids):
        for idx__, id_ in enumerate(idb):
            value = self.KV_store.get(byt(id_))
            if value:
                cid_len_ = int(value[:2]) + 2
                ids[idx_][idx__] = CID.bson2doc(value[cid_len_:])
            else:
                ids[idx_][idx__] = None
    return ids, dists
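# Hedged usage sketch (not in the original source): `manager_h` is assumed to
# be a VecManager instance, and FIXED_VEC_DIMENSION must be set in the
# environment. A k-NN query passes rad=None; a range query passes a radius
# and an optional per-query cap k, matching the branches in get_nearest().
def example_queries(manager_h):
    top5_docs, top5_dists = manager_h.get_nearest([[0.1, 0.2, 0.3]], 5, None)
    range_docs, range_dists = manager_h.get_nearest([[0.1, 0.2, 0.3]], None, 0.8)
    return top5_docs, range_docs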
def test_1_auth_create_db(self):
    # deploy app
    index.server.start()
    schema_def = {
        "description": "this is my database",
        "unique": "r8and0mseEd905",
        "encoder": "example.com/autoencoder/API",
        "codelen": 30,
        "metadata": {
            "name": "string",
            "age": "number"
        }
    }
    data_ = {"schema": schema_def}
    data_bson = bson.dumps(data_)
    # generate hash of the BSON payload
    hash_ = SHA384.new()
    hash_.update(data_bson)
    # sign with private key
    signer = pkcs1_15.new(priv_key)
    signature = signer.sign(hash_)
    signature = base58.b58encode(signature).decode("utf-8")
    url = "http://127.0.0.1:5001/db/create"
    headers = CaseInsensitiveDict()
    headers["Content-Type"] = "application/json"
    data = {"data": data_, "signature": signature}
    data = json.dumps(data)
    resp = requests.post(url, headers=headers, data=data)
    database_name_ = resp.json()["database_name"]
    # the expected name is the CID of the generated schema
    schema_def = schema.generate_schema(schema_def)
    database_name = CID.doc2CID(schema_def)
    index.server.terminate()
    index.server.join()
    self.assertEqual(database_name, database_name_, "DB name doesn't match")
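# Hedged counterpart sketch (not in the original source): how the server side
# could verify the request produced by the test above, using the same
# PyCryptodome primitives. `pub_key` and the helper name are assumptions;
# only the SHA384 / PKCS#1 v1.5 / base58 scheme is taken from the test.
import base58
import bson
from Crypto.Hash import SHA384
from Crypto.Signature import pkcs1_15

def verify_create_request(data_, signature_b58, pub_key):
    """Return True iff signature_b58 signs the BSON encoding of data_."""
    hash_ = SHA384.new()
    hash_.update(bson.dumps(data_))
    try:
        pkcs1_15.new(pub_key).verify(hash_, base58.b58decode(signature_b58))
        return True
    except (ValueError, TypeError):
        return False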
def __init__(self, json_schema):
    # get database name from schema CID
    database_name = CID.doc2CID(json_schema)
    # keep database name
    self.database_name = database_name
    # set DB disk location
    self.DB_disk_location = STORE_LOCATION + database_name
    # create data directory for database
    if not os.path.exists(self.DB_disk_location):
        os.makedirs(self.DB_disk_location)
    # keep schema in store location
    with open(self.DB_disk_location + '/schema.json', 'w') as oschema:
        json.dump(json_schema, oschema)
    # get vector index
    self.active_index = INDEX_LABEL[0]
    self.index = self.get_index(self.DB_disk_location)
    # Create KV store instance
    self.KV_store = plyvel.DB(self.DB_disk_location + "/kv.db",
                              create_if_missing=True)
    if self.KV_store.get(byt(-1)) is None:
        self.KV_store.put(byt(-1), byt(0))
    # Training data holder
    self.training_data = []
    self.TD_location = self.DB_disk_location + "/TD"
    # Try loading training data
    self.load_TD_from_disk()
    # spawn worker thread
    self.q_maxsize = MAX_Q_LEN
    self.process_flag = True
    self.process_timeout_sec = PROCESS_TIMEOUT
    self.spawn()
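# Hedged sketch (not in the original source) of the byt() helper the manager
# relies on: plyvel accepts only bytes for keys and values, so ints and
# strings are assumed to be serialized via their ASCII representation (e.g.
# byt(-1) -> b"-1"). The real project helper may differ.
def byt(value):
    """Coerce an int/str into the bytes plyvel expects."""
    if isinstance(value, bytes):
        return value
    return str(value).encode("utf-8")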
def create_database(json_schema):
    """
    Create a database from a given valid JSON schema
    """
    # TBD: write ahead logging (INIT)
    # generate proper schema definition from template schema
    json_schema = schema.generate_schema(json_schema)
    # identify invalid schema template
    if json_schema is None:
        return None
    # check if database already exists: the name is the CID of the schema
    database_name = CID.doc2CID(json_schema)
    if databases.get(database_name):
        # return database name
        logging.debug("Database already exists")
        return database_name
    # if the database doesn't exist already, create one
    manager_h = manager.VecManager(json_schema)
    database_name = manager_h.database_name
    validator_fn = schema.compile(json_schema)
    databases[database_name] = {
        "manager_h": manager_h,
        "schema": {
            "json": json_schema,
            "validator": validator_fn
        }
    }
    # TBD: save schema to storage
    # TBD: write ahead logging (END)
    return database_name
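# Hedged usage sketch (not in the original source): create_database returns
# None when schema.generate_schema rejects the template, and the same
# CID-derived name on repeated calls with an identical schema. Both inputs
# here are hypothetical.
def example_create(valid_schema, broken_schema):
    name = create_database(valid_schema)
    assert name == create_database(valid_schema)   # idempotent: name is the schema CID
    assert create_database(broken_schema) is None  # invalid template is rejected
    return name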
def insert_docs(docs, database_name):
    """
    Insert a set of valid documents into the database
    """
    # write ahead log (INIT)
    cids_ = []
    docs_ = []
    # get database handle for database_name
    database_h = load_database(database_name)
    # invalid database name
    if not database_h:
        logging.debug("Database doesn't exist. Please create one.")
        return cids_
    # validate docs against schema and add CID;
    # invalid docs contribute None so cids_ stays aligned with docs
    for doc_ in docs:
        if schema.validate_json_docs(database_h["schema"]["validator"], doc_):
            CID_ = CID.doc2CID(doc_)
            cids_.append(CID_)
            doc_["CID"] = CID_
            docs_.append(doc_)
        else:
            cids_.append(None)
    # get manager_h for database_name
    manager_h = database_h["manager_h"]
    manager_h.add_vectors(docs_)
    # write ahead log (END)
    return cids_
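# Hedged usage sketch (not in the original source): documents are assumed to
# carry the metadata fields declared in the schema plus a "code" vector from
# the encoder (add_vectors resizes doc_["code"]). Field names follow the test
# schemas above; the returned list holds one CID per input doc, with None
# where validation failed.
def example_insert(database_name):
    docs = [
        {"name": "alice", "age": 30, "code": [0.1, 0.2, 0.3]},
        {"name": "bob", "age": 31, "code": [0.4, 0.5, 0.6]},
    ]
    cids = insert_docs(docs, database_name)
    return cids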
def add_vectors(self, documents):
    # add to KV store; key -1 holds the next free internal index
    next_index = int(self.KV_store.get(byt(-1)))
    # check if it is ready to swap index
    if next_index > TRAIN_DAT_LEN and self.active_index == INDEX_LABEL[0] \
            and len(self.training_data) >= TRAIN_DAT_LEN:
        # swap index
        self.swap_index(self.DB_disk_location)
    # init batch write to DB
    wb_ = self.KV_store.write_batch()
    for idx_, doc_ in enumerate(documents):
        cid_ = byt(doc_["CID"])
        # resize "code"
        doc_["code"] = self.resize_vector(
            doc_["code"], int(os.environ["FIXED_VEC_DIMENSION"]))
        # cod_ = pickle.dumps(cod_)
        documents[idx_]["_id"] = next_index
        # TBD: convert to bulk insert
        # forward mapping: internal index -> <CID length><CID><BSON doc>
        wb_.put(byt(next_index), byt(len(cid_)) + cid_ + CID.doc2bson(doc_))
        # reverse mapping: CID -> internal index
        wb_.put(cid_, byt(next_index))
        next_index += 1
    wb_.put(byt(-1), byt(next_index))
    # commit DB write
    wb_.write()
    # push to training data
    self.update_training_data(documents)
    # add vectors to index
    return self.index.add_vectors(documents)
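# Hedged sketch (not in the original source): add_vectors() maintains two
# mappings, internal index -> packed record and CID -> internal index, so a
# document can be resolved by CID in two KV reads. `kv` is a plyvel.DB
# handle; byt() and CID.bson2doc follow the project's helpers; lookup_by_cid
# itself is hypothetical.
def lookup_by_cid(kv, cid):
    """Resolve a CID to its stored document via the reverse mapping."""
    idx = kv.get(byt(cid))        # CID -> internal index (bytes)
    if idx is None:
        return None
    value = kv.get(idx)           # internal index -> packed record
    cid_len = int(value[:2]) + 2
    return CID.bson2doc(value[cid_len:])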