# Export helpers that write product-quantization (PQ) index data into
# Postgres. The functions rely on module-level names from the surrounding
# project: `utils` (vector serialization), `db_export`, `Logger`, the flags
# USE_BYTEA_TYPE and COARSE_TYPE, and the server-side SQL function
# `vec_to_bytea`.

def add_batch_to_database(word_batch, pq_quantization, con, cur, index_config,
                          batch_size, logger):
    # pq_quantization holds (coarse_id, code_vector) pairs aligned with word_batch
    values = []
    for i in range(len(pq_quantization)):
        output_vec = utils.serialize_vector(pq_quantization[i][1])
        values.append({
            "coarse_id": str(pq_quantization[i][0]),
            "word": word_batch[i][:100],  # truncate words to 100 characters
            "vector": output_vec
        })
        # flush accumulated rows in batches and after the last element
        if (i % (batch_size - 1) == 0) or (i == (len(pq_quantization) - 1)):
            if USE_BYTEA_TYPE:
                cur.executemany(
                    "INSERT INTO " + index_config.get_value('fine_table_name') +
                    " (coarse_id, word, vector) VALUES (%(coarse_id)s, %(word)s, vec_to_bytea(%(vector)s::int2[]))",
                    tuple(values))
            else:
                cur.executemany(
                    "INSERT INTO " + index_config.get_value('fine_table_name') +
                    " (coarse_id, word, vector) VALUES (%(coarse_id)s, %(word)s, %(vector)s)",
                    tuple(values))
            con.commit()
            values = []
    return
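# --- Illustrative usage sketch (not part of the original source) ---
# One way add_batch_to_database could be driven with psycopg2. The DSN,
# table name, MinimalConfig stub, and sample data are assumptions made
# for illustration only.
def _example_add_batch(logger):
    import psycopg2

    class MinimalConfig:
        """Hypothetical stand-in for the project's index_config object."""
        def __init__(self, mapping):
            self._mapping = mapping

        def get_value(self, key):
            return self._mapping[key]

    con = psycopg2.connect('dbname=embeddings')  # assumed DSN
    cur = con.cursor()
    config = MinimalConfig({'fine_table_name': 'fine_quantization'})
    # (coarse_id, code_vector) pairs aligned with the word batch;
    # values here are made up for illustration.
    add_batch_to_database(['apple', 'pear'],
                          [(3, [12, 7, 99]), (1, [4, 4, 250])],
                          con, cur, config, batch_size=500, logger=logger)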
def add_to_database(words, cq, codebook, pq_quantization, coarse_counts,
                    fine_counts, con, cur, index_config, batch_size, logger):
    # add codebook
    db_export.add_codebook_to_database(codebook, fine_counts, con, cur,
                                       index_config)
    # add coarse quantization
    db_export.add_cq_to_database(cq, coarse_counts, con, cur, index_config)
    # add fine quantization
    values = []
    for i in range(len(pq_quantization)):
        output_vec = utils.serialize_vector(pq_quantization[i][1])
        values.append({
            "coarse_id": str(pq_quantization[i][0]),
            "word": words[i][:100],
            "vector": output_vec
        })
        if (i % (batch_size - 1) == 0) or (i == (len(pq_quantization) - 1)):
            if USE_BYTEA_TYPE:
                cur.executemany(
                    "INSERT INTO " + index_config.get_value('fine_table_name') +
                    " (coarse_id, word, vector) VALUES (%(coarse_id)s, %(word)s, vec_to_bytea(%(vector)s::int2[]))",
                    tuple(values))
            else:
                cur.executemany(
                    "INSERT INTO " + index_config.get_value('fine_table_name') +
                    " (coarse_id, word, vector) VALUES (%(coarse_id)s, %(word)s, %(vector)s)",
                    tuple(values))
            con.commit()
            logger.log(Logger.INFO, 'Inserted ' + str(i + 1) + ' vectors')
            values = []
    return
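# Note on the flush condition used above: `i % (batch_size - 1) == 0` fires
# at i = 0 and then every (batch_size - 1) iterations, so the first INSERT
# carries a single row and later ones carry batch_size - 1 rows. The tiny
# sketch below (illustration only) prints the flush points for a run of
# 10 items with batch_size = 4.
def _show_flush_points(n=10, batch_size=4):
    pending = 0
    for i in range(n):
        pending += 1
        if (i % (batch_size - 1) == 0) or (i == n - 1):
            print('flush at i=%d with %d row(s)' % (i, pending))
            pending = 0

# Expected output: flushes at i=0 (1 row), i=3 (3 rows), i=6 (3 rows),
# i=9 (3 rows).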
def add_codebook_to_database(codebook, counts, con, cur, index_config):
    for pos in range(len(codebook)):
        values = []
        for i in range(len(codebook[pos])):
            output_vec = utils.serialize_vector(codebook[pos][i])
            values.append({"pos": pos, "code": i, "vector": output_vec,
                           "count": counts[(pos, i)]})
        if USE_BYTEA_TYPE:
            cur.executemany(
                "INSERT INTO " + index_config.get_value("cb_table_name") +
                " (pos, code, vector, count) VALUES (%(pos)s, %(code)s, vec_to_bytea(%(vector)s::float4[]), %(count)s)",
                tuple(values))
        else:
            cur.executemany(
                "INSERT INTO " + index_config.get_value("cb_table_name") +
                " (pos, code, vector, count) VALUES (%(pos)s, %(code)s, %(vector)s, %(count)s)",
                tuple(values))
        con.commit()
    return
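# Sketch of a table layout compatible with the INSERT above (an assumption;
# the real DDL lives elsewhere in the project). With USE_BYTEA_TYPE the
# vector column would be bytea, otherwise a float4 array.
def _create_codebook_table_sketch(cur, table_name, use_bytea=True):
    vector_type = 'bytea' if use_bytea else 'float4[]'
    cur.execute(
        'CREATE TABLE ' + table_name +
        ' (pos int, code int, vector ' + vector_type + ', count int)')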
def add_cq_to_database(cq, coarse_counts, con, cur, index_config):
    # add coarse quantization
    values = []
    for i in range(len(cq)):
        output_vec = utils.serialize_vector(cq[i])
        count = coarse_counts[i] if i in coarse_counts else 0
        values.append({"id": i, "vector": output_vec, "count": count})
    if USE_BYTEA_TYPE:
        cur.executemany(
            "INSERT INTO " + index_config.get_value('coarse_table_name') +
            " (id, vector, count) VALUES (%(id)s, vec_to_bytea(%(vector)s::float4[]), %(count)s)",
            tuple(values))
    else:
        cur.executemany(
            "INSERT INTO " + index_config.get_value('coarse_table_name') +
            " (id, vector, count) VALUES (%(id)s, %(vector)s, %(count)s)",
            tuple(values))
    con.commit()
    return
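# `utils.serialize_vector` is project code not shown in this section. Given
# the `%(vector)s::float4[]` / `::int2[]` casts above, one plausible
# (assumed) implementation renders a sequence as a Postgres array literal:
def _serialize_vector_sketch(vec):
    # e.g. [0.1, 0.25, 3.0] -> '{0.1,0.25,3.0}'
    return '{' + ','.join(str(x) for x in vec) + '}'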
def add_to_database(words, cq, codebook, pq_quantization, coarse_counts,
                    fine_counts, con, cur, index_config, batch_size, logger):
    # add codebook
    db_export.add_codebook_to_database(codebook, fine_counts, con, cur,
                                       index_config)
    # add coarse quantization
    if COARSE_TYPE == 'MULTI_INDEX':
        db_export.add_multi_cq_to_database(cq, coarse_counts, con, cur,
                                           index_config)
    else:
        db_export.add_cq_to_database(cq, coarse_counts, con, cur, index_config)
    # add fine quantization
    values = []
    coarse_id_column = 'coarse_id'
    for i in range(len(pq_quantization)):
        output_vec = utils.serialize_vector(pq_quantization[i][1])
        if COARSE_TYPE == 'MULTI_INDEX':
            # combine the per-position centroid ids into a single coarse id
            coarse_id = str(combine_centroids(
                pq_quantization[i][0], index_config.get_value('k_coarse')))
        else:
            coarse_id = str(pq_quantization[i][0])
        values.append({
            "id": i + 1,
            "vector": output_vec,
            "coarse_id": coarse_id
        })
        if (i % (batch_size - 1) == 0) or (i == (len(pq_quantization) - 1)):
            if USE_BYTEA_TYPE:
                query = ("INSERT INTO " + index_config.get_value('fine_table_name') +
                         " (" + coarse_id_column + ", id, vector) VALUES "
                         "(%(coarse_id)s, %(id)s, vec_to_bytea(%(vector)s::int2[]))")
            else:
                query = ("INSERT INTO " + index_config.get_value('fine_table_name') +
                         " (" + coarse_id_column + ", id, vector) VALUES "
                         "(%(coarse_id)s, %(id)s, %(vector)s)")
            cur.executemany(query, tuple(values))
            con.commit()
            logger.log(Logger.INFO, 'Inserted ' + str(i + 1) + ' vectors')
            values = []
    return
def add_multi_cq_to_database(cq, coarse_counts, con, cur, index_config):
    BATCH_SIZE = 100
    m = len(cq)
    num_centr = index_config.get_value('k_coarse')
    # add quantizer centroids, one batch per subquantizer position
    for pos in range(m):
        values = []
        for i in range(len(cq[pos])):
            output_vec = utils.serialize_vector(cq[pos][i])
            values.append({"pos": pos, "code": i, "vector": output_vec})
        if USE_BYTEA_TYPE:
            cur.executemany(
                "INSERT INTO " + index_config.get_value('coarse_table_name') +
                " (pos, code, vector) VALUES (%(pos)s, %(code)s, vec_to_bytea(%(vector)s::float4[]))",
                tuple(values))
        else:
            cur.executemany(
                "INSERT INTO " + index_config.get_value('coarse_table_name') +
                " (pos, code, vector) VALUES (%(pos)s, %(code)s, %(vector)s)",
                tuple(values))
        con.commit()
    # add counts
    # divides a combined code into its per-subquantizer centroid ids
    divide_code = lambda code, units, length: tuple(
        [int((code / units**i) % units) for i in range(length)])
    batch = []
    for code in range(num_centr**m):
        key = divide_code(code, num_centr, m)
        count = coarse_counts[key] if key in coarse_counts else 0
        batch.append({"id": code, "count": count})
        if code % BATCH_SIZE == 0:
            cur.executemany(
                "INSERT INTO " + index_config.get_value('coarse_table_name') +
                "_counts" + " (id, count) VALUES (%(id)s, %(count)s)",
                tuple(batch))
            con.commit()
            batch = []
    # flush any remaining rows
    cur.executemany(
        "INSERT INTO " + index_config.get_value('coarse_table_name') +
        "_counts" + " (id, count) VALUES (%(id)s, %(count)s)",
        tuple(batch))
    con.commit()
    return
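# Round-trip sketch (illustration only): `divide_code` above splits a
# combined coarse code into m per-subquantizer centroid ids, and the
# project's `combine_centroids` (called in add_to_database, not shown in
# this section) presumably performs the inverse mixed-radix combination,
# assumed below.
def _combine_centroids_sketch(centroid_ids, units):
    return sum(c * units**i for i, c in enumerate(centroid_ids))

def _divide_code_sketch(code, units, length):
    return tuple(int((code / units**i) % units) for i in range(length))

# With units=4 centroids per position and m=2 positions:
#   _divide_code_sketch(9, 4, 2)         -> (1, 2)
#   _combine_centroids_sketch((1, 2), 4) -> 9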
def add_to_database(words, codebook, pq_quantization, counts, con, cur,
                    index_config, batch_size, logger):
    logger.log(Logger.INFO, 'Length of words: ' + str(len(words)) +
               ' Length of pq_quantization: ' + str(len(pq_quantization)))
    # add codebook
    add_codebook_to_database(codebook, counts, con, cur, index_config)
    # add pq quantization
    values = []
    for i in range(len(pq_quantization)):
        output_vec = utils.serialize_vector(pq_quantization[i])
        values.append({"word": words[i][:100], "vector": output_vec})
        if (i % (batch_size - 1) == 0) or (i == (len(pq_quantization) - 1)):
            if USE_BYTEA_TYPE:
                cur.executemany(
                    "INSERT INTO " + index_config.get_value("pq_table_name") +
                    " (word, vector) VALUES (%(word)s, vec_to_bytea(%(vector)s::int2[]))",
                    tuple(values))
            else:
                cur.executemany(
                    "INSERT INTO " + index_config.get_value("pq_table_name") +
                    " (word, vector) VALUES (%(word)s, %(vector)s)",
                    tuple(values))
            con.commit()
            logger.log(Logger.INFO, 'Inserted ' + str(i + 1) + ' vectors')
            values = []
    return
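# Design note (an aside, not from the source): psycopg2 also provides
# psycopg2.extras.execute_values, which folds many rows into a single
# INSERT statement and is typically faster than executemany for bulk
# loads like the ones above. A sketch against the PQ table, using the
# template argument to keep the vec_to_bytea(...) wrapping:
def _insert_pq_rows_fast(cur, table_name, rows):
    from psycopg2.extras import execute_values
    # rows: iterable of (word, code_vector) pairs
    execute_values(
        cur,
        'INSERT INTO ' + table_name + ' (word, vector) VALUES %s',
        rows,
        template='(%s, vec_to_bytea(%s::int2[]))')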