def main(argc, argv):
    """Insert raw word vectors into the database and build a word index.

    argv[1] is the vector configuration file (required); argv[2] is an
    optional database configuration file (defaults to
    'config/db_config.json').
    """
    if argc > 2:
        db_config = Configuration(argv[2])
    else:
        db_config = Configuration('config/db_config.json')
    logger = Logger(db_config.get_value('log'))
    if argc < 2:
        logger.log(Logger.ERROR,
                   'Configuration file for index creation required')
        return
    vec_config = Configuration(argv[1])

    user = db_config.get_value('username')
    password = db_config.get_value('password')
    host = db_config.get_value('host')
    db_name = db_config.get_value('db_name')

    # init db connection -- keyword arguments (instead of a hand-built DSN
    # string) so credentials with quotes/spaces cannot break the DSN, and
    # the user/password values read from the config are actually used
    try:
        con = psycopg2.connect(dbname=db_name, user=user,
                               host=host, password=password)
    except psycopg2.Error:
        logger.log(Logger.ERROR, 'Can not connect to database')
        return
    try:
        cur = con.cursor()
        init_tables(con, cur, vec_config.get_value('table_name'), logger)
        insert_vectors(vec_config.get_value('vec_file_path'), con, cur,
                       vec_config.get_value('table_name'),
                       db_config.get_value('batch_size'),
                       vec_config.get_value('normalized'), logger)
        # commit changes
        con.commit()
        # create index
        utils.create_index(vec_config.get_value('table_name'),
                           vec_config.get_value('index_name'), 'word',
                           con, cur, logger)
    finally:
        # close connection even if table setup, insertion or index
        # creation raises (the original leaked the connection on errors)
        con.close()
def add_to_database(db_config, index_config, type, index_file, logger):
    """Load a serialized index file and store its contents in the database.

    type selects the index flavour: 'pq', 'ivfadc', 'ivfadc_pipeline' or
    'pq_pipeline'.  Unknown types are logged as a warning and ignored.
    index_file is the path of the file produced by the index exporter.
    """
    # create db connection -- keyword arguments instead of a hand-built
    # DSN string (the string form was broken and quoting-sensitive)
    con = None
    try:
        con = psycopg2.connect(
            dbname=db_config.get_value('db_name'),
            user=db_config.get_value('username'),
            host=db_config.get_value('host'),
            password=db_config.get_value('password'))
    except psycopg2.Error:
        logger.log(Logger.ERROR, 'Can not connect to database')
        return
    cur = con.cursor()
    if type == 'pq':
        data = im.load_index(index_file)
        utils.init_tables(con, cur,
                          pq_index.get_table_information(index_config), logger)
        pq_index.add_to_database(data['words'], data['codebook'],
                                 data['index'], data['counts'], con, cur,
                                 index_config,
                                 db_config.get_value('batch_size'), logger)
        utils.create_index(index_config.get_value("pq_table_name"),
                           index_config.get_value("pq_index_name"), 'word',
                           con, cur, logger)
    elif type == 'ivfadc':
        data = im.load_index(index_file)
        utils.init_tables(con, cur,
                          ivfadc.get_table_information(index_config), logger)
        ivfadc.add_to_database(data['words'], data['cq'], data['codebook'],
                               data['index'], data['coarse_counts'],
                               data['fine_counts'], con, cur, index_config,
                               db_config.get_value('batch_size'), logger)
        utils.create_index(index_config.get_value('fine_table_name'),
                           index_config.get_value('fine_word_index_name'),
                           'word', con, cur, logger)
        utils.create_index(index_config.get_value('fine_table_name'),
                           index_config.get_value('fine_coarse_index_name'),
                           'coarse_id', con, cur, logger)
    elif type == 'ivfadc_pipeline':
        # TODO test
        data = im.load_pipeline_ivfadc_index(
            index_file, index_file + '.tmp',
            index_config.get_value('coarse_quantizer_file'),
            index_config.get_value('residual_codebook_file'))
        utils.init_tables(con, cur,
                          ivfadc.get_table_information(index_config), logger)
        ivfadc.add_to_database(data['words'], data['cq'], data['codebook'],
                               data['index'], data['coarse_counts'],
                               data['fine_counts'], con, cur, index_config,
                               db_config.get_value('batch_size'), logger)
        utils.create_index(index_config.get_value('fine_table_name'),
                           index_config.get_value('fine_word_index_name'),
                           'word', con, cur, logger)
        # NOTE(review): unlike the 'ivfadc' branch this builds the coarse_id
        # index on the coarse table (with the fine index name) -- confirm
        # this is intended and not a copy-paste slip
        utils.create_index(index_config.get_value('coarse_table_name'),
                           index_config.get_value('fine_coarse_index_name'),
                           'coarse_id', con, cur, logger)
    elif type == 'pq_pipeline':
        # TODO test
        data = im.load_pipeline_pq_index(
            index_file, index_file + '.tmp',
            index_config.get_value('codebook_file'))
        utils.init_tables(con, cur,
                          pq.get_table_information(index_config), logger)
        pq.add_to_database(data['words'], data['codebook'], data['index'],
                           data['counts'], con, cur, index_config,
                           db_config.get_value('batch_size'), logger)
        utils.create_index(index_config.get_value("pq_table_name"),
                           index_config.get_value("pq_index_name"), 'word',
                           con, cur, logger)
    else:
        logger.log(logger.WARNING, 'Index type ' + str(type) + ' unknown')
    return
def main(argc, argv):
    """Build the index in a single cycle and store it in the database.

    argv[1] names the index configuration file; database settings come
    from 'config/db_config.json'.
    """
    db_config = Configuration('config/db_config.json')
    logger = Logger(db_config.get_value('log'))
    if argc < 2:
        logger.log(Logger.ERROR,
                   'Configuration file for index creation required')
        return
    index_config = Configuration(argv[1])

    batch_size = db_config.get_value("batch_size")
    train_size_coarse = index_config.get_value('train_size_coarse')
    train_size_fine = index_config.get_value('train_size_fine')
    centr_num_coarse = index_config.get_value('k_coarse')
    m = index_config.get_value('m')
    k = index_config.get_value('k')

    # get vectors
    words, vectors, vectors_size = utils.get_vectors(
        index_config.get_value('vec_file_path'), logger)
    logger.log(logger.INFO, 'vectors_size :' + str(vectors_size))

    # determine coarse quantizer (reuse a stored one when configured)
    cq_filename = None
    if index_config.has_key('coarse_quantizer_file'):
        cq_filename = index_config.get_value('coarse_quantizer_file')
    if cq_filename is None:
        cq_output_name = 'coarse_quantizer.pcl'
    else:
        cq_output_name = cq_filename
    if COARSE_TYPE == 'MULTI_INDEX':
        cq = qcreator.construct_quantizer(
            qcreator.create_quantizer,
            (vectors[:train_size_fine], 2, centr_num_coarse, logger),
            logger, input_name=cq_filename, output_name=cq_output_name)
    else:
        cq = qcreator.construct_quantizer(
            qcreator.create_coarse_quantizer,
            (vectors[:train_size_coarse], centr_num_coarse),
            logger, input_name=cq_filename, output_name=cq_output_name)

    # determine codebook (reuse a stored one when configured)
    codebook_filename = None
    if index_config.has_key('codebook_file'):
        codebook_filename = index_config.get_value('codebook_file')
    if codebook_filename is None:
        codebook_output_name = 'codebook.pcl'
    else:
        codebook_output_name = codebook_filename
    codebook = qcreator.construct_quantizer(
        qcreator.create_quantizer,
        (vectors[:train_size_fine], m, k, logger),
        logger, input_name=codebook_filename,
        output_name=codebook_output_name)

    # create db connection and prepare database
    con, cur = db_export.create_connection(db_config, logger)
    utils.init_tables(con, cur, get_table_information(index_config), logger)
    utils.disable_triggers(index_config.get_value('fine_table_name'),
                           con, cur)

    # create index with quantizers
    logger.log(logger.INFO, 'Start index creation (single cycle)')
    t_start = time.time()
    index, coarse_counts, fine_counts = create_index_data(
        vectors[:vectors_size], cq, codebook, logger)
    t_end = time.time()
    logger.log(logger.INFO, 'Finish index creation after '
               + str(t_end - t_start) + ' seconds')

    # add to database
    add_to_database(words, cq, codebook, index, coarse_counts, fine_counts,
                    con, cur, index_config, batch_size, logger)

    logger.log(logger.INFO, 'Create database index structures')
    utils.create_index(index_config.get_value('fine_table_name'),
                       index_config.get_value('fine_word_index_name'),
                       'id', con, cur, logger)
    utils.create_index(index_config.get_value('fine_table_name'),
                       index_config.get_value('fine_coarse_index_name'),
                       'coarse_id', con, cur, logger)

    # create statistics
    if (index_config.has_key('statistic_table')
            and index_config.has_key('statistic_column')
            and CREATE_STATS_TABLE):
        utils.create_statistics_table(
            index_config.get_value('statistic_table'),
            index_config.get_value('statistic_column'),
            index_config.get_value('coarse_table_name'),
            con, cur, logger)
    utils.enable_triggers(index_config.get_value('fine_table_name'),
                          con, cur)
def main(argc, argv): db_config = Configuration('config/db_config.json') logger = Logger(db_config.get_value('log')) if argc < 2: logger.log(Logger.ERROR, 'Configuration file for index creation required') return index_config = Configuration(argv[1]) batch_size = db_config.get_value("batch_size") train_size_coarse = index_config.get_value('train_size_coarse') train_size_fine = index_config.get_value('train_size_fine') centr_num_coarse = index_config.get_value('k_coarse') m = index_config.get_value('m') k = index_config.get_value('k') # get vectors words, vectors, vectors_size = utils.get_vectors(index_config.get_value('vec_file_path'), logger) logger.log(logger.INFO, 'vectors_size :' + str(vectors_size)) # determine coarse quantizer cq = None if index_config.has_key('coarse_quantizer_file'): cq_filename = index_config.get_value('coarse_quantizer_file') if cq_filename: logger.log(Logger.INFO, 'Use coarse quantizer from ' + cq_filename) cq = qcreator.load_quantizer(cq_filename) if type(cq) == type(None): logger.log(Logger.INFO, 'Create new coarse quantizer') # create coarse quantizer cq = qcreator.create_coarse_quantizer(vectors[:train_size_coarse], centr_num_coarse) # store coarse quantizer qcreator.store_quantizer(cq, 'coarse_quantizer.pcl') # determine codebook codebook = None if index_config.has_key('residual_codebook_file'): codebook_filename = index_config.get_value('residual_codebook_file') if codebook_filename: logger.log(Logger.INFO, 'Use residual codebook from ' + codebook_filename) codebook = qcreator.load_quantizer(codebook_filename) if type(codebook) == type(None): logger.log(Logger.INFO, 'Create new residual codebook') # calculate codebook based on residuals codebook = create_fine_quantizer(cq, vectors[:train_size_fine], m, k, logger) # store codebook qcreator.store_quantizer(codebook, 'residual_codebook.pcl') con = None cur = None if (index_config.get_value('add_to_database')): # create db connection try: con = psycopg2.connect("dbname='" + 
db_config.get_value('db_name') + "' user='******'username') + "' host='" + db_config.get_value('host') + "' password='******'password') + "'") except: logger.log(logger.ERROR, 'Can not connect to database') return cur = con.cursor() utils.init_tables(con, cur, get_table_information(index_config), logger) utils.disable_triggers(index_config.get_value('fine_table_name'),con, cur) # create index with quantizers use_pipeline = False if index_config.has_key('pipeline'): use_pipeline = index_config.get_value('pipeline') # single cycle if not use_pipeline: logger.log(logger.INFO, 'Start index creation (single cycle)') start = time.time() index, coarse_counts, fine_counts = create_index_with_faiss(vectors[:vectors_size], cq, codebook, logger) end = time.time() logger.log(logger.INFO, 'Finish index creation after ' + str(end - start) + ' seconds') # add to file if (index_config.get_value('export_filename')): index_data = dict({ 'words': words, 'cq': cq, 'codebook': codebook, 'index': index, 'coarse_counts': coarse_counts, 'fine_counts': fine_counts }) im.save_index(index_data, index_config.get_value('export_filename')) if (index_config.get_value('add_to_database')): add_to_database(words, cq, codebook, index, coarse_counts, fine_counts, con, cur, index_config, batch_size, logger) logger.log(logger.INFO, 'Create database index structures') utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_word_index_name'), 'word', con, cur, logger) utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_coarse_index_name'), 'coarse_id', con, cur, logger) utils.enable_triggers(index_config.get_value('fine_table_name'), con, cur) # pipeline approach if use_pipeline: logger.log(logger.INFO, 'Start index creation (pipeline)') start = time.time() feeder = VectorFeeder(vectors[:vectors_size], words) m = len(codebook) len_centr = int(len(vectors[0]) / m) calculation = IVFADCIndexCreator(cq, codebook, m, len_centr, logger) 
fine_counts = dict() coarse_counts = dict() output_file = None if (index_config.get_value('export_pipeline_data')): output_file = open(index_config.get_value('export_pipeline_data'), 'wb') while (feeder.has_next()): # calculate batch, word_batch = feeder.get_next_batch(batch_size) entries, coarse_counts, fine_counts = calculation.index_batch(batch) # write to database or add to file if (index_config.get_value('add_to_database')): # add to database add_batch_to_database(word_batch, entries, con, cur, index_config, batch_size, logger) logger.log(logger.INFO, 'Added ' + str(feeder.get_cursor() - batch_size + len(batch)) + ' vectors to the database') if (index_config.get_value('export_pipeline_data')): # write to file index_batch = dict({ 'words': word_batch, 'index': entries, }) count_data = dict({ 'coarse_counts': coarse_counts, 'fine_counts': fine_counts }) pickle.dump(index_batch, output_file) f = open(index_config.get_value('export_pipeline_data')+'.tmp', 'wb') pickle.dump(count_data, f) f.close() logger.log(logger.INFO, 'Processed ' + str(feeder.get_cursor() - batch_size + len(batch)) + ' vectors') if output_file: output_file.close() if (index_config.get_value('add_to_database')): # add codebook and cq to database add_codebook_to_database(codebook, fine_counts, con, cur, index_config) logger.log(Logger.INFO, 'Added residual codebook entries into database') add_cq_to_database(cq, coarse_counts, con, cur, index_config) logger.log(Logger.INFO, 'Added coarse quantizer entries into database') logger.log(logger.INFO, 'Create database index structures') utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_word_index_name'), 'word', con, cur, logger) utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_coarse_index_name'), 'coarse_id', con, cur, logger) utils.enable_triggers(index_config.get_value('fine_table_name'), con, cur) end = time.time()