def prepare_train(base, base_base_gnd, partition_info, config):
    save_dir = '%s/Classifier_%d/prepare_train_sample' % (
        config['program_train_para_dir'], config['classifier_number'])
    dir_io.mkdir(save_dir)
    classifier_number = config['classifier_number']
    prepare_method = factory(config['type'])

    global label_k
    if 'label_k' in config:
        label_k = config['label_k']
        print("label k %d ==========================================================================================" % label_k)

    start_time = time.time()
    print('start prepare data %s' % classifier_number)
    trainloader, valloader = prepare_method(base, base_base_gnd, partition_info, config['n_cluster'])
    print('finish prepare_data %s' % classifier_number)
    end_time = time.time()
    intermediate_config = {'time': end_time - start_time}

    if save_sample:
        save(trainloader, valloader, save_dir)
    return (trainloader, valloader), intermediate_config
def execute(pq, X, Q, G, metric, config, train_size=100000):
    np.random.seed(123)
    print("# ranking metric {}".format(metric))
    print("# " + pq.class_message())
    pq.fit(X[:train_size].astype(dtype=np.float32), iter=40)

    print('# compress items')
    compressed = chunk_compress(pq, X)
    print(compressed.dtype)

    print("# sorting items")
    Ts = [2 ** i for i in range(1 + int(math.log2(len(X))))]
    recalls = BatchSorter(compressed, Q, X, G, Ts, metric=metric, batch_size=200).recall()
    print("# searching!")

    res_l = []
    # print("expected items, overall time, avg recall, avg precision, avg error, avg items")
    for i, (t, recall) in enumerate(zip(Ts, recalls)):
        tmp_res = {'n_candidate': t, 'recall': recall}
        res_l.append(tmp_res)
        # print("{}, {}, {}, {}, {}, {}".format(
        #     2 ** i, 0, recall, recall * len(G[0]) / t, 0, t))

    save_data_dir = '/home/zhengbian/NN_as_Classification/data/result/%s_%d_baseline_%d_%s' % (
        config['dataset'], config['n_cluster'], config['codebook'], config['method'])
    dir_io.delete_dir_if_exist(save_data_dir)
    dir_io.mkdir(save_data_dir)
    dir_io.save_json(save_data_dir, 'result.json', res_l)
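# --- Illustrative sketch (not part of the baseline above) ---------------------
# execute() evaluates recall at candidate-set sizes Ts that grow as powers of two
# up to the dataset size.  The helper below reproduces just that schedule; the
# function name is made up for illustration.
import math


def candidate_sizes(n_item):
    # 1, 2, 4, ..., up to the largest power of two that does not exceed n_item
    return [2 ** i for i in range(1 + int(math.log2(n_item)))]


if __name__ == '__main__':
    print(candidate_sizes(1000))  # [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]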
def integrate_single(score_table, gnd, config):
    torch.set_num_threads(12)
    start_time = time.time()
    # long_term_config, short_term_config, short_term_config_before_run, intermediate_result, total_score_table
    dir_io.mkdir(config['program_result_dir'])

    recall_l = []
    print("start evaluate")
    for i, score_arr in enumerate(score_table, 0):
        if i % 50 == 0:
            print("evaluate " + str(i))
        efsearch_recall_l = evaluate(score_arr, config['efSearch_l'], gnd[i], config['k'])
        recall_l.append(efsearch_recall_l)
    print('get all the recall')

    # transpose so that every row of recall_l holds the recall of one efSearch value over all queries
    recall_l = np.array(recall_l).transpose()
    result_n_candidate_recall = []
    for i, efSearch in enumerate(config['efSearch_l'], 0):
        recall_avg = np.mean(recall_l[i])
        result_item = {'n_candidate': efSearch, "recall": recall_avg}
        result_n_candidate_recall.append(result_item)
        print('recall: {}, n_candidates: {}'.format(recall_avg, efSearch))

    dir_io.save_json(config['program_result_dir'], 'result.json', result_n_candidate_recall)
    recall_l_save_dir = '%s/recall_l.txt' % config['program_result_dir']
    dir_io.save_array_txt(recall_l_save_dir, recall_l, '%.3f')

    end_time = time.time()
    intermediate = {'time': end_time - start_time}
    return intermediate
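# --- Illustrative sketch (assumed shapes, not part of integrate_single) -------
# evaluate() is assumed to return one recall value per efSearch for a single
# query, so recall_l has shape (n_query, n_efSearch).  Transposing puts every
# efSearch in its own row, and the per-efSearch average is a row mean.
import numpy as np

if __name__ == '__main__':
    recall_l = np.array([[0.2, 0.6, 0.9],   # query 0, efSearch = 10, 50, 100
                         [0.4, 0.8, 1.0]])  # query 1
    recall_per_efsearch = recall_l.transpose()       # shape (n_efSearch, n_query)
    print(np.mean(recall_per_efsearch, axis=1))      # [0.3 0.7 0.95]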
def prepare_data(config):
    save_dir = '%s/data/dataset/%s_%d' % (config['save_dir'], config['data_fname'], config['k'])
    dir_io.delete_dir_if_exist(save_dir)
    dir_io.mkdir(save_dir)

    base = np.random.normal(loc=config['miu'], scale=data_config['sigma'],
                            size=(config['base']['length'], config['base']['dim']))
    base_save_dir = '%s/base.fvecs' % save_dir
    vecs_io.fvecs_write(base_save_dir, base)

    query = np.random.normal(loc=config['miu'], scale=data_config['sigma'],
                             size=(config['query']['length'], config['query']['dim']))
    query_save_dir = '%s/query.fvecs' % save_dir
    vecs_io.fvecs_write(query_save_dir, query)

    base = base.astype(np.float32)
    query = query.astype(np.float32)
    gnd = groundtruth.get_gnd(base, query, config['k'])
    gnd_save_dir = '%s/gnd.ivecs' % save_dir
    vecs_io.ivecs_write(gnd_save_dir, gnd)

    base_base_gnd = groundtruth.get_gnd(base, base, config['base_base_gnd_k'])
    base_base_gnd_save_dir = '%s/base_base_gnd.ivecs' % save_dir
    vecs_io.ivecs_write(base_base_gnd_save_dir, base_base_gnd)

    print("base", base.shape)
    print("query", query.shape)
    print("gnd", gnd.shape)
    print("base_base_gnd", base_base_gnd.shape)
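# --- Illustrative sketch (stand-in for groundtruth.get_gnd, assumptions noted) ---
# prepare_data() above draws base and query vectors from a Gaussian and stores the
# exact k-nearest-neighbor ids as ground truth.  The brute-force search below shows
# the same idea with plain numpy; the real get_gnd implementation may differ.
import numpy as np


def brute_force_gnd(base, query, k):
    # squared l2 distance from every query to every base vector
    dist = ((query[:, None, :] - base[None, :, :]) ** 2).sum(axis=-1)
    # ids of the k closest base vectors for each query
    return np.argsort(dist, axis=1)[:, :k].astype(np.int32)


if __name__ == '__main__':
    rng = np.random.RandomState(0)
    base = rng.normal(loc=0.0, scale=1.0, size=(100, 8)).astype(np.float32)
    query = rng.normal(loc=0.0, scale=1.0, size=(5, 8)).astype(np.float32)
    print(brute_force_gnd(base, query, k=10).shape)  # (5, 10)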
def preprocess(base, config):
    dir_io.mkdir(config['program_train_para_dir'])
    start_time = time.time()
    multiple_model_ins = factory(config)
    model_l, model_intermediate = multiple_model_ins.preprocess(base)
    end_time = time.time()
    model_intermediate['total_time'] = end_time - start_time
    return model_l, model_intermediate
def prepare_data(config):
    data_dir = '%s/data/dataset/%s' % (config['project_dir'], config['data_fname'])
    print("data_dir", data_dir)
    config['data_dir'] = data_dir
    dir_io.delete_dir_if_exist(data_dir)
    dir_io.mkdir(data_dir)

    # read data
    hdfFile = h5py.File(config['source_data_dir'], 'r')

    base_info = config['file']['base']
    base = hdfFile.get(base_info['name'])
    if base_info['length'] != -1 and base_info['length'] < len(base):
        base = base[:base_info['length']]
    if config['normalization']:
        base = normalization(base)
        print("normalize base")
    base = base.astype(np.float32)
    save_base_dir = '%s/base.fvecs' % data_dir
    vecs_io.fvecs_write(save_base_dir, base)
    print("save base")

    query_info = config['file']['query']
    query = hdfFile.get(query_info['name'])
    if query_info['length'] != -1 and query_info['length'] < len(query):
        query = query[:query_info['length']]
    if config['normalization']:
        query = normalization(query)
        print("normalize query")
    query = query.astype(np.float32)
    save_query_dir = '%s/query.fvecs' % data_dir
    vecs_io.fvecs_write(save_query_dir, query)
    print("save query")

    save_gnd_dir = '%s/gnd-%d.ivecs' % (data_dir, config['k'])
    gnd = groundtruth.get_gnd(base, query, config['k'])
    vecs_io.ivecs_write(save_gnd_dir, gnd)
    print("save gnd")

    base_base_gnd_npy_dir = '%s/base_base_gnd-%d.ivecs' % (
        config['data_dir'], config['base_base_gnd_k'])
    base_base_gnd = groundtruth.get_gnd(base, base, max(config['base_base_gnd_k'], config['k']))
    vecs_io.ivecs_write(base_base_gnd_npy_dir, base_base_gnd)
    print("save base_base_gnd for the training set preparation")

    print("base:", base.shape)
    print("query:", query.shape)
    print("gnd:", gnd.shape)
    print("base_base_gnd:", base_base_gnd.shape)
    hdfFile.close()
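# --- Illustrative config sketch (keys taken from prepare_data above, values hypothetical) ---
# prepare_data() reads the keys below; the paths, hdf5 dataset names and sizes here
# are placeholders, not values from the project.
example_config = {
    'project_dir': '/path/to/project',
    'data_fname': 'example_dataset',
    'source_data_dir': '/path/to/source.hdf5',
    'file': {
        'base': {'name': 'train', 'length': -1},   # -1 keeps the whole array
        'query': {'name': 'test', 'length': 1000},
    },
    'normalization': True,
    'k': 10,
    'base_base_gnd_k': 50,
}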
def __init__(self, config):
    self.type = config['type']
    self.save_dir = '%s/dataset_partition' % config['save_dir']
    dir_io.mkdir(self.save_dir)
    self.classifier_number = config['classifier_number']
    self.obj_id = "%s_%d" % (self.type, self.classifier_number)
    # number of clusters
    self.n_cluster = config['n_cluster']
    self.model_info = None
    # maps each cluster label to the indices of the base points assigned to that cluster
    self.label_map = {}
    # number of points in every bucket
    self.n_point_label = None
    self.labels = None
def preprocess(base, config):
    program_train_para_dir = config['program_train_para_dir']
    dir_io.mkdir(program_train_para_dir)
    start_time = time.time()

    ds_partition_config = config['dataset_partition']
    ds_partition_config['program_train_para_dir'] = program_train_para_dir
    ds_partition_config['n_cluster'] = config['n_cluster']
    ds_partition_config['n_instance'] = config['n_instance']
    multiple_model = factory(ds_partition_config)
    model_l, model_intermediate = multiple_model.preprocess(base)

    end_time = time.time()
    model_intermediate['total_time'] = end_time - start_time
    return model_l, model_intermediate
def convert_data_type(config):
    dir_io.mkdir(config['data_dir'])
    print("create directory")

    base_dir = '%s/%s' % (config['source_data_dir'], config['source_data_fname']['base'])
    base_save_dir = '%s/%s' % (config['data_dir'], 'base.fvecs')
    base = vecs2vecs(base_dir, base_save_dir, config['source_data_type']['base'], 'fvecs',
                     file_len=config['base_len'])
    print("extract base")

    query_dir = '%s/%s' % (config['source_data_dir'], config['source_data_fname']['query'])
    query_save_dir = '%s/%s' % (config['data_dir'], 'query.fvecs')
    query = vecs2vecs(query_dir, query_save_dir, config['source_data_type']['query'], 'fvecs',
                      file_len=config['query_len'])
    print("extract query")

    if config['minus_avg']:
        average_vecs = np.average(base, axis=0)
        print(average_vecs)
        base = base - average_vecs
        query = query - average_vecs
        print("minus average number in each dimension")

    gnd = groundtruth.get_gnd(base, query, config['k'])
    gnd_save_dir = '%s/%s' % (config['data_dir'], 'gnd.ivecs')
    vecs_io.ivecs_write(gnd_save_dir, gnd)
    print("extract gnd")

    base_base_gnd = groundtruth.get_gnd(base, base, config['base_base_gnd_k'])
    base_base_gnd_save_dir = '%s/%s' % (config['data_dir'], 'base_base_gnd.ivecs')
    vecs_io.ivecs_write(base_base_gnd_save_dir, base_base_gnd)
    print("extract base_base_gnd for the training set preparation")

    print("base: ", base.shape)
    print("query: ", query.shape)
    print("gnd: ", gnd.shape)
    print("base_base_gnd: ", base_base_gnd.shape)
    return base, query, gnd, base_base_gnd
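# --- Illustrative sketch of the minus_avg step (not part of convert_data_type) ---
# When config['minus_avg'] is set, the per-dimension mean of the base vectors is
# subtracted from both base and query, so both sets are centered on the same origin.
import numpy as np

if __name__ == '__main__':
    base = np.array([[1.0, 2.0], [3.0, 6.0]])
    query = np.array([[2.0, 4.0]])
    average_vecs = np.average(base, axis=0)  # [2. 4.]
    print(base - average_vecs)               # [[-1. -2.] [ 1.  2.]]
    print(query - average_vecs)              # [[0. 0.]]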
def __init__(self, config):
    self.save_dir = '%s/dataset_partition' % config['save_dir']
    dir_io.mkdir(self.save_dir)
    self.type = config['type']
    self.classifier_number = config['classifier_number']
    # pq_nn does not include this parameter, so fall back to l2
    self.distance_metric = config['distance_metric'] if 'distance_metric' in config else 'l2'
    # number of clusters
    self.n_cluster = config['n_cluster']
    self.model_info = None
    # maps each cluster label to the indices of the base points assigned to that cluster
    self.label_map = []
    # number of points in every bucket
    self.n_point_label = None
    self.intermediate = {}
    self.labels = None
def __init__(self, config):
    self.program_train_para_dir = config['program_train_para_dir']
    self.type = config['type']
    self.n_instance = config['n_instance']
    self.n_cluster = config['n_cluster']
    self.model_l = []
    self.intermediate = {}
    # for identification
    for i in range(self.n_instance):
        tmp_config = copy.deepcopy(config)
        tmp_config['type'] = self.type
        tmp_config['classifier_number'] = i + 1
        tmp_config['n_cluster'] = self.n_cluster
        tmp_config['save_dir'] = '%s/Classifier_%d' % (
            self.program_train_para_dir, tmp_config['classifier_number'])
        dir_io.mkdir(tmp_config['save_dir'])
        tmp_model = self.get_model(tmp_config)
        self.model_l.append(tmp_model)
def integrate(score_table_ptr_l, gnd, config):
    # long_term_config, short_term_config, short_term_config_before_run, intermediate_result, total_score_table
    dir_io.mkdir(config['program_result_dir'])
    recall_l = []
    iter_idx = 0
    while True:
        end_of_file = False
        # accumulate the score of every classifier for the current query
        total_score_arr = None
        for score_table_ptr in score_table_ptr_l:
            line = score_table_ptr.readline()
            if not line or line == '':
                end_of_file = True
                break
            tmp_score_table = np.array([float(number) for number in line.split(' ')])
            if total_score_arr is None:
                total_score_arr = tmp_score_table
            else:
                total_score_arr += tmp_score_table
        if end_of_file:
            break
        efsearch_recall_l = evaluate(total_score_arr, config['efSearch_l'], gnd[iter_idx], config['k'])
        recall_l.append(efsearch_recall_l)
        iter_idx += 1
    print('get all the recall')

    # transpose so that every row of recall_l holds the recall of one efSearch value over all queries
    recall_l = np.array(recall_l).transpose()
    result_n_candidate_recall = []
    for i, efSearch in enumerate(config['efSearch_l'], 0):
        recall_avg = np.mean(recall_l[i])
        result_item = {'n_candidate': efSearch, "recall": recall_avg}
        result_n_candidate_recall.append(result_item)
        print('recall: {}, n_candidates: {}'.format(recall_avg, efSearch))

    dir_io.save_json(config['program_result_dir'], 'result.json', result_n_candidate_recall)
    recall_l_save_dir = '%s/recall_l.txt' % config['program_result_dir']
    dir_io.save_array_txt(recall_l_save_dir, recall_l, '%.3f')
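# --- Illustrative sketch (assumed file format, not part of integrate) ---------
# Each score-table file is assumed to hold one whitespace-separated row of scores
# per query, one file per classifier; integrate() sums the rows element-wise
# before ranking.  The example below mimics that accumulation in memory.
import io
import numpy as np

if __name__ == '__main__':
    # two classifiers, one query each, three items per row
    score_table_ptr_l = [io.StringIO('0.1 0.5 0.2\n'), io.StringIO('0.3 0.1 0.4\n')]
    total_score_arr = None
    for score_table_ptr in score_table_ptr_l:
        tmp = np.array([float(number) for number in score_table_ptr.readline().split(' ')])
        total_score_arr = tmp if total_score_arr is None else total_score_arr + tmp
    print(total_score_arr)  # [0.4 0.6 0.6]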
def __init__(self, config):
    self.program_train_para_dir = config['program_train_para_dir']
    self.type = config['dataset_partition']['type']
    self.n_instance = config['n_instance']
    self.n_cluster = config['n_cluster']
    self.kahip_dir = config['kahip_dir']
    self.distance_metric = config['distance_metric']
    self.model_l = []
    for i in range(self.n_instance):
        tmp_config = copy.deepcopy(config['dataset_partition'])
        tmp_config['type'] = self.type
        tmp_config['classifier_number'] = i
        tmp_config['n_cluster'] = self.n_cluster
        tmp_config['kahip_dir'] = self.kahip_dir
        tmp_config['distance_metric'] = self.distance_metric
        tmp_config['save_dir'] = '%s/Classifier_%d' % (
            self.program_train_para_dir, tmp_config['classifier_number'])
        dir_io.mkdir(tmp_config['save_dir'])
        tmp_model = self.get_model(tmp_config)
        self.model_l.append(tmp_model)
def save(self):
    dir_io.mkdir(self.save_dir)
    eval_res_dir = '%s/eval_res.txt' % self.save_dir
    dir_io.save_array_txt(eval_res_dir, self.result, fmt='%.3f')
def prepare_data(config):
    data_dir = '%s/data/dataset/%s_%d' % (config['project_dir'], config['data_fname'], config['k'])
    print("data_dir", data_dir)
    dir_io.delete_dir_if_exist(data_dir)
    '''
    dataset preparation
    make directory, extract base, query, gnd
    '''
    dir_io.mkdir(data_dir)
    print("create directory")

    dataset_dir = '%s/%s' % (config['source_data_dir'], config['source_data']['name'])
    dataset, alphabet = read_txt(dataset_dir, word_len=config['padding_length'],
                                 n_character=config['n_character'])
    print(alphabet)
    base = dataset[:config['source_data']['base_len']]
    query = dataset[-config['source_data']['query_len']:]

    start = time.time()
    gnd = groundtruth.get_gnd(base, query, config['k'], metrics="string")
    gnd_save_dir = '%s/%s' % (data_dir, 'gnd.npy')
    print(gnd.dtype)
    dir_io.save_numpy(gnd_save_dir, gnd)
    # vecs_io.ivecs_write(gnd_save_dir, gnd)
    end = time.time()
    print("save gnd, time:", end - start)
    print("gnd: ", gnd.shape)
    del gnd

    start = time.time()
    base_base_gnd, n_base_base_gnd = get_base_base_gnd(base, config)
    base_base_gnd_save_dir = '%s/%s' % (data_dir, 'base_base_gnd.npy')
    # vecs_io.ivecs_write(base_base_gnd_save_dir, base_base_gnd)
    print(base_base_gnd.dtype)
    dir_io.save_numpy(base_base_gnd_save_dir, base_base_gnd)
    end = time.time()
    print("save base_base_gnd for the training set preparation, time:", end - start)
    print("base_base_gnd: ", base_base_gnd.shape)
    del base_base_gnd

    # encoding and padding
    base_save_dir = '%s/%s' % (data_dir, 'base.npy')
    start = time.time()
    base = words2vector(base, config['padding_length'], alphabet)
    print(base.dtype)
    print("words2vector time consume %d" % (time.time() - start))
    # vecs_io.ivecs_write(base_save_dir, base)
    dir_io.save_numpy(base_save_dir, base)
    print("save base")
    print("base: ", base.shape)

    query_save_dir = '%s/%s' % (data_dir, 'query.npy')
    query = words2vector(query, config['padding_length'], alphabet)
    # vecs_io.ivecs_write(query_save_dir, query)
    print(query.dtype)
    dir_io.save_numpy(query_save_dir, query)
    print("save query")
    print("query: ", query.shape)

    description_dir = '%s/%s' % (data_dir, 'readme.txt')
    ptr = dir_io.write_ptr(description_dir)
    ptr.write('the max base_base_gnd is %d\n' % n_base_base_gnd)
    ptr.close()
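# --- Illustrative sketch of the encoding idea (words2vector itself is project code) ---
# Each word is assumed to be mapped to a fixed-length vector of character indices in
# the alphabet, padded with zeros up to padding_length; the exact encoding used by
# words2vector may differ.  The helper name below is made up for illustration.
import numpy as np


def encode_words(words, padding_length, alphabet):
    # 0 is reserved for padding, characters start from index 1
    char2idx = {c: i + 1 for i, c in enumerate(alphabet)}
    vecs = np.zeros((len(words), padding_length), dtype=np.int32)
    for row, word in enumerate(words):
        for col, ch in enumerate(word[:padding_length]):
            vecs[row, col] = char2idx[ch]
    return vecs


if __name__ == '__main__':
    print(encode_words(['ab', 'ba'], padding_length=4, alphabet='ab'))
    # [[1 2 0 0]
    #  [2 1 0 0]]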