import faiss
import numpy as np
import pandas as pd
import tqdm

from collections import Counter
from pathlib import Path

# `loader`, `timer`, `load_train19_landmark_dict` and `__search` are
# project-level helpers assumed to be defined elsewhere in this repository.


def __prep_faiss_search_results(block_id=1):  # block_id: 1 ~ 32
    """Search the train19 index against itself for one of 32 blocks and dump,
    per query image, the top-ranked neighbours that share its landmark."""
    dataset = loader.load_train_dataset()
    with timer('Loading train19 landmark dict'):
        landmark_dict = load_train19_landmark_dict()

    # Split the train set into 32 contiguous blocks; the last block absorbs
    # the remainder.
    size_train = dataset.feats_train.shape[0]
    part_size = int(size_train / 32)
    idx_train_start = (block_id - 1) * part_size
    idx_train_end = block_id * part_size
    if block_id == 32:
        idx_train_end = size_train

    # Exact L2 search on all available GPUs: index the full train set, then
    # query only this block's slice for its 1000 nearest neighbours.
    cpu_index = faiss.IndexFlatL2(dataset.feats_train.shape[1])
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
    gpu_index.add(dataset.feats_train)
    dists, topk_idx = gpu_index.search(
        x=dataset.feats_train[idx_train_start:idx_train_end], k=1000)

    df = pd.DataFrame(dataset.ids_train[idx_train_start:idx_train_end],
                      columns=['id'])
    df['images'] = np.apply_along_axis(
        ' '.join, axis=1, arr=dataset.ids_train[topk_idx])

    print('generate sub')
    rows = []
    for imidx, (_, r) in tqdm.tqdm(enumerate(df.iterrows()), total=len(df)):
        landmark_id = landmark_dict[r['id']]

        # Keep up to 100 neighbours with the same landmark, encoded as
        # "<rank>:<distance>:<image_id>".
        same_landmark_images = []
        for rank, imid in enumerate(r.images.split(' ')):
            if landmark_id == landmark_dict[imid]:
                same_landmark_images.append(
                    f'{rank}:{dists[imidx, rank]:.8f}:{imid}')
                if len(same_landmark_images) >= 100:
                    break

        rows.append({
            'id': r['id'],
            'landmark_id': landmark_id,
            'matched': ' '.join(same_landmark_images),
        })

    fn = ('data/working/exp12/'
          f'train19_train19_faiss_search_same_landmarks_blk{block_id}.csv.gz')
    Path(fn).parent.mkdir(parents=True, exist_ok=True)

    print('to_csv')
    pd.DataFrame(rows).to_csv(fn, index=False, compression='gzip')
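# The helper below is an illustrative sketch, not part of the original module:
# it shows one way the 32 blocks produced by __prep_faiss_search_results could
# be generated sequentially and concatenated into a single table. The output
# directory and file pattern follow the paths used above; the sequential block
# loop and the function name are assumptions about how the step is driven.
def _run_all_same_landmark_blocks():
    for block_id in range(1, 33):
        __prep_faiss_search_results(block_id=block_id)

    paths = sorted(
        Path('data/working/exp12').glob(
            'train19_train19_faiss_search_same_landmarks_blk*.csv.gz'))
    # Concatenate the per-block results into one dataframe of matched lists.
    return pd.concat([pd.read_csv(p) for p in paths], ignore_index=True)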
def __scoring_with_top100_arcfacefish_v4():
    """Label each test image by soft voting over its top-40 train19 neighbours
    and write a recognition submission file."""
    dataset = loader.load_train_dataset()
    fn_out = 'data/working/exp12/v7_fish_dba2qe10.h5'
    if not Path(fn_out).exists():
        # Build the DBA/QE-augmented descriptors once and cache them on disk.
        __search(dataset, fn_out, dba_niters=2, qe_topk=10)

    dataset = loader.load_train_dataset_singlefile(fn_out)
    with timer('Loading train19 landmark dict'):
        landmark_dict = load_train19_landmark_dict()

    fn_sub = 'data/working/exp12/v7_fish_nodba_top40_train19_v4.csv.gz'

    # Exact L2 search on all available GPUs: index the train descriptors and
    # fetch the 100 nearest train neighbours of every test image.
    cpu_index = faiss.IndexFlatL2(dataset.feats_train.shape[1])
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
    gpu_index.add(dataset.feats_train)
    dists, topk_idx = gpu_index.search(x=dataset.feats_test, k=100)

    df = pd.DataFrame(dataset.ids_test, columns=['id'])
    df['images'] = np.apply_along_axis(
        ' '.join, axis=1, arr=dataset.ids_train[topk_idx])

    print('generate sub')
    rows = []
    # Best achievable vote mass: all top-40 neighbours share one landmark.
    max_value = sum([np.exp(-np.sqrt(i + 1)) for i in range(40)])
    for _, r in tqdm.tqdm(df.iterrows(), total=len(df)):
        image_ids = [name.split('/')[-1] for name in r.images.split(' ')]

        # Soft voting: the neighbour at rank i contributes exp(-sqrt(i + 1))
        # to its landmark's score.
        counter = Counter()
        for i, image_id in enumerate(image_ids[:40]):
            landmark_id = landmark_dict[image_id]
            counter[landmark_id] += np.exp(-np.sqrt(i + 1))

        landmark_id, score = counter.most_common(1)[0]
        score = score / max_value  # normalise into (0, 1]
        rows.append({
            'id': r['id'],
            'landmarks': f'{landmark_id} {score:.9f}',
        })

    print('to_csv')
    df = pd.DataFrame(rows)
    df_sub = pd.read_csv('data/recognition_sample_submission.csv')
    df_sub = df_sub[['id']].merge(df, how='left', on='id')
    df_sub[['id', 'landmarks']].to_csv(fn_sub, index=False, compression='gzip')
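# Illustrative sketch only, not in the original module: the scoring loop above
# is a rank-weighted soft vote over the top-40 train neighbours, where the
# neighbour at rank i (0-based) is weighted by exp(-sqrt(i + 1)) and the winning
# landmark's mass is normalised by the best achievable sum. The helper below
# isolates that weighting; its name and signature are assumptions.
def _soft_vote(neighbor_landmark_ids, topk=40):
    weights = np.exp(-np.sqrt(np.arange(1, topk + 1)))

    counter = Counter()
    for weight, landmark_id in zip(weights, neighbor_landmark_ids[:topk]):
        counter[landmark_id] += weight

    landmark_id, score = counter.most_common(1)[0]
    # A unanimous neighbourhood scores exactly 1.0 after normalisation.
    return landmark_id, score / weights.sum()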