def prepare_data(dataset: DatasetEnum, index: Index):
    """Build a neighbour dataset for *dataset* via descriptor-space search.

    For every query descriptor in the given dataset partition, retrieves the
    70 nearest neighbours from the FAISS index and records, per neighbour,
    both the geographic distance and the descriptor-space distance.

    :param dataset: dataset partition whose descriptors act as queries
    :param index: trained descriptor index used for the k-NN search
    :return: tuple ``(new_dataset, dataset_info)`` where ``new_dataset`` is a
        list of ``{"query": id, "neighbours": [ids]}`` records and
        ``dataset_info`` maps each query id to per-neighbour distance details.
    """
    cursor = MongoDescriptor.objects(dataset=dataset)
    pbar = tqdm.tqdm(cursor, total=cursor.count(),
                     desc=f"Processing {dataset} dataset")
    new_dataset = []
    dataset_info = {}
    for query in pbar:  # type: MongoDescriptor
        dists, ids = index.search(np.expand_dims(query.descriptor, axis=0), 70)
        dataset_object = {"query": query.photo_id, "neighbours": []}
        dataset_info[query.photo_id] = {"neighbours": [], "num_neighbours": 0}
        for neighbour_id, dist in zip(ids[0, :], dists[0, :]):
            dataset_object['neighbours'].append(int(neighbour_id))
            # NOTE: one DB round-trip per neighbour; acceptable for offline
            # dataset preparation. Cast to int so the query does not receive
            # a raw numpy scalar.
            neighbour_doc: MongoDescriptor = MongoDescriptor.objects(
                dataset=DatasetEnum.DATABASE,
                photo_id=int(neighbour_id)).first()
            geo_dist = compute_geo_distance(
                np.array([query.coordinates]),
                np.array([neighbour_doc.coordinates]))
            dataset_info[query.photo_id]["neighbours"].append({
                "id": int(neighbour_id),
                "geo_dist": float(geo_dist[0]),
                "desc_dist": float(dist)
            })
        # BUG FIX: num_neighbours was initialised to 0 but never updated.
        dataset_info[query.photo_id]["num_neighbours"] = len(
            dataset_info[query.photo_id]["neighbours"])
        new_dataset.append(dataset_object)
    return new_dataset, dataset_info
def prepare_data(dataset: DatasetEnum):
    """Build a neighbour dataset for *dataset* via geographic proximity.

    For every query in the given dataset partition, takes the 50
    geographically nearest database descriptors and records, per neighbour,
    both the geographic distance and the descriptor-space (L2) distance.

    :param dataset: dataset partition whose descriptors act as queries
    :return: tuple ``(new_dataset, dataset_info)`` where ``new_dataset`` is a
        list of ``{"query": id, "neighbours": [ids]}`` records and
        ``dataset_info`` maps each query id to per-neighbour distance details.
    """
    cursor = MongoDescriptor.objects(dataset=dataset)
    pbar = tqdm.tqdm(cursor, total=cursor.count(),
                     desc=f"Processing {dataset} dataset")
    new_dataset = []
    dataset_info = {}
    for query in pbar:  # type: MongoDescriptor
        dataset_object = {"query": query.photo_id, "neighbours": []}
        dataset_info[query.photo_id] = {"neighbours": [], "num_neighbours": 0}
        # Geo-nearest neighbours, limited to 50 results.
        neighbours = MongoDescriptor.objects(
            dataset=DatasetEnum.DATABASE,
            coords__near=query.coordinates)[:50]
        for neighbour in neighbours:  # type: MongoDescriptor
            dataset_object['neighbours'].append(neighbour.photo_id)
            geo_dist = compute_geo_distance(np.array([query.coordinates]),
                                            np.array([neighbour.coordinates]))
            dist = np.linalg.norm(query.descriptor - neighbour.descriptor)
            dataset_info[query.photo_id]["neighbours"].append({
                "id": neighbour.photo_id,
                "geo_dist": float(geo_dist[0]),
                "desc_dist": float(dist)
            })
        # BUG FIX: num_neighbours was initialised to 0 but never updated.
        dataset_info[query.photo_id]["num_neighbours"] = len(
            dataset_info[query.photo_id]["neighbours"])
        new_dataset.append(dataset_object)
    return new_dataset, dataset_info
def perform_localisation_benchmark(
        model_params: ModelParameters, index_config: IndexConfig,
        benchmark_params: BenchmarkParameters) -> BenchmarkResult:
    """Train a localisation model, run it on the query set, and score it.

    Training data is loaded either fully (descriptors included, so the index
    is built from scratch) or as ids+coordinates only (when a pre-built index
    directory is configured). The resulting :class:`BenchmarkResult` is
    optionally serialised to JSON.
    """
    model = LocalisationModel(model_params.localisation_type, index_config,
                              model_params.sigma, model_params.m,
                              model_params.k, model_params.num_workers)
    log.info(f"Localisation model: {repr(model)}")

    if index_config.index_dir is None:
        # No pre-built index: descriptors are needed to build one.
        log.info("Getting training data")
        ids, coordinates, descriptors = MongoDescriptor.get_data_as_arrays(
            DatasetEnum.DATABASE)
        log.info("Finished getting training data")
    else:
        # Index already on disk: only ids and coordinates are required.
        log.info("Getting ids and coords for training data")
        ids, coordinates = MongoDescriptor.get_ids_and_coords(
            DatasetEnum.DATABASE)
        descriptors = None
        log.info("Finished getting training data")
    log.debug(f"Current memory usage: {utils.get_memory_usage():.2f}GB")

    log.info("Fitting model...")
    model.fit(ids, coordinates, descriptors)
    # Training arrays can be large; drop them before loading queries.
    del ids, coordinates, descriptors
    log.debug(f"Current memory usage: {utils.get_memory_usage():.2f}GB")
    log.info("Model is trained.")

    log.info("Getting query data")
    q_ids, q_coordinates, q_descriptors = MongoDescriptor.get_data_as_arrays(
        dataset=benchmark_params.query_dataset)
    log.info("Finished getting query data")
    log.debug(f"Current memory usage: {utils.get_memory_usage():.2f}GB")

    log.info("Getting predictions...")
    predicted_locations = model.predict(q_descriptors)
    log.info("Finished getting predictions")

    # Image ids are only attached when extended results were requested.
    img_ids = q_ids if benchmark_params.extended_results else None
    result = _get_benchmark_results(predicted_locations, q_coordinates,
                                    img_ids)

    if benchmark_params.save_result:
        log.info("Saving test results")
        with open(benchmark_params.save_path, 'w') as f:
            json.dump(asdict(result), f)
    return result
def localisation_tuning(parameters: TuningParameters):
    """Grid-search localisation hyper-parameters over one or more indices.

    Loads training ids/coordinates once, precomputes the coordinate map,
    then evaluates every (index config, parameter tuple) combination on the
    query dataset, periodically checkpointing results to disk.
    """
    log.info("Getting ids and coords for training data")
    ids, coordinates = MongoDescriptor.get_ids_and_coords(DatasetEnum.DATABASE)
    log.info("Finished getting training data")
    log.debug(f"Current memory usage: {utils.get_memory_usage():.2f}GB")

    log.info(f"Getting query data")
    q_ids, q_coordinates, q_descriptors = MongoDescriptor.get_data_as_arrays(
        dataset=parameters.query_dataset)
    log.info("Finished getting query data")
    log.debug(f"Current memory usage: {utils.get_memory_usage():.2f}GB")

    # Shared across all experiments — computed once up front.
    coord_map = LocalisationModel.compute_coordinate_map(ids, coordinates)
    grid_tuples = _param_grid(parameters.grid)
    parameters_name = parameters.grid.keys()

    records = []
    for index_config in parameters.index_configs:
        for i, tup in enumerate(grid_tuples):
            start = time.time()
            experiment_parameters = _tuple_to_dict(parameters_name, tup)
            log.info(
                f"Tuning experiment: {i + 1}/{len(grid_tuples)}, index: {index_config.index_type.name}, "
                f"parameters: {experiment_parameters}")

            model_params = _params_from_tuple(parameters_name, tup,
                                              parameters.default_parameters)
            model = LocalisationModel(model_params.localisation_type,
                                      index_config, model_params.sigma,
                                      model_params.m, model_params.k,
                                      model_params.num_workers)
            model.fit_from_coord_map(coord_map)

            predicted_locations = model.predict(q_descriptors)
            result = _get_benchmark_results(predicted_locations, q_coordinates)
            records.append({
                "index_type": index_config.index_type.value,
                "parameters": experiment_parameters,
                "accuracy": result.accuracy,
                "errors": result.errors,
                "predictions_by_dist": result.predictions_by_dist
            })

            end = time.time()
            log.info(f"Current iteration time: {end - start:.3f}s")

            # Checkpoint every `save_every` experiments and at the end.
            if (i + 1) % parameters.save_every == 0 or (i + 1) == len(grid_tuples):
                log.info(
                    f"Saving tuning results. Step: {i + 1}/{len(grid_tuples)}, "
                    f"index: {index_config.index_type.name}, parameters: {experiment_parameters}"
                )
                _save_tuning_results(records, parameters.save_path)
                log.info(f"Results saved to: {parameters.save_path}")
def __getitem__(self, index) -> T_co:
    """Load the sample at *index* from MongoDB.

    Returns a ``(descriptor, photo_id, coordinates)`` triple where the
    descriptor and coordinates are float32 tensors.
    """
    photo_id = self.__ids[index]
    document: MongoDescriptor = MongoDescriptor.objects(
        photo_id=photo_id).first()
    return (
        torch.tensor(document.descriptor, dtype=torch.float32),
        photo_id,
        torch.tensor(document.coordinates, dtype=torch.float32),
    )
def get_index(index_config: IndexConfig) -> Index:
    """Build (or load) an :class:`Index` for the given configuration.

    When ``index_dir`` is set, the pre-built index is loaded from disk;
    otherwise the full database descriptors are fetched and a new index
    is built from them.
    """
    if index_config.index_dir is not None:
        # Pre-built index available on disk — no data loading needed.
        return IndexBuilder(index_config).build()
    log.info("Getting training data")
    ids, coordinates, descriptors = MongoDescriptor.get_data_as_arrays(
        DatasetEnum.DATABASE)
    log.info("Finished getting training data")
    return IndexBuilder(index_config, descriptors, ids).build()
def __getitem__(self, index) -> T_co:
    """Load one query together with its neighbour set from MongoDB.

    Returns tensors ``(query, neighbours, q_coords, n_coords, query_id,
    neighbours_ids)`` where descriptors/coordinates are float32.
    """
    record = self.dataset[index]
    query_id = record['query']
    neighbours_ids = record['neighbours']

    query_doc = MongoDescriptor.objects(photo_id=query_id).first()
    # NOTE(review): photo_id__in does not guarantee that documents come
    # back in the same order as neighbours_ids — confirm ordering does not
    # matter to consumers of this dataset.
    neighbour_docs = MongoDescriptor.objects(photo_id__in=neighbours_ids)
    neighbours = []
    n_coords = []
    for n_doc in neighbour_docs:
        neighbours.append(n_doc.descriptor)
        n_coords.append(n_doc.coordinates)

    return (
        torch.tensor(query_doc.descriptor, dtype=torch.float32),
        torch.tensor(neighbours, dtype=torch.float32),
        torch.tensor(query_doc.coordinates, dtype=torch.float32),
        torch.tensor(n_coords, dtype=torch.float32),
        torch.tensor(query_id),
        torch.tensor(neighbours_ids),
    )
def migrate():
    """Copy descriptors from on-disk descriptor tables into MongoDB.

    Iterates every configured (dataset, path) pair and saves one
    MongoDescriptor document per table row.
    """
    tables_pbar = tqdm(path_to_descriptors.items(), desc="Processing ")
    for dataset, path in tables_pbar:
        tables_pbar.set_description(f"Processing {dataset.value} dataset")
        # Descriptor dimensionality is fixed at 2048 for these tables.
        with DescriptorsTable(path, 2048) as table:
            rows = tqdm(table.iterrows(), leave=False, total=len(table))
            for row in rows:
                document = MongoDescriptor(photo_id=row.photo_id,
                                           dataset=dataset)
                document.coordinates = [row.lon, row.lat]
                document.descriptor = row.descriptor
                document.save()
def create_and_save_index(index_config: IndexConfig, save_path):
    """Build an index from the full database and persist it to *save_path*.

    Writes two files: a pickled copy of ``index_config`` (with ``index_dir``
    pointing at *save_path*) and the serialised index itself.

    :param index_config: configuration describing the index to build
    :param save_path: directory in which to store the config and index files
    """
    log.info("Reading data from db...")
    ids, coordinates, descriptors = MongoDescriptor.get_data_as_arrays(
        DatasetEnum.DATABASE)
    log.info("Finished reading data from db")
    log.info("Building index...")
    index = IndexBuilder(index_config, descriptors, ids).build()
    log.info(f"Index built: {repr(index)}")

    class_path = os.path.join(save_path, INDEX_CLASS_FILE)
    log.info(f"Saving class file to {class_path}")
    # BUG FIX: index_config is mutated for serialisation only; restore it in
    # a finally block so an exception during dump cannot leave the caller's
    # config pointing at save_path.
    original_index_dir = index_config.index_dir
    try:
        index_config.index_dir = save_path
        with open(class_path, "wb") as f:
            pickle.dump(index_config, f)
    finally:
        index_config.index_dir = original_index_dir

    index_path = os.path.join(save_path, INDEX_FILE_NAME)
    log.info(f"Saving index file to {index_path}")
    index.write_index(index_path)
import json import numpy as np import tqdm from mongoengine import connect from im2gps.data.descriptors import MongoDescriptor, DatasetEnum from im2gps.core.index import IndexConfig from im2gps.services.index import get_index from im2gps.core.metric import compute_geo_distance PATH_TO_DS = "/home/andrew/Documents/study/thesis/thesis-src/experiments/datasets/V1_0__Simple_dataset" connect(db="im2gps", host="localhost", port=51998) print("Getting queries") train_cursor = MongoDescriptor.objects(dataset=DatasetEnum.VALIDATION_QUERY) train_count = train_cursor.count() print("Building index") index_config = IndexConfig.from_path( "/home/andrew/Documents/study/thesis/indices/l2_512_index") index = get_index(index_config) train_dataset = [] train_info = {} pbar = tqdm.tqdm(train_cursor, total=train_count, desc="Processing train query") for i, train_query in enumerate(pbar): neighbours = MongoDescriptor.objects(
def prepare_data(dataset: DatasetEnum, index: Index):
    """Build a training dataset combining descriptor-space and geo-space
    neighbours, with a target neighbour per query.

    For each query: the top-100 descriptor-space neighbours (d-space, from
    the FAISS index) are intersected with the 500 geographically nearest
    database entries (c-space). If the intersection is empty, the target is
    the c-neighbour closest in descriptor space and the candidate list is
    filled 50/50 from d-space and c-space. Otherwise the whole intersection
    is kept, the target is the geographically nearest intersection member,
    and the remainder is split evenly between d-space and c-space.

    :param dataset: dataset partition whose descriptors act as queries
    :param index: trained descriptor index used for the k-NN search
    :return: tuple ``(new_dataset, dataset_info)``; each ``new_dataset``
        record carries ``query``, ``neighbours`` and ``target_id``.
    """
    cursor = MongoDescriptor.objects(dataset=dataset)
    pbar = tqdm.tqdm(cursor, total=cursor.count(),
                     desc=f"Processing {dataset} dataset")
    new_dataset = []
    dataset_info = {}
    for query in pbar:  # type: MongoDescriptor
        dataset_object = {
            "query": query.photo_id,
            "neighbours": [],
            "target_id": -1
        }
        dataset_info[query.photo_id] = {
            "neighbours": [],
            "num_neighbours": 0,
            "target_id": -1
        }

        # d-space: top-100 neighbours by descriptor distance.
        d_dists, d_neighbours_ids = index.search(
            np.expand_dims(query.descriptor, axis=0), 100)
        # c-space: 500 geographically nearest database entries.
        c_neighbours = list(
            MongoDescriptor.objects(dataset=DatasetEnum.DATABASE,
                                    coords__near=query.coordinates)[:500])
        n_descriptors = np.array(
            [desc_doc.descriptor for desc_doc in c_neighbours])

        c_neighbours_ids_set = {
            neighbour.photo_id for neighbour in c_neighbours
        }
        d_neighbours_ids_set = set(d_neighbours_ids[0])
        intersection = d_neighbours_ids_set.intersection(c_neighbours_ids_set)
        desc_dists = np.linalg.norm(query.descriptor - n_descriptors, axis=1)

        candidates = []
        if len(intersection) == 0:
            # Target: c-neighbour closest to the query in descriptor space.
            target_neighbour_idx = np.argmin(desc_dists)
            target_neighbour = c_neighbours[target_neighbour_idx].photo_id
            candidates.append(target_neighbour)
            # Fill with 50 d-space ids plus up to 49 c-space ids (target
            # excluded), for ~100 candidates total.
            candidates.extend(d_neighbours_ids[0][:50].tolist())
            added = 0
            for neighbour in c_neighbours:
                if neighbour.photo_id == target_neighbour:
                    continue
                candidates.append(neighbour.photo_id)
                added += 1
                if added == 49:
                    break
        else:
            # Keep the whole intersection; split the remaining budget
            # evenly between d-space and c-space.
            candidates.extend(list(intersection))
            max_neighbours = 100 - len(intersection)
            max_d_neighbours = round(max_neighbours / 2)
            max_c_neighbours = max_neighbours - max_d_neighbours

            # BUG FIX: the cap was checked *after* appending with `==`, so a
            # cap of 0 admitted the full list; guard before appending.
            added = 0
            for d_id in d_neighbours_ids[0].tolist():
                if added >= max_d_neighbours:
                    break
                if d_id in intersection:
                    continue
                candidates.append(d_id)
                added += 1

            # Target: geographically nearest intersection member
            # (c_neighbours are ordered by geo distance).
            target_neighbour = -1
            for n in c_neighbours:
                if n.photo_id in intersection:
                    target_neighbour = n.photo_id
                    break

            added = 0
            for neighbour in c_neighbours:
                if added >= max_c_neighbours:
                    break
                if neighbour.photo_id in intersection:
                    continue
                candidates.append(neighbour.photo_id)
                added += 1

        dataset_object['target_id'] = target_neighbour
        for neighbour_id in candidates:
            dataset_object['neighbours'].append(int(neighbour_id))
            # Per-candidate distance info (one DB round-trip per candidate).
            neighbour_doc: MongoDescriptor = MongoDescriptor.objects(
                dataset=DatasetEnum.DATABASE,
                photo_id=neighbour_id).first()
            geo_dist = compute_geo_distance(
                np.array([query.coordinates]),
                np.array([neighbour_doc.coordinates]))
            desc_dist = np.linalg.norm(query.descriptor -
                                       neighbour_doc.descriptor)
            dataset_info[query.photo_id]["neighbours"].append({
                "id": int(neighbour_id),
                "geo_dist": float(geo_dist[0]),
                "desc_dist": float(desc_dist)
            })
        # BUG FIX: num_neighbours was initialised to 0 but never updated.
        dataset_info[query.photo_id]["num_neighbours"] = len(
            dataset_info[query.photo_id]["neighbours"])
        new_dataset.append(dataset_object)
    # NOTE: removed leftover debug `print(count)` and the unused outer
    # `count = 0` initialiser.
    return new_dataset, dataset_info