def initialize(self):
    """Handle more expensive initialization."""
    self.gaia_db = self.initialize_gaia_db()
    try:
        self.metric = DistanceFunctionFactory.create(
            'euclidean', self.gaia_db.layout())
    except Exception as ex:
        # Building the metric fails if the dataset has not been transformed
        # yet; transform it and retry.
        logger.info(repr(ex))
        self.gaia_db = self.transform(self.gaia_db)
        self.metric = DistanceFunctionFactory.create(
            'euclidean', self.gaia_db.layout())
        self.transformed = True
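# A minimal usage sketch of the lazy initialization above. The wrapper class
# name and instantiation are assumptions for illustration, not confirmed by
# this file:
#
#   wrapper = GaiaWrapper()
#   wrapper.initialize()   # loads the dataset and builds the euclidean metric,
#                          # transforming the dataset first if its layout
#                          # does not yet support the metric
#   metric = wrapper.metric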
def __build_pca_metric(self):
    logger.info('Building metric for preset pca')
    with open(settings.PRESET_DIR + "pca.yaml") as f:
        preset_file = yaml.safe_load(f)
    distance = preset_file['distance']['type']
    parameters = preset_file['distance']['parameters']
    search_metric = DistanceFunctionFactory.create(
        str(distance), self.pca_dataset.layout(), parameters)
    self.metrics['pca'] = search_metric
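# The preset file is expected to contain a 'distance' section with a 'type'
# and a 'parameters' entry, as read above. A minimal sketch of such a
# pca.yaml (the concrete values here are assumptions, not taken from this
# repository):
#
#   distance:
#       type: euclidean
#       parameters:
#           descriptorNames: ['pca']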
def transform_and_save(self, dataset, path):
    """Transform dataset and save to disk."""
    if not self.transformed:
        dataset = self.transform(dataset)
        self.metric = DistanceFunctionFactory.create(
            'euclidean', dataset.layout())
        self.transformed = True
    dataset.save(path)
    return dataset
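# Example usage (a sketch; the path is hypothetical):
#
#   dataset = wrapper.transform_and_save(dataset, '/tmp/dataset_transformed.db')
#
# After the call, self.metric matches the (possibly changed) layout of the
# transformed dataset, so later nnSearch calls stay consistent with it.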
def __build_metrics(self):
    for preset in PRESETS:
        logger.debug("Building metric for preset %s" % preset)
        name = preset
        path = PRESET_DIR + name + ".yaml"
        with open(path) as f:
            preset_file = yaml.safe_load(f)
        distance = preset_file["distance"]["type"]
        parameters = preset_file["distance"]["parameters"]
        search_metric = DistanceFunctionFactory.create(
            str(distance), self.original_dataset.layout(), parameters)
        self.metrics[name] = search_metric
def __load_datasets(self):
    self.as_dataset.load(self.__get_dataset_path(clust_settings.INDEX_NAME_AS))
    self.as_view = View(self.as_dataset)
    # Alternative metrics kept for reference:
    # self.as_metric = DistanceFunctionFactory.create('euclidean', self.as_dataset.layout())
    # self.as_metric = DistanceFunctionFactory.create('CosineSimilarity', self.as_dataset.layout())
    # self.as_metric = DistanceFunctionFactory.create('CosineAngle', self.as_dataset.layout())
    self.as_metric = DistanceFunctionFactory.create('Manhattan', self.as_dataset.layout())

    self.tag_dataset.load(self.__get_dataset_path(clust_settings.INDEX_NAME_TAG))
    self.tag_view = View(self.tag_dataset)
    self.tag_metric = DistanceFunctionFactory.create('euclidean', self.tag_dataset.layout())

    self.fs_dataset.load(self.__get_dataset_path(clust_settings.INDEX_NAME_FS))
    self.fs_view = View(self.fs_dataset)
    self.fs_metric = DistanceFunctionFactory.create('euclidean', self.fs_dataset.layout(),
                                                    {'descriptorNames': 'pca'})

    # self.gaia_similiarity = GaiaWrapperSimilarity()

    self.__load_ac_descriptors_dataset()
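# Once a dataset is loaded, a nearest-neighbour query runs against its View
# with the matching metric. A minimal sketch (the point id and result count
# are hypothetical):
#
#   results = self.as_view.nnSearch('1234', self.as_metric).get(10)
#   # -> list of (point_id, distance) pairs for the 10 nearest points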
def __build_metrics(self):
    for preset in sim_settings.PRESETS:
        if preset != 'pca':
            # The pca metric is built only after the pca dataset is created,
            # so it should not be built here
            logger.info('Building metric for preset %s' % preset)
            name = preset
            path = sim_settings.PRESET_DIR + name + ".yaml"
            with open(path) as f:
                preset_file = yaml.safe_load(f)
            distance = preset_file['distance']['type']
            parameters = preset_file['distance']['parameters']
            search_metric = DistanceFunctionFactory.create(
                str(distance), self.original_dataset.layout(), parameters)
            self.metrics[name] = search_metric
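# A sketch of the settings shape this loop relies on (the concrete preset
# names and path are assumptions, not taken from this repository):
#
#   PRESETS = ['lowlevel', 'music', 'pca']
#   PRESET_DIR = '/path/to/presets/'
#
# After __build_metrics() plus __build_pca_metric(), self.metrics maps each
# preset name to a distance function ready to pass to View.nnSearch.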
def __load_ac_descriptors_dataset(self):
    # TODO: add this in clustering settings
    self.ac_dataset.load(self.__get_dataset_path('FS_AC_descriptors_normalized'))
    self.ac_view = View(self.ac_dataset)
    self.ac_metric = DistanceFunctionFactory.create('euclidean', self.ac_dataset.layout(), {'descriptorNames': [
        'ac_brightness',
        'ac_boominess',
        'ac_depth',
        'ac_hardness',
        'ac_roughness',
        'ac_sharpness',
        'ac_warmth',
    ]})
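# Restricting the metric to the ac_* descriptors above means distances only
# reflect those timbral dimensions. A query sketch (the sound id is
# hypothetical):
#
#   similar = self.ac_view.nnSearch('1234', self.ac_metric).get(5)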
def api_search(self, target_type, target, filter, preset_name, metric_descriptor_names, num_results, offset,
               in_ids):
    # Check if the index has a sufficient number of points
    size = self.original_dataset.size()
    if size < sim_settings.SIMILARITY_MINIMUM_POINTS:
        msg = 'Not enough datapoints in the dataset (%s < %s).' % (
            size, sim_settings.SIMILARITY_MINIMUM_POINTS)
        logger.info(msg)
        return {'error': True, 'result': msg,
                'status_code': sim_settings.SERVER_ERROR_CODE}

    # Get some dataset parameters that will be useful later
    trans_hist = self.transformations_history
    layout = self.original_dataset.layout()
    pca_layout = self.pca_dataset.layout()

    # Get normalization coefficients (from the last 'normalize' transformation)
    coeffs = None
    for i in range(0, len(trans_hist)):
        if trans_hist[-(i + 1)]['Analyzer name'] == 'normalize':
            coeffs = trans_hist[-(i + 1)]['Applier parameters']['coeffs']

    # Process target
    if target:
        if target_type == 'sound_id':
            query_point = str(target)
            if not self.original_dataset.contains(query_point):
                msg = "Sound with id %s doesn't exist in the dataset and cannot be set as similarity target." \
                      % query_point
                logger.info(msg)
                return {'error': True, 'result': msg,
                        'status_code': sim_settings.NOT_FOUND_CODE}
            else:
                query = query_point

        elif target_type == 'descriptor_values':
            # Transform input params to the normalized feature space and add them to a query point
            # If there are no params specified in the target, the point is set as empty (probably random
            # sounds are returned)
            feature_names = []
            query = Point()
            query.setLayout(layout)
            try:
                for param in target.keys():
                    # Only add numerical parameters. Non numerical ones (like key) are only used as filters
                    if param in coeffs.keys():
                        feature_names.append(str(param))
                        value = target[param]
                        if coeffs:
                            a = coeffs[param]['a']
                            b = coeffs[param]['b']
                            if len(a) == 1:
                                norm_value = a[0] * value + b[0]
                            else:
                                norm_value = []
                                for i in range(0, len(a)):
                                    norm_value.append(a[i] * value[i] + b[i])
                            query.setValue(str(param), norm_value)
                        else:
                            query.setValue(str(param), value)
            except Exception:
                return {'error': True,
                        'result': 'Invalid target (descriptor values could not be correctly parsed)',
                        'status_code': sim_settings.BAD_REQUEST_CODE}

            # Overwrite metric with the descriptors present in the target
            # NOTE: this assignment is later overridden by the preset metric below
            metric = DistanceFunctionFactory.create(
                'euclidean', layout, {'descriptorNames': feature_names})

        elif target_type == 'file':
            # Target is specified as the attached file
            # Create a point with the data in 'descriptors_data' and search for it
            target_file_parsing_type = '-'

            try:
                # Try directly loading the file
                p, query = Point(), Point()
                p.loadFromString(yaml.dump(target))
                if preset_name == 'pca':
                    query = self.pca_dataset.history().mapPoint(p)  # map point to pca dataset
                else:
                    query = self.original_dataset.history().mapPoint(p)  # map point to original dataset
                target_file_parsing_type = 'mapPoint'

            except Exception as e:
                logger.info('Unable to create gaia point from uploaded file (%s). '
                            'Trying to add descriptors one by one.' % e)

                # If that does not work, load the descriptors one by one
                try:
                    query = Point()
                    # query.setLayout(layout)

                    feature_names = []
                    get_nested_descriptor_names(target, feature_names)
                    feature_names = ['.%s' % item for item in feature_names]
                    nonused_features = []

                    for param in feature_names:
                        if param in coeffs.keys():
                            value = get_nested_dictionary_value(param[1:].split('.'), target)
                            if coeffs:
                                try:
                                    a = coeffs[param]['a']
                                    b = coeffs[param]['b']
                                    if len(a) == 1:
                                        norm_value = a[0] * value + b[0]
                                    else:
                                        norm_value = []
                                        for i in range(0, len(a)):
                                            norm_value.append(a[i] * value[i] + b[i])
                                    query.setValue(str(param[1:]), norm_value)
                                except Exception:
                                    nonused_features.append(param)
                            else:
                                query.setValue(str(param[1:]), value)
                        else:
                            nonused_features.append(param)

                    if preset_name == 'pca':
                        query = self.pca_dataset.history().mapPoint(query)  # map point to pca dataset
                    else:
                        query = self.original_dataset.history().mapPoint(query)  # map point to original dataset

                    target_file_parsing_type = 'walkDict'

                except Exception as e:
                    logger.info('Unable to create gaia point from uploaded file and adding descriptors one by '
                                'one (%s)' % e)
                    return {'error': True,
                            'result': 'Unable to create gaia point from uploaded file. Probably the '
                                      'file does not have the required layout. Are you using the '
                                      'correct version of Essentia\'s Freesound extractor?',
                            'status_code': sim_settings.SERVER_ERROR_CODE}
    else:
        query = Point()  # Empty target
        if preset_name == 'pca':
            query.setLayout(pca_layout)
        else:
            query.setLayout(layout)

    # Process filter
    if filter:
        filter = parse_filter_list(filter, coeffs)
    else:
        filter = ""  # Empty filter

    # Log the search
    log_message = 'Similarity search'
    if target:
        if target_type == 'sound_id':
            log_target = '%s (sound id)' % str(target)
        elif target_type == 'descriptor_values':
            log_target = '%s (descriptor values)' % str(target)
        elif target_type == 'file':
            log_target = 'uploaded file (%s)' % target_file_parsing_type
        log_message += ' with target: %s' % log_target
    if filter:
        log_message += ' with filter: %s' % str(filter)
    logger.info(log_message)

    # If in_ids is specified, edit the filter accordingly
    if in_ids:
        if not filter:
            filter = 'WHERE point.id IN ("' + '", "'.join(in_ids) + '")'
        else:
            filter += ' AND point.id IN ("' + '", "'.join(in_ids) + '")'

    # Set query metric
    metric = self.metrics[preset_name]
    if metric_descriptor_names:
        metric = DistanceFunctionFactory.create(
            'euclidean', layout, {'descriptorNames': metric_descriptor_names})

    # Do query!
    try:
        if target_type == 'descriptor_values' and target:
            search = self.view.nnSearch(query, metric, str(filter))
        else:
            if preset_name == 'pca':
                search = self.view_pca.nnSearch(query, metric, str(filter))
            else:
                search = self.view.nnSearch(query, metric, str(filter))
        results = search.get(num_results, offset=offset)
        count = search.size()
    except Exception as e:
        logger.info('Similarity server error (%s)' % e)
        return {'error': True, 'result': 'Similarity server error',
                'status_code': sim_settings.SERVER_ERROR_CODE}

    note = None
    if target_type == 'file':
        if target_file_parsing_type == 'walkDict':
            note = 'The layout of the given analysis file differed from what we expected. Similarity results ' \
                   'might not be accurate. Was the file generated with the last version of Essentia\'s ' \
                   'Freesound extractor?'

    return {'error': False, 'result': {'results': results, 'count': count, 'note': note}}
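# Example call (a sketch; the argument values, preset name, and wrapper
# instance are hypothetical):
#
#   response = wrapper.api_search(
#       target_type='sound_id',
#       target=1234,
#       filter=None,
#       preset_name='lowlevel',
#       metric_descriptor_names=None,
#       num_results=15,
#       offset=0,
#       in_ids=None)
#   if not response['error']:
#       results = response['result']['results']  # [(sound_id, distance), ...]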
def query_dataset(self, query_parameters, number_of_results):
    size = self.original_dataset.size()
    if size < SIMILARITY_MINIMUM_POINTS:
        msg = "Not enough datapoints in the dataset (%s < %s)." % (size, SIMILARITY_MINIMUM_POINTS)
        logger.debug(msg)
        return {"error": True, "result": msg}
        # raise Exception('Not enough datapoints in the dataset (%s < %s).' % (size, SIMILARITY_MINIMUM_POINTS))

    trans_hist = self.original_dataset.history().toPython()
    layout = self.original_dataset.layout()

    # Get normalization coefficients to transform the input data
    # (take them from the last transformation that was a normalization)
    coeffs = None
    for i in range(0, len(trans_hist)):
        if trans_hist[-(i + 1)]["Analyzer name"] == "normalize":
            coeffs = trans_hist[-(i + 1)]["Applier parameters"]["coeffs"]

    ##############
    # PARSE TARGET
    ##############

    # Transform input params to the normalized feature space and add them to a query point
    # If there are no params specified in the target, the point is set as empty (probably random sounds
    # are returned)
    q = Point()
    q.setLayout(layout)
    feature_names = []
    # If some target has been specified...
    if query_parameters["target"].keys():
        for param in query_parameters["target"].keys():
            # Only add numerical parameters. Non numerical ones (like key) are only used as filters
            if param in coeffs.keys():
                feature_names.append(str(param))
                value = query_parameters["target"][param]
                if coeffs:
                    a = coeffs[param]["a"]
                    b = coeffs[param]["b"]
                    if len(a) == 1:
                        norm_value = a[0] * value + b[0]
                    else:
                        norm_value = []
                        for i in range(0, len(a)):
                            norm_value.append(a[i] * value[i] + b[i])
                    q.setValue(str(param), norm_value)
                else:
                    q.setValue(str(param), value)

    ##############
    # PARSE FILTER
    ##############

    filter = ""
    # If some filter has been specified...
    if query_parameters["filter"]:
        if isinstance(query_parameters["filter"], str):
            filter = query_parameters["filter"]
        else:
            filter = self.parse_filter_list(query_parameters["filter"], coeffs)

    #############
    # DO QUERY!!!
    #############

    logger.debug("Content based search with target: " + str(query_parameters["target"]) +
                 " and filter: " + str(filter))
    metric = DistanceFunctionFactory.create("euclidean", layout, {"descriptorNames": feature_names})
    # Depending on the gaia version, the filter argument goes before or after the metric.
    # For the gaia version currently (Sep 2012) in Freesound: nnSearch(query, filter, metric)
    # results = self.view.nnSearch(q, str(filter), metric).get(int(number_of_results))  # <- Freesound
    results = self.view.nnSearch(q, metric, str(filter)).get(int(number_of_results))
    return {"error": False, "result": results}
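# Example of the expected query_parameters shape (the descriptor name and
# values are hypothetical; target keys must match entries in the
# normalization coefficients to be used):
#
#   query_parameters = {
#       'target': {'.lowlevel.pitch.mean': 440.0},
#       'filter': '',
#   }
#   response = wrapper.query_dataset(query_parameters, 10)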