def createDataSet():
    """Build a two-point DataSet with a single real descriptor 'a'.

    The dataset is post-processed with fixLength / enumerateStrings
    according to the testdata module flags before being returned.
    """
    layout = PointLayout()
    layout.add('a', RealType)

    ds = DataSet()
    for name, value in (('p1', (0.0, 0.0)), ('p2', (0.5, 1.0))):
        point = Point()
        point.setName(name)
        point.setLayout(layout)
        point['a'] = value
        ds.addPoint(point)

    if testdata.useFixedLength:
        ds = testdata.fixLength(ds)
    if testdata.useEnumerate:
        ds = testdata.enumerateStrings(ds)

    return ds
def testMergePointsWithDifferentEnumerationMaps(self):
    """Ticket #74: when changing the layout of a point, we must also make
    sure that the enum maps are correctly mapped."""
    first = Point()
    first.setName('p1')
    first.setLayout(self.l1)
    first['d'] = 'hello'

    second = Point()
    second.setName('p2')
    second.setLayout(self.l1)
    second['d'] = 'world'

    ds = DataSet()
    ds.addPoint(first)
    ds.addPoint(second)
    self.assertEqual(ds.point('p1').label('d'), 'hello')
    self.assertEqual(ds.point('p2').label('d'), 'world')

    # re-add p2 after the dataset's enumeration map has been rebuilt
    ds.removePoint('p2')
    ds = transform(ds, 'enumerate', { 'descriptorNames': 'd' })
    ds.addPoint(second)

    self.assertEqual(ds.point('p1').label('d'), 'hello')
    self.assertEqual(ds.point('p2').label('d'), 'world')
def testLayout(self):
    """Save a dataset (with layout) to disk, reload it, and verify the
    layout and stored values survive the round-trip."""
    layout = testdata.createSimpleLayout()
    (handle, path) = tempfile.mkstemp()
    os.close(handle)

    # build two points sharing the layout; write dataset with layout
    first = Point()
    first.setName('p1')
    first.setLayout(layout)

    second = Point()
    second.setName('p2')
    second.setLayout(layout)

    first['a.1'] = 23
    self.assertEqual(first['a.1'], 23)
    # setting a descriptor that is not in the layout must fail
    self.assertRaises(Exception, first.setValue, 'a.4', 34)

    saved = DataSet()
    saved.setName('ds1')
    saved.addPoint(first)
    saved.addPoint(second)
    saved.save(path)

    # reload dataset and compare against the original
    loaded = DataSet()
    loaded.load(path)
    self.assertEqual(layout, loaded.layout())
    self.assertEqual(loaded.point('p1')['a.1'], 23)

    # remove temp file
    os.remove(path)
def newPoint(name):
    """Return a new Point called *name* with a single real descriptor 'a'."""
    layout = PointLayout()
    layout.add('a', RealType)

    point = Point()
    point.setName(name)
    point.setLayout(layout)
    return point
def testAddToDataSetWithDifferentLayouts(self):
    """Adding a point whose layout differs from the dataset's layout must
    raise, and layout reference counts must stay consistent."""
    p1 = Point()
    p1.setLayout(self.l1)   # +1, ref = 2

    p2 = Point()            # keeps its own default layout, ref = 1

    ds = DataSet()
    ds.addPoint(p1)         # +2 (dataset + point copy), ref = 4
    self.assertRaises(Exception, ds.addPoint, p2)

    self.assertEqual(p1.layout().ref(), 4)
    self.assertEqual(p2.layout().ref(), 1)
def createSimpleDataSet():
    """Return a one-point DataSet ('p') using the simple layout.

    The dataset is post-processed with fixLength / enumerateStrings
    according to the module-level useFixedLength / useEnumerate flags.
    """
    # NOTE: the original declared `global useFixedLength, useEnumerate`,
    # but both names are only read here (never assigned), so the global
    # statement was unnecessary and has been removed.
    l = createSimpleLayout()
    ds = DataSet()
    p = Point()
    p.setName('p')
    p.setLayout(l)
    ds.addPoint(p)

    if useFixedLength:
        ds = fixLength(ds)
    if useEnumerate:
        ds = enumerateStrings(ds)

    return ds
def testForceIdentity(self):
    """A 'forceidentity' metric must return 0 when a point is compared
    with itself, and delegate to the wrapped distance otherwise.

    Fix: the deprecated `assertEquals` alias (removed in Python 3.12)
    is replaced with `assertEqual`; assertions are otherwise unchanged.
    """
    l = PointLayout()
    l.add('a', RealType, FixedLength, 1)
    p = Point()
    p.setLayout(l)

    # plain cosine similarity: falls back to defaultValue even for p vs p
    cd = MetricFactory.create('cosinesimilarity', p.layout(), { 'defaultValue': 0.5 })
    self.assertEqual(cd(p, p), 0.5)

    # forceidentity wrapper: identical point must yield exactly 0
    ficd = MetricFactory.create('forceidentity', p.layout(),
                                { 'distance': 'cosinesimilarity',
                                  'params': { 'defaultValue': 0.5 } })
    self.assertEqual(ficd(p, p), 0.0)

    # a copy with a different name is no longer "identical"
    p2 = Point(p)
    p2.setName('p2')
    self.assertEqual(ficd(p, p2), 0.5)
def testCreatedInputSpace(self):
    """An InputSpace built explicitly with addPoints must behave the same
    as one derived from an nnSearch result with points removed; also
    checks that thresholdLimit() caps results by distance."""
    ds = testdata.createSimpleDataSet()
    ds.point('p')['a.1'] = 23.0

    # add 5 extra points p0..p4 with a.1 = 0.0 .. 4.0
    for i in range(5):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(ds.originalLayout())
        p['a.1'] = float(i)
        ds.addPoint(p)

    ds = transform(ds, 'fixlength')
    dist = MetricFactory.create('euclidean', ds.layout())
    v = View(ds)
    p = ds.point('p')

    # input space obtained from a search result, then shrunk by removal
    RS_remove = v.nnSearch(p, dist)
    RS_remove.removePoints(['p2', 'p4'])

    # input space built explicitly from the same remaining point ids
    RS_create = InputSpace()
    RS_create.addPoints(ds, ['p', 'p0', 'p1', 'p3'])

    rsc = v.nnSearch(p, RS_remove, dist)
    rsa = v.nnSearch(p, RS_create, dist)

    # both input spaces must yield identical ranked results
    self.assertEqual((('p', 0.), ('p3', 20.), ('p1', 22.), ('p0', 23.)),
                     v.nnSearch(p, rsc, dist).get(10))
    self.assertEqual((('p', 0.), ('p3', 20.), ('p1', 22.), ('p0', 23.)),
                     v.nnSearch(p, rsa, dist).get(10))

    # test thresholdLimit method: only results within the distance bound
    self.assertEqual((('p', 0.), ),
                     v.nnSearch(p, rsa, dist).thresholdLimit(10).get(10))
    self.assertEqual((('p', 0.), ('p3', 20.)),
                     v.nnSearch(p, rsa, dist).thresholdLimit(20).get(10))
    self.assertEqual((('p', 0.), ('p3', 20.), ('p1', 22.)),
                     v.nnSearch(p, rsa, dist).thresholdLimit(22.01).get(10))
def PCA(x):
    """Project the row vectors of *x* onto their principal components.

    Builds a temporary DataSet with one point per row (descriptor 'x'),
    applies the 'fixlength' and 'pca' transforms, and returns the
    projected 'pca' descriptor of every point, in dataset order.
    """
    layout = PointLayout()
    layout.add('x', RealType)

    points = []
    for idx, row in enumerate(x):
        point = Point()
        point.setName('p%d' % idx)
        point.setLayout(layout)
        point['x'] = row
        points.append(point)

    ds = DataSet()
    ds.addPoints(points)

    ds = transform(ds, 'fixlength')
    # keep all dimensions: the projection is a pure rotation of the data
    ds = transform(ds, 'pca', { 'dimension': len(x[0]), 'resultName': 'pca' })

    return [point['pca'] for point in ds.points()]
def testCenter(self):
    """The 'center' transform must subtract the per-dimension mean of the
    selected descriptor from every point."""
    ds = testdata.createSimpleDataSet()
    layout = testdata.createSimpleLayout()

    # replace the default point 'p' with four fresh points p0..p3
    for i in range(4):
        point = Point()
        point.setName('p%d' % i)
        point.setLayout(layout)
        ds.addPoint(point)
    ds.removePoint('p')

    ds.point('p0')['a.1'] = [ 0, 1 ]
    ds.point('p1')['a.1'] = [ 4, 3 ]
    ds.point('p2')['a.1'] = [ 6, 9 ]
    ds.point('p3')['a.1'] = [ 2, 27 ]
    # column means: [ 3, 10 ]

    ds = transform(ds, 'fixlength')
    centered = transform(ds, 'center', { 'descriptorNames': 'a.1' })

    self.assertEqual(centered.point('p0')['a.1'], (-3, -9))
    self.assertEqual(centered.point('p1')['a.1'], ( 1, -7))
    self.assertEqual(centered.point('p2')['a.1'], ( 3, -1))
    self.assertEqual(centered.point('p3')['a.1'], (-1, 17))
def PCA(x):
    """Return the PCA projection of each row of *x*.

    Wraps the rows into a Gaia DataSet (one point per row, descriptor
    'x'), runs 'fixlength' followed by 'pca', and collects the resulting
    'pca' descriptor for every point.
    """
    layout = PointLayout()
    layout.add('x', RealType)

    def _wrap(index, row):
        # one Gaia point per input row
        p = Point()
        p.setName('p%d' % index)
        p.setLayout(layout)
        p['x'] = row
        return p

    ds = DataSet()
    ds.addPoints([_wrap(i, row) for i, row in enumerate(x)])

    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'pca', {'dimension': len(x[0]), 'resultName': 'pca'})

    return [p['pca'] for p in ds.points()]
def readLibSVMDataSet(filename):
    """Parse a libsvm-format file into a Gaia DataSet.

    Each input line is `label idx:val idx:val ...`; labels go into the
    'class' string descriptor and the sparse values are densified into a
    'value' real descriptor (missing indices default to 0.0).

    Fix: the file is now opened with a context manager so the handle is
    closed deterministically instead of being leaked.
    """
    with open(filename) as f:
        data = [l.split() for l in f.readlines()]

    # determine the (1-based, usually) index range across all instances
    minidx = maxidx = 1
    for l in data:
        for i in range(1, len(l)):
            dim, value = l[i].split(':')
            l[i] = (int(dim), float(value))
            minidx = min(minidx, int(dim))
            maxidx = max(maxidx, int(dim))
    dimension = maxidx - minidx + 1

    layout = PointLayout()
    layout.add('class', StringType)
    layout.add('value', RealType)

    ds = DataSet()
    points = []
    for n, l in enumerate(data):
        p = Point()
        p.setLayout(layout)
        p.setName('instance_%06d' % n)
        p['class'] = l[0]
        # densify the sparse index/value pairs, zero-filled by default
        desc = RealDescriptor(dimension, 0.0)
        for dim, value in l[1:]:
            desc[dim - minidx] = value
        p['value'] = desc
        points.append(p)

    ds.addPoints(points)
    return ds
def api_search(self, target_type, target, filter, preset_name, metric_descriptor_names,
               num_results, offset, in_ids):
    """Run a similarity search against the loaded Gaia datasets.

    target_type is one of 'sound_id', 'descriptor_values' or 'file'; the
    query point is built accordingly, the optional filter / in_ids are
    compiled into a Gaia filter string, and an nnSearch is executed on
    either the PCA view or the full view depending on preset_name.
    Returns a dict with 'error', 'result' and (on failure) 'status_code'.

    Fix: in the 'file' fallback branch (descriptor-by-descriptor), the
    non-PCA path previously mapped the stale point `p` (whose
    loadFromString had just failed) instead of the freshly built `query`.
    """
    # Check if index has sufficient points
    size = self.original_dataset.size()
    if size < sim_settings.SIMILARITY_MINIMUM_POINTS:
        msg = 'Not enough datapoints in the dataset (%s < %s).' % (
            size, sim_settings.SIMILARITY_MINIMUM_POINTS)
        logger.info(msg)
        return {
            'error': True,
            'result': msg,
            'status_code': sim_settings.SERVER_ERROR_CODE
        }

    # Get some dataset parameters that will be useful later
    trans_hist = self.transformations_history
    layout = self.original_dataset.layout()
    pca_layout = self.pca_dataset.layout()

    # Get normalization coefficients (from the last 'normalize' transform)
    coeffs = None
    for i in range(0, len(trans_hist)):
        if trans_hist[-(i + 1)]['Analyzer name'] == 'normalize':
            coeffs = trans_hist[-(i + 1)]['Applier parameters']['coeffs']

    # Process target
    if target:
        if target_type == 'sound_id':
            query_point = str(target)
            if not self.original_dataset.contains(query_point):
                msg = "Sound with id %s doesn't exist in the dataset and can not be set as similarity target." \
                      % query_point
                logger.info(msg)
                return {
                    'error': True,
                    'result': msg,
                    'status_code': sim_settings.NOT_FOUND_CODE
                }
            else:
                query = query_point

        elif target_type == 'descriptor_values':
            # Transform input params to the normalized feature space and add
            # them to a query point. If there are no params specified in the
            # target, the point is set as empty (probably random sounds are
            # returned).
            feature_names = []
            query = Point()
            query.setLayout(layout)
            try:
                for param in target.keys():
                    # Only add numerical parameters. Non numerical ones (like
                    # key) are only used as filters.
                    # NOTE(review): if coeffs is None this raises and is
                    # reported as a bad request below — presumably a
                    # 'normalize' step is always present; confirm.
                    if param in coeffs.keys():
                        feature_names.append(str(param))
                        value = target[param]
                        if coeffs:
                            a = coeffs[param]['a']
                            b = coeffs[param]['b']
                            if len(a) == 1:
                                norm_value = a[0] * value + b[0]
                            else:
                                norm_value = []
                                for i in range(0, len(a)):
                                    norm_value.append(a[i] * value[i] + b[i])
                            query.setValue(str(param), norm_value)
                        else:
                            query.setValue(str(param), value)
            except:
                return {
                    'error': True,
                    'result': 'Invalid target (descriptor values could not be correctly parsed)',
                    'status_code': sim_settings.BAD_REQUEST_CODE
                }

            # Overwrite metric with present descriptors in target
            # NOTE(review): this value is later replaced unconditionally by
            # self.metrics[preset_name] below — kept as in the original.
            metric = DistanceFunctionFactory.create(
                'euclidean', layout, {'descriptorNames': feature_names})

        elif target_type == 'file':
            # Target is specified as the attached file.
            # Create a point with the data in 'descriptors_data' and search for it.
            target_file_parsing_type = '-'
            try:
                # Try directly loading the file
                p, query = Point(), Point()
                p.loadFromString(yaml.dump(target))
                if preset_name == 'pca':
                    query = self.pca_dataset.history().mapPoint(p)  # map point to pca dataset
                else:
                    query = self.original_dataset.history().mapPoint(p)  # map point to original dataset
                target_file_parsing_type = 'mapPoint'
            except Exception as e:
                logger.info(
                    'Unable to create gaia point from uploaded file (%s). '
                    'Trying adding descriptors one by one.' % e)

                # If does not work, load descriptors one by one
                try:
                    query = Point()
                    feature_names = []
                    get_nested_descriptor_names(target, feature_names)
                    feature_names = ['.%s' % item for item in feature_names]
                    # descriptors we could not normalize/set (informational only)
                    nonused_features = []
                    for param in feature_names:
                        if param in coeffs.keys():
                            value = get_nested_dictionary_value(
                                param[1:].split('.'), target)
                            if coeffs:
                                try:
                                    a = coeffs[param]['a']
                                    b = coeffs[param]['b']
                                    if len(a) == 1:
                                        norm_value = a[0] * value + b[0]
                                    else:
                                        norm_value = []
                                        for i in range(0, len(a)):
                                            norm_value.append(a[i] * value[i] + b[i])
                                    query.setValue(str(param[1:]), norm_value)
                                except:
                                    nonused_features.append(param)
                            else:
                                query.setValue(str(param[1:]), value)
                        else:
                            nonused_features.append(param)

                    if preset_name == 'pca':
                        query = self.pca_dataset.history().mapPoint(query)  # map point to pca dataset
                    else:
                        # BUG FIX: was mapPoint(p) — `p` is the point whose
                        # loadFromString failed above, not the one built here.
                        query = self.original_dataset.history().mapPoint(query)  # map point to original dataset
                    target_file_parsing_type = 'walkDict'
                except Exception as e:
                    logger.info(
                        'Unable to create gaia point from uploaded file and adding descriptors one by '
                        'one (%s)' % e)
                    return {
                        'error': True,
                        'result': 'Unable to create gaia point from uploaded file. Probably the '
                                  'file does not have the required layout. Are you using the '
                                  'correct version of Essentia\'s Freesound extractor?',
                        'status_code': sim_settings.SERVER_ERROR_CODE
                    }
    else:
        query = Point()  # Empty target
        if preset_name == 'pca':
            query.setLayout(pca_layout)
        else:
            query.setLayout(layout)

    # Process filter
    if filter:
        filter = parse_filter_list(filter, coeffs)
    else:
        filter = ""  # Empty filter

    # log
    log_message = 'Similarity search'
    if target:
        if target_type == 'sound_id':
            log_target = '%s (sound id)' % str(target)
        elif target_type == 'descriptor_values':
            log_target = '%s (descriptor values)' % str(target)
        elif target_type == 'file':
            log_target = 'uploaded file (%s)' % target_file_parsing_type
        log_message += ' with target: %s' % log_target
    if filter:
        log_message += ' with filter: %s' % str(filter)
    logger.info(log_message)

    # if in_ids is specified, edit the filter accordingly
    if in_ids:
        if not filter:
            filter = 'WHERE point.id IN ("' + '", "'.join(in_ids) + '")'
        else:
            filter += ' AND point.id IN ("' + '", "'.join(in_ids) + '")'

    # Set query metric
    metric = self.metrics[preset_name]
    if metric_descriptor_names:
        metric = DistanceFunctionFactory.create(
            'euclidean', layout, {'descriptorNames': metric_descriptor_names})

    # Do query!
    try:
        if target_type == 'descriptor_values' and target:
            search = self.view.nnSearch(query, metric, str(filter))
        else:
            if preset_name == 'pca':
                search = self.view_pca.nnSearch(query, metric, str(filter))
            else:
                search = self.view.nnSearch(query, metric, str(filter))
        results = search.get(num_results, offset=offset)
        count = search.size()
    except Exception as e:
        return {
            'error': True,
            'result': 'Similarity server error',
            'status_code': sim_settings.SERVER_ERROR_CODE
        }

    note = None
    if target_type == 'file':
        if target_file_parsing_type == 'walkDict':
            note = 'The layout of the given analysis file differed from what we expected. Similarity results ' \
                   'might not be accurate. Was the file generated with the last version of Essentia\'s ' \
                   'Freesound extractor?'

    return {
        'error': False,
        'result': {
            'results': results,
            'count': count,
            'note': note
        }
    }
def query_dataset(self, query_parameters, number_of_results):
    """Search the dataset for the query_parameters 'target'/'filter' pair.

    The target descriptors are mapped into the normalized feature space
    using the coefficients of the last 'normalize' transformation, then an
    euclidean nnSearch over those descriptors is executed on the view.
    Returns {'error': bool, 'result': msg_or_results}.

    Fix: the filter "is it a string?" check used
    `type(query_parameters["filter"][0:5]) == str`, i.e. it inspected the
    type of a slice; replaced with an `isinstance` check on the value
    itself (same outcome for str vs list inputs).
    """
    size = self.original_dataset.size()
    if size < SIMILARITY_MINIMUM_POINTS:
        msg = "Not enough datapoints in the dataset (%s < %s)." % (size, SIMILARITY_MINIMUM_POINTS)
        logger.debug(msg)
        return {"error": True, "result": msg}

    trans_hist = self.original_dataset.history().toPython()
    layout = self.original_dataset.layout()

    # Get normalization coefficients to transform the input data (get info
    # from the last transformation which has been a normalization)
    coeffs = None
    for i in range(0, len(trans_hist)):
        if trans_hist[-(i + 1)]["Analyzer name"] == "normalize":
            coeffs = trans_hist[-(i + 1)]["Applier parameters"]["coeffs"]

    ##############
    # PARSE TARGET
    ##############
    # Transform input params to the normalized feature space and add them to
    # a query point. If there are no params specified in the target, the
    # point is set as empty (probably random sounds are returned).
    q = Point()
    q.setLayout(layout)
    feature_names = []
    # If some target has been specified...
    if query_parameters["target"].keys():
        for param in query_parameters["target"].keys():
            # Only add numerical parameters. Non numerical ones (like key)
            # are only used as filters.
            # NOTE(review): raises if coeffs is None — presumably a
            # 'normalize' step is always present in the history; confirm.
            if param in coeffs.keys():
                feature_names.append(str(param))
                value = query_parameters["target"][param]
                if coeffs:
                    a = coeffs[param]["a"]
                    b = coeffs[param]["b"]
                    if len(a) == 1:
                        norm_value = a[0] * value + b[0]
                    else:
                        norm_value = []
                        for i in range(0, len(a)):
                            norm_value.append(a[i] * value[i] + b[i])
                    q.setValue(str(param), norm_value)
                else:
                    q.setValue(str(param), value)

    ##############
    # PARSE FILTER
    ##############
    filter = ""
    # If some filter has been specified...
    if query_parameters["filter"]:
        if isinstance(query_parameters["filter"], str):
            filter = query_parameters["filter"]
        else:
            filter = self.parse_filter_list(query_parameters["filter"], coeffs)

    #############
    # DO QUERY!!!
    #############
    logger.debug(
        "Content based search with target: "
        + str(query_parameters["target"])
        + " and filter: "
        + str(filter)
    )
    metric = DistanceFunctionFactory.create("euclidean", layout, {"descriptorNames": feature_names})
    # Looks like that depending on the version of gaia, variable filter must
    # go after or before the metric.
    # For the gaia version we have currently (sep 2012) in freesound:
    # nnSearch(query, filter, metric)
    # results = self.view.nnSearch(q,str(filter),metric).get(int(number_of_results))  # <- Freesound
    results = self.view.nnSearch(q, metric, str(filter)).get(int(number_of_results))

    return {"error": False, "result": results}
def testSimplifyHistory(self):
    """simplifyHistory() must collapse a chain of transformations into the
    minimal equivalent remove + fixlength sequence, and the simplified
    history must still map new points correctly."""
    ds = testdata.createSimpleDataSet()
    p = Point()
    p.setName('p2')
    p.setLayout(ds.layout())
    p['a.2'] = [1.2, 2.3]
    ds.addPoint(p)
    ds0 = ds.copy()

    # no transformation applied yet: simplified history is empty
    ds1 = ds.copy()
    ds1.simplifyHistory()
    self.assertEqual(ds1.history().size(), 0)

    # removevl simplifies to a plain remove of the variable-length '.a.2'
    ds = transform(ds, 'removevl')
    ds2 = ds.copy()
    ds2.simplifyHistory()
    self.assertEqual(ds2.history().toPython(),
                     [{'Analyzer name': 'remove',
                       'Analyzer parameters': {'descriptorNames': ['.a.2']},
                       'Applier name': 'removedesc',
                       'Applier parameters': {'descriptorNames': ['.a.2']},
                       'Additional info': {}}])

    # adding fixlength keeps a two-step remove + fixlength history
    ds = transform(ds, 'fixlength')
    ds3 = ds.copy()
    ds3.simplifyHistory()
    self.assertEqual(ds3.history().toPython(),
                     [{'Analyzer name': 'remove',
                       'Analyzer parameters': {'descriptorNames': ['.a.2']},
                       'Applier name': 'removedesc',
                       'Applier parameters': {'descriptorNames': ['.a.2']},
                       'Additional info': {}},
                      {'Analyzer name': 'fixlength',
                       'Analyzer parameters': {'descriptorNames': ['.a.1', '.b', '.c', '.d']},
                       'Applier name': 'fixlengthapplier',
                       'Applier parameters': {'descriptorNames': ['.a.1', '.b', '.c', '.d']},
                       'Additional info': {}}])

    # a further remove of 'a.*' is merged into the single remove step
    ds = transform(ds, 'remove', {'descriptorNames': 'a.*'})
    ds4 = ds.copy()
    ds4.simplifyHistory()
    self.assertEqual(ds4.history().toPython(),
                     [{'Analyzer name': 'remove',
                       'Analyzer parameters': {'descriptorNames': ['.a.1', '.a.2']},
                       'Applier name': 'removedesc',
                       'Applier parameters': {'descriptorNames': ['.a.1', '.a.2']},
                       'Additional info': {}},
                      {'Analyzer name': 'fixlength',
                       'Analyzer parameters': {'descriptorNames': ['.b', '.c', '.d']},
                       'Applier name': 'fixlengthapplier',
                       'Applier parameters': {'descriptorNames': ['.b', '.c', '.d']},
                       'Additional info': {}}])

    # select(b, c) becomes a remove of everything else
    ds = transform(ds, 'select', {'descriptorNames': ['b', 'c']})
    ds5 = ds.copy()
    ds5.simplifyHistory()
    self.assertEqual(ds5.history().toPython(),
                     [{'Analyzer name': 'remove',
                       'Analyzer parameters': {'descriptorNames': ['.a.1', '.a.2', '.d']},
                       'Applier name': 'removedesc',
                       'Applier parameters': {'descriptorNames': ['.a.1', '.a.2', '.d']},
                       'Additional info': {}},
                      {'Analyzer name': 'fixlength',
                       'Analyzer parameters': {'descriptorNames': ['.b', '.c']},
                       'Applier name': 'fixlengthapplier',
                       'Applier parameters': {'descriptorNames': ['.b', '.c']},
                       'Additional info': {}}])

    # a point with the original layout must map through the simplified
    # history to the reduced ('.b', '.c') layout
    p2 = Point()
    p2.setLayout(ds0.layout())
    p2['b'] = 23
    p2['c'] = 78
    p2['a.2'] = [1, 2, 3, 4]
    p2m = ds5.history().mapPoint(p2)

    self.assertEqual(p2m.layout().descriptorNames(), ('.b', '.c'))
    self.assertEqual(p2m['b'], 23.)
    self.assertEqual(p2m['c'], 78.)