def testLayout(self):
    layout = testdata.createSimpleLayout()

    (tmpFile, tmpName) = tempfile.mkstemp()
    os.close(tmpFile)

    # write dataset with layout
    p = Point()
    p.setName('p1')
    p.setLayout(layout)

    p2 = Point()
    p2.setName('p2')
    p2.setLayout(layout)

    p['a.1'] = 23
    self.assertEqual(p['a.1'], 23)
    self.assertRaises(Exception, p.setValue, 'a.4', 34)

    ds1 = DataSet()
    ds1.setName('ds1')
    ds1.addPoint(p)
    ds1.addPoint(p2)
    ds1.save(tmpName)

    # reload dataset
    ds2 = DataSet()
    ds2.load(tmpName)
    self.assertEqual(layout, ds2.layout())
    self.assertEqual(ds2.point('p1')['a.1'], 23)

    # remove temp file
    os.remove(tmpName)

def testMergePointsWithDifferentEnumerationMaps(self):
    '''ticket #74: when changing the layout of a point, we must also make sure
    that the enum maps are correctly mapped'''
    p1 = Point()
    p1.setName('p1')
    p1.setLayout(self.l1)
    p1['d'] = 'hello'

    p2 = Point()
    p2.setName('p2')
    p2.setLayout(self.l1)
    p2['d'] = 'world'

    ds = DataSet()
    ds.addPoint(p1)
    ds.addPoint(p2)

    self.assertEqual(ds.point('p1').label('d'), 'hello')
    self.assertEqual(ds.point('p2').label('d'), 'world')

    ds.removePoint('p2')
    ds = transform(ds, 'enumerate', {'descriptorNames': 'd'})
    ds.addPoint(p2)

    self.assertEqual(ds.point('p1').label('d'), 'hello')
    self.assertEqual(ds.point('p2').label('d'), 'world')

def add_point(self, point_location, point_name):
    if self.original_dataset.contains(str(point_name)):
        self.original_dataset.removePoint(str(point_name))

    try:
        p = Point()
        p.load(str(point_location))
        p.setName(str(point_name))
        self.original_dataset.addPoint(p)
        size = self.original_dataset.size()
        logger.debug("Added point with name %s. Index has now %i points." % (str(point_name), size))
    except Exception:
        # note: size must be recomputed here; the assignment in the try block
        # may not have been reached when the exception was raised
        size = self.original_dataset.size()
        msg = "Point with name %s could NOT be added. Index has now %i points." % (str(point_name), size)
        logger.debug(msg)
        return {"error": True, "result": msg}

    # If adding a new point brings us up to the minimum number of points for similarity,
    # prepare the dataset, save it, and create the view and distance metrics. This will
    # almost never happen: only the first time the similarity server is started, when no
    # index exists yet and SIMILARITY_MINIMUM_POINTS points have been added.
    if size == SIMILARITY_MINIMUM_POINTS:
        self.__prepare_original_dataset()
        self.__normalize_original_dataset()
        self.save_index(msg="(reaching %i points)" % SIMILARITY_MINIMUM_POINTS)

        # build metrics for the different similarity presets
        self.__build_metrics()

        # create view
        view = View(self.original_dataset)
        self.view = view

    return {"error": False, "result": True}

def createDataSet():
    l = PointLayout()
    l.add('a', RealType)

    ds = DataSet()

    # p1.a = (0.0, 0.0)
    p = Point()
    p.setName('p1')
    p.setLayout(l)
    p['a'] = (0.0, 0.0)
    ds.addPoint(p)

    # p2.a = (0.5, 1.0)
    p = Point()
    p.setName('p2')
    p.setLayout(l)
    p['a'] = (0.5, 1.0)
    ds.addPoint(p)

    if testdata.useFixedLength:
        ds = testdata.fixLength(ds)

    if testdata.useEnumerate:
        ds = testdata.enumerateStrings(ds)

    return ds

def newPoint(name):
    l = PointLayout()
    l.add('a', RealType)

    p = Point()
    p.setName(name)
    p.setLayout(l)
    return p

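# A minimal usage sketch for newPoint() above; the point name and the value
# assigned to the 'a' descriptor are illustrative only (the tuple assignment
# mirrors the pattern used in createDataSet()).
p = newPoint('example')
p['a'] = (1.0, 2.0)
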
def testSecondChanceForLayoutEquality(self):
    '''ticket #21: points try to morph to adapt to dataset if they cannot be naturally inserted'''
    ds = DataSet()

    p = Point()
    p.setName('Paris Hilton')
    p.load('data/04 - Cansei de Ser Sexy - Meeting Paris Hilton.mp3.sig')
    ds.addPoint(p)

    p.setName('2005')
    p.load('data/11_2005-fwyh.mp3.sig')
    ds.addPoint(p)

    self.assertEqual(ds.point('2005')['title'], '2005')

def createSimpleDataSet():
    global useFixedLength, useEnumerate

    l = createSimpleLayout()
    ds = DataSet()

    p = Point()
    p.setName('p')
    p.setLayout(l)
    ds.addPoint(p)

    if useFixedLength:
        ds = fixLength(ds)

    if useEnumerate:
        ds = enumerateStrings(ds)

    return ds

def testForceIdentity(self):
    l = PointLayout()
    l.add('a', RealType, FixedLength, 1)

    p = Point()
    p.setLayout(l)

    cd = MetricFactory.create('cosinesimilarity', p.layout(), {'defaultValue': 0.5})
    self.assertEqual(cd(p, p), 0.5)

    ficd = MetricFactory.create('forceidentity', p.layout(),
                                {'distance': 'cosinesimilarity',
                                 'params': {'defaultValue': 0.5}})
    self.assertEqual(ficd(p, p), 0.0)

    p2 = Point(p)
    p2.setName('p2')
    self.assertEqual(ficd(p, p2), 0.5)

def testCreatedInputSpace(self):
    ds = testdata.createSimpleDataSet()
    ds.point('p')['a.1'] = 23.0

    for i in range(5):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(ds.originalLayout())
        p['a.1'] = float(i)
        ds.addPoint(p)

    ds = transform(ds, 'fixlength')
    dist = MetricFactory.create('euclidean', ds.layout())
    v = View(ds)
    p = ds.point('p')

    RS_remove = v.nnSearch(p, dist)
    RS_remove.removePoints(['p2', 'p4'])

    RS_create = InputSpace()
    RS_create.addPoints(ds, ['p', 'p0', 'p1', 'p3'])

    rsc = v.nnSearch(p, RS_remove, dist)
    rsa = v.nnSearch(p, RS_create, dist)

    self.assertEqual((('p', 0.), ('p3', 20.), ('p1', 22.), ('p0', 23.)),
                     v.nnSearch(p, rsc, dist).get(10))
    self.assertEqual((('p', 0.), ('p3', 20.), ('p1', 22.), ('p0', 23.)),
                     v.nnSearch(p, rsa, dist).get(10))

    # test thresholdLimit method
    self.assertEqual((('p', 0.),),
                     v.nnSearch(p, rsa, dist).thresholdLimit(10).get(10))
    self.assertEqual((('p', 0.), ('p3', 20.)),
                     v.nnSearch(p, rsa, dist).thresholdLimit(20).get(10))
    self.assertEqual((('p', 0.), ('p3', 20.), ('p1', 22.)),
                     v.nnSearch(p, rsa, dist).thresholdLimit(22.01).get(10))

def testComplexReferenceCounting(self):
    ds = DataSet()
    self.assertEqual(ds.layout().ref(), 2)  # 1 + 1 from temp object

    p = Point()
    p.setName('p1')
    lext = PointLayout(p.layout())  # +1, {lext, p}.ref = 2
    self.assertEqual(lext.ref(), 2)

    lext = p.layout().copy()  # copy, lext.ref = 1; p.ref -= 1, = 1
    self.assertEqual(lext.ref(), 1)

    ds.addPoint(p)  # +3 (dataset + pointcopy), ref = 3
    self.assertEqual(lext.ref(), 1)
    self.assertEqual(ds.layout().ref(), 4)  # 3 + 1 temp object

    p2 = Point(p)  # +1, {p, p2}.ref = 5
    p2.setName('p2')
    self.assertEqual(ds.layout().ref(), 5)

    ds.addPoint(p2)
    self.assertEqual(ds.layout().ref(), 6)  # +1 pointcopy, ref = 6

def testCenter(self):
    ds = testdata.createSimpleDataSet()
    l = testdata.createSimpleLayout()

    for i in range(4):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(l)
        ds.addPoint(p)
    ds.removePoint('p')

    ds.point('p0')['a.1'] = [0, 1]
    ds.point('p1')['a.1'] = [4, 3]
    ds.point('p2')['a.1'] = [6, 9]
    ds.point('p3')['a.1'] = [2, 27]
    # mean = [3, 10]

    ds = transform(ds, 'fixlength')
    dsc = transform(ds, 'center', {'descriptorNames': 'a.1'})

    self.assertEqual(dsc.point('p0')['a.1'], (-3, -9))
    self.assertEqual(dsc.point('p1')['a.1'], (1, -7))
    self.assertEqual(dsc.point('p2')['a.1'], (3, -1))
    self.assertEqual(dsc.point('p3')['a.1'], (-1, 17))

def PCA(x):
    points = []
    layout = PointLayout()
    layout.add('x', RealType)

    for i, l in enumerate(x):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(layout)
        p['x'] = l
        points.append(p)

    ds = DataSet()
    ds.addPoints(points)

    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'pca', {'dimension': len(x[0]), 'resultName': 'pca'})

    result = []
    for p in ds.points():
        result.append(p['pca'])

    return result

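# A minimal usage sketch for PCA() above, assuming small 2-D input vectors.
# The exact projected values depend on Gaia's 'pca' transform and are only
# illustrative, not asserted.
vectors = [[0.0, 0.0], [1.0, 1.0], [2.0, 0.5]]
projected = PCA(vectors)
for original, pca_vec in zip(vectors, projected):
    print(original, '->', pca_vec)
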
def readLibSVMDataSet(filename):
    data = [l.split() for l in open(filename).readlines()]

    # parse "dim:value" pairs and find the dimension range
    minidx = maxidx = 1
    for l in data:
        for i in range(1, len(l)):
            dim, value = l[i].split(':')
            l[i] = (int(dim), float(value))
            minidx = min(minidx, int(dim))
            maxidx = max(maxidx, int(dim))

    dimension = maxidx - minidx + 1

    layout = PointLayout()
    layout.add('class', StringType)
    layout.add('value', RealType)

    ds = DataSet()
    n = 0
    points = []

    for l in data:
        p = Point()
        p.setLayout(layout)
        p.setName('instance_%06d' % n)
        n += 1

        p['class'] = l[0]
        desc = RealDescriptor(dimension, 0.0)
        for dim, value in l[1:]:
            desc[dim - minidx] = value
        p['value'] = desc

        points.append(p)

    ds.addPoints(points)
    return ds

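# Usage sketch for readLibSVMDataSet() above. 'train.libsvm' is a hypothetical
# path; the file is expected in the standard libSVM format, one instance per
# line: "<label> <dim>:<value> <dim>:<value> ...".
ds = readLibSVMDataSet('train.libsvm')
print('%d instances, first label: %s' % (ds.size(), ds.point('instance_000000')['class']))
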
def testSimple(self):
    ds = testdata.createSimpleDataSet()

    p2 = Point(ds.point('p'))
    p2.setName('p2')

    p3 = Point(ds.point('p'))
    p3.setName('p3')
    p3['a.1'] = 1

    p4 = Point(ds.point('p'))
    p4.setName('p4')
    p4['a.1'] = 1
    p4['a.2'] = 1

    ds.addPoint(p2)
    ds.addPoint(p3)
    ds.addPoint(p4)

    ds = transform(ds, 'fixlength')
    dist = MetricFactory.create('euclidean', ds.layout())
    v = View(ds)

    results = v.nnSearch('p', dist).get(10)
    self.assertEqual(results[0][1], 0.0)
    self.assertEqual(results[1][1], 0.0)
    self.assertSearchResultEqual(results[2], ('p3', 1.0))
    self.assertSearchResultEqual(results[3], ('p4', math.sqrt(2)))

def add_point(self, point_location, point_name):
    if self.original_dataset.contains(str(point_name)):
        self.original_dataset.removePoint(str(point_name))

    p = Point()
    if os.path.exists(str(point_location)):
        try:
            p.load(str(point_location))
            p.setName(str(point_name))
            if self.original_dataset.size() <= sim_settings.SIMILARITY_MINIMUM_POINTS:
                # Add point to original_dataset because PCA dataset has not been created yet
                self.original_dataset.addPoint(p)
                msg = 'Added point with name %s. Index has now %i points.' % \
                      (str(point_name), self.original_dataset.size())
                logger.info(msg)
            else:
                # Add point to PCA dataset because it has already been created.
                # The PCA dataset will take care of adding the point to the original dataset as well.
                self.pca_dataset.addPoint(p)
                msg = 'Added point with name %s. Index has now %i points (pca index has %i points).' % \
                      (str(point_name), self.original_dataset.size(), self.pca_dataset.size())
                logger.info(msg)
        except Exception as e:
            msg = 'Point with name %s could NOT be added (%s).' % (str(point_name), str(e))
            logger.info(msg)
            return {'error': True, 'result': msg, 'status_code': sim_settings.SERVER_ERROR_CODE}
    else:
        msg = 'Point with name %s could NOT be added because analysis file does not exist (%s).' % \
              (str(point_name), str(point_location))
        logger.info(msg)
        return {'error': True, 'result': msg, 'status_code': sim_settings.SERVER_ERROR_CODE}

    if self.original_dataset.size() == sim_settings.SIMILARITY_MINIMUM_POINTS:
        # Enumerate the chord progression descriptor
        try:
            self.original_dataset = transform(
                self.original_dataset, 'enumerate',
                {'descriptorNames': ['.tonal.chords_progression']})
        except Exception:  # TODO: exception still too broad here...
            logger.info('WARNING: enumerate transformation to .tonal.chords_progression could not be performed.')

    # If adding a new point brings us up to the minimum number of points for similarity,
    # prepare the dataset so that it can be used for search: normalize it, save it, and
    # create the view and distance metrics. This happens only once, when the dataset size
    # reaches SIMILARITY_MINIMUM_POINTS.
    if self.original_dataset.size() == sim_settings.SIMILARITY_MINIMUM_POINTS and not self.indexing_only_mode:
        self.__prepare_original_dataset()
        self.__normalize_original_dataset()
        self.transformations_history = self.original_dataset.history().toPython()
        self.save_index(msg="(reaching %i points)" % sim_settings.SIMILARITY_MINIMUM_POINTS)

        # TODO: the code below is repeated from the __load_dataset() method, should be moved into a util function
        # Build metrics for the different similarity presets, create a Gaia view
        self.__build_metrics()
        view = View(self.original_dataset)
        self.view = view

        # Compute PCA and create the PCA view and metric.
        # NOTE: this step may take a long time if the dataset is big, but it only needs
        # to be performed once, when the dataset reaches the minimum number of points.
        self.pca_dataset = transform(
            self.original_dataset, 'pca',
            {'descriptorNames': sim_settings.PCA_DESCRIPTORS,
             'dimension': sim_settings.PCA_DIMENSIONS,
             'resultName': 'pca'})
        self.pca_dataset.setReferenceDataSet(self.original_dataset)
        self.view_pca = View(self.pca_dataset)
        self.__build_pca_metric()

    return {'error': False, 'result': msg}

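# A hypothetical call to add_point() above, assuming 'indexer' is an instance
# of the similarity-server class and that the Gaia analysis (.sig) file exists;
# the path and sound name are illustrative only.
result = indexer.add_point('/path/to/analysis/123456.sig', '123456')
if result['error']:
    logger.info('Indexing failed: %s' % result['result'])
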
def testSimplifyHistory(self):
    ds = testdata.createSimpleDataSet()
    p = Point()
    p.setName('p2')
    p.setLayout(ds.layout())
    p['a.2'] = [1.2, 2.3]
    ds.addPoint(p)
    ds0 = ds.copy()

    ds1 = ds.copy()
    ds1.simplifyHistory()
    self.assertEqual(ds1.history().size(), 0)

    ds = transform(ds, 'removevl')
    ds2 = ds.copy()
    ds2.simplifyHistory()

    self.assertEqual(ds2.history().toPython(),
                     [{'Analyzer name': 'remove',
                       'Analyzer parameters': {'descriptorNames': ['.a.2']},
                       'Applier name': 'removedesc',
                       'Applier parameters': {'descriptorNames': ['.a.2']},
                       'Additional info': {}}])

    ds = transform(ds, 'fixlength')
    ds3 = ds.copy()
    ds3.simplifyHistory()

    self.assertEqual(ds3.history().toPython(),
                     [{'Analyzer name': 'remove',
                       'Analyzer parameters': {'descriptorNames': ['.a.2']},
                       'Applier name': 'removedesc',
                       'Applier parameters': {'descriptorNames': ['.a.2']},
                       'Additional info': {}},
                      {'Analyzer name': 'fixlength',
                       'Analyzer parameters': {'descriptorNames': ['.a.1', '.b', '.c', '.d']},
                       'Applier name': 'fixlengthapplier',
                       'Applier parameters': {'descriptorNames': ['.a.1', '.b', '.c', '.d']},
                       'Additional info': {}}])

    ds = transform(ds, 'remove', {'descriptorNames': 'a.*'})
    ds4 = ds.copy()
    ds4.simplifyHistory()

    self.assertEqual(ds4.history().toPython(),
                     [{'Analyzer name': 'remove',
                       'Analyzer parameters': {'descriptorNames': ['.a.1', '.a.2']},
                       'Applier name': 'removedesc',
                       'Applier parameters': {'descriptorNames': ['.a.1', '.a.2']},
                       'Additional info': {}},
                      {'Analyzer name': 'fixlength',
                       'Analyzer parameters': {'descriptorNames': ['.b', '.c', '.d']},
                       'Applier name': 'fixlengthapplier',
                       'Applier parameters': {'descriptorNames': ['.b', '.c', '.d']},
                       'Additional info': {}}])

    ds = transform(ds, 'select', {'descriptorNames': ['b', 'c']})
    ds5 = ds.copy()
    ds5.simplifyHistory()

    self.assertEqual(ds5.history().toPython(),
                     [{'Analyzer name': 'remove',
                       'Analyzer parameters': {'descriptorNames': ['.a.1', '.a.2', '.d']},
                       'Applier name': 'removedesc',
                       'Applier parameters': {'descriptorNames': ['.a.1', '.a.2', '.d']},
                       'Additional info': {}},
                      {'Analyzer name': 'fixlength',
                       'Analyzer parameters': {'descriptorNames': ['.b', '.c']},
                       'Applier name': 'fixlengthapplier',
                       'Applier parameters': {'descriptorNames': ['.b', '.c']},
                       'Additional info': {}}])

    p2 = Point()
    p2.setLayout(ds0.layout())
    p2['b'] = 23
    p2['c'] = 78
    p2['a.2'] = [1, 2, 3, 4]

    p2m = ds5.history().mapPoint(p2)

    self.assertEqual(p2m.layout().descriptorNames(), ('.b', '.c'))
    self.assertEqual(p2m['b'], 23.)
    self.assertEqual(p2m['c'], 78.)

def add_point(self, point_location, point_name):
    if self.original_dataset.contains(str(point_name)):
        self.original_dataset.removePoint(str(point_name))

    p = Point()
    if os.path.exists(str(point_location)):
        try:
            p.load(str(point_location))
            p.setName(str(point_name))
            if self.original_dataset.size() <= settings.SIMILARITY_MINIMUM_POINTS:
                # Add point to original_dataset
                self.original_dataset.addPoint(p)
                msg = 'Added point with name %s. Index has now %i points.' % (
                    str(point_name), self.original_dataset.size())
                logger.info(msg)
            else:
                # Add point to pca dataset (as it has already been created).
                # The pca dataset will take care of adding it to the original one too.
                self.pca_dataset.addPoint(p)
                msg = 'Added point with name %s. Index has now %i points (pca index has %i points).' % (
                    str(point_name), self.original_dataset.size(), self.pca_dataset.size())
                logger.info(msg)
        except Exception as e:
            msg = 'Point with name %s could NOT be added (%s).' % (str(point_name), str(e))
            logger.info(msg)
            return {'error': True, 'result': msg, 'status_code': settings.SERVER_ERROR_CODE}
    else:
        msg = 'Point with name %s could NOT be added because analysis file does not exist (%s).' % (
            str(point_name), str(point_location))
        logger.info(msg)
        return {'error': True, 'result': msg, 'status_code': settings.SERVER_ERROR_CODE}

    if self.original_dataset.size() == settings.SIMILARITY_MINIMUM_POINTS:
        # Enumerate the chord progression descriptor
        try:
            self.original_dataset = transform(
                self.original_dataset, 'enumerate',
                {'descriptorNames': ['.tonal.chords_progression']})
        except Exception:
            logger.info('WARNING: enumerate transformation to .tonal.chords_progression could not be performed.')

    # If adding a new point brings us up to the minimum number of points for similarity,
    # prepare the dataset, save it, and create the view and distance metrics. This will
    # almost never happen: only the first time the similarity server is started, when no
    # index exists yet and SIMILARITY_MINIMUM_POINTS points have been added.
    if self.original_dataset.size() == settings.SIMILARITY_MINIMUM_POINTS and not self.indexing_only_mode:
        #self.__prepare_original_dataset()
        #self.__normalize_original_dataset()
        self.transformations_history = self.original_dataset.history().toPython()
        self.save_index(msg="(reaching %i points)" % settings.SIMILARITY_MINIMUM_POINTS)

        # build metrics for the different similarity presets
        self.__build_metrics()

        # create view
        view = View(self.original_dataset)
        self.view = view

        # do pca and create pca view and metric
        self.pca_dataset = transform(
            self.original_dataset, 'pca',
            {'descriptorNames': settings.PCA_DESCRIPTORS,
             'dimension': settings.PCA_DIMENSIONS,
             'resultName': 'pca'})
        self.pca_dataset.setReferenceDataSet(self.original_dataset)
        self.view_pca = View(self.pca_dataset)
        self.__build_pca_metric()

    return {'error': False, 'result': msg}