def testRegressionGaia14(self):
    ds = testdata.loadSmallDB()
    ds = transform(ds, 'fixlength')

    to_remove = testdata.TEST_SMALLDB_VARLENGTH

    dsr = transform(ds, 'remove', {'descriptorNames': to_remove})
    self.compareResults(search(dsr, '1_ethno.wav', 5), testdata.SMALL_DB_RAW_RESULTS)

    dsc = transform(dsr, 'cleaner')
    self.compareResults(search(dsc, '1_ethno.wav', 5), testdata.SMALL_DB_CLEAN_RESULTS)

    dsn = transform(dsc, 'normalize')
    self.compareResults(search(dsn, '1_ethno.wav', 5), testdata.SMALL_DB_NORM_RESULTS)

    dspca = transform(dsn, 'pca', {'resultName': 'pca30',
                                   'dimension': 30,
                                   'descriptorNames': '*'})
    self.compareResults(search(dspca, '1_ethno.wav', 5), testdata.SMALL_DB_PCA_RESULTS)
def train_SVM(dataset, groundTruth, descriptorNames, exclude=[], svmtype='c-svc',
              kernel='rbf', c=1, gamma=1):
    # recreate a copy of the given dataset without history
    ds = DataSet()
    ds.addPoints([p for p in dataset.points()])

    ds = transform(ds, 'normalize', {'descriptorNames': descriptorNames,
                                     'except': exclude,
                                     'independent': True})

    ds = transform(ds, 'svmtrain', {'descriptorNames': descriptorNames,
                                    'except': exclude,
                                    'className': groundTruth.className,
                                    'type': svmtype,
                                    'kernel': kernel,
                                    'c': c,
                                    'gamma': gamma})

    h = ds.history()
    return lambda p: str(h.mapPoint(p)[groundTruth.className])
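# Hedged usage sketch for the train_SVM helper above, not taken from the original
# sources. It assumes `dataset` is a gaia2 DataSet that has already been merged or
# loaded, and that `groundTruth` exposes a `className` attribute and dict-style
# access by point name, as the helper requires. The descriptor pattern, the exclude
# list and the SVM parameters are illustrative assumptions only.
def example_classify_all(dataset, groundTruth):
    classify = train_SVM(dataset, groundTruth,
                         descriptorNames='*.mean',   # assumed descriptor selection
                         exclude=['*mfcc*'],         # assumed exclusion pattern
                         svmtype='c-svc', kernel='rbf', c=1, gamma=1)
    # the returned closure maps a point through the stored transformation history
    # and returns its predicted class label as a string
    return {p.name(): classify(p) for p in dataset.points()}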
def testMerge(self):
    #setDebugLevel(GAlgorithms)
    ds = testdata.loadTestDB()

    ds1 = transform(ds, 'select', {'descriptorNames': '*.mean'})
    ds2 = transform(ds, 'select', {'descriptorNames': '*.var'})
    ds12 = transform(ds, 'select', {'descriptorNames': ['*.mean', '*.var']})

    ds_merged = mergeDataSets(ds1, ds2)

    # we need to do this because to add a Point we need it with the
    # original layout, not the FixedLength one
    testdata.resetSettings()
    ds_orig = testdata.loadTestDB()
    sp = ds_orig.samplePoint()

    # test if we can add points normally
    ds_merged.removePoint(sp.name())
    ds_merged.addPoint(sp)

    # compare datasets contents
    self.compareDataSets(ds12, ds_merged)

    # test the mapDataSet function of the Merge applier
    ds_remapped = ds_merged.history().mapDataSet(ds_orig)
    self.compareDataSets(ds12, ds_remapped)
    self.compareDataSets(ds_merged, ds_remapped)
def testComplete2(self):
    # have a transformed 2.0 dataset, load it, and have gaia 2.1 transform
    # a point using the history.
    ds = DataSet()
    self.assertRaises(Exception, ds.load, testdata.GAIA_20_BACKWARDS_COMPAT_PCA_DATASET)
    return

    ds.load(testdata.GAIA_20_BACKWARDS_COMPAT_PCA_DATASET)

    ds21 = DataSet()
    ds21.load(testdata.TEST_DATABASE)
    p = ds21.point("17 Blue Monday ['88 12' Version].mp3")
    ds21 = ds.history().mapDataSet(ds21)

    self.assertEqual(ds.history().mapPoint(p), ds21.history().mapPoint(p))

    ds = transform(ds, 'fixlength')
    ds21 = transform(ds21, 'fixlength')

    def search(ds, p):
        p = ds.history().mapPoint(p)
        dist = MetricFactory.create('euclidean', ds.layout())
        return View(ds).nnSearch(p, dist).get(5)

    self.compareResults(search(ds, p), search(ds21, p))
def testGaussianize(self):
    ds = testdata.loadTestDB()
    ds = transform(ds, 'removevl')
    ds = transform(ds, 'fixlength')
    self.assertEqual(ds[0]['tempotap_bpm'], 104.28208160400391)

    ds = transform(ds, 'gaussianize')
    self.assertEqual(ds[0]['tempotap_bpm'], -0.1928621232509613)
def train_SVM(dataset, groundTruth, descriptorNames, exclude=[], svmtype='c-svc',
              kernel='rbf', c=1, gamma=1, balanceClasses=False):
    # recreate a copy of the given dataset without history
    ds = dataset.copy()
    ds.forgetHistory()

    ds = transform(ds, 'select', {'descriptorNames': descriptorNames,
                                  'except': exclude})
    ds = transform(ds, 'cleaner')
    ds = transform(ds, 'normalize', {'independent': True})
    ds = transform(ds, 'addfield', {'string': groundTruth.className})

    for p in ds.points():
        p[groundTruth.className] = groundTruth[p.name()]

    ds = transform(ds, 'svmtrain', {'className': groundTruth.className,
                                    'type': svmtype,
                                    'kernel': kernel,
                                    'c': c,
                                    'gamma': gamma,
                                    'balanceClasses': balanceClasses})

    h = ds.history()
    return lambda p: h.mapPoint(p)[groundTruth.className]
def train_SVM(dataset, groundTruth, descriptorNames, exclude=[], svmtype='c-svc',
              kernel='rbf', c=1, gamma=1):
    # recreate a copy of the given dataset without history
    ds = dataset.copy()
    ds.forgetHistory()

    ds = transform(ds, 'select', {'descriptorNames': descriptorNames,
                                  'except': exclude})
    ds = transform(ds, 'cleaner')
    ds = transform(ds, 'normalize', {'independent': True})
    ds = transform(ds, 'addfield', {'string': groundTruth.className})

    for p in ds.points():
        p[groundTruth.className] = groundTruth[p.name()]

    ds = transform(ds, 'svmtrain', {'className': groundTruth.className,
                                    'type': svmtype,
                                    'kernel': kernel,
                                    'c': c,
                                    'gamma': gamma})

    h = ds.history()
    return lambda p: h.mapPoint(p)[groundTruth.className]
def prepare_original_dataset_helper(ds):
    proc_ds1 = transform(ds, "RemoveVL")
    proc_ds2 = transform(proc_ds1, "FixLength")
    proc_ds1 = None
    prepared_ds = transform(proc_ds2, "Cleaner")
    proc_ds2 = None
    return prepared_ds
def normalize_dataset_helper(ds):
    # Remove ['.lowlevel.mfcc.cov', '.lowlevel.mfcc.icov'] (they give errors when normalizing)
    ds = transform(ds, "remove", {"descriptorNames": [".lowlevel.mfcc.cov", ".lowlevel.mfcc.icov"]})

    # Add normalization
    normalization_params = {"descriptorNames": "*", "independent": True, "outliers": -1}
    normalized_ds = transform(ds, "normalize", normalization_params)
    ds = None
    return normalized_ds
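# Hedged sketch, not from the original sources: one way the two helpers above might
# be chained on a freshly merged dataset. The use of DataSet.mergeFiles and the
# shape of `sig_files_by_name` (point name -> .sig file path) are assumptions for
# illustration; the helper names come from the snippets above.
def example_prepare_and_normalize(sig_files_by_name):
    ds = DataSet.mergeFiles(sig_files_by_name)   # assumed: dict of name -> .sig path
    ds = prepare_original_dataset_helper(ds)     # RemoveVL + FixLength + Cleaner
    ds = normalize_dataset_helper(ds)            # drop mfcc cov/icov, then normalize
    return ds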
def testNormalize(self):
    ds = createDataSet()
    ds = transform(ds, 'fixlength')

    dsn = transform(ds, 'normalize')
    # by default, vector-normalization is used
    self.assertEqual(dsn.point('p2').value('a'), (0.5, 1.0))

    dsn2 = transform(ds, 'normalize', {'independent': True})
    self.assertEqual(dsn2.point('p2').value('a'), (1.0, 1.0))
def testQt46FloatParameterBug(self):
    # Note: this was triggered by Qt 4.6 introducing a QVariant(float) constructor, which caused
    # pmapToPython to fail with an unknown type error (followed by a segfault)...
    ds = testdata.loadTestDB()
    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'removevl')
    ds = transform(ds, 'normalize')
    self.assertEqual(
        ds.history().toPython()[-1]['Applier parameters']['coeffs']['.barkbands.mean']['a'][0],
        24.922689437866211)
def testHistory(self):
    ds = testdata.loadTestDB()
    ignored_descs = testdata.TEST_DATABASE_VARLENGTH_REAL

    testdata.resetSettings()
    ds_orig = testdata.loadTestDB()

    # cleaning, mandatory step
    ds = transform(ds, 'fixlength', {'except': ignored_descs})
    cleaned_db = transform(ds, 'cleaner', {'except': ignored_descs})

    # removing annoying descriptors, like mfcc.cov & mfcc.icov, who don't
    # like to be normalized like the other ones (constant value: dimension)
    no_mfcc_db = transform(cleaned_db, 'remove', {'descriptorNames': '*mfcc*'})

    # normalize, to have everyone change values
    normalized_db = transform(no_mfcc_db, 'normalize', {'except': ignored_descs})

    testPoints = ['01 Oye Como Va - Santana.mp3',
                  '02 Carmen Burana- O Fortuna.mp3',
                  '07 Romeo and Juliet- the Knights\' Dance.mp3',
                  '11 Lambada.mp3']

    for pointName in testPoints:
        p1 = normalized_db.point(pointName)
        p2 = normalized_db.history().mapPoint(ds_orig.point(pointName))

        for name in p1.layout().descriptorNames():
            self.assertEqual(p1[name], p2[name])

    (tmpFile, tmpName) = tempfile.mkstemp()
    os.close(tmpFile)
    normalized_db.save(tmpName)
    reloaded_db = DataSet()
    reloaded_db.load(tmpName)

    for pointName in testPoints:
        p1 = normalized_db.point(pointName)
        p2 = normalized_db.history().mapPoint(ds_orig.point(pointName))
        p3 = reloaded_db.point(pointName)
        p4 = reloaded_db.history().mapPoint(ds_orig.point(pointName))

        self.assert_(p1.layout() == p2.layout())
        self.assert_(p2.layout() == p3.layout())
        self.assert_(p3.layout() == p4.layout())

        for name in p1.layout().descriptorNames():
            self.assertEqual(p1[name], p2[name])
            self.assertEqual(p2[name], p3[name])
            self.assertEqual(p3[name], p4[name])

    # remove temp file
    os.remove(tmpName)
def prepare_original_dataset_helper(ds):
    # this transformation marks which descriptors are of fixed length, it optimizes things
    ds = transform(ds, 'FixLength')
    ds = transform(ds, 'Cleaner')
    try:
        ds = transform(ds, 'enumerate', {'descriptorNames': ['.tonal.chords_progression']})
    except:
        logger.info('WARNING: enumerate transformation to .tonal.chords_progression could not be performed.')
    return ds
def gaia_transform(points):
    """
    Takes a dict of point names and filepaths. Creates a DataSet and
    performs the standard transformations
    """
    ds = DataSet.mergeFiles(points)
    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'cleaner')
    for desc in get_unused_descriptors():
        try:
            ds = transform(ds, 'remove', desc)
        except Exception as e:
            log.error("Problem removing this descriptor: %s" % e)
    ds = transform(ds, 'normalize')
    return ds
def testRCA(self):
    ds = testdata.loadTestDB()
    ds = transform(ds, 'removevl')
    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'remove', {'descriptorNames': '*cov'})
    ds = transform(ds, 'cleaner')
    ds = transform(ds, 'normalize')
    ds = transform(ds, 'pca', {'resultName': 'pca15', 'dimension': 15})

    ds_rca = transform(ds, 'rca', {'resultName': 'rca10',
                                   'dimension': 10,
                                   'classFile': testdata.RCA_GENRE_GT})

    v = View(ds_rca)
    dist = MetricFactory.create('euclidean', ds_rca.layout())
    self.compareResults(v.nnSearch('01 Cigarettes And Alcohol - Oasis.mp3', dist).get(10),
                        testdata.RCA_GENRE_RESULTS)

    # try by passing directly the groundtruth map
    import gaia2.fastyaml as yaml
    ds_rca = transform(ds, 'rca', {'resultName': 'rca10',
                                   'dimension': 10,
                                   'classMap': yaml.load(open(testdata.RCA_GENRE_GT).read())})

    v = View(ds_rca)
    dist = MetricFactory.create('euclidean', ds_rca.layout())
    self.compareResults(v.nnSearch('01 Cigarettes And Alcohol - Oasis.mp3', dist).get(10),
                        testdata.RCA_GENRE_RESULTS)
def prepare_original_dataset_helper(ds):
    # Needed to optimize use of fixed-length descriptors and save memory
    ds = transform(ds, 'FixLength')
    # Remove descriptors that will cause problems in further transformations
    ds = transform(ds, 'Cleaner')
    try:
        ds = transform(ds, 'enumerate', {'descriptorNames': ['.tonal.chords_progression']})
    except:  # TODO: exception too broad here...
        logger.info('WARNING: enumerate transformation to .tonal.chords_progression could not be performed.')
    return ds
def testDeleteUnderlyingDataSet(self):
    ds = testdata.loadTestDB()

    params = {'descriptorNames': ['*.mean', '*.var']}
    ds = transform(ds, 'fixlength', params)
    ds = transform(ds, 'cleaner', params)
    ds = transform(ds, 'normalize', params)
    dist = MetricFactory.create('euclidean', ds.layout(), params)

    v = View(ds)
    del ds

    #self.assertRaises(Exception, v.nnSearch, '01 Respect.mp3')
    # this doesn't throw anymore, as the View keeps a ref to the dataset
    v.nnSearch('01 Respect.mp3', dist)
def testEnumerateKey(self):
    db = testdata.loadTestDB()
    testdata.useEnumerate = True
    dbe = testdata.loadTestDB()

    # also make sure we can map single points correctly
    # we need to load it separately and not take it from the dataset to ensure
    # that it'll have a different enum map
    p = Point()
    p.load('data/dataset_small/Vocal and Acapella/04 Blue Skies.mp3.sig')
    print(p.name())

    # also create a transfo that forwards enums after we did the enumerate transfo
    dbes = transform(dbe, 'select', {'descriptorNames': '*key*'})
    pe = dbes.history().mapPoint(p)

    self.assertEqual(p['key_mode'], pe['key_mode'])
    self.assertEqual(p['key_key'], pe['key_key'])

    self.assertNotEqual(db.layout(), dbe.layout())

    for p in db.points():
        pe = dbe.point(p.name())
        self.assertEqual(p.label('key_key'), pe.label('key_key'))
        self.assertEqual(p.label('key_mode'), pe.label('key_mode'))
def __load_dataset(self):
    """
    Loads the dataset, does all the necessary steps to make it available for similarity queries, and
    creates the PCA version of it. If the dataset does not exist, creates a new empty one.
    NOTE: we assume that loaded datasets will have been prepared and normalized (see
    __prepare_original_dataset() and __normalize_original_dataset()) in due time (see the add_point()
    method below), therefore this function does not prepare or normalize loaded datasets.
    """
    if not os.path.exists(sim_settings.INDEX_DIR):
        os.makedirs(sim_settings.INDEX_DIR)

    # load original dataset
    if os.path.exists(self.original_dataset_path):
        self.original_dataset.load(self.original_dataset_path)
        self.__calculate_descriptor_names()

        if self.original_dataset.size() >= sim_settings.SIMILARITY_MINIMUM_POINTS and not self.indexing_only_mode:
            # Save transformation history so we do not need to compute it every time we need it
            self.transformations_history = self.original_dataset.history().toPython()

            # Build metrics for the different similarity presets, create a Gaia view
            self.__build_metrics()
            view = View(self.original_dataset)
            self.view = view

            # Compute PCA and create pca view and metric
            # NOTE: this step may take a long time if the dataset is big, but it only needs to be
            # performed once when the similarity server is loaded.
            self.pca_dataset = transform(self.original_dataset, 'pca', {
                'descriptorNames': sim_settings.PCA_DESCRIPTORS,
                'dimension': sim_settings.PCA_DIMENSIONS,
                'resultName': 'pca'
            })
            self.pca_dataset.setReferenceDataSet(self.original_dataset)
            self.view_pca = View(self.pca_dataset)
            self.__build_pca_metric()

        if self.original_dataset.history().size() <= 0:
            logger.info('Dataset loaded, size: %s points' % (self.original_dataset.size()))
        else:
            logger.info('Dataset loaded, size: %s points (%i fixed-length desc., %i variable-length desc.)' %
                        (self.original_dataset.size(),
                         len(self.descriptor_names['fixed-length']),
                         len(self.descriptor_names['variable-length'])))
    else:
        # If there is no existing dataset we create an empty one.
        # For the moment we do not create any distance metric nor a view because search won't be
        # possible until the DB has a minimum of SIMILARITY_MINIMUM_POINTS
        self.original_dataset.save(self.original_dataset_path)
        self.__calculate_descriptor_names()
        logger.info('Created new dataset, size: %s points (should be 0)' % (self.original_dataset.size()))
def testKullbackLeibler(self):
    ds = transform(testdata.loadTestDB(), 'fixlength')

    # create a test dataset with more than 1000 points, otherwise the test is useless because
    # we split the workload in chunks of 1000 points when computing the distance
    dstest = DataSet()
    ncopy = 20
    for cidx in range(ncopy):
        points = list(ds.points())
        for p in points:
            p.setName(p.name() + '-%d' % cidx)
        dstest.addPoints(points)

    # test whether KL doesn't break with multithreading (did in 2.2.1)
    v = View(dstest)
    dist = MetricFactory.create('kullbackleibler', dstest.layout(), {'descriptorName': 'mfcc'})

    results = v.nnSearch(ds.samplePoint(), dist).get(6*ncopy)
    expected = [0.0]*2*ncopy + [6.1013755798339844]*ncopy
    expected += [6.4808731079101562]*2*ncopy + [6.7828292846679688]*ncopy

    for r, e in zip(results, expected):
        self.assertAlmostEqual(r[1], e, 5)
def testMergePointsWithDifferentEnumerationMaps(self):
    # ticket #74: when changing the layout of a point, we must also make sure
    # that the enum maps are correctly mapped
    p1 = Point()
    p1.setName('p1')
    p1.setLayout(self.l1)
    p1['d'] = 'hello'

    p2 = Point()
    p2.setName('p2')
    p2.setLayout(self.l1)
    p2['d'] = 'world'

    ds = DataSet()
    ds.addPoint(p1)
    ds.addPoint(p2)

    self.assertEqual(ds.point('p1').label('d'), 'hello')
    self.assertEqual(ds.point('p2').label('d'), 'world')

    ds.removePoint('p2')
    ds = transform(ds, 'enumerate', {'descriptorNames': 'd'})
    ds.addPoint(p2)

    self.assertEqual(ds.point('p1').label('d'), 'hello')
    self.assertEqual(ds.point('p2').label('d'), 'world')
def select(self, dbfile, pca_covered_variance=75, highlevel=True):
    '''
    dbfile: the path to the gaia dataset
    pca_covered_variance: the pca transformation should keep at least this variance
    highlevel: include high-level descriptors
    '''
    if not os.path.exists("transformed_dbs"):
        os.mkdir("transformed_dbs")

    prefix = dbfile[dbfile.rfind("/") + 1:dbfile.rfind(".")]
    print(dbfile)

    ds = gaia2.DataSet()
    ds.load(dbfile)

    cleaner = gaia2.AnalyzerFactory.create('cleaner')
    cleanDB = cleaner.analyze(ds).applyToDataSet(ds)

    if highlevel:
        to_remove = ['*.dmean2', '*.dvar2', '*.min', '*.max', '*cov']
    else:
        to_remove = ['.highlevel.*', '*.dmean2', '*.dvar2', '*.min', '*.max', '*cov']

    fselectDB = gaia2.transform(cleanDB, 'remove', {'descriptorNames': to_remove})

    # NORMALIZE, PCA & Friends
    normalize = gaia2.AnalyzerFactory.create('normalize')
    normalizedDB = normalize.analyze(fselectDB).applyToDataSet(fselectDB)

    pcavar = gaia2.AnalyzerFactory.create('pca', {
        'coveredVariance': pca_covered_variance,
        'resultName': 'pca%dtransform' % pca_covered_variance
    })
    pcaDB = pcavar.analyze(normalizedDB).applyToDataSet(normalizedDB)

    mfccDB = gaia2.transform(cleanDB, 'select', {
        'descriptorNames': ['*mfcc*', '.highlevel.*', '.rhythm.bpm', '.rhythm.onset_rate']
    })

    finalDB = gaia2.mergeDataSets(mfccDB, pcaDB)

    outfile = "transformed_dbs/" + prefix + ".db"
    finalDB.save(outfile)
def testSVM(self):
    trainingDS = testdata.readLibSVMDataSet(testdata.SVM_TRAINING_SET)
    trainingDS = transform(trainingDS, 'fixlength')
    trained = transform(trainingDS, 'svmtrain', {
        'descriptorNames': 'value',
        'className': 'class',
        # setting this to True makes the results
        # different... bug or libsvm feature?
        #'probability': True
    })

    testDS = testdata.readLibSVMDataSet(testdata.SVM_TESTING_SET)
    predicted = trained.history().mapDataSet(testDS)

    expected = [l.strip() for l in open(testdata.SVM_RESULT).readlines()]

    for p, expectedClass in zip(predicted.points(), expected):
        self.assertEqual(p.label('class'), expectedClass)
def testExponentialCompress(self):
    ds = createDataSet()
    ds = transform(ds, 'fixlength')
    dist = MetricFactory.create('ExponentialCompress', ds.layout(), {'distance': 'euclidean'})

    self.assertEqual(dist(ds.point('p1'), ds.point('p1')), 0.0)
    self.assertAlmostEqual(dist(ds.point('p1'), ds.point('p0')), 0.63212056)  # 1 - exp(-1)
    self.assertAlmostEqual(dist(ds.point('p1'), ds.point('p3')), 0.86466472)  # 1 - exp(-2)
def testAddFieldFixedLength(self):
    ds = testdata.loadTestDB()
    ds_fl = transform(ds, 'fixlength')

    ds_addvl = transform(ds, 'addfield', {'real': 'hello'})
    ds_fl_addvl = transform(ds_fl, 'addfield', {'real': 'hello'})

    self.assertEqual(ds_addvl.layout().descriptorLocation('hello').lengthType(), VariableLength)
    self.assertEqual(ds_fl_addvl.layout().descriptorLocation('hello').lengthType(), VariableLength)

    ds_addvl_fl = transform(ds_addvl, 'fixlength')
    ds_fl_addvl_fl = transform(ds_fl_addvl, 'fixlength')

    self.assertEqual(ds_addvl_fl.layout(), ds_fl_addvl_fl.layout())

    ds_fl_addfl = transform(ds_fl, 'addfield', {'real': 'hello', 'size': {'hello': 1}})

    self.assertEqual(ds_fl_addfl.layout(), ds_fl_addvl_fl.layout())
    self.assertEqual(ds_fl_addfl[0]['hello'], 0)

    ds_fl_addfl2 = transform(ds_fl, 'addfield', {
        'real': 'hello',
        'string': 'youhou',
        'size': {'hello': 3, 'youhou': 6},
        'default': {'hello': [2, 5, 3], 'youhou': ['a', 'b', 'c', 'd', 'e', 'f']}
    })

    self.assertEqual(ds_fl_addfl2.layout().descriptorLocation('hello').dimension(), 3)
    self.assertEqual(ds_fl_addfl2.layout().descriptorLocation('youhou').dimension(), 6)
    self.assertEqual(ds_fl_addfl2[0]['hello'], (2, 5, 3))
def normalize_dataset_helper(ds, descriptor_names):
    # Add normalization
    normalization_params = {"descriptorNames": descriptor_names,
                            "independent": True,
                            "outliers": -1}
    ds = transform(ds, 'normalize', normalization_params)
    return ds
def mergeDataSet(eqloud):
    if eqloud == 'eqloud':
        ext = 'sig'
    else:
        ext = 'neq.sig'

    datasetName = datasetdir + className + '_%s.db' % eqloud

    if os.path.exists(datasetName):
        print('Dataset already exists:', datasetName)
        return

    if className == 'genre_itms':
        mergelist, groundTruth = getSignatureData_iTMS(str(basedir), ext, metafile)
    elif className == 'mood':
        mergelist, groundTruth = getSignatureData_CyrilMoods(str(basedir), ext)
    elif className == 'mood_mirex':
        mergelist, groundTruth = getSignatureData_MirexMoods(str(basedir), ext)
    elif className == 'artist':
        mergelist, groundTruth = getSignatureData_MirexArtist(str(basedir), ext)
    else:  # genre
        mergelist, groundTruth = getSignatureData(str(basedir), ext)

    # merge dataset
    import gaia2
    ds = gaia2.DataSet.mergeFiles(mergelist)

    # preprocessing common to all tests:
    ds = gaia2.transform(ds, 'removevl')
    ds = gaia2.transform(ds, 'fixlength')
    ds = gaia2.transform(ds, 'cleaner')

    # transform the dataset to add the class information
    ds = gaia2.transform(ds, 'addfield', {'string': className})
    for p in ds.points():
        p[className] = groundTruth[p.name()]

    ds.save(datasetName)

    # write groundTruth as a pickled file (binary mode is required by pickle)
    import pickle
    pickle.dump(groundTruth, open(datasetName + '.groundtruth', 'wb'))
import gaia2


def transform(dataset):
    """Transform dataset for distance computations.

    Note: this function shadows gaia2.transform, so the library transform is
    called through its qualified name to avoid calling this wrapper recursively.
    """
    dataset = gaia2.transform(dataset, 'fixlength')
    dataset = gaia2.transform(dataset, 'cleaner')
    # dataset = gaia2.transform(dataset, 'remove', {'descriptorNames': '*mfcc*'})
    for field in ('*beats_position*', '*bpm_estimates*', '*bpm_intervals*',
                  '*onset_times*', '*oddtoevenharmonicenergyratio*'):
        try:
            dataset = gaia2.transform(dataset, 'remove', {'descriptorNames': field})
        except Exception as ex:
            print(repr(ex))
    dataset = gaia2.transform(dataset, 'normalize')
    dataset = gaia2.transform(dataset, 'pca', {
        'dimension': 30,
        'descriptorNames': ['*'],
        'resultName': 'pca30'
    })
    return dataset
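# Hedged sketch, not from the original sources: a possible nearest-neighbour query
# on the dataset returned by the pipeline above, using the 'pca30' result it creates.
# The dataset path and point name are placeholders; the euclidean metric and View
# usage follow the gaia2 API as used elsewhere in these snippets.
def example_similar_tracks(dataset_path, point_name, n_results=10):
    ds = gaia2.DataSet()
    ds.load(dataset_path)
    ds = transform(ds)  # run the pipeline defined above
    dist = gaia2.MetricFactory.create('euclidean', ds.layout(),
                                      {'descriptorNames': 'pca30'})
    view = gaia2.View(ds)
    return view.nnSearch(point_name, dist).get(n_results)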
def mergeDataSet(eqloud):
    if eqloud == 'eqloud':
        ext = 'sig'
    else:
        ext = 'neq.sig'

    datasetName = datasetdir + className + '_%s.db' % eqloud

    if os.path.exists(datasetName):
        print 'Dataset already exists:', datasetName
        return

    if className == 'genre_itms':
        mergelist, groundTruth = getSignatureData_iTMS(str(basedir), ext, metafile)
    elif className == 'mood':
        mergelist, groundTruth = getSignatureData_CyrilMoods(str(basedir), ext)
    elif className == 'mood_mirex':
        mergelist, groundTruth = getSignatureData_MirexMoods(str(basedir), ext)
    elif className == 'artist':
        mergelist, groundTruth = getSignatureData_MirexArtist(str(basedir), ext)
    else:  # genre
        mergelist, groundTruth = getSignatureData(str(basedir), ext)

    # merge dataset
    import gaia2
    ds = gaia2.DataSet.mergeFiles(mergelist)

    # preprocessing common to all tests:
    ds = gaia2.transform(ds, 'removevl')
    ds = gaia2.transform(ds, 'fixlength')
    ds = gaia2.transform(ds, 'cleaner')

    # transform the dataset to add the class information
    ds = gaia2.transform(ds, 'addfield', {'string': className})
    for p in ds.points():
        p[className] = groundTruth[p.name()]

    ds.save(datasetName)

    # write groundTruth as pickled file
    import cPickle
    cPickle.dump(groundTruth, open(datasetName + '.groundtruth', 'w'))
def testWrongArgument(self):
    ds = testdata.loadTestDB()
    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'removevl')
    ds = transform(ds, 'cleaner')
    ds = transform(ds, 'normalize')

    # missing param: className
    self.assertRaises(Exception, transform, ds, 'svmtrain', {'descriptorNames': '*.mean'})

    # wrong param: descriptorName
    self.assertRaises(Exception, transform, ds, 'svmtrain', {'className': 'kloug',
                                                             'descriptorName': '*.mean'})

    # missing param: resultName
    self.assertRaises(Exception, transform, ds, 'pca', {'dimension': 15,
                                                        'resultName': ''})
def addRCA(ds, groundTruth, dim, selectConfig={}):
    #ds_rca = transform(ds, 'fixlength') # should be unnecessary
    ds_rca = ds

    if selectConfig:
        ds_rca = transform(ds_rca, 'select', selectConfig)

    ds_rca = transform(ds_rca, 'gaussianize')

    # if dimension is too high, we need to preprocess before with a PCA, otherwise RCA doesn't work
    l = ds_rca.layout()
    descdim = l.descriptorLocation(l.descriptorNames()).dimension(RealType)
    if descdim > 80:
        ds_rca = transform(ds_rca, 'pca', {'resultName': 'pca%d' % 80, 'dimension': 80})

    ds_rca = transform(ds_rca, 'rca', {'resultName': 'rca%d' % dim,
                                       'dimension': dim,
                                       'classMap': pmap(groundTruth)})

    return mergeDataSets(ds, ds_rca)
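# Hedged usage sketch for addRCA above, not from the original sources: groundTruth
# is assumed to be a plain dict mapping point names to class labels (addRCA converts
# it with pmap()); the dimension and the select pattern are example values only.
def example_add_rca(ds, groundTruth):
    return addRCA(ds, groundTruth, dim=10,
                  selectConfig={'descriptorNames': '*.mean'})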
def testParsedVsConstructedFilters(self):
    ds = testdata.loadTestDB()
    ds = transform(ds, 'fixlength')

    p = ds.samplePoint()
    p2 = ds.point('Higher State of Consciousness.mp3')

    queries = [
        (p, '', ''),
        (p2, '', ''),
        (p2, 'WHERE value.tempotap_bpm.value > 140', Filter('tempotap_bpm.value', '>', 140)),
        (p, 'WHERE value.tempotap_bpm > 110', Filter('tempotap_bpm', '>', 110)),
        (p, 'WHERE value.tempotap_bpm > -10', Filter('tempotap_bpm', '>', -10)),
        (p, 'WHERE value.tempotap_bpm > 23000', Filter('tempotap_bpm', '>', 23000)),
        (p, 'WHERE value.tempotap_bpm > 120 AND value.tempotap_bpm < 130',
         AndFilter([Filter('tempotap_bpm', '>', 120),
                    Filter('tempotap_bpm', '<', 130)])),
        (p, 'WHERE value.tempotap_bpm BETWEEN 130 AND 120',
         Filter('tempotap_bpm', 'between', [130, 120])),
        (p, 'WHERE label.key_key = "C"', Filter('key_key', '==', 'C')),
        (p2, '''WHERE ((label.key_key = "A" AND label.key_mode = "major") OR
                       (label.key_key = "E" AND label.key_mode = "minor"))
                AND value.tempotap_bpm < 90''',
         AndFilter([
             OrFilter([
                 AndFilter([Filter('key_key', '==', 'A'), Filter('key_mode', '==', 'major')]),
                 AndFilter([Filter('key_key', '==', 'E'), Filter('key_mode', '==', 'minor')])
             ]),
             Filter('tempotap_bpm', '<', 90)
         ]))
    ]

    dist = MetricFactory.create('euclidean', ds.layout(), {'descriptorNames': '*.mean'})
    v = View(ds)

    for (pt, filtstr, filt) in queries:
        self.assertEqualSearchSpace(v.nnSearch(pt, dist, filtstr), v.nnSearch(pt, dist, filt))
def testCenter(self):
    ds = testdata.createSimpleDataSet()
    l = testdata.createSimpleLayout()
    for i in range(4):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(l)
        ds.addPoint(p)
    ds.removePoint('p')

    ds.point('p0')['a.1'] = [0, 1]
    ds.point('p1')['a.1'] = [4, 3]
    ds.point('p2')['a.1'] = [6, 9]
    ds.point('p3')['a.1'] = [2, 27]
    # mean = [ 3, 10 ]

    ds = transform(ds, 'fixlength')
    dsc = transform(ds, 'center', {'descriptorNames': 'a.1'})

    self.assertEqual(dsc.point('p0')['a.1'], (-3, -9))
    self.assertEqual(dsc.point('p1')['a.1'], (1, -7))
    self.assertEqual(dsc.point('p2')['a.1'], (3, -1))
    self.assertEqual(dsc.point('p3')['a.1'], (-1, 17))
def PCA(x):
    points = []
    layout = PointLayout()
    layout.add('x', RealType)

    for i, l in enumerate(x):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(layout)
        p['x'] = l
        points.append(p)

    ds = DataSet()
    ds.addPoints(points)

    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'pca', {'dimension': len(x[0]), 'resultName': 'pca'})

    result = []
    for p in ds.points():
        result.append(p['pca'])

    return result
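# Hedged usage sketch for the PCA helper above (illustration only): the input
# vectors are made-up numbers, and every input row must have the same length.
def example_pca_projection():
    data = [[1.0, 2.0, 3.0],
            [2.1, 3.9, 6.0],
            [0.5, 1.1, 1.6],
            [3.0, 6.2, 9.1]]
    # returns one projected vector (here of length 3) per input row
    return PCA(data)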
def testWeightedPearson(self):
    ds = testdata.createSimpleDataSet()
    ds.point('p')['a.1'] = [0, 0]  # need to have 2 values before fixing length
    p1 = transform(ds, 'fixlength').point('p')
    p2 = Point(p1)

    dist = MetricFactory.create('WeightedPearson', p1.layout(), {'weights': {'1': 0.3,
                                                                             'c': 0.7}})
    p1['a.1'] = [0.12, 2.71]
    p1['c'] = 4.32
    p2['1'] = [0.46, 1.12]
    p2['c'] = 2.4242

    self.assertAlmostEqual(dist(p1, p2), 0.038222129799, 6)
def normalize_dataset_helper(ds, descriptor_names):
    # NOTE: The "except" list of descriptors below should be reviewed if a new extractor is used. The point
    # is to remove descriptors that can potentially break the normalize transform (e.g. descriptors with
    # value = 0)
    normalization_params = {"descriptorNames": descriptor_names,
                            "except": [
                                "*.min",
                                "*.max",
                                "tonal.chords_histogram",
                            ],
                            "independent": True,
                            "outliers": -1}
    ds = transform(ds, 'normalize', normalization_params)
    return ds
def addVarFromCov(ds, desc):
    '''Adds the .var aggregate descriptor to the specified descriptor using its .cov
    aggregate, for all the points in the dataset.'''
    ds = transform(ds, 'addfield', {'real': desc + '.var'})

    # add the .var descriptor using .cov (it's the diagonal of the matrix)
    for p in ds.points():
        m = utils.toMatrix(p.value(desc + '.cov'))
        dim = len(m)
        diag = RealDescriptor(dim, 0.0)
        for i in range(dim):
            diag[i] = m[i][i]

        p.setValue(desc + '.var', diag)

    return ds
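# Hedged usage sketch for addVarFromCov above (illustration only): the descriptor
# name is an assumption; it just needs an existing '<desc>.cov' aggregate whose
# value utils.toMatrix can reshape into a square matrix.
def example_add_var(ds):
    # adds a 'lowlevel.mfcc.var' descriptor computed from 'lowlevel.mfcc.cov'
    return addVarFromCov(ds, 'lowlevel.mfcc')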
def harmonizeChunks(partfiles):
    # TODO: check all histories are the same, if not, try to do sth about it

    # find the GCLD (greatest common layout divisor :-) )
    ds = DataSet()
    ds.load(partfiles[0])
    origLayout = ds.layout().copy()
    gcld = ds.layout().copy()

    for pfile in partfiles[1:]:
        ds.load(pfile)
        gcld = gcld & ds.layout()

    # keep some stats about which descriptors got removed and the reason why before throwing
    # away the original history and simplifying it
    vldescs = set()
    nandescs = set()

    # now that we have our GCLD, transform all the chunks so they have the same layout (our GCLD)
    # and simplify their histories so that they also have the same history (the minimum history
    # required to arrive at this target layout).
    for pfile in partfiles:
        ds.load(pfile)

        for t in ds.history().toPython():
            tname = t['Analyzer name']
            descs = t['Applier parameters']['descriptorNames']
            if tname == 'cleaner':
                nandescs.update(descs)
            elif tname == 'removevl':
                vldescs.update(descs)

        toremove = ds.layout().differenceWith(gcld)
        if toremove:
            ds = transform(ds, 'remove', {'descriptorNames': toremove})

        ds.simplifyHistory()
        ds.save(pfile)

    # also get the other descriptors that got removed (because of a select or remove transfo)
    rdescs = set(origLayout.differenceWith(gcld)) - (vldescs | nandescs)

    return vldescs, nandescs, rdescs
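# Hedged usage sketch, not from the original sources: harmonizeChunks above rewrites
# each chunk file in place with the common layout and a simplified history, and
# reports which descriptors were dropped and why. The glob pattern is a placeholder.
def example_harmonize(chunk_glob='chunks/*.db'):
    import glob
    partfiles = sorted(glob.glob(chunk_glob))
    vldescs, nandescs, rdescs = harmonizeChunks(partfiles)
    print('removed (variable-length):', sorted(vldescs))
    print('removed (cleaner, NaN/Inf):', sorted(nandescs))
    print('removed (other transforms):', sorted(rdescs))
    return partfiles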
def highlevel_mosaic(target, tcorpus, scorpus, scope=5):
    """
    This will be used to test the highlevel mosaicing process.
    The scope variable controls the number of results which are returned for
    each target unit which is sought.
    """
    # Create a temporary file for the mosaic audio
    filepath = os.path.join(os.getcwd(), 'temp_mosaic.wav')
    if os.path.isfile(filepath):
        os.remove(filepath)
    mosaic = Mosaic(filepath)
    cost = RepeatUnitCost()
    context = Context()
    gridder = Gridder()

    units = tcorpus.list_audio_units(audio_filename=target, chop='highlevel')
    hdb = scorpus.get_gaia_unit_db(chop='highlevel_%s' % self.chop)
    distance = get_mood_distance(hdb)
    v = View(hdb, distance)

    results = {}
    for f in units:
        p = Point()
        p.load(switch_ext(f, '.yaml'))
        unit_name = switch_ext(os.path.basename(f), '')
        p.setName(unit_name)
        p_m = hdb.history().mapPoint(p)
        results.update({f: v.nnSearch(p_m).get(scope)})

    log.debug("Ok, now we have a dict with each target segment, along with its corresponding nearest matches in source db")
    log.debug("Check to see that we have every second of target audio accounted for - I think not!")
    #return results
    #new_results = results.copy()

    ds = DataSet()
    for r in results:
        units = []
        for u in results[r]:
            ds.load(switch_ext(u[0], '.db'))
            for n in ds.pointNames():
                units.append(n)
        new_ds = gaia_transform(dict(zip(units, units)))
        results.update({r: new_ds})
    #return results

    # Very important - target units must be in correct order
    index = 0
    index_skip = 0
    for r in sorted(results.keys()):
        tds = DataSet()
        tds.load(switch_ext(r, '.db'))
        #return tds, results
        sds = results[r]
        source_set = set(sds.layout().descriptorNames())
        target_set = set(tds.layout().descriptorNames())
        remove_from_source = source_set.difference(target_set)
        remove_from_target = target_set.difference(source_set)
        if len(remove_from_source) > 0:
            log.debug("Will try to remove %s from the source DataSet" % remove_from_source)
            try:
                sds = transform(results[r], 'remove', {'descriptorNames': list(remove_from_source)})
            except Exception as e:
                log.error("Failed to remove %s from source DataSet" % list(remove_from_source))
                return results[r], tds
        if len(remove_from_target) > 0:
            log.debug("Will try to remove %s from the target DataSet" % remove_from_target)
            try:
                tds = transform(tds, 'remove', {'descriptorNames': list(remove_from_target)})
            except Exception as e:
                log.error("Failed to remove %s from target DataSet" % list(remove_from_target))
                return results[r], tds
from gaia2 import DataSet, transform


def gaia_transform(points):
    """
    Takes a dict of point names and filepaths. Creates a DataSet and
    performs the standard transformations
    """
    ds = DataSet.mergeFiles(points)
    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'cleaner')
    for desc in get_unused_descriptors():
        try:
            ds = transform(ds, 'remove', desc)
        except Exception as e:
            log.error("Problem removing this descriptor: %s" % e)
    ds = transform(ds, 'normalize')
    return ds


def get_unused_descriptors():
    """
    Gets some descriptors which are not commonly used in order to remove
    them from the analysis
    """
    for d in ['rhythm.beats_position',
              'rhythm.bpm_estimates',
              'rhythm.bpm_intervals',
              'rhythm.onset_times',
              'rhythm.rubato_start',
              'rhythm.rubato_stop',
              ]:
        yield {'descriptorNames': [d]}


def process_highlevel(corpus, filepath, chop):