Example #1
 def get_miximized_tracks(self, filenames):
     """Get list of tracks in ideal order."""
     for filename in filenames:
         self.queue.put((ADD, filename))
     while self.queue.qsize():
         print("waiting for analysis")
         sleep(10)
     encoded = [f.encode('utf-8') for f in filenames]
     dataset = DataSet()
     number_of_tracks = len(filenames)
     for filename in encoded:
         if not self.gaia_db.contains(filename):
             continue
         point = self.gaia_db.point(filename)
         dataset.addPoint(point)
     dataset = self.transform(dataset)
     matrix = {}
     for filename in encoded:
         matrix[filename] = {
             name: score for score, name in self.get_neighbours(
                 dataset, filename, number_of_tracks)}
     clusterer = Clusterer(encoded, lambda f1, f2: matrix[f1][f2])
     clusterer.cluster()
     result = []
     for cluster in clusterer.clusters:
         result.extend([encoded.index(filename) for filename in cluster])
     return result
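The method above returns indices into the original filenames list, ordered cluster by cluster. A hypothetical usage sketch (the mixer object and file paths below are made up for illustration):

# `mixer` stands for whatever object provides get_miximized_tracks; paths are made up.
filenames = ['/music/a.mp3', '/music/b.mp3', '/music/c.mp3']
order = mixer.get_miximized_tracks(filenames)   # e.g. [2, 0, 1]
playlist = [filenames[i] for i in order]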
Example #2
 def get_miximized_tracks(self, filenames):
     """Get list of tracks in ideal order."""
     for filename in filenames:
         self.queue.put((ADD, filename))
     while self.queue.qsize():
         print("waiting for analysis")
         sleep(10)
     encoded = [f.encode('utf-8') for f in filenames]
     dataset = DataSet()
     number_of_tracks = len(filenames)
     for filename in encoded:
         if not self.gaia_db.contains(filename):
             continue
         point = self.gaia_db.point(filename)
         dataset.addPoint(point)
     dataset = self.transform(dataset)
     matrix = {}
     for filename in encoded:
         matrix[filename] = {
             name: score
             for score, name in self.get_neighbours(dataset, filename,
                                                    number_of_tracks)
         }
     clusterer = Clusterer(encoded, lambda f1, f2: matrix[f1][f2])
     clusterer.cluster()
     result = []
     for cluster in clusterer.clusters:
         result.extend([encoded.index(filename) for filename in cluster])
     return result
Example #3
    def testKullbackLeibler(self):
        ds = transform(testdata.loadTestDB(), 'fixlength')

        # create a test dataset with more than 1000 points; otherwise the test is useless
        # because we split the workload into chunks of 1000 points when computing the distance
        dstest = DataSet()
        ncopy = 20
        for cidx in range(ncopy):
            points = list(ds.points())
            for p in points:
                p.setName(p.name() + '-%d' % cidx)
            dstest.addPoints(points)

        # test whether KL doesn't break with multithreading (did in 2.2.1)
        v = View(dstest)
        dist = MetricFactory.create('kullbackleibler',
                                    dstest.layout(),
                                    { 'descriptorName': 'mfcc' })


        results = v.nnSearch(ds.samplePoint(), dist).get(6*ncopy)
        expected = [ 0.0 ]*2*ncopy + [ 6.1013755798339844 ]*ncopy
        expected += [ 6.4808731079101562 ]*2*ncopy + [ 6.7828292846679688 ]*ncopy

        for r, e in zip(results, expected):
            self.assertAlmostEqual(r[1], e, 5)
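For reference, the View/metric/nnSearch pattern used above in its minimal form; a sketch assuming an already transformed DataSet named ds (not part of the original listing):

v = View(ds)
dist = MetricFactory.create('euclidean', ds.layout())
neighbours = v.nnSearch(ds.samplePoint(), dist).get(5)   # list of (point name, distance) pairs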
Example #4
def train_SVM(dataset,
              groundTruth,
              descriptorNames,
              exclude=[],
              svmtype='c-svc',
              kernel='rbf',
              c=1,
              gamma=1):
    # recreate a copy of the given dataset without history
    ds = DataSet()
    ds.addPoints([p for p in dataset.points()])

    ds = transform(ds, 'normalize', {
        'descriptorNames': descriptorNames,
        'except': exclude,
        'independent': True
    })

    ds = transform(
        ds, 'svmtrain', {
            'descriptorNames': descriptorNames,
            'except': exclude,
            'className': groundTruth.className,
            'type': svmtype,
            'kernel': kernel,
            'c': c,
            'gamma': gamma
        })

    h = ds.history()
    return lambda p: str(h.mapPoint(p)[groundTruth.className])
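A hypothetical usage sketch of train_SVM: ds is a DataSet and gt a GroundTruth object as in the other examples on this page; the descriptor names, exclude patterns and SVM parameters are illustrative only.

classify = train_SVM(ds, gt, descriptorNames='*', exclude=['*.cov', '*.icov'],
                     svmtype='c-svc', kernel='rbf', c=1, gamma=1)
predicted = classify(some_point)   # `some_point` is a hypothetical Point; returns the class label as a string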
Example #5
def train_svm_history(project, params, output_file_path):
    params_model = params["model"]
    if params_model.get("classifier") != "svm":
        raise GaiaWrapperException(
            "Can only use this script on SVM config parameters.")

    ds = DataSet()
    ds.load(
        os.path.join(
            project["datasetsDirectory"], "%s-%s.db" %
            (project["className"], params_model["preprocessing"])))

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    history = train_svm(
        ds,
        gt,
        type=params_model["type"],
        kernel=params_model["kernel"],
        C=params_model["C"],
        gamma=params_model["gamma"])  # doing the whole training
    if isinstance(output_file_path, unicode):
        output_file_path = output_file_path.encode("utf-8")
    history.save(output_file_path)
Example #6
def trainSVMHistory(configFilename, paramsFilename, outputHistoryFilename,
                    className):
    config = yaml.load(open(configFilename).read())
    params = yaml.load(open(paramsFilename).read())['model']

    if params.pop('classifier') != 'svm':
        raise Exception('Can only use this script on SVM config parameters.')

    preproc = params.pop('preprocessing')

    ds = DataSet()
    ds.load(
        join(
            split(configFilename)[0],  # base dir
            config['datasetsDirectory'],  # datasets dir
            '%s-%s.db' % (config['className'], preproc)))  # dataset name

    gt = GroundTruth.fromFile(config['groundtruth'])

    if className:
        gt.className = className

    # add 'highlevel.' in front of the descriptor, this is what will appear in the final Essentia sigfile
    gt.className = 'highlevel.' + gt.className

    # do the whole training
    h = trainSVM(ds, gt, **params)

    h.save(outputHistoryFilename)
Example #7
    def testMergePointsWithDifferentEnumerationMaps(self):
        #'''ticket #74: when changing the layout of a point, we must also make sure that the enum maps are correctly mapped'''

        p1 = Point()
        p1.setName('p1')
        p1.setLayout(self.l1)
        p1['d'] = 'hello'

        p2 = Point()
        p2.setName('p2')
        p2.setLayout(self.l1)
        p2['d'] = 'world'

        ds = DataSet()
        ds.addPoint(p1)
        ds.addPoint(p2)

        self.assertEqual(ds.point('p1').label('d'), 'hello')
        self.assertEqual(ds.point('p2').label('d'), 'world')

        ds.removePoint('p2')
        ds = transform(ds, 'enumerate', { 'descriptorNames': 'd' })
        ds.addPoint(p2)

        self.assertEqual(ds.point('p1').label('d'), 'hello')
        self.assertEqual(ds.point('p2').label('d'), 'world')
Example #8
 def initialize_gaia_db(self):
     """Load or initialize the gaia database."""
     if not os.path.isfile(self.gaia_db_path):
         dataset = DataSet()
     else:
         dataset = self.load_gaia_db()
         self.transformed = True
     print("songs in db: %d" % dataset.size())
     return dataset
Example #9
    def run(self, className, outfilename, param, dsname, gtname, evalconfig):

        try:
            classifier = param['classifier']
            gt = GroundTruth(classifier)
            gt.load(gtname)

            # force the GroundTruth class name to be the one specified by our project file, not
            # the one in the original groundtruth file
            gt.className = className

            ds = DataSet()
            ds.load(dsname)

            # some points may have failed to be analyzed, remove those from the GroundTruth
            pnames = ds.pointNames()
            for pid in list(gt.keys()):
                if pid not in pnames:
                    log.warning(
                        'Removing %s from GroundTruth as it could not be found in the merged dataset'
                        % pid)
                    del gt[pid]

            trainerFun, trainingparam, newds = getTrainer(
                classifier, param, ds)

            # run all the evaluations specified in the evaluation config
            for i, evalparam in enumerate(evalconfig):
                # if we already ran this evaluation, no need to run it again...
                resultFilename = outfilename + '_%d.result' % i
                if exists(resultFilename):
                    log.info('File %s already exists. Skipping evaluation...' %
                             resultFilename)
                    continue

                log.info(
                    'Running evaluation %d for: %s with classifier %s and dataset %s'
                    % (i, outfilename, param['classifier'],
                       param['preprocessing']))
                log.info('    PID: %d, parameters: %s' %
                         (os.getpid(), json.dumps(param)))

                # run evaluation
                confusion = evaluateNfold(evalparam['nfold'], ds, gt,
                                          trainerFun, **trainingparam)

                # write evaluation params & result
                with open(outfilename + '_%d.param' % i, 'w') as f:
                    yaml.dump({'model': param, 'evaluation': evalparam}, f)

                confusion.save(resultFilename)

        except Exception:
            log.error(
                'While doing evaluation with param = %s\nevaluation = %s' %
                (param, evalconfig))
            raise
Example #10
 def initialize_gaia_db(self):
     """Load or initialize the gaia database."""
     if not os.path.isfile(self.gaia_db_path):
         dataset = DataSet()
     else:
         dataset = self.load_gaia_db()
         self.transformed = True
     print("songs in db: %d" % dataset.size())
     return dataset
Example #11
    def testAddToDataSetWithDifferentLayouts(self):
        p1 = Point()
        p1.setLayout(self.l1) # +1, ref = 2
        p2 = Point()

        ds = DataSet()
        ds.addPoint(p1) # +2 (dataset+pointcopy), ref = 4
        self.assertRaises(Exception, ds.addPoint, p2)
        self.assertEqual(p1.layout().ref(), 4)
        self.assertEqual(p2.layout().ref(), 1)
Example #12
def loadTestDB():
    global useFixedLength, useEnumerate
    ds = DataSet()
    ds.load(TEST_DATABASE)

    if useFixedLength:
        ds = fixLength(ds)

    if useEnumerate:
        ds = enumerateStrings(ds, exclude='chords_progression_hash.value')

    return ds
Example #13
def loadSmallDB():
    global useFixedLength, useEnumerate
    ds = DataSet()
    ds.load(TEST_SMALLDB)

    if useFixedLength:
        ds = fixLength(ds)

    if useEnumerate:
        ds = enumerateStrings(ds)

    return ds
Example #14
def loadGaia20DB():
    global useFixedLength, useEnumerate
    ds = DataSet()
    ds.load(GAIA20_DB)

    if useFixedLength:
        ds = fixLength(ds)

    if useEnumerate:
        ds = enumerateStrings(ds)

    return ds
Example #15
    def run(self, className, outfilename, param, dsname, gtname, evalconfig):

        try:
            classifier = param['classifier']
            gt = GroundTruth(classifier)
            gt.load(gtname)

            # force the GroundTruth class name to be the one specified by our project file, not
            # the one in the original groundtruth file
            gt.className = className

            ds = DataSet()
            ds.load(dsname)

            # some points may have failed to be analyzed, remove those from the GroundTruth
            pnames = ds.pointNames()
            for pid in list(gt.keys()):
                if pid not in pnames:
                    log.warning('Removing %s from GroundTruth as it could not be found in the merged dataset' % pid)
                    del gt[pid]

            trainerFun, trainingparam, newds = getTrainer(classifier, param, ds)

            # run all the evaluations specified in the evaluation config
            for i, evalparam in enumerate(evalconfig):
                # if we already ran this evaluation, no need to run it again...
                resultFilename = outfilename + '_%d.result' % i
                if exists(resultFilename):
                    log.info('File %s already exists. Skipping evaluation...' % resultFilename)
                    continue

                log.info('Running evaluation %d for: %s with classifier %s and dataset %s' % (i, outfilename,
                                                                                              param['classifier'],
                                                                                              param['preprocessing']))
                log.info('    PID: %d, parameters: %s' % (os.getpid(), json.dumps(param)))

                # run evaluation
                confusion = evaluateNfold(evalparam['nfold'], ds, gt, trainerFun, **trainingparam)

                # write evaluation params & result
                with open(outfilename + '_%d.param' % i, 'w') as f:
                    yaml.dump({ 'model': param, 'evaluation': evalparam }, f)

                confusion.save(resultFilename)

        except Exception:
            log.error('While doing evaluation with param = %s\nevaluation = %s' % (param, evalconfig))
            raise
Example #16
def createSimpleDataSet():
    global useFixedLength, useEnumerate
    l = createSimpleLayout()
    ds = DataSet()
    p = Point()
    p.setName('p')
    p.setLayout(l)
    ds.addPoint(p)

    if useFixedLength:
        ds = fixLength(ds)

    if useEnumerate:
        ds = enumerateStrings(ds)

    return ds
Example #17
 def __init__(self):
     self.index_path = INDEX_DIR
     self.original_dataset = DataSet()
     self.original_dataset_path = self.__get_dataset_path(INDEX_NAME)
     self.metrics = {}
     self.view = None
     self.__load_dataset()
Example #18
    def testValues(self):
        collection = yaml.load(open(testdata.TEST_DATABASE_FILES, 'r').read())

        # prepend 'data/' to the filenames
        for pid, filename in list(collection.items()):
            collection[pid] = 'data/' + filename

        cvar.verbose = False
        ds = DataSet.mergeFiles(collection)
        cvar.verbose = True

        self.assertAlmostEqual(
            ds.point('Panic-The Smiths.mp3').value('danceability'),
            0.5691167712)

        self.assertAlmostEqual(
            ds.point('11 Go.mp3').value('energy.mean'), 0.0231081359)

        self.assertAlmostEqual(
            ds.point('03 The Chopper [Shy FX Remix].mp3').value(
                'chords_number_rate'), 0.0551007539)

        self.assertEqual(
            ds.point('08 I Can\'t Dance - Genesis.mp3').label('key_key'), 'D#')

        self.assertEqual(
            ds.point('06 Booo!.mp3').label('chords_mode'), 'major')

        ds.save(testdata.TEST_DATABASE)
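DataSet.mergeFiles builds a dataset directly from a mapping of point names to analysis files, as used above. A minimal hypothetical sketch (point names and .sig paths are made up):

collection = {'track_1': 'data/track_1.sig', 'track_2': 'data/track_2.sig'}
ds = DataSet.mergeFiles(collection)
ds.save('merged.db')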
Example #19
def train_svm_history(project, params, output_file_path):
    params_model = params["model"]
    if params_model.pop("classifier") != "svm":
        raise GaiaWrapperException("Can only use this script on SVM config parameters.")

    ds = DataSet()
    ds.load(os.path.join(
        project["datasetsDirectory"],
        "%s-%s.db" % (project["className"], params_model.pop("preprocessing"))
    ))

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    history = train_svm(ds, gt, **params_model)  # doing the whole training
    history.save(output_file_path)
Example #20
    def __init__(self, indexing_only_mode=False):
        self.indexing_only_mode = indexing_only_mode
        self.index_path = INDEX_DIR
        self.original_dataset = DataSet()
        self.pca_dataset = DataSet()
        if not self.indexing_only_mode:
            self.original_dataset_path = self.__get_dataset_path(INDEX_NAME)
        else:
            self.original_dataset_path = self.__get_dataset_path(
                INDEXING_SERVER_INDEX_NAME)
        self.descriptor_names = {}
        self.metrics = {}
        self.view = None
        self.view_pca = None
        self.transformations_history = None

        self.__load_dataset()
Example #21
def train_SVM(dataset, groundTruth, descriptorNames, exclude = [], svmtype = 'c-svc', kernel = 'rbf', c = 1, gamma = 1):
    # recreate a copy of the given dataset without history
    ds = DataSet()
    ds.addPoints([ p for p in dataset.points() ])

    ds = transform(ds, 'normalize', { 'descriptorNames': descriptorNames,
                                      'except': exclude,
                                      'independent': True })

    ds = transform(ds, 'svmtrain', { 'descriptorNames': descriptorNames,
                                     'except': exclude,
                                     'className': groundTruth.className,
                                     'type': svmtype,
                                     'kernel': kernel,
                                     'c': c,
                                     'gamma': gamma})

    h = ds.history()
    return lambda p: str(h.mapPoint(p)[groundTruth.className])
Example #22
def train_svm_history(project, params, output_file_path):
    params_model = params["model"]
    if params_model.get("classifier") != "svm":
        raise GaiaWrapperException("Can only use this script on SVM config parameters.")

    ds = DataSet()
    ds.load(os.path.join(
        project["datasetsDirectory"],
        "%s-%s.db" % (project["className"], params_model["preprocessing"])
    ))

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    history = train_svm(ds, gt, type=params_model["type"], kernel=params_model["kernel"],
                        C=params_model["C"], gamma=params_model["gamma"])  # doing the whole training
    if isinstance(output_file_path, unicode):
        output_file_path = output_file_path.encode("utf-8")
    history.save(output_file_path)
Example #23
def evaluateNfold(nfold, dataset, groundTruth, trainingFunc, *args, **kwargs):
    """Evaluate the classifier on the given dataset and returns the confusion matrix.

    The evaluation is performed using n-fold cross validation.
    Uses only the points that are in the groundTruth parameter for the evaluation.

    Parameters
    ----------

    nfold        : the number of folds to use for the cross-validation
    dataset      : the dataset from which to get the points
    groundTruth  : a map from the points to classify to their respective class
    trainingFunc : a function which will train and return a classifier given a dataset,
                   the groundtruth, and the *args and **kwargs arguments
    """
    log.info('Doing %d-fold cross validation' % nfold)
    classes = set(groundTruth.values())
    progress = TextProgress(nfold, 'Evaluating fold %(current)d/%(total)d')

    # get map from class to point names
    iclasses = {}
    for c in classes:
        iclasses[c] = [ p for p in groundTruth.keys() if groundTruth[p] == c ]
        random.shuffle(iclasses[c])

    # get folds
    folds = {}
    for i in range(nfold):
        folds[i] = []
        for c in iclasses.values():
            foldsize = (len(c)-1)//nfold + 1 # -1/+1 so we take all instances into account, last fold might have fewer instances
            folds[i] += c[ foldsize * i : foldsize * (i+1) ]

    # build sub-datasets and run evaluation on them
    confusion = None
    pnames = [ p.name() for p in dataset.points() ]

    for i in range(nfold):
        if log.isEnabledFor(logging.INFO):
            progress.update(i+1)

        trainds = DataSet()
        trainds.addPoints([ dataset.point(pname) for pname in pnames if pname not in folds[i] ])
        traingt = GroundTruth(groundTruth.className, dict([ (p, c) for p, c in groundTruth.items() if p not in folds[i] ]))

        testds = DataSet()
        testds.addPoints([ dataset.point(str(pname)) for pname in folds[i] ])
        testgt = GroundTruth(groundTruth.className, dict([ (p, c) for p, c in groundTruth.items() if p in folds[i] ]))

        classifier = trainingFunc(trainds, traingt, *args, **kwargs)
        confusion = evaluate(classifier, testds, testgt, confusion, verbose = False)

    return confusion
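A hypothetical usage sketch of evaluateNfold, pairing it with a trainer such as the train_SVM function shown elsewhere on this page (ds and gt are assumed to exist; parameter values are illustrative):

confusion = evaluateNfold(10, ds, gt, train_SVM,
                          descriptorNames='*', exclude=[],
                          svmtype='c-svc', kernel='rbf', c=1, gamma=1)
confusion.save('svm_10fold.result')   # hypothetical output file name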
Example #24
File: fusion.py Project: DomT4/gaia
def transformDataSet(inputFilename, outputFilename, transfoFile=None):
    """Apply the list of transformations given as a yaml sequence to the specified dataset."""
    print('Preprocessing dataset chunk for %s...' % outputFilename)
    gaia2.cvar.verbose = False

    transfoList = '''
    - transfo: removevl
    - transfo: fixlength
    - transfo: cleaner
    '''

    if transfoFile is not None:
        transfoList = open(transfoFile).read()

    ds = DataSet()
    ds.load(inputFilename)

    ds = applyTransfoChain(ds, transfoList)

    ds.save(outputFilename)
Example #25
def createDataSet():
 
    l = PointLayout()
    l.add('a', RealType)

    ds = DataSet()

    # p1.a = (0.0, 0.0)
    p = Point()
    p.setName('p1')
    p.setLayout(l)
    p['a'] = (0.0, 0.0)
    ds.addPoint(p)

    # p2.a = (0.5, 1.0)
    p = Point()
    p.setName('p2')
    p.setLayout(l)
    p['a'] = (0.5, 1.0)
    ds.addPoint(p)

    if testdata.useFixedLength:
        ds = testdata.fixLength(ds)

    if testdata.useEnumerate:
        ds = testdata.enumerateStrings(ds)

    return ds
Example #26
def gaia_transform(points):
    """
        Takes a dict of point names and filepaths.
        Creates a DataSet and performs the standard transformations 
    """
    ds = DataSet.mergeFiles(points)
    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'cleaner')
    for desc in get_unused_descriptors():
        try:
            ds = transform(ds, 'remove', desc)
        except Exception as e:
            log.error("Problem removing this descriptor: %s" % e)
    return ds
Example #27
    def testComplexReferenceCounting(self):
        ds = DataSet()
        self.assertEqual(ds.layout().ref(), 2) # 1 + 1 from temp object

        p = Point()
        p.setName('p1')
        lext = PointLayout(p.layout()) # +1, {lext,p}.ref = 2
        self.assertEqual(lext.ref(), 2)

        lext = p.layout().copy() # copy, lext.ref = 1; p.ref -= 1, = 1
        self.assertEqual(lext.ref(), 1)

        ds.addPoint(p) # +3 (dataset + pointcopy), ref = 3

        self.assertEqual(lext.ref(), 1)
        self.assertEqual(ds.layout().ref(), 4) # 3 + 1 temp object

        p2 = Point(p) # +1, {p,p2}.ref = 5
        p2.setName('p2')
        self.assertEqual(ds.layout().ref(), 5)
        ds.addPoint(p2)
        self.assertEqual(ds.layout().ref(), 6) # +1 pointcopy, ref = 6
Example #28
File: pca.py Project: DomT4/gaia
def PCA(x):
    points = []
    layout = PointLayout()
    layout.add('x', RealType)

    for i, l in enumerate(x):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(layout)
        p['x'] = l
        points.append(p)

    ds = DataSet()
    ds.addPoints(points)

    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'pca', { 'dimension': len(x[0]), 'resultName': 'pca' })

    result = []
    for p in ds.points():
        result.append(p['pca'])

    return result
Example #29
def PCA(x):
    points = []
    layout = PointLayout()
    layout.add('x', RealType)

    for i, l in enumerate(x):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(layout)
        p['x'] = l
        points.append(p)

    ds = DataSet()
    ds.addPoints(points)

    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'pca', {'dimension': len(x[0]), 'resultName': 'pca'})

    result = []
    for p in ds.points():
        result.append(p['pca'])

    return result
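A hypothetical usage sketch of PCA: the input is a list of equal-length real vectors (values made up for illustration); the result contains one projected descriptor per input point.

vectors = [[1.0, 2.0, 3.0], [2.1, 3.9, 6.0], [0.5, 1.2, 1.4]]
projected = PCA(vectors)   # len(projected) == len(vectors)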
Example #30
def readLibSVMDataSet(filename):
    data = [l.split() for l in open(filename).readlines()]
    minidx = maxidx = 1
    for l in data:
        for i in range(1, len(l)):
            dim, value = l[i].split(':')
            l[i] = (int(dim), float(value))
            minidx = min(minidx, int(dim))
            maxidx = max(maxidx, int(dim))

    dimension = maxidx - minidx + 1

    layout = PointLayout()
    layout.add('class', StringType)
    layout.add('value', RealType)

    ds = DataSet()
    n = 0
    points = []

    for l in data:
        p = Point()
        p.setLayout(layout)
        p.setName('instance_%06d' % n)
        n += 1

        p['class'] = l[0]
        desc = RealDescriptor(dimension, 0.0)
        for dim, value in l[1:]:
            desc[dim - minidx] = value
        p['value'] = desc

        points.append(p)

    ds.addPoints(points)

    return ds
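Each line of a libSVM file has the form "<label> <dim>:<value> ...". A hypothetical usage sketch (the file name and contents are made up):

# train.libsvm might contain lines such as:
#   +1 1:0.7 3:0.2
#   -1 2:0.5 4:0.9
ds = readLibSVMDataSet('train.libsvm')
print('%d points loaded' % ds.size())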
Example #31
    def testComplete(self):
        # load 2.0 dataset, history, apply history to dataset
        # check nn-search results are the same as the ones we get when doing it from gaia 2.0
        ds = DataSet()
        ds.load(testdata.GAIA_20_BACKWARDS_COMPAT_DATASET)

        h = TransfoChain()

        self.assertRaises(Exception, h.load, testdata.GAIA_20_BACKWARDS_COMPAT_HISTORY)
        return

        h.load(testdata.GAIA_20_BACKWARDS_COMPAT_HISTORY)

        ds = h.mapDataSet(ds)
        v = View(ds)
        dist = MetricFactory.create('euclidean', ds.layout())

        results = v.nnSearch('01 Respect.mp3', dist).get(5)
        self.compareResults(results, testdata.GAIA_20_BACKWARDS_COMPAT_RESULTS)

        ds21 = DataSet()
        ds21.load(testdata.TEST_DATABASE)
        results = v.nnSearch(h.mapPoint(ds21.point('01 Respect.mp3')), dist).get(5)
        self.compareResults(results, testdata.GAIA_20_BACKWARDS_COMPAT_RESULTS)
Example #32
def createDataSet():
    ds = DataSet()

    # p0.a = (0.0, 0.0) (α = undefined)
    p0 = newPoint('p0')
    p0['a'] = (0.0, 0.0)

    # p1.a = (1.0, 0.0) (α = 0)
    p1 = newPoint('p1')
    p1['a'] = (1.0, 0.0)

    # p2.a = (0.0, 1.0) (α = π/2)
    p2 = newPoint('p2')
    p2['a'] = (0.0, 1.0)

    # p3.a = (-1.0, 0.0) (α = π)
    p3 = newPoint('p3')
    p3['a'] = (-1.0, 0.0)

    # p4.a = (1.0, 1.0) (α = π/4)
    p4 = newPoint('p4')
    p4['a'] = (1.0, 1.0)

    # p5.a = (1.0, -1.0) (α = -π/4)
    p5 = newPoint('p5')
    p5['a'] = (1.0, -1.0)

    ds.addPoints([ p0, p1, p2, p3, p4, p5 ])

    if testdata.useFixedLength:
        ds = testdata.fixLength(ds)

    if testdata.useEnumerate:
        ds = testdata.enumerateStrings(ds)

    return ds
Example #33
    def testDataSet(self):
        # load 2.0 dataset, check some values are correct
        ds = DataSet()
        ds.load(testdata.GAIA_20_BACKWARDS_COMPAT_DATASET)

        self.assertAlmostEqual(ds.point('01 Message - Grandmaster Flash.mp3').value('tempotap_bpm'),
            101.05792999)

        self.assertEqual(ds.point('04 Blue Skies.mp3').label('key_key'),
            'G#')
Example #34
    def testHistory(self):
        ds = testdata.loadTestDB()
        ignored_descs = testdata.TEST_DATABASE_VARLENGTH_REAL

        testdata.resetSettings()
        ds_orig = testdata.loadTestDB()

        # cleaning, mandatory step
        ds = transform(ds, 'fixlength', {'except': ignored_descs})
        cleaned_db = transform(ds, 'cleaner', {'except': ignored_descs})

        # removing annoying descriptors, like mfcc.cov & mfcc.icov, which don't
        # like to be normalized like the other ones (constant value: dimension)
        no_mfcc_db = transform(cleaned_db, 'remove',
                               {'descriptorNames': '*mfcc*'})

        # normalize, to have everyone change values
        normalized_db = transform(no_mfcc_db, 'normalize',
                                  {'except': ignored_descs})

        testPoints = [
            '01 Oye Como Va - Santana.mp3', '02 Carmen Burana- O Fortuna.mp3',
            '07 Romeo and Juliet- the Knights\' Dance.mp3', '11 Lambada.mp3'
        ]

        for pointName in testPoints:
            p1 = normalized_db.point(pointName)
            p2 = normalized_db.history().mapPoint(ds_orig.point(pointName))

            for name in p1.layout().descriptorNames():
                self.assertEqual(p1[name], p2[name])

        (tmpFile, tmpName) = tempfile.mkstemp()
        os.close(tmpFile)
        normalized_db.save(tmpName)
        reloaded_db = DataSet()
        reloaded_db.load(tmpName)

        for pointName in testPoints:
            p1 = normalized_db.point(pointName)
            p2 = normalized_db.history().mapPoint(ds_orig.point(pointName))
            p3 = reloaded_db.point(pointName)
            p4 = reloaded_db.history().mapPoint(ds_orig.point(pointName))

            self.assert_(p1.layout() == p2.layout())
            self.assert_(p2.layout() == p3.layout())
            self.assert_(p3.layout() == p4.layout())

            for name in p1.layout().descriptorNames():
                self.assertEqual(p1[name], p2[name])
                self.assertEqual(p2[name], p3[name])
                self.assertEqual(p3[name], p4[name])

        # remove temp file
        os.remove(tmpName)
Example #35
    def testSecondChanceForLayoutEquality(self):
        '''ticket #21: points try to morph to adapt to dataset if they cannot be naturally inserted'''
        ds = DataSet()
        p = Point()

        p.setName('Paris Hilton')
        p.load('data/04 - Cansei de Ser Sexy - Meeting Paris Hilton.mp3.sig')
        ds.addPoint(p)

        p.setName('2005')
        p.load('data/11_2005-fwyh.mp3.sig')
        ds.addPoint(p)

        self.assertEqual(ds.point('2005')['title'], '2005')
Example #36
    def __init__(self):
        self.as_dataset = DataSet()
        self.tag_dataset = DataSet()
        self.fs_dataset = DataSet()
        self.ac_dataset = DataSet()
        self.gaia_similiarity = None

        self.index_path = clust_settings.INDEX_DIR

        self.as_view = None
        self.as_metric = None
        self.tag_view = None
        self.tag_metric = None
        self.fs_view = None
        self.fs_metric = None
        self.ac_view = None
        self.ac_metric = None

        self.__load_datasets()
Example #37
def transformDataSet(inputFilename, outputFilename, transfoFile=None):
    """Apply the list of transformations given as a yaml sequence to the specified dataset."""
    print('Preprocessing dataset chunk for %s...' % outputFilename)
    gaia2.cvar.verbose = False

    transfoList = '''
    - transfo: removevl
    - transfo: fixlength
    - transfo: cleaner
    '''

    if transfoFile is not None:
        transfoList = open(transfoFile).read()

    ds = DataSet()
    ds.load(inputFilename)

    ds = applyTransfoChain(ds, transfoList)

    ds.save(outputFilename)
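A hypothetical usage sketch (file names are made up): preprocess a dataset chunk with the default removevl/fixlength/cleaner chain, or with a custom YAML transformation list.

transformDataSet('chunk_00.db', 'chunk_00_clean.db')
transformDataSet('chunk_00.db', 'chunk_00_custom.db', transfoFile='transfos.yaml')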
Example #38
class GaiaWrapper:
    def __init__(self, indexing_only_mode=False):
        self.indexing_only_mode = indexing_only_mode
        self.index_path = sim_settings.INDEX_DIR
        self.original_dataset = DataSet()
        self.pca_dataset = DataSet()
        if not self.indexing_only_mode:
            self.original_dataset_path = self.__get_dataset_path(
                sim_settings.INDEX_NAME)
        else:
            self.original_dataset_path = self.__get_dataset_path(
                sim_settings.INDEXING_SERVER_INDEX_NAME)
        self.descriptor_names = {}
        self.metrics = {}
        self.view = None
        self.view_pca = None
        self.transformations_history = None

        self.__load_dataset()

    def __get_dataset_path(self, ds_name):
        return os.path.join(sim_settings.INDEX_DIR, ds_name + '.db')

    def __load_dataset(self):
        """
        Loads the dataset, does all the necessary steps to make it available for similarity queries and creates the PCA
        version of it. If the dataset does not exist, creates a new empty one.
        NOTE: we assume that loaded datasets will have been prepared and normalized (see
        __prepare_original_dataset() and __normalize_original_dataset()) in due time (see the add_point() method below),
        therefore this function does not prepare or normalize loaded datasets.
        """

        if not os.path.exists(sim_settings.INDEX_DIR):
            os.makedirs(sim_settings.INDEX_DIR)

        # load original dataset
        if os.path.exists(self.original_dataset_path):
            self.original_dataset.load(self.original_dataset_path)
            self.__calculate_descriptor_names()

            if self.original_dataset.size() >= sim_settings.SIMILARITY_MINIMUM_POINTS \
                    and not self.indexing_only_mode:

                # Save transformation history so we do not need to compute it every time we need it
                self.transformations_history = self.original_dataset.history().toPython()

                # Build metrics for the different similarity presets, create a Gaia view
                self.__build_metrics()
                view = View(self.original_dataset)
                self.view = view

                # Compute PCA and create pca view and metric
                # NOTE: this step may take a long time if the dataset is big, but it only needs to be performed once
                # when the similarity server is loaded.
                self.pca_dataset = transform(
                    self.original_dataset, 'pca', {
                        'descriptorNames': sim_settings.PCA_DESCRIPTORS,
                        'dimension': sim_settings.PCA_DIMENSIONS,
                        'resultName': 'pca'
                    })
                self.pca_dataset.setReferenceDataSet(self.original_dataset)
                self.view_pca = View(self.pca_dataset)
                self.__build_pca_metric()

            if self.original_dataset.history().size() <= 0:
                logger.info('Dataset loaded, size: %s points' %
                            (self.original_dataset.size()))
            else:
                logger.info(
                    'Dataset loaded, size: %s points (%i fixed-length desc., %i variable-length desc.)'
                    % (self.original_dataset.size(),
                       len(self.descriptor_names['fixed-length']),
                       len(self.descriptor_names['variable-length'])))

        else:
            # If there is no existing dataset we create an empty one.
            # For the moment we do not create any distance metric nor a view because search won't be possible until
            # the DB has a minimum of SIMILARITY_MINIMUM_POINTS
            self.original_dataset.save(self.original_dataset_path)
            self.__calculate_descriptor_names()
            logger.info('Created new dataset, size: %s points (should be 0)' %
                        (self.original_dataset.size()))

    def __prepare_original_dataset(self):
        logger.info('Preparing the original dataset.')
        self.original_dataset = self.prepare_original_dataset_helper(
            self.original_dataset)
        self.__calculate_descriptor_names()

    def __normalize_original_dataset(self):
        logger.info('Normalizing the original dataset.')
        self.original_dataset = self.normalize_dataset_helper(
            self.original_dataset, self.descriptor_names['fixed-length'])

    def __calculate_descriptor_names(self):
        layout = self.original_dataset.layout()
        all_descriptor_names = layout.descriptorNames()
        fixed_length_descritpor_names = []
        variable_length_descritpor_names = []
        multidimensional_descriptor_names = []

        for name in all_descriptor_names:
            region = layout.descriptorLocation(name)
            if region.lengthType() == VariableLength:
                variable_length_descritpor_names.append(name)
            else:
                fixed_length_descritpor_names.append(name)
                try:
                    if region.dimension() > 1:
                        multidimensional_descriptor_names.append(name)
                except:  # TODO: exception too broad here...
                    pass

        self.descriptor_names = {
            'all': all_descriptor_names,
            'fixed-length': fixed_length_descritpor_names,
            'variable-length': variable_length_descritpor_names,
            'multidimensional': multidimensional_descriptor_names
        }

    @staticmethod
    def prepare_original_dataset_helper(ds):
        ds = transform(
            ds, 'FixLength'
        )  # Needed to optimize use of fixed-length descriptors and save memory
        ds = transform(
            ds, 'Cleaner'
        )  # Remove descriptors that will cause problems in further transformations
        try:
            ds = transform(ds, 'enumerate',
                           {'descriptorNames': ['.tonal.chords_progression']})
        except:  # TODO: exception too broad here...
            logger.info(
                'WARNING: enumerate transformation to .tonal.chords_progression could not be performed.'
            )
        return ds

    @staticmethod
    def normalize_dataset_helper(ds, descriptor_names):
        # NOTE: The "except" list of descriptors below should be reviewed if a new extractor is used. The point is to
        # remove descriptors that can potentially break the normalize transform (e.g. descriptors with value = 0)
        normalization_params = {
            "descriptorNames": descriptor_names,
            "except": [
                "*.min",
                "*.max",
                "tonal.chords_histogram",
            ],
            "independent": True,
            "outliers": -1
        }
        ds = transform(ds, 'normalize', normalization_params)
        return ds

    def __build_metrics(self):
        for preset in sim_settings.PRESETS:
            if preset != 'pca':  # PCA metric is built only after pca dataset is created so it should not be built here
                logger.info('Building metric for preset %s' % preset)
                name = preset
                path = sim_settings.PRESET_DIR + name + ".yaml"
                preset_file = yaml.safe_load(open(path))
                distance = preset_file['distance']['type']
                parameters = preset_file['distance']['parameters']
                search_metric = DistanceFunctionFactory.create(
                    str(distance), self.original_dataset.layout(), parameters)
                self.metrics[name] = search_metric

    def __build_pca_metric(self):
        logger.info('Building metric for preset pca')
        preset_file = yaml.safe_load(open(sim_settings.PRESET_DIR +
                                          "pca.yaml"))
        distance = preset_file['distance']['type']
        parameters = preset_file['distance']['parameters']
        search_metric = DistanceFunctionFactory.create(
            str(distance), self.pca_dataset.layout(), parameters)
        self.metrics['pca'] = search_metric

    def add_point(self, point_location, point_name):

        if self.original_dataset.contains(str(point_name)):
            self.original_dataset.removePoint(str(point_name))

        p = Point()
        if os.path.exists(str(point_location)):
            try:
                p.load(str(point_location))
                p.setName(str(point_name))
                if self.original_dataset.size() <= sim_settings.SIMILARITY_MINIMUM_POINTS:
                    # Add point to original_dataset because PCA dataset has not been created yet
                    self.original_dataset.addPoint(p)
                    msg = 'Added point with name %s. Index has now %i points.' % \
                          (str(point_name), self.original_dataset.size())
                    logger.info(msg)
                else:
                    # Add point to PCA dataset because it has been already created.
                    # PCA dataset will take care of adding the point to the original dataset as well.
                    self.pca_dataset.addPoint(p)
                    msg = 'Added point with name %s. Index has now %i points (pca index has %i points).' % \
                          (str(point_name), self.original_dataset.size(), self.pca_dataset.size())
                    logger.info(msg)

            except Exception as e:
                msg = 'Point with name %s could NOT be added (%s).' % (
                    str(point_name), str(e))
                logger.info(msg)
                return {
                    'error': True,
                    'result': msg,
                    'status_code': sim_settings.SERVER_ERROR_CODE
                }
        else:
            msg = 'Point with name %s could NOT be added because analysis file does not exist (%s).' % \
                  (str(point_name), str(point_location))
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        if self.original_dataset.size() == sim_settings.SIMILARITY_MINIMUM_POINTS:
            # Do enumerate
            try:
                self.original_dataset = transform(
                    self.original_dataset, 'enumerate',
                    {'descriptorNames': ['.tonal.chords_progression']})
            except:  # TODO: exception too broad here...
                logger.info(
                    'WARNING: enumerate transformation to .tonal.chords_progression could not be performed.'
                )

        # If adding a new point brings the dataset up to the minimum number of points for similarity, do everything
        # needed so it can be used for search. This includes preparing the dataset, normalizing it, saving it and
        # creating the view and distance metrics. This only happens once, when the size reaches SIMILARITY_MINIMUM_POINTS.
        if self.original_dataset.size() == sim_settings.SIMILARITY_MINIMUM_POINTS \
                and not self.indexing_only_mode:
            self.__prepare_original_dataset()
            self.__normalize_original_dataset()
            self.transformations_history = self.original_dataset.history().toPython()
            self.save_index(msg="(reaching %i points)" %
                            sim_settings.SIMILARITY_MINIMUM_POINTS)

            # TODO: the code below is repeated from __load_dataset() method, should be moved into a util function
            # Build metrics for the different similarity presets, create a Gaia view
            self.__build_metrics()
            view = View(self.original_dataset)
            self.view = view

            # Compute PCA and create pca view and metric
            # NOTE: this step may take a long time if the dataset is big, but it only needs to be performed once
            # when the similarity server is loaded.
            self.pca_dataset = transform(
                self.original_dataset, 'pca', {
                    'descriptorNames': sim_settings.PCA_DESCRIPTORS,
                    'dimension': sim_settings.PCA_DIMENSIONS,
                    'resultName': 'pca'
                })
            self.pca_dataset.setReferenceDataSet(self.original_dataset)
            self.view_pca = View(self.pca_dataset)
            self.__build_pca_metric()

        return {'error': False, 'result': msg}

    def delete_point(self, point_name):
        if self.original_dataset.contains(str(point_name)):
            if self.original_dataset.size() <= sim_settings.SIMILARITY_MINIMUM_POINTS:
                # Remove from original dataset
                self.original_dataset.removePoint(str(point_name))
            else:
                # Remove from pca dataset (pca dataset will take care of removing from original dataset too)
                self.pca_dataset.removePoint(str(point_name))
            logger.info(
                'Deleted point with name %s. Index has now %i points (pca index has %i points).'
                % (str(point_name), self.original_dataset.size(),
                   self.pca_dataset.size()))
            return {'error': False, 'result': True}
        else:
            msg = 'Can\'t delete point with name %s because it does not exist.' % str(
                point_name)
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.NOT_FOUND_CODE
            }

    def get_point(self, point_name):
        logger.info('Getting point with name %s' % str(point_name))
        if self.original_dataset.contains(str(point_name)):
            return self.original_dataset.point(str(point_name))

    def get_all_point_names(self):
        point_names = sorted(
            [int(name) for name in self.original_dataset.pointNames()])
        logger.info('Getting all point names (%i points)' % len(point_names))
        return {'error': False, 'result': point_names}

    def save_index(self, filename=None, msg=""):
        tic = time.time()
        path = self.original_dataset_path
        if filename:
            path = sim_settings.INDEX_DIR + filename + ".db"
        logger.info('Saving index to (%s)...' % path + msg)
        self.original_dataset.save(path)
        toc = time.time()
        logger.info(
            'Finished saving index (done in %.2f seconds, index has now %i points).'
            % ((toc - tic), self.original_dataset.size()))
        return {'error': False, 'result': path}

    def contains(self, point_name):
        logger.info('Checking if index has point with name %s' %
                    str(point_name))
        return {
            'error': False,
            'result': self.original_dataset.contains(point_name)
        }

    def get_sounds_descriptors(self,
                               point_names,
                               descriptor_names=None,
                               normalization=True,
                               only_leaf_descriptors=False):
        """
        Returns a list with the descriptor values for all requested point names
        """

        logger.info('Getting descriptors for points %s' %
                    ','.join([str(name) for name in point_names]))

        # Add dot '.' at the beginning of descriptor names if not present
        if descriptor_names:
            descriptor_names_aux = list()
            for name in descriptor_names:
                if name[0] != '.':
                    descriptor_names_aux.append('.' + name)
                else:
                    descriptor_names_aux.append(name)
            descriptor_names = descriptor_names_aux[:]
        data = dict()
        required_descriptor_names = self.__calculate_complete_required_descriptor_names(
            descriptor_names, only_leaf_descriptors=only_leaf_descriptors)

        if type(required_descriptor_names) == dict:
            return required_descriptor_names  # There has been an error

        for point_name in point_names:
            sound_descriptors = self.__get_point_descriptors(
                point_name, required_descriptor_names, normalization)
            if 'error' not in sound_descriptors:
                data[point_name] = sound_descriptors

        return {'error': False, 'result': data}

    def __calculate_complete_required_descriptor_names(
            self, descriptor_names, only_leaf_descriptors=False):
        if not descriptor_names:
            descriptor_names = self.descriptor_names['all'][:]
        try:
            structured_layout = generate_structured_dict_from_layout(
                self.descriptor_names['all'][:])
            processed_descriptor_names = []
            for name in descriptor_names:
                nested_descriptors = get_nested_dictionary_value(
                    name.split('.')[1:], structured_layout)
                if not nested_descriptors:
                    processed_descriptor_names.append(name)
                else:
                    if only_leaf_descriptors:
                        # only return descriptors if nested descriptors are statistics
                        if len(
                                set(nested_descriptors.keys()).intersection([
                                    'min', 'max', 'dvar2', 'dmean2', 'dmean',
                                    'var', 'dvar', 'mean'
                                ])) > 0:
                            for extra_name in nested_descriptors.keys():
                                processed_descriptor_names.append(
                                    '%s.%s' % (name, extra_name))
                    else:
                        # Return all nested descriptor names
                        extra_names = []
                        get_nested_descriptor_names(nested_descriptors,
                                                    extra_names)
                        for extra_name in extra_names:
                            processed_descriptor_names.append(
                                '%s.%s' % (name, extra_name))
            processed_descriptor_names = list(set(processed_descriptor_names))
            return processed_descriptor_names
        except:
            return {
                'error': True,
                'result': 'Wrong descriptor names, unable to create layout.',
                'status_code': sim_settings.BAD_REQUEST_CODE
            }

    def __get_point_descriptors(self,
                                point_name,
                                required_descriptor_names,
                                normalization=True):
        """
        Get normalization coefficients to transform the input data (get info from the last transformation which has
        been a normalization)
        """

        normalization_coeffs = None
        if not normalization:
            trans_hist = self.transformations_history
            for i in range(0, len(trans_hist)):
                if trans_hist[-(i + 1)]['Analyzer name'] == 'normalize':
                    normalization_coeffs = \
                        trans_hist[-(i + 1)]['Applier parameters']['coeffs']

        required_layout = generate_structured_dict_from_layout(
            required_descriptor_names)
        try:
            p = self.original_dataset.point(str(point_name))
        except:
            return {
                'error': True,
                'result': 'Sound does not exist in gaia index.',
                'status_code': sim_settings.NOT_FOUND_CODE
            }

        for descriptor_name in required_descriptor_names:
            try:
                value = p.value(str(descriptor_name))
                if normalization_coeffs:
                    if descriptor_name in normalization_coeffs:
                        a = normalization_coeffs[descriptor_name]['a']
                        b = normalization_coeffs[descriptor_name]['b']
                        if len(a) == 1:
                            value = float(value - b[0]) / a[0]
                        else:
                            normalized_value = []
                            for i in range(0, len(a)):
                                normalized_value.append(
                                    float(value[i] - b[i]) / a[i])
                            value = normalized_value
            except:
                try:
                    value = p.label(str(descriptor_name))
                except:
                    value = None

            if descriptor_name[0] == '.':
                descriptor_name = descriptor_name[1:]
            set_nested_dictionary_value(descriptor_name.split('.'),
                                        required_layout, value)
        return required_layout

    # SIMILARITY SEARCH and CONTENT SEARCH

    def search_dataset(self,
                       query_point,
                       number_of_results,
                       preset_name,
                       offset=0):
        preset_name = str(preset_name)
        results = []
        count = 0
        size = self.original_dataset.size()
        if size < sim_settings.SIMILARITY_MINIMUM_POINTS:
            msg = 'Not enough datapoints in the dataset (%s < %s).' % (
                size, sim_settings.SIMILARITY_MINIMUM_POINTS)
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        query_point = str(query_point)
        logger.info('NN search for point with name %s (preset = %s)' %
                    (query_point, preset_name))
        results = []

        if not self.original_dataset.contains(query_point):
            msg = "Sound with id %s doesn't exist in the dataset." % query_point
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.NOT_FOUND_CODE
            }
        if preset_name == 'pca':
            # Search on PCA view
            search = self.view_pca.nnSearch(query_point,
                                            self.metrics[preset_name])
        else:
            # Search on original dataset view
            search = self.view.nnSearch(query_point, self.metrics[preset_name])
        results = search.get(int(number_of_results), offset=int(offset))
        count = search.size()

        return {'error': False, 'result': {'results': results, 'count': count}}

    def api_search(self, target_type, target, filter, preset_name,
                   metric_descriptor_names, num_results, offset, in_ids):

        # Check if index has sufficient points
        size = self.original_dataset.size()
        if size < sim_settings.SIMILARITY_MINIMUM_POINTS:
            msg = 'Not enough datapoints in the dataset (%s < %s).' % (
                size, sim_settings.SIMILARITY_MINIMUM_POINTS)
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        # Get some dataset parameters that will be useful later
        trans_hist = self.transformations_history
        layout = self.original_dataset.layout()
        pca_layout = self.pca_dataset.layout()
        coeffs = None  # Get normalization coefficients
        for i in range(0, len(trans_hist)):
            if trans_hist[-(i + 1)]['Analyzer name'] == 'normalize':
                coeffs = trans_hist[-(i + 1)]['Applier parameters']['coeffs']

        # Process target
        if target:
            if target_type == 'sound_id':
                query_point = str(target)
                if not self.original_dataset.contains(query_point):
                    msg = "Sound with id %s doesn't exist in the dataset and can not be set as similarity target." \
                          % query_point
                    logger.info(msg)
                    return {
                        'error': True,
                        'result': msg,
                        'status_code': sim_settings.NOT_FOUND_CODE
                    }
                else:
                    query = query_point

            elif target_type == 'descriptor_values':
                # Transform input params to the normalized feature space and add them to a query point
                # If there are no params specified in the target, the point is set as empty (probably random sounds
                # are returned)
                feature_names = []
                query = Point()
                query.setLayout(layout)
                try:
                    for param in target.keys():
                        # Only add numerical parameters. Non numerical ones (like key) are only used as filters
                        if param in coeffs.keys():
                            feature_names.append(str(param))
                            value = target[param]
                            if coeffs:
                                a = coeffs[param]['a']
                                b = coeffs[param]['b']
                                if len(a) == 1:
                                    norm_value = a[0] * value + b[0]
                                else:
                                    norm_value = []
                                    for i in range(0, len(a)):
                                        norm_value.append(a[i] * value[i] +
                                                          b[i])
                                query.setValue(str(param), norm_value)
                            else:
                                query.setValue(str(param), value)
                except:
                    return {
                        'error': True,
                        'result':
                        'Invalid target (descriptor values could not be correctly parsed)',
                        'status_code': sim_settings.BAD_REQUEST_CODE
                    }

                # Overwrite metric with present descriptors in target
                metric = DistanceFunctionFactory.create(
                    'euclidean', layout, {'descriptorNames': feature_names})

            elif target_type == 'file':
                # Target is specified as the attached file
                # Create a point with the data in 'descriptors_data' and search for it
                target_file_parsing_type = '-'

                try:
                    # Try directly loading the file
                    p, query = Point(), Point()
                    p.loadFromString(yaml.dump(target))
                    if preset_name == 'pca':
                        query = self.pca_dataset.history().mapPoint(
                            p)  # map point to pca dataset
                    else:
                        query = self.original_dataset.history().mapPoint(
                            p)  # map point to original dataset
                    target_file_parsing_type = 'mapPoint'

                except Exception as e:
                    logger.info(
                        'Unable to create gaia point from uploaded file (%s). '
                        'Trying to add descriptors one by one.' % e)

                    # If that does not work, load the descriptors one by one
                    try:
                        query = Point()
                        #query.setLayout(layout)

                        feature_names = []
                        get_nested_descriptor_names(target, feature_names)
                        feature_names = [
                            '.%s' % item for item in feature_names
                        ]
                        nonused_features = []

                        for param in feature_names:
                            if param in coeffs.keys():
                                value = get_nested_dictionary_value(
                                    param[1:].split('.'), target)
                                if coeffs:
                                    try:
                                        a = coeffs[param]['a']
                                        b = coeffs[param]['b']
                                        if len(a) == 1:
                                            norm_value = a[0] * value + b[0]
                                        else:
                                            norm_value = [a[i] * value[i] + b[i]
                                                          for i in range(len(a))]
                                        query.setValue(str(param[1:]), norm_value)
                                    except:
                                        nonused_features.append(param)
                                else:
                                    query.setValue(str(param[1:]), value)
                            else:
                                nonused_features.append(param)

                        if preset_name == 'pca':
                            query = self.pca_dataset.history().mapPoint(
                                query)  # map point to pca dataset
                        else:
                            query = self.original_dataset.history().mapPoint(
                                query)  # map point to original dataset

                        target_file_parsing_type = 'walkDict'

                    except Exception as e:
                        logger.info(
                            'Unable to create gaia point from uploaded file, even when adding descriptors one by '
                            'one (%s)' % e)
                        return {
                            'error':
                            True,
                            'result':
                            'Unable to create gaia point from uploaded file. Probably the '
                            'file does not have the required layout. Are you using the '
                            'correct version of Essentia\'s Freesound extractor?',
                            'status_code':
                            sim_settings.SERVER_ERROR_CODE
                        }
        else:
            query = Point()  # Empty target
            if preset_name == 'pca':
                query.setLayout(pca_layout)
            else:
                query.setLayout(layout)

        # Process filter
        if filter:
            filter = parse_filter_list(filter, coeffs)
        else:
            filter = ""  # Empty filter

        # log
        log_message = 'Similarity search'
        if target:
            if target_type == 'sound_id':
                log_target = '%s (sound id)' % str(target)
            elif target_type == 'descriptor_values':
                log_target = '%s (descriptor values)' % str(target)
            elif target_type == 'file':
                log_target = 'uploaded file (%s)' % target_file_parsing_type
            log_message += ' with target: %s' % log_target
        if filter:
            log_message += ' with filter: %s' % str(filter)
        logger.info(log_message)

        # if in_ids is specified, edit the filter accordingly
        if in_ids:
            if not filter:
                filter = 'WHERE point.id IN ("' + '", "'.join(in_ids) + '")'
            else:
                filter += ' AND point.id IN ("' + '", "'.join(in_ids) + '")'

        # Set query metric
        metric = self.metrics[preset_name]
        if metric_descriptor_names:
            metric = DistanceFunctionFactory.create(
                'euclidean', layout,
                {'descriptorNames': metric_descriptor_names})

        # Do query!
        try:
            if target_type == 'descriptor_values' and target:
                search = self.view.nnSearch(query, metric, str(filter))
            else:
                if preset_name == 'pca':
                    search = self.view_pca.nnSearch(query, metric, str(filter))
                else:
                    search = self.view.nnSearch(query, metric, str(filter))
            results = search.get(num_results, offset=offset)
            count = search.size()
        except Exception as e:
            logger.info('Similarity search failed with error: %s' % e)
            return {
                'error': True,
                'result': 'Similarity server error',
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        note = None
        if target_type == 'file':
            if target_file_parsing_type == 'walkDict':
                note = 'The layout of the given analysis file differed from what we expected. Similarity results ' \
                       'might not be accurate. Was the file generated with the last version of Essentia\'s ' \
                       'Freesound extractor?'

        return {
            'error': False,
            'result': {
                'results': results,
                'count': count,
                'note': note
            }
        }
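For reference, a minimal sketch of how the response built above might be consumed. The wrapper variable and the method name api_search are assumptions for illustration; the keyword arguments mirror the parameter names used in the method body, and the 'error'/'result'/'status_code' keys match the dictionaries it returns.

# Hypothetical usage sketch: `gaia_wrapper` and the `api_search` name are assumed,
# but the response keys ('error', 'result', 'status_code') match the method above.
response = gaia_wrapper.api_search(
    target_type='sound_id',
    target='1234',                  # id of an existing sound in the dataset
    filter=None,
    preset_name='lowlevel',         # preset name assumed for illustration
    metric_descriptor_names=None,
    num_results=10,
    offset=0,
    in_ids=None)

if response['error']:
    print('Search failed (%s): %s' % (response['status_code'], response['result']))
else:
    for point_name, distance in response['result']['results']:
        print(point_name, distance)
    print('Total matches: %s' % response['result']['count'])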
Esempio n. 39
0
File: fusion.py Progetto: DomT4/gaia
def harmonizeChunks(partfiles):
    # TODO: check all histories are the same, if not, try to do sth about it
    # find the GCLD (greatest common layout divisor :-) )
    ds = DataSet()
    ds.load(partfiles[0])
    origLayout = ds.layout().copy()
    gcld = ds.layout().copy()

    for pfile in partfiles[1:]:
        ds.load(pfile)
        gcld = gcld & ds.layout()

    # keep some stats about which descriptors got removed and the reason why before throwing
    # away the original history and simplifying it
    vldescs = set()
    nandescs = set()

    # now that we have our GCLD, transform all the chunks so they have the same layout (our GCLD)
    # and simplify their histories so that they also have the same history (the minimum history
    # required to arrive at this target layout).
    for pfile in partfiles:
        ds.load(pfile)

        for t in ds.history().toPython():
            tname = t['Analyzer name']
            descs = t['Applier parameters']['descriptorNames']
            if   tname == 'cleaner':  nandescs.update(descs)
            elif tname == 'removevl': vldescs.update(descs)

        toremove = ds.layout().differenceWith(gcld)
        if toremove:
            ds = transform(ds, 'remove', { 'descriptorNames': toremove })

        ds.simplifyHistory()
        ds.save(pfile)

    # also get the other descriptors that got removed (because of a select or remove transfo)
    rdescs = set(origLayout.differenceWith(gcld)) - (vldescs | nandescs)

    return vldescs, nandescs, rdescs
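A small usage sketch for harmonizeChunks, assuming a list of .partdb chunk files produced earlier in the merge pipeline (the file names are placeholders). The function rewrites each chunk in place and reports which descriptors were dropped and why.

# Illustrative only; the partfile names are placeholders.
partfiles = ['dataset_0_1000.partdb', 'dataset_1000_2000.partdb']

vldescs, nandescs, rdescs = harmonizeChunks(partfiles)

print('Removed because variable-length:', sorted(vldescs))
print('Removed because constant/NaN/Inf:', sorted(nandescs))
print('Removed by an explicit select/remove:', sorted(rdescs))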
Esempio n. 40
0
def highlevel_mosaic(target, tcorpus, scorpus, scope=5):
    """
        Test the high-level mosaicing process.
        The scope variable controls the number of results returned
        for each target unit that is sought.

    """
    # Create a temporary file for the mosaic audio
    filepath = os.path.join(os.getcwd(), 'temp_mosaic.wav')
    if os.path.isfile(filepath):
        os.remove(filepath)
    mosaic = Mosaic(filepath)
    cost = RepeatUnitCost()
    context = Context()
    gridder = Gridder()
    units = tcorpus.list_audio_units(audio_filename=target, chop='highlevel')
    hdb = scorpus.get_gaia_unit_db(chop='highlevel_%s' % self.chop)
    distance = get_mood_distance(hdb)
    v = View(hdb, distance)
    results = {}
    for f in units:
        p = Point()
        p.load(switch_ext(f, '.yaml'))
        unit_name = switch_ext(os.path.basename(f), '')
        p.setName(unit_name)
        p_m = hdb.history().mapPoint(p)
        results.update({f:v.nnSearch(p_m).get(scope)})
    log.debug("Ok, now we have a dict with each target segment, along with its corresponding nearest matches in source db")
    log.debug("Check to see that we have every second of target audio accounted for - I think not!") 
    #return results
    #new_results = results.copy()
    ds = DataSet()
    for r in results:
        units = []
        for u in results[r]:
            ds.load(switch_ext(u[0], '.db'))
            for n in ds.pointNames():
                units.append(n)
        new_ds = gaia_transform(dict(zip(units, units)))
        results.update({r:new_ds})
    #return results
    # Very important - target units must be in correct order
    index = 0
    index_skip = 0
    for r in sorted(results.keys()):
        tds = DataSet()
        tds.load(switch_ext(r, '.db'))
        #return tds, results
        sds = results[r]
        source_set = set(sds.layout().descriptorNames())
        target_set = set(tds.layout().descriptorNames())
        remove_from_source = source_set.difference(target_set)
        remove_from_target = target_set.difference(source_set)
        if len(remove_from_source) > 0:
            log.debug("Will try to remove %s from the source DataSet" % remove_from_source)
            try:
                sds = transform(results[r], 'remove', {'descriptorNames':list(remove_from_source)})
            except Exception as e:
                log.error("Failed to remove %s from source DataSet: %s" % (list(remove_from_source), e))
                return results[r], tds
        if len(remove_from_target) > 0:
            log.debug("Will try to remove %s from the target DataSet" % remove_from_source)
            try:
                tds = transform(tds, 'remove', {'descriptorNames':list(remove_from_target)})
            except Exception as e:
                log.error("Failed to remove %s from target DataSet: %s" % (list(remove_from_target), e))
                return results[r], tds
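The listing above is cut short, but its final step, dropping descriptors that exist on only one side so both DataSets end up with a common layout, can be shown in isolation. A minimal sketch, assuming source_ds and target_ds are two already-loaded gaia2 DataSet objects:

# Sketch of the layout-reconciliation step used above; `source_ds` and `target_ds`
# are assumed to be gaia2 DataSet objects that have already been loaded.
from gaia2 import transform

def align_layouts(source_ds, target_ds):
    """Remove descriptors that are not present in both datasets."""
    source_names = set(source_ds.layout().descriptorNames())
    target_names = set(target_ds.layout().descriptorNames())
    only_in_source = source_names - target_names
    only_in_target = target_names - source_names
    if only_in_source:
        source_ds = transform(source_ds, 'remove', {'descriptorNames': list(only_in_source)})
    if only_in_target:
        target_ds = transform(target_ds, 'remove', {'descriptorNames': list(only_in_target)})
    return source_ds, target_ds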
Esempio n. 41
0
 def load_gaia_db(self):
     """Load the gaia database from disk."""
     dataset = DataSet()
     dataset.load(self.gaia_db_path)
     return dataset
Esempio n. 42
0
 def testDoubleLoadMixedVersions(self):
     ds = DataSet()
     ds.load(testdata.TEST_DATABASE)
     ds.load(testdata.GAIA_20_BACKWARDS_COMPAT_DATASET)
Esempio n. 43
0
File: fusion.py Progetto: DomT4/gaia
def mergeAll(pointList, outputFilename, chunkSize, transfoFile, select = None, exclude = None):
    # TODO: validation of the yaml file format? (ie: pre-2.3 yaml files should be rejected)
    totalPoints = len(fastyaml.load(open(pointList).read()))

    begin, end = 0, chunkSize
    partfiles = []
    partfileTemplate = outputFilename + '_%d_%d.partdb'

    # keep this information for future reference as it won't be accessible anymore
    # once the dataset is merged
    excluded = []
    if exclude:
        p = gaia2.Point()
        # inspect the layout of the first point referenced in the list
        p.load(list(gaia2.fastyaml.loadfile(pointList).items())[0][1])
        excluded = p.layout().descriptorNames(exclude)

    # merge each chunk separately
    # this includes removevl and fixlength, which should yield smaller files than just after
    # merging, so it should then be possible to load all of them together to merge them
    while begin < totalPoints:
        end = min(end, totalPoints)
        partfile = partfileTemplate % (begin, end)
        partfiles += [ partfile ]

        mergeChunk(pointList, partfile, transfoFile, begin, end, select, exclude)
        begin, end = end, end + chunkSize

        horizontalLine()

    # make sure all histories are the same, if not do whatever it takes to reach that point
    # also "simplify" the histories so that they are the minimum history representation required
    # to get to the layout of the final dataset
    print('Harmonizing chunks so that they all have the same layout & history...')
    vldescs, nandescs, rdescs = harmonizeChunks(partfiles)
    rdescs = rdescs | set(excluded)
    horizontalLine()

    # merge all those partfiles together
    print('Assembling full dataset together...')
    dstotal = DataSet()

    for pfile in partfiles:
        print('Merging partfile', pfile)
        ds = DataSet()
        ds.load(pfile)
        dstotal.appendDataSet(ds)

    dstotal.save(outputFilename)

    # print a nice informative summary of what has been done to the dataset
    horizontalLine()

    msg = '''
Final dataset information
-------------------------

Number of points: %s

Descriptors removed:
  - because they were of variable length: %s
  - because they were either constant, contained NaN or contained Inf: %s
  - because they were removed explicitly: %s

Your dataset has been saved at %s'''

    # remove leading dot
    vldescs = sorted( d[1:] for d in vldescs )
    nandescs = sorted( d[1:] for d in nandescs )
    rdescs = sorted( d[1:] for d in rdescs )

    print(msg % (str(dstotal.size()), ', '.join(vldescs), ', '.join(nandescs), ', '.join(rdescs), outputFilename))

    # clean up temporary files
    for pfile in partfiles:
        os.remove(pfile)
        os.remove(pfile + '.raw')
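A hedged example of how mergeAll might be invoked; the file names and the exclude pattern below are placeholders. The call chunks the point list, merges each chunk, harmonizes the chunk layouts and assembles the final dataset.

# Illustrative call; 'points.yaml', 'transfo.yaml' and the exclude pattern are placeholders.
mergeAll(pointList='points.yaml',           # yaml map of point name -> analysis file
         outputFilename='merged_dataset.db',
         chunkSize=1000,                    # points per partial dataset
         transfoFile='transfo.yaml',        # transformations applied by mergeChunk
         select=None,
         exclude=['*.beats_position'])      # descriptor patterns to drop explicitly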
Esempio n. 44
0
class GaiaWrapper:
    def __init__(self):
        self.index_path = INDEX_DIR
        self.original_dataset = DataSet()
        self.original_dataset_path = self.__get_dataset_path(INDEX_NAME)
        self.metrics = {}
        self.view = None
        self.__load_dataset()

    def __get_dataset_path(self, ds_name):
        return os.path.join(INDEX_DIR, ds_name + ".db")

    def __load_dataset(self):
        # Loads the dataset, applies transforms if needed and saves. If the dataset does not exist, creates an empty one and saves it.

        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)

        # load original dataset
        if os.path.exists(self.original_dataset_path):
            self.original_dataset.load(self.original_dataset_path)
            if self.original_dataset.size() >= SIMILARITY_MINIMUM_POINTS:

                # if we have loaded a dataset of the correct size but it is unprepared, prepare it
                if self.original_dataset.history().size() <= 0:
                    self.__prepare_original_dataset()
                    self.__normalize_original_dataset()
                    self.original_dataset.save(self.original_dataset_path)

                # if we have loaded a dataset which has not been normalized, normalize it
                normalized = False
                for element in self.original_dataset.history().toPython():
                    if element["Analyzer name"] == "normalize":
                        normalized = True
                        break
                if not normalized:
                    self.__normalize_original_dataset()
                    self.original_dataset.save(self.original_dataset_path)

                # build metrics for the different similarity presets
                self.__build_metrics()
                # create view
                view = View(self.original_dataset)
                self.view = view

            logger.debug("Dataset loaded, size: %s points" % (self.original_dataset.size()))

        else:
            # If there is no existing dataset we create an empty one.
            # For the moment we do not create any distance metric nor a view because search won't be possible until the DB has a minimum of SIMILARITY_MINIMUM_POINTS
            self.original_dataset.save(self.original_dataset_path)
            logger.debug("Created new dataset, size: %s points (should be 0)" % (self.original_dataset.size()))

    def __prepare_original_dataset(self):
        logger.debug("Preparing the original dataset.")
        self.original_dataset = self.prepare_original_dataset_helper(self.original_dataset)

    def __normalize_original_dataset(self):
        logger.debug("Normalizing the original dataset.")
        self.original_dataset = self.normalize_dataset_helper(self.original_dataset)

    @staticmethod
    def prepare_original_dataset_helper(ds):
        proc_ds1 = transform(ds, "RemoveVL")
        proc_ds2 = transform(proc_ds1, "FixLength")
        proc_ds1 = None
        prepared_ds = transform(proc_ds2, "Cleaner")
        proc_ds2 = None

        return prepared_ds

    @staticmethod
    def normalize_dataset_helper(ds):
        # Remove ['.lowlevel.mfcc.cov','.lowlevel.mfcc.icov'] (they give errors when normalizing)
        ds = transform(ds, "remove", {"descriptorNames": [".lowlevel.mfcc.cov", ".lowlevel.mfcc.icov"]})
        # Add normalization
        normalization_params = {"descriptorNames": "*", "independent": True, "outliers": -1}
        normalized_ds = transform(ds, "normalize", normalization_params)
        ds = None

        return normalized_ds

    def __build_metrics(self):
        for preset in PRESETS:
            logger.debug("Bulding metric for preset %s" % preset)
            name = preset
            path = PRESET_DIR + name + ".yaml"
            preset_file = yaml.safe_load(open(path))
            distance = preset_file["distance"]["type"]
            parameters = preset_file["distance"]["parameters"]
            search_metric = DistanceFunctionFactory.create(str(distance), self.original_dataset.layout(), parameters)
            self.metrics[name] = search_metric

    def add_point(self, point_location, point_name):
        if self.original_dataset.contains(str(point_name)):
            self.original_dataset.removePoint(str(point_name))
        try:
            p = Point()
            p.load(str(point_location))
            p.setName(str(point_name))
            self.original_dataset.addPoint(p)
            size = self.original_dataset.size()
            logger.debug("Added point with name %s. Index has now %i points." % (str(point_name), size))
        except:
            msg = "Point with name %s could NOT be added. Index has now %i points." % (
                str(point_name), self.original_dataset.size())
            logger.debug(msg)
            return {"error": True, "result": msg}

        # If adding a new point brings us up to the minimum number of points for similarity, prepare the dataset,
        # save it and create the view and distance metrics.
        #   This will almost never happen: only the first time we start the similarity server, when there is no
        #   index created yet and we add the first 2000 points.
        if size == SIMILARITY_MINIMUM_POINTS:
            self.__prepare_original_dataset()
            self.__normalize_original_dataset()
            self.save_index(msg="(reaching 2000 points)")

            # build metrics for the different similarity presets
            self.__build_metrics()
            # create view
            view = View(self.original_dataset)
            self.view = view

        return {"error": False, "result": True}

    def delete_point(self, point_name):
        if self.original_dataset.contains(str(point_name)):
            self.original_dataset.removePoint(str(point_name))
            logger.debug(
                "Deleted point with name %s. Index has now %i points." % (str(point_name), self.original_dataset.size())
            )
            return {"error": False, "result": True}
        else:
            msg = "Can't delete point with name %s because it does not exist." % str(point_name)
            logger.debug(msg)
            return {"error": True, "result": msg}

    def get_point(self, point_name):
        logger.debug("Getting point with name %s" % str(point_name))
        if self.original_dataset.contains(str(point_name)):
            return self.original_dataset.point(str(point_name))

    def save_index(self, filename=None, msg=""):
        tic = time.time()
        path = self.original_dataset_path
        if filename:
            path = INDEX_DIR + filename + ".db"
        logger.debug("Saving index to (%s)..." % path + msg)
        self.original_dataset.save(path)
        toc = time.time()
        logger.debug(
            "Finished saving index (done in %.2f seconds, index has now %i points)."
            % ((toc - tic), self.original_dataset.size())
        )
        return {"error": False, "result": path}

    def contains(self, point_name):
        logger.debug("Checking if index has point with name %s" % str(point_name))
        return {"error": False, "result": self.original_dataset.contains(point_name)}

    # SIMILARITY SEARCH (WEB and API)
    def search_dataset(self, query_point, number_of_results, preset_name):
        preset_name = str(preset_name)
        query_point = str(query_point)
        logger.debug("NN search for point with name %s (preset = %s)" % (query_point, preset_name))
        size = self.original_dataset.size()
        if size < SIMILARITY_MINIMUM_POINTS:
            msg = "Not enough datapoints in the dataset (%s < %s)." % (size, SIMILARITY_MINIMUM_POINTS)
            logger.debug(msg)
            return {"error": True, "result": msg}
            # raise Exception('Not enough datapoints in the dataset (%s < %s).' % (size, SIMILARITY_MINIMUM_POINTS))

        if query_point.endswith(".yaml"):
            # The point doesn't exist in the dataset....
            # So, make a temporary point, add all the transformations
            # to it and search for it
            p, p1 = Point(), Point()
            p.load(query_point)
            p1 = self.original_dataset.history().mapPoint(p)
            similar_sounds = self.view.nnSearch(p1, self.metrics[preset_name]).get(int(number_of_results))
        else:
            if not self.original_dataset.contains(query_point):
                msg = "Sound with id %s doesn't exist in the dataset." % query_point
                logger.debug(msg)
                return {"error": True, "result": msg}
                # raise Exception("Sound with id %s doesn't exist in the dataset." % query_point)

            similar_sounds = self.view.nnSearch(query_point, self.metrics[preset_name]).get(int(number_of_results))

        return {"error": False, "result": similar_sounds}

    # CONTENT-BASED SEARCH (API)
    def query_dataset(self, query_parameters, number_of_results):

        size = self.original_dataset.size()
        if size < SIMILARITY_MINIMUM_POINTS:
            msg = "Not enough datapoints in the dataset (%s < %s)." % (size, SIMILARITY_MINIMUM_POINTS)
            logger.debug(msg)
            return {"error": True, "result": msg}
            # raise Exception('Not enough datapoints in the dataset (%s < %s).' % (size, SIMILARITY_MINIMUM_POINTS))

        trans_hist = self.original_dataset.history().toPython()
        layout = self.original_dataset.layout()

        # Get normalization coefficients to transform the input data (get info from the last transformation which has been a normalization)
        coeffs = None
        for i in range(0, len(trans_hist)):
            if trans_hist[-(i + 1)]["Analyzer name"] == "normalize":
                coeffs = trans_hist[-(i + 1)]["Applier parameters"]["coeffs"]
                break

        ##############
        # PARSE TARGET
        ##############

        # Transform input params to the normalized feature space and add them to a query point
        # If there are no params specified in the target, the point is set as empty (probably random sounds are returned)
        q = Point()
        q.setLayout(layout)
        feature_names = []
        # If some target has been specified...
        if query_parameters["target"].keys():
            for param in query_parameters["target"].keys():
                # Only add numerical parameters. Non numerical ones (like key) are only used as filters
                if param in coeffs.keys():
                    feature_names.append(str(param))
                    value = query_parameters["target"][param]
                    if coeffs:
                        a = coeffs[param]["a"]
                        b = coeffs[param]["b"]
                        if len(a) == 1:
                            norm_value = a[0] * value + b[0]
                        else:
                            norm_value = []
                            for i in range(0, len(a)):
                                norm_value.append(a[i] * value[i] + b[i])
                        # text = str(type(param)) + " " + str(type(norm_value))
                        q.setValue(str(param), norm_value)
                    else:
                        q.setValue(str(param), value)

        ##############
        # PARSE FILTER
        ##############

        filter = ""
        # If some filter has been specified...
        if query_parameters["filter"]:
            if isinstance(query_parameters["filter"], str):
                filter = query_parameters["filter"]
            else:
                filter = self.parse_filter_list(query_parameters["filter"], coeffs)

        #############
        # DO QUERY!!!
        #############

        logger.debug(
            "Content based search with target: " + str(query_parameters["target"]) + " and filter: " + str(filter)
        )
        metric = DistanceFunctionFactory.create("euclidean", layout, {"descriptorNames": feature_names})
        # Looks like that depending on the version of gaia, variable filter must go after or before the metric
        # For the gaia version we have currently (sep 2012) in freesound: nnSearch(query,filter,metric)
        # results = self.view.nnSearch(q,str(filter),metric).get(int(number_of_results)) # <- Freesound
        results = self.view.nnSearch(q, metric, str(filter)).get(int(number_of_results))

        return {"error": False, "result": results}

    # UTILS for content-based search
    def prepend_value_label(self, f):
        if f["type"] == "NUMBER" or f["type"] == "RANGE" or f["type"] == "ARRAY":
            return "value"
        else:
            return "label"

    def parse_filter_list(self, filter_list, coeffs):

        # TODO: eliminate this?
        # coeffs = None

        filter = "WHERE"
        for f in filter_list:
            if type(f) != dict:
                filter += f
            else:
                if f["type"] == "NUMBER" or f["type"] == "STRING" or f["type"] == "ARRAY":

                    if f["type"] == "NUMBER":
                        if coeffs:
                            norm_value = coeffs[f["feature"]]["a"][0] * f["value"] + coeffs[f["feature"]]["b"][0]
                        else:
                            norm_value = f["value"]
                    elif f["type"] == "ARRAY":
                        if coeffs:
                            norm_value = []
                            for i in range(len(f["value"])):
                                norm_value.append(
                                    coeffs[f["feature"]]["a"][i] * f["value"][i] + coeffs[f["feature"]]["b"][i]
                                )
                        else:
                            norm_value = f["value"]
                    else:
                        norm_value = f["value"]
                    filter += " " + self.prepend_value_label(f) + f["feature"] + "=" + str(norm_value) + " "

                else:
                    filter += " "
                    if f["value"]["min"]:
                        if coeffs:
                            norm_value = coeffs[f["feature"]]["a"][0] * f["value"]["min"] + coeffs[f["feature"]]["b"][0]
                        else:
                            norm_value = f["value"]["min"]
                        filter += self.prepend_value_label(f) + f["feature"] + ">" + str(norm_value) + " "
                    if f["value"]["max"]:
                        if f["value"]["min"]:
                            filter += "AND "
                        if coeffs:
                            norm_value = coeffs[f["feature"]]["a"][0] * f["value"]["max"] + coeffs[f["feature"]]["b"][0]
                        else:
                            norm_value = f["value"]["max"]
                        filter += self.prepend_value_label(f) + f["feature"] + "<" + str(norm_value) + " "

        return filter
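Finally, a short usage sketch for the GaiaWrapper class above, assuming INDEX_DIR and the preset files in PRESET_DIR already exist; the analysis file path, the sound id and the preset name are placeholders.

# Illustrative only; the analysis file path, sound id and preset name are placeholders.
wrapper = GaiaWrapper()

# Add an analysis point to the index and persist it
wrapper.add_point('/tmp/analysis/123456.yaml', '123456')
wrapper.save_index()

# Nearest-neighbour search, available once the index holds SIMILARITY_MINIMUM_POINTS points
result = wrapper.search_dataset('123456', 10, 'lowlevel')
if not result['error']:
    for point_name, distance in result['result']:
        print(point_name, distance)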