Example #1
    def testRCA(self):
        ds = testdata.loadTestDB()
        ds = transform(ds, 'removevl')
        ds = transform(ds, 'fixlength')
        ds = transform(ds, 'remove', { 'descriptorNames': '*cov' })
        ds = transform(ds, 'cleaner')
        ds = transform(ds, 'normalize')
        ds = transform(ds, 'pca', { 'resultName': 'pca15',
                                    'dimension': 15 })
        ds_rca = transform(ds, 'rca', { 'resultName': 'rca10',
                                        'dimension': 10,
                                        'classFile': testdata.RCA_GENRE_GT })

        v = View(ds_rca)
        dist = MetricFactory.create('euclidean', ds_rca.layout())
        self.compareResults(v.nnSearch('01 Cigarettes And Alcohol - Oasis.mp3', dist).get(10),
                            testdata.RCA_GENRE_RESULTS)

        # try again, this time passing the ground-truth map directly
        import gaia2.fastyaml as yaml
        ds_rca = transform(ds, 'rca', { 'resultName': 'rca10',
                                        'dimension': 10,
                                        'classMap': yaml.load(open(testdata.RCA_GENRE_GT).read()) })

        v = View(ds_rca)
        dist = MetricFactory.create('euclidean', ds_rca.layout())
        self.compareResults(v.nnSearch('01 Cigarettes And Alcohol - Oasis.mp3', dist).get(10),
                            testdata.RCA_GENRE_RESULTS)
Example #2
    def testParserStillInValidStateAfterParserError(self):
        '''ticket #20: parser is in invalid state after parser error'''
        ds = testdata.createSimpleDataSet()
        dist = MetricFactory.create('null', ds.layout())
        v = View(ds)

        result = v.nnSearch(ds.samplePoint(), dist, 'WHERE true').get(1)
        clause = 'WHERE label.tonal_key_mode.value = \\"major"'
        try:
            result = v.nnSearch(ds.samplePoint(), dist, clause).get(1)
        except:
            pass  # filter correctly failed to compile
        result = v.nnSearch(ds.samplePoint(), dist, 'WHERE true').get(1)
Example #3
    def testParsedVsConstructedFilters(self):
        ds = testdata.loadTestDB()
        ds = transform(ds, 'fixlength')

        p = ds.samplePoint()
        p2 = ds.point('Higher State of Consciousness.mp3')

        queries = [
            (p, '', ''), (p2, '', ''),
            (p2, 'WHERE value.tempotap_bpm.value > 140',
             Filter('tempotap_bpm.value', '>', 140)),
            (p, 'WHERE value.tempotap_bpm > 110',
             Filter('tempotap_bpm', '>', 110)),
            (p, 'WHERE value.tempotap_bpm > -10',
             Filter('tempotap_bpm', '>', -10)),
            (p, 'WHERE value.tempotap_bpm > 23000',
             Filter('tempotap_bpm', '>', 23000)),
            (p, 'WHERE value.tempotap_bpm > 120 AND value.tempotap_bpm < 130',
             AndFilter([
                 Filter('tempotap_bpm', '>', 120),
                 Filter('tempotap_bpm', '<', 130)
             ])),
            (p, 'WHERE value.tempotap_bpm BETWEEN 130 AND 120',
             Filter('tempotap_bpm', 'between', [130, 120])),
            (p, 'WHERE label.key_key = "C"', Filter('key_key', '==', 'C')),
            (p2,
             '''WHERE ((label.key_key = "A" AND label.key_mode = "major") OR
                                   (label.key_key = "E" AND label.key_mode = "minor"))
                                  AND value.tempotap_bpm < 90''',
             AndFilter([
                 OrFilter([
                     AndFilter([
                         Filter('key_key', '==', 'A'),
                         Filter('key_mode', '==', 'major')
                     ]),
                     AndFilter([
                         Filter('key_key', '==', 'E'),
                         Filter('key_mode', '==', 'minor')
                     ])
                 ]),
                 Filter('tempotap_bpm', '<', 90)
             ]))
        ]

        dist = MetricFactory.create('euclidean', ds.layout(),
                                    {'descriptorNames': '*.mean'})
        v = View(ds)

        for (pt, filtstr, filt) in queries:
            self.assertEqualSearchSpace(v.nnSearch(pt, dist, filtstr),
                                        v.nnSearch(pt, dist, filt))
Example #4
    def testKullbackLeibler(self):
        ds = transform(testdata.loadTestDB(), 'fixlength')

        # create a test dataset with more than 1000 points; otherwise the test is useless,
        # because the workload is split into chunks of 1000 points when computing the distance
        dstest = DataSet()
        ncopy = 20
        for cidx in range(ncopy):
            points = list(ds.points())
            for p in points:
                p.setName(p.name() + '-%d' % cidx)
            dstest.addPoints(points)

        # test that the Kullback-Leibler distance doesn't break with multithreading (it did in 2.2.1)
        v = View(dstest)
        dist = MetricFactory.create('kullbackleibler',
                                    dstest.layout(),
                                    { 'descriptorName': 'mfcc' })


        results = v.nnSearch(ds.samplePoint(), dist).get(6*ncopy)
        expected = [ 0.0 ]*2*ncopy + [ 6.1013755798339844 ]*ncopy
        expected += [ 6.4808731079101562 ]*2*ncopy + [ 6.7828292846679688 ]*ncopy

        for r, e in zip(results, expected):
            self.assertAlmostEqual(r[1], e, 5)
Example #5
    def testDeleteUnderlyingDataSet(self):
        ds = testdata.loadTestDB()

        params = {'descriptorNames': ['*.mean', '*.var']}

        ds = transform(ds, 'fixlength', params)
        ds = transform(ds, 'cleaner', params)
        ds = transform(ds, 'normalize', params)
        dist = MetricFactory.create('euclidean', ds.layout(), params)

        v = View(ds)
        del ds

        #self.assertRaises(Exception, v.nnSearch, '01 Respect.mp3')
        # this doesn't throw anymore, as the View keeps a ref to the dataset
        v.nnSearch('01 Respect.mp3', dist)
Example #6
    def testSubspaceSearch(self):
        ds = testdata.loadTestDB()
        ds = transform(ds, 'fixlength')
        dist = MetricFactory.create('euclidean', ds.layout(),
                                    {'descriptorNames': '*.mean'})
        v = View(ds)
        pid = 'Higher State of Consciousness.mp3'

        key_a_minor = v.nnSearch(
            pid, dist,
            'WHERE label.key_key = "A" AND label.key_mode = "minor"')
        key_a = v.nnSearch(pid, dist, 'WHERE label.key_key = "A"')
        key_minor = v.nnSearch(pid, dist, 'WHERE label.key_mode = "minor"')

        key_a_minor_sspace1 = v.nnSearch(pid, key_minor, dist,
                                         'WHERE label.key_key = "A"')
        key_a_minor_sspace2 = v.nnSearch(pid, key_a, dist,
                                         'WHERE label.key_mode = "minor"')

        self.assertEqualSearchSpace(key_a_minor, key_a_minor_sspace1)
        self.assertEqualSearchSpace(key_a_minor, key_a_minor_sspace2)
Example #7
def testValidPoint(dataset, clause, fromList=None):
    # search the point using the clause:
    # if we have a result, the clause was true
    # if we have no result, the clause was false
    v = View(dataset)
    dist = MetricFactory.create('null', dataset.layout())
    filtr = 'WHERE ' + clause
    if fromList:
        filtr = 'FROM ' + fromList + ' ' + filtr
    result = v.nnSearch(dataset.samplePoint(), dist, filtr).get(1)

    return len(result) == 1
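
The helper above treats a filter clause as a boolean test on the dataset's sample point: if the filtered nnSearch returns a result, the clause evaluated to true. A minimal usage sketch, assuming the same testdata module used throughout these tests; the clauses below are built only from boolean literals, which the filter parser accepts as shown in Example #14:

ds = testdata.createSimpleDataSet()

# clauses built only from literals do not depend on the dataset layout
assert testValidPoint(ds, '(true AND true) OR false')
assert not testValidPoint(ds, 'false')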
Example #8
    def testComplete(self):
        # load a Gaia 2.0 dataset and its transformation history, apply the history to the dataset,
        # and check that the nn-search results are the same as the ones we get from Gaia 2.0
        ds = DataSet()
        ds.load(testdata.GAIA_20_BACKWARDS_COMPAT_DATASET)

        h = TransfoChain()

        self.assertRaises(Exception, h.load, testdata.GAIA_20_BACKWARDS_COMPAT_HISTORY)
        # loading a Gaia 2.0 history is expected to raise, so the checks below are kept
        # for reference but are never executed
        return

        h.load(testdata.GAIA_20_BACKWARDS_COMPAT_HISTORY)

        ds = h.mapDataSet(ds)
        v = View(ds)
        dist = MetricFactory.create('euclidean', ds.layout())

        results = v.nnSearch('01 Respect.mp3', dist).get(5)
        self.compareResults(results, testdata.GAIA_20_BACKWARDS_COMPAT_RESULTS)

        ds21 = DataSet()
        ds21.load(testdata.TEST_DATABASE)
        results = v.nnSearch(h.mapPoint(ds21.point('01 Respect.mp3')), dist).get(5)
        self.compareResults(results, testdata.GAIA_20_BACKWARDS_COMPAT_RESULTS)
Example #9
    def get_neighbours(self, dataset, encoded_filename, number,
                       encoded_request=None):
        """Get a number of nearest neighbours."""
        view = View(dataset)
        request_point = self.gaia_db.point(
            encoded_request) if encoded_request else None
        try:
            total = view.nnSearch(
                encoded_filename, self.metric).get(number + 1)[1:]
        except Exception as e:
            print(e)
            return []

        result = sorted([
            (self.compute_score(
                score, name, request_point=request_point) * 1000,
             name)
            for name, score in total])
        if request_point:
            # Filter out the worst matches for the requested track
            return result[:max(1, number // 2)]  # integer division so the slice index is an int
        return result
Example #10
    def testCreatedInputSpace(self):
        ds = testdata.createSimpleDataSet()
        ds.point('p')['a.1'] = 23.0

        for i in range(5):
            p = Point()
            p.setName('p%d' % i)
            p.setLayout(ds.originalLayout())
            p['a.1'] = float(i)
            ds.addPoint(p)

        ds = transform(ds, 'fixlength')
        dist = MetricFactory.create('euclidean', ds.layout())
        v = View(ds)

        p = ds.point('p')

        RS_remove = v.nnSearch(p, dist)
        RS_remove.removePoints(['p2', 'p4'])

        RS_create = InputSpace()
        RS_create.addPoints(ds, ['p', 'p0', 'p1', 'p3'])

        rsc = v.nnSearch(p, RS_remove, dist)
        rsa = v.nnSearch(p, RS_create, dist)

        self.assertEqual((('p', 0.), ('p3', 20.), ('p1', 22.), ('p0', 23.)),
                         v.nnSearch(p, rsc, dist).get(10))

        self.assertEqual((('p', 0.), ('p3', 20.), ('p1', 22.), ('p0', 23.)),
                         v.nnSearch(p, rsa, dist).get(10))

        # test thresholdLimit method
        self.assertEqual((('p', 0.), ),
                         v.nnSearch(p, rsa, dist).thresholdLimit(10).get(10))
        self.assertEqual((('p', 0.), ('p3', 20.)),
                         v.nnSearch(p, rsa, dist).thresholdLimit(20).get(10))
        self.assertEqual((('p', 0.), ('p3', 20.), ('p1', 22.)),
                         v.nnSearch(p, rsa,
                                    dist).thresholdLimit(22.01).get(10))
Example #11
    def get_neighbours(self,
                       dataset,
                       encoded_filename,
                       number,
                       encoded_request=None):
        """Get a number of nearest neighbours."""
        view = View(dataset)
        request_point = self.gaia_db.point(
            encoded_request) if encoded_request else None
        try:
            total = view.nnSearch(encoded_filename,
                                  self.metric).get(number + 1)[1:]
        except Exception as e:
            print(e)
            return []

        result = sorted([
            (self.compute_score(score, name, request_point=request_point) *
             1000, name) for name, score in total
        ])
        if request_point:
            # Filter out the worst matches for the requested track
            return result[:max(1, number // 2)]  # integer division so the slice index is an int
        return result
Example #12
    def testSimple(self):
        ds = testdata.createSimpleDataSet()
        p2 = Point(ds.point('p'))
        p2.setName('p2')
        p3 = Point(ds.point('p'))
        p3.setName('p3')
        p3['a.1'] = 1
        p4 = Point(ds.point('p'))
        p4.setName('p4')
        p4['a.1'] = 1
        p4['a.2'] = 1
        ds.addPoint(p2)
        ds.addPoint(p3)
        ds.addPoint(p4)

        ds = transform(ds, 'fixlength')
        dist = MetricFactory.create('euclidean', ds.layout())
        v = View(ds)

        results = v.nnSearch('p', dist).get(10)
        self.assertEqual(results[0][1], 0.0)
        self.assertEqual(results[1][1], 0.0)
        self.assertSearchResultEqual(results[2], ('p3', 1.0))
        self.assertSearchResultEqual(results[3], ('p4', math.sqrt(2)))
Example #13
class GaiaWrapper:
    def __init__(self, indexing_only_mode=False):
        self.indexing_only_mode = indexing_only_mode
        self.index_path = sim_settings.INDEX_DIR
        self.original_dataset = DataSet()
        self.pca_dataset = DataSet()
        if not self.indexing_only_mode:
            self.original_dataset_path = self.__get_dataset_path(
                sim_settings.INDEX_NAME)
        else:
            self.original_dataset_path = self.__get_dataset_path(
                sim_settings.INDEXING_SERVER_INDEX_NAME)
        self.descriptor_names = {}
        self.metrics = {}
        self.view = None
        self.view_pca = None
        self.transformations_history = None

        self.__load_dataset()

    def __get_dataset_path(self, ds_name):
        return os.path.join(sim_settings.INDEX_DIR, ds_name + '.db')

    def __load_dataset(self):
        """
        Loads the dataset, does all the necessary steps to make it available for similarity queries and creates the PCA
        version of it. If dataset does not exist, creates a new empty one.
        NOTE: we assume that loaded datasets will have been prepared and normalized (see_
        _prepare_original_dataset() and __normalize_original_dataset()) on due time (see add_point() method below),
        therefore this function does not prepare or normalize loaded datasets.
        """

        if not os.path.exists(sim_settings.INDEX_DIR):
            os.makedirs(sim_settings.INDEX_DIR)

        # load original dataset
        if os.path.exists(self.original_dataset_path):
            self.original_dataset.load(self.original_dataset_path)
            self.__calculate_descriptor_names()

            if self.original_dataset.size(
            ) >= sim_settings.SIMILARITY_MINIMUM_POINTS and not self.indexing_only_mode:

                # Save transformation history so we do not need to compute it every time we need it
                self.transformations_history = self.original_dataset.history(
                ).toPython()

                # Build metrics for the different similarity presets, create a Gaia view
                self.__build_metrics()
                view = View(self.original_dataset)
                self.view = view

                # Compute PCA and create pca view and metric
                # NOTE: this step may take a long time if the dataset is big, but it only needs to be performed once
                # when the similarity server is loaded.
                self.pca_dataset = transform(
                    self.original_dataset, 'pca', {
                        'descriptorNames': sim_settings.PCA_DESCRIPTORS,
                        'dimension': sim_settings.PCA_DIMENSIONS,
                        'resultName': 'pca'
                    })
                self.pca_dataset.setReferenceDataSet(self.original_dataset)
                self.view_pca = View(self.pca_dataset)
                self.__build_pca_metric()

            if self.original_dataset.history().size() <= 0:
                logger.info('Dataset loaded, size: %s points' %
                            (self.original_dataset.size()))
            else:
                logger.info(
                    'Dataset loaded, size: %s points (%i fixed-length desc., %i variable-length desc.)'
                    % (self.original_dataset.size(),
                       len(self.descriptor_names['fixed-length']),
                       len(self.descriptor_names['variable-length'])))

        else:
            # If there is no existing dataset we create an empty one.
            # For the moment we do not create any distance metric nor a view because search won't be possible until
            # the DB has a minimum of SIMILARITY_MINIMUM_POINTS
            self.original_dataset.save(self.original_dataset_path)
            self.__calculate_descriptor_names()
            logger.info('Created new dataset, size: %s points (should be 0)' %
                        (self.original_dataset.size()))

    def __prepare_original_dataset(self):
        logger.info('Preparing the original dataset.')
        self.original_dataset = self.prepare_original_dataset_helper(
            self.original_dataset)
        self.__calculate_descriptor_names()

    def __normalize_original_dataset(self):
        logger.info('Normalizing the original dataset.')
        self.original_dataset = self.normalize_dataset_helper(
            self.original_dataset, self.descriptor_names['fixed-length'])

    def __calculate_descriptor_names(self):
        layout = self.original_dataset.layout()
        all_descriptor_names = layout.descriptorNames()
        fixed_length_descriptor_names = []
        variable_length_descriptor_names = []
        multidimensional_descriptor_names = []

        for name in all_descriptor_names:
            region = layout.descriptorLocation(name)
            if region.lengthType() == VariableLength:
                variable_length_descriptor_names.append(name)
            else:
                fixed_length_descriptor_names.append(name)
                try:
                    if region.dimension() > 1:
                        multidimensional_descriptor_names.append(name)
                except:  # TODO: exception too broad here...
                    pass

        self.descriptor_names = {
            'all': all_descriptor_names,
            'fixed-length': fixed_length_descriptor_names,
            'variable-length': variable_length_descriptor_names,
            'multidimensional': multidimensional_descriptor_names
        }

    @staticmethod
    def prepare_original_dataset_helper(ds):
        ds = transform(
            ds, 'FixLength'
        )  # Needed to optimize use of fixed-length descriptors and save memory
        ds = transform(
            ds, 'Cleaner'
        )  # Remove descriptors that will cause problems in further transformations
        try:
            ds = transform(ds, 'enumerate',
                           {'descriptorNames': ['.tonal.chords_progression']})
        except:  # TODO: exception too broad here...
            logger.info(
                'WARNING: enumerate transformation to .tonal.chords_progression could not be performed.'
            )
        return ds

    @staticmethod
    def normalize_dataset_helper(ds, descriptor_names):
        # NOTE: The "except" list of descriptors below should be reviewed if a new extractor is used. The point is to
        # remove descriptors can potentially break normalize transform (e.g. descriptors with value = 0)
        normalization_params = {
            "descriptorNames": descriptor_names,
            "except": [
                "*.min",
                "*.max",
                "tonal.chords_histogram",
            ],
            "independent": True,
            "outliers": -1
        }
        ds = transform(ds, 'normalize', normalization_params)
        return ds

    def __build_metrics(self):
        for preset in sim_settings.PRESETS:
            if preset != 'pca':  # PCA metric is built only after pca dataset is created so it should not be built here
                logger.info('Building metric for preset %s' % preset)
                name = preset
                path = sim_settings.PRESET_DIR + name + ".yaml"
                preset_file = yaml.safe_load(open(path))
                distance = preset_file['distance']['type']
                parameters = preset_file['distance']['parameters']
                search_metric = DistanceFunctionFactory.create(
                    str(distance), self.original_dataset.layout(), parameters)
                self.metrics[name] = search_metric

    def __build_pca_metric(self):
        logger.info('Building metric for preset pca')
        preset_file = yaml.safe_load(open(sim_settings.PRESET_DIR +
                                          "pca.yaml"))
        distance = preset_file['distance']['type']
        parameters = preset_file['distance']['parameters']
        search_metric = DistanceFunctionFactory.create(
            str(distance), self.pca_dataset.layout(), parameters)
        self.metrics['pca'] = search_metric

    def add_point(self, point_location, point_name):

        if self.original_dataset.contains(str(point_name)):
            self.original_dataset.removePoint(str(point_name))

        p = Point()
        if os.path.exists(str(point_location)):
            try:
                p.load(str(point_location))
                p.setName(str(point_name))
                if self.original_dataset.size(
                ) <= sim_settings.SIMILARITY_MINIMUM_POINTS:
                    # Add point to original_dataset because PCA dataset has not been created yet
                    self.original_dataset.addPoint(p)
                    msg = 'Added point with name %s. Index has now %i points.' % \
                          (str(point_name), self.original_dataset.size())
                    logger.info(msg)
                else:
                    # Add point to PCA dataset because it has been already created.
                    # PCA dataset will take care of adding the point to the original dataset as well.
                    self.pca_dataset.addPoint(p)
                    msg = 'Added point with name %s. Index has now %i points (pca index has %i points).' % \
                          (str(point_name), self.original_dataset.size(), self.pca_dataset.size())
                    logger.info(msg)

            except Exception as e:
                msg = 'Point with name %s could NOT be added (%s).' % (
                    str(point_name), str(e))
                logger.info(msg)
                return {
                    'error': True,
                    'result': msg,
                    'status_code': sim_settings.SERVER_ERROR_CODE
                }
        else:
            msg = 'Point with name %s could NOT be added because analysis file does not exist (%s).' % \
                  (str(point_name), str(point_location))
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        if self.original_dataset.size(
        ) == sim_settings.SIMILARITY_MINIMUM_POINTS:
            # Do enumerate
            try:
                self.original_dataset = transform(
                    self.original_dataset, 'enumerate',
                    {'descriptorNames': ['.tonal.chords_progression']})
            except:  # TODO: exception too broad here...
                logger.info(
                    'WARNING: enumerate transformation to .tonal.chords_progression could not be performed.'
                )

        # If adding this point makes the dataset reach the minimum number of points needed for similarity, prepare
        # the dataset so it can be used for search: prepare and normalize it, save it, and create the view and
        # distance metrics. This only happens once, when the dataset size reaches SIMILARITY_MINIMUM_POINTS.
        if self.original_dataset.size(
        ) == sim_settings.SIMILARITY_MINIMUM_POINTS and not self.indexing_only_mode:
            self.__prepare_original_dataset()
            self.__normalize_original_dataset()
            self.transformations_history = self.original_dataset.history(
            ).toPython()
            self.save_index(msg="(reaching %i points)" %
                            sim_settings.SIMILARITY_MINIMUM_POINTS)

            # TODO: the code below is repeated from __load_dataset() method, should be moved into a util function
            # Build metrics for the different similarity presets, create a Gaia view
            self.__build_metrics()
            view = View(self.original_dataset)
            self.view = view

            # Compute PCA and create pca view and metric
            # NOTE: this step may take a long time if the dataset is big, but it only needs to be performed once
            # when the similarity server is loaded.
            self.pca_dataset = transform(
                self.original_dataset, 'pca', {
                    'descriptorNames': sim_settings.PCA_DESCRIPTORS,
                    'dimension': sim_settings.PCA_DIMENSIONS,
                    'resultName': 'pca'
                })
            self.pca_dataset.setReferenceDataSet(self.original_dataset)
            self.view_pca = View(self.pca_dataset)
            self.__build_pca_metric()

        return {'error': False, 'result': msg}

    def delete_point(self, point_name):
        if self.original_dataset.contains(str(point_name)):
            if self.original_dataset.size(
            ) <= sim_settings.SIMILARITY_MINIMUM_POINTS:
                # Remove from original dataset
                self.original_dataset.removePoint(str(point_name))
            else:
                # Remove from pca dataset (pca dataset will take care of removing from original dataset too)
                self.pca_dataset.removePoint(str(point_name))
            logger.info(
                'Deleted point with name %s. Index has now %i points (pca index has %i points).'
                % (str(point_name), self.original_dataset.size(),
                   self.pca_dataset.size()))
            return {'error': False, 'result': True}
        else:
            msg = 'Can\'t delete point with name %s because it does not exist.' % str(
                point_name)
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.NOT_FOUND_CODE
            }

    def get_point(self, point_name):
        logger.info('Getting point with name %s' % str(point_name))
        if self.original_dataset.contains(str(point_name)):
            return self.original_dataset.point(str(point_name))

    def get_all_point_names(self):
        point_names = sorted(
            [int(name) for name in self.original_dataset.pointNames()])
        logger.info('Getting all point names (%i points)' % len(point_names))
        return {'error': False, 'result': point_names}

    def save_index(self, filename=None, msg=""):
        tic = time.time()
        path = self.original_dataset_path
        if filename:
            path = sim_settings.INDEX_DIR + filename + ".db"
        logger.info('Saving index to (%s)...' % path + msg)
        self.original_dataset.save(path)
        toc = time.time()
        logger.info(
            'Finished saving index (done in %.2f seconds, index has now %i points).'
            % ((toc - tic), self.original_dataset.size()))
        return {'error': False, 'result': path}

    def contains(self, point_name):
        logger.info('Checking if index has point with name %s' %
                    str(point_name))
        return {
            'error': False,
            'result': self.original_dataset.contains(point_name)
        }

    def get_sounds_descriptors(self,
                               point_names,
                               descriptor_names=None,
                               normalization=True,
                               only_leaf_descriptors=False):
        """
        Returns a list with the descriptor values for all requested point names
        """

        logger.info('Getting descriptors for points %s' %
                    ','.join([str(name) for name in point_names]))

        # Add dot '.' at the beginning of descriptor names if not present
        if descriptor_names:
            descriptor_names_aux = list()
            for name in descriptor_names:
                if name[0] != '.':
                    descriptor_names_aux.append('.' + name)
                else:
                    descriptor_names_aux.append(name)
            descriptor_names = descriptor_names_aux[:]
        data = dict()
        required_descriptor_names = self.__calculate_complete_required_descriptor_names(
            descriptor_names, only_leaf_descriptors=only_leaf_descriptors)

        if type(required_descriptor_names) == dict:
            return required_descriptor_names  # There has been an error

        for point_name in point_names:
            sound_descriptors = self.__get_point_descriptors(
                point_name, required_descriptor_names, normalization)
            if 'error' not in sound_descriptors:
                data[point_name] = sound_descriptors

        return {'error': False, 'result': data}

    def __calculate_complete_required_descriptor_names(
            self, descriptor_names, only_leaf_descriptors=False):
        if not descriptor_names:
            descriptor_names = self.descriptor_names['all'][:]
        try:
            structured_layout = generate_structured_dict_from_layout(
                self.descriptor_names['all'][:])
            processed_descriptor_names = []
            for name in descriptor_names:
                nested_descriptors = get_nested_dictionary_value(
                    name.split('.')[1:], structured_layout)
                if not nested_descriptors:
                    processed_descriptor_names.append(name)
                else:
                    if only_leaf_descriptors:
                        # only return descriptors if nested descriptors are statistics
                        if len(
                                set(nested_descriptors.keys()).intersection([
                                    'min', 'max', 'dvar2', 'dmean2', 'dmean',
                                    'var', 'dvar', 'mean'
                                ])) > 0:
                            for extra_name in nested_descriptors.keys():
                                processed_descriptor_names.append(
                                    '%s.%s' % (name, extra_name))
                    else:
                        # Return all nested descriptor names
                        extra_names = []
                        get_nested_descriptor_names(nested_descriptors,
                                                    extra_names)
                        for extra_name in extra_names:
                            processed_descriptor_names.append(
                                '%s.%s' % (name, extra_name))
            processed_descriptor_names = list(set(processed_descriptor_names))
            return processed_descriptor_names
        except:
            return {
                'error': True,
                'result': 'Wrong descriptor names, unable to create layout.',
                'status_code': sim_settings.BAD_REQUEST_CODE
            }

    def __get_point_descriptors(self,
                                point_name,
                                required_descriptor_names,
                                normalization=True):
        """
        Get normalization coefficients to transform the input data (get info from the last transformation which has
        been a normalization)
        """

        normalization_coeffs = None
        if not normalization:
            trans_hist = self.transformations_history
            for i in range(0, len(trans_hist)):
                if trans_hist[-(i + 1)]['Analyzer name'] == 'normalize':
                    normalization_coeffs = trans_hist[-(
                        i + 1)]['Applier parameters']['coeffs']

        required_layout = generate_structured_dict_from_layout(
            required_descriptor_names)
        try:
            p = self.original_dataset.point(str(point_name))
        except:
            return {
                'error': True,
                'result': 'Sound does not exist in gaia index.',
                'status_code': sim_settings.NOT_FOUND_CODE
            }

        for descriptor_name in required_descriptor_names:
            try:
                value = p.value(str(descriptor_name))
                if normalization_coeffs:
                    if descriptor_name in normalization_coeffs:
                        a = normalization_coeffs[descriptor_name]['a']
                        b = normalization_coeffs[descriptor_name]['b']
                        if len(a) == 1:
                            value = float(value - b[0]) / a[0]
                        else:
                            normalized_value = []
                            for i in range(0, len(a)):
                                normalized_value.append(
                                    float(value[i] - b[i]) / a[i])
                            value = normalized_value
            except:
                try:
                    value = p.label(str(descriptor_name))
                except:
                    value = None

            if descriptor_name[0] == '.':
                descriptor_name = descriptor_name[1:]
            set_nested_dictionary_value(descriptor_name.split('.'),
                                        required_layout, value)
        return required_layout

    # SIMILARITY SEARCH and CONTENT SEARCH

    def search_dataset(self,
                       query_point,
                       number_of_results,
                       preset_name,
                       offset=0):
        preset_name = str(preset_name)
        results = []
        count = 0
        size = self.original_dataset.size()
        if size < sim_settings.SIMILARITY_MINIMUM_POINTS:
            msg = 'Not enough datapoints in the dataset (%s < %s).' % (
                size, sim_settings.SIMILARITY_MINIMUM_POINTS)
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        query_point = str(query_point)
        logger.info('NN search for point with name %s (preset = %s)' %
                    (query_point, preset_name))
        results = []

        if not self.original_dataset.contains(query_point):
            msg = "Sound with id %s doesn't exist in the dataset." % query_point
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.NOT_FOUND_CODE
            }
        if preset_name == 'pca':
            # Search on PCA view
            search = self.view_pca.nnSearch(query_point,
                                            self.metrics[preset_name])
        else:
            # Search on original dataset view
            search = self.view.nnSearch(query_point, self.metrics[preset_name])
        results = search.get(int(number_of_results), offset=int(offset))
        count = search.size()

        return {'error': False, 'result': {'results': results, 'count': count}}

    def api_search(self, target_type, target, filter, preset_name,
                   metric_descriptor_names, num_results, offset, in_ids):

        # Check if index has sufficient points
        size = self.original_dataset.size()
        if size < sim_settings.SIMILARITY_MINIMUM_POINTS:
            msg = 'Not enough datapoints in the dataset (%s < %s).' % (
                size, sim_settings.SIMILARITY_MINIMUM_POINTS)
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        # Get some dataset parameters that will be useful later
        trans_hist = self.transformations_history
        layout = self.original_dataset.layout()
        pca_layout = self.pca_dataset.layout()
        coeffs = None  # Get normalization coefficients
        for i in range(0, len(trans_hist)):
            if trans_hist[-(i + 1)]['Analyzer name'] == 'normalize':
                coeffs = trans_hist[-(i + 1)]['Applier parameters']['coeffs']

        # Process target
        if target:
            if target_type == 'sound_id':
                query_point = str(target)
                if not self.original_dataset.contains(query_point):
                    msg = "Sound with id %s doesn't exist in the dataset and can not be set as similarity target." \
                          % query_point
                    logger.info(msg)
                    return {
                        'error': True,
                        'result': msg,
                        'status_code': sim_settings.NOT_FOUND_CODE
                    }
                else:
                    query = query_point

            elif target_type == 'descriptor_values':
                # Transform input params to the normalized feature space and add them to a query point
                # If there are no params specified in the target, the point is set as empty (probably random sounds
                # are returned)
                feature_names = []
                query = Point()
                query.setLayout(layout)
                try:
                    for param in target.keys():
                        # Only add numerical parameters. Non numerical ones (like key) are only used as filters
                        if param in coeffs.keys():
                            feature_names.append(str(param))
                            value = target[param]
                            if coeffs:
                                a = coeffs[param]['a']
                                b = coeffs[param]['b']
                                if len(a) == 1:
                                    norm_value = a[0] * value + b[0]
                                else:
                                    norm_value = []
                                    for i in range(0, len(a)):
                                        norm_value.append(a[i] * value[i] +
                                                          b[i])
                                query.setValue(str(param), norm_value)
                            else:
                                query.setValue(str(param), value)
                except:
                    return {
                        'error': True,
                        'result':
                        'Invalid target (descriptor values could not be correctly parsed)',
                        'status_code': sim_settings.BAD_REQUEST_CODE
                    }

                # Overwrite metric with present descriptors in target
                metric = DistanceFunctionFactory.create(
                    'euclidean', layout, {'descriptorNames': feature_names})

            elif target_type == 'file':
                # Target is specified as the attached file
                # Create a point with the data in 'descriptors_data' and search for it
                target_file_parsing_type = '-'

                try:
                    # Try directly loading the file
                    p, query = Point(), Point()
                    p.loadFromString(yaml.dump(target))
                    if preset_name == 'pca':
                        query = self.pca_dataset.history().mapPoint(
                            p)  # map point to pca dataset
                    else:
                        query = self.original_dataset.history().mapPoint(
                            p)  # map point to original dataset
                    target_file_parsing_type = 'mapPoint'

                except Exception as e:
                    logger.info(
                        'Unable to create gaia point from uploaded file (%s). '
                        'Trying adding descriptors one by one.' % e)

                    # If does not work load descriptors one by one
                    try:
                        query = Point()
                        #query.setLayout(layout)

                        feature_names = []
                        get_nested_descriptor_names(target, feature_names)
                        feature_names = [
                            '.%s' % item for item in feature_names
                        ]
                        nonused_features = []

                        for param in feature_names:
                            if param in coeffs.keys():
                                value = get_nested_dictionary_value(
                                    param[1:].split('.'), target)
                                if coeffs:
                                    try:
                                        a = coeffs[param]['a']
                                        b = coeffs[param]['b']
                                        if len(a) == 1:
                                            norm_value = a[0] * value + b[0]
                                        else:
                                            norm_value = []
                                            for i in range(0, len(a)):
                                                norm_value.append(a[i] *
                                                                  value[i] +
                                                                  b[i])
                                        query.setValue(str(param[1:]),
                                                       norm_value)
                                    except:
                                        nonused_features.append(param)
                                else:
                                    query.setValue(str(param[1:]), value)
                            else:
                                nonused_features.append(param)

                        if preset_name == 'pca':
                            query = self.pca_dataset.history().mapPoint(
                                query)  # map point to pca dataset
                        else:
                            query = self.original_dataset.history().mapPoint(
                                query)  # map the manually-built point to the original dataset

                        target_file_parsing_type = 'walkDict'

                    except Exception as e:
                        logger.info(
                            'Unable to create gaia point from uploaded file and adding descriptors one by '
                            'one (%s)' % e)
                        return {
                            'error':
                            True,
                            'result':
                            'Unable to create gaia point from uploaded file. Probably the '
                            'file does not have the required layout. Are you using the '
                            'correct version of Essentia\'s Freesound extractor?',
                            'status_code':
                            sim_settings.SERVER_ERROR_CODE
                        }
        else:
            query = Point()  # Empty target
            if preset_name == 'pca':
                query.setLayout(pca_layout)
            else:
                query.setLayout(layout)

        # Process filter
        if filter:
            filter = parse_filter_list(filter, coeffs)
        else:
            filter = ""  # Empty filter

        # log
        log_message = 'Similarity search'
        if target:
            if target_type == 'sound_id':
                log_target = '%s (sound id)' % str(target)
            elif target_type == 'descriptor_values':
                log_target = '%s (descriptor values)' % str(target)
            elif target_type == 'file':
                log_target = 'uploaded file (%s)' % target_file_parsing_type
            log_message += ' with target: %s' % log_target
        if filter:
            log_message += ' with filter: %s' % str(filter)
        logger.info(log_message)

        # if in_ids is specified, edit the filter accordingly
        if in_ids:
            if not filter:
                filter = 'WHERE point.id IN ("' + '", "'.join(in_ids) + '")'
            else:
                filter += ' AND point.id IN ("' + '", "'.join(in_ids) + '")'

        # Set query metric
        metric = self.metrics[preset_name]
        if metric_descriptor_names:
            metric = DistanceFunctionFactory.create(
                'euclidean', layout,
                {'descriptorNames': metric_descriptor_names})

        # Do query!
        try:
            if target_type == 'descriptor_values' and target:
                search = self.view.nnSearch(query, metric, str(filter))
            else:
                if preset_name == 'pca':
                    search = self.view_pca.nnSearch(query, metric, str(filter))
                else:
                    search = self.view.nnSearch(query, metric, str(filter))
            results = search.get(num_results, offset=offset)
            count = search.size()
        except Exception as e:
            return {
                'error': True,
                'result': 'Similarity server error',
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        note = None
        if target_type == 'file':
            if target_file_parsing_type == 'walkDict':
                note = 'The layout of the given analysis file differed from what we expected. Similarity results ' \
                       'might not be accurate. Was the file generated with the last version of Essentia\'s ' \
                       'Freesound extractor?'

        return {
            'error': False,
            'result': {
                'results': results,
                'count': count,
                'note': note
            }
        }
Example #14
    def testRegressionIndexing(self):
        ds = testdata.loadTestDB()
        ds = transform(ds, 'removevl')
        ds = transform(ds, 'fixlength')

        p = ds.samplePoint().name()
        p2 = 'Higher State of Consciousness.mp3'

        queries = [
            (p, ''), (p2, ''),
            (p, 'WHERE (True AND True) and (true and TRUE)'),
            (p, 'WHERE (false AND True) OR (true and false)'),
            (p2, 'WHERE value.tempotap_bpm.value > 140'),
            (p, 'WHERE true AND value.tempotap_bpm.value > 140'),
            (p, 'WHERE value.tempotap_bpm > 110'),
            (p, 'WHERE value.tempotap_bpm > -10'),
            (p, 'WHERE value.tempotap_bpm < -10'),
            (p, 'WHERE value.tempotap_bpm > 23000'),
            (p, 'WHERE value.tempotap_bpm < 23000'),
            (p, 'WHERE value.tempotap_bpm > 120 AND value.tempotap_bpm < 130'),
            (p, 'WHERE value.tempotap_bpm BETWEEN 120 AND 130'),
            (p, 'WHERE value.tempotap_bpm BETWEEN 130 AND 120'),
            (p, 'WHERE value.tempotap_bpm BETWEEN 120 AND 120'),
            (p, 'WHERE value.tempotap_bpm BETWEEN -2.3 AND 4096'),
            (p, "WHERE value.tempotap_bpm BETWEEN -2.3 AND -1.4"),
            (p, "WHERE value.tempotap_bpm BETWEEN 2048 AND 4096"),
            (p, 'WHERE label.key_key = "C"'), (p,
                                               'WHERE label.key_key != "C"'),
            (p, 'WHERE label.key_key = "X"'),
            (p, 'WHERE label.key_key != "X"'),
            (p, 'WHERE label.key_key != "C" AND label.key_mode != "major"'),
            (p2,
             '''WHERE ((label.key_key = "A" AND label.key_mode = "major") OR
                                   (label.key_key = "E" AND label.key_mode = "minor"))
                                  AND value.tempotap_bpm < 90''')
        ]

        # test with standard views
        dist = MetricFactory.create('euclidean', ds.layout(),
                                    {'descriptorNames': '*.mean'})
        v = View(ds)
        vidx = View(ds)
        vidx.indexOn('tempotap_bpm')
        vidx.indexOn('key_key')
        vidx.indexOn('key_mode')

        for (pt, filt) in queries:
            self.assertEqualSearchSpace(v.nnSearch(pt, dist, filt),
                                        vidx.nnSearch(pt, dist, filt))

        # test with frozen views
        dsr = transform(
            ds, 'select',
            {'descriptorNames': ds.layout().descriptorNames(RealType)})
        dsnorm = transform(dsr, 'normalize')
        dspca = transform(dsnorm, 'pca', {
            'resultName': 'pca',
            'dimension': 25
        })
        fds = FrozenDataSet()
        fds.fromDataSet(dspca)
        fds.setReferenceDataSet(ds)
        dist = FrozenDistanceFactory.create('Euclidean', fds.layout(),
                                            {'descriptorName': 'pca'})

        v = FrozenView(fds)
        vidx = FrozenView(fds)
        vidx.indexOn('tempotap_bpm')
        vidx.indexOn('key_key')
        vidx.indexOn('key_mode')

        for (pt, filt) in queries:
            self.assertEqualSearchSpace(v.nnSearch(pt, dist, filt),
                                        vidx.nnSearch(pt, dist, filt))
Example #15
def search(dataset, id, n):
    v = View(dataset)
    dist = MetricFactory.create('euclidean', dataset.layout())
    return v.nnSearch(id, dist).get(n)
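
A minimal sketch of calling this helper. The dataset file name and point id below are hypothetical; as in the other examples, variable-length descriptors are removed and 'fixlength' is applied first so that a euclidean metric can be built on the layout:

from gaia2 import DataSet, transform

ds = DataSet()
ds.load('my_dataset.db')         # hypothetical dataset file
ds = transform(ds, 'removevl')   # drop variable-length descriptors
ds = transform(ds, 'fixlength')  # needed before building a euclidean metric

# 5 nearest neighbours of a point that is already in the dataset (hypothetical id)
for name, distance in search(ds, 'some_track.mp3', 5):
    print(name, distance)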
Example #16
class GaiaWrapper:
    def __init__(self):
        self.as_dataset = DataSet()
        self.tag_dataset = DataSet()
        self.fs_dataset = DataSet()
        self.ac_dataset = DataSet()
        self.gaia_similiarity = None

        self.index_path = clust_settings.INDEX_DIR

        self.as_view = None
        self.as_metric = None
        self.tag_view = None
        self.tag_metric = None
        self.fs_view = None
        self.fs_metric = None
        self.ac_view = None
        self.ac_metric = None

        self.__load_datasets()

    def __get_dataset_path(self, ds_name):
        return os.path.join(clust_settings.INDEX_DIR, ds_name + '.db')

    def __load_datasets(self):
        self.as_dataset.load(self.__get_dataset_path(clust_settings.INDEX_NAME_AS))
        self.as_view = View(self.as_dataset)
        # self.as_metric = DistanceFunctionFactory.create('euclidean', self.as_dataset.layout())
        # self.as_metric = DistanceFunctionFactory.create('CosineSimilarity',  self.as_dataset.layout())
        # self.as_metric = DistanceFunctionFactory.create('CosineAngle',  self.as_dataset.layout())
        self.as_metric = DistanceFunctionFactory.create('Manhattan',  self.as_dataset.layout())

        self.tag_dataset.load(self.__get_dataset_path(clust_settings.INDEX_NAME_TAG))
        self.tag_view = View(self.tag_dataset)
        self.tag_metric = DistanceFunctionFactory.create('euclidean', self.tag_dataset.layout())

        self.fs_dataset.load(self.__get_dataset_path(clust_settings.INDEX_NAME_FS))
        self.fs_view = View(self.fs_dataset)
        self.fs_metric = DistanceFunctionFactory.create('euclidean', self.fs_dataset.layout(), {'descriptorNames': 'pca'})

        # self.gaia_similiarity = GaiaWrapperSimilarity()

        self.__load_ac_descriptors_dataset()

    def __load_ac_descriptors_dataset(self):
        self.ac_dataset.load(self.__get_dataset_path('FS_AC_descriptors_normalized'))  # TODO: add this in clustering settings
        self.ac_view = View(self.ac_dataset)
        self.ac_metric = DistanceFunctionFactory.create('euclidean', self.ac_dataset.layout(), 
            {'descriptorNames': [
                'ac_brightness', 
                'ac_boominess', 
                'ac_depth', 
                'ac_hardness', 
                'ac_roughness', 
                'ac_sharpness', 
                'ac_warmth'
            ]})

    def search_nearest_neighbors(self, sound_id, k, in_sound_ids=None, features='audio_as'):
        if in_sound_ids:
            filter = 'WHERE point.id IN ("' + '", "'.join(in_sound_ids) + '")'
        else:
            filter = None
        nearest_neighbors = []  # stays empty if an unrecognised features value is passed
        try:
            if features == 'audio_as':
                nearest_neighbors = self.as_view.nnSearch(sound_id, self.as_metric, filter).get(k)[1:]
            elif features == 'tag':
                nearest_neighbors = self.tag_view.nnSearch(sound_id, self.tag_metric, filter).get(k)[1:]
            elif features == 'audio_fs':
                nearest_neighbors = self.gaia_similiarity.view_pca.nnSearch(sound_id, 
                                                                            self.gaia_similiarity.metrics['pca'], 
                                                                            filter).get(k)[1:]
            elif features == 'audio_fs_selected':
                nearest_neighbors = self.fs_view.nnSearch(sound_id, self.fs_metric, filter).get(k)[1:]
            elif features == 'audio_ac':
                nearest_neighbors = self.ac_view.nnSearch(sound_id, self.ac_metric, filter).get(k)[1:]

            if not nearest_neighbors:
                logger.info("No nearest neighbors found for point with id '{}'".format(sound_id))
            return nearest_neighbors
        except Exception as e:
            logger.info(e)
            return []

    def return_sound_tag_features(self, sound_ids):
        tag_features = []
        for sound_id in sound_ids:
            try:
                tag_features.append(self.tag_dataset.point(sound_id).value('tags_lda'))  # TODO: add this in clustering settings
            except Exception as e:
                #logger.info(e)
                tag_features.append(None)
        return tag_features