Example 1
def generateProject():
    parser = OptionParser(
        usage=
        '%prog [options] groundtruth_file filelist_file project_file datasets_dir results_dir'
    )

    options, args = parser.parse_args()

    try:
        groundtruth_file = args[0]
        filelist_file = args[1]
        project_file = args[2]
        datasets_dir = args[3]
        results_dir = args[4]
    except:
        parser.print_help()
        sys.exit(1)

    gt = yaml.load(open(groundtruth_file, 'r'))
    try:
        className = gt['className']
        groundTruth = gt['groundTruth']
    except:
        print groundtruth_file, "groundtruth file has incorrect format"
        sys.exit(2)

    fl = yaml.load(open(filelist_file, 'r'))

    gt_trackids = groundTruth.keys()
    fl_trackids = fl.keys()

    # check that there are no duplicate ids
    if len(gt_trackids) != len(set(gt_trackids)):
        print groundtruth_file, "contains duplicate track ids"
        sys.exit(3)

    if len(fl_trackids) != len(set(fl_trackids)):
        print filelist_file, "contains duplicate track ids"
        sys.exit(3)

    # check if filelist is consistent with groundtruth (no files missing)
    if set(gt_trackids) != set(fl_trackids):
        print "track ids found in", groundtruth_file, "are inconsistent with", filelist_file
        sys.exit(4)

    # write the project file
    with open(project_file, 'w') as pfile:
        pfile.write(
            PROJECT_TEMPLATE % {
                'className': className,
                'datasetsDirectory': abspath(datasets_dir),
                'resultsDirectory': abspath(results_dir),
                'filelist': abspath(filelist_file),
                'groundtruth': abspath(groundtruth_file)
            })

    print 'Successfully written', project_file
Example 2
def generateProject():
    parser = OptionParser(usage = '%prog [options] groundtruth_file filelist_file project_file datasets_dir results_dir')

    options, args = parser.parse_args()

    try:
        groundtruth_file = args[0]
        filelist_file = args[1]
        project_file = args[2]
        datasets_dir = args[3]
        results_dir = args[4]
    except:
        parser.print_help()
        sys.exit(1)

    gt = yaml.load(open(groundtruth_file, 'r'))
    try:
        className = gt['className']
        groundTruth = gt['groundTruth']   
    except:
        print groundtruth_file, "groundtruth file has incorrect format"
        sys.exit(2)

    fl = yaml.load(open(filelist_file, 'r'))

    gt_trackids = groundTruth.keys()
    fl_trackids = fl.keys()

    # check that there are no duplicate ids
    if len(gt_trackids) != len(set(gt_trackids)):
        print groundtruth_file, "contains duplicate track ids"
        sys.exit(3)

    if len(fl_trackids) != len(set(fl_trackids)):
        print filelist_file, "contains duplicate track ids"
        sys.exit(3)

    # check if filelist is consistent with groundtruth (no files missing)
    if set(gt_trackids) != set(fl_trackids):
        print "track ids found in", groundtruth_file, "are inconsistent with", filelist_file
        sys.exit(4)

    # write the project file
    with open(project_file, 'w') as pfile:
        pfile.write(PROJECT_TEMPLATE % { 'className': className,
                                         'datasetsDirectory': abspath(datasets_dir),
                                         'resultsDirectory': abspath(results_dir),
                                         'filelist': abspath(filelist_file),
                                         'groundtruth': abspath(groundtruth_file) })

    print 'Successfully written', project_file
Example 3
    def testValues(self):
        collection = yaml.load(open(testdata.TEST_DATABASE_FILES, 'r').read())

        # prepend 'data/' to the filenames
        for pid, filename in collection.items():
            collection[pid] = 'data/' + filename

        cvar.verbose = False
        ds = DataSet.mergeFiles(collection)
        cvar.verbose = True

        self.assertAlmostEqual(ds.point('Panic-The Smiths.mp3').value('danceability'),
                               0.5691167712)

        self.assertAlmostEqual(ds.point('11 Go.mp3').value('energy.mean'),
                               0.0231081359)

        self.assertAlmostEqual(ds.point('03 The Chopper [Shy FX Remix].mp3').value('chords_number_rate'),
                               0.0551007539)

        self.assertEqual(ds.point('08 I Can\'t Dance - Genesis.mp3').label('key_key'),
                         'D#')

        self.assertEqual(ds.point('06 Booo!.mp3').label('chords_mode'),
                         'major')

        ds.save(testdata.TEST_DATABASE)
Example 4
    def testValues(self):
        collection = yaml.load(open(testdata.TEST_DATABASE_FILES, 'r').read())

        # prepend 'data/' to the filenames
        for pid, filename in list(collection.items()):
            collection[pid] = 'data/' + filename

        cvar.verbose = False
        ds = DataSet.mergeFiles(collection)
        cvar.verbose = True

        self.assertAlmostEqual(
            ds.point('Panic-The Smiths.mp3').value('danceability'),
            0.5691167712)

        self.assertAlmostEqual(
            ds.point('11 Go.mp3').value('energy.mean'), 0.0231081359)

        self.assertAlmostEqual(
            ds.point('03 The Chopper [Shy FX Remix].mp3').value(
                'chords_number_rate'), 0.0551007539)

        self.assertEqual(
            ds.point('08 I Can\'t Dance - Genesis.mp3').label('key_key'), 'D#')

        self.assertEqual(
            ds.point('06 Booo!.mp3').label('chords_mode'), 'major')

        ds.save(testdata.TEST_DATABASE)
Example 5
def convertJsonToSig(filelist_file, result_filelist_file):
    fl = yaml.load(open(filelist_file, 'r'))

    result_fl = fl
    errors = []

    for trackid, json_file in fl.iteritems():
        try:
            data = json.load(open(json_file))

            # remove descriptors that will otherwise break gaia_fusion due to incompatibility of layouts
            if 'tags' in data['metadata']:
                del data['metadata']['tags']
            if 'sample_rate' in data['metadata']['audio_properties']:
                del data['metadata']['audio_properties']['sample_rate']

            sig_file = os.path.splitext(json_file)[0] + '.sig'

            yaml.dump(data, open(sig_file, 'w'))
            result_fl[trackid] = sig_file

        except:
            errors += [json_file]

    yaml.dump(result_fl, open(result_filelist_file, 'w'))

    print "Failed to convert", len(errors), "files:"
    for e in errors:
        print e

    return len(errors) == 0
Example 6
def convertJsonToSig(filelist_file, result_filelist_file):
    fl = yaml.load(open(filelist_file, 'r'))

    result_fl = fl
    errors = []

    for trackid, json_file in fl.iteritems():
        try:
            data = json.load(open(json_file))

            # remove descriptors that will otherwise break gaia_fusion due to incompatibility of layouts
            if 'tags' in data['metadata']:
                del data['metadata']['tags']
            if 'sample_rate' in data['metadata']['audio_properties']:
                del data['metadata']['audio_properties']['sample_rate']
            if 'lossless' in data['metadata']['audio_properties']:
                del data['metadata']['audio_properties']['lossless']

            sig_file = os.path.splitext(json_file)[0] + '.sig'

            yaml.safe_dump(data, open(sig_file, 'w'))
            result_fl[trackid] = sig_file

        except:
            errors += [json_file]

    yaml.dump(result_fl, open(result_filelist_file, 'w'))

    print("Failed to convert", len(errors), "files:")
    for e in errors:
        print(e)

    return len(errors) == 0
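A minimal usage sketch for the function above (the file names here are placeholders, not paths from the project); convertJsonToSig returns True only when every file in the filelist was converted:

# Illustration only; both paths are hypothetical.
if not convertJsonToSig('filelist.yaml', 'filelist_sig.yaml'):
    sys.exit(1)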
Example 7
    def load(self, filename):
        with open(filename) as f:
            data = yaml.load(f.read())

        # convert to a defaultdict the data we just loaded
        self.matrix = defaultdict(lambda: defaultdict(list))
        for k, v in data.items():
            self.matrix[k] = defaultdict(list, v)
Example 9
    def __call__(self, *args, **kwargs):
        if kwargs:
            raise NotImplementedError('Cannot use keyword arguments with YamlRPC at the moment...')

        if VERBOSE: serializeStart = time.time()

        try:
            q = yaml.dump({ 'method': self.methodName,
                            'params': list(args),
                            'id': 'gloubi-boulga'
                            })
        except:
            raise RuntimeError('Could not serialize Yaml request, most likely one of the arguments could not be serialized:\n%s' % list(args))

        if VERBOSE:
            responseTime = time.time() - serializeStart
            print ('serialized request in %f seconds' % responseTime)


        # we don't want the '+'-quoting
        params = urlencode({ 'q': q }).replace('+', ' ')

        headers = { 'Content-type': 'application/x-www-form-urlencoded',
                    'Accept': 'text/plain'
                    }

        if VERBOSE: startTime = time.time()

        conn = http_client.HTTPConnection(self.endPoint)

        try:
            conn.request('POST', '/', params, headers)
        except Exception as e:
            raise RuntimeError('request failed', self.endPoint, self.methodName, args, e)

        response = conn.getresponse()

        if VERBOSE:
            responseTime = time.time() - startTime
            print ('received answer in %f seconds' % responseTime)
            #print response.status, response.reason

            startParseTime = time.time()

        result = yaml.load(response.read())

        if VERBOSE:
            responseTime = time.time() - startParseTime
            print ('parsed answer in %f seconds' % responseTime)

            responseTime = time.time() - serializeStart
            print ('total time: %f seconds' % responseTime)

        if 'error' in result:
            raise RuntimeError(result['error']['message'])

        return result['result']
Example 10
def generateProject(groundtruth_file, filelist_file, project_file,
                    datasets_dir, results_dir):
    gt = yaml.load(open(groundtruth_file, 'r'))
    try:
        className = gt['className']
        groundTruth = gt['groundTruth']
    except:
        print(groundtruth_file, "groundtruth file has incorrect format")
        sys.exit(2)

    fl = yaml.load(open(filelist_file, 'r'))

    gt_trackids = list(groundTruth.keys())
    fl_trackids = list(fl.keys())

    # check that there are no duplicate ids
    if len(gt_trackids) != len(set(gt_trackids)):
        print(groundtruth_file, "contains duplicate track ids")
        sys.exit(3)

    if len(fl_trackids) != len(set(fl_trackids)):
        print(filelist_file, "contains duplicate track ids")
        sys.exit(3)

    # check if filelist is consistent with groundtruth (no files missing)
    if set(gt_trackids) != set(fl_trackids):
        print("track ids found in", groundtruth_file, "are inconsistent with",
              filelist_file)
        sys.exit(4)

    # write the project file
    with open(project_file, 'w') as pfile:
        pfile.write(
            PROJECT_TEMPLATE % {
                'className': className,
                'datasetsDirectory': abspath(datasets_dir),
                'resultsDirectory': abspath(results_dir),
                'filelist': abspath(filelist_file),
                'groundtruth': abspath(groundtruth_file)
            })

    print('Successfully written', project_file)
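A call sketch for this refactored variant, which takes the five paths as direct arguments instead of parsing them from the command line; all paths below are placeholders:

# Illustration only; every path is hypothetical.
generateProject('groundtruth.yaml', 'filelist.yaml', 'project.yaml',
                'datasets', 'results')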
Example 11
    def __call__(self, *args, **kwargs):
        # pre-check for errors that might happen very often and where 1 good error message would
        # be real nice to have
        if (self.methodName.startswith('nnSearch') and self.methodName
                not in ('nnSearchById', 'nnSearchByIdWithFilter',
                        'nnSearchByExample', 'nnSearchByExampleWithFilter')):
            raise AttributeError(
                'You need to use either nnSearchById{WithFilter} or nnSearchByExample{WithFilter}'
            )

        # pre-processing for certain specific methods
        if self.methodName.startswith('nnSearchByExample'):
            args = (args[0].toBase64(), ) + args[1:]

        # in the case of an nnSearch request, we shouldn't do the query immediately but rather
        # return a proxy object that allows to chain queries using the search_space argument.
        # the actual query should only be resolved when the user calls the get() method on this
        # proxy object
        if self.methodName.startswith('nnSearch'):
            return ResultSet(self.endPoint, self.methodName, args, kwargs)

        # actual processing by the server
        result = YamlRPCMethod.__call__(self, *args, **kwargs)

        # post-processing for certain specific methods
        if self.methodName == 'layout':
            result = yaml.load(result)

        elif self.methodName == 'getPoint':
            try:
                import gaia2
            except ImportError:
                raise ImportError(
                    'You need to have the gaia2 python module installed in order to be able to retrieve single points'
                )
            p = gaia2.Point()
            p.fromBase64(result)
            result = p

        elif self.methodName == 'getPoints':
            try:
                import gaia2
            except ImportError:
                raise ImportError(
                    'You need to have the gaia2 python module installed in order to be able to retrieve points'
                )
            ds = gaia2.DataSet()
            ds.fromBase64(result)
            result = ds

        return result
Example 12
def generateProject(groundtruth_file, filelist_file, project_file, datasets_dir, results_dir):
    gt = yaml.load(open(groundtruth_file, 'r'))
    try:
        className = gt['className']
        groundTruth = gt['groundTruth']
    except:
        print groundtruth_file, "groundtruth file has incorrect format"
        sys.exit(2)

    fl = yaml.load(open(filelist_file, 'r'))

    gt_trackids = groundTruth.keys()
    fl_trackids = fl.keys()

    # check that there are no duplicate ids
    if len(gt_trackids) != len(set(gt_trackids)):
        print groundtruth_file, "contains duplicate track ids"
        sys.exit(3)

    if len(fl_trackids) != len(set(fl_trackids)):
        print filelist_file, "contains duplicate track ids"
        sys.exit(3)

    # check if filelist is consistent with groundtruth (no files missing)
    if set(gt_trackids) != set(fl_trackids):
        print "track ids found in", groundtruth_file, "are inconsistent with", filelist_file
        sys.exit(4)

    # write the project file
    with open(project_file, 'w') as pfile:
        pfile.write(PROJECT_TEMPLATE % { 'className': className,
                                         'datasetsDirectory': abspath(datasets_dir),
                                         'resultsDirectory': abspath(results_dir),
                                         'filelist': abspath(filelist_file),
                                         'groundtruth': abspath(groundtruth_file) })

    print 'Successfully written', project_file
Example 13
    def load(self, yamlfile):
        with open(yamlfile) as f:
            data = yaml.load(f.read())

        try:
            ver = data['version']
            tp = data['type']
        except:
            raise Exception("Groundtruth file should contain at least the 'version' and the 'type' fields")

        if ver != 1.0 or tp != 'singleClass':
            raise Exception("Groundtruth can only load 'singleClass v1.0' files")

        self.className = data['className']
        self.clear()
        self.update(data['groundTruth'])
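Judging from the fields this loader checks, a groundtruth file needs at least the 'version', 'type', 'className' and 'groundTruth' entries. A minimal sketch of producing such a file with plain PyYAML (all values are illustrative):

# Illustration only: a minimal groundtruth file that the load() above would accept.
import yaml

gt = {
    'version': 1.0,
    'type': 'singleClass',
    'className': 'genre',                   # hypothetical class name
    'groundTruth': {'track_01': 'rock'},    # hypothetical track id -> label
}
with open('groundtruth.yaml', 'w') as f:
    yaml.safe_dump(gt, f)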
Example 14
def convertJsonToSig():
    parser = OptionParser(usage = '%prog [options] filelist_file result_filelist_file\n' +
"""
Converts json files found in filelist_file into *.sig yaml files compatible with
Gaia. The result files are written to the same directory where the original files
were located.
"""
        )

    options, args = parser.parse_args()

    try:
        filelist_file = args[0]
        result_filelist_file = args[1]
    except:
        parser.print_help()
        sys.exit(1)

    fl = yaml.load(open(filelist_file, 'r'))

    result_fl = fl
    errors = []

    for trackid, json_file in fl.iteritems():
        try:
            data = json.load(open(json_file))

            # remove descriptors that will otherwise break gaia_fusion due to incompatibility of layouts
            if 'tags' in data['metadata']:
                del data['metadata']['tags']
            if 'sample_rate' in data['metadata']['audio_properties']:
                del data['metadata']['audio_properties']['sample_rate']

            sig_file = os.path.splitext(json_file)[0] + '.sig'
            
            yaml.dump(data, open(sig_file, 'w'))           
            result_fl[trackid] = sig_file

        except:
            errors += [json_file]

    yaml.dump(result_fl, open(result_filelist_file, 'w'))
    
    print "Failed to convert", len(errors), "files:"
    for e in errors:
        print e
    return len(errors)
Example 15
def convertJsonToSig():
    parser = OptionParser(
        usage='%prog [options] filelist_file result_filelist_file\n' + """
Converts json files found in filelist_file into *.sig yaml files compatible with
Gaia. The result files are written to the same directory where the original files
were located.
""")

    options, args = parser.parse_args()

    try:
        filelist_file = args[0]
        result_filelist_file = args[1]
    except:
        parser.print_help()
        sys.exit(1)

    fl = yaml.load(open(filelist_file, 'r'))

    result_fl = fl
    errors = []

    for trackid, json_file in fl.iteritems():
        try:
            data = json.load(open(json_file))

            # remove descriptors that will otherwise break gaia_fusion due to incompatibility of layouts
            if 'tags' in data['metadata']:
                del data['metadata']['tags']
            if 'sample_rate' in data['metadata']['audio_properties']:
                del data['metadata']['audio_properties']['sample_rate']

            sig_file = os.path.splitext(json_file)[0] + '.sig'

            yaml.dump(data, open(sig_file, 'w'))
            result_fl[trackid] = sig_file

        except:
            errors += [json_file]

    yaml.dump(result_fl, open(result_filelist_file, 'w'))

    print "Failed to convert", len(errors), "files:"
    for e in errors:
        print e
    return len(errors)
Example 16
    def readResults(self, dir):
        """Reads all the results file contained in the given directory, and generates the
        associated ConfusionMatrix for each one."""

        resultFiles = glob.glob(join(dir, '*.result'))
        progress = TextProgress(len(resultFiles))

        for i, filename in enumerate(resultFiles):
            cm = ConfusionMatrix()
            cm.load(filename)

            paramFile = splitext(filename)[0] + '.param'
            params = yaml.load(open(paramFile).read())

            self.results += [ (filename, cm, params) ]

            progress.update(i+1)
Example 18
def evaluateModels(extractor, resultsDir):
    if not isExecutable(extractor):
        print '%s does not seem to be an executable extractor... Exiting...' % extractor
        sys.exit(1)

    makedir(resultsDir)

    collections_file = join(gaia2.rootdir(), 'mtgdb', 'mtgdb_collections.yaml')
    collections = [ c['name'] for c in yaml.load(open(collections_file)) ]

    # do all the evaluations
    for collection in collections:
        evaluateCollection(collection, extractor, resultsDir)

    # print a report of the evaluations
    for collection in collections:
        checkErrors(collection, resultsDir)
Example 19
def evaluateModels(extractor, resultsDir):
    if not isExecutable(extractor):
        print '%s does not seem to be an executable extractor... Exiting...' % extractor
        sys.exit(1)

    makedir(resultsDir)

    collections_file = join(gaia2.rootdir(), 'mtgdb', 'mtgdb_collections.yaml')
    collections = [c['name'] for c in yaml.load(open(collections_file))]

    # do all the evaluations
    for collection in collections:
        evaluateCollection(collection, extractor, resultsDir)

    # print a report of the evaluations
    for collection in collections:
        checkErrors(collection, resultsDir)
Example 20
    def __call__(self, *args, **kwargs):
        # pre-check for errors that might happen very often and where 1 good error message would
        # be real nice to have
        if (self.methodName.startswith('nnSearch') and
            self.methodName not in ('nnSearchById', 'nnSearchByIdWithFilter',
                                    'nnSearchByExample', 'nnSearchByExampleWithFilter')):
            raise AttributeError('You need to use either nnSearchById{WithFilter} or nnSearchByExample{WithFilter}')

        # pre-processing for certain specific methods
        if self.methodName.startswith('nnSearchByExample'):
            args = (args[0].toBase64(),) + args[1:]

        # in the case of an nnSearch request, we shouldn't do the query immediately but rather
        # return a proxy object that allows to chain queries using the search_space argument.
        # the actual query should only be resolved when the user calls the get() method on this
        # proxy object
        if self.methodName.startswith('nnSearch'):
            return ResultSet(self.endPoint, self.methodName, args, kwargs)

        # actual processing by the server
        result = YamlRPCMethod.__call__(self, *args, **kwargs)

        # post-processing for certain specific methods
        if self.methodName == 'layout':
            result = yaml.load(result)

        elif self.methodName == 'getPoint':
            try:
                import gaia2
            except ImportError:
                raise ImportError('You need to have the gaia2 python module installed in order to be able to retrieve single points')
            p = gaia2.Point()
            p.fromBase64(result)
            result = p

        elif self.methodName == 'getPoints':
            try:
                import gaia2
            except ImportError:
                raise ImportError('You need to have the gaia2 python module installed in order to be able to retrieve points')
            ds = gaia2.DataSet()
            ds.fromBase64(result)
            result = ds

        return result
Example 21
def get_essentia_versions(filelist):
    versions = set()

    for v in filelist.values():
        try:
            version = yaml.load(open(v)).get('metadata',
                                             {}).get('version',
                                                     {}).get('essentia', {})
            if version:
                parsed_version = version.split('-')
                essentia_version = parsed_version[0]
                if parsed_version[1].startswith('beta'):
                    essentia_version += '-{}'.format(parsed_version[1])

                versions.add(essentia_version)
            else:
                versions.add('no_essentia_version_field')

        except IOError:
            print('Error retrieving the Essentia version of {}'.format(v))
    return versions
Example 22
def mergeAll(pointList,
             outputFilename,
             chunkSize,
             transfoFile,
             select=None,
             exclude=None):
    # TODO: validation of the yaml file format? (ie: pre-2.3 yaml files should be rejected)
    totalPoints = len(fastyaml.load(open(pointList).read()))

    begin, end = 0, chunkSize
    partfiles = []
    partfileTemplate = outputFilename + '_%d_%d.partdb'

    # keep this information for future reference as it won't be accessible anymore
    # once the dataset is merged
    excluded = []
    if exclude:
        try:
            p = gaia2.Point()
            p.load(list(gaia2.fastyaml.loadfile(pointList).items())[0][1])
            excluded = p.layout().descriptorNames(exclude)
        except:
            raise

    # merge each chunk separately
    # this includes removevl and fixlength, which should yield smaller files than just after
    # merging, so it should then be possible to load all of them together to merge them
    while begin < totalPoints:
        end = min(end, totalPoints)
        partfile = partfileTemplate % (begin, end)
        partfiles += [partfile]

        mergeChunk(pointList, partfile, transfoFile, begin, end, select,
                   exclude)
        begin, end = end, end + chunkSize

        horizontalLine()

    # make sure all histories are the same, if not do whatever it takes to reach that point
    # also "simplify" the histories so that they are the minimum history representation required
    # to get to the layout of the final dataset
    print(
        'Harmonizing chunks so that they all have the same layout & history...'
    )
    vldescs, nandescs, rdescs = harmonizeChunks(partfiles)
    rdescs = rdescs | set(excluded)
    horizontalLine()

    # merge all those partfiles together
    print('Assembling full dataset together...')
    dstotal = DataSet()

    for pfile in partfiles:
        print('Merging partfile', pfile)
        ds = DataSet()
        ds.load(pfile)
        dstotal.appendDataSet(ds)

    dstotal.save(outputFilename)

    # print a nice informative summary of what has been done to the dataset
    horizontalLine()

    msg = '''
Final dataset information
-------------------------

Number of points: %s

Descriptors removed:
  - because they were of variable length: %s
  - because they were either constant, contained NaN or contained Inf: %s
  - because they were removed explicitly: %s

Your dataset has been saved at %s'''

    # remove leading dot
    vldescs = sorted(d[1:] for d in vldescs)
    nandescs = sorted(d[1:] for d in nandescs)
    rdescs = sorted(d[1:] for d in rdescs)

    print(msg % (str(dstotal.size()), ', '.join(vldescs), ', '.join(nandescs),
                 ', '.join(rdescs), outputFilename))

    # clean up temporary files
    for pfile in partfiles:
        os.remove(pfile)
        os.remove(pfile + '.raw')
Example 23
    def testLibyaml1024CharLimit(self):
        l = PointLayout()
        l.add('a'*2000, RealType)

        from gaia2 import fastyaml as yaml
        yaml.load(l.toYaml())
Example 25
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the Affero GNU General Public License     
# version 3 along with this program. If not, see http://www.gnu.org/licenses/



import gaia2
import gaia2.fastyaml as yaml
import environment
import collection
from os.path import join

COLLECTIONS_FILE = join(gaia2.filedir(), 'mtgdb_collections.yaml')
ALL_MTGDB_COLLECTIONS = dict((c['name'], c) for c in yaml.load(open(COLLECTIONS_FILE).read()))


class MtgdbCollection(collection.Collection):
    """An mtgdb.MtgdbCollection instance is a collection on the MTG-DB server that
    has been classified as stable and may be accessed directly by its name."""

    def __init__(self, name, groundTruth = None):
        try:
            collection = ALL_MTGDB_COLLECTIONS[name]
        except KeyError:
            raise ValueError('Collection "%s" is not known by Gaia. Available collections are: %s' % (name, ALL_MTGDB_COLLECTIONS.keys()))

        super(MtgdbCollection, self).__init__(join(environment.MTGDB_AUDIO,collection['location']), groundTruth)
        self.name = name
        self._properties = collection
Example 26
File: fusion.py  Project: DomT4/gaia
def mergeAll(pointList, outputFilename, chunkSize, transfoFile, select = None, exclude = None):
    # TODO: validation of the yaml file format? (ie: pre-2.3 yaml files should be rejected)
    totalPoints = len(fastyaml.load(open(pointList).read()))

    begin, end = 0, chunkSize
    partfiles = []
    partfileTemplate = outputFilename + '_%d_%d.partdb'

    # keep this information for future reference as it won't be accessible anymore
    # once the dataset is merged
    excluded = []
    if exclude:
        try:
            p = gaia2.Point()
            p.load(gaia2.fastyaml.loadfile(pointList).items()[0][1])
            excluded = p.layout().descriptorNames(exclude)
        except:
            raise

    # merge each chunk separately
    # this includes removevl and fixlength, which should yield smaller files than just after
    # merging, so it should then be possible to load all of them together to merge them
    while begin < totalPoints:
        end = min(end, totalPoints)
        partfile = partfileTemplate % (begin, end)
        partfiles += [ partfile ]

        mergeChunk(pointList, partfile, transfoFile, begin, end, select, exclude)
        begin, end = end, end + chunkSize

        horizontalLine()

    # make sure all histories are the same, if not do whatever it takes to reach that point
    # also "simplify" the histories so that they are the minimum history representation required
    # to get to the layout of the final dataset
    print 'Harmonizing chunks so that they all have the same layout & history...'
    vldescs, nandescs, rdescs = harmonizeChunks(partfiles)
    rdescs = rdescs | set(excluded)
    horizontalLine()

    # merge all those partfiles together
    print 'Assembling full dataset together...'
    dstotal = DataSet()

    for pfile in partfiles:
        print 'Merging partfile', pfile
        ds = DataSet()
        ds.load(pfile)
        dstotal.appendDataSet(ds)

    dstotal.save(outputFilename)

    # print a nice informative summary of what has been done to the dataset
    horizontalLine()

    msg = '''
Final dataset information
-------------------------

Number of points: %s

Descriptors removed:
  - because they were of variable length: %s
  - because they were either constant, contained NaN or contained Inf: %s
  - because they were removed explicitly: %s

Your dataset has been saved at %s'''

    # remove leading dot
    vldescs = sorted( d[1:] for d in vldescs )
    nandescs = sorted( d[1:] for d in nandescs )
    rdescs = sorted( d[1:] for d in rdescs )

    print msg % (str(dstotal.size()), ', '.join(vldescs), ', '.join(nandescs), ', '.join(rdescs), outputFilename)

    # clean up temporary files
    for pfile in partfiles:
        os.remove(pfile)
        os.remove(pfile + '.raw')
Example 27
def generate_project(groundtruth_file,
                     filelist_file,
                     project_file,
                     datasets_dir,
                     results_dir,
                     seed=None,
                     cluster_mode=False,
                     template=None,
                     force_consistency=False):

    gt = yaml.load(open(groundtruth_file, 'r'))
    try:
        className = gt['className']
        groundTruth = gt['groundTruth']
    except:
        print(groundtruth_file, "groundtruth file has incorrect format")
        sys.exit(2)

    fl = yaml.load(open(filelist_file, 'r'))

    gt_trackids = list(groundTruth.keys())
    fl_trackids = list(fl.keys())

    # check that there are no duplicated ids
    if len(gt_trackids) != len(set(gt_trackids)):
        print(groundtruth_file, "contains duplicated track ids")
        sys.exit(3)

    if len(fl_trackids) != len(set(fl_trackids)):
        print(filelist_file, "contains duplicated track ids")
        sys.exit(3)

    # check if filelist is consistent with groundtruth (no files missing)
    if set(gt_trackids) != set(fl_trackids):
        print("track ids found in", groundtruth_file, "are inconsistent with",
              filelist_file)
        sys.exit(4)

    if force_consistency:
        print(
            'Checking Essentia version in the descriptor files to ensure consistency...'
        )
        versions = get_essentia_versions(fl)

        if len(versions) > 1:
            raise Exception(
                "Couldn't find a unique Essentia version in the dataset. "
                "This exception is thrown because you are using the flag `force-consistency`"
            )
        print('ok!')

    if not template:
        print('No classification project template specified.')
        essentia_version = DEFAULT_VERSION

        if not force_consistency:
            print(
                'Analyzing the dataset to figure out which project template file to use...'
            )
            versions = get_essentia_versions(fl)

        if len(versions) == 1:
            essentia_version = list(versions)[0]
        else:
            print("Couldn't find a unique essentia version in the dataset.")

        template_version = VERSION_MAP.get(essentia_version, DEFAULT_VERSION)

        print('Using classification project template "{}"'.format(
            template_version))
        template = 'classification_project_template_{}.yaml'.format(
            template_version)

    project_template = open(join(filedir(), template)).read()

    # if no seed is specified, use the current clock value
    if seed is None:
        import time
        seed = time.time()

    # write the project file
    with open(project_file, 'w') as pfile:
        pfile.write(
            project_template % {
                'className': className,
                'datasetsDirectory': abspath(datasets_dir),
                'resultsDirectory': abspath(results_dir),
                'filelist': abspath(filelist_file),
                'groundtruth': abspath(groundtruth_file),
                'seed': seed,
                'clusterMode': cluster_mode
            })

    print('Successfully written', project_file)
Example 28
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the Affero GNU General Public License
# version 3 along with this program. If not, see http://www.gnu.org/licenses/

from __future__ import absolute_import
import gaia2
import gaia2.fastyaml as yaml
from . import environment
from . import collection
from os.path import join

COLLECTIONS_FILE = join(gaia2.filedir(), 'mtgdb_collections.yaml')
ALL_MTGDB_COLLECTIONS = dict(
    (c['name'], c) for c in yaml.load(open(COLLECTIONS_FILE).read()))


class MtgdbCollection(collection.Collection):
    """An mtgdb.MtgdbCollection instance is a collection on the MTG-DB server that
    has been classified as stable and may be accessed directly by its name."""
    def __init__(self, name, groundTruth=None):
        try:
            collection = ALL_MTGDB_COLLECTIONS[name]
        except KeyError:
            raise ValueError(
                'Collection "%s" is not known by Gaia. Available collections are: %s'
                % (name, list(ALL_MTGDB_COLLECTIONS.keys())))

        super(MtgdbCollection, self).__init__(
            join(environment.MTGDB_AUDIO, collection['location']), groundTruth)
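A general caveat for the examples above: with PyYAML 5.1 and later, calling yaml.load without an explicit Loader emits a warning, and the safe variants are preferred for untrusted input (gaia2.fastyaml is Gaia's own wrapper and may behave differently). A minimal sketch of the safer equivalent, assuming standard PyYAML:

# Sketch only; 'filelist.yaml' is a placeholder path.
import yaml

with open('filelist.yaml') as f:
    fl = yaml.safe_load(f)    # or: yaml.load(f, Loader=yaml.SafeLoader)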