Example #1
0
    def run(self, className, outfilename, param, dsname, gtname, evalconfig):
        """Run every evaluation of `evalconfig` for one set of model parameters.

        The ground truth is loaded from `gtname` (its class name is replaced
        by `className`) and the dataset from `dsname`.  Ground-truth entries
        whose point is not in the dataset are removed.  For evaluation
        number i, the parameters are dumped to `<outfilename>_<i>.param` and
        the object returned by evaluateNfold is saved to
        `<outfilename>_<i>.result`.  An evaluation whose result file already
        exists is skipped.

        Any exception is logged along with `param` and `evalconfig`, then
        re-raised.
        """
        try:
            classifierName = param['classifier']

            groundTruth = GroundTruth(classifierName)
            groundTruth.load(gtname)
            # the project file decides the class name, not the groundtruth file
            groundTruth.className = className

            dataset = DataSet()
            dataset.load(dsname)

            # points that could not be analyzed are absent from the dataset,
            # so their ground-truth entries have to go as well
            knownPoints = dataset.pointNames()
            for pointId in list(groundTruth.keys()):
                if pointId in knownPoints:
                    continue
                log.warning('Removing %s from GroundTruth as it could not be found in the merged dataset' % pointId)
                del groundTruth[pointId]

            trainerFun, trainingparam, newds = getTrainer(classifierName, param, dataset)

            for evalIndex, evalparam in enumerate(evalconfig):
                resultFilename = outfilename + '_%d.result' % evalIndex

                # a result file on disk means this evaluation was already done
                if exists(resultFilename):
                    log.info('File %s already exists. Skipping evaluation...' % resultFilename)
                    continue

                runDescription = (evalIndex, outfilename, param['classifier'], param['preprocessing'])
                log.info('Running evaluation %d for: %s with classifier %s and dataset %s' % runDescription)
                log.info('    PID: %d, parameters: %s' % (os.getpid(), json.dumps(param)))

                confusion = evaluateNfold(evalparam['nfold'], dataset, groundTruth,
                                          trainerFun, **trainingparam)

                # parameters first, then the result itself
                paramFilename = outfilename + '_%d.param' % evalIndex
                with open(paramFilename, 'w') as paramFile:
                    yaml.dump({ 'model': param, 'evaluation': evalparam }, paramFile)

                confusion.save(resultFilename)

        except Exception:
            log.error('While doing evaluation with param = %s\nevaluation = %s' % (param, evalconfig))
            raise
Example #2
0
def trainSVMHistory(configFilename, paramsFilename, outputHistoryFilename,
                    className):
    """Train an SVM from a project config and a parameter file, then save it.

    Args:
        configFilename: path to the project YAML file; it must provide the
            'datasetsDirectory', 'className' and 'groundtruth' entries.  The
            datasets directory is resolved relative to this file's folder.
        paramsFilename: path to a YAML file whose 'model' entry holds the
            'classifier', the 'preprocessing' name and the remaining
            keyword arguments passed to trainSVM.
        outputHistoryFilename: where the object returned by trainSVM is saved.
        className: if truthy, overrides the class name stored in the
            groundtruth file.

    Raises:
        Exception: if the 'classifier' entry of the parameters is not 'svm'.
    """
    # NOTE(review): yaml.load() without an explicit Loader is unsafe on
    # untrusted files and newer PyYAML releases require the Loader argument;
    # consider yaml.safe_load() if these files never use custom tags.
    # The files are read inside `with` so their handles are closed right away.
    with open(configFilename) as configFile:
        config = yaml.load(configFile.read())
    with open(paramsFilename) as paramsFile:
        params = yaml.load(paramsFile.read())['model']

    if params.pop('classifier') != 'svm':
        raise Exception('Can only use this script on SVM config parameters.')

    # popped so that only the SVM keyword arguments remain in params
    preproc = params.pop('preprocessing')

    ds = DataSet()
    ds.load(
        join(
            split(configFilename)[0],  # base dir
            config['datasetsDirectory'],  # datasets dir
            '%s-%s.db' % (config['className'], preproc)))  # dataset name

    gt = GroundTruth.fromFile(config['groundtruth'])

    if className:
        gt.className = className

    # add 'highlevel.' in front of the descriptor, this is what will appear in the final Essentia sigfile
    gt.className = 'highlevel.' + gt.className

    # do the whole training
    h = trainSVM(ds, gt, **params)

    h.save(outputHistoryFilename)
def train_svm_history(project, params, output_file_path):
    """Train an SVM for `project` and save the resulting history.

    `params["model"]` must describe an SVM ('classifier' equal to "svm") and
    provide 'preprocessing', 'type', 'kernel', 'C' and 'gamma'.  `project`
    provides 'datasetsDirectory', 'className' and 'groundtruth'.  A unicode
    `output_file_path` is encoded to UTF-8 before being handed to save().

    Raises GaiaWrapperException when the classifier is not "svm".
    """
    model = params["model"]
    if model.get("classifier") != "svm":
        raise GaiaWrapperException(
            "Can only use this script on SVM config parameters.")

    ds = DataSet()
    datasets_dir = project["datasetsDirectory"]
    db_name = "%s-%s.db" % (project["className"], model["preprocessing"])
    ds.load(os.path.join(datasets_dir, db_name))

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    # only these four entries are forwarded to the trainer
    svm_kwargs = {}
    for key in ("type", "kernel", "C", "gamma"):
        svm_kwargs[key] = model[key]
    history = train_svm(ds, gt, **svm_kwargs)  # doing the whole training

    if isinstance(output_file_path, unicode):
        save_path = output_file_path.encode("utf-8")
    else:
        save_path = output_file_path
    history.save(save_path)
Example #4
0
    def run(self, className, outfilename, param, dsname, gtname, evalconfig):
        """Run all evaluations of `evalconfig` for one set of model parameters.

        Args:
            className: class name forced onto the loaded ground truth.
            outfilename: prefix of the output files; evaluation i writes
                `<outfilename>_<i>.param` and `<outfilename>_<i>.result`.
            param: model parameters; 'classifier' and 'preprocessing' are
                read here, the whole mapping is passed to getTrainer.
            dsname: file the DataSet is loaded from.
            gtname: file the GroundTruth is loaded from.
            evalconfig: sequence of evaluation parameters, each providing
                an 'nfold' entry.

        Evaluations whose result file already exists are skipped.  Any
        exception is logged with `param` and `evalconfig`, then re-raised.
        """
        try:
            classifier = param['classifier']
            gt = GroundTruth(classifier)
            gt.load(gtname)

            # force the GroundTruth class name to be the one specified by our project file, not
            # the one in the original groundtruth file
            gt.className = className

            ds = DataSet()
            ds.load(dsname)

            # some points may have failed to be analyzed, remove those from the GroundTruth.
            # The names are put in a set once so that each membership test
            # below does not have to scan the whole collection of point names.
            pnames = set(ds.pointNames())
            for pid in list(gt.keys()):
                if pid not in pnames:
                    log.warning(
                        'Removing %s from GroundTruth as it could not be found in the merged dataset'
                        % pid)
                    del gt[pid]

            trainerFun, trainingparam, newds = getTrainer(
                classifier, param, ds)

            # run all the evaluations specified in the evaluation config
            for i, evalparam in enumerate(evalconfig):
                # if we already ran this evaluation, no need to run it again...
                resultFilename = outfilename + '_%d.result' % i
                if exists(resultFilename):
                    log.info('File %s already exists. Skipping evaluation...' %
                             resultFilename)
                    continue

                log.info(
                    'Running evaluation %d for: %s with classifier %s and dataset %s'
                    % (i, outfilename, param['classifier'],
                       param['preprocessing']))
                log.info('    PID: %d, parameters: %s' %
                         (os.getpid(), json.dumps(param)))

                # run evaluation
                confusion = evaluateNfold(evalparam['nfold'], ds, gt,
                                          trainerFun, **trainingparam)

                # write evaluation params & result
                with open(outfilename + '_%d.param' % i, 'w') as f:
                    yaml.dump({'model': param, 'evaluation': evalparam}, f)

                confusion.save(resultFilename)

        except Exception:
            log.error(
                'While doing evaluation with param = %s\nevaluation = %s' %
                (param, evalconfig))
            raise
def train_svm_history(project, params, output_file_path):
    """Train an SVM for `project` and save the resulting history.

    Args:
        project: mapping providing 'datasetsDirectory', 'className' and
            'groundtruth'.
        params: mapping whose 'model' entry holds 'classifier' (must be
            "svm"), 'preprocessing', and the keyword arguments forwarded to
            train_svm.  The mapping is left untouched.
        output_file_path: where the history returned by train_svm is saved.

    Raises:
        GaiaWrapperException: if the classifier is not "svm".
        KeyError: if 'classifier' or 'preprocessing' is missing.
    """
    # Work on a shallow copy: the pops below would otherwise remove
    # 'classifier' and 'preprocessing' from the caller's own dictionary.
    params_model = dict(params["model"])
    if params_model.pop("classifier") != "svm":
        raise GaiaWrapperException("Can only use this script on SVM config parameters.")

    ds = DataSet()
    ds.load(os.path.join(
        project["datasetsDirectory"],
        "%s-%s.db" % (project["className"], params_model.pop("preprocessing"))
    ))

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    # whatever is left in the copy are the SVM keyword arguments
    history = train_svm(ds, gt, **params_model)  # doing the whole training
    history.save(output_file_path)
Example #6
0
    def loadGroundTruth(self, name=None):
        """Select a ground truth type and load its file into self.groundTruth.

        Args:
            name: key of self._config['groundTruth'] to load.  When None, or
                when the key is not configured, the first configured key is
                used instead and a warning is printed (for None, only if
                more than one key is configured).

        Sets self._groundTruthFile to the path returned by
        self.groundTruthFilePath(name) and self.groundTruth to the
        GroundTruth loaded from that path.
        """
        # list() keeps the keys indexable on Python 3, where keys() is a view
        gttypes = list(self._config['groundTruth'].keys())

        if name is None:
            name = gttypes[0]
            if len(gttypes) > 1:
                # single pre-formatted argument: prints the same text on
                # Python 2 and Python 3
                print('WARNING: more than 1 GroundTruth file, selecting default "%s" (out of %s)' % (
                    name, gttypes))
        else:
            if name not in gttypes:
                print('WARNING: invalid ground truth: "%s", selecting default one instead: "%s" (out of %s)' % (
                    name, gttypes[0], gttypes))
                name = gttypes[0]

        self._groundTruthFile = self.groundTruthFilePath(name)
        self.groundTruth = GroundTruth.fromFile(self._groundTruthFile)
Example #7
0
    def loadGroundTruth(self, name=None):
        """Load one of the configured ground truth files.

        `name` is looked up among the keys of self._config["groundTruth"].
        If it is None or not one of them, the first key is used and a
        warning is printed (when `name` is None the warning only appears if
        several keys are configured).  The chosen file path is stored in
        self._groundTruthFile and the loaded GroundTruth in self.groundTruth.
        """
        # keys() is not indexable on Python 3, so take a real list
        gttypes = list(self._config["groundTruth"].keys())
        default = gttypes[0]

        if name is None:
            name = default
            if len(gttypes) > 1:
                # print() with one formatted string behaves the same on
                # Python 2 and Python 3
                print('WARNING: more than 1 GroundTruth file, selecting default "%s" (out of %s)' % (name, gttypes))
        elif name not in gttypes:
            print('WARNING: invalid ground truth: "%s", selecting default one instead: "%s" (out of %s)' % (
                name,
                default,
                gttypes,
            ))
            name = default

        self._groundTruthFile = self.groundTruthFilePath(name)
        self.groundTruth = GroundTruth.fromFile(self._groundTruthFile)
def train_svm_history(project, params, output_file_path):
    """Run the whole SVM training for `project` and save its history.

    The model description is read from `params["model"]`; its 'classifier'
    must be "svm", otherwise GaiaWrapperException is raised.  The dataset
    file name is built from the project's class name and the model's
    preprocessing name, inside the project's datasets directory.
    """
    params_model = params["model"]
    classifier = params_model.get("classifier")
    if classifier != "svm":
        raise GaiaWrapperException("Can only use this script on SVM config parameters.")

    ds = DataSet()
    datasets_dir = project["datasetsDirectory"]
    dataset_file = "%s-%s.db" % (project["className"], params_model["preprocessing"])
    ds.load(os.path.join(datasets_dir, dataset_file))

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    # pick the SVM settings one by one, in the order they are passed on
    svm_type = params_model["type"]
    kernel = params_model["kernel"]
    cost = params_model["C"]
    gamma = params_model["gamma"]
    history = train_svm(ds, gt, type=svm_type, kernel=kernel, C=cost, gamma=gamma)

    # a unicode path is handed over as UTF-8 encoded bytes
    needs_encoding = isinstance(output_file_path, unicode)
    history.save(output_file_path.encode("utf-8") if needs_encoding else output_file_path)
def validate(basedir):
    """Check that `basedir` holds a consistent collection layout.

    The checks run in this order and stop at the first failure: the base
    directory and metadata/config.yaml exist; the config has a 'version'
    entry and a non-empty 'audioFormats' entry; the number of entries in
    audio/ matches the number of audio formats; every audio format has its
    folder and a filelist whose files all exist; every ID of every
    configured groundtruth file appears in the filelist of every audio
    format.  Progress is reported with print().

    Args:
        basedir: root folder of the collection.

    Raises:
        Exception: describing the first inconsistency found.
    """
    # let's start with some basic check
    print('Checking basic directory layout...')
    if not exists(basedir):
        raise Exception('The specified base directory does not exist')

    # check required metadata files are there and that they're valid
    configFile = join(basedir, 'metadata', 'config.yaml')
    if not exists(configFile):
        raise Exception(
            'config.yaml could not be found in the metadata/ folder')

    # NOTE(review): yaml.load() without an explicit Loader is unsafe on
    # untrusted files and newer PyYAML releases require the Loader argument;
    # consider yaml.safe_load().
    with open(configFile) as f:
        config = yaml.load(f.read())
    if 'version' not in config:
        raise Exception('config.yaml doesn\'t have a version number')

    # check that the specified audioFormats correspond to the audio/ subfolders
    print('Checking available audio formats...')
    # .get() so that a missing key reports the message below, not a KeyError
    audioFormats = config.get('audioFormats')
    if not audioFormats:
        raise Exception('audioFormats not specified in config.yaml')

    audioFolders = [
        basename(f) for f in glob.glob(join(basedir, 'audio', '*'))
    ]

    if len(audioFolders) != len(audioFormats):
        raise Exception(
            'Some audio folders are not described in the audioFormats section of the config.yaml'
        )

    print('Found formats:', str(audioFolders))

    # check the audio formats are valid, in particular that they have a valid filelist.
    # Parsed filelists are kept so the groundtruth check can reuse them.
    filelists = {}
    for formatName, desc in audioFormats.items():
        print("\nChecking format '%s':" % formatName)
        # TODO: at some point in the future we should also check for valid values in desc
        if not exists(join(basedir, 'audio', formatName)):
            raise Exception(
                '%s is listed as an audio format, but doesn\'t appear in the audio/ folder'
                % formatName)

        if 'filelist' not in desc:
            raise Exception('Audio format "%s" does not define a filelist' %
                            formatName)

        with open(join(basedir, 'metadata', desc['filelist'])) as f:
            filelist = yaml.load(f.read())
        print('  filelist OK, lists %d files' % len(filelist))
        filelists[formatName] = filelist

        for filename in filelist.values():
            fullpath = join(basedir, 'audio', formatName, filename)
            if not exists(fullpath):
                raise Exception(
                    'For format "%s": file "%s" appears in filelist, however there is no corresponding "%s"'
                    % (formatName, filename, fullpath))

    # check that the groundtruth files do actually exist if they are specified
    print('\nChecking groundtruth files...')
    groundTruth = config.get('groundTruth', {})
    print('Found groundtruth files:', str(list(groundTruth.keys())))
    for name, gtfile in groundTruth.items():
        print("\nChecking groundtruth '%s':" % name)
        gt = GroundTruth('')
        gt.load(join(basedir, 'metadata', gtfile))
        # check that the IDs used in the groundtruth files exist in all the filelists
        for afname in audioFormats:
            flist = filelists[afname]
            for gid in gt:
                if gid not in flist:
                    raise Exception(
                        "ID '%s' is in GroundTruth '%s', but could not be found in filelist for audio format '%s'"
                        % (gid, gtfile, afname))
        print('  gt filelist OK, found classes:',
              str(sorted(set(gt.values()))))
Example #10
0
def validate(basedir):
    """Validate the layout and metadata of the collection rooted at `basedir`.

    Checked in order, stopping at the first problem: existence of the base
    directory and of metadata/config.yaml; presence of 'version' and of a
    non-empty 'audioFormats' in the config; same number of audio/ entries
    as audio formats; for each format, its folder, its filelist and every
    file listed in it; for each configured groundtruth file, that all its
    IDs appear in the filelist of every audio format.

    Messages are printed with single-argument print() calls so the output
    is the same on Python 2 and Python 3.

    Raises:
        Exception: with a message describing the first problem found.
    """
    # let's start with some basic check
    print('Checking basic directory layout...')
    if not exists(basedir):
        raise Exception('The specified base directory does not exist')

    # check required metadata files are there and that they're valid
    configFile = join(basedir, 'metadata', 'config.yaml')
    if not exists(configFile):
        raise Exception('config.yaml could not be found in the metadata/ folder')

    # NOTE(review): yaml.load() without an explicit Loader is unsafe on
    # untrusted files and newer PyYAML releases require the Loader argument;
    # consider yaml.safe_load().
    with open(configFile) as f:
        config = yaml.load(f.read())
    if 'version' not in config:
        raise Exception('config.yaml doesn\'t have a version number')

    # check that the specified audioFormats correspond to the audio/ subfolders
    print('Checking available audio formats...')
    # .get() so that a missing key reports the message below, not a KeyError
    audioFormats = config.get('audioFormats')
    if not audioFormats:
        raise Exception('audioFormats not specified in config.yaml')

    audioFolders = [ basename(f) for f in glob.glob(join(basedir, 'audio', '*')) ]

    if len(audioFolders) != len(audioFormats):
        raise Exception('Some audio folders are not described in the audioFormats section of the config.yaml')

    print('Found formats: %s' % str(audioFolders))

    # check the audio formats are valid, in particular that they have a valid filelist.
    # Each parsed filelist is remembered for the groundtruth check below.
    filelists = {}
    for formatName, desc in audioFormats.items():
        print("\nChecking format '%s':" % formatName)
        # TODO: at some point in the future we should also check for valid values in desc
        if not exists(join(basedir, 'audio', formatName)):
            raise Exception('%s is listed as an audio format, but doesn\'t appear in the audio/ folder' % formatName)

        if 'filelist' not in desc:
            raise Exception('Audio format "%s" does not define a filelist' % formatName)

        with open(join(basedir, 'metadata', desc['filelist'])) as f:
            filelist = yaml.load(f.read())
        print('  filelist OK, lists %d files' % len(filelist))
        filelists[formatName] = filelist

        for filename in filelist.values():
            fullpath = join(basedir, 'audio', formatName, filename)
            if not exists(fullpath):
                raise Exception('For format "%s": file "%s" appears in filelist, however there is no corresponding "%s"' % (formatName, filename, fullpath))

    # check that the groundtruth files do actually exist if they are specified
    print('\nChecking groundtruth files...')
    groundTruth = config.get('groundTruth', {})
    print('Found groundtruth files: %s' % str(list(groundTruth.keys())))
    for name, gtfile in groundTruth.items():
        print("\nChecking groundtruth '%s':" % name)
        gt = GroundTruth('')
        gt.load(join(basedir, 'metadata', gtfile))
        # check that the IDs used in the groundtruth files exist in all the filelists
        for afname in audioFormats:
            flist = filelists[afname]
            for gid in gt:
                if gid not in flist:
                    raise Exception("ID '%s' is in GroundTruth '%s', but could not be found in filelist for audio format '%s'" % (gid, gtfile, afname))
        print('  gt filelist OK, found classes: %s' % str(sorted(set(gt.values()))))
def generate_std_metadata(basedir, gtname, options):
    """Generate the metadata/ folder of the collection rooted at `basedir`.

    For every entry of `<basedir>/audio`, a `<format>_filelist.yaml` is
    written in metadata/.  Depending on `options.gttype`, groundtruth
    file(s) are written as well:

    - 'dir': the class of a point is the first path component of its ID;
    - 'txt': the class is read from `metadata/<format>/<pid>.txt`;
    - 'mdir': one groundtruth per class XXX that has a matching not_XXX or
      not-XXX first path component;
    - None: no groundtruth; any other value prints a warning.

    Finally metadata/config.yaml is written, referencing the filelists and
    groundtruth files that were generated.

    Args:
        basedir: root folder of the collection.
        gtname: name given to the groundtruth (prefix of the names in
            'mdir' mode).
        options: object with a `gttype` attribute.
    """
    audioFormats = {}
    metadataDir = join(basedir, 'metadata')

    # make sure metadata folder exists; created directly instead of through
    # a shell so that special characters in basedir are taken literally
    if not os.path.isdir(metadataDir):
        os.makedirs(metadataDir)

    # generate a filelist for each audio folder
    for formatName in glob.glob(join(basedir, 'audio', '*')):
        formatName = basename(formatName)

        flist = generateMergeFilelist(join(basedir, 'audio', formatName),
                                      validFile = lambda x: True,
                                      filename2gid = lambda x: x)

        # `with` guarantees the filelist is flushed and closed
        with open(join(metadataDir, '%s_filelist.yaml' % formatName), 'w') as filelist:
            yaml.dump(flist, filelist)

        audioFormats[formatName] = { 'filelist': '%s_filelist.yaml' % formatName }

    # generate groundtruth, if asked
    # NOTE(review): `flist` and `formatName` below are the values left by the
    # last iteration of the loop above; they are unbound if audio/ is empty.
    groundTruth = {}
    if options.gttype is not None:

        if options.gttype == 'dir':
            # use the last filelist to get the GT, which should be independent of audio format
            # as it relies on points IDs only
            gt = GroundTruth(gtname)
            for pid in flist:
                gt[pid] = pid.split('/')[0]

            gt.save(join(metadataDir, 'groundtruth.yaml'))
            groundTruth[gtname] = 'groundtruth.yaml'

        elif options.gttype == 'txt':
            gt = GroundTruth(gtname)
            for pid in flist:
                gtfile = join(metadataDir, formatName, pid) + '.txt'
                with open(gtfile) as f:
                    gt[pid] = f.read().strip()

            gt.save(join(metadataDir, 'groundtruth.yaml'))
            groundTruth[gtname] = 'groundtruth.yaml'

        elif options.gttype == 'mdir':
            # look for all the directories which can be paired in a XXX / not_XXX fashion
            # and create a groundtruth file for each of those
            mdirs = set(pid.split('/')[0] for pid in flist)
            mdirs = [ c for c in mdirs if ('not_' + c in mdirs or 'not-' + c in mdirs) ]
            # one pre-formatted argument prints the same on Python 2 and 3
            print('Found following possible classes %s' % mdirs)

            for c in mdirs:
                gt = GroundTruth(gtname + '_' + c)
                for pid in flist:
                    cls = pid.split('/')[0]
                    # only keep those files which we are interested in for our specific subclass
                    if cls in (c, 'not_' + c, 'not-' + c):
                        gt[pid] = cls

                gt.save(join(metadataDir, 'groundtruth_%s.yaml' % c))
                groundTruth['%s_%s' % (gtname, c)] = 'groundtruth_%s.yaml' % c

        else:
            print('WARNING: unknown groundtruth type: %s' % str(options.gttype))
            print('         not generating any groundtruth files...')

    # write the main config file
    config = { 'version': 1.0,
               'audioFormats': audioFormats,
               'groundTruth': groundTruth
               }

    with open(join(metadataDir, 'config.yaml'), 'w') as configFile:
        yaml.dump(config, configFile)
def generate_std_metadata(basedir, gtname, options):
    """Write filelists, groundtruth and config.yaml into `<basedir>/metadata`.

    One `<format>_filelist.yaml` is written per entry of `<basedir>/audio`.
    `options.gttype` selects how groundtruth is produced:

    - 'dir': class = first path component of the point ID;
    - 'txt': class = stripped content of `metadata/<format>/<pid>.txt`;
    - 'mdir': one groundtruth file per class XXX for which a not_XXX or
      not-XXX first path component also exists;
    - None: nothing is generated; any other value only prints a warning.

    metadata/config.yaml is written last and lists the generated filelists
    and groundtruth files.

    Args:
        basedir: root folder of the collection.
        gtname: groundtruth name (used as a prefix in 'mdir' mode).
        options: object exposing a `gttype` attribute.
    """
    audioFormats = {}
    metadataDir = join(basedir, 'metadata')

    # make sure metadata folder exists.  os.makedirs replaces the former
    # shell call: no shell means nothing in basedir can be expanded or
    # interpreted, and it does not depend on a `mkdir` executable.
    if not os.path.isdir(metadataDir):
        os.makedirs(metadataDir)

    # generate a filelist for each audio folder
    for formatPath in glob.glob(join(basedir, 'audio', '*')):
        formatName = basename(formatPath)

        flist = generateMergeFilelist(join(basedir, 'audio', formatName),
                                      validFile = lambda x: True,
                                      filename2gid = lambda x: x)

        filelistName = '%s_filelist.yaml' % formatName
        # closed (and therefore flushed) as soon as the dump is done
        with open(join(metadataDir, filelistName), 'w') as filelist:
            yaml.dump(flist, filelist)

        audioFormats[formatName] = { 'filelist': filelistName }

    # generate groundtruth, if asked
    # NOTE(review): `flist` and `formatName` used below come from the last
    # iteration of the loop above and are unbound when audio/ is empty.
    groundTruth = {}
    if options.gttype is not None:

        if options.gttype == 'dir':
            # use the last filelist to get the GT, which should be independent of audio format
            # as it relies on points IDs only
            gt = GroundTruth(gtname)
            for pid in flist:
                gt[pid] = pid.split('/')[0]

            gt.save(join(metadataDir, 'groundtruth.yaml'))
            groundTruth[gtname] = 'groundtruth.yaml'

        elif options.gttype == 'txt':
            gt = GroundTruth(gtname)
            for pid in flist:
                gtfile = join(metadataDir, formatName, pid) + '.txt'
                with open(gtfile) as f:
                    gt[pid] = f.read().strip()

            gt.save(join(metadataDir, 'groundtruth.yaml'))
            groundTruth[gtname] = 'groundtruth.yaml'

        elif options.gttype == 'mdir':
            # look for all the directories which can be paired in a XXX / not_XXX fashion
            # and create a groundtruth file for each of those
            mdirs = set(pid.split('/')[0] for pid in flist)
            mdirs = [ c for c in mdirs if ('not_' + c in mdirs or 'not-' + c in mdirs) ]
            # formatted into one string: `print (a, b)` would print a tuple
            # on Python 2
            print('Found following possible classes %s' % mdirs)

            for c in mdirs:
                wanted = (c, 'not_' + c, 'not-' + c)
                gt = GroundTruth(gtname + '_' + c)
                for pid in flist:
                    cls = pid.split('/')[0]
                    # only keep those files which we are interested in for our specific subclass
                    if cls in wanted:
                        gt[pid] = cls

                gt.save(join(metadataDir, 'groundtruth_%s.yaml' % c))
                groundTruth['%s_%s' % (gtname, c)] = 'groundtruth_%s.yaml' % c

        else:
            print('WARNING: unknown groundtruth type: %s' % str(options.gttype))
            print('         not generating any groundtruth files...')

    # write the main config file
    config = { 'version': 1.0,
               'audioFormats': audioFormats,
               'groundTruth': groundTruth
               }

    with open(join(metadataDir, 'config.yaml'), 'w') as configFile:
        yaml.dump(config, configFile)