Beispiel #1
0
def mergeDirectory(dirname, outputFilename, chunkSize, transfoFile, select = None, exclude = None):
    """Merge all the signature files found below a directory.

    Walks ``dirname`` recursively, collects every ``*.sig`` file (``*.neq.sig``
    files are skipped), writes the resulting ``{point id: path}`` mapping to a
    temporary YAML filelist and hands that filelist to ``mergeAll``.

    The point id is the file name without its ``.sig`` extension, so two files
    with the same name in different sub-directories share one id and the one
    visited last wins.

    Args:
        dirname: root directory to scan.
        outputFilename, chunkSize, transfoFile, select, exclude: forwarded
            unchanged to ``mergeAll``.
    """
    import tempfile

    idIsFullPath = False
    ext = '.sig'
    # TODO: this should be more flexible
    excludedExt = '.neq' + ext

    # find the list of all the points that should go into the dataset
    plist = {}
    for root, _dirs, files in os.walk(dirname):
        for filename in files:
            if not filename.endswith(ext) or filename.endswith(excludedExt):
                continue
            fullpath = os.path.join(root, filename)
            pid = fullpath if idIsFullPath else filename
            # remove extension from the point id
            plist[pid[:-len(ext)]] = fullpath

    # the temporary yaml filelist is removed as soon as the block is left,
    # even when the merge raises
    with tempfile.NamedTemporaryFile(mode = 'w+') as yamllist:
        fastyaml.dump(plist, yamllist)
        # make sure the content is on disk before it is read back by name
        yamllist.flush()

        # call 'classic' merge function
        mergeAll(yamllist.name, outputFilename, chunkSize, transfoFile, select, exclude)
Beispiel #2
0
def launchMasterSlaves():
    """Start the cyclops slaves and the cyclops master described by a fixed config.

    Writes the configuration to ``/tmp/cyclops_unittest_config.yaml``, spawns
    one ``cyclops`` process per configured slave, sleeps one second, spawns
    ``cyclopsmaster`` on that configuration file and sleeps one more second.

    Returns:
        list: the pids of the spawned processes, slaves first, master last.
    """
    configFile = '/tmp/cyclops_unittest_config.yaml'
    config = {
        'port':
        8090,
        'slaves': [{
            'host': 'localhost',
            'port': 8091
        }, {
            'host': 'localhost',
            'port': 8092
        }]
    }

    # close the file explicitly so the configuration is completely written
    # before the master process is started on it
    with open(configFile, 'w') as f:
        yaml.dump(config, f)

    def spawn(command):
        # NOTE(review): only the pid is kept, so nothing ever reads the pipes
        return subprocess.Popen(command,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE).pid

    pids = [
        spawn(['cyclops', '-p', str(slave['port'])])
        for slave in config['slaves']
    ]

    # presumably gives the slaves time to come up before the master starts
    time.sleep(1)
    pids.append(spawn(['cyclopsmaster', configFile]))

    time.sleep(1)

    return pids
Beispiel #3
0
def launchMasterSlaves():
    """Launch two local ``cyclops`` slaves followed by a ``cyclopsmaster``.

    The master configuration (its port and the host/port of each slave) is
    dumped as YAML to ``/tmp/cyclops_unittest_config.yaml``. The function
    sleeps one second after starting the slaves and one second after starting
    the master.

    Returns:
        list: pids of the started processes, in launch order.
    """
    configPath = '/tmp/cyclops_unittest_config.yaml'
    config = { 'port': 8090,
               'slaves': [ { 'host': 'localhost',
                             'port': 8091 },
                           { 'host': 'localhost',
                             'port': 8092 }
                           ]
               }

    # the handle is closed before any process is spawned, so the master never
    # sees a partially written configuration
    with open(configPath, 'w') as configFile:
        yaml.dump(config, configFile)

    commands = [ [ 'cyclops', '-p', str(slave['port']) ] for slave in config['slaves'] ]

    pids = []
    for command in commands:
        # NOTE(review): the Popen object is dropped, only its pid is kept
        pids.append(subprocess.Popen(command,
                                     stdout = subprocess.PIPE,
                                     stderr = subprocess.PIPE).pid)

    time.sleep(1)
    pids.append(subprocess.Popen([ 'cyclopsmaster', configPath ],
                                 stdout = subprocess.PIPE,
                                 stderr = subprocess.PIPE).pid)

    time.sleep(1)

    return pids
def convertJsonToSig(filelist_file, result_filelist_file):
    """Convert the JSON files of a filelist into ``*.sig`` YAML files.

    ``filelist_file`` is a YAML mapping ``{track id: json path}``. Each JSON
    file is loaded, the ``tags`` and ``sample_rate`` descriptors are removed
    and the result is dumped as YAML next to the original, with the extension
    replaced by ``.sig``. The mapping, with converted entries pointing at
    their ``.sig`` file, is written to ``result_filelist_file``.

    Files that fail to convert keep their original path in the result and are
    listed on standard output.

    Returns:
        bool: True when every file was converted, False otherwise.
    """
    # NOTE(review): yaml.load can instantiate arbitrary objects; only feed it
    # trusted filelists (yaml.safe_load would be the safer choice).
    with open(filelist_file, 'r') as f:
        fl = yaml.load(f)

    # work on a copy so the mapping being iterated is never modified
    result_fl = dict(fl)
    errors = []

    for trackid, json_file in fl.items():
        try:
            with open(json_file) as f:
                data = json.load(f)

            # remove descriptors, that will otherwise break gaia_fusion due to incompatibility of layouts
            metadata = data['metadata']
            if 'tags' in metadata:
                del metadata['tags']
            audio_properties = metadata['audio_properties']
            if 'sample_rate' in audio_properties:
                del audio_properties['sample_rate']

            sig_file = os.path.splitext(json_file)[0] + '.sig'

            with open(sig_file, 'w') as f:
                yaml.dump(data, f)
            result_fl[trackid] = sig_file

        except Exception:
            # best effort: remember the failure and carry on with the other
            # files, without swallowing KeyboardInterrupt / SystemExit
            errors += [json_file]

    with open(result_filelist_file, 'w') as f:
        yaml.dump(result_fl, f)

    # single-argument print calls behave the same on Python 2 and 3
    print("Failed to convert %d files:" % len(errors))
    for e in errors:
        print(e)

    return len(errors) == 0
Beispiel #5
0
def mergeDirectory(dirname,
                   outputFilename,
                   chunkSize,
                   transfoFile,
                   select=None,
                   exclude=None):
    """Collect the ``*.sig`` files below ``dirname`` and merge them.

    Every file ending in ``.sig`` but not in ``.neq.sig`` is registered under
    its file name stripped of the extension. The resulting mapping is dumped
    to a temporary YAML filelist whose name is passed to ``mergeAll`` together
    with the remaining arguments.

    Files with the same name in different sub-directories share one point id;
    the last one visited replaces the earlier ones.

    Args:
        dirname: directory that is walked recursively.
        outputFilename, chunkSize, transfoFile, select, exclude: passed
            through to ``mergeAll`` as they are.
    """
    import tempfile

    idIsFullPath = False
    ext = '.sig'

    def isValid(name):
        # TODO: this should be more flexible
        return name.endswith(ext) and not name.endswith('.neq' + ext)

    # find the list of all the points that should go into the dataset
    plist = {}
    for root, _dirs, files in os.walk(dirname):
        for filename in files:
            if not isValid(filename):
                continue
            fullpath = os.path.join(root, filename)
            pid = fullpath if idIsFullPath else filename
            # remove extension from the point id
            plist[pid[:-len(ext)]] = fullpath

    # write a temporary yaml filelist; leaving the with-block deletes it,
    # whether or not the merge succeeded
    with tempfile.NamedTemporaryFile(mode='w+') as yamllist:
        fastyaml.dump(plist, yamllist)
        yamllist.flush()

        # call 'classic' merge function
        mergeAll(yamllist.name, outputFilename, chunkSize, transfoFile,
                 select, exclude)
Beispiel #6
0
def convertJsonToSig(filelist_file, result_filelist_file):
    """Convert the JSON files listed in a YAML filelist into ``*.sig`` files.

    ``filelist_file`` maps track ids to JSON paths. For every entry the JSON
    document is loaded, the ``tags``, ``sample_rate`` and ``lossless``
    descriptors are dropped, and the document is written as YAML to the same
    path with a ``.sig`` extension. The updated mapping is written to
    ``result_filelist_file``; entries that could not be converted keep their
    original path and are printed.

    Returns:
        bool: True if no file failed to convert.
    """
    # NOTE(review): yaml.load can instantiate arbitrary objects; only feed it
    # trusted filelists (yaml.safe_load would be the safer choice).
    with open(filelist_file, 'r') as f:
        fl = yaml.load(f)

    # copy: the result is updated while the original mapping is iterated
    result_fl = dict(fl)
    errors = []

    # items() exists on both Python 2 and 3 dicts, iteritems() only on 2
    for trackid, json_file in fl.items():
        try:
            with open(json_file) as f:
                data = json.load(f)

            # remove descriptors, that will otherwise break gaia_fusion due to incompatibility of layouts
            metadata = data['metadata']
            if 'tags' in metadata:
                del metadata['tags']
            audio_properties = metadata['audio_properties']
            for descriptor in ('sample_rate', 'lossless'):
                if descriptor in audio_properties:
                    del audio_properties[descriptor]

            sig_file = os.path.splitext(json_file)[0] + '.sig'

            with open(sig_file, 'w') as f:
                yaml.safe_dump(data, f)
            result_fl[trackid] = sig_file

        except Exception:
            # best effort: record the failure and continue with the next file
            errors += [json_file]

    with open(result_filelist_file, 'w') as f:
        yaml.dump(result_fl, f)

    print("Failed to convert", len(errors), "files:")
    for e in errors:
        print(e)

    return len(errors) == 0
Beispiel #7
0
 def save(self, filename):
     """Dump the matrix and the folds of this object to ``filename`` as YAML.

     ``self.matrix`` is rebuilt as plain nested dicts before dumping; the
     folds are stored under the ``fold`` key.
     """
     # convert to "normal" dicts before saving
     plainMatrix = {}
     for rowKey, row in self.matrix.items():
         plainMatrix[rowKey] = dict(row)

     payload = {'matrix': plainMatrix, 'fold': self.folds}
     with open(filename, 'w') as out:
         yaml.dump(payload, out)
Beispiel #8
0
    def run(self, className, outfilename, param, dsname, gtname, evalconfig):
        """Run every evaluation of ``evalconfig`` for one model configuration.

        Loads the ground truth ``gtname`` and the dataset ``dsname``, drops the
        ground-truth entries that are not in the dataset, gets a trainer from
        ``getTrainer`` and, for each evaluation that has no ``.result`` file
        yet, runs ``evaluateNfold`` and writes ``<outfilename>_<i>.param`` and
        ``<outfilename>_<i>.result``.

        Any exception is logged together with the parameters and re-raised.
        """
        try:
            classifierName = param['classifier']

            groundTruth = GroundTruth(classifierName)
            groundTruth.load(gtname)
            # force the GroundTruth class name to be the one specified by our project file, not
            # the one in the original groundtruth file
            groundTruth.className = className

            dataset = DataSet()
            dataset.load(dsname)

            # some points may have failed to be analyzed, remove those from the GroundTruth
            knownPoints = dataset.pointNames()
            for pointId in list(groundTruth.keys()):
                if pointId in knownPoints:
                    continue
                log.warning(
                    'Removing %s from GroundTruth as it could not be found in the merged dataset'
                    % pointId)
                del groundTruth[pointId]

            trainerFun, trainingparam, _newDataset = getTrainer(
                classifierName, param, dataset)

            # run all the evaluations specified in the evaluation config
            for evalIndex, evalparam in enumerate(evalconfig):
                resultFilename = outfilename + '_%d.result' % evalIndex

                # if we already ran this evaluation, no need to run it again...
                if exists(resultFilename):
                    log.info('File %s already exists. Skipping evaluation...' %
                             resultFilename)
                    continue

                description = (evalIndex, outfilename, param['classifier'],
                               param['preprocessing'])
                log.info(
                    'Running evaluation %d for: %s with classifier %s and dataset %s'
                    % description)
                log.info('    PID: %d, parameters: %s' %
                         (os.getpid(), json.dumps(param)))

                # run evaluation
                confusion = evaluateNfold(evalparam['nfold'], dataset,
                                          groundTruth, trainerFun,
                                          **trainingparam)

                # write evaluation params & result
                paramFilename = outfilename + '_%d.param' % evalIndex
                with open(paramFilename, 'w') as paramFile:
                    yaml.dump({'model': param, 'evaluation': evalparam},
                              paramFile)

                confusion.save(resultFilename)

        except Exception:
            log.error(
                'While doing evaluation with param = %s\nevaluation = %s' %
                (param, evalconfig))
            raise
def evaluate_dataset(eval_job, dataset_dir, storage_dir):
    """Run one dataset evaluation job and record its outcome in the database.

    Marks the job as running, writes ``filelist.yaml`` and ``groundtruth.yaml``
    for the training set into ``<dataset_dir>/<job id>``, trains a model with
    ``gaia_wrapper.train_model`` and stores the results and the DONE status.
    A ``DatabaseException`` marks the job as failed instead. The temporary
    directory holding the dumped low-level data is always removed.
    """
    job_id = eval_job["id"]
    db.dataset_eval.set_job_status(job_id, db.dataset_eval.STATUS_RUNNING)

    eval_location = os.path.join(os.path.abspath(dataset_dir), job_id)
    utils.path.create_path(eval_location)
    temp_dir = tempfile.mkdtemp()

    try:
        snapshot_id = eval_job["snapshot_id"]
        snapshot = db.dataset.get_snapshot(snapshot_id)

        train, test = artistfilter.filter(snapshot_id, eval_job["options"])
        db.dataset_eval.add_sets_to_job(job_id, train, test)

        logging.info("Generating filelist.yaml and copying low-level data for evaluation...")
        filelist_path = os.path.join(eval_location, "filelist.yaml")
        filelist = dump_lowlevel_data(train.keys(), temp_dir)
        with open(filelist_path, "w") as filelist_file:
            yaml.dump(filelist, filelist_file)

        logging.info("Generating groundtruth.yaml...")
        groundtruth_path = os.path.join(eval_location, "groundtruth.yaml")
        with open(groundtruth_path, "w") as groundtruth_file:
            groundtruth = create_groundtruth_dict(snapshot["data"]["name"], train)
            yaml.dump(groundtruth, groundtruth_file)

        logging.info("Training model...")
        results = gaia_wrapper.train_model(
            project_dir=eval_location,
            groundtruth_file=groundtruth_path,
            filelist_file=filelist_path,
        )

        logging.info("Saving results...")
        save_history_file(storage_dir, results["history_path"], job_id)
        job_result = {
            "project_path": eval_location,
            "parameters": results["parameters"],
            "accuracy": results["accuracy"],
            "confusion_matrix": results["confusion_matrix"],
            "history_path": results["history_path"],
        }
        db.dataset_eval.set_job_result(job_id, json.dumps(job_result))
        db.dataset_eval.set_job_status(job_id, db.dataset_eval.STATUS_DONE)
        logging.info("Evaluation job %s has been completed." % job_id)

    # TODO(roman): Also need to catch exceptions from Gaia.
    except db.exceptions.DatabaseException as e:
        logging.info("Evaluation job %s has failed!" % job_id)
        db.dataset_eval.set_job_status(
            job_id=job_id,
            status=db.dataset_eval.STATUS_FAILED,
            status_msg=str(e),
        )
        logging.info(e)

    finally:
        # Clean up the source files used to generate this model.
        # We can recreate them from the database if we need them
        # at a later stage.
        shutil.rmtree(temp_dir)
Beispiel #10
0
 def save(self, yamlfile):
     """Write this ground truth to ``yamlfile`` as a YAML document.

     The document holds a fixed ``version`` (1.0) and ``type``
     ('singleClass'), the ``className`` attribute and a plain-dict copy of
     this object's items under ``groundTruth``.
     """
     with open(yamlfile, 'w') as out:
         document = {}
         document['version'] = 1.0
         document['type'] = 'singleClass'
         document['className'] = self.className
         document['groundTruth'] = dict(self)
         yaml.dump(document, out)
Beispiel #11
0
def main(dirname, options):
    """Build the filelist and groundtruth of a project directory, then train.

    Every sub-directory of ``dirname`` is treated as a class. Each file
    returned by ``get_files_in_dir(<class dir>, "json")`` becomes a point
    labelled with that class. ``filelist.yaml`` and ``groundtruth.yaml`` are
    written into the project directory. If a class has a different number of
    "json" and "sig" files, ``json_to_sig.convertJsonToSig`` is run. Finally
    ``train_model.trainModel`` is called.

    Nothing is done (apart from a message on stderr) when the project already
    has a ``results`` directory.

    Args:
        dirname: project directory; the process exits with status 2 if it is
            not a directory.
        options: not used by this function.
    """
    import copy

    if not os.path.isdir(dirname):
        print("Invalid directory: " + dirname)
        sys.exit(2)

    print("running in dir", dirname)
    project_dir = os.path.abspath(dirname)
    projname = os.path.basename(project_dir)

    # if config/results exist, need force to rm them
    project_file = os.path.join(project_dir, "%s.project" % projname)
    results_model_file = os.path.join(project_dir, "%s.history" % projname)
    resultsdir = os.path.join(project_dir, "results")

    if os.path.exists(resultsdir):
        # `print >> sys.stderr, ...` is Python 2 only; write() works everywhere
        sys.stderr.write("Results directory already exists. Use -f to delete and re-run\n")
        return

    classes = [d for d in os.listdir(project_dir)
               if os.path.isdir(os.path.join(project_dir, d))]
    print(classes)

    groundtruth_name = os.path.join(project_dir, "groundtruth.yaml")
    json_name = os.path.join(project_dir, "filelist.yaml")
    yaml_name = os.path.join(project_dir, "filelist-yaml.yaml")

    filelist = {}
    # copy the module-level template so repeated calls do not accumulate
    # entries in it
    groundtruth = copy.deepcopy(template)
    missingsig = False
    for c in classes:
        class_dir = os.path.join(project_dir, c)
        files = get_files_in_dir(class_dir, "json")
        yamlfiles = get_files_in_dir(class_dir, "sig")

        if len(files) != len(yamlfiles):
            missingsig = True

        print("got", len(files), "files in", c)
        for f in files:
            point_id = os.path.splitext(os.path.basename(f))[0]
            groundtruth["groundTruth"][point_id] = c
            filelist[point_id] = os.path.join(project_dir, c, f)

    # check directories for sig and convert
    groundtruth["className"] = projname
    # close both files before they are read by the steps below
    with open(json_name, "w") as out:
        yaml.dump(filelist, out)
    with open(groundtruth_name, "w") as out:
        yaml.dump(groundtruth, out)

    if missingsig:
        print("converting sig")
        json_to_sig.convertJsonToSig(json_name, yaml_name)

    # NOTE(review): yaml_name is only written by the conversion above; when no
    # conversion is needed it presumably has to exist already - confirm.
    # run
    train_model.trainModel(groundtruth_name, yaml_name, project_file, project_dir, results_model_file)
Beispiel #12
0
def main(dirname, options):
    """Generate the filelist and groundtruth of a project directory and train.

    Each sub-directory of ``dirname`` is a class; the files returned by
    ``get_files_in_dir(<class dir>, "json")`` are its points.
    ``filelist.yaml`` and ``groundtruth.yaml`` are written into the project
    directory. ``json_to_sig.convertJsonToSig`` is run when some class has a
    different number of "json" and "sig" files. ``train_model.trainModel`` is
    then called.

    Returns early, with a message on stderr, when a ``results`` directory
    already exists.

    Args:
        dirname: project directory.
        options: not used by this function.
    """
    import copy

    # single-argument print calls behave the same on Python 2 and 3
    print("running in dir %s" % dirname)
    project_dir = os.path.abspath(dirname)
    # take the name from the normalized path: basename(dirname) is empty when
    # dirname ends with a path separator
    projname = os.path.basename(project_dir)

    # if config/results exist, need force to rm them
    project_file = os.path.join(project_dir, "%s.project" % projname)
    results_model_file = os.path.join(project_dir, "%s.history" % projname)
    resultsdir = os.path.join(project_dir, "results")

    if os.path.exists(resultsdir):
        sys.stderr.write("Results directory already exists. Use -f to delete and re-run\n")
        return

    classes = [d for d in os.listdir(project_dir)
               if os.path.isdir(os.path.join(project_dir, d))]
    print(classes)

    groundtruth_name = os.path.join(project_dir, "groundtruth.yaml")
    json_name = os.path.join(project_dir, "filelist.yaml")
    yaml_name = os.path.join(project_dir, "filelist-yaml.yaml")

    filelist = {}
    # work on a copy so the module-level template is left untouched
    groundtruth = copy.deepcopy(template)
    missingsig = False
    for c in classes:
        class_dir = os.path.join(project_dir, c)
        files = get_files_in_dir(class_dir, "json")
        yamlfiles = get_files_in_dir(class_dir, "sig")

        if len(files) != len(yamlfiles):
            missingsig = True

        print("got %d files in %s" % (len(files), c))
        for f in files:
            point_id = os.path.splitext(os.path.basename(f))[0]
            groundtruth["groundTruth"][point_id] = c
            filelist[point_id] = os.path.join(project_dir, c, f)

    # check directories for sig and convert
    groundtruth["className"] = projname
    # close both files before the following steps read them
    with open(json_name, "w") as out:
        yaml.dump(filelist, out)
    with open(groundtruth_name, "w") as out:
        yaml.dump(groundtruth, out)

    if missingsig:
        print("converting sig")
        json_to_sig.convertJsonToSig(json_name, yaml_name)

    # NOTE(review): yaml_name is only written by the conversion above; when no
    # conversion is needed it presumably has to exist already - confirm.
    # run
    train_model.trainModel(groundtruth_name, yaml_name, project_file, project_dir, results_model_file)
Beispiel #13
0
def generateProjectFromCollection():
    """Command-line entry point: create a project file from a collection.

    Expects three positional arguments (collection name, sigfiles directory,
    project file) and the optional ``-g/--groundtruth`` option. Writes
    ``<project file without extension>.filelist.yaml`` with the list produced
    by ``sigfileListFromCollection`` and then the project file itself from
    ``PROJECT_TEMPLATE``.

    Prints the help and exits with status 1 when fewer than three positional
    arguments are given.
    """
    parser = OptionParser(
        usage='%prog [options] collection_name sigfiles_dir project_file\n\n' +
        'this will also generate a groundtruth and a filelist file to be used by the project file.'
    )

    parser.add_option(
        '-g',
        '--groundtruth',
        dest='desiredGroundTruth',
        help=
        'Which type of ground truth to use, in case the collection has more than one'
    )

    options, args = parser.parse_args()

    # explicit check instead of a bare except around the indexing
    if len(args) < 3:
        parser.print_help()
        sys.exit(1)
    collection_name, sigfiles_dir, project_file = args[:3]

    # create collection from a directory collection_name if it exists, use an MTG-DB collection otherwise
    if os.path.isdir(collection_name):
        collec = gaia2.mtgdb.Collection(collection_name,
                                        groundTruth=options.desiredGroundTruth)
    else:
        collec = gaia2.mtgdb.MtgdbCollection(
            collection_name, groundTruth=options.desiredGroundTruth)

    # write yaml file of sigfiles to merge for this project
    filelistFilename = abspath(splitext(project_file)[0] + '.filelist.yaml')
    sigfileList = sigfileListFromCollection(collec, sigfiles_dir)

    with open(filelistFilename, 'w') as filelist:
        yaml.dump(sigfileList, filelist)

    # write the project file
    with open(project_file, 'w') as pfile:
        pfile.write(
            PROJECT_TEMPLATE % {
                'className': collec.groundTruth.className,
                'filelist': filelistFilename,
                'groundtruth': abspath(collec._groundTruthFile)
            })

    # single-argument print call: same output on Python 2 and 3
    print('Successfully written %s' % project_file)
Beispiel #14
0
    def run(self, className, outfilename, param, dsname, gtname, evalconfig):
        """Evaluate one parameter set against every entry of ``evalconfig``.

        The ground truth (``gtname``) and dataset (``dsname``) are loaded and
        ground-truth points missing from the dataset are removed. For each
        evaluation whose ``.result`` file does not exist yet,
        ``evaluateNfold`` is run with the trainer returned by ``getTrainer``;
        the parameters go to ``<outfilename>_<i>.param`` and the confusion
        result to ``<outfilename>_<i>.result``.

        Exceptions are logged with the parameters in use and re-raised.
        """
        try:
            classifier = param['classifier']
            truth = GroundTruth(classifier)
            truth.load(gtname)

            # force the GroundTruth class name to be the one specified by our project file, not
            # the one in the original groundtruth file
            truth.className = className

            points = DataSet()
            points.load(dsname)

            # some points may have failed to be analyzed, remove those from the GroundTruth
            available = points.pointNames()
            missing = [pid for pid in list(truth.keys()) if pid not in available]
            for pid in missing:
                log.warning('Removing %s from GroundTruth as it could not be found in the merged dataset' % pid)
                del truth[pid]

            trainerFun, trainingparam, _unusedDataset = getTrainer(classifier, param, points)

            # run all the evaluations specified in the evaluation config
            for number, evalparam in enumerate(evalconfig):
                resultFilename = outfilename + '_%d.result' % number

                if not exists(resultFilename):
                    log.info('Running evaluation %d for: %s with classifier %s and dataset %s' % (number, outfilename,
                                                                                                  param['classifier'],
                                                                                                  param['preprocessing']))
                    log.info('    PID: %d, parameters: %s' % (os.getpid(), json.dumps(param)))

                    # run evaluation
                    confusion = evaluateNfold(evalparam['nfold'], points, truth, trainerFun, **trainingparam)

                    # write evaluation params & result
                    with open(outfilename + '_%d.param' % number, 'w') as paramFile:
                        yaml.dump({ 'model': param, 'evaluation': evalparam }, paramFile)

                    confusion.save(resultFilename)
                else:
                    # if we already ran this evaluation, no need to run it again...
                    log.info('File %s already exists. Skipping evaluation...' % resultFilename)

        except Exception:
            log.error('While doing evaluation with param = %s\nevaluation = %s' % (param, evalconfig))
            raise
Beispiel #15
0
    def __call__(self, *args, **kwargs):
        """Serialize the call as a YAML request and POST it to ``self.endPoint``.

        The request is a YAML mapping holding ``self.methodName``, the
        positional arguments and a fixed id. It is sent url-encoded as the
        ``q`` form field.

        NOTE(review): the code visible here stops after sending the request
        and never reads the response - confirm against the full method.

        Raises:
            NotImplementedError: if keyword arguments are given.
            RuntimeError: if the request cannot be serialized or sent.
        """
        if kwargs:
            raise NotImplementedError('Cannot use keyword arguments with YamlRPC at the moment...')

        if VERBOSE: serializeStart = time.time()

        try:
            q = yaml.dump({ 'method': self.methodName,
                            'params': list(args),
                            'id': 'gloubi-boulga'
                            })
        except Exception:
            # narrowed from a bare except so Ctrl-C / SystemExit still propagate
            raise RuntimeError('Could not serialize Yaml request, most likely one of the arguments could not be serialized:\n%s' % list(args))

        if VERBOSE:
            responseTime = time.time() - serializeStart
            # single-argument print call: same output on Python 2 and 3
            print('serialized request in %f seconds' % responseTime)


        # we don't want the '+'-quoting
        params = urllib.urlencode({ 'q': q }).replace('+', ' ')

        headers = { 'Content-type': 'application/x-www-form-urlencoded',
                    'Accept': 'text/plain'
                    }

        if VERBOSE: startTime = time.time()

        conn = httplib.HTTPConnection(self.endPoint)

        try:
            conn.request('POST', '/', params, headers)
        except Exception as e:
            raise RuntimeError('request failed', self.endPoint, self.methodName, args, e)
Beispiel #16
0
def convertJsonToSig():
    """Command-line entry point converting JSON files into ``*.sig`` files.

    Takes two positional arguments: a YAML filelist ``{track id: json path}``
    and the path of the filelist to write. Each JSON file is loaded, its
    ``tags`` and ``sample_rate`` descriptors are removed and it is dumped as
    YAML next to the original with a ``.sig`` extension. Files that fail keep
    their original path in the result and are printed.

    Prints the help and exits with status 1 when an argument is missing.

    Returns:
        int: the number of files that could not be converted.
    """
    parser = OptionParser(usage = '%prog [options] filelist_file result_filelist_file\n' +
"""
Converts json files found in filelist_file into *.sig yaml files compatible with
Gaia. The result files are written to the same directory where original files were 
located.
"""
        )

    options, args = parser.parse_args()

    # explicit check instead of a bare except around the indexing
    if len(args) < 2:
        parser.print_help()
        sys.exit(1)
    filelist_file, result_filelist_file = args[:2]

    # NOTE(review): yaml.load can instantiate arbitrary objects; only feed it
    # trusted filelists (yaml.safe_load would be the safer choice).
    with open(filelist_file, 'r') as f:
        fl = yaml.load(f)

    # copy: the result is updated while the original mapping is iterated
    result_fl = dict(fl)
    errors = []

    for trackid, json_file in fl.items():
        try:
            with open(json_file) as f:
                data = json.load(f)

            # remove descriptors, that will otherwise break gaia_fusion due to incompatibility of layouts
            metadata = data['metadata']
            if 'tags' in metadata:
                del metadata['tags']
            audio_properties = metadata['audio_properties']
            if 'sample_rate' in audio_properties:
                del audio_properties['sample_rate']

            sig_file = os.path.splitext(json_file)[0] + '.sig'

            with open(sig_file, 'w') as f:
                yaml.dump(data, f)
            result_fl[trackid] = sig_file

        except Exception:
            # best effort: record the failure and go on with the next file
            errors += [json_file]

    with open(result_filelist_file, 'w') as f:
        yaml.dump(result_fl, f)

    # single-argument print calls behave the same on Python 2 and 3
    print("Failed to convert %d files:" % len(errors))
    for e in errors:
        print(e)
    return len(errors)
def evaluate_dataset(eval_job):
    """Evaluate the dataset of ``eval_job`` and store the outcome in the database.

    The job is marked as running, ``filelist.yaml`` and ``groundtruth.yaml``
    are generated in a temporary directory, a model is trained with
    ``gaia_wrapper.train_model`` and the results plus the DONE status are
    saved. A ``DatabaseException`` marks the job as failed. The temporary
    directory is removed in every case.
    """
    job_id = eval_job["id"]
    db.dataset_eval.set_job_status(job_id, db.dataset_eval.STATUS_RUNNING)
    temp_dir = tempfile.mkdtemp()

    try:
        dataset = db.dataset.get(eval_job["dataset_id"])

        logging.info("Generating filelist.yaml and copying low-level data for evaluation...")
        filelist_path = os.path.join(temp_dir, "filelist.yaml")
        data_dir = os.path.join(temp_dir, "data")
        filelist = dump_lowlevel_data(extract_recordings(dataset), data_dir)
        with open(filelist_path, "w") as filelist_file:
            yaml.dump(filelist, filelist_file)

        logging.info("Generating groundtruth.yaml...")
        groundtruth_path = os.path.join(temp_dir, "groundtruth.yaml")
        with open(groundtruth_path, "w") as groundtruth_file:
            groundtruth = create_groundtruth(dataset)
            yaml.dump(groundtruth, groundtruth_file)

        logging.info("Training model...")
        results = gaia_wrapper.train_model(
            groundtruth_file=groundtruth_path,
            filelist_file=filelist_path,
            project_dir=temp_dir,
        )

        logging.info("Saving results...")
        save_history_file(results["history_path"], job_id)
        job_result = {
            "parameters": results["parameters"],
            "accuracy": results["accuracy"],
            "confusion_matrix": results["confusion_matrix"],
            "history_path": results["history_path"],
        }
        db.dataset_eval.set_job_result(job_id, json.dumps(job_result))
        db.dataset_eval.set_job_status(job_id, db.dataset_eval.STATUS_DONE)
        logging.info("Evaluation job %s has been completed." % job_id)

    # TODO(roman): Also need to catch exceptions from Gaia.
    except db.exceptions.DatabaseException as e:
        logging.info("Evaluation job %s has failed!" % job_id)
        db.dataset_eval.set_job_status(
            job_id=job_id,
            status=db.dataset_eval.STATUS_FAILED,
            status_msg=str(e),
        )
        logging.info(e)

    finally:
        shutil.rmtree(temp_dir)  # Cleanup
Beispiel #18
0
def evaluate_dataset(eval_job):
    """Train and evaluate a model for the dataset referenced by ``eval_job``.

    Steps, all inside a temporary directory that is deleted at the end:
    dump the low-level data and write ``filelist.yaml``, write
    ``groundtruth.yaml``, call ``gaia_wrapper.train_model``, save the history
    file and store the job result. The job status goes from RUNNING to DONE,
    or to FAILED when a ``DatabaseException`` is raised.
    """
    status = db.dataset_eval
    status.set_job_status(eval_job["id"], status.STATUS_RUNNING)
    temp_dir = tempfile.mkdtemp()

    def in_temp(name):
        # path of ``name`` inside the temporary working directory
        return os.path.join(temp_dir, name)

    try:
        dataset = db.dataset.get(eval_job["dataset_id"])

        logging.info("Generating filelist.yaml and copying low-level data for evaluation...")
        filelist_path = in_temp("filelist.yaml")
        recordings = extract_recordings(dataset)
        filelist = dump_lowlevel_data(recordings, in_temp("data"))
        with open(filelist_path, "w") as out:
            yaml.dump(filelist, out)

        logging.info("Generating groundtruth.yaml...")
        groundtruth_path = in_temp("groundtruth.yaml")
        with open(groundtruth_path, "w") as out:
            yaml.dump(create_groundtruth(dataset), out)

        logging.info("Training model...")
        results = gaia_wrapper.train_model(
            groundtruth_file=groundtruth_path,
            filelist_file=filelist_path,
            project_dir=temp_dir,
        )

        logging.info("Saving results...")
        history_path = results["history_path"]
        save_history_file(history_path, eval_job["id"])
        serialized = json.dumps({
            "parameters": results["parameters"],
            "accuracy": results["accuracy"],
            "confusion_matrix": results["confusion_matrix"],
            "history_path": results["history_path"],
        })
        db.dataset_eval.set_job_result(eval_job["id"], serialized)
        db.dataset_eval.set_job_status(eval_job["id"], db.dataset_eval.STATUS_DONE)
        logging.info("Evaluation job %s has been completed." % eval_job["id"])

    # TODO(roman): Also need to catch exceptions from Gaia.
    except db.exceptions.DatabaseException as e:
        logging.info("Evaluation job %s has failed!" % eval_job["id"])
        db.dataset_eval.set_job_status(
            job_id=eval_job["id"],
            status=db.dataset_eval.STATUS_FAILED,
            status_msg=str(e),
        )
        logging.info(e)

    finally:
        shutil.rmtree(temp_dir)  # Cleanup
Beispiel #19
0
def convertJsonToSig():
    """Command-line tool: convert the JSON files of a filelist to ``*.sig``.

    Positional arguments are the input YAML filelist (track id to JSON path)
    and the output filelist path. For every entry the JSON document is read,
    ``tags`` and ``sample_rate`` are removed and the document is written as
    YAML beside the original file, with a ``.sig`` extension. The output
    filelist points converted entries at their ``.sig`` file; failed entries
    keep their JSON path and are printed.

    Prints the help and exits with status 1 when an argument is missing.

    Returns:
        int: how many files failed to convert.
    """
    parser = OptionParser(
        usage='%prog [options] filelist_file result_filelist_file\n' + """
Converts json files found in filelist_file into *.sig yaml files compatible with
Gaia. The result files are written to the same directory where original files were 
located.
""")

    options, args = parser.parse_args()

    # explicit check instead of a bare except around the indexing
    if len(args) < 2:
        parser.print_help()
        sys.exit(1)
    filelist_file = args[0]
    result_filelist_file = args[1]

    # NOTE(review): yaml.load can instantiate arbitrary objects; only feed it
    # trusted filelists (yaml.safe_load would be the safer choice).
    with open(filelist_file, 'r') as source:
        fl = yaml.load(source)

    # separate dict, so the mapping being iterated is never modified
    result_fl = dict(fl)
    errors = []

    for trackid, json_file in fl.items():
        try:
            with open(json_file) as source:
                data = json.load(source)

            # remove descriptors, that will otherwise break gaia_fusion due to incompatibility of layouts
            metadata = data['metadata']
            if 'tags' in metadata:
                del metadata['tags']
            audio_properties = metadata['audio_properties']
            if 'sample_rate' in audio_properties:
                del audio_properties['sample_rate']

            sig_file = os.path.splitext(json_file)[0] + '.sig'

            with open(sig_file, 'w') as target:
                yaml.dump(data, target)
            result_fl[trackid] = sig_file

        except Exception:
            # best effort: keep going, but let KeyboardInterrupt / SystemExit
            # through
            errors.append(json_file)

    with open(result_filelist_file, 'w') as target:
        yaml.dump(result_fl, target)

    # single-argument print calls behave the same on Python 2 and 3
    print("Failed to convert %d files:" % len(errors))
    for e in errors:
        print(e)
    return len(errors)
Beispiel #20
0
    def __call__(self, *args, **kwargs):
        """Call the remote method with ``args`` and return its result.

        The call is serialized as a YAML mapping (method name, positional
        parameters, fixed id), POSTed url-encoded to ``self.endPoint`` as the
        ``q`` form field, and the YAML reply is parsed. Timing information is
        printed when ``VERBOSE`` is set.

        Returns:
            The ``result`` entry of the reply.

        Raises:
            NotImplementedError: if keyword arguments are given.
            RuntimeError: if the request cannot be serialized or sent, or if
                the reply contains an ``error`` entry.
        """
        if kwargs:
            raise NotImplementedError('Cannot use keyword arguments with YamlRPC at the moment...')

        if VERBOSE: serializeStart = time.time()

        try:
            q = yaml.dump({ 'method': self.methodName,
                            'params': list(args),
                            'id': 'gloubi-boulga'
                            })
        except Exception:
            # narrowed from a bare except so Ctrl-C / SystemExit still propagate
            raise RuntimeError('Could not serialize Yaml request, most likely one of the arguments could not be serialized:\n%s' % list(args))

        if VERBOSE:
            responseTime = time.time() - serializeStart
            print ('serialized request in %f seconds' % responseTime)


        # we don't want the '+'-quoting
        params = urlencode({ 'q': q }).replace('+', ' ')

        headers = { 'Content-type': 'application/x-www-form-urlencoded',
                    'Accept': 'text/plain'
                    }

        if VERBOSE: startTime = time.time()

        conn = http_client.HTTPConnection(self.endPoint)

        # the connection is closed whatever happens, once the reply has been
        # read completely
        try:
            try:
                conn.request('POST', '/', params, headers)
            except Exception as e:
                raise RuntimeError('request failed', self.endPoint, self.methodName, args, e)

            response = conn.getresponse()

            if VERBOSE:
                responseTime = time.time() - startTime
                print ('received answer in %f seconds' % responseTime)
                #print response.status, response.reason

                startParseTime = time.time()

            # NOTE(review): yaml.load on data received from the network can
            # instantiate arbitrary objects; only use with a trusted server
            # (yaml.safe_load would be the safer choice).
            result = yaml.load(response.read())
        finally:
            conn.close()

        if VERBOSE:
            responseTime = time.time() - startParseTime
            print ('parsed answer in %f seconds' % responseTime)

            responseTime = time.time() - serializeStart
            print ('total time: %f seconds' % responseTime)

        if 'error' in result:
            raise RuntimeError(result['error']['message'])

        return result['result']
def lowlevel_data_to_yaml(data):
    """Strip layout-breaking descriptors from ``data`` and return it as YAML.

    The ``tags`` entry of ``data["metadata"]`` and the ``sample_rate`` and
    ``lossless`` entries of ``data["metadata"]["audio_properties"]`` are
    deleted from ``data`` itself (the argument is modified in place) before it
    is dumped with ``yaml.dump``.
    """
    # Removing descriptors, that will otherwise break gaia_fusion due to
    # incompatibility of layouts (see Gaia implementation for more details).
    metadata = data["metadata"]
    if "tags" in metadata:
        del metadata["tags"]

    audio_properties = metadata["audio_properties"]
    for descriptor in ("sample_rate", "lossless"):
        if descriptor in audio_properties:
            del audio_properties[descriptor]

    return yaml.dump(data)
Beispiel #22
0
def lowlevel_data_to_yaml(data):
    """Return the YAML dump of ``data`` without a few metadata descriptors.

    Deletes ``metadata.tags``, ``metadata.audio_properties.sample_rate`` and
    ``metadata.audio_properties.lossless`` from ``data`` when present. The
    dictionary passed in is modified, not copied.
    """
    def discard(container, key):
        # delete ``key`` only if it is there
        if key in container:
            del container[key]

    # Removing descriptors, that will otherwise break gaia_fusion due to
    # incompatibility of layouts (see Gaia implementation for more details).
    discard(data["metadata"], "tags")
    discard(data["metadata"]["audio_properties"], "sample_rate")
    discard(data["metadata"]["audio_properties"], "lossless")

    return yaml.dump(data)
def generateProjectFromCollection():
    """Command-line entry point: generate a project file from a collection.

    Expects three positional command-line arguments: ``collection_name``,
    ``sigfiles_dir`` and ``project_file``. Writes a YAML file list (same path
    as ``project_file`` with its extension replaced by ``.filelist.yaml``)
    and then the project file itself, rendered from ``PROJECT_TEMPLATE``.

    Prints the usage text and exits with status 1 when fewer than three
    positional arguments are given.
    """
    parser = OptionParser(usage = '%prog [options] collection_name sigfiles_dir project_file\n\n' +
                          'this will also generate a groundtruth and a filelist file to be used by the project file.')

    parser.add_option('-g', '--groundtruth', dest = 'desiredGroundTruth',
                      help = 'Which type of ground truth to use, in case the collection has more than one')

    options, args = parser.parse_args()

    # Only a missing positional argument should trigger the usage message; a
    # bare ``except`` would also swallow KeyboardInterrupt and SystemExit.
    try:
        collection_name = args[0]
        sigfiles_dir = args[1]
        project_file = args[2]
    except IndexError:
        parser.print_help()
        sys.exit(1)

    # create collection from a directory collection_name if it exists, use an MTG-DB collection otherwise
    if os.path.isdir(collection_name):
        collec = gaia2.mtgdb.Collection(collection_name, groundTruth = options.desiredGroundTruth)
    else:
        collec = gaia2.mtgdb.MtgdbCollection(collection_name, groundTruth = options.desiredGroundTruth)

    # write yaml file of sigfiles to merge for this project
    filelistFilename = abspath(splitext(project_file)[0] + '.filelist.yaml')
    sigfileList = sigfileListFromCollection(collec, sigfiles_dir)

    with open(filelistFilename, 'w') as filelist:
        yaml.dump(sigfileList, filelist)

    # write the project file
    with open(project_file, 'w') as pfile:
        pfile.write(PROJECT_TEMPLATE % { 'className': collec.groundTruth.className,
                                         'filelist': filelistFilename,
                                         'groundtruth': abspath(collec._groundTruthFile) })

    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print('Successfully written %s' % project_file)
Beispiel #24
0
    def __call__(self, *args, **kwargs):
        """Build a YamlRPC request from the positional arguments and send it.

        The request ``{'method': self.methodName, 'params': list(args),
        'id': ...}`` is dumped with ``yaml.dump``, URL-encoded under the ``q``
        form field and POSTed to ``/`` on ``self.endPoint``. When ``VERBOSE``
        is set, the serialization time is printed.

        Raises:
            NotImplementedError: if any keyword argument is passed.
            RuntimeError: if the request cannot be serialized, or if sending
                it fails.
        """
        if kwargs:
            raise NotImplementedError(
                'Cannot use keyword arguments with YamlRPC at the moment...')

        if VERBOSE: serializeStart = time.time()

        try:
            q = yaml.dump({
                'method': self.methodName,
                'params': list(args),
                'id': 'gloubi-boulga'
            })
        except Exception:
            # ``except Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not converted to RuntimeError.
            raise RuntimeError(
                'Could not serialize Yaml request, most likely one of the arguments could not be serialized:\n%s'
                % list(args))

        if VERBOSE:
            responseTime = time.time() - serializeStart
            # print() with one pre-formatted argument behaves the same on
            # Python 2 and Python 3.
            print('serialized request in %f seconds' % responseTime)

        # we don't want the '+'-quoting
        params = urllib.urlencode({'q': q}).replace('+', ' ')

        headers = {
            'Content-type': 'application/x-www-form-urlencoded',
            'Accept': 'text/plain'
        }

        # NOTE(review): startTime is not read in the visible part of this
        # method; presumably it is used for response timing further on.
        if VERBOSE: startTime = time.time()

        conn = httplib.HTTPConnection(self.endPoint)

        try:
            conn.request('POST', '/', params, headers)
        except Exception as e:
            # ``as`` form is valid on Python 2.6+ and Python 3.
            raise RuntimeError('request failed', self.endPoint,
                               self.methodName, args, e)
Beispiel #25
0
 def save(self, filename):
     """Write ``self.matrix`` to ``filename`` using ``yaml.dump``.

     Every value of ``self.matrix`` is first copied into a plain ``dict``
     so that only "normal" dictionaries are handed to the dumper.
     """
     plain = {}
     for key, inner in list(self.matrix.items()):
         plain[key] = dict(inner)

     with open(filename, 'w') as out:
         yaml.dump(plain, out)
Beispiel #26
0
 def save(self, filename):
     """Dump the matrix to ``filename`` as YAML.

     The outer mapping and each of its values are rebuilt as plain ``dict``
     objects before being passed to ``yaml.dump``.
     """
     # rebuild as "normal" dicts before saving
     as_plain_dicts = {outer_key: dict(inner) for outer_key, inner in self.matrix.items()}
     with open(filename, 'w') as stream:
         yaml.dump(as_plain_dicts, stream)
Beispiel #27
0
            ds_harm_proc.save(ds_harm_filename)


if __name__ == '__main__':
    # Script entry point: harmonize the datasets, load or generate the folds
    # (cached in WORK_DIR/folds.yaml), train the SVM models and finally build
    # the evaluation datasets.
    c = loadCollections()

    # NOTE(review): presumably meant to tolerate an already-existing WORK_DIR;
    # be aware that any other OSError from mkdir is silently ignored as well.
    try:
        os.mkdir(WORK_DIR)
    except OSError:
        pass

    # need to do some prep work before to harmonize all datasets layouts. This won't be
    # necessary anymore in the future when all is nicely generated with a single coherent
    # script, but at the moment we have to work with the data we have...
    harmonizeDatasets(c)

    cachedFolds = False
    foldsFile = '%s/folds.yaml' % WORK_DIR
    if os.path.exists(foldsFile):
        folds = yaml.loadfile(foldsFile)
    else:
        print('Generating folds for all collections...')
        folds = generateFolds(c, NFOLDS)
        # Close the file deterministically instead of leaving the handle to
        # the garbage collector, so the folds are fully written to disk.
        with open(foldsFile, 'w') as foldsOut:
            yaml.dump(folds, foldsOut)

    print('Training SVM models for their corresponding folds...')
    trainSVMfolds(c, folds)

    print('Generating the evaluation datasets from the models...')
    generateEvaluationDatasets(c, folds)
Beispiel #28
0
        return result


if __name__ == '__main__':
    # Command-line entry point: read all results from a results directory and
    # print the entries returned by cr.best(10, classifierType).
    try:
        resultsdir = sys.argv[1]
    except IndexError:
        # Only a missing argument should lead to the usage message.
        print('Usage: %s results_dir [classifierType]' % sys.argv[0])
        sys.exit(1)

    # The classifier type is optional; None is passed on when it is absent.
    try:
        classifierType = sys.argv[2]
    except IndexError:
        classifierType = None

    cr = ClassificationResults()
    print('Loading all results...')
    cr.readResults(resultsdir)

    # Every print below takes one pre-formatted string so that the output is
    # the same on Python 2 and Python 3.
    print('Best parameters:')
    for r, filename, params in cr.best(10, classifierType):
        print('*' * 100)
        print('Correct classification: %2f%%' % r)
        print('Filename: %s' % (filename,))

        model = params['model']
        print('Classifier: %s' % (model['classifier'],))
        print('Parameters:')
        # The classifier was printed above; remove it so that only the
        # remaining entries are dumped.
        del model['classifier']
        # Indent the dump by four spaces. The slice drops the last four
        # characters, i.e. the indent added after the final newline
        # (assuming the dump ends with a newline).
        print('    ' + yaml.dump(model).replace('\n', '\n    ')[:-4])
Beispiel #29
0


if __name__ == '__main__':
    # Script entry point: harmonize the datasets, load or generate the folds
    # (cached in WORK_DIR/folds.yaml), train the SVM models and finally build
    # the evaluation datasets.
    c = loadCollections()

    # NOTE(review): presumably meant to tolerate an already-existing WORK_DIR;
    # any other OSError raised by mkdir is ignored too.
    try:
        os.mkdir(WORK_DIR)
    except OSError:
        pass

    # need to do some prep work before to harmonize all datasets layouts. This won't be
    # necessary anymore in the future when all is nicely generated with a single coherent
    # script, but at the moment we have to work with the data we have...
    harmonizeDatasets(c)

    cachedFolds = False
    foldsFile = '%s/folds.yaml' % WORK_DIR
    if os.path.exists(foldsFile):
        folds = yaml.loadfile(foldsFile)
    else:
        # print() with a single string argument prints the same text on
        # Python 2 and Python 3.
        print('Generating folds for all collections...')
        folds = generateFolds(c, NFOLDS)
        # Use a context manager so the folds file is closed (and flushed)
        # before the training steps start.
        with open(foldsFile, 'w') as foldsOut:
            yaml.dump(folds, foldsOut)

    print('Training SVM models for their corresponding folds...')
    trainSVMfolds(c, folds)

    print('Generating the evaluation datasets from the models...')
    generateEvaluationDatasets(c, folds)
Beispiel #30
0
def evaluate_dataset(eval_job, dataset_dir, storage_dir):
    """Run one dataset evaluation job and record its outcome in the database.

    The job is marked as running, a file list and a ground truth file are
    written to ``<dataset_dir>/<job id>``, a model is trained through
    ``gaia_wrapper.train_model`` and the results are stored on the job, which
    is then marked as done.

    Args:
        eval_job: mapping describing the job; the keys ``"id"``,
            ``"snapshot_id"`` and ``"options"`` are read here.
        dataset_dir: base directory under which the per-job project
            directory is created.
        storage_dir: passed on to ``save_history_file``.

    Only ``db.exceptions.DatabaseException`` is handled (the job is marked as
    failed); any other exception propagates. The temporary directory holding
    the low-level data is removed in every case.
    """
    job_id = eval_job["id"]
    db.dataset_eval.set_job_status(job_id, db.dataset_eval.STATUS_RUNNING)

    eval_location = os.path.join(os.path.abspath(dataset_dir), job_id)
    utils.path.create_path(eval_location)
    temp_dir = tempfile.mkdtemp()

    try:
        snapshot_id = eval_job["snapshot_id"]
        snapshot = db.dataset.get_snapshot(snapshot_id)

        options = eval_job["options"]
        train, test = artistfilter.filter(snapshot_id, options)
        db.dataset_eval.add_sets_to_job(job_id, train, test)

        logging.info(
            "Generating filelist.yaml and copying low-level data for evaluation..."
        )
        filelist_path = os.path.join(eval_location, "filelist.yaml")
        filelist = dump_lowlevel_data(train.keys(), temp_dir)
        with open(filelist_path, "w") as filelist_file:
            yaml.dump(filelist, filelist_file)

        logging.info("Generating groundtruth.yaml...")
        groundtruth_path = os.path.join(eval_location, "groundtruth.yaml")
        with open(groundtruth_path, "w") as groundtruth_file:
            groundtruth = create_groundtruth_dict(snapshot["data"]["name"],
                                                  train)
            yaml.dump(groundtruth, groundtruth_file)

        # Forward the user's training preferences; each one defaults to an
        # empty list when the job options do not specify it.
        logging.info("Training model...")
        results = gaia_wrapper.train_model(
            project_dir=eval_location,
            groundtruth_file=groundtruth_path,
            filelist_file=filelist_path,
            c_values=options.get("c_values", []),
            gamma_values=options.get("gamma_values", []),
            preprocessing_values=options.get("preprocessing_values", []),
        )

        logging.info("Saving results...")
        history_path = results["history_path"]
        save_history_file(storage_dir, history_path, job_id)
        job_result = {
            "project_path": eval_location,
            "parameters": results["parameters"],
            "accuracy": results["accuracy"],
            "confusion_matrix": results["confusion_matrix"],
            "history_path": history_path,
        }
        db.dataset_eval.set_job_result(job_id, json.dumps(job_result))
        db.dataset_eval.set_job_status(job_id, db.dataset_eval.STATUS_DONE)
        logging.info("Evaluation job %s has been completed." % job_id)

    # TODO(roman): Also need to catch exceptions from Gaia.
    except db.exceptions.DatabaseException as e:
        logging.info("Evaluation job %s has failed!" % job_id)
        db.dataset_eval.set_job_status(
            job_id=job_id,
            status=db.dataset_eval.STATUS_FAILED,
            status_msg=str(e),
        )
        logging.info(e)

    finally:
        # The source files used to generate this model are disposable: they
        # can be recreated from the database if needed at a later stage.
        shutil.rmtree(temp_dir)
Beispiel #31
0
 def save(self, yamlfile):
     """Write this object to ``yamlfile`` using ``yaml.dump``.

     The dumped document holds a fixed ``version`` (1.0) and ``type``
     ('singleClass'), ``self.className``, and a plain ``dict`` copy of this
     object's own items under ``groundTruth``.
     """
     with open(yamlfile, 'w') as out:
         document = {
             'version': 1.0,
             'type': 'singleClass',
             'className': self.className,
             'groundTruth': dict(self),
         }
         yaml.dump(document, out)
        return result


if __name__ == '__main__':
    # Command-line entry point: read all results from a results directory and
    # print the entries returned by cr.best(10, classifierType).
    try:
        resultsdir = sys.argv[1]
    except IndexError:
        # Narrowed from a bare ``except``: only a missing argument should
        # produce the usage message.
        print('Usage: %s results_dir [classifierType]' % sys.argv[0])
        sys.exit(1)

    # Optional second argument; None is passed on when it is not given.
    try:
        classifierType = sys.argv[2]
    except IndexError:
        classifierType = None

    cr = ClassificationResults()
    print('Loading all results...')
    cr.readResults(resultsdir)

    print('Best parameters:')
    for r, filename, params in cr.best(10, classifierType):
        print('*' * 100)
        print('Correct classification: %2f%%' % r)
        # One pre-formatted string instead of two print arguments: same text
        # on Python 3, and no tuple repr if run under Python 2.
        print('Filename: %s' % (filename,))

        model = params['model']
        print('Classifier: %s' % (model['classifier'],))
        print('Parameters:')
        # Already printed above; remove it so it is not repeated in the dump.
        del model['classifier']
        # Indent the dump by four spaces. The slice drops the last four
        # characters, i.e. the indent added after the final newline
        # (assuming the dump ends with a newline).
        print('    ' + yaml.dump(model).replace('\n', '\n    ')[:-4])
def main(input_directory, output_directory, project_name, force=False,
         seed=None, cluster_mode=False, force_consistency=False):
    """Build the file list and ground truth for a project, then train a model.

    Every sub-directory of ``input_directory`` is treated as one class. For
    each class, ``json`` files without a matching ``sig`` file are handed to
    ``json_to_sig.convertJsonToSig``; afterwards all ``sig`` files of the
    class are added to the file list and to the ground truth. Both are written
    as YAML into ``<output_directory>/results`` and passed on to
    ``train_model.train_model``.

    Args:
        input_directory: directory containing one sub-directory per class.
        output_directory: directory receiving ``<project_name>.project``,
            ``<project_name>.history`` and the ``results`` sub-directory.
        project_name: project name; also stored as the ground truth
            ``className``.
        force: when true, ``output_directory`` is removed first.
        seed, cluster_mode, force_consistency: forwarded unchanged to
            ``train_model.train_model``.
    """
    # Local import: only needed here, to copy the module-level ``template``.
    import copy

    print("looking for data in dir", input_directory)
    print("storing results in dir", output_directory)

    project_dir = os.path.abspath(input_directory)

    projname = project_name

    output_dir = os.path.abspath(output_directory)

    # if config/results exist, need force to rm them
    project_file = os.path.join(output_dir, "%s.project" % projname)
    results_model_file = os.path.join(output_dir, "%s.history" % projname)
    resultsdir = os.path.join(output_dir, "results")

    if force:
        shutil.rmtree(output_directory, ignore_errors=True)

    if not os.path.exists(resultsdir):
        os.makedirs(resultsdir)

    classes = [d for d in os.listdir(project_dir)
               if os.path.isdir(os.path.join(project_dir, d))]
    print(classes)

    groundtruth_name = os.path.join(resultsdir, "groundtruth.yaml")
    json_name = os.path.join(resultsdir, "filelist-to-convert.yaml")
    yaml_name = os.path.join(resultsdir, "filelist-yaml.yaml")

    filelist = {}
    # Work on a private copy: filling the shared module-level template in
    # place would leak entries from one call of main() into the next.
    groundtruth = copy.deepcopy(template)

    for c in classes:
        class_dir = os.path.join(project_dir, c)
        jsonfiles = get_files_in_dir(class_dir, "json")
        yamlfiles = get_files_in_dir(class_dir, "sig")

        # splitext() removes exactly the extension. str.rstrip('.sig') strips
        # any trailing run of the characters '.', 's', 'i', 'g' and therefore
        # mangles names such as 'songs.sig'.
        yamlfilesNoExt = set(os.path.splitext(f)[0] for f in yamlfiles)

        if (len(jsonfiles) > 0):
            filesToConvert = {
                os.path.splitext(os.path.basename(f))[0]:
                os.path.join(project_dir, c, f)
                for f in jsonfiles
                if os.path.splitext(f)[0] not in yamlfilesNoExt
            }

            print("{} json files have to be converted into yamls. "
                  "{} already exist.".format(len(filesToConvert),
                                             len(yamlfiles)))

            # Close the list file before the converter reads it back, so its
            # content is guaranteed to be flushed to disk.
            with open(json_name, "w") as to_convert:
                yaml.dump(filesToConvert, to_convert)
            json_to_sig.convertJsonToSig(json_name, yaml_name)

        # List the sig files again to pick up the ones just converted.
        yamlfiles = get_files_in_dir(class_dir, "sig")

        print("got", len(yamlfiles), "files in", c)
        for f in yamlfiles:
            point_id = os.path.splitext(os.path.basename(f))[0]
            groundtruth["groundTruth"][point_id] = c
            filelist[point_id] = os.path.join(project_dir, c, f)

    # check directories for sig and convert
    groundtruth["className"] = projname
    with open(yaml_name, "w") as filelist_out:
        yaml.dump(filelist, filelist_out)
    with open(groundtruth_name, "w") as groundtruth_out:
        yaml.dump(groundtruth, groundtruth_out)

    # The conversion list is only an intermediate file.
    if os.path.exists(json_name):
        os.remove(json_name)

    train_model.train_model(groundtruth_name, yaml_name,
                           project_file, resultsdir, results_model_file,
                           seed=seed, cluster_mode=cluster_mode,
                           force_consistency=force_consistency)