Example #1
def score_on_splits(perfmeasure, options, features, labels, featnames, splits):
    ''' Actually do the fitting and evaluating.

    perfmeasure = scoring mechanic. e.g. 'accuracy', 'precision', 'recall', 'f1'
        (this can be a scoring object or a string)
    options = the command line arguments
    features = the data records
    labels = the data labels
    featnames = names of the features of each record. From mldata.data_component
    splits = the results of, say, mldata.gen_splits

    returns (average of scores, standard deviation of scores, and the scores)
    '''

    # Score the splits
    est = mlalgos.get_estimator(options.algorithm, seed=options.seed)
    scores = cross_validation.cross_val_score(est, features, y=labels,
                                              scoring=perfmeasure, cv=splits)

    # Print the results
    # Name the metric for the report: a plain string is used as-is; a scorer
    # object here means an f-beta measure was requested
    if isinstance(perfmeasure, str):
        metric = perfmeasure
    else:
        metric = 'f-' + str(options.beta)
    mlstat.printresults(metric, scores)

    # Icing on the cake: draw a decision tree graph
    # based on the fold with the best f1 score
    if (perfmeasure == 'f1' and options.graphfile is not None and
            isinstance(est, tree.DecisionTreeClassifier)):
        mlalgos.dt_graph(est, splits, scores, features, labels,
                         featnames, options.graphfile)

    return (scores.mean(), scores.std(), scores)
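
Note: the sklearn.cross_validation module used above was removed in
scikit-learn 0.20 in favor of sklearn.model_selection. A minimal sketch of
the same scoring step on current scikit-learn, with a toy dataset and a
plain DecisionTreeClassifier standing in for this project's mldata/mlalgos
helpers:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Toy binary-classification data in place of mldata.data_components output
features, labels = load_breast_cancer(return_X_y=True)

# A splitter object passed as cv=, mirroring the splits argument above
splits = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

est = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(est, features, y=labels, scoring='f1', cv=splits)
print(scores.mean(), scores.std(), scores)
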
Example #2
def atrial(options):
    ''' Run a single machine learning trial.'''
    # TODO: make option for loading intermediate data to skip steps that have
    # been done in previous trials

    # Select data to read
    data = mldata.load_data(options.database)

    # Get a sample
    if options.numsamples is not None:  # Check to see if a sample was requested
        # Only pass a malware fraction if one was specified
        if options.malfrac is not None:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples, options.malfrac)
        else:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples)
    else:
        sample = data

    # Preprocess data
    # TODO: fill in this part

    # If specified, output the current database
    if options.newdb is not None:
        mldata.save_data(sample, options.newdb)

    # Original way to run a trial... probably going to be deleted eventually
    if options.simplyAcc:
        return oldacc(options, sample)

    # Primary way to run a trial
    else:
        printparams(options)
        print('  Measure  Average  Fold-Scores')
        perfmeasures = ['accuracy', 'precision', 'recall', 'f1']
        avgs = []
        for perfmeasure in perfmeasures:
            # Extract the parts of the samples
            # Not yet using the filenames and feature names
            features, labels, _, featnames = mldata.data_components(sample)

            # Split the sample into 10 randomly stratified folds
            cvsplits = cross_validation.StratifiedShuffleSplit(labels,
                        test_size=0.1, random_state=options.seed)

            # Score the folds
            est = mlalgos.get_estimator(options.algorithm)
            scores = cross_validation.cross_val_score(est, features, y=labels,
                        scoring=perfmeasure, cv=cvsplits)

            # Print the results
            avgs.append(scores.mean())
            avgstr = '{:.4}'.format(avgs[-1]).rjust(7)
            resultstr = '{}  {} '.format(perfmeasure.rjust(9), avgstr)
            for score in scores:
                resultstr += ' {:.3}'.format(score)
            print(resultstr)

            # Icing on the cake: draw a decision tree graph
            # based on the fold with the best f1 score
            if (perfmeasure == 'f1' and options.graphfile is not None and
                    isinstance(est, tree.DecisionTreeClassifier)):
                mlalgos.dt_graph(est, cvsplits, scores, features, labels,
                                 featnames, options.graphfile)

        return (perfmeasures, avgs)
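
Example #1 suggests that perfmeasure can also be a scorer object rather than
one of the metric strings hard-coded in the loop above (the 'f-' + beta
branch). A sketch of how such an f-beta scorer could be built with
scikit-learn's public API; the beta value here is illustrative, not taken
from this project:

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

features, labels = load_breast_cancer(return_X_y=True)

# A scorer object drops in anywhere a metric string such as 'f1' is accepted;
# beta=0.5 weights precision more heavily than recall
fbeta_scorer = make_scorer(fbeta_score, beta=0.5)
scores = cross_val_score(DecisionTreeClassifier(random_state=0), features,
                         y=labels, scoring=fbeta_scorer, cv=10)
print(scores.mean())
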
Example #3
def oldtrial(options):
    ''' Run a single machine learning trial.''' 
    # TODO: make option for loading intermediate data to skip steps that have
    # been done in previous trials

    # Select data to read
    data = mldata.load_data(options.database)

    # Get a sample
    if options.numsamples is not None:  # Check to see if a sample was requested
        # Only pass a malware fraction if one was specified
        if options.malfrac is not None:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples, options.malfrac[0])
        else:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples)
    else:
        sample = data

    # If specified, output the current database
    if options.exportdb is not None:
        mldata.save_data(sample, options.exportdb)

    # Original way to run a trial... probably going to be deleted eventually
    if options.acc:
        return oldacc(options, sample)

    # Primary way to run a trial
    else:
        printparams(options)
        mlstat.print_results_header()
        perfmeasures = ['accuracy', 'precision', 'recall', 'f1']
        avgs = []
        for perfmeasure in perfmeasures:
            # Extract the parts of the samples
            # Not yet using the filenames and feature names
            features, labels, _, featnames = mldata.data_components(sample)

            # Split the sample into 10 randomly stratified folds
            cvsplits = cross_validation.StratifiedShuffleSplit(labels,
                        test_size=0.1, random_state=options.seed)

            # Score the folds
            est = mlalgos.get_estimator(options.algorithm)
            scores = cross_validation.cross_val_score(est, features, y=labels,
                        scoring=perfmeasure, cv=cvsplits)

            # Print the results
            avgs.append(scores.mean())
            avgstr = '{:.4}'.format(avgs[-1]).rjust(7)
            resultstr = '{}  {} '.format(perfmeasure.rjust(9), avgstr)
            for score in scores:
                resultstr += ' {:.3}'.format(score)
            print(resultstr)

            # Icing on the cake: draw a decision tree graph
            # based on the fold with the best f1 score
            if (perfmeasure == 'f1' and options.graphfile is not None and
                    isinstance(est, tree.DecisionTreeClassifier)):
                mlalgos.dt_graph(est, cvsplits, scores, features, labels,
                                 featnames, options.graphfile)

        return (perfmeasures, avgs)
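
mlalgos.dt_graph is a project helper whose implementation is not shown here;
from its arguments it presumably refits the tree on the best-scoring fold and
writes a graph to options.graphfile. A minimal sketch of just the
graph-writing step using scikit-learn's public export_graphviz (the
best-fold selection is omitted, and the output path is illustrative):

from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier, export_graphviz

data = load_breast_cancer()
est = DecisionTreeClassifier(random_state=0).fit(data.data, data.target)

# Writes a Graphviz .dot description of the fitted tree; render it with,
# e.g., `dot -Tpdf tree.dot -o tree.pdf`
export_graphviz(est, out_file='tree.dot',
                feature_names=list(data.feature_names))
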