Exemple #1
0
def sometrials(options):
    ''' Runs machine learning trials over a series of train/test splits.

    Loads the database named by options.database, optionally re-exports it,
    generates 30 seeded train/test splits and scores those splits once per
    performance measure (accuracy, precision, recall and an F-beta scorer
    built from options.beta).

    Parameters:
        options: the run options. The attributes read here are database,
            exportdb, seed, numsamples, malfrac and beta; the whole object
            is also passed on to printparams() and score_on_splits().

    Returns:
        None. Output is produced by the helpers called from here.
    '''

    # Load the data
    data = mldata.load_data(options.database)

    # Preprocess data
    # TODO: fill in this part

    # If specified, output the current database
    if options.exportdb is not None:
        mldata.save_data(data, options.exportdb)

    # Extract the basic data from the data
    features, labels, _, featnames = mldata.data_components(data)

    # Get the seeds for the splits.
    numsplits = 30  # Make this an option later, if need be.
    seeds = mldata.gen_seeds(options.seed, numsplits)

    # Generate the splits
    # For now, training data will compose 90% of each split.
    # Possibly make this an option later.
    # The test size is whatever remains, so the two sizes always add up to
    # options.numsamples regardless of how the 90% share rounds.
    train_size = int(round(0.9 * options.numsamples))
    tr_te_sizes = [train_size, options.numsamples - train_size]
    splits = mldata.gen_splits(seeds, labels, tr_te_sizes, options.malfrac)

    # Start printing the results
    printparams(options)
    mlstat.print_results_header()

    # make the fbeta scoring object
    scorer = make_scorer(fbeta_score, beta=options.beta)

    # Fit and score based on the various performance measures
    perfmeasures = ['accuracy', 'precision', 'recall', scorer]
    for perfmeasure in perfmeasures:
        score_on_splits(perfmeasure, options, features, labels, featnames,
                        splits)
Exemple #2
0
def oldtrial(options):
    ''' Run a single machine learning trial.

    Loads the database named by options.database, optionally draws a sample
    from it, optionally exports that sample, and then either delegates to
    the legacy oldacc() path or cross-validates the estimator selected by
    options.algorithm on accuracy, precision, recall and f1, printing one
    result line per measure.

    Parameters:
        options: the run options. The attributes read here are database,
            numsamples, malfrac, seed, exportdb, acc, algorithm and
            graphfile; the whole object is also passed to printparams()
            and oldacc().

    Returns:
        The result of oldacc(options, sample) when options.acc is truthy.
        Otherwise a tuple (perfmeasures, avgs): the list of measure names
        and the matching list of mean scores, in the same order.
    '''
    # TODO: make option for loading intermediate data to skip steps that have
    # been done in previous trials

    # Select data to read
    data = mldata.load_data(options.database)

    # Get a sample (the whole data set when no sample size was requested)
    # NOTE(review): select_sample receives int(options.seed) while the
    # splitter below receives options.seed unconverted -- confirm both are
    # meant to accept the same seed type.
    if options.numsamples is None:
        sample = data
    elif options.malfrac is not None:
        sample = mldata.select_sample(int(options.seed), data,
                                      options.numsamples, options.malfrac[0])
    else:  # Only use a percent malware if one was specified
        sample = mldata.select_sample(int(options.seed), data,
                                      options.numsamples)

    # If specified, output the current database
    if options.exportdb is not None:
        mldata.save_data(sample, options.exportdb)

    # Original way to run a trial... probably going to be deleted eventually
    if options.acc:
        return oldacc(options, sample)

    # Primary way to run a trial
    printparams(options)
    mlstat.print_results_header()

    # Extract the parts of the sample once; the sample does not change
    # between performance measures.
    # Not yet using the filenames
    features, labels, _, featnames = mldata.data_components(sample)

    perfmeasures = ['accuracy', 'precision', 'recall', 'f1']
    avgs = []
    for perfmeasure in perfmeasures:
        # Build randomly shuffled, stratified splits that hold out 10% of
        # the sample for testing. A new splitter is created for every
        # measure, exactly as before.
        cvsplits = cross_validation.StratifiedShuffleSplit(
            labels, test_size=0.1, random_state=options.seed)

        # Score the folds
        est = mlalgos.get_estimator(options.algorithm)
        scores = cross_validation.cross_val_score(
            est, features, y=labels, scoring=perfmeasure, cv=cvsplits)

        # Print the results: measure name, mean score, then every fold score
        avg = sum(scores) / len(scores)
        avgs.append(avg)
        avgstr = '{:.4}'.format(avg).rjust(7)
        foldstr = ''.join(' {:.3}'.format(score) for score in scores)
        print('{}  {} '.format(perfmeasure.rjust(9), avgstr) + foldstr)

        # Icing on the cake: draw a decision tree graph
        # based on the fold with the best f1 score
        wants_graph = (perfmeasure == 'f1'
                       and options.graphfile is not None
                       and isinstance(est, tree.DecisionTreeClassifier))
        if wants_graph:
            mlalgos.dt_graph(est, cvsplits, scores, features, labels,
                             featnames, options.graphfile)

    return (perfmeasures, avgs)