Example #1
import argparse

import mldata


def main():
    ''' Test the malprev-splitting functionality by loading a database
    and generating train/test splits from it. '''

    clparse = argparse.ArgumentParser(
        description='Test malprev splitting functionality.')
    clparse.add_argument("db", type=argparse.FileType('r'),
        help='.csv containing the database to test with')
    clparse.add_argument("outdir", help='directory to output to.')
    args = clparse.parse_args()

    data = mldata.load_data(args.db)
    feat, lab, _, _ = mldata.data_components(data)

    # Derive three RNG seeds from the master seed 42, one per split.
    seeds = mldata.gen_seeds(42, 3)

    # Split the data twice. This is a proof of concept, so hardcoding
    # the numsamples and the malprevs here is fine.
    splits = mldata.gen_splits(seeds, lab, [9000,1000], [0.5, 0.1])

    # This parallels how the iteration works in cross_validation.cross_val_score.
    # handle_idx is defined elsewhere in this file.
    for cnt, (tr_idx, te_idx) in enumerate(splits):
        # Training data
        handle_idx(tr_idx, data, args, 'tr{}'.format(cnt))
        # Test data
        handle_idx(te_idx, data, args, 'te{}'.format(cnt))
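
The mldata.gen_splits call is the interesting part here, and its implementation is not shown in this excerpt. As a rough illustration only, here is a minimal sketch of what a prevalence-controlled splitter like it might look like; every name and behavior below is an assumption rather than mldata's actual API, and it assumes binary labels with 1 marking a malicious sample:

import numpy as np

def gen_splits_sketch(seeds, labels, sizes, malprevs):
    ''' Hypothetical stand-in for mldata.gen_splits: yield one
    (train_idx, test_idx) pair per seed, with each side subsampled to a
    fixed size and malware prevalence, and the two sides disjoint. '''
    labels = np.asarray(labels)
    for seed in seeds:
        rng = np.random.RandomState(seed)
        pos = set(np.flatnonzero(labels == 1))  # malicious rows
        neg = set(np.flatnonzero(labels == 0))  # benign rows
        sides = []
        for size, prev in zip(sizes, malprevs):
            n_pos = int(round(size * prev))
            take_pos = rng.choice(sorted(pos), n_pos, replace=False)
            take_neg = rng.choice(sorted(neg), size - n_pos, replace=False)
            pos -= set(take_pos)  # keep train and test disjoint
            neg -= set(take_neg)
            side = np.concatenate([take_pos, take_neg])
            rng.shuffle(side)
            sides.append(side)
        yield tuple(sides)

With sizes of [9000, 1000] and malprevs of [0.5, 0.1] as in the call above, each training side would hold 4500 malicious rows out of 9000 and each test side 100 out of 1000.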
Example #2
from sklearn.metrics import fbeta_score, make_scorer

import mldata
import mlstat


def sometrials(options):
    ''' Runs a single machine learning trial. '''
    
    # Load the data
    data = mldata.load_data(options.database)

    # Preprocess data
    # TODO: fill in this part

    # If specified, output the current database
    if options.exportdb is not None:
        mldata.save_data(data, options.exportdb)

    # Extract the basic data from the data
    features, labels, _, featnames = mldata.data_components(data)

    # Get the seeds for the splits.
    numsplits = 30 # Make this an option later, if need be.
    seeds = mldata.gen_seeds(options.seed, numsplits)

    # Generate the splits
    # For now, training data will make up 90% of each split.
    # Possibly make this an option later.
    tr_size = int(round(0.9 * options.numsamples))
    tr_te_sizes = [tr_size, options.numsamples - tr_size]
    splits = mldata.gen_splits(seeds, labels, tr_te_sizes, options.malfrac)

    # Start printing the results
    printparams(options)
    mlstat.print_results_header()

    # Make the F-beta scoring object
    scorer = make_scorer(fbeta_score, beta=options.beta)

    # Fit and score based on the various performance measures
    perfmeasures = ['accuracy', 'precision', 'recall', scorer]
    for perfmeasure in perfmeasures:
        score_on_splits(perfmeasure, options, features, labels, featnames, splits)

    return
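
score_on_splits is also defined outside this excerpt. One plausible way to write it, sketched here under the assumption that scikit-learn does the fitting and an arbitrary RandomForestClassifier stands in for whatever model the real code uses (note that in current scikit-learn, cross_val_score lives in sklearn.model_selection rather than the old cross_validation module):

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def score_on_splits_sketch(perfmeasure, options, features, labels,
                           featnames, splits):
    ''' Hypothetical stand-in for score_on_splits: fit and score one
    classifier per (train, test) split, then print summary statistics. '''
    clf = RandomForestClassifier(random_state=options.seed)
    # cv accepts any iterable of (train_idx, test_idx) index pairs, so
    # the precomputed splits pass straight through. If splits is a
    # generator, materialize it (e.g. splits = list(splits)) before
    # reusing it across several performance measures, as sometrials does.
    scores = cross_val_score(clf, features, labels,
                             scoring=perfmeasure, cv=list(splits))
    print('{}: mean={:.3f}, std={:.3f}'.format(
        perfmeasure, np.mean(scores), np.std(scores)))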