def score_on_splits(perfmeasure, options, features, labels, featnames, splits):
    ''' Actually do the fitting and evaluating.
        perfmeasure = scoring mechanism, e.g. 'accuracy', 'precision',
                      'recall', 'f1' (this can be a scoring object or a string)
        options     = the command line arguments
        features    = the data records
        labels      = the data labels
        featnames   = names of the features of each record, from
                      mldata.data_components
        splits      = the result of, say, mldata.gen_splits
        returns (average of scores, standard deviation of scores, the scores)
    '''
    # Score the splits
    est = mlalgos.get_estimator(options.algorithm, seed=options.seed)
    scores = cross_validation.cross_val_score(est, features, y=labels,
                                              scoring=perfmeasure, cv=splits)

    # Print the results: use the measure name if it is a string, otherwise
    # assume an f-beta scorer and label it with the configured beta
    if isinstance(perfmeasure, basestring):
        metric = perfmeasure
    else:
        metric = 'f-' + str(options.beta)
    mlstat.printresults(metric, scores)

    # Icing on the cake: draw a decision tree graph
    # based on the fold with the best f1 score
    if (perfmeasure == 'f1' and options.graphfile is not None and
            isinstance(est, tree.DecisionTreeClassifier)):
        mlalgos.dt_graph(est, splits, scores, features, labels,
                         featnames, options.graphfile)

    return (scores.mean(), scores.std(), scores)
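
# A minimal usage sketch (an assumption, not part of the original module):
# given an `options` namespace with .algorithm, .seed, .beta, and .graphfile
# attributes and a `sample` loaded via mldata, score_on_splits() could be
# driven like this:
#
#   features, labels, _, featnames = mldata.data_components(sample)
#   splits = cross_validation.StratifiedShuffleSplit(labels, test_size=0.1,
#                                                    random_state=options.seed)
#   avg, std, scores = score_on_splits('f1', options, features, labels,
#                                      featnames, splits)
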

def atrial(options):
    ''' Run a single machine learning trial. '''
    # TODO: make option for loading intermediate data to skip steps that have
    # been done in previous trials

    # Select data to read
    data = mldata.load_data(options.database)

    # Get a sample, if one was requested
    if options.numsamples is not None:
        # Only use a malware fraction if one was specified
        if options.malfrac is not None:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples, options.malfrac)
        else:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples)
    else:
        sample = data

    # Preprocess data
    # TODO: fill in this part

    # If specified, output the current database
    if options.newdb is not None:
        mldata.save_data(sample, options.newdb)

    # Original way to run a trial... probably going to be deleted eventually
    if options.simplyAcc:
        return oldacc(options, sample)
    # Primary way to run a trial
    else:
        printparams(options)
        print('  Measure Average  Fold-Scores')
        perfmeasures = ['accuracy', 'precision', 'recall', 'f1']
        avgs = []
        for perfmeasure in perfmeasures:
            # Extract the parts of the sample (not yet using the filenames)
            features, labels, _, featnames = mldata.data_components(sample)

            # Split the sample into 10 randomly stratified folds
            cvsplits = cross_validation.StratifiedShuffleSplit(labels,
                test_size=0.1, random_state=options.seed)

            # Score the folds
            est = mlalgos.get_estimator(options.algorithm)
            scores = cross_validation.cross_val_score(est, features, y=labels,
                                                      scoring=perfmeasure,
                                                      cv=cvsplits)

            # Print the results
            avgs.append(sum(scores) / len(scores))
            avgstr = '{:.4}'.format(avgs[-1]).rjust(7)
            resultstr = '{} {} '.format(perfmeasure.rjust(9), avgstr)
            for score in scores:
                resultstr += ' {:.3}'.format(score)
            print(resultstr)

            # Icing on the cake: draw a decision tree graph
            # based on the fold with the best f1 score
            if (perfmeasure == 'f1' and options.graphfile is not None and
                    isinstance(est, tree.DecisionTreeClassifier)):
                mlalgos.dt_graph(est, cvsplits, scores, features, labels,
                                 featnames, options.graphfile)

        return (perfmeasures, avgs)

def oldtrial(options):
    ''' Run a single machine learning trial. '''
    # TODO: make option for loading intermediate data to skip steps that have
    # been done in previous trials

    # Select data to read
    data = mldata.load_data(options.database)

    # Get a sample, if one was requested
    if options.numsamples is not None:
        # Only use a malware fraction if one was specified
        if options.malfrac is not None:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples,
                                          options.malfrac[0])
        else:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples)
    else:
        sample = data

    # If specified, output the current database
    if options.exportdb is not None:
        mldata.save_data(sample, options.exportdb)

    # Original way to run a trial... probably going to be deleted eventually
    if options.acc:
        return oldacc(options, sample)
    # Primary way to run a trial
    else:
        printparams(options)
        mlstat.print_results_header()
        perfmeasures = ['accuracy', 'precision', 'recall', 'f1']
        avgs = []
        for perfmeasure in perfmeasures:
            # Extract the parts of the sample (not yet using the filenames)
            features, labels, _, featnames = mldata.data_components(sample)

            # Split the sample into 10 randomly stratified folds
            cvsplits = cross_validation.StratifiedShuffleSplit(labels,
                test_size=0.1, random_state=options.seed)

            # Score the folds
            est = mlalgos.get_estimator(options.algorithm)
            scores = cross_validation.cross_val_score(est, features, y=labels,
                                                      scoring=perfmeasure,
                                                      cv=cvsplits)

            # Print the results
            avgs.append(sum(scores) / len(scores))
            avgstr = '{:.4}'.format(avgs[-1]).rjust(7)
            resultstr = '{} {} '.format(perfmeasure.rjust(9), avgstr)
            for score in scores:
                resultstr += ' {:.3}'.format(score)
            print(resultstr)

            # Icing on the cake: draw a decision tree graph
            # based on the fold with the best f1 score
            if (perfmeasure == 'f1' and options.graphfile is not None and
                    isinstance(est, tree.DecisionTreeClassifier)):
                mlalgos.dt_graph(est, cvsplits, scores, features, labels,
                                 featnames, options.graphfile)

        return (perfmeasures, avgs)
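
# A possible consolidation sketch (an assumption, not code from the original
# module): the per-measure loop in atrial()/oldtrial() repeats the
# fit-and-score logic that score_on_splits() above already encapsulates, so
# the loop body could be reduced to something like:
#
#   for perfmeasure in ['accuracy', 'precision', 'recall', 'f1']:
#       avg, _, _ = score_on_splits(perfmeasure, options, features, labels,
#                                   featnames, cvsplits)
#       avgs.append(avg)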