Example 1
# Assumed imports: sys and pprint are stdlib; mldata, mlalgos, mlstat,
# and printparams are project-local helper modules.
import sys
from pprint import pprint

def oldacc(options, sample):
    ''' The simpleAcc way of running a trial. '''
    # Get the final components from the data
    print("Building train and test data...")

    # Partition 80-20 into training and testing sets
    trainsize = int(0.8 * len(sample))  # slice indices must be integers
    train = sample[:trainsize]
    test = sample[trainsize:]
    trfeat, trlab, _, _ = mldata.data_components(train)
    tefeat, telab, _, _ = mldata.data_components(test)

    # Check for a valid learning algorithm
    if not mlalgos.validate_algo(options.algorithm):
        print("Invalid learning algorithm %s" % options.algorithm)
        sys.exit(1)

    # Train
    print("Training...")
    model = mlalgos.learn(options.algorithm, trfeat, trlab, options.seed)

    # Test
    print("Testing...")
    preds = mlalgos.predict(model, tefeat)

    # Analyze -- FScore, acc, learning curves
    print("Results:")
    printparams(options)  # CL options

    # Model parameters
    pprint(vars(model))

    accuracy = mlstat.acc(preds, telab)
    print("Accuracy: %f" % (accuracy))

    return (['accuracy'], [accuracy])
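
mldata, mlalgos, and mlstat are project-local, so the snippet above is not
runnable on its own. As a rough illustration of the same 80-20 holdout flow,
here is a minimal self-contained sketch that uses scikit-learn stand-ins for
the dataset, estimator, and accuracy metric; the stand-ins are assumptions,
not the original pipeline:

from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

features, labels = load_breast_cancer(return_X_y=True)

# Same fixed, unshuffled 80-20 split as oldacc
trainsize = int(0.8 * len(features))
trfeat, tefeat = features[:trainsize], features[trainsize:]
trlab, telab = labels[:trainsize], labels[trainsize:]

model = DecisionTreeClassifier(random_state=0).fit(trfeat, trlab)
preds = model.predict(tefeat)
print("Accuracy: %f" % accuracy_score(telab, preds))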
Example 2
# Assumed imports: cross_validation and tree come from legacy scikit-learn
# (pre-0.18); mldata, mlalgos, and printparams are project-local helpers.
from sklearn import cross_validation, tree

def atrial(options):
    ''' Run a single machine learning trial. '''
    # TODO: make option for loading intermediate data to skip steps that have
    # been done in previous trials

    # Select data to read
    data = mldata.load_data(options.database)

    # Get a sample
    if options.numsamples is not None:  # A sample was requested
        # Pass the malware fraction along only if one was specified
        if options.malfrac is not None:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples, options.malfrac)
        else:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples)
    else:
        sample = data

    # Preprocess data
    # TODO: fill in this part

    # If specified, write the current sample out as a new database
    if options.newdb is not None:
        mldata.save_data(sample, options.newdb)

    # Original way to run a trial... probably going to be deleted eventually
    if options.simplyAcc:
        return oldacc(options, sample)

    # Primary way to run a trial
    else:
        printparams(options)
        print('  Measure  Average  Fold-Scores')
        perfmeasures = ['accuracy', 'precision', 'recall', 'f1']
        avgs = []

        # Extract the parts of the sample once; this is loop-invariant
        # (the filenames component is not yet used)
        features, labels, _, featnames = mldata.data_components(sample)

        # Make 10 stratified random 90/10 train/test splits
        # (n_iter defaults to 10 in the legacy StratifiedShuffleSplit)
        cvsplits = cross_validation.StratifiedShuffleSplit(
            labels, test_size=0.1, random_state=options.seed)
        est = mlalgos.get_estimator(options.algorithm)

        for perfmeasure in perfmeasures:
            # Score the splits on the current measure
            scores = cross_validation.cross_val_score(
                est, features, y=labels, scoring=perfmeasure, cv=cvsplits)

            # Print the results
            avgs.append(sum(scores) / len(scores))
            avgstr = '{:.4}'.format(avgs[-1]).rjust(7)
            resultstr = '{}  {} '.format(perfmeasure.rjust(9), avgstr)
            for score in scores:
                resultstr += ' {:.3}'.format(score)
            print(resultstr)

            # Icing on the cake: draw a decision tree graph
            # based on the split with the best f1 score
            if (perfmeasure == 'f1' and options.graphfile is not None
                    and isinstance(est, tree.DecisionTreeClassifier)):
                mlalgos.dt_graph(est, cvsplits, scores, features, labels,
                                 featnames, options.graphfile)

        return (perfmeasures, avgs)
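
sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
0.20. If you are adapting this trial loop to a current release, the scoring
core maps onto sklearn.model_selection roughly as in the sketch below; the
dataset and estimator are stand-ins for the project's mldata/mlalgos helpers,
not the original pipeline:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.tree import DecisionTreeClassifier

features, labels = load_breast_cancer(return_X_y=True)
est = DecisionTreeClassifier(random_state=0)

# Modern splitter: labels are no longer passed at construction time,
# and the number of iterations is n_splits rather than n_iter
cvsplits = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

print('  Measure  Average')
for perfmeasure in ['accuracy', 'precision', 'recall', 'f1']:
    scores = cross_val_score(est, features, y=labels,
                             scoring=perfmeasure, cv=cvsplits)
    print('{}  {:.4}'.format(perfmeasure.rjust(9), scores.mean()))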