import os

import numpy as np
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier


def controller():
    nFeatures = [10, 25, 50, 70, 80, 90, 100, 110, 120, 130, 140, 150, 175, 200, 500, 1000, 5000]
    nSamples = 1500
    classifiers = [GaussianNB(), BernoulliNB(), LinearSVC(), DecisionTreeClassifier(), LogisticRegression(), SGDClassifier()]
    # scoreFunc = [None, chi2, f_classif]

    path = os.path.join(os.path.expanduser('~'), 'OneDrive\\RPI\\Summer Project\\URL Classifier\\Data\\URL Files')
    mal_samples = dataset.get_samples(os.path.join(path, 'mal_urls.csv'), amount=nSamples)
    ben_samples = dataset.get_samples(os.path.join(path, 'ben_urls.csv'), amount=nSamples)
    features = get_features('../Dataset/Vocab/ranked_words.csv', amount=None)
    nFeatures.append(len(features))
    accuracies = np.zeros((len(classifiers), len(nFeatures)))

    for col, nf in enumerate(nFeatures):
        X_mal, y_mal = create_testing_dataset(mal_samples, features[0:nf], 1)
        X_ben, y_ben = create_testing_dataset(ben_samples, features[0:nf], 0)
        X = np.concatenate((X_mal, X_ben), axis=0)
        y = np.concatenate((y_mal, y_ben))

        for row, clf in enumerate(classifiers):
            accuracy = get_accuracy(X, y, 10, clf) * 100
            accuracies[row, col] = accuracy

    print(accuracies)
    plot(accuracies, nFeatures, classifiers)
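
The helpers dataset.get_samples, get_features, create_testing_dataset, get_accuracy, and plot are defined elsewhere in the project and are not shown here. As a rough sketch only, assuming that the third argument of get_accuracy is the number of cross-validation folds and that the return value is a mean accuracy in [0, 1] (which the caller scales to a percentage), it could look like this with scikit-learn:

# Hypothetical sketch of get_accuracy; not the project's actual implementation.
import numpy as np
from sklearn.model_selection import cross_val_score

def get_accuracy(X, y, n_folds, clf):
    # Mean accuracy of clf over an n_folds-fold cross-validation.
    scores = cross_val_score(clf, X, y, cv=n_folds, scoring='accuracy')
    return np.mean(scores)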
Example 2
def evaluate_learner_with_holdout(learner, num_labels, test_sets):
    '''
    If you're not considering a "finite pool" problem, this is the correct way to evaluate the trained classifiers.

    @params
    learner -- the learner to be evaluated
    num_labels -- how many labels have been provided to the learner thus far
    test_sets -- the set(s) of examples to be used for evaluation. If there are multiple sets, they are assumed
                 to correspond to multiple feature spaces and therefore need to be combined somehow. The 'predict'
                 method in the learner class(es) handles this; see that method in, e.g., base_learner, for more.
    '''
    results = {"size": num_labels}
    print "evaluating learner over %s instances." % len(
        learner.unlabeled_datasets[0].instances)
    fns = 0
    predictions = []
    point_sets = [dataset.get_samples() for dataset in test_sets]
    # the labels are assumed to be the same; thus we only use the labels for the first dataset
    true_labels = test_sets[0].get_labels()

    # loop over all of the examples, and feed to the predict method
    # the corresponding point in each feature-space
    for example_index in range(len(point_sets[0])):
        # hand the predict method a list of representations of x; one per feature space/model
        prediction = learner.predict([
            point_sets[feature_space_index][example_index]
            for feature_space_index in range(len(point_sets))
        ])
        predictions.append(prediction)

    conf_mat = _evaluate_predictions(predictions, true_labels)
    _calculate_metrics(conf_mat, results)
    return results
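
The module-private helpers _evaluate_predictions and _calculate_metrics are not shown in this snippet. For orientation only, and assuming binary labels with 1 marking the positive class, a minimal sketch of the pair could look like the following; the real implementation may compute additional metrics:

# Hypothetical sketches; not the actual helpers from this codebase.
def _evaluate_predictions(predictions, true_labels):
    # Tally a 2x2 confusion matrix as a dict keyed by tp/fp/tn/fn.
    conf_mat = {"tp": 0, "fp": 0, "tn": 0, "fn": 0}
    for predicted, actual in zip(predictions, true_labels):
        if actual > 0:
            conf_mat["tp" if predicted > 0 else "fn"] += 1
        else:
            conf_mat["fp" if predicted > 0 else "tn"] += 1
    return conf_mat

def _calculate_metrics(conf_mat, results):
    # Fill the results dict in place with accuracy and sensitivity (recall).
    total = float(sum(conf_mat.values()))
    positives = float(conf_mat["tp"] + conf_mat["fn"])
    results["accuracy"] = (conf_mat["tp"] + conf_mat["tn"]) / total if total else 0.0
    results["sensitivity"] = conf_mat["tp"] / positives if positives else 0.0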