Example #1
def select_features(feature_list, my_dataset, k):
    '''
    Select k number of features based on SelectKBest function and
    StratifiedShuffleSplit

    feature_list = list of strings representing feature names
    my_dataset = dataset containing all features and labels
    k = number of desired features
    '''
    import numpy as np
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.feature_selection import SelectKBest
    # featureFormat and targetFeatureSplit are assumed to come from the
    # project's feature_format helper module
    from feature_format import featureFormat, targetFeatureSplit

    # Create feature and label arrays from the dataset
    data = featureFormat(my_dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # create sss with 1000 splits
    sss = StratifiedShuffleSplit(n_splits=1000, random_state=42)
    feature_scores = {}

    # Create 1000 different sets of training and testing samples
    for train, test in sss.split(features, labels):
        features_train = [features[i] for i in train]
        labels_train = [labels[i] for i in train]

        # fit the selectkbest function on each set of training data
        selector = SelectKBest(k=k)
        selector.fit(features_train, labels_train)

        # Get the indices, scores, and p-values of the selected features;
        # the +1 offset skips the label, which is the first entry in feature_list
        feature_indices = selector.get_support(indices=True)
        sel_features = [(feature_list[i + 1], selector.scores_[i],
                         selector.pvalues_[i]) for i in feature_indices]

        # Gather the scores and pvalue of each feature from each split
        for feat, score, pval in sel_features:
            if feat not in feature_scores:
                feature_scores[feat] = {"scores": [], "pvalue": []}
            feature_scores[feat]['scores'].append(score)
            feature_scores[feat]['pvalue'].append(pval)

    # Get average score and pvalue of each feature
    feature_scores_l = []
    for feat in feature_scores:
        feature_scores_l.append((feat, np.mean(feature_scores[feat]['scores']),
                                 np.mean(feature_scores[feat]['pvalue'])))

    # Sort features by average score, highest first
    sorted_feature_scores = sorted(feature_scores_l,
                                   key=lambda pair: pair[1],
                                   reverse=True)
    sorted_feature_scores_str = [
        "{}: {} {}".format(z[0], z[1], z[2]) for z in sorted_feature_scores
    ]

    print "feature: score, p-value"
    for line in sorted_feature_scores_str:
        print line
    return
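For reference, here is a minimal sketch of how this helper might be invoked. The dataset filename, the candidate feature names, and the convention that the label comes first in the list follow the usual Udacity final-project layout and are assumptions here, not part of the original example.

import pickle

# Load the project dataset (assumed filename from the Udacity project layout)
with open("final_project_dataset.pkl", "rb") as data_file:
    my_dataset = pickle.load(data_file)

# Candidate features; the label ('poi') must be the first entry (assumed names)
candidate_features = ['poi', 'salary', 'bonus', 'total_stock_value',
                      'exercised_stock_options']

# Print the average SelectKBest score and p-value of the top 3 features
select_features(candidate_features, my_dataset, k=3)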
Example #2
# loop through each classifier and capture evaluation metrics
for c in clf_list:
    clf = c
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    #print pred
    print "Accuracy is ", clf.score(features_test, labels_test)
    print "precision = ", precision_score(labels_test, pred)
    print "recall = ", recall_score(labels_test, pred)
    recall_list.append(recall_score(labels_test, pred))
    print "\nRunning Stratisfied Shuffle Split cross validation to compare recall\n"
    print "printing mean of Stratisfied Shuffle Split"
    print "mean = ", cross_val_score(clf,
                                     features,
                                     labels,
                                     cv=cv.split(features, labels),
                                     scoring='recall').mean()
    mean_recall_list.append(
        cross_val_score(clf,
                        features,
                        labels,
                        cv=cv.split(features, labels),
                        scoring='recall').mean())
    print "\n"
    dump_classifier_and_data(clf, my_dataset, features_list)
    main()
print "printing summary of recall score from Classifiers, \n", recall_list
#print "printing summary of accuracy scores from Classifiers, \n", accuracy_list
print "printing summary of mean recall scores from Stratisfied Shuffle Split CV, \n", mean_recall_list

# Since I didn't have a particular algorithm in mind, I chose to iterate through several classifiers and evaluate their metrics without any parameter tuning, to get a baseline of how each classifier performs. This baseline will help me choose which classifier to focus my parameter tuning on. While iterating, I captured accuracy, precision, and recall on a single train/test split, as well as the mean recall from Stratified Shuffle Split cross-validation, so the two could be compared. I felt Stratified Shuffle Split cross-validation was necessary because of the imbalance between POIs and non-POIs, and to ensure every data point appears in both a training and a testing fold. Additionally, I dumped the classifier, dataset, and feature list in each iteration so I could call tester.py, and I recorded the scores that tester.py reports for each classifier. After iterating through each classifier, I observed the following metrics: