Example #1
def GaussianNB(feature_list, dataset):
    from sklearn.naive_bayes import GaussianNB

    clf = GaussianNB()
    test_classifier(clf, dataset, feature_list)
    #score = clf.
    return clf
def RandomForest(feature_list, dataset):
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier()
    test_classifier(clf, dataset, feature_list)
    imp = clf.feature_importances_
    print_importance(feature_list, imp)
    return clf
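print_importance is called above but not defined in this snippet. A minimal sketch of such a helper, assuming it simply pairs each feature name (skipping the leading 'poi' label) with its importance and prints them in descending order:

def print_importance(feature_list, importances):
    # feature_list[0] is the 'poi' label, so pair the remaining names with the importances
    pairs = zip(feature_list[1:], importances)
    for name, imp in sorted(pairs, key=lambda p: p[1], reverse=True):
        print("%s: %.4f" % (name, imp))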
Example #3
def tune_classifier(classifier, clf_params, max_features):
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    features_list = get_feature_list()

    ### Create new feature(s)
    ### Store to my_dataset for easy export below.
    my_dataset = get_data()

    ### Extract features and labels from dataset for local testing
    features_list = features_list[0:max_features+1]
    data, labels, features = get_features_and_labels(my_dataset, features_list)

    ### Tune your classifier to achieve better than .3 precision and recall
    ### using our testing script. Check the tester.py script in the final project
    ### folder for details on the evaluation method, especially the test_classifier
    ### function. Because of the small size of the dataset, the script uses
    ### stratified shuffle split cross validation. For more info:
    ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
    from sklearn.metrics import f1_score, make_scorer
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42)

    # Testing
    clf = GridSearchCV(classifier, param_grid=clf_params, scoring=make_scorer(f1_score))
    clf.fit(features_train, labels_train)
    clf_final = clf.best_estimator_
    print "The best estimator = ", clf_final
    test_classifier(clf_final, my_dataset, features_list, 1000)
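The comments above point out that tester.py evaluates with stratified shuffle split, yet the grid search here is fit on a single train_test_split. A hedged sketch of how the same search could instead be cross-validated with StratifiedShuffleSplit (the old sklearn.cross_validation API used throughout these snippets); tune_with_sss is an illustrative name, not part of the original:

from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

def tune_with_sss(classifier, clf_params, features, labels):
    # Stratified shuffle split keeps the POI/non-POI ratio in every fold,
    # which matters because positive examples are rare in this dataset.
    cv = StratifiedShuffleSplit(labels, n_iter=100, test_size=0.3, random_state=42)
    grid = GridSearchCV(classifier, param_grid=clf_params,
                        scoring=make_scorer(f1_score), cv=cv)
    grid.fit(features, labels)
    return grid.best_estimator_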
Example #4
def decisionTree(feature_list, dataset):
    from sklearn import tree

    clf = tree.DecisionTreeClassifier()
    test_classifier(clf, dataset, feature_list)
    print clf.feature_importances_
    return clf
def iterPipe(num1, num2):
    for i in range(num1, num2 + 1):
        # estimators = [('scaling', StandardScaler()),('reduce_dim', PCA()), ('dtc', DTC(min_samples_split=i*2))]
        # estimators = [('reduce_dim', PCA(n_components=2)), ('dtc', DTC(min_samples_split=i))]
        # clfIter = Pipeline(estimators)
        # clfIter.set_params(reduce_dim__n_components=3)
        clfIter = DTC(min_samples_split=i)
        test_classifier(clfIter, my_dataset, features_list)
def KNN(feature_list, dataset):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    knn = KNeighborsClassifier()
    # scale features first, since KNN is distance-based
    estimators = [('scale', StandardScaler()), ('knn', knn)]
    clf = Pipeline(estimators)
    test_classifier(clf, dataset, feature_list)
Example #7
def setup_and_test(my_dataset, features_list, classifier):
    # Dump classifier and features list, so we can test them
    dump_classifier_and_data(classifier, my_dataset, features_list)

    # load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    # Run testing script
    test_classifier(clf, dataset, feature_list)

    return
def tuneDT(feature_list, dataset):
    from sklearn.grid_search import GridSearchCV
    from sklearn import tree
    tree_clf = tree.DecisionTreeClassifier()
    parameters = {'criterion': ('gini', 'entropy'),
                  'splitter': ('best', 'random')}
    clf = GridSearchCV(tree_clf, parameters, scoring='recall')
    test_classifier(clf, dataset, feature_list)
    print '###best_params'
    print clf.best_params_
Example #9
def detect_poi():
### Load the dictionary containing the dataset
    data_dict = pickle.load(open("final_project_dataset.pkl", "r") )
### Task 1: Remove outliers
    data_dict.pop('TOTAL',0)    
    
### Task 2: Select which features to use
### 'stk_pay_ratio','to_poi_ratio', 'from_poi_ratio','bonus_salary_ratio'
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
    my_dataset = data_dict
    stk_pay_ratio(my_dataset)
    from_poi_ratio(my_dataset)
    to_poi_ratio(my_dataset)
    bonus_salary_ratio(my_dataset)
     
### Task 3: Feature Selection
### Generate a set of 15 feature lists from these 4 features
### This way, all possible combinations of these features are tested

    all_features_list = fList_set()

### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation in tester.py
    metrics = []    
    clf = GaussianNB()    
### ptest uses Stratified shuffle split cross validation and calculates the precision
### Find the precision for every list
    for i in range(0,15):
        metrics.append(ptest(clf,my_dataset,all_features_list[i]))
### Choose the feature list that produces the best precision.
### For this dataset, precision is the harder metric to improve.
    best = np.array(metrics).argmax()  
    
### Run test_classifier to print evaluation metrics to console
    test_classifier(clf, my_dataset,all_features_list[best])

### Now use the same feature list to run the decision tree classifier
    features_list = all_features_list[best]
### Task 4: Try a variety of classifiers
    samples_split_values = [2,4]
    samples_leaf_values = [1,2]

    for split in samples_split_values:
        for leaf in samples_leaf_values:
            clf = tree.DecisionTreeClassifier(min_samples_split=split,\
            min_samples_leaf=leaf)
            test_classifier(clf, my_dataset, features_list)
            print_feature_importances(features_list, clf)
### Choose the best classifier and feature set
    clf = GaussianNB()   

### Dump classifier, dataset, and features_list
    dump_classifier_and_data(clf, my_dataset, features_list)
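ptest and fList_set above are project-specific helpers that are not shown here. A minimal sketch of a ptest-like scorer, assuming it averages precision over stratified shuffle splits as the comments describe (featureFormat and targetFeatureSplit are the course's feature_format helpers):

import numpy as np
from feature_format import featureFormat, targetFeatureSplit  # course helper module
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import precision_score

def ptest(clf, dataset, feature_list, folds=100):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    features = np.array(features)
    labels = np.array(labels)
    cv = StratifiedShuffleSplit(labels, n_iter=folds, random_state=42)
    scores = []
    for train_idx, test_idx in cv:
        clf.fit(features[train_idx], labels[train_idx])
        pred = clf.predict(features[test_idx])
        scores.append(precision_score(labels[test_idx], pred))
    return np.mean(scores)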
Example #10
def tuneKmeans(feature_list,dataset):
    from sklearn.cluster import KMeans
    from sklearn.grid_search import GridSearchCV
    km_clf = KMeans(n_clusters=2, tol=0.001)

    parameters = {'n_clusters': (2,10)}
    clf = GridSearchCV(km_clf, parameters, scoring='recall')
    test_classifier(clf, dataset, feature_list)
    print '###best_params'
    print clf.best_params_
    return clf.best_estimator_
def explore_scores():
    for n in features:
        for c in n_neighbor:
            for d in weights:
                for e in algorithm:
                    for f in leaf_size:
                        for g in p:
                            for h in metric:
                                feature = 0
                                feature = features_select(n)
                                pipeline = Pipeline([('normalization', scaler), 
                                             ('classifier', KNeighborsClassifier(n_neighbors=c, weights=d, algorithm=e, 
                                                                                 leaf_size=f, p=g, metric=h))])
                                test_classifier(pipeline, enron_data, feature)
def tuneKNN(feature_list, dataset):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.grid_search import GridSearchCV
    knn = KNeighborsClassifier()
    # scale features first, since KNN is distance-based
    estimators = [('scale', StandardScaler()), ('knn', knn)]
    pipeline = Pipeline(estimators)
    parameters = {'knn__n_neighbors': [1, 8],
                  'knn__algorithm': ('ball_tree', 'kd_tree', 'brute', 'auto')}
    clf = GridSearchCV(pipeline, parameters, scoring='recall')
    test_classifier(clf, dataset, feature_list)
    print '###best_params'
    print clf.best_params_
def getRF():

    print "==============="
    print "RandomForests"
    print "==============="

    for score in scores:

        print score
        print

        #parameters = {'n_estimators':range(10, 150, 10), 'criterion':['gini', 'entropy'], 'min_samples_split':range(2, 8, 2)}
        parameters = {'rf__n_estimators':range(10, 150, 10), 'rf__criterion':['gini', 'entropy'], 'rf__min_samples_split':range(2, 8, 2), 
            'selector__k':range(3, 22, 1)}	

        gs = grid_search.GridSearchCV(rf_pipe, parameters, scoring=score, cv=cv)
            
        gs.fit(features, labels)

         #This is the model you pass to tester.py
        clf = gs.best_estimator_

        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "

        best_parameters = gs.best_estimator_.get_params()

        print " "
        print "Best Parameters- by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results  (will print the Grid Search score)
        print "Grid Search Classification report:" 
        print " "
        print classification_report(labels, labels_pred)
        print ' ' 

        # Print Results  (will print the tester.py score)
        print "tester.py Classification report:" 
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def getAda():
		
	print "==============="
	print "AdaBoost"
	print "==============="

	for score in scores:

		print score
		print

		#parameters = {'n_estimators':range(50, 100, 1), 'learning_rate':[x * 0.01 for x in range(100, 160, 1)]}
		parameters = {'ada__n_estimators': range(1, 100, 20), 'ada__learning_rate':[x * 0.01 for x in range(100, 160, 10)],
			'selector__k':range(3, 22, 1)}

		gs = grid_search.GridSearchCV(ada_pipe, parameters, scoring=score, cv=cv)

		gs.fit(features, labels)

		 #This is the model you pass to tester.py
		clf = gs.best_estimator_

		print " "
		print "Optimal Model - by Grid Search"
		print clf
		print " "

		best_parameters = gs.best_estimator_.get_params()

		print " "
		print "Best Parameters- by Grid Search"
		print best_parameters
		print " "

		labels_pred = gs.predict(features)

		# Print Results  (will print the Grid Search score)
		print "Grid Search Classification report:" 
		print " "
		print classification_report(labels, labels_pred)
		print ' ' 

		# Print Results  (will print the tester.py score)
		print "tester.py Classification report:" 
		print " "
		test_classifier(clf, my_dataset, features_list)
		print " "
		print
def getKNN():

    print "==============="
    print "KNeighborsClassifier"
    print "==============="

    for score in scores:

        print score
        print

        #parameters = {'n_neighbors':range(2, 10, 2), 'weights':['distance', 'uniform'], 'metric':['minkowski', 'euclidean']}
        parameters = {'knn__n_neighbors': range(2, 10, 2), 'knn__weights':['distance', 'uniform'], 'knn__metric':['minkowski', 'euclidean'], 
            'selector__k':range(3, 20, 1)}

        gs = grid_search.GridSearchCV(knn_pipe, parameters, scoring=score, cv=cv)

        gs.fit(features, labels)

         #This is the model you pass to tester.py
        clf = gs.best_estimator_

        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "

        best_parameters = gs.best_estimator_.get_params()

        print " "
        print "Best Parameters- by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results  (will print the Grid Search score)
        print "Grid Search Classification report:" 
        print " "
        print classification_report(labels, labels_pred)
        print ' ' 

        # Print Results  (will print the tester.py score)
        print "tester.py Classification report:" 
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def getSVC():
		
	print "==============="
	print "SVC"
	print "==============="

	for score in scores:

		print score
		print

		parameters = {'sv__C': [0.01, 0.1, 1, 500, 1000, 5000, 10000, 50000, 100000], 'sv__kernel':['linear'],
			'selector__k':range(3, 22, 1)} #'sv__gamma':[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 1, 10, 100, 500, 1000], 

		gs = grid_search.GridSearchCV(sv_pipe, parameters, scoring=score, cv=cv)

		gs.fit(features, labels)

		 #This is the model you pass to tester.py
		clf = gs.best_estimator_

		print " "
		print "Optimal Model - by Grid Search"
		print clf
		print " "

		best_parameters = gs.best_estimator_.get_params()

		print " "
		print "Best Parameters- by Grid Search"
		print best_parameters
		print " "

		labels_pred = gs.predict(features)

		# Print Results  (will print the Grid Search score)
		print "Grid Search Classification report:" 
		print " "
		print classification_report(labels, labels_pred)
		print ' ' 

		# Print Results  (will print the tester.py score)
		print "tester.py Classification report:" 
		print " "
		test_classifier(clf, my_dataset, features_list)
		print " "
		print
def getNB():

	print "==============="
	print "GaussianNB"
	print "==============="

	for score in scores:

		print score
		print

		parameters = {'selector__k':range(3, 22, 1)}	

		gs = grid_search.GridSearchCV(nb_pipe, parameters, scoring=score, cv=cv)
			
		gs.fit(features, labels)

		 #This is the model you pass to tester.py
		clf = gs.best_estimator_

		print " "
		print "Optimal Model - by Grid Search"
		print clf
		print " "

		best_parameters = gs.best_estimator_.get_params()

		print " "
		print "Best Parameters- by Grid Search"
		print best_parameters
		print " "

		labels_pred = gs.predict(features)

		# Print Results  (will print the Grid Search score)
		print "Grid Search Classification report:" 
		print " "
		print classification_report(labels, labels_pred)
		print ' ' 

		# Print Results  (will print the tester.py score)
		print "tester.py Classification report:" 
		print " "
		test_classifier(clf, my_dataset, features_list)
		print " "
		print
def train_test():

    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.3, random_state=42)

    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(features_train, labels_train)
    print test_classifier(clf, my_dataset, features_list)

    ### Print feature importance in order

    features_imp = {}
    for i in xrange(len(features_list) - 1):
        features_imp[features_list[1 + i]] = clf.feature_importances_[i]

    pprint(sorted(features_imp.items(), key=operator.itemgetter(1), reverse=True))
Example #19
def main():
    data_dict = pickle.load(open("final_project_dataset.pkl", "r"))
    my_dataset = data_dict
    my_dataset = AddFeatures(my_dataset)
    # Excluded at our discretion.
    Exc1 = ["email_address"]
    # Replaced by creating better versions of the features
    Exc2 = ["to_messages", "from_messages", "from_this_person_to_poi", "from_poi_to_this_person"]
    # Excluded because they are highly correlated with stronger features
    Exc3 = [
        "deferral_payments",
        "expenses",
        "deferred_income",
        "restricted_stock_deferred",
        "director_fees",
        "long_term_incentive",
        "bonus",
        "total_payments",
        "salary",
        "total_stock_value",
        "restricted_stock",
        "exercised_stock_options",
        "other",
    ]
    exclude = Exc1 + Exc2 + Exc3
    # QueryDataSet(my_dataset)
    # ShowCorrel(my_dataset)
    features_list = next(my_dataset.itervalues()).keys()
    for i in exclude:
        features_list.remove(i)
    features_list.insert(0, features_list.pop(features_list.index("poi")))
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    ### Extract features and labels from dataset for local testing
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.1, random_state=42, stratify=labels
    )
    # clf = TuneSVM(features, labels, features_list)
    # clf = TuneKNN(features, labels, features_list)
    # clf = NoTuneDT(features, labels, features_list)
    clf = TuneDT(features, labels, features_list)  # one of these options must stay active so clf is defined below
    if features_list[0] != "poi":
        features_list.insert(0, "poi")
    dump_classifier_and_data(clf, my_dataset, features_list)
    test_classifier(clf, my_dataset, features_list)
    def train_and_predict(first,second):
        #trains the model and returns the value of desired evaluation metric
        
        features_list = ["poi",first,second]
        data = featureFormat(my_dataset, features_list, sort_keys = True)
        labels, features = targetFeatureSplit(data)

        from sklearn.naive_bayes import GaussianNB
        from sklearn import tree

        if dt:
            clf = tree.DecisionTreeClassifier()
        else:
            clf = GaussianNB()

        if f1:
            return test_classifier(clf, my_dataset, features_list,return_F1=True)
        else:
            return test_classifier(clf, my_dataset, features_list,return_precision=True)
Example #21
def get_top_features_all_data(X_df, y_df, grid_searcher, top_N=9):
    '''Give an estimate of the model produced by grid_search using features
        selected from using ExtraTreesClassifier on the entire dataset before
        searching for a model.
        
    In general, this may produce overly optimistic results since there is 
        leakage from the test dataset when selecting features using the entire
        dataset.
        This is to show that this can improve cross-validated internal testing
        over choosing kbest within each cross-validation fold, but is still
        overly optimistic if the model were to be used on completely new data.
        
    Args:
        X_df: Pandas dataframe of features used to predict.
        y_df: Pandas dataframe of labels being predicted.
        grid_searcher: GridSearchCV object being searched over for optimal 
            tuning parameters.
        top_N: Top N features to retain based on feature importances obtained
            from the ExtraTreesClassifier estimator used in the
            top_N_features() function.
    Returns:
        A list of the top N features that were selected to be fed into the
        GridSearchCV object.
    
    Prints:
        Test results from the 1000 cross-validation splits testing in tester.py
        
    '''
    top_N_features = top_importances(X_df, y_df, top_N=top_N)
    top_N_names = list(top_N_features.index)
    X_df = X_df[top_N_names]
    features_list = ['poi'] + list(top_N_names)
    grid_searcher.fit(X_df, y_df)
    clf = grid_searcher.best_estimator_
    my_dataset = combine_to_dict(features_df=X_df, labels_df=y_df)
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    test_classifier(clf, my_dataset, features_list)
    return top_N_features
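The docstring above contrasts selecting features on the entire dataset (with leakage) against choosing k-best inside each cross-validation fold. A hedged sketch of that fold-internal alternative, where SelectKBest sits inside the pipeline so each training split re-selects its own features; kbest_inside_cv and the k values are illustrative, not from the original:

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit

def kbest_inside_cv(features, labels, k_values=(5, 8, 10)):
    # Selection happens inside the pipeline, so every CV fold picks its
    # k best features from its own training portion only (no leakage).
    pipe = Pipeline([('select', SelectKBest()), ('clf', GaussianNB())])
    cv = StratifiedShuffleSplit(labels, n_iter=100, random_state=42)
    grid = GridSearchCV(pipe, {'select__k': list(k_values)}, scoring='f1', cv=cv)
    grid.fit(features, labels)
    return grid.best_estimator_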
Example #22
def find_best_features(feature_names, features, labels, classifier_fun, search_grid, normalize_data=False):
    results = []

    processed_features = np.array(features)
    processed_labels = labels
    if normalize_data:
        scaler = StandardScaler()
        processed_features = scaler.fit_transform(processed_features, processed_labels)

    feature_selector = SelectKBest(k="all")
    feature_selector.fit(processed_features, processed_labels)

    ranked_features = sorted(zip(feature_names, feature_selector.scores_), key=lambda t: t[1], reverse=True)
    ranked_feature_names = [t[0] for t in ranked_features]

    logging.info("Scored features:\n%s", pprint.pformat(ranked_features))
    logging.info("Ranked feature names: %s", ranked_feature_names)

    for k in range(1, len(feature_names) + 1):
        logging.info("Selecting %s best feature(s)", k)

        selected_feature_names = ranked_feature_names[:k]

        logging.info("Selected features: %s", selected_feature_names)

        feature_indices = [feature_names.index(f) for f in selected_feature_names]
        feature_subset = processed_features[:, feature_indices]

        clf = classifier_fun(random_state=98123)

        logging.info("Tuning classifier parameters.")
        clf_tune = grid_search.GridSearchCV(
            clf, search_grid, n_jobs=-1, cv=StratifiedShuffleSplit(labels, n_iter=1000, random_state=42), scoring="f1"
        )
        clf_tune.fit(feature_subset, processed_labels)

        logging.info("Scores:\n%s", pprint.pformat(clf_tune.grid_scores_))
        logging.info("Best parameters: %s with score %s", clf_tune.best_params_, clf_tune.best_score_)

        clf = classifier_fun(random_state=1987341, **clf_tune.best_params_)

        logging.info("Testing classifier.")

        precision, recall, f1 = test_classifier(clf, feature_subset, processed_labels)

        results.append((k, precision, recall, f1))

    logging.info("Best features:\n%s", pprint.pformat(results))
Example #23
def analyze_feats(each_feature_set, my_dataset, scoresheet_highest_accuracy,
                  scoresheet_highest_precision):
    data = featureFormat(my_dataset, each_feature_set, sort_keys=True)
    labels, features = targetFeatureSplit(data) 
    features_train, features_test, labels_train, labels_test = (
        train_test_split(features, labels, test_size=0.5, random_state=42))

    # ################## For each feature set, tune the SVC parameter and
    # return the best SVC parameters
    # tuned_parameters = [{'kernel': ['rbf'], 'C': [1, 3, 10, 100, 1000],
    # 'degree':[1,2,3]}]
    # #score = 'precision'
    # clf = GridSearchCV(SVM, tuned_parameters)
    # clf.fit(features_train, labels_train)
    # SVM = clf.best_estimator_
    # print SVM

    # For each feature set, tune the SVC parameter and return the best SVC
    # parameters
    DT_tuned_parameters = [{'min_samples_split': [30, 40, 50]}]
    # score = 'precision'
    dt_clf = GridSearchCV(tree.DecisionTreeClassifier(), DT_tuned_parameters)
    dt_clf.fit(features_train, labels_train)
    DT = dt_clf.best_estimator_
    print DT
    classifier_type = [DT]
    # continue
    # run each type of classifier and return results
    try:
        total_results = []
        for index, each_clf in enumerate(classifier_type):
            results = test_classifier(each_clf, my_dataset, each_feature_set)
            print each_feature_set, results
            total_results.append(results)

        # for a given feature set, find the classifier with highest
        # precision/accuracy and store it in a list
        for index, num in enumerate(total_results):
            if num[1] == max([accuracy[1] for accuracy in total_results]):
                # print "Highest accuracy: \t", num[0], num[1]
                scoresheet_highest_accuracy.append(
                    [each_feature_set, total_results[index]])
            if num[1] == max([precision[1] for precision in total_results]):
                # print "Highest precision: \t", num[0], num[1]
                scoresheet_highest_precision.append(
                    [each_feature_set, total_results[index]])
    except:
        pass
Example #24
def try_all_k_best(max=13):
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.25, random_state=42)

    for k in range(1,max+1):
        pipe = Pipeline([('impute', Imputer(strategy='median')), 
                         ('select', SelectKBest(k=k)),
                         ('classify', LogisticRegressionCV())])
        pipe.fit(features_train, labels_train)
        total_predictions, accuracy, precision, recall, f1, f2 = \
          test_classifier(pipe, my_dataset, features_list, folds=1000)
        acc.append(accuracy)
        prec.append(precision)
        reca.append(recall)     
def one_feature_predict(features_list, my_dataset):
    feature_pairs = []
    for i in features_list:
        if i != 'poi':
            feature_pairs.append(['poi', i])
    #print feature_pairs
    mycolumns = ['feature_list', 'accuracy', 'precision', 'recall', 'f1', 'f2']
    resultdf = pd.DataFrame(columns=mycolumns)
    for item in feature_pairs:
        data = featureFormat(my_dataset, item, sort_keys = True)
        labels, features = targetFeatureSplit(data)

        clf = tree.DecisionTreeClassifier(min_samples_split = 4)
        clf.fit(features, labels)
        resultdf.loc[len(resultdf)] =  (test_classifier(clf, my_dataset, item))
    return resultdf
Example #26
def analyze_feats(each_feature_set, my_dataset, classifier_type, scoresheet_highest_accuracy, 
                scoresheet_highest_precision, scoresheet_highest_recall):

    # run each type of classifier and return results
    try:
        total_results = []

        for index, each_clf in enumerate(classifier_type):
            results, feature_importances = test_classifier(each_clf, my_dataset, each_feature_set)
            if len(feature_importances) > 0:
                # results, feature_importances = test_classifier(each_clf, my_dataset, each_feature_set)
                print "####CLF NAME",each_clf
                print "#####Length of feature_importances",len(feature_importances)
                feature_importances = np.asarray(feature_importances)
                importances = zip(np.mean(feature_importances, axis=0), each_feature_set[1:])
                importances = sorted(importances, key=lambda i: i[0], reverse=True)
                print "#####Length of importances",len(importances)
                print importances

            print each_feature_set, results
            total_results.append(results)

        print "total_results",total_results 
        # for a given feature set, find the classifier with highest
        # precision/accuracy and store it in a list
        for index, num in enumerate(total_results):
            if num[1] == max([accuracy[1] for accuracy in total_results]):
                print "Highest accuracy: \t", num[0], num[1]
                scoresheet_highest_accuracy.append(
                    [each_feature_set, total_results[index]])
            if num[2] == max([precision[2] for precision in total_results]):
                print "Highest precision: \t", num[0], num[2]
                scoresheet_highest_precision.append(
                    [each_feature_set, total_results[index]])
            if num[3] == max([recall[3] for recall in total_results]):
                print "Highest recall: \t", num[0], num[3]
                scoresheet_highest_recall.append(
                    [each_feature_set, total_results[index]])
    except:
        pass


    return 
Example #27
def tuneNB():
    for i in range(1, 20):
        acc = []
        prec = []
        reca = []
        testing_features_list = [u'poi']
        for feature in features_list_score_order:
            testing_features_list.append(feature)
            pipe = Pipeline([('impute', Imputer(strategy='median')), 
                    ('classify', GaussianNB(priors=[(i/2.)*.1, (1 - (i/2.)*.1)]))])
            total_predictions, accuracy, precision, recall, f1, f2 = \
                test_classifier(pipe, my_dataset, testing_features_list, folds=200)
            acc.append(accuracy)
            prec.append(precision)
            reca.append(recall)
        acc_all.append(acc)
        prec_all.append(prec)
        reca_all.append(reca)
        results_dict['prec' + str(i)] = prec
        results_dict['reca' + str(i)] = reca
        results_dict['acc' + str(i)] = acc
param_list = {
    'min_samples_leaf': [x for x in range(1, 10, 2)],
    'max_depth': [None, 1, 2, 4, 8, 12, 18],
    'max_features': ['log2', 'sqrt']
}

rfc = RandomForestClassifier(random_state=42)
clf = GridSearchCV(rfc, param_list, cv=5, verbose=3, n_jobs=-1)
clf_ = clf.fit(features, labels)
print clf.best_score_
print clf.best_estimator_

print "Training Set Score:", clf.score(X_train, y_train)
print "Validation Set Score:", clf.score(X_val, y_val)

## Cross Validation of Model
test_classifier(clf.best_estimator_, features, labels, folds=100)

###########################################################################
## Make predictions

test_features = scl.fit_transform(
    df_test.drop(['Survived', 'PassengerId'], axis=1).values)

clf.best_estimator_.fit(features, labels)
predictions = clf.best_estimator_.predict(test_features)

output_df = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': pd.Series(predictions)
})
output_df = output_df.astype('Int64')
Example #29
    'total_stock_value', 'prop_to_poi', 'prop_from_poi'
]  # You will need to use more features

from tester import test_classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=42)
clf2 = GaussianNB()
clf3 = RandomForestClassifier(random_state=42)
clf4 = SGDClassifier(random_state=42)

test_classifier(clf, fp, features_list, folds=1000)
test_classifier(clf2, fp, features_list, folds=1000)
test_classifier(clf3, fp, features_list, folds=1000)
test_classifier(clf4, fp, features_list, folds=1000)

### Results from the initial classifiers using all features
### DecisionTreeClassifier Accuracy: 0.80840	Precision: 0.26327	Recall: 0.24300	F1: 0.25273	F2: 0.24680 #BEST higher overall precision, recall& F1
### GaussianNB             Accuracy: 0.83920	Precision: 0.32890	Recall: 0.19800	F1: 0.24719	F2: 0.21512
### RandomForestClassifier Accuracy: 0.86073	Precision: 0.42811	Recall: 0.13250	F1: 0.20237	F2: 0.15373
### SGDClassifier          Accuracy: 0.52980	Precision: 0.10665	Recall: 0.34250	F1: 0.16265	F2: 0.23747

# Using Decision Tree Classifier to find attributes of importance
fp = pd.DataFrame(fp)
fp = fp.transpose()

X = fp.drop(['poi'], axis=1)
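The snippet stops right after building X. A hedged continuation of the importance ranking it sets up, assuming y is the 'poi' column of the same transposed frame and that the remaining columns need numeric coercion:

import pandas as pd
from sklearn.tree import DecisionTreeClassifier

y = fp['poi'].astype(int)                                  # assumed POI labels from the same frame
X_num = X.apply(pd.to_numeric, errors='coerce').fillna(0)  # coerce 'NaN' strings to numbers
ranker = DecisionTreeClassifier(random_state=42)
ranker.fit(X_num, y)
importances = pd.Series(ranker.feature_importances_, index=X_num.columns)
print(importances.sort_values(ascending=False))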
Example #30
dt = DecisionTreeClassifier()  # dt must be defined before the grid search below
t0 = time()
grid_obj = GridSearchCV(dt, parameters, scoring='f1', cv=sss)
print "======== Decision Tree (Optimized) ========"
print("DecisionTree tuning: %r" % round(time() - t0, 3))
# TODO: Fit the grid search object to the training data and find the optimal parameters
t0 = time()
grid_obj = grid_obj.fit(features, labels)
print("DecisionTree fitting: %r" % round(time() - t0, 3))
# Get the estimator
dt = grid_obj.best_estimator_
## Print the parameters
print dt.get_params(), '\n'

print 'Result of feature_list without new create feature:'
test_classifier(dt,
                my_dataset,
                features_list_without_create_feature,
                folds=100)

print 'Result of feature_list with new create feature:'
test_classifier(dt, my_dataset, features_list, folds=100)

clf = dt
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
Example #31
    clf = clf.fit(feature_train, target_train)
    accuracy_grid = clf.score(feature_train, target_train)
    print "Best estimator found by grid search:"
    print clf.best_estimator_
    return clf.best_estimator_, accuracy_grid


###############################################################################

enron_method_svm = SVR(kernel='rbf')
param_grid_svm = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
[clf_grid, acc5] = val_grid(enron_method_svm, features_train, labels_train,
                            param_grid_svm)

print acc5

[accuracy, precision, recall, f1,
 f2] = test_classifier(clf_svm,
                       pd.DataFrame(data_dict),
                       features_list,
                       folds=1000)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf_reg, my_dataset, features_list)
Example #32
# parameters = {
	# 'anova__k': (2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21),
	# 'anova__k': [2,4,6,8,10,12,14,16,18,20],
    # 'pca__whiten': [True, False], 
    # 'pca__n_components': [6,8,10,11],			# For use with PCA
    # 'clf__min_samples_split':[2,10,20,30,40,50], 		# for use with DecisionTree
    # 'clf__criterion': ['gini','entropy']				# for use with DecisionTree
    # 'clf__n_estimators': [50,100,200],				# For use with Adaboost
    # 'clf__C': [1, 10, 100, 1e3, 5e3, 1e4, 5e4, 1e5],	# for use with SVM
    # 'clf__gamma': [0.0001, 0.0005, 0.001, 0.005, 		# for use with SVM
    # 	0.01, 0.1],
    # 'clf__kernel': ['linear','rbf','poly']				# for use with SVM
# }

## Create Cross Validation object for use in GridSearchCV
# cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)

## Apply GridSearchCV to the dataset
# clf = GridSearchCV(clf, parameters, scoring = 'f1', cv=cv)
# clf.fit(features, labels)

## Set the best performing combination of parameters as the new classifier
# clf = clf.best_estimator_

## Use included tester function to assess performance using cross validation
test_classifier(clf, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so 
### anyone can run/check your results.

dump_classifier_and_data(clf, my_dataset, features_list)
Example #33
## The DT algorithm results will be displayed while running tester.py
# dt_pred = dt_clf.predict(features_test)
# print "DT best accuracy:", accuracy_score(dt_pred,labels_test)
# print "DT Precision:", precision_score(labels_test, dt_pred)
# print "DT Recall:", recall_score(labels_test, dt_pred)

###########################################################################################

## Run tester.py

selected_features = [
    'poi', 'total_payments', 'total_stock_value', 'salary', 'bonus',
    'fraction_from_poi', 'fraction_to_poi'
]
dump_classifier_and_data(dt_clf_best, enron_less_outliers, selected_features)
test_classifier(dt_clf_best, enron_less_outliers, selected_features)

##########################################################################################

###########################################################################################
# ##  Decision Tree Using SelectK in GridSearchCV
#
# print "Check performance of Decision Tree Using SelectK in GridSearchCV"
#
# data = featureFormat(enron_less_outliers, features_all, sort_keys=True)
# labels, features = targetFeatureSplit(data)
#
# features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
#     features, labels, test_size=0.2, random_state=42)
# dt = tree.DecisionTreeClassifier(random_state=42)
#
Example #34
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

print "Tuning parameters of classifiers"
print

# Naive Bayes classifier

print "Performing Grid Search of Naive Bayes classification"
param_dict_NB = {'feature_selection__k': range(5, len(features_list))}
gs = grid_search(steps_NB, param_dict_NB, features, labels)
gs_clf_NB = gs.best_estimator_
print '\n Score Metrics Naive Bayes Classifier'
test_classifier(gs_clf_NB, data_dict, features_list, folds=1000)
print

# The rest of the parameter tuning is commented out because of execution time;
# NB turned out to be the classifier with the best scores
'''
# Decision Tree classifier

print "Performing Grid Search of Decission Tree classification"
param_dict_DT = {'feature_selection__k': range(5, len(features_list)),\
                 'Decission_Tree__criterion': ['gini', 'entropy'],\
                 'Decission_Tree__min_samples_split' : [2, 3, 4, 5, 6, 7, 8, 9, 10]}
gs = grid_search(steps_DT, param_dict_DT, features, labels)
gs_clf_DT = gs.best_estimator_
print '\n Score Metrics Decision Tree Classifier'
test_classifier(gs_clf_DT, data_dict, features_list, folds = 100)
                                           false_positives, false_negatives,
                                           true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf


##running through different fold inputs of k-fold cross-validation
#folds = [2,3,5,10]
#for each in folds:
#    test_classifier_kfold(clf,my_dataset,features_list,each)

##test the algorithm multiple times and obtain the accuracy, precision, and recall averages
tot_accuracy = 0
tot_precision = 0
tot_recall = 0
i = 0
while i < 10:
    accuracy, precision, recall = test_classifier(clf, my_dataset,
                                                  features_list)
    tot_accuracy += accuracy
    tot_precision += precision
    tot_recall += recall
    i += 1
print tot_accuracy / float(10), tot_precision / float(10), tot_recall / float(
    10)

### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.

dump_classifier_and_data(clf, my_dataset, features_list)
Example #36
cv = StratifiedShuffleSplit(labels, folds, random_state=42)
for train_idx, test_idx in cv:
    features_train = []
    features_test = []
    labels_train = []
    labels_test = []
    for ii in train_idx:
        features_train.append(features[ii])
        labels_train.append(labels[ii])
    for jj in test_idx:
        features_test.append(features[jj])
        labels_test.append(labels[jj])

## Initial algorithms scores
clf_AB = AdaBoostClassifier()
tester.test_classifier(clf_AB, data_dict, features_list)
clf_RBF = SVC(kernel='rbf', max_iter=1000)
tester.test_classifier(clf_RBF, data_dict, features_list)
clf_RF = RandomForestClassifier()
tester.test_classifier(clf_RF, data_dict, features_list)
clf_SVC = SVC(kernel='linear', max_iter=1000)
tester.test_classifier(clf_SVC, data_dict, features_list)
clf_NB = GaussianNB()
tester.test_classifier(clf_NB, data_dict, features_list)
clf_KNN = KNeighborsClassifier()
tester.test_classifier(clf_KNN, data_dict, features_list)

# ### Task 5: Tune your classifier to achieve better than .3 precision and recall
# ### using our testing script. Check the tester.py script in the final project
# ### folder for details on the evaluation method, especially the test_classifier
# ### function. Because of the small size of the dataset, the script uses
Example #37
tree_features = {}

# show which features correspond to average importances
for idx, elem in enumerate(analyzed_features_list[1:]):
    tree_features[elem] = get[idx]
print tree_features

fin_feat_tree = [
    'poi', '%frompoi', 'shared_receipt_with_poi', 'exercised_stock_options',
    'expenses'
]

#Gaussian NB comparison
clf = GaussianNB()
test_classifier(clf, my_dataset, fin_feat_kbest, folds=1000)

clf = GaussianNB()
test_classifier(clf, my_dataset, fin_feat_tree, folds=1000)

# preparing ground for DT classifier
data = featureFormat(my_dataset, fin_feat_kbest, sort_keys=True)
data2 = featureFormat(my_dataset, fin_feat_tree, sort_keys=True)
labels, features = targetFeatureSplit(data)
labels2, features2 = targetFeatureSplit(data2)

tuned_parameters_tree = [{
    'max_depth': [2, 3, 4, 5, 6],
    'min_samples_leaf': [1, 2, 3, 4]
}]
Example #38
#    grid_search.fit(features,labels)
#    pprint.pprint(grid_search.grid_scores_)

# use K-best to rank the best features
k_best = SelectKBest()
k_best.fit(features, labels)
results_list = zip(k_best.get_support(), features_list[1:], k_best.scores_)
results_list = sorted(results_list, key=lambda x: x[2], reverse=True)
# print the scores for each feature
pprint.pprint(results_list)

# use feature_importances_ from a decision tree classifier to rank the best features
from tester import test_classifier, dump_classifier_and_data
from sklearn import tree
clf_test = tree.DecisionTreeClassifier()
test_classifier(clf_test, my_dataset, features_list)
importance = clf_test.feature_importances_
for i in range(len(importance)):
    print features_list[i + 1] + ": " + str(importance[i])
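Before the manual selection below, a hedged sketch of how the two rankings could be combined programmatically; top_n is an illustrative cut-off, not from the original:

top_n = 5  # illustrative cut-off
kbest_top = [name for _, name, _ in results_list[:top_n]]
tree_rank = sorted(zip(features_list[1:], importance), key=lambda p: p[1], reverse=True)
tree_top = [name for name, _ in tree_rank[:top_n]]
merged_candidates = set(kbest_top) | set(tree_top)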

# Using the ranking from K-best and merging it with the more important features obtained from the Decision tree classifier,
# I decided to select the best of both sets:
features_list = [
    'poi', 'exercised_stock_options', 'total_stock_value', 'salary',
    'fraction_to_poi', 'restricted_stock', 'shared_receipt_with_poi'
]

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
Example #39
labels, features = targetFeatureSplit(data)


# In[7]:


### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.

clf = GaussianNB()
test_classifier(clf,my_dataset,features_list)
clf1=tree.DecisionTreeClassifier()
test_classifier(clf1,my_dataset,features_list)
clf2 = AdaBoostClassifier()
test_classifier(clf2,my_dataset,features_list)
clf3=KNeighborsClassifier(n_neighbors = 4)
test_classifier(clf3,my_dataset,features_list)


# In[8]:

from sklearn.neighbors.nearest_centroid import NearestCentroid
clf4 = NearestCentroid()
test_classifier(clf4,my_dataset,features_list)

    ]))
feature_list_default.insert(
    0, feature_list_default.pop(feature_list_default.index("poi")))

# test different classifiers with the default feature set (for more details, please see the notebook)
for clf in [
        GaussianNB(),
        KMeans(),
        LogisticRegression(class_weight="balanced"),
        SVC(class_weight="balanced"),
        ADA(),
        DT(class_weight="balanced"),
        RF(class_weight="balanced")
]:

    tester.test_classifier(clf, final_dataset, feature_list_default)

### 4.1: Evaluate the impact of feature engineering on classification performance (for more details, please refer to the notebook). Test only with SVC (the best classifier).

print "BASELINE PERFORMANCE -default feature set and SVC with linear kernel"
print "--------------------------------------------------------------------"
clf = SVC(kernel="linear", class_weight="balanced")
tester.test_classifier(clf, final_dataset, feature_list_default)

print "EXTENDED FEATURE SET1 PERFORMANCE -default feature set + TF-IDF features and SVC with linear kernel"
print "--------------------------------------------------------------------"
selected_feature_list = feature_list_default + [
    'word_feature_2', 'word_feature_3'
]
clf = SVC(kernel="linear", class_weight="balanced")
tester.test_classifier(clf, final_dataset, selected_feature_list)
Example #41
parameters_NB = dict(SelectKBest__k=range(1, 10))

pipeline = sklearn.pipeline.Pipeline(steps_NB)
grid = GridSearchCV(pipeline, param_grid=parameters_NB, cv=cv, scoring='f1')
grid.fit(features_train, labels_train)
predict = grid.predict(features_test)
report = classification_report(labels_test, predict)
best_params = grid.best_params_
#print report
print "PARAMETERS USED:"
print best_params
print grid.best_score_
clf_GNB = grid.best_estimator_
print "TUNED CLASSIFICATION REPORT:"
test_classifier(clf_GNB, my_dataset, RF_features_list, folds=1000)

#overwrite features_list
features_list = RF_features_list

#tuned Random Forest **without** SKB
steps_RF = [
    ('minmax', mms),
    #('SelectKBest', skb),
    ('random_forest', clf_RF)
]

parameters_RF = dict(  #SelectKBest__k = [6],
    random_forest__criterion=['gini'],
    random_forest__n_estimators=[9],
    random_forest__min_samples_split=[2],
Example #42
    # These did not perform well, so they were removed
    #ab_pca = {"PCA__n_components": range(4, 7), "PCA__whiten": [True, False]}
    #ab_k.update(ab_pca)

    ab_k.update(ab_params)

    enron.get_best_parameters_reports(pipe_ab, ab_k, features, labels)


if __name__ == '__main__':
    '''         GAUSSIAN NAIVE BAYES            '''

    # Set up the classifier to use
    clf = GaussianNB()
    # Evaluate with cross-validation
    print "Gaussian Naive Bayes : \n", tester.test_classifier(
        clf, my_dataset, best_features_list)
    """
    Gaussian Naive Bayes : 
    GaussianNB(priors=None)
        Accuracy: 0.84380	Precision: 0.40058	Recall: 0.34550	F1: 0.37101	F2: 0.35527
        Total predictions: 15000	True positives:  691	False positives: 1034	False negatives: 1309	True negatives: 11966
    
    None
        
    """
    '''         LOGISTIC REGRESSION             '''

    # Use the tuning helpers to find the best parameters for each algorithm

    #tune_logistic_regression()
    """
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html


# Provided to give you a starting point. Try a variety of classifiers.
#GaussianNB
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
test_classifier(clf,my_dataset,features_list,folds = 1000)

#decision tree
from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_leaf=1)
test_classifier(clf,my_dataset,features_list,folds = 1000)

#Adaboost
from sklearn.ensemble import AdaBoostClassifier 
clf = AdaBoostClassifier()
test_classifier(clf,my_dataset,features_list,folds = 1000)

#kNearestNeighbours
from sklearn.neighbors import KNeighborsClassifier
clf=KNeighborsClassifier(n_neighbors = 4)
test_classifier(clf,my_dataset,features_list)
Example #44
for i, clf in enumerate(classifiers):
    print 'Step 1: ', i, names[i]

    clf.fit(features_train, labels_train)

    pred = clf.predict(features_test)

    print "Precision: ", precision_score(labels_test, pred)
    print "Recall:    ", recall_score(labels_test, pred)
    print "F1:        ", f1_score(labels_test, pred)

    print 'done...'

for clf in classifiers:
    test_classifier(clf, my_dataset, features_list)

print '-------------------------------------------------------------------------------'

#%% Step 2:

classifier_opt = []

parameters = [
    dict(),
    dict(n_neighbors=range(1, 20, 1), weights=['uniform', 'distance']),
    dict(criterion=['gini', 'entropy'],
         min_samples_split=range(10, 30, 1),
         min_samples_leaf=range(1, 11, 1)),
    dict(criterion=['gini', 'entropy'],
         n_estimators=[5, 8, 10, 12, 25],
Example #45
precision_knn  = []
recall_knn     = []

# Apply SelectKBest to each classifier for k=1 to k=19
for i in range(1, 20):
    k = SelectKBest(f_classif, k=i)
    features_new = k.fit_transform(features, labels)
    selected_features_index = k.get_support()
    selected_features_list = features_list[selected_features_index]
    selected_features_list = np.insert(selected_features_list, 0, 'poi')
    print "===================="
    print "Selected features: ", selected_features_list

    # DecisionTree
    clf = tree.DecisionTreeClassifier()
    pre, rec = test_classifier(clf, my_dataset, selected_features_list, folds = 1000)
    precision_tree.append(pre)
    recall_tree.append(rec)

    # Naive Bayes
    clf = naive_bayes.GaussianNB()
    pre, rec = test_classifier(clf, my_dataset, selected_features_list, folds = 1000)
    precision_nb.append(pre)
    recall_nb.append(rec)

    # K Nearest Neighbors
    clf = neighbors.KNeighborsClassifier(n_neighbors=3)
    pre, rec = test_classifier(clf, my_dataset, selected_features_list, folds = 1000)
    precision_knn.append(pre)
    recall_knn.append(rec)
Example #46
# Out of the three, SVC seems to be the most accurate.


# In[89]:

from sklearn.tree import DecisionTreeClassifier
from tester import test_classifier
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit


test_classifier(DecisionTreeClassifier( random_state = 1), enron_data, features_final, folds = 100)

tree = DecisionTreeClassifier()

parameters = {'tree__criterion': ('gini','entropy'),
              'tree__splitter':('best','random'),
              'tree__min_samples_split':[2, 10, 20],
                'tree__max_depth':[10,15,20,25,30],
                'tree__max_leaf_nodes':[5,10,30]}
# use scaling in GridSearchCV
Min_Max_scaler = preprocessing.MinMaxScaler()


#features = Min_Max_scaler.fit_transform(features)
pipeline = Pipeline(steps=[('scaler', Min_Max_scaler), ('pca',PCA(n_components = 2)), ('tree', tree)])
cv = StratifiedShuffleSplit(target, 100, random_state = 42)
Example #47
          ('NB', nb_Clf), ('SVM', svm_Clf)]

results = []
names = []
scoring = 'accuracy'
df1 = df

f_list = df.columns
f_list = f_list[1:len(f_list) - 1]
f_list = map(str, f_list)
f_features = ['poi']
for fn in f_list:
    f_features.append(fn)
print(f_features)

test_classifier(l_Clf, my_dataset, f_features, folds=45)
test_classifier(lda_Clf, my_dataset, f_features, folds=45)
test_classifier(knn_Clf, my_dataset, f_features, folds=45)
test_classifier(rf_Clf, my_dataset, f_features, folds=45)

print("\n\n")
df['salary_bonus_ratio'] = df.salary.div(df.bonus)
df.loc[~np.isfinite(df['salary_bonus_ratio']), 'salary_bonus_ratio'] = 0
df['salary_expense_ratio'] = df.salary.div(df.expenses)
df.loc[~np.isfinite(df['salary_expense_ratio']), 'salary_expense_ratio'] = 0

features_list.append('salary_bonus_ratio')
features_list.append('salary_expense_ratio')

df = df[features_list]
df = df.apply(np.sqrt, axis=1)
Example #48

sd = StandardScaler()
fsl = FeatureSel(k_best=5, pca_comp=5)
# clf=Pipeline([("fsl",fsl),("sd",sd),("lvc",LinearSVC(C=0.000001))])


clf = Pipeline([("fsl", fsl), ("sd", sd), ("lvc", LinearSVC())])

gscv=GridSearchCV(clf,{"lvc__C":np.logspace(-6,-1,5),
                       "fsl__k_best":[1,5,10],
                       "fsl__pca_comp":[0,5,10]},
                  scoring="recall",verbose=0)


gscv.fit(np.array(features),np.array(labels))

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html


test_classifier(gscv.best_estimator_, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so 
### anyone can run/check your results.

dump_classifier_and_data(gscv.best_estimator_, my_dataset, features_list)
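FeatureSel above is a project-specific transformer that is not shown in this snippet. A hypothetical reconstruction with the same k_best and pca_comp parameters, assuming it simply concatenates the SelectKBest output with PCA components:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA

class FeatureSel(BaseEstimator, TransformerMixin):
    # Hypothetical sketch: keep the k best features and, optionally,
    # append pca_comp principal components computed from all features.
    def __init__(self, k_best=5, pca_comp=0):
        self.k_best = k_best
        self.pca_comp = pca_comp

    def fit(self, X, y=None):
        self.skb_ = SelectKBest(k=self.k_best).fit(X, y)
        self.pca_ = PCA(n_components=self.pca_comp).fit(X) if self.pca_comp else None
        return self

    def transform(self, X):
        parts = [self.skb_.transform(X)]
        if self.pca_ is not None:
            parts.append(self.pca_.transform(X))
        return np.hstack(parts)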
sss = StratifiedShuffleSplit(labels, n_iter =100, test_size=0.3, random_state = 42)
grid_search = GridSearchCV(pipeline, param_grid=parameters, cv = sss)

### Tried different parameters for StratifiedShuffleSplit and GridSearchCV
#sss= StratifiedShuffleSplit(n_iter = 20,test_size=0.5, random_state = 5)
#grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv = sss, verbose=10, scoring='f1')
#grid_search = GridSearchCV(pipeline, param_grid=parameters, cv = sss, error_score = 0, scoring='f1')

#print "Grid Search:   ", grid_search
#print(grid_search.best_estimator_.steps)
#print "\n", "Best parameters are: ", grid_search.best_params_, "\n"

grid_search.fit(features, labels)
clf = grid_search.best_estimator_

### Use tester.py's test_classifier to evaluate the best model found
from tester import test_classifier

# Use test_classifier to evaluate the model selected by GridSearchCV
print "\n", "Tester Classification report -  StratifiedShuffleSplit:" 
test_classifier(clf, data_dict, features_list)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

#print features_list
dump_classifier_and_data(clf, my_dataset, features_list)
Example #50
print("Training time : {}".format(end_fitting - start_fitting))

start_predicting = time()
svc_pred = svc_grid.predict(features_test)
end_predicting = time()
print("Predicting time : {}".format(end_predicting - start_predicting))

svc_accuracy = accuracy_score(labels_test, svc_pred)
print('SVC accuracy score : {}'.format(svc_accuracy))
print "f1 score :", f1_score(labels_test, svc_pred)
print "precision score :", precision_score(labels_test, svc_pred)
print "recall score :", recall_score(labels_test, svc_pred)
svc_best_estimator = svc_grid.best_estimator_
print(svc_best_estimator)

test_classifier(nb_grid.best_estimator_, my_dataset, features_list)

# Checking the effect of the new feature on the final classifier

test_features_list = [
    'poi', 'total_stock_value', 'exercised_stock_options', 'bonus',
    'deferred_income', 'long_term_incentive', 'restricted_stock', 'salary',
    'total_payments', 'other', 'shared_receipt_with_poi',
    'fraction_from_this_person_to_poi'
]

print "\n=================Effect of new feature on final classifier================="

test_classifier(nb_grid.best_estimator_, my_dataset, test_features_list)

###Task 6: Dump your classifier, dataset, and features_list so anyone can
Example #51
}

dtc_clf = sklearn.tree.DecisionTreeClassifier()
dtcclf = grid_search.GridSearchCV(dtc_clf, parameters, scoring=scoring, cv=cv)

dtcclf.fit(features, labels)
print 'best estimator:', dtcclf.best_estimator_
print 'best score:', dtcclf.best_score_
print 'Processing time:', round(time() - t0, 3), 's'

# Classifier validation
##DecisionTreeClassifier Validation No. 1 (StratifiedShuffleSplit, folds = 1000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_

test_classifier(dtc_best_clf, enron_data, eng_feature_list)

print 'Processing time:', round(time() - t0, 3), 's'

##DecisionTreeClassifier Validation No. 2 (Randomized, partitioned trials, n=1,000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_

scores = evaluate.evaluate_clf(dtc_best_clf,
                               features,
                               labels,
                               num_iters=1000,
                               test_size=0.3)  # assuming evaluate_clf returns per-iteration scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print 'Processing time:', round(time() - t0, 3), 's'
Example #52
# Importing the decision tree module from scikit-learn.
from sklearn import tree

# Creating a classifier with the optimized parameters.
clf_dt = tree.DecisionTreeClassifier(splitter='best',
                                      criterion='gini',
                                      class_weight='balanced',
                                      min_samples_leaf=1,
                                      min_samples_split=2,
                                      max_depth=5,
                                      max_leaf_nodes=4)

# Testing the classifier.
tester.test_classifier(clf=clf_dt,
                       dataset=my_dataset,
                       feature_list=features_list)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Please, find more information in the Jupyter Notebook.

# Example starting point. Try investigating other evaluation techniques!

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
Example #53
#

from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

features_train_zero, features_test_zero, labels_train_zero, labels_test_zero = train_test_split(
    features, labels, test_size=0.3, random_state=42)

clf = GaussianNB()
clf.fit(features_train_zero, labels_train_zero)
print test_classifier(clf, my_dataset, features_list, folds=1000)

#
#### Task 5: Tune your classifier to achieve better than .3 precision and recall
#### using our testing script. Check the tester.py script in the final project
#### folder for details on the evaluation method, especially the test_classifier
#### function. Because of the small size of the dataset, the script uses
#### stratified shuffle split cross validation. For more info:
#### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
#
## Example starting point. Try investigating other evaluation techniques!
#from sklearn.cross_validation import train_test_split
#features_train, features_test, labels_train, labels_test = \
#    train_test_split(features, labels, test_size=0.3, random_state=42)
#
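# Hypothetical sketch (not in the original snippet) showing one way the otherwise
# unused imports above (MinMaxScaler, SelectKBest, SVC, GridSearchCV) could be
# wired into a tuned pipeline. Parameter ranges are illustrative only.
from sklearn.pipeline import Pipeline

svc_pipe = Pipeline([('scale', MinMaxScaler()),
                     ('kbest', SelectKBest(f_classif)),
                     ('svc', SVC())])
svc_params = {'kbest__k': [3, 5, 'all'],
              'svc__C': [1, 10, 100],
              'svc__gamma': [0.01, 0.001]}
svc_grid = GridSearchCV(svc_pipe, svc_params, scoring='f1')
svc_grid.fit(features_train_zero, labels_train_zero)
test_classifier(svc_grid.best_estimator_, my_dataset, features_list, folds=1000)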
#### Task 6: Dump your classifier, dataset, and features_list so anyone can
Example #54
0
                                                    random_state=42,
                                                    stratify=labels)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# clf = DecisionTreeClassifier(min_samples_split=100)
# clf = SVC(C=10.0, gamma=0.001)
# clf = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=100), algorithm="SAMME")
clf = KNeighborsClassifier(n_neighbors=5, weights="distance", algorithm="auto")

test_classifier(clf, data_dict, selected_features_list)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# # tuned_parameters = {"criterion": ("gini", "entropy"), "max_depth": (None, 1, 2, 5, 7, 10), "min_samples_split": (10, 100, 250)}
# # tuned_parameters = {"C": (10.0, 100.0, 1000.0), "gamma": (1e-3, 1e-4)}
# # tuned_parameters = {"n_estimators": (50, 100, 150, 200), "learning_rate": (1.0, 1.5, 2.0), "algorithm": ("SAMME", "SAMME.R")}
# tuned_parameters = {"n_neighbors": (1, 5, 10, 15), "weights": ("uniform", "distance")}
#
# gs = GridSearchCV(clf, tuned_parameters, cv=10)
# gs.fit(X_train, y_train)
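# Hypothetical continuation of the commented-out grid search above: pull out the
# best estimator and re-run the project's tester on it (assumes X_train and
# y_train come from the stratified split at the top of this snippet).
# clf = gs.best_estimator_
# print "Best KNN parameters:", gs.best_params_
# test_classifier(clf, data_dict, selected_features_list)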
Example #55
0
clf_NB = GaussianNB()
parm = {}

clf_NB = Pipeline([('scaler', scaler), ('gnb', clf_NB)])
gs = GridSearchCV(clf_NB, parm)
gs.fit(features_train, labels_train)

clf_NB = gs.best_estimator_

print "\nGaussianNB score:\n", clf_NB.score(features_train, labels_train)
print "GaussianNB score time:", round(time() - t1, 3), "s"

##  Test Point


print "\nGaussianNB:\n", test_classifier(clf_NB, my_dataset, features_list)

## 2. Decision Tree Classifier


t2 = time()

parms = {'criterion': ['gini', 'entropy'], \
         'min_samples_split': [2, 5, 10, 20], \
         'max_depth': [None, 2, 5, 10], \
         'splitter': ['random', 'best'], \
         'max_leaf_nodes': [None, 5, 10, 20]}

clf_DT = tree.DecisionTreeClassifier()

gs = GridSearchCV(clf_DT, parms)
Example #56
0
parameters = {'max_depth': [1,2,3,4,5,6,8,9,10],
              'min_samples_split':[2,3,4,5,6,7,8],
              'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10],
              'criterion':('gini', 'entropy')}
dt_clf = DecisionTreeClassifier(random_state = 42)
cv = cross_validation.StratifiedShuffleSplit(labels, n_iter=10)
clf = GridSearchCV(dt_clf, parameters,cv=cv, scoring = 'f1')
clf.fit(features,labels)

predictions = clf.predict(features_test)
dt_best_estimator = clf.best_estimator_
precision = precision_score(labels_test, predictions)
recall = recall_score(labels_test, predictions)
f1 = f1_score(labels_test, predictions)
print "Best score:%f"%clf.best_score_
print dt_best_estimator
print "processing time:", round(time()-t0, 3), "s"


# Classifier validation
##DecisionTreeClassifier Validation 1 (StratifiedShuffleSplit, folds = 1000)
t0 = time()
test_classifier(dt_best_estimator, my_dataset, my_features_list)
print 'Processing time:', round(time() - t0, 3), 's'

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(dt_best_estimator, my_dataset, my_features_list)
 
### Create StratifiedKFold

skf = StratifiedKFold(labels_train, random_state=42)

### DecisionTreeClassifier

algo_name = "DecisionTreeClassifier"
print algo_name

from sklearn.tree import DecisionTreeClassifier
pipeline = Pipeline([('scaler', MinMaxScaler()), ('kbest', SelectKBest(f_classif)), ('dtc', DecisionTreeClassifier(random_state=42))])
grid_search = GridSearchCV(pipeline, {'kbest__k': range(1, 16), 'dtc__min_samples_split': [1, 2, 3], 'dtc__max_depth': [None, 10, 5]}, scoring='f1', cv=skf)
grid_search.fit(features_train, labels_train)
clf = grid_search.best_estimator_
perf_dict[algo_name] = test_classifier(clf, my_dataset, features_list)
parm_dict[algo_name] = grid_search.best_params_


### Print SelectKBest scores, note these are the same for all classifiers

kbest_scores = clf.named_steps['kbest'].scores_
feature_scores = {}
for i in xrange(1, len(features_list)):
    feature_scores[features_list[i]] = kbest_scores[i - 1]
feature_scores = sorted(feature_scores.items(), key=operator.itemgetter(1), reverse=True)
i = 1
for f in feature_scores:
    print "|{}|{}|{}|".format(i, f[0], f[1])
    i += 1
 
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
### Extract features and labels from my_dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
folds = 1000
cv = StratifiedShuffleSplit(labels, folds, random_state=42)
## Setting up 3 classifiers with feature scaling
## Gaussian NB
print "Gaussian NB classifier output:"
NB_clf = Pipeline(
    steps=[('scaling',
            preprocessing.MinMaxScaler()), ('classifier', GaussianNB())])
t0 = time()
tester.test_classifier(NB_clf, my_dataset, features_list)
print "Gaussian NB run time:", round(time() - t0, 3), "s"

## KMeans
print "KMeans classifier output:"
KM_clf = Pipeline(
    steps=[('scaling',
            preprocessing.MinMaxScaler()), ('classifier',
                                            KMeans(n_clusters=2))])
t0 = time()
tester.test_classifier(KM_clf, my_dataset, features_list)
print "KMeans run time:", round(time() - t0, 3), "s"

## Decision tree
print "Decision Tree classifier output:"
DT_clf = Pipeline(steps=[(
Example #59
0
#### keep the engineered features added to data_dict
my_dataset = data_dict

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

##### Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(class_weight='auto', random_state=42)
from time import time
from tester import test_classifier
t0 = time()
test_classifier(rf, data_dict, features_list, folds = 100)
print("Random forest fitting time: %rs" % round(time()-t0, 3))

##### AdaBoost
from sklearn.ensemble import AdaBoostClassifier
ab = AdaBoostClassifier(random_state=42)
t0 = time()
test_classifier(ab, data_dict, features_list, folds = 100)
print("AdaBoost fitting time: %rs" % round(time()-t0, 3))

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
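# Hypothetical Task 5 sketch (not in the original snippet): tune the AdaBoost
# classifier above with GridSearchCV over a stratified shuffle split, the same
# scheme tester.py uses. Assumes featureFormat/targetFeatureSplit from the
# project's feature_format module; parameter ranges are illustrative only.
from feature_format import featureFormat, targetFeatureSplit
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit

data = featureFormat(data_dict, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
cv = StratifiedShuffleSplit(labels, 100, random_state=42)
ab_params = {'n_estimators': [25, 50, 100],
             'learning_rate': [0.5, 1.0, 1.5]}
ab_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ab_params,
                       scoring='f1', cv=cv)
ab_grid.fit(features, labels)
print("AdaBoost best parameters: %r" % ab_grid.best_params_)
test_classifier(ab_grid.best_estimator_, data_dict, features_list, folds=100)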
Example #60
0
                       'metric': ['manhattan', 'minkowski', 'euclidean'],
                       'weights': ['distance', 'uniform']
                   },
                   cv=cv,
                   scoring='f1')
knn.fit(features, labels)
print 'K Nearest Neighbors best estimator: ', knn.best_estimator_
print 'K Nearest Neighbors best parameters: ', knn.best_params_
print 'K Nearest Neighbors best score: ', knn.best_score_
# tester.test_classifier(knn.best_estimator_, my_dataset, best_features)

# Pipeline
print "Pipelining..."
pipeline = Pipeline([('normalization', scaler),
                     ('classifier', knn.best_estimator_)])
tester.test_classifier(pipeline, my_dataset, best_features)

# Tune K Means
kmeans = GridSearchCV(
    KMeans(),
    param_grid={
        'n_clusters': [2],
        'tol': [0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01],
        'max_iter': [300, 200, 400, 500, 600, 700],
        'init': ['k-means++', 'random'],
        'copy_x': [True, False]
    },
    cv=cv,
    scoring='f1')
kmeans.fit(features, labels)
print 'K Means best estimator: ', kmeans.best_estimator_
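# Hypothetical follow-up (mirrors the KNN block above): wrap the tuned KMeans
# estimator in the same scaling pipeline and run it through the project tester.
km_pipeline = Pipeline([('normalization', scaler),
                        ('classifier', kmeans.best_estimator_)])
tester.test_classifier(km_pipeline, my_dataset, best_features)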