### imports needed by the helper functions below (legacy scikit-learn API,
### matching the StratifiedShuffleSplit call signature used here)
import itertools
import math

import pandas as pd
from sklearn import tree
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier


def test_classifier(clf, dataset, feature_list, folds = 1000):
    #print dataset
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                true_positives += 1
    try:

        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print 'Feature List:', feature_list
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        return [feature_list, accuracy, precision, recall, f1, f2]

    except ZeroDivisionError:
        print "Got a divide by zero when trying out:", clf, feature_list
        return [feature_list, -1, -1, -1, -1, -1]
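
### example usage (a sketch): PERF_FORMAT_STRING / RESULTS_FORMAT_STRING are
### assumed to be tester.py-style format templates available in this scope,
### and my_dataset is the data dictionary built in Task 3 below
# from sklearn.naive_bayes import GaussianNB
# print test_classifier(GaussianNB(), my_dataset, ['poi', 'salary', 'bonus'], folds = 1000)
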
def one_feature_predict(features_list, my_dataset):
    all = [['poi', feature] for feature in features_list if feature != 'poi']
    #print all
    mycolumns = ['feature_list', 'accuracy', 'precision', 'recall', 'f1', 'f2']
    resultdf = pd.DataFrame(columns=mycolumns)
    for item in all:
        data = featureFormat(my_dataset, item, sort_keys = True)
        labels, features = targetFeatureSplit(data)

        clf = tree.DecisionTreeClassifier(min_samples_split = 4)
        clf.fit(features, labels)
        resultdf.loc[len(resultdf)] = test_classifier(clf, my_dataset, item)
    return resultdf
#!/usr/bin/python

import pickle
import sys
import matplotlib.pyplot

sys.path.append("../tools/")
from MiniProjects.tools.feature_format import featureFormat, targetFeatureSplit


### read in data dictionary, convert to numpy array
data_dict = pickle.load(open("../../data/final_project_dataset.pkl", "r"))
data_dict.pop("TOTAL", 0)
# print data_dict
features = ["salary", "bonus"]
data = featureFormat(data_dict, features)

# print(type(data))
a = sorted(data, key=lambda tup: tup[0], reverse=True)
# print a


# LAVORATO, JOHN J                     339,288             8,000,000
# LAY, KENNETH L                    1,072,321              7,000,000
#
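
### a sketch of how unusually large salary/bonus combinations like the two
### noted above can be flagged programmatically (thresholds are illustrative)
for name, row in data_dict.items():
    if row["salary"] != "NaN" and row["bonus"] != "NaN":
        if row["salary"] > 1000000 and row["bonus"] > 5000000:
            print name, row["salary"], row["bonus"]
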
for point in data:
    salary = point[0]
    bonus = point[1]
    matplotlib.pyplot.scatter(salary, bonus)
#
matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()
import sys
import pickle
sys.path.append("../tools/")
from MiniProjects.tools.feature_format  import featureFormat, targetFeatureSplit
dictionary = pickle.load( open("../../data/final_project_dataset_modified.pkl", "r") )


### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True)


target, features = targetFeatureSplit( data )



### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"



### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color from "b" to "r"
### to differentiate training points from test points.
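
### a minimal sketch of the requested regression (assuming ordinary least
### squares is the intended model); it is named reg so the plotting code
### described above can pick it up
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(feature_train, target_train)
print "slope:", reg.coef_
print "intercept:", reg.intercept_
print "score on test data:", reg.score(feature_test, target_test)
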

def topk_feature_predict(k_best_features, my_dataset, normalize_data = False):
    new_list = k_best_features.keys()
    all = []
    for i in range(len(new_list)):
        if i != 0: # single features were already covered in step 1, so start with pairs
            all.extend([sorted(l) for l in itertools.combinations(new_list, i+1)])

    #use this to select only combinations of 4
    #all.extend([sorted(l) for l in itertools.combinations(new_list, 4)])

    for item in all: # prepend the 'poi' label to every feature combination
        item.insert(0, 'poi')


    mycolumns = ['feature_list', 'accuracy', 'precision', 'recall', 'f1', 'f2']
    resultdf2 = pd.DataFrame(columns=mycolumns)


    for item in all:
        data = featureFormat(my_dataset, item, sort_keys = True)


        if normalize_data:
            df = pd.DataFrame(data, columns=item)
            for column in df.columns[1:]:
                df[column] = (df[column] - df[column].mean()) / (df[column].std())
            labels = df['poi']
            features = df[item[1:]] # all columns except 'poi'
        else:
            labels, features = targetFeatureSplit(data)


        # Tree with 5 best:
        #clf = tree.DecisionTreeClassifier(min_samples_split = 4)

        # KNeighborsClassifier with 5 best:
        #clf = KNeighborsClassifier(algorithm='auto', metric='minkowski', metric_params=None, n_neighbors=5, p=2, weights='distance')
        #clf = KNeighborsClassifier(algorithm='auto', metric='minkowski', metric_params=None, n_neighbors=6, p=2, weights='distance')
        #clf = KNeighborsClassifier(algorithm='auto', metric='manhattan', metric_params=None, n_neighbors=6, p=2, weights='distance' , leaf_size=30) # best 1
        clf = KNeighborsClassifier(algorithm='auto', metric='minkowski', metric_params=None, n_neighbors=6, p=2, weights='distance', leaf_size=30) #best 2




        #Logistic regression:
        #clf = LogisticRegression(C=1000,penalty='l1',random_state=42,tol=-1000,class_weight='auto')
        #clf = LogisticRegression( C=1,penalty='l1',random_state=42,tol=10**-10,class_weight='auto')

        #Random Forest:
        #clf = RandomForestClassifier(n_estimators=10, min_samples_split = 4, n_jobs = -1, max_features = 0.5)

        #AdaBoost:
        #clf = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)

        clf.fit(features, labels)


        # only need to convert back to a dictionary if the data was put into a DataFrame for normalization
        if normalize_data:
            new_dataset = df_to_dict(features_df=features, labels_df=labels)
        else:
            new_dataset = my_dataset
        resultdf2.loc[len(resultdf2)] = test_classifier(clf, new_dataset, item)

    return resultdf2

### Task 3: Create new feature(s)
my_dataset = data_dict

for key in data_dict:
    if math.isnan(float(my_dataset[key]['bonus'])) or (math.isnan(float(my_dataset[key]['salary']))):
        my_dataset[key]['bonus_salary_ratio'] = 0
    else:
        my_dataset[key]['bonus_salary_ratio'] = round(float(my_dataset[key]['bonus']) /float(my_dataset[key]['salary']),2)
features_list.append('bonus_salary_ratio')



### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, all_features, sort_keys = True)  # all_features: the full candidate feature list, assumed defined earlier
#pprint.pprint(my_dataset)


labels, features = targetFeatureSplit(data)
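
### SelectKBestFeatures is not defined in this file; a minimal sketch of what
### it is assumed to do: rank features with sklearn's SelectKBest and return a
### {feature_name: score} dict, so that .keys() yields the selected names.
### Feature names are taken from all_features[1:] (everything except 'poi').
from sklearn.feature_selection import SelectKBest, f_classif

def SelectKBestFeatures(features, labels, k, verbose = False):
    selector = SelectKBest(f_classif, k = k)
    selector.fit(features, labels)
    scored = sorted(zip(all_features[1:], selector.scores_), key = lambda pair: pair[1], reverse = True)
    if verbose:
        print scored
    return dict(scored[:k])
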

k_best_features = SelectKBestFeatures(features, labels, 5, False)


#uncomment for Step 1:
#resultdf1 = one_feature_predict(features_list, my_dataset)
#print  tabulate(resultdf1.sort(['recall','accuracy', 'precision'], ascending = [0,0,0]) , headers='keys', tablefmt='psql', floatfmt=".4f")


#uncomment for Step 2
#resultdf2 = topk_feature_predict(k_best_features, my_dataset, False)
    if value["exercised_stock_options"] != "NaN":
        min_max.append(value["exercised_stock_options"])

print min(min_max), max(min_max)
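
### with the min and max above, a raw exercised_stock_options value can be
### rescaled to [0, 1] (min-max scaling); 1000000 is only an illustrative input
example_value = 1000000.0
print "rescaled:", (example_value - min(min_max)) / (max(min_max) - min(min_max))
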


### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
# feature_3 = 'total_payments'
poi = "poi"
features_list = [poi, feature_1, feature_2]


data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)


### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, line below assumes 2 features)
import matplotlib.pyplot as plt

for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()


from sklearn.cluster import KMeans

features_list = ["poi", feature_1, feature_2]