Code Example #1
def run():
    candidates = CandidateFeatureVector.objects.all().values()
    candidates = DataUtil.remove_fake_candidates(candidates)
    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id'], inplace=True)

    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)
    candidates_df.drop(
        columns=ConstantsUtil.FEATURES_IGNORED_BY_INFORMATION_GAIN,
        inplace=True)

    X = candidates_df.drop(columns=['classification'])
    y = candidates_df['classification']

    knn = KerasNN(input_dim=len(X.columns))
    X_train, X_test, y_train, y_test = knn.fit_model(X, y, 120, .3, .2)
    knn.plot_training_validation()
    knn.plot_roc_curve()
    print(knn.get_classification_report(X_test, y_test))
    knn.get_accuracy_stats(X_test, y_test)

    results = knn.test_n_iterations(X, y, 120, .3, .2, 10)
    print(results)
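
The KerasNN class above is project-specific, so the meaning of fit_model(X, y, 120, .3, .2) is inferred rather than documented here. As a hedged sketch only, assuming the positional arguments are epochs, test-split fraction, and validation-split fraction, that 'classification' is a binary 0/1 target, and with arbitrary layer sizes, roughly equivalent plain Keras/scikit-learn code might look like this:

# Sketch only: assumed equivalent of KerasNN.fit_model(X, y, 120, .3, .2)
# (120 epochs, 30% held-out test set, 20% of training data for validation).
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = Sequential([
    Input(shape=(X.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),   # assumes a binary 'classification' label
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train.values, y_train.values,
                    epochs=120, validation_split=0.2, verbose=0)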
Code Example #2
File: perform_pca.py  Project: dkStephanos/nba_thesis
def run():
    candidates = CandidateFeatureVector.objects.all().values()
    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id', 'classification'], inplace=True)

    candidates_df = EncodingUtil.basic_label_encode_cols(candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    pca = PCA(.9)
    pca.fit(candidates_df)
    print(pca.components_)
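
When PCA receives a float below 1, scikit-learn keeps the smallest number of components whose cumulative explained variance reaches that fraction, so PCA(.9) retains about 90% of the variance. A minimal standalone sketch of the same step; the StandardScaler call is an addition for illustration and is not in the original script:

# Sketch: PCA(0.9) keeps enough components to explain 90% of the variance.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaled = StandardScaler().fit_transform(candidates_df)   # scaling added here
pca = PCA(0.9)
reduced = pca.fit_transform(scaled)

print(pca.n_components_)              # number of components kept
print(pca.explained_variance_ratio_)  # variance share per component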
Code Example #3
    def get_candidates_df(candidates, drop_fakes=False, convert_fakes=True, drop_min_features=False):
        if drop_fakes:
            candidates = DataUtil.remove_fake_candidates(candidates)
        elif convert_fakes:
            candidates = DataUtil.convert_fake_candidates(candidates)
        candidates_df = pd.DataFrame(candidates)
        candidates_df.set_index('id', inplace=True)
        candidates_df.drop(columns=['candidate_id'], inplace=True)

        candidates_df = EncodingUtil.basic_label_encode_cols(candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
        candidates_df = EncodingUtil.sort_position_cols_and_encode(candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)
        if drop_min_features:
            candidates_df.drop(columns=ConstantsUtil.FEATURES_IGNORED_BY_INFORMATION_GAIN, inplace=True)

        return candidates_df
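
EncodingUtil is internal to the project and its source is not shown on this page. Purely as a hypothetical sketch of what a helper like basic_label_encode_cols could do with scikit-learn's LabelEncoder (the function body below is an assumption, not the project's actual implementation):

# Hypothetical column-wise label encoding; the real
# EncodingUtil.basic_label_encode_cols may differ.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def basic_label_encode_cols(df: pd.DataFrame, cols) -> pd.DataFrame:
    for col in cols:
        # Replace each categorical column with integer codes.
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    return df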
Code Example #4
def run():
    candidates = CandidateFeatureVector.objects.all().values()
    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id'], inplace=True)

    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    dt = DecisionTree(criterion="entropy")
    dt.set_data(candidates_df, 'classification')
    X_train, X_test, y_train, y_test = dt.split_test_data(.3, True)
    dt.get_roc_curve(X_train, X_test, y_train, y_test)

    dt.fit_and_predict(X_train, X_test, y_train)
    print(dt.get_confusion_matrix(y_test))
    print(dt.get_classification_report(y_test))

    dt.get_learning_curve()
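
DecisionTree here is a project wrapper; split_test_data(.3, True) appears to take a test fraction and a shuffle flag. Under that assumption, the fit-and-evaluate portion maps onto plain scikit-learn roughly as follows (a sketch, not the wrapper's actual code):

# Sketch: 70/30 shuffled split with the entropy criterion used above.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

X = candidates_df.drop(columns=['classification'])
y = candidates_df['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

clf = DecisionTreeClassifier(criterion='entropy')
y_pred = clf.fit(X_train, y_train).predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))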
Code Example #5
def run():
    candidates = CandidateFeatureVector.objects.all().values()
    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id'], inplace=True)

    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    svm = SVM(C=.75, kernel='poly')
    X_train, X_test, y_train, y_test = svm.split_test_data(
        candidates_df, .3, 'classification', True)
    svm.fit_and_predict(X_train, X_test, y_train)
    print(svm.get_confusion_matrix(y_test))
    print(svm.get_classification_report(y_test))

    estimator = svm.get_model()

    selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=50,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
    X, y = svm.get_data()
    selector = selector.fit(X, y)

    print(selector.support_)
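
selector.support_ is a boolean mask over the feature columns. As a follow-up illustration only (not part of the original script), one way to act on it is to pull out the selected column names and cross-validate a plain scikit-learn SVC with the same C=.75, kernel='poly' settings on the reduced feature set, assuming X is the feature DataFrame returned by svm.get_data():

# Illustration: refit on the genetically selected features.
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

selected_cols = X.columns[selector.support_]   # names of the kept features
print(list(selected_cols))

svc = SVC(C=0.75, kernel='poly')
scores = cross_val_score(svc, X[selected_cols], y, cv=5, scoring='accuracy')
print(scores.mean())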
Code Example #6
def run():
    feat_importances = []
    candidates = CandidateFeatureVector.objects.all().values()
    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    for _ in range(10):
        y = candidates_df['classification']
        X = candidates_df.copy()
        X.drop(columns=['candidate_id', 'classification'], inplace=True)

        X = EncodingUtil.basic_label_encode_cols(X, ConstantsUtil.BASIC_ENCODE_COLS)
        X = EncodingUtil.sort_position_cols_and_encode(X, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

        importances = mutual_info_classif(X.values, y)
        feat_importances.append(pd.Series(importances, index=X.columns))

    feature_df = pd.DataFrame(feat_importances)
    importance_means = feature_df.mean()
    
    importance_means.plot(kind='barh')
    plt.show()
    print(importance_means)
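
mutual_info_classif relies on a nearest-neighbour estimator with a random component, which is presumably why the script averages the importances over ten runs. A compact sketch of the same averaging, reusing the encoded X and y from the last loop iteration, plus a deterministic alternative via random_state (the seed value is arbitrary):

# Sketch: average MI estimates over repeats, or pin random_state instead.
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

runs = [pd.Series(mutual_info_classif(X.values, y), index=X.columns)
        for _ in range(10)]
mean_importance = pd.DataFrame(runs).mean().sort_values()
print(mean_importance)

single = mutual_info_classif(X.values, y, random_state=42)  # reproducible run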
    
Code Example #7
def run():
    candidates = CandidateFeatureVector.objects.all().values()
    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id'], inplace=True)

    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    gnb = GaussianNB()
    gnb.set_data(candidates_df, 'classification')
    X_train, X_test, y_train, y_test = gnb.split_test_data(.3, True)

    gnb.get_roc_curve(X_train, X_test, y_train, y_test)

    gnb.fit_and_predict(X_train, X_test, y_train)
    print(gnb.get_confusion_matrix(y_test))
    print(gnb.get_classification_report(y_test))

    gnb.get_learning_curve()

    print(gnb.get_avg_metrics_for_n_iterations(10, .3, True))
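
The GaussianNB used above appears to be a project wrapper around scikit-learn's estimator of the same name (set_data, split_test_data and get_avg_metrics_for_n_iterations are not scikit-learn methods). As a hedged sketch of what get_avg_metrics_for_n_iterations(10, .3, True) is assumed to do, averaging accuracy over ten shuffled 70/30 splits:

# Sketch: ten random 70/30 splits with scikit-learn's GaussianNB, accuracy averaged.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB as SkGaussianNB
from sklearn.metrics import accuracy_score

X = candidates_df.drop(columns=['classification'])
y = candidates_df['classification']

accuracies = []
for _ in range(10):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, shuffle=True)
    y_pred = SkGaussianNB().fit(X_tr, y_tr).predict(X_te)
    accuracies.append(accuracy_score(y_te, y_pred))

print(np.mean(accuracies))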