import pandas as pd
from sklearn.decomposition import PCA

# Project-local helpers (CandidateFeatureVector, DataUtil, EncodingUtil,
# ConstantsUtil, KerasNN) are assumed to be imported as elsewhere in the repo.


def run():
    # Load every candidate feature vector and drop the fake (synthetic) candidates.
    candidates = CandidateFeatureVector.objects.all().values()
    candidates = DataUtil.remove_fake_candidates(candidates)

    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id'], inplace=True)

    # Encode categorical and positional string-tuple columns as integers.
    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    # Keep only the features retained by the information gain analysis.
    candidates_df.drop(
        columns=ConstantsUtil.FEATURES_IGNORED_BY_INFORMATION_GAIN, inplace=True)

    X = candidates_df.drop(columns=['classification'])
    y = candidates_df['classification']

    # Train the Keras neural network, then plot training curves and report metrics.
    knn = KerasNN(input_dim=len(X.columns))
    X_train, X_test, y_train, y_test = knn.fit_model(X, y, 120, .3, .2)
    knn.plot_training_validation()
    knn.plot_roc_curve()
    print(knn.get_classification_report(X_test, y_test))
    knn.get_accuracy_stats(X_test, y_test)

    # Average the metrics over ten independent train/test runs.
    results = knn.test_n_iterations(X, y, 120, .3, .2, 10)
    print(results)


'''
Earlier PCA exploration of the same feature set (left commented out in the source):

def run():
    candidates = CandidateFeatureVector.objects.all().values()

    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id', 'classification'], inplace=True)

    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    # Keep enough principal components to explain 90% of the variance.
    pca = PCA(.9)
    pca.fit(candidates_df)
    print(pca.components_)
'''
import pandas as pd


def get_candidates_df(candidates, drop_fakes=False, convert_fakes=True, drop_min_features=False):
    # Optionally drop or relabel the fake (synthetic) candidates before building the frame.
    if drop_fakes:
        candidates = DataUtil.remove_fake_candidates(candidates)
    elif convert_fakes:
        candidates = DataUtil.convert_fake_candidates(candidates)

    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id'], inplace=True)

    # Encode categorical and positional string-tuple columns as integers.
    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    # Optionally keep only the features retained by the information gain analysis.
    if drop_min_features:
        candidates_df.drop(
            columns=ConstantsUtil.FEATURES_IGNORED_BY_INFORMATION_GAIN, inplace=True)

    return candidates_df
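# Hedged sketch, not in the source: the run() scripts in this section repeat the
# same preprocessing inline; with get_candidates_df the neural-network script
# above could be collapsed to something like the function below. run_with_helper
# is a hypothetical name, and the KerasNN arguments simply mirror the call above.
def run_with_helper():
    candidates = CandidateFeatureVector.objects.all().values()
    candidates_df = get_candidates_df(candidates, drop_fakes=True, drop_min_features=True)

    X = candidates_df.drop(columns=['classification'])
    y = candidates_df['classification']

    knn = KerasNN(input_dim=len(X.columns))
    X_train, X_test, y_train, y_test = knn.fit_model(X, y, 120, .3, .2)
    print(knn.get_classification_report(X_test, y_test))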
import pandas as pd


def run():
    candidates = CandidateFeatureVector.objects.all().values()

    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id'], inplace=True)

    # Encode categorical and positional string-tuple columns as integers.
    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    # Train an entropy-based decision tree and report its performance.
    dt = DecisionTree(criterion="entropy")
    dt.set_data(candidates_df, 'classification')
    X_train, X_test, y_train, y_test = dt.split_test_data(.3, True)
    dt.get_roc_curve(X_train, X_test, y_train, y_test)
    dt.fit_and_predict(X_train, X_test, y_train)
    print(dt.get_confusion_matrix(y_test))
    print(dt.get_classification_report(y_test))
    dt.get_learning_curve()
import pandas as pd
from genetic_selection import GeneticSelectionCV  # sklearn-genetic package


def run():
    candidates = CandidateFeatureVector.objects.all().values()

    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id'], inplace=True)

    # Encode categorical and positional string-tuple columns as integers.
    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    # Train a polynomial-kernel SVM and report its performance.
    svm = SVM(C=.75, kernel='poly')
    X_train, X_test, y_train, y_test = svm.split_test_data(
        candidates_df, .3, 'classification', True)
    svm.fit_and_predict(X_train, X_test, y_train)
    print(svm.get_confusion_matrix(y_test))
    print(svm.get_classification_report(y_test))

    # Run a genetic search over the feature space, using the fitted SVM as the
    # estimator and cross-validated accuracy as the fitness score.
    estimator = svm.get_model()
    selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=50,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
    X, y = svm.get_data()
    selector = selector.fit(X, y)
    print(selector.support_)
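# Hedged follow-up sketch, not in the source: GeneticSelectionCV.support_ is a
# boolean mask over the columns of X, so the retained feature names can be
# printed directly. print_selected_features is a hypothetical helper name.
def print_selected_features(selector, X):
    selected = X.columns[selector.support_]  # names of the retained features
    print(f"{len(selected)} features selected by the genetic search:")
    for name in selected:
        print(f"  {name}")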
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import mutual_info_classif


def run():
    feat_importances = []
    candidates = CandidateFeatureVector.objects.all().values()

    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)

    # mutual_info_classif is non-deterministic, so average the scores over ten runs.
    for i in range(0, 10):
        y = candidates_df['classification']
        X = candidates_df.copy()
        X.drop(columns=['candidate_id', 'classification'], inplace=True)
        X = EncodingUtil.basic_label_encode_cols(X, ConstantsUtil.BASIC_ENCODE_COLS)
        X = EncodingUtil.sort_position_cols_and_encode(X, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

        importances = mutual_info_classif(X.values, y)
        feat_importances.append(pd.Series(importances, X.columns))

    # Plot and print the mean importance of each feature across the runs.
    feature_df = pd.DataFrame(feat_importances)
    importance_means = feature_df.mean()
    importance_means.plot(kind='barh')
    plt.show()
    print(importance_means)
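# Hedged follow-up sketch, not in the source: the averaged mutual-information
# scores can be ranked to choose a cutoff for FEATURES_IGNORED_BY_INFORMATION_GAIN.
# rank_importances is a hypothetical helper name and 0.01 is only an illustrative threshold.
def rank_importances(importance_means, threshold=0.01):
    ranked = importance_means.sort_values(ascending=False)
    ignored = ranked[ranked < threshold].index.tolist()  # candidates for exclusion
    print(ranked)
    print(f"Features below {threshold}: {ignored}")
    return ignored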
import pandas as pd


def run():
    candidates = CandidateFeatureVector.objects.all().values()

    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id'], inplace=True)

    # Encode categorical and positional string-tuple columns as integers.
    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    # Train a Gaussian naive Bayes classifier and report its performance.
    gnb = GaussianNB()
    gnb.set_data(candidates_df, 'classification')
    X_train, X_test, y_train, y_test = gnb.split_test_data(.3, True)
    gnb.get_roc_curve(X_train, X_test, y_train, y_test)
    gnb.fit_and_predict(X_train, X_test, y_train)
    print(gnb.get_confusion_matrix(y_test))
    print(gnb.get_classification_report(y_test))
    gnb.get_learning_curve()

    # Average the metrics over ten independent train/test splits.
    print(gnb.get_avg_metrics_for_n_iterations(10, .3, True))