def test_data_split_identical():
    # Repeat runs on the same split and model must yield identical scores.
    X_, y_ = X[:40], np.array([0] * 20 + [1] * 20)
    for md in [
        model.SVM(),
        model.KNN(),
        model.XGBoost(),
        model.LinearModel(),
        model.DecisionTree(),
    ]:
        a = evaluation.estimate(md, X_train, X_test, y_train, y_test)
        b = evaluation.estimate(md, X_train, X_test, y_train, y_test)
        assert a == b

        a = evaluation.cross_validation(md, X_, y_, scoring='both', n_splits=2, n_jobs=1)
        b = evaluation.cross_validation(md, X_, y_, scoring='both', n_splits=2, n_jobs=1)
        assert np.all(a['f1'] == b['f1'])
        assert np.all(a['roc_auc'] == b['roc_auc'])
def test_ensemble():
    for md in [
        model.LinearEnsemble([model.LinearModel(), model.LinearModel()]),
    ]:
        try:
            evaluation.cross_validation(md, X, y, scoring='both', n_jobs=1, n_splits=2)
        except Exception:
            # Report which model class failed before re-raising.
            print(md.__class__)
            raise
def test_cross_validation():
    for md in [
        model.SVM(),
        model.MultiClassesLearner('KNN', {'n_neighbors': 1}),
        model.KNN(),
        model.XGBoost(),
        model.LinearModel(),
        model.DecisionTree(),
    ]:
        try:
            evaluation.cross_validation(md, X, y, scoring='both', n_jobs=1, n_splits=2)
        except Exception:
            # Report which model class failed before re-raising.
            print(md.__class__)
            raise
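# For context: the evaluation.cross_validation helper under test is project-local.
# A minimal sketch of what such a helper might look like, assuming it wraps
# sklearn's StratifiedKFold and returns per-fold score arrays when
# scoring='both' (hypothetical; the real module's internals are not shown here):
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

def cross_validation_sketch(md, X, y, scoring='f1', n_splits=2, n_jobs=1):
    # n_jobs is kept only for signature parity; this sketch runs serially.
    f1s, aucs = [], []
    for train_idx, test_idx in StratifiedKFold(n_splits=n_splits).split(X, y):
        md.fit(X[train_idx], y[train_idx])
        pred = md.predict(X[test_idx])
        f1s.append(f1_score(y[test_idx], pred))
        # Simplification: AUC on hard predictions rather than probabilities.
        aucs.append(roc_auc_score(y[test_idx], pred))
    if scoring == 'both':
        return {'f1': np.array(f1s), 'roc_auc': np.array(aucs)}
    return np.array(f1s)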
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('data', type=str, help="Path to tagged data file")
    parser.add_argument('--feature', dest='feat', type=str, default='bow',
                        choices=['bow', 'tfidf', 'bigram', 'trigram'],
                        help="Type of feature to use")
    parser.add_argument('--classifier', dest='classifier', type=str, default='knn',
                        choices=['knn', 'log-reg', 'dec-tree', 'svm'],
                        help="Type of classifier to use")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    print("Reading data...")
    titles, bodies, tags_sets, _ = da.read_data(args.data)
    tags = [list(t)[0] for t in tags_sets]
    X_train, X_test, y_train, y_test = evaluation.cross_validation(list(zip(titles, bodies)), tags)
    X_train_t, X_train_b = zip(*X_train)

    print("Generating features...")
    if args.feat == "bow":
        X, extractor = fe.bag_of_words(X_train_t, X_train_b)
    elif args.feat == "tfidf":
        X, extractor = fe.tfidf(X_train_t, X_train_b)
    elif args.feat == "bigram":
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=2)
    else:
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=3)

    print("Train...")
    if args.classifier == "knn":
        classifier = KNeighborsClassifier(n_neighbors=3)
def test_logistic_regression():
    m = model.LinearModel()
    # Linearly separable data: the label is 1 when the feature sum is positive,
    # so a linear model should score well above 0.9 on every fold.
    X = np.random.normal(size=(1000, 10))
    y = np.array(X.sum(axis=1) > 0, dtype=np.int8)
    scores = evaluation.cross_validation(m, X, y, n_jobs=1)
    assert np.all(scores > 0.9)
def test_xgboost():
    m = model.XGBoost(n_jobs=1)
    print(evaluation.cross_validation(m, X, y, n_jobs=1))
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--feature', dest='feat', type=str, default='bow',
                        choices=['bow', 'tfidf', 'bigram', 'trigram'],
                        help="Type of feature to use")
    parser.add_argument('--classifier', dest='classifier', type=str, default='naive',
                        choices=['naive'],
                        help="Type of classifier to use")
    parser.add_argument('--maxRows', dest='maxRows', type=int, default=0,
                        help="Max rows from file to read in")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    print("Reading data...")
    titles, bodies, tags_sets, _ = da.read_data(args.data, args.maxRows)
    tags = [list(t)[0] for t in tags_sets]
    X_train, X_test, y_train, y_test = evaluation.cross_validation(list(zip(titles, bodies)), tags)
    X_train_t, X_train_b = zip(*X_train)

    print("Generating features...")
    if args.feat == "bow":
        X, extractor = fe.bag_of_words(X_train_t, X_train_b)
    elif args.feat == "tfidf":
        X, extractor = fe.tfidf(X_train_t, X_train_b)
    elif args.feat == "bigram":
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=2)
    else:
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=3)

    print("Train...")
    if args.classifier == "naive":
        classifier = MultinomialNB()
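# The fe.* extractors are also project-local. A minimal sketch of what
# fe.bag_of_words might do, assuming each post's title and body are joined into
# one document and vectorized with sklearn's CountVectorizer (hypothetical;
# the real module may vectorize titles and bodies separately):
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words_sketch(titles, bodies):
    docs = [t + " " + b for t, b in zip(titles, bodies)]
    extractor = CountVectorizer()
    X = extractor.fit_transform(docs)  # sparse document-term count matrix
    return X, extractor  # keep the fitted extractor to transform test data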
abalone_test_features, abalone_test_labels = abalone_dataset.get_data(test=True)
seeds_train_features, seeds_train_labels = seeds_dataset.get_data()
seeds_test_features, seeds_test_labels = seeds_dataset.get_data(test=True)

# Grid-search hyperparameter candidates
lambdas = [0, 0.01, 0.05, 0.1, 0.5, 1]
lrs = [0.05, 0.1, 0.5, 1]
eps = [0.01, 0.05, 0.1, 0.5]

# Task 3. Experiments
# 1. Compare accuracy of naive bayes and logistic regression

# Cross-validation accuracy for 5-fold CV
print("Ionosphere validation accuracy (default parameters):")
evaluation.cross_validation(5, ionosphere_train_features, ionosphere_train_labels,
                            model=LogisticRegression)

# Grid search for optimal hyperparameters
print("Ionosphere grid search hyperparameters:")
ionosphere_max_val_acc, ionosphere_arg_max = evaluation.grid_search(
    learning_rates=lrs, epsilons=eps, lambdas=lambdas,
    x=ionosphere_train_features, y=ionosphere_train_labels,
    model=LogisticRegression)

# Accuracy on the test split: retrain with the best hyperparameters
print("Ionosphere test accuracy:")
logistic_ionosphere = LogisticRegression(ionosphere_train_features, ionosphere_train_labels)
logistic_ionosphere.fit(lr=ionosphere_arg_max[0], eps=ionosphere_arg_max[1],
                        regularization=ionosphere_arg_max[2])
ionosphere_prediction = logistic_ionosphere.predict(ionosphere_test_features)
cm_ionosphere = evaluation.confusion_matrix(ionosphere_test_labels, ionosphere_prediction)
# Recall is the true-positive rate, hence evaluation.true_positive here.
print("Accuracy:", evaluation.accuracy(cm_ionosphere),
      "Precision:", evaluation.precision(cm_ionosphere),
      "Recall:", evaluation.true_positive(cm_ionosphere),
      "F1:", evaluation.f_score(cm_ionosphere))

# 5-fold CV for naive bayes
print("Ionosphere validation accuracy (naive bayes):")
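# The evaluation.* metrics above reduce to the standard formulas on a 2x2
# confusion matrix. A minimal sketch, assuming the layout cm = [[TN, FP], [FN, TP]]
# (the project's actual confusion_matrix layout is an assumption):
def accuracy_sketch(cm):
    (tn, fp), (fn, tp) = cm
    return (tp + tn) / (tp + tn + fp + fn)

def precision_sketch(cm):
    (tn, fp), (fn, tp) = cm
    return tp / (tp + fp)

def recall_sketch(cm):
    # Recall equals the true-positive rate, matching evaluation.true_positive above.
    (tn, fp), (fn, tp) = cm
    return tp / (tp + fn)

def f_score_sketch(cm):
    # F1 is the harmonic mean of precision and recall.
    p, r = precision_sketch(cm), recall_sketch(cm)
    return 2 * p * r / (p + r)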
from pandas import Series
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from emotion_predictor import EmotionPredictor
from tree import Tree
from util import get_clean_dataframe, get_noisy_dataframe, get_target, get_predictors, get_emotion_values
from util import get_clean_data, get_noisy_data
from draw_tree import visualise
from evaluation import cross_validation, plot_confusion_matrix, get_precision, get_recall, get_f1_score

X, y = get_clean_data()
cross_validation(10, X, y, random_forest=True, use_confidence=True, num_of_trees=200)

# clf = DecisionTreeClassifier(random_state=0)
# scores = cross_val_score(clf, X, y, cv=10)
# print("Average accuracy for sklearn decision tree is {} and std is {}".format(np.mean(scores), np.std(scores)))
# rf = RandomForestClassifier(random_state=0)
# scores = cross_val_score(rf, X, y, cv=10)
# print("Average accuracy for sklearn random forest is {} and std is {}".format(np.mean(scores), np.std(scores)))
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, random_state=42)
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)

# Load models
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
lr = LogisticRegression(solver='liblinear')
svc = SVC(gamma='auto')
knn = KNeighborsClassifier(n_neighbors=2)
models = {'dt': dt, 'rf': rf, 'lr': lr, 'svc': svc, 'knn': knn}

# Original data set
for name, model in models.items():
    print(name, evaluation.cross_validation(model, X_train, y_train))

# Random over-sampling with ratio 1/5
X_train_up, y_train_up = preprocess.upsampling(X_train, y_train, ratio=1 / 5)
for name, model in models.items():
    print(name, evaluation.cross_validation(model, X_train_up, y_train_up))

# Random under-sampling with ratio 1/5
X_train_down, y_train_down = preprocess.downsampling(X_train, y_train, ratio=1 / 5)
for name, model in models.items():
    print(name, evaluation.cross_validation(model, X_train_down, y_train_down))

# Standard SMOTE with ratio 1/6
X_train_smote, y_train_smote = preprocess.smote(X_train, y_train, ratio=1 / 6)
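# preprocess.upsampling / downsampling / smote are project-local. A minimal
# sketch of random over-sampling toward a target minority:majority ratio,
# assuming binary labels with class 1 as the minority (hypothetical;
# imblearn's RandomOverSampler and SMOTE provide equivalent, tested versions):
import numpy as np

def upsampling_sketch(X, y, ratio):
    minority = np.flatnonzero(y == 1)
    majority = np.flatnonzero(y == 0)
    target = int(len(majority) * ratio)  # desired minority count
    keep = np.concatenate([majority, minority])
    if target > len(minority):
        # Duplicate random minority rows until the target ratio is reached.
        extra = np.random.choice(minority, size=target - len(minority), replace=True)
        keep = np.concatenate([keep, extra])
    np.random.shuffle(keep)
    return X[keep], y[keep]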