Example 1
def test_data_split_identical():
    X_, y_ = X[:40], np.array([0] * 20 + [1] * 20)
    for md in [
            model.SVM(),
            model.KNN(),
            model.XGBoost(),
            model.LinearModel(),
            model.DecisionTree()
    ]:
        a = evaluation.estimate(md, X_train, X_test, y_train, y_test)
        b = evaluation.estimate(md, X_train, X_test, y_train, y_test)
        assert a == b
        a = evaluation.cross_validation(md,
                                        X_,
                                        y_,
                                        scoring='both',
                                        n_splits=2,
                                        n_jobs=1)
        b = evaluation.cross_validation(md,
                                        X_,
                                        y_,
                                        scoring='both',
                                        n_splits=2,
                                        n_jobs=1)
        assert np.all(a['f1'] == b['f1'])
        assert np.all(a['roc_auc'] == b['roc_auc'])
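These assertions can only hold if every source of randomness is pinned down. A minimal sketch of the same determinism check in plain scikit-learn, with stock sklearn classes standing in for the project's model and evaluation wrappers (which this page does not show):

import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC

X_demo = np.random.RandomState(0).normal(size=(40, 5))
y_demo = np.array([0] * 20 + [1] * 20)

# shuffle=False makes the folds a pure function of the data ordering, and
# seeding the model removes the remaining model-side randomness.
cv = StratifiedKFold(n_splits=2, shuffle=False)
a = cross_val_score(SVC(random_state=0), X_demo, y_demo, cv=cv, scoring='f1')
b = cross_val_score(SVC(random_state=0), X_demo, y_demo, cv=cv, scoring='f1')
assert np.all(a == b)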
Example 2
def test_ensemble():
    for md in [
            model.LinearEnsemble([model.LinearModel(),
                                  model.LinearModel()]),
    ]:
        try:
            evaluation.cross_validation(md,
                                        X,
                                        y,
                                        scoring='both',
                                        n_jobs=1,
                                        n_splits=2)
        except Exception as e:
            print(md.__class__)
            raise e
Example 3
def test_cross_validation():
    for md in [
            model.SVM(),
            model.MultiClassesLearner('KNN', {'n_neighbors': 1}),
            model.KNN(),
            model.XGBoost(),
            model.LinearModel(),
            model.DecisionTree(),
    ]:
        try:
            evaluation.cross_validation(md,
                                        X,
                                        y,
                                        scoring='both',
                                        n_jobs=1,
                                        n_splits=2)
        except Exception as e:
            print(md.__class__)
            raise e
Example 4
    parser.add_argument('data', type=str, help="Path to tagged data file")
    parser.add_argument('--feature', dest='feat', type=str, default='bow',
        choices=['bow', 'tfidf', 'bigram', 'trigram'], help="Type of feature to use")
    parser.add_argument('--classifier', dest='classifier', type=str, default='knn',
        choices=['knn', 'log-reg', 'dec-tree', 'svm'], help="Type of classifier to use")

    return parser.parse_args()

if __name__=="__main__":
    args = parse_args()

    print "Reading data..."
    titles, bodies, tags_sets, _ = da.read_data(args.data)
    tags = [list(t)[0] for t in tags_sets]

    X_train, X_test, y_train, y_test = evaluation.cross_validation(
        list(zip(titles, bodies)), tags)
    X_train_t, X_train_b = zip(*X_train)

    print "Generating features..."
    if args.feat == "bow":
        X, extractor = fe.bag_of_words(X_train_t, X_train_b)
    elif args.feat == "tfidf":
        X, extractor = fe.tfidf(X_train_t, X_train_b)
    elif args.feat == "bigram":
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=2)
    else:
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=3)

    print "Train..."
    if args.classifier == "knn":
        classifier = KNeighborsClassifier(n_neighbors=3)
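The fe module's internals are not shown on this page; a hypothetical sketch of what a bag_of_words-style extractor could look like, assuming scikit-learn's CountVectorizer (the tfidf and n-gram branches would swap in TfidfVectorizer or a non-default ngram_range):

from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(titles, bodies):
    # One document per sample: title and body concatenated.
    docs = [t + ' ' + b for t, b in zip(titles, bodies)]
    extractor = CountVectorizer()
    X = extractor.fit_transform(docs)  # sparse document-term count matrix
    return X, extractor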
Example 5
def test_logistic_regression():
    m = model.LinearModel()
    X = np.random.normal(size=(1000, 10))
    y = np.array(X.sum(axis=1) > 0, dtype=np.int8)
    scores = evaluation.cross_validation(m, X, y, n_jobs=1)
    assert np.all(scores > 0.9)
Example 6
def test_xgboost():
    m = model.XGBoost(n_jobs=1)
    print(evaluation.cross_validation(m, X, y, n_jobs=1))
Example 7
    parser.add_argument('--feature', dest='feat', type=str, default='bow',
        choices=['bow', 'tfidf', 'bigram', 'trigram'], help="Type of feature to use")
    parser.add_argument('--classifier', dest='classifier', type=str, default='naive',
        choices=['naive'], help="Type of classifier to use")
    parser.add_argument('--maxRows', dest='maxRows', type=int, default=0,
        help="Max rows from file to read in")

    return parser.parse_args()

if __name__=="__main__":
    args = parse_args()

    print "Reading data..."
    titles, bodies, tags_sets, _ = da.read_data(args.data, args.maxRows)
    tags = [list(t)[0] for t in tags_sets]

    X_train, X_test, y_train, y_test = evaluation.cross_validation(
        list(zip(titles, bodies)), tags)
    X_train_t, X_train_b = zip(*X_train)

    print "Generating features..."
    if args.feat == "bow":
        X, extractor = fe.bag_of_words(X_train_t, X_train_b)
    elif args.feat == "tfidf":
        X, extractor = fe.tfidf(X_train_t, X_train_b)
    elif args.feat == "bigram":
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=2)
    else:
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=3)

    print "Train..."
    if args.classifier == "naive":
        classifier = MultinomialNB()
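The snippet is cut off before evaluation. A hedged sketch of the step that would typically follow, assuming the extractor concatenates title and body and exposes a scikit-learn-style transform(), as in the sketch under Example 4:

# Hypothetical continuation, not the project's code.
classifier.fit(X, y_train)
X_test_t, X_test_b = zip(*X_test)
X_test_feat = extractor.transform(
    [t + ' ' + b for t, b in zip(X_test_t, X_test_b)])
print("Test accuracy:", classifier.score(X_test_feat, y_test))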
Example 8
ionosphere_train_features, ionosphere_train_labels = ionosphere_dataset.get_data()
ionosphere_test_features, ionosphere_test_labels = ionosphere_dataset.get_data(test=True)

abalone_train_features, abalone_train_labels = abalone_dataset.get_data()
abalone_test_features, abalone_test_labels = abalone_dataset.get_data(test=True)

seeds_train_features, seeds_train_labels = seeds_dataset.get_data()
seeds_test_features, seeds_test_labels = seeds_dataset.get_data(test=True)

# Grid search hyperparameters
lambdas = [0, 0.01, 0.05, 0.1, 0.5, 1]
lrs = [0.05, 0.1, 0.5, 1]
eps = [0.01, 0.05, 0.1, 0.5]

# Task 3. Experiments
# 1. Compare accuracy of naive bayes and logistic regression

# Get cross validation accuracy for 5-fold cv
print("Ionosphere validation accuracy (default parameters):")
evaluation.cross_validation(5, ionosphere_train_features, ionosphere_train_labels, model=LogisticRegression)

# Grid search for optimal hyperparameters
print("Ionosphere grid search hyperparameters:")
ionosphere_max_val_acc, ionosphere_arg_max = evaluation.grid_search(
    learning_rates=lrs, epsilons=eps, lambdas=lambdas,
    x=ionosphere_train_features, y=ionosphere_train_labels,
    model=LogisticRegression)
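For orientation, a grid_search with this signature plausibly amounts to an exhaustive loop over the three hyperparameter lists. A sketch under that assumption, mirroring the model(x, y) constructor and fit(lr=..., eps=..., regularization=...) calls used below (the scoring line is a stand-in; the real module reports k-fold validation accuracy):

import itertools
import numpy as np

def grid_search_sketch(learning_rates, epsilons, lambdas, x, y, model):
    best_acc, best_args = -1.0, None
    for lr, eps, lam in itertools.product(learning_rates, epsilons, lambdas):
        m = model(x, y)
        m.fit(lr=lr, eps=eps, regularization=lam)
        acc = np.mean(m.predict(x) == y)  # stand-in: training accuracy
        if acc > best_acc:
            best_acc, best_args = acc, (lr, eps, lam)
    return best_acc, best_args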

# Accuracy on test split - train with best hyperparameters
print("Ionosphere test accuracy:")
logistic_ionosphere = LogisticRegression(ionosphere_train_features, ionosphere_train_labels)
logistic_ionosphere.fit(lr=ionosphere_arg_max[0], eps=ionosphere_arg_max[1], regularization=ionosphere_arg_max[2])
ionosphere_prediction = logistic_ionosphere.predict(ionosphere_test_features)
cm_ionosphere = evaluation.confusion_matrix(ionosphere_test_labels, ionosphere_prediction)
print("Accuracy:", evaluation.accuracy(cm_ionosphere), "Precision:", evaluation.precision(cm_ionosphere), "Recall:", evaluation.true_positive(cm_ionosphere), "F1:", evaluation.f_score(cm_ionosphere))

# 5-fold CV for naive bayes
print("Ionosphere validation accuracy (naive bayes):")
import numpy as np
from pandas import Series
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from emotion_predictor import EmotionPredictor
from tree import Tree
from util import get_clean_dataframe, get_noisy_dataframe, get_target, get_predictors, get_emotion_values
from util import get_clean_data, get_noisy_data
from draw_tree import visualise
from evaluation import cross_validation, plot_confusion_matrix, get_precision, get_recall, get_f1_score


X, y = get_clean_data()


cross_validation(10, X, y, random_forest=True, use_confidence=True, num_of_trees=200)

# clf = DecisionTreeClassifier(random_state=0)
# scores = cross_val_score(clf, X, y, cv=10)
# print("Average accuracy for sklearn decision tree is {} and std is {}".format(np.mean(scores), np.std(scores)))

# rf = RandomForestClassifier(random_state=0)
# scores = cross_val_score(rf, X, y, cv=10)
# print("Average accuracy for sklearn random forest is {} and std is {}".format(np.mean(scores), np.std(scores)))

Example 10
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    random_state=42)
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)

# Load models
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
lr = LogisticRegression(solver='liblinear')
svc = SVC(gamma='auto')
knn = KNeighborsClassifier(n_neighbors=2)
models = {'dt': dt, 'rf': rf, 'lr': lr, 'svc': svc, 'knn': knn}

# Original data set
for name, model in models.items():
    print(name, evaluation.cross_validation(model, X_train, y_train))

# Random over-sampling with ratio 1/5
X_train_up, y_train_up = preprocess.upsampling(X_train, y_train, ratio=1 / 5)
for name, model in models.items():
    print(name, evaluation.cross_validation(model, X_train_up, y_train_up))

# Random under-sampling with ratio 1/5
X_train_down, y_train_down = preprocess.downsampling(X_train,
                                                     y_train,
                                                     ratio=1 / 5)
for name, model in models.items():
    print(name, evaluation.cross_validation(model, X_train_down, y_train_down))

# Standard SMOTE with ratio 1/6
X_train_smote, y_train_smote = preprocess.smote(X_train, y_train, ratio=1 / 6)
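preprocess's internals are not shown; a hedged sketch of what preprocess.smote could wrap, assuming the imbalanced-learn library:

from imblearn.over_sampling import SMOTE

def smote(X, y, ratio):
    # sampling_strategy is the minority/majority ratio after resampling.
    sampler = SMOTE(sampling_strategy=ratio, random_state=42)
    return sampler.fit_resample(X, y)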