Example No. 1
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

from features import create_features
from parse import load_data
from dict_vectorizer import DictVectorizer

# Module-level debug flags (assumed to be defined alongside the function).
PRINT_COEFS = False
PRINT_ERRORS = False


def kfold(k=10):
    print("Loading data.")
    videos, users, reviews = load_data()

    print("Extracting features.")
    orig_X = np.array([(x['date'], x['text'], x['user']) for x in reviews])
    feats = create_features(orig_X, users)
    # y = np.array([1 if x['spam'] == 'true' else 0 for x in reviews])
    y = np.array([1 if x['adult'] == 'true' else 0 for x in reviews])

    print("Vectorizing features.")
    v = DictVectorizer(sparse=False)
    feats = v.fit_transform(feats)

    print("Starting K-fold cross validation.")
    cv = KFold(n_splits=k, shuffle=True, random_state=1234)

    cls = LogisticRegression(penalty='l2', tol=0.00001, fit_intercept=False,
                             dual=False, C=2.4105, class_weight=None)

    if PRINT_COEFS:
        # Fit on the full data set and print every feature's learned weight,
        # sorted from the most negative to the most positive coefficient.
        cls.fit(feats, y)
        c = v.inverse_transform(cls.coef_)
        for key, val in sorted(c[0].items(), key=lambda x: x[1]):
            print(key, val)
        quit()

    f1sum = 0
    for i, (train_idx, test_idx) in enumerate(cv.split(feats)):
        train_X, train_y = feats[train_idx], y[train_idx]
        test_X, test_y = feats[test_idx], y[test_idx]
        cls.fit(train_X, train_y)
        preds = cls.predict(test_X)

        if PRINT_ERRORS:
            # Print the review text of every misclassified example in this fold.
            # Alternative: rank examples by error size with np.argsort(np.abs(test_y - preds)).
            orig_test = orig_X[test_idx]
            for j in range(len(orig_test)):
                if test_y[j] != preds[j]:
                    print(j, orig_test[j][1], test_y[j], preds[j])

        f1 = metrics.f1_score(test_y, preds)
        print("Fold %d F1 score: %.5f" % (i, f1))
        f1sum += f1

    avgf1 = f1sum / k
    print("Mean F1 score: %.5f" % avgf1)

    # Equivalent using scikit-learn's built-in helper:
    # scores = cross_val_score(cls, feats, y, cv=10, scoring='f1')
    # for i, score in enumerate(scores):
    #     print("Fold %d: %.5f" % (i, score))
    # print("Mean score: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() / 2))

    return avgf1
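
The PRINT_COEFS branch above maps the learned weights back to feature names with DictVectorizer.inverse_transform. Below is a minimal, self-contained sketch of that trick; it uses scikit-learn's own DictVectorizer rather than the project's dict_vectorizer module, and the feature names 'length' and 'has_url' are made-up toy features standing in for the output of create_features().

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Toy feature dicts standing in for create_features() output.
feats = [{'length': 12, 'has_url': 1}, {'length': 3, 'has_url': 0},
         {'length': 40, 'has_url': 1}, {'length': 5, 'has_url': 0}]
y = np.array([1, 0, 1, 0])

v = DictVectorizer(sparse=False)
X = v.fit_transform(feats)

cls = LogisticRegression(penalty='l2', C=2.4105, fit_intercept=False)
cls.fit(X, y)

# inverse_transform() maps the coefficient row back to {feature name: weight}.
weights = v.inverse_transform(cls.coef_)[0]
for key, val in sorted(weights.items(), key=lambda x: x[1]):
    print(key, val)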
Example No. 2
import numpy as np
import pickle

from features import create_features, PROJECT
from parse import load_data
from dict_vectorizer import DictVectorizer

videos, users, reviews = load_data()
orig_X = np.array([(x['date'], x['text'], x['user']) for x in reviews])
feats = create_features(orig_X, None)
v = DictVectorizer(sparse=False)
feats = v.fit_transform(feats)

# feats is now in vectorized format
# v.transform() is the transformation that needs to be used on test data

with open(PROJECT + "db/dictvectorizer.pickle", "wb") as f:
    pickle.dump(v, f)
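
At prediction time the pickled vectorizer is loaded back and only transform() is applied, so unseen reviews land in the same feature columns that were used for training. A minimal sketch of that step, assuming the same project modules as above; load_data() is re-used here purely to obtain some example reviews in place of genuinely new data.

import pickle
import numpy as np

from features import create_features, PROJECT
from parse import load_data

# Load the fitted DictVectorizer saved above.
with open(PROJECT + "db/dictvectorizer.pickle", "rb") as f:
    v = pickle.load(f)

# In practice these would be new, unseen reviews with the same fields.
videos, users, reviews = load_data()
new_X = np.array([(x['date'], x['text'], x['user']) for x in reviews[:100]])

# transform() only -- the vectorizer must not be re-fitted on test data,
# otherwise its columns would no longer match the training features.
new_feats = v.transform(create_features(new_X, None))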