Beispiel #1
0
def output_all_plots():
    """Render and save every summary plot for the loaded result files.

    Results are read from ``soo.csv``, ``logo.csv`` and ``bamsoo.csv``; the
    second and third ``load_data`` calls receive the data loaded so far as
    their second argument.

    For every function name yielded by ``all_fns(data)`` the following PNG
    files are written to the current working directory, in this order:

      * ``<fn>_regrets.png``       -- ``plot_vals_err`` of the REGRETS values
      * ``<fn>_ws.png``            -- ``plot_vals_err`` of the WS values
      * ``<fn>_dists.png``         -- ``plot_vals_err`` of the DISTS values
      * ``<fn>_ws_scatter.png``    -- ``plot_scatter`` of slope/WS pairs taken
        from the ``'LOGO'`` entry of ``data_for_fn(fn, data)``
      * ``<fn>_dists_scatter.png`` -- ``plot_scatter`` of slope/DISTS pairs
        taken from the ``'BO1'`` entry of ``data_for_fn(fn, data)``

    All files of one kind are written before the next kind starts.  The
    function returns nothing.
    """
    data = load_data("soo.csv")
    data = load_data("logo.csv", data)
    data = load_data("bamsoo.csv", data)

    # (metric key, output suffix, extra keyword arguments for plot_vals_err).
    # The regrets plot passes no ylabel so plot_vals_err uses its own default.
    error_plots = (
        (REGRETS, '_regrets.png', {}),
        (WS, '_ws.png', {'ylabel': "W"}),
        (DISTS, '_dists.png', {'ylabel': "Min. Distance"}),
    )
    for metric, suffix, plot_kwargs in error_plots:
        for fn in all_fns(data):
            values = data_filter(data, fn, metric)
            plt.clf()  # start from an empty figure for each output file
            plot_vals_err(fn, values, **plot_kwargs)
            plt.savefig(fn + suffix, bbox_inches='tight')

    # (key into data_for_fn(...), metric key, axis label, output suffix,
    #  extra keyword arguments for plot_scatter).
    scatter_plots = (
        ('LOGO', WS, 'W', '_ws_scatter.png', {'alpha': 0.05}),
        ('BO1', DISTS, 'Min. Distance', '_dists_scatter.png', {}),
    )
    for source, metric, label, suffix, plot_kwargs in scatter_plots:
        for fn in all_fns(data):
            d = data_for_fn(fn, data)[source]
            slope_vals = reduce_sublists(pair_slope_val(1, d, metric))
            plt.clf()
            plot_scatter(fn, slope_vals, label, **plot_kwargs)
            plt.savefig(fn + suffix, bbox_inches='tight')
Beispiel #2
0
def kfold(k=10):
    print "Loading data."
    videos, users, reviews = load_data()

    print "Extracting features."
    orig_X = np.array([(x['date'], x['text'], x['user']) for x in reviews])
    feats = create_features(orig_X, users)
    #y = np.array([1 if x['spam'] == 'true' else 0 for x in reviews])
    y = np.array([1 if x['adult'] == 'true' else 0 for x in reviews])

    print "Vectorizing features."
    v = DictVectorizer(sparse=False)
    feats = v.fit_transform(feats)

    print "Starting K-fold cross validation."
    cv = cross_validation.KFold(len(feats), k=k, indices=True, shuffle=True, random_state=1234)

    cls = LogisticRegression(penalty='l2', tol=0.00001, fit_intercept=False, dual=False, C=2.4105, class_weight=None)
    if PRINT_COEFS:
        cls.fit(feats, y)
        c = v.inverse_transform(cls.coef_)
        for key, val in sorted(c[0].iteritems(), key=lambda x: x[1]):
#            if isinstance(key, str) and key.startswith("_"):
             print key, val
        quit()

    f1sum = 0
    for i, (train_idx, test_idx) in enumerate(cv):
        train_X, train_y, test_X, test_y = feats[train_idx], \
                y[train_idx], feats[test_idx], y[test_idx]
        cls.fit(train_X, train_y)
        preds = cls.predict(test_X)

        if PRINT_ERRORS:
#            worst = np.argsort(np.abs(test_y - preds))
            #for j in worst[-1:-10:-1]:
            orig_test = orig_X[test_idx]
#            for j in worst:
            for j in range(len(orig_test)):
                if test_y[j] != preds[j]:
                    print j, orig_test[j][1], test_y[j], preds[j]
            #quit()

        f1 = metrics.f1_score(test_y, preds)
        print "Fold %d F1 score: %.5f" % (i, f1)
        f1sum += f1
    avgf1 = (f1sum / k)
    print "Mean F1 score: %.5f" % (f1sum / k)

#    scores = cross_validation.cross_val_score(cls, feats, y, cv=10, score_func=metrics.f1_score)
#    for i, score in enumerate(scores):
#        print "Fold %d: %.5f" % (i, score)
#    print "Mean score: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)

    return avgf1
Beispiel #3
0
def output_singles():
    """Save one ``plot_singles`` figure per run for the first 20 random runs.

    Loads ``random.csv`` and, for every function name from ``all_fns``,
    pairs up the REGRETS and DISTS sequences found under the ``'RANDOM'``
    entry of ``data_for_fn``.  The first 20 pairs are each plotted and
    written to ``<fn>_single_<n>.png`` with ``n`` counting from 1.  The
    function name is printed before its plots, and ``n`` is printed as
    progress whenever it is a multiple of 10.
    """
    limit = 20
    data = load_data("random.csv")
    for fn in all_fns(data):
        print(fn)
        runs = data_for_fn(fn, data)['RANDOM']
        leading_pairs = zip(runs[REGRETS], runs[DISTS])[:limit]
        # File numbering is 1-based, so let enumerate start at 1.
        for number, (top, bot) in enumerate(leading_pairs, 1):
            if not number % 10:
                print(number)
            plt.clf()
            plot_singles(fn, top, bot, bottom_log=True)
            target = fn + '_single_' + str(number) + '.png'
            plt.savefig(target, bbox_inches='tight')
Beispiel #4
0
"""
import numpy as np
import tensorflow as tf
import time
from parse import FLAGS, load_data, build_embed
from srn import HLSTM
from srn_tool import train_srn, evaluate_srn, inference_srn
from pn import PolicyGradient
from pn_tool import pretrain_pn, train_pn, develop_pn, evaluate_pn, inference_pn

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    if FLAGS.log_parameters:
        print(FLAGS.__flags)
    label_train, text_train, sentence_len_train, keyword_train = load_data(FLAGS.data_dir, FLAGS.train_filename)
    label_dev, text_dev, sentence_len_dev, keyword_dev = load_data(FLAGS.data_dir, FLAGS.valid_filename)
    label_test, text_test, sentence_len_test, keyword_test = load_data(FLAGS.data_dir, FLAGS.test_filename)
    embed = build_embed(FLAGS.data_dir, FLAGS.word_vector_filename)
    
    SRN_graph = tf.Graph()
    PN_graph = tf.Graph()
    
    with SRN_graph.as_default():
        SRN = HLSTM(FLAGS.symbols,
                      FLAGS.embed_units,
                      FLAGS.hidden_units,
                      FLAGS.labels,
                      embed,
                      FLAGS.learning_rate_srn)
        if FLAGS.log_parameters:
Beispiel #5
0
def kfold(k=10):
    print "Loading data."
    videos, users, reviews = load_data()

    print "Extracting features."
    orig_X = np.array([(x['date'], x['text'], x['user']) for x in reviews])
    feats = create_features(orig_X, users)
    #y = np.array([1 if x['spam'] == 'true' else 0 for x in reviews])
    y = np.array([1 if x['adult'] == 'true' else 0 for x in reviews])

    print "Vectorizing features."
    v = DictVectorizer(sparse=False)
    feats = v.fit_transform(feats)

    print "Starting K-fold cross validation."
    cv = cross_validation.KFold(len(feats),
                                k=k,
                                indices=True,
                                shuffle=True,
                                random_state=1234)

    cls = LogisticRegression(penalty='l2',
                             tol=0.00001,
                             fit_intercept=False,
                             dual=False,
                             C=2.4105,
                             class_weight=None)
    if PRINT_COEFS:
        cls.fit(feats, y)
        c = v.inverse_transform(cls.coef_)
        for key, val in sorted(c[0].iteritems(), key=lambda x: x[1]):
            #            if isinstance(key, str) and key.startswith("_"):
            print key, val
        quit()

    f1sum = 0
    for i, (train_idx, test_idx) in enumerate(cv):
        train_X, train_y, test_X, test_y = feats[train_idx], \
                y[train_idx], feats[test_idx], y[test_idx]
        cls.fit(train_X, train_y)
        preds = cls.predict(test_X)

        if PRINT_ERRORS:
            #            worst = np.argsort(np.abs(test_y - preds))
            #for j in worst[-1:-10:-1]:
            orig_test = orig_X[test_idx]
            #            for j in worst:
            for j in range(len(orig_test)):
                if test_y[j] != preds[j]:
                    print j, orig_test[j][1], test_y[j], preds[j]
            #quit()

        f1 = metrics.f1_score(test_y, preds)
        print "Fold %d F1 score: %.5f" % (i, f1)
        f1sum += f1
    avgf1 = (f1sum / k)
    print "Mean F1 score: %.5f" % (f1sum / k)

    #    scores = cross_validation.cross_val_score(cls, feats, y, cv=10, score_func=metrics.f1_score)
    #    for i, score in enumerate(scores):
    #        print "Fold %d: %.5f" % (i, score)
    #    print "Mean score: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)

    return avgf1
Beispiel #6
0
import numpy as np
import cPickle

from features import create_features, PROJECT
from parse import load_data
from dict_vectorizer import DictVectorizer

# Fit a DictVectorizer on the features of every loaded review and pickle it.
videos, users, reviews = load_data()
orig_X = np.array([(x['date'], x['text'], x['user']) for x in reviews])
feats = create_features(orig_X, None)
v = DictVectorizer(sparse=False)
feats = v.fit_transform(feats)

# feats is now in vectorized format
# v.transform() is the transformation that needs to be used on test data

# Use a context manager so the pickle file is flushed and closed as soon as
# the dump finishes, instead of relying on garbage collection of an
# anonymous file object.
with open(PROJECT + "db/dictvectorizer.pickle", "wb") as pickle_file:
    cPickle.dump(v, pickle_file)
Beispiel #7
0
from parse import load_data

# Script entry point: call load_data() only when this file is executed
# directly, not when it is imported.  The return value is discarded.
if __name__ == "__main__":
    load_data()
Beispiel #8
0
def main():
    """Print the ``'BO1'`` entry of the ``hartman_3`` data in ``output.json``."""
    bo1_entry = data_for_fn('hartman_3', load_data("output.json"))['BO1']
    print(bo1_entry)