import matplotlib.pyplot as plt


def output_all_plots():
    use_log_scale = False
    ymax_dict = {'rosenbrock_2': 4}

    # Accumulate the runs from all three solvers into one data set.
    data = load_data("soo.csv")
    data = load_data("logo.csv", data)
    data = load_data("bamsoo.csv", data)

    # Regret curves, one figure per benchmark function.
    for fn in all_fns(data):
        regrets = data_filter(data, fn, REGRETS)
        plt.clf()
        plot_vals_err(fn, regrets)
        out_name = fn + '_regrets.png'
        plt.savefig(out_name, bbox_inches='tight')

    # W values.
    for fn in all_fns(data):
        ws = data_filter(data, fn, WS)
        plt.clf()
        plot_vals_err(fn, ws, ylabel="W")
        out_name = fn + '_ws.png'
        plt.savefig(out_name, bbox_inches='tight')

    # Minimum distances.
    for fn in all_fns(data):
        dists = data_filter(data, fn, DISTS)
        plt.clf()
        plot_vals_err(fn, dists, ylabel="Min. Distance")
        out_name = fn + '_dists.png'
        plt.savefig(out_name, bbox_inches='tight')

    # Scatter plots of slope vs. W for the LOGO runs.
    for fn in all_fns(data):
        d = data_for_fn(fn, data)['LOGO']
        slope_ws = reduce_sublists(pair_slope_val(1, d, WS))
        plt.clf()
        plot_scatter(fn, slope_ws, 'W', alpha=0.05)
        out_name = fn + '_ws_scatter.png'
        plt.savefig(out_name, bbox_inches='tight')

    # Scatter plots of slope vs. minimum distance for the BO1 runs.
    for fn in all_fns(data):
        d = data_for_fn(fn, data)['BO1']
        slope_dists = reduce_sublists(pair_slope_val(1, d, DISTS))
        plt.clf()
        plot_scatter(fn, slope_dists, 'Min. Distance')
        out_name = fn + '_dists_scatter.png'
        plt.savefig(out_name, bbox_inches='tight')
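# plot_vals_err, plot_scatter, data_filter, etc. are the repo's own helpers
# defined elsewhere. For orientation only, a mean-curve-with-error-band helper
# of roughly this shape would satisfy the plot_vals_err() calls above; this is
# a guess at the interface, not the repo's actual implementation.
import numpy as np


def plot_vals_err_sketch(fn, runs, ylabel="Regret"):
    # `runs`: list of equal-length per-iteration value sequences, one per run.
    runs = np.asarray(runs, dtype=float)
    mean = runs.mean(axis=0)
    stderr = runs.std(axis=0) / np.sqrt(runs.shape[0])
    xs = np.arange(1, runs.shape[1] + 1)
    plt.plot(xs, mean, label=fn)
    plt.fill_between(xs, mean - stderr, mean + stderr, alpha=0.3)
    plt.xlabel("Iteration")
    plt.ylabel(ylabel)
    plt.legend()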
def kfold(k=10):
    print "Loading data."
    videos, users, reviews = load_data()

    print "Extracting features."
    orig_X = np.array([(x['date'], x['text'], x['user']) for x in reviews])
    feats = create_features(orig_X, users)
    #y = np.array([1 if x['spam'] == 'true' else 0 for x in reviews])
    y = np.array([1 if x['adult'] == 'true' else 0 for x in reviews])

    print "Vectorizing features."
    v = DictVectorizer(sparse=False)
    feats = v.fit_transform(feats)

    print "Starting K-fold cross validation."
    cv = cross_validation.KFold(len(feats), k=k, indices=True, shuffle=True,
                                random_state=1234)
    cls = LogisticRegression(penalty='l2', tol=0.00001, fit_intercept=False,
                             dual=False, C=2.4105, class_weight=None)

    if PRINT_COEFS:
        # Fit on the full data set, print the learned coefficients sorted by
        # weight, and exit.
        cls.fit(feats, y)
        c = v.inverse_transform(cls.coef_)
        for key, val in sorted(c[0].iteritems(), key=lambda x: x[1]):
            # if isinstance(key, str) and key.startswith("_"):
            print key, val
        quit()

    f1sum = 0
    for i, (train_idx, test_idx) in enumerate(cv):
        train_X, train_y, test_X, test_y = feats[train_idx], \
            y[train_idx], feats[test_idx], y[test_idx]
        cls.fit(train_X, train_y)
        preds = cls.predict(test_X)

        if PRINT_ERRORS:
            # Print every misclassified review in this fold.
            # worst = np.argsort(np.abs(test_y - preds))
            # for j in worst[-1:-10:-1]:
            orig_test = orig_X[test_idx]
            # for j in worst:
            for j in range(len(orig_test)):
                if test_y[j] != preds[j]:
                    print j, orig_test[j][1], test_y[j], preds[j]
            # quit()

        f1 = metrics.f1_score(test_y, preds)
        print "Fold %d F1 score: %.5f" % (i, f1)
        f1sum += f1

    avgf1 = f1sum / k
    print "Mean F1 score: %.5f" % avgf1
    # scores = cross_validation.cross_val_score(cls, feats, y, cv=10,
    #                                           score_func=metrics.f1_score)
    # for i, score in enumerate(scores):
    #     print "Fold %d: %.5f" % (i, score)
    # print "Mean score: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
    return avgf1
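# The commented-out block above targets the long-removed
# sklearn.cross_validation API. As a sketch only, the same per-fold F1
# computation against current scikit-learn (model_selection) would look
# roughly like this; the estimator settings are copied from kfold() above,
# everything else is illustrative.
from sklearn.model_selection import KFold as MKFold, cross_val_score
from sklearn.linear_model import LogisticRegression


def kfold_sketch(feats, y, k=10):
    cls = LogisticRegression(penalty='l2', tol=0.00001, fit_intercept=False,
                             C=2.4105)
    cv = MKFold(n_splits=k, shuffle=True, random_state=1234)
    scores = cross_val_score(cls, feats, y, cv=cv, scoring='f1')
    for i, score in enumerate(scores):
        print("Fold %d F1 score: %.5f" % (i, score))
    print("Mean F1 score: %.5f" % scores.mean())
    return scores.mean()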
def output_singles():
    # Plot individual regret/distance traces for the RANDOM runs.
    max_out = 20
    data = load_data("random.csv")
    for fn in all_fns(data):
        print fn
        d = data_for_fn(fn, data)['RANDOM']
        pairs = zip(d[REGRETS], d[DISTS])
        for idx, (top, bot) in enumerate(pairs[:max_out], start=1):
            if idx % 10 == 0:
                print idx
            plt.clf()
            plot_singles(fn, top, bot, bottom_log=True)
            out_name = fn + '_single_' + str(idx) + '.png'
            plt.savefig(out_name, bbox_inches='tight')
""" import numpy as np import tensorflow as tf import time from parse import FLAGS, load_data, build_embed from srn import HLSTM from srn_tool import train_srn, evaluate_srn, inference_srn from pn import PolicyGradient from pn_tool import pretrain_pn, train_pn, develop_pn, evaluate_pn, inference_pn config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: if FLAGS.log_parameters: print(FLAGS.__flags) label_train, text_train, sentence_len_train, keyword_train = load_data(FLAGS.data_dir, FLAGS.train_filename) label_dev, text_dev, sentence_len_dev, keyword_dev = load_data(FLAGS.data_dir, FLAGS.valid_filename) label_test, text_test, sentence_len_test, keyword_test = load_data(FLAGS.data_dir, FLAGS.test_filename) embed = build_embed(FLAGS.data_dir, FLAGS.word_vector_filename) SRN_graph = tf.Graph() PN_graph = tf.Graph() with SRN_graph.as_default(): SRN = HLSTM(FLAGS.symbols, FLAGS.embed_units, FLAGS.hidden_units, FLAGS.labels, embed, FLAGS.learning_rate_srn) if FLAGS.log_parameters:
import numpy as np
import cPickle

from features import create_features, PROJECT
from parse import load_data
from dict_vectorizer import DictVectorizer

videos, users, reviews = load_data()
orig_X = np.array([(x['date'], x['text'], x['user']) for x in reviews])
feats = create_features(orig_X, None)

v = DictVectorizer(sparse=False)
feats = v.fit_transform(feats)
# feats is now in vectorized format.
# v.transform() is the transformation that needs to be used on test data.
with open(PROJECT + "db/dictvectorizer.pickle", "wb") as f:
    cPickle.dump(v, f)
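# Sketch of the consuming side (illustrative, not a file in this repo): a
# test-time script would reload the pickled vectorizer and call transform(),
# never fit_transform(), on features built from held-out reviews.
def load_vectorizer():
    with open(PROJECT + "db/dictvectorizer.pickle", "rb") as f:
        return cPickle.load(f)
# e.g.: test_X = load_vectorizer().transform(test_feats)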
from parse import load_data

if __name__ == '__main__':
    load_data()
def main():
    # Quick probe: dump the BO1 results for a single benchmark function.
    data = load_data("output.json")
    d = data_for_fn('hartman_3', data)['BO1']
    print d
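# Standard entry-point guard so the probe can be run directly as a script.
if __name__ == '__main__':
    main()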