def run_vw(label, learning_rate=0.5, learning_rate2=0.5, l1=0, b=28): # Train with L1 regularization (LASSO) cmd = 'vw -d ../../data/train.train.one_hot_all.y%d.vw -f model_y%d.l1.vw --loss_function logistic -b %d -l %f -q ss -q sb -q sf -q si -q bb -q bf -q bi -q ff -q fi -q ii --passes 3 --hash all --random_seed 42 --compressed -c --l1 %s' % (label, label, b, learning_rate, l1) subprocess.check_output(shlex.split(cmd)) # Re-train with selected features cmd = 'vw -d ../../data/train.train.one_hot_all.y%d.vw -f model_y%d.vw --loss_function logistic -b %d -l %f -q ss -q sb -q sf -q si -q bb -q bf -q bi -q ff -q fi -q ii --passes 3 --hash all --random_seed 42 --compressed -c --feature_mask model_y%d.l1.vw' % (label, label, b, learning_rate2, label) subprocess.check_output(shlex.split(cmd)) # predict heldout data cmd = 'vw -d ../../data/train.validate.one_hot_all.vw -i model_y%d.vw -p preds_y%d.grouped.p.txt --compressed -c' % (label, label) subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT) with open('preds_y%d.grouped.p.txt' % label) as fd: p = load_vw_predictions(fd) return p if __name__ == "__main__": labels = load_npz('../../data/trainLabels.validate.npz')['labels'] for i in (32,11,5,8,6,28,9,30,31): print "Tuning label %d" % i y = labels[:,i] best = None min_score = np.inf cfg = product((0.3, 0.5, 0.7), (0.1, 0.2, 0.3), ('1e-8', '1e-7')) for lr, lr2, l1 in cfg: print "lr = %f, lr2 = %f, l1 = %s" % (lr, lr2, l1) p = run_vw(i, lr, lr2, l1) s = score_predictions(y, p) print "Score: %s" % s if s < min_score: best = [lr, l1]
# --- fragment: body of a per-label LogisticRegression loop -------------------
# NOTE(review): the enclosing `for` header is not visible in this chunk; `i`
# is presumably the label column index and `y` that column's targets —
# confirm against the full file.  Formatting reconstructed from a
# whitespace-mangled source; code tokens unchanged.
logging.info(i)
# Per-label regularization strength, falling back to C=5.0 when the label
# has no entry in CFG.
C = CFG.get(i, 5.0)
clf = LogisticRegression(C=C, tol=0.0001, random_state=42)
if len(np.unique(y)) == 1:
    # Degenerate label: every training target is identical, so fitting would
    # fail — emit that constant value for both test and meta predictions.
    Y_test.append(y[0]*np.ones(args.test.shape[0]))
    Y_meta.append(y[0]*np.ones(args.train.shape[0]))
else:
    logging.info("Fitting")
    # NOTE(review): fits on X_train but predicts on args.test / args.train —
    # presumably X_train is (a transform of) args.train; confirm upstream.
    clf.fit(X_train, y)
    logging.info("Predicting")
    # P(label=1) taken as 1 - P(class 0).
    p = clf.predict_proba(args.test)
    y = 1 - p[:,0]
    Y_test.append(y)
    p = clf.predict_proba(args.train)
    y = 1 - p[:,0]
    Y_meta.append(y)
# --- post-loop: stack per-label prediction columns and save ------------------
logging.info("Saving predictions to %s" % args.output)
test = load_npz('../../data/test.npz')
# vstack gives (labels, samples); transpose to (samples, labels).
Y_test = np.vstack(Y_test).T
save_npz(args.output, ids=test['ids'], header=args.labels['header'], labels=Y_test)
# Free the test predictions before building the (large) meta matrix.
del Y_test, test
logging.info("Saving predictions to %s" % args.meta)
Y_meta = np.vstack(Y_meta).T
save_npz(args.meta, ids=args.labels['ids'], header=args.labels['header'], labels=Y_meta)
# --- fragment: body of a per-label RandomForest loop -------------------------
# NOTE(review): the enclosing `for` header is not visible in this chunk; `i`
# is presumably the label column index and `y` that column's targets —
# confirm against the full file.  Formatting reconstructed from a
# whitespace-mangled source; code tokens unchanged.
logging.info(i)
# Per-label hyperparameter overrides; unlike the LogisticRegression script,
# this indexes CFG directly, so a missing key raises KeyError.
kwargs = CFG[i]
# NOTE(review): min_samples_split=1 is only accepted by old scikit-learn
# releases (modern versions require >= 2) — confirm the pinned version.
clf = RandomForestClassifier(
    n_estimators=128, criterion='entropy', max_depth=None,
    max_leaf_nodes=None, min_samples_split=1, n_jobs=-1,
    random_state=42, verbose=2, **kwargs)
if len(np.unique(y)) == 1:
    # Degenerate label: all targets identical, so fitting would fail —
    # record the constant itself in place of a classifier.
    Y_test.append(y[0]*np.ones(args.test.shape[0]))
    clfs.append(y[0])
else:
    logging.info("Fitting")
    clf.fit(args.train, y)
    clfs.append(clf)
    logging.info("Predicting")
    # P(label=1) taken as 1 - P(class 0).
    p = clf.predict_proba(args.test)
    y = 1 - p[:,0]
    Y_test.append(y)
# --- post-loop: stack per-label prediction columns and save ------------------
logging.info("Saving predictions to %s" % args.output)
test = load_npz('../../data/test.npz')
# vstack gives (labels, samples); transpose to (samples, labels).
Y_test = np.vstack(Y_test).T
save_npz(args.output, ids=test['ids'], header=args.labels['header'], labels=Y_test)
del Y_test
if args.classifiers:
    # NOTE(review): args.classifiers is passed straight to pickle.dump, so it
    # is presumably an open (binary) file object from argparse — confirm.
    logging.info("Saving classifiers to %s" % args.classifiers)
    pickle.dump(clfs, args.classifiers, 2)
id = int(entry[1]) ids.append(id) y_i = float(entry[0]) y.append(y_i) ids = np.array(ids) y = np.array(y) p = sigmoid(y) return ids, p def opts(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('pred', type=glob.glob, help='VW predictions pattern') parser.add_argument('output', help='Output file (npz)') return parser if __name__ == "__main__": args = opts().parse_args() print "Loading VW predictions" ids = load_vw_predictions(open(args.pred[0]))[0] print "Loading predictions for %d ids" % len(ids) Y = np.zeros((len(ids), 33)) for fn in args.pred: i = int(fn.split('.')[0].split('_')[-1][1:]) print "Label %d" % i Y[:, i] = load_vw_predictions(open(fn))[1] labels = load_npz('../../data/trainLabels.npz') header = labels['header'] save_npz(args.output, header=header, ids=ids, labels=Y)
# --- fragment: tail of load_vw_predictions(fd) -------------------------------
# NOTE(review): the enclosing `def`, its parsing loop, and the code that
# fills `ids` are not visible in this chunk; the statements below are
# reproduced unchanged at their presumed original indentation (loop body,
# then post-loop).  `entry` is presumably a split VW prediction line.
        y_i = float(entry[0])
        y.append(y_i)
    ids = np.array(ids)
    y = np.array(y)
    # Map raw VW margins to probabilities.
    p = sigmoid(y)
    return ids, p


def opts():
    # Argument parser: a glob pattern of VW prediction files and an output
    # .npz path.  type=glob.glob expands the pattern, so args.pred is a
    # list of matching paths.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('pred', type=glob.glob, help='VW predictions pattern')
    parser.add_argument('output', help='Output file (npz)')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    print "Loading VW predictions"
    # The id order is shared across per-label files; take it from the first
    # match.  NOTE(review): the open() handles here and in the loop below
    # are never closed — rely on interpreter cleanup or wrap in `with`.
    ids = load_vw_predictions(open(args.pred[0]))[0]
    print "Loading predictions for %d ids" % len(ids)
    Y = np.zeros((len(ids), 33))
    for fn in args.pred:
        # File names look like ..._y<label>.<ext>...; strip the leading 'y'
        # to get the label column index.
        i = int(fn.split('.')[0].split('_')[-1][1:])
        print "Label %d" % i
        Y[:,i] = load_vw_predictions(open(fn))[1]
    labels = load_npz('../../data/trainLabels.npz')
    header = labels['header']
    save_npz(args.output, header=header, ids=ids, labels=Y)