# Example #1  (scrape-artifact snippet separator; commented out so it is not parsed as code)
# 0
def run_vw(label, learning_rate=0.5, learning_rate2=0.5, l1=0, b=28):
    """Train a per-label VW model with L1 feature selection, then predict heldout data.

    Three-step scheme:
      1. train with ``--l1`` (LASSO) so weak feature weights are driven to zero;
      2. re-train with the first model as a ``--feature_mask`` (only surviving
         features are updated);
      3. predict the validation split with the re-trained model.

    label          -- integer label index; selects input/model/prediction file names
    learning_rate  -- ``-l`` for the L1 feature-selection pass
    learning_rate2 -- ``-l`` for the masked re-training pass
    l1             -- ``--l1`` regularization strength (number or string like '1e-8')
    b              -- ``-b`` hash-table bits

    Returns whatever load_vw_predictions() yields for the prediction file.
    """
    # Shared training options, hoisted so both training commands cannot drift
    # apart (the original duplicated this entire string).  The '%s' slot is the
    # output model file (-f).
    train_tmpl = ('vw -d ../../data/train.train.one_hot_all.y%d.vw -f %s '
                  '--loss_function logistic -b %d -l %f '
                  '-q ss -q sb -q sf -q si -q bb -q bf -q bi -q ff -q fi -q ii '
                  '--passes 3 --hash all --random_seed 42 --compressed -c')
    # Pass 1: L1 regularization (LASSO) for feature selection.
    cmd = (train_tmpl % (label, 'model_y%d.l1.vw' % label, b, learning_rate)
           + ' --l1 %s' % l1)
    subprocess.check_output(shlex.split(cmd))
    # Pass 2: re-train restricted to the features the L1 model kept.
    cmd = (train_tmpl % (label, 'model_y%d.vw' % label, b, learning_rate2)
           + ' --feature_mask model_y%d.l1.vw' % label)
    subprocess.check_output(shlex.split(cmd))
    # Pass 3: predict the heldout (validation) split; vw logs to stderr, so
    # capture it together with stdout.
    cmd = ('vw -d ../../data/train.validate.one_hot_all.vw -i model_y%d.vw '
           '-p preds_y%d.grouped.p.txt --compressed -c' % (label, label))
    subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT)
    with open('preds_y%d.grouped.p.txt' % label) as fd:
        p = load_vw_predictions(fd)
    return p

if __name__ == "__main__":
    labels = load_npz('../../data/trainLabels.validate.npz')['labels']
    
    for i in (32,11,5,8,6,28,9,30,31):
        print "Tuning label %d" % i
        y = labels[:,i]
    
        best = None
        min_score = np.inf
        cfg = product((0.3, 0.5, 0.7), (0.1, 0.2, 0.3), ('1e-8', '1e-7'))
        for lr, lr2, l1 in cfg:
            print "lr = %f, lr2 = %f, l1 = %s" % (lr, lr2, l1)
            p = run_vw(i, lr, lr2, l1)
            s = score_predictions(y, p)
            print "Score: %s" % s
            if s < min_score:
                best = [lr, l1]
            
        logging.info(i)
        C = CFG.get(i, 5.0)
        clf = LogisticRegression(C=C, tol=0.0001, random_state=42)
        if len(np.unique(y)) == 1:
            Y_test.append(y[0]*np.ones(args.test.shape[0]))
            Y_meta.append(y[0]*np.ones(args.train.shape[0]))
        else:
            logging.info("Fitting")
            clf.fit(X_train, y)
            logging.info("Predicting")
            p = clf.predict_proba(args.test)
            y = 1 - p[:,0]
            Y_test.append(y)
            p = clf.predict_proba(args.train)
            y = 1 - p[:,0]
            Y_meta.append(y)
            
    logging.info("Saving predictions to %s" % args.output)
    test = load_npz('../../data/test.npz')
    Y_test = np.vstack(Y_test).T
    save_npz(args.output, ids=test['ids'], 
        header=args.labels['header'], labels=Y_test)
    del Y_test, test
        
    logging.info("Saving predictions to %s" % args.meta)
    Y_meta = np.vstack(Y_meta).T
    save_npz(args.meta, ids=args.labels['ids'], 
        header=args.labels['header'], labels=Y_meta)
        
# Example #3  (scrape-artifact snippet separator; commented out so it is not parsed as code)
# 0
            
        # NOTE(review): fragment — the enclosing `for i in ...:` header was cut
        # by the scrape; `y`, `i`, `args`, `Y_test`, `clfs`, `CFG`, `load_npz`,
        # `save_npz` are all defined upstream/elsewhere.  Do not assume this
        # runs standalone.
        logging.info(i)
        # Per-label RandomForest hyper-parameters looked up from CFG.
        kwargs = CFG[i]
        clf = RandomForestClassifier(
            n_estimators=128, criterion='entropy', max_depth=None,
            max_leaf_nodes=None, min_samples_split=1,
            n_jobs=-1, random_state=42, verbose=2,
            **kwargs)
        if len(np.unique(y)) == 1:
            # Degenerate label (constant column): predict the constant for
            # every test row and store the scalar in place of a fitted model.
            Y_test.append(y[0]*np.ones(args.test.shape[0]))
            clfs.append(y[0])
        else:
            logging.info("Fitting")
            clf.fit(args.train, y)
            clfs.append(clf)
            logging.info("Predicting")
            p = clf.predict_proba(args.test)
            # 1 - P(first class) — presumably the positive-class probability
            # for binary labels; TODO confirm.
            y = 1 - p[:,0]
            Y_test.append(y)
            
    logging.info("Saving predictions to %s" % args.output)
    test = load_npz('../../data/test.npz')
    # Stack per-label prediction columns: rows = samples, cols = labels.
    Y_test = np.vstack(Y_test).T
    save_npz(args.output, ids=test['ids'], 
        header=args.labels['header'], labels=Y_test)
    del Y_test
        
    if args.classifiers:
        logging.info("Saving classifiers to %s" % args.classifiers)
        # Pickle protocol 2 into an already-open file object.
        pickle.dump(clfs, args.classifiers, 2)
# Example #4  (scrape-artifact snippet separator; commented out so it is not parsed as code)
# 0
        # NOTE(review): fragment — the function header and loop producing
        # `entry` (and initializing `ids`, `y`) were cut by the scrape.
        id = int(entry[1])  # pre-existing shadowing of builtin `id`
        ids.append(id)
        y_i = float(entry[0])  # raw VW score/margin for this row
        y.append(y_i)
    ids = np.array(ids)
    y = np.array(y)
    # Map raw VW scores to probabilities via the logistic sigmoid
    # (defined elsewhere in the project).
    p = sigmoid(y)
    return ids, p


def opts():
    """Build the command-line parser: a glob pattern of VW prediction
    files plus the output .npz path."""
    argp = argparse.ArgumentParser(description=__doc__)
    # `type=glob.glob` expands the pattern at parse time into a list of paths.
    argp.add_argument(
        'pred',
        type=glob.glob,
        help='VW predictions pattern',
    )
    argp.add_argument(
        'output',
        help='Output file (npz)',
    )
    return argp


if __name__ == "__main__":
    args = opts().parse_args()
    print "Loading VW predictions"
    ids = load_vw_predictions(open(args.pred[0]))[0]
    print "Loading predictions for %d ids" % len(ids)
    Y = np.zeros((len(ids), 33))
    for fn in args.pred:
        i = int(fn.split('.')[0].split('_')[-1][1:])
        print "Label %d" % i
        Y[:, i] = load_vw_predictions(open(fn))[1]
    labels = load_npz('../../data/trainLabels.npz')
    header = labels['header']
    save_npz(args.output, header=header, ids=ids, labels=Y)
        # NOTE(review): fragment (duplicate of an earlier loader tail) — the
        # function header and the loop producing `entry` were cut by the scrape.
        y_i = float(entry[0])  # raw VW score/margin for this row
        y.append(y_i)
    ids = np.array(ids)
    y = np.array(y)
    # Convert raw VW scores to probabilities with the logistic sigmoid
    # (defined elsewhere in the project).
    p = sigmoid(y)
    return ids, p

def opts():
    """Construct the argument parser (VW prediction glob pattern + npz output)."""
    parser = argparse.ArgumentParser(description=__doc__)
    # Positional arguments, declared data-style; order matters for argparse.
    positionals = (
        ('pred', dict(type=glob.glob, help='VW predictions pattern')),
        ('output', dict(help='Output file (npz)')),
    )
    for arg_name, arg_kwargs in positionals:
        parser.add_argument(arg_name, **arg_kwargs)
    return parser

if __name__ == "__main__":
    args = opts().parse_args()
    print "Loading VW predictions"
    ids = load_vw_predictions(open(args.pred[0]))[0]
    print "Loading predictions for %d ids" % len(ids)
    Y = np.zeros((len(ids), 33))
    for fn in args.pred:
        i = int(fn.split('.')[0].split('_')[-1][1:])
        print "Label %d" % i
        Y[:,i] = load_vw_predictions(open(fn))[1]
    labels = load_npz('../../data/trainLabels.npz')
    header = labels['header']
    save_npz(args.output, header=header, ids=ids, labels=Y)