    from pyutils.ensemble_selection.Ensemble import Ensemble
    # extract_* helpers and joblib are used below; imported here as in the other examples
    from pyutils.kaggle_io.extract_inputs import extract_testing_data, extract_training_data
    from sklearn.externals import joblib
    from scipy.spatial.distance import hamming

    print
    print 'Reading data...'
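    # Test set: row Ids and the tf-idf feature matrix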
    Id_test, X_test = extract_testing_data(args.kaggle_root +
                                           '/data/kaggle_test_tf_idf.csv')

    print
    print 'Loading ensemble...'
    ensemble = joblib.load(args.ensemble_dir + '/ensemble.pkl')

    if args.hill_predict:
        print
        print 'Ensemble hill predicting...'
        Id_train, X_train, y_train = extract_training_data(
            args.kaggle_root + '/data/kaggle_train_tf_idf.csv')
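        # scipy's hamming() between two label vectors is the fraction of
        # mismatches, i.e. the ensemble's misclassification rate on the training set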
        err = hamming(ensemble.hill_predict(X_train), y_train)
        print '\tHill climbing error: {0}'.format(err)

    print
    print 'Predicting...'
    pred = ensemble.predict(X_test)
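    # Write predictions in Kaggle's "Id,Prediction" submission format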
    with open(args.submission_fname, 'w') as f:
        f.write('Id,Prediction\n')
        for i, j in zip(Id_test.astype(int).tolist(), pred.tolist()):
            f.write('{0},{1}\n'.format(i, j))
    print '\tOutput written to {0}.'.format(args.submission_fname)

    print
    print 'Done.\n'
Example #2

    import argparse

    # imports for the helpers, transforms and classifier used below (paths as in the other examples)
    from pyutils.kaggle_io.extract_inputs import extract_training_data
    from pyutils.ensemble_selection.CvModel import CvModel
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    parser = argparse.ArgumentParser()
    parser.add_argument('penalty',
                        help='L1 or L2 penalization.')
    parser.add_argument('C',
                        help='Regularization parameter.')
    parser.add_argument('fit_intercept',
                        help='Whether to include a constant bias (intercept) term in the decision function.')
    parser.add_argument('n_components_pca',
                        help='n_components for PCA.')
    parser.add_argument('model_fname',
                        help='Absolute path to pickle the fitted CvModel.')
    args = parser.parse_args()

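    # 'None' on the command line means PCA keeps all components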
    n_components_pca = None if args.n_components_pca == 'None' else int(args.n_components_pca)

    Id, X, y = extract_training_data('/nfs/raid13/babar/dchao/KaggleCS155/data/kaggle_train_tf_idf.csv')

    # trans/clf specs
    n_folds = 5
    scaler = StandardScaler()
    pca = PCA(n_components=n_components_pca, whiten=True)
    trans = Pipeline([('scale_center', scaler), ('pca', pca)]).fit(X)
    clf = LogisticRegression(
        penalty=args.penalty,
        C=float(args.C),
        # bool() of any non-empty string is True, so compare the flag explicitly
        fit_intercept=(args.fit_intercept == 'True'),
        class_weight='auto'
    )

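    # CvModel (pyutils.ensemble_selection) bundles the fitted transform and the
    # classifier with an n_folds setting; its hill_predict() output is what the
    # other examples score against the labels with hamming()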
    cv_clf = CvModel(n_folds, trans, clf)
    cv_clf.fit(X, y)
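    # args.model_fname (per its help text) is the intended pickle destination for
    # the fitted CvModel, presumably written with joblib.dump(cv_clf, args.model_fname)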
Example #3
    import os
    import re
    from operator import itemgetter

    from pyutils.kaggle_io.extract_inputs import extract_training_data
    from pyutils.ensemble_selection.CvModel import CvModel
    from sklearn.externals import joblib
    from scipy.spatial.distance import hamming

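    # Read the list of model directories to score (one per line)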
    model_dirs = []
    with open(args.model_dirlist_fname, 'r') as f:
        for line in f:
            model_dirs.append(line.strip())

    print 'Reading training data.\n'
    Id, X, y = extract_training_data(args.kaggle_root + '/data/kaggle_train_tf_idf.csv')

    print 'Scoring models.'
    hillclimb_errs = []
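    # Scan each listed directory under <kaggle_root>/models for pickled models (*.pkl)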
    prog = re.compile(r'.*\.pkl$')
    model_rootdir = args.kaggle_root + '/models'
    for m_dir in model_dirs:
        dir_contents = os.listdir(model_rootdir + '/' + m_dir)
        for fname in dir_contents:
            try:
                model_name = m_dir + '/' + prog.match(fname).group(0)
                print '\t{0}'.format(model_name)
                model = joblib.load(model_rootdir + '/' + model_name)
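                # score the model by its in-sample error: the fraction of training
                # labels its hill_predict() gets wrong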
                err = hamming(model.hill_predict(X), y)
                hillclimb_errs.append((model_name, err))

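            # non-.pkl filenames fail prog.match() (it returns None), so .group(0)
            # raises AttributeError and the file is skipped below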
            except AttributeError: