d['target'] = gl.SArray(targets)

        return gl.SFrame(d)

    def _preds_to_array(self, preds):
        # GraphLab returns long-form rows of (id, class, probability);
        # unstack/unpack pivots them into one probability column per class.
        p = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '')
        p['id'] = p['id'].astype(int) + 1
        p = p.sort('id')  # restore the original row order before dropping the key
        del p['id']
        preds_array = np.array(p.to_dataframe(), dtype=float)

        return preds_array


if __name__ == '__main__':
    train, labels, test, _, _ = utils.load_data()

    clf = XGBoost(max_iterations=4800, max_depth=12, min_child_weight=4.9208250938262745,
                  row_subsample=.9134478530382129, min_loss_reduction=.5132278416508804,
                  column_subsample=.730128689911957, step_size=.009)

    if MODE == 'cv':
        scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False)
        print('CV:', scores, 'Mean log loss:', np.mean(scores))
        utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions)
    elif MODE == 'submission':
        clf.fit(train, labels)
        predictions = clf.predict_proba(test)
        utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH,
                              os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'),
                              predictions)
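
# For readers without GraphLab Create: a rough pandas equivalent (an assumed
# sketch, not code from the original repo) of the unstack/unpack reshaping in
# _preds_to_array, pivoting long-form (id, class, probability) rows into an
# (n_rows, n_classes) probability matrix ordered by id.
import pandas as pd

long_preds = pd.DataFrame({'id': [1, 1, 0, 0],
                           'class': ['a', 'b', 'a', 'b'],
                           'probability': [.2, .8, .9, .1]})
wide = long_preds.pivot(index='id', columns='class', values='probability')
preds_array = wide.sort_index().to_numpy(dtype=float)  # one row per id, one column per class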
Example #2
import numpy as np
import os

from sklearn import ensemble, feature_extraction, preprocessing

from otto_utils import consts, utils


MODEL_NAME = 'model_02_random_forest'
MODE = 'holdout'  # cv|submission|holdout

# import data
train, labels, test, _, _ = utils.load_data()

# transform counts to TFIDF features
tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
train = tfidf.fit_transform(train).toarray()
test = tfidf.transform(test).toarray()
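
# Toy illustration (synthetic counts, not the Otto data) of what the
# transformer above does: with smooth_idf=False scikit-learn uses
# idf(t) = ln(n_docs / df(t)) + 1, then L2-normalises each row.
demo_counts = np.array([[3, 0, 1], [2, 0, 0], [0, 1, 1]])
demo_tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
print(demo_tfidf.fit_transform(demo_counts).toarray().round(3))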

# encode labels
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# train classifier (an ExtraTrees ensemble, despite the "random_forest" model name)
clf = ensemble.ExtraTreesClassifier(n_jobs=4, n_estimators=2000, max_features=20, min_samples_split=3,
                                    bootstrap=False, verbose=3, random_state=23)

if MODE == 'cv':
    scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False)
    print('CV:', scores, 'Mean log loss:', np.mean(scores))
    utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions)
Example #3
        iter_funcs = self.create_test_function(dataset, self.model)
        # ceil-divide so the final, possibly partial batch is still evaluated
        num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size)))

        test_preds, test_probas = np.array([]), None

        for b in range(num_batches_test):
            batch_test_pred, batch_test_proba = iter_funcs['test'](b)
            test_preds = np.append(test_preds, batch_test_pred)
            # stack per-batch probability rows; the first batch initialises the array
            test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba

        return test_preds, test_probas


if __name__ == '__main__':
    train, labels, test, _, _ = utils.load_data(os.path.join(consts.DATA_PATH, 'fe_train.csv'),
                                                os.path.join(consts.DATA_PATH, 'fe_test.csv'))

    from sklearn import decomposition
    # PCA
    pp = decomposition.PCA()
    train = pp.fit_transform(train)
    test = pp.transform(test)
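
    # Optional sanity check (not in the original script): with no n_components
    # argument, PCA keeps every component, so the transform is a rotation of
    # the centred data and the explained-variance ratios sum to ~1.
    print(pp.explained_variance_ratio_[:5], pp.explained_variance_ratio_.sum())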

    clf = NeuralNetwork(1024, 110, 128, 0.00013934891814068934, 0.9724490021642429,
                        6.238206486137665e-05, 0.3081052487919688,
                        .02, True, 10, random_state=21)

    if MODE == 'cv':
        scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False)
        print('CV:', scores, 'Mean log loss:', np.mean(scores))
        utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions)
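
# A self-contained toy run (hypothetical sizes, not part of the original model)
# of the batched-prediction pattern in the method above: ceil-division fixes
# the batch count, and per-batch probability blocks are stacked row-wise.
import math
import numpy as np

num_examples, batch_size = 10, 4
num_batches = int(math.ceil(num_examples / float(batch_size)))  # ceil(10 / 4) == 3
probas = None
for b in range(num_batches):
    rows = min(batch_size, num_examples - b * batch_size)
    block = np.full((rows, 9), 1.0 / 9)  # stand-in for a batch of 9-class probabilities
    probas = np.append(probas, block, axis=0) if probas is not None else block
assert probas.shape == (10, 9)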
Example #4
        test_preds, test_probas = np.array([]), None

        for b in range(num_batches_test):
            batch_test_pred, batch_test_proba = iter_funcs['test'](b)
            test_preds = np.append(test_preds, batch_test_pred)
            test_probas = np.append(
                test_probas, batch_test_proba,
                axis=0) if test_probas is not None else batch_test_proba

        return test_preds, test_probas


if __name__ == '__main__':
    train, labels, test, _, _ = utils.load_data(
        os.path.join(consts.DATA_PATH, 'fe_train.csv'),
        os.path.join(consts.DATA_PATH, 'fe_test.csv'))

    from sklearn import decomposition
    # PCA
    pp = decomposition.PCA()
    train = pp.fit_transform(train)
    test = pp.transform(test)

    clf = NeuralNetwork(1024,
                        110,
                        128,
                        0.00013934891814068934,
                        0.9724490021642429,
                        6.238206486137665e-05,
                        0.3081052487919688,
Example #5
It achieves around 0.52914588084 log loss on the holdout set
"""

import numpy as np
import os

from sklearn import ensemble, feature_extraction, linear_model, preprocessing
from sklearn.svm import LinearSVC

from otto_utils import consts, utils

MODEL_NAME = 'model_01_bagging_linear'
MODE = 'holdout'  # cv|submission|holdout

# import data
train, labels, test, _, _ = utils.load_data()  # the _ placeholders presumably discard unused return values

# polynomial features
poly_feat = preprocessing.PolynomialFeatures(degree=2,
                                             interaction_only=False,
                                             include_bias=True)
train = poly_feat.fit_transform(train, labels)
test = poly_feat.transform(test)

print(train.shape)
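
# Quick check of the degree-2 expansion on a toy input (not the Otto data):
# for d input features, PolynomialFeatures(degree=2, include_bias=True) yields
# 1 + d + d + d*(d-1)/2 = (d + 1)(d + 2)/2 columns.
demo_poly = preprocessing.PolynomialFeatures(degree=2, include_bias=True)
print(demo_poly.fit_transform(np.arange(6).reshape(2, 3)).shape)  # (2, 10) for d = 3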

# transform counts to TFIDF features
tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
train = tfidf.fit_transform(train).toarray()
test = tfidf.transform(test).toarray()