# Example #1 (0 votes)
def xv_eval_experts(expert_names, get_indices, aug_train_set):
    """Cross-validate the named experts and summarize their RMSLE.

    For every cross-validation fold, restricts the fold's test rows to
    those selected by ``get_indices``, scores the experts' predictions
    against the true targets, and records the RMSLE.

    Returns a dict with keys ``"rmsles"`` (per-fold list), ``"rmsle_avg"``
    and ``"rmsle_std"``.
    """
    fold_scores = []
    for fold_id, (_, _, X_test, y_test) in enumerate(
            generate_xv_splits(aug_train_set)):
        # Keep only the test rows this group of experts is responsible for.
        mask = get_indices(X_test)
        X_sub = X_test[mask].reset_index(drop=True)
        y_sub = y_test[mask].reset_index(drop=True)

        preds = get_predictions(fold_id, expert_names, get_indices, X_sub)
        # NOTE(review): RMSE here is treated as RMSLE — presumably the
        # targets are already log-transformed (cf. "log_cost" elsewhere).
        fold_scores.append(np.sqrt(mean_squared_error(y_sub.values, preds)))

    return {
        "rmsles": fold_scores,
        "rmsle_avg": np.mean(fold_scores),
        "rmsle_std": np.std(fold_scores),
    }
def xv_eval_experts(expert_names, get_indices, aug_train_set):
    """Evaluate experts over all cross-validation folds.

    Each fold's test set is filtered down to the rows selected by
    ``get_indices`` before scoring.  The result dict carries the list of
    per-fold RMSLEs plus their mean and standard deviation.
    """
    rmsles = []
    fold_id = 0
    for split in generate_xv_splits(aug_train_set):
        _, _, X_test, y_test = split

        # Restrict the test set to the rows these experts handle.
        keep = get_indices(X_test)
        X_test = X_test[keep].reset_index(drop=True)
        y_test = y_test[keep].reset_index(drop=True)

        # Score the experts' predictions on the filtered fold.
        y_pred = get_predictions(fold_id, expert_names, get_indices, X_test)
        rmsles.append(np.sqrt(mean_squared_error(y_test.values, y_pred)))
        fold_id += 1

    return {'rmsles': rmsles,
            'rmsle_avg': np.mean(rmsles),
            'rmsle_std': np.std(rmsles)}
# Example #3 (0 votes)
def train_and_save_expert(expert_name, aug_train_set, folds=True):
    get_indices = getattr(expert_params, expert_name + "_get_indices")
    params = getattr(expert_params, expert_name + "_params")

    if folds:
        folds_gen = enumerate(generate_xv_splits(aug_train_set))
    else:
        X_train = aug_train_set.copy(deep=False)
        y_train = X_train.pop("log_cost")
        folds_gen = [("all", (X_train, y_train, None, None))]

    print "Training {}...".format(expert_name)
    for fold_id, split in folds_gen:
        print "fold {}...".format(fold_id)
        path = get_expert_path(expert_name, fold_id)
        if os.path.exists(path):
            print "  -> skipping because {} exists".format(path)
        else:
            # Select subset of train set according to get_indices.
            X_train, y_train, X_test, y_test = split
            train_is = get_indices(X_train)
            X_train = X_train[train_is].reset_index(drop=True)
            y_train = y_train[train_is].reset_index(drop=True)

            # Featurize and train model.
            featurizer = AllCategoricalsFeaturizer()
            featurizer.fit(X_train)
            X_train_feats = featurizer.transform(X_train)
            X_train_np = X_train_feats.astype(np.float).values
            y_train_np = y_train.values
            xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
            model = xgb.train(params.items(), xgtrain, params["num_rounds"])

            print "  -> saving to {}".format(path)
            os.makedirs(path)
            with open(os.path.join(path, "featurizer"), "w") as f:
                pickle.dump(featurizer, f)
            model.save_model(os.path.join(path, "model"))
def train_and_save_expert(expert_name, aug_train_set, folds=True):
    get_indices = getattr(expert_params, expert_name + '_get_indices')
    params = getattr(expert_params, expert_name + '_params')

    if folds:
        folds_gen = enumerate(generate_xv_splits(aug_train_set))
    else:
        X_train = aug_train_set.copy(deep=False)
        y_train = X_train.pop('log_cost')
        folds_gen = [('all', (X_train, y_train, None, None))]

    print "Training {}...".format(expert_name)
    for fold_id, split in folds_gen:
        print "fold {}...".format(fold_id)
        path = get_expert_path(expert_name, fold_id)
        if os.path.exists(path):
            print "  -> skipping because {} exists".format(path)
        else:
            # Select subset of train set according to get_indices.
            X_train, y_train, X_test, y_test = split
            train_is = get_indices(X_train)
            X_train = X_train[train_is].reset_index(drop=True)
            y_train = y_train[train_is].reset_index(drop=True)

            # Featurize and train model.
            featurizer = AllCategoricalsFeaturizer()
            featurizer.fit(X_train)
            X_train_feats = featurizer.transform(X_train)
            X_train_np = X_train_feats.astype(np.float).values
            y_train_np = y_train.values
            xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
            model = xgb.train(params.items(), xgtrain, params['num_rounds'])

            print "  -> saving to {}".format(path)
            os.makedirs(path)
            with open(os.path.join(path, 'featurizer'), 'w') as f:
                pickle.dump(featurizer, f)
            model.save_model(os.path.join(path, 'model'))
import numpy as np
import os


if __name__ == "__main__":
    get_indices = layer1_get_indices
    # get_indices = layer2_get_indices

    print "Loading augmented dataset..."
    aug_train_set, aug_test_set = get_augmented_train_and_test_set()

    print "Dumping xv folds using {}...".format(get_indices.__name__)
    featurizer = AllCategoricalsFeaturizer()
    base_path = 'folds'
    for i, split in enumerate(generate_xv_splits(aug_train_set)):
        print i

        # Select subset of train and test set according to get_indices.
        X_train, y_train, X_test, y_test = split
        train_is = get_indices(X_train)
        X_train = X_train[train_is].reset_index(drop=True)
        y_train = y_train[train_is].reset_index(drop=True)
        test_is = get_indices(X_test)
        X_test = X_test[test_is].reset_index(drop=True)
        y_test = y_test[test_is].reset_index(drop=True)

        split_np = featurize_and_to_numpy(
            featurizer, X_train, y_train, X_test, y_test)
        X_train_np, y_train_np, X_test_np, y_test_np = split_np
        np.savez_compressed(