def xv_eval_experts(expert_names, get_indices, aug_train_set):
    """Cross-validate the named experts and summarize their RMSLE.

    For every cross-validation fold of aug_train_set, restricts the test
    split to the rows selected by get_indices, collects predictions from
    the named experts, and records the fold's root-mean-squared error on
    the (log-space) targets.

    Returns a dict with per-fold scores ("rmsles") plus their mean
    ("rmsle_avg") and standard deviation ("rmsle_std").
    """
    fold_scores = []
    for fold_id, fold in enumerate(generate_xv_splits(aug_train_set)):
        _, _, X_test, y_test = fold
        # Keep only the test rows this group of experts is responsible for.
        keep = get_indices(X_test)
        X_test = X_test[keep].reset_index(drop=True)
        y_test = y_test[keep].reset_index(drop=True)
        preds = get_predictions(fold_id, expert_names, get_indices, X_test)
        fold_scores.append(np.sqrt(mean_squared_error(y_test.values, preds)))
    return {"rmsles": fold_scores,
            "rmsle_avg": np.mean(fold_scores),
            "rmsle_std": np.std(fold_scores)}
def xv_eval_experts(expert_names, get_indices, aug_train_set):
    """Evaluate the named experts across all xv folds.

    Each fold's test split is filtered down to the rows chosen by
    get_indices before scoring, so experts are only judged on the subset
    they were trained for. The score per fold is sqrt(MSE) of the
    log-space targets, i.e. the RMSLE.

    Returns {'rmsles': [...], 'rmsle_avg': ..., 'rmsle_std': ...}.
    """
    rmsles = []
    folds = generate_xv_splits(aug_train_set)
    for fold_id, (X_train, y_train, X_test, y_test) in enumerate(folds):
        # Restrict evaluation to this expert group's slice of the test set.
        mask = get_indices(X_test)
        X_eval = X_test[mask].reset_index(drop=True)
        y_eval = y_test[mask].reset_index(drop=True)
        y_pred = get_predictions(fold_id, expert_names, get_indices, X_eval)
        rmsles.append(np.sqrt(mean_squared_error(y_eval.values, y_pred)))
    return {
        'rmsles': rmsles,
        'rmsle_avg': np.mean(rmsles),
        'rmsle_std': np.std(rmsles),
    }
def train_and_save_expert(expert_name, aug_train_set, folds=True):
    """Train and persist the named expert, one model per xv fold.

    Looks up "<expert_name>_get_indices" and "<expert_name>_params" on the
    expert_params module. When folds is True, one model is trained per
    cross-validation fold; otherwise a single model (fold id "all") is
    trained on the full training set. Each model is written under
    get_expert_path(expert_name, fold_id) as a pickled featurizer plus an
    xgboost model file. Paths that already exist are skipped, making
    interrupted runs resumable.
    """
    get_indices = getattr(expert_params, expert_name + "_get_indices")
    params = getattr(expert_params, expert_name + "_params")
    if folds:
        folds_gen = enumerate(generate_xv_splits(aug_train_set))
    else:
        # Shallow copy so popping the target column does not alter the
        # caller's DataFrame.
        X_train = aug_train_set.copy(deep=False)
        y_train = X_train.pop("log_cost")
        folds_gen = [("all", (X_train, y_train, None, None))]
    print("Training {}...".format(expert_name))
    for fold_id, split in folds_gen:
        print("fold {}...".format(fold_id))
        path = get_expert_path(expert_name, fold_id)
        if os.path.exists(path):
            print(" -> skipping because {} exists".format(path))
            continue
        # Select subset of train set according to get_indices.
        X_train, y_train, X_test, y_test = split
        train_is = get_indices(X_train)
        X_train = X_train[train_is].reset_index(drop=True)
        y_train = y_train[train_is].reset_index(drop=True)
        # Featurize and train model.
        featurizer = AllCategoricalsFeaturizer()
        featurizer.fit(X_train)
        X_train_feats = featurizer.transform(X_train)
        # FIX: np.float is a deprecated alias (removed in NumPy 1.24);
        # np.float64 is the explicit equivalent.
        X_train_np = X_train_feats.astype(np.float64).values
        y_train_np = y_train.values
        xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
        model = xgb.train(params.items(), xgtrain, params["num_rounds"])
        print(" -> saving to {}".format(path))
        os.makedirs(path)
        # FIX: pickle is a binary format -- open the file in "wb", not "w",
        # so the dump is not corrupted by text-mode newline translation.
        with open(os.path.join(path, "featurizer"), "wb") as f:
            pickle.dump(featurizer, f)
        model.save_model(os.path.join(path, "model"))
def train_and_save_expert(expert_name, aug_train_set, folds=True):
    """Train and persist the named expert, one model per xv fold.

    The expert's row selector ('<expert_name>_get_indices') and xgboost
    hyperparameters ('<expert_name>_params') are resolved dynamically from
    the expert_params module. With folds=True a model is trained on each
    cross-validation fold; with folds=False a single model (fold id 'all')
    is trained on the whole set. Output per fold: a pickled featurizer and
    a saved xgboost model under get_expert_path(expert_name, fold_id).
    Existing output paths are skipped so runs can be resumed.
    """
    get_indices = getattr(expert_params, expert_name + '_get_indices')
    params = getattr(expert_params, expert_name + '_params')
    if folds:
        folds_gen = enumerate(generate_xv_splits(aug_train_set))
    else:
        # Shallow copy: pop of the target column must not mutate the
        # caller's DataFrame.
        X_train = aug_train_set.copy(deep=False)
        y_train = X_train.pop('log_cost')
        folds_gen = [('all', (X_train, y_train, None, None))]
    print("Training {}...".format(expert_name))
    for fold_id, split in folds_gen:
        print("fold {}...".format(fold_id))
        path = get_expert_path(expert_name, fold_id)
        if os.path.exists(path):
            print(" -> skipping because {} exists".format(path))
            continue
        # Select subset of train set according to get_indices.
        X_train, y_train, X_test, y_test = split
        train_is = get_indices(X_train)
        X_train = X_train[train_is].reset_index(drop=True)
        y_train = y_train[train_is].reset_index(drop=True)
        # Featurize and train model.
        featurizer = AllCategoricalsFeaturizer()
        featurizer.fit(X_train)
        X_train_feats = featurizer.transform(X_train)
        # FIX: np.float is a deprecated alias (removed in NumPy 1.24);
        # use the explicit np.float64 dtype instead.
        X_train_np = X_train_feats.astype(np.float64).values
        y_train_np = y_train.values
        xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
        model = xgb.train(params.items(), xgtrain, params['num_rounds'])
        print(" -> saving to {}".format(path))
        os.makedirs(path)
        # FIX: pickle output must be opened in binary mode ('wb'); text
        # mode ('w') can corrupt the stream via newline translation.
        with open(os.path.join(path, 'featurizer'), 'wb') as f:
            pickle.dump(featurizer, f)
        model.save_model(os.path.join(path, 'model'))
import numpy as np import os if __name__ == "__main__": get_indices = layer1_get_indices # get_indices = layer2_get_indices print "Loading augmented dataset..." aug_train_set, aug_test_set = get_augmented_train_and_test_set() print "Dumping xv folds using {}...".format(get_indices.__name__) featurizer = AllCategoricalsFeaturizer() base_path = 'folds' for i, split in enumerate(generate_xv_splits(aug_train_set)): print i # Select subset of train and test set according to get_indices. X_train, y_train, X_test, y_test = split train_is = get_indices(X_train) X_train = X_train[train_is].reset_index(drop=True) y_train = y_train[train_is].reset_index(drop=True) test_is = get_indices(X_test) X_test = X_test[test_is].reset_index(drop=True) y_test = y_test[test_is].reset_index(drop=True) split_np = featurize_and_to_numpy( featurizer, X_train, y_train, X_test, y_test) X_train_np, y_train_np, X_test_np, y_test_np = split_np np.savez_compressed(