def main():
    cfg = Config()
    data_dir = '../../data'

    # load the raw competition tables
    train = pd.read_csv(os.path.join(data_dir, 'train_features.csv'))
    targets = pd.read_csv(os.path.join(data_dir, 'train_targets_scored.csv'))
    drug = pd.read_csv(os.path.join(data_dir, 'train_drug.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test_features.csv'))
    submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

    # preprocess features
    train = process_data_cp(train)
    test = process_data_cp(test)

    # drug-aware, stratified fold assignment shared by features and targets
    ff = make_folds(drug=drug, folds=cfg.folds, random_state=cfg.seed,
                    stratify=True, scored=targets)
    train['fold'] = ff.fold.values
    targets['fold'] = ff.fold.values

    # initialize out-of-fold predictions (one column per scored target,
    # dropping sig_id and fold), the candidate population and its CV scores
    oof = np.zeros((len(train), targets.shape[1] - 2))
    dna = np.random.uniform(0, 1, (cfg.population, 875)) ** 2.0
    cvs = np.zeros(cfg.population)
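# --- Illustrative sketch (not part of the original code) --------------------
# main() relies on a project-local make_folds(drug=..., folds=..., ...) that
# returns a frame with a 'fold' column aligned to the scored targets. The
# sketch below shows one plausible drug-aware assignment using scikit-learn's
# GroupKFold; it assumes the usual 'sig_id'/'drug_id' columns, ignores the
# random_state/stratify arguments, and omits the target stratification the
# real helper presumably performs.
import pandas as pd
from sklearn.model_selection import GroupKFold

def make_drug_folds_sketch(drug, folds, random_state=None, stratify=True, scored=None):
    # align drug ids with the scored targets, one row per sample
    out = scored[['sig_id']].merge(drug, on='sig_id')
    out['fold'] = -1
    # group on drug_id so every row for a given compound lands in one fold
    gkf = GroupKFold(n_splits=folds)
    for f, (_, val_idx) in enumerate(gkf.split(out, groups=out['drug_id'])):
        out.loc[val_idx, 'fold'] = f
    return out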
def cv(config, context, folds=5, repeat=2, save=False):
    # TODO: too much overloading on folds here
    if isinstance(folds, int):
        folds = make_folds(context.data.index, folds, repeat)
    scores = []
    for train, test in folds:
        context.train_index = train
        preds, x, y = predict(config, context, test)
        actuals = y.reindex(test)
        config.update_reporters_with_predictions(context, x, actuals, preds)
        scores.append(config.metric.score(actuals, preds))
    scores = np.array(scores)
    #if save:
    #    dataset.save_models([(scores, copy.copy(config))])
    return scores
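# --- Illustrative sketch (not part of the original code) --------------------
# cv() above expects make_folds(index, folds, repeat) to yield
# (train_index, test_index) pairs: `repeat` shuffled rounds of `folds` splits.
# This is a self-contained sketch of that contract only; the project's helper
# may shuffle, stratify, or seed differently.
import numpy as np

def make_index_folds_sketch(index, folds=5, repeat=2):
    index = np.asarray(index)
    positions = np.arange(len(index))
    for r in range(repeat):
        # a fresh deterministic shuffle per repeat round
        order = np.random.RandomState(r).permutation(len(index))
        for f in range(folds):
            test_pos = order[f::folds]
            train_pos = np.setdiff1d(positions, test_pos)
            yield index[train_pos], index[test_pos]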
def cv(config, context, folds=5, repeat=2, print_results=False,
       predict_method=None, predict_update_column=None):
    # TODO: too much overloading on folds here
    if isinstance(folds, int):
        folds = make_folds(context.data.index, folds, repeat)
    #else:
    #    folds.set_context(config, context)
    scores = {get_metric_name(m): [] for m in config.metrics}
    # we are overwriting indices, so make a copy
    ctx = context.copy()
    i = 0
    folds = list(folds)
    k = len(folds) / repeat
    for train, test in folds:
        print "\nCross-Validation fold %d/%d round %d/%d" % (i % k + 1, k, i / k + 1, repeat)
        i += 1
        ctx.train_index = train
        ctx.test_index = test
        fold_scores, result = evaluate(config, ctx, test, predict_method, predict_update_column)
        context.latest_result = result
        for metric_, s in fold_scores.items():
            scores[metric_].append(s)
    if print_results:
        for metric_, s in scores.items():
            print "%s: %s" % (metric_, pprint_scores(s))
    result = {'config': config, 'scores': scores}

    # report results
    t = PrettyTable(["Reporter", "Report"])
    t.hrules = ALL
    t.align["Reporter"] = "l"
    t.align["Report"] = "l"
    for reporter in config.reporters:
        t.add_row([reporter.__class__.__name__, reporter.report()])
        reporter.reset()
    print t
    return result
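# --- Illustrative sketch (not part of the original code) --------------------
# get_metric_name and pprint_scores are not defined in this file. One
# plausible shape, assuming metrics may be bare scoring functions or objects
# carrying a .name attribute; the project's real helpers may differ.
import numpy as np

def get_metric_name(metric):
    # prefer an explicit .name, then the function name, then the class name
    name = getattr(metric, 'name', None)
    if name:
        return name
    return getattr(metric, '__name__', metric.__class__.__name__)

def pprint_scores(values):
    # one-line summary of a list of per-fold scores
    values = np.asarray(values, dtype=float)
    return "%.4f +/- %.4f (n=%d)" % (values.mean(), values.std(), len(values))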
def cv(config, context, folds=5, repeat=2, print_results=False):
    # TODO: too much overloading on folds here
    if isinstance(folds, int):
        folds = make_folds(context.data.index, folds, repeat)
    scores = dict([(m.name, []) for m in config.metrics])
    # we are overwriting indices, so make a copy
    ctx = context.copy()
    for train, test in folds:
        ctx.train_index = train
        preds, x, y = predict(config, ctx, test)
        actuals = y.reindex(test)
        config.update_reporters_with_predictions(ctx, x, actuals, preds)
        for metric in config.metrics:
            scores[metric.name].append(metric.score(actuals, preds))
    #if save:
    #    dataset.save_models([(scores, copy.copy(config))])
    if print_results:
        print "\n" + str(config)
        print_scores(scores)
    return scores
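# --- Illustrative sketch (not part of the original code) --------------------
# print_scores is also undefined here; a minimal version consistent with how
# it is called (a dict mapping metric name to the list of per-fold scores).
# The real helper may format its output differently.
def print_scores(scores):
    for name, values in sorted(scores.items()):
        mean = sum(values) / float(len(values))
        print("%s: %.4f mean over %d folds (min %.4f, max %.4f)"
              % (name, mean, len(values), min(values), max(values)))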
def cv(config, context, folds=5, repeat=2, print_results=False):
    # TODO: too much overloading on folds here
    if isinstance(folds, int):
        folds = make_folds(context.data.index, folds, repeat)
    else:
        folds.set_context(config, context)
    scores = {m.name: [] for m in config.metrics}
    # we are overwriting indices, so make a copy
    ctx = context.copy()
    i = 0
    folds = list(folds)
    k = len(folds) / repeat
    for train, test in folds:
        print "\nCross-Validation fold %d/%d round %d/%d" % (i % k + 1, k, i / k + 1, repeat)
        i += 1
        ctx.train_index = train
        ctx.test_index = test
        preds, x, y = predict(config, ctx, test)
        actuals = y.reindex(test)
        config.update_reporters_with_predictions(ctx, x, actuals, preds)
        for metric in config.metrics:
            scores[metric.name].append(metric.score(actuals, preds))
    result = {'config': config, 'scores': scores}

    # report results
    t = PrettyTable(["Reporter", "Report"])
    t.hrules = ALL
    t.align["Reporter"] = "l"
    t.align["Report"] = "l"
    for reporter in config.reporters:
        t.add_row([reporter.__class__.__name__, reporter.report()])
        reporter.reset()
    print t
    #if save:
    #    dataset.save_models([(scores, copy.copy(config))])
    if print_results:
        print "\n" + str(config)
        print_scores(scores)
    return result
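# --- Illustrative sketch (not part of the original code) --------------------
# This variant also accepts a fold-provider object instead of an integer:
# anything iterable over (train_index, test_index) pairs that exposes
# set_context(config, context). The expanding-window provider below is purely
# an example of that protocol, not a class from the project.
class ExpandingWindowFolds(object):

    def __init__(self, n_splits):
        self.n_splits = n_splits
        self.index = None

    def set_context(self, config, context):
        # cv() hands over the context before iterating the folds
        self.index = context.data.index

    def __iter__(self):
        n = len(self.index)
        step = n // (self.n_splits + 1)
        for k in range(1, self.n_splits + 1):
            # train on an expanding prefix, validate on the following block
            yield self.index[:k * step], self.index[k * step:(k + 1) * step]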
def cv_predictiveness(data, S, measure, pred_func, V=5, stratified=True,
                      na_rm=False, type="regression", ensemble=False,
                      run_cv=False):
    """
    Compute a cross-validated measure of predictiveness based on the data and
    the chosen measure

    @param data: dataset
    @param S: the covariates to fit
    @param measure: measure of predictiveness
    @param pred_func: function that fits to the data
    @param V: the number of CV folds
    @param stratified: should the folds be stratified?
    @param na_rm: should we do a complete-case analysis (True) or not (False)
    @param type: is this regression (use predict) or classification (use predict_proba)?
    @param ensemble: is this an ensemble (True) or not (False)?

    @return cross-validated measure of predictiveness, along with preds and ics
    """
    import numpy as np
    from compute_ic import compute_ic
    import utils as uts
    from data_generator import Dataset

    ## if na_rm = True, do a complete-case analysis
    if na_rm:
        xs = data.x_train[:, S]
        cc = np.sum(np.isnan(xs), axis=1) == 0
        newdata = Dataset(x_train=data.x_train[cc, :], y_train=data.y_train[cc])
    else:
        cc = np.repeat(True, data.x_train.shape[0])
        newdata = data

    ## set up CV folds
    folds = uts.make_folds(newdata, V, stratified=stratified)

    ## do CV
    preds = np.empty((data.y_train.shape[0], ))
    preds.fill(np.nan)
    ics = np.empty((data.y_train.shape[0], ))
    ics.fill(np.nan)
    # preds = np.empty((newdata.y_train.shape[0],))
    vs = np.empty((V, ))
    # ics = np.empty((newdata.y_train.shape[0],))
    cc_cond = np.flatnonzero(cc)
    for v in range(V):
        fold_cond = np.flatnonzero(folds == v)
        x_train, y_train = newdata.x_train[folds != v, :], newdata.y_train[folds != v]
        x_test, y_test = newdata.x_train[folds == v, :], newdata.y_train[folds == v]
        pred_func.fit(x_train[:, S], np.ravel(y_train))
        if ensemble:
            preds_v = np.mean(pred_func.transform(x_test[:, S]))
        else:
            if type == "classification":
                preds_v = pred_func.predict_proba(x_test[:, S])[:, 1]
            else:
                preds_v = pred_func.predict(x_test[:, S])
        preds[cc_cond[fold_cond]] = preds_v
        vs[v] = measure(y_test, preds_v)
        ics[cc_cond[fold_cond]] = compute_ic(y_test, preds_v, measure.__name__)
    return np.mean(vs), preds, ics, folds
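# --- Illustrative usage (not part of the original code) ---------------------
# A hypothetical driver for cv_predictiveness with a scikit-learn learner and
# AUC as the measure. Dataset comes from the project's data_generator module
# (imported inside the function above); the synthetic data is made up, and
# this assumes compute_ic recognizes the measure by its __name__.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from data_generator import Dataset

rng = np.random.RandomState(0)
x = rng.normal(size=(500, 10))
# binary outcome driven only by features 0 and 3
y = (x[:, 0] + 0.5 * x[:, 3] + rng.normal(size=500) > 0).astype(float)
data = Dataset(x_train=x, y_train=y)

# cross-validated AUC of a logistic regression restricted to features {0, 3}
v, preds, ics, folds = cv_predictiveness(
    data, S=[0, 3], measure=roc_auc_score,
    pred_func=LogisticRegression(), V=5, type="classification")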