Example #1
def main():
    cfg = Config()

    data_dir = '../../data'

    train = pd.read_csv(os.path.join(data_dir, 'train_features.csv'))
    targets = pd.read_csv(os.path.join(data_dir, 'train_targets_scored.csv'))
    drug = pd.read_csv(os.path.join(data_dir, 'train_drug.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test_features.csv'))
    submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

    train = process_data_cp(train)
    test = process_data_cp(test)

    ff = make_folds(drug=drug,
                    folds=cfg.folds,
                    random_state=cfg.seed,
                    stratify=True,
                    scored=targets)

    train['fold'] = ff.fold.values
    targets['fold'] = ff.fold.values

    set_trace()
    # initialize
    oof = np.zeros((len(train), targets.shape[1] - 2))
    dna = np.random.uniform(0, 1, (cfg.population, 875))**2.0
    cvs = np.zeros((cfg.population))
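The make_folds call above is not part of the excerpt. In the Kaggle MoA setup it usually builds drug-aware folds, so a minimal sketch follows; the column names sig_id and drug_id and the round-robin assignment are assumptions, and the real helper would also stratify by the scored targets.

import numpy as np

def make_folds(drug, folds, random_state, stratify=True, scored=None):
    # Hypothetical sketch: give every sig_id a fold while keeping each drug_id
    # inside a single fold, so no drug leaks across train/validation splits.
    # Target stratification (the stratify/scored arguments) is omitted here.
    rng = np.random.RandomState(random_state)
    drug_ids = drug['drug_id'].unique()
    rng.shuffle(drug_ids)
    fold_of_drug = {d: i % folds for i, d in enumerate(drug_ids)}
    out = drug[['sig_id', 'drug_id']].copy()
    out['fold'] = out['drug_id'].map(fold_of_drug)
    return out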
Example #2
def cv(config, context, folds=5, repeat=2, save=False):
    # TODO: too much overloading on folds here
    if isinstance(folds, int):
        folds = make_folds(context.data.index, folds, repeat)
    scores = []
    for train, test in folds:
        context.train_index = train
        preds, x, y = predict(config, context, test)
        actuals = y.reindex(test)
        config.update_reporters_with_predictions(context, x, actuals, preds)
        scores.append(config.metric.score(actuals, preds))
    scores = np.array(scores)
    #if save:
        #dataset.save_models([(scores, copy.copy(config))])
    return scores
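Here make_folds receives an index, a fold count, and a repeat count and must yield (train, test) index pairs; the helper itself is not shown in any of these excerpts. A minimal sketch under that assumption:

import numpy as np

def make_folds(index, folds, repeat):
    # Hypothetical sketch: repeat the k-fold split `repeat` times,
    # reshuffling positions each round and yielding (train, test) index pairs.
    index = np.asarray(index)
    for _ in range(repeat):
        positions = np.random.permutation(len(index))
        for test_pos in np.array_split(positions, folds):
            train_pos = np.setdiff1d(positions, test_pos)
            yield index[train_pos], index[test_pos]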
Example #3
def cv(config,
       context,
       folds=5,
       repeat=2,
       print_results=False,
       predict_method=None,
       predict_update_column=None):
    # TODO: too much overloading on folds here
    if isinstance(folds, int):
        folds = make_folds(context.data.index, folds, repeat)
    #else:
    #folds.set_context(config, context)
    scores = {get_metric_name(m): [] for m in config.metrics}
    # we are overwriting indices, so make a copy
    ctx = context.copy()
    i = 0
    folds = list(folds)
    k = len(folds) / repeat
    for train, test in folds:
        print "\nCross-Validation fold %d/%d round %d/%d" % (i % k + 1, k,
                                                             i / k + 1, repeat)
        i += 1
        ctx.train_index = train
        ctx.test_index = test
        fold_scores, result = evaluate(config, ctx, test, predict_method,
                                       predict_update_column)
        context.latest_result = result
        for metric_, s in fold_scores.items():
            scores[metric_].append(s)
        if print_results:
            for metric_, s in scores.items():
                print "%s: %s" % (metric_, pprint_scores(s))
    result = {'config': config, 'scores': scores}

    # report results
    t = PrettyTable(["Reporter", "Report"])
    t.hrules = ALL
    t.align["Reporter"] = "l"
    t.align["Report"] = "l"
    for reporter in config.reporters:
        t.add_row([reporter.__class__.__name__, reporter.report()])
        reporter.reset()
    print t

    return result
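The summary table at the end uses the prettytable package; a self-contained illustration of that layout, with a made-up reporter row:

from prettytable import PrettyTable, ALL

t = PrettyTable(["Reporter", "Report"])
t.hrules = ALL            # horizontal rule between every row
t.align["Reporter"] = "l"
t.align["Report"] = "l"
t.add_row(["DummyReporter", "logloss: 0.4321"])
print(t)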
Example #4
def cv(config, context, folds=5, repeat=2, print_results=False):
    # TODO: too much overloading on folds here
    if isinstance(folds, int):
        folds = make_folds(context.data.index, folds, repeat)
    scores = dict([(m.name, []) for m in config.metrics])
    # we are overwriting indices, so make a copy
    ctx = context.copy()
    for train, test in folds:
        ctx.train_index = train
        preds, x, y = predict(config, ctx, test)
        actuals = y.reindex(test)
        config.update_reporters_with_predictions(ctx, x, actuals, preds)
        for metric in config.metrics:
            scores[metric.name].append(metric.score(actuals, preds))
    #if save:
    #dataset.save_models([(scores, copy.copy(config))])
    if print_results:
        print "\n" + str(config)
        print_scores(scores)
    return scores
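print_scores is another project helper that the excerpt does not include; a plausible minimal stand-in that summarizes each metric's per-fold values:

import numpy as np

def print_scores(scores):
    # scores maps metric name -> list of per-fold values
    for name in sorted(scores):
        vals = np.asarray(scores[name], dtype=float)
        print("%s: mean %.4f, std %.4f over %d folds"
              % (name, vals.mean(), vals.std(), len(vals)))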
Example #5
def cv(config, context, folds=5, repeat=2, print_results=False):
    # TODO: too much overloading on folds here
    if isinstance(folds, int):
        folds = make_folds(context.data.index, folds, repeat)
    else:
        folds.set_context(config, context)
    scores = {m.name: [] for m in config.metrics}
    # we are overwriting indices, so make a copy
    ctx = context.copy()
    i = 0
    folds = list(folds)
    k = len(folds) / repeat
    for train, test in folds:
        print "\nCross-Validation fold %d/%d round %d/%d" % (i % k + 1, k, i/k + 1, repeat)
        i += 1
        ctx.train_index = train
        ctx.test_index = test
        preds, x, y = predict(config, ctx, test)
        actuals = y.reindex(test)
        config.update_reporters_with_predictions(ctx, x, actuals, preds)
        for metric in config.metrics:
            scores[metric.name].append(metric.score(actuals, preds))
    result = {'config': config, 'scores': scores}

    # report results
    t = PrettyTable(["Reporter", "Report"])
    t.hrules = ALL
    t.align["Reporter"] = "l"
    t.align["Report"] = "l"
    for reporter in config.reporters:
        t.add_row([reporter.__class__.__name__, reporter.report()])
        reporter.reset()
    print t
    
    #if save:
        #dataset.save_models([(scores, copy.copy(config))])
    if print_results:
        print "\n" + str(config)
        print_scores(scores)
    return result
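The predict(config, ctx, test) helper shared by several of these cv() variants is not shown either; it has to return (predictions, features, full target series) given the indices stored on the context. A rough sketch, where config.model, config.features and config.target are assumed attribute names, not part of the original API:

def predict(config, context, test_index):
    # Hypothetical sketch: fit on the rows named by context.train_index and
    # score the rows named by test_index; x and y are returned in full so the
    # caller can reindex the actuals itself.
    x = context.data[config.features]
    y = context.data[config.target]
    config.model.fit(x.loc[context.train_index], y.loc[context.train_index])
    preds = config.model.predict(x.loc[test_index])
    return preds, x, y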
Example #6
def cv(config, context, folds=5, repeat=2, print_results=False):
    # TODO: too much overloading on folds here
    if isinstance(folds, int):
        folds = make_folds(context.data.index, folds, repeat)
    scores = dict([(m.name, []) for m in config.metrics])
    # we are overwriting indices, so make a copy
    ctx = context.copy()
    for train, test in folds:
        ctx.train_index = train
        preds, x, y = predict(config, ctx, test)
        actuals = y.reindex(test)
        config.update_reporters_with_predictions(ctx, x, actuals, preds)
        for metric in config.metrics:
            scores[metric.name].append(metric.score(actuals, preds))
    #if save:
        #dataset.save_models([(scores, copy.copy(config))])
    if print_results:
        print "\n" + str(config)
        print_scores(scores)
    return scores
Example #7
def cv(config, context, folds=5, repeat=2, print_results=False,
       predict_method=None, predict_update_column=None):
    # TODO: too much overloading on folds here
    if isinstance(folds, int):
        folds = make_folds(context.data.index, folds, repeat)
    #else:
        #folds.set_context(config, context)
    scores = {get_metric_name(m): [] for m in config.metrics}
    # we are overwriting indices, so make a copy
    ctx = context.copy()
    i = 0
    folds = list(folds)
    k = len(folds) / repeat
    for train, test in folds:
        print "\nCross-Validation fold %d/%d round %d/%d" % (i % k + 1, k, i/k + 1, repeat)
        i += 1
        ctx.train_index = train
        ctx.test_index = test
        fold_scores, result = evaluate(config, ctx, test, predict_method, predict_update_column)
        context.latest_result = result
        for metric_, s in fold_scores.items():
            scores[metric_].append(s)
        if print_results:
            for metric_, s in scores.items():
                print "%s: %s" % (metric_, pprint_scores(s))
    result = {'config': config, 'scores': scores}

    # report results
    t = PrettyTable(["Reporter", "Report"])
    t.hrules = ALL
    t.align["Reporter"] = "l"
    t.align["Report"] = "l"
    for reporter in config.reporters:
        t.add_row([reporter.__class__.__name__, reporter.report()])
        reporter.reset()
    print t
    
    return result
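get_metric_name and pprint_scores are also project-specific helpers; plausible stand-ins consistent with how they are called above:

def get_metric_name(m):
    # prefer an explicit .name attribute, fall back to the class name
    return getattr(m, 'name', m.__class__.__name__)

def pprint_scores(vals):
    # render a list of per-fold scores as "mean (min..max)"
    vals = [float(v) for v in vals]
    return "%.4f (%.4f..%.4f)" % (sum(vals) / len(vals), min(vals), max(vals))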
Example #8
def cv_predictiveness(data,
                      S,
                      measure,
                      pred_func,
                      V=5,
                      stratified=True,
                      na_rm=False,
                      type="regression",
                      ensemble=False,
                      run_cv=False):
    """
    Compute a cross-validated measure of predictiveness based on the data
    and the chosen measure

    @param data: dataset
    @param S: the covariates to fit
    @param measure: measure of predictiveness
    @param pred_func: function that fits to the data
    @param V: the number of CV folds
    @param stratified: should the folds be stratified?
    @param na_rm: should we do a complete-case analysis (True) or not (False)
    @param type: is this regression (use predict) or classification (use predict_proba)?
    @param ensemble: is this an ensemble (True) or not (False)?
    @param run_cv: accepted for API compatibility; not used in this function body

    @return cross-validated measure of predictiveness, along with the out-of-fold
        preds, the ics, and the fold assignments
    """
    import numpy as np
    from compute_ic import compute_ic
    import utils as uts
    from data_generator import Dataset
    ## if na_rm = True, do a complete-case analysis
    if na_rm:
        xs = data.x_train[:, S]
        cc = np.sum(np.isnan(xs), axis=1) == 0
        newdata = Dataset(x_train=data.x_train[cc, :],
                          y_train=data.y_train[cc])
    else:
        cc = np.repeat(True, data.x_train.shape[0])
        newdata = data
    ## set up CV folds
    folds = uts.make_folds(newdata, V, stratified=stratified)
    ## do CV
    preds = np.empty((data.y_train.shape[0], ))
    preds.fill(np.nan)
    ics = np.empty((data.y_train.shape[0], ))
    ics.fill(np.nan)
    # preds = np.empty((newdata.y_train.shape[0],))
    vs = np.empty((V, ))
    # ics = np.empty((newdata.y_train.shape[0],))
    cc_cond = np.flatnonzero(cc)
    for v in range(V):
        fold_cond = np.flatnonzero(folds == v)
        x_train, y_train = newdata.x_train[folds != v, :], newdata.y_train[folds != v]
        x_test, y_test = newdata.x_train[folds == v, :], newdata.y_train[folds == v]
        pred_func.fit(x_train[:, S], np.ravel(y_train))
        if ensemble:
            preds_v = np.mean(pred_func.transform(x_test[:, S]))
        else:
            if type == "classification":
                preds_v = pred_func.predict_proba(x_test[:, S])[:, 1]
            else:
                preds_v = pred_func.predict(x_test[:, S])
        preds[cc_cond[fold_cond]] = preds_v
        vs[v] = measure(y_test, preds_v)
        ics[cc_cond[fold_cond]] = compute_ic(y_test, preds_v, measure.__name__)
    return np.mean(vs), preds, ics, folds
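uts.make_folds in this last example has to return one fold label per training row (the loop masks with folds == v); a guessed implementation with optional stratification over discrete labels:

import numpy as np

def make_folds(data, V, stratified=True):
    # Hypothetical sketch: return an integer fold id in [0, V) for every row.
    y = np.ravel(data.y_train)
    folds = np.empty(y.shape[0], dtype=int)
    if stratified:
        # deal each class's rows into folds separately to keep label balance
        for cls in np.unique(y):
            idx = np.flatnonzero(y == cls)
            np.random.shuffle(idx)
            folds[idx] = np.arange(idx.shape[0]) % V
    else:
        folds[:] = np.arange(y.shape[0]) % V
        np.random.shuffle(folds)
    return folds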