Example #1
def mapper(key, output_collector):
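    # Map step (sketch of assumptions): fit an l2-penalized (ridge) logistic
    # regression on the resampled training split and collect predictions on
    # the test split. `preprocessing`, `estimators` and `penalty_start` are
    # module-level names assumed to be provided by the surrounding script.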
    import mapreduce as GLOBAL
    Xtr = GLOBAL.DATA_RESAMPLED["X"][0]
    Xte = GLOBAL.DATA_RESAMPLED["X"][1]
    ytr = GLOBAL.DATA_RESAMPLED["y"][0]
    yte = GLOBAL.DATA_RESAMPLED["y"][1]

    l2 = float(key[0])

    print(l2)

    class_weight = 'auto'  # unbiased

    scaler = preprocessing.StandardScaler().fit(Xtr)
    Xtr = scaler.transform(Xtr)
    Xte = scaler.transform(Xte)

    mod = estimators.RidgeLogisticRegression(l2, class_weight=class_weight,
                                             penalty_start=penalty_start)

    mod.fit(Xtr, ytr.ravel())
    y_pred = mod.predict(Xte)
    proba_pred = mod.predict_probability(Xte)
    ret = dict(y_pred=y_pred, y_true=yte, proba_pred=proba_pred, beta=mod.beta)
    if output_collector:
        output_collector.collect(key, ret)
    else:
        return ret
def init():
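    # Build the working directory and the mapreduce configuration: copy the
    # input arrays and mask, create the TV linear operator, define the CV
    # resampling schemes and parameter grids, then write the config and
    # cluster (PBS) job files. Globals such as WD, WD_ORIGINAL, NFOLDS_OUTER,
    # NFOLDS_INNER, penalty_start and user_func_filename are assumed to be
    # defined by the surrounding script.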
    INPUT_DATA_X = os.path.join(WD_ORIGINAL, 'X.npy')
    INPUT_DATA_y = os.path.join(WD_ORIGINAL, 'y.npy')
    INPUT_MASK_PATH = os.path.join(WD_ORIGINAL, 'mask.nii')
    #INPUT_LINEAR_OPE_PATH = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/data/30yo/Atv.npz'
    # INPUT_CSV = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/population_30yo.csv'

    os.makedirs(WD, exist_ok=True)
    shutil.copy(INPUT_DATA_X, WD)
    shutil.copy(INPUT_DATA_y, WD)
    shutil.copy(INPUT_MASK_PATH, WD)

    #shutil.copy(INPUT_LINEAR_OPE_PATH, WD)

    ## Create config file
    os.chdir(WD)
    X = np.load("X.npy")
    y = np.load("y.npy")

    if not os.path.exists(os.path.join(WD, "Atv.npz")):
        import nibabel
        import parsimony.functions.nesterov.tv as nesterov_tv
        from parsimony.utils.linalgs import LinearOperatorNesterov
        img = nibabel.load(os.path.join(WD, "mask.nii"))
        Atv = nesterov_tv.linear_operator_from_mask(img.get_data(),
                                                    calc_lambda_max=True)
        Atv.save(os.path.join(WD, "Atv.npz"))
        Atv_ = LinearOperatorNesterov(filename=os.path.join(WD, "Atv.npz"))
        assert Atv.get_singular_values(0) == Atv_.get_singular_values(0)
        assert np.allclose(Atv_.get_singular_values(0),
                           11.942045760666732,
                           rtol=1e-03,
                           atol=1e-03)
        assert np.all([
            a.shape == (X.shape[1] - penalty_start, X.shape[1] - penalty_start)
            for a in Atv
        ])

    if False and not os.path.exists(os.path.join(WD, "beta_start.npz")):
        betas = dict()
        import time
        alphas = [.01, 0.1, 1.0, 10]
        for alpha in alphas:
            mod = estimators.RidgeLogisticRegression(
                l=alpha, class_weight="auto", penalty_start=penalty_start)
            t_ = time.time()
            mod.fit(X, y.ravel())
            print(time.time() - t_)  # 11564
            betas["lambda_%.2f" % alpha] = mod.beta

        np.savez(os.path.join(WD, "beta_start.npz"), **betas)
        beta_start = np.load(os.path.join(WD, "beta_start.npz"))
        assert np.all(
            [np.all(beta_start[a] == betas[a]) for a in beta_start.keys()])

    ## Create config file

    #  ########################################################################
    #  Setting 1: 5cv + large range of parameters: cv_largerange
    #  with sub-sample training set with size 50, 100
    # 5cv/cv0*[_sub50]/refit/*

    # sub_sizes = [50, 100]
    sub_sizes = []

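    # Outer CV: stratified K-fold (NFOLDS_OUTER splits) over the labels.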
    cv_outer = [[
        tr, te
    ] for tr, te in StratifiedKFold(n_splits=NFOLDS_OUTER, random_state=42).
                split(np.zeros(y.shape[0]), y.ravel())]

    # check we get the same CV as previously
    cv_old = json.load(
        open(os.path.join(WD_ORIGINAL, "config_modselectcv.json")))["resample"]
    cv_outer_old = [
        cv_old[k] for k in ['cv%02d/refit' % i for i in range(NFOLDS_OUTER)]
    ]
    assert np.all([
        np.all(np.array(cv_outer_old[i][0]) == cv_outer[i][0])
        for i in range(NFOLDS_OUTER)
    ])
    assert np.all([
        np.all(np.array(cv_outer_old[i][1]) == cv_outer[i][1])
        for i in range(NFOLDS_OUTER)
    ])
    # check END

    import collections
    cv = collections.OrderedDict()

    cv["refit/refit"] = [np.arange(len(y)), np.arange(len(y))]

    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        # Simple CV
        cv["cv%02d/refit" % (cv_outer_i)] = [tr_val, te]

        # Nested CV
        # cv_inner = StratifiedKFold(y[tr_val].ravel(), n_folds=NFOLDS_INNER, random_state=42)
        # for cv_inner_i, (tr, val) in enumerate(cv_inner):
        #     cv["cv%02d/cvnested%02d" % ((cv_outer_i), cv_inner_i)] = [tr_val[tr], tr_val[val]]

        # Sub-sample training set with size 50, 100
        # => cv*_sub[50|100]/refit
        grps = np.unique(y[tr_val]).astype(int)
        ytr = y.copy()
        ytr[te] = np.nan
        g_idx = [np.where(ytr == g)[0] for g in grps]
        assert np.all([np.all(ytr[g_idx[g]] == g) for g in grps])

        g_size = np.array([len(g) for g in g_idx])
        g_prop = g_size / g_size.sum()

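        # Draw a class-stratified sub-sample of the training indices so that
        # class proportions are preserved at the reduced training size.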
        for sub_size in sub_sizes:
            # sub_size = sub_sizes[0]
            sub_g_size = np.round(g_prop * sub_size).astype(int)
            g_sub_idx = [
                np.random.choice(g_idx[g], sub_g_size[g], replace=False)
                for g in grps
            ]
            assert np.all([np.all(y[g_sub_idx[g]] == g) for g in grps])
            tr_val_sub = np.concatenate(g_sub_idx)
            assert len(tr_val_sub) == sub_size
            assert np.all([idx in tr_val for idx in tr_val_sub])
            assert np.all(np.logical_not([idx in te for idx in tr_val_sub]))
            cv["cv%02d_sub%i/refit" %
               (cv_outer_i, sub_size)] = [tr_val_sub, te]

    cv = {k: [cv[k][0].tolist(), cv[k][1].tolist()] for k in cv}

    # Nested CV
    # assert len(cv_largerange) == NFOLDS_OUTER * NFOLDS_INNER + NFOLDS_OUTER + 1

    # Simple CV
    # assert len(cv) == NFOLDS_OUTER + 1

    # Simple CV + sub-sample training set with size 50, 100:
    assert len(cv) == NFOLDS_OUTER * (1 + len(sub_sizes)) + 1

    print(list(cv.keys()))

    # Large grid of parameters
    alphas = [0.001, 0.01, 0.1, 1.0]
    # alphas = [.01, 0.1, 1.0] # first ran with this grid
    tv_ratio = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    l1l2_ratio = [0.1, 0.5, 0.9]
    #l1l2_ratio = [0, 0.1, 0.5, 0.9, 1.0] # first ran with this grid
    algos = ["enettv", "enetgn"]
    params_enet_tvgn = [
        list(param)
        for param in itertools.product(algos, alphas, l1l2_ratio, tv_ratio)
    ]
    assert len(params_enet_tvgn) == 240  # old 300

    params_enet = [
        list(param)
        for param in itertools.product(["enet"], alphas, l1l2_ratio, [0])
    ]
    assert len(params_enet) == 12  # old 15

    params = params_enet_tvgn + params_enet
    assert len(params) == 252  # 315
    # Simple CV
    # assert len(params) * len(cv) == 1890

    # Simple CV + sub-sample training set with size 50, 100:
    assert len(params) * len(cv) == 1512  # 1890

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params,
                  resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  beta_start="beta_start.npz",
                  map_output="5cv",
                  user_func=user_func_filename)
    json.dump(config, open(os.path.join(WD, "config_cv_largerange.json"), "w"))

    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    cmd = "mapreduce.py --map  %s/config_cv_largerange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD,
                                            cmd,
                                            walltime="250:00:00",
                                            suffix="_cv_largerange",
                                            freecores=2)

    #  ########################################################################
    #  Setting 2: dcv + reduced range of parameters: dcv_reducedrange
    #  5cv/cv0*/cvnested0*/*

    cv_outer = [[
        tr, te
    ] for tr, te in StratifiedKFold(n_splits=NFOLDS_OUTER, random_state=42).
                split(np.zeros(y.shape[0]), y.ravel())]

    # check we get the same CV as previously
    cv_old = json.load(
        open(os.path.join(WD_ORIGINAL, "config_modselectcv.json")))["resample"]
    cv_outer_old = [
        cv_old[k] for k in ['cv%02d/refit' % i for i in range(NFOLDS_OUTER)]
    ]
    assert np.all([
        np.all(np.array(cv_outer_old[i][0]) == cv_outer[i][0])
        for i in range(NFOLDS_OUTER)
    ])
    assert np.all([
        np.all(np.array(cv_outer_old[i][1]) == cv_outer[i][1])
        for i in range(NFOLDS_OUTER)
    ])
    # check END

    import collections
    cv = collections.OrderedDict()
    cv["refit/refit"] = [np.arange(len(y)), np.arange(len(y))]

    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        cv["cv%02d/refit" % (cv_outer_i)] = [tr_val, te]
        cv_inner = StratifiedKFold(n_splits=NFOLDS_INNER,
                                   random_state=42).split(
                                       np.zeros(y[tr_val].shape[0]),
                                       y[tr_val].ravel())
        for cv_inner_i, (tr, val) in enumerate(cv_inner):
            cv["cv%02d/cvnested%02d" %
               ((cv_outer_i), cv_inner_i)] = [tr_val[tr], tr_val[val]]

    cv = {k: [cv[k][0].tolist(), cv[k][1].tolist()] for k in cv}
    #assert len(cv) == NFOLDS_OUTER + 1
    assert len(cv) == NFOLDS_OUTER * NFOLDS_INNER + NFOLDS_OUTER + 1
    print(list(cv.keys()))

    # Reduced grid of parameters
    alphas = [0.001, 0.01, 0.1, 1.0]
    tv_ratio = [0.2, 0.8]
    l1l2_ratio = [0.1, 0.9]
    algos = ["enettv", "enetgn"]
    params_enet_tvgn = [
        list(param)
        for param in itertools.product(algos, alphas, l1l2_ratio, tv_ratio)
    ]
    assert len(params_enet_tvgn) == 32  # 16

    params_enet = [
        list(param)
        for param in itertools.product(["enet"], alphas, l1l2_ratio, [0])
    ]
    assert len(params_enet) == 8  # 4

    params = params_enet_tvgn + params_enet
    assert len(params) == 40  # 20
    assert len(params) * len(cv) == 1240  # 620

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params,
                  resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  beta_start="beta_start.npz",
                  map_output="5cv",
                  user_func=user_func_filename)
    json.dump(config,
              open(os.path.join(WD, "config_dcv_reducedrange.json"), "w"))

    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    cmd = "mapreduce.py --map  %s/config_dcv_reducedrange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD,
                                            cmd,
                                            walltime="250:00:00",
                                            suffix="_dcv_reducedrange",
                                            freecores=2)
def mapper(key, output_collector):
    """
    # debug mapper
    config = json.load(open(os.path.join(WD, "config_cv_largerange.json"), "r"))
    load_globals(config)
    resample(config, 'refit/refit')
    key = ('enettv', 0.01, 0.1, 0.3)
    """
    import mapreduce as GLOBAL
    Xtr = GLOBAL.DATA_RESAMPLED["X"][0]
    Xte = GLOBAL.DATA_RESAMPLED["X"][1]
    ytr = GLOBAL.DATA_RESAMPLED["y"][0]
    yte = GLOBAL.DATA_RESAMPLED["y"][1]

    # key = 'enettv_0.01_0.1_0.2'.split("_")
    algo = key[0]
    alpha, l1l2ratio, tvratio = float(key[1]), float(key[2]), float(key[3])

    tv = alpha * tvratio
    l1 = alpha * float(1 - tv) * l1l2ratio
    l2 = alpha * float(1 - tv) * (1 - l1l2ratio)
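    # Penalty decomposition used by the grid: alpha scales the overall
    # penalty, tvratio sets its TV share, and l1l2ratio splits the remaining
    # weight between the l1 and l2 terms.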

    print(key, algo, alpha, l1, l2, tv)
    # alpha = float(key[0])
    # l1, l2, tv = alpha * float(key[1]), alpha * float(key[2]), alpha * float(key[3])
    # print("l1:%f, l2:%f, tv:%f" % (l1, l2, tv))

    class_weight = "auto"  # unbiased

    # beta_start = GLOBAL.beta_start["lambda_%.4f" % alpha]
    # mask = np.ones(Xtr.shape[0], dtype=bool)

    # scaler = preprocessing.StandardScaler().fit(Xtr)
    # Xtr = scaler.transform(Xtr)
    # Xte = scaler.transform(Xte)
    if algo == 'enettv':
        conesta = algorithms.proximal.CONESTA(max_iter=10000)
        mod = estimators.LogisticRegressionL1L2TV(l1,
                                                  l2,
                                                  tv,
                                                  GLOBAL.Atv,
                                                  algorithm=conesta,
                                                  class_weight=class_weight,
                                                  penalty_start=penalty_start)
    elif algo == 'enetgn':
        fista = algorithms.proximal.FISTA(max_iter=5000)
        mod = estimators.LogisticRegressionL1L2GraphNet(
            l1,
            l2,
            tv,
            GLOBAL.Agn,
            algorithm=fista,
            class_weight=class_weight,
            penalty_start=penalty_start)
    elif algo == 'enet':
        fista = algorithms.proximal.FISTA(max_iter=5000)
        mod = estimators.ElasticNetLogisticRegression(
            l1l2ratio,
            alpha,
            algorithm=fista,
            class_weight=class_weight,
            penalty_start=penalty_start)
    elif algo == 'ridge':
        # assumption: plain ridge (l2 only) uses the l2 weight derived above
        mod = estimators.RidgeLogisticRegression(l2,
                                                 class_weight=class_weight,
                                                 penalty_start=penalty_start)
    else:
        raise Exception('Algo %s not handled' % algo)

    mod.fit(Xtr, ytr.ravel())
    y_pred = mod.predict(Xte)
    proba_pred = mod.predict_probability(Xte)
    ret = dict(y_pred=y_pred, y_true=yte, proba_pred=proba_pred,
               beta=mod.beta)  #, mask=mask)
    if output_collector:
        output_collector.collect(key, ret)
    else:
        return ret
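# Shared solver settings (tolerance, iteration cap, requested run info) reused
# by the parsimony estimators registered below; `info` is assumed to be
# defined earlier in the surrounding script.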
algorithm_params = dict(eps=1e-4, max_iter=20000, info=info)

## Get data structure from array shape

# l2 + gradient descent
if has_sklearn:
    MODELS["2d_l2_sklearn"] = \
        sklearn.linear_model.LogisticRegression(C=1. / alpha,
                                                fit_intercept=False,
                                                class_weight=None,
                                                dual=False)

# Parsimony: minimize f(beta, X, y) = - loglik + alpha/2 * ||beta||^2_2
MODELS["2d_l2_grad_descnt"] = \
    estimators.RidgeLogisticRegression(alpha, class_weight=None,
                                       mean=False,
                                       algorithm_params=algorithm_params)

if has_sklearn:
    MODELS["2d_l2_inter_sklearn"] = \
        sklearn.linear_model.LogisticRegression(C=1. / alpha,
                                                fit_intercept=True,
                                                class_weight=None,
                                                dual=False)

MODELS["2d_l2_inter_grad_descnt"] = \
    estimators.RidgeLogisticRegression(alpha, class_weight=None,
                                       mean=False,
                                       penalty_start=1,
                                       algorithm_params=algorithm_params)
# Empirically set the global penalty, based on the maximum l1 penalty
alpha = l1_max_logistic_loss(Xtr, ytr)

###############################################################################

# Ridge sklearn
# Min f(beta) = - C * loglik + 1/2 * ||beta||^2_2
ridge_sklrn = LogisticRegression(C=1. / (alpha * n_train), fit_intercept=False)

yte_pred_ridge = ridge_sklrn.fit(Xtr, ytr.ravel()).predict(Xte)
_, recall_ridge_sklrn, _, _ = \
    precision_recall_fscore_support(yte, yte_pred_ridge, average=None)

# Ridge Parsimony
#   Min  f(beta, X, y) = - loglik/n_train + k/2 * ||beta||^2_2
ridge_prsmy = estimators.RidgeLogisticRegression(alpha)

yte_pred_ridge_prsmy = ridge_prsmy.fit(Xtr, ytr).predict(Xte)
_, recall_ridge_prsmy, _, _ = \
    precision_recall_fscore_support(yte, yte_pred_ridge_prsmy, average=None)

# ElasticNet
enet = estimators.ElasticNetLogisticRegression(l=0.5, alpha=alpha)
yte_pred_enet = enet.fit(Xtr, ytr).predict(Xte)
_, recall_enet, _, _ = \
    precision_recall_fscore_support(yte, yte_pred_enet, average=None)

# GraphNet
# l1, l2, gn = alpha * np.array((.05, .75, .2))  # l1, l2, gn penalties
l1, l2, gn = alpha * np.array((.33, .33, .33))  # l1, l2, gn penalties
A = sparse.vstack(nesterov_tv.linear_operator_from_shape(shape))
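
# Hedged continuation (not in the original fragment): fit the GraphNet
# estimator with the penalties and linear operator built above, mirroring the
# other estimators in this snippet.
enetgn = estimators.LogisticRegressionL1L2GraphNet(l1, l2, gn, A)
yte_pred_enetgn = enetgn.fit(Xtr, ytr).predict(Xte)
_, recall_enetgn, _, _ = \
    precision_recall_fscore_support(yte, yte_pred_enetgn, average=None)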
Example #6
# Reload with alphas = [.01, 0.1, 1.0]
betas = np.load(os.path.join(OUTPUT, "beta_start.npz"))
betas = {"lambda_%.4f" %float(k.split("_")[1]):betas[k] for k in betas.keys()}
[[k, np.sum(betas[k] ** 2)] for k in betas.keys()]
[[k, betas[k].shape] for k in betas.keys()]

B = np.hstack([betas[k] for k in betas.keys()])
np.corrcoef(B.T)
"""
alphas = [0.0001, 0.001, 0.01, 0.1, 1.0]
# alphas = [0.0001, 0.001]

for alpha in alphas:
    mod = estimators.RidgeLogisticRegression(
        l=alpha,
        class_weight="auto",
        penalty_start=penalty_start,
        algorithm_params=dict(max_iter=10000))
    t_ = time.time()
    mod.fit(Xs, y.ravel())
    print(time.time() - t_, mod.algorithm.num_iter)  # 11564
    betas["lambda_%.4f" % alpha] = mod.beta

#np.savez(os.path.join(OUTPUT, "beta_start_1000ite.npz"), **betas)
np.savez(os.path.join(OUTPUT, "beta_start.npz"), **betas)

betas.keys()

beta_start = np.load(os.path.join(OUTPUT, "beta_start.npz"))
assert np.all([np.all(beta_start[a] == betas[a]) for a in beta_start.keys()])
"""