Esempio n. 1
0
def simu_autocutoff(n_simu, n_samples, n_features, n_cut_points):
    """Run one auto-cutoff simulation and apply the Lausen & Schumacher
    correction to the per-feature minimal p-values.

    The run index ``n_simu`` is used as the simulation seed, so repeated
    calls with the same arguments are reproducible.
    """
    simulator = SimuCoxRegWithCutPoints(n_samples=n_samples,
                                        n_features=n_features,
                                        n_cut_points=n_cut_points,
                                        seed=n_simu,
                                        verbose=False,
                                        shape=2,
                                        scale=.1,
                                        cov_corr=.5,
                                        sparsity=.2)
    X, Y, delta, cut_points, beta_star, S = simulator.simulate()

    # Binarize only to recover the candidate cut-point boundaries.
    binarizer = FeaturesBinarizer(n_cuts=50)
    binarizer.fit_transform(X)
    boundaries = binarizer.boundaries
    epsilon = 10

    start = time()
    testing_results = multiple_testing(X,
                                       boundaries,
                                       Y,
                                       delta,
                                       epsilon=epsilon)
    # Lausen & Schumacher correction, one feature at a time.
    p_values_corr, p_values_min, cut_points_estimates = [], [], []
    n_tested = []
    for j in range(n_features):
        result_j = testing_results[j]
        n_tested.append(result_j.values_to_test.shape[0])
        p_values_min.append(result_j.p_values.min())
        p_values_corr.append(
            p_value_cut(result_j.p_values, result_j.values_to_test,
                        X[:, j], epsilon))
        # The estimated cut-point is the candidate with the smallest p-value.
        best_idx = result_j.p_values.argmin()
        cut_points_estimates.append(result_j.values_to_test[best_idx])
    elapsed = time() - start

    print(p_values_min)

    return [
        n_samples, cut_points, S, cut_points_estimates, p_values_min, n_tested,
        p_values_corr, elapsed
    ]
Esempio n. 2
0
    def test_binarizer_fit(self):
        """...Test binarizer fit
        """
        expected_binarization = OneHotEncoder(sparse=True).fit_transform(
            self.default_expected_intervals)

        binarizer = FeaturesBinarizer(method='quantile',
                                      n_cuts=3,
                                      detect_column_type="auto",
                                      remove_first=False)

        # pandas DataFrame input: fit then transform
        binarizer.fit(self.df_features)
        binarized_df = binarizer.transform(self.df_features)
        self.assertEqual(binarized_df.__class__, csr.csr_matrix)
        np.testing.assert_array_equal(expected_binarization.toarray(),
                                      binarized_df.toarray())

        # numpy array input: fit then transform
        binarizer.fit(self.features)
        binarized_array = binarizer.transform(self.features)
        self.assertEqual(binarized_array.__class__, csr.csr_matrix)
        np.testing.assert_array_equal(expected_binarization.toarray(),
                                      binarized_array.toarray())

        # numpy array input: fit_transform in a single call
        binarized_array = binarizer.fit_transform(self.features)
        self.assertEqual(binarized_array.__class__, csr.csr_matrix)
        np.testing.assert_array_equal(expected_binarization.toarray(),
                                      binarized_array.toarray())
Esempio n. 3
0
def get_times2(n_simu, n_samples, n_features, n_cut_points):
    """Time the Binacox fitting step (binarization + penalized Cox fit)
    on one simulated dataset and return the elapsed seconds.

    Relies on module-level ``cov_corr`` and ``sparsity`` for the simulation.
    """
    print("  n_simu=%s" % n_simu)
    simu = SimuCoxRegWithCutPoints(n_samples=n_samples, n_features=n_features,
                                   seed=n_simu, verbose=False,
                                   n_cut_points=n_cut_points,
                                   shape=2, scale=.1, cov_corr=cov_corr,
                                   sparsity=sparsity)
    X, Y, delta, cut_points, beta_star, S = simu.simulate()

    # Binacox method: everything below is included in the timing.
    start = time()
    binarizer = FeaturesBinarizer(n_cuts=50)
    X_bin = binarizer.fit_transform(X)
    learner = CoxRegression(penalty='binarsity', tol=1e-5,
                            solver='agd', verbose=False,
                            max_iter=100, step=0.3,
                            blocks_start=binarizer.blocks_start,
                            blocks_length=binarizer.blocks_length,
                            C=25, warm_start=True)
    learner._solver_obj.linesearch = False
    learner.fit(X_bin, Y, delta)
    return time() - start
Esempio n. 4
0
    def test_LogisticRegression_fit(self):
        """...Test LogisticRegression fit with different solvers and penalties

        Fits one learner for every (fit_intercept, penalty, solver)
        combination and checks each reaches AUC > 0.7 on its own training
        data.  Relies on ``penalties`` and ``solvers`` defined at module
        level (not visible here) -- presumably lists of names accepted by
        ``LogisticRegression``; verify against the test module header.
        """
        # Fixed seed so the stochastic solvers (sgd/svrg/sdca) are reproducible.
        sto_seed = 179312
        raw_features, y = Test.get_train_data()

        for fit_intercept in [True, False]:
            for penalty in penalties:

                if penalty == 'binarsity':
                    # binarize features (binarsity operates on binarized blocks)
                    n_cuts = 3
                    binarizer = FeaturesBinarizer(n_cuts=n_cuts)
                    features = binarizer.fit_transform(raw_features)
                else:
                    features = raw_features

                for solver in solvers:
                    solver_kwargs = {
                        'penalty': penalty,
                        'tol': 1e-5,
                        'solver': solver,
                        'verbose': False,
                        'max_iter': 10,
                        'fit_intercept': fit_intercept
                    }

                    # every penalty except 'none' needs a strength parameter
                    if penalty != 'none':
                        solver_kwargs['C'] = 100

                    # binarsity additionally needs the block structure
                    # produced by the binarizer above
                    if penalty == 'binarsity':
                        solver_kwargs['blocks_start'] = binarizer.blocks_start
                        solver_kwargs[
                            'blocks_length'] = binarizer.blocks_length

                    if solver == 'sdca':
                        solver_kwargs['sdca_ridge_strength'] = 2e-2

                    if solver in ['sgd', 'svrg', 'sdca']:
                        solver_kwargs['random_state'] = sto_seed

                    if solver == 'sgd':
                        solver_kwargs['step'] = 1.

                    if solver == 'bfgs':
                        # BFGS only accepts ProxZero and ProxL2sq for now
                        if penalty not in ['none', 'l2']:
                            continue

                    learner = LogisticRegression(**solver_kwargs)
                    learner.fit(features, y)
                    probas = learner.predict_proba(features)[:, 1]
                    auc = roc_auc_score(y, probas)
                    self.assertGreater(
                        auc, 0.7, "solver %s with penalty %s and "
                        "intercept %s reached too low AUC" %
                        (solver, penalty, fit_intercept))
Esempio n. 5
0
def get_times1(n_simu, n_samples, n_features, n_cut_points):
    """Compare running times of the Binacox method against the auto-cutoff
    method (tested on all values, then on the binarization grid) for one
    simulated dataset.

    Relies on module-level ``cov_corr`` and ``sparsity`` for the simulation.

    Returns
    -------
    tuple (n_samples, time_bina, time_ac_all, time_ac_grid), times in
    seconds.
    """
    print("  n_simu=%s" % n_simu)
    seed = n_simu
    simu = SimuCoxRegWithCutPoints(n_samples=n_samples, n_features=n_features,
                                   seed=seed, verbose=False,
                                   n_cut_points=n_cut_points,
                                   shape=2, scale=.1, cov_corr=cov_corr,
                                   sparsity=sparsity)
    X, Y, delta, cut_points, beta_star, S = simu.simulate()

    # Binacox method
    # BUG FIX: `tic` was never assigned before `time_bina = tac - tic`,
    # which raised NameError.  Start the timer here, mirroring get_times2
    # which times the same binarization + fit section.
    tic = time()
    n_cuts = 50
    binarizer = FeaturesBinarizer(n_cuts=n_cuts)
    X_bin = binarizer.fit_transform(X)
    blocks_start = binarizer.blocks_start
    blocks_length = binarizer.blocks_length
    boundaries = binarizer.boundaries['0']

    solver = 'agd'
    learner = CoxRegression(penalty='binarsity', tol=1e-5,
                            solver=solver, verbose=False,
                            max_iter=100, step=0.3,
                            blocks_start=blocks_start,
                            blocks_length=blocks_length,
                            C=25, warm_start=True)
    learner._solver_obj.linesearch = False
    learner.fit(X_bin, Y, delta)
    tac = time()
    time_bina = tac - tic

    # Auto Cutoff Method: test every observed value between the epsilon-th
    # and (100 - epsilon)-th percentiles.
    X = np.array(X)
    epsilon = 10
    p1 = np.percentile(X, epsilon)
    p2 = np.percentile(X, 100 - epsilon)
    values_to_test = X[np.where((X <= p2) & (X >= p1))]
    tic = time()
    get_p_values_j(X, 0, Y, delta, values_to_test, epsilon)
    tac = time()
    time_ac_all = tac - tic

    # Same method, but restricted to the binarization grid boundaries.
    tic = time()
    p1 = np.percentile(X, epsilon)
    p2 = np.percentile(X, 100 - epsilon)
    values_to_test = boundaries[
        np.where((boundaries <= p2) & (boundaries >= p1))]
    get_p_values_j(X, 0, Y, delta, values_to_test, epsilon)
    tac = time()
    time_ac_grid = tac - tic

    return n_samples, time_bina, time_ac_all, time_ac_grid
Esempio n. 6
0
    def test_CoxRegression_fit(self):
        """...Test CoxRegression fit with different solvers and penalties

        For each penalty, fits the model with every solver in
        ``self.solvers`` and compares the learned coefficients against the
        hard-coded reference values below (to 1 decimal place).
        """
        raw_features, times, censoring = Test.get_train_data()

        # Reference coefficients per penalty -- presumably captured from a
        # known-good run; verify against the project's regression baselines.
        coeffs_pen = {
            'none':
            np.array([
                -0.03068462, 0.03940001, 0.16758354, -0.24838003, 0.16940664,
                0.9650363, -0.14818724, -0.0802245, -1.52869811, 0.0414509
            ]),
            'l2':
            np.array([
                -0.02403681, 0.03455527, 0.13470436, -0.21654892, 0.16637723,
                0.83125941, -0.08555382, -0.12914753, -1.35294435, 0.02312935
            ]),
            'l1':
            np.array([
                0., 1.48439371e-02, 1.03806171e-01, -1.57313537e-01,
                1.40448847e-01, 8.05306416e-01, -5.41296030e-02,
                -1.07753576e-01, -1.37612207e+00, 6.43289248e-05
            ]),
            'elasticnet':
            np.array([
                0., 0.01011823, 0.10530518, -0.16885214, 0.14373715,
                0.82547312, -0.06122141, -0.09479487, -1.39454662, 0.00312597
            ]),
            'tv':
            np.array([
                0.03017556, 0.03714465, 0.0385349, -0.10169967, 0.15783755,
                0.64860815, -0.00617636, -0.22235137, -1.07938977, -0.07181225
            ]),
            # binarsity coefficients live in the binarized space, hence the
            # longer vector (one coefficient per bin, grouped per feature).
            'binarsity':
            np.array([
                0.03794176, -0.04473702, 0.00339763, 0.00339763, -0.16493989,
                0.05497996, 0.05497996, 0.05497996, -0.08457476, -0.08457476,
                0.0294825, 0.13966702, 0.10251257, 0.02550264, -0.07207419,
                -0.05594102, -0.10018038, -0.10018038, 0.10018038, 0.10018038,
                -0.47859686, -0.06685181, -0.00850803, 0.55395669, 0.00556327,
                -0.00185442, -0.00185442, -0.00185442, 0.26010429, 0.09752455,
                -0.17881442, -0.17881442, 0.932516, 0.32095387, -0.49766315,
                -0.75580671, 0.0593833, -0.01433773, 0.01077109, -0.05581666
            ])
        }

        for penalty in self.penalties:

            if penalty == 'binarsity':
                # binarize features (binarsity operates on binarized blocks)
                n_cuts = 3
                binarizer = FeaturesBinarizer(n_cuts=n_cuts)
                features = binarizer.fit_transform(raw_features)
            else:
                features = raw_features

            for solver in self.solvers:

                solver_kwargs = {
                    'penalty': penalty,
                    'tol': 0,
                    'solver': solver,
                    'verbose': False,
                    'max_iter': 10
                }

                # every penalty except 'none' needs a strength parameter
                if penalty != 'none':
                    solver_kwargs['C'] = 50

                # binarsity additionally needs the block structure derived
                # from the binarizer's feature indices / value counts
                if penalty == 'binarsity':
                    solver_kwargs['blocks_start'] = \
                        binarizer.feature_indices[:-1, ]
                    solver_kwargs['blocks_length'] = binarizer.n_values

                learner = CoxRegression(**solver_kwargs)
                learner.fit(features, times, censoring)

                # loose tolerance: solvers only run 10 iterations
                np.testing.assert_array_almost_equal(coeffs_pen[penalty],
                                                     learner.coeffs,
                                                     decimal=1)
Esempio n. 7
0
    def _simulate(self):
        """Simulate right-censored survival data whose hazard depends on
        binarized features.

        For each feature, cut-points are drawn among its deciles and a
        piecewise-constant coefficient block is built per interval (zeroed
        for features in the sparsity set ``S``).  Survival times are then
        generated from the binarized linear predictor and censored with an
        independent exponential time.

        Uses the global ``np.random`` state -- the seed is presumably set
        by the caller / superclass; verify there for reproducibility.

        Returns
        -------
        (features, times, censoring, cut_points, coeffs_binarized, S)
        """
        # The features matrix already exists, and is created by the
        # super class
        features = self.features
        n_samples, n_features = features.shape
        # Simulation of cut-points
        n_cut_points = self.n_cut_points
        n_cut_points_factor = self.n_cut_points_factor
        sparsity = self.sparsity
        s = round(n_features * sparsity)
        # sparsity index set: features whose coefficient blocks are zeroed
        S = np.random.choice(n_features, s, replace=False)

        # Either a geometric number of cut-points per feature, or the same
        # fixed number for all features.
        if n_cut_points is None:
            n_cut_points = np.random.geometric(n_cut_points_factor, n_features)
        else:
            n_cut_points = np.repeat(n_cut_points, n_features)

        cut_points = {}
        coeffs_binarized = np.array([])
        for j in range(n_features):
            feature_j = features[:, j]
            # candidate cut-points: the 10th..90th percentiles of feature j
            quantile_cuts = np.linspace(10, 90, 10)
            candidates = np.percentile(feature_j,
                                       quantile_cuts,
                                       interpolation="nearest")
            cut_points_j = np.random.choice(candidates,
                                            n_cut_points[j],
                                            replace=False)
            cut_points_j = np.sort(cut_points_j)
            # pad with -inf/+inf so the intervals cover the real line
            cut_points_j = np.insert(cut_points_j, 0, -np.inf)
            cut_points_j = np.append(cut_points_j, np.inf)
            cut_points[str(j)] = cut_points_j
            # generate beta star
            if j in S:
                # feature in the sparsity set: no effect
                coeffs_block = np.zeros(n_cut_points[j] + 1)
            else:
                coeffs_block = np.random.normal(1, .5, n_cut_points[j] + 1)
                # make sure 2 consecutive coeffs are different enough
                # (alternate signs of the absolute values)
                coeffs_block = np.abs(coeffs_block)
                coeffs_block[::2] *= -1
            # sum-to-zero constraint in each block
            coeffs_block = coeffs_block - coeffs_block.mean()
            coeffs_binarized = np.append(coeffs_binarized, coeffs_block)

        # Binarize with the *true* cut-points to build the linear predictor.
        binarizer = FeaturesBinarizer(method='given',
                                      bins_boundaries=cut_points)
        binarized_features = binarizer.fit_transform(features)

        u = binarized_features.dot(coeffs_binarized)
        # Simulation of true times via inverse-transform sampling:
        # E ~ Exp(1) scaled by exp(-u), then mapped through the Weibull
        # quantile-like transform below.
        E = np.random.exponential(scale=1., size=n_samples)
        E *= np.exp(-u)
        scale = self.scale
        shape = self.shape
        if self.times_distribution == "weibull":
            T = 1. / scale * E**(1. / shape)
        else:
            # There is not point in this test, but let's do it like that
            # since we're likely to implement other distributions
            T = 1. / scale * E**(1. / shape)

        m = T.mean()
        # Simulation of the censoring: exponential with mean proportional
        # to the mean true time, tuned by censoring_factor
        c = self.censoring_factor
        C = np.random.exponential(scale=c * m, size=n_samples)
        # Observed time
        self._set("times", np.minimum(T, C).astype(self.dtype))
        # Censoring indicator: 1 if it is a time of failure, 0 if censoring.
        censoring = (T <= C).astype(np.ushort)
        self._set("censoring", censoring)
        return self.features, self.times, self.censoring, cut_points, \
               coeffs_binarized, S
Esempio n. 8
0
def fit_and_score(features, features_bin, times, censoring, blocks_start,
                  blocks_length, boundaries, features_names, idx_train,
                  idx_test, validation_data, C):
    """Fit a binarsity-penalized Cox model on the training split, derive
    cut-point estimates from the fused coefficient blocks, re-binarize on
    those estimates, re-fit an (almost) unpenalized Cox model, and score
    it on the test split (and optionally on external validation data).

    Parameters
    ----------
    features, features_bin : raw and binarized design matrices
    times, censoring : survival outcome arrays
    blocks_start, blocks_length : block structure of ``features_bin``
    boundaries : dict mapping feature name -> bin boundaries of
        ``features_bin``
    features_names : list of feature names, or None to use column indices
    idx_train, idx_test : row indices of the train / test split
    validation_data : None, or an (X, times, censoring) triple
    C : binarsity penalty strength for the first (cut-point finding) fit

    Returns
    -------
    (score, score_validation) -- ``score_validation`` is None when no
    validation data is given.
    """
    if features_names is None:
        features_names = [str(j) for j in range(features.shape[1])]
    X_train, X_test = features_bin[idx_train], features_bin[idx_test]
    Y_train, Y_test = times[idx_train], times[idx_test]
    delta_train, delta_test = censoring[idx_train], censoring[idx_test]

    # First fit: penalized model whose fused coefficients reveal cut-points.
    learner = CoxRegression(penalty='binarsity',
                            tol=1e-5,
                            verbose=False,
                            max_iter=100,
                            step=0.3,
                            blocks_start=blocks_start,
                            blocks_length=blocks_length,
                            warm_start=True)
    learner._solver_obj.linesearch = False
    learner.C = C
    learner.fit(X_train, Y_train, delta_train)
    coeffs = learner.coeffs

    # A cut-point is estimated wherever the group label of consecutive
    # coefficients jumps inside a block.
    cut_points_estimates = {}
    for j, start in enumerate(blocks_start):
        coeffs_j = coeffs[start:start + blocks_length[j]]
        all_zeros = not np.any(coeffs_j)
        if all_zeros:
            # fully-zero block: the feature carries no detected signal
            cut_points_estimate_j = np.array([-np.inf, np.inf])
        else:
            groups_j = get_groups(coeffs_j)
            jump_j = np.where(groups_j[1:] - groups_j[:-1] != 0)[0] + 1
            if jump_j.size == 0:
                cut_points_estimate_j = np.array([-np.inf, np.inf])
            else:
                cut_points_estimate_j = boundaries[features_names[j]][jump_j]
                # pad with -inf/+inf so the intervals cover the real line
                if cut_points_estimate_j[0] != -np.inf:
                    cut_points_estimate_j = np.insert(cut_points_estimate_j, 0,
                                                      -np.inf)
                if cut_points_estimate_j[-1] != np.inf:
                    cut_points_estimate_j = np.append(cut_points_estimate_j,
                                                      np.inf)
        cut_points_estimates[features_names[j]] = cut_points_estimate_j
    # Re-binarize the raw features on the estimated cut-points.
    binarizer = FeaturesBinarizer(method='given',
                                  bins_boundaries=cut_points_estimates)
    binarized_features = binarizer.fit_transform(features)
    blocks_start = binarizer.blocks_start
    blocks_length = binarizer.blocks_length
    X_bin_train = binarized_features[idx_train]
    X_bin_test = binarized_features[idx_test]
    # Second fit: C=1e10 makes the binarsity penalty negligible, so this is
    # effectively an unpenalized refit on the estimated intervals.
    learner_ = CoxRegression(penalty='binarsity',
                             tol=1e-5,
                             verbose=False,
                             max_iter=100,
                             step=0.3,
                             blocks_start=blocks_start,
                             blocks_length=blocks_length,
                             warm_start=True,
                             C=1e10)
    learner_._solver_obj.linesearch = False
    learner_.fit(X_bin_train, Y_train, delta_train)
    score = learner_.score(X_bin_test, Y_test, delta_test)

    if validation_data is not None:
        X_validation = validation_data[0]
        # NOTE(review): fit_transform is called on the validation features;
        # with method='given' the boundaries are fixed so this should match
        # a plain transform -- confirm against FeaturesBinarizer semantics.
        X_bin_validation = binarizer.fit_transform(X_validation)
        Y_validation = validation_data[1]
        delta_validation = validation_data[2]
        score_validation = learner_.score(X_bin_validation, Y_validation,
                                          delta_validation)
    else:
        score_validation = None

    return score, score_validation
Esempio n. 9
0
def binarsity_reg(X, y, grid_C=np.logspace(-2, 2, 10), C=None, verbose=True):
    """Linear regression on binarized features with the binarsity penalty.

    Parameters
    ----------
    X : pandas.DataFrame of continuous features; column names must carry
        the type suffix (e.g. ``"0:continuous"``) understood by
        ``FeaturesBinarizer(detect_column_type="column_names")``.

    y : numpy.ndarray, dtype='float'
        labels

    grid_C : list or numpy.ndarray, dtype='float'
        grid of weights associated to the binarsity penalty from which the
        final one is chosen by cross-validation when ``C`` is None

    C : float (positive) or None
        weight associated to the binarsity penalty

    verbose : bool
        if True, prints additional info

    Returns
    -------
    cut_points_estimates : dict, length = dimension of observation space

    final_coeffs : dict with keys "intercept" and "weights"

    blocks_start : numpy.ndarray

    all_groups : list

    coeffs : numpy.ndarray

    regr.C : float
    """
    # BUG FIX: time.clock() was removed in Python 3.8; perf_counter() is
    # the documented replacement for elapsed-time measurement.
    t0 = time.perf_counter()
    if verbose:
        print("binarsity regression number of observations :", len(X))
    n_cuts = 50
    binarizer = FeaturesBinarizer(n_cuts=n_cuts,
                                  detect_column_type="column_names")
    X_bin = binarizer.fit_transform(X)

    features_names = list(X.columns)
    boundaries = binarizer.bins_boundaries

    blocks_start = binarizer.blocks_start
    blocks_length = binarizer.blocks_length

    n_folds = 5

    if C is None:
        # Cross-validate over grid_C; C_chosen is the smallest C whose mean
        # score is within one std of the best ("one-standard-error" rule).
        scores_cv = pd.DataFrame(columns=['C', 'scores_mean', 'scores_std'])
        for i, C_i in enumerate(grid_C):
            scores = compute_score(X,
                                   X_bin,
                                   y,
                                   blocks_start,
                                   blocks_length,
                                   C=C_i,
                                   n_folds=n_folds)
            scores = [C_i] + scores
            scores_cv.loc[i] = scores
        if verbose:
            print("cross_val scores :")
            print(scores_cv.round(3))

        idx_min = scores_cv.scores_mean.argmin()
        C_best = grid_C[idx_min]

        idx_chosen = min([
            i for i, j in enumerate(
                list(scores_cv.scores_mean <= scores_cv.scores_mean.min() +
                     scores_cv.scores_std[idx_min])) if j
        ])
        C_chosen = grid_C[idx_chosen]
        if verbose:
            print("C_best :", "%.4g" % C_best)
            print("C_chosen :", "%.4g" % C_chosen)

    regr = linear_model.LinearRegression(penalty='binarsity',
                                         blocks_start=blocks_start,
                                         blocks_length=blocks_length,
                                         warm_start=True)
    # Use the explicit C if given, otherwise the cross-validated choice.
    regr.C = C_chosen if C is None else C
    if verbose:
        print("regr.C :", "%.4g" % regr.C)

    regr.fit(X_bin, y)
    coeffs = regr.weights

    # computations of the cut-points: inside each block, consecutive equal
    # coefficients form a group; a cut-point is estimated wherever the
    # group label jumps.
    all_groups = list()
    cut_points_estimates = {}
    for j, start in enumerate(blocks_start):
        coeffs_j = coeffs[start:start + blocks_length[j]]
        all_zeros = not np.any(coeffs_j)
        if all_zeros:
            # fully-zero block: the feature carries no detected signal
            cut_points_estimate_j = np.array([-np.inf, np.inf])
            groups_j = np.array(blocks_length[j] * [0])
        else:
            groups_j = get_groups(coeffs_j)
            jump_j = np.where(groups_j[1:] - groups_j[:-1] != 0)[0] + 1
            if jump_j.size == 0:
                cut_points_estimate_j = np.array([-np.inf, np.inf])
            else:
                cut_points_estimate_j = boundaries[features_names[j]][jump_j]
                # pad with -inf/+inf so the intervals cover the real line
                if cut_points_estimate_j[0] != -np.inf:
                    cut_points_estimate_j = np.insert(cut_points_estimate_j, 0,
                                                      -np.inf)
                if cut_points_estimate_j[-1] != np.inf:
                    cut_points_estimate_j = np.append(cut_points_estimate_j,
                                                      np.inf)
        cut_points_estimates[features_names[j]] = cut_points_estimate_j
        # shift group labels so they stay unique across features
        if j > 0:
            groups_j += max(all_groups) + 1
        all_groups += list(groups_j)

    if verbose:
        print("cutpoints :")
        for j in range(len(cut_points_estimates)):
            print(features_names[j], [
                "%.4f" % cut_points_estimates[features_names[j]][i]
                for i in range(len(cut_points_estimates[features_names[j]]))
            ])

    # creation of final binarized X data for the computed cutpoints
    binarizer2 = FeaturesBinarizer(method='given',
                                   bins_boundaries=cut_points_estimates)
    X_bin2 = binarizer2.fit_transform(X)
    X_bin2 = np.array(X_bin2.todense())
    blocks_start2 = binarizer2.blocks_start
    blocks_length2 = binarizer2.blocks_length
    X_bin2_train, X_bin2_test, y_train, y_test = train_test_split(
        X_bin2, y, test_size=0.2)

    # final re-fit: C=1e10 makes the binarsity penalty negligible, so this
    # is effectively an unpenalized regression on the re-binarized data
    regr3 = linear_model.LinearRegression(penalty='binarsity',
                                          blocks_start=blocks_start2,
                                          blocks_length=blocks_length2,
                                          warm_start=True)
    regr3.C = 1e10
    regr3.fit(X_bin2_train, y_train)
    if verbose:
        print(
            "R² score of final predictor on train data (80% of total data) :",
            "%.4g" % regr3.score(X_bin2_train, y_train))
        print("R² score of final predictor on test data (20% of total data) :",
              "%.4g" % regr3.score(X_bin2_test, y_test))

    final_coeffs = {"intercept": regr3.intercept, "weights": regr3.weights}

    t1 = time.perf_counter()
    if verbose:
        print("time elapsed for binarsity regression step:",
              "%.4g" % (t1 - t0), "s")

    return cut_points_estimates, final_coeffs, blocks_start, all_groups, coeffs, regr.C
Esempio n. 10
0
            sparsity = .2
            simu = SimuCoxRegWithCutPoints(n_samples=n_samples,
                                           n_features=n_features,
                                           n_cut_points=n_cut_points,
                                           seed=seed,
                                           verbose=False,
                                           shape=2,
                                           scale=.1,
                                           cov_corr=cov_corr,
                                           sparsity=sparsity)
            X, Y, delta, cut_points, beta_star, S = simu.simulate()

            # binarize data
            n_cuts = 50
            binarizer = FeaturesBinarizer(n_cuts=n_cuts)
            X_bin = binarizer.fit_transform(X)
            blocks_start = binarizer.blocks_start
            blocks_length = binarizer.blocks_length
            boundaries = binarizer.boundaries

            tic = time()

            solver = 'agd'
            learner = CoxRegression(penalty='binarsity',
                                    tol=1e-5,
                                    solver=solver,
                                    verbose=False,
                                    max_iter=100,
                                    step=0.3,
                                    blocks_start=blocks_start,
                                    blocks_length=blocks_length,