Example #1
0
def objective(trial):
    iris = load_iris()
    X, y = cudf.DataFrame(pd.DataFrame(
        iris.data.astype('float32'))), cudf.DataFrame(
            pd.DataFrame(iris.target.astype('float32')))
    solver = trial.suggest_categorical("solver", ["qn"])
    C = trial.suggest_uniform("C", 0.0, 1.0)

    if solver == "qn":
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    else:
        # 'penalty' parameter isn't relevant for this solver,
        # so we always specify 'l2' as the dummy value.
        penalty = "l2"

    classifier = LogisticRegression(max_iter=200,
                                    solver=solver,
                                    C=C,
                                    penalty=penalty)

    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    classifier.fit(X_train, y_train)

    score = classifier.score(X_valid, y_valid)
    return score
Example #2
0
    def train(self, train_df, eval_df, params=None):
        train_df, eval_df = cudf.DataFrame(train_df), cudf.DataFrame(eval_df)
        x_train, y_train, x_eval, y_eval = train_df.drop(columns=[self.label]), train_df[self.label], \
                                           eval_df.drop(columns=[self.label]), eval_df[self.label],
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)
        self.clf = LogisticRegression(**use_params)
        self.clf.fit(X=x_train, y=y_train)
        preds = self.clf.predict(X=x_eval)
        output = self.get_loss(y_pred=preds, y=y_eval)

        return output
Example #3
0
def fast_logistic_regression(
    X,
    y,
    *,
    penalty: Literal['l1', 'l2', 'elasticnet', 'none'] = 'l2',
    C: float = 1.0,
    solver: Literal['newton-cg', 'lbfgs', 'liblinear', 'sag',
                    'saga'] = 'lbfgs',
    fit_intercept: bool = True,
    l1_ratio: Optional[float] = None,
    tol: float = 1e-4,
    max_iter: int = 1000,
    class_weight: Optional[Dict[str, float]] = None,
    n_jobs: Optional[int] = None,
    random_state: int = 1,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'sklearn',
    **kwargs,
) -> LogisticRegression:
    """The cuML LogisticRegression is only faster when n_samples > 100000 given 64
  feature dimensions"""
    kw = _prepare_kw(locals(), 'X', 'y', 'kwargs', 'framework', **kwargs)
    ### import
    is_cuml = False
    if framework == 'sklearn':
        LoRe = LogisticRegression
    else:
        try:
            from cuml.linear_model import LogisticRegression as LoRe
            is_cuml = True
            kw.pop('n_jobs')
            kw.pop('random_state')
            # if solver not in CUML_SOLVER:
            kw['solver'] = 'qn'
        except ImportError as e:
            LoRe = LogisticRegression
    ### train
    model = LoRe(**kw)
    model.fit(X, y)
    return model
def run_log_reg(scaled_df):

    raw_train_arr = []
    raw_test_arr = []

    # Init metrics
    metrics = ['accuracy', 'f1', 'roc_auc_ovr']

    # Set c vals and penalty
    C_vals = range(-8, 5)
    C_vals = [10**val for val in C_vals]

    penalty = ['none', 'l1', 'l2']

    # Init params
    params = {'penalty': penalty, 'C': C_vals}

    # Over five trials
    for i in range(5):
        # Train test split
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_df.iloc[:, :-1], scaled_df.y, train_size=5000)

        # Init clf
        clf = LogisticRegression()

        # Init gridsearch and run
        search_results = GridSearchCV(clf,
                                      params,
                                      scoring=metrics,
                                      refit=False)
        search_results.fit(X_train, y_train)

        # Get results and organize
        results = pd.DataFrame(search_results.cv_results_['params'])

        results['mean_accuracy'] = search_results.cv_results_[
            'mean_test_accuracy']
        results['mean_f1'] = search_results.cv_results_['mean_test_f1']
        results['mean_auc'] = search_results.cv_results_[
            'mean_test_roc_auc_ovr']

        # Get optimal clfs
        opt_acc_inf = results.sort_values(by='mean_accuracy',
                                          ascending=False).iloc[0]
        opt_f1_inf = results.sort_values(by='mean_f1', ascending=False).iloc[0]
        opt_auc_inf = results.sort_values(by='mean_auc',
                                          ascending=False).iloc[0]

        # Init optimal clfs
        opt_acc_clf = LogisticRegression(C=opt_acc_inf.C,
                                         penalty=opt_acc_inf.penalty,
                                         max_iter=100000)
        opt_f1_clf = LogisticRegression(C=opt_f1_inf.C,
                                        penalty=opt_f1_inf.penalty,
                                        max_iter=100000)
        opt_auc_clf = LogisticRegression(C=opt_auc_inf.C,
                                         penalty=opt_auc_inf.penalty,
                                         max_iter=100000)

        # Fit clfs
        opt_acc_clf.fit(X_train, y_train)
        opt_f1_clf.fit(X_train, y_train)
        opt_auc_clf.fit(X_train, y_train)

        # Get train and test metrics
        train_score_acc = opt_acc_clf.score(X_train, y_train)
        train_score_f1 = f1_score(y_train, opt_f1_clf.predict(X_train))
        train_score_auc = roc_auc_score(y_train, opt_auc_clf.predict(X_train))

        test_score_acc = opt_acc_clf.score(X_test, y_test)
        test_score_f1 = f1_score(y_test, opt_f1_clf.predict(X_test))
        test_score_auc = roc_auc_score(y_test, opt_auc_clf.predict(X_test))

        raw_train_arr.append(
            [train_score_acc, train_score_f1, train_score_auc])
        raw_test_arr.append([test_score_acc, test_score_f1, test_score_auc])

    # Create dataframe from results
    raw_train_arr = np.array(raw_train_arr).reshape(5, 3)
    raw_test_arr = np.array(raw_test_arr).reshape(5, 3)

    raw_train_df = pd.DataFrame(data=raw_train_arr,
                                columns=['accuracy', 'f1', 'auc'])
    raw_test_df = pd.DataFrame(data=raw_test_arr,
                               columns=['accuracy', 'f1', 'auc'])

    # Return results
    return raw_train_df, raw_test_df
Example #5
0
         from cuml.svm import SVC
     else:
         from sklearn.svm import SVC
     model = SVC(**alg.input_variables.__dict__)
 elif alg.name == 'GaussianNaiveBayes':
     from sklearn.naive_bayes import GaussianNB
     model = GaussianNB(**alg.input_variables.__dict__)
     warn_not_gpu_support(alg)
 elif alg.name == 'LogisticRegression':
     if NVIDIA_RAPIDS_ENABLED:
         # nvidia rapids version should be higher than v0.13
         # if version.parse(cuml.__version__) > version.parse("0.13"):
         from cuml.linear_model import LogisticRegression
     else:
         from sklearn.linear_model import LogisticRegression
     model = LogisticRegression(**alg.input_variables.__dict__)
 elif alg.name == 'AdaBoost' and alg.type == 'classification':
     from sklearn.ensemble import AdaBoostClassifier
     model = AdaBoostClassifier(**alg.input_variables.__dict__)
     warn_not_gpu_support(alg)
 elif alg.name == 'GradientBoosting' and alg.type == 'classification':
     from sklearn.ensemble import GradientBoostingClassifier
     model = GradientBoostingClassifier(**alg.input_variables.__dict__)
     warn_not_gpu_support(alg)
 elif alg.name == 'RandomForest' and alg.type == 'classification':
     if NVIDIA_RAPIDS_ENABLED:
         from cuml.ensemble import RandomForestClassifier
     else:
         from sklearn.ensemble import RandomForestClassifier
     model = RandomForestClassifier(**alg.input_variables.__dict__)
 elif alg.name == 'XGBoost' and alg.type == 'classification':
Example #6
0
class CumlLRFitter(FitterBase):
    def __init__(self,
                 label='label',
                 metric='error',
                 opt: LROpt = None,
                 max_eval=10):
        super(CumlLRFitter, self).__init__(label, metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = LROpt()
        self.clf = None

    def train(self, train_df, eval_df, params=None):
        train_df, eval_df = cudf.DataFrame(train_df), cudf.DataFrame(eval_df)
        x_train, y_train, x_eval, y_eval = train_df.drop(columns=[self.label]), train_df[self.label], \
                                           eval_df.drop(columns=[self.label]), eval_df[self.label],
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)
        self.clf = LogisticRegression(**use_params)
        self.clf.fit(X=x_train, y=y_train)
        preds = self.clf.predict(X=x_eval)
        output = self.get_loss(y_pred=preds, y=y_eval)

        return output

    def search(self, train_df, eval_df):
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)

            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl,
                               asdict(self.opt),
                               algo=tpe.suggest,
                               max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data):
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.iloc[train_id, :]
                eval_df = data.iloc[eval_id, :]
                self.train(train_df, eval_df, params)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label]))
                else:
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label])).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold,
                               asdict(self.opt),
                               algo=tpe.suggest,
                               max_evals=self.max_eval)

    def train_k_fold(self,
                     k_fold,
                     train_data,
                     test_data,
                     params=None,
                     drop_test_y=True):
        acc_result = list()
        train_pred = cudf.Series(np.empty(train_data.shape[0]))
        test_pred = cudf.Series(np.empty(test_data.shape[0]))
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.iloc[train_id, :]
            eval_df = train_data.iloc[eval_id, :]
            self.train(train_df, eval_df, params)
            train_pred[eval_id] = self.clf.predict_proba(
                eval_df.drop(columns=self.label)).iloc[:, 1].values
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)

            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            test_pred += self.clf.predict_proba(dtest).iloc[:, 1]

        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result
def rank_genes_groups(
    X,
    labels,  # louvain results
    var_names,
    groups=None,
    reference='rest',
    n_genes=100,
    **kwds,
):
    """
    Rank genes for characterizing groups.

    Parameters
    ----------

    X : cupy.ndarray of shape (n_cells, n_genes)
        The cellxgene matrix to rank genes

    labels : cudf.Series of size (n_cells,)
        Observations groupings to consider

    var_names : cudf.Series of size (n_genes,)
        Names of genes in X

    groups : Iterable[str] (default: 'all')
        Subset of groups, e.g. ['g1', 'g2', 'g3'], to which comparison
        shall be restricted, or 'all' (default), for all groups.

    reference : str (default: 'rest')
        If 'rest', compare each group to the union of the rest of the group.
        If a group identifier, compare with respect to this group.

    n_genes : int (default: 100)
        The number of genes that appear in the returned tables.
    """

    #### Wherever we see "adata.obs[groupby], we should just replace w/ the groups"

    import time

    start = time.time()

    # for clarity, rename variable
    if groups == 'all':
        groups_order = 'all'
    elif isinstance(groups, (str, int)):
        raise ValueError('Specify a sequence of groups')
    else:
        groups_order = list(groups)
        if isinstance(groups_order[0], int):
            groups_order = [str(n) for n in groups_order]
        if reference != 'rest' and reference not in set(groups_order):
            groups_order += [reference]
    if (reference != 'rest' and reference not in set(labels.cat.categories)):
        cats = labels.cat.categories.tolist()
        raise ValueError(
            f'reference = {reference} needs to be one of groupby = {cats}.')

    groups_order, groups_masks = select_groups(labels, groups_order)

    original_reference = reference

    n_vars = len(var_names)

    # for clarity, rename variable
    n_genes_user = n_genes
    # make sure indices are not OoB in case there are less genes than n_genes
    if n_genes_user > X.shape[1]:
        n_genes_user = X.shape[1]
    # in the following, n_genes is simply another name for the total number of genes
    n_genes = X.shape[1]

    n_groups = groups_masks.shape[0]
    ns = cp.zeros(n_groups, dtype=int)
    for imask, mask in enumerate(groups_masks):
        ns[imask] = cp.where(mask)[0].size
    if reference != 'rest':
        ireference = cp.where(groups_order == reference)[0][0]
    reference_indices = cp.arange(n_vars, dtype=int)

    rankings_gene_scores = []
    rankings_gene_names = []

    # Perform LogReg

    # if reference is not set, then the groups listed will be compared to the rest
    # if reference is set, then the groups listed will be compared only to the other groups listed
    from cuml.linear_model import LogisticRegression
    reference = groups_order[0]
    if len(groups) == 1:
        raise Exception(
            'Cannot perform logistic regression on a single cluster.')
    grouping_mask = labels.astype('int').isin(cudf.Series(groups_order))
    grouping = labels.loc[grouping_mask]

    X = X[grouping_mask.
          values, :]  # Indexing with a series causes issues, possibly segfault
    y = labels.loc[grouping]

    clf = LogisticRegression(**kwds)
    clf.fit(X.get(), grouping.to_array().astype('float32'))
    scores_all = cp.array(clf.coef_).T

    for igroup, group in enumerate(groups_order):
        if len(groups_order) <= 2:  # binary logistic regression
            scores = scores_all[0]
        else:
            scores = scores_all[igroup]

        partition = cp.argpartition(scores, -n_genes_user)[-n_genes_user:]
        partial_indices = cp.argsort(scores[partition])[::-1]
        global_indices = reference_indices[partition][partial_indices]
        rankings_gene_scores.append(scores[global_indices].get(
        ))  ## Shouldn't need to take this off device
        rankings_gene_names.append(var_names[global_indices].to_pandas())
        if len(groups_order) <= 2:
            break

    groups_order_save = [str(g) for g in groups_order]
    if (len(groups) == 2):
        groups_order_save = [g for g in groups_order if g != reference]

    print("Ranking took (GPU): " + str(time.time() - start))

    start = time.time()

    scores = np.rec.fromarrays(
        [n for n in rankings_gene_scores],
        dtype=[(rn, 'float32') for rn in groups_order_save],
    )

    names = np.rec.fromarrays(
        [n for n in rankings_gene_names],
        dtype=[(rn, 'U50') for rn in groups_order_save],
    )

    print("Preparing output np.rec.fromarrays took (CPU): " +
          str(time.time() - start))
    print("Note: This operation will be accelerated in a future version")

    return scores, names, original_reference
def rank_genes_groups(
    X,
    labels,  # louvain results
    var_names,
    groupby=str,
    groups=None,
    reference='rest',
    n_genes=100,
    key_added=None,
    layer=None,
    **kwds,
):

    #### Wherever we see "adata.obs[groupby], we should just replace w/ the groups"

    import time

    start = time.time()

    # for clarity, rename variable
    if groups == 'all':
        groups_order = 'all'
    elif isinstance(groups, (str, int)):
        raise ValueError('Specify a sequence of groups')
    else:
        groups_order = list(groups)
        if isinstance(groups_order[0], int):
            groups_order = [str(n) for n in groups_order]
        if reference != 'rest' and reference not in set(groups_order):
            groups_order += [reference]
    if (reference != 'rest' and reference not in set(labels.cat.categories)):
        cats = labels.cat.categories.tolist()
        raise ValueError(
            f'reference = {reference} needs to be one of groupby = {cats}.')

    groups_order, groups_masks = select_groups(labels, groups_order)

    original_reference = reference

    n_vars = len(var_names)

    # for clarity, rename variable
    n_genes_user = n_genes
    # make sure indices are not OoB in case there are less genes than n_genes
    if n_genes_user > X.shape[1]:
        n_genes_user = X.shape[1]
    # in the following, n_genes is simply another name for the total number of genes
    n_genes = X.shape[1]

    n_groups = groups_masks.shape[0]
    ns = cp.zeros(n_groups, dtype=int)
    for imask, mask in enumerate(groups_masks):
        ns[imask] = cp.where(mask)[0].size
    if reference != 'rest':
        ireference = cp.where(groups_order == reference)[0][0]
    reference_indices = cp.arange(n_vars, dtype=int)

    rankings_gene_scores = []
    rankings_gene_names = []
    rankings_gene_logfoldchanges = []
    rankings_gene_pvals = []
    rankings_gene_pvals_adj = []

    #     if 'log1p' in adata.uns_keys() and adata.uns['log1p']['base'] is not None:
    #         expm1_func = lambda x: np.expm1(x * np.log(adata.uns['log1p']['base']))
    #     else:
    #         expm1_func = np.expm1

    # Perform LogReg

    # if reference is not set, then the groups listed will be compared to the rest
    # if reference is set, then the groups listed will be compared only to the other groups listed
    from cuml.linear_model import LogisticRegression
    reference = groups_order[0]
    if len(groups) == 1:
        raise Exception(
            'Cannot perform logistic regression on a single cluster.')
    grouping_mask = labels.astype('int').isin(cudf.Series(groups_order))
    grouping = labels.loc[grouping_mask]

    X = X[grouping_mask.
          values, :]  # Indexing with a series causes issues, possibly segfault
    y = labels.loc[grouping]

    clf = LogisticRegression(**kwds)
    clf.fit(X.get(), grouping.to_array().astype('float32'))
    scores_all = cp.array(clf.coef_).T

    for igroup, group in enumerate(groups_order):
        if len(groups_order) <= 2:  # binary logistic regression
            scores = scores_all[0]
        else:
            scores = scores_all[igroup]

        partition = cp.argpartition(scores, -n_genes_user)[-n_genes_user:]
        partial_indices = cp.argsort(scores[partition])[::-1]
        global_indices = reference_indices[partition][partial_indices]
        rankings_gene_scores.append(scores[global_indices].get(
        ))  ## Shouldn't need to take this off device
        rankings_gene_names.append(var_names[global_indices].to_pandas())
        if len(groups_order) <= 2:
            break

    groups_order_save = [str(g) for g in groups_order]
    if (len(groups) == 2):
        groups_order_save = [g for g in groups_order if g != reference]

    print("Ranking took (GPU): " + str(time.time() - start))

    start = time.time()

    scores = np.rec.fromarrays(
        [n for n in rankings_gene_scores],
        dtype=[(rn, 'float32') for rn in groups_order_save],
    )

    names = np.rec.fromarrays(
        [n for n in rankings_gene_names],
        dtype=[(rn, 'U50') for rn in groups_order_save],
    )

    print("Preparing output np.rec.fromarrays took (CPU): " +
          str(time.time() - start))
    print("Note: This operation will be accelerated in a future version")

    return scores, names, original_reference
        TargetEncodeTransform(columns=[
            'nom_7',
        ]),
        TargetEncodeTransform(columns=[
            'nom_8',
        ]),
        TargetEncodeTransform(columns=[
            'nom_9',
        ]),
        # TargetEncodeTransform(columns=['ord_0',]),
        # TargetEncodeTransform(columns=['ord_1',]),
        # TargetEncodeTransform(columns=['ord_2',]),
        # TargetEncodeTransform(columns=['ord_3',]),
        # TargetEncodeTransform(columns=['ord_4',]),
        # TargetEncodeTransform(columns=['ord_5',]),
        LogisticRegression(),
    )
    pipe.fit(X, y)
    y_pred = pipe.predict(X)
    param_grid = {"logisticregression__C": [1, 2, 3, 4, 5, 6]}
    rscv = RandomizedSearchCV(pipe,
                              param_distributions=param_grid,
                              scoring=make_scorer(roc_auc_gpu,
                                                  greater_is_better=True),
                              cv=kf,
                              verbose=6,
                              random_state=Config.RANDOM_STATE,
                              n_iter=1)
    model = rscv.fit(X, y)

    print(pipe)