import cudf
import pandas as pd
from cuml.linear_model import LogisticRegression
from cuml.model_selection import train_test_split
from sklearn.datasets import load_iris


def objective(trial):
    iris = load_iris()
    X = cudf.DataFrame(pd.DataFrame(iris.data.astype('float32')))
    y = cudf.DataFrame(pd.DataFrame(iris.target.astype('float32')))

    solver = trial.suggest_categorical("solver", ["qn"])
    C = trial.suggest_uniform("C", 0.0, 1.0)
    if solver == "qn":
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    else:
        # 'penalty' is not relevant for this solver,
        # so we always pass 'l2' as a dummy value.
        penalty = "l2"

    classifier = LogisticRegression(max_iter=200, solver=solver, C=C,
                                    penalty=penalty)

    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    classifier.fit(X_train, y_train)
    score = classifier.score(X_valid, y_valid)
    return score
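# A minimal usage sketch (not part of the original snippet): driving the
# objective above with Optuna. The trial count is an arbitrary assumption.
import optuna

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print(study.best_params, study.best_value)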
from typing import Dict, Literal, Optional

from sklearn.linear_model import LogisticRegression


def fast_logistic_regression(
    X,
    y,
    *,
    penalty: Literal['l1', 'l2', 'elasticnet', 'none'] = 'l2',
    C: float = 1.0,
    solver: Literal['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'] = 'lbfgs',
    fit_intercept: bool = True,
    l1_ratio: Optional[float] = None,
    tol: float = 1e-4,
    max_iter: int = 1000,
    class_weight: Optional[Dict[str, float]] = None,
    n_jobs: Optional[int] = None,
    random_state: int = 1,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'sklearn',
    **kwargs,
) -> LogisticRegression:
    """The cuML LogisticRegression is only faster than scikit-learn's when
    n_samples > 100000, given 64 feature dimensions."""
    kw = _prepare_kw(locals(), 'X', 'y', 'kwargs', 'framework', **kwargs)
    ### import
    is_cuml = False
    if framework == 'sklearn':
        LoRe = LogisticRegression
    else:
        try:
            from cuml.linear_model import LogisticRegression as LoRe
            is_cuml = True
            # cuML's LogisticRegression does not accept these
            # scikit-learn-only arguments.
            kw.pop('n_jobs')
            kw.pop('random_state')
            # if solver not in CUML_SOLVER:
            kw['solver'] = 'qn'
        except ImportError:
            LoRe = LogisticRegression
    ### train
    model = LoRe(**kw)
    model.fit(X, y)
    return model
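# `_prepare_kw` is referenced above but not defined in this snippet. A
# plausible minimal implementation, consistent with the call site (this is
# an assumption, not the original helper), copies locals() minus the
# excluded names and overlays any caller extras:
def _prepare_kw(kw, *exclude, **kwargs):
    # Keep only the real keyword parameters, then merge extra kwargs.
    out = {k: v for k, v in kw.items() if k not in exclude}
    out.update(kwargs)
    return out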
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split


def run_log_reg(scaled_df):
    raw_train_arr = []
    raw_test_arr = []

    # Init metrics
    metrics = ['accuracy', 'f1', 'roc_auc_ovr']

    # Set C vals and penalty
    C_vals = [10**val for val in range(-8, 5)]
    penalty = ['none', 'l1', 'l2']

    # Init params. Combinations the default 'lbfgs' solver does not support
    # (e.g. 'l1') simply score NaN in the grid search.
    params = {'penalty': penalty, 'C': C_vals}

    # Over five trials
    for i in range(5):
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_df.iloc[:, :-1], scaled_df.y, train_size=5000)

        # Init clf
        clf = LogisticRegression()

        # Init grid search and run
        search_results = GridSearchCV(clf, params, scoring=metrics,
                                      refit=False)
        search_results.fit(X_train, y_train)

        # Get results and organize
        results = pd.DataFrame(search_results.cv_results_['params'])
        results['mean_accuracy'] = search_results.cv_results_[
            'mean_test_accuracy']
        results['mean_f1'] = search_results.cv_results_['mean_test_f1']
        results['mean_auc'] = search_results.cv_results_[
            'mean_test_roc_auc_ovr']

        # Get optimal hyperparameters per metric
        opt_acc_inf = results.sort_values(by='mean_accuracy',
                                          ascending=False).iloc[0]
        opt_f1_inf = results.sort_values(by='mean_f1',
                                         ascending=False).iloc[0]
        opt_auc_inf = results.sort_values(by='mean_auc',
                                          ascending=False).iloc[0]

        # Init optimal clfs
        opt_acc_clf = LogisticRegression(C=opt_acc_inf.C,
                                         penalty=opt_acc_inf.penalty,
                                         max_iter=100000)
        opt_f1_clf = LogisticRegression(C=opt_f1_inf.C,
                                        penalty=opt_f1_inf.penalty,
                                        max_iter=100000)
        opt_auc_clf = LogisticRegression(C=opt_auc_inf.C,
                                         penalty=opt_auc_inf.penalty,
                                         max_iter=100000)

        # Fit clfs
        opt_acc_clf.fit(X_train, y_train)
        opt_f1_clf.fit(X_train, y_train)
        opt_auc_clf.fit(X_train, y_train)

        # Get train and test metrics. Note the AUC here is computed from
        # hard labels; predict_proba would give a smoother estimate.
        train_score_acc = opt_acc_clf.score(X_train, y_train)
        train_score_f1 = f1_score(y_train, opt_f1_clf.predict(X_train))
        train_score_auc = roc_auc_score(y_train,
                                        opt_auc_clf.predict(X_train))
        test_score_acc = opt_acc_clf.score(X_test, y_test)
        test_score_f1 = f1_score(y_test, opt_f1_clf.predict(X_test))
        test_score_auc = roc_auc_score(y_test, opt_auc_clf.predict(X_test))
        raw_train_arr.append(
            [train_score_acc, train_score_f1, train_score_auc])
        raw_test_arr.append([test_score_acc, test_score_f1, test_score_auc])

    # Create dataframes from results
    raw_train_arr = np.array(raw_train_arr).reshape(5, 3)
    raw_test_arr = np.array(raw_test_arr).reshape(5, 3)
    raw_train_df = pd.DataFrame(data=raw_train_arr,
                                columns=['accuracy', 'f1', 'auc'])
    raw_test_df = pd.DataFrame(data=raw_test_arr,
                               columns=['accuracy', 'f1', 'auc'])

    # Return results
    return raw_train_df, raw_test_df
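# Usage sketch (assumed setup, not from the original snippet): run_log_reg
# expects standardized features with the binary target as the last column,
# named 'y'. `df` below is a hypothetical raw DataFrame.
from sklearn.preprocessing import StandardScaler

features = df.drop(columns='y')
scaled_df = pd.DataFrame(StandardScaler().fit_transform(features),
                         columns=features.columns)
scaled_df['y'] = df['y'].values

train_df, test_df = run_log_reg(scaled_df)
print(train_df.mean(), test_df.mean())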
from copy import deepcopy
from dataclasses import asdict

import cudf
import numpy as np
from cuml.linear_model import LogisticRegression
from hyperopt import fmin, tpe


class CumlLRFitter(FitterBase):
    def __init__(self, label='label', metric='error', opt: LROpt = None,
                 max_eval=10):
        super(CumlLRFitter, self).__init__(label, metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = LROpt()
        self.clf = None

    def train(self, train_df, eval_df, params=None):
        train_df, eval_df = cudf.DataFrame(train_df), cudf.DataFrame(eval_df)
        x_train, y_train = train_df.drop(columns=[self.label]), \
            train_df[self.label]
        x_eval, y_eval = eval_df.drop(columns=[self.label]), \
            eval_df[self.label]
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)
        self.clf = LogisticRegression(**use_params)
        self.clf.fit(X=x_train, y=y_train)
        preds = self.clf.predict(X=x_eval)
        output = self.get_loss(y_pred=preds, y=y_eval)
        return output

    def search(self, train_df, eval_df):
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)
            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl, asdict(self.opt),
                               algo=tpe.suggest, max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data):
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.iloc[train_id, :]
                eval_df = data.iloc[eval_id, :]
                self.train(train_df, eval_df, params)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label]))
                else:
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label])).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold, asdict(self.opt),
                               algo=tpe.suggest, max_evals=self.max_eval)

    def train_k_fold(self, k_fold, train_data, test_data, params=None,
                     drop_test_y=True):
        acc_result = list()
        # Out-of-fold train predictions are filled in by index below; the
        # test predictions are accumulated across folds, so both must start
        # from zero (np.empty would leave uninitialized values in the sum).
        train_pred = cudf.Series(np.zeros(train_data.shape[0]))
        test_pred = cudf.Series(np.zeros(test_data.shape[0]))
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.iloc[train_id, :]
            eval_df = train_data.iloc[eval_id, :]
            self.train(train_df, eval_df, params)
            train_pred[eval_id] = self.clf.predict_proba(
                eval_df.drop(columns=self.label)).iloc[:, 1].values
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)
            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            # Average the per-fold test probabilities.
            test_pred += self.clf.predict_proba(dtest).iloc[:, 1]
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result
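# Usage sketch (assumed setup): FitterBase and LROpt come from the
# surrounding project, and `train_data`/`test_data` are hypothetical
# DataFrames with a 'label' column.
from sklearn.model_selection import KFold

fitter = CumlLRFitter(label='label', metric='error', max_eval=10)
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
fitter.search_k_fold(k_fold, train_data)  # tune hyperparameters with hyperopt
train_pred, test_pred, losses = fitter.train_k_fold(k_fold, train_data,
                                                    test_data)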
import time

import cudf
import cupy as cp
import numpy as np


def rank_genes_groups(
    X,
    labels,  # louvain results
    var_names,
    groups=None,
    reference='rest',
    n_genes=100,
    **kwds,
):
    """
    Rank genes for characterizing groups.

    Parameters
    ----------
    X : cupy.ndarray of shape (n_cells, n_genes)
        The cell-by-gene matrix in which to rank genes.
    labels : cudf.Series of size (n_cells,)
        Observation groupings to consider.
    var_names : cudf.Series of size (n_genes,)
        Names of the genes in X.
    groups : Iterable[str] (default: 'all')
        Subset of groups, e.g. ['g1', 'g2', 'g3'], to which comparison
        shall be restricted, or 'all' (default) for all groups.
    reference : str (default: 'rest')
        If 'rest', compare each group to the union of the rest of the
        groups. If a group identifier, compare with respect to this group.
    n_genes : int (default: 100)
        The number of genes that appear in the returned tables.
    """
    #### Wherever we see "adata.obs[groupby]", we should just replace it
    #### with the groups

    start = time.time()

    # for clarity, rename variable
    if groups == 'all':
        groups_order = 'all'
    elif isinstance(groups, (str, int)):
        raise ValueError('Specify a sequence of groups')
    else:
        groups_order = list(groups)
        if isinstance(groups_order[0], int):
            groups_order = [str(n) for n in groups_order]
        if reference != 'rest' and reference not in set(groups_order):
            groups_order += [reference]
    if (reference != 'rest'
            and reference not in set(labels.cat.categories)):
        cats = labels.cat.categories.tolist()
        raise ValueError(
            f'reference = {reference} needs to be one of groupby = {cats}.')

    groups_order, groups_masks = select_groups(labels, groups_order)

    original_reference = reference

    n_vars = len(var_names)

    # for clarity, rename variable
    n_genes_user = n_genes
    # make sure indices are not out of bounds in case there are fewer
    # genes than n_genes
    if n_genes_user > X.shape[1]:
        n_genes_user = X.shape[1]
    # in the following, n_genes is simply another name for the total
    # number of genes
    n_genes = X.shape[1]

    n_groups = groups_masks.shape[0]
    ns = cp.zeros(n_groups, dtype=int)
    for imask, mask in enumerate(groups_masks):
        ns[imask] = cp.where(mask)[0].size
    if reference != 'rest':
        ireference = cp.where(groups_order == reference)[0][0]
    reference_indices = cp.arange(n_vars, dtype=int)

    rankings_gene_scores = []
    rankings_gene_names = []

    # Perform LogReg
    # if reference is not set, then the groups listed will be compared to
    # the rest; if reference is set, then the groups listed will be
    # compared only to the other groups listed
    from cuml.linear_model import LogisticRegression
    reference = groups_order[0]
    if len(groups) == 1:
        raise Exception(
            'Cannot perform logistic regression on a single cluster.')

    grouping_mask = labels.astype('int').isin(cudf.Series(groups_order))
    grouping = labels.loc[grouping_mask]

    # Indexing with a series causes issues, possibly a segfault, so index
    # with the underlying values instead.
    X = X[grouping_mask.values, :]

    clf = LogisticRegression(**kwds)
    clf.fit(X.get(), grouping.to_array().astype('float32'))
    scores_all = cp.array(clf.coef_).T

    for igroup, group in enumerate(groups_order):
        if len(groups_order) <= 2:  # binary logistic regression
            scores = scores_all[0]
        else:
            scores = scores_all[igroup]

        partition = cp.argpartition(scores, -n_genes_user)[-n_genes_user:]
        partial_indices = cp.argsort(scores[partition])[::-1]
        global_indices = reference_indices[partition][partial_indices]
        # Shouldn't need to take this off device
        rankings_gene_scores.append(scores[global_indices].get())
        rankings_gene_names.append(var_names[global_indices].to_pandas())
        if len(groups_order) <= 2:
            break

    groups_order_save = [str(g) for g in groups_order]
    if len(groups) == 2:
        groups_order_save = [g for g in groups_order if g != reference]
    print("Ranking took (GPU): " + str(time.time() - start))

    start = time.time()
    scores = np.rec.fromarrays(
        [n for n in rankings_gene_scores],
        dtype=[(rn, 'float32') for rn in groups_order_save],
    )
    names = np.rec.fromarrays(
        [n for n in rankings_gene_names],
        dtype=[(rn, 'U50') for rn in groups_order_save],
    )
    print("Preparing output np.rec.fromarrays took (CPU): " +
          str(time.time() - start))
    print("Note: This operation will be accelerated in a future version")

    return scores, names, original_reference
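# Usage sketch (illustrative inputs, not from the original snippet): X is a
# cupy ndarray of shape (n_cells, n_genes), labels a categorical cudf.Series
# of cluster ids, and var_names a cudf.Series of gene names.
scores, names, ref = rank_genes_groups(
    X, labels, var_names, groups=['0', '1', '2'], n_genes=50)
print(names[:5])  # top-ranked gene names, one record-array field per group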
def rank_genes_groups(
    X,
    labels,  # louvain results
    var_names,
    groupby=str,
    groups=None,
    reference='rest',
    n_genes=100,
    key_added=None,
    layer=None,
    **kwds,
):
    #### Wherever we see "adata.obs[groupby]", we should just replace it
    #### with the groups

    start = time.time()

    # for clarity, rename variable
    if groups == 'all':
        groups_order = 'all'
    elif isinstance(groups, (str, int)):
        raise ValueError('Specify a sequence of groups')
    else:
        groups_order = list(groups)
        if isinstance(groups_order[0], int):
            groups_order = [str(n) for n in groups_order]
        if reference != 'rest' and reference not in set(groups_order):
            groups_order += [reference]
    if (reference != 'rest'
            and reference not in set(labels.cat.categories)):
        cats = labels.cat.categories.tolist()
        raise ValueError(
            f'reference = {reference} needs to be one of groupby = {cats}.')

    groups_order, groups_masks = select_groups(labels, groups_order)

    original_reference = reference

    n_vars = len(var_names)

    # for clarity, rename variable
    n_genes_user = n_genes
    # make sure indices are not out of bounds in case there are fewer
    # genes than n_genes
    if n_genes_user > X.shape[1]:
        n_genes_user = X.shape[1]
    # in the following, n_genes is simply another name for the total
    # number of genes
    n_genes = X.shape[1]

    n_groups = groups_masks.shape[0]
    ns = cp.zeros(n_groups, dtype=int)
    for imask, mask in enumerate(groups_masks):
        ns[imask] = cp.where(mask)[0].size
    if reference != 'rest':
        ireference = cp.where(groups_order == reference)[0][0]
    reference_indices = cp.arange(n_vars, dtype=int)

    rankings_gene_scores = []
    rankings_gene_names = []
    rankings_gene_logfoldchanges = []
    rankings_gene_pvals = []
    rankings_gene_pvals_adj = []

    # if 'log1p' in adata.uns_keys() and adata.uns['log1p']['base'] is not None:
    #     expm1_func = lambda x: np.expm1(x * np.log(adata.uns['log1p']['base']))
    # else:
    #     expm1_func = np.expm1

    # Perform LogReg
    # if reference is not set, then the groups listed will be compared to
    # the rest; if reference is set, then the groups listed will be
    # compared only to the other groups listed
    from cuml.linear_model import LogisticRegression
    reference = groups_order[0]
    if len(groups) == 1:
        raise Exception(
            'Cannot perform logistic regression on a single cluster.')

    grouping_mask = labels.astype('int').isin(cudf.Series(groups_order))
    grouping = labels.loc[grouping_mask]

    # Indexing with a series causes issues, possibly a segfault, so index
    # with the underlying values instead.
    X = X[grouping_mask.values, :]

    clf = LogisticRegression(**kwds)
    clf.fit(X.get(), grouping.to_array().astype('float32'))
    scores_all = cp.array(clf.coef_).T

    for igroup, group in enumerate(groups_order):
        if len(groups_order) <= 2:  # binary logistic regression
            scores = scores_all[0]
        else:
            scores = scores_all[igroup]

        partition = cp.argpartition(scores, -n_genes_user)[-n_genes_user:]
        partial_indices = cp.argsort(scores[partition])[::-1]
        global_indices = reference_indices[partition][partial_indices]
        # Shouldn't need to take this off device
        rankings_gene_scores.append(scores[global_indices].get())
        rankings_gene_names.append(var_names[global_indices].to_pandas())
        if len(groups_order) <= 2:
            break

    groups_order_save = [str(g) for g in groups_order]
    if len(groups) == 2:
        groups_order_save = [g for g in groups_order if g != reference]
    print("Ranking took (GPU): " + str(time.time() - start))

    start = time.time()
    scores = np.rec.fromarrays(
        [n for n in rankings_gene_scores],
        dtype=[(rn, 'float32') for rn in groups_order_save],
    )
    names = np.rec.fromarrays(
        [n for n in rankings_gene_names],
        dtype=[(rn, 'U50') for rn in groups_order_save],
    )
    print("Preparing output np.rec.fromarrays took (CPU): " +
          str(time.time() - start))
    print("Note: This operation will be accelerated in a future version")

    return scores, names, original_reference
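# The returned record arrays convert cleanly to DataFrames for inspection;
# this consumption sketch is an addition, not original code, and assumes X,
# labels, and var_names are defined as in the previous sketch.
import pandas as pd

scores, names, ref = rank_genes_groups(X, labels, var_names,
                                       groups=['0', '1'])
ranked = pd.DataFrame({'gene': pd.DataFrame(names).iloc[:, 0],
                       'score': pd.DataFrame(scores).iloc[:, 0]})
print(ranked.head())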