def main():
    param_space = {
        'C': [1e-4, 1, 1e4],
        'gamma': [1e-3, 1, 1e3],
        'class_weight': [None, 'balanced']
    }

    model = SVC(kernel='rbf')

    digits = load_digits()
    X_train, X_test, y_train, y_test = tts(digits.data,
                                           digits.target,
                                           test_size=0.3)

    print("Starting local cluster")
    cluster = LocalCluster()
    client = Client(cluster)
    print(client)

    print("Start searching")
    search = GridSearchCV(model, param_space, cv=3)
    search.fit(X_train, y_train)

    print("Prepare report")
    print(classification_report(
        y_true=y_test,
        y_pred=search.best_estimator_.predict(X_test)))
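If the GridSearchCV above is scikit-learn's (rather than dask_ml's drop-in replacement), the Dask client is not picked up by the search automatically. A minimal sketch of one way to route the cross-validation fits onto the cluster, assuming scikit-learn's GridSearchCV and the dask.distributed client created in the snippet:

import joblib

# Assumes `model`, `param_space`, `X_train`, `y_train` and the running Dask
# `client` from the snippet above. The 'dask' joblib backend dispatches the
# individual CV fits to the cluster workers; n_jobs=-1 enables parallelism.
search = GridSearchCV(model, param_space, cv=3, n_jobs=-1)
with joblib.parallel_backend('dask'):
    search.fit(X_train, y_train)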
def getprobabilities(X, y, p_grid, cv, ac_sens):
    """getprobabilities(X, y, p_grid, cv, ac_sens)

    - X and y: Inputs and outputs
    - p_grid: grid of parameters to search over
    - cv: Number of cross-validation folds (this is different from the outer
      number of x-val folds)

    Gets the probability of picking each of the options provided by p_grid
    given the data in X and y. The algorithm is as follows:
    - Find the sensitivity of the SSE for each parameter combination
    - Find the SSE of each parameter combination
    - Find the probability of selecting each parameter combination
    """
    kern = GPy.kern.RBF(2.0, lengthscale=25.0, variance=1.0)
    errorlimit = ac_sens * 4.0

    #### find the sensitivity of the SSE (for each param combo)
    # this call gets the sensitivities, not the scores:
    # TODO This probably should be done locally as it's quick.
    clf = GridSearchCV(estimator=DPCloaking(sensitivity=ac_sens,
                                            inducing=4,
                                            getxvalfoldsensitivities=True,
                                            kern=kern,
                                            errorlimit=errorlimit),
                       param_grid=p_grid,
                       cv=cv)
    clf.fit(X, y)
    nparamcombos = len(clf.cv_results_['mean_test_score'])
    temp_sens = np.zeros([clf.cv, nparamcombos])
    for k in range(clf.cv):
        temp_sens[k, :] = clf.cv_results_['split%d_test_score' % k]

    # sensitivity of the sum squared error:
    print(np.sort(temp_sens, axis=0))
    sse_sens = ac_sens**2 + 2 * ac_sens * errorlimit + ac_sens**2 * np.max(
        np.sum(np.sort(temp_sens, axis=0)[0:clf.cv - 1, :], 0))

    #### find the SSE (for each param combo)
    clf = GridSearchCV(estimator=DPCloaking(sensitivity=ac_sens,
                                            inducing=4,
                                            getxvalfoldsensitivities=False,
                                            kern=kern,
                                            errorlimit=errorlimit),
                       param_grid=p_grid,
                       cv=cv)
    clf.fit(X, y)
    nparamcombos = len(clf.cv_results_['mean_test_score'])
    temp_scores = np.zeros([clf.cv, nparamcombos])
    for k in range(clf.cv):
        temp_scores[k, :] = clf.cv_results_['split%d_test_score' % k]
    scores = np.sum(temp_scores, 0)

    #### compute the probability of selecting each param combo using the
    #### exponential mechanism
    selection_epsilon = 1
    param_probabilities = np.exp(selection_epsilon * scores / (2 * sse_sens))
    param_probabilities = param_probabilities / np.sum(param_probabilities)
    return param_probabilities
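The function returns the exponential-mechanism selection probabilities but does not draw a parameter combination itself. A minimal sketch of that final selection step, assuming ParameterGrid enumerates the candidates in the same order GridSearchCV stores them in cv_results_['params']:

import numpy as np
from sklearn.model_selection import ParameterGrid

# Hypothetical follow-up (not part of the function above): sample one
# parameter combination according to the returned probabilities.
candidates = list(ParameterGrid(p_grid))
probs = getprobabilities(X, y, p_grid, cv=3, ac_sens=0.1)
chosen = candidates[np.random.choice(len(candidates), p=probs)]
print(chosen)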
def getscores(X, y, p_grid, cv, ac_sens):
    """Compute the negative mean squared error ('neg_mean_squared_error')
    of each of the fold/param combos."""
    kern = GPy.kern.RBF(2.0, lengthscale=25.0, variance=1.0)
    clf = GridSearchCV(estimator=DPCloaking(sensitivity=ac_sens,
                                            inducing=4,
                                            getxvalfoldsensitivities=False,
                                            kern=kern),
                       scoring='neg_mean_squared_error',
                       param_grid=p_grid,
                       cv=cv)
    clf.fit(X, y)
    nparamcombos = len(clf.cv_results_['mean_test_score'])
    scores = np.zeros([clf.cv, nparamcombos])
    for k in range(clf.cv):
        scores[k, :] = clf.cv_results_['split%d_test_score' % k]
    return scores
def grid_search(self, X, y, parameters, scoring=None, cv=5, refit=True,
                verbose=False):
    '''Perform an exhaustive search over hyperparameter combinations.

    # Arguments
        X: np.ndarray, features
        y: np.ndarray, labels
        parameters: dict, hyperparameter ranges
        scoring: dict, scoring functions e.g. {'acc': accuracy_score, ...}
        cv: int, number of cross-validation folds
        refit: bool, fit an estimator with the best parameters if True
        verbose: bool, if not True, UserWarnings are suppressed
    '''
    self.grid_search_parameters = {
        'estimator__estimator__' + k: v
        for k, v in parameters.items()
    }
    clf = self.clf
    if verbose is not True:
        warnings.filterwarnings("ignore", category=UserWarning)
    self.clf_grid_search = GridSearchCV(clf,
                                        self.grid_search_parameters,
                                        cv=cv,
                                        scoring=scoring,
                                        refit=refit,
                                        n_jobs=self.n_jobs)
    self.clf_grid_search.fit(X, y)
    print('\n`clf.best_estimator_`:\n',
          self.clf_grid_search.best_estimator_,
          '\n',
          sep='')
def main(): param_space = { "C": [1e-4, 1, 1e4], "gamma": [1e-3, 1, 1e3], "class_weight": [None, "balanced"], "kernel": ["linear", "poly", "rbf", "sigmoid"] } model = SVC() # param_space = {"n_estimators":[100, 200, 300, 400, 500], # "criterion":["gini", "entropy"], # "max_features":["auto", "sqrt", "log2"], # "max_depth":[2, 3, 4, 5, 6, 7, 8]} # model = RandomForestClassifier() digits = load_digits() # classifier = GridSearchCV(model, param_space, n_jobs=-1, cv=5) classifier = GridSearchCV(model, param_space, n_jobs=-1, cv=5) classifier.fit(digits.data, digits.target) print("Grid Scores:") means = classifier.cv_results_["mean_test_score"] standard_deviations = classifier.cv_results_["std_test_score"] for mean, standard_deviation, parameter in zip( means, standard_deviations, classifier.cv_results_["params"]): print("%0.3f (+/-%0.03f) for %r" % (mean, standard_deviation * 2, parameter)) print() print("Best Score: %0.3f" % (classifier.best_score_)) print() print("Best Parameters:") print(classifier.best_params_) print()
def evaluate_classifier(X, y, list_of_queries, set_k_range, k_function,
                        alpha_range, l1_ratio):
    '''Run a classifier setup on a set of queries.

    Loop through each query; train and test the classifier using the
    hyperparameters input as parameters; populate the metrics dictionary
    with some metrics of which parameters were selected and how well the
    classifier did for that query.
    '''
    # A dictionary to hold the performance metrics.
    metrics_dict = {}

    # Loop through each query; train and test the classifier; populate the
    # metrics dictionary.
    for query in list_of_queries:
        num_samples = query[2]['total']
        num_positives = query[2]['positive']

        # Subset by gene.
        y_query = y[query[0]]

        # Subset by diseases.
        disease_cols = [col for col in covariates.columns
                        if col.endswith(tuple(query[1]))]
        has_disease = covariates[disease_cols].max(axis=1) > 0
        covariates_query = covariates[has_disease]
        X_query = X[X.index.isin(covariates_query.index)]
        y_query = y_query[y_query.index.isin(covariates_query.index)]

        # Test/train split.
        test_size = 0.2
        X_train, X_test, y_train, y_test = train_test_split(
            X_query, y_query, stratify=y_query, test_size=test_size,
            random_state=RANDOMSEED)

        # PCA.
        scaler = StandardScaler()
        if query[2]['total'] * (1 - test_size) * (1 - (1 / 3)) > 350:
            n_comp = 350
        else:
            n_comp = int(query[2]['total'] * (1 - test_size) - 1)
        pca = PCA(n_components=n_comp, random_state=RANDOMSEED)
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        pca.fit(X_train_scaled)
        X_train = pca.transform(X_train_scaled)
        X_test_scaled = scaler.transform(X_test)
        X_test = pca.transform(X_test_scaled)

        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(num_samples=num_samples,
                                 num_positives=num_positives)

        # Parameter sweep for hyperparameters.
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[
            ('select', SelectKBest(variance_scorer)),
            ('classify', SGDClassifier(random_state=RANDOMSEED,
                                       class_weight='balanced'))
        ])
        cv_pipeline = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   n_jobs=1,
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)

        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            roc_df = pd.DataFrame.from_items(roc_items)
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}

        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.
        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']

        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'

        if cv_pipeline.best_params_['classify__alpha'] == min(param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'

        metrics = {'num_samples': num_samples,
                   'num_positive': num_positives,
                   'balance': num_positives / num_samples,
                   'train_auroc': metrics_train['auroc'],
                   'test_auroc': metrics_test['auroc'],
                   'n_components': cv_pipeline.best_params_['select__k'],
                   'alpha': cv_pipeline.best_params_['classify__alpha'],
                   'overfit': overfit,
                   'n_comp_status': n_comp_status,
                   'alpha_status': alpha_status}

        # Add the metrics to the dictionary.
        metrics_dict[query[0] + str(query[2]['total'])] = metrics

    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[['num_samples', 'num_positive', 'balance',
                             'n_components', 'n_comp_status', 'alpha',
                             'alpha_status', 'train_auroc', 'test_auroc',
                             'overfit']]
    return metrics_df
                                  stratify=strat)

clf_parameters = {
    'classify__loss': ['log'],
    'classify__penalty': ['elasticnet'],
    'classify__alpha': alphas,
    'classify__l1_ratio': l1_ratios
}

estimator = Pipeline(steps=[(
    'classify',
    SGDClassifier(random_state=0, class_weight='balanced', loss='log'))])

cv_pipeline = GridSearchCV(estimator=estimator,
                           param_grid=clf_parameters,
                           n_jobs=-1,
                           cv=folds,
                           scoring='roc_auc')
cv_pipeline.fit(X=x_train, y=y_train)

cv_results = pd.concat([
    pd.DataFrame(cv_pipeline.cv_results_).drop('params', axis=1),
    pd.DataFrame.from_records(cv_pipeline.cv_results_['params'])
], axis=1)

# Cross-validated performance heatmap
cv_score_mat = pd.pivot_table(cv_results,
                              values='mean_test_score',
                              index='classify__l1_ratio',
                              columns='classify__alpha')
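The pivot table above is presumably intended for a heatmap of mean cross-validated scores. A minimal plotting sketch, assuming seaborn and matplotlib are available (they are not shown in the snippet):

import matplotlib.pyplot as plt
import seaborn as sns

# Hypothetical visualization of the cross-validated scores pivoted above.
ax = sns.heatmap(cv_score_mat, annot=True, fmt='.2f')
ax.set_xlabel('classify__alpha')
ax.set_ylabel('classify__l1_ratio')
plt.show()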
    'expressions': Pipeline([
        ('features', FeatureUnion([('expressions', expression_features)])),
        ('classify', classifier)
    ]),
    'covariates': Pipeline([
        ('features', FeatureUnion([('covariates', covariate_features)])),
        ('classify', classifier)
    ])
}

# Construct cross-validated grid searches
cv_pipelines = dict()
for model, pipeline in pipeline_definitions.items():
    cv = StratifiedKFold(n_splits=3, random_state=0)
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[model],
        cv=cv,
        n_jobs=1,
        scoring='roc_auc',
    )
    cv_pipelines[model] = grid_search

# In[13]:

# Fit the models
for model, pipeline in cv_pipelines.items():
    print('Fitting CV for model: {0}'.format(model))
    start_time = time.perf_counter()
    pipeline.fit(X=X_train, y=y_train)
    end_time = time.perf_counter()
    elapsed = datetime.timedelta(seconds=end_time - start_time)
    print('\truntime: {}'.format(elapsed))
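Once fitted, each grid search exposes its best cross-validated score and parameters. A small follow-up sketch (not part of the original cell):

# Hypothetical summary of the fitted grid searches defined above.
for model, pipeline in cv_pipelines.items():
    print('{0}: best CV AUROC = {1:.3f}'.format(model, pipeline.best_score_))
    print('\tbest params: {0}'.format(pipeline.best_params_))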
def evaluate_classifier(X_train, X_test, y, y_train_allgenes, y_test_allgenes,
                        list_of_genes, set_k_range, k_function, alpha_range,
                        l1_ratio):
    '''Run a classifier setup on a set of queries.

    Loop through each query; train and test the classifier using the
    hyperparameters input as parameters; populate the metrics dictionary
    with some metrics of which parameters were selected and how well the
    classifier did for that query.
    '''
    # A dictionary to hold the performance metrics.
    metrics_dict = {}

    # Loop through each query; train and test the classifier; populate the
    # metrics dictionary.
    for gene in list_of_genes:

        # Train and test the classifier.
        y_gene = y[gene]
        y_train = y_train_allgenes[gene]
        y_test = y_test_allgenes[gene]
        num_positives = int(y_gene.value_counts(True)[1] * len(y_gene))
        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(num_positives)

        # Parameter sweep for hyperparameters.
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[('select', SelectKBest(variance_scorer)),
                                   ('classify',
                                    SGDClassifier(random_state=RANDOMSEED,
                                                  class_weight='balanced'))])
        cv_pipeline = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   n_jobs=1,
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)

        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            roc_df = pd.DataFrame.from_items(roc_items)
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}

        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.
        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']

        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(
                param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(
                param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'

        if cv_pipeline.best_params_['classify__alpha'] == min(
                param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(
                param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'

        metrics = {
            'num_positive': num_positives,
            'train_auroc': metrics_train['auroc'],
            'test_auroc': metrics_test['auroc'],
            'n_components': cv_pipeline.best_params_['select__k'],
            'alpha': cv_pipeline.best_params_['classify__alpha'],
            'overfit': overfit,
            'n_comp_status': n_comp_status,
            'alpha_status': alpha_status
        }

        # Add the metrics to the dictionary.
        metrics_dict[gene] = metrics

    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[[
        'num_positive', 'n_components', 'n_comp_status', 'alpha',
        'alpha_status', 'train_auroc', 'test_auroc', 'overfit'
    ]]
    return metrics_df
def evaluate_classifier(X_train, X_test, y, y_train_allgenes, y_test_allgenes,
                        list_of_genes, set_k_range, k_function, alpha_range,
                        l1_ratio):
    '''Run a classifier setup on a set of queries.

    Loop through each query; train and test the classifier using the
    hyperparameters input as parameters; populate the metrics dictionary
    with some metrics of which parameters were selected and how well the
    classifier did for that query.
    '''
    # A dictionary to hold the performance metrics.
    metrics_dict = {}

    # Loop through each query; train and test the classifier; populate the
    # metrics dictionary.
    for gene in list_of_genes:

        # Train and test the classifier.
        y_gene = y[gene]
        y_train = y_train_allgenes[gene]
        y_test = y_test_allgenes[gene]
        num_positives = int(y_gene.value_counts(True)[1] * len(y_gene))
        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(num_positives)

        # Parameter sweep for hyperparameters.
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[
            ('select', SelectKBest(variance_scorer)),
            ('classify', SGDClassifier(random_state=RANDOMSEED,
                                       class_weight='balanced'))
        ])
        cv_pipeline = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   n_jobs=1,
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)

        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            roc_df = pd.DataFrame.from_items(roc_items)
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}

        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.
        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']

        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'

        if cv_pipeline.best_params_['classify__alpha'] == min(param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'

        metrics = {'num_positive': num_positives,
                   'train_auroc': metrics_train['auroc'],
                   'test_auroc': metrics_test['auroc'],
                   'n_components': cv_pipeline.best_params_['select__k'],
                   'alpha': cv_pipeline.best_params_['classify__alpha'],
                   'overfit': overfit,
                   'n_comp_status': n_comp_status,
                   'alpha_status': alpha_status}

        # Add the metrics to the dictionary.
        metrics_dict[gene] = metrics

    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[['num_positive', 'n_components', 'n_comp_status',
                             'alpha', 'alpha_status', 'train_auroc',
                             'test_auroc', 'overfit']]
    return metrics_df
if model_choice == 'random forests':
    print("\n\n")
    print("\t\t Special Grid Search for random forest")
    print("\n\n")
    c = Client(scheduler_address, set_as_default=True)
    grid_search = DaskRandomizedSearchCV(model, param_grid, cv=cv_temp,
                                         get=c.get)
else:
    c = Client(scheduler_address, set_as_default=True)
    grid_search = GridSearchCV(model, param_grid, cv=cv_temp)

grid_search.fit(X, y)
time_elapse = time() - t0
runing_time.append([model_choice, cv_temp, time_elapse])
print(" running time: ", time_elapse)

num_graph = len(grid_search.dask_graph_)
print(" size of graph: ", num_graph)

runing_time_df = pd.DataFrame(data=runing_time,
                              columns=['model', 'cv', 'time'])
runing_time_df['n_workers'] = n_workers
runing_time_df['n_graph'] = num_graph
runing_time_df['sample'] = sample
class Classifier(Base):
    '''Classifier base class.
    '''

    def __init__(self, clf, scale=False, n_jobs=1):
        super().__init__()

        # A classifier is built using a `Pipeline` for convenience of chaining
        # multiple preprocessing steps before the classifier
        pipeline = []

        # Centre data by scaling to zero mean and unit variance
        if scale is True:
            pipeline.append(('standard_scaler', StandardScaler()))

        # Add the `clf` estimator and build the `Pipeline`
        pipeline.append(('estimator', clf))
        self.clf = Pipeline(pipeline)

        self.n_jobs = n_jobs

    def __name__(self):
        return self.__class__.__name__

    def grid_search(self, X, y, parameters, scoring=None, cv=5, refit=True,
                    verbose=False):
        '''Perform an exhaustive search over hyperparameter combinations.

        # Arguments
            X: np.ndarray, features
            y: np.ndarray, labels
            parameters: dict, hyperparameter ranges
            scoring: dict, scoring functions e.g. {'acc': accuracy_score, ...}
            cv: int, number of cross-validation folds
            refit: bool, fit an estimator with the best parameters if True
            verbose: bool, if not True, UserWarnings are suppressed
        '''
        self.grid_search_parameters = {
            'estimator__estimator__' + k: v
            for k, v in parameters.items()
        }
        clf = self.clf
        if verbose is not True:
            warnings.filterwarnings("ignore", category=UserWarning)
        self.clf_grid_search = GridSearchCV(clf,
                                            self.grid_search_parameters,
                                            cv=cv,
                                            scoring=scoring,
                                            refit=refit,
                                            n_jobs=self.n_jobs)
        self.clf_grid_search.fit(X, y)
        print('\n`clf.best_estimator_`:\n',
              self.clf_grid_search.best_estimator_,
              '\n',
              sep='')

    def fit(self, X, y):
        '''Fit the estimator.

        # Arguments
            X: np.ndarray, features
            y: np.ndarray, labels
        '''
        # Fit classifier using the best parameters from GridSearchCV
        try:
            getattr(self.clf_grid_search, 'best_estimator_')
            fit_using = "clf_grid_search"
        except AttributeError:
            # Fit classifier from __init__
            fit_using = "clf"
            self.clf.fit(X, y)
        finally:
            print(f'\nFit using `{fit_using}`')

    def get_clf(self):
        '''Get the best estimator.

        If a grid search has been performed, then the `best_estimator_` is
        returned, else the estimator used to initialise the object is
        returned.

        # Returns
            clf: sklearn estimator
        '''
        try:
            return self.clf_grid_search.best_estimator_
        except AttributeError:
            return self.clf

    def predict(self, X):
        '''Predict the classes of samples using features.

        # Arguments
            X: np.ndarray, features

        # Returns
            predictions: np.ndarray, class predictions
        '''
        self.predictions = self.get_clf().predict(X)
        return self.predictions

    def predict_proba(self, X):
        '''Predict the class-membership probabilities of samples.

        # Arguments
            X: np.ndarray, features

        # Returns
            probabilities: np.ndarray, class probabilities
        '''
        self.probabilities = self.get_clf().predict_proba(X)
        return self.probabilities

    def decision_function(self, X):
        '''Decision function.

        # Arguments
            X: np.ndarray, features

        # Returns
            decisions: np.ndarray, distances of samples to the decision
            boundary
        '''
        try:
            self.decisions = self.get_clf().decision_function(X)
            return self.decisions
        except AttributeError:
            raise AttributeError(
                'decision_function is not implemented for '
                f'{self.__name__()}') from None

    def score(self, X, y):
        '''Mean accuracy score on test data.

        # Arguments
            X: np.ndarray, test features
            y: np.ndarray, test labels
        '''
        return self.get_clf().score(X, y)

    def accuracy(self, y_true):
        '''Accuracy score.

        # Arguments
            y_true: np.ndarray, true labels

        # Returns
            accuracy_score: float
        '''
        self.accuracy_score = accuracy_score(y_true, self.predictions)
        return self.accuracy_score
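A hypothetical usage sketch for this wrapper (not from the source): because grid_search() prefixes parameter names with 'estimator__estimator__', the `clf` passed to the constructor is assumed to be a meta-estimator that itself exposes an `estimator` parameter, for example OneVsRestClassifier wrapping an SVC.

from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# Hypothetical example; assumes the Classifier class and its module-level
# imports (Pipeline, GridSearchCV, StandardScaler, warnings, ...) are in scope.
X, y = load_iris(return_X_y=True)
model = Classifier(OneVsRestClassifier(SVC()), scale=True, n_jobs=-1)
# 'C' becomes 'estimator__estimator__C' -> the inner SVC's C parameter.
model.grid_search(X, y, parameters={'C': [0.1, 1, 10]}, scoring='accuracy')
model.fit(X, y)          # reuses the refit best_estimator_ from the search
print(model.score(X, y))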
clf_parameters = {'classify__C': cs, 'classify__penalty': penalties}
estimator = Pipeline(steps=[('classify',
                             LogisticRegression(random_state=123,
                                                class_weight='balanced',
                                                multi_class='ovr',
                                                max_iter=100,
                                                solver='saga'))])

# Custom scorer that optimizes f1 score weighted by class proportion
weighted_f1_scorer = make_scorer(f1_score, average='weighted')

# Cross validation pipeline
cv_pipeline = GridSearchCV(estimator=estimator,
                           param_grid=clf_parameters,
                           n_jobs=-1,
                           cv=5,
                           return_train_score=True,
                           scoring=weighted_f1_scorer)

# ### Fit Model
#
# This takes a couple minutes to train. For many model parameters, sklearn
# will throw convergence warnings. This means that after 100 iterations, an
# optimal solution is not found. Prevent the warnings from being printed
# redundantly.

# In[8]:

get_ipython().run_cell_magic(
    'time', '',
    'with warnings.catch_warnings():\n    warnings.simplefilter("ignore")\n'
    '    cv_pipeline.fit(X=x_df, y=y_df.ras_status)')

# In[9]:
def evaluate_classifier(X, y, list_of_queries, set_k_range, k_function,
                        alpha_range, l1_ratio):
    '''Run a classifier setup on a set of queries.

    Loop through each query; train and test the classifier using the
    hyperparameters input as parameters; populate the metrics dictionary
    with some metrics of which parameters were selected and how well the
    classifier did for that query.
    '''
    # A dictionary to hold the performance metrics.
    metrics_dict = {}

    # Loop through each query; train and test the classifier; populate the
    # metrics dictionary.
    for query in list_of_queries:
        num_samples = query[2]['total']
        num_positives = query[2]['positive']

        # Subset by gene.
        y_query = y[query[0]]

        # Subset by diseases.
        disease_cols = [
            col for col in covariates.columns
            if col.endswith(tuple(query[1]))
        ]
        has_disease = covariates[disease_cols].max(axis=1) > 0
        covariates_query = covariates[has_disease]
        X_query = X[X.index.isin(covariates_query.index)]
        y_query = y_query[y_query.index.isin(covariates_query.index)]

        # Test/train split.
        test_size = 0.2
        X_train, X_test, y_train, y_test = train_test_split(
            X_query,
            y_query,
            stratify=y_query,
            test_size=test_size,
            random_state=RANDOMSEED)

        # PCA.
        scaler = StandardScaler()
        if query[2]['total'] * (1 - test_size) * (1 - (1 / 3)) > 350:
            n_comp = 350
        else:
            n_comp = int(query[2]['total'] * (1 - test_size) - 1)
        pca = PCA(n_components=n_comp, random_state=RANDOMSEED)
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        pca.fit(X_train_scaled)
        X_train = pca.transform(X_train_scaled)
        X_test_scaled = scaler.transform(X_test)
        X_test = pca.transform(X_test_scaled)

        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(num_samples=num_samples,
                                 num_positives=num_positives)

        # Parameter sweep for hyperparameters.
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[('select', SelectKBest(variance_scorer)),
                                   ('classify',
                                    SGDClassifier(random_state=RANDOMSEED,
                                                  class_weight='balanced'))])
        cv_pipeline = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   n_jobs=1,
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)

        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            roc_df = pd.DataFrame.from_items(roc_items)
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}

        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.
        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']

        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(
                param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(
                param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'

        if cv_pipeline.best_params_['classify__alpha'] == min(
                param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(
                param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'

        metrics = {
            'num_samples': num_samples,
            'num_positive': num_positives,
            'balance': num_positives / num_samples,
            'train_auroc': metrics_train['auroc'],
            'test_auroc': metrics_test['auroc'],
            'n_components': cv_pipeline.best_params_['select__k'],
            'alpha': cv_pipeline.best_params_['classify__alpha'],
            'overfit': overfit,
            'n_comp_status': n_comp_status,
            'alpha_status': alpha_status
        }

        # Add the metrics to the dictionary.
        metrics_dict[query[0] + str(query[2]['total'])] = metrics

    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[[
        'num_samples', 'num_positive', 'balance', 'n_components',
        'n_comp_status', 'alpha', 'alpha_status', 'train_auroc', 'test_auroc',
        'overfit'
    ]]
    return metrics_df
    'classify__loss': ['log'],
    'classify__penalty': ['elasticnet'],
    'classify__alpha': alphas,
    'classify__l1_ratio': l1_ratios
}

estimator = Pipeline(steps=[('classify',
                             SGDClassifier(random_state=0,
                                           class_weight='balanced',
                                           loss='log',
                                           max_iter=50,
                                           tol=1e-3))])

cv_pipeline = GridSearchCV(estimator=estimator,
                           param_grid=clf_parameters,
                           n_jobs=-1,
                           cv=n_folds,
                           scoring='average_precision',
                           return_train_score=True)

shuffle_cv_pipeline = GridSearchCV(estimator=estimator,
                                   param_grid=clf_parameters,
                                   n_jobs=-1,
                                   cv=n_folds,
                                   scoring='average_precision',
                                   return_train_score=True)

# In[10]:

# Fit Regular Pipeline
cv_pipeline.fit(X=x_train_df, y=y_train_df.status.tolist())