def test_backend_respected():
    clf = RandomForestClassifier(n_estimators=10, n_jobs=2)

    with parallel_backend("testing") as (ba, n_jobs):
        clf.fit(X, y)

    assert ba.count > 0

    # predict_proba requires shared memory. Ensure that's honored.
    with parallel_backend("testing") as (ba, _):
        clf.predict_proba(X)

    assert ba.count == 0
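For context, a hedged sketch of how a counting "testing" backend like the one asserted on above could be registered with joblib. MyBackend and the private LokyBackend import are assumptions for illustration, not necessarily the project's actual fixture; start_call() is invoked once per Parallel run, so the counter records whether joblib actually dispatched work to this backend.

import joblib
from joblib._parallel_backends import LokyBackend  # private joblib module; assumed available

class MyBackend(LokyBackend):
    """Loky-based backend that counts how many Parallel runs it served."""

    def __init__(self, *args, **kwargs):
        self.count = 0
        super().__init__(*args, **kwargs)

    def start_call(self):
        self.count += 1
        return super().start_call()

joblib.register_parallel_backend("testing", MyBackend)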
def main(argv):
    with parallel_backend('threading'):
        pipeline()
def update_spatial(Y, A, b, C, f, sn, gs_sigma=6, dl_wnd=5, sparse_penal=0.5,
                   update_background=True, post_scal=False, normalize=True,
                   zero_thres='eps', sched='single-threaded'):
    _T = len(Y.coords['frame'])
    print("estimating penalty parameter")
    cct = C.dot(C, 'frame')
    alpha = sparse_penal * sn * np.sqrt(np.max(np.diag(cct))) / _T
    alpha = alpha.persist()
    print("computing subsetting matrix")
    if dl_wnd:
        selem = moph.disk(dl_wnd)
        sub = xr.apply_ufunc(
            cv2.dilate,
            A.chunk(dict(height=-1, width=-1)),
            input_core_dims=[['height', 'width']],
            output_core_dims=[['height', 'width']],
            vectorize=True,
            kwargs=dict(kernel=selem),
            dask='parallelized',
            output_dtypes=[A.dtype])
        sub = (sub > 0)
    else:
        sub = xr.apply_ufunc(np.ones_like, A.compute())
    if update_background:
        A = xr.concat([A, b.assign_coords(unit_id=-1)], 'unit_id')
        b_erd = xr.apply_ufunc(
            cv2.erode,
            b.chunk(dict(height=-1, width=-1)),
            input_core_dims=[['height', 'width']],
            output_core_dims=[['height', 'width']],
            kwargs=dict(kernel=selem),
            dask='parallelized',
            output_dtypes=[b.dtype])
        sub = xr.concat(
            [sub, (b_erd > 0).astype(bool).assign_coords(unit_id=-1)],
            'unit_id')
        C = xr.concat([C, f.assign_coords(unit_id=-1)], 'unit_id')
    sub = sub.persist()
    print("fitting spatial matrix")
    A_new = xr.apply_ufunc(
        update_spatial_perpx,
        Y.chunk(dict(frame=-1)),
        alpha,
        sub.chunk(dict(unit_id=-1)),
        C.chunk(dict(frame=-1, unit_id=-1)),
        input_core_dims=[['frame'], [], ['unit_id'], ['frame', 'unit_id']],
        output_core_dims=[['unit_id']],
        vectorize=True,
        dask='parallelized',
        output_dtypes=[Y.dtype])
    try:
        with parallel_backend('dask'):
            A_new = A_new.persist()
    except ValueError:
        with da.config.set(scheduler=sched):
            A_new = A_new.persist()
    print("removing empty units")
    if zero_thres == 'eps':
        zero_thres = np.finfo(A_new.dtype).eps
    A_new = A_new.where(A_new > zero_thres).fillna(0)
    non_empty = (A_new.sum(['width', 'height']) > 0).compute()
    A_new = A_new.where(non_empty, drop=True)
    C_new = C.where(non_empty, drop=True)
    A_new = rechunk_like(A_new, A).persist()
    C_new = rechunk_like(C_new, C).persist()
    if post_scal and len(A_new) > 0:
        print("post-hoc scaling")
        A_new_flt = (A_new.stack(spatial=['height', 'width']).compute())
        Y_flt = (Y.mean('frame').stack(spatial=['height', 'width']).compute())

        def lstsq(a, b):
            return np.linalg.lstsq(a, b, rcond=-1)[0]

        scale = xr.apply_ufunc(
            lstsq, A_new_flt, Y_flt,
            input_core_dims=[['spatial', 'unit_id'], ['spatial']],
            output_core_dims=[['unit_id']])
        C_mean = C.mean('frame').compute()
        scale = scale / C_mean
        A_new = A_new * scale
        try:
            A_new = A_new.persist()
        except np.linalg.LinAlgError:
            warnings.warn("post-hoc scaling failed", RuntimeWarning)
    if update_background:
        print("updating background")
        try:
            b_new = A_new.sel(unit_id=-1)
            b_new = b_new / da.array.linalg.norm(b_new.data)
            f_new = xr.apply_ufunc(
                da.array.tensordot, Y, b_new,
                input_core_dims=[['frame', 'height', 'width'],
                                 ['height', 'width']],
                output_core_dims=[['frame']],
                kwargs=dict(axes=[(1, 2), (0, 1)]),
                dask='allowed').persist()
            A_new = A_new.drop(-1, 'unit_id')
            C_new = C_new.drop(-1, 'unit_id')
        except KeyError:
            print("background terms are empty")
            b_new = xr.zeros_like(b)
            f_new = xr.zeros_like(f)
    else:
        b_new = b
        f_new = f
    if normalize and len(A_new) > 0:
        print("normalizing result")
        A_norm = xr.apply_ufunc(
            darr.linalg.norm,
            A_new.stack(spatial=['height', 'width']),
            input_core_dims=[['spatial', 'unit_id']],
            output_core_dims=[['unit_id']],
            kwargs=dict(axis=0),
            dask='allowed')
        A_new = (A_new / A_norm).persist()
    return A_new, b_new, C_new, f_new
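A note on the try/except above: joblib's 'dask' backend needs an active dask.distributed Client and raises ValueError otherwise, which is what triggers the fallback to the local scheduler. A minimal self-contained sketch of that fallback pattern follows; persist_with_fallback and the threading fallback are illustrative stand-ins, not part of the function above.

from joblib import Parallel, delayed, parallel_backend

def persist_with_fallback(values):
    # Hypothetical helper: prefer a dask.distributed cluster when a Client is
    # active; otherwise parallel_backend('dask') raises ValueError and we fall
    # back to a local joblib backend.
    try:
        with parallel_backend('dask'):
            return Parallel()(delayed(abs)(v) for v in values)
    except ValueError:
        with parallel_backend('threading'):
            return Parallel()(delayed(abs)(v) for v in values)

print(persist_with_fallback([-1, -2, -3]))  # [1, 2, 3]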
# Standardize numeric features
scaler = StandardScaler()
scaler.fit(trainX)
trainX = scaler.transform(trainX)
testX = scaler.transform(testX)

# Add categorical features
trainX = np.append(trainX, train_missing_price, 1)
trainX = np.append(trainX, train_category, 1)
testX = np.append(testX, test_missing_price, 1)
testX = np.append(testX, test_category, 1)
print(trainX.shape)

# fit lasso model
with parallel_backend('threading'):
    m = LassoCV(normalize=False, cv=5, verbose=True).fit(trainX, trainY)

# save model
file_name = 'linear_model.joblib'
dump(m, file_name)


def load_model():
    return load(file_name)


def plot():
    # show results for fit data
    plt.figure()
    ax = plt.subplot(111)
def runCrossValidate(self, verbose=False):
    self.logger.log("Cross-validate started...", self.step_n,
                    message="Running cross validation")
    n_jobs = self.cv_n_jobs
    cv_results = {}
    new_cv_results = {}
    cv = self.getCV()
    # n_jobs = -1
    if verbose:
        logger.info(
            f"RunCrossValidate - n_jobs: {n_jobs}, scorer_list: {self.scorer_list}")
    for pipe_name, model in self.model_dict.items():
        if verbose:
            logger.info(
                f"RunCrossValidate - Running CV on pipe_name: {pipe_name}")
        start = time.time()
        dask_scheduler = os.getenv(
            "DASK_SCHEDULER",
            "tcp://" + socket.gethostbyname(socket.gethostname()) + ":8786")
        client = Client(dask_scheduler)
        with parallel_backend('dask', n_jobs=-1):  # 40min test case
            model_i = cross_validate(model,
                                     self.X_df,
                                     self.y_df.iloc[:, 0],
                                     return_estimator=True,
                                     scoring=self.scorer_list,
                                     cv=cv,
                                     n_jobs=1,
                                     verbose=3)
        end = time.time()
        if verbose:
            logger.info(
                f"SCORES - {pipe_name},"
                f"{[(scorer, np.mean(model_i[f'test_{scorer}'])) for scorer in self.scorer_list]},"
                f" runtime: {(end - start) / 60} min.")
            logger.info(f"MODELS - {pipe_name},{model_i}")
        cv_results[pipe_name] = model_i
    if self.run_stacked:
        for est_name, result in cv_results.items():
            if type(result['estimator'][0]) is MultiPipe:
                new_results = {}
                for mp in result['estimator']:
                    for est_n, m in mp.build_individual_fitted_pipelines().items():
                        if est_n not in new_results:
                            new_results[est_n] = []
                        new_results[est_n].append(m)
                for est_n, est_models in new_results.items():
                    # Avoid clobbering an existing pipeline name; keep looking up
                    # the fitted estimators under the original key.
                    key = est_n + '_fcombo' if est_n in cv_results else est_n
                    new_cv_results[key] = {'estimator': est_models}
        cv_results = {**new_cv_results, **cv_results}
    if verbose:
        logger.info("CV Results: {}".format(cv_results))
    self.cv_results = cv_results
    self.logger.log("Cross-validate complete.", self.step_n,
                    message="Completed cross validation")
def get_ensemble_models(X_train, y_train, X_test, y_test, day_, sector,
                        best_params, model_name, path_):
    """
    Look for the best ensemble model.

    :param X_train: Set of features for training.
    :param y_train: Set of targets for training.
    :param X_test: Set of features for testing.
    :param y_test: Set of targets for testing.
    :param int day_: Time window for features.
    :param str sector: GICS sector used to filter the predictions.
    :param list best_params: List of best params for the basic models.
    :param str or None model_name: Name of the best model; if None, the model
        has to be trained.
    :param str path_: Path where the models are saved.
    :return tuple of dict: Tuple of dicts with the different models.
    """
    t_init = time.time()
    scoring = 'precision_macro'
    dict_vot = {}
    dict_bagg = {}
    dict_ada = {}
    dict_total = {}
    check_ = (sector, day_)
    if (check_ != ('Financials', 3)) and (check_ != ('Financials', 5)):
        if model_name is None:
            with parallel_backend('threading', n_jobs=2):
                vot = VotingClassifier(estimators=[
                    ('DecTree', DecisionTreeClassifier(**best_params[2])),
                    ('KNN', KNeighborsClassifier(**best_params[1])),
                    ('LogReg', LogisticRegression(**best_params[0]))])
                val_score_vot = cross_val_score(vot, X_train, y_train, cv=3,
                                                scoring=scoring).mean()
                vot.fit(X_train, y_train)
                report = classification_report(y_test, vot.predict(X_test),
                                               digits=4, output_dict=True)
            dict_vot['vot'] = [vot, val_score_vot, report]
            prec_vot = float(dict_vot['vot'][2]['weighted avg']['precision'])
        elif model_name == 'vot':
            vot = get_best_ensem_model(path_)
            vot.fit(X_train, y_train)
            report = classification_report(y_test, vot.predict(X_test),
                                           digits=4, output_dict=True)
            dict_vot['vot'] = [vot, report]
            prec_vot = float(dict_vot['vot'][1]['weighted avg']['precision'])
        else:
            prec_vot = 0
    else:
        prec_vot = 0
    show_time(
        t_init, time.time(),
        'Time to train vot for %d days, %s sector and %s' %
        (day_, sector, scoring))

    if model_name is None:
        with parallel_backend('threading', n_jobs=2):
            bagg = BaggingClassifier(
                base_estimator=KNeighborsClassifier(**best_params[1]))
            val_score_bagg = cross_val_score(bagg, X_train, y_train, cv=3,
                                             scoring=scoring).mean()
            bagg.fit(X_train, y_train)
            report = classification_report(y_test, bagg.predict(X_test),
                                           digits=4, output_dict=True)
        dict_bagg['bagg'] = [bagg, val_score_bagg, report]
        prec_bagg = float(dict_bagg['bagg'][2]['weighted avg']['precision'])
    elif model_name == 'bagg':
        bagg = get_best_ensem_model(path_)
        bagg.fit(X_train, y_train)
        report = classification_report(y_test, bagg.predict(X_test),
                                       digits=4, output_dict=True)
        dict_bagg['bagg'] = [bagg, report]
        prec_bagg = float(dict_bagg['bagg'][1]['weighted avg']['precision'])
    else:
        prec_bagg = 0
    show_time(
        t_init, time.time(),
        'Time to train bagg for %d days, %s sector and %s' %
        (day_, sector, scoring))

    if (check_ != ('Financials', 3)) and (check_ != ('Financials', 5)):
        if model_name is None:
            with parallel_backend('threading', n_jobs=2):
                ada = AdaBoostClassifier(
                    base_estimator=DecisionTreeClassifier(**best_params[2]))
                val_score_ada = cross_val_score(ada, X_train, y_train, cv=3,
                                                scoring=scoring).mean()
                ada.fit(X_train, y_train)
                report = classification_report(y_test, ada.predict(X_test),
                                               digits=4, output_dict=True)
            dict_ada['ada'] = [ada, val_score_ada, report]
            prec_ada = float(dict_ada['ada'][2]['weighted avg']['precision'])
        elif model_name == 'ada':
            ada = get_best_ensem_model(path_)
            ada.fit(X_train, y_train)
            report = classification_report(y_test, ada.predict(X_test),
                                           digits=4, output_dict=True)
            dict_ada['ada'] = [ada, report]
            prec_ada = float(dict_ada['ada'][1]['weighted avg']['precision'])
        else:
            prec_ada = 0
    else:
        prec_ada = 0
    show_time(
        t_init, time.time(),
        'Time to train ada for %d days, %s sector and %s' %
        (day_, sector, scoring))

    if prec_vot >= prec_bagg:
        if prec_vot >= prec_ada:
            dict_total['best'] = dict_vot
        else:
            dict_total['best'] = dict_ada
    else:
        if prec_bagg >= prec_ada:
            dict_total['best'] = dict_bagg
        else:
            dict_total['best'] = dict_ada

    return dict_total, dict_vot, dict_bagg, dict_ada
def train_classifier(d2v, training_vectors, training_labels):
    logging.info("Classifier training")
    train_vectors = get_vectors(d2v, training_vectors, 300, 'Train')

    # Find the optimal Random Forest Classifier hyperparameters
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(100, 500, num=11)]
    max_depth.append(None)

    with parallel_backend('threading'):
        rfc = RandomForestClassifier(n_jobs=1)
        random_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth
        }
        rfc_random = RandomizedSearchCV(estimator=rfc,
                                        param_distributions=random_grid,
                                        n_iter=10,
                                        cv=3,
                                        verbose=2,
                                        random_state=42,
                                        n_jobs=WORKERS)
        rfc_random.fit(train_vectors, np.array(training_labels))
        best_parameters = rfc_random.best_params_
        model = RandomForestClassifier(
            n_estimators=best_parameters['n_estimators'],
            max_features=best_parameters['max_features'],
            max_depth=best_parameters['max_depth'],
            n_jobs=WORKERS)
        # model = RandomForestClassifier(n_jobs=WORKERS)
        # print("train_vectors shape", train_vectors.shape)
        # print('train_label shape', training_labels.shape)
        model.fit(train_vectors, np.array(training_labels))

    model_file = os.path.join(path_model, CLASSIFICATION_MODEL_NAME)
    pickle.dump(model, open(model_file, 'wb'))
    logging.info("Classification model saved on :{}".format(model_file))
    # model = pickle.load(open(model_file, 'rb'))

    training_predictions = model.predict(train_vectors)
    validate_df = pd.DataFrame(training_predictions, columns=[
        'target', 'male', 'female', 'homosexual_gay_or_lesbian', 'christian',
        'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
    ])
    validate_df[GENSIM_MODEL_NAME] = pd.DataFrame(training_predictions[:, 0])
    validate_df.head()
    bias_metrics_df = compute_bias_metrics_for_model(validate_df,
                                                     identity_columns,
                                                     GENSIM_MODEL_NAME,
                                                     TOXICITY_COLUMN)
    performance = get_final_metric(
        bias_metrics_df, calculate_overall_auc(validate_df, GENSIM_MODEL_NAME))
    logging.info('Training predicted classes: {}'.format(
        np.unique(training_predictions)))
    logging.info('Training accuracy: {}'.format(
        accuracy_score(training_labels, training_predictions)))
    logging.info('Training F1 score: {}'.format(
        f1_score(training_labels, training_predictions, average='weighted')))
    logging.info('Training Bias Metric: {}'.format(performance))
    logging.info("Saving classification model")
    return model
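The outer parallel_backend('threading') context above is what makes the search's internal joblib workers use threads instead of the default loky processes, while n_jobs still controls how many run at once. A small self-contained sketch of that pattern on toy data (the dataset, grid, and parameter values here are illustrative, not taken from the snippet above):

from joblib import parallel_backend
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
search = RandomizedSearchCV(
    RandomForestClassifier(random_state=0),
    param_distributions={'n_estimators': [10, 50, 100]},
    n_iter=3, cv=3, random_state=42, n_jobs=2)

# Inside this context, joblib dispatches the CV fits to threads.
with parallel_backend('threading'):
    search.fit(X, y)
print(search.best_params_)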
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 19 19:39:52 2019

@author: amitabh.gunjan
"""
from operator import neg

from sklearn.utils import parallel_backend, Parallel, delayed
import joblib

with parallel_backend('threading'):
    # Negates 1..5 under the threading backend; prints [-1, -2, -3, -4, -5].
    print(joblib.Parallel()(joblib.delayed(neg)(i + 1) for i in range(5)))
def predict(self, model_id, matrix_store, misc_db_parameters, train_matrix_columns):
    """Generate predictions and store them in the database

    Args:
        model_id (int) the id of the trained model to predict based off of
        matrix_store (catwalk.storage.MatrixStore) a wrapper for the
            prediction matrix and metadata
        misc_db_parameters (dict): attributes and values to add to each
            TrainPrediction or TestPrediction object in the results schema
        train_matrix_columns (list): The order of columns that the model was
            trained on

    Returns:
        (np.Array) the generated prediction values
    """
    # Setting the Prediction object type - TrainPrediction or TestPrediction
    matrix_type = matrix_store.matrix_type

    if not self.replace:
        logger.info(
            f"Replace flag not set, looking for old predictions for model id {model_id} "
            f"on {matrix_store.matrix_type.string_name} matrix {matrix_store.uuid}"
        )
        try:
            session = self.sessionmaker()
            existing_predictions = self._existing_predictions(
                matrix_type.prediction_obj, session, model_id, matrix_store
            )
            logger.spam(
                f"Existing predictions length: {existing_predictions.count()}, "
                f"Length of matrix: {len(matrix_store.index)}"
            )
            if existing_predictions.count() == len(matrix_store.index):
                logger.info(
                    f"Found old predictions for model id {model_id}, matrix {matrix_store.uuid}, returning saved versions"
                )
                return self._load_saved_predictions(existing_predictions, matrix_store)
        finally:
            session.close()

    model = self.load_model(model_id)
    if not model:
        raise ValueError(f"Model id {model_id} not found")
    logger.spam(f"Loaded model {model_id}")

    # Labels are popped from matrix (i.e. they are removed and returned)
    labels = matrix_store.labels

    # Using a threading backend because the default loky backend doesn't
    # allow for nested parallelization (e.g., multiprocessing at triage level)
    with parallel_backend('threading'):
        predictions = model.predict_proba(
            matrix_store.matrix_with_sorted_columns(train_matrix_columns)
        )[:, 1]  # Returning only the scores for the label == 1

    logger.debug(
        f"Generated predictions for model {model_id} on {matrix_store.matrix_type.string_name} matrix {matrix_store.uuid}"
    )
    if self.save_predictions:
        df = pd.DataFrame(data=None, columns=None, index=matrix_store.index)
        df['label_value'] = matrix_store.labels
        df['score'] = predictions

        logger.spam(f"Sorting predictions for model {model_id} using {self.rank_order}")
        if self.rank_order == 'best':
            df.sort_values(by=["score", "label_value"], inplace=True,
                           ascending=[False, False], na_position='last')
        elif self.rank_order == 'worst':
            df.sort_values(by=["score", "label_value"], inplace=True,
                           ascending=[False, True], na_position='first')
        elif self.rank_order == 'random':
            df['random'] = np.random.rand(len(df))
            df.sort_values(by=['score', 'random'], inplace=True,
                           ascending=[False, False])
            # Remove the helper column used only for tie-breaking
            df.drop('random', axis=1, inplace=True)
        else:
            raise ValueError(
                f"Rank order specified in configuration file not recognized: {self.rank_order}"
            )

        df['rank_abs_no_ties'] = df['score'].rank(ascending=False, method='first')
        # Uses the lowest rank in the group
        df['rank_abs_with_ties'] = df['score'].rank(ascending=False, method='min')
        # No gaps between groups (so it reaches 1.0). We are using rank_abs_no_ties
        # so we can respect that order (instead of using the mathematical formula,
        # as was done before)
        df['rank_pct_no_ties'] = df['rank_abs_no_ties'].rank(
            ascending=True, method='dense', pct=True)
        df['rank_pct_with_ties'] = df['score'].rank(
            ascending=False, method='dense', pct=True)

        df.reset_index(inplace=True)

        logger.debug(
            f"Predictions on {matrix_store.matrix_type.string_name} matrix {matrix_store.uuid} from model {model_id} sorted using {self.rank_order}"
        )
        logger.spam(
            f"Writing predictions for model {model_id} on {matrix_store.matrix_type.string_name} matrix {matrix_store.uuid} to database"
        )
        self._write_predictions_to_db(
            model_id,
            matrix_store,
            df,
            misc_db_parameters,
            matrix_type.prediction_obj,
        )
        logger.debug(
            f"Wrote predictions for model {model_id} on {matrix_store.matrix_type.string_name} matrix {matrix_store.uuid} to database"
        )
    else:
        logger.notice(
            f"Predictions for model {model_id} on {matrix_store.matrix_type.string_name} matrix {matrix_store.uuid} weren't written to the db because you asked not to do so"
        )
        logger.spam(f"Status of the save_predictions flag: {self.save_predictions}")

    self._write_metadata_to_db(
        model_id=model_id,
        matrix_uuid=matrix_store.uuid,
        matrix_type=matrix_type,
        random_seed=None,
    )
    return predictions
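For reference, a small self-contained illustration (toy scores, not part of the original module) of how the rank variants computed above differ, using the same pandas rank() arguments:

import pandas as pd

scores = pd.Series([0.9, 0.9, 0.7, 0.5])
print(scores.rank(ascending=False, method='first'))            # no ties: 1, 2, 3, 4
print(scores.rank(ascending=False, method='min'))              # with ties: 1, 1, 3, 4
print(scores.rank(ascending=False, method='dense', pct=True))  # dense pct: ~0.33, 0.33, 0.67, 1.0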