def predict(X_test: pd.DataFrame, y_test, gbm: lgb.Booster):
    """Score ``X_test`` with ``gbm`` and print a classification report.

    The booster is evaluated at ``gbm.best_iteration``. Every row of its
    output is reduced to the index of its largest entry, and those indices
    are compared against ``y_test``. The report is printed; nothing is
    returned.

    Args:
        X_test: Feature rows handed to ``gbm.predict``.
        y_test: Reference labels for the same rows.
        gbm: Trained booster used for scoring.
    """
    scores = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    # The position of the largest score in each row is the predicted class.
    y_pred = [np.argmax(row) for row in scores]

    # Precision, recall and the other per-class metrics, labelled with the
    # module-level ``Categories`` names.
    report = metrics.classification_report(
        y_test, y_pred, target_names=Categories)
    print(report)
def predict(gbm: lgb.Booster, test_data: pd.DataFrame,
            full_data: pd.DataFrame, feature_names: List[str]):
    """Predict on the rows for the most recent Friday and on the test set.

    The date is derived from the current local time with
    ``relativedelta(weekday=FR(-1))``, printed, and used as a ``.loc`` key
    into ``full_data``.

    Args:
        gbm: Trained booster.
        test_data: Test rows. A ``PREDICTION_NAME`` column is written to
            this frame in place.
        full_data: Frame that is looked up by the ``YYYY-MM-DD`` date string
            (presumably date-indexed - confirm against the caller).
        feature_names: Columns fed to the model; live rows with a missing
            value in any of them are dropped before predicting.

    Returns:
        A dict with keys ``predicted_live_data`` (a copy of the selected
        rows plus predictions) and ``predicted_test_data`` (``test_data``
        itself, with predictions added).
    """
    friday = datetime.now() + relativedelta(weekday=FR(-1))
    date_string = friday.strftime('%Y-%m-%d')
    print(date_string)

    # Work on a copy so that dropping rows and adding the prediction column
    # does not touch ``full_data``.
    live_data = full_data.loc[date_string].copy()
    live_data.dropna(subset=feature_names, inplace=True)
    live_features = live_data[feature_names]
    live_data[PREDICTION_NAME] = gbm.predict(live_features)

    test_features = test_data[feature_names]
    test_data[PREDICTION_NAME] = gbm.predict(test_features)

    return {
        "predicted_live_data": live_data,
        "predicted_test_data": test_data,
    }
class LightgbmOperator(object):
    """Load a saved booster from disk and expose its ``predict`` method."""

    def __init__(self, bst_path, model_tag):
        """Initialise the operator.

        Args:
            bst_path: Path of the model file, passed to
                ``Booster(model_file=...)``; the location the model was
                saved to with ``model.save()``.
            model_tag: Stored unchanged on the instance as ``model_tag``.
        """
        self.model_tag = model_tag
        self.model = Booster(model_file=bst_path)

    def predict(self, input_datas):
        """Return the loaded model's predictions for ``input_datas``."""
        predictions = self.model.predict(input_datas)
        return predictions
def predict_single_fold(self, model: lgb.Booster, dataset: TabularDataset) -> np.ndarray:
    """Predict target values for a dataset with one fitted model.

    Args:
        model: Lightgbm booster to predict with.
        dataset: Test dataset; its ``data`` attribute is passed to the model.

    Returns:
        The model output after it has been passed through the ``bw_func``
        of the task's ``'lgb'`` loss.
    """
    backward_transform = self.task.losses['lgb'].bw_func
    raw_scores = model.predict(dataset.data)
    return backward_transform(raw_scores)
def predict(
    cv_num: int, sp: Split, model: lgb.Booster, model_number: Optional[int] = None
) -> pd.DataFrame:
    """Forecast the target day by day over a 28-day window of one CV fold.

    The window starts at ``Config().CV_START_DAYS[cv_num]``. The true target
    is preserved in a ``<TARGET>_true`` column, the target is blanked from
    the window start onwards, and each day is then filled with the model's
    prediction after the rolling features have been rebuilt for that day.

    Args:
        cv_num: Index into ``CV_START_DAYS`` selecting the fold.
        sp: Split whose ``test`` frame is copied and predicted on.
        model: Trained booster.
        model_number: Not used by this function.

    Returns:
        The copied test frame with predictions in the target column and a
        ``sales_is_zero`` int8 flag for the predicted days.
    """
    config = Config()
    target = config.TARGET
    features = config.features
    d_start: int = config.CV_START_DAYS[cv_num]
    d_end: int = d_start + 28

    test_pred = sp.test.copy()
    test_pred[target + "_true"] = test_pred[target]
    # Hide the ground truth for the whole forecast horizon.
    test_pred.loc[test_pred.d >= d_start, target] = np.nan

    for d in tqdm(range(d_start, d_end)):
        # Rolling features for day ``d`` are rebuilt on every step, so they
        # can use the predictions already written for earlier days.
        test_pred = make_rolling_for_test(test_pred, d, features)
        day_rows = test_pred.d == d
        test_pred.loc[day_rows, target] = model.predict(
            test_pred.loc[day_rows, features]
        )
        # Flag is computed from the "sales" column after the prediction has
        # been written for this day.
        test_pred.loc[day_rows, "sales_is_zero"] = (
            test_pred.loc[day_rows, "sales"] == 0
        ).astype(np.int8)

    return test_pred
def mean_match_function_kdtree_cat(
    mmc,
    model: Booster,
    bachelor_features,
    candidate_values,
    random_state,
    hashed_seeds,
    candidate_preds=None,
):
    """
    Produce imputation values for the bachelor rows from model predictions,
    optionally refined by mean matching.

    The path taken depends on ``mmc`` and on the booster's objective
    (``model.params["objective"]``):

    .. code-block:: text

        mmc == 0 (prediction used directly):
            regression objectives   : the raw prediction
            "binary"                : floor(prediction + 0.5)
            "multiclass(ova)"       : argmax over axis 1 of the predictions

        mmc != 0 (matching delegated to helpers):
            regression objectives   : _mean_match_reg on the raw predictions
            "binary"                : _mean_match_reg on
                                      logodds(predictions)
            "multiclass(ova)"       : _mean_match_multiclass_accurate on
                                      logodds(predictions)

    Parameters
    ----------
    mmc: int
        The number of mean matching candidates. 0 disables matching.

    model: lgb.Booster
        The model that was trained. Its objective must be listed in
        ``_REGRESSIVE_OBJECTIVES`` or ``_CATEGORICAL_OBJECTIVES``.

    bachelor_features: pd.DataFrame or np.ndarray
        The features of the rows to impute; passed to ``model.predict``.

    candidate_values: pd.Series or np.ndarray
        Forwarded to the matching helper; not used when mmc == 0.
        Presumably the real (not predicted) candidate values - confirm
        against the helpers.

    random_state: np.random.RandomState
        Forwarded to the matching helper.

    hashed_seeds: None, np.ndarray (int32)
        Forwarded to the matching helper.

    candidate_preds: optional
        Forwarded unchanged to the matching helper; not used when mmc == 0.
        NOTE(review): bachelor predictions are log-odds transformed for the
        binary and multiclass objectives before matching, so this is
        presumably expected on the same scale - confirm with the caller.

    Returns
    -------
    The imputation values produced by the selected path.
    """
    objective = model.params["objective"]
    known_objectives = _REGRESSIVE_OBJECTIVES + _CATEGORICAL_OBJECTIVES
    assert objective in known_objectives, (
        "lightgbm objective not recognized - please check for aliases or " +
        "define a custom mean matching function to handle this objective.")

    is_regression = objective in _REGRESSIVE_OBJECTIVES
    is_binary = objective == "binary"
    is_multiclass = objective in ["multiclass", "multiclassova"]

    # Needed on every path.
    bachelor_preds = model.predict(bachelor_features)

    # NOTE(review): an objective that passes the assertion but matches none
    # of the three cases below leaves ``imp_values`` unassigned.
    if mmc == 0:
        if is_regression:
            imp_values = bachelor_preds
        elif is_binary:
            imp_values = np.floor(bachelor_preds + 0.5)
        elif is_multiclass:
            imp_values = np.argmax(bachelor_preds, axis=1)
    elif is_regression or is_binary:
        if not is_regression:
            # Binary probabilities are matched on the log-odds scale.
            bachelor_preds = logodds(bachelor_preds)
        imp_values = _mean_match_reg(
            mmc,
            bachelor_preds,
            candidate_preds,
            candidate_values,
            random_state,
            hashed_seeds,
        )
    elif is_multiclass:
        bachelor_preds = logodds(bachelor_preds)
        imp_values = _mean_match_multiclass_accurate(
            mmc,
            bachelor_preds,
            candidate_preds,
            candidate_values,
            random_state,
            hashed_seeds,
        )

    return imp_values
def predict(
    m_xgb: xgboost.XGBClassifier,
    m_lgbm: lightgbm.Booster,
    test: pd.DataFrame,
    test_previous: pd.DataFrame,
    user_summary: "UserSummary",
    question_features: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Predict the probability that the user will answer the current question
    correctly.

    Parameters
    ----------
    m_xgb:
        An xgboost classifier. Accepted for interface compatibility; the
        prediction is currently produced by ``m_lgbm`` only.
    m_lgbm:
        The LightGBM booster used to produce ``answered_correctly``.
    test:
        The test data for which to generate predictions.
    test_previous:
        The previous group of test data observations, used to update user
        summary statistics. When ``None`` the update step is skipped.
    user_summary:
        A UserSummary object containing user features, that can be updated
        with incoming data.
    question_features:
        Question features to join on content_id.

    Returns
    -------
    A tuple of (prediction dataframe, timer dataframe). The timer dataframe
    has one row with the elapsed seconds of each pipeline stage and is
    produced to help identify bottlenecks in the prediction pipeline that
    may cause a timeout on Kaggle.
    """
    # Local import: only needed for stage timing. A monotonic clock is used
    # so the measurements are unaffected by wall-clock adjustments.
    import time

    timer = {}

    if test_previous is not None:
        start = time.perf_counter()
        newdata = process_test_observations(test, test_previous, question_features)
        timer["process_test_observations"] = time.perf_counter() - start

        start = time.perf_counter()
        user_summary.update(newdata)
        timer["update_user_summary"] = time.perf_counter() - start

    # Only rows with content_type_id == 0 are predicted on.
    test = test.loc[test["content_type_id"] == 0].drop(
        columns="content_type_id")

    start = time.perf_counter()
    test = pd.merge(
        test,
        question_features,
        how="left",
        left_on="content_id",
        right_index=True,
        copy=False,
    )
    timer["merge_question_features"] = time.perf_counter() - start

    start = time.perf_counter()
    required_columns = [
        k for k in constants.USER_SUMMARY_SCHEMA.keys() if k != "user_id"
    ]
    for col in required_columns:
        test[col] = [
            user_summary.get_feature(user_id, col) for user_id in test["user_id"]
        ]
    calculate_user_features(test, inplace=True)
    timer["merge_user_features"] = time.perf_counter() - start

    start = time.perf_counter()
    test["answered_correctly"] = m_lgbm.predict(test[constants.TRAIN_COLS])
    timer["prediction"] = time.perf_counter() - start

    return test, pd.DataFrame(timer, index=[0])
def model_evaluate(self,
                   dt: pd.DataFrame,
                   prob: float = 0.5,
                   model: lgb.Booster = None):
    """
    Evaluate model on given data frame.
    Produce probability plots, confusion matrix, accuracy, classification
    report, average precision and the precision-recall curve.

    Args:
        dt: data frame with a ``label`` column (compared against 0 and 1)
            and the model's feature columns. A ``preds`` column holding the
            model scores is written to this frame in place.
        prob: threshold; scores below it are counted as zeros, all other
            scores as ones
        model: model to evaluate; ``self.lgb_model`` is used when None
    """
    if model is None:
        model = self.lgb_model

    # Same object as the caller's frame: the "preds" column stays on it.
    dt_eval = dt
    dt_eval["preds"] = model.predict(dt_eval[model.feature_name()])

    labels = dt_eval['label']
    scores = dt_eval["preds"]
    ones_scores = dt_eval.loc[labels == 1, "preds"]
    zeros_scores = dt_eval.loc[labels == 0, "preds"]

    # Score distributions: all rows, then each class on its own.
    sns.distplot(scores, axlabel='Full distribution')
    plt.show()

    sns.distplot(ones_scores, axlabel='Ones distribution')
    plt.show()

    sns.distplot(zeros_scores, axlabel='Zeros distribution')
    plt.show()

    # Both classes drawn on the same axes as plain histograms.
    sns.distplot(ones_scores, axlabel='Ones distribution', kde=False)
    sns.distplot(zeros_scores, axlabel='Zeros distribution', kde=False)
    plt.show()

    # Hard 0/1 predictions at the requested threshold.
    preds = [0 if x < prob else 1 for x in scores]

    cm = confusion_matrix(labels.values, preds)
    df_cm = pd.DataFrame(cm)
    sns.heatmap(df_cm, annot=True)
    plt.show()

    a_score = accuracy_score(labels.values, preds, normalize=True)
    print("Accuracy score: {}\n".format(a_score))

    class_report = classification_report(labels.values,
                                         preds,
                                         target_names=["Zeros", "Ones"])
    print(class_report)

    total = sum(labels.values)
    predicted = sum(preds)
    print("Total positive labels: {}. Positive labels predicted: {}\n".
          format(total, predicted))

    # Threshold-free metrics are computed from the raw scores.
    average_precision = average_precision_score(labels, scores)
    print('Average precision-recall score: {0:0.2f}'.format(
        average_precision))

    precision, recall, _ = precision_recall_curve(labels,
                                                  scores,
                                                  pos_label=1)
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
        average_precision))
    plt.show()
def predict(booster: lgb.Booster,
            dtest: pd.DataFrame,
            dist,
            pred_type: str,
            n_samples: int = 1000,
            quantiles: list = None,
            seed: int = 123):
    '''A customized lightgbmlss prediction function.

    booster: lgb.Booster
        Trained LightGBMLSS-Model
    dtest: pd.DataFrame
        Test Data
    dist:
        Distribution object specifying the distributional assumption. It
        must provide ``param_dict()``, ``start_values``, ``pred_dist_rvs``
        and ``pred_dist_quantile``.
    pred_type: str
        Specifies what is to be predicted:
            "response" draws n_samples from the predicted response distribution.
            "quantiles" calculates the quantiles from the predicted response distribution.
            "parameters" returns the predicted distributional parameters.
            "expectiles" returns the predicted expectiles.
    n_samples: int
        If pred_type="response" specifies how many samples are drawn from the predicted response distribution.
    quantiles: list
        If pred_type="quantiles" calculates the quantiles from the predicted response distribution.
        Defaults to [0.1, 0.5, 0.9] when None.
    seed: int
        If pred_type="response" specifies the seed for drawing samples from the predicted response distribution.

    Returns a pd.DataFrame whose layout depends on pred_type.

    Raises ValueError if pred_type is not one of the four values above.
    '''
    valid_pred_types = ("parameters", "expectiles", "response", "quantiles")
    if pred_type not in valid_pred_types:
        # Fail loudly instead of silently returning None for a misspelled type.
        raise ValueError(
            f"Unknown pred_type {pred_type!r}; expected one of {valid_pred_types}.")

    if quantiles is None:
        quantiles = [0.1, 0.5, 0.9]

    dict_param = dist.param_dict()

    predt = booster.predict(dtest, raw_score=True)

    # Set init_score as starting point for each distributional parameter.
    init_score_pred = np.ones(shape=(dtest.shape[0], 1)) * dist.start_values

    # The prediction result doesn't include the init_score specified in creating the train data.
    # Hence, it needs to be added manually with the corresponding transform for each distributional parameter.
    dist_params_predts = [
        response_fun(predt[:, i] + init_score_pred[:, i])
        for i, response_fun in enumerate(dict_param.values())
    ]

    dist_params_df = pd.DataFrame(dist_params_predts).T
    dist_params_df.columns = list(dict_param)

    if pred_type in ("parameters", "expectiles"):
        return dist_params_df

    if pred_type == "response":
        pred_resp_df = dist.pred_dist_rvs(pred_params=dist_params_df,
                                          n_samples=n_samples,
                                          seed=seed)
        pred_resp_df.columns = [
            f"y_pred_sample_{i}" for i in range(pred_resp_df.shape[1])
        ]
        return pred_resp_df

    # Only "quantiles" remains after the validation above.
    pred_quant_df = dist.pred_dist_quantile(quantiles=quantiles,
                                            pred_params=dist_params_df)
    pred_quant_df.columns = [f"quant_{q}" for q in quantiles]
    return pred_quant_df