def get_all_model_predictions(model_classes: List[ModelBase]):
    """Collect every model's saved eval/current predictions into two frames.

    For each model class, read its per-model prediction CSVs, rename the
    model's prediction column to its experiment name (so merged columns
    stay unique), and inner-merge the frames on the shared player/round
    identifier columns.

    Args:
        model_classes: model classes to instantiate and collect from.

    Returns:
        Tuple of (df_eval, df_current) merged prediction DataFrames.
    """
    shared_keys = ["player_id", "player_name", "opponent", "round",
                   "position_id"]
    eval_keys = shared_keys + ["adjusted_points"]
    current_keys = shared_keys + ["cost"]

    df_eval = pd.DataFrame()
    df_current = pd.DataFrame()
    for model_class in model_classes:
        model = model_class()

        def _load_predictions(suffix):
            # Read one prediction file and expose its prediction column
            # under the experiment name.
            path = data_utilities.get_processed_data_filepath(
                f"{model.experiment_name}_{suffix}_predictions.csv")
            frame = pd.read_csv(path)
            return frame.rename(
                {model.pred_column: model.experiment_name}, axis="columns")

        df_eval_tmp = _load_predictions("eval")
        df_current_tmp = _load_predictions("current")

        if df_eval.shape[0] == 0:
            # First model: nothing to merge with yet.
            df_eval = df_eval_tmp.copy()
            df_current = df_current_tmp.copy()
        else:
            df_eval = pd.merge(df_eval, df_eval_tmp, on=eval_keys)
            df_current = pd.merge(df_current, df_current_tmp,
                                  on=current_keys)
    return (df_eval, df_current)
# Exemple #2
# 0
def fix_player_home(df, player_team_dict):
    """Resolve rows flagged with ``home == 2`` (player changed teams).

    Rows with ``home == 2`` are ambiguous transfer rows. Players with only
    a single ambiguous row are dumped to ``historical_player_anomalies.csv``
    for manual review; players with two or more ambiguous rows have their
    team assignment repaired round by round via ``_fill_correct_team``.

    Args:
        df: player-round DataFrame with at least ``player_id`` and ``home``
            columns (``home == 2`` marks ambiguous rows).
        player_team_dict: mapping of player_id -> the player's new team.

    Returns:
        DataFrame with ambiguous rows either fixed or removed.
    """
    df_final = df.loc[df["home"] != 2].copy()
    df_player_transfered = df.loc[df["home"] == 2]
    df_player_counts = (df_player_transfered.groupby(
        ["player_id"]).size().reset_index(name="count"))
    df_player_transfered = pd.merge(df_player_transfered,
                                    df_player_counts,
                                    how="left",
                                    on=["player_id"])
    # Single-occurrence anomalies cannot be repaired automatically; save
    # them for manual inspection.
    df_temp = df_player_transfered.loc[df_player_transfered["count"] == 1]
    df_temp.to_csv(
        data_utilities.get_processed_data_filepath(
            "historical_player_anomalies.csv"),
        sep=",",
        index=False,
    )
    df_player_trans_final = pd.DataFrame()
    df_player_transfered = df_player_transfered.loc[
        df_player_transfered["count"] >= 2]
    for player_id in df_player_transfered["player_id"].unique():
        df_subset = df.loc[(df["player_id"] == player_id)].copy()
        old_team = _get_old_team(df_subset)
        new_team = player_team_dict.get(player_id)
        if old_team != "unfound":
            # Renamed from `round`, which shadowed the builtin.
            switch_round = _get_round_switch(df_subset, old_team, new_team)
            logger.critical(
                f"{player_id} switched from {old_team} to {new_team} in round {switch_round}"
            )
            df_subset = df_subset.apply(_fill_correct_team,
                                        args=(old_team, switch_round),
                                        axis=1)
            df_player_trans_final = pd.concat(
                [df_player_trans_final, df_subset], axis=0, sort=True)
            # Build the mask from df_final itself (not from df) so the
            # boolean indexer is always index-aligned with the frame
            # being filtered.
            df_final = df_final.loc[df_final["player_id"] != player_id]
    if df_player_trans_final.shape[0] > 5:
        df_player_trans_final.to_csv(
            data_utilities.get_processed_data_filepath(
                "historical_player_anomalies_fixed.csv"),
            sep=",",
            index=False,
        )
    df_final = pd.concat([df_final, df_player_trans_final])
    return df_final
def save_plot(fig, file_name, length_inches, height_inches):
    """Resize *fig* and save it under the processed-data directory.

    The figure is written as a tight, transparent, 400-dpi image with no
    padding.
    """
    output_path = data_utilities.get_processed_data_filepath(f"{file_name}")
    fig.set_size_inches(length_inches, height_inches)
    save_options = dict(
        bbox_inches="tight",
        pad_inches=0,
        edgecolor="none",
        transparent=True,
        dpi=400,
    )
    fig.savefig(output_path, **save_options)
# Exemple #4
# 0
 def save_model(self, model):
     """Log the fitted model to MLflow and dump its coefficients to CSV.

     Coefficients are written next to their feature names in
     ``<experiment_name>_coef.csv`` so the model is inspectable outside
     of MLflow.
     """
     mlflow.sklearn.log_model(model, "model")
     coef_path = data_utilities.get_processed_data_filepath(
         f"{self.experiment_name}_coef.csv")
     coef_frame = pd.DataFrame({
         "coefficients": model.coef_,
         "names": self.features,
     })
     coef_frame.to_csv(coef_path, index=False)
    def evaluate_model(self, df_train, df_test, df_valid):
        """Walk-forward evaluation over the test rounds.

        For each unique ``unique_round`` value in the test set (in sorted
        order): train a model on everything seen so far, predict that
        round, then fold the round back into the training data for the
        next iteration. All per-round predictions are concatenated and
        written to ``<experiment_name>_eval_predictions.csv``.

        Args:
            df_train: initial training rows.
            df_test: rows to evaluate, split by ``unique_round``.
            df_valid: validation rows passed through to ``train_model``.

        Returns:
            DataFrame of predictions for every evaluated round, joined
            with the player identifier columns.
        """
        df_predictions = pd.DataFrame()
        df_train_fold = df_train.copy()
        for test_block in tqdm(sorted(df_test["unique_round"].unique())):
            df_test_fold = df_test.loc[df_test["unique_round"] ==
                                       test_block].copy()

            # prepare data
            df_dict = {
                "train": df_train_fold,
                "test": df_test_fold,
                "valid": df_valid
            }
            input_x_dicts = self.prepare_x_input_dicts(df_dict)
            input_y_dicts = self.prepare_y_input_dicts(df_dict)

            model = self.train_model(
                input_x_dicts["train"],
                input_y_dicts["train"],
                input_x_dicts["valid"],
                input_y_dicts["valid"],
            )

            # Attach identifier columns to the fold's predictions so
            # downstream merges can key on them.
            df_predictions_fold = self.predict(model, input_x_dicts["test"])
            df_predictions_fold = pd.concat(
                [
                    df_test_fold[[
                        "player_id",
                        "player_name",
                        "opponent",
                        "round",
                        "position_id",
                        "adjusted_points",
                    ]].reset_index(drop=True),
                    df_predictions_fold.reset_index(drop=True),
                ],
                axis=1,
            )
            df_predictions = pd.concat([df_predictions, df_predictions_fold],
                                       axis=0)
            # Grow the training window: the just-evaluated round becomes
            # training data for subsequent rounds.
            df_train_fold = pd.concat([df_train_fold, df_test_fold],
                                      axis=0,
                                      sort=True)

        df_predictions.to_csv(
            data_utilities.get_processed_data_filepath(
                f"{self.experiment_name}_eval_predictions.csv"),
            index=False,
        )
        return df_predictions
 def get_experiment_info(self, df_test):
     """Assemble params, metrics, and artifact paths for experiment logging.

     Metrics are computed over the whole test set (top-15 cut) and again
     per ``position_id`` with position-specific player counts. Artifact
     paths point at the eval and current prediction CSVs.

     Returns:
         Tuple of (params, metrics, artifacts).
     """
     params = self.params
     metrics = self.calculate_metrics(df_test, 15, ".all")
     # Per-position player counts used as the top-k metric cut
     # (presumably squad slots per position — confirm with calculate_metrics).
     position_counts = {1: 3, 2: 6, 3: 6, 4: 4}
     for position_id, count_players in position_counts.items():
         df_position = df_test.loc[df_test["position_id"] == position_id]
         position_metrics = self.calculate_metrics(
             df_position, count_players, f".position.{position_id}")
         metrics.update(position_metrics)
     # Prediction CSVs are attached as run artifacts.
     artifacts = [
         data_utilities.get_processed_data_filepath(
             f"{self.experiment_name}_eval_predictions.csv"),
         data_utilities.get_processed_data_filepath(
             f"{self.experiment_name}_current_predictions.csv"),
     ]
     return (params, metrics, artifacts)
 def load_training_data(self):
     """Load the experiment's parquet dataset and split it by usage.

     Re-runs the SQL extraction when ``self.rerun_sql`` is set; otherwise
     reads the cached parquet file. Side effects: sets ``self.features``
     (every column that is neither unused nor the target) and clips the
     target column into [self.lower, self.upper].

     Returns:
         Tuple of (df_train, df_valid, df_test, df_new), split on the
         ``dataset`` column values training/validation/testing/live.
     """
     data_filepath = data_utilities.get_processed_data_filepath(
         f"{self.experiment_name}_data.parquet")
     if self.rerun_sql:
         df = self.save_training_data_to_file(
             conn=data_utilities.initialize_db(),
             data_filepath=data_filepath)
     else:
         df = pd.read_parquet(data_filepath)

     self.features = [
         col for col in df.columns
         if col not in self.unused_cols and col != self.target
     ]
     df[self.target] = df[self.target].clip(upper=self.upper,
                                            lower=self.lower)

     df_train, df_valid, df_test, df_new = (
         df.loc[df["dataset"] == split_name].copy()
         for split_name in ("training", "validation", "testing", "live"))
     return (df_train, df_valid, df_test, df_new)
    def generate_current_predictions(self, df_train, df_test, df_valid,
                                     df_new):
        """Fit a final model on train+test and predict the live rows.

        The test split is folded back into training, one model is trained
        (the validation split is passed through to ``train_model``), and
        predictions for the ``new`` (live) rows are written to
        ``<experiment_name>_current_predictions.csv``.

        Returns:
            The fitted model.
        """
        # All historical rows (train + test) become training material.
        df_train = pd.concat([df_train, df_test], axis=0)
        df_dict = {"train": df_train, "valid": df_valid, "new": df_new}
        input_x_dicts = self.prepare_x_input_dicts(df_dict)
        # Live rows carry no target, so exclude them when building y.
        df_dict.pop("new")
        input_y_dicts = self.prepare_y_input_dicts(df_dict)

        model = self.train_model(
            input_x_dicts["train"],
            input_y_dicts["train"],
            input_x_dicts["valid"],
            input_y_dicts["valid"],
        )

        # Identifier columns re-attached to the raw predictions so the
        # output CSV is self-describing.
        id_columns = [
            "player_id",
            "player_name",
            "opponent",
            "round",
            "position_id",
            "cost",
        ]
        df_predictions = self.predict(model, input_x_dicts["new"])
        df_predictions = pd.concat(
            [
                df_new[id_columns].reset_index(drop=True),
                df_predictions.reset_index(drop=True),
            ],
            axis=1,
        )
        df_predictions.to_csv(
            data_utilities.get_processed_data_filepath(
                f"{self.experiment_name}_current_predictions.csv"),
            index=False,
        )
        return model
        experiment_name).experiment_id

    df_experiment_runs = mlflow.search_runs(experiment_ids=[experiment_id])
    df_run = df_experiment_runs.sort_values("start_time",
                                            ascending=False).head(1)
    df_run["experiment_name"] = experiment_name
    return df_run


if __name__ == "__main__":
    # Every model participating in this run.
    model_classes = [
        CNNOrdinalModel,
        GBMRankingModel,
        RNNOrdinalModel,
        RobustSimpleLinearModel,
        SimpleLinearModel,
    ]
    run_models(model_classes)

    # Combine per-model prediction files into two merged CSVs.
    df_eval, df_current = get_all_model_predictions(model_classes)
    for frame, file_name in ((df_eval, "eval_predictions.csv"),
                             (df_current, "current_predictions.csv")):
        frame.to_csv(
            data_utilities.get_processed_data_filepath(file_name),
            index=False)

    # Record run metadata for every experiment.
    experiment_names = get_all_experiment_names(model_classes)
    df_runs = get_all_model_run_info(experiment_names)
    df_runs.to_csv(
        data_utilities.get_processed_data_filepath("model_runs.csv"))