Example #1
def preprocess_interactions_data_frame(
    data_frame: pd.DataFrame, project_config: ProjectConfig
):
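    """Cast the user and item id columns to str and parse any stringified
    array/list columns back into Python objects."""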
    if len(data_frame) == 0:
        return data_frame

    data_frame[project_config.user_column.name] = data_frame[
        project_config.user_column.name
    ].astype(str)
    data_frame[project_config.item_column.name] = data_frame[
        project_config.item_column.name
    ].astype(str)

    literal_eval_array_columns(
        data_frame,
        [
            project_config.user_column,
            project_config.item_column,
            project_config.output_column,
        ]
        + list(project_config.other_input_columns)
        + list(project_config.auxiliar_output_columns),
    )
    # The available-arms column may be stored as stringified lists; parse it
    # back into Python objects before use.
    if project_config.available_arms_column_name and isinstance(
        data_frame.iloc[0][project_config.available_arms_column_name], str
    ):
        data_frame[project_config.available_arms_column_name] = parallel_literal_eval(
            data_frame[project_config.available_arms_column_name]
        )
    
    return data_frame
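
A minimal usage sketch for this function, assuming a ProjectConfig whose user, item, and output Column definitions match the CSV being loaded (the file name and variable names below are illustrative):

interactions_df = pd.read_csv("interactions.csv")
interactions_df = preprocess_interactions_data_frame(interactions_df, project_config)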
Example #2
def literal_eval_array_columns(data_frame: pd.DataFrame, columns: List[Column]):
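    """Parse every array-typed column (FLOAT_ARRAY, INT_ARRAY or INDEXABLE_ARRAY)
    present in the data frame from its string representation into Python lists."""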
    for column in columns:
        if (
            column.type
            in (IOType.FLOAT_ARRAY, IOType.INT_ARRAY, IOType.INDEXABLE_ARRAY)
            and column.name in data_frame
        ):
            data_frame[column.name] = parallel_literal_eval(data_frame[column.name])
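
Both examples above delegate the actual parsing to a parallel_literal_eval helper that is not shown here. A rough, hypothetical sketch of such a helper, assuming it simply maps ast.literal_eval over the values with a process pool (the pool size is arbitrary, not taken from the project):

from ast import literal_eval
from multiprocessing import Pool

import pandas as pd


def parallel_literal_eval(series: pd.Series, num_processes: int = 4) -> list:
    # Turn stringified values such as "[1, 2, 3]" back into Python objects.
    with Pool(num_processes) as pool:
        return pool.map(literal_eval, series)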
Example #3
    def run(self):
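        """Load the test-set predictions, build per-interaction relevance lists,
        compute rank, fairness and off-policy metrics, and write the results
        under self.output().path."""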
        os.makedirs(self.output().path)

        df: pd.DataFrame = pd.read_csv(
            get_test_set_predictions_path(self.model_training.output().path),
            dtype={self.model_training.project_config.item_column.name: "str"},
        )

        df["sorted_actions"] = parallel_literal_eval(df["sorted_actions"])
        df["prob_actions"] = parallel_literal_eval(df["prob_actions"])
        df["action_scores"] = parallel_literal_eval(df["action_scores"])

        df["action"] = df["sorted_actions"].apply(
            lambda sorted_actions: str(sorted_actions[0]))

        with Pool(self.num_processes) as p:
            print("Creating the relevance lists...")
            df["relevance_list"] = list(
                tqdm(
                    p.starmap(
                        _create_relevance_list,
                        zip(
                            df["sorted_actions"],
                            df[self.model_training.project_config.item_column.
                               name],
                            df[self.model_training.project_config.
                               output_column.name],
                        ),
                    ),
                    total=len(df),
                ))

        if self.model_training.metadata_data_frame is not None:
            df = pd.merge(
                df,
                pd.read_csv(
                    self.model_training.metadata_data_frame_path,
                    dtype={self.model_training.project_config.item_column.name: "str"},
                ),
                left_on="action",
                right_on=self.model_training.project_config.item_column.name,
                suffixes=("", "_action"),
            )

        # Keep only interactions with a non-zero reward as the ground truth.
        ground_truth_df = df[
            df[self.model_training.project_config.output_column.name] != 0
        ]

        print("Rank Metrics...")
        df_rank, dict_rank = self.rank_metrics(ground_truth_df)
        gc.collect()

        print("Fairness Metrics")
        df_fairness, df_fairness_metrics = self.fairness_metrics(
            ground_truth_df)
        gc.collect()

        print("Offpolice Metrics")
        df_offpolicy, dict_offpolice = self.offpolice_metrics(df)
        gc.collect()

        # Save Logs
        metrics = {**dict_rank, **dict_offpolice}
        pprint.pprint(metrics)
        with open(os.path.join(self.output().path, "metrics.json"), "w") as metrics_file:
            json.dump(metrics, metrics_file, cls=JsonEncoder, indent=4)

        df_offpolicy.to_csv(
            os.path.join(self.output().path, "df_offpolicy.csv"), index=False
        )
        df_rank.to_csv(
            os.path.join(self.output().path, "rank_metrics.csv"), index=False
        )
        df_fairness_metrics.to_csv(
            os.path.join(self.output().path, "fairness_metrics.csv"), index=False
        )
        df_fairness.to_csv(
            os.path.join(self.output().path, "fairness_df.csv"), index=False
        )