Beispiel #1
0
    def make_temporal_basket_split(
        self,
        data=None,
        test_rate=0.1,
        n_negative=100,
        by_user=False,
        n_test=10,
    ):
        """Split interactions with the ``temporal_basket`` strategy.

        The input is validated and the actual splitting is delegated to
        ``split_data`` with ``split_type="temporal_basket"``;
        ``self.processed_path`` is handed over as ``save_dir``.

        Args:
            data (DataFrame): Interactions to split. When left as ``None``,
                the raw interactions are loaded via ``self.load_interaction()``
                and filtered with
                ```
                data = filter_user_item_order(data, min_u_c=10, min_o_c=10, min_i_c=10)
                ```
                Data passed in explicitly is used as given, so callers can
                apply their own filtering first (e.g. with the filter
                methods in data_split.py).
            test_rate: Share of the data used for testing. The validation
                share is the same as the testing share.
            n_negative: Number of negative samples for the testing and
                validation data.
            by_user (bool): ``True`` for a user-based split, ``False``
                (default) for a global split.
            n_test (int): Number of testing and validation copies.

        Returns:
            Whatever ``split_data`` returns, documented as:
            train_data (DataFrame): Interactions for training.
            valid_data (list of DataFrame): Interactions for validation.
            test_data (list of DataFrame): Interactions for testing.

        Raises:
            RuntimeError: If ``data`` is not a DataFrame, or if it lacks the
                ``DEFAULT_TIMESTAMP_COL`` or ``DEFAULT_ORDER_COL`` column.
        """
        if data is None:
            # No data supplied: fall back to the raw interactions with the
            # default filter thresholds.
            raw_interactions = self.load_interaction()
            data = filter_user_item_order(
                raw_interactions, min_u_c=10, min_o_c=10, min_i_c=10
            )

        if not isinstance(data, pd.DataFrame):
            raise RuntimeError("data is not a type of DataFrame")

        # A basket split over time needs both a timestamp and an order column;
        # the timestamp column is checked first.
        required_columns = (
            (DEFAULT_TIMESTAMP_COL, "This dataset doesn't have an TIMESTAMP_COL"),
            (DEFAULT_ORDER_COL, "This dataset doesn't have an ORDER_COL"),
        )
        for column, message in required_columns:
            if column not in data.columns:
                raise RuntimeError(message)

        return split_data(
            data,
            split_type="temporal_basket",
            test_rate=test_rate,
            n_negative=n_negative,
            save_dir=self.processed_path,
            by_user=by_user,
            n_test=n_test,
        )
Beispiel #2
0
    def make_leave_one_basket(
        self,
        data=None,
        random=False,
        n_negative=100,
        n_test=10,
    ):
        """Split interactions with the ``leave_one_basket`` strategy.

        The input is validated and the actual splitting is delegated to
        ``split_data`` with ``split_type="leave_one_basket"`` and
        ``test_rate=0``; ``self.processed_path`` is handed over as
        ``save_dir``.

        Args:
            data (DataFrame): Interactions to split. When left as ``None``,
                the raw interactions are loaded via ``self.load_interaction()``
                and filtered with
                ```
                data = filter_user_item_order(data, min_u_c=0, min_o_c=3, min_i_c=0)
                ```
                Data passed in explicitly is used as given, so callers can
                apply their own filtering first (e.g. with the filter
                methods in data_split.py).
            random (bool): Whether the basket left out for testing is
                chosen randomly.
            n_negative: Number of negative samples for the testing and
                validation data.
            n_test (int): Number of testing and validation copies.

        Returns:
            Whatever ``split_data`` returns, documented as:
            train_data (DataFrame): Interactions for training.
            valid_data (list of DataFrame): Interactions for validation.
            test_data (list of DataFrame): Interactions for testing.

        Raises:
            RuntimeError: If ``data`` is not a DataFrame, or if it lacks the
                ``DEFAULT_TIMESTAMP_COL`` or ``DEFAULT_ORDER_COL`` column.
        """
        if data is None:
            # No data supplied: fall back to the raw interactions with the
            # default filter thresholds.
            raw_interactions = self.load_interaction()
            data = filter_user_item_order(
                raw_interactions, min_u_c=0, min_o_c=3, min_i_c=0
            )

        if not isinstance(data, pd.DataFrame):
            raise RuntimeError("data is not a type of DataFrame")

        # Both a timestamp and an order column are required; the timestamp
        # column is checked first.
        required_columns = (
            (DEFAULT_TIMESTAMP_COL, "This dataset doesn't have an TIMESTAMP_COL"),
            (DEFAULT_ORDER_COL, "This dataset doesn't have an ORDER_COL"),
        )
        for column, message in required_columns:
            if column not in data.columns:
                raise RuntimeError(message)

        return split_data(
            data,
            split_type="leave_one_basket",
            test_rate=0,
            random=random,
            n_negative=n_negative,
            save_dir=self.processed_path,
            n_test=n_test,
        )