def test_split_pandas_data(pandas_dummy_timestamp):
    """Check the sizes of the frames returned by split_pandas_data_with_ratios.

    An even two-way split must give 5 rows per part, and a three-way split
    must give each part the rounded share of the fixture's row count.
    """
    even_parts = split_pandas_data_with_ratios(
        pandas_dummy_timestamp, ratios=[0.5, 0.5]
    )
    assert len(even_parts[0]) == 5
    assert len(even_parts[1]) == 5

    shares = (0.12, 0.36, 0.52)
    uneven_parts = split_pandas_data_with_ratios(
        pandas_dummy_timestamp, ratios=list(shares)
    )
    n_rows = pandas_dummy_timestamp.shape[0]
    # Index explicitly so a missing part fails loudly instead of being skipped.
    for idx, share in enumerate(shares):
        assert len(uneven_parts[idx]) == round(n_rows * share)
def python_random_split(data, ratio=0.75, seed=42):
    """Pandas random splitter.

    Randomly partitions the input data into two or more pieces.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Split ratio(s). A single float produces two
            pieces, the float being the share of the training set. A list of
            floats produces one piece per entry. The value is first passed
            through ``process_split_ratio``; per the original documentation,
            list ratios that do not sum to 1 are normalized.
        seed (int): Seed used for the random shuffling.

    Returns:
        list: Splits of the input data as pd.DataFrame.
    """
    is_multi, ratio = process_split_ratio(ratio)

    # Two-way case: hand the work over to sk_split directly.
    if not is_multi:
        return sk_split(data, test_size=None, train_size=ratio, random_state=seed)

    shuffled_parts = split_pandas_data_with_ratios(
        data, ratio, shuffle=True, seed=seed
    )
    # The helper column is bookkeeping only; strip it from what callers see.
    return [part.drop("split_index", axis=1) for part in shuffled_parts]
def python_random_split(data, ratio=0.75, seed=42):
    """Split a pandas DataFrame at random.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Either one float (two-way split, the float is
            the training share) or a list of floats (one split per entry).
            The argument is interpreted by ``process_split_ratio``; per the
            original documentation, list ratios that do not sum to 1 are
            normalized.
        seed (int): Seed for the random number generator.

    Returns:
        list: Splits of the input data as pd.DataFrame.
    """
    several_parts, ratio = process_split_ratio(ratio)

    if several_parts:
        raw_parts = split_pandas_data_with_ratios(
            data, ratio, shuffle=True, seed=seed
        )
        cleaned = []
        for raw in raw_parts:
            # Remove the bookkeeping column before returning the split.
            cleaned.append(raw.drop("split_index", axis=1))
        result = cleaned
    else:
        result = sk_split(data, test_size=None, train_size=ratio, random_state=seed)

    return result
def test_split_pandas_data(pandas_dummy_timestamp):
    """Test split pandas data
    """
    ratings = pandas_dummy_timestamp

    two_way = split_pandas_data_with_ratios(ratings, ratios=[0.5, 0.5])
    first_half, second_half = two_way[0], two_way[1]
    assert len(first_half) == 5
    assert len(second_half) == 5

    three_way = split_pandas_data_with_ratios(ratings, ratios=[0.12, 0.36, 0.52])
    total_rows = ratings.shape[0]
    # Each part must hold the rounded share of rows matching its ratio.
    for position, fraction in enumerate((0.12, 0.36, 0.52)):
        assert len(three_way[position]) == round(total_rows * fraction)
def test_split_pandas_data(pandas_dummy_timestamp):
    """Check split sizes and ratio validation of split_pandas_data_with_ratios."""
    frame = pandas_dummy_timestamp

    even_parts = split_pandas_data_with_ratios(frame, ratios=[0.5, 0.5])
    assert len(even_parts[0]) == 5
    assert len(even_parts[1]) == 5

    shares = (0.12, 0.36, 0.52)
    uneven_parts = split_pandas_data_with_ratios(frame, ratios=list(shares))
    n_rows = frame.shape[0]
    for idx, share in enumerate(shares):
        assert len(uneven_parts[idx]) == round(n_rows * share)

    # Ratios adding up to 1.2 are expected to be rejected with a ValueError.
    with pytest.raises(ValueError):
        split_pandas_data_with_ratios(frame, ratios=[0.6, 0.2, 0.4])
def _do_stratification(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    is_random=True,
    seed=42,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Split data group by group, either randomly or chronologically.

    The data is grouped by user or by item (see ``filter_by``), every group is
    split with ``split_pandas_data_with_ratios`` and the pieces that share the
    same ``split_index`` are gathered into one output frame.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Split ratio(s), interpreted by
            ``process_split_ratio``. A single float ``r`` is expanded to
            ``[r, 1 - r]``.
        min_rating (int): Minimum number of ratings per user or item. Values
            above 1 trigger ``min_rating_filter_pandas`` before splitting.
        filter_by (str): Either "user" or "item"; selects both the grouping
            column and the filtering target.
        is_random (bool): If truthy, each group is shuffled before splitting.
            Otherwise rows are ordered by ``col_timestamp`` and split in
            that order.
        seed (int): Seed forwarded to ``split_pandas_data_with_ratios``.
        col_user (str): Column name of user IDs.
        col_item (str): Column name of item IDs.
        col_timestamp (str): Column name of timestamps; only required when
            ``is_random`` is falsy.

    Returns:
        list: One pd.DataFrame per ratio, without the ``split_index`` column.

    Raises:
        ValueError: If ``filter_by`` or ``min_rating`` is invalid, or a
            required column is missing from ``data``.
    """
    # A few preliminary checks.
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError(
            "min_rating should be integer and larger than or equal to 1.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    # Decide the mode once so the schema check and the sort cannot disagree.
    chronological = not is_random

    if chronological and col_timestamp not in data.columns:
        raise ValueError("Schema of data not valid. Missing Timestamp Col")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    # For chronological splitting the rows are ordered by timestamp first, so
    # that each group is cut along its own timeline; random splitting leaves
    # the ordering to the shuffle inside split_pandas_data_with_ratios.
    if chronological:
        data = data.sort_values(col_timestamp)
    df_grouped = data.groupby(split_by_column)

    # Split by each group and aggregate splits together.
    splits = []
    for _, group in df_grouped:
        # Iteration already yields the group's frame; no second lookup needed.
        group_splits = split_pandas_data_with_ratios(
            group, ratio, shuffle=is_random, seed=seed)

        # Concatenate the list of split dataframes.
        splits.append(pd.concat(group_splits))

    # Concatenate splits for all the groups together.
    splits_all = pd.concat(splits)

    # Take split by split_index
    splits_list = [
        splits_all[splits_all["split_index"] == x].drop("split_index", axis=1)
        for x in range(len(ratio))
    ]

    return splits_list
def python_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Pandas chronological splitter

    This function splits data in a chronological manner. That is, for each
    user / item, the split function takes proportions of ratings which is
    specified by the split ratio(s). The split is stratified.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single
            float number it splits data into two halves and the ratio argument
            indicates the ratio of training data set; if it is a list of float
            numbers, the splitter splits data into several portions
            corresponding to the split ratios. The value is interpreted by
            ``process_split_ratio``; per the original documentation, list
            ratios that do not sum to 1 are normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the
            two is to filter with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as pd.DataFrame.

    Raises:
        ValueError: If ``filter_by`` is not "user" or "item", or if
            ``min_rating`` is smaller than 1.
    """
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError(
            "min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    # Sort data by timestamp. The frame is sorted again (ascending, by
    # timestamp only) right before grouping below.
    data = data.sort_values(
        by=[split_by_column, col_timestamp], axis=0, ascending=False)

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    num_of_splits = len(ratio)

    # Gather the per-group pieces first and concatenate once per split;
    # growing a frame with pd.concat inside the loop re-copies every row
    # gathered so far on each iteration.
    pieces = [[] for _ in range(num_of_splits)]

    df_grouped = data.sort_values(col_timestamp).groupby(split_by_column)
    for _, group in df_grouped:
        # NOTE(review): other call sites in this file pass ``shuffle=`` to
        # this helper - confirm that ``resample`` is still an accepted keyword.
        group_splits = split_pandas_data_with_ratios(
            group, ratio, resample=False)
        for x in range(num_of_splits):
            pieces[x].append(group_splits[x])

    # With no groups at all, fall back to empty frames as before.
    splits = [
        pd.concat(chunk) if chunk else pd.DataFrame({}) for chunk in pieces
    ]

    return splits
def _do_stratification(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    is_random=True,
    seed=42,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Stratified split of a DataFrame by user or item.

    Rows are grouped by the user or item column (chosen through
    ``filter_by``). Each group is divided by ``split_pandas_data_with_ratios``
    and the parts carrying the same ``split_index`` are collected into a
    single output frame.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Split ratio(s) as understood by
            ``process_split_ratio``; a single float ``r`` becomes
            ``[r, 1 - r]``.
        min_rating (int): Minimum number of ratings per user or item. When
            greater than 1 the data goes through ``min_rating_filter_pandas``
            first.
        filter_by (str): Either "user" or "item".
        is_random (bool): Truthy for a shuffled split of every group; falsy
            for a split along ascending ``col_timestamp``.
        seed (int): Seed passed on to ``split_pandas_data_with_ratios``.
        col_user (str): Column name of user IDs.
        col_item (str): Column name of item IDs.
        col_timestamp (str): Column name of timestamps; checked only when
            ``is_random`` is falsy.

    Returns:
        list: One pd.DataFrame per ratio, with ``split_index`` removed.

    Raises:
        ValueError: On an invalid ``filter_by`` / ``min_rating`` or when a
            required column is absent from ``data``.
    """
    # A few preliminary checks.
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    # Use one flag for both the schema check and the sort so that falsy
    # non-bool values (0, None) are handled the same way in both places.
    chronological = not is_random

    if chronological and col_timestamp not in data.columns:
        raise ValueError("Schema of data not valid. Missing Timestamp Col")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    # Chronological splitting sorts by timestamp before grouping; random
    # splitting groups the data as-is and relies on the per-group shuffle.
    ordered = data.sort_values(col_timestamp) if chronological else data
    df_grouped = ordered.groupby(split_by_column)

    # Split by each group and aggregate splits together.
    splits = []
    for _, group in df_grouped:
        # The iterator already provides the group frame, so get_group() is
        # not called a second time.
        group_splits = split_pandas_data_with_ratios(
            group, ratio, shuffle=is_random, seed=seed
        )

        # Concatenate the list of split dataframes.
        splits.append(pd.concat(group_splits))

    # Concatenate splits for all the groups together.
    splits_all = pd.concat(splits)

    # Take split by split_index
    splits_list = [
        splits_all[splits_all["split_index"] == x].drop("split_index", axis=1)
        for x in range(len(ratio))
    ]

    return splits_list
def python_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Pandas chronological splitter

    This function splits data in a chronological manner. That is, for each
    user / item, the split function takes proportions of ratings which is
    specified by the split ratio(s). The split is stratified.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single
            float number it splits data into two halves and the ratio argument
            indicates the ratio of training data set; if it is a list of float
            numbers, the splitter splits data into several portions
            corresponding to the split ratios. The value is interpreted by
            ``process_split_ratio``; per the original documentation, list
            ratios that do not sum to 1 are normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the
            two is to filter with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as pd.DataFrame.

    Raises:
        ValueError: If ``filter_by`` is not "user" or "item", if
            ``min_rating`` is smaller than 1, or if the user, item or
            timestamp column is missing from ``data``.
    """
    # A few preliminary checks.
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError(
            "min_rating should be integer and larger than or equal to 1.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    if col_timestamp not in data.columns:
        raise ValueError("Schema of data not valid. Missing Timestamp Col")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    # Split by each group and aggregate splits together.
    splits = []

    # Sorting by timestamp before grouping puts every group's rows in
    # chronological order, so the unshuffled split follows the timeline.
    df_grouped = data.sort_values(col_timestamp).groupby(split_by_column)

    for _, group in df_grouped:
        # Use the frame yielded by the iterator rather than looking the
        # group up again with get_group().
        group_splits = split_pandas_data_with_ratios(
            group, ratio, shuffle=False)

        # Concatenate the list of split dataframes.
        splits.append(pd.concat(group_splits))

    # Concatenate splits for all the groups together.
    splits_all = pd.concat(splits)

    # Take split by split_index
    splits_list = [
        splits_all[splits_all["split_index"] == x].drop("split_index", axis=1)
        for x in range(len(ratio))
    ]

    return splits_list