def al_pkdd(dataset_number):
    """
    Load Activation Log number `dataset_number`

    Parameters
    ----------
    dataset_number: int
        Index of the Activation Log, lower than 100

    Returns
    -------
    al: logs.ActivationLog
        The Activation Log with given index

    Notes
    -----
    The Activation Logs are saved 100 per file, 1000 rows each (to limit
    the number of files). This function handles the retrieval by slicing
    out the requested 1000-row block.
    """
    assert dataset_number < 100
    # Each log occupies a contiguous block of 1000 rows in the shared file.
    df = dataframe_operations \
        .import_df('data/public/PKDD/ActivationLogs2.csv') \
        .iloc[dataset_number * 1000: (dataset_number + 1) * 1000]
    return logs.ActivationLog(df)
def al2dsx(dataset, hierarchy, additional_rp=None): """ Wrapper to adapt the dataset and the hierarchy to the new formulations of Subsection \\ref{sec:compopt:ws2w} Parameters ---------- dataset: Counter, logs.ActivationLog, str, or Path. The value of :math:`\mathcal{D'}`. If ActivationLog, str or Path, it is (imported and) transformed into a bag hierarchy: hierarchy.Hierarchy Hierarchy object defining the relations between (super-)categories additional_rp: Counter Additional r_prime values to be added apart from the dataset. These values are not part of visit_ids_per_rp Returns ------- dataset: DSX D initiated with the adaption presented in Subsection \\ref{sec:compopt:ws2w} - :math:`r'`, :math:`\mathcal{R}'` - hierarchy class as given - invoice_ids_per_rp, reference to original iids that belong to each rp """ # import the data if isinstance(dataset, str) or isinstance(dataset, Path): dataset = logs.ActivationLog(dataset) else: assert isinstance(dataset, logs.ActivationLog) assert isinstance(hierarchy, Hierarchy) if additional_rp is None: additional_rp = Counter() else: assert isinstance(additional_rp, Counter) df = dataset.df.copy() h_vector = [ bitops.subset2int(hierarchy[k], hierarchy.c) for k in hierarchy.sc ] def row2rp(row): r = int(''.join([str(row[i]) for i in hierarchy.c]), 2) return r + (int( ''.join([ str(1 if (hamming_weight(ci & r) > 0) else 0) for ci in h_vector ]), 2) << hierarchy.num_cat) df['rp'] = df.apply(row2rp, axis=1) dataset = Counter(df['rp']) dataset += additional_rp # noinspection PyTypeChecker visit_ids_per_rp = { v: list(df[df['rp'] == v].index) for v in df['rp'].unique() } return DSX(dataset, hierarchy, visit_ids_per_rp)
def al_apkdd(dataset_number):
    """
    Load alternative Activation Log number `dataset_number`

    Parameters
    ----------
    dataset_number: int
        Index of the Activation Log, lower than 100

    Returns
    -------
    al: logs.ActivationLog
        Activation Log built from every row of the shared CSV whose
        position x satisfies (x % 1000) // 10 == dataset_number
    """
    assert dataset_number < 100
    # Vectorized form of the per-row test (x % 1000) // 10 == dataset_number;
    # equivalent to Series.apply with a lambda, but runs at C speed over the
    # 100000 positions.
    sr = (pd.Series(range(100000)) % 1000) // 10 == dataset_number
    df = dataframe_operations.import_df(
        'data/public/PKDD/ActivationLogs2.csv')[sr]
    return logs.ActivationLog(df)
def al_data(dataset_exponent, dataset_number):
    """
    Load retailer Activation Log `dataset_number` of size 10**`dataset_exponent`.

    Parameters
    ----------
    dataset_exponent: int
        Size exponent of the log; must be one of 3, 4, 5, 6, 7
    dataset_number: int
        Index of the log among those of this size. For exponents 3-5 the
        index must be below 10**(7 - dataset_exponent); for exponent 6
        below 10; for exponent 7 it must be 0.

    Returns
    -------
    al: logs.ActivationLog
        The requested Activation Log

    Raises
    ------
    ValueError
        If `dataset_exponent` is not in 3...7

    Notes
    -----
    Logs with exponents 3-5 are stored several per 500000-row file; this
    function locates the file and slices out the right block. Exponents
    6 and 7 have one log per file.
    """
    assert isinstance(dataset_exponent, int)
    assert isinstance(dataset_number, int)
    if dataset_exponent in [3, 4, 5]:
        # Multiple logs share one file: find the file, then the block in it.
        assert dataset_number < 10 ** (7 - dataset_exponent)
        rows_per_dataset = 10 ** dataset_exponent
        datasets_per_file = 500000 // rows_per_dataset
        csv_path = (retailer_folder / f'D{dataset_exponent}'
                    / f'{dataset_number // datasets_per_file}.csv')
        frame = dataframe_operations.import_df(csv_path)
        offset = (dataset_number % datasets_per_file) * rows_per_dataset
        return logs.ActivationLog(frame.iloc[offset:offset + rows_per_dataset])
    if dataset_exponent in [6, 7]:
        # One log per file.
        assert (dataset_exponent == 6
                and dataset_number < 10) or (dataset_exponent == 7
                                             and dataset_number == 0)
        return logs.ActivationLog(retailer_folder / f'D{dataset_exponent}'
                                  / f'{dataset_number}.csv')
    raise ValueError('Illegal dataset_exponent')
def al_extended_example2():
    """
    Load the extended-example Activation Log from the implementation
    guide samples, using the example category list.

    Returns
    -------
    al: logs.ActivationLog
        The extended-example Activation Log
    """
    csv_path = (public_data_folder / 'samples' / 'implementation guide'
                / 'extended_example2.csv')
    return logs.ActivationLog(csv_path, categories=example_cat_list())
def al_running_example():
    """
    Load the running-example Activation Log from the implementation
    guide samples, using the example category list.

    Returns
    -------
    al: logs.ActivationLog
        The running-example Activation Log
    """
    csv_path = (public_data_folder / 'samples' / 'implementation guide'
                / 'running_example.csv')
    return logs.ActivationLog(csv_path, categories=example_cat_list())
def al_sample(exp):
    """
    Load sample Activation Log `S{exp}` of the real dataset.

    Parameters
    ----------
    exp: int
        Sample index, in 1...4

    Returns
    -------
    al: logs.ActivationLog
        The sample Activation Log
    """
    assert exp in range(1, 5), 'given exp must be in 1...4'
    sample_path = (public_data_folder / 'samples' / 'real dataset'
                   / f'S{exp}.csv')
    return logs.ActivationLog(sample_path)