def al_apkdd(dataset_number):
    """
    Load one of the 100 PKDD activation-log datasets.

    Parameters
    ----------
    dataset_number: int
        Which dataset to select; must be in [0, 100).

    Returns
    -------
    logs.ActivationLog
        The activation log built from the rows of ActivationLogs2.csv that
        belong to the requested dataset.
    """
    assert dataset_number < 100
    # Rows are assigned to datasets in a repeating pattern: within every
    # run of 1000 rows, rows [10*d, 10*d + 10) belong to dataset d.
    # Vectorized comparison instead of a per-row .apply(lambda).
    sr = (pd.Series(data=range(100000)) % 1000) // 10 == dataset_number
    # The path has no placeholders, so a plain string (no f-prefix) suffices.
    df = dataframe_operations.import_df(
        'data/public/PKDD/ActivationLogs2.csv')[sr]
    return logs.ActivationLog(df)
def pkdd_key():
    """
    Build the PKDD key table by concatenating the three private key files.

    Returns
    -------
    pd.DataFrame
        Concatenation of files 0.csv, 1.csv and 2.csv, restricted to the
        PKDD invoice column and the invoice column.
    """
    # BUG FIX: pd.concat replaces DataFrame.append, which was removed in
    # pandas 2.0; also Path('data/private/PKDD') with forward slashes is
    # portable, whereas the raw backslash string only worked on Windows.
    frames = []
    for i in range(3):
        fn = Path('data/private/PKDD') / f'{i}.csv'
        frames.append(
            dataframe_operations.import_df(fn)[[f'PKDD_{invoice}', invoice]])
    return pd.concat(frames)
def __init__(self, fd, fd_out=None, figure_extension='svg'): """ Object to parse the results of multiple bootstraps Parameters ---------- fd: str or Path Location of the results of the multiple bootstraps fd_out: str, Path or None Location of the output. If None, fd/results is used. figure_extension: str Extension of saved figures """ # Input / Output fd = Path(fd) assert fd.exists() if fd_out is None: self.__fd_out = fd / 'results' else: self.__fd_out = Path(fd_out) assert isinstance(figure_extension, str) self.ext = figure_extension # Read the general settings (common to all repetitions) self.settings = MultipleBootstrapSettings(fd) self.hierarchy = data.data_loader.generic_hierarchy( self.settings.hierarchy_name) self.weight_column_names = [ISC_WEIGHT] + self.hierarchy.sc self.medoid_column_names = [ f'{medoid}_{i}' for i in range(self.settings.k) ] self.sbr = dict() df = dataframe_operations.import_df(fd / 'results.csv') for rep in df[REPETITION].unique(): df_rep = df[df[REPETITION] == rep].drop( columns=REPETITION).set_index(ITERATION) self.sbr[rep] = BootstrapResult(df_rep, self.settings.k) # TODO : refactor this to self.results = self.compute_results() self.results = pd.DataFrame( index=list(self.sbr.keys()), columns=[f'{supercat}_{i}' for i in range(self.hierarchy.h + 1)] + [f'w_{i}' for i in range(self.hierarchy.h + 1)] + [f'm_{i}' for i in range(self.settings.k)]) self.fill_results() self.summary = self.results.groupby(fpw_str).mean()[[IEM] + self.weight_column_names] \ .assign(**{IEM + '_std': self.results.groupby(fpw_str).std()[IEM], SUPPORT: self.results[fpw_str].value_counts()}) \ .sort_values(SUPPORT, ascending=False) \ .assign(**{fp_id: range(1, len(self.results[fpw_str].unique()) + 1)}) self.results[fp_id] = self.results[fpw_str].replace( dict(self.summary[fp_id].items()))
def assert_hierarchy_has_correct_column_names(source, require_sku=True):
    """
    Assert that a hierarchy source has one level-name column per level.

    Parameters
    ----------
    source: str, Path, or pd.DataFrame
        The hierarchy to be checked.
    require_sku: bool
        If True, one column is reserved for the sku and must be present.
    """
    df = dataframe_operations.import_df(source)
    # Every column is a level name, except (optionally) the sku column.
    n_levels = len(df.columns) - (1 if require_sku else 0)
    for level in range(1, n_levels + 1):
        assert hl_name(level) in df.columns, f'{hl_name(level)} not in columns.'
    if require_sku:
        assert sku in df.columns, f'{sku} not in columns'
def assert_is_retail_sku_info(source):
    """
    Asserts a given source satisfied the requirements of retailer sku info

    Parameters
    ----------
    source: str, Path, or pd.DataFrame
        The source to be checked
    """
    source = dataframe_operations.import_df(source)
    if source.index.name == sku:
        # BUG FIX: reset_index is not in-place; the result must be
        # reassigned, otherwise the sku index never becomes a column and
        # the assertion below fails for sku-indexed input.
        source = source.reset_index(drop=False)
    assert set(source.columns) == {sku, hl_encoding(0), sku_des}
def assert_hierarchy_has_non_duplicate_encoding(source):
    """
    Assert that each encoding level determines its parent level uniquely.

    For every level i >= 2, the number of distinct encodings must equal the
    number of distinct (parent encoding, encoding) pairs — i.e. no encoding
    is shared between different parents.

    Parameters
    ----------
    source: str, Path, or pd.DataFrame
        The hierarchy to be checked.
    """
    df = dataframe_operations.import_df(source)
    # Count how many consecutive encoding columns the source contains.
    n_levels = 0
    while hl_encoding(n_levels + 1) in df.columns:
        n_levels += 1
    assert n_levels > 0, f'{hl_encoding(1)} not found.'
    # With a single level there is nothing to cross-check (range is empty).
    for lvl in range(2, n_levels + 1):
        children = df[hl_encoding(lvl)].drop_duplicates()
        pairs = df[[hl_encoding(lvl - 1), hl_encoding(lvl)]].drop_duplicates()
        assert len(children) == len(pairs), f'Duplicate {hl_encoding(lvl)}'
def run():
    """
    Combine per-cluster inclusion results into mean/CI tables and tex files.

    For each sample, reads every '*_<m>.csv' result file, averages the
    inclusion values per cluster set, computes 95% confidence intervals,
    and writes two tex tables (truncated and full).
    """
    # TODO FIX hard coded
    h = data_loader.generic_hierarchy('A3')
    for sample in 'H':
        # Read the cluster inclusion values.
        # Each file contains the inclusion values from one cluster set over all datasets
        cluster_dfs = dict()
        csv_files = [
            fn for fn in file_functions.list_files(
                PKDD_PARAMETERS.RESULTS_2(sample), False)
            if fn.endswith('.csv')
        ]
        for fn in csv_files:
            if '_' in fn:
                # Cluster-set name is the part after '_', minus '.csv'.
                m = fn.split('_')[1][:-4]
                cluster_dfs[m] = dataframe_operations.import_df(
                    PKDD_PARAMETERS.RESULTS_2(sample) / fn)
                if DATASET_NAME in cluster_dfs[m].columns:
                    # BUG FIX: drop() is not in-place; the result was
                    # previously discarded, leaving the column in place.
                    cluster_dfs[m] = cluster_dfs[m].drop(columns=DATASET_NAME)
        # Combine the results for each cluster
        df_mean = pd.DataFrame()
        df_ci = pd.DataFrame()
        for k, v in cluster_dfs.items():
            df_mean[k] = v.mean()
            df_ci[k] = confidence_interval.std_n_to_ci(v.std(), len(v), 0.95)
        # Create a tex file
        cluster_analysis.cluster_statistics_2_tex(
            df_mean,
            h,
            PKDD_PARAMETERS.RESULTS_2(sample) / 'cluster_analysis.tex',
            df_ci=df_ci,
            inclusion_missing=False,
            label='tab:res:inclusion')
        cluster_analysis.cluster_statistics_2_tex(
            df_mean,
            h,
            PKDD_PARAMETERS.RESULTS_2(sample) / 'cluster_analysis_full.tex',
            df_ci=df_ci,
            num_c=99,
            inclusion_missing=False,
            label='tab:res:inclusion')
def compress_mbr(mbr_fd):
    """
    Merge per-repetition bootstrap outputs into two csv files and clean up.

    Reads cycle.txt and results.csv from every folder under
    'single bootstraps', writes the combined cycles.csv and results.csv in
    mbr_fd, then deletes the 'single bootstraps' folder.

    Parameters
    ----------
    mbr_fd: str or Path
        Folder of a multiple-bootstrap run.
    """
    mbr_fd = Path(mbr_fd)
    sbr_folder = mbr_fd / 'single bootstraps'
    cycles = pd.Series(dtype=int)
    # BUG FIX / PERF: DataFrame.append was removed in pandas 2.0 and was
    # quadratic in a loop; collect the frames and concat once instead.
    frames = []
    for sbr_fd in file_functions.list_dirs(sbr_folder, False):
        fd = sbr_folder / sbr_fd
        with open(fd / 'cycle.txt', 'r') as rf:
            cycles[sbr_fd] = int(rf.readline())
        df_results = dataframe_operations.import_df(fd / 'results.csv',
                                                    dtype=str)
        # Tag each row with the repetition (folder name) it came from.
        df_results[REPETITION] = sbr_fd
        frames.append(df_results)
    # Empty run (no repetition folders) still produces an empty results.csv.
    results = pd.concat(frames) if frames else pd.DataFrame()
    dataframe_operations.export_df(cycles, mbr_fd / 'cycles.csv')
    dataframe_operations.export_df(results, mbr_fd / 'results.csv')
    file_functions.delete(sbr_folder)
def encode_hierarchy(source):
    """
    Add a zero-padded encoding column for every level of a hierarchy.

    Each level's distinct names are sorted and numbered 0..n-1, padded to a
    fixed width; from level 2 onward the parent's encoding is prepended with
    a '.' separator, so a full encoding identifies the path from the root.

    Parameters
    ----------
    source: str, Path, or pd.DataFrame
        The hierarchy to be encoded.

    Returns
    -------
    pd.DataFrame
        The hierarchy with one extra encoding column per level.
    """
    df = dataframe_operations.import_df(source)
    assert_hierarchy_has_correct_column_names(df)

    for level in range(1, get_n_levels(df) + 1):
        # Width that fits the largest index at this level.
        width = int(np.floor(np.log10(len(df[hl_name(level)].unique()))) + 1)
        ordered_names = df[hl_name(level)].drop_duplicates().sort_values()
        codes = {
            name: f'{idx:0{width}d}'
            for idx, name in enumerate(ordered_names)
        }
        df.loc[:, hl_encoding(level)] = df[hl_name(level)].map(codes)
        if level != 1:
            # Note that children are not necessarily sequential, as each
            # level is encoded separately; prefixing the parent encoding
            # keeps full codes unique.
            df.loc[:, hl_encoding(level)] = (
                df[hl_encoding(level - 1)] + '.' + df[hl_encoding(level)])
    return df
def __init__(self, source):
    """
    Initializes a log from a DataFrame, File, or Log

    Parameters
    ----------
    source: str, Path, pd.DataFrame, or same type
        source of the log. If str or Path, the source is read as DataFrame
        from this location. If same type, the df of the source is used
    """
    # Copy-construct from a log of the same type: share its DataFrame.
    if isinstance(source, type(self)):
        self.df = source.df
        return
    # A path-like source is first read into a DataFrame.
    if isinstance(source, (str, Path)):
        assert Path(source).exists()
        source = dataframe_operations.import_df(source, dtype=dtypes)
    assert isinstance(
        source,
        pd.DataFrame), 'Source is not a file or DataFrame or same type'
    # Keep only the required features, coerced to their declared dtypes.
    self.df = pd.DataFrame()
    for feature in self.required_features():
        self.df.loc[:, feature] = source[feature].astype(dtypes[feature])
def al_data(dataset_exponent, dataset_number):
    """
    Load one retailer activation-log dataset of size 10**dataset_exponent.

    Datasets of size 10^3..10^5 are bundled several-to-a-file (500000 rows
    per file); sizes 10^6 and 10^7 are stored one dataset per file.

    Parameters
    ----------
    dataset_exponent: int
        Log10 of the dataset size; must be in [3, 7].
    dataset_number: int
        Index of the dataset for this exponent.

    Returns
    -------
    logs.ActivationLog
    """
    assert isinstance(dataset_exponent, int)
    assert isinstance(dataset_number, int)
    folder = retailer_folder / f'D{dataset_exponent}'
    if dataset_exponent in (3, 4, 5):
        # These are saved with multiple values in one log
        assert dataset_number < 10**(7 - dataset_exponent)
        size = 10**dataset_exponent
        datasets_per_file = 500000 // size
        # File index and position of the dataset within that file.
        file_number, offset = divmod(dataset_number, datasets_per_file)
        df = dataframe_operations.import_df(folder / f'{file_number}.csv')
        return logs.ActivationLog(df.iloc[offset * size:(offset + 1) * size])
    if dataset_exponent in (6, 7):
        assert (dataset_exponent == 6 and dataset_number < 10) or \
               (dataset_exponent == 7 and dataset_number == 0)
        return logs.ActivationLog(folder / f'{dataset_number}.csv')
    raise ValueError('Illegal dataset_exponent')
def get_n_levels(hierarchy):
    """
    Return the number of hierarchy levels in a source.

    Parameters
    ----------
    hierarchy: str, Path, or pd.DataFrame
        The hierarchy source.

    Returns
    -------
    int
        The number of columns, minus one if a sku column is present.
    """
    df = dataframe_operations.import_df(hierarchy)
    # BUG FIX: inspect the imported DataFrame's columns, not the raw
    # argument — a str/Path source has no .columns attribute and would
    # raise AttributeError (cf. assert_hierarchy_has_correct_column_names).
    return len(df.columns) - (1 if sku in df.columns else 0)