def al_apkdd(dataset_number):
    assert dataset_number < 100
    sr = pd.Series(
        data=range(100000)).apply(lambda x: (x % 1000) // 10 == dataset_number)
    df = dataframe_operations.import_df(
        'data/public/PKDD/ActivationLogs2.csv')[sr]
    return logs.ActivationLog(df)
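# Illustration of the mask above (not part of the original function): the CSV is
# assumed to hold 100,000 rows in blocks of 1,000, and within every block the
# rows 10*n .. 10*n + 9 belong to dataset n. For dataset_number == 3 the mask
# therefore keeps rows 30-39, 1030-1039, ..., 99030-99039: 1,000 rows in total.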
def pkdd_key():
    dfs = []
    for i in range(3):
        fn = Path(r'data\private\PKDD') / f'{i}.csv'
        dfs.append(
            dataframe_operations.import_df(fn)[[f'PKDD_{invoice}', invoice]])
    return pd.concat(dfs)
    def __init__(self, fd, fd_out=None, figure_extension='svg'):
        """
        Object to parse the results of multiple bootstraps

        Parameters
        ----------
        fd: str or Path
            Location of the results of the multiple bootstraps
        fd_out: str, Path or None
            Location of the output. If None, fd/results is used.
        figure_extension: str
            Extension of saved figures
        """

        # Input / Output
        fd = Path(fd)
        assert fd.exists()

        if fd_out is None:
            self.__fd_out = fd / 'results'
        else:
            self.__fd_out = Path(fd_out)
        assert isinstance(figure_extension, str)

        self.ext = figure_extension

        # Read the general settings (common to all repetitions)
        self.settings = MultipleBootstrapSettings(fd)

        self.hierarchy = data.data_loader.generic_hierarchy(
            self.settings.hierarchy_name)
        self.weight_column_names = [ISC_WEIGHT] + self.hierarchy.sc
        self.medoid_column_names = [
            f'{medoid}_{i}' for i in range(self.settings.k)
        ]

        self.sbr = dict()
        df = dataframe_operations.import_df(fd / 'results.csv')
        for rep in df[REPETITION].unique():
            df_rep = df[df[REPETITION] == rep].drop(
                columns=REPETITION).set_index(ITERATION)
            self.sbr[rep] = BootstrapResult(df_rep, self.settings.k)

        # TODO : refactor this to self.results = self.compute_results()
        self.results = pd.DataFrame(
            index=list(self.sbr.keys()),
            columns=[f'{supercat}_{i}' for i in range(self.hierarchy.h + 1)] +
            [f'w_{i}' for i in range(self.hierarchy.h + 1)] +
            [f'm_{i}' for i in range(self.settings.k)])
        self.fill_results()

        self.summary = self.results.groupby(fpw_str).mean()[[IEM] + self.weight_column_names] \
            .assign(**{IEM + '_std': self.results.groupby(fpw_str).std()[IEM],
                       SUPPORT: self.results[fpw_str].value_counts()}) \
            .sort_values(SUPPORT, ascending=False) \
            .assign(**{fp_id: range(1, len(self.results[fpw_str].unique()) + 1)})
        self.results[fp_id] = self.results[fpw_str].replace(
            dict(self.summary[fp_id].items()))
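# Hypothetical usage sketch; the class name is not visible in this snippet, so
# `MultipleBootstrapResult` below is an assumption, as is the folder path:
#
#   mbr = MultipleBootstrapResult('output/mbr_run', figure_extension='pdf')
#   mbr.summary          # one row per fingerprint, sorted by support
#   mbr.results[fp_id]   # fingerprint id assigned to each repetition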
def assert_hierarchy_has_correct_column_names(source, require_sku=True):
    df = dataframe_operations.import_df(source)

    expected_n_levels = len(df.columns) - (1 if require_sku else 0)

    for i in range(1, expected_n_levels + 1):
        assert hl_name(i) in df.columns, f'{hl_name(i)} not in columns.'
    if require_sku:
        assert sku in df.columns, f'{sku} not in columns'
def assert_is_retail_sku_info(source):
    """
    Asserts that a given source satisfies the requirements of retailer SKU info.

    Parameters
    ----------
    source: str, Path, or pd.DataFrame
        The source to be checked

    """
    source = dataframe_operations.import_df(source)
    if source.index.name == sku:
        source = source.reset_index(drop=False)
    assert set(source.columns) == {sku, hl_encoding(0), sku_des}
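# Hedged sketch of a frame that would pass the check above (sku, hl_encoding and
# sku_des are project-level names; the values shown here are made up):
#
#   pd.DataFrame({sku: ['123', '456'],
#                 hl_encoding(0): ['0.1.2', '0.1.3'],
#                 sku_des: ['bread 500g', 'milk 1l']})
#
# A frame indexed by sku instead of carrying it as a column is also accepted.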
def assert_hierarchy_has_non_duplicate_encoding(source):
    df = dataframe_operations.import_df(source)
    level = 0
    while hl_encoding(level + 1) in df.columns:
        level += 1

    assert level > 0, f'{hl_encoding(1)} not found.'

    if level == 1:
        return

    for i in range(2, level + 1):
        assert len(df[hl_encoding(i)].drop_duplicates()) == \
               len(df[[hl_encoding(i - 1), hl_encoding(i)]].drop_duplicates()), f'Duplicate {hl_encoding(i)}'
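# Standalone illustration of the invariant checked above, using generic column
# names rather than the project's hl_encoding labels: every level-2 code must
# determine exactly one level-1 code, so the two counts below are equal.
import pandas as pd

_demo = pd.DataFrame({'enc_1': ['0', '0', '1'],
                      'enc_2': ['0.0', '0.1', '1.2']})
assert len(_demo['enc_2'].drop_duplicates()) == \
       len(_demo[['enc_1', 'enc_2']].drop_duplicates())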
def run():
    # TODO FIX hard coded
    h = data_loader.generic_hierarchy('A3')

    # Hard-coded: iterating over the string yields the single sample label 'H'
    for sample in 'H':

        # Read the cluster inclusion values.
        # Each file contains the inclusion values from one cluster set over all datasets
        cluster_dfs = dict()
        csv_files = [
            fn for fn in file_functions.list_files(
                PKDD_PARAMETERS.RESULTS_2(sample), False) if fn[-4:] == '.csv'
        ]

        for fn in csv_files:
            if '_' in fn:
                m = fn.split('_')[1][:-4]
                cluster_dfs[m] = dataframe_operations.import_df(
                    PKDD_PARAMETERS.RESULTS_2(sample) / fn)
                if DATASET_NAME in cluster_dfs[m].columns:
                    cluster_dfs[m] = cluster_dfs[m].drop(columns=DATASET_NAME)

        # Combine the results for each cluster
        df_mean = pd.DataFrame()
        df_ci = pd.DataFrame()
        for k, v in cluster_dfs.items():
            df_mean[k] = v.mean()
            df_ci[k] = confidence_interval.std_n_to_ci(v.std(), len(v), 0.95)

        # Create a tex file
        cluster_analysis.cluster_statistics_2_tex(
            df_mean,
            h,
            PKDD_PARAMETERS.RESULTS_2(sample) / 'cluster_analysis.tex',
            df_ci=df_ci,
            inclusion_missing=False,
            label='tab:res:inclusion')
        cluster_analysis.cluster_statistics_2_tex(
            df_mean,
            h,
            PKDD_PARAMETERS.RESULTS_2(sample) / 'cluster_analysis_full.tex',
            df_ci=df_ci,
            num_c=99,
            inclusion_missing=False,
            label='tab:res:inclusion')
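# Note on the confidence intervals above: std_n_to_ci is a project helper whose
# formula is not shown here; presumably it turns a sample standard deviation and
# sample size into a 95% half-width, roughly 1.96 * std / sqrt(n) under a normal
# approximation (an assumption, not confirmed by this listing).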
def compress_mbr(mbr_fd):
    mbr_fd = Path(mbr_fd)
    sbr_folder = mbr_fd / 'single bootstraps'

    cycles = pd.Series(dtype=int)
    results = []

    for sbr_fd in file_functions.list_dirs(sbr_folder, False):
        fd = mbr_fd / 'single bootstraps' / sbr_fd

        with open(fd / 'cycle.txt', 'r') as rf:
            cycles[sbr_fd] = int(rf.readline())

        df_results = dataframe_operations.import_df(fd / 'results.csv', dtype=str)
        df_results[REPETITION] = sbr_fd
        results.append(df_results)

    dataframe_operations.export_df(cycles, mbr_fd / 'cycles.csv')
    dataframe_operations.export_df(pd.concat(results), mbr_fd / 'results.csv')

    file_functions.delete(sbr_folder)
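# Folder layout assumed by compress_mbr, as inferred from the reads and writes above:
#
#   <mbr_fd>/single bootstraps/<repetition>/cycle.txt     one integer per repetition
#   <mbr_fd>/single bootstraps/<repetition>/results.csv   per-iteration results
#
# which it flattens into <mbr_fd>/cycles.csv and <mbr_fd>/results.csv (the latter
# gaining a REPETITION column) before deleting the per-repetition folders.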
def encode_hierarchy(source):
    df = dataframe_operations.import_df(source)
    assert_hierarchy_has_correct_column_names(df)
    num_levels = get_n_levels(df)

    def create_encoding(f, k):
        df.loc[:, hl_encoding(k)] = df[hl_name(k)].map({
            n: f'{j:0{int(f)}d}'
            for j, n in enumerate(df[hl_name(
                k)].drop_duplicates().sort_values())
        })

    # Lower levels
    for i in range(1, num_levels + 1):
        # Note that children are not necessarily sequential, as each level is encoded separately.
        create_encoding(f=np.floor(np.log10(len(df[hl_name(i)].unique()))) + 1,
                        k=i)
        if i != 1:
            df.loc[:, hl_encoding(i)] = df[hl_encoding(i - 1)] + '.' + df[
                hl_encoding(i)]

    return df
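# Worked example of the encoding produced above (hypothetical names, real logic):
# with level-1 names {Food, Non-food} and level-2 names {Bread, Dairy, Soap},
# each level is encoded separately (zero-padded to its own width) and then
# prefixed with the parent's code:
#
#   Food     -> '0'        Bread (Food)      -> '0.0'
#   Non-food -> '1'        Dairy (Food)      -> '0.1'
#                          Soap  (Non-food)  -> '1.2'   (not '1.0': levels are
#                                                        encoded independently)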
    def __init__(self, source):
        """
        Initializes a log from a DataFrame, File, or Log

        Parameters
        ----------
        source: str, Path, pd.DataFrame, or an instance of the same class
            Source of the log. If str or Path, a DataFrame is read from that location. If it is an instance
            of the same class, its df is reused.
        """
        if isinstance(source, type(self)):
            self.df = source.df
            return
        elif isinstance(source, str) or isinstance(source, Path):
            assert Path(source).exists()
            source = dataframe_operations.import_df(source, dtype=dtypes)
        assert isinstance(
            source,
            pd.DataFrame), 'Source is not a file or DataFrame or same type'

        self.df = pd.DataFrame()
        for c in self.required_features():
            self.df.loc[:, c] = source[c].astype(dtypes[c])
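# Hypothetical usage sketch (ActivationLog, used elsewhere in this listing, is
# assumed to be one such Log subclass):
#
#   log = logs.ActivationLog('data/public/PKDD/ActivationLogs2.csv')  # from a file
#   log = logs.ActivationLog(df)                                      # from a DataFrame
#   log = logs.ActivationLog(log)                                     # from another log (its df is reused)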
def al_data(dataset_exponent, dataset_number):
    assert isinstance(dataset_exponent, int)
    assert isinstance(dataset_number, int)

    if dataset_exponent in [3, 4, 5]:
        # These are saved with multiple values in one log
        assert dataset_number < 10**(7 - dataset_exponent)
        size = 10**dataset_exponent
        number_of_datasets_per_file = 500000 // size
        file_number = dataset_number // number_of_datasets_per_file
        df = dataframe_operations.import_df(
            retailer_folder / f'D{dataset_exponent}' / f'{file_number}.csv')
        dataset_number_in_file = dataset_number % number_of_datasets_per_file
        df = df.iloc[dataset_number_in_file *
                     size:(dataset_number_in_file + 1) * size]
        return logs.ActivationLog(df)
    elif dataset_exponent in [6, 7]:
        assert (dataset_exponent == 6 and dataset_number < 10) or \
               (dataset_exponent == 7 and dataset_number == 0)
        return logs.ActivationLog(retailer_folder / f'D{dataset_exponent}' /
                                  f'{dataset_number}.csv')
    else:
        raise ValueError('Illegal dataset_exponent')
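# Worked example of the file/offset arithmetic above: for dataset_exponent=4 and
# dataset_number=123, size = 10_000 and 500_000 // 10_000 = 50 datasets per file,
# so the data sits in D4/2.csv (123 // 50 == 2) at rows
# (123 % 50) * 10_000 .. (123 % 50 + 1) * 10_000 - 1 = 230_000 .. 239_999.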
def get_n_levels(hierarchy):
    df = dataframe_operations.import_df(hierarchy)
    return len(df.columns) - (1 if sku in df.columns else 0)