コード例 #1
0
    def finish_initialization(self, weights, medoids, clusters,
                              initialization_type):
        """
        Save and report the results of the initialization.

        The weights, medoids, cluster sizes and cluster labels are stored in row 0 of ``self.weights``,
        ``self.medoids``, ``self.cluster_sizes`` and ``self.clusters`` respectively. The label of a cluster is its
        position in ``clusters``.

        Parameters
        ----------
        weights: dict of str to Number, iterable of Number
            The weights to be used. See :py:meth:`validate_and_convert_weights()
            <cjo.weighted_adapted_jaccard.distances.implementation.DSX.validate_and_convert_weights>`.
        medoids: iterable of int
            The medoids after initialization
        clusters: iterable of DSX
            The clusters after initialization
        initialization_type: str
            How the initialization was done. Should be one of

            - BootstrapResults.StartReceipts
            - BootstrapResults.StartDSX
            - BootstrapResults.StartWeights
            - BootstrapResults.StartDefault

        Raises
        ------
        AssertionError
            If the initialization was already stored, or if the number of medoids or clusters is not ``self.k``.

        """

        assert self.expected[0] is None, 'Already initialized'
        self.expected = (1, True)

        # Validation and conversion
        weights = self.dsx.get_named_weight_vector(weights)

        # Convert before checking the length (same order as finish_cluster_step), so that iterables without
        # __len__ are accepted as documented.
        medoids = listified(medoids, np.uint64)
        assert len(medoids) == self.k

        clusters = listified(clusters, DSX)
        assert len(clusters) == self.k

        # Store information
        for k, v in weights.items():
            self.weights.loc[0, k] = v
        for lab, (med, cluster) in enumerate(zip(medoids, clusters)):
            self.medoids.loc[0, lab] = med
            self.cluster_sizes.loc[0, lab] = cluster.size_r
            # Every key of the cluster gets the cluster's label
            self.clusters.loc[0, cluster.keys()] = lab

        # print results
        self.msg_service.send_message(initialization_type)
        self.msg_service.send_message('*' * len(initialization_type))
        self.__weights_print(weights, 'Initial weights')
        self.__cluster_print(weights, medoids, clusters, 'Initial clusters')
        s = 'Iteration 1'
        self.msg_service.send_message(f'\n{s}\n{"-" * len(s)}')
コード例 #2
0
ファイル: logs.py プロジェクト: YorickSpenrath/PKDD2020
    def assert_unique(self, f, idx=None):
        """
        Assert feature is unique for each index in the DataFrame.

        The check compares the number of distinct index values with the number of distinct
        (index, feature) combinations; they differ when some index value has more than one feature value.

        Parameters
        ----------
        f: str or Iterable(str)
            The feature(s) to be asserted
        idx: str or None
            The column to treat as index. If None, the index of the DataFrame is used.

        Raises
        ------
        AssertionError
            If ``idx`` is given but is not a column of the DataFrame.
        ValueError
            If any index value has more than one value (combination) of the feature(s).

        """
        f = listified(f, str, validation=lambda x: x in self.df.columns)
        if idx is None:
            # On index: expose the index as a regular column so it can be de-duplicated together with the
            # features. Without de-duplication both lengths equal len(df) and the check could never fail.
            index_name = self.index.name
            frame = self.df.reset_index(drop=False)
            len_vid = len(frame[index_name].drop_duplicates())
            len_f = len(frame[[index_name] + f].drop_duplicates())
        else:
            # On column
            assert idx in self.df.columns, 'given idx not in df.columns'
            len_vid = len(self[idx].drop_duplicates())
            len_f = len(self[[idx] + f].drop_duplicates())

        if len_f != len_vid:
            if len(f) == 1:
                raise ValueError('Duplicate value of {}'.format(f[0]))
            else:
                # Name every feature of the combination, not only the first one
                raise ValueError('Duplicate combinations of {}'.format(', '.join(f)))
コード例 #3
0
ファイル: logs.py プロジェクト: YorickSpenrath/PKDD2020
    def subset_visit(self, invoices):
        """
        Build a new log restricted to the given invoice ids.

        Parameters
        ----------
        invoices: str or iterable of str
            The invoice ids to keep.

        Returns
        -------
        Log: Log
            A log of the same type as this one, containing only the requested invoices. Requested invoices that
            are not present are ignored.

        Raises
        ------
        TypeError
            If the invoice is neither a column nor the index of this log.
        """
        wanted = listified(invoices, str)

        # Invoice stored as a regular column: select rows with a boolean mask
        if invoice in self.columns:
            row_mask = self.df[invoice].isin(wanted)
            return type(self)(self.df[row_mask])

        if self.index.name != invoice:
            raise TypeError(f'Cannot perform this operation on {type(self)}')

        # Invoice stored as the index: drop the ids that are absent before the label-based lookup
        present = listified(wanted, str, filtering=lambda v: v in self.index)
        return type(self)(self.df.loc[present])
コード例 #4
0
 def fixed_point_info(self):
     """
     Describe the fixed point of this single bootstrap repetition as one Series.

     The Series holds, in this order: the index labels of the final weights sorted by descending weight
     (keys ``{supercat}_{i}``), the corresponding weight values (keys ``w_{i}``), and the sorted final
     medoids (keys ``m_{i}``).
     """
     medoids = listified(self.final_medoids,
                         int,
                         sort=True,
                         validation=lambda m: m > 0)
     ranked = self.final_weights.sort_values(ascending=False)

     info = {}
     for position, name in enumerate(ranked.index):
         info[f'{supercat}_{position}'] = name
     for position, weight in enumerate(ranked.values):
         info[f'w_{position}'] = weight
     for position, medoid in enumerate(medoids):
         info[f'm_{position}'] = medoid
     return pd.Series(data=info)
コード例 #5
0
ファイル: bitops.py プロジェクト: YorickSpenrath/PKDD2020
def ints2bitvectors(ints):
    """
    Converts multiple integers to same-length bitvectors

    The common length is the number of characters in the binary representation of the largest integer.

    Parameters
    ----------
    ints: iterable of int
        The integers

    Returns
    -------
    bit_vectors: iterable of list of ints
        The bit_vector representation of the integers in ints, all with the same length. The bit vectors are
        produced lazily. For empty input an empty iterator is returned.
    """

    ints = listified(ints, int)
    if len(ints) == 0:
        # max() raises ValueError on an empty sequence; there is nothing to convert
        return iter(())
    bit_length = len(f'{max(ints):b}')
    return (int2bitvector(i, bit_length) for i in ints)
コード例 #6
0
    def assert_partition(self, clustering):
        """
        Check that a collection of DSX partitions this DSX.

        The keys of this DSX and the keys of every given DSX are handed, as sets, to
        ``assert_valid_partition``.

        Parameters
        ----------
        clustering: iterable of DSX
            The DSX objects that together should form the partition

        Raises
        ------
        AssertionError
            If the given collection of DSX is not a valid partition over this DSX

        """
        parts = listified(clustering, DSX)
        own_keys = set(self.keys())
        part_keys = [set(part.keys()) for part in parts]
        assert_valid_partition(own_keys, part_keys)
コード例 #7
0
ファイル: bitops.py プロジェクト: YorickSpenrath/PKDD2020
def bitvector2int(bitvector):
    """
    Computes the integer representation of a bit vector. The inverse of int2bitvector

    The first element of the bit vector is the most significant bit.

    Parameters
    ----------
    bitvector: iterable(int)
        The bit vector

    Returns
    -------
    i: int
        The integer representation of the bit vector. An empty bit vector yields 0.

    Raises
    ------
    AssertionError:
        If any of the values in bitvector is not an int in [0, 1]
    """
    bitvector = listified(bitvector, int, validation=lambda x: x in [0, 1])
    # Fold the bits arithmetically instead of going through a string: this also works for an empty vector
    # (int('', 2) raises) and for bit values whose str() is not '0'/'1', such as booleans.
    value = 0
    for bit in bitvector:
        value = (value << 1) | int(bit)
    return value
コード例 #8
0
    def __init__(self, dataset, hierarchy, visit_ids_per_rp=None):
        r"""
        This class tracks the dataset, hierarchy, and num_cat, such that all assertions are done once. It is meant to
        represent :math:`X`, though we track the hierarchy in order to allow computations of :math:`s(r)`, and as such
        of :math:`X^2|_z`. The given input is based on :math:`r' \in R'`, as discussed in Subsection
        \ref{sec:compopt:ws2w}.

        Parameters
        ----------
        dataset : Counter
            The frequency of each representation (the value of :math:`\mathcal{D'}`). The keys must be instances of
            ``int``.
        hierarchy: Hierarchy
            The hierarchy used for this DSX
        visit_ids_per_rp: dict : int -> (list of str) or None
            The visit_ids per rp value. If None, every representation in the dataset gets an empty list.

        Raises
        ------
        AssertionError
            If the largest original receipt in the dataset is not smaller than :math:`2^{|\mathcal{C}|}`.

            If the smallest original receipt in the dataset is not larger than 0.

            If the sum of hierarchy_vector_prime[1:] is not equal to :math:`2^{|\mathcal{C}|} - 1` or if any of the
            values of the hierarchy_vector_prime is 0. (i.e. if the hierarchy is not a partition over the categories).

            If the first value of the hierarchy_vector_prime is not equal to
            :math:`(2^h) \ll |\mathcal{C}|`.

            If the first value of the hierarchy_names_prime is not equal to :math:`c_0`.
        """
        num_cat = hierarchy.num_cat
        # Integer encoding of every super-category, in the order of hierarchy.sc
        h_vector = [
            bitops.subset2int(hierarchy[k], hierarchy.c) for k in hierarchy.sc
        ]
        # The first entry covers the bits above the num_cat category bits
        hierarchy_vector_prime = [(
            (2**hierarchy.h - 1) << hierarchy.num_cat)] + h_vector
        hierarchy_names_prime = [ISC_WEIGHT] + hierarchy.sc
        self.hierarchy = hierarchy

        # INPUT TYPE VERIFICATION #
        assert isinstance(dataset, Counter)
        assert all(isinstance(di, int) for di in dataset.keys())
        hierarchy_vector_prime = np.array(hierarchy_vector_prime,
                                          dtype=np.uint64)
        assert isinstance(hierarchy_names_prime, list)
        hierarchy_names_prime = listified(hierarchy_names_prime, str)
        assert isinstance(num_cat, int)

        if visit_ids_per_rp is None:
            self.visit_ids_per_rp = {rp: [] for rp in dataset.keys()}
        else:
            assert isinstance(visit_ids_per_rp, dict)
            self.visit_ids_per_rp = visit_ids_per_rp

        # Number of super-categories, not counting the leading ISC_WEIGHT entry
        h = len(hierarchy_names_prime) - 1

        # DATASET VERIFICATION #
        # Keep only the lowest num_cat bits of every representation
        original_receipts = {
            r_prime & (2**num_cat - 1)
            for r_prime in dataset.keys()
        }
        # All original datapoints must be a subset of C
        # NOTE(review): after the masking above this bound always holds; kept as a guard.
        assert max(original_receipts) < 2 ** num_cat, \
            'maximum category encoding is not smaller than 2 ^ number of categories'
        # Datapoints must be larger than 0 (non-empty receipts)
        assert min(original_receipts) > 0, 'Encodings should be bigger than 0'

        # HIERARCHY VERIFICATION #
        # after the first index
        assert sum(hierarchy_vector_prime[1:]) == 2 ** num_cat - 1, \
            'hierarchy should contain all categories at least once (i.e. its sum should be 2 ** num_cat - 1)'
        assert all(value > 0 for value in hierarchy_vector_prime), \
            'hierarchy cannot contain empty sets (i.e. 0 valued integer representations)'
        # on the first index
        assert hierarchy_vector_prime[0] == (2 ** h - 1) << num_cat, \
            'the first value of the hierarchy vector needs to be' \
            ' (2 ** h - 1) << num_cat'
        assert hierarchy_names_prime[
            0] == ISC_WEIGHT, f'first super-category should be "{ISC_WEIGHT}"'

        # DATA IS OKAY, STORE IT
        self.__dataset = dataset
        self.hierarchy_vector = hierarchy_vector_prime
        self.__hierarchy_names = hierarchy_names_prime
        self.__num_cat = num_cat
        self.__h = h

        # OPTIMIZE : Precompute only useful z-values, not all
        # You are currently "precomputing" all values of z and z', but this might not be necessary, as some might not
        # have pairs of receipts a', b' such that z = s'(a') | s'(b').

        self.rp_values = None
        self.drp_values = None
        self.__zp_values = None
        self.__d_ci_x_z_matrix = None
        # presumably fills the caches initialised above - TODO confirm against prep()
        self.prep()
コード例 #9
0
    def finish_cluster_step(self, medoids, clusters, current_iteration,
                            duration):
        """
        Save the new medoids and clusters.

        Parameters
        ----------
        medoids: iterable of int
            New medoids
        clusters: iterable of DSX
            New clusters
        current_iteration: int
            The iteration number of this round
        duration: float
            The (process) time in seconds taken for the clustering.

        Returns
        -------
        finished: bool
            True if ``current_iteration`` equals ``self.n_max``, in which case ``self.cycle`` is set to -1 and
            ``self.terminate()`` is called. False otherwise.

        Raises
        ------
        AssertionError
            If this step is not the expected one, if the number of medoids or clusters is not ``self.k``, or if
            ``duration`` is not a float.

        Notes
        -----
        Saves the medoids and the size of the clusters. Each cluster is assigned to a
        'label'. The labels are initialized in :meth:`finish_initialization` . In subsequent calls, each of the clusters
        is assigned to a label as follows. For all
        combinations of clusters and labels, the distance between the medoids of the new cluster and the medoid of the
        cluster for that label is computed. For combination (cluster, label) that has the lowest distance, we assign the
        cluster to the label. We remove the cluster and label from the options and repeat this assigning another n-1
        times, for a total of n times, such that each cluster is assigned to a label.
        """

        assert self.expected == (current_iteration, False)
        self.expected = (current_iteration + 1, True)

        medoids = listified(medoids, np.uint64)
        assert len(medoids) == self.k

        clusters = listified(clusters, DSX)
        assert len(clusters) == self.k

        assert isinstance(duration, float)
        self.durations.loc[current_iteration, CLUSTERING_TIME] = duration

        medoid_clusters = dict(zip(medoids, clusters))

        weights = self.weights.loc[current_iteration].to_dict()

        # The weights do not change during the matching, so the metric is built once rather than for every
        # candidate pair. Assumes the returned metric can be called repeatedly - TODO confirm.
        metric = self.dsx.get_pure_distance_metric(weights)

        # distance function between new medoids and existing labels
        def d(e):
            new_medoid = int(e[0])
            old_medoid = int(self.medoids.loc[current_iteration - 1, e[1]])
            return metric(new_medoid, old_medoid)

        # Match new medoids with previous medoids
        unlabelled_medoids = set(medoids)
        available_labels = list(range(self.k))

        # Greedy matching: repeatedly take the closest remaining (medoid, label) pair
        while len(unlabelled_medoids) > 0:
            med, label = min(itertools.product(unlabelled_medoids,
                                               available_labels),
                             key=d)
            self.medoids.loc[current_iteration, label] = med
            self.cluster_sizes.loc[current_iteration,
                                   label] = medoid_clusters[med].size_r
            self.clusters.loc[current_iteration,
                              medoid_clusters[med].keys()] = int(label)
            unlabelled_medoids.remove(med)
            available_labels.remove(label)

        self.__cluster_print(weights=weights,
                             medoids=medoids,
                             clusters=clusters,
                             title='New clusters',
                             duration=duration)

        # Maximum number of iterations reached?
        if current_iteration == self.n_max:
            self.cycle = -1
            self.terminate()
            return True
        else:
            s = f'Iteration {current_iteration + 1}'
            self.msg_service.send_message(f'\n{s}\n{"-" * len(s)}')
            return False
コード例 #10
0
def create_clusters_and_assign_them_to_given_medoids(mbr, dataset_name=None):
    """
    Cluster a dataset starting from the fixed point of a MultipleBootstrapResult and map each fixed point medoid
    to the cluster it ends up in.

    Parameters
    ----------
    mbr: MultipleBootstrapResult, str or Path
        The result to take the fixed point from. A str or Path is passed to the MultipleBootstrapResult
        constructor.
    dataset_name: str or ActivationLog or None
        The dataset to cluster, as accepted by ``data_loader.generic_al_loader``. If None, the dataset from the
        settings of the mbr is used.

    Returns
    -------
    mapping: dict {int: DSX}
        For every medoid of the fixed point, the first cluster in the clustering that contains it. See Notes.

    Raises
    ------
    TypeError
        If mbr is not a str, Path or MultipleBootstrapResult.

    Notes
    -----
    The weights and medoids are taken from the fixed point of the mbr. If the dataset that is clustered differs
    from the dataset in the settings of the mbr, the medoids of the fixed point are added to the dataset prior to
    clustering, and are removed from the resulting clusters again afterwards. Because the removal is done on the
    cluster objects themselves, it is reflected in the returned mapping.

    Note that this is not necessarily a bijective mapping between the medoids and the clusters. If the dataset and
    medoids are not 'sufficiently similarly representing' the receipt space, multiple medoids may end up in a single
    cluster, and as a result some cluster may not get any medoid.

    """

    if isinstance(mbr, (str, Path)):
        mbr = MultipleBootstrapResult(mbr)
    elif not isinstance(mbr, MultipleBootstrapResult):
        raise TypeError(f'Type of mbr not accepted:{type(mbr)}')

    if dataset_name is None:
        dataset_name = mbr.settings.dataset
    al = data_loader.generic_al_loader(dataset_name)

    # Only a dataset other than the one of the mbr needs the fixed point medoids added to it
    add_fp_medoids = dataset_name != mbr.settings.dataset

    fp = mbr.the_fixed_point
    initial_medoids = Counter([int(k) for k in fp[1].values])

    optional_kwargs = {'additional_rp': initial_medoids} if add_fp_medoids else {}
    dsx = al2dsx(al, hierarchy=mbr.hierarchy, **optional_kwargs)

    w = dsx.validate_and_convert_weights(fp[0])

    # Cluster the data
    _, clustering = clustering_algorithm(dsx, w, initial_medoids=fp[1])

    clustering = listified(clustering, DSX)

    # Map the medoids to the clusters
    res = {}
    for im in initial_medoids:
        membership = [im in cluster for cluster in clustering]
        res[im] = clustering[membership.index(True)]

    if add_fp_medoids:
        # Remove receipts from the clusters
        # Note that, since DSX is mutable, the removing is reflected in res.
        for cluster in clustering:
            # Subtracting twice leaves, per medoid, the part of its count that is present in this cluster
            not_in_cluster = Counter(initial_medoids) - cluster.dataset
            receipts_to_be_removed = Counter(initial_medoids) - not_in_cluster
            cluster.remove(receipts_to_be_removed=receipts_to_be_removed)

    return res