def finish_initialization(self, weights, medoids, clusters, initialization_type):
    """
    Saves the results from the initialization.

    Parameters
    ----------
    weights: dict of str to Number, or iterable of Number
        The weights to be used. See
        :py:meth:`validate_and_convert_weights() <cjo.weighted_adapted_jaccard.distances.implementation.DSX.validate_and_convert_weights>`.
    medoids: iterable of int
        The medoids after initialization
    clusters: iterable of DSX
        The clusters after initialization
    initialization_type: str
        How the initialization was done. Should be one of

        - BootstrapResults.StartReceipts
        - BootstrapResults.StartDSX
        - BootstrapResults.StartWeights
        - BootstrapResults.StartDefault
    """
    assert self.expected[0] is None, 'Already initialized'
    self.expected = (1, True)

    # Validation and conversion
    weights = self.dsx.get_named_weight_vector(weights)
    assert len(medoids) == self.k
    medoids = listified(medoids, np.uint64)
    assert len(clusters) == self.k
    clusters = listified(clusters, DSX)

    # Store the weights, medoids, cluster sizes, and cluster assignments at iteration 0
    for k, v in weights.items():
        self.weights.loc[0, k] = v
    for lab, (med, cluster) in enumerate(zip(medoids, clusters)):
        self.medoids.loc[0, lab] = med
        self.cluster_sizes.loc[0, lab] = cluster.size_r
        self.clusters.loc[0, cluster.keys()] = lab

    # Print results
    self.msg_service.send_message(initialization_type)
    self.msg_service.send_message('*' * len(initialization_type))
    self.__weights_print(weights, 'Initial weights')
    self.__cluster_print(weights, medoids, clusters, 'Initial clusters')
    s = 'Iteration 1'
    self.msg_service.send_message(f'\n{s}\n{"-" * len(s)}')
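# Illustrative usage sketch; `result`, `initial_medoids`, `dsx_clusters`, and the
# weight dict are hypothetical names, not part of this module. After an
# initialization step produces k medoids and k DSX clusters, the bookkeeping is
# handed over in one call:
#
#     result.finish_initialization(
#         weights={'fresh': 1.0, 'frozen': 2.0},     # or an iterable of Numbers
#         medoids=initial_medoids,                   # k integer receipt encodings
#         clusters=dsx_clusters,                     # k DSX objects partitioning the data
#         initialization_type=BootstrapResults.StartDefault,
#     )
#
# A second call raises AssertionError ('Already initialized'), since
# self.expected[0] is then no longer None.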
def assert_unique(self, f, idx=None):
    """
    Assert that the given feature(s) have a unique value for each index in the DataFrame.

    Parameters
    ----------
    f: str or iterable of str
        The feature(s) to be asserted
    idx: str or None
        The column to treat as index. If None, the index of the DataFrame is used.

    Raises
    ------
    ValueError
        If some index value has more than one value (or combination of values) of f.
    """
    f = listified(f, str, validation=lambda x: x in self.df.columns)
    if idx is None:
        # On index. Compare the number of unique index values against the number of
        # unique (index, f) combinations; without drop_duplicates both lengths would
        # always equal len(self.df) and the check would never fire.
        len_vid = len(self.df.index.drop_duplicates())
        len_f = len(self.df.reset_index(drop=False)[[self.index.name] + f].drop_duplicates())
    else:
        # On column
        assert idx in self.df.columns, 'given idx not in df.columns'
        len_vid = len(self[idx].drop_duplicates())
        len_f = len(self[[idx] + f].drop_duplicates())
    if len_f != len_vid:
        if len(f) == 1:
            raise ValueError('Duplicate value of {}'.format(f[0]))
        else:
            raise ValueError('Duplicate combinations of {}'.format(', '.join(f)))
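# Minimal sketch of the intended contract (the Log subclass and column names are
# hypothetical). Each index value must map to exactly one value of the feature:
#
#     df = pd.DataFrame({'visit': ['a', 'a', 'b'],
#                        'store': ['s1', 's1', 's2']}).set_index('visit')
#     log = SomeLog(df)              # hypothetical Log subclass
#     log.assert_unique('store')     # passes: 'a' -> 's1', 'b' -> 's2'
#
# If visit 'a' instead had rows with stores 's1' and 's9', the unique
# (visit, store) count (3) would exceed the unique visit count (2), raising
# ValueError('Duplicate value of store').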
def subset_visit(self, invoices):
    """
    Get a subset of the log, for a given iterable of invoice ids.

    Parameters
    ----------
    invoices: str or iterable of str
        Invoice ids to get.

    Returns
    -------
    log: Log
        Log of the same type with only the given invoices. Missing invoices are ignored.
    """
    invoices = listified(invoices, str)
    # `invoice` refers to the invoice-id column name
    if invoice in self.columns:
        # Invoice id is a regular column; isin silently skips missing ids
        return type(self)(self.df[self.df[invoice].isin(invoices)])
    elif self.index.name == invoice:
        # Invoice id is the index; drop requested ids that are not present
        invoices = listified(invoices, str, filtering=lambda v: v in self.index)
        return type(self)(self.df.loc[invoices])
    else:
        raise TypeError(f'Cannot perform this operation on {type(self)}')
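# Illustrative usage (the log and ids are hypothetical). Subsetting is tolerant
# of ids that are absent from the log, so the result may contain fewer invoices
# than requested:
#
#     sub = log.subset_visit(['inv-001', 'inv-002', 'no-such-id'])
#     # sub contains at most 'inv-001' and 'inv-002'; 'no-such-id' is ignored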
def fixed_point_info(self):
    """All required information to describe the fixed point of this single bootstrap repetition"""
    final_medoids = listified(self.final_medoids, int, sort=True, validation=lambda x: x > 0)
    f = self.final_weights.sort_values(ascending=False)
    return pd.Series(data={
        **{f'{supercat}_{i}': v for i, v in enumerate(f.index)},
        **{f'w_{i}': v for i, v in enumerate(f.values)},
        **{f'm_{i}': v for i, v in enumerate(final_medoids)},
    })
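# Illustrative layout of the returned Series (all values hypothetical), assuming
# the module constant `supercat` equals 'sc' and k = 2 medoids:
#
#     sc_0    'fresh'       # super-category names, ordered by descending weight
#     sc_1    'frozen'
#     w_0     2.31          # the corresponding weights, descending
#     w_1     0.87
#     m_0     1832          # the final medoid encodings, sorted ascending
#     m_1     40961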
def ints2bitvectors(ints):
    """
    Converts multiple integers to same-length bit vectors.

    Parameters
    ----------
    ints: iterable of int
        The integers

    Returns
    -------
    bit_vectors: iterable of list of int
        The bit vector representation of each integer in ints, all with the same length
        (the bit length of the largest integer).
    """
    ints = listified(ints, int)
    bit_length = len(f'{max(ints):b}')
    return (int2bitvector(i, bit_length) for i in ints)
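# Worked example: max(5, 2) = 5 has bit length 3 ('101' in binary), so every
# integer is converted to a 3-bit vector via int2bitvector:
#
#     list(ints2bitvectors([5, 2]))  ->  [[1, 0, 1], [0, 1, 0]]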
def assert_partition(self, clustering):
    """
    Asserts that the given clustering is a valid partition over this DSX.

    Parameters
    ----------
    clustering: iterable of DSX
        Collection of DSX that is the partition

    Raises
    ------
    AssertionError
        If the given collection of DSX is not a valid partition over this DSX
    """
    clustering = listified(clustering, DSX)
    assert_valid_partition(set(self.keys()), [set(dsx_i.keys()) for dsx_i in clustering])
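# Sketch of the contract: the clusters' key sets must cover this DSX's keys
# exactly once. E.g. for a DSX with keys {1, 2, 3} (names hypothetical):
#
#     dsx.assert_partition([dsx_12, dsx_3])    # keys {1, 2} and {3}: passes
#     dsx.assert_partition([dsx_12, dsx_23])   # key 2 appears twice: AssertionError
#     dsx.assert_partition([dsx_12])           # key 3 missing: AssertionError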
def bitvector2int(bitvector):
    """
    Computes the integer representation of a bit vector. The inverse of int2bitvector.

    Parameters
    ----------
    bitvector: iterable of int
        The bit vector

    Returns
    -------
    i: int
        The integer representation of the bit vector

    Raises
    ------
    AssertionError
        If any of the values in bitvector is not an int in [0, 1]
    """
    bitvector = listified(bitvector, int, validation=lambda x: x in [0, 1])
    return int(''.join([str(i) for i in bitvector]), 2)
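# Worked example: [1, 0, 1] is read as the binary string '101', i.e. 5, and
# together with int2bitvector this forms a round trip:
#
#     bitvector2int([1, 0, 1])   ->  5
#     int2bitvector(5, 3)        ->  [1, 0, 1]
#     bitvector2int([1, 2])      ->  AssertionError (2 is not in [0, 1])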
def __init__(self, dataset, hierarchy, visit_ids_per_rp=None):
    r"""
    This class tracks the dataset, hierarchy, and num_cat, such that all assertions are done once.
    It is meant to represent :math:`X`, though we track the hierarchy in order to allow computations
    of :math:`s(r)`, and as such of :math:`X^2|_z`. The given input is based on :math:`r' \in R'`,
    as discussed in Subsection \ref{sec:compopt:ws2w}.

    Parameters
    ----------
    dataset : Counter{np.uint64}
        The frequency of each representation (the value of :math:`\mathcal{D'}`)
    hierarchy: Hierarchy
        The hierarchy used for this DSX
    visit_ids_per_rp: dict of int to list of str, or None
        The visit_ids per rp value

    Raises
    ------
    AssertionError
        If the largest original receipt in the dataset is not smaller than :math:`2^{|\mathcal{C}|}`.
        If the smallest original receipt in the dataset is not larger than 0.
        If the sum of hierarchy_vector_prime[1:] is not equal to :math:`2^{|\mathcal{C}|} - 1`, or if
        any of the values of hierarchy_vector_prime is 0 (i.e. if the hierarchy is not a partition
        over the categories).
        If the first value of hierarchy_vector_prime is not equal to
        :math:`(2^h - 1) \ll |\mathcal{C}|`.
        If the first value of hierarchy_names_prime is not equal to :math:`c_0`.
    """
    num_cat = hierarchy.num_cat
    h_vector = [bitops.subset2int(hierarchy[k], hierarchy.c) for k in hierarchy.sc]
    hierarchy_vector_prime = [(2 ** hierarchy.h - 1) << hierarchy.num_cat] + h_vector
    hierarchy_names_prime = [ISC_WEIGHT] + hierarchy.sc
    self.hierarchy = hierarchy

    # INPUT TYPE VERIFICATION #
    assert isinstance(dataset, Counter)
    assert all(isinstance(di, int) for di in dataset.keys())
    hierarchy_vector_prime = np.array(hierarchy_vector_prime, dtype=np.uint64)
    assert isinstance(hierarchy_names_prime, list)
    hierarchy_names_prime = listified(hierarchy_names_prime, str)
    assert isinstance(num_cat, int)
    if visit_ids_per_rp is None:
        self.visit_ids_per_rp = {rp: [] for rp in dataset.keys()}
    else:
        assert isinstance(visit_ids_per_rp, dict)
        self.visit_ids_per_rp = visit_ids_per_rp
    h = len(hierarchy_names_prime) - 1

    # DATASET VERIFICATION #
    original_receipts = {r_prime & (2 ** num_cat - 1) for r_prime in dataset.keys()}
    # All original datapoints must be a subset of C
    assert max(original_receipts) < 2 ** num_cat, \
        'maximum category encoding is not smaller than 2 ^ number of categories'
    # Datapoints must be larger than 0 (non-empty receipts)
    assert min(original_receipts) > 0, 'Encodings should be bigger than 0'

    # HIERARCHY VERIFICATION #
    # after the first index
    assert sum(hierarchy_vector_prime[1:]) == 2 ** num_cat - 1, \
        'hierarchy should contain all categories at least once (i.e. its sum should be 2 ** num_cat - 1)'
    assert all(hv > 0 for hv in hierarchy_vector_prime), \
        'hierarchy cannot contain empty sets (i.e. 0-valued integer representations)'
    # on the first index
    assert hierarchy_vector_prime[0] == (2 ** h - 1) << num_cat, \
        'the first value of the hierarchy vector needs to be (2 ** h - 1) << num_cat'
    assert hierarchy_names_prime[0] == ISC_WEIGHT, f'first super-category should be "{ISC_WEIGHT}"'

    # DATA IS OKAY, STORE IT
    self.__dataset = dataset
    self.hierarchy_vector = hierarchy_vector_prime
    self.__hierarchy_names = hierarchy_names_prime
    self.__num_cat = num_cat
    self.__h = h

    # OPTIMIZE: precompute only useful z-values, not all. Currently all values of z and z' are
    # "precomputed", but some might not have pairs of receipts a', b' such that z = s'(a') | s'(b').
    self.rp_values = None
    self.drp_values = None
    self.__zp_values = None
    self.__d_ci_x_z_matrix = None
    self.prep()
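# Encoding sketch (toy numbers, not from the package): with |C| = 4 categories
# and h = 2 super-categories {c1, c2} and {c3, c4}, the super-category bits are
# packed above the category bits:
#
#     h_vector                = [0b0011, 0b1100]               # sc memberships over C
#     hierarchy_vector_prime  = [(2**2 - 1) << 4] + h_vector   # [0b110000, 0b0011, 0b1100]
#     hierarchy_names_prime   = [ISC_WEIGHT, 'sc1', 'sc2']
#
# sum(h_vector) = 0b1111 = 2**4 - 1, so the partition assertions above pass.
# A receipt containing c1 and c4 has original encoding 0b1001, which satisfies
# 0 < 0b1001 < 2**4.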
def finish_cluster_step(self, medoids, clusters, current_iteration, duration):
    """
    Save the new medoids and clusters.

    Parameters
    ----------
    medoids: iterable of int
        New medoids
    clusters: iterable of DSX
        New clusters
    current_iteration: int
        The iteration number of this round
    duration: float
        The (process) time in seconds taken for the clustering.

    Returns
    -------
    bool
        True if the maximum number of iterations was reached and the bootstrap was terminated,
        False otherwise.

    Notes
    -----
    Saves the medoids and the size of the clusters. Each cluster is assigned to a 'label'. The
    labels are initialized in :meth:`finish_initialization`. In subsequent calls, clusters are
    matched to labels greedily: for every combination of a new cluster and a label, the distance
    between the new cluster's medoid and the previous medoid of that label is computed. The
    combination (cluster, label) with the lowest distance is assigned first, both are removed
    from the remaining options, and this is repeated until each of the k clusters has a label.
    """
    assert self.expected == (current_iteration, False)
    self.expected = (current_iteration + 1, True)
    medoids = listified(medoids, np.uint64)
    assert len(medoids) == self.k
    clusters = listified(clusters, DSX)
    assert len(clusters) == self.k
    assert isinstance(duration, float)
    self.durations.loc[current_iteration, CLUSTERING_TIME] = duration
    medoid_clusters = dict(zip(medoids, clusters))
    weights = self.weights.loc[current_iteration].to_dict()

    # Distance between a new medoid e[0] and the previous medoid of label e[1]
    def d(e):
        new_medoid = int(e[0])
        old_medoid = int(self.medoids.loc[current_iteration - 1, e[1]])
        return self.dsx.get_pure_distance_metric(weights)(new_medoid, old_medoid)

    # Greedily match new medoids with previous labels
    unlabelled_medoids = set(medoids)
    available_labels = list(range(self.k))
    while len(unlabelled_medoids) > 0:
        med, label = min(itertools.product(unlabelled_medoids, available_labels), key=d)
        self.medoids.loc[current_iteration, label] = med
        self.cluster_sizes.loc[current_iteration, label] = medoid_clusters[med].size_r
        self.clusters.loc[current_iteration, medoid_clusters[med].keys()] = int(label)
        unlabelled_medoids.remove(med)
        available_labels.remove(label)

    self.__cluster_print(weights=weights, medoids=medoids, clusters=clusters,
                         title='New clusters', duration=duration)

    # Maximum number of iterations reached?
    if current_iteration == self.n_max:
        self.cycle = -1
        self.terminate()
        return True
    else:
        s = f'Iteration {current_iteration + 1}'
        self.msg_service.send_message(f'\n{s}\n{"-" * len(s)}')
        return False
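# Sketch of the greedy label matching above on a toy example (all numbers
# hypothetical). With previous medoids {label 0: 10, label 1: 20} and new
# medoids {12, 19}, suppose the weighted distances are:
#
#     d(12, label 0) = 2,   d(12, label 1) = 8
#     d(19, label 0) = 9,   d(19, label 1) = 1
#
# The global minimum is (19, label 1), so 19 takes label 1; both are removed
# from the options, and 12 then takes label 0. Ties are broken by min()'s
# iteration order over itertools.product.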
def create_clusters_and_assign_them_to_given_medoids(mbr, dataset_name=None):
    """
    Computes clusters given the fixed point of the mbr.

    Parameters
    ----------
    mbr: MultipleBootstrapResult, str, or Path
        Information of the result. If str or Path, the MultipleBootstrapResult is loaded from
        this location.
    dataset_name: str or ActivationLog or None
        Name of the dataset to cluster. If None, the dataset from the mbr is used, which also
        means the fixed-point medoids are not added to the dataset.

    Returns
    -------
    mapping: dict {str: DSX}
        The mapping from medoids to DSX. See Notes.

    Notes
    -----
    The medoids are taken from the fixed point of the mbr. If dataset_name is given and differs
    from mbr.settings.dataset, the medoids of the fixed point of the mbr are added to the dataset
    prior to clustering, and are removed from the resulting clusters afterwards. The return value
    maps each medoid to the DSX object that it ends up in after the clustering. Note that this is
    not necessarily a bijective mapping between the medoids and the clusters: if the dataset and
    the medoids are not sufficiently similar representations of the receipt space, multiple
    medoids may end up in a single cluster, and as a result some clusters may not get any medoid.
    """
    if isinstance(mbr, (str, Path)):
        mbr = MultipleBootstrapResult(mbr)
    elif not isinstance(mbr, MultipleBootstrapResult):
        raise TypeError(f'Type of mbr not accepted: {type(mbr)}')

    if dataset_name is None:
        dataset_name = mbr.settings.dataset
    al = data_loader.generic_al_loader(dataset_name)
    add_fp_medoids = dataset_name != mbr.settings.dataset

    fp = mbr.the_fixed_point
    initial_medoids = Counter([int(k) for k in fp[1].values])
    if add_fp_medoids:
        dsx = al2dsx(al, hierarchy=mbr.hierarchy, additional_rp=initial_medoids)
    else:
        dsx = al2dsx(al, hierarchy=mbr.hierarchy)
    w = dsx.validate_and_convert_weights(fp[0])

    # Cluster the data
    _, clustering = clustering_algorithm(dsx, w, initial_medoids=fp[1])
    clustering = listified(clustering, DSX)

    # Map each medoid to the (first) cluster that contains it
    res = {im: clustering[[im in c for c in clustering].index(True)] for im in initial_medoids}

    if add_fp_medoids:
        # Remove the added medoid receipts from the clusters.
        # Note that, since DSX is mutable, the removal is reflected in res.
        for c in clustering:
            # The medoids that ended up in this cluster
            receipts_to_be_removed = Counter(initial_medoids) - (Counter(initial_medoids) - c.dataset)
            c.remove(receipts_to_be_removed=receipts_to_be_removed)
    return res
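# Hedged usage sketch (the path and dataset name are hypothetical):
#
#     mapping = create_clusters_and_assign_them_to_given_medoids(
#         'results/run_042',            # loaded as a MultipleBootstrapResult
#         dataset_name='holdout_2019',  # differs from mbr.settings.dataset, so the
#     )                                 # fixed-point medoids are injected, then removed
#     for medoid, cluster in mapping.items():
#         print(medoid, cluster.size_r)
#
# Two medoids may map to the same DSX object if they land in a single cluster.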