def _check_redundancy(self, feature_pair):
    """
    Perform pairwise mutual information analysis to identify feature redundancy.

    Parameters
    ----------
    feature_pair : tuple
        2-tuple containing the features to compare.

    Returns
    -------
    tuple
        Tuple of three objects: (redundancy, feature_pair, nmi_value).
    """
    feature1, feature2 = feature_pair

    # calculate the normalized mutual information
    nmi = self._estimate_nmi((feature1, feature2))
    logger.debug("| nmi({},{}) = {}".format(feature1, feature2, nmi))

    # prune if nmi is above threshold
    if nmi > self.nmi_threshold:
        logger.debug("Feature #{} is redundant w/ #{}".format(feature1, feature2))
        return True, feature_pair, nmi

    # feature is not redundant
    return False, feature_pair, nmi
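# Illustrative sketch (not part of the original analyzer): the redundancy test above
# reduces to thresholding a normalized mutual information score between two features.
# sklearn's normalized_mutual_info_score stands in for the internal _estimate_nmi();
# the feature vectors and the 0.9 threshold below are hypothetical.
def _example_redundancy_check():
    import numpy as np
    from sklearn.metrics import normalized_mutual_info_score

    rng = np.random.default_rng(0)
    feat_a = rng.integers(0, 10, size=1000)           # discretized feature values
    feat_b = feat_a + rng.integers(0, 2, size=1000)   # nearly identical feature
    nmi = normalized_mutual_info_score(feat_a, feat_b)
    return nmi > 0.9                                  # True when the pair is redundant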
def read_data(q):
    logger.debug(q)
    es_query = {"query": q["query"]}
    logger.info({"es_query": es_query})
    hits = esd.read_all_data(es_query, cfg['elk']['index'],
                             cfg['elk']['type'], 5000)
    logger.info({"first hit": hits[0]})
    return hits
def information_leakage(self, clusters, sample_size=5000, joint_leakage=True):
    """
    Evaluate the information leakage for feature(s).

    Computes marginal KDEs for features given a site using AKDEs. Conditional
    entropy is then estimated from the distributions via Monte Carlo integration.
    The conditional entropy is then used to compute the leakage for the feature(s).

    Parameters
    ----------
    clusters : list
        A list of lists. ``clusters`` is a list of clusters, where each cluster
        is a list containing the features in that cluster. A singular feature or
        cluster may be given as the parameter. In those instances, the data will
        be wrapped in additional lists to match the expected form.
    sample_size : int
        Count of total random feature samples to use for Monte Carlo estimation.
    joint_leakage : bool
        Determines if the leakage of clusters should be measured jointly or
        individually. If True, the probability of samples for each cluster will
        be multiplied together before estimating entropy. Otherwise, the leakage
        for each cluster is measured.

    Returns
    -------
    list
        Estimated information leakage for the features/clusters.
        If ``joint_leakage`` is True, the list contains the leakage for the
        combined analysis. Otherwise, the list contains the leakages for each
        cluster, appearing in the same order as seen in ``clusters``.
    """
    # convert one feature to singular list for compatibility
    if not isinstance(clusters, Iterable):
        clusters = [clusters]
    if not isinstance(clusters[0], Iterable):
        clusters = [clusters]
    self.sample_size = sample_size

    logger.debug("Measuring leakage for {}".format(clusters))

    # Shannon Entropy func: -p(x)*log2(p(x))
    h = lambda x: -x * math.log(x, 2)

    # H(C) -- compute website entropy,
    # this represents the maximum number of bits which can be leaked
    H_C = sum([h(prior) for prior in self.website_priors if prior > 0])

    # map clusters to probability predictions for random samples
    # allows for KDE construction, sampling, and prediction to be done in parallel (if enabled)
    if self._pool is None:
        results = map(self._do_predictions, clusters)
    else:
        results = self._pool.imap(self._do_predictions, clusters)
        self._pool.close()

    # load the results as they are produced and log progress
    cluster_probs = []
    for probs in results:
        cluster_probs.append(probs)

        # print progress updates
        count = len(cluster_probs)
        if (count - 1) % max(1, int(len(clusters) * 0.05)) == 0:
            logger.info("Progress: {}/{}".format(count, len(clusters)))

    # restart pool if multiprocessing
    if self._pool is not None:
        self._pool.join()
        self._pool.restart()

    if joint_leakage:
        # multiply cluster probs to get joint probs for each sample
        # clusters are assumed to be independent from one another
        # in this way, the joint probability of all the variables is their product
        cluster_probs = np.array(cluster_probs)
        prob_sets = [np.prod(cluster_probs, axis=0)]  # shape (1, n_sites, n_samples)
    else:
        # measure leakages for each cluster independently
        prob_sets = cluster_probs  # shape (n_clusters, n_sites, n_samples)

    # compute information leakage for each cluster (or combined cluster if joint)
    leakages = []
    for i, prob_set in enumerate(prob_sets):

        # weight the probability predictions by the website priors
        # in the closed-world scenario, all are equally weighted
        probs_weighted = []
        for site, probs in enumerate(prob_set):
            probs_weighted.append(probs * self.website_priors[site])
        probs_weighted = np.array(probs_weighted)

        # transpose array so that the first index represents samples
        # and the second index represents sites
        probs_weighted = np.transpose(probs_weighted)

        # normalize probabilities such that the per-site probs for each sample sum to one
        # (as should be expected for conditional probabilities)
        probs_norm = []
        for probs in probs_weighted:
            norm = probs / sum(probs) if sum(probs) > 0 else probs
            probs_norm.append(norm)

        # compute entropy for each sample
        entropies = []
        for probs in probs_norm:
            entropies.append(sum([h(prob) for prob in probs if prob > 0]))

        # H(C|f) -- estimate real entropy as the average over all samples
        H_CF = sum(entropies) / len(entropies)

        # I(C;f) = H(C) - H(C|f) -- compute information leakage
        leakage = H_C - H_CF
        leakages.append(leakage)

        # debug output
        logger.debug("{cluster} {l} = {c} - {cf}"
                     .format(cluster=clusters[i], l=leakage, c=H_C, cf=H_CF))

    return leakages
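# Hedged sketch of the leakage arithmetic used above: I(C;F) = H(C) - H(C|F), with
# H(C|F) estimated as the average entropy of the per-sample posterior distributions.
# The probability rows below are hypothetical; in information_leakage() they come
# from the prior-weighted, normalized AKDE predictions.
def _example_leakage_estimate():
    import math
    import numpy as np

    h = lambda p: -p * math.log(p, 2)
    priors = [0.25, 0.25, 0.25, 0.25]             # uniform closed-world website priors
    H_C = sum(h(p) for p in priors if p > 0)      # maximum leakable bits (here 2.0)

    # rows = Monte Carlo samples, columns = sites; each row sums to one
    probs = np.array([[0.7, 0.1, 0.1, 0.1],
                      [0.4, 0.4, 0.1, 0.1]])
    entropies = [sum(h(p) for p in row if p > 0) for row in probs]
    H_CF = sum(entropies) / len(entropies)        # H(C|F) averaged over samples
    return H_C - H_CF                             # estimated leakage I(C;F) in bits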
def update(self, doc, index, doc_id, doc_type='list'):
    logger.debug({"about to update": str(doc)})
    res = self.es.update(index, id=doc_id, doc_type=doc_type, body=doc)
    logger.info({"updated. results": str(res)})
    return res
def write(self, doc, index, doc_id, doc_type='list'):
    logger.debug({"about to write": str(doc)})
    res = self.es.index(index, id=doc_id, doc_type=doc_type, body=doc)
    logger.info({"wrote. results": str(res)})
    return res
def cluster(self, features, checkpoint=None, min_samples=1, min_cluster_size=3):
    """
    Find clusters in the provided features.

    Uses the HDBSCAN algorithm to cluster the topN features based upon their
    pairwise mutual information. An NxN matrix is first filled with NMI
    feature-pair values; NMI values may be retrieved from the MIAnalyzer's
    internal cache or computed anew. The HDBSCAN model is then fit to this
    distance matrix, and the identified clusters are returned.

    Parameters
    ----------
    features : list
        A list of features to cluster.
    checkpoint : str
        Path to plaintext file to store feature redundancy checkpoint information.
        Do not perform checkpointing if None is used.
    min_samples : int
        The min_samples parameter to use for the HDBSCAN algorithm.
        The number of samples in a neighbourhood for a point to be considered
        a core point.
    min_cluster_size : int
        The min_cluster_size parameter to use for the HDBSCAN algorithm.
        The minimum size of clusters; single linkage splits that contain fewer
        points than this will be considered points “falling out” of a cluster
        rather than a cluster splitting into two new clusters.

    Returns
    -------
    tuple
        2-tuple of (clusters, X). ``clusters`` is a list of nested lists where
        each inner list contains a cluster's features; features that do not fall
        into a cluster are given their own singleton cluster. ``X`` is the
        pairwise distance matrix used for clustering.
    """
    # compute pairwise MI for all topN features
    X = np.empty(shape=(len(features), len(features)), dtype=float)  # distance matrix
    pairs = list(combinations_with_replacement(features, 2))  # all possible combinations

    # if checkpointing, read NMI calculations and save to cache
    if checkpoint is not None:
        if os.path.exists(checkpoint):
            chk_fi = open(checkpoint, 'r+')
            for line in chk_fi:
                try:
                    if line[0] == '=':
                        a, b, c = line[1:].split(',')
                        self._nmi_cache.append(((int(a), int(b)), float(c)))
                except Exception:
                    pass
            chk_fi.close()
        # re-open checkpoint for appending
        chk_fi = open(checkpoint, 'a+')

    if self._nmi_cache:
        # ignore unselected features in cache
        cache = [(pair, nmi) for pair, nmi in self._nmi_cache
                 if pair[0] in features and pair[1] in features]

        # add each cached nmi to the distance matrix
        for cached_pair, nmi in cache:
            # remove cached_pair from pairs
            pairs = list(filter(
                lambda pair: (pair[0] != cached_pair[0] and pair[1] != cached_pair[1])
                and (pair[0] != cached_pair[1] and pair[1] != cached_pair[0]),
                pairs))

            # add cached nmi to matrix
            i, j = features.index(cached_pair[0]), features.index(cached_pair[1])
            X[i][j] = 1 - nmi
            X[j][i] = 1 - nmi

    if len(pairs) > 0:
        # map pairs to nmi
        if self._pool is None:
            results = map(self._estimate_nmi, pairs)
        else:
            results = self._pool.imap(self._estimate_nmi, pairs)
            self._pool.close()

        # fill matrix with pair nmi values
        count = 0
        for pair, nmi in zip(pairs, results):

            # print progress updates
            count += 1
            if (count - 1) % max(1, int(len(pairs) * 0.05)) == 0:
                logger.info("Progress: {}/{}".format(count, len(pairs)))

            fidx1, fidx2 = pair
            i, j = features.index(fidx1), features.index(fidx2)
            X[i][j] = 1 - nmi
            X[j][i] = 1 - nmi
            if checkpoint is not None:
                chk_fi.write('={},{},{}\n'.format(fidx1, fidx2, nmi))
                chk_fi.flush()

        # restart pool if multiprocessing
        if self._pool is not None:
            self._pool.join()
            self._pool.restart()

    # verify that all values are filled
    assert not np.any(np.isnan(X))

    # use HDBSCAN to cluster our data
    labels = HDBSCAN(metric='precomputed',
                     min_samples=min_samples,
                     min_cluster_size=min_cluster_size).fit_predict(X)
    logger.debug("Found {} clusters.".format(set(labels)))

    # organize the topN features into sub-lists where
    # each sub-list contains all features in a cluster
    clusters = []
    for label in range(min(labels), max(labels) + 1):
        if label >= 0:
            cluster = [features[i] for i, la in enumerate(labels) if la == label]
            clusters.append(cluster)
        else:
            # treat features that do not cluster (ie. noise)
            # each as their own independent cluster
            noise = [[features[i]] for i, la in enumerate(labels) if la == label]
            clusters.extend(noise)
    logger.debug("Clusters: {}".format(labels))
    return clusters, X
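# Hedged usage sketch for the clustering step above: HDBSCAN is fit on a precomputed
# distance matrix where distance = 1 - NMI, so strongly related features sit close
# together. The import assumes the hdbscan package; the 4x4 matrix and parameter
# values below are hypothetical, whereas cluster() fills its matrix from cached or
# freshly computed feature-pair NMI values.
def _example_hdbscan_on_distances():
    import numpy as np
    from hdbscan import HDBSCAN

    X = np.array([[0.0, 0.1, 0.9, 0.9],
                  [0.1, 0.0, 0.9, 0.9],
                  [0.9, 0.9, 0.0, 0.1],
                  [0.9, 0.9, 0.1, 0.0]])
    labels = HDBSCAN(metric='precomputed',
                     min_samples=1,
                     min_cluster_size=2).fit_predict(X)
    return labels  # label -1 marks noise, i.e. features kept as singleton clusters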
def prune(self, features, checkpoint=None, nmi_threshold=0.9, topn=100):
    """
    Reduce the feature-set to a list of top features which are non-redundant.

    Redundancy is identified by estimating the pair-wise mutual information of
    features. The algorithm will find up to a maximum of ``topn`` non-redundant
    features before ending. If the MIAnalyzer was instantiated with a ``pool``,
    NMI calculations will be performed in parallel.

    Parameters
    ----------
    features : list
        Array of features from which to prune redundant features. Features should
        be pre-sorted by importance with the most important feature at index 0.
    checkpoint : str
        Path to plaintext file to store feature redundancy checkpoint information.
        Do not perform checkpointing if None is used.
    nmi_threshold : float
        Threshold value used to identify redundant features. Features with NMI
        values greater than the threshold value are pruned.
    topn : int
        Number of features to keep when pruning is performed.

    Returns
    -------
    tuple
        2-tuple of (cleaned_features, pruned_features). The cleaned list has a
        variable length of up to ``topn`` non-redundant features; the pruned list
        contains the features identified as redundant.
    """
    # results of NMI calculations are saved in a list internal to the analyzer
    # this reduces the amount of computation required in any subsequent cluster calls
    self._nmi_cache, self._mi_cache = [], dict()
    self.nmi_threshold = nmi_threshold

    # feature lists
    cleaned_features = set()  # non-redundant
    pruned_features = set()   # redundant

    # if checkpointing, open file and read any previously processed features
    if checkpoint is not None:
        if os.path.exists(checkpoint):
            checkpoint_fi = open(checkpoint, 'r+')
            for line in checkpoint_fi:
                try:
                    if line[0] == '+':
                        feature = int(line[1:].strip())
                        cleaned_features.add(feature)
                    elif line[0] == '-':
                        feature = int(line[1:].strip())
                        pruned_features.add(feature)
                    if line[0] == '=':
                        a, b, c = line[1:].split(',')
                        self._nmi_cache.append(((int(a), int(b)), float(c)))
                except Exception:
                    pass
            features = list(filter(
                lambda f: f not in cleaned_features and f not in pruned_features,
                features))
            checkpoint_fi.close()
        # re-open checkpoint for appending
        checkpoint = open(checkpoint, 'a+')

    # continue to process features until either there are no features left to process
    # or the topN features have been selected
    while features and len(cleaned_features) < topn:

        # the next most important feature
        current_feature = features.pop(0)
        logger.debug("MI analysis on feature #{}".format(current_feature))

        # for all top features, measure pair-wise mutual information to check for redundancy
        feature_pairs = zip(repeat(current_feature), cleaned_features)
        if self._pool is None or len(cleaned_features) < 2:
            results = map(self._check_redundancy, feature_pairs)
        else:
            # parallel, unordered
            results = self._pool.uimap(self._check_redundancy, feature_pairs)

        # break upon first occurrence of redundancy
        is_redundant = False
        for res in results:

            # unzip results
            is_redundant, feature_pair, nmi = res

            # save feature pair with nmi in cache
            self._nmi_cache.append((feature_pair, nmi))
            if checkpoint is not None:
                checkpoint.write('={},{},{}\n'.format(
                    feature_pair[0], feature_pair[1], nmi))
                checkpoint.flush()

            # break loop
            if is_redundant:
                # if the analyzer is using a process pool,
                # terminate processes and restart the pool
                if self._pool is not None:
                    self._pool.terminate()
                    self._pool.join()
                    self._pool.restart()
                break

        # if the current feature does not appear to be redundant with any
        # other top features, add current feature to top features list
        if not is_redundant:
            cleaned_features.add(current_feature)
            logger.info("Progress: {}/{}".format(len(cleaned_features),
                                                 min(topn, len(features))))
            if checkpoint is not None:
                checkpoint.write('+{}\n'.format(current_feature))
                checkpoint.flush()
        else:
            pruned_features.add(current_feature)
            if checkpoint is not None:
                checkpoint.write('-{}\n'.format(current_feature))
                checkpoint.flush()

    if checkpoint is not None:
        checkpoint.close()

    # return both non-redundant and redundant features
    # which feature was redundant with which is, however, not saved
    return list(cleaned_features), list(pruned_features)
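# Hedged sketch of the greedy strategy implemented by prune() above: walk the
# features in order of importance and keep one only if its NMI against every
# already-kept feature stays at or below the threshold. nmi_fn is a stand-in for
# the analyzer's _estimate_nmi(); the default values mirror prune()'s defaults.
def _example_greedy_prune(features, nmi_fn, nmi_threshold=0.9, topn=100):
    kept, pruned = [], []
    for feature in features:                  # features pre-sorted by importance
        if len(kept) >= topn:
            break
        if any(nmi_fn((feature, k)) > nmi_threshold for k in kept):
            pruned.append(feature)            # redundant with some kept feature
        else:
            kept.append(feature)
    return kept, pruned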
def main(features_path, output_path, n_procs=0, n_samples=5000, topn=100,
         nmi_threshold=0.9, discrete_threshold=100000):
    """
    Run the full information leakage analysis on a processed dataset.

    Parameters
    ----------
    features_path : str
        Operating system file path to the directory containing processed feature files.
    output_path : str
        Operating system file path to the directory where analysis results should be saved.
    n_procs : int
        Number of processes to use for parallelism.
        If 0 is used, auto-detect based on the number of system CPUs.
    n_samples : int
        Number of samples to use for Monte Carlo estimation when running the
        fingerprint modeler.
    topn : int
        Top number of features to analyze during joint analysis.
    nmi_threshold : float
        Cut-off value for determining redundant features.
        Should be a value between 0 and 1.
    discrete_threshold : int
        Value passed to the fingerprint modeler's ``discrete_threshold`` parameter.

    Returns
    -------
    float
        Combined feature leakage (in bits).
    """
    # prepare feature dataset
    logger.info("Loading dataset.")
    feature_data = WebsiteData(features_path)
    logger.info("Loaded {} sites.".format(len(feature_data.sites)))
    logger.info("Loaded {} instances.".format(len(feature_data)))

    # create process pool
    if n_procs > 1:
        pool = Pool(n_procs)
    elif n_procs == 0:
        pool = Pool(cpu_count())
    else:
        pool = None

    # directory to save results
    outdir = output_path
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # initialize fingerprint modeler
    modeler = WebsiteFingerprintModeler(feature_data,
                                        discrete_threshold=discrete_threshold)

    # load previous leakage measurements if possible
    indiv_path = os.path.join(outdir, 'indiv.pkl')
    if os.path.exists(indiv_path):
        with open(indiv_path, "rb") as fi:
            logger.info("Loading individual leakage measures from file.")
            leakage_indiv = dill.load(fi)

    # otherwise do individual measure
    else:
        logger.info("Begin individual feature analysis.")

        # perform individual measure with checkpointing
        chk_path = os.path.join(outdir, 'indiv_checkpoint.txt')
        leakage_indiv = _individual_measure(modeler, pool, chk_path)

        # save individual leakage to file
        logger.info("Saving individual leakage to {}.".format(indiv_path))
        with open(indiv_path, "wb") as fi:
            dill.dump(leakage_indiv, fi)

    # perform combined information leakage measurements
    # initialize MI analyzer
    analyzer = MutualInformationAnalyzer(feature_data, pool=pool)

    # sort the list of features by their individual leakage
    # we will process these features in the order of their importance during MI analysis
    logger.info("Sorting features by individual leakage.")
    tuples = list(zip(feature_data.features, leakage_indiv))
    tuples = sorted(tuples, key=lambda x: (-x[1], x[0]))
    logger.debug("Top 20:\t {}".format(tuples[:20]))
    sorted_features = list(list(zip(*tuples))[0])

    # process into list of non-redundant features
    cln_path = os.path.join(outdir, 'cleaned.pkl')
    rdn_path = os.path.join(outdir, 'redundant.pkl')
    chk_path = os.path.join(outdir, 'prune_checkpoint.txt')
    if os.path.exists(cln_path):
        logger.info("Loading top non-redundant features from file.")
        with open(cln_path, 'rb') as fi:
            cleaned = dill.load(fi)
    else:
        logger.info("Begin feature pruning.")
        cleaned, pruned = analyzer.prune(features=sorted_features,
                                         nmi_threshold=nmi_threshold,
                                         topn=topn,
                                         checkpoint=chk_path)
        with open(cln_path, 'wb') as fi:
            dill.dump(cleaned, fi)
        with open(rdn_path, 'wb') as fi:
            dill.dump(pruned, fi)

    # cluster non-redundant features
    dst_path = os.path.join(outdir, 'distance_matrix.pkl')
    cst_path = os.path.join(outdir, 'clusters.pkl')
    if os.path.exists(cst_path):
        logger.info("Loading clusters from file.")
        with open(cst_path, 'rb') as fi:
            clusters = dill.load(fi)
    else:
        logger.info("Begin feature clustering.")
        clusters, distance_matrix = analyzer.cluster(cleaned, checkpoint=chk_path)
        with open(dst_path, 'wb') as fi:
            dill.dump(distance_matrix, fi)
        with open(cst_path, 'wb') as fi:
            dill.dump(clusters, fi)

    # perform joint information leakage measurement
    logger.info('Identified {} clusters.'.format(len(clusters)))
    logger.info("Begin cluster leakage measurements.")
    modeler._pool = pool  # configure modeler to use the proc pool
    leakage_joint = modeler.information_leakage(clusters=clusters,
                                                sample_size=n_samples,
                                                joint_leakage=True)[0]
    logger.info("Final leakage results: {} bits".format(leakage_joint))
    logger.info("Finished execution.")
    return leakage_joint