Example #1
    def _check_redundancy(self, feature_pair):
        """
        Perform pairwise mutual information analysis to identify feature redundancy.

        Parameters
        ----------
        feature_pair : tuple
            2-tuple containing features to compare

        Returns
        -------
        tuple
            Tuple of three objects: (is_redundant, feature_pair, nmi_value)
        """
        feature1, feature2 = feature_pair

        # calculate the normalized mutual information
        nmi = self._estimate_nmi((feature1, feature2))
        logger.debug("| nmi({},{}) = {}".format(feature1, feature2, nmi))

        # prune if nmi is above threshold
        if nmi > self.nmi_threshold:
            logger.debug("Feature #{} is redundant w/ #{}".format(
                feature1, feature2))
            return True, feature_pair, nmi
        # feature is not redundant
        return False, feature_pair, nmi
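
A minimal standalone sketch of the same thresholding idea, using scikit-learn's normalized_mutual_info_score as a stand-in for the analyzer's internal _estimate_nmi; the toy feature vectors, the helper name check_redundancy, and the 0.9 threshold are illustrative assumptions.

# Sketch only: NMI-based redundancy check with scikit-learn as the estimator.
import numpy as np
from sklearn.metrics import normalized_mutual_info_score

def check_redundancy(values1, values2, nmi_threshold=0.9):
    """Return (is_redundant, nmi) for two discretized feature columns."""
    nmi = normalized_mutual_info_score(values1, values2)
    return nmi > nmi_threshold, nmi

# toy data: a categorical feature and a slightly noisy copy of it
rng = np.random.default_rng(0)
f1 = rng.integers(0, 5, size=1000)
f2 = np.where(rng.random(1000) < 0.99, f1, rng.integers(0, 5, size=1000))

is_redundant, nmi = check_redundancy(f1, f2)
print(is_redundant, round(nmi, 3))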
Example #2
def read_data(q):
    logger.debug(q)

    es_query = {"query": q["query"]}
    logger.info({"es_query": es_query})

    hits = esd.read_all_data(es_query, cfg['elk']['index'], cfg['elk']['type'],
                             5000)

    logger.info({"first hit": hits[0]})
    return hits
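
A hypothetical call to read_data, assuming the module-level esd client and cfg settings used above are already configured; the match-all query body is only a placeholder.

# Illustrative only: the query shape mirrors the es_query construction above.
sample_query = {"query": {"match_all": {}}}
hits = read_data(sample_query)
print("retrieved {} hits".format(len(hits)))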
Example #3
    def information_leakage(self, clusters, sample_size=5000, joint_leakage=True):
        """
        Evaluate the information leakage for feature(s).

        Computes marginal KDEs for features given each site using AKDEs.
        Conditional entropy is then estimated from the distributions via Monte Carlo integration,
        and the conditional entropy is used to compute the leakage for the feature(s).

        Parameters
        ----------
        clusters : list
            A list of lists: the outer list contains clusters,
            and each cluster is a list of the features in that cluster.
            A single feature or a single cluster may also be given as the parameter.
            In those instances, the data will be wrapped in additional lists to match the expected form.
        sample_size : int
            Number of random feature samples to use for Monte Carlo estimation.
        joint_leakage : bool
            Determines if the leakage of clusters should be measured jointly or individually.
            If True, the probability of samples for each cluster will be multiplied together before estimating entropy.
            Otherwise, the leakage for each cluster is measured.

        Returns
        -------
        list
            Estimated information leakage for the features/clusters.
            If ``joint_leakage`` is True, the list contains the leakage for the combined analysis.
            Otherwise, the list contains the leakages for each cluster,
            appearing in the same order as seen in ``clusters``.

        """
        # convert one feature to singular list for comparability
        if not isinstance(clusters, Iterable):
            clusters = [clusters]
        if not isinstance(clusters[0], Iterable):
            clusters = [clusters]

        self.sample_size = sample_size
        logger.debug("Measuring leakage for {}".format(clusters))

        # Shannon Entropy func: -p(x)*log2(p(x))
        h = lambda x: -x * math.log(x, 2)

        # H(C) -- compute website entropy, this represents the maximum number of bits which can be leaked
        H_C = sum([h(prior) for prior in self.website_priors if prior > 0])

        # map clusters to probability predictions for random samples
        # allows for KDE construction, sampling, and prediction to be done in parallel (if enabled)
        if self._pool is None:
            results = map(self._do_predictions, clusters)
        else:
            results = self._pool.imap(self._do_predictions, clusters)
            self._pool.close()

        # load the results as they are produced and log progress
        cluster_probs = []
        for probs in results:
            cluster_probs.append(probs)
            # print progress updates
            count = len(cluster_probs)
            if (count - 1) % max(1, int(len(clusters) * 0.05)) == 0:
                logger.info("Progress: {}/{}".format(count, len(clusters)))

        # restart pool if multiprocessing
        if self._pool is not None:
            self._pool.join()
            self._pool.restart()

        if joint_leakage:
            # multiply cluster probs to get joint probs for each sample
            # clusters are assumed to be independent from one another
            # in this way, the joint probability of all the variables is their product
            cluster_probs = np.array(cluster_probs)
            prob_sets = [np.prod(cluster_probs, axis=0)]  # shape (1, n_sites, n_samples)
        else:
            # measure leakages for each cluster independently
            prob_sets = cluster_probs  # shape (n_clusters, n_sites, n_samples)

        # compute information leakage for each cluster (or combined cluster if joint)
        leakages = []
        for i, prob_set in enumerate(prob_sets):

            # weight the probability predictions by the website priors
            # in the closed-world scenario, all are equally weighted
            probs_weighted = []
            for site, probs in enumerate(prob_set):
                probs_weighted.append(probs * self.website_priors[site])
            probs_weighted = np.array(probs_weighted)

            # transpose array so that the first index represents samples and the second index represents the site
            probs_weighted = np.transpose(probs_weighted)

            # normalize probabilities such that the per-site probs for each sample sums to one
            # (as should be expected for conditional probabilities)
            probs_norm = []
            for probs in probs_weighted:
                norm = probs / sum(probs) if sum(probs) > 0 else probs
                probs_norm.append(norm)

            # compute entropy for each sample
            entropies = []
            for probs in probs_norm:
                entropies.append(sum([h(prob) for prob in probs if prob > 0]))

            # H(C|f) -- estimate real entropy as average of all samples
            H_CF = sum(entropies)/len(entropies)

            # I(C;f) = H(C) - H(C|f) -- compute information leakage
            leakage = H_C - H_CF
            leakages.append(leakage)

            # debug output
            logger.debug("{cluster} {l} = {c} - {cf}"
                         .format(cluster=clusters[i], l=leakage, c=H_C, cf=H_CF))

        return leakages
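
The entropy arithmetic at the heart of this method can be sketched independently of the class. In the snippet below the random probs array stands in for the per-site KDE predictions that _do_predictions would produce, and uniform priors model the closed-world case; everything else follows the formula I(C;F) = H(C) - H(C|F) used above.

# Sketch of the Monte Carlo leakage estimate: I(C;F) = H(C) - H(C|F).
import numpy as np

n_sites, n_samples = 10, 5000
priors = np.full(n_sites, 1.0 / n_sites)        # closed world: uniform priors

# stand-in for KDE output: probability of each sample under each site's model
probs = np.random.rand(n_sites, n_samples)      # shape (n_sites, n_samples)

# H(C) -- the maximum number of bits that can be leaked
H_C = -np.sum(priors * np.log2(priors))

# weight by priors, then normalize per sample so the site probabilities sum to one
weighted = probs * priors[:, None]
posterior = weighted / weighted.sum(axis=0)

# H(C|F) -- average Shannon entropy over the samples
with np.errstate(divide='ignore', invalid='ignore'):
    terms = np.where(posterior > 0, -posterior * np.log2(posterior), 0.0)
H_CF = terms.sum(axis=0).mean()

print("estimated leakage: {:.3f} bits".format(H_C - H_CF))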
Example #4
    def update(self, doc, index, doc_id, doc_type='list'):
        logger.debug({"about to update": str(doc)})
        res = self.es.update(index, id=doc_id, doc_type=doc_type, body=doc)
        logger.info({"updated. results": str(res)})
        return res
Example #5
    def write(self, doc, index, doc_id, doc_type='list'):
        logger.debug({"About to write": str(doc)})
        res = self.es.index(index, id=doc_id, doc_type=doc_type, body=doc)
        logger.info({"wrote, results": str(res)})
        return res
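
A hypothetical round-trip using the two wrappers above; EsClient is a placeholder for whatever class holds the self.es client, and the index name and documents are made up for illustration.

# Illustrative usage of the write/update wrappers (all names are assumptions).
client = EsClient()
doc = {"title": "example", "views": 1}

client.write(doc, index="pages", doc_id="page-1")
# Elasticsearch partial updates expect the changes wrapped in a "doc" key
client.update({"doc": {"views": 2}}, index="pages", doc_id="page-1")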
Example #6
    def cluster(self,
                features,
                checkpoint=None,
                min_samples=1,
                min_cluster_size=3):
        """
        Find clusters in provided features.

        Use the HDBSCAN algorithm to cluster the topN features based upon their pairwise mutual information.
        First, an NxN distance matrix is filled with pairwise distances of the form 1 - NMI.
        NMI values may be retrieved from the MIAnalyzer's internal cache or computed anew.
        The HDBSCAN model is then fit to this distance matrix, and the identified clusters are returned.

        Parameters
        ----------
        features : list
            A list of features to cluster
        checkpoint : str
            Path to plaintext file to store feature redundancy checkpoint information.
            If None, no checkpointing is performed.
        min_samples : int
            The min_samples parameter to use for the HDBSCAN algorithm.
            The number of samples in a neighbourhood for a point to be considered a core point.

        min_cluster_size : int
            The min_cluster_size parameter to use for the HDBSCAN algorithm.
            The minimum size of clusters; single linkage splits that contain fewer points than this will be considered points “falling out” of a cluster rather than a cluster splitting into two new clusters.

        Returns
        -------
        list
            Nested lists where each list contains the cluster's features.
            Features that do not fall into a cluster are given their own cluster (i.e. a singleton list).
        """
        # compute pairwise MI for all topN features
        X = np.full(shape=(len(features), len(features)),
                    fill_value=np.nan, dtype=float)  # distance matrix
        pairs = list(combinations_with_replacement(
            features, 2))  # all possible combinations

        # if checkpointing, read NMI calculations and save to cache
        if checkpoint is not None:
            if os.path.exists(checkpoint):
                chk_fi = open(checkpoint, 'r+')
                for line in chk_fi:
                    try:
                        if line[0] == '=':
                            a, b, c = line[1:].split(',')
                            self._nmi_cache.append(
                                ((int(a), int(b)), float(c)))
                    except ValueError:
                        # skip malformed checkpoint lines
                        pass
                chk_fi.close()
            # re-open checkpoint for appending
            chk_fi = open(checkpoint, 'a+')

        if self._nmi_cache:
            # ignore unselected features in cache
            cache = [(pair, nmi) for pair, nmi in self._nmi_cache
                     if pair[0] in features and pair[1] in features]
            # add each cached nmi to the distance matrix
            for cached_pair, nmi in cache:
                # remove cached_pair from pairs
                pairs = list(
                    filter(
                        lambda pair: pair != cached_pair and
                        pair != (cached_pair[1], cached_pair[0]), pairs))
                # add cached nmi to matrix
                i, j = features.index(cached_pair[0]), features.index(
                    cached_pair[1])
                X[i][j] = 1 - nmi
                X[j][i] = 1 - nmi

        if len(pairs) > 0:
            # map pairs to nmi
            if self._pool is None:
                results = map(self._estimate_nmi, pairs)
            else:
                results = self._pool.imap(self._estimate_nmi, pairs)
                self._pool.close()

            # fill matrix with pair nmi values
            count = 0
            for pair, nmi in zip(pairs, results):

                # print progress updates
                count += 1
                if (count - 1) % max(1, int(len(pairs) * 0.05)) == 0:
                    logger.info("Progress: {}/{}".format(count, len(pairs)))

                fidx1, fidx2 = pair
                i, j = features.index(fidx1), features.index(fidx2)
                X[i][j] = 1 - nmi
                X[j][i] = 1 - nmi

                if checkpoint is not None:
                    chk_fi.write('={},{},{}\n'.format(fidx1, fidx2, nmi))
                    chk_fi.flush()

            # restart pool if multiprocessing
            if self._pool is not None:
                self._pool.join()
                self._pool.restart()

        # close the checkpoint file if one was used
        if checkpoint is not None:
            chk_fi.close()

        # verify that all values are filled
        assert not np.any(np.isnan(X))

        # use DBSCAN to cluster our data
        labels = HDBSCAN(metric='precomputed',
                         min_samples=min_samples,
                         min_cluster_size=min_cluster_size).fit_predict(X)
        logger.debug("Found {} clusters.".format(len(set(labels) - {-1})))

        # organize the topN features into sub-lists where
        # each sub-list contains all features in a cluster
        clusters = []
        for label in range(min(labels), max(labels) + 1):
            if label >= 0:
                cluster = [
                    features[i] for i, la in enumerate(labels) if la == label
                ]
                clusters.append(cluster)
            else:
                # treat features that do not cluster (i.e. noise) each as their own independent cluster
                noise = [[features[i]] for i, la in enumerate(labels)
                         if la == label]
                clusters.extend(noise)

        logger.debug("Cluster labels: {}".format(labels))
        return clusters, X
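
The clustering step in isolation: a sketch that feeds a synthetic 1 - NMI distance matrix to HDBSCAN with the same parameters, assuming the hdbscan package provides the HDBSCAN class used above; the matrix values are fabricated purely for illustration.

# Sketch: HDBSCAN over a precomputed (1 - NMI) distance matrix, synthetic data.
import numpy as np
from hdbscan import HDBSCAN

rng = np.random.default_rng(1)
n_features = 12

# fabricate a symmetric NMI matrix with ones on the diagonal
nmi = rng.random((n_features, n_features))
nmi = (nmi + nmi.T) / 2
np.fill_diagonal(nmi, 1.0)     # a feature is fully informative about itself
X = 1 - nmi                    # convert similarity to distance

labels = HDBSCAN(metric='precomputed',
                 min_samples=1,
                 min_cluster_size=3).fit_predict(X)

# group feature indices by cluster label; label -1 marks noise points
clusters = {}
for idx, label in enumerate(labels):
    clusters.setdefault(label, []).append(idx)
print(clusters)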
Example #7
    def prune(self, features, checkpoint=None, nmi_threshold=0.9, topn=100):
        """
        Reduce the feature-set to a list of top features which are non-redundant.

        Redundancy is identified by estimating the pair-wise mutual information of features.
        The algorithm finds at most ``topn`` non-redundant features before stopping.
        If the MIAnalyzer was instantiated with a ``pool``, NMI calculations will be performed in parallel.

        Parameters
        ----------
        features : list
            Array of features from which to prune redundant features.
            Features should be pre-sorted by importance with the most important feature being at index 0.
        checkpoint : str
            Path to plaintext file to store feature redundancy checkpoint information.
            If None, no checkpointing is performed.
        nmi_threshold : float
            Threshold value used to identify redundant features.
            Features with NMI values greater than the threshold value are pruned.
        topn : int
            Number of features to save when pruning is performed.

        Returns
        -------
        tuple
            Two lists: the non-redundant features (at most ``topn`` of them)
            and the features pruned as redundant.
        """
        # results of NMI calculations are saved in a list internal to the analyzer
        # this reduces the amount of computation required by subsequent cluster() calls
        self._nmi_cache, self._mi_cache = [], dict()

        self.nmi_threshold = nmi_threshold

        # feature lists
        cleaned_features = set()  # non-redundant
        pruned_features = set()  # redundant

        # if checkpointing, open file and read any previously processed features
        if checkpoint is not None:
            if os.path.exists(checkpoint):
                checkpoint_fi = open(checkpoint, 'r+')
                for line in checkpoint_fi:
                    try:
                        if line[0] == '+':
                            feature = int(line[1:].strip())
                            cleaned_features.add(feature)
                        elif line[0] == '-':
                            feature = int(line[1:].strip())
                            pruned_features.add(feature)
                        if line[0] == '=':
                            a, b, c = line[1:].split(',')
                            self._nmi_cache.append(
                                ((int(a), int(b)), float(c)))
                    except ValueError:
                        # skip malformed checkpoint lines
                        pass
                features = list(
                    filter(
                        lambda f: f not in cleaned_features and f not in
                        pruned_features, features))
                checkpoint_fi.close()

            # re-open checkpoint for appending
            checkpoint = open(checkpoint, 'a+')

        # continue to process features until either there are no features left to process
        # or the topN features have been selected
        while features and len(cleaned_features) < topn:

            # the next most important feature
            current_feature = features.pop(0)
            logger.debug("MI analysis on feature #{}".format(current_feature))

            # for all top features, measure pair-wise mutual information to check for redundancy
            feature_pairs = zip(repeat(current_feature), cleaned_features)
            if self._pool is None or len(cleaned_features) < 2:
                results = map(self._check_redundancy, feature_pairs)
            else:  # parallel, unordered
                results = self._pool.uimap(self._check_redundancy,
                                           feature_pairs)

            # break upon first occurrence of redundancy
            is_redundant = False
            for res in results:

                # unzip results
                is_redundant, feature_pair, nmi = res

                # save feature pair with nmi in cache
                self._nmi_cache.append((feature_pair, nmi))
                if checkpoint is not None:
                    checkpoint.write('={},{},{}\n'.format(
                        feature_pair[0], feature_pair[1], nmi))
                    checkpoint.flush()

                # break loop
                if is_redundant:
                    # if the analyzer is using a process pool
                    # terminate processes and restart the pool
                    if self._pool is not None:
                        self._pool.terminate()
                        self._pool.join()
                        self._pool.restart()
                    break

            # if the current feature does not appear to be redundant with any
            # other top features, add current feature to top features list
            if not is_redundant:
                cleaned_features.add(current_feature)
                logger.info("Progress: {}/{}".format(len(cleaned_features),
                                                     min(topn, len(features))))
                if checkpoint is not None:
                    checkpoint.write('+{}\n'.format(current_feature))
                    checkpoint.flush()
            else:
                pruned_features.add(current_feature)
                if checkpoint is not None:
                    checkpoint.write('-{}\n'.format(current_feature))
                    checkpoint.flush()

        if checkpoint is not None:
            checkpoint.close()

        # return both non-redundant and redundant features
        # which feature was redundant with which is however not saved
        return list(cleaned_features), list(pruned_features)
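
With checkpointing and the process pool stripped away, the pruning strategy reduces to a short greedy loop; estimate_nmi below is a placeholder for the analyzer's internal estimator and is not part of the original class.

# Serial sketch of the greedy pruning strategy (no pool, no checkpointing).
def prune_features(features, estimate_nmi, nmi_threshold=0.9, topn=100):
    """Keep the most important non-redundant features.

    `features` must be sorted by importance (most important first);
    `estimate_nmi` is any callable mapping a feature pair to its NMI.
    """
    kept, pruned = [], []
    for feature in features:
        if len(kept) >= topn:
            break
        # a feature is redundant if its NMI with any kept feature exceeds the threshold
        if any(estimate_nmi((feature, other)) > nmi_threshold for other in kept):
            pruned.append(feature)
        else:
            kept.append(feature)
    return kept, pruned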
Example #8
def main(features_path, output_path, n_procs=0, n_samples=5000, topn=100, nmi_threshold=0.9, discrete_threshold=100000):
    """
    Run the full information leakage analysis on a processed dataset.

    Parameters
    ----------
    features_path : str
        Operating system file path to the directory containing processed feature files.
    output_path : str
        Operating system file path to the directory where analysis results should be saved.
    n_procs : int
        Number of processes to use for parallelism.
        If 0 is used, auto-detect based on number of system CPUs.
    n_samples : int
        Number of samples to use for Monte Carlo estimation when running the fingerprint modeler.
    topn : int
        Top number of features to analyze during joint analysis.
    nmi_threshold : float
        Cut-off value for determining redundant features. Should be a value between 0 and 1.
    discrete_threshold : int
        Threshold forwarded to the ``WebsiteFingerprintModeler`` when the modeler is constructed.

    Returns
    -------
    float
        Combined feature leakage (in bits)
    """
    # prepare feature dataset
    logger.info("Loading dataset.")
    feature_data = WebsiteData(features_path)
    logger.info("Loaded {} sites.".format(len(feature_data.sites)))
    logger.info("Loaded {} instances.".format(len(feature_data)))

    # create process pool
    if n_procs > 1:
        pool = Pool(n_procs)
    elif n_procs == 0:
        pool = Pool(cpu_count())
    else:
        pool = None

    # directory to save results
    outdir = output_path
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # initialize fingerprint modeler
    modeler = WebsiteFingerprintModeler(feature_data, discrete_threshold=discrete_threshold)

    # load previous leakage measurements if possible
    indiv_path = os.path.join(outdir, 'indiv.pkl')
    if os.path.exists(indiv_path):
        with open(indiv_path, "rb") as fi:
            logger.info("Loading individual leakage measures from file.")
            leakage_indiv = dill.load(fi)

    # otherwise do individual measure
    else:
        logger.info("Begin individual feature analysis.")

        # perform individual measure with checkpointing
        chk_path = os.path.join(outdir, 'indiv_checkpoint.txt')
        leakage_indiv = _individual_measure(modeler, pool, chk_path)

        # save individual leakage to file
        logger.info("Saving individual leakage to {}.".format(indiv_path))
        with open(indiv_path, "wb") as fi:
            dill.dump(leakage_indiv, fi)

    # perform combined information leakage measurements
    # initialize MI analyzer
    analyzer = MutualInformationAnalyzer(feature_data, pool=pool)

    # sort the list of features by their individual leakage
    # we will process these features in the order of their importance during MI analysis
    logger.info("Sorting features by individual leakage.")
    tuples = list(zip(feature_data.features, leakage_indiv))
    tuples = sorted(tuples, key=lambda x: (-x[1], x[0]))
    logger.debug("Top 20:\t {}".format(tuples[:20]))
    sorted_features = list(list(zip(*tuples))[0])

    # process into list of non-redundant features
    cln_path = os.path.join(outdir, 'cleaned.pkl')
    rdn_path = os.path.join(outdir, 'redundant.pkl')
    chk_path = os.path.join(outdir, 'prune_checkpoint.txt')
    if os.path.exists(cln_path):
        logger.info("Loading top non-redundant features from file.")
        with open(cln_path, 'rb') as fi:
            cleaned = dill.load(fi)
    else:
        logger.info("Begin feature pruning.")
        cleaned, pruned = analyzer.prune(features=sorted_features,
                                         nmi_threshold=nmi_threshold,
                                         topn=topn,
                                         checkpoint=chk_path)
        with open(cln_path, 'wb') as fi:
            dill.dump(cleaned, fi)
        with open(rdn_path, 'wb') as fi:
            dill.dump(pruned, fi)

    # cluster non-redundant features
    dst_path = os.path.join(outdir, 'distance_matrix.pkl')
    cst_path = os.path.join(outdir, 'clusters.pkl')
    if os.path.exists(cst_path):
        logger.info("Loading clusters from file.")
        with open(cst_path, 'rb') as fi:
            clusters = dill.load(fi)
    else:
        logger.info("Begin feature clustering.")
        clusters, distance_matrix = analyzer.cluster(cleaned, checkpoint=chk_path)
        with open(dst_path, 'wb') as fi:
            dill.dump(distance_matrix, fi)
        with open(cst_path, 'wb') as fi:
            dill.dump(clusters, fi)

    # perform joint information leakage measurement
    logger.info('Identified {} clusters.'.format(len(clusters)))
    logger.info("Begin cluster leakage measurements.")
    modeler._pool = pool    # configure modeler to use the proc pool
    leakage_joint = modeler.information_leakage(clusters=clusters,
                                                sample_size=n_samples,
                                                joint_leakage=True)[0]

    logger.info("Final leakage results: {} bits".format(leakage_joint))
    logger.info("Finished execution.")
    return leakage_joint
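
One plausible way to expose main on the command line; the flag names simply mirror the function parameters and are otherwise an assumption, not part of the original module.

# Hypothetical CLI wrapper around main(); flag names mirror the parameters above.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="information leakage analysis")
    parser.add_argument("features_path", help="directory of processed feature files")
    parser.add_argument("output_path", help="directory for analysis results")
    parser.add_argument("--n-procs", type=int, default=0)
    parser.add_argument("--n-samples", type=int, default=5000)
    parser.add_argument("--topn", type=int, default=100)
    parser.add_argument("--nmi-threshold", type=float, default=0.9)
    parser.add_argument("--discrete-threshold", type=int, default=100000)
    args = parser.parse_args()

    leakage = main(args.features_path, args.output_path,
                   n_procs=args.n_procs, n_samples=args.n_samples,
                   topn=args.topn, nmi_threshold=args.nmi_threshold,
                   discrete_threshold=args.discrete_threshold)
    print("Combined leakage: {} bits".format(leakage))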