Example #1
    def get_nearest_records(self, anonymized_record):
        """
        Calculate distance between the anonymized instance being re-identified to instances in original buffer.
        Each instance xi in buffer is stored in a set G if the distance d to x' is the minimum distance.
        Once an instance at distance d < A is found, all instances from G are removed and A is updated.
        Finally, the algorithm checks if the target instance is in G.
        :param anonymized_record: Records to be re-identified.
        :return: Indices of identified records if such are found, otherwise None.
        """
        # initialization: guard against an empty buffer before indexing into it
        if not self.original_instances_buffer:
            return None
        original_record = self.original_instances_buffer[0]
        minimum = MetricsUtils.distance(anonymized_record.quasi_identifier,
                                        original_record.quasi_identifier)
        indices = [original_record.timestamp]

        # traversal: keep every record whose distance equals the running minimum
        for r in self.original_instances_buffer[1:]:
            distance = MetricsUtils.distance(
                anonymized_record.quasi_identifier, r.quasi_identifier)
            if distance < minimum:
                minimum = distance
                indices = [r.timestamp]
            elif distance == minimum:
                indices.append(r.timestamp)
        return indices
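MetricsUtils.distance and the buffered Record objects above are project-specific. A minimal standalone sketch of the same linkage step, assuming plain numeric quasi-identifiers stored as hypothetical (timestamp, quasi_identifier) pairs and Euclidean distance:

import math

def nearest_record_indices(anonymized_qi, buffer):
    """buffer: list of (timestamp, quasi_identifier) pairs.
    Returns the timestamps of all records at minimum distance, or None if the buffer is empty."""
    if not buffer:
        return None
    minimum, indices = None, []
    for timestamp, qi in buffer:
        d = math.dist(anonymized_qi, qi)  # Euclidean distance on numeric QIs
        if minimum is None or d < minimum:
            minimum, indices = d, [timestamp]
        elif d == minimum:
            indices.append(timestamp)
    return indices

# e.g. nearest_record_indices([30, 50000], [(0, [28, 49000]), (1, [31, 50500])]) -> [1]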
Example #2
 def update_GSE(self):
     """
     Accumulate the group squared-error terms with the tuple at the head of the current window:
     its distance to the cluster centroid is added to __GSE, and its distance to the dataset centroid to __GSE_T.
     """
     xi = self.centroid
     xij = self.W_curr.peek().quasi_identifier
     dataset_centroid = MetaUtils.dataset_centroid
     appended_SSE_dist = MetricsUtils.distance(xij, xi)
     self.__GSE += appended_SSE_dist
     appended_SST_dist = MetricsUtils.distance(xij, dataset_centroid)
     self.__GSE_T += appended_SST_dist
 def randomize(t, w=None):
     """
     Randomize values of each attribute in tuple
     For numerical attributes - return uniform random value in [min_val, max_val] range
     For categorical attributes - return random value from set of unique attribute values
     :param t: Tuple to be randomized
     :param w: Current buffer batch of tuples in the cluster to which t belongs. Default None
     :return: Randomized version of the tuple
     """
     qi = list(t.quasi_identifier)
     for i in range(len(qi)):
         if not isinstance(qi[i], str):  # Numerical attributes
             min_val = MetaUtils.get_attr_metadata(i, 'Min_Val')
             max_val = MetaUtils.get_attr_metadata(i, 'Max_Val')
             if not w:
                 qi[i] = DistributionUtils.get_uniform_rand(min=min_val, max=max_val, dtype=type(qi[i]))
             else:
                 records = map(lambda x: x.quasi_identifier, w)
                 attr_vals = list(zip(*records))[i]
                 x = DistributionUtils.get_estimated_rand(sample_batch=attr_vals, dtype=type(qi[i]))
                 qi[i] = MetricsUtils.truncate_value(x, l=min_val, u=max_val, dtype=type(qi[i]))
         else:  # Categorical attributes
             unique_values = MetaUtils.get_attr_metadata(i, 'Distinct_Val')
             qi[i] = DistributionUtils.get_uniform_rand(sample_batch=unique_values, dtype=type(qi[i]))
     anonymized = copy.copy(t)
     anonymized.quasi_identifier = qi
     return anonymized
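DistributionUtils and MetaUtils above are project utilities. A minimal standalone sketch of the uniform-randomization branch, assuming a hypothetical per-attribute metadata dict with 'Min_Val'/'Max_Val' for numeric attributes and 'Distinct_Val' for categorical ones:

import random

def randomize_qi(qi, metadata):
    """Return a randomized copy of a quasi-identifier list."""
    out = []
    for i, value in enumerate(qi):
        if isinstance(value, str):  # categorical: uniform draw from the distinct values
            out.append(random.choice(metadata[i]['Distinct_Val']))
        else:  # numerical: uniform draw from the attribute's domain, cast back to its type
            low, high = metadata[i]['Min_Val'], metadata[i]['Max_Val']
            out.append(type(value)(random.uniform(low, high)))
    return out

# e.g. randomize_qi([35, 'blue'], {0: {'Min_Val': 18, 'Max_Val': 90},
#                                  1: {'Distinct_Val': ['red', 'blue', 'green']}})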
Example #4
 def update_buffer_centroid(self):
     """
     Recalculate the centroid of the buffer.
     :return: Updated centroid.
     """
     self.centroid = MetricsUtils.calculate_centroid(self.buffer)
     return self.centroid
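MetricsUtils.calculate_centroid is not shown in these examples; for mixed-type quasi-identifiers a common choice, assumed in this sketch, is the arithmetic mean for numeric attributes and the mode for categorical ones:

from collections import Counter

def calculate_centroid(quasi_identifiers):
    """quasi_identifiers: non-empty list of equal-length QI lists."""
    centroid = []
    for column in zip(*quasi_identifiers):
        if isinstance(column[0], str):  # categorical attribute: most frequent value
            centroid.append(Counter(column).most_common(1)[0][0])
        else:  # numerical attribute: arithmetic mean
            centroid.append(sum(column) / len(column))
    return centroid

# e.g. calculate_centroid([[20, 'red'], [40, 'red'], [30, 'blue']]) -> [30.0, 'red']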
Example #5
 def update_estimation(self, time, record_pair, cluster=None):
     """
     Updates the distortion of the quasi-identifiers, normalized by the number of samples and dimension of QI
     (sigma(j=1..n -> ||xj-x'j||**2)/(qi_dimensions * N)
     Update total stream record number.
     :param record_pair: Pair of original record and its anonymization.
     :param cluster: Cluster from which the record is published (Default: not needed).
     :return: The accumulated SSE
     """
     self.processed_instances += 1
     original = record_pair.original_record.quasi_identifier
     cluster_centroid = record_pair.anonymized_record.quasi_identifier
     # if not self.__qi_dimension:  # m parameter
     #     self.__qi_dimension = len(original)
     self.__SSE += MetricsUtils.distance(original, cluster_centroid)**2
     self.__SST += MetricsUtils.distance(original,
                                         self.__dataset_centroid)**2
     return self.__SSE
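The SSE and SST terms accumulated above are typically combined into a normalized information-loss ratio. A minimal standalone sketch, assuming Euclidean distance on numeric quasi-identifiers and a hypothetical list of (original, anonymized) pairs:

import math

def normalized_info_loss(pairs, dataset_centroid):
    """pairs: list of (original_qi, anonymized_qi).
    Returns SSE / SST, i.e. the total squared original-to-anonymized error
    normalized by the total scatter of the originals around the dataset centroid."""
    sse = sum(math.dist(orig, anon) ** 2 for orig, anon in pairs)
    sst = sum(math.dist(orig, dataset_centroid) ** 2 for orig, _ in pairs)
    return sse / sst if sst else 0.0

# e.g. normalized_info_loss([([1.0, 2.0], [1.5, 2.0]), ([3.0, 4.0], [2.5, 4.0])], [2.0, 3.0])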
Example #6
    def __init__(self, original_tuples):
        """
        :param original_tuples: Original stream tuples, used to set the QI dimension and the dataset centroid (for SST).
        """
        super(HomogeneityInfoLossMetric, self).__init__()
        self.logger = logging.getLogger(__name__)

        self.__SSE = 0
        self.__SST = 0
        self.__qi_dimension = len(original_tuples[0].quasi_identifier)

        # Mean of whole dataset (stream), for calculating SST
        self.__dataset_centroid = MetricsUtils.calculate_centroid(
            original_tuples)
 def distance_based_test(self, c):
     """
     Compare the distance of the two centroids of the buffers in the given cluster with the p-value of the K-S test.
     Used for mix-type tuples.
     :param c: Cluster to be inspected for concept drift. Contains both current and previous buffer windows.
     :return:
     """
     z1 = c.W_prev.update_buffer_centroid().quasi_identifier
     z2 = c.W_curr.update_buffer_centroid().quasi_identifier
     self.incremental_change = self.__factor * MetricsUtils.distance(z1, z2)
     return self.incremental_change
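A minimal standalone sketch of the same kind of drift signal, flagging a concept drift when the scaled distance between the previous and current window centroids exceeds a confidence threshold (the threshold handling here is an assumption, not the detector's actual K-S comparison):

import math

def centroid_drift(prev_centroid, curr_centroid, factor, threshold):
    """Return (change, drift_detected) for two numeric window centroids."""
    change = factor * math.dist(prev_centroid, curr_centroid)
    return change, change > threshold

# e.g. centroid_drift([30.0, 52000.0], [34.0, 58000.0], factor=0.5, threshold=100.0)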
Example #8
 def update_cluster_centroid(self, t):
     """
     Calculates the new centroid of the entire cluster, after assigning the last tuple into it.
     :return: Updated centroid record of the entire cluster.
     """
     if self.size <= 1:
         self.centroid = t
     else:
         centr = MetricsUtils.update_centroid(
             self.centroid.quasi_identifier, t.quasi_identifier, self.size,
             self.categorical_freq)
         self.centroid = Record(centr, centr=True)
     return self.centroid
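MetricsUtils.update_centroid is not shown in these examples; for the numeric part of the quasi-identifier the standard incremental rule is new = old + (x - old) / n. A minimal sketch under that assumption, with a hypothetical per-attribute Counter for categorical frequencies:

from collections import Counter

def update_centroid(old_centroid, new_qi, size, categorical_freq):
    """Fold one new tuple into the centroid. size is the cluster size after the insertion;
    categorical_freq maps attribute index -> Counter of categorical values seen so far."""
    updated = []
    for i, (c, x) in enumerate(zip(old_centroid, new_qi)):
        if isinstance(x, str):  # categorical: track frequencies, centroid keeps the current mode
            categorical_freq[i][x] += 1
            updated.append(categorical_freq[i].most_common(1)[0][0])
        else:  # numerical: running-mean update
            updated.append(c + (x - c) / size)
    return updated

# e.g. update_centroid([30.0, 'red'], [40, 'red'], size=3, categorical_freq={1: Counter({'red': 2})})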
Example #9
    def update_estimation(self, time, record_pair, cluster=None):
        """
        Update relative error by average each error for each pair of records over number of attributes m.
        (sigma(j=1..m -> RE(a_j <-> a'_j)/(qi_dimensions=m)
        Relative error is in [0..1] range.
        Update total stream record number.
        :param record_pair: Pair of original record and its anonymization.
        :param cluster: Cluster from which the record is published (Default: not needed).
        :return: The accumulated relative error
        """
        self.processed_instances += 1
        last_error = self.current_metric

        original = record_pair.original_record.quasi_identifier
        anonymized = record_pair.anonymized_record.quasi_identifier

        self.current_metric += (
            MetricsUtils.relative_error(original, anonymized) / len(original))
        self.incremental_metric = self.current_metric - last_error
        return self.current_metric
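MetricsUtils.relative_error is a project utility; one common convention, assumed here, is |x - x'| / |x| for numeric attributes and a 0/1 mismatch indicator for categorical ones, averaged over the attributes:

def mean_relative_error(original_qi, anonymized_qi):
    """Average per-attribute relative error of an (original, anonymized) QI pair."""
    errors = []
    for x, x_anon in zip(original_qi, anonymized_qi):
        if isinstance(x, str):  # categorical: 0 on a match, 1 on a mismatch
            errors.append(0.0 if x == x_anon else 1.0)
        else:  # numerical: relative deviation from the original value
            errors.append(abs(x - x_anon) / abs(x) if x != 0 else float(x_anon != 0))
    return sum(errors) / len(errors)

# e.g. mean_relative_error([40, 'red'], [38, 'red']) -> 0.025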
    def update_estimation(self, time, record_pair, cluster=None):
        """
        Update the SSE with arriving tuple. Calculate the squared error of the pair of original and anonymized records.
        Accumulate the calculated squared error.
        ||x_j-c_j||**2), where c_j is the centroid of cluster to which x_j belongs.
        :param record_pair: Pair of original record and its anonymization.
        :param cluster: Cluster from which the record is published (Default: not needed).
        :return: None
        """
        self.processed_instances += 1
        original = record_pair.original_record.quasi_identifier
        anonymized = record_pair.anonymized_record.quasi_identifier
        last_error = self.current_metric
        # square_error = MetricsUtils.distance(original, anonymized) ** 2
        square_error = MetricsUtils.distance_unbounded(original, anonymized)

        self.current_metric += square_error
        self.incremental_metric = self.current_metric - last_error
        # self.monitor_overtime_change(cluster.W_curr.max_size, show_incremental=False)
        self.monitor_overtime_change(window_size=100, show_incremental=False)
    def search_best_cluster(self, t):
        """
        Find the nearest and least info-loss increasing cluster w.r.t a new arriving tuple.
        Calculate the distance from the arrived tuple to each cluster in the system, bounded to a given threshold.
        Increase in information loss is computed as the incremental squared error (SSE) of a candidate cluster.
        :param t: New arriving tuple
        :return: The nearest and least info-loss increasing cluster
        """
        if not self.cluster_set:
            return None
        cluster_dist_dict = {}
        for c in self.cluster_set:
            centr = c.centroid.quasi_identifier
            dist = MetricsUtils.distance(centr, t.quasi_identifier)

            if dist <= self.dist_thr:
                cluster_dist_dict[c] = dist
        if cluster_dist_dict:
            # candidate clusters sorted by increasing distance to the arriving tuple
            sorted_candidates = sorted(cluster_dist_dict.items(),
                                       key=lambda x: x[1])
            return self.check_min_info_loss_increase(sorted_candidates, t)
        return None
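check_min_info_loss_increase is not shown in these examples. One way to score the information-loss increase in closed form is the identity that adding a point x to a cluster of n members with centroid c raises its SSE by n/(n+1) * ||x - c||^2; the sketch below applies that rule to a hypothetical list of (cluster_id, centroid_qi, size) candidates:

import math

def sse_increase(centroid_qi, cluster_size, tuple_qi):
    """Exact SSE growth from adding one point to a cluster: (n / (n + 1)) * ||x - c||^2."""
    n = cluster_size
    return (n / (n + 1)) * math.dist(tuple_qi, centroid_qi) ** 2

def least_increasing_cluster(candidates, tuple_qi):
    """candidates: list of (cluster_id, centroid_qi, size).
    Returns the id of the candidate whose SSE grows least when the tuple joins it."""
    return min(candidates, key=lambda c: sse_increase(c[1], c[2], tuple_qi))[0]

# e.g. least_increasing_cluster([('A', [30.0, 50.0], 10), ('B', [33.0, 51.0], 3)], [32.0, 52.0]) -> 'B'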
Example #12
    def add_noise(self, r):
        """
        Add noise to attributes in record w.r.t its domain and noise scale
        In case the noisy value exceed the domain range, truncate a value inside its domain boundaries
        :param r: Record to which noise is added
        :return: Noisy version of quasi-identifier of record
        """
        qi = list(r.quasi_identifier)
        m = len(qi)
        for i in range(0, m):
            w = MetaUtils.stream_metadata[i]['Weight']
            l = MetaUtils.stream_metadata[i]['Min_Val']
            u = MetaUtils.stream_metadata[i]['Max_Val']

            scale_estimator = self.attribute_scale_estimators.get(i)
            if not scale_estimator:
                if not isinstance(qi[i], str):  # Numerical attribute
                    scale_estimator = LaplaceDomainNoiseEstimator(
                        self.k, m, self.epsilon, self.loc, self.scale)
                else:  # Categorical attribute
                    scale_estimator = CategoricalNoiseEstimator(
                        w, self.noise_thr)
            self.attribute_scale_estimators[i] = scale_estimator

            p = scale_estimator.estimate(qi[i])
            noise = scale_estimator.get_noise(p)
            if not isinstance(qi[i], str):  # Numerical attribute
                x = qi[i] + noise

                # Truncate a value inside its domain boundaries
                # (if value after adding noise lies outside its domain range)
                qi[i] = MetricsUtils.truncate_value(x, l, u, dtype=type(qi[i]))
            else:  # Categorical attribute
                qi[i] = noise

        return qi
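LaplaceDomainNoiseEstimator and MetricsUtils.truncate_value are project classes; a minimal standalone sketch of the same idea for a single numeric attribute, drawing Laplace noise and truncating back into the domain (the sensitivity/scale choice here is an assumption, not the project's calibration):

import random

def add_laplace_noise(value, min_val, max_val, epsilon, sensitivity=None):
    """Perturb a numeric value with Laplace noise of scale sensitivity / epsilon,
    then truncate the result back into [min_val, max_val]."""
    if sensitivity is None:
        sensitivity = max_val - min_val  # assumed sensitivity: the attribute's domain width
    scale = sensitivity / epsilon
    # Laplace(0, scale) drawn as the difference of two independent exponential variates
    noise = random.expovariate(1 / scale) - random.expovariate(1 / scale)
    return min(max(value + noise, min_val), max_val)

# e.g. add_laplace_noise(37.0, min_val=18, max_val=90, epsilon=1.0)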
Example #13
def run(log_file, dir, stream_path, datatypes_path, k, l, c, eps, b, delta,
        dist_thr, cd_thr, cd_conf, noise_thr):
    logging.basicConfig(
        filename=log_file,
        filemode='w',
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    # Mirror the log to stdout in addition to the log file
    logging.getLogger("").addHandler(logging.StreamHandler(sys.stdout))

    logging.info("---------- Program started ----------")
    logging.info("Dataset: %s" % stream_path.split('.')[0])

    fs = StreamReader(dir, stream_path, datatypes_path)
    features = fs.read_csv_file(shuffle=False, duplicate_frac=None)

    logging.info("Preparation of stream dataset completed!")
    logging.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

    exec_time_estimator = ExecutionTimeMetric()
    average_publishing_delay_estimator = PublishingDelayTimeMetric()
    mse_info_loss_estimator = MSEInfoLossMetric()
    sse_unbounded_info_loss_estimator = SSEInfoLossMetric()
    relative_err_info_loss = RelativeErrorInfoLossMetric()
    classification_info_loss_estimator = ClassificationInfoLossMetric()
    disclosure_risk_estimator = BufferedDisclosureRiskMetric(buffer_size=100)

    logging.info("Initializing anonymizer...")
    try:
        assert delta >= k[0]
        assert k[0] <= b <= k[-1] + 1
    except AssertionError:
        print("delta must be at least {0}, and the buffer size must be between {0} and {1}".format(
            k[0], k[-1] + 1))
        exit(1)

    anonymizer = MicroaggAnonymizer(
        stream=fs.tuples,
        k=k,
        l=l,
        c=c,
        eps=eps,
        b=b,
        delta=delta,
        dist_thr=dist_thr,
        datatypes=features,
        publisher=SmartCentroidPublisher(),
        noiser=DiffPrivateNoiseGenerator(epsilon=eps,
                                         k=k[0],
                                         noise_thr=noise_thr),
        change_detector=ConceptDriftDetector(conf=cd_conf,
                                             buff_size=b,
                                             factor=cd_thr),
        estimators=[
            exec_time_estimator, average_publishing_delay_estimator,
            mse_info_loss_estimator, sse_unbounded_info_loss_estimator,
            relative_err_info_loss, classification_info_loss_estimator,
            disclosure_risk_estimator
        ])

    anonymization_pairs = anonymizer.anonymize()

    if not anonymization_pairs:
        logging.info("Failed to anonymize. Program aborts!")
        return

    logging.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    logging.info("Calculating performance report to CSV file")

    estimators = {
        "Execution Time": exec_time_estimator,
        "Average Publishing Delay": average_publishing_delay_estimator,
        "MSE Info Loss": mse_info_loss_estimator,
        "SSE unbounded Info Loss": sse_unbounded_info_loss_estimator,
        "Relative Percentage Error Info Loss": relative_err_info_loss,
        "Classification Metric": classification_info_loss_estimator,
        "Disclosure Risk": disclosure_risk_estimator
    }

    logging.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    logging.info("Initiating evaluation report...")
    eval_report = EvaluationReport(dataset_name=stream_path.split('.')[0],
                                   anonymization_pairs=anonymization_pairs,
                                   anonymizer=anonymizer,
                                   estimators=estimators)

    eval_report.print_records()

    eval_report.print_incremental_eval_to_CSV()

    logging.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    logging.info("Performing post-analysis evaluation using stream classifier")

    for learner in classification_learner:
        task = next(iter(classification_task))
        evaluator = next(iter(classification_evaluation))
        classifier = ClassifierEvaluator(
            task=(task, classification_task[task]),
            learner=(learner, classification_learner[learner]),
            evaluator=(evaluator, classification_evaluation[evaluator]))

        #  Evaluate original performance
        pred_origin, measures_origin = classifier.evaluate(
            dir=eval_report.EVAL_DIR,
            input=eval_report.ORIGINAL_ARFF,
            stream_size=eval_report.stream_size)

        #  Evaluate anonymized performance
        pred_anonym, measures_anonym = classifier.evaluate(
            dir=eval_report.EVAL_DIR,
            input=eval_report.ANONYMIZED_ARFF,
            stream_size=eval_report.stream_size)

        #  Evaluate difference between original and anonymized performance
        origin_anonym_kappa = MetricsUtils.calculate_kappa(
            pred_origin, pred_anonym)
        eval_report.print_post_evaluation(
            task_name=task,
            learner_name=learner,
            measures_origin=measures_origin,
            measures_anonym=measures_anonym,
            origin_anonym_kappa=origin_anonym_kappa)

    # Save evaluation report to directory of current run, and save the log file
    logging.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    result = eval_report.print_report_to_CSV()
    logging.info("Done!") if result else logging.info("Failed to save report!")

    # Shutdown all loggers and flush their handlers.
    # Save the log from current run to the designated directory.
    logging.shutdown()
    while logging.getLogger("").handlers:
        logging.getLogger("").handlers.pop()
    shutil.move(log_file,
                os.path.abspath(os.path.join(eval_report.EVAL_DIR, log_file)))
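For reference, a hypothetical invocation of run; every path and value below is a placeholder, and only the two constraints checked by the assertions (delta >= k[0] and k[0] <= b <= k[-1] + 1) are taken from the code above:

if __name__ == '__main__':
    run(log_file='run.log',
        dir='data',
        stream_path='adult.csv',
        datatypes_path='adult_datatypes.json',
        k=[10, 25, 50],           # ascending k values
        l=2, c=1.0, eps=1.0,
        b=30,                     # k[0] <= b <= k[-1] + 1
        delta=50,                 # delta >= k[0]
        dist_thr=0.4, cd_thr=1.5, cd_conf=0.05, noise_thr=0.2)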