Example 1
class TestComputeAttributesInstability(unittest.TestCase):
    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._attributes = AttributeSet(ATTRIBUTES)

    def _get_grouped_by_browser(self):
        # 1. Group by the browser id (no sorting for performance, no group key
        #    to avoid adding an additional column with the group key)
        # 2. Sort by the time of collection for each group (give a DataFrame)
        # 3. Regroup by the browser id, here each group has the fingerprints
        #    sorted by the time of collection
        return (self._dataset.dataframe.groupby(
            MetadataField.BROWSER_ID, sort=False,
            group_keys=False).apply(lambda group_df: group_df.sort_values(
                MetadataField.TIME_OF_COLLECT)).groupby(
                    MetadataField.BROWSER_ID, sort=False, group_keys=False))

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 0.0,
            ATTRIBUTES[1]: 0.0,
            ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)

    def test_unexistent_attribute(self):
        self._attributes.add(UNEXISTENT_ATTRIBUTE)
        grouped_by_browser = self._get_grouped_by_browser()
        with self.assertRaises(KeyError):
            _compute_attributes_instability(grouped_by_browser,
                                            self._attributes)

    def test_empty_attributes(self):
        self._attributes = AttributeSet({})
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {}
        self.assertDictEqual(expected_result, attributes_instability)

    def test_empty_dataset_and_attributes(self):
        self._dataset = DummyEmptyDataset()
        self._attributes = AttributeSet({})
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {}
        self.assertDictEqual(expected_result, attributes_instability)

    def test_clean_dataset(self):
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 0.0,
            ATTRIBUTES[1]: 0.0,
            ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)

    def test_dummy_fingerprint_dataset(self):
        self._dataset = DummyFingerprintDataset()
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 0.0,
            ATTRIBUTES[1]: 0.0,
            ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)

    def test_dummy_dataset_with_changes(self):
        self._dataset = DummyDatasetWithChanges()
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 1 / 2,
            ATTRIBUTES[1]: 1.0,
            ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)
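A minimal sketch of the measure these tests exercise, assuming the semantics implied by the expected results (notably the 1/2 and 1.0 of test_dummy_dataset_with_changes): for each attribute, the share of consecutive fingerprints of the same browser whose value changed. The helper and the toy DataFrame below are illustrative, not the project's _compute_attributes_instability.

from typing import Dict

import pandas as pd


def attributes_instability_sketch(grouped_by_browser,
                                  attribute_names) -> Dict[str, float]:
    changes = {name: 0 for name in attribute_names}
    comparisons = 0
    for _, group_df in grouped_by_browser:
        # Each fingerprint is compared with the previous one of the browser.
        comparisons += max(len(group_df) - 1, 0)
        for name in attribute_names:
            changed = group_df[name].ne(group_df[name].shift()).iloc[1:]
            changes[name] += int(changed.sum())
    if comparisons == 0:
        return {name: 0.0 for name in attribute_names}
    return {name: changes[name] / comparisons for name in attribute_names}


df = pd.DataFrame({'browser_id': [1, 1, 1, 2, 2],
                   'ua': ['a', 'b', 'b', 'c', 'c']})
print(attributes_instability_sketch(df.groupby('browser_id'), ['ua']))
# {'ua': 0.333...}: one change ('a' -> 'b') over three consecutive pairs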
Example 2
 def test_in_between_entropy(self):
     self._attribute_set = AttributeSet([ATTRIBUTES[0]])
     expected_entropy = -1 * ((1/5)*log2(1/5) + (2/5)*log2(2/5)
                              + (2/5)*log2(2/5))
     self.check_entropy_result(expected_entropy)
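The expected value follows from the (assumed) DummyCleanDataset distribution where ATTRIBUTES[0] takes one value on a single browser and two other values on two browsers each, out of five. A quick sanity check of the arithmetic:

from math import log2

print(-((1/5)*log2(1/5) + 2 * (2/5)*log2(2/5)))  # ~1.5219 bits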
Example 3
class TestAttributeSetEntropy(unittest.TestCase):

    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._csv_result_path = CSV_RESULT_PATH

    def check_entropy_result(self, expected_entropy: float):
        maximum_entropy = log2(len(self._dataset.dataframe))
        attribute_set_entropy_analysis = AttributeSetEntropy(
            self._dataset, self._attribute_set)
        attribute_set_entropy_analysis.execute()
        analysis_result = attribute_set_entropy_analysis.result
        expected_result = {
            ENTROPY_RESULT: expected_entropy,
            MAXIMUM_ENTROPY_RESULT: maximum_entropy,
            NORMALIZED_ENTROPY_RESULT: expected_entropy/maximum_entropy
        }
        for result_name, expected_value in expected_result.items():
            self.assertAlmostEqual(analysis_result[result_name],
                                   expected_value)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_in_between_entropy(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        expected_entropy = -1 * ((1/5)*log2(1/5) + (2/5)*log2(2/5)
                                 + (2/5)*log2(2/5))
        self.check_entropy_result(expected_entropy)

    def test_always_the_same_value(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        self.check_entropy_result(0.0)

    def test_unique_values(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        maximum_entropy = log2(len(self._dataset.dataframe))
        self.check_entropy_result(maximum_entropy)

    def test_save_csv_result(self):
        attribute_set_entropy_analysis = AttributeSetEntropy(
            self._dataset, self._attribute_set)
        attribute_set_entropy_analysis.execute()
        attribute_set_entropy_analysis.save_csv_result(self._csv_result_path)
        remove(self._csv_result_path)
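A minimal sketch of the entropy these tests check, assuming Shannon entropy over the distribution of the attribute-set values with one fingerprint per browser. entropy_sketch is illustrative, not the AttributeSetEntropy analysis itself.

from math import log2

import pandas as pd


def entropy_sketch(dataframe: pd.DataFrame, columns: list) -> float:
    # Probability of each distinct combination of the selected columns.
    probabilities = dataframe.groupby(columns).size() / len(dataframe)
    return -sum(p * log2(p) for p in probabilities)


df = pd.DataFrame({'ua': ['a', 'b', 'b', 'c', 'c']})
print(entropy_sketch(df, ['ua']))  # ~1.5219, the in-between case above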
Example 4
 def test_evaluate_empty_attribute_set(self):
     empty_attr_cost, empty_attr_cost_explanation = (
          self._memory_instability_measure.evaluate(AttributeSet()))
     self.assertEqual(empty_attr_cost, 0.0)
     for cost_value in empty_attr_cost_explanation.values():
         self.assertEqual(cost_value, 0.0)
Example 5
 def test_empty_attribute_set(self):
     self._attribute_set = AttributeSet()
     with self.assertRaises(ValueError):
         self.check_entropy_result(WONT_COMPUTE)
Example 6
 def setUp(self):
     self._attribute_set = AttributeSet(ATTRIBUTES)
     self._dataset = DummyCleanDataset()
Example 7
 def setUp(self):
     self._dataset = DummyCleanDataset()
     self._dataframe = self._dataset.dataframe
     self._attributes = AttributeSet(ATTRIBUTES)
Example 8
 def test_unique_values(self):
     self._attribute_set = AttributeSet([ATTRIBUTES[1]])
     total_browsers = len(self._dataset.dataframe)
     self.check_unicity_result(total_browsers)
Example 9
def _get_attributes_entropy(
        dataset: FingerprintDataset,
        attributes: AttributeSet) -> Dict[Attribute, float]:
    """Give a dictionary with the entropy of each attribute.

    Args:
        dataset: The fingerprint dataset used to compute the entropy.
        attributes: The attributes for which we compute the entropy.

    Raises:
        ValueError: There are attributes and the fingerprint dataset is empty.
        KeyError: An attribute is not in the fingerprint dataset.

    Returns:
        A dictionary with each attribute (Attribute) and its entropy.
    """
    # Some checks before starting the exploration
    if attributes and dataset.dataframe.empty:
        raise ValueError('Cannot compute the entropy on an empty dataset.')
    for attribute in attributes:
        if attribute not in dataset.candidate_attributes:
            raise KeyError(f'The attribute {attribute} is not in the dataset.')

    # We will work on a dataset with only one fingerprint per browser to
    # avoid overcounting effects
    df_one_fp_per_browser = dataset.get_df_w_one_fp_per_browser()

    # If we execute on a single process
    if not params.getboolean('Multiprocessing', 'explorations'):
        logger.debug('Measuring the attributes entropy on a single process...')
        return _compute_attribute_entropy(df_one_fp_per_browser, attributes)

    # The dictionary to update when using multiprocessing
    logger.debug('Measuring the attributes entropy using multiprocessing...')
    attributes_entropy = {}

    # Infer the number of cores to use
    free_cores = params.getint('Multiprocessing', 'free_cores')
    nb_cores = max(cpu_count() - free_cores, 1)
    attributes_per_core = int(ceil(len(attributes) / nb_cores))
    logger.debug(f'Sharing {len(attributes)} attributes over '
                 f'{nb_cores}(+{free_cores}) cores, hence '
                 f'{attributes_per_core} attributes per core.')

    def update_attributes_entropy(attrs_entropy: Dict[Attribute, float]):
        """Update the complete dictionary attributes_entropy.

        Args:
            attrs_entropy: The dictionary containing the subset of the
                           results computed by a process.

        Note: This is executed by the main thread and does not pose any
              concurrency or synchronization problem.
        """
        for attribute, attribute_entropy in attrs_entropy.items():
            attributes_entropy[attribute] = attribute_entropy

    # Spawn a number of processes equal to the number of cores
    attributes_list = list(attributes)
    async_results = []
    with Pool(processes=nb_cores) as pool:
        for process_id in range(nb_cores):
            # Generate the candidate attributes for this process
            start_id = process_id * attributes_per_core
            end_id = (process_id + 1) * attributes_per_core
            attributes_subset = AttributeSet(attributes_list[start_id:end_id])

            async_result = pool.apply_async(_compute_attribute_entropy,
                                            args=(df_one_fp_per_browser,
                                                  attributes_subset),
                                            callback=update_attributes_entropy)
            async_results.append(async_result)

        # Wait for all the processes to finish (otherwise we would exit
        # before collecting their result)
        for async_result in async_results:
            async_result.wait()

    return attributes_entropy
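The chunking above hands each worker process a contiguous slice of the attribute list; the last slice may be shorter (or empty) when nb_cores does not divide the number of attributes evenly. A standalone illustration of the arithmetic, with local stand-in names:

from math import ceil

attributes_list = list(range(10))  # stand-in for 10 attributes
nb_cores = 4
attributes_per_core = int(ceil(len(attributes_list) / nb_cores))  # 3
chunks = [attributes_list[i * attributes_per_core:
                          (i + 1) * attributes_per_core]
          for i in range(nb_cores)]
print(chunks)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]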
Example 10
 def test_empty_dataset_and_empty_attribute_set(self):
     self._dataset = DummyEmptyDataset()
     self._attribute_set = AttributeSet()
     with self.assertRaises(ValueError):
         self.check_unicity_result(WONT_COMPUTE)
Example 11
 def test_in_between_unicity(self):
     self._attribute_set = AttributeSet([ATTRIBUTES[0]])
     self.check_unicity_result(1)
Example 12
class TestAttributeSetUnicity(unittest.TestCase):
    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._csv_result_path = CSV_RESULT_PATH

    def check_unicity_result(self, expected_unique_fps: int):
        total_browsers = len(self._dataset.dataframe)
        attribute_set_unicity_analysis = AttributeSetUnicity(
            self._dataset, self._attribute_set)
        attribute_set_unicity_analysis.execute()
        analysis_result = attribute_set_unicity_analysis.result
        expected_result = {
            UNIQUE_FPS_RESULT: expected_unique_fps,
            TOTAL_BROWSERS_RESULT: total_browsers,
            UNICITY_RATE_RESULT: expected_unique_fps / total_browsers
        }
        for result_name, expected_value in expected_result.items():
            self.assertAlmostEqual(analysis_result[result_name],
                                   expected_value)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_in_between_unicity(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        self.check_unicity_result(1)

    def test_always_the_same_value(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        self.check_unicity_result(0)

    def test_unique_values(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        total_browsers = len(self._dataset.dataframe)
        self.check_unicity_result(total_browsers)

    def test_save_csv_result(self):
        attribute_set_unicity_analysis = AttributeSetUnicity(
            self._dataset, self._attribute_set)
        attribute_set_unicity_analysis.execute()
        attribute_set_unicity_analysis.save_csv_result(self._csv_result_path)
        remove(self._csv_result_path)
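A minimal sketch of the statistic these tests check, assuming a fingerprint counts as unique when its combination of attribute values is shared by no other browser. unicity_sketch is illustrative, not the AttributeSetUnicity analysis itself.

import pandas as pd


def unicity_sketch(dataframe: pd.DataFrame, columns: list) -> int:
    # Size of each group of identical fingerprints; unique ones have size 1.
    group_sizes = dataframe.groupby(columns).size()
    return int((group_sizes == 1).sum())


df = pd.DataFrame({'ua': ['a', 'b', 'b', 'c', 'c']})
print(unicity_sketch(df, ['ua']))  # 1, matching test_in_between_unicity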
Example 13
 def setUp(self):
     self._dataset = DummyFingerprintDataset()
     self._attribute_set = AttributeSet(ATTRIBUTES)
     self._candidate_attributes = AttributeSet(ATTRIBUTES)
     self._most_common_fps = 3
Example 14
 def test_top_0_fingerprints(self):
     self._most_common_fps = 0
     for attribute in self._candidate_attributes:
         self._attribute_set = AttributeSet([attribute])
         self.check_top_k_fingerprints(0.0)
Example 15
 def test_best_conditional_entropic_attribute_empty_attribute_set(self):
     best_cond_ent_attr = _best_conditional_entropic_attribute(
         self._df_w_one_fp_per_browser,
         current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
         candidate_attributes=AttributeSet())
     self.assertIsNone(best_cond_ent_attr[0])
Example 16
    def _search_for_solution(self):
        """Search for a solution using the entropy-based exploration algorithm.

        This function has to
        - Set the best solution currently found (AttributeSet).
        - Update the set of the attribute sets that satisfy the sensitivity
          threshold (Set[AttributeSet]).
        - Update the list of the explored attribute sets, which is the trace
          of the execution. The information regarding an explored attribute
          set is stored as a dictionary with the following key/values:
          * time (float): The time elapsed since the start of the exploration,
                          in seconds (use timedelta.total_seconds()).
          * attributes (Set[int]): The set of the ids of the attributes.
          * sensitivity (float): The sensitivity of the attribute set.
          * usability_cost (float): The usability cost of the attribute set.
          * cost_explanation (Dict[str, float]): The explanation of the cost
                                                 of the attribute set.
          * state (State): The state of this attribute set (see State class).
        - Log the explored attribute sets for debugging purposes using loguru.

        Note:
            We use the ids of the attributes instead of their names to reduce
            the size of the trace in memory and when saved in JSON format.
        """
        # Get a dictionary of the entropy of each attribute
        logger.info('Computing the entropy of each attribute...')
        attributes_entropy = _get_attributes_entropy(
            self._dataset, self._dataset.candidate_attributes)
        entropy_compute_time = datetime.now() - self._start_time
        logger.info('Entropy of the attributes computed after '
                    f'{entropy_compute_time}.')

        # Take the attributes in decreasing order of their entropy
        attribute_set = AttributeSet()
        for attribute, _ in sort_dict_by_value(attributes_entropy,
                                               reverse=True):

            # Check the new attribute set that is obtained
            attribute_set.add(attribute)
            logger.debug(f'Exploring {attribute_set}...')

            # Compute its sensitivity and its cost
            sensitivity = self._sensitivity.evaluate(attribute_set)
            cost, cost_explanation = (
                self._usability_cost.evaluate(attribute_set))
            logger.debug(f'  Sensitivity ({sensitivity}), '
                         f'usability cost ({cost})')

            # If it satisfies the sensitivity threshold, quit the loop
            if sensitivity <= self._sensitivity_threshold:
                self._update_solution(attribute_set)
                self._add_satisfying_attribute_set(attribute_set)

                # Store this attribute set in the explored sets
                compute_time = str(datetime.now() - self._start_time)
                self._add_explored_attribute_set({
                    TraceData.TIME: compute_time,
                    TraceData.ATTRIBUTES: attribute_set.attribute_ids,
                    TraceData.SENSITIVITY: sensitivity,
                    TraceData.USABILITY_COST: cost,
                    TraceData.COST_EXPLANATION: cost_explanation,
                    TraceData.STATE: State.SATISFYING
                })

                # Quit the loop if we found a solution
                break

            # If it does not satisfy the sensitivity threshold, we continue
            compute_time = str(datetime.now() - self._start_time)
            self._add_explored_attribute_set({
                TraceData.TIME: compute_time,
                TraceData.ATTRIBUTES: attribute_set.attribute_ids,
                TraceData.SENSITIVITY: sensitivity,
                TraceData.USABILITY_COST: cost,
                TraceData.COST_EXPLANATION: cost_explanation,
                TraceData.STATE: State.EXPLORED
            })
Example 17
class TestGetBestConditionalEntropicAttribute(unittest.TestCase):

    def setUp(self):
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._dataset = DummyCleanDataset()

    def test_get_best_entropic_attribute(self):
        # The order is 1 (unique values), then 0 (some collisions), then
        # 2 (the same value for each browser)
        first_best = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet(),
            candidate_attributes=self._attribute_set)
        self.assertEqual(first_best, ATTRIBUTES[1])

        second_best = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet({ATTRIBUTES[1]}),
            candidate_attributes=self._attribute_set)
        self.assertEqual(second_best, ATTRIBUTES[0])

        third_best = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet({ATTRIBUTES[1],
                                                            ATTRIBUTES[0]}),
            candidate_attributes=self._attribute_set)
        self.assertEqual(third_best, ATTRIBUTES[2])

        no_more_available = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet({
                ATTRIBUTES[1], ATTRIBUTES[0], ATTRIBUTES[2]}),
            candidate_attributes=self._attribute_set)
        self.assertIsNone(no_more_available)

    def test_get_best_entropic_attribute_every_attribute_already_taken(self):
        result = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=self._attribute_set,
            candidate_attributes=self._attribute_set)
        self.assertIsNone(result)

    def test_get_best_entropic_attribute_empty_attribute_set(self):
        result = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet({ATTRIBUTES[0],
                                                            ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(result)

    def test_get_best_entropic_attribute_empty_dataset(self):
        empty_dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            _get_best_conditional_entropic_attribute(
                empty_dataset,
                current_attributes=AttributeSet({ATTRIBUTES[0],
                                                 ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)

    def test_get_best_entropic_attribute_empty_candidates_and_dataset(self):
        empty_dataset = DummyEmptyDataset()
        result = _get_best_conditional_entropic_attribute(
            empty_dataset, current_attributes=AttributeSet({ATTRIBUTES[0],
                                                            ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(result)

    def test_get_best_entropic_attribute_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            _get_best_conditional_entropic_attribute(
                self._dataset,
                current_attributes=AttributeSet({ATTRIBUTES[0],
                                                 ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)
Example 18
    def _search_for_solution(self):
        """Search for a solution using the entropy-based exploration algorithm.

        This function has to
        - Set the best solution currently found (AttributeSet).
        - Update the set of the attribute sets that satisfy the sensitivity
          threshold (Set[AttributeSet]).
        - Update the list of the explored attribute sets, which is the trace
          of the execution. The information regarding an explored attribute
          set is stored as a dictionary with the following key/values:
          * time (float): The time elapsed since the start of the exploration,
                          in seconds (use timedelta.total_seconds()).
          * attributes (Set[int]): The set of the ids of the attributes.
          * sensitivity (float): The sensitivity of the attribute set.
          * usability_cost (float): The usability cost of the attribute set.
          * cost_explanation (Dict[str, float]): The explanation of the cost
                                                 of the attribute set.
          * state (State): The state of this attribute set (see State class).
        - Log the explored attribute sets for debugging purposes using loguru.

        Note:
            We use the ids of the attributes instead of their names to reduce
            the size of the trace in memory and when saved in JSON format.
        """
        # The temporary solution (empty set) and the current sensitivity
        # (1.0, as an empty set is equivalent to using no browser
        # fingerprinting at all)
        temp_solution, sensitivity = AttributeSet(), 1.0

        # We already checked that the sensitivity threshold is reachable,
        # hence the exploration below always reaches it
        while sensitivity > self._sensitivity_threshold:

            # Find the attribute that has the highest conditional entropy
            best_cond_ent_attr = _get_best_conditional_entropic_attribute(
                self._dataset, temp_solution,
                self._dataset.candidate_attributes)

            # NOTE Removed as we already check that a solution exists before
            #      running the exploration. As a result, we always reach an
            #      attribute set that satisfies the sensitivity threshold, the
            #      complete set of the candidate attributes in the worst case.
            # If no more solution is proposed, end the exploration
            # if not best_cond_ent_attr:
            #     break

            # Add this attribute to the temporary solution
            temp_solution.add(best_cond_ent_attr)

            # Compute its sensitivity and its cost
            logger.debug(f'Exploring {temp_solution}...')
            sensitivity = self._sensitivity.evaluate(temp_solution)
            cost, cost_explanation = (
                self._usability_cost.evaluate(temp_solution))
            logger.debug(f'  Sensitivity ({sensitivity}), '
                         f'usability cost ({cost})')

            # If it satisfies the sensitivity threshold, quit the loop
            if sensitivity <= self._sensitivity_threshold:
                self._update_solution(temp_solution)
                attribute_set_state = State.SATISFYING
                self._add_satisfying_attribute_set(temp_solution)
            else:
                attribute_set_state = State.EXPLORED

            # Store this attribute set in the explored sets
            compute_time = str(datetime.now() - self._start_time)
            self._add_explored_attribute_set({
                TraceData.TIME: compute_time,
                TraceData.ATTRIBUTES: temp_solution.attribute_ids,
                TraceData.SENSITIVITY: sensitivity,
                TraceData.USABILITY_COST: cost,
                TraceData.COST_EXPLANATION: cost_explanation,
                TraceData.STATE: attribute_set_state
            })
Example 19
 def test_get_best_entropic_attribute_empty_attribute_set(self):
     result = _get_best_conditional_entropic_attribute(
         self._dataset, current_attributes=AttributeSet({ATTRIBUTES[0],
                                                         ATTRIBUTES[1]}),
         candidate_attributes=AttributeSet())
     self.assertIsNone(result)
Example 20
def _get_best_conditional_entropic_attribute(dataset: FingerprintDataset,
                                             current_attributes: AttributeSet,
                                             candidate_attributes: AttributeSet
                                             ) -> Attribute:
    """Get the attribute that provides the highest total entropy.

    When several attributes provide the same total entropy, the attribute with
    the lowest id is given. If no attribute increases the total entropy, we
    still return the attribute with the lowest id.

    Args:
        dataset: The dataset used to compute the conditional entropy.
        current_attributes: The attributes that compose the current solution.
        candidate_attributes: The candidate attributes (i.e., those available).

    Raises:
        ValueError: There are candidate attributes and the fingerprint dataset
                    is empty.
        KeyError: One of the candidate attributes is not in the fingerprint
                  dataset.

    Returns:
        The attribute that has the highest conditional entropy among the
        candidate attributes and that is not part of the current attributes.
    """
    logger.debug('Getting the best conditional entropic attribute from '
                 f'{current_attributes}...')

    # Some checks before starting the exploration
    if candidate_attributes and dataset.dataframe.empty:
        raise ValueError('Cannot compute the conditional entropy on an empty '
                         'dataset.')
    for attribute in candidate_attributes:
        if attribute not in dataset.candidate_attributes:
            raise KeyError(f'The attribute {attribute} is not in the dataset.')

    # We will work on a dataset with only one fingerprint per browser to
    # avoid overcounting effects
    df_one_fp_per_browser = dataset.get_df_w_one_fp_per_browser()

    # If we execute on a single process
    if not params.getboolean('Multiprocessing', 'explorations'):
        logger.debug('Measuring the attributes conditional entropy on a '
                     'single process...')
        best_attribute, best_total_ent = _best_conditional_entropic_attribute(
            df_one_fp_per_browser, current_attributes, candidate_attributes)
        logger.debug(f'  The best attribute is {best_attribute} for a total '
                     f'entropy of {best_total_ent}.')
        return best_attribute

    # The values to update through the search for the best attribute
    best_attribute_informations = {}
    logger.debug('Measuring the attributes conditional entropy using '
                 'multiprocessing...')

    # Infer the number of cores to use
    free_cores = params.getint('Multiprocessing', 'free_cores')
    nb_cores = max(cpu_count() - free_cores, 1)
    attributes_per_core = int(ceil(len(candidate_attributes)/nb_cores))
    logger.debug(f'Sharing {len(candidate_attributes)} candidate attributes '
                 f'over {nb_cores}(+{free_cores}) cores, hence '
                 f'{attributes_per_core} attributes per core.')

    def update_best_conditional_entropy_attribute(result: Tuple[Attribute,
                                                                float]):
        """Update the best conditional entropy attribute.

        Args:
            result: A tuple with the best attribute and the best total entropy.

        Note: This is executed by the main thread and does not pose any
              concurrency or synchronization problem.
        """
        best_attribute, best_total_entropy = result
        if best_attribute:  # To avoid the empty results which are None
            best_attribute_informations[best_attribute] = best_total_entropy

    # Spawn a number of processes equal to the number of cores
    candidate_attributes_list = list(candidate_attributes)
    async_results = []
    with Pool(processes=nb_cores) as pool:
        for process_id in range(nb_cores):
            # Generate the candidate attributes for this process
            start_id = process_id * attributes_per_core
            end_id = (process_id + 1) * attributes_per_core
            candidate_attributes_subset = AttributeSet(
                candidate_attributes_list[start_id:end_id])

            async_result = pool.apply_async(
                _best_conditional_entropic_attribute,
                args=(df_one_fp_per_browser, current_attributes,
                      candidate_attributes_subset),
                callback=update_best_conditional_entropy_attribute)
            async_results.append(async_result)

        # Wait for all the processes to finish (otherwise we would exit
        # before collecting their result)
        for async_result in async_results:
            async_result.wait()

    # Search for the best attribute among the local results. If several
    # provide the same total entropy, we return the attribute with the
    # lowest id.
    best_attribute, best_total_entropy = None, -float('inf')
    for attribute in sorted(best_attribute_informations):
        attribute_total_entropy = best_attribute_informations[attribute]
        if attribute_total_entropy > best_total_entropy:
            best_total_entropy = attribute_total_entropy
            best_attribute = attribute

    logger.debug(f'  The best attribute is {best_attribute} for a total '
                 f'entropy of {best_total_entropy}.')

    return best_attribute
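A hypothetical sketch of the per-process worker _best_conditional_entropic_attribute, assuming it returns the candidate (not already among the current attributes) that maximizes the total entropy of the extended attribute set, with ties broken towards the lowest id. Attributes are plain column names here for brevity; the real code manipulates Attribute objects and guards against the empty dataset beforehand.

from math import log2
from typing import List, Optional, Tuple

import pandas as pd


def best_conditional_entropic_attribute_sketch(
        df: pd.DataFrame, current: List[str], candidates: List[str]
        ) -> Tuple[Optional[str], float]:
    best_attribute, best_total_entropy = None, -float('inf')
    # Iterating in sorted order with a strict comparison keeps the first
    # (i.e., lowest) attribute on ties, mimicking the lowest-id rule.
    for attribute in sorted(set(candidates) - set(current)):
        columns = list(current) + [attribute]
        probabilities = df.groupby(columns).size() / len(df)
        total_entropy = -sum(p * log2(p) for p in probabilities)
        if total_entropy > best_total_entropy:
            best_attribute, best_total_entropy = attribute, total_entropy
    return best_attribute, best_total_entropy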
Example 21
 def test_empty_attributes(self):
     self._attributes = AttributeSet({})
     attributes_avg_size = _compute_attribute_avg_size(
         self._dataframe, self._attributes)
     expected_result = self._get_expected_result()
     self.assertDictEqual(expected_result, attributes_avg_size)
Example 22
def attribute_set_information(attribute_set_id: int):
    """Show information about an attribute set.

    Args:
        attribute_set_id: The id of the attribute set to show.
    """
    global TRACE_DATA
    global FINGERPRINT_DATASET
    global REAL_TIME_EXPLORATION
    logger.info('Getting the information about the attribute set '
                f'{attribute_set_id}.')

    # Check that there is an explored attribute set with this id in the
    # trace
    attribute_set_infos = None
    if attribute_set_id == -1:
        attribute_set_infos = EMPTY_NODE
    elif REAL_TIME_EXPLORATION:
        attribute_set_infos_list = (
            REAL_TIME_EXPLORATION.get_explored_attribute_sets(
                attribute_set_id, attribute_set_id + 1))
        if attribute_set_infos_list:
            attribute_set_infos = attribute_set_infos_list[0]
            attribute_set_infos['id'] = attribute_set_id
    elif TRACE_DATA:
        for explored_attr_set in TRACE_DATA['exploration']:
            if explored_attr_set['id'] == attribute_set_id:
                attribute_set_infos = explored_attr_set
                break
    else:
        error_message = ('Accessing the attribute set information page '
                         'requires a trace or a real time exploration to be '
                         'set.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    if not attribute_set_infos:
        error_message = (f'The attribute set id {attribute_set_id} was not'
                         ' found.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    # Generate the attribute set object and get the names of these attributes
    if REAL_TIME_EXPLORATION:
        attributes = AttributeSet(
            FINGERPRINT_DATASET.candidate_attributes.get_attribute_by_id(
                attribute_id)
            for attribute_id in attribute_set_infos['attributes'])
    elif TRACE_DATA:
        attributes = AttributeSet(
            Attribute(attribute_id, TRACE_DATA['attributes'][str(
                attribute_id)])
            for attribute_id in attribute_set_infos['attributes'])
    attribute_names = [attribute.name for attribute in attributes]

    # If there is a fingerprint dataset, compute the additional/optional
    # results from it (the subset for now)
    fingerprint_sample = None
    if attribute_set_id == -1:
        pass  # Avoid trying to get the subset with an empty attribute set
    elif FINGERPRINT_DATASET:
        # Collect a sample of the resulting fingerprints
        attr_subset_sample = AttributeSetSample(
            FINGERPRINT_DATASET, attributes,
            params.getint('WebServer', 'fingerprint_sample_size'))
        attr_subset_sample.execute()
        fingerprint_sample = attr_subset_sample.result
    else:
        flash(
            'Please provide a fingerprint dataset to obtain more insight on '
            'the selected attributes',
            params.get('WebServer', 'flash_info_class'))

    # Compute the textual representation of the state of this attribute set
    attribute_set_state = None
    if attribute_set_infos['state'] == State.EXPLORED:
        attribute_set_state = 'Explored'
    elif attribute_set_infos['state'] == State.PRUNED:
        attribute_set_state = 'Pruned'
    elif attribute_set_infos['state'] == State.SATISFYING:
        attribute_set_state = 'Satisfying the threshold'
    elif attribute_set_infos['state'] == State.EMPTY_NODE:
        attribute_set_state = 'Starting empty node'

    # Prepare a dictionary with the cost percentage of each dimension
    # { cost dimension => (bootstrap progress bar class,  # for pretty display
    #                      percentage of the cost of the candidate attributes)
    # }
    usability_cost_ratio = {}
    if REAL_TIME_EXPLORATION:
        candidate_attributes_infos = (
            REAL_TIME_EXPLORATION.get_explored_attribute_sets(0, 1)[0])
    elif TRACE_DATA:
        candidate_attributes_infos = TRACE_DATA['exploration'][0]
    bootstrap_progess_bars = (params.get(
        'WebServer', 'bootstrap_progess_bars').splitlines())

    # The total usability cost
    cost_percentage = (100 * attribute_set_infos['usability_cost'] /
                       candidate_attributes_infos['usability_cost'])
    usability_cost_ratio['usability'] = (bootstrap_progess_bars[0],
                                         '%.2f' % cost_percentage)

    if attribute_set_id > -1:
        # For each cost dimension except the "weighted" ones
        can_attrs_cost_explanation = candidate_attributes_infos[
            'cost_explanation']
        progress_bar_class_id = 1  # 0 already taken
        for cost_dimension, cost_value in can_attrs_cost_explanation.items():
            if cost_dimension.startswith('weighted'):
                continue
            cost_percentage = (
                100 * attribute_set_infos['cost_explanation'][cost_dimension] /
                cost_value)
            usability_cost_ratio[cost_dimension] = (
                bootstrap_progess_bars[progress_bar_class_id %
                                       len(bootstrap_progess_bars)],
                '%.2f' % cost_percentage)
            progress_bar_class_id += 1

    # Display the attribute information page
    return render_template('attribute-set-information.html',
                           attribute_set_infos=attribute_set_infos,
                           attribute_names=attribute_names,
                           attribute_set_state=attribute_set_state,
                           usability_cost_ratio=usability_cost_ratio,
                           fingerprint_sample=fingerprint_sample,
                           javascript_parameters=params)
Example 23
 def setUp(self):
     self._dataset = DummyCleanDataset()
     self._df_one_fp_per_browser = (
         self._dataset.get_df_w_one_fp_per_browser())
     self._attribute_set = AttributeSet(ATTRIBUTES)
Example 24
def attribute_set_unicity(attribute_set_id: int):
    """Provide the results about the unicity of an attribute set.

    Args:
        attribute_set_id: The id of the attribute set for which to provide the
                          unicity results.
    """
    global TRACE_DATA
    global FINGERPRINT_DATASET
    global REAL_TIME_EXPLORATION
    logger.info('Getting the unicity results of the attribute set '
                f'{attribute_set_id}.')

    # Check that there is an explored attribute set with this id in the trace
    attribute_set_infos = None
    if REAL_TIME_EXPLORATION:
        attribute_set_infos_list = (
            REAL_TIME_EXPLORATION.get_explored_attribute_sets(
                attribute_set_id, attribute_set_id + 1))
        if attribute_set_infos_list:
            attribute_set_infos = attribute_set_infos_list[0]
            attribute_set_infos['id'] = attribute_set_id
    elif TRACE_DATA:
        for explored_attr_set in TRACE_DATA['exploration']:
            if explored_attr_set['id'] == attribute_set_id:
                attribute_set_infos = explored_attr_set
                break
    else:
        error_message = ('Accessing the attribute set unicity page requires a '
                         'trace or a real time exploration to be set.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    if not FINGERPRINT_DATASET:
        error_message = 'No fingerprint dataset is set.'
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    if not attribute_set_infos:
        error_message = (f'The attribute set id {attribute_set_id} was not '
                         'found.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    # Generate the attribute set object
    if REAL_TIME_EXPLORATION:
        attributes = AttributeSet(
            FINGERPRINT_DATASET.candidate_attributes.get_attribute_by_id(
                attribute_id)
            for attribute_id in attribute_set_infos['attributes'])
    elif TRACE_DATA:
        attributes = AttributeSet(
            Attribute(attribute_id, TRACE_DATA['attributes'][str(
                attribute_id)])
            for attribute_id in attribute_set_infos['attributes'])

    # Compute the unicity of the resulting fingerprints
    attr_set_unicity = AttributeSetUnicity(FINGERPRINT_DATASET, attributes)
    attr_set_unicity.execute()
    unicity_result = attr_set_unicity.result

    # Cast the results to native Python types, as the raw values are not
    # JSON serializable
    unicity_result[UNICITY_RATE_RESULT] = float(
        unicity_result[UNICITY_RATE_RESULT])
    unicity_result[UNIQUE_FPS_RESULT] = int(unicity_result[UNIQUE_FPS_RESULT])
    unicity_result[TOTAL_BROWSERS_RESULT] = int(
        unicity_result[TOTAL_BROWSERS_RESULT])

    # Return the json version of these results
    return jsonify(unicity_result)
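The casts above are needed because pandas aggregations typically produce numpy scalars, which the standard json encoder rejects (an assumption about the result types, shown on a toy value):

import json

import numpy as np

print(json.dumps({'unique_fps': int(np.int64(3))}))  # works
# json.dumps({'unique_fps': np.int64(3)}) raises TypeError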
Example 25
 def test_always_the_same_value(self):
     self._attribute_set = AttributeSet([ATTRIBUTES[2]])
     self.check_entropy_result(0.0)
Example 26
 def _set_candidate_attributes(self):
     self._candidate_attributes = AttributeSet()
Example 27
 def test_unique_values(self):
     self._attribute_set = AttributeSet([ATTRIBUTES[1]])
     expected_entropy = log2(len(self._dataset.dataframe))
     self.check_entropy_result(expected_entropy)
Example 28
 def _set_candidate_attributes(self):
     self._candidate_attributes = AttributeSet(ATTRIBUTES)
Example 29
 def setUp(self):
     self._dataset = DummyCleanDataset()
     self._attribute_set = AttributeSet(ATTRIBUTES)
     self._csv_result_path = CSV_RESULT_PATH
Example 30
 def test_third_attribute_only(self):
     self._attribute_set = AttributeSet([ATTRIBUTES[2]])
     possible_values = set(product(self._dataset.DATAS[ATTRIBUTES[2].name]))
     self.check_sample_result(possible_values)
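Note that itertools.product over a single iterable yields 1-tuples, so possible_values here is a set of 1-tuples, which presumably matches the fingerprint format expected by check_sample_result:

from itertools import product

print(set(product(['a', 'b'])))  # {('a',), ('b',)}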