Code example #1
class FingerprintDatasetFromCSVInMemory(FingerprintDataset):
    """A fingerprint dataset read from a in memory csv file."""
    def __init__(self, file_handle: TextIOWrapper):
        """Initialize with the file handle of the in memory csv file.

        Args:
            file_handle: The file handle of the in memory csv file.
        """
        self._file_handle = file_handle
        super().__init__()

    def _process_dataset(self):
        """Process the dataset to obtain a DataFrame from the file.

        - The resulting fingerprint dataset is stored in self._dataframe.
        - The fingerprint dataset has to be a DataFrame with the two
          indices being browser_id (int64) and time_of_collect (datetime64).
        - The columns are named after the attributes and have the value
          collected for the browser browser_id at the time time_of_collect.
        - The name of each column should correspond to the attribute.name
          property of an attribute of the candidate attributes.

        This implementation generates a DataFrame from the CSV stored in
        memory with the two indices set.
        """
        # Read the dataframe from the in-memory CSV file; the
        # 'time_of_collect' column is parsed as a date below
        self._dataframe = pd.read_csv(self._file_handle, index_col=False)

        # Check that the necessary metadata fields are present
        for required_metadata in MetadataField.ALL:
            if required_metadata not in self._dataframe:
                raise MissingMetadatasFields(
                    f'The required metadata field {required_metadata} is '
                    'missing from the dataset.')

        # Format the indices
        self._dataframe[MetadataField.TIME_OF_COLLECT] = pd.to_datetime(
            self._dataframe[MetadataField.TIME_OF_COLLECT])

        # Set the indices as 'browser_id' and 'time_of_collect'
        self._dataframe.set_index(
            [MetadataField.BROWSER_ID, MetadataField.TIME_OF_COLLECT],
            inplace=True)

        # Remove the file handle as it is not needed anymore and cannot be
        # pickled
        del self._file_handle

    def _set_candidate_attributes(self):
        """Set the candidate attributes.

        This implementation generates the candidate attributes from the columns
        of the DataFrame, ignoring the browser_id and time_of_collect fields.
        """
        self._candidate_attributes = AttributeSet()
        for column_id, column in enumerate(self._dataframe.columns, 1):
            attribute = Attribute(column_id, column)
            self._candidate_attributes.add(attribute)
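A minimal, self-contained sketch of the DataFrame that _process_dataset builds, using plain pandas on a hypothetical two-row in-memory CSV (the column names mirror the MetadataField constants used above):

import io

import pandas as pd

# A tiny in-memory CSV with the two metadata fields and one attribute.
csv_content = ('browser_id,time_of_collect,user_agent\n'
               '1,2021-05-03 10:00:00,Firefox\n'
               '2,2021-05-04 11:30:00,Chrome\n')
dataframe = pd.read_csv(io.StringIO(csv_content), index_col=False)
dataframe['time_of_collect'] = pd.to_datetime(dataframe['time_of_collect'])
dataframe.set_index(['browser_id', 'time_of_collect'], inplace=True)
print(dataframe)  # One 'user_agent' column indexed by the two metadata fields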
Code example #2
class FingerprintDatasetFromFile(FingerprintDataset):
    """A fingerprint dataset read from a file."""
    def __init__(self, dataset_path: str):
        """Initialize the FingerprintDataset with the path to the dataset.

        Args:
            dataset_path: The path to the fingerprint dataset.

        Raises:
            FileNotFoundError: There is no file at the given dataset path.
        """
        if not path.isfile(dataset_path):
            raise FileNotFoundError(f'The dataset file at {dataset_path} was '
                                    'not found.')
        self._dataset_path = dataset_path
        super().__init__()

    def __repr__(self) -> str:
        """Provide a string representation of this fingerprint dataset.

        Returns:
            A string representation of this fingerprint dataset.
        """
        return f'{self.__class__.__name__}({self._dataset_path})'

    def _process_dataset(self):
        """Process the dataset to obtain a DataFrame from the file.

        - The resulting fingerprint dataset is stored in self._dataframe.
        - The fingerprint dataset has to be a DataFrame with the two
          indices being browser_id (int64) and time_of_collect (datetime64).
        - The columns are named after the attributes and have the value
          collected for the browser browser_id at the time time_of_collect.
        - The name of each column should correspond to the attribute.name
          property of an attribute of the candidate attributes.

        Raises:
            NotImplementedError: This abstract method is not defined.
        """
        raise NotImplementedError

    def _set_candidate_attributes(self):
        """Set the candidate attributes.

        This implementation generates the candidate attributes from the columns
        of the DataFrame, ignoring the browser_id and time_of_collect fields.
        """
        self._candidate_attributes = AttributeSet()
        for column_id, column in enumerate(self._dataframe.columns, 1):
            attribute = Attribute(column_id, column)
            self._candidate_attributes.add(attribute)

    @property
    def dataset_path(self) -> str:
        """Give the path to the fingerprint dataset.

        Returns:
            The path to the fingerprint dataset.
        """
        return self._dataset_path
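FingerprintDatasetFromFile leaves _process_dataset abstract. Below is a minimal sketch of a concrete subclass reading a CSV file from disk, assuming the pd and MetadataField imports of code example #1; the class name is hypothetical and the real project may provide its own variant:

class FingerprintDatasetFromCSVFile(FingerprintDatasetFromFile):
    """A hypothetical subclass reading the dataset from a CSV file on disk."""

    def _process_dataset(self):
        # Read the CSV file from the stored dataset path, parse the
        # 'time_of_collect' column as a date, and set the two indices
        # required by the base class.
        self._dataframe = pd.read_csv(self._dataset_path, index_col=False)
        self._dataframe[MetadataField.TIME_OF_COLLECT] = pd.to_datetime(
            self._dataframe[MetadataField.TIME_OF_COLLECT])
        self._dataframe.set_index(
            [MetadataField.BROWSER_ID, MetadataField.TIME_OF_COLLECT],
            inplace=True)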
Code example #3
class TestBestConditionalEntropic(unittest.TestCase):

    def setUp(self):
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._dataset = DummyCleanDataset()
        self._df_w_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())

    def test_best_conditional_entropic_attribute(self):
        # ATTRIBUTES[1] is taken first as it has unique values, which is
        # sufficient on its own
        best_cond_ent_attr = _best_conditional_entropic_attribute(
            self._df_w_one_fp_per_browser, current_attributes=AttributeSet(),
            candidate_attributes=self._attribute_set)
        self.assertEqual(best_cond_ent_attr[0], ATTRIBUTES[1])

    def test_best_conditional_entropic_attribute_all_taken(self):
        best_cond_ent_attr = _best_conditional_entropic_attribute(
            self._df_w_one_fp_per_browser,
            current_attributes=self._attribute_set,
            candidate_attributes=self._attribute_set)
        self.assertIsNone(best_cond_ent_attr[0])

    def test_best_conditional_entropic_attribute_empty_attribute_set(self):
        best_cond_ent_attr = _best_conditional_entropic_attribute(
            self._df_w_one_fp_per_browser,
            current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(best_cond_ent_attr[0])

    def test_best_conditional_entropic_attribute_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        self._df_w_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())
        with self.assertRaises(ValueError):
            _best_conditional_entropic_attribute(
                self._df_w_one_fp_per_browser,
                current_attributes=AttributeSet({ATTRIBUTES[0],
                                                 ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)

    def test_best_conditional_entropic_attribute_empty_parameters(self):
        self._dataset = DummyEmptyDataset()
        self._df_w_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())
        best_cond_ent_attr = _best_conditional_entropic_attribute(
            self._df_w_one_fp_per_browser,
            current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(best_cond_ent_attr[0])

    def test_best_conditional_entropic_attribute_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            _best_conditional_entropic_attribute(
                self._df_w_one_fp_per_browser,
                current_attributes=AttributeSet({ATTRIBUTES[0],
                                                 ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)
Code example #4
File: test_attribute.py Project: tandriamil/BrFAST
    def test_add_new_attribute(self):
        new_attr_set = AttributeSet({self._user_agent})
        self.assertEqual(1, len(new_attr_set))
        new_attribute = ATTRIBUTES[2]
        new_attr_set.add(new_attribute)
        self.assertEqual(2, len(new_attr_set))
        self.assertIn(self._user_agent, new_attr_set)
        self.assertIn(new_attribute, new_attr_set)
Code example #5
File: fpselect.py Project: tandriamil/BrFAST
def _expand_attribute_sets(attr_sets_to_expand: List[AttributeSet],
                           candidate_attributes: AttributeSet,
                           satisfying_attribute_sets: Set[AttributeSet],
                           attribute_sets_ignored_supersets: Set[AttributeSet],
                           use_pruning_methods: bool) -> Set[AttributeSet]:
    """Expand a subset of the attribute sets to expand.

    Args:
        attr_sets_to_expand: The attribute sets to expand.
        candidate_attributes: The complete set of the candidate attributes.
        satisfying_attribute_sets: The attribute sets that satisfy the
                                   sensitivity threshold.
        attribute_sets_ignored_supersets: The attribute sets for which to
                                          ignore their supersets.
        use_pruning_methods: Whether we use the pruning methods or not.

    Returns:
        The set of the next attribute sets to explore.
    """
    next_attr_sets_to_explore = set()

    # Generate the attr. sets composed of S_i with one more attr.
    # For all S_i in S
    for set_to_expand in attr_sets_to_expand:
        # For all a in A that are not in S_i
        for attribute in candidate_attributes:
            if attribute in set_to_expand:
                continue

            # The attr. set C with one more attribute (S_i union {a})
            new_attr_set = AttributeSet(set_to_expand)
            new_attr_set.add(attribute)
            add_new_attr_set = True

            # Ignore C if it is a superset of an attr. set of T
            for attr_set_sat in satisfying_attribute_sets:
                if new_attr_set.issuperset(attr_set_sat):
                    add_new_attr_set = False
                    break

            # Ignore C if we use the pruning methods and it is a superset
            # of an attr. set whose supersets are to be ignored
            if use_pruning_methods and add_new_attr_set:
                for attr_set_to_ign in attribute_sets_ignored_supersets:
                    if new_attr_set.issuperset(attr_set_to_ign):
                        add_new_attr_set = False
                        break

            # If C is fine, it is added to the attr. sets to explore
            if add_new_attr_set:
                next_attr_sets_to_explore.add(new_attr_set)

    return next_attr_sets_to_explore
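The superset pruning above is the heart of the expansion step. Here is a self-contained sketch of the same logic using plain frozensets of hypothetical attribute ids in place of AttributeSet objects:

from typing import FrozenSet, List, Set

def expand(sets_to_expand: List[FrozenSet[int]],
           candidates: FrozenSet[int],
           satisfying: Set[FrozenSet[int]],
           ignored_supersets: Set[FrozenSet[int]]) -> Set[FrozenSet[int]]:
    """Expand each set with one more attribute, pruning the supersets."""
    result = set()
    for set_to_expand in sets_to_expand:
        for attribute in candidates - set_to_expand:
            new_set = set_to_expand | {attribute}
            # Skip the supersets of satisfying or pruned attribute sets.
            if any(new_set >= other
                   for other in satisfying | ignored_supersets):
                continue
            result.add(new_set)
    return result

# {1} can grow to {1, 2} or {1, 3}; {1, 3} is pruned as a superset of {3}.
print(expand([frozenset({1})], frozenset({1, 2, 3}),
             {frozenset({3})}, set()))  # {frozenset({1, 2})}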
Code example #6
class TestAttributeSetEntropyFunction(unittest.TestCase):

    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._df_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())
        self._attribute_set = AttributeSet(ATTRIBUTES)

    def check_entropy_result(self, expected_entropy: float):
        computed_entropy = attribute_set_entropy(self._df_one_fp_per_browser,
                                                 self._attribute_set)
        self.assertAlmostEqual(expected_entropy, computed_entropy)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._df_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        self._df_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_always_the_same_value(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        self.check_entropy_result(0.0)

    def test_in_between_entropy(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        expected_entropy = -1 * ((1/5)*log2(1/5) + (2/5)*log2(2/5)
                                 + (2/5)*log2(2/5))
        self.check_entropy_result(expected_entropy)

    def test_unique_values(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        expected_entropy = log2(len(self._dataset.dataframe))
        self.check_entropy_result(expected_entropy)
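The value expected by test_in_between_entropy is the Shannon entropy of a five-browser column whose value counts are 1, 2 and 2. A minimal pandas sketch of that computation, with hypothetical values:

from math import log2

import pandas as pd

# Five fingerprints over one attribute: one value seen once, two seen twice.
column = pd.Series(['a', 'b', 'b', 'c', 'c'])
probabilities = column.value_counts(normalize=True)
entropy = -sum(p * log2(p) for p in probabilities)
print(entropy)  # -((1/5)*log2(1/5) + 2*(2/5)*log2(2/5)), about 1.522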
Code example #7
def _best_conditional_entropic_attribute(df_one_fp_per_browser: pd.DataFrame,
                                         current_attributes: AttributeSet,
                                         candidate_attributes: AttributeSet
                                         ) -> Tuple[Attribute, float]:
    """Get the best conditional entropic attribute among the candidates.

    Args:
        df_one_fp_per_browser: The dataframe with only one fingerprint per
                               browser.
        current_attributes: The attributes that compose the current solution.
        candidate_attributes: The candidate attributes for this process to
                              check.

    Returns:
        A tuple with the best attribute for this process and the total entropy
        when adding this attribute to the current attributes.
    """
    best_local_attribute, best_local_total_entropy = None, -float('inf')
    for attribute in candidate_attributes:

        # Ignore the attributes that are already in the current attribute set
        if attribute in current_attributes:
            continue

        # Generate a new attribute set with this attribute
        attribute_set = AttributeSet(current_attributes)
        attribute_set.add(attribute)

        # Evaluate the entropy of this new attribute set and keep it if
        # it is the best one found so far
        attr_set_entropy = attribute_set_entropy(df_one_fp_per_browser,
                                                 attribute_set)
        if attr_set_entropy > best_local_total_entropy:
            best_local_attribute = attribute
            best_local_total_entropy = attr_set_entropy

    return (best_local_attribute, best_local_total_entropy)
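The attribute_set_entropy function called above is not shown in these examples. A plausible self-contained sketch of what it computes, namely the joint Shannon entropy of the fingerprints projected on the attribute set, written with plain column names instead of Attribute objects:

from math import log2
from typing import List

import pandas as pd

def joint_entropy(df_one_fp_per_browser: pd.DataFrame,
                  columns: List[str]) -> float:
    """Shannon entropy of the fingerprints projected on the given columns."""
    if df_one_fp_per_browser.empty or not columns:
        raise ValueError('Cannot compute the entropy of an empty input.')
    # A missing column raises a KeyError, as in the tests above.
    counts = df_one_fp_per_browser.groupby(columns).size()
    probabilities = counts / len(df_one_fp_per_browser)
    return -sum(p * log2(p) for p in probabilities)

df = pd.DataFrame({'ua': ['f', 'f', 'c'], 'tz': [1, 2, 2]})
print(joint_entropy(df, ['ua', 'tz']))  # log2(3): the three rows are distinct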
Code example #8
File: test_attribute.py Project: tandriamil/BrFAST
    def test_add_new_attribute_already_present(self):
        new_attr_set = AttributeSet({self._user_agent, self._timezone})
        with self.assertRaises(DuplicateAttributeId):
            new_attr_set.add(self._timezone)
Code example #9
class TestAttributeSetEntropy(unittest.TestCase):

    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._csv_result_path = CSV_RESULT_PATH

    def check_entropy_result(self, expected_entropy: float):
        maximum_entropy = log2(len(self._dataset.dataframe))
        attribute_set_entropy_analysis = AttributeSetEntropy(
            self._dataset, self._attribute_set)
        attribute_set_entropy_analysis.execute()
        analysis_result = attribute_set_entropy_analysis.result
        expected_result = {
            ENTROPY_RESULT: expected_entropy,
            MAXIMUM_ENTROPY_RESULT: maximum_entropy,
            NORMALIZED_ENTROPY_RESULT: expected_entropy/maximum_entropy
        }
        for result_name, expected_value in expected_result.items():
            self.assertAlmostEqual(analysis_result[result_name],
                                   expected_value)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_in_between_entropy(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        expected_entropy = -1 * ((1/5)*log2(1/5) + (2/5)*log2(2/5)
                                 + (2/5)*log2(2/5))
        self.check_entropy_result(expected_entropy)

    def test_always_the_same_value(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        self.check_entropy_result(0.0)

    def test_unique_values(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        maximum_entropy = log2(len(self._dataset.dataframe))
        self.check_entropy_result(maximum_entropy)

    def test_save_csv_result(self):
        attribute_set_entropy_analysis = AttributeSetEntropy(
            self._dataset, self._attribute_set)
        attribute_set_entropy_analysis.execute()
        attribute_set_entropy_analysis.save_csv_result(self._csv_result_path)
        remove(self._csv_result_path)
Code example #10
class TestGetBestConditionalEntropicAttribute(unittest.TestCase):

    def setUp(self):
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._dataset = DummyCleanDataset()

    def test_get_best_entropic_attribute(self):
        # The order is 1 (unique values), then 0 (some collisions), then
        # 2 (the same value for each browser)
        first_best = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet(),
            candidate_attributes=self._attribute_set)
        self.assertEqual(first_best, ATTRIBUTES[1])

        second_best = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet({ATTRIBUTES[1]}),
            candidate_attributes=self._attribute_set)
        self.assertEqual(second_best, ATTRIBUTES[0])

        third_best = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet({ATTRIBUTES[1],
                                                            ATTRIBUTES[0]}),
            candidate_attributes=self._attribute_set)
        self.assertEqual(third_best, ATTRIBUTES[2])

        no_more_available = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet({
                ATTRIBUTES[1], ATTRIBUTES[0], ATTRIBUTES[2]}),
            candidate_attributes=self._attribute_set)
        self.assertIsNone(no_more_available)

    def test_get_best_entropic_attribute_every_attribute_already_taken(self):
        result = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=self._attribute_set,
            candidate_attributes=self._attribute_set)
        self.assertIsNone(result)

    def test_get_best_entropic_attribute_empty_attribute_set(self):
        result = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet({ATTRIBUTES[0],
                                                            ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(result)

    def test_get_best_entropic_attribute_empty_dataset(self):
        empty_dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            _get_best_conditional_entropic_attribute(
                empty_dataset,
                current_attributes=AttributeSet({ATTRIBUTES[0],
                                                 ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)

    def test_get_best_entropic_attribute_empty_candidates_and_dataset(self):
        empty_dataset = DummyEmptyDataset()
        result = _get_best_conditional_entropic_attribute(
            empty_dataset, current_attributes=AttributeSet({ATTRIBUTES[0],
                                                            ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(result)

    def test_get_best_entropic_attribute_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            _get_best_conditional_entropic_attribute(
                self._dataset,
                current_attributes=AttributeSet({ATTRIBUTES[0],
                                                 ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)
Code example #11
    def _search_for_solution(self):
        """Search for a solution using the entropy-based exploration algorithm.

        This function has to
        - Set the best solution currently found (AttributeSet).
        - Update the set of the attribute sets that satisfy the sensitivity
          threshold (Set[AttributeSet]).
        - Update the list of the explored attributes which is the trace of the
          execution. The information regarding an explored attribute is stored
          as a dictionary with the following key/values:
          * time (float): The time spent since the start of the exploration
                          in seconds (use timedelta.total_seconds()).
          * attributes (Set[int]): The set of the ids of the attributes.
          * sensitivity (float): The sensitivity of the attribute set.
          * usability_cost (float): The usability cost of the attribute set.
          * cost_explanation (Dict[str, float]): The explanation of the cost
                                                 of the attribute set.
          * state (State): The state of this attribute set (see State class).
        - Log the explored attribute sets for debugging purposes using loguru.

        Note:
            We use the ids of the attributes instead of their name to reduce
            the size of the trace in memory and when saved in json format.
        """
        # The temporary solution (empty set) and the current sensitivity (1.0
        # as it is equivalent to no browser fingerprinting used at all)
        temp_solution, sensitivity = AttributeSet(), 1.0

        # We already checked that the sensitivity threshold is reachable,
        # hence the exploration always reaches it
        while sensitivity > self._sensitivity_threshold:

            # Find the attribute that has the highest conditional entropy
            best_cond_ent_attr = _get_best_conditional_entropic_attribute(
                self._dataset, temp_solution,
                self._dataset.candidate_attributes)

            # NOTE Removed as we already check that a solution exists before
            #      running the exploration. As a result, we always reach an
            #      attribute set that satisfies the sensitivity threshold, the
            #      complete set of the candidate attributes in the worst case.
            # If no more solution is proposed, end the exploration
            # if not best_cond_ent_attr:
            #     break

            # Add this attribute to the temporary solution
            temp_solution.add(best_cond_ent_attr)

            # Compute its sensitivity and its cost
            logger.debug(f'Exploring {temp_solution}...')
            sensitivity = self._sensitivity.evaluate(temp_solution)
            cost, cost_explanation = (
                self._usability_cost.evaluate(temp_solution))
            logger.debug(f'  Sensitivity ({sensitivity}), '
                         f'usability cost ({cost})')

            # If it satisfies the sensitivity threshold, quit the loop
            if sensitivity <= self._sensitivity_threshold:
                self._update_solution(temp_solution)
                attribute_set_state = State.SATISFYING
                self._add_satisfying_attribute_set(temp_solution)
            else:
                attribute_set_state = State.EXPLORED

            # Store this attribute set in the explored sets
            compute_time = (datetime.now() - self._start_time).total_seconds()
            self._add_explored_attribute_set({
                TraceData.TIME: compute_time,
                TraceData.ATTRIBUTES: temp_solution.attribute_ids,
                TraceData.SENSITIVITY: sensitivity,
                TraceData.USABILITY_COST: cost,
                TraceData.COST_EXPLANATION: cost_explanation,
                TraceData.STATE: attribute_set_state
            })
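The loop above reduces to a small greedy skeleton: pick the attribute maximizing the conditional entropy until the sensitivity threshold is reached. A toy sketch with hypothetical stand-ins for the BrFAST components, where each picked attribute lowers the sensitivity by 0.3:

def greedy_select(candidates, pick_best, sensitivity_of, threshold):
    """Grow a solution until its sensitivity drops below the threshold."""
    solution = set()
    while sensitivity_of(solution) > threshold:
        solution.add(pick_best(solution, candidates))
    return solution

print(sorted(greedy_select(
    ['ua', 'tz', 'lang'],
    lambda solution, candidates: next(a for a in candidates
                                      if a not in solution),
    lambda solution: 1.0 - 0.3 * len(solution),
    0.5)))  # ['tz', 'ua']: the sensitivity goes 1.0 -> 0.7 -> 0.4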
Code example #12
File: entropy.py Project: tandriamil/BrFAST
    def _search_for_solution(self):
        """Search for a solution using the entropy-based exploration algorithm.

        This function has to
        - Set the best solution currently found (AttributeSet).
        - Update the set of the attribute sets that satisfy the sensitivity
          threshold (Set[AttributeSet]).
        - Update the list of the explored attributes which is the trace of the
          execution. The information regarding an explored attribute is stored
          as a dictionary with the following key/values:
          * time (float): The time spent since the start of the exploration
                          in seconds (use timedelta.total_seconds()).
          * attributes (Set[int]): The set of the ids of the attributes.
          * sensitivity (float): The sensitivity of the attribute set.
          * usability_cost (float): The usability cost of the attribute set.
          * cost_explanation (Dict[str, float]): The explanation of the cost
                                                 of the attribute set.
          * state (State): The state of this attribute set (see State class).
        - Log the explored attribute sets for debugging purposes using loguru.

        Note:
            We use the ids of the attributes instead of their name to reduce
            the size of the trace in memory and when saved in json format.
        """
        # Get a dictionary of the entropy of each attribute
        logger.info('Computing the entropy of each attribute...')
        attributes_entropy = _get_attributes_entropy(
            self._dataset, self._dataset.candidate_attributes)
        entropy_compute_time = datetime.now() - self._start_time
        logger.info('Entropy of the attributes computed after '
                    f'{entropy_compute_time}.')

        # Take the attributes in the order of their entropy
        attribute_set = AttributeSet()
        for attribute, _ in sort_dict_by_value(attributes_entropy,
                                               reverse=True):

            # Check the new attribute set that is obtained
            attribute_set.add(attribute)
            logger.debug(f'Exploring {attribute_set}...')

            # Compute its sensitivity and its cost
            sensitivity = self._sensitivity.evaluate(attribute_set)
            cost, cost_explanation = (
                self._usability_cost.evaluate(attribute_set))
            logger.debug(f'  Sensitivity ({sensitivity}), '
                         f'usability cost ({cost})')

            # If it satisfies the sensitivity threshold, quit the loop
            if sensitivity <= self._sensitivity_threshold:
                self._update_solution(attribute_set)
                self._add_satisfying_attribute_set(attribute_set)

                # Store this attribute set in the explored sets
                compute_time = (datetime.now()
                                - self._start_time).total_seconds()
                self._add_explored_attribute_set({
                    TraceData.TIME: compute_time,
                    TraceData.ATTRIBUTES: attribute_set.attribute_ids,
                    TraceData.SENSITIVITY: sensitivity,
                    TraceData.USABILITY_COST: cost,
                    TraceData.COST_EXPLANATION: cost_explanation,
                    TraceData.STATE: State.SATISFYING
                })

                # Quit the loop if we found a solution
                break

            # If it does not satisfy the sensitivity threshold, we continue
            compute_time = (datetime.now() - self._start_time).total_seconds()
            self._add_explored_attribute_set({
                TraceData.TIME: compute_time,
                TraceData.ATTRIBUTES: attribute_set.attribute_ids,
                TraceData.SENSITIVITY: sensitivity,
                TraceData.USABILITY_COST: cost,
                TraceData.COST_EXPLANATION: cost_explanation,
                TraceData.STATE: State.EXPLORED
            })
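The sort_dict_by_value helper is not shown in these examples. A plausible stand-in, assuming it returns the (key, value) pairs sorted by value:

from operator import itemgetter

def sort_dict_by_value(dictionary: dict, reverse: bool = False) -> list:
    """Return the (key, value) pairs of the dictionary sorted by value."""
    return sorted(dictionary.items(), key=itemgetter(1), reverse=reverse)

attributes_entropy = {'user_agent': 1.52, 'timezone': 2.32, 'language': 0.0}
print(sort_dict_by_value(attributes_entropy, reverse=True))
# [('timezone', 2.32), ('user_agent', 1.52), ('language', 0.0)]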
Code example #13
class TestAttributeSetUnicity(unittest.TestCase):
    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._csv_result_path = CSV_RESULT_PATH

    def check_unicity_result(self, expected_unique_fps: int):
        total_browsers = len(self._dataset.dataframe)
        attribute_set_unicity_analysis = AttributeSetUnicity(
            self._dataset, self._attribute_set)
        attribute_set_unicity_analysis.execute()
        analysis_result = attribute_set_unicity_analysis.result
        expected_result = {
            UNIQUE_FPS_RESULT: expected_unique_fps,
            TOTAL_BROWSERS_RESULT: total_browsers,
            UNICITY_RATE_RESULT: expected_unique_fps / total_browsers
        }
        for result_name, expected_value in expected_result.items():
            self.assertAlmostEqual(analysis_result[result_name],
                                   expected_value)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_in_between_unicity(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        self.check_unicity_result(1)

    def test_always_the_same_value(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        self.check_unicity_result(0)

    def test_unique_values(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        total_browsers = len(self._dataset.dataframe)
        self.check_unicity_result(total_browsers)

    def test_save_csv_result(self):
        attribute_set_unicity_analysis = AttributeSetUnicity(
            self._dataset, self._attribute_set)
        attribute_set_unicity_analysis.execute()
        attribute_set_unicity_analysis.save_csv_result(self._csv_result_path)
        remove(self._csv_result_path)
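The unicity rate checked above is the share of browsers whose fingerprint appears exactly once in the dataset. A minimal pandas sketch on hypothetical data, assuming one fingerprint per browser:

import pandas as pd

df = pd.DataFrame({'ua': ['f', 'f', 'c', 'o', 's'],
                   'tz': [1, 1, 2, 3, 4]})
unique_mask = ~df.duplicated(keep=False)  # rows whose fingerprint is unique
unique_fps = int(unique_mask.sum())
print(unique_fps, unique_fps / len(df))  # 3 unique fingerprints out of 5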
Code example #14
class TestComputeAttributesInstability(unittest.TestCase):
    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._attributes = AttributeSet(ATTRIBUTES)

    def _get_grouped_by_browser(self):
        # 1. Group by the browser id (no sort for performance, no group key to
        #    avoid adding an additional column with the group key)
        # 2. Sort by the time of collection for each group (give a DataFrame)
        # 3. Regroup by the browser id, here each group has the fingerprints
        #    sorted by the time of collection
        return (self._dataset.dataframe.groupby(
            MetadataField.BROWSER_ID, sort=False,
            group_keys=False).apply(lambda group_df: group_df.sort_values(
                MetadataField.TIME_OF_COLLECT)).groupby(
                    MetadataField.BROWSER_ID, sort=False, group_keys=False))

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 0.0,
            ATTRIBUTES[1]: 0.0,
            ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)

    def test_unexistent_attribute(self):
        self._attributes.add(UNEXISTENT_ATTRIBUTE)
        grouped_by_browser = self._get_grouped_by_browser()
        with self.assertRaises(KeyError):
            _compute_attributes_instability(grouped_by_browser,
                                            self._attributes)

    def test_empty_attributes(self):
        self._attributes = AttributeSet({})
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {}
        self.assertDictEqual(expected_result, attributes_instability)

    def test_empty_dataset_and_attributes(self):
        self._dataset = DummyEmptyDataset()
        self._attributes = AttributeSet({})
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {}
        self.assertDictEqual(expected_result, attributes_instability)

    def test_clean_dataset(self):
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 0.0,
            ATTRIBUTES[1]: 0.0,
            ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)

    def test_dummy_fingerprint_dataset(self):
        self._dataset = DummyFingerprintDataset()
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 0.0,
            ATTRIBUTES[1]: 0.0,
            ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)

    def test_dummy_dataset_with_changes(self):
        self._dataset = DummyDatasetWithChanges()
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 1 / 2,
            ATTRIBUTES[1]: 1.0,
            ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)
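The instability measured above is, for each attribute, the proportion of consecutive fingerprints from the same browser where its value changed. A self-contained pandas sketch on hypothetical data, assuming each group is already sorted by collection time:

import pandas as pd

# Two browsers; browser 1 changes its 'ua' once over two consecutive pairs.
df = pd.DataFrame({'browser_id': [1, 1, 1, 2, 2],
                   'ua': ['f', 'f', 'c', 'o', 'o']})

changed_pairs, total_pairs = 0, 0
for _, group in df.groupby('browser_id', sort=False):
    previous_value = None
    for value in group['ua']:
        if previous_value is not None:
            total_pairs += 1
            changed_pairs += int(value != previous_value)
        previous_value = value
print(changed_pairs / total_pairs)  # 1 change over 3 pairs, about 0.333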
Code example #15
class TestAttributeSetSample(unittest.TestCase):
    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._sample_size = SAMPLE_SIZE
        self._csv_result_path = CSV_RESULT_PATH

    def check_sample_result(self, possible_values: Set[tuple]):
        attribute_set_sample_analysis = AttributeSetSample(
            self._dataset, self._attribute_set, self._sample_size)
        attribute_set_sample_analysis.execute()
        analysis_result = attribute_set_sample_analysis.result
        first_fingerprint = next(iter(analysis_result.values()))
        expected_sample_size = min(self._sample_size,
                                   len(self._dataset.dataframe))
        self.assertEqual(len(first_fingerprint), len(self._attribute_set))
        self.assertEqual(expected_sample_size, len(analysis_result))
        for sample_fingerprint in analysis_result.values():
            self.assertIn(sample_fingerprint, possible_values)

    def test_wrong_sample_size(self):
        with self.assertRaises(AttributeError):
            wrong_sample_size = 0
            AttributeSetSample(self._dataset, self._attribute_set,
                               wrong_sample_size)
        with self.assertRaises(AttributeError):
            wrong_sample_size = -3
            AttributeSetSample(self._dataset, self._attribute_set,
                               wrong_sample_size)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_sample_result(WONT_COMPUTE)

    def test_first_attribute_only(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        possible_values = set(product(self._dataset.DATAS[ATTRIBUTES[0].name]))
        self.check_sample_result(possible_values)

    def test_second_attribute_only(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        possible_values = set(product(self._dataset.DATAS[ATTRIBUTES[1].name]))
        self.check_sample_result(possible_values)

    def test_third_attribute_only(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        possible_values = set(product(self._dataset.DATAS[ATTRIBUTES[2].name]))
        self.check_sample_result(possible_values)

    def test_two_attributes(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0], ATTRIBUTES[1]])
        first_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[0].name])
        second_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[1].name])
        possible_values = set(
            product(first_attribute_values, second_attribute_values))
        self.check_sample_result(possible_values)

    def test_all_the_attributes(self):
        self._attribute_set = AttributeSet(ATTRIBUTES)
        first_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[0].name])
        second_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[1].name])
        third_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[2].name])
        possible_values = set(
            product(first_attribute_values, second_attribute_values,
                    third_attribute_values))
        self.check_sample_result(possible_values)

    def test_first_attribute_only_higher_sample_size(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        self._sample_size = len(self._dataset.dataframe) + SAMPLE_SIZE_INCREASE
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_two_attributes_higher_sample_size(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0], ATTRIBUTES[1]])
        self._sample_size = len(self._dataset.dataframe) + SAMPLE_SIZE_INCREASE
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_all_the_attributes_higher_sample_size(self):
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._sample_size = len(self._dataset.dataframe) + SAMPLE_SIZE_INCREASE
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_save_csv_result(self):
        attribute_set_sample_analysis = AttributeSetSample(
            self._dataset, self._attribute_set, self._sample_size)
        attribute_set_sample_analysis.execute()
        attribute_set_sample_analysis.save_csv_result(self._csv_result_path)
        remove(self._csv_result_path)
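A plausible core of the sampling analysis tested above: project the dataset on the attribute set and draw a fixed-size sample. Note that pandas DataFrame.sample raises a ValueError when the requested size exceeds the population, which matches the higher-sample-size tests:

import pandas as pd

df = pd.DataFrame({'ua': ['f', 'c', 'o'], 'tz': [1, 2, 3]})
sample_size = 2
sample = df[['ua', 'tz']].sample(n=sample_size)  # ValueError if n > len(df)
fingerprints = [tuple(row) for row in sample.itertuples(index=False)]
print(fingerprints)  # e.g. [('c', 2), ('f', 1)]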