class TestComputeAttributesInstability(unittest.TestCase):
    """Exercise _compute_attributes_instability on the dummy datasets."""

    def setUp(self):
        """Start each test with the clean dataset and every attribute."""
        self._dataset = DummyCleanDataset()
        self._attributes = AttributeSet(ATTRIBUTES)

    def _get_grouped_by_browser(self):
        """Group the fingerprints per browser, ordered by collection time.

        Returns:
            The dataframe of the current dataset grouped by browser id, each
            group holding its fingerprints sorted by time of collection.
        """
        def order_by_collection_time(browser_df):
            return browser_df.sort_values(MetadataField.TIME_OF_COLLECT)

        # No sorting of the groups (for performance) and no group key (to
        # not add an additional column holding the group key)
        groupby_options = {'sort': False, 'group_keys': False}

        # First grouping: lets us sort the fingerprints of each browser
        per_browser = self._dataset.dataframe.groupby(
            MetadataField.BROWSER_ID, **groupby_options)
        time_ordered_df = per_browser.apply(order_by_collection_time)

        # Second grouping: applying the sort gave back a DataFrame
        return time_ordered_df.groupby(
            MetadataField.BROWSER_ID, **groupby_options)

    def _compute_instability(self):
        """Run the function under test on the current dataset/attributes."""
        grouped_by_browser = self._get_grouped_by_browser()
        return _compute_attributes_instability(
            grouped_by_browser, self._attributes)

    @staticmethod
    def _instability_per_attribute(*instabilities):
        """Map the i-th entry of ATTRIBUTES to the i-th given instability."""
        return {ATTRIBUTES[position]: instability
                for position, instability in enumerate(instabilities)}

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        computed = self._compute_instability()
        expected = self._instability_per_attribute(0.0, 0.0, 0.0)
        self.assertDictEqual(expected, computed)

    def test_unexistent_attribute(self):
        self._attributes.add(UNEXISTENT_ATTRIBUTE)
        # The grouping stays outside of the assertRaises block so that only
        # the function under test can satisfy it
        grouped_by_browser = self._get_grouped_by_browser()
        with self.assertRaises(KeyError):
            _compute_attributes_instability(grouped_by_browser,
                                            self._attributes)

    def test_empty_attributes(self):
        self._attributes = AttributeSet({})
        computed = self._compute_instability()
        self.assertDictEqual({}, computed)

    def test_empty_dataset_and_attributes(self):
        self._dataset = DummyEmptyDataset()
        self._attributes = AttributeSet({})
        computed = self._compute_instability()
        self.assertDictEqual({}, computed)

    def test_clean_dataset(self):
        computed = self._compute_instability()
        expected = self._instability_per_attribute(0.0, 0.0, 0.0)
        self.assertDictEqual(expected, computed)

    def test_dummy_fingerprint_dataset(self):
        self._dataset = DummyFingerprintDataset()
        computed = self._compute_instability()
        expected = self._instability_per_attribute(0.0, 0.0, 0.0)
        self.assertDictEqual(expected, computed)

    def test_dummy_dataset_with_changes(self):
        self._dataset = DummyDatasetWithChanges()
        computed = self._compute_instability()
        expected = self._instability_per_attribute(1 / 2, 1.0, 0.0)
        self.assertDictEqual(expected, computed)
def test_in_between_entropy(self):
    """Check the entropy of the first attribute alone.

    The expected value is the Shannon entropy of the distribution
    (1/5, 2/5, 2/5).
    """
    self._attribute_set = AttributeSet([ATTRIBUTES[0]])
    probabilities = (1/5, 2/5, 2/5)
    expected_entropy = -1 * sum(
        probability * log2(probability) for probability in probabilities)
    self.check_entropy_result(expected_entropy)
class TestAttributeSetEntropy(unittest.TestCase):
    """Test the AttributeSetEntropy analysis on the dummy datasets."""

    def setUp(self):
        """Start each test with the clean dataset and every attribute."""
        self._dataset = DummyCleanDataset()
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._csv_result_path = CSV_RESULT_PATH

    def check_entropy_result(self, expected_entropy: float):
        """Execute the analysis and compare its result to the expected one.

        Args:
            expected_entropy: The entropy that the analysis should obtain for
                              the current dataset and attribute set.

        Raises:
            Any exception raised by the analysis itself. The tests expecting
            an error rely on this.
        """
        attribute_set_entropy_analysis = AttributeSetEntropy(
            self._dataset, self._attribute_set)
        attribute_set_entropy_analysis.execute()
        analysis_result = attribute_set_entropy_analysis.result

        # The maximum entropy is computed only AFTER the analysis ran.
        # log2(0) raises a ValueError on an empty dataset, and computing it
        # first would satisfy assertRaises(ValueError) without the analysis
        # ever being executed.
        maximum_entropy = log2(len(self._dataset.dataframe))
        expected_result = {
            ENTROPY_RESULT: expected_entropy,
            MAXIMUM_ENTROPY_RESULT: maximum_entropy,
            NORMALIZED_ENTROPY_RESULT: expected_entropy/maximum_entropy
        }
        for result_name, expected_value in expected_result.items():
            self.assertAlmostEqual(analysis_result[result_name],
                                   expected_value)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_in_between_entropy(self):
        # Entropy of the distribution (1/5, 2/5, 2/5)
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        expected_entropy = -1 * ((1/5)*log2(1/5) + (2/5)*log2(2/5)
                                 + (2/5)*log2(2/5))
        self.check_entropy_result(expected_entropy)

    def test_always_the_same_value(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        self.check_entropy_result(0.0)

    def test_unique_values(self):
        # The maximum entropy is expected for this attribute
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        maximum_entropy = log2(len(self._dataset.dataframe))
        self.check_entropy_result(maximum_entropy)

    def test_save_csv_result(self):
        attribute_set_entropy_analysis = AttributeSetEntropy(
            self._dataset, self._attribute_set)
        attribute_set_entropy_analysis.execute()
        attribute_set_entropy_analysis.save_csv_result(self._csv_result_path)
        # Removing the file also checks that it was written: remove raises
        # if the path does not exist
        remove(self._csv_result_path)
def test_evaluate_empty_attribute_set(self):
    """The empty attribute set costs 0.0, overall and per dimension."""
    evaluation = self._memory_instability_measure.evaluate(AttributeSet())
    total_cost, cost_explanation = evaluation
    self.assertEqual(total_cost, 0.0)
    for dimension_cost in cost_explanation.values():
        self.assertEqual(dimension_cost, 0.0)
def test_empty_attribute_set(self):
    """An empty attribute set is expected to raise a ValueError."""
    self._attribute_set = AttributeSet()
    with self.assertRaises(ValueError):
        self.check_entropy_result(WONT_COMPUTE)
def setUp(self):
    """Prepare the complete attribute set and the clean dataset."""
    self._attribute_set = AttributeSet(ATTRIBUTES)
    self._dataset = DummyCleanDataset()
def setUp(self):
    """Prepare the clean dataset, its dataframe and every attribute."""
    clean_dataset = DummyCleanDataset()
    self._dataset = clean_dataset
    self._dataframe = clean_dataset.dataframe
    self._attributes = AttributeSet(ATTRIBUTES)
def test_unique_values(self):
    """With the second attribute, every fingerprint is expected unique."""
    self._attribute_set = AttributeSet([ATTRIBUTES[1]])
    # As many unique fingerprints as there are rows in the dataframe
    self.check_unicity_result(len(self._dataset.dataframe))
def _get_attributes_entropy(
        dataset: FingerprintDataset, attributes: AttributeSet
        ) -> Dict[Attribute, float]:
    """Give a dictionary with the entropy of each attribute.

    Depending on the 'explorations' option of the 'Multiprocessing'
    parameters, the entropies are computed on the current process or shared
    among a pool of worker processes.

    Args:
        dataset: The fingerprint dataset used to compute the entropy.
        attributes: The attributes for which we compute the entropy.

    Raises:
        ValueError: There are attributes and the fingerprint dataset is empty.
        KeyError: An attribute is not in the fingerprint dataset.
        Exception: When using multiprocessing, an exception raised by a
                   worker is re-raised here instead of being silently lost.

    Returns:
        A dictionary with each attribute (Attribute) and its entropy.
    """
    # Some checks before starting the exploration
    if attributes and dataset.dataframe.empty:
        raise ValueError('Cannot compute the entropy on an empty dataset.')
    for attribute in attributes:
        if attribute not in dataset.candidate_attributes:
            raise KeyError(f'The attribute {attribute} is not in the dataset.')

    # We will work on a dataset with only a fingerprint per browser to avoid
    # overcounting effects
    df_one_fp_per_browser = dataset.get_df_w_one_fp_per_browser()

    # If we execute on a single process
    if not params.getboolean('Multiprocessing', 'explorations'):
        logger.debug('Measuring the attributes entropy on a single process...')
        return _compute_attribute_entropy(df_one_fp_per_browser, attributes)

    # The dictionary to fill when using multiprocessing
    logger.debug('Measuring the attributes entropy using multiprocessing...')
    attributes_entropy = {}

    # Infer the number of cores to use
    free_cores = params.getint('Multiprocessing', 'free_cores')
    nb_cores = max(cpu_count() - free_cores, 1)
    attributes_per_core = int(ceil(len(attributes) / nb_cores))
    logger.debug(f'Sharing {len(attributes)} attributes over '
                 f'{nb_cores}(+{free_cores}) cores, hence '
                 f'{attributes_per_core} attributes per core.')

    # Submit one task per core, each task processing a slice of the attributes
    attributes_list = list(attributes)
    with Pool(processes=nb_cores) as pool:
        async_results = []
        for process_id in range(nb_cores):
            # Generate the attributes for this process (the slice can be
            # empty when there are fewer attributes than cores)
            start_id = process_id * attributes_per_core
            end_id = (process_id + 1) * attributes_per_core
            attributes_subset = AttributeSet(attributes_list[start_id:end_id])
            async_results.append(pool.apply_async(
                _compute_attribute_entropy,
                args=(df_one_fp_per_browser, attributes_subset)))

        # Collect the results while the pool is still alive. get() is used
        # instead of wait() because wait() ignores a failure of the worker,
        # which would leave the resulting dictionary silently incomplete,
        # whereas get() re-raises the exception of the worker.
        for async_result in async_results:
            attributes_entropy.update(async_result.get())

    return attributes_entropy
def test_empty_dataset_and_empty_attribute_set(self):
    """An empty dataset with no attribute is expected to raise ValueError."""
    self._dataset = DummyEmptyDataset()
    self._attribute_set = AttributeSet()
    with self.assertRaises(ValueError):
        self.check_unicity_result(WONT_COMPUTE)
def test_in_between_entropy(self):
    """With the first attribute only, one unique fingerprint is expected."""
    first_attribute_only = AttributeSet([ATTRIBUTES[0]])
    self._attribute_set = first_attribute_only
    self.check_unicity_result(1)
class TestAttributeSetUnicity(unittest.TestCase):
    """Test the AttributeSetUnicity analysis on the dummy datasets."""

    def setUp(self):
        """Start each test with the clean dataset and every attribute."""
        self._dataset = DummyCleanDataset()
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._csv_result_path = CSV_RESULT_PATH

    def check_unicity_result(self, expected_unique_fps: int):
        """Execute the analysis and compare its result to the expected one.

        Args:
            expected_unique_fps: The number of unique fingerprints that the
                                 analysis should obtain for the current
                                 dataset and attribute set.
        """
        browser_count = len(self._dataset.dataframe)

        unicity_analysis = AttributeSetUnicity(
            self._dataset, self._attribute_set)
        unicity_analysis.execute()
        obtained = unicity_analysis.result

        # Built after the execution, hence the division below is only
        # reached when the analysis did not raise
        expectations = {
            UNIQUE_FPS_RESULT: expected_unique_fps,
            TOTAL_BROWSERS_RESULT: browser_count,
            UNICITY_RATE_RESULT: expected_unique_fps / browser_count
        }
        for name, expected in expectations.items():
            self.assertAlmostEqual(obtained[name], expected)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_in_between_entropy(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        self.check_unicity_result(1)

    def test_always_the_same_value(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        self.check_unicity_result(0)

    def test_unique_values(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        self.check_unicity_result(len(self._dataset.dataframe))

    def test_save_csv_result(self):
        unicity_analysis = AttributeSetUnicity(
            self._dataset, self._attribute_set)
        unicity_analysis.execute()
        unicity_analysis.save_csv_result(self._csv_result_path)
        remove(self._csv_result_path)
def setUp(self):
    """Prepare the fingerprint dataset, the attribute sets and k = 3."""
    self._dataset = DummyFingerprintDataset()
    # Two distinct objects: the set under test and the candidates
    self._attribute_set = AttributeSet(ATTRIBUTES)
    self._candidate_attributes = AttributeSet(ATTRIBUTES)
    # Number of most common fingerprints to consider
    self._most_common_fps = 3
def test_top_0_fingerprints(self):
    """With k = 0, the expected result is 0.0 for each single attribute."""
    self._most_common_fps = 0
    for candidate in self._candidate_attributes:
        single_attribute_set = AttributeSet([candidate])
        self._attribute_set = single_attribute_set
        self.check_top_k_fingerprints(0.0)
def test_best_conditional_entropic_attribute_empty_attribute_set(self):
    """Without any candidate attribute, no best attribute is given."""
    already_selected = AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]})
    no_candidate = AttributeSet()
    search_result = _best_conditional_entropic_attribute(
        self._df_w_one_fp_per_browser,
        current_attributes=already_selected,
        candidate_attributes=no_candidate)
    # The first element of the result is the best attribute
    self.assertIsNone(search_result[0])
def _search_for_solution(self):
    """Search for a solution by adding attributes in decreasing entropy.

    The entropy of each candidate attribute of the dataset is computed
    first. The attributes are then added one by one, by decreasing entropy,
    to a single growing attribute set. Each attribute set obtained this way
    is evaluated and recorded in the trace of the explored attribute sets,
    with the following key/values:
        * time: str() of the time elapsed since the start of the
                exploration (datetime.now() - self._start_time).
        * attributes: The ids of the attributes of the attribute set.
        * sensitivity: The sensitivity of the attribute set.
        * usability_cost: The usability cost of the attribute set.
        * cost_explanation: The explanation of this usability cost.
        * state: State.SATISFYING when the sensitivity threshold is
                 reached, State.EXPLORED otherwise.

    The first attribute set that reaches the sensitivity threshold becomes
    the solution, is added to the satisfying attribute sets, and ends the
    search.

    Note:
        The ids of the attributes are stored rather than their name.
    """
    # Entropy of each candidate attribute
    logger.info('Computing the entropy of each attribute...')
    entropy_per_attribute = _get_attributes_entropy(
        self._dataset, self._dataset.candidate_attributes)
    entropy_compute_time = datetime.now() - self._start_time
    logger.info('Entropy of the attributes computed after '
                f'{entropy_compute_time}.')

    # Grow a single attribute set, the most entropic attributes first
    growing_set = AttributeSet()
    by_decreasing_entropy = sort_dict_by_value(entropy_per_attribute,
                                               reverse=True)
    for next_attribute, _ in by_decreasing_entropy:
        growing_set.add(next_attribute)
        logger.debug(f'Exploring {growing_set}...')

        # Evaluate the attribute set that is obtained
        sensitivity = self._sensitivity.evaluate(growing_set)
        cost, cost_explanation = self._usability_cost.evaluate(growing_set)
        logger.debug(f' Sensitivity ({sensitivity}), '
                     f'usability cost ({cost})')

        # The solution is registered before the trace entry is written
        threshold_reached = sensitivity <= self._sensitivity_threshold
        if threshold_reached:
            self._update_solution(growing_set)
            self._add_satisfying_attribute_set(growing_set)

        # Record this attribute set in the trace, whatever its state
        elapsed_time = str(datetime.now() - self._start_time)
        self._add_explored_attribute_set({
            TraceData.TIME: elapsed_time,
            TraceData.ATTRIBUTES: growing_set.attribute_ids,
            TraceData.SENSITIVITY: sensitivity,
            TraceData.USABILITY_COST: cost,
            TraceData.COST_EXPLANATION: cost_explanation,
            TraceData.STATE: (State.SATISFYING if threshold_reached
                              else State.EXPLORED)
        })

        # A solution is found, no need to add more attributes
        if threshold_reached:
            break
class TestGetBestConditionalEntropicAttribute(unittest.TestCase):
    """Test the _get_best_conditional_entropic_attribute function."""

    def setUp(self):
        """Start each test with every attribute and the clean dataset."""
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._dataset = DummyCleanDataset()

    def _best_given(self, current_attributes):
        """Search among all the attributes, on the dataset of the test."""
        return _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=current_attributes,
            candidate_attributes=self._attribute_set)

    def test_get_best_entropic_attribute(self):
        # The order is 1 (unique values), then 0 (some collisions), then
        # 2 (the same value for each browser)
        self.assertEqual(self._best_given(AttributeSet()), ATTRIBUTES[1])

        self.assertEqual(
            self._best_given(AttributeSet({ATTRIBUTES[1]})), ATTRIBUTES[0])

        self.assertEqual(
            self._best_given(AttributeSet({ATTRIBUTES[1], ATTRIBUTES[0]})),
            ATTRIBUTES[2])

        # Every attribute is taken: nothing is left to propose
        everything_taken = AttributeSet({
            ATTRIBUTES[1], ATTRIBUTES[0], ATTRIBUTES[2]})
        self.assertIsNone(self._best_given(everything_taken))

    def test_get_best_entropic_attribute_every_attribute_already_taken(self):
        best_attribute = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=self._attribute_set,
            candidate_attributes=self._attribute_set)
        self.assertIsNone(best_attribute)

    def test_get_best_entropic_attribute_empty_attribute_set(self):
        best_attribute = _get_best_conditional_entropic_attribute(
            self._dataset,
            current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(best_attribute)

    def test_get_best_entropic_attribute_empty_dataset(self):
        dataset_without_fingerprint = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            _get_best_conditional_entropic_attribute(
                dataset_without_fingerprint,
                current_attributes=AttributeSet(
                    {ATTRIBUTES[0], ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)

    def test_get_best_entropic_attribute_empty_candidates_and_dataset(self):
        dataset_without_fingerprint = DummyEmptyDataset()
        best_attribute = _get_best_conditional_entropic_attribute(
            dataset_without_fingerprint,
            current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(best_attribute)

    def test_get_best_entropic_attribute_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            _get_best_conditional_entropic_attribute(
                self._dataset,
                current_attributes=AttributeSet(
                    {ATTRIBUTES[0], ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)
def _search_for_solution(self):
    """Search for a solution by greedily adding the best attribute.

    Starting from the empty attribute set, the attribute given by
    _get_best_conditional_entropic_attribute is added at each step until
    the sensitivity threshold is reached. Each attribute set obtained this
    way is evaluated and recorded in the trace of the explored attribute
    sets, with the following key/values:
        * time: str() of the time elapsed since the start of the
                exploration (datetime.now() - self._start_time).
        * attributes: The ids of the attributes of the attribute set.
        * sensitivity: The sensitivity of the attribute set.
        * usability_cost: The usability cost of the attribute set.
        * cost_explanation: The explanation of this usability cost.
        * state: State.SATISFYING when the sensitivity threshold is
                 reached, State.EXPLORED otherwise.

    The attribute set that reaches the sensitivity threshold becomes the
    solution and is added to the satisfying attribute sets.

    Note:
        The ids of the attributes are stored rather than their name.
    """
    # Start from the empty set, for which the sensitivity is taken as 1.0
    # (equivalent to no browser fingerprinting used at all)
    growing_set = AttributeSet()
    sensitivity = 1.0

    # NOTE There is no guard against the search returning no attribute. The
    #      existence of a solution is said to be checked before running the
    #      exploration, so the threshold is expected to be reached with the
    #      complete set of candidate attributes in the worst case.
    while sensitivity > self._sensitivity_threshold:
        # Add the attribute that has the highest conditional entropy
        next_attribute = _get_best_conditional_entropic_attribute(
            self._dataset, growing_set, self._dataset.candidate_attributes)
        growing_set.add(next_attribute)

        # Evaluate the attribute set that is obtained
        logger.debug(f'Exploring {growing_set}...')
        sensitivity = self._sensitivity.evaluate(growing_set)
        cost, cost_explanation = self._usability_cost.evaluate(growing_set)
        logger.debug(f' Sensitivity ({sensitivity}), '
                     f'usability cost ({cost})')

        # Register the solution if the threshold is reached; the condition
        # of the loop then ends the search
        threshold_reached = sensitivity <= self._sensitivity_threshold
        if threshold_reached:
            self._update_solution(growing_set)
            self._add_satisfying_attribute_set(growing_set)
        state = State.SATISFYING if threshold_reached else State.EXPLORED

        # Record this attribute set in the trace
        elapsed_time = str(datetime.now() - self._start_time)
        self._add_explored_attribute_set({
            TraceData.TIME: elapsed_time,
            TraceData.ATTRIBUTES: growing_set.attribute_ids,
            TraceData.SENSITIVITY: sensitivity,
            TraceData.USABILITY_COST: cost,
            TraceData.COST_EXPLANATION: cost_explanation,
            TraceData.STATE: state
        })
def test_get_best_entropic_attribute_empty_attribute_set(self):
    """Without any candidate attribute, None is expected."""
    already_selected = AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]})
    no_candidate = AttributeSet()
    best_attribute = _get_best_conditional_entropic_attribute(
        self._dataset, current_attributes=already_selected,
        candidate_attributes=no_candidate)
    self.assertIsNone(best_attribute)
def _get_best_conditional_entropic_attribute(dataset: FingerprintDataset,
                                             current_attributes: AttributeSet,
                                             candidate_attributes: AttributeSet
                                             ) -> Attribute:
    """Get the attribute that provides the highest total entropy.

    When several attributes provide the same total entropy, the attribute of
    the lowest id is given. If no attribute increases the total entropy, we
    still provide the attribute of the lowest id.

    Depending on the 'explorations' option of the 'Multiprocessing'
    parameters, the search is done on the current process or shared among a
    pool of worker processes, each worker giving its local best attribute.

    Args:
        dataset: The dataset used to compute the conditional entropy.
        current_attributes: The attributes that compose the current solution.
        candidate_attributes: The candidate attributes (i.e., those available).

    Raises:
        ValueError: There are candidate attributes and the fingerprint dataset
                    is empty.
        KeyError: One of the candidate attributes is not in the fingerprint
                  dataset.
        Exception: When using multiprocessing, an exception raised by a
                   worker is re-raised here instead of being silently lost.

    Returns:
        The attribute that has the highest conditional entropy among the
        candidate attributes and that is not part of the current attributes.
        When using multiprocessing, None is returned if no worker proposes an
        attribute.
    """
    logger.debug('Getting the best conditional entropic attribute from '
                 f'{current_attributes}...')

    # Some checks before starting the exploration
    if candidate_attributes and dataset.dataframe.empty:
        raise ValueError('Cannot compute the conditional entropy on an empty '
                         'dataset.')
    for attribute in candidate_attributes:
        if attribute not in dataset.candidate_attributes:
            raise KeyError(f'The attribute {attribute} is not in the dataset.')

    # We will work on a dataset with only a fingerprint per browser to avoid
    # overcounting effects
    df_one_fp_per_browser = dataset.get_df_w_one_fp_per_browser()

    # If we execute on a single process
    if not params.getboolean('Multiprocessing', 'explorations'):
        logger.debug('Measuring the attributes entropy on a single process...')
        best_attribute, best_total_ent = _best_conditional_entropic_attribute(
            df_one_fp_per_browser, current_attributes, candidate_attributes)
        logger.debug(f' The best attribute is {best_attribute} for a total '
                     f'entropy of {best_total_ent}.')
        return best_attribute

    # The local best attribute of each worker and its total entropy
    best_attribute_informations = {}
    logger.debug('Measuring the attributes conditional entropy using '
                 'multiprocessing...')

    # Infer the number of cores to use
    free_cores = params.getint('Multiprocessing', 'free_cores')
    nb_cores = max(cpu_count() - free_cores, 1)
    attributes_per_core = int(ceil(len(candidate_attributes)/nb_cores))
    logger.debug(f'Sharing {len(candidate_attributes)} candidate attributes '
                 f'over {nb_cores}(+{free_cores}) cores, hence '
                 f'{attributes_per_core} attributes per core.')

    # Submit one task per core, each task processing a slice of the candidates
    candidate_attributes_list = list(candidate_attributes)
    with Pool(processes=nb_cores) as pool:
        async_results = []
        for process_id in range(nb_cores):
            # Generate the candidate attributes for this process (the slice
            # can be empty when there are fewer candidates than cores)
            start_id = process_id * attributes_per_core
            end_id = (process_id + 1) * attributes_per_core
            candidate_attributes_subset = AttributeSet(
                candidate_attributes_list[start_id:end_id])
            async_results.append(pool.apply_async(
                _best_conditional_entropic_attribute,
                args=(df_one_fp_per_browser, current_attributes,
                      candidate_attributes_subset)))

        # Collect the results while the pool is still alive. get() is used
        # instead of wait() because wait() ignores a failure of the worker,
        # which would silently drop its candidates from the search, whereas
        # get() re-raises the exception of the worker.
        for async_result in async_results:
            local_best_attribute, local_best_entropy = async_result.get()
            # Explicit None check: a worker without any usable candidate
            # gives None, and a valid attribute must never be discarded
            # because of its truth value
            if local_best_attribute is not None:
                best_attribute_informations[local_best_attribute] = (
                    local_best_entropy)

    # Search for the best attribute in the local results. If several provide
    # the same total entropy, we provide the attribute having the lowest id
    # (the attributes are sorted and only a strictly higher entropy replaces
    # the current best).
    best_attribute, best_total_entropy = None, -float('inf')
    for attribute in sorted(best_attribute_informations):
        attribute_total_entropy = best_attribute_informations[attribute]
        if attribute_total_entropy > best_total_entropy:
            best_total_entropy = attribute_total_entropy
            best_attribute = attribute
    logger.debug(f' The best attribute is {best_attribute} for a total '
                 f'entropy of {best_total_entropy}.')

    return best_attribute
def test_empty_attributes(self):
    """Compare the average sizes obtained with an empty attribute set."""
    self._attributes = AttributeSet({})
    computed_avg_size = _compute_attribute_avg_size(
        self._dataframe, self._attributes)
    # The expected result is fetched after the computation, as before
    self.assertDictEqual(self._get_expected_result(), computed_avg_size)
def attribute_set_information(attribute_set_id: int):
    """Show information about an attribute set.

    The information comes from the real time exploration if there is one,
    otherwise from the loaded trace. The id -1 designates the starting empty
    node. The request is aborted with a NOT_FOUND status when neither a trace
    nor a real time exploration is set, or when the id is unknown.

    Args:
        attribute_set_id: The id of the attribute set to show.

    Returns:
        The rendered 'attribute-set-information.html' template.
    """
    logger.info('Getting the information about the attribute set '
                f'{attribute_set_id}.')

    def cost_percentage(cost, reference_cost):
        """Give cost as a percentage of reference_cost (0.0 if it is zero)."""
        if not reference_cost:
            return 0.0
        return 100 * cost / reference_cost

    # A trace or a real time exploration is required whatever the id. This
    # is checked first, as the empty node (-1) otherwise skips this check
    # and fails later on names that were never bound.
    if not REAL_TIME_EXPLORATION and not TRACE_DATA:
        error_message = ('Accessing the attribute set information page '
                         'requires a trace or a real time exploration to be '
                         'set.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    # Check that there is an explored attribute set with this id in the
    # trace
    attribute_set_infos = None
    if attribute_set_id == -1:
        attribute_set_infos = EMPTY_NODE
    elif REAL_TIME_EXPLORATION:
        attribute_set_infos_list = (
            REAL_TIME_EXPLORATION.get_explored_attribute_sets(
                attribute_set_id, attribute_set_id + 1))
        if attribute_set_infos_list:
            attribute_set_infos = attribute_set_infos_list[0]
            attribute_set_infos['id'] = attribute_set_id
    elif TRACE_DATA:
        for explored_attr_set in TRACE_DATA['exploration']:
            if explored_attr_set['id'] == attribute_set_id:
                attribute_set_infos = explored_attr_set
                break

    if not attribute_set_infos:
        error_message = (f'The attribute set id {attribute_set_id} was not'
                         ' found.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    # Generate the attribute set object and get the names of these attributes
    if REAL_TIME_EXPLORATION:
        attributes = AttributeSet(
            FINGERPRINT_DATASET.candidate_attributes.get_attribute_by_id(
                attribute_id)
            for attribute_id in attribute_set_infos['attributes'])
    else:  # The trace is necessarily set, see the check above
        attributes = AttributeSet(
            Attribute(attribute_id,
                      TRACE_DATA['attributes'][str(attribute_id)])
            for attribute_id in attribute_set_infos['attributes'])
    attribute_names = [attribute.name for attribute in attributes]

    # If there is a fingerprint dataset, compute the additional/optional
    # results from it (the subset for now). Nothing is done for the empty
    # node to avoid getting the subset with an empty attribute set.
    fingerprint_sample = None
    if attribute_set_id != -1:
        if FINGERPRINT_DATASET:
            # Collect a sample of the resulting fingerprints
            attr_subset_sample = AttributeSetSample(
                FINGERPRINT_DATASET, attributes,
                params.getint('WebServer', 'fingerprint_sample_size'))
            attr_subset_sample.execute()
            fingerprint_sample = attr_subset_sample.result
        else:
            flash(
                'Please provide a fingerprint dataset to obtain more insight '
                'on the selected attributes',
                params.get('WebServer', 'flash_info_class'))

    # Compute the textual representation of the state of this attribute set
    # (the first matching state is taken, None if there is no match)
    state_labels = (
        (State.EXPLORED, 'Explored'),
        (State.PRUNED, 'Pruned'),
        (State.SATISFYING, 'Satisfying the threshold'),
        (State.EMPTY_NODE, 'Starting empty node'),
    )
    attribute_set_state = next(
        (label for state, label in state_labels
         if attribute_set_infos['state'] == state),
        None)

    # Prepare a dictionary with the cost percentage of each dimension
    # { cost dimension => (bootstrap progress bar class,  # for pretty display
    #                      percentage of the cost of the candidate attributes)
    # }
    usability_cost_ratio = {}
    if REAL_TIME_EXPLORATION:
        candidate_attributes_infos = (
            REAL_TIME_EXPLORATION.get_explored_attribute_sets(0, 1)[0])
    else:  # The trace is necessarily set, see the check above
        candidate_attributes_infos = TRACE_DATA['exploration'][0]
    bootstrap_progess_bars = (params.get(
        'WebServer', 'bootstrap_progess_bars').splitlines())

    # The total usability cost
    total_cost_percentage = cost_percentage(
        attribute_set_infos['usability_cost'],
        candidate_attributes_infos['usability_cost'])
    usability_cost_ratio['usability'] = (bootstrap_progess_bars[0],
                                         '%.2f' % total_cost_percentage)

    if attribute_set_id > -1:
        # For each cost dimension except the "weighted" ones
        can_attrs_cost_explanation = candidate_attributes_infos[
            'cost_explanation']
        progress_bar_class_id = 1  # 0 already taken
        for cost_dimension, cost_value in can_attrs_cost_explanation.items():
            if cost_dimension.startswith('weighted'):
                continue
            dimension_percentage = cost_percentage(
                attribute_set_infos['cost_explanation'][cost_dimension],
                cost_value)
            # The classes of the progress bars are reused cyclically
            usability_cost_ratio[cost_dimension] = (
                bootstrap_progess_bars[
                    progress_bar_class_id % len(bootstrap_progess_bars)],
                '%.2f' % dimension_percentage)
            progress_bar_class_id += 1

    # Display the attribute information page
    return render_template('attribute-set-information.html',
                           attribute_set_infos=attribute_set_infos,
                           attribute_names=attribute_names,
                           attribute_set_state=attribute_set_state,
                           usability_cost_ratio=usability_cost_ratio,
                           fingerprint_sample=fingerprint_sample,
                           javascript_parameters=params)
def setUp(self):
    """Prepare the clean dataset, its one-fingerprint-per-browser view and
    the complete attribute set."""
    clean_dataset = DummyCleanDataset()
    self._dataset = clean_dataset
    self._df_one_fp_per_browser = clean_dataset.get_df_w_one_fp_per_browser()
    self._attribute_set = AttributeSet(ATTRIBUTES)
def attribute_set_unicity(attribute_set_id: int):
    """Provide the results about the unicity of an attribute set.

    The attribute set is looked up in the real time exploration if there is
    one, otherwise in the loaded trace. The request is aborted with a
    NOT_FOUND status when neither of them is set, when no fingerprint
    dataset is set, or when the id is unknown.

    Args:
        attribute_set_id: The id of the attribute set for which to provide the
                          unicity results.

    Returns:
        The json version of the unicity results.
    """
    logger.info('Getting the unicity results of the attribute set '
                f'{attribute_set_id}.')

    # Look for the explored attribute set that has this id
    attribute_set_infos = None
    if REAL_TIME_EXPLORATION:
        matching_infos = REAL_TIME_EXPLORATION.get_explored_attribute_sets(
            attribute_set_id, attribute_set_id + 1)
        if matching_infos:
            attribute_set_infos = matching_infos[0]
            attribute_set_infos['id'] = attribute_set_id
    elif TRACE_DATA:
        attribute_set_infos = next(
            (explored for explored in TRACE_DATA['exploration']
             if explored['id'] == attribute_set_id),
            None)
    else:
        error_message = ('Accessing the attribute set unicity page requires a '
                         'trace or a real time exploration to be set.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    # The unicity cannot be computed without a fingerprint dataset
    if not FINGERPRINT_DATASET:
        error_message = 'No fingerprint dataset is set.'
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    if not attribute_set_infos:
        error_message = (f'The attribute set id {attribute_set_id} was not '
                         'found.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    # Generate the attribute set object from the ids of its attributes
    attribute_ids = attribute_set_infos['attributes']
    if REAL_TIME_EXPLORATION:
        dataset_attributes = FINGERPRINT_DATASET.candidate_attributes
        attributes = AttributeSet(
            dataset_attributes.get_attribute_by_id(attribute_id)
            for attribute_id in attribute_ids)
    elif TRACE_DATA:
        attribute_names = TRACE_DATA['attributes']
        attributes = AttributeSet(
            Attribute(attribute_id, attribute_names[str(attribute_id)])
            for attribute_id in attribute_ids)

    # Compute the unicity of the resulting fingerprints
    unicity_analysis = AttributeSetUnicity(FINGERPRINT_DATASET, attributes)
    unicity_analysis.execute()
    unicity_result = unicity_analysis.result

    # We need to format the results due to unsupported json conversions
    json_conversions = (
        (UNICITY_RATE_RESULT, float),
        (UNIQUE_FPS_RESULT, int),
        (TOTAL_BROWSERS_RESULT, int),
    )
    for result_name, convert in json_conversions:
        unicity_result[result_name] = convert(unicity_result[result_name])

    # Return the json version of these results
    return jsonify(unicity_result)
def test_always_the_same_value(self):
    """With the third attribute only, the expected entropy is 0.0."""
    third_attribute_only = AttributeSet([ATTRIBUTES[2]])
    self._attribute_set = third_attribute_only
    self.check_entropy_result(0.0)
def _set_candidate_attributes(self):
    """Set the candidate attributes to an empty attribute set."""
    no_attribute = AttributeSet()
    self._candidate_attributes = no_attribute
def test_unique_values(self):
    """With the second attribute only, log2(number of rows) is expected."""
    self._attribute_set = AttributeSet([ATTRIBUTES[1]])
    row_count = len(self._dataset.dataframe)
    self.check_entropy_result(log2(row_count))
def _set_candidate_attributes(self):
    """Set the candidate attributes to the complete set of ATTRIBUTES."""
    every_attribute = AttributeSet(ATTRIBUTES)
    self._candidate_attributes = every_attribute
def setUp(self):
    """Prepare the clean dataset, every attribute and the csv result path."""
    self._dataset = DummyCleanDataset()
    self._attribute_set = AttributeSet(ATTRIBUTES)
    # Where the tests saving a csv result write their file
    self._csv_result_path = CSV_RESULT_PATH
def test_third_attribute_only(self):
    """Check the sample obtained with the third attribute only."""
    third_attribute = ATTRIBUTES[2]
    self._attribute_set = AttributeSet([third_attribute])
    # product over a single iterable gives one 1-tuple per value
    third_attribute_values = self._dataset.DATAS[third_attribute.name]
    possible_values = set(product(third_attribute_values))
    self.check_sample_result(possible_values)