def _default_parameters(self) -> Dict[str, Any]:
    """Give the parameters that are always present in an exploration.

    Returns:
        A dictionary with the default parameters of an exploration.
    """
    # When the modin engine is configured, suffix the engine name with the
    # modin backend that is used (e.g. "modin.pandas[ray]").
    engine = params.get('DataAnalysis', 'engine')
    if engine == ANALYSIS_ENGINES[1]:
        engine = f"{engine}[{params.get('DataAnalysis', 'modin_engine')}]"

    # The parameters shared by every exploration, whatever the method
    default_parameters = {
        ExplorationParameters.METHOD: self.__class__.__name__,
        ExplorationParameters.SENSITIVITY_MEASURE: str(self._sensitivity),
        ExplorationParameters.USABILITY_COST_MEASURE: str(
            self._usability_cost),
        ExplorationParameters.DATASET: str(self._dataset),
        ExplorationParameters.SENSITIVITY_THRESHOLD:
            self._sensitivity_threshold,
        ExplorationParameters.ANALYSIS_ENGINE: engine,
        ExplorationParameters.MULTIPROCESSING: params.getboolean(
            'Multiprocessing', 'explorations'),
        ExplorationParameters.FREE_CORES: params.getint(
            'Multiprocessing', 'free_cores'),
    }
    return default_parameters
def setUp(self):
    """Prepare the dummy assets and the ConditionalEntropy exploration."""
    # If we use the modin engine, we ignore the multiprocessing test as it
    # is incompatible with modin.
    # Bug fix: TestCase.skipTest() requires a reason argument; calling it
    # without one raises a TypeError instead of skipping the test.
    if params.get('DataAnalysis', 'engine') == 'modin.pandas':
        self.skipTest('The multiprocessing tests are incompatible with the '
                      'modin engine.')
    self._dataset = DummyCleanDataset()
    self._sensitivity_measure = DummySensitivity()
    self._usability_cost_measure = DummyUsabilityCostMeasure()
    self._sensitivity_threshold = SENSITIVITY_THRESHOLD
    self._trace_path = TRACE_FILENAME
    self._expected_trace_path = EXPECTED_TRACE_PATH
    self._exploration = ConditionalEntropy(
        self._sensitivity_measure, self._usability_cost_measure,
        self._dataset, self._sensitivity_threshold)
    params.set('Multiprocessing', 'explorations', 'true')
def attribute_set_entropy(df_one_fp_per_browser: pd.DataFrame,
                          attribute_set: AttributeSet) -> float:
    """Compute the entropy of a dataset considering the given attribute set.

    Args:
        df_one_fp_per_browser: The dataframe with only one fingerprint per
                               browser.
        attribute_set: The non-empty attribute set that is considered when
                       computing the entropy of the fingerprints.

    Returns:
        The entropy of the fingerprints considering this attribute set.

    Raises:
        ValueError: The attribute set or the fingerprint dataset is empty.
        KeyError: An attribute is not in the fingerprint dataset.

    Note:
        This function is forced to use pandas as the data analysis engine.
    """
    # The entropy is undefined on an empty dataset or attribute set
    if df_one_fp_per_browser.empty or not attribute_set:
        raise ValueError('Cannot compute the entropy considering an empty '
                         'dataset or an empty attribute set.')

    # Fall back to plain pandas when the modin engine is configured
    if params.get('DataAnalysis', 'engine') == ANALYSIS_ENGINES[1]:
        logger.warning('The attribute_set_entropy function badly supports the '
                       'modin engine. We switch back to pandas in this '
                       'function.')
        df_one_fp_per_browser = df_one_fp_per_browser._to_pandas()

    # Keep only the columns of the attributes of the given attribute set
    wanted_columns = [attribute.name for attribute in attribute_set]
    projection = df_one_fp_per_browser[wanted_columns]

    # Cast the values to strings so that fingerprints holding NaN values are
    # not ignored, then compute the share of browsers holding each distinct
    # fingerprint as a Series (normalize=True gives proportions, the
    # reset_index/projection pair extracts the count column only)
    fingerprint_shares = (
        projection.astype('str')
        .value_counts(normalize=True, sort=False)
        .reset_index(name=COUNT_FIELD)[COUNT_FIELD])

    return entropy(fingerprint_shares, base=ENTROPY_BASE)
def setUp(self):
    """Prepare the dummy assets and the FPSelect exploration."""
    # If we use the modin engine, we ignore the multiprocessing test as it
    # is incompatible with modin.
    # Bug fix: TestCase.skipTest() requires a reason argument; calling it
    # without one raises a TypeError instead of skipping the test.
    if params.get('DataAnalysis', 'engine') == 'modin.pandas':
        self.skipTest('The multiprocessing tests are incompatible with the '
                      'modin engine.')
    self._dataset = DummyCleanDataset()
    self._sensitivity_measure = DummySensitivity()
    self._usability_cost_measure = DummyUsabilityCostMeasure()
    self._sensitivity_threshold = SENSITIVITY_THRESHOLD
    self._trace_path = TRACE_FILENAME
    self._expected_trace_path = EXPECTED_TRACE_PATH_MULTIPATH_PRUNING_OFF
    self._pruning = PRUNING_OFF
    self._explored_paths = MULTI_EXPLR_PATHS
    self._exploration = FPSelect(self._sensitivity_measure,
                                 self._usability_cost_measure,
                                 self._dataset, self._sensitivity_threshold,
                                 explored_paths=self._explored_paths,
                                 pruning=self._pruning)
    params.set('Multiprocessing', 'explorations', 'true')
def trace_configuration():
    """Configure the trace file and the optional dataset to replay a trace.

    On a POST request: clear any previous trace/dataset/exploration state,
    load and check the uploaded trace file (json), optionally load a
    fingerprint dataset (csv), and redirect to the trace replay page on
    success. Validation failures re-render the configuration page.

    NOTE(review): no return statement is visible for non-POST requests — a
    GET apparently falls through and returns None. Confirm the route is
    POST-only or that a final render_template exists outside this view.
    """
    global TRACE_DATA
    global FINGERPRINT_DATASET
    global REAL_TIME_EXPLORATION
    global EXPLORATION_PROCESS
    # -------------------------- POST request handle --------------------------
    if request.method == 'POST':
        # ------------------- Manage the required trace file ------------------
        # Clear the previous data if there were some, terminating any
        # exploration process that is still running
        TRACE_DATA, FINGERPRINT_DATASET = None, None
        if EXPLORATION_PROCESS:
            EXPLORATION_PROCESS.terminate()
            EXPLORATION_PROCESS = None
        REAL_TIME_EXPLORATION = None
        # Check that the trace file is in the received POST request
        # (erroneous_post_file presumably flashes its own message — verify)
        trace_file_error_message = erroneous_post_file(
            request, 'trace-file', expected_extension='json')
        if trace_file_error_message:
            return render_template('trace-configuration.html')
        # Load the content of the trace file as a dictionary from the json
        try:
            TRACE_DATA = json.load(request.files['trace-file'])
        except JSONDecodeError:
            error_message = 'The trace file is not correctly formated.'
            flash(error_message, params.get('WebServer', 'flash_error_class'))
            logger.error(error_message)
            return render_template('trace-configuration.html')
        # Check the content of the trace file
        if error_message := trace_file_errors(TRACE_DATA):
            flash(error_message, params.get('WebServer', 'flash_error_class'))
            logger.error(error_message)
            return render_template('trace-configuration.html')
        logger.info('The trace is correct and set.')
        # --------- End of the management of the required trace file ----------
        # ------------ Manage the optional fingerprint dataset file -----------
        # Process the fingerprint dataset file if there is one provided
        dataset_provided = ('fingerprint-dataset' in request.files
                            and request.files['fingerprint-dataset'])
        if dataset_provided:
            # Check that the fingerprint dataset is in the POST request
            fp_dataset_error_message = erroneous_post_file(
                request, 'fingerprint-dataset', expected_extension='csv')
            if not fp_dataset_error_message:
                # Try to load the fingerprint dataset, we ignore the dataset if
                # there is an error and display a warning to the user
                try:
                    FINGERPRINT_DATASET = FingerprintDatasetFromCSVInMemory(
                        request.files['fingerprint-dataset'])
                    logger.debug('The fingerprint dataset is set.')
                except MissingMetadatasFields as mmf_error:
                    error_message = ('Ignored the fingerprint dataset due to '
                                     'the error: ' + str(mmf_error))
                    flash(error_message,
                          params.get('WebServer', 'flash_warning_class'))
                    logger.warning(error_message)
        # -- End of the management of the optional fingerprint dataset file ---
        # At the end, redirect to the trace replay page
        return redirect(url_for('trace_replay'))
def attribute_set_information(attribute_set_id: int):
    """Show information about an attribute set.

    Args:
        attribute_set_id: The id of the attribute set to show. The special
                          value -1 designates the starting empty node.

    Returns:
        The rendered attribute-set-information page, or a 404 abort when no
        trace/exploration is set or the id is unknown.
    """
    global TRACE_DATA
    global FINGERPRINT_DATASET
    global REAL_TIME_EXPLORATION
    logger.info('Getting the information about the attribute set '
                f'{attribute_set_id}.')
    # Check that there is an explored attribute set with this id in the
    # trace (or in the real time exploration when one is running)
    attribute_set_infos = None
    if attribute_set_id == -1:
        attribute_set_infos = EMPTY_NODE
    elif REAL_TIME_EXPLORATION:
        # Fetch the single attribute set [id, id+1) from the exploration
        attribute_set_infos_list = (
            REAL_TIME_EXPLORATION.get_explored_attribute_sets(
                attribute_set_id, attribute_set_id + 1))
        if attribute_set_infos_list:
            attribute_set_infos = attribute_set_infos_list[0]
            attribute_set_infos['id'] = attribute_set_id
    elif TRACE_DATA:
        # Linear scan of the trace for the matching id
        for explored_attr_set in TRACE_DATA['exploration']:
            if explored_attr_set['id'] == attribute_set_id:
                attribute_set_infos = explored_attr_set
                break
    else:
        error_message = ('Accessing the attribute set information page '
                         'requires a trace or a real time exploration to be '
                         'set.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)
    if not attribute_set_infos:
        error_message = (f'The attribute set id {attribute_set_id} was not'
                         ' found.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)
    # Generate the attribute set object and get the names of these attributes
    # NOTE(review): if attribute_set_id == -1 and neither a real time
    # exploration nor a trace is set, `attributes` is never assigned and the
    # line below raises a NameError — confirm this path is unreachable.
    if REAL_TIME_EXPLORATION:
        attributes = AttributeSet(
            FINGERPRINT_DATASET.candidate_attributes.get_attribute_by_id(
                attribute_id)
            for attribute_id in attribute_set_infos['attributes'])
    elif TRACE_DATA:
        attributes = AttributeSet(
            Attribute(attribute_id, TRACE_DATA['attributes'][str(
                attribute_id)])
            for attribute_id in attribute_set_infos['attributes'])
    attribute_names = [attribute.name for attribute in attributes]
    # If there is a fingerprint dataset, compute the additional/optional
    # results from it (the subset for now)
    fingerprint_sample = None
    if attribute_set_id == -1:
        pass  # Avoid trying to get the subset with an empty attribute set
    elif FINGERPRINT_DATASET:
        # Collect a sample of the resulting fingerprints
        attr_subset_sample = AttributeSetSample(
            FINGERPRINT_DATASET, attributes,
            params.getint('WebServer', 'fingerprint_sample_size'))
        attr_subset_sample.execute()
        fingerprint_sample = attr_subset_sample.result
    else:
        flash(
            'Please provide a fingerprint dataset to obtain more insight on '
            'the selected attributes',
            params.get('WebServer', 'flash_info_class'))
    # Compute the textual representation of the state of this attribute set
    attribute_set_state = None
    if attribute_set_infos['state'] == State.EXPLORED:
        attribute_set_state = 'Explored'
    elif attribute_set_infos['state'] == State.PRUNED:
        attribute_set_state = 'Pruned'
    elif attribute_set_infos['state'] == State.SATISFYING:
        attribute_set_state = 'Satisfying the threshold'
    elif attribute_set_infos['state'] == State.EMPTY_NODE:
        attribute_set_state = 'Starting empty node'
    # Prepare a dictionary with the cost percentage of each dimension
    # { cost dimension => (bootstrap progress bar class,
    #                      # for pretty display
    #                      percentage of the cost of the candidate attributes)
    # }
    usability_cost_ratio = {}
    # The first explored attribute set holds the cost of the whole candidate
    # attributes, used as the 100% reference below
    if REAL_TIME_EXPLORATION:
        candidate_attributes_infos = (
            REAL_TIME_EXPLORATION.get_explored_attribute_sets(0, 1)[0])
    elif TRACE_DATA:
        candidate_attributes_infos = TRACE_DATA['exploration'][0]
    # NOTE(review): 'bootstrap_progess_bars' is misspelled but must match the
    # configuration key — do not "fix" it here without updating the config.
    bootstrap_progess_bars = (params.get(
        'WebServer', 'bootstrap_progess_bars').splitlines())
    # The total usability cost
    cost_percentage = (100 * attribute_set_infos['usability_cost']
                       / candidate_attributes_infos['usability_cost'])
    usability_cost_ratio['usability'] = (bootstrap_progess_bars[0],
                                         '%.2f' % cost_percentage)
    if attribute_set_id > -1:
        # For each cost dimension except the "weighted" ones
        can_attrs_cost_explanation = candidate_attributes_infos[
            'cost_explanation']
        progress_bar_class_id = 1  # 0 already taken
        for cost_dimension, cost_value in can_attrs_cost_explanation.items():
            if cost_dimension.startswith('weighted'):
                continue
            cost_percentage = (
                100 * attribute_set_infos['cost_explanation'][cost_dimension]
                / cost_value)
            # Cycle through the available progress bar classes
            usability_cost_ratio[cost_dimension] = (
                bootstrap_progess_bars[progress_bar_class_id
                                       % len(bootstrap_progess_bars)],
                '%.2f' % cost_percentage)
            progress_bar_class_id += 1
    # Display the attribute information page
    return render_template('attribute-set-information.html',
                           attribute_set_infos=attribute_set_infos,
                           attribute_names=attribute_names,
                           attribute_set_state=attribute_set_state,
                           usability_cost_ratio=usability_cost_ratio,
                           fingerprint_sample=fingerprint_sample,
                           javascript_parameters=params)
def real_time_exploration_configuration():
    """Configure the assets for a real time exploration.

    On a POST request: validate every form field (dataset, threshold,
    exploration method, sensitivity/usability cost measures and their
    parameters), collecting per-field messages in `errors`. When everything
    is valid, build the measures and the exploration, start it
    asynchronously, and redirect to the real time exploration page.
    Otherwise (or on GET), render the configuration page with the errors.
    """
    global TRACE_DATA
    global FINGERPRINT_DATASET
    global REAL_TIME_EXPLORATION
    global EXPLORATION_PROCESS
    # The exploration methods, sensitivity and usability cost measures
    # (display names, in the declaration order of the module-level dicts)
    exploration_methods = list(EXPLORATION_METHODS.keys())
    sensitivity_measures = list(SENSITIVITY_MEASURES.keys())
    usability_cost_measures = list(USABILITY_COST_MEASURES.keys())
    # We store a dictionary mapping each form field to an error message if the
    # field is invalid
    errors = {}
    # -------------------------- POST request handle --------------------------
    if request.method == 'POST':
        # Clear the previous data if there were some
        TRACE_DATA, FINGERPRINT_DATASET = None, None
        if EXPLORATION_PROCESS:
            EXPLORATION_PROCESS.terminate()
            EXPLORATION_PROCESS = None
        REAL_TIME_EXPLORATION = None
        # ------------ Manage the required fingerprint dataset file -----------
        # Check that the dataset file is in the received POST request
        fp_dataset_error_message = erroneous_post_file(
            request, 'fingerprint-dataset', expected_extension='csv')
        if fp_dataset_error_message:
            errors['fingerprint-dataset'] = fp_dataset_error_message
        # --------- End of the management of the required trace file ----------
        # ------------------ Handle the sensitivity threshold -----------------
        sens_thresh_error_message = erroneous_field(
            request, 'sensitivity-threshold',
            lambda v: v and is_str_float(v) and float(v) >= 0.0,
            'The sensitivity threshold should be a positive float.')
        if sens_thresh_error_message:
            errors['sensitivity-threshold'] = sens_thresh_error_message
        else:
            sensitivity_threshold = float(
                request.form['sensitivity-threshold'])
        # -------------- End of handle the sensitivity threshold --------------
        # ------------------- Handle the exploration method -------------------
        exploration_method_error_message = erroneous_field(
            request, 'exploration-method',
            lambda v: v in exploration_methods,
            'The exploration method is unknown.')
        if exploration_method_error_message:
            errors['exploration-method'] = exploration_method_error_message
        else:
            exploration_method = request.form['exploration-method']
        # --------------- End of handle the exploration method ----------------
        # ------------------ Handle the FPSelect parameters -------------------
        # NOTE(review): if the exploration-method field was invalid,
        # `exploration_method` is unassigned and the comparison below raises
        # a NameError — confirm the form always submits a valid value.
        fpselect_method_name = exploration_methods[0]
        if exploration_method == fpselect_method_name:
            use_pruning_methods = 'use-pruning-methods' in request.form
            # Check that it is a strictly positive integer comprised in the
            # expected range
            minimum_explored_paths = params.getint(
                'WebServer', 'fpselect_minimum_explored_paths')
            maximum_explored_paths = params.getint(
                'WebServer', 'fpselect_maximum_explored_paths')
            explored_paths_error_message = erroneous_field(
                request, 'explored-paths',
                lambda v: v.isdigit() and (0 < minimum_explored_paths
                                           <= int(v)
                                           <= maximum_explored_paths),
                'The number of explored paths is required to be a strictly '
                'positive integer comprised in '
                f'[{minimum_explored_paths}; {maximum_explored_paths}].')
            if explored_paths_error_message:
                errors['explored-paths'] = explored_paths_error_message
            else:
                explored_paths = int(request.form['explored-paths'])
        # -------------- End of handle the FPSelect parameters ----------------
        # ------------------ Handle the sensitivity measure -------------------
        sensitivity_measure_error_message = erroneous_field(
            request, 'sensitivity-measure',
            lambda v: v in sensitivity_measures,
            'Unknown sensitivity measure.')
        if sensitivity_measure_error_message:
            errors['sensitivity-measure'] = sensitivity_measure_error_message
        else:
            sensitivity_measure = request.form['sensitivity-measure']
        # -------------- End of handle the sensitivity measure ----------------
        # ------------- Handle the most common fingerprints (=k) --------------
        top_k_fps_sens_meas = sensitivity_measures[0]
        if sensitivity_measure == top_k_fps_sens_meas:
            minimum_common_fps = params.getint(
                'WebServer', 'top_k_fingerprints_sensitivity_measure_min_k')
            maximum_common_fps = params.getint(
                'WebServer', 'top_k_fingerprints_sensitivity_measure_max_k')
            # Check that it is a strictly positive integer and comprised in the
            # range
            top_k_fps_error_message = erroneous_field(
                request, 'most-common-fingerprints',
                lambda v: v.isdigit() and (0 < minimum_common_fps <= int(v)
                                           <= maximum_common_fps),
                'The number of explored paths is required to be a strictly '
                'positive integer and comprised in the range'
                f'[{minimum_common_fps}; {maximum_common_fps}].')
            if top_k_fps_error_message:
                errors['most-common-fingerprints'] = top_k_fps_error_message
            else:
                most_common_fingerprints = int(
                    request.form['most-common-fingerprints'])
        # --------- End of handle the most common fingerprints (=k) -----------
        # --- Initialize the dataset (needed to process the usability costs)
        candidate_attributes = None
        try:
            FINGERPRINT_DATASET = FingerprintDatasetFromCSVInMemory(
                request.files['fingerprint-dataset'])
            # We will need the candidate attributes afterwards
            candidate_attributes = FINGERPRINT_DATASET.candidate_attributes
        except MissingMetadatasFields as mmf_error:
            error_message = str(mmf_error)
            flash(error_message, params.get('WebServer', 'flash_error_class'))
            logger.error(error_message)
            errors['fingerprint-dataset'] = error_message
        logger.debug('The fingerprint dataset is set.')
        # ----------------- Handle the usability cost measure -----------------
        # The weights of the cost dimensions
        cost_dim_weights = {}
        # Check the chosen usability cost measure
        usab_cost_meas_error_message = erroneous_field(
            request, 'usability-cost-measure',
            lambda v: v in usability_cost_measures,
            'Unknown usability cost measure.')
        if usab_cost_meas_error_message:
            errors['usability-cost-measure'] = usab_cost_meas_error_message
        else:
            usability_cost_measure = request.form['usability-cost-measure']
        # All the usability cost measures for now include the memory cost
        # and the instability cost, check these two
        # The memory cost results
        memory_file_error_message = erroneous_post_file(
            request, 'memory-cost-results', expected_extension='csv')
        if memory_file_error_message:
            errors['memory-cost-results'] = memory_file_error_message
        # The memory cost weight
        memory_weight_error_message = erroneous_field(
            request, 'memory-cost-weight',
            lambda v: v and is_str_float(v) and float(v) >= 0.0,
            'The memory cost weight should be a positive float.')
        if memory_weight_error_message:
            errors['memory-cost-weight'] = memory_weight_error_message
        else:
            cost_dim_weights[CostDimension.MEMORY] = float(
                request.form['memory-cost-weight'])
        # Read the memory cost results (only when the dataset was loaded, as
        # the attribute names are resolved against its candidate attributes)
        if candidate_attributes:
            memory_cost_content = (request.files['memory-cost-results'].
                                   read().decode().splitlines())
            memory_costs = {}
            mem_file_reader = DictReader(memory_cost_content)
            for row in mem_file_reader:
                try:
                    attribute = candidate_attributes.get_attribute_by_name(
                        row['attribute'])
                    memory_costs[attribute] = float(row['average_size'])
                except KeyError as key_error:
                    error_message = (
                        f'The {key_error.args[0]} field is missing from '
                        'the memory cost results file.')
                    flash(error_message,
                          params.get('WebServer', 'flash_error_class'))
                    logger.error(error_message)
                    errors['memory-cost-results'] = error_message
                    break  # Exit the for loop
        # The instability cost results
        instab_file_error_message = erroneous_post_file(
            request, 'instability-cost-results', expected_extension='csv')
        if instab_file_error_message:
            errors['instability-cost-results'] = instab_file_error_message
        # The instability cost weight
        instab_weight_error_message = erroneous_field(
            request, 'instability-cost-weight',
            lambda v: v and is_str_float(v) and float(v) >= 0.0,
            'The instability cost weight should be a positive float.')
        if instab_weight_error_message:
            errors['instability-cost-weight'] = instab_weight_error_message
        else:
            cost_dim_weights[CostDimension.INSTABILITY] = float(
                request.form['instability-cost-weight'])
        # Read the instability cost results
        if candidate_attributes:
            instability_cost_content = (
                request.files['instability-cost-results'].read().decode(
                ).splitlines())
            instability_costs = {}
            instability_file_reader = DictReader(instability_cost_content)
            for row in instability_file_reader:
                try:
                    attribute = candidate_attributes.get_attribute_by_name(
                        row['attribute'])
                    instability_costs[attribute] = float(
                        row['proportion_of_changes'])
                except KeyError as key_error:
                    error_message = (
                        f'The {key_error.args[0]} field is missing from '
                        'the instability cost results file.')
                    flash(error_message,
                          params.get('WebServer', 'flash_error_class'))
                    logger.error(error_message)
                    errors['instability-cost-results'] = error_message
                    break  # Exit the for loop
        # If there is also the collection time to consider
        mem_inst_time_usab_cost = usability_cost_measures[1]
        if usability_cost_measure == mem_inst_time_usab_cost:
            # The collection time cost results
            ct_file_err_mess = erroneous_post_file(
                request, 'collection-time-cost-results',
                expected_extension='csv')
            if ct_file_err_mess:
                errors['collection-time-cost-results'] = ct_file_err_mess
            # The collection time cost weight
            col_time_weight_error_message = erroneous_field(
                request, 'collection-time-cost-weight',
                lambda v: v and is_str_float(v) and float(v) >= 0.0,
                'The weight of the collection time cost should be a '
                'positive float.')
            if col_time_weight_error_message:
                errors['collection-time-cost-weight'] = (
                    col_time_weight_error_message)
            else:
                cost_dim_weights[CostDimension.TIME] = float(
                    request.form['collection-time-cost-weight'])
            # Read the content of the collection time results
            if candidate_attributes:
                collection_time_content = (
                    request.files['collection-time-cost-results'].read(
                    ).decode().splitlines())
                collection_time_costs = {}
                coll_time_file_reader = DictReader(collection_time_content)
                for row in coll_time_file_reader:
                    try:
                        attribute = (
                            candidate_attributes.get_attribute_by_name(
                                row['attribute']))
                        # NOTE(review): bool(row['is_asynchronous']) is True
                        # for any non-empty string, including "False" —
                        # confirm the csv encodes this flag as empty/non-empty
                        collection_time_costs[attribute] = (
                            float(row['average_collection_time']),
                            bool(row['is_asynchronous']))
                    except KeyError as key_error:
                        err_mess = (
                            f'The {key_error.args[0]} field is missing '
                            'from the collection time cost results file.')
                        flash(err_mess,
                              params.get('WebServer', 'flash_error_class'))
                        logger.error(err_mess)
                        errors['collection-time-cost-results'] = err_mess
                        break  # Exit the for loop
        # ------------- End of handle the usability cost measure --------------
        # At the end, redirect to the real time exploration page if there are
        # no errors, otherwise redirect to the configuration page.
        if not errors:
            # --- Initialize the sensitivity measure
            sens_meas_class = SENSITIVITY_MEASURES[sensitivity_measure]
            # For now on, there is only the TopKFingerprints
            actual_sens_meas = sens_meas_class(FINGERPRINT_DATASET,
                                               most_common_fingerprints)
            logger.debug('Initialized the sensitivity measure '
                         f'{actual_sens_meas}.')
            # --- Initialize the usability cost measure
            usab_cost_meas_class = USABILITY_COST_MEASURES[
                usability_cost_measure]
            if usability_cost_measure == mem_inst_time_usab_cost:
                # Initialize the memory, instability, and collection time
                actual_usab_cost_meas = usab_cost_meas_class(
                    memory_costs, instability_costs, collection_time_costs,
                    cost_dim_weights)
            else:
                actual_usab_cost_meas = usab_cost_meas_class(
                    memory_costs, instability_costs, cost_dim_weights)
            logger.debug('Initialized the usability cost measure '
                         f'{actual_usab_cost_meas}.')
            # --- Initialize the exploration class
            exploration_class = EXPLORATION_METHODS[exploration_method]
            # If FPSelect
            if exploration_method == fpselect_method_name:
                exploration = exploration_class(actual_sens_meas,
                                                actual_usab_cost_meas,
                                                FINGERPRINT_DATASET,
                                                sensitivity_threshold,
                                                explored_paths,
                                                use_pruning_methods)
            else:
                exploration = exploration_class(actual_sens_meas,
                                                actual_usab_cost_meas,
                                                FINGERPRINT_DATASET,
                                                sensitivity_threshold)
            logger.debug(f'Initialized the exploration {exploration}.')
            # Execute the exploration in an asynchronous manner before
            # redirecting, keeping a handle on the process for termination
            REAL_TIME_EXPLORATION = exploration
            EXPLORATION_PROCESS = REAL_TIME_EXPLORATION.run_asynchronous()
            logger.debug('Redirecting to the real time exploration page')
            return redirect(url_for('real_time_exploration'))
    # -------------------- End of POST request handle ---------------------
    # Show the real time exploration configuration page
    return render_template('real-time-exploration-configuration.html',
                           params=params, errors=errors,
                           exploration_methods=exploration_methods,
                           sensitivity_measures=sensitivity_measures,
                           usability_cost_measures=usability_cost_measures)
from brfast.measures.distinguishability.unicity import (AttributeSetUnicity,
                                                        UNICITY_RATE_RESULT,
                                                        UNIQUE_FPS_RESULT,
                                                        TOTAL_BROWSERS_RESULT)
from brfast.measures.sensitivity.fpselect import TopKFingerprints
from brfast.measures.usability_cost.fpselect import (CostDimension,
                                                     MemoryInstability,
                                                     MemoryInstabilityTime)
from brfast.utils.conversion import is_str_float
from brfast.webserver.files_verification import trace_file_errors
from brfast.webserver.form_validation import (erroneous_field,
                                              erroneous_post_file)

# The Flask application, configured from the WebServer section of the
# project configuration (upload folder and a random secret key for the
# session/flash machinery, regenerated at every start)
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = params.get('WebServer', 'upload_folder')
app.secret_key = secrets.token_bytes(
    params.getint('WebServer', 'secret_key_size'))

# Set the exploration methods, the sensitivity measures, and the usability cost
# measures below. The display name => class mappings are used both to render
# the configuration forms and to instantiate the chosen classes.
EXPLORATION_METHODS = {
    'FPSelect': FPSelect,
    'Entropy': Entropy,
    'Conditional entropy': ConditionalEntropy
}
SENSITIVITY_MEASURES = {'Top-k fingerprints': TopKFingerprints}
USABILITY_COST_MEASURES = {
    'Memory and instability': MemoryInstability,
    'Memory, instability, and collection time': MemoryInstabilityTime
}
"""Module containing the sensitivity measures used in the FPSelect paper.""" import importlib from typing import List from loguru import logger from brfast.data.attribute import AttributeSet from brfast.data.dataset import FingerprintDataset from brfast.measures import SensitivityMeasure # from measures.similarity import TODO # Import the engine of the analysis module (pandas or modin) from brfast.config import params pd = importlib.import_module(params.get('DataAnalysis', 'engine')) PROPORTION_FIELD = 'proportion' def _get_top_k_fingerprints(dataframe: pd.DataFrame, attribute_names: List[str], k: int) -> pd.DataFrame: """Get a DataFrame with the k-most common fingerprints. Args: dataframe: The fingerprint dataset. attribute_names: The name of the attributes to consider. k: The parameter to specify the k-most common fingerprints to hold. Returns: