def _check_help_query_results():
  """Checks the results of the Cobalt help-query pipeline.

  This pipeline extracts the help queries from the input data and uses
  Forculus threshold encryption to encrypt them and then decrypt the ones
  that occur at least |threshold| times, where |threshold| is a number read
  from a config file.

  The straight-counting pipeline extracts the help queries and then
  accumulates all of them in plain text regardless of how frequently they
  occur.

  In this test we compare the results of the Cobalt pipeline against the
  results of the straight-counting pipeline. The Cobalt results should be
  equal to those elements of the straight-counting results with counts at
  least |threshold|.
  """
  print "\nEnd-to-end test: Checking help-query results."
  with file_util.openFileForReading(file_util.FORCULUS_HELP_QUERY_CONFIG,
      file_util.CONFIG_DIR) as cf:
    config = forculus.Config.from_csv(cf)
  # Read the output of the straight-counting pipeline from the csv file and
  # put into a dictionary all entries with count >= config.threshold.
  # The dictionary will map help queries to their counts.
  with file_util.openForReading(
      file_util.POPULAR_HELP_QUERIES_CSV_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    straight_counting_results = {row[0]: int(row[1]) for row in reader
                                 if int(row[1]) >= config.threshold}
  # Read the output of the Cobalt prototype pipeline from the csv file and
  # put all entries into a dictionary. The dictionary will map help queries
  # to their counts.
  with file_util.openForReading(
      file_util.HELP_QUERY_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    cobalt_prototype_results = {row[0]: int(row[1]) for row in reader}
  # Check that the two dictionaries are equal.
  if straight_counting_results == cobalt_prototype_results:
    print "PASS"
  else:
    print "**** TEST FAILURE ****"
    a = set(straight_counting_results.items())
    b = set(cobalt_prototype_results.items())
    print "straight-counting minus Cobalt:", a - b
    print "Cobalt minus straight-counting:", b - a
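# The following sketch is illustrative only and is never called by the test.
# It demonstrates, on made-up in-memory rows, the same threshold filtering
# that _check_help_query_results() applies to the straight-counting CSV.
def _example_threshold_filtering():
  threshold = 3  # Hypothetical; the real value comes from the Forculus
                 # config file.
  rows = [("how do I print", "5"), ("how do I exit", "2"),
          ("how do I save", "3")]
  # Keep only the queries that occur at least |threshold| times, exactly as
  # the dictionary comprehension above does for rows read from the CSV.
  popular = {q: int(n) for (q, n) in rows if int(n) >= threshold}
  # The end-to-end test passes iff the Forculus pipeline decrypted exactly
  # these queries with exactly these counts.
  assert popular == {"how do I print": 5, "how do I save": 3}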
def buildUsageByModuleJsFromRapporOutput(sc_values, rappor_out_file, jsvar,
                                         params_jsvar, config_file):
  """A helper function for buildUsageByModuleJs().

  This function will be invoked twice: once for the metric without
  differentially-private release and once for the metric with
  differentially-private release.

  Args:
    sc_values: {dictionary} A dictionary of actual values from the
      straight-counting pipeline.
    rappor_out_file: {string} The path to the file containing RAPPOR output
      data.
    jsvar: {string} The name of the JavaScript variable to be used for the
      data table.
    params_jsvar: {string} The name of the JavaScript variable to be used
      for the RAPPOR parameters.
    config_file: {string} The path of the file containing RAPPOR config data.

  Returns:
    {tuple of two strings} (usage_data, rappor_parameters). See
    buildUsageByModuleJs() for details.
  """
  # We skip row zero because it is the header row. We are going to visualize
  # the data as an interval chart and so we want to compute the high and
  # low 95% confidence interval values, which we may do using the
  # "std_error" column, column 2.
  with file_util.openForReading(rappor_out_file) as csvfile:
    reader = csv.reader(csvfile)
    data = [{"module": row[0],
             "estimate": float(row[1]),
             "actual": sc_values.get(row[0], 0),
             "low": float(row[1]) - 1.96 * float(row[2]),
             "high": float(row[1]) + 1.96 * float(row[2])}
            for row in reader if reader.line_num > 1]
  usage_data_js = buildDataTableJs(
      data=data,
      var_name=jsvar,
      description={"module": ("string", "Module"),
                   "estimate": ("number", "Estimate"),
                   "actual": ("number", "Actual"),
                   # The role: 'interval' property is what tells the Google
                   # Visualization API to draw an interval chart.
                   "low": ("number", "Low", {'role': 'interval'}),
                   "high": ("number", "High", {'role': 'interval'})},
      columns_order=("module", "estimate", "actual", "low", "high"),
      order_by=("estimate", "desc"))
  # RAPPOR parameters
  rappor_params_js = "{} = {};".format(params_jsvar,
      readRapporConfigParamsFromFile(config_file).to_json())
  return (usage_data_js, rappor_params_js)
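# Illustrative sketch (not used by the pipeline): shows how the "low" and
# "high" bounds above follow from a RAPPOR estimate and its standard error.
# 1.96 is the z-score bounding the central 95% of a normal distribution, so
# the interval is estimate +/- 1.96 * std_error. The numbers are made up.
def _example_confidence_interval():
  estimate, std_error = 120.0, 10.0
  low = estimate - 1.96 * std_error   # 120 - 19.6 = 100.4
  high = estimate + 1.96 * std_error  # 120 + 19.6 = 139.6
  assert abs(low - 100.4) < 1e-9 and abs(high - 139.6) < 1e-9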
def buildItemAndCountJs(filename, varname1, varname2, item_column,
                        item_description):
  """Builds several strings defining variables used for visualization.

  Reads a CSV file containing two columns, an item column and a count
  column, and uses the data to build two JavaScript strings defining
  DataTables containing the data. The two DataTables will be the same except
  for the order of the columns: the first DataTable will have the count
  column first and the second DataTable will have the item column first.

  Args:
    filename: {string} The full path of the CSV file to read.
    varname1: {string} The name of the first JavaScript variable to
      generate.
    varname2: {string} The name of the second JavaScript variable to
      generate. If this is None then the second returned string will also
      be None.
    item_column: {string} The name of the item column to use in the
      generated JS.
    item_description: The description string to use in the generated JS.

  Returns:
    {tuple of two strings} Each is of the form <varname>=<json>, where
    <json> is a json string defining a data table. In the first returned
    string <varname> will be |varname1| and the "count" column will come
    first in the DataTable. In the second returned string <varname> will be
    |varname2| and |item_column| will come first in the DataTable.
  """
  with file_util.openForReading(filename) as csvfile:
    reader = csv.reader(csvfile)
    data = [{item_column: row[0], "count": int(row[1])} for row in reader]
  count_first_string = buildDataTableJs(
      data=data,
      var_name=varname1,
      description={item_column: ("string", item_description),
                   "count": ("number", "Count")},
      columns_order=("count", item_column),
      order_by=(("count", "desc"), item_column))
  item_first_string = None
  if varname2 is not None:
    item_first_string = buildDataTableJs(
        data=data,
        var_name=varname2,
        description={item_column: ("string", item_description),
                     "count": ("number", "Count")},
        columns_order=(item_column, "count"),
        order_by=(("count", "desc"), item_column))
  return (count_first_string, item_first_string)
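# Hypothetical usage sketch for buildItemAndCountJs(); not called anywhere.
# The file name and JS variable names below are invented for illustration;
# real callers pass file_util constants.
def _example_build_item_and_count_js():
  count_first, item_first = buildItemAndCountJs(
      "/tmp/url_counts.csv",  # hypothetical CSV of (url, count) rows
      "url_by_count_data",    # JS variable for the count-first DataTable
      "url_by_name_data",     # JS variable for the item-first DataTable
      "url", "URL")
  # Each returned string has the form "<varname>=<json>" and can be written
  # verbatim into a generated .js file.
  return (count_first, item_first)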
def buildUsageByHourJs():
  """Builds several strings defining variables used for visualization.

  Reads two CSV files containing the usage-by-hour data for the
  straight-counting pipeline and the Cobalt prototype pipeline respectively
  and uses them to build two JavaScript strings defining DataTables
  containing the data and one string describing basic RAPPOR parameters.

  Returns:
    {tuple of three strings} (sc_string, cobalt_string, params_string). The
    first two strings are of the form <var_name>=<json>, where |json| is a
    json string defining a data table. The |var_name|s are respectively
    USAGE_BY_HOUR_SC_JS_VAR_NAME and USAGE_BY_HOUR_JS_VAR_NAME.
    params_string is a json string containing RAPPOR parameters.
  """
  # straight-counting:
  # Read the data from the csv file and put it into lists.
  with file_util.openForReading(
      file_util.USAGE_BY_HOUR_CSV_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    # |data| will be used to generate the visualization data for the
    # straight-counting pipeline.
    data = []
    # |values| will be used below to include the actual values along with
    # the RAPPOR estimates in the visualization of the Cobalt pipeline.
    values = []
    hour = 0
    for row in reader:
      data.append({"hour": hour, "usage": int(row[0])})
      values.append(int(row[0]))
      hour += 1
  usage_by_hour_sc_js = buildDataTableJs(
      data=data,
      var_name=USAGE_BY_HOUR_SC_JS_VAR_NAME,
      description={"hour": ("number", "Hour of Day"),
                   "usage": ("number", "Usage")},
      columns_order=("hour", "usage"))

  # cobalt:
  # Here the CSV file is the output of the RAPPOR analyzer. We read it and
  # put the data into a list of dictionaries. We skip row zero because it is
  # the header row. We are going to visualize the data as an interval chart
  # and so we want to compute the high and low 95% confidence interval
  # values, which we may do using the "std_error" column, column 2.
  with file_util.openForReading(
      file_util.HOUR_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    data = [{"hour": int(row[0]),
             "estimate": max(float(row[1]), 0),
             "actual": values[int(row[0])],
             "low": max(float(row[1]) - 1.96 * float(row[2]), 0),
             "high": float(row[1]) + 1.96 * float(row[2])}
            for row in reader if reader.line_num > 1]
  usage_by_hour_cobalt_js = buildDataTableJs(
      data=data,
      var_name=USAGE_BY_HOUR_JS_VAR_NAME,
      description={"hour": ("number", "Hour"),
                   "estimate": ("number", "Estimate"),
                   "actual": ("number", "Actual"),
                   # The role: 'interval' property is what tells the Google
                   # Visualization API to draw an interval chart.
                   "low": ("number", "Low", {'role': 'interval'}),
                   "high": ("number", "High", {'role': 'interval'})},
      columns_order=("hour", "estimate", "low", "high", "actual"),
      order_by=("hour", "asc"))

  # RAPPOR parameters
  rappor_params_js = "{} = {};".format(
      USAGE_BY_HOUR_PARAMS_JS_VAR_NAME,
      readRapporConfigParamsFromFile(file_util.RAPPOR_HOUR_CONFIG).to_json())

  return (usage_by_hour_sc_js, usage_by_hour_cobalt_js, rappor_params_js)
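# Illustrative sketch (not used by the pipeline): RAPPOR estimates for rarely
# observed values can come out negative, so buildUsageByHourJs() clamps the
# estimate and the low interval bound at zero to keep the chart above the
# axis. The numbers are made up.
def _example_clamp_negative_estimate():
  estimate, std_error = -4.0, 3.0
  clamped = max(estimate, 0)                 # 0 rather than -4.0
  low = max(estimate - 1.96 * std_error, 0)  # 0 rather than -9.88
  high = estimate + 1.96 * std_error         # 1.88; left unclamped
  assert clamped == 0 and low == 0 and high > 0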
def buildUsageAndRatingByCityJs():
  """Builds several strings defining variables used for visualization.

  Reads two CSV files containing the usage-and-rating-by-city data for the
  straight-counting pipeline and the Cobalt prototype pipeline respectively
  and uses them to build two JavaScript strings defining DataTables
  containing the data.

  Returns:
    {tuple of two strings} (sc_string, cobalt_string). Each is of the form
    <var_name>=<json>, where |json| is a json string defining a data table.
    The |var_name|s are respectively USAGE_BY_CITY_SC_JS_VAR_NAME and
    USAGE_BY_CITY_JS_VAR_NAME.
  """
  # straight-counting:
  # Read the data from the csv file and put it into lists.
  with file_util.openForReading(
      file_util.USAGE_BY_CITY_CSV_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    # |data| will be used to generate the visualization data for the
    # straight-counting pipeline.
    data = []
    # |values| will be used below to include the actual values along with
    # the RAPPOR estimates in the visualization of the Cobalt pipeline.
    values = []
    for row in reader:
      data.append({"city": row[0], "usage": int(row[1]),
                   "rating": float(row[2])})
      if int(row[1]) > 50:
        values.append({"city": row[0], "usage": int(row[1]),
                       "type": "actual", "radius_95": 0,
                       "rating": float(row[2])})
  usage_and_rating_by_city_sc_js = buildDataTableJs(
      data=data,
      var_name=USAGE_BY_CITY_SC_JS_VAR_NAME,
      description={"city": ("string", "City"),
                   "rating": ("number", "Rating"),
                   "usage": ("number", "Usage")},
      columns_order=("city", "rating", "usage"),
      order_by=("usage", "desc"))

  # cobalt:
  # Here the CSV file is the output of the RAPPOR analyzer. We read it and
  # put the data into a list of dictionaries. We skip row zero because it is
  # the header row.
  with file_util.openForReading(
      file_util.CITY_RATINGS_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    data = [{"city": row[0],
             "usage": float(row[1]),
             "type": "estimate",
             "radius_95": 1.96 * float(row[2]),
             "rating": float(row[7])}
            for row in reader if reader.line_num > 1]
  data.extend(values)
  usage_and_rating_by_city_cobalt_js = buildDataTableJs(
      data=data,
      var_name=USAGE_BY_CITY_JS_VAR_NAME,
      description={"city": ("string", "City"),
                   "usage": ("number", "Usage"),
                   "rating": ("number", "Rating"),
                   "type": ("string", "Estimate or Actual"),
                   "radius_95": ("number", "95% conf. interval radius")},
      columns_order=("city", "usage", "rating", "type", "radius_95"),
      order_by=("usage", "desc"))
  return (usage_and_rating_by_city_sc_js, usage_and_rating_by_city_cobalt_js)
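# Illustrative sketch (not used by the pipeline): buildUsageAndRatingByCityJs()
# plots each sufficiently-used city twice, once as a RAPPOR estimate with a
# 95% confidence radius and once as the straight-counted actual with radius
# zero, so the two series can be compared in one chart. The rows below are
# made up.
def _example_city_rows():
  std_error = 12.0
  estimate_row = {"city": "Springfield", "usage": 205.3, "rating": 3.9,
                  "type": "estimate", "radius_95": 1.96 * std_error}
  actual_row = {"city": "Springfield", "usage": 198, "rating": 4.1,
                "type": "actual", "radius_95": 0}
  return [estimate_row, actual_row]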
def buildUsageByModuleJs():
  """Builds several strings defining variables used for visualization.

  Reads some CSV files containing the usage-by-module data for the
  straight-counting pipeline and the Cobalt prototype pipeline respectively
  and uses them to build three JavaScript strings defining DataTables
  containing the data and two JavaScript strings defining RAPPOR parameters.

  Returns:
    {tuple of five strings} (sc_string, cobalt_string, cobalt_with_pr_string,
    rappor_parameters, rappor_with_pr_parameters). The "_pr_" variables refer
    to the version of the RAPPOR metric that uses very weak RAPPOR parameters
    but then adds Laplace noise at the end to effect differentially-private
    release. So "_pr_" stands for "private release". The first three strings
    are of the form <var_name>=<json>, where |json| is a json string defining
    a data table. The |var_name|s are respectively
    USAGE_BY_MODULE_SC_JS_VAR_NAME, USAGE_BY_MODULE_JS_VAR_NAME, and
    USAGE_BY_MODULE_PR_JS_VAR_NAME. rappor_parameters is a json string
    containing values for k, h, m, p, q, f.
  """
  # straight-counting:
  # Read the data from the csv file and put it into a list and a dictionary.
  with file_util.openForReading(
      file_util.USAGE_BY_MODULE_CSV_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    # |data| will be used to generate the visualization data for the
    # straight-counting pipeline.
    data = []
    # |values| will be used below to include the actual values along with
    # the RAPPOR estimates in the visualization of the Cobalt pipeline.
    values = {}
    for row in reader:
      data.append({"module": row[0], "count": int(row[1])})
      values[row[0]] = int(row[1])
  usage_by_module_sc_js = buildDataTableJs(
      data=data,
      var_name=USAGE_BY_MODULE_SC_JS_VAR_NAME,
      description={"module": ("string", "Module"),
                   "count": ("number", "Count")},
      columns_order=("module", "count"),
      order_by=("count", "desc"))

  # cobalt:
  # Here the CSV file is the output of the RAPPOR analyzer; the helper
  # function reads it and builds the data table.
  usage_by_module_cobalt_js, rappor_params_js = \
      buildUsageByModuleJsFromRapporOutput(values,
          file_util.MODULE_NAME_ANALYZER_OUTPUT_FILE_NAME,
          USAGE_BY_MODULE_JS_VAR_NAME,
          USAGE_BY_MODULE_PARAMS_JS_VAR_NAME,
          file_util.RAPPOR_MODULE_NAME_CONFIG)

  # cobalt with differentially-private release:
  usage_by_module_cobalt_with_pr_js, rappor_with_pr_params_js = \
      buildUsageByModuleJsFromRapporOutput(values,
          file_util.MODULE_NAME_PR_ANALYZER_OUTPUT_FILE_NAME,
          USAGE_BY_MODULE_PR_JS_VAR_NAME,
          USAGE_BY_MODULE_PR_PARAMS_JS_VAR_NAME,
          file_util.RAPPOR_MODULE_NAME_PR_CONFIG)

  return (usage_by_module_sc_js, usage_by_module_cobalt_js,
          usage_by_module_cobalt_with_pr_js, rappor_params_js,
          rappor_with_pr_params_js)
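# For reference, the RAPPOR parameters named in the docstring above are the
# standard ones from the RAPPOR paper:
#   k - number of bits in each Bloom filter,
#   h - number of hash functions used to set Bloom filter bits,
#   m - number of cohorts,
#   f - probability governing the permanent randomized response,
#   p - probability that a 0 bit is reported as 1,
#   q - probability that a 1 bit is reported as 1.
# A hypothetical example of a generated parameters string (the actual shape
# and values come from readRapporConfigParamsFromFile().to_json() and the
# config files, so this is only a sketch):
#   usage_by_module_rappor_params = {"k": 128, "h": 2, "m": 64,
#                                    "p": 0.5, "q": 0.75, "f": 0.5};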