Beispiel #1
0
def _check_help_query_results():
    """ Checks the results of the Cobalt help-query pipeline. This pipeline
  exracts the help queries from the input data and uses Forculus threshold
  encryption to encrypt them and then decrypt the ones that occur at least
  |threshold| times, where |threshold| is a number read from a config file.

  The straight counting pipline extracts the help queries and then accumulates
  all of them in plain text regardless of how frequently they occur.

  In this test we compare the results of the Cobalt pipeline against the
  results of the straight-counting pipeline. The Cobalt results should be
  equal to those elements of the straight-counting results with counts
  at least |threshold|.
  """

    print "\nEnd-to-end test: Checking help-query-results."
    with file_util.openFileForReading(file_util.FORCULUS_HELP_QUERY_CONFIG,
                                      file_util.CONFIG_DIR) as cf:
        config = forculus.Config.from_csv(cf)

    # Read the output of the straight-counting pipeline from the csv file and
    # put into a dictionary all entries with count >= config.threshold
    # The dictionary will map help queries to their counts.
    with file_util.openForReading(
            file_util.POPULAR_HELP_QUERIES_CSV_FILE_NAME) as csvfile:
        reader = csv.reader(csvfile)
        straight_counting_results = {
            row[0]: int(row[1])
            for row in reader if int(row[1]) >= config.threshold
        }

    # Read the output of the Cobalt prototype pipeline from the csv file and
    # put all entries into a dictionary. The dictionary will map help queries to
    # their counts.
    with file_util.openForReading(
            file_util.HELP_QUERY_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
        reader = csv.reader(csvfile)
        cobalt_prototype_results = {row[0]: int(row[1]) for row in reader}

    # Check that the two dictionries are equal.
    if straight_counting_results == cobalt_prototype_results:
        print "PASS"
    else:
        print "**** TEST FAILURE ****"
        a = set(straight_counting_results.items())
        b = set(cobalt_prototype_results.items())
        print "straight-counting minus Cobalt:", a - b
        print "Cobalt minus straight-counting:", b - a
Beispiel #2
0
def buildUsageByModuleJsFromRapporOutput(sc_values, rappor_out_file, jsvar,
                                         params_jsvar, config_file):
  """Builds the usage-by-module JS strings for one RAPPOR output file.

  A helper function for buildUsageByModuleJs(), which invokes it twice: once
  for the metric without differentially-private release and once for the
  metric with differentially-private release.

  Args:
    sc_values: {dictionary} Maps module names to the actual values from the
    straight-counting pipeline. Modules missing from it get an actual value
    of 0.

    rappor_out_file: {string} The path to the file containing RAPPOR output
    data.

    jsvar: {string} The name of the Javascript variable to be used for the
    data table.

    params_jsvar: {string} The name of the Javascript variable to be used for
    the RAPPOR parameters.

    config_file: {string} The path of the file containing RAPPOR config data.

  Returns:
    {tuple of two strings} (usage_data, rappor_parameters).
    See buildUsageByModuleJs() for details.
  """
  # The data is visualized as an interval chart, so for each row we derive
  # the low and high 95% confidence interval values from the estimate
  # (column 1) and the "std_error" column (column 2).
  table_rows = []
  with file_util.openForReading(rappor_out_file) as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
      # The first line of the file is the header row; skip it.
      if reader.line_num <= 1:
        continue
      module = row[0]
      estimate = float(row[1])
      margin = 1.96 * float(row[2])
      table_rows.append({"module" : module, "estimate": estimate,
                         "actual" : sc_values.get(module, 0),
                         "low" : estimate - margin,
                         "high": estimate + margin})

  # The role: 'interval' property is what tells the Google Visualization API
  # to draw an interval chart.
  interval_role = {'role': 'interval'}
  usage_data_js = buildDataTableJs(
      data=table_rows,
      var_name=jsvar,
      description={"module": ("string", "Module"),
                   "estimate": ("number", "Estimate"),
                   "actual": ("number", "Actual"),
                   "low": ("number", "Low", interval_role),
                   "high": ("number", "High", interval_role)},
      columns_order=("module", "estimate", "actual", "low", "high"),
      order_by=("estimate", "desc"))

  # RAPPOR parameters, emitted as a "<params_jsvar> = <json>;" assignment.
  params_json = readRapporConfigParamsFromFile(config_file).to_json()
  rappor_params_js = "{} = {};".format(params_jsvar, params_json)

  return (usage_data_js, rappor_params_js)
Beispiel #3
0
def buildItemAndCountJs(filename, varname1, varname2, item_column,
                        item_description):
  """Builds up to two strings defining variables used for visualization.

  Reads a CSV file containing two columns, an item column and a count
  column, and uses the data to build two JavaScript strings defining
  DataTables containing the data. The two DataTables are the same except
  for the order of the columns: the first DataTable has the count column
  first and the second DataTable has the item column first.

  Args:
    filename: {string} The full path of the CSV file to read.
    varname1: {string} The name of the first javascript variable to generate.
    varname2: {string} The name of the second javascript variable to generate.
                       If this is None then the second returned string will
                       also be None.
    item_column: {string} The name of the item column to use in the generated
                          JS.
    item_description: The description string to use in the generated JS.

  Returns:
    {tuple of two string} of the form <varname>=<json>, where <json> is a json
    string defining a data table. In the first returned string <varname> will
    be |varname1| and the "count" column will come first in the DataTable.
    In the second returned string <varname> will be |varname2| and
    |item_column| will come first in the DataTable. The second element is
    None when |varname2| is None.
  """

  rows = []
  with file_util.openForReading(filename) as csvfile:
    for record in csv.reader(csvfile):
      rows.append({item_column : record[0], "count": int(record[1])})

  def build_table(var_name, columns_order):
    # Both tables share the data, the column descriptions and the sort order
    # (count descending, then item); only the variable name and the column
    # order differ. A fresh description dict is built for every call.
    return buildDataTableJs(
        data=rows,
        var_name=var_name,
        description={item_column: ("string", item_description),
                     "count": ("number", "Count")},
        columns_order=columns_order,
        order_by=(("count", "desc"), item_column))

  count_first_string = build_table(varname1, ("count", item_column))
  if varname2 is None:
    return (count_first_string, None)
  item_first_string = build_table(varname2, (item_column, "count"))
  return (count_first_string, item_first_string)
Beispiel #4
0
def buildUsageByHourJs():
  """Builds several strings defining variables used for visualization.

  Reads two CSV files containing the usage-by-hour data for the
  straight-counting pipeline and the Cobalt prototype pipeline respectively
  and uses them to build two JavaScript strings defining DataTables
  containing the data and one string describing basic RAPPOR parameters.

  Returns:
    {tuple of three strings} (sc_string, cobalt_string, params_string). The
    first two strings are of the form <var_name>=<json>, where |json| is a
    json string defining a data table. The |var_name|s are respectively
    USAGE_BY_HOUR_SC_JS_VAR_NAME and USAGE_BY_HOUR_JS_VAR_NAME.
    params_string assigns a json string containing RAPPOR parameters to
    USAGE_BY_HOUR_PARAMS_JS_VAR_NAME.
  """
  # straight-counting:
  # Each row of the csv file holds the usage for one hour; the hour is the
  # zero-based position of the row in the file. |values| is also used below
  # to include the actual values along with the RAPPOR estimates in the
  # visualization of the Cobalt pipeline.
  with file_util.openForReading(
      file_util.USAGE_BY_HOUR_CSV_FILE_NAME) as csvfile:
    values = [int(row[0]) for row in csv.reader(csvfile)]
  sc_rows = [{"hour" : hour, "usage": usage}
             for hour, usage in enumerate(values)]
  usage_by_hour_sc_js = buildDataTableJs(
      data=sc_rows,
      var_name=USAGE_BY_HOUR_SC_JS_VAR_NAME,
      description = {"hour": ("number", "Hour of Day"),
                     "usage": ("number", "Usage")},
      columns_order=("hour", "usage"))

  # cobalt:
  # Here the CSV file is the output of the RAPPOR analyzer. The data is
  # visualized as an interval chart, so for each row we compute the high and
  # low 95% confidence interval values from the estimate (column 1) and the
  # "std_error" column (column 2). The estimate and the low value are
  # clamped so that they are never negative.
  cobalt_rows = []
  with file_util.openForReading(
      file_util.HOUR_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
      # The first line of the file is the header row; skip it.
      if reader.line_num <= 1:
        continue
      hour = int(row[0])
      estimate = float(row[1])
      actual = values[hour]
      margin = 1.96 * float(row[2])
      cobalt_rows.append({"hour" : hour, "estimate": max(estimate, 0),
                          "actual": actual,
                          "low" : max(estimate - margin, 0),
                          "high": estimate + margin})

  # The role: 'interval' property is what tells the Google Visualization API
  # to draw an interval chart.
  interval_role = {'role': 'interval'}
  usage_by_hour_cobalt_js = buildDataTableJs(
      data=cobalt_rows,
      var_name=USAGE_BY_HOUR_JS_VAR_NAME,
      description={"hour": ("number", "Hour"),
                   "estimate": ("number", "Estimate"),
                   "actual": ("number", "Actual"),
                   "low": ("number", "Low", interval_role),
                   "high": ("number", "High", interval_role)},
      columns_order=("hour", "estimate", "low", "high", "actual"),
      order_by=("hour", "asc"))

  # RAPPOR parameters
  params_json = readRapporConfigParamsFromFile(
      file_util.RAPPOR_HOUR_CONFIG).to_json()
  rappor_params_js = "{} = {};".format(USAGE_BY_HOUR_PARAMS_JS_VAR_NAME,
                                       params_json)

  return (usage_by_hour_sc_js, usage_by_hour_cobalt_js, rappor_params_js)
Beispiel #5
0
def buildUsageAndRatingByCityJs():
  """Builds two strings defining variables used for visualization.

  Reads the CSV files containing the usage-and-rating-by-city data for the
  straight-counting pipeline and for the RAPPOR analyzer output of the
  Cobalt pipeline and uses them to build two JavaScript strings defining
  DataTables containing the data.

  Returns:
    {tuple of two strings} (sc_string, cobalt_string), each of the form
    <var_name>=<json>, where |json| is a json string defining a data table.
    The |var_name|s are respectively USAGE_BY_CITY_SC_JS_VAR_NAME and
    USAGE_BY_CITY_JS_VAR_NAME.
  """
  # straight-counting:
  # |sc_rows| is the visualization data for the straight-counting pipeline.
  # |actual_rows| holds the cities with usage above 50; they are appended
  # to the Cobalt table below, tagged as "actual" with a zero interval
  # radius, so that they appear next to the RAPPOR estimates.
  sc_rows = []
  actual_rows = []
  with file_util.openForReading(
      file_util.USAGE_BY_CITY_CSV_FILE_NAME) as csvfile:
    for row in csv.reader(csvfile):
      city = row[0]
      usage = int(row[1])
      rating = float(row[2])
      sc_rows.append({"city" : city, "usage": usage, "rating": rating})
      if usage > 50:
        actual_rows.append({"city" : city, "usage": usage,
                            "type" : "actual",
                            "radius_95" : 0,
                            "rating": rating})
  usage_and_rating_by_city_sc_js = buildDataTableJs(
      data=sc_rows,
      var_name=USAGE_BY_CITY_SC_JS_VAR_NAME,
      description = {"city": ("string", "City"),
                     "rating": ("number", "Rating"),
                     "usage": ("number", "Usage")},
      columns_order=("city", "rating", "usage"),
      order_by=("usage", "desc"))

  # cobalt:
  # Here the CSV file is the output of the RAPPOR analyzer. Column 1 is used
  # as the usage estimate, 1.96 times column 2 as the radius of the 95%
  # confidence interval and column 7 as the rating.
  cobalt_rows = []
  with file_util.openForReading(
      file_util.CITY_RATINGS_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
      # The first line of the file is the header row; skip it.
      if reader.line_num <= 1:
        continue
      cobalt_rows.append({"city" : row[0], "usage": float(row[1]),
                          "type" : "estimate",
                          "radius_95" : 1.96 * float(row[2]),
                          "rating": float(row[7])})
  # The estimate rows come first, followed by the actual rows.
  cobalt_rows.extend(actual_rows)

  # NOTE(review): order_by names an "estimate" column, but this table has no
  # such column in its data or description (the numeric columns are "usage",
  # "rating" and "radius_95"). Possibly "usage" was intended - confirm how
  # buildDataTableJs treats an unknown sort key before changing it.
  usage_and_rating_by_city_cobalt_js = buildDataTableJs(
      data=cobalt_rows,
      var_name=USAGE_BY_CITY_JS_VAR_NAME,
      description={"city": ("string", "City"),
                   "usage": ("number", "Usage"),
                   "rating": ("number", "Rating"),
                   "type" : ("string", "Estimate or Actual"),
                   "radius_95": ("number", "95% conf. intlv. radius")},
      columns_order=("city", "usage", "rating", "type", "radius_95"),
      order_by=("estimate", "desc"))

  return (usage_and_rating_by_city_sc_js, usage_and_rating_by_city_cobalt_js)
Beispiel #6
0
def buildUsageByModuleJs():
  """Builds several strings defining variables used for visualization.

  Reads some CSV files containing the usage-by-module data for the straight-
  counting pipeline and the Cobalt prototype pipeline respectively and uses
  them to build three JavaScript strings defining DataTables containing the
  data and two JavaScript strings defining RAPPOR parameters.

  Returns:
    {tuple of five strings} (sc_string, cobalt_string, cobalt_with_pr_string,
    rappor_parameters, rappor_with_pr_parameters). The "_pr_" variables
    refer to the version of the RAPPOR metric that uses very weak RAPPOR
    parameters but then adds Laplace noise at the end to effect differentially
    private release. So "_pr_" for "private release". The first three
    strings are of the form <var_name>=<json>, where |json| is a json
    string defining a data table. The |var_name|s are respectively
    USAGE_BY_MODULE_SC_JS_VAR_NAME, USAGE_BY_MODULE_JS_VAR_NAME, and
    USAGE_BY_MODULE_PR_JS_VAR_NAME.
    rappor_parameters is a json string containing values for k, h, m, p, q, f.
  """
  # straight-counting:
  # Parse the csv file once into (module, count) pairs.
  with file_util.openForReading(
      file_util.USAGE_BY_MODULE_CSV_FILE_NAME) as csvfile:
    module_counts = [(row[0], int(row[1])) for row in csv.reader(csvfile)]

  # |sc_rows| is the visualization data for the straight-counting pipeline.
  sc_rows = [{"module" : module, "count": count}
             for module, count in module_counts]
  # |actual_by_module| is used below to include the actual values along with
  # the RAPPOR estimates in the visualization of the Cobalt pipeline.
  actual_by_module = dict(module_counts)

  usage_by_module_sc_js = buildDataTableJs(
      data=sc_rows,
      var_name=USAGE_BY_MODULE_SC_JS_VAR_NAME,
      description={"module": ("string", "Module"),
                   "count": ("number", "Count")},
      columns_order=("module", "count"),
      order_by=("count", "desc"))

  # cobalt:
  # Here the CSV file is the output of the RAPPOR analyzer.
  (usage_by_module_cobalt_js,
   rappor_params_js) = buildUsageByModuleJsFromRapporOutput(
       actual_by_module,
       file_util.MODULE_NAME_ANALYZER_OUTPUT_FILE_NAME,
       USAGE_BY_MODULE_JS_VAR_NAME,
       USAGE_BY_MODULE_PARAMS_JS_VAR_NAME,
       file_util.RAPPOR_MODULE_NAME_CONFIG)

  # cobalt with differentially-private release:
  (usage_by_module_cobalt_with_pr_js,
   rappor_with_pr_params_js) = buildUsageByModuleJsFromRapporOutput(
       actual_by_module,
       file_util.MODULE_NAME_PR_ANALYZER_OUTPUT_FILE_NAME,
       USAGE_BY_MODULE_PR_JS_VAR_NAME,
       USAGE_BY_MODULE_PR_PARAMS_JS_VAR_NAME,
       file_util.RAPPOR_MODULE_NAME_PR_CONFIG)

  return (usage_by_module_sc_js, usage_by_module_cobalt_js,
          usage_by_module_cobalt_with_pr_js,
          rappor_params_js, rappor_with_pr_params_js)