Example #1
0
def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
  """Encrypts a line-delimited json file according to the given schema.

  The input is validated against the schema first. Every line of infile is
  then parsed as one json value, run through _ConvertJsonField together with
  the ciphers derived from master_key and table_id, and written to outfile as
  one json document per line.

  Arguments:
    schema: User defined values and types.
    master_key: Key from which the cipher and hash keys are generated.
    table_id: Combined with master_key so that keys are unique per table.
    infile: Path of the file to be encrypted.
    outfile: Path the encrypted file is written to.
  """
  probabilistic = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  # TODO(user): ciphers and hash should not use the same key.
  hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

  _ValidateJsonDataFile(schema, infile)
  with open(infile, 'rb') as source, open(outfile, 'wb') as sink:
    for raw_line in source:
      record = _StrToUnicode(json.loads(raw_line))
      encrypted_record = _ConvertJsonField(
          record, schema, probabilistic, pseudonym, hasher,
          homomorphic_int, homomorphic_float)
      sink.write(json.dumps(encrypted_record) + '\n')
def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
    """Encrypts data in a json file based on schema provided.

    The input is validated against the schema first. Every line of infile is
    then parsed as one json value, converted by _ConvertJsonField with the
    ciphers derived from master_key and table_id, and written to outfile as
    one json document per line.

    Arguments:
      schema: User defined values and types.
      master_key: Key to provide ciphers.
      table_id: Used to generate a unique key for each table.
      infile: File to be encrypted.
      outfile: Location the encrypted file is written to.
    """
    prob_cipher = ecrypto.ProbabilisticCipher(
        ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
    pseudonym_cipher = ecrypto.PseudonymCipher(
        ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
    # TODO(user): ciphers and hash should not use the same key.
    string_hasher = ecrypto.StringHash(
        ecrypto.GenerateStringHashKey(master_key, table_id))
    homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
    homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

    _ValidateJsonDataFile(schema, infile)
    with open(infile, 'rb') as in_file:
        with open(outfile, 'wb') as out_file:
            for line in in_file:
                data = json.loads(line)
                data = _StrToUnicode(data)
                rewritten_data = _ConvertJsonField(data, schema, prob_cipher,
                                                   pseudonym_cipher,
                                                   string_hasher,
                                                   homomorphic_int_cipher,
                                                   homomorphic_float_cipher)
                # Serialise with the json module rather than patching up the
                # Python repr of the object: the repr is not json (u'' string
                # prefixes, single quotes, True/False/None, \x escapes), and
                # textual quote substitution corrupts any value that itself
                # contains an apostrophe.
                out_file.write(json.dumps(rewritten_data) + '\n')
Example #3
0
 def setUp(self):
     """Runs before each test in the class; builds the cipher from _KEY1."""
     float_cipher = ecrypto.HomomorphicFloatCipher(_KEY1)
     self.cipher = float_cipher
def ConvertCsvDataFile(schema, master_key, table_id, infile, outfile):
    """Reads utf8 csv data, encrypts it and stores it in a new utf8 csv file.

    Each value of a row is transformed according to the 'encrypt' mode of the
    schema entry at the same position. A value whose mode (or, for
    'homomorphic', whose 'type') matches none of the handled cases produces
    no output column. 'probabilistic_searchwords' produces two output
    columns: the searchword hashes followed by the probabilistic ciphertext.

    Arguments:
      schema: Sequence of column descriptions; each needs 'encrypt' and,
        depending on the mode, 'type', 'name', 'searchwords_separator' and
        'max_word_sequence'.
      master_key: Key from which the cipher and hash keys are generated.
      table_id: Combined with master_key when generating the keys.
      infile: Path of the csv file to encrypt.
      outfile: Path the encrypted csv file is written to.

    Raises:
      EncryptConvertError: A row does not have exactly len(schema) fields.
    """
    prob_cipher = ecrypto.ProbabilisticCipher(
        ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
    pseudonym_cipher = ecrypto.PseudonymCipher(
        ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
    # TODO(user): ciphers and hash should not use the same key.
    string_hasher = ecrypto.StringHash(
        ecrypto.GenerateStringHashKey(master_key, table_id))
    homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
    homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

    def _HashSearchwords(column, value):
        """Returns the utf-8 encoded searchword hashes of value."""
        # A missing separator is passed on as None; the sequence length
        # falls back to 5 when the schema does not specify one.
        separator = column.get('searchwords_separator')
        sequence_len = column.get('max_word_sequence', 5)
        hashes = string_hasher.GetHashesForWordSubsequencesWithIv(
            util.SEARCHWORDS_PREFIX + column['name'],
            value,
            separator=separator,
            max_sequence_len=sequence_len)
        return hashes.encode('utf-8')

    with open(infile, 'rb') as in_file, open(outfile, 'wb') as out_file:
        num_columns = len(schema)
        csv_writer = csv.writer(out_file)
        _ValidateCsvDataFile(schema, infile)
        for row in _Utf8CsvReader(in_file, csv_writer):
            if len(row) != num_columns:
                raise EncryptConvertError(
                    'Number of fields in schema do not match '
                    'in row: %s' % row)
            encrypted_row = []
            for column, value in zip(schema, row):
                mode = column['encrypt']
                if mode == 'none':
                    encrypted_row.append(value.encode('utf-8'))
                elif mode == 'probabilistic':
                    encrypted_row.append(
                        prob_cipher.Encrypt(value).encode('utf-8'))
                elif mode == 'pseudonym':
                    encrypted_row.append(
                        pseudonym_cipher.Encrypt(value).encode('utf-8'))
                elif mode == 'homomorphic':
                    column_type = column['type']
                    if column_type == 'integer':
                        ciphertext = homomorphic_int_cipher.Encrypt(
                            long(value))
                        encrypted_row.append(ciphertext.encode('utf-8'))
                    elif column_type == 'float':
                        ciphertext = homomorphic_float_cipher.Encrypt(
                            float(value))
                        encrypted_row.append(ciphertext.encode('utf-8'))
                elif mode == 'searchwords':
                    encrypted_row.append(_HashSearchwords(column, value))
                elif mode == 'probabilistic_searchwords':
                    encrypted_row.append(_HashSearchwords(column, value))
                    encrypted_row.append(
                        prob_cipher.Encrypt(value).encode('utf-8'))
            csv_writer.writerow(encrypted_row)
def _DecryptRows(fields,
                 rows,
                 master_key,
                 table_id,
                 schema,
                 query_list,
                 aggregation_query_list,
                 unencrypted_query_list,
                 manifest=None):
    """Decrypts all values in rows.

    The 'name' entry of the elements of fields is rewritten in place: to the
    column name the manifest knows for the alias when a manifest is given,
    otherwise by turning util.PERIOD_REPLACEMENT back into '.'.

    Arguments:
      fields: Column descriptions; each has a 'name' and a 'type'.
      rows: Table values; rows[j][i] is the value of fields[i] in row j.
      master_key: Key to get ciphers.
      table_id: Used to generate keys.
      schema: Represents information about fields.
      query_list: List of fields that were queried.
      aggregation_query_list: List of aggregations of fields that were
        queried.
      unencrypted_query_list: List of unencrypted expressions.
      manifest: optional, query_lib.QueryManifest instance.

    Returns:
      A dictionary that holds, for each query, a list of decrypted values.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: A SEARCHWORDS encrypted
        field was queried (such fields cannot be decrypted), or GROUP_CONCAT
        was applied to a homomorphic field.
    """
    # Ciphers for decryption, keyed by the column name prefix they handle.
    ciphers = {
        util.PROBABILISTIC_PREFIX: ecrypto.ProbabilisticCipher(
            ecrypto.GenerateProbabilisticCipherKey(master_key, table_id)),
        util.PSEUDONYM_PREFIX: ecrypto.PseudonymCipher(
            ecrypto.GeneratePseudonymCipherKey(master_key, table_id)),
        util.HOMOMORPHIC_INT_PREFIX: ecrypto.HomomorphicIntCipher(
            ecrypto.GenerateHomomorphicCipherKey(master_key, table_id)),
        util.HOMOMORPHIC_FLOAT_PREFIX: ecrypto.HomomorphicFloatCipher(
            ecrypto.GenerateHomomorphicCipherKey(master_key, table_id)),
    }

    # Start every known query with an empty result list. A trailing
    # "AS <alias>" is dropped from the key.
    results = {}
    for query in query_list:
        tokens = query.split(' ')
        if len(tokens) >= 3 and tokens[-2] == 'AS':
            results[' '.join(tokens[:-2])] = []
        else:
            results[query] = []
    for query in aggregation_query_list:
        results[query] = []
    for position in xrange(len(unencrypted_query_list)):
        results['%s%d_' % (util.UNENCRYPTED_ALIAS_PREFIX, position)] = []

    # If a manifest is supplied rewrite the column names according to any
    # computed aliases that were used. Otherwise, resort to the old scheme
    # of substituting the '.' in multidimensional schemas in/out.
    if manifest is not None:
        for field in fields:
            # TODO(user): This is a hash lookup on every column name.
            # The lookup is efficient and the column names are sufficiently
            # random as compared to likely human language column names such
            # that false hits should not be possible. However this may need
            # future revision.
            column_name = manifest.GetColumnNameForAlias(field['name'])
            if column_name is not None:
                field['name'] = column_name
    else:
        for field in fields:
            field['name'] = field['name'].replace(
                util.PERIOD_REPLACEMENT, '.')

    for column, field in enumerate(fields):
        name = field['name']
        # Only the last component of a dotted name carries the prefix.
        leaf = name.split('.')[-1]
        if field['type'] == 'TIMESTAMP':
            results[name] = _GetTimestampValues(rows, column)
        elif leaf.startswith(util.PROBABILISTIC_PREFIX):
            results[name] = _DecryptValues(
                name, rows, column, ciphers, schema,
                util.PROBABILISTIC_PREFIX)
        elif leaf.startswith(util.PSEUDONYM_PREFIX):
            results[name] = _DecryptValues(
                name, rows, column, ciphers, schema, util.PSEUDONYM_PREFIX)
        elif leaf.startswith(util.SEARCHWORDS_PREFIX):
            raise bigquery_client.BigqueryInvalidQueryError(
                'Cannot decrypt searchwords encryption. Decryption of SEARCHWORDS '
                'is limited to PROBABILISTIC_SEARCHWORDS encryption.', None,
                None, None)
        elif leaf.startswith(util.HOMOMORPHIC_INT_PREFIX):
            results[name] = _DecryptValues(
                name, rows, column, ciphers, schema,
                util.HOMOMORPHIC_INT_PREFIX)
        elif leaf.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
            results[name] = _DecryptValues(
                name, rows, column, ciphers, schema,
                util.HOMOMORPHIC_FLOAT_PREFIX)
        elif (leaf.startswith(util.UNENCRYPTED_ALIAS_PREFIX)
              and leaf.endswith('_')):
            results[name] = _GetUnencryptedValuesWithType(
                rows, column, field['type'])
        elif leaf.startswith('f') and leaf.endswith('_'):
            # The text between the leading 'f' and the trailing '_' is the
            # index of the aggregation this column belongs to.
            aggregation = aggregation_query_list[int(name[1:-1])].strip()
            words = aggregation.split(' ')
            # Drop a trailing "WITHIN <scope>" to get the bare expression.
            if len(words) >= 3 and words[-2].lower() == 'within':
                expression = ' '.join(words[:-2])
            else:
                expression = aggregation
            if aggregation.startswith(util.GROUP_CONCAT_PREFIX):
                concat_field = expression.split(
                    util.GROUP_CONCAT_PREFIX)[1][:-1].strip()
                concat_leaf = concat_field.split('.')[-1]
                if concat_leaf.startswith(util.PROBABILISTIC_PREFIX):
                    results[aggregation] = _DecryptGroupConcatValues(
                        aggregation, rows, column, ciphers, schema,
                        util.PROBABILISTIC_PREFIX)
                elif concat_leaf.startswith(util.PSEUDONYM_PREFIX):
                    results[aggregation] = _DecryptGroupConcatValues(
                        aggregation, rows, column, ciphers, schema,
                        util.PSEUDONYM_PREFIX)
                elif concat_leaf.startswith(
                        (util.HOMOMORPHIC_INT_PREFIX,
                         util.HOMOMORPHIC_FLOAT_PREFIX)):
                    raise bigquery_client.BigqueryInvalidQueryError(
                        'GROUP_CONCAT only accepts string type.', None, None,
                        None)
                elif concat_leaf.startswith(util.SEARCHWORDS_PREFIX):
                    raise bigquery_client.BigqueryInvalidQueryError(
                        'Invalid query, cannot recover searchwords encryption.',
                        None, None, None)
                else:
                    # Unencrypted values are copied through unchanged.
                    for row in rows:
                        results[aggregation].append(row[column])
            elif aggregation.startswith(('COUNT(', 'AVG(', 'SUM(')):
                results[aggregation] = _GetUnencryptedValuesWithType(
                    rows, column, field['type'])
            elif aggregation.startswith('TOP('):
                top_arguments = expression.split('TOP(')[1][:-1].strip()
                top_field = top_arguments.split(',')[0].strip()
                if top_field.split('.')[-1].startswith(util.PSEUDONYM_PREFIX):
                    results[aggregation] = _DecryptValues(
                        top_field, rows, column, ciphers, schema,
                        util.PSEUDONYM_PREFIX)
                else:
                    results[aggregation] = _GetUnencryptedValues(
                        aggregation, rows, column, schema)
            elif aggregation.startswith(util.PAILLIER_SUM_PREFIX):
                summed_field = aggregation.split(
                    util.PAILLIER_SUM_PREFIX)[1].split(',')[0][:-1]
                summed_leaf = summed_field.split('.')[-1]
                if summed_leaf.startswith(util.HOMOMORPHIC_INT_PREFIX):
                    results[aggregation] = _DecryptValues(
                        summed_field, rows, column, ciphers, schema,
                        util.HOMOMORPHIC_INT_PREFIX)
                elif summed_leaf.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
                    results[aggregation] = _DecryptValues(
                        summed_field, rows, column, ciphers, schema,
                        util.HOMOMORPHIC_FLOAT_PREFIX)
            else:
                results[name] = _GetUnencryptedValuesWithType(
                    rows, column, field['type'])
        else:
            results[name] = _GetUnencryptedValuesWithType(
                rows, column, field['type'])

    return results
def _DecryptRows(fields, rows, master_key, table_id, schema, query_list,
                 aggregation_query_list, unencrypted_query_list):
    """Decrypts all values in rows.

    The 'name' entry of each element of fields is rewritten in place as it
    is processed: util.PERIOD_REPLACEMENT is turned back into '.'.

    Arguments:
      fields: Column descriptions; each has a 'name' and a 'type'.
      rows: Table values; rows[j][i] is the value of fields[i] in row j.
      master_key: Key to get ciphers.
      table_id: Used to generate keys.
      schema: Represents information about fields.
      query_list: List of fields that were queried.
      aggregation_query_list: List of aggregations of fields that were
        queried.
      unencrypted_query_list: List of unencrypted expressions.

    Returns:
      A dictionary that holds, for each query, a list of decrypted values.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: A SEARCHWORDS encrypted
        field was queried (such fields cannot be decrypted), or GROUP_CONCAT
        was applied to a homomorphic field.
    """
    # Ciphers for decryption, keyed by the column name prefix they handle.
    ciphers = {
        util.PROBABILISTIC_PREFIX: ecrypto.ProbabilisticCipher(
            ecrypto.GenerateProbabilisticCipherKey(master_key, table_id)),
        util.PSEUDONYM_PREFIX: ecrypto.PseudonymCipher(
            ecrypto.GeneratePseudonymCipherKey(master_key, table_id)),
        util.HOMOMORPHIC_INT_PREFIX: ecrypto.HomomorphicIntCipher(
            ecrypto.GenerateHomomorphicCipherKey(master_key, table_id)),
        util.HOMOMORPHIC_FLOAT_PREFIX: ecrypto.HomomorphicFloatCipher(
            ecrypto.GenerateHomomorphicCipherKey(master_key, table_id)),
    }

    # Start every known query with an empty result list. A trailing
    # "AS <alias>" is dropped from the key.
    results = {}
    for query in query_list:
        tokens = query.split(' ')
        if len(tokens) >= 3 and tokens[-2] == 'AS':
            results[' '.join(tokens[:-2])] = []
        else:
            results[query] = []
    for query in aggregation_query_list:
        results[query] = []
    for position in xrange(len(unencrypted_query_list)):
        results['%s%d_' % (util.UNENCRYPTED_ALIAS_PREFIX, position)] = []

    for column, field in enumerate(fields):
        # Restore the '.' of multidimensional schema names.
        name = field['name'].replace(util.PERIOD_REPLACEMENT, '.')
        field['name'] = name
        # Only the last component of a dotted name carries the prefix.
        leaf = name.split('.')[-1]
        if field['type'] == 'TIMESTAMP':
            results[name] = _GetTimestampValues(rows, column)
        elif leaf.startswith(util.PROBABILISTIC_PREFIX):
            results[name] = _DecryptValues(
                name, rows, column, ciphers, schema,
                util.PROBABILISTIC_PREFIX)
        elif leaf.startswith(util.PSEUDONYM_PREFIX):
            results[name] = _DecryptValues(
                name, rows, column, ciphers, schema, util.PSEUDONYM_PREFIX)
        elif leaf.startswith(util.SEARCHWORDS_PREFIX):
            raise bigquery_client.BigqueryInvalidQueryError(
                'Cannot decrypt searchwords encryption. Decryption of SEARCHWORDS '
                'is limited to PROBABILISTIC_SEARCHWORDS encryption.', None,
                None, None)
        elif leaf.startswith(util.HOMOMORPHIC_INT_PREFIX):
            results[name] = _DecryptValues(
                name, rows, column, ciphers, schema,
                util.HOMOMORPHIC_INT_PREFIX)
        elif leaf.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
            results[name] = _DecryptValues(
                name, rows, column, ciphers, schema,
                util.HOMOMORPHIC_FLOAT_PREFIX)
        elif (leaf.startswith(util.UNENCRYPTED_ALIAS_PREFIX)
              and leaf.endswith('_')):
            results[name] = _GetUnencryptedValuesWithType(
                rows, column, field['type'])
        elif leaf.startswith('f') and leaf.endswith('_'):
            # The text between the leading 'f' and the trailing '_' is the
            # index of the aggregation this column belongs to.
            aggregation = aggregation_query_list[int(name[1:-1])].strip()
            words = aggregation.split(' ')
            # Drop a trailing "WITHIN <scope>" to get the bare expression.
            if len(words) >= 3 and words[-2].lower() == 'within':
                expression = ' '.join(words[:-2])
            else:
                expression = aggregation
            if aggregation.startswith(util.GROUP_CONCAT_PREFIX):
                concat_field = expression.split(
                    util.GROUP_CONCAT_PREFIX)[1][:-1].strip()
                concat_leaf = concat_field.split('.')[-1]
                if concat_leaf.startswith(util.PROBABILISTIC_PREFIX):
                    results[aggregation] = _DecryptGroupConcatValues(
                        aggregation, rows, column, ciphers, schema,
                        util.PROBABILISTIC_PREFIX)
                elif concat_leaf.startswith(util.PSEUDONYM_PREFIX):
                    results[aggregation] = _DecryptGroupConcatValues(
                        aggregation, rows, column, ciphers, schema,
                        util.PSEUDONYM_PREFIX)
                elif concat_leaf.startswith(
                        (util.HOMOMORPHIC_INT_PREFIX,
                         util.HOMOMORPHIC_FLOAT_PREFIX)):
                    raise bigquery_client.BigqueryInvalidQueryError(
                        'GROUP_CONCAT only accepts string type.', None, None,
                        None)
                elif concat_leaf.startswith(util.SEARCHWORDS_PREFIX):
                    raise bigquery_client.BigqueryInvalidQueryError(
                        'Invalid query, cannot recover searchwords encryption.',
                        None, None, None)
                else:
                    # Unencrypted values are copied through unchanged.
                    for row in rows:
                        results[aggregation].append(row[column])
            elif aggregation.startswith(('COUNT(', 'AVG(', 'SUM(')):
                results[aggregation] = _GetUnencryptedValuesWithType(
                    rows, column, field['type'])
            elif aggregation.startswith('TOP('):
                top_arguments = expression.split('TOP(')[1][:-1].strip()
                top_field = top_arguments.split(',')[0].strip()
                if top_field.split('.')[-1].startswith(util.PSEUDONYM_PREFIX):
                    results[aggregation] = _DecryptValues(
                        top_field, rows, column, ciphers, schema,
                        util.PSEUDONYM_PREFIX)
                else:
                    results[aggregation] = _GetUnencryptedValues(
                        aggregation, rows, column, schema)
            elif aggregation.startswith(util.PAILLIER_SUM_PREFIX):
                summed_field = aggregation.split(
                    util.PAILLIER_SUM_PREFIX)[1].split(',')[0][:-1]
                summed_leaf = summed_field.split('.')[-1]
                if summed_leaf.startswith(util.HOMOMORPHIC_INT_PREFIX):
                    results[aggregation] = _DecryptValues(
                        summed_field, rows, column, ciphers, schema,
                        util.HOMOMORPHIC_INT_PREFIX)
                elif summed_leaf.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
                    results[aggregation] = _DecryptValues(
                        summed_field, rows, column, ciphers, schema,
                        util.HOMOMORPHIC_FLOAT_PREFIX)
            else:
                results[name] = _GetUnencryptedValuesWithType(
                    rows, column, field['type'])
        else:
            results[name] = _GetUnencryptedValuesWithType(
                rows, column, field['type'])

    return results