Example #1
0
def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
  """Encrypts newline-delimited json records according to a schema.

  The input is validated against the schema first. Each input line is then
  parsed as json, passed through _ConvertJsonField and written back out as
  a single json line.

  Arguments:
    schema: User defined values and types.
    master_key: Key from which every cipher and hash key is generated.
    table_id: Combined with master_key when generating the keys.
    infile: Path of the file to be encrypted.
    outfile: Path the encrypted file is written to.
  """
  # TODO(user): ciphers and hash should not use the same key.
  probabilistic = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

  _ValidateJsonDataFile(schema, infile)
  with open(infile, 'rb') as source, open(outfile, 'wb') as sink:
    for raw_record in source:
      record = _StrToUnicode(json.loads(raw_record))
      encrypted_record = _ConvertJsonField(
          record, schema, probabilistic, pseudonym, hasher,
          homomorphic_int, homomorphic_float)
      sink.write(json.dumps(encrypted_record) + '\n')
Example #2
0
 def testDecryptGroupConcatValues(self):
     """Exercises _DecryptGroupConcatValues on valid and invalid queries."""
     cars_schema = test_util.GetCarsSchema()
     jobs_schema = test_util.GetJobsSchema()
     master_key = test_util.GetMasterKey()
     unencrypted_values = [['A', 'B', 'C', 'D'], ['1', '2', '3', '4'],
                           ['Hello', 'Bye']]

     # Probabilistic column: an all-null first row, then one row per group
     # with a second column holding a random number.
     cipher = ecrypto.ProbabilisticCipher(master_key)
     ciphers = {util.PROBABILISTIC_PREFIX: cipher}
     table = [[None, None]]
     for values in unencrypted_values:
         encrypted = [cipher.Encrypt(unicode(token)) for token in values]
         table.append([','.join(encrypted), random.random()])
     query = 'GROUP_CONCAT(%sModel)' % util.PROBABILISTIC_PREFIX
     column = encrypted_bigquery_client._DecryptGroupConcatValues(
         query, table, 0, ciphers, cars_schema, util.PROBABILISTIC_PREFIX)
     self.assertEqual(column, [
         util.LiteralToken('null', None),
         util.StringLiteralToken('"A,B,C,D"'),
         util.StringLiteralToken('"1,2,3,4"'),
         util.StringLiteralToken('"Hello,Bye"'),
     ])

     # Pseudonym column nested in a repeated record, single-column table.
     cipher = ecrypto.PseudonymCipher(master_key)
     ciphers = {util.PSEUDONYM_PREFIX: cipher}
     table = [
         [','.join([cipher.Encrypt(unicode(token)) for token in values])]
         for values in unencrypted_values
     ]
     query = (
         'GROUP_CONCAT(citiesLived.job.%sposition) within citiesLived.job' %
         util.PSEUDONYM_PREFIX)
     column = encrypted_bigquery_client._DecryptGroupConcatValues(
         query, table, 0, ciphers, jobs_schema, util.PSEUDONYM_PREFIX)
     self.assertEqual(column, [
         util.StringLiteralToken('"A,B,C,D"'),
         util.StringLiteralToken('"1,2,3,4"'),
         util.StringLiteralToken('"Hello,Bye"'),
     ])

     # A query that is not a GROUP_CONCAT expression is rejected.
     query = '%sModel' % util.PROBABILISTIC_PREFIX
     self.assertRaises(
         ValueError, encrypted_bigquery_client._DecryptGroupConcatValues,
         query, table, 0, ciphers, cars_schema, util.PROBABILISTIC_PREFIX)

     # GROUP_CONCAT over a homomorphic float field is an invalid query.
     query = (
         'GROUP_CONCAT(citiesLived.%snumberOfYears) within citiesLived' %
         util.HOMOMORPHIC_FLOAT_PREFIX)
     self.assertRaises(
         bigquery_client.BigqueryInvalidQueryError,
         encrypted_bigquery_client._DecryptGroupConcatValues,
         query, table, 0, ciphers, jobs_schema,
         util.HOMOMORPHIC_FLOAT_PREFIX)
    def Query(self, query, **kwds):
        """Rewrite a query for the encrypted table and execute it.

        When the query has a FROM clause, the master key is checked against
        the key hash stored with the table and the original schema is
        recovered from the encrypted copy kept with the table. The query is
        then rewritten and handed to the superclass.

        Arguments:
          query: Query to execute.
          **kwds: Passed on to the superclass Query.

        Returns:
          The job returned by the superclass Query for the rewritten query.

        Raises:
          bigquery_client.BigqueryInvalidQueryError: the query cannot be
            parsed.
          bigquery_client.BigqueryAccessDeniedError: the master key does not
            match the key hash stored for the table.
          bigquery_client.BigqueryNotFoundError: the table version differs
            from util.EBQ_TABLE_VERSION.
        """
        self._CheckKeyfileFlag()
        master_key = load_lib.ReadMasterKeyFile(self.master_key_filename)

        try:
            clauses = parser.ParseQuery(query)
        except ParseException as parse_error:
            raise bigquery_client.BigqueryInvalidQueryError(
                parse_error, None, None, None)

        # Defaults for queries without a FROM clause.
        table_id = None
        orig_schema = []
        if clauses['FROM']:
            source_table = clauses['FROM'][0]
            table_id = '%s_%s' % (
                source_table, self._GetTableCreationTime(source_table))
            stored_key_hash, table_version, stored_schema = (
                self._GetEBQTableInfo(source_table))
            # pylint: disable=too-many-function-args
            key_hash = base64.b64encode(hashlib.sha1(master_key).digest())
            if key_hash != stored_key_hash:
                raise bigquery_client.BigqueryAccessDeniedError(
                    'Invalid master key for this table.', None, None, None)
            if table_version != util.EBQ_TABLE_VERSION:
                raise bigquery_client.BigqueryNotFoundError(
                    'Invalid table version.', None, None, None)
            # Stored schema is base64(encrypt(zlib(utf-8 json))); undo each
            # layer in reverse.
            schema_cipher = ecrypto.ProbabilisticCipher(master_key)
            compressed_schema = schema_cipher.Decrypt(
                base64.b64decode(stored_schema), raw=True)
            schema_json = zlib.decompress(compressed_schema).decode('utf-8')
            orig_schema = json.loads(schema_json)

        manifest = query_lib.QueryManifest.Generate()
        rewritten_query, print_args = query_lib.RewriteQuery(
            clauses, orig_schema, master_key, table_id, manifest)
        job = super(EncryptedBigqueryClient, self).Query(
            rewritten_query, **kwds)
        self._LoadJobStatistics(manifest, job)

        # Install a printer configured with the arguments from the rewrite.
        bq.Factory.ClientTablePrinter.SetTablePrinter(
            EncryptedTablePrinter(**print_args))

        return job
    def CreateTable(self,
                    reference,
                    ignore_existing=False,
                    schema=None,
                    description=None,
                    friendly_name=None,
                    expiration=None):
        """Create an encrypted table corresponding to TableReference.

        The schema file is read and the table is created with the rewritten
        schema. The table description is extended with a hash of the master
        key, the table version and an encrypted copy of the schema as read.

        Arguments:
          reference: the TableReference to create.
          ignore_existing: (boolean, default False) passed through to the
            superclass CreateTable.
          schema: a required schema file (also requires a master key).
          description: an optional table description.
          friendly_name: an optional friendly name for the table.
          expiration: optional expiration time in milliseconds since the
            epoch.

        Raises:
          bigquery_client.BigqueryNotFoundError: if schema is None.
        """
        if schema is None:
            raise bigquery_client.BigqueryNotFoundError(
                'A schema must be specified when making a table.', None, None,
                None)
        self._CheckKeyfileFlag()
        schema = load_lib.ReadSchemaFile(schema)
        master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
        # pylint: disable=too-many-function-args
        key_digest = hashlib.sha1(master_key).digest()
        hashed_key = base64.b64encode(key_digest)
        schema_cipher = ecrypto.ProbabilisticCipher(master_key)
        # Stored form of the schema: base64(encrypt(zlib(utf-8 json))).
        compressed_schema = zlib.compress(json.dumps(schema).encode('utf-8'))
        encrypted_schema = base64.b64encode(
            schema_cipher.Encrypt(compressed_schema))
        new_description = util.ConstructTableDescription(
            '' if description is None else description, hashed_key,
            util.EBQ_TABLE_VERSION, encrypted_schema)
        rewritten_schema = load_lib.RewriteSchema(schema)
        super(EncryptedBigqueryClient, self).CreateTable(
            reference, ignore_existing, rewritten_schema, new_description,
            friendly_name, expiration)
    def UpdateTable(self,
                    reference,
                    schema=None,
                    description=None,
                    friendly_name=None,
                    expiration=None):
        """Updates a table.

        When a description is given it is recombined with the key hash,
        version and encrypted schema already stored for the table. If a
        schema is also given, the master key is verified and the stored
        encrypted schema is replaced.

        Arguments:
          reference: the DatasetReference to update.
          schema: an optional schema.
          description: an optional table description.
          friendly_name: an optional friendly name for the table.
          expiration: optional expiration time in milliseconds since the
            epoch.

        Raises:
          bigquery_client.BigqueryAccessDeniedError: a schema and description
            are given and the master key does not match the key hash stored
            for the table.
        """
        if schema:
            self._CheckKeyfileFlag()

        if description:
            hashed_table_key, table_version, table_schema = (
                self._GetEBQTableInfo(str(reference)))
            if schema:
                master_key = load_lib.ReadMasterKeyFile(
                    self.master_key_filename)
                # pylint: disable=too-many-function-args
                hashed_key = base64.b64encode(
                    hashlib.sha1(master_key).digest())
                if hashed_key != hashed_table_key:
                    raise bigquery_client.BigqueryAccessDeniedError(
                        'Invalid master key for this table.', None, None, None)
                cipher = ecrypto.ProbabilisticCipher(master_key)
                # NOTE(review): CreateTable stores the schema as read, not
                # the output of RewriteSchema -- confirm which form readers
                # of the stored schema expect.
                real_schema = json.dumps(load_lib.RewriteSchema(schema))
                # Encode the serialized schema itself. The previous
                # `str.encode('utf-8')` encoded the literal 'utf-8' and
                # discarded the schema.
                real_schema = real_schema.encode('utf-8')
                table_schema = base64.b64encode(
                    cipher.Encrypt(zlib.compress(real_schema)))
            description = util.ConstructTableDescription(
                description, hashed_table_key, table_version, table_schema)

        # Rewrite the schema if the schema is to be updated.
        if schema:
            schema = load_lib.RewriteSchema(schema)

        super(EncryptedBigqueryClient,
              self).UpdateTable(reference, schema, description, friendly_name,
                                expiration)
def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
    """Encrypts data in a json file based on schema provided.

    The input is validated against the schema first. Each input line is then
    parsed as json, converted by _ConvertJsonField and written to outfile as
    one json line.

    Arguments:
      schema: User defined values and types.
      master_key: Key to provide ciphers.
      table_id: Used to generate a unique key for each table.
      infile: File to be encrypted.
      outfile: Location the encrypted file is written to.
    """
    prob_cipher = ecrypto.ProbabilisticCipher(
        ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
    pseudonym_cipher = ecrypto.PseudonymCipher(
        ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
    # TODO(user): ciphers and hash should not use the same key.
    string_hasher = ecrypto.StringHash(
        ecrypto.GenerateStringHashKey(master_key, table_id))
    homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
    homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

    _ValidateJsonDataFile(schema, infile)
    with open(infile, 'rb') as in_file:
        with open(outfile, 'wb') as out_file:
            for line in in_file:
                data = json.loads(line)
                data = _StrToUnicode(data)
                rewritten_data = _ConvertJsonField(data, schema, prob_cipher,
                                                   pseudonym_cipher,
                                                   string_hasher,
                                                   homomorphic_int_cipher,
                                                   homomorphic_float_cipher)
                # Serialize with json.dumps rather than rewriting the quotes
                # of str(dict): that approach corrupted values containing an
                # apostrophe or the sequence u' and emitted Python literals
                # (None, True, False, 5L) that are not valid json.
                out_file.write(json.dumps(rewritten_data) + '\n')
Example #7
0
 def setUp(self):
     """Create a ProbabilisticCipher keyed with _KEY1 before each test."""
     self.cipher = ecrypto.ProbabilisticCipher(_KEY1)
def ConvertCsvDataFile(schema, master_key, table_id, infile, outfile):
    """Encrypts a utf8 csv file column by column into a new utf8 csv file.

    Each row must have exactly one value per schema entry. The 'encrypt'
    setting of the schema entry selects how the value is transformed; a
    'probabilistic_searchwords' column produces two output values (the
    searchwords hashes followed by the probabilistic ciphertext).

    Arguments:
      schema: Sequence of column descriptions with 'name', 'type' and
        'encrypt' entries.
      master_key: Key from which every cipher and hash key is generated.
      table_id: Combined with master_key when generating the keys.
      infile: Path of the csv file to encrypt.
      outfile: Path the encrypted csv file is written to.

    Raises:
      EncryptConvertError: a row's length differs from the schema's.
    """
    # TODO(user): ciphers and hash should not use the same key.
    probabilistic = ecrypto.ProbabilisticCipher(
        ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
    pseudonym = ecrypto.PseudonymCipher(
        ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
    hasher = ecrypto.StringHash(
        ecrypto.GenerateStringHashKey(master_key, table_id))
    homomorphic_int = ecrypto.HomomorphicIntCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
    homomorphic_float = ecrypto.HomomorphicFloatCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

    with open(infile, 'rb') as source, open(outfile, 'wb') as sink:
        column_count = len(schema)
        writer = csv.writer(sink)
        _ValidateCsvDataFile(schema, infile)
        for record in _Utf8CsvReader(source, writer):
            if len(record) != column_count:
                raise EncryptConvertError(
                    'Number of fields in schema do not match '
                    'in row: %s' % record)
            encrypted_record = []
            for column, value in zip(schema, record):
                mode = column['encrypt']
                if mode == 'none':
                    encrypted_record.append(value.encode('utf-8'))
                elif mode == 'probabilistic':
                    encrypted_record.append(
                        probabilistic.Encrypt(value).encode('utf-8'))
                elif mode == 'pseudonym':
                    encrypted_record.append(
                        pseudonym.Encrypt(value).encode('utf-8'))
                elif mode == 'homomorphic':
                    # Any other type contributes no value to the output row.
                    if column['type'] == 'integer':
                        encrypted_record.append(
                            homomorphic_int.Encrypt(
                                long(value)).encode('utf-8'))
                    elif column['type'] == 'float':
                        encrypted_record.append(
                            homomorphic_float.Encrypt(
                                float(value)).encode('utf-8'))
                elif mode in ('searchwords', 'probabilistic_searchwords'):
                    hashes = hasher.GetHashesForWordSubsequencesWithIv(
                        util.SEARCHWORDS_PREFIX + column['name'],
                        value,
                        separator=column.get('searchwords_separator'),
                        max_sequence_len=column.get('max_word_sequence', 5))
                    encrypted_record.append(hashes.encode('utf-8'))
                    if mode == 'probabilistic_searchwords':
                        encrypted_record.append(
                            probabilistic.Encrypt(value).encode('utf-8'))
            writer.writerow(encrypted_record)
def _DecryptRows(fields,
                 rows,
                 master_key,
                 table_id,
                 schema,
                 query_list,
                 aggregation_query_list,
                 unencrypted_query_list,
                 manifest=None):
    """Decrypts all values in rows.

    Each column is handled according to its type and the prefix of the last
    dotted component of its name. Note that the 'name' entries of fields are
    rewritten in place before the columns are processed.

    Arguments:
      fields: Column descriptions; each has a 'name' and a 'type' entry.
      rows: Table values.
      master_key: Key to get ciphers.
      table_id: Used to generate keys.
      schema: Represents information about fields.
      query_list: List of fields that were queried.
      aggregation_query_list: List of aggregations of fields that were
        queried.
      unencrypted_query_list: List of unencrypted expressions.
      manifest: optional, query_lib.QueryManifest instance.

    Returns:
      A dictionary that returns for each query, a list of decrypted values.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: User trying to query for a
        SEARCHWORD encrypted field (SEARCHWORD encrypted fields cannot be
        decrypted), or GROUP_CONCAT applied to a homomorphic field.
    """
    # create ciphers for decryption
    prob_cipher = ecrypto.ProbabilisticCipher(
        ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
    pseudonym_cipher = ecrypto.PseudonymCipher(
        ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
    homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
    homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

    # Ciphers keyed by the column-name prefix that marks their encryption.
    ciphers = {
        util.PROBABILISTIC_PREFIX: prob_cipher,
        util.PSEUDONYM_PREFIX: pseudonym_cipher,
        util.HOMOMORPHIC_INT_PREFIX: homomorphic_int_cipher,
        util.HOMOMORPHIC_FLOAT_PREFIX: homomorphic_float_cipher,
    }

    # Start every queried expression with an empty result list. For a query
    # of the form '<expr> AS <alias>' only '<expr>' is used as the key.
    queried_values = {}
    for query in query_list:
        if len(query.split(' ')) >= 3 and query.split(' ')[-2] == 'AS':
            queried_values[' '.join(query.split(' ')[:-2])] = []
        else:
            queried_values[query] = []
    for query in aggregation_query_list:
        queried_values[query] = []
    # Unencrypted expressions are keyed by a generated alias built from
    # their position in the list.
    for i in xrange(len(unencrypted_query_list)):
        queried_values['%s%d_' % (util.UNENCRYPTED_ALIAS_PREFIX, i)] = []

    # If a manifest is supplied rewrite the column names according to any
    # computed aliases that were used. Otherwise, resort to the old scheme
    # of substituting the '.' in multidimensional schemas in/out.
    if manifest is not None:
        for i in xrange(len(fields)):
            # TODO(user): This is a hash lookup on every column name.
            # The lookup is efficient and the column names are sufficiently random
            # as compared to likely human language column names such that false
            # hits should not be possible. However this may need future revision.
            n = manifest.GetColumnNameForAlias(fields[i]['name'])
            if n is not None:
                fields[i]['name'] = n
    else:
        for i in xrange(len(fields)):
            fields[i]['name'] = fields[i]['name'].replace(
                util.PERIOD_REPLACEMENT, '.')

    # Dispatch each column. The branch order matters: the first matching
    # condition wins.
    for i in xrange(len(fields)):
        # Only the last dotted component of the name carries the prefix.
        encrypted_name = fields[i]['name'].split('.')[-1]
        if fields[i]['type'] == 'TIMESTAMP':
            queried_values[fields[i]['name']] = _GetTimestampValues(rows, i)
        elif encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
            queried_values[fields[i]['name']] = (_DecryptValues(
                fields[i]['name'], rows, i, ciphers, schema,
                util.PROBABILISTIC_PREFIX))
        elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
            queried_values[fields[i]['name']] = (_DecryptValues(
                fields[i]['name'], rows, i, ciphers, schema,
                util.PSEUDONYM_PREFIX))
        elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
            raise bigquery_client.BigqueryInvalidQueryError(
                'Cannot decrypt searchwords encryption. Decryption of SEARCHWORDS '
                'is limited to PROBABILISTIC_SEARCHWORDS encryption.', None,
                None, None)
        elif encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX):
            queried_values[fields[i]['name']] = (_DecryptValues(
                fields[i]['name'], rows, i, ciphers, schema,
                util.HOMOMORPHIC_INT_PREFIX))
        elif encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
            queried_values[fields[i]['name']] = (_DecryptValues(
                fields[i]['name'], rows, i, ciphers, schema,
                util.HOMOMORPHIC_FLOAT_PREFIX))
        elif (encrypted_name.startswith(util.UNENCRYPTED_ALIAS_PREFIX)
              and encrypted_name.endswith('_')):
            queried_values[fields[i]['name']] = (_GetUnencryptedValuesWithType(
                rows, i, fields[i]['type']))
        elif encrypted_name.startswith('f') and encrypted_name.endswith('_'):
            # Column named 'f<index>_': the index selects the matching
            # expression in aggregation_query_list.
            # NOTE(review): the index is sliced from the full column name,
            # not from encrypted_name, and int() raises ValueError when the
            # characters between 'f' and '_' are not numeric -- confirm no
            # other column names can reach this branch.
            index = int(fields[i]['name'][1:-1])
            original_fieldname = aggregation_query_list[index]
            original_fieldname = original_fieldname.strip()
            # Drop a trailing 'within <scope>' clause, if present.
            if (len(original_fieldname.split(' ')) >= 3
                    and original_fieldname.split(' ')[-2].lower() == 'within'):
                actual_field = original_fieldname.split(' ')[:-2]
                actual_field = ' '.join(actual_field)
            else:
                actual_field = original_fieldname
            if original_fieldname.startswith(util.GROUP_CONCAT_PREFIX):
                # Text after the prefix, minus its final character
                # (presumably the closing parenthesis), is the argument.
                concat_field = actual_field.split(
                    util.GROUP_CONCAT_PREFIX)[1][:-1].strip()
                encrypted_name = concat_field.split('.')[-1]
                if encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
                    queried_values[original_fieldname] = (
                        _DecryptGroupConcatValues(original_fieldname, rows, i,
                                                  ciphers, schema,
                                                  util.PROBABILISTIC_PREFIX))
                elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
                    queried_values[original_fieldname] = (
                        _DecryptGroupConcatValues(original_fieldname, rows, i,
                                                  ciphers, schema,
                                                  util.PSEUDONYM_PREFIX))
                elif (encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX)
                      or encrypted_name.startswith(
                          util.HOMOMORPHIC_FLOAT_PREFIX)):
                    raise bigquery_client.BigqueryInvalidQueryError(
                        'GROUP_CONCAT only accepts string type.', None, None,
                        None)
                elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
                    raise bigquery_client.BigqueryInvalidQueryError(
                        'Invalid query, cannot recover searchwords encryption.',
                        None, None, None)
                else:
                    # No recognized prefix: copy the raw cell values. This
                    # appends to the list created above, so the stripped
                    # expression must already be a key of queried_values.
                    for j in xrange(len(rows)):
                        queried_values[original_fieldname].append(rows[j][i])
            elif (original_fieldname.startswith('COUNT(')
                  or original_fieldname.startswith('AVG(')
                  or original_fieldname.startswith('SUM(')):
                queried_values[original_fieldname] = (
                    _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
            elif original_fieldname.startswith('TOP('):
                # First comma-separated argument of TOP(...) is the field.
                fieldname = actual_field.split('TOP(')[1][:-1].strip()
                fieldname = fieldname.split(',')[0].strip()
                if fieldname.split('.')[-1].startswith(util.PSEUDONYM_PREFIX):
                    queried_values[original_fieldname] = (_DecryptValues(
                        fieldname, rows, i, ciphers, schema,
                        util.PSEUDONYM_PREFIX))
                else:
                    queried_values[original_fieldname] = (
                        _GetUnencryptedValues(original_fieldname, rows, i,
                                              schema))
            elif original_fieldname.startswith(util.PAILLIER_SUM_PREFIX):
                # sum_argument is the last dotted component of the first
                # argument; real_fieldname is that argument with its dotted
                # path kept.
                sum_argument = original_fieldname.split(
                    util.PAILLIER_SUM_PREFIX)[1]
                sum_argument = sum_argument.split(',')[0][:-1]
                sum_argument = sum_argument.split('.')[-1]
                real_fieldname = original_fieldname.split(
                    util.PAILLIER_SUM_PREFIX)[1]
                real_fieldname = real_fieldname.split(',')[0][:-1]
                # An argument with neither homomorphic prefix stores nothing
                # for this column.
                if sum_argument.startswith(util.HOMOMORPHIC_INT_PREFIX):
                    queried_values[original_fieldname] = (_DecryptValues(
                        real_fieldname, rows, i, ciphers, schema,
                        util.HOMOMORPHIC_INT_PREFIX))
                elif sum_argument.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
                    queried_values[original_fieldname] = (_DecryptValues(
                        real_fieldname, rows, i, ciphers, schema,
                        util.HOMOMORPHIC_FLOAT_PREFIX))
            else:
                queried_values[fields[i]['name']] = (
                    _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
        else:
            # No recognized prefix or alias pattern: return the values as
            # they are, converted according to the column type.
            queried_values[fields[i]['name']] = (_GetUnencryptedValuesWithType(
                rows, i, fields[i]['type']))

    return queried_values
    def Load(self, destination_table, source, schema=None, **kwds):
        """Encrypt the given data and then load it into BigQuery.

        The job will execute synchronously if sync=True is provided as an
        argument.

        The encrypted data and rewritten schema are written to a temporary
        directory. That directory is removed once the load call returns, and
        also when any step after its creation raises.

        Args:
          destination_table: TableReference to load data into.
          source: String specifying source data to load.
          schema: The schema that defines fields to be loaded.
          **kwds: Passed on to the superclass Load. 'source_format' selects
            the converter; a missing or empty value is treated as CSV.

        Returns:
          The resulting job info.

        Raises:
          bigquery_client.BigqueryAccessDeniedError: the master key does not
            match the key hash stored for the table, or the given schema
            differs from the schema stored for the table.
          bigquery_client.BigqueryNotFoundError: the table version differs
            from util.EBQ_TABLE_VERSION.
          app.UsageError: source_format is neither NEWLINE_DELIMITED_JSON
            nor CSV.
          OSError: the temporary directory could not be removed after a
            successful load.
        """
        self._CheckKeyfileFlag()
        self._CheckSchemaFile(schema)

        # To make encrypting more secure, we use different keys for each table
        # and cipher. To generate a different key for each table, we need a distinct
        # table identifier for each table. A table name is not secure since a table
        # can be deleted and created with the same name and, thus the same key. The
        # only distinct identifier happens to be creation time. Therefore, we must
        # construct a table if it does not exist so we can use the creation time
        # to encrypt values.
        try:
            self.CreateTable(destination_table, schema=schema)
        except bigquery_client.BigqueryDuplicateError:
            pass  # Table already exists.

        temp_dir = tempfile.mkdtemp()
        load_succeeded = False
        try:
            orig_schema = load_lib.ReadSchemaFile(schema)
            new_schema = load_lib.RewriteSchema(orig_schema)
            new_schema_file = '%s/schema.enc_schema' % temp_dir
            # write the new schema as a json file
            with open(new_schema_file, 'wt') as f:
                json.dump(new_schema, f, indent=2)
            new_source_file = '%s/data.enc_data' % temp_dir
            # TODO(user): Put the filepath to the master key in .bigqueryrc file.
            master_key = load_lib.ReadMasterKeyFile(
                self.master_key_filename, True)
            table_name = str(destination_table).split(':')[-1]
            table_id = '%s_%s' % (
                table_name,
                self._GetTableCreationTime(str(destination_table)))
            hashed_table_key, table_version, table_schema = (
                self._GetEBQTableInfo(str(destination_table)))
            hashed_master_key = hashlib.sha1(master_key)
            # pylint: disable=too-many-function-args
            hashed_master_key = base64.b64encode(hashed_master_key.digest())
            if hashed_master_key != hashed_table_key:
                raise bigquery_client.BigqueryAccessDeniedError(
                    'Invalid master key for this table.', None, None, None)
            if table_version != util.EBQ_TABLE_VERSION:
                raise bigquery_client.BigqueryNotFoundError(
                    'Invalid table version.', None, None, None)
            # Stored schema is base64(encrypt(zlib(utf-8 json))); undo each
            # layer in reverse and compare with the schema given.
            # TODO(user): Generate a different key.
            cipher = ecrypto.ProbabilisticCipher(master_key)
            table_schema = cipher.Decrypt(
                base64.b64decode(table_schema), raw=True)
            table_schema = zlib.decompress(table_schema)
            table_schema = table_schema.decode('utf-8')
            table_schema = json.loads(table_schema)
            if table_schema != orig_schema:
                raise bigquery_client.BigqueryAccessDeniedError(
                    'Invalid schema for this table.', None, None, None)
            # A missing source_format is handled like an empty one (CSV)
            # instead of raising KeyError.
            source_format = kwds.get('source_format')
            if source_format == 'NEWLINE_DELIMITED_JSON':
                load_lib.ConvertJsonDataFile(orig_schema, master_key, table_id,
                                             source, new_source_file)
            elif source_format == 'CSV' or not source_format:
                load_lib.ConvertCsvDataFile(orig_schema, master_key, table_id,
                                            source, new_source_file)
            else:
                raise app.UsageError(
                    'Currently, we do not allow loading from file types other than\n'
                    'NEWLINE_DELIMITED_JSON and CSV.')
            job = super(EncryptedBigqueryClient, self).Load(
                destination_table,
                new_source_file,
                schema=new_schema_file,
                **kwds)
            load_succeeded = True
        finally:
            # On failure remove the temporary files best-effort so the
            # original exception is the one that propagates.
            if not load_succeeded:
                shutil.rmtree(temp_dir, ignore_errors=True)
        try:
            shutil.rmtree(temp_dir)
        except OSError:
            raise OSError('Temp file deleted by user before termination.')
        return job
def _DecryptRows(fields, rows, master_key, table_id, schema, query_list,
                 aggregation_query_list, unencrypted_query_list):
    """Decrypts all values in rows.

    Each result column is dispatched on its field type and on the encryption
    prefix of its (last dotted component of its) name. Columns whose name is
    an aggregation alias of the form ``f<index>_`` are resolved back to the
    matching entry of ``aggregation_query_list`` and decrypted according to
    the aggregation they came from.

    Note: the ``name`` of every entry of ``fields`` is rewritten in place,
    replacing ``util.PERIOD_REPLACEMENT`` with ``'.'``.

    Arguments:
      fields: Column descriptions; each entry is indexed with 'name' and
        'type'.
      rows: Table values; ``rows[j][i]`` is the value of column i in row j.
      master_key: Key to get ciphers.
      table_id: Used to generate keys.
      schema: Represents information about fields.
      query_list: List of fields that were queried.
      aggregation_query_list: List of aggregations of fields that were
        queried.
      unencrypted_query_list: List of unencrypted expressions.

    Returns:
      A dictionary that maps each query to a list of decrypted values.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: A SEARCHWORDS encrypted field
        was queried (directly or through GROUP_CONCAT), or GROUP_CONCAT was
        applied to a homomorphically encrypted field.
    """
    # One cipher per decryptable encryption mode, keyed by the field prefix.
    ciphers = {
        util.PROBABILISTIC_PREFIX: ecrypto.ProbabilisticCipher(
            ecrypto.GenerateProbabilisticCipherKey(master_key, table_id)),
        util.PSEUDONYM_PREFIX: ecrypto.PseudonymCipher(
            ecrypto.GeneratePseudonymCipherKey(master_key, table_id)),
        util.HOMOMORPHIC_INT_PREFIX: ecrypto.HomomorphicIntCipher(
            ecrypto.GenerateHomomorphicCipherKey(master_key, table_id)),
        util.HOMOMORPHIC_FLOAT_PREFIX: ecrypto.HomomorphicFloatCipher(
            ecrypto.GenerateHomomorphicCipherKey(master_key, table_id)),
    }

    # Seed the result with an empty list for every expected output column.
    queried_values = {}
    for query in query_list:
        tokens = query.split(' ')
        if len(tokens) >= 3 and tokens[-2] == 'AS':
            # Drop the trailing "AS <alias>" so the key is the expression.
            queried_values[' '.join(tokens[:-2])] = []
        else:
            queried_values[query] = []
    for query in aggregation_query_list:
        queried_values[query] = []
    for i, _ in enumerate(unencrypted_query_list):
        queried_values['%s%d_' % (util.UNENCRYPTED_ALIAS_PREFIX, i)] = []

    for i, field in enumerate(fields):
        # Restore the periods of nested field names. The caller's field
        # entry is updated in place, exactly as before.
        name = field['name'].replace(util.PERIOD_REPLACEMENT, '.')
        field['name'] = name
        encrypted_name = name.split('.')[-1]
        # None unless this column is a valid ``f<index>_`` aggregation alias;
        # ordinary columns such as ``foo_`` must not be parsed as aliases.
        aggregation_index = _AggregationAliasIndex(
            name, len(aggregation_query_list))

        if field['type'] == 'TIMESTAMP':
            queried_values[name] = _GetTimestampValues(rows, i)
        elif encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
            queried_values[name] = _DecryptValues(
                name, rows, i, ciphers, schema, util.PROBABILISTIC_PREFIX)
        elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
            queried_values[name] = _DecryptValues(
                name, rows, i, ciphers, schema, util.PSEUDONYM_PREFIX)
        elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
            raise bigquery_client.BigqueryInvalidQueryError(
                'Cannot decrypt searchwords encryption. Decryption of SEARCHWORDS '
                'is limited to PROBABILISTIC_SEARCHWORDS encryption.', None,
                None, None)
        elif encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX):
            queried_values[name] = _DecryptValues(
                name, rows, i, ciphers, schema, util.HOMOMORPHIC_INT_PREFIX)
        elif encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
            queried_values[name] = _DecryptValues(
                name, rows, i, ciphers, schema, util.HOMOMORPHIC_FLOAT_PREFIX)
        elif (encrypted_name.startswith(util.UNENCRYPTED_ALIAS_PREFIX)
              and encrypted_name.endswith('_')):
            queried_values[name] = _GetUnencryptedValuesWithType(
                rows, i, field['type'])
        elif aggregation_index is not None:
            original_fieldname = (
                aggregation_query_list[aggregation_index].strip())
            # Strip a trailing "WITHIN <scope>" clause to get the bare call.
            tokens = original_fieldname.split(' ')
            if len(tokens) >= 3 and tokens[-2].lower() == 'within':
                actual_field = ' '.join(tokens[:-2])
            else:
                actual_field = original_fieldname

            if original_fieldname.startswith(util.GROUP_CONCAT_PREFIX):
                # Argument of the call, without the closing parenthesis.
                concat_field = actual_field.split(
                    util.GROUP_CONCAT_PREFIX)[1][:-1].strip()
                concat_name = concat_field.split('.')[-1]
                if concat_name.startswith(util.PROBABILISTIC_PREFIX):
                    queried_values[original_fieldname] = (
                        _DecryptGroupConcatValues(original_fieldname, rows, i,
                                                  ciphers, schema,
                                                  util.PROBABILISTIC_PREFIX))
                elif concat_name.startswith(util.PSEUDONYM_PREFIX):
                    queried_values[original_fieldname] = (
                        _DecryptGroupConcatValues(original_fieldname, rows, i,
                                                  ciphers, schema,
                                                  util.PSEUDONYM_PREFIX))
                elif concat_name.startswith(
                        (util.HOMOMORPHIC_INT_PREFIX,
                         util.HOMOMORPHIC_FLOAT_PREFIX)):
                    raise bigquery_client.BigqueryInvalidQueryError(
                        'GROUP_CONCAT only accepts string type.', None, None,
                        None)
                elif concat_name.startswith(util.SEARCHWORDS_PREFIX):
                    raise bigquery_client.BigqueryInvalidQueryError(
                        'Invalid query, cannot recover searchwords encryption.',
                        None, None, None)
                else:
                    # Unencrypted argument: pass the raw values through.
                    # Assign a fresh list (like every other branch) instead
                    # of appending, so a key that was seeded with different
                    # whitespace cannot raise KeyError.
                    queried_values[original_fieldname] = [
                        row[i] for row in rows
                    ]
            elif original_fieldname.startswith(('COUNT(', 'AVG(', 'SUM(')):
                queried_values[original_fieldname] = (
                    _GetUnencryptedValuesWithType(rows, i, field['type']))
            elif original_fieldname.startswith('TOP('):
                # First argument of TOP(...), without the closing parenthesis.
                fieldname = actual_field.split('TOP(')[1][:-1].strip()
                fieldname = fieldname.split(',')[0].strip()
                if fieldname.split('.')[-1].startswith(util.PSEUDONYM_PREFIX):
                    queried_values[original_fieldname] = _DecryptValues(
                        fieldname, rows, i, ciphers, schema,
                        util.PSEUDONYM_PREFIX)
                else:
                    queried_values[original_fieldname] = (
                        _GetUnencryptedValues(original_fieldname, rows, i,
                                              schema))
            elif original_fieldname.startswith(util.PAILLIER_SUM_PREFIX):
                # First argument of the call, minus its last character.
                real_fieldname = original_fieldname.split(
                    util.PAILLIER_SUM_PREFIX)[1]
                real_fieldname = real_fieldname.split(',')[0][:-1]
                sum_argument = real_fieldname.split('.')[-1]
                if sum_argument.startswith(util.HOMOMORPHIC_INT_PREFIX):
                    queried_values[original_fieldname] = _DecryptValues(
                        real_fieldname, rows, i, ciphers, schema,
                        util.HOMOMORPHIC_INT_PREFIX)
                elif sum_argument.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
                    queried_values[original_fieldname] = _DecryptValues(
                        real_fieldname, rows, i, ciphers, schema,
                        util.HOMOMORPHIC_FLOAT_PREFIX)
                # NOTE(review): any other argument leaves this column's
                # entry untouched, as in the original code.
            else:
                # Unrecognised aggregation: keep the values under the alias.
                queried_values[name] = _GetUnencryptedValuesWithType(
                    rows, i, field['type'])
        else:
            queried_values[name] = _GetUnencryptedValuesWithType(
                rows, i, field['type'])

    return queried_values


def _AggregationAliasIndex(field_name, num_aggregations):
    """Returns the index encoded in an ``f<index>_`` alias, or None.

    Arguments:
      field_name: Full (period-restored) name of a result column.
      num_aggregations: Number of entries in the aggregation query list.

    Returns:
      The integer between the leading 'f' and the trailing '_' when the name
      has that exact shape and the integer is a valid index into a list of
      num_aggregations entries; otherwise None.
    """
    if not (field_name.startswith('f') and field_name.endswith('_')):
        return None
    digits = field_name[1:-1]
    if not digits.isdigit():
        return None
    try:
        index = int(digits)
    except ValueError:
        # Some characters satisfy isdigit() but are rejected by int().
        return None
    if index >= num_aggregations:
        return None
    return index