def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
  """Encrypts data in a json file based on schema provided.

  Arguments:
    schema: User defined values and types.
    master_key: Key to provide ciphers.
    table_id: Used to derive a unique key for each table.
    infile: File to be encrypted.
    outfile: Location where the encrypted file is written.
  """
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  # TODO(user): ciphers and hash should not use the same key.
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  _ValidateJsonDataFile(schema, infile)
  with open(infile, 'rb') as in_file:
    with open(outfile, 'wb') as out_file:
      for line in in_file:
        data = json.loads(line)
        data = _StrToUnicode(data)
        rewritten_data = _ConvertJsonField(
            data, schema, prob_cipher, pseudonym_cipher, string_hasher,
            homomorphic_int_cipher, homomorphic_float_cipher)
        rewritten_data = json.dumps(rewritten_data)
        out_file.write(rewritten_data + '\n')
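For orientation, a minimal usage sketch of the converter above. The paths and the creation timestamp are hypothetical, and the snippet assumes it runs inside the same load_lib module as the converter, so helpers are called unqualified:

# A usage sketch under stated assumptions: 'cars.schema', 'ebq.key', and
# the '1363764002' creation timestamp are all made up for illustration.
schema = ReadSchemaFile('cars.schema')      # per-column 'encrypt' modes
master_key = ReadMasterKeyFile('ebq.key')   # raw master key bytes
# Load() derives the table_id the same way: table name + creation time.
table_id = '%s_%s' % ('dataset.cars', '1363764002')
ConvertJsonDataFile(schema, master_key, table_id,
                    'cars.json', 'cars.enc_data')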
def testDecryptGroupConcatValues(self):
  cars_schema = test_util.GetCarsSchema()
  jobs_schema = test_util.GetJobsSchema()
  master_key = test_util.GetMasterKey()
  query = 'GROUP_CONCAT(%sModel)' % util.PROBABILISTIC_PREFIX
  cipher = ecrypto.ProbabilisticCipher(master_key)
  ciphers = {util.PROBABILISTIC_PREFIX: cipher}
  unencrypted_values = [['A', 'B', 'C', 'D'],
                        ['1', '2', '3', '4'],
                        ['Hello', 'Bye']]
  table = []
  for values in unencrypted_values:
    encrypted_values = []
    for token in values:
      encrypted_values.append(cipher.Encrypt(unicode(token)))
    table.append([','.join(encrypted_values), random.random()])
  table.insert(0, [None, None])
  column = encrypted_bigquery_client._DecryptGroupConcatValues(
      query, table, 0, ciphers, cars_schema, util.PROBABILISTIC_PREFIX)
  self.assertEqual(column,
                   [util.LiteralToken('null', None),
                    util.StringLiteralToken('"A,B,C,D"'),
                    util.StringLiteralToken('"1,2,3,4"'),
                    util.StringLiteralToken('"Hello,Bye"')])
  query = ('GROUP_CONCAT(citiesLived.job.%sposition) within citiesLived.job'
           % util.PSEUDONYM_PREFIX)
  cipher = ecrypto.PseudonymCipher(master_key)
  ciphers = {util.PSEUDONYM_PREFIX: cipher}
  table = []
  for values in unencrypted_values:
    encrypted_values = []
    for token in values:
      encrypted_values.append(cipher.Encrypt(unicode(token)))
    table.append([','.join(encrypted_values)])
  column = encrypted_bigquery_client._DecryptGroupConcatValues(
      query, table, 0, ciphers, jobs_schema, util.PSEUDONYM_PREFIX)
  self.assertEqual(column,
                   [util.StringLiteralToken('"A,B,C,D"'),
                    util.StringLiteralToken('"1,2,3,4"'),
                    util.StringLiteralToken('"Hello,Bye"')])
  query = '%sModel' % util.PROBABILISTIC_PREFIX
  self.assertRaises(ValueError,
                    encrypted_bigquery_client._DecryptGroupConcatValues,
                    query, table, 0, ciphers, cars_schema,
                    util.PROBABILISTIC_PREFIX)
  query = ('GROUP_CONCAT(citiesLived.%snumberOfYears) within citiesLived'
           % util.HOMOMORPHIC_FLOAT_PREFIX)
  self.assertRaises(bigquery_client.BigqueryInvalidQueryError,
                    encrypted_bigquery_client._DecryptGroupConcatValues,
                    query, table, 0, ciphers, jobs_schema,
                    util.HOMOMORPHIC_FLOAT_PREFIX)
def Query(self, query, **kwds):
  """Execute the given query, returning the created job and info for print.

  Arguments:
    query: Query to execute.
    **kwds: Passed on to BigqueryClient.ExecuteJob.

  Returns:
    The resulting job info and other info necessary for printing.
  """
  self._CheckKeyfileFlag()
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename)
  try:
    clauses = parser.ParseQuery(query)
  except ParseException as e:
    raise bigquery_client.BigqueryInvalidQueryError(e, None, None, None)
  if clauses['FROM']:
    table_id = '%s_%s' % (
        clauses['FROM'][0],
        self._GetTableCreationTime(clauses['FROM'][0]))
    hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
        clauses['FROM'][0])
    hashed_master_key = hashlib.sha1(master_key)
    # pylint: disable=too-many-function-args
    hashed_master_key = base64.b64encode(hashed_master_key.digest())
    if hashed_master_key != hashed_table_key:
      raise bigquery_client.BigqueryAccessDeniedError(
          'Invalid master key for this table.', None, None, None)
    if table_version != util.EBQ_TABLE_VERSION:
      raise bigquery_client.BigqueryNotFoundError(
          'Invalid table version.', None, None, None)
    cipher = ecrypto.ProbabilisticCipher(master_key)
    orig_schema = zlib.decompress(
        cipher.Decrypt(base64.b64decode(table_schema), raw=True))
    orig_schema = json.loads(orig_schema.decode('utf-8'))
  else:
    table_id = None
    orig_schema = []
  manifest = query_lib.QueryManifest.Generate()
  rewritten_query, print_args = query_lib.RewriteQuery(
      clauses, orig_schema, master_key, table_id, manifest)
  job = super(EncryptedBigqueryClient, self).Query(rewritten_query, **kwds)
  self._LoadJobStatistics(manifest, job)
  printer = EncryptedTablePrinter(**print_args)
  bq.Factory.ClientTablePrinter.SetTablePrinter(printer)
  return job
def CreateTable(self, reference, ignore_existing=False, schema=None,
                description=None, friendly_name=None, expiration=None):
  """Create a table corresponding to TableReference.

  Arguments:
    reference: the TableReference to create.
    ignore_existing: (boolean, default False) If False, raise an exception
      if the table already exists.
    schema: A required schema (also requires a master key).
    description: an optional table description.
    friendly_name: an optional friendly name for the table.
    expiration: optional expiration time in milliseconds since the epoch.

  Raises:
    TypeError: if reference is not a TableReference.
    BigqueryDuplicateError: if reference exists and ignore_existing
      is False.
  """
  if schema is None:
    raise bigquery_client.BigqueryNotFoundError(
        'A schema must be specified when making a table.', None, None, None)
  self._CheckKeyfileFlag()
  schema = load_lib.ReadSchemaFile(schema)
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
  # pylint: disable=too-many-function-args
  hashed_key = base64.b64encode(hashlib.sha1(master_key).digest())
  cipher = ecrypto.ProbabilisticCipher(master_key)
  pretty_schema = json.dumps(schema)
  pretty_schema = pretty_schema.encode('utf-8')
  pretty_schema = zlib.compress(pretty_schema)
  encrypted_schema = base64.b64encode(cipher.Encrypt(pretty_schema))
  if description is None:
    description = ''
  new_description = util.ConstructTableDescription(
      description, hashed_key, util.EBQ_TABLE_VERSION, encrypted_schema)
  new_schema = load_lib.RewriteSchema(schema)
  super(EncryptedBigqueryClient, self).CreateTable(
      reference, ignore_existing, new_schema, new_description,
      friendly_name, expiration)
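CreateTable smuggles three pieces of metadata into the table description: a SHA-1 hash of the master key (for later key verification), the EBQ table version, and the zlib-compressed, probabilistically encrypted original schema. A minimal sketch of that pipeline in isolation, assuming master_key and schema are already loaded and the description text is made up:

# Sketch only: the description metadata round trip. Query() and Load()
# later split these pieces back out via _GetEBQTableInfo() and compare
# hashed_key against a hash of the local master key.
hashed_key = base64.b64encode(hashlib.sha1(master_key).digest())
cipher = ecrypto.ProbabilisticCipher(master_key)
packed = zlib.compress(json.dumps(schema).encode('utf-8'))
encrypted_schema = base64.b64encode(cipher.Encrypt(packed))
description = util.ConstructTableDescription(
    'user supplied text', hashed_key, util.EBQ_TABLE_VERSION,
    encrypted_schema)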
def UpdateTable(self, reference, schema=None, description=None,
                friendly_name=None, expiration=None):
  """Updates a table.

  Arguments:
    reference: the TableReference to update.
    schema: an optional schema.
    description: an optional table description.
    friendly_name: an optional friendly name for the table.
    expiration: optional expiration time in milliseconds since the epoch.
  """
  if schema:
    self._CheckKeyfileFlag()
  if description:
    hashed_table_key, table_version, table_schema = (
        self._GetEBQTableInfo(str(reference)))
    if schema:
      master_key = load_lib.ReadMasterKeyFile(self.master_key_filename)
      # pylint: disable=too-many-function-args
      hashed_key = base64.b64encode(hashlib.sha1(master_key).digest())
      if hashed_key != hashed_table_key:
        raise bigquery_client.BigqueryAccessDeniedError(
            'Invalid master key for this table.', None, None, None)
      cipher = ecrypto.ProbabilisticCipher(master_key)
      real_schema = json.dumps(load_lib.RewriteSchema(schema))
      real_schema = real_schema.encode('utf-8')
      table_schema = base64.b64encode(
          cipher.Encrypt(zlib.compress(real_schema)))
    description = util.ConstructTableDescription(
        description, hashed_table_key, table_version, table_schema)
  # Rewrite the schema if the schema is to be updated.
  if schema:
    schema = load_lib.RewriteSchema(schema)
  super(EncryptedBigqueryClient, self).UpdateTable(
      reference, schema, description, friendly_name, expiration)
def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
  """Encrypts data in a json file based on schema provided.

  Arguments:
    schema: User defined values and types.
    master_key: Key to provide ciphers.
    table_id: Used to derive a unique key for each table.
    infile: File to be encrypted.
    outfile: Location where the encrypted file is written.
  """
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  # TODO(user): ciphers and hash should not use the same key.
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  _ValidateJsonDataFile(schema, infile)
  with open(infile, 'rb') as in_file:
    with open(outfile, 'wb') as out_file:
      for line in in_file:
        data = json.loads(line)
        data = _StrToUnicode(data)
        rewritten_data = _ConvertJsonField(
            data, schema, prob_cipher, pseudonym_cipher, string_hasher,
            homomorphic_int_cipher, homomorphic_float_cipher)
        # When python prints unicode strings, it uses single quotes and
        # prepends a u before the string (such as u'Hello'). Json does not
        # understand this and only allows double quoted strings without
        # any prefixes, therefore we must substitute to fit the criteria.
        rewritten_data = str(rewritten_data).replace('u\'', '"')
        rewritten_data = rewritten_data.replace('\'', '"')
        out_file.write(rewritten_data + '\n')
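This variant serializes via str() plus blanket quote substitution, which breaks on values that themselves contain apostrophes; the json.dumps variant earlier in this section avoids that. A tiny demonstration (Python 2 semantics, illustrative only, not part of the module):

# Why the quote substitution is fragile: repr picks quoting per string,
# so the replacements both miss prefixes and corrupt embedded quotes.
data = {u'Model': u"O'Reilly"}
broken = str(data).replace("u'", '"').replace("'", '"')
# broken == '{"Model": u"O"Reilly"}'  -- not valid JSON
good = json.dumps(data)
# good == '{"Model": "O\'Reilly"}'    -- valid JSON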
def setUp(self):
  """Run once for each test in the class."""
  self.cipher = ecrypto.ProbabilisticCipher(_KEY1)
def ConvertCsvDataFile(schema, master_key, table_id, infile, outfile):
  """Reads utf8 csv data, encrypts and stores into a new csv utf8 data file."""
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  # TODO(user): ciphers and hash should not use the same key.
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  with open(infile, 'rb') as in_file:
    with open(outfile, 'wb') as out_file:
      num_columns = len(schema)
      csv_writer = csv.writer(out_file)
      _ValidateCsvDataFile(schema, infile)
      csv_reader = _Utf8CsvReader(in_file, csv_writer)
      for row in csv_reader:
        new_row = []
        if len(row) != num_columns:
          raise EncryptConvertError(
              'Number of fields in schema does not match row: %s' % row)
        for i in xrange(num_columns):
          encrypt_mode = schema[i]['encrypt']
          if encrypt_mode == 'none':
            new_row.append(row[i].encode('utf-8'))
          elif encrypt_mode == 'probabilistic':
            new_row.append(prob_cipher.Encrypt(row[i]).encode('utf-8'))
          elif encrypt_mode == 'pseudonym':
            new_row.append(pseudonym_cipher.Encrypt(row[i]).encode('utf-8'))
          elif encrypt_mode == 'homomorphic' and schema[i]['type'] == 'integer':
            new_row.append(
                homomorphic_int_cipher.Encrypt(long(row[i])).encode('utf-8'))
          elif encrypt_mode == 'homomorphic' and schema[i]['type'] == 'float':
            new_row.append(
                homomorphic_float_cipher.Encrypt(float(row[i])).encode('utf-8'))
          elif encrypt_mode == 'searchwords':
            if 'searchwords_separator' in schema[i]:
              searchwords_separator = schema[i]['searchwords_separator']
            else:
              searchwords_separator = None
            if 'max_word_sequence' in schema[i]:
              max_word_sequence = schema[i]['max_word_sequence']
            else:
              max_word_sequence = 5
            new_row.append(
                string_hasher.GetHashesForWordSubsequencesWithIv(
                    util.SEARCHWORDS_PREFIX + schema[i]['name'], row[i],
                    separator=searchwords_separator,
                    max_sequence_len=max_word_sequence).encode('utf-8'))
          elif encrypt_mode == 'probabilistic_searchwords':
            if 'searchwords_separator' in schema[i]:
              searchwords_separator = schema[i]['searchwords_separator']
            else:
              searchwords_separator = None
            if 'max_word_sequence' in schema[i]:
              max_word_sequence = schema[i]['max_word_sequence']
            else:
              max_word_sequence = 5
            new_row.append(
                string_hasher.GetHashesForWordSubsequencesWithIv(
                    util.SEARCHWORDS_PREFIX + schema[i]['name'], row[i],
                    separator=searchwords_separator,
                    max_sequence_len=max_word_sequence).encode('utf-8'))
            new_row.append(prob_cipher.Encrypt(row[i]).encode('utf-8'))
        csv_writer.writerow(new_row)
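The converter is driven entirely by each column's 'encrypt' mode in the schema. A hypothetical schema illustrating the modes the dispatch above handles (field names and values are made up):

example_schema = [
    {'name': 'Make', 'type': 'string', 'encrypt': 'none'},
    {'name': 'Model', 'type': 'string', 'encrypt': 'probabilistic'},
    {'name': 'Owner', 'type': 'string', 'encrypt': 'pseudonym'},
    {'name': 'Year', 'type': 'integer', 'encrypt': 'homomorphic'},
    {'name': 'Price', 'type': 'float', 'encrypt': 'homomorphic'},
    {'name': 'Description', 'type': 'string', 'encrypt': 'searchwords',
     'searchwords_separator': ' ', 'max_word_sequence': 3},
]
# Note that 'probabilistic_searchwords' columns emit two output fields per
# input field: the word-subsequence hashes plus a probabilistic encryption.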
def _DecryptRows(fields, rows, master_key, table_id, schema, query_list,
                 aggregation_query_list, unencrypted_query_list,
                 manifest=None):
  """Decrypts all values in rows.

  Arguments:
    fields: Column names.
    rows: Table values.
    master_key: Key to get ciphers.
    table_id: Used to generate keys.
    schema: Represents information about fields.
    query_list: List of fields that were queried.
    aggregation_query_list: List of aggregations of fields that were queried.
    unencrypted_query_list: List of unencrypted expressions.
    manifest: optional, query_lib.QueryManifest instance.

  Returns:
    A dictionary that returns for each query, a list of decrypted values.

  Raises:
    bigquery_client.BigqueryInvalidQueryError: User trying to query for a
      SEARCHWORD encrypted field. SEARCHWORD encrypted fields cannot be
      decrypted.
  """
  # create ciphers for decryption
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  ciphers = {
      util.PROBABILISTIC_PREFIX: prob_cipher,
      util.PSEUDONYM_PREFIX: pseudonym_cipher,
      util.HOMOMORPHIC_INT_PREFIX: homomorphic_int_cipher,
      util.HOMOMORPHIC_FLOAT_PREFIX: homomorphic_float_cipher,
  }
  queried_values = {}
  for query in query_list:
    if len(query.split(' ')) >= 3 and query.split(' ')[-2] == 'AS':
      queried_values[' '.join(query.split(' ')[:-2])] = []
    else:
      queried_values[query] = []
  for query in aggregation_query_list:
    queried_values[query] = []
  for i in xrange(len(unencrypted_query_list)):
    queried_values['%s%d_' % (util.UNENCRYPTED_ALIAS_PREFIX, i)] = []
  # If a manifest is supplied rewrite the column names according to any
  # computed aliases that were used. Otherwise, resort to the old scheme
  # of substituting the '.' in multidimensional schemas in/out.
  if manifest is not None:
    for i in xrange(len(fields)):
      # TODO(user): This is a hash lookup on every column name.
      # The lookup is efficient and the column names are sufficiently random
      # as compared to likely human language column names such that false
      # hits should not be possible. However this may need future revision.
      n = manifest.GetColumnNameForAlias(fields[i]['name'])
      if n is not None:
        fields[i]['name'] = n
  else:
    for i in xrange(len(fields)):
      fields[i]['name'] = fields[i]['name'].replace(
          util.PERIOD_REPLACEMENT, '.')
  for i in xrange(len(fields)):
    encrypted_name = fields[i]['name'].split('.')[-1]
    if fields[i]['type'] == 'TIMESTAMP':
      queried_values[fields[i]['name']] = _GetTimestampValues(rows, i)
    elif encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
      queried_values[fields[i]['name']] = (
          _DecryptValues(fields[i]['name'], rows, i, ciphers, schema,
                         util.PROBABILISTIC_PREFIX))
    elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
      queried_values[fields[i]['name']] = (
          _DecryptValues(fields[i]['name'], rows, i, ciphers, schema,
                         util.PSEUDONYM_PREFIX))
    elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Cannot decrypt searchwords encryption. Decryption of SEARCHWORDS '
          'is limited to PROBABILISTIC_SEARCHWORDS encryption.',
          None, None, None)
    elif encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX):
      queried_values[fields[i]['name']] = (
          _DecryptValues(fields[i]['name'], rows, i, ciphers, schema,
                         util.HOMOMORPHIC_INT_PREFIX))
    elif encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
      queried_values[fields[i]['name']] = (
          _DecryptValues(fields[i]['name'], rows, i, ciphers, schema,
                         util.HOMOMORPHIC_FLOAT_PREFIX))
    elif (encrypted_name.startswith(util.UNENCRYPTED_ALIAS_PREFIX) and
          encrypted_name.endswith('_')):
      queried_values[fields[i]['name']] = (
          _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
    elif encrypted_name.startswith('f') and encrypted_name.endswith('_'):
      index = int(fields[i]['name'][1:-1])
      original_fieldname = aggregation_query_list[index]
      original_fieldname = original_fieldname.strip()
      if (len(original_fieldname.split(' ')) >= 3 and
          original_fieldname.split(' ')[-2].lower() == 'within'):
        actual_field = original_fieldname.split(' ')[:-2]
        actual_field = ' '.join(actual_field)
      else:
        actual_field = original_fieldname
      if original_fieldname.startswith(util.GROUP_CONCAT_PREFIX):
        concat_field = actual_field.split(
            util.GROUP_CONCAT_PREFIX)[1][:-1].strip()
        encrypted_name = concat_field.split('.')[-1]
        if encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i, ciphers,
                                        schema, util.PROBABILISTIC_PREFIX))
        elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i, ciphers,
                                        schema, util.PSEUDONYM_PREFIX))
        elif (encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX) or
              encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX)):
          raise bigquery_client.BigqueryInvalidQueryError(
              'GROUP_CONCAT only accepts string type.', None, None, None)
        elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
          raise bigquery_client.BigqueryInvalidQueryError(
              'Invalid query, cannot recover searchwords encryption.',
              None, None, None)
        else:
          for j in xrange(len(rows)):
            queried_values[original_fieldname].append(rows[j][i])
      elif (original_fieldname.startswith('COUNT(') or
            original_fieldname.startswith('AVG(') or
            original_fieldname.startswith('SUM(')):
        queried_values[original_fieldname] = (
            _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
      elif original_fieldname.startswith('TOP('):
        fieldname = actual_field.split('TOP(')[1][:-1].strip()
        fieldname = fieldname.split(',')[0].strip()
        if fieldname.split('.')[-1].startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptValues(fieldname, rows, i, ciphers, schema,
                             util.PSEUDONYM_PREFIX))
        else:
          queried_values[original_fieldname] = (
              _GetUnencryptedValues(original_fieldname, rows, i, schema))
      elif original_fieldname.startswith(util.PAILLIER_SUM_PREFIX):
        sum_argument = original_fieldname.split(util.PAILLIER_SUM_PREFIX)[1]
        sum_argument = sum_argument.split(',')[0][:-1]
        sum_argument = sum_argument.split('.')[-1]
        real_fieldname = original_fieldname.split(util.PAILLIER_SUM_PREFIX)[1]
        real_fieldname = real_fieldname.split(',')[0][:-1]
        if sum_argument.startswith(util.HOMOMORPHIC_INT_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptValues(real_fieldname, rows, i, ciphers, schema,
                             util.HOMOMORPHIC_INT_PREFIX))
        elif sum_argument.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptValues(real_fieldname, rows, i, ciphers, schema,
                             util.HOMOMORPHIC_FLOAT_PREFIX))
      else:
        queried_values[fields[i]['name']] = (
            _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
    else:
      queried_values[fields[i]['name']] = (
          _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
  return queried_values
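BigQuery names unaliased aggregation results f0_, f1_, and so on; the 'fN_' branch above turns that alias back into the original aggregate expression by indexing into aggregation_query_list. A small sketch with made-up values, following the naming used in the tests in this package:

# Illustrative only: recovering the original aggregate from a default alias.
fields = [{'name': 'f0_', 'type': 'STRING'}]
aggregation_query_list = ['GROUP_CONCAT(%sModel)' % util.PROBABILISTIC_PREFIX]
index = int(fields[0]['name'][1:-1])  # 'f0_' -> 0
original_fieldname = aggregation_query_list[index].strip()
# The prefix of the argument inside GROUP_CONCAT(...) then selects the
# cipher from the ciphers dict, exactly as in the dispatch above.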
def Load(self, destination_table, source, schema=None, **kwds):
  """Encrypt the given data and then load it into BigQuery.

  The job will execute synchronously if sync=True is provided as
  an argument.

  Args:
    destination_table: TableReference to load data into.
    source: String specifying source data to load.
    schema: The schema that defines fields to be loaded.
    **kwds: Passed on to self.ExecuteJob.

  Returns:
    The resulting job info.
  """
  self._CheckKeyfileFlag()
  self._CheckSchemaFile(schema)
  # To make encryption more secure, we use different keys for each table
  # and cipher. To generate a different key for each table, we need a
  # distinct table identifier for each table. A table name is not secure
  # since a table can be deleted and created with the same name and, thus,
  # the same key. The only distinct identifier happens to be creation time.
  # Therefore, we must construct a table if it does not exist so we can use
  # the creation time to encrypt values.
  try:
    self.CreateTable(destination_table, schema=schema)
  except bigquery_client.BigqueryDuplicateError:
    pass  # Table already exists.
  temp_dir = tempfile.mkdtemp()
  orig_schema = load_lib.ReadSchemaFile(schema)
  new_schema = load_lib.RewriteSchema(orig_schema)
  new_schema_file = '%s/schema.enc_schema' % temp_dir
  # write the new schema as a json file
  with open(new_schema_file, 'wt') as f:
    json.dump(new_schema, f, indent=2)
  new_source_file = '%s/data.enc_data' % temp_dir
  # TODO(user): Put the filepath to the master key in .bigqueryrc file.
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
  table_name = str(destination_table).split(':')[-1]
  table_id = '%s_%s' % (
      table_name, self._GetTableCreationTime(str(destination_table)))
  hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
      str(destination_table))
  hashed_master_key = hashlib.sha1(master_key)
  # pylint: disable=too-many-function-args
  hashed_master_key = base64.b64encode(hashed_master_key.digest())
  if hashed_master_key != hashed_table_key:
    raise bigquery_client.BigqueryAccessDeniedError(
        'Invalid master key for this table.', None, None, None)
  if table_version != util.EBQ_TABLE_VERSION:
    raise bigquery_client.BigqueryNotFoundError(
        'Invalid table version.', None, None, None)
  # TODO(user): Generate a different key.
  cipher = ecrypto.ProbabilisticCipher(master_key)
  table_schema = cipher.Decrypt(base64.b64decode(table_schema), raw=True)
  table_schema = zlib.decompress(table_schema)
  table_schema = table_schema.decode('utf-8')
  table_schema = json.loads(table_schema)
  if table_schema != orig_schema:
    raise bigquery_client.BigqueryAccessDeniedError(
        'Invalid schema for this table.', None, None, None)
  if kwds['source_format'] == 'NEWLINE_DELIMITED_JSON':
    load_lib.ConvertJsonDataFile(
        orig_schema, master_key, table_id, source, new_source_file)
  elif kwds['source_format'] == 'CSV' or not kwds['source_format']:
    load_lib.ConvertCsvDataFile(
        orig_schema, master_key, table_id, source, new_source_file)
  else:
    raise app.UsageError(
        'Currently, we do not allow loading from file types other than\n'
        'NEWLINE_DELIMITED_JSON and CSV.')
  job = super(EncryptedBigqueryClient, self).Load(
      destination_table, new_source_file, schema=new_schema_file, **kwds)
  try:
    shutil.rmtree(temp_dir)
  except OSError:
    raise OSError('Temp file deleted by user before termination.')
  return job
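The comment above explains the design choice: Load() and Query() must derive identical per-table keys, so both build the same table_id string (table name plus creation time) and feed it to the same key-derivation helpers. A sketch of that invariant, assuming master_key is already loaded and using a hypothetical table name and timestamp; it relies on the derivation being deterministic, which the whole encrypt-then-decrypt design presumes:

# Sketch: the same (master_key, table_id) pair yields the same cipher key
# at load time and at query time, keeping encryption and decryption in sync.
table_id = '%s_%s' % ('dataset.cars', '1363764002')
load_key = ecrypto.GeneratePseudonymCipherKey(master_key, table_id)
query_key = ecrypto.GeneratePseudonymCipherKey(master_key, table_id)
assert load_key == query_key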
def _DecryptRows(fields, rows, master_key, table_id, schema, query_list,
                 aggregation_query_list, unencrypted_query_list):
  """Decrypts all values in rows.

  Arguments:
    fields: Column names.
    rows: Table values.
    master_key: Key to get ciphers.
    table_id: Used to generate keys.
    schema: Represents information about fields.
    query_list: List of fields that were queried.
    aggregation_query_list: List of aggregations of fields that were queried.
    unencrypted_query_list: List of unencrypted expressions.

  Returns:
    A dictionary that returns for each query, a list of decrypted values.

  Raises:
    bigquery_client.BigqueryInvalidQueryError: User trying to query for a
      SEARCHWORD encrypted field. SEARCHWORD encrypted fields cannot be
      decrypted.
  """
  # create ciphers for decryption
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  ciphers = {
      util.PROBABILISTIC_PREFIX: prob_cipher,
      util.PSEUDONYM_PREFIX: pseudonym_cipher,
      util.HOMOMORPHIC_INT_PREFIX: homomorphic_int_cipher,
      util.HOMOMORPHIC_FLOAT_PREFIX: homomorphic_float_cipher,
  }
  queried_values = {}
  for query in query_list:
    if len(query.split(' ')) >= 3 and query.split(' ')[-2] == 'AS':
      queried_values[' '.join(query.split(' ')[:-2])] = []
    else:
      queried_values[query] = []
  for query in aggregation_query_list:
    queried_values[query] = []
  for i in xrange(len(unencrypted_query_list)):
    queried_values['%s%d_' % (util.UNENCRYPTED_ALIAS_PREFIX, i)] = []
  for i in xrange(len(fields)):
    fields[i]['name'] = fields[i]['name'].replace(util.PERIOD_REPLACEMENT,
                                                  '.')
    encrypted_name = fields[i]['name'].split('.')[-1]
    if fields[i]['type'] == 'TIMESTAMP':
      queried_values[fields[i]['name']] = _GetTimestampValues(rows, i)
    elif encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
      queried_values[fields[i]['name']] = (
          _DecryptValues(fields[i]['name'], rows, i, ciphers, schema,
                         util.PROBABILISTIC_PREFIX))
    elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
      queried_values[fields[i]['name']] = (
          _DecryptValues(fields[i]['name'], rows, i, ciphers, schema,
                         util.PSEUDONYM_PREFIX))
    elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Cannot decrypt searchwords encryption. Decryption of SEARCHWORDS '
          'is limited to PROBABILISTIC_SEARCHWORDS encryption.',
          None, None, None)
    elif encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX):
      queried_values[fields[i]['name']] = (
          _DecryptValues(fields[i]['name'], rows, i, ciphers, schema,
                         util.HOMOMORPHIC_INT_PREFIX))
    elif encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
      queried_values[fields[i]['name']] = (
          _DecryptValues(fields[i]['name'], rows, i, ciphers, schema,
                         util.HOMOMORPHIC_FLOAT_PREFIX))
    elif (encrypted_name.startswith(util.UNENCRYPTED_ALIAS_PREFIX) and
          encrypted_name.endswith('_')):
      queried_values[fields[i]['name']] = (
          _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
    elif encrypted_name.startswith('f') and encrypted_name.endswith('_'):
      index = int(fields[i]['name'][1:-1])
      original_fieldname = aggregation_query_list[index]
      original_fieldname = original_fieldname.strip()
      if (len(original_fieldname.split(' ')) >= 3 and
          original_fieldname.split(' ')[-2].lower() == 'within'):
        actual_field = original_fieldname.split(' ')[:-2]
        actual_field = ' '.join(actual_field)
      else:
        actual_field = original_fieldname
      if original_fieldname.startswith(util.GROUP_CONCAT_PREFIX):
        concat_field = actual_field.split(
            util.GROUP_CONCAT_PREFIX)[1][:-1].strip()
        encrypted_name = concat_field.split('.')[-1]
        if encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i, ciphers,
                                        schema, util.PROBABILISTIC_PREFIX))
        elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i, ciphers,
                                        schema, util.PSEUDONYM_PREFIX))
        elif (encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX) or
              encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX)):
          raise bigquery_client.BigqueryInvalidQueryError(
              'GROUP_CONCAT only accepts string type.', None, None, None)
        elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
          raise bigquery_client.BigqueryInvalidQueryError(
              'Invalid query, cannot recover searchwords encryption.',
              None, None, None)
        else:
          for j in xrange(len(rows)):
            queried_values[original_fieldname].append(rows[j][i])
      elif (original_fieldname.startswith('COUNT(') or
            original_fieldname.startswith('AVG(') or
            original_fieldname.startswith('SUM(')):
        queried_values[original_fieldname] = (
            _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
      elif original_fieldname.startswith('TOP('):
        fieldname = actual_field.split('TOP(')[1][:-1].strip()
        fieldname = fieldname.split(',')[0].strip()
        if fieldname.split('.')[-1].startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptValues(fieldname, rows, i, ciphers, schema,
                             util.PSEUDONYM_PREFIX))
        else:
          queried_values[original_fieldname] = (
              _GetUnencryptedValues(original_fieldname, rows, i, schema))
      elif original_fieldname.startswith(util.PAILLIER_SUM_PREFIX):
        sum_argument = original_fieldname.split(util.PAILLIER_SUM_PREFIX)[1]
        sum_argument = sum_argument.split(',')[0][:-1]
        sum_argument = sum_argument.split('.')[-1]
        real_fieldname = original_fieldname.split(util.PAILLIER_SUM_PREFIX)[1]
        real_fieldname = real_fieldname.split(',')[0][:-1]
        if sum_argument.startswith(util.HOMOMORPHIC_INT_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptValues(real_fieldname, rows, i, ciphers, schema,
                             util.HOMOMORPHIC_INT_PREFIX))
        elif sum_argument.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptValues(real_fieldname, rows, i, ciphers, schema,
                             util.HOMOMORPHIC_FLOAT_PREFIX))
      else:
        queried_values[fields[i]['name']] = (
            _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
    else:
      queried_values[fields[i]['name']] = (
          _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
  return queried_values