def _GetTableCreationTime(self, identifier):
  reference = super(EncryptedBigqueryClient, self).GetReference(identifier)
  object_info = super(EncryptedBigqueryClient, self).GetObjectInfo(reference)
  if object_info is None:
    raise bigquery_client.BigqueryNotFoundError(
        'Table %s not found.' % identifier, None, None, None)
  if 'creationTime' not in object_info:
    raise bigquery_client.BigqueryNotFoundError(
        'Could not gather creation time from table.', None, None, None)
  return object_info['creationTime']
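
# A sketch of the metadata this relies on (values are illustrative only):
# BigQuery table metadata carries 'creationTime' as a string of milliseconds
# since the epoch, which callers below combine with the table name to form a
# per-table identifier.
#
#   object_info = {
#       'creationTime': '1370044456000',  # hypothetical timestamp
#       'description': '...',
#   }
#   table_id = '%s_%s' % ('dataset.mytable', object_info['creationTime'])
#   # -> 'dataset.mytable_1370044456000'
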
def ReadMasterKeyFile(filepath, create=False):
  """Read and return master key from file, else create and store key in file."""
  if not filepath:
    raise bigquery_client.BigqueryNotFoundError(
        'Master key file not specified.', None, None, None)
  if not os.path.exists(filepath):
    if not create:
      raise bigquery_client.BigqueryNotFoundError(
          'Master key file does not exist.', None, None, None)
    print 'Key file does not exist. Generating a new key now.'
    _CreateAndStoreMasterKeyFile(filepath)
  with open(filepath, 'rt') as f:
    master_key = base64.b64decode(f.read())
  if len(master_key) < 16:
    raise EncryptConvertError(
        'Key in %s is too short and may be corrupted. '
        'Please supply a proper key file.' % filepath)
  return master_key
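
# A minimal usage sketch for ReadMasterKeyFile; the key path is hypothetical,
# and create=True generates and stores a fresh key on first use:
#
#   master_key = ReadMasterKeyFile('/home/user/ebq.key', create=True)
#   # master_key holds raw bytes after base64-decoding the file contents,
#   # and is guaranteed to be at least 16 bytes long.
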
def Query(self, query, **kwds):
  """Execute the given query, returning the job and info needed for printing.

  Arguments:
    query: Query to execute.
    **kwds: Passed on to BigqueryClient.ExecuteJob.

  Returns:
    The resulting job info and other info necessary for printing.
  """
  self._CheckKeyfileFlag()
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename)
  try:
    clauses = parser.ParseQuery(query)
  except ParseException as e:
    raise bigquery_client.BigqueryInvalidQueryError(e, None, None, None)
  if clauses['FROM']:
    table_id = '%s_%s' % (
        clauses['FROM'][0],
        self._GetTableCreationTime(clauses['FROM'][0]))
    hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
        clauses['FROM'][0])
    # pylint: disable=too-many-function-args
    hashed_master_key = hashlib.sha1(master_key)
    hashed_master_key = base64.b64encode(hashed_master_key.digest())
    if hashed_master_key != hashed_table_key:
      raise bigquery_client.BigqueryAccessDeniedError(
          'Invalid master key for this table.', None, None, None)
    if table_version != util.EBQ_TABLE_VERSION:
      raise bigquery_client.BigqueryNotFoundError(
          'Invalid table version.', None, None, None)
    cipher = ecrypto.ProbabilisticCipher(master_key)
    orig_schema = zlib.decompress(
        cipher.Decrypt(base64.b64decode(table_schema), raw=True))
    orig_schema = json.loads(orig_schema.decode('utf-8'))
  else:
    table_id = None
    orig_schema = []
  manifest = query_lib.QueryManifest.Generate()
  rewritten_query, print_args = query_lib.RewriteQuery(
      clauses, orig_schema, master_key, table_id, manifest)
  job = super(EncryptedBigqueryClient, self).Query(rewritten_query, **kwds)
  self._LoadJobStatistics(manifest, job)
  printer = EncryptedTablePrinter(**print_args)
  bq.Factory.ClientTablePrinter.SetTablePrinter(printer)
  return job
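
# The key check above, in isolation (a sketch with a hypothetical key): the
# table stores only base64(sha1(master_key)), so possession of the correct
# key is verified without the key itself ever leaving the client.
#
#   import base64, hashlib
#   master_key = b'0123456789abcdef'  # hypothetical 16-byte key
#   hashed = base64.b64encode(hashlib.sha1(master_key).digest())
#   # 'hashed' must equal the 'Hash of master key: ' value stored in the
#   # table description for the query to proceed.
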
def _GetEBQTableInfo(self, identifier):
  reference = super(EncryptedBigqueryClient, self).GetReference(identifier)
  object_info = super(EncryptedBigqueryClient, self).GetObjectInfo(reference)
  if object_info is None:
    raise bigquery_client.BigqueryNotFoundError(
        'Table %s not found.' % identifier, None, None, None)
  if 'description' not in object_info:
    raise bigquery_client.BigqueryNotFoundError(
        'Could not get essential EBQ info from description. Only use ebq '
        'update to edit table descriptions. Using bq will cause the table '
        'to be unusable.', None, None, None)
  description = object_info['description'].split('||')
  try:
    hashed_key = description[-3].split('Hash of master key: ')[1]
    version_number = description[-2].split('Version: ')[1]
    schema = description[-1].split('Schema: ')[1]
  except IndexError:
    raise bigquery_client.BigqueryNotFoundError(
        'Table description containing essential EBQ info is corrupt.',
        None, None, None)
  return hashed_key, version_number, schema
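
# The description layout _GetEBQTableInfo expects (the field values here are
# hypothetical): free-form user text, then three '||'-separated EBQ fields.
# Fields are parsed from the end, so user edits to the leading text survive.
#
#   description = ('my own notes'
#                  '||Hash of master key: 6LepYg5hQ='
#                  '||Version: 1.0'
#                  '||Schema: eJzTyk0s')
#   parts = description.split('||')
#   hashed_key = parts[-3].split('Hash of master key: ')[1]   # '6LepYg5hQ='
#   version_number = parts[-2].split('Version: ')[1]          # '1.0'
#   schema = parts[-1].split('Schema: ')[1]                   # 'eJzTyk0s'
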
def CreateTable(self, reference, ignore_existing=False, schema=None,
                description=None, friendly_name=None, expiration=None):
  """Create a table corresponding to TableReference.

  Arguments:
    reference: the TableReference to create.
    ignore_existing: (boolean, default False) If False, raise an exception
      if the table already exists.
    schema: a required schema (also requires a master key).
    description: an optional table description.
    friendly_name: an optional friendly name for the table.
    expiration: optional expiration time in milliseconds since the epoch.

  Raises:
    TypeError: if reference is not a TableReference.
    BigqueryDuplicateError: if reference exists and ignore_existing
      is False.
  """
  if schema is None:
    raise bigquery_client.BigqueryNotFoundError(
        'A schema must be specified when making a table.', None, None, None)
  self._CheckKeyfileFlag()
  schema = load_lib.ReadSchemaFile(schema)
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
  # pylint: disable=too-many-function-args
  hashed_key = base64.b64encode(hashlib.sha1(master_key).digest())
  cipher = ecrypto.ProbabilisticCipher(master_key)
  pretty_schema = json.dumps(schema)
  pretty_schema = pretty_schema.encode('utf-8')
  pretty_schema = zlib.compress(pretty_schema)
  encrypted_schema = base64.b64encode(cipher.Encrypt(pretty_schema))
  if description is None:
    description = ''
  new_description = util.ConstructTableDescription(
      description, hashed_key, util.EBQ_TABLE_VERSION, encrypted_schema)
  new_schema = load_lib.RewriteSchema(schema)
  super(EncryptedBigqueryClient, self).CreateTable(
      reference, ignore_existing, new_schema, new_description,
      friendly_name, expiration)
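
# How the original schema survives the round trip (a sketch; 'cipher' stands
# in for an ecrypto.ProbabilisticCipher built from a hypothetical key). The
# forward direction runs at CreateTable time, the reverse at Query/Load time:
#
#   blob = base64.b64encode(cipher.Encrypt(zlib.compress(
#       json.dumps(schema).encode('utf-8'))))
#   restored = json.loads(zlib.decompress(
#       cipher.Decrypt(base64.b64decode(blob), raw=True)).decode('utf-8'))
#   assert restored == schema
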
def Load(self, destination_table, source, schema=None, **kwds):
  """Encrypt the given data and then load it into BigQuery.

  The job will execute synchronously if sync=True is provided as an argument.

  Args:
    destination_table: TableReference to load data into.
    source: String specifying source data to load.
    schema: The schema that defines fields to be loaded.
    **kwds: Passed on to self.ExecuteJob.

  Returns:
    The resulting job info.
  """
  self._CheckKeyfileFlag()
  self._CheckSchemaFile(schema)
  # To make encryption more secure, we use a different key for each table
  # and cipher. To generate a different key for each table, we need a
  # distinct identifier for each table. A table name is not secure since a
  # table can be deleted and recreated with the same name, and would thus
  # reuse the same key. The only distinct identifier available is the
  # creation time. Therefore, we must create the table if it does not exist
  # so we can use its creation time to encrypt values.
  try:
    self.CreateTable(destination_table, schema=schema)
  except bigquery_client.BigqueryDuplicateError:
    pass  # Table already exists.
  temp_dir = tempfile.mkdtemp()
  orig_schema = load_lib.ReadSchemaFile(schema)
  new_schema = load_lib.RewriteSchema(orig_schema)
  new_schema_file = '%s/schema.enc_schema' % temp_dir
  # Write the rewritten schema as a JSON file.
  with open(new_schema_file, 'wt') as f:
    json.dump(new_schema, f, indent=2)
  new_source_file = '%s/data.enc_data' % temp_dir
  # TODO(user): Put the filepath to the master key in .bigqueryrc file.
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
  table_name = str(destination_table).split(':')[-1]
  table_id = '%s_%s' % (
      table_name, self._GetTableCreationTime(str(destination_table)))
  hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
      str(destination_table))
  # pylint: disable=too-many-function-args
  hashed_master_key = hashlib.sha1(master_key)
  hashed_master_key = base64.b64encode(hashed_master_key.digest())
  if hashed_master_key != hashed_table_key:
    raise bigquery_client.BigqueryAccessDeniedError(
        'Invalid master key for this table.', None, None, None)
  if table_version != util.EBQ_TABLE_VERSION:
    raise bigquery_client.BigqueryNotFoundError(
        'Invalid table version.', None, None, None)
  # TODO(user): Generate a different key.
  cipher = ecrypto.ProbabilisticCipher(master_key)
  table_schema = cipher.Decrypt(base64.b64decode(table_schema), raw=True)
  table_schema = zlib.decompress(table_schema)
  table_schema = table_schema.decode('utf-8')
  table_schema = json.loads(table_schema)
  if table_schema != orig_schema:
    raise bigquery_client.BigqueryAccessDeniedError(
        'Invalid schema for this table.', None, None, None)
  # Use .get so a missing source_format falls through to the CSV branch
  # instead of raising KeyError.
  source_format = kwds.get('source_format')
  if source_format == 'NEWLINE_DELIMITED_JSON':
    load_lib.ConvertJsonDataFile(
        orig_schema, master_key, table_id, source, new_source_file)
  elif source_format == 'CSV' or not source_format:
    load_lib.ConvertCsvDataFile(
        orig_schema, master_key, table_id, source, new_source_file)
  else:
    raise app.UsageError(
        'Currently, we do not allow loading from file types other than\n'
        'NEWLINE_DELIMITED_JSON and CSV.')
  job = super(EncryptedBigqueryClient, self).Load(
      destination_table, new_source_file, schema=new_schema_file, **kwds)
  try:
    shutil.rmtree(temp_dir)
  except OSError:
    raise OSError('Temp directory was deleted before cleanup completed.')
  return job
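
# An end-to-end usage sketch (the dataset, table, file names, and the
# GetTableReference helper are assumptions for illustration, not part of
# this module): the client encrypts data.csv into a temp file and loads the
# rewritten schema and data into BigQuery.
#
#   client.Load(client.GetTableReference('mydataset.mytable'),
#               'data.csv',
#               schema='schema.json',
#               source_format='CSV',
#               sync=True)
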