def testConvertJsonDataFileWhenTypeChanges(self):
  """Test ConvertJsonDataFile() when a field's type changes."""
  infile = tempfile.NamedTemporaryFile(mode='w+')
  outfile = tempfile.NamedTemporaryFile(mode='w+')
  json_before = '{"age": "22", "fullname": "John Doe" }\n'
  # change: 22 is now an int.
  json_after = {'age': 22, 'fullname': 'John Doe'}
  infile.seek(0)
  infile.write(json_before)
  infile.seek(0)
  master_key = '%s' % _MASTER_KEY
  schema = [
      {
          'mode': 'nullable',
          'name': 'age',
          'type': 'integer',
          'encrypt': 'none'
      },
      {
          'mode': 'nullable',
          'name': 'fullname',
          'type': 'string',
          'encrypt': 'none'
      },
  ]
  table_id = '%s' % _TABLE_ID
  load_lib.ConvertJsonDataFile(schema, master_key, table_id,
                               infile.name, outfile.name)
  # compare as dict because key order is flaky in str(dict)
  json_output = json.loads(outfile.read())
  self.assertEqual(json_output, json_after)
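# Note: the assertion above relies on ConvertJsonDataFile() coercing each value
# to the type declared in the schema (the input has "age" as the JSON string
# "22"; the expected output has the integer 22). A minimal, hypothetical sketch
# of that kind of schema-driven coercion, for illustration only (not the actual
# load_lib implementation):
#
#   def _CoerceRow(schema, row):
#     for field in schema:
#       name, ftype = field['name'], field['type']
#       if name in row and ftype == 'integer':
#         row[name] = int(row[name])
#       elif name in row and ftype == 'float':
#         row[name] = float(row[name])
#     return row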
def testConvertJsonDataFileUSuffixRegression(self):
  """Test ConvertJsonDataFile() for regression of the trailing-u str fix."""
  infile = tempfile.NamedTemporaryFile(mode='w+')
  outfile = tempfile.NamedTemporaryFile(mode='w+')
  # test utf8 and unicode stability while here.
  csym = u'\u00a9'  # unicode: (C)
  csym_utf8 = csym.encode('utf-8')
  json_before = '{"ustr": "foo%s", "bstr": "foou" }\n' % csym_utf8
  json_after = {'ustr': u'foo%s' % csym, 'bstr': 'foou'}
  infile.seek(0)
  infile.write(json_before)
  infile.seek(0)
  master_key = '%s' % _MASTER_KEY
  schema = [
      {
          'mode': 'nullable',
          'name': 'ustr',
          'type': 'string',
          'encrypt': 'none'
      },
      {
          'mode': 'nullable',
          'name': 'bstr',
          'type': 'string',
          'encrypt': 'none'
      },
  ]
  table_id = '%s' % _TABLE_ID
  load_lib.ConvertJsonDataFile(schema, master_key, table_id,
                               infile.name, outfile.name)
  # compare as parsed JSON because the serialized key order is unstable
  json_output = json.loads(outfile.read())
  self.assertEqual(json_output, json_after)
def testConvertJsonDataFile(self):
  """Test ConvertJsonDataFile() end to end on the places sample data."""
  schema = json.loads(test_util.GetPlacesSchemaString())
  infile = self._WriteTempPlacesJsonFile()
  outfile = os.path.join(self.dirname, 'places.enc_data')
  master_key = base64.b64decode(_MASTER_KEY)
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, _TABLE_ID))
  load_lib.ConvertJsonDataFile(schema, master_key, _TABLE_ID, infile, outfile)
  # validate new data file against new rewritten schema.
  new_schema = json.loads(_PLACES_REWRITTEN_SCHEMA)
  load_lib._ValidateJsonDataFile(new_schema, outfile)
  fout = open(outfile, 'rt')
  for line in fout:
    data = json.loads(line)
    break
  self.assertEqual(data['kind'], 'person')
  self.assertTrue(util.SEARCHWORDS_PREFIX + u'gender' in data)
  (model_iv, model_hash) = data[util.SEARCHWORDS_PREFIX + u'gender'].split(' ')
  expected_model_key_hash = string_hasher.GetStringKeyHash(
      util.SEARCHWORDS_PREFIX + u'gender', u'Male'.lower())
  expected_model_hash = base64.b64encode(
      hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
  self.assertEquals(expected_model_hash, model_hash)
  self.assertTrue(
      util.SEARCHWORDS_PREFIX + u'place' in data['citiesLived'][0])
  (model_iv, model_hash) = data['citiesLived'][0][
      util.SEARCHWORDS_PREFIX + u'place'].split(' ')
  expected_model_key_hash = string_hasher.GetStringKeyHash(
      util.SEARCHWORDS_PREFIX + u'place', u'Seattle'.lower())
  expected_model_hash = base64.b64encode(
      hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
  self.assertEquals(expected_model_hash, model_hash)
  self.assertEquals(data['spouse']['spouseAge'], 23)
  # look for lat,long in citiesLived
  checked = []
  found_any = False
  for city in data['citiesLived']:
    checked.append(city)
    if city.get('lat', None) is None:
      continue
    found_any = True
    self.assertTrue(isinstance(city['lat'], float))
    self.assertTrue(isinstance(city['long'], float))
    self.assertTrue(city['lat'] >= 0.0)
    self.assertTrue(city['long'] >= 0.0)
  self.assertTrue(
      found_any,
      'found_any %s checked ( %s )' % (
          found_any, ' , '.join(map(str, checked))))
  fout.close()
def testConvertComplexJsonDataFile(self):
  """Test ConvertJsonDataFile() on the jobs sample data with nested records."""
  schema = json.loads(test_util.GetJobsSchemaString())
  infile = self._WriteTempJobsJsonFile()
  outfile = os.path.join(self.dirname, 'jobs.enc_data')
  master_key = base64.b64decode(_MASTER_KEY)
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, _TABLE_ID))
  load_lib.ConvertJsonDataFile(schema, master_key, _TABLE_ID, infile, outfile)
  # validate new data file against new rewritten schema.
  new_schema = json.loads(_JOBS_REWRITTEN_SCHEMA)
  load_lib._ValidateJsonDataFile(new_schema, outfile)
  fout = open(outfile, 'rt')
  for line in fout:
    data = json.loads(line)
    break
  self.assertEqual(data['kind'], 'person')
  self.assertTrue(util.SEARCHWORDS_PREFIX + u'gender' in data)
  (model_iv, model_hash) = data[util.SEARCHWORDS_PREFIX + u'gender'].split(' ')
  expected_model_key_hash = string_hasher.GetStringKeyHash(
      util.SEARCHWORDS_PREFIX + u'gender', u'Male'.lower())
  expected_model_hash = base64.b64encode(
      hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
  self.assertEquals(expected_model_hash, model_hash)
  self.assertTrue(
      util.SEARCHWORDS_PREFIX + u'place' in data['citiesLived'][0])
  (model_iv, model_hash) = data['citiesLived'][0][
      util.SEARCHWORDS_PREFIX + u'place'].split(' ')
  expected_model_key_hash = string_hasher.GetStringKeyHash(
      util.SEARCHWORDS_PREFIX + u'place', u'Seattle'.lower())
  expected_model_hash = base64.b64encode(
      hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
  self.assertEquals(expected_model_hash, model_hash)
  self.assertEquals(data['citiesLived'][0]['job'][0]['jobRank'], 1)
  self.assertEquals(data['citiesLived'][1]['job'], [])
  self.assertEquals(
      len(data['citiesLived'][0]['job'][0][
          util.SEARCHWORDS_PREFIX + u'manager']), 3)
  self.assertEquals(
      len(data['citiesLived'][0]['job'][0][
          util.SEARCHWORDS_PREFIX + u'manager'][0].split(' ')), 4)
  fout.close()
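# Note: the searchwords assertions in the two tests above reconstruct the
# pseudonym layout by hand: each searchwords entry is "<iv> <hash>" where
# hash = base64(sha1(iv + GetStringKeyHash(field_name, word.lower()))[:8]).
# An illustrative helper capturing that check (it mirrors the arithmetic in
# the tests; it is not part of load_lib or ecrypto):
#
#   def _ExpectedSearchwordHash(string_hasher, field_name, word, iv):
#     key_hash = string_hasher.GetStringKeyHash(field_name, word.lower())
#     return base64.b64encode(hashlib.sha1(iv + key_hash).digest()[:8])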
def Load(self, destination_table, source, schema=None, **kwds):
  """Encrypt the given data and then load it into BigQuery.

  The job will execute synchronously if sync=True is provided as an
  argument.

  Args:
    destination_table: TableReference to load data into.
    source: String specifying source data to load.
    schema: The schema that defines fields to be loaded.
    **kwds: Passed on to self.ExecuteJob.

  Returns:
    The resulting job info.
  """
  self._CheckKeyfileFlag()
  self._CheckSchemaFile(schema)

  # To make encryption more secure, we use a different key for each table
  # and cipher. To generate a different key per table, we need a distinct
  # identifier for each table. A table name is not sufficient, since a table
  # can be deleted and recreated with the same name and would then reuse the
  # same key. The only distinct identifier is the creation time, so we must
  # create the table if it does not yet exist in order to use its creation
  # time when encrypting values.
  try:
    self.CreateTable(destination_table, schema=schema)
  except bigquery_client.BigqueryDuplicateError:
    pass  # Table already exists.

  temp_dir = tempfile.mkdtemp()
  orig_schema = load_lib.ReadSchemaFile(schema)
  new_schema = load_lib.RewriteSchema(orig_schema)
  new_schema_file = '%s/schema.enc_schema' % temp_dir
  # write the new schema as a json file
  with open(new_schema_file, 'wt') as f:
    json.dump(new_schema, f, indent=2)
  new_source_file = '%s/data.enc_data' % temp_dir
  # TODO(user): Put the filepath to the master key in .bigqueryrc file.
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
  table_name = str(destination_table).split(':')[-1]
  table_id = '%s_%s' % (
      table_name, self._GetTableCreationTime(str(destination_table)))
  hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
      str(destination_table))
  hashed_master_key = hashlib.sha1(master_key)
  # pylint: disable=too-many-function-args
  hashed_master_key = base64.b64encode(hashed_master_key.digest())
  if hashed_master_key != hashed_table_key:
    raise bigquery_client.BigqueryAccessDeniedError(
        'Invalid master key for this table.', None, None, None)
  if table_version != util.EBQ_TABLE_VERSION:
    raise bigquery_client.BigqueryNotFoundError(
        'Invalid table version.', None, None, None)
  # TODO(user): Generate a different key.
  cipher = ecrypto.ProbabilisticCipher(master_key)
  table_schema = cipher.Decrypt(base64.b64decode(table_schema), raw=True)
  table_schema = zlib.decompress(table_schema)
  table_schema = table_schema.decode('utf-8')
  table_schema = json.loads(table_schema)
  if table_schema != orig_schema:
    raise bigquery_client.BigqueryAccessDeniedError(
        'Invalid schema for this table.', None, None, None)
  if kwds.get('source_format') == 'NEWLINE_DELIMITED_JSON':
    load_lib.ConvertJsonDataFile(orig_schema, master_key, table_id,
                                 source, new_source_file)
  elif kwds.get('source_format') == 'CSV' or not kwds.get('source_format'):
    load_lib.ConvertCsvDataFile(orig_schema, master_key, table_id,
                                source, new_source_file)
  else:
    raise app.UsageError(
        'Currently, we do not allow loading from file types other than\n'
        'NEWLINE_DELIMITED_JSON and CSV.')
  job = super(EncryptedBigqueryClient, self).Load(
      destination_table, new_source_file, schema=new_schema_file, **kwds)
  try:
    shutil.rmtree(temp_dir)
  except OSError:
    raise OSError('Temp file deleted by user before termination.')
  return job
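# Usage sketch for the flow above (hypothetical; how the client and the
# destination TableReference are built depends on flags and credentials
# handled elsewhere in ebq):
#
#   client = ...       # an EncryptedBigqueryClient with master_key_filename set
#   destination = ...  # a TableReference such as myproj:mydataset.people
#   client.Load(destination, 'people.json', schema='people.schema',
#               source_format='NEWLINE_DELIMITED_JSON')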