def testConvertCsvDataFile(self):
  self._SetupTestFlags()
  schema = json.loads(test_util.GetCarsSchemaString())
  infile = self._WriteTempCarsCsvFile()
  outfile = os.path.join(self.dirname, 'cars.enc_data')
  master_key = base64.b64decode(_MASTER_KEY)
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, _TABLE_ID))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, _TABLE_ID))
  load_lib.ConvertCsvDataFile(schema, master_key, _TABLE_ID, infile, outfile)
  # Validate the new data file against the rewritten schema.
  new_schema = json.loads(_CARS_REWRITTEN_SCHEMA)
  load_lib._ValidateCsvDataFile(new_schema, outfile)
  # Sanity check the entries of one row. Entries for semantically encrypted
  # fields cannot be checked because the values are randomized.
  fout = open(outfile, 'rt')
  row0 = fout.readline()
  self.assertTrue('1997' in row0)
  self.assertTrue(pseudonym_cipher.Encrypt(unicode('Ford')) in row0)
  # Get the iv and hash for the Model searchwords field whose value is 'E350'.
  (model_iv, model_hash) = row0.split(',')[2].split(' ')
  # Calculate the expected keyed hash value for 'E350'.
  expected_model_key_hash = string_hasher.GetStringKeyHash(
      util.SEARCHWORDS_PREFIX + u'Model', u'E350'.lower())
  # Calculate the outer sha1 using model_iv and the expected keyed hash.
  expected_model_hash = base64.b64encode(
      hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
  self.assertEqual(expected_model_hash, model_hash)
  fout.close()
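# The assertions above re-derive a searchwords entry by hand: the stored
# value is "<iv> <hash>" where hash = base64(sha1(iv + keyed_hash)[:8]).
# A minimal helper sketch of that verification pattern (the helper name is
# hypothetical; `string_hasher` is built exactly as in the test above):
def _ExpectedSearchwordsHash(string_hasher, field_name, value, iv):
  """Recomputes base64(sha1(iv + keyed_hash)[:8]) for one field value."""
  keyed_hash = string_hasher.GetStringKeyHash(
      util.SEARCHWORDS_PREFIX + field_name, value.lower())
  return base64.b64encode(hashlib.sha1(iv + keyed_hash).digest()[:8])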
def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
  """Encrypts data in a json file based on the schema provided.

  Arguments:
    schema: User-defined values and types.
    master_key: Key from which the ciphers' keys are derived.
    table_id: Used to derive a unique key for each table.
    infile: File to be encrypted.
    outfile: Location where the encrypted file is output.
  """
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  # TODO(user): ciphers and hash should not use the same key.
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  _ValidateJsonDataFile(schema, infile)
  with open(infile, 'rb') as in_file:
    with open(outfile, 'wb') as out_file:
      for line in in_file:
        data = json.loads(line)
        data = _StrToUnicode(data)
        rewritten_data = _ConvertJsonField(
            data, schema, prob_cipher, pseudonym_cipher, string_hasher,
            homomorphic_int_cipher, homomorphic_float_cipher)
        rewritten_data = json.dumps(rewritten_data)
        out_file.write(rewritten_data + '\n')
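# A minimal usage sketch for ConvertJsonDataFile, assuming a JSON schema file
# and a raw key file on disk; every path and the table id below are
# hypothetical placeholders:
def _ExampleJsonConversion():
  with open('places.schema') as f:
    schema = json.loads(f.read())
  with open('master.key', 'rb') as f:  # key bytes used to derive the ciphers
    master_key = f.read()
  ConvertJsonDataFile(schema, master_key, 'mydataset.places',
                      'places.json', 'places.enc_data')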
def testConvertJsonDataFile(self):
  schema = json.loads(test_util.GetPlacesSchemaString())
  infile = self._WriteTempPlacesJsonFile()
  outfile = os.path.join(self.dirname, 'places.enc_data')
  master_key = base64.b64decode(_MASTER_KEY)
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, _TABLE_ID))
  load_lib.ConvertJsonDataFile(
      schema, master_key, _TABLE_ID, infile, outfile)
  # Validate the new data file against the rewritten schema.
  new_schema = json.loads(_PLACES_REWRITTEN_SCHEMA)
  load_lib._ValidateJsonDataFile(new_schema, outfile)
  fout = open(outfile, 'rt')
  for line in fout:
    data = json.loads(line)
    break
  self.assertEqual(data['kind'], 'person')
  self.assertTrue(util.SEARCHWORDS_PREFIX + u'gender' in data)
  (model_iv, model_hash) = data[util.SEARCHWORDS_PREFIX + u'gender'].split(' ')
  expected_model_key_hash = string_hasher.GetStringKeyHash(
      util.SEARCHWORDS_PREFIX + u'gender', u'Male'.lower())
  expected_model_hash = base64.b64encode(
      hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
  self.assertEqual(expected_model_hash, model_hash)
  self.assertTrue(
      util.SEARCHWORDS_PREFIX + u'place' in data['citiesLived'][0])
  (model_iv, model_hash) = (
      data['citiesLived'][0][util.SEARCHWORDS_PREFIX + u'place'].split(' '))
  expected_model_key_hash = string_hasher.GetStringKeyHash(
      util.SEARCHWORDS_PREFIX + u'place', u'Seattle'.lower())
  expected_model_hash = base64.b64encode(
      hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
  self.assertEqual(expected_model_hash, model_hash)
  self.assertEqual(data['spouse']['spouseAge'], 23)
  checked = []
  # Look for lat/long in citiesLived.
  found_any = False
  for city in data['citiesLived']:
    checked.append(city)
    if city.get('lat', None) is None:
      continue
    found_any = True
    self.assertTrue(isinstance(city['lat'], float))
    self.assertTrue(isinstance(city['long'], float))
    self.assertTrue(city['lat'] >= 0.0)
    self.assertTrue(city['long'] >= 0.0)
  self.assertTrue(
      found_any,
      'found_any %s checked ( %s )' % (
          found_any, ' , '.join(map(str, checked))))
  fout.close()
def testConvertComplexJsonDataFile(self):
  schema = json.loads(test_util.GetJobsSchemaString())
  infile = self._WriteTempJobsJsonFile()
  outfile = os.path.join(self.dirname, 'jobs.enc_data')
  master_key = base64.b64decode(_MASTER_KEY)
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, _TABLE_ID))
  load_lib.ConvertJsonDataFile(
      schema, master_key, _TABLE_ID, infile, outfile)
  # Validate the new data file against the rewritten schema.
  new_schema = json.loads(_JOBS_REWRITTEN_SCHEMA)
  load_lib._ValidateJsonDataFile(new_schema, outfile)
  fout = open(outfile, 'rt')
  for line in fout:
    data = json.loads(line)
    break
  self.assertEqual(data['kind'], 'person')
  self.assertTrue(util.SEARCHWORDS_PREFIX + u'gender' in data)
  (model_iv, model_hash) = data[util.SEARCHWORDS_PREFIX + u'gender'].split(' ')
  expected_model_key_hash = string_hasher.GetStringKeyHash(
      util.SEARCHWORDS_PREFIX + u'gender', u'Male'.lower())
  expected_model_hash = base64.b64encode(
      hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
  self.assertEqual(expected_model_hash, model_hash)
  self.assertTrue(
      util.SEARCHWORDS_PREFIX + u'place' in data['citiesLived'][0])
  (model_iv, model_hash) = (
      data['citiesLived'][0][util.SEARCHWORDS_PREFIX + u'place'].split(' '))
  expected_model_key_hash = string_hasher.GetStringKeyHash(
      util.SEARCHWORDS_PREFIX + u'place', u'Seattle'.lower())
  expected_model_hash = base64.b64encode(
      hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
  self.assertEqual(expected_model_hash, model_hash)
  self.assertEqual(data['citiesLived'][0]['job'][0]['jobRank'], 1)
  self.assertEqual(data['citiesLived'][1]['job'], [])
  self.assertEqual(
      len(data['citiesLived'][0]['job'][0][
          util.SEARCHWORDS_PREFIX + u'manager']), 3)
  self.assertEqual(
      len(data['citiesLived'][0]['job'][0][
          util.SEARCHWORDS_PREFIX + u'manager'][0].split(' ')), 4)
  fout.close()
def RewriteSelectionCriteria(stack, schema, master_key, table_id):
  """Rewrites selection criteria (arguments of WHERE and HAVING clauses).

  Arguments:
    stack: The postfix expression that is the where/having expression.
    schema: The user-defined values and encryption types.
    master_key: Used to get ciphers for encryption.
    table_id: Used to generate a proper key.

  Returns:
    An infix version of <stack>. The expression is rewritten so that
    it can be sent to the BigQuery server.

  Raises:
    bigquery_client.BigqueryInvalidQueryError: If the expression is invalid
      (such as searching non-searchable encrypted fields, etc).
  """
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))

  def FailIfEncrypted(tokens):
    if util.IsEncryptedExpression(tokens):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Invalid where/having expression.', None, None, None)

  def FailIfDeterministic(tokens):
    if util.IsDeterministicExpression(tokens):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Cannot do equality on probabilistic encryption, '
          'only pseudonym encryption.', None, None, None)

  def RewritePseudonymEncryption(token):
    if isinstance(token, util.StringLiteralToken):
      return '"%s"' % pseudonym_cipher.Encrypt(unicode(token[1:-1]))
    else:
      return token

  def RewriteSearchwordsEncryption(field, literal):
    """Rewrites the literal so that it can be checked for containment.

    Arguments:
      field: The field which is being checked for containing the literal.
      literal: Substring being searched for.

    Returns:
      A tuple containing both field and literal rewritten.

    Raises:
      ValueError: If asked to rewrite non-searchwords encryption.
    """
    if (not isinstance(field, util.SearchwordsToken) and
        not isinstance(field, util.ProbabilisticToken)):
      raise ValueError('Invalid encryption to check containment.')
    field = field.original_name
    row = util.GetEntryFromSchema(field, schema)
    modified_field = util.SEARCHWORDS_PREFIX + row['name']
    field = field.split('.')
    field[-1] = modified_field
    modified_field = '.'.join(field)
    if 'searchwords_separator' in row:
      searchwords_separator = row['searchwords_separator']
    else:
      searchwords_separator = None
    word_list = ecrypto.CleanUnicodeString(
        unicode(literal.value), separator=searchwords_separator)
    if searchwords_separator is None:
      word_seq = ' '.join(word_list)
    else:
      word_seq = searchwords_separator.join(word_list)
    keyed_hash = (u'\'%s\'' % string_hasher.GetStringKeyHash(
        modified_field.split('.')[-1], word_seq))
    modified_string = (
        u'to_base64(left(bytes(sha1(concat(left(%s, 24), %s))), 8))'
        % (modified_field, keyed_hash))
    return (modified_field, modified_string)

  def CheckSearchableField(op1):
    """Checks if the operand is a searchable encrypted field.

    Arguments:
      op1: The operand that is being checked for searchability.

    Returns:
      True iff op1 is searchable.
    """
    if isinstance(op1, util.SearchwordsToken):
      return True
    elif not isinstance(op1, util.ProbabilisticToken):
      return False
    op1 = op1.original_name
    row = util.GetEntryFromSchema(op1, schema)
    if row['encrypt'] in ['probabilistic_searchwords', 'searchwords']:
      return True
    else:
      return False

  def RewriteContainsOrFail(op1, op2):
    """Tries to rewrite a contains expression.

    Arguments:
      op1: The first operand of the contains binary operator.
      op2: The second operand of the contains binary operator.

    Returns:
      The rewritten versions of both operands.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If the contains expression
        is invalid.
    """
    if not isinstance(op1, util.EncryptedToken):
      return (op1, op2)
    if not CheckSearchableField(op1):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Cannot do contains on an encrypted field that is not searchable.',
          None, None, None)
    elif not isinstance(op2, util.StringLiteralToken):
      raise bigquery_client.BigqueryInvalidQueryError(
          'The substring to be checked must be a literal.', None, None, None)
    return RewriteSearchwordsEncryption(op1, op2)

  def CheckAndRewriteStack(postfix):
    if not postfix:
      raise bigquery_client.BigqueryInvalidQueryError(
          'Not enough arguments.', None, None, None)
    top = postfix.pop()
    if isinstance(top, util.OperatorToken):
      args = []
      for unused_i in range(top.num_args):
        args.append(CheckAndRewriteStack(postfix))
      args.reverse()
      if top.num_args == 1:
        return '%s %s' % (str(top), args[0])
      elif str(top) in ['=', '==', '!=']:
        FailIfDeterministic(args)
        if (isinstance(args[0], util.PseudonymToken) or
            isinstance(args[1], util.PseudonymToken)):
          args[0] = RewritePseudonymEncryption(args[0])
          args[1] = RewritePseudonymEncryption(args[1])
      elif str(top) == 'contains':
        FailIfEncrypted([args[1]])
        args[0], args[1] = RewriteContainsOrFail(args[0], args[1])
      else:
        FailIfEncrypted(args)
      return '(%s %s %s)' % (args[0], str(top), args[1])
    elif isinstance(top, util.BuiltInFunctionToken):
      func_name = str(top)
      if func_name in _ZERO_ARGUMENT_FUNCTIONS:
        return '%s()' % func_name
      elif func_name in _ONE_ARGUMENT_FUNCTIONS:
        op = CheckAndRewriteStack(postfix)
        FailIfEncrypted([op])
        return '%s(%s)' % (func_name, op)
      elif func_name in _TWO_ARGUMENT_FUNCTIONS:
        op2 = CheckAndRewriteStack(postfix)
        op1 = CheckAndRewriteStack(postfix)
        FailIfEncrypted([op1, op2])
        return '%s(%s, %s)' % (func_name, op1, op2)
      elif func_name in _THREE_ARGUMENT_FUNCTIONS:
        op3 = CheckAndRewriteStack(postfix)
        op2 = CheckAndRewriteStack(postfix)
        op1 = CheckAndRewriteStack(postfix)
        FailIfEncrypted([op1, op2, op3])
        return '%s(%s, %s, %s)' % (func_name, op1, op2, op3)
      else:
        raise bigquery_client.BigqueryInvalidQueryError(
            '%s function does not exist.' % func_name, None, None, None)
    elif not isinstance(top, basestring):
      return str(top)
    else:
      return top

  temp_stack = list(stack)
  new_expression = CheckAndRewriteStack(temp_stack)
  if temp_stack:
    raise bigquery_client.BigqueryInvalidQueryError(
        'Too many arguments.', None, None, None)
  return new_expression
def ConvertCsvDataFile(schema, master_key, table_id, infile, outfile):
  """Reads utf8 csv data, encrypts it, and writes a new utf8 csv data file."""
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  # TODO(user): ciphers and hash should not use the same key.
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  with open(infile, 'rb') as in_file:
    with open(outfile, 'wb') as out_file:
      num_columns = len(schema)
      csv_writer = csv.writer(out_file)
      _ValidateCsvDataFile(schema, infile)
      csv_reader = _Utf8CsvReader(in_file, csv_writer)
      for row in csv_reader:
        new_row = []
        if len(row) != num_columns:
          raise EncryptConvertError(
              'Number of fields in schema do not match in row: %s' % row)
        for i in xrange(num_columns):
          encrypt_mode = schema[i]['encrypt']
          if encrypt_mode == 'none':
            new_row.append(row[i].encode('utf-8'))
          elif encrypt_mode == 'probabilistic':
            new_row.append(prob_cipher.Encrypt(row[i]).encode('utf-8'))
          elif encrypt_mode == 'pseudonym':
            new_row.append(pseudonym_cipher.Encrypt(row[i]).encode('utf-8'))
          elif encrypt_mode == 'homomorphic' and schema[i]['type'] == 'integer':
            new_row.append(
                homomorphic_int_cipher.Encrypt(long(row[i])).encode('utf-8'))
          elif encrypt_mode == 'homomorphic' and schema[i]['type'] == 'float':
            new_row.append(
                homomorphic_float_cipher.Encrypt(float(row[i])).encode('utf-8'))
          elif encrypt_mode == 'searchwords':
            if 'searchwords_separator' in schema[i]:
              searchwords_separator = schema[i]['searchwords_separator']
            else:
              searchwords_separator = None
            if 'max_word_sequence' in schema[i]:
              max_word_sequence = schema[i]['max_word_sequence']
            else:
              max_word_sequence = 5
            new_row.append(
                string_hasher.GetHashesForWordSubsequencesWithIv(
                    util.SEARCHWORDS_PREFIX + schema[i]['name'], row[i],
                    separator=searchwords_separator,
                    max_sequence_len=max_word_sequence).encode('utf-8'))
          elif encrypt_mode == 'probabilistic_searchwords':
            if 'searchwords_separator' in schema[i]:
              searchwords_separator = schema[i]['searchwords_separator']
            else:
              searchwords_separator = None
            if 'max_word_sequence' in schema[i]:
              max_word_sequence = schema[i]['max_word_sequence']
            else:
              max_word_sequence = 5
            new_row.append(
                string_hasher.GetHashesForWordSubsequencesWithIv(
                    util.SEARCHWORDS_PREFIX + schema[i]['name'], row[i],
                    separator=searchwords_separator,
                    max_sequence_len=max_word_sequence).encode('utf-8'))
            new_row.append(prob_cipher.Encrypt(row[i]).encode('utf-8'))
        csv_writer.writerow(new_row)
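# An illustrative schema for the branches above, modeled on the cars test
# data; only the keys this function reads ('name', 'type', 'encrypt',
# 'searchwords_separator', 'max_word_sequence') are shown, and the exact
# field list and encrypt modes chosen here are assumptions:
_EXAMPLE_CARS_SCHEMA = [
    {'name': 'Year', 'type': 'integer', 'encrypt': 'none'},
    {'name': 'Make', 'type': 'string', 'encrypt': 'pseudonym'},
    {'name': 'Model', 'type': 'string', 'encrypt': 'searchwords',
     'max_word_sequence': 5},
    {'name': 'Description', 'type': 'string',
     'encrypt': 'probabilistic_searchwords', 'searchwords_separator': ','},
    {'name': 'Price', 'type': 'float', 'encrypt': 'homomorphic'},
]
# ConvertCsvDataFile(_EXAMPLE_CARS_SCHEMA, master_key, 'mydataset.cars',
#                    'cars.csv', 'cars.enc_data')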