Example no. 1
0
 def testConvertCsvDataFile(self):
     """Round-trips the cars CSV fixture through ConvertCsvDataFile."""
     self._SetupTestFlags()
     schema = json.loads(test_util.GetCarsSchemaString())
     infile = self._WriteTempCarsCsvFile()
     outfile = os.path.join(self.dirname, 'cars.enc_data')
     master_key = base64.b64decode(_MASTER_KEY)
     string_hasher = ecrypto.StringHash(
         ecrypto.GenerateStringHashKey(master_key, _TABLE_ID))
     pseudonym_cipher = ecrypto.PseudonymCipher(
         ecrypto.GeneratePseudonymCipherKey(master_key, _TABLE_ID))
     load_lib.ConvertCsvDataFile(schema, master_key, _TABLE_ID, infile,
                                 outfile)
     # The encrypted output must validate against the rewritten schema.
     rewritten_schema = json.loads(_CARS_REWRITTEN_SCHEMA)
     load_lib._ValidateCsvDataFile(rewritten_schema, outfile)
     # Spot-check the first row only; semantically encrypted fields are
     # randomized and cannot be compared directly.
     with open(outfile, 'rt') as fout:
         first_row = fout.readline()
     self.assertTrue('1997' in first_row)
     self.assertTrue(pseudonym_cipher.Encrypt(unicode('Ford')) in first_row)
     # The searchwords 'Model' column stores "<iv> <hash>" for 'E350'.
     model_iv, model_hash = first_row.split(',')[2].split(' ')
     expected_key_hash = string_hasher.GetStringKeyHash(
         util.SEARCHWORDS_PREFIX + u'Model', u'E350'.lower())
     # Outer hash: first 8 bytes of sha1(iv + keyed hash), base64-encoded.
     expected_hash = base64.b64encode(
         hashlib.sha1(model_iv + expected_key_hash).digest()[:8])
     self.assertEqual(expected_hash, model_hash)
Example no. 2
0
def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
  """Encrypts newline-delimited json records according to the schema.

  Arguments:
    schema: User defined values and types.
    master_key: Master key from which per-table cipher keys are derived.
    table_id: Identifier used to derive a unique key for this table.
    infile: Path of the plaintext json file to encrypt.
    outfile: Path where the encrypted json file is written.
  """
  # Derive one cipher/hasher of each kind for this table.
  # TODO(user): ciphers and hash should not use the same key.
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

  _ValidateJsonDataFile(schema, infile)
  with open(infile, 'rb') as in_file, open(outfile, 'wb') as out_file:
    # One json record per line in, one encrypted json record per line out.
    for line in in_file:
      record = _StrToUnicode(json.loads(line))
      encrypted_record = _ConvertJsonField(
          record, schema, prob_cipher, pseudonym_cipher, string_hasher,
          homomorphic_int_cipher, homomorphic_float_cipher)
      out_file.write(json.dumps(encrypted_record) + '\n')
Example no. 3
0
    def testConvertJsonDataFile(self):
        """Round-trips the places json fixture through ConvertJsonDataFile."""
        schema = json.loads(test_util.GetPlacesSchemaString())
        infile = self._WriteTempPlacesJsonFile()
        outfile = os.path.join(self.dirname, 'places.enc_data')
        master_key = base64.b64decode(_MASTER_KEY)
        string_hasher = ecrypto.StringHash(
            ecrypto.GenerateStringHashKey(master_key, _TABLE_ID))
        load_lib.ConvertJsonDataFile(schema, master_key, _TABLE_ID, infile,
                                     outfile)
        # validate new data file against new rewritten schema.
        new_schema = json.loads(_PLACES_REWRITTEN_SCHEMA)
        load_lib._ValidateJsonDataFile(new_schema, outfile)
        fout = open(outfile, 'rt')
        # Only the first record is inspected below.
        for line in fout:
            data = json.loads(line)
            break
        self.assertEqual(data['kind'], 'person')
        self.assertTrue(util.SEARCHWORDS_PREFIX + u'gender' in data)
        # Searchwords fields are stored as '<iv> <hash>'.
        (model_iv,
         model_hash) = data[util.SEARCHWORDS_PREFIX + u'gender'].split(' ')
        expected_model_key_hash = string_hasher.GetStringKeyHash(
            util.SEARCHWORDS_PREFIX + u'gender', u'Male'.lower())
        # Outer hash: first 8 bytes of sha1(iv + keyed hash), base64-encoded.
        expected_model_hash = base64.b64encode(
            hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
        self.assertEquals(expected_model_hash, model_hash)
        # Repeat the searchwords check for a field nested in a repeated
        # record.
        self.assertTrue(util.SEARCHWORDS_PREFIX +
                        u'place' in data['citiesLived'][0])
        (model_iv,
         model_hash) = data['citiesLived'][0][util.SEARCHWORDS_PREFIX +
                                              u'place'].split(' ')
        expected_model_key_hash = string_hasher.GetStringKeyHash(
            util.SEARCHWORDS_PREFIX + u'place', u'Seattle'.lower())
        expected_model_hash = base64.b64encode(
            hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
        self.assertEquals(expected_model_hash, model_hash)
        self.assertEquals(data['spouse']['spouseAge'], 23)
        checked = []

        # look for lat,long in citiesLived
        found_any = False
        for city in data['citiesLived']:
            checked.append(city)
            if city.get('lat', None) is None:
                continue
            found_any = True
            self.assertTrue(isinstance(city['lat'], float))
            self.assertTrue(isinstance(city['long'], float))
            self.assertTrue(city['lat'] >= 0.0)
            self.assertTrue(city['long'] >= 0.0)
        # At least one city must carry lat/long; dump what was checked on
        # failure.
        self.assertTrue(
            found_any, 'found_any %s checked ( %s )' %
            (found_any, ' , '.join(map(str, checked))))
        fout.close()
Example no. 4
0
 def testConvertComplexJsonDataFile(self):
     """Round-trips the jobs json fixture (doubly-nested repeated records)."""
     schema = json.loads(test_util.GetJobsSchemaString())
     infile = self._WriteTempJobsJsonFile()
     outfile = os.path.join(self.dirname, 'jobs.enc_data')
     master_key = base64.b64decode(_MASTER_KEY)
     string_hasher = ecrypto.StringHash(
         ecrypto.GenerateStringHashKey(master_key, _TABLE_ID))
     load_lib.ConvertJsonDataFile(schema, master_key, _TABLE_ID, infile,
                                  outfile)
     # validate new data file against new rewritten schema.
     new_schema = json.loads(_JOBS_REWRITTEN_SCHEMA)
     load_lib._ValidateJsonDataFile(new_schema, outfile)
     fout = open(outfile, 'rt')
     # Only the first record is inspected below.
     for line in fout:
         data = json.loads(line)
         break
     self.assertEqual(data['kind'], 'person')
     self.assertTrue(util.SEARCHWORDS_PREFIX + u'gender' in data)
     # Searchwords fields are stored as '<iv> <hash>'.
     (model_iv,
      model_hash) = data[util.SEARCHWORDS_PREFIX + u'gender'].split(' ')
     expected_model_key_hash = string_hasher.GetStringKeyHash(
         util.SEARCHWORDS_PREFIX + u'gender', u'Male'.lower())
     # Outer hash: first 8 bytes of sha1(iv + keyed hash), base64-encoded.
     expected_model_hash = base64.b64encode(
         hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
     self.assertEquals(expected_model_hash, model_hash)
     # Repeat the searchwords check for a field nested in a repeated record.
     self.assertTrue(util.SEARCHWORDS_PREFIX +
                     u'place' in data['citiesLived'][0])
     (model_iv,
      model_hash) = data['citiesLived'][0][util.SEARCHWORDS_PREFIX +
                                           u'place'].split(' ')
     expected_model_key_hash = string_hasher.GetStringKeyHash(
         util.SEARCHWORDS_PREFIX + u'place', u'Seattle'.lower())
     expected_model_hash = base64.b64encode(
         hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
     self.assertEquals(expected_model_hash, model_hash)
     # Check values inside doubly-nested repeated records, including an
     # empty repeated list.
     self.assertEquals(data['citiesLived'][0]['job'][0]['jobRank'], 1)
     self.assertEquals(data['citiesLived'][1]['job'], [])
     self.assertEquals(
         len(data['citiesLived'][0]['job'][0][util.SEARCHWORDS_PREFIX +
                                              u'manager']), 3)
     self.assertEquals(
         len(data['citiesLived'][0]['job'][0][util.SEARCHWORDS_PREFIX +
                                              u'manager'][0].split(' ')), 4)
     fout.close()
def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
    """Encrypts data in a json file based on schema provided.

    Arguments:
      schema: User defined values and types.
      master_key: Key to provide ciphers.
      table_id: Used to derive a unique key for each table.
      infile: File to be encrypted.
      outfile: Location where the encrypted file is written.
    """
    prob_cipher = ecrypto.ProbabilisticCipher(
        ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
    pseudonym_cipher = ecrypto.PseudonymCipher(
        ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
    # TODO(user): ciphers and hash should not use the same key.
    string_hasher = ecrypto.StringHash(
        ecrypto.GenerateStringHashKey(master_key, table_id))
    homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
    homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

    _ValidateJsonDataFile(schema, infile)
    with open(infile, 'rb') as in_file:
        with open(outfile, 'wb') as out_file:
            for line in in_file:
                data = json.loads(line)
                data = _StrToUnicode(data)
                rewritten_data = _ConvertJsonField(data, schema, prob_cipher,
                                                   pseudonym_cipher,
                                                   string_hasher,
                                                   homomorphic_int_cipher,
                                                   homomorphic_float_cipher)
                # Serialize with json.dumps. The previous str()-and-replace
                # approach (replacing u' and ' with ") produced corrupt json
                # whenever a value contained a quote/apostrophe and raised
                # UnicodeEncodeError on non-ASCII data; json.dumps handles
                # quoting and unicode escaping correctly.
                out_file.write(json.dumps(rewritten_data) + '\n')
Example no. 6
0
def RewriteSelectionCriteria(stack, schema, master_key, table_id):
    """Rewrites selection criteria (arguments of WHERE and HAVING clause).

  Arguments:
    stack: The postfix expression that is the where/having expression.
    schema: The user defined values and encryption.
    master_key: Used to get ciphers for encryption.
    table_id: Used to generate a proper key.

  Returns:
    An infix version of the <stack>. The expression is rewritten so that it
    can be sent to the BigQuery server.

  Raises:
    bigquery_client.BigqueryInvalidQueryError: If the expression is invalid
    (such as searching non-searchable encrypted fields, etc).
  """

    # Ciphers/hashers used to rewrite literals that are compared against
    # encrypted fields.
    pseudonym_cipher = ecrypto.PseudonymCipher(
        ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
    string_hasher = ecrypto.StringHash(
        ecrypto.GenerateStringHashKey(master_key, table_id))

    def FailIfEncrypted(tokens):
        # Rejects any expression that touches encrypted fields where no
        # rewrite is possible.
        if util.IsEncryptedExpression(tokens):
            raise bigquery_client.BigqueryInvalidQueryError(
                'Invalid where/having expression.', None, None, None)

    def FailIfDeterministic(tokens):
        # Equality comparisons only work for deterministic (pseudonym)
        # encryption.
        if util.IsDeterministicExpression(tokens):
            raise bigquery_client.BigqueryInvalidQueryError(
                'Cannot do equality on probabilistic encryption, '
                'only pseudonym encryption.', None, None, None)

    def RewritePseudonymEncryption(token):
        # String literals are pseudonym-encrypted so the server can compare
        # ciphertexts; any other token passes through unchanged.
        if isinstance(token, util.StringLiteralToken):
            return '"%s"' % pseudonym_cipher.Encrypt(unicode(token[1:-1]))
        else:
            return token

    def RewriteSearchwordsEncryption(field, literal):
        """Rewrites the literal such that it can be checked for containment.

    Arguments:
      field: The field which is being checked if literal is contained within.
      literal: Substring being searched for.

    Returns:
      A tuple containing both field and literal rewritten.

    Raises:
      ValueError: Try to rewrite non-searchwords encryption.
    """
        if (not isinstance(field, util.SearchwordsToken)
                and not isinstance(field, util.ProbabilisticToken)):
            raise ValueError('Invalid encryption to check containment.')
        # Rename the last path component to its searchwords column name.
        field = field.original_name
        row = util.GetEntryFromSchema(field, schema)
        modified_field = util.SEARCHWORDS_PREFIX + row['name']
        field = field.split('.')
        field[-1] = modified_field
        modified_field = '.'.join(field)
        # Split the literal into words the same way the data was hashed at
        # load time (schema may define a custom separator).
        if 'searchwords_separator' in row:
            searchwords_separator = row['searchwords_separator']
        else:
            searchwords_separator = None
        word_list = ecrypto.CleanUnicodeString(unicode(literal.value),
                                               separator=searchwords_separator)
        if searchwords_separator is None:
            word_seq = ' '.join(word_list)
        else:
            word_seq = searchwords_separator.join(word_list)
        # Emit server-side SQL that recomputes the stored outer hash:
        # base64(sha1(iv + keyed_hash)[:8]), with the iv taken from the
        # first 24 characters of the stored field value.
        keyed_hash = (u'\'%s\'' % string_hasher.GetStringKeyHash(
            modified_field.split('.')[-1], word_seq))
        modified_string = (
            u'to_base64(left(bytes(sha1(concat(left(%s, 24), %s))), 8))' %
            (modified_field, keyed_hash))
        return (modified_field, modified_string)

    def CheckSearchableField(op1):
        """Checks if the operand is a searchable encrypted field.

    Arguments:
      op1: The operand that is being checked if it is searchable.

    Returns:
      True iff op1 is searchable.
    """
        if isinstance(op1, util.SearchwordsToken):
            return True
        elif not isinstance(op1, util.ProbabilisticToken):
            return False
        # Probabilistic fields are searchable only when the schema marks
        # them as (probabilistic_)searchwords.
        op1 = op1.original_name
        row = util.GetEntryFromSchema(op1, schema)
        if row['encrypt'] in ['probabilistic_searchwords', 'searchwords']:
            return True
        else:
            return False
        # NOTE(review): unreachable — both branches above return.
        return False

    def RewriteContainsOrFail(op1, op2):
        """Tries to rewrite a contains expression.

    Arguments:
      op1: The first operand of the contains binary operator.
      op2: The second operand of the contians binary operator.

    Returns:
      The rewritten versions of both operands.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If the contains expressions
      is invalid.
    """
        # Unencrypted operands need no rewriting.
        if not isinstance(op1, util.EncryptedToken):
            return (op1, op2)
        if not CheckSearchableField(op1):
            raise bigquery_client.BigqueryInvalidQueryError(
                'Cannot do contains on an encrypted field that is not searchable.',
                None, None, None)
        elif not isinstance(op2, util.StringLiteralToken):
            raise bigquery_client.BigqueryInvalidQueryError(
                'The substring to be checked must be a literal.', None, None,
                None)
        return RewriteSearchwordsEncryption(op1, op2)

    def CheckAndRewriteStack(postfix):
        # Recursively pops operators/operands off the postfix stack,
        # validates encrypted-field usage, and returns an infix string.
        if not postfix:
            raise bigquery_client.BigqueryInvalidQueryError(
                'Not enough arguments.', None, None, None)
        top = postfix.pop()
        if isinstance(top, util.OperatorToken):
            args = []
            for unused_i in range(top.num_args):
                args.append(CheckAndRewriteStack(postfix))
            # Arguments come off the stack in reverse order.
            args.reverse()
            if top.num_args == 1:
                return '%s %s' % (str(top), args[0])
            elif str(top) in ['=', '==', '!=']:
                # Equality is allowed only on pseudonym-encrypted fields;
                # rewrite both sides so ciphertexts compare equal.
                FailIfDeterministic(args)
                if (isinstance(args[0], util.PseudonymToken)
                        or isinstance(args[1], util.PseudonymToken)):
                    args[0] = RewritePseudonymEncryption(args[0])
                    args[1] = RewritePseudonymEncryption(args[1])
            elif str(top) == 'contains':
                FailIfEncrypted([args[1]])
                args[0], args[1] = RewriteContainsOrFail(args[0], args[1])
            else:
                # All other binary operators forbid encrypted operands.
                FailIfEncrypted(args)
            return '(%s %s %s)' % (args[0], str(top), args[1])
        elif isinstance(top, util.BuiltInFunctionToken):
            # Built-in functions take 0-3 unencrypted arguments.
            func_name = str(top)
            if func_name in _ZERO_ARGUMENT_FUNCTIONS:
                return '%s()' % func_name
            elif func_name in _ONE_ARGUMENT_FUNCTIONS:
                op = CheckAndRewriteStack(postfix)
                FailIfEncrypted([op])
                return '%s(%s)' % (func_name, op)
            elif func_name in _TWO_ARGUMENT_FUNCTIONS:
                op2 = CheckAndRewriteStack(postfix)
                op1 = CheckAndRewriteStack(postfix)
                FailIfEncrypted([op1, op2])
                return '%s(%s, %s)' % (func_name, op1, op2)
            elif func_name in _THREE_ARGUMENT_FUNCTIONS:
                op3 = CheckAndRewriteStack(postfix)
                op2 = CheckAndRewriteStack(postfix)
                op1 = CheckAndRewriteStack(postfix)
                FailIfEncrypted([op1, op2, op3])
                return '%s(%s, %s, %s)' % (func_name, op1, op2, op3)
            else:
                raise bigquery_client.BigqueryInvalidQueryError(
                    '%s function does not exist.' % func_name, None, None,
                    None)
        elif not isinstance(top, basestring):
            # Non-string operands (numbers, field tokens) are stringified.
            return str(top)
        else:
            return top

    # Work on a copy so the caller's stack is left untouched.
    temp_stack = list(stack)
    new_expression = CheckAndRewriteStack(temp_stack)
    if temp_stack:
        raise bigquery_client.BigqueryInvalidQueryError(
            'Too many arguments.', None, None, None)
    return new_expression
def ConvertCsvDataFile(schema, master_key, table_id, infile, outfile):
    """Reads utf8 csv data, encrypts and stores into a new csv utf8 data file.

    Arguments:
      schema: Per-column definitions, including the 'encrypt' mode and name.
      master_key: Key from which per-table cipher keys are derived.
      table_id: Identifier used to derive a unique key for this table.
      infile: Path of the plaintext utf8 csv file to encrypt.
      outfile: Path where the encrypted utf8 csv file is written.

    Raises:
      EncryptConvertError: If a row's field count does not match the schema.
    """
    prob_cipher = ecrypto.ProbabilisticCipher(
        ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
    pseudonym_cipher = ecrypto.PseudonymCipher(
        ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
    # TODO(user): ciphers and hash should not use the same key.
    string_hasher = ecrypto.StringHash(
        ecrypto.GenerateStringHashKey(master_key, table_id))
    homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
    homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
        ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

    def _SearchwordsHashes(column, value):
        """Returns utf8 searchwords subsequence hashes for one field value.

        Shared by the 'searchwords' and 'probabilistic_searchwords' modes,
        which previously duplicated this logic inline.
        """
        searchwords_separator = column.get('searchwords_separator')
        max_word_sequence = column.get('max_word_sequence', 5)
        return string_hasher.GetHashesForWordSubsequencesWithIv(
            util.SEARCHWORDS_PREFIX + column['name'], value,
            separator=searchwords_separator,
            max_sequence_len=max_word_sequence).encode('utf-8')

    with open(infile, 'rb') as in_file:
        with open(outfile, 'wb') as out_file:
            num_columns = len(schema)
            csv_writer = csv.writer(out_file)
            _ValidateCsvDataFile(schema, infile)
            csv_reader = _Utf8CsvReader(in_file, csv_writer)
            for row in csv_reader:
                new_row = []
                if len(row) != num_columns:
                    raise EncryptConvertError(
                        'Number of fields in schema do not match '
                        'in row: %s' % row)
                for i in xrange(num_columns):
                    encrypt_mode = schema[i]['encrypt']
                    if encrypt_mode == 'none':
                        new_row.append(row[i].encode('utf-8'))
                    elif encrypt_mode == 'probabilistic':
                        new_row.append(
                            prob_cipher.Encrypt(row[i]).encode('utf-8'))
                    elif encrypt_mode == 'pseudonym':
                        new_row.append(
                            pseudonym_cipher.Encrypt(row[i]).encode('utf-8'))
                    elif (encrypt_mode == 'homomorphic' and
                          schema[i]['type'] == 'integer'):
                        new_row.append(
                            homomorphic_int_cipher.Encrypt(long(
                                row[i])).encode('utf-8'))
                    elif (encrypt_mode == 'homomorphic' and
                          schema[i]['type'] == 'float'):
                        new_row.append(
                            homomorphic_float_cipher.Encrypt(float(
                                row[i])).encode('utf-8'))
                    elif encrypt_mode == 'searchwords':
                        new_row.append(_SearchwordsHashes(schema[i], row[i]))
                    elif encrypt_mode == 'probabilistic_searchwords':
                        # This mode writes two output columns: the
                        # searchwords hashes plus a probabilistically
                        # encrypted copy of the original value.
                        new_row.append(_SearchwordsHashes(schema[i], row[i]))
                        new_row.append(
                            prob_cipher.Encrypt(row[i]).encode('utf-8'))
                csv_writer.writerow(new_row)