Example #1
0
    def RewriteContainsOrFail(op1, op2):
        """Validates a contains expression and rewrites it when it is encrypted.

        Arguments:
          op1: The first operand of the contains binary operator.
          op2: The second operand of the contains binary operator.

        Returns:
          Both operands untouched when op1 is not an encrypted token; otherwise
          whatever RewriteSearchwordsEncryption(op1, op2) returns.

        Raises:
          bigquery_client.BigqueryInvalidQueryError: If op1 is encrypted but
          fails CheckSearchableField, or if op2 is not a string literal.
        """
        if isinstance(op1, util.EncryptedToken):
            if not CheckSearchableField(op1):
                raise bigquery_client.BigqueryInvalidQueryError(
                    'Cannot do contains on an encrypted field that is not searchable.',
                    None, None, None)
            if not isinstance(op2, util.StringLiteralToken):
                raise bigquery_client.BigqueryInvalidQueryError(
                    'The substring to be checked must be a literal.', None, None,
                    None)
            return RewriteSearchwordsEncryption(op1, op2)
        # Nothing to rewrite when the left operand is not encrypted.
        return (op1, op2)
Example #2
0
 def CheckAndRewriteStack(postfix):
     """Pops one expression off a postfix stack and returns it as infix text.

     The stack is consumed from its end. Operators and built-in functions
     recursively pop their own operands; any other token is returned as a
     string.

     Arguments:
       postfix: Postfix token list; modified in place as tokens are popped.

     Returns:
       The rewritten infix string for the expression on top of the stack.

     Raises:
       bigquery_client.BigqueryInvalidQueryError: If the stack runs out of
       tokens or a built-in function name is not in any of the argument-count
       tables.
     """
     if not postfix:
         raise bigquery_client.BigqueryInvalidQueryError(
             'Not enough arguments.', None, None, None)

     def PopOperands(count):
         """Pops <count> sub-expressions and returns them in source order."""
         operands = [CheckAndRewriteStack(postfix) for unused_i in range(count)]
         # Operands come off the stack last-first.
         operands.reverse()
         return operands

     top = postfix.pop()

     if isinstance(top, util.OperatorToken):
         operands = PopOperands(top.num_args)
         if top.num_args == 1:
             return '%s %s' % (str(top), operands[0])
         operator = str(top)
         if operator in ['=', '==', '!=']:
             FailIfDeterministic(operands)
             if (isinstance(operands[0], util.PseudonymToken)
                     or isinstance(operands[1], util.PseudonymToken)):
                 operands[0] = RewritePseudonymEncryption(operands[0])
                 operands[1] = RewritePseudonymEncryption(operands[1])
         elif operator == 'contains':
             # Only the right-hand side is required to be unencrypted here.
             FailIfEncrypted([operands[1]])
             operands[0], operands[1] = RewriteContainsOrFail(
                 operands[0], operands[1])
         else:
             FailIfEncrypted(operands)
         return '(%s %s %s)' % (operands[0], operator, operands[1])

     if isinstance(top, util.BuiltInFunctionToken):
         func_name = str(top)
         if func_name in _ZERO_ARGUMENT_FUNCTIONS:
             return '%s()' % func_name
         if func_name in _ONE_ARGUMENT_FUNCTIONS:
             arity = 1
         elif func_name in _TWO_ARGUMENT_FUNCTIONS:
             arity = 2
         elif func_name in _THREE_ARGUMENT_FUNCTIONS:
             arity = 3
         else:
             raise bigquery_client.BigqueryInvalidQueryError(
                 '%s function does not exist.' % func_name, None, None,
                 None)
         call_args = PopOperands(arity)
         FailIfEncrypted(call_args)
         # Builds '%s(%s)', '%s(%s, %s)' or '%s(%s, %s, %s)'.
         template = '%s(' + ', '.join(['%s'] * arity) + ')'
         return template % tuple([func_name] + call_args)

     if isinstance(top, basestring):
         return top
     return str(top)
def _DecryptGroupConcatValues(field, table, column_index, ciphers, schema,
                              prefix):
    """Decrypts one GROUP_CONCAT column of a result table.

    Arguments:
      field: Column expression; must start with util.GROUP_CONCAT_PREFIX.
      table: Sequence of result rows.
      column_index: Index of the GROUP_CONCAT column inside each row.
      ciphers: Mapping from encryption prefix to cipher object.
      schema: Schema passed to util.GetFieldType to look up the field type.
      prefix: Encryption prefix; it is stripped from the last component of the
        field name and selects the cipher.

    Returns:
      One token per row: a 'null' literal token when the cell is None,
      otherwise a string literal token holding the comma-joined decrypted
      words.

    Raises:
      ValueError: If field is not a GROUP_CONCAT aggregation or its type is
        not one of string, integer or float.
      bigquery_client.BigqueryInvalidQueryError: If the field type is not
        string.
    """
    if not field.startswith(util.GROUP_CONCAT_PREFIX):
        raise ValueError('Not a GROUP_CONCAT aggregation.')
    pieces = field.split(' ')
    if len(pieces) >= 3:
        # Drop the last two space-separated words (presumably "AS <alias>").
        field = ' '.join(pieces[:-2])
    # Text after the prefix, minus the final character.
    inner = field.split(util.GROUP_CONCAT_PREFIX)[1][:-1]
    path = inner.split('.')
    path[-1] = path[-1].split(prefix)[1]
    field_name = '.'.join(path)

    value_type = util.GetFieldType(field_name, schema)
    if value_type not in ['string', 'integer', 'float']:
        raise ValueError('Not an known type.')
    if value_type != 'string':
        raise bigquery_client.BigqueryInvalidQueryError(
            'Cannot GROUP_CONCAT non-string type.', None, None, None)

    cipher = ciphers[prefix]
    decrypted_column = []
    for row in table:
        cell = row[column_index]
        if cell is None:
            decrypted_column.append(util.LiteralToken('null', None))
            continue
        words = [
            unicode(cipher.Decrypt(word.encode('utf-8'))).strip()
            for word in cell.split(',')
        ]
        decrypted_column.append(
            util.StringLiteralToken('"%s"' % ','.join(words)))
    return decrypted_column
Example #4
0
def DateAdd(timestamp, interval, interval_units):
    """Adds an interval to a timestamp string.

    Day, hour, minute and second intervals use plain timedelta arithmetic.
    Year and month intervals move the calendar fields; when the original day
    does not exist in the target month (e.g. Jan 31 + 1 month, or Feb 29 + 1
    year) the day is clamped to the last day of that month.

    Arguments:
      timestamp: Timestamp string, converted with _ConvertToDatetimeObject.
      interval: Number of units to add (may be negative).
      interval_units: One of 'year', 'month', 'day', 'hour', 'minute' or
        'second', case-insensitive.

    Returns:
      The resulting timestamp formatted as '%Y-%m-%d %H:%M:%S'.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If interval_units is not one
      of the supported units.
    """
    # Local import so the block does not depend on a file-level import.
    import calendar

    output_format = '%Y-%m-%d %H:%M:%S'
    timestamp = _ConvertToDatetimeObject(timestamp)
    units = interval_units.lower()

    # Python has built-in date addition of day, hour, minute and second.
    if units in ('day', 'hour', 'minute', 'second'):
        delta = datetime.timedelta(**{units + 's': interval})
        return (timestamp + delta).strftime(output_format)

    if units == 'year':
        new_year = timestamp.year + interval
        new_month = timestamp.month
    elif units == 'month':
        # Work with a zero-based month count so floor division and modulo
        # carry into the year correctly, including for negative intervals.
        month_count = timestamp.month + interval - 1
        new_year = timestamp.year + month_count // 12
        new_month = month_count % 12 + 1
    else:
        raise bigquery_client.BigqueryInvalidQueryError(
            'Invalid interval unit type.', None, None, None)

    # Clamp to the last valid day of the target month instead of always
    # falling back to February 28.
    last_day = calendar.monthrange(new_year, new_month)[1]
    return timestamp.replace(
        year=new_year, month=new_month,
        day=min(timestamp.day, last_day)).strftime(output_format)
Example #5
0
def ParseQuery(query):
  """Parses a full query into its clauses.

  Arguments:
    query: The command the user sent that needs to be parsed.

  Returns:
    Dictionary mapping clause names to their arguments. 'AS' and 'WITHIN'
    map to dictionaries, every other clause maps to a list.

  Raises:
    bigquery_client.BigqueryInvalidQueryError: When the parser raises a
    ValueError for the given query.
  """
  clause_names = ('SELECT', 'AS', 'WITHIN', 'FROM', 'WHERE', 'HAVING',
                  'GROUP BY', 'ORDER BY', 'LIMIT')
  clause_arguments = {}
  for clause_name in clause_names:
    # Alias-like clauses collect mappings; all others collect lists.
    if clause_name in ('AS', 'WITHIN'):
      clause_arguments[clause_name] = {}
    else:
      clause_arguments[clause_name] = []
  try:
    _EBQParser(clause_arguments).parseString(query)
  except ValueError as e:
    raise bigquery_client.BigqueryInvalidQueryError(e, None, None, None)
  return clause_arguments
Example #6
0
def _ConvertFromTimestamp(timestamp, utc=True):
    try:
        if utc:
            return datetime.datetime.utcfromtimestamp(timestamp)
        else:
            return datetime.datetime.fromtimestamp(timestamp)
    except ValueError as e:
        raise bigquery_client.BigqueryInvalidQueryError(e, None, None, None)
Example #7
0
    def Rewrite(self):
        """Builds the GROUP BY clause text to send to the BigQuery server.

        Each rewritten argument that matches an unencrypted SELECT expression
        is replaced by that expression's positional alias. Otherwise the
        manifest alias is used when one exists, and as a last resort periods
        in the name are replaced by util.PERIOD_REPLACEMENT.

        Returns:
          Rewritten group by clause, or an empty string when there is no
          argument.

        Raises:
          ValueError: The select clause is not a _SelectClause (attribute
            checks are delegated to _CheckNecessaryAttributes).
          bigquery_client.BigqueryInvalidQueryError: An argument uses
            probabilistic, homomorphic or searchwords encryption.
        """
        if not self._argument:
            return ''
        self._CheckNecessaryAttributes(['nsquare', 'schema', 'select_clause'])
        if not isinstance(self.select_clause, _SelectClause):
            raise ValueError('Invalid select clause.')

        for argument in self._argument:
            encryption = util.GetEntryFromSchema(argument,
                                                 self.schema)['encrypt']
            if (encryption.startswith('probabilistic')
                    or encryption in ('homomorphic', 'searchwords')):
                raise bigquery_client.BigqueryInvalidQueryError(
                    'Cannot GROUP BY %s encryption.' % encryption, None,
                    None, None)

        # Group by arguments have no alias, so an empty dictionary is adequate.
        rewritten = _RewritePostfixExpressions(
            [self._argument], {}, self.schema, self.nsquare)[0]

        # Only want expressions: strip the last two words of each query.
        unencrypted_expressions = [
            ' '.join(query.split(' ')[:-2])
            for query in self.select_clause.GetUnencryptedQueries()
        ]

        for position, expression in enumerate(rewritten):
            if expression in unencrypted_expressions:
                rewritten[position] = '%s%d_' % (
                    util.UNENCRYPTED_ALIAS_PREFIX,
                    unencrypted_expressions.index(expression))
                continue
            manifest = getattr(self, 'manifest', None)
            column_alias = None
            if manifest is not None:
                column_alias = manifest.GetColumnAliasForName(
                    expression, generate=False)
            if column_alias is None:
                column_alias = expression.replace('.',
                                                  util.PERIOD_REPLACEMENT)
            rewritten[position] = column_alias
        return 'GROUP BY %s' % ', '.join(rewritten)
    def Rewrite(self):
        """Builds the HAVING clause text to send to the BigQuery server.

        Returns:
          Rewritten having clause, or an empty string when there is no
          argument.

        Raises:
          ValueError: The as clause is not an _AsClause (attribute checks are
            delegated to _CheckNecessaryAttributes).
          bigquery_client.BigqueryInvalidQueryError: The clause contains a
            Paillier sum aggregation or a GROUP_CONCAT over an encrypted
            field.
        """
        if not self._argument:
            return ''
        self._CheckNecessaryAttributes(
            ['as_clause', 'schema', 'nsquare', 'master_key', 'table_id'])
        if not isinstance(self.as_clause, _AsClause):
            raise ValueError('Invalid as clause.')

        # Work on a copy so the stored argument is not modified.
        rewritten_argument = _RewritePostfixExpressions(
            [copy(self._argument)], self.as_clause.GetOriginalArgument(),
            self.schema, self.nsquare)[0]

        aggregations = [
            token for token in rewritten_argument
            if isinstance(token, util.AggregationQueryToken)
        ]
        for aggregation in aggregations:
            if aggregation.startswith(util.PAILLIER_SUM_PREFIX):
                raise bigquery_client.BigqueryInvalidQueryError(
                    'Cannot include SUM/AVG on homomorphic encryption in HAVING '
                    'clause.', None, None, None)
            if not aggregation.startswith(util.GROUP_CONCAT_PREFIX):
                continue
            # Text after the prefix, minus the final character.
            field = aggregation.split(util.GROUP_CONCAT_PREFIX)[1][:-1]
            if util.IsEncrypted(field):
                raise bigquery_client.BigqueryInvalidQueryError(
                    'Cannot include GROUP_CONCAT on encrypted field in HAVING '
                    'clause.', None, None, None)

        criteria = interpreter.RewriteSelectionCriteria(
            rewritten_argument, self.schema, self.master_key, self.table_id)
        return 'HAVING %s' % criteria
Example #9
0
def ParsePackedIP(readable_ip):
    """Converts a readable ip address into its packed form.

    The address is tried as IPv4 first and as IPv6 second.

    Arguments:
      readable_ip: The ip address in readable form.

    Returns:
      The packed address converted with str().

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If the value is neither a
      valid IPv4 nor a valid IPv6 address.
    """
    families = (
        (ipaddr.IPv4Address, ipaddr.v4_int_to_packed),
        (ipaddr.IPv6Address, ipaddr.v6_int_to_packed),
    )
    for address_class, pack in families:
        try:
            return str(pack(int(address_class(readable_ip))))
        except ValueError:
            # Not an address of this family; try the next one.
            continue
    raise bigquery_client.BigqueryInvalidQueryError(
        'Invalid readable ip.', None, None, None)
    def Query(self, query, **kwds):
        """Rewrites and executes a query against an encrypted table.

        The query is parsed into clauses. When it has a FROM clause, the
        master key is checked against the hash stored with the table and the
        original schema is decrypted. The query is then rewritten, run through
        the parent client, and an EncryptedTablePrinter is installed for the
        result.

        Arguments:
          query: Query to execute.
          **kwds: Passed on to the superclass Query call.

        Returns:
          The job returned by the superclass Query call.

        Raises:
          bigquery_client.BigqueryInvalidQueryError: The query fails to parse.
          bigquery_client.BigqueryAccessDeniedError: The master key hash does
            not match the hash stored with the table.
          bigquery_client.BigqueryNotFoundError: The table version differs
            from util.EBQ_TABLE_VERSION.
        """
        self._CheckKeyfileFlag()
        master_key = load_lib.ReadMasterKeyFile(self.master_key_filename)

        # Surface parser failures as an invalid-query error.
        try:
            clauses = parser.ParseQuery(query)
        except ParseException as e:
            raise bigquery_client.BigqueryInvalidQueryError(
                e, None, None, None)
        if clauses['FROM']:
            # Only the first FROM argument is used. The table id combines the
            # table name with its creation time.
            table_id = '%s_%s' % (clauses['FROM'][0],
                                  self._GetTableCreationTime(
                                      clauses['FROM'][0]))
            hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
                clauses['FROM'][0])
            # Compare the base64-encoded SHA-1 of the master key with the hash
            # stored alongside the table.
            hashed_master_key = hashlib.sha1(master_key)
            # pylint: disable=too-many-function-args
            hashed_master_key = base64.b64encode(hashed_master_key.digest())
            if hashed_master_key != hashed_table_key:
                raise bigquery_client.BigqueryAccessDeniedError(
                    'Invalid master key for this table.', None, None, None)
            if table_version != util.EBQ_TABLE_VERSION:
                raise bigquery_client.BigqueryNotFoundError(
                    'Invalid table version.', None, None, None)
            # The stored schema is base64 text that is decrypted, then
            # zlib-decompressed, then parsed as UTF-8 JSON.
            cipher = ecrypto.ProbabilisticCipher(master_key)
            orig_schema = zlib.decompress(
                cipher.Decrypt(base64.b64decode(table_schema), raw=True))
            orig_schema = json.loads(orig_schema.decode('utf-8'))
        else:
            # No table involved: nothing to verify and no schema to load.
            table_id = None
            orig_schema = []

        manifest = query_lib.QueryManifest.Generate()
        rewritten_query, print_args = query_lib.RewriteQuery(
            clauses, orig_schema, master_key, table_id, manifest)
        job = super(EncryptedBigqueryClient,
                    self).Query(rewritten_query, **kwds)
        self._LoadJobStatistics(manifest, job)

        # Install a printer built from the rewrite's print arguments.
        printer = EncryptedTablePrinter(**print_args)
        bq.Factory.ClientTablePrinter.SetTablePrinter(printer)

        return job
 def RewritePseudonymEncryption(token, op2=None):
     """Encrypts a string literal with the matching pseudonym cipher.

     Arguments:
       token: The token to rewrite; only string literal tokens are encrypted.
       op2: Optional other operand. When it has a non-None 'related'
         attribute, the cipher registered under that value is used instead of
         the default pseudonym cipher.

     Returns:
       The quoted ciphertext for a string literal, or the token unchanged for
       any other token type.

     Raises:
       bigquery_client.BigqueryInvalidQueryError: If op2 has a related
       attribute with no cipher registered for it.
     """
     if not isinstance(token, util.StringLiteralToken):
         return token

     related = None
     if op2 is not None:
         related = getattr(op2, 'related', None)

     if related is None:
         cipher = pseudonym_cipher
     else:
         cipher = pseudonym_ciphers.get(related, None)
         if cipher is None:
             raise bigquery_client.BigqueryInvalidQueryError(
                 'Cannot process token with related attribute in schema without '
                 'matching related attribute', None, None, None)
     # Strip the surrounding quotes before encrypting, then re-quote.
     return '"%s"' % cipher.Encrypt(unicode(token[1:-1]))
Example #12
0
def ParseUTCUsec(date_string):
    """Parses a timestamp string into microseconds.

    Two formats are accepted: '%Y-%m-%d %H:%M:%S' and the same with a
    fractional-seconds suffix ('.%f'). The fractional part is carried into
    the result; timetuple() alone would drop it.

    Arguments:
      date_string: The timestamp string to parse.

    Returns:
      (time.mktime of the parsed date minus _TIME_DIFFERENCE_UTC_PST)
      expressed in microseconds, plus the parsed microseconds.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If the string matches neither
      format.
    """
    for date_format in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f'):
        try:
            date = datetime.datetime.strptime(date_string, date_format)
            seconds = (time.mktime(date.timetuple()) -
                       _TIME_DIFFERENCE_UTC_PST)
        except ValueError:
            # Try the next accepted format.
            continue
        # timetuple() has no sub-second field, so add microseconds back.
        return seconds * 1000000 + date.microsecond
    raise bigquery_client.BigqueryInvalidQueryError(
        'Requires one of two following formats: \'%Y-%m-%d %H:%M:%S\' or '
        '\'%Y-%m-%d %H:%M:%S.%f\'.', None, None, None)
Example #13
0
def ToInfix(stack):
    """Converts a postfix notation stack into an infix string.

    Arguments:
      stack: Postfix notation that is being converted. <stack> is going to be
        modified as elements are being popped off.

    Returns:
      String of the expression in infix notation.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: Tokens remain on the stack
      after the conversion.
    """
    infix = _ConvertStack(stack)
    if not stack:
        return infix
    # Leftover tokens mean the expression was not fully consumed.
    raise bigquery_client.BigqueryInvalidQueryError(
        'Invalid number of arguments.', None, None, None)
def _CollapseFunctions(stack):
    """Collapses the first built-in function found in a postfix stack.

    The function's postfix expression is replaced, in place, by a single
    token. When interpreter.Evaluate succeeds, that token is the evaluated
    value (strings, None and booleans are wrapped in literal tokens). When it
    raises BigqueryInvalidQueryError, the token is a field token holding the
    infix form of the expression.

    Arguments:
      stack: The stack whose functions are to be collapsed and resolved.

    Returns:
      True iff a function is found and collapsed. In other words, another
      potential function can still exist.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If the function's expression
      is an encrypted expression.
    """
    for position, token in enumerate(stack):
        if not isinstance(token, util.BuiltInFunctionToken):
            continue
        end = position + 1
        start_idx, postfix_expr = interpreter.GetSingleValue(stack[:end])
        if util.IsEncryptedExpression(postfix_expr):
            raise bigquery_client.BigqueryInvalidQueryError(
                'Invalid aggregation function argument: Cannot put an encrypted '
                'field as an argument to a built-in function.', None, None,
                None)
        # Try to evaluate to an actual value first; fall back to the infix
        # string when evaluation is rejected.
        try:
            value = interpreter.Evaluate(list(postfix_expr))
            if isinstance(value, basestring):
                replacement = util.StringLiteralToken('"%s"' % value)
            elif value is None:
                replacement = util.LiteralToken('NULL', None)
            elif str(value).lower() in ['true', 'false']:
                replacement = util.LiteralToken(str(value).lower(), value)
            else:
                replacement = value
            stack[start_idx:end] = [replacement]
        except bigquery_client.BigqueryInvalidQueryError:
            infix = interpreter.ToInfix(list(postfix_expr))
            stack[start_idx:end] = [util.FieldToken(infix)]
        return True
    return False
def _ComputeRows(new_postfix_stack, queried_values):
    """Substitutes queries back to expressions and evaluates them.

  Arguments:
    new_postfix_stack: All expressions for each column.
    queried_values: A dictionary that represents the queried values to a list
    of values that were received from server (all have been decrypted).

  Returns:
    A new table with results of each expression after query substitution.
  """
    table_values = []
    if queried_values:
        num_rows = len(queried_values[queried_values.keys()[0]])
    else:
        for stack in new_postfix_stack:
            ans = interpreter.Evaluate(stack)
            if ans is None:
                ans = 'NULL'
            table_values.append(str(ans))
        return [table_values]

    # Substitute queried values back into postfix stacks and evaluate them.
    for i in range(num_rows):
        row_values = []
        for j in range(len(new_postfix_stack)):
            temp_stack = list(new_postfix_stack[j])
            for k in xrange(len(temp_stack)):
                if (isinstance(temp_stack[k], util.AggregationQueryToken) or
                        isinstance(temp_stack[k], util.UnencryptedQueryToken)
                        or isinstance(temp_stack[k], util.FieldToken)):
                    if str(temp_stack[k]) not in queried_values:
                        raise bigquery_client.BigqueryInvalidQueryError(
                            '%s column does not exist.' % temp_stack[k], None,
                            None, None)
                    temp_stack[k] = queried_values[str(temp_stack[k])][i]
            ans = interpreter.Evaluate(temp_stack)
            if ans is None:
                ans = 'NULL'
            row_values.append(str(ans))
        table_values.append(row_values)
    return table_values
Example #16
0
def FormatPackedIP(packed_ip):
    """Formats packed binary data as a readable ip address.

    The data is tried as an IPv4 address first and as an IPv6 address second.

    Args:
      packed_ip: The packed binary data to be converted.

    Returns:
      A readable ip address.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If the address is not valid.
    """
    packed_ip = ipaddr.Bytes(str(packed_ip))
    last_error = None
    for address_class in (ipaddr.IPv4Address, ipaddr.IPv6Address):
        try:
            return str(address_class(packed_ip))
        except ipaddr.AddressValueError as e:
            # Remember the failure; the IPv6 one is the error reported.
            last_error = e
    raise bigquery_client.BigqueryInvalidQueryError(last_error, None, None,
                                                    None)
def _CollapseAggregations(stack, nsquare):
    """Collapses the first aggregation found by combining arguments and function.

    During the collapse, checks are done on whether the aggregation is applied
    to encrypted fields. The following aggregations are rewritten:

    SUM(<homomorphic field>) becomes
    TO_BASE64(PAILLIER_SUM(FROM_BASE64(<homomorphic field>), <nsquare>))

    AVG(<homomorphic field>) becomes
    TO_BASE64(PAILLIER_SUM(FROM_BASE64(<homomorphic field>), <nsquare>)) /
    COUNT(<homomorphic field>)

    The aggregation token and the postfix expressions of its arguments are
    replaced in <stack> (in place) by the rewritten tokens.

    Arguments:
      stack: The stack whose aggregations are to be collapsed.
      nsquare: Used for homomorphic addition.

    Returns:
      True iff an aggregation was found and collapsed. In other words, another
      potential aggregation can still exist.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If an argument contains a
      Paillier sum or a GROUP_CONCAT on an encrypted field, if the
      DISTINCTCOUNT or TOP checks on the first argument fail, or if an
      aggregation other than COUNT, DISTINCTCOUNT, TOP, AVG, SUM or
      GROUP_CONCAT is applied to an encrypted argument.
    """
    for i in xrange(len(stack)):
        if isinstance(stack[i], util.AggregationFunctionToken):
            num_args = stack[i].num_args
            function_type = str(stack[i])
            postfix_exprs = []
            infix_exprs = []
            start_idx = i
            rewritten_infix_expr = None
            is_encrypted = False
            # Walk backwards through the stack, peeling off one argument
            # expression per iteration (so arguments are collected last-first).
            # pylint: disable=unused-variable
            for j in xrange(int(num_args)):
                start_idx, postfix_expr = interpreter.GetSingleValue(
                    stack[:start_idx])
                is_encrypted = is_encrypted or util.IsEncryptedExpression(
                    postfix_expr)
                # Collapse every built-in function inside this argument.
                while _CollapseFunctions(postfix_expr):
                    pass
                postfix_exprs.append(postfix_expr)
                infix_exprs.append(interpreter.ToInfix(list(postfix_expr)))
            # Check for proper nested aggregations.
            # PAILLIER_SUM and GROUP_CONCAT on encrypted fields are not legal
            # arguments for an aggregation.
            for expr in postfix_exprs:
                for token in expr:
                    if not isinstance(token, util.AggregationQueryToken):
                        continue
                    if token.startswith(util.PAILLIER_SUM_PREFIX):
                        raise bigquery_client.BigqueryInvalidQueryError(
                            'Cannot use SUM/AVG on homomorphic encryption as argument '
                            'for another aggregation.', None, None, None)
                    elif token.startswith(util.GROUP_CONCAT_PREFIX):
                        fieldname = token.split(
                            util.GROUP_CONCAT_PREFIX)[1][:-1]
                        if util.IsEncrypted(fieldname):
                            raise bigquery_client.BigqueryInvalidQueryError(
                                'Cannot use GROUP_CONCAT on an encrypted field as argument '
                                'for another aggregation.', None, None, None)
            # Arguments were collected last-first; restore source order for the
            # infix strings. postfix_exprs keeps the collected order.
            infix_exprs.reverse()
            if function_type in ['COUNT', 'DISTINCTCOUNT']:
                if (function_type == 'DISTINCTCOUNT'
                        and util.IsDeterministicExpression(postfix_exprs[0])):
                    raise bigquery_client.BigqueryInvalidQueryError(
                        'Cannot do distinct count on non-pseudonym encryption.',
                        None, None, None)
                if function_type == 'DISTINCTCOUNT':
                    infix_exprs[0] = 'DISTINCT ' + infix_exprs[0]
                rewritten_infix_expr = [
                    util.AggregationQueryToken('COUNT(%s)' %
                                               ', '.join(infix_exprs))
                ]
            elif function_type == 'TOP':
                # NOTE(review): the message says "non-deterministic" while the
                # check fires when IsDeterministicExpression is true -- confirm
                # the helper's semantics against util.
                if util.IsDeterministicExpression(postfix_exprs[0]):
                    raise bigquery_client.BigqueryInvalidQueryError(
                        'Cannot do TOP on non-deterministic encryption.', None,
                        None, None)
                rewritten_infix_expr = [
                    util.AggregationQueryToken('TOP(%s)' %
                                               ', '.join(infix_exprs))
                ]
            elif function_type in ['AVG', 'SUM'] and is_encrypted:
                # NOTE(review): postfix_expr is whatever the argument loop
                # above left behind, i.e. the last argument peeled off --
                # presumably AVG/SUM take a single argument; confirm.
                list_fields = interpreter.CheckValidSumAverageArgument(
                    postfix_expr)[0]
                rewritten_infix_expr = []
                # The representative label is the field that is going to be used
                # to get constant values. An expression SUM(ax + b) must be rewritten as
                # a * SUM(x) + b * COUNT(x). Representative label is x (this isn't unique
                # as many fields can be in COUNT).
                representative_label = ''
                for field in list_fields:
                    for token in field:
                        if util.IsLabel(token):
                            representative_label = token
                            break
                    if representative_label:
                        break
                # Emit, in postfix order, constant * COUNT(label) followed by
                # factor * SUM(field) for every term, then the '+' operators
                # that add them together.
                for field in list_fields:
                    expression = interpreter.ExpandExpression(field)
                    queries, constant = expression[0], expression[1]
                    rewritten_infix_expr.append(float(constant))
                    rewritten_infix_expr.append(
                        util.AggregationQueryToken('COUNT(%s)' %
                                                   representative_label))
                    rewritten_infix_expr.append(util.OperatorToken('*', 2))
                    for query in queries:
                        rewritten_infix_expr.append(float(query[0]))
                        if (isinstance(query[1], util.HomomorphicFloatToken)
                                or isinstance(query[1],
                                              util.HomomorphicIntToken)):
                            rewritten_infix_expr.append(
                                util.ConstructPaillierSumQuery(
                                    query[1], nsquare))
                        else:
                            rewritten_infix_expr.append(
                                util.AggregationQueryToken('SUM(%s)' %
                                                           query[1]))
                        rewritten_infix_expr.append(util.OperatorToken('*', 2))
                    # One '+' per term folds the terms into the constant part.
                    for j in range(len(queries)):
                        rewritten_infix_expr.append(util.OperatorToken('+', 2))
                # Add the per-field sums together.
                for j in range(len(list_fields) - 1):
                    rewritten_infix_expr.append(util.OperatorToken('+', 2))
                if function_type == 'AVG':
                    # AVG is the rewritten sum divided by COUNT(label).
                    rewritten_infix_expr.append(
                        util.AggregationQueryToken('COUNT(%s)' %
                                                   representative_label))
                    rewritten_infix_expr.append(util.OperatorToken('/', 2))
            elif function_type == 'GROUP_CONCAT':
                rewritten_infix_expr = [
                    util.AggregationQueryToken('GROUP_CONCAT(%s)' %
                                               ', '.join(infix_exprs))
                ]
            elif is_encrypted:
                raise bigquery_client.BigqueryInvalidQueryError(
                    'Cannot do %s aggregation on any encrypted fields.' %
                    function_type, None, None, None)
            else:
                rewritten_infix_expr = [
                    util.AggregationQueryToken(
                        '%s(%s)' % (function_type, ', '.join(infix_exprs)))
                ]
            # Replace the arguments and the aggregation token in place.
            stack[start_idx:i + 1] = rewritten_infix_expr
            return True
    return False
    def SortTable(self, column_names, table_rows):
        """Sorts the table based on the ORDER BY arguments.

        Each argument has the form "field [ASC|DESC]". When several columns
        share the field's name, the first one is used for both validation and
        sorting, so a duplicated column name cannot duplicate rows in the
        result.

        Arguments:
          column_names: Column descriptions of the table to be printed; each
            entry is indexed by 'name'.
          table_rows: Values of each row.

        Raises:
          bigquery_client.BigqueryInvalidQueryError: ORDER BY argument not a
            valid column name.

        Returns:
          The table sorted based on ORDER BY arguments. The original table is
          returned unchanged when there is no ORDER BY argument.
        """
        # If order by clause is not part of query, just return the original table.
        if not self._argument:
            return table_rows

        # Resolve every argument to (column index, descending flag) up front,
        # so all arguments are validated before any sorting happens.
        sort_keys = []
        for argument in self._argument:
            parts = argument.split(' ')
            field = parts[0]
            column_index = None
            for index, column_name in enumerate(column_names):
                if column_name['name'] == field:
                    column_index = index
                    break
            if column_index is None:
                raise bigquery_client.BigqueryInvalidQueryError(
                    '%s appears in ORDER BY, but is not a named column in SELECT.'
                    % field, None, None, None)
            descending = len(parts) == 2 and parts[1].lower() == 'desc'
            sort_keys.append((column_index, descending))

        # Sort from the least important field (last argument) to the most
        # important one (first argument). This works because Python's sort is
        # stable: http://docs.python.org/2/howto/sorting.html.
        row_order = list(range(len(table_rows)))
        for column_index, descending in reversed(sort_keys):
            row_order.sort(
                key=lambda row, column=column_index: table_rows[row][column],
                reverse=descending)

        return [table_rows[row] for row in row_order]
Example #19
0
def _ConvertToDatetimeObject(date_string):
    try:
        return datetime.datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        raise bigquery_client.BigqueryInvalidQueryError(
            'Not a valid timestamp object.', None, None, None)
Example #20
0
def _ExpandExpression(stack):
    """Expands a postfix expression into its linear-expression form.

    The <stack> is consumed (popped) while it is expanded. The result is a two
    element list:

      [list of pairs, constant]

    Every pair is [constant factor, field]. For example the polynomial
    ax + by + cz + d (a, b, c, d are numbers and x, y, z are fields) is
    represented as:

      [[[a, x], [b, y], [c, z]], d]

    The operators are applied to this structure as follows.

    Addition/Subtraction:
      Start from the pairs of the first operand. For every pair of the second
      operand, look for a pair with an equal field among them. If one is found,
      its constant factor is combined using the operator; otherwise the pair is
      appended (its factor is negated first when subtracting). Finally the two
      constants are added/subtracted.

      s1 = x + 2y + 3    = [[[1.0, x], [2.0, y]], 3.0]
      s2 = 1.5x + 4z + 1 = [[[1.5, x], [4.0, z]], 1.0]
      s1 + s2            = [[[2.5, x], [2.0, y], [4.0, z]], 4.0]

    Multiplication:
      Only a constant times a linear expression is allowed, so at least one
      operand must have an empty list of pairs; otherwise the product would no
      longer be linear and an error is raised. Every constant factor and the
      constant of the other operand are multiplied by that constant.

      Fails for (two fields multiplied):  s1 = x + 3, s2 = y
      s1 = x + 2y + 1 = [[[1.0, x], [2.0, y]], 1.0]
      s2 = 3          = [[], 3.0]
      s1 * s2         = [[[3.0, x], [6.0, y]], 3.0]

    Division:
      The denominator must be a constant (no pairs); otherwise an error is
      raised. Every constant factor and the constant of the numerator are
      divided by it.

      Fails for (denominator is not a constant):  s1 = x + 1, s2 = y
      s1 = 2x + 4y + 6 = [[[2.0, x], [4.0, y]], 6.0]
      s2 = 2           = [[], 2.0]
      s1 / s2          = [[[1.0, x], [2.0, y]], 3.0]

    Arguments:
      stack: Postfix expression that you want to expand.

    Returns:
      The above described data structure representing the expression.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If the stack is not linear or
      if there are invalid arguments.
    """
    token = stack.pop()

    is_unary_operator = (isinstance(token, util.OperatorToken)
                         and token.num_args == 1)
    if is_unary_operator or isinstance(
            token, (util.BuiltInFunctionToken, util.AggregationFunctionToken,
                    util.LiteralToken)):
        raise bigquery_client.BigqueryInvalidQueryError(
            'Invalid SUM arguments. %s is not supported' % token, None, None,
            None)

    if token in ('+', '-'):
        # Operands come off the stack in reverse order.
        right = _ExpandExpression(stack)
        left = _ExpandExpression(stack)
        combine = _BINARY_OPERATORS[token]
        terms = list(left[0])
        for candidate in right[0]:
            for existing in terms:
                if candidate[1] == existing[1]:
                    # Same field on both sides: merge the constant factors.
                    existing[0] = combine(existing[0], candidate[0])
                    break
            else:
                # Field only appears on the right-hand side.
                if token == '-':
                    candidate[0] *= -1
                terms.append(candidate)
        return [terms, combine(left[1], right[1])]

    if token == '*':
        right = _ExpandExpression(stack)
        left = _ExpandExpression(stack)
        if left[0] and right[0]:
            raise bigquery_client.BigqueryInvalidQueryError(
                'Not a linear function. Two fields are being multipled.', None,
                None, None)
        terms = list(left[0]) + list(right[0])
        # The operand without fields supplies the scaling constant.
        scale = right[1] if left[0] else left[1]
        for term in terms:
            term[0] *= scale
        return [terms, left[1] * right[1]]

    if token == '/':
        divisor = _ExpandExpression(stack)
        dividend = _ExpandExpression(stack)
        if divisor[0]:
            raise bigquery_client.BigqueryInvalidQueryError(
                'Division by a label: not a linear function.', None, None,
                None)
        terms = list(dividend[0])
        try:
            for term in terms:
                term[0] /= divisor[1]
            return [terms, dividend[1] / divisor[1]]
        except ZeroDivisionError:
            raise bigquery_client.BigqueryInvalidQueryError(
                'Division by zero.', None, None, None)

    if util.IsFloat(token):
        # Plain numeric constant: no fields.
        return [[], float(token)]

    if isinstance(token, (util.PseudonymToken, util.SearchwordsToken,
                          util.ProbabilisticToken)):
        raise bigquery_client.BigqueryInvalidQueryError(
            'Cannot do SUM/AVG on non-homomorphic encryption.', None, None,
            None)
    # Anything else is treated as a single field with factor one.
    return [[[1.0, token]], 0.0]
Example #21
0
def CheckValidSumAverageArgument(stack):
    """Checks if stack is a proper argument for SUM/AVG.

    This recursive algorithm performs tainting while it consumes (pops) the
    <stack>. Intermediate results use the following structure:

      s = [list of postfix expressions, taint1, taint2]

    The postfix expressions, all added together, are equivalent to the
    expanded version of <stack>.
    taint1 is true iff s contains a homomorphically encrypted field.
    taint2 is true iff s contains any token that is not a float constant,
    i.e. any field (encrypted or unencrypted).

    The check fails if an encrypted field is multiplied/divided by any other
    field (either encrypted or unencrypted), or if anything is divided by an
    expression containing an encrypted field.

    Arguments:
      stack: The postfix expression that is being checked if valid for SUM/AVG
      argument.

    Returns:
      A list holding the list of postfix expressions followed by the two
      taints described above.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If <stack> is not a valid
      linear expression (or one we cannot compute) that can be a SUM/AVG
      argument.
    """
    token = stack.pop()

    is_unary_operator = (isinstance(token, util.OperatorToken)
                         and token.num_args == 1)
    if is_unary_operator or isinstance(
            token, (util.BuiltInFunctionToken, util.AggregationFunctionToken,
                    util.LiteralToken)):
        raise bigquery_client.BigqueryInvalidQueryError(
            'Invalid SUM arguments. %s is not supported' % token, None, None,
            None)

    if token in ('+', '-'):
        # Operands come off the stack in reverse order.
        right = CheckValidSumAverageArgument(stack)
        left = CheckValidSumAverageArgument(stack)
        if token == '-':
            # Subtraction is expressed as adding each right term times -1.
            for expression in right[0]:
                expression.extend([-1, util.OperatorToken('*', 2)])
        expressions = list(left[0])
        expressions.extend(right[0])
        return [expressions, left[1] or right[1], left[2] or right[2]]

    if token == '*':
        right = CheckValidSumAverageArgument(stack)
        left = CheckValidSumAverageArgument(stack)
        if ((left[1] and (right[1] or right[2]))
                or (right[1] and (left[1] or left[2]))):
            raise bigquery_client.BigqueryInvalidQueryError(
                'Invalid AVG/SUM argument. An encrypted field is multipled by another'
                ' field.', None, None, None)
        # Distribute the product over both sums of postfix expressions.
        products = []
        for left_expression in left[0]:
            for right_expression in right[0]:
                product = list(left_expression)
                product.extend(right_expression)
                product.append(util.OperatorToken('*', 2))
                products.append(product)
        return [products, left[1] or right[1], left[2] or right[2]]

    if token == '/':
        divisor = CheckValidSumAverageArgument(stack)
        dividend = CheckValidSumAverageArgument(stack)
        if divisor[1] or (dividend[1] and divisor[2]):
            raise bigquery_client.BigqueryInvalidQueryError(
                'Division by/of an encrypted field: not a linear function.',
                None, None, None)
        # Build one postfix suffix that sums every divisor expression and then
        # divides by that sum.
        divisor_suffix = []
        for expression in divisor[0]:
            divisor_suffix.extend(expression)
        for _ in range(len(divisor[0]) - 1):
            divisor_suffix.append(util.OperatorToken('+', 2))
        divisor_suffix.append(util.OperatorToken('/', 2))
        quotients = list(dividend[0])
        for expression in quotients:
            expression.extend(divisor_suffix)
        return [quotients, dividend[1], dividend[2] or divisor[2]]

    if isinstance(token, (util.PseudonymToken, util.SearchwordsToken,
                          util.ProbabilisticToken)):
        raise bigquery_client.BigqueryInvalidQueryError(
            'Cannot do SUM/AVG on non-homomorphic encryption.', None, None,
            None)
    is_encrypted = isinstance(
        token, (util.HomomorphicIntToken, util.HomomorphicFloatToken))
    return [[[token]], is_encrypted, not util.IsFloat(token)]
Example #22
0
def Tld(_):
    """Placeholder that always fails because the function is not implemented.

    Arguments:
      _: Ignored.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: Always.
    """
    message = 'Not implemented yet.'
    raise bigquery_client.BigqueryInvalidQueryError(message, None, None, None)
Example #23
0
def _ConvertStack(postfix):
    """Convert postfix stack to infix string.

    Arguments:
      postfix: A stack in postfix notation. The postfix stack will be modified
      as elements are being popped from the top.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: There are not enough
      arguments for functions/operators, or a built-in function name is not
      found in any of the argument-count tables.

    Returns:
      A string of the infix representation of the stack.
    """
    if not postfix:
        raise bigquery_client.BigqueryInvalidQueryError(
            'Not enough arguments.', None, None, None)
    token = postfix.pop()

    if isinstance(token, util.OperatorToken):
        # Operands are popped last-to-first, so restore their written order.
        operands = [_ConvertStack(postfix) for _ in range(token.num_args)]
        operands.reverse()
        if token.num_args == 1:
            return '%s %s' % (str(token), operands[0])
        return '(%s %s %s)' % (operands[0], str(token), operands[1])

    if isinstance(token, util.BuiltInFunctionToken):
        func_name = str(token)
        if func_name in _ZERO_ARGUMENT_FUNCTIONS:
            return '%s()' % func_name
        if func_name in _ONE_ARGUMENT_FUNCTIONS:
            return '%s(%s)' % (func_name, _ConvertStack(postfix))
        if func_name in _TWO_ARGUMENT_FUNCTIONS:
            second = _ConvertStack(postfix)
            first = _ConvertStack(postfix)
            return '%s(%s, %s)' % (token, first, second)
        if func_name in _THREE_ARGUMENT_FUNCTIONS:
            third = _ConvertStack(postfix)
            second = _ConvertStack(postfix)
            first = _ConvertStack(postfix)
            return '%s(%s, %s, %s)' % (token, first, second, third)
        raise bigquery_client.BigqueryInvalidQueryError(
            'Function %s does not exist.' % str(token), None, None, None)

    if isinstance(token, util.AggregationFunctionToken):
        argument_count = token.num_args
        func_name = str(token)
        arguments = [_ConvertStack(postfix)
                     for _ in range(int(argument_count))]
        arguments.reverse()
        if func_name == 'DISTINCTCOUNT':
            # Rendered as COUNT(DISTINCT <first argument>, ...).
            func_name = 'COUNT'
            arguments[0] = 'DISTINCT ' + arguments[0]
        rendered = [str(argument) for argument in arguments]
        return func_name + '(' + ', '.join(rendered) + ')'

    if isinstance(token, basestring):
        return token
    return str(token)
Example #24
0
    def Resolve(stack):
        """Resolves the postfix stack and evaluates the expression into one value.

        The <stack> is the expression in postfix notation; it is consumed
        (popped) while it is evaluated. A None operand propagates: an operator
        or a built-in function taking arguments evaluates to None, without
        being invoked, as soon as any of its operands resolved to None.

        Arguments:
          stack: Postfix notation to be resolved.

        Raises:
          bigquery_client.BigqueryInvalidQueryError: If not enough arguments
          are provided for an operator/function, an unknown function name is
          given, a field token is left in the expression, or a binary operator
          divides by zero.

        Returns:
          The resolution of the postfix notation.
        """
        if not stack:
            raise bigquery_client.BigqueryInvalidQueryError(
                'Not enough arguments.', None, None, None)
        top = stack.pop()
        if isinstance(top, util.OperatorToken):
            # Operands are popped last-to-first, so restore their order.
            args = []
            for unused_i in range(top.num_args):
                args.append(Resolve(stack))
            args.reverse()
            if None in args:
                return None
            if top.num_args == 1:
                return _UNARY_OPERATORS[str(top)](*args)
            try:
                return _BINARY_OPERATORS[top](*args)
            except ZeroDivisionError:
                raise bigquery_client.BigqueryInvalidQueryError(
                    'Division by zero.', None, None, None)
        elif isinstance(top, util.BuiltInFunctionToken):
            func_name = str(top)
            if func_name in _ZERO_ARGUMENT_FUNCTIONS:
                # NOTE(review): the table entry is returned as-is rather than
                # called; confirm _ZERO_ARGUMENT_FUNCTIONS maps names to
                # values and not to callables.
                return _ZERO_ARGUMENT_FUNCTIONS[func_name]
            elif func_name in _ONE_ARGUMENT_FUNCTIONS:
                op = Resolve(stack)
                # Previously the None result was overwritten by an
                # unconditional call; return early so None really propagates.
                if op is None:
                    return None
                return _ONE_ARGUMENT_FUNCTIONS[func_name](op)
            elif func_name in _TWO_ARGUMENT_FUNCTIONS:
                op2 = Resolve(stack)
                op1 = Resolve(stack)
                if op1 is None or op2 is None:
                    return None
                return _TWO_ARGUMENT_FUNCTIONS[func_name](op1, op2)
            elif func_name in _THREE_ARGUMENT_FUNCTIONS:
                op3 = Resolve(stack)
                op2 = Resolve(stack)
                op1 = Resolve(stack)
                if op1 is None or op2 is None or op3 is None:
                    return None
                return _THREE_ARGUMENT_FUNCTIONS[func_name](op1, op2, op3)
            else:
                raise bigquery_client.BigqueryInvalidQueryError(
                    'No function ' + func_name + ' exists.', None, None, None)
        elif isinstance(top, util.FieldToken):
            # A field token reaching this point cannot be evaluated.
            raise bigquery_client.BigqueryInvalidQueryError(
                '%s does not exist as a column.' % str(top), None, None, None)
        elif isinstance(top, util.LiteralToken):
            return top.value
        else:
            return top
Example #25
0
def ParseIP(readable_ip):
    """Returns the integer value of the IPv4 address given by <readable_ip>.

    Arguments:
      readable_ip: Value handed to ipaddr.IPv4Address to build the address.

    Returns:
      The result of int() applied to the parsed address.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If ipaddr rejects the value
      with an AddressValueError.
    """
    try:
        address = ipaddr.IPv4Address(readable_ip)
    except ipaddr.AddressValueError as error:
        raise bigquery_client.BigqueryInvalidQueryError(
            error, None, None, None)
    else:
        return int(address)
Example #26
0
def RegexpExtract(string, reg_exp):
    """Returns the text captured by the first group of <reg_exp> in <string>.

    Arguments:
      string: Text that is searched.
      reg_exp: Regular expression; it must declare at least one capturing
      group.

    Returns:
      The substring matched by the first capturing group of the first match
      (None if that group did not take part in the match).

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If <reg_exp> does not match
      anywhere in <string>, or if it declares no capturing group.
    """
    search = re.search(reg_exp, string)
    # A pattern without any capturing group would make group(1) raise a bare
    # IndexError; report it as an invalid query, just like a failed match.
    if search is None or search.re.groups < 1:
        raise bigquery_client.BigqueryInvalidQueryError(
            'No captured group.', None, None, None)
    return search.group(1)
Example #27
0
 def FailIfDeterministic(tokens):
     """Rejects <tokens> when util.IsDeterministicExpression flags them.

     Arguments:
       tokens: The tokens handed to util.IsDeterministicExpression.

     Raises:
       bigquery_client.BigqueryInvalidQueryError: If the check returns a true
       value.
     """
     if not util.IsDeterministicExpression(tokens):
         return
     raise bigquery_client.BigqueryInvalidQueryError(
         'Cannot do equality on probabilistic encryption, '
         'only pseudonym encryption.', None, None, None)
Example #28
0
 def FailIfEncrypted(tokens):
     """Rejects <tokens> when util.IsEncryptedExpression flags them.

     Arguments:
       tokens: The tokens handed to util.IsEncryptedExpression.

     Raises:
       bigquery_client.BigqueryInvalidQueryError: If the check returns a true
       value.
     """
     if not util.IsEncryptedExpression(tokens):
         return
     raise bigquery_client.BigqueryInvalidQueryError(
         'Invalid where/having expression.', None, None, None)
Example #29
0
def RewriteSelectionCriteria(stack, schema, master_key, table_id):
    """Rewrites selection criteria (arguments of WHERE and HAVING clause).

    The <stack> itself is left untouched; a copy of it is consumed.

    Arguments:
      stack: The postfix expression that is the where/having expression.
      schema: The user defined values and encryption.
      master_key: Used to get ciphers for encryption.
      table_id: Used to generate a proper key.

    Returns:
      An infix version of the <stack>. The expression is rewritten so that it
      can be sent to the BigQuery server.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If the expression is invalid
      (such as searching non-searchable encrypted fields, etc).
    """
    pseudonym_key = ecrypto.GeneratePseudonymCipherKey(master_key, table_id)
    pseudonym_cipher = ecrypto.PseudonymCipher(pseudonym_key)
    hash_key = ecrypto.GenerateStringHashKey(master_key, table_id)
    string_hasher = ecrypto.StringHash(hash_key)

    def FailIfEncrypted(tokens):
        """Raises if util.IsEncryptedExpression flags <tokens>."""
        if not util.IsEncryptedExpression(tokens):
            return
        raise bigquery_client.BigqueryInvalidQueryError(
            'Invalid where/having expression.', None, None, None)

    def FailIfDeterministic(tokens):
        """Raises if util.IsDeterministicExpression flags <tokens>."""
        if not util.IsDeterministicExpression(tokens):
            return
        raise bigquery_client.BigqueryInvalidQueryError(
            'Cannot do equality on probabilistic encryption, '
            'only pseudonym encryption.', None, None, None)

    def RewritePseudonymEncryption(token):
        """Encrypts string literals with the pseudonym cipher.

        Arguments:
          token: Operand of an equality comparison.

        Returns:
          For a string literal token, its content (surrounding quote
          characters stripped) encrypted and wrapped in double quotes; any
          other token is returned unchanged.
        """
        if not isinstance(token, util.StringLiteralToken):
            return token
        return '"%s"' % pseudonym_cipher.Encrypt(unicode(token[1:-1]))

    def RewriteSearchwordsEncryption(field, literal):
        """Rewrites the literal such that it can be checked for containment.

        Arguments:
          field: The field which is being checked if literal is contained
          within.
          literal: Substring being searched for.

        Returns:
          A tuple containing both field and literal rewritten.

        Raises:
          ValueError: Try to rewrite non-searchwords encryption.
        """
        if not isinstance(field, (util.SearchwordsToken,
                                  util.ProbabilisticToken)):
            raise ValueError('Invalid encryption to check containment.')
        original_name = field.original_name
        row = util.GetEntryFromSchema(original_name, schema)
        # Replace the last component of the dotted name by its prefixed form.
        name_parts = original_name.split('.')
        name_parts[-1] = util.SEARCHWORDS_PREFIX + row['name']
        modified_field = '.'.join(name_parts)
        if 'searchwords_separator' in row:
            searchwords_separator = row['searchwords_separator']
        else:
            searchwords_separator = None
        word_list = ecrypto.CleanUnicodeString(unicode(literal.value),
                                               separator=searchwords_separator)
        # Without an explicit separator the words are joined by one space.
        joiner = ' ' if searchwords_separator is None else searchwords_separator
        word_seq = joiner.join(word_list)
        key_hash = string_hasher.GetStringKeyHash(
            modified_field.split('.')[-1], word_seq)
        keyed_hash = u'\'%s\'' % key_hash
        modified_string = (
            u'to_base64(left(bytes(sha1(concat(left(%s, 24), %s))), 8))' %
            (modified_field, keyed_hash))
        return (modified_field, modified_string)

    def CheckSearchableField(op1):
        """Checks if the operand is a searchable encrypted field.

        Arguments:
          op1: The operand that is being checked if it is searchable.

        Returns:
          True iff op1 is searchable.
        """
        if isinstance(op1, util.SearchwordsToken):
            return True
        if not isinstance(op1, util.ProbabilisticToken):
            return False
        # A probabilistic token is searchable only if the schema says so.
        row = util.GetEntryFromSchema(op1.original_name, schema)
        return row['encrypt'] in ['probabilistic_searchwords', 'searchwords']

    def RewriteContainsOrFail(op1, op2):
        """Tries to rewrite a contains expression.

        Arguments:
          op1: The first operand of the contains binary operator.
          op2: The second operand of the contains binary operator.

        Returns:
          The rewritten versions of both operands.

        Raises:
          bigquery_client.BigqueryInvalidQueryError: If the contains
          expression is invalid.
        """
        if not isinstance(op1, util.EncryptedToken):
            # Nothing to rewrite for an unencrypted field.
            return (op1, op2)
        if not CheckSearchableField(op1):
            raise bigquery_client.BigqueryInvalidQueryError(
                'Cannot do contains on an encrypted field that is not searchable.',
                None, None, None)
        if not isinstance(op2, util.StringLiteralToken):
            raise bigquery_client.BigqueryInvalidQueryError(
                'The substring to be checked must be a literal.', None, None,
                None)
        return RewriteSearchwordsEncryption(op1, op2)

    def CheckAndRewriteStack(postfix):
        """Pops one sub-expression off <postfix> and returns it in infix form.

        Raises:
          bigquery_client.BigqueryInvalidQueryError: If <postfix> runs out of
          arguments, a function name is unknown, or one of the encryption
          checks rejects the operands.
        """
        if not postfix:
            raise bigquery_client.BigqueryInvalidQueryError(
                'Not enough arguments.', None, None, None)
        top = postfix.pop()

        if isinstance(top, util.OperatorToken):
            # Operands are popped last-to-first, so restore their order.
            args = [CheckAndRewriteStack(postfix)
                    for _ in range(top.num_args)]
            args.reverse()
            operator = str(top)
            if top.num_args == 1:
                return '%s %s' % (operator, args[0])
            if operator in ['=', '==', '!=']:
                FailIfDeterministic(args)
                if (isinstance(args[0], util.PseudonymToken)
                        or isinstance(args[1], util.PseudonymToken)):
                    args[0] = RewritePseudonymEncryption(args[0])
                    args[1] = RewritePseudonymEncryption(args[1])
            elif operator == 'contains':
                FailIfEncrypted([args[1]])
                args[0], args[1] = RewriteContainsOrFail(args[0], args[1])
            else:
                FailIfEncrypted(args)
            return '(%s %s %s)' % (args[0], operator, args[1])

        if isinstance(top, util.BuiltInFunctionToken):
            func_name = str(top)
            if func_name in _ZERO_ARGUMENT_FUNCTIONS:
                return '%s()' % func_name
            if func_name in _ONE_ARGUMENT_FUNCTIONS:
                arity, template = 1, '%s(%s)'
            elif func_name in _TWO_ARGUMENT_FUNCTIONS:
                arity, template = 2, '%s(%s, %s)'
            elif func_name in _THREE_ARGUMENT_FUNCTIONS:
                arity, template = 3, '%s(%s, %s, %s)'
            else:
                raise bigquery_client.BigqueryInvalidQueryError(
                    '%s function does not exist.' % func_name, None, None,
                    None)
            operands = [CheckAndRewriteStack(postfix) for _ in range(arity)]
            operands.reverse()
            FailIfEncrypted(operands)
            return template % tuple([func_name] + operands)

        if isinstance(top, basestring):
            return top
        return str(top)

    # Work on a copy so the caller's stack is not consumed.
    remaining = list(stack)
    rewritten = CheckAndRewriteStack(remaining)
    if remaining:
        raise bigquery_client.BigqueryInvalidQueryError(
            'Too many arguments.', None, None, None)
    return rewritten
Example #30
0
def FormatIP(packed_ip):
    """Returns the string form of the IPv4 address given by <packed_ip>.

    Arguments:
      packed_ip: Value handed to ipaddr.IPv4Address to build the address.

    Returns:
      The result of str() applied to the parsed address.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If ipaddr rejects the value
      with an AddressValueError.
    """
    try:
        address = ipaddr.IPv4Address(packed_ip)
    except ipaddr.AddressValueError as error:
        raise bigquery_client.BigqueryInvalidQueryError(
            error, None, None, None)
    else:
        return str(address)