def _GetAttributeFilterUnion(self, attributes, timestamp_filter=None):
  """Build one Bigtable row filter matching any of the given attributes.

  For each attribute prefix a family filter is built, optionally chained
  with a column-qualifier filter and the timestamp filter. The per-attribute
  filters are then combined into a union when there is more than one.

  Args:
    attributes: Iterable of attribute prefix strings.
    timestamp_filter: Optional row filter restricting cell timestamps.

  Returns:
    A single row filter (chain, union, or plain filter) covering all
    requested attributes.
  """
  per_attribute = []
  for prefix in attributes:
    family, column = self.GetFamilyColumn(prefix)
    parts = [row_filters.FamilyNameRegexFilter(family)]
    if column:
      parts.append(row_filters.ColumnQualifierRegexFilter(column))
    if timestamp_filter:
      parts.append(timestamp_filter)
    # A single condition needs no chain wrapper.
    if len(parts) == 1:
      per_attribute.append(parts[0])
    else:
      per_attribute.append(row_filters.RowFilterChain(filters=parts))
  # Wrap in a union only when more than one attribute filter was built.
  if len(per_attribute) == 1:
    return per_attribute[0]
  return row_filters.RowFilterUnion(filters=per_attribute)
def filter_composing_interleave(project_id, instance_id, table_id):
    """Read rows matching an interleave (union) of two filters.

    A row is returned if its cells match either filter: value equal to
    "true" or column qualifier matching "os_build".

    Args:
        project_id: GCP project containing the Bigtable instance.
        instance_id: Bigtable instance ID.
        table_id: Table to read from.
    """
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    interleave = row_filters.RowFilterUnion(filters=[
        row_filters.ValueRegexFilter("true"),
        row_filters.ColumnQualifierRegexFilter("os_build"),
    ])
    for row in table.read_rows(filter_=interleave):
        print_row(row)
def get(self):
    """Read the configured row keys from Bigtable and map them to API fields.

    Reads only the columns listed in bt_mapping_dict (latest cell per
    column), translates Bigtable column names back to their API field
    names, and returns the rows as a JSON array.

    Returns:
        Tuple of (JSON string, 200, content-type header dict). On error the
        exception is logged and whatever rows were collected so far are
        returned.
    """
    bt_array = []
    try:
        table = instance.table(bt_table_name)

        # Restrict the read to the explicitly requested row keys.
        row_set = RowSet()
        for row_key in row_keys:
            row_set.add_row_key(row_key)

        # One qualifier filter per mapped Bigtable column; only the mapped
        # keys' values are needed, so iterate values() directly.
        col_filters = [
            row_filters.ColumnQualifierRegexFilter(bt_name)
            for bt_name in bt_mapping_dict.values()
        ]

        print("before read_rows...")
        rows = table.read_rows(
            row_set=row_set,
            filter_=row_filters.RowFilterChain(filters=[
                # Latest cell only, and only the mapped columns.
                row_filters.CellsColumnLimitFilter(1),
                row_filters.RowFilterUnion(filters=col_filters),
            ]),
            retry=bigtable.table.DEFAULT_RETRY_READ_ROWS.with_deadline(
                60.0))
        print("after read_rows...")

        for row in rows:
            print("Reading data for {}:".format(
                row.row_key.decode('utf-8')))
            for cf, cols in sorted(row.cells.items()):
                bt_dict = {'id': row.row_key.decode('utf-8')}
                # Translate Bigtable column names back to API field names.
                for col, cells in sorted(cols.items()):
                    for cell in cells:
                        # Reset per cell: previously `key` leaked across
                        # columns, so an unmapped column could overwrite the
                        # value stored under the prior column's key.
                        key = None
                        for name, bt_name in bt_mapping_dict.items():
                            if col.decode('utf-8') == bt_name:
                                key = name
                                break
                        if key is not None:
                            bt_dict[key] = cell.value.decode('utf-8')
                bt_array.append(bt_dict)
    except Exception as error:
        # Was `except BaseException`, which would also swallow
        # KeyboardInterrupt/SystemExit; catch only real errors.
        logging.error(
            'An exception occurred - DemoBigTableGet::get(): {}'.format(
                error))
    print(bt_array)
    return json.dumps(bt_array), 200, {'ContentType': 'application/json'}
def ResolveMulti(self, subject, attributes, timestamp=None, limit=None,
                 token=None):
  """Resolve multiple attributes for a subject.

  Results will be returned in arbitrary order (i.e. not ordered by
  attribute or timestamp).

  Args:
    subject: The subject to resolve.
    attributes: The attribute string or list of strings to match. Note this
      is an exact match, not a regex.
    timestamp: A range of times for consideration (In microseconds). Can be
      a constant such as ALL_TIMESTAMPS or NEWEST_TIMESTAMP or a tuple of
      ints (start, end).
    limit: The maximum total number of results we return.
    token: The security token used in this call.

  Yields:
    A unordered list of (attribute, value string, timestamp).

  Raises:
    AccessError: if anything goes wrong.
  """
  subject = utils.SmartStr(subject)
  self.security_manager.CheckDataStoreAccess(
      token, [subject], self.GetRequiredResolveAccess(attributes))
  # Normalize to a list of byte strings. NOTE: `basestring`/`iteritems`
  # below mean this code is Python 2 only.
  if isinstance(attributes, basestring):
    attributes = [utils.SmartStr(attributes)]
  else:
    attributes = [utils.SmartStr(x) for x in attributes]
  filter_union = []
  for attribute in attributes:
    family, column = self.GetFamilyColumn(attribute)
    # Exact column match: a range whose start and end are the same column.
    col_filter = row_filters.ColumnRangeFilter(
        family, start_column=column, end_column=column)
    filter_union.append(col_filter)
  # More than one attribute, use a union, otherwise just use the
  # existing filter.
  if len(filter_union) > 1:
    filter_union = row_filters.RowFilterUnion(filters=filter_union)
  else:
    filter_union = filter_union[0]
  # Essentially timestamp AND (attr1 OR attr2)
  timestamp_filter = self._TimestampToFilter(timestamp)
  if timestamp_filter:
    row_filter = row_filters.RowFilterChain(
        filters=[filter_union, timestamp_filter])
  else:
    row_filter = filter_union
  # Single-row read with the combined filter, retried on transient errors.
  row_data = self.CallWithRetry(
      self.table.read_row, "read", subject, filter_=row_filter)
  if row_data:
    # 2**64 acts as "effectively unlimited" when no limit was given.
    max_results = limit or 2**64
    # NOTE(review): `family` here is whatever the LAST loop iteration set,
    # so this assumes all requested attributes share one column family —
    # verify against GetFamilyColumn's contract.
    for column, cells in row_data.cells[family].iteritems():
      attribute = ":".join((family, column))
      for cell in cells:
        if max_results <= 0:
          # NOTE(review): `raise StopIteration` inside a generator is a
          # Python 2 idiom; under PEP 479 (Python 3.7+) it would become a
          # RuntimeError rather than ending iteration.
          raise StopIteration
        max_results -= 1
        yield attribute, self.Decode(
            attribute, cell.value), self.DatetimeToMicroseconds(
                cell.timestamp)
def MultiResolvePrefix(self, subjects, attribute_prefix, timestamp=None,
                       limit=None, token=None):
  """Get results from multiple rows matching multiple attributes.

  We could implement this using read_rows, but it is a table scan. Our
  current data model makes that slow because it is a directory hierarchy
  that includes entries for subdirectories interleaved. So if you want all
  the results for a directory you need to skip those in the scan.

  Instead we make an RPC for each subject all at once using a threadpool.
  We pay more in RPC overhead but we get to do it concurrently.

  Args:
    subjects: A list of subjects.
    attribute_prefix: The attribute prefix.
    timestamp: A range of times for consideration (In microseconds). Can be
      a constant such as ALL_TIMESTAMPS or NEWEST_TIMESTAMP or a tuple of
      ints (start, end).
    limit: The total number of result values to return.
    token: An ACL token.

  Yields:
    A list of tuples: (subject, [(attribute, value string, timestamp)])
    that can be simply converted to a dict. Values with the same attribute
    (happens when timestamp is not NEWEST_TIMESTAMP, but ALL_TIMESTAMPS or
    time range) are guaranteed to be ordered in the decreasing timestamp
    order.

  Raises:
    AccessError: if anything goes wrong.
    ValueError: if we get a string instead of a list of subjects.
  """
  self.security_manager.CheckDataStoreAccess(
      token, subjects, self.GetRequiredResolveAccess(attribute_prefix))
  if isinstance(subjects, basestring):
    raise ValueError("Expected list of subjects, got string: %s" % subjects)
  # Normalize to a list of byte strings. NOTE: `basestring` means this code
  # is Python 2 only.
  if isinstance(attribute_prefix, basestring):
    attribute_prefix_list = [utils.SmartStr(attribute_prefix)]
  else:
    attribute_prefix_list = [utils.SmartStr(x) for x in attribute_prefix]
  timestamp_filter = self._TimestampToFilter(timestamp)
  # Build one filter per prefix: family, optional column-prefix regex, and
  # optional timestamp restriction chained together.
  filter_union = []
  for attribute_prefix in attribute_prefix_list:
    family, column = self.GetFamilyColumn(attribute_prefix)
    family_filter = row_filters.FamilyNameRegexFilter(family)
    row_filter_list = [family_filter]
    if column:
      # Make it an actual regex
      column += ".*"
      col_filter = row_filters.ColumnQualifierRegexFilter(column)
      row_filter_list.append(col_filter)
    if timestamp_filter:
      row_filter_list.append(timestamp_filter)
    if len(row_filter_list) > 1:
      row_filter = row_filters.RowFilterChain(filters=row_filter_list)
    else:
      row_filter = row_filter_list[0]
    filter_union.append(row_filter)
  # More than one set of prefixes, use a union, otherwise just use the
  # existing filter chain.
  if len(filter_union) > 1:
    attribute_filter = row_filters.RowFilterUnion(filters=filter_union)
  else:
    attribute_filter = filter_union[0]
  # Apply those filters to each subject as a separate RPC using a threadpool
  pool_args = []
  original_subject_map = {}
  for subject in subjects:
    # List of *args, **kwargs to pass to the RPC caller
    pool_args.append(((self.table.read_row, "read",
                       utils.SmartStr(subject)), {
                           "filter_": attribute_filter
                       }))
    # We're expected to return subjects as their original type, which can be
    # URN, unicode, or string. Keep a mapping in this dict.
    original_subject_map[utils.SmartStr(subject)] = subject
  # 2**64 acts as "effectively unlimited" when no limit was given.
  max_results = limit or 2**64
  # Fan the reads out over the pool; results arrive in arbitrary order.
  for result in self.pool.imap_unordered(self._WrapCallWithRetry,
                                         pool_args):
    if max_results <= 0:
      break
    # Rows with no matching cells come back falsy and are skipped.
    if result:
      subject_results, max_results = self._GetSubjectResults(result,
                                                             max_results)
      yield original_subject_map[
          result.row_key], self._SortResultsByAttrTimestampValue(
              subject_results)