def Resolve(self, subject, attribute, token=None):
    """Retrieve the latest value set for a subject's attribute.

    Args:
      subject: The subject URN.
      attribute: The attribute.
      token: The security token used in this call.

    Returns:
      A (value string, timestamp in microseconds) tuple stored in the bigtable
      cell, or (None, 0).

    Raises:
      AccessError: if anything goes wrong.
    """
    subject = utils.SmartStr(subject)
    attribute = utils.SmartStr(attribute)

    family, column = self.GetFamilyColumn(attribute)
    col_filter = row_filters.ColumnRangeFilter(
        family, start_column=column, end_column=column)

    # Most recent cell only.
    latest_filter = row_filters.CellsColumnLimitFilter(1)

    row_filter = row_filters.RowFilterChain(filters=[col_filter, latest_filter])
    row_data = self.table.read_row(subject, filter_=row_filter)

    if row_data:
        for cell in row_data.cells[family][column]:
            return self.Decode(
                attribute,
                cell.value), self.DatetimeToMicroseconds(cell.timestamp)

    return None, 0

def _GetAttributeFilterUnion(self, attributes, timestamp_filter=None):
    filters = []
    for attribute_prefix in attributes:
        family, column = self.GetFamilyColumn(attribute_prefix)

        family_filter = row_filters.FamilyNameRegexFilter(family)
        row_filter_list = [family_filter]
        if column:
            col_filter = row_filters.ColumnQualifierRegexFilter(column)
            row_filter_list.append(col_filter)

        if timestamp_filter:
            row_filter_list.append(timestamp_filter)

        if len(row_filter_list) > 1:
            row_filter = row_filters.RowFilterChain(filters=row_filter_list)
        else:
            row_filter = row_filter_list[0]

        filters.append(row_filter)

    # More than one attribute, use a union; otherwise just use the
    # existing filter.
    if len(filters) > 1:
        filters = row_filters.RowFilterUnion(filters=filters)
    else:
        filters = filters[0]

    return filters
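
# Illustration only (not part of the data store): for two attribute prefixes
# that map to the same column family, the helper above effectively builds a
# union of per-prefix chains like the one below. The family name and qualifier
# regexes here are hypothetical placeholders.
from google.cloud.bigtable import row_filters

example_union = row_filters.RowFilterUnion(filters=[
    row_filters.RowFilterChain(filters=[
        row_filters.FamilyNameRegexFilter("aff4"),
        row_filters.ColumnQualifierRegexFilter("metadata:.*"),
    ]),
    row_filters.RowFilterChain(filters=[
        row_filters.FamilyNameRegexFilter("aff4"),
        row_filters.ColumnQualifierRegexFilter("index:.*"),
    ]),
])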

def _Acquire(self, lease_time):
    now = int(time.time() * 1e6)
    expires = int((time.time() + lease_time) * 1e6)

    # Only the latest value.
    latest_value = row_filters.CellsColumnLimitFilter(1)

    # Match any lease time value > now, which means someone else holds a lock.
    # We can't store these as ints, so encode to str.
    current_lease = row_filters.ValueRangeFilter(
        start_value=utils.SmartStr(now), inclusive_start=False)

    # aff4:lease
    family, column = self.store.GetFamilyColumn(self.store.LEASE_ATTRIBUTE)
    col_filter = row_filters.ColumnRangeFilter(
        family, start_column=column, end_column=column)

    # Note filter chains are evaluated in order, so there are performance
    # considerations with which filter to apply first.
    filter_chain = row_filters.RowFilterChain(
        [col_filter, current_lease, latest_value])
    mutate_row = self.store.table.row(self.subject, filter_=filter_chain)

    # state=False means no lease or it's expired; in this case take the lock.
    mutate_row.set_cell(family, column, utils.SmartStr(expires), state=False)

    # Check in review: I think we want to retry the RPC here? Or should we
    # just raise like we can't get the lock?
    existing_lock = self.store.CallWithRetry(mutate_row.commit, "write")
    if existing_lock:
        raise ExistingLock("Row %s locked." % self.subject)

    # We have the lock.
    self.expires = expires
    self.locked = True
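
# A minimal standalone sketch (not GRR code) of the check-and-mutate pattern
# that _Acquire relies on: table.row(key, filter_=...) returns a conditional
# row, mutations registered with state=False are applied only when the
# predicate filter does NOT match, and commit() returns whether the predicate
# matched. The row key, family and qualifier below are hypothetical.
import time

from google.cloud.bigtable import row_filters


def try_take_lock(table, row_key, family="aff4", qualifier="lease",
                  lease_seconds=60):
    now = int(time.time() * 1e6)
    expires = int((time.time() + lease_seconds) * 1e6)

    # Predicate: the lease cell holds a value strictly greater than "now",
    # i.e. an unexpired lease written by someone else.
    predicate = row_filters.RowFilterChain([
        row_filters.ColumnRangeFilter(
            family, start_column=qualifier, end_column=qualifier),
        row_filters.ValueRangeFilter(
            start_value=str(now).encode("utf-8"), inclusive_start=False),
        row_filters.CellsColumnLimitFilter(1),
    ])

    conditional_row = table.row(row_key, filter_=predicate)
    # Applied only when the predicate does NOT match (state=False).
    conditional_row.set_cell(
        family, qualifier, str(expires).encode("utf-8"), state=False)

    lock_already_held = conditional_row.commit()
    return not lock_already_held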

def filter_composing_chain(project_id, instance_id, table_id):
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    rows = table.read_rows(
        filter_=row_filters.RowFilterChain(filters=[
            row_filters.CellsColumnLimitFilter(1),
            row_filters.FamilyNameRegexFilter("cell_plan"),
        ]))
    for row in rows:
        print_row(row)
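
# filter_composing_chain and filter_composing_condition call print_row, which
# is not defined in this section. A plausible helper, modelled on the Cloud
# Bigtable "reading data" sample snippets, might look like this:
def print_row(row):
    print("Reading data for {}:".format(row.row_key.decode("utf-8")))
    for cf, cols in sorted(row.cells.items()):
        print("Column Family {}".format(cf))
        for col, cells in sorted(cols.items()):
            for cell in cells:
                labels = (" [{}]".format(",".join(cell.labels))
                          if len(cell.labels) else "")
                print("\t{}: {} @{}{}".format(
                    col.decode("utf-8"), cell.value.decode("utf-8"),
                    cell.timestamp, labels))
    print("")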

def get(self):
    bt_array = []
    try:
        table = instance.table(bt_table_name)

        row_set = RowSet()
        for row_key in row_keys:
            row_set.add_row_key(row_key)

        colFilters = []
        for name, bt_name in bt_mapping_dict.items():
            colFilters.append(row_filters.ColumnQualifierRegexFilter(bt_name))

        print("before read_rows...")
        rows = table.read_rows(
            row_set=row_set,
            filter_=row_filters.RowFilterChain(filters=[
                row_filters.CellsColumnLimitFilter(1),
                row_filters.RowFilterUnion(filters=colFilters)
            ]),
            retry=bigtable.table.DEFAULT_RETRY_READ_ROWS.with_deadline(60.0))
        print("after read_rows...")

        for row in rows:
            print("Reading data for {}:".format(row.row_key.decode('utf-8')))
            for cf, cols in sorted(row.cells.items()):
                bt_dict = {}
                bt_dict['id'] = row.row_key.decode('utf-8')
                key = None
                # Use the BT mapping to return data under the expected names.
                for col, cells in sorted(cols.items()):
                    for cell in cells:
                        for name, bt_name in bt_mapping_dict.items():
                            if col.decode('utf-8') == bt_name:
                                key = name
                                break
                        if key is not None:
                            bt_dict[key] = cell.value.decode('utf-8')
                bt_array.append(bt_dict)
    except BaseException as error:
        logging.error(
            'An exception occurred - DemoBigTableGet::get(): {}'.format(error))

    print(bt_array)
    return json.dumps(bt_array), 200, {'ContentType': 'application/json'}

def filter_composing_condition(project_id, instance_id, table_id):
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    rows = table.read_rows(
        filter_=row_filters.ConditionalRowFilter(
            base_filter=row_filters.RowFilterChain(filters=[
                row_filters.ColumnQualifierRegexFilter("data_plan_10gb"),
                row_filters.ValueRegexFilter("true")
            ]),
            true_filter=row_filters.ApplyLabelFilter(label="passed-filter"),
            false_filter=row_filters.ApplyLabelFilter(label="filtered-out")))
    for row in rows:
        print_row(row)

def ScanAttributes(self,
                   subject_prefix,
                   attributes,
                   after_urn=None,
                   max_records=None,
                   token=None,
                   relaxed_order=False):
    subject_prefix = self._CleanSubjectPrefix(subject_prefix)
    after_urn = self._CleanAfterURN(after_urn, subject_prefix)

    # Turn subject prefix into an actual regex.
    subject_prefix += ".*"

    self.security_manager.CheckDataStoreAccess(token, [subject_prefix], "rq")

    subject_filter = row_filters.RowKeyRegexFilter(
        utils.SmartStr(subject_prefix))
    latest_value = row_filters.CellsColumnLimitFilter(1)
    attribute_filters = self._GetAttributeFilterUnion(attributes)
    # Subject AND (attr1 OR attr2) AND latest_value
    query_filter = row_filters.RowFilterChain(
        [subject_filter, attribute_filters, latest_value])

    # The API results include the start row, we want to exclude it, append a
    # null to do so.
    if after_urn is not None:
        after_urn += "\x00"

    rows_data = self.CallWithRetry(
        self.table.read_rows,
        "read",
        start_key=after_urn,
        limit=max_records,
        filter_=query_filter)

    # Ideally we should be able to stream and yield, but it seems we can't:
    # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/1812
    self.CallWithRetry(rows_data.consume_all, "read")

    results = []
    if rows_data.rows:
        for subject, row_data in rows_data.rows.iteritems():
            subject_results = self._ReOrderRowResults(row_data)
            results.append((subject, subject_results))

    return sorted(results, key=lambda x: x[0])

def write_conditional(project_id, instance_id, table_id):
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    timestamp = datetime.datetime.utcnow()
    column_family_id = "stats_summary"

    row_key = "phone#4c410523#20190501"

    row_filter = row_filters.RowFilterChain(filters=[
        row_filters.FamilyNameRegexFilter(column_family_id),
        row_filters.ColumnQualifierRegexFilter("os_build"),
        row_filters.ValueRegexFilter("PQ2A\\..*"),
    ])
    row = table.conditional_row(row_key, filter_=row_filter)
    row.set_cell(column_family_id, "os_name", "android", timestamp)
    row.commit()

    print("Successfully updated row's os_name.")
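
# Hypothetical driver for the standalone sample snippets above; the project,
# instance and table IDs are placeholders and assume a table populated like
# the Cloud Bigtable "mobile-time-series" sample data.
if __name__ == "__main__":
    filter_composing_chain("my-project", "my-instance", "mobile-time-series")
    filter_composing_condition("my-project", "my-instance",
                               "mobile-time-series")
    write_conditional("my-project", "my-instance", "mobile-time-series")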

def ResolveMulti(self, subject, attributes, timestamp=None, limit=None,
                 token=None):
    """Resolve multiple attributes for a subject.

    Results will be returned in arbitrary order (i.e. not ordered by attribute
    or timestamp).

    Args:
      subject: The subject to resolve.
      attributes: The attribute string or list of strings to match. Note this
        is an exact match, not a regex.
      timestamp: A range of times for consideration (in microseconds). Can be
        a constant such as ALL_TIMESTAMPS or NEWEST_TIMESTAMP or a tuple of
        ints (start, end).
      limit: The maximum total number of results we return.
      token: The security token used in this call.

    Yields:
      An unordered list of (attribute, value string, timestamp).

    Raises:
      AccessError: if anything goes wrong.
    """
    subject = utils.SmartStr(subject)
    self.security_manager.CheckDataStoreAccess(
        token, [subject], self.GetRequiredResolveAccess(attributes))

    if isinstance(attributes, basestring):
        attributes = [utils.SmartStr(attributes)]
    else:
        attributes = [utils.SmartStr(x) for x in attributes]

    filter_union = []
    for attribute in attributes:
        family, column = self.GetFamilyColumn(attribute)
        col_filter = row_filters.ColumnRangeFilter(
            family, start_column=column, end_column=column)
        filter_union.append(col_filter)

    # More than one attribute, use a union; otherwise just use the
    # existing filter.
    if len(filter_union) > 1:
        filter_union = row_filters.RowFilterUnion(filters=filter_union)
    else:
        filter_union = filter_union[0]

    # Essentially: timestamp AND (attr1 OR attr2)
    timestamp_filter = self._TimestampToFilter(timestamp)
    if timestamp_filter:
        row_filter = row_filters.RowFilterChain(
            filters=[filter_union, timestamp_filter])
    else:
        row_filter = filter_union

    row_data = self.CallWithRetry(
        self.table.read_row, "read", subject, filter_=row_filter)

    if row_data:
        max_results = limit or 2**64
        for column, cells in row_data.cells[family].iteritems():
            attribute = ":".join((family, column))
            for cell in cells:
                if max_results <= 0:
                    raise StopIteration
                max_results -= 1
                yield attribute, self.Decode(
                    attribute,
                    cell.value), self.DatetimeToMicroseconds(cell.timestamp)
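
# _TimestampToFilter is not shown in this section. A sketch, assuming the
# helper turns a (start, end) tuple of microseconds into a Bigtable timestamp
# range filter (and returns None for the "newest only" case, which callers
# already handle with CellsColumnLimitFilter):
import datetime

from google.cloud.bigtable import row_filters


def timestamp_range_filter(start_us, end_us):
    start = datetime.datetime.utcfromtimestamp(start_us / 1e6)
    end = datetime.datetime.utcfromtimestamp(end_us / 1e6)
    return row_filters.TimestampRangeFilter(
        row_filters.TimestampRange(start=start, end=end))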

def MultiResolvePrefix(self, subjects, attribute_prefix, timestamp=None,
                       limit=None, token=None):
    """Get results from multiple rows matching multiple attributes.

    We could implement this using read_rows, but it is a table scan. Our
    current data model makes that slow because it is a directory hierarchy
    that includes entries for subdirectories interleaved. So if you want all
    the results for a directory you need to skip those in the scan.

    Instead we make an RPC for each subject all at once using a threadpool.
    We pay more in RPC overhead but we get to do it concurrently.

    Args:
      subjects: A list of subjects.
      attribute_prefix: The attribute prefix.
      timestamp: A range of times for consideration (in microseconds). Can be
        a constant such as ALL_TIMESTAMPS or NEWEST_TIMESTAMP or a tuple of
        ints (start, end).
      limit: The total number of result values to return.
      token: An ACL token.

    Yields:
      A list of tuples: (subject, [(attribute, value string, timestamp)]) that
      can simply be converted to a dict. Values with the same attribute (which
      happens when timestamp is not NEWEST_TIMESTAMP but ALL_TIMESTAMPS or a
      time range) are guaranteed to be ordered in decreasing timestamp order.

    Raises:
      AccessError: if anything goes wrong.
      ValueError: if we get a string instead of a list of subjects.
    """
    self.security_manager.CheckDataStoreAccess(
        token, subjects, self.GetRequiredResolveAccess(attribute_prefix))

    if isinstance(subjects, basestring):
        raise ValueError("Expected list of subjects, got string: %s" % subjects)

    if isinstance(attribute_prefix, basestring):
        attribute_prefix_list = [utils.SmartStr(attribute_prefix)]
    else:
        attribute_prefix_list = [utils.SmartStr(x) for x in attribute_prefix]

    timestamp_filter = self._TimestampToFilter(timestamp)

    filter_union = []
    for attribute_prefix in attribute_prefix_list:
        family, column = self.GetFamilyColumn(attribute_prefix)

        family_filter = row_filters.FamilyNameRegexFilter(family)
        row_filter_list = [family_filter]

        if column:
            # Make it an actual regex.
            column += ".*"
            col_filter = row_filters.ColumnQualifierRegexFilter(column)
            row_filter_list.append(col_filter)

        if timestamp_filter:
            row_filter_list.append(timestamp_filter)

        if len(row_filter_list) > 1:
            row_filter = row_filters.RowFilterChain(filters=row_filter_list)
        else:
            row_filter = row_filter_list[0]

        filter_union.append(row_filter)

    # More than one set of prefixes, use a union; otherwise just use the
    # existing filter chain.
    if len(filter_union) > 1:
        attribute_filter = row_filters.RowFilterUnion(filters=filter_union)
    else:
        attribute_filter = filter_union[0]

    # Apply those filters to each subject as a separate RPC using a
    # threadpool.
    pool_args = []
    original_subject_map = {}
    for subject in subjects:
        # List of *args, **kwargs to pass to the RPC caller.
        pool_args.append(((self.table.read_row, "read",
                           utils.SmartStr(subject)), {
                               "filter_": attribute_filter
                           }))

        # We're expected to return subjects as their original type, which can
        # be URN, unicode, or string. Keep a mapping in this dict.
        original_subject_map[utils.SmartStr(subject)] = subject

    max_results = limit or 2**64
    for result in self.pool.imap_unordered(self._WrapCallWithRetry, pool_args):
        if max_results <= 0:
            break
        if result:
            subject_results, max_results = self._GetSubjectResults(
                result, max_results)
            yield original_subject_map[
                result.row_key], self._SortResultsByAttrTimestampValue(
                    subject_results)
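
# A minimal sketch (not GRR's implementation) of the fan-out pattern the
# docstring above describes: issue one read_row RPC per subject from a thread
# pool instead of a single table scan. Helper and parameter names here are
# hypothetical.
import multiprocessing.pool


def read_rows_concurrently(table, subjects, attribute_filter, threads=10):
    pool = multiprocessing.pool.ThreadPool(threads)
    try:
        def _read(subject):
            return subject, table.read_row(subject, filter_=attribute_filter)

        # imap_unordered yields results as workers finish, which is enough
        # because callers sort or re-key the results afterwards.
        for subject, row_data in pool.imap_unordered(_read, subjects):
            if row_data is not None:
                yield subject, row_data
    finally:
        pool.close()
        pool.join()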