def Resolve(self, subject, attribute, token=None):
    """Retrieve the latest value set for a subject's attribute.

    Args:
      subject: The subject URN.
      attribute: The attribute.
      token: The security token used in this call.

    Returns:
      A (value string, timestamp in microseconds) tuple stored in the bigtable
      cell, or (None, 0).

    Raises:
      AccessError: if anything goes wrong.
    """
    subject = utils.SmartStr(subject)
    attribute = utils.SmartStr(attribute)

    family, column = self.GetFamilyColumn(attribute)
    col_filter = row_filters.ColumnRangeFilter(
        family, start_column=column, end_column=column)

    # Most recent cell only.
    latest_filter = row_filters.CellsColumnLimitFilter(1)

    row_filter = row_filters.RowFilterChain(filters=[col_filter, latest_filter])
    row_data = self.table.read_row(subject, filter_=row_filter)

    if row_data:
        for cell in row_data.cells[family][column]:
            return self.Decode(
                attribute,
                cell.value), self.DatetimeToMicroseconds(cell.timestamp)

    return None, 0

def _GetAttributeFilterUnion(self, attributes, timestamp_filter=None):
    filters = []
    for attribute_prefix in attributes:
        family, column = self.GetFamilyColumn(attribute_prefix)

        family_filter = row_filters.FamilyNameRegexFilter(family)
        row_filter_list = [family_filter]
        if column:
            col_filter = row_filters.ColumnQualifierRegexFilter(column)
            row_filter_list.append(col_filter)

        if timestamp_filter:
            row_filter_list.append(timestamp_filter)

        if len(row_filter_list) > 1:
            row_filter = row_filters.RowFilterChain(filters=row_filter_list)
        else:
            row_filter = row_filter_list[0]

        filters.append(row_filter)

    # More than one attribute, use a union; otherwise just use the
    # existing filter.
    if len(filters) > 1:
        filters = row_filters.RowFilterUnion(filters=filters)
    else:
        filters = filters[0]

    return filters
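
# Illustration only (not part of the data store): for two attribute prefixes
# that map to the same column family, the helper above effectively builds a
# union of per-prefix chains like the one below. The family name and qualifier
# regexes here are hypothetical placeholders.
from google.cloud.bigtable import row_filters

example_union = row_filters.RowFilterUnion(filters=[
    row_filters.RowFilterChain(filters=[
        row_filters.FamilyNameRegexFilter("aff4"),
        row_filters.ColumnQualifierRegexFilter("metadata:.*"),
    ]),
    row_filters.RowFilterChain(filters=[
        row_filters.FamilyNameRegexFilter("aff4"),
        row_filters.ColumnQualifierRegexFilter("index:.*"),
    ]),
])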

def _Acquire(self, lease_time):
    now = int(time.time() * 1e6)
    expires = int((time.time() + lease_time) * 1e6)

    # Only the latest value.
    latest_value = row_filters.CellsColumnLimitFilter(1)

    # Match any lease time value > now, which means someone else holds a lock.
    # We can't store these as ints, so encode to str.
    current_lease = row_filters.ValueRangeFilter(
        start_value=utils.SmartStr(now), inclusive_start=False)

    # aff4:lease
    family, column = self.store.GetFamilyColumn(self.store.LEASE_ATTRIBUTE)
    col_filter = row_filters.ColumnRangeFilter(
        family, start_column=column, end_column=column)

    # Note filter chains are evaluated in order, so there are performance
    # considerations with which filter to apply first.
    filter_chain = row_filters.RowFilterChain(
        [col_filter, current_lease, latest_value])
    mutate_row = self.store.table.row(self.subject, filter_=filter_chain)

    # state=False means no lease or it's expired; in this case take the lock.
    mutate_row.set_cell(family, column, utils.SmartStr(expires), state=False)

    # Check in review: I think we want to retry the RPC here? Or should we
    # just raise like we can't get the lock?
    existing_lock = self.store.CallWithRetry(mutate_row.commit, "write")
    if existing_lock:
        raise ExistingLock("Row %s locked." % self.subject)

    # We have the lock.
    self.expires = expires
    self.locked = True
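
# A minimal standalone sketch (not GRR code) of the check-and-mutate pattern
# that _Acquire relies on: table.row(key, filter_=...) returns a conditional
# row, mutations registered with state=False are applied only when the
# predicate filter does NOT match, and commit() returns whether the predicate
# matched. The row key, family and qualifier below are hypothetical.
import time

from google.cloud.bigtable import row_filters


def try_take_lock(table, row_key, family="aff4", qualifier="lease",
                  lease_seconds=60):
    now = int(time.time() * 1e6)
    expires = int((time.time() + lease_seconds) * 1e6)

    # Predicate: the lease cell holds a value strictly greater than "now",
    # i.e. an unexpired lease written by someone else.
    predicate = row_filters.RowFilterChain([
        row_filters.ColumnRangeFilter(
            family, start_column=qualifier, end_column=qualifier),
        row_filters.ValueRangeFilter(
            start_value=str(now).encode("utf-8"), inclusive_start=False),
        row_filters.CellsColumnLimitFilter(1),
    ])

    conditional_row = table.row(row_key, filter_=predicate)
    # Applied only when the predicate does NOT match (state=False).
    conditional_row.set_cell(
        family, qualifier, str(expires).encode("utf-8"), state=False)

    lock_already_held = conditional_row.commit()
    return not lock_already_held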

def filter_composing_chain(project_id, instance_id, table_id):
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    rows = table.read_rows(
        filter_=row_filters.RowFilterChain(filters=[
            row_filters.CellsColumnLimitFilter(1),
            row_filters.FamilyNameRegexFilter("cell_plan"),
        ]))
    for row in rows:
        print_row(row)
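
# filter_composing_chain and filter_composing_condition call print_row, which
# is not defined in this section. A plausible helper, modelled on the Cloud
# Bigtable "reading data" sample snippets, might look like this:
def print_row(row):
    print("Reading data for {}:".format(row.row_key.decode("utf-8")))
    for cf, cols in sorted(row.cells.items()):
        print("Column Family {}".format(cf))
        for col, cells in sorted(cols.items()):
            for cell in cells:
                labels = (" [{}]".format(",".join(cell.labels))
                          if len(cell.labels) else "")
                print("\t{}: {} @{}{}".format(
                    col.decode("utf-8"), cell.value.decode("utf-8"),
                    cell.timestamp, labels))
    print("")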

def get(self):
    bt_array = []
    try:
        table = instance.table(bt_table_name)

        row_set = RowSet()
        for row_key in row_keys:
            row_set.add_row_key(row_key)

        colFilters = []
        for name, bt_name in bt_mapping_dict.items():
            colFilters.append(row_filters.ColumnQualifierRegexFilter(bt_name))

        print("before read_rows...")
        rows = table.read_rows(
            row_set=row_set,
            filter_=row_filters.RowFilterChain(filters=[
                row_filters.CellsColumnLimitFilter(1),
                row_filters.RowFilterUnion(filters=colFilters)
            ]),
            retry=bigtable.table.DEFAULT_RETRY_READ_ROWS.with_deadline(60.0))
        print("after read_rows...")

        for row in rows:
            print("Reading data for {}:".format(row.row_key.decode('utf-8')))
            for cf, cols in sorted(row.cells.items()):
                bt_dict = {}
                bt_dict['id'] = row.row_key.decode('utf-8')
                key = None
                # Use the BT mapping to return data under the expected names.
                for col, cells in sorted(cols.items()):
                    for cell in cells:
                        for name, bt_name in bt_mapping_dict.items():
                            if col.decode('utf-8') == bt_name:
                                key = name
                                break
                        if key is not None:
                            bt_dict[key] = cell.value.decode('utf-8')
                bt_array.append(bt_dict)
    except BaseException as error:
        logging.error(
            'An exception occurred - DemoBigTableGet::get(): {}'.format(error))

    print(bt_array)
    return json.dumps(bt_array), 200, {'ContentType': 'application/json'}

def filter_composing_condition(project_id, instance_id, table_id):
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    rows = table.read_rows(
        filter_=row_filters.ConditionalRowFilter(
            base_filter=row_filters.RowFilterChain(filters=[
                row_filters.ColumnQualifierRegexFilter("data_plan_10gb"),
                row_filters.ValueRegexFilter("true")
            ]),
            true_filter=row_filters.ApplyLabelFilter(label="passed-filter"),
            false_filter=row_filters.ApplyLabelFilter(label="filtered-out")))
    for row in rows:
        print_row(row)

def ScanAttributes(self,
                   subject_prefix,
                   attributes,
                   after_urn=None,
                   max_records=None,
                   token=None,
                   relaxed_order=False):
    subject_prefix = self._CleanSubjectPrefix(subject_prefix)
    after_urn = self._CleanAfterURN(after_urn, subject_prefix)

    # Turn subject prefix into an actual regex.
    subject_prefix += ".*"

    self.security_manager.CheckDataStoreAccess(token, [subject_prefix], "rq")

    subject_filter = row_filters.RowKeyRegexFilter(
        utils.SmartStr(subject_prefix))
    latest_value = row_filters.CellsColumnLimitFilter(1)
    attribute_filters = self._GetAttributeFilterUnion(attributes)
    # Subject AND (attr1 OR attr2) AND latest_value
    query_filter = row_filters.RowFilterChain(
        [subject_filter, attribute_filters, latest_value])

    # The API results include the start row, we want to exclude it, append a
    # null to do so.
    if after_urn is not None:
        after_urn += "\x00"

    rows_data = self.CallWithRetry(
        self.table.read_rows,
        "read",
        start_key=after_urn,
        limit=max_records,
        filter_=query_filter)

    # Ideally we should be able to stream and yield, but it seems we can't:
    # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/1812
    self.CallWithRetry(rows_data.consume_all, "read")

    results = []
    if rows_data.rows:
        for subject, row_data in rows_data.rows.iteritems():
            subject_results = self._ReOrderRowResults(row_data)
            results.append((subject, subject_results))

    return sorted(results, key=lambda x: x[0])

def write_conditional(project_id, instance_id, table_id):
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    timestamp = datetime.datetime.utcnow()
    column_family_id = "stats_summary"

    row_key = "phone#4c410523#20190501"

    row_filter = row_filters.RowFilterChain(filters=[
        row_filters.FamilyNameRegexFilter(column_family_id),
        row_filters.ColumnQualifierRegexFilter("os_build"),
        row_filters.ValueRegexFilter("PQ2A\\..*"),
    ])
    row = table.conditional_row(row_key, filter_=row_filter)
    row.set_cell(column_family_id, "os_name", "android", timestamp)
    row.commit()

    print("Successfully updated row's os_name.")
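
# Hypothetical driver for the standalone sample snippets above; the project,
# instance and table IDs are placeholders and assume a table populated like
# the Cloud Bigtable "mobile-time-series" sample data.
if __name__ == "__main__":
    filter_composing_chain("my-project", "my-instance", "mobile-time-series")
    filter_composing_condition("my-project", "my-instance",
                               "mobile-time-series")
    write_conditional("my-project", "my-instance", "mobile-time-series")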

def ResolveMulti(self, subject, attributes, timestamp=None, limit=None,
                 token=None):
    """Resolve multiple attributes for a subject.

    Results will be returned in arbitrary order (i.e. not ordered by attribute
    or timestamp).

    Args:
      subject: The subject to resolve.
      attributes: The attribute string or list of strings to match. Note this
        is an exact match, not a regex.
      timestamp: A range of times for consideration (in microseconds). Can be
        a constant such as ALL_TIMESTAMPS or NEWEST_TIMESTAMP or a tuple of
        ints (start, end).
      limit: The maximum total number of results we return.
      token: The security token used in this call.

    Yields:
      An unordered list of (attribute, value string, timestamp).

    Raises:
      AccessError: if anything goes wrong.
    """
    subject = utils.SmartStr(subject)
    self.security_manager.CheckDataStoreAccess(
        token, [subject], self.GetRequiredResolveAccess(attributes))

    if isinstance(attributes, basestring):
        attributes = [utils.SmartStr(attributes)]
    else:
        attributes = [utils.SmartStr(x) for x in attributes]

    filter_union = []
    for attribute in attributes:
        family, column = self.GetFamilyColumn(attribute)
        col_filter = row_filters.ColumnRangeFilter(
            family, start_column=column, end_column=column)
        filter_union.append(col_filter)

    # More than one attribute, use a union; otherwise just use the
    # existing filter.
    if len(filter_union) > 1:
        filter_union = row_filters.RowFilterUnion(filters=filter_union)
    else:
        filter_union = filter_union[0]

    # Essentially: timestamp AND (attr1 OR attr2)
    timestamp_filter = self._TimestampToFilter(timestamp)
    if timestamp_filter:
        row_filter = row_filters.RowFilterChain(
            filters=[filter_union, timestamp_filter])
    else:
        row_filter = filter_union

    row_data = self.CallWithRetry(
        self.table.read_row, "read", subject, filter_=row_filter)

    if row_data:
        max_results = limit or 2**64
        for column, cells in row_data.cells[family].iteritems():
            attribute = ":".join((family, column))
            for cell in cells:
                if max_results <= 0:
                    raise StopIteration
                max_results -= 1
                yield attribute, self.Decode(
                    attribute,
                    cell.value), self.DatetimeToMicroseconds(cell.timestamp)
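
# _TimestampToFilter is not shown in this section. A sketch, assuming the
# helper turns a (start, end) tuple of microseconds into a Bigtable timestamp
# range filter (and returns None for the "newest only" case, which callers
# already handle with CellsColumnLimitFilter):
import datetime

from google.cloud.bigtable import row_filters


def timestamp_range_filter(start_us, end_us):
    start = datetime.datetime.utcfromtimestamp(start_us / 1e6)
    end = datetime.datetime.utcfromtimestamp(end_us / 1e6)
    return row_filters.TimestampRangeFilter(
        row_filters.TimestampRange(start=start, end=end))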

def MultiResolvePrefix(self, subjects, attribute_prefix, timestamp=None,
                       limit=None, token=None):
    """Get results from multiple rows matching multiple attributes.

    We could implement this using read_rows, but it is a table scan. Our
    current data model makes that slow because it is a directory hierarchy
    that includes entries for subdirectories interleaved. So if you want all
    the results for a directory you need to skip those in the scan.

    Instead we make an RPC for each subject all at once using a threadpool.
    We pay more in RPC overhead but we get to do it concurrently.

    Args:
      subjects: A list of subjects.
      attribute_prefix: The attribute prefix.
      timestamp: A range of times for consideration (in microseconds). Can be
        a constant such as ALL_TIMESTAMPS or NEWEST_TIMESTAMP or a tuple of
        ints (start, end).
      limit: The total number of result values to return.
      token: An ACL token.

    Yields:
      A list of tuples: (subject, [(attribute, value string, timestamp)]) that
      can simply be converted to a dict. Values with the same attribute (which
      happens when timestamp is not NEWEST_TIMESTAMP but ALL_TIMESTAMPS or a
      time range) are guaranteed to be ordered in decreasing timestamp order.

    Raises:
      AccessError: if anything goes wrong.
      ValueError: if we get a string instead of a list of subjects.
    """
    self.security_manager.CheckDataStoreAccess(
        token, subjects, self.GetRequiredResolveAccess(attribute_prefix))

    if isinstance(subjects, basestring):
        raise ValueError("Expected list of subjects, got string: %s" % subjects)

    if isinstance(attribute_prefix, basestring):
        attribute_prefix_list = [utils.SmartStr(attribute_prefix)]
    else:
        attribute_prefix_list = [utils.SmartStr(x) for x in attribute_prefix]

    timestamp_filter = self._TimestampToFilter(timestamp)

    filter_union = []
    for attribute_prefix in attribute_prefix_list:
        family, column = self.GetFamilyColumn(attribute_prefix)

        family_filter = row_filters.FamilyNameRegexFilter(family)
        row_filter_list = [family_filter]

        if column:
            # Make it an actual regex.
            column += ".*"
            col_filter = row_filters.ColumnQualifierRegexFilter(column)
            row_filter_list.append(col_filter)

        if timestamp_filter:
            row_filter_list.append(timestamp_filter)

        if len(row_filter_list) > 1:
            row_filter = row_filters.RowFilterChain(filters=row_filter_list)
        else:
            row_filter = row_filter_list[0]

        filter_union.append(row_filter)

    # More than one set of prefixes, use a union; otherwise just use the
    # existing filter chain.
    if len(filter_union) > 1:
        attribute_filter = row_filters.RowFilterUnion(filters=filter_union)
    else:
        attribute_filter = filter_union[0]

    # Apply those filters to each subject as a separate RPC using a
    # threadpool.
    pool_args = []
    original_subject_map = {}
    for subject in subjects:
        # List of *args, **kwargs to pass to the RPC caller.
        pool_args.append(((self.table.read_row, "read",
                           utils.SmartStr(subject)), {
                               "filter_": attribute_filter
                           }))

        # We're expected to return subjects as their original type, which can
        # be URN, unicode, or string. Keep a mapping in this dict.
        original_subject_map[utils.SmartStr(subject)] = subject

    max_results = limit or 2**64
    for result in self.pool.imap_unordered(self._WrapCallWithRetry, pool_args):
        if max_results <= 0:
            break
        if result:
            subject_results, max_results = self._GetSubjectResults(
                result, max_results)
            yield original_subject_map[
                result.row_key], self._SortResultsByAttrTimestampValue(
                    subject_results)
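
# A minimal sketch (not GRR's implementation) of the fan-out pattern the
# docstring above describes: issue one read_row RPC per subject from a thread
# pool instead of a single table scan. Helper and parameter names here are
# hypothetical.
import multiprocessing.pool


def read_rows_concurrently(table, subjects, attribute_filter, threads=10):
    pool = multiprocessing.pool.ThreadPool(threads)
    try:
        def _read(subject):
            return subject, table.read_row(subject, filter_=attribute_filter)

        # imap_unordered yields results as workers finish, which is enough
        # because callers sort or re-key the results afterwards.
        for subject, row_data in pool.imap_unordered(_read, subjects):
            if row_data is not None:
                yield subject, row_data
    finally:
        pool.close()
        pool.join()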