def _GetAttributeFilterUnion(self, attributes, timestamp_filter=None):
  """Build one Bigtable row filter matching any of the given attributes.

  For each attribute prefix a family filter is built, optionally chained
  with a column-qualifier filter and the timestamp filter. The per-attribute
  filters are then combined into a union when there is more than one.

  Args:
    attributes: Iterable of attribute prefix strings.
    timestamp_filter: Optional row filter restricting cell timestamps.

  Returns:
    A single row filter (chain, union, or plain filter) covering all
    requested attributes.
  """
  per_attribute = []
  for prefix in attributes:
    family, column = self.GetFamilyColumn(prefix)
    parts = [row_filters.FamilyNameRegexFilter(family)]
    if column:
      parts.append(row_filters.ColumnQualifierRegexFilter(column))
    if timestamp_filter:
      parts.append(timestamp_filter)
    # A single condition needs no chain wrapper.
    if len(parts) == 1:
      per_attribute.append(parts[0])
    else:
      per_attribute.append(row_filters.RowFilterChain(filters=parts))
  # Wrap in a union only when more than one attribute filter was built.
  if len(per_attribute) == 1:
    return per_attribute[0]
  return row_filters.RowFilterUnion(filters=per_attribute)
def filter_composing_interleave(project_id, instance_id, table_id):
    """Read rows matching an interleave (union) of two filters.

    A row is returned if its cells match either filter: value equal to
    "true" or column qualifier matching "os_build".

    Args:
        project_id: GCP project containing the Bigtable instance.
        instance_id: Bigtable instance ID.
        table_id: Table to read from.
    """
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    interleave = row_filters.RowFilterUnion(filters=[
        row_filters.ValueRegexFilter("true"),
        row_filters.ColumnQualifierRegexFilter("os_build"),
    ])
    for row in table.read_rows(filter_=interleave):
        print_row(row)
def get(self):
    """Read the configured row keys from Bigtable and map them to API fields.

    Reads only the columns listed in bt_mapping_dict (latest cell per
    column), translates Bigtable column names back to their API field
    names, and returns the rows as a JSON array.

    Returns:
        Tuple of (JSON string, 200, content-type header dict). On error the
        exception is logged and whatever rows were collected so far are
        returned.
    """
    bt_array = []
    try:
        table = instance.table(bt_table_name)

        # Restrict the read to the explicitly requested row keys.
        row_set = RowSet()
        for row_key in row_keys:
            row_set.add_row_key(row_key)

        # One qualifier filter per mapped Bigtable column; only the mapped
        # keys' values are needed, so iterate values() directly.
        col_filters = [
            row_filters.ColumnQualifierRegexFilter(bt_name)
            for bt_name in bt_mapping_dict.values()
        ]

        print("before read_rows...")
        rows = table.read_rows(
            row_set=row_set,
            filter_=row_filters.RowFilterChain(filters=[
                # Latest cell only, and only the mapped columns.
                row_filters.CellsColumnLimitFilter(1),
                row_filters.RowFilterUnion(filters=col_filters),
            ]),
            retry=bigtable.table.DEFAULT_RETRY_READ_ROWS.with_deadline(
                60.0))
        print("after read_rows...")

        for row in rows:
            print("Reading data for {}:".format(
                row.row_key.decode('utf-8')))
            for cf, cols in sorted(row.cells.items()):
                bt_dict = {'id': row.row_key.decode('utf-8')}
                # Translate Bigtable column names back to API field names.
                for col, cells in sorted(cols.items()):
                    for cell in cells:
                        # Reset per cell: previously `key` leaked across
                        # columns, so an unmapped column could overwrite the
                        # value stored under the prior column's key.
                        key = None
                        for name, bt_name in bt_mapping_dict.items():
                            if col.decode('utf-8') == bt_name:
                                key = name
                                break
                        if key is not None:
                            bt_dict[key] = cell.value.decode('utf-8')
                bt_array.append(bt_dict)
    except Exception as error:
        # Was `except BaseException`, which would also swallow
        # KeyboardInterrupt/SystemExit; catch only real errors.
        logging.error(
            'An exception occurred - DemoBigTableGet::get(): {}'.format(
                error))
    print(bt_array)
    return json.dumps(bt_array), 200, {'ContentType': 'application/json'}
def ResolveMulti(self, subject, attributes, timestamp=None, limit=None,
                 token=None):
  """Resolve multiple attributes for a subject.

  Results will be returned in arbitrary order (i.e. not ordered by
  attribute or timestamp).

  Args:
    subject: The subject to resolve.
    attributes: The attribute string or list of strings to match. Note this
      is an exact match, not a regex.
    timestamp: A range of times for consideration (In microseconds). Can be
      a constant such as ALL_TIMESTAMPS or NEWEST_TIMESTAMP or a tuple of
      ints (start, end).
    limit: The maximum total number of results we return.
    token: The security token used in this call.

  Yields:
    A unordered list of (attribute, value string, timestamp).

  Raises:
    AccessError: if anything goes wrong.
  """
  subject = utils.SmartStr(subject)
  self.security_manager.CheckDataStoreAccess(
      token, [subject], self.GetRequiredResolveAccess(attributes))
  # Normalize to a list of byte strings. NOTE: `basestring`/`iteritems`
  # below mean this code is Python 2 only.
  if isinstance(attributes, basestring):
    attributes = [utils.SmartStr(attributes)]
  else:
    attributes = [utils.SmartStr(x) for x in attributes]
  filter_union = []
  for attribute in attributes:
    family, column = self.GetFamilyColumn(attribute)
    # Exact column match: a range whose start and end are the same column.
    col_filter = row_filters.ColumnRangeFilter(
        family, start_column=column, end_column=column)
    filter_union.append(col_filter)
  # More than one attribute, use a union, otherwise just use the
  # existing filter.
  if len(filter_union) > 1:
    filter_union = row_filters.RowFilterUnion(filters=filter_union)
  else:
    filter_union = filter_union[0]
  # Essentially timestamp AND (attr1 OR attr2)
  timestamp_filter = self._TimestampToFilter(timestamp)
  if timestamp_filter:
    row_filter = row_filters.RowFilterChain(
        filters=[filter_union, timestamp_filter])
  else:
    row_filter = filter_union
  # Single-row read with the combined filter, retried on transient errors.
  row_data = self.CallWithRetry(
      self.table.read_row, "read", subject, filter_=row_filter)
  if row_data:
    # 2**64 acts as "effectively unlimited" when no limit was given.
    max_results = limit or 2**64
    # NOTE(review): `family` here is whatever the LAST loop iteration set,
    # so this assumes all requested attributes share one column family —
    # verify against GetFamilyColumn's contract.
    for column, cells in row_data.cells[family].iteritems():
      attribute = ":".join((family, column))
      for cell in cells:
        if max_results <= 0:
          # NOTE(review): `raise StopIteration` inside a generator is a
          # Python 2 idiom; under PEP 479 (Python 3.7+) it would become a
          # RuntimeError rather than ending iteration.
          raise StopIteration
        max_results -= 1
        yield attribute, self.Decode(
            attribute, cell.value), self.DatetimeToMicroseconds(
                cell.timestamp)
def MultiResolvePrefix(self, subjects, attribute_prefix, timestamp=None,
                       limit=None, token=None):
  """Get results from multiple rows matching multiple attributes.

  We could implement this using read_rows, but it is a table scan. Our
  current data model makes that slow because it is a directory hierarchy
  that includes entries for subdirectories interleaved. So if you want all
  the results for a directory you need to skip those in the scan.

  Instead we make an RPC for each subject all at once using a threadpool.
  We pay more in RPC overhead but we get to do it concurrently.

  Args:
    subjects: A list of subjects.
    attribute_prefix: The attribute prefix.
    timestamp: A range of times for consideration (In microseconds). Can be
      a constant such as ALL_TIMESTAMPS or NEWEST_TIMESTAMP or a tuple of
      ints (start, end).
    limit: The total number of result values to return.
    token: An ACL token.

  Yields:
    A list of tuples: (subject, [(attribute, value string, timestamp)])
    that can be simply converted to a dict. Values with the same attribute
    (happens when timestamp is not NEWEST_TIMESTAMP, but ALL_TIMESTAMPS or
    time range) are guaranteed to be ordered in the decreasing timestamp
    order.

  Raises:
    AccessError: if anything goes wrong.
    ValueError: if we get a string instead of a list of subjects.
  """
  self.security_manager.CheckDataStoreAccess(
      token, subjects, self.GetRequiredResolveAccess(attribute_prefix))
  if isinstance(subjects, basestring):
    raise ValueError("Expected list of subjects, got string: %s" % subjects)
  # Normalize to a list of byte strings. NOTE: `basestring` means this code
  # is Python 2 only.
  if isinstance(attribute_prefix, basestring):
    attribute_prefix_list = [utils.SmartStr(attribute_prefix)]
  else:
    attribute_prefix_list = [utils.SmartStr(x) for x in attribute_prefix]
  timestamp_filter = self._TimestampToFilter(timestamp)
  # Build one filter per prefix: family, optional column-prefix regex, and
  # optional timestamp restriction chained together.
  filter_union = []
  for attribute_prefix in attribute_prefix_list:
    family, column = self.GetFamilyColumn(attribute_prefix)
    family_filter = row_filters.FamilyNameRegexFilter(family)
    row_filter_list = [family_filter]
    if column:
      # Make it an actual regex
      column += ".*"
      col_filter = row_filters.ColumnQualifierRegexFilter(column)
      row_filter_list.append(col_filter)
    if timestamp_filter:
      row_filter_list.append(timestamp_filter)
    if len(row_filter_list) > 1:
      row_filter = row_filters.RowFilterChain(filters=row_filter_list)
    else:
      row_filter = row_filter_list[0]
    filter_union.append(row_filter)
  # More than one set of prefixes, use a union, otherwise just use the
  # existing filter chain.
  if len(filter_union) > 1:
    attribute_filter = row_filters.RowFilterUnion(filters=filter_union)
  else:
    attribute_filter = filter_union[0]
  # Apply those filters to each subject as a separate RPC using a threadpool
  pool_args = []
  original_subject_map = {}
  for subject in subjects:
    # List of *args, **kwargs to pass to the RPC caller
    pool_args.append(((self.table.read_row, "read",
                       utils.SmartStr(subject)), {
                           "filter_": attribute_filter
                       }))
    # We're expected to return subjects as their original type, which can be
    # URN, unicode, or string. Keep a mapping in this dict.
    original_subject_map[utils.SmartStr(subject)] = subject
  # 2**64 acts as "effectively unlimited" when no limit was given.
  max_results = limit or 2**64
  # Fan the reads out over the pool; results arrive in arbitrary order.
  for result in self.pool.imap_unordered(self._WrapCallWithRetry,
                                         pool_args):
    if max_results <= 0:
      break
    # Rows with no matching cells come back falsy and are skipped.
    if result:
      subject_results, max_results = self._GetSubjectResults(result,
                                                             max_results)
      yield original_subject_map[
          result.row_key], self._SortResultsByAttrTimestampValue(
              subject_results)