Example #1
    def _GenerateConvertedValues(self, converter, grr_messages):
        """Generates converted values using given converter from given messages.

        Groups values into batches of BATCH_SIZE and applies the converter
        to each batch.

        Args:
          converter: ExportConverter instance.
          grr_messages: An iterable (a generator is assumed) with GrrMessage values.

        Yields:
          Values generated by the converter.

        Raises:
          ValueError: if any of the GrrMessage objects doesn't have "source" set.
        """
        for batch in utils.Grouper(grr_messages, self.BATCH_SIZE):
            metadata_items = self._GetMetadataForClients(
                [gm.source for gm in batch])
            batch_with_metadata = zip(metadata_items,
                                      [gm.payload for gm in batch])

            for result in converter.BatchConvert(batch_with_metadata,
                                                 token=self.token):
                yield result
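
Every example on this page batches an iterable with utils.Grouper before doing expensive per-batch work (MultiOpen, MultiResolve, bulk writes). The real helper lives in GRR's utils module; the following is only a minimal stand-in inferred from the call sites here, assuming it yields lists of at most `size` items so that len(batch) and repeated iteration both work:

import itertools


def Grouper(iterable, size):
  """Yields successive lists of at most `size` items from `iterable`."""
  iterator = iter(iterable)
  while True:
    batch = list(itertools.islice(iterator, size))
    if not batch:
      return
    yield batch


# for batch in Grouper(range(10), 4): ...  ->  [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]
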
Example #2
    def ProcessSingleTypeExportedValues(self, original_value_type,
                                        exported_values):
        first_value = next(exported_values, None)
        if not first_value:
            return

        yield self.archive_generator.WriteFileHeader(
            "%s/%s/from_%s.yaml" %
            (self.path_prefix, first_value.__class__.__name__,
             original_value_type.__name__))
        yield self.archive_generator.WriteFileChunk(
            _SerializeToYaml(first_value))
        # Counter starts from 1, as the first value has already been written above.
        counter = 1
        for batch in utils.Grouper(exported_values, self.ROW_BATCH):
            counter += len(batch)
            buf = cStringIO.StringIO()
            for value in batch:
                buf.write("\n")
                buf.write(_SerializeToYaml(value))

            yield self.archive_generator.WriteFileChunk(buf.getvalue())
        yield self.archive_generator.WriteFileFooter()

        counts_for_original_type = self.export_counts.setdefault(
            original_value_type.__name__, dict())
        counts_for_original_type[first_value.__class__.__name__] = counter
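
For context, _SerializeToYaml above turns one exported value into a YAML document that is appended to the current file chunk. A rough stand-in (not the actual GRR helper) might look like the sketch below, assuming the value has already been reduced to a plain dict; the real code operates on RDF values:

import yaml  # PyYAML


def _SerializeToYaml(value_dict):
  # One YAML document per value; safe_dump handles quoting/escaping and
  # default_flow_style=False keeps one "key: value" pair per line.
  return yaml.safe_dump(value_dict, default_flow_style=False)


# _SerializeToYaml({"pid": 1234, "name": "init"})  ->  "name: init\npid: 1234\n"
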
Example #3
  def Stop(self, reason=None):
    super(GenericHunt, self).Stop(reason=reason)

    started_flows = grr_collections.RDFUrnCollection(
        self.started_flows_collection_urn)

    num_terminated_flows = 0
    self.Log("Hunt stop. Terminating all the started flows.")

    # Delete hunt flows states.
    for flows_batch in utils.Grouper(started_flows,
                                     self.__class__.STOP_BATCH_SIZE):
      with queue_manager.QueueManager(token=self.token) as manager:
        manager.MultiDestroyFlowStates(flows_batch)

      with data_store.DB.GetMutationPool() as mutation_pool:
        for f in flows_batch:
          flow.GRRFlow.MarkForTermination(
              f, reason="Parent hunt stopped.", mutation_pool=mutation_pool)

      num_terminated_flows += len(flows_batch)

    # Delete hunt's requests and responses to ensure no more
    # processing is going to occur.
    with queue_manager.QueueManager(token=self.token) as manager:
      manager.DestroyFlowStates(self.session_id)

    self.Log("%d flows terminated.", num_terminated_flows)
Example #4
    def Start(self):
        tmp_ttl = config.CONFIG["DataRetention.tmp_ttl"]
        if not tmp_ttl:
            self.Log("TTL not set - nothing to do...")
            return

        exception_label = config.CONFIG[
            "DataRetention.tmp_ttl_exception_label"]

        tmp_root = aff4.FACTORY.Open("aff4:/tmp", mode="r", token=self.token)
        tmp_urns = list(tmp_root.ListChildren())

        deadline = rdfvalue.RDFDatetime.Now() - tmp_ttl

        for tmp_group in utils.Grouper(tmp_urns, 10000):
            expired_tmp_urns = []
            for tmp_obj in aff4.FACTORY.MultiOpen(tmp_group,
                                                  mode="r",
                                                  token=self.token):
                if exception_label in tmp_obj.GetLabelsNames():
                    continue

                if tmp_obj.Get(tmp_obj.Schema.LAST) < deadline:
                    expired_tmp_urns.append(tmp_obj.urn)

            aff4.FACTORY.MultiDelete(expired_tmp_urns, token=self.token)
            self.HeartBeat()
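
The retention logic in this and the next example follows one pattern: compute a deadline as "now minus TTL", collect objects whose last-activity timestamp predates it, and bulk-delete per batch. In plain-Python terms (an illustrative sketch using datetime rather than GRR's RDFDatetime):

import datetime


def FindExpired(last_activity_by_urn, ttl, now=None):
  """last_activity_by_urn: dict mapping urn -> datetime of last activity."""
  now = now or datetime.datetime.utcnow()
  deadline = now - ttl
  return [urn for urn, last in last_activity_by_urn.items() if last < deadline]


# FindExpired({"aff4:/tmp/foo": datetime.datetime(2017, 1, 1)},
#             ttl=datetime.timedelta(days=30))
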
Example #5
    def Start(self):
        inactive_client_ttl = config.CONFIG[
            "DataRetention.inactive_client_ttl"]
        if not inactive_client_ttl:
            self.Log("TTL not set - nothing to do...")
            return

        exception_label = config.CONFIG[
            "DataRetention.inactive_client_ttl_exception_label"]

        index = client_index.CreateClientIndex(token=self.token)

        client_urns = index.LookupClients(["."])

        deadline = rdfvalue.RDFDatetime.Now() - inactive_client_ttl

        for client_group in utils.Grouper(client_urns, 1000):
            inactive_client_urns = []
            for client in aff4.FACTORY.MultiOpen(
                    client_group,
                    mode="r",
                    aff4_type=aff4_grr.VFSGRRClient,
                    token=self.token):
                if exception_label in client.GetLabelsNames():
                    continue

                if client.Get(client.Schema.LAST) < deadline:
                    inactive_client_urns.append(client.urn)

            aff4.FACTORY.MultiDelete(inactive_client_urns, token=self.token)
            self.HeartBeat()
Example #6
    def Start(self):
        """Retrieve all the clients for the AbstractClientStatsCollectors."""
        self.stats = aff4.FACTORY.Create(self.FILESTORE_STATS_URN,
                                         aff4_stats.FilestoreStats,
                                         mode="w",
                                         token=self.token)

        self._CreateConsumers()
        hashes = aff4.FACTORY.Open(self.HASH_PATH,
                                   token=self.token).ListChildren(limit=10**8)

        try:
            for urns in utils.Grouper(hashes, self.OPEN_FILES_LIMIT):
                for fd in aff4.FACTORY.MultiOpen(urns,
                                                 mode="r",
                                                 token=self.token,
                                                 age=aff4.NEWEST_TIME):

                    for consumer in self.consumers:
                        consumer.ProcessFile(fd)
                self.HeartBeat()

        finally:
            for consumer in self.consumers:
                consumer.Save(self.stats)
            self.stats.Close()
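
The consumer pattern used here, in miniature: every opened file is handed to each consumer, and Save() runs in a finally block so partial results are persisted even if iteration fails midway. A self-contained sketch with a dict standing in for the FilestoreStats object:

class CountingConsumer(object):

  def __init__(self):
    self.files_seen = 0

  def ProcessFile(self, fd):
    self.files_seen += 1

  def Save(self, stats):
    stats["files_seen"] = self.files_seen


def RunConsumers(fds, consumers):
  stats = {}
  try:
    for fd in fds:
      for consumer in consumers:
        consumer.ProcessFile(fd)
  finally:
    for consumer in consumers:
      consumer.Save(stats)
  return stats
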
Example #7
def _GetHWInfos(client_list, batch_size=10000, token=None):
    """Opens the given clients in batches and returns hardware information."""

    # This function returns a dict mapping each client_id to the set of
    # hardware serial numbers reported by that client.
    hw_infos = {}

    logging.info("%d clients to process.", len(client_list))

    c = 0

    for batch in utils.Grouper(client_list, batch_size):
        logging.info("Processing batch: %d-%d", c, c + batch_size)
        c += len(batch)

        client_objs = aff4.FACTORY.MultiOpen(batch,
                                             age=aff4.ALL_TIMES,
                                             token=token)

        for client in client_objs:
            hwi = client.GetValuesForAttribute(client.Schema.HARDWARE_INFO)

            hw_infos[client.urn] = set(["%s" % x.serial_number for x in hwi])

    return hw_infos
Example #8
    def ProcessSingleTypeExportedValues(self, original_value_type,
                                        exported_values):
        first_value = next(exported_values, None)
        if not first_value:
            return

        if not isinstance(first_value, rdf_structs.RDFProtoStruct):
            raise ValueError("The SQLite plugin only supports export-protos")
        yield self.archive_generator.WriteFileHeader(
            "%s/%s_from_%s.sql" %
            (self.path_prefix, first_value.__class__.__name__,
             original_value_type.__name__))
        table_name = "%s.from_%s" % (first_value.__class__.__name__,
                                     original_value_type.__name__)
        schema = self._GetSqliteSchema(first_value.__class__)

        # We will buffer the sql statements into an in-memory sql database before
        # dumping them to the zip archive. We rely on the PySQLite library for
        # string escaping.
        db_connection = sqlite3.connect(":memory:")
        db_cursor = db_connection.cursor()

        yield self.archive_generator.WriteFileChunk("BEGIN TRANSACTION;\n")
        with db_connection:
            buf = cStringIO.StringIO()
            buf.write("CREATE TABLE \"%s\" (\n  " % table_name)
            column_types = [(k, v.sqlite_type) for k, v in schema.items()]
            buf.write(",\n  ".join(
                ["\"%s\" %s" % (k, v) for k, v in column_types]))
            buf.write("\n);")
            db_cursor.execute(buf.getvalue())
            yield self.archive_generator.WriteFileChunk(buf.getvalue() + "\n")
            self._InsertValueIntoDb(table_name, schema, first_value, db_cursor)

        for sql in self._FlushAllRows(db_connection, table_name):
            yield sql
        # Counter starts from 1, as the first value has already been inserted above.
        counter = 1
        for batch in utils.Grouper(exported_values, self.ROW_BATCH):
            counter += len(batch)
            with db_connection:
                for value in batch:
                    self._InsertValueIntoDb(table_name, schema, value,
                                            db_cursor)
            for sql in self._FlushAllRows(db_connection, table_name):
                yield sql

        db_connection.close()
        yield self.archive_generator.WriteFileChunk("COMMIT;\n")
        yield self.archive_generator.WriteFileFooter()

        counts_for_original_type = self.export_counts.setdefault(
            original_value_type.__name__, dict())
        counts_for_original_type[first_value.__class__.__name__] = counter
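
The reason for the in-memory SQLite detour above is escaping: rows go in through parameterized statements, and properly quoted SQL text is then read back out and streamed into the archive. A self-contained illustration with the standard sqlite3 module (table and columns here are made up); the real plugin flushes rows via its own _FlushAllRows helper:

import sqlite3

conn = sqlite3.connect(":memory:")
cursor = conn.cursor()
cursor.execute('CREATE TABLE "Process.from_Export" ("name" TEXT, "pid" INTEGER)')
cursor.execute('INSERT INTO "Process.from_Export" VALUES (?, ?)',
               ("name with 'quotes'", 1234))
conn.commit()

# iterdump() yields CREATE/INSERT statements as correctly escaped SQL text.
for statement in conn.iterdump():
  if statement.startswith("INSERT"):
    print(statement)
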
Example #9
def CleanVacuousVersions(clients=None, dry_run=True):
    """A script to remove no-op client versions.

    This script removes client versions that are identical to the previous
    version, in the sense that no versioned attributes changed since the
    previous client version.

    Args:
      clients: A list of ClientURNs; if empty, all clients are cleaned.
      dry_run: Whether this is a dry run.
    """

    if not clients:
        index = client_index.CreateClientIndex()
        clients = index.LookupClients(["."])
    clients.sort()
    with data_store.DB.GetMutationPool() as pool:

        logging.info("checking %d clients", len(clients))
        for batch in utils.Grouper(clients, 10000):
            # TODO(amoser): This only works on datastores that use the Bigtable
            # scheme.
            client_infos = data_store.DB.MultiResolvePrefix(
                batch, ["aff4:", "aff4:"], data_store.DB.ALL_TIMESTAMPS)

            for client, type_list in client_infos:
                cleared = 0
                kept = 0
                updates = []
                for a, _, ts in type_list:
                    if ts != 0:
                        updates.append((ts, a))
                updates = sorted(updates)
                dirty = True
                for ts, a in updates:
                    if a == "aff4:type":
                        if dirty:
                            kept += 1
                            dirty = False
                        else:
                            cleared += 1
                            if not dry_run:
                                pool.DeleteAttributes(client, ["aff4:type"],
                                                      start=ts,
                                                      end=ts)
                                if pool.Size() > 1000:
                                    pool.Flush()
                    else:
                        dirty = True
                logging.info("%s: kept %d and cleared %d", client, kept,
                             cleared)
Example #10
  def GetInput(self):
    """Yield client urns."""
    client_list = GetAllClients(token=self.token)
    logging.debug("Got %d clients", len(client_list))
    for client_group in utils.Grouper(client_list, self.client_chunksize):
      for fd in aff4.FACTORY.MultiOpen(client_group,
                                       mode="r",
                                       aff4_type=aff4_grr.VFSGRRClient,
                                       token=self.token):
        if isinstance(fd, aff4_grr.VFSGRRClient):
          # Skip if older than max_age
          oldest_time = (time.time() - self.max_age) * 1e6
        if fd.Get(aff4_grr.VFSGRRClient.SchemaCls.PING) >= oldest_time:
          yield fd
Example #11
  def ProcessClients(self, unused_responses):
    """Does the work."""
    self.start = 0
    self.end = int(1e6 * (time.time() - self.MAX_AGE))

    client_urns = export_utils.GetAllClients(token=self.token)

    for batch in utils.Grouper(client_urns, 10000):
      with data_store.DB.GetMutationPool() as mutation_pool:
        for client_urn in batch:
          mutation_pool.DeleteAttributes(
              client_urn.Add("stats"), [u"aff4:stats"],
              start=self.start,
              end=self.end)
      self.HeartBeat()
Example #12
File: db.py Project: qsdj/grr
  def IterateAllClientSnapshots(self, batch_size=50000):
    """Iterates over all available clients and yields client snapshot objects.

    Args:
      batch_size: Always reads <batch_size> snapshots at a time.
    Yields:
      An rdfvalues.objects.ClientSnapshot object for each client in the db.
    """
    all_client_ids = self.ReadAllClientIDs()

    for batch in utils.Grouper(all_client_ids, batch_size):
      res = self.MultiReadClientSnapshot(batch)
      for snapshot in res.values():
        if snapshot:
          yield snapshot
Example #13
File: db.py Project: qsdj/grr
  def IterateAllClientsFullInfo(self, batch_size=50000, min_last_ping=None):
    """Iterates over all available clients and yields full info protobufs.

    Args:
      batch_size: Always reads <batch_size> client full infos at a time.
      min_last_ping: If not None, only the clients with last ping time bigger
                     than min_last_ping will be returned.
    Yields:
      An rdfvalues.objects.ClientFullInfo object for each client in the db.
    """
    all_client_ids = self.ReadAllClientIDs()

    for batch in utils.Grouper(all_client_ids, batch_size):
      res = self.MultiReadClientFullInfo(batch, min_last_ping=min_last_ping)
      for full_info in res.values():
        yield full_info
Example #14
    def _MultiStream(cls, fds):
        """Effectively streams data from multiple opened BlobImage objects.

        Args:
          fds: A list of opened AFF4Stream (or AFF4Stream descendants) objects.

        Yields:
          Tuples (fd, chunk, exception) where fd is an object from the fds
          argument and chunk is a binary blob of data.

          If one or more chunks are missing, exception is a MissingBlobsError
          object and chunk is None. _MultiStream does its best to skip the file
          entirely if one of its chunks is missing, but for very large files it's
          still possible to yield a truncated file.
        """

        broken_fds = set()
        missing_blobs_fd_pairs = []
        for chunk_fd_pairs in utils.Grouper(
                cls._GenerateChunkIds(fds),
                cls.MULTI_STREAM_CHUNKS_READ_AHEAD):
            results_map = data_store.DB.ReadBlobs(dict(chunk_fd_pairs).keys(),
                                                  token=fds[0].token)

            for chunk_id, fd in chunk_fd_pairs:
                if chunk_id not in results_map or results_map[chunk_id] is None:
                    missing_blobs_fd_pairs.append((chunk_id, fd))
                    broken_fds.add(fd)

            for chunk, fd in chunk_fd_pairs:
                if fd in broken_fds:
                    continue

                yield fd, results_map[chunk], None

        if missing_blobs_fd_pairs:
            missing_blobs_by_fd = {}
            for chunk_id, fd in missing_blobs_fd_pairs:
                missing_blobs_by_fd.setdefault(fd, []).append(chunk_id)

            for fd, missing_blobs in missing_blobs_by_fd.iteritems():
                e = MissingBlobsError("%d missing blobs (multi-stream)" %
                                      len(missing_blobs),
                                      missing_chunks=missing_blobs)
                yield fd, None, e
Example #15
    def Convert(self, values, start_index=0, end_index=None):
        """Converts given collection to exported values.

        This method uses a thread pool to do the conversion in parallel. It
        blocks until everything is converted.

        Args:
          values: Iterable object with values to convert.
          start_index: Start from this index in the collection.
          end_index: Stop processing at element (end_index - 1) of the
                     collection. If None, process until the end of the collection.

        Returns:
          Nothing. ConvertBatch() should handle the results.
        """
        if not values:
            return

        try:
            total_batch_count = len(values) / self.batch_size
        except TypeError:
            total_batch_count = -1

        pool = ThreadPool.Factory(self.threadpool_prefix, self.threadpool_size)
        val_iterator = itertools.islice(values, start_index, end_index)

        pool.Start()
        try:
            for batch_index, batch in enumerate(
                    utils.Grouper(val_iterator, self.batch_size)):
                logging.debug("Processing batch %d out of %d", batch_index,
                              total_batch_count)

                pool.AddTask(target=self.ConvertBatch,
                             args=(batch, ),
                             name="batch_%d" % batch_index,
                             inline=False)

        finally:
            pool.Stop()
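
The same fan-out idea with the standard library's thread-backed pool (a sketch, not GRR's ThreadPool API): split the input into fixed-size batches and convert the batches on worker threads. Grouper here is any batching helper like the stand-in sketched after Example #1.

from multiprocessing.dummy import Pool  # threads, not processes


def ConvertBatch(batch):
  return [str(value).upper() for value in batch]  # placeholder conversion


def ConvertAll(values, batch_size=100, threads=8):
  batches = list(Grouper(iter(values), batch_size))
  pool = Pool(threads)
  try:
    return pool.map(ConvertBatch, batches)
  finally:
    pool.close()
    pool.join()
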
Example #16
def MigrateClientVfs(client_urn):
    """Migrates entire VFS of given client to the relational data store."""
    vfs = ListVfs(client_urn)

    path_infos = []

    for vfs_urn in vfs:
        _, vfs_path = vfs_urn.Split(2)
        path_type, components = rdf_objects.ParseCategorizedPath(vfs_path)

        path_info = rdf_objects.PathInfo(path_type=path_type,
                                         components=components)
        path_infos.append(path_info)

    data_store.REL_DB.WritePathInfos(client_urn.Basename(), path_infos)

    for vfs_group in utils.Grouper(vfs, _VFS_GROUP_SIZE):
        stat_entries = dict()
        hash_entries = dict()

        for fd in aff4.FACTORY.MultiOpen(vfs_group, age=aff4.ALL_TIMES):
            _, vfs_path = fd.urn.Split(2)
            path_type, components = rdf_objects.ParseCategorizedPath(vfs_path)
            path_info = rdf_objects.PathInfo(path_type=path_type,
                                             components=components)

            for stat_entry in fd.GetValuesForAttribute(fd.Schema.STAT):
                stat_path_info = path_info.Copy()
                stat_path_info.timestamp = stat_entry.age
                stat_entries[stat_path_info] = stat_entry

            for hash_entry in fd.GetValuesForAttribute(fd.Schema.HASH):
                hash_path_info = path_info.Copy()
                hash_path_info.timestamp = hash_entry.age
                hash_entries[hash_path_info] = hash_entry

        data_store.REL_DB.MultiWritePathHistory(client_urn.Basename(),
                                                stat_entries, hash_entries)
Example #17
    def Execute(self, thread_count):
        """Runs the migration procedure.

        Args:
          thread_count: A number of threads to execute the migration with.

        Raises:
          AssertionError: If not all clients have been migrated.
          ValueError: If the relational database backend is not available.
        """
        if not data_store.RelationalDBWriteEnabled():
            raise ValueError("No relational database available.")

        sys.stdout.write("Collecting clients...\n")
        client_urns = _GetClientUrns()

        sys.stdout.write("Clients to migrate: {}\n".format(len(client_urns)))
        sys.stdout.write("Threads to use: {}\n".format(thread_count))

        self._total_count = len(client_urns)
        self._migrated_count = 0
        self._start_time = rdfvalue.RDFDatetime.Now()

        batches = utils.Grouper(client_urns, _CLIENT_BATCH_SIZE)

        self._Progress()
        tp = pool.ThreadPool(processes=thread_count)
        tp.map(self._MigrateBatch, list(batches))
        self._Progress()

        if self._migrated_count == self._total_count:
            message = "\nMigration has been finished (migrated {} clients).\n".format(
                self._migrated_count)
            sys.stdout.write(message)
        else:
            message = "Not all clients have been migrated ({}/{})".format(
                self._migrated_count, self._total_count)
            raise AssertionError(message)
Example #18
  def ProcessSingleTypeExportedValues(self, original_value_type,
                                      exported_values):
    first_value = next(exported_values, None)
    if not first_value:
      return

    yield self.archive_generator.WriteFileHeader(
        "%s/%s/from_%s.csv" % (self.path_prefix, first_value.__class__.__name__,
                               original_value_type.__name__))

    buf = cStringIO.StringIO()
    writer = csv.writer(buf)
    # Write the CSV header based on first value class and write
    # the first value itself. All other values are guaranteed
    # to have the same class (see ProcessSingleTypeExportedValues definition).
    writer.writerow(self._GetCSVHeader(first_value.__class__))
    writer.writerow(self._GetCSVRow(first_value))
    yield self.archive_generator.WriteFileChunk(buf.getvalue())

    # Counter starts from 1, as 1 value has already been written.
    counter = 1
    for batch in utils.Grouper(exported_values, self.ROW_BATCH):
      counter += len(batch)

      buf = cStringIO.StringIO()
      writer = csv.writer(buf)
      for value in batch:
        writer.writerow(self._GetCSVRow(value))

      yield self.archive_generator.WriteFileChunk(buf.getvalue())

    yield self.archive_generator.WriteFileFooter()

    self.export_counts.setdefault(
        original_value_type.__name__,
        dict())[first_value.__class__.__name__] = counter
Example #19
  def Handle(self, args, token=None):
    if args.count:
      end = args.offset + args.count
      # Read <count> clients ahead in case some of them fail to open / verify.
      batch_size = end + args.count
    else:
      end = sys.maxsize
      batch_size = end

    keywords = shlex.split(args.query)
    api_clients = []

    if data_store.RelationalDBReadEnabled():
      index = client_index.ClientIndex()

      # TODO(amoser): We could move the label verification into the
      # database making this method more efficient. Label restrictions
      # should be on small subsets though so this might not be worth
      # it.
      all_client_ids = set()
      for label in self.labels_whitelist:
        label_filter = ["label:" + label] + keywords
        all_client_ids.update(index.LookupClients(label_filter))

      index = 0
      for cid_batch in utils.Grouper(sorted(all_client_ids), batch_size):
        client_infos = data_store.REL_DB.MultiReadClientFullInfo(cid_batch)

        for _, client_info in sorted(client_infos.items()):
          if not self._VerifyLabels(client_info.labels):
            continue
          if index >= args.offset and index < end:
            api_clients.append(ApiClient().InitFromClientInfo(client_info))
          index += 1
          if index >= end:
            UpdateClientsFromFleetspeak(api_clients)
            return ApiSearchClientsResult(items=api_clients)

    else:
      index = client_index.CreateClientIndex(token=token)
      all_urns = set()
      for label in self.labels_whitelist:
        label_filter = ["label:" + label] + keywords
        all_urns.update(index.LookupClients(label_filter))

      all_objs = aff4.FACTORY.MultiOpen(
          all_urns, aff4_type=aff4_grr.VFSGRRClient, token=token)

      index = 0
      for client_obj in sorted(all_objs):
        if not self._CheckClientLabels(client_obj):
          continue
        if index >= args.offset and index < end:
          api_clients.append(ApiClient().InitFromAff4Object(client_obj))

        index += 1
        if index >= end:
          break

    UpdateClientsFromFleetspeak(api_clients)
    return ApiSearchClientsResult(items=api_clients)
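
The paging logic in the relational branch, reduced to its core: walk a sorted candidate stream, drop entries that fail label verification, and keep only those whose running index falls in [offset, offset + count). A sketch over plain data instead of client infos:

def Page(candidates, offset, count, keep=lambda c: True):
  end = offset + count
  results = []
  index = 0
  for candidate in candidates:
    if not keep(candidate):
      continue
    if offset <= index < end:
      results.append(candidate)
    index += 1
    if index >= end:
      break
  return results


# Page(range(100), offset=10, count=5)  ->  [10, 11, 12, 13, 14]
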
Example #20
    def ProcessOneHunt(self, exceptions_by_hunt):
        """Reads results for one hunt and process them."""
        hunt_results_urn, results = (
            hunts_results.HuntResultQueue.ClaimNotificationsForCollection(
                token=self.token, lease_time=self.lifetime))
        logging.debug("Found %d results for hunt %s", len(results),
                      hunt_results_urn)
        if not results:
            return 0

        hunt_urn = rdfvalue.RDFURN(hunt_results_urn.Dirname())
        batch_size = self.BATCH_SIZE
        metadata_urn = hunt_urn.Add("ResultsMetadata")
        exceptions_by_plugin = {}
        num_processed_for_hunt = 0
        collection_obj = implementation.GRRHunt.ResultCollectionForHID(
            hunt_urn)
        try:
            with aff4.FACTORY.OpenWithLock(metadata_urn,
                                           lease_time=600,
                                           token=self.token) as metadata_obj:
                all_plugins, used_plugins = self.LoadPlugins(metadata_obj)
                num_processed = int(
                    metadata_obj.Get(
                        metadata_obj.Schema.NUM_PROCESSED_RESULTS))
                for batch in utils.Grouper(results, batch_size):
                    results = list(
                        collection_obj.MultiResolve(
                            [r.value.ResultRecord() for r in batch]))
                    self.RunPlugins(hunt_urn, used_plugins, results,
                                    exceptions_by_plugin)

                    hunts_results.HuntResultQueue.DeleteNotifications(
                        batch, token=self.token)
                    num_processed += len(batch)
                    num_processed_for_hunt += len(batch)
                    self.HeartBeat()
                    metadata_obj.Set(
                        metadata_obj.Schema.NUM_PROCESSED_RESULTS(
                            num_processed))
                    metadata_obj.UpdateLease(600)
                    if self.CheckIfRunningTooLong():
                        logging.warning("Run too long, stopping.")
                        break

                metadata_obj.Set(
                    metadata_obj.Schema.OUTPUT_PLUGINS(all_plugins))
                metadata_obj.Set(
                    metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))
        except aff4.LockError:
            logging.warn(
                "ProcessHuntResultCollectionsCronFlow: "
                "Could not get lock on hunt metadata %s.", metadata_urn)
            return 0

        if exceptions_by_plugin:
            for plugin, exceptions in exceptions_by_plugin.items():
                exceptions_by_hunt.setdefault(hunt_urn, {}).setdefault(
                    plugin, []).extend(exceptions)

        logging.debug("Processed %d results.", num_processed_for_hunt)
        return len(results)
Example #21
    def Generate(self, collection, token=None):
        """Generates archive from a given collection.

        Iterates over the collection and generates an archive by yielding the
        contents of every referenced AFF4Stream.

        Args:
          collection: Iterable with items that point to aff4 paths.
          token: User's ACLToken.

        Yields:
          Binary chunks comprising the generated archive.
        """
        clients = set()
        for fd_urn_batch in utils.Grouper(self._ItemsToUrns(collection),
                                          self.BATCH_SIZE):

            fds_to_write = {}
            for fd in aff4.FACTORY.MultiOpen(fd_urn_batch, token=token):
                self.total_files += 1

                if not self.predicate(fd):
                    self.ignored_files.append(utils.SmartUnicode(fd.urn))
                    continue

                # Any file-like object with data in AFF4 should inherit AFF4Stream.
                if isinstance(fd, aff4.AFF4Stream):
                    urn_components = fd.urn.Split()
                    clients.add(rdf_client.ClientURN(urn_components[0]))

                    content_path = os.path.join(self.prefix, *urn_components)
                    self.archived_files += 1

                    # Make sure size of the original file is passed. It's required
                    # when output_writer is StreamingTarWriter.
                    st = os.stat_result(
                        (0644, 0, 0, 0, 0, 0, fd.size, 0, 0, 0))
                    fds_to_write[fd] = (content_path, st)

            if fds_to_write:
                prev_fd = None
                for fd, chunk, exception in aff4.AFF4Stream.MultiStream(
                        fds_to_write):
                    if exception:
                        logging.exception(exception)

                        self.archived_files -= 1
                        self.failed_files.append(utils.SmartUnicode(fd.urn))
                        continue

                    if prev_fd != fd:
                        if prev_fd:
                            yield self.archive_generator.WriteFileFooter()
                        prev_fd = fd

                        content_path, st = fds_to_write[fd]
                        yield self.archive_generator.WriteFileHeader(
                            content_path, st=st)

                    yield self.archive_generator.WriteFileChunk(chunk)

                if self.archive_generator.is_file_write_in_progress:
                    yield self.archive_generator.WriteFileFooter()

        if clients:
            for client_urn_batch in utils.Grouper(clients, self.BATCH_SIZE):
                for fd in aff4.FACTORY.MultiOpen(
                        client_urn_batch,
                        aff4_type=aff4_grr.VFSGRRClient,
                        token=token):
                    for chunk in self._GenerateClientInfo(fd):
                        yield chunk

        for chunk in self._GenerateDescription():
            yield chunk

        yield self.archive_generator.Close()