Example #1
    def test_grouper_with_stop_value(self):
        # given
        actual_data = utils.grouper(((i, i + 1) for i in range(0, 9)), 2)

        out = []
        for d in actual_data:
            out.append(list(d))  # force generator resolution for checks

        self.assertEqual(
            out,
            [
                [(0, 1), (1, 2)],
                [(2, 3), (3, 4)],
                [(4, 5), (5, 6)],
                [(6, 7), (7, 8)],
                [(8, 9)],
            ],
        )

        # given
        actual_data = utils.grouper((i for i in range(9, 0, -1)), 4)

        out = []
        for d in actual_data:
            out.append(list(d))  # force generator resolution for checks

        self.assertEqual(out, [[9, 8, 7, 6], [5, 4, 3, 2], [1]])
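The two tests above pin down the behaviour expected from `utils.grouper`: it slices an iterable into blocks of at most `n` elements and yields each block as an iterable (hence the `list(d)` calls to "force generator resolution"). A minimal sketch compatible with that behaviour, assuming a `zip_longest`-with-sentinel approach (the actual `swh.core.utils.grouper` implementation may differ), could look like this:

import itertools


def grouper(iterable, n):
    """Yield blocks of at most n elements of iterable, each block as a generator."""
    args = [iter(iterable)] * n
    stop_value = object()  # sentinel padding the last, possibly shorter, block
    for block in itertools.zip_longest(*args, fillvalue=stop_value):
        yield (d for d in block if d is not stop_value)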
Example #2
def random_blocks(iterable, block=100):
    """Randomize iterable per block of size block.

    Given an iterable:

    - slice the iterable in data set of block-sized elements
    - randomized the block-sized elements
    - yield each element of that randomized block-sized
    - continue onto the next block-sized block

    Args:
        iterable (Iterable): an iterable
        block (int): number of elements per block

    Yields:
        random element of the iterable

    """
    count = 0
    for iter_ in grouper(iterable, block):
        count += 1
        lst = list(iter_)
        random.shuffle(lst)
        for e in lst:
            yield e
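A small, hypothetical usage sketch (assuming `random_blocks` and the `grouper` it relies on are importable): `block` bounds how many elements are held in memory at once, while every element is still yielded exactly once:

# Elements are shuffled only within each block of 4, never lost or duplicated.
shuffled = list(random_blocks(range(10), block=4))
assert sorted(shuffled) == list(range(10))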
Example #3
    def add(self, swhids: Iterable[str], chunk_size: int, cur: sqlite3.Cursor):
        """Insert the SWHIDs into the database."""
        for swhids_chunk in grouper(swhids, chunk_size):
            cur.executemany(
                """INSERT INTO swhids VALUES (?)""",
                [(swhid,) for swhid in swhids_chunk],
            )
Example #4
    def ingest_data(self, identifier, checks=False):
        """Rework the base ingest_data.
           Request server endpoint which gives all in one go.

           Simplify and filter response list of repositories.  Inject
           repo information into local db. Queue loader tasks for
           linked repositories.

        Args:
            identifier: Resource identifier (unused)
            checks (bool): Additional checks required (unused)

        """
        response = self.safely_issue_request(identifier)
        response = self.list_packages(response)
        if not response:
            return response, []
        models_list = self.transport_response_simplified(response)
        models_list = self.filter_before_inject(models_list)
        all_injected = []
        for models in utils.grouper(models_list, n=10000):
            models = list(models)
            logging.debug('models: %s' % len(models))
            # inject into local db
            injected = self.inject_repo_data_into_db(models)
            # queue workers
            self.create_missing_origins_and_tasks(models, injected)
            all_injected.append(injected)
            # flush
            self.db_session.commit()
            self.db_session = self.mk_session()

        return response, all_injected
Example #5
def call_with_batches(
    f: Callable[[List[T1]], Iterable[T2]], args: List[T1], batch_size: int,
) -> Iterator[T2]:
    """Calls a function with batches of args, and concatenates the results.
    """
    groups = grouper(args, batch_size)
    for group in groups:
        yield from f(list(group))
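A hypothetical usage sketch (assuming `call_with_batches` and its `grouper` dependency are importable): `lookup` below stands in for any function taking a list of arguments, such as a batched API or database call, and its per-batch results are consumed as one continuous stream:

def lookup(ids):
    # stand-in for a real batched call
    return [i * 2 for i in ids]

doubled = list(call_with_batches(lookup, list(range(10)), batch_size=3))
assert doubled == [i * 2 for i in range(10)]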
Example #6
def retrieve_unknown_sha1s(swhstorage, gen_data):
    # Compute blocks of 1000 sha1s
    for paths in grouper(gen_data, n=1000):
        data = process_paths(paths)
        sha1s_tocheck = list(data.keys())
        if len(sha1s_tocheck) > 0:
            # let those inexistent sha1s flow
            for sha1 in swhstorage.content_missing_per_sha1(sha1s_tocheck):
                yield data[sha1][0], data[sha1][1]
Example #7
def test_grouper():
    # given
    actual_data = utils.grouper((i for i in range(0, 9)), 2)

    out = []
    for d in actual_data:
        out.append(list(d))  # force generator resolution for checks

    assert out == [[0, 1], [2, 3], [4, 5], [6, 7], [8]]

    # given
    actual_data = utils.grouper((i for i in range(9, 0, -1)), 4)

    out = []
    for d in actual_data:
        out.append(list(d))  # force generator resolution for checks

    assert out == [[9, 8, 7, 6], [5, 4, 3, 2], [1]]
Example #9
def store_file_content(db_url, path):
    """The first round finished, there were errors. Adapting the code and
    running this command will finish appropriately the first round.

    """
    db = Db.connect(db_url)
    with db.transaction() as cur:
        for data in grouper(load_file(path), BLOCK_SIZE):
            db.copy_to(data, 'content_sesi',
                       ['origin_sha1', 'sha1', 'sha1_git', 'sha256', 'length',
                        'path', 'corrupted'], cur)
Example #10
    def get_new_contents_metadata(
        self, all_contents: List[Dict[str, Any]]
    ) -> Generator[Tuple[Dict[str, Any], List[Any]], Any, None]:
        """Retrieve raw contents and compute new checksums on the
           contents. Unknown or corrupted contents are skipped.

        Args:
            all_contents: List of contents as dictionary with
              the necessary primary keys

        Yields:
            tuple: tuple of (content to update, list of checksums computed)

        """
        content_ids = self._read_content_ids(all_contents)
        for contents in utils.grouper(content_ids,
                                      self.batch_size_retrieve_content):
            contents_iter = itertools.tee(contents, 2)
            try:
                sha1s = [s for s in contents_iter[0]]
                content_metadata: List[
                    Optional[Content]] = self.storage.content_get(sha1s)
            except Exception:
                self.log.exception("Problem when reading contents metadata.")
                continue

            for sha1, content_model in zip(sha1s, content_metadata):
                if not content_model:
                    continue
                content: Dict = content_model.to_dict()
                # Recompute checksums provided in compute_checksums options
                if self.recompute_checksums:
                    checksums_to_compute = list(self.compute_checksums)
                else:
                    # Compute checksums provided in compute_checksums
                    # options not already defined for that content
                    checksums_to_compute = [
                        h for h in self.compute_checksums if not content.get(h)
                    ]

                if not checksums_to_compute:  # Nothing to recompute
                    continue

                try:
                    raw_content = self.objstorage.get(sha1)
                except ObjNotFoundError:
                    self.log.warning("Content %s not found in objstorage!",
                                     sha1)
                    continue

                content_hashes = hashutil.MultiHash.from_data(
                    raw_content, hash_names=checksums_to_compute).digest()
                content.update(content_hashes)
                yield content, checksums_to_compute
Example #11
def store_file_content(db_url, path):
    """The first round finished, there were errors. Adapting the code and
    running this command will finish appropriately the first round.

    """
    db = Db.connect(db_url)
    with db.transaction() as cur:
        for data in grouper(load_file(path), BLOCK_SIZE):
            db.copy_to(data, 'content_sesi', [
                'origin_sha1', 'sha1', 'sha1_git', 'sha256', 'length', 'path',
                'corrupted'
            ], cur)
Example #12
    def copy_finished(self, content_ids):
        """Once the copy is finished, we'll send those batch of contents as
        done in the destination queue.

        """
        if self.task_destination:
            groups = []
            for ids in utils.grouper(content_ids, self.batch_size):
                sig_ids = self.task_destination.s(list(ids))
                groups.append(sig_ids)

            group(groups).delay()
Example #13
    def send_origins(self, origins: Iterable[model.ListedOrigin]) -> int:
        """Record a list of :class:`model.ListedOrigin` in the scheduler.

        Returns:
          the number of listed origins recorded in the scheduler
        """
        count = 0
        for batch_origins in grouper(origins, n=1000):
            ret = self.scheduler.record_listed_origins(batch_origins)
            count += len(ret)

        return count
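A hypothetical illustration of the batching done by `send_origins`, using `swh.core.utils.grouper` and a stand-in scheduler that reports back everything it was given: 2500 origins grouped with n=1000 are recorded in three calls of sizes 1000, 1000 and 500, for a total count of 2500:

from swh.core.utils import grouper


class FakeScheduler:
    def record_listed_origins(self, origins):
        return list(origins)  # pretend every origin was recorded


origins = list(range(2500))  # stand-ins for model.ListedOrigin instances
count = 0
for batch in grouper(origins, n=1000):
    count += len(FakeScheduler().record_listed_origins(batch))
assert count == 2500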
Example #14
    def run(
        self,
        partition_id: int,
        nb_partitions: int,
        skip_existing: bool = True,
        **kwargs,
    ) -> Dict:
        """Given a partition of content ids, index the contents within.

           Either the indexer is incremental (filter out existing computed data) or it
           computes everything from scratch.

        Args:
            partition_id: Index of the partition to fetch
            nb_partitions: Total number of partitions to split into
            skip_existing: Skip existing indexed data
                (default) or not
            **kwargs: passed to the `index` method

        Returns:
            dict with the indexing task status

        """
        summary: Dict[str, Any] = {"status": "uneventful"}
        count = 0
        try:
            if skip_existing:
                gen = self._index_with_skipping_already_done(
                    partition_id, nb_partitions)
            else:
                gen = self._index_contents(partition_id,
                                           nb_partitions,
                                           indexed=set([]))

            count_object_added_key: Optional[str] = None

            for contents in utils.grouper(gen,
                                          n=self.config["write_batch_size"]):
                res = self.persist_index_computations(list(contents))
                if not count_object_added_key:
                    count_object_added_key = list(res.keys())[0]
                count += res[count_object_added_key]
                if count > 0:
                    summary["status"] = "eventful"
        except Exception:
            if not self.catch_exceptions:
                raise
            self.log.exception("Problem when computing metadata.")
            summary["status"] = "failed"

        if count > 0 and count_object_added_key:
            summary[count_object_added_key] = count
        return summary
Example #16
def compute_s3_files(db_url, block_size, limit, final, huge):
    from swh.scheduler.celery_backend.config import app
    from swh.loader.antelink import tasks  # noqa

    if db_url:
        store = storage.Storage(db_url)
        gen_data = s3_files_to_download(store, huge, final, limit)
    else:
        gen_data = utils.gen_path_length_from_stdin()

    for paths in grouper(gen_data, block_size):
        app.tasks['swh.loader.antelink.tasks.AntelinkS3DownloaderTsk'].delay(
            list(paths))
Example #17
    def mget(self, index_name, doc_ids, chunk_size=500, source=True):
        """Retrieve document's full content according to their ids as per
           source's setup.

           The `source` allows to retrieve only what's interesting, e.g:
           - source=True ; gives back the original indexed data
           - source=False ; returns without the original _source field
           - source=['task_id'] ; returns only task_id in the _source field

        Args:
            index_name (str): Name of the concerned index.
            doc_ids (generator): Generator of ids to retrieve
            chunk_size (int): Number of documents chunk to send for retrieval
            source (bool/[str]): Source of information to return

        Yields:
            document indexed as per source's setup

        """
        if isinstance(source, list):
            source = {"_source": ",".join(source)}
        else:
            source = {"_source": str(source).lower()}

        for ids in utils.grouper(doc_ids, n=chunk_size):
            res = self.storage.mget(
                body={"ids": list(ids)},
                index=index_name,
                doc_type=self.doc_type,
                params=source,
            )
            if not res:
                logger.error("Error during retrieval of data, skipping!")
                continue

            for doc in res["docs"]:
                found = doc.get("found")
                if not found:
                    msg = "Doc id %s not found, not indexed yet" % doc["_id"]
                    logger.warning(msg)
                    continue
                yield doc["_source"]
Example #18
    def get_contents_to_archive(self):
        gen_content_ids = (ids for ids in utils.grouper(
            read_sha1_from_stdin(), self.config['batch_max_size']))

        if self.force_copy:
            for content_ids in gen_content_ids:
                content_ids = list(content_ids)

                if not content_ids:
                    continue

                # Add missing entries in archiver table
                self._add_unknown_content_ids(content_ids)

                print('Send %s contents to archive' % len(content_ids))

                for content_id in content_ids:
                    # force its status to missing
                    self.archiver_storage.content_archive_update(
                        content_id, self.destination, 'missing')
                    yield content_id

        else:
            for content_ids in gen_content_ids:
                content_ids = list(content_ids)

                # Add missing entries in archiver table
                self._add_unknown_content_ids(content_ids)

                # Filter already copied data
                content_ids = list(
                    self.archiver_storage.content_archive_get_missing(
                        content_ids=content_ids,
                        backend_name=self.destination))

                if not content_ids:
                    continue

                print('Send %s contents to archive' % len(content_ids))

                for content in content_ids:
                    yield content
Example #19
    def run(self, contents: List[Dict[str, Any]]) -> Dict:
        """Given a list of content:

          - (re)compute a given set of checksums on contents available in our
            object storage
          - update those contents with the new metadata

        Args:
            contents: contents as dictionary with necessary keys.
                key present in such dictionary should be the ones defined in
                the 'primary_key' option.

        Returns:
            A summary dict with key 'status' (the task's status) and key
            'count' (the number of updated contents).

        """
        status = "uneventful"
        count = 0
        for data in utils.grouper(self.get_new_contents_metadata(contents),
                                  self.batch_size_update):

            groups: Dict[str, List[Any]] = defaultdict(list)
            for content, keys_to_update in data:
                keys_str = ",".join(keys_to_update)
                groups[keys_str].append(content)

            for keys_to_update, contents in groups.items():
                keys: List[str] = keys_to_update.split(",")
                try:
                    self.storage.content_update(contents, keys=keys)
                    count += len(contents)
                    status = "eventful"
                except Exception:
                    self.log.exception("Problem during update.")
                    continue

        return {
            "status": status,
            "count": count,
        }
Example #20
def process_origin_visits(visits, scheduler, task_names):
    task_dicts = []
    logging.debug("processing origin visits %r", visits)
    if task_names.get("origin_metadata"):
        visits = [visit for visit in visits if visit["status"] == "full"]
        visit_batches = grouper(visits, MAX_ORIGINS_PER_TASK)
        for visit_batch in visit_batches:
            visit_urls = []
            for visit in visit_batch:
                if isinstance(visit["origin"], str):
                    visit_urls.append(visit["origin"])
                else:
                    visit_urls.append(visit["origin"]["url"])
            task_dicts.append(
                create_task_dict(
                    task_names["origin_metadata"],
                    "oneshot",
                    visit_urls,
                    retries_left=1,
                ))

    if task_dicts:
        scheduler.create_tasks(task_dicts)
Example #21
def store_file_to_antelink_db_per_block(db, path):
    with db.transaction() as cur:
        for data in grouper(load_data(path), BLOCK_SIZE):
            db.copy_to(data, 'content_s3', ['sha1', 'path', 'length'], cur)
Example #22
def store_file_to_antelink_db_per_block(db, path):
    with db.transaction() as cur:
        for data in grouper(load_data(path), BLOCK_SIZE):
            db.copy_to(data, 'content_s3',
                       ['sha1', 'path', 'length'], cur)
Example #23
    def flush(
            self,
            object_types: Sequence[LObjectType] = OBJECT_TYPES
    ) -> Dict[str, int]:
        summary: Dict[str, int] = {}

        def update_summary(stats):
            for k, v in stats.items():
                summary[k] = v + summary.get(k, 0)

        for object_type in object_types:
            buffer_ = self._objects[object_type]
            if not buffer_:
                continue

            if logger.isEnabledFor(logging.DEBUG):
                log = "Flushing %s objects of type %s"
                log_args = [len(buffer_), object_type]

                if object_type == "content":
                    log += " (%s bytes)"
                    log_args.append(
                        sum(cast(Content, c).length for c in buffer_.values()))

                elif object_type == "directory":
                    log += " (%s entries)"
                    log_args.append(
                        sum(
                            len(cast(Directory, d).entries)
                            for d in buffer_.values()))

                elif object_type == "revision":
                    log += " (%s parents, %s estimated bytes)"
                    log_args.extend((
                        sum(
                            len(cast(Revision, r).parents)
                            for r in buffer_.values()),
                        sum(
                            estimate_revision_size(cast(Revision, r))
                            for r in buffer_.values()),
                    ))

                elif object_type == "release":
                    log += " (%s estimated bytes)"
                    log_args.append(
                        sum(
                            estimate_release_size(cast(Release, r))
                            for r in buffer_.values()))

                logger.debug(log, *log_args)

            batches = grouper(buffer_.values(),
                              n=self._buffer_thresholds[object_type])
            for batch in batches:
                add_fn = getattr(self.storage, "%s_add" % object_type)
                stats = add_fn(list(batch))
                update_summary(stats)

        # Flush underlying storage
        stats = self.storage.flush(object_types)
        update_summary(stats)

        self.clear_buffers(object_types)

        return summary
Example #24
def archive_tasks(
    ctx,
    before,
    after,
    batch_index,
    bulk_index,
    batch_clean,
    dry_run,
    verbose,
    cleanup,
    start_from,
):
    """Archive task/task_run whose (task_type is 'oneshot' and task_status
       is 'completed') or (task_type is 'recurring' and task_status is
       'disabled').

       With --dry-run flag set (default), only list those.

    """
    from itertools import groupby

    from swh.core.utils import grouper
    from swh.scheduler.backend_es import ElasticSearchBackend
    from swh.scheduler.utils import utcnow

    config = ctx.obj["config"]
    scheduler = ctx.obj["scheduler"]

    if not scheduler:
        raise ValueError("Scheduler class (local/remote) must be instantiated")

    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    logger = logging.getLogger(__name__)
    logging.getLogger("urllib3").setLevel(logging.WARN)
    logging.getLogger("elasticsearch").setLevel(logging.ERROR)
    if dry_run:
        logger.info("**DRY-RUN** (only reading db)")
    if not cleanup:
        logger.info("**NO CLEANUP**")

    es_storage = ElasticSearchBackend(**config)
    now = utcnow()

    # Default to archive tasks from a rolling month starting the week
    # prior to the current one
    if not before:
        before = now.shift(weeks=-1).format("YYYY-MM-DD")

    if not after:
        after = now.shift(weeks=-1).shift(months=-1).format("YYYY-MM-DD")

    logger.debug(
        "index: %s; cleanup: %s; period: [%s ; %s]"
        % (not dry_run, not dry_run and cleanup, after, before)
    )

    def get_index_name(
        data: Dict[str, Any], es_storage: ElasticSearchBackend = es_storage
    ) -> str:
        """Given a data record, determine the index's name through its ending
           date. This varies greatly depending on the task_run's
           status.

        """
        date = data.get("started")
        if not date:
            date = data["scheduled"]
        return es_storage.compute_index_name(date.year, date.month)

    def index_data(before, page_token, batch_index):
        while True:
            result = scheduler.filter_task_to_archive(
                after, before, page_token=page_token, limit=batch_index
            )
            tasks_sorted = sorted(result["tasks"], key=get_index_name)
            groups = groupby(tasks_sorted, key=get_index_name)
            for index_name, tasks_group in groups:
                logger.debug("Index tasks to %s" % index_name)
                if dry_run:
                    for task in tasks_group:
                        yield task
                    continue

                yield from es_storage.streaming_bulk(
                    index_name,
                    tasks_group,
                    source=["task_id", "task_run_id"],
                    chunk_size=bulk_index,
                )

            page_token = result.get("next_page_token")
            if page_token is None:
                break

    gen = index_data(before, page_token=start_from, batch_index=batch_index)
    if cleanup:
        for task_ids in grouper(gen, n=batch_clean):
            task_ids = list(task_ids)
            logger.info("Clean up %s tasks: [%s, ...]" % (len(task_ids), task_ids[0]))
            if dry_run:  # no clean up
                continue
            ctx.obj["scheduler"].delete_archived_tasks(task_ids)
    else:
        for task_ids in grouper(gen, n=batch_index):
            task_ids = list(task_ids)
            logger.info("Indexed %s tasks: [%s, ...]" % (len(task_ids), task_ids[0]))

    logger.debug("Done!")