def test_config_wrong_consistency_should_raise(self):
        storage_config = dict(
            cls="cassandra",
            hosts=["first"],
            port=9999,
            keyspace="any",
            consistency_level="fake",
            journal_writer={"cls": "memory"},
            objstorage={"cls": "memory"},
        )

        with pytest.raises(ValueError, match="Unknown consistency"):
            get_storage(**storage_config)
 def __init__(self):
     self.config: Dict[str, Any] = config.load_from_envvar(DEFAULT_CONFIG)
     self.scheduler: SchedulerInterface = get_scheduler(
         **self.config["scheduler"])
     self.tool = {
         "name": "swh-deposit",
         "version": __version__,
         "configuration": {
             "sword_version": "2"
         },
     }
     self.storage: StorageInterface = get_storage(**self.config["storage"])
     self.storage_metadata: StorageInterface = get_storage(
         **self.config["storage_metadata"])
def test_storage_direct_writer_anonymized(
    kafka_prefix: str, kafka_server, consumer: Consumer
):

    writer_config = {
        "cls": "kafka",
        "brokers": [kafka_server],
        "client_id": "kafka_writer",
        "prefix": kafka_prefix,
        "anonymize": True,
    }
    storage_config: Dict[str, Any] = {
        "cls": "pipeline",
        "steps": [
            {"cls": "memory", "journal_writer": writer_config},
        ],
    }

    storage = get_storage(**storage_config)

    expected_messages = 0

    for obj_type, objs in TEST_OBJECTS.items():
        if obj_type == "origin_visit":
            # these have an inconsistent API and are unrelated to what we
            # want to test here
            continue
        method = getattr(storage, obj_type + "_add")
        method(objs)
        expected_messages += len(objs)

    existing_topics = set(
        topic
        for topic in consumer.list_topics(timeout=10).topics.keys()
        if topic.startswith(kafka_prefix)
    )
    assert existing_topics == {
        f"{kafka_prefix}.{obj_type}"
        for obj_type in (
            "content",
            "directory",
            "extid",
            "metadata_authority",
            "metadata_fetcher",
            "origin",
            "origin_visit",
            "origin_visit_status",
            "raw_extrinsic_metadata",
            "release",
            "revision",
            "snapshot",
            "skipped_content",
        )
    } | {
        f"{kafka_prefix}_privileged.{obj_type}"
        for obj_type in (
            "release",
            "revision",
        )
    }
Example #4
def storage():
    """An instance of in-memory storage that gets injected
    into the CLI functions."""
    storage = get_storage(cls="memory")
    with patch("swh.storage.get_storage") as get_storage_mock:
        get_storage_mock.return_value = storage
        yield storage
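A hedged usage sketch for this fixture; the test name and origin URL below are illustrative, the point being only that code which obtains its storage through the patched swh.storage.get_storage sees the same in-memory instance the test populates.

def test_cli_sees_preloaded_data(storage):
    from swh.model.model import Origin

    # Objects added here are visible to any code under test that calls the
    # patched swh.storage.get_storage(), and vice versa.
    storage.origin_add([Origin(url="https://example.org/repo")])
    assert storage.origin_get(["https://example.org/repo"])[0] is not None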
Example #5
def schedule_origin_metadata_index(
    ctx, type, options, storage_url, origin_batch_size, page_token, limit, dry_run
):
    """Schedules tasks for origins that are already known.

    The first argument is the name of the task type, further ones are
    keyword argument(s) of the task in the form key=value, where value is
    in YAML format.

    Usage sample:

    swh-scheduler --database 'service=swh-scheduler' \
        task schedule_origins index-origin-metadata
    """
    from itertools import islice

    from swh.storage import get_storage

    from .utils import parse_options, schedule_origin_batches

    scheduler = ctx.obj["scheduler"]
    storage = get_storage("remote", url=storage_url)
    if dry_run:
        scheduler = None

    (args, kw) = parse_options(options)
    if args:
        raise click.ClickException("Only keyword arguments are allowed.")

    origins = iter_origins(storage, page_token=page_token)
    if limit:
        origins = islice(origins, limit)

    origin_urls = (origin.url for origin in origins)
    schedule_origin_batches(scheduler, type, origin_urls, origin_batch_size, kw)
Example #6
def get_config(config_file="web/web"):
    """Read the configuration file `config_file`.

       If an environment variable SWH_CONFIG_FILENAME is defined, this
       takes precedence over the config_file parameter.

       In any case, update the app with parameters (secret_key, conf)
       and return the parsed configuration as a dict.

       If no configuration file is provided, return a default
       configuration.

    """

    if not swhweb_config:
        config_filename = os.environ.get("SWH_CONFIG_FILENAME")
        if config_filename:
            config_file = config_filename
        cfg = config.load_named_config(config_file, DEFAULT_CONFIG)
        swhweb_config.update(cfg)
        config.prepare_folders(swhweb_config, "log_dir")
        if swhweb_config.get("search"):
            swhweb_config["search"] = get_search(**swhweb_config["search"])
        else:
            swhweb_config["search"] = None
        swhweb_config["storage"] = get_storage(**swhweb_config["storage"])
        swhweb_config["vault"] = get_vault(**swhweb_config["vault"])
        swhweb_config["indexer_storage"] = get_indexer_storage(
            **swhweb_config["indexer_storage"])
        swhweb_config["scheduler"] = get_scheduler(
            **swhweb_config["scheduler"])
    return swhweb_config
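A hedged sketch of the configuration shape this function consumes, inferred from the keys it reads above; the class names and URL are illustrative assumptions, not the actual DEFAULT_CONFIG.

ILLUSTRATIVE_WEB_CONFIG = {
    "log_dir": "/tmp/swh/log",    # prepared by config.prepare_folders
    "search": {"cls": "memory"},  # optional; swhweb_config["search"] is None if absent
    "storage": {"cls": "memory"},
    "vault": {"cls": "remote", "url": "http://localhost:5005/"},
    "indexer_storage": {"cls": "memory"},
    "scheduler": {"cls": "memory"},
}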
Example #7
def test_revision_extra_header_in_metadata(swh_storage_backend_config, sample_data):
    storage = get_storage(**swh_storage_backend_config)
    rev = sample_data.revision

    md_w_extra = dict(
        rev.metadata.items(),
        extra_headers=headers_to_db(
            [
                ["gpgsig", b"test123"],
                ["mergetag", b"foo\\bar"],
                ["mergetag", b"\x22\xaf\x89\x80\x01\x00"],
            ]
        ),
    )

    bw_rev = attr.evolve(rev, extra_headers=())
    object.__setattr__(bw_rev, "metadata", md_w_extra)
    assert bw_rev.extra_headers == ()

    assert storage.revision_add([bw_rev]) == {"revision:add": 1}

    # check the data in the db are in the old format
    with db_transaction(storage) as (_, cur):
        cur.execute("SELECT metadata, extra_headers FROM revision")
        metadata, extra_headers = cur.fetchone()
    assert extra_headers == []
    assert metadata == bw_rev.metadata

    # check the Revision built from revision_get is the original, "new style", Revision
    assert storage.revision_get([rev.id]) == [rev]
Example #8
def test_resolve_object_from_extids_missing_target() -> None:
    storage = get_storage("memory")

    target = b"\x01" * 20
    rel = Release(
        name=b"aaaa",
        message=b"aaaa",
        target=target,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=False,
    )

    loader = StubPackageLoader(storage, "http://example.org/")

    p_info = Mock(wraps=BasePackageInfo(None, None, None))  # type: ignore

    known_extids = {("extid-type", 0, b"extid-of-aaaa"): [rel.swhid()]}
    p_info.extid.return_value = ("extid-type", 0, b"extid-of-aaaa")
    whitelist = {rel.id}

    # Targeted release is missing from the storage
    assert loader.resolve_object_from_extids(known_extids, p_info,
                                             whitelist) is None

    storage.release_add([rel])

    # Targeted release now exists
    assert (loader.resolve_object_from_extids(known_extids, p_info,
                                              whitelist) == rel.swhid())
def _partial_copy_storage(old_storage, origin_url: str, mechanism: str,
                          copy_revisions: bool):
    """Create a new storage, and only copy ExtIDs or head revisions to it."""
    new_storage = get_storage(cls="memory")
    snapshot = snapshot_get_latest(old_storage, origin_url)
    assert snapshot
    heads = [branch.target for branch in snapshot.branches.values()]

    if mechanism == "extid":
        extids = old_storage.extid_get_from_target(ObjectType.REVISION, heads)
        new_storage.extid_add(extids)
        if copy_revisions:
            # copy revisions, but erase their metadata to make sure the loader
            # doesn't fall back to revision.metadata["nodeid"]
            revisions = [
                attr.evolve(rev, metadata={})
                for rev in old_storage.revision_get(heads) if rev
            ]
            new_storage.revision_add(revisions)

    else:
        assert mechanism == "same storage"
        return old_storage

    # copy origin, visit, status
    new_storage.origin_add(old_storage.origin_get([origin_url]))
    visit = old_storage.origin_visit_get_latest(origin_url)
    new_storage.origin_visit_add([visit])
    statuses = old_storage.origin_visit_status_get(origin_url,
                                                   visit.visit).results
    new_storage.origin_visit_status_add(statuses)
    new_storage.snapshot_add([snapshot])

    return new_storage
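A hedged usage sketch; `swh_storage` and the origin URL are placeholders for whatever the surrounding test provides.

# Copy only the ExtIDs (plus metadata-stripped head revisions) of an
# already-loaded origin into a fresh in-memory storage.
partial_storage = _partial_copy_storage(
    swh_storage, "https://example.org/repo", mechanism="extid", copy_revisions=True
)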
Example #10
def get_config(config_file='web/web'):
    """Read the configuration file `config_file`.

       If an environment variable SWH_CONFIG_FILENAME is defined, this
       takes precedence over the config_file parameter.

       In any case, update the app with parameters (secret_key, conf)
       and return the parsed configuration as a dict.

       If no configuration file is provided, return a default
       configuration.

    """

    if not swhweb_config:
        config_filename = os.environ.get('SWH_CONFIG_FILENAME')
        if config_filename:
            config_file = config_filename
        cfg = config.load_named_config(config_file, DEFAULT_CONFIG)
        swhweb_config.update(cfg)
        config.prepare_folders(swhweb_config, 'log_dir')
        swhweb_config['storage'] = get_storage(**swhweb_config['storage'])
        swhweb_config['vault'] = get_vault(**swhweb_config['vault'])
        swhweb_config['indexer_storage'] = \
            get_indexer_storage(**swhweb_config['indexer_storage'])
        swhweb_config['scheduler'] = get_scheduler(
            **swhweb_config['scheduler'])
    return swhweb_config
Example #12
    def __init__(self, config):
        self.config = config

        self.storage = get_storage(config['storage_class'],
                                   config['storage_args'])

        self.log = logging.getLogger(
            'swh.antelink.loader.AntelinkSesiInjecter')
Example #13
def storage(swh_indexer_config):
    """An instance of in-memory storage that gets injected into all indexers
       classes.

    """
    storage = get_storage(**swh_indexer_config["storage"])
    fill_storage(storage)
    return storage
    def test_config_consistency_used(self, swh_storage_backend_config):
        config_with_consistency = dict(swh_storage_backend_config,
                                       **{"consistency_level": "THREE"})

        storage = get_storage(**config_with_consistency)

        with pytest.raises(NoHostAvailable):
            storage.content_get_random()
def get_storage_with_buffer_config(**buffer_config) -> BufferingProxyStorage:
    steps = [
        {"cls": "buffer", **buffer_config},
        {"cls": "memory"},
    ]

    ret = get_storage("pipeline", steps=steps)
    assert isinstance(ret, BufferingProxyStorage)
    return ret
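A hedged usage sketch; the min_batch_size keyword is forwarded into the "buffer" step and mirrors the buffering proxy __init__ shown in a later example, while the threshold value itself is illustrative.

storage = get_storage_with_buffer_config(min_batch_size={"content": 10})
# Writes are held back until a threshold is reached; flush() is assumed to
# push any buffered objects through to the in-memory backend.
storage.flush()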
Example #16
 def __init__(self, storage):
     self.storage: StorageInterface = get_storage(**storage)
     for attribute_name in dir(StorageInterface):
         if attribute_name.startswith("_"):
             continue
         attribute = getattr(self.storage, attribute_name)
         if hasattr(attribute, "__call__"):
             setattr(self, attribute_name,
                     retry_function(self.storage, attribute_name))
Example #17
def test_resolve_object_from_extids() -> None:
    storage = get_storage("memory")
    target = b"\x01" * 20
    rel1 = Release(
        name=b"aaaa",
        message=b"aaaa",
        target=target,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=False,
    )
    rel2 = Release(
        name=b"bbbb",
        message=b"bbbb",
        target=target,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=False,
    )
    storage.release_add([rel1, rel2])

    loader = StubPackageLoader(storage, "http://example.org/")

    p_info = Mock(wraps=BasePackageInfo(None, None, None))  # type: ignore

    # The PackageInfo does not support extids
    p_info.extid.return_value = None
    known_extids = {("extid-type", 0, b"extid-of-aaaa"): [rel1.swhid()]}
    whitelist = {b"unused"}
    assert loader.resolve_object_from_extids(known_extids, p_info,
                                             whitelist) is None

    # Some known extid, and the PackageInfo is not one of them (i.e. a cache miss)
    p_info.extid.return_value = ("extid-type", 0, b"extid-of-cccc")
    assert loader.resolve_object_from_extids(known_extids, p_info,
                                             whitelist) is None

    # Some known extid, and the PackageInfo is one of them (i.e. a cache hit),
    # but the target release was not in the previous snapshot
    p_info.extid.return_value = ("extid-type", 0, b"extid-of-aaaa")
    assert loader.resolve_object_from_extids(known_extids, p_info,
                                             whitelist) is None

    # Some known extid, and the PackageInfo is one of them (i.e. a cache hit),
    # and the target release was in the previous snapshot
    whitelist = {rel1.id}
    assert (loader.resolve_object_from_extids(known_extids, p_info,
                                              whitelist) == rel1.swhid())

    # Same as before, but there is more than one extid, and only one is an allowed
    # release
    whitelist = {rel1.id}
    known_extids = {
        ("extid-type", 0, b"extid-of-aaaa"): [rel2.swhid(),
                                              rel1.swhid()]
    }
    assert (loader.resolve_object_from_extids(known_extids, p_info,
                                              whitelist) == rel1.swhid())
    def main(origin_url: str, incremental: bool) -> Dict[str, Any]:
        from swh.storage import get_storage

        storage = get_storage(cls="memory")
        loader = GitLoader(
            storage,
            origin_url,
            incremental=incremental,
        )
        return loader.load()
Example #19
def swh_storage():
    storage_config = {
        "cls": "pipeline",
        "steps": [
            {"cls": "validate"},
            {"cls": "memory"},
        ],
    }

    return get_storage(**storage_config)
    def __init__(self, config):
        self.config = config

        s3_folder = self.config['s3_folder']
        if not s3_folder.endswith('/'):
            self.config['s3_folder'] = s3_folder + '/'

        self.storage = get_storage(config['storage_class'],
                                   config['storage_args'])

        self.log = logging.getLogger('swh.antelink.loader.AntelinkS3Injecter')
Example #21
def get_tenacious_storage(**config):
    storage_config = {
        "cls": "pipeline",
        "steps": [
            {"cls": "validate"},
            {"cls": "tenacious", **config},
            {"cls": "memory"},
        ],
    }

    return get_storage(**storage_config)
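A hedged usage sketch; the error_rate_limit keys mirror the tenacious proxy __init__ shown in a later example, and the values are illustrative.

storage = get_tenacious_storage(error_rate_limit={"errors": 5, "window_size": 100})

With these illustrative values the tenacious step would tolerate up to 5 errors per sliding window of 100 operations before giving up.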
    def __init__(self) -> None:
        self.config = load_from_envvar(DEFAULT_CONFIG)
        self.storage = get_storage(**self.config["storage"])
        self.objstorage = get_objstorage(**self.config["objstorage"])
        self.compute_checksums = self.config["compute_checksums"]
        self.recompute_checksums = self.config["recompute_checksums"]
        self.batch_size_retrieve_content = self.config[
            "batch_size_retrieve_content"]
        self.batch_size_update = self.config["batch_size_update"]
        self.log = logging.getLogger("swh.indexer.rehash")

        if not self.compute_checksums:
            raise ValueError("Checksums list should not be empty.")
Example #24
def test_pypi_origin_from_project_name(mocker):
    origin_url = "https://pypi.org/project/ProjectName/"

    storage = get_storage("memory")

    revision_id = b"41" * 10
    snapshot_id = b"42" * 10
    storage.origin_add([Origin(url=origin_url)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin_url,
            visit=1,
            date=now(),
            status="partial",
            snapshot=snapshot_id,
        )
    ])
    storage.snapshot_add([
        Snapshot(
            id=snapshot_id,
            branches={
                b"foo":
                SnapshotBranch(
                    target_type=TargetType.REVISION,
                    target=revision_id,
                )
            },
        )
    ])

    class response:
        code = 200

        def read(self):
            return b'{"info": {"name": "ProjectName"}}'

    mock_urlopen = mocker.patch(
        "swh.storage.migrate_extrinsic_metadata.urlopen",
        return_value=response(),
    )

    assert (pypi_origin_from_filename(
        storage, revision_id, "ProjectName-1.0.0.tar.gz") == origin_url)
    mock_urlopen.assert_not_called()
    assert (pypi_origin_from_filename(
        storage, revision_id, "projectname-1.0.0.tar.gz") == origin_url)
    mock_urlopen.assert_called_once_with(
        "https://pypi.org/pypi/projectname/json/")
Example #25
def get_cooker(bundle_type: str, swhid: CoreSWHID):
    """Instantiate a cooker class of type bundle_type.

    Returns:
        Cooker instance in charge of cooking the bundle_type with id swhid.

    Raises:
        ValueError: in case the top-level vault configuration key or its storage
          key is missing.
        EnvironmentError: in case the vault configuration references a non-remote
          class.

    """
    if "SWH_CONFIG_FILENAME" in os.environ:
        cfg = read_config(os.environ["SWH_CONFIG_FILENAME"], DEFAULT_CONFIG)
    else:
        cfg = load_named_config(DEFAULT_CONFIG_PATH, DEFAULT_CONFIG)
    cooker_cls = get_cooker_cls(bundle_type, swhid.object_type)

    cfg = check_config(cfg)
    vcfg = cfg["vault"]

    storage = get_storage(**vcfg.pop("storage"))
    backend = get_vault(**vcfg)

    try:
        from swh.graph.client import RemoteGraphClient  # optional dependency

        graph = RemoteGraphClient(
            **vcfg["graph"]) if vcfg.get("graph") else None
    except ModuleNotFoundError:
        if vcfg.get("graph"):
            raise EnvironmentError(
                "Graph configuration required but module is not installed.")
        else:
            graph = None

    kwargs = {
        k: v
        for (k, v) in cfg.items()
        if k in ("max_bundle_size", "thread_pool_size")
    }

    return cooker_cls(
        swhid,
        backend=backend,
        storage=storage,
        graph=graph,
        **kwargs,
    )
Example #26
    def __init__(self, **config):
        self.config = config
        self.cache = VaultCache(**config["cache"])
        self.scheduler = get_scheduler(**config["scheduler"])
        self.storage = get_storage(**config["storage"])
        self.smtp_server = smtplib.SMTP(**config.get("smtp", {}))

        db_conn = config["db"]
        self._pool = psycopg2.pool.ThreadedConnectionPool(
            config.get("min_pool_conns", 1),
            config.get("max_pool_conns", 10),
            db_conn,
            cursor_factory=psycopg2.extras.RealDictCursor,
        )
        self._db = None
 def __init__(
     self,
     storage,
     error_rate_limit: Optional[Dict[str, int]] = None,
     retries: int = 3,
 ):
     self.storage = get_storage(**storage)
     if error_rate_limit is None:
         error_rate_limit = {"errors": 10, "window_size": 1000}
     assert "errors" in error_rate_limit
     assert "window_size" in error_rate_limit
     self.rate_queue = RateQueue(
         size=error_rate_limit["window_size"],
         max_errors=error_rate_limit["errors"],
     )
     self._single_object_retries: int = retries
Example #28
def test_storage_replayer_with_validation_nok_raises(
        replayer_storage_and_client, caplog, redisdb):
    """Replayer scenario with invalid objects

    with raise_on_error set to True

    This:
    - writes both valid & invalid objects to a source storage
    - a StorageArgumentException should be raised while replayer consumes
      objects from the topic and replays them
    """
    src, replayer = replayer_storage_and_client
    replayer.value_deserializer = ModelObjectDeserializer(
        validate=True, reporter=redisdb.set, raise_on_error=True).convert

    caplog.set_level(logging.ERROR, "swh.journal.replay")

    # Fill Kafka using a source storage
    nb_sent = 0
    for object_type, objects in TEST_OBJECTS.items():
        method = getattr(src, object_type + "_add")
        method(objects)
        if object_type == "origin_visit":
            # origin-visit-add adds origin-visit-status as well
            nb_sent += len(objects)
        nb_sent += len(objects)

    # insert invalid objects
    for object_type in ("revision", "directory", "release", "snapshot"):
        method = getattr(src, object_type + "_add")
        method([attr.evolve(TEST_OBJECTS[object_type][0], id=b"\x00" * 20)])
        nb_sent += 1

    # Fill the destination storage from Kafka
    dst = get_storage(cls="memory")
    worker_fn = functools.partial(process_replay_objects, storage=dst)
    with pytest.raises(StorageArgumentException):
        replayer.process(worker_fn)

    # check we do have invalid objects reported
    invalid = 0
    for record in caplog.records:
        logtext = record.getMessage()
        if WRONG_ID_REG.match(logtext):
            invalid += 1
    assert invalid == 1, "One invalid object should be detected"
    assert len(redisdb.keys()) == 1
Example #29
    def __init__(self, storage: Mapping, min_batch_size: Mapping = {}):
        self.storage: StorageInterface = get_storage(**storage)

        self._buffer_thresholds = {**DEFAULT_BUFFER_THRESHOLDS, **min_batch_size}

        self._objects: Dict[LObjectType, Dict[Tuple[str, ...], BaseModel]] = {
            k: {} for k in OBJECT_TYPES
        }
        self._contents_size: int = 0
        self._directory_entries: int = 0
        self._revision_parents: int = 0
        self._revision_size: int = 0
        self._release_size: int = 0
Example #30
def swh_storage():
    storage_config = {
        "cls": "pipeline",
        "steps": [
            {"cls": "tenacious"},
            {
                "cls": "memory",
                "journal_writer": {
                    "cls": "memory",
                },
            },
        ],
    }

    storage = get_storage(**storage_config)
    storage.journal_writer = storage.storage.journal_writer
    return storage
Example #31
def test_load_get_known_extids() -> None:
    """Checks PackageLoader.load() fetches known extids efficiently"""
    storage = Mock(wraps=get_storage("memory"))

    loader = StubPackageLoader(storage, "http://example.org")

    loader.load()

    # Calls should be grouped by extid type
    storage.extid_get_from_extid.assert_has_calls(
        [
            call("extid-type1", [b"extid-of-v1.0", b"extid-of-v2.0"],
                 version=0),
            call("extid-type2", [b"extid-of-v3.0", b"extid-of-v4.0"],
                 version=0),
        ],
        any_order=True,
    )
Example #32
def swh_storage():
    storage_config = {
        "cls": "pipeline",
        "steps": [
            {"cls": "counter", "counters": {"cls": "memory"}},
            {"cls": "memory"},
        ],
    }

    return get_storage(**storage_config)
def send_jobs(db_url, block_size, block_max_files, limit, dry_run, huge,
              storage_class, storage_args):
    """Send paths for worker to retrieve from sesi machine.

    """
    from swh.scheduler.celery_backend.config import app
    from swh.loader.antelink import tasks  # noqa

    # coerce CLI string inputs to integers
    if isinstance(block_size, str):
        block_size = int(block_size)
    if isinstance(block_max_files, str):
        block_max_files = int(block_max_files)
    if limit and isinstance(limit, str):
        limit = int(limit)
    if dry_run:
        print('** DRY RUN **')

    if db_url:
        store = storage.Storage(db_url)
        gen_data = store.read_content_sesi_not_in_swh(huge, limit)
    else:
        gen_data = utils.gen_path_length_from_stdin()

    if huge:
        task_name = 'swh.loader.antelink.tasks.AntelinkSesiInjecterHugeTsk'
    else:
        task_name = 'swh.loader.antelink.tasks.AntelinkSesiInjecterTsk'

    swhstorage = get_storage(storage_class, storage_args.split(','))
    gen_data = retrieve_unknown_sha1s(swhstorage, gen_data)

    nb_total_blocks = 0
    for paths, size in utils.split_data_per_size(gen_data, block_size,
                                                 block_max_files):
        nb_total_blocks += 1
        print('%s paths (%s bytes) sent.' % (len(paths), size))
        if dry_run:
            continue
        app.tasks[task_name].delay(paths)

    print('Number of jobs: %s' % nb_total_blocks)
def compute_s3_jobs(db_url, block_size, block_max_files, limit, dry_run,
                    final, huge, storage_class, storage_args):
    from swh.scheduler.celery_backend.config import app
    from swh.loader.antelink import tasks  # noqa

    # coerce CLI string inputs to integers
    if isinstance(block_size, str):
        block_size = int(block_size)
    if isinstance(block_max_files, str):
        block_max_files = int(block_max_files)
    if limit and isinstance(limit, str):
        limit = int(limit)
    if dry_run:
        print('** DRY RUN **')

    swhstorage = get_storage(storage_class, storage_args.split(','))

    if db_url:
        store = storage.Storage(db_url)
        gen_data = retrieve_unknown_sha1s(
            swhstorage,
            store.read_content_s3_not_in_sesi_nor_in_swh(huge, final, limit))
    else:
        gen_data = retrieve_unknown_sha1s(swhstorage,
                                          utils.gen_path_length_from_stdin())

    # NOTE: task_name is not defined in this snippet; by analogy with send_jobs
    # above it presumably selects the S3 injecter task based on `huge`. The
    # assignment below is an assumption, not part of the original code.
    task_name = ('swh.loader.antelink.tasks.AntelinkS3InjecterHugeTsk'
                 if huge else 'swh.loader.antelink.tasks.AntelinkS3InjecterTsk')

    nb_total_blocks = 0
    for paths, size in utils.split_data_per_size(gen_data, block_size,
                                                 block_max_files):
        nb_total_blocks += 1
        print('%s paths (%s bytes) sent.' % (len(paths), size))
        if dry_run:
            continue
        app.tasks[task_name].delay(paths)

    print('Number of jobs: %s' % nb_total_blocks)
    def run(self, *package_names):
        """Load the history of the given package from snapshot.debian.org"""

        config = self.config

        snapshot = SnapshotDebianOrg(
            connstr=config['snapshot_connstr'],
            basedir=config['snapshot_basedir'],
        )

        storage = get_storage(
            config['storage_class'],
            config['storage_args'],
        )

        swh_authority_dt = open(
            os.path.join(config['snapshot_basedir'], 'TIMESTAMP')
        ).read()

        swh_authority = {
            'authority': '5f4d4c51-498a-4e28-88b3-b3e4e8396cba',
            'validity': dateutil.parser.parse(swh_authority_dt),
        }

        tmpdir = tempfile.mkdtemp()
        os.makedirs(os.path.join(tmpdir, 'source'))

        pkgs = snapshot.prepare_packages(
            package_names,
            os.path.join(tmpdir, 'source'),
            log=self.log,
        )
        origins = snapshot.prepare_origins(package_names, storage)

        closed = False
        fetch_histories = {}
        for origin in origins.values():
            id = origin['id']
            fetch_histories[id] = storage.fetch_history_start(id)

        try:
            sorted_pkgs = []
            for p in pkgs.values():
                p['origin_id'] = origins[p['name']]['id']
                sorted_pkgs.append(p)

            sorted_pkgs.sort(key=lambda p: (p['name'], p['version']))

            partial = {}
            for partial in process_source_packages(
                    sorted_pkgs,
                    config['keyrings'],
                    tmpdir,
                    log=self.log,
            ):

                try_flush_partial(
                    storage, partial,
                    content_packet_size=config['content_packet_size'],
                    content_packet_length=config['content_packet_length'],
                    content_max_length_one=config['content_max_length_one'],
                    directory_packet_size=config['directory_packet_size'],
                    log=self.log,
                )

            if partial:
                try_flush_partial(
                    storage, partial,
                    content_packet_size=config['content_packet_size'],
                    content_packet_length=config['content_packet_length'],
                    content_max_length_one=config['content_max_length_one'],
                    directory_packet_size=config['directory_packet_size'],
                    force=True,
                    log=self.log,
                )

                packages = flush_revision(storage, partial, log=self.log)

                packages_w_revs = flush_release(
                    storage,
                    packages,
                    log=self.log
                )

                flush_occurrences(storage, packages_w_revs, [swh_authority],
                                  log=self.log)

                for fh in fetch_histories.values():
                    storage.fetch_history_end(fh, {'status': True})
                closed = True
        finally:
            shutil.rmtree(tmpdir)
            if not closed:
                data = {
                    'status': False,
                    'stderr': traceback.format_exc(),
                }

                for fh in fetch_histories.values():
                    storage.fetch_history_end(fh, data)