def test_config_wrong_consistency_should_raise(self):
    storage_config = dict(
        cls="cassandra",
        hosts=["first"],
        port=9999,
        keyspace="any",
        consistency_level="fake",
        journal_writer={"cls": "memory"},
        objstorage={"cls": "memory"},
    )

    with pytest.raises(ValueError, match="Unknown consistency"):
        get_storage(**storage_config)

def __init__(self):
    self.config: Dict[str, Any] = config.load_from_envvar(DEFAULT_CONFIG)
    self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"])
    self.tool = {
        "name": "swh-deposit",
        "version": __version__,
        "configuration": {"sword_version": "2"},
    }
    self.storage: StorageInterface = get_storage(**self.config["storage"])
    self.storage_metadata: StorageInterface = get_storage(
        **self.config["storage_metadata"]
    )

def test_storage_direct_writer_anonymized(
    kafka_prefix: str, kafka_server, consumer: Consumer
):
    writer_config = {
        "cls": "kafka",
        "brokers": [kafka_server],
        "client_id": "kafka_writer",
        "prefix": kafka_prefix,
        "anonymize": True,
    }
    storage_config: Dict[str, Any] = {
        "cls": "pipeline",
        "steps": [
            {"cls": "memory", "journal_writer": writer_config},
        ],
    }
    storage = get_storage(**storage_config)

    expected_messages = 0

    for obj_type, objs in TEST_OBJECTS.items():
        if obj_type == "origin_visit":
            # these have an inconsistent API and are unrelated to what we
            # want to test here
            continue
        method = getattr(storage, obj_type + "_add")
        method(objs)
        expected_messages += len(objs)

    existing_topics = set(
        topic
        for topic in consumer.list_topics(timeout=10).topics.keys()
        if topic.startswith(kafka_prefix)
    )
    assert existing_topics == {
        f"{kafka_prefix}.{obj_type}"
        for obj_type in (
            "content",
            "directory",
            "extid",
            "metadata_authority",
            "metadata_fetcher",
            "origin",
            "origin_visit",
            "origin_visit_status",
            "raw_extrinsic_metadata",
            "release",
            "revision",
            "snapshot",
            "skipped_content",
        )
    } | {
        f"{kafka_prefix}_privileged.{obj_type}"
        for obj_type in (
            "release",
            "revision",
        )
    }

def storage():
    """An instance of in-memory storage that gets injected into the CLI
    functions."""
    storage = get_storage(cls="memory")
    with patch("swh.storage.get_storage") as get_storage_mock:
        get_storage_mock.return_value = storage
        yield storage

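# Hypothetical usage sketch for the fixture above (assumptions: the function is
# registered as a pytest fixture and the test name, URL and Origin import are
# illustrative). Any code path that calls swh.storage.get_storage() receives the
# same in-memory instance, so state written through the fixture is visible to the
# code under test.
def test_cli_sees_fixture_storage(storage):
    from swh.model.model import Origin

    origin = Origin(url="https://example.org/repo")
    storage.origin_add([origin])
    assert storage.origin_get([origin.url]) == [origin]
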
def schedule_origin_metadata_index(
    ctx, type, options, storage_url, origin_batch_size, page_token, limit, dry_run
):
    """Schedules tasks for origins that are already known.

    The first argument is the name of the task type, further ones are keyword
    arguments of the task in the form key=value, where value is in YAML format.

    Usage sample:

    swh-scheduler --database 'service=swh-scheduler' \
        task schedule_origins index-origin-metadata
    """
    from itertools import islice

    from swh.storage import get_storage

    from .utils import parse_options, schedule_origin_batches

    scheduler = ctx.obj["scheduler"]
    storage = get_storage("remote", url=storage_url)

    if dry_run:
        scheduler = None

    (args, kw) = parse_options(options)
    if args:
        raise click.ClickException("Only keyword arguments are allowed.")

    origins = iter_origins(storage, page_token=page_token)
    if limit:
        origins = islice(origins, limit)

    origin_urls = (origin.url for origin in origins)
    schedule_origin_batches(scheduler, type, origin_urls, origin_batch_size, kw)

def get_config(config_file="web/web"): """Read the configuration file `config_file`. If an environment variable SWH_CONFIG_FILENAME is defined, this takes precedence over the config_file parameter. In any case, update the app with parameters (secret_key, conf) and return the parsed configuration as a dict. If no configuration file is provided, return a default configuration. """ if not swhweb_config: config_filename = os.environ.get("SWH_CONFIG_FILENAME") if config_filename: config_file = config_filename cfg = config.load_named_config(config_file, DEFAULT_CONFIG) swhweb_config.update(cfg) config.prepare_folders(swhweb_config, "log_dir") if swhweb_config.get("search"): swhweb_config["search"] = get_search(**swhweb_config["search"]) else: swhweb_config["search"] = None swhweb_config["storage"] = get_storage(**swhweb_config["storage"]) swhweb_config["vault"] = get_vault(**swhweb_config["vault"]) swhweb_config["indexer_storage"] = get_indexer_storage( **swhweb_config["indexer_storage"]) swhweb_config["scheduler"] = get_scheduler( **swhweb_config["scheduler"]) return swhweb_config
def test_revision_extra_header_in_metadata(swh_storage_backend_config, sample_data):
    storage = get_storage(**swh_storage_backend_config)
    rev = sample_data.revision

    md_w_extra = dict(
        rev.metadata.items(),
        extra_headers=headers_to_db(
            [
                ["gpgsig", b"test123"],
                ["mergetag", b"foo\\bar"],
                ["mergetag", b"\x22\xaf\x89\x80\x01\x00"],
            ]
        ),
    )

    bw_rev = attr.evolve(rev, extra_headers=())
    object.__setattr__(bw_rev, "metadata", md_w_extra)

    assert bw_rev.extra_headers == ()

    assert storage.revision_add([bw_rev]) == {"revision:add": 1}

    # check that the data in the db are in the old format
    with db_transaction(storage) as (_, cur):
        cur.execute("SELECT metadata, extra_headers FROM revision")
        metadata, extra_headers = cur.fetchone()
    assert extra_headers == []
    assert metadata == bw_rev.metadata

    # check that the Revision built from revision_get is the original,
    # "new style", Revision
    assert storage.revision_get([rev.id]) == [rev]

def test_resolve_object_from_extids_missing_target() -> None:
    storage = get_storage("memory")
    target = b"\x01" * 20
    rel = Release(
        name=b"aaaa",
        message=b"aaaa",
        target=target,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=False,
    )

    loader = StubPackageLoader(storage, "http://example.org/")
    p_info = Mock(wraps=BasePackageInfo(None, None, None))  # type: ignore

    known_extids = {("extid-type", 0, b"extid-of-aaaa"): [rel.swhid()]}
    p_info.extid.return_value = ("extid-type", 0, b"extid-of-aaaa")
    whitelist = {rel.id}

    # Targeted release is missing from the storage
    assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None

    storage.release_add([rel])

    # Targeted release now exists
    assert (
        loader.resolve_object_from_extids(known_extids, p_info, whitelist)
        == rel.swhid()
    )

def _partial_copy_storage(
    old_storage, origin_url: str, mechanism: str, copy_revisions: bool
):
    """Create a new storage, and only copy ExtIDs or head revisions to it."""
    new_storage = get_storage(cls="memory")
    snapshot = snapshot_get_latest(old_storage, origin_url)
    assert snapshot
    heads = [branch.target for branch in snapshot.branches.values()]

    if mechanism == "extid":
        extids = old_storage.extid_get_from_target(ObjectType.REVISION, heads)
        new_storage.extid_add(extids)
        if copy_revisions:
            # copy revisions, but erase their metadata to make sure the loader
            # does not fall back to revision.metadata["nodeid"]
            revisions = [
                attr.evolve(rev, metadata={})
                for rev in old_storage.revision_get(heads)
                if rev
            ]
            new_storage.revision_add(revisions)
    else:
        assert mechanism == "same storage"
        return old_storage

    # copy origin, visit, status
    new_storage.origin_add(old_storage.origin_get([origin_url]))
    visit = old_storage.origin_visit_get_latest(origin_url)
    new_storage.origin_visit_add([visit])
    statuses = old_storage.origin_visit_status_get(origin_url, visit.visit).results
    new_storage.origin_visit_status_add(statuses)
    new_storage.snapshot_add([snapshot])

    return new_storage

def get_config(config_file='web/web'):
    """Read the configuration file `config_file`.

    If an environment variable SWH_CONFIG_FILENAME is defined, this takes
    precedence over the config_file parameter.

    In any case, update the app with parameters (secret_key, conf) and return
    the parsed configuration as a dict.

    If no configuration file is provided, return a default configuration.
    """
    if not swhweb_config:
        config_filename = os.environ.get('SWH_CONFIG_FILENAME')
        if config_filename:
            config_file = config_filename
        cfg = config.load_named_config(config_file, DEFAULT_CONFIG)
        swhweb_config.update(cfg)
        config.prepare_folders(swhweb_config, 'log_dir')
        swhweb_config['storage'] = get_storage(**swhweb_config['storage'])
        swhweb_config['vault'] = get_vault(**swhweb_config['vault'])
        swhweb_config['indexer_storage'] = \
            get_indexer_storage(**swhweb_config['indexer_storage'])
        swhweb_config['scheduler'] = get_scheduler(
            **swhweb_config['scheduler'])
    return swhweb_config

def __init__(self, config):
    self.config = config
    self.storage = get_storage(config['storage_class'],
                               config['storage_args'])
    self.log = logging.getLogger(
        'swh.antelink.loader.AntelinkSesiInjecter')

def storage(swh_indexer_config):
    """An instance of in-memory storage that gets injected into all indexer
    classes.
    """
    storage = get_storage(**swh_indexer_config["storage"])
    fill_storage(storage)
    return storage

def test_config_consistency_used(self, swh_storage_backend_config):
    config_with_consistency = dict(
        swh_storage_backend_config, **{"consistency_level": "THREE"}
    )
    storage = get_storage(**config_with_consistency)

    with pytest.raises(NoHostAvailable):
        storage.content_get_random()

def get_storage_with_buffer_config(**buffer_config) -> BufferingProxyStorage:
    steps = [
        {"cls": "buffer", **buffer_config},
        {"cls": "memory"},
    ]

    ret = get_storage("pipeline", steps=steps)
    assert isinstance(ret, BufferingProxyStorage)
    return ret

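# Minimal usage sketch for the helper above (assumption: the buffer proxy step
# accepts the "min_batch_size" keyword shown in its __init__ elsewhere in this
# collection; the threshold values are illustrative).
buffered = get_storage_with_buffer_config(
    min_batch_size={"content": 10, "revision": 100}
)
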
def __init__(self, storage):
    self.storage: StorageInterface = get_storage(**storage)

    for attribute_name in dir(StorageInterface):
        if attribute_name.startswith("_"):
            continue
        attribute = getattr(self.storage, attribute_name)
        if hasattr(attribute, "__call__"):
            setattr(
                self, attribute_name, retry_function(self.storage, attribute_name)
            )

def test_resolve_object_from_extids() -> None:
    storage = get_storage("memory")
    target = b"\x01" * 20
    rel1 = Release(
        name=b"aaaa",
        message=b"aaaa",
        target=target,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=False,
    )
    rel2 = Release(
        name=b"bbbb",
        message=b"bbbb",
        target=target,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=False,
    )
    storage.release_add([rel1, rel2])

    loader = StubPackageLoader(storage, "http://example.org/")

    p_info = Mock(wraps=BasePackageInfo(None, None, None))  # type: ignore

    # The PackageInfo does not support extids
    p_info.extid.return_value = None
    known_extids = {("extid-type", 0, b"extid-of-aaaa"): [rel1.swhid()]}
    whitelist = {b"unused"}
    assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None

    # Some known extids, but the PackageInfo is not one of them (i.e. cache miss)
    p_info.extid.return_value = ("extid-type", 0, b"extid-of-cccc")
    assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None

    # Some known extids, and the PackageInfo is one of them (i.e. cache hit),
    # but the target release was not in the previous snapshot
    p_info.extid.return_value = ("extid-type", 0, b"extid-of-aaaa")
    assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None

    # Some known extids, and the PackageInfo is one of them (i.e. cache hit),
    # and the target release was in the previous snapshot
    whitelist = {rel1.id}
    assert (
        loader.resolve_object_from_extids(known_extids, p_info, whitelist)
        == rel1.swhid()
    )

    # Same as before, but there is more than one extid, and only one is an
    # allowed release
    whitelist = {rel1.id}
    known_extids = {("extid-type", 0, b"extid-of-aaaa"): [rel2.swhid(), rel1.swhid()]}
    assert (
        loader.resolve_object_from_extids(known_extids, p_info, whitelist)
        == rel1.swhid()
    )

def main(origin_url: str, incremental: bool) -> Dict[str, Any]:
    from swh.storage import get_storage

    storage = get_storage(cls="memory")
    loader = GitLoader(
        storage,
        origin_url,
        incremental=incremental,
    )
    return loader.load()

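# Invocation sketch for main() above (the repository URL is an illustrative
# assumption); loader.load() returns a summary dict whose "status" key is expected
# to be "eventful", "uneventful" or "failed".
if __name__ == "__main__":
    result = main("https://example.org/some/repo.git", incremental=False)
    print(result["status"])
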
def swh_storage(): storage_config = { "cls": "pipeline", "steps": [ {"cls": "validate"}, {"cls": "memory"}, ], } return get_storage(**storage_config)
def __init__(self, config):
    self.config = config
    s3_folder = self.config['s3_folder']
    if not s3_folder.endswith('/'):
        self.config['s3_folder'] = s3_folder + '/'

    self.storage = get_storage(config['storage_class'],
                               config['storage_args'])
    self.log = logging.getLogger('swh.antelink.loader.AntelinkS3Injecter')

def get_tenacious_storage(**config):
    storage_config = {
        "cls": "pipeline",
        "steps": [
            {"cls": "validate"},
            {"cls": "tenacious", **config},
            {"cls": "memory"},
        ],
    }

    return get_storage(**storage_config)

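# Usage sketch for the helper above (keyword names taken from the tenacious proxy
# __init__ shown elsewhere in this collection; the values are illustrative).
tenacious = get_tenacious_storage(
    error_rate_limit={"errors": 5, "window_size": 100}, retries=2
)
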
def __init__(self, config):
    self.config = config
    s3_folder = self.config['s3_folder']
    if not s3_folder.endswith('/'):
        self.config['s3_folder'] = s3_folder + '/'

    self.storage = get_storage(config['storage_class'],
                               config['storage_args'])
    self.log = logging.getLogger(
        'swh.antelink.loader.AntelinkS3Injecter')

def __init__(self) -> None:
    self.config = load_from_envvar(DEFAULT_CONFIG)
    self.storage = get_storage(**self.config["storage"])
    self.objstorage = get_objstorage(**self.config["objstorage"])
    self.compute_checksums = self.config["compute_checksums"]
    self.recompute_checksums = self.config["recompute_checksums"]
    self.batch_size_retrieve_content = self.config["batch_size_retrieve_content"]
    self.batch_size_update = self.config["batch_size_update"]
    self.log = logging.getLogger("swh.indexer.rehash")

    if not self.compute_checksums:
        raise ValueError("Checksums list should not be empty.")

def test_pypi_origin_from_project_name(mocker):
    origin_url = "https://pypi.org/project/ProjectName/"

    storage = get_storage("memory")

    revision_id = b"41" * 10
    snapshot_id = b"42" * 10
    storage.origin_add([Origin(url=origin_url)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")]
    )
    storage.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=origin_url,
                visit=1,
                date=now(),
                status="partial",
                snapshot=snapshot_id,
            )
        ]
    )
    storage.snapshot_add(
        [
            Snapshot(
                id=snapshot_id,
                branches={
                    b"foo": SnapshotBranch(
                        target_type=TargetType.REVISION,
                        target=revision_id,
                    )
                },
            )
        ]
    )

    class response:
        code = 200

        def read(self):
            return b'{"info": {"name": "ProjectName"}}'

    mock_urlopen = mocker.patch(
        "swh.storage.migrate_extrinsic_metadata.urlopen",
        return_value=response(),
    )

    assert (
        pypi_origin_from_filename(storage, revision_id, "ProjectName-1.0.0.tar.gz")
        == origin_url
    )
    mock_urlopen.assert_not_called()
    assert (
        pypi_origin_from_filename(storage, revision_id, "projectname-1.0.0.tar.gz")
        == origin_url
    )
    mock_urlopen.assert_called_once_with("https://pypi.org/pypi/projectname/json/")

def get_cooker(bundle_type: str, swhid: CoreSWHID):
    """Instantiate a cooker class of type bundle_type.

    Returns:
        Cooker class in charge of cooking the bundle_type with id swhid.

    Raises:
        ValueError in case of a missing top-level vault key configuration
        or a storage key.
        EnvironmentError in case the vault configuration references a
        non-remote class.
    """
    if "SWH_CONFIG_FILENAME" in os.environ:
        cfg = read_config(os.environ["SWH_CONFIG_FILENAME"], DEFAULT_CONFIG)
    else:
        cfg = load_named_config(DEFAULT_CONFIG_PATH, DEFAULT_CONFIG)
    cooker_cls = get_cooker_cls(bundle_type, swhid.object_type)

    cfg = check_config(cfg)
    vcfg = cfg["vault"]

    storage = get_storage(**vcfg.pop("storage"))
    backend = get_vault(**vcfg)

    try:
        from swh.graph.client import RemoteGraphClient  # optional dependency

        graph = RemoteGraphClient(**vcfg["graph"]) if vcfg.get("graph") else None
    except ModuleNotFoundError:
        if vcfg.get("graph"):
            raise EnvironmentError(
                "Graph configuration required but module is not installed."
            )
        else:
            graph = None

    kwargs = {
        k: v
        for (k, v) in cfg.items()
        if k in ("max_bundle_size", "thread_pool_size")
    }

    return cooker_cls(
        swhid,
        backend=backend,
        storage=storage,
        graph=graph,
        **kwargs,
    )

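# Hedged sketch of the configuration shape get_cooker() reads after check_config(),
# inferred only from the lookups above; the remote vault "cls"/"url" values, the
# graph URL and the bundle-size figure are assumptions, not documented defaults.
example_cfg = {
    "vault": {
        "cls": "remote",
        "url": "http://localhost:5005/",
        "storage": {"cls": "memory"},
        # "graph": {"url": "http://localhost:5009/graph"},  # optional
    },
    "max_bundle_size": 2 ** 29,
}
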
def __init__(self, **config):
    self.config = config
    self.cache = VaultCache(**config["cache"])
    self.scheduler = get_scheduler(**config["scheduler"])
    self.storage = get_storage(**config["storage"])
    self.smtp_server = smtplib.SMTP(**config.get("smtp", {}))

    db_conn = config["db"]
    self._pool = psycopg2.pool.ThreadedConnectionPool(
        config.get("min_pool_conns", 1),
        config.get("max_pool_conns", 10),
        db_conn,
        cursor_factory=psycopg2.extras.RealDictCursor,
    )
    self._db = None

def __init__(
    self,
    storage,
    error_rate_limit: Optional[Dict[str, int]] = None,
    retries: int = 3,
):
    self.storage = get_storage(**storage)

    if error_rate_limit is None:
        error_rate_limit = {"errors": 10, "window_size": 1000}
    assert "errors" in error_rate_limit
    assert "window_size" in error_rate_limit
    self.rate_queue = RateQueue(
        size=error_rate_limit["window_size"],
        max_errors=error_rate_limit["errors"],
    )
    self._single_object_retries: int = retries

def test_storage_replayer_with_validation_nok_raises(
    replayer_storage_and_client, caplog, redisdb
):
    """Replayer scenario with invalid objects and raise_on_error set to True.

    This:
    - writes both valid and invalid objects to a source storage
    - expects a StorageArgumentException to be raised while the replayer
      consumes objects from the topic and replays them
    """
    src, replayer = replayer_storage_and_client
    replayer.value_deserializer = ModelObjectDeserializer(
        validate=True, reporter=redisdb.set, raise_on_error=True
    ).convert

    caplog.set_level(logging.ERROR, "swh.journal.replay")

    # Fill Kafka using a source storage
    nb_sent = 0
    for object_type, objects in TEST_OBJECTS.items():
        method = getattr(src, object_type + "_add")
        method(objects)
        if object_type == "origin_visit":
            # origin-visit-add adds origin-visit-status as well
            nb_sent += len(objects)
        nb_sent += len(objects)

    # insert invalid objects
    for object_type in ("revision", "directory", "release", "snapshot"):
        method = getattr(src, object_type + "_add")
        method([attr.evolve(TEST_OBJECTS[object_type][0], id=b"\x00" * 20)])
        nb_sent += 1

    # Fill the destination storage from Kafka
    dst = get_storage(cls="memory")
    worker_fn = functools.partial(process_replay_objects, storage=dst)
    with pytest.raises(StorageArgumentException):
        replayer.process(worker_fn)

    # check we do have invalid objects reported
    invalid = 0
    for record in caplog.records:
        logtext = record.getMessage()
        if WRONG_ID_REG.match(logtext):
            invalid += 1
    assert invalid == 1, "One invalid object should be detected"
    assert len(redisdb.keys()) == 1

def __init__(self, storage: Mapping, min_batch_size: Mapping = {}):
    self.storage: StorageInterface = get_storage(**storage)

    self._buffer_thresholds = {**DEFAULT_BUFFER_THRESHOLDS, **min_batch_size}

    self._objects: Dict[LObjectType, Dict[Tuple[str, ...], BaseModel]] = {
        k: {} for k in OBJECT_TYPES
    }
    self._contents_size: int = 0
    self._directory_entries: int = 0
    self._revision_parents: int = 0
    self._revision_size: int = 0
    self._release_size: int = 0

def swh_storage(): storage_config = { "cls": "pipeline", "steps": [ {"cls": "tenacious"}, { "cls": "memory", "journal_writer": { "cls": "memory", }, }, ], } storage = get_storage(**storage_config) storage.journal_writer = storage.storage.journal_writer return storage
def test_load_get_known_extids() -> None:
    """Checks PackageLoader.load() fetches known extids efficiently"""
    storage = Mock(wraps=get_storage("memory"))
    loader = StubPackageLoader(storage, "http://example.org")

    loader.load()

    # Calls should be grouped by extid type
    storage.extid_get_from_extid.assert_has_calls(
        [
            call("extid-type1", [b"extid-of-v1.0", b"extid-of-v2.0"], version=0),
            call("extid-type2", [b"extid-of-v3.0", b"extid-of-v4.0"], version=0),
        ],
        any_order=True,
    )

def swh_storage():
    storage_config = {
        "cls": "pipeline",
        "steps": [
            {"cls": "counter", "counters": {"cls": "memory"}},
            {"cls": "memory"},
        ],
    }

    return get_storage(**storage_config)

def send_jobs(db_url, block_size, block_max_files, limit, dry_run, huge,
              storage_class, storage_args):
    """Send paths for workers to retrieve from the sesi machine."""
    from swh.scheduler.celery_backend.config import app
    from swh.loader.antelink import tasks  # noqa

    # right inputs
    if isinstance(block_size, str):
        block_size = int(block_size)
    if isinstance(block_max_files, str):
        block_max_files = int(block_max_files)
    if limit and isinstance(limit, str):
        limit = int(limit)

    if dry_run:
        print('** DRY RUN **')

    if db_url:
        store = storage.Storage(db_url)
        gen_data = store.read_content_sesi_not_in_swh(huge, limit)
    else:
        gen_data = utils.gen_path_length_from_stdin()

    if huge:
        task_name = 'swh.loader.antelink.tasks.AntelinkSesiInjecterHugeTsk'
    else:
        task_name = 'swh.loader.antelink.tasks.AntelinkSesiInjecterTsk'

    swhstorage = get_storage(storage_class, storage_args.split(','))
    gen_data = retrieve_unknown_sha1s(swhstorage, gen_data)

    nb_total_blocks = 0
    for paths, size in utils.split_data_per_size(gen_data, block_size,
                                                 block_max_files):
        nb_total_blocks += 1
        print('%s paths (%s bytes) sent.' % (len(paths), size))
        if dry_run:
            continue
        app.tasks[task_name].delay(paths)

    print('Number of jobs: %s' % nb_total_blocks)

def compute_s3_jobs(db_url, block_size, block_max_files, limit, dry_run, final,
                    huge, storage_class, storage_args):
    from swh.scheduler.celery_backend.config import app
    from swh.loader.antelink import tasks  # noqa

    # right inputs
    if isinstance(block_size, str):
        block_size = int(block_size)
    if isinstance(block_max_files, str):
        block_max_files = int(block_max_files)
    if limit and isinstance(limit, str):
        limit = int(limit)

    if dry_run:
        print('** DRY RUN **')

    swhstorage = get_storage(storage_class, storage_args.split(','))

    if db_url:
        store = storage.Storage(db_url)
        gen_data = retrieve_unknown_sha1s(
            swhstorage,
            store.read_content_s3_not_in_sesi_nor_in_swh(huge, final, limit))
    else:
        gen_data = retrieve_unknown_sha1s(swhstorage,
                                          utils.gen_path_length_from_stdin())

    # NOTE: assumed task names, chosen by analogy with the sesi injecter above;
    # task_name is otherwise used below without being defined.
    if huge:
        task_name = 'swh.loader.antelink.tasks.AntelinkS3InjecterHugeTsk'
    else:
        task_name = 'swh.loader.antelink.tasks.AntelinkS3InjecterTsk'

    nb_total_blocks = 0
    for paths, size in utils.split_data_per_size(gen_data, block_size,
                                                 block_max_files):
        nb_total_blocks += 1
        print('%s paths (%s bytes) sent.' % (len(paths), size))
        if dry_run:
            continue
        app.tasks[task_name].delay(paths)

    print('Number of jobs: %s' % nb_total_blocks)

def run(self, *package_names):
    """Load the history of the given package from snapshot.debian.org"""
    config = self.config

    snapshot = SnapshotDebianOrg(
        connstr=config['snapshot_connstr'],
        basedir=config['snapshot_basedir'],
    )

    storage = get_storage(
        config['storage_class'],
        config['storage_args'],
    )

    swh_authority_dt = open(
        os.path.join(config['snapshot_basedir'], 'TIMESTAMP')
    ).read()

    swh_authority = {
        'authority': '5f4d4c51-498a-4e28-88b3-b3e4e8396cba',
        'validity': dateutil.parser.parse(swh_authority_dt),
    }

    tmpdir = tempfile.mkdtemp()
    os.makedirs(os.path.join(tmpdir, 'source'))

    pkgs = snapshot.prepare_packages(
        package_names,
        os.path.join(tmpdir, 'source'),
        log=self.log,
    )
    origins = snapshot.prepare_origins(package_names, storage)

    closed = False
    fetch_histories = {}
    for origin in origins.values():
        id = origin['id']
        fetch_histories[id] = storage.fetch_history_start(id)

    try:
        sorted_pkgs = []
        for p in pkgs.values():
            p['origin_id'] = origins[p['name']]['id']
            sorted_pkgs.append(p)

        sorted_pkgs.sort(key=lambda p: (p['name'], p['version']))

        partial = {}
        for partial in process_source_packages(
                sorted_pkgs,
                config['keyrings'],
                tmpdir,
                log=self.log,
        ):
            try_flush_partial(
                storage, partial,
                content_packet_size=config['content_packet_size'],
                content_packet_length=config['content_packet_length'],
                content_max_length_one=config['content_max_length_one'],
                directory_packet_size=config['directory_packet_size'],
                log=self.log,
            )

        if partial:
            try_flush_partial(
                storage, partial,
                content_packet_size=config['content_packet_size'],
                content_packet_length=config['content_packet_length'],
                content_max_length_one=config['content_max_length_one'],
                directory_packet_size=config['directory_packet_size'],
                force=True,
                log=self.log,
            )

            packages = flush_revision(storage, partial, log=self.log)

            packages_w_revs = flush_release(storage, packages, log=self.log)

            flush_occurrences(
                storage, packages_w_revs, [swh_authority], log=self.log)

        for fh in fetch_histories.values():
            storage.fetch_history_end(fh, {'status': True})
        closed = True
    finally:
        shutil.rmtree(tmpdir)
        if not closed:
            data = {
                'status': False,
                'stderr': traceback.format_exc(),
            }
            for fh in fetch_histories.values():
                storage.fetch_history_end(fh, data)