def test_init_get_indexer_storage(class_name, expected_class, kwargs,
                                  mock_psycopg2):
    if kwargs:
        concrete_idx_storage = get_indexer_storage(class_name, **kwargs)
    else:
        concrete_idx_storage = get_indexer_storage(class_name)
    assert isinstance(concrete_idx_storage, expected_class)
    assert isinstance(concrete_idx_storage, IndexerStorageInterface)
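For context, a minimal sketch of the factory call exercised above, using the "memory" backend that appears in later examples (the import path is assumed from the swh.indexer package layout):

from swh.indexer.storage import get_indexer_storage  # assumed import path

# Select the backend by class name; the "memory" backend needs no extra arguments.
idx_storage = get_indexer_storage("memory")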
Example #2
def get_config(config_file='web/web'):
    """Read the configuration file `config_file`.

       If an environment variable SWH_CONFIG_FILENAME is defined, this
       takes precedence over the config_file parameter.

       In any case, update the app with parameters (secret_key, conf)
       and return the parsed configuration as a dict.

       If no configuration file is provided, return a default
       configuration.

    """

    if not swhweb_config:
        config_filename = os.environ.get('SWH_CONFIG_FILENAME')
        if config_filename:
            config_file = config_filename
        cfg = config.load_named_config(config_file, DEFAULT_CONFIG)
        swhweb_config.update(cfg)
        config.prepare_folders(swhweb_config, 'log_dir')
        swhweb_config['storage'] = get_storage(**swhweb_config['storage'])
        swhweb_config['vault'] = get_vault(**swhweb_config['vault'])
        swhweb_config['indexer_storage'] = get_indexer_storage(
            **swhweb_config['indexer_storage'])
        swhweb_config['scheduler'] = get_scheduler(
            **swhweb_config['scheduler'])
    return swhweb_config
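A hedged usage sketch of the precedence rule described in the docstring; the path below is purely illustrative:

import os

# When SWH_CONFIG_FILENAME is set, it overrides the config_file argument.
os.environ["SWH_CONFIG_FILENAME"] = "/tmp/swh-web.yml"  # hypothetical path
cfg = get_config()           # loads the file named by the environment variable
assert "storage" in cfg      # the returned dict carries instantiated backends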
Example #3
def idx_storage(swh_indexer_config):
    """An instance of in-memory indexer storage that gets injected into all
    indexers classes.

    """
    idx_storage_config = swh_indexer_config["indexer_storage"]
    return get_indexer_storage(**idx_storage_config)
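A short sketch of how such a pytest fixture might be consumed; the test body and INDEXER_TOOL (defined as in Example #8) are illustrative:

def test_tool_registration(idx_storage):
    # pytest injects the in-memory indexer storage built by the fixture.
    tools = idx_storage.indexer_configuration_add([INDEXER_TOOL])
    assert tools[0]["id"] is not None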
Example #4
def get_config(config_file="web/web"):
    """Read the configuration file `config_file`.

       If an environment variable SWH_CONFIG_FILENAME is defined, this
       takes precedence over the config_file parameter.

       In any case, update the app with parameters (secret_key, conf)
       and return the parsed configuration as a dict.

       If no configuration file is provided, return a default
       configuration.

    """

    if not swhweb_config:
        config_filename = os.environ.get("SWH_CONFIG_FILENAME")
        if config_filename:
            config_file = config_filename
        cfg = config.load_named_config(config_file, DEFAULT_CONFIG)
        swhweb_config.update(cfg)
        config.prepare_folders(swhweb_config, "log_dir")
        if swhweb_config.get("search"):
            swhweb_config["search"] = get_search(**swhweb_config["search"])
        else:
            swhweb_config["search"] = None
        swhweb_config["storage"] = get_storage(**swhweb_config["storage"])
        swhweb_config["vault"] = get_vault(**swhweb_config["vault"])
        swhweb_config["indexer_storage"] = get_indexer_storage(
            **swhweb_config["indexer_storage"])
        swhweb_config["scheduler"] = get_scheduler(
            **swhweb_config["scheduler"])
    return swhweb_config
Example #5
def swh_indexer_storage(swh_indexer_storage_postgresql):
    return get_indexer_storage(
        "local",
        db=swh_indexer_storage_postgresql.dsn,
        journal_writer={
            "cls": "memory",
        },
    )
Example #6
def app_server(swh_indexer_storage_postgresql):
    server.storage = get_indexer_storage(
        "local",
        db=swh_indexer_storage_postgresql.dsn,
        journal_writer={
            "cls": "memory",
        },
    )
    yield server
Example #7
    def prepare(self) -> None:
        """Prepare the indexer's needed runtime configuration.
           Without this step, the indexer cannot possibly run.

        """
        config_storage = self.config.get("storage")
        if config_storage:
            self.storage = get_storage(**config_storage)

        self.objstorage = get_objstorage(**self.config["objstorage"])

        idx_storage = self.config[INDEXER_CFG_KEY]
        self.idx_storage = get_indexer_storage(**idx_storage)

        _log = logging.getLogger("requests.packages.urllib3.connectionpool")
        _log.setLevel(logging.WARNING)
        self.log = logging.getLogger("swh.indexer")

        if self.USE_TOOLS:
            self.tools = list(self.register_tools(self.config.get("tools",
                                                                  [])))
        self.results = []
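To make the keys read by prepare() concrete, here is a hedged sketch of a matching configuration dict (INDEXER_CFG_KEY is assumed to equal "indexer_storage", and the "memory" backends are illustrative):

config = {
    "storage": {"cls": "memory"},          # optional; skipped when absent
    "objstorage": {"cls": "memory"},       # required by prepare()
    "indexer_storage": {"cls": "memory"},  # looked up via INDEXER_CFG_KEY
    "tools": [],                           # consulted only when USE_TOOLS is set
}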
Example #8
def _init_tests_data():
    # To hold reference to the memory storage
    storage = get_storage("memory")

    # Create search instance
    search = get_search("memory")
    search.initialize()
    search.origin_update({"url": origin["url"]} for origin in _TEST_ORIGINS)

    # Create indexer storage instance that will be shared by indexers
    idx_storage = get_indexer_storage("memory")

    # Declare a test tool for origin intrinsic metadata tests
    idx_tool = idx_storage.indexer_configuration_add([INDEXER_TOOL])[0]
    INDEXER_TOOL["id"] = idx_tool["id"]

    # Load git repositories from archives
    for origin in _TEST_ORIGINS:
        for i, archive_ in enumerate(origin["archives"]):
            if i > 0:
                # ensure visit dates will be different when simulating
                # multiple visits of an origin
                time.sleep(1)
            origin_repo_archive = os.path.join(os.path.dirname(__file__),
                                               "resources/repos/%s" % archive_)
            loader = GitLoaderFromArchive(
                storage,
                origin["url"],
                archive_path=origin_repo_archive,
            )

            result = loader.load()
            assert result["status"] == "eventful"

        ori = storage.origin_get([origin["url"]])[0]
        origin.update(ori.to_dict())  # add an 'id' key if enabled
        search.origin_update([{
            "url": origin["url"],
            "has_visits": True,
            "visit_types": ["git"]
        }])

    for i in range(250):
        _add_origin(storage,
                    search,
                    origin_url=f"https://many.origins/{i+1}",
                    visit_type="tar")

    sha1s: Set[Sha1] = set()
    directories = set()
    revisions = set()
    releases = set()
    snapshots = set()

    content_path = {}

    # Get all objects loaded into the test archive
    common_metadata = {ORIGIN_METADATA_KEY: ORIGIN_METADATA_VALUE}
    for origin in _TEST_ORIGINS:
        snp = snapshot_get_latest(storage, origin["url"])
        snapshots.add(hash_to_hex(snp.id))
        for branch_name, branch_data in snp.branches.items():
            target_type = branch_data.target_type.value
            if target_type == "revision":
                revisions.add(branch_data.target)
                if b"master" in branch_name:
                    # Add some origin intrinsic metadata for tests
                    metadata = common_metadata
                    metadata.update(origin.get("metadata", {}))
                    origin_metadata = OriginIntrinsicMetadataRow(
                        id=origin["url"],
                        from_revision=branch_data.target,
                        indexer_configuration_id=idx_tool["id"],
                        metadata=metadata,
                        mappings=[],
                    )
                    idx_storage.origin_intrinsic_metadata_add(
                        [origin_metadata])
                    search.origin_update([{
                        "url": origin["url"],
                        "intrinsic_metadata": metadata
                    }])

                    ORIGIN_MASTER_REVISION[origin["url"]] = hash_to_hex(
                        branch_data.target)
            elif target_type == "release":
                release = storage.release_get([branch_data.target])[0]
                revisions.add(release.target)
                releases.add(hash_to_hex(branch_data.target))

        for rev_log in storage.revision_shortlog(set(revisions)):
            rev_id = rev_log[0]
            revisions.add(rev_id)

        for rev in storage.revision_get(revisions):
            if rev is None:
                continue
            dir_id = rev.directory
            directories.add(hash_to_hex(dir_id))
            for entry in dir_iterator(storage, dir_id):
                if entry["type"] == "file":
                    sha1s.add(entry["sha1"])
                    content_path[entry["sha1"]] = "/".join(
                        [hash_to_hex(dir_id), entry["path"].decode("utf-8")])
                elif entry["type"] == "dir":
                    directories.add(hash_to_hex(entry["target"]))

    _add_extra_contents(storage, sha1s)

    # Get all checksums for each content
    result: List[Optional[Content]] = storage.content_get(list(sha1s))

    contents: List[Dict] = []
    for content in result:
        assert content is not None
        sha1 = hash_to_hex(content.sha1)
        content_metadata = {
            algo: hash_to_hex(getattr(content, algo))
            for algo in DEFAULT_ALGORITHMS
        }

        path = ""
        if content.sha1 in content_path:
            path = content_path[content.sha1]

        cnt_data = storage.content_get_data(content.sha1)
        assert cnt_data is not None
        mimetype, encoding = get_mimetype_and_encoding_for_content(cnt_data)
        _, _, cnt_data = _re_encode_content(mimetype, encoding, cnt_data)
        content_display_data = prepare_content_for_display(
            cnt_data, mimetype, path)

        content_metadata.update({
            "path": path,
            "mimetype": mimetype,
            "encoding": encoding,
            "hljs_language": content_display_data["language"],
            "data": content_display_data["content_data"],
        })
        _contents[sha1] = content_metadata
        contents.append(content_metadata)

    # Add the empty directory to the test archive
    storage.directory_add([Directory(entries=())])

    # Add empty content to the test archive
    storage.content_add([Content.from_data(data=b"")])

    # Add fake git origin with pull request branches
    _add_origin(
        storage,
        search,
        origin_url="https://git.example.org/project",
        snapshot_branches={
            b"refs/heads/master": {
                "target_type": "revision",
                "target": next(iter(revisions)),
            },
            **{
                f"refs/pull/{i}".encode(): {
                    "target_type": "revision",
                    "target": next(iter(revisions)),
                }
                for i in range(300)
            },
        },
    )

    # Return tests data
    return {
        "search": search,
        "storage": storage,
        "idx_storage": idx_storage,
        "origins": _TEST_ORIGINS,
        "contents": contents,
        "directories": list(directories),
        "releases": list(releases),
        "revisions": list(map(hash_to_hex, revisions)),
        "snapshots": list(snapshots),
        "generated_checksums": set(),
    }
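A brief sketch of consuming the returned mapping; the keys follow the return statement above:

tests_data = _init_tests_data()
storage = tests_data["storage"]          # shared in-memory archive storage
idx_storage = tests_data["idx_storage"]  # shared in-memory indexer storage
assert tests_data["generated_checksums"] == set()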
Example #9
def _init_tests_data():
    # Load git repositories from archives
    loader = GitLoaderFromArchive(config=_TEST_LOADER_CONFIG)
    for origin in _TEST_ORIGINS:
        nb_visits = len(origin['archives'])
        for i, archive in enumerate(origin['archives']):
            origin_repo_archive = os.path.join(os.path.dirname(__file__),
                                               'resources/repos/%s' % archive)
            loader.load(origin['url'], origin_repo_archive, None)
            if nb_visits > 1 and i != nb_visits - 1:
                time.sleep(1)

    # Get reference to the memory storage
    storage = loader.storage

    contents = set()
    directories = set()
    revisions = set()
    releases = set()
    snapshots = set()
    persons = set()

    content_path = {}

    # Get all objects loaded into the test archive
    for origin in _TEST_ORIGINS:
        snp = storage.snapshot_get_latest(origin['id'])
        snapshots.add(hash_to_hex(snp['id']))
        for branch_name, branch_data in snp['branches'].items():
            if branch_data['target_type'] == 'revision':
                revisions.add(branch_data['target'])
            elif branch_data['target_type'] == 'release':
                release = next(storage.release_get([branch_data['target']]))
                revisions.add(release['target'])
                releases.add(hash_to_hex(branch_data['target']))
                persons.add(release['author']['id'])

        for rev_log in storage.revision_shortlog(set(revisions)):
            rev_id = rev_log[0]
            revisions.add(rev_id)

        for rev in storage.revision_get(revisions):
            dir_id = rev['directory']
            persons.add(rev['author']['id'])
            persons.add(rev['committer']['id'])
            directories.add(hash_to_hex(dir_id))
            for entry in dir_iterator(storage, dir_id):
                content_path[entry['sha1']] = '/'.join(
                    [hash_to_hex(dir_id), entry['path'].decode('utf-8')])
                if entry['type'] == 'file':
                    contents.add(entry['sha1'])
                elif entry['type'] == 'dir':
                    directories.add(hash_to_hex(entry['target']))

    # Get all checksums for each content
    contents_metadata = storage.content_get_metadata(contents)
    contents = []
    for content_metadata in contents_metadata:
        contents.append({
            algo: hash_to_hex(content_metadata[algo])
            for algo in DEFAULT_ALGORITHMS
        })
        path = content_path[content_metadata['sha1']]
        cnt = next(storage.content_get([content_metadata['sha1']]))
        mimetype, encoding = get_mimetype_and_encoding_for_content(cnt['data'])
        content_display_data = prepare_content_for_display(
            cnt['data'], mimetype, path)
        contents[-1]['path'] = path
        contents[-1]['mimetype'] = mimetype
        contents[-1]['encoding'] = encoding
        contents[-1]['hljs_language'] = content_display_data['language']
        contents[-1]['data'] = content_display_data['content_data']
        _contents[contents[-1]['sha1']] = contents[-1]

    # Create indexer storage instance that will be shared by indexers
    idx_storage = get_indexer_storage('memory', {})

    # Instantiate content indexers that will be used in tests
    # and force them to use the memory storages
    indexers = {}
    for idx_name, idx_class in (('mimetype_indexer', _MimetypeIndexer),
                                ('language_indexer', _LanguageIndexer),
                                ('license_indexer', _FossologyLicenseIndexer),
                                ('ctags_indexer', _CtagsIndexer)):
        idx = idx_class()
        idx.storage = storage
        idx.objstorage = storage.objstorage
        idx.idx_storage = idx_storage
        idx.register_tools(idx.config['tools'])
        indexers[idx_name] = idx

    # Add the empty directory to the test archive
    empty_dir_id = directory_identifier({'entries': []})
    empty_dir_id_bin = hash_to_bytes(empty_dir_id)
    storage.directory_add([{'id': empty_dir_id_bin, 'entries': []}])

    # Return tests data
    return {
        'storage': storage,
        'idx_storage': idx_storage,
        **indexers,
        'origins': _TEST_ORIGINS,
        'contents': contents,
        'directories': list(directories),
        'persons': list(persons),
        'releases': list(releases),
        'revisions': list(map(hash_to_hex, revisions)),
        'snapshots': list(snapshots)
    }
Example #10
def get_storage():
    global storage
    if not storage:
        storage = get_indexer_storage(**app.config[INDEXER_CFG_KEY])

    return storage
Example #11
def swh_indexer_storage():
    return get_indexer_storage("memory", journal_writer={
        "cls": "memory",
    })
Example #12
def test_init_get_indexer_storage_deprecation_warning(class_name,
                                                      expected_class, kwargs,
                                                      mock_psycopg2):
    with pytest.warns(DeprecationWarning):
        concrete_idx_storage = get_indexer_storage(class_name, args=kwargs)
    assert isinstance(concrete_idx_storage, expected_class)
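For contrast, a hedged sketch of the deprecated args-dict call style next to its keyword replacement (backend name and arguments are illustrative):

# Deprecated: wrapping the backend arguments in a single `args` dict
# emits a DeprecationWarning, as the test above checks.
store = get_indexer_storage("memory", args={"journal_writer": {"cls": "memory"}})

# Preferred: pass the same arguments as plain keyword arguments.
store = get_indexer_storage("memory", journal_writer={"cls": "memory"})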
Example #13
def test_init_get_indexer_storage_failure():
    with pytest.raises(ValueError, match="Unknown indexer storage class"):
        get_indexer_storage("unknown-idx-storage")