Example #1
    def load_repo_null_fields(self, git_loader):
        # Our schema doesn't enforce a lot of non-null revision fields. We need
        # to check these cases don't break the cooker.
        repo = TestRepo()
        with repo as rp:
            (rp / "file").write_text(TEST_CONTENT)
            c = repo.commit("initial commit")
            loader = git_loader(str(rp))
            loader.load()
            dir_id_hex = repo.repo[c].tree.decode()
            dir_id = hashutil.hash_to_bytes(dir_id_hex)

        test_revision = Revision(
            message=b"",
            author=Person(name=None, email=None, fullname=b""),
            date=None,
            committer=Person(name=None, email=None, fullname=b""),
            committer_date=None,
            parents=(),
            type=RevisionType.GIT,
            directory=dir_id,
            metadata={},
            synthetic=True,
        )

        storage = loader.storage
        storage.revision_add([test_revision])
        return (loader, test_revision.swhid())
Example #2
@contextlib.contextmanager  # required so the yield below works with "with"
def cook_extract_directory_gitfast(storage, swhid, fsck=True):
    """Context manager that cooks a revision containing a directory and extract it,
    using RevisionGitfastCooker"""
    test_repo = TestRepo()
    with test_repo as p:
        date = TimestampWithTimezone.from_datetime(
            datetime.datetime.now(datetime.timezone.utc))
        revision = Revision(
            directory=swhid.object_id,
            message=b"dummy message",
            author=Person.from_fullname(b"someone"),
            committer=Person.from_fullname(b"someone"),
            date=date,
            committer_date=date,
            type=RevisionType.GIT,
            synthetic=False,
        )
        storage.revision_add([revision])

    with cook_stream_revision_gitfast(
            storage, revision.swhid()) as stream, test_repo as p:
        processor = dulwich.fastexport.GitImportProcessor(test_repo.repo)
        processor.import_stream(stream)
        test_repo.checkout(b"HEAD")
        shutil.rmtree(p / ".git")
        yield p
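With the contextmanager decorator restored above, the function is consumed via with; a hypothetical call site (dir_swhid is an assumed variable holding the SWHID of a cooked directory; storage and TEST_CONTENT as in the other examples here):

with cook_extract_directory_gitfast(storage, dir_swhid) as p:
    # The yielded path is the extracted worktree, with .git removed.
    assert (p / "file").read_text() == TEST_CONTENT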
Example #3
    def test_revision_identifier(self):
        self.assertEqual(
            Revision.from_dict(self.revision).id,
            self.revision["id"],
        )
        self.assertEqual(
            Revision.from_dict(remove_id(self.revision)).id,
            self.revision["id"],
        )
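The remove_id helper is not shown in this snippet; a minimal sketch of what it presumably does (return a copy of the dict without its "id" key, so that from_dict has to recompute the identifier):

def remove_id(d):
    # Copy first, so the caller's dict is left untouched.
    d = dict(d)
    d.pop("id", None)
    return d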
Example #4
    def test_revision_submodule(self, swh_storage, cook_extract_revision,
                                ingest_target_revision):
        date = TimestampWithTimezone.from_datetime(
            datetime.datetime.now(
                datetime.timezone.utc).replace(microsecond=0))

        target_rev = Revision(
            message=b"target_rev",
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            committer=Person.from_fullname(b"me <*****@*****.**>"),
            committer_date=date,
            parents=(),
            type=RevisionType.GIT,
            directory=bytes.fromhex(
                "3333333333333333333333333333333333333333"),
            metadata={},
            synthetic=True,
        )
        if ingest_target_revision:
            swh_storage.revision_add([target_rev])

        dir = Directory(
            entries=(
                DirectoryEntry(
                    name=b"submodule",
                    type="rev",
                    target=target_rev.id,
                    perms=0o160000,
                ),
            ),
        )
        swh_storage.directory_add([dir])

        rev = Revision(
            message=b"msg",
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            committer=Person.from_fullname(b"me <*****@*****.**>"),
            committer_date=date,
            parents=(),
            type=RevisionType.GIT,
            directory=dir.id,
            metadata={},
            synthetic=True,
        )
        swh_storage.revision_add([rev])

        with cook_extract_revision(swh_storage, rev.swhid()) as (ert, p):
            ert.checkout(b"HEAD")
            pattern = b"160000 submodule\x00%s" % target_rev.id
            tree = ert.repo[b"HEAD"].tree
            assert pattern in ert.repo[tree].as_raw_string()
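The asserted pattern follows git's raw tree-entry layout: octal mode, a space, the entry name, a NUL byte, then the 20-byte binary object id. A small sketch composing such an entry (tree_entry is a helper made up for illustration; target_rev and pattern are from the test above):

def tree_entry(mode: int, name: bytes, target: bytes) -> bytes:
    # b"<octal mode> <name>\x00<20-byte binary sha1>"
    return b"%o %s\x00%s" % (mode, name, target)

assert tree_entry(0o160000, b"submodule", target_rev.id) == pattern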
Example #5
    def custom_deserializer(object_type, msg):
        assert object_type == "revision"
        obj = kafka_to_value(msg)
        # filter the first revision
        if obj["id"] == revisions[0].id:
            return None
        return Revision.from_dict(obj)
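A deserializer like this is handed to the journal client, which drops messages for which it returns None (that is how the first revision gets filtered above). A hedged sketch, assuming the value_deserializer keyword of swh.journal's JournalClient and the Kafka fixtures from Example #14:

client = JournalClient(
    brokers=[kafka_server],
    group_id=kafka_consumer_group,
    prefix=kafka_prefix,
    stop_on_eof=True,
    value_deserializer=custom_deserializer,
)
client.process(worker_fn)  # worker_fn never sees the filtered revision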
Example #6
def test_lookup_directory_with_revision_unknown_content(
        archive_data, new_revision):
    unknown_content_ = random_content()

    dir_path = "README.md"

    # A directory that points to unknown content
    dir = Directory(
        entries=(
            DirectoryEntry(
                name=dir_path.encode("utf-8"),
                type="file",
                target=hash_to_bytes(unknown_content_["sha1_git"]),
                perms=DentryPerms.content,
            ),
        )
    )

    # Create a revision that points to a directory
    # which points to unknown content
    new_revision = new_revision.to_dict()
    new_revision["directory"] = dir.id
    del new_revision["id"]
    new_revision = Revision.from_dict(new_revision)

    # Add the directory and revision in mem
    archive_data.directory_add([dir])
    archive_data.revision_add([new_revision])
    new_revision_id = hash_to_hex(new_revision.id)
    with pytest.raises(NotFoundExc) as e:
        archive.lookup_directory_with_revision(new_revision_id, dir_path)
    assert e.match("Content not found for revision %s" % new_revision_id)
Example #7
    def test_revision_metadata_indexer_single_root_dir(self):
        metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
        fill_obj_storage(metadata_indexer.objstorage)
        fill_storage(metadata_indexer.storage)

        # Add a parent directory, which is the only directory at the
        # root of the revision
        rev = REVISION
        assert rev.directory == DIRECTORY2.id

        directory = Directory(
            entries=(
                DirectoryEntry(
                    name=b"foobar-1.0.0", type="dir", target=rev.directory, perms=16384,
                ),
            ),
        )
        assert directory.id is not None
        metadata_indexer.storage.directory_add([directory])

        new_rev_dict = {**rev.to_dict(), "directory": directory.id}
        new_rev_dict.pop("id")
        new_rev = Revision.from_dict(new_rev_dict)
        metadata_indexer.storage.revision_add([new_rev])

        tool = metadata_indexer.idx_storage.indexer_configuration_get(
            {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
        )
        assert tool is not None

        metadata_indexer.idx_storage.content_metadata_add(
            [
                ContentMetadataRow(
                    id=DIRECTORY2.entries[0].target,
                    indexer_configuration_id=tool["id"],
                    metadata=YARN_PARSER_METADATA,
                )
            ]
        )

        metadata_indexer.run([new_rev.id])

        results = list(
            metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
        )

        expected_results = [
            RevisionIntrinsicMetadataRow(
                id=new_rev.id,
                tool=TRANSLATOR_TOOL,
                metadata=YARN_PARSER_METADATA,
                mappings=["npm"],
            )
        ]

        for result in results:
            del result.tool["id"]

        # then
        self.assertEqual(results, expected_results)
Example #8
def test_revision_metadata_display(archive_data, client, directory, person,
                                   date):
    metadata = {"foo": "bar"}
    revision = Revision(
        directory=hash_to_bytes(directory),
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
        metadata=metadata,
    )
    archive_data.revision_add([revision])

    url = reverse("browse-revision",
                  url_args={"sha1_git": hash_to_hex(revision.id)})

    resp = check_html_get_response(client,
                                   url,
                                   status_code=200,
                                   template_used="browse/revision.html")
    assert_contains(resp, "swh-metadata-popover")
    assert_contains(resp, escape(json.dumps(metadata, indent=4)))
Example #9
def test_lookup_revision_invalid_msg(archive_data, new_revision):
    new_revision = new_revision.to_dict()
    new_revision["message"] = b"elegant fix for bug \xff"
    archive_data.revision_add([Revision.from_dict(new_revision)])

    revision = archive.lookup_revision(hash_to_hex(new_revision["id"]))
    assert revision["message"] == "elegant fix for bug \\xff"
    assert revision["decoding_failures"] == ["message"]
Example #10
def test_db_to_revision():
    # when
    actual_revision = converters.db_to_revision(
        {
            "id": b"revision-id",
            "date": None,
            "date_offset": None,
            "date_neg_utc_offset": None,
            "date_offset_bytes": None,
            "committer_date": None,
            "committer_date_offset": None,
            "committer_date_neg_utc_offset": None,
            "committer_date_offset_bytes": None,
            "type": "git",
            "directory": b"dir-sha1",
            "message": b"commit message",
            "author_fullname": b"auth-name <auth-email>",
            "author_name": b"auth-name",
            "author_email": b"auth-email",
            "committer_fullname": b"comm-name <comm-email>",
            "committer_name": b"comm-name",
            "committer_email": b"comm-email",
            "metadata": {},
            "synthetic": False,
            "extra_headers": (),
            "raw_manifest": None,
            "parents": [b"123", b"456"],
        }
    )

    # then
    assert actual_revision == Revision(
        id=b"revision-id",
        author=Person(
            fullname=b"auth-name <auth-email>",
            name=b"auth-name",
            email=b"auth-email",
        ),
        date=None,
        committer=Person(
            fullname=b"comm-name <comm-email>",
            name=b"comm-name",
            email=b"comm-email",
        ),
        committer_date=None,
        type=RevisionType.GIT,
        directory=b"dir-sha1",
        message=b"commit message",
        metadata={},
        synthetic=False,
        extra_headers=(),
        parents=(b"123", b"456"),
    )
Example #11
    def push_revision_subgraph(self, obj_id: Sha1Git) -> None:
        """Fetches the graph of revisions induced by the given ``obj_id`` and adds
        them to ``self._rev_stack``.

        If swh-graph is not available, this requires fetching the revisions themselves,
        so they are directly loaded instead."""
        loaded_from_graph = False

        if self.graph:
            from swh.graph.client import GraphArgumentException

            # First, try to cook using swh-graph, as it is more efficient than
            # swh-storage for querying the history
            obj_swhid = CoreSWHID(
                object_type=ObjectType.REVISION,
                object_id=obj_id,
            )
            try:
                revision_ids = (swhid.object_id for swhid in map(
                    CoreSWHID.from_string,
                    self.graph.visit_nodes(str(obj_swhid), edges="rev:rev"),
                ))
                self._push(self._rev_stack, revision_ids)
            except GraphArgumentException as e:
                logger.info(
                    "Revision %s not found in swh-graph, falling back to fetching "
                    "history using swh-storage. %s",
                    hash_to_hex(obj_id),
                    e.args[0],
                )
            else:
                loaded_from_graph = True

        if not loaded_from_graph:
            # If swh-graph is not available, or the revision is not yet in
            # swh-graph, fall back to self.storage.revision_log.
            # self.storage.revision_log also gives us the full revisions,
            # so we load them right now instead of just pushing them on the stack.
            walker = DFSRevisionsWalker(self.storage,
                                        obj_id,
                                        state=self._walker_state,
                                        ignore_displayname=True)
            for revision in walker:
                self.write_revision_node(Revision.from_dict(revision))
                self.nb_loaded += 1
                self._push(self._dir_stack, [revision["directory"]])
            # Save the state, so the next call to the walker won't return the same
            # revisions
            self._walker_state = walker.export_state()
Example #12
def revision_from_db(db_revision: RevisionRow,
                     parents: Tuple[Sha1Git, ...]) -> Revision:
    revision = db_revision.to_dict()
    metadata = json.loads(revision.pop("metadata", None))
    extra_headers = revision.pop("extra_headers", ())
    if not extra_headers and metadata and "extra_headers" in metadata:
        extra_headers = metadata.pop("extra_headers")
    if extra_headers is None:
        extra_headers = ()
    return Revision(
        parents=parents,
        type=RevisionType(revision.pop("type")),
        metadata=metadata,
        extra_headers=extra_headers,
        **revision,
    )
Example #13
@composite  # hypothesis.strategies.composite; implied by the draw parameter
def new_revision(draw):
    """
    Hypothesis strategy returning random raw swh revision data
    not ingested into the test archive.
    """
    return Revision(
        directory=draw(sha1().map(hash_to_bytes)),
        author=draw(new_person()),
        committer=draw(new_person()),
        message=draw(
            text(min_size=20, max_size=100).map(lambda t: t.encode())),
        date=TimestampWithTimezone.from_datetime(draw(new_swh_date())),
        committer_date=TimestampWithTimezone.from_datetime(draw(
            new_swh_date())),
        synthetic=False,
        type=RevisionType.GIT,
    )
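With new_revision wrapped in hypothesis.strategies.composite (made explicit by the decorator above), it can drive property-based tests; for instance a to_dict/from_dict round trip, in the spirit of Example #3:

from hypothesis import given

@given(new_revision())
def test_revision_dict_roundtrip(revision):
    # from_dict rebuilds an identical model object from the raw dict.
    assert Revision.from_dict(revision.to_dict()) == revision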
Example #14
def test_client_stop_after_objects(
    kafka_prefix: str, kafka_consumer_group: str, kafka_server: str, count: int
):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka
    revisions = cast(List[Revision], TEST_OBJECTS["revision"])
    for rev in revisions:
        producer.produce(
            topic=kafka_prefix + ".revision",
            key=rev.id,
            value=value_to_kafka(rev.to_dict()),
        )
    producer.flush()

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=False,
        stop_after_objects=count,
    )

    worker_fn = MagicMock()
    client.process(worker_fn)

    # The code below is not pretty, but it is needed: we are dealing with
    # dicts (which are unhashable, so no set), whose values may be lists or
    # tuples, and we do not know for sure how many worker_fn calls will
    # happen while consuming the topic...
    worker_fn.assert_called()
    revs = []  # list of (unique) rev dicts we got from the client
    for call in worker_fn.call_args_list:
        callrevs = call[0][0]["revision"]
        for rev in callrevs:
            assert Revision.from_dict(rev) in revisions
            if rev not in revs:
                revs.append(rev)
    assert len(revs) == count
Example #15
def identify_revision(
    hg: Hg,
    rev: Optional[bytes] = None,
    node_id_2_swhid: Optional[Dict[bytes, CoreSWHID]] = None,
) -> Iterator[RevisionIdentity]:
    """Return the repository revision identities.

    Args:
        hg: A `Hg` repository instance
        rev: An optional revision or Mercurial revset (see ``hg help revsets``).
             If not provided, all the repository revisions will be computed.
        node_id_2_swhid: An optional cache mapping hg node ids to SWHIDs
            It will be updated in place with new mappings.
    """
    from swh.model.model import Revision

    if node_id_2_swhid is None:
        node_id_2_swhid = {}

    for revision in hg.log(rev):
        data = revision.to_dict()

        hg.up(revision.node_id)
        directory_swhid = identify_directory(hg.root())
        data["directory"] = directory_swhid.object_id

        parents = []
        for parent in data["parents"]:
            if parent not in node_id_2_swhid:
                parent_revision = next(
                    identify_revision(hg, parent, node_id_2_swhid))
                node_id_2_swhid[parent] = parent_revision.swhid
            assert node_id_2_swhid[parent].object_type == ObjectType.REVISION
            parents.append(node_id_2_swhid[parent].object_id)
        data["parents"] = parents

        revision_swhid = Revision.from_dict(data).swhid()
        node_id_2_swhid[revision.node_id] = revision_swhid

        yield RevisionIdentity(
            swhid=revision_swhid,
            node_id=revision.node_id,
            directory_swhid=directory_swhid,
        )
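A hypothetical driver for this generator; the Hg constructor call is assumed for the sketch:

hg = Hg("/path/to/repo")  # hypothetical: wraps a local Mercurial checkout
for identity in identify_revision(hg):
    print(identity.swhid, identity.node_id)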
Example #16
    def _make_stub_directory_revision(self, dir_id: Sha1Git) -> Sha1Git:
        author = Person.from_fullname(
            b"swh-vault, git-bare cooker <*****@*****.**>")
        dt = datetime.datetime.now(tz=datetime.timezone.utc)
        dt = dt.replace(microsecond=0)  # not supported by git
        date = TimestampWithTimezone.from_datetime(dt)

        revision = Revision(
            author=author,
            committer=author,
            date=date,
            committer_date=date,
            message=b"Initial commit",
            type=RevisionType.GIT,
            directory=self.obj_id,
            synthetic=True,
        )
        self.write_revision_node(revision)

        return revision.id
Example #17
def test_api_revision_directory_ok_returns_revision(api_client, archive_data,
                                                    revision, person, date):
    rev_path = "foo"
    _dir = Directory(
        entries=(
            DirectoryEntry(
                name=rev_path.encode(),
                type="rev",
                target=hash_to_bytes(revision),
                perms=DentryPerms.revision,
            ),
        )
    )
    archive_data.directory_add([_dir])

    rev = Revision(
        directory=_dir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([rev])

    revision_id = hash_to_hex(rev.id)
    rev_data = archive_data.revision_get(revision)
    url = reverse(
        "api-1-revision-directory",
        url_args={"sha1_git": revision_id, "dir_path": rev_path},
    )
    rv = check_api_get_responses(api_client, url, status_code=200)

    assert rv.data == {
        "content": enrich_revision(rev_data, request=rv.wsgi_request),
        "path": rev_path,
        "type": "rev",
        "revision": revision_id,
    }
Example #18
def build_swh_revision(rev: int, commit: Dict, repo_uuid: bytes, dir_id: bytes,
                       parents: Sequence[bytes]) -> Revision:
    """Given an svn revision, build a swh revision.

    This adds an 'extra-headers' entry with the
    repository's uuid and the svn revision.

    Args:
        rev: the svn revision number
        commit: the commit data: revision id, date, author, and message
        repo_uuid: the repository's uuid
        dir_id: the tree's hash identifier
        parents: the revision's parent identifiers

    Returns:
        The swh Revision model object.

    """
    author = commit["author_name"]
    msg = commit["message"]
    date = commit["author_date"]

    extra_headers: Tuple[Tuple[bytes, bytes], ...] = (
        (b"svn_repo_uuid", repo_uuid),
        (b"svn_revision", str(rev).encode()),
    )

    return Revision(
        type=RevisionType.SUBVERSION,
        date=date,
        committer_date=date,
        directory=dir_id,
        message=msg,
        author=author,
        committer=author,
        synthetic=True,
        extra_headers=extra_headers,
        parents=tuple(parents),
    )
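A hypothetical call, with author and date values shaped like those in the earlier examples (Person and TimestampWithTimezone come from swh.model.model):

author = Person.from_fullname(b"jdoe <jdoe@example.org>")
date = TimestampWithTimezone.from_datetime(
    datetime.datetime.now(datetime.timezone.utc))
revision = build_swh_revision(
    rev=10,
    commit={"author_name": author, "author_date": date,
            "message": b"Import revision 10"},
    repo_uuid=b"11111111-2222-3333-4444-555555555555",
    dir_id=bytes.fromhex("3333333333333333333333333333333333333333"),
    parents=[],
)
assert (b"svn_revision", b"10") in revision.extra_headers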