Example #1
0
    def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]:
        """
        Turn a page of repositories into "git" ListedOrigin instances.

        A repository is skipped when its HTTPS URL does not start with
        ``https://`` or when that URL equals the one yielded just before it.
        For every yielded origin, ``self.date_last_modified`` is overwritten
        with the modification date of that repository.
        """
        assert self.lister_obj.id is not None

        previous_url = None

        for repository in page:
            url = repository.git_https_url

            # Keep only valid URLs that were not listed just before: the last
            # modified repository is listed twice by launchpadlib.
            if not (url.startswith("https://") and url != previous_url):
                continue

            modified_on = repository.date_last_modified
            self.date_last_modified = modified_on

            logger.debug("Found origin %s last updated on %s", url, modified_on)

            previous_url = url

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type="git",
                url=url,
                last_update=modified_on,
            )
Example #2
0
    def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
        """Yield one "tar" ListedOrigin per distinct origin URL found in the page.

        Only the first entry seen for a given origin URL is kept; the artifact
        URL and the ``Version`` field of that entry are passed to the loader.
        """
        assert self.lister_obj.id is not None

        already_listed = set()
        for info in page:
            url, tarball_url = compute_origin_urls(info)

            # The most recent version is listed first, so any later entry for
            # the same origin is dropped to avoid listing it several times.
            if url not in already_listed:
                already_listed.add(url)

                updated_on = parse_packaged_date(info)
                artifact = {"url": tarball_url, "version": info["Version"]}

                yield ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=url,
                    visit_type="tar",
                    last_update=updated_on,
                    extra_loader_arguments={"artifacts": [artifact]},
                )
def deposit_listed_origin(deposit_lister):
    """Return a "deposit" ListedOrigin attached to the given lister."""
    loader_arguments = {"deposit_id": "some-d-id"}
    origin = ListedOrigin(
        lister_id=deposit_lister.id,
        url="https://example.org/project",
        visit_type="deposit",
        extra_loader_arguments=loader_arguments,
    )
    return origin
Example #4
0
    def get_origins_from_page(
            self, page: List[Dict[str, Any]]) -> Iterator[ListedOrigin]:
        """Yield an "npm" ListedOrigin for each package of the page that has versions.

        The last update is parsed from the ``time`` entry matching the
        package's ``latest`` dist-tag; it is None when no such entry exists.
        """
        assert self.lister_obj.id is not None

        for entry in page:
            doc = entry["doc"]

            # a package without versions has no source code to archive
            if not doc.get("versions", {}):
                continue

            name = doc["name"]
            latest = doc.get("dist-tags", {}).get("latest", "")
            release_times = doc.get("time", {})

            if latest in release_times:
                updated_on = iso8601.parse_date(release_times[latest])
            else:
                updated_on = None

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=self.PACKAGE_URL_TEMPLATE.format(package_name=name),
                visit_type="npm",
                last_update=updated_on,
            )
Example #5
0
    def get_origins_from_page(
            self, page: List[Dict[str, Any]]) -> Iterator[ListedOrigin]:
        """Yield a "git" ListedOrigin for each distinct repository of the page.

        The origin URL is the repository's ``html_url``; the last update is
        the parsed ``pushed_at`` value when it is set, None otherwise. Empty
        entries and repeated ``html_url`` values within the page are skipped.
        """
        assert self.lister_obj.id is not None

        listed_urls: Set[str] = set()

        for repository in page:
            # listings sometimes contain null repositories
            if not repository:
                continue

            html_url = repository["html_url"]
            if html_url in listed_urls:
                continue
            listed_urls.add(html_url)

            raw_pushed_at = repository.get("pushed_at")
            pushed_at: Optional[datetime.datetime] = (
                iso8601.parse_date(raw_pushed_at) if raw_pushed_at else None
            )

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=html_url,
                visit_type="git",
                last_update=pushed_at,
            )
def generate_listed_origin(
    lister_id: uuid.UUID, now: Optional[datetime] = None
) -> ListedOrigin:
    """Return a new "test-git" origin with a URL built from a running counter.

    Each call increments the module-level ``_nb_generated_origins`` counter
    and embeds it, zero-padded to six digits, in the origin URL. The
    ``last_update`` value is obtained from an OriginModel for that URL and the
    given timestamp.

    Arguments:
      lister_id: identifier of the lister the origin is attached to
      now: time of listing, to emulate last_update (defaults to the current
        UTC time)
    """
    global _nb_generated_origins
    _nb_generated_origins += 1
    assert _nb_generated_origins < 10**6, "Too many origins!"

    listing_time = datetime.now(tz=timezone.utc) if now is None else now

    origin_type = "test-git"
    origin_url = f"https://example.com/{_nb_generated_origins:06d}.git"
    model = OriginModel(origin_type, origin_url)

    return ListedOrigin(
        lister_id=lister_id,
        url=origin_url,
        visit_type=origin_type,
        last_update=model.get_last_update(listing_time),
    )
Example #7
0
def debian_listed_origin(debian_lister):
    """Return a "debian" ListedOrigin, with an empty package mapping, for the lister."""
    origin_fields = {
        "lister_id": debian_lister.id,
        "url": "https://debian.example.org/package",
        "visit_type": "debian",
        "extra_loader_arguments": {"packages": {}},
    }
    return ListedOrigin(**origin_fields)
Example #8
0
    def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
        """Yield a ListedOrigin for each repository of the page that has a usable URL.

        The visit type is the repository's ``vcs`` field. Repositories for
        which ``get_repo_url`` returns None are reported with a warning and
        skipped.
        """
        assert self.lister_obj.id is not None

        for repo in page:
            url = get_repo_url(repo["attachments"]["uris"]["uris"])

            if url is not None:
                yield ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=url,
                    visit_type=repo["fields"]["vcs"],
                    # The "dateUpdated" field returned by the Phabricator API only
                    # refers to the repository metadata; We can't use it for our
                    # purposes.
                    last_update=None,
                )
                continue

            # Identify the repository in the log by its first non-empty name
            # field, falling back to its phid.
            candidates = (
                repo["fields"].get(field)
                for field in ("shortName", "name", "callsign")
            )
            label: Optional[str] = next(
                (value for value in candidates if value), None
            )

            logger.warning(
                "No valid url for repository [%s] (phid=%s)",
                label or repo["phid"],
                repo["phid"],
            )
Example #9
0
    def get_origins_from_page(
            self, page: CratesListerPage) -> Iterator[ListedOrigin]:
        """Yield a single ListedOrigin describing every version listed in the page.

        The origin URL and last update come from the first entry of the page;
        each entry of the page contributes one artifact to the loader arguments.
        """

        assert self.lister_obj.id is not None

        first_entry = page[0]
        origin_url = self.CRATE_API_URL_PATTERN.format(crate=first_entry["name"])
        updated_on = first_entry["last_update"]

        def as_artifact(entry):
            # Build an artifact entry following original-artifacts-json specification
            # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json  # noqa: B950
            basename = urlparse(entry["crate_file"]).path.rsplit("/", 1)[-1]
            return {
                "filename": f"{basename}",
                "checksums": {
                    "sha256": f"{entry['checksum']}",
                },
                "url": entry["crate_file"],
                "version": entry["version"],
            }

        crate_artifacts = [as_artifact(entry) for entry in page]

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=origin_url,
            last_update=updated_on,
            extra_loader_arguments={
                "artifacts": crate_artifacts,
            },
        )
Example #10
0
def opam_listed_origin(opam_lister):
    """Return an "opam" ListedOrigin built from OPAM_LOADER_ARGS.

    The "url" entry becomes the origin URL; every other entry is passed
    through as an extra loader argument.
    """
    lister_id = opam_lister.id
    loader_arguments = dict(OPAM_LOADER_ARGS.items())
    origin_url = loader_arguments.pop("url")
    return ListedOrigin(
        lister_id=lister_id,
        url=origin_url,
        visit_type="opam",
        extra_loader_arguments=loader_arguments,
    )
Example #11
0
def maven_listed_origin(maven_lister):
    """Return a "maven" ListedOrigin whose URL is that of the first MVN_ARTIFACTS entry.

    The whole MVN_ARTIFACTS list is passed to the loader as its artifacts.
    """
    lister_id = maven_lister.id
    first_artifact = MVN_ARTIFACTS[0]
    loader_arguments = {"artifacts": MVN_ARTIFACTS}
    return ListedOrigin(
        lister_id=lister_id,
        url=first_artifact["url"],
        visit_type="maven",
        extra_loader_arguments=loader_arguments,
    )
Example #12
0
 def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
     """Yield the single "opam" ListedOrigin matching the page.

     A page is just a package name; the origin URL is built from the lister's
     URL and that name.
     """
     assert self.lister_obj.id is not None
     package_url = f"opam+{self.url}/packages/{page}/"
     yield ListedOrigin(
         lister_id=self.lister_obj.id,
         visit_type="opam",
         url=package_url,
         last_update=None,
     )
Example #13
0
def archive_listed_origin(archive_lister):
    """Return a "tar" ListedOrigin with no artifacts and ``snapshot_append`` enabled."""
    loader_arguments = {
        "artifacts": [],
        "snapshot_append": True,
    }
    origin = ListedOrigin(
        lister_id=archive_lister.id,
        url="https://example.org/archives",
        visit_type="tar",
        extra_loader_arguments=loader_arguments,
    )
    return origin
Example #14
0
    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Yield the single ListedOrigin described by the page.

        The page provides the origin URL (``uri``), the visit type (``type``)
        and the last update date (``last_update_date``, parsed as ISO 8601).
        """
        assert self.lister_obj.id is not None

        origin_url = page["uri"]
        origin_type = page["type"]
        updated_on = iso8601.parse_date(page["last_update_date"])

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=origin_url,
            visit_type=origin_type,
            last_update=updated_on,
        )
Example #15
0
 def get_origins_from_page(
         self, page: SourceForgeListerPage) -> Iterator[ListedOrigin]:
     """Yield one ListedOrigin, created with ``enabled=False``, per hit of the page.

     The visit type is the value of the hit's ``vcs`` attribute and the last
     update is parsed from its ``last_modified`` attribute.
     """
     assert self.lister_obj.id is not None
     for project in page:
         vcs_name = project.vcs.value
         origin_url = project.url
         modified_on = iso8601.parse_date(project.last_modified)
         yield ListedOrigin(
             lister_id=self.lister_obj.id,
             visit_type=vcs_name,
             url=origin_url,
             last_update=modified_on,
             enabled=False,
         )
Example #16
0
    def get_origins_from_page(
            self, page_result: PageResult) -> Iterator[ListedOrigin]:
        """Yield a "git" ListedOrigin for each repository of the page result.

        Nothing is yielded when the page result has no repositories. The
        origin URL is ``http_url_to_repo`` and the last update is parsed from
        ``last_activity_at``.
        """
        assert self.lister_obj.id is not None

        for project in page_result.repositories or []:
            clone_url = project["http_url_to_repo"]
            active_on = iso8601.parse_date(project["last_activity_at"])
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=clone_url,
                visit_type="git",
                last_update=active_on,
            )
Example #17
0
    def get_origins_from_page(
            self, packages: PackageListPage) -> Iterator[ListedOrigin]:
        """Yield a "pypi" ListedOrigin for each (url, last update) pair of the page."""
        assert self.lister_obj.id is not None

        for package_url, updated_on in packages:
            listed = ListedOrigin(
                lister_id=self.lister_obj.id,
                url=package_url,
                visit_type="pypi",
                last_update=updated_on,
            )
            yield listed
Example #18
0
    def get_origins_from_page(self, page: RepoListPage) -> Iterator[ListedOrigin]:
        """Yield a "git" ListedOrigin for each repository of the page.

        The origin URL is the repository's ``clone_url`` and the last update
        is parsed from its ``updated_at`` field.
        """
        assert self.lister_obj.id is not None

        for repository in page:
            updated_at = iso8601.parse_date(repository["updated_at"])
            clone_url = repository["clone_url"]

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=clone_url,
                visit_type="git",
                last_update=updated_at,
            )
Example #19
0
    def get_origins_from_page(
            self, packages_name: PackageListPage) -> Iterator[ListedOrigin]:
        """Yield a "pypi" ListedOrigin for each package name of the page.

        The origin URL is built from ``self.PACKAGE_URL``; no last update is
        recorded here.
        """
        assert self.lister_obj.id is not None

        for name in packages_name:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=self.PACKAGE_URL.format(package_name=name),
                visit_type="pypi",
                last_update=None,  # available on PyPI JSON API
            )
Example #20
0
 def get_origins_from_page(self,
                           page: ArchListerPage) -> Iterator[ListedOrigin]:
     """Yield one ListedOrigin per entry of the page.

     Each entry provides the origin URL (``url``), the last update
     (``last_modified``) and the artifacts passed to the loader (``versions``).
     """
     assert self.lister_obj.id is not None
     for package in page:
         loader_arguments = {"artifacts": package["versions"]}
         yield ListedOrigin(
             lister_id=self.lister_obj.id,
             visit_type=self.VISIT_TYPE,
             url=package["url"],
             last_update=package["last_modified"],
             extra_loader_arguments=loader_arguments,
         )
Example #21
0
    def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
        """Retrieve scm origin out of the page information. Only called when type of the
        page is scm.

        Try and detect an scm/vcs repository. The official format is of the form
        scm:{type}:git://example.org/{user}/{repo}.git. When the ``{type}`` part is
        not one of SUPPORTED_SCM_TYPES but the whole url ends with ".git" (some
        projects put the repository url right after "scm:"), the url stripped of
        its "scm:" prefix is used with the "git" visit type.

        Raises
            AssertionError when the type of the page is not 'scm'

        Returns
            ListedOrigin with proper canonical scm url (for github) if any is found,
            None otherwise.

        """

        assert page["type"] == "scm"
        raw_url = page["url"]
        m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", raw_url)
        if m_scm is None:
            return None

        visit_type: Optional[str] = None
        url: Optional[str] = None

        scm_type = m_scm.group("type")
        if scm_type and scm_type in SUPPORTED_SCM_TYPES:
            url = m_scm.group("url")
            visit_type = scm_type
        elif raw_url.endswith(".git"):
            # Remove whole "scm:" prefixes only. str.lstrip("scm:") must not be
            # used here: it strips any leading run of the characters "s", "c",
            # "m" and ":", which also eats the start of urls such as
            # "ssh://..." once the prefix is gone.
            stripped_url = raw_url
            while stripped_url.startswith("scm:"):
                stripped_url = stripped_url[len("scm:"):]
            url = stripped_url
            visit_type = "git"
        else:
            return None

        if url and visit_type == "git":
            # Non-github urls will be returned as is, github ones will be canonical ones
            url = self.github_session.get_canonical_url(url)

        if not url:
            return None

        assert visit_type is not None
        assert self.lister_obj.id is not None
        return ListedOrigin(
            lister_id=self.lister_obj.id,
            url=url,
            visit_type=visit_type,
        )
Example #22
0
    def get_origins_from_page(
            self, page: List[Dict[str, Any]]) -> Iterator[ListedOrigin]:
        """Yield a ListedOrigin for each repository of the page.

        The origin URL is the first ``clone`` link of the repository, the
        visit type is its ``scm`` field and the last update is parsed from its
        ``updated_on`` field.
        """
        assert self.lister_obj.id is not None

        for repository in page:
            updated_on = iso8601.parse_date(repository["updated_on"])
            first_clone_link = repository["links"]["clone"][0]
            clone_url = first_clone_link["href"]
            scm = repository["scm"]

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=clone_url,
                visit_type=scm,
                last_update=updated_on,
            )
Example #23
0
    def get_origins_from_page(
            self, repositories: Repositories) -> Iterator[ListedOrigin]:
        """Yield a "git" ListedOrigin for each repository an origin URL is found for.

        The repository's ``git_url`` is used when it is set; otherwise the
        origin URL is looked up from the repository's ``url``. Repositories
        for which that lookup returns None are skipped.
        """
        assert self.lister_obj.id is not None

        for repository in repositories:
            clone_url = repository["git_url"]
            if not clone_url:
                clone_url = self._get_origin_from_repository_url(repository["url"])

            if clone_url is not None:
                yield ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=clone_url,
                    visit_type="git",
                    last_update=_parse_last_updated_date(repository),
                )
Example #24
0
 def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
     """Yield the single "opam" ListedOrigin matching the page.

     A page is just a package name. The opam root, instance, URL and package
     name are passed along as extra loader arguments.
     """
     assert self.lister_obj.id is not None
     loader_arguments = {
         "opam_root": self.opam_root,
         "opam_instance": self.instance,
         "opam_url": self.url,
         "opam_package": page,
     }
     package_url = f"opam+{self.url}/packages/{page}/"
     yield ListedOrigin(
         lister_id=self.lister_obj.id,
         visit_type="opam",
         url=package_url,
         last_update=None,
         extra_loader_arguments=loader_arguments,
     )
    def get_origins_from_page(
            self, page: NewForgeListerPage) -> Iterator[ListedOrigin]:
        """Yield one ListedOrigin per element of the page.

        The ``url`` and ``last_update`` values are ``...`` placeholders to be
        replaced with values taken from each page element.
        """
        assert self.lister_obj.id is not None

        for entry in page:
            origin_fields = dict(
                # Required. Should use this value.
                lister_id=self.lister_obj.id,
                # Required. Visit type of the currently processed origin
                visit_type=self.VISIT_TYPE,
                # Required. URL corresponding to the origin for loaders to ingest
                url=...,
                # Should get it if the service provides it and if it induces no
                # substantial additional processing cost
                last_update=...,
            )

            yield ListedOrigin(**origin_fields)
Example #26
0
def listed_origins_by_type(
    stored_lister: Lister, visit_types: List[str]
) -> Dict[str, List[ListedOrigin]]:
    """Build a fixed list of 1000 `ListedOrigin`s for each `visit_type`.

    Every origin gets a distinct ``last_update``: a fixed date whose
    microsecond field is derived from the position of the visit type and the
    index of the origin within that type.
    """
    count_per_type = 1000
    assert stored_lister.id

    origins: Dict[str, List[ListedOrigin]] = {}
    for type_index, visit_type in enumerate(visit_types):
        first_microsecond = type_index * count_per_type
        origins[visit_type] = [
            ListedOrigin(
                lister_id=stored_lister.id,
                url=f"https://{visit_type}.example.com/{index:04d}",
                visit_type=visit_type,
                last_update=datetime(
                    2020, 6, 15, 16, 0, 0, first_microsecond + index,
                    tzinfo=timezone.utc,
                ),
            )
            for index in range(count_per_type)
        ]
    return origins
def test_journal_client_origin_visit_status_after_grab_next_visits(
        swh_scheduler, stored_lister):
    """Check that the OriginVisitStats entries created by grab_next_visits()
    do not interfere with the upsert done when processing visit statuses.

    """

    # one listed origin per distinct (url, visit type) pair of the statuses
    origin_keys = {
        (status["origin"], status["type"]) for status in VISIT_STATUSES_2
    }
    swh_scheduler.record_listed_origins([
        ListedOrigin(
            lister_id=stored_lister.id,
            url=origin_url,
            visit_type=origin_type,
        )
        for origin_url, origin_type in origin_keys
    ])

    # bracket the scheduling call to bound the recorded last_scheduled value
    grab_start = utcnow()
    swh_scheduler.grab_next_visits(
        visit_type="git",
        count=10,
        policy="oldest_scheduled_first",
    )
    grab_end = utcnow()

    assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == []
    git_stats = swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])
    assert git_stats[0] is not None

    process_journal_objects(
        {"origin_visit_status": VISIT_STATUSES_2},
        scheduler=swh_scheduler,
    )

    for origin_url in ("cavabarder", "iciaussi"):
        stats = swh_scheduler.origin_visit_stats_get([(origin_url, "git")])[0]
        assert grab_start <= stats.last_scheduled <= grab_end

        stats = swh_scheduler.origin_visit_stats_get([(origin_url, "hg")])[0]
        assert stats.last_scheduled is None

    stats = swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0]
    expected_date = DATE1 + 5 * ONE_DAY
    assert stats.last_successful == expected_date
    assert stats.last_visit == expected_date
    assert stats.last_visit_status == LastVisitStatus.successful
    assert stats.last_snapshot == hash_to_bytes(
        "5555555555555555555555555555555555555555")
Example #28
0
    def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]:
        """
        Yield a "tar" ListedOrigin for each project of the page.

        The page maps project names to project information holding the origin
        URL (``url``) and modification time (``time_modified``). The artifacts
        passed to the loader are looked up by project name in
        ``self.gnu_tree.artifacts``.
        """
        assert self.lister_obj.id is not None
        assert self.gnu_tree is not None

        tree_artifacts = self.gnu_tree.artifacts

        for name, info in page.items():
            url = info["url"]
            modified_on = iso8601.parse_date(info["time_modified"])

            logger.debug("Found origin %s last updated on %s", url, modified_on)

            loader_arguments = {"artifacts": tree_artifacts[name]}
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=url,
                visit_type="tar",
                last_update=modified_on,
                extra_loader_arguments=loader_arguments,
            )
Example #29
0
    def get_origins_from_page(
            self, page: LaunchpadPageType) -> Iterator[ListedOrigin]:
        """
        Yield a ListedOrigin for each repository of the page with an HTTPS URL.

        The page is a ``(vcs_type, repos)`` pair; the vcs type is used as the
        visit type. For every yielded origin, ``self.date_last_modified`` is
        updated for that vcs type. A RestfulError raised while going through
        the repositories is logged as a warning and ends the iteration.
        """
        assert self.lister_obj.id is not None

        vcs_type, repos = page

        try:
            for repository in repos:
                url = origin(vcs_type, repository)

                # only origins with a valid URL are listed
                if url.startswith("https://"):
                    modified_on = repository.date_last_modified

                    self.date_last_modified[vcs_type] = modified_on

                    logger.debug(
                        "Found origin %s with type %s last updated on %s",
                        url,
                        vcs_type,
                        modified_on,
                    )

                    yield ListedOrigin(
                        lister_id=self.lister_obj.id,
                        visit_type=vcs_type,
                        url=url,
                        last_update=modified_on,
                    )
        except RestfulError as error:
            logger.warning("Listing %s origins raised %s", vcs_type, error)
Example #30
0
def svn_listed_origin(svn_lister):
    """Return an "svn" ListedOrigin attached to the given lister."""
    origin = ListedOrigin(
        lister_id=svn_lister.id,
        url="svn://example.org/repo",
        visit_type="svn",
    )
    return origin