Example #1
    def get_metadata(self, uri=None):
        """Retrieve repository metadata. The common metadata (timestamp) is
           added by the software repository parser, and here we need to
           ensure that the url field is populated with a correct url.

           Arguments:
           uri (str) : a repository uri string to override one currently set
        """
        if uri:
            self.set_uri(uri)
        self.load_secrets()

        repo = "/".join(self.uid.split("/")[-2:])
        url = "https://gitlab.com/api/v4/projects/%s" % urllib.parse.quote(
            repo, safe=""
        )

        # Add authorization header if token is provided
        headers = None
        if self.token:
            headers = {"Authorization": "Bearer %s" % self.token}
        response = requests.get(url, headers=headers)

        # check_response returns the parsed payload, or None on failure
        self.data = check_response(response)
        return self.data
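
This parser (like the GitHub one in Example #5) relies on a check_response helper that is not part of the excerpt. A minimal stand-in sketch, assuming it returns parsed JSON on success and None otherwise (the real rse helper may log or raise differently):

    import requests

    def check_response(response):
        """Return parsed JSON for a 200 response, otherwise None.
        Hypothetical stand-in for rse's check_response helper."""
        if response.status_code != 200:
            print("Request failed with status %s" % response.status_code)
            return None
        return response.json()

    # Example: query the GitLab projects endpoint anonymously
    response = requests.get("https://gitlab.com/api/v4/projects/gitlab-org%2Fgitlab")
    data = check_response(response)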
Example #2
    def scrape(self, url, paginate=False, delay=0.0):
        """A shared function to scrape a set of repositories. Since the JoSS
        pages for a search and the base are the same, we can use a shared
        function.
        """
        # The API doesn't appear to support pagination, so paginate is unused
        response = requests.get(url, headers={"User-Agent": get_user_agent()})
        data = check_response(response)

        for entry in data.get("response", {}).get("docs", []):
            page_url = entry["uri_s"]
            response = requests.get(page_url,
                                    headers={"User-Agent": get_user_agent()})
            repo_url = None
            if response.status_code == 200:
                match = re.search(repository_regex, response.text,
                                  re.IGNORECASE)
                if match:
                    repo_url = match.group()

            if repo_url:
                bot.info("Found repository: %s" % repo_url)
                self.results.append(repo_url)
            time.sleep(delay)

        return self.results
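
The scraper also assumes a module-level repository_regex and a get_user_agent helper. A plausible sketch of both, offered as assumptions rather than the project's actual definitions:

    import re

    # Hypothetical pattern: match GitHub or GitLab repository URLs
    repository_regex = r"https?://(?:www\.)?(?:github|gitlab)\.com/[\w.-]+/[\w.-]+"

    def get_user_agent():
        """Return a browser-like User-Agent string (hypothetical helper)."""
        return "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)"

    text = "Software available at https://github.com/vsoch/rse under MPL-2.0."
    match = re.search(repository_regex, text, re.IGNORECASE)
    print(match.group())  # https://github.com/vsoch/rse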
Example #3
    def scrape(self, url, paginate=False, delay=None):
        """A shared function to scrape a set of repositories. Since the JoSS
        pages for a search and the base are the same, we can use a shared
        function.
        """
        # Handle pagination
        original_url = url
        while url is not None:

            response = requests.get(url, headers={"User-Agent": get_user_agent()})
            data = check_response(response)

            # Reset the url to be None
            url = None
            if data.get("next") and paginate:
                url = original_url + "&page=%s" % data.get("next", "").replace(
                    "?page=", ""
                )

            for entry in data.get("list", []):

                # Look for GitHub / GitLab URL
                repo = {}
                for link in entry.get("link", []):
                    if "Repository" in link["type"] and re.search(
                        repository_regex, link["url"], re.IGNORECASE
                    ):
                        repo["url"] = link["url"]

                # If we don't have a repository, search the homepage
                if not repo.get("url") and re.search(
                    repository_regex, entry["homepage"]
                ):
                    repo["url"] = entry["homepage"]

                # We must have a repository url to parse
                if not repo.get("url"):
                    continue

                # Look for a doi
                for pub in entry["publication"]:
                    if pub.get("doi"):
                        repo["doi"] = pub.get("doi")

                bot.info("Found repository: %s" % repo["url"])
                self.results.append(repo)

                # Sleep for a random amount of time to give a rest!
                sleep(delay or random.choice(range(1, 10)) * 0.1)

        return self.results
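
Isolated from the class, the pagination handling above follows a "next" marker until the API stops returning one. A sketch of the same loop against a hypothetical endpoint:

    import requests

    def scrape_all(base_url):
        """Collect entries across pages by following 'next' markers (sketch)."""
        results = []
        url = base_url
        while url is not None:
            data = requests.get(url).json()
            url = None  # stop unless the payload advertises another page
            if data.get("next"):
                # The API returns e.g. "?page=2"; rewrite it as a query parameter
                url = base_url + "&page=%s" % data["next"].replace("?page=", "")
            results.extend(data.get("list", []))
        return results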
Example #4
File: rsnl.py Project: untzag/rse
    def scrape(self, url, paginate=False, delay=0.0):
        """A shared function to scrape a set of repositories. Since the JoSS
           pages for a search and the base are the same, we can use a shared
           function.
        """
        response = requests.get(url, headers={"User-Agent": get_user_agent()})
        data = check_response(response) or []

        for entry in data:

            # I only see GitHub urls
            repo_url = entry.get("repositoryURLs", {}).get("github")
            repo_url = repo_url[0] if repo_url else None
            doi = entry.get("conceptDOI")
            doi = doi if doi and "FIXME" not in doi else None
            if repo_url and doi:
                bot.info("Found repository: %s" % repo_url)
                self.results.append({"url": repo_url, "doi": doi})
            elif repo_url:
                bot.info("Found repository: %s" % repo_url)
                self.results.append({"url": repo_url})
            time.sleep(delay)

        return self.results
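
The filtering logic above can be exercised on its own. With made-up entries mimicking the API payload (hypothetical data), only records with a usable repository URL survive, and a "FIXME" concept DOI is dropped:

    # Hypothetical entries shaped like the API's response
    entries = [
        {"repositoryURLs": {"github": ["https://github.com/vsoch/rse"]},
         "conceptDOI": "10.5281/zenodo.1234567"},
        {"repositoryURLs": {"github": []}, "conceptDOI": "FIXME:notyet"},
    ]
    for entry in entries:
        repo_url = entry.get("repositoryURLs", {}).get("github")
        repo_url = repo_url[0] if repo_url else None
        doi = entry.get("conceptDOI")
        doi = doi if doi and "FIXME" not in doi else None
        print(repo_url, doi)
    # -> https://github.com/vsoch/rse 10.5281/zenodo.1234567
    # -> None None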
Example #5
File: github.py Project: untzag/rse
    def get_metadata(self, uri=None):
        """Retrieve repository metadata. The common metadata (timestamp) is
           added by the software repository parser, and here we need to
           ensure that the url field is populated with a correct url.

           Arguments:
           uri (str) : a repository uri string to override one currently set
        """
        if uri:
            self.set_uri(uri)
        self.load_secrets()
        repo = "/".join(self.uid.split("/")[-2:])
        url = "https://api.github.com/repos/%s" % (repo)
        headers = {
            "Accept": "application/vnd.github.symmetra-preview+json",
        }
        if self.token:
            headers["Authorization"] = "token %s" % self.token

        response = requests.get(url, headers=headers)

        # check_response returns None if the query failed
        data = check_response(response)
        if data is None:
            return None

        # Only save minimal set
        self.data = {}
        for key in [
                "name",
                "url",
                "full_name",
                "html_url",
                "private",
                "description",
                "created_at",
                "updated_at",
                "clone_url",
                "homepage",
                "size",
                "stargazers_count",
                "watchers_count",
                "language",
                "open_issues_count",
                "license",
                "subscribers_count",
        ]:
            if key in data:
                self.data[key] = data[key]
        self.data["owner"] = {}
        for key in ["html_url", "avatar_url", "login", "type"]:
            self.data["owner"][key] = data["owner"][key]

        # Also try to get topics
        headers.update({"Accept": "application/vnd.github.mercy-preview+json"})
        url = "%s/topics" % url
        response = requests.get(url, headers=headers)

        # Topics are optional; add them only if the query succeeded
        topics = check_response(response)
        if topics is not None:
            self.data["topics"] = topics.get("names", [])

        return self.data
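
The two-step request pattern above (repository metadata first, then a second call for topics under the mercy-preview media type) can be reproduced directly with requests; the token line is optional and the repository name is just an example:

    import requests

    headers = {"Accept": "application/vnd.github.symmetra-preview+json"}
    # headers["Authorization"] = "token %s" % token  # optional authentication

    url = "https://api.github.com/repos/vsoch/rse"
    repo = requests.get(url, headers=headers).json()

    headers["Accept"] = "application/vnd.github.mercy-preview+json"
    topics = requests.get(url + "/topics", headers=headers).json()
    print(repo.get("full_name"), topics.get("names", []))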