Example #1
0
    def get(self, uid=None):
        """Get a repo based on a uid. If a uid is not provided, get the first
        repository when ordered by descending timestamp.

        Arguments:
        uid (str) : identifier of the repository to look up. An exact match
                    on the parsed uid is tried first, then a partial
                    (ilike) match.

        Returns the matching SoftwareRepository, with the parser used for the
        lookup attached as repo.parser.

        Raises NoReposError if no uid is given and the database query returns
        nothing, MultipleReposExistError if a partial match is ambiguous,
        and RepoNotFoundError if nothing matches.
        """
        from rse.main.database.models import SoftwareRepository

        # Retrieve either the last repo, or the one with a specific uid
        if not uid:
            repo = (self.session.query(SoftwareRepository).order_by(
                desc("timestamp")).first())

            # Check for an empty result before touching repo.uid, otherwise
            # an empty database fails with AttributeError on None
            if not repo:
                raise NoReposError
            parser = get_parser(repo.uid, config=self.config)
        else:
            parser = get_parser(uid, config=self.config)
            repo = SoftwareRepository.query.filter(
                SoftwareRepository.uid == parser.uid).first()

            # If an exact match isn't there, look for partial match
            if not repo:
                query = "%" + parser.uid + "%"
                query = self.session.query(SoftwareRepository).filter(
                    SoftwareRepository.uid.ilike(query))
                results = self.session.execute(query).fetchall()
                if len(results) == 1:
                    # Re-enter with the first column of the only matching row
                    # NOTE(review): presumably this column is the uid - confirm
                    return self.get(results[0][0])
                elif len(results) > 1:
                    raise MultipleReposExistError(parser.uid)
                else:
                    raise RepoNotFoundError(parser.uid)

        repo.parser = parser
        return repo
Example #2
0
    def _import_annotation(self,
                           input_file,
                           username,
                           stop_line="## Criteria"):
        """A general helper (private) function to import an annotation, meaning
           we find the repository named in the file and return the remaining
           lines for further parsing.

           Arguments:
           input_file (str) : path of the annotation file to read
           username (str)   : required to be set, but not otherwise used here
           stop_line (str)  : the search for a repository name ends at the
                              first line that contains this string

           Returns a tuple (repo, lines) where repo is the result of self.get
           for the matched name, and lines are the lines that follow the line
           on which the search ended.

           Raises RuntimeError if username or input_file is missing or if no
           repository name is found, and FileNotFoundError if input_file
           does not exist.
        """
        if not username or not input_file:
            raise RuntimeError(
                "A username and input file are required to import annotation criteria."
            )

        if not os.path.exists(input_file):
            raise FileNotFoundError(input_file)

        lines = read_file(input_file)

        # Find the repository name. Starting from match = None keeps the check
        # below well defined when the file is empty, when the first line is
        # already the stop line, or when the lines run out before either the
        # stop line or a repository name is seen.
        match = None
        while lines:
            line = lines.pop(0)
            if stop_line in line:
                break
            match = re.search(repository_regex, line)
            if match:
                break

        # Retrieve the match
        if not match:
            raise RuntimeError(f"repository pattern not found in {input_file}")
        reponame = match.group()
        parser = get_parser(reponame, config=self.config)
        repo = self.get(parser.uid)
        return repo, lines
Example #3
0
    def exists(self, uid):
        """Return True if a repository is stored under the parsed uid.

        The uid is passed through get_parser first, and the uid of the
        resulting parser is what is compared against the database.
        """
        from rse.main.database.models import SoftwareRepository

        wanted = get_parser(uid, config=self.config).uid
        matches = SoftwareRepository.query.filter(
            SoftwareRepository.uid == wanted)
        return matches.first() is not None
Example #4
0
    def get_or_create(self, uid):
        """Return the repository stored under the parsed uid, adding it first
        if it is not in the database yet.

        The result of self.add is returned as is when the repository had
        to be added.
        """
        from rse.main.database.models import SoftwareRepository

        wanted = get_parser(uid, config=self.config).uid
        existing = SoftwareRepository.query.filter(
            SoftwareRepository.uid == wanted).first()
        if existing:
            return existing

        # Not found, so hand the original (unparsed) uid to add
        return self.add(uid)
Example #5
0
    def create(self, database=None, config_file=None):
        """Create software repositories from the scraped results.

           Intended to run after a scrape (latest or search query): each
           entry of self.results is parsed into a uid, and the uid is added
           through an Encyclopedia client unless it already exists.
        """
        from rse.main import Encyclopedia

        client = Encyclopedia(config_file=config_file, database=database)
        for repo_id in self.results:
            parsed = get_parser(repo_id)

            # Already known, nothing to add
            if client.exists(parsed.uid):
                continue
            client.add(parsed.uid)
Example #6
0
    def analyze(self,
                repo,
                cthresh=0.5,
                tthresh=1,
                taxonomy_uids=None,
                criteria_uids=None):
        """analyze takes a repository and calculates a "final answer" based on user provided
           thresholds

           Arguments:
           repo (str)           : identifier of the repository to analyze
           cthresh (float)      : a criteria is answered "yes" when the
                                  fraction of "yes" votes is at least this
           tthresh (int)        : a taxonomy category is included when at
                                  least this many users selected it
           taxonomy_uids (list) : taxonomy uids to consider (default all)
           criteria_uids (list) : criteria uids to consider (default all)

           Returns a dictionary with the keys "repo" (the parsed uid),
           "criteria" (uid -> "yes" or "no") and "taxonomy" (uid -> count).
           Criteria without any votes are left out, since there is nothing
           to derive an answer from.
        """
        # If taxonomy or criteria lists aren't defined, use all
        if not taxonomy_uids:
            taxonomy_uids = [x["uid"] for x in self.list_taxonomy()]
        if not criteria_uids:
            criteria_uids = [x["uid"] for x in self.list_criteria()]

        parser = get_parser(repo, config=self.config)
        repo = self.get(parser.uid)
        metrics = {"repo": parser.uid, "criteria": {}, "taxonomy": {}}

        # Sets give constant time membership checks in the loops below
        allowed_criteria = set(criteria_uids)
        allowed_taxonomy = set(taxonomy_uids)

        # Calculate "final" answers for each criteria based on votes and threshold
        for name, votes in repo.get_criteria().items():
            # Skip criteria that were not requested, and criteria without
            # votes (the fraction below would divide by zero)
            if name not in allowed_criteria or not votes:
                continue
            tally = {"yes": 0, "no": 0}
            for response in votes.values():
                tally[response] += 1
            total = tally["yes"] + tally["no"]
            if tally["yes"] / total >= cthresh:
                metrics["criteria"][name] = "yes"
            else:
                metrics["criteria"][name] = "no"

        # Count how many users selected each requested taxonomy category
        counts = {}
        for categories in repo.get_taxonomy().values():
            for category in categories:
                # Honor taxonomy_uids the same way criteria_uids is honored
                if category not in allowed_taxonomy:
                    continue
                counts[category] = counts.get(category, 0) + 1

        # Include those above the requested threshold
        for name, count in counts.items():
            if count >= tthresh:
                metrics["taxonomy"][name] = count

        return metrics
Example #7
0
File: rsnl.py Project: untzag/rse
    def create(self, database=None, config_file=None):
        """Create software repositories from the scraped results.

           Intended to run after a scrape (latest or search query). The uid
           of each result is whatever follows the last "//" of its "url".
           New repositories are added through an Encyclopedia client and
           labeled with the result's "doi" when one is present.
        """
        from rse.main import Encyclopedia

        client = Encyclopedia(config_file=config_file, database=database)
        for result in self.results:
            address = result["url"].rpartition("//")[2]
            parsed = get_parser(address)

            # Results that already exist are left untouched
            if client.exists(parsed.uid):
                continue

            client.add(parsed.uid)
            doi = result.get("doi")
            if doi:
                client.label(parsed.uid, key="doi", value=doi)
Example #8
0
    def add(self, uid):
        """Add a new software repository to the database.

        Arguments:
        uid (str) : unique identifier that get_parser can match to a parser

        Returns the new SoftwareRepository, or None when no uid is given or
        the parser does not return any metadata.
        """
        if not uid:
            bot.error("Please define a unique identifier to add.")
            return None

        parser = get_parser(uid, config=self.config)
        data = parser.get_metadata()

        # If it's a parser handoff, continue with the parser that was returned
        if isinstance(data, ParserBase):
            parser = data
            data = parser.data

        if not data:
            return None

        # Only report success once the repository object has been created
        repo = SoftwareRepository(parser, data_base=self.data_base)
        bot.info(f"{parser.uid} was added to the database.")
        return repo
Example #9
0
    def yield_taxonomy_annotation_repos(self, username, unseen_only=True, repo=None):
        """Yield repositories for a user to annotate with taxonomy categories.

        Without a repo, everything returned by self.list() is a candidate and
        unseen_only decides whether repositories the user has already
        annotated are skipped. When a specific repo is given it is always
        yielded, regardless of unseen_only.
        """
        if repo is not None:
            parser = get_parser(repo, config=self.config)
            candidates = [[parser.uid]]
            unseen_only = False
        else:
            candidates = self.list()

        # The first item of each entry is the identifier handed to self.get
        for entry in candidates:
            found = self.get(entry[0])
            if not unseen_only or not found.has_taxonomy_annotation(username):
                yield found
Example #10
0
    def get(self, uid=None, exact=False):
        """Get a software repo based on a uid.

           When neither a uid nor exact is given, the uid is derived from the
           path returned by get_latest_modified for the metadata files under
           self.data_base. NoReposError is raised if that does not produce
           a uid.
        """
        if not (uid or exact):
            latest = get_latest_modified(self.data_base, pattern="metadata*.json")
            if latest:
                # Reduce the metadata path to the repository identifier
                trimmed = latest.replace("metadata.json", "")
                trimmed = trimmed.replace(self.data_base, "")
                uid = trimmed.strip("/")
            if not (uid and latest):
                raise NoReposError

        return SoftwareRepository(
            get_parser(uid, config=self.config),
            exists=True,
            data_base=self.data_base,
        )
Example #11
0
File: joss.py Project: rseng/rse
    def create(self, database=None, config_file=None):
        """Create software repositories from the scraped results.

        Intended to run after a scrape (latest or search query). The uid of
        each result is whatever follows the last "//" of its "url". Results
        that get_parser rejects with NotImplementedError are logged as a
        warning and skipped. New repositories are added through an
        Encyclopedia client and labeled with the result's "doi".
        """
        from rse.main import Encyclopedia

        client = Encyclopedia(config_file=config_file, database=database)
        for result in self.results:
            address = result["url"].rpartition("//")[2]

            # A result may point to a repository type without a parser
            try:
                parsed = get_parser(address)
            except NotImplementedError as exc:
                bot.warning(exc)
                continue

            # Results that already exist are left untouched
            if client.exists(parsed.uid):
                continue

            client.add(parsed.uid)
            client.label(parsed.uid, key="doi", value=result.get("doi"))
Example #12
0
    def add(self, uid):
        """Create a new repo based on a uid that matches to a parser.

        Arguments:
        uid (str) : unique identifier that get_parser can match to a parser

        Returns the newly committed SoftwareRepository with the parser
        attached as repo.parser. Returns None when the repository already
        exists or the parser does not return any metadata.
        """
        from rse.main.database.models import SoftwareRepository

        parser = get_parser(uid, config=self.config)
        if self.exists(parser.uid):
            return None

        data = parser.get_metadata()

        # If it's a parser handoff, continue with the parser that was returned
        if isinstance(data, ParserBase):
            parser = data
            data = parser.data

        if not data:
            return None

        repo = SoftwareRepository(uid=parser.uid,
                                  parser=parser.name,
                                  data=json.dumps(parser.export()))
        self.session.add(repo)
        self.session.commit()
        bot.info(f"{parser.uid} was added to the database.")
        repo.parser = parser
        return repo
Example #13
0
File: zenodo.py Project: rseng/rse
    def get_metadata(self, uri=None, require_repo=True):
        """Retrieve repository metadata. The common metadata (timestamp) is
        added by the software repository parser, and here we need to
        ensure that the url field is populated with a correct url.

        Arguments:
        uri (str) : a repository uri string to override one currently set
        require_repo (bool) : require a repository to parse.

        Returns one of the following:
        - another parser (a handoff) when a related identifier of the record
          matches the repository regex. Its data carries this uid as "doi".
        - the record data when there is no such identifier and require_repo
          is not True.
        - None when a repository is required but not found, or when the
          request does not succeed.
        """
        from rse.main.parsers import get_parser
        from rse.utils.urls import repository_regex

        # Drop the end anchor so the pattern can match inside a longer string
        repository_regex = repository_regex.rstrip("$")

        if uri:
            self.set_uri(uri)
        self.load_secrets()

        # Get the record number from the doi
        record = self.uid.split("/")[-1].replace("zenodo.", "")
        url = "https://zenodo.org/api/records/%s" % record

        # Token isn't required for public entries. When there is one it is
        # sent as a query parameter; a json body on a GET request is not
        # where the API looks for it.
        if self.token:
            response = requests.get(url, params={"access_token": self.token})
        else:
            response = requests.get(url)

        # Successful query!
        if response.status_code == 200:
            self.data = response.json()

            # For Zenodo, we require a GitHub or GitLab related identifier to add
            repo_url = None
            for identifier in self.data["metadata"].get(
                    "related_identifiers", []):
                match = re.search(repository_regex, identifier["identifier"])
                if match:
                    repo_url = "https://%s" % match.group()
                    break

            if repo_url is None:
                # If we return None, the entry is not added
                if require_repo is True:
                    bot.warning(
                        "Repository url not found with Zenodo record, skipping add."
                    )
                    return None
                return self.data

            # Hand off to the parser for the repository, keeping the doi
            parser = get_parser(repo_url)
            parser.get_metadata()
            parser.data["doi"] = self.uid
            return parser

        elif response.status_code == 404:
            bot.error(f"Cannot find doi {self.uid}.")

        elif response.status_code in [400, 401, 403]:
            bot.error(f"Permission denied to query {self.uid}")

        else:
            bot.error(
                f"Cannot get doi {self.uid}: {response.status_code}, {response.reason}"
            )
        return None
Example #14
0
 def exists(self, uid):
     """Report whether software with this unique identifier is in the
        database. The uid is parsed with get_parser first, and the uid of
        the parser is what self.db.exists is asked about.
     """
     return self.db.exists(get_parser(uid, config=self.config).uid)
Example #15
0
    def summary(self, repo=None):
        """Summarize metrics for the entire database if repo is not defined,
           or one specific repository.

           Arguments:
           repo (str) : identifier of a single repository to summarize

           Returns a dictionary with these keys:
           - "repos" (number of repositories) or "repo" (the parsed uid)
           - "taxonomy-count" and "criteria-count"
           - "users": username -> number of repositories the user annotated,
             as "criteria-annotations" and "taxonomy-annotations"
           - "taxonomy": repository uid -> category -> number of users
           - "criteria": repository uid -> "yes" / "no" vote totals over all
             criteria of the repository
           - "users-count"
        """
        if repo is None:
            repos = self.list()
            metrics = {"repos": len(repos)}
        else:
            parser = get_parser(repo, config=self.config)
            repos = [[parser.uid]]
            metrics = {"repo": parser.uid}

        # Add taxonomy and criteria items
        metrics["taxonomy-count"] = len(self.list_taxonomy())
        metrics["criteria-count"] = len(self.list_criteria())
        metrics["users"] = {}
        metrics["taxonomy"] = {}
        metrics["criteria"] = {}

        # Count annotations for each repository
        for entry in repos:
            parser = get_parser(entry[0], config=self.config)
            repo = self.get(parser.uid)

            if not repo.criteria and not repo.taxonomy:
                continue

            # Add repository to summary metrics
            criteria_counts = metrics["criteria"][repo.uid] = {}
            taxonomy_counts = metrics["taxonomy"][repo.uid] = {}

            # Tally votes over all criteria. The yes / no counters are created
            # once and then accumulated, instead of being reset for every
            # criteria (which only kept the votes of the last one).
            criteria_users = set()
            for votes in repo.get_criteria().values():
                criteria_users.update(votes)
                criteria_counts.setdefault("yes", 0)
                criteria_counts.setdefault("no", 0)
                for vote in votes.values():
                    criteria_counts[vote] += 1

            # Update criteria annotations
            for user in criteria_users:
                record = metrics["users"].setdefault(
                    user,
                    {"criteria-annotations": 0, "taxonomy-annotations": 0},
                )
                record["criteria-annotations"] += 1

            # Count how many users selected each taxonomy category
            taxonomy_users = set()
            for username, categories in repo.get_taxonomy().items():
                taxonomy_users.add(username)
                for category in categories:
                    taxonomy_counts[category] = taxonomy_counts.get(category, 0) + 1

            # Update taxonomy annotations, so that users that annotated the
            # taxonomy are credited in the same way as for criteria
            for user in taxonomy_users:
                record = metrics["users"].setdefault(
                    user,
                    {"criteria-annotations": 0, "taxonomy-annotations": 0},
                )
                record["taxonomy-annotations"] += 1

            # Don't add empty entries
            if not repo.taxonomy and repo.uid in metrics["taxonomy"]:
                del metrics["taxonomy"][repo.uid]

            if not repo.criteria and repo.uid in metrics["criteria"]:
                del metrics["criteria"][repo.uid]

        # Add unique users
        metrics["users-count"] = len(metrics["users"])
        return metrics