Esempio n. 1
0
    def process_contributors_data(self, contributors: List[str]):
        """Build per-reviewer review statistics for a project repository.

        Every pull request in ``self.pull_requests`` is visited in ascending
        numeric id order and handed to the two ``_analyze_*`` helpers, which
        fill in the result dictionary and the interaction counters. Afterwards
        each name listed under ``"reviewers"`` receives its summary fields.

        Args:
            contributors: Names used to seed the interaction matrix; each
                contributor starts with a zero counter towards every
                contributor in the list.

        Returns:
            The dictionary populated by the helpers, in which every reviewer
            entry additionally carries ``number_reviews``,
            ``median_review_length``, ``last_review_time``,
            ``median_pr_length``, ``median_pr_length_score`` and
            ``interactions``.
        """
        ordered_pr_numbers = sorted(int(key) for key in self.pull_requests)

        contributors_reviews_data: Dict[str, Any] = {"reviewers": [], "created_dts": []}

        # One independent, zero-initialised counter row per contributor.
        interactions = {name: dict.fromkeys(contributors, 0) for name in contributors}

        for pr_number in ordered_pr_numbers:
            # The pull request mapping is keyed by the id rendered as a string.
            pull_request = self.pull_requests[str(pr_number)]

            self._analyze_pr_for_contributor_data(
                pr_id=pr_number, pr=pull_request, extracted_data=contributors_reviews_data
            )
            self._analyze_contributors_interaction(
                pr_interactions=pull_request["interactions"],
                pr_author=pull_request["created_by"],
                interactions_data=interactions,
            )

        for reviewer in contributors_reviews_data["reviewers"]:
            reviewer_entry = contributors_reviews_data[reviewer]

            review_count = 0
            words_per_pr = []  # total review words written on each pull request
            submission_times = []

            for pr_reviews in reviewer_entry["reviews"].values():
                review_count += len(pr_reviews)
                words_in_pr = 0
                for single_review in pr_reviews:
                    words_in_pr += single_review["words_count"]
                    submission_times.append(single_review["submitted_at"])
                words_per_pr.append(words_in_pr)

            latest_submission = max(submission_times)

            reviewer_entry.update(
                {
                    "number_reviews": review_count,
                    "median_review_length": np.median(words_per_pr),
                    "last_review_time": latest_submission,
                }
            )

            # Turn the size labels into numbers so that a median can be taken,
            # then translate that median back into a label and a score.
            encoded_sizes = [convert_score2num(label=size_label) for size_label in reviewer_entry["PRs_size"]]
            median_size_label, relative_score = convert_num2label(score=np.median(encoded_sizes))

            reviewer_entry.update(
                {
                    "median_pr_length": median_size_label,
                    "median_pr_length_score": relative_score,
                    "interactions": interactions[reviewer],
                }
            )

        return contributors_reviews_data
Esempio n. 2
0
    def process_prs_project_data(self):
        """Aggregate review statistics over the closed pull requests of a project.

        Pull requests are visited in ascending numeric id order. Those whose
        ``closed_at`` is ``None`` are skipped; every other one is passed to
        ``self._analyze_pr_for_project_data``, which fills in the result lists.

        Returns:
            ``{}`` when there are no pull requests at all or none of them has
            been closed. Otherwise a dict holding the collected lists plus
            ``last_review_time`` (latest entry of ``reviews_dts``, or ``None``
            if no review timestamp was recorded), ``median_pr_length`` and
            ``median_pr_length_score``.
        """
        if not self.pull_requests:
            return {}

        ordered_ids = sorted(int(key) for key in self.pull_requests)

        project_reviews_data = {
            "contributors": [],
            "ids": [],
            "created_dts": [],
            "reviews_dts": [],
            "TTFR": [],  # Time to First Review (TTFR) [hr]
            "MTTFR": [],  # Median TTFR [hr]
            "TTR": [],  # Time to Review (TTR) [hr]
            "MTTR": [],  # Median TTR [hr]
            "MTTCI": [],  # Median TTCI [hr]
            "PRs_size": [],  # Pull Request length
            "encoded_PRs_size": [],  # Pull Request length encoded
        }

        analysed_any = False
        for pr_number in ordered_ids:
            # The mapping is keyed by strings; a dedicated name also avoids
            # shadowing the builtin ``id``.
            pr_id = str(pr_number)
            pr = self.pull_requests[pr_id]
            if pr["closed_at"] is None:
                continue
            analysed_any = True

            if pr["created_by"] not in project_reviews_data["contributors"]:
                project_reviews_data["contributors"].append(pr["created_by"])

            self._analyze_pr_for_project_data(
                pr_id=pr_id, pr=pr, extracted_data=project_reviews_data)

        if not analysed_any:
            # Nothing was analysed, so there is nothing to take a maximum or
            # a median of; report "no data" the same way as for empty input
            # instead of failing on max() of an empty list.
            return {}

        # ``default`` guards against closed pull requests for which no review
        # timestamp was recorded.
        project_reviews_data["last_review_time"] = max(
            project_reviews_data["reviews_dts"], default=None)

        # Translate the median encoded size of the project's pull requests
        # back into a size label and its score.
        project_pr_median_size, project_length_score = convert_num2label(
            score=np.median(project_reviews_data["encoded_PRs_size"]))
        project_reviews_data["median_pr_length"] = project_pr_median_size
        project_reviews_data["median_pr_length_score"] = project_length_score

        return project_reviews_data
def pre_process_project_data(data: Dict[str, Any]):
    """Collect project-wide review statistics from raw pull request data.

    Args:
        data: Pull requests keyed by their numeric id given as a string.

    Returns:
        ``{}`` for empty input. Otherwise the dictionary filled in by
        ``analyze_pr_for_project_data`` for every pull request (visited in
        ascending numeric id order), extended with ``last_review_time``,
        ``median_pr_length`` and ``median_pr_length_score``.
    """
    if not data:
        return {}

    ordered_ids = sorted(map(int, data))

    project_reviews_data = {
        "contributors": [],
        "ids": [],
        "created_dts": [],
        "reviews_dts": [],
        "TTFR": [],  # Time to First Review (TTFR) [hr]
        "MTTFR": [],  # Median TTFR [hr]
        "TTR": [],  # Time to Review (TTR) [hr]
        "MTTR": [],  # Median TTR [hr]
        "PRs_size": [],  # Pull Request length
        "encoded_PRs_size": [],  # Pull Request length encoded
    }

    for pr_number in ordered_ids:
        pull_request = data[str(pr_number)]

        # Record each author once, in order of first appearance.
        author = pull_request["created_by"]
        if author not in project_reviews_data["contributors"]:
            project_reviews_data["contributors"].append(author)

        analyze_pr_for_project_data(
            pr_id=pr_number,
            pr=pull_request,
            extracted_data=project_reviews_data,
        )

    latest_review = max(project_reviews_data["reviews_dts"])
    project_reviews_data["last_review_time"] = latest_review

    # Translate the median encoded pull request size back into a label and
    # its score.
    median_encoded_size = np.median(project_reviews_data["encoded_PRs_size"])
    size_label, size_score = convert_num2label(score=median_encoded_size)
    project_reviews_data["median_pr_length"] = size_label
    project_reviews_data["median_pr_length_score"] = size_score

    return project_reviews_data