Example #1
# Imports assumed by all examples below. Project-local helpers such as
# authorIdExtractor, outputStatistics, Configuration, prRequest,
# releaseRequest, splitRepoName and the gql/stats/centrality modules belong
# to the surrounding codebase and are not reproduced here.
import csv
import math
import os
import sys
import threading
from collections import Counter
from datetime import datetime
from typing import List

import git
import matplotlib.pyplot as plt
import networkx as nx
import pytz
import sentistrength
from dateutil.relativedelta import relativedelta
from git import Commit
from networkx.algorithms.community import greedy_modularity_communities
from progress.bar import Bar
from sentistrength import PySentiStr

def tagAnalysis(repo: git.Repo, outputDir: str):
    print("Analyzing tags")

    tagInfo = []
    print(
        "Sorting (no progress available, may take several minutes to complete)"
    )
    tags = sorted(repo.tags, key=getTaggedDate)

    if len(tags) > 0:
        lastTag = None
        for tag in Bar("Processing").iter(tags):
            # count commits reachable from this tag; after the first tag,
            # count only commits made since the previous tag's date
            commitCount = 0
            if lastTag is None:
                commitCount = len(list(tag.commit.iter_items(repo,
                                                             tag.commit)))
            else:
                sinceStr = formatDate(getTaggedDate(lastTag))
                commitCount = len(
                    list(
                        tag.commit.iter_items(repo, tag.commit,
                                              after=sinceStr)))

            tagInfo.append(
                dict(
                    path=tag.path,
                    date=formatDate(getTaggedDate(tag)),
                    commitCount=commitCount,
                ))

            lastTag = tag

    # output non-tabular results
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Tag Count", len(tagInfo)])

    # output tag info
    print("Outputting CSVs")
    with open(os.path.join(outputDir, "tags.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Path", "Date", "Commit Count"])
        for tag in tagInfo:
            w.writerow([tag["path"], tag["date"], tag["commitCount"]])

    outputStatistics(
        [tag["commitCount"] for tag in tagInfo],
        "TagCommitCount",
        outputDir,
    )
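tagAnalysis relies on two helpers that are not shown here. A minimal sketch of what they might look like (hypothetical; the real implementations may differ), assuming annotated tags carry their own date and lightweight tags fall back to the commit date:

def getTaggedDate(tag: git.TagReference) -> datetime:
    # hypothetical sketch: annotated tags record their own creation time,
    # lightweight tags only have the commit they point at
    if tag.tag is not None:
        return datetime.fromtimestamp(tag.tag.tagged_date)
    return datetime.fromtimestamp(tag.commit.committed_date)

def formatDate(value: datetime) -> str:
    # git accepts ISO-style dates in --after/--before filters
    return value.strftime("%Y-%m-%d")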
Example #2
def outputTags(idx: int, tagInfo: List[dict], daysActive: int,
               config: Configuration):

    # calculate FN: number of tags per day of activity, scaled by 100
    fn = len(tagInfo) / daysActive * 100

    # output non-tabular results
    with open(os.path.join(config.resultsPath, f"results_{idx}.csv"),
              "a",
              newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Tag Count", len(tagInfo)])
        w.writerow(["FN", fn])

    # output tag info
    print("Outputting CSVs")

    with open(os.path.join(config.metricsPath, f"tags_{idx}.csv"),
              "a",
              newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Path", "Date", "Commit Count"])
        for tag in tagInfo:
            w.writerow([tag["path"], tag["date"], tag["commitCount"]])

    outputStatistics(
        idx,
        [tag["commitCount"] for tag in tagInfo],
        "TagCommitCount",
        config.resultsPath,
    )
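A quick sanity check of the FN computation with made-up numbers:

# illustrative numbers only
tagInfo = [dict()] * 12   # pretend 12 tags were found
daysActive = 300
fn = len(tagInfo) / daysActive * 100
assert fn == 4.0  # on average 4 tags per 100 days of activity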
Example #3
def centralityAnalysis(repo: git.Repo, commits: List[git.Commit],
                       outputDir: str):

    allRelatedAuthors = {}
    authorCommits = Counter()

    # for all commits...
    print("Analyzing centrality")
    for commit in Bar("Processing").iter(commits):
        author = authorIdExtractor(commit.author)

        # increase author commit count
        authorCommits.update({author: 1})

        # initialize dates for related author analysis
        commitDate = datetime.fromtimestamp(commit.committed_date)
        earliestDate = commitDate + relativedelta(months=-1)
        latestDate = commitDate + relativedelta(months=+1)

        # find authors who committed within the two-month window around this
        # commit by scanning the in-memory commit list
        commitRelatedCommits = filter(
            lambda c: findRelatedCommits(author, earliestDate, latestDate, c),
            commits)

        commitRelatedAuthors = set(
            map(lambda c: authorIdExtractor(c.author), commitRelatedCommits))

        # get current related authors collection and update it
        authorRelatedAuthors = allRelatedAuthors.setdefault(author, set())
        authorRelatedAuthors.update(commitRelatedAuthors)

    # prepare graph
    print("Preparing NX graph")
    G = nx.Graph()

    for author in allRelatedAuthors:
        authorName = author.strip()
        G.add_node(authorName)

        for relatedAuthor in allRelatedAuthors[author]:
            # strip consistently so node and edge names always match
            G.add_edge(authorName, relatedAuthor.strip())

    # analyze graph
    closeness = dict(nx.closeness_centrality(G))
    betweenness = dict(nx.betweenness_centrality(G))
    centrality = dict(nx.degree_centrality(G))
    density = nx.density(G)
    modularity = []

    try:
        for idx, community in enumerate(greedy_modularity_communities(G)):
            authorCount = len(community)
            communityCommitCount = sum(authorCommits[author]
                                       for author in community)
            row = [authorCount, communityCommitCount]
            modularity.append(row)
    except ZeroDivisionError:
        # degenerate graphs can make modularity undefined; skip communities
        pass

    # find high centrality authors (degree centrality > 0.5)
    numberHighCentralityAuthors = len(
        [author for author, value in centrality.items() if value > 0.5])

    percentageHighCentralityAuthors = numberHighCentralityAuthors / len(
        allRelatedAuthors)

    print("Outputting CSVs")

    # output non-tabular results
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Density", density])
        w.writerow(["Community Count", len(modularity)])

    # output community information
    with open(os.path.join(outputDir, "community.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Community Index", "Author Count", "Commit Count"])
        for idx, community in enumerate(modularity):
            w.writerow([idx + 1, community[0], community[1]])

    # combine centrality results
    combined = {}
    for key in closeness:
        single = {
            "Author": key,
            "Closeness": closeness[key],
            "Betweenness": betweenness[key],
            "Centrality": centrality[key],
        }

        combined[key] = single

    # output tabular results
    with open(os.path.join(outputDir, "centrality.csv"), "w", newline="") as f:
        w = csv.DictWriter(
            f, ["Author", "Closeness", "Betweenness", "Centrality"])
        w.writeheader()

        for key in combined:
            w.writerow(combined[key])

    # output high centrality authors
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(
            ["NumberHighCentralityAuthors", numberHighCentralityAuthors])
        w.writerow([
            "PercentageHighCentralityAuthors", percentageHighCentralityAuthors
        ])

    # output statistics
    outputStatistics(
        list(closeness.values()),
        "Closeness",
        outputDir,
    )

    outputStatistics(
        list(betweenness.values()),
        "Betweenness",
        outputDir,
    )

    outputStatistics(
        list(centrality.values()),
        "Centrality",
        outputDir,
    )

    outputStatistics(
        [community[0] for community in modularity],
        "CommunityAuthorCount",
        outputDir,
    )

    outputStatistics(
        [community[1] for community in modularity],
        "CommunityCommitCount",
        outputDir,
    )

    # output graph to PNG
    print("Outputting graph to PNG")
    graphFigure = plt.figure(5, figsize=(30, 30))
    nx.draw(
        G,
        with_labels=True,
        node_color="orange",
        node_size=4000,
        edge_color="black",
        linewidths=2,
        font_size=20,
    )
    graphFigure.savefig(os.path.join(outputDir, "graph.png"))
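centralityAnalysis delegates the time-window test to findRelatedCommits, which is not shown. A plausible sketch (hypothetical; assumes the predicate keeps commits by other authors inside the two-month window):

def findRelatedCommits(author, earliestDate, latestDate, commit) -> bool:
    # hypothetical sketch: a commit is "related" when another author
    # made it within the +/- one month window around the original commit
    if authorIdExtractor(commit.author) == author:
        return False
    commitDate = datetime.fromtimestamp(commit.committed_date)
    return earliestDate <= commitDate <= latestDate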
Example #4
def commitAnalysis(commits: List[git.Commit], outputDir: str):

    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150

    # traverse all commits
    print("Analyzing commits")
    for commit in Bar("Processing").iter(commits):

        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save author
        timezoneInfo["authors"].add(author)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest commit
        # by default GitPython orders commits from latest to earliest
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time

        # check if the commit was made during office hours (09:00-17:59)
        if commit.author_tz_offset != 0 and 9 <= time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for login, author in authorInfoDict.items():

        # flag authors whose commits are almost exclusively (>= 95%) made
        # during office hours as likely sponsored
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        ratio = sponsoredCommitCount / commitCount
        if ratio >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage of sponsored authors
    percentageSponsoredAuthors = sponsoredAuthorCount / len(authorInfoDict)

    # calculate active project days (commits are ordered latest-first)
    firstCommitDate = datetime.fromtimestamp(commits[-1].committed_date)
    lastCommitDate = datetime.fromtimestamp(commits[0].committed_date)
    daysActive = (lastCommitDate - firstCommitDate).days

    print("Outputting CSVs")

    # output author days on project
    with open(os.path.join(outputDir, "authorDaysOnProject.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(os.path.join(outputDir, "commitsPerAuthor.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(os.path.join(outputDir, "timezones.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for timezone in timezoneInfoDict:
            timezoneInfo = timezoneInfoDict[timezone]
            w.writerow(
                [timezone, len(timezoneInfo["authors"]), timezoneInfo["commitCount"]]
            )

    # output project info
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", "{:%Y-%m-%d}".format(firstCommitDate)])
        w.writerow(["LastCommitDate", "{:%Y-%m-%d}".format(lastCommitDate)])
        w.writerow(["AuthorCount", len([*authorInfoDict])])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len([*timezoneInfoDict])])

    outputStatistics(
        [author["activeDays"] for login, author in authorInfoDict.items()],
        "ActiveDays",
        outputDir,
    )

    outputStatistics(
        [author["commitCount"] for login, author in authorInfoDict.items()],
        "CommitCount",
        outputDir,
    )

    return authorInfoDict
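A minimal driver for the analyses above might look like this (the repository and output paths are placeholders, and the functions are assumed to live in one module):

if __name__ == "__main__":
    repo = git.Repo("/path/to/repository")
    commits = list(repo.iter_commits())  # GitPython yields latest-first
    outputDir = "/path/to/output"
    os.makedirs(outputDir, exist_ok=True)

    commitAnalysis(commits, outputDir)
    tagAnalysis(repo, outputDir)
    centralityAnalysis(repo, commits, outputDir)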
Example #5
def prAnalysis(
    config: Configuration,
    senti: sentistrength.PySentiStr,
    delta: relativedelta,
    batchDates: List[datetime],
):

    print("Querying PRs")
    batches = prRequest(config.pat, config.repositoryOwner,
                        config.repositoryName, delta, batchDates)

    batchParticipants = []
    batchComments = []

    for batchIdx, batch in enumerate(batches):
        print(f"Analyzing PR batch #{batchIdx}")

        # extract data from batch
        prCount = len(batch)
        participants = [
            pr["participants"] for pr in batch if len(pr["participants"]) > 0
        ]
        batchParticipants.append(participants)

        allComments = []
        prPositiveComments = []
        prNegativeComments = []
        generallyNegative = []

        print(f"    Sentiments per PR", end="")

        semaphore = threading.Semaphore(15)
        threads = []
        for pr in batch:

            comments = [
                comment for comment in pr["comments"]
                if comment and comment.strip()
            ]

            # split comments that are longer than 20KB
            splitComments = []
            for comment in comments:

                # calc number of chunks; sys.getsizeof overestimates the raw
                # string size, which only makes the split more conservative
                byteChunks = math.ceil(sys.getsizeof(comment) / (20 * 1024))
                if byteChunks > 1:

                    # calc desired max length of each chunk; round up so the
                    # last chunk keeps the comment's trailing characters
                    chunkLength = math.ceil(len(comment) / byteChunks)

                    # divide comment into chunks
                    chunks = [
                        comment[i * chunkLength:(i + 1) * chunkLength]
                        for i in range(byteChunks)
                    ]

                    # save chunks
                    splitComments.extend(chunks)

                else:
                    # append comment as-is
                    splitComments.append(comment)

            # re-assign comments after chunking
            comments = splitComments

            if len(comments) == 0:
                prPositiveComments.append(0)
                prNegativeComments.append(0)
                continue

            allComments.extend(comments)

            thread = threading.Thread(
                target=analyzeSentiments,
                args=(
                    senti,
                    comments,
                    prPositiveComments,
                    prNegativeComments,
                    generallyNegative,
                    semaphore,
                ),
            )
            threads.append(thread)

        for thread in threads:
            thread.start()

        for thread in threads:
            thread.join()

        print("")

        # save comments
        batchComments.append(allComments)

        # get comment length stats
        commentLengths = [len(c) for c in allComments]

        generallyNegativeRatio = len(generallyNegative) / prCount

        # get pr duration stats
        durations = [(pr["closedAt"] - pr["createdAt"]).days for pr in batch]

        print("    All sentiments")

        commentSentiments = []
        commentSentimentsPositive = 0
        commentSentimentsNegative = 0

        if len(allComments) > 0:
            commentSentiments = senti.getSentiment(allComments)
            commentSentimentsPositive = sum(
                1 for value in commentSentiments if value >= 1)
            commentSentimentsNegative = sum(
                1 for value in commentSentiments if value <= -1)

        toxicityPercentage = getToxicityPercentage(config, allComments)

        centrality.buildGraphQlNetwork(batchIdx, participants, "PRs", config)

        print("    Writing results")
        with open(
                os.path.join(config.resultsPath, f"results_{batchIdx}.csv"),
                "a",
                newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["NumberPRs", prCount])
            w.writerow(["NumberPRComments", len(allComments)])
            w.writerow(["PRCommentsPositive", commentSentimentsPositive])
            w.writerow(["PRCommentsNegative", commentSentimentsNegative])
            w.writerow(["PRCommentsNegativeRatio", generallyNegativeRatio])
            w.writerow(["PRCommentsToxicityPercentage", toxicityPercentage])

        with open(
                os.path.join(config.metricsPath, f"PRCommits_{batchIdx}.csv"),
                "a",
                newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["PR Number", "Commit Count"])
            for pr in batch:
                w.writerow([pr["number"], pr["commitCount"]])

        with open(
                os.path.join(config.metricsPath,
                             f"PRParticipants_{batchIdx}.csv"),
                "a",
                newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["PR Number", "Developer Count"])
            for pr in batch:
                w.writerow([pr["number"], len(set(pr["participants"]))])

        # output statistics
        stats.outputStatistics(
            batchIdx,
            commentLengths,
            "PRCommentsLength",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            durations,
            "PRDuration",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            [len(pr["comments"]) for pr in batch],
            "PRCommentsCount",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            [pr["commitCount"] for pr in batch],
            "PRCommitsCount",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            commentSentiments,
            "PRCommentSentiments",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            [len(set(pr["participants"])) for pr in batch],
            "PRParticipantsCount",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            prPositiveComments,
            "PRCountPositiveComments",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            prNegativeComments,
            "PRCountNegativeComments",
            config.resultsPath,
        )

    return batchParticipants, batchComments
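prAnalysis hands each PR's comments to an analyzeSentiments worker thread, throttled by the shared semaphore. The implementation is not shown; a sketch of a compatible worker (hypothetical, relying on CPython's GIL making list.append effectively atomic):

def analyzeSentiments(senti, comments, positiveComments, negativeComments,
                      generallyNegative, semaphore):
    # hypothetical sketch: score one PR's comments while holding the
    # semaphore, so at most 15 SentiStrength requests run concurrently
    with semaphore:
        sentiments = senti.getSentiment(comments)
    positive = sum(1 for score in sentiments if score >= 1)
    negative = sum(1 for score in sentiments if score <= -1)
    positiveComments.append(positive)
    negativeComments.append(negative)
    if negative > positive:
        # prAnalysis only counts entries, so appending any marker works
        generallyNegative.append(True)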
Example #6
def prepareGraph(
    allRelatedAuthors: dict,
    authorItems: Counter,
    batchIdx: int,
    outputPrefix: str,
    config: Configuration,
):

    # prepare graph
    print("Preparing NX graph")
    G = nx.Graph()

    for author in allRelatedAuthors:
        authorName = author.strip()
        G.add_node(authorName)

        for relatedAuthor in allRelatedAuthors[author]:
            # strip consistently so node and edge names always match
            G.add_edge(authorName, relatedAuthor.strip())

    # analyze graph
    closeness = dict(nx.closeness_centrality(G))
    betweenness = dict(nx.betweenness_centrality(G))
    centrality = dict(nx.degree_centrality(G))
    density = nx.density(G)
    modularity = []

    try:
        for idx, community in enumerate(greedy_modularity_communities(G)):
            authorCount = len(community)
            communityCommitCount = sum(authorItems[author] for author in community)
            row = [authorCount, communityCommitCount]
            modularity.append(row)
    except ZeroDivisionError:
        # degenerate graphs can make modularity undefined; skip communities
        pass

    # find high centrality authors (degree centrality > 0.5)
    highCentralityAuthors = [
        author for author, value in centrality.items() if value > 0.5
    ]

    numberHighCentralityAuthors = len(highCentralityAuthors)

    percentageHighCentralityAuthors = numberHighCentralityAuthors / len(
        allRelatedAuthors
    )

    # calculate TFN
    tfn = len(authorItems) - numberHighCentralityAuthors

    # calculate TFC
    tfc = (sum(authorItems[author] for author in highCentralityAuthors) /
           sum(authorItems.values()) * 100)

    print("Outputting CSVs")

    # output non-tabular results
    with open(
        os.path.join(config.resultsPath, f"results_{batchIdx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow([f"{outputPrefix}_Density", density])
        w.writerow([f"{outputPrefix}_Community Count", len(modularity)])
        w.writerow([f"{outputPrefix}_TFN", tfn])
        w.writerow([f"{outputPrefix}_TFC", tfc])

    # output community information
    with open(
        os.path.join(config.metricsPath, f"{outputPrefix}_community_{batchIdx}.csv"),
        "a",
        newline="",
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Community Index", "Author Count", "Item Count"])
        for idx, community in enumerate(modularity):
            w.writerow([idx + 1, community[0], community[1]])

    # combine centrality results
    combined = {}
    for key in closeness:
        single = {
            "Author": key,
            "Closeness": closeness[key],
            "Betweenness": betweenness[key],
            "Centrality": centrality[key],
        }

        combined[key] = single

    # output tabular results
    with open(
        os.path.join(config.metricsPath, f"{outputPrefix}_centrality_{batchIdx}.csv"),
        "w",
        newline="",
    ) as f:
        w = csv.DictWriter(f, ["Author", "Closeness", "Betweenness", "Centrality"])
        w.writeheader()

        for key in combined:
            w.writerow(combined[key])

    # output high centrality authors
    with open(
        os.path.join(config.resultsPath, f"results_{batchIdx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(
            [f"{outputPrefix}_NumberHighCentralityAuthors", numberHighCentralityAuthors]
        )
        w.writerow(
            [
                f"{outputPrefix}_PercentageHighCentralityAuthors",
                percentageHighCentralityAuthors,
            ]
        )

    # output statistics
    outputStatistics(
        batchIdx,
        list(closeness.values()),
        f"{outputPrefix}_Closeness",
        config.resultsPath,
    )

    outputStatistics(
        batchIdx,
        list(betweenness.values()),
        f"{outputPrefix}_Betweenness",
        config.resultsPath,
    )

    outputStatistics(
        batchIdx,
        list(centrality.values()),
        f"{outputPrefix}_Centrality",
        config.resultsPath,
    )

    outputStatistics(
        batchIdx,
        [community[0] for community in modularity],
        f"{outputPrefix}_CommunityAuthorCount",
        config.resultsPath,
    )

    outputStatistics(
        batchIdx,
        [community[1] for community in modularity],
        f"{outputPrefix}_CommunityAuthorItemCount",
        config.resultsPath,
    )

    # output graph
    print("Outputting graph")
    plt.figure(5, figsize=(30, 30))

    nx.draw(
        G,
        with_labels=True,
        node_color="orange",
        node_size=4000,
        edge_color="black",
        linewidths=2,
        font_size=20,
    )

    plt.savefig(
        os.path.join(config.resultsPath, f"{outputPrefix}_{batchIdx}.pdf")
    )

    # clear the shared figure so repeated batch calls don't draw on top of
    # each other
    plt.clf()

    nx.write_graphml(
        G, os.path.join(config.resultsPath, f"{outputPrefix}_{batchIdx}.xml")
    )

    return highCentralityAuthors
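prepareGraph itself is agnostic about where the author relations come from. A hedged sketch of a caller like the buildGraphQlNetwork used in prAnalysis (hypothetical; derives both inputs from per-item participant lists):

def buildGraphQlNetwork(batchIdx: int, batch: list, prefix: str,
                        config: Configuration):
    # hypothetical sketch: every participant of an item is related to
    # every other participant of the same item
    allRelatedAuthors = {}
    authorItems = Counter()

    for participants in batch:
        for author in participants:
            authorItems.update({author: 1})
            related = allRelatedAuthors.setdefault(author, set())
            related.update(p for p in participants if p != author)

    prepareGraph(allRelatedAuthors, authorItems, batchIdx, prefix, config)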
Example #7
def releaseAnalysis(
    allCommits: List[git.Commit],
    config: Configuration,
    delta: relativedelta,
    batchDates: List[datetime],
):

    # sort commits by ascending commit date
    allCommits.sort(key=lambda c: c.committed_datetime)

    print("Querying releases")
    batches = releaseRequest(config, delta, batchDates)

    for batchIdx, batch in enumerate(batches):

        releases = batch["releases"]
        releaseAuthors = set()
        releaseCommitsCount = {}

        for i, release in enumerate(releases):
            releaseCommits = list()
            releaseDate = release["createdAt"]

            # record the release author (the set de-duplicates)
            releaseAuthors.add(release["author"])

            if i == 0:

                # this is the first release, get all commits prior to release created date
                for commit in allCommits:
                    if commit.committed_datetime < releaseDate:
                        releaseCommits.append(commit)
                    else:
                        break

            else:

                # allCommits was trimmed past the previous release, so collect
                # commits from the previous release date up to this release
                prevReleaseDate = releases[i - 1]["createdAt"]
                for commit in allCommits:
                    if (
                        commit.committed_datetime >= prevReleaseDate
                        and commit.committed_datetime < releaseDate
                    ):
                        releaseCommits.append(commit)
                    else:
                        break

            # remove all counted commits from list to improve iteration speed
            allCommits = allCommits[len(releaseCommits) :]

            # calculate authors per release
            commitAuthors = set(commit.author.email for commit in releaseCommits)

            # add results
            releaseCommitsCount[release["name"]] = dict(
                date=release["createdAt"],
                authorsCount=len(commitAuthors),
                commitsCount=len(releaseCommits),
            )

        # sort releases by date ascending
        releaseCommitsCount = {
            key: value
            for key, value in sorted(
                releaseCommitsCount.items(), key=lambda r: r[1]["date"]
            )
        }

        print("Writing results")
        with open(
            os.path.join(config.resultsPath, f"results_{batchIdx}.csv"), "a", newline=""
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["NumberReleases", batch["releaseCount"]])
            w.writerow(["NumberReleaseAuthors", len(releaseAuthors)])

        with open(
            os.path.join(config.metricsPath, f"releases_{batchIdx}.csv"),
            "a",
            newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["Release", "Date", "Author Count", "Commit Count"])
            for key, value in releaseCommitsCount.items():
                w.writerow(
                    [
                        key,
                        value["date"].isoformat(),
                        value["authorsCount"],
                        value["commitsCount"],
                    ]
                )

        stats.outputStatistics(
            batchIdx,
            [value["authorsCount"] for key, value in releaseCommitsCount.items()],
            "ReleaseAuthorCount",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            [value["commitsCount"] for key, value in releaseCommitsCount.items()],
            "ReleaseCommitCount",
            config.resultsPath,
        )
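releaseAnalysis only assumes a particular shape for what releaseRequest returns. An illustrative stub batch that satisfies the function (made-up values, not real query code):

# hypothetical stub of one batch as consumed by releaseAnalysis
exampleBatch = {
    "releaseCount": 2,
    "releases": [
        # createdAt must be timezone-aware to compare against GitPython's
        # tz-aware committed_datetime
        {"name": "v1.0", "author": "alice",
         "createdAt": datetime(2020, 1, 1, tzinfo=pytz.UTC)},
        {"name": "v1.1", "author": "bob",
         "createdAt": datetime(2020, 6, 1, tzinfo=pytz.UTC)},
    ],
}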
Example #8
def graphqlAnalysis(pat: str, repoShortName: str, outputDir: str):

    # split repo by owner and name
    owner, name = splitRepoName(repoShortName)

    print("Querying number of issues")
    issueCount = gql.countIssuesPerRepository(pat, owner, name)

    print("Querying number of PRs")
    prCount = gql.countPullRequestsPerRepository(pat, owner, name)

    print("Querying number of commits per PR")
    prCommitCount = gql.countCommitsPerPullRequest(pat, owner, name)

    print("Querying issue participants")
    issueParticipants, issueParticipantCount = gql.getIssueParticipants(
        pat, owner, name)

    print("Querying PR participants")
    prParticipants, prParticipantCount = gql.getPullRequestParticipants(
        pat, owner, name)

    # join lists and clean memory
    participants = issueParticipants.union(prParticipants)
    del issueParticipants
    del prParticipants

    print("Querying number of comments per issue")
    issueCommentCount = gql.getIssueComments(pat, owner, name)

    print("Writing GraphQL analysis results")
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["NumberIssues", issueCount])
        w.writerow(["NumberPRs", prCount])

    with open(os.path.join(outputDir, "numberCommitsPR.csv"), "a",
              newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["PR Number", "Commit Count"])
        for key, value in prCommitCount.items():
            w.writerow([key, value])

    with open(os.path.join(outputDir, "numberDevelopersIssue.csv"),
              "a",
              newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Issue Number", "Developer Count"])
        for key, value in issueParticipantCount.items():
            w.writerow([key, value])

    with open(os.path.join(outputDir, "numberDevelopersPR.csv"),
              "a",
              newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["PR Number", "Developer Count"])
        for key, value in prParticipantCount.items():
            w.writerow([key, value])

    with open(os.path.join(outputDir, "numberCommentsIssue.csv"),
              "a",
              newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["PR Number", "Commit Count"])
        for key, value in issueCommentCount.items():
            w.writerow([key, value])

    # output statistics
    outputStatistics(list(prCommitCount.values()), "CommitsPRCount",
                     outputDir)

    outputStatistics(
        list(issueParticipantCount.values()),
        "DevelopersIssueCount",
        outputDir,
    )

    outputStatistics(
        list(prParticipantCount.values()),
        "DevelopersPRCount",
        outputDir,
    )

    outputStatistics(
        list(issueCommentCount.values()),
        "CommentsIssueCount",
        outputDir,
    )

    return participants
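splitRepoName is a one-liner; a sketch, assuming "owner/name" short names:

def splitRepoName(repoShortName: str):
    # "owner/name" -> ("owner", "name")
    owner, name = repoShortName.split("/", 1)
    return owner, name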
Example #9
def commitBatchAnalysis(
    idx: int, senti: PySentiStr, commits: List[git.Commit], config: Configuration
):

    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150

    # traverse all commits
    print("Analyzing commits")
    startDate = None
    if config.startDate is not None:
        startDate = datetime.strptime(config.startDate, "%Y-%m-%d")
        startDate = startDate.replace(tzinfo=pytz.UTC)
    # sort commits
    commits.sort(key=lambda o: o.committed_datetime, reverse=True)

    commitMessages = []
    commit: Commit
    lastDate = None
    firstDate = None
    realCommitCount = 0
    for commit in Bar("Processing").iter(commits):
        if startDate is not None and startDate > commit.committed_datetime:
            continue
        # commits are sorted latest-first: the first commit seen gives the
        # project's last commit date, the last one seen gives its first
        if lastDate is None:
            lastDate = commit.committed_date
        firstDate = commit.committed_date
        realCommitCount += 1
        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save info
        timezoneInfo["authors"].add(author)

        if commit.message and commit.message.strip():
            commitMessages.append(commit.message)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest commit
        # by default GitPython orders commits from latest to earliest
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time

        # check if the commit was made during office hours (09:00-17:59)
        if commit.author_tz_offset != 0 and 9 <= time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing commit message sentiment")
    sentimentScores = []
    commitMessageSentimentsPositive = []
    commitMessageSentimentsNegative = []

    if len(commitMessages) > 0:
        sentimentScores = senti.getSentiment(commitMessages)
        commitMessageSentimentsPositive = [
            score for score in sentimentScores if score >= 1
        ]
        commitMessageSentimentsNegative = [
            score for score in sentimentScores if score <= -1
        ]

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for login, author in authorInfoDict.items():

        # flag authors whose commits are almost exclusively (>= 95%) made
        # during office hours as likely sponsored
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        ratio = sponsoredCommitCount / commitCount
        if ratio >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage sponsored authors
    percentageSponsoredAuthors = sponsoredAuthorCount / len(authorInfoDict)

    # calculate active project days
    firstCommitDate = None
    lastCommitDate = None
    if firstDate is not None:
        firstCommitDate = datetime.fromtimestamp(firstDate)
    if lastDate is not None:
        lastCommitDate = datetime.fromtimestamp(lastDate)
    daysActive = 0
    if lastCommitDate is not None:
        daysActive = (lastCommitDate - firstCommitDate).days

    print("Outputting CSVs")

    # output author days on project
    with open(
        os.path.join(config.metricsPath, f"authorDaysOnProject_{idx}.csv"),
        "a",
        newline="",
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(
        os.path.join(config.metricsPath, f"commitsPerAuthor_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(
        os.path.join(config.metricsPath, f"timezones_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for key, timezone in timezoneInfoDict.items():
            w.writerow([key, len(timezone["authors"]), timezone["commitCount"]])

    # output results
    with open(
        os.path.join(config.resultsPath, f"results_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["CommitCount", realCommitCount])
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", "{:%Y-%m-%d}".format(firstCommitDate)])
        w.writerow(["LastCommitDate", "{:%Y-%m-%d}".format(lastCommitDate)])
        w.writerow(["AuthorCount", len([*authorInfoDict])])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len([*timezoneInfoDict])])

    outputStatistics(
        idx,
        [author["activeDays"] for login, author in authorInfoDict.items()],
        "AuthorActiveDays",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [author["commitCount"] for login, author in authorInfoDict.items()],
        "AuthorCommitCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [len(timezone["authors"]) for key, timezone in timezoneInfoDict.items()],
        "TimezoneAuthorCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [timezone["commitCount"] for key, timezone in timezoneInfoDict.items()],
        "TimezoneCommitCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        sentimentScores,
        "CommitMessageSentiment",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        commitMessageSentimentsPositive,
        "CommitMessageSentimentsPositive",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        commitMessageSentimentsNegative,
        "CommitMessageSentimentsNegative",
        config.resultsPath,
    )

    return authorInfoDict, daysActive
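The repeated outputStatistics(idx, data, metric, path) calls suggest a helper that appends summary rows per metric (the earlier examples use an older variant without the batch index). A sketch of a compatible implementation (assumed, not the project's actual code):

import statistics

def outputStatistics(idx: int, data: List[float], metric: str, path: str):
    # hypothetical sketch: append count/mean/stdev rows for one metric to
    # the batch's results file
    with open(os.path.join(path, f"results_{idx}.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow([f"{metric}_count", len(data)])
        if data:
            w.writerow([f"{metric}_mean", statistics.mean(data)])
            w.writerow([f"{metric}_stdev",
                        statistics.stdev(data) if len(data) > 1 else 0])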