def tagAnalysis(repo: git.Repo, outputDir: str):
    print("Analyzing tags")

    tagInfo = []
    print("Sorting (no progress available, may take several minutes to complete)")
    tags = sorted(repo.tags, key=getTaggedDate)

    if len(tags) > 0:
        lastTag = None
        for tag in Bar("Processing").iter(tags):

            # count commits up to this tag; for every tag after the first,
            # only count commits made since the previous tag
            commitCount = 0
            if lastTag is None:
                commitCount = len(list(tag.commit.iter_items(repo, tag.commit)))
            else:
                sinceStr = formatDate(getTaggedDate(lastTag))
                commitCount = len(
                    list(tag.commit.iter_items(repo, tag.commit, after=sinceStr))
                )

            tagInfo.append(
                dict(
                    path=tag.path,
                    date=formatDate(getTaggedDate(tag)),
                    commitCount=commitCount,
                )
            )

            lastTag = tag

    # output non-tabular results
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Tag Count", len(tagInfo)])

    # output tag info
    print("Outputting CSVs")

    with open(os.path.join(outputDir, "tags.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Path", "Date", "Commit Count"])
        for tag in tagInfo:
            w.writerow([tag["path"], tag["date"], tag["commitCount"]])

    outputStatistics(
        [tag["commitCount"] for tag in tagInfo],
        "TagCommitCount",
        outputDir,
    )

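# tagAnalysis relies on getTaggedDate and formatDate, which are defined
# elsewhere in the repo. For reference, a minimal sketch of what getTaggedDate
# plausibly does for GitPython tag references (hypothetical implementation,
# not the project's own):
def getTaggedDateSketch(tag: git.TagReference) -> datetime:
    # annotated tags carry their own tagged_date; lightweight tags have no tag
    # object, so fall back to the date of the commit they point at
    if tag.tag is not None:
        return datetime.fromtimestamp(tag.tag.tagged_date)
    return datetime.fromtimestamp(tag.commit.committed_date)
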
def outputTags(idx: int, tagInfo: List[dict], daysActive: int, config: Configuration):
    # calculate FN
    fn = len(tagInfo) / daysActive * 100

    # output non-tabular results
    with open(
        os.path.join(config.resultsPath, f"results_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Tag Count", len(tagInfo)])

    # output tag info
    print("Outputting CSVs")

    with open(
        os.path.join(config.resultsPath, f"results_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["FN", fn])

    with open(
        os.path.join(config.metricsPath, f"tags_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Path", "Date", "Commit Count"])
        for tag in tagInfo:
            w.writerow([tag["path"], tag["date"], tag["commitCount"]])

    outputStatistics(
        idx,
        [tag["commitCount"] for tag in tagInfo],
        "TagCommitCount",
        config.resultsPath,
    )

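# Worked example for the FN formula above: a project with 12 tags over 400
# active days yields FN = 12 / 400 * 100 = 3.0, i.e. three tags per hundred
# days of project activity.
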
def centralityAnalysis(repo: git.Repo, commits: List[git.Commit], outputDir: str):
    allRelatedAuthors = {}
    authorCommits = Counter({})

    # for all commits...
    print("Analyzing centrality")
    for commit in Bar("Processing").iter(commits):
        author = authorIdExtractor(commit.author)

        # increase author commit count
        authorCommits.update({author: 1})

        # initialize dates for related author analysis
        commitDate = datetime.fromtimestamp(commit.committed_date)
        earliestDate = commitDate + relativedelta(months=-1)
        latestDate = commitDate + relativedelta(months=+1)

        # find authors related to this commit
        # commitRelatedCommits = commit.iter_items(
        #     repo, 'master',
        #     after=earliestDate.strftime('%Y-%m-%d'),
        #     before=latestDate.strftime('%Y-%m-%d'))
        commitRelatedCommits = filter(
            lambda c: findRelatedCommits(author, earliestDate, latestDate, c), commits
        )

        commitRelatedAuthors = set(
            map(lambda c: authorIdExtractor(c.author), commitRelatedCommits)
        )

        # get current related authors collection and update it
        authorRelatedAuthors = allRelatedAuthors.setdefault(author, set())
        authorRelatedAuthors.update(commitRelatedAuthors)

    # prepare graph
    print("Preparing NX graph")
    G = nx.Graph()

    for author in allRelatedAuthors:
        G.add_node(author)
        for relatedAuthor in allRelatedAuthors[author]:
            G.add_edge(author.strip(), relatedAuthor.strip())

    # analyze graph
    closeness = dict(nx.closeness_centrality(G))
    betweenness = dict(nx.betweenness_centrality(G))
    centrality = dict(nx.degree_centrality(G))
    density = nx.density(G)

    modularity = []
    try:
        for idx, community in enumerate(greedy_modularity_communities(G)):
            authorCount = len(community)
            communityCommitCount = sum(authorCommits[author] for author in community)
            row = [authorCount, communityCommitCount]
            modularity.append(row)
    except ZeroDivisionError:
        # not handled
        pass

    # finding high centrality authors
    numberHighCentralityAuthors = len(
        [author for author, centrality in centrality.items() if centrality > 0.5]
    )

    percentageHighCentralityAuthors = numberHighCentralityAuthors / len(
        allRelatedAuthors
    )

    print("Outputting CSVs")

    # output non-tabular results
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Density", density])
        w.writerow(["Community Count", len(modularity)])

    # output community information
    with open(os.path.join(outputDir, "community.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Community Index", "Author Count", "Commit Count"])
        for idx, community in enumerate(modularity):
            w.writerow([idx + 1, community[0], community[1]])

    # combine centrality results
    combined = {}
    for key in closeness:
        single = {
            "Author": key,
            "Closeness": closeness[key],
            "Betweenness": betweenness[key],
            "Centrality": centrality[key],
        }
        combined[key] = single

    # output tabular results
    with open(os.path.join(outputDir, "centrality.csv"), "w", newline="") as f:
        w = csv.DictWriter(f, ["Author", "Closeness", "Betweenness", "Centrality"])
        w.writeheader()
        for key in combined:
            w.writerow(combined[key])

    # output high centrality authors
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["NumberHighCentralityAuthors", numberHighCentralityAuthors])
        w.writerow(
            ["PercentageHighCentralityAuthors", percentageHighCentralityAuthors]
        )

    # output statistics
    outputStatistics(
        [value for key, value in closeness.items()],
        "Closeness",
        outputDir,
    )
    outputStatistics(
        [value for key, value in betweenness.items()],
        "Betweenness",
        outputDir,
    )
    outputStatistics(
        [value for key, value in centrality.items()],
        "Centrality",
        outputDir,
    )
    outputStatistics(
        [community[0] for community in modularity],
        "CommunityAuthorCount",
        outputDir,
    )
    outputStatistics(
        [community[1] for community in modularity],
        "CommunityCommitCount",
        outputDir,
    )

    # output graph to PNG
    print("Outputting graph to PNG")
    graphFigure = plt.figure(5, figsize=(30, 30))
    nx.draw(
        G,
        with_labels=True,
        node_color="orange",
        node_size=4000,
        edge_color="black",
        linewidths=2,
        font_size=20,
    )
    graphFigure.savefig(os.path.join(outputDir, "graph.png"))

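# findRelatedCommits is imported from elsewhere in the repo; a minimal sketch
# of the predicate centralityAnalysis appears to expect (hypothetical, for
# illustration only): another author's commit counts as "related" when it
# falls inside the two-month window around the commit being analyzed.
def findRelatedCommitsSketch(author, earliestDate, latestDate, commit) -> bool:
    # ignore the analyzed author's own commits
    if authorIdExtractor(commit.author) == author:
        return False
    commitDate = datetime.fromtimestamp(commit.committed_date)
    return earliestDate <= commitDate <= latestDate
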
def commitAnalysis(commits: List[git.Commit], outputDir: str):
    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150

    # traverse all commits
    print("Analyzing commits")
    for commit in Bar("Processing").iter(commits):

        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save author
        timezoneInfo["authors"].add(author)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest commit
        # by default GitPython orders commits from latest to earliest
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time

        # check if commit was between 9 and 5
        if commit.author_tz_offset != 0 and 9 <= time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for login, author in authorInfoDict.items():

        # check if sponsored
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        diff = sponsoredCommitCount / commitCount
        if diff >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage sponsored authors
    percentageSponsoredAuthors = sponsoredAuthorCount / len([*authorInfoDict])

    # calculate active project days
    firstCommitDate = datetime.fromtimestamp(commits[len(commits) - 1].committed_date)
    lastCommitDate = datetime.fromtimestamp(commits[0].committed_date)
    daysActive = (lastCommitDate - firstCommitDate).days

    print("Outputting CSVs")

    # output author days on project
    with open(
        os.path.join(outputDir, "authorDaysOnProject.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(os.path.join(outputDir, "commitsPerAuthor.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(os.path.join(outputDir, "timezones.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for timezone in timezoneInfoDict:
            timezoneInfo = timezoneInfoDict[timezone]
            w.writerow(
                [timezone, len(timezoneInfo["authors"]), timezoneInfo["commitCount"]]
            )

    # output project info
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", "{:%Y-%m-%d}".format(firstCommitDate)])
        w.writerow(["LastCommitDate", "{:%Y-%m-%d}".format(lastCommitDate)])
        w.writerow(["AuthorCount", len([*authorInfoDict])])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len([*timezoneInfoDict])])

    outputStatistics(
        [author["activeDays"] for login, author in authorInfoDict.items()],
        "ActiveDays",
        outputDir,
    )
    outputStatistics(
        [author["commitCount"] for login, author in authorInfoDict.items()],
        "CommitCount",
        outputDir,
    )

    return authorInfoDict

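# Minimal usage sketch for commitAnalysis (the clone path and output directory
# are assumptions). GitPython yields commits newest-first, which is the order
# the earliest/latest-commit bookkeeping above relies on:
#
#   repo = git.Repo("/path/to/clone")
#   commits = list(repo.iter_commits())
#   authorInfoDict = commitAnalysis(commits, "./output")
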
def prAnalysis(
    config: Configuration,
    senti: sentistrength.PySentiStr,
    delta: relativedelta,
    batchDates: List[datetime],
):
    print("Querying PRs")
    batches = prRequest(
        config.pat, config.repositoryOwner, config.repositoryName, delta, batchDates
    )

    batchParticipants = list()
    batchComments = list()

    for batchIdx, batch in enumerate(batches):
        print(f"Analyzing PR batch #{batchIdx}")

        # extract data from batch
        prCount = len(batch)
        participants = list(
            pr["participants"] for pr in batch if len(pr["participants"]) > 0
        )
        batchParticipants.append(participants)

        allComments = list()
        prPositiveComments = list()
        prNegativeComments = list()
        generallyNegative = list()

        print(" Sentiments per PR", end="")

        semaphore = threading.Semaphore(15)
        threads = []
        for pr in batch:

            comments = list(
                comment for comment in pr["comments"] if comment and comment.strip()
            )

            # split comments that are longer than 20KB
            splitComments = []
            for comment in comments:

                # calc number of chunks
                byteChunks = math.ceil(sys.getsizeof(comment) / (20 * 1024))
                if byteChunks > 1:

                    # calc desired max length of each chunk; round up so the
                    # final characters of the comment are not dropped
                    chunkLength = math.ceil(len(comment) / byteChunks)

                    # divide comment into chunks
                    chunks = [
                        comment[i * chunkLength : i * chunkLength + chunkLength]
                        for i in range(0, byteChunks)
                    ]

                    # save chunks
                    splitComments.extend(chunks)
                else:
                    # append comment as-is
                    splitComments.append(comment)

            # re-assign comments after chunking
            comments = splitComments

            if len(comments) == 0:
                prPositiveComments.append(0)
                prNegativeComments.append(0)
                continue

            allComments.extend(comments)

            thread = threading.Thread(
                target=analyzeSentiments,
                args=(
                    senti,
                    comments,
                    prPositiveComments,
                    prNegativeComments,
                    generallyNegative,
                    semaphore,
                ),
            )
            threads.append(thread)

        for thread in threads:
            thread.start()

        for thread in threads:
            thread.join()

        print("")

        # save comments
        batchComments.append(allComments)

        # get comment length stats
        commentLengths = [len(c) for c in allComments]

        generallyNegativeRatio = len(generallyNegative) / prCount

        # get pr duration stats
        durations = [(pr["closedAt"] - pr["createdAt"]).days for pr in batch]

        print(" All sentiments")

        commentSentiments = []
        commentSentimentsPositive = 0
        commentSentimentsNegative = 0

        if len(allComments) > 0:
            commentSentiments = senti.getSentiment(allComments)
            commentSentimentsPositive = sum(
                1 for _ in filter(lambda value: value >= 1, commentSentiments)
            )
            commentSentimentsNegative = sum(
                1 for _ in filter(lambda value: value <= -1, commentSentiments)
            )

        toxicityPercentage = getToxicityPercentage(config, allComments)

        centrality.buildGraphQlNetwork(batchIdx, participants, "PRs", config)

        print(" Writing results")
        with open(
            os.path.join(config.resultsPath, f"results_{batchIdx}.csv"),
            "a",
            newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["NumberPRs", prCount])
            w.writerow(["NumberPRComments", len(allComments)])
            w.writerow(["PRCommentsPositive", commentSentimentsPositive])
            w.writerow(["PRCommentsNegative", commentSentimentsNegative])
            w.writerow(["PRCommentsNegativeRatio", generallyNegativeRatio])
            w.writerow(["PRCommentsToxicityPercentage", toxicityPercentage])

        with open(
            os.path.join(config.metricsPath, f"PRCommits_{batchIdx}.csv"),
            "a",
            newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["PR Number", "Commit Count"])
            for pr in batch:
                w.writerow([pr["number"], pr["commitCount"]])

        with open(
            os.path.join(config.metricsPath, f"PRParticipants_{batchIdx}.csv"),
            "a",
            newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["PR Number", "Developer Count"])
            for pr in batch:
                w.writerow([pr["number"], len(set(pr["participants"]))])

        # output statistics
        stats.outputStatistics(
            batchIdx,
            commentLengths,
            "PRCommentsLength",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            durations,
            "PRDuration",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            [len(pr["comments"]) for pr in batch],
            "PRCommentsCount",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            [pr["commitCount"] for pr in batch],
            "PRCommitsCount",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            commentSentiments,
            "PRCommentSentiments",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            [len(set(pr["participants"])) for pr in batch],
            "PRParticipantsCount",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            prPositiveComments,
            "PRCountPositiveComments",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            prNegativeComments,
            "PRCountNegativeComments",
            config.resultsPath,
        )

    return batchParticipants, batchComments

def prepareGraph(
    allRelatedAuthors: dict,
    authorItems: Counter,
    batchIdx: int,
    outputPrefix: str,
    config: Configuration,
):
    # prepare graph
    print("Preparing NX graph")
    G = nx.Graph()

    for author in allRelatedAuthors:
        G.add_node(author)
        for relatedAuthor in allRelatedAuthors[author]:
            G.add_edge(author.strip(), relatedAuthor.strip())

    # analyze graph
    closeness = dict(nx.closeness_centrality(G))
    betweenness = dict(nx.betweenness_centrality(G))
    centrality = dict(nx.degree_centrality(G))
    density = nx.density(G)

    modularity = []
    try:
        for idx, community in enumerate(greedy_modularity_communities(G)):
            authorCount = len(community)
            communityCommitCount = sum(authorItems[author] for author in community)
            row = [authorCount, communityCommitCount]
            modularity.append(row)
    except ZeroDivisionError:
        # not handled
        pass

    # finding high centrality authors
    highCentralityAuthors = [
        author for author, centrality in centrality.items() if centrality > 0.5
    ]

    numberHighCentralityAuthors = len(highCentralityAuthors)

    percentageHighCentralityAuthors = numberHighCentralityAuthors / len(
        allRelatedAuthors
    )

    # calculate TFN
    tfn = len(authorItems) - numberHighCentralityAuthors

    # calculate TFC
    tfc = (
        sum(authorItems[author] for author in highCentralityAuthors)
        / sum(authorItems.values())
        * 100
    )

    print("Outputting CSVs")

    # output non-tabular results
    with open(
        os.path.join(config.resultsPath, f"results_{batchIdx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow([f"{outputPrefix}_Density", density])
        w.writerow([f"{outputPrefix}_Community Count", len(modularity)])
        w.writerow([f"{outputPrefix}_TFN", tfn])
        w.writerow([f"{outputPrefix}_TFC", tfc])

    # output community information
    with open(
        os.path.join(config.metricsPath, f"{outputPrefix}_community_{batchIdx}.csv"),
        "a",
        newline="",
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Community Index", "Author Count", "Item Count"])
        for idx, community in enumerate(modularity):
            w.writerow([idx + 1, community[0], community[1]])

    # combine centrality results
    combined = {}
    for key in closeness:
        single = {
            "Author": key,
            "Closeness": closeness[key],
            "Betweenness": betweenness[key],
            "Centrality": centrality[key],
        }
        combined[key] = single

    # output tabular results
    with open(
        os.path.join(config.metricsPath, f"{outputPrefix}_centrality_{batchIdx}.csv"),
        "w",
        newline="",
    ) as f:
        w = csv.DictWriter(f, ["Author", "Closeness", "Betweenness", "Centrality"])
        w.writeheader()
        for key in combined:
            w.writerow(combined[key])

    # output high centrality authors
    with open(
        os.path.join(config.resultsPath, f"results_{batchIdx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(
            [f"{outputPrefix}_NumberHighCentralityAuthors", numberHighCentralityAuthors]
        )
        w.writerow(
            [
                f"{outputPrefix}_PercentageHighCentralityAuthors",
                percentageHighCentralityAuthors,
            ]
        )

    # output statistics
    outputStatistics(
        batchIdx,
        [value for key, value in closeness.items()],
        f"{outputPrefix}_Closeness",
        config.resultsPath,
    )
    outputStatistics(
        batchIdx,
        [value for key, value in betweenness.items()],
        f"{outputPrefix}_Betweenness",
        config.resultsPath,
    )
    outputStatistics(
        batchIdx,
        [value for key, value in centrality.items()],
        f"{outputPrefix}_Centrality",
        config.resultsPath,
    )
    outputStatistics(
        batchIdx,
        [community[0] for community in modularity],
        f"{outputPrefix}_CommunityAuthorCount",
        config.resultsPath,
    )
    outputStatistics(
        batchIdx,
        [community[1] for community in modularity],
        f"{outputPrefix}_CommunityAuthorItemCount",
        config.resultsPath,
    )

    # output graph
    print("Outputting graph")
    graphFigure = plt.figure(figsize=(30, 30))
    nx.draw(
        G,
        with_labels=True,
        node_color="orange",
        node_size=4000,
        edge_color="black",
        linewidths=2,
        font_size=20,
    )
    plt.savefig(os.path.join(config.resultsPath, f"{outputPrefix}_{batchIdx}.pdf"))
    nx.write_graphml(
        G, os.path.join(config.resultsPath, f"{outputPrefix}_{batchIdx}.xml")
    )
    # close the figure so successive batches do not draw onto the same canvas
    plt.close(graphFigure)

    return highCentralityAuthors

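# Usage sketch for prepareGraph (argument values are assumptions): authorItems
# maps author ids to how many items (commits, issues, PRs, ...) each one
# contributed in the batch, and outputPrefix namespaces the CSV rows so several
# networks can share one results file:
#
#   coreDevs = prepareGraph(
#       relatedAuthors, Counter(authorCommitCounts), 0, "commitCentrality", config
#   )
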
def releaseAnalysis(
    allCommits: List[git.Commit],
    config: Configuration,
    delta: relativedelta,
    batchDates: List[datetime],
):
    # sort commits by ascending commit date
    allCommits.sort(key=lambda c: c.committed_datetime)

    print("Querying releases")
    batches = releaseRequest(config, delta, batchDates)

    for batchIdx, batch in enumerate(batches):

        releases = batch["releases"]
        releaseAuthors = set()
        releaseCommitsCount = {}
        for i, release in enumerate(releases):
            releaseCommits = list()
            releaseDate = release["createdAt"]

            # try add author to set
            releaseAuthors.add(release["author"])

            if i == 0:
                # this is the first release, get all commits prior to release created date
                for commit in allCommits:
                    if commit.committed_datetime < releaseDate:
                        releaseCommits.append(commit)
                    else:
                        break
            else:
                # get in-between commit count
                prevReleaseDate = releases[i - 1]["createdAt"]
                for commit in allCommits:
                    if (
                        commit.committed_datetime >= prevReleaseDate
                        and commit.committed_datetime < releaseDate
                    ):
                        releaseCommits.append(commit)
                    else:
                        break

            # remove all counted commits from list to improve iteration speed
            allCommits = allCommits[len(releaseCommits) :]

            # calculate authors per release
            commitAuthors = set(commit.author.email for commit in releaseCommits)

            # add results
            releaseCommitsCount[release["name"]] = dict(
                date=release["createdAt"],
                authorsCount=len(commitAuthors),
                commitsCount=len(releaseCommits),
            )

        # sort releases by date ascending
        releaseCommitsCount = {
            key: value
            for key, value in sorted(
                releaseCommitsCount.items(), key=lambda r: r[1]["date"]
            )
        }

        print("Writing results")
        with open(
            os.path.join(config.resultsPath, f"results_{batchIdx}.csv"), "a", newline=""
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["NumberReleases", batch["releaseCount"]])
            w.writerow(["NumberReleaseAuthors", len(releaseAuthors)])

        with open(
            os.path.join(config.metricsPath, f"releases_{batchIdx}.csv"),
            "a",
            newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["Release", "Date", "Author Count", "Commit Count"])
            for key, value in releaseCommitsCount.items():
                w.writerow(
                    [
                        key,
                        value["date"].isoformat(),
                        value["authorsCount"],
                        value["commitsCount"],
                    ]
                )

        stats.outputStatistics(
            batchIdx,
            [value["authorsCount"] for key, value in releaseCommitsCount.items()],
            "ReleaseAuthorCount",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            [value["commitsCount"] for key, value in releaseCommitsCount.items()],
            "ReleaseCommitCount",
            config.resultsPath,
        )

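# Batching note (an assumption about releaseRequest, which is defined
# elsewhere): each batch is taken to cover the window starting at batchDates[i]
# and spanning delta, e.g. delta = relativedelta(months=3) buckets the releases
# per quarter, one results_{batchIdx}.csv per window.
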
def graphqlAnalysis(pat: str, repoShortName: str, outputDir: str):
    # split repo by owner and name
    owner, name = splitRepoName(repoShortName)

    print("Querying number of issues")
    issueCount = gql.countIssuesPerRepository(pat, owner, name)

    print("Querying number of PRs")
    prCount = gql.countPullRequestsPerRepository(pat, owner, name)

    print("Querying number of commits per PR")
    prCommitCount = gql.countCommitsPerPullRequest(pat, owner, name)

    print("Querying issue participants")
    issueParticipants, issueParticipantCount = gql.getIssueParticipants(
        pat, owner, name
    )

    print("Querying PR participants")
    prParticipants, prParticipantCount = gql.getPullRequestParticipants(
        pat, owner, name
    )

    # join lists and clean memory
    participants = issueParticipants.union(prParticipants)
    del issueParticipants
    del prParticipants

    print("Querying number of comments per issue")
    issueCommentCount = gql.getIssueComments(pat, owner, name)

    print("Writing GraphQL analysis results")
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["NumberIssues", issueCount])
        w.writerow(["NumberPRs", prCount])

    with open(os.path.join(outputDir, "numberCommitsPR.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["PR Number", "Commit Count"])
        for key, value in prCommitCount.items():
            w.writerow([key, value])

    with open(
        os.path.join(outputDir, "numberDevelopersIssue.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Issue Number", "Developer Count"])
        for key, value in issueParticipantCount.items():
            w.writerow([key, value])

    with open(os.path.join(outputDir, "numberDevelopersPR.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["PR Number", "Developer Count"])
        for key, value in prParticipantCount.items():
            w.writerow([key, value])

    with open(
        os.path.join(outputDir, "numberCommentsIssue.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Issue Number", "Comment Count"])
        for key, value in issueCommentCount.items():
            w.writerow([key, value])

    # output statistics
    outputStatistics(
        [value for key, value in prCommitCount.items()],
        "CommitsPRCount",
        outputDir,
    )
    outputStatistics(
        [value for key, value in issueParticipantCount.items()],
        "DevelopersIssueCount",
        outputDir,
    )
    outputStatistics(
        [value for key, value in prParticipantCount.items()],
        "DevelopersPRCount",
        outputDir,
    )
    outputStatistics(
        [value for key, value in issueCommentCount.items()],
        "CommentsIssueCount",
        outputDir,
    )

    return participants

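# Usage sketch for graphqlAnalysis (token and repository are placeholders;
# "owner/name" is the short-name format splitRepoName appears to expect):
#
#   pat = os.environ["GITHUB_PAT"]
#   participants = graphqlAnalysis(pat, "owner/name", "./output")
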
def commitBatchAnalysis(
    idx: int, senti: PySentiStr, commits: List[git.Commit], config: Configuration
):
    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150

    # traverse all commits
    print("Analyzing commits")
    startDate = None
    if config.startDate is not None:
        startDate = datetime.strptime(config.startDate, "%Y-%m-%d")
        startDate = startDate.replace(tzinfo=pytz.UTC)

    # sort commits
    commits.sort(key=lambda o: o.committed_datetime, reverse=True)

    commitMessages = []
    commit: Commit
    lastDate = None
    firstDate = None
    realCommitCount = 0
    for commit in Bar("Processing").iter(commits):
        if startDate is not None and startDate > commit.committed_datetime:
            continue
        if lastDate is None:
            lastDate = commit.committed_date
        firstDate = commit.committed_date
        realCommitCount += 1

        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save info
        timezoneInfo["authors"].add(author)

        if commit.message and commit.message.strip():
            commitMessages.append(commit.message)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest commit
        # by default GitPython orders commits from latest to earliest
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time

        # check if commit was between 9 and 5
        if commit.author_tz_offset != 0 and 9 <= time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing commit message sentiment")
    sentimentScores = []
    commitMessageSentimentsPositive = []
    commitMessageSentimentsNegative = []

    if len(commitMessages) > 0:
        sentimentScores = senti.getSentiment(commitMessages)
        commitMessageSentimentsPositive = list(
            result for result in filter(lambda value: value >= 1, sentimentScores)
        )
        commitMessageSentimentsNegative = list(
            result for result in filter(lambda value: value <= -1, sentimentScores)
        )

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for login, author in authorInfoDict.items():

        # check if sponsored
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        diff = sponsoredCommitCount / commitCount
        if diff >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage sponsored authors
    percentageSponsoredAuthors = sponsoredAuthorCount / len([*authorInfoDict])

    # calculate active project days
    firstCommitDate = None
    lastCommitDate = None
    if firstDate is not None:
        firstCommitDate = datetime.fromtimestamp(firstDate)
    if lastDate is not None:
        lastCommitDate = datetime.fromtimestamp(lastDate)
    daysActive = 0
    if lastCommitDate is not None:
        daysActive = (lastCommitDate - firstCommitDate).days

    print("Outputting CSVs")

    # output author days on project
    with open(
        os.path.join(config.metricsPath, f"authorDaysOnProject_{idx}.csv"),
        "a",
        newline="",
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(
        os.path.join(config.metricsPath, f"commitsPerAuthor_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(
        os.path.join(config.metricsPath, f"timezones_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for key, timezone in timezoneInfoDict.items():
            w.writerow([key, len(timezone["authors"]), timezone["commitCount"]])

    # output results
    with open(
        os.path.join(config.resultsPath, f"results_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["CommitCount", realCommitCount])
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", "{:%Y-%m-%d}".format(firstCommitDate)])
        w.writerow(["LastCommitDate", "{:%Y-%m-%d}".format(lastCommitDate)])
        w.writerow(["AuthorCount", len([*authorInfoDict])])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len([*timezoneInfoDict])])

    outputStatistics(
        idx,
        [author["activeDays"] for login, author in authorInfoDict.items()],
        "AuthorActiveDays",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        [author["commitCount"] for login, author in authorInfoDict.items()],
        "AuthorCommitCount",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        [len(timezone["authors"]) for key, timezone in timezoneInfoDict.items()],
        "TimezoneAuthorCount",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        [timezone["commitCount"] for key, timezone in timezoneInfoDict.items()],
        "TimezoneCommitCount",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        sentimentScores,
        "CommitMessageSentiment",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        commitMessageSentimentsPositive,
        "CommitMessageSentimentsPositive",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        commitMessageSentimentsNegative,
        "CommitMessageSentimentsNegative",
        config.resultsPath,
    )

    return authorInfoDict, daysActive

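# Usage sketch for commitBatchAnalysis (jar and data paths are assumptions;
# PySentiStr needs the SentiStrength jar and language folder configured before
# getSentiment can run):
#
#   senti = PySentiStr()
#   senti.setSentiStrengthPath("/path/to/SentiStrength.jar")
#   senti.setSentiStrengthLanguageFolderPath("/path/to/SentiStrength_Data/")
#   authorInfoDict, daysActive = commitBatchAnalysis(0, senti, commits, config)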