def tagAnalysis(repo: git.Repo, outputDir: str):
    print("Analyzing tags")

    tagInfo = []
    print("Sorting (no progress available, may take several minutes to complete)")
    tags = sorted(repo.tags, key=getTaggedDate)

    if len(tags) > 0:
        lastTag = None
        for tag in Bar("Processing").iter(tags):

            # count commits up to this tag; for every tag after the first,
            # only count commits made since the previous tag
            commitCount = 0
            if lastTag is None:
                commitCount = len(list(tag.commit.iter_items(repo, tag.commit)))
            else:
                sinceStr = formatDate(getTaggedDate(lastTag))
                commitCount = len(
                    list(tag.commit.iter_items(repo, tag.commit, after=sinceStr))
                )

            tagInfo.append(
                dict(
                    path=tag.path,
                    date=formatDate(getTaggedDate(tag)),
                    commitCount=commitCount,
                )
            )

            lastTag = tag

    # output non-tabular results
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Tag Count", len(tagInfo)])

    # output tag info
    print("Outputting CSVs")

    with open(os.path.join(outputDir, "tags.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Path", "Date", "Commit Count"])
        for tag in tagInfo:
            w.writerow([tag["path"], tag["date"], tag["commitCount"]])

    outputStatistics(
        [tag["commitCount"] for tag in tagInfo],
        "TagCommitCount",
        outputDir,
    )

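# tagAnalysis relies on getTaggedDate and formatDate, which are defined
# elsewhere in the repo. For reference, a minimal sketch of what getTaggedDate
# plausibly does for GitPython tag references (hypothetical implementation,
# not the project's own):
def getTaggedDateSketch(tag: git.TagReference) -> datetime:
    # annotated tags carry their own tagged_date; lightweight tags have no tag
    # object, so fall back to the date of the commit they point at
    if tag.tag is not None:
        return datetime.fromtimestamp(tag.tag.tagged_date)
    return datetime.fromtimestamp(tag.commit.committed_date)
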
def outputTags(idx: int, tagInfo: List[dict], daysActive: int, config: Configuration):
    # calculate FN
    fn = len(tagInfo) / daysActive * 100

    # output non-tabular results
    with open(
        os.path.join(config.resultsPath, f"results_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Tag Count", len(tagInfo)])

    # output tag info
    print("Outputting CSVs")

    with open(
        os.path.join(config.resultsPath, f"results_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["FN", fn])

    with open(
        os.path.join(config.metricsPath, f"tags_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Path", "Date", "Commit Count"])
        for tag in tagInfo:
            w.writerow([tag["path"], tag["date"], tag["commitCount"]])

    outputStatistics(
        idx,
        [tag["commitCount"] for tag in tagInfo],
        "TagCommitCount",
        config.resultsPath,
    )

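# Worked example for the FN formula above: a project with 12 tags over 400
# active days yields FN = 12 / 400 * 100 = 3.0, i.e. three tags per hundred
# days of project activity.
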
def centralityAnalysis(repo: git.Repo, commits: List[git.Commit], outputDir: str):
    allRelatedAuthors = {}
    authorCommits = Counter({})

    # for all commits...
    print("Analyzing centrality")
    for commit in Bar("Processing").iter(commits):
        author = authorIdExtractor(commit.author)

        # increase author commit count
        authorCommits.update({author: 1})

        # initialize dates for related author analysis
        commitDate = datetime.fromtimestamp(commit.committed_date)
        earliestDate = commitDate + relativedelta(months=-1)
        latestDate = commitDate + relativedelta(months=+1)

        # find authors related to this commit
        # commitRelatedCommits = commit.iter_items(
        #     repo, 'master',
        #     after=earliestDate.strftime('%Y-%m-%d'),
        #     before=latestDate.strftime('%Y-%m-%d'))
        commitRelatedCommits = filter(
            lambda c: findRelatedCommits(author, earliestDate, latestDate, c), commits
        )

        commitRelatedAuthors = set(
            map(lambda c: authorIdExtractor(c.author), commitRelatedCommits)
        )

        # get current related authors collection and update it
        authorRelatedAuthors = allRelatedAuthors.setdefault(author, set())
        authorRelatedAuthors.update(commitRelatedAuthors)

    # prepare graph
    print("Preparing NX graph")
    G = nx.Graph()

    for author in allRelatedAuthors:
        G.add_node(author)
        for relatedAuthor in allRelatedAuthors[author]:
            G.add_edge(author.strip(), relatedAuthor.strip())

    # analyze graph
    closeness = dict(nx.closeness_centrality(G))
    betweenness = dict(nx.betweenness_centrality(G))
    centrality = dict(nx.degree_centrality(G))
    density = nx.density(G)

    modularity = []
    try:
        for idx, community in enumerate(greedy_modularity_communities(G)):
            authorCount = len(community)
            communityCommitCount = sum(authorCommits[author] for author in community)
            row = [authorCount, communityCommitCount]
            modularity.append(row)
    except ZeroDivisionError:
        # not handled
        pass

    # finding high centrality authors
    numberHighCentralityAuthors = len(
        [author for author, centrality in centrality.items() if centrality > 0.5]
    )

    percentageHighCentralityAuthors = numberHighCentralityAuthors / len(
        allRelatedAuthors
    )

    print("Outputting CSVs")

    # output non-tabular results
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Density", density])
        w.writerow(["Community Count", len(modularity)])

    # output community information
    with open(os.path.join(outputDir, "community.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Community Index", "Author Count", "Commit Count"])
        for idx, community in enumerate(modularity):
            w.writerow([idx + 1, community[0], community[1]])

    # combine centrality results
    combined = {}
    for key in closeness:
        single = {
            "Author": key,
            "Closeness": closeness[key],
            "Betweenness": betweenness[key],
            "Centrality": centrality[key],
        }
        combined[key] = single

    # output tabular results
    with open(os.path.join(outputDir, "centrality.csv"), "w", newline="") as f:
        w = csv.DictWriter(f, ["Author", "Closeness", "Betweenness", "Centrality"])
        w.writeheader()
        for key in combined:
            w.writerow(combined[key])

    # output high centrality authors
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["NumberHighCentralityAuthors", numberHighCentralityAuthors])
        w.writerow(
            ["PercentageHighCentralityAuthors", percentageHighCentralityAuthors]
        )

    # output statistics
    outputStatistics(
        [value for key, value in closeness.items()],
        "Closeness",
        outputDir,
    )
    outputStatistics(
        [value for key, value in betweenness.items()],
        "Betweenness",
        outputDir,
    )
    outputStatistics(
        [value for key, value in centrality.items()],
        "Centrality",
        outputDir,
    )
    outputStatistics(
        [community[0] for community in modularity],
        "CommunityAuthorCount",
        outputDir,
    )
    outputStatistics(
        [community[1] for community in modularity],
        "CommunityCommitCount",
        outputDir,
    )

    # output graph to PNG
    print("Outputting graph to PNG")
    graphFigure = plt.figure(5, figsize=(30, 30))
    nx.draw(
        G,
        with_labels=True,
        node_color="orange",
        node_size=4000,
        edge_color="black",
        linewidths=2,
        font_size=20,
    )
    graphFigure.savefig(os.path.join(outputDir, "graph.png"))

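# findRelatedCommits is imported from elsewhere in the repo; a minimal sketch
# of the predicate centralityAnalysis appears to expect (hypothetical, for
# illustration only): another author's commit counts as "related" when it
# falls inside the two-month window around the commit being analyzed.
def findRelatedCommitsSketch(author, earliestDate, latestDate, commit) -> bool:
    # ignore the analyzed author's own commits
    if authorIdExtractor(commit.author) == author:
        return False
    commitDate = datetime.fromtimestamp(commit.committed_date)
    return earliestDate <= commitDate <= latestDate
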
def commitAnalysis(commits: List[git.Commit], outputDir: str):
    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150

    # traverse all commits
    print("Analyzing commits")
    for commit in Bar("Processing").iter(commits):

        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save author
        timezoneInfo["authors"].add(author)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest commit
        # by default GitPython orders commits from latest to earliest
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time

        # check if commit was between 9 and 5
        if commit.author_tz_offset != 0 and 9 <= time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for login, author in authorInfoDict.items():

        # check if sponsored
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        diff = sponsoredCommitCount / commitCount
        if diff >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage sponsored authors
    percentageSponsoredAuthors = sponsoredAuthorCount / len([*authorInfoDict])

    # calculate active project days
    firstCommitDate = datetime.fromtimestamp(commits[len(commits) - 1].committed_date)
    lastCommitDate = datetime.fromtimestamp(commits[0].committed_date)
    daysActive = (lastCommitDate - firstCommitDate).days

    print("Outputting CSVs")

    # output author days on project
    with open(
        os.path.join(outputDir, "authorDaysOnProject.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(os.path.join(outputDir, "commitsPerAuthor.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(os.path.join(outputDir, "timezones.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for timezone in timezoneInfoDict:
            timezoneInfo = timezoneInfoDict[timezone]
            w.writerow(
                [timezone, len(timezoneInfo["authors"]), timezoneInfo["commitCount"]]
            )

    # output project info
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", "{:%Y-%m-%d}".format(firstCommitDate)])
        w.writerow(["LastCommitDate", "{:%Y-%m-%d}".format(lastCommitDate)])
        w.writerow(["AuthorCount", len([*authorInfoDict])])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len([*timezoneInfoDict])])

    outputStatistics(
        [author["activeDays"] for login, author in authorInfoDict.items()],
        "ActiveDays",
        outputDir,
    )
    outputStatistics(
        [author["commitCount"] for login, author in authorInfoDict.items()],
        "CommitCount",
        outputDir,
    )

    return authorInfoDict

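# Minimal usage sketch for commitAnalysis (the clone path and output directory
# are assumptions). GitPython yields commits newest-first, which is the order
# the earliest/latest-commit bookkeeping above relies on:
#
#   repo = git.Repo("/path/to/clone")
#   commits = list(repo.iter_commits())
#   authorInfoDict = commitAnalysis(commits, "./output")
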
def prAnalysis(
    config: Configuration,
    senti: sentistrength.PySentiStr,
    delta: relativedelta,
    batchDates: List[datetime],
):
    print("Querying PRs")
    batches = prRequest(
        config.pat, config.repositoryOwner, config.repositoryName, delta, batchDates
    )

    batchParticipants = list()
    batchComments = list()

    for batchIdx, batch in enumerate(batches):
        print(f"Analyzing PR batch #{batchIdx}")

        # extract data from batch
        prCount = len(batch)
        participants = list(
            pr["participants"] for pr in batch if len(pr["participants"]) > 0
        )
        batchParticipants.append(participants)

        allComments = list()
        prPositiveComments = list()
        prNegativeComments = list()
        generallyNegative = list()

        print(" Sentiments per PR", end="")

        semaphore = threading.Semaphore(15)
        threads = []
        for pr in batch:

            comments = list(
                comment for comment in pr["comments"] if comment and comment.strip()
            )

            # split comments that are longer than 20KB
            splitComments = []
            for comment in comments:

                # calc number of chunks
                byteChunks = math.ceil(sys.getsizeof(comment) / (20 * 1024))
                if byteChunks > 1:

                    # calc desired max length of each chunk; round up so the
                    # final characters of the comment are not dropped
                    chunkLength = math.ceil(len(comment) / byteChunks)

                    # divide comment into chunks
                    chunks = [
                        comment[i * chunkLength : i * chunkLength + chunkLength]
                        for i in range(0, byteChunks)
                    ]

                    # save chunks
                    splitComments.extend(chunks)
                else:
                    # append comment as-is
                    splitComments.append(comment)

            # re-assign comments after chunking
            comments = splitComments

            if len(comments) == 0:
                prPositiveComments.append(0)
                prNegativeComments.append(0)
                continue

            allComments.extend(comments)

            thread = threading.Thread(
                target=analyzeSentiments,
                args=(
                    senti,
                    comments,
                    prPositiveComments,
                    prNegativeComments,
                    generallyNegative,
                    semaphore,
                ),
            )
            threads.append(thread)

        for thread in threads:
            thread.start()

        for thread in threads:
            thread.join()

        print("")

        # save comments
        batchComments.append(allComments)

        # get comment length stats
        commentLengths = [len(c) for c in allComments]

        generallyNegativeRatio = len(generallyNegative) / prCount

        # get pr duration stats
        durations = [(pr["closedAt"] - pr["createdAt"]).days for pr in batch]

        print(" All sentiments")

        commentSentiments = []
        commentSentimentsPositive = 0
        commentSentimentsNegative = 0

        if len(allComments) > 0:
            commentSentiments = senti.getSentiment(allComments)
            commentSentimentsPositive = sum(
                1 for _ in filter(lambda value: value >= 1, commentSentiments)
            )
            commentSentimentsNegative = sum(
                1 for _ in filter(lambda value: value <= -1, commentSentiments)
            )

        toxicityPercentage = getToxicityPercentage(config, allComments)

        centrality.buildGraphQlNetwork(batchIdx, participants, "PRs", config)

        print(" Writing results")
        with open(
            os.path.join(config.resultsPath, f"results_{batchIdx}.csv"),
            "a",
            newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["NumberPRs", prCount])
            w.writerow(["NumberPRComments", len(allComments)])
            w.writerow(["PRCommentsPositive", commentSentimentsPositive])
            w.writerow(["PRCommentsNegative", commentSentimentsNegative])
            w.writerow(["PRCommentsNegativeRatio", generallyNegativeRatio])
            w.writerow(["PRCommentsToxicityPercentage", toxicityPercentage])

        with open(
            os.path.join(config.metricsPath, f"PRCommits_{batchIdx}.csv"),
            "a",
            newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["PR Number", "Commit Count"])
            for pr in batch:
                w.writerow([pr["number"], pr["commitCount"]])

        with open(
            os.path.join(config.metricsPath, f"PRParticipants_{batchIdx}.csv"),
            "a",
            newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["PR Number", "Developer Count"])
            for pr in batch:
                w.writerow([pr["number"], len(set(pr["participants"]))])

        # output statistics
        stats.outputStatistics(
            batchIdx,
            commentLengths,
            "PRCommentsLength",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            durations,
            "PRDuration",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            [len(pr["comments"]) for pr in batch],
            "PRCommentsCount",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            [pr["commitCount"] for pr in batch],
            "PRCommitsCount",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            commentSentiments,
            "PRCommentSentiments",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            [len(set(pr["participants"])) for pr in batch],
            "PRParticipantsCount",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            prPositiveComments,
            "PRCountPositiveComments",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            prNegativeComments,
            "PRCountNegativeComments",
            config.resultsPath,
        )

    return batchParticipants, batchComments

def prepareGraph(
    allRelatedAuthors: dict,
    authorItems: Counter,
    batchIdx: int,
    outputPrefix: str,
    config: Configuration,
):
    # prepare graph
    print("Preparing NX graph")
    G = nx.Graph()

    for author in allRelatedAuthors:
        G.add_node(author)
        for relatedAuthor in allRelatedAuthors[author]:
            G.add_edge(author.strip(), relatedAuthor.strip())

    # analyze graph
    closeness = dict(nx.closeness_centrality(G))
    betweenness = dict(nx.betweenness_centrality(G))
    centrality = dict(nx.degree_centrality(G))
    density = nx.density(G)

    modularity = []
    try:
        for idx, community in enumerate(greedy_modularity_communities(G)):
            authorCount = len(community)
            communityCommitCount = sum(authorItems[author] for author in community)
            row = [authorCount, communityCommitCount]
            modularity.append(row)
    except ZeroDivisionError:
        # not handled
        pass

    # finding high centrality authors
    highCentralityAuthors = [
        author for author, centrality in centrality.items() if centrality > 0.5
    ]

    numberHighCentralityAuthors = len(highCentralityAuthors)

    percentageHighCentralityAuthors = numberHighCentralityAuthors / len(
        allRelatedAuthors
    )

    # calculate TFN
    tfn = len(authorItems) - numberHighCentralityAuthors

    # calculate TFC
    tfc = (
        sum(authorItems[author] for author in highCentralityAuthors)
        / sum(authorItems.values())
        * 100
    )

    print("Outputting CSVs")

    # output non-tabular results
    with open(
        os.path.join(config.resultsPath, f"results_{batchIdx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow([f"{outputPrefix}_Density", density])
        w.writerow([f"{outputPrefix}_Community Count", len(modularity)])
        w.writerow([f"{outputPrefix}_TFN", tfn])
        w.writerow([f"{outputPrefix}_TFC", tfc])

    # output community information
    with open(
        os.path.join(config.metricsPath, f"{outputPrefix}_community_{batchIdx}.csv"),
        "a",
        newline="",
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Community Index", "Author Count", "Item Count"])
        for idx, community in enumerate(modularity):
            w.writerow([idx + 1, community[0], community[1]])

    # combine centrality results
    combined = {}
    for key in closeness:
        single = {
            "Author": key,
            "Closeness": closeness[key],
            "Betweenness": betweenness[key],
            "Centrality": centrality[key],
        }
        combined[key] = single

    # output tabular results
    with open(
        os.path.join(config.metricsPath, f"{outputPrefix}_centrality_{batchIdx}.csv"),
        "w",
        newline="",
    ) as f:
        w = csv.DictWriter(f, ["Author", "Closeness", "Betweenness", "Centrality"])
        w.writeheader()
        for key in combined:
            w.writerow(combined[key])

    # output high centrality authors
    with open(
        os.path.join(config.resultsPath, f"results_{batchIdx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(
            [f"{outputPrefix}_NumberHighCentralityAuthors", numberHighCentralityAuthors]
        )
        w.writerow(
            [
                f"{outputPrefix}_PercentageHighCentralityAuthors",
                percentageHighCentralityAuthors,
            ]
        )

    # output statistics
    outputStatistics(
        batchIdx,
        [value for key, value in closeness.items()],
        f"{outputPrefix}_Closeness",
        config.resultsPath,
    )
    outputStatistics(
        batchIdx,
        [value for key, value in betweenness.items()],
        f"{outputPrefix}_Betweenness",
        config.resultsPath,
    )
    outputStatistics(
        batchIdx,
        [value for key, value in centrality.items()],
        f"{outputPrefix}_Centrality",
        config.resultsPath,
    )
    outputStatistics(
        batchIdx,
        [community[0] for community in modularity],
        f"{outputPrefix}_CommunityAuthorCount",
        config.resultsPath,
    )
    outputStatistics(
        batchIdx,
        [community[1] for community in modularity],
        f"{outputPrefix}_CommunityAuthorItemCount",
        config.resultsPath,
    )

    # output graph
    print("Outputting graph")
    graphFigure = plt.figure(figsize=(30, 30))
    nx.draw(
        G,
        with_labels=True,
        node_color="orange",
        node_size=4000,
        edge_color="black",
        linewidths=2,
        font_size=20,
    )
    plt.savefig(os.path.join(config.resultsPath, f"{outputPrefix}_{batchIdx}.pdf"))
    nx.write_graphml(
        G, os.path.join(config.resultsPath, f"{outputPrefix}_{batchIdx}.xml")
    )
    # close the figure so successive batches do not draw onto the same canvas
    plt.close(graphFigure)

    return highCentralityAuthors

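# Usage sketch for prepareGraph (argument values are assumptions): authorItems
# maps author ids to how many items (commits, issues, PRs, ...) each one
# contributed in the batch, and outputPrefix namespaces the CSV rows so several
# networks can share one results file:
#
#   coreDevs = prepareGraph(
#       relatedAuthors, Counter(authorCommitCounts), 0, "commitCentrality", config
#   )
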
def releaseAnalysis(
    allCommits: List[git.Commit],
    config: Configuration,
    delta: relativedelta,
    batchDates: List[datetime],
):
    # sort commits by ascending commit date
    allCommits.sort(key=lambda c: c.committed_datetime)

    print("Querying releases")
    batches = releaseRequest(config, delta, batchDates)

    for batchIdx, batch in enumerate(batches):

        releases = batch["releases"]
        releaseAuthors = set()
        releaseCommitsCount = {}
        for i, release in enumerate(releases):
            releaseCommits = list()
            releaseDate = release["createdAt"]

            # try add author to set
            releaseAuthors.add(release["author"])

            if i == 0:
                # this is the first release, get all commits prior to release created date
                for commit in allCommits:
                    if commit.committed_datetime < releaseDate:
                        releaseCommits.append(commit)
                    else:
                        break
            else:
                # get in-between commit count
                prevReleaseDate = releases[i - 1]["createdAt"]
                for commit in allCommits:
                    if (
                        commit.committed_datetime >= prevReleaseDate
                        and commit.committed_datetime < releaseDate
                    ):
                        releaseCommits.append(commit)
                    else:
                        break

            # remove all counted commits from list to improve iteration speed
            allCommits = allCommits[len(releaseCommits) :]

            # calculate authors per release
            commitAuthors = set(commit.author.email for commit in releaseCommits)

            # add results
            releaseCommitsCount[release["name"]] = dict(
                date=release["createdAt"],
                authorsCount=len(commitAuthors),
                commitsCount=len(releaseCommits),
            )

        # sort releases by date ascending
        releaseCommitsCount = {
            key: value
            for key, value in sorted(
                releaseCommitsCount.items(), key=lambda r: r[1]["date"]
            )
        }

        print("Writing results")
        with open(
            os.path.join(config.resultsPath, f"results_{batchIdx}.csv"), "a", newline=""
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["NumberReleases", batch["releaseCount"]])
            w.writerow(["NumberReleaseAuthors", len(releaseAuthors)])

        with open(
            os.path.join(config.metricsPath, f"releases_{batchIdx}.csv"),
            "a",
            newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["Release", "Date", "Author Count", "Commit Count"])
            for key, value in releaseCommitsCount.items():
                w.writerow(
                    [
                        key,
                        value["date"].isoformat(),
                        value["authorsCount"],
                        value["commitsCount"],
                    ]
                )

        stats.outputStatistics(
            batchIdx,
            [value["authorsCount"] for key, value in releaseCommitsCount.items()],
            "ReleaseAuthorCount",
            config.resultsPath,
        )
        stats.outputStatistics(
            batchIdx,
            [value["commitsCount"] for key, value in releaseCommitsCount.items()],
            "ReleaseCommitCount",
            config.resultsPath,
        )

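# Batching note (an assumption about releaseRequest, which is defined
# elsewhere): each batch is taken to cover the window starting at batchDates[i]
# and spanning delta, e.g. delta = relativedelta(months=3) buckets the releases
# per quarter, one results_{batchIdx}.csv per window.
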
def graphqlAnalysis(pat: str, repoShortName: str, outputDir: str):
    # split repo by owner and name
    owner, name = splitRepoName(repoShortName)

    print("Querying number of issues")
    issueCount = gql.countIssuesPerRepository(pat, owner, name)

    print("Querying number of PRs")
    prCount = gql.countPullRequestsPerRepository(pat, owner, name)

    print("Querying number of commits per PR")
    prCommitCount = gql.countCommitsPerPullRequest(pat, owner, name)

    print("Querying issue participants")
    issueParticipants, issueParticipantCount = gql.getIssueParticipants(
        pat, owner, name
    )

    print("Querying PR participants")
    prParticipants, prParticipantCount = gql.getPullRequestParticipants(
        pat, owner, name
    )

    # join lists and clean memory
    participants = issueParticipants.union(prParticipants)
    del issueParticipants
    del prParticipants

    print("Querying number of comments per issue")
    issueCommentCount = gql.getIssueComments(pat, owner, name)

    print("Writing GraphQL analysis results")
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["NumberIssues", issueCount])
        w.writerow(["NumberPRs", prCount])

    with open(os.path.join(outputDir, "numberCommitsPR.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["PR Number", "Commit Count"])
        for key, value in prCommitCount.items():
            w.writerow([key, value])

    with open(
        os.path.join(outputDir, "numberDevelopersIssue.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Issue Number", "Developer Count"])
        for key, value in issueParticipantCount.items():
            w.writerow([key, value])

    with open(os.path.join(outputDir, "numberDevelopersPR.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["PR Number", "Developer Count"])
        for key, value in prParticipantCount.items():
            w.writerow([key, value])

    with open(
        os.path.join(outputDir, "numberCommentsIssue.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Issue Number", "Comment Count"])
        for key, value in issueCommentCount.items():
            w.writerow([key, value])

    # output statistics
    outputStatistics(
        [value for key, value in prCommitCount.items()],
        "CommitsPRCount",
        outputDir,
    )
    outputStatistics(
        [value for key, value in issueParticipantCount.items()],
        "DevelopersIssueCount",
        outputDir,
    )
    outputStatistics(
        [value for key, value in prParticipantCount.items()],
        "DevelopersPRCount",
        outputDir,
    )
    outputStatistics(
        [value for key, value in issueCommentCount.items()],
        "CommentsIssueCount",
        outputDir,
    )

    return participants

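# Usage sketch for graphqlAnalysis (token and repository are placeholders;
# "owner/name" is the short-name format splitRepoName appears to expect):
#
#   pat = os.environ["GITHUB_PAT"]
#   participants = graphqlAnalysis(pat, "owner/name", "./output")
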
def commitBatchAnalysis(
    idx: int, senti: PySentiStr, commits: List[git.Commit], config: Configuration
):
    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150

    # traverse all commits
    print("Analyzing commits")
    startDate = None
    if config.startDate is not None:
        startDate = datetime.strptime(config.startDate, "%Y-%m-%d")
        startDate = startDate.replace(tzinfo=pytz.UTC)

    # sort commits
    commits.sort(key=lambda o: o.committed_datetime, reverse=True)

    commitMessages = []
    commit: Commit
    lastDate = None
    firstDate = None
    realCommitCount = 0
    for commit in Bar("Processing").iter(commits):
        if startDate is not None and startDate > commit.committed_datetime:
            continue
        if lastDate is None:
            lastDate = commit.committed_date
        firstDate = commit.committed_date
        realCommitCount += 1

        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save info
        timezoneInfo["authors"].add(author)

        if commit.message and commit.message.strip():
            commitMessages.append(commit.message)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest commit
        # by default GitPython orders commits from latest to earliest
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time

        # check if commit was between 9 and 5
        if commit.author_tz_offset != 0 and 9 <= time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing commit message sentiment")
    sentimentScores = []
    commitMessageSentimentsPositive = []
    commitMessageSentimentsNegative = []

    if len(commitMessages) > 0:
        sentimentScores = senti.getSentiment(commitMessages)
        commitMessageSentimentsPositive = list(
            result for result in filter(lambda value: value >= 1, sentimentScores)
        )
        commitMessageSentimentsNegative = list(
            result for result in filter(lambda value: value <= -1, sentimentScores)
        )

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for login, author in authorInfoDict.items():

        # check if sponsored
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        diff = sponsoredCommitCount / commitCount
        if diff >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage sponsored authors
    percentageSponsoredAuthors = sponsoredAuthorCount / len([*authorInfoDict])

    # calculate active project days
    firstCommitDate = None
    lastCommitDate = None
    if firstDate is not None:
        firstCommitDate = datetime.fromtimestamp(firstDate)
    if lastDate is not None:
        lastCommitDate = datetime.fromtimestamp(lastDate)
    daysActive = 0
    if lastCommitDate is not None:
        daysActive = (lastCommitDate - firstCommitDate).days

    print("Outputting CSVs")

    # output author days on project
    with open(
        os.path.join(config.metricsPath, f"authorDaysOnProject_{idx}.csv"),
        "a",
        newline="",
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(
        os.path.join(config.metricsPath, f"commitsPerAuthor_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(
        os.path.join(config.metricsPath, f"timezones_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for key, timezone in timezoneInfoDict.items():
            w.writerow([key, len(timezone["authors"]), timezone["commitCount"]])

    # output results
    with open(
        os.path.join(config.resultsPath, f"results_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["CommitCount", realCommitCount])
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", "{:%Y-%m-%d}".format(firstCommitDate)])
        w.writerow(["LastCommitDate", "{:%Y-%m-%d}".format(lastCommitDate)])
        w.writerow(["AuthorCount", len([*authorInfoDict])])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len([*timezoneInfoDict])])

    outputStatistics(
        idx,
        [author["activeDays"] for login, author in authorInfoDict.items()],
        "AuthorActiveDays",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        [author["commitCount"] for login, author in authorInfoDict.items()],
        "AuthorCommitCount",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        [len(timezone["authors"]) for key, timezone in timezoneInfoDict.items()],
        "TimezoneAuthorCount",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        [timezone["commitCount"] for key, timezone in timezoneInfoDict.items()],
        "TimezoneCommitCount",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        sentimentScores,
        "CommitMessageSentiment",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        commitMessageSentimentsPositive,
        "CommitMessageSentimentsPositive",
        config.resultsPath,
    )
    outputStatistics(
        idx,
        commitMessageSentimentsNegative,
        "CommitMessageSentimentsNegative",
        config.resultsPath,
    )

    return authorInfoDict, daysActive

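# Usage sketch for commitBatchAnalysis (jar and data paths are assumptions;
# PySentiStr needs the SentiStrength jar and language folder configured before
# getSentiment can run):
#
#   senti = PySentiStr()
#   senti.setSentiStrengthPath("/path/to/SentiStrength.jar")
#   senti.setSentiStrengthLanguageFolderPath("/path/to/SentiStrength_Data/")
#   authorInfoDict, daysActive = commitBatchAnalysis(0, senti, commits, config)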