# NOTE(review): this `def` sits at module level yet takes `self` and is a
# byte-for-byte duplicate of DevOracle.__init__ below — presumably a stray
# paste/merge artifact; confirm it is dead code and remove it.
def __init__(self, save_path: str, frequency):
    self.save_path = save_path
    self.gh_pat_helper = GithubPersonalAccessTokenHelper(get_pats())
    PAT = self._get_access_token()
    self.gh = Github(PAT)
    # churn, commit frequency
    self.frequency = frequency
def __init__(self, save_path: str): self.save_path = save_path # TODO: fix this to be an array self.gh_pat_helper = GithubPersonalAccessTokenHelper(get_pats())
class DevOracle:
    """Collects GitHub development activity for a protocol ("chain").

    For every GitHub organisation listed in ``protocols/<chain>.toml`` it
    gathers stars, forks, releases, and churn/commit counts, then writes the
    aggregates to ``<save_path>/<chain>_stats.json`` and
    ``<save_path>/<chain>_history.json``.
    """

    def __init__(self, save_path: str, frequency):
        self.save_path = save_path
        self.gh_pat_helper = GithubPersonalAccessTokenHelper(get_pats())
        PAT = self._get_access_token()
        self.gh = Github(PAT)
        # Number of most-recent weeks over which churn / commit counts are
        # summed ("churn, commit frequency").
        self.frequency = frequency

    def _get_access_token(self):
        """Return a GitHub PAT that still has usable rate limit.

        When every token is exhausted, sleeps for the helper-suggested
        interval and retries recursively.
        """
        res = self.gh_pat_helper.get_access_token()
        if "token" in res and res["token"] is not None:
            return res["token"]
        print('Going to sleep since no token exists with usable rate limit')
        time.sleep(res["sleep_time_secs"])
        return self._get_access_token()

    def get_and_save_full_stats(self, chain_name: str):
        """Aggregate stats for all orgs of ``chain_name`` and save them as JSON.

        Exits the process (status 1) when the toml file yields no usable data.
        """
        github_orgs = self._read_orgs_for_chain_from_toml(chain_name)
        stats_counter = Counter()
        hist_data = None
        for org_url in github_orgs:
            if not org_url.startswith("https://github.com/"):
                # TODO: If Gitlab repo then use Gitlab APIs
                print("%s is not a github repo...Skipping" % org_url)
                continue
            org = org_url.split("https://github.com/")[1]
            print("Fetching repo data for", org)
            org_repo_data_list = self._get_repo_data_for_org(org)
            print("Fetching stats(stargazers, forks, releases, churn_4w) for", org_url)
            stats_counter += self._get_stats_for_org_from_repo_data(
                org_repo_data_list)
            hist_data_for_org = self._get_historical_progress(
                org_repo_data_list)
            print("Combining hist data ...")
            hist_data = self._combine_hist_data(hist_data, hist_data_for_org)
        # FIX: identity comparison with None (`is None`), not `== None`.
        if hist_data is None or stats_counter == {}:
            remove_chain_from_config(chain_name)
            print('No data found for organisation in toml file')
            sys.exit(1)
        path_prefix = self.save_path + '/' + chain_name
        with open(path_prefix + '_stats.json', 'w') as outfile:
            outfile.write(json.dumps(dict(stats_counter)))
        with open(path_prefix + '_history.json', 'w') as outfile:
            outfile.write(json.dumps(dict(hist_data)))

    # list all the repos of a github org/user
    # Ensure chain_name is same as name of toml file
    def _read_orgs_for_chain_from_toml(self, chain_name):
        """Return the `github_organizations` list from protocols/<chain_name>.toml.

        Exits the process when the file is missing or cannot be parsed.
        """
        toml_file_path = path.join(dir_path, 'protocols', chain_name + '.toml')
        if not path.exists(toml_file_path):
            print(".toml file not found for %s in /protocols folder" % chain_name)
            sys.exit(1)
        try:
            with open(toml_file_path, 'r') as f:
                data = f.read()
            print("Fetching organizations for %s from toml file ..." % chain_name)
            github_orgs = toml.loads(data)['github_organizations']
            return github_orgs
        # FIX: narrowed bare `except:` so SystemExit/KeyboardInterrupt propagate.
        except Exception:
            print('Could not open toml file - check formatting.')
            sys.exit(1)

    # get the data for all the repos of a github organization
    def _get_repo_data_for_org(self, org_name: str):
        """Fetch per-repo data for every non-forked repo of ``org_name``."""
        org_repos = self._make_org_repo_list(org_name)
        forked_repos = []
        page = 1
        url = f"https://api.github.com/orgs/{org_name}/repos?type=forks&page={page}&per_page=100"
        PAT = self._get_access_token()
        response = requests.get(url, headers={'Authorization': 'Token ' + PAT})
        # Page through the forks listing until an empty page is returned.
        while len(response.json()) > 0:
            for repo in response.json():
                forked_repos.append(repo["full_name"])
            page += 1
            url = f"https://api.github.com/orgs/{org_name}/repos?type=forks&page={page}&per_page=100"
            response = requests.get(url, headers={'Authorization': 'Token ' + PAT})
        unforked_repos = list(set(org_repos) - set(forked_repos))
        # GitHub API can hit spam limit, so cap parallelism at 2 workers.
        number_of_hyperthreads = multiprocessing.cpu_count()
        n_jobs = 2 if number_of_hyperthreads > 2 else number_of_hyperthreads
        repo_data_list = Parallel(n_jobs=n_jobs)(
            delayed(self._get_single_repo_data)(repo) for repo in unforked_repos)
        return repo_data_list

    # given the org_name, return list of organisation repos
    def _make_org_repo_list(self, org_name: str):
        """Return ``org_name/repo`` strings for every repo of the org or user."""
        org_repos = []
        try:
            entity = self.gh.get_organization(org_name)
        # FIX: narrowed bare `except:`; falls back to treating the name as a user.
        except Exception:
            entity = self.gh.get_user(org_name)
        for repo in entity.get_repos():
            org_repos.append(repo.name)
        org_repos = [org_name + '/{0}'.format(repo) for repo in org_repos]
        return org_repos

    # get repo data using a repo URL in the form of `org/repo`
    def _get_single_repo_data(self, org_then_slash_then_repo: str):
        """Fetch stats objects for one repo; returns {} when unavailable.

        On a 403 (rate limit) the token is rotated and the call retried.
        """
        print('Fetching repo data for ', org_then_slash_then_repo)
        try:
            repo = self.gh.get_repo(org_then_slash_then_repo)
            weekly_add_del = repo.get_stats_code_frequency()
            weekly_commits = repo.get_stats_participation().all
            # TODO: Remove contributor specific code
            contributors = repo.get_stats_contributors()
            releases = repo.get_releases()
            return {
                "name": org_then_slash_then_repo,
                "repo": repo,
                "weekly_add_del": weekly_add_del,
                "weekly_commits": weekly_commits,
                "contributors": contributors,
                "releases": releases
            }
        except Exception as e:
            # FIX: not every Exception carries `.status` — a plain exception
            # here used to raise AttributeError inside the handler.
            if getattr(e, "status", None) == 403:
                print("Token rate limit reached, switching tokens")
                PAT = self._get_access_token()
                self.gh = Github(PAT)
                return self._get_single_repo_data(org_then_slash_then_repo)
            print('Could not find data for ' + org_then_slash_then_repo)
            return {}

    # given a list of repo_data of org, analyze for churn_4w, commits_4w, stars, releases
    def _get_stats_for_org_from_repo_data(self, org_repo_data_list):
        """Sum per-repo stats into one dict; 'contributors' is the org max."""
        number_of_hyperthreads = multiprocessing.cpu_count()
        n_jobs = 2 if number_of_hyperthreads > 2 else number_of_hyperthreads
        repo_stats_list = Parallel(n_jobs=n_jobs)(
            delayed(self._analyse_repo_data_for_churn_and_commits_4w)(
                repo_data) for repo_data in org_repo_data_list)
        stats_counter = Counter()
        for repo_stats in repo_stats_list:
            stats_counter += Counter(repo_stats)
        sc_dict = dict(stats_counter)
        max_contributors = 0
        # FIX: this default was assigned twice; the second copy is removed.
        sc_dict['num_releases'] = 0 if 'num_releases' not in sc_dict else sc_dict['num_releases']
        # TODO: remove contributor specific data
        # FIXME find an efficient way to count distinct devs. This is a good lower bound number.
        for dictionary in repo_stats_list:
            try:
                this_contributors = dictionary['contributors']
            # FIX: narrowed bare `except:` to the lookup that can actually fail.
            except KeyError:
                this_contributors = 0
            max_contributors = this_contributors if this_contributors > max_contributors else max_contributors
        # GitHub API only returns up to 100 contributors FIXME FIX THIS
        sc_dict['contributors'] = max_contributors
        return sc_dict

    # analyse churn, commits from a git repo data for 'self.frequency' number of weeks
    # TODO: change 4w to make it more generic
    # analyses for latest 4w currently
    def _analyse_repo_data_for_churn_and_commits_4w(self, repo_data: dict):
        """Compute churn/commits over the latest `self.frequency` weeks plus
        repo-level counts (contributors, stars, forks, releases)."""
        repo = repo_data["repo"]
        weekly_add_del = repo_data["weekly_add_del"]
        weekly_commits = repo_data["weekly_commits"]
        # TODO: remove contributor specific data
        contributors = repo_data["contributors"]
        releases = repo_data["releases"]
        churn_4w = 0
        commits_4w = 0
        if weekly_add_del and weekly_commits:
            for i in range(1, self.frequency + 1):
                try:
                    # weekly_add_del row: [<Week UNIX ts>, <additions>, <deletions (negative)>]
                    # Deletions is negative, so churn = additions - deletions.
                    churn_4w += (weekly_add_del[-i]._rawData[1]
                                 - weekly_add_del[-i]._rawData[2])
                    commits_4w += weekly_commits[-i]
                # FIX: narrowed bare `except:`; stop once the history runs out.
                except IndexError:
                    break
        # TODO: remove contributor specific data
        num_contributors = len(contributors) if contributors else 0
        stats = {
            'churn_4w': churn_4w,
            'commits_4w': commits_4w,
            'contributors': num_contributors,
            'stars': repo.stargazers_count,
            'forks': repo.forks_count,
            'num_releases': releases.totalCount
        }
        return stats

    # given a list of repo_data for org, analyze for
    # weekly_commits and weekly_churn for all weeks till now;
    # Weekly commit, churn serve as indicators for historical progress
    def _get_historical_progress(self, org_repo_data_list: list):
        """Element-wise sum of weekly churn/commits across all org repos,
        oldest week first, plus a descending ``weeks_ago`` index."""
        # GitHub API can hit spam limit
        number_of_hyperthreads = multiprocessing.cpu_count()
        n_jobs = 2 if number_of_hyperthreads > 2 else number_of_hyperthreads
        repo_count_list = Parallel(n_jobs=n_jobs)(
            delayed(self._get_weekly_churn_and_commits_of_repo)(repo_data)
            for repo_data in org_repo_data_list)
        churns = []
        commits = []
        for repo in repo_count_list:
            this_churn = repo['weekly_churn']
            this_commits = repo['weekly_commits']
            # Reverse churn and commits array to show latest week data first
            churns.append(this_churn[::-1])
            commits.append(this_commits[::-1])
        # Element wise addition of list of lists (repos have unequal history
        # lengths, hence zip_longest with fillvalue=0).
        # Re-reverse churn and commits array to show oldest week data first
        churns = [sum(x) for x in zip_longest(*churns, fillvalue=0)][::-1]
        commits = [sum(x) for x in zip_longest(*commits, fillvalue=0)][::-1]
        # churns = churns[-52:]
        # TODO: figure out why this assert is failing
        # assert len(churns) == len(commits)
        # Reversed weeks_ago based on the length of churn/commit weeks
        weeks_ago = list(range(len(churns)))[::-1]
        sc_dict = {
            'weekly_churn': churns,
            'weekly_commits': commits,
            'weeks_ago': weeks_ago
        }
        return sc_dict

    def _get_weekly_churn_and_commits_of_repo(self, repo_data: dict):
        """Return weekly churn (additions - deletions) and weekly commits for
        one repo; falls back to an empty churn list on error."""
        org_then_slash_then_repo = repo_data["name"]
        weekly_commits = repo_data["weekly_commits"]
        weekly_add_del = repo_data["weekly_add_del"]
        try:
            # For front-end app use, combining this github API call with that
            # for single_repo_stats would be beneficial
            weekly_churn = []
            if weekly_add_del:
                for i in range(len(weekly_add_del)):
                    # Deletions is negative
                    weekly_churn.append(weekly_add_del[i]._rawData[1]
                                        - weekly_add_del[i]._rawData[2])
            stats = {
                'weekly_churn': weekly_churn,
                'weekly_commits': weekly_commits,
                'repo': org_then_slash_then_repo
            }
            return stats
        except Exception as e:
            print(e)
            stats = {
                'weekly_churn': [],
                'weekly_commits': weekly_commits,
                'repo': org_then_slash_then_repo
            }
            return stats

    # Do element wise addition for `weekly_churn`, `weekly_commits`, `weeks_ago` lists
    # to get the cumulative historical data for a given chain
    def _combine_hist_data(self, cumulative_hist_data, hist_data_for_org):
        """Merge one org's history into the running cumulative history.

        Lists are reversed so the latest week aligns, added element-wise,
        then reversed back to oldest-first.
        """
        if cumulative_hist_data is None:
            cumulative_hist_data = hist_data_for_org
        else:
            cumulative_hist_data["weekly_churn"] = \
                element_wise_addition_lists(
                    cumulative_hist_data["weekly_churn"][::-1],
                    hist_data_for_org["weekly_churn"][::-1]
                )[::-1]
            cumulative_hist_data["weekly_commits"] = \
                element_wise_addition_lists(
                    cumulative_hist_data["weekly_commits"][::-1],
                    hist_data_for_org["weekly_commits"][::-1]
                )[::-1]
            # FIXME(review): summing `weeks_ago` indices doubles their values
            # where both orgs overlap — presumably the intent was to keep the
            # longer index range; confirm against downstream consumers.
            cumulative_hist_data["weeks_ago"] = \
                element_wise_addition_lists(
                    cumulative_hist_data["weeks_ago"][::-1],
                    hist_data_for_org["weeks_ago"][::-1]
                )[::-1]
        return cumulative_hist_data
class Contributors:
    """Collects distinct GitHub contributors (total or per-month) for every
    repo of a protocol listed in ``protocols/<protocol>.toml``, persisting
    progress so long runs can be resumed.
    """

    def __init__(self, save_path: str):
        self.save_path = save_path
        # TODO: fix this to be an array
        self.gh_pat_helper = GithubPersonalAccessTokenHelper(get_pats())

    # list all the repos of a protocol from toml
    # Includes all the core github org/user repos and the repo urls listed in toml
    # Ensure protocol is same as name of toml file
    async def get_repos_for_protocol_from_toml(self, protocol):
        """Return lowercase ``org/repo`` names for all non-forked repos of the
        protocol's GitHub organisations (falls back to user repos).

        Exits the process when the toml file is missing or unparseable.
        """
        pat = await self._get_access_token()
        repos = set()
        toml_file_path = path.join(dir_path, 'protocols', protocol + '.toml')
        if not path.exists(toml_file_path):
            # FIX: `chain_name` was undefined here (NameError); use `protocol`.
            print(".toml file not found for %s in /protocols folder" % protocol)
            sys.exit(1)
        try:
            with open(toml_file_path, 'r') as f:
                data = f.read()
            github_orgs = toml.loads(data)['github_organizations']
            # NOTE(review): `repos_in_toml` is loaded but never merged into the
            # result, despite the docstring-comment above — confirm intent.
            repos_in_toml = toml.loads(data)['repo']
        # FIX: narrowed bare `except:` so SystemExit/KeyboardInterrupt propagate.
        except Exception:
            print('Could not open toml file - check formatting!!')
            sys.exit(1)
        for org in github_orgs:
            if not org.lower().startswith("https://github.com/"):
                continue
            org_name = org.split('https://github.com/')[1]
            try:
                # Get all repos
                all_org_repos = []
                page = 1
                url = f"https://api.github.com/orgs/{org_name}/repos?page={page}&per_page=100"
                response = requests.get(url, headers={'Authorization': 'Token ' + pat})
                while len(response.json()) > 0:
                    for repo in response.json():
                        all_org_repos.append(repo["full_name"])
                    page += 1
                    url = f"https://api.github.com/orgs/{org_name}/repos?page={page}&per_page=100"
                    response = requests.get(url, headers={'Authorization': 'Token ' + pat})
                # Get forked repos
                forked_org_repos = []
                page = 1
                url = f"https://api.github.com/orgs/{org_name}/repos?type=forks&page={page}&per_page=100"
                response = requests.get(url, headers={'Authorization': 'Token ' + pat})
                while len(response.json()) > 0:
                    for repo in response.json():
                        forked_org_repos.append(repo["full_name"])
                    page += 1
                    url = f"https://api.github.com/orgs/{org_name}/repos?type=forks&page={page}&per_page=100"
                    response = requests.get(url, headers={'Authorization': 'Token ' + pat})
                # Find difference
                unforked_repos = list(set(all_org_repos) - set(forked_org_repos))
                for repo in unforked_repos:
                    repos.add(repo.lower())
            # FIX: narrowed bare `except:` (keeps the org-vs-user fallback).
            except Exception:
                # Core org is not org but a user
                # Get repos of user
                url = f"https://api.github.com/users/{org_name}/repos"
                response = requests.get(url, headers={'Authorization': 'Token ' + pat})
                for repo in response.json():
                    repos.add(repo["full_name"].lower())
        return list(repos)

    async def _get_access_token(self):
        """Return a GitHub PAT with usable rate limit, sleeping (async) and
        retrying when none is currently available."""
        res = self.gh_pat_helper.get_access_token()
        if "token" in res and res["token"] is not None:
            return res["token"]
        print('Going to sleep since no token exists with usable rate limit')
        await asyncio.sleep(res["sleep_time_secs"])
        return await self._get_access_token()

    async def get_contributors_of_repo_in_last_n_years(self, org_then_slash_then_repo: str, n_years: int = 1):
        """Return distinct contributor usernames that committed to the repo
        within the last ``n_years`` (365-day years)."""
        # Commits are not chronological, so need to pull all and filter
        commits = []
        # get personal access token
        pat = await self._get_access_token()
        async with ClientSession() as session:
            initial_request = await get_commits(session, pat, org_then_slash_then_repo, page=1)
            # Repo doesn't exist
            # FIX: `data` is a dict here, so use .get("message") — attribute
            # access (`.message`) raised AttributeError.
            if initial_request["error"] or (isinstance(initial_request["data"], dict) and initial_request["data"].get("message") == 'Not Found'):
                return []
            if isinstance(initial_request["data"], list) and len(initial_request["data"]) == 0:
                return []
            commits.extend(initial_request["data"])
            rate_limit_remaining = initial_request["rate_limit_remaining"]
            remaining_requests_to_be_made = 0
            if initial_request["total_pages"]:
                remaining_requests_to_be_made = initial_request["total_pages"] - 1
            # starting page
            batch_start = 2
            while remaining_requests_to_be_made > 0:
                # Batch at most min(rate_limit_remaining, 200) pages at once.
                if remaining_requests_to_be_made > min(rate_limit_remaining, 200):
                    batch_end = batch_start + min(rate_limit_remaining, 200)
                else:
                    batch_end = batch_start + remaining_requests_to_be_made
                print("Start", batch_start, "End", batch_end)
                # get data for page from batch_start to batch_end
                tasks = []
                for page in range(batch_start, batch_end + 1):
                    task = ensure_future(
                        get_commits(session, pat, org_then_slash_then_repo, page)
                    )
                    tasks.append(task)
                responses = await asyncio.gather(*tasks)
                if len(responses) == 0:
                    sys.exit(1)
                successful_responses_count = 0
                rate_limit_exceeded = False
                for response in responses:
                    if response["error"]:
                        # Get 502 some times
                        if response["error_code"] == 403 or response["error_code"] // 100 == 5:
                            print("Rate limit trigger detected")
                            rate_limit_exceeded = True
                            break
                        # Printing unhandled error and exiting
                        print(response)
                        sys.exit(1)
                    if not isinstance(response["data"], list):
                        print(response["error"])
                        sys.exit(1)
                    successful_responses_count += 1
                    commits.extend(response["data"])
                if rate_limit_exceeded:
                    print("Hourly rate limit exceeded for current token")
                    pat = await self._get_access_token()
                print("Successful reqs: ", successful_responses_count)
                remaining_requests_to_be_made -= successful_responses_count
                rate_limit_remaining -= successful_responses_count
                batch_start += successful_responses_count
        days_count = 365 * n_years
        # TODO: Adjust for leap years
        # Remove older commits
        year_ago_date = dt.datetime.now() - dt.timedelta(days=days_count)
        contributors = []
        for item in commits:
            try:
                date_string = item['commit']['author']['date']
                date = dt.datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%SZ')
                if date > year_ago_date:
                    if item['author']:
                        # Can be null (user not logged in)
                        contributors.append(item['author']['login'])  # GitHub username
            except Exception as e:
                print(e)
                sys.exit(1)
        # De-duplicate commiters
        deduplicated_contributors = list(set(contributors))
        return deduplicated_contributors

    async def get_monthly_contributors_of_repo_in_last_n_years(self, org_then_slash_then_repo: str, n_years: int = 1):
        """Return a list of 12*n_years lists, each holding the distinct
        contributor usernames for one 30-day "month", oldest month first."""
        # Commits are not chronological, so need to pull all and filter
        commits = []
        # get personal access token
        pat = await self._get_access_token()
        month_count_plus_one = 12 * n_years + 1
        # create empty 2D list of (12 * n_years) empty list elements;
        # explicitly append rather than []*12 as that shares one memory ref,
        # so appending to one element would append to all
        contributors = []
        for i in range(1, month_count_plus_one):
            contributors.append([])
        async with ClientSession() as session:
            initial_request = await get_commits(session, pat, org_then_slash_then_repo, page=1)
            # Repo doesn't exist
            # FIX: `data` is a dict here, so use .get("message") — attribute
            # access (`.message`) raised AttributeError.
            if initial_request["error"] or (isinstance(initial_request["data"], dict) and initial_request["data"].get("message") == 'Not Found'):
                return contributors
            if isinstance(initial_request["data"], list) and len(initial_request["data"]) == 0:
                return contributors
            commits.extend(initial_request["data"])
            rate_limit_remaining = initial_request["rate_limit_remaining"]
            remaining_requests_to_be_made = 0
            if initial_request["total_pages"]:
                remaining_requests_to_be_made = initial_request["total_pages"] - 1
            # starting page
            batch_start = 2
            while remaining_requests_to_be_made > 0:
                if remaining_requests_to_be_made > min(rate_limit_remaining, 200):
                    batch_end = batch_start + min(rate_limit_remaining, 200)
                else:
                    batch_end = batch_start + remaining_requests_to_be_made
                print("Start", batch_start, "End", batch_end)
                # get data for page from batch_start to batch_end
                tasks = []
                for page in range(batch_start, batch_end + 1):
                    task = ensure_future(
                        get_commits(session, pat, org_then_slash_then_repo, page)
                    )
                    tasks.append(task)
                responses = await asyncio.gather(*tasks)
                if len(responses) == 0:
                    sys.exit(1)
                successful_responses_count = 0
                rate_limit_exceeded = False
                for response in responses:
                    if response["error"]:
                        # Get 502 some times
                        if response["error_code"] == 403 or response["error_code"] // 100 == 5:
                            print("Rate limit trigger detected")
                            rate_limit_exceeded = True
                            break
                        # Printing unhandled error and exiting
                        print(response)
                        sys.exit(1)
                    if not isinstance(response["data"], list):
                        print(response["error"])
                        sys.exit(1)
                    successful_responses_count += 1
                    commits.extend(response["data"])
                if rate_limit_exceeded:
                    print("Hourly rate limit exceeded for current token")
                    pat = await self._get_access_token()
                print("Successful reqs: ", successful_responses_count)
                remaining_requests_to_be_made -= successful_responses_count
                rate_limit_remaining -= successful_responses_count
                batch_start += successful_responses_count
        # If wanting to create a record of every repo's commits, uncomment this
        # with open(org_then_slash_then_repo + '_commits.json', 'w+') as outfile:
        #     json.dump(commits, outfile)
        # Remove older commits
        month_start_dates = [dt.datetime.now()]  # Include final end date for later use
        for month in range(1, month_count_plus_one):
            # Generate (12 * n_years) months of start dates
            month_start_dates.append(month_start_dates[-1] - dt.timedelta(days=30))  # 12 'months' is 360 days
        month_start_dates.reverse()
        for item in commits:
            try:
                date_string = item['commit']['author']['date']
                date = dt.datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%SZ')
                # FIXME find a more efficient way to do this
                for index, (start, end) in enumerate(zip(month_start_dates, month_start_dates[1:])):
                    if date >= start and date < end and item['author']:
                        # Can be null (user not logged in)
                        contributors[index].append(item['author']['login'])
            except Exception as e:
                print('Failed to get monthly contributors for ' + org_then_slash_then_repo)
                print(e)
                sys.exit(1)
        # De-duplicate commiters
        for index, month_of_contributors in enumerate(contributors):
            deduplicated_contributors = list(set(month_of_contributors))
            contributors[index] = deduplicated_contributors
        return contributors

    async def get_contr_from_toml(self, toml_file: str, monthly: bool = True, years_count: int = 1):
        """Collect (monthly or total) contributors for every repo of the
        protocol named by ``toml_file``, saving incremental progress so a
        failed run can be resumed, and write the deduplicated result to
        ``<save_path>/<protocol>_contributors.json``."""
        toml_file_without_protocols = toml_file.split('protocols/')[1]
        protocol_name = toml_file_without_protocols.split('.toml')[0]
        out_file_name = toml_file_without_protocols.replace('.toml', '_contributors.json')
        out_file_name_with_path = self.save_path + '/' + out_file_name
        # Useful if left running e.g. over weekend - if failed, re-run INCLUDING last repo listed
        progress_file_name = toml_file.replace('.toml', '_repos_seen.txt')
        month_count_plus_one = 12 * years_count + 1
        # create empty 2D list of (12 * years_count) empty list elements;
        # explicitly append rather than []*12 as that shares one memory ref,
        # so appending to one element would append to all
        list_2d = []
        for i in range(1, month_count_plus_one):
            list_2d.append([])
        stats = None
        seen_repos = []
        if path.exists(out_file_name_with_path):
            with open(out_file_name_with_path, 'r') as stats_json:
                stats = json.load(stats_json)
            if not stats == list_2d:
                # A previous partial run exists — load which repos were done.
                if path.exists(progress_file_name):
                    # FIX: file handle was never closed; use a context manager.
                    with open(progress_file_name, 'r') as progress_file:
                        progress_repos_list = progress_file.readlines()
                    for repo_name_with_line_term in progress_repos_list:
                        repo_name = repo_name_with_line_term.split("\n")[0]
                        seen_repos.append(repo_name)
            elif path.exists(progress_file_name):
                remove(progress_file_name)
        if stats:
            core_array = stats
        elif monthly:
            # Explicity def, see above
            # TODO: change this length to make it configurable
            core_array = list_2d
        else:
            # yearly
            core_array = []
        with open(out_file_name_with_path, 'w') as outfile:
            json.dump(core_array, outfile)
        repos = await self.get_repos_for_protocol_from_toml(protocol_name)
        unseen_repo = []
        for repo in repos:
            if repo in seen_repos:
                print("Ignoring seen repo: ", repo)
                continue
            unseen_repo.append(repo)
        # Don't thread this - API limit
        for repo in unseen_repo:
            print("Analysing repo: ", repo)
            if monthly:
                contributors = await self.get_monthly_contributors_of_repo_in_last_n_years(repo, n_years=years_count)
            else:
                contributors = await self.get_contributors_of_repo_in_last_n_years(repo, n_years=years_count)
            # Save progress in case of failure
            try:
                with open(out_file_name_with_path) as json_file:
                    data = json.load(json_file)
                if monthly:
                    # FIXME efficiency, note np.concatenate on axis 1 doesn't play well with our core array
                    for index, item in enumerate(data):
                        item.extend(contributors[index])
                else:
                    data.extend(contributors)
                with open(progress_file_name, 'a') as progress_file:
                    progress_file.write(repo + '\n')
                with open(out_file_name_with_path, 'w') as outfile:
                    json.dump(data, outfile)
            except Exception as e:
                print('Failed to collate monthly contributors for all repos in toml file')
                print(e)
                sys.exit(1)
        try:
            with open(out_file_name_with_path) as json_file:
                data = json.load(json_file)
        except Exception as e:
            print(e)
            sys.exit(1)
        if monthly:
            print('Monthly active developers in the past year:')
            for index, month_of_contributors in enumerate(data):
                deduplicated_monthly_contributors = list(set(month_of_contributors))
                data[index] = deduplicated_monthly_contributors
                print('Month ' + str(index + 1) + ': ' + str(len(deduplicated_monthly_contributors)))
            deduplicated_contributors = data
        else:
            deduplicated_contributors = list(set(data))
            print('Total active developers in the past year: ' + str(len(deduplicated_contributors)))
        with open(out_file_name_with_path, 'w') as outfile:
            json.dump(deduplicated_contributors, outfile)
        return deduplicated_contributors