def get_and_save_full_stats(self, chain_name: str):
    github_orgs = self._read_orgs_for_chain_from_toml(chain_name)
    stats_counter = Counter()
    hist_data = None
    for org_url in github_orgs:
        if not org_url.startswith("https://github.com/"):
            # TODO: If Gitlab repo then use Gitlab APIs
            print("%s is not a github repo...Skipping" % org_url)
            continue
        org = org_url.split("https://github.com/")[1]
        print("Fetching repo data for", org)
        org_repo_data_list = self._get_repo_data_for_org(org)
        print("Fetching stats (stargazers, forks, releases, churn_4w) for", org_url)
        stats_counter += self._get_stats_for_org_from_repo_data(org_repo_data_list)
        hist_data_for_org = self._get_historical_progress(org_repo_data_list)
        print("Combining hist data ...")
        hist_data = self._combine_hist_data(hist_data, hist_data_for_org)
    if hist_data is None or stats_counter == Counter():
        remove_chain_from_config(chain_name)
        print('No data found for organisation in toml file')
        sys.exit(1)
    path_prefix = self.save_path + '/' + chain_name
    with open(path_prefix + '_stats.json', 'w') as outfile:
        outfile.write(json.dumps(dict(stats_counter)))
    with open(path_prefix + '_history.json', 'w') as outfile:
        outfile.write(json.dumps(dict(hist_data)))

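# Illustrative only: the two files written above hold the aggregated counters and the
# combined history dict for the chain. Assuming an org with a handful of repos,
# <chain_name>_stats.json might look roughly like
#   {"stargazers": 3400, "forks": 512, "releases": 87, "churn_4w": 12045}
# The exact keys come from _get_stats_for_org_from_repo_data; the numbers here are made up.
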
async def get_repos_for_protocol_from_toml(self, protocol):
    pat = await self._get_access_token()
    repos = set()
    toml_file_path = path.join(dir_path, 'protocols', protocol + '.toml')
    if not path.exists(toml_file_path):
        print(".toml file not found for %s in /protocols folder" % protocol)
        sys.exit(1)
    try:
        with open(toml_file_path, 'r') as f:
            data = f.read()
        github_orgs = toml.loads(data)['github_organizations']
        repos_in_toml = toml.loads(data)['repo']
    except Exception:
        print('Could not open toml file - check formatting!!')
        sys.exit(1)
    for org in github_orgs:
        if not org.lower().startswith("https://github.com/"):
            continue
        org_name = org.split('https://github.com/')[1]
        try:
            # Get all repos for the organisation (paginated, 100 per page)
            all_org_repos = []
            page = 1
            url = f"https://api.github.com/orgs/{org_name}/repos?page={page}&per_page=100"
            response = requests.get(url, headers={'Authorization': 'Token ' + pat})
            while len(response.json()) > 0:
                for repo in response.json():
                    all_org_repos.append(repo["full_name"])
                page += 1
                url = f"https://api.github.com/orgs/{org_name}/repos?page={page}&per_page=100"
                response = requests.get(url, headers={'Authorization': 'Token ' + pat})
            # Get forked repos (paginated)
            forked_org_repos = []
            page = 1
            url = f"https://api.github.com/orgs/{org_name}/repos?type=forks&page={page}&per_page=100"
            response = requests.get(url, headers={'Authorization': 'Token ' + pat})
            while len(response.json()) > 0:
                for repo in response.json():
                    forked_org_repos.append(repo["full_name"])
                page += 1
                url = f"https://api.github.com/orgs/{org_name}/repos?type=forks&page={page}&per_page=100"
                response = requests.get(url, headers={'Authorization': 'Token ' + pat})
            # Keep only the repos that are not forks
            unforked_repos = list(set(all_org_repos) - set(forked_org_repos))
            for repo in unforked_repos:
                repos.add(repo.lower())
        except Exception:
            # Core org is not an org but a user, so fall back to the users endpoint
            url = f"https://api.github.com/users/{org_name}/repos"
            response = requests.get(url, headers={'Authorization': 'Token ' + pat})
            for repo in response.json():
                repos.add(repo["full_name"].lower())
    return list(repos)

def _read_orgs_for_chain_from_toml(self, chain_name):
    toml_file_path = path.join(dir_path, 'protocols', chain_name + '.toml')
    if not path.exists(toml_file_path):
        print(".toml file not found for %s in /protocols folder" % chain_name)
        sys.exit(1)
    try:
        with open(toml_file_path, 'r') as f:
            data = f.read()
        print("Fetching organizations for %s from toml file ..." % chain_name)
        github_orgs = toml.loads(data)['github_organizations']
        return github_orgs
    except Exception:
        print('Could not open toml file - check formatting.')
        sys.exit(1)

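# A minimal sketch of the protocol .toml layout these loaders expect, assuming only the
# keys read above ('github_organizations' here, plus 'repo' in
# get_repos_for_protocol_from_toml). The org URL and repo entry are made-up placeholders:
#
#   # protocols/examplechain.toml (hypothetical)
#   github_organizations = ["https://github.com/example-org"]
#   [[repo]]
#   url = "https://github.com/example-org/example-repo"
#
# Only 'github_organizations' is used by _read_orgs_for_chain_from_toml; the 'repo'
# table is read (but not otherwise used) by get_repos_for_protocol_from_toml.
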
def _get_single_repo_data(self, org_then_slash_then_repo: str, year_count: int = 1):
    try:
        out_file_name_with_path = get_single_repo_stats_json_file_path(org_then_slash_then_repo)
        # Reuse cached repo data if it has already been fetched and saved to disk
        if path.exists(out_file_name_with_path):
            with open(out_file_name_with_path, 'r') as single_repo_data_json:
                return json.load(single_repo_data_json)
        repo_data = self._get_single_repo_data_from_api(org_then_slash_then_repo, year_count)
        with open(out_file_name_with_path, 'w') as single_repo_data_json:
            single_repo_data_json.write(json.dumps(dict(repo_data)))
        return repo_data
    except Exception as e:
        print(f"Exception occurred while fetching single repo data: {e}")
        sys.exit(1)

async def get_contr_from_toml(self, toml_file: str, monthly: bool = True, years_count: int = 1):
    toml_file_without_protocols = toml_file.split('protocols/')[1]
    protocol_name = toml_file_without_protocols.split('.toml')[0]
    out_file_name = toml_file_without_protocols.replace('.toml', '_contributors.json')
    out_file_name_with_path = self.save_path + '/' + out_file_name
    # Useful if left running e.g. over a weekend - if it failed, re-run INCLUDING the last repo listed
    progress_file_name = toml_file.replace('.toml', '_repos_seen.txt')
    month_count_plus_one = 12 * years_count + 1
    # Create an empty 2D list of (12 * years_count) empty list elements.
    # Explicitly append rather than use [[]] * 12, as that reuses the same memory ref,
    # so appending to one element would append to all.
    list_2d = []
    for i in range(1, month_count_plus_one):
        list_2d.append([])
    stats = None
    seen_repos = []
    if path.exists(out_file_name_with_path):
        with open(out_file_name_with_path, 'r') as stats_json:
            stats = json.load(stats_json)
        if stats != list_2d:
            if path.exists(progress_file_name):
                with open(progress_file_name, 'r') as progress_file:
                    progress_repos_list = progress_file.readlines()
                for repo_name_with_line_term in progress_repos_list:
                    repo_name = repo_name_with_line_term.split("\n")[0]
                    seen_repos.append(repo_name)
        elif path.exists(progress_file_name):
            remove(progress_file_name)
    if stats:
        core_array = stats
    elif monthly:
        # Explicitly defined, see above
        # TODO: change this length to make it configurable
        core_array = list_2d
    else:
        # yearly
        core_array = []
    with open(out_file_name_with_path, 'w') as outfile:
        json.dump(core_array, outfile)
    repos = await self.get_repos_for_protocol_from_toml(protocol_name)
    unseen_repo = []
    for repo in repos:
        if repo in seen_repos:
            print("Ignoring seen repo: ", repo)
            continue
        unseen_repo.append(repo)
    # Don't thread this - API limit
    for repo in unseen_repo:
        print("Analysing repo: ", repo)
        if monthly:
            contributors = await self.get_monthly_contributors_of_repo_in_last_n_years(repo, n_years=years_count)
        else:
            contributors = await self.get_contributors_of_repo_in_last_n_years(repo, n_years=years_count)
        # Save progress in case of failure
        try:
            with open(out_file_name_with_path) as json_file:
                data = json.load(json_file)
            if monthly:
                # FIXME efficiency; note np.concatenate on axis 1 doesn't play well with our core array
                for index, item in enumerate(data):
                    item.extend(contributors[index])
            else:
                data.extend(contributors)
            with open(progress_file_name, 'a') as progress_file:
                progress_file.write(repo + '\n')
            with open(out_file_name_with_path, 'w') as outfile:
                json.dump(data, outfile)
        except Exception as e:
            print('Failed to collate monthly contributors for all repos in toml file')
            print(e)
            sys.exit(1)
    try:
        with open(out_file_name_with_path) as json_file:
            data = json.load(json_file)
    except Exception as e:
        print(e)
        sys.exit(1)
    if monthly:
        print('Monthly active developers in the past year:')
        for index, month_of_contributors in enumerate(data):
            deduplicated_monthly_contributors = list(set(month_of_contributors))
            data[index] = deduplicated_monthly_contributors
            print('Month ' + str(index + 1) + ': ' + str(len(deduplicated_monthly_contributors)))
        deduplicated_contributors = data
    else:
        deduplicated_contributors = list(set(data))
        print('Total active developers in the past year: ' + str(len(deduplicated_contributors)))
    with open(out_file_name_with_path, 'w') as outfile:
        json.dump(deduplicated_contributors, outfile)
    return deduplicated_contributors

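# Note on the progress file used above: it is plain text with one repo full name per line
# (e.g. "example-org/example-repo" - a placeholder name), appended after each repo is
# collated; on a re-run, any repo already listed there is skipped.
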
async def get_monthly_contributors_of_repo_in_last_n_years(self, org_then_slash_then_repo: str, n_years: int = 1):
    # Commits are not chronological, so need to pull all and filter
    commits = []
    # get personal access token
    pat = await self._get_access_token()
    month_count_plus_one = 12 * n_years + 1
    # Create an empty 2D list of (12 * n_years) empty list elements.
    # Explicitly append rather than use [[]] * 12, as that reuses the same memory ref,
    # so appending to one element would append to all.
    contributors = []
    for i in range(1, month_count_plus_one):
        contributors.append([])
    async with ClientSession() as session:
        initial_request = await get_commits(session, pat, org_then_slash_then_repo, page=1)
        # Repo doesn't exist
        if initial_request["error"] or (isinstance(initial_request["data"], dict) and initial_request["data"].get("message") == 'Not Found'):
            return contributors
        if isinstance(initial_request["data"], list) and len(initial_request["data"]) == 0:
            return contributors
        commits.extend(initial_request["data"])
        rate_limit_remaining = initial_request["rate_limit_remaining"]
        remaining_requests_to_be_made = 0
        if initial_request["total_pages"]:
            remaining_requests_to_be_made = initial_request["total_pages"] - 1
        # starting page
        batch_start = 2
        while remaining_requests_to_be_made > 0:
            if remaining_requests_to_be_made > min(rate_limit_remaining, 200):
                batch_end = batch_start + min(rate_limit_remaining, 200)
            else:
                batch_end = batch_start + remaining_requests_to_be_made
            print("Start", batch_start, "End", batch_end)
            # get data for pages from batch_start to batch_end
            tasks = []
            for page in range(batch_start, batch_end + 1):
                task = ensure_future(get_commits(session, pat, org_then_slash_then_repo, page))
                tasks.append(task)
            responses = await asyncio.gather(*tasks)
            if len(responses) == 0:
                sys.exit(1)
            successful_responses_count = 0
            rate_limit_exceeded = False
            for response in responses:
                if response["error"]:
                    # Get 502 some times
                    if response["error_code"] == 403 or response["error_code"] // 100 == 5:
                        print("Rate limit trigger detected")
                        rate_limit_exceeded = True
                        break
                    # Printing unhandled error and exiting
                    print(response)
                    sys.exit(1)
                if not isinstance(response["data"], list):
                    print(response["error"])
                    sys.exit(1)
                successful_responses_count += 1
                commits.extend(response["data"])
            if rate_limit_exceeded:
                print("Hourly rate limit exceeded for current token")
                pat = await self._get_access_token()
            print("Successful reqs: ", successful_responses_count)
            remaining_requests_to_be_made -= successful_responses_count
            rate_limit_remaining -= successful_responses_count
            batch_start += successful_responses_count
    # If wanting to create a record of every repo's commits, uncomment this
    # with open(org_then_slash_then_repo + '_commits.json', 'w+') as outfile:
    #     json.dump(commits, outfile)
    # Remove older commits
    month_start_dates = [dt.datetime.now()]  # Include final end date for later use
    for month in range(1, month_count_plus_one):
        # Generate (12 * n_years) months of start dates; 12 'months' is 360 days
        month_start_dates.append(month_start_dates[-1] - dt.timedelta(days=30))
    month_start_dates.reverse()
    for item in commits:
        try:
            date_string = item['commit']['author']['date']
            date = dt.datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%SZ')
            # FIXME find a more efficient way to do this
            for index, (start, end) in enumerate(zip(month_start_dates, month_start_dates[1:])):
                if date >= start and date < end and item['author']:  # author can be null (user not logged in)
                    contributors[index].append(item['author']['login'])
        except Exception as e:
            print('Failed to get monthly contributors for ' + org_then_slash_then_repo)
            print(e)
            sys.exit(1)
    # De-duplicate committers
    for index, month_of_contributors in enumerate(contributors):
        deduplicated_contributors = list(set(month_of_contributors))
        contributors[index] = deduplicated_contributors
    return contributors

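# Worked example of the 30-day bucketing above (illustrative, n_years=1): 13 start dates
# are generated walking back from "now" in 30-day steps, then reversed, so
# zip(month_start_dates, month_start_dates[1:]) yields 12 (start, end) windows, with
# index 0 the oldest "month" (days 360..330 ago) and index 11 the most recent
# (days 30..0 ago). A commit dated 45 days ago therefore lands in bucket index 10.
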
async def get_contributors_of_repo_in_last_n_years(self, org_then_slash_then_repo: str, n_years: int = 1):
    # Commits are not chronological, so need to pull all and filter
    commits = []
    # get personal access token
    pat = await self._get_access_token()
    async with ClientSession() as session:
        initial_request = await get_commits(session, pat, org_then_slash_then_repo, page=1)
        # Repo doesn't exist
        if initial_request["error"] or (isinstance(initial_request["data"], dict) and initial_request["data"].get("message") == 'Not Found'):
            return []
        if isinstance(initial_request["data"], list) and len(initial_request["data"]) == 0:
            return []
        commits.extend(initial_request["data"])
        rate_limit_remaining = initial_request["rate_limit_remaining"]
        remaining_requests_to_be_made = 0
        if initial_request["total_pages"]:
            remaining_requests_to_be_made = initial_request["total_pages"] - 1
        # starting page
        batch_start = 2
        while remaining_requests_to_be_made > 0:
            if remaining_requests_to_be_made > min(rate_limit_remaining, 200):
                batch_end = batch_start + min(rate_limit_remaining, 200)
            else:
                batch_end = batch_start + remaining_requests_to_be_made
            print("Start", batch_start, "End", batch_end)
            # get data for pages from batch_start to batch_end
            tasks = []
            for page in range(batch_start, batch_end + 1):
                task = ensure_future(get_commits(session, pat, org_then_slash_then_repo, page))
                tasks.append(task)
            responses = await asyncio.gather(*tasks)
            if len(responses) == 0:
                sys.exit(1)
            successful_responses_count = 0
            rate_limit_exceeded = False
            for response in responses:
                if response["error"]:
                    # Get 502 some times
                    if response["error_code"] == 403 or response["error_code"] // 100 == 5:
                        print("Rate limit trigger detected")
                        rate_limit_exceeded = True
                        break
                    # Printing unhandled error and exiting
                    print(response)
                    sys.exit(1)
                if not isinstance(response["data"], list):
                    print(response["error"])
                    sys.exit(1)
                successful_responses_count += 1
                commits.extend(response["data"])
            if rate_limit_exceeded:
                print("Hourly rate limit exceeded for current token")
                pat = await self._get_access_token()
            print("Successful reqs: ", successful_responses_count)
            remaining_requests_to_be_made -= successful_responses_count
            rate_limit_remaining -= successful_responses_count
            batch_start += successful_responses_count
    days_count = 365 * n_years  # TODO: Adjust for leap years
    # Remove older commits
    year_ago_date = dt.datetime.now() - dt.timedelta(days=days_count)
    contributors = []
    for item in commits:
        try:
            date_string = item['commit']['author']['date']
            date = dt.datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%SZ')
            if date > year_ago_date:
                if item['author']:  # author can be null (user not logged in)
                    contributors.append(item['author']['login'])  # GitHub username
        except Exception as e:
            print(e)
            sys.exit(1)
    # De-duplicate committers
    deduplicated_contributors = list(set(contributors))
    return deduplicated_contributors

# Get last commit from JSON response, and create one list of all active in the past n years, and one list of all contributors ever
# Write to file every n repos + repos viewed to not lose progress

if __name__ == '__main__':
    if not (len(sys.argv) == 2 or len(sys.argv) == 3):
        print('Usage: python3 contr.py [INPUTFILE.TOML] [YEARS_COUNT]')
        sys.exit(1)
    loop = get_event_loop()
    # Default to one year; only override with a valid YEARS_COUNT between 1 and 4
    years_count = 1
    try:
        if len(sys.argv) == 3 and sys.argv[2] and 0 < int(sys.argv[2]) < 5:
            years_count = int(sys.argv[2])
    except ValueError:
        years_count = 1
    try:
        c = Contributors('./output')
        loop.run_until_complete(c.get_contr_from_toml(sys.argv[1], years_count=years_count))
    finally:
        loop.close()
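
# Example invocation (the toml path is a placeholder - any file under protocols/ works):
#   python3 contr.py protocols/examplechain.toml 2
# This collects monthly contributors over the last 2 years for every non-fork repo in the
# orgs listed in that file and writes examplechain_contributors.json under ./output.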