def get_forks_info(origin: Repository, client: Client) -> List[ForkInfo]:
    """Requests forks and wraps them in 'ForkInfo' objects."""
    result = []
    forks = origin.get_forks()
    log.info('got list of forks, total %d', forks.totalCount)
    client.count_rate_limit(1)
    try:
        rate_limits_check(
            client,
            forks.totalCount + pagination_correction(forks.totalCount, 30))
    except RateLimitError:
        # Not enough quota left to compare every fork; bail out early with an
        # empty list rather than None (the annotated return type is a list).
        return result
    for fork in forks:
        try:
            log.info('comparing fork: %s', fork.full_name)
            comparison = origin.compare(origin.owner.login + ":master",
                                        fork.owner.login + ":master")
            fi = ForkInfo(fork.html_url,
                          (abs(datetime.now() - fork.updated_at)).days,
                          fork.stargazers_count,
                          comparison.ahead_by,
                          comparison.behind_by)
            result.append(fi)
        except UnknownObjectException as e:
            log.exception('possibly removed fork or user: %s, %d, message: %s',
                          fork.html_url, e.status, e.data.get('message', ''))
        except GithubException as e:
            message = e.data.get('message', '')
            if e.status == 404 and 'No common ancestor between ' in message:
                # A 404 caused by a missing common ancestor can be handled.
                log.error('404 %s', message)
                handle_github_exception(result, fork)
            else:
                log.exception('github error')
    # One compare request per fork, plus the paginated fork-list fetches.
    client.count_rate_limit(
        forks.totalCount + pagination_correction(forks.totalCount, 30))
    return result
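# `pagination_correction` is referenced above but not defined in this snippet.
# The sketch below is a hypothetical reconstruction of what such a helper
# could compute -- the number of API requests needed to page through
# `total_count` results at `per_page` items per page; the real helper may
# count the already-fetched first page differently.
import math

def pagination_correction(total_count: int, per_page: int) -> int:
    # One request per page of results (assumption, not the original helper).
    return math.ceil(total_count / per_page)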
def __get_fork(fork_username: str, repo: _GithubRepository) -> _GithubRepository:
    forks = list(
        filter(lambda fork: fork.owner.login == fork_username, repo.get_forks())
    )
    if not forks:
        raise GithubAPIException("Requested fork doesn't exist")
    return forks[0]
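# The list+filter above materializes the entire fork list before checking for
# a match. A sketch of an equivalent early-exit lookup using next() -- same
# observable behavior, but iteration (and hence API pagination) stops as soon
# as a matching fork is found:
def __get_fork_lazy(fork_username: str, repo: _GithubRepository) -> _GithubRepository:
    fork = next(
        (f for f in repo.get_forks() if f.owner.login == fork_username), None
    )
    if fork is None:
        raise GithubAPIException("Requested fork doesn't exist")
    return fork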
def get_forks_over_time(repo: Repository.Repository) -> pd.DataFrame:
    # TODO: for ~10k-fork repositories, this operation is too costly to run
    # as part of each analyzer invocation. Move this to the fetcher, and
    # persist the data.
    log.info("fetch fork time series for repo %s", repo)

    reqlimit_before = GHUB.get_rate_limit().core.remaining
    log.info("GH request limit before operation: %s", reqlimit_before)

    forks = []
    for count, fork in enumerate(repo.get_forks(), 1):
        # Collect the `Repository` object representing each fork.
        forks.append(fork)
        if count % 200 == 0:
            log.info("%s forks fetched", count)

    reqlimit_after = GHUB.get_rate_limit().core.remaining
    log.info("GH request limit after operation: %s", reqlimit_after)
    log.info("http requests made (approximately): %s", reqlimit_before - reqlimit_after)
    log.info("current fork count: %s", len(forks))

    # The GitHub API returns ISO 8601 timestamp strings encoding the timezone
    # via the Z suffix, i.e. Zulu time, i.e. UTC. pygithub doesn't parse that
    # timezone: whereas the API returns `created_at` in UTC, the datetime
    # object created by pygithub is a naive one. Correct for that.
    forktimes_aware = [
        pytz.timezone("UTC").localize(f.created_at) for f in forks
    ]

    # Create a sorted pandas DatetimeIndex.
    dtidx = pd.to_datetime(forktimes_aware)
    dtidx = dtidx.sort_values()

    # Each timestamp corresponds to exactly one fork event. Build the
    # cumulative sum over time.
    df = pd.DataFrame(
        data={"fork_events": [1] * len(forks)},
        index=dtidx,
    )
    df.index.name = "time"
    df["forks_cumulative"] = df["fork_events"].cumsum()
    df = df.drop(columns=["fork_events"]).astype(int)
    log.info("forks df: \n%s", df)
    return df
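# A hedged usage sketch for `get_forks_over_time`: resample the cumulative
# series onto a daily grid and log the most recent totals. The repository
# name is a placeholder, and the module-level `GHUB` client is assumed to be
# configured as in the surrounding code.
def log_recent_fork_growth(repo_name: str = "pandas-dev/pandas") -> None:
    repo = GHUB.get_repo(repo_name)  # placeholder repository name
    df = get_forks_over_time(repo)
    # Forward-fill so that days without fork events still carry the running
    # total; the first bin always contains the first fork event, so no
    # leading NaN survives the fill.
    daily = df["forks_cumulative"].resample("1d").max().ffill().astype(int)
    log.info("daily fork totals (tail):\n%s", daily.tail())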