def main(args): """ Main command-line function. """ if set(args).intersection({"-h", "--help"}): lib.text.eprint(f"Usage: {__file__} [cursor CURSOR]") sys.exit(1) path = "queries/repos/starred.gql" variables = lib.process_variables(args) results = [] query_counter = 0 while True: query_counter += 1 print(f"Query #{query_counter}") data = lib.query_by_filename(path, variables) repositories = data["viewer"]["starredRepositories"] if query_counter == 1: print(f"Total count: {repositories['totalCount']}") for node in repositories["nodes"]: results.append(parse_repo(node)) repo_page_info = repositories["pageInfo"] if repo_page_info["hasNextPage"]: variables["cursor"] = repo_page_info["endCursor"] else: break lib.write_csv(lib.STARRED_CSV_PATH, results)
def main():
    data = lib.read_csv('data/labeled-data-singlelabels.csv')
    train_tweets, test_tweets = lib.split_data(data)

    lib.write_csv(train_tweets, 'data/labeled-data-singlelabels-train.csv')
    lib.write_csv(test_tweets, 'data/labeled-data-singlelabels-test.csv')

    train_tweets2, test_tweets2 = lib.read_data()
    assert len(train_tweets) == len(train_tweets2)
    assert len(test_tweets) == len(test_tweets2)
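# lib.split_data is assumed here to be a simple shuffled train/test split.
# A minimal sketch, assuming an 80/20 ratio, a fixed seed and list-of-rows
# input; the real helper in lib may differ.
import random

def split_data(rows, train_ratio=0.8, seed=42):
    """Shuffle rows and split them into train and test lists."""
    rows = list(rows)
    random.Random(seed).shuffle(rows)
    cutoff = int(len(rows) * train_ratio)
    return rows[:cutoff], rows[cutoff:]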
def main(args): """ Main command-line function. """ if len(args) not in (2, 4) or set(args).intersection({"-h", "--help"}): lib.text.eprint(f"Usage: {__file__} login LOGIN") sys.exit(1) path = "queries/repos/repos_recent_commits.gql" variables = lib.process_variables(args) out_data = [] query_counter = 0 while True: query_counter += 1 print(f"Query #{query_counter}") data = lib.query_by_filename(path, variables) repositories = data["repositoryOwner"]["repositories"] if query_counter == 1: print(f"Total count: {repositories['totalCount']}") for repo_node in repositories["nodes"]: name = repo_node["name"] print(f"Name: {name}") branch = repo_node["defaultBranch"] if branch is not None: branch_name = branch["name"] commits = branch["commits"]["history"]["nodes"] for c in commits: parsed_commit_data = parse_commit(c) out_commit = dict( repo_name=name, branch_name=branch_name, **parsed_commit_data, ) out_data.append(out_commit) repo_page_info = repositories["pageInfo"] if repo_page_info["hasNextPage"]: variables["cursor"] = repo_page_info["endCursor"] else: break lib.write_csv(lib.VAR_DIR / "repos_recent_commits.txt", out_data)
def commits_to_csv(owner, repo_name, start_date=None):
    """
    Write a CSV of all commits in a repo.

    Existing file will be overwritten.
    """
    filename = CSV_OUT_NAME.format(
        repo_name=repo_name,
        end_date=datetime.date.today(),
        start_date=start_date if start_date else "INIT",
    )
    path = lib.VAR_DIR / filename

    repo_commits = get_commits(owner, repo_name, start_date)
    lib.write_csv(path, repo_commits, append=False)
def commits_to_csv(owner, repo_names, start_date=None):
    """
    Fetch commits for given repos and append to a CSV after each repo is done.
    """
    filename = CSV_OUT_NAME.format(
        end_date=datetime.date.today(),
        start_date=start_date if start_date else "INIT",
    )
    path = lib.VAR_DIR / filename

    summary_filename = CSV_OUT_NAME_SUMMARY.format(
        end_date=datetime.date.today(),
        start_date=start_date if start_date else "INIT",
    )
    summary_path = lib.VAR_DIR / summary_filename

    print(f"Start: {start_date if start_date else 'first commit'}")
    print()

    repos_summary = []

    for repo_name in repo_names:
        commits = repo_commits.get_commits(owner, repo_name, start_date)
        # TODO: Delete the existing file on the first round. Alternatively,
        # wait until the end and write everything out with overwriting, but
        # that means an incomplete report is not generated if the script
        # aborts, which might be okay.
        lib.write_csv(path, commits, append=True)

        # TODO: Move this to a separate function, possibly using commits
        # returned from the function.
        repo_summary = defaultdict(int)
        repo_summary["name"] = repo_name
        repo_summary["commits"] = len(commits)

        # Note that lines changed or files changed would not be accurate when
        # adding commits, so that is left out.
        for commit in commits:
            repo_summary["additions"] += commit["additions"]
            repo_summary["deletions"] += commit["deletions"]

        repos_summary.append(repo_summary)

    lib.write_csv(summary_path, repos_summary)
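# lib.write_csv is used above with an append flag. A minimal sketch of such a
# helper, assuming rows are dicts and that the header should only be written
# when the file is new; the actual helper in lib may behave differently.
import csv
from pathlib import Path

def write_csv(path, rows, append=False):
    """Write dict rows to a CSV, optionally appending to an existing file."""
    rows = list(rows)
    if not rows:
        return
    path = Path(path)
    mode = "a" if append else "w"
    write_header = not (append and path.exists())

    with open(path, mode, newline="") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=list(rows[0].keys()))
        if write_header:
            writer.writeheader()
        writer.writerows(rows)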
def main() -> None:
    """
    Main command-line function to create a report of GitHub commit activity.

    Fetch and write commits out as a CSV report, where each row includes the
    details of a single commit including stats, metadata and the repo and
    branch labels.

    For the configured repos, get all available branches. Start with master,
    then develop, then the feature branches (leaving them in alphabetical
    order).

    Get the stats across the commits by starting with the HEAD commit and
    getting its parents recursively. Skip commits older than the min date.
    Once commits are fetched (each requiring a GET request), filter to just
    those by the configured users. Filter out commits which have no author
    set.

    We keep track of the commit SHA values seen when iterating through a
    branch (since a merge commit will have two histories which should share a
    common commit which they diverged from). Additionally, we keep track of
    commit SHA values across branches in a repo, so that after we have
    traversed master all the way back to its initial commit (if the date range
    allows), we only have to look at commits not already seen on previously
    traversed branches when going through develop (if it exists) and any
    feature branches.
    """
    if config.MIN_DATE:
        print(f"Commit min date: {config.MIN_DATE}")
    else:
        print("No commit min date set")
    print()

    out_data = []

    for repo in lib.get_repos():
        print(f"REPO: {repo.name}\n")
        seen_commits = set()

        fetched_branches = repo.get_branches()
        branch_list = []
        for branch in fetched_branches:
            if not branch_list:
                branch_list.append(branch)
            elif branch.name == "master":
                branch_list.insert(0, branch)
            elif branch.name in ("develop", "development"):
                # Place develop after master if master is already first.
                dev_insert_index = 1 if branch_list[0].name == "master" else 0
                branch_list.insert(dev_insert_index, branch)
            else:
                branch_list.append(branch)

        for branch in branch_list:
            print(f"BRANCH: {branch.name}")
            print("Fetching commits")
            commits = list(traverse_commits(branch.commit, seen_commits))
            print(f"\nFound: {len(commits)}")

            if config.USERNAMES:
                commits = [
                    x for x in commits
                    if x.author and x.author.login in config.USERNAMES
                ]
                print(f"After filtering: {len(commits)}")

            for commit in commits:
                try:
                    out_row = to_row(repo, branch, commit)
                except Exception as e:
                    # Report error without aborting.
                    print(f"Could not parse Commit."
                          f" {type(e).__name__}: {str(e)}")
                else:
                    out_data.append(out_row)
            print()

    header = (
        "Repo Owner",
        "Repo Name",
        "Branch",
        "Commit SHA",
        "Commit Modified",
        "Commit Author",
        "Changed Files",
        "Added Lines",
        "Deleted Lines",
        "Changed Lines",
    )
    lib.write_csv(config.COMMIT_CSV_PATH, header, out_data)
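# traverse_commits is defined elsewhere in the repo. A sketch of the traversal
# the docstring describes, assuming PyGithub Commit objects and a datetime
# config.MIN_DATE: walk from the branch HEAD through parent commits, skip SHAs
# already seen on this or earlier branches, and skip commits older than the
# min date. This is an illustration, not the repo's actual implementation.
def traverse_commits(head_commit, seen_commits):
    """Yield unseen commits reachable from head_commit."""
    to_visit = [head_commit]

    while to_visit:
        commit = to_visit.pop()
        if commit.sha in seen_commits:
            continue
        seen_commits.add(commit.sha)

        if config.MIN_DATE and commit.commit.author.date < config.MIN_DATE:
            continue

        yield commit
        to_visit.extend(commit.parents)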
def main(): """ Main command-line function to fetch PR data then write a CSV. Set the usernames value in the config so that the report will either filter to specific usernames or show activity for all. If using the BY_OWNER setting, it's best to try retrieve the profile as an org first, since getting an org as a user object only gives as access to public repos. Fallback to getting a user object if it wasn't actually an org. Use the MIN_DATE value in the config to exclude PRs which were last updated before the cutoff date. The API's default setting is to return PRs ordered by most recently created first. https://developer.github.com/v3/pulls/#list-pull-requests Therefore if we encounter an old PR then skip remaining PRs and go the next repo. """ if config.MIN_DATE: print(f"PR updates min date: {config.MIN_DATE}") else: print("No PR updates min date set") print() out_data = [] for repo in lib.get_repos(): print(f"REPO: {repo.name}") for pr in repo.get_pulls(state=config.PR_STATE): if config.MIN_DATE and pr.updated_at < config.MIN_DATE: print( f"Skipping PRs which were updated before the" f" configured min cuttoff date: {config.MIN_DATE}" ) break author = pr.user if not config.USERNAMES or author.login in config.USERNAMES: print(f"PR #{pr.number} - author: @{author.login}") try: out_row = to_row(repo, author, pr) except Exception: # Keep the report generation robust by logging and skipping # over any errors. Create a bug issue in the aggre-git repo # on GitHub so that the error will be addressed. print("Could not fetch or parse the PR.") traceback.print_exc() print("---") else: out_data.append(out_row) else: print(f"PR #{pr.number} - skipping") header = ( "Repo Owner", "Repo Name", "Repo URL", "PR ID", "PR Title", "PR From Branch", "PR To Branch", "Author", "PR URL", "Jira Ticket", "Status", "Merged/Closed WOY", "Merged/Closed Date", "PR Updated At", "PR Created At", "Latest Commit At", "Oldest Commit At", "Days Between Commits", "Latest Commit Author", "Oldest Commit Author", "Commits", "Changed Files", "Added Lines", "Deleted Lines", "Changed Lines", "Comments", "Merged By", "Reviewers", ) + Review.get_states() lib.write_csv(config.PR_CSV_PATH, header, out_data)
def main():
    filename = sys.argv[1]

    with open(filename, 'rb') as pdffile:
        result = lib.parse_pdf(pdffile)

    lib.write_csv(filename.replace(".pdf", ".csv"), result)
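# lib.parse_pdf is not shown here. A rough sketch of one way it could work,
# using the third-party pdfplumber package to pull table rows out of each
# page of the open PDF; this is an assumption about the approach, not the
# repo's actual parser.
import pdfplumber

def parse_pdf(pdffile):
    """Extract table rows from every page of an open PDF file object."""
    rows = []
    with pdfplumber.open(pdffile) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                rows.extend(table)
    return rows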