Example #1
def main(args):
    """
    Main command-line function.
    """
    if set(args).intersection({"-h", "--help"}):
        lib.text.eprint(f"Usage: {__file__} [cursor CURSOR]")
        sys.exit(1)

    path = "queries/repos/starred.gql"
    variables = lib.process_variables(args)

    results = []

    query_counter = 0
    while True:
        query_counter += 1
        print(f"Query #{query_counter}")

        data = lib.query_by_filename(path, variables)
        repositories = data["viewer"]["starredRepositories"]

        if query_counter == 1:
            print(f"Total count: {repositories['totalCount']}")
        for node in repositories["nodes"]:
            results.append(parse_repo(node))

        repo_page_info = repositories["pageInfo"]
        if repo_page_info["hasNextPage"]:
            variables["cursor"] = repo_page_info["endCursor"]
        else:
            break

    lib.write_csv(lib.STARRED_CSV_PATH, results)
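The parse_repo helper referenced above is not shown. A minimal sketch of what it might look like, assuming the starred.gql query selects common repository fields such as nameWithOwner, description and stargazerCount (the actual field list depends on the query file):

def parse_repo(node):
    """
    Flatten one repository node from the GraphQL response into a CSV row.

    Hypothetical sketch; the selected fields are assumptions and should be
    adjusted to match the fields requested in starred.gql.
    """
    return dict(
        name=node.get("nameWithOwner"),
        description=node.get("description"),
        stars=node.get("stargazerCount"),
    )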
Example #2
def main():
    data = lib.read_csv('data/labeled-data-singlelabels.csv')
    train_tweets, test_tweets = lib.split_data(data)

    lib.write_csv(train_tweets, 'data/labeled-data-singlelabels-train.csv')
    lib.write_csv(test_tweets, 'data/labeled-data-singlelabels-test.csv')

    train_tweets2, test_tweets2 = lib.read_data()

    assert len(train_tweets) == len(train_tweets2)
    assert len(test_tweets) == len(test_tweets2)
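lib.split_data is not shown here. A minimal sketch of what it could do, assuming a simple deterministic 80/20 split (the real ratio and any shuffling are project-specific):

def split_data(rows, train_fraction=0.8):
    """
    Hypothetical sketch: split rows into a train portion and a test portion.
    """
    split_index = int(len(rows) * train_fraction)
    return rows[:split_index], rows[split_index:]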
Example #3
def main(args):
    """
    Main command-line function.
    """
    if len(args) not in (2, 4) or set(args).intersection({"-h", "--help"}):
        lib.text.eprint(f"Usage: {__file__} login LOGIN")
        sys.exit(1)

    path = "queries/repos/repos_recent_commits.gql"
    variables = lib.process_variables(args)

    out_data = []
    query_counter = 0

    while True:
        query_counter += 1
        print(f"Query #{query_counter}")
        data = lib.query_by_filename(path, variables)

        repositories = data["repositoryOwner"]["repositories"]
        if query_counter == 1:
            print(f"Total count: {repositories['totalCount']}")

        for repo_node in repositories["nodes"]:
            name = repo_node["name"]

            print(f"Name: {name}")

            branch = repo_node["defaultBranch"]
            if branch is not None:
                branch_name = branch["name"]

                commits = branch["commits"]["history"]["nodes"]

                for c in commits:
                    parsed_commit_data = parse_commit(c)
                    out_commit = dict(
                        repo_name=name,
                        branch_name=branch_name,
                        **parsed_commit_data,
                    )
                    out_data.append(out_commit)

        repo_page_info = repositories["pageInfo"]
        if repo_page_info["hasNextPage"]:
            variables["cursor"] = repo_page_info["endCursor"]
        else:
            break

    lib.write_csv(lib.VAR_DIR / "repos_recent_commits.txt", out_data)
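The parse_commit helper is not shown. A sketch of what it might return, assuming the repos_recent_commits.gql query selects standard commit fields such as committedDate, message, additions and deletions:

def parse_commit(commit_node):
    """
    Flatten one commit node into a dict for the CSV output.

    Hypothetical sketch; adjust the keys to match the fields requested in
    repos_recent_commits.gql.
    """
    return dict(
        committed_date=commit_node.get("committedDate"),
        message=commit_node.get("message"),
        additions=commit_node.get("additions"),
        deletions=commit_node.get("deletions"),
    )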
Example #4
def commits_to_csv(owner, repo_name, start_date=None):
    """
    Write a CSV of all commits in a repo.

    Existing file will be overwritten.
    """
    filename = CSV_OUT_NAME.format(
        repo_name=repo_name,
        end_date=datetime.date.today(),
        start_date=start_date if start_date else "INIT",
    )
    path = lib.VAR_DIR / filename

    repo_commits = get_commits(owner, repo_name, start_date)

    lib.write_csv(path, repo_commits, append=False)
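CSV_OUT_NAME is not defined in this snippet. Given the keys passed to .format() above, it could be a template along these lines (the exact filename pattern is an assumption):

# Hypothetical template; it must accept repo_name, start_date and end_date,
# since those are the keys passed in the .format() call above.
CSV_OUT_NAME = "commits--{repo_name}--{start_date}--{end_date}.csv"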
Example #5
def commits_to_csv(owner, repo_names, start_date=None):
    """
    Fetch commits for given repos and append to a CSV after each repo is done.
    """
    filename = CSV_OUT_NAME.format(
        end_date=datetime.date.today(),
        start_date=start_date if start_date else "INIT",
    )
    path = lib.VAR_DIR / filename

    summary_filename = CSV_OUT_NAME_SUMMARY.format(
        end_date=datetime.date.today(),
        start_date=start_date if start_date else "INIT",
    )
    summary_path = lib.VAR_DIR / summary_filename

    print(f"Start: {start_date if start_date else 'first commit'}")
    print()

    repos_summary = []

    for repo_name in repo_names:
        commits = repo_commits.get_commits(owner, repo_name, start_date)
        # TODO: delete the existing file if this is the first round. Or wait until
        # the end and write everything out with overwriting, but that just means an
        # incomplete report is not generated if the script aborts, which might be
        # okay.

        lib.write_csv(path, commits, append=True)

        # TODO Move this to a separate function, possibly using commits returned
        # from the function.
        repo_summary = defaultdict(int)
        repo_summary["name"] = repo_name
        repo_summary["commits"] = len(commits)
        # Note that lines changed or files changed would not be accurate when
        # appending commits, so those are left out.
        for commit in commits:
            repo_summary["additions"] += commit["additions"]
            repo_summary["deletions"] += commit["deletions"]
        repos_summary.append(repo_summary)

    lib.write_csv(summary_path, repos_summary)
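The TODO above suggests extracting the per-repo summary into its own function. One possible shape for that helper, assuming each commit dict carries additions and deletions counts:

def summarize_repo(repo_name, commits):
    """
    Build a one-row summary for a repo from its fetched commits.

    Hypothetical helper extracted from the loop above; not part of the
    original source.
    """
    summary = defaultdict(int)
    summary["name"] = repo_name
    summary["commits"] = len(commits)
    for commit in commits:
        summary["additions"] += commit["additions"]
        summary["deletions"] += commit["deletions"]
    return summary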
Example #6
def main() -> None:
    """
    Main command-line function to create a report of GitHub commit activity.

    Fetch and write commits out as a CSV report, where each row includes the
    details of a single commit including stats, metadata and the repo and branch
    labels.

    For the configured repos, get all available branches. Start with
    master, then develop, then the feature branches (leaving them
    in alphabetical order). Get the stats across the commits by starting
    with the HEAD commit and get its parents recursively. Skip commits older
    than the min date. Once commits are fetched (each requiring a GET request),
    then filter to just those by the configured users. Filter out commits
    which have no author set.

    We keep track of the SHA commit values seen when iterating through a branch
    (since a merge commit will have two histories which should have a common
    commit which they diverged from). Additionally, we keep track of SHA commit
    values across branches in a repo, so that after we have traversed master
    all the way back to its initial commit (if the date range allows), then
    we only have to look at commits which are previously traversed branches
    when going through develop (if it exists) and any feature branches.
    """
    if config.MIN_DATE:
        print(f"Commit min date: {config.MIN_DATE}")
    else:
        print("No commit min date set")
    print()

    out_data = []
    for repo in lib.get_repos():
        print(f"REPO: {repo.name}\n")

        seen_commits = set()

        fetched_branches = repo.get_branches()
        branch_list = []

        for branch in fetched_branches:
            if not branch_list:
                branch_list.append(branch)
            elif branch.name == "master":
                branch_list.insert(0, branch)
            elif branch.name in ("develop", "development"):
                dev_insert_index = 1 if branch_list[0].name == "master" else 0
                branch_list.insert(dev_insert_index, branch)
            else:
                branch_list.append(branch)

        for branch in branch_list:
            print(f"BRANCH: {branch.name}")

            print("Fetching commits")
            commits = list(traverse_commits(branch.commit, seen_commits))
            print(f"\nFound: {len(commits)}")

            if config.USERNAMES:
                commits = [
                    x for x in commits
                    if x.author and x.author.login in config.USERNAMES
                ]
                print(f"After filtering: {len(commits)}")

            for commit in commits:
                try:
                    out_row = to_row(repo, branch, commit)
                except Exception as e:
                    # Report error without aborting.
                    print(f"Could not parse Commit."
                          f" {type(e).__name__}: {str(e)}")
                else:
                    out_data.append(out_row)
            print()

    header = (
        "Repo Owner",
        "Repo Name",
        "Branch",
        "Commit SHA",
        "Commit Modified",
        "Commit Author",
        "Changed Files",
        "Added Lines",
        "Deleted Lines",
        "Changed Lines",
    )
    lib.write_csv(config.COMMIT_CSV_PATH, header, out_data)
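traverse_commits is referenced but not shown. A sketch of one way the traversal described in the docstring could be implemented, assuming PyGithub-style commit objects that expose .sha, .parents and .commit.author.date:

def traverse_commits(commit, seen_commits):
    """
    Hypothetical sketch: walk from a branch HEAD back through parent commits,
    skipping commits already seen on this repo's other branches and pruning
    anything older than the configured min date.
    """
    to_visit = [commit]
    while to_visit:
        current = to_visit.pop()
        if current.sha in seen_commits:
            continue
        seen_commits.add(current.sha)

        if config.MIN_DATE and current.commit.author.date < config.MIN_DATE:
            continue

        yield current
        to_visit.extend(current.parents)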
Example #7
def main():
    """
    Main command-line function to fetch PR data then write a CSV.

    Set the usernames value in the config so that the report will either
    filter to specific usernames or show activity for all.

    If using the BY_OWNER setting, it's best to try to retrieve the profile as
    an org first, since getting an org as a user object only gives access to
    public repos. Fall back to getting a user object if it wasn't actually
    an org.

    Use the MIN_DATE value in the config to exclude PRs which were last updated
    before the cutoff date. The API's default setting is to return PRs ordered
    by most recently created first.
        https://developer.github.com/v3/pulls/#list-pull-requests
    Therefore, if we encounter an old PR, skip the remaining PRs and go to the
    next repo.
    """
    if config.MIN_DATE:
        print(f"PR updates min date: {config.MIN_DATE}")
    else:
        print("No PR updates min date set")
    print()

    out_data = []
    for repo in lib.get_repos():
        print(f"REPO: {repo.name}")

        for pr in repo.get_pulls(state=config.PR_STATE):
            if config.MIN_DATE and pr.updated_at < config.MIN_DATE:
                print(
                    f"Skipping PRs which were updated before the"
                    f" configured min cuttoff date: {config.MIN_DATE}"
                )
                break

            author = pr.user
            if not config.USERNAMES or author.login in config.USERNAMES:
                print(f"PR #{pr.number} - author: @{author.login}")
                try:
                    out_row = to_row(repo, author, pr)
                except Exception:
                    # Keep the report generation robust by logging and skipping
                    # over any errors. Create a bug issue in the aggre-git repo
                    # on GitHub so that the error will be addressed.
                    print("Could not fetch or parse the PR.")
                    traceback.print_exc()
                    print("---")
                else:
                    out_data.append(out_row)
            else:
                print(f"PR #{pr.number} - skipping")

    header = (
        "Repo Owner",
        "Repo Name",
        "Repo URL",
        "PR ID",
        "PR Title",
        "PR From Branch",
        "PR To Branch",
        "Author",
        "PR URL",
        "Jira Ticket",
        "Status",
        "Merged/Closed WOY",
        "Merged/Closed Date",
        "PR Updated At",
        "PR Created At",
        "Latest Commit At",
        "Oldest Commit At",
        "Days Between Commits",
        "Latest Commit Author",
        "Oldest Commit Author",
        "Commits",
        "Changed Files",
        "Added Lines",
        "Deleted Lines",
        "Changed Lines",
        "Comments",
        "Merged By",
        "Reviewers",
    ) + Review.get_states()

    lib.write_csv(config.PR_CSV_PATH, header, out_data)
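The org-first, user-fallback lookup described in the docstring is not shown in this snippet. A sketch of that pattern, assuming PyGithub (the names gh and get_owner are placeholders, not from the source):

from github import Github
from github.GithubException import UnknownObjectException

def get_owner(gh: Github, login: str):
    """
    Hypothetical sketch: try the login as an org first, since fetching an org
    as a user only exposes its public repos, then fall back to a user object.
    """
    try:
        return gh.get_organization(login)
    except UnknownObjectException:
        return gh.get_user(login)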
Example #8
def main():
    filename = sys.argv[1]
    with open(filename, 'rb') as pdffile:
        result = lib.parse_pdf(pdffile)

    lib.write_csv(filename.replace(".pdf", ".csv"), result)
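lib.parse_pdf is not shown. A minimal sketch of what it could do, assuming pdfplumber is used to extract table rows from the PDF (the real parsing logic is project-specific):

import pdfplumber

def parse_pdf(pdffile):
    """
    Hypothetical sketch: collect every table row from every page so the
    result can be written out as CSV rows.
    """
    rows = []
    with pdfplumber.open(pdffile) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                rows.extend(table)
    return rows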