def get_dataframe_from_repo(repo, num=100):
    """Build a pandas dataframe counting a repo's contributors per country.

    Args:
        repo - a full GitHub repo URL
        num - number of contributors to analyze per repo

    Returns:
        df - a pandas dataframe with "country" and "contributor_count"
            columns, most common country first
        num_contributors - length of the contributor list that was analyzed
    """
    # look up the contributors for this repo
    owner_and_repo = extract_github_owner_and_repo(repo)
    contributors = get_contributors(owner_and_repo, num)
    num_contributors = len(contributors)

    # resolve each contributor to a country and tally the results
    country_tally = Counter(
        get_country_from_location(get_contributor_location(contributor))
        for contributor in contributors
    )

    # turn the (country, count) pairs into a dataframe
    df = pd.DataFrame.from_records(
        country_tally.most_common(),
        columns=["country", "contributor_count"],
    )
    return df, num_contributors
def print_by_contributor(contributors, pypi_data=None):
    """Print one line per contributor with their location and country.

    Args:
        contributors - a list of contributors
        pypi_data - optional object with a "pypi_maintainers" entry; when
            given, contributors found in that entry are marked with "*"

    Returns:
        null
    """
    # a pypi_data object is only passed in for PyPI package scans
    is_pypi_scan = pypi_data is not None

    print("CONTRIBUTOR, LOCATION")
    if is_pypi_scan:
        print("* indicates PyPI maintainer")
    print("---------------------")

    for contributor in contributors:
        location = get_contributor_location(contributor)
        country = get_country_from_location(location)
        try:
            if is_pypi_scan and contributor in pypi_data["pypi_maintainers"]:
                row = (contributor, "*", "|", location, "|", country)
            else:
                row = (contributor, "|", location, "|", country)
            print(*row)
        except UnicodeEncodeError:
            # the output stream could not encode the row; print a placeholder
            print(contributor, "| error")
def print_by_contributor(software_name, contributors, output_csv=False, pypi_data=None):
    """Print each contributor's location and country, optionally to a csv.

    Writes one line per contributor to the terminal window. When output_csv
    is true, a csv is created first and every contributor is also added to it.

    Args:
        software_name - name of package or repo
        contributors - a list of contributors
        output_csv - whether to output a csv.
        pypi_data - a pypi data object.

    Returns:
        null
    """
    # the timestamp is only needed (and only read) when a csv is written
    timestamp = None
    if output_csv:
        # current time makes the csv filename unique
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        create_csv("contributor", timestamp)

    # a pypi_data object is only passed in for PyPI package scans
    is_pypi_scan = pypi_data is not None

    print("CONTRIBUTOR, LOCATION")
    if is_pypi_scan:
        print("* indicates PyPI maintainer")
    print("---------------------")

    for contributor in contributors:
        location = get_contributor_location(contributor)
        country = get_country_from_location(location)

        if output_csv:
            add_committer_to_csv(
                "contributor",
                software_name,
                timestamp,
                contributor,
                location,
                country,
            )

        try:
            if is_pypi_scan and contributor in pypi_data["pypi_maintainers"]:
                row = (contributor, "*", "|", location, "|", country)
            else:
                row = (contributor, "|", location, "|", country)
            print(*row)
        except UnicodeEncodeError:
            # the output stream could not encode the row; print a placeholder
            print(contributor, "| error")
def print_by_country(contributors):
    """Print the number of contributors found for each country.

    Args:
        contributors: a list of contributors

    Returns:
        null
    """
    print("COUNTRY | # OF CONTRIBUTORS")
    print("---------------------------")

    # resolve each contributor to a country and tally the results
    country_tally = Counter(
        get_country_from_location(get_contributor_location(contributor))
        for contributor in contributors
    )

    # most common country first
    for country, count in country_tally.most_common():
        print(country, count)
def scan_multiple_repos(input_file="repos.txt", num=100):
    """Create csv of data for multiple repos.

    Scan through repos provided in the input file and create a single csv
    that stores all contributor-related data for each contributor in each
    repo.

    Args:
        input_file - file containing repo list, one repo per line; blank or
            whitespace-only lines are skipped
        num - max number of contributors to analyze per repo

    Returns:
        None
    """
    # create csv to store multi-repo scan results
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    create_csv("multirepo", timestamp)

    # open file that contains repos to scan and append contributors for each
    # repo to csv
    with open(input_file, "r") as input_repos:
        for line in input_repos:
            # Lines read from a file keep their trailing newline, so a blank
            # line is "\n" rather than "". Strip before the emptiness check,
            # otherwise blank lines are never skipped.
            repo = line.strip()
            if not repo:
                continue

            repo_ending_string = extract_github_owner_and_repo(repo)
            contributors = get_contributors(repo_ending_string, num)
            for contributor in contributors:
                location = get_contributor_location(contributor)
                country = get_country_from_location(location)
                add_committer_to_csv(
                    "multirepo",
                    repo_ending_string,
                    timestamp,
                    contributor,
                    location,
                    country,
                )
def test_get_contributor_location(self):
    """Unit test for get_contributor_location()."""
    expected_location = "Wellington, New Zealand"
    actual_location = get_contributor_location("anarkiwi")
    assert actual_location == expected_location