def get_dataframe_from_repo(repo, num=100):
    """Build a per-country contributor table for a single repo.

    Args:
        repo - a full GitHub repo URL
        num - number of contributors to analyze per repo

    Returns:
        df - a pandas dataframe with "country" and "contributor_count"
            columns, one row per country
        num_contributors - total number of contributors
    """
    # resolve the URL and fetch the contributors to analyze
    owner_and_repo = extract_github_owner_and_repo(repo)
    contributors = get_contributors(owner_and_repo, num)
    total = len(contributors)

    # tally one country per contributor
    tally = Counter(
        get_country_from_location(get_contributor_location(person))
        for person in contributors
    )

    # most_common() yields (country, count) pairs, highest count first
    frame = pd.DataFrame.from_records(
        tally.most_common(),
        columns=["country", "contributor_count"],
    )
    return frame, total
def test_get_country_from_location_world_cities(self):
    """Check get_country_from_location on world city names."""
    # (location string, expected country) pairs, checked in order
    cases = (
        ("Tokyo", "Japan"),
        ("London", "United Kingdom"),
        ("Jakarta", "Indonesia"),
        ("Beijing", "China"),
        ("Washington D.C.", "United States"),
        ("Toronto, ON", "Canada"),
    )
    for location, expected in cases:
        assert get_country_from_location(location) == expected
def print_by_contributor(software_name, contributors, output_csv=False, pypi_data=None):
    """Print one line per contributor with their location and country.

    Results go to the terminal window. When output_csv is true, each
    contributor is also appended to a csv file.

    Args:
        software_name - name of package or repo
        contributors - a list of contributors
        output_csv - whether to output a csv.
        pypi_data - a pypi data object.

    Returns:
        None
    """
    # a pypi_data object means this is a PyPI package scan
    is_pypi_scan = pypi_data is not None

    if output_csv:
        # the current time makes the csv filename unique
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        create_csv("contributor", timestamp)

    # header
    print("CONTRIBUTOR, LOCATION")
    if is_pypi_scan:
        print("* indicates PyPI maintainer")
    print("---------------------")

    for contributor in contributors:
        location = get_contributor_location(contributor)
        country = get_country_from_location(location)

        if output_csv:
            add_committer_to_csv(
                "contributor",
                software_name,
                timestamp,
                contributor,
                location,
                country,
            )

        try:
            # PyPI maintainers get a "*" right after their name
            starred = is_pypi_scan and contributor in pypi_data["pypi_maintainers"]
            marker = ("*",) if starred else ()
            print(contributor, *marker, "|", location, "|", country)
        except UnicodeEncodeError:
            # the terminal could not encode the line; print a placeholder
            print(contributor, "| error")
def print_by_country(contributors):
    """Print the number of contributors per country.

    Writes a header followed by one "country count" line per country to
    the terminal window, in Counter.most_common() order.

    Args:
        contributors: a list of contributors

    Returns:
        None
    """
    print("COUNTRY | # OF CONTRIBUTORS")
    print("---------------------------")

    # tally one country per contributor
    tally = Counter()
    for person in contributors:
        where = get_contributor_location(person)
        tally[get_country_from_location(where)] += 1

    for country, count in tally.most_common():
        print(country, count)
def scan_multiple_repos(input_file="repos.txt", num=100):
    """Create csv of data for multiple repos.

    Scan through the repos listed in input_file and create a single csv
    that stores all contributor-related data for each contributor in each
    repo. The input file must contain one repo per line; blank and
    whitespace-only lines are skipped.

    Args:
        input_file - file containing repo list
        num - max number of contributors to analyze per repo

    Returns:
        None
    """
    # create csv to store multi-repo scan results
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    create_csv("multirepo", timestamp)

    # append the contributors of every listed repo to the csv
    with open(input_file, "r") as input_repos:
        for line in input_repos:
            # Lines read from a file keep their trailing newline, so a
            # blank line is "\n", never "". Strip before testing for
            # emptiness, otherwise blank lines are passed on as repos.
            repo = line.strip()
            if not repo:
                continue

            repo_ending_string = extract_github_owner_and_repo(repo)
            contributors = get_contributors(repo_ending_string, num)
            for contributor in contributors:
                location = get_contributor_location(contributor)
                country = get_country_from_location(location)
                add_committer_to_csv(
                    "multirepo",
                    repo_ending_string,
                    timestamp,
                    contributor,
                    location,
                    country,
                )
def test_get_country_from_location_dataset_pull_geographies(self):
    """Cases for get_country_from_location() noted as failing as of 2/14/2021."""
    # (location string, expected country) pairs, checked in order
    cases = (
        ("Saclay", "France"),
        ("Warszawa", "Poland"),
        ("brookline, ma", "United States"),
        ("Greater Los Angeles Area", "United States"),
        ("Forschungszentrum", "Germany"),
        ("Montigny-lès-Metz", "France"),
        ("roudnice nad labem, czech republic", "Czech Republic"),
        ("Berlin/Florence", "Germany"),
        ("Greater Seattle Area", "United States"),
        ("Flanders, Europe, Earth", "Belgium"),
        ("Wrocław", "Poland"),
    )
    for location, expected in cases:
        assert get_country_from_location(location) == expected
def test_get_country_from_location_corner_case_geographies(self):
    """Check get_country_from_location on unusual geographies."""
    # (location string, expected country) pairs, checked in order.
    # The expected value for unresolvable places is the string "None".
    # A few inputs appear more than once; each occurrence is still checked.
    cases = (
        ("Palestine", "Palestine"),
        ("San Francisco Bay Area", "United States"),
        ("EU", "None"),
        ("Canary Islands", "Spain"),
        ("Earth", "None"),
        ("Sydney", "Australia"),
        ("Amsterdam", "Netherlands"),
        ("NYC", "United States"),
        ("Barcelona", "Spain"),
        ("Kerala", "India"),
        ("Hyderabad", "India"),
        ("Vancouver", "Canada"),
        ("Jiangxi", "China"),
        ("San Francisco", "United States"),
        ("New York", "United States"),
        ("Saint Petersburg", "Russia"),
        ("England", "United Kingdom"),
        ("Athens", "Greece"),
        ("Europe", "None"),
        ("Lima", "Peru"),
        ("Bay Area", "United States"),
        ("EU", "None"),
        ("Canary Islands", "Spain"),
        ("waterloo", "United Kingdom"),
        ("Europe/Berlin", "None"),
        ("York", "United Kingdom"),
        ("München", "Germany"),
        ("Montreal, CA", "Canada"),
        ("Florianópolis", "Brazil"),
        ("Montréal", "Canada"),
        ("Bangalore", "India"),
        ("Dublin", "Ireland"),
        ("Santiago de Querétaro, México", "Mexico"),
        ("Jülich", "Germany"),
        ("Victoria, BC", "Canada"),
        ("Waterloo, ON", "Canada"),
        ("Falls Church, Virginia", "United States"),
        ("Amsterdam, the Netherlands", "Netherlands"),
        ("BeiJing", "China"),
        ("Edinburgh, Scotland", "United Kingdom"),
        ("Medellín, Colombia", "Colombia"),
        ("La Jolla, CA.", "United States"),
        ("beijing", "China"),
        ("Pemberton, British Columbia", "Canada"),
        ("Timișoara", "Romania"),
        ("PRC", "China"),
        ("Amsterdam, The Netherlands", "Netherlands"),
        ("Oxford", "United Kingdom"),
        ("São Paulo", "Brazil"),
        ("Kyiv", "Ukraine"),
        ("Vancouver, BC", "Canada"),
        ("N.H.", "United States"),
        ("Sri-City, Andhra Pradesh", "India"),
        ("Scotland", "United Kingdom"),
        ("Geneva", "Switzerland"),
        ("Rotterdam, the Netherlands", "Netherlands"),
        ("Milan", "Italy"),
        ("Republic of Korea", "South Korea"),
        ("Brasília, Brazil.", "Brazil"),
        ("beijing", "China"),
        ("Zürich", "Switzerland"),
        ("Kitchener, Ontario", "Canada"),
        ("Montréal, QC", "Canada"),
        ("Glasgow, Scotland", "United Kingdom"),
        ("28 rue du Dr Roux 75015 Paris, FRANCE", "France"),
        ("Kraków", "Poland"),
        ("İstanbul", "Turkey"),
        ("Russian Federation", "Russia"),
        ("Newcastle, NSW", "Australia"),
        ("Australia, Victoria", "Australia"),
        ("Perth, Western Australia ", "Australia"),
        ("Gdańsk", "Poland"),
        ("SF", "United States"),
        ("Hyderabad (India)", "India"),
        ("BITS Pilani, Rajasthan", "India"),
        ("Sri-City, Andhra Pradesh", "India"),
    )
    for location, expected in cases:
        assert get_country_from_location(location) == expected
def test_get_country_from_location_country_abbreviations(self):
    """Check get_country_from_location on country abbreviations."""
    # (location string, expected country) pairs, checked in order
    cases = (
        ("USA", "United States"),
        ("Cambridge, UK", "United Kingdom"),
        ("UK", "United Kingdom"),
    )
    for location, expected in cases:
        assert get_country_from_location(location) == expected
def test_get_country_from_location_standard_order_no_comma(self):
    """Check get_country_from_location on standard order pairs without comma."""
    # city followed by a state abbreviation, with no separating comma
    resolved = get_country_from_location("Menlo Park CA")
    assert resolved == "United States"
def test_get_country_from_location_nonstandard_order(self):
    """Check get_country_from_location on non-standard order pairs."""
    # the country comes first, followed by the city
    for location in ("Russia, Moscow", "Russia, Nizhny Novgorod"):
        assert get_country_from_location(location) == "Russia"
def test_get_country_from_location_standard_order_with_comma(self):
    """Check get_country_from_location on standard order pairs with comma."""
    # (location string, expected country) pairs, checked in order
    cases = (
        ("Wellington, New Zealand", "New Zealand"),
        ("Jordan, Minnesota", "United States"),
        ("Jordan, MN", "United States"),
        ("Atlanta, Georgia", "United States"),
        ("Atlanta, Ga", "United States"),
        ("London, England", "United Kingdom"),
        ("Prague, Czech Republic", "Czech Republic"),
        ("Virginia, USA", "United States"),
        ("Naperville, IL", "United States"),
        ("Toronto, Ontario, Canada", "Canada"),
        ("Berlin, DE", "Germany"),
        ("CSU Sacramento", "United States"),
        ("Philadelphia, PA", "United States"),
    )
    for location, expected in cases:
        assert get_country_from_location(location) == expected