def test_multiple_dcids(self, urlopen_mock):
    """ get_property_values handles several dcids, NaN entries and nodes
    without data. """
    county_dcids = ['geoId/06085', 'geoId/24031']
    expected_towns = {
        'geoId/06085': ['geoId/0643294', 'geoId/0644112'],
        'geoId/24031': ['geoId/2462850'],
    }

    # Towns contained in Santa Clara County and Montgomery County.
    towns = dc.get_property_values(
        county_dcids, 'containedInPlace', out=False, value_type='Town')
    self.assertDictEqual(towns, expected_towns)

    # A NaN entry in the input must not show up in the result.
    dcids_with_nan = ['geoId/06085', 'geoId/24031', float('nan')]
    towns = dc.get_property_values(
        dcids_with_nan, 'containedInPlace', out=False, value_type='Town')
    self.assertDictEqual(towns, expected_towns)

    # Names of the two counties (the NaN entry is still part of the input).
    names = dc.get_property_values(dcids_with_nan, 'name')
    self.assertDictEqual(names, {
        'geoId/06085': ['Santa Clara County'],
        'geoId/24031': ['Montgomery County'],
    })

    # A node without data maps to an empty list.
    names = dc.get_property_values(['dc/p/1234'], 'name')
    self.assertDictEqual(names, {'dc/p/1234': []})
def test_multiple_dcids(self, post_mock):
    """ get_property_values returns valid results for multiple dcids. """
    # An API key has to be configured before any call is made.
    dc.set_api_key('TEST-API-KEY')

    county_dcids = ['geoId/06085', 'geoId/24031']

    # Towns contained in Santa Clara County and Montgomery County.
    expected_towns = {
        'geoId/06085': ['geoId/0643294', 'geoId/0644112'],
        'geoId/24031': ['geoId/2462850'],
    }
    towns = dc.get_property_values(
        county_dcids, 'containedInPlace', out=False, value_type='Town')
    self.assertDictEqual(towns, expected_towns)

    # Names of the two counties.
    expected_names = {
        'geoId/06085': ['Santa Clara County'],
        'geoId/24031': ['Montgomery County'],
    }
    names = dc.get_property_values(county_dcids, 'name')
    self.assertDictEqual(names, expected_names)
def download_data(self, place='country/USA', level=1):
    """Downloads GeoJSON data for the areas contained in a location.

    Looks up the type of `place`, walks `level` steps through the
    LEVEL_MAP class constant to find the type of the sub-areas to fetch,
    and stores the parsed GeoJSON of every such sub-area in
    `self.geojsons`. For example, with place='country/USA' and level=1
    the US states are fetched; with level=2, the US counties.

    Args:
        place: A string that is a valid value for the geoId property of a
            DataCommons node.
        level: Number of administrative levels down from place that
            should be fetched.

    Raises:
        ValueError: If LEVEL_MAP has no entry for a type reached while
            descending `level` steps.
        AssertionError: If `place` does not have exactly one type.
    """
    place_types = dc.get_property_values([place], "typeOf")[place]

    # Results are lists of values; exactly one type is expected here.
    assert len(place_types) == 1
    geolevel = place_types[0]

    # Step down one administrative level per iteration.
    for _ in range(level):
        if geolevel not in self.LEVEL_MAP:
            raise ValueError("Desired level does not exist.")
        geolevel = self.LEVEL_MAP[geolevel]

    subarea_dcids = dc.get_places_in([place], geolevel)[place]
    self.geojsons = dc.get_property_values(subarea_dcids,
                                           "geoJsonCoordinates")

    # Replace each raw coordinate string with its parsed GeoJSON object.
    for area, coords in self.iter_subareas():
        self.geojsons[area][0] = geojson.loads(coords)
def test_bad_dcids(self, post_mock):
    """ get_property_values returns empty results for dcids that do not
    exist. """
    # An API key has to be configured before any call is made.
    dc.set_api_key('TEST-API-KEY')

    one_unknown = ['geoId/06085', 'dc/MadDcid']
    all_unknown = ['dc/MadDcid', 'dc/MadderDcid']

    # One real county plus one dcid that does not exist.
    result_one_unknown = dc.get_property_values(
        one_unknown, 'containedInPlace', out=False)
    self.assertDictEqual(result_one_unknown, {
        'geoId/06085': ['geoId/0644112'],
        'dc/MadDcid': [],
    })

    # Two dcids that do not exist.
    result_all_unknown = dc.get_property_values(
        all_unknown, 'containedInPlace')
    self.assertDictEqual(result_all_unknown, {
        'dc/MadDcid': [],
        'dc/MadderDcid': [],
    })
def download_data(self, place='country/USA'):
    """Downloads GeoJSON data for the areas directly below a location.

    Looks up the type of `place`, maps it through the LEVEL_MAP class
    constant to the type one administrative level below, and stores the
    parsed GeoJSON of every such sub-area in `self.geojsons`. For
    example, for country/USA the AdministrativeArea1's (US states) are
    fetched.

    Args:
        place: A string that is a valid value for the geoId property of a
            DataCommons node.

    Raises:
        KeyError: If LEVEL_MAP has no entry for the type of `place`.
        AssertionError: If `place` does not have exactly one type.
    """
    place_types = dc.get_property_values([place], "typeOf")[place]

    # Results are lists of values; exactly one type is expected here.
    assert len(place_types) == 1
    subarea_type = self.LEVEL_MAP[place_types[0]]

    subarea_dcids = dc.get_places_in([place], subarea_type)[place]
    self.geojsons = dc.get_property_values(subarea_dcids,
                                           "geoJsonCoordinates")

    # Replace each raw coordinate string with its parsed GeoJSON object.
    for area, coords in self.iter_subareas():
        self.geojsons[area][0] = geojson.loads(coords)
def _load_geojsons():
    """Fetch country shapes and containment data from Data Commons.

    Returns:
        A tuple (geojsons, cip): `geojsons` maps each country dcid that
        has a 'geoJsonCoordinatesDP2' value to the shape built from the
        first such value; `cip` is the 'containedInPlace' lookup result
        for all countries.
    """
    countries = dc.get_places_in(['Earth'], 'Country')['Earth']
    raw_coords = dc.get_property_values(countries, 'geoJsonCoordinatesDP2')

    # Countries with no coordinate value are left out.
    geojsons = {
        place: geometry.shape(json.loads(values[0]))
        for place, values in raw_coords.items()
        if values
    }
    print('Got', len(geojsons), 'geojsons!')

    cip = dc.get_property_values(countries, 'containedInPlace')
    return geojsons, cip
def main():
    """Print property labels, contained cities and triples for sample nodes.

    Runs three Data Commons queries against the same list of dcids and
    prints the results: the incoming and outgoing property labels, the
    cities whose 'containedInPlace' points at each node, and the first
    five triples of each node.
    """
    # Santa Clara County plus one additional node.
    dcids = ['geoId/06085', 'dc/p/zsb968m3v1f97']

    # Print all incoming and outgoing properties of the nodes.
    # The default call (out=True) returns properties pointing away from the
    # nodes; out=False returns properties pointing toward them.
    print('Property Labels for Santa Clara County')
    out_labels = dc.get_property_labels(dcids)
    in_labels = dc.get_property_labels(dcids, out=False)
    print('> Printing properties for {}'.format(dcids))
    print('> Incoming properties: {}'.format(in_labels))
    print('> Outgoing properties: {}'.format(out_labels))

    # Print all cities that are "containedInPlace" of the nodes.
    print('Property Values for "containedInPlace" of Santa Clara County')
    prop_vals = dc.get_property_values(
        dcids, 'containedInPlace', out=False, value_type='City')
    print('> Cities contained in {}'.format(dcids))
    for dcid in dcids:
        for city_dcid in prop_vals[dcid]:
            print(' - {}'.format(city_dcid))

    # Print the first 5 triples associated with each node.
    print('Triples for Santa Clara County')
    triples = dc.get_triples(dcids)
    for dcid in dcids:
        print('> Triples for {}'.format(dcid))
        for s, p, o in triples[dcid][:5]:
            print(' - ("{}", {}, "{}")'.format(s, p, o))
def test_bad_property(self, urlopen_mock):
    """ get_property_values returns empty results for a property that does
    not exist. """
    county_dcids = ['geoId/06085', 'geoId/24031']
    expected = {'geoId/06085': [], 'geoId/24031': []}

    # Query a property that does not exist.
    prop_vals = dc.get_property_values(county_dcids, 'madProperty')
    self.assertDictEqual(prop_vals, expected)
def test_series_no_dcid(self, post_mock):
    """ get_property_values with an empty Pandas Series returns an empty
    Series. """
    # Both the input and the expected output are empty series.
    empty_input = pd.Series([])
    expected = pd.Series([])

    # Query with the empty series and compare the result.
    actual = dc.get_property_values(empty_input, 'containedInPlace')
    assert_series_equal(actual, expected)
def test_no_dcids(self, post_mock):
    """ get_property_values returns an empty dict when given no dcids. """
    # An API key has to be configured before any call is made.
    dc.set_api_key('TEST-API-KEY')

    # Query with an empty list of dcids.
    no_dcids = []
    prop_vals = dc.get_property_values(no_dcids, 'containedInPlace')
    self.assertDictEqual(prop_vals, {})
def test_bad_property(self, post_mock):
    """ get_property_values returns empty results for a property that does
    not exist. """
    # An API key has to be configured before any call is made.
    dc.set_api_key('TEST-API-KEY')

    county_dcids = ['geoId/06085', 'geoId/24031']
    expected = {'geoId/06085': [], 'geoId/24031': []}

    # Query a property that does not exist.
    prop_vals = dc.get_property_values(county_dcids, 'madProperty')
    self.assertDictEqual(prop_vals, expected)
def test_series_bad_property(self, post_mock):
    """ get_property_values with a Pandas Series and a property that does
    not exist returns a Series of empty lists. """
    # An API key has to be configured before any call is made.
    dc.set_api_key('TEST-API-KEY')

    # Input counties and the expected (empty) value list for each.
    county_series = pd.Series(['geoId/06085', 'geoId/24031'])
    expected = pd.Series([[], []])

    # Query a property that does not exist and compare the result.
    actual = dc.get_property_values(county_series, 'madProperty')
    assert_series_equal(actual, expected)
def test_series_bad_dcids(self, post_mock):
    """ get_property_values with a Pandas Series containing dcids that do
    not exist returns empty lists for those dcids. """
    # An API key has to be configured before any call is made.
    dc.set_api_key('TEST-API-KEY')

    # Inputs: one series with a single unknown dcid, one with only unknowns.
    one_unknown = pd.Series(['geoId/06085', 'dc/MadDcid'])
    all_unknown = pd.Series(['dc/MadDcid', 'dc/MadderDcid'])

    # Expected value lists, aligned with the inputs above.
    expected_one_unknown = pd.Series([['geoId/0644112'], []])
    expected_all_unknown = pd.Series([[], []])

    # Run both queries before checking either result.
    actual_one_unknown = dc.get_property_values(
        one_unknown, 'containedInPlace', out=False)
    actual_all_unknown = dc.get_property_values(
        all_unknown, 'containedInPlace', out=False)

    assert_series_equal(actual_one_unknown, expected_one_unknown)
    assert_series_equal(actual_all_unknown, expected_all_unknown)
def test_dataframe(self, post_mock):
    """ get_property_values with a Pandas DataFrame returns the correct
    results as a Series. """
    # An API key has to be configured before any call is made.
    dc.set_api_key('TEST-API-KEY')

    # Single-column frame of county dcids and the expected towns per row.
    county_frame = pd.DataFrame({'dcids': ['geoId/06085', 'geoId/24031']})
    expected = pd.Series([
        ['geoId/0643294', 'geoId/0644112'],
        ['geoId/2462850'],
    ])

    # Query with the frame as input and compare the result.
    actual = dc.get_property_values(
        county_frame, 'containedInPlace', out=False, value_type='Town')
    assert_series_equal(actual, expected)
args = argparser.parse_args()

# Look up the country's name from the metadata table.
metadata = read_file(ROOT / "src" / "data" / "metadata.csv").set_index("key")
country_name = metadata.loc[args.country_code, "country_name"]

# Convert the 2-letter country code into its 3-letter equivalent.
code_table = read_file(
    ROOT / "src" / "data" / "country_codes.csv").set_index("key")
country_code_alpha_3 = code_table.loc[args.country_code, "3166-1-alpha-3"]

dc.set_api_key(args.dc_api_key)

# Fetch the NUTS regions of the requested level and their names.
country = "country/{}".format(country_code_alpha_3)
nuts_name = "EurostatNUTS{}".format(args.nuts_level)
regions = dc.get_places_in([country], nuts_name)[country]
names = dc.get_property_values(regions, "name")

# Emit one CSV row per region.
row_template = ("{country_code}_{region_code},"
                "{country_code},"
                "{country_name},"
                "{region_code},"
                "{region_name},"
                ",,,0")
for region_dcid, region_names in names.items():
    region_name = region_names[0]
    # Keep the last path segment of the dcid, minus its first two characters.
    region_code = region_dcid.split("/")[-1][2:]
    print(row_template.format(
        country_code=args.country_code,
        region_code=region_code,
        country_name=country_name,
        region_name=region_name,
    ))
def main():
    """Walk through property lookups with plain lists and Pandas DataFrames.

    First queries property labels, contained cities and triples for Santa
    Clara County using a list of dcids, then builds a DataFrame of
    counties and expands it with county names, contained cities and city
    names, printing every intermediate result.
    """
    # Set the dcid to be that of Santa Clara County.
    dcids = ['geoId/06085']

    # Print all incoming and outgoing properties of Santa Clara County.
    # The default call (out=True) returns properties pointing away from the
    # node; out=False returns properties pointing toward it.
    utils._print_header('Property Labels for Santa Clara County')
    out_labels = dc.get_property_labels(dcids)
    in_labels = dc.get_property_labels(dcids, out=False)
    print('> Printing properties for {}'.format(dcids))
    print('> Incoming properties: {}'.format(in_labels))
    print('> Outgoing properties: {}'.format(out_labels))

    # Print all cities that are "containedInPlace" of Santa Clara County.
    utils._print_header(
        'Property Values for "containedInPlace" of Santa Clara County')
    prop_vals = dc.get_property_values(dcids,
                                       'containedInPlace',
                                       out=False,
                                       value_type='City')
    print('> Cities contained in {}'.format(dcids))
    for dcid in dcids:
        for city_dcid in prop_vals[dcid]:
            print(' - {}'.format(city_dcid))

    # Print the first 5 triples associated with Santa Clara County.
    utils._print_header('Triples for Santa Clara County')
    triples = dc.get_triples(dcids)
    for dcid in dcids:
        print('> Triples for {}'.format(dcid))
        for s, p, o in triples[dcid][:5]:
            print(' - ("{}", {}, "{}")'.format(s, p, o))

    # get_property_values can be easily used to populate Pandas DataFrames.
    # First create a DataFrame with some data.
    utils._print_header('Initialize the DataFrame')
    pd_frame = pd.DataFrame({'county': ['geoId/06085', 'geoId/24031']})
    print(pd_frame)

    # Get the names for the given counties.
    utils._print_header('Get County Names')
    pd_frame['county_name'] = dc.get_property_values(pd_frame['county'],
                                                     'name')
    print(pd_frame)

    # Get the cities contained in these counties.
    utils._print_header('Get Contained Cities')
    pd_frame['city'] = dc.get_property_values(pd_frame['county'],
                                              'containedInPlace',
                                              out=False,
                                              value_type='City')
    print(pd_frame)

    # To expand on a column with get_property_values, the data frame has to
    # be flattened first. Clients can use flatten_frame to do this.
    utils._print_header('Flatten the Frame')
    pd_frame = dc.flatten_frame(pd_frame)
    print(pd_frame)

    # Get the names for each city.
    utils._print_header('Get City Names')
    pd_frame['city_name'] = dc.get_property_values(pd_frame['city'], 'name')
    print(pd_frame)

    # Format the final frame.
    utils._print_header('The Final Frame')
    pd_frame = dc.flatten_frame(pd_frame)
    print(pd_frame)
def test_no_dcids(self, urlopen_mock):
    """ get_property_values returns an empty dict when given no dcids. """
    # Query with an empty list of dcids.
    no_dcids = []
    prop_vals = dc.get_property_values(no_dcids, 'containedInPlace')
    self.assertDictEqual(prop_vals, {})
def main():
    """Walk through population and observation lookups for three states.

    Queries employed-person populations and a December 2018 observation
    using plain lists, repeats the workflow with a Pandas DataFrame, and
    finally prints the population/observation data of Mountain View.
    """
    # dcids of California, Kentucky, and Maryland.
    ca, ky, md = 'geoId/06', 'geoId/21', 'geoId/24'
    dcids = [ca, ky, md]

    # Keyword arguments shared by both observation queries below.
    observation_options = dict(observation_period='P1M',
                               measurement_method='BLSSeasonallyAdjusted')

    # Populations of all employed individuals in the three states.
    utils._print_header('Get Populations for All Employed Individuals')
    employed = dc.get_populations(
        dcids,
        'Person',
        constraining_properties={'employment': 'BLS_Employed'})
    print('> Printing all populations of employed individuals\n')
    print(json.dumps(employed, indent=2))

    # Count observation for Maryland's employed population, December 2018.
    utils._print_header(
        'Get Population Counts for Employed Individuals in Maryland')
    pop_dcids = [employed[md]]
    print('> Requesting observations for {} in December 2018\n'.format(
        pop_dcids))
    obs = dc.get_observations(pop_dcids, 'count', 'measuredValue', '2018-12',
                              **observation_options)
    print(json.dumps(obs, indent=2))

    # Same workflow with a Pandas DataFrame, starting from the three states
    # and their names.
    utils._print_header('Initialize the DataFrame')
    pd_frame = pd.DataFrame({'state': ['geoId/06', 'geoId/21', 'geoId/24']})
    pd_frame['state_name'] = dc.get_property_values(pd_frame['state'], 'name')
    pd_frame = dc.flatten_frame(pd_frame)
    print(pd_frame)

    # Add a column with the employed population of each state.
    utils._print_header('Add Population and Observation to DataFrame')
    pd_frame['employed_pop'] = dc.get_populations(
        pd_frame['state'],
        'Person',
        constraining_properties={'employment': 'BLS_Employed'})

    # Add a column with the count observation of each population.
    pd_frame['employed_count'] = dc.get_observations(
        pd_frame['employed_pop'], 'count', 'measuredValue', '2018-12',
        **observation_options)
    print(pd_frame)

    # Final dataframe. Use the convenience function "clean_frame" to convert
    # columns to numerical types.
    utils._print_header('Final Data Frame')
    pd_frame = dc.clean_frame(pd_frame)
    print(pd_frame)

    # All population and observation data of Mountain View.
    utils._print_header('Get Mountain View population and observation')
    popobs = dc.get_pop_obs("geoId/0649670")
    pprint.pprint(popobs)
def __addNameCol(self, df):
    """Add a 'name' column to `df`, looked up from its index.

    The index values are passed to Data Commons as dcids; the first
    'name' value of each one is stored in the new column. `df` is
    modified in place and nothing is returned.
    """
    names_by_dcid = dc.get_property_values(df.index, 'name')
    df['name'] = df.index.map(names_by_dcid)
    # Each lookup result is a list of names; keep only the first one.
    df['name'] = df['name'].str[0]