def test_update_in_hdx(self, configuration, post_update):
    resource = Resource(configuration)
    resource['id'] = 'NOTEXIST'
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    resource['name'] = 'LALA'
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    resource = Resource.read_from_hdx(configuration, 'TEST1')
    assert resource['id'] == 'de6549d8-268b-4dfe-adaf-a4ae5c8510d5'
    assert resource['format'] == 'XLSX'
    resource['format'] = 'CSV'
    resource['id'] = 'TEST1'
    resource['name'] = 'MyResource1'
    resource.update_in_hdx()
    assert resource['id'] == 'TEST1'
    assert resource['format'] == 'CSV'
    resource['id'] = 'NOTEXIST'
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    del resource['id']
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource_data['name'] = 'MyResource1'
    resource_data['id'] = 'TEST1'
    resource = Resource(configuration, resource_data)
    resource.create_in_hdx()
    assert resource['id'] == 'TEST1'
    assert resource['format'] == 'xlsx'
def generate_dataset(configuration, countryName):
    # some countries have a display name that differs from the API name
    showedName = countryName
    if countryName == "Ivory Coast":
        showedName = "Cote d'Ivoire"
    name = showedName + '-healthsites'
    title = showedName + '-healthsites'
    slugified_name = slugify(name).lower()
    dataset = Dataset({
        'name': slugified_name,
        'title': title,
    })
    # generate the data files for the country
    getCountryHealthSites(configuration, countryName)
    data_folder = configuration.read()['data_folder']
    # geojson resource
    if os.path.isfile(data_folder + countryName + '.geojson'):
        rName = countryName + '-healthsites-geojson'
        geojsonResource = Resource()
        geojsonResource['name'] = rName
        geojsonResource['format'] = 'geojson'
        geojsonResource['url'] = configuration.read()['base_url']
        geojsonResource['description'] = countryName + ' healthsites geojson'
        geojsonResource.set_file_to_upload(data_folder + countryName + '.geojson')
        geojsonResource.check_required_fields(['group', 'package_id'])
        dataset.add_update_resource(geojsonResource)
    # csv resource
    if os.path.isfile(data_folder + countryName + '.csv'):
        resource_csv = Resource()
        resource_csv['name'] = countryName + '-healthsites-csv'
        resource_csv['description'] = countryName + ' healthsites csv'
        resource_csv['format'] = 'csv'
        resource_csv.set_file_to_upload(data_folder + countryName + '.csv')
        resource_csv.check_required_fields(['group', 'package_id'])
        dataset.add_update_resource(resource_csv)
    # shp resource
    if os.path.isfile(data_folder + countryName + '-shapefiles.zip'):
        resource_shp = Resource()
        resource_shp['name'] = countryName + '-healthsites-shp'
        resource_shp['format'] = 'zipped shapefile'
        resource_shp['description'] = countryName + ' healthsites shapefiles'
        resource_shp.set_file_to_upload(data_folder + countryName + '-shapefiles.zip')
        resource_shp.check_required_fields(['group', 'package_id'])
        dataset.add_update_resource(resource_shp)
    return dataset
def test_update_in_hdx(self, configuration, post_update):
    resource = Resource()
    resource['id'] = 'NOTEXIST'
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    resource['name'] = 'LALA'
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    resource = Resource.read_from_hdx('74b74ae1-df0c-4716-829f-4f939a046811')
    assert resource['id'] == 'de6549d8-268b-4dfe-adaf-a4ae5c8510d5'
    assert resource.get_file_type() == 'csv'
    resource.set_file_type('XLSX')
    resource['id'] = '74b74ae1-df0c-4716-829f-4f939a046811'
    resource['name'] = 'MyResource1'
    resource.update_in_hdx()
    assert resource['id'] == '74b74ae1-df0c-4716-829f-4f939a046811'
    assert resource['format'] == 'xlsx'
    assert resource.get_file_type() == 'xlsx'
    assert resource['url_type'] == 'api'
    assert resource['resource_type'] == 'api'
    assert resource['url'] == 'https://raw.githubusercontent.com/OCHA-DAP/hdx-python-api/master/tests/fixtures/test_data.csv'
    assert resource['state'] == 'active'
    filetoupload = join('tests', 'fixtures', 'test_data.csv')
    resource.set_file_to_upload(filetoupload)
    resource.update_in_hdx()
    assert resource['url_type'] == 'upload'
    assert resource['resource_type'] == 'file.upload'
    assert resource['url'] == 'http://test-data.humdata.org/dataset/6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d/resource/de6549d8-268b-4dfe-adaf-a4ae5c8510d5/download/test_data.csv'
    assert resource['state'] == 'active'
    resource['id'] = 'NOTEXIST'
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    del resource['id']
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    resource.data = dict()
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource_data['name'] = 'MyResource1'
    resource_data['id'] = '74b74ae1-df0c-4716-829f-4f939a046811'
    resource = Resource(resource_data)
    resource.create_in_hdx()
    assert resource['id'] == '74b74ae1-df0c-4716-829f-4f939a046811'
    assert resource.get_file_type() == 'xlsx'
    assert resource['state'] == 'active'
def test_resource_views(self, configuration, post_resourceview):
    resource = Resource({'id': '25982d1c-f45a-45e1-b14e-87d367413045'})
    with pytest.raises(HDXError):
        resource.add_update_resource_view('123')
    resource_view = copy.deepcopy(resource_view_list[0])
    del resource_view['id']
    del resource_view['package_id']
    resource.add_update_resource_view(resource_view)
    resource_view = copy.deepcopy(resource_view_list[1])
    del resource_view['id']
    del resource_view['package_id']
    with pytest.raises(HDXError):
        resource.add_update_resource_views('123')
    resource.add_update_resource_views([resource_view])
    resource_views = resource.get_resource_views()
    assert resource_views[0]['id'] == 'd80301b5-4abd-49bd-bf94-fa4af7b6e7a4'
    assert resource_views[1]['id'] == 'c06b5a0d-1d41-4a74-a196-41c251c76023'
    with pytest.raises(HDXError):
        resource.delete_resource_view('123')
    resource.delete_resource_view('d80301b5-4abd-49bd-bf94-fa4af7b6e7a4')
    resource.delete_resource_view(resource_view)
    resource_view['title'] = 'XXX'
    with pytest.raises(HDXError):
        resource.delete_resource_view(resource_view)
    with pytest.raises(HDXError):
        resource.reorder_resource_views('123')
    resource.reorder_resource_views(['c06b5a0d-1d41-4a74-a196-41c251c76023',
                                     'd80301b5-4abd-49bd-bf94-fa4af7b6e7a4'])
    resource.reorder_resource_views(resource_view_list)
    resource_view = copy.deepcopy(resource_view_list[0])
    resource_view['id'] = '123'
    with pytest.raises(HDXError):
        resource.reorder_resource_views([resource_view_list[1], resource_view])
def add_update_resource(self, resource, ignore_datasetid=False):
    # type: (Union[Resource,dict,str], Optional[bool]) -> None
    """Add new or update existing resource in dataset with new metadata

    Args:
        resource (Union[Resource,dict,str]): Either resource id or resource metadata from a Resource object or a dictionary
        ignore_datasetid (Optional[bool]): Whether to ignore dataset id in the resource

    Returns:
        None
    """
    if isinstance(resource, str):
        resource = Resource.read_from_hdx(resource, configuration=self.configuration)
    elif isinstance(resource, dict):
        resource = Resource(resource, configuration=self.configuration)
    if isinstance(resource, Resource):
        if 'package_id' in resource:
            if not ignore_datasetid:
                raise HDXError('Resource %s being added already has a dataset id!' % (resource['name']))
        resource_updated = self._addupdate_hdxobject(self.resources, 'name', resource)
        resource_updated.set_file_to_upload(resource.get_file_to_upload())
        return
    raise HDXError('Type %s cannot be added as a resource!' % type(resource).__name__)
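# A minimal usage sketch for the add_update_resource variant above, showing all
# three accepted argument forms. Assumes a configured HDX connection; the
# dataset name, resource id and URL below are illustrative only.
dataset = Dataset.read_from_hdx('my-dataset')
# by resource id: a resource read from HDX already carries package_id, which is
# why ignore_datasetid exists
dataset.add_update_resource('de6549d8-268b-4dfe-adaf-a4ae5c8510d5',
                            ignore_datasetid=True)
# by dict (wrapped into a Resource) or by Resource object, merged by name
dataset.add_update_resource({'name': 'MyResource1', 'format': 'csv',
                             'url': 'https://example.com/data.csv',
                             'description': 'example resource'})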
def generate_dataset(configuration):
    url = configuration['base_url'] + configuration['api']
    loaData.writeData(url)
    name = 'Africa health facilities'
    title = 'Africa health facilities data'
    slugified_name = slugify(name).lower()
    dataset = Dataset(configuration, {})
    dataset['name'] = slugified_name
    dataset['title'] = title
    date = time.strftime("%d/%m/%Y")
    dataset['dataset_date'] = date
    dataset.add_continent_location('AF')
    rName = "sen-healthfacilities"
    resource = Resource()
    resource['name'] = rName
    resource['format'] = 'geojson'
    resource['url'] = url
    resource['description'] = configuration['base_url']
    resource['url_type'] = 'api'
    resource['resource_type'] = 'api'
    resource.set_file_to_upload(configuration['data_folder'] + 'sen-healthfacilities.geojson')
    dataset.add_update_resource(resource)
    return dataset
def test_update_in_hdx(self, configuration, post_update):
    dataset = Dataset()
    dataset['id'] = 'NOTEXIST'
    with pytest.raises(HDXError):
        dataset.update_in_hdx()
    dataset['name'] = 'LALA'
    with pytest.raises(HDXError):
        dataset.update_in_hdx()
    dataset = Dataset.read_from_hdx('TEST1')
    assert dataset['id'] == '6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d'
    assert dataset['dataset_date'] == '06/04/2016'
    dataset['dataset_date'] = '02/26/2016'
    dataset['id'] = 'TEST1'
    dataset['name'] = 'MyDataset1'
    dataset.update_in_hdx()
    assert dataset['id'] == 'TEST1'
    assert dataset['dataset_date'] == '02/26/2016'
    dataset['id'] = 'NOTEXIST'
    with pytest.raises(HDXError):
        dataset.update_in_hdx()
    del dataset['id']
    with pytest.raises(HDXError):
        dataset.update_in_hdx()
    dataset_data = copy.deepcopy(TestDataset.dataset_data)
    gallery_data = copy.deepcopy(TestDataset.gallery_data)
    dataset_data['name'] = 'MyDataset1'
    dataset_data['id'] = 'TEST1'
    dataset = Dataset(dataset_data)
    dataset.add_update_gallery(gallery_data)
    dataset.create_in_hdx()
    assert dataset['id'] == 'TEST1'
    assert dataset['dataset_date'] == '03/23/2016'
    assert len(dataset.resources) == 2
    assert len(dataset.gallery) == 1
    dataset.update_in_hdx()
    assert len(dataset.resources) == 2
    assert len(dataset.gallery) == 1
    dataset = Dataset.read_from_hdx('TEST4')
    del gallery_data[0]['id']
    dataset.add_update_gallery(gallery_data)
    dataset['id'] = 'TEST4'
    dataset.update_in_hdx()
    assert len(dataset.resources) == 2
    assert len(dataset.gallery) == 1
    dataset = Dataset.read_from_hdx('TEST4')
    resources_data = copy.deepcopy(TestDataset.resources_data)
    resource = Resource(resources_data[0])
    file = tempfile.NamedTemporaryFile(delete=False)
    resource.set_file_to_upload(file.name)
    dataset.add_update_resource(resource)
    dataset.update_in_hdx()
    os.unlink(file.name)
    assert len(dataset.resources) == 2
    assert len(dataset.gallery) == 0
def test_update_in_hdx(self, configuration, post_update):
    resource = Resource()
    resource['id'] = 'NOTEXIST'
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    resource['name'] = 'LALA'
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    resource = Resource.read_from_hdx('TEST1')
    assert resource['id'] == 'de6549d8-268b-4dfe-adaf-a4ae5c8510d5'
    assert resource['format'] == 'XLSX'
    resource['format'] = 'CSV'
    resource['id'] = 'TEST1'
    resource['name'] = 'MyResource1'
    resource.update_in_hdx()
    assert resource['id'] == 'TEST1'
    assert resource['format'] == 'CSV'
    assert resource['url_type'] == 'api'
    assert resource['resource_type'] == 'api'
    assert resource['url'] == 'https://raw.githubusercontent.com/OCHA-DAP/hdx-python-api/master/tests/fixtures/test_data.csv'
    resource.set_file_to_upload('fixtures/test_data.csv')
    resource.update_in_hdx()
    assert resource['url_type'] == 'upload'
    assert resource['resource_type'] == 'file.upload'
    assert resource['url'] == 'http://test-data.humdata.org/dataset/6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d/resource/de6549d8-268b-4dfe-adaf-a4ae5c8510d5/download/test_data.csv'
    resource['id'] = 'NOTEXIST'
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    del resource['id']
    with pytest.raises(HDXError):
        resource.update_in_hdx()
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource_data['name'] = 'MyResource1'
    resource_data['id'] = 'TEST1'
    resource = Resource(resource_data)
    resource.create_in_hdx()
    assert resource['id'] == 'TEST1'
    assert resource['format'] == 'xlsx'
def test_get_dataset(self, configuration, post_dataset):
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource = Resource(resource_data)
    dataset = resource.get_dataset()
    assert dataset['id'] == '6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d'
    del resource['package_id']
    with pytest.raises(HDXError):
        resource.get_dataset()
def test_update_yaml(self, configuration, static_yaml):
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource = Resource(resource_data)
    assert resource['name'] == 'MyResource1'
    assert resource['format'] == 'xlsx'
    resource.update_from_yaml(static_yaml)
    assert resource['name'] == 'MyResource1'
    assert resource['format'] == 'csv'
def test_update_json(self, configuration, static_json):
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource = Resource(resource_data)
    assert resource['name'] == 'MyResource1'
    assert resource.get_file_type() == 'xlsx'
    resource.update_from_json(static_json)
    assert resource['name'] == 'MyResource1'
    assert resource.get_file_type() == 'zipped csv'
def test_check_url_filetoupload(self, configuration):
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource = Resource(resource_data)
    resource.check_url_filetoupload()
    resource.set_file_to_upload('abc')
    resource.check_url_filetoupload()
    resource['url'] = 'lala'
    with pytest.raises(HDXError):
        resource.check_url_filetoupload()
def test_create_in_hdx(self, configuration, post_create):
    resource = Resource()
    with pytest.raises(HDXError):
        resource.create_in_hdx()
    resource['id'] = 'TEST1'
    resource['name'] = 'LALA'
    with pytest.raises(HDXError):
        resource.create_in_hdx()
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource = Resource(resource_data)
    resource.create_in_hdx()
    assert resource['id'] == 'de6549d8-268b-4dfe-adaf-a4ae5c8510d5'
    assert resource['url_type'] == 'api'
    assert resource['resource_type'] == 'api'
    assert resource['url'] == 'https://raw.githubusercontent.com/OCHA-DAP/hdx-python-api/master/tests/fixtures/test_data.csv'
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource = Resource(resource_data)
    filetoupload = join('tests', 'fixtures', 'test_data.csv')
    resource.set_file_to_upload(filetoupload)
    assert resource.get_file_to_upload() == filetoupload
    resource.create_in_hdx()
    assert resource['url_type'] == 'upload'
    assert resource['resource_type'] == 'file.upload'
    assert resource['url'] == 'http://test-data.humdata.org/dataset/6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d/resource/de6549d8-268b-4dfe-adaf-a4ae5c8510d5/download/test_data.csv'
    resource_data['name'] = 'MyResource2'
    resource = Resource(resource_data)
    with pytest.raises(HDXError):
        resource.create_in_hdx()
    resource_data['name'] = 'MyResource3'
    resource = Resource(resource_data)
    with pytest.raises(HDXError):
        resource.create_in_hdx()
def test_create_in_hdx(self, configuration, post_create):
    resource = Resource(configuration)
    with pytest.raises(HDXError):
        resource.create_in_hdx()
    resource['id'] = 'TEST1'
    resource['name'] = 'LALA'
    with pytest.raises(HDXError):
        resource.create_in_hdx()
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource = Resource(configuration, resource_data)
    resource.create_in_hdx()
    assert resource['id'] == 'de6549d8-268b-4dfe-adaf-a4ae5c8510d5'
    resource_data['name'] = 'MyResource2'
    resource = Resource(configuration, resource_data)
    with pytest.raises(HDXError):
        resource.create_in_hdx()
    resource_data['name'] = 'MyResource3'
    resource = Resource(configuration, resource_data)
    with pytest.raises(HDXError):
        resource.create_in_hdx()
def test_add_update_delete_resources(self, configuration, post_delete):
    dataset_data = copy.deepcopy(TestDataset.dataset_data)
    resources_data = copy.deepcopy(TestDataset.resources_data)
    dataset = Dataset(dataset_data)
    dataset.add_update_resources(resources_data)
    assert len(dataset.resources) == 2
    dataset.delete_resource('NOTEXIST')
    assert len(dataset.resources) == 2
    dataset.delete_resource('de6549d8-268b-4dfe-adaf-a4ae5c8510d5')
    assert len(dataset.resources) == 1
    resources_data = copy.deepcopy(TestDataset.resources_data)
    resource = Resource(resources_data[0])
    resource.set_file_to_upload('lala')
    dataset.add_update_resource(resource)
    assert dataset.resources[1].get_file_to_upload() == 'lala'
def generate_dataset_and_showcase(countryName, countryISO2):
    title = '%s - Demographic, Health, Education and Transport indicators' % countryName
    logger.info('Creating dataset: %s' % title)
    name = 'unhabitat-%s-indicators' % countryISO2
    slugified_name = slugify(name).lower()
    dataset = Dataset({
        'name': slugified_name,
        'title': title,
    })
    dataset.set_dataset_year_range(1950, 2050)
    dataset.set_expected_update_frequency('Every year')
    dataset.set_subnational(1)
    dataset.add_country_location(getCountryISO3Code(countryISO2))
    dataset.add_tags(['EDUCATION', 'POPULATION', 'HEALTH', 'TRANSPORT', 'HXL'])
    if os.path.isfile('data/indicator_data_' + countryISO2 + '.csv'):
        resource = Resource()
        resource['name'] = 'Indicators_data_%s' % countryISO2
        resource['description'] = '%s - Demographic, Health, Education and Transport indicators' % countryName
        resource['format'] = 'csv'
        resource.set_file_to_upload('data/indicator_data_' + countryISO2 + '.csv')
        resource.check_required_fields(['group', 'package_id'])
        dataset.add_update_resource(resource)
    showcase_name = slugify('unhabitat-%s indicators-data' % countryName).lower()
    showcase = Showcase({
        'name': showcase_name,
        'title': 'Explore %s indicators' % countryName,
        'notes': 'Explore %s indicators' % countryName,
        'url': 'http://urbandata.unhabitat.org/data-country/?countries=%s&indicators=total_length_road,rural_population,urban_population_countries,urban_slum_population_countries,population,income_gini_coefficient_countries' % countryISO2,
        'image_url': 'https://centre.humdata.org/wp-content/uploads/2018/09/unhabitat-showcase.png'
    })
    showcase.add_tags(['EDUCATION', 'POPULATION', 'HEALTH', 'TRANSPORT'])
    return dataset, showcase
def generateDatasetBykey(key, countryName):
    # safe_load avoids arbitrary object construction and does not need a Loader argument
    with open('config/metadata.yml', 'r') as f:
        metadata = yaml.safe_load(f)
    title = '%s - %s' % (countryName, metadata[key]['title'])
    name = metadata[key]['name']
    desc = metadata[key]['notes']
    slugified_name = slugify(name).lower()
    dataset = Dataset({
        'name': slugified_name,
        'title': title,
        'description': desc
    })
    dataset.set_dataset_year_range(1985, 2017)
    dataset.set_expected_update_frequency('Every year')
    dataset.set_subnational(1)
    dataset.add_country_location(countryName)
    resource = Resource()
    rName = ''
    upCountry = countryName.upper()
    if key == 'education':
        dataset.add_tag('EDUCATION')
        rName = 'UNECA %s - Education' % countryName
        resource.set_file_to_upload('data/%s-education.csv' % upCountry)
    if key == 'health':
        dataset.add_tag('health')
        rName = 'UNECA %s - Health' % countryName
        resource.set_file_to_upload('data/%s-health.csv' % upCountry)
    if key == 'population_and_migration':
        dataset.add_tags(['population', 'migration'])
        rName = 'UNECA %s - Population and Migration' % countryName
        resource.set_file_to_upload('data/%s-population_and_migration.csv' % upCountry)
    resource['name'] = rName
    resource['description'] = 'UNECA %s data' % countryName
    resource['format'] = 'csv'
    dataset.add_update_resource(resource)
    print("==================== %s dataset generated ====================" % key)
    return dataset
def add_update_resource(self, resource: Any) -> None:
    """Add new or update existing resource in dataset with new metadata

    Args:
        resource (Any): Resource metadata either from a Resource object or a dictionary

    Returns:
        None
    """
    if isinstance(resource, dict):
        resource = Resource(self.configuration, resource)
    if isinstance(resource, Resource):
        if 'package_id' in resource:
            raise HDXError("Resource %s being added already has a dataset id!" % (resource['name']))
        self._addupdate_hdxobject(self.resources, 'name', self._underlying_object, resource)
        return
    raise HDXError("Type %s cannot be added as a resource!" % type(resource).__name__)
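# Minimal sketch of how this typed variant is called (assumes dataset is a
# Dataset whose self.configuration is set; metadata values are illustrative).
# A dict without 'package_id' is wrapped into a Resource and merged by name:
dataset.add_update_resource({'name': 'MyResource1', 'format': 'csv',
                             'url': 'https://example.com/data.csv'})
# Unlike the variant further above, there is no ignore_datasetid escape hatch:
# passing metadata that already contains 'package_id' raises HDXError here.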
def add_update_resource(self, resource):
    # type: (Any) -> None
    """Add new or update existing resource in dataset with new metadata

    Args:
        resource (Any): Resource metadata either from a Resource object or a dictionary

    Returns:
        None
    """
    if isinstance(resource, dict):
        resource = Resource(resource)
    if isinstance(resource, Resource):
        if 'package_id' in resource:
            raise HDXError("Resource %s being added already has a dataset id!" % (resource['name']))
        resource_updated = self._addupdate_hdxobject(self.resources, 'name', resource)
        resource_updated.set_file_to_upload(resource.get_file_to_upload())
        return
    raise HDXError("Type %s cannot be added as a resource!" % type(resource).__name__)
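# Sketch of the behaviour this variant adds over the one above: the local file
# set on the incoming Resource is carried over to the stored copy, so a later
# create_in_hdx/update_in_hdx uploads it. (Assumes an existing Dataset; the
# file path is illustrative.)
resource = Resource({'name': 'MyResource1', 'format': 'csv',
                     'description': 'example resource'})
resource.set_file_to_upload('data/mydata.csv')
dataset.add_update_resource(resource)
# the merged resource now remembers the file to upload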
def generate_dataset_and_showcase(downloader, countrydata, endpoints_metadata, folder,
                                  merge_resources=True, single_dataset=False,
                                  split_to_resources_by_column="STAT_UNIT",
                                  remove_useless_columns=True):
    """
    https://api.uis.unesco.org/sdmx/data/UNESCO,DEM_ECO/....AU.?format=csv-:-tab-true-y&locale=en&subscription-key=...

    :param downloader: Downloader object
    :param countrydata: Country datastructure from UNESCO API
    :param endpoints_metadata: Endpoint datastructure from UNESCO API
    :param folder: temporary folder
    :param merge_resources: if true, merge resources for all time periods
    :param single_dataset: if true, put all endpoints into a single dataset
    :param split_to_resources_by_column: split data into multiple resources (csv) based on a value in the specified column
    :param remove_useless_columns: if true, drop columns that carry no information
    :return: generator yielding (dataset, showcase) tuples. It may yield None, None.
    """
    countryiso2 = countrydata['id']
    countryname = countrydata['names'][0]['value']
    logger.info("Processing %s" % countryname)
    if countryname[:4] in ['WB: ', 'SDG:', 'MDG:', 'UIS:', 'EFA:'] or \
            countryname[:5] in ['GEMR:', 'AIMS:'] or \
            countryname[:7] in ['UNICEF:', 'UNESCO:']:
        logger.info('Ignoring %s!' % countryname)
        yield None, None
        return
    countryiso3 = Country.get_iso3_from_iso2(countryiso2)
    if countryiso3 is None:
        countryiso3, _ = Country.get_iso3_country_code_fuzzy(countryname)
        if countryiso3 is None:
            logger.exception('Cannot get iso3 code for %s!' % countryname)
            yield None, None
            return
        logger.info('Matched %s to %s!' % (countryname, countryiso3))
    earliest_year = 10000
    latest_year = 0
    if single_dataset:
        name = 'UNESCO indicators - %s' % countryname
        dataset, showcase = create_dataset_showcase(name, countryname, countryiso2, countryiso3,
                                                    single_dataset=single_dataset)
        if dataset is None:
            return
    for endpoint in sorted(endpoints_metadata):
        time.sleep(0.2)
        indicator, structure_url, more_info_url, dimensions = endpoints_metadata[endpoint]
        structure_url = structure_url % countryiso2
        response = load_safely(downloader, '%s%s' % (structure_url, dataurl_suffix))
        json = response.json()
        if not single_dataset:
            name = 'UNESCO %s - %s' % (json["structure"]["name"], countryname)
            dataset, showcase = create_dataset_showcase(name, countryname, countryiso2, countryiso3,
                                                        single_dataset=single_dataset)
            if dataset is None:
                continue
        observations = json['structure']['dimensions']['observation']
        time_periods = dict()
        for observation in observations:
            if observation['id'] == 'TIME_PERIOD':
                for value in observation['values']:
                    time_periods[int(value['id'])] = value['actualObs']
        if len(time_periods) == 0:
            logger.warning('No time periods for endpoint %s for country %s!' % (indicator, countryname))
            continue
        earliest_year = min(earliest_year, *time_periods.keys())
        latest_year = max(latest_year, *time_periods.keys())
        csv_url = '%sformat=csv' % structure_url
        description = more_info_url
        if description != ' ':
            description = '[Info on %s](%s)' % (indicator, description)
        description = 'To save, right click download button & click Save Link/Target As \n%s' % description
        df = None
        for start_year, end_year in chunk_years(time_periods):
            if merge_resources:
                df1 = download_df(downloader, csv_url, start_year, end_year)
                if df1 is not None:
                    df = df1 if df is None else df.append(df1)
            else:
                url_years = '&startPeriod=%d&endPeriod=%d' % (start_year, end_year)
                resource = {
                    'name': '%s (%d-%d)' % (indicator, start_year, end_year),
                    'description': description,
                    'format': 'csv',
                    'url': downloader.get_full_url('%s%s' % (csv_url, url_years))
                }
                dataset.add_update_resource(resource)
        if df is not None:
            stat = {x["id"]: x["name"]
                    for d in dimensions if d["id"] == "STAT_UNIT"
                    for x in d["values"]}
            for value, df_part in split_df_by_column(process_df(df), split_to_resources_by_column):
                file_csv = join(folder, ("UNESCO_%s_%s.csv" % (
                    countryiso3,
                    endpoint + ("" if value is None else "_" + value))).replace(" ", "-").replace(
                    ":", "-").replace("/", "-").replace(",", "-").replace("(", "-").replace(")", "-"))
                if remove_useless_columns:
                    df_part = remove_useless_columns_from_df(df_part)
                df_part["country-iso3"] = countryiso3
                df_part.iloc[0, df_part.columns.get_loc("country-iso3")] = "#country+iso3"
                df_part["Indicator name"] = value
                df_part.iloc[0, df_part.columns.get_loc("Indicator name")] = "#indicator+name"
                df_part = postprocess_df(df_part)
                df_part.to_csv(file_csv, index=False)
                description_part = stat.get(value, 'Info on %s%s' % (
                    "" if value is None else value + " in ", indicator))
                resource = Resource({'name': value, 'description': description_part})
                resource.set_file_type('csv')
                resource.set_file_to_upload(file_csv)
                dataset.add_update_resource(resource)
        if not single_dataset:
            if dataset is None or len(dataset.get_resources()) == 0:
                logger.error('No resources created for country %s, %s!' % (countryname, endpoint))
            else:
                dataset.set_dataset_year_range(min(time_periods.keys()), max(time_periods.keys()))
                yield dataset, showcase
    if single_dataset:
        if dataset is None or len(dataset.get_resources()) == 0:
            logger.error('No resources created for country %s!' % (countryname))
        else:
            dataset.set_dataset_year_range(earliest_year, latest_year)
            yield dataset, showcase
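# Hedged usage sketch for the UNESCO generator above (assumes downloader,
# countrydata, endpoints_metadata and a temporary folder are prepared by the
# caller; the create/update calls are standard hdx-python-api methods):
for dataset, showcase in generate_dataset_and_showcase(
        downloader, countrydata, endpoints_metadata, folder,
        merge_resources=True, single_dataset=False):
    if dataset is None:
        continue
    dataset.update_from_yaml()
    dataset.create_in_hdx()
    if showcase is not None:
        showcase.create_in_hdx()
        showcase.add_dataset(dataset)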
def test_create_in_hdx(self, configuration, post_create):
    dataset = Dataset()
    with pytest.raises(HDXError):
        dataset.create_in_hdx()
    dataset['id'] = 'TEST1'
    dataset['name'] = 'LALA'
    with pytest.raises(HDXError):
        dataset.create_in_hdx()
    dataset_data = copy.deepcopy(TestDataset.dataset_data)
    dataset = Dataset(dataset_data)
    dataset.create_in_hdx()
    assert dataset['id'] == '6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d'
    assert len(dataset.resources) == 2
    assert len(dataset.gallery) == 0
    dataset_data['name'] = 'MyDataset2'
    dataset = Dataset(dataset_data)
    with pytest.raises(HDXError):
        dataset.create_in_hdx()
    dataset_data['name'] = 'MyDataset3'
    dataset = Dataset(dataset_data)
    with pytest.raises(HDXError):
        dataset.create_in_hdx()
    dataset_data = copy.deepcopy(TestDataset.dataset_data)
    gallery_data = copy.deepcopy(TestDataset.gallery_data)
    dataset_data['gallery'] = gallery_data
    with pytest.raises(HDXError):
        dataset = Dataset(dataset_data)
    del dataset_data['gallery']
    dataset = Dataset(dataset_data)
    del gallery_data[0]['id']
    dataset.add_update_gallery(gallery_data)
    dataset.create_in_hdx()
    assert dataset['id'] == '6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d'
    assert len(dataset.resources) == 2
    assert len(dataset.gallery) == 1
    dataset_data = copy.deepcopy(TestDataset.dataset_data)
    resources_data = copy.deepcopy(TestDataset.resources_data)
    dataset_data['resources'] = resources_data
    with pytest.raises(HDXError):
        dataset = Dataset(dataset_data)
    del dataset_data['resources']
    dataset = Dataset(dataset_data)
    del resources_data[0]['id']
    del resources_data[1]['id']
    dataset.add_update_resources(resources_data)
    dataset.create_in_hdx()
    assert dataset['id'] == '6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d'
    assert len(dataset.resources) == 2
    assert len(dataset.gallery) == 0
    dataset_data = copy.deepcopy(TestDataset.dataset_data)
    dataset = Dataset(dataset_data)
    resource = Resource(resources_data[0])
    file = tempfile.NamedTemporaryFile(delete=False)
    resource.set_file_to_upload(file.name)
    dataset.add_update_resource(resource)
    dataset.create_in_hdx()
    os.unlink(file.name)
    assert len(dataset.resources) == 2
    assert len(dataset.gallery) == 0
def test_check_required_fields(self, configuration):
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource = Resource(resource_data)
    resource.check_url_filetoupload()
    resource.check_required_fields()
def generate_dataset_resources_and_showcase(pop_types, today):
    title = 'Energy consumption of refugees and displaced people'
    slugified_name = slugify(title.lower())
    dataset = Dataset({
        'name': slugified_name,
        'title': title,
    })
    dataset.set_maintainer('196196be-6037-4488-8b71-d786adf4c081')
    dataset.set_organization('0c6bf79f-504c-4ba5-9fdf-c8cc893c8b2f')
    dataset.set_dataset_date_from_datetime(today)
    dataset.set_expected_update_frequency('Every month')
    dataset.add_other_location('world')
    tags = ['HXL', 'energy', 'refugees', 'internally displaced persons - idp']
    dataset.add_tags(tags)
    resources = list()
    for pop_type in pop_types:
        resource_data = {
            'name': '%s_consumption.csv' % pop_type.lower().replace(' ', '_'),
            'description': '%s %s' % (pop_type, title.lower()),
            'format': 'csv'
        }
        resources.append(Resource(resource_data))
    resource_data = {
        'name': 'population.csv',
        'description': 'UNHCR displaced population totals',
        'format': 'csv'
    }
    resources.append(Resource(resource_data))
    resource_data = {
        'name': 'keyfigures_disagg.csv',
        'description': 'Disaggregated MEI Key Figures',
        'format': 'csv'
    }
    resources.append(Resource(resource_data))
    resource_data = {
        'name': 'keyfigures.csv',
        'description': 'MEI Key Figures',
        'format': 'csv'
    }
    resources.append(Resource(resource_data))
    showcase = Showcase({
        'name': '%s-showcase' % slugified_name,
        'title': 'Energy services for refugees and displaced people',
        'notes': 'Click the image on the right to go to the energy services model',
        'url': 'http://www.sciencedirect.com/science/article/pii/S2211467X16300396',
        'image_url': 'https://ars.els-cdn.com/content/image/X2211467X.jpg'
    })
    showcase.add_tags(tags)
    return dataset, resources, showcase
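# Note on the design above: the resources are returned without files attached,
# so the caller is expected to generate the CSVs, attach them, and then create
# the dataset. A hedged sketch of that wiring (the folder variable and the
# assumption that file names match resource names are illustrative):
dataset, resources, showcase = generate_dataset_resources_and_showcase(
    ['Refugees', 'IDPs'], datetime.now())
for resource in resources:
    resource.set_file_to_upload(join(folder, resource['name']))
    dataset.add_update_resource(resource)
dataset.create_in_hdx()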
def test_patch(self, configuration, post_patch):
    resource = Resource()
    resource['id'] = '74b74ae1-df0c-4716-829f-4f939a046811'
    resource.update_in_hdx(operation='patch', batch_mode='KEEP_OLD', skip_validation=True)
    assert resource['id'] == 'de6549d8-268b-4dfe-adaf-a4ae5c8510d5'
def generate_dataset(self, metadata):
    title = metadata["DatasetTitle"]
    is_requestdata_type = metadata["is_requestdata_type"]
    if not is_requestdata_type:
        if metadata["Total"] == 0:
            self.errors.add(f"Ignoring dataset: {title} which has no resources!")
            return None, None
    if not metadata["Source"]:
        self.errors.add(f"Dataset: {title} has no source!")
    logger.info(f"Creating dataset: {title}")
    cod_level = "cod-standard"
    if metadata["is_enhanced_cod"]:
        cod_level = "cod-enhanced"
    theme = metadata["Theme"]
    if not theme:
        self.errors.add(f"Dataset: {title} has no theme!")
    location = metadata["Location"]
    if theme == "COD_AB" and (location == ["MMR"] or location == ["mmr"]):
        name = slugify(title)
    else:
        name = slugify(f"{theme} {' '.join(location)}")
    dataset = Dataset({
        "name": name[:99],
        "title": title,
        "notes": metadata["DatasetDescription"],
        "dataset_source": metadata["Source"],
        "methodology": metadata["Methodology"],
        "methodology_other": metadata["Methodology_Other"],
        "license_id": metadata["License"],
        "license_other": metadata["License_Other"],
        "caveats": metadata["Caveats"],
        "data_update_frequency": metadata["FrequencyUpdates"],
        "cod_level": cod_level,
    })
    licence = metadata["License"]
    if licence == "Other":
        dataset["license_id"] = "hdx-other"
        dataset["license_other"] = metadata["License_Other"]
    else:
        dataset["license_id"] = licence
    methodology = metadata["Methodology"]
    if methodology == "Other":
        dataset["methodology"] = "Other"
        methodology_other = metadata["Methodology_Other"]
        if not methodology_other:
            self.errors.add(f"Dataset: {title} has no methodology!")
        if methodology_other:
            dataset["methodology_other"] = methodology_other
    else:
        dataset["methodology"] = methodology
    dataset.set_maintainer("196196be-6037-4488-8b71-d786adf4c081")
    organization = Organization.autocomplete(metadata["Contributor"])
    if len(organization) == 0:
        organization = Organization.autocomplete(metadata["Contributor"].replace(" ", "-"))
    organization_id = None
    batch = None
    try:
        organization_id = organization[0]["id"]
    except IndexError:
        self.errors.add(f"Dataset: {title} has an invalid organization {metadata['Contributor']}!")
    if organization_id:
        dataset.set_organization(organization_id)
        batch = self.batches_by_org.get(organization_id, get_uuid())
        self.batches_by_org[organization_id] = batch
    dataset.set_subnational(True)
    try:
        dataset.add_country_locations(location)
    except HDXError:
        self.errors.add(f"Dataset: {title} has an invalid location {location}!")
    dataset.add_tags(metadata["Tags"])
    if len(dataset.get_tags()) < len(metadata["Tags"]):
        self.errors.add(f"Dataset: {title} has invalid tags!")
    if "common operational dataset - cod" not in dataset.get_tags():
        dataset.add_tag("common operational dataset - cod")
    if is_requestdata_type:
        dataset["dataset_date"] = metadata["DatasetDate"]
        dataset["is_requestdata_type"] = True
        dataset["file_types"] = metadata["file_types"]
        dataset["field_names"] = metadata["field_names"]
        num_of_rows = metadata.get("num_of_rows")
        if num_of_rows:
            dataset["num_of_rows"] = num_of_rows
    else:
        startdate = None
        enddate = None
        ongoing = False
        resources = list()
        for resource_metadata in metadata["Resources"]:
            resource_daterange = resource_metadata["daterange_for_data"]
            format = resource_metadata["Format"]
            if format == "VectorTile":
                format = "MBTiles"
                logger.error(f"Dataset: {title} is using file type VectorTile instead of MBTiles")
            resourcedata = {
                "name": resource_metadata["ResourceItemTitle"],
                "description": resource_metadata["ResourceItemDescription"],
                "url": resource_metadata["DownloadURL"],
                "format": format,
                "daterange_for_data": resource_daterange,
                "grouping": resource_metadata["Version"],
            }
            date_info = DateHelper.get_date_info(resource_daterange)
            resource_startdate = date_info["startdate"]
            resource_enddate = date_info["enddate"]
            resource_ongoing = date_info["ongoing"]
            if startdate is None or resource_startdate < startdate:
                startdate = resource_startdate
            if enddate is None or resource_enddate > enddate:
                enddate = resource_enddate
                ongoing = resource_ongoing
            resource = Resource(resourcedata)
            resources.append(resource)
        if ongoing:
            enddate = "*"
        try:
            dataset.add_update_resources(resources)
        except HDXError as ex:
            self.errors.add(f"Dataset: {title} resources could not be added. Error: {ex}")
        dataset.set_date_of_dataset(startdate, enddate)
    if len(self.errors.errors) > 0:
        return None, None
    else:
        return dataset, batch
def test_get_dataset(self, configuration, post_dataset):
    resource_data = copy.deepcopy(TestResource.resource_data)
    resource = Resource(resource_data)
    dataset = resource.get_dataset()
    assert dataset['id'] == '6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d'
def generate_dataset_and_showcase(wfpfood_url, downloader, folder, countrydata, shortcuts):
    """Generate dataset and showcase for a single country.
    """
    title = '%s - Food Prices' % countrydata['name']
    logger.info('Creating dataset: %s' % title)
    # Example name which should be unique so can include organisation name and country
    name = 'WFP food prices for %s' % countrydata['name']
    slugified_name = slugify(name).lower()
    df = read_dataframe(wfpfood_url, downloader, countrydata)
    if len(df) <= 1:
        logger.warning('Dataset "%s" is empty' % title)
        return None, None
    dataset = Dataset({
        'name': slugified_name,
        'title': title,
        "dataset_preview": "resource_id"
    })
    dataset.set_maintainer("9957c0e9-cd38-40f1-900b-22c91276154b")  # Orest Dubay
    # dataset.set_maintainer("154de241-38d6-47d3-a77f-0a9848a61df3")
    dataset.set_organization("3ecac442-7fed-448d-8f78-b385ef6f84e7")
    dataset.set_dataset_date(df.loc[1:].date.min(), df.loc[1:].date.max(), "%Y-%m-%d")
    dataset.set_expected_update_frequency("weekly")
    dataset.add_country_location(countrydata["name"])
    dataset.set_subnational(True)
    dataset.add_tags(tags)
    dataset.add_tag('hxl')
    file_csv = join(folder, "WFP_food_prices_%s.csv" % countrydata["name"].replace(" ", "-"))
    df.to_csv(file_csv, index=False)
    resource = Resource({
        'name': title,
        "dataset_preview_enabled": "False",
        'description': "Food prices data with HXL tags"
    })
    resource.set_file_type('csv')
    resource.set_file_to_upload(file_csv)
    dataset.add_update_resource(resource)
    df1 = quickchart_dataframe(df, shortcuts)
    file_csv = join(folder, "WFP_food_median_prices_%s.csv" % countrydata["name"].replace(" ", "-"))
    df1.to_csv(file_csv, index=False)
    resource = Resource({
        'name': '%s - Food Median Prices' % countrydata['name'],
        "dataset_preview_enabled": "True",
        'description': """Food median prices data with HXL tags.

Median of all prices for a given commodity observed on different markets is shown, together with the market where it was observed.
Data are shortened in multiple ways:

- Rather than prices on all markets, only median price across all markets is shown, together with the market where it has been observed.
- Only food commodities are displayed (non-food commodities like fuel and wages are not shown).
- Only data after %s are shown. Missing data are interpolated.
- Column with shorter commodity names "cmnshort" are available to be used as chart labels.
- Units are adapted and prices are rescaled in order to yield comparable values (so that they can be displayed and compared in a single chart). Scaling factor is present in scaling column. Label with full commodity name and a unit (with scale if applicable) is in column "label".

This reduces the amount of data and allows to make cleaner charts.
""" % (df1.loc[1:].date.min())
    })
    resource.set_file_type('csv')
    resource.set_file_to_upload(file_csv)
    dataset.add_update_resource(resource)
    showcase = Showcase({
        'name': '%s-showcase' % slugified_name,
        'title': title + " showcase",
        'notes': countrydata["name"] + " food prices data from World Food Programme displayed through VAM Economic Explorer",
        'url': "http://dataviz.vam.wfp.org/economic_explorer/prices?adm0=" + countrydata["code"],
        'image_url': "http://dataviz.vam.wfp.org/_images/home/economic_2-4.jpg"
    })
    showcase.add_tags(tags)
    return dataset, showcase
def generate_datasets_and_showcases(downloader, folder, indicatorname, indicatortypedata,
                                    countriesdata, showcase_base_url):
    dataset_template = Dataset()
    dataset_template.set_maintainer('196196be-6037-4488-8b71-d786adf4c081')
    dataset_template.set_organization('ed727a5b-3e6e-4cd6-b97e-4a71532085e6')
    dataset_template.set_expected_update_frequency('Every year')
    dataset_template.set_subnational(False)
    tags = ['hxl', indicatorname.lower()]
    dataset_template.add_tags(tags)

    earliest_year = 10000
    latest_year = 0
    countrycode = None
    iso3 = None
    countryname = None
    rows = None
    datasets = list()
    showcases = list()

    def output_csv():
        # write the accumulated rows for the current country and attach the
        # file to the most recently created dataset
        if rows is None:
            return
        headers = deepcopy(downloader.response.headers)
        for i, header in enumerate(headers):
            if 'year' in header.lower():
                headers.insert(i, 'EndYear')
                headers.insert(i, 'StartYear')
                break
        headers.insert(0, 'Iso3')
        hxlrow = dict()
        for header in headers:
            hxlrow[header] = hxltags.get(header, '')
        rows.insert(0, hxlrow)
        filepath = join(folder, '%s_%s.csv' % (indicatorname, countrycode))
        write_list_to_csv(rows, filepath, headers=headers)
        ds = datasets[-1]
        ds.set_dataset_year_range(earliest_year, latest_year)
        ds.resources[0].set_file_to_upload(filepath)

    for row in downloader.get_tabular_rows(indicatortypedata['FileLocation'], dict_rows=True,
                                           headers=1, format='csv', encoding='WINDOWS-1252'):
        newcountry = row['Area Code']
        if newcountry != countrycode:
            output_csv()
            rows = None
            countrycode = newcountry
            result = countriesdata.get(countrycode)
            if result is None:
                logger.warning('Ignoring %s' % countrycode)
                continue
            iso3, cn = result
            countryname = Country.get_country_name_from_iso3(iso3)
            if countryname is None:
                logger.error('Missing country %s: %s, %s' % (countrycode, cn, iso3))
                continue
            rows = list()
            title = '%s - %s Indicators' % (countryname, indicatorname)
            logger.info('Generating dataset: %s' % title)
            name = 'FAOSTAT %s indicators for %s' % (countryname, indicatorname)
            slugified_name = slugify(name).lower()
            dataset = Dataset(deepcopy(dataset_template.data))
            dataset['name'] = slugified_name
            dataset['title'] = title
            dataset.update_from_yaml()
            dataset.add_country_location(countryname)
            earliest_year = 10000
            latest_year = 0
            resource = Resource({'name': title, 'description': ''})
            resource.set_file_type('csv')
            dataset.add_update_resource(resource)
            datasets.append(dataset)
            showcase = Showcase({
                'name': '%s-showcase' % slugified_name,
                'title': title,
                'notes': dataset['notes'],
                'url': '%s%s' % (showcase_base_url, countrycode),
                'image_url': 'http://www.fao.org/uploads/pics/food-agriculture.png'
            })
            showcase.add_tags(tags)
            showcases.append(showcase)
        row['Iso3'] = iso3
        row['Area'] = countryname
        year = row['Year']
        if '-' in year:
            years = year.split('-')
            row['StartYear'] = years[0]
            row['EndYear'] = years[1]
        else:
            years = [year]
            row['StartYear'] = year
            row['EndYear'] = year
        for year in years:
            year = int(year)
            if year < earliest_year:
                earliest_year = year
            if year > latest_year:
                latest_year = year
        if rows is not None:
            rows.append(row)
    output_csv()
    return datasets, showcases
def generate_joint_dataset_and_showcase(wfpfood_url, downloader, folder, countriesdata):
    """Generate a single joint dataset and showcase containing data for all countries.
    """
    title = 'Global Food Prices Database (WFP)'
    logger.info('Creating joint dataset: %s' % title)
    slugified_name = 'wfp-food-prices'
    df = joint_dataframe(wfpfood_url, downloader, countriesdata)
    if len(df) <= 1:
        logger.warning('Dataset "%s" is empty' % title)
        return None, None
    dataset = Dataset({'name': slugified_name, 'title': title})
    dataset.set_maintainer("9957c0e9-cd38-40f1-900b-22c91276154b")  # Orest Dubay
    # dataset.set_maintainer("154de241-38d6-47d3-a77f-0a9848a61df3")
    dataset.set_organization("3ecac442-7fed-448d-8f78-b385ef6f84e7")
    maxmonth = (100 * df.mp_year + df.mp_month).max() % 100
    dataset.set_dataset_date("%04d-01-01" % df.mp_year.min(),
                             "%04d-%02d-15" % (df.mp_year.max(), maxmonth), "%Y-%m-%d")
    dataset.set_expected_update_frequency("weekly")
    dataset.add_country_locations(sorted(df.adm0_name.unique()))
    dataset.add_tags(tags)
    file_csv = join(folder, "WFPVAM_FoodPrices.csv")
    df.to_csv(file_csv, index=False)
    resource = Resource({
        'name': title,
        'description': "World Food Programme – Food Prices Data Source: WFP Vulnerability Analysis and Mapping (VAM)."
    })
    resource.set_file_type('csv')
    resource.set_file_to_upload(file_csv)
    dataset.add_update_resource(resource)
    showcase = Showcase({
        'name': '%s-showcase' % slugified_name,
        'title': 'Global Food Prices',
        'notes': "Interactive data visualisation of WFP's Food Market Prices dataset",
        'url': "https://data.humdata.org/organization/wfp#interactive-data",
        'image_url': "https://docs.humdata.org/wp-content/uploads/wfp_food_prices_data_viz.gif"
    })
    showcase.add_tags(tags)
    dataset.update_from_yaml()
    dataset['notes'] = dataset['notes'] % 'Global Food Prices data from the World Food Programme covering'
    dataset.create_in_hdx()
    showcase.create_in_hdx()
    showcase.add_dataset(dataset)
    dataset.get_resource().create_datastore_from_yaml_schema(yaml_path="wfp_food_prices.yml", path=file_csv)
    logger.info('Finished joint dataset')
    return dataset, showcase
def test_touch(self, configuration, post_patch):
    resource = Resource()
    resource['id'] = 'TEST1'
    resource.touch()
    assert resource['id'] == 'TEST1'