def main(): """Generate dataset and create it in HDX""" filelist_url = Configuration.read()["filelist_url"] countrygroup_url = Configuration.read()["countrygroup_url"] indicatorsetnames = Configuration.read()["indicatorsetnames"] showcase_base_url = Configuration.read()["showcase_base_url"] with Download() as downloader: with wheretostart_tempdir_batch(lookup) as info: folder = info["folder"] batch = info["batch"] indicatorsets = download_indicatorsets(filelist_url, indicatorsetnames, downloader, folder) logger.info( f"Number of indicator types to upload: {len(indicatorsetnames)}" ) countries, countrymapping = get_countries(countrygroup_url, downloader) logger.info(f"Number of countries to upload: {len(countries)}") for info, country in progress_storing_folder( info, countries, "iso3"): for indicatorsetname in indicatorsets: ( dataset, showcase, bites_disabled, qc_indicators, ) = generate_dataset_and_showcase( indicatorsetname, indicatorsets, country, countrymapping, showcase_base_url, filelist_url, downloader, info["folder"], ) if dataset: dataset.update_from_yaml() dataset.generate_resource_view( -1, bites_disabled=bites_disabled, indicators=qc_indicators) dataset.create_in_hdx( remove_additional_resources=True, hxl_update=False, updated_by_script="HDX Scraper: FAOStat", batch=batch, ) showcase.create_in_hdx() showcase.add_dataset(dataset)
def configuration(self):
    Configuration._create(
        hdx_read_only=True,
        user_agent="test",
        project_config_yaml=join("tests", "config", "project_configuration.yml"),
    )
    Locations.set_validlocations(
        [{"name": "afg", "title": "Afghanistan"}]
    )  # add locations used in tests
    Country.countriesdata(use_live=False)
    Vocabulary._tags_dict = True
    Vocabulary._approved_vocabulary = {
        "tags": [
            {"name": "hxl"},
            {"name": "food security"},
            {"name": "indicators"},
        ],
        "id": "4e61d464-4943-4e97-973a-84673c1aaa87",
        "name": "approved",
    }
    return Configuration.read()
def main(
    output_dir,
    saved_dir,
    save,
    use_saved,
    dportal_params,
    whattorun,
    filterdate,
    **ignore,
):
    logger.info(f"##### hdx-scraper-iati-viz version {VERSION:.1f} ####")
    configuration = Configuration.read()
    output_dir = f"{output_dir}_{whattorun}"
    rmtree(output_dir, ignore_errors=True)
    mkdir(output_dir)
    with Download() as downloader:
        retriever = Retrieve(
            downloader,
            configuration["fallback_dir"],
            f"{saved_dir}_{whattorun}",
            output_dir,
            save,
            use_saved,
        )
        today = datetime.utcnow().isoformat()
        start(
            configuration,
            today,
            retriever,
            output_dir,
            dportal_params,
            whattorun,
            filterdate,
        )
def configuration(self):
    Configuration._create(
        user_agent="test",
        hdx_key="12345",
        project_config_yaml=join("tests", "config", "project_configuration.yml"),
    )
    Locations.set_validlocations([{"name": "bgd", "title": "Bangladesh"}])
    Country.countriesdata(use_live=False)
    Vocabulary._tags_dict = True
    Vocabulary._approved_vocabulary = {
        "tags": [
            {"name": "hxl"},
            {"name": "refugees"},
            {"name": "asylum"},
            {"name": "population"},
        ],
        "id": "4e61d464-4943-4e97-973a-84673c1aaa87",
        "name": "approved",
    }
    return Configuration.read()
def configuration():
    project_config_yaml = join("tests", "fixtures", "project_configuration.yml")
    Configuration._create(
        hdx_site="prod",
        user_agent="test",
        hdx_read_only=True,
        project_config_yaml=project_config_yaml,
    )
    return Configuration.read()
def configuration(self):
    Configuration._create(
        hdx_read_only=True,
        hdx_site="prod",
        user_agent="test",
        project_config_yaml=join("tests", "config", "project_configuration.yml"),
    )
    Locations.set_validlocations(
        [
            {"name": "afg", "title": "Afghanistan"},
            {"name": "pse", "title": "State of Palestine"},
        ]
    )
    return Configuration.read()
def configuration(self):
    Configuration._create(
        user_agent="test",
        hdx_key="12345",
        project_config_yaml=join("tests", "config", "project_configuration.yml"),
    )
    Locations.set_validlocations(
        [
            {"name": "afg", "title": "Afghanistan"},
            {"name": "phl", "title": "Philippines"},
        ]
    )
    Country.countriesdata(use_live=False)
    return Configuration.read()
def main(output_failures=False, **ignore):
    configuration = Configuration.read()
    with Download() as downloader:
        dataset_ids = get_dataset_ids(configuration, downloader)
        logger.info(f"Number of datasets to upload: {len(dataset_ids)}")
        for info, dataset_id in progress_storing_tempdir(
            "UNHCR-MICRODATA", dataset_ids, "id"
        ):
            dataset = generate_dataset(
                dataset_id["id"], configuration, downloader, output_failures
            )
            if dataset:
                dataset.update_from_yaml()
                dataset.create_in_hdx(
                    remove_additional_resources=True,
                    hxl_update=False,
                    updated_by_script="HDX Scraper: UNHCR microdata",
                    batch=info["batch"],
                )
    # failures is a module-level list populated during dataset generation
    for failure in failures:
        logger.error(failure)
def configuration(self):
    Configuration._create(
        user_agent="test",
        hdx_key="12345",
        project_config_yaml=join("tests", "config", "project_configuration.yml"),
    )
    Locations.set_validlocations(
        [
            {"name": "afg", "title": "Afghanistan"},
            {"name": "phl", "title": "Philippines"},
        ]
    )
    Country.countriesdata(use_live=False)
    Vocabulary._tags_dict = True
    Vocabulary._approved_vocabulary = {
        "tags": [
            {"name": "common operational dataset - cod"},
            {"name": "administrative divisions"},
            {"name": "geodata"},
            {"name": "gazetteer"},
        ],
        "id": "4e61d464-4943-4e97-973a-84673c1aaa87",
        "name": "approved",
    }
    return Configuration.read()
def main(): """Generate dataset and create it in HDX""" configuration = Configuration.read() with ErrorsOnExit() as errors: with Download() as downloader: cod = COD(downloader, errors) datasets_metadata = cod.get_datasets_metadata(configuration["url"]) logger.info(f"Number of datasets to upload: {len(datasets_metadata)}") for metadata in datasets_metadata: dataset, batch = cod.generate_dataset(metadata) if dataset: dataset.update_from_yaml() try: dataset.create_in_hdx( remove_additional_resources=True, hxl_update=False, updated_by_script="HDX Scraper: CODS", batch=batch, ignore_fields=["num_of_rows", "resource:description"], ) except HDXError as ex: errors.add(f"Dataset: {metadata['DatasetTitle']}, error: {ex}")
def main(
    db_url: Optional[str] = None,
    db_params: Optional[str] = None,
    gsheet_auth: Optional[str] = None,
    email_server: Optional[str] = None,
    failure_emails: Optional[str] = None,
    sysadmin_emails: Optional[str] = None,
    email_test: Optional[str] = None,
    spreadsheet_test: bool = False,
    no_spreadsheet: bool = False,
    **ignore,
) -> None:
    """Run freshness emailer. Either a database connection string (db_url) or
    database connection parameters (db_params) can be supplied. If neither is
    supplied, a local SQLite database with filename "freshness.db" is assumed.

    An optional email server can be supplied in the form:
    connection type (eg. ssl),host,port,username,password,sender email
    If not supplied, no emails will be sent.

    An optional authorisation string for Google Sheets can be supplied of the form:
    {"type": "service_account", "project_id": "hdx-bot", "private_key_id": ...
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",...}

    failure_emails is a list of email addresses for the people who should be
    emailed in the event of a freshness failure. sysadmin_emails is a list of
    email addresses of HDX system administrators who are emailed with summaries
    of maintainers contacted, datasets that have become delinquent, invalid
    maintainers and org admins etc.

    Args:
        db_url (Optional[str]): Database connection string. Defaults to None.
        db_params (Optional[str]): Database connection parameters. Defaults to None.
        gsheet_auth (Optional[str]): Google Sheets authorisation. Defaults to None.
        email_server (Optional[str]): Email server to use. Defaults to None.
        failure_emails (Optional[str]): Email addresses. Defaults to None.
        sysadmin_emails (Optional[str]): Email addresses. Defaults to None.
        email_test (Optional[str]): Only email test users. Defaults to None.
        spreadsheet_test (bool): Output to test Google spreadsheet. Defaults to False.
        no_spreadsheet (bool): Don't output to Google spreadsheet. Defaults to False.

    Returns:
        None
    """
    logger.info(f"> Data freshness emailer {__version__}")
    configuration = Configuration.read()
    if email_server:  # Get email server details
        email_config = email_server.split(",")
        email_config_dict = {
            "connection_type": email_config[0],
            "host": email_config[1],
            "port": int(email_config[2]),
            "username": email_config[3],
            "password": email_config[4],
        }
        if len(email_config) > 5:
            email_config_dict["sender"] = email_config[5]
        configuration.setup_emailer(email_config_dict=email_config_dict)
        logger.info(f"> Email host: {email_config[1]}")
        send_emails = configuration.emailer().send
    else:
        logger.info("> No email host!")
        send_emails = None
    if db_params:  # Get freshness database server details
        params = args_to_dict(db_params)
    elif db_url:
        params = Database.get_params_from_sqlalchemy_url(db_url)
    else:
        params = {"driver": "sqlite", "database": "freshness.db"}
    if sysadmin_emails:
        sysadmin_emails = sysadmin_emails.split(",")
    logger.info(f"> Database parameters: {params}")
    with Database(**params) as session:
        now = datetime.datetime.utcnow()
        email = Email(
            now,
            sysadmin_emails=sysadmin_emails,
            send_emails=send_emails,
        )
        sheet = Sheet(now)
        if failure_emails:
            failure_emails = failure_emails.split(",")
        else:
            failure_emails = list()
        error = sheet.setup_gsheet(
            configuration, gsheet_auth, spreadsheet_test, no_spreadsheet
        )
        if error:
            email.htmlify_send(failure_emails, "Error opening Google sheets!", error)
        else:
            error = sheet.setup_input()
            if error:
                email.htmlify_send(
                    failure_emails,
                    "Error reading DP duty roster or data grid curation sheet!",
                    error,
                )
            else:
                hdxhelper = HDXHelper(site_url=configuration.get_hdx_site_url())
                databasequeries = DatabaseQueries(
                    session=session, now=now, hdxhelper=hdxhelper
                )
                freshness = DataFreshnessStatus(
                    databasequeries=databasequeries,
                    email=email,
                    sheet=sheet,
                )
                # Check number of datasets hasn't dropped
                if not freshness.check_number_datasets(
                    now, send_failures=failure_emails
                ):
                    if email_test:  # send just to test users
                        test_users = [failure_emails[0]]
                        freshness.process_broken(recipients=test_users)
                        freshness.process_overdue(
                            recipients=test_users, sysadmins=test_users
                        )
                        freshness.process_delinquent(recipients=test_users)
                        freshness.process_maintainer_orgadmins(recipients=test_users)
                        freshness.process_datasets_noresources(recipients=test_users)
                        # freshness.process_datasets_dataset_date(
                        #     recipients=test_users,
                        #     sysadmins=test_users
                        # )
                        freshness.process_datasets_datagrid(recipients=test_users)
                    else:
                        freshness.process_broken()  # Check for broken resources
                        freshness.process_overdue()  # Check for overdue datasets
                        freshness.process_delinquent()  # Check for delinquent datasets
                        # Check for datasets with invalid maintainer and organisations
                        # with invalid administrators
                        freshness.process_maintainer_orgadmins()
                        # Check for datasets with no resources
                        freshness.process_datasets_noresources()
                        # Check for datasets where the dataset date may need updating
                        # freshness.process_datasets_dataset_date(
                        #     sysadmins=test_users
                        # )
                        # Check for candidates for the data grid
                        freshness.process_datasets_datagrid()
    logger.info("Freshness emailer completed!")
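

# Usage sketch (not from the source): one way the freshness emailer main() above
# could be invoked directly, using the parameter formats described in its docstring.
# It assumes Configuration has already been created elsewhere before main() runs.
# The server, credentials and addresses below are hypothetical placeholders;
# db_url/db_params are omitted so the documented default local SQLite database
# "freshness.db" is used.
if __name__ == "__main__":
    main(
        # connection type (eg. ssl),host,port,username,password[,sender email]
        email_server="ssl,smtp.example.org,465,bot,secret,noreply@example.org",
        # comma-separated address lists, split inside main()
        failure_emails="ops@example.org",
        sysadmin_emails="admin1@example.org,admin2@example.org",
    )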