Exemple #1
0
 def configuration(self):
     """Prepare HDX test state and return the active configuration.

     Calls ``Configuration._create`` in read-only mode with the project
     configuration YAML under ``tests/config``, registers Afghanistan as the
     only valid location, loads country data with ``use_live=False`` and
     overwrites the private ``Vocabulary`` attributes with a fixed approved
     vocabulary (hxl, food security, indicators).

     NOTE(review): presumably a pytest fixture whose decorator is outside
     this view - confirm.

     Returns:
         Whatever ``Configuration.read()`` returns after the setup above.
     """
     config_path = join("tests", "config", "project_configuration.yml")
     Configuration._create(
         hdx_read_only=True,
         user_agent="test",
         project_config_yaml=config_path,
     )
     # add locations used in tests
     Locations.set_validlocations([{"name": "afg", "title": "Afghanistan"}])
     Country.countriesdata(use_live=False)
     # Private attributes are assigned directly; presumably this stops the
     # Vocabulary class fetching the approved tags itself - TODO confirm.
     Vocabulary._tags_dict = True
     tag_names = ("hxl", "food security", "indicators")
     Vocabulary._approved_vocabulary = {
         "tags": [{"name": tag_name} for tag_name in tag_names],
         "id": "4e61d464-4943-4e97-973a-84673c1aaa87",
         "name": "approved",
     }
     return Configuration.read()
Exemple #2
0
 def configuration(self):
     """Prepare HDX test state (Bangladesh) and return the configuration.

     Calls ``Configuration._create`` with a dummy ``hdx_key`` and the project
     configuration YAML under ``tests/config``, registers Bangladesh as the
     only valid location, loads country data with ``use_live=False`` and
     overwrites the private ``Vocabulary`` attributes with a fixed approved
     vocabulary (hxl, refugees, asylum, population).

     NOTE(review): presumably a pytest fixture whose decorator is outside
     this view - confirm.

     Returns:
         Whatever ``Configuration.read()`` returns after the setup above.
     """
     config_path = join("tests", "config", "project_configuration.yml")
     Configuration._create(
         user_agent="test",
         hdx_key="12345",
         project_config_yaml=config_path,
     )
     valid_locations = [{"name": "bgd", "title": "Bangladesh"}]
     Locations.set_validlocations(valid_locations)
     Country.countriesdata(use_live=False)
     # Private attributes are assigned directly; presumably this stops the
     # Vocabulary class fetching the approved tags itself - TODO confirm.
     Vocabulary._tags_dict = True
     tag_names = ("hxl", "refugees", "asylum", "population")
     Vocabulary._approved_vocabulary = {
         "tags": [{"name": tag_name} for tag_name in tag_names],
         "id": "4e61d464-4943-4e97-973a-84673c1aaa87",
         "name": "approved",
     }
     return Configuration.read()
Exemple #3
0
def configuration():
    """Create a read-only "prod" test configuration and return it.

    The project configuration YAML is taken from ``tests/fixtures``.

    Returns:
        Whatever ``Configuration.read()`` returns after ``_create`` has run.
    """
    create_kwargs = {
        "hdx_site": "prod",
        "user_agent": "test",
        "hdx_read_only": True,
        "project_config_yaml": join(
            "tests", "fixtures", "project_configuration.yml"
        ),
    }
    Configuration._create(**create_kwargs)
    return Configuration.read()
Exemple #4
0
def main():
    """Generate datasets and showcases per country and create them in HDX.

    Reads ``filelist_url``, ``countrygroup_url``, ``indicatorsetnames`` and
    ``showcase_base_url`` from the configuration, downloads the indicator
    sets and the country list, then for every country / indicator set pair
    calls ``generate_dataset_and_showcase``. When a dataset comes back it is
    updated from YAML, given a resource view, created in HDX and linked to
    its showcase.
    """
    # Read the configuration once instead of once per key, matching the
    # other entry points that keep it in a local.
    configuration = Configuration.read()
    filelist_url = configuration["filelist_url"]
    countrygroup_url = configuration["countrygroup_url"]
    indicatorsetnames = configuration["indicatorsetnames"]
    showcase_base_url = configuration["showcase_base_url"]

    with Download() as downloader:
        with wheretostart_tempdir_batch(lookup) as info:
            folder = info["folder"]
            batch = info["batch"]
            indicatorsets = download_indicatorsets(
                filelist_url, indicatorsetnames, downloader, folder
            )
            logger.info(
                f"Number of indicator types to upload: {len(indicatorsetnames)}"
            )
            countries, countrymapping = get_countries(
                countrygroup_url, downloader
            )
            logger.info(f"Number of countries to upload: {len(countries)}")
            # `info` is deliberately rebound on each iteration: the folder
            # passed to the generator below comes from the yielded value.
            for info, country in progress_storing_folder(
                info, countries, "iso3"
            ):
                for indicatorsetname in indicatorsets:
                    (
                        dataset,
                        showcase,
                        bites_disabled,
                        qc_indicators,
                    ) = generate_dataset_and_showcase(
                        indicatorsetname,
                        indicatorsets,
                        country,
                        countrymapping,
                        showcase_base_url,
                        filelist_url,
                        downloader,
                        info["folder"],
                    )
                    if not dataset:
                        # Nothing was generated for this pair.
                        continue
                    dataset.update_from_yaml()
                    dataset.generate_resource_view(
                        -1,
                        bites_disabled=bites_disabled,
                        indicators=qc_indicators,
                    )
                    dataset.create_in_hdx(
                        remove_additional_resources=True,
                        hxl_update=False,
                        updated_by_script="HDX Scraper: FAOStat",
                        batch=batch,
                    )
                    showcase.create_in_hdx()
                    showcase.add_dataset(dataset)
Exemple #5
0
def main(
    output_dir,
    saved_dir,
    save,
    use_saved,
    dportal_params,
    whattorun,
    filterdate,
    **ignore,
):
    """Recreate the output folder for this run and hand over to ``start``.

    Args:
        output_dir: Base output folder; ``_{whattorun}`` is appended.
        saved_dir: Base saved-data folder; ``_{whattorun}`` is appended.
        save: Passed through to ``Retrieve``.
        use_saved: Passed through to ``Retrieve``.
        dportal_params: Passed through to ``start``.
        whattorun: Suffix for the folders; also passed to ``start``.
        filterdate: Passed through to ``start``.
        **ignore: Extra keyword arguments, accepted and unused.
    """
    logger.info(f"##### hdx-scraper-iati-viz version {VERSION:.1f} ####")
    configuration = Configuration.read()

    # Start from an empty output folder: delete any previous one, then
    # create it again.
    run_output_dir = f"{output_dir}_{whattorun}"
    rmtree(run_output_dir, ignore_errors=True)
    mkdir(run_output_dir)

    with Download() as downloader:
        fallback_dir = configuration["fallback_dir"]
        run_saved_dir = f"{saved_dir}_{whattorun}"
        retriever = Retrieve(
            downloader,
            fallback_dir,
            run_saved_dir,
            run_output_dir,
            save,
            use_saved,
        )
        # Naive UTC timestamp as an ISO 8601 string.
        today = datetime.utcnow().isoformat()
        start(
            configuration,
            today,
            retriever,
            run_output_dir,
            dportal_params,
            whattorun,
            filterdate,
        )
Exemple #6
0
 def configuration_multiple(self):
     """Build a read-only "prod" Configuration from the multiple-config YAML.

     Unlike the ``_create`` based helpers, this constructs and returns a
     ``Configuration`` instance directly.

     Returns:
         The newly constructed ``Configuration`` object.
     """
     return Configuration(
         hdx_site="prod",
         user_agent="test",
         hdx_read_only=True,
         project_config_yaml=join(
             "tests", "fixtures", "project_configuration_multiple.yml"
         ),
     )
 def configuration(self):
     """Create a read-only "prod" configuration with two valid locations.

     Registers Afghanistan and State of Palestine as the valid locations
     after calling ``Configuration._create``.

     Returns:
         Whatever ``Configuration.read()`` returns after the setup above.
     """
     config_path = join("tests", "config", "project_configuration.yml")
     Configuration._create(
         hdx_read_only=True,
         hdx_site="prod",
         user_agent="test",
         project_config_yaml=config_path,
     )
     valid_locations = [
         {"name": "afg", "title": "Afghanistan"},
         {"name": "pse", "title": "State of Palestine"},
     ]
     Locations.set_validlocations(valid_locations)
     return Configuration.read()
 def configuration(self):
     """Create a keyed test configuration with Afghanistan and Philippines.

     Calls ``Configuration._create`` with a dummy ``hdx_key``, registers the
     two valid locations and loads country data with ``use_live=False``.

     Returns:
         Whatever ``Configuration.read()`` returns after the setup above.
     """
     config_path = join("tests", "config", "project_configuration.yml")
     Configuration._create(
         user_agent="test",
         hdx_key="12345",
         project_config_yaml=config_path,
     )
     valid_locations = [
         {"name": "afg", "title": "Afghanistan"},
         {"name": "phl", "title": "Philippines"},
     ]
     Locations.set_validlocations(valid_locations)
     Country.countriesdata(use_live=False)
     return Configuration.read()
 def configuration(self):
     """Create a keyed test configuration with locations and approved tags.

     Calls ``Configuration._create`` with a dummy ``hdx_key``, registers
     Afghanistan and Philippines as valid locations, loads country data with
     ``use_live=False`` and overwrites the private ``Vocabulary`` attributes
     with a fixed approved vocabulary of four tags.

     Returns:
         Whatever ``Configuration.read()`` returns after the setup above.
     """
     config_path = join("tests", "config", "project_configuration.yml")
     Configuration._create(
         user_agent="test",
         hdx_key="12345",
         project_config_yaml=config_path,
     )
     valid_locations = [
         {"name": "afg", "title": "Afghanistan"},
         {"name": "phl", "title": "Philippines"},
     ]
     Locations.set_validlocations(valid_locations)
     Country.countriesdata(use_live=False)
     # Private attributes are assigned directly; presumably this stops the
     # Vocabulary class fetching the approved tags itself - TODO confirm.
     Vocabulary._tags_dict = True
     tag_names = (
         "common operational dataset - cod",
         "administrative divisions",
         "geodata",
         "gazetteer",
     )
     Vocabulary._approved_vocabulary = {
         "tags": [{"name": tag_name} for tag_name in tag_names],
         "id": "4e61d464-4943-4e97-973a-84673c1aaa87",
         "name": "approved",
     }
     return Configuration.read()
def main(output_failures=False, **ignore):
    """Generate a dataset for every dataset id and create it in HDX.

    Args:
        output_failures: Passed through to ``generate_dataset``.
            Defaults to False.
        **ignore: Extra keyword arguments, accepted and unused.
    """
    configuration = Configuration.read()
    with Download() as downloader:
        dataset_ids = get_dataset_ids(configuration, downloader)
        logger.info(f"Number of datasets to upload: {len(dataset_ids)}")
        progress = progress_storing_tempdir(
            "UNHCR-MICRODATA", dataset_ids, "id"
        )
        for info, dataset_id in progress:
            dataset = generate_dataset(
                dataset_id["id"], configuration, downloader, output_failures
            )
            if not dataset:
                # Nothing was generated for this id.
                continue
            dataset.update_from_yaml()
            dataset.create_in_hdx(
                remove_additional_resources=True,
                hxl_update=False,
                updated_by_script="HDX Scraper: UNHCR microdata",
                batch=info["batch"],
            )
        # NOTE(review): `failures` is not defined in this function;
        # presumably a module-level collection filled elsewhere - confirm.
        for failure in failures:
            logger.error(failure)
Exemple #11
0
def main():
    """Generate a dataset per metadata entry and create it in HDX.

    Dataset metadata is fetched from the configured ``url``. An ``HDXError``
    raised while creating a dataset is recorded in the ``ErrorsOnExit``
    collector and the loop moves on to the next entry.
    """
    configuration = Configuration.read()
    with ErrorsOnExit() as errors, Download() as downloader:
        cod = COD(downloader, errors)
        datasets_metadata = cod.get_datasets_metadata(configuration["url"])
        logger.info(f"Number of datasets to upload: {len(datasets_metadata)}")
        for metadata in datasets_metadata:
            dataset, batch = cod.generate_dataset(metadata)
            if not dataset:
                # Nothing was generated for this entry.
                continue
            dataset.update_from_yaml()
            try:
                dataset.create_in_hdx(
                    remove_additional_resources=True,
                    hxl_update=False,
                    updated_by_script="HDX Scraper: CODS",
                    batch=batch,
                    ignore_fields=["num_of_rows", "resource:description"],
                )
            except HDXError as ex:
                errors.add(f"Dataset: {metadata['DatasetTitle']}, error: {ex}")
Exemple #12
0
def main(
    db_url: Optional[str] = None,
    db_params: Optional[str] = None,
    gsheet_auth: Optional[str] = None,
    email_server: Optional[str] = None,
    failure_emails: Optional[str] = None,
    sysadmin_emails: Optional[str] = None,
    email_test: Optional[str] = None,
    spreadsheet_test: bool = False,
    no_spreadsheet: bool = False,
    **ignore,
) -> None:
    """Run the data freshness emailer.

    The database is chosen from db_params if given, otherwise from db_url,
    otherwise a local SQLite database with filename "freshness.db" is used.

    email_server, when supplied, is a comma-separated string of the form:
    connection type (eg. ssl),host,port,username,password,sender email
    where the sender email is optional. Without it no send function is
    passed to the Email helper.

    failure_emails and sysadmin_emails are comma-separated strings of email
    addresses; both are split on "," before use. failure_emails receives the
    Google Sheets / input sheet error reports and is handed to the dataset
    count check. When email_test is truthy, every processing step is
    addressed only to the first failure email.

    Args:
        db_url (Optional[str]): Database connection string. Defaults to None.
        db_params (Optional[str]): Database connection parameters. Defaults to None.
        gsheet_auth (Optional[str]): Google Sheets authorisation. Defaults to None.
        email_server (Optional[str]): Email server to use. Defaults to None.
        failure_emails (Optional[str]): Email addresses. Defaults to None.
        sysadmin_emails (Optional[str]): Email addresses. Defaults to None.
        email_test (Optional[str]): Only email test users. Defaults to None.
        spreadsheet_test (bool): Output to test Google spreadsheet. Defaults to False.
        no_spreadsheet (bool): Don't output to Google spreadsheet. Defaults to False.
        **ignore: Extra keyword arguments, accepted and unused.

    Returns:
        None
    """

    logger.info(f"> Data freshness emailer {__version__}")
    configuration = Configuration.read()

    # --- Email server -----------------------------------------------------
    if email_server:
        server_fields = email_server.split(",")
        emailer_settings = {
            "connection_type": server_fields[0],
            "host": server_fields[1],
            "port": int(server_fields[2]),
            "username": server_fields[3],
            "password": server_fields[4],
        }
        # The sixth field (sender) is optional.
        if len(server_fields) > 5:
            emailer_settings["sender"] = server_fields[5]
        configuration.setup_emailer(email_config_dict=emailer_settings)
        logger.info(f"> Email host: {server_fields[1]}")
        send_emails = configuration.emailer().send
    else:
        logger.info("> No email host!")
        send_emails = None

    # --- Freshness database -----------------------------------------------
    if db_params:
        params = args_to_dict(db_params)
    elif db_url:
        params = Database.get_params_from_sqlalchemy_url(db_url)
    else:
        params = {"driver": "sqlite", "database": "freshness.db"}
    if sysadmin_emails:
        sysadmin_emails = sysadmin_emails.split(",")
    logger.info(f"> Database parameters: {params}")

    with Database(**params) as session:
        now = datetime.datetime.utcnow()
        email = Email(
            now,
            sysadmin_emails=sysadmin_emails,
            send_emails=send_emails,
        )
        sheet = Sheet(now)
        failure_emails = failure_emails.split(",") if failure_emails else list()

        error = sheet.setup_gsheet(
            configuration, gsheet_auth, spreadsheet_test, no_spreadsheet
        )
        if error:
            email.htmlify_send(
                failure_emails, "Error opening Google sheets!", error
            )
        else:
            error = sheet.setup_input()
            if error:
                email.htmlify_send(
                    failure_emails,
                    "Error reading DP duty roster or data grid curation sheet!",
                    error,
                )
            else:
                hdxhelper = HDXHelper(site_url=configuration.get_hdx_site_url())
                databasequeries = DatabaseQueries(
                    session=session, now=now, hdxhelper=hdxhelper
                )
                freshness = DataFreshnessStatus(
                    databasequeries=databasequeries,
                    email=email,
                    sheet=sheet,
                )
                # Check number of datasets hasn't dropped
                if not freshness.check_number_datasets(
                    now, send_failures=failure_emails
                ):
                    if email_test:
                        # Send just to test users.
                        # NOTE(review): raises IndexError when email_test is
                        # set but no failure_emails were supplied.
                        test_users = [failure_emails[0]]
                        recipient_args = {"recipients": test_users}
                        overdue_args = {
                            "recipients": test_users,
                            "sysadmins": test_users,
                        }
                    else:
                        # No overrides: each step is called without arguments.
                        recipient_args = {}
                        overdue_args = {}
                    # Check for broken resources
                    freshness.process_broken(**recipient_args)
                    # Check for overdue datasets
                    freshness.process_overdue(**overdue_args)
                    # Check for delinquent datasets
                    freshness.process_delinquent(**recipient_args)
                    # Check for datasets with invalid maintainer and
                    # organisations with invalid administrators
                    freshness.process_maintainer_orgadmins(**recipient_args)
                    # Check for datasets with no resources
                    freshness.process_datasets_noresources(**recipient_args)
                    # The dataset date check
                    # (freshness.process_datasets_dataset_date) is disabled
                    # here and is deliberately not called.
                    # Check for candidates for the data grid
                    freshness.process_datasets_datagrid(**recipient_args)

    logger.info("Freshness emailer completed!")
Exemple #13
0
import os

import pandas as pd
import unicodecsv
import xlrd
from hdx.data.dataset import Dataset
from hdx.api.configuration import Configuration

from conf import config, nutriset_config

# Import-time side effect: the HDX configuration is created as soon as this
# module is loaded, in read-only mode against the 'prod' site.
Configuration.create(
    hdx_site='prod',
    user_agent='read-hdx',
    hdx_read_only=True,
)


def get_jme_dataset():
    """Print a progress message and read the Joint Malnutrition dataset.

    Returns:
        The result of ``Dataset.read_from_hdx`` for the fixed dataset name.
    """
    dataset_name = (
        'child-malnutrition-joint-country-dataset-unicef-who-world-bank-group-2017'
    )
    print("Downloading latest version of Joint Malnutrition Dataset from HDX")
    return Dataset.read_from_hdx(dataset_name)


def get_reliefweb_dataset():
    """Print a progress message and read the ReliefWeb crisis figures dataset.

    Returns:
        The result of ``Dataset.read_from_hdx`` for the fixed dataset name.
    """
    dataset_name = 'reliefweb-crisis-figures'
    print(
        "Downloading latest version of Relief Web Crisis App Dataset from HDX")
    return Dataset.read_from_hdx(dataset_name)


def download_dataset(dataset, filename):
    resources = dataset.get_resources()
    url, path = resources[0].download(config.WORKING_FOLDER)