Example #1
def main(db_params, start_date, **ignore):
    params = args_to_dict(db_params)  # parse "key1=value1,key2=value2,..." string into a dict
    port = params.get('port')
    if port:
        params['port'] = int(port)  # pymysql expects the port as an integer, not a string
    params['charset'] = 'utf8'
    params['cursorclass'] = pymysql.cursors.DictCursor
    make_hdx_entries(start_date, **params)
Example #2
def main(dbread_url, dbread_params, dbwrite_url, dbwrite_params, action, run_numbers, **ignore):
    if dbread_params:
        readparams = args_to_dict(dbread_params)
    elif dbread_url:
        readparams = Database.get_params_from_sqlalchemy_url(dbread_url)
    else:
        readparams = {'driver': 'sqlite', 'database': 'input.db'}
    logger.info('> Database (to read) parameters: %s' % readparams)
    if dbwrite_params:
        writeparams = args_to_dict(dbwrite_params)
    elif dbwrite_url:
        writeparams = Database.get_params_from_sqlalchemy_url(dbwrite_url)
    else:
        writeparams = {'driver': 'sqlite', 'database': 'output.db'}
    logger.info('> Database (to write) parameters: %s' % writeparams)
    with Database(**readparams) as readsession:
        with Database(**writeparams) as writesession:
            dbactions = DatabaseActions(readsession, writesession, run_numbers)
            if action == 'duplicate':
                dbactions.duplicate()
Example #3
def main(file_path, hdx_key, user_agent, preprefix, hdx_site, db_url, db_params, gsheet_auth):
    if db_params:
        params = args_to_dict(db_params)
    elif db_url:
        params = Database.get_params_from_sqlalchemy_url(db_url)
    else:
        params = {'driver': 'sqlite', 'database': 'freshness.db'}
    logger.info('> Database parameters: %s' % params)
    with Database(**params) as session:
        info = json.loads(gsheet_auth)
        scopes = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive']
        credentials = service_account.Credentials.from_service_account_info(info, scopes=scopes)
        gc = pygsheets.authorize(custom_credentials=credentials)
        configuration = load_yaml('project_configuration.yml')
        spreadsheet = gc.open_by_url(configuration['spreadsheet_url'])
        sheet = spreadsheet.worksheet_by_title('datasets')
        sheet.clear()
        rows = [['update freq', 'fresh', 'no days', 'title', 'run date', 'last modified', 'dataset date', 'dataset end date', 'org title', 'URL', 'id', 'org id', 'maintainer', 'what updated', 'resources']]
        run_number, run_date = session.query(DBRun.run_number, DBRun.run_date).order_by(DBRun.run_number.desc()).first()
        logger.info('Run number is %d' % run_number)

        datasetcolumns = [DBDataset.update_frequency, DBDataset.fresh, DBInfoDataset.title, DBDataset.last_modified,
                          DBDataset.dataset_date, DBOrganization.title.label('organization_title'), DBInfoDataset.name,
                          DBDataset.id, DBOrganization.id.label('organization_id'), DBInfoDataset.maintainer, DBDataset.what_updated]

        resourcecolumns = [DBDataset.id, DBResource.url]

        def get_datasets(update_frequency, fresh):
            filters = [DBDataset.run_number == run_number, DBDataset.id == DBInfoDataset.id,
                       DBInfoDataset.organization_id == DBOrganization.id,
                       DBDataset.fresh == fresh, DBDataset.update_frequency == update_frequency]
            return session.query(*datasetcolumns).filter(and_(*filters))

        def get_resources(dataset_ids):
            filters = [DBDataset.run_number == run_number, DBResource.run_number == run_number,
                       DBDataset.id == DBResource.dataset_id, DBDataset.id.in_(dataset_ids)]
            return session.query(*resourcecolumns).filter(and_(*filters))

        fresh_values = [0, 1, 2, 3]
        update_frequencies = [1, 7, 14, 30, 180, 365]

        repobase = '%s/tree/master/datasets/' % configuration['repo']
        dir = join(file_path, 'datasets')
        rmtree(dir, ignore_errors=True)
        mkdir(dir)

        with Download(user_agent=user_agent, preprefix=preprefix) as downloader:
            status_forcelist = [429, 500, 502, 503, 504]
            method_whitelist = frozenset(['HEAD', 'TRACE', 'GET', 'PUT', 'OPTIONS', 'DELETE'])
            # Note: urllib3 renamed method_whitelist to allowed_methods in 1.26 (removed in 2.0)
            retries = Retry(total=1, backoff_factor=0.4, status_forcelist=status_forcelist,
                            method_whitelist=method_whitelist,
                            raise_on_redirect=True,
                            raise_on_status=True)
            downloader.session.mount('http://', HTTPAdapter(max_retries=retries, pool_connections=100, pool_maxsize=100))
            downloader.session.mount('https://', HTTPAdapter(max_retries=retries, pool_connections=100, pool_maxsize=100))

            for update_frequency in update_frequencies:
                for fresh in fresh_values:
                    org_ids = list()
                    results = get_datasets(update_frequency, fresh)
                    datasets = list()
                    ids = list()
                    datasets_urls = dict()
                    for dataset in results:
                        dataset = list(dataset)
                        datasets.append(dataset)
                        ids.append(dataset[7])
                    for result in get_resources(ids):
                        resource = list(result)
                        dict_of_lists_add(datasets_urls, resource[0], resource[1])
                    for dataset in datasets:
                        org_id = dataset[8]
                        if org_id in org_ids:
                            continue
                        dataset = list(dataset)
                        dataset[0] = Dataset.transform_update_frequency(str(update_frequency))
                        fresh = dataset[1]
                        if fresh == 0:
                            dataset[1] = 'fresh'
                        elif fresh == 1:
                            dataset[1] = 'due'
                        elif fresh == 2:
                            dataset[1] = 'overdue'
                        elif fresh == 3:
                            dataset[1] = 'delinquent'
                        last_modified = dataset[3]
                        dataset[3] = last_modified.isoformat()
                        nodays = (run_date - last_modified).days
                        dataset.insert(2, nodays)
                        dataset.insert(4, run_date.isoformat())
                        dataset_date = dataset[6]
                        if '-' in dataset_date:
                            dataset_date = dataset_date.split('-')
                            dataset[6] = datetime.strptime(dataset_date[0], '%m/%d/%Y').date().isoformat()
                            dataset.insert(7, datetime.strptime(dataset_date[1], '%m/%d/%Y').date().isoformat())
                        else:
                            dataset[6] = datetime.strptime(dataset_date, '%m/%d/%Y').date().isoformat()
                            dataset.insert(7, '')
                        dataset_name = dataset[9]
                        dataset[9] = 'https://data.humdata.org/dataset/%s' % dataset_name
                        org_ids.append(org_id)
                        if len(org_ids) == 6:
                            break
                        urls = datasets_urls[dataset[10]]
                        if len(urls) != 0:
                            datasetdir = join(dir, dataset_name)
                            mkdir(datasetdir)
                            for url in urls:
                                urlpath = urlsplit(url).path
                                filename = basename(urlpath)
                                try:
                                    downloader.download_file(url, datasetdir, filename)
                                except DownloadError as ex:
                                    with open(join(datasetdir, filename), 'w') as text_file:
                                        text_file.write(str(ex))
                            dataset.append('%s%s' % (repobase, dataset_name))
                        else:
                            dataset.append('')
                        rows.append(dataset)
                        logger.info('Added dataset %s' % dataset_name)
            sheet.update_values('A1', rows)
Example #4
def main(
    db_url: Optional[str] = None,
    db_params: Optional[str] = None,
    gsheet_auth: Optional[str] = None,
    email_server: Optional[str] = None,
    failure_emails: Optional[str] = None,
    sysadmin_emails: Optional[str] = None,
    email_test: Optional[str] = None,
    spreadsheet_test: bool = False,
    no_spreadsheet: bool = False,
    **ignore,
) -> None:
    """Run freshness emailer. Either a database connection string (db_url) or database
    connection parameters (db_params) can be supplied. If neither is supplied, a local
    SQLite database with filename "freshness.db" is assumed. An optional email server
    can be supplied in the form:
    connection type (e.g. ssl),host,port,username,password,sender email

    If not supplied, no emails will be sent. An optional authorisation string for
    Google Sheets can be supplied of the form:
    {"type": "service_account", "project_id": "hdx-bot", "private_key_id": ...
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",...}

    failure_emails is a list of email addresses for the people who should be emailed in
    the event of a freshness failure. sysadmin_emails is a list of email addresses of
    HDX system administrators who are emailed with summaries of maintainers contacted,
    datasets that have become delinquent, invalid maintainers and org admins etc.

    Args:
        db_url (Optional[str]): Database connection string. Defaults to None.
        db_params (Optional[str]): Database connection parameters. Defaults to None.
        gsheet_auth (Optional[str]): Google Sheets authorisation. Defaults to None.
        email_server (Optional[str]): Email server to use. Defaults to None.
        failure_emails (Optional[str]): Email addresses. Defaults to None.
        sysadmin_emails (Optional[str]): Email addresses. Defaults to None.
        email_test (Optional[str]): Only email test users. Defaults to None.
        spreadsheet_test (bool): Output to test Google spreadsheet. Defaults to False.
        no_spreadsheet (bool): Don't output to Google spreadsheet. Defaults to False.

    Returns:
        None
    """

    logger.info(f"> Data freshness emailer {__version__}")
    configuration = Configuration.read()
    if email_server:  # Get email server details
        email_config = email_server.split(",")
        email_config_dict = {
            "connection_type": email_config[0],
            "host": email_config[1],
            "port": int(email_config[2]),
            "username": email_config[3],
            "password": email_config[4],
        }
        if len(email_config) > 5:
            email_config_dict["sender"] = email_config[5]
        configuration.setup_emailer(email_config_dict=email_config_dict)
        logger.info(f"> Email host: {email_config[1]}")
        send_emails = configuration.emailer().send
    else:
        logger.info("> No email host!")
        send_emails = None
    if db_params:  # Get freshness database server details
        params = args_to_dict(db_params)
    elif db_url:
        params = Database.get_params_from_sqlalchemy_url(db_url)
    else:
        params = {"driver": "sqlite", "database": "freshness.db"}
    if sysadmin_emails:
        sysadmin_emails = sysadmin_emails.split(",")
    logger.info(f"> Database parameters: {params}")
    with Database(**params) as session:
        now = datetime.datetime.utcnow()
        email = Email(
            now,
            sysadmin_emails=sysadmin_emails,
            send_emails=send_emails,
        )
        sheet = Sheet(now)

        if failure_emails:
            failure_emails = failure_emails.split(",")
        else:
            failure_emails = list()
        error = sheet.setup_gsheet(configuration, gsheet_auth,
                                   spreadsheet_test, no_spreadsheet)
        if error:
            email.htmlify_send(failure_emails, "Error opening Google sheets!",
                               error)
        else:
            error = sheet.setup_input()
            if error:
                email.htmlify_send(
                    failure_emails,
                    "Error reading DP duty roster or data grid curation sheet!",
                    error,
                )
            else:
                hdxhelper = HDXHelper(
                    site_url=configuration.get_hdx_site_url())
                databasequeries = DatabaseQueries(session=session,
                                                  now=now,
                                                  hdxhelper=hdxhelper)
                freshness = DataFreshnessStatus(
                    databasequeries=databasequeries,
                    email=email,
                    sheet=sheet,
                )
                # Check number of datasets hasn't dropped
                if not freshness.check_number_datasets(
                        now, send_failures=failure_emails):
                    if email_test:  # send just to test users
                        test_users = [failure_emails[0]]
                        freshness.process_broken(recipients=test_users)
                        freshness.process_overdue(recipients=test_users,
                                                  sysadmins=test_users)
                        freshness.process_delinquent(recipients=test_users)
                        freshness.process_maintainer_orgadmins(
                            recipients=test_users)
                        freshness.process_datasets_noresources(
                            recipients=test_users)
                        # freshness.process_datasets_dataset_date(
                        #     recipients=test_users,
                        #     sysadmins=test_users
                        # )
                        freshness.process_datasets_datagrid(
                            recipients=test_users)
                    else:
                        freshness.process_broken()  # Check for broken resources
                        freshness.process_overdue()  # Check for overdue datasets
                        freshness.process_delinquent()  # Check for delinquent datasets
                        # Check for datasets with invalid maintainer and organisations
                        # with invalid administrators
                        freshness.process_maintainer_orgadmins()
                        # Check for datasets with no resources
                        freshness.process_datasets_noresources()
                        # Check for datasets where the dataset date may need updating
                        # freshness.process_datasets_dataset_date(
                        #     sysadmins=test_users
                        # )
                        # Check for candidates for the data grid
                        freshness.process_datasets_datagrid()

    logger.info("Freshness emailer completed!")
Example #5
def test_args_to_dict(self):
    args = "a=1,big=hello,1=3"
    assert args_to_dict(args) == {"a": "1", "big": "hello", "1": "3"}