def main(db_params, start_date, **ignore):
    # Convert comma separated key=value database parameters into a dict
    params = args_to_dict(db_params)
    port = params.get('port')
    if port:
        params['port'] = int(port)
    # pymysql specific connection options
    params['charset'] = 'utf8'
    params['cursorclass'] = pymysql.cursors.DictCursor
    make_hdx_entries(start_date, **params)
def main(dbread_url, dbread_params, dbwrite_url, dbwrite_params, action, run_numbers, **ignore):
    if dbread_params:
        readparams = args_to_dict(dbread_params)
    elif dbread_url:
        readparams = Database.get_params_from_sqlalchemy_url(dbread_url)
    else:
        readparams = {'driver': 'sqlite', 'database': 'input.db'}
    logger.info('> Database (to read) parameters: %s' % readparams)
    if dbwrite_params:
        writeparams = args_to_dict(dbwrite_params)
    elif dbwrite_url:
        writeparams = Database.get_params_from_sqlalchemy_url(dbwrite_url)
    else:
        writeparams = {'driver': 'sqlite', 'database': 'output.db'}
    logger.info('> Database (to write) parameters: %s' % writeparams)
    with Database(**readparams) as readsession:
        with Database(**writeparams) as writesession:
            dbactions = DatabaseActions(readsession, writesession, run_numbers)
            if action == 'duplicate':
                dbactions.duplicate()
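
# For illustration only: a hypothetical invocation of the entry point above,
# duplicating selected runs from one SQLite database into another. The file
# names and the run_numbers value are made-up placeholders (the real format of
# run_numbers depends on what DatabaseActions expects), not taken from the source.
main(
    dbread_url=None,
    dbread_params='driver=sqlite,database=input.db',
    dbwrite_url=None,
    dbwrite_params='driver=sqlite,database=output.db',
    action='duplicate',
    run_numbers='0,1,2',
)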
def main(file_path, hdx_key, user_agent, preprefix, hdx_site, db_url, db_params, gsheet_auth):
    if db_params:
        params = args_to_dict(db_params)
    elif db_url:
        params = Database.get_params_from_sqlalchemy_url(db_url)
    else:
        params = {'driver': 'sqlite', 'database': 'freshness.db'}
    logger.info('> Database parameters: %s' % params)
    with Database(**params) as session:
        # Authorise with Google Sheets using a service account and open the output worksheet
        info = json.loads(gsheet_auth)
        scopes = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive']
        credentials = service_account.Credentials.from_service_account_info(info, scopes=scopes)
        gc = pygsheets.authorize(custom_credentials=credentials)
        configuration = load_yaml('project_configuration.yml')
        spreadsheet = gc.open_by_url(configuration['spreadsheet_url'])
        sheet = spreadsheet.worksheet_by_title('datasets')
        sheet.clear()
        rows = [['update freq', 'fresh', 'no days', 'title', 'run date', 'last modified', 'dataset date',
                 'dataset end date', 'org title', 'URL', 'id', 'org id', 'maintainer', 'what updated', 'resources']]
        # Use the most recent freshness run
        run_number, run_date = session.query(DBRun.run_number, DBRun.run_date).order_by(
            DBRun.run_number.desc()).first()
        logger.info('Run number is %d' % run_number)
        datasetcolumns = [DBDataset.update_frequency, DBDataset.fresh, DBInfoDataset.title,
                          DBDataset.last_modified, DBDataset.dataset_date,
                          DBOrganization.title.label('organization_title'), DBInfoDataset.name, DBDataset.id,
                          DBOrganization.id.label('organization_id'), DBInfoDataset.maintainer,
                          DBDataset.what_updated]
        resourcecolumns = [DBDataset.id, DBResource.url]

        def get_datasets(update_frequency, fresh):
            filters = [DBDataset.run_number == run_number, DBDataset.id == DBInfoDataset.id,
                       DBInfoDataset.organization_id == DBOrganization.id, DBDataset.fresh == fresh,
                       DBDataset.update_frequency == update_frequency]
            return session.query(*datasetcolumns).filter(and_(*filters))

        def get_resources(dataset_ids):
            filters = [DBDataset.run_number == run_number, DBResource.run_number == run_number,
                       DBDataset.id == DBResource.dataset_id, DBDataset.id.in_(dataset_ids)]
            return session.query(*resourcecolumns).filter(and_(*filters))

        fresh_values = [0, 1, 2, 3]
        update_frequencies = [1, 7, 14, 30, 180, 365]
        repobase = '%s/tree/master/datasets/' % configuration['repo']
        dir = join(file_path, 'datasets')
        rmtree(dir, ignore_errors=True)
        mkdir(dir)
        with Download(user_agent=user_agent, preprefix=preprefix) as downloader:
            # Mount retrying adapters so transient server errors don't abort resource downloads
            status_forcelist = [429, 500, 502, 503, 504]
            method_whitelist = frozenset(['HEAD', 'TRACE', 'GET', 'PUT', 'OPTIONS', 'DELETE'])
            retries = Retry(total=1, backoff_factor=0.4, status_forcelist=status_forcelist,
                            method_whitelist=method_whitelist, raise_on_redirect=True, raise_on_status=True)
            downloader.session.mount('http://', HTTPAdapter(max_retries=retries, pool_connections=100,
                                                            pool_maxsize=100))
            downloader.session.mount('https://', HTTPAdapter(max_retries=retries, pool_connections=100,
                                                             pool_maxsize=100))
            for update_frequency in update_frequencies:
                for fresh in fresh_values:
                    org_ids = list()
                    results = get_datasets(update_frequency, fresh)
                    datasets = list()
                    ids = list()
                    datasets_urls = dict()
                    for dataset in results:
                        dataset = list(dataset)
                        datasets.append(dataset)
                        ids.append(dataset[7])
                    # Group resource URLs by dataset id
                    for result in get_resources(ids):
                        resource = list(result)
                        dict_of_lists_add(datasets_urls, resource[0], resource[1])
                    for dataset in datasets:
                        org_id = dataset[8]
                        # Take at most one dataset per organisation
                        if org_id in org_ids:
                            continue
                        dataset = list(dataset)
                        dataset[0] = Dataset.transform_update_frequency(str(update_frequency))
                        fresh = dataset[1]
                        if fresh == 0:
                            dataset[1] = 'fresh'
                        elif fresh == 1:
                            dataset[1] = 'due'
                        elif fresh == 2:
                            dataset[1] = 'overdue'
                        elif fresh == 3:
                            dataset[1] = 'delinquent'
                        last_modified = dataset[3]
                        dataset[3] = last_modified.isoformat()
                        nodays = (run_date - last_modified).days
                        dataset.insert(2, nodays)
                        dataset.insert(4, run_date.isoformat())
                        dataset_date = dataset[6]
                        if '-' in dataset_date:
                            # Date range: split into dataset date and dataset end date columns
                            dataset_date = dataset_date.split('-')
                            dataset[6] = datetime.strptime(dataset_date[0], '%m/%d/%Y').date().isoformat()
                            dataset.insert(7, datetime.strptime(dataset_date[1], '%m/%d/%Y').date().isoformat())
                        else:
                            dataset[6] = datetime.strptime(dataset_date, '%m/%d/%Y').date().isoformat()
                            dataset.insert(7, '')
                        dataset_name = dataset[9]
                        dataset[9] = 'https://data.humdata.org/dataset/%s' % dataset_name
                        org_ids.append(org_id)
                        # Stop after six organisations for this frequency/freshness combination
                        if len(org_ids) == 6:
                            break
                        # Use get so datasets without any resources do not raise KeyError
                        urls = datasets_urls.get(dataset[10], list())
                        if len(urls) != 0:
                            datasetdir = join(dir, dataset_name)
                            mkdir(datasetdir)
                            for url in urls:
                                urlpath = urlsplit(url).path
                                filename = basename(urlpath)
                                try:
                                    downloader.download_file(url, datasetdir, filename)
                                except DownloadError as ex:
                                    # Record the download error in place of the file
                                    with open(join(datasetdir, filename), 'w') as text_file:
                                        text_file.write(str(ex))
                            dataset.append('%s%s' % (repobase, dataset_name))
                        else:
                            dataset.append('')
                        rows.append(dataset)
                        logger.info('Added dataset %s' % dataset_name)
        sheet.update_values('A1', rows)
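
# Aside: the resource grouping above assumes dict_of_lists_add appends a value
# to the list stored under a key, creating the list on first use. A minimal
# sketch of that behaviour (the helper itself presumably comes from HDX's
# Python utilities); the ids and URLs below are placeholders:
datasets_urls = dict()
for dataset_id, url in [('dataset-1', 'https://example.com/a.csv'),
                        ('dataset-1', 'https://example.com/b.csv'),
                        ('dataset-2', 'https://example.com/c.csv')]:
    datasets_urls.setdefault(dataset_id, list()).append(url)
assert datasets_urls == {'dataset-1': ['https://example.com/a.csv', 'https://example.com/b.csv'],
                         'dataset-2': ['https://example.com/c.csv']}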
def main(
    db_url: Optional[str] = None,
    db_params: Optional[str] = None,
    gsheet_auth: Optional[str] = None,
    email_server: Optional[str] = None,
    failure_emails: Optional[str] = None,
    sysadmin_emails: Optional[str] = None,
    email_test: Optional[str] = None,
    spreadsheet_test: bool = False,
    no_spreadsheet: bool = False,
    **ignore,
) -> None:
    """Run freshness emailer. Either a database connection string (db_url) or
    database connection parameters (db_params) can be supplied. If neither is
    supplied, a local SQLite database with filename "freshness.db" is assumed.
    An optional email server can be supplied in the form:

    connection type (eg. ssl),host,port,username,password,sender email

    If not supplied, no emails will be sent. An optional authorisation string
    for Google Sheets can be supplied of the form:

    {"type": "service_account", "project_id": "hdx-bot", "private_key_id": ...
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",...}

    failure_emails is a list of email addresses for the people who should be
    emailed in the event of a freshness failure. sysadmin_emails is a list of
    email addresses of HDX system administrators who are emailed with summaries
    of maintainers contacted, datasets that have become delinquent, invalid
    maintainers and org admins etc.

    Args:
        db_url (Optional[str]): Database connection string. Defaults to None.
        db_params (Optional[str]): Database connection parameters. Defaults to None.
        gsheet_auth (Optional[str]): Google Sheets authorisation. Defaults to None.
        email_server (Optional[str]): Email server to use. Defaults to None.
        failure_emails (Optional[str]): Email addresses. Defaults to None.
        sysadmin_emails (Optional[str]): Email addresses. Defaults to None.
        email_test (Optional[str]): Only email test users. Defaults to None.
        spreadsheet_test (bool): Output to test Google spreadsheet. Defaults to False.
        no_spreadsheet (bool): Don't output to Google spreadsheet. Defaults to False.

    Returns:
        None
    """
    logger.info(f"> Data freshness emailer {__version__}")
    configuration = Configuration.read()
    if email_server:  # Get email server details
        email_config = email_server.split(",")
        email_config_dict = {
            "connection_type": email_config[0],
            "host": email_config[1],
            "port": int(email_config[2]),
            "username": email_config[3],
            "password": email_config[4],
        }
        if len(email_config) > 5:
            email_config_dict["sender"] = email_config[5]
        configuration.setup_emailer(email_config_dict=email_config_dict)
        logger.info(f"> Email host: {email_config[1]}")
        send_emails = configuration.emailer().send
    else:
        logger.info("> No email host!")
        send_emails = None
    if db_params:  # Get freshness database server details
        params = args_to_dict(db_params)
    elif db_url:
        params = Database.get_params_from_sqlalchemy_url(db_url)
    else:
        params = {"driver": "sqlite", "database": "freshness.db"}
    if sysadmin_emails:
        sysadmin_emails = sysadmin_emails.split(",")
    logger.info(f"> Database parameters: {params}")
    with Database(**params) as session:
        now = datetime.datetime.utcnow()
        email = Email(
            now,
            sysadmin_emails=sysadmin_emails,
            send_emails=send_emails,
        )
        sheet = Sheet(now)
        if failure_emails:
            failure_emails = failure_emails.split(",")
        else:
            failure_emails = list()
        error = sheet.setup_gsheet(
            configuration, gsheet_auth, spreadsheet_test, no_spreadsheet
        )
        if error:
            email.htmlify_send(failure_emails, "Error opening Google sheets!", error)
        else:
            error = sheet.setup_input()
            if error:
                email.htmlify_send(
                    failure_emails,
                    "Error reading DP duty roster or data grid curation sheet!",
                    error,
                )
            else:
                hdxhelper = HDXHelper(site_url=configuration.get_hdx_site_url())
                databasequeries = DatabaseQueries(
                    session=session, now=now, hdxhelper=hdxhelper
                )
                freshness = DataFreshnessStatus(
                    databasequeries=databasequeries,
                    email=email,
                    sheet=sheet,
                )
                # Check number of datasets hasn't dropped
                if not freshness.check_number_datasets(
                    now, send_failures=failure_emails
                ):
                    if email_test:  # send just to test users
                        test_users = [failure_emails[0]]
                        freshness.process_broken(recipients=test_users)
                        freshness.process_overdue(
                            recipients=test_users, sysadmins=test_users
                        )
                        freshness.process_delinquent(recipients=test_users)
                        freshness.process_maintainer_orgadmins(recipients=test_users)
                        freshness.process_datasets_noresources(recipients=test_users)
                        # freshness.process_datasets_dataset_date(
                        #     recipients=test_users,
                        #     sysadmins=test_users
                        # )
                        freshness.process_datasets_datagrid(recipients=test_users)
                    else:
                        freshness.process_broken()  # Check for broken resources
                        freshness.process_overdue()  # Check for overdue datasets
                        freshness.process_delinquent()  # Check for delinquent datasets
                        # Check for datasets with invalid maintainer and organisations
                        # with invalid administrators
                        freshness.process_maintainer_orgadmins()
                        # Check for datasets with no resources
                        freshness.process_datasets_noresources()
                        # Check for datasets where the dataset date may need updating
                        # freshness.process_datasets_dataset_date(
                        #     sysadmins=test_users
                        # )
                        # Check for candidates for the data grid
                        freshness.process_datasets_datagrid()
    logger.info("Freshness emailer completed!")
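
# Usage sketch only: a hypothetical call showing the argument formats described
# in the docstring above. Every host, credential and address is a placeholder,
# not real configuration; no_spreadsheet is set because no Google Sheets
# authorisation is supplied.
main(
    db_url="postgresql://freshness:changeme@localhost:5432/freshness",
    email_server="ssl,smtp.example.org,465,mailer,changeme,noreply@example.org",
    failure_emails="ops@example.org",
    sysadmin_emails="sysadmin1@example.org,sysadmin2@example.org",
    no_spreadsheet=True,
)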
def test_args_to_dict(self):
    args = "a=1,big=hello,1=3"
    assert args_to_dict(args) == {"a": "1", "big": "hello", "1": "3"}
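
# The test above pins down the comma-and-equals format that the db_params
# arguments in the entry points rely on. A minimal sketch of an equivalent
# parser (the real args_to_dict presumably lives in HDX's Python utilities;
# this stand-in is for illustration only):
def args_to_dict_sketch(args: str) -> dict:
    # "a=1,big=hello" -> {"a": "1", "big": "hello"}; values stay strings,
    # so callers cast ports etc. themselves, as in the pymysql main above.
    result = {}
    for arg in args.split(","):
        key, value = arg.split("=", 1)
        result[key] = value
    return result

assert args_to_dict_sketch("a=1,big=hello,1=3") == {"a": "1", "big": "hello", "1": "3"}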