def _caller(*args, **kwargs):
    source_name = func.__module__.split('.')[-1]
    country = args[0] if (len(args) > 0 and isinstance(args[0], Country)) else None
    try:
        with transaction.atomic():
            return func(*args, **kwargs)
    except Exception:
        # Log error to cronjob
        CronJob.sync_cron({
            'name': source_name,
            'message': (
                f'Error querying {source_name}.'
                + (f' For Country: {country}.' if country else '')
                + '\n\n'
                + traceback.format_exc()
            ),
            'status': CronJobStatus.ERRONEOUS,
        })
        logger.error(
            f'Failed to load <{source_name}:{func.__name__}>'
            + (f' For Country: {country}' if country else '')
            + (f' {error_message}' if error_message else ''),
            exc_info=True,
        )
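# A minimal sketch (not from the source) of the decorator factory that would produce the
# `_caller` wrapper above: `_caller` closes over `func` and `error_message`, so it is the
# inner function of a two-level decorator. The name `catch_error` is an assumption used
# purely for illustration.
import functools

def catch_error(error_message=None):  # hypothetical factory name
    def _dec(func):
        @functools.wraps(func)
        def _caller(*args, **kwargs):
            ...  # body as defined above
        return _caller
    return _dec

# Usage sketch (hypothetical source loader):
# @catch_error('Could not load FDRS data')
# def load(country, overview, fdrs_data):
#     ...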
def handle(self, *args, **options):
    logger.info('Starting appeals ingest')
    new, modified, bilaterals = self.get_new_or_modified_appeals()
    logger.info('%s current appeals' % Appeal.objects.all().count())
    logger.info('Creating %s new appeals' % len(new))
    logger.info('Updating %s existing appeals that have been modified' % len(modified))

    num_created = 0
    for i, r in enumerate(new):
        fields = self.parse_appeal_record(r, is_new_appeal=True)
        if fields['code'] in bilaterals:  # correction of the appeal record due to appealbilaterals api
            fields['amount_funded'] += round(bilaterals[fields['code']], 1)
        try:
            Appeal.objects.create(**fields)
        except Exception as e:
            logger.error(str(e)[:100])
            logger.error('Could not create appeal with code %s' % fields['code'])
            continue
        num_created = num_created + 1

    num_updated = 0
    for i, r in enumerate(modified):
        fields = self.parse_appeal_record(r, is_new_appeal=False)
        if fields['code'] in bilaterals:  # correction of the appeal record due to appealbilaterals api
            fields['amount_funded'] += round(bilaterals[fields['code']], 1)
        try:
            appeal, created = Appeal.objects.update_or_create(code=fields['code'], defaults=fields)
        except Exception as e:
            logger.error(str(e)[:100])
            logger.error('Could not update appeal with code %s' % fields['code'])
            continue
        num_updated = num_updated + 1

    CronJobSum = Appeal.objects.all().count()
    logger.info('%s appeals created' % num_created)
    logger.info('%s appeals updated' % num_updated)
    logger.info('%s total appeals' % CronJobSum)
    logger.info('Appeals ingest completed')
    body = {
        "name": "ingest_appeals",
        "message": 'Appeals ingest completed, %s total appeals (%s new, %s existing).' % (CronJobSum, num_created, num_updated),
        "num_result": CronJobSum,
        "status": CronJobStatus.SUCCESSFUL
    }
    CronJob.sync_cron(body)
def prefetch():
    data = {}
    url = API_ENDPOINT
    page = 1
    now = datetime.datetime.now()
    daterange = f'{now.year - 10}:{now.year}'
    while True:
        # TODO: lastupdated
        rs = requests.get(f'{url}?date={daterange}', params={
            'format': 'json',
            'source': 50,
            'per_page': 5000 - 1,  # the World Bank API throws an error on 5000
            'page': page,
        })
        if rs.status_code != 200:
            body = {
                "name": "WB",
                "message": "Error querying WorldBank feed at " + url,
                "status": CronJobStatus.ERRONEOUS
            }  # not every case is caught here, e.g. if the base URL is wrong...
            CronJob.sync_cron(body)
            return data
        rs = rs.json()

        for pop_data in rs[1]:
            geo_code = pop_data['country']['id']
            pop = pop_data['value']
            year = pop_data['date']
            if len(geo_code) == 3:  # Admin Level 0
                pcountry = get_country_by_iso3(geo_code)
                if pcountry is None:
                    continue
                geo_id = pcountry.alpha_2
            else:  # Should be Admin Level 1
                # NOTE: District code's structure is <ISO2>_<Number>, so using ISO2
                geo_code = geo_code[-6:]
                pcountry = get_country_by_iso3(geo_code[:3])
                if pcountry is None:
                    continue
                iso2 = pcountry.alpha_2
                geo_id = f'{iso2}{geo_code[3:]}'
            geo_id = geo_id.upper()
            if data.get(geo_id) is None or data.get(geo_id)[1] < year:
                data[geo_id] = (pop, year)

        if page >= rs[0]['pages']:
            break
        page += 1

    body = {
        "name": "WB",
        "message": "Done querying WorldBank feed at " + url,
        "num_result": len(data),
        "status": CronJobStatus.SUCCESSFUL
    }
    CronJob.sync_cron(body)
    return data
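# A minimal sketch (not from the source) of the mapping prefetch() builds above: keys are
# upper-cased ISO2 country codes (Admin Level 0) or "<ISO2>_<district-number>" codes
# (Admin Level 1), values are (population, year) tuples. All figures are placeholders.
example_wb_population = {
    'HU': (9750000, '2019'),      # country-level entry (hypothetical values)
    'HU_01': (1750000, '2019'),   # district-level entry (hypothetical values)
}
population, year = example_wb_population['HU']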
def _crises_event_prefetch():
    query_params = json.dumps({
        'limit': 1000,
        'filter': {
            'operator': 'AND',
            'conditions': [
                {
                    'field': 'primary_type.code',
                    'value': [type_code for type_code, _ in PastCrisesEvent.CHOICES],
                    'operator': 'OR'
                }
            ]
        },
        'fields': {
            'include': ['date.created', 'primary_country.iso3', 'primary_type.code']
        }
    })

    url = DISASTER_API
    data = {}
    while True:
        response = requests.post(url, data=query_params)
        if response.status_code != 200:
            body = {
                "name": "RELIEFWEB",
                "message": "Error querying ReliefWeb crisis event feed at " + url,
                "status": CronJobStatus.ERRONEOUS
            }  # not every case is caught here, e.g. if the base URL is wrong...
            CronJob.sync_cron(body)
            return data
        response = response.json()

        for disaster in response['data']:
            disaster = disaster['fields']
            iso3 = disaster['primary_country']['iso3'].upper()
            pcountry = get_country_by_iso3(iso3)
            if pcountry is None:
                continue
            iso2 = pcountry.alpha_2
            dt = parse_date(disaster['date']['created'])
            disaster_data = {
                'event': disaster['primary_type']['code'],
                'year': dt.year,
                'month': dt.month,
            }
            if data.get(iso2) is None:
                data[iso2] = [disaster_data]
            else:
                data[iso2].append(disaster_data)

        if 'next' not in response['links']:
            break
        url = response['links']['next']['href']
    return data
def load(self):
    """
    Load data for Databank from specified sources
    """
    source_prefetch_data = {}

    # Prefetch Data
    print('\nPrefetching from sources:: ')
    for source, name in SOURCES:
        if hasattr(source, 'prefetch'):
            start = datetime.datetime.now()
            print(f'\t -> {name}', end='')
            source_prefetch_data[source.__name__] = source.prefetch()
            print(f' [{datetime.datetime.now() - start}]')

    # Load
    print('\nLoading Sources data into GO DB:: ')
    for source, name in SOURCES:
        if hasattr(source, 'global_load'):
            print(f'\t -> {name}', end='')
            start = datetime.datetime.now()
            source.global_load(source_prefetch_data.get(source.__name__))
            print(f' [{datetime.datetime.now() - start}]')

    index, country_count = 1, Country.objects.count()
    print('\nLoading Sources data for each country to GO DB:: ')
    for country in Country.objects.prefetch_related('countryoverview').all():
        print(f'\t -> ({index}/{country_count}) {country}')
        overview = (
            country.countryoverview
            if hasattr(country, 'countryoverview')
            else CountryOverview.objects.create(country=country)
        )
        overview.script_modified_at = timezone.now()
        for source, name in SOURCES:
            if hasattr(source, 'load'):
                print(f'\t\t -> {name}', end='')
                # Load for each country
                source_data = source_prefetch_data.get(source.__name__)
                start = datetime.datetime.now()
                source.load(country, overview, source_data)
                print(f' [{datetime.datetime.now() - start}]')
        overview.save()
        index += 1
        if name == 'FTS_HPC':
            # This source can not be checked/logged via prefetch, that is why we do it here, after the "load".
            body = {
                "name": name,
                "message": "Done querying " + name + " data feeds",
                "num_result": index,
                "status": CronJobStatus.SUCCESSFUL
            }
            CronJob.sync_cron(body)
def prefetch():
    inform_data = {}

    response_d = requests.get(INFORM_API_ENDPOINT)
    if response_d.status_code != 200:
        # Because it happens too often, it is set to WARNED, but should be ERRONEOUS:
        body = {
            "name": "INFORM",
            "message": "Error querying Inform feed at " + INFORM_API_ENDPOINT,
            "status": CronJobStatus.WARNED
        }  # not every case is caught here, e.g. if the base URL is wrong...
        CronJob.sync_cron(body)
        return inform_data
    response_d = response_d.json()

    for index, i_data in enumerate(response_d):
        iso3 = i_data['Iso3']
        pcountry = get_country_by_iso3(iso3)
        if pcountry is None:
            continue
        indicator_id = i_data['IndicatorId']
        score = i_data['IndicatorScore']
        entry = {
            'id': index + 1,
            'indicator': indicator_id,
            'group': InformIndicator.get_group(indicator_id),
            'score': score,
            'indicator_display': InformIndicator.LABEL_MAP.get(indicator_id),
            'group_display': InformIndicator.get_group_display(indicator_id),
        }
        # Assuming indicator data are unique from the API
        if inform_data.get(pcountry.alpha_2) is None:
            inform_data[pcountry.alpha_2] = [entry]
        else:
            inform_data[pcountry.alpha_2].append(entry)

    body = {
        "name": "INFORM",
        "message": "Done querying Inform feed at " + INFORM_API_ENDPOINT,
        "num_result": len(inform_data),
        "status": CronJobStatus.SUCCESSFUL
    }
    CronJob.sync_cron(body)
    return inform_data
def prefetch():
    data = {}

    rs = requests.get(API_ENDPOINT)
    if rs.status_code != 200:
        body = {
            "name": "START_NETWORK",
            "message": "Error querying StartNetwork feed at " + API_ENDPOINT,
            "status": CronJobStatus.ERRONEOUS
        }  # not every case is caught here, e.g. if the base URL is wrong...
        CronJob.sync_cron(body)
        return data
    rs = rs.text.splitlines()

    CronJobSum = 0
    for row in csv.DictReader(rs):
        # Some values are like `Congo [DRC]`
        country = get_country_by_name(row['Country'].split('[')[0].strip())
        date = parse_alert_date(row['Alert date'])
        if country is None or date is None:
            continue
        iso2 = country.alpha_2
        alert_data = {
            'date': date.isoformat(),
            'alert': row['Alert'],
            'alert_type': row['Alert type'],
            'amount_awarded': parse_amount(row['Amount Awarded']),
            'crisis_type': row['Crisis Type'],
        }
        if data.get(iso2) is None:
            data[iso2] = [alert_data]
        else:
            data[iso2].append(alert_data)
        CronJobSum += 1

    body = {
        "name": "START_NETWORK",
        "message": "Done querying StartNetwork feed at " + API_ENDPOINT,
        "num_result": CronJobSum,
        "status": CronJobStatus.SUCCESSFUL
    }
    CronJob.sync_cron(body)
    return data
def run(self):
    try:
        server = smtplib.SMTP(settings.EMAIL_HOST, settings.EMAIL_PORT)
        server.ehlo()
        server.starttls()
        server.ehlo()
        succ = server.login(settings.EMAIL_USER, settings.EMAIL_PASS)
        if 'successful' not in str(succ[1]):
            cron_rec = {
                "name": "notification",
                "message": 'Error contacting ' + settings.EMAIL_HOST + ' smtp server for notifications',
                "status": CronJobStatus.ERRONEOUS
            }
            CronJob.sync_cron(cron_rec)
        if len(self.recipients) > 0:
            server.sendmail(settings.EMAIL_USER, self.recipients, self.msg.as_string())
        server.quit()
        logger.info('E-mails were sent successfully.')
    except Exception as exc:
        logger.error('Could not send emails with Python smtplib, exception: {} -- {}'.format(type(exc).__name__, exc.args))
        ex = ''
        try:
            ex = str(exc.args)
        except Exception as exctwo:
            logger.error(exctwo.args)
        cron_rec = {
            "name": "notification",
            "message": 'Error sending out email with Python smtplib: {}'.format(ex),
            "status": CronJobStatus.ERRONEOUS
        }
        CronJob.sync_cron(cron_rec)
def prefetch():
    fdrs_entities = requests.get(FDRS_NS_API_ENDPOINT, headers=FDRS_HEADERS)
    if fdrs_entities.status_code != 200:
        body = {
            "name": "FDRS",
            "message": "Error querying FDRS NS API feed at " + FDRS_NS_API_ENDPOINT,
            "status": CronJobStatus.ERRONEOUS
        }  # not every case is caught here, e.g. if the base URL is wrong...
        CronJob.sync_cron(body)
        return {}
    fdrs_entities = fdrs_entities.json()

    ns_iso_map = {
        # ISO3 codes are missing for some entries in FDRS, and IFRC-GO only has ISO2 for countries
        ns['KPI_DON_code']: ns['iso_2']
        for ns in fdrs_entities
    }

    body = {
        "name": "FDRS",
        "message": "Done querying FDRS NS API feed at " + FDRS_NS_API_ENDPOINT,
        "num_result": len(ns_iso_map),
        "status": CronJobStatus.SUCCESSFUL
    }
    CronJob.sync_cron(body)

    return {
        # KEY <ISO2>-<Indicator_ID>: {year: '', value: ''}
        f"{ns_iso_map[ns_data['id']].upper()}-{indicator_data['id']}": (
            ns_data['data'][-1] if (ns_data['data'] and len(ns_data['data']) > 0) else None
        )
        for indicator_data in requests.get(FDRS_DATA_API_ENDPOINT, headers=FDRS_HEADERS).json()['data']
        for ns_data in indicator_data['data']
    }
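# A minimal sketch (not from the source) of how the dict returned by prefetch() above is
# keyed: "<ISO2>-<Indicator_ID>", mapped to the latest {'year': ..., 'value': ...} entry
# from FDRS, or None. The indicator id "KPI_noPeopleVolunteering" is a hypothetical example.
fdrs_data = prefetch()
latest = fdrs_data.get('HU-KPI_noPeopleVolunteering')  # hypothetical key
if latest is not None:
    print(latest['year'], latest['value'])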
def handle(self, *args, **options):
    logger.info('Starting GDACs ingest')
    # get latest
    nspace = '{http://www.gdacs.org}'
    url = 'http://www.gdacs.org/xml/rss_7d.xml'
    response = requests.get(url)
    if response.status_code != 200:
        text_to_log = 'Error querying GDACS xml feed at ' + url
        logger.error(text_to_log)
        logger.error(response.content)
        body = {
            "name": "ingest_gdacs",
            "message": text_to_log,
            "status": CronJobStatus.ERRONEOUS
        }  # not every case is caught here, e.g. if the base URL is wrong...
        CronJob.sync_cron(body)
        raise Exception('Error querying GDACS')

    # get as XML
    xml2dict = XML2Dict()
    results = xml2dict.parse(response.content)
    levels = {'Orange': 1, 'Red': 2}
    added = 0
    for alert in results['rss']['channel']['item']:
        alert_level = alert['%salertlevel' % nspace].decode('utf-8')
        if alert_level in levels.keys():
            latlon = alert['{http://www.georss.org/georss}point'].decode('utf-8').split()
            eid = alert.pop(nspace + 'eventid')
            alert_score = alert[nspace + 'alertscore'] if (nspace + 'alertscore') in alert else None
            data = {
                'title': alert.pop('title'),
                'description': alert.pop('description'),
                'image': alert.pop('enclosure'),
                'report': alert.pop('link'),
                'publication_date': parse(alert.pop('pubDate')),
                'year': alert.pop(nspace + 'year'),
                'lat': latlon[0],
                'lon': latlon[1],
                'event_type': alert.pop(nspace + 'eventtype'),
                'alert_level': levels[alert_level],
                'alert_score': alert_score,
                'severity': alert.pop(nspace + 'severity'),
                'severity_unit': alert['@' + nspace + 'severity']['unit'],
                'severity_value': alert['@' + nspace + 'severity']['value'],
                'population_unit': alert['@' + nspace + 'population']['unit'],
                'population_value': alert['@' + nspace + 'population']['value'],
                'vulnerability': alert['@' + nspace + 'vulnerability']['value'],
                'country_text': alert.pop(nspace + 'country'),
            }

            # do some length checking
            for key in [
                'event_type', 'alert_score', 'severity_unit', 'severity_value',
                'population_unit', 'population_value'
            ]:
                if len(data[key]) > 16:
                    data[key] = data[key][:16]
            data = {k: v.decode('utf-8') if isinstance(v, bytes) else v for k, v in data.items()}

            gdacsevent, created = GDACSEvent.objects.get_or_create(eventid=eid, defaults=data)
            if created:
                added += 1
                for c in data['country_text'].split(','):
                    country = Country.objects.filter(name=c.strip())
                    if country.count() == 1:
                        gdacsevent.countries.add(country[0])

                title_elements = ['GDACS %s:' % alert_level]
                for field in ['country_text', 'event_type', 'severity']:
                    if data[field] is not None:
                        title_elements.append(str(data[field]))
                title = (' ').join(title_elements)
                # make sure we don't exceed the 100 character limit
                if len(title) > 97:
                    title = '%s...' % title[:97]

                fields = {
                    'name': title,
                    'summary': data['description'],
                    'disaster_start_date': data['publication_date'],
                    'auto_generated': True,
                    'auto_generated_source': SOURCES['gdacs'],
                    'ifrc_severity_level': data['alert_level'],
                }
                event = Event.objects.create(**fields)
                # add countries
                [event.countries.add(c) for c in gdacsevent.countries.all()]

    text_to_log = '%s GDACs events added' % added
    logger.info(text_to_log)
    body = {
        "name": "ingest_gdacs",
        "message": text_to_log,
        "num_result": added,
        "status": CronJobStatus.SUCCESSFUL
    }
    CronJob.sync_cron(body)
def handle(self, *args, **options):
    logger.info('Starting appeal document ingest')

    # v smoke test
    baseurl = 'https://www.ifrc.org/appeals/'  # no more ...en/publications-and-reports...
    # stackoverflow.com/questions/36516183/what-should-i-use-to-open-a-url-instead-of-urlopen-in-urllib3
    http = PoolManager()
    smoke_response = http.request('GET', baseurl)
    joy_to_the_world = False
    if smoke_response.status == 200:
        joy_to_the_world = True  # We log the success later, when we know the numeric results.
    else:
        body = {
            "name": "ingest_appeal_docs",
            "message": f'Error ingesting appeals_docs on url: {baseurl}, error_code: {smoke_response.status}',
            "status": CronJobStatus.ERRONEOUS
        }
        CronJob.sync_cron(body)
    # ^ smoke test

    if options['fullscan']:
        # If the `--fullscan` option is passed (at the end of command), check ALL appeals. Runs an hour!
        print('Doing a full scan of all Appeals')
        qset = Appeal.objects.all()
    else:
        # By default, only check appeals from the past 6 months where Appeal Documents is 0
        now = datetime.now().replace(tzinfo=timezone.utc)
        six_months_ago = now - relativedelta(months=6)
        # This was the original qset, but it wouldn't get newer docs for the same Appeals
        # qset = Appeal.objects.filter(appealdocument__isnull=True).filter(end_date__gt=six_months_ago)
        qset = Appeal.objects.filter(end_date__gt=six_months_ago)
        # qset = Appeal.objects.filter(code='Something')  # could help debug

    # First get all Appeal Codes
    appeal_codes = [a.code for a in qset]

    # Modify code taken from https://pastebin.com/ieMe9yPc to scrape `publications-and-reports` and find
    # Documents for each appeal code
    output = []
    page_not_found = []
    for code in appeal_codes:
        code = code.replace(' ', '')
        docs_url = f'{baseurl}?appeal_code={code}'  # no more ac={code}&at=0&c=&co=&dt=1&f=&re=&t=&ti=&zo=
        try:
            http = PoolManager()
            response = http.request('GET', docs_url)
        except Exception:  # if we get an error fetching page for an appeal, we ignore it
            page_not_found.append(code)
            continue

        soup = BeautifulSoup(response.data, "lxml")
        div = soup.find('div', class_='row appeals-view__row')
        for t in div.findAll('tbody'):
            output = output + self.makelist(t)

    # Once we have all Documents in output, we add all missing Documents to the associated Appeal
    not_found = []
    existing = []
    created = []

    acodes = list(set([a['appealcode'] for a in output]))
    for code in acodes:
        try:
            appeal = Appeal.objects.get(code=code)
        except ObjectDoesNotExist:
            not_found.append(code)
            continue

        existing_docs = list(appeal.appealdocument_set.all())
        docs = [a for a in output if code == a['appealcode']]
        for doc in docs:
            if doc['url'].startswith('/'):  # can be /docs or /sites also
                # href only contains the relative path to the document if it's available at the ifrc.org site
                doc['url'] = f'https://www.ifrc.org{doc["url"]}'
            exists = len([a for a in existing_docs if a.document_url == doc['url']]) > 0
            if exists:
                existing.append(doc['url'])
            else:
                try:
                    created_at = self.parse_date(doc['date'])
                except Exception:
                    created_at = None
                AppealDocument.objects.create(
                    document_url=doc['url'],
                    name=doc['appealtype'],  # not ['name'], because that is the appeal's name
                    created_at=created_at,
                    appeal=appeal,
                )
                created.append(doc['url'])

    text_to_log = []
    text_to_log.append('%s appeal documents created' % len(created))
    text_to_log.append('%s existing appeal documents' % len(existing))
    text_to_log.append('%s pages not found for appeal' % len(page_not_found))
    for t in text_to_log:
        logger.info(t)
        # body = { "name": "ingest_appeal_docs", "message": t, "status": CronJobStatus.SUCCESSFUL }
        # CronJob.sync_cron(body)

    if len(not_found):
        t = '%s documents without appeals in system' % len(not_found)
        logger.warning(t)
        body = {
            "name": "ingest_appeal_docs",
            "message": t,
            "num_result": len(not_found),
            "status": CronJobStatus.WARNED
        }
        CronJob.sync_cron(body)

    if joy_to_the_world:
        body = {
            "name": "ingest_appeal_docs",
            "message": (
                f'Done ingesting appeals_docs on url {baseurl},'
                f' {len(created)} appeal document(s) were created,'
                f' {len(existing)} already exist,'
                f' {len(page_not_found)} not found'
            ),
            "num_result": len(created),
            "status": CronJobStatus.SUCCESSFUL
        }
        CronJob.sync_cron(body)
def load(self):
    """
    Load data for Databank from specified sources
    """
    source_prefetch_data = {}

    # Prefetch Data
    try:
        print('\nPrefetching from sources:: ')
        for source, name in SOURCES:
            if hasattr(source, 'prefetch'):
                start = datetime.datetime.now()
                print(f'\t -> {name}', end='')
                prefetch_response = source.prefetch()
                if prefetch_response is not None:
                    source_prefetch_data[source.__name__], item_count, sources = prefetch_response
                    # Log successful prefetch
                    CronJob.sync_cron({
                        'name': name,
                        'message': (
                            f'Done querying {name}'
                            + (f' using sources: {sources}' if sources else '')
                        ),
                        'num_result': item_count,
                        'status': CronJobStatus.SUCCESSFUL,
                    })
                print(f' [{datetime.datetime.now() - start}]')
    except Exception as ex:
        CronJob.sync_cron({
            'name': 'ingest_databank',
            'message': f'Could not prefetch from sources\n\nException:\n{str(ex)}',
            'status': CronJobStatus.ERRONEOUS,
        })

    # Load
    try:
        print('\nLoading Sources data into GO DB:: ')
        for source, name in SOURCES:
            if hasattr(source, 'global_load'):
                print(f'\t -> {name}', end='')
                start = datetime.datetime.now()
                source.global_load(source_prefetch_data.get(source.__name__))
                print(f' [{datetime.datetime.now() - start}]')

        index, country_count = 1, Country.objects.count()
        print('\nLoading Sources data for each country to GO DB:: ')
        for country in Country.objects.prefetch_related('countryoverview').all():
            print(u'\t -> ({}/{}) {}'.format(index, country_count, str(country)))
            overview = (
                country.countryoverview
                if hasattr(country, 'countryoverview')
                else CountryOverview.objects.create(country=country)
            )
            overview.script_modified_at = timezone.now()
            for source, name in SOURCES:
                if hasattr(source, 'load'):
                    print(f'\t\t -> {name}', end='')
                    # Load for each country
                    source_data = source_prefetch_data.get(source.__name__)
                    start = datetime.datetime.now()
                    source.load(country, overview, source_data)
                    print(f' [{datetime.datetime.now() - start}]')
            overview.save()
            index += 1
            # This source can not be checked/logged via prefetch, that is why we do it here, after the "load".
            if name == 'FTS_HPC':
                CronJob.sync_cron({
                    'name': name,
                    'message': f'Done querying {name} data feeds',
                    'num_result': index,
                    "status": CronJobStatus.SUCCESSFUL,
                })
    except Exception as ex:
        CronJob.sync_cron({
            'name': 'ingest_databank',
            'message': f'Could not load all data\n\nException:\n{str(ex)}',
            'status': CronJobStatus.ERRONEOUS,
        })
def send_notification(subject, recipients, html, mailtype='', files=None):
    """ Generic email sending method, currently handling only HTML emails """
    if not settings.EMAIL_USER or not settings.EMAIL_API_ENDPOINT:
        logger.warning(
            'Cannot send notifications.\n'
            'No username and/or API endpoint set as environment variables.')
        if settings.DEBUG:
            print('-' * 22, 'EMAIL START', '-' * 22)
            print(f'subject={subject}\nrecipients={recipients}\nhtml={html}\nmailtype={mailtype}')
            print('-' * 22, 'EMAIL END -', '-' * 22)
        return
    if settings.DEBUG_EMAIL:
        print('-' * 22, 'EMAIL START', '-' * 22)
        print(f'\n{html}\n')
        print('-' * 22, 'EMAIL END -', '-' * 22)

    # If it's not PROD, we are only able to use the test e-mail addresses which are set in the env var
    to_addresses = recipients if isinstance(recipients, list) else [recipients]

    if not IS_PROD:
        logger.info('Using test email addresses...')
        to_addresses = []
        logger.info(to_addresses)
        for eml in settings.TEST_EMAILS:
            # It is possible to filter test addressees to domain name only – not used.
            is_dom = True if '@' not in eml else False
            if is_dom:
                for rec in recipients:
                    try:
                        if eml == rec.split('@')[1]:
                            to_addresses.append(rec)
                    except Exception:
                        logger.info('Could not extract domain from: {}'.format(rec))
            elif eml and (eml in recipients):
                to_addresses.append(eml)

    recipients_as_string = ','.join(to_addresses)

    if not recipients_as_string:
        if len(to_addresses) > 0:
            warn_msg = 'Recipients failed to be converted to string, 1st rec.: {}'.format(to_addresses[0])
            logger.info(warn_msg)
            # Save the warning into the CronJob logs too
            cron_error = {
                "name": "index_and_notify",
                "message": warn_msg,
                "status": CronJobStatus.WARNED
            }
            CronJob.sync_cron(cron_error)
        else:
            logger.info('Recipients string is empty')
        return  # If there are no recipients it's unnecessary to send out the email

    # Encode with base64 into bytes, then convert back to strings for the JSON payload
    payload = {
        "FromAsBase64": str(base64.b64encode(settings.EMAIL_USER.encode('utf-8')), 'utf-8'),
        "ToAsBase64": str(base64.b64encode(EMAIL_TO.encode('utf-8')), 'utf-8'),
        "CcAsBase64": "",
        "BccAsBase64": str(base64.b64encode(recipients_as_string.encode('utf-8')), 'utf-8'),
        "SubjectAsBase64": str(base64.b64encode(subject.encode('utf-8')), 'utf-8'),
        "BodyAsBase64": str(base64.b64encode(html.encode('utf-8')), 'utf-8'),
        "IsBodyHtml": True,
        "TemplateName": "",
        "TemplateLanguage": ""
    }

    # The response contains the GUID (res.text)
    res = requests.post(settings.EMAIL_API_ENDPOINT, json=payload)
    res_text = res.text.replace('"', '')

    if res.status_code == 200:
        logger.info(u'Subject: {subject}, Recipients: {recs}'.format(subject=subject, recs=recipients_as_string))
        logger.info('GUID: {}'.format(res_text))
        # Save the GUID into a table so that the API can be queried with it to get info about
        # whether the actual sending has failed or not.
        NotificationGUID.objects.create(
            api_guid=res_text,
            email_type=mailtype,
            to_list=f'To: {EMAIL_TO}; Bcc: {recipients_as_string}')
        logger.info('E-mails were sent successfully.')
    elif res.status_code == 401 or res.status_code == 403:
        # Try sending with Python smtplib, if reaching the API fails
        logger.error(f'Authorization/authentication failed ({res.status_code}) to the e-mail sender API.')
        msg = construct_msg(subject, html, files)
        SendMail(to_addresses, msg).start()
    else:
        # Try sending with Python smtplib, if reaching the API fails
        logger.error('Could not reach the e-mail sender API. Trying with Python smtplib...')
        msg = construct_msg(subject, html, files)
        SendMail(to_addresses, msg).start()

    return res.text
def handle(self, *args, **options):
    guids = [
        e.auto_generated_source
        for e in Event.objects.filter(auto_generated_source__startswith='www.who.int')
    ]
    logger.info('Querying WHO RSS feed for new emergency data')
    # get latest
    nspace = '{https://www.who.int}'
    ur2 = []
    ur2.append('https://www.who.int/feeds/entity/csr/don/en/rss.xml')
    ur2.append('https://www.who.int/feeds/entity/hac/en/rss.xml')

    for index, url in enumerate(ur2):
        response = requests.get(url)
        if response.status_code != 200:
            text_to_log = 'Error querying WHO xml feed at ' + url
            logger.error(text_to_log)
            logger.error(response.content)
            body = {
                "name": "ingest_who",
                "message": text_to_log,
                "status": CronJobStatus.ERRONEOUS
            }  # not every case is caught here, e.g. if the base URL is wrong...
            CronJob.sync_cron(body)
            raise Exception('Error querying WHO')

        # get as XML, but do not use the obsolete xml2dict = XML2Dict(); use xmltodict instead
        results = xmltodict.parse(response.content)
        added = 0
        # lastBuildDate = results['rss']['channel']['lastBuildDate']
        # managingEditor = results['rss']['channel']['managingEditor']
        for row in results['rss']['channel']['item']:
            data = {
                'title': row.pop('title'),
                'link': row.pop('link'),
                'description': row.pop('description'),
                'guid': row['guid']['#text'],
                'isPermaLink': row['guid']['@isPermaLink'],
                'category': row.pop('category'),
                'pubDate': row.pop('pubDate'),
            }
            if data['guid'] in guids:
                continue
            if data['guid'] in ['WeDontWantThis', 'NeitherThis']:
                continue

            title = data['title']
            # for csr link
            short = title.replace(' (ex-China)', '')
            pos = short.find(' – ')
            region = None
            country = None
            if pos == -1:
                pos = short.find(' - ')
            if pos > 0:
                country = short[pos + 3:]  # cutting the part after " – " or " - "
            else:
                country = 'DashNotFoundInTitle'
            if country == 'Democratic Republic of the Congo':  # replacement
                country = 'Congo, Dem. Rep.'
            elif country == 'Argentine Republic':
                country = 'Argentina'
            elif country == 'Republic of Panama':
                country = 'Panama'
            elif country == 'Islamic Republic of Pakistan':
                country = 'Pakistan'
            elif country[:4] == 'the ':
                country = country[4:]
            elif index == 1:  # for 'hac' category. See link for 'hac' above
                hac_category = data['category']

                # Searching for the given country
                end = hac_category.find('[country]')
                if end > 0:
                    start = hac_category[:end - 1].rfind(',', 0)  # backwards search for the comma
                    country = hac_category[start + 2:end - 1]  # the part following the comma from the category, as Country
                else:
                    country = 'CountryNotFoundInCategory'  # Will not be found via filtering

                # Searching for the given region
                end = hac_category.find('[region]')
                if end > 0:
                    start = hac_category[:end - 1].rfind(',', 0)  # backwards search for the comma
                    region_name = hac_category[start + 2:end - 1]  # the part following the comma from the category, as Region
                    # Keep synchronised with https://github.com/IFRCGo/go-api/blob/master/api/models.py#L38-L42
                    if 'Afr' in region_name:
                        region = 0
                    elif 'Ame' in region_name:
                        region = 1
                    elif 'As' in region_name:
                        region = 2
                    elif 'Eu' in region_name:
                        region = 3
                    elif 'MENA' in region_name:
                        region = 4
                    else:
                        # search for region that is joined to country (later)...
                        region = None

            # make sure we don't exceed the 100 character limit
            if len(title) > 99:
                title = '%s...' % title[:99]

            date = parse(data['pubDate'])

            if data['category'] == 'news':
                alert_level = 1
            else:
                alert_level = 2
            if "Ebola" in title or "virus" in title or "fever" in title:
                alert_level = 2
            elif index == 1:
                alert_level = 0

            if data['category'] == 'news':
                summary = data['description']
            else:
                summary = data['description'] + ' (' + data['category'] + ')'

            fields = {
                'name': title,
                'summary': summary,
                'disaster_start_date': date,
                'auto_generated': True,
                'auto_generated_source': data['guid'],
                'ifrc_severity_level': alert_level,
            }
            # TODO: fields['name'] sometimes exceeds the 100 maxlength, so it will need some altering if this is used
            event = Event.objects.create(**fields)
            added += 1

            # add country
            country_found = Country.objects.filter(name=country.strip())
            if country_found.count() >= 1:
                event.countries.add(country_found[0])
            else:
                country_word_list = country.split()  # list of country words
                # Search only for the last word, like "Republic of Panama" > "Panama"
                country_found = Country.objects.filter(name=country_word_list[-1].strip())
                if country_found.count() >= 1:
                    event.countries.add(country_found[0])

            # add region
            # print(country)
            if (region is None) and (country_found.count() > 0) and (country != 'CountryNotFoundInCategory'):
                region = country_found[0].region_id
            if region is not None:
                event.regions.add(region)

        text_to_log = "{} WHO messages added, URL-{}".format(added, index + 1)
        logger.info(text_to_log)

        # Database CronJob logging
        body = {
            "name": "ingest_who",
            "message": text_to_log,
            "num_result": added,
            "storing_days": 6,
            "status": CronJobStatus.SUCCESSFUL
        }
        # ... via API - not used from here, but from the front-end it can be useful:
        # resp = requests.post(api_url + '/api/v2/add_cronjob_log/', body, headers={'CONTENT_TYPE': 'application/json'})
        # ... via a direct write-in:
        CronJob.sync_cron(body)
def handle(self, *args, **options):
    guids = [
        e.auto_generated_source
        for e in Event.objects.filter(auto_generated_source__startswith='www.who.int')
    ]
    logger.info('Querying WHO RSS feed for new emergency data')
    # get latest
    nspace = '{https://www.who.int}'
    ur2 = []
    ur2.append('https://www.who.int/feeds/entity/csr/don/en/rss.xml')
    ur2.append('https://www.who.int/feeds/entity/hac/en/rss.xml')

    for index, url in enumerate(ur2):
        response = requests.get(url)
        if response.status_code != 200:
            text_to_log = 'Error querying WHO xml feed at ' + url
            logger.error(text_to_log)
            logger.error(response.content)
            body = {
                "name": "ingest_who",
                "message": text_to_log,
                "status": CronJobStatus.ERRONEOUS
            }  # not every case is caught here, e.g. if the base URL is wrong...
            CronJob.sync_cron(body)
            raise Exception('Error querying WHO')

        # get as XML
        xml2dict = XML2Dict()
        results = xml2dict.parse(response.content)
        added = 0
        lastBuildDate = results['rss']['channel']['lastBuildDate']
        managingEditor = results['rss']['channel']['managingEditor']
        for row in results['rss']['channel']['item']:
            data = {
                'title': row.pop('title'),
                'link': row.pop('link'),
                'description': row.pop('description'),
                'guid': row.pop('guid'),
                # '@guid': row.pop('@guid'),  # can not be popped twice
                'isPermaLink': row.pop('@guid').pop('isPermaLink'),
                'category': row.pop('category'),
                'pubDate': row.pop('pubDate'),
            }
            if data['guid'].decode("utf-8") in guids:
                continue
            if data['guid'].decode("utf-8") in ['WeDontWantThis', 'NeitherThis']:
                continue

            title = data['title'].decode("utf-8")
            # for csr link
            short = title.replace(' (ex-China)', '')
            pos = short.find(' – ')
            region = None
            country = None
            if pos == -1:
                pos = short.find(' - ')
            if pos > 0:
                country = short[pos + 3:]  # cutting the part after " – " or " - "
            else:
                country = 'DashNotFoundInTitle'
            if country == 'Democratic Republic of the Congo':  # replacement
                country = 'Congo, Dem. Rep.'
            elif country == 'Argentine Republic':
                country = 'Argentina'
            elif country == 'Republic of Panama':
                country = 'Panama'
            elif country == 'Islamic Republic of Pakistan':
                country = 'Pakistan'
            elif index == 1:  # for 'hac' category. See link for 'hac' above
                hac_category = data['category'].decode("utf-8")

                # Searching for the given country
                end = hac_category.find('[country]')
                if end > 0:
                    start = hac_category[:end - 1].rfind(',', 0)  # backwards search for the comma
                    country = hac_category[start + 2:end - 1]  # the part following the comma from the category, as Country
                else:
                    country = 'CountryNotFoundInCategory'  # Will not be found via filtering

                # Searching for the given region
                end = hac_category.find('[region]')
                if end > 0:
                    start = hac_category[:end - 1].rfind(',', 0)  # backwards search for the comma
                    region_name = hac_category[start + 2:end - 1]  # the part following the comma from the category, as Region
                    # Keep synchronised with https://github.com/IFRCGo/go-api/blob/master/api/models.py#L38-L42
                    if 'Afr' in region_name:
                        region = 0
                    elif 'Ame' in region_name:
                        region = 1
                    elif 'As' in region_name:
                        region = 2
                    elif 'Eu' in region_name:
                        region = 3
                    elif 'MENA' in region_name:
                        region = 4
                    else:
                        # search for region that is joined to country (later)...
                        region = None

            # make sure we don't exceed the 100 character limit
            if len(title) > 99:
                title = '%s...' % title[:99]

            date = parse(data['pubDate'].decode("utf-8"))

            if data['category'].decode("utf-8") == 'news':
                alert_level = 1
            else:
                alert_level = 2
            if "Ebola" in title or "virus" in title or "fever" in title:
                alert_level = 2
            elif index == 1:
                alert_level = 0

            if data['category'].decode("utf-8") == 'news':
                summary = data['description'].decode("utf-8")
            else:
                summary = data['description'].decode("utf-8") + ' (' + data['category'].decode("utf-8") + ')'

            fields = {
                'name': title,
                'summary': summary,
                'disaster_start_date': date,
                'auto_generated': True,
                'auto_generated_source': data['guid'].decode("utf-8"),
                'ifrc_severity_level': alert_level,
            }
            # TODO: fields['name'] sometimes exceeds the 100 maxlength, so it will need some altering if this is used
            event = Event.objects.create(**fields)
            added += 1

            # add country
            country_found = Country.objects.filter(name=country.strip())
            if country_found.count() >= 1:
                event.countries.add(country_found[0])
            else:
                country_word_list = country.split()  # list of country words
                # Search only for the last word, like "Republic of Panama" > "Panama"
                country_found = Country.objects.filter(name=country_word_list[-1].strip())
                if country_found.count() >= 1:
                    event.countries.add(country_found[0])

            # add region
            # print(country)
            if (region is None) and (country_found.count() > 0) and (country != 'CountryNotFoundInCategory'):
                region = country_found[0].region_id
            if region is not None:
                event.regions.add(region)

        text_to_log = "{} WHO messages added, URL-{}".format(added, index + 1)
        logger.info(text_to_log)

        # Database CronJob logging
        body = {
            "name": "ingest_who",
            "message": text_to_log,
            "num_result": added,
            "storing_days": 6,
            "status": CronJobStatus.SUCCESSFUL
        }
        # ... via API - not used from here, but from the front-end it can be useful:
        # resp = requests.post(api_url + '/api/v2/add_cronjob_log/', body, headers={'CONTENT_TYPE': 'application/json'})
        # ... via a direct write-in:
        CronJob.sync_cron(body)
def handle(self, *args, **options):
    logger.info('Starting appeal document ingest')

    # v smoke test
    baseurl = 'https://www.ifrc.org/en/publications-and-reports/appeals/'
    smoke_response = urlopen(baseurl)
    joy_to_the_world = False
    if smoke_response.code == 200:
        joy_to_the_world = True  # We log the success later, when we know the numeric results.
    else:
        body = {
            "name": "ingest_appeal_docs",
            "message": 'Error ingesting appeals_docs on url ' + baseurl + ', error_code: ' + str(smoke_response.code),
            "status": CronJobStatus.ERRONEOUS
        }
        CronJob.sync_cron(body)
    # ^ smoke test

    if options['fullscan']:
        # If the `--fullscan` option is passed, check ALL appeals
        print('Doing a full scan of all Appeals')
        qset = Appeal.objects.all()
    else:
        # By default, only check appeals from the past 3 months where Appeal Documents is 0
        now = datetime.now()
        three_months_ago = now - relativedelta(months=3)
        # This was the original qset, but it wouldn't get newer docs for the same Appeals
        # qset = Appeal.objects.filter(appealdocument__isnull=True).filter(end_date__gt=three_months_ago)
        qset = Appeal.objects.filter(end_date__gt=three_months_ago)

    # First get all Appeal Codes
    appeal_codes = [a.code for a in qset]

    # Modify code taken from https://pastebin.com/ieMe9yPc to scrape `publications-and-reports` and find
    # Documents for each appeal code
    output = []
    page_not_found = []
    for code in appeal_codes:
        code = code.replace(' ', '')
        docs_url = baseurl + '?ac=' + code + '&at=0&c=&co=&dt=1&f=&re=&t=&ti=&zo='
        try:
            response = urlopen(docs_url)
        except Exception:  # if we get an error fetching page for an appeal, we ignore it
            page_not_found.append(code)
            continue

        soup = BeautifulSoup(response.read(), "lxml")
        div = soup.find('div', id='cw_content')
        for t in div.findAll('tbody'):
            output = output + self.makelist(t)

    # Once we have all Documents in output, we add all missing Documents to the associated Appeal
    not_found = []
    existing = []
    created = []

    acodes = list(set([a[2] for a in output]))
    for code in acodes:
        try:
            appeal = Appeal.objects.get(code=code)
        except ObjectDoesNotExist:
            not_found.append(code)
            continue

        existing_docs = list(appeal.appealdocument_set.all())
        docs = [a for a in output if a[2] == code]
        for doc in docs:
            # href only contains the relative path to the document if it's available at the ifrc.org site
            doc[0] = 'https://www.ifrc.org' + doc[0] if doc[0].startswith('/docs') else doc[0]
            exists = len([a for a in existing_docs if a.document_url == doc[0]]) > 0
            if exists:
                existing.append(doc[0])
            else:
                try:
                    created_at = self.parse_date(doc[5])
                except Exception:
                    created_at = None
                AppealDocument.objects.create(
                    document_url=doc[0],
                    name=doc[4],
                    created_at=created_at,
                    appeal=appeal,
                )
                created.append(doc[0])

    text_to_log = []
    text_to_log.append('%s appeal documents created' % len(created))
    text_to_log.append('%s existing appeal documents' % len(existing))
    text_to_log.append('%s pages not found for appeal' % len(page_not_found))
    for t in text_to_log:
        logger.info(t)
        # body = { "name": "ingest_appeal_docs", "message": t, "status": CronJobStatus.SUCCESSFUL }
        # CronJob.sync_cron(body)

    if len(not_found):
        t = '%s documents without appeals in system' % len(not_found)
        logger.warning(t)
        body = {
            "name": "ingest_appeal_docs",
            "message": t,
            "num_result": len(not_found),
            "status": CronJobStatus.WARNED
        }
        CronJob.sync_cron(body)

    if joy_to_the_world:
        body = {
            "name": "ingest_appeal_docs",
            "message": (
                'Done ingesting appeals_docs on url ' + baseurl +
                ', %s appeal document(s) were created, %s already exist, %s not found'
                % (len(created), len(existing), len(page_not_found))
            ),
            "num_result": len(created),
            "status": CronJobStatus.SUCCESSFUL
        }
        CronJob.sync_cron(body)
def load(country, overview, _):
    pcountry = get_country_by_iso2(country.iso)
    if pcountry is None:
        return
    fts_data = requests.get(FTS_URL.format(pcountry.alpha_3), headers=HEADERS)
    emg_data = requests.get(EMERGENCY_URL.format(pcountry.alpha_3), headers=HEADERS)

    if fts_data.status_code != 200:
        body = {
            "name": "FTS_HPC",
            "message": "Error querying HPC fts data feed at " + FTS_URL,
            "status": CronJobStatus.ERRONEOUS
        }  # not every case is caught here, e.g. if the base URL is wrong...
        CronJob.sync_cron(body)
        return {}
    if emg_data.status_code != 200:
        body = {
            "name": "FTS_HPC",
            "message": "Error querying HPC emergency data feed at " + EMERGENCY_URL,
            "status": CronJobStatus.ERRONEOUS
        }  # not every case is caught here, e.g. if the base URL is wrong...
        CronJob.sync_cron(body)
        return {}

    fts_data = fts_data.json()
    emg_data = emg_data.json()

    c_data = {}

    # fundingTotals, pledgeTotals
    for fund_area in ['fundingTotals', 'pledgeTotals']:
        fund_area_data = fts_data['data']['report3'][fund_area]['objects']
        if len(fund_area_data) > 0:
            for v in fund_area_data[0]['objectsBreakdown']:
                try:
                    year = int(v['name'])
                    totalFunding = v['totalFunding']
                except ValueError:
                    continue
                if year not in c_data:
                    c_data[year] = {fund_area: totalFunding}
                else:
                    c_data[year][fund_area] = totalFunding

    # numActivations
    CronJobSum = 0
    for v in emg_data['data']:
        try:
            year = datetime.datetime.strptime(v['date'].split('T')[0], '%Y-%m-%d').year
        except ValueError:
            continue
        if year not in c_data:
            c_data[year] = {'numActivations': 1}
        else:
            c_data[year]['numActivations'] = c_data[year].get('numActivations', 0) + 1
        CronJobSum += c_data[year]['numActivations']

    overview.fts_data = [
        {
            'year': year,
            **values,
        }
        for year, values in c_data.items()
    ]
    overview.save()
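# A minimal sketch (not from the source) of the structure written to overview.fts_data
# above: one dict per year, combining whichever of fundingTotals / pledgeTotals /
# numActivations were found for that year. All values are placeholders.
example_fts_data = [
    {'year': 2018, 'fundingTotals': 1000000, 'pledgeTotals': 250000, 'numActivations': 2},
    {'year': 2019, 'numActivations': 1},
]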
def get_new_or_modified_appeals(self):
    use_local_file = True if os.getenv('DJANGO_DB_NAME') == 'test' and os.path.exists('appeals.json') else False
    new = []
    modified = []
    if use_local_file:
        # read from static file for development
        logger.info('Using local appeals.json file')
        with open('appeals.json') as f:
            modified = json.loads(f.read())
        logger.info('Using local appealbilaterals.json file')
        with open('appealbilaterals.json') as f:
            records = json.loads(f.read())
        bilaterals = {}
        for r in records:  # code duplication ¤
            if r['APP_Code'] and r['AmountCHF']:
                if r['APP_Code'] in bilaterals.keys():
                    bilaterals[r['APP_Code']] += r['AmountCHF']
                else:
                    bilaterals[r['APP_Code']] = r['AmountCHF']
    else:
        # get latest BILATERALS
        logger.info('Querying appeals API for new appeals data')
        url = 'http://go-api.ifrc.org/api/appealbilaterals'
        auth = (os.getenv('APPEALS_USER'), os.getenv('APPEALS_PASS'))
        response = requests.get(url, auth=auth)
        if response.status_code != 200:
            text_to_log = 'Error querying AppealBilaterals API at ' + url
            logger.error(text_to_log)
            logger.error(response.content)
            body = {
                "name": "ingest_appeals",
                "message": text_to_log,
                "status": CronJobStatus.ERRONEOUS
            }  # not every case is caught here, e.g. if the base URL is wrong...
            CronJob.sync_cron(body)
            raise Exception(text_to_log)

        records = response.json()

        # write the current record file to local disk
        with open('appealbilaterals.json', 'w') as outfile:
            json.dump(records, outfile)

        bilaterals = {}
        for r in records:  # code duplication ¤
            if r['APP_Code'] and r['AmountCHF']:
                if r['APP_Code'] in bilaterals.keys():
                    bilaterals[r['APP_Code']] += r['AmountCHF']
                else:
                    bilaterals[r['APP_Code']] = r['AmountCHF']

        # get latest APPEALS
        logger.info('Querying appeals API for new appeals data')
        url = 'http://go-api.ifrc.org/api/appeals'
        auth = (os.getenv('APPEALS_USER'), os.getenv('APPEALS_PASS'))
        response = requests.get(url, auth=auth)
        if response.status_code != 200:
            logger.error('Error querying Appeals API')
            raise Exception('Error querying Appeals API')
        records = response.json()

        # write the current record file to local disk
        with open('appeals.json', 'w') as outfile:
            json.dump(records, outfile)

        codes = [a.code for a in Appeal.objects.all()]
        for r in records:
            # Temporary filtering, the manual version should be kept:
            if r['APP_code'] in ['MDR65002', 'MDR00001', 'MDR00004']:
                continue
            # if r['APP_code'] != 'MDRMZ014':  # Debug to test bilateral additions or other specific appeals
            #     continue
            if not r['APP_code'] in codes:
                new.append(r)
            # We use all records, do NOT check if last_modified > since_last_checked
            modified.append(r)

    return new, modified, bilaterals
def _epidemics_prefetch():
    query_params = json.dumps({
        'limit': 1000,
        'filter': {
            'operator': 'AND',
            'conditions': [
                {
                    'field': 'primary_type.code',
                    'value': ['EP'],
                },
            ]
        },
        'fields': {
            'include': ['name', 'date.created', 'primary_country.iso3']
        }
    })

    url = DISASTER_API
    data = {}
    while True:
        response = requests.post(url, data=query_params)
        if response.status_code != 200:
            body = {
                "name": "RELIEFWEB",
                "message": "Error querying ReliefWeb epidemics feed at " + url,
                "status": CronJobStatus.ERRONEOUS
            }  # not every case is caught here, e.g. if the base URL is wrong...
            CronJob.sync_cron(body)
            return data
        response = response.json()

        for epidemic in response['data']:
            epidemic = epidemic['fields']
            iso3 = epidemic['primary_country']['iso3'].upper()
            pcountry = get_country_by_iso3(iso3)
            if pcountry is None:
                continue
            iso2 = pcountry.alpha_2
            dt = parse_date(epidemic['date']['created'])
            name = epidemic['name']

            selected_epidemic_type = None
            # Simple text search
            for epidemic_type, _ in PastEpidemic.CHOICES:
                if epidemic_type.lower() in name.lower():
                    selected_epidemic_type = epidemic_type
            if selected_epidemic_type is None:
                continue

            epidemic_data = {
                'epidemic': selected_epidemic_type,
                'year': dt.year,
                'month': dt.month,
            }
            if data.get(iso2) is None:
                data[iso2] = [epidemic_data]
            else:
                data[iso2].append(epidemic_data)

        if 'next' not in response['links']:
            break
        url = response['links']['next']['href']

    body = {
        "name": "RELIEFWEB",
        "message": "Done querying all ReliefWeb feeds at " + url,
        "num_result": len(data),
        "status": CronJobStatus.SUCCESSFUL
    }
    CronJob.sync_cron(body)
    return data