def handle(self, *args, **kwargs):
    """Import NCSO concessions, then report counts via logs and Slack."""
    self.vmpps = DMDVmpp.objects.values('nm', 'vppid')
    # These counters are incremented by the import_from_* methods below.
    self.counter = dict.fromkeys(
        ['new-and-matched', 'new-and-unmatched', 'changed', 'unchanged'], 0)

    self.import_from_archive()
    self.import_from_current()

    logger.info('New and matched: %s', self.counter['new-and-matched'])
    logger.info('New and unmatched: %s', self.counter['new-and-unmatched'])
    logger.info('Changed: %s', self.counter['changed'])
    logger.info('Unchanged: %s', self.counter['unchanged'])

    Client('dmd').upload_model(NCSOConcession)

    lines = ['Imported NCSO concessions']
    lines.append('New and matched: %s' % self.counter['new-and-matched'])
    lines.append('New and unmatched: %s' % self.counter['new-and-unmatched'])
    lines.append('Changed: %s' % self.counter['changed'])
    lines.append('Unchanged: %s' % self.counter['unchanged'])
    notify_slack('\n'.join(lines))
def run_task(task, year, month, **kwargs):
    """Run `task` for year/month, recording the outcome in TaskLog.

    Skips the task entirely if a successful run is already logged.  Extra
    keyword arguments are passed through to `task.run`.
    """
    already_succeeded = TaskLog.objects.filter(
        year=year,
        month=month,
        task_name=task.name,
        status=TaskLog.SUCCESSFUL,
    ).exists()
    if already_succeeded:
        # This task has already been run successfully
        return

    task_log = TaskLog.objects.create(
        year=year,
        month=month,
        task_name=task.name,
    )

    try:
        task.run(year, month, **kwargs)
        task_log.mark_succeeded()
    except:
        # We want to catch absolutely every error here, including things that
        # wouldn't be caught by `except Exception` (like `KeyboardInterrupt`),
        # since we want to log that the task didn't complete.
        import traceback
        task_log.mark_failed(formatted_tb=traceback.format_exc())
        notify_slack(
            'Importing data for {}_{} has failed when running {}.'.format(
                year, month, task.name))
        raise
def handle(self, *args, **kwargs):
    """Scrape Part VIIIA Drug Tariff spreadsheets from the NHSBSA site and
    import any months not already recorded in ImportLog.

    Posts a summary of what was (or wasn't) imported to Slack.
    """
    url = "https://www.nhsbsa.nhs.uk/pharmacies-gp-practices-and-appliance-contractors/drug-tariff/drug-tariff-part-viii/"
    rsp = requests.get(url)
    doc = bs4.BeautifulSoup(rsp.content, "html.parser")

    # calendar.month_abbr[0] is the empty string, so the index of an
    # abbreviation in this list is its month number.
    month_abbrs = [x.lower() for x in calendar.month_abbr]

    imported_months = []

    for a in doc.findAll("a", href=re.compile(r"Part%20VIIIA.+\.xlsx$")):
        # a.attrs['href'] typically has a filename part like
        # Part%20VIIIA%20September%202017.xlsx
        #
        # We split that into ['Part', 'VIIIA', 'September', '2017']
        base_filename = unquote(
            os.path.splitext(os.path.basename(a.attrs["href"]))[0])

        if base_filename == "Part VIIIA Nov 20 updated":
            # November 2020 has a different filename. In general we want to be
            # warned (through the scraper crashing) about updates (because we have
            # to delete all records for the month in question, and reimport) so
            # special-casing is appropriate here.
            year, month = "2020", 11
        else:
            words = re.split(r"[ -]+", base_filename)
            month_name, year = words[-2:]

            # We have seen the last token in `words` be "19_0". The year is
            # reported to us via Slack, so if we pull out some nonsense here we
            # *should* notice.
            # Raw string: "\d" in a non-raw literal is an invalid escape
            # sequence (DeprecationWarning now, an error in future Pythons).
            year = re.match(r"\d+", year).group()
            if len(year) == 2:
                year = "20" + year

            # We have seen the month be `September`, `Sept`, and `Sep`.
            month_abbr = month_name.lower()[:3]
            month = month_abbrs.index(month_abbr)

        date = datetime.date(int(year), month, 1)

        if ImportLog.objects.filter(category="tariff", current_at=date).exists():
            continue

        xls_url = urljoin(url, a.attrs["href"])
        xls_file = BytesIO(requests.get(xls_url).content)

        import_month(xls_file, date)
        imported_months.append((year, month))

    if imported_months:
        client = Client("dmd")
        client.upload_model(TariffPrice)

        for year, month in imported_months:
            msg = "Imported Drug Tariff for %s_%s" % (year, month)
            notify_slack(msg)
    else:
        msg = "Found no new tariff data to import"
        notify_slack(msg)
def run_all(year, month, under_test=False):
    """Run the full monthly import pipeline for year/month in order."""
    tasks = load_tasks()

    # Fetch/upload steps touch external services, so skip them under test.
    if not under_test:
        for task in tasks.by_type("manual_fetch"):
            run_task(task, year, month)
        for task in tasks.by_type("auto_fetch"):
            run_task(task, year, month)
        upload_all_to_storage(tasks)

    for task_type in ("convert", "import"):
        for task in tasks.by_type(task_type).ordered():
            run_task(task, year, month)

    # Derive the just-imported month from the prescribing data path, which
    # contains a YYYY_MM component.
    prescribing_path = tasks["convert_hscic_prescribing"].imported_paths()[-1]
    last_imported = re.findall(r"/(\d{4}_\d{2})/", prescribing_path)[0]

    for task in tasks.by_type("post_process").ordered():
        run_task(task, year, month, last_imported=last_imported)

    if not under_test:
        # Remove numbers.json files. These are created by check_numbers, and we want to
        # remove them after each import, since all numbers on the site will change with
        # the new data. We only want to remove these files after importing real data,
        # and not during an end-to-end run.
        check_numbers_glob = os.path.join(
            settings.CHECK_NUMBERS_BASE_PATH, "*", "numbers.json"
        )
        for path in glob.glob(check_numbers_glob):
            os.remove(path)

    TaskLog.objects.create(
        year=year, month=month, task_name="fetch_and_import", status=TaskLog.SUCCESSFUL
    )

    activity = random.choice(
        ["put the kettle on", "have a glass of wine", "get up and stretch"]
    )
    msg = """
Importing data for {}_{} complete!'

You should now:

* tweet about it
* run `sudo systemctl restart app.openprescribing.*.service`
* {}

(Details: https://github.com/ebmdatalab/openprescribing/wiki/Importing-data)
""".strip().format(year, month, activity)

    if not under_test:
        notify_slack(msg)
def handle(self, *args, **kwargs):
    """Scrape the NHSBSA Drug Tariff Part VIIIA page and import any months
    not already recorded in ImportLog, notifying Slack of the outcome."""
    url = 'https://www.nhsbsa.nhs.uk/pharmacies-gp-practices-and-appliance-contractors/drug-tariff/drug-tariff-part-viii/'
    rsp = requests.get(url)
    doc = bs4.BeautifulSoup(rsp.content, 'html.parser')

    # Element 0 of month_name/month_abbr is '', so list index == month number.
    month_names = [x.lower() for x in calendar.month_name]
    month_abbrs = [x.lower() for x in calendar.month_abbr]

    imported_months = []

    for a in doc.findAll('a', href=re.compile('Part%20VIIIA')):
        # a.attrs['href'] typically has a filename part like
        # Part%20VIIIA%20September%202017.xlsx
        #
        # We split that into ['Part', 'VIIIA', 'September', '2017']
        words = re.split(
            r'[ -]+',
            urllib.unquote(os.path.splitext(
                os.path.basename(a.attrs['href']))[0]))
        month_name, year = words[-2:]

        # We have seen the last token in `words` be "19_0". The year is
        # reported to us via Slack, so if we pull out some nonsense here we
        # *should* notice.
        # Raw string: '\d' in a non-raw literal is an invalid escape
        # sequence (DeprecationWarning now, an error in future Pythons).
        year = re.match(r'\d+', year).group()
        if len(year) == 2:
            year = "20" + year

        # Filenames use either the full month name or an abbreviation; try
        # the full name first.
        try:
            month = month_names.index(month_name.lower())
        except ValueError:
            month = month_abbrs.index(month_name.lower())

        date = datetime.date(int(year), month, 1)

        if ImportLog.objects.filter(
            category='tariff',
            current_at=date
        ).exists():
            continue

        xls_url = urljoin(url, a.attrs['href'])
        xls_file = StringIO(requests.get(xls_url).content)

        import_month(xls_file, date)
        imported_months.append((year, month))

    if imported_months:
        client = Client('dmd')
        client.upload_model(TariffPrice)

        for year, month in imported_months:
            msg = 'Imported Drug Tariff for %s_%s' % (year, month)
            notify_slack(msg)
    else:
        msg = 'Found no new tariff data to import'
        notify_slack(msg)
def run_all(year, month, under_test=False):
    """Run every pipeline task for year/month in dependency order."""
    tasks = load_tasks()

    # Fetch/upload steps hit external services; skip them when under test.
    if not under_test:
        for task in tasks.by_type('manual_fetch'):
            run_task(task, year, month)
        for task in tasks.by_type('auto_fetch'):
            run_task(task, year, month)
        upload_all_to_storage(tasks)

    for task_type in ('convert', 'import'):
        for task in tasks.by_type(task_type).ordered():
            run_task(task, year, month)

    # The prescribing data path contains a YYYY_MM component identifying
    # the month that was just imported.
    prescribing_path = tasks['import_hscic_prescribing'].imported_paths()[-1]
    last_imported = re.findall(r'/(\d{4}_\d{2})/', prescribing_path)[0]

    for task in tasks.by_type('post_process').ordered():
        if under_test and 'smoketest' in task.name:
            # Smoketests run against live site, so we should skip when running
            # under test
            continue
        run_task(task, year, month, last_imported=last_imported)

    TaskLog.objects.create(
        year=year,
        month=month,
        task_name='fetch_and_import',
        status=TaskLog.SUCCESSFUL,
    )

    activity = random.choice([
        'Put the kettle on',
        'Have a glass of wine',
        'Get up and stretch',
    ])
    msg = '''
Importing data for {}_{} complete!'

You should now:

* Tweet about it
* Commit the changes to the smoke tests
* {}

(Details: https://github.com/ebmdatalab/openprescribing/wiki/Importing-data)
'''.strip().format(year, month, activity)

    if not under_test:
        notify_slack(msg)
def handle(self, *args, **kwargs):
    """Run the end-to-end pipeline tests, reporting any failure to Slack.

    Refuses to run unless the e2etest Django settings are active, since the
    tests mutate data.
    """
    settings_module = os.environ['DJANGO_SETTINGS_MODULE']
    if settings_module != 'openprescribing.settings.e2etest':
        raise CommandError('Command must run with e2etest settings')
    try:
        run_end_to_end()
    except:
        # Catch everything (not just Exception) so any abort is reported
        # to Slack before the traceback propagates.
        import traceback
        notify_slack('End-to-end test failed:\n\n' + traceback.format_exc())
        raise
    notify_slack('Pipeline tests ran to completion')
def handle_case_c(self, ccg):
    # The CCG is still open but some practices have moved to another CCG
    name = self.ccg_to_name[ccg]

    if self.dry_run:
        self.stdout.write(
            "Some practices have left CCG and some currently in CCG are not active"
        )
        return

    msg = """
Practices have left CCG {} ({}) and some remaining practices are not active.
Check whether these inactive practices should have moved.
See instructions in handle_orphan_practices.py.
""".format(ccg, name).strip()
    notify_slack(msg)
def handle(self, *args, **kwargs):
    """Run the end-to-end pipeline tests and report the result to Slack.

    Requires the e2etest settings module; the BQ_NONCE seed is included in
    the Slack messages so failed runs can be correlated with their logs.
    """
    settings_module = os.environ['DJANGO_SETTINGS_MODULE']
    if settings_module != 'openprescribing.settings.e2etest':
        raise CommandError('Command must run with e2etest settings')
    try:
        run_end_to_end()
    except Exception:
        failure = 'End-to-end test failed (seed: %s)\n\n' % settings.BQ_NONCE
        failure += 'Check logs in /tmp/'
        notify_slack(failure)
        raise
    notify_slack(
        'Pipeline tests ran to completion (seed: %s)' % settings.BQ_NONCE)
def handle(self, *args, **kwargs):
    """Run the end-to-end tests; only valid under the e2etest settings."""
    expected = "openprescribing.settings.e2etest"
    if os.environ["DJANGO_SETTINGS_MODULE"] != expected:
        raise CommandError("Command must run with e2etest settings")
    try:
        run_end_to_end()
    except Exception:
        # Report the failure (with the seed, for log correlation) before
        # letting the exception propagate.
        notify_slack(
            "End-to-end test failed (seed: %s)\n\n" % settings.BQ_NONCE
            + "Check logs in /tmp/"
        )
        raise
    notify_slack(
        "Pipeline tests ran to completion (seed: %s)" % settings.BQ_NONCE)
def handle(self, *args, **kwargs):
    """Scrape the Drug Tariff Part VIIIA index and import any new months."""
    host = 'https://www.nhsbsa.nhs.uk'
    index = (host + '/pharmacies-gp-practices-and-appliance-contractors/'
             'drug-tariff/drug-tariff-part-viii/')
    rsp = requests.get(index)
    doc = bs4.BeautifulSoup(rsp.content, 'html.parser')

    # calendar.month_name[0] is '', so list index == month number.
    month_names = [x.lower() for x in calendar.month_name]

    imported_months = []

    for a in doc.findAll('a', href=re.compile('Part%20VIIIA')):
        # a.attrs['href'] typically has a filename part like
        # Part%20VIIIA%20September%202017.xlsx
        #
        # We split that into ['Part', 'VIIIA', 'September', '2017']
        filename_words = re.split(
            r'[ -]+',
            urllib.unquote(
                os.path.splitext(os.path.basename(a.attrs['href']))[0]))
        month_name, year = filename_words[-2:]

        if len(year) == 2:
            year = "20" + year
        # Sanity-check the year; string comparison works for 4-digit years.
        assert "2000" <= year <= str(datetime.date.today().year)

        month = month_names.index(month_name.lower())
        date = datetime.date(int(year), month, 1)

        already_imported = ImportLog.objects.filter(
            category='tariff', current_at=date).exists()
        if already_imported:
            continue

        xls_url = urljoin(index, a.attrs['href'])
        xls_file = StringIO(requests.get(xls_url).content)
        import_month(xls_file, date)
        imported_months.append((year, month))

    if imported_months:
        client = Client('dmd')
        client.upload_model(TariffPrice)
        for year, month in imported_months:
            notify_slack('Imported Drug Tariff for %s_%s' % (year, month))
    else:
        notify_slack('Found no new tariff data to import')
def import_month(xls_file, date):
    """Import one month's Part VIIIA Drug Tariff workbook into TariffPrice.

    Rows with a missing price are skipped (and reported to Slack); an
    ImportLog row records that the month has been imported.
    """
    wb = load_workbook(xls_file)
    rows = wb.active.rows

    # The first row is a title, and the second is empty
    next(rows)
    next(rows)

    # The third row is column headings; normalise each heading by
    # lower-casing and stripping all whitespace (empty cells become "?").
    header_row = next(rows)
    headers = ["".join((c.value or "?").lower().split()) for c in header_row]
    expected_headers = [
        "medicine",
        "packsize",
        "?",
        "vmppsnomedcode",
        "drugtariffcategory",
        "basicprice",
    ]
    assert headers == expected_headers

    with transaction.atomic():
        for row in rows:
            values = [c.value for c in row]
            # Skip entirely blank rows.
            if all(v is None for v in values):
                continue
            record = dict(zip(headers, values))
            if record["basicprice"] is None:
                notify_slack("Missing price for {} Drug Tariff for {}".format(
                    record["medicine"], date))
                continue
            TariffPrice.objects.get_or_create(
                date=date,
                vmpp_id=record["vmppsnomedcode"],
                tariff_category_id=get_tariff_cat_id(
                    record["drugtariffcategory"]),
                price_pence=int(record["basicprice"]),
            )
        ImportLog.objects.create(
            category="tariff", current_at=date, filename="none")
def handle(self, *args, **kwargs):
    """Import and reconcile NCSO concessions, reporting counts to Slack."""
    self.vmpps = VMPP.objects.values('id', 'nm')

    num_before = NCSOConcession.objects.count()

    self.import_from_archive()
    self.import_from_current()
    self.reconcile()

    num_after = NCSOConcession.objects.count()
    num_unmatched = NCSOConcession.objects.filter(vmpp__isnull=True).count()

    Client('dmd').upload_model(NCSOConcession)

    if num_before == num_after:
        msg = 'Found no new concessions to import'
    else:
        msg = 'Imported %s new concessions' % (num_after - num_before)
    if num_unmatched:
        msg += '\nThere are %s unmatched concessions' % num_unmatched
    notify_slack(msg)
def handle_case_b(self, ccg, new_ccgs):
    # The CCG has closed and its practices have moved to multiple CCGs
    name = self.ccg_to_name[ccg]

    if self.dry_run:
        self.stdout.write(
            "All practices currently in CCG are closed or dormant")
        self.stdout.write(
            "Active practices previously in CCG are now in multiple CCGs:")
        for new_ccg, count in new_ccgs.most_common():
            self.stdout.write("{:3} {} ({})".format(
                count, new_ccg, self.ccg_to_name[new_ccg]))
        return

    msg = """
All active practices previously in CCG {} ({}) are now in multiple CCGs:
Check whether inactive practices remaining in CCG should have moved.
See instructions in handle_orphan_practices.py.
""".format(ccg, name).strip()
    for new_ccg, count in new_ccgs.most_common():
        msg += "\n{:3} {} ({})".format(count, new_ccg, self.ccg_to_name[new_ccg])
    notify_slack(msg)
def import_from_html(self, heading):
    """Import one month's price concessions from the HTML table that
    follows `heading` (whose text is e.g. "September 2017").

    Months before August 2014 are ignored.  If the source now has fewer
    records than the database for this month, a Slack warning is sent.
    """
    month_name, year = heading.text.strip().split()
    # calendar.month_name[0] is '', so list index == month number.
    month_names = list(calendar.month_name)
    month = month_names.index(month_name)
    date = datetime.date(int(year), month, 1)

    if date < datetime.date(2014, 8, 1):
        return

    table = heading.find_next('table')
    trs = table.find_all('tr')
    records = [[td.text.strip() for td in tr.find_all('td')]
               for tr in trs]

    # Make sure the first row contains expected headers.
    # Unfortunately, the header names are not consistent.
    assert 'drug' in records[0][0].lower()
    assert 'pack' in records[0][1].lower()
    assert 'price' in records[0][2].lower()

    for record in records[1:]:
        drug, pack_size, price_concession = record
        drug = drug.replace('(new)', '').strip()
        # Escaped backslashes rather than bare "\d": a non-raw "\d" is an
        # invalid escape sequence, and ur'...' is not valid Python 3.
        match = re.match(u'£(\\d+)\\.(\\d\\d)', price_concession)
        price_concession_pence = 100 * int(match.group(1)) \
            + int(match.group(2))
        self.import_record(date, drug, pack_size, price_concession_pence)

    # `records` includes the header row, so this fires when the database
    # holds more concessions for `date` than the source now lists.
    if NCSOConcession.objects.filter(date=date).count() >= len(records):
        # If there are more records in the database than we have imported,
        # then there was previously a record in the source that is no
        # longer present.  I have seen this once, with a record whose
        # spelling was corrected.  If we see this frequently, we should
        # automate dealing with it; for now we can deal with it manually.
        msg = 'NCSO concession(s) removed from source for {}'.format(date)
        notify_slack(msg)
def handle(self, *args, **options):
    """Scrape key pages of the site, record the numbers they display, and
    report to Slack any numbers that changed since the previous run.

    Skips the check during an import, or when there is no previous run to
    compare against.
    """
    if import_in_progress():
        notify_slack("Not checking numbers: import in progress")
        return

    previous_log_path = get_previous_log_path()
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    log_path = os.path.join(settings.CHECK_NUMBERS_BASE_PATH, timestamp)
    mkdir_p(log_path)

    numbers = {}

    # Use a distinct name for the Firefox options: the original assignment
    # (`options = Options()`) shadowed the command's `**options` kwargs.
    firefox_options = Options()
    firefox_options.headless = True

    with webdriver.Firefox(options=firefox_options) as browser:
        browser.set_page_load_timeout(60)
        for name, path in paths_to_scrape():
            source = get_page_source(browser, path, name, log_path)
            numbers_list = extract_numbers(source)
            numbers[name] = {"path": path, "numbers": numbers_list}

    write_numbers(numbers, log_path)

    if previous_log_path is None:
        msg = "Not checking numbers: this is the first deploy since last import"
        notify_slack(msg)
        return

    previous_numbers = load_previous_numbers(previous_log_path)
    differences = compare_numbers(previous_numbers, numbers)

    if differences:
        msg = "The following pages have changed:\n\n"
        msg += "\n".join(differences)
        msg += "\n\nNext step: compare {} and {}".format(
            previous_log_path, log_path
        )
        notify_slack(msg)
def notify_slack(self):
    """Announce the imported dm+d release on Slack.

    NOTE: this method shadows the module-level notify_slack() function,
    which is what the unqualified call below resolves to.
    """
    notify_slack("Imported dm+d data, release " + self.release)