def handle(self, *args, **kwargs):
    """Scrape the NHSBSA Drug Tariff Part VIIIA page, import any months not
    yet recorded in ImportLog, upload TariffPrice to BigQuery, and report
    the outcome to Slack.

    The scraper is deliberately brittle about unexpected filenames: updates
    to an already-imported month require deleting and reimporting that
    month's records, so we want to be warned by a crash rather than import
    silently.
    """
    url = "https://www.nhsbsa.nhs.uk/pharmacies-gp-practices-and-appliance-contractors/drug-tariff/drug-tariff-part-viii/"
    rsp = requests.get(url)
    doc = bs4.BeautifulSoup(rsp.content, "html.parser")
    # calendar.month_abbr[0] is "", so January..December sit at indices
    # 1..12 -- exactly the month numbers we want from .index() below.
    month_abbrs = [x.lower() for x in calendar.month_abbr]
    imported_months = []
    for a in doc.findAll("a", href=re.compile(r"Part%20VIIIA.+\.xlsx$")):
        # a.attrs['href'] typically has a filename part like
        # Part%20VIIIA%20September%202017.xlsx
        #
        # We split that into ['Part', 'VIIIA', 'September', '2017']
        base_filename = unquote(
            os.path.splitext(os.path.basename(a.attrs["href"]))[0])
        if base_filename == "Part VIIIA Nov 20 updated":
            # November 2020 has a different filename. In general we want to be
            # warned (through the scraper crashing) about updates (because we have
            # to delete all records for the month in question, and reimport) so
            # special-casing is appropriate here.
            year, month = "2020", 11
        else:
            words = re.split(r"[ -]+", base_filename)
            month_name, year = words[-2:]
            # We have seen the last token in `words` be "19_0". The year is
            # reported to us via Slack, so if we pull out some nonsense here we
            # *should* notice.
            # Raw string: "\d" is an invalid escape sequence in a plain
            # string literal on modern Python.
            year = re.match(r"\d+", year).group()
            if len(year) == 2:
                year = "20" + year
            # We have seen the month be `September`, `Sept`, and `Sep`.
            month_abbr = month_name.lower()[:3]
            month = month_abbrs.index(month_abbr)
        date = datetime.date(int(year), month, 1)
        # Skip months that have already been imported.
        if ImportLog.objects.filter(category="tariff",
                                    current_at=date).exists():
            continue
        xls_url = urljoin(url, a.attrs["href"])
        xls_file = BytesIO(requests.get(xls_url).content)
        import_month(xls_file, date)
        imported_months.append((year, month))
    if imported_months:
        client = Client("dmd")
        client.upload_model(TariffPrice)
        for year, month in imported_months:
            msg = "Imported Drug Tariff for %s_%s" % (year, month)
            notify_slack(msg)
    else:
        msg = "Found no new tariff data to import"
        notify_slack(msg)
def handle(self, *args, **kwargs):
    """Scrape the NHSBSA Drug Tariff Part VIIIA page, import any months not
    yet recorded in ImportLog, upload TariffPrice to BigQuery, and report
    the outcome to Slack.

    NOTE(review): this version uses `urllib.unquote` and a bytes-fed
    `StringIO`, which are Python 2 APIs -- presumably predates a Python 3
    port; confirm before reusing.
    """
    url = 'https://www.nhsbsa.nhs.uk/pharmacies-gp-practices-and-appliance-contractors/drug-tariff/drug-tariff-part-viii/'
    rsp = requests.get(url)
    doc = bs4.BeautifulSoup(rsp.content, 'html.parser')
    # calendar.month_name[0] / month_abbr[0] are "", so real months sit at
    # indices 1..12 -- exactly the month numbers .index() yields below.
    month_names = [x.lower() for x in calendar.month_name]
    month_abbrs = [x.lower() for x in calendar.month_abbr]
    imported_months = []
    for a in doc.findAll('a', href=re.compile('Part%20VIIIA')):
        # a.attrs['href'] typically has a filename part like
        # Part%20VIIIA%20September%202017.xlsx
        #
        # We split that into ['Part', 'VIIIA', 'September', '2017']
        words = re.split(
            r'[ -]+',
            urllib.unquote(os.path.splitext(
                os.path.basename(a.attrs['href']))[0]))
        month_name, year = words[-2:]
        # We have seen the last token in `words` be "19_0". The year is
        # reported to us via Slack, so if we pull out some nonsense here we
        # *should* notice.
        # NOTE(review): '\d' should be a raw string (r'\d+') -- invalid
        # escape sequence warning on recent Pythons.
        year = re.match('\d+', year).group()
        if len(year) == 2:
            year = "20" + year
        # Month may appear as a full name (`September`) or an abbreviation
        # (`Sep`); try the full names first, fall back to abbreviations.
        try:
            month = month_names.index(month_name.lower())
        except ValueError:
            month = month_abbrs.index(month_name.lower())
        date = datetime.date(int(year), month, 1)
        # Skip months that have already been imported.
        if ImportLog.objects.filter(
            category='tariff', current_at=date
        ).exists():
            continue
        xls_url = urljoin(url, a.attrs['href'])
        xls_file = StringIO(requests.get(xls_url).content)
        import_month(xls_file, date)
        imported_months.append((year, month))
    if imported_months:
        client = Client('dmd')
        client.upload_model(TariffPrice)
        for year, month in imported_months:
            msg = 'Imported Drug Tariff for %s_%s' % (year, month)
            notify_slack(msg)
    else:
        msg = 'Found no new tariff data to import'
        notify_slack(msg)
def upload_to_bq(self):
    """Upload every model in the dmd app to the "dmd" BigQuery dataset,
    then rebuild the "<obj_type>_full" table for each core object type
    from its SQL query."""
    bq_client = Client("dmd")
    # Plain per-model upload for everything registered under the dmd app.
    for dmd_model in apps.get_app_config("dmd").get_models():
        bq_client.upload_model(dmd_model)
    # The four core dm+d object types additionally get a denormalised
    # "*_full" table, populated from a query rather than model instances.
    for cls in (VMP, AMP, VMPP, AMPP):
        full_table = bq_client.get_table("{}_full".format(cls.obj_type))
        full_table.insert_rows_from_query(self.sql_for_cls(cls))
def handle(self, *args, **kwargs):
    """Check the Drug Tariff Part VIIIA index page for spreadsheets covering
    months not yet imported, import each one, upload TariffPrice to
    BigQuery, and notify Slack of what happened."""
    host = 'https://www.nhsbsa.nhs.uk'
    index = (host + '/pharmacies-gp-practices-and-appliance-contractors/'
             'drug-tariff/drug-tariff-part-viii/')
    response = requests.get(index)
    page = bs4.BeautifulSoup(response.content, 'html.parser')
    # calendar.month_name[0] is "", so January..December sit at indices
    # 1..12 -- exactly the month numbers .index() yields below.
    month_names = [name.lower() for name in calendar.month_name]
    imported_months = []
    for anchor in page.findAll('a', href=re.compile('Part%20VIIIA')):
        # The href's filename part typically looks like
        # Part%20VIIIA%20September%202017.xlsx; unquote it, strip the
        # extension, and split into tokens -- the last two are the month
        # name and the year.
        href = anchor.attrs['href']
        stem = os.path.splitext(os.path.basename(href))[0]
        tokens = re.split(r'[ -]+', urllib.unquote(stem))
        month_name, year = tokens[-2:]
        if len(year) == 2:
            year = "20" + year
        # Sanity-check the parsed year before trusting it.
        assert "2000" <= year <= str(datetime.date.today().year)
        month = month_names.index(month_name.lower())
        date = datetime.date(int(year), month, 1)
        already_done = ImportLog.objects.filter(
            category='tariff', current_at=date).exists()
        if already_done:
            continue
        xls_url = urljoin(index, href)
        xls_file = StringIO(requests.get(xls_url).content)
        import_month(xls_file, date)
        imported_months.append((year, month))
    if not imported_months:
        msg = 'Found no new tariff data to import'
        notify_slack(msg)
        return
    client = Client('dmd')
    client.upload_model(TariffPrice)
    for year, month in imported_months:
        msg = 'Imported Drug Tariff for %s_%s' % (year, month)
        notify_slack(msg)
def test_upload_model(self):
    """Create one row of each dm+d-related model, upload them all, and
    check each lands in its BigQuery table with the expected field value."""
    client = Client('dmd')
    DMDProduct.objects.create(
        dmdid=327368008,
        bnf_code='0803042A0AAABAB',
        vpid=327368008,
        name='Bicalutamide 150mg tablets',
    )
    DMDVmpp.objects.create(
        vppid=1206011000001108,
        nm='Bicalutamide 150mg tablets 28 tablet',
        vpid=327368008,
    )
    NCSOConcession.objects.create(
        drug='Bicalutamide 150mg tablets',
        pack_size=28,
        price_concession_pence=499,
        vmpp_id=1206011000001108,
        date='2017-11-01',
    )
    TariffPrice.objects.create(
        vmpp_id=1206011000001108,
        product_id=327368008,
        price_pence=422,
        tariff_category_id=11,
        date='2017-11-01',
    )
    for model in (DMDProduct, DMDVmpp, NCSOConcession, TariffPrice):
        client.upload_model(model)
    # (BQ table name, field to inspect, expected value) per uploaded model.
    checks = [
        ('product', 'name', 'Bicalutamide 150mg tablets'),
        ('vmpp', 'nm', 'Bicalutamide 150mg tablets 28 tablet'),
        ('ncsoconcession', 'drug', 'Bicalutamide 150mg tablets'),
        ('tariffprice', 'price_pence', 422),
    ]
    for table_name, field, expected in checks:
        rows = list(client.get_table(table_name).get_rows_as_dicts())
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0][field], expected)
def test_upload_model(self):
    """Upload the VMPP fixtures and spot-check one row's fields in BQ."""
    client = Client("dmd")
    client.upload_model(VMPP)
    vmpp_table = client.get_table("vmpp")
    rows = list(vmpp_table.get_rows_as_dicts())
    self.assertEqual(len(rows), 4)
    # Field-by-field expectations for the one VMPP we inspect in detail.
    expected = {
        "invalid": False,
        "nm": "Chlortalidone 50mg tablets 28 tablet",
        "vmp": 317935006,
        "qtyval": Decimal("28.00"),
        "qty_uom": 428673006,
        "combpack": None,
        "bnf_code": "0202010F0AAAAAA",
    }
    for row in rows:
        if row["id"] == 1079211000001106:
            for field, value in expected.items():
                self.assertEqual(row[field], value)
def test_upload_model(self):
    """Upload the VMPP fixtures and spot-check one row's fields in BQ."""
    client = Client('dmd')
    client.upload_model(VMPP)
    rows = list(client.get_table('vmpp').get_rows_as_dicts())
    self.assertEqual(len(rows), 4)
    # (field, expected value) pairs for the one VMPP we inspect in detail.
    expectations = [
        ('invalid', False),
        ('nm', 'Chlortalidone 50mg tablets 28 tablet'),
        ('vmp', 317935006),
        ('qtyval', Decimal('28.00')),
        ('qty_uom', 428673006),
        ('combpack', None),
        ('bnf_code', '0202010F0AAAAAA'),
    ]
    for row in rows:
        if row['id'] != 1079211000001106:
            continue
        for field, expected in expectations:
            self.assertEqual(row[field], expected)
def upload_to_bq(self):
    """Upload every model in the dmd2 app to the "dmd" BigQuery dataset."""
    bq_client = Client("dmd")
    dmd2_config = apps.get_app_config("dmd2")
    for dmd_model in dmd2_config.get_models():
        bq_client.upload_model(dmd_model)
def upload_to_bq(self):
    """Push each model registered under the dmd2 app up to the 'dmd'
    BigQuery dataset."""
    client = Client('dmd')
    app_config = apps.get_app_config('dmd2')
    for registered_model in app_config.get_models():
        client.upload_model(registered_model)
def run_end_to_end():
    """Exercise the full data pipeline end-to-end against fixture data.

    Resets local pipeline state and the test cloud storage/BigQuery
    datasets, creates the tables the pipeline needs, then runs the pipeline
    for 2017-09 and 2017-10, asserting the expected record counts after
    each month.
    """
    print('BQ_NONCE: {}'.format(settings.BQ_NONCE))
    call_command('migrate')
    path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                        'commands', 'measure_definitions')
    # No MeasureGlobals or MeasureValues are generated for the ghost branded
    # generics measure, because both numerator and denominator are computed
    # from a view (vw__ghost_generic_measure) which has no data. Rather than
    # populate this view, it is simpler to pretend it doesn't exist.
    num_measures = len(os.listdir(path)) - 1
    # Start from a clean slate: local pipeline data, the import log, every
    # blob in the test bucket, and the BQ datasets.
    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)
    with open(settings.PIPELINE_IMPORT_LOG_PATH, 'w') as f:
        f.write('{}')
    for blob in StorageClient().bucket().list_blobs():
        blob.delete()
    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()
    client = BQClient('hscic')
    client.create_table('bnf', schemas.BNF_SCHEMA)
    client.create_table('ccgs', schemas.CCG_SCHEMA)
    client.create_table('ppu_savings', schemas.PPU_SAVING_SCHEMA)
    client.create_table(
        'practice_statistics', schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table(
        'practice_statistics_all_years', schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table('practices', schemas.PRACTICE_SCHEMA)
    client.create_table('prescribing', schemas.PRESCRIBING_SCHEMA)
    client.create_table('presentation', schemas.PRESENTATION_SCHEMA)
    client.create_table('tariff', schemas.TARIFF_SCHEMA)
    client.create_table('bdz_adq', schemas.BDZ_ADQ_SCHEMA)
    client = BQClient('measures')
    # This is enough of a schema to allow the practice_data_all_low_priority
    # table to be created, since it references these fields. Once populated by
    # import_measures, the tables in the measures dataset will have several
    # more fields. But we don't need to specify exactly what they are, as BQ
    # will work it out when the data is inserted with insert_rows_from_query.
    measures_schema = build_schema(
        ('month', 'DATE'),
        ('practice_id', 'STRING'),
        ('numerator', 'INTEGER'),
        ('denominator', 'INTEGER'),
    )
    # NOTE(review): `path` is reused as the glob loop variable below; the
    # shadowing is harmless here but easy to trip over when editing.
    path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                        'commands', 'measure_definitions', '*.json')
    for path in glob.glob(path):
        measure_id = os.path.splitext(os.path.basename(path))[0]
        client.create_table('practice_data_' + measure_id, measures_schema)
        client.create_table('ccg_data_' + measure_id, measures_schema)
        client.create_table('global_data_' + measure_id, measures_schema)
    # Although there are no model instances, we call upload_model to create the
    # tables in BQ that might be required by certain measure views.
    client = BQClient('dmd')
    client.upload_model(TariffPrice)
    client.upload_model(VMPP)
    call_command('generate_presentation_replacements')
    path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                        'commands', 'replace_matviews.sql')
    with open(path) as f:
        with connection.cursor() as c:
            c.execute(f.read())
    # First month of fixture data.
    copy_tree(
        os.path.join(e2e_path, 'data-1'),
        os.path.join(e2e_path, 'data'),
    )
    runner.run_all(2017, 9, under_test=True)
    # We expect one MeasureGlobal per measure per month.
    assert_count_equal(num_measures, MeasureGlobal)
    # We expect one MeasureValue for each organisation per measure per month
    # (There are 4 practices, 2 CCGs, 2 STPs, and 2 regional teams).
    assert_count_equal(10 * num_measures, MeasureValue)
    # We expect one statistic per CCG per month
    assert_raw_count_equal(2, 'vw__ccgstatistics')
    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(2, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")
    # We expect one chemical summary per practice per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")
    # We expect one summary per practice per month
    assert_raw_count_equal(4, 'vw__practice_summary')
    # We expect one presentation summary per month
    assert_raw_count_equal(1, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")
    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(2, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")
    # Second month of fixture data: cumulative counts double.
    copy_tree(
        os.path.join(e2e_path, 'data-2'),
        os.path.join(e2e_path, 'data'),
    )
    runner.run_all(2017, 10, under_test=True)
    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)
    # We expect one MeasureValue for each organisation per measure per month
    assert_count_equal(20 * num_measures, MeasureValue)
    # We expect one statistic per CCG per month
    assert_raw_count_equal(4, 'vw__ccgstatistics')
    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")
    # We expect one chemical summary per practice per month
    assert_raw_count_equal(8, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")
    # We expect one summary per practice per month
    assert_raw_count_equal(8, 'vw__practice_summary')
    # We expect one presentation summary per month
    assert_raw_count_equal(2, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")
    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(4, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")
def run_end_to_end():
    """Exercise the full data pipeline end-to-end against fixture data.

    Resets local pipeline state and the test cloud storage/BigQuery
    datasets, creates the tables the pipeline needs, then runs the pipeline
    for 2017-09 and 2017-10, asserting the expected MeasureGlobal and
    MeasureValue counts after each month.
    """
    print("BQ_NONCE: {}".format(settings.BQ_NONCE))
    call_command("migrate")
    # No MeasureGlobals or MeasureValues are generated for the ghost branded
    # generics measure, because both numerator and denominator are computed
    # from a view (vw__ghost_generic_measure) which has no data. Rather than
    # populate this view, it is simpler to pretend it doesn't exist.
    num_measures = (len(
        glob.glob(os.path.join(settings.MEASURE_DEFINITIONS_PATH, "*.json")))
                    - 1)
    # Start from a clean slate: local pipeline data, the import log, every
    # blob in the test bucket, and the BQ datasets.
    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)
    with open(settings.PIPELINE_IMPORT_LOG_PATH, "w") as f:
        f.write("{}")
    for blob in StorageClient().bucket().list_blobs():
        blob.delete()
    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()
    client = BQClient("hscic")
    client.create_table("bnf", schemas.BNF_SCHEMA)
    client.create_table("ccgs", schemas.CCG_SCHEMA)
    client.create_table("stps", schemas.STP_SCHEMA)
    client.create_table("regional_teams", schemas.REGIONAL_TEAM_SCHEMA)
    client.create_table("ppu_savings", schemas.PPU_SAVING_SCHEMA)
    client.create_table("practice_statistics",
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table("practice_statistics_all_years",
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table("practices", schemas.PRACTICE_SCHEMA)
    client.create_table("prescribing", schemas.PRESCRIBING_SCHEMA)
    client.create_table("presentation", schemas.PRESENTATION_SCHEMA)
    client.create_table("tariff", schemas.TARIFF_SCHEMA)
    client.create_table("bdz_adq", schemas.BDZ_ADQ_SCHEMA)
    client = BQClient("measures")
    # This is enough of a schema to allow the practice_data_all_low_priority
    # table to be created, since it references these fields. Once populated by
    # import_measures, the tables in the measures dataset will have several
    # more fields. But we don't need to specify exactly what they are, as BQ
    # will work it out when the data is inserted with insert_rows_from_query.
    measures_schema = build_schema(
        ("month", "DATE"),
        ("practice_id", "STRING"),
        ("numerator", "INTEGER"),
        ("denominator", "INTEGER"),
    )
    for path in glob.glob(
            os.path.join(settings.MEASURE_DEFINITIONS_PATH, "*.json")):
        measure_id = os.path.splitext(os.path.basename(path))[0]
        client.create_table("practice_data_" + measure_id, measures_schema)
        client.create_table("ccg_data_" + measure_id, measures_schema)
        client.create_table("global_data_" + measure_id, measures_schema)
    # Although there are no model instances, we call upload_model to create the
    # dm+d tables in BQ that are required by certain measure views.
    client = BQClient("dmd")
    for model in apps.get_app_config("dmd2").get_models():
        client.upload_model(model)
    call_command("generate_presentation_replacements")
    # First month of fixture data.
    copy_tree(os.path.join(e2e_path, "data-1"),
              os.path.join(e2e_path, "data"))
    runner.run_all(2017, 9, under_test=True)
    # We expect one MeasureGlobal per measure per month.
    assert_count_equal(num_measures, MeasureGlobal)
    # We expect one MeasureValue for each organisation per measure per month
    # (There are 4 practices, 2 CCGs, 2 STPs, and 2 regional teams).
    assert_count_equal(10 * num_measures, MeasureValue)
    # Second month of fixture data: cumulative counts double.
    copy_tree(os.path.join(e2e_path, "data-2"),
              os.path.join(e2e_path, "data"))
    runner.run_all(2017, 10, under_test=True)
    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)
    # We expect one MeasureValue for each organisation per measure per month
    assert_count_equal(20 * num_measures, MeasureValue)