class CommandsFunctionalTestCase(TestCase):
    # Fixtures supply Presentation rows plus the ImportLog entry whose
    # `current_at` date determines the raw data table's name suffix below.
    fixtures = ['presentations', 'importlog']

    def setUp(self):
        """Create a raw_prescribing_data table such as is expected to
        exist by ADQ calculation code.
        """
        raw_data_path = 'frontend/tests/fixtures/commands/' +\
            'convert_hscic_prescribing/2016_01/' +\
            'Detailed_Prescribing_Information.csv'
        # The import_adqs command looks for a table named
        # raw_prescribing_data_YYYY_MM for the latest prescribing month.
        year_and_month = ImportLog.objects.latest_in_category(
            'prescribing').current_at.strftime("%Y_%m")
        self.table_name = 'raw_prescribing_data_{}'.format(year_and_month)
        self.client = Client('tmp_eu')
        t1 = self.client.get_or_create_table(
            self.table_name, RAW_PRESCRIBING_SCHEMA)
        # skip_leading_rows=1: the fixture CSV has a header row.
        t1.insert_rows_from_csv(raw_data_path, skip_leading_rows=1)
        call_command('import_adqs')

    def test_import_adqs_functional(self):
        # These all have ADQs in the raw data
        for p in Presentation.objects.filter(bnf_code__regex='0202010B0.*AB'):
            self.assertEqual(p.adq_per_quantity, 1.0)
        # This doesn't exist in the raw data
        p = Presentation.objects.get(bnf_code='0204000I0AAAZAZ')
        self.assertEqual(p.adq_per_quantity, None)

    def tearDown(self):
        # Best-effort cleanup: the table may already be gone if setUp
        # failed before creating it.
        try:
            self.client.delete_table(self.table_name)
        except NotFound:
            pass
def write_zero_prescribing_codes_table(level):
    """Given a BNF `level` (`section`, `chapter`, `paragraph`, etc),
    write a table in bigquery listing all such levels that have zero
    prescribing.

    Returns a bigquery Table.
    """
    logger.info("Scanning %s to see if it has zero prescribing" % level)
    sql = """
    SELECT
      bnf.%s
    FROM
      {hscic}.normalised_prescribing_standard AS prescribing
    RIGHT JOIN
      {hscic}.bnf bnf
    ON
      prescribing.bnf_code = bnf.presentation_code
    WHERE (
      bnf.presentation_code NOT LIKE '2%%'  -- appliances, etc
    )
    GROUP BY
      bnf.%s
    HAVING
      COUNT(prescribing.bnf_code) = 0
    """ % (level, level)
    bq_client = Client('tmp_eu')
    unused_codes_table = bq_client.get_table('unused_codes_%s' % level)
    unused_codes_table.insert_rows_from_query(sql)
    return unused_codes_table
class CommandsFunctionalTestCase(TestCase):
    # Fixtures supply Presentation rows plus the ImportLog entry whose
    # `current_at` date determines the raw data table's name suffix below.
    fixtures = ["presentations", "importlog"]

    def setUp(self):
        """Create a raw_prescribing_data table such as is expected to
        exist by ADQ calculation code.
        """
        raw_data_path = ("frontend/tests/fixtures/commands/" +
                         "convert_hscic_prescribing/2016_01/" +
                         "EPD_201601.csv")
        # The import_adqs command looks for a table named
        # raw_prescribing_data_YYYY_MM for the latest prescribing month.
        year_and_month = ImportLog.objects.latest_in_category(
            "prescribing").current_at.strftime("%Y_%m")
        self.table_name = "raw_prescribing_data_{}".format(year_and_month)
        self.client = Client("tmp_eu")
        t1 = self.client.get_or_create_table(self.table_name,
                                             RAW_PRESCRIBING_SCHEMA_V2)
        # skip_leading_rows=1: the fixture CSV has a header row.
        t1.insert_rows_from_csv(raw_data_path, RAW_PRESCRIBING_SCHEMA_V2,
                                skip_leading_rows=1)
        call_command("import_adqs")

    def test_import_adqs_functional(self):
        # These all have ADQs in the raw data
        for p in Presentation.objects.filter(bnf_code__regex="0202010B0.*AB"):
            self.assertEqual(p.adq_per_quantity, 1.0)
        # This doesn't exist in the raw data
        p = Presentation.objects.get(bnf_code="0204000I0AAAZAZ")
        self.assertEqual(p.adq_per_quantity, None)

    def tearDown(self):
        # Best-effort cleanup: the table may already be gone if setUp
        # failed before creating it.
        try:
            self.client.delete_table(self.table_name)
        except NotFound:
            pass
def handle(self, *args, **kwargs):
    """(Re)create the views in the `measures` dataset from the SQL files
    shipped under measure_sql/.
    """
    base_path = os.path.join(
        settings.APPS_ROOT, "frontend", "management", "commands", "measure_sql"
    )
    client = Client("measures")
    # Order matters: gaba_total_ddd references pregabalin_total_mg, so it
    # must come after it.
    table_names = [
        "dmd_objs_with_form_route",
        "opioid_total_ome",
        "practice_data_all_low_priority",
        "pregabalin_total_mg",
        "vw__median_price_per_unit",
        "vw__ghost_generic_measure",
        "gaba_total_ddd",
    ]
    for table_name in table_names:
        with open(os.path.join(base_path, table_name + ".sql"), "r") as sql_file:
            sql = sql_file.read()
        try:
            client.create_table_with_view(table_name, sql, False)
        except Conflict:
            # The view already exists: drop it and recreate it with the
            # freshly-read SQL.
            client.delete_table(table_name)
            client.create_table_with_view(table_name, sql, False)

    # cmpa_products is a table that has been created and managed by Rich.
    schema = build_schema(
        ("bnf_code", "STRING"), ("bnf_name", "STRING"), ("type", "STRING")
    )
    client.get_or_create_table("cmpa_products", schema)
def upload_dummy_prescribing(bnf_codes):
    """Upload enough dummy prescribing data to BQ to allow the BNF code
    simplification to be meaningful."""
    # Row layout: sha, pct, practice, bnf_code, bnf_name, items, net_cost,
    # actual_cost, quantity, month.  Only the BNF code needs a real value.
    prescribing_rows = [
        [None, None, None, bnf_code] + [None] * 6 for bnf_code in bnf_codes
    ]
    table = Client("hscic").get_table("normalised_prescribing")
    with tempfile.NamedTemporaryFile("wt") as f:
        csv.writer(f).writerows(prescribing_rows)
        # Rewind so the upload reads from the start of the file.
        f.seek(0)
        table.insert_rows_from_csv(f.name, schemas.PRESCRIBING_SCHEMA)
def fill_views(self):
    # Regenerate each "vw__" BigQuery view in parallel (one pool worker
    # per SQL file), then download the exported results and import them
    # locally, one table at a time.
    client = Client('hscic')
    pool = Pool(processes=len(self.view_paths))
    tables = []
    # All views are filled as of the most recently imported prescribing
    # month.
    prescribing_date = ImportLog.objects.latest_in_category(
        'prescribing').current_at.strftime('%Y-%m-%d')
    for path in self.view_paths:
        # Table name convention: vw__<sql filename without extension>
        table_name = "vw__%s" % os.path.basename(path).replace('.sql', '')
        table = client.get_table(table_name)
        tables.append(table)
        with open(path) as f:
            sql = f.read()
        substitutions = {'this_month': prescribing_date}
        args = [table.name, sql, substitutions]
        pool.apply_async(query_and_export, args)
    pool.close()
    pool.join()  # wait for all worker processes to exit
    for table in tables:
        self.download_and_import(table)
        self.log("-------------")
def fill_views(self):
    # Regenerate each "vw__" BigQuery view in parallel (one pool worker
    # per SQL file), import the exported results locally, then refresh
    # the Postgres materialized views that depend on them.
    client = Client('hscic')
    pool = Pool(processes=len(self.view_paths))
    tables = []
    # All views are filled as of the most recently imported prescribing
    # month.
    prescribing_date = ImportLog.objects.latest_in_category(
        'prescribing').current_at.strftime('%Y-%m-%d')
    for path in self.view_paths:
        # Table name convention: vw__<sql filename without extension>
        table_name = "vw__%s" % os.path.basename(path).replace('.sql', '')
        table = client.get_table(table_name)
        tables.append(table)
        with open(path) as f:
            sql = f.read()
        substitutions = {'this_month': prescribing_date}
        args = [table.table_id, sql, substitutions]
        pool.apply_async(query_and_export, args)
    pool.close()
    pool.join()  # wait for all worker processes to exit
    for table in tables:
        self.download_and_import(table)
        self.log("-------------")
    with connection.cursor() as cursor:
        for view_id in self.materialized_views:
            # This is quite slow! up to 10 mins.
            cursor.execute("REFRESH MATERIALIZED VIEW %s" % view_id)
def query_and_export(table_name, sql, substitutions):
    """Populate the BigQuery table `table_name` from `sql`, then export
    the resulting rows to cloud storage, replacing any earlier export.
    """
    storage_prefix = 'hscic/views/{}-'.format(table_name)
    try:
        bq_client = Client('hscic')
        target_table = bq_client.get_table(table_name)
        logger.info("Generating view %s and saving to %s" % (
            table_name, storage_prefix))
        logger.info("Running SQL for %s: %s" % (table_name, sql))
        target_table.insert_rows_from_query(sql, substitutions=substitutions)
        exporter = TableExporter(target_table, storage_prefix)
        logger.info('Deleting existing data in storage at %s' % storage_prefix)
        exporter.delete_from_storage()
        logger.info('Exporting data to storage at %s' % storage_prefix)
        exporter.export_to_storage()
        logger.info("View generation complete for %s" % table_name)
    except Exception:
        # The multiprocessing pool that invokes this only surfaces the
        # exception message (with no traceback), so log the fully
        # formatted traceback here before re-raising.
        logger.error(traceback.format_exc())
        raise
def ensure_stats_downloaded_for_date(date):
    """
    Download practice statistics for date, or do nothing if already downloaded
    """
    filename = get_practice_stats_filename(date)
    if os.path.exists(filename):
        # Already fetched on a previous run.
        return
    client = Client("hscic")
    check_stats_in_bigquery(date, client)
    logger.info("Downloading practice statistics for %s", date)
    # Write to a temp file first so an interrupted download never leaves
    # a valid-looking file behind; the final rename is atomic.
    temp_name = get_temp_filename(filename)
    sql = """
    SELECT *
    FROM {hscic}.practice_statistics_all_years
    WHERE month = TIMESTAMP("%s")
    """ % (date,)
    result = client.query(sql)
    with gzip.open(temp_name, "wt") as f:
        writer = csv.writer(f)
        writer.writerow(result.field_names)
        writer.writerows(result.rows)
    os.rename(temp_name, filename)
def upload_dummy_prescribing(bnf_codes):
    """Upload enough dummy prescribing data to BQ to allow the BNF code
    simplification to be meaningful."""
    # 13-column prescribing row; only the BNF code (column 7) needs a
    # real value for this purpose.
    prescribing_rows = [
        [None] * 6 + [bnf_code] + [None] * 6 for bnf_code in bnf_codes
    ]
    table = Client("hscic").get_table("normalised_prescribing_standard")
    with tempfile.NamedTemporaryFile("wt") as f:
        csv.writer(f).writerows(prescribing_rows)
        # Rewind so the upload reads from the start of the file.
        f.seek(0)
        table.insert_rows_from_csv(f.name, schemas.PRESCRIBING_SCHEMA)
def reimport_all(self):
    # Re-import the last five years of prescribing data one month at a
    # time: extract the month's rows from BigQuery into a temp table,
    # export to cloud storage, download, then rebuild the local Postgres
    # partition for that month inside a transaction.
    last_imported = ImportLog.objects.latest_in_category(
        'prescribing').current_at
    self.date = last_imported - relativedelta(years=5)
    client = Client('tmp_eu')
    while self.date <= last_imported:
        date_str = self.date.strftime('%Y-%m-%d')
        # Column aliases match the layout expected by
        # import_prescriptions; "%%Y" etc are doubled because the whole
        # string goes through %-formatting below.
        sql = ('SELECT pct AS pct_id, practice AS practice_id, '
               'bnf_code AS presentation_code, items AS total_items, '
               'net_cost, actual_cost, quantity, '
               'FORMAT_TIMESTAMP("%%Y_%%m_%%d", month) AS processing_date '
               'FROM {hscic}.normalised_prescribing_standard '
               "WHERE month = '%s'" % date_str)
        table_name = "prescribing_%s" % date_str.replace('-', '_')
        table = client.get_or_create_table(table_name)
        table.insert_rows_from_query(sql)
        exporter = TableExporter(table, 'tmp/{}-*'.format(table_name))
        exporter.export_to_storage()
        with tempfile.NamedTemporaryFile(mode='wb') as tmpfile:
            logger.info("Importing data for %s" % self.date)
            exporter.download_from_storage_and_unzip(tmpfile)
            # Rebuild the partition atomically so readers never see a
            # partially-imported month.
            with transaction.atomic():
                self.drop_partition()
                self.create_partition()
                self.import_prescriptions(tmpfile.name)
                self.create_partition_indexes()
                self.add_parent_trigger()
        self.date += relativedelta(months=1)
def handle(self, *args, **kwargs):
    """Regenerate the smoketest JSON expectations from live BigQuery
    data covering the five years up to `last_imported`.
    """
    last_imported = kwargs["last_imported"]
    prescribing_date = "-".join(last_imported.split("_")) + "-01"
    date_condition = ('month > TIMESTAMP(DATE_SUB(DATE "%s", '
                      "INTERVAL 5 YEAR))" % prescribing_date)
    path = os.path.join(settings.PIPELINE_METADATA_DIR, "smoketests")
    for sql_file in glob.glob(os.path.join(path, "*.sql")):
        test_name = os.path.splitext(os.path.basename(sql_file))[0]
        # Each .sql template carries a {{ date_condition }} placeholder.
        with open(sql_file) as f:
            query = f.read().replace("{{ date_condition }}", date_condition)
        print(query)
        results = Client().query(query)
        quantity, cost, items = [], [], []
        for row in results_to_dicts(results):
            quantity.append(row["quantity"])
            cost.append(row["actual_cost"])
            items.append(row["items"])
        print("Updating test expectations for %s" % test_name)
        json_path = os.path.join(path, "%s.json" % test_name)
        with open(json_path, "w") as f:
            obj = {"cost": cost, "items": items, "quantity": quantity}
            json.dump(obj, f, indent=2)
def handle(self, *args, **kwargs):
    """(Re)create the views in the `measures` dataset from the SQL files
    shipped under measure_sql/.
    """
    base_path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                             'commands', 'measure_sql')
    client = Client("measures")
    for table_name in ('opioid_total_ome',
                       'practice_data_all_low_priority',
                       'pregabalin_total_mg',
                       'vw__median_price_per_unit',
                       'vw__ghost_generic_measure'):
        with open(os.path.join(base_path, table_name + '.sql'), "r") as sql_file:
            sql = sql_file.read()
        try:
            client.create_table_with_view(table_name, sql, False)
        except Conflict:
            # The view already exists: drop it and recreate it with the
            # freshly-read SQL.
            client.delete_table(table_name)
            client.create_table_with_view(table_name, sql, False)

    # cmpa_products is a table that has been created and managed by Rich.
    schema = build_schema(
        ('bnf_code', 'STRING'),
        ('bnf_name', 'STRING'),
        ('type', 'STRING'),
    )
    client.get_or_create_table('cmpa_products', schema)
def handle(self, *args, **kwargs):
    """Scrape the NHSBSA Drug Tariff Part VIIIA page, import any months
    not already recorded in ImportLog, upload the result to BigQuery and
    announce the outcome on Slack.
    """
    url = "https://www.nhsbsa.nhs.uk/pharmacies-gp-practices-and-appliance-contractors/drug-tariff/drug-tariff-part-viii/"
    rsp = requests.get(url)
    doc = bs4.BeautifulSoup(rsp.content, "html.parser")

    month_abbrs = [x.lower() for x in calendar.month_abbr]
    imported_months = []

    for a in doc.findAll("a", href=re.compile(r"Part%20VIIIA.+\.xlsx$")):
        # a.attrs['href'] typically has a filename part like
        # Part%20VIIIA%20September%202017.xlsx
        #
        # We split that into ['Part', 'VIIIA', 'September', '2017']
        base_filename = unquote(
            os.path.splitext(os.path.basename(a.attrs["href"]))[0])
        if base_filename == "Part VIIIA Nov 20 updated":
            # November 2020 has a different filename. In general we want to be
            # warned (through the scraper crashing) about updates (because we have
            # to delete all records for the month in question, and reimport) so
            # special-casing is appropriate here.
            year, month = "2020", 11
        else:
            words = re.split(r"[ -]+", base_filename)
            month_name, year = words[-2:]
            # We have seen the last token in `words` be "19_0". The year is
            # reported to us via Slack, so if we pull out some nonsense here we
            # *should* notice.
            # Raw string: "\d" in a plain string is an invalid escape
            # sequence (a SyntaxWarning from Python 3.12).
            year = re.match(r"\d+", year).group()
            if len(year) == 2:
                year = "20" + year
            # We have seen the month be `September`, `Sept`, and `Sep`.
            month_abbr = month_name.lower()[:3]
            month = month_abbrs.index(month_abbr)
        date = datetime.date(int(year), month, 1)
        if ImportLog.objects.filter(category="tariff", current_at=date).exists():
            continue
        xls_url = urljoin(url, a.attrs["href"])
        xls_file = BytesIO(requests.get(xls_url).content)
        import_month(xls_file, date)
        imported_months.append((year, month))

    if imported_months:
        client = Client("dmd")
        client.upload_model(TariffPrice)
        for year, month in imported_months:
            msg = "Imported Drug Tariff for %s_%s" % (year, month)
            notify_slack(msg)
    else:
        msg = "Found no new tariff data to import"
        notify_slack(msg)
def handle(self, *args, **kwargs):
    """Regenerate the smoketest JSON expectations from live BigQuery
    data covering the five years up to `last_imported`.

    Files are opened in text mode: the SQL template is string-replaced
    as str and `json.dump` writes str, so the previous binary modes
    ('rb'/'wb') raise TypeError under Python 3.
    """
    last_imported = kwargs['last_imported']
    prescribing_date = "-".join(last_imported.split('_')) + '-01'
    date_condition = ('month > TIMESTAMP(DATE_SUB(DATE "%s", '
                      'INTERVAL 5 YEAR))' % prescribing_date)
    path = os.path.join(settings.PIPELINE_METADATA_DIR, 'smoketests')
    for sql_file in glob.glob(os.path.join(path, '*.sql')):
        test_name = os.path.splitext(os.path.basename(sql_file))[0]
        # Text mode (was 'rb'): str.replace below needs str, not bytes.
        with open(sql_file) as f:
            query = f.read().replace(
                '{{ date_condition }}', date_condition)
            print(query)
        client = Client()
        results = client.query(query)
        quantity = []
        cost = []
        items = []
        for row in results_to_dicts(results):
            quantity.append(row['quantity'])
            cost.append(row['actual_cost'])
            items.append(row['items'])
        print("Updating test expectations for %s" % test_name)
        json_path = os.path.join(path, '%s.json' % test_name)
        # Text mode (was 'wb'): json.dump emits str, not bytes.
        with open(json_path, 'w') as f:
            obj = {'cost': cost, 'items': items, 'quantity': quantity}
            json.dump(obj, f, indent=2)
def backup_table(self, table_name):
    """Export the latest month of `table_name` to cloud storage, unless
    a backup for that month already exists under backups/<table>/.
    """
    client = Client("hscic")
    sql = "SELECT max(month) FROM {hscic}.%s" % table_name
    latest_date = client.query(sql).rows[0][0]
    latest_year_and_month = latest_date.strftime("%Y_%m")
    table = client.get_table(table_name)

    storage_client = StorageClient()
    bucket = storage_client.bucket()
    year_and_months = set()
    prefix_base = "backups/{}/".format(table_name)
    for blob in bucket.list_blobs(prefix=prefix_base):
        # Raw string: "\d" in a plain string is an invalid escape
        # sequence (a SyntaxWarning from Python 3.12).  Skip blobs that
        # don't follow the backups/<table>/<YYYY_MM>/ layout instead of
        # crashing with AttributeError on a failed match.
        match = re.search(r"/(\d{4}_\d{2})/", blob.name)
        if match:
            year_and_months.add(match.group(1))

    if latest_year_and_month in year_and_months:
        print("{} table already backed up for {}".format(
            table_name, latest_year_and_month))
        return

    storage_prefix = "{}/{}/{}-".format(
        prefix_base, latest_year_and_month, table_name)
    exporter = TableExporter(table, storage_prefix)
    exporter.export_to_storage()
def handle(self, *args, **kwargs):
    # Scrape the NHSBSA Drug Tariff Part VIIIA page and import any
    # months not already recorded in ImportLog, then upload to BigQuery
    # and announce the outcome on Slack.
    # NOTE(review): `urllib.unquote` and using `StringIO` for binary
    # response content are Python-2-only idioms — confirm which
    # interpreter runs this before reusing it.
    url = 'https://www.nhsbsa.nhs.uk/pharmacies-gp-practices-and-appliance-contractors/drug-tariff/drug-tariff-part-viii/'
    rsp = requests.get(url)
    doc = bs4.BeautifulSoup(rsp.content, 'html.parser')
    month_names = [x.lower() for x in calendar.month_name]
    month_abbrs = [x.lower() for x in calendar.month_abbr]
    imported_months = []
    for a in doc.findAll('a', href=re.compile('Part%20VIIIA')):
        # a.attrs['href'] typically has a filename part like
        # Part%20VIIIA%20September%202017.xlsx
        #
        # We split that into ['Part', 'VIIIA', 'September', '2017']
        words = re.split(
            r'[ -]+',
            urllib.unquote(os.path.splitext(
                os.path.basename(a.attrs['href']))[0]))
        month_name, year = words[-2:]
        # We have seen the last token in `words` be "19_0". The year is
        # reported to us via Slack, so if we pull out some nonsense here we
        # *should* notice.
        year = re.match('\d+', year).group()
        if len(year) == 2:
            year = "20" + year
        # Try the full month name first, then fall back to the
        # abbreviated form (we have seen both on the page).
        try:
            month = month_names.index(month_name.lower())
        except ValueError:
            month = month_abbrs.index(month_name.lower())
        date = datetime.date(int(year), month, 1)
        if ImportLog.objects.filter(
                category='tariff', current_at=date
        ).exists():
            continue
        xls_url = urljoin(url, a.attrs['href'])
        xls_file = StringIO(requests.get(xls_url).content)
        import_month(xls_file, date)
        imported_months.append((year, month))
    if imported_months:
        client = Client('dmd')
        client.upload_model(TariffPrice)
        for year, month in imported_months:
            msg = 'Imported Drug Tariff for %s_%s' % (year, month)
            notify_slack(msg)
    else:
        msg = 'Found no new tariff data to import'
        notify_slack(msg)
def upload_to_bq(self):
    """Push every dm+d model to BigQuery, then rebuild the denormalised
    `<obj_type>_full` tables for VMP, AMP, VMPP and AMPP.
    """
    client = Client("dmd")
    for model in apps.get_app_config("dmd").get_models():
        client.upload_model(model)

    for model in [VMP, AMP, VMPP, AMPP]:
        full_table = client.get_table("{}_full".format(model.obj_type))
        full_table.insert_rows_from_query(self.sql_for_cls(model))
def handle(self, *args, **options):
    """
    Compute and store cost savings for the specified month.

    Deletes any existing data for that month.
    """
    if not options["month"]:
        # No month given: default to the most recently imported
        # prescribing month, and refuse to proceed if PPU savings have
        # already been computed up to (or beyond) that month.
        last_prescribing = ImportLog.objects.latest_in_category(
            "prescribing"
        ).current_at
        options["month"] = last_prescribing
        log = ImportLog.objects.latest_in_category("ppu")
        if log is not None:
            if options["month"] <= log.current_at:
                raise argparse.ArgumentTypeError("Couldn't infer date")
    with transaction.atomic():
        # Create custom presentations for our overrides, if they don't exist.
        # See https://github.com/ebmdatalab/price-per-dose/issues/1.
        Presentation.objects.get_or_create(
            bnf_code="0601060D0AAA0A0",
            name="Glucose Blood Testing Reagents",
            is_generic=True,
        )
        Presentation.objects.get_or_create(
            bnf_code="0601060U0AAA0A0",
            name="Urine Testing Reagents",
            is_generic=True,
        )
        # Replace any previously-computed savings for this month.
        PPUSaving.objects.filter(date=options["month"]).delete()
        for entity_type in ["pct", "practice"]:
            result = get_savings(entity_type, options["month"])
            for row in result.itertuples():
                d = row._asdict()
                # Rows with no price-per-unit carry no saving; skip them.
                if d["price_per_unit"]:
                    PPUSaving.objects.create(
                        date=options["month"],
                        presentation_id=d["Index"],
                        lowest_decile=d["lowest_decile"],
                        quantity=d["quantity"],
                        price_per_unit=d["price_per_unit"],
                        possible_savings=d["possible_savings"],
                        formulation_swap=d["formulation_swap"] or None,
                        pct_id=d.get("pct", None),
                        practice_id=d.get("practice", None),
                    )
    # Record the import, then mirror the savings table to BigQuery.
    ImportLog.objects.create(
        category="ppu", filename="n/a", current_at=options["month"]
    )
    client = Client("hscic")
    table = client.get_or_create_table("ppu_savings", PPU_SAVING_SCHEMA)
    table.insert_rows_from_pg(
        PPUSaving, PPU_SAVING_SCHEMA, transformer=ppu_savings_transform
    )
def handle(self, *args, **kwargs):
    """Delete every empty BigQuery dataset.  Guarded so it can only run
    against the test settings.
    """
    settings_module = os.environ['DJANGO_SETTINGS_MODULE']
    if settings_module != 'openprescribing.settings.test':
        raise CommandError('Command must run with test settings')
    gcbq_client = Client().gcbq_client
    for dataset in gcbq_client.list_datasets():
        # Only datasets with no tables at all are removed.
        if not list(dataset.list_tables()):
            dataset.delete()
def tearDown(self):
    # Remove the test dataset, any storage blobs written under our
    # prefix, and finally the archive dataset created in setUp.
    Client("test").delete_dataset()
    bucket = StorageClient().bucket()
    for blob in bucket.list_blobs(prefix=self.storage_prefix):
        blob.delete()
    Client("archive").delete_dataset()
def update_bnf_table():
    """Update `bnf` table from cloud-stored CSV"""
    bucket = StorageClient().get_bucket()
    # Exports sort lexically by name, so the most recent one is the
    # maximum; equivalent to sorting descending and taking the first.
    blob = max(
        bucket.list_blobs(prefix="hscic/bnf_codes/"),
        key=lambda b: b.name,
    )
    table = BQClient("hscic").get_table("bnf")
    # skip_leading_rows=1: the CSV carries a header row.
    table.insert_rows_from_storage(blob.name, skip_leading_rows=1)
def create_pregabalin_total_mg_view():
    """Create the pregabalin_total_mg view in the `measures` dataset,
    doing nothing if it already exists.
    """
    sql_path = os.path.join(os.path.dirname(__file__),
                            "./measure_sql/pregabalin_total_mg.sql")
    with open(sql_path, "r") as sql_file:
        sql = sql_file.read()
    try:
        Client("measures").create_table_with_view(
            'pregabalin_total_mg', sql, False)
    except Conflict:
        # The view already exists; leave it untouched.
        pass
def create_omnibus_lp_view():
    """Create the low-priority omnibus view in the `measures` dataset,
    doing nothing if it already exists.
    """
    sql_path = os.path.join(os.path.dirname(__file__),
                            "./measure_sql/lpomnibusview.sql")
    with open(sql_path, "r") as sql_file:
        sql = sql_file.read()
    try:
        Client("measures").create_table_with_view(
            'practice_data_all_low_priority', sql, False)
    except Conflict:
        # The view already exists; leave it untouched.
        pass
def handle(self, *args, measure_ids, **kwargs):
    # Build (or rebuild) one public "measure_<id>" BigQuery table per
    # measure, by splicing each measure's numerator/denominator SQL into
    # the shared template.
    base_path = os.path.join(settings.APPS_ROOT, "bq_public_tables")
    client = Client("public")
    with open(os.path.join(base_path, "_measure_template.sql")) as f:
        measure_template_sql = f.read()
    # No explicit IDs means "all measures".
    if measure_ids:
        measures = Measure.objects.filter(id__in=measure_ids)
    else:
        measures = Measure.objects.all()
    for measure in measures:
        table_name = "measure_" + measure.id
        print(table_name)
        table = client.get_table(table_name)
        numerator_sql = """
        SELECT
          CAST(month AS DATE) AS month,
          practice AS practice_id,
          {numerator_columns}
        FROM {numerator_from}
        WHERE {numerator_where}
        GROUP BY month, practice_id
        """.format(
            numerator_columns=measure.numerator_columns,
            numerator_from=measure.numerator_from,
            numerator_where=measure.numerator_where,
        )
        denominator_sql = """
        SELECT
          CAST(month AS DATE) AS month,
          practice AS practice_id,
          {denominator_columns}
        FROM {denominator_from}
        WHERE {denominator_where}
        GROUP BY month, practice_id
        """.format(
            denominator_columns=measure.denominator_columns,
            denominator_from=measure.denominator_from,
            denominator_where=measure.denominator_where,
        )
        # partially_format presumably substitutes only the two SQL
        # placeholders, leaving the template's other braces intact —
        # verify against its definition.
        sql = partially_format(
            measure_template_sql,
            numerator_sql=numerator_sql,
            denominator_sql=denominator_sql,
        )
        table.insert_rows_from_query(sql)
def handle(self, *args, **kwargs):
    # Recreate the storage-backed raw prescribing tables plus the
    # derived views in the `hscic` and `measures` datasets.
    client = Client("hscic")
    try:
        client.create_storage_backed_table(
            "raw_prescribing_v1",
            RAW_PRESCRIBING_SCHEMA_V1,
            "hscic/prescribing_v1/20*Detailed_Prescribing_Information.csv",
        )
    except Conflict:
        # Table already exists; nothing to do.
        pass
    try:
        client.create_storage_backed_table(
            "raw_prescribing_v2",
            RAW_PRESCRIBING_SCHEMA_V2,
            # This pattern may change once the data is published via the
            # new Open Data Portal.
            "hscic/prescribing_v2/20*.csv",
        )
    except Conflict:
        pass
    for table_name in [
        "all_prescribing",
        "normalised_prescribing",
        "normalised_prescribing_standard",
        "raw_prescribing_normalised",
    ]:
        self.recreate_table(client, table_name)

    client = Client("measures")
    for table_name in [
        "dmd_objs_with_form_route",
        "dmd_objs_hospital_only",
        "opioid_total_ome",
        "practice_data_all_low_priority",
        "pregabalin_total_mg",
        "vw__median_price_per_unit",
        "vw__ghost_generic_measure",
        "vw__herbal_list",
        # This references pregabalin_total_mg, so must come afterwards
        "gaba_total_ddd",
    ]:
        self.recreate_table(client, table_name)

    # cmpa_products is a table that has been created and managed by Rich.
    schema = build_schema(
        ("bnf_code", "STRING"), ("bnf_name", "STRING"), ("type", "STRING")
    )
    client.get_or_create_table("cmpa_products", schema)
def handle(self, *args, **options):
    """Update Presentation.adq_per_quantity from the latest month of raw
    prescribing data held in BigQuery.
    """
    client = Client("tmp_eu")
    year_and_month = ImportLog.objects.latest_in_category(
        "prescribing").current_at.strftime("%Y_%m")
    raw_table = "raw_prescribing_data_{}".format(year_and_month)
    sql = SQL.format(detailed_raw_data_table="{tmp_eu}.%s" % raw_table)
    # Apply every update in one transaction so an interrupted run leaves
    # no partially-updated state.
    with transaction.atomic():
        for bnf_code_regex, adq_per_quantity in client.query(sql).rows:
            Presentation.objects.filter(
                bnf_code__regex=bnf_code_regex
            ).update(adq_per_quantity=adq_per_quantity)
def handle(self, *args, **kwargs):
    """Rebuild each public BigQuery table from its .sql file under
    bq_public_tables/ (table name = file name without extension).
    """
    base_path = os.path.join(settings.APPS_ROOT, "bq_public_tables")
    client = Client("public")
    for sql_path in glob.glob(os.path.join(base_path, "*.sql")):
        with open(sql_path) as f:
            sql = f.read()
        name = os.path.splitext(os.path.basename(sql_path))[0]
        client.get_table(name).insert_rows_from_query(sql)
def setUp(self):
    # Create fresh `test` and `archive` datasets, and remember the
    # storage prefix used by tests that export to cloud storage.
    main_client = Client('test')
    self.storage_prefix = 'test_bq_client/{}-'.format(main_client.dataset_id)
    main_client.create_dataset()
    Client('archive').create_dataset()
def setUp(self):
    # Set up the BigQuery datasets this test suite writes to, and record
    # the storage prefix used for exports.
    test_client = Client("test")
    test_client.create_dataset()
    self.storage_prefix = "test_bq_client/{}-".format(test_client.dataset_id)
    Client("archive").create_dataset()