def handle(self, *args, **kwargs):
        """Scrape the NHSBSA Drug Tariff Part VIIIA index page and import any
        month's spreadsheet not already recorded in ImportLog.

        After importing, uploads TariffPrice to BigQuery and posts a Slack
        message per imported month (or a "nothing new" message).
        """
        url = "https://www.nhsbsa.nhs.uk/pharmacies-gp-practices-and-appliance-contractors/drug-tariff/drug-tariff-part-viii/"
        rsp = requests.get(url)
        doc = bs4.BeautifulSoup(rsp.content, "html.parser")

        # calendar.month_abbr[0] is "", so real months index from 1..12.
        month_abbrs = [x.lower() for x in calendar.month_abbr]

        imported_months = []

        for a in doc.findAll("a", href=re.compile(r"Part%20VIIIA.+\.xlsx$")):
            # a.attrs['href'] typically has a filename part like
            # Part%20VIIIA%20September%202017.xlsx
            #
            # We split that into ['Part', 'VIIIA', 'September', '2017']
            base_filename = unquote(
                os.path.splitext(os.path.basename(a.attrs["href"]))[0])

            if base_filename == "Part VIIIA Nov 20 updated":
                # November 2020 has a different filename.  In general we want to be
                # warned (through the scraper crashing) about updates (because we have
                # to delete all records for the month in question, and reimport) so
                # special-casing is appropriate here.
                year, month = "2020", 11

            else:
                words = re.split(r"[ -]+", base_filename)
                month_name, year = words[-2:]

                # We have seen the last token in `words` be "19_0".  The year is
                # reported to us via Slack, so if we pull out some nonsense here we
                # *should* notice.
                # NB: raw string here — "\d" in a plain string is an invalid
                # escape sequence and warns on recent Pythons.
                year = re.match(r"\d+", year).group()
                if len(year) == 2:
                    year = "20" + year

                # We have seen the month be `September`, `Sept`, and `Sep`.
                month_abbr = month_name.lower()[:3]
                month = month_abbrs.index(month_abbr)

            date = datetime.date(int(year), month, 1)
            # Skip months we have already imported.
            if ImportLog.objects.filter(category="tariff",
                                        current_at=date).exists():
                continue

            xls_url = urljoin(url, a.attrs["href"])
            xls_file = BytesIO(requests.get(xls_url).content)

            import_month(xls_file, date)
            imported_months.append((year, month))

        if imported_months:
            client = Client("dmd")
            client.upload_model(TariffPrice)

            for year, month in imported_months:
                msg = "Imported Drug Tariff for %s_%s" % (year, month)
                notify_slack(msg)
        else:
            msg = "Found no new tariff data to import"
            notify_slack(msg)
Example 2
0
    def handle(self, *args, **kwargs):
        """Scrape the NHSBSA Drug Tariff Part VIIIA index page and import any
        month's spreadsheet not already recorded in ImportLog.

        After importing, uploads TariffPrice to BigQuery and posts a Slack
        message per imported month (or a "nothing new" message).
        """
        url = 'https://www.nhsbsa.nhs.uk/pharmacies-gp-practices-and-appliance-contractors/drug-tariff/drug-tariff-part-viii/'
        rsp = requests.get(url)
        doc = bs4.BeautifulSoup(rsp.content, 'html.parser')

        # Element 0 of both sequences is "", so real months index from 1..12.
        month_names = [x.lower() for x in calendar.month_name]
        month_abbrs = [x.lower() for x in calendar.month_abbr]

        imported_months = []

        for a in doc.findAll('a', href=re.compile('Part%20VIIIA')):
            # a.attrs['href'] typically has a filename part like
            # Part%20VIIIA%20September%202017.xlsx
            #
            # We split that into ['Part', 'VIIIA', 'September', '2017']
            words = re.split(
                r'[ -]+',
                urllib.unquote(os.path.splitext(
                    os.path.basename(a.attrs['href']))[0]))
            month_name, year = words[-2:]

            # We have seen the last token in `words` be "19_0".  The year is
            # reported to us via Slack, so if we pull out some nonsense here we
            # *should* notice.
            # NB: raw string here — '\d' in a plain string is an invalid
            # escape sequence and warns on recent Pythons.
            year = re.match(r'\d+', year).group()
            if len(year) == 2:
                year = "20" + year

            try:
                month = month_names.index(month_name.lower())
            except ValueError:
                # We have seen the month be `September`, `Sept`, and `Sep`;
                # truncate to three letters so `Sept` matches the abbreviation
                # (the full lowered token would never be found in month_abbrs).
                month = month_abbrs.index(month_name.lower()[:3])

            date = datetime.date(int(year), month, 1)

            # Skip months we have already imported.
            if ImportLog.objects.filter(
                category='tariff',
                current_at=date
            ).exists():
                continue

            xls_url = urljoin(url, a.attrs['href'])
            xls_file = StringIO(requests.get(xls_url).content)

            import_month(xls_file, date)
            imported_months.append((year, month))

        if imported_months:
            client = Client('dmd')
            client.upload_model(TariffPrice)

            for year, month in imported_months:
                msg = 'Imported Drug Tariff for %s_%s' % (year, month)
                notify_slack(msg)
        else:
            msg = 'Found no new tariff data to import'
            notify_slack(msg)
Example 3
0
    def upload_to_bq(self):
        """Upload every dm+d model to BigQuery, then build the denormalised
        ``*_full`` table for each of the four core object types."""
        bq_client = Client("dmd")

        # Straight per-model uploads.
        for dmd_model in apps.get_app_config("dmd").get_models():
            bq_client.upload_model(dmd_model)

        # Denormalised tables, each populated from a class-specific query.
        for cls in (VMP, AMP, VMPP, AMPP):
            full_table = bq_client.get_table("{}_full".format(cls.obj_type))
            full_table.insert_rows_from_query(self.sql_for_cls(cls))
Example 4
0
    def handle(self, *args, **kwargs):
        """Scrape the NHSBSA Drug Tariff Part VIIIA index page and import any
        month's spreadsheet not already recorded in ImportLog.

        After importing, uploads TariffPrice to BigQuery and posts a Slack
        message per imported month (or a "nothing new" message).
        """
        host = 'https://www.nhsbsa.nhs.uk'
        index = (host + '/pharmacies-gp-practices-and-appliance-contractors/'
                 'drug-tariff/drug-tariff-part-viii/')
        rsp = requests.get(index)
        doc = bs4.BeautifulSoup(rsp.content, 'html.parser')

        # Element 0 of both sequences is "", so real months index from 1..12.
        month_names = [x.lower() for x in calendar.month_name]
        month_abbrs = [x.lower() for x in calendar.month_abbr]

        imported_months = []

        for a in doc.findAll('a', href=re.compile('Part%20VIIIA')):
            # a.attrs['href'] typically has a filename part like
            # Part%20VIIIA%20September%202017.xlsx
            #
            # We split that into ['Part', 'VIIIA', 'September', '2017']
            words = re.split(
                r'[ -]+',
                urllib.unquote(
                    os.path.splitext(os.path.basename(a.attrs['href']))[0]))
            month_name, year = words[-2:]
            if len(year) == 2:
                year = "20" + year
            # Validate every year, not just two-digit ones, and raise
            # explicitly rather than via `assert` (which is stripped when
            # Python runs with -O).
            if not ("2000" <= year <= str(datetime.date.today().year)):
                raise ValueError('Unexpected year in filename: %s' % year)
            try:
                month = month_names.index(month_name.lower())
            except ValueError:
                # We have seen the month be `September`, `Sept`, and `Sep`;
                # fall back to the three-letter abbreviation.
                month = month_abbrs.index(month_name.lower()[:3])
            date = datetime.date(int(year), month, 1)

            # Skip months we have already imported.
            if ImportLog.objects.filter(category='tariff',
                                        current_at=date).exists():
                continue

            xls_url = urljoin(index, a.attrs['href'])
            xls_file = StringIO(requests.get(xls_url).content)

            import_month(xls_file, date)
            imported_months.append((year, month))

        if imported_months:
            client = Client('dmd')
            client.upload_model(TariffPrice)

            for year, month in imported_months:
                msg = 'Imported Drug Tariff for %s_%s' % (year, month)
                notify_slack(msg)
        else:
            msg = 'Found no new tariff data to import'
            notify_slack(msg)
Example 5
0
    def test_upload_model(self):
        """Check that upload_model round-trips one row from each dm+d table
        into its BigQuery counterpart."""
        client = Client('dmd')

        # Create a single related row in each of the four tables we upload.
        DMDProduct.objects.create(
            dmdid=327368008,
            vpid=327368008,
            bnf_code='0803042A0AAABAB',
            name='Bicalutamide 150mg tablets',
        )
        DMDVmpp.objects.create(
            vppid=1206011000001108,
            vpid=327368008,
            nm='Bicalutamide 150mg tablets 28 tablet',
        )
        NCSOConcession.objects.create(
            date='2017-11-01',
            drug='Bicalutamide 150mg tablets',
            pack_size=28,
            price_concession_pence=499,
            vmpp_id=1206011000001108,
        )
        TariffPrice.objects.create(
            date='2017-11-01',
            vmpp_id=1206011000001108,
            product_id=327368008,
            price_pence=422,
            tariff_category_id=11,
        )

        for model in [DMDProduct, DMDVmpp, NCSOConcession, TariffPrice]:
            client.upload_model(model)

        # Each BQ table should now hold exactly the one row we created, with
        # the spot-checked column intact.
        checks = [
            ('product', 'name', 'Bicalutamide 150mg tablets'),
            ('vmpp', 'nm', 'Bicalutamide 150mg tablets 28 tablet'),
            ('ncsoconcession', 'drug', 'Bicalutamide 150mg tablets'),
            ('tariffprice', 'price_pence', 422),
        ]
        for table_name, column, expected in checks:
            rows = list(client.get_table(table_name).get_rows_as_dicts())
            self.assertEqual(len(rows), 1)
            self.assertEqual(rows[0][column], expected)
Example 6
0
    def test_upload_model(self):
        """Uploading VMPP should land all four fixture rows in BQ with
        faithful field values."""
        client = Client("dmd")
        client.upload_model(VMPP)
        rows = list(client.get_table("vmpp").get_rows_as_dicts())
        self.assertEqual(len(rows), 4)

        expected = {
            "invalid": False,
            "nm": "Chlortalidone 50mg tablets 28 tablet",
            "vmp": 317935006,
            "qtyval": Decimal("28.00"),
            "qty_uom": 428673006,
            "combpack": None,
            "bnf_code": "0202010F0AAAAAA",
        }
        # Only the row whose fixture values we know gets field-level checks.
        for row in rows:
            if row["id"] == 1079211000001106:
                for field, value in expected.items():
                    self.assertEqual(row[field], value)
Example 7
0
    def test_upload_model(self):
        """upload_model(VMPP) should write all four fixture rows to BQ."""
        client = Client('dmd')
        client.upload_model(VMPP)
        table = client.get_table('vmpp')
        rows = list(table.get_rows_as_dicts())
        self.assertEqual(len(rows), 4)

        # Check field-by-field fidelity for the one row we know the
        # fixture values of.
        known_rows = [r for r in rows if r['id'] == 1079211000001106]
        for row in known_rows:
            expected_items = [
                ('invalid', False),
                ('nm', 'Chlortalidone 50mg tablets 28 tablet'),
                ('vmp', 317935006),
                ('qtyval', Decimal('28.00')),
                ('qty_uom', 428673006),
                ('combpack', None),
                ('bnf_code', '0202010F0AAAAAA'),
            ]
            for field, value in expected_items:
                self.assertEqual(row[field], value)
Example 8
0
 def upload_to_bq(self):
     """Upload every model in the dmd2 app to BigQuery."""
     bq_client = Client("dmd")
     for dmd_model in apps.get_app_config("dmd2").get_models():
         bq_client.upload_model(dmd_model)
Example 9
0
 def upload_to_bq(self):
     """Push each model from the dmd2 app up to BigQuery."""
     client = Client('dmd')
     models = list(apps.get_app_config('dmd2').get_models())
     for model in models:
         client.upload_model(model)
Example 10
0
def run_end_to_end():
    """Run the full import pipeline end to end for two consecutive months
    (2017-09 and 2017-10) against test fixtures, asserting row counts in
    the resulting tables and materialised views after each run.
    """
    print('BQ_NONCE: {}'.format(settings.BQ_NONCE))

    call_command('migrate')

    path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                        'commands', 'measure_definitions')

    # No MeasureGlobals or MeasureValues are generated for the ghost branded
    # generics measure, because both numerator and denominator are computed
    # from a view (vw__ghost_generic_measure) which has no data.  Rather than
    # populate this view, it is simpler to pretend it doesn't exist.
    num_measures = len(os.listdir(path)) - 1

    # Start from a clean slate: local pipeline data, the import log, any
    # blobs in the test bucket, and fresh BQ datasets.
    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)

    with open(settings.PIPELINE_IMPORT_LOG_PATH, 'w') as f:
        f.write('{}')

    for blob in StorageClient().bucket().list_blobs():
        blob.delete()

    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()

    client = BQClient('hscic')
    client.create_table('bnf', schemas.BNF_SCHEMA)
    client.create_table('ccgs', schemas.CCG_SCHEMA)
    client.create_table('ppu_savings', schemas.PPU_SAVING_SCHEMA)
    client.create_table(
        'practice_statistics',
        schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table(
        'practice_statistics_all_years',
        schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table('practices', schemas.PRACTICE_SCHEMA)
    client.create_table('prescribing', schemas.PRESCRIBING_SCHEMA)
    client.create_table('presentation', schemas.PRESENTATION_SCHEMA)
    client.create_table('tariff', schemas.TARIFF_SCHEMA)
    client.create_table('bdz_adq', schemas.BDZ_ADQ_SCHEMA)

    client = BQClient('measures')
    # This is enough of a schema to allow the practice_data_all_low_priority
    # table to be created, since it references these fields.  Once populated by
    # import_measures, the tables in the measures dataset will have several
    # more fields.  But we don't need to specify exactly what they are, as BQ
    # will work it out when the data is inserted with insert_rows_from_query.
    measures_schema = build_schema(
        ('month', 'DATE'),
        ('practice_id', 'STRING'),
        ('numerator', 'INTEGER'),
        ('denominator', 'INTEGER'),
    )
    # Distinct names for the glob pattern and the loop variable: the
    # original code reused `path` for both, shadowing the measure
    # definitions directory bound above.
    measures_glob = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                                 'commands', 'measure_definitions', '*.json')
    for measure_path in glob.glob(measures_glob):
        measure_id = os.path.splitext(os.path.basename(measure_path))[0]
        client.create_table('practice_data_' + measure_id, measures_schema)
        client.create_table('ccg_data_' + measure_id, measures_schema)
        client.create_table('global_data_' + measure_id, measures_schema)

    # Although there are no model instances, we call upload_model to create the
    # tables in BQ that might be required by certain measure views.
    client = BQClient('dmd')
    client.upload_model(TariffPrice)
    client.upload_model(VMPP)

    call_command('generate_presentation_replacements')

    path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                        'commands', 'replace_matviews.sql')
    with open(path) as f:
        with connection.cursor() as c:
            c.execute(f.read())

    # First month of fixture data.
    copy_tree(
        os.path.join(e2e_path, 'data-1'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 9, under_test=True)

    # We expect one MeasureGlobal per measure per month.
    assert_count_equal(num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    # (There are 4 practices, 2 CCGs, 2 STPs, and 2 regional teams).
    assert_count_equal(10 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(2, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(2, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(4, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(1, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(2, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")

    # Second month of fixture data; all counts below should double.
    copy_tree(
        os.path.join(e2e_path, 'data-2'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 10, under_test=True)

    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    assert_count_equal(20 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(4, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(8, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(8, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(2, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(4, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")
def run_end_to_end():
    """Run the full import pipeline end to end for two consecutive months
    (2017-09 and 2017-10) against test fixtures, asserting expected
    MeasureGlobal/MeasureValue counts after each run.
    """
    print("BQ_NONCE: {}".format(settings.BQ_NONCE))

    call_command("migrate")

    # No MeasureGlobals or MeasureValues are generated for the ghost branded
    # generics measure, because both numerator and denominator are computed
    # from a view (vw__ghost_generic_measure) which has no data.  Rather than
    # populate this view, it is simpler to pretend it doesn't exist.
    num_measures = (len(
        glob.glob(os.path.join(settings.MEASURE_DEFINITIONS_PATH, "*.json"))) -
                    1)

    # Start from a clean slate: local pipeline data, the import log, any
    # blobs in the test bucket, and fresh BQ datasets.
    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)

    with open(settings.PIPELINE_IMPORT_LOG_PATH, "w") as f:
        f.write("{}")

    for blob in StorageClient().bucket().list_blobs():
        blob.delete()

    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()

    # Empty tables in the hscic dataset, one per schema the pipeline fills.
    client = BQClient("hscic")
    client.create_table("bnf", schemas.BNF_SCHEMA)
    client.create_table("ccgs", schemas.CCG_SCHEMA)
    client.create_table("stps", schemas.STP_SCHEMA)
    client.create_table("regional_teams", schemas.REGIONAL_TEAM_SCHEMA)
    client.create_table("ppu_savings", schemas.PPU_SAVING_SCHEMA)
    client.create_table("practice_statistics",
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table("practice_statistics_all_years",
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table("practices", schemas.PRACTICE_SCHEMA)
    client.create_table("prescribing", schemas.PRESCRIBING_SCHEMA)
    client.create_table("presentation", schemas.PRESENTATION_SCHEMA)
    client.create_table("tariff", schemas.TARIFF_SCHEMA)
    client.create_table("bdz_adq", schemas.BDZ_ADQ_SCHEMA)

    client = BQClient("measures")
    # This is enough of a schema to allow the practice_data_all_low_priority
    # table to be created, since it references these fields.  Once populated by
    # import_measures, the tables in the measures dataset will have several
    # more fields.  But we don't need to specify exactly what they are, as BQ
    # will work it out when the data is inserted with insert_rows_from_query.
    measures_schema = build_schema(
        ("month", "DATE"),
        ("practice_id", "STRING"),
        ("numerator", "INTEGER"),
        ("denominator", "INTEGER"),
    )

    # Three tables (practice/ccg/global) per measure definition file.
    for path in glob.glob(
            os.path.join(settings.MEASURE_DEFINITIONS_PATH, "*.json")):
        measure_id = os.path.splitext(os.path.basename(path))[0]
        client.create_table("practice_data_" + measure_id, measures_schema)
        client.create_table("ccg_data_" + measure_id, measures_schema)
        client.create_table("global_data_" + measure_id, measures_schema)

    # Although there are no model instances, we call upload_model to create the
    # dm+d tables in BQ that are required by certain measure views.
    client = BQClient("dmd")
    for model in apps.get_app_config("dmd2").get_models():
        client.upload_model(model)

    call_command("generate_presentation_replacements")

    # First month of fixture data.
    copy_tree(os.path.join(e2e_path, "data-1"), os.path.join(e2e_path, "data"))

    runner.run_all(2017, 9, under_test=True)

    # We expect one MeasureGlobal per measure per month.
    assert_count_equal(num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    # (There are 4 practices, 2 CCGs, 2 STPs, and 2 regional teams).
    assert_count_equal(10 * num_measures, MeasureValue)

    # Second month of fixture data; the counts below should double.
    copy_tree(os.path.join(e2e_path, "data-2"), os.path.join(e2e_path, "data"))

    runner.run_all(2017, 10, under_test=True)

    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    assert_count_equal(20 * num_measures, MeasureValue)