Beispiel #1
0
    def backup_table(self, table_name):
        client = Client("hscic")
        sql = "SELECT max(month) FROM {hscic}.%s" % table_name
        latest_date = client.query(sql).rows[0][0]
        latest_year_and_month = latest_date.strftime("%Y_%m")
        table = client.get_table(table_name)

        storage_client = StorageClient()
        bucket = storage_client.bucket()
        year_and_months = set()

        prefix_base = "backups/{}/".format(table_name)

        for blob in bucket.list_blobs(prefix=prefix_base):
            match = re.search("/(\d{4}_\d{2})/", blob.name)
            year_and_months.add(match.groups()[0])

        if latest_year_and_month in year_and_months:
            print("{} table already backed up for {}".format(
                table_name, latest_year_and_month))
            return

        storage_prefix = "{}/{}/{}-".format(prefix_base, latest_year_and_month,
                                            table_name)
        exporter = TableExporter(table, storage_prefix)
        exporter.export_to_storage()
    def test_data_is_aggregated(self):
        # there are 11 rows in the input file; 2 are for the same
        # practice/presentation and should be collapsed, and 1 is for
        # an UNKNONWN SURGERY (see issue #349)

        raw_data_path = ("frontend/tests/fixtures/commands/" +
                         "convert_hscic_prescribing/2016_01/" +
                         "EPD_201601.csv")
        gcs_path = "hscic/prescribing_v2/2016_01/EPD_201601.csv"

        client = StorageClient()
        bucket = client.get_bucket()
        blob = bucket.blob(gcs_path)

        with open(raw_data_path, "rb") as f:
            blob.upload_from_file(f)

        call_command("convert_hscic_prescribing", filename=raw_data_path)

        # Test that data added to prescribing table
        client = BQClient()
        sql = """SELECT *
        FROM {hscic}.prescribing_v2
        WHERE month = TIMESTAMP('2016-01-01')"""

        rows = list(results_to_dicts(client.query(sql)))
        self.assertEqual(len(rows), 9)
        for row in rows:
            if row["practice"] == "P92042" and row[
                    "bnf_code"] == "0202010B0AAABAB":
                self.assertEqual(row["quantity"], 1288)
    def tearDown(self):
        client = Client('test')
        client.delete_dataset()

        client = StorageClient()
        bucket = client.bucket()
        for blob in bucket.list_blobs(prefix=self.storage_prefix):
            blob.delete()
Beispiel #4
0
    def setUpClass(cls):
        if 'SKIP_BQ_LOAD' not in os.environ:
            # Create local test data from fixtures, then upload this to a
            # test project in bigquery
            call_command('loaddata',
                         'frontend/tests/fixtures/ccgs.json',
                         verbosity=0)
            call_command('loaddata',
                         'frontend/tests/fixtures/practices.json',
                         verbosity=0)
            call_command('loaddata',
                         'frontend/tests/fixtures/practice_listsizes.json',
                         verbosity=0)
            prescribing_fixture_path = os.path.join(
                'frontend', 'tests', 'fixtures', 'commands',
                'prescribing_bigquery_views_fixture.csv'
            )

            client = Client('hscic')

            for table_name in [
                    'normalised_prescribing_standard',
                    'normalised_prescribing_legacy']:
                table = client.get_or_create_table(
                    table_name,
                    PRESCRIBING_SCHEMA
                )
                table.insert_rows_from_csv(prescribing_fixture_path)

            table = client.get_or_create_table('ccgs', CCG_SCHEMA)
            columns = [field.name for field in CCG_SCHEMA]
            table.insert_rows_from_pg(PCT, columns, ccgs_transform)

            table = client.get_or_create_table(
                'practice_statistics',
                PRACTICE_STATISTICS_SCHEMA
            )
            columns = [field.name for field in PRACTICE_STATISTICS_SCHEMA]
            columns[0] = 'date'
            columns[-1] = 'practice_id'
            table.insert_rows_from_pg(
                PracticeStatistics,
                columns,
                statistics_transform
            )

            client = StorageClient()
            bucket = client.get_bucket()
            for blob in bucket.list_blobs(prefix='hscic/views/vw__'):
                blob.delete()

        ImportLog.objects.create(
            category='prescribing', current_at='2015-10-01')
        # Create view tables and indexes
        with open(
                'frontend/management/commands/replace_matviews.sql', 'r') as f:
            with connection.cursor() as c:
                c.execute(f.read())
    def test_existing_files_deleted(self):
        # Create a dataset fragment which should end up being deleted
        client = StorageClient()
        bucket = client.get_bucket()
        blob_name = ('hscic/views/vw__presentation_summary_by_ccg'
                     '-000000009999.csv.gz')
        blob = bucket.blob(blob_name)
        blob.upload_from_string("test", content_type="application/gzip")

        # Run import command
        call_command('create_views')

        # Check the bucket is no longer there
        client = StorageClient()
        bucket = client.get_bucket()
        prefix, suffix = blob_name.split('-')
        for blob in bucket.list_blobs(prefix=prefix):
            self.assertNotIn(suffix, blob.path)
def update_bnf_table():
    """Update `bnf` table from cloud-stored CSV"""
    storage_client = StorageClient()
    bucket = storage_client.get_bucket()
    blobs = bucket.list_blobs(prefix="hscic/bnf_codes/")
    blobs = sorted(blobs, key=lambda blob: blob.name, reverse=True)
    blob = blobs[0]

    bq_client = BQClient("hscic")
    table = bq_client.get_table("bnf")
    table.insert_rows_from_storage(blob.name, skip_leading_rows=1)
Beispiel #7
0
def upload_task_input_files(task):
    storage_client = StorageClient()
    bucket = storage_client.get_bucket()

    for path in task.input_paths():
        assert path[0] == '/'
        assert settings.PIPELINE_DATA_BASEDIR[-1] == '/'
        name = 'hscic' + path.replace(settings.PIPELINE_DATA_BASEDIR, '/')
        blob = bucket.blob(name)
        if blob.exists():
            print("Skipping %s, already uploaded" % name)
            continue
        print("Uploading %s to %s" % (path, name))
        with open(path) as f:
            blob.upload_from_file(f)
    def test_data_is_aggregated(self):
        # there are 11 rows in the input file; 2 are for the same
        # practice/presentation and should be collapsed, and 1 is for
        # an UNKNONWN SURGERY (see issue #349)

        raw_data_path = 'frontend/tests/fixtures/commands/' +\
            'convert_hscic_prescribing/2016_01/' +\
            'Detailed_Prescribing_Information.csv'
        converted_data_path = 'frontend/tests/fixtures/commands/' +\
            'convert_hscic_prescribing/2016_01/' +\
            'Detailed_Prescribing_Information_formatted.CSV'
        gcs_path = 'hscic/prescribing/2016_01/' +\
            'Detailed_Prescribing_Information.csv'

        client = StorageClient()
        bucket = client.get_bucket()
        blob = bucket.blob(gcs_path)

        with open(raw_data_path) as f:
            blob.upload_from_file(f)

        call_command('convert_hscic_prescribing', filename=raw_data_path)

        # Test that data added to prescribing table
        client = BQClient()
        sql = '''SELECT *
        FROM {hscic}.prescribing
        WHERE month = TIMESTAMP('2016-01-01')'''

        rows = list(results_to_dicts(client.query(sql)))
        self.assertEqual(len(rows), 9)
        for row in rows:
            if row['practice'] == 'P92042' and \
                    row['bnf_code'] == '0202010B0AAABAB':
                self.assertEqual(row['quantity'], 1288)

        # Test that downloaded data is correct
        with open(converted_data_path) as f:
            rows = list(csv.reader(f))

        self.assertEqual(len(rows), 9)
        for row in rows:
            if row[1] == 'P92042' and row[2] == '0202010B0AAABAB':
                self.assertEqual(row[6], '1288')
Beispiel #9
0
    def create_storage_backed_table(self, table_id, schema, gcs_path):
        gcs_client = StorageClient()
        bucket = gcs_client.bucket()
        if bucket.get_blob(gcs_path) is None:
            raise RuntimeError("Could not find blob at {}".format(gcs_path))

        gcs_uri = "gs://{}/{}".format(self.project, gcs_path)
        schema_as_dict = [{
            "name": s.name,
            "type": s.field_type.lower()
        } for s in schema]
        resource = {
            "tableReference": {
                "tableId": table_id
            },
            "externalDataConfiguration": {
                "csvOptions": {
                    "skipLeadingRows": "1"
                },
                "sourceFormat": "CSV",
                "sourceUris": [gcs_uri],
                "schema": {
                    "fields": schema_as_dict
                },
            },
        }

        path = "/projects/{}/datasets/{}/tables".format(
            self.project, self.dataset_id)

        try:
            self.gcbq_client._connection.api_request(method="POST",
                                                     path=path,
                                                     data=resource)
        except NotFound as e:
            if not dataset_is_missing(e):
                raise
            self.create_dataset()
            self.gcbq_client._connection.api_request(method="POST",
                                                     path=path,
                                                     data=resource)

        return self.get_table(table_id)
Beispiel #10
0
    def create_storage_backed_table(self, table_id, schema, gcs_path):
        gcs_client = StorageClient()
        bucket = gcs_client.bucket()
        if bucket.get_blob(gcs_path) is None:
            raise RuntimeError('Could not find blob at {}'.format(gcs_path))

        gcs_uri = 'gs://{}/{}'.format(self.project, gcs_path)
        schema_as_dict = [{'name': s.name, 'type': s.field_type.lower()} for s in schema]
        resource = {
            'tableReference': {'tableId': table_id},
            'externalDataConfiguration': {
                'csvOptions': {'skipLeadingRows': '1'},
                'sourceFormat': 'CSV',
                'sourceUris': [gcs_uri],
                'schema': {'fields': schema_as_dict}
            }
        }

        path = '/projects/{}/datasets/{}/tables'.format(
            self.project,
            self.dataset_id
        )

        try:
            self.gcbq_client._connection.api_request(
                method='POST',
                path=path,
                data=resource
            )
        except NotFound as e:
            if not dataset_is_missing(e):
                raise
            self.create_dataset()
            self.gcbq_client._connection.api_request(
                method='POST',
                path=path,
                data=resource
            )

        return self.get_table(table_id)
Beispiel #11
0
def run_end_to_end():
    print('BQ_NONCE: {}'.format(settings.BQ_NONCE))

    call_command('migrate')

    path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                        'commands', 'measure_definitions')

    # No MeasureGlobals or MeasureValues are generated for the ghost branded
    # generics measure, because both numerator and denominator are computed
    # from a view (vw__ghost_generic_measure) which has no data.  Rather than
    # populate this view, it is simpler to pretend it doesn't exist.
    num_measures = len(os.listdir(path)) - 1

    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)

    with open(settings.PIPELINE_IMPORT_LOG_PATH, 'w') as f:
        f.write('{}')

    for blob in StorageClient().bucket().list_blobs():
        blob.delete()

    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()

    client = BQClient('hscic')
    client.create_table('bnf', schemas.BNF_SCHEMA)
    client.create_table('ccgs', schemas.CCG_SCHEMA)
    client.create_table('ppu_savings', schemas.PPU_SAVING_SCHEMA)
    client.create_table(
        'practice_statistics',
        schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table(
        'practice_statistics_all_years',
        schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table('practices', schemas.PRACTICE_SCHEMA)
    client.create_table('prescribing', schemas.PRESCRIBING_SCHEMA)
    client.create_table('presentation', schemas.PRESENTATION_SCHEMA)
    client.create_table('tariff', schemas.TARIFF_SCHEMA)
    client.create_table('bdz_adq', schemas.BDZ_ADQ_SCHEMA)

    client = BQClient('measures')
    # This is enough of a schema to allow the practice_data_all_low_priority
    # table to be created, since it references these fields.  Once populated by
    # import_measures, the tables in the measures dataset will have several
    # more fields.  But we don't need to specify exactly what they are, as BQ
    # will work it out when the data is inserted with insert_rows_from_query.
    measures_schema = build_schema(
        ('month', 'DATE'),
        ('practice_id', 'STRING'),
        ('numerator', 'INTEGER'),
        ('denominator', 'INTEGER'),
    )
    path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                        'commands', 'measure_definitions', '*.json')
    for path in glob.glob(path):
        measure_id = os.path.splitext(os.path.basename(path))[0]
        client.create_table('practice_data_' + measure_id, measures_schema)
        client.create_table('ccg_data_' + measure_id, measures_schema)
        client.create_table('global_data_' + measure_id, measures_schema)

    # Although there are no model instances, we call upload_model to create the
    # tables in BQ that might be required by certain measure views.
    client = BQClient('dmd')
    client.upload_model(TariffPrice)
    client.upload_model(VMPP)

    call_command('generate_presentation_replacements')

    path = os.path.join(settings.APPS_ROOT, 'frontend', 'management',
                        'commands', 'replace_matviews.sql')
    with open(path) as f:
        with connection.cursor() as c:
            c.execute(f.read())

    copy_tree(
        os.path.join(e2e_path, 'data-1'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 9, under_test=True)

    # We expect one MeasureGlobal per measure per month.
    assert_count_equal(num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    # (There are 4 practices, 2 CCGs, 2 STPs, and 2 regional teams).
    assert_count_equal(10 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(2, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(2, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(4, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(1, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(2, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")

    copy_tree(
        os.path.join(e2e_path, 'data-2'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 10, under_test=True)

    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    assert_count_equal(20 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(4, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(8, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(8, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(2, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(4, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")
Beispiel #12
0
 def __init__(self, table, storage_prefix):
     self.table = table
     self.storage_prefix = storage_prefix
     storage_client = StorageClient()
     self.bucket = storage_client.bucket()
    def setUpClass(cls):
        if 'SKIP_BQ_LOAD' not in os.environ:
            # Create local test data from fixtures, then upload this to a
            # test project in bigquery
            call_command('loaddata',
                         'frontend/tests/fixtures/orgs.json',
                         verbosity=0)
            call_command('loaddata',
                         'frontend/tests/fixtures/practices.json',
                         verbosity=0)
            call_command('loaddata',
                         'frontend/tests/fixtures/practice_listsizes.json',
                         verbosity=0)

            client = Client('hscic')

            table = client.get_or_create_table('prescribing',
                                               PRESCRIBING_SCHEMA)
            prescribing_fixture_path = os.path.join(
                'frontend', 'tests', 'fixtures', 'commands',
                'prescribing_bigquery_views_fixture.csv')
            table.insert_rows_from_csv(prescribing_fixture_path)

            table = client.get_or_create_table('ccgs', CCG_SCHEMA)
            columns = [field.name for field in CCG_SCHEMA]
            table.insert_rows_from_pg(PCT, columns, ccgs_transform)

            table = client.get_or_create_table('practices', PRACTICE_SCHEMA)
            columns = [field.name for field in PRACTICE_SCHEMA]
            table.insert_rows_from_pg(Practice, columns)

            table = client.get_or_create_table('practice_statistics',
                                               PRACTICE_STATISTICS_SCHEMA)
            columns = [field.name for field in PRACTICE_STATISTICS_SCHEMA]
            columns[0] = 'date'
            columns[-1] = 'practice_id'
            table.insert_rows_from_pg(PracticeStatistics, columns,
                                      statistics_transform)

            sql = """
            SELECT
              prescribing.sha AS sha,
              practices.ccg_id AS pct,
              prescribing.practice AS practice,
              prescribing.bnf_code AS bnf_code,
              prescribing.bnf_name AS bnf_name,
              prescribing.items AS items,
              prescribing.net_cost AS net_cost,
              prescribing.actual_cost AS actual_cost,
              prescribing.quantity AS quantity,
              prescribing.month AS month
            FROM
              {project}.{hscic}.prescribing AS prescribing
            INNER JOIN
              {project}.{hscic}.practices  AS practices
            ON practices.code = prescribing.practice
            """

            try:
                client.delete_table('normalised_prescribing_standard')
            except NotFound:
                pass

            client.create_table_with_view('normalised_prescribing_standard',
                                          sql, False)

            client = StorageClient()
            bucket = client.get_bucket()
            for blob in bucket.list_blobs(prefix='hscic/views/vw__'):
                blob.delete()
        else:
            assert 'BQ_NONCE' in os.environ, 'Set BQ_NONCE to reuse BQ data'

        ImportLog.objects.create(category='prescribing',
                                 current_at='2015-10-01')

        # Create view tables and indexes
        with open('frontend/management/commands/replace_matviews.sql') as f:
            with connection.cursor() as c:
                c.execute(f.read())
def run_end_to_end():
    print('BQ_NONCE: {}'.format(settings.BQ_NONCE))

    num_measures = 57

    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)

    with open(settings.PIPELINE_IMPORT_LOG_PATH, 'w') as f:
        f.write('{}')

    for blob in StorageClient().bucket().list_blobs():
        blob.delete()

    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()

    client = BQClient('hscic')
    client.create_table('bnf', schemas.BNF_SCHEMA)
    client.create_table('ccgs', schemas.CCG_SCHEMA)
    client.create_table('ppu_savings', schemas.PPU_SAVING_SCHEMA)
    client.create_table('practice_statistics',
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table('practice_statistics_all_years',
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table('practices', schemas.PRACTICE_SCHEMA)
    client.create_table('prescribing', schemas.PRESCRIBING_SCHEMA)
    client.create_table('presentation', schemas.PRESENTATION_SCHEMA)
    client.create_table('tariff', schemas.TARIFF_SCHEMA)

    client = BQClient('measures')
    # This is enough of a schema to allow the practice_data_all_low_priority
    # table to be created, since it references these fields.  Once populated by
    # import_measures, the tables in the measures dataset will have several
    # more fields.  But we don't need to specify exactly what they are, as BQ
    # will work it out when the data is inserted with insert_rows_from_query.
    measures_schema = build_schema(
        ('month', 'DATE'),
        ('practice_id', 'STRING'),
        ('numerator', 'INTEGER'),
        ('denominator', 'INTEGER'),
    )
    path = os.path.join(settings.SITE_ROOT, 'frontend', 'management',
                        'commands', 'measure_definitions', '*.json')
    for path in glob.glob(path):
        measure_id = os.path.splitext(os.path.basename(path))[0]
        client.create_table('practice_data_' + measure_id, measures_schema)
        client.create_table('ccg_data_' + measure_id, measures_schema)
        client.create_table('global_data_' + measure_id, measures_schema)

    call_command('generate_presentation_replacements')

    path = os.path.join(settings.SITE_ROOT, 'frontend', 'management',
                        'commands', 'replace_matviews.sql')
    with open(path) as f:
        with connection.cursor() as c:
            c.execute(f.read())

    copy_tree(
        os.path.join(e2e_path, 'data-1'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 9, under_test=True)

    # We expect one MeasureGlobal per measure per month.  If this assert fails,
    # check that num_measures is still correct.
    assert_count_equal(num_measures, MeasureGlobal)

    # We expect one MeasureValue for each CCG or Practice per measure per month
    assert_count_equal(6 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(2, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(2, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(4, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(1, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(2, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")

    copy_tree(
        os.path.join(e2e_path, 'data-2'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 10, under_test=True)

    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)

    # We expect one MeasureValue for each CCG or Practice per measure per month
    assert_count_equal(12 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(4, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(8, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(8, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(2, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(4, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")
Beispiel #15
0
def run_end_to_end():
    print('BQ_NONCE: {}'.format(settings.BQ_NONCE))

    num_measures = 56

    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)

    with open(settings.PIPELINE_IMPORT_LOG_PATH, 'w') as f:
        f.write('{}')

    for blob in StorageClient().bucket().list_blobs():
        blob.delete()

    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()

    client = BQClient('hscic')
    client.create_table('bnf', schemas.BNF_SCHEMA)
    client.create_table('ccgs', schemas.CCG_SCHEMA)
    client.create_table('ppu_savings', schemas.PPU_SAVING_SCHEMA)
    client.create_table(
        'practice_statistics',
        schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table(
        'practice_statistics_all_years',
        schemas.PRACTICE_STATISTICS_SCHEMA
    )
    client.create_table('practices', schemas.PRACTICE_SCHEMA)
    client.create_table('prescribing', schemas.PRESCRIBING_SCHEMA)
    client.create_table('presentation', schemas.PRESENTATION_SCHEMA)
    client.create_table('tariff', schemas.TARIFF_SCHEMA)

    call_command('generate_presentation_replacements')

    path = os.path.join(settings.SITE_ROOT, 'frontend', 'management',
                        'commands', 'replace_matviews.sql')
    with open(path) as f:
        with connection.cursor() as c:
            c.execute(f.read())

    copy_tree(
        os.path.join(e2e_path, 'data-1'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 9, under_test=True)

    # We expect one MeasureGlobal per measure per month.  If this assert fails,
    # check that num_measures is still correct.
    assert_count_equal(num_measures, MeasureGlobal)

    # We expect one MeasureValue for each CCG or Practice per measure per month
    assert_count_equal(6 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(2, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(2, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(4, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(1, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(2, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")

    copy_tree(
        os.path.join(e2e_path, 'data-2'),
        os.path.join(e2e_path, 'data'),
    )

    runner.run_all(2017, 10, under_test=True)

    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)

    # We expect one MeasureValue for each CCG or Practice per measure per month
    assert_count_equal(12 * num_measures, MeasureValue)

    # We expect one statistic per CCG per month
    assert_raw_count_equal(4, 'vw__ccgstatistics')

    # We expect one chemical summary per CCG per month
    assert_raw_count_equal(4, 'vw__chemical_summary_by_ccg',
                           "chemical_id = '1001030C0'")

    # We expect one chemical summary per practice per month
    assert_raw_count_equal(8, 'vw__chemical_summary_by_practice',
                           "chemical_id = '1001030C0'")

    # We expect one summary per practice per month
    assert_raw_count_equal(8, 'vw__practice_summary')

    # We expect one presentation summary per month
    assert_raw_count_equal(2, 'vw__presentation_summary',
                           "presentation_code = '1001030C0AAAAAA'")

    # We expect one presentation summary per CCG per month
    assert_raw_count_equal(4, 'vw__presentation_summary_by_ccg',
                           "presentation_code = '1001030C0AAAAAA'")
 def upload_to_storage(self, local_path, storage_path):
     client = StorageClient()
     bucket = client.bucket()
     blob = bucket.blob(storage_path)
     with open(local_path) as f:
         blob.upload_from_file(f)
def run_end_to_end():
    print("BQ_NONCE: {}".format(settings.BQ_NONCE))

    call_command("migrate")

    # No MeasureGlobals or MeasureValues are generated for the ghost branded
    # generics measure, because both numerator and denominator are computed
    # from a view (vw__ghost_generic_measure) which has no data.  Rather than
    # populate this view, it is simpler to pretend it doesn't exist.
    num_measures = (len(
        glob.glob(os.path.join(settings.MEASURE_DEFINITIONS_PATH, "*.json"))) -
                    1)

    shutil.rmtree(settings.PIPELINE_DATA_BASEDIR, ignore_errors=True)

    with open(settings.PIPELINE_IMPORT_LOG_PATH, "w") as f:
        f.write("{}")

    for blob in StorageClient().bucket().list_blobs():
        blob.delete()

    for dataset_key in DATASETS:
        BQClient(dataset_key).create_dataset()

    client = BQClient("hscic")
    client.create_table("bnf", schemas.BNF_SCHEMA)
    client.create_table("ccgs", schemas.CCG_SCHEMA)
    client.create_table("stps", schemas.STP_SCHEMA)
    client.create_table("regional_teams", schemas.REGIONAL_TEAM_SCHEMA)
    client.create_table("ppu_savings", schemas.PPU_SAVING_SCHEMA)
    client.create_table("practice_statistics",
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table("practice_statistics_all_years",
                        schemas.PRACTICE_STATISTICS_SCHEMA)
    client.create_table("practices", schemas.PRACTICE_SCHEMA)
    client.create_table("prescribing", schemas.PRESCRIBING_SCHEMA)
    client.create_table("presentation", schemas.PRESENTATION_SCHEMA)
    client.create_table("tariff", schemas.TARIFF_SCHEMA)
    client.create_table("bdz_adq", schemas.BDZ_ADQ_SCHEMA)

    client = BQClient("measures")
    # This is enough of a schema to allow the practice_data_all_low_priority
    # table to be created, since it references these fields.  Once populated by
    # import_measures, the tables in the measures dataset will have several
    # more fields.  But we don't need to specify exactly what they are, as BQ
    # will work it out when the data is inserted with insert_rows_from_query.
    measures_schema = build_schema(
        ("month", "DATE"),
        ("practice_id", "STRING"),
        ("numerator", "INTEGER"),
        ("denominator", "INTEGER"),
    )

    for path in glob.glob(
            os.path.join(settings.MEASURE_DEFINITIONS_PATH, "*.json")):
        measure_id = os.path.splitext(os.path.basename(path))[0]
        client.create_table("practice_data_" + measure_id, measures_schema)
        client.create_table("ccg_data_" + measure_id, measures_schema)
        client.create_table("global_data_" + measure_id, measures_schema)

    # Although there are no model instances, we call upload_model to create the
    # dm+d tables in BQ that are required by certain measure views.
    client = BQClient("dmd")
    for model in apps.get_app_config("dmd2").get_models():
        client.upload_model(model)

    call_command("generate_presentation_replacements")

    copy_tree(os.path.join(e2e_path, "data-1"), os.path.join(e2e_path, "data"))

    runner.run_all(2017, 9, under_test=True)

    # We expect one MeasureGlobal per measure per month.
    assert_count_equal(num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    # (There are 4 practices, 2 CCGs, 2 STPs, and 2 regional teams).
    assert_count_equal(10 * num_measures, MeasureValue)

    copy_tree(os.path.join(e2e_path, "data-2"), os.path.join(e2e_path, "data"))

    runner.run_all(2017, 10, under_test=True)

    # We expect one MeasureGlobal per measure per month
    assert_count_equal(2 * num_measures, MeasureGlobal)

    # We expect one MeasureValue for each organisation per measure per month
    assert_count_equal(20 * num_measures, MeasureValue)