Code example #1
    def handle(self, *args, **kwargs):
        last_imported = kwargs["last_imported"]
        prescribing_date = "-".join(last_imported.split("_")) + "-01"
        date_condition = ('month > TIMESTAMP(DATE_SUB(DATE "%s", '
                          "INTERVAL 5 YEAR))" % prescribing_date)

        path = os.path.join(settings.PIPELINE_METADATA_DIR, "smoketests")
        for sql_file in glob.glob(os.path.join(path, "*.sql")):
            test_name = os.path.splitext(os.path.basename(sql_file))[0]
            with open(sql_file) as f:
                query = f.read().replace("{{ date_condition }}",
                                         date_condition)
            print(query)
            client = Client()
            results = client.query(query)

            quantity = []
            cost = []
            items = []

            for row in results_to_dicts(results):
                quantity.append(row["quantity"])
                cost.append(row["actual_cost"])
                items.append(row["items"])

            print("Updating test expectations for %s" % test_name)
            json_path = os.path.join(path, "%s.json" % test_name)
            with open(json_path, "w") as f:
                obj = {"cost": cost, "items": items, "quantity": quantity}
                json.dump(obj, f, indent=2)
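
Both this command and the older Python 2 variant below rely on a results_to_dicts helper that is not shown. A minimal sketch, assuming the query result exposes field_names and rows attributes as the result object in code example #3 does:

    def results_to_dicts(results):
        # Hypothetical helper: pair each row with the result's column
        # names. Assumes `results` exposes field_names and rows, as the
        # result object in code example #3 does.
        for row in results.rows:
            yield dict(zip(results.field_names, row))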
Code example #2
    def handle(self, *args, **kwargs):
        last_imported = kwargs['last_imported']
        prescribing_date = "-".join(last_imported.split('_')) + '-01'
        date_condition = ('month > TIMESTAMP(DATE_SUB(DATE "%s", '
                          'INTERVAL 5 YEAR))' % prescribing_date)

        path = os.path.join(settings.PIPELINE_METADATA_DIR, 'smoketests')
        for sql_file in glob.glob(os.path.join(path, '*.sql')):
            test_name = os.path.splitext(os.path.basename(sql_file))[0]
            with open(sql_file, 'rb') as f:
                query = f.read().replace(
                    '{{ date_condition }}', date_condition)
            print(query)
            client = Client()
            results = client.query(query)

            quantity = []
            cost = []
            items = []

            for row in results_to_dicts(results):
                quantity.append(row['quantity'])
                cost.append(row['actual_cost'])
                items.append(row['items'])

            print("Updating test expectations for %s" % test_name)
            json_path = os.path.join(path, '%s.json' % test_name)
            with open(json_path, 'wb') as f:
                obj = {'cost': cost,
                       'items': items,
                       'quantity': quantity}
                json.dump(obj, f, indent=2)
Code example #3
def ensure_stats_downloaded_for_date(date):
    """
    Download practice statistics for date, or do nothing if already downloaded
    """
    filename = get_practice_stats_filename(date)
    if os.path.exists(filename):
        return
    client = Client("hscic")
    check_stats_in_bigquery(date, client)
    logger.info("Downloading practice statistics for %s", date)
    temp_name = get_temp_filename(filename)
    result = client.query(
        """
        SELECT *
        FROM {hscic}.practice_statistics_all_years
        WHERE month = TIMESTAMP("%s")
        """
        % (date,)
    )

    with gzip.open(temp_name, "wt") as f:
        writer = csv.writer(f)
        writer.writerow(result.field_names)
        for row in result.rows:
            writer.writerow(row)
    os.rename(temp_name, filename)
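
get_temp_filename is not shown; a plausible sketch, assuming it only needs to produce a sibling path on the same filesystem so the final os.rename stays atomic:

    def get_temp_filename(filename):
        # Hypothetical implementation: a ".tmp" sibling keeps the final
        # os.rename atomic, since both paths share a filesystem.
        return filename + ".tmp"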
Code example #4
    def backup_table(self, table_name):
        client = Client("hscic")
        sql = "SELECT max(month) FROM {hscic}.%s" % table_name
        latest_date = client.query(sql).rows[0][0]
        latest_year_and_month = latest_date.strftime("%Y_%m")
        table = client.get_table(table_name)

        storage_client = StorageClient()
        bucket = storage_client.bucket()
        year_and_months = set()

        prefix_base = "backups/{}/".format(table_name)

        for blob in bucket.list_blobs(prefix=prefix_base):
            match = re.search(r"/(\d{4}_\d{2})/", blob.name)
            year_and_months.add(match.groups()[0])

        if latest_year_and_month in year_and_months:
            print("{} table already backed up for {}".format(
                table_name, latest_year_and_month))
            return

        storage_prefix = "{}/{}/{}-".format(prefix_base, latest_year_and_month,
                                            table_name)
        exporter = TableExporter(table, storage_prefix)
        exporter.export_to_storage()
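
The {hscic} placeholder in the SQL above is evidently expanded by the gcutils Client rather than by this command. A minimal sketch of that substitution step, assuming the client simply formats fully qualified dataset names into the template (the function name and mapping here are guesses, not the real gcutils API):

    def qualify_sql(sql, datasets):
        # `datasets` maps placeholder keys (e.g. "hscic") to fully
        # qualified dataset names (e.g. "my-project.hscic"). Assumed
        # behaviour; the real client may validate or escape further.
        return sql.format(**datasets)

    # Usage sketch:
    # qualify_sql("SELECT max(month) FROM {hscic}.prescribing",
    #             {"hscic": "my-project.hscic"})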
Code example #5
    def handle(self, *args, **options):
        client = Client("tmp_eu")
        year_and_month = ImportLog.objects.latest_in_category(
            "prescribing").current_at.strftime("%Y_%m")
        raw_data_table_name = "raw_prescribing_data_{}".format(year_and_month)
        sql = SQL.format(detailed_raw_data_table="{tmp_eu}.%s" %
                         raw_data_table_name)
        with transaction.atomic():
            for row in client.query(sql).rows:
                bnf_code_regex, adq_per_quantity = row
                matches = Presentation.objects.filter(
                    bnf_code__regex=bnf_code_regex)
                matches.update(adq_per_quantity=adq_per_quantity)
Code example #6
    def handle(self, *args, **options):
        path = options["filename"]
        head, filename = os.path.split(path)
        converted_path = "{}_formatted.CSV".format(os.path.splitext(path)[0])
        _, year_and_month = os.path.split(head)

        logger.info("path: %s", path)
        logger.info("converted_path: %s", converted_path)
        logger.info("year_and_month: %s", year_and_month)

        date = year_and_month + "_01"
        try:
            datetime.datetime.strptime(date, "%Y_%m_%d")
        except ValueError:
            message = ("The file path must have a YYYY_MM "
                       "date component in the containing directory: ")
            message += path
            raise CommandError(message)

        hscic_dataset_client = Client("hscic")
        tmp_dataset_client = Client("tmp_eu")

        # Check that we haven't already processed data for this month
        sql = """SELECT COUNT(*)
        FROM {dataset}.prescribing
        WHERE month = TIMESTAMP('{date}')""".format(
            dataset=hscic_dataset_client.dataset_id,
            date=date.replace("_", "-"))
        try:
            results = hscic_dataset_client.query(sql)
            assert results.rows[0][0] == 0
        except NotFound:
            pass

        # Create BQ table backed by uploaded source CSV file
        raw_data_table_name = "raw_prescribing_data_{}".format(year_and_month)
        gcs_path = "hscic/prescribing/{}/{}".format(year_and_month, filename)

        logger.info("raw_data_table_name: %s", raw_data_table_name)
        logger.info("gcs_path: %s", gcs_path)

        raw_data_table = tmp_dataset_client.create_storage_backed_table(
            raw_data_table_name, RAW_PRESCRIBING_SCHEMA, gcs_path)

        # Append aggregated data to prescribing table
        sql = """
         SELECT
          Area_Team_Code AS sha,
          LEFT(PCO_Code, 3) AS pct,
          Practice_Code AS practice,
          BNF_Code AS bnf_code,
          BNF_Description AS bnf_name,
          SUM(Items) AS items,
          SUM(NIC) AS net_cost,
          SUM(Actual_Cost) AS actual_cost,
          SUM(Quantity * Items) AS quantity,
          TIMESTAMP('%s') AS month,
         FROM %s
         WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
         GROUP BY
           bnf_code, bnf_name, pct,
           practice, sha
        """ % (
            date.replace("_", "-"),
            raw_data_table.qualified_name,
        )

        logger.info("sql: %s", sql)

        prescribing_table = hscic_dataset_client.get_table("prescribing")
        prescribing_table.insert_rows_from_query(
            sql, legacy=True, write_disposition="WRITE_APPEND")

        # Write aggregated data to new table, for download
        sql = """
         SELECT
          LEFT(PCO_Code, 3) AS pct_id,
          Practice_Code AS practice_code,
          BNF_Code AS presentation_code,
          SUM(Items) AS total_items,
          SUM(NIC) AS net_cost,
          SUM(Actual_Cost) AS actual_cost,
          SUM(Quantity * Items) AS quantity,
          '%s' AS processing_date,
         FROM %s
         WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
         GROUP BY
           presentation_code, pct_id, practice_code
        """ % (
            date,
            raw_data_table.qualified_name,
        )

        fmtd_data_table_name = "formatted_prescribing_%s" % year_and_month

        logger.info("sql: %s", sql)
        logger.info("fmtd_data_table_name: %s", fmtd_data_table_name)

        fmtd_data_table = tmp_dataset_client.get_table(fmtd_data_table_name)
        fmtd_data_table.insert_rows_from_query(sql, legacy=True)

        # Export new table to storage, and download
        exporter = TableExporter(fmtd_data_table, gcs_path + "_formatted-")
        exporter.export_to_storage(print_header=False)

        with tempfile.NamedTemporaryFile(dir=head) as f:
            exporter.download_from_storage_and_unzip(f)

            # Sort the output.
            #
            # Why? Because this is equivalent to CLUSTERing the table on
            # loading, but less resource-intensive than doing it in
            # Postgres. And the table is too big to sort within BigQuery.
            subprocess.call(
                "ionice -c 2 nice -n 10 sort -k3,3 -k1,1 -k2,2 -t, %s > %s" %
                (f.name, converted_path),
                shell=True,
            )
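
For reference, sort -k3,3 -k1,1 -k2,2 -t, orders the comma-separated output by the third column (presentation code), then the first (PCT), then the second (practice). An in-memory Python equivalent, workable only when the file fits in RAM, which the comment above suggests it may not:

    import csv

    def sort_prescribing_csv(in_path, out_path):
        # Hypothetical helper mirroring the shell pipeline above.
        with open(in_path, newline="") as infile:
            rows = sorted(csv.reader(infile),
                          key=lambda row: (row[2], row[0], row[1]))
        with open(out_path, "w", newline="") as outfile:
            csv.writer(outfile).writerows(rows)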
Code example #7
    def handle(self, *args, **kwargs):
        update_bnf_table()

        client = BQClient("hscic")

        table = client.get_table("practices")
        table.insert_rows_from_pg(models.Practice, schemas.PRACTICE_SCHEMA)

        table = client.get_table("presentation")
        table.insert_rows_from_pg(
            models.Presentation,
            schemas.PRESENTATION_SCHEMA,
            transformer=schemas.presentation_transform,
        )

        table = client.get_table("practice_statistics")
        columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
        columns[0] = "date"
        columns[-1] = "practice_id"
        table.insert_rows_from_pg(
            models.PracticeStatistics,
            schema=schemas.PRACTICE_STATISTICS_SCHEMA,
            columns=columns,
            transformer=schemas.statistics_transform,
        )

        sql = "SELECT MAX(month) FROM {hscic}.practice_statistics_all_years"
        results = client.query(sql)
        if results.rows[0][0] is None:
            last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
        else:
            last_uploaded_practice_statistics_date = results.rows[0][0].date()

        table = client.get_table("practice_statistics_all_years")
        sql = """SELECT *
        FROM {hscic}.practice_statistics
        WHERE month > TIMESTAMP('{date}')"""
        substitutions = {"date": last_uploaded_practice_statistics_date}
        table.insert_rows_from_query(sql,
                                     write_disposition="WRITE_APPEND",
                                     substitutions=substitutions)

        table = client.get_table("pcns")
        table.insert_rows_from_pg(models.PCN, schemas.PCN_SCHEMA)

        table = client.get_table("ccgs")
        table.insert_rows_from_pg(models.PCT,
                                  schemas.CCG_SCHEMA,
                                  transformer=schemas.ccgs_transform)

        table = client.get_table("stps")
        table.insert_rows_from_pg(models.STP, schemas.STP_SCHEMA)

        table = client.get_table("regional_teams")
        table.insert_rows_from_pg(models.RegionalTeam,
                                  schemas.REGIONAL_TEAM_SCHEMA)

        date = models.ImportLog.objects.latest_in_category(
            "prescribing").current_at
        table = client.get_table("prescribing_" + date.strftime("%Y_%m"))
        sql = """SELECT * FROM {hscic}.prescribing_v2
        WHERE month = TIMESTAMP('{date}')"""
        substitutions = {"date": date}
        table.insert_rows_from_query(sql, substitutions=substitutions)
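
latest_in_category is a custom manager method used throughout these commands; a sketch of what it presumably does, inferred from the call sites (hypothetical, not necessarily the actual implementation):

    from django.db import models

    class ImportLogManager(models.Manager):
        def latest_in_category(self, category):
            # Assumed behaviour: the newest ImportLog row for the given
            # category, ordered by its current_at timestamp.
            return self.filter(category=category).latest("current_at")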
Code example #8
    def handle(self, *args, **kwargs):
        # Make sure that PracticeStatistics and Prescription tables both have
        # latest data.
        latest_practice_statistic_date = models.PracticeStatistics.objects.aggregate(
            Max("date"))["date__max"]
        latest_prescription_date = models.Prescription.objects.aggregate(
            Max("processing_date"))["processing_date__max"]

        if latest_practice_statistic_date != latest_prescription_date:
            msg = ("Latest PracticeStatistics object has date {}, "
                   "while latest Prescription object has processing_date {}".
                   format(latest_practice_statistic_date,
                          latest_prescription_date))
            raise CommandError(msg)

        date = latest_prescription_date

        update_bnf_table()

        client = BQClient("hscic")

        table = client.get_table("practices")
        table.insert_rows_from_pg(models.Practice, schemas.PRACTICE_SCHEMA)

        table = client.get_table("presentation")
        table.insert_rows_from_pg(
            models.Presentation,
            schemas.PRESENTATION_SCHEMA,
            transformer=schemas.presentation_transform,
        )

        table = client.get_table("practice_statistics")
        columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
        columns[0] = "date"
        columns[-1] = "practice_id"
        table.insert_rows_from_pg(
            models.PracticeStatistics,
            schema=schemas.PRACTICE_STATISTICS_SCHEMA,
            columns=columns,
            transformer=schemas.statistics_transform,
        )

        sql = "SELECT MAX(month) FROM {hscic}.practice_statistics_all_years"
        results = client.query(sql)
        if results.rows[0][0] is None:
            last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
        else:
            last_uploaded_practice_statistics_date = results.rows[0][0].date()

        table = client.get_table("practice_statistics_all_years")
        sql = """SELECT *
        FROM {hscic}.practice_statistics
        WHERE month > TIMESTAMP('{date}')"""
        substitutions = {"date": last_uploaded_practice_statistics_date}
        table.insert_rows_from_query(sql,
                                     write_disposition="WRITE_APPEND",
                                     substitutions=substitutions)

        table = client.get_table("ccgs")
        table.insert_rows_from_pg(models.PCT,
                                  schemas.CCG_SCHEMA,
                                  transformer=schemas.ccgs_transform)

        table = client.get_table("stps")
        table.insert_rows_from_pg(models.STP, schemas.STP_SCHEMA)

        table = client.get_table("regional_teams")
        table.insert_rows_from_pg(models.RegionalTeam,
                                  schemas.REGIONAL_TEAM_SCHEMA)

        table = client.get_table("prescribing_" + date.strftime("%Y_%m"))
        sql = """SELECT * FROM {hscic}.prescribing
        WHERE month = TIMESTAMP('{date}')"""
        substitutions = {"date": date}
        table.insert_rows_from_query(sql, substitutions=substitutions)
Code example #9
    def handle(self, *args, **options):
        path = options["filename"]
        head, filename = os.path.split(path)
        _, year_and_month = os.path.split(head)

        logger.info("path: %s", path)
        logger.info("year_and_month: %s", year_and_month)

        date = year_and_month + "_01"
        try:
            datetime.datetime.strptime(date, "%Y_%m_%d")
        except ValueError:
            message = ("The file path must have a YYYY_MM "
                       "date component in the containing directory: ")
            message += path
            raise CommandError(message)

        hscic_dataset_client = Client("hscic")
        tmp_dataset_client = Client("tmp_eu")

        # Check that we haven't already processed data for this month
        sql = """SELECT COUNT(*)
        FROM {hscic}.prescribing_v2
        WHERE month = TIMESTAMP('{date}')"""

        try:
            results = hscic_dataset_client.query(
                sql, substitutions={"date": date.replace("_", "-")})
            assert results.rows[0][0] == 0
        except NotFound:
            pass

        # Create BQ table backed by uploaded source CSV file
        raw_data_table_name = "raw_prescribing_data_{}".format(year_and_month)
        gcs_path = "hscic/prescribing_v2/{}/{}".format(year_and_month,
                                                       filename)

        logger.info("raw_data_table_name: %s", raw_data_table_name)
        logger.info("gcs_path: %s", gcs_path)

        raw_data_table = tmp_dataset_client.create_storage_backed_table(
            raw_data_table_name, RAW_PRESCRIBING_SCHEMA_V2, gcs_path)

        # Append aggregated data to prescribing table
        sql = """
         SELECT
          AREA_TEAM_CODE AS sha,
          LEFT(PCO_CODE, 3) AS pct,
          PRACTICE_CODE AS practice,
          BNF_CODE AS bnf_code,
          BNF_DESCRIPTION AS bnf_name,
          SUM(ITEMS) AS items,
          SUM(NIC) AS net_cost,
          SUM(ACTUAL_COST) AS actual_cost,
          SUM(TOTAL_QUANTITY) AS quantity,
          TIMESTAMP('%s') AS month,
         FROM %s
         WHERE PRACTICE_CODE NOT LIKE '%%998'  -- see issue #349
         GROUP BY
           bnf_code, bnf_name, pct,
           practice, sha
        """ % (
            date.replace("_", "-"),
            raw_data_table.qualified_name,
        )

        logger.info("sql: %s", sql)

        prescribing_table = hscic_dataset_client.get_table("prescribing_v2")
        prescribing_table.insert_rows_from_query(
            sql, legacy=True, write_disposition="WRITE_APPEND")

        ImportLog.objects.create(current_at=date.replace("_", "-"),
                                 filename=filename,
                                 category="prescribing")
Code example #10
    def handle(self, *args, **kwargs):
        # Make sure that PracticeStatistics and Prescription tables both have
        # latest data.
        latest_practice_statistic_date = models.PracticeStatistics.objects\
            .aggregate(Max('date'))['date__max']
        latest_prescription_date = models.Prescription.objects\
            .aggregate(Max('processing_date'))['processing_date__max']

        if latest_practice_statistic_date != latest_prescription_date:
            msg = 'Latest PracticeStatistics object has date {}, '\
                'while latest Prescription object has processing_date {}'\
                .format(latest_practice_statistic_date, latest_prescription_date)
            raise CommandError(msg)

        date = latest_prescription_date

        update_bnf_table()

        client = BQClient('hscic')

        table = client.get_table('practices')
        columns = [field.name for field in schemas.PRACTICE_SCHEMA]
        table.insert_rows_from_pg(models.Practice, columns)

        table = client.get_table('presentation')
        columns = [field.name for field in schemas.PRESENTATION_SCHEMA]
        table.insert_rows_from_pg(models.Presentation, columns,
                                  schemas.presentation_transform)

        table = client.get_table('practice_statistics')
        columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
        columns[0] = 'date'
        columns[-1] = 'practice_id'
        table.insert_rows_from_pg(models.PracticeStatistics, columns,
                                  schemas.statistics_transform)

        sql = 'SELECT MAX(month) FROM {hscic}.practice_statistics_all_years'
        results = client.query(sql)
        if results.rows[0][0] is None:
            last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
        else:
            last_uploaded_practice_statistics_date = results.rows[0][0].date()

        table = client.get_table('practice_statistics_all_years')
        sql = '''SELECT
                    month,
                    pct_id,
                    practice,
                    male_0_4,
                    female_0_4,
                    male_5_14,
                    female_5_14,
                    male_15_24,
                    female_15_24,
                    male_25_34,
                    female_25_34,
                    male_35_44,
                    female_35_44,
                    male_45_54,
                    female_45_54,
                    male_55_64,
                    female_55_64,
                    male_65_74,
                    female_65_74,
                    male_75_plus,
                    female_75_plus,
                    total_list_size
        FROM {hscic}.practice_statistics
        WHERE month > TIMESTAMP('{date}')'''
        substitutions = {'date': last_uploaded_practice_statistics_date}
        table.insert_rows_from_query(sql,
                                     write_disposition='WRITE_APPEND',
                                     substitutions=substitutions)

        table = client.get_table('ccgs')
        columns = [field.name for field in schemas.CCG_SCHEMA]
        table.insert_rows_from_pg(models.PCT, columns, schemas.ccgs_transform)

        table = client.get_table('ppu_savings')
        columns = [field.name for field in PPU_SAVING_SCHEMA]
        table.insert_rows_from_pg(PPUSaving, columns, ppu_savings_transform)

        table = client.get_table('prescribing_' + date.strftime('%Y_%m'))
        sql = '''SELECT * FROM {hscic}.prescribing
        WHERE month = TIMESTAMP('{date}')'''
        substitutions = {'date': date}
        table.insert_rows_from_query(sql, substitutions=substitutions)
Code example #11
    def handle(self, *args, **options):
        path = options['filename']
        head, filename = os.path.split(path)
        converted_path = '{}_formatted.CSV'.format(os.path.splitext(path)[0])
        _, year_and_month = os.path.split(head)

        logger.info('path: %s', path)
        logger.info('converted_path: %s', converted_path)
        logger.info('year_and_month: %s', year_and_month)

        date = year_and_month + '_01'
        try:
            datetime.datetime.strptime(date, '%Y_%m_%d')
        except ValueError:
            message = ('The file path must have a YYYY_MM '
                       'date component in the containing directory: ')
            message += path
            raise CommandError(message)

        hscic_dataset_client = Client('hscic')
        tmp_dataset_client = Client('tmp_eu')

        # Check that we haven't already processed data for this month
        sql = '''SELECT COUNT(*)
        FROM {dataset}.prescribing
        WHERE month = TIMESTAMP('{date}')'''.format(
            dataset=hscic_dataset_client.dataset_id,
            date=date.replace('_', '-'),
        )
        try:
            results = hscic_dataset_client.query(sql)
            assert results.rows[0][0] == 0
        except NotFound:
            pass

        # Create BQ table backed by uploaded source CSV file
        raw_data_table_name = 'raw_prescribing_data_{}'.format(year_and_month)
        gcs_path = 'hscic/prescribing/{}/{}'.format(year_and_month, filename)

        logger.info('raw_data_table_name: %s', raw_data_table_name)
        logger.info('gcs_path: %s', gcs_path)

        schema = [
            {'name': 'Regional_Office_Name', 'type': 'string'},
            {'name': 'Regional_Office_Code', 'type': 'string'},
            {'name': 'Area_Team_Name', 'type': 'string'},
            {'name': 'Area_Team_Code', 'type': 'string', 'mode': 'required'},
            {'name': 'PCO_Name', 'type': 'string'},
            {'name': 'PCO_Code', 'type': 'string'},
            {'name': 'Practice_Name', 'type': 'string'},
            {'name': 'Practice_Code', 'type': 'string', 'mode': 'required'},
            {'name': 'BNF_Code', 'type': 'string', 'mode': 'required'},
            {'name': 'BNF_Description', 'type': 'string', 'mode': 'required'},
            {'name': 'Items', 'type': 'integer', 'mode': 'required'},
            {'name': 'Quantity', 'type': 'integer', 'mode': 'required'},
            {'name': 'ADQ_Usage', 'type': 'float'},
            {'name': 'NIC', 'type': 'float', 'mode': 'required'},
            {'name': 'Actual_Cost', 'type': 'float', 'mode': 'required'},
        ]
        raw_data_table = tmp_dataset_client.create_storage_backed_table(
            raw_data_table_name,
            schema,
            gcs_path
        )

        # Append aggregated data to prescribing table
        sql = '''
         SELECT
          Area_Team_Code AS sha,
          LEFT(PCO_Code, 3) AS pct,
          Practice_Code AS practice,
          BNF_Code AS bnf_code,
          BNF_Description AS bnf_name,
          SUM(Items) AS items,
          SUM(NIC) AS net_cost,
          SUM(Actual_Cost) AS actual_cost,
          SUM(Quantity * Items) AS quantity,
          TIMESTAMP('%s') AS month,
         FROM %s
         WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
         GROUP BY
           bnf_code, bnf_name, pct,
           practice, sha
        ''' % (date.replace('_', '-'), raw_data_table.qualified_name)

        logger.info('sql: %s', sql)

        prescribing_table = hscic_dataset_client.get_table('prescribing')
        prescribing_table.insert_rows_from_query(
            sql,
            legacy=True,
            write_disposition='WRITE_APPEND'
        )

        # Write aggregated data to new table, for download
        sql = '''
         SELECT
          LEFT(PCO_Code, 3) AS pct_id,
          Practice_Code AS practice_code,
          BNF_Code AS presentation_code,
          SUM(Items) AS total_items,
          SUM(NIC) AS net_cost,
          SUM(Actual_Cost) AS actual_cost,
          SUM(Quantity * Items) AS quantity,
          '%s' AS processing_date,
         FROM %s
         WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
         GROUP BY
           presentation_code, pct_id, practice_code
        ''' % (date, raw_data_table.qualified_name)

        fmtd_data_table_name = 'formatted_prescribing_%s' % year_and_month

        logger.info('sql: %s', sql)
        logger.info('fmtd_data_table_name: %s', fmtd_data_table_name)

        fmtd_data_table = tmp_dataset_client.get_table(fmtd_data_table_name)
        fmtd_data_table.insert_rows_from_query(sql, legacy=True)

        # Export new table to storage, and download
        exporter = TableExporter(fmtd_data_table, gcs_path + '_formatted-')
        exporter.export_to_storage(print_header=False)

        with tempfile.NamedTemporaryFile(dir=head) as f:
            exporter.download_from_storage_and_unzip(f)

            # Sort the output.
            #
            # Why? Because this is equivalent to CLUSTERing the table on
            # loading, but less resource-intensive than doing it in
            # Postgres. And the table is too big to sort within BigQuery.
            subprocess.call(
                "ionice -c 2 nice -n 10 sort -k3,3 -k1,1 -k2,2 -t, %s > %s" % (
                    f.name, converted_path),
                shell=True)
Code example #12
    def test_the_lot(self):
        client = Client('test')

        schema = build_schema(
            ('a', 'INTEGER'),
            ('b', 'STRING'),
        )

        headers = ['a', 'b']
        rows = [
            (1, 'apple'),
            (2, 'banana'),
            (3, 'coconut'),
        ]

        t1 = client.get_or_create_table('t1', schema)
        t1_qname = t1.qualified_name

        # Test Table.insert_rows_from_csv
        t1.insert_rows_from_csv('gcutils/tests/test_table.csv')

        self.assertEqual(sorted(t1.get_rows()), rows)

        # Test Table.insert_rows_from_query
        t2 = client.get_table('t2')

        sql = 'SELECT * FROM {} WHERE a > 1'.format(t1_qname)
        t2.insert_rows_from_query(sql)

        self.assertEqual(sorted(t2.get_rows()), rows[1:])

        # Test Client.query
        sql = 'SELECT * FROM {} WHERE a > 2'.format(t1_qname)
        results = client.query(sql)

        self.assertEqual(sorted(results.rows), rows[2:])

        # Test Client.query_into_dataframe
        sql = 'SELECT * FROM {} WHERE a > 2'.format(t1_qname)
        df = client.query_into_dataframe(sql)

        self.assertEqual(df.values.tolist(), [list(rows[2])])

        # Test TableExporter.export_to_storage and
        # TableExporter.download_from_storage_and_unzip
        t1_exporter = TableExporter(t1, self.storage_prefix + 'test_table-')
        t1_exporter.export_to_storage()

        with tempfile.NamedTemporaryFile(mode='r+') as f:
            t1_exporter.download_from_storage_and_unzip(f)
            f.seek(0)
            reader = csv.reader(f)
            data = [reader.next()] + sorted(reader)

        self.assertEqual(data, [map(str, row) for row in [headers] + rows])

        # Test Table.insert_rows_from_storage
        storage_path = self.storage_prefix + 'test_table.csv'
        self.upload_to_storage('gcutils/tests/test_table.csv', storage_path)

        t2.insert_rows_from_storage(storage_path)

        self.assertEqual(sorted(t2.get_rows()), rows)

        # Test Client.create_storage_backed_table
        storage_path = self.storage_prefix + 'test_table_headers.csv'
        self.upload_to_storage(
            'gcutils/tests/test_table_headers.csv',
            storage_path
        )

        schema = [
            {'name': 'a', 'type': 'integer'},
            {'name': 'b', 'type': 'string'},
        ]

        t3 = client.create_storage_backed_table(
            't3',
            schema,
            storage_path
        )

        results = client.query('SELECT * FROM {}'.format(t3.qualified_name))

        self.assertEqual(sorted(results.rows), rows)

        self.upload_to_storage(
            'gcutils/tests/test_table_headers_2.csv',
            storage_path
        )

        results = client.query('SELECT * FROM {}'.format(t3.qualified_name))

        self.assertEqual(sorted(results.rows), rows + [(4, u'damson')])

        # Test Client.create_table_with_view
        sql = 'SELECT * FROM {{project}}.{} WHERE a > 1'.format(t1_qname)

        t4 = client.create_table_with_view('t4', sql, False)

        results = client.query('SELECT * FROM {}'.format(t4.qualified_name))

        self.assertEqual(sorted(results.rows), rows[1:])

        # Test Client.insert_rows_from_pg
        PCT.objects.create(code='ABC', name='CCG 1')
        PCT.objects.create(code='XYZ', name='CCG 2')

        def transformer(row):
            return [ord(row[0][0]), row[1]]
        t1.insert_rows_from_pg(PCT, ['code', 'name'], transformer)

        self.assertEqual(sorted(t1.get_rows()), [(65, 'CCG 1'), (88, 'CCG 2')])

        # Test Table.delete_all_rows
        t1.delete_all_rows()

        self.assertEqual(list(t1.get_rows()), [])
Code example #13
    def test_the_lot(self):
        client = Client("test")
        archive_client = Client("archive")

        orig_schema = build_schema(("a", "STRING"), ("b", "INTEGER"))

        schema = build_schema(("a", "INTEGER"), ("b", "STRING"))

        headers = ["a", "b"]
        rows = [(1, "apple"), (2, "banana"), (3, "coconut")]

        t1 = client.get_or_create_table("t1", orig_schema)
        t1_qname = t1.qualified_name

        # Test Table.insert_rows_from_csv
        t1.insert_rows_from_csv("gcutils/tests/test_table.csv", schema)

        self.assertEqual(sorted(t1.get_rows()), rows)

        # Test Table.insert_rows_from_query
        t2 = client.get_table("t2")

        sql = "SELECT * FROM {} WHERE a > 1".format(t1_qname)
        t2.insert_rows_from_query(sql)

        self.assertEqual(sorted(t2.get_rows()), rows[1:])

        # Test Client.query
        sql = "SELECT * FROM {} WHERE a > 2".format(t1_qname)
        results = client.query(sql)

        self.assertEqual(sorted(results.rows), rows[2:])

        # Test Client.query_into_dataframe
        sql = "SELECT * FROM {} WHERE a > 2".format(t1_qname)
        df = client.query_into_dataframe(sql)

        self.assertEqual(df.values.tolist(), [list(rows[2])])

        # Test TableExporter.export_to_storage and
        # TableExporter.download_from_storage_and_unzip
        t1_exporter = TableExporter(t1, self.storage_prefix + "test_table-")
        t1_exporter.export_to_storage()

        with tempfile.NamedTemporaryFile(mode="r+") as f:
            t1_exporter.download_from_storage_and_unzip(f)
            f.seek(0)
            reader = csv.reader(f)
            data = [next(reader)] + sorted(reader)

        self.assertEqual(data,
                         [list(map(str, row)) for row in [headers] + rows])

        # Test Table.insert_rows_from_storage
        storage_path = self.storage_prefix + "test_table.csv"
        self.upload_to_storage("gcutils/tests/test_table.csv", storage_path)

        t2.insert_rows_from_storage(storage_path)

        self.assertEqual(sorted(t2.get_rows()), rows)

        # Test Client.create_storage_backed_table
        storage_path = self.storage_prefix + "test_table_headers.csv"
        self.upload_to_storage("gcutils/tests/test_table_headers.csv",
                               storage_path)

        schema = build_schema(("a", "INTEGER"), ("b", "STRING"))

        t3 = client.create_storage_backed_table("t3", schema, storage_path)

        results = client.query("SELECT * FROM {}".format(t3.qualified_name))

        self.assertEqual(sorted(results.rows), rows)

        self.upload_to_storage("gcutils/tests/test_table_headers_2.csv",
                               storage_path)

        results = client.query("SELECT * FROM {}".format(t3.qualified_name))

        self.assertEqual(sorted(results.rows), rows + [(4, "damson")])

        # Test Client.create_table_with_view
        sql = "SELECT * FROM {{project}}.{} WHERE a > 1".format(t1_qname)

        t4 = client.create_table_with_view("t4", sql, False)

        results = client.query("SELECT * FROM {}".format(t4.qualified_name))

        self.assertEqual(sorted(results.rows), rows[1:])

        # Test Table.copy_to_new_dataset
        t1.copy_to_new_dataset("archive")
        t1_archived = archive_client.get_table("t1")
        self.assertEqual(sorted(t1_archived.get_rows()), rows)
        self.assertEqual(sorted(t1.get_rows()), rows)

        # Test Table.move_to_new_dataset
        t2.move_to_new_dataset("archive")
        t2_archived = archive_client.get_table("t2")
        self.assertEqual(sorted(t2_archived.get_rows()), rows)
        with self.assertRaises(NotFound):
            list(t2.get_rows())

        # Test Client.insert_rows_from_pg
        PCT.objects.create(code="ABC", name="CCG 1")
        PCT.objects.create(code="XYZ", name="CCG 2")

        def transformer(row):
            return [ord(row[0][0]), row[1]]

        t1.insert_rows_from_pg(
            PCT,
            build_schema(("code", "INTEGER"), ("name", "STRING")),
            transformer=transformer,
        )

        self.assertEqual(sorted(t1.get_rows()), [(65, "CCG 1"), (88, "CCG 2")])

        # Test Table.delete_all_rows
        t1.delete_all_rows()

        self.assertEqual(list(t1.get_rows()), [])
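
build_schema is not shown in either test; the call sites suggest it wraps (name, type) pairs into BigQuery schema fields. A minimal sketch under that assumption:

    from google.cloud import bigquery

    def build_schema(*fields):
        # Assumed helper: each argument is a (name, type) pair, e.g.
        # ("a", "INTEGER"), wrapped in a SchemaField.
        return [bigquery.SchemaField(name, field_type)
                for name, field_type in fields]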
Code example #14
    def handle(self, *args, **kwargs):
        s = requests.Session()

        # Get URLs keyed by the date (year-month) they're for
        urls = set(self.iter_dataset_urls(s))
        urls_by_month = dict(self.iter_months(urls))

        # set up the BigQuery client, dataset, and table
        client = Client(dataset_key="scmd")
        self.ensure_dataset_exists(client)
        table = client.get_or_create_table("scmd", schema=SCHEMA)

        # look for existing months in BigQuery
        sql = "SELECT DISTINCT year_month FROM {};".format(
            table.qualified_name)
        known_dates = [r[0] for r in client.query(sql).rows]

        # convert the datetime.dates the query gives us to strings since
        # that's what we're dealing with elsewhere.
        known_months = {d.strftime("%Y-%m") for d in known_dates}
        print(known_months)

        missing_months = set(urls_by_month.keys()) - known_months
        print("Missing months: {}".format(", ".join(sorted(missing_months))))
        pending_urls = {m: urls_by_month[m] for m in missing_months}

        # abort if there's nothing to get
        if not pending_urls:
            print("no pending urls, aborting")
            return

        # grab missing months
        for month, url in sorted(pending_urls.items()):
            print("{} | Getting: {}".format(month, url))
            r = s.get(url)
            r.raise_for_status()
            print("{} | Downloaded: {}".format(month, url))

            # read CSV into memory
            decoded_content = r.content.decode("utf-8")
            reader = csv.reader(decoded_content.splitlines(), delimiter=",")

            # remove headers
            next(reader)

            # use a tempfile so we can write the CSV to disk with necessary
            # changes (adding days to the year-month dates at time of writing),
            # before uploading from that file to BigQuery.
            with tempfile.NamedTemporaryFile(mode="w+") as f:
                writer = csv.writer(f, delimiter=",")
                for line in reader:
                    # Convert year-month dates to year-month-day
                    line[0] = line[0] + "-01"
                    writer.writerow(line)
                print("{} | Wrote: {}".format(month, f.name))

                # rewind the file so we can read it into BigQuery
                f.seek(0)

                # insert into BigQuery
                table.insert_rows_from_csv(f.name,
                                           SCHEMA,
                                           write_disposition="WRITE_APPEND")
                print("{} | Ingested into BigQuery".format(month))
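
iter_dataset_urls and iter_months are not shown. A hypothetical sketch of iter_months, assuming each dataset URL embeds its month as a six-digit YYYYMM token (the real URL format may differ):

    import re

    def iter_months(self, urls):
        # Hypothetical: yield ("YYYY-MM", url) pairs by extracting a
        # year-month token from each URL.
        for url in urls:
            match = re.search(r"(\d{4})(\d{2})", url)
            if match:
                yield "{}-{}".format(match.group(1), match.group(2)), url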