def handle(self, *args, **kwargs): last_imported = kwargs["last_imported"] prescribing_date = "-".join(last_imported.split("_")) + "-01" date_condition = ('month > TIMESTAMP(DATE_SUB(DATE "%s", ' "INTERVAL 5 YEAR))" % prescribing_date) path = os.path.join(settings.PIPELINE_METADATA_DIR, "smoketests") for sql_file in glob.glob(os.path.join(path, "*.sql")): test_name = os.path.splitext(os.path.basename(sql_file))[0] with open(sql_file) as f: query = f.read().replace("{{ date_condition }}", date_condition) print(query) client = Client() results = client.query(query) quantity = [] cost = [] items = [] for row in results_to_dicts(results): quantity.append(row["quantity"]) cost.append(row["actual_cost"]) items.append(row["items"]) print("Updating test expectations for %s" % test_name) json_path = os.path.join(path, "%s.json" % test_name) with open(json_path, "w") as f: obj = {"cost": cost, "items": items, "quantity": quantity} json.dump(obj, f, indent=2)
def handle(self, *args, **kwargs):
    last_imported = kwargs['last_imported']
    prescribing_date = "-".join(last_imported.split('_')) + '-01'
    date_condition = (
        'month > TIMESTAMP(DATE_SUB(DATE "%s", INTERVAL 5 YEAR))'
        % prescribing_date
    )

    path = os.path.join(settings.PIPELINE_METADATA_DIR, 'smoketests')
    for sql_file in glob.glob(os.path.join(path, '*.sql')):
        test_name = os.path.splitext(os.path.basename(sql_file))[0]
        with open(sql_file, 'rb') as f:
            query = f.read().replace('{{ date_condition }}', date_condition)
        print(query)
        client = Client()
        results = client.query(query)
        quantity = []
        cost = []
        items = []
        for row in results_to_dicts(results):
            quantity.append(row['quantity'])
            cost.append(row['actual_cost'])
            items.append(row['items'])
        print("Updating test expectations for %s" % test_name)
        json_path = os.path.join(path, '%s.json' % test_name)
        with open(json_path, 'wb') as f:
            obj = {'cost': cost, 'items': items, 'quantity': quantity}
            json.dump(obj, f, indent=2)
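# A plausible sketch of the `results_to_dicts` helper that both versions above
# rely on: it pairs each result row with the query's field names. The attribute
# names follow the `result.field_names` / `result.rows` usage seen elsewhere in
# this section; that the real helper works exactly this way is an assumption.
def results_to_dicts(results):
    for row in results.rows:
        yield dict(zip(results.field_names, row))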
def ensure_stats_downloaded_for_date(date):
    """
    Download practice statistics for date, or do nothing if already downloaded
    """
    filename = get_practice_stats_filename(date)
    if os.path.exists(filename):
        return

    client = Client("hscic")
    check_stats_in_bigquery(date, client)

    logger.info("Downloading practice statistics for %s", date)
    temp_name = get_temp_filename(filename)
    result = client.query(
        """
        SELECT * FROM {hscic}.practice_statistics_all_years
        WHERE month = TIMESTAMP("%s")
        """
        % (date,)
    )
    with gzip.open(temp_name, "wt") as f:
        writer = csv.writer(f)
        writer.writerow(result.field_names)
        for row in result.rows:
            writer.writerow(row)
    os.rename(temp_name, filename)
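# A minimal sketch of the assumed `get_temp_filename` helper: it derives a
# sibling path, so the final os.rename above stays on the same filesystem and
# the swap is atomic. The ".tmp" suffix is an illustrative assumption, not
# confirmed by the source.
def get_temp_filename(filename):
    return filename + ".tmp"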
def backup_table(self, table_name):
    client = Client("hscic")
    sql = "SELECT max(month) FROM {hscic}.%s" % table_name
    latest_date = client.query(sql).rows[0][0]
    latest_year_and_month = latest_date.strftime("%Y_%m")
    table = client.get_table(table_name)

    storage_client = StorageClient()
    bucket = storage_client.bucket()
    year_and_months = set()
    prefix_base = "backups/{}/".format(table_name)
    for blob in bucket.list_blobs(prefix=prefix_base):
        # Raw string avoids the invalid-escape-sequence warning on \d
        match = re.search(r"/(\d{4}_\d{2})/", blob.name)
        year_and_months.add(match.groups()[0])

    if latest_year_and_month in year_and_months:
        print(
            "{} table already backed up for {}".format(
                table_name, latest_year_and_month
            )
        )
        return

    storage_prefix = "{}/{}/{}-".format(
        prefix_base, latest_year_and_month, table_name
    )
    exporter = TableExporter(table, storage_prefix)
    exporter.export_to_storage()
def handle(self, *args, **options): client = Client("tmp_eu") year_and_month = ImportLog.objects.latest_in_category( "prescribing").current_at.strftime("%Y_%m") raw_data_table_name = "raw_prescribing_data_{}".format(year_and_month) sql = SQL.format(detailed_raw_data_table="{tmp_eu}.%s" % raw_data_table_name) with transaction.atomic(): for row in client.query(sql).rows: bnf_code_regex, adq_per_quantity = row matches = Presentation.objects.filter( bnf_code__regex=bnf_code_regex) matches.update(adq_per_quantity=adq_per_quantity)
def handle(self, *args, **options): path = options["filename"] head, filename = os.path.split(path) converted_path = "{}_formatted.CSV".format(os.path.splitext(path)[0]) _, year_and_month = os.path.split(head) logger.info("path: %s", path) logger.info("converted_path: %s", converted_path) logger.info("year_and_month: %s", year_and_month) date = year_and_month + "_01" try: datetime.datetime.strptime(date, "%Y_%m_%d") except ValueError: message = ("The file path must have a YYYY_MM " "date component in the containing directory: ") message += path raise CommandError(message) hscic_dataset_client = Client("hscic") tmp_dataset_client = Client("tmp_eu") # Check that we haven't already processed data for this month sql = """SELECT COUNT(*) FROM {dataset}.prescribing WHERE month = TIMESTAMP('{date}')""".format( dataset=hscic_dataset_client.dataset_id, date=date.replace("_", "-")) try: results = hscic_dataset_client.query(sql) assert results.rows[0][0] == 0 except NotFound: pass # Create BQ table backed backed by uploaded source CSV file raw_data_table_name = "raw_prescribing_data_{}".format(year_and_month) gcs_path = "hscic/prescribing/{}/{}".format(year_and_month, filename) logger.info("raw_data_table_name: %s", raw_data_table_name) logger.info("gcs_path: %s", gcs_path) raw_data_table = tmp_dataset_client.create_storage_backed_table( raw_data_table_name, RAW_PRESCRIBING_SCHEMA, gcs_path) # Append aggregated data to prescribing table sql = """ SELECT Area_Team_Code AS sha, LEFT(PCO_Code, 3) AS pct, Practice_Code AS practice, BNF_Code AS bnf_code, BNF_Description AS bnf_name, SUM(Items) AS items, SUM(NIC) AS net_cost, SUM(Actual_Cost) AS actual_cost, SUM(Quantity * Items) AS quantity, TIMESTAMP('%s') AS month, FROM %s WHERE Practice_Code NOT LIKE '%%998' -- see issue #349 GROUP BY bnf_code, bnf_name, pct, practice, sha """ % ( date.replace("_", "-"), raw_data_table.qualified_name, ) logger.info("sql: %s", sql) prescribing_table = hscic_dataset_client.get_table("prescribing") prescribing_table.insert_rows_from_query( sql, legacy=True, write_disposition="WRITE_APPEND") # Write aggregated data to new table, for download sql = """ SELECT LEFT(PCO_Code, 3) AS pct_id, Practice_Code AS practice_code, BNF_Code AS presentation_code, SUM(Items) AS total_items, SUM(NIC) AS net_cost, SUM(Actual_Cost) AS actual_cost, SUM(Quantity * Items) AS quantity, '%s' AS processing_date, FROM %s WHERE Practice_Code NOT LIKE '%%998' -- see issue #349 GROUP BY presentation_code, pct_id, practice_code """ % ( date, raw_data_table.qualified_name, ) fmtd_data_table_name = "formatted_prescribing_%s" % year_and_month logger.info("sql: %s", sql) logger.info("fmtd_data_table_name: %s", fmtd_data_table_name) fmtd_data_table = tmp_dataset_client.get_table(fmtd_data_table_name) fmtd_data_table.insert_rows_from_query(sql, legacy=True) # Export new table to storage, and download exporter = TableExporter(fmtd_data_table, gcs_path + "_formatted-") exporter.export_to_storage(print_header=False) with tempfile.NamedTemporaryFile(dir=head) as f: exporter.download_from_storage_and_unzip(f) # Sort the output. # # Why? Because this is equivalent to CLUSTERing the table on # loading, but less resource-intensive than doing it in # Postgres. And the table is too big to sort within BigQuery. subprocess.call( "ionice -c 2 nice -n 10 sort -k3,3 -k1,1 -k2,2 -t, %s > %s" % (f.name, converted_path), shell=True, )
def handle(self, *args, **kwargs):
    update_bnf_table()

    client = BQClient("hscic")

    table = client.get_table("practices")
    table.insert_rows_from_pg(models.Practice, schemas.PRACTICE_SCHEMA)

    table = client.get_table("presentation")
    table.insert_rows_from_pg(
        models.Presentation,
        schemas.PRESENTATION_SCHEMA,
        transformer=schemas.presentation_transform,
    )

    table = client.get_table("practice_statistics")
    columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
    columns[0] = "date"
    columns[-1] = "practice_id"
    table.insert_rows_from_pg(
        models.PracticeStatistics,
        schema=schemas.PRACTICE_STATISTICS_SCHEMA,
        columns=columns,
        transformer=schemas.statistics_transform,
    )

    sql = "SELECT MAX(month) FROM {hscic}.practice_statistics_all_years"
    results = client.query(sql)
    if results.rows[0][0] is None:
        last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
    else:
        last_uploaded_practice_statistics_date = results.rows[0][0].date()

    table = client.get_table("practice_statistics_all_years")
    sql = """SELECT * FROM {hscic}.practice_statistics
        WHERE month > TIMESTAMP('{date}')"""
    substitutions = {"date": last_uploaded_practice_statistics_date}
    table.insert_rows_from_query(
        sql, write_disposition="WRITE_APPEND", substitutions=substitutions
    )

    table = client.get_table("pcns")
    table.insert_rows_from_pg(models.PCN, schemas.PCN_SCHEMA)

    table = client.get_table("ccgs")
    table.insert_rows_from_pg(
        models.PCT, schemas.CCG_SCHEMA, transformer=schemas.ccgs_transform
    )

    table = client.get_table("stps")
    table.insert_rows_from_pg(models.STP, schemas.STP_SCHEMA)

    table = client.get_table("regional_teams")
    table.insert_rows_from_pg(models.RegionalTeam, schemas.REGIONAL_TEAM_SCHEMA)

    date = models.ImportLog.objects.latest_in_category("prescribing").current_at
    table = client.get_table("prescribing_" + date.strftime("%Y_%m"))
    sql = """SELECT * FROM {hscic}.prescribing_v2
        WHERE month = TIMESTAMP('{date}')"""
    substitutions = {"date": date}
    table.insert_rows_from_query(sql, substitutions=substitutions)
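# Illustrative only: a minimal sketch of how the Client wrapper presumably
# expands `{hscic}`-style dataset placeholders and the `substitutions` dict
# before running a query. `DATASET_NAMES` and `interpolate_sql` are
# hypothetical names for this sketch, not part of the real gcutils API.
DATASET_NAMES = {"hscic": "my_project.hscic", "tmp_eu": "my_project.tmp_eu"}

def interpolate_sql(sql, substitutions=None):
    values = dict(DATASET_NAMES)
    values.update(substitutions or {})
    return sql.format(**values)

# e.g. interpolate_sql(
#     "SELECT * FROM {hscic}.practice_statistics WHERE month > TIMESTAMP('{date}')",
#     {"date": "2019-01-01"},
# )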
def handle(self, *args, **kwargs):
    # Make sure that PracticeStatistics and Prescription tables both have
    # latest data.
    latest_practice_statistic_date = models.PracticeStatistics.objects.aggregate(
        Max("date")
    )["date__max"]
    latest_prescription_date = models.Prescription.objects.aggregate(
        Max("processing_date")
    )["processing_date__max"]

    if latest_practice_statistic_date != latest_prescription_date:
        msg = (
            "Latest PracticeStatistics object has date {}, "
            "while latest Prescription object has processing_date {}".format(
                latest_practice_statistic_date, latest_prescription_date
            )
        )
        raise CommandError(msg)

    date = latest_prescription_date

    update_bnf_table()

    client = BQClient("hscic")

    table = client.get_table("practices")
    table.insert_rows_from_pg(models.Practice, schemas.PRACTICE_SCHEMA)

    table = client.get_table("presentation")
    table.insert_rows_from_pg(
        models.Presentation,
        schemas.PRESENTATION_SCHEMA,
        transformer=schemas.presentation_transform,
    )

    table = client.get_table("practice_statistics")
    columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
    columns[0] = "date"
    columns[-1] = "practice_id"
    table.insert_rows_from_pg(
        models.PracticeStatistics,
        schema=schemas.PRACTICE_STATISTICS_SCHEMA,
        columns=columns,
        transformer=schemas.statistics_transform,
    )

    sql = "SELECT MAX(month) FROM {hscic}.practice_statistics_all_years"
    results = client.query(sql)
    if results.rows[0][0] is None:
        last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
    else:
        last_uploaded_practice_statistics_date = results.rows[0][0].date()

    table = client.get_table("practice_statistics_all_years")
    sql = """SELECT * FROM {hscic}.practice_statistics
        WHERE month > TIMESTAMP('{date}')"""
    substitutions = {"date": last_uploaded_practice_statistics_date}
    table.insert_rows_from_query(
        sql, write_disposition="WRITE_APPEND", substitutions=substitutions
    )

    table = client.get_table("ccgs")
    table.insert_rows_from_pg(
        models.PCT, schemas.CCG_SCHEMA, transformer=schemas.ccgs_transform
    )

    table = client.get_table("stps")
    table.insert_rows_from_pg(models.STP, schemas.STP_SCHEMA)

    table = client.get_table("regional_teams")
    table.insert_rows_from_pg(models.RegionalTeam, schemas.REGIONAL_TEAM_SCHEMA)

    table = client.get_table("prescribing_" + date.strftime("%Y_%m"))
    sql = """SELECT * FROM {hscic}.prescribing
        WHERE month = TIMESTAMP('{date}')"""
    substitutions = {"date": date}
    table.insert_rows_from_query(sql, substitutions=substitutions)
def handle(self, *args, **options): path = options["filename"] head, filename = os.path.split(path) _, year_and_month = os.path.split(head) logger.info("path: %s", path) logger.info("year_and_month: %s", year_and_month) date = year_and_month + "_01" try: datetime.datetime.strptime(date, "%Y_%m_%d") except ValueError: message = ("The file path must have a YYYY_MM " "date component in the containing directory: ") message += path raise CommandError(message) hscic_dataset_client = Client("hscic") tmp_dataset_client = Client("tmp_eu") # Check that we haven't already processed data for this month sql = """SELECT COUNT(*) FROM {hscic}.prescribing_v2 WHERE month = TIMESTAMP('{date}')""" try: results = hscic_dataset_client.query( sql, substitutions={"date": date.replace("_", "-")}) assert results.rows[0][0] == 0 except NotFound: pass # Create BQ table backed backed by uploaded source CSV file raw_data_table_name = "raw_prescribing_data_{}".format(year_and_month) gcs_path = "hscic/prescribing_v2/{}/{}".format(year_and_month, filename) logger.info("raw_data_table_name: %s", raw_data_table_name) logger.info("gcs_path: %s", gcs_path) raw_data_table = tmp_dataset_client.create_storage_backed_table( raw_data_table_name, RAW_PRESCRIBING_SCHEMA_V2, gcs_path) # Append aggregated data to prescribing table sql = """ SELECT AREA_TEAM_CODE AS sha, LEFT(PCO_CODE, 3) AS pct, PRACTICE_CODE AS practice, BNF_CODE AS bnf_code, BNF_DESCRIPTION AS bnf_name, SUM(ITEMS) AS items, SUM(NIC) AS net_cost, SUM(ACTUAL_COST) AS actual_cost, SUM(TOTAL_QUANTITY) AS quantity, TIMESTAMP('%s') AS month, FROM %s WHERE PRACTICE_CODE NOT LIKE '%%998' -- see issue #349 GROUP BY bnf_code, bnf_name, pct, practice, sha """ % ( date.replace("_", "-"), raw_data_table.qualified_name, ) logger.info("sql: %s", sql) prescribing_table = hscic_dataset_client.get_table("prescribing_v2") prescribing_table.insert_rows_from_query( sql, legacy=True, write_disposition="WRITE_APPEND") ImportLog.objects.create(current_at=date.replace("_", "-"), filename=filename, category="prescribing")
def handle(self, *args, **kwargs):
    # Make sure that PracticeStatistics and Prescription tables both have
    # latest data.
    latest_practice_statistic_date = models.PracticeStatistics.objects\
        .aggregate(Max('date'))['date__max']
    latest_prescription_date = models.Prescription.objects\
        .aggregate(Max('processing_date'))['processing_date__max']

    if latest_practice_statistic_date != latest_prescription_date:
        msg = 'Latest PracticeStatistics object has date {}, '\
            'while latest Prescription object has processing_date {}'\
            .format(latest_practice_statistic_date, latest_prescription_date)
        raise CommandError(msg)

    date = latest_prescription_date

    update_bnf_table()

    client = BQClient('hscic')

    table = client.get_table('practices')
    columns = [field.name for field in schemas.PRACTICE_SCHEMA]
    table.insert_rows_from_pg(models.Practice, columns)

    table = client.get_table('presentation')
    columns = [field.name for field in schemas.PRESENTATION_SCHEMA]
    table.insert_rows_from_pg(models.Presentation, columns,
                              schemas.presentation_transform)

    table = client.get_table('practice_statistics')
    columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
    columns[0] = 'date'
    columns[-1] = 'practice_id'
    table.insert_rows_from_pg(models.PracticeStatistics, columns,
                              schemas.statistics_transform)

    sql = 'SELECT MAX(month) FROM {hscic}.practice_statistics_all_years'
    results = client.query(sql)
    if results.rows[0][0] is None:
        last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
    else:
        last_uploaded_practice_statistics_date = results.rows[0][0].date()

    table = client.get_table('practice_statistics_all_years')
    sql = '''SELECT
        month, pct_id, practice, male_0_4, female_0_4, male_5_14,
        female_5_14, male_15_24, female_15_24, male_25_34, female_25_34,
        male_35_44, female_35_44, male_45_54, female_45_54, male_55_64,
        female_55_64, male_65_74, female_65_74, male_75_plus,
        female_75_plus, total_list_size
    FROM {hscic}.practice_statistics
    WHERE month > TIMESTAMP('{date}')'''
    substitutions = {'date': last_uploaded_practice_statistics_date}
    table.insert_rows_from_query(sql, write_disposition='WRITE_APPEND',
                                 substitutions=substitutions)

    table = client.get_table('ccgs')
    columns = [field.name for field in schemas.CCG_SCHEMA]
    table.insert_rows_from_pg(models.PCT, columns, schemas.ccgs_transform)

    table = client.get_table('ppu_savings')
    columns = [field.name for field in PPU_SAVING_SCHEMA]
    table.insert_rows_from_pg(PPUSaving, columns, ppu_savings_transform)

    table = client.get_table('prescribing_' + date.strftime('%Y_%m'))
    sql = '''SELECT * FROM {hscic}.prescribing
        WHERE month = TIMESTAMP('{date}')'''
    substitutions = {'date': date}
    table.insert_rows_from_query(sql, substitutions=substitutions)
def handle(self, *args, **options):
    path = options['filename']
    head, filename = os.path.split(path)
    converted_path = '{}_formatted.CSV'.format(os.path.splitext(path)[0])
    _, year_and_month = os.path.split(head)

    logger.info('path: %s', path)
    logger.info('converted_path: %s', converted_path)
    logger.info('year_and_month: %s', year_and_month)

    date = year_and_month + '_01'
    try:
        datetime.datetime.strptime(date, '%Y_%m_%d')
    except ValueError:
        message = ('The file path must have a YYYY_MM '
                   'date component in the containing directory: ')
        message += path
        raise CommandError(message)

    hscic_dataset_client = Client('hscic')
    tmp_dataset_client = Client('tmp_eu')

    # Check that we haven't already processed data for this month
    sql = '''SELECT COUNT(*)
        FROM {dataset}.prescribing
        WHERE month = TIMESTAMP('{date}')'''.format(
        dataset=hscic_dataset_client.dataset_id,
        date=date.replace('_', '-'),
    )
    try:
        results = hscic_dataset_client.query(sql)
        assert results.rows[0][0] == 0
    except NotFound:
        pass

    # Create BQ table backed by uploaded source CSV file
    raw_data_table_name = 'raw_prescribing_data_{}'.format(year_and_month)
    gcs_path = 'hscic/prescribing/{}/{}'.format(year_and_month, filename)

    logger.info('raw_data_table_name: %s', raw_data_table_name)
    logger.info('gcs_path: %s', gcs_path)

    schema = [
        {'name': 'Regional_Office_Name', 'type': 'string'},
        {'name': 'Regional_Office_Code', 'type': 'string'},
        {'name': 'Area_Team_Name', 'type': 'string'},
        {'name': 'Area_Team_Code', 'type': 'string', 'mode': 'required'},
        {'name': 'PCO_Name', 'type': 'string'},
        {'name': 'PCO_Code', 'type': 'string'},
        {'name': 'Practice_Name', 'type': 'string'},
        {'name': 'Practice_Code', 'type': 'string', 'mode': 'required'},
        {'name': 'BNF_Code', 'type': 'string', 'mode': 'required'},
        {'name': 'BNF_Description', 'type': 'string', 'mode': 'required'},
        {'name': 'Items', 'type': 'integer', 'mode': 'required'},
        {'name': 'Quantity', 'type': 'integer', 'mode': 'required'},
        {'name': 'ADQ_Usage', 'type': 'float'},
        {'name': 'NIC', 'type': 'float', 'mode': 'required'},
        {'name': 'Actual_Cost', 'type': 'float', 'mode': 'required'},
    ]

    raw_data_table = tmp_dataset_client.create_storage_backed_table(
        raw_data_table_name, schema, gcs_path
    )

    # Append aggregated data to prescribing table
    sql = '''
        SELECT
          Area_Team_Code AS sha,
          LEFT(PCO_Code, 3) AS pct,
          Practice_Code AS practice,
          BNF_Code AS bnf_code,
          BNF_Description AS bnf_name,
          SUM(Items) AS items,
          SUM(NIC) AS net_cost,
          SUM(Actual_Cost) AS actual_cost,
          SUM(Quantity * Items) AS quantity,
          TIMESTAMP('%s') AS month,
        FROM %s
        WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
        GROUP BY bnf_code, bnf_name, pct, practice, sha
    ''' % (date.replace('_', '-'), raw_data_table.qualified_name)
    logger.info('sql: %s', sql)

    prescribing_table = hscic_dataset_client.get_table('prescribing')
    prescribing_table.insert_rows_from_query(
        sql, legacy=True, write_disposition='WRITE_APPEND'
    )

    # Write aggregated data to new table, for download
    sql = '''
        SELECT
          LEFT(PCO_Code, 3) AS pct_id,
          Practice_Code AS practice_code,
          BNF_Code AS presentation_code,
          SUM(Items) AS total_items,
          SUM(NIC) AS net_cost,
          SUM(Actual_Cost) AS actual_cost,
          SUM(Quantity * Items) AS quantity,
          '%s' AS processing_date,
        FROM %s
        WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
        GROUP BY presentation_code, pct_id, practice_code
    ''' % (date, raw_data_table.qualified_name)
    fmtd_data_table_name = 'formatted_prescribing_%s' % year_and_month
    logger.info('sql: %s', sql)
    logger.info('fmtd_data_table_name: %s', fmtd_data_table_name)

    fmtd_data_table = tmp_dataset_client.get_table(fmtd_data_table_name)
    fmtd_data_table.insert_rows_from_query(sql, legacy=True)

    # Export new table to storage, and download
    exporter = TableExporter(fmtd_data_table, gcs_path + '_formatted-')
    exporter.export_to_storage(print_header=False)

    with tempfile.NamedTemporaryFile(dir=head) as f:
        exporter.download_from_storage_and_unzip(f)

        # Sort the output.
        #
        # Why? Because this is equivalent to CLUSTERing the table on
        # loading, but less resource-intensive than doing it in
        # Postgres. And the table is too big to sort within BigQuery.
        subprocess.call(
            "ionice -c 2 nice -n 10 sort -k3,3 -k1,1 -k2,2 -t, %s > %s" % (
                f.name, converted_path),
            shell=True)
def test_the_lot(self):
    client = Client('test')

    schema = build_schema(
        ('a', 'INTEGER'),
        ('b', 'STRING'),
    )

    headers = ['a', 'b']
    rows = [
        (1, 'apple'),
        (2, 'banana'),
        (3, 'coconut'),
    ]

    t1 = client.get_or_create_table('t1', schema)
    t1_qname = t1.qualified_name

    # Test Table.insert_rows_from_csv
    t1.insert_rows_from_csv('gcutils/tests/test_table.csv')
    self.assertEqual(sorted(t1.get_rows()), rows)

    # Test Table.insert_rows_from_query
    t2 = client.get_table('t2')
    sql = 'SELECT * FROM {} WHERE a > 1'.format(t1_qname)
    t2.insert_rows_from_query(sql)
    self.assertEqual(sorted(t2.get_rows()), rows[1:])

    # Test Client.query
    sql = 'SELECT * FROM {} WHERE a > 2'.format(t1_qname)
    results = client.query(sql)
    self.assertEqual(sorted(results.rows), rows[2:])

    # Test Client.query_into_dataframe
    sql = 'SELECT * FROM {} WHERE a > 2'.format(t1_qname)
    df = client.query_into_dataframe(sql)
    self.assertEqual(df.values.tolist(), [list(rows[2])])

    # Test TableExporter.export_to_storage and
    # TableExporter.download_from_storage_and_unzip
    t1_exporter = TableExporter(t1, self.storage_prefix + 'test_table-')
    t1_exporter.export_to_storage()

    with tempfile.NamedTemporaryFile(mode='r+') as f:
        t1_exporter.download_from_storage_and_unzip(f)
        f.seek(0)
        reader = csv.reader(f)
        data = [reader.next()] + sorted(reader)

    self.assertEqual(data, [map(str, row) for row in [headers] + rows])

    # Test Table.insert_rows_from_storage
    storage_path = self.storage_prefix + 'test_table.csv'
    self.upload_to_storage('gcutils/tests/test_table.csv', storage_path)
    t2.insert_rows_from_storage(storage_path)
    self.assertEqual(sorted(t2.get_rows()), rows)

    # Test Client.create_storage_backed_table
    storage_path = self.storage_prefix + 'test_table_headers.csv'
    self.upload_to_storage(
        'gcutils/tests/test_table_headers.csv', storage_path
    )
    schema = [
        {'name': 'a', 'type': 'integer'},
        {'name': 'b', 'type': 'string'},
    ]
    t3 = client.create_storage_backed_table('t3', schema, storage_path)
    results = client.query('SELECT * FROM {}'.format(t3.qualified_name))
    self.assertEqual(sorted(results.rows), rows)

    self.upload_to_storage(
        'gcutils/tests/test_table_headers_2.csv', storage_path
    )
    results = client.query('SELECT * FROM {}'.format(t3.qualified_name))
    self.assertEqual(sorted(results.rows), rows + [(4, u'damson')])

    # Test Client.create_table_with_view
    sql = 'SELECT * FROM {{project}}.{} WHERE a > 1'.format(t1_qname)
    t4 = client.create_table_with_view('t4', sql, False)
    results = client.query('SELECT * FROM {}'.format(t4.qualified_name))
    self.assertEqual(sorted(results.rows), rows[1:])

    # Test Client.insert_rows_from_pg
    PCT.objects.create(code='ABC', name='CCG 1')
    PCT.objects.create(code='XYZ', name='CCG 2')

    def transformer(row):
        return [ord(row[0][0]), row[1]]

    t1.insert_rows_from_pg(PCT, ['code', 'name'], transformer)
    self.assertEqual(sorted(t1.get_rows()), [(65, 'CCG 1'), (88, 'CCG 2')])

    # Test Table.delete_all_rows
    t1.delete_all_rows()
    self.assertEqual(list(t1.get_rows()), [])
def test_the_lot(self): client = Client("test") archive_client = Client("archive") orig_schema = build_schema(("a", "STRING"), ("b", "INTEGER")) schema = build_schema(("a", "INTEGER"), ("b", "STRING")) headers = ["a", "b"] rows = [(1, "apple"), (2, "banana"), (3, "coconut")] t1 = client.get_or_create_table("t1", orig_schema) t1_qname = t1.qualified_name # Test Table.insert_rows_from_csv t1.insert_rows_from_csv("gcutils/tests/test_table.csv", schema) self.assertEqual(sorted(t1.get_rows()), rows) # Test Table.insert_rows_from_query t2 = client.get_table("t2") sql = "SELECT * FROM {} WHERE a > 1".format(t1_qname) t2.insert_rows_from_query(sql) self.assertEqual(sorted(t2.get_rows()), rows[1:]) # Test Client.query sql = "SELECT * FROM {} WHERE a > 2".format(t1_qname) results = client.query(sql) self.assertEqual(sorted(results.rows), rows[2:]) # Test Client.query_into_dataframe sql = "SELECT * FROM {} WHERE a > 2".format(t1_qname) df = client.query_into_dataframe(sql) self.assertEqual(df.values.tolist(), [list(rows[2])]) # Test TableExporter.export_to_storage and # TableExporter.download_from_storage_and_unzip t1_exporter = TableExporter(t1, self.storage_prefix + "test_table-") t1_exporter.export_to_storage() with tempfile.NamedTemporaryFile(mode="r+") as f: t1_exporter.download_from_storage_and_unzip(f) f.seek(0) reader = csv.reader(f) data = [next(reader)] + sorted(reader) self.assertEqual(data, [list(map(str, row)) for row in [headers] + rows]) # Test Table.insert_rows_from_storage storage_path = self.storage_prefix + "test_table.csv" self.upload_to_storage("gcutils/tests/test_table.csv", storage_path) t2.insert_rows_from_storage(storage_path) self.assertEqual(sorted(t2.get_rows()), rows) # Test Client.create_storage_backed_table storage_path = self.storage_prefix + "test_table_headers.csv" self.upload_to_storage("gcutils/tests/test_table_headers.csv", storage_path) schema = build_schema(("a", "INTEGER"), ("b", "STRING")) t3 = client.create_storage_backed_table("t3", schema, storage_path) results = client.query("SELECT * FROM {}".format(t3.qualified_name)) self.assertEqual(sorted(results.rows), rows) self.upload_to_storage("gcutils/tests/test_table_headers_2.csv", storage_path) results = client.query("SELECT * FROM {}".format(t3.qualified_name)) self.assertEqual(sorted(results.rows), rows + [(4, "damson")]) # Test Client.create_table_with_view sql = "SELECT * FROM {{project}}.{} WHERE a > 1".format(t1_qname) t4 = client.create_table_with_view("t4", sql, False) results = client.query("SELECT * FROM {}".format(t4.qualified_name)) self.assertEqual(sorted(results.rows), rows[1:]) # Test Table.copy_to_new_dataset t1.copy_to_new_dataset("archive") t1_archived = archive_client.get_table("t1") self.assertEqual(sorted(t1_archived.get_rows()), rows) self.assertEqual(sorted(t1.get_rows()), rows) # Test Table.move_to_new_dataset t2.move_to_new_dataset("archive") t2_archived = archive_client.get_table("t2") self.assertEqual(sorted(t2_archived.get_rows()), rows) with self.assertRaises(NotFound): list(t2.get_rows()) # Test Client.insert_rows_from_pg PCT.objects.create(code="ABC", name="CCG 1") PCT.objects.create(code="XYZ", name="CCG 2") def transformer(row): return [ord(row[0][0]), row[1]] t1.insert_rows_from_pg( PCT, build_schema(("code", "INTEGER"), ("name", "STRING")), transformer=transformer, ) self.assertEqual(sorted(t1.get_rows()), [(65, "CCG 1"), (88, "CCG 2")]) # Test Table.delete_all_rows t1.delete_all_rows() self.assertEqual(list(t1.get_rows()), [])
def handle(self, *args, **kwargs):
    s = requests.Session()

    # Get URLs keyed by the date (year-month) they're for
    urls = set(self.iter_dataset_urls(s))
    urls_by_month = dict(self.iter_months(urls))

    # set up the BigQuery client, dataset, and table
    client = Client(dataset_key="scmd")
    self.ensure_dataset_exists(client)
    table = client.get_or_create_table("scmd", schema=SCHEMA)

    # look for existing months in BigQuery
    sql = "SELECT DISTINCT year_month FROM {};".format(table.qualified_name)
    known_dates = [r[0] for r in client.query(sql).rows]

    # convert the datetime.dates the query gives us to strings since
    # that's what we're dealing with elsewhere.
    known_months = {d.strftime("%Y-%m") for d in known_dates}
    print(known_months)

    missing_months = set(urls_by_month.keys()) - known_months
    print("Missing months: {}".format(", ".join(sorted(missing_months))))

    pending_urls = {m: urls_by_month[m] for m in missing_months}

    # abort if there's nothing to get
    if not pending_urls:
        print("no pending urls, aborting")
        return

    # grab missing months
    for month, url in sorted(pending_urls.items()):
        print("{} | Getting: {}".format(month, url))
        r = s.get(url)
        r.raise_for_status()
        print("{} | Downloaded: {}".format(month, url))

        # read CSV into memory
        decoded_content = r.content.decode("utf-8")
        reader = csv.reader(decoded_content.splitlines(), delimiter=",")

        # remove headers
        next(reader)

        # use a tempfile so we can write the CSV to disk with necessary
        # changes (adding days to the year-month dates at time of writing),
        # before uploading from that file to BigQuery.
        with tempfile.NamedTemporaryFile(mode="w+") as f:
            writer = csv.writer(f, delimiter=",")
            for line in reader:
                # Convert year-month dates to year-month-day
                line[0] = line[0] + "-01"
                writer.writerow(line)
            print("{} | Wrote: {}".format(month, f.name))

            # rewind the file so we can read it into BigQuery
            f.seek(0)

            # insert into BigQuery
            table.insert_rows_from_csv(
                f.name, SCHEMA, write_disposition="WRITE_APPEND"
            )
            print("{} | Ingested into BigQuery".format(month))
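# A hedged sketch of the `iter_months` method assumed above: it pairs each
# dataset URL with the YYYY-MM date embedded in it, so the caller can build
# `urls_by_month`. The regex is an assumption about the SCMD file-naming
# convention, not confirmed by the source.
def iter_months(self, urls):
    for url in urls:
        match = re.search(r"(20\d{2})[-_]?(0[1-9]|1[0-2])", url)
        if match:
            yield "{}-{}".format(match.group(1), match.group(2)), url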