import argparse
import json
import logging
import os
import tempfile


def restore_view(tbldef, fromdir, todataset):
    """
    Recreate a view in BigQuery from its saved definition
    :param tbldef: Table definition dict
    :param fromdir: GCS input directory gs://..../dataset/tablename/ (unused for views)
    :param todataset: BigQuery dataset name
    """
    query = tbldef['view']['query']
    view_name = tbldef['tableReference']['tableId']
    legacy_sql = tbldef['view']['useLegacySql']
    exec_shell_command([
        'bq', 'mk', '--view', query,
        # bq boolean flags take a 'no' prefix to negate: --nouse_legacy_sql
        '--{}use_legacy_sql'.format('no' if not legacy_sql else ''),
        '{}.{}'.format(todataset, view_name)
    ])
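# For example, for a standard-SQL view the command built above resolves to
# (dataset and view names hypothetical):
#   bq mk --view 'SELECT ...' --nouse_legacy_sql mydataset.myview
# and the flag becomes '--use_legacy_sql' when the view uses legacy SQL.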
def backup_table(dataset, tablename, todir, schemaonly):
    """
    Store schema & data in table to GCS
    :param dataset: BigQuery dataset name
    :param tablename: BigQuery table name
    :param todir: GCS output prefix
    :param schemaonly: don't export data, just the schema
    :return: None
    """
    full_table_name = '{}.{}'.format(dataset, tablename)
    # write schema to GCS
    schema = exec_shell_command(['bq', 'show', '--schema', full_table_name])
    write_json_string(schema,
                      os.path.join(todir, dataset, tablename, 'schema.json'))
    if not schemaonly:
        # back up the table definition
        tbldef = exec_shell_command(
            ['bq', '--format=json', 'show', full_table_name])
        write_json_string(
            tbldef, os.path.join(todir, dataset, tablename, 'tbldef.json'))
        tbldef = json.loads(tbldef)  # dict
        if tbldef['type'] == 'VIEW':
            return  # the view definition above is all we need
        # extract the data
        output_data_name = os.path.join(todir, dataset, tablename,
                                        'data_*.avro')
        _ = exec_shell_command([
            'bq', 'extract',
            '--destination_format=AVRO',
            '--use_avro_logical_types',  # for DATE, TIME, NUMERIC
            '{}.{}'.format(dataset, tablename), output_data_name
        ])
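# A minimal sketch of how backup_table might be invoked (dataset, table, and
# bucket names are hypothetical):
#
#   backup_table('mydataset', 'mytable', 'gs://my-bucket/bq_backup',
#                schemaonly=False)
#
# which writes schema.json, tbldef.json, and data_*.avro files under
# gs://my-bucket/bq_backup/mydataset/mytable/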
def restore_table(fromdir, todataset):
    """
    Restore schema & data from GCS to BigQuery table
    :param fromdir: GCS input directory gs://..../dataset/tablename/
    :param todataset: BigQuery dataset name
    """
    # start to create load command
    load_command = [
        'bq', 'load',
        '--source_format', 'AVRO',
        '--use_avro_logical_types',  # for DATE, TIME, NUMERIC
    ]
    # get table definition
    tbldef = read_json_string(os.path.join(fromdir, 'tbldef.json'))
    if tbldef['type'] == 'VIEW':
        restore_view(tbldef, fromdir, todataset)
        return
    if 'timePartitioning' in tbldef:
        load_command += [
            '--time_partitioning_expiration',
            # the bq flag is in seconds; the table definition reports milliseconds
            str(int(tbldef['timePartitioning']['expirationMs']) // 1000),
            '--time_partitioning_field',
            tbldef['timePartitioning']['field'],
            '--time_partitioning_type',
            tbldef['timePartitioning']['type'],
        ]
    if 'rangePartitioning' in tbldef:
        load_command += [
            '--range_partitioning',
            '{},{},{},{}'.format(
                tbldef['rangePartitioning']['field'],
                tbldef['rangePartitioning']['range']['start'],
                tbldef['rangePartitioning']['range']['end'],
                tbldef['rangePartitioning']['range']['interval'])
        ]
    if 'clustering' in tbldef:
        load_command += [
            '--clustering_fields',
            ','.join(tbldef['clustering']['fields'])
        ]
    # write schema to a temporary file
    schema = tbldef['schema']['fields']  # array of dicts
    fd, schema_file = tempfile.mkstemp()
    os.close(fd)  # we reopen the file by name below
    with open(schema_file, 'w') as ofp:
        json.dump(schema, ofp, sort_keys=False, indent=2)
    load_command += [
        '--schema', schema_file,
    ]
    # load the data into BigQuery
    table_name = tbldef['tableReference']['tableId']
    load_command += [
        '{}.{}'.format(todataset, table_name),
        os.path.join(fromdir, 'data_*.avro')
    ]
    exec_shell_command(load_command)
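# The helpers used above (exec_shell_command, write_json_string,
# read_json_string) are not shown in this excerpt; the sketch below is one
# plausible implementation inferred from the call sites, assuming gsutil is
# available on the path for the GCS reads and writes.
import subprocess


def exec_shell_command(command):
    """Run a command given as an argument list; return its stdout as a str."""
    logging.info(' '.join(command))
    return subprocess.check_output(command).decode('utf-8')


def write_json_string(json_str, gcs_path):
    """Write a JSON string to a gs://... path by piping it through gsutil."""
    subprocess.run(['gsutil', 'cp', '-', gcs_path],
                   input=json_str.encode('utf-8'), check=True)


def read_json_string(gcs_path):
    """Read and parse a JSON file stored at a gs://... path."""
    return json.loads(subprocess.check_output(['gsutil', 'cat', gcs_path]))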
# command-line driver; the --input and --output flags are inferred from how
# args is used below
parser = argparse.ArgumentParser(
    description='Back up BigQuery tables and views to GCS')
parser.add_argument('--input', required=True,
                    help='BigQuery dataset or dataset.table to back up')
parser.add_argument('--output', required=True,
                    help='GCS output prefix gs://...')
parser.add_argument('--schema', action='store_true',
                    help='Write out only the schema, no data')
parser.add_argument('--quiet', action='store_true',
                    help='Turn off verbose logging')
args = parser.parse_args()

if not args.quiet:
    logging.basicConfig(level=logging.INFO)

if '.' in args.input:
    dataset, table = args.input.split('.')
    tables = [table]
else:
    dataset = args.input
    dataset_contents = exec_shell_command(
        ['bq', '--format=json', 'ls', '--max_results', '10000', dataset])
    dataset_contents = json.loads(dataset_contents)  # array of dicts
    tables = []
    for entry in dataset_contents:
        if entry['type'] == 'TABLE' or entry['type'] == 'VIEW':
            tables.append(entry['tableReference']['tableId'])
        else:
            logging.warning('Not backing up {} because it is a {}'.format(
                entry['id'], entry['type']))
print(tables)
for table in tables:
    backup_table(dataset, table, args.output, args.schema)
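# Example run (the script and bucket names are hypothetical):
#
#   python bq_backup.py --input mydataset --output gs://my-bucket/bq_backup
#
# A matching restore driver would walk the backup prefix and call, e.g.:
#
#   restore_table('gs://my-bucket/bq_backup/mydataset/mytable',
#                 'mydataset_restored')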