def validate(self, parsed_args, client=None):
  if not client:
    credentials = GoogleCredentials.get_application_default().create_scoped(
        ['https://www.googleapis.com/auth/bigquery'])
    client = bigquery.BigqueryV2(credentials=credentials)

  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      parsed_args.input_table)
  if not bigquery_util.table_exist(client, project_id, dataset_id, table_id):
    raise ValueError('Table {}:{}.{} does not exist.'.format(
        project_id, dataset_id, table_id))

  if parsed_args.custom_sample_info_table:
    sample_project_id, sample_dataset_id, sample_table_id = (
        bigquery_util.parse_table_reference(
            parsed_args.custom_sample_info_table))
  else:
    # Derive the sample_info table name from the sharded input table name.
    if table_id.count(TABLE_SUFFIX_SEPARATOR) != 1:
      raise ValueError(
          'Input table {} is malformed - exactly one suffix separator "{}" '
          'is required.'.format(parsed_args.input_table,
                                TABLE_SUFFIX_SEPARATOR))
    base_table_id = table_id[:table_id.find(TABLE_SUFFIX_SEPARATOR)]
    sample_project_id = project_id
    sample_dataset_id = dataset_id
    sample_table_id = bigquery_util.compose_table_name(
        base_table_id, SAMPLE_INFO_TABLE_SUFFIX)

  if not bigquery_util.table_exist(client, sample_project_id,
                                   sample_dataset_id, sample_table_id):
    raise ValueError('Sample table {}:{}.{} does not exist.'.format(
        sample_project_id, sample_dataset_id, sample_table_id))
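# Hedged sketch (not from the source): how the sample table name is derived
# above, assuming TABLE_SUFFIX_SEPARATOR == '__' and
# SAMPLE_INFO_TABLE_SUFFIX == 'sample_info', consistent with the
# `BASE_TABLE_ID__chr1` / `sample_info` naming used elsewhere in this code.
#
#   --input_table my-project:genomics.variants__chr1
#   table_id       -> 'variants__chr1'
#   base_table_id  -> 'variants'
#   sample table   -> my-project:genomics.variants__sample_info
#
# Passing --custom_sample_info_table bypasses this derivation entirely.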
def _get_variant_query(known_args, schema):
  # type: (argparse.Namespace, bigquery_v2.TableSchema) -> str
  """Returns a BigQuery query for the interested regions."""
  columns = _get_query_columns(schema)
  base_query = _BASE_QUERY_TEMPLATE.format(
      COLUMNS=', '.join(columns),
      INPUT_TABLE='.'.join(bigquery_util.parse_table_reference(
          known_args.input_table)))
  conditions = []
  if known_args.genomic_regions:
    for region in known_args.genomic_regions:
      ref, start, end = genomic_region_parser.parse_genomic_region(region)
      conditions.append(_GENOMIC_REGION_TEMPLATE.format(
          REFERENCE_NAME_ID=bigquery_util.ColumnKeyConstants.REFERENCE_NAME,
          REFERENCE_NAME_VALUE=ref,
          START_POSITION_ID=bigquery_util.ColumnKeyConstants.START_POSITION,
          START_POSITION_VALUE=start,
          END_POSITION_ID=bigquery_util.ColumnKeyConstants.END_POSITION,
          END_POSITION_VALUE=end))

  if not conditions:
    return base_query
  return ' '.join([base_query, 'WHERE', ' OR '.join(conditions)])
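# Illustrative example (assumed shape; the actual _BASE_QUERY_TEMPLATE and
# _GENOMIC_REGION_TEMPLATE are defined elsewhere in the module). For
# `--genomic_regions chr1:1000-2000` the returned query would resemble:
#
#   SELECT reference_name, start_position, end_position, ...
#   FROM `my-project.genomics.variants__chr1`
#   WHERE (reference_name = 'chr1' AND
#          start_position >= 1000 AND end_position <= 2000)
#
# Multiple regions are OR-ed together; without --genomic_regions the
# unfiltered base query is returned.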
def __init__(self, base_table_id, suffixes, append):
  # type: (str, List[str], bool) -> None
  """Initializes a `FlattenCallColumn` object.

  In preparation for converting variant lookup optimized tables to sample
  lookup optimized tables, we initialize this class with the base table name
  of the variant opt table (set using the --output_table flag) and the list
  of suffixes (extracted from the sharding config file).

  Args:
    base_table_id: Base name of variant opt outputs (set by --output_table).
    suffixes: List of suffixes (extracted from sharding config file).
    append: Whether or not we are appending to the destination tables.
  """
  (self._project_id,
   self._dataset_id,
   self._base_table) = bigquery_util.parse_table_reference(base_table_id)
  assert suffixes
  self._suffixes = suffixes[:]

  self._column_names = []
  self._sub_fields = []

  # Append to existing destination tables; otherwise only write to empty ones.
  job_config = bigquery.job.QueryJobConfig(
      write_disposition='WRITE_APPEND' if append else 'WRITE_EMPTY')
  self._client = bigquery.Client(project=self._project_id,
                                 default_query_job_config=job_config)
  self._find_one_non_empty_table()
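# Minimal usage sketch (illustrative values, not from the source); it assumes
# the sample lookup destination tables already exist, as required by
# copy_to_flatten_table() below.
#
#   flattener = FlattenCallColumn(
#       base_table_id='my-project:genomics.variants',  # hypothetical table
#       suffixes=['chr1', 'chr2', 'residual'],         # from sharding config
#       append=False)
#   flattener.copy_to_flatten_table('my-project:genomics.variants_sample_opt')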
def _get_schema(input_table):
  # type: (str) -> bigquery_v2.TableSchema
  """Returns the schema of the given BigQuery table."""
  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      input_table)
  credentials = (client.GoogleCredentials.get_application_default().
                 create_scoped(['https://www.googleapis.com/auth/bigquery']))
  bigquery_client = bigquery_v2.BigqueryV2(credentials=credentials)
  table = bigquery_client.tables.Get(bigquery_v2.BigqueryTablesGetRequest(
      projectId=project_id, datasetId=dataset_id, tableId=table_id))
  return table.schema
def __init__(
    self,
    avro_root_path,  # type: str
    output_table,  # type: str
    suffixes,  # type: List[str]
    delete_empty_tables  # type: bool
    ):
  """Initializes a `LoadAvro` object.

  This class loads AVRO files generated by the Dataflow pipeline into
  BigQuery. In our default sharding config we have 25 output tables, so here
  `suffixes` will have 25 + 1 (sample_info) values. For each of those
  suffixes this class will load the destination BigQuery table with its AVRO
  files; for example, for the `chr1` suffix:
    gs://TEMP_LOCATION/avro/JOB_NAME/YYYYMMDD_HHMMSS/chr1-*
  will be loaded to:
    PROJECT_ID.DATASET_ID.BASE_TABLE_ID__chr1

  After loading, if 0 rows were loaded (i.e. empty AVRO files) then the
  destination table will be deleted if `delete_empty_tables` is set.

  Note1: This class assumes the destination tables are already created. This
    is because integer range partitioning and clustering of columns must be
    done when the table is created.
  Note2: If we run all 26 load jobs in parallel, BigQuery will be overwhelmed
    and jobs fail randomly, thus we use the
    _MAX_NUM_CONCURRENT_AVRO_LOAD_JOBS limit.

  Args:
    avro_root_path: Location of AVRO files on Google Cloud Storage (GCS).
    output_table: Base table name; `__` + suffixes will be added to it.
    suffixes: List of table suffixes: `__chr1`, `__chr2`, ... `sample_info`.
    delete_empty_tables: Whether or not to delete tables with 0 rows loaded.
  """
  self._avro_root_path = avro_root_path
  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      output_table)
  self._table_base_name = '{}.{}.{}'.format(project_id, dataset_id, table_id)

  self._num_load_jobs_retries = 0
  self._suffixes_to_load_jobs = {}  # type: Dict[str, bigquery.job.LoadJob]
  self._remaining_load_jobs = suffixes[:]

  self._delete_empty_tables = delete_empty_tables
  self._not_empty_suffixes = []

  self._client = bigquery.Client(project=project_id)
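# Hedged illustration (hypothetical values): with
#   avro_root_path = 'gs://my-bucket/temp/avro/my-job/20200101_120000/'
#   output_table   = 'my-project:genomics.variants'
# each suffix is loaded from '<avro_root_path><suffix>-*' into the matching
# table, e.g. chr1-* files go to my-project.genomics.variants__chr1 and
# sample_info-* files go to my-project.genomics.variants__sample_info.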
def _validate_output_tables(self, client, output_table_base_name,
                            sharding_config_path, append, is_main_output):
  if (output_table_base_name !=
      bigquery_util.get_table_base_name(output_table_base_name)):
    raise ValueError(
        ('Output table cannot contain "{}". We reserve this string to mark '
         'sharded output tables.').format(
             bigquery_util.TABLE_SUFFIX_SEPARATOR))

  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      output_table_base_name)
  bigquery_util.raise_error_if_dataset_not_exists(client, project_id,
                                                  dataset_id)
  all_output_tables = []
  if is_main_output:
    all_output_tables.append(
        bigquery_util.compose_table_name(table_id, SAMPLE_INFO_TABLE_SUFFIX))
  sharding = variant_sharding.VariantSharding(sharding_config_path)
  num_shards = sharding.get_num_shards()
  # In case there is no residual in the config we will ignore the last shard.
  if not sharding.should_keep_shard(sharding.get_residual_index()):
    num_shards -= 1
  for i in range(num_shards):
    table_suffix = sharding.get_output_table_suffix(i)
    if table_suffix != bigquery_util.get_table_base_name(table_suffix):
      raise ValueError(
          ('Table suffix cannot contain "{}". We reserve this string to mark '
           'sharded output tables.').format(
               bigquery_util.TABLE_SUFFIX_SEPARATOR))
    all_output_tables.append(
        bigquery_util.compose_table_name(table_id, table_suffix))

  for output_table in all_output_tables:
    if append:
      if not bigquery_util.table_exist(client, project_id, dataset_id,
                                       output_table):
        raise ValueError(
            'Table {}:{}.{} does not exist, cannot append to it.'.format(
                project_id, dataset_id, output_table))
    else:
      if bigquery_util.table_exist(client, project_id, dataset_id,
                                   output_table):
        raise ValueError(
            ('Table {}:{}.{} already exists, cannot overwrite it. Please '
             'set `--append True` if you want to append to it.').format(
                 project_id, dataset_id, output_table))
def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    vcf_data_header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and
  then writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the data header to `vcf_data_header_file_path`.
  TODO(allieychen): Eventually, it also generates the meta information file.
  """
  bq_source = bigquery.BigQuerySource(
      query=_BASE_QUERY_TEMPLATE.format(
          INPUT_TABLE='.'.join(bigquery_util.parse_table_reference(
              known_args.input_table))),
      validate=True,
      use_standard_sql=True)

  with beam.Pipeline(options=beam_pipeline_options) as p:
    variants = (p
                | 'ReadFromBigQuery ' >> beam.io.Read(bq_source)
                | bigquery_to_variant.BigQueryToVariant())
    call_names = (variants
                  | 'CombineCallNames' >>
                  combine_call_names.CallNamesCombiner())

    _ = (call_names
         | 'GenerateVcfDataHeader' >>
         beam.ParDo(_write_vcf_data_header,
                    _VCF_FIXED_COLUMNS,
                    vcf_data_header_file_path))

    _ = (variants
         | densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(call_names))
         | 'PairVariantWithKey' >>
         beam.Map(_pair_variant_with_key,
                  known_args.number_of_bases_per_shard)
         | 'GroupVariantsByKey' >> beam.GroupByKey()
         | beam.ParDo(_get_file_path_and_sorted_variants,
                      vcf_data_temp_folder)
         | vcfio.WriteVcfDataLines())
def run(argv=None):
  # type: (List[str]) -> None
  """Runs BigQuery to VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  bq_source = bigquery.BigQuerySource(
      query=_BASE_QUERY_TEMPLATE.format(
          INPUT_TABLE='.'.join(bigquery_util.parse_table_reference(
              known_args.input_table))),
      validate=True,
      use_standard_sql=True)

  options = pipeline_options.PipelineOptions(pipeline_args)
  with beam.Pipeline(options=options) as p:
    _ = (p
         | 'ReadFromBigQuery ' >> beam.io.Read(bq_source)
         | bigquery_to_variant.BigQueryToVariant()
         | densify_variants.DensifyVariants()
         | vcfio.WriteToVcf(known_args.output_file))
def copy_to_flatten_table(self, output_base_table_id):
  # type: (str) -> None
  """Copies data from variant lookup optimized tables to sample lookup tables.

  Copies rows from _base_table_id__* to output_base_table_id__* for each
  value in _suffixes.

  Here we assume destination tables are already created and are partitioned
  based on the call_sample_id column. The copying process is done via a
  flattening query similar to the one used in get_flatten_table_schema().

  Note that if source tables have repeated sample_ids then the output table
  will have more rows than the input table. Essentially:
  Number of output rows = Number of input rows * Number of repeated sample_ids

  Args:
    output_base_table_id: Base table name of output tables.
  """
  # Here we assume all output_table_base + suffixes[:] are already created.
  (output_project_id, output_dataset_id, output_base_table) = (
      bigquery_util.parse_table_reference(output_base_table_id))
  select_columns = self._get_flatten_column_names()
  for suffix in self._suffixes:
    input_table_id = bigquery_util.compose_table_name(self._base_table,
                                                      suffix)
    output_table_id = bigquery_util.compose_table_name(output_base_table,
                                                       suffix)
    full_output_table_id = '{}.{}.{}'.format(
        output_project_id, output_dataset_id, output_table_id)
    cp_query = _FLATTEN_CALL_QUERY.format(
        SELECT_COLUMNS=select_columns,
        PROJECT_ID=self._project_id,
        DATASET_ID=self._dataset_id,
        TABLE_ID=input_table_id,
        MAIN_TABLE_ALIAS=_MAIN_TABLE_ALIAS,
        CALL_COLUMN=bigquery_util.ColumnKeyConstants.CALLS,
        CALL_TABLE_ALIAS=_CALL_TABLE_ALIAS)
    self._copy_to_flatten_table(full_output_table_id, cp_query)
    logging.info('Flatten table is fully loaded: %s', full_output_table_id)
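# Worked example of the row count note above (illustrative numbers): if an
# input shard holds 1,000 variant rows and each row's repeated call column
# contains 3 samples, the flattened output shard holds about 3,000 rows, one
# per (variant, call_sample_id) pair, which is what makes sample lookups
# cheap once the output tables are partitioned on call_sample_id.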
def validate(self, parsed_args, client=None):
  # type: (argparse.Namespace, bigquery.BigqueryV2) -> None
  if not parsed_args.output_table and parsed_args.output_avro_path:
    # Writing into BigQuery is not requested; no more BigQuery checks needed.
    return

  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      parsed_args.output_table)
  if not client:
    credentials = GoogleCredentials.get_application_default().create_scoped(
        ['https://www.googleapis.com/auth/bigquery'])
    client = bigquery.BigqueryV2(credentials=credentials)
  bigquery_util.raise_error_if_dataset_not_exists(client, project_id,
                                                  dataset_id)
  # Ensuring given output table doesn't already exist to avoid overwriting it.
  if not parsed_args.append:
    if parsed_args.update_schema_on_append:
      raise ValueError('--update_schema_on_append requires --append to be '
                       'true.')
    bigquery_util.raise_error_if_table_exists(client, project_id, dataset_id,
                                              table_id)
def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and
  then writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the sample names to
  `header_file_path`.
  """
  schema = _get_schema(known_args.input_table)
  variant_query = _get_variant_query(known_args, schema)
  logging.info('Processing BigQuery query %s:', variant_query)
  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      known_args.input_table)
  bq_variant_source = bigquery.BigQuerySource(query=variant_query,
                                              validate=True,
                                              use_standard_sql=True)
  annotation_names = _extract_annotation_names(schema)

  base_table_id = bigquery_util.get_table_base_name(table_id)
  sample_query = _SAMPLE_INFO_QUERY_TEMPLATE.format(
      PROJECT_ID=project_id,
      DATASET_ID=dataset_id,
      TABLE_NAME=bigquery_util.compose_table_name(base_table_id,
                                                  SAMPLE_INFO_TABLE_SUFFIX))
  bq_sample_source = bigquery.BigQuerySource(query=sample_query,
                                             validate=True,
                                             use_standard_sql=True)

  with beam.Pipeline(options=beam_pipeline_options) as p:
    variants = (p
                | 'ReadFromBigQuery ' >> beam.io.Read(bq_variant_source)
                | bigquery_to_variant.BigQueryToVariant(annotation_names))

    sample_table_rows = (
        p
        | 'ReadFromSampleTable' >> beam.io.Read(bq_sample_source))
    if known_args.sample_names:
      temp_sample_names = (p
                           | transforms.Create(known_args.sample_names,
                                               reshuffle=False))
    else:
      # Get sample names from sample IDs in the variants and sort.
      id_to_name_hash_table = (
          sample_table_rows
          | 'SampleIdToNameDict' >> sample_mapping_table.SampleIdToNameDict())
      temp_sample_ids = (
          variants
          | 'CombineSampleIds' >>
          combine_sample_ids.SampleIdsCombiner(
              known_args.preserve_sample_order))
      temp_sample_names = (
          temp_sample_ids
          | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
              beam.pvalue.AsSingleton(id_to_name_hash_table))
          | 'CombineToList' >> beam.combiners.ToList()
          | 'SortSampleNames' >> beam.ParDo(sorted))

    name_to_id_hash_table = (
        sample_table_rows
        | 'SampleNameToIdDict' >> sample_mapping_table.SampleNameToIdDict())
    sample_ids = (temp_sample_names
                  | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
                      beam.pvalue.AsSingleton(name_to_id_hash_table))
                  | 'CombineSortedSampleIds' >> beam.combiners.ToList())
    sample_names = temp_sample_names | beam.combiners.ToList()

    _ = (sample_names
         | 'GenerateVcfDataHeader' >>
         beam.ParDo(_write_vcf_header_with_sample_names,
                    _VCF_FIXED_COLUMNS,
                    known_args.representative_header_file,
                    header_file_path))

    _ = (variants
         | densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'PairVariantWithKey' >>
         beam.Map(_pair_variant_with_key,
                  known_args.number_of_bases_per_shard)
         | 'GroupVariantsByKey' >> beam.GroupByKey()
         | beam.ParDo(_get_file_path_and_sorted_variants,
                      vcf_data_temp_folder)
         | vcfio.WriteVcfDataLines(known_args.bq_uses_1_based_coordinate))