def validate(self, parsed_args, client=None):
  if not client:
    credentials = GoogleCredentials.get_application_default().create_scoped(
        ['https://www.googleapis.com/auth/bigquery'])
    client = bigquery.BigqueryV2(credentials=credentials)

  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      parsed_args.input_table)
  if not bigquery_util.table_exist(client, project_id, dataset_id, table_id):
    raise ValueError('Table {}:{}.{} does not exist.'.format(
        project_id, dataset_id, table_id))
  if table_id.count(TABLE_SUFFIX_SEPARATOR) != 1:
    raise ValueError(
        'Input table {} is malformed - exactly one suffix separator "{}" is '
        'required'.format(parsed_args.input_table, TABLE_SUFFIX_SEPARATOR))
  base_table_id = table_id[:table_id.find(TABLE_SUFFIX_SEPARATOR)]
  sample_table_id = bigquery_util.compose_table_name(base_table_id,
                                                     SAMPLE_INFO_TABLE_SUFFIX)

  if not bigquery_util.table_exist(client, project_id, dataset_id,
                                   sample_table_id):
    raise ValueError('Sample table {}:{}.{} does not exist.'.format(
        project_id, dataset_id, sample_table_id))
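# A minimal, self-contained sketch (not part of the original module) of the
# suffix convention that validate() checks: the input table id must contain
# exactly one suffix separator, and a companion sample-info table derived from
# the base name must also exist. The table names and the '__' separator below
# are assumptions used only for illustration.
_EXAMPLE_SEPARATOR = '__'
_example_table_id = 'variants' + _EXAMPLE_SEPARATOR + 'chr1'
assert _example_table_id.count(_EXAMPLE_SEPARATOR) == 1
_example_base = _example_table_id[:_example_table_id.find(_EXAMPLE_SEPARATOR)]
_example_sample_table = _EXAMPLE_SEPARATOR.join([_example_base, 'sample_info'])
# _example_base == 'variants'; _example_sample_table == 'variants__sample_info'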
def _validate_output_tables(self, client, output_table_base_name,
                            sharding_config_path, append, is_main_output):
  if (output_table_base_name !=
      bigquery_util.get_table_base_name(output_table_base_name)):
    raise ValueError(
        ('Output table cannot contain "{}". We reserve this string to mark '
         'sharded output tables.').format(
             bigquery_util.TABLE_SUFFIX_SEPARATOR))

  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      output_table_base_name)
  bigquery_util.raise_error_if_dataset_not_exists(client, project_id,
                                                  dataset_id)
  all_output_tables = []
  if is_main_output:
    all_output_tables.append(
        bigquery_util.compose_table_name(table_id, SAMPLE_INFO_TABLE_SUFFIX))
  sharding = variant_sharding.VariantSharding(sharding_config_path)
  num_shards = sharding.get_num_shards()
  # If the config has no residual shard, ignore the last shard.
  if not sharding.should_keep_shard(sharding.get_residual_index()):
    num_shards -= 1
  for i in range(num_shards):
    table_suffix = sharding.get_output_table_suffix(i)
    if table_suffix != bigquery_util.get_table_base_name(table_suffix):
      raise ValueError(
          ('Table suffix cannot contain "{}". We reserve this string to mark '
           'sharded output tables.').format(
               bigquery_util.TABLE_SUFFIX_SEPARATOR))
    all_output_tables.append(
        bigquery_util.compose_table_name(table_id, table_suffix))

  for output_table in all_output_tables:
    if append:
      if not bigquery_util.table_exist(client, project_id, dataset_id,
                                       output_table):
        raise ValueError(
            'Table {}:{}.{} does not exist, cannot append to it.'.format(
                project_id, dataset_id, output_table))
    else:
      if bigquery_util.table_exist(client, project_id, dataset_id,
                                   output_table):
        raise ValueError(
            ('Table {}:{}.{} already exists, cannot overwrite it. Please set '
             '`--append True` if you want to append to it.').format(
                 project_id, dataset_id, output_table))
def _find_one_non_empty_table(self):
  # Any non-empty input table can be used as the source for schema extraction.
  for suffix in self._suffixes:
    table_id = bigquery_util.compose_table_name(self._base_table, suffix)
    if not bigquery_util.table_empty(self._project_id, self._dataset_id,
                                     table_id):
      self._schema_table_id = table_id
      return
  raise ValueError('All of the variant optimized tables are empty!')
def copy_to_flatten_table(self, output_base_table_id):
  # type: (str) -> None
  """Copies data from variant lookup optimized tables to sample lookup tables.

  Copies rows from _base_table_id__* to output_base_table_id__* for each value
  in _suffixes. Here we assume the destination tables are already created and
  are partitioned based on the call_sample_id column. The copying is done via
  a flattening query similar to the one used in get_flatten_table_schema().

  Note that if source tables have repeated sample_ids then the output table
  will have more rows than the input table. Essentially:
  Number of output rows = Number of input rows * Number of repeated sample_ids

  Args:
    output_base_table_id: Base table name of output tables.
  """
  # Here we assume all output_table_base + suffixes[:] are already created.
  (output_project_id, output_dataset_id, output_base_table) = (
      bigquery_util.parse_table_reference(output_base_table_id))
  select_columns = self._get_flatten_column_names()
  for suffix in self._suffixes:
    input_table_id = bigquery_util.compose_table_name(self._base_table, suffix)
    output_table_id = bigquery_util.compose_table_name(output_base_table,
                                                       suffix)
    full_output_table_id = '{}.{}.{}'.format(
        output_project_id, output_dataset_id, output_table_id)
    cp_query = _FLATTEN_CALL_QUERY.format(
        SELECT_COLUMNS=select_columns,
        PROJECT_ID=self._project_id,
        DATASET_ID=self._dataset_id,
        TABLE_ID=input_table_id,
        MAIN_TABLE_ALIAS=_MAIN_TABLE_ALIAS,
        CALL_COLUMN=bigquery_util.ColumnKeyConstants.CALLS,
        CALL_TABLE_ALIAS=_CALL_TABLE_ALIAS)
    self._copy_to_flatten_table(full_output_table_id, cp_query)
    logging.info('Flatten table is fully loaded: %s', full_output_table_id)
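# A rough sketch (an assumption, not the actual _FLATTEN_CALL_QUERY constant)
# of the kind of flattening query the docstring above describes: each repeated
# entry in the call column becomes its own output row, so a variant row with N
# calls produces N flattened rows. The project, dataset, table, column, and
# alias names below are hypothetical.
_EXAMPLE_FLATTEN_QUERY = (
    'SELECT main_table.reference_name, main_table.start_position, '
    '       call_table.sample_id, call_table.genotype '
    'FROM `my-project.my_dataset.variants__chr1` AS main_table, '
    '     UNNEST(main_table.call) AS call_table')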
def _start_one_load_job(self, suffix):
  # After issue #582 is resolved we can remove the create_disposition flag.
  job_config = bigquery.LoadJobConfig(
      source_format=bigquery.SourceFormat.AVRO,
      create_disposition='CREATE_NEVER')
  uri = self._avro_root_path + suffix + '-*'
  table_id = bigquery_util.compose_table_name(self._table_base_name, suffix)
  load_job = self._client.load_table_from_uri(
      uri, table_id, job_config=job_config)
  self._suffixes_to_load_jobs.update({suffix: load_job})
def __init__(self, output_table_prefix, sample_name_encoding, append=False):
  # type: (str, int, bool) -> None
  """Initializes the transform.

  Args:
    output_table_prefix: The prefix of the output BigQuery table.
    sample_name_encoding: If SampleNameEncoding.WITHOUT_FILE_PATH is supplied,
      sample_id is hashed from sample_name only; otherwise both sample_name
      and file_name are used.
    append: If True, existing records in output_table will not be overwritten.
      New records will be appended to those that already exist.
  """
  self._output_table = bigquery_util.compose_table_name(
      output_table_prefix, bigquery_util.SAMPLE_INFO_TABLE_SUFFIX)
  self._append = append
  self._sample_name_encoding = sample_name_encoding
  self._schema = sample_info_table_schema_generator.generate_schema()
def _handle_failed_load_job(self, suffix, load_job):
  table_id = bigquery_util.compose_table_name(self._table_base_name, suffix)
  logging.warning('Failed to load AVRO to BigQuery table: %s', table_id)
  exception_str = ''
  if load_job.exception():
    exception_str = str(load_job.exception())
    logging.warning('Load job exception: %s', exception_str)
  if self._num_load_jobs_retries < bigquery_util.BQ_NUM_RETRIES:
    logging.warning('Retrying the failed job...')
    self._num_load_jobs_retries += 1
    time.sleep(300)
    self._start_one_load_job(suffix)
  else:
    logging.error('AVRO load jobs have failed more than BQ_NUM_RETRIES times.')
    self._cancel_all_running_load_jobs()
    raise ValueError(
        'Failed to load AVRO to BigQuery table {} \n state: {} \n '
        'job_id: {} \n exception: {}.'.format(
            table_id, load_job.state, load_job.path, exception_str))
def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and then
  writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the sample names to
  `vcf_header_file_path`.
  """
  schema = _get_schema(known_args.input_table)
  variant_query = _get_variant_query(known_args, schema)
  logging.info('Processing BigQuery query %s:', variant_query)
  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      known_args.input_table)
  bq_variant_source = bigquery.BigQuerySource(query=variant_query,
                                              validate=True,
                                              use_standard_sql=True)
  annotation_names = _extract_annotation_names(schema)
  base_table_id = bigquery_util.get_table_base_name(table_id)
  sample_query = _SAMPLE_INFO_QUERY_TEMPLATE.format(
      PROJECT_ID=project_id,
      DATASET_ID=dataset_id,
      TABLE_NAME=bigquery_util.compose_table_name(base_table_id,
                                                  SAMPLE_INFO_TABLE_SUFFIX))
  bq_sample_source = bigquery.BigQuerySource(query=sample_query,
                                             validate=True,
                                             use_standard_sql=True)
  with beam.Pipeline(options=beam_pipeline_options) as p:
    variants = (p
                | 'ReadFromBigQuery' >> beam.io.Read(bq_variant_source)
                | bigquery_to_variant.BigQueryToVariant(annotation_names))
    sample_table_rows = (
        p
        | 'ReadFromSampleTable' >> beam.io.Read(bq_sample_source))
    if known_args.sample_names:
      temp_sample_names = (p
                           | transforms.Create(known_args.sample_names,
                                               reshuffle=False))
    else:
      # Get sample names from sample IDs in the variants and sort.
      id_to_name_hash_table = (
          sample_table_rows
          | 'SampleIdToNameDict' >> sample_mapping_table.SampleIdToNameDict())
      temp_sample_ids = (
          variants
          | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
              known_args.preserve_sample_order))
      temp_sample_names = (
          temp_sample_ids
          | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
              beam.pvalue.AsSingleton(id_to_name_hash_table))
          | 'CombineToList' >> beam.combiners.ToList()
          | 'SortSampleNames' >> beam.ParDo(sorted))
    name_to_id_hash_table = (
        sample_table_rows
        | 'SampleNameToIdDict' >> sample_mapping_table.SampleNameToIdDict())
    sample_ids = (
        temp_sample_names
        | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
            beam.pvalue.AsSingleton(name_to_id_hash_table))
        | 'CombineSortedSampleIds' >> beam.combiners.ToList())
    sample_names = temp_sample_names | beam.combiners.ToList()

    _ = (sample_names
         | 'GenerateVcfDataHeader' >> beam.ParDo(
             _write_vcf_header_with_sample_names,
             _VCF_FIXED_COLUMNS,
             known_args.representative_header_file,
             header_file_path))

    _ = (variants
         | densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'PairVariantWithKey' >> beam.Map(
             _pair_variant_with_key, known_args.number_of_bases_per_shard)
         | 'GroupVariantsByKey' >> beam.GroupByKey()
         | beam.ParDo(_get_file_path_and_sorted_variants, vcf_data_temp_folder)
         | vcfio.WriteVcfDataLines(known_args.bq_uses_1_based_coordinate))
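# A minimal sketch of how the 'PairVariantWithKey' step above could derive a
# region key (assumed logic, shown for illustration; the real helper is
# _pair_variant_with_key in this module). Variants whose start positions fall
# inside the same window of `number_of_bases_per_shard` bases share a key, so
# the following GroupByKey collects them into one VCF data shard.
def _example_pair_variant_with_key(variant, number_of_bases_per_shard):
  window_start = (variant.start // number_of_bases_per_shard *
                  number_of_bases_per_shard)
  return ('{}_{:011d}'.format(variant.reference_name, window_start), variant)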
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  if known_args.auto_flags_experiment:
    _get_input_dimensions(known_args, pipeline_args)

  annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)
  all_patterns = ([annotated_vcf_pattern] if annotated_vcf_pattern
                  else known_args.all_patterns)

  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)
  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  avro_root_path = _get_avro_root_path(beam_pipeline_options)

  # Starts a pipeline to merge VCF headers in Beam if the total number of
  # files that match the input pattern exceeds _SMALL_DATA_THRESHOLD.
  _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                 annotated_vcf_pattern)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on
  # input. See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.allow_malformed_records,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      known_args.infer_annotation_types,
      counter_factory)

  schema = schema_converter.generate_schema_from_header_fields(
      header_fields, processed_variant_factory, variant_merger,
      known_args.use_1_based_coordinate, known_args.include_call_name)

  sharding = variant_sharding.VariantSharding(known_args.sharding_config_path)
  if sharding.should_keep_shard(sharding.get_residual_index()):
    num_shards = sharding.get_num_shards()
  else:
    num_shards = sharding.get_num_shards() - 1

  if known_args.update_schema_on_append:
    for i in range(num_shards):
      table_suffix = sharding.get_output_table_suffix(i)
      table_name = bigquery_util.compose_table_name(known_args.output_table,
                                                    table_suffix)
      bigquery_util.update_bigquery_schema_on_append(schema.fields, table_name)

  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(
      all_patterns, pipeline, known_args, pipeline_mode,
      use_1_based_coordinate=known_args.use_1_based_coordinate)
  if known_args.allow_malformed_records:
    variants |= 'DropMalformedRecords' >> filter_variants.FilterVariants()
  sharded_variants = variants | 'ShardVariants' >> beam.Partition(
      shard_variants.ShardVariants(sharding), sharding.get_num_shards())

  variants = []
  for i in range(num_shards):
    suffix = sharding.get_output_table_suffix(i)
    # Convert tuples to list
    variants.append(sharded_variants[i])
    if variant_merger:
      variants[i] |= ('MergeVariants' + suffix >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + suffix >>
        beam.Map(processed_variant_factory.create_processed_variant)
        .with_output_types(processed_variant.ProcessedVariant))
    _ = (variants[i]
         | 'VariantToAvro' + suffix >>
         variant_to_avro.VariantToAvroFiles(
             avro_root_path + suffix,
             schema,
             allow_incompatible_records=known_args.allow_incompatible_records,
             omit_empty_sample_calls=known_args.omit_empty_sample_calls,
             null_numeric_value_replacement=(
                 known_args.null_numeric_value_replacement),
             include_call_name=known_args.include_call_name))

  result = pipeline.run()
  try:
    state = result.wait_until_finish()
    if state != beam.runners.runner.PipelineState.DONE:
      logging.error('Dataflow pipeline terminated in an unexpected state: %s',
                    state)
      raise AssertionError(
          'Dataflow pipeline terminated in {} state'.format(state))
  except Exception as e:
    logging.error('Dataflow pipeline failed.')
    raise e
  else:
    logging.info('Dataflow pipeline finished successfully.')
    metrics_util.log_all_counters(result)

  # After the pipeline is done, create the output tables and load the AVRO
  # files into them.
  schema_file = _write_schema_to_temp_file(schema, avro_root_path)
  suffixes = []
  try:
    for i in range(num_shards):
      suffixes.append(sharding.get_output_table_suffix(i))
      partition_range_end = sharding.get_output_table_partition_range_end(i)
      if not known_args.append:
        table_name = bigquery_util.compose_table_name(known_args.output_table,
                                                      suffixes[i])
        partitioning.create_bq_table(
            table_name, schema_file,
            bigquery_util.ColumnKeyConstants.START_POSITION,
            partition_range_end)
        _record_newly_created_table(table_name)
        logging.info('Integer range partitioned table %s was created.',
                     table_name)
    if not known_args.append:
      _record_newly_created_table(
          sample_info_table_schema_generator.create_sample_info_table(
              known_args.output_table))
    suffixes.append(sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    load_avro = avro_util.LoadAvro(
        avro_root_path, known_args.output_table, suffixes, False)
    not_empty_variant_suffixes = load_avro.start_loading()
    logging.info('The following tables were loaded with at least 1 row:')
    for suffix in not_empty_variant_suffixes:
      logging.info(bigquery_util.compose_table_name(known_args.output_table,
                                                    suffix))
    # Remove the sample_info table from both lists to avoid duplicating it
    # when the --sample_lookup_optimized_output_table flag is set.
    suffixes.remove(sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    if (sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX in
        not_empty_variant_suffixes):
      not_empty_variant_suffixes.remove(
          sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
  except Exception as e:
    logging.error('Something unexpected happened during the loading of AVRO '
                  'files to BigQuery: %s', str(e))
    logging.info('Since the write to BigQuery stage failed, we did not delete '
                 'the AVRO files in your GCS bucket. You can manually import '
                 'them to BigQuery. To avoid extra storage charges, delete '
                 'them if you do not need them; the AVRO files are located '
                 'at: %s', avro_root_path)
    raise e
  else:
    logging.warning('All AVRO files were successfully loaded to BigQuery.')
    if known_args.keep_intermediate_avro_files:
      logging.info('Since the "--keep_intermediate_avro_files" flag is set, '
                   'the AVRO files are kept and stored at: %s', avro_root_path)
    else:
      if bigquery_util.delete_gcs_files(avro_root_path) != 0:
        logging.error('Deletion of intermediate AVRO files located at "%s" '
                      'has failed.', avro_root_path)

  if known_args.sample_lookup_optimized_output_table:
    flatten_call_column = partitioning.FlattenCallColumn(
        known_args.output_table, not_empty_variant_suffixes, known_args.append)
    try:
      flatten_schema_file = tempfile.mkstemp(suffix=_BQ_SCHEMA_FILE_SUFFIX)[1]
      if not flatten_call_column.get_flatten_table_schema(flatten_schema_file):
        raise ValueError('Failed to extract the schema of the flatten table.')
      # Create the output flatten tables if needed.
      if not known_args.append:
        # Create all sample optimized tables, including those that will be
        # empty.
        for suffix in suffixes:
          output_table_id = bigquery_util.compose_table_name(
              known_args.sample_lookup_optimized_output_table, suffix)
          partitioning.create_bq_table(
              output_table_id, flatten_schema_file,
              bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID,
              partitioning.MAX_RANGE_END)
          _record_newly_created_table(output_table_id)
          logging.info('Sample lookup optimized table %s was created.',
                       output_table_id)
      # Copy to flatten sample lookup tables from the variant lookup tables.
      # Note: uses WRITE_TRUNCATE to overwrite the existing tables (issue #607).
      flatten_call_column.copy_to_flatten_table(
          known_args.sample_lookup_optimized_output_table)
      logging.info('All sample lookup optimized tables are fully loaded.')
    except Exception as e:
      logging.error('Something unexpected happened while loading rows to the '
                    'sample optimized tables: %s', str(e))
      raise e
def create_sample_info_table(output_table):
  full_table_id = bigquery_util.compose_table_name(output_table,
                                                   SAMPLE_INFO_TABLE_SUFFIX)
  partitioning.create_bq_table(full_table_id,
                               SAMPLE_INFO_TABLE_SCHEMA_FILE_PATH)
  return full_table_id