def run(argv=None):
  # type: (List[str]) -> None
  """Runs BigQuery to VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  # TODO(allieychen): Add support for local location.
  if not google_cloud_options.temp_location or not google_cloud_options.project:
    raise ValueError('temp_location and project must be set.')

  timestamp_str = datetime.now().strftime('%Y%m%d_%H%M%S')
  vcf_data_temp_folder = filesystems.FileSystems.join(
      google_cloud_options.temp_location,
      'bq_to_vcf_data_temp_files_{}'.format(timestamp_str))
  vcf_data_header_file_path = filesystems.FileSystems.join(
      google_cloud_options.temp_location,
      'bq_to_vcf_data_header_{}'.format(timestamp_str))

  _bigquery_to_vcf_shards(known_args, options, vcf_data_temp_folder,
                          vcf_data_header_file_path)
  vcf_file_composer.compose_vcf_shards(google_cloud_options.project,
                                       vcf_data_header_file_path,
                                       vcf_data_temp_folder,
                                       known_args.output_file)
def run(argv=None):
  # type: (List[str]) -> None
  """Runs BigQuery to VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  is_direct_runner = _is_direct_runner(beam.Pipeline(options=options))
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  if not google_cloud_options.project:
    raise ValueError('project must be set.')
  if not is_direct_runner and not known_args.output_file.startswith('gs://'):
    raise ValueError('Please set the output file {} to GCS when running with '
                     'DataflowRunner.'.format(known_args.output_file))
  if is_direct_runner:
    known_args.number_of_bases_per_shard = sys.maxsize

  temp_folder = google_cloud_options.temp_location or tempfile.mkdtemp()
  # TODO(allieychen): Refactor the generation of the unique temp id to a common
  # lib.
  unique_temp_id = '-'.join([
      google_cloud_options.job_name or _BQ_TO_VCF_SHARDS_JOB_NAME,
      datetime.now().strftime('%Y%m%d-%H%M%S'),
      str(uuid.uuid4())])
  vcf_data_temp_folder = filesystems.FileSystems.join(
      temp_folder,
      '{}_data_temp_files'.format(unique_temp_id))
  # Create the directory manually. FileSystems cannot create a file if the
  # directory does not exist when using Direct Runner.
  filesystems.FileSystems.mkdirs(vcf_data_temp_folder)
  vcf_header_file_path = filesystems.FileSystems.join(
      temp_folder,
      '{}_header_with_call_names.vcf'.format(unique_temp_id))

  if not known_args.representative_header_file:
    known_args.representative_header_file = filesystems.FileSystems.join(
        temp_folder,
        '{}_meta_info.vcf'.format(unique_temp_id))
    _write_vcf_meta_info(known_args.input_table,
                         known_args.representative_header_file,
                         known_args.allow_incompatible_schema)

  _bigquery_to_vcf_shards(known_args, options, vcf_data_temp_folder,
                          vcf_header_file_path)
  if is_direct_runner:
    vcf_file_composer.compose_local_vcf_shards(vcf_header_file_path,
                                               vcf_data_temp_folder,
                                               known_args.output_file)
  else:
    vcf_file_composer.compose_gcs_vcf_shards(google_cloud_options.project,
                                             vcf_header_file_path,
                                             vcf_data_temp_folder,
                                             known_args.output_file)
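# The run() above calls an _is_direct_runner helper that is not shown in this
# section. The following is only a plausible sketch of such a helper, assuming
# it inspects the pipeline's runner type; the import and the exact check are
# assumptions for illustration, not the original implementation.
from apache_beam.runners.direct import direct_runner  # assumed import


def _is_direct_runner(pipeline):
  # type: (beam.Pipeline) -> bool
  """Returns True if `pipeline` would execute locally with the DirectRunner."""
  return isinstance(pipeline.runner, direct_runner.DirectRunner)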
def run(argv=None):
  # type: (List[str]) -> None
  """Runs BigQuery to VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  bq_source = bigquery.BigQuerySource(
      query=_BASE_QUERY_TEMPLATE.format(
          INPUT_TABLE='.'.join(
              bigquery_util.parse_table_reference(known_args.input_table))),
      validate=True,
      use_standard_sql=True)

  options = pipeline_options.PipelineOptions(pipeline_args)
  with beam.Pipeline(options=options) as p:
    _ = (p
         | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
         | bigquery_to_variant.BigQueryToVariant()
         | densify_variants.DensifyVariants()
         | vcfio.WriteToVcf(known_args.output_file))
def run(argv=None):
  # type: (List[str]) -> (str, str)
  """Runs preprocess pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline_mode = vcf_to_bq_common.get_pipeline_mode(known_args.input_pattern)

  with beam.Pipeline(options=options) as p:
    headers = vcf_to_bq_common.read_headers(p, pipeline_mode, known_args)
    merged_headers = vcf_to_bq_common.get_merged_headers(headers)
    merged_definitions = (
        headers
        | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    if known_args.report_all_conflicts:
      variants = p | 'ReadFromVcf' >> vcfio.ReadFromVcf(
          known_args.input_pattern, allow_malformed_records=True)
      malformed_records = variants | filter_variants.ExtractMalformedVariants()
      inferred_headers, merged_headers = _get_inferred_headers(
          variants, merged_headers)
      _ = (merged_definitions
           | 'GenerateConflictsReport' >> beam.ParDo(
               preprocess_reporter.generate_report,
               known_args.report_path,
               beam.pvalue.AsSingleton(merged_headers),
               beam.pvalue.AsSingleton(inferred_headers),
               beam.pvalue.AsIter(malformed_records)))
    else:
      _ = (merged_definitions
           | 'GenerateConflictsReport' >> beam.ParDo(
               preprocess_reporter.generate_report,
               known_args.report_path,
               beam.pvalue.AsSingleton(merged_headers)))

    if known_args.resolved_headers_path:
      vcf_to_bq_common.write_headers(merged_headers,
                                     known_args.resolved_headers_path)
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  # Note VepRunner creates new input files, so it should be run before any
  # other access to known_args.input_pattern.
  if known_args.run_annotation_pipeline:
    runner = vep_runner.create_runner_and_update_args(known_args, pipeline_args)
    runner.run_on_all_files()
    runner.wait_until_done()
    logging.info('Using VEP processed files: %s', known_args.input_pattern)

  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
      known_args.input_pattern, known_args.optimize_for_large_inputs)

  # Starts a pipeline to merge VCF headers in beam if the total number of files
  # that match the input pattern exceeds _SMALL_DATA_THRESHOLD.
  _merge_headers(known_args, pipeline_args, pipeline_mode)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on input.
  # See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      counter_factory)

  partitioner = None
  if known_args.optimize_for_large_inputs or known_args.partition_config_path:
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(pipeline, known_args)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i) >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant)
        .with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  for i in range(num_partitions):
    table_suffix = ''
    if partitioner and partitioner.get_partition_name(i):
      table_suffix = '_' + partitioner.get_partition_name(i)
    table_name = known_args.output_table + table_suffix
    _ = (variants[i]
         | 'VariantToBigQuery' + table_suffix >>
         variant_to_bigquery.VariantToBigQuery(
             table_name,
             header_fields,
             variant_merger,
             processed_variant_factory,
             append=known_args.append,
             allow_incompatible_records=known_args.allow_incompatible_records,
             omit_empty_sample_calls=known_args.omit_empty_sample_calls,
             num_bigquery_write_shards=known_args.num_bigquery_write_shards))

  result = pipeline.run()
  result.wait_until_finish()
  metrics_util.log_all_counters(result)
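# A minimal, hypothetical entry-point sketch: a module containing one of the
# run() functions above would typically be executed as a script roughly like
# this. The exact logging setup is an assumption, not the original code.
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()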