def test_partition_variants(self):
  # type: () -> None
  """Checks that PartitionVariants routes each variant to its partition.

  Builds the expected partition-index -> variants mapping from the standard
  and nonstandard fixture helpers, pushes all variants through a
  beam.Partition transform driven by a default VariantPartition, and asserts
  that every output partition holds exactly the expected variants (empty
  list for indices with no expected entries).
  """
  expected_partitions = self._get_standard_variant_partitions()
  expected_partitions.update(self._get_nonstandard_variant_partitions())
  variants = [variant
              for variant_list in expected_partitions.values()
              for variant in variant_list]
  partitioner = variant_partition.VariantPartition()
  pipeline = TestPipeline()
  partitions = (
      pipeline
      | Create(variants)
      | 'PartitionVariants' >> beam.Partition(
          partition_variants.PartitionVariants(partitioner),
          partitioner.get_num_partitions()))
  # `range` instead of Python-2-only `xrange`: keeps the test runnable on
  # Python 3 as well, and the partition count is small so materializing the
  # sequence costs nothing.
  for i in range(partitioner.get_num_partitions()):
    assert_that(partitions[i],
                equal_to(expected_partitions.get(i, [])),
                label=str(i))
  pipeline.run()
def run(argv=None):
  # type: (List[str]) -> None
  """Runs the VCF to BigQuery pipeline.

  Parses command-line flags, optionally runs the VEP annotation pipeline
  first (it rewrites known_args.input_pattern), merges VCF headers to derive
  the BigQuery schema, then builds and runs a Beam pipeline that reads,
  filters, optionally partitions/merges variants, and writes each partition
  to its own BigQuery table.

  Args:
    argv: Command-line arguments; defaults to sys.argv when None.
  """
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  # Note VepRunner creates new input files, so it should be run before any
  # other access to known_args.input_pattern.
  if known_args.run_annotation_pipeline:
    runner = vep_runner.create_runner_and_update_args(known_args,
                                                      pipeline_args)
    runner.run_on_all_files()
    runner.wait_until_done()
    logging.info('Using VEP processed files: %s', known_args.input_pattern)

  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
      known_args.input_pattern, known_args.optimize_for_large_inputs)

  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD.
  _merge_headers(known_args, pipeline_args, pipeline_mode)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on
  # input. See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      counter_factory)

  partitioner = None
  if known_args.optimize_for_large_inputs or known_args.partition_config_path:
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(pipeline, known_args)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        # Dropped partitions shrink the effective count; the range() above
        # is already materialized, so decrementing here is safe.
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i)
                      >> merge_variants.MergeVariants(variant_merger))
    # Label typo fixed: 'ProcessVaraints' -> 'ProcessVariants' (matches the
    # spelling used elsewhere in this module).
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant).
        with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  for i in range(num_partitions):
    # Named partitions get their own table: <output_table>_<partition_name>.
    table_suffix = ''
    if partitioner and partitioner.get_partition_name(i):
      table_suffix = '_' + partitioner.get_partition_name(i)
    table_name = known_args.output_table + table_suffix
    _ = (variants[i] |
         'VariantToBigQuery' + table_suffix >>
         variant_to_bigquery.VariantToBigQuery(
             table_name,
             header_fields,
             variant_merger,
             processed_variant_factory,
             append=known_args.append,
             allow_incompatible_records=known_args.allow_incompatible_records,
             omit_empty_sample_calls=known_args.omit_empty_sample_calls,
             num_bigquery_write_shards=known_args.num_bigquery_write_shards))

  result = pipeline.run()
  result.wait_until_finish()
  metrics_util.log_all_counters(result)
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline.

  Args:
    argv: Command-line arguments; defaults to sys.argv when None.
  """
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(argv,
                                                         _COMMAND_LINE_OPTIONS)
  if known_args.auto_flags_experiment:
    _get_input_dimensions(known_args, pipeline_args)

  # When annotation produced new VCFs, read those instead of the raw inputs.
  annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)
  all_patterns = (
      [annotated_vcf_pattern] if annotated_vcf_pattern
      else known_args.all_patterns)
  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = pipeline_common.get_pipeline_mode(
      all_patterns, known_args.optimize_for_large_inputs)
  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD
  _merge_headers(known_args, pipeline_args, pipeline_mode,
                 annotated_vcf_pattern)
  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on input.
  # See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.allow_malformed_records,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      known_args.infer_annotation_types,
      counter_factory)
  partitioner = None
  # Partitioning kicks in only for large inputs that also merge variants, or
  # when the user explicitly supplied a partition config.
  if ((known_args.optimize_for_large_inputs and variant_merger) or
      known_args.partition_config_path):
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)
  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(all_patterns, pipeline, known_args, pipeline_mode)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    # Keep only the partitions the partitioner wants; `variants` becomes a
    # list of PCollections and num_partitions its final length.
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        # Safe to decrement: the range() above is already materialized.
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]
  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i)
                      >> merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant).\
        with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    # Collapse all kept partitions back into a single PCollection.
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1
  if known_args.output_table:
    for i in range(num_partitions):
      # Named partitions get their own table: <output_table>_<partition_name>.
      table_suffix = ''
      if partitioner and partitioner.get_partition_name(i):
        table_suffix = '_' + partitioner.get_partition_name(i)
      table_name = known_args.output_table + table_suffix
      _ = (variants[i] | 'VariantToBigQuery' + table_suffix >>
           variant_to_bigquery.VariantToBigQuery(
               table_name,
               header_fields,
               variant_merger,
               processed_variant_factory,
               append=known_args.append,
               update_schema_on_append=known_args.update_schema_on_append,
               allow_incompatible_records=known_args.allow_incompatible_records,
               omit_empty_sample_calls=known_args.omit_empty_sample_calls,
               num_bigquery_write_shards=known_args.num_bigquery_write_shards,
               null_numeric_value_replacement=(
                   known_args.null_numeric_value_replacement)))
  if known_args.output_avro_path:
    # TODO(bashir2): Add an integration test that outputs to Avro files and
    # also imports to BigQuery. Then import those Avro outputs using the bq
    # tool and verify that the two tables are identical.
    _ = (
        variants
        | 'FlattenToOnePCollection' >> beam.Flatten()
        | 'VariantToAvro' >>
        variant_to_avro.VariantToAvroFiles(
            known_args.output_avro_path,
            header_fields,
            processed_variant_factory,
            variant_merger=variant_merger,
            allow_incompatible_records=known_args.allow_incompatible_records,
            omit_empty_sample_calls=known_args.omit_empty_sample_calls,
            null_numeric_value_replacement=(
                known_args.null_numeric_value_replacement))
    )
  result = pipeline.run()
  result.wait_until_finish()
  metrics_util.log_all_counters(result)