def test_default_optimize_for_large_inputs(self):
    """Checks that 101 matched files yield the MEDIUM pipeline mode.

    ``FileSystems.match`` is patched to return a single match result whose
    ``metadata_list`` has 101 entries, and ``get_pipeline_mode`` is called
    with the input pattern only.
    """
    mock_args = self._create_mock_args(input_pattern='')
    fake_match_type = collections.namedtuple(
        'MatchResult', ['metadata_list'])
    fake_match = fake_match_type(range(101))
    patched_match = mock.patch.object(
        FileSystems, 'match', return_value=[fake_match])
    with patched_match:
        actual_mode = vcf_to_bq_common.get_pipeline_mode(
            mock_args.input_pattern)
        self.assertEqual(actual_mode, PipelineModes.MEDIUM)
def run(argv=None):
    # type: (List[str]) -> None
    """Runs preprocess pipeline.

    Parses the command line, reads and merges the VCF headers, and generates
    a conflicts report at ``known_args.report_path``. When
    ``known_args.report_all_conflicts`` is set, the variants are additionally
    read with malformed records allowed, and the inferred headers and the
    malformed records are handed to the report as extra side inputs. When
    ``known_args.resolved_headers_path`` is set, the merged headers are
    written to that path.

    Args:
      argv: Command-line arguments passed to ``vcf_to_bq_common.parse_args``.
        When falsy, ``sys.argv`` is what gets logged as the command.

    Returns:
      None. The function is run for the effects of the Beam pipeline.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
        known_args.input_pattern)

    with beam.Pipeline(options=options) as p:
        headers = vcf_to_bq_common.read_headers(p, pipeline_mode, known_args)
        merged_headers = vcf_to_bq_common.get_merged_headers(headers)
        merged_definitions = (
            headers
            | 'MergeDefinitions' >>
            merge_header_definitions.MergeDefinitions())

        # Side inputs that are only available when every conflict is
        # reported; they follow the merged headers in the ParDo arguments.
        extra_side_inputs = []
        if known_args.report_all_conflicts:
            variants = p | 'ReadFromVcf' >> vcfio.ReadFromVcf(
                known_args.input_pattern, allow_malformed_records=True)
            malformed_records = (
                variants | filter_variants.ExtractMalformedVariants())
            # merged_headers is rebound here, so the side input below must be
            # created only after this call.
            inferred_headers, merged_headers = _get_inferred_headers(
                variants, merged_headers)
            extra_side_inputs = [
                beam.pvalue.AsSingleton(inferred_headers),
                beam.pvalue.AsIter(malformed_records),
            ]

        _ = (merged_definitions
             | 'GenerateConflictsReport' >> beam.ParDo(
                 preprocess_reporter.generate_report,
                 known_args.report_path,
                 beam.pvalue.AsSingleton(merged_headers),
                 *extra_side_inputs))

        if known_args.resolved_headers_path:
            vcf_to_bq_common.write_headers(
                merged_headers, known_args.resolved_headers_path)
def run(argv=None):
    # type: (List[str]) -> None
    """Runs VCF to BigQuery pipeline.

    Steps, in order: optionally run the annotation (VEP) runner, merge the
    headers, read and filter the variants, optionally partition them,
    optionally merge them, convert them to processed variants, and write each
    resulting collection to BigQuery. The call blocks until the pipeline
    finishes and then logs all counters.

    Args:
      argv: Command-line arguments passed to ``vcf_to_bq_common.parse_args``.
        When falsy, ``sys.argv`` is what gets logged as the command.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)

    # VepRunner creates new input files, so it has to run before anything
    # else touches known_args.input_pattern.
    if known_args.run_annotation_pipeline:
        annotation_runner = vep_runner.create_runner_and_update_args(
            known_args, pipeline_args)
        annotation_runner.run_on_all_files()
        annotation_runner.wait_until_done()
        logging.info('Using VEP processed files: %s',
                     known_args.input_pattern)

    variant_merger = _get_variant_merge_strategy(known_args)
    pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
        known_args.input_pattern, known_args.optimize_for_large_inputs)

    # A separate Beam pipeline merges the VCF headers when the number of
    # files matching the input pattern exceeds _SMALL_DATA_THRESHOLD.
    _merge_headers(known_args, pipeline_args, pipeline_mode)

    # The merged headers are retrieved before the main pipeline is launched
    # because the BigQuery schema cannot yet be created dynamically from the
    # input. See https://issues.apache.org/jira/browse/BEAM-2801.
    header_fields = vcf_header_parser.get_vcf_headers(
        known_args.representative_header_file)
    counter_factory = metrics_util.CounterFactory()
    processed_variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        known_args.split_alternate_allele_info_fields,
        known_args.annotation_fields,
        known_args.use_allele_num,
        known_args.minimal_vep_alt_matching,
        counter_factory)

    partitioner = None
    if (known_args.optimize_for_large_inputs or
            known_args.partition_config_path):
        partitioner = variant_partition.VariantPartition(
            known_args.partition_config_path)

    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    pipeline = beam.Pipeline(options=beam_pipeline_options)
    variants = _read_variants(pipeline, known_args)
    variants |= 'FilterVariants' >> filter_variants.FilterVariants(
        reference_names=known_args.reference_names)

    if partitioner:
        total_partitions = partitioner.get_num_partitions()
        partitioned_variants = (
            variants
            | 'PartitionVariants' >> beam.Partition(
                partition_variants.PartitionVariants(partitioner),
                total_partitions))
        # Only the partitions the partitioner wants to keep go forward.
        variants = [partitioned_variants[index]
                    for index in range(total_partitions)
                    if partitioner.should_keep_partition(index)]
    else:
        # By default the data is not partitioned, so there is exactly one
        # collection to process.
        variants = [variants]

    for index, stage in enumerate(variants):
        if variant_merger:
            stage |= ('MergeVariants' + str(index) >>
                      merge_variants.MergeVariants(variant_merger))
        stage |= (
            'ProcessVaraints' + str(index) >>
            beam.Map(processed_variant_factory.create_processed_variant)
            .with_output_types(processed_variant.ProcessedVariant))
        variants[index] = stage

    if partitioner and partitioner.should_flatten():
        variants = [variants | 'FlattenPartitions' >> beam.Flatten()]

    # NOTE(review): `index` below is a position in the filtered list, while
    # should_keep_partition() above received the original partition index.
    # The two agree only if dropped partitions come after all kept ones --
    # confirm against VariantPartition.
    for index, stage in enumerate(variants):
        table_suffix = (
            '_' + partitioner.get_partition_name(index)
            if partitioner and partitioner.get_partition_name(index)
            else '')
        table_name = known_args.output_table + table_suffix
        step_label = 'VariantToBigQuery' + table_suffix
        bigquery_sink = variant_to_bigquery.VariantToBigQuery(
            table_name,
            header_fields,
            variant_merger,
            processed_variant_factory,
            append=known_args.append,
            allow_incompatible_records=known_args.allow_incompatible_records,
            omit_empty_sample_calls=known_args.omit_empty_sample_calls,
            num_bigquery_write_shards=known_args.num_bigquery_write_shards)
        _ = stage | step_label >> bigquery_sink

    pipeline_result = pipeline.run()
    pipeline_result.wait_until_finish()
    metrics_util.log_all_counters(pipeline_result)
def _get_pipeline_mode(self, args):
    """Returns the pipeline mode computed for ``args``.

    Delegates to ``vcf_to_bq_common.get_pipeline_mode`` with the
    ``input_pattern`` and ``optimize_for_large_inputs`` attributes of
    ``args``.
    """
    input_pattern = args.input_pattern
    optimize_for_large_inputs = args.optimize_for_large_inputs
    return vcf_to_bq_common.get_pipeline_mode(
        input_pattern, optimize_for_large_inputs)