def test_default_optimize_for_large_inputs(self):
        """Checks the pipeline mode chosen when only the input pattern is given.

        `FileSystems.match` is patched to report a single match result whose
        `metadata_list` holds 101 entries; `get_pipeline_mode` is called with
        the input pattern alone and is expected to return
        `PipelineModes.MEDIUM`.
        """
        mock_args = self._create_mock_args(input_pattern='')
        fake_match_type = collections.namedtuple(
            'MatchResult', ['metadata_list'])

        # Only the number of entries matters here, so a plain range stands in
        # for the list of file metadata.
        fake_match = fake_match_type(range(101))
        match_patcher = mock.patch.object(
            FileSystems, 'match', return_value=[fake_match])
        with match_patcher:
            selected_mode = vcf_to_bq_common.get_pipeline_mode(
                mock_args.input_pattern)
            self.assertEqual(selected_mode, PipelineModes.MEDIUM)
def run(argv=None):
    # type: (List[str]) -> None
    """Runs preprocess pipeline.

    Parses the command line, reads and merges the VCF headers and emits a
    conflicts report to `known_args.report_path`. When
    `known_args.report_all_conflicts` is set, the variants themselves are also
    read (tolerating malformed records) so that inferred headers and malformed
    records can be passed to the report. If `known_args.resolved_headers_path`
    is set, the merged headers are written there as well.

    Args:
      argv: Command line arguments; `sys.argv` is only used for the log
        message when this is empty or None.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
        known_args.input_pattern)

    with beam.Pipeline(options=options) as p:
        headers = vcf_to_bq_common.read_headers(p, pipeline_mode, known_args)
        merged_headers = vcf_to_bq_common.get_merged_headers(headers)
        merge_definitions_step = (
            'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
        merged_definitions = headers | merge_definitions_step

        if known_args.report_all_conflicts:
            read_step = 'ReadFromVcf' >> vcfio.ReadFromVcf(
                known_args.input_pattern, allow_malformed_records=True)
            variants = p | read_step
            malformed_records = (
                variants | filter_variants.ExtractMalformedVariants())
            # Note that merged_headers is rebound here, so the side inputs
            # below must be built after this call.
            inferred_headers, merged_headers = _get_inferred_headers(
                variants, merged_headers)
            report_side_inputs = (
                beam.pvalue.AsSingleton(merged_headers),
                beam.pvalue.AsSingleton(inferred_headers),
                beam.pvalue.AsIter(malformed_records),
            )
        else:
            report_side_inputs = (beam.pvalue.AsSingleton(merged_headers),)

        # Both branches produce the same report step; they differ only in the
        # side inputs handed to the report generator.
        _ = (merged_definitions
             | 'GenerateConflictsReport' >> beam.ParDo(
                 preprocess_reporter.generate_report,
                 known_args.report_path,
                 *report_side_inputs))

        if known_args.resolved_headers_path:
            vcf_to_bq_common.write_headers(merged_headers,
                                           known_args.resolved_headers_path)
Exemple #3
0
def run(argv=None):
    # type: (List[str]) -> None
    """Runs VCF to BigQuery pipeline.

    Parses the command line, optionally runs the annotation (VEP) step, merges
    headers, then builds and runs a Beam pipeline that reads variants, filters
    them, optionally partitions and merges them, converts them to processed
    variants and writes one BigQuery table per remaining partition. Blocks
    until the pipeline finishes and then logs all counters.

    Args:
      argv: Command line arguments; `sys.argv` is only used for the log
        message when this is empty or None.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    # Note VepRunner creates new input files, so it should be run before any
    # other access to known_args.input_pattern.
    if known_args.run_annotation_pipeline:
        runner = vep_runner.create_runner_and_update_args(
            known_args, pipeline_args)
        runner.run_on_all_files()
        runner.wait_until_done()
        logging.info('Using VEP processed files: %s', known_args.input_pattern)

    variant_merger = _get_variant_merge_strategy(known_args)
    pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
        known_args.input_pattern, known_args.optimize_for_large_inputs)

    # Starts a pipeline to merge VCF headers in beam if the total files that
    # match the input pattern exceeds _SMALL_DATA_THRESHOLD
    _merge_headers(known_args, pipeline_args, pipeline_mode)

    # Retrieve merged headers prior to launching the pipeline. This is needed
    # since the BigQuery schema cannot yet be dynamically created based on input.
    # See https://issues.apache.org/jira/browse/BEAM-2801.
    header_fields = vcf_header_parser.get_vcf_headers(
        known_args.representative_header_file)
    counter_factory = metrics_util.CounterFactory()
    processed_variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields, known_args.split_alternate_allele_info_fields,
        known_args.annotation_fields, known_args.use_allele_num,
        known_args.minimal_vep_alt_matching, counter_factory)

    partitioner = None
    if known_args.optimize_for_large_inputs or known_args.partition_config_path:
        partitioner = variant_partition.VariantPartition(
            known_args.partition_config_path)

    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    pipeline = beam.Pipeline(options=beam_pipeline_options)
    variants = _read_variants(pipeline, known_args)
    variants |= 'FilterVariants' >> filter_variants.FilterVariants(
        reference_names=known_args.reference_names)
    if partitioner:
        total_partitions = partitioner.get_num_partitions()
        partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
            partition_variants.PartitionVariants(partitioner),
            total_partitions)
        # Keep the partitioner's own index for every retained partition.
        # Dropping a partition shifts list positions, so the position in
        # `variants` must not be used to look up partition names later on.
        partition_indices = [
            index for index in range(total_partitions)
            if partitioner.should_keep_partition(index)
        ]
        variants = [partitioned_variants[index] for index in partition_indices]
    else:
        # By default we don't partition the data, so we have only 1 partition.
        partition_indices = [0]
        variants = [variants]

    for i in range(len(variants)):
        # Step labels use the position in the list (not the partitioner index)
        # and are kept exactly as before, since they are runtime step names.
        if variant_merger:
            variants[i] |= ('MergeVariants' + str(i) >>
                            merge_variants.MergeVariants(variant_merger))
        variants[i] |= (
            'ProcessVaraints' + str(i) >>
            beam.Map(processed_variant_factory.create_processed_variant)
            .with_output_types(processed_variant.ProcessedVariant))
    if partitioner and partitioner.should_flatten():
        variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
        # A single flattened output is named after partition 0, as before.
        partition_indices = [0]

    for partition_index, partition_pcoll in zip(partition_indices, variants):
        table_suffix = ''
        partition_name = (
            partitioner.get_partition_name(partition_index)
            if partitioner else None)
        if partition_name:
            table_suffix = '_' + partition_name
        table_name = known_args.output_table + table_suffix
        _ = (
            partition_pcoll | 'VariantToBigQuery' + table_suffix >>
            variant_to_bigquery.VariantToBigQuery(
                table_name,
                header_fields,
                variant_merger,
                processed_variant_factory,
                append=known_args.append,
                allow_incompatible_records=(
                    known_args.allow_incompatible_records),
                omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                num_bigquery_write_shards=known_args.num_bigquery_write_shards)
        )

    result = pipeline.run()
    result.wait_until_finish()

    metrics_util.log_all_counters(result)
 def _get_pipeline_mode(self, args):
     """Returns the pipeline mode for the given parsed arguments.

     Forwards `args.input_pattern` and `args.optimize_for_large_inputs` to
     `vcf_to_bq_common.get_pipeline_mode` and returns its result unchanged.
     """
     input_pattern = args.input_pattern
     optimize_for_large_inputs = args.optimize_for_large_inputs
     return vcf_to_bq_common.get_pipeline_mode(
         input_pattern, optimize_for_large_inputs)