Ejemplo n.º 1
0
  def test_partition_variants(self):
    """Checks that each variant ends up in its expected partition.

    The expectation map (partition index -> list of variants) is assembled
    from the standard and non-standard fixtures of this test class. All of
    its variants are fed through `beam.Partition` driven by
    `PartitionVariants`, and every output partition is compared against the
    map; indices missing from the map are expected to be empty.
    """
    expected_partitions = self._get_standard_variant_partitions()
    expected_partitions.update(self._get_nonstandard_variant_partitions())
    variants = [variant
                for variant_list in expected_partitions.values()
                for variant in variant_list]

    partitioner = variant_partition.VariantPartition()
    num_partitions = partitioner.get_num_partitions()
    pipeline = TestPipeline()
    partitions = (
        pipeline
        | Create(variants)
        | 'PartitionVariants' >> beam.Partition(
            partition_variants.PartitionVariants(partitioner),
            num_partitions))
    # `range` exists on both Python 2 and 3, whereas `xrange` is Python 2 only.
    for i in range(num_partitions):
      # Each assertion needs its own label to be a distinct pipeline step.
      assert_that(partitions[i], equal_to(expected_partitions.get(i, [])),
                  label=str(i))
    pipeline.run()
Ejemplo n.º 2
0
def run(argv=None):
    # type: (List[str]) -> None
    """Runs VCF to BigQuery pipeline.

    The steps are, in order: parse the flags, optionally run the annotation
    (VEP) pipeline, merge the VCF headers, build the Beam pipeline (read,
    filter, optionally partition, optionally merge, process) and write one
    BigQuery table per remaining partition. The call blocks until the
    pipeline finishes and then logs all counters.

    Args:
      argv: Command line arguments handed to the argument parser. For the
        initial log line only, `sys.argv` is used when this is empty or None.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    # Note VepRunner creates new input files, so it should be run before any
    # other access to known_args.input_pattern.
    if known_args.run_annotation_pipeline:
        runner = vep_runner.create_runner_and_update_args(
            known_args, pipeline_args)
        runner.run_on_all_files()
        runner.wait_until_done()
        logging.info('Using VEP processed files: %s', known_args.input_pattern)

    variant_merger = _get_variant_merge_strategy(known_args)
    pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
        known_args.input_pattern, known_args.optimize_for_large_inputs)

    # Starts a pipeline to merge VCF headers in beam if the total files that
    # match the input pattern exceeds _SMALL_DATA_THRESHOLD
    _merge_headers(known_args, pipeline_args, pipeline_mode)

    # Retrieve merged headers prior to launching the pipeline. This is needed
    # since the BigQuery schema cannot yet be dynamically created based on input.
    # See https://issues.apache.org/jira/browse/BEAM-2801.
    header_fields = vcf_header_parser.get_vcf_headers(
        known_args.representative_header_file)
    counter_factory = metrics_util.CounterFactory()
    processed_variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields, known_args.split_alternate_allele_info_fields,
        known_args.annotation_fields, known_args.use_allele_num,
        known_args.minimal_vep_alt_matching, counter_factory)

    partitioner = None
    if known_args.optimize_for_large_inputs or known_args.partition_config_path:
        partitioner = variant_partition.VariantPartition(
            known_args.partition_config_path)

    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    pipeline = beam.Pipeline(options=beam_pipeline_options)
    variants = _read_variants(pipeline, known_args)
    variants |= 'FilterVariants' >> filter_variants.FilterVariants(
        reference_names=known_args.reference_names)

    # `variants` becomes a list of PCollections from here on, and
    # `partition_indices[k]` holds the partitioner index that `variants[k]`
    # came from. Keeping that mapping means a dropped partition cannot shift
    # the names of the partitions that follow it.
    if partitioner:
        total_partitions = partitioner.get_num_partitions()
        partitioned_variants = (
            variants | 'PartitionVariants' >> beam.Partition(
                partition_variants.PartitionVariants(partitioner),
                total_partitions))
        partition_indices = [i for i in range(total_partitions)
                             if partitioner.should_keep_partition(i)]
        variants = [partitioned_variants[i] for i in partition_indices]
    else:
        # By default we don't partition the data, so we have only 1 partition.
        partition_indices = [0]
        variants = [variants]

    # Step labels use the position in the kept list, which keeps them unique.
    for i in range(len(variants)):
        if variant_merger:
            variants[i] |= ('MergeVariants' + str(i) >>
                            merge_variants.MergeVariants(variant_merger))
        # NOTE: the label spelling is left untouched so that step names do not
        # change.
        variants[i] |= (
            'ProcessVaraints' + str(i) >>
            beam.Map(processed_variant_factory.create_processed_variant).\
                with_output_types(processed_variant.ProcessedVariant))
    if partitioner and partitioner.should_flatten():
        variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
        # The single flattened output is named after index 0.
        partition_indices = [0]

    for partition_index, pcoll in zip(partition_indices, variants):
        table_suffix = ''
        if partitioner:
            partition_name = partitioner.get_partition_name(partition_index)
            if partition_name:
                table_suffix = '_' + partition_name
        table_name = known_args.output_table + table_suffix
        _ = (
            pcoll | 'VariantToBigQuery' + table_suffix >>
            variant_to_bigquery.VariantToBigQuery(
                table_name,
                header_fields,
                variant_merger,
                processed_variant_factory,
                append=known_args.append,
                allow_incompatible_records=known_args.
                allow_incompatible_records,
                omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                num_bigquery_write_shards=known_args.num_bigquery_write_shards)
        )

    result = pipeline.run()
    result.wait_until_finish()

    metrics_util.log_all_counters(result)
Ejemplo n.º 3
0
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline.

  The steps are, in order: parse the flags, optionally run the annotation
  pipeline, merge the VCF headers, build the Beam pipeline (read, filter,
  optionally partition, optionally merge, process) and write the result to
  BigQuery tables (when `output_table` is set) and/or Avro files (when
  `output_avro_path` is set). The call blocks until the pipeline finishes and
  then logs all counters.

  Args:
    argv: Command line arguments handed to the argument parser. For the
      initial log line only, `sys.argv` is used when this is empty or None.
  """
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(argv,
                                                         _COMMAND_LINE_OPTIONS)

  if known_args.auto_flags_experiment:
    _get_input_dimensions(known_args, pipeline_args)

  annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

  # When an annotated pattern was produced it replaces the input patterns.
  all_patterns = (
      [annotated_vcf_pattern] if annotated_vcf_pattern
      else known_args.all_patterns)

  variant_merger = _get_variant_merge_strategy(known_args)

  pipeline_mode = pipeline_common.get_pipeline_mode(
      all_patterns,
      known_args.optimize_for_large_inputs)
  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD
  _merge_headers(known_args, pipeline_args,
                 pipeline_mode, annotated_vcf_pattern)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on input.
  # See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.allow_malformed_records,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      known_args.infer_annotation_types,
      counter_factory)

  partitioner = None
  if ((known_args.optimize_for_large_inputs and variant_merger) or
      known_args.partition_config_path):
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(all_patterns, pipeline, known_args, pipeline_mode)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)

  # `variants` becomes a list of PCollections from here on, and
  # `partition_indices[k]` holds the partitioner index that `variants[k]` came
  # from. Keeping that mapping means a dropped partition cannot shift the
  # names of the partitions that follow it.
  if partitioner:
    total_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), total_partitions)
    partition_indices = [i for i in range(total_partitions)
                         if partitioner.should_keep_partition(i)]
    variants = [partitioned_variants[i] for i in partition_indices]
  else:
    # By default we don't partition the data, so we have only 1 partition.
    partition_indices = [0]
    variants = [variants]

  # Step labels use the position in the kept list, which keeps them unique.
  for i in range(len(variants)):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i) >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant).\
            with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    # The single flattened output is named after index 0.
    partition_indices = [0]

  if known_args.output_table:
    for partition_index, pcoll in zip(partition_indices, variants):
      table_suffix = ''
      if partitioner:
        partition_name = partitioner.get_partition_name(partition_index)
        if partition_name:
          table_suffix = '_' + partition_name
      table_name = known_args.output_table + table_suffix
      _ = (pcoll | 'VariantToBigQuery' + table_suffix >>
           variant_to_bigquery.VariantToBigQuery(
               table_name,
               header_fields,
               variant_merger,
               processed_variant_factory,
               append=known_args.append,
               update_schema_on_append=known_args.update_schema_on_append,
               allow_incompatible_records=known_args.allow_incompatible_records,
               omit_empty_sample_calls=known_args.omit_empty_sample_calls,
               num_bigquery_write_shards=known_args.num_bigquery_write_shards,
               null_numeric_value_replacement=(
                   known_args.null_numeric_value_replacement)))

  if known_args.output_avro_path:
    # TODO(bashir2): Add an integration test that outputs to Avro files and
    # also imports to BigQuery. Then import those Avro outputs using the bq
    # tool and verify that the two tables are identical.
    # All partitions are merged into one PCollection before writing Avro.
    _ = (
        variants | 'FlattenToOnePCollection' >> beam.Flatten()
        | 'VariantToAvro' >>
        variant_to_avro.VariantToAvroFiles(
            known_args.output_avro_path,
            header_fields,
            processed_variant_factory,
            variant_merger=variant_merger,
            allow_incompatible_records=known_args.allow_incompatible_records,
            omit_empty_sample_calls=known_args.omit_empty_sample_calls,
            null_numeric_value_replacement=(
                known_args.null_numeric_value_replacement))
    )

  result = pipeline.run()
  result.wait_until_finish()

  metrics_util.log_all_counters(result)