Example #1
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
    A list with the single file pattern that matches the written VCF shards.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(
        known_args.all_patterns, p, known_args, pipeline_mode)
    call_names = (variants
                  | 'CombineCallNames' >>
                  combine_call_names.CallNamesCombiner())
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(call_names))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(call_names),
             known_args.number_of_variants_per_shard))

  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD]
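
The helper returns a recursive match pattern rather than the bare shards
directory. A minimal sketch of how that pattern is composed, assuming
`format_dir_path` appends a trailing slash and `_GCS_RECURSIVE_WILDCARD` is
the `**` wildcard (both values are assumptions, not taken from the example):

_SHARDS_FOLDER = 'shards'        # assumed folder name
_GCS_RECURSIVE_WILDCARD = '**'   # assumed recursive-match wildcard

def format_dir_path(dir_path):
  # Assumed behavior: ensure the directory path ends with a slash.
  return dir_path if dir_path.endswith('/') else dir_path + '/'

output_dir = 'gs://my-bucket/output/' + _SHARDS_FOLDER
print([format_dir_path(output_dir) + _GCS_RECURSIVE_WILDCARD])
# Prints: ['gs://my-bucket/output/shards/**']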
Example #2
    def test_call_names_combiner_pipeline_duplicate_call_names(self):
        variant_call = vcfio.VariantCall(name='sample1')
        variants = [vcfio.Variant(calls=[variant_call, variant_call])]

        pipeline = TestPipeline()
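        # The same call appears twice in one variant; the duplicate call name
        # is expected to surface as a ValueError when the pipeline runs.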
        _ = (pipeline
             | transforms.Create(variants)
             | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())
        with self.assertRaises(ValueError):
            pipeline.run()
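
The test expects a ValueError because the same call name appears twice within
a single variant. A minimal sketch of how such a check can be expressed with a
`beam.CombineFn`, under assumed semantics (duplicates within one variant are
an error; names shared across variants are simply unioned); this is an
illustration, not the actual `CallNamesCombiner` implementation:

import apache_beam as beam

def _get_call_names(variant):
  # Assumed rule: a single variant must not repeat a call name.
  names = [call.name for call in variant.calls]
  if len(set(names)) != len(names):
    raise ValueError('Duplicate call names in one variant: %s' % names)
  return names

class UniqueCallNamesFn(beam.CombineFn):
  """Unions per-variant call names into one sorted list (illustrative)."""

  def create_accumulator(self):
    return set()

  def add_input(self, accumulator, variant):
    accumulator.update(_get_call_names(variant))
    return accumulator

  def merge_accumulators(self, accumulators):
    return set.union(set(), *accumulators)

  def extract_output(self, accumulator):
    return sorted(accumulator)

Applied as `variants | beam.CombineGlobally(UniqueCallNamesFn())`, this yields
a single sorted list of unique call names, matching the shape asserted in
Example #5 below.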
Example #3
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and then
  writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the call names to
  `vcf_header_file_path`.
  """
    schema = _get_schema(known_args.input_table)
    # TODO(allieychen): Modify the SQL query with the specified call_names.
    query = _get_bigquery_query(known_args, schema)
    logging.info('Processing BigQuery query: %s', query)
    bq_source = bigquery.BigQuerySource(query=query,
                                        validate=True,
                                        use_standard_sql=True)
    annotation_names = _extract_annotation_names(schema)
    with beam.Pipeline(options=beam_pipeline_options) as p:
        variants = (p
                    | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
                    | bigquery_to_variant.BigQueryToVariant(annotation_names))
        if known_args.call_names:
            call_names = (p
                          | transforms.Create(known_args.call_names)
                          | beam.combiners.ToList())
        else:
            call_names = (
                variants
                | 'CombineCallNames' >> combine_call_names.CallNamesCombiner(
                    known_args.preserve_call_names_order))

        _ = (call_names
             | 'GenerateVcfDataHeader' >> beam.ParDo(
                 _write_vcf_header_with_call_names, _VCF_FIXED_COLUMNS,
                 known_args.representative_header_file, header_file_path))

        _ = (variants
             | densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(call_names))
             | 'PairVariantWithKey' >> beam.Map(
                 _pair_variant_with_key, known_args.number_of_bases_per_shard)
             | 'GroupVariantsByKey' >> beam.GroupByKey()
             | beam.ParDo(_get_file_path_and_sorted_variants,
                          vcf_data_temp_folder)
             | vcfio.WriteVcfDataLines())
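
Here `_pair_variant_with_key` and `_get_file_path_and_sorted_variants` are
module helpers that the example does not show. A hypothetical sketch of the
keying step, assuming variants expose `reference_name` and `start` fields:

def _pair_variant_with_key(variant, number_of_bases_per_shard):
  # Hypothetical: variants on the same reference whose start positions fall
  # into the same window of `number_of_bases_per_shard` bases share a key, so
  # the subsequent GroupByKey collects one shard's variants together.
  return ((variant.reference_name,
           variant.start // number_of_bases_per_shard),
          variant)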
Example #4
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        vcf_data_header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and then
  writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the data header to `vcf_data_header_file_path`.
  TODO(allieychen): Eventually, it also generates the meta information file.
  """
    bq_source = bigquery.BigQuerySource(
        query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE='.'.join(
            bigquery_util.parse_table_reference(known_args.input_table))),
        validate=True,
        use_standard_sql=True)

    with beam.Pipeline(options=beam_pipeline_options) as p:
        variants = (p
                    | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
                    | bigquery_to_variant.BigQueryToVariant())
        call_names = (
            variants
            | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())

        _ = (call_names
             | 'GenerateVcfDataHeader' >> beam.ParDo(
                 _write_vcf_data_header, _VCF_FIXED_COLUMNS,
                 vcf_data_header_file_path))

        _ = (variants
             | densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(call_names))
             | 'PairVariantWithKey' >> beam.Map(
                 _pair_variant_with_key, known_args.number_of_bases_per_shard)
             | 'GroupVariantsByKey' >> beam.GroupByKey()
             | beam.ParDo(_get_file_path_and_sorted_variants,
                          vcf_data_temp_folder)
             | vcfio.WriteVcfDataLines())
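
`_write_vcf_data_header` is likewise not shown above. A hypothetical sketch,
assuming the usual VCF fixed columns and Beam's `FileSystems` API; the column
list and the helper body are assumptions, not the module's actual code:

from apache_beam.io import filesystems

_VCF_FIXED_COLUMNS = ['#CHROM', 'POS', 'ID', 'REF', 'ALT',
                      'QUAL', 'FILTER', 'INFO', 'FORMAT']  # assumed value

def _write_vcf_data_header(call_names, fixed_columns, file_path):
  # The data header is the fixed VCF columns followed by one column per call.
  header = '\t'.join(fixed_columns + list(call_names)) + '\n'
  with filesystems.FileSystems.create(file_path) as file_to_write:
    file_to_write.write(header.encode('utf-8'))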
Example #5
    def test_call_names_combiner_pipeline(self):
        call_names = ['sample1', 'sample2', 'sample3']
        variant_calls = [
            vcfio.VariantCall(name=call_names[0]),
            vcfio.VariantCall(name=call_names[1]),
            vcfio.VariantCall(name=call_names[2])
        ]
        variants = [
            vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
            vcfio.Variant(calls=[variant_calls[1], variant_calls[2]])
        ]

        pipeline = TestPipeline()
        combined_call_names = (
            pipeline
            | transforms.Create(variants)
            | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())
        assert_that(combined_call_names, equal_to([call_names]))
        pipeline.run()
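
For reference, the tests in these examples rely on Beam's testing utilities;
a minimal set of imports they assume:

from apache_beam import transforms
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

and, assuming these snippets come from the gcp-variant-transforms project
layout:

from gcp_variant_transforms.beam_io import vcfio
from gcp_variant_transforms.transforms import combine_call_names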
Example #6
    def test_call_names_combiner_pipeline_preserve_call_names_order_error(
            self):
        call_names = ['sample1', 'sample2', 'sample3']
        variant_calls = [
            vcfio.VariantCall(name=call_names[0]),
            vcfio.VariantCall(name=call_names[1]),
            vcfio.VariantCall(name=call_names[2])
        ]
        variants = [
            vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
            vcfio.Variant(calls=[variant_calls[1], variant_calls[2]])
        ]

        pipeline = TestPipeline()
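        # The two variants neither share the same call names nor list them in
        # a single consistent order, so no order-preserving result can exist.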
        _ = (pipeline
             | transforms.Create(variants)
             | 'CombineCallNames' >> combine_call_names.CallNamesCombiner(
                 preserve_call_names_order=True))
        with self.assertRaises(ValueError):
            pipeline.run()