Code example #1
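This function appears to come from the gcp-variant-transforms annotation pipeline. It runs a Beam job that reads the input variants, densifies each one against the combined list of call names, writes the result out as sharded VCF files, and returns a wildcard pattern matching those shards.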
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
   The VCF shards directory.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(
        known_args.all_patterns, p, known_args, pipeline_mode)
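    # Combine the call names across all variants into one list, passed to
    # the downstream transforms as a singleton side input.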
    call_names = (variants
                  | 'CombineCallNames' >>
                  combine_call_names.CallNamesCombiner())
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(call_names))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(call_names),
             known_args.number_of_variants_per_shard))

  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD]
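The pattern worth noting here is Beam's singleton side input: CombineCallNames folds a whole PCollection into one list, which DensifyVariants and WriteToShards then consume via beam.pvalue.AsSingleton. Below is a minimal, self-contained sketch of that pattern on the local runner; the record schema and the format_record helper are hypothetical stand-ins, not part of the original pipeline.

import apache_beam as beam

def format_record(record, sample_names):
    # A side input arrives as a single Python value (here a list),
    # not as a PCollection.
    return '%s\t%s' % (record['id'], ','.join(sample_names))

with beam.Pipeline() as p:
    records = p | 'Create' >> beam.Create([
        {'id': 'rec1', 'sample': 'S1'},
        {'id': 'rec2', 'sample': 'S2'},
    ])
    # Fold the whole PCollection into one value, as CombineCallNames
    # does above.
    sample_names = (records
                    | 'ExtractSample' >> beam.Map(lambda r: r['sample'])
                    | 'ToList' >> beam.combiners.ToList())
    # Hand the folded value to every element via AsSingleton.
    _ = (records
         | 'Format' >> beam.Map(
             format_record, beam.pvalue.AsSingleton(sample_names))
         | 'Print' >> beam.Map(print))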
Code example #2

def test_write_to_shards_pipeline(self):
    with temp_dir.TempDir() as tempdir:
        pipeline = TestPipeline()
        _ = (pipeline
             | Create(self._get_variants())
             | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
                 tempdir.get_path(), ['Sample 1', 'Sample 2']))
        pipeline.run()
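The test above only checks that the pipeline runs without raising; the shard files themselves are not inspected. When a transform's output is a PCollection rather than files on disk, Beam's assert_that and equal_to utilities give a stronger check. A minimal, self-contained sketch of that style, using a plain beam.Map instead of the project's shard writer:

import unittest

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

class UpperCaseTest(unittest.TestCase):
    def test_upper_case(self):
        # As a context manager, TestPipeline runs the pipeline on exit.
        with TestPipeline() as p:
            result = (p
                      | beam.Create(['Sample 1', 'Sample 2'])
                      | beam.Map(str.upper))
            assert_that(result, equal_to(['SAMPLE 1', 'SAMPLE 2']))

if __name__ == '__main__':
    unittest.main()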
Code example #3
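Apparently a later revision of the function from code example #1: call names have been replaced by sample IDs, and _read_variants takes additional keyword arguments.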
def _shard_variants(known_args, pipeline_args, pipeline_mode):
    # type: (argparse.Namespace, List[str], int) -> List[str]
    """Reads the variants and writes them to VCF shards.

  Returns:
   The VCF shards directory.
  """
    options = pipeline_options.PipelineOptions(pipeline_args)
    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    shard_files_job_name = pipeline_common.generate_unique_name(
        _SHARD_VCF_FILES_JOB_NAME)
    _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
    vcf_shards_output_dir = filesystems.FileSystems.join(
        known_args.annotation_output_dir, _SHARDS_FOLDER)
    with beam.Pipeline(options=options) as p:
        variants = _read_variants(known_args.all_patterns,
                                  p,
                                  known_args,
                                  pipeline_mode,
                                  pre_infer_headers=False,
                                  keep_raw_sample_names=True,
                                  use_1_based_coordinate=False)
        sample_ids = (
            variants
            | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner()
            | 'CombineToList' >> beam.combiners.ToList())
        # TODO(tneymanov): Annotation pipeline currently stores sample IDs
        # instead of sample names in the sharded VCF files, which would lead
        # to double hashing of samples. Needs to be fixed ASAP.
        _ = (variants
             | 'DensifyVariants' >> densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(sample_ids))
             | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
                 vcf_shards_output_dir, beam.pvalue.AsSingleton(sample_ids),
                 known_args.number_of_variants_per_shard))

    return [
        vep_runner_util.format_dir_path(vcf_shards_output_dir) +
        _GCS_RECURSIVE_WILDCARD
    ]
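Both versions return a single-element list holding a recursive wildcard pattern over the shards directory. Below is a hedged sketch of how such a pattern can be expanded into concrete file paths with Beam's FileSystems API; the bucket and directory names are hypothetical.

from apache_beam.io import filesystems

# Hypothetical pattern of the kind _shard_variants returns; '**' matches
# recursively, e.g. on GCS.
shard_pattern = 'gs://my-bucket/annotation_output/shards/**'

match_result = filesystems.FileSystems.match([shard_pattern])[0]
shard_paths = [metadata.path for metadata in match_result.metadata_list]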