def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
    A list containing the file pattern that matches all written VCF shards.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(
        known_args.all_patterns, p, known_args, pipeline_mode)
    call_names = (variants
                  | 'CombineCallNames' >>
                  combine_call_names.CallNamesCombiner())
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(call_names))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(call_names),
             known_args.number_of_variants_per_shard))

  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD]
def test_write_to_shards_pipeline(self):
  with temp_dir.TempDir() as tempdir:
    pipeline = TestPipeline()
    _ = (pipeline
         | Create(self._get_variants())
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             tempdir.get_path(),
             ['Sample 1', 'Sample 2']))
    pipeline.run()
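# Sketch of an optional follow-up check for the test above: confirm that
# WriteToShards actually produced output files. This assumes shard files are
# written directly under the supplied output directory; the exact shard file
# names are not specified here, so we only assert the directory is non-empty.
import os

def _assert_shards_written(output_dir):
  # List whatever WriteToShards left under output_dir (naming is assumed).
  written = os.listdir(output_dir)
  assert written, 'expected at least one VCF shard under %s' % output_dir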
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
    A list containing the file pattern that matches all written VCF shards.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(known_args.all_patterns,
                              p,
                              known_args,
                              pipeline_mode,
                              pre_infer_headers=False,
                              keep_raw_sample_names=True,
                              use_1_based_coordinate=False)
    sample_ids = (
        variants
        | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner()
        | 'CombineToList' >> beam.combiners.ToList())
    # TODO(tneymanov): The annotation pipeline currently stores sample IDs
    # instead of sample names in the sharded VCF files, which would lead to
    # double hashing of samples. Needs to be fixed ASAP.
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(sample_ids),
             known_args.number_of_variants_per_shard))

  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD]
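# Illustrative sketch (not part of the pipeline above): how a caller might
# expand the shard pattern returned by _shard_variants into concrete file
# paths using Beam's FileSystems.match API. The helper name and variables
# below are hypothetical.
from apache_beam.io import filesystems

def _list_vcf_shards(shard_patterns):
  # FileSystems.match returns one MatchResult per input pattern; each holds
  # FileMetadata entries for the files that matched that pattern.
  match_results = filesystems.FileSystems.match(shard_patterns)
  return [metadata.path
          for result in match_results
          for metadata in result.metadata_list]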