def test_sample_ids_combiner_pipeline_preserve_sample_order(self):
    """Checks the combiner output when `preserve_sample_order=True`.

    Both variants carry the same three calls in the same, deliberately
    non-alphabetical order (sample2, sample1, sample3). The pipeline output
    is expected to be a single list holding the sample IDs in that order.
    """
    ordered_names = ('sample2', 'sample1', 'sample3')
    expected_ids = [hash_name(name) for name in ordered_names]
    calls = [
        vcfio.VariantCall(sample_id=sample_id) for sample_id in expected_ids
    ]
    # Each variant gets its own list object wrapping the same call instances.
    variants = [vcfio.Variant(calls=list(calls)) for _ in range(2)]

    pipeline = TestPipeline()
    created = pipeline | transforms.Create(variants)
    combined = (
        created
        | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
            preserve_sample_order=True))
    combined_sample_ids = combined | combiners.ToList()

    assert_that(combined_sample_ids, equal_to([expected_ids]))
    pipeline.run()
def test_sample_ids_combiner_pipeline_duplicate_sample_ids(self):
    """Checks that a variant listing the same call twice fails the pipeline.

    The single input variant holds the same `VariantCall` object twice, so
    its sample ID is duplicated; running the pipeline is expected to raise
    `ValueError`.
    """
    duplicated_call = vcfio.VariantCall(sample_id=hash_name('sample1'))
    variant = vcfio.Variant(calls=[duplicated_call] * 2)

    pipeline = TestPipeline()
    created = pipeline | transforms.Create([variant])
    combined = (
        created
        | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner())
    _ = combined | combiners.ToList()

    # The error only surfaces once the pipeline is actually executed.
    self.assertRaises(ValueError, pipeline.run)
def _shard_variants(known_args, pipeline_args, pipeline_mode):
    # type: (argparse.Namespace, List[str], int) -> List[str]
    """Runs a Beam pipeline that reads the variants and writes VCF shards.

    The shards are written under `_SHARDS_FOLDER` inside
    `known_args.annotation_output_dir`. The Google Cloud job name of the
    pipeline options is updated with a unique name derived from
    `_SHARD_VCF_FILES_JOB_NAME` before the pipeline is built.

    Args:
      known_args: Parsed arguments; `annotation_output_dir`, `all_patterns`
        and `number_of_variants_per_shard` are read here, and the whole
        namespace is forwarded to `_read_variants`.
      pipeline_args: Arguments used to build the Beam `PipelineOptions`.
      pipeline_mode: Forwarded unchanged to `_read_variants`.

    Returns:
      A one-element list: the shards directory, formatted by
      `vep_runner_util.format_dir_path`, with `_GCS_RECURSIVE_WILDCARD`
      appended.
    """
    options = pipeline_options.PipelineOptions(pipeline_args)
    _update_google_cloud_job_name(
        options.view_as(pipeline_options.GoogleCloudOptions),
        pipeline_common.generate_unique_name(_SHARD_VCF_FILES_JOB_NAME))
    shards_dir = filesystems.FileSystems.join(
        known_args.annotation_output_dir, _SHARDS_FOLDER)

    with beam.Pipeline(options=options) as pipeline:
        variants = _read_variants(
            known_args.all_patterns,
            pipeline,
            known_args,
            pipeline_mode,
            pre_infer_headers=False,
            keep_raw_sample_names=True,
            use_1_based_coordinate=False)
        combined_ids = (
            variants
            | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner())
        sample_ids = combined_ids | 'CombineToList' >> beam.combiners.ToList()

        # TODO(tneymanov): Annotation pipeline currently stores sample IDs
        # instead of sample names in the sharded VCF files, which would lead
        # to double hashing of samples. Needs to be fixed ASAP.
        densified = (
            variants
            | 'DensifyVariants' >> densify_variants.DensifyVariants(
                beam.pvalue.AsSingleton(sample_ids)))
        _ = (
            densified
            | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
                shards_dir,
                beam.pvalue.AsSingleton(sample_ids),
                known_args.number_of_variants_per_shard))

    shards_pattern = (
        vep_runner_util.format_dir_path(shards_dir) + _GCS_RECURSIVE_WILDCARD)
    return [shards_pattern]
def test_sample_ids_combiner_pipeline_preserve_sample_order_error(self):
    """Checks that differing call lists fail with `preserve_sample_order`.

    The first variant holds the calls for (sample1, sample2) and the second
    holds (sample2, sample3). With `preserve_sample_order=True`, running the
    pipeline is expected to raise `ValueError`.
    """
    sample_ids = [
        hash_name(name) for name in ('sample1', 'sample2', 'sample3')
    ]
    calls = [
        vcfio.VariantCall(sample_id=sample_id) for sample_id in sample_ids
    ]
    # Overlapping but different call lists: [0, 1] versus [1, 2].
    variants = [
        vcfio.Variant(calls=calls[:2]),
        vcfio.Variant(calls=calls[1:]),
    ]

    pipeline = TestPipeline()
    created = pipeline | transforms.Create(variants)
    combined = (
        created
        | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
            preserve_sample_order=True))
    _ = combined | combiners.ToList()

    # The error only surfaces once the pipeline is actually executed.
    self.assertRaises(ValueError, pipeline.run)
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        header_file_path,  # type: str
    ):
    # type: (...) -> None
    """Runs the BigQuery to VCF shards pipeline.

    Reads the variants from the BigQuery table `known_args.input_table`,
    groups a collection of variants within a contiguous region of the genome
    (the size of the collection is adjustable through flag
    `--number_of_bases_per_shard`), sorts them, and then writes each group to
    one VCF file. All VCF data files are saved in `vcf_data_temp_folder`.
    Also, it writes the meta info and data header with the sample names to
    `header_file_path`.

    Args:
      known_args: Parsed arguments. Reads `input_table`, `sample_names`,
        `preserve_sample_order`, `representative_header_file`,
        `number_of_bases_per_shard` and `bq_uses_1_based_coordinate`.
      beam_pipeline_options: Options used to construct the Beam pipeline.
      vcf_data_temp_folder: Folder passed to
        `_get_file_path_and_sorted_variants` for the VCF data files.
      header_file_path: Path passed to `_write_vcf_header_with_sample_names`.
    """
    schema = _get_schema(known_args.input_table)
    variant_query = _get_variant_query(known_args, schema)
    logging.info('Processing BigQuery query %s:', variant_query)

    table_reference = bigquery_util.parse_table_reference(
        known_args.input_table)
    project_id, dataset_id, table_id = table_reference
    variant_source = bigquery.BigQuerySource(
        query=variant_query, validate=True, use_standard_sql=True)
    annotation_names = _extract_annotation_names(schema)

    # The sample info lives in a sibling table named after the base table.
    sample_table_name = bigquery_util.compose_table_name(
        bigquery_util.get_table_base_name(table_id), SAMPLE_INFO_TABLE_SUFFIX)
    sample_query = _SAMPLE_INFO_QUERY_TEMPLATE.format(
        PROJECT_ID=project_id,
        DATASET_ID=dataset_id,
        TABLE_NAME=sample_table_name)
    sample_source = bigquery.BigQuerySource(
        query=sample_query, validate=True, use_standard_sql=True)

    with beam.Pipeline(options=beam_pipeline_options) as pipeline:
        variant_rows = (
            pipeline | 'ReadFromBigQuery ' >> beam.io.Read(variant_source))
        variants = (
            variant_rows
            | bigquery_to_variant.BigQueryToVariant(annotation_names))
        sample_table_rows = (
            pipeline | 'ReadFromSampleTable' >> beam.io.Read(sample_source))

        if known_args.sample_names:
            # Explicit sample names were supplied; use them as given.
            temp_sample_names = (
                pipeline
                | transforms.Create(known_args.sample_names, reshuffle=False))
        else:
            # Get sample names from sample IDs in the variants and sort.
            # NOTE(review): the names are passed through `sorted` even when
            # `preserve_sample_order` is set -- confirm this is intended.
            id_to_name_hash_table = (
                sample_table_rows
                | 'SampleIdToNameDict' >>
                sample_mapping_table.SampleIdToNameDict())
            combined_sample_ids = (
                variants
                | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
                    known_args.preserve_sample_order))
            mapped_sample_names = (
                combined_sample_ids
                | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
                    beam.pvalue.AsSingleton(id_to_name_hash_table)))
            sample_names_list = (
                mapped_sample_names
                | 'CombineToList' >> beam.combiners.ToList())
            temp_sample_names = (
                sample_names_list | 'SortSampleNames' >> beam.ParDo(sorted))

        # Map the chosen names back to IDs for densifying the variants.
        name_to_id_hash_table = (
            sample_table_rows
            | 'SampleNameToIdDict' >> sample_mapping_table.SampleNameToIdDict())
        mapped_sample_ids = (
            temp_sample_names
            | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
                beam.pvalue.AsSingleton(name_to_id_hash_table)))
        sample_ids = (
            mapped_sample_ids
            | 'CombineSortedSampleIds' >> beam.combiners.ToList())
        sample_names = temp_sample_names | beam.combiners.ToList()

        _ = (
            sample_names
            | 'GenerateVcfDataHeader' >> beam.ParDo(
                _write_vcf_header_with_sample_names,
                _VCF_FIXED_COLUMNS,
                known_args.representative_header_file,
                header_file_path))

        densified_variants = (
            variants
            | densify_variants.DensifyVariants(
                beam.pvalue.AsSingleton(sample_ids)))
        keyed_variants = (
            densified_variants
            | 'PairVariantWithKey' >> beam.Map(
                _pair_variant_with_key, known_args.number_of_bases_per_shard))
        grouped_variants = (
            keyed_variants | 'GroupVariantsByKey' >> beam.GroupByKey())
        sorted_shards = (
            grouped_variants
            | beam.ParDo(
                _get_file_path_and_sorted_variants, vcf_data_temp_folder))
        _ = (
            sorted_shards
            | vcfio.WriteVcfDataLines(known_args.bq_uses_1_based_coordinate))