def _add_inferred_headers(all_patterns,  # type: List[str]
                          pipeline,  # type: beam.Pipeline
                          known_args,  # type: argparse.Namespace
                          merged_header,  # type: pvalue.PCollection
                          pipeline_mode  # type: int
                         ):
  # type: (...) -> pvalue.PCollection
  """Infers header fields from variants and merges them into `merged_header`.

  Reads the variants for `all_patterns`, filters them by the requested
  reference names, infers header fields (and annotation field types, when
  `known_args.infer_annotation_types` is set), then flattens the inferred
  headers with `merged_header` and merges the result.

  Returns:
    A PCollection holding the merged representative header.
  """
  # Annotation type inference is opt-in; with it off, no annotation
  # fields are passed to the inference transform.
  if known_args.infer_annotation_types:
    fields_to_infer = known_args.annotation_fields
  else:
    fields_to_infer = []
  variants = _read_variants(all_patterns, pipeline, known_args, pipeline_mode)
  inferred_headers = (
      variants
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
          pvalue.AsSingleton(merged_header),
          known_args.allow_incompatible_records,
          known_args.infer_headers,
          fields_to_infer))
  combined_header = (
      (inferred_headers, merged_header)
      | 'FlattenHeaders' >> beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return combined_header
def expand(self, pcoll):
  """Infers undefined header fields from variants and merges them.

  Applies `_InferUndefinedHeaderFields` (with the already-defined headers
  as a side input) to each element of `pcoll`, then merges the per-element
  inferred headers into a single header via `MergeHeaders`.
  """
  return (pcoll
          | 'InferUndefinedHeaderFields' >> beam.ParDo(
              _InferUndefinedHeaderFields(), self._defined_headers)
          # TODO(nmousavi): Modify the MergeHeaders to resolve 1 vs '.'
          # mismatch for headers extracted from variants.
          #
          # Note: argument `split_alternate_allele_info_fields` is not
          # relevant here since no fields with `Number=A` will be extracted
          # from variants, therefore we let the default value (True) for it
          # be used. Should this change, we should modify the default value.
          | 'MergeHeaders' >> merge_headers.MergeHeaders(
              split_alternate_allele_info_fields=True))
def _add_inferred_headers(pipeline, known_args, merged_header):
  """Infers undefined headers from variants and merges them in.

  Reads the variants, filters them by the requested reference names, infers
  undefined header fields using the current `merged_header` as a side input,
  then flattens and merges the inferred headers with `merged_header`.

  Args:
    pipeline: The beam pipeline to attach the read/infer transforms to.
    known_args: Parsed command-line arguments.
    merged_header: PCollection with the headers merged so far.

  Returns:
    A PCollection holding the merged representative header.
  """
  inferred_headers = (
      _read_variants(pipeline, known_args)
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      # Fixed: the step label used to start with a stray space
      # (' InferUndefinedHeaderFields'), producing an oddly named step.
      | 'InferUndefinedHeaderFields' >>
      infer_undefined_headers.InferUndefinedHeaderFields(
          beam.pvalue.AsSingleton(merged_header)))
  merged_header = (
      (inferred_headers, merged_header)
      # Named for consistency with the other header-merging pipelines.
      | 'FlattenHeaders' >> beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields))
  return merged_header
def _get_inferred_headers(variants,  # type: pvalue.PCollection
                          merged_header  # type: pvalue.PCollection
                         ):
  # type: (...) -> (pvalue.PCollection, pvalue.PCollection)
  """Infers undefined headers from `variants` and merges them in.

  Args:
    variants: PCollection of variants to infer header fields from.
    merged_header: PCollection with the headers merged so far; used as a
      side input so only fields undefined there are inferred.

  Returns:
    A tuple of (inferred headers PCollection, updated merged header
    PCollection).
  """
  inferred_headers = (
      variants
      | 'FilterVariants' >> filter_variants.FilterVariants()
      # Fixed: the step label used to start with a stray space
      # (' InferUndefinedHeaderFields'), producing an oddly named step.
      | 'InferUndefinedHeaderFields' >>
      infer_undefined_headers.InferUndefinedHeaderFields(
          pvalue.AsSingleton(merged_header)))
  merged_header = (
      (inferred_headers, merged_header)
      # Named for consistency with the other header-merging pipelines.
      | 'FlattenHeaders' >> beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          allow_incompatible_records=True))
  return inferred_headers, merged_header
def get_merged_headers(headers,
                       split_alternate_allele_info_fields=True,
                       allow_incompatible_records=True):
  # type: (pvalue.PCollection, bool, bool) -> pvalue.PCollection
  """Merges a PCollection of ``VcfHeader`` into one representative header.

  Args:
    headers: The VCF headers.
    split_alternate_allele_info_fields: If true, the INFO fields with
      `Number=A` in BigQuery schema is not repeated. This is relevant as it
      changes the header compatibility rules.
    allow_incompatible_records: If true, always resolve the conflicts when
      merging headers.
  """
  merge_transform = merge_headers.MergeHeaders(
      split_alternate_allele_info_fields, allow_incompatible_records)
  return headers | 'MergeHeaders' >> merge_transform
def _merge_headers(known_args, pipeline_args, pipeline_mode):
  """Merges VCF headers using beam based on pipeline_mode.

  No-op when a representative header file was supplied by the user.
  Otherwise runs a (possibly local) beam pipeline that reads, merges and —
  when requested — infers VCF headers, writes the result to a uniquely
  named temp file, and records that path in
  `known_args.representative_header_file` (mutates `known_args`).
  """
  if known_args.representative_header_file:
    return
  options = PipelineOptions(pipeline_args)
  # Always run pipeline locally if data is small.
  if (pipeline_mode == PipelineModes.SMALL and
      not known_args.infer_undefined_headers):
    options.view_as(StandardOptions).runner = 'DirectRunner'
  google_cloud_options = options.view_as(GoogleCloudOptions)
  # Suffix (or set) the job name so this merge job is distinguishable from
  # the main pipeline's job.
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + _MERGE_HEADERS_JOB_NAME
  else:
    google_cloud_options.job_name = _MERGE_HEADERS_JOB_NAME
  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  # Add a time prefix to ensure files are unique in case multiple
  # pipelines are run at the same time.
  temp_merged_headers_file_name = '-'.join([
      datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
      google_cloud_options.job_name,
      _MERGE_HEADERS_FILE_NAME])
  known_args.representative_header_file = FileSystems.join(
      temp_directory, temp_merged_headers_file_name)
  with beam.Pipeline(options=options) as p:
    headers = p
    # LARGE mode fans out over many files via ReadAllVcfHeaders; otherwise a
    # single ReadVcfHeaders source suffices.
    if pipeline_mode == PipelineModes.LARGE:
      headers |= (beam.Create([known_args.input_pattern])
                  | vcf_header_io.ReadAllVcfHeaders())
    else:
      headers |= vcf_header_io.ReadVcfHeaders(known_args.input_pattern)
    merged_header = (headers
                     | 'MergeHeaders' >> merge_headers.MergeHeaders(
                         known_args.split_alternate_allele_info_fields))
    if known_args.infer_undefined_headers:
      merged_header = _add_inferred_headers(p, known_args, merged_header)
    _ = (merged_header
         | 'WriteHeaders' >> vcf_header_io.WriteVcfHeaders(
             known_args.representative_header_file))
def _add_inferred_headers(
    pipeline,  # type: beam.Pipeline
    known_args,  # type: argparse.Namespace
    merged_header  # type: pvalue.PCollection
    ):
  # type: (...) -> pvalue.PCollection
  """Infers undefined headers from variants and merges them in.

  Reads the variants, filters them by the requested reference names, infers
  undefined header fields using the current `merged_header` as a side input,
  then flattens and merges the inferred headers with `merged_header`.

  Returns:
    A PCollection holding the merged representative header.
  """
  inferred_headers = (
      _read_variants(pipeline, known_args)
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      # Fixed: the step label used to start with a stray space
      # (' InferUndefinedHeaderFields'), producing an oddly named step.
      | 'InferUndefinedHeaderFields' >>
      infer_undefined_headers.InferUndefinedHeaderFields(
          pvalue.AsSingleton(merged_header)))
  merged_header = (
      (inferred_headers, merged_header)
      # Named for consistency with the other header-merging pipelines.
      | 'FlattenHeaders' >> beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
def add_annotation_headers(pipeline, known_args, pipeline_mode,
                           merged_header, annotated_vcf_pattern):
  """Reads the annotated VCF's headers and merges them with `merged_header`.

  In LARGE mode the pattern is fanned out through ReadAllVcfHeaders;
  otherwise a single ReadVcfHeaders source is used.

  Returns:
    A PCollection holding the merged representative header.
  """
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (
        pipeline
        | 'ReadAnnotatedVCF' >> beam.Create([annotated_vcf_pattern])
        | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders' >> vcf_header_io.ReadVcfHeaders(annotated_vcf_pattern))
  merge_transform = merge_headers.MergeHeaders(
      known_args.split_alternate_allele_info_fields,
      known_args.allow_incompatible_records)
  return ((merged_header, annotation_headers)
          | beam.Flatten()
          | 'MergeWithOriginalHeaders' >> merge_transform)
def test_combine_pipeline(self):
  """Merging headers via the pipeline must equal merging them directly."""
  headers_1 = self._get_header_from_lines(FILE_1_LINES)
  headers_2 = self._get_header_from_lines(FILE_2_LINES)
  # TODO(nmousavi): Either use TestPipeline or combiner_fn.* everywhere.
  # After moving out _HeaderMerger to its file, it makes sense to use
  # TestPipeline everywhere.
  header_merger = HeaderMerger(
      vcf_field_conflict_resolver.FieldConflictResolver(
          split_alternate_allele_info_fields=True))
  expected = vcf_header_io.VcfHeader()
  header_merger.merge(expected, headers_1)
  header_merger.merge(expected, headers_2)
  pipeline = TestPipeline()
  merged_headers = (pipeline
                    | Create([headers_1, headers_2])
                    | 'MergeHeaders' >> merge_headers.MergeHeaders())
  assert_that(merged_headers, equal_to([expected]))
  # Fixed: without run() the pipeline never executes, so the assert_that
  # above was never evaluated and the test passed vacuously.
  pipeline.run()