import datetime
import tempfile
from typing import List  # pylint: disable=unused-import

import apache_beam as beam
from apache_beam import pvalue  # pylint: disable=unused-import
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions

# The names PipelineModes, get_compression_type, _add_inferred_headers,
# _MERGE_HEADERS_JOB_NAME and _MERGE_HEADERS_FILE_NAME are module-level
# dependencies defined elsewhere in this codebase; the import paths below
# assume the gcp-variant-transforms layout.
from gcp_variant_transforms.beam_io import vcf_header_io
from gcp_variant_transforms.transforms import merge_headers


def read_headers(pipeline, pipeline_mode, all_patterns):
  # type: (beam.Pipeline, int, List[str]) -> pvalue.PCollection
  """Creates an initial PCollection by reading the VCF file headers."""
  compression_type = get_compression_type(all_patterns)
  if pipeline_mode == PipelineModes.LARGE:
    headers = (pipeline
               | beam.Create(all_patterns)
               | vcf_header_io.ReadAllVcfHeaders(
                   compression_type=compression_type))
  else:
    headers = pipeline | vcf_header_io.ReadVcfHeaders(
        all_patterns[0],
        compression_type=compression_type)
  return headers
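
# Illustrative usage sketch, not part of the original module: wires
# read_headers into a standalone pipeline and feeds the result to
# MergeHeaders, mirroring the flow used in _merge_headers below. The function
# name, the SMALL-mode choice, and the positional True (for
# split_alternate_allele_info_fields) are assumptions for the example.
def _example_read_and_merge_headers(pipeline_args, all_patterns):
  with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
    headers = read_headers(p, PipelineModes.SMALL, all_patterns)
    # Collapse the per-file headers into a single representative header.
    _ = headers | 'MergeHeaders' >> merge_headers.MergeHeaders(True)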

def _merge_headers(known_args, pipeline_args, pipeline_mode):
  """Merges VCF headers using beam based on pipeline_mode."""
  if known_args.representative_header_file:
    return

  options = PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == PipelineModes.SMALL and
      not known_args.infer_undefined_headers):
    options.view_as(StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(GoogleCloudOptions)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + _MERGE_HEADERS_JOB_NAME
  else:
    google_cloud_options.job_name = _MERGE_HEADERS_JOB_NAME

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  # Add a time prefix to ensure files are unique in case multiple
  # pipelines are run at the same time.
  temp_merged_headers_file_name = '-'.join([
      datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
      google_cloud_options.job_name,
      _MERGE_HEADERS_FILE_NAME])
  known_args.representative_header_file = FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = p
    if pipeline_mode == PipelineModes.LARGE:
      headers |= (beam.Create([known_args.input_pattern])
                  | vcf_header_io.ReadAllVcfHeaders())
    else:
      headers |= vcf_header_io.ReadVcfHeaders(known_args.input_pattern)

    merged_header = (headers
                     | 'MergeHeaders' >> merge_headers.MergeHeaders(
                         known_args.split_alternate_allele_info_fields))
    if known_args.infer_undefined_headers:
      merged_header = _add_inferred_headers(p, known_args, merged_header)

    _ = (merged_header
         | 'WriteHeaders' >> vcf_header_io.WriteVcfHeaders(
             known_args.representative_header_file))
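
# Illustrative driver sketch, not from the source: one way _merge_headers
# might be invoked after argument parsing. The flag names are inferred from
# the known_args attributes used above; the defaults are assumptions.
import argparse  # placed here only for the sketch


def _example_run_merge_headers(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--input_pattern', required=True)
  parser.add_argument('--representative_header_file', default='')
  parser.add_argument('--infer_undefined_headers', action='store_true')
  parser.add_argument('--split_alternate_allele_info_fields',
                      action='store_true')
  known_args, pipeline_args = parser.parse_known_args(argv)
  _merge_headers(known_args, pipeline_args, PipelineModes.SMALL)
  # On return, known_args.representative_header_file points at the merged
  # header file written by the pipeline.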

def add_annotation_headers(pipeline, known_args, pipeline_mode,
                           merged_header,
                           annotated_vcf_pattern):
  """Reads the annotated VCF headers and merges them into merged_header."""
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (
        pipeline
        | 'ReadAnnotatedVCF' >> beam.Create([annotated_vcf_pattern])
        | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders' >> vcf_header_io.ReadVcfHeaders(
            annotated_vcf_pattern))
  merged_header = (
      (merged_header, annotation_headers)
      | beam.Flatten()
      | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
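
# Illustrative composition sketch, not from the source: reading the original
# headers, merging them, then folding in the annotation headers via
# add_annotation_headers. The wrapper itself and the known_args.all_patterns
# attribute are hypothetical; the other attribute names are taken from the
# functions above.
def _example_merge_with_annotations(pipeline, known_args, pipeline_mode,
                                    annotated_vcf_pattern):
  headers = read_headers(pipeline, pipeline_mode, known_args.all_patterns)
  merged_header = (headers
                   | 'MergeHeaders' >> merge_headers.MergeHeaders(
                       known_args.split_alternate_allele_info_fields))
  return add_annotation_headers(pipeline, known_args, pipeline_mode,
                                merged_header, annotated_vcf_pattern)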