Example #1
def read_headers(pipeline, pipeline_mode, input_pattern):
  # type: (beam.Pipeline, int, str) -> pvalue.PCollection
  """Creates an initial PCollection by reading the VCF file headers."""
  if pipeline_mode == PipelineModes.LARGE:
    # Large inputs: materialize the pattern as a PCollection so the header
    # reads can be fanned out across workers via ReadAllVcfHeaders.
    headers = (pipeline
               | beam.Create([input_pattern])
               | vcf_header_io.ReadAllVcfHeaders())
  else:
    # Small inputs: read the headers directly from the single pattern.
    headers = pipeline | vcf_header_io.ReadVcfHeaders(input_pattern)
  return headers
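All of these examples branch on PipelineModes, which is not defined on this page. Below is a minimal sketch of the shape the snippets assume; only the members actually referenced above are included, and the values are placeholders (the real enum in gcp-variant-transforms may define more modes):

import enum

class PipelineModes(enum.Enum):
  # Only the members referenced in these examples; assumed values.
  SMALL = 0
  LARGE = 1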
Example #2
def read_headers(pipeline, pipeline_mode, all_patterns):
  # type: (beam.Pipeline, int, List[str]) -> pvalue.PCollection
  """Creates an initial PCollection by reading the VCF file headers."""
  compression_type = get_compression_type(all_patterns)
  if pipeline_mode == PipelineModes.LARGE:
    headers = (pipeline
               | beam.Create(all_patterns)
               | vcf_header_io.ReadAllVcfHeaders(
                   compression_type=compression_type))
  else:
    headers = pipeline | vcf_header_io.ReadVcfHeaders(
        all_patterns[0], compression_type=compression_type)

  return headers
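This variant assumes a get_compression_type helper that is not shown here. A hypothetical stand-in, not the project's actual implementation, could defer to Beam's extension-based detection unless every pattern is explicitly gzipped:

from apache_beam.io.filesystem import CompressionTypes

def get_compression_type(all_patterns):
  # Hypothetical helper: treat the input as gzipped only when every pattern
  # says so; otherwise let Beam infer from each file's extension.
  if all(pattern.endswith('.gz') for pattern in all_patterns):
    return CompressionTypes.GZIP
  return CompressionTypes.AUTO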
Example #3
def _merge_headers(known_args, pipeline_args, pipeline_mode):
  """Merges VCF headers using Beam, based on pipeline_mode."""
  if known_args.representative_header_file:
    return

  options = PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == PipelineModes.SMALL and
      not known_args.infer_undefined_headers):
    options.view_as(StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(GoogleCloudOptions)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + _MERGE_HEADERS_JOB_NAME
  else:
    google_cloud_options.job_name = _MERGE_HEADERS_JOB_NAME

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  # Add a time prefix to ensure files are unique in case multiple
  # pipelines are run at the same time.
  temp_merged_headers_file_name = '-'.join([
      datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
      google_cloud_options.job_name,
      _MERGE_HEADERS_FILE_NAME])
  known_args.representative_header_file = FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = p
    if pipeline_mode == PipelineModes.LARGE:
      headers |= (beam.Create([known_args.input_pattern])
                  | vcf_header_io.ReadAllVcfHeaders())
    else:
      headers |= vcf_header_io.ReadVcfHeaders(known_args.input_pattern)

    merged_header = (headers
                     | 'MergeHeaders' >> merge_headers.MergeHeaders(
                         known_args.split_alternate_allele_info_fields))

    if known_args.infer_undefined_headers:
      merged_header = _add_inferred_headers(p, known_args, merged_header)

    _ = (merged_header | 'WriteHeaders' >> vcf_header_io.WriteVcfHeaders(
        known_args.representative_header_file))
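The time-prefixed temp file naming in _merge_headers can be exercised on its own. A minimal sketch with stand-in values for the job name and the _MERGE_HEADERS_FILE_NAME constant (neither is defined on this page):

import datetime
import tempfile

from apache_beam.io.filesystems import FileSystems

job_name = 'merge-headers'  # stand-in for google_cloud_options.job_name
merge_headers_file_name = 'merged_headers.vcf'  # stand-in constant
temp_directory = tempfile.mkdtemp()
# Same scheme as above: timestamp first, so concurrent runs get unique paths.
temp_file_name = '-'.join([
    datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
    job_name,
    merge_headers_file_name])
print(FileSystems.join(temp_directory, temp_file_name))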
Example #4
def add_annotation_headers(pipeline, known_args, pipeline_mode, merged_header,
                           annotated_vcf_pattern):
  """Reads the annotated VCF headers and merges them into merged_header."""
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (
        pipeline
        | 'ReadAnnotatedVCF' >> beam.Create([annotated_vcf_pattern])
        | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders' >> vcf_header_io.ReadVcfHeaders(
            annotated_vcf_pattern))
  merged_header = ((merged_header, annotation_headers)
                   | beam.Flatten()
                   | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
                       known_args.split_alternate_allele_info_fields,
                       known_args.allow_incompatible_records))
  return merged_header
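The (pcoll_a, pcoll_b) | beam.Flatten() step above is the standard Beam way to union two PCollections before a downstream transform. A self-contained toy version, runnable on the DirectRunner, with illustrative labels and strings in place of real VCF headers:

import apache_beam as beam

with beam.Pipeline() as p:
  original = p | 'Original' >> beam.Create(['header-a'])
  annotated = p | 'Annotated' >> beam.Create(['header-b'])
  # Flatten unions both PCollections into one, mirroring the step that
  # feeds MergeHeaders above.
  _ = ((original, annotated)
       | beam.Flatten()
       | beam.Map(print))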