def _merge_headers(known_args, pipeline_args, pipeline_mode):
  """Runs a separate Beam pipeline that produces one merged VCF header file.

  Nothing happens when ``known_args.representative_header_file`` is already
  set. Otherwise the headers matched by ``known_args.input_pattern`` are read,
  merged and written to a timestamped file, and
  ``known_args.representative_header_file`` is updated to point at that file.

  Args:
    known_args: Parsed arguments. Reads ``representative_header_file``,
      ``infer_undefined_headers``, ``input_pattern`` and
      ``split_alternate_allele_info_fields``; overwrites
      ``representative_header_file``.
    pipeline_args: Arguments used to construct the ``PipelineOptions``.
    pipeline_mode: A ``PipelineModes`` value; selects both the runner
      override and the header-reading transform.
  """
  # A representative header was supplied by the caller; no merge is needed.
  if known_args.representative_header_file:
    return

  options = PipelineOptions(pipeline_args)

  # Small inputs are merged locally unless undefined headers must be inferred.
  is_small = pipeline_mode == PipelineModes.SMALL
  if is_small and not known_args.infer_undefined_headers:
    options.view_as(StandardOptions).runner = 'DirectRunner'

  cloud_options = options.view_as(GoogleCloudOptions)
  # Suffix an existing job name, or fall back to the merge job name alone.
  requested_job_name = cloud_options.job_name
  cloud_options.job_name = (
      requested_job_name + '-' + _MERGE_HEADERS_JOB_NAME
      if requested_job_name else _MERGE_HEADERS_JOB_NAME)

  output_dir = cloud_options.temp_location
  if not output_dir:
    output_dir = tempfile.mkdtemp()

  # The leading timestamp keeps file names distinct when several pipelines
  # run at the same time.
  timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
  merged_file_name = '-'.join(
      [timestamp, cloud_options.job_name, _MERGE_HEADERS_FILE_NAME])
  known_args.representative_header_file = FileSystems.join(
      output_dir, merged_file_name)

  with beam.Pipeline(options=options) as pipeline:
    if pipeline_mode == PipelineModes.LARGE:
      # Compose the two transforms first, then apply them as one unit.
      read_headers = (beam.Create([known_args.input_pattern])
                      | vcf_header_io.ReadAllVcfHeaders())
    else:
      read_headers = vcf_header_io.ReadVcfHeaders(known_args.input_pattern)
    headers = pipeline | read_headers

    merge_transform = merge_headers.MergeHeaders(
        known_args.split_alternate_allele_info_fields)
    merged_header = headers | 'MergeHeaders' >> merge_transform

    if known_args.infer_undefined_headers:
      merged_header = _add_inferred_headers(
          pipeline, known_args, merged_header)

    header_sink = vcf_header_io.WriteVcfHeaders(
        known_args.representative_header_file)
    _ = merged_header | 'WriteHeaders' >> header_sink
# Example #2
# 0
def write_headers(merged_header, file_path):
  # type: (pvalue.PCollection, str) -> None
  """Pipes ``merged_header`` into a ``WriteVcfHeaders`` sink at ``file_path``.

  Args:
    merged_header: The PCollection of ``VcfHeader`` to write out.
    file_path: Destination passed to ``vcf_header_io.WriteVcfHeaders``.
  """
  header_sink = vcf_header_io.WriteVcfHeaders(file_path)
  # The result of applying the sink is intentionally discarded.
  _ = merged_header | 'WriteHeaders' >> header_sink