Example #1
def _add_inferred_headers(all_patterns,  # type: List[str]
                          pipeline,  # type: beam.Pipeline
                          known_args,  # type: argparse.Namespace
                          merged_header,  # type: pvalue.PCollection
                          pipeline_mode  # type: int
                         ):
  # type: (...) -> pvalue.PCollection
  annotation_fields_to_infer = (known_args.annotation_fields if
                                known_args.infer_annotation_types else [])
  inferred_headers = (
      _read_variants(all_patterns,
                     pipeline,
                     known_args,
                     pipeline_mode)
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
          pvalue.AsSingleton(merged_header),
          known_args.allow_incompatible_records,
          known_args.infer_headers,
          annotation_fields_to_infer))
  merged_header = (
      (inferred_headers, merged_header)
      | 'FlattenHeaders' >> beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
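This helper is meant to be called from the pipeline driver once an initial merged header exists (compare the call in the `_merge_headers` example further down). A minimal, hypothetical call-site sketch, where `all_patterns`, `known_args`, `merged_header`, and `pipeline_mode` are assumed to already exist in the surrounding code:

# Hypothetical call site; the names only mirror the parameters above.
if known_args.infer_headers or known_args.infer_annotation_types:
  merged_header = _add_inferred_headers(
      all_patterns, pipeline, known_args, merged_header, pipeline_mode)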
Example #2
  def expand(self, pcoll):
    return (pcoll
            | 'InferUndefinedHeaderFields' >> beam.ParDo(
                _InferUndefinedHeaderFields(), self._defined_headers)
            # TODO(nmousavi): Modify the MergeHeaders to resolve the 1 vs '.'
            # mismatch for headers extracted from variants.
            #
            # Note: the argument `split_alternate_allele_info_fields` is not
            # relevant here since no fields with `Number=A` will be extracted
            # from variants, so we let its default value (True) be used.
            # Should this change, we should modify the default value.
            | 'MergeHeaders' >> merge_headers.MergeHeaders(
                split_alternate_allele_info_fields=True))
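This `expand` method belongs to a composite header-inference PTransform; a rough sketch of applying it, in the spirit of Examples #3 and #4 below, where `variants` and `merged_header` are assumed to be existing PCollections:

# Sketch only; mirrors the usage in Examples #3 and #4.
inferred_headers = (
    variants
    | 'InferUndefinedHeaderFields' >>
    infer_undefined_headers.InferUndefinedHeaderFields(
        pvalue.AsSingleton(merged_header)))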
Example #3
def _add_inferred_headers(pipeline, known_args, merged_header):
  inferred_headers = (
      _read_variants(pipeline, known_args)
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      | 'InferUndefinedHeaderFields' >>
      infer_undefined_headers.InferUndefinedHeaderFields(
          beam.pvalue.AsSingleton(merged_header)))
  merged_header = (
      (inferred_headers, merged_header)
      | beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields))
  return merged_header
Example #4
def _get_inferred_headers(variants,  # type: pvalue.PCollection
                          merged_header  # type: pvalue.PCollection
                         ):
  # type: (...) -> (pvalue.PCollection, pvalue.PCollection)
  inferred_headers = (variants
                      | 'FilterVariants' >> filter_variants.FilterVariants()
                      | 'InferUndefinedHeaderFields' >>
                      infer_undefined_headers.InferUndefinedHeaderFields(
                          pvalue.AsSingleton(merged_header)))
  merged_header = (
      (inferred_headers, merged_header)
      | beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          allow_incompatible_records=True))
  return inferred_headers, merged_header
Example #5
def get_merged_headers(headers,
                       split_alternate_allele_info_fields=True,
                       allow_incompatible_records=True):
  # type: (pvalue.PCollection, bool, bool) -> pvalue.PCollection
  """Applies the ``MergeHeaders`` PTransform on PCollection of ``VcfHeader``.

  Args:
    headers: The VCF headers.
    split_alternate_allele_info_fields: If true, INFO fields with `Number=A`
      are not repeated in the BigQuery schema. This is relevant as it changes
      the header compatibility rules.
    allow_incompatible_records: If true, conflicts are always resolved when
      merging headers.
  """
  return (headers | 'MergeHeaders' >> merge_headers.MergeHeaders(
      split_alternate_allele_info_fields, allow_incompatible_records))
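One way to exercise this helper is with Beam's testing utilities, mirroring the `test_combine_pipeline` example at the end of this page; `headers_1`, `headers_2`, and `expected` are assumed to be `VcfHeader` values built elsewhere:

# Sketch only; follows the pattern of test_combine_pipeline below.
pipeline = TestPipeline()
merged = get_merged_headers(pipeline | Create([headers_1, headers_2]))
assert_that(merged, equal_to([expected]))
pipeline.run()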
Example #6
def _merge_headers(known_args, pipeline_args, pipeline_mode):
  """Merges VCF headers using Beam, based on ``pipeline_mode``."""
  if known_args.representative_header_file:
    return

  options = PipelineOptions(pipeline_args)

  # Run the pipeline locally when the data is small and no header
  # inference is needed.
  if (pipeline_mode == PipelineModes.SMALL and
      not known_args.infer_undefined_headers):
    options.view_as(StandardOptions).runner = 'DirectRunner'


  google_cloud_options = options.view_as(GoogleCloudOptions)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + _MERGE_HEADERS_JOB_NAME
  else:
    google_cloud_options.job_name = _MERGE_HEADERS_JOB_NAME

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  # Add a time prefix to ensure files are unique in case multiple
  # pipelines are run at the same time.
  temp_merged_headers_file_name = '-'.join([
      datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
      google_cloud_options.job_name,
      _MERGE_HEADERS_FILE_NAME])
  known_args.representative_header_file = FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = p
    if pipeline_mode == PipelineModes.LARGE:
      headers |= (beam.Create([known_args.input_pattern])
                  | vcf_header_io.ReadAllVcfHeaders())
    else:
      headers |= vcf_header_io.ReadVcfHeaders(known_args.input_pattern)

    merged_header = (headers
                     | 'MergeHeaders' >> merge_headers.MergeHeaders(
                         known_args.split_alternate_allele_info_fields))

    if known_args.infer_undefined_headers:
      merged_header = _add_inferred_headers(p, known_args, merged_header)

    _ = (merged_header | 'WriteHeaders' >> vcf_header_io.WriteVcfHeaders(
        known_args.representative_header_file))
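Note that `_merge_headers` rewrites `known_args.representative_header_file` as a side effect and runs as a small preliminary pipeline before the main conversion. A hedged driver sketch, in which the argument parsing and the size check used to pick a `PipelineModes` value are assumptions standing in for the real command-line handling:

# Hypothetical driver; real argument parsing and mode selection live elsewhere.
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_mode = PipelineModes.LARGE if is_large_input else PipelineModes.SMALL
_merge_headers(known_args, pipeline_args, pipeline_mode)
# known_args.representative_header_file now points at the merged-header file.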
Example #7
def _add_inferred_headers(
        pipeline,  # type: beam.Pipeline
        known_args,  # type: argparse.Namespace
        merged_header  # type: pvalue.PCollection
):
    # type: (...) -> pvalue.PCollection
    inferred_headers = (_read_variants(pipeline, known_args)
                        | 'FilterVariants' >> filter_variants.FilterVariants(
                            reference_names=known_args.reference_names)
                        | 'InferUndefinedHeaderFields' >>
                        infer_undefined_headers.InferUndefinedHeaderFields(
                            pvalue.AsSingleton(merged_header)))
    merged_header = (
        (inferred_headers, merged_header)
        | beam.Flatten()
        | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
            known_args.split_alternate_allele_info_fields,
            known_args.allow_incompatible_records))
    return merged_header
Example #8
def add_annotation_headers(pipeline, known_args, pipeline_mode, merged_header,
                           annotated_vcf_pattern):
    if pipeline_mode == PipelineModes.LARGE:
        annotation_headers = (
            pipeline
            | 'ReadAnnotatedVCF' >> beam.Create([annotated_vcf_pattern])
            | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
    else:
        annotation_headers = (
            pipeline
            | 'ReadHeaders' >>
            vcf_header_io.ReadVcfHeaders(annotated_vcf_pattern))
    merged_header = (
        (merged_header, annotation_headers)
        | beam.Flatten()
        | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
            known_args.split_alternate_allele_info_fields,
            known_args.allow_incompatible_records))
    return merged_header
Example #9
    def test_combine_pipeline(self):
        headers_1 = self._get_header_from_lines(FILE_1_LINES)
        headers_2 = self._get_header_from_lines(FILE_2_LINES)

        # TODO(nmousavi): Either use TestPipeline or combiner_fn.* everywhere.
        # After moving out _HeaderMerger to its file, it makes sense to use
        # TestPipeline everywhere.
        header_merger = HeaderMerger(
            vcf_field_conflict_resolver.FieldConflictResolver(
                split_alternate_allele_info_fields=True))
        expected = vcf_header_io.VcfHeader()
        header_merger.merge(expected, headers_1)
        header_merger.merge(expected, headers_2)

        pipeline = TestPipeline()
        merged_headers = (pipeline
                          | Create([headers_1, headers_2])
                          | 'MergeHeaders' >> merge_headers.MergeHeaders())

        assert_that(merged_headers, equal_to([expected]))
        pipeline.run()