Beispiel #1
0
    def test_infer_annotation_pipeline(self):
        # Feeds two variants carrying 'CSQ' annotation strings through
        # InferHeaderFields (with infer_headers=False) and checks that exactly
        # three '<annotation>_TYPE' INFO fields come out.
        # NOTE(review): the mapping of the '|'-separated positions to the
        # names Gene/Position/Score presumably comes from the sample header
        # built by _get_sample_header_fields -- confirm there.
        description = 'Inferred type field for annotation {}.'
        defined_header = self._get_sample_header_fields(with_annotation=True)

        first_variant = self._get_sample_variant_1()
        first_variant.info['CSQ'] = [
            'A|1|100|1.2',
            'A|2|101|1.3',
            'A|12|start|0',
            'TT|13|end|7',
        ]
        second_variant = self._get_sample_variant_1()
        second_variant.info['CSQ'] = [
            'A|1|100|',
            'A|2|101|',
            'A|1.2|102|0',
            'TT|1.3|103|7',
        ]

        # (header key, expected type, annotation name used in description)
        expected_fields = [
            ('CSQ_Gene_TYPE', 'Float', 'Gene'),
            ('CSQ_Position_TYPE', 'String', 'Position'),
            ('CSQ_Score_TYPE', 'Float', 'Score'),
        ]
        expected_infos = {
            key: createInfo(
                key, 1, value_type, description.format(annotation))
            for key, value_type, annotation in expected_fields
        }
        expected_header = vcf_header_io.VcfHeader(infos=expected_infos)

        with TestPipeline() as pipeline:
            variants = pipeline | Create([first_variant, second_variant])
            infer_transform = infer_headers.InferHeaderFields(
                defined_headers=defined_header,
                infer_headers=False,
                annotation_fields_to_infer=['CSQ'])
            inferred_headers = (
                variants | 'InferAnnotationTypes' >> infer_transform)
            assert_that(
                inferred_headers,
                asserts.header_fields_equal_ignore_order([expected_header]))
            # NOTE(review): the enclosing `with` block presumably also runs
            # the pipeline on exit; the explicit call is kept unchanged.
            pipeline.run()
  def test_header_fields_inferred_from_two_variants(self):
    # With defined_headers=None, every INFO and FORMAT field listed below is
    # expected to be reported by InferUndefinedHeaderFields for the two
    # sample variants.

    # (field id, number, type) for each expected INFO entry.
    info_specs = [
        ('IS', 1, 'String'),
        ('ISI', 1, 'Integer'),
        ('ISF', 1, 'Float'),
        ('IF', 1, 'Float'),
        ('IB', 0, 'Flag'),
        ('IA', None, 'Float'),
        ('IS_2', 1, 'String'),
    ]
    # (field id, number, type) for each expected FORMAT entry.
    format_specs = [
        ('FI', 1, 'Integer'),
        ('FU', None, 'Float'),
        ('FI_2', 1, 'Integer'),
    ]
    expected_header = vcf_header_io.VcfHeader(
        infos={
            field_id: Info(field_id, number, value_type, '', '', '')
            for field_id, number, value_type in info_specs
        },
        formats={
            field_id: Format(field_id, number, value_type, '')
            for field_id, number, value_type in format_specs
        })

    with TestPipeline() as pipeline:
      first_variant = self._get_sample_variant_1()
      second_variant = self._get_sample_variant_2()
      variants = pipeline | Create([first_variant, second_variant])
      infer_transform = infer_undefined_headers.InferUndefinedHeaderFields(
          defined_headers=None)
      inferred_headers = (
          variants | 'InferUndefinedHeaderFields' >> infer_transform)

      assert_that(
          inferred_headers,
          asserts.header_fields_equal_ignore_order([expected_header]))
      # NOTE(review): the enclosing `with` block presumably also runs the
      # pipeline on exit; the explicit call is kept unchanged.
      pipeline.run()
  def test_pipeline(self):
    # Runs InferHeaderFields with allow_incompatible_records=True against a
    # header that already defines several INFO/FORMAT fields, using one
    # variant from the "info IA cardinality mismatch" helper and one from the
    # "format FI float value" helper. Only the fields listed in the expected
    # header below should be emitted.

    # (field id, number, type) of the INFO fields already defined.
    defined_info_specs = [
        ('IS', 1, 'String'),
        ('ISI', 1, 'Integer'),
        ('ISF', 1, 'Float'),
        ('IB', 0, 'Flag'),
        ('IA', -1, 'Integer'),
    ]
    # (field id, number, type, description) of the FORMAT fields already
    # defined; insertion order is preserved through the OrderedDict.
    defined_format_specs = [
        ('FS', 1, 'String', 'desc'),
        ('FI', 2, 'Integer', 'desc'),
        ('GT', 2, 'Integer', 'Special GT key'),
        ('PS', 1, 'Integer', 'Special PS key'),
    ]
    infos = {
        field_id: Info(field_id, number, value_type, '', '', '')
        for field_id, number, value_type in defined_info_specs
    }
    formats = OrderedDict(
        (field_id, Format(field_id, number, value_type, description))
        for field_id, number, value_type, description in defined_format_specs)

    with TestPipeline() as pipeline:
      mismatched_info_variant = (
          self._get_sample_variant_info_ia_cardinality_mismatch())
      float_format_variant = self._get_sample_variant_format_fi_float_value()
      variants = pipeline | Create(
          [mismatched_info_variant, float_format_variant])
      defined_header = vcf_header_io.VcfHeader(infos=infos, formats=formats)
      infer_transform = infer_headers.InferHeaderFields(
          defined_headers=defined_header,
          allow_incompatible_records=True)
      inferred_headers = variants | 'InferHeaderFields' >> infer_transform

      expected_header = vcf_header_io.VcfHeader(
          infos={
              'IA': Info('IA', None, 'Float', '', '', ''),
              'IF': Info('IF', 1, 'Float', '', '', ''),
          },
          formats={
              'FI': Format('FI', 2, 'Float', 'desc'),
              'FU': Format('FU', None, 'Float', ''),
          })
      assert_that(
          inferred_headers,
          asserts.header_fields_equal_ignore_order([expected_header]))
      # NOTE(review): the enclosing `with` block presumably also runs the
      # pipeline on exit; the explicit call is kept unchanged.
      pipeline.run()
  def test_defined_fields_filtered_two_variants(self):
    # Only the INFO and FORMAT fields of the first variant are already
    # defined in the header section of the VCF files, so just the extra
    # fields of the second variant ('IS_2' and 'FI_2') are expected back.
    with TestPipeline() as pipeline:
      # The defined headers are handed to the transform as a singleton side
      # input rather than as a plain value.
      defined_headers = self._get_sample_header_fields()
      headers_pcoll = pipeline | 'vcf_header' >> Create([defined_headers])

      first_variant = self._get_sample_variant_1()
      second_variant = self._get_sample_variant_2()
      variants = pipeline | Create([first_variant, second_variant])
      infer_transform = infer_undefined_headers.InferUndefinedHeaderFields(
          pvalue.AsSingleton(headers_pcoll))
      inferred_headers = (
          variants | 'InferUndefinedHeaderFields' >> infer_transform)

      expected_header = vcf_header_io.VcfHeader(
          infos={'IS_2': Info('IS_2', 1, 'String', '', '', '')},
          formats={'FI_2': Format('FI_2', 1, 'Integer', '')})
      assert_that(
          inferred_headers,
          asserts.header_fields_equal_ignore_order([expected_header]))
      # NOTE(review): the enclosing `with` block presumably also runs the
      # pipeline on exit; the explicit call is kept unchanged.
      pipeline.run()