def test_vcf_header_to_schema_to_vcf_header(self):
        """Round-trips a header through a schema and back.

        Builds a header whose INFO and FORMAT fields all use the '.' count,
        converts it to a schema, converts that schema back to header fields,
        and expects the result to equal the header it started from.
        """
        info_fields = OrderedDict()
        info_fields['I1'] = createInfo(
            'I1', '.', 'String', 'desc', None, None)
        info_fields['IA'] = createInfo(
            'IA', '.', 'Integer', 'desc', None, None)

        format_fields = OrderedDict()
        format_fields['F1'] = createFormat('F1', '.', 'String', 'desc')
        format_fields['F2'] = createFormat('F2', '.', 'Integer', 'desc')
        format_fields['FU'] = createFormat('FU', '.', 'Float', 'desc')

        source_header = vcf_header_io.VcfHeader(infos=info_fields,
                                                formats=format_fields)
        variant_factory = processed_variant.ProcessedVariantFactory(
            source_header)

        # Header -> schema -> header.
        table_schema = schema_converter.generate_schema_from_header_fields(
            source_header, variant_factory)
        round_tripped_header = (
            schema_converter.generate_header_fields_from_schema(table_schema))

        self.assertEqual(source_header, round_tripped_header)
    def test_generate_header_fields_from_schema_schema_compatibility(self):
        """Checks how a schema with a top-level 'AA' INTEGER field is handled.

        By default the conversion is expected to raise `ValueError`. With
        `allow_incompatible_schema=True` it is expected to produce a header
        with a single 'AA' INFO field (count 1, type Integer) and no FORMAT
        fields.
        """
        conflicting_schema = bigquery.TableSchema()
        aa_column = bigquery.TableFieldSchema(
            name='AA',
            type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='desc')
        conflicting_schema.fields.append(aa_column)

        # Default mode: the schema is expected to be rejected.
        self.assertRaises(
            ValueError,
            schema_converter.generate_header_fields_from_schema,
            conflicting_schema)

        # Lenient mode: the same schema is expected to convert.
        lenient_header = schema_converter.generate_header_fields_from_schema(
            conflicting_schema, allow_incompatible_schema=True)

        expected_infos = OrderedDict()
        expected_infos['AA'] = createInfo(
            'AA', 1, 'Integer', 'desc', None, None)
        expected_header = vcf_header_io.VcfHeader(
            infos=expected_infos, formats=OrderedDict())
        self.assertEqual(lenient_header, expected_header)
    def test_generate_header_fields_from_schema_with_annotation(self):
        """Checks header generation from the annotated sample table schema.

        The expected header has the INFO fields AF, CSQ, AA, IFR and IS (in
        that order) and the FORMAT fields FB and GQ. The CSQ description is
        expected to end with 'Format: Consequence|IMPACT'.
        """
        annotated_schema = bigquery_schema_util.get_sample_table_schema(
            with_annotation_fields=True)
        actual_header = schema_converter.generate_header_fields_from_schema(
            annotated_schema)

        expected_infos = OrderedDict()
        expected_infos['AF'] = createInfo(
            'AF', 'A', 'Float', 'desc', None, None)
        expected_infos['CSQ'] = createInfo(
            'CSQ', '.', 'String', 'desc Format: Consequence|IMPACT', None,
            None)
        expected_infos['AA'] = createInfo(
            'AA', 1, 'String', 'desc', None, None)
        expected_infos['IFR'] = createInfo(
            'IFR', '.', 'Float', 'desc', None, None)
        expected_infos['IS'] = createInfo(
            'IS', 1, 'String', 'desc', None, None)

        expected_formats = OrderedDict()
        expected_formats['FB'] = createFormat('FB', 1, 'String', 'desc')
        expected_formats['GQ'] = createFormat('GQ', 1, 'Integer', 'desc')

        expected_header = vcf_header_io.VcfHeader(infos=expected_infos,
                                                  formats=expected_formats)
        self.assertEqual(actual_header, expected_header)
  def test_combine_pipeline(self):
    """Checks that `MergeHeaders` agrees with merging the headers by hand.

    The expected header is built by merging the headers parsed from
    FILE_1_LINES and FILE_2_LINES into an empty `VcfHeader` with a
    `HeaderMerger`. The same two headers are then fed through the
    `MergeHeaders` transform and the single output is compared to it.
    """
    headers_1 = self._get_header_from_lines(FILE_1_LINES)
    headers_2 = self._get_header_from_lines(FILE_2_LINES)

    # TODO(nmousavi): Either use TestPipeline or combiner_fn.* everywhere.
    # After moving out _HeaderMerger to its file, it makes sense to use
    # TestPipeline everywhere.
    header_merger = HeaderMerger(
        vcf_field_conflict_resolver.FieldConflictResolver(
            split_alternate_allele_info_fields=True))
    expected = vcf_header_io.VcfHeader()
    header_merger.merge(expected, headers_1)
    header_merger.merge(expected, headers_2)

    pipeline = TestPipeline()
    merged_headers = (
        pipeline
        | Create([headers_1, headers_2])
        | 'MergeHeaders' >> merge_headers.MergeHeaders())

    assert_that(merged_headers, equal_to([expected]))
    # `assert_that` only attaches a check to the pipeline; without running
    # the pipeline the check is never evaluated and the test passes
    # vacuously.
    pipeline.run()
コード例 #5
0
    def test_infer_annotation_types_with_missing(self):
        """Checks annotation type inference when some values are empty.

        With partly empty CSQ entries, the inferred INFO fields are expected
        to be exactly CSQ_Gene_TYPE ('.'), CSQ_Position_TYPE ('Integer') and
        CSQ_Score_TYPE ('Float'). With an empty CSQ list the inferred header
        is expected to equal an empty `VcfHeader`.
        """
        sample_header = self._get_sample_header_fields(with_annotation=True)
        sample_variant = self._get_sample_variant_1()
        sample_variant.info['CSQ'] = [
            'A||100|', 'A||101|1.3', 'A|||1.4', 'TT|||'
        ]
        inferrer = infer_headers._InferHeaderFields(False, ['CSQ'])

        inferred = next(inferrer.process(sample_variant, sample_header))
        type_by_field = {
            'CSQ_Gene_TYPE': '.',
            'CSQ_Position_TYPE': 'Integer',
            'CSQ_Score_TYPE': 'Float'
        }
        for field_id, field in inferred.infos.items():
            self.assertEqual(field['type'], type_by_field[field_id])
        # Same number of fields, so nothing expected is missing either.
        self.assertEqual(len(type_by_field), len(inferred.infos))

        # No annotation values at all.
        sample_variant.info['CSQ'] = []
        inferred = next(inferrer.process(sample_variant, sample_header))
        empty_header = vcf_header_io.VcfHeader()
        self.assertEqual(empty_header, inferred)
  def test_defined_fields_filtered_two_variants(self):
    """Checks that fields already defined in the header are not inferred.

    The sample header is passed as a singleton side input. Only the INFO and
    FORMAT fields of the first variant are defined in it, so the inferred
    header is expected to hold just 'IS_2' and 'FI_2'.
    """
    with TestPipeline() as pipeline:
      defined_headers = self._get_sample_header_fields()
      defined_headers_pcoll = (
          pipeline | 'vcf_header' >> Create([defined_headers]))
      first_variant = self._get_sample_variant_1()
      second_variant = self._get_sample_variant_2()

      variants = pipeline | Create([first_variant, second_variant])
      inferred_headers = (
          variants
          | 'InferUndefinedHeaderFields' >>
          infer_undefined_headers.InferUndefinedHeaderFields(
              pvalue.AsSingleton(defined_headers_pcoll)))

      expected = vcf_header_io.VcfHeader(
          infos={'IS_2': Info('IS_2', 1, 'String', '', '', '')},
          formats={'FI_2': Format('FI_2', 1, 'String', '')})
      assert_that(inferred_headers, equal_to([expected]))
      # NOTE(review): the `with` block presumably also runs the pipeline on
      # exit -- confirm whether this explicit run is still needed.
      pipeline.run()
コード例 #7
0
 def test_bigquery_field_name_sanitize(self):
   """Checks the field names expected in the schema for unusual header IDs.

   Expected mapping: '_' -> 'field__', '_A' -> 'field__A', '0a' ->
   'field_0a', 'A-B*C' -> 'A_B_C', 'I-A' -> 'I_A' and 'a^b' -> 'a_b'.
   'OK_info_09' and 'OK_format_09' are expected to stay unchanged.
   """
   info_fields = OrderedDict()
   for info_id in ('_', '_A', '0a', 'A-B*C'):
     info_fields[info_id] = createInfo(
         info_id, 1, 'String', 'desc', 'src', 'v')
   info_fields['I-A'] = createInfo('I-A', 'A', 'Float', 'desc', 'src', 'v')
   info_fields['OK_info_09'] = createInfo('OK_info_09', 1, 'String', 'desc')

   format_fields = OrderedDict()
   for format_id in ('a^b', 'OK_format_09'):
     format_fields[format_id] = createFormat(format_id, 1, 'String', 'desc')

   header_fields = vcf_header_io.VcfHeader(infos=info_fields,
                                           formats=format_fields)
   expected_fields = self._generate_expected_fields(
       alt_fields=['I_A'],
       call_fields=['a_b', 'OK_format_09'],
       info_fields=['field__', 'field__A', 'field_0a', 'A_B_C',
                    'OK_info_09'])
   actual_schema = schema_converter.generate_schema_from_header_fields(
       header_fields,
       processed_variant.ProcessedVariantFactory(header_fields))
   self._validate_schema(expected_fields, actual_schema)
コード例 #8
0
 def test_info_and_format_header_fields(self):
   """Checks where INFO and FORMAT header fields are expected in the schema.

   'I1' is expected among the info fields, 'IA' (count 'A') among the alt
   fields, and 'F1', 'F2' and 'FU' among the call fields. 'GT' and 'PS' are
   not expected as call fields of their own.
   """
   info_fields = OrderedDict()
   info_fields['I1'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
   info_fields['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')

   # GT and PS should not be set as they're already included in special
   # 'genotype' and 'phaseset' fields.
   format_fields = OrderedDict()
   format_fields['F1'] = createFormat('F1', 1, 'String', 'desc')
   format_fields['F2'] = createFormat('F2', 2, 'Integer', 'desc')
   format_fields['FU'] = createFormat('FU', '.', 'Float', 'desc')
   format_fields['GT'] = createFormat('GT', 2, 'Integer', 'Special GT key')
   format_fields['PS'] = createFormat('PS', 1, 'Integer', 'Special PS key')

   header_fields = vcf_header_io.VcfHeader(infos=info_fields,
                                           formats=format_fields)
   expected_fields = self._generate_expected_fields(
       alt_fields=['IA'],
       call_fields=['F1', 'F2', 'FU'],
       info_fields=['I1'])
   actual_schema = schema_converter.generate_schema_from_header_fields(
       header_fields,
       processed_variant.ProcessedVariantFactory(header_fields))
   self._validate_schema(expected_fields, actual_schema)
コード例 #9
0
  def test_header_fields_inferred_one_variant(self):
    """Checks the header inferred from one variant with no defined headers.

    `InferHeaderFields` is given `defined_headers=None`, and the single
    output is compared with a header holding the INFO fields IS, ISI, ISF,
    IF, IB and IA and the FORMAT fields FI and FU.
    """
    with TestPipeline() as pipeline:
      sample_variant = self._get_sample_variant_1()
      variants = pipeline | Create([sample_variant])
      inferred_headers = (
          variants
          | 'InferHeaderFields' >>
          infer_headers.InferHeaderFields(defined_headers=None))

      infos = {}
      infos['IS'] = Info('IS', 1, 'String', '', '', '')
      infos['ISI'] = Info('ISI', 1, 'Integer', '', '', '')
      infos['ISF'] = Info('ISF', 1, 'Float', '', '', '')
      infos['IF'] = Info('IF', 1, 'Float', '', '', '')
      infos['IB'] = Info('IB', 0, 'Flag', '', '', '')
      infos['IA'] = Info('IA', None, 'Integer', '', '', '')

      formats = {}
      formats['FI'] = Format('FI', 1, 'Integer', '')
      formats['FU'] = Format('FU', None, 'Float', '')

      expected = vcf_header_io.VcfHeader(infos=infos, formats=formats)
      assert_that(inferred_headers, equal_to([expected]))
      # NOTE(review): the `with` block presumably also runs the pipeline on
      # exit -- confirm whether this explicit run is still needed.
      pipeline.run()
コード例 #10
0
  def test_header_fields_inferred_from_two_variants(self):
    """Checks the header inferred from two variants with no defined headers.

    `InferUndefinedHeaderFields` is given `defined_headers=None`, and the
    single output is compared with a header holding the INFO fields IS, IF,
    IA and IS_2 and the FORMAT fields FI, FU and FI_2.
    """
    with TestPipeline() as pipeline:
      first_variant = self._get_sample_variant_1()
      second_variant = self._get_sample_variant_2()
      variants = pipeline | Create([first_variant, second_variant])
      inferred_headers = (
          variants
          | 'InferUndefinedHeaderFields' >>
          infer_undefined_headers.InferUndefinedHeaderFields(
              defined_headers=None))

      infos = {}
      infos['IS'] = Info('IS', 1, 'String', '', '', '')
      infos['IF'] = Info('IF', 0, 'Flag', '', '', '')
      infos['IA'] = Info('IA', None, 'String', '', '', '')
      infos['IS_2'] = Info('IS_2', 1, 'String', '', '', '')

      formats = {}
      formats['FI'] = Format('FI', 1, 'String', '')
      formats['FU'] = Format('FU', None, 'String', '')
      formats['FI_2'] = Format('FI_2', 1, 'String', '')

      expected = vcf_header_io.VcfHeader(infos=infos, formats=formats)
      assert_that(inferred_headers, equal_to([expected]))
      # NOTE(review): the `with` block presumably also runs the pipeline on
      # exit -- confirm whether this explicit run is still needed.
      pipeline.run()
コード例 #11
0
  def _get_sample_variant_and_header_with_csq(self, additional_infos=None):
    """Returns a sample `Variant` with CSQ info and a matching `VcfHeader`.

    The variant's 'CSQ' info is set to three annotation strings. The header
    defines the INFO fields 'A1', 'A2' and 'CSQ', in that order.

    Args:
      additional_infos: A list of tuples of the format (key, `Info`) to be added
        to the `VcfHeader`. A key that already exists replaces that entry.

    Returns:
      A `(variant, header_fields)` tuple, where `header_fields` is a
      `VcfHeader` built with INFO fields only.
    """
    sample_variant = self._get_sample_variant()
    sample_variant.info['CSQ'] = [
        'A|C1|I1|S1|G1', 'TT|C2|I2|S2|G2', 'A|C3|I3|S3|G3']

    header_infos = OrderedDict()
    header_infos['A1'] = Info('A1', 1, None, '', None, None)
    header_infos['A2'] = Info(
        'A2', parser.field_counts['A'], None, '', None, None)
    header_infos['CSQ'] = Info(
        'CSQ',
        parser.field_counts['.'],
        None,
        'some desc Allele|Consequence|IMPACT|SYMBOL|Gene',
        None,
        None)

    extra_infos = [] if additional_infos is None else additional_infos
    for info_key, info_value in extra_infos:
      header_infos[info_key] = info_value

    return sample_variant, vcf_header_io.VcfHeader(infos=header_infos)
コード例 #12
0
def make_header(header_num_dict):
    # type: (Dict[str, str]) -> VcfHeader
    """Builds a VcfHeader based on the header_num_dict.

    All fields of parser._Info are set to their default values except for the
    'id' which is set to the keys in header_num_dict and 'num' which is set
    based on header_num_dict values mapped according to parser.field_counts.

    Args:
      header_num_dict: a dictionary mapping info keys to string num values.

    Returns:
      A `VcfHeader` whose infos hold one `parser._Info` per key.

    Raises:
      ValueError: if a num value is neither a key of `parser.field_counts`
        nor parseable by `int()`.
    """
    infos = {}
    # `items()` rather than `iteritems()`: the latter exists only on Python 2
    # dicts and raises AttributeError on Python 3.
    for k, v in header_num_dict.items():
        # Symbolic counts are looked up in the parser's table; anything else
        # must be a literal integer count.
        if v in parser.field_counts:
            pyvcf_num_field_value = parser.field_counts[v]
        else:
            pyvcf_num_field_value = int(v)
        infos[k] = parser._Info(id=k,
                                num=pyvcf_num_field_value,
                                type=None,
                                desc='',
                                source=None,
                                version=None)
    return vcf_header_io.VcfHeader(infos=infos)
コード例 #13
0
 def test_create_processed_variant_annotation_alt_allele_num(self):
     """Checks annotation-to-ALT matching when `use_allele_num=True`.

     The variant has two alternates ('T' and 'CT') and six CSQ annotations
     that all name 'T' as their allele. The annotations with ALLELE_NUM 1
     and 2 are expected to attach to the first and second alternate. The
     other four (ALLELE_NUM '0', '3', 'TEST' and empty) are expected to be
     counted as incorrect. 'CSQ' must not remain in the non-alt info.
     """
     csq_info = createInfo(None,
                           '.',
                           '.',
                           'some desc Allele|Consequence|IMPACT|ALLELE_NUM',
                           source=None,
                           version=None)
     header_fields = vcf_header_io.VcfHeader(infos={'CSQ': csq_info})
     variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         # The following represent a SNV and an insertion, resp.
         alternate_bases=['T', 'CT'],
         names=['rs1'],
         quality=2,
         filters=['PASS'],
         # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
         # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
         # But because there is ALLELE_NUM there should be no ambiguity.
         # The last four annotations have incorrect ALLELE_NUMs.
         info={
             'CSQ': [
                 'T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
                 'T|C5|I5|TEST', 'T|C6|I6|'
             ]
         })
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         use_allele_num=True,
         minimal_match=True,  # This should be ignored by the factory method.
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('T')
     alt1._info = {
         'CSQ': [{
             annotation_parser.ANNOTATION_ALT: 'T',
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'ALLELE_NUM': '1'
         }]
     }
     alt2 = processed_variant.AlternateBaseData('CT')
     alt2._info = {
         'CSQ': [{
             annotation_parser.ANNOTATION_ALT: 'T',
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'ALLELE_NUM': '2'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
     # Membership test instead of `has_key`, which exists only on Python 2
     # dicts.
     self.assertNotIn('CSQ', proc_var.non_alt_info)
     self.assertEqual(
         counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ALLELE_NUM_INCORRECT.value].get_value(), 4)
コード例 #14
0
 def create_accumulator(self):
     # type: () -> vcf_header_io.VcfHeader
     """Returns a new `VcfHeader` built with no arguments."""
     empty_header = vcf_header_io.VcfHeader()
     return empty_header
コード例 #15
0
 def create_accumulator(self):
     """Returns a new `VcfHeader` built with no arguments."""
     empty_header = vcf_header_io.VcfHeader()
     return empty_header