def test_generate_header_fields_from_schema_none_mode(self):
        """Checks header generation for schema fields that declare no mode.

        Two single-field schemas are converted: one named 'field' and one
        named 'AA' (the name used by the reserved-field tests). Each is
        expected to yield one info created with number 1, type 'String' and
        the schema description, together with empty formats.
        """
        for field_name in ('field', 'AA'):
            table_schema = bigquery.TableSchema()
            string_field = bigquery.TableFieldSchema(
                name=field_name,
                type=bigquery_util.TableFieldConstants.TYPE_STRING,
                description='desc')
            table_schema.fields.append(string_field)

            actual_header = (
                schema_converter.generate_header_fields_from_schema(
                    table_schema))

            expected_info = createInfo(field_name, 1, 'String', 'desc', None,
                                       None)
            expected_header = vcf_header_io.VcfHeader(
                infos=OrderedDict([(field_name, expected_info)]),
                formats=OrderedDict())
            self.assertEqual(actual_header, expected_header)
Example #2
0
    def _get_sample_variant_and_header_with_csq(self, additional_infos=None):
        """Builds a sample `Variant` with CSQ annotations and a `VcfHeader`.

        Args:
          additional_infos: Optional list of (key, `Info`) tuples that are
            written into the header infos after the default entries; a key
            that already exists is overwritten.

        Returns:
          A `(variant, header_fields)` tuple.
        """
        sample = self._get_sample_variant()
        sample.info['CSQ'] = [
            'A|C1|I1|S1|G1',
            'TT|C2|I2|S2|G2',
            'A|C3|I3|S3|G3',
        ]

        header_infos = OrderedDict()
        header_infos['A1'] = createInfo('A1', 1, '.', 'desc', None, None)
        header_infos['A2'] = createInfo('A2', 'A', '.', 'desc', None, None)
        header_infos['CSQ'] = createInfo(
            'CSQ', '.', '.',
            'some desc Allele|Consequence|IMPACT|SYMBOL|Gene', None, None)

        if additional_infos is not None:
            for info_key, info_value in additional_infos:
                header_infos[info_key] = info_value

        return sample, vcf_header_io.VcfHeader(infos=header_infos)
    def test_add_info_fields_reserved_field(self):
        """Checks `_add_info_fields` for a nullable String field named 'AA'.

        A non-empty schema description is expected to be kept as is; an empty
        one is expected to be replaced by 'Ancestral allele'.
        """
        cases = [
            # (description in the schema, description expected in the info)
            ('bigquery desc', 'bigquery desc'),
            ('', 'Ancestral allele'),
        ]
        for schema_desc, expected_desc in cases:
            aa_field = bigquery.TableFieldSchema(
                name='AA',
                type=bigquery_util.TableFieldConstants.TYPE_STRING,
                mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                description=schema_desc)
            collected = OrderedDict()
            schema_converter._add_info_fields(aa_field, collected)

            expected_info = createInfo('AA', 1, 'String', expected_desc, None,
                                       None)
            self.assertEqual(collected, OrderedDict([('AA', expected_info)]))
Example #4
0
    def test_infer_annotation_pipeline(self):
        """Runs `InferHeaderFields` over two variants carrying CSQ annotations.

        Only annotation types are inferred (`infer_headers=False`). Across the
        two variants the Gene values include non-integer numbers, the
        Position values include non-numeric strings, and the Score values are
        numeric or empty, so the expected inferred types are Float, String
        and Float respectively.
        """
        anno_fields = ['CSQ']
        header = self._get_sample_header_fields(with_annotation=True)
        variant1 = self._get_sample_variant_1()
        variant1.info['CSQ'] = [
            'A|1|100|1.2', 'A|2|101|1.3', 'A|12|start|0', 'TT|13|end|7'
        ]
        variant2 = self._get_sample_variant_1()
        variant2.info['CSQ'] = [
            'A|1|100|', 'A|2|101|', 'A|1.2|102|0', 'TT|1.3|103|7'
        ]
        desc = 'Inferred type field for annotation {}.'
        expected = vcf_header_io.VcfHeader(
            infos={
                'CSQ_Gene_TYPE':
                createInfo('CSQ_Gene_TYPE', 1, 'Float', desc.format('Gene')),
                'CSQ_Position_TYPE':
                createInfo('CSQ_Position_TYPE', 1, 'String',
                           desc.format('Position')),
                'CSQ_Score_TYPE':
                createInfo('CSQ_Score_TYPE', 1, 'Float', desc.format('Score'))
            })

        # Leaving the `with` block runs the pipeline and waits for it, so an
        # explicit `p.run()` here would execute the pipeline a second time.
        with TestPipeline() as p:
            inferred_headers = (
                p
                | Create([variant1, variant2])
                | 'InferAnnotationTypes' >> infer_headers.InferHeaderFields(
                    defined_headers=header,
                    infer_headers=False,
                    annotation_fields_to_infer=anno_fields))
            assert_that(inferred_headers,
                        asserts.header_fields_equal_ignore_order([expected]))
Example #5
0
 def test_variant_merger_modify_schema(self):
   """Checks schema generation when a variant merger strategy is supplied.

   Besides the header-derived fields, the expected info fields include
   'ADDED_BY_MERGER'.
   """
   header_infos = OrderedDict()
   header_infos['I1'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
   header_infos['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')
   header_formats = OrderedDict()
   header_formats['F1'] = createFormat('F1', 1, 'String', 'desc')
   header_fields = vcf_header_io.VcfHeader(
       infos=header_infos, formats=header_formats)

   expected_fields = self._generate_expected_fields(
       alt_fields=['IA'],
       call_fields=['F1'],
       info_fields=['I1', 'ADDED_BY_MERGER'])
   variant_factory = processed_variant.ProcessedVariantFactory(header_fields)
   actual_schema = schema_converter.generate_schema_from_header_fields(
       header_fields,
       variant_factory,
       variant_merger=_DummyVariantMergeStrategy())

   self._validate_schema(expected_fields, actual_schema)
Example #6
0
  def test_generate_header_fields_from_schema(self):
    """Checks the header generated from the sample BigQuery table schema."""
    actual_header = schema_converter.generate_header_fields_from_schema(
        bigquery_schema_util.get_sample_table_schema())

    # (id, number, type) of every expected info, in header order.
    info_specs = [
        ('AF', 'A', 'Float'),
        ('AA', 1, 'String'),
        ('IFR', '.', 'Float'),
        ('IS', 1, 'String'),
    ]
    expected_infos = OrderedDict(
        (info_id, createInfo(info_id, number, info_type, 'desc', None, None))
        for info_id, number, info_type in info_specs)

    expected_formats = OrderedDict()
    expected_formats['FB'] = createFormat('FB', 1, 'String', 'desc')
    expected_formats['GQ'] = createFormat('GQ', 1, 'Integer', 'desc')

    self.assertEqual(
        actual_header,
        vcf_header_io.VcfHeader(infos=expected_infos,
                                formats=expected_formats))
    def test_report_multiple_files(self):
        """Checks the conflict report when a definition comes from two files.

        'NS' is defined as (1, 'Float') in file1 and file2 and as
        (1, 'Integer') in file3, while the resolved header uses Float.
        """
        definitions = VcfHeaderDefinitions()
        definitions._infos = {
            'NS': {
                Definition(1, 'Float'): ['file1', 'file2'],
                Definition(1, 'Integer'): ['file3'],
            }
        }

        ns_info = createInfo('NS', 1, 'Float', 'Number samples', None, None)
        resolved = VcfHeader(infos=OrderedDict([('NS', ns_info)]))

        delimiter = preprocess_reporter._DELIMITER
        report_rows = [
            ['NS', 'INFO', 'num=1 type=Float', 'file1', 'num=1 type=Float\n'],
            [' ', ' ', ' ', 'file2', ' \n'],
            [' ', ' ', 'num=1 type=Integer', 'file3', ' \n'],
        ]
        expected_lines = [
            preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
            preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
        ]
        expected_lines.extend(delimiter.join(row) for row in report_rows)
        expected_lines.append('\n')

        self._generate_report_and_assert_contents_equal(
            expected_lines, definitions, resolved)
Example #8
0
 def test_infer_annotation_types_with_multiple_annotation_fields(self):
     """Checks annotation type inference for two annotation fields at once.

     'CSQ' and 'CSQ_VT' carry the same annotation values, so both are
     expected to yield Integer for Gene and Position and Float for Score.
     """
     annotation_fields = ['CSQ', 'CSQ_VT']
     header_infos = self._get_annotation_infos()
     header_infos['CSQ_VT'] = createInfo(
         'CSQ_VT', 'A', 'String',
         'Annotations from VEP. Format: Allele|Gene|Position|Score',
         'source', 'v')

     sample = self._get_sample_variant_1()
     sample.info['CSQ_VT'] = ['A|1|100|1.2', 'A|2|101|1.3']
     sample.info['CSQ'] = ['A|1|100|1.2', 'A|2|101|1.3']

     actual = infer_headers_util.infer_info_fields(
         sample, vcf_header_io.VcfHeader(infos=header_infos), False,
         annotation_fields)

     sub_field_types = (('Gene', 'Integer'),
                        ('Position', 'Integer'),
                        ('Score', 'Float'))
     expected = {}
     for annotation in annotation_fields:
         for sub_field, inferred_type in sub_field_types:
             key = '{}_{}_TYPE'.format(annotation, sub_field)
             expected[key] = self._get_inferred_info(
                 annotation, sub_field, inferred_type)

     self.assertDictEqual(expected, actual)
    def test_report_conflicted_and_inferred_headers(self):
        """Checks a report holding both a conflict and an inferred header.

        'NS' has conflicting INFO definitions in file1 and file2; the 'DP'
        FORMAT field is present in both the resolved and inferred headers.
        """
        definitions = VcfHeaderDefinitions()
        definitions._infos = {
            'NS': {
                Definition(1, 'Float'): ['file1'],
                Definition(1, 'Integer'): ['file2'],
            }
        }

        ns_info = createInfo('NS', 1, 'Float', 'Number samples', None, None)
        dp_format = createFormat('DP', 2, 'Float', 'Total Depth')
        info_map = OrderedDict([('NS', ns_info)])
        format_map = OrderedDict([('DP', dp_format)])
        resolved = VcfHeader(infos=info_map, formats=format_map)
        inferred = VcfHeader(formats=format_map)

        delimiter = preprocess_reporter._DELIMITER
        conflict_rows = [
            ['NS', 'INFO', 'num=1 type=Float', 'file1', 'num=1 type=Float\n'],
            [' ', ' ', 'num=1 type=Integer', 'file2', ' \n'],
        ]
        expected_lines = [
            preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
            preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
        ]
        expected_lines.extend(delimiter.join(row) for row in conflict_rows)
        expected_lines.append('\n')
        expected_lines.append(
            preprocess_reporter._InconsistencyType.INFERRED_HEADERS + '\n')
        expected_lines.append(
            preprocess_reporter._HeaderLine.INFERRED_FIELD_HEADER + '\n')
        expected_lines.append(
            delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']))
        expected_lines.append('\n')

        self._generate_report_and_assert_contents_equal(
            expected_lines, definitions, resolved, inferred)
Example #10
0
 def test_infer_annotation_types_with_multiple_annotation_fields(self):
     """Checks `_InferHeaderFields.process` with two annotation fields.

     Only the inferred 'type' of each resulting info is compared, followed
     by a check that no unexpected infos were produced.
     """
     annotation_fields = ['CSQ', 'CSQ_VT']
     csq_vt_info = createInfo(
         'CSQ_VT', 'A', 'String',
         'Annotations from VEP. Format: Allele|Gene|Position|Score',
         'source', 'v')
     header = self._get_sample_header_fields(
         with_annotation=[('CSQ_VT', csq_vt_info)])

     sample = self._get_sample_variant_1()
     sample.info['CSQ_VT'] = ['A|1|100|1.2', 'A|2|101|1.3']
     sample.info['CSQ'] = ['A|1|100|1.2', 'A|2|101|1.3']

     do_fn = infer_headers._InferHeaderFields(False, annotation_fields)
     # `process` yields its result; only the first yielded header is checked.
     inferred = next(do_fn.process(sample, header))

     expected_types = {
         'CSQ_Gene_TYPE': 'Integer',
         'CSQ_Position_TYPE': 'Integer',
         'CSQ_Score_TYPE': 'Float',
         'CSQ_VT_Gene_TYPE': 'Integer',
         'CSQ_VT_Position_TYPE': 'Integer',
         'CSQ_VT_Score_TYPE': 'Float',
     }
     for info_key, info in inferred.infos.items():
         self.assertEqual(info['type'], expected_types[info_key])
     self.assertEqual(len(expected_types), len(inferred.infos))
    def test_add_info_fields_reserved_field_schema_compatibility(self):
        """Checks how `_add_info_fields` treats conflicting 'AA' definitions.

        An Integer-typed 'AA' and a repeated-mode 'AA' are both expected to
        raise `ValueError`. With `allow_incompatible_schema=True` the
        repeated-mode field is expected to be accepted with number '.'.
        """
        constants = bigquery_util.TableFieldConstants
        integer_aa = bigquery.TableFieldSchema(
            name='AA',
            type=constants.TYPE_INTEGER,
            mode=constants.MODE_NULLABLE,
            description='desc')
        repeated_aa = bigquery.TableFieldSchema(
            name='AA',
            type=constants.TYPE_STRING,
            mode=constants.MODE_REPEATED,
            description='desc')

        for conflicting_field in (integer_aa, repeated_aa):
            with self.assertRaises(ValueError):
                schema_converter._add_info_fields(conflicting_field,
                                                  OrderedDict())

        collected = OrderedDict()
        schema_converter._add_info_fields(repeated_aa,
                                          collected,
                                          allow_incompatible_schema=True)
        expected_info = createInfo('AA', '.', 'String', 'desc', None, None)
        self.assertEqual(collected, OrderedDict([('AA', expected_info)]))
Example #12
0
 def test_create_alt_bases_field_schema_types(self):
     """Checks the types of the CSQ sub-fields in the alt-bases schema.

     Extra `CSQ_<name>_TYPE` infos are added to the sample header, and the
     sub-fields of the resulting 'CSQ' record are compared against the
     expected BigQuery type names.
     """
     type_info_specs = [
         ('CSQ_Allele_TYPE', 'String'),
         ('CSQ_Consequence_TYPE', 'Integer'),
         ('CSQ_IMPACT_TYPE', 'Integer'),
         ('CSQ_SYMBOL_TYPE', 'Float'),
     ]
     extra_infos = [
         (info_id, createInfo(info_id, 1, info_type, 'desc', None, None))
         for info_id, info_type in type_info_specs
     ]
     _, header_fields = self._get_sample_variant_and_header_with_csq(
         additional_infos=extra_infos)

     # The sample header uses '.' as a type placeholder; switch those
     # entries to 'String' before building the factory.
     for header_info in header_fields.infos.values():
         if header_info['type'] == '.':
             header_info['type'] = 'String'

     variant_factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'])
     alt_schema = variant_factory.create_alt_bases_field_schema()
     csq_records = [
         candidate for candidate in alt_schema.fields
         if candidate.name == 'CSQ'
     ]
     csq_record = csq_records[0]

     expected_type_by_name = {
         'CSQ': 'RECORD',
         'allele': 'STRING',
         'Consequence': 'INTEGER',
         'IMPACT': 'INTEGER',
         'SYMBOL': 'FLOAT',
         'Gene': 'STRING',
     }
     for sub_field in csq_record.fields:
         self.assertEqual(sub_field.type,
                          expected_type_by_name[sub_field.name])
Example #13
0
    def test_header_fields_inferred_from_two_variants(self):
        """Runs `InferHeaderFields` over two variants with no defined headers.

        With `defined_headers=None` and `infer_headers=True`, the infos and
        formats listed below are expected to be inferred, all with empty
        descriptions.
        """
        # Leaving the `with` block runs the pipeline and waits for it, so an
        # explicit `p.run()` here would execute the pipeline a second time.
        with TestPipeline() as p:
            variant_1 = self._get_sample_variant_1()
            variant_2 = self._get_sample_variant_2()
            inferred_headers = (
                p
                | Create([variant_1, variant_2])
                | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                    defined_headers=None, infer_headers=True))

            expected_infos = {
                'IS': createInfo('IS', 1, 'String', ''),
                'ISI': createInfo('ISI', 1, 'Integer', ''),
                'ISF': createInfo('ISF', 1, 'Float', ''),
                'IF': createInfo('IF', 1, 'Float', ''),
                'IB': createInfo('IB', 0, 'Flag', ''),
                'IA': createInfo('IA', '.', 'Integer', ''),
                'IS_2': createInfo('IS_2', 1, 'String', '')
            }
            expected_formats = {
                'FI': createFormat('FI', 1, 'Integer', ''),
                'FU': createFormat('FU', '.', 'Float', ''),
                'FI_2': createFormat('FI_2', 1, 'Integer', '')
            }

            expected = vcf_header_io.VcfHeader(infos=expected_infos,
                                               formats=expected_formats)
            assert_that(inferred_headers,
                        asserts.header_fields_equal_ignore_order([expected]))
Example #14
0
    def _get_sample_header_fields(self, with_annotation=False):
        """Builds a sample `VcfHeader` with info and format fields.

        Args:
          with_annotation: Either a bool or a list of (key, `Info`) tuples.
            Any truthy value adds the 'CSQ' annotation info; a list
            additionally contributes its own entries.

        Returns:
          A `VcfHeader` holding the sample infos and formats.
        """
        # NOTE(review): the 'IS' and 'IB' entries are created with id 'I1',
        # and 'IB' is a Flag with number 1 -- possibly a copy-paste slip.
        # Kept unchanged here; confirm against the tests that use this header.
        header_infos = OrderedDict()
        header_infos['IS'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
        header_infos['ISI'] = createInfo('ISI', 1, 'Integer', 'desc', 'src',
                                         'v')
        header_infos['ISF'] = createInfo('ISF', 1, 'Float', 'desc', 'src', 'v')
        header_infos['IF'] = createInfo('IF', 1, 'Float', 'desc', 'src', 'v')
        header_infos['IB'] = createInfo('I1', 1, 'Flag', 'desc', 'src', 'v')
        header_infos['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src',
                                        'v')

        if with_annotation:
            header_infos['CSQ'] = createInfo(
                'CSQ', '.', 'String',
                'Annotations from VEP. Format: Allele|Gene|Position|Score',
                'src', 'v')
            if isinstance(with_annotation, list):
                for extra_key, extra_info in with_annotation:
                    header_infos[extra_key] = extra_info

        header_formats = OrderedDict()
        header_formats['FS'] = createFormat('FS', 1, 'String', 'desc')
        header_formats['FI'] = createFormat('FI', 2, 'Integer', 'desc')
        header_formats['FU'] = createFormat('FU', '.', 'Float', 'desc')
        header_formats['GT'] = createFormat('GT', 2, 'Integer',
                                            'Special GT key')
        header_formats['PS'] = createFormat('PS', 1, 'Integer',
                                            'Special PS key')

        return vcf_header_io.VcfHeader(infos=header_infos,
                                       formats=header_formats)
Example #15
0
  def test_vcf_header_to_schema_to_vcf_header(self):
    """Checks that header -> schema -> header reproduces the original header.

    Every info and format in the starting header uses number '.'.
    """
    info_types = [('I1', 'String'), ('IA', 'Integer')]
    format_types = [('F1', 'String'), ('F2', 'Integer'), ('FU', 'Float')]
    header_infos = OrderedDict(
        (info_id, createInfo(info_id, '.', info_type, 'desc', None, None))
        for info_id, info_type in info_types)
    header_formats = OrderedDict(
        (format_id, createFormat(format_id, '.', format_type, 'desc'))
        for format_id, format_type in format_types)
    original_header = vcf_header_io.VcfHeader(infos=header_infos,
                                              formats=header_formats)

    variant_factory = processed_variant.ProcessedVariantFactory(
        original_header)
    intermediate_schema = schema_converter.generate_schema_from_header_fields(
        original_header, variant_factory)
    round_tripped_header = schema_converter.generate_header_fields_from_schema(
        intermediate_schema)

    self.assertEqual(original_header, round_tripped_header)
    def test_info_header_fields(self):
        """Checks schema generation from a header that only defines infos.

        The schema is validated twice: with the default factory (where 'IA'
        and 'IA2' are passed as `alt_fields`) and with
        `split_alternate_allele_info_fields=False` (where every info except
        'END' is passed as `info_fields`). The type and mode of each info
        field in the second schema are then verified.
        """
        info_specs = [
            ('I1', 1, 'String', 'desc'),
            ('I2', 2, 'Integer', 'desc'),
            ('IA', 'A', 'Float', 'desc'),
            ('IU', '.', 'Character', 'desc'),
            ('IG', 'G', 'String', 'desc'),
            ('I0', 0, 'Flag', 'desc'),
            ('IA2', 'A', 'Float', 'desc'),
            # END should not be included in the generated schema.
            ('END', 1, 'Integer', 'Special END key'),
        ]
        header_infos = OrderedDict(
            (info_id, createInfo(info_id, number, info_type, desc, 'src', 'v'))
            for info_id, number, info_type, desc in info_specs)
        header_fields = vcf_header_io.VcfHeader(infos=header_infos)

        expected_split = self._generate_expected_fields(
            alt_fields=['IA', 'IA2'],
            info_fields=['I1', 'I2', 'IU', 'IG', 'I0'])
        split_schema = schema_converter.generate_schema_from_header_fields(
            header_fields,
            processed_variant.ProcessedVariantFactory(header_fields))
        self._validate_schema(expected_split, split_schema)

        # Test with split_alternate_allele_info_fields=False.
        unsplit_factory = processed_variant.ProcessedVariantFactory(
            header_fields, split_alternate_allele_info_fields=False)
        actual_schema = schema_converter.generate_schema_from_header_fields(
            header_fields, unsplit_factory)
        self._validate_schema(
            self._generate_expected_fields(
                info_fields=['I1', 'I2', 'IA', 'IU', 'IG', 'I0', 'IA2']),
            actual_schema)

        # Verify types and modes.
        nullable = TableFieldConstants.MODE_NULLABLE
        repeated = TableFieldConstants.MODE_REPEATED
        expected_type_modes = {
            'I1': (TableFieldConstants.TYPE_STRING, nullable),
            'I2': (TableFieldConstants.TYPE_INTEGER, repeated),
            'IA': (TableFieldConstants.TYPE_FLOAT, repeated),
            'IU': (TableFieldConstants.TYPE_STRING, repeated),
            'IG': (TableFieldConstants.TYPE_STRING, repeated),
            'I0': (TableFieldConstants.TYPE_BOOLEAN, nullable),
            'IA2': (TableFieldConstants.TYPE_FLOAT, repeated),
        }
        for schema_field in actual_schema.fields:
            if schema_field.name not in expected_type_modes:
                continue
            wanted_type, wanted_mode = expected_type_modes[schema_field.name]
            self.assertEqual(wanted_type, schema_field.type)
            self.assertEqual(wanted_mode, schema_field.mode)
    def test_add_info_fields_from_alternate_bases_reserved_field(self):
        """Checks `_add_info_fields` for 'AF' nested in alternate bases.

        The resulting info is expected to have number 'A' and type 'Float'.
        A non-empty schema description is expected to be kept; an empty one
        is expected to be replaced by the allele-frequency text below.
        """
        constants = bigquery_util.TableFieldConstants

        def build_alternate_bases_record(af_description):
            # Repeated alternate-bases record holding one nullable 'AF' float.
            record = bigquery.TableFieldSchema(
                name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
                type=constants.TYPE_RECORD,
                mode=constants.MODE_REPEATED,
                description='One record for each alternate base (if any).')
            record.fields.append(
                bigquery.TableFieldSchema(
                    name='AF',
                    type=constants.TYPE_FLOAT,
                    mode=constants.MODE_NULLABLE,
                    description=af_description))
            return record

        cases = [
            ('bigquery desc', 'bigquery desc'),
            ('',
             'Allele frequency for each ALT allele in the same order '
             'as listed (estimated from primary data, not called genotypes'),
        ]
        for schema_desc, expected_desc in cases:
            collected = OrderedDict()
            schema_converter._add_info_fields(
                build_alternate_bases_record(schema_desc), collected)
            expected_info = createInfo('AF', 'A', 'Float', expected_desc,
                                       None, None)
            self.assertEqual(collected, OrderedDict([('AF', expected_info)]))
Example #18
0
 def test_create_processed_variant_annotation_alt_allele_num(self):
   """Checks annotation-to-ALT matching when `use_allele_num=True`.

   The variant has two alternates ('T' and 'CT') and six CSQ annotations
   that all name 'T' as their allele. Only the annotations with ALLELE_NUM
   '1' and '2' are expected to be attached, one to each alternate; the
   remaining four are expected to be counted as incorrect.
   """
   csq_info = createInfo(
       None, '.', '.', 'some desc Allele|Consequence|IMPACT|ALLELE_NUM',
       source=None, version=None)
   header_fields = vcf_header_io.VcfHeader(infos={'CSQ': csq_info})

   # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
   # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
   # But because there is ALLELE_NUM there should be no ambiguity.
   # The last four annotations have incorrect ALLELE_NUMs.
   annotations = [
       'T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
       'T|C5|I5|TEST', 'T|C6|I6|',
   ]
   variant = vcfio.Variant(
       reference_name='19', start=11, end=12, reference_bases='C',
       # The following represent a SNV and an insertion, resp.
       alternate_bases=['T', 'CT'],
       names=['rs1'], quality=2,
       filters=['PASS'],
       info={'CSQ': annotations})

   counter_factory = _CounterSpyFactory()
   factory = processed_variant.ProcessedVariantFactory(
       header_fields,
       split_alternate_allele_info_fields=True,
       annotation_fields=['CSQ'],
       use_allele_num=True,
       minimal_match=True,  # This should be ignored by the factory method.
       counter_factory=counter_factory)
   proc_var = factory.create_processed_variant(variant)

   def expected_alternate(alt_bases, consequence, impact, allele_num):
     # Alternate expected to carry exactly one CSQ annotation.
     alternate = processed_variant.AlternateBaseData(alt_bases)
     alternate._info = {
         'CSQ': [{
             annotation_parser.ANNOTATION_ALT: 'T',
             'Consequence': consequence,
             'IMPACT': impact,
             'ALLELE_NUM': allele_num,
         }]
     }
     return alternate

   expected_alternates = [
       expected_alternate('T', 'C1', 'I1', '1'),
       expected_alternate('CT', 'C2', 'I2', '2'),
   ]
   self.assertEqual(proc_var.alternate_data_list, expected_alternates)
   self.assertNotIn('CSQ', proc_var.non_alt_info)

   expected_counts = [
       (CEnum.VARIANT, 1),
       (CEnum.ANNOTATION_ALT_MATCH, 2),
       (CEnum.ANNOTATION_ALT_MISMATCH, 0),
       (CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS, 0),
       (CEnum.ALLELE_NUM_INCORRECT, 4),
   ]
   for counter_name, expected_value in expected_counts:
     counter = counter_factory.counter_map[counter_name.value]
     self.assertEqual(counter.get_value(), expected_value)
 def test_add_info_fields_non_reserved_field(self):
     """Checks `_add_info_fields` for a field named 'non_reserved_info'.

     The field's empty description is expected to be kept as an empty
     string in the resulting info.
     """
     field_name = 'non_reserved_info'
     schema_field = bigquery.TableFieldSchema(
         name=field_name,
         type=bigquery_util.TableFieldConstants.TYPE_STRING,
         mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
         description='')

     collected = OrderedDict()
     schema_converter._add_info_fields(schema_field, collected)

     expected_info = createInfo(field_name, 1, 'String', '', None, None)
     self.assertEqual(collected, OrderedDict([(field_name, expected_info)]))
Example #20
0
 def test_info_and_format_header_fields(self):
   """Checks schema generation from a header with infos and formats.

   'GT' and 'PS' are defined as formats but are left out of the expected
   call fields.
   """
   header_infos = OrderedDict()
   header_infos['I1'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
   header_infos['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')

   # GT and PS should not be set as they're already included in special
   # 'genotype' and 'phaseset' fields.
   format_specs = [
       ('F1', 1, 'String', 'desc'),
       ('F2', 2, 'Integer', 'desc'),
       ('FU', '.', 'Float', 'desc'),
       ('GT', 2, 'Integer', 'Special GT key'),
       ('PS', 1, 'Integer', 'Special PS key'),
   ]
   header_formats = OrderedDict(
       (format_id, createFormat(format_id, number, format_type, desc))
       for format_id, number, format_type, desc in format_specs)
   header_fields = vcf_header_io.VcfHeader(
       infos=header_infos, formats=header_formats)

   expected_fields = self._generate_expected_fields(
       alt_fields=['IA'],
       call_fields=['F1', 'F2', 'FU'],
       info_fields=['I1'])
   actual_schema = schema_converter.generate_schema_from_header_fields(
       header_fields,
       processed_variant.ProcessedVariantFactory(header_fields))
   self._validate_schema(expected_fields, actual_schema)
 def test_bigquery_field_name_sanitize(self):
     """Checks the field names produced for awkward header ids.

     The expected names replace '-', '*' and '^' with '_' and prefix ids
     that start with '_' or a digit with 'field_'.
     """
     header_infos = OrderedDict()
     header_infos['_'] = createInfo('_', 1, 'String', 'desc', 'src', 'v')
     header_infos['_A'] = createInfo('_A', 1, 'String', 'desc', 'src', 'v')
     header_infos['0a'] = createInfo('0a', 1, 'String', 'desc', 'src', 'v')
     header_infos['A-B*C'] = createInfo('A-B*C', 1, 'String', 'desc', 'src',
                                        'v')
     header_infos['I-A'] = createInfo('I-A', 'A', 'Float', 'desc', 'src', 'v')
     header_infos['OK_info_09'] = createInfo('OK_info_09', 1, 'String',
                                             'desc')

     header_formats = OrderedDict()
     header_formats['a^b'] = createFormat('a^b', 1, 'String', 'desc')
     header_formats['OK_format_09'] = createFormat('OK_format_09', 1,
                                                   'String', 'desc')

     header_fields = vcf_header_io.VcfHeader(infos=header_infos,
                                             formats=header_formats)

     expected_fields = self._generate_expected_fields(
         alt_fields=['I_A'],
         call_fields=['a_b', 'OK_format_09'],
         info_fields=[
             'field__', 'field__A', 'field_0a', 'A_B_C', 'OK_info_09'
         ])
     actual_schema = schema_converter.generate_schema_from_header_fields(
         header_fields,
         processed_variant.ProcessedVariantFactory(header_fields))
     self._validate_schema(expected_fields, actual_schema)
Example #22
0
 def _get_annotation_infos(self):
     """Returns an `OrderedDict` of sample infos headed by a 'CSQ' annotation.

     The 'IS' and 'IB' entries are created with id 'I1', as in the
     original fixture.
     """
     annotation_infos = OrderedDict()
     annotation_infos['CSQ'] = createInfo(
         'CSQ', '.', 'String',
         'Annotations from VEP. Format: Allele|Gene|Position|Score',
         'src', 'v')
     annotation_infos['IS'] = createInfo('I1', 1, 'String', 'desc', 'src',
                                         'v')
     annotation_infos['ISI'] = createInfo('ISI', 1, 'Integer', 'desc', 'src',
                                          'v')
     annotation_infos['ISF'] = createInfo('ISF', 1, 'Float', 'desc', 'src',
                                          'v')
     annotation_infos['IF'] = createInfo('IF', 1, 'Float', 'desc', 'src', 'v')
     annotation_infos['IB'] = createInfo('I1', 1, 'Flag', 'desc', 'src', 'v')
     annotation_infos['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src',
                                         'v')
     return annotation_infos
    def test_generate_header_fields_from_schema_invalid_description(self):
        """Checks header generation for a description containing a newline.

        The newline in the schema description is expected to appear as a
        single space in the generated info description.
        """
        field_name = 'invalid_description'
        table_schema = bigquery.TableSchema()
        multiline_field = bigquery.TableFieldSchema(
            name=field_name,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Desc\nThis is added intentionally.')
        table_schema.fields.append(multiline_field)

        actual_header = schema_converter.generate_header_fields_from_schema(
            table_schema)

        expected_info = createInfo(field_name, 1, 'String',
                                   'Desc This is added intentionally.', None,
                                   None)
        expected_header = vcf_header_io.VcfHeader(
            infos=OrderedDict([(field_name, expected_info)]),
            formats=OrderedDict())
        self.assertEqual(actual_header, expected_header)
Example #24
0
 def test_infer_info_fields_combined_conflicts(self):
     """Checks inference with a missing info and a mismatched one.

     'IF' is not in the defined infos and 'IA' is defined as ('A',
     'Integer'); the inferred result is expected to hold 'IF' as
     (1, 'Float') and 'IA' as ('.', 'Float').
     """
     sample = self._get_sample_variant_info_ia_cardinality_mismatch()
     defined_specs = [
         ('IS', 1, 'String'),
         ('ISI', 1, 'Integer'),
         ('ISF', 1, 'Float'),
         ('IB', 0, 'Flag'),
         ('IA', 'A', 'Integer'),
     ]
     defined_infos = {
         info_id: createInfo(info_id, number, info_type, '')
         for info_id, number, info_type in defined_specs
     }

     actual = infer_headers_util.infer_info_fields(
         sample,
         vcf_header_io.VcfHeader(infos=defined_infos),
         infer_headers=True)

     expected = {
         'IF': createInfo('IF', 1, 'Float', ''),
         'IA': createInfo('IA', '.', 'Float', ''),
     }
     self.assertEqual(expected, actual)
Example #25
0
 def test_infer_mismatched_info_field_correct_type_list(self):
     """Checks the corrected definition for a mismatched 'IA' info.

     'IA' is defined as ('.', 'Integer'); the corrected info is expected
     to be ('.', 'Float').
     """
     sample = self._get_sample_variant_info_ia_float_in_list()
     defined_specs = [
         ('IS', 1, 'String'),
         ('ISI', 1, 'Integer'),
         ('ISF', 1, 'Float'),
         ('IF', 1, 'Float'),
         ('IB', 0, 'Flag'),
         ('IA', '.', 'Integer'),
     ]
     defined_infos = {
         info_id: createInfo(info_id, number, info_type, '')
         for info_id, number, info_type in defined_specs
     }

     ia_value = sample.info.get('IA')
     ia_definition = vcf_header_io.VcfHeader(
         infos=defined_infos).infos.get('IA')
     alternate_count = len(sample.alternate_bases)
     corrected = infer_headers_util._infer_mismatched_info_field(
         'IA', ia_value, ia_definition, alternate_count)

     self.assertEqual(createInfo('IA', '.', 'Float', ''), corrected)
Example #26
0
 def test_add_info_fields_from_alternate_bases_non_reserved_field(self):
   """Checks `_add_info_fields` on an alternate-bases record sub-field.

   A nullable FLOAT sub-field named 'non_reserved' is expected to produce an
   INFO entry with num 'A', type 'Float' and the BigQuery description.
   """
   constants = bigquery_util.TableFieldConstants
   alt_record = bigquery.TableFieldSchema(
       name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
       type=constants.TYPE_RECORD,
       mode=constants.MODE_REPEATED,
       description='One record for each alternate base (if any).')
   sub_field = bigquery.TableFieldSchema(
       name='non_reserved',
       type=constants.TYPE_FLOAT,
       mode=constants.MODE_NULLABLE,
       description='bigquery desc')
   alt_record.fields.append(sub_field)

   # `_add_info_fields` fills the mapping passed to it in place.
   collected = OrderedDict()
   schema_converter._add_info_fields(alt_record, collected)

   wanted = OrderedDict()
   wanted['non_reserved'] = createInfo(
       'non_reserved', 'A', 'Float', 'bigquery desc', None, None)
   self.assertEqual(collected, wanted)
Example #27
0
 def test_infer_info_fields_no_conflicts(self):
     """Checks that nothing is inferred for the first sample variant.

     With the six INFO fields declared below, `infer_info_fields` is
     expected to return an empty dict.
     """
     sample = self._get_sample_variant_1()
     # (id, num, type) for every INFO field declared in the header.
     declared = [
         ('IS', 1, 'String'),
         ('ISI', 1, 'Integer'),
         ('ISF', 1, 'Float'),
         ('IF', 1, 'Float'),
         ('IB', 0, 'Flag'),
         ('IA', 'A', 'Float'),
     ]
     header = vcf_header_io.VcfHeader(infos={
         key: createInfo(key, num, kind, '') for key, num, kind in declared
     })

     result = infer_headers_util.infer_info_fields(
         sample, header, infer_headers=True)
     self.assertEqual({}, result)
Example #28
0
  def test_generate_header_fields_from_schema_schema_compatibility(self):
    """Checks handling of an 'AA' column declared as a nullable INTEGER.

    By default the conversion is expected to raise ValueError. With
    `allow_incompatible_schema=True` it is expected to return a header whose
    'AA' INFO entry has num 1 and type 'Integer'.
    """
    aa_column = bigquery.TableFieldSchema(
        name='AA',
        type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
        mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
        description='desc')
    conflicting_schema = bigquery.TableSchema()
    conflicting_schema.fields.append(aa_column)

    # Default mode: the conflicting schema must be rejected.
    self.assertRaises(
        ValueError,
        schema_converter.generate_header_fields_from_schema,
        conflicting_schema)

    # Permissive mode: the schema's own definition is accepted as-is.
    actual_header = schema_converter.generate_header_fields_from_schema(
        conflicting_schema, allow_incompatible_schema=True)

    wanted_infos = OrderedDict()
    wanted_infos['AA'] = createInfo('AA', 1, 'Integer', 'desc', None, None)
    wanted_header = vcf_header_io.VcfHeader(
        infos=wanted_infos, formats=OrderedDict())
    self.assertEqual(actual_header, wanted_header)
    def test_report_no_conflicts(self):
        """Checks the report contents when no header conflicts are expected.

        The expected lines, the header definitions and the resolved headers
        are handed to `_generate_report_and_assert_contents_equal`, which
        presumably builds the report and compares it with the expected lines.
        """
        definitions = VcfHeaderDefinitions()
        definitions._infos = {'NS': {Definition(1, 'Float'): ['file1']}}
        definitions._formats = {'NS': {Definition(1, 'Float'): ['file2']}}

        resolved_infos = OrderedDict()
        resolved_infos['NS'] = createInfo(
            'NS', 1, 'Integer', 'Number samples', None, None)
        resolved_formats = OrderedDict()
        resolved_formats['NS'] = createFormat(
            'NS', 1, 'Float', 'Number samples')
        resolved = VcfHeader(infos=resolved_infos, formats=resolved_formats)

        report_lines = ['No Header Conflicts Found.\n', '\n']
        self._generate_report_and_assert_contents_equal(
            report_lines, definitions, resolved)
Example #30
0
    def test_defined_fields_filtered_two_variants(self):
        """Checks that only undeclared fields are inferred from two variants.

        The sample header is passed as a singleton side input. The inferred
        header is expected to hold just the 'IS_2' INFO and 'FI_2' FORMAT
        definitions, i.e. fields not already covered by the sample header.
        """
        with TestPipeline() as p:
            known_headers = self._get_sample_header_fields()
            headers_pcoll = p | 'vcf_header' >> Create([known_headers])
            first_variant = self._get_sample_variant_1()
            second_variant = self._get_sample_variant_2()

            variants = p | Create([first_variant, second_variant])
            inferred = variants | 'InferHeaderFields' >> (
                infer_headers.InferHeaderFields(
                    pvalue.AsSingleton(headers_pcoll),
                    infer_headers=True))

            wanted = vcf_header_io.VcfHeader(
                infos={'IS_2': createInfo('IS_2', 1, 'String', '')},
                formats={'FI_2': createFormat('FI_2', 1, 'Integer', '')})
            assert_that(
                inferred,
                asserts.header_fields_equal_ignore_order([wanted]))
            # NOTE(review): the pipeline is run explicitly here even though it
            # is used as a context manager -- confirm whether this is needed.
            p.run()