def test_bigquery_field_name_sanitize(self):
    """Checks the schema built from headers whose names need sanitizing.

    The expected schema uses 'field__', 'field__A', 'field_0a', 'A_B_C',
    'I_A' and 'a_b' for the headers '_', '_A', '0a', 'A-B*C', 'I-A' and
    'a^b'; the two 'OK_*' names are expected to stay as they are.
    """
    info_headers = OrderedDict()
    for key in ('_', '_A', '0a', 'A-B*C'):
        info_headers[key] = createInfo(key, 1, 'String', 'desc', 'src', 'v')
    info_headers['I-A'] = createInfo('I-A', 'A', 'Float', 'desc', 'src', 'v')
    info_headers['OK_info_09'] = createInfo(
        'OK_info_09', 1, 'String', 'desc')

    format_headers = OrderedDict()
    for key in ('a^b', 'OK_format_09'):
        format_headers[key] = createFormat(key, 1, 'String', 'desc')

    header = vcf_header_io.VcfHeader(
        infos=info_headers, formats=format_headers)
    expected_fields = self._generate_expected_fields(
        alt_fields=['I_A'],
        call_fields=['a_b', 'OK_format_09'],
        info_fields=[
            'field__', 'field__A', 'field_0a', 'A_B_C', 'OK_info_09'
        ])
    actual_schema = schema_converter.generate_schema_from_header_fields(
        header, processed_variant.ProcessedVariantFactory(header))
    self._validate_schema(expected_fields, actual_schema)
Beispiel #2
0
    def test_header_fields_inferred_from_two_variants(self):
        """Infers headers from two sample variants with no defined headers.

        `InferHeaderFields` is applied with `defined_headers=None` and
        `infer_headers=True`; the resulting header is compared, ignoring
        order, with the expected info and format definitions below.
        """
        expected_infos = {
            'IS': createInfo('IS', 1, 'String', ''),
            'ISI': createInfo('ISI', 1, 'Integer', ''),
            'ISF': createInfo('ISF', 1, 'Float', ''),
            'IF': createInfo('IF', 1, 'Float', ''),
            'IB': createInfo('IB', 0, 'Flag', ''),
            'IA': createInfo('IA', '.', 'Integer', ''),
            'IS_2': createInfo('IS_2', 1, 'String', '')
        }
        expected_formats = {
            'FI': createFormat('FI', 1, 'Integer', ''),
            'FU': createFormat('FU', '.', 'Float', ''),
            'FI_2': createFormat('FI_2', 1, 'Integer', '')
        }
        expected = vcf_header_io.VcfHeader(infos=expected_infos,
                                           formats=expected_formats)

        with TestPipeline() as p:
            variants = [
                self._get_sample_variant_1(),
                self._get_sample_variant_2(),
            ]
            inferred_headers = (
                p
                | Create(variants)
                | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                    defined_headers=None, infer_headers=True))
            assert_that(inferred_headers,
                        asserts.header_fields_equal_ignore_order([expected]))
            # No explicit p.run() here: leaving the `with` block already runs
            # the pipeline, so calling run() as well executed it twice.
Beispiel #3
0
  def test_add_format_fields_reserved_field(self):
    """Checks `_add_format_fields` for a 'GQ' column in the calls record.

    Two cases are exercised: a 'GQ' column described as 'bigquery desc' is
    expected to keep that description, and one with an empty description is
    expected to come back as 'Conditional genotype quality'.
    """
    cases = [
        ('bigquery desc', 'bigquery desc'),
        ('', 'Conditional genotype quality'),
    ]
    for column_desc, expected_desc in cases:
      calls_record = bigquery.TableFieldSchema(
          name=bigquery_util.ColumnKeyConstants.CALLS,
          type=bigquery_util.TableFieldConstants.TYPE_RECORD,
          mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
          description='One record for each call.')
      gq_column = bigquery.TableFieldSchema(
          name='GQ',
          type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
          mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
          description=column_desc)
      calls_record.fields.append(gq_column)

      actual_formats = OrderedDict()
      schema_converter._add_format_fields(calls_record, actual_formats)

      wanted_formats = OrderedDict()
      wanted_formats['GQ'] = createFormat('GQ', 1, 'Integer', expected_desc)
      self.assertEqual(actual_formats, wanted_formats)
Beispiel #4
0
 def test_infer_mismatched_format_field(self):
     """Checks that an Integer 'FI' format is corrected to Float.

     The value comes from the first call of the sample variant returned by
     `_get_sample_variant_format_fi_float_value`.
     """
     sample = self._get_sample_variant_format_fi_float_value()
     fi_value = sample.calls[0].info.get('FI')
     defined_fi = createFormat('FI', 2, 'Integer', 'desc')
     actual = infer_headers_util._infer_mismatched_format_field(
         'FI', fi_value, defined_fi)
     self.assertEqual(createFormat('FI', 2, 'Float', 'desc'), actual)
Beispiel #5
0
    def _get_sample_header_fields(self, with_annotation=False):
        """Builds a small `VcfHeader` containing info and format fields.

        Args:
          with_annotation: Either a bool or a list of tuples. Any truthy
            value adds a 'CSQ' info field; when it is a list, every
            `(key, Info)` tuple in it is added to the infos as well.

        Returns:
          A `vcf_header_io.VcfHeader` built from the infos and formats.
        """
        # NOTE(review): the 'IS' and 'IB' entries both carry the id 'I1', and
        # 'IB' is a Flag with number 1 -- confirm this is intentional.
        info_specs = [
            ('IS', 'I1', 1, 'String'),
            ('ISI', 'ISI', 1, 'Integer'),
            ('ISF', 'ISF', 1, 'Float'),
            ('IF', 'IF', 1, 'Float'),
            ('IB', 'I1', 1, 'Flag'),
            ('IA', 'IA', 'A', 'Integer'),
        ]
        infos = OrderedDict()
        for key, field_id, number, field_type in info_specs:
            infos[key] = createInfo(
                field_id, number, field_type, 'desc', 'src', 'v')

        if with_annotation:
            infos['CSQ'] = createInfo(
                'CSQ', '.', 'String',
                'Annotations from VEP. Format: Allele|Gene|Position|Score',
                'src', 'v')
            if isinstance(with_annotation, list):
                for extra_key, extra_info in with_annotation:
                    infos[extra_key] = extra_info

        format_specs = [
            ('FS', 1, 'String', 'desc'),
            ('FI', 2, 'Integer', 'desc'),
            ('FU', '.', 'Float', 'desc'),
            ('GT', 2, 'Integer', 'Special GT key'),
            ('PS', 1, 'Integer', 'Special PS key'),
        ]
        formats = OrderedDict()
        for name, number, field_type, desc in format_specs:
            formats[name] = createFormat(name, number, field_type, desc)
        return vcf_header_io.VcfHeader(infos=infos, formats=formats)
Beispiel #6
0
    def test_infer_format_fields_no_conflicts(self):
        """Expects an empty result when the formats below are predefined.

        `infer_format_fields` is run on sample variant 1 against a header
        defining FS, FI, FU, GT and PS.
        """
        format_specs = [
            ('FS', 1, 'String', 'desc'),
            ('FI', 2, 'Integer', 'desc'),
            ('FU', '.', 'Float', 'desc'),
            ('GT', 2, 'Integer', 'Special GT key'),
            ('PS', 1, 'Integer', 'Special PS key'),
        ]
        sample = self._get_sample_variant_1()
        defined_formats = OrderedDict()
        for name, number, field_type, desc in format_specs:
            defined_formats[name] = createFormat(
                name, number, field_type, desc)

        defined_header = vcf_header_io.VcfHeader(formats=defined_formats)
        inferred = infer_headers_util.infer_format_fields(
            sample, defined_header)
        self.assertEqual({}, inferred)
Beispiel #7
0
  def test_generate_header_fields_from_schema(self):
    """Compares the header generated from the sample table schema.

    The schema comes from `bigquery_schema_util.get_sample_table_schema()`;
    the expected header holds the AF, AA, IFR and IS infos and the FB and GQ
    formats, in that order.
    """
    actual_header = schema_converter.generate_header_fields_from_schema(
        bigquery_schema_util.get_sample_table_schema())

    wanted_infos = OrderedDict()
    wanted_infos['AF'] = createInfo('AF', 'A', 'Float', 'desc', None, None)
    wanted_infos['AA'] = createInfo('AA', 1, 'String', 'desc', None, None)
    wanted_infos['IFR'] = createInfo('IFR', '.', 'Float', 'desc', None, None)
    wanted_infos['IS'] = createInfo('IS', 1, 'String', 'desc', None, None)

    wanted_formats = OrderedDict()
    wanted_formats['FB'] = createFormat('FB', 1, 'String', 'desc')
    wanted_formats['GQ'] = createFormat('GQ', 1, 'Integer', 'desc')

    wanted_header = vcf_header_io.VcfHeader(
        infos=wanted_infos, formats=wanted_formats)
    self.assertEqual(actual_header, wanted_header)
    def test_add_format_fields_reserved_field_schema_compatibility(self):
        """Checks handling of a 'GQ' call column typed as STRING.

        `generate_header_fields_from_schema` is expected to raise
        `ValueError` for such a schema, while `_add_format_fields` called
        with `allow_incompatible_schema=True` is expected to produce a
        String 'GQ' format.
        """
        field_constants = bigquery_util.TableFieldConstants
        conflicting_schema = bigquery.TableSchema()
        calls_record = bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.CALLS,
            type=field_constants.TYPE_RECORD,
            mode=field_constants.MODE_REPEATED,
            description='One record for each call.')
        gq_column = bigquery.TableFieldSchema(
            name='GQ',
            type=field_constants.TYPE_STRING,
            mode=field_constants.MODE_NULLABLE,
            description='desc')
        calls_record.fields.append(gq_column)
        conflicting_schema.fields.append(calls_record)

        with self.assertRaises(ValueError):
            schema_converter.generate_header_fields_from_schema(
                conflicting_schema)

        permissive_formats = OrderedDict()
        schema_converter._add_format_fields(
            calls_record, permissive_formats, allow_incompatible_schema=True)
        wanted_formats = OrderedDict()
        wanted_formats['GQ'] = createFormat('GQ', 1, 'String', 'desc')
        self.assertEqual(permissive_formats, wanted_formats)
    def test_report_conflicted_and_inferred_headers(self):
        """Expects both a conflicts section and an inferred-headers section.

        'NS' is defined as Float in 'file1' and as Integer in 'file2'; the
        'DP' format is passed in as an inferred header.
        """
        definitions = VcfHeaderDefinitions()
        ns_definitions = {
            Definition(1, 'Float'): ['file1'],
            Definition(1, 'Integer'): ['file2']
        }
        definitions._infos = {'NS': ns_definitions}

        infos = OrderedDict()
        infos['NS'] = createInfo('NS', 1, 'Float', 'Number samples',
                                 None, None)
        formats = OrderedDict()
        formats['DP'] = createFormat('DP', 2, 'Float', 'Total Depth')
        resolved = VcfHeader(infos=infos, formats=formats)
        inferred = VcfHeader(formats=formats)

        delimiter = preprocess_reporter._DELIMITER
        expected_lines = [
            preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
            preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
            delimiter.join([
                'NS', 'INFO', 'num=1 type=Float', 'file1',
                'num=1 type=Float\n'
            ]),
            delimiter.join([' ', ' ', 'num=1 type=Integer', 'file2', ' \n']),
            '\n',
            preprocess_reporter._InconsistencyType.INFERRED_HEADERS + '\n',
            preprocess_reporter._HeaderLine.INFERRED_FIELD_HEADER + '\n',
            delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']),
            '\n',
        ]
        self._generate_report_and_assert_contents_equal(
            expected_lines, definitions, resolved, inferred)
Beispiel #10
0
  def test_vcf_header_to_schema_to_vcf_header(self):
    """Round-trips a header through a schema and back.

    The header is converted with `generate_schema_from_header_fields`, the
    schema is converted back with `generate_header_fields_from_schema`, and
    the result is expected to equal the starting header.
    """
    info_fields = OrderedDict()
    for name, field_type in (('I1', 'String'), ('IA', 'Integer')):
      info_fields[name] = createInfo(
          name, '.', field_type, 'desc', None, None)

    format_fields = OrderedDict()
    for name, field_type in (
        ('F1', 'String'), ('F2', 'Integer'), ('FU', 'Float')):
      format_fields[name] = createFormat(name, '.', field_type, 'desc')

    source_header = vcf_header_io.VcfHeader(
        infos=info_fields, formats=format_fields)
    variant_factory = processed_variant.ProcessedVariantFactory(source_header)
    table_schema = schema_converter.generate_schema_from_header_fields(
        source_header, variant_factory)
    round_tripped = schema_converter.generate_header_fields_from_schema(
        table_schema)

    self.assertEqual(source_header, round_tripped)
Beispiel #11
0
 def test_info_and_format_header_fields(self):
   """Checks the schema generated from plain info and format headers.

   The expected schema has 'IA' as an alt field, 'I1' as an info field, and
   'F1', 'F2' and 'FU' as call fields.
   """
   info_headers = OrderedDict()
   info_headers['I1'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
   info_headers['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')

   # GT and PS are not expected among the call fields because the special
   # 'genotype' and 'phaseset' fields already cover them.
   format_specs = [
       ('F1', 1, 'String', 'desc'),
       ('F2', 2, 'Integer', 'desc'),
       ('FU', '.', 'Float', 'desc'),
       ('GT', 2, 'Integer', 'Special GT key'),
       ('PS', 1, 'Integer', 'Special PS key'),
   ]
   format_headers = OrderedDict()
   for name, number, field_type, desc in format_specs:
     format_headers[name] = createFormat(name, number, field_type, desc)

   header = vcf_header_io.VcfHeader(
       infos=info_headers, formats=format_headers)
   expected_fields = self._generate_expected_fields(
       alt_fields=['IA'],
       call_fields=['F1', 'F2', 'FU'],
       info_fields=['I1'])
   actual_schema = schema_converter.generate_schema_from_header_fields(
       header, processed_variant.ProcessedVariantFactory(header))
   self._validate_schema(expected_fields, actual_schema)
Beispiel #12
0
 def test_variant_merger_modify_schema(self):
   """Checks the schema generated when a variant merger is supplied.

   With `_DummyVariantMergeStrategy` passed as `variant_merger`, the
   expected info fields are 'I1' plus 'ADDED_BY_MERGER'.
   """
   info_headers = OrderedDict()
   info_headers['I1'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
   info_headers['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')
   format_headers = OrderedDict()
   format_headers['F1'] = createFormat('F1', 1, 'String', 'desc')

   header = vcf_header_io.VcfHeader(
       infos=info_headers, formats=format_headers)
   expected_fields = self._generate_expected_fields(
       alt_fields=['IA'],
       call_fields=['F1'],
       info_fields=['I1', 'ADDED_BY_MERGER'])
   actual_schema = schema_converter.generate_schema_from_header_fields(
       header,
       processed_variant.ProcessedVariantFactory(header),
       variant_merger=_DummyVariantMergeStrategy())
   self._validate_schema(expected_fields, actual_schema)
    def test_report_inferred_headers_only(self):
        """Expects a no-conflicts line followed by the inferred 'DP' format.

        The header definitions are left empty and only `inferred_headers`
        is passed to the report helper.
        """
        definitions = VcfHeaderDefinitions()
        inferred_formats = OrderedDict()
        inferred_formats['DP'] = createFormat(
            'DP', 2, 'Float', 'Total Depth')
        inferred = VcfHeader(formats=inferred_formats)

        delimiter = preprocess_reporter._DELIMITER
        expected_lines = [
            'No Header Conflicts Found.\n',
            '\n',
            preprocess_reporter._InconsistencyType.INFERRED_HEADERS + '\n',
            preprocess_reporter._HeaderLine.INFERRED_FIELD_HEADER + '\n',
            delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']),
            '\n',
        ]
        self._generate_report_and_assert_contents_equal(
            expected_lines, definitions, inferred_headers=inferred)
    def test_report_no_conflicts(self):
        """Expects only the no-conflicts line in the report.

        'NS' has a single Float definition among the infos (from 'file1')
        and a single Float definition among the formats (from 'file2').
        """
        definitions = VcfHeaderDefinitions()
        definitions._infos = {
            'NS': {
                Definition(1, 'Float'): ['file1']
            }
        }
        definitions._formats = {'NS': {Definition(1, 'Float'): ['file2']}}

        resolved_infos = OrderedDict()
        resolved_infos['NS'] = createInfo(
            'NS', 1, 'Integer', 'Number samples', None, None)
        resolved_formats = OrderedDict()
        resolved_formats['NS'] = createFormat(
            'NS', 1, 'Float', 'Number samples')
        resolved = VcfHeader(infos=resolved_infos, formats=resolved_formats)

        expected_lines = ['No Header Conflicts Found.\n', '\n']
        self._generate_report_and_assert_contents_equal(
            expected_lines, definitions, resolved)
Beispiel #15
0
 def test_infer_format_fields_combined_conflicts(self):
     """Expects a corrected 'FI' and a newly inferred 'FU' format.

     The defined header holds FS, FI (Integer), GT and PS but no 'FU'; the
     expected result has 'FI' as Float and 'FU' with an empty description.
     """
     format_specs = [
         ('FS', 1, 'String', 'desc'),
         ('FI', 2, 'Integer', 'desc'),
         ('GT', 2, 'Integer', 'Special GT key'),
         ('PS', 1, 'Integer', 'Special PS key'),
     ]
     sample = self._get_sample_variant_format_fi_float_value()
     defined_formats = OrderedDict()
     for name, number, field_type, desc in format_specs:
         defined_formats[name] = createFormat(
             name, number, field_type, desc)

     defined_header = vcf_header_io.VcfHeader(formats=defined_formats)
     actual = infer_headers_util.infer_format_fields(sample, defined_header)
     wanted = {
         'FI': createFormat('FI', 2, 'Float', 'desc'),
         'FU': createFormat('FU', '.', 'Float', '')
     }
     self.assertEqual(wanted, actual)
Beispiel #16
0
    def test_defined_fields_filtered_two_variants(self):
        """Expects only fields missing from the defined headers to be inferred.

        The sample header fields are supplied as a singleton side input; the
        expected result contains just the 'IS_2' info and 'FI_2' format.
        """
        expected = vcf_header_io.VcfHeader(
            infos={'IS_2': createInfo('IS_2', 1, 'String', '')},
            formats={'FI_2': createFormat('FI_2', 1, 'Integer', '')})

        # Only the INFO and FORMAT fields of the first variant are already
        # defined in the header section of the VCF files.
        with TestPipeline() as p:
            vcf_headers = self._get_sample_header_fields()
            vcf_headers_side_input = p | 'vcf_header' >> Create([vcf_headers])
            variants = [
                self._get_sample_variant_1(),
                self._get_sample_variant_2(),
            ]
            inferred_headers = (
                p
                | Create(variants)
                | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                    pvalue.AsSingleton(vcf_headers_side_input),
                    infer_headers=True))
            assert_that(inferred_headers,
                        asserts.header_fields_equal_ignore_order([expected]))
            # No explicit p.run() here: leaving the `with` block already runs
            # the pipeline, so calling run() as well executed it twice.
Beispiel #17
0
    def test_pipeline(self):
        """Runs `InferHeaderFields` against defined headers on two variants.

        The transform is applied with `allow_incompatible_records=True` and
        `infer_headers=True`. The expected header holds 'IA' and 'IF' infos
        and 'FI' and 'FU' formats, compared ignoring order.
        """
        infos = {
            'IS': createInfo('IS', 1, 'String', ''),
            'ISI': createInfo('ISI', 1, 'Integer', ''),
            'ISF': createInfo('ISF', 1, 'Float', ''),
            'IB': createInfo('IB', 0, 'Flag', ''),
            'IA': createInfo('IA', 'A', 'Integer', '')
        }
        formats = OrderedDict([
            ('FS', createFormat('FS', 1, 'String', 'desc')),
            ('FI', createFormat('FI', 2, 'Integer', 'desc')),
            ('GT', createFormat('GT', 2, 'Integer', 'Special GT key')),
            ('PS', createFormat('PS', 1, 'Integer', 'Special PS key'))
        ])
        expected_infos = {
            'IA': createInfo('IA', '.', 'Float', ''),
            'IF': createInfo('IF', 1, 'Float', '')
        }
        expected_formats = {
            'FI': createFormat('FI', 2, 'Float', 'desc'),
            'FU': createFormat('FU', '.', 'Float', '')
        }
        expected = vcf_header_io.VcfHeader(infos=expected_infos,
                                           formats=expected_formats)

        with TestPipeline() as p:
            variant_1 = self._get_sample_variant_info_ia_cardinality_mismatch()
            variant_2 = self._get_sample_variant_format_fi_float_value()
            inferred_headers = (
                p
                | Create([variant_1, variant_2])
                | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                    defined_headers=vcf_header_io.VcfHeader(infos=infos,
                                                            formats=formats),
                    allow_incompatible_records=True,
                    infer_headers=True))
            assert_that(inferred_headers,
                        asserts.header_fields_equal_ignore_order([expected]))
            # No explicit p.run() here: leaving the `with` block already runs
            # the pipeline, so calling run() as well executed it twice.