def test_bigquery_field_name_sanitize(self):
    """Check the schema field names generated for unusual header keys.

    The header uses keys such as '_', '_A', '0a', 'A-B*C', 'I-A' and 'a^b'.
    The expected schema names them 'field__', 'field__A', 'field_0a',
    'A_B_C', 'I_A' and 'a_b', while 'OK_info_09' and 'OK_format_09' are
    expected unchanged.
    """
    info_headers = OrderedDict()
    for raw_key in ('_', '_A', '0a', 'A-B*C'):
        info_headers[raw_key] = createInfo(
            raw_key, 1, 'String', 'desc', 'src', 'v')
    # 'I-A' has number 'A' and is expected among the alternate-base fields.
    info_headers['I-A'] = createInfo('I-A', 'A', 'Float', 'desc', 'src', 'v')
    info_headers['OK_info_09'] = createInfo('OK_info_09', 1, 'String', 'desc')

    format_headers = OrderedDict(
        (raw_key, createFormat(raw_key, 1, 'String', 'desc'))
        for raw_key in ('a^b', 'OK_format_09'))

    header_fields = vcf_header_io.VcfHeader(
        infos=info_headers, formats=format_headers)

    expected_fields = self._generate_expected_fields(
        alt_fields=['I_A'],
        call_fields=['a_b', 'OK_format_09'],
        info_fields=['field__', 'field__A', 'field_0a', 'A_B_C',
                     'OK_info_09'])
    actual_schema = schema_converter.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields))
    self._validate_schema(expected_fields, actual_schema)
def test_header_fields_inferred_from_two_variants(self):
    """Infer header fields from two sample variants without defined headers.

    Runs `InferHeaderFields` with `defined_headers=None` over both sample
    variants and expects a single `VcfHeader` holding the INFO fields
    IS, ISI, ISF, IF, IB, IA, IS_2 and the FORMAT fields FI, FU, FI_2.
    """
    with TestPipeline() as p:
        variant_1 = self._get_sample_variant_1()
        variant_2 = self._get_sample_variant_2()
        inferred_headers = (
            p
            | Create([variant_1, variant_2])
            | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                defined_headers=None, infer_headers=True))

        expected_infos = {
            'IS': createInfo('IS', 1, 'String', ''),
            'ISI': createInfo('ISI', 1, 'Integer', ''),
            'ISF': createInfo('ISF', 1, 'Float', ''),
            'IF': createInfo('IF', 1, 'Float', ''),
            'IB': createInfo('IB', 0, 'Flag', ''),
            'IA': createInfo('IA', '.', 'Integer', ''),
            'IS_2': createInfo('IS_2', 1, 'String', ''),
        }
        expected_formats = {
            'FI': createFormat('FI', 1, 'Integer', ''),
            'FU': createFormat('FU', '.', 'Float', ''),
            'FI_2': createFormat('FI_2', 1, 'Integer', ''),
        }
        expected = vcf_header_io.VcfHeader(infos=expected_infos,
                                           formats=expected_formats)

        assert_that(inferred_headers,
                    asserts.header_fields_equal_ignore_order([expected]))
        # No explicit p.run() here: leaving the `with` block already runs
        # the pipeline, so calling run() as well would execute it twice.
def test_add_format_fields_reserved_field(self):
    """Check the description `_add_format_fields` yields for a 'GQ' field.

    Two cases are exercised against a calls record with an integer,
    nullable 'GQ' sub-field:
      * description 'bigquery desc' -> the format keeps 'bigquery desc';
      * empty description -> the format is expected to carry
        'Conditional genotype quality'.
    """

    def build_calls_record(gq_description):
        # Repeated calls record with a single 'GQ' sub-field.
        record = bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.CALLS,
            type=bigquery_util.TableFieldConstants.TYPE_RECORD,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description='One record for each call.')
        record.fields.append(bigquery.TableFieldSchema(
            name='GQ',
            type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description=gq_description))
        return record

    # (description stored on the schema field, description expected back)
    cases = (
        ('bigquery desc', 'bigquery desc'),
        ('', 'Conditional genotype quality'),
    )
    for schema_description, expected_description in cases:
        calls_record = build_calls_record(schema_description)
        actual_formats = OrderedDict()
        schema_converter._add_format_fields(calls_record, actual_formats)
        wanted_formats = OrderedDict([
            ('GQ', createFormat('GQ', 1, 'Integer', expected_description)),
        ])
        self.assertEqual(actual_formats, wanted_formats)
def test_infer_mismatched_format_field(self):
    """Expect 'FI' declared as Integer to be corrected to Float.

    The 'FI' value is taken from the first call of the sample variant and
    passed with a definition of number 2 and type 'Integer'; the corrected
    format is expected to keep number 2 and description 'desc' but have
    type 'Float'.
    """
    sample = self._get_sample_variant_format_fi_float_value()
    fi_value = sample.calls[0].info.get('FI')
    declared_format = createFormat('FI', 2, 'Integer', 'desc')

    corrected_format = infer_headers_util._infer_mismatched_format_field(
        'FI', fi_value, declared_format)

    self.assertEqual(createFormat('FI', 2, 'Float', 'desc'),
                     corrected_format)
def _get_sample_header_fields(self, with_annotation=False):
    """Build a small `VcfHeader` fixture with INFO and FORMAT fields.

    Args:
      with_annotation: Either a bool or a list of (key, `Info`) tuples.
        Any truthy value adds a 'CSQ' info entry; when the value is a
        list, each (key, `Info`) tuple is stored in the infos as well.

    Returns:
      A `vcf_header_io.VcfHeader` built from the infos and formats below.
    """
    # NOTE(review): the 'IS' and 'IB' entries pass 'I1' as the first
    # createInfo argument, unlike the other entries where it equals the
    # key -- confirm whether this is intentional.
    infos = OrderedDict()
    infos['IS'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
    infos['ISI'] = createInfo('ISI', 1, 'Integer', 'desc', 'src', 'v')
    infos['ISF'] = createInfo('ISF', 1, 'Float', 'desc', 'src', 'v')
    infos['IF'] = createInfo('IF', 1, 'Float', 'desc', 'src', 'v')
    infos['IB'] = createInfo('I1', 1, 'Flag', 'desc', 'src', 'v')
    infos['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')

    if with_annotation:
        infos['CSQ'] = createInfo(
            'CSQ',
            '.',
            'String',
            'Annotations from VEP. Format: Allele|Gene|Position|Score',
            'src',
            'v')
        if isinstance(with_annotation, list):
            for extra_key, extra_info in with_annotation:
                infos[extra_key] = extra_info

    formats = OrderedDict()
    formats['FS'] = createFormat('FS', 1, 'String', 'desc')
    formats['FI'] = createFormat('FI', 2, 'Integer', 'desc')
    formats['FU'] = createFormat('FU', '.', 'Float', 'desc')
    formats['GT'] = createFormat('GT', 2, 'Integer', 'Special GT key')
    formats['PS'] = createFormat('PS', 1, 'Integer', 'Special PS key')

    return vcf_header_io.VcfHeader(infos=infos, formats=formats)
def test_infer_format_fields_no_conflicts(self):
    """Expect no inferred FORMAT fields for sample variant 1.

    The header passed in declares FS, FI, FU, GT and PS, and
    `infer_format_fields` is expected to return an empty dict.
    """
    sample = self._get_sample_variant_1()

    declared_formats = OrderedDict()
    declared_formats['FS'] = createFormat('FS', 1, 'String', 'desc')
    declared_formats['FI'] = createFormat('FI', 2, 'Integer', 'desc')
    declared_formats['FU'] = createFormat('FU', '.', 'Float', 'desc')
    declared_formats['GT'] = createFormat(
        'GT', 2, 'Integer', 'Special GT key')
    declared_formats['PS'] = createFormat(
        'PS', 1, 'Integer', 'Special PS key')

    declared_header = vcf_header_io.VcfHeader(formats=declared_formats)
    inferred = infer_headers_util.infer_format_fields(sample, declared_header)

    self.assertEqual({}, inferred)
def test_generate_header_fields_from_schema(self):
    """Compare the header generated from the sample table schema.

    The header produced by `generate_header_fields_from_schema` for the
    sample schema is expected to equal a `VcfHeader` with INFO fields
    AF, AA, IFR, IS and FORMAT fields FB, GQ.
    """
    table_schema = bigquery_schema_util.get_sample_table_schema()
    actual_header = schema_converter.generate_header_fields_from_schema(
        table_schema)

    # (key, number, type) for each expected INFO entry, in order.
    info_specs = (
        ('AF', 'A', 'Float'),
        ('AA', 1, 'String'),
        ('IFR', '.', 'Float'),
        ('IS', 1, 'String'),
    )
    wanted_infos = OrderedDict(
        (key, createInfo(key, number, value_type, 'desc', None, None))
        for key, number, value_type in info_specs)

    wanted_formats = OrderedDict()
    wanted_formats['FB'] = createFormat('FB', 1, 'String', 'desc')
    wanted_formats['GQ'] = createFormat('GQ', 1, 'Integer', 'desc')

    wanted_header = vcf_header_io.VcfHeader(infos=wanted_infos,
                                            formats=wanted_formats)
    self.assertEqual(actual_header, wanted_header)
def test_add_format_fields_reserved_field_schema_compatibility(self):
    """Check handling of a 'GQ' field whose schema type is STRING.

    `generate_header_fields_from_schema` is expected to raise ValueError
    for such a schema, whereas `_add_format_fields` called with
    `allow_incompatible_schema=True` is expected to yield a 'GQ' format of
    type 'String'.
    """
    gq_string_field = bigquery.TableFieldSchema(
        name='GQ',
        type=bigquery_util.TableFieldConstants.TYPE_STRING,
        mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
        description='desc')
    calls_record = bigquery.TableFieldSchema(
        name=bigquery_util.ColumnKeyConstants.CALLS,
        type=bigquery_util.TableFieldConstants.TYPE_RECORD,
        mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
        description='One record for each call.')
    calls_record.fields.append(gq_string_field)

    schema_conflict_format = bigquery.TableSchema()
    schema_conflict_format.fields.append(calls_record)

    # Default behaviour: the conversion is expected to fail.
    self.assertRaises(
        ValueError,
        schema_converter.generate_header_fields_from_schema,
        schema_conflict_format)

    # With allow_incompatible_schema=True the schema type is expected back.
    lenient_formats = OrderedDict()
    schema_converter._add_format_fields(
        calls_record, lenient_formats, allow_incompatible_schema=True)
    wanted_formats = OrderedDict(
        [('GQ', createFormat('GQ', 1, 'String', 'desc'))])
    self.assertEqual(lenient_formats, wanted_formats)
def test_report_conflicted_and_inferred_headers(self):
    """Expect a report with both a conflicts and an inferred-headers section.

    The INFO field 'NS' has two definitions (Float from 'file1', Integer
    from 'file2') and resolves to Float; the FORMAT field 'DP' is supplied
    as an inferred header.
    """
    definitions = VcfHeaderDefinitions()
    definitions._infos = {
        'NS': {
            Definition(1, 'Float'): ['file1'],
            Definition(1, 'Integer'): ['file2'],
        }
    }

    ns_infos = OrderedDict(
        [('NS', createInfo('NS', 1, 'Float', 'Number samples', None, None))])
    dp_formats = OrderedDict(
        [('DP', createFormat('DP', 2, 'Float', 'Total Depth'))])
    # The same formats mapping backs both the resolved and inferred headers.
    resolved = VcfHeader(infos=ns_infos, formats=dp_formats)
    inferred = VcfHeader(formats=dp_formats)

    delimiter = preprocess_reporter._DELIMITER
    conflict_section = [
        preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
        preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
        delimiter.join(
            ['NS', 'INFO', 'num=1 type=Float', 'file1',
             'num=1 type=Float\n']),
        delimiter.join([' ', ' ', 'num=1 type=Integer', 'file2', ' \n']),
        '\n',
    ]
    inferred_section = [
        preprocess_reporter._InconsistencyType.INFERRED_HEADERS + '\n',
        preprocess_reporter._HeaderLine.INFERRED_FIELD_HEADER + '\n',
        delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']),
        '\n',
    ]
    expected_report = conflict_section + inferred_section

    self._generate_report_and_assert_contents_equal(
        expected_report, definitions, resolved, inferred)
def test_vcf_header_to_schema_to_vcf_header(self):
    """Round-trip a `VcfHeader` through a schema and back.

    A header is converted with `generate_schema_from_header_fields` and the
    result is fed to `generate_header_fields_from_schema`; the header
    obtained is expected to equal the one the test started from.
    """
    source_infos = OrderedDict()
    source_infos['I1'] = createInfo('I1', '.', 'String', 'desc', None, None)
    source_infos['IA'] = createInfo('IA', '.', 'Integer', 'desc', None, None)

    source_formats = OrderedDict()
    source_formats['F1'] = createFormat('F1', '.', 'String', 'desc')
    source_formats['F2'] = createFormat('F2', '.', 'Integer', 'desc')
    source_formats['FU'] = createFormat('FU', '.', 'Float', 'desc')

    source_header = vcf_header_io.VcfHeader(infos=source_infos,
                                            formats=source_formats)

    intermediate_schema = schema_converter.generate_schema_from_header_fields(
        source_header,
        processed_variant.ProcessedVariantFactory(source_header))
    round_tripped = schema_converter.generate_header_fields_from_schema(
        intermediate_schema)

    self.assertEqual(source_header, round_tripped)
def test_info_and_format_header_fields(self):
    """Check the schema fields generated from INFO and FORMAT headers.

    'IA' (number 'A') is expected among the alternate-base fields, 'I1'
    among the info fields, and F1, F2, FU among the call fields.
    """
    info_headers = OrderedDict()
    info_headers['I1'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
    info_headers['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')

    # GT and PS must not show up as call fields: they are already covered
    # by the special 'genotype' and 'phaseset' fields.
    format_headers = OrderedDict()
    format_headers['F1'] = createFormat('F1', 1, 'String', 'desc')
    format_headers['F2'] = createFormat('F2', 2, 'Integer', 'desc')
    format_headers['FU'] = createFormat('FU', '.', 'Float', 'desc')
    format_headers['GT'] = createFormat('GT', 2, 'Integer', 'Special GT key')
    format_headers['PS'] = createFormat('PS', 1, 'Integer', 'Special PS key')

    header_fields = vcf_header_io.VcfHeader(infos=info_headers,
                                            formats=format_headers)

    expected_fields = self._generate_expected_fields(
        alt_fields=['IA'],
        call_fields=['F1', 'F2', 'FU'],
        info_fields=['I1'])
    actual_schema = schema_converter.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields))
    self._validate_schema(expected_fields, actual_schema)
def test_variant_merger_modify_schema(self):
    """Expect an extra info field when a variant merger is supplied.

    With `_DummyVariantMergeStrategy()` passed as `variant_merger`, the
    generated schema is expected to list 'ADDED_BY_MERGER' after 'I1' in
    its info fields.
    """
    info_headers = OrderedDict()
    info_headers['I1'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
    info_headers['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')

    format_headers = OrderedDict()
    format_headers['F1'] = createFormat('F1', 1, 'String', 'desc')

    header_fields = vcf_header_io.VcfHeader(infos=info_headers,
                                            formats=format_headers)

    expected_fields = self._generate_expected_fields(
        alt_fields=['IA'],
        call_fields=['F1'],
        info_fields=['I1', 'ADDED_BY_MERGER'])
    actual_schema = schema_converter.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields),
        variant_merger=_DummyVariantMergeStrategy())
    self._validate_schema(expected_fields, actual_schema)
def test_report_inferred_headers_only(self):
    """Expect a report with no conflicts and one inferred FORMAT header.

    The header definitions are left empty and only an inferred 'DP' FORMAT
    field is supplied, so the report should start with
    'No Header Conflicts Found.' followed by the inferred-headers section.
    """
    definitions = VcfHeaderDefinitions()

    dp_formats = OrderedDict()
    dp_formats['DP'] = createFormat('DP', 2, 'Float', 'Total Depth')
    inferred = VcfHeader(formats=dp_formats)

    delimiter = preprocess_reporter._DELIMITER
    expected_report = ['No Header Conflicts Found.\n', '\n']
    expected_report += [
        preprocess_reporter._InconsistencyType.INFERRED_HEADERS + '\n',
        preprocess_reporter._HeaderLine.INFERRED_FIELD_HEADER + '\n',
        delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']),
        '\n',
    ]

    self._generate_report_and_assert_contents_equal(
        expected_report, definitions, inferred_headers=inferred)
def test_report_no_conflicts(self):
    """Expect only the 'no conflicts' message in the report.

    'NS' has exactly one INFO definition (from 'file1') and one FORMAT
    definition (from 'file2'); the report is expected to contain only
    'No Header Conflicts Found.' and a blank line.
    """
    definitions = VcfHeaderDefinitions()
    definitions._infos = {
        'NS': {Definition(1, 'Float'): ['file1']},
    }
    definitions._formats = {
        'NS': {Definition(1, 'Float'): ['file2']},
    }

    ns_infos = OrderedDict()
    ns_infos['NS'] = createInfo(
        'NS', 1, 'Integer', 'Number samples', None, None)
    ns_formats = OrderedDict()
    ns_formats['NS'] = createFormat('NS', 1, 'Float', 'Number samples')
    resolved = VcfHeader(infos=ns_infos, formats=ns_formats)

    expected_report = ['No Header Conflicts Found.\n', '\n']
    self._generate_report_and_assert_contents_equal(
        expected_report, definitions, resolved)
def test_infer_format_fields_combined_conflicts(self):
    """Expect both a corrected and a newly inferred FORMAT field.

    The header declares FS, FI (Integer), GT and PS but not FU. For the
    sample variant the result is expected to contain 'FI' as Float (number
    and description kept) and 'FU' with number '.', type Float and an
    empty description.
    """
    sample = self._get_sample_variant_format_fi_float_value()

    declared_formats = OrderedDict()
    declared_formats['FS'] = createFormat('FS', 1, 'String', 'desc')
    declared_formats['FI'] = createFormat('FI', 2, 'Integer', 'desc')
    declared_formats['GT'] = createFormat(
        'GT', 2, 'Integer', 'Special GT key')
    declared_formats['PS'] = createFormat(
        'PS', 1, 'Integer', 'Special PS key')

    declared_header = vcf_header_io.VcfHeader(formats=declared_formats)
    inferred = infer_headers_util.infer_format_fields(sample, declared_header)

    wanted = {
        'FI': createFormat('FI', 2, 'Float', 'desc'),
        'FU': createFormat('FU', '.', 'Float', ''),
    }
    self.assertEqual(wanted, inferred)
def test_defined_fields_filtered_two_variants(self):
    """Expect only fields missing from the defined headers to be inferred.

    The sample header fields are passed to `InferHeaderFields` as a
    singleton side input. Only the INFO and FORMAT fields of the first
    variant are already defined there, so the inferred header is expected
    to contain just 'IS_2' (INFO) and 'FI_2' (FORMAT).
    """
    with TestPipeline() as p:
        vcf_headers = self._get_sample_header_fields()
        vcf_headers_side_input = p | 'vcf_header' >> Create([vcf_headers])
        variant_1 = self._get_sample_variant_1()
        variant_2 = self._get_sample_variant_2()
        inferred_headers = (
            p
            | Create([variant_1, variant_2])
            | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                pvalue.AsSingleton(vcf_headers_side_input),
                infer_headers=True))

        expected_infos = {'IS_2': createInfo('IS_2', 1, 'String', '')}
        expected_formats = {'FI_2': createFormat('FI_2', 1, 'Integer', '')}
        expected = vcf_header_io.VcfHeader(infos=expected_infos,
                                           formats=expected_formats)

        assert_that(inferred_headers,
                    asserts.header_fields_equal_ignore_order([expected]))
        # No explicit p.run() here: leaving the `with` block already runs
        # the pipeline, so calling run() as well would execute it twice.
def test_pipeline(self):
    """Run `InferHeaderFields` end to end with conflicting variants.

    The defined header declares INFO fields IS, ISI, ISF, IB, IA and FORMAT
    fields FS, FI, GT, PS. With `allow_incompatible_records=True`, the
    inferred header is expected to contain INFO 'IA' (number '.', Float)
    and 'IF', plus FORMAT 'FI' (Float) and 'FU'.
    """
    infos = {
        'IS': createInfo('IS', 1, 'String', ''),
        'ISI': createInfo('ISI', 1, 'Integer', ''),
        'ISF': createInfo('ISF', 1, 'Float', ''),
        'IB': createInfo('IB', 0, 'Flag', ''),
        'IA': createInfo('IA', 'A', 'Integer', ''),
    }
    formats = OrderedDict([
        ('FS', createFormat('FS', 1, 'String', 'desc')),
        ('FI', createFormat('FI', 2, 'Integer', 'desc')),
        ('GT', createFormat('GT', 2, 'Integer', 'Special GT key')),
        ('PS', createFormat('PS', 1, 'Integer', 'Special PS key')),
    ])

    with TestPipeline() as p:
        variant_1 = self._get_sample_variant_info_ia_cardinality_mismatch()
        variant_2 = self._get_sample_variant_format_fi_float_value()
        inferred_headers = (
            p
            | Create([variant_1, variant_2])
            | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                defined_headers=vcf_header_io.VcfHeader(infos=infos,
                                                        formats=formats),
                allow_incompatible_records=True,
                infer_headers=True))

        expected_infos = {
            'IA': createInfo('IA', '.', 'Float', ''),
            'IF': createInfo('IF', 1, 'Float', ''),
        }
        expected_formats = {
            'FI': createFormat('FI', 2, 'Float', 'desc'),
            'FU': createFormat('FU', '.', 'Float', ''),
        }
        expected = vcf_header_io.VcfHeader(infos=expected_infos,
                                           formats=expected_formats)

        assert_that(inferred_headers,
                    asserts.header_fields_equal_ignore_order([expected]))
        # No explicit p.run() here: leaving the `with` block already runs
        # the pipeline, so calling run() as well would execute it twice.