def deepvariant_header(contigs, sample_names):
    """Builds the VcfHeader proto DeepVariant uses when writing VCF output.

    The FILTER, INFO, FORMAT and extra header entries are the fixed set that
    the DeepVariant pipeline emits, so they are filled in here. Only `contigs`
    and `sample_names` depend on the input data, which is why the caller must
    supply them.

    Args:
      contigs: list(ContigInfo). Contigs on which variants were called.
      sample_names: list(str). Samples present in the run.

    Returns:
      A nucleus.genomics.v1.VcfHeader proto carrying the fixed header entries
      plus the supplied contigs and sample names.
    """
    version_extra = variants_pb2.VcfExtra(
        key='DeepVariant_version', value=DEEP_VARIANT_VERSION)

    # FILTER lines: the PASS entry comes from the reserved-field helper, the
    # other two are declared explicitly with their own descriptions.
    filter_fields = [
        vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_REF_FILTER,
            description='Genotyping model thinks this site is reference.'),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_QUAL_FILTER,
            description=('Confidence in this variant being real is below '
                         'calling threshold.')),
    ]

    # INFO lines.
    info_fields = [vcf_constants.reserved_info_field('END')]

    # FORMAT lines, in the order they are written to the header.
    reserved_format = vcf_constants.reserved_format_field
    format_fields = [
        reserved_format('GT'),
        reserved_format('GQ'),
        reserved_format('DP'),
        variants_pb2.VcfFormatInfo(
            id=DEEP_VARIANT_MIN_DP_FORMAT,
            number='1',
            type='Integer',
            description='Minimum DP observed within the GVCF block.'),
        reserved_format('AD'),
        variants_pb2.VcfFormatInfo(
            id=DEEP_VARIANT_VAF_FORMAT,
            number='A',
            type='Float',
            description='Variant allele fractions.'),
        reserved_format('PL'),
    ]

    return variants_pb2.VcfHeader(
        fileformat='VCFv4.2',
        filters=filter_fields,
        infos=info_fields,
        formats=format_fields,
        contigs=contigs,
        sample_names=sample_names,
        extras=[version_extra])
def test_vcf_header(self):
    """Checks the structured extras on the sites reader's header.

    The header exposed by `self.sites_reader` is expected to hold exactly two
    structured extras: an ALT entry for NON_REF at index 0 and a META entry
    for TESTMETA at index 1.
    """
    header = self.sites_reader.header

    def make_structured_extra(key, entry_id, description):
        # Both expected entries share the same ID/Description field layout.
        return variants_pb2.VcfStructuredExtra(
            key=key,
            fields=[
                variants_pb2.VcfExtra(key='ID', value=entry_id),
                variants_pb2.VcfExtra(key='Description', value=description),
            ])

    non_ref_alt = make_structured_extra(
        'ALT', 'NON_REF',
        'Represents any possible alternative allele at this location')
    test_meta = make_structured_extra('META', 'TESTMETA', 'blah')

    structured_extras = header.structured_extras
    self.assertLen(structured_extras, 2)
    self.assertEqual(structured_extras[1], test_meta)
    self.assertEqual(structured_extras[0], non_ref_alt)
def deepvariant_header(contigs,
                       sample_names,
                       add_info_candidates=False,
                       include_med_dp=True):
    """Builds the VcfHeader proto DeepVariant uses when writing VCF output.

    The FILTER, INFO, FORMAT and extra header entries are the fixed set that
    the DeepVariant pipeline emits, so they are filled in here. Only `contigs`
    and `sample_names` depend on the input data, which is why the caller must
    supply them. Two optional fields can be toggled through the flags below.

    Args:
      contigs: list(ContigInfo). Contigs on which variants were called.
      sample_names: list(str). Samples present in the run.
      add_info_candidates: If True, a 'CANDIDATES' INFO field is added; it is
        meant for debugging purposes.
      include_med_dp: boolean. If True, the MED_DP FORMAT field is included.

    Returns:
      A nucleus.genomics.v1.VcfHeader proto carrying the fixed header entries
      plus the supplied contigs and sample names.
    """
    version_extra = variants_pb2.VcfExtra(
        key='DeepVariant_version', value=DEEP_VARIANT_VERSION)

    # INFO lines that are always present.
    info_fields = [vcf_constants.reserved_info_field('END')]

    # FORMAT lines that are always present, in header order.
    reserved_format = vcf_constants.reserved_format_field
    formats = [
        reserved_format('GT'),
        reserved_format('GQ'),
        reserved_format('DP'),
        variants_pb2.VcfFormatInfo(
            id=DEEP_VARIANT_MIN_DP_FORMAT,
            number='1',
            type='Integer',
            description='Minimum DP observed within the GVCF block.'),
        reserved_format('AD'),
        variants_pb2.VcfFormatInfo(
            id=DEEP_VARIANT_VAF_FORMAT,
            number='A',
            type='Float',
            description='Variant allele fractions.'),
        reserved_format('PL'),
    ]

    # Optional INFO field, appended after END.
    if add_info_candidates:
        candidates_info = variants_pb2.VcfInfo(
            id='CANDIDATES',
            number='1',
            type=vcf_constants.STRING_TYPE,
            description='pipe-delimited candidate alleles.')
        info_fields.append(candidates_info)

    # Optional FORMAT field, appended after PL.
    if include_med_dp:
        med_dp_format = variants_pb2.VcfFormatInfo(
            id=DEEP_VARIANT_MED_DP_FORMAT,
            number='1',
            type='Integer',
            description=('Median DP observed within the GVCF block '
                         'rounded to the nearest integer.'))
        formats.append(med_dp_format)

    # FILTER lines: the PASS entry comes from the reserved-field helper, the
    # remaining ones are declared explicitly with their own descriptions.
    filter_fields = [
        vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_REF_FILTER,
            description='Genotyping model thinks this site is reference.'),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_QUAL_FILTER,
            description=('Confidence in this variant being real is below '
                         'calling threshold.')),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_NO_CALL,
            description='Site has depth=0 resulting in no call.'),
    ]

    return variants_pb2.VcfHeader(
        fileformat='VCFv4.2',
        filters=filter_fields,
        infos=info_fields,
        formats=formats,
        contigs=contigs,
        sample_names=sample_names,
        extras=[version_extra])