def deepvariant_header(contigs, sample_names):
  """Builds the VcfHeader that DeepVariant uses when writing VCF output.

  The FILTER, INFO, FORMAT and extra header entries are fixed by the
  DeepVariant pipeline and are filled in here. Only `contigs` and
  `sample_names` vary with the input data, so the caller must supply them.

  Args:
    contigs: list(ContigInfo). The contigs on which variants were called.
    sample_names: list(str). The samples present in the run.

  Returns:
    A nucleus.genomics.v1.VcfHeader proto holding the fixed header entries
    together with the given contigs and sample names.
  """
  reserved_format = vcf_constants.reserved_format_field

  # Recorded as an extra header entry with the key 'DeepVariant_version'.
  version_extra = variants_pb2.VcfExtra(
      key='DeepVariant_version', value=DEEP_VARIANT_VERSION)

  filter_fields = [
      vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_REF_FILTER,
          description='Genotyping model thinks this site is reference.'),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_QUAL_FILTER,
          description='Confidence in this variant being real is below '
          'calling threshold.'),
  ]

  info_fields = [vcf_constants.reserved_info_field('END')]

  # Order of the FORMAT entries: GT, GQ, DP, MIN_DP, AD, VAF, PL.
  format_fields = [reserved_format(tag) for tag in ('GT', 'GQ', 'DP')]
  format_fields.append(
      variants_pb2.VcfFormatInfo(
          id=DEEP_VARIANT_MIN_DP_FORMAT,
          number='1',
          type='Integer',
          description='Minimum DP observed within the GVCF block.'))
  format_fields.append(reserved_format('AD'))
  format_fields.append(
      variants_pb2.VcfFormatInfo(
          id=DEEP_VARIANT_VAF_FORMAT,
          number='A',
          type='Float',
          description='Variant allele fractions.'))
  format_fields.append(reserved_format('PL'))

  header = variants_pb2.VcfHeader(
      fileformat='VCFv4.2',
      filters=filter_fields,
      infos=info_fields,
      formats=format_fields,
      contigs=contigs,
      sample_names=sample_names,
      extras=[version_extra])
  return header
def deepvariant_header(contigs, sample_names):
  """Builds the VcfHeader that DeepVariant uses when writing VCF output.

  The FILTER, INFO and FORMAT header entries are fixed by the DeepVariant
  pipeline and are filled in here. Only `contigs` and `sample_names` vary
  with the input data, so the caller must supply them.

  Args:
    contigs: list(ContigInfo). The contigs on which variants were called.
    sample_names: list(str). The samples present in the run.

  Returns:
    A nucleus.genomics.v1.VcfHeader proto holding the fixed header entries
    together with the given contigs and sample names.
  """
  reserved_format = vcf_constants.reserved_format_field

  filter_fields = [
      vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_REF_FILTER,
          description='Genotyping model thinks this site is reference.'),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_QUAL_FILTER,
          description='Confidence in this variant being real is below '
          'calling threshold.'),
  ]

  info_fields = [vcf_constants.reserved_info_field('END')]

  # Order of the FORMAT entries: GT, GQ, DP, MIN_DP, AD, VAF, GL, PL.
  format_fields = [reserved_format(tag) for tag in ('GT', 'GQ', 'DP')]
  format_fields.append(
      variants_pb2.VcfFormatInfo(
          id=DEEP_VARIANT_MIN_DP_FORMAT,
          number='1',
          type='Integer',
          description='Minimum DP observed within the GVCF block.'))
  format_fields.append(reserved_format('AD'))
  format_fields.append(
      variants_pb2.VcfFormatInfo(
          id=DEEP_VARIANT_VAF_FORMAT,
          number='A',
          type='Float',
          description='Variant allele fractions.'))
  format_fields.extend(reserved_format(tag) for tag in ('GL', 'PL'))

  header = variants_pb2.VcfHeader(
      fileformat='VCFv4.2',
      filters=filter_fields,
      infos=info_fields,
      formats=format_fields,
      contigs=contigs,
      sample_names=sample_names,
  )
  return header
def test_get_reserved_info(self, field_id):
  """Checks that a reserved INFO id yields a VcfInfo carrying that same id."""
  reserved = vcf_constants.reserved_info_field(field_id)

  # The lookup must return a VcfInfo proto whose id matches the request.
  self.assertIsInstance(reserved, variants_pb2.VcfInfo)
  self.assertEqual(reserved.id, field_id)
def test_invalid_get_reserved_info(self, field_id):
  """Checks that an unknown INFO id is rejected with a ValueError.

  Args:
    field_id: the INFO field id passed to the reserved-field lookup.
  """
  # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
  # assertRaisesRegex is the supported spelling with identical semantics.
  with self.assertRaisesRegex(ValueError, 'No reserved field with id'):
    vcf_constants.reserved_info_field(field_id)
def test_invalid_get_reserved_info(self, field_id):
  """Checks that an unknown INFO id is rejected with a ValueError.

  Args:
    field_id: the INFO field id passed to the reserved-field lookup.
  """
  # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
  # assertRaisesRegex is the supported spelling with identical semantics.
  with self.assertRaisesRegex(ValueError, 'No reserved field with id'):
    vcf_constants.reserved_info_field(field_id)
def test_get_reserved_info(self, field_id):
  """Checks that a reserved INFO id yields a VcfInfo carrying that same id."""
  reserved = vcf_constants.reserved_info_field(field_id)

  # The lookup must return a VcfInfo proto whose id matches the request.
  self.assertIsInstance(reserved, variants_pb2.VcfInfo)
  self.assertEqual(reserved.id, field_id)
def deepvariant_header(contigs,
                       sample_names,
                       add_info_candidates=False,
                       include_med_dp=True):
  """Builds the VcfHeader that DeepVariant uses when writing VCF output.

  The FILTER, INFO, FORMAT and extra header entries are fixed by the
  DeepVariant pipeline and are filled in here. Only `contigs` and
  `sample_names` vary with the input data, so the caller must supply them.

  Args:
    contigs: list(ContigInfo). The contigs on which variants were called.
    sample_names: list(str). The samples present in the run.
    add_info_candidates: If truthy, appends the 'CANDIDATES' INFO field
      (intended for debugging) after 'END'.
    include_med_dp: boolean. If truthy, appends the MED_DP FORMAT field after
      the other FORMAT entries.

  Returns:
    A nucleus.genomics.v1.VcfHeader proto holding the fixed header entries
    together with the given contigs and sample names.
  """
  reserved_format = vcf_constants.reserved_format_field

  # Recorded as an extra header entry with the key 'DeepVariant_version'.
  version_extra = variants_pb2.VcfExtra(
      key='DeepVariant_version', value=DEEP_VARIANT_VERSION)

  # INFO entries: 'END' always, 'CANDIDATES' only on request.
  infos = [vcf_constants.reserved_info_field('END')]
  if add_info_candidates:
    infos.append(
        variants_pb2.VcfInfo(
            id='CANDIDATES',
            number='1',
            type=vcf_constants.STRING_TYPE,
            description='pipe-delimited candidate alleles.'))

  # FORMAT entries, in order: GT, GQ, DP, MIN_DP, AD, VAF, PL[, MED_DP].
  format_fields = [reserved_format(tag) for tag in ('GT', 'GQ', 'DP')]
  format_fields.append(
      variants_pb2.VcfFormatInfo(
          id=DEEP_VARIANT_MIN_DP_FORMAT,
          number='1',
          type='Integer',
          description='Minimum DP observed within the GVCF block.'))
  format_fields.append(reserved_format('AD'))
  format_fields.append(
      variants_pb2.VcfFormatInfo(
          id=DEEP_VARIANT_VAF_FORMAT,
          number='A',
          type='Float',
          description='Variant allele fractions.'))
  format_fields.append(reserved_format('PL'))
  if include_med_dp:
    format_fields.append(
        variants_pb2.VcfFormatInfo(
            id=DEEP_VARIANT_MED_DP_FORMAT,
            number='1',
            type='Integer',
            description='Median DP observed within the GVCF block '
            'rounded to the nearest integer.'))

  filter_fields = [
      vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_REF_FILTER,
          description='Genotyping model thinks this site is reference.'),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_QUAL_FILTER,
          description='Confidence in this variant being real is below '
          'calling threshold.'),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_NO_CALL,
          description='Site has depth=0 resulting in no call.'),
  ]

  header = variants_pb2.VcfHeader(
      fileformat='VCFv4.2',
      filters=filter_fields,
      infos=infos,
      formats=format_fields,
      contigs=contigs,
      sample_names=sample_names,
      extras=[version_extra])
  return header