コード例 #1
0
def deepvariant_header(contigs, sample_names):
    """Builds the VcfHeader used when writing DeepVariant VCF output.

    The FILTER, INFO, FORMAT and extra header entries are the fixed set that
    the DeepVariant pipeline emits. Only `contigs` and `sample_names` depend
    on the input data, which is why both are required.

    Args:
      contigs: list(ContigInfo). Contigs on which variants were called.
      sample_names: list(str). Samples present in the run.

    Returns:
      A nucleus.genomics.v1.VcfHeader proto holding the fixed header entries
      together with the supplied contigs and sample names.
    """
    reserved_format = vcf_constants.reserved_format_field

    filter_entries = [
        vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_REF_FILTER,
            description='Genotyping model thinks this site is reference.'),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_QUAL_FILTER,
            description='Confidence in this variant being real is below '
            'calling threshold.'),
    ]

    info_entries = [vcf_constants.reserved_info_field('END')]

    # DeepVariant-specific FORMAT fields that are not VCF reserved fields.
    min_dp_format = variants_pb2.VcfFormatInfo(
        id=DEEP_VARIANT_MIN_DP_FORMAT,
        number='1',
        type='Integer',
        description='Minimum DP observed within the GVCF block.')
    vaf_format = variants_pb2.VcfFormatInfo(
        id=DEEP_VARIANT_VAF_FORMAT,
        number='A',
        type='Float',
        description='Variant allele fractions.')

    # The list order is the order in which the entries are stored in the
    # header proto, so it is kept exactly as before.
    format_entries = [
        reserved_format('GT'),
        reserved_format('GQ'),
        reserved_format('DP'),
        min_dp_format,
        reserved_format('AD'),
        vaf_format,
        reserved_format('PL'),
    ]

    version_extra = variants_pb2.VcfExtra(
        key='DeepVariant_version', value=DEEP_VARIANT_VERSION)

    return variants_pb2.VcfHeader(
        fileformat='VCFv4.2',
        filters=filter_entries,
        infos=info_entries,
        formats=format_entries,
        contigs=contigs,
        sample_names=sample_names,
        extras=[version_extra])
コード例 #2
0
 def test_vcf_header(self):
   """Checks the two structured extras exposed by the sites reader header."""
   header = self.sites_reader.header

   # Expected ALT entry; the description literal is split across two lines.
   alt_fields = [
       variants_pb2.VcfExtra(key='ID', value='NON_REF'),
       variants_pb2.VcfExtra(
           key='Description',
           value='Represents	any	possible	alternative	allele	at	th'
           'is	location'),
   ]
   expected_alt = variants_pb2.VcfStructuredExtra(key='ALT', fields=alt_fields)

   # Expected META entry.
   meta_fields = [
       variants_pb2.VcfExtra(key='ID', value='TESTMETA'),
       variants_pb2.VcfExtra(key='Description', value='blah'),
   ]
   expected_meta = variants_pb2.VcfStructuredExtra(
       key='META', fields=meta_fields)

   actual_extras = header.structured_extras
   self.assertLen(actual_extras, 2)
   self.assertEqual(actual_extras[1], expected_meta)
   self.assertEqual(actual_extras[0], expected_alt)
コード例 #3
0
def deepvariant_header(contigs,
                       sample_names,
                       add_info_candidates=False,
                       include_med_dp=True):
  """Builds the VcfHeader used when writing DeepVariant VCF output.

  The FILTER, INFO, FORMAT and extra header entries are the fixed set that the
  DeepVariant pipeline emits, plus two optional entries controlled by flags.
  Only `contigs` and `sample_names` depend on the input data, which is why
  both are required.

  Args:
    contigs: list(ContigInfo). Contigs on which variants were called.
    sample_names: list(str). Samples present in the run.
    add_info_candidates: If truthy, the 'CANDIDATES' INFO field (intended for
      debugging) is appended after 'END'.
    include_med_dp: boolean. If True, the MED_DP FORMAT field is appended
      after all other FORMAT fields.

  Returns:
    A nucleus.genomics.v1.VcfHeader proto holding the fixed header entries
    together with the supplied contigs and sample names.
  """
  reserved_format = vcf_constants.reserved_format_field

  # INFO fields: 'END' is always present; 'CANDIDATES' is opt-in.
  header_infos = [vcf_constants.reserved_info_field('END')]
  if add_info_candidates:
    candidates_info = variants_pb2.VcfInfo(
        id='CANDIDATES',
        number='1',
        type=vcf_constants.STRING_TYPE,
        description='pipe-delimited candidate alleles.')
    header_infos.append(candidates_info)

  # FORMAT fields: the list order is the order stored in the header proto.
  min_dp_format = variants_pb2.VcfFormatInfo(
      id=DEEP_VARIANT_MIN_DP_FORMAT,
      number='1',
      type='Integer',
      description='Minimum DP observed within the GVCF block.')
  vaf_format = variants_pb2.VcfFormatInfo(
      id=DEEP_VARIANT_VAF_FORMAT,
      number='A',
      type='Float',
      description='Variant allele fractions.')
  header_formats = [
      reserved_format('GT'),
      reserved_format('GQ'),
      reserved_format('DP'),
      min_dp_format,
      reserved_format('AD'),
      vaf_format,
      reserved_format('PL'),
  ]
  if include_med_dp:
    med_dp_format = variants_pb2.VcfFormatInfo(
        id=DEEP_VARIANT_MED_DP_FORMAT,
        number='1',
        type='Integer',
        description='Median DP observed within the GVCF block '
        'rounded to the nearest integer.')
    header_formats.append(med_dp_format)

  header_filters = [
      vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_REF_FILTER,
          description='Genotyping model thinks this site is reference.'),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_QUAL_FILTER,
          description='Confidence in this variant being real is below '
          'calling threshold.'),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_NO_CALL,
          description='Site has depth=0 resulting in no call.'),
  ]

  version_extra = variants_pb2.VcfExtra(
      key='DeepVariant_version', value=DEEP_VARIANT_VERSION)

  return variants_pb2.VcfHeader(
      fileformat='VCFv4.2',
      filters=header_filters,
      infos=header_infos,
      formats=header_formats,
      contigs=contigs,
      sample_names=sample_names,
      extras=[version_extra])