Exemple #1
0
def deepvariant_header(contigs, sample_names):
    """Builds the VcfHeader proto used when writing DeepVariant VCF output.

    The FILTER, INFO, FORMAT and extra header entries are the fixed set that
    the DeepVariant pipeline emits. Only `contigs` and `sample_names` vary
    with the input data, which is why the caller must supply them.

    Args:
      contigs: list(ContigInfo). The contigs on which variants were called.
      sample_names: list(str). The samples present in the run.

    Returns:
      A nucleus.genomics.v1.VcfHeader proto carrying the fixed header entries
      plus the provided contigs and sample names.
    """
    # Extra header line recording the DeepVariant version.
    version_extra = variants_pb2.VcfExtra(
        key='DeepVariant_version', value=DEEP_VARIANT_VERSION)

    # FILTER entries: the reserved pass filter plus two DeepVariant filters.
    filter_fields = [
        vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_REF_FILTER,
            description='Genotyping model thinks this site is reference.'),
        variants_pb2.VcfFilterInfo(
            id=DEEP_VARIANT_QUAL_FILTER,
            description=('Confidence in this variant being real is below '
                         'calling threshold.')),
    ]

    # INFO entries: only the reserved END field.
    info_fields = [vcf_constants.reserved_info_field('END')]

    # FORMAT entries, reserved fields interleaved with DeepVariant ones.
    format_fields = [
        vcf_constants.reserved_format_field('GT'),
        vcf_constants.reserved_format_field('GQ'),
        vcf_constants.reserved_format_field('DP'),
        variants_pb2.VcfFormatInfo(
            id=DEEP_VARIANT_MIN_DP_FORMAT,
            number='1',
            type='Integer',
            description='Minimum DP observed within the GVCF block.'),
        vcf_constants.reserved_format_field('AD'),
        variants_pb2.VcfFormatInfo(
            id=DEEP_VARIANT_VAF_FORMAT,
            number='A',
            type='Float',
            description='Variant allele fractions.'),
        vcf_constants.reserved_format_field('PL'),
    ]

    return variants_pb2.VcfHeader(
        fileformat='VCFv4.2',
        filters=filter_fields,
        infos=info_fields,
        formats=format_fields,
        contigs=contigs,
        sample_names=sample_names,
        extras=[version_extra])
def deepvariant_header(contigs, sample_names):
  """Builds the VcfHeader proto used when writing DeepVariant VCF output.

  The FILTER, INFO and FORMAT header entries are the fixed set that the
  DeepVariant pipeline emits. Only `contigs` and `sample_names` vary with the
  input data, which is why the caller must supply them.

  Args:
    contigs: list(ContigInfo). The contigs on which variants were called.
    sample_names: list(str). The samples present in the run.

  Returns:
    A nucleus.genomics.v1.VcfHeader proto carrying the fixed header entries
    plus the provided contigs and sample names.
  """
  # FILTER entries: the reserved pass filter plus two DeepVariant filters.
  filter_fields = [
      vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_REF_FILTER,
          description='Genotyping model thinks this site is reference.'),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_QUAL_FILTER,
          description=('Confidence in this variant being real is below '
                       'calling threshold.')),
  ]

  # INFO entries: only the reserved END field.
  info_fields = [vcf_constants.reserved_info_field('END')]

  # FORMAT entries, reserved fields interleaved with DeepVariant ones.
  format_fields = [
      vcf_constants.reserved_format_field('GT'),
      vcf_constants.reserved_format_field('GQ'),
      vcf_constants.reserved_format_field('DP'),
      variants_pb2.VcfFormatInfo(
          id=DEEP_VARIANT_MIN_DP_FORMAT,
          number='1',
          type='Integer',
          description='Minimum DP observed within the GVCF block.'),
      vcf_constants.reserved_format_field('AD'),
      variants_pb2.VcfFormatInfo(
          id=DEEP_VARIANT_VAF_FORMAT,
          number='A',
          type='Float',
          description='Variant allele fractions.'),
      vcf_constants.reserved_format_field('GL'),
      vcf_constants.reserved_format_field('PL'),
  ]

  return variants_pb2.VcfHeader(
      fileformat='VCFv4.2',
      filters=filter_fields,
      infos=info_fields,
      formats=format_fields,
      contigs=contigs,
      sample_names=sample_names,
  )
 def test_get_reserved_info(self, field_id):
     """Checks that `reserved_info_field` returns a VcfInfo whose id matches.

     Args:
       field_id: the id passed to `vcf_constants.reserved_info_field`.
     """
     reserved = vcf_constants.reserved_info_field(field_id)
     self.assertIsInstance(reserved, variants_pb2.VcfInfo)
     # The returned proto must carry the id that was requested.
     self.assertEqual(reserved.id, field_id)
 def test_invalid_get_reserved_info(self, field_id):
     """Checks that `reserved_info_field` raises ValueError for `field_id`.

     The raised error's message must match 'No reserved field with id'.

     Args:
       field_id: the id passed to `vcf_constants.reserved_info_field`.
     """
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; assertRaisesRegex is the supported spelling.
     with self.assertRaisesRegex(ValueError, 'No reserved field with id'):
         vcf_constants.reserved_info_field(field_id)
 def test_invalid_get_reserved_info(self, field_id):
   """Checks that `reserved_info_field` raises ValueError for `field_id`.

   The raised error's message must match 'No reserved field with id'.

   Args:
     field_id: the id passed to `vcf_constants.reserved_info_field`.
   """
   # assertRaisesRegexp is a deprecated alias that was removed in
   # Python 3.12; assertRaisesRegex is the supported spelling.
   with self.assertRaisesRegex(ValueError, 'No reserved field with id'):
     vcf_constants.reserved_info_field(field_id)
 def test_get_reserved_info(self, field_id):
   """Checks that `reserved_info_field` returns a VcfInfo whose id matches.

   Args:
     field_id: the id passed to `vcf_constants.reserved_info_field`.
   """
   reserved = vcf_constants.reserved_info_field(field_id)
   self.assertIsInstance(reserved, variants_pb2.VcfInfo)
   # The returned proto must carry the id that was requested.
   self.assertEqual(reserved.id, field_id)
Exemple #7
0
def deepvariant_header(contigs,
                       sample_names,
                       add_info_candidates=False,
                       include_med_dp=True):
  """Builds the VcfHeader proto used when writing DeepVariant VCF output.

  The FILTER, INFO, FORMAT and extra header entries are the fixed set that the
  DeepVariant pipeline emits, with two optional additions controlled by flags.
  `contigs` and `sample_names` vary with the input data, which is why the
  caller must supply them.

  Args:
    contigs: list(ContigInfo). The contigs on which variants were called.
    sample_names: list(str). The samples present in the run.
    add_info_candidates: If truthy, a 'CANDIDATES' INFO field (intended for
      debugging) is appended to the INFO entries.
    include_med_dp: boolean. If truthy, the MED_DP FORMAT field is appended
      to the FORMAT entries.

  Returns:
    A nucleus.genomics.v1.VcfHeader proto carrying the fixed header entries
    plus the provided contigs and sample names.
  """
  # Extra header line recording the DeepVariant version.
  version_extra = variants_pb2.VcfExtra(
      key='DeepVariant_version', value=DEEP_VARIANT_VERSION)

  # Baseline INFO entries; may be extended below.
  info_entries = [vcf_constants.reserved_info_field('END')]

  # Baseline FORMAT entries; may be extended below.
  format_entries = [
      vcf_constants.reserved_format_field('GT'),
      vcf_constants.reserved_format_field('GQ'),
      vcf_constants.reserved_format_field('DP'),
      variants_pb2.VcfFormatInfo(
          id=DEEP_VARIANT_MIN_DP_FORMAT,
          number='1',
          type='Integer',
          description='Minimum DP observed within the GVCF block.'),
      vcf_constants.reserved_format_field('AD'),
      variants_pb2.VcfFormatInfo(
          id=DEEP_VARIANT_VAF_FORMAT,
          number='A',
          type='Float',
          description='Variant allele fractions.'),
      vcf_constants.reserved_format_field('PL'),
  ]

  if add_info_candidates:
    candidates_info = variants_pb2.VcfInfo(
        id='CANDIDATES',
        number='1',
        type=vcf_constants.STRING_TYPE,
        description='pipe-delimited candidate alleles.')
    info_entries.append(candidates_info)

  if include_med_dp:
    med_dp_format = variants_pb2.VcfFormatInfo(
        id=DEEP_VARIANT_MED_DP_FORMAT,
        number='1',
        type='Integer',
        description=('Median DP observed within the GVCF block '
                     'rounded to the nearest integer.'))
    format_entries.append(med_dp_format)

  # FILTER entries: the reserved pass filter plus the DeepVariant filters.
  filter_entries = [
      vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_REF_FILTER,
          description='Genotyping model thinks this site is reference.'),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_QUAL_FILTER,
          description=('Confidence in this variant being real is below '
                       'calling threshold.')),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_NO_CALL,
          description='Site has depth=0 resulting in no call.'),
  ]

  return variants_pb2.VcfHeader(
      fileformat='VCFv4.2',
      filters=filter_entries,
      infos=info_entries,
      formats=format_entries,
      contigs=contigs,
      sample_names=sample_names,
      extras=[version_extra])