Beispiel #1
0
    def __init__(self,
                 output_path,
                 header=None,
                 round_qualities=False,
                 excluded_info_fields=None,
                 excluded_format_fields=None):
        """Creates a NativeVcfWriter that writes to `output_path`.

        Args:
          output_path: str. The path to which to write the VCF file.
          header: nucleus.genomics.v1.VcfHeader. The header that defines all
            information germane to the constituent variants (contigs, FILTER
            fields, INFO fields, FORMAT fields, samples, and all other
            structured and unstructured header lines). If None, an empty
            VcfHeader is used.
          round_qualities: bool. If True, the QUAL field is rounded to one
            point past the decimal.
          excluded_info_fields: list(str). INFO field IDs that should not be
            written to the output. If None, all INFO fields are included.
          excluded_format_fields: list(str). FORMAT field IDs that should not
            be written to the output. If None, all FORMAT fields are included.
        """
        super(NativeVcfWriter, self).__init__()

        # The same header object feeds both the underlying writer and the
        # field-access cache, so resolve the None fallback exactly once.
        vcf_header = variants_pb2.VcfHeader() if header is None else header

        options = variants_pb2.VcfWriterOptions(
            round_qual_values=round_qualities,
            excluded_info_fields=excluded_info_fields,
            excluded_format_fields=excluded_format_fields)

        self._writer = vcf_writer.VcfWriter.to_file(
            output_path, vcf_header, options)
        self.field_access_cache = VcfHeaderCache(vcf_header)
Beispiel #2
0
 def setUp(self):
   """Prepares an output path, a header, an open writer and one variant."""
   self.out_fname = test_utils.test_tmpfile('output.vcf')

   # Header pieces: two contigs, two samples, and the GT/GQ FORMAT fields.
   contigs = [
       reference_pb2.ContigInfo(name='Chr1', n_bases=50, pos_in_fasta=0),
       reference_pb2.ContigInfo(name='Chr2', n_bases=25, pos_in_fasta=1),
   ]
   genotype_format = variants_pb2.VcfFormatInfo(
       id='GT', number='1', type='String', description='Genotype')
   quality_format = variants_pb2.VcfFormatInfo(
       id='GQ', number='1', type='Float', description='Genotype Quality')
   self.header = variants_pb2.VcfHeader(
       contigs=contigs,
       sample_names=['Fido', 'Spot'],
       formats=[genotype_format, quality_format],
   )

   self.options = variants_pb2.VcfWriterOptions()
   self.writer = vcf_writer.VcfWriter.to_file(
       self.out_fname, self.header, self.options)

   # A single variant on Chr1 with one call per sample named in the header.
   self.variant = test_utils.make_variant(
       chrom='Chr1', start=10, alleles=['A', 'C'])
   sample_calls = [
       variants_pb2.VariantCall(genotype=[0, 0], call_set_name='Fido'),
       variants_pb2.VariantCall(genotype=[0, 1], call_set_name='Spot'),
   ]
   self.variant.calls.extend(sample_calls)
Beispiel #3
0
    def __init__(self, header):
        """Builds the INFO/FORMAT accessor caches from a VCF header.

        Args:
          header: nucleus.genomics.v1.VcfHeader proto. Its `infos` and
            `formats` entries are used to define the accessor functions
            needed. If None, an empty VcfHeader is used instead.
        """
        source_header = variants_pb2.VcfHeader() if header is None else header
        info_specs = source_header.infos
        format_specs = source_header.formats

        # One getter cache and one setter cache per field family.
        self._info_get_cache = _create_get_fn_cache(info_specs)
        self._info_set_cache = _create_set_fn_cache(info_specs)
        self._format_get_cache = _create_get_fn_cache(format_specs)
        self._format_set_cache = _create_set_fn_cache(format_specs)
Beispiel #4
0
 def write_variant_to_tempfile(self, variant):
   """Writes `variant` to a temporary VCF file and returns the file's path.

   The header declares a single contig named '20', one sample per call on
   the variant (named after the call's `call_set_name`), and the DP and AD
   FORMAT fields.

   Args:
     variant: the variant to write; its `calls` supply the sample names.

   Returns:
     The path of the VCF file that was written.
   """
   vcf_path = test_utils.test_tmpfile('test.vcf')

   sample_names = [c.call_set_name for c in variant.calls]
   depth_format = variants_pb2.VcfFormatInfo(
       id='DP', number='1', type='Integer', description='Read depth')
   allele_depth_format = variants_pb2.VcfFormatInfo(
       id='AD',
       number='R',
       type='Integer',
       description='Read depth for each allele')
   vcf_header = variants_pb2.VcfHeader(
       contigs=[reference_pb2.ContigInfo(name='20')],
       sample_names=sample_names,
       formats=[depth_format, allele_depth_format])

   # The writer is used as a context manager so it is closed after the write.
   writer = vcf.VcfWriter(vcf_path, header=vcf_header)
   with writer:
     writer.write(variant)
   return vcf_path
Beispiel #5
0
 def setUp(self):
      """Builds six variants, a four-contig header and an in-memory reader."""
      # (chrom, start) of every variant, in the order they are stored.
      variant_sites = [
          ('1', 10),
          ('1', 20),
          ('1', 30),
          ('2', 25),
          ('2', 55),
          ('3', 10),
      ]
      self.variants = [
          test_utils.make_variant(chrom=chrom, start=start)
          for chrom, start in variant_sites
      ]

      # Contig '4' is declared in the header although no variant lies on it.
      contigs = [
          reference_pb2.ContigInfo(name=contig_name, n_bases=100)
          for contig_name in ('1', '2', '3', '4')
      ]
      self.header = variants_pb2.VcfHeader(
          contigs=contigs, filters=[], sample_names=['NA12878'])

      self.reader = vcf.InMemoryVcfReader(self.variants, self.header)
Beispiel #6
0
  def test_writing_canned_variants(self):
    """Tests that the first canned tfrecord variants are written as VCF."""
    # The golden input is stored in the TFRecord format.
    golden_tfrecord = test_utils.genomics_core_testdata(
        'test_samples.vcf.golden.tfrecord')

    options = variants_pb2.VcfWriterOptions()

    # (name, length) of every contig declared in the header.
    contig_lengths = (
        ('chr1', 248956422),
        ('chr2', 242193529),
        ('chr3', 198295559),
        ('chrX', 156040895),
    )
    contigs = [
        reference_pb2.ContigInfo(name=contig_name, n_bases=contig_length)
        for contig_name, contig_length in contig_lengths
    ]

    # The VQSR tranche filters are declared by id only (no description).
    tranche_filter_ids = (
        'VQSRTrancheINDEL95.00to96.00',
        'VQSRTrancheINDEL96.00to97.00',
        'VQSRTrancheINDEL97.00to99.00',
        'VQSRTrancheINDEL99.00to99.50',
        'VQSRTrancheINDEL99.50to99.90',
        'VQSRTrancheINDEL99.90to99.95',
        'VQSRTrancheINDEL99.95to100.00+',
        'VQSRTrancheINDEL99.95to100.00',
        'VQSRTrancheSNP99.50to99.60',
        'VQSRTrancheSNP99.60to99.80',
        'VQSRTrancheSNP99.80to99.90',
        'VQSRTrancheSNP99.90to99.95',
        'VQSRTrancheSNP99.95to100.00+',
        'VQSRTrancheSNP99.95to100.00',
    )
    filters = [
        variants_pb2.VcfFilterInfo(
            id='PASS', description='All filters passed'),
        variants_pb2.VcfFilterInfo(id='LowQual', description=''),
    ]
    filters.extend(
        variants_pb2.VcfFilterInfo(id=filter_id)
        for filter_id in tranche_filter_ids)

    infos = [
        variants_pb2.VcfInfo(
            id='END',
            number='1',
            type='Integer',
            description='Stop position of the interval')
    ]

    # (id, number, type, description) of every FORMAT field, in header order.
    format_specs = (
        ('GT', '1', 'String', 'Genotype'),
        ('GQ', '1', 'Integer', 'Genotype Quality'),
        ('DP', '1', 'Integer', 'Read depth of all passing filters reads.'),
        ('MIN_DP', '1', 'Integer',
         'Minimum DP observed within the GVCF block.'),
        ('AD', 'R', 'Integer',
         'Read depth of all passing filters reads for each allele.'),
        ('VAF', 'A', 'Float', 'Variant allele fractions.'),
        ('PL', 'G', 'Integer', 'Genotype likelihoods, Phred encoded'),
    )
    formats = [
        variants_pb2.VcfFormatInfo(
            id=field_id,
            number=field_number,
            type=field_type,
            description=field_description)
        for field_id, field_number, field_type, field_description
        in format_specs
    ]

    header = variants_pb2.VcfHeader(
        contigs=contigs,
        sample_names=['NA12878_18_99'],
        filters=filters,
        infos=infos,
        formats=formats,
    )

    canned_variants = list(
        io_utils.read_tfrecords(golden_tfrecord, proto=variants_pb2.Variant))
    vcf_path = test_utils.test_tmpfile('output.vcf')
    # Only the first five canned variants are written and checked.
    with vcf_writer.VcfWriter.to_file(vcf_path, header, options) as writer:
      for canned_variant in canned_variants[:5]:
        writer.write(canned_variant)

    # Check: are the variants written as expected?
    # pylint: disable=line-too-long
    expected_vcf_content = [
        '##fileformat=VCFv4.2\n',
        '##FILTER=<ID=PASS,Description="All filters passed">\n',
        '##FILTER=<ID=LowQual,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n',
        '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of '
        'the interval">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all '
        'passing filters reads.">\n',
        '##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP '
        'observed within the GVCF block.">\n',
        '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all '
        'passing filters reads for each allele.">\n',
        '##FORMAT=<ID=VAF,Number=A,Type=Float,Description=\"Variant allele '
        'fractions.">\n',
        '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype '
        'likelihoods, Phred encoded">\n',
        '##contig=<ID=chr1,length=248956422>\n',
        '##contig=<ID=chr2,length=242193529>\n',
        '##contig=<ID=chr3,length=198295559>\n',
        '##contig=<ID=chrX,length=156040895>\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n',
        'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n',
        'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n',
        'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n',
        'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n',
        'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n'
    ]
    # pylint: enable=line-too-long

    with gfile.GFile(vcf_path, 'r') as vcf_file:
      written_lines = vcf_file.readlines()
    self.assertEqual(written_lines, expected_vcf_content)