Ejemplo n.º 1
0
def main_vcf_to_fasta(args):
    ''' Take input genotypes (VCF) and construct a consensus sequence
        (fasta) by using majority-read-count alleles in the VCF.
        Genotypes in the VCF will be ignored--we will use the allele
        with majority read support (or an ambiguity base if there is no clear majority).
        Uncalled positions will be emitted as N's.
        Author: dpark.

        Reads from args: inVcf, outFasta, min_dp, major_cutoff, min_dp_ratio,
        trim_ends and name.
        If args.trim_ends is set, leading and trailing 'N'/'n' characters are
        stripped from each sequence.
        If args.name is non-empty, the i-th emitted sequence (0-based) is given
        the header args.name[i % len(args.name)], so the names are cycled when
        there are more sequences than names.

        Raises AssertionError if args.min_dp is negative or args.major_cutoff
        is outside [0.0, 1.0).
        Returns 0.
    '''
    assert args.min_dp >= 0
    assert 0.0 <= args.major_cutoff < 1.0

    # Collect the header-level metadata first; the records themselves are
    # re-read below via read_tabfile.
    with util.vcf.VcfReader(args.inVcf) as vcf:
        chrlens = dict(vcf.chrlens())
        samples = vcf.samples()
    with open(args.outFasta, 'wt') as outf:
        seqs = vcf_to_seqs(util.file.read_tabfile(args.inVcf),
            chrlens, samples, min_dp=args.min_dp, major_cutoff=args.major_cutoff,
            min_dp_ratio=args.min_dp_ratio)
        # enumerate provides the running sequence index; previously the index
        # was never advanced, so every sequence was renamed to args.name[0].
        for chr_idx, (header, seq) in enumerate(seqs):
            if args.trim_ends:
                seq = seq.strip('Nn')
            if args.name:
                header = args.name[chr_idx % len(args.name)]
            for line in util.file.fastaMaker([(header, seq)]):
                outf.write(line)

    # done
    log.info("done")
    return 0
Ejemplo n.º 2
0
 def test_headers_with_two_samps(self):
     """Merge two samples against a two-sequence reference and check the
     sample names and reference lengths reported by the resulting VCF."""
     ref_fasta = makeTempFasta([('ref1', 'ATCGTTCA'), ('ref2', 'GGCCC')])
     sample_fastas = [
         makeTempFasta([('s1_1', 'ATCGCA'), ('s1_2', 'GGCCC')]),
         makeTempFasta([('s2_1', 'ATCGTTCA'), ('s2_2', 'GGCCC')]),
     ]
     # The same empty temp file is passed once per sample.
     empty_txt = util.file.mkstempfname('.txt')
     merged_vcf = util.file.mkstempfname('.vcf.gz')
     intrahost.merge_to_vcf(ref_fasta, merged_vcf, ['s1', 's2'],
                            [empty_txt] * 2, sample_fastas)

     expected_samples = ['s1', 's2']
     expected_lengths = {'ref1': 8, 'ref2': 5}
     with util.vcf.VcfReader(merged_vcf) as reader:
         self.assertEqual(reader.samples(), expected_samples)
         self.assertEqual(reader.chrlens(), expected_lengths)
Ejemplo n.º 3
0
 def test_headers_with_two_samps(self):
     """Expect merge_to_vcf to raise LookupError for this two-sample input,
     then check the sample names and reference lengths read from the VCF."""
     ref_fasta = makeTempFasta([('ref1', 'ATCGTTCA'), ('ref2', 'GGCCC')])
     sample_fastas = [
         makeTempFasta([('s1_1', 'ATCGCA'), ('s1_2', 'GGCCC')]),
         makeTempFasta([('s2_1', 'ATCGTTCA'), ('s2_2', 'GGCCC')]),
     ]
     # The same empty temp file is passed once per sample.
     empty_txt = util.file.mkstempfname('.txt')
     merged_vcf = util.file.mkstempfname('.vcf.gz')

     with self.assertRaises(LookupError):
         intrahost.merge_to_vcf(ref_fasta, merged_vcf, ['s1', 's2'],
                                [empty_txt] * 2, sample_fastas)

     # NOTE(review): the VCF is read back after merge_to_vcf is expected to
     # raise -- confirm the output file is fully written before the error.
     expected_samples = ['s1', 's2']
     expected_lengths = {'ref1': 8, 'ref2': 5}
     with util.vcf.VcfReader(merged_vcf) as reader:
         self.assertEqual(reader.samples(), expected_samples)
         self.assertEqual(reader.chrlens(), expected_lengths)
Ejemplo n.º 4
0
def main_vcf_to_fasta(args):
    """ Take input genotypes (VCF) and construct a consensus sequence
        (fasta) by using majority-read-count alleles in the VCF.
        Genotypes in the VCF will be ignored--we will use the allele
        with majority read support (or an ambiguity base if there is no clear majority).
        Uncalled positions will be emitted as N's.
        Author: dpark.

        Reads from args: inVcf, outFasta, min_dp, major_cutoff, min_dp_ratio,
        trim_ends and name.
        If args.trim_ends is set, leading and trailing 'N'/'n' characters are
        stripped from each sequence.
        If args.name is non-empty, the i-th emitted sequence (0-based) is given
        the header args.name[i % len(args.name)], so the names are cycled when
        there are more sequences than names.

        Raises AssertionError if args.min_dp is negative, if args.major_cutoff
        is outside [0.0, 1.0), or if the VCF does not contain exactly one
        sample column.
        Returns 0.
    """
    assert args.min_dp >= 0
    assert 0.0 <= args.major_cutoff < 1.0

    # Collect the header-level metadata first; the records themselves are
    # re-read below via read_tabfile.
    with util.vcf.VcfReader(args.inVcf) as vcf:
        chrlens = dict(vcf.chrlens())
        samples = vcf.samples()

    assert (
        len(samples) == 1
    ), """Multiple sample columns were found in the intermediary VCF file
        of the refine_assembly step, suggesting multiple sample names are present
        upstream in the BAM file. Please correct this so there is only one sample in the BAM file."""

    with open(args.outFasta, "wt") as outf:
        seqs = vcf_to_seqs(
            util.file.read_tabfile(args.inVcf),
            chrlens,
            samples,
            min_dp=args.min_dp,
            major_cutoff=args.major_cutoff,
            min_dp_ratio=args.min_dp_ratio,
        )
        # enumerate provides the running sequence index; previously the index
        # was never advanced, so every sequence was renamed to args.name[0].
        for chr_idx, (header, seq) in enumerate(seqs):
            if args.trim_ends:
                seq = seq.strip("Nn")
            if args.name:
                header = args.name[chr_idx % len(args.name)]
            for line in util.file.fastaMaker([(header, seq)]):
                outf.write(line)

    # done
    log.info("done")
    return 0
Ejemplo n.º 5
0
def main_vcf_to_fasta(args):
    ''' Convert the VCF at args.inVcf into sequences via vcf_to_seqs and
        write them to args.outFasta in fasta format.

        Reads from args: inVcf, outFasta, min_dp, major_cutoff, min_dp_ratio,
        trim_ends and name.
        If args.trim_ends is set, leading and trailing 'N'/'n' characters are
        stripped from each sequence.
        If args.name is not None, it replaces the header of every emitted
        sequence.

        Raises AssertionError if args.min_dp is negative or args.major_cutoff
        is outside [0.0, 1.0).
        Returns 0.
    '''
    assert args.min_dp >= 0
    assert 0.0 <= args.major_cutoff < 1.0

    # Collect the header-level metadata first; the records themselves are
    # re-read below via read_tabfile.
    with util.vcf.VcfReader(args.inVcf) as vcf:
        chrlens = dict(vcf.chrlens())
        samples = vcf.samples()
    with open(args.outFasta, 'wt') as outf:
        for header, seq in vcf_to_seqs(util.file.read_tabfile(args.inVcf),
            chrlens, samples, min_dp=args.min_dp, major_cutoff=args.major_cutoff,
            min_dp_ratio=args.min_dp_ratio):
            if args.trim_ends:
                seq = seq.strip('Nn')
            # Identity comparison is the idiomatic None test (PEP 8).
            if args.name is not None:
                header = args.name
            for line in util.file.fastaMaker([(header, seq)]):
                outf.write(line)

    # done
    log.info("done")
    return 0
Ejemplo n.º 6
0
def main_vcf_to_fasta(args):
    ''' Take input genotypes (VCF) and construct a consensus sequence
        (fasta) by using majority-read-count alleles in the VCF.
        Genotypes in the VCF will be ignored--we will use the allele
        with majority read support (or an ambiguity base if there is no clear majority).
        Uncalled positions will be emitted as N's.
        Author: dpark.

        Reads from args: inVcf, outFasta, min_dp, major_cutoff, min_dp_ratio,
        trim_ends and name.
        If args.trim_ends is set, leading and trailing 'N'/'n' characters are
        stripped from each sequence.
        If args.name is non-empty, the i-th emitted sequence (0-based) is given
        the header args.name[i % len(args.name)], so the names are cycled when
        there are more sequences than names.

        Raises AssertionError if args.min_dp is negative, if args.major_cutoff
        is outside [0.0, 1.0), or if the VCF does not contain exactly one
        sample column.
        Returns 0.
    '''
    assert args.min_dp >= 0
    assert 0.0 <= args.major_cutoff < 1.0

    # Collect the header-level metadata first; the records themselves are
    # re-read below via read_tabfile.
    with util.vcf.VcfReader(args.inVcf) as vcf:
        chrlens = dict(vcf.chrlens())
        samples = vcf.samples()

    assert len(
        samples
    ) == 1, """Multiple sample columns were found in the intermediary VCF file
        of the refine_assembly step, suggesting multiple sample names are present
        upstream in the BAM file. Please correct this so there is only one sample in the BAM file."""

    with open(args.outFasta, 'wt') as outf:
        seqs = vcf_to_seqs(util.file.read_tabfile(args.inVcf),
                           chrlens,
                           samples,
                           min_dp=args.min_dp,
                           major_cutoff=args.major_cutoff,
                           min_dp_ratio=args.min_dp_ratio)
        # enumerate provides the running sequence index; previously the index
        # was never advanced, so every sequence was renamed to args.name[0].
        for chr_idx, (header, seq) in enumerate(seqs):
            if args.trim_ends:
                seq = seq.strip('Nn')
            if args.name:
                header = args.name[chr_idx % len(args.name)]
            for line in util.file.fastaMaker([(header, seq)]):
                outf.write(line)

    # done
    log.info("done")
    return 0