Beispiel #1
0
    def test_intersect(self):
        """Check `intersect` against a CDS annotation track and on two small streams.

        Part 1 intersects a single feature with the 'CDS' annotation of the
        'mycoTube_H37RV' assembly and compares only the first item produced.
        Part 2 intersects two hand-written streams and compares the whole output.
        """
        # Test from the snp workflow.
        expected = ('chr',91143,91144, ('C','*A','0','|EBMYCG00000002479|Rv0083',1,0))
        a = genrep.Assembly('mycoTube_H37RV')
        c = concat_fields(a.annot_track('CDS','chr'), infields=['name','strand','frame'], as_tuple=True)
        feat = fstream([('chr',91143,91144,('C','*A','0'))], fields=['chr','start','end','rest'])
        g = intersect([feat,c], win_size=10000)
        # Builtin next() rather than the Python-2-only `g.next()` method call:
        # same result on Python 2.6+, and it keeps working on Python 3.
        self.assertEqual(next(g),expected)

        # Two overlapping pairs; per the expected values below, the overlap region
        # is returned with names joined by '|' and scores added up.
        fields = ['chr','start','end','name','strand','score']
        s1 = fstream([('chr',0,20,'a1',1,6.),('chr',40,60,'b',1,3.)], fields=fields)
        s2 = fstream([('chr',10,30,'a2',1,8.),('chr',50,70,'b',-1,4.)], fields=fields)
        res = list(intersect([s1,s2]))
        expected = [('chr',10,20,'a1|a2',1,14.),('chr',50,60,'b|b',0,7.)]
        self.assertListEqual(res,expected)
Beispiel #2
0
    def test_intersect(self):
        """Check `intersect` against a CDS annotation track and on two small streams.

        Part 1 intersects a single feature with the 'CDS' annotation of the
        'mycoTube_H37RV' assembly and compares only the first item produced.
        Part 2 intersects two hand-written streams and compares the whole output.
        """
        # Test from the snp workflow.
        expected = ('chr', 91143, 91144, ('C', '*A', '0',
                                          '|EBMYCG00000002479|Rv0083', 1, 0))
        a = genrep.Assembly('mycoTube_H37RV')
        c = concat_fields(a.annot_track('CDS', 'chr'),
                          infields=['name', 'strand', 'frame'],
                          as_tuple=True)
        feat = fstream([('chr', 91143, 91144, ('C', '*A', '0'))],
                       fields=['chr', 'start', 'end', 'rest'])
        g = intersect([feat, c], win_size=10000)
        # Builtin next() rather than the Python-2-only `g.next()` method call:
        # same result on Python 2.6+, and it keeps working on Python 3.
        self.assertEqual(next(g), expected)

        # Two overlapping pairs; per the expected values below, the overlap region
        # is returned with names joined by '|' and scores added up.
        fields = ['chr', 'start', 'end', 'name', 'strand', 'score']
        s1 = fstream([('chr', 0, 20, 'a1', 1, 6.),
                      ('chr', 40, 60, 'b', 1, 3.)],
                     fields=fields)
        s2 = fstream([('chr', 10, 30, 'a2', 1, 8.),
                      ('chr', 50, 70, 'b', -1, 4.)],
                     fields=fields)
        res = list(intersect([s1, s2]))
        expected = [('chr', 10, 20, 'a1|a2', 1, 14.),
                    ('chr', 50, 60, 'b|b', 0, 7.)]
        self.assertListEqual(res, expected)
Beispiel #3
0
def exon_snps(chrom,outexons,allsnps,assembly,sample_names,genomeRef={},
              logfile=sys.stdout,debugfile=sys.stderr):
    """Annotate the SNPs that fall inside CDS regions with the codon change they cause.

    The SNPs in `allsnps` are intersected with the 'CDS' annotation of `assembly`
    on `chrom`. For every (SNP, CDS) pair one tab-separated line is appended to
    `outexons` with the columns: chromosome, position+1, reference base, one
    column per sample with the observed variant, CDS name, strand, translation(s)
    of the reference codon, and then either the literal "indel" (reference base
    is "*") or one column per sample with the translation(s) of the new codon(s).
    SNPs hitting the same codon on the same strand are combined before translation.

    :param chrom: (str) chromosome name.
    :param outexons: (str) name of the file to which the SNPs on exons are appended.
    :param allsnps: list of tuples (chr,start,end,ref,alt1..altN) as returned by all_snps().
        Ex: [('chr', 3684115, 3684116, 'G', 'G', 'G', 'G', 'T (56% of 167)', 'G'), ...]
    :param assembly: genrep.Assembly object
    :param sample_names: list of sample names.
    :param genomeRef: dict of the form {'chr1': filename}, where filename is the name of a fasta file
        containing the reference sequence for the chromosome. Only read, never modified here.
    :param logfile: writable file-like object receiving a progress message.
    :param debugfile: unused in this function.
    :returns: False if the CDS annotation stream could not be set up, True otherwise.
    """
    def _write_buffer(_buffer, outex):
        """Write one line per buffered SNP; all entries of `_buffer` share one codon and strand."""
        if not _buffer:
            return
        new_codon = None
        # One position at a time
        for chr_name,pos,refbase,variants,cds,strand,ref_codon,shift in _buffer:
            varbase = list(variants)  # Ex: ['G','G','G','T (56% of 167)','G'],  ['A/A','G/G (100% of 7)']
            if new_codon is None:
                # Every sample starts from the unmodified reference codon.
                new_codon = [[ref_codon] for _ in varbase]
            alleles = []  # [[variants sample1], [variants sample2], ...]
            # One sample at a time
            for variant in varbase:
                if variant in ['0','-']:
                    alleles.append([refbase])
                else: # Ex: 'C/G (80% of 10)' : heterozygous simple (ref is C) or double snp (ref is not C)
                    v = variant.split()[0]  # Ex: C/G
                    v = unique(v.split('/'))  # Ex: G/G -> 'G'
                    if refbase in v: v.remove(refbase)  # Ex: C/G -> 'G' (if ref is C)
                    alleles.append(v)
            # One sample at a time: substitute each allele into each codon built so far.
            for k,v in enumerate(alleles):
                cnumb = len(new_codon[k])
                newc = new_codon[k]*len(v)
                for i,vari in enumerate(v):
                    for j in range(cnumb):
                        idx = i*cnumb+j
                        newc[idx] = newc[idx][:shift] + vari + newc[idx][shift+1:]
                        assert ref_codon[shift] == refbase, "bug with shift within codon"
                new_codon[k] = newc
        # `strand` and `ref_codon` still hold the values of the last buffered entry;
        # the caller keeps one buffer per strand and per codon start.
        if strand == -1:
            ref_codon = revcomp(ref_codon)
            new_codon = [[revcomp(s) for s in c] for c in new_codon]
        # Expand ambiguity codes and translate ONCE per codon. Doing this inside the
        # per-row loop fed the already expanded list of codons back into product(),
        # which corrupted the reference translation from the second row on.
        ref_codons = [''.join(c) for c in product(*[iupac.get(b,b) for b in ref_codon])]
        ref_aa = ','.join([translate.get(c,'?') for c in ref_codons])
        new_aa = []
        for sample in new_codon:
            expanded = [''.join(c) for codon in sample
                        for c in product(*[iupac.get(b,b) for b in codon])]
            new_aa.append(','.join([translate.get(c,'?') for c in expanded]))
        for chr_name,pos,refbase,variants,cds,strand,_codon,_shift in _buffer:
            result = [chr_name, pos+1, refbase] + list(variants) + [cds, strand, ref_aa]
            if refbase == "*":
                result.append("indel")
            else:
                result.extend(new_aa)
            outex.write("\t".join([str(r) for r in result])+"\n")

    #############################################################
    snp_stream = FeatureStream(allsnps, fields=['chr','start','end','ref']+sample_names)
    inclstream = concat_fields(snp_stream, infields=snp_stream.fields[3:], as_tuple=True)
    try:
        annotstream = concat_fields(assembly.annot_track('CDS',chrom),
                                    infields=['name','strand','frame'], as_tuple=True)
        # Carry the CDS start/end inside the last field so they survive the intersection.
        annotstream = FeatureStream((x[:3]+(x[1:3]+x[3],) for x in annotstream),fields=annotstream.fields)
    except Exception:
        # Best effort: no usable annotation means nothing to write.
        return False
    nsamples = len(sample_names)
    _buffer = {1:[], -1:[]}
    last_start = {1:-1, -1:-1}
    logfile.write("  Intersection with CDS - codon changes\n"); logfile.flush()
    # `with` guarantees the output file is closed even if the loop raises.
    with open(outexons,"a") as outex:
        for x in gm_stream.intersect([inclstream, annotstream]):
            # x = (chr,   start,end, (alt1,alt2,   , start1,end1,cds1,strand1,phase1,  start2,end2,cds2,strand2,phase2    ))
            # x = ('chrV',1606,1607, ('T','C (43%)', 1612,1724,'YEL077C|YEL077C',-1,0, 1712,1723,'YEL077W-A|YEL077W-A',1,0))
            chr_name = x[0]; pos = x[1]; rest = x[3]
            refbase = rest[0]
            annot_fields = rest[nsamples+1:]
            # Floor division: identical on Python 2, and still an int on Python 3.
            annot = [annot_fields[5*i : 5*i+5]
                     for i in range(len(annot_fields)//5)] # list of (start,end,cds,strand,phase)
            for es,ee,cds,strand,phase in annot:
                if strand == 1:
                    shift = (pos-es-phase) % 3
                elif strand == -1:
                    shift = (pos-ee+phase) % 3
                else:
                    continue
                codon_start = pos-shift
                ref_codon = assembly.fasta_from_regions({chr_name: [[codon_start,codon_start+3]]}, out={},
                                                        path_to_ref=genomeRef.get(chr_name))[0][chr_name][0]
                info = [chr_name,pos,refbase,list(rest[1:nsamples+1]),cds,strand,
                        ref_codon.upper(),shift]
                # Either the codon is the same as the previous one on this strand, or it will never be.
                # Only if one codon is passed, can write its snps to a file.
                if codon_start == last_start[strand]:
                    _buffer[strand].append(info)
                else:
                    _write_buffer(_buffer[strand],outex)
                    _buffer[strand] = [info]
                    last_start[strand] = codon_start
        # Flush whatever is still pending on each strand.
        for strand in [1,-1]:
            _write_buffer(_buffer[strand],outex)
    return True
Beispiel #4
0
def exon_snps(chrom,
              outexons,
              allsnps,
              assembly,
              sample_names,
              genomeRef={},
              logfile=sys.stdout,
              debugfile=sys.stderr):
    """Annotate the SNPs that fall inside CDS regions with the codon change they cause.

    The SNPs in `allsnps` are intersected with the 'CDS' annotation of `assembly`
    on `chrom`. For every (SNP, CDS) pair one tab-separated line is appended to
    `outexons` with the columns: chromosome, position+1, reference base, one
    column per sample with the observed variant, CDS name, strand, translation(s)
    of the reference codon, and then either the literal "indel" (reference base
    is "*") or one column per sample with the translation(s) of the new codon(s).
    SNPs hitting the same codon on the same strand are combined before translation.

    :param chrom: (str) chromosome name.
    :param outexons: (str) name of the file to which the SNPs on exons are appended.
    :param allsnps: list of tuples (chr,start,end,ref,alt1..altN) as returned by all_snps().
        Ex: [('chr', 3684115, 3684116, 'G', 'G', 'G', 'G', 'T (56% of 167)', 'G'), ...]
    :param assembly: genrep.Assembly object
    :param sample_names: list of sample names.
    :param genomeRef: dict of the form {'chr1': filename}, where filename is the name of a fasta file
        containing the reference sequence for the chromosome. Only read, never modified here.
    :param logfile: writable file-like object receiving a progress message.
    :param debugfile: unused in this function.
    :returns: False if the CDS annotation stream could not be set up, True otherwise.
    """
    def _write_buffer(_buffer, outex):
        """Write one line per buffered SNP; all entries of `_buffer` share one codon and strand."""
        if not _buffer:
            return
        new_codon = None
        # One position at a time
        for chr_name, pos, refbase, variants, cds, strand, ref_codon, shift in _buffer:
            # Ex: ['G','G','G','T (56% of 167)','G'],  ['A/A','G/G (100% of 7)']
            varbase = list(variants)
            if new_codon is None:
                # Every sample starts from the unmodified reference codon.
                new_codon = [[ref_codon] for _ in varbase]
            alleles = []  # [[variants sample1], [variants sample2], ...]
            # One sample at a time
            for variant in varbase:
                if variant in ['0', '-']:
                    alleles.append([refbase])
                else:  # Ex: 'C/G (80% of 10)' : heterozygous simple (ref is C) or double snp (ref is not C)
                    v = variant.split()[0]  # Ex: C/G
                    v = unique(v.split('/'))  # Ex: G/G -> 'G'
                    if refbase in v:
                        v.remove(refbase)  # Ex: C/G -> 'G' (if ref is C)
                    alleles.append(v)
            # One sample at a time: substitute each allele into each codon built so far.
            for k, v in enumerate(alleles):
                cnumb = len(new_codon[k])
                newc = new_codon[k] * len(v)
                for i, vari in enumerate(v):
                    for j in range(cnumb):
                        idx = i * cnumb + j
                        newc[idx] = newc[idx][:shift] + vari + newc[idx][shift + 1:]
                        assert ref_codon[shift] == refbase, "bug with shift within codon"
                new_codon[k] = newc
        # `strand` and `ref_codon` still hold the values of the last buffered entry;
        # the caller keeps one buffer per strand and per codon start.
        if strand == -1:
            ref_codon = revcomp(ref_codon)
            new_codon = [[revcomp(s) for s in c] for c in new_codon]
        # Expand ambiguity codes and translate ONCE per codon. Doing this inside the
        # per-row loop fed the already expanded list of codons back into product(),
        # which corrupted the reference translation from the second row on.
        ref_codons = [
            ''.join(c) for c in product(*[iupac.get(b, b) for b in ref_codon])
        ]
        ref_aa = ','.join([translate.get(c, '?') for c in ref_codons])
        new_aa = []
        for sample in new_codon:
            expanded = [
                ''.join(c) for codon in sample
                for c in product(*[iupac.get(b, b) for b in codon])
            ]
            new_aa.append(','.join([translate.get(c, '?') for c in expanded]))
        for chr_name, pos, refbase, variants, cds, strand, _codon, _shift in _buffer:
            result = [chr_name, pos + 1, refbase] + list(variants) \
                     + [cds, strand, ref_aa]
            if refbase == "*":
                result.append("indel")
            else:
                result.extend(new_aa)
            outex.write("\t".join([str(r) for r in result]) + "\n")

    #############################################################
    snp_stream = FeatureStream(allsnps,
                               fields=['chr', 'start', 'end', 'ref'] +
                               sample_names)
    inclstream = concat_fields(snp_stream,
                               infields=snp_stream.fields[3:],
                               as_tuple=True)
    try:
        annotstream = concat_fields(assembly.annot_track('CDS', chrom),
                                    infields=['name', 'strand', 'frame'],
                                    as_tuple=True)
        # Carry the CDS start/end inside the last field so they survive the intersection.
        annotstream = FeatureStream(
            (x[:3] + (x[1:3] + x[3], ) for x in annotstream),
            fields=annotstream.fields)
    except Exception:
        # Best effort: no usable annotation means nothing to write.
        return False
    nsamples = len(sample_names)
    _buffer = {1: [], -1: []}
    last_start = {1: -1, -1: -1}
    logfile.write("  Intersection with CDS - codon changes\n")
    logfile.flush()
    # `with` guarantees the output file is closed even if the loop raises.
    with open(outexons, "a") as outex:
        for x in gm_stream.intersect([inclstream, annotstream]):
            # x = (chr,   start,end, (alt1,alt2,   , start1,end1,cds1,strand1,phase1,  start2,end2,cds2,strand2,phase2    ))
            # x = ('chrV',1606,1607, ('T','C (43%)', 1612,1724,'YEL077C|YEL077C',-1,0, 1712,1723,'YEL077W-A|YEL077W-A',1,0))
            chr_name = x[0]
            pos = x[1]
            rest = x[3]
            refbase = rest[0]
            annot_fields = rest[nsamples + 1:]
            # Floor division: identical on Python 2, and still an int on Python 3.
            annot = [
                annot_fields[5 * i:5 * i + 5]
                for i in range(len(annot_fields) // 5)
            ]  # list of (start,end,cds,strand,phase)
            for es, ee, cds, strand, phase in annot:
                if strand == 1:
                    shift = (pos - es - phase) % 3
                elif strand == -1:
                    shift = (pos - ee + phase) % 3
                else:
                    continue
                codon_start = pos - shift
                ref_codon = assembly.fasta_from_regions(
                    {chr_name: [[codon_start, codon_start + 3]]},
                    out={},
                    path_to_ref=genomeRef.get(chr_name))[0][chr_name][0]
                info = [
                    chr_name, pos, refbase,
                    list(rest[1:nsamples + 1]), cds, strand,
                    ref_codon.upper(), shift
                ]
                # Either the codon is the same as the previous one on this strand, or it will never be.
                # Only if one codon is passed, can write its snps to a file.
                if codon_start == last_start[strand]:
                    _buffer[strand].append(info)
                else:
                    _write_buffer(_buffer[strand], outex)
                    _buffer[strand] = [info]
                    last_start[strand] = codon_start
        # Flush whatever is still pending on each strand.
        for strand in [1, -1]:
            _write_buffer(_buffer[strand], outex)
    return True