Ejemplo n.º 1
0
    def __init__(self, samfile, seqfile=None, zerobase=False, use_slider=False):
        """Prepare the line source and optional reference sequence index.

        samfile    -- a path (opened for reading) or an already opened
                      line source, which is used as given.
        seqfile    -- optional sequence file handed to GiantFASTAFile.
        zerobase   -- stored as-is on the instance.
        use_slider -- when true, the line source is passed through
                      rnarry.batchutils.slider_file first.
        """
        stream = open(samfile) if isinstance(samfile, basestring) else samfile

        if use_slider:
            # Imported lazily so the dependency is only needed on request.
            from rnarry import batchutils
            stream = batchutils.slider_file(stream)

        # The reader wrapper is always the outermost layer.
        self.samfile = QueueableLineReader(stream)

        self.seqfile = seqfile
        if seqfile is not None:
            self.seqidx = GiantFASTAFile(seqfile)

        self.zerobase = zerobase
        self.seqlen = {}
Ejemplo n.º 2
0
                '%s' % dbinfo['geneName'], '.', '+'
            ])
        else:
            print >> bedout, '\t'.join([
                nracc, '0',
                str(exonlength),
                '%s' % dbinfo['geneName'], '.', '+'
            ])

        print >> faout, '>%s %s' % (nracc, dbinfo['geneName'])
        faout.write(textwrap(mutatedseq))


if __name__ == '__main__':
    # Positional command-line arguments, in order: refFlat shelve database,
    # accession list file, genome FASTA, RNA-seq gspace, FASTA output path,
    # BED output path.
    refflatdbpath = sys.argv[1]
    nrlistpath = sys.argv[2]
    genomefastapath = sys.argv[3]
    rnaseqgspace = sys.argv[4]
    fastaoutpath = sys.argv[5]
    bedoutpath = sys.argv[6]

    refFlat = shelve.open(refflatdbpath, 'r')
    try:
        mm9 = GiantFASTAFile(genomefastapath)
        rseqarr = MultiTrackSplitBinnedArray(rnaseqgspace)

        # Whitespace-separated list of accessions; close the handle right
        # after reading instead of leaving it to the garbage collector.
        with open(nrlistpath) as nrlistfile:
            nrlist = nrlistfile.read().split()

        # process() only receives nrlist, so the output handles keep their
        # module-level names. The with-blocks guarantee both files are
        # flushed and closed even when process() raises.
        with open(bedoutpath, 'w') as bedout:
            with open(fastaoutpath, 'w') as faout:
                process(nrlist)
    finally:
        refFlat.close()
Ejemplo n.º 3
0
        output.write(name.ljust(32, '\x00'))
        output.write(struct.pack('II', len(refseq), len(grp)))
        output.write(refseq)
        for _, start, end, nreads in grp:
            output.write(struct.pack('III', start, end, nreads))


if __name__ == '__main__':
    import sys

    refseqfile = sys.argv[1] # fasta format
    mappingfile = sys.argv[2] # gzipped gmap native format
    # NOTE(review): documented as a pickle; presumably load_profile
    # unpickles it, so only trusted files should be passed -- confirm.
    readerrorfile = sys.argv[3] # python pickle format
    packfile = sys.argv[4]

    greffile = GiantFASTAFile(refseqfile)
    mappingf = gzip.open(mappingfile)
    try:
        # The pack file receives struct.pack() output, so it must be opened
        # in binary mode; text mode can corrupt it on platforms that
        # translate newlines.
        with open(packfile, 'wb') as packf:
            # Single-argument print(...) emits the same text on Python 2
            # and Python 3.
            print('Loading read error profile')
            profile, maxpos = load_profile(readerrorfile)

            print('Dumping read error profile')
            dump_profile(profile, maxpos, packf)

            print('Loading gmap mapping file')
            spans = scan_gmap(mappingf)

            print('Dumping mapped reads')
            dump_gmap(spans, packf, greffile)
    finally:
        mappingf.close()
Ejemplo n.º 4
0
            print >> outd, kcnt, refbase, '%.6f' % del_ratio
            print >> outm, kcnt, refbase, '%.6f' % mod_ratio
            print >> outmd, kcnt, refbase, '%.6f' % moddel_ratio
            print >> oute, kcnt, refbase, '%.6f' % sentropy


if __name__ == '__main__':
    import sys

    mappingfile = sys.argv[1]  # gmap native format
    greffile = sys.argv[2]
    outputprefix = sys.argv[3]

    mappingf = gzip.open(mappingfile)
    # From here on greffile is the FASTA accessor, not the path.
    greffile = GiantFASTAFile(greffile)

    def opensortedout(fname):
        """Open a write pipe that sorts, counts and gzips lines into fname."""
        pipeline = (
            "sort -k1,1n -k2,2 -k3,3r | uniq -c | "
            "awk '{ print $2, $3, $4, $1; }' | gzip - >%s") % fname
        return os.popen(pipeline, 'w')

    nonzeroout = gzip.open(outputprefix + 'nonzero.posrcnt.gz', 'w')

    # One sorted/counted output pipe per distribution key, opened in the
    # order listed.
    distout = dict(
        (key, opensortedout(outputprefix + suffix))
        for key, suffix in (
            ('D', 'del.real.gz'),
            ('M', 'mod.real.gz'),
            ('MD', 'moddel.real.gz'),
            ('E', 'entropy.real.gz'),
        ))

    spans = scan_gmap(mappingf)
Ejemplo n.º 5
0
class SAMParser(object):
    """Iterate over SAM text, yielding one record per run of same-named reads.

    Consecutive lines sharing a query name are merged into a single record.
    Its 'mapped' list holds the hit of the first line, the hits listed in
    that line's XA optional field and the hits of the following same-named
    lines, sorted by edit distance.
    """

    def __init__(self, samfile, seqfile=None, zerobase=False, use_slider=False):
        """Set up the line source and the optional reference index.

        samfile    -- a path (opened for reading) or a line source used as is.
        seqfile    -- optional sequence file handed to GiantFASTAFile; needed
                      by getsubseq() and by iteralignments(withref=True).
        zerobase   -- when true, reported start positions are shifted to
                      0-based (the stop value is left unchanged).
        use_slider -- when true, the line source is passed through
                      rnarry.batchutils.slider_file first.
        """
        if isinstance(samfile, basestring):
            self.samfile = open(samfile)
        else:
            self.samfile = samfile

        if use_slider:
            from rnarry import batchutils
            self.samfile = batchutils.slider_file(self.samfile)

        self.samfile = QueueableLineReader(self.samfile)
        self.seqfile = seqfile
        # Always define seqidx so the attribute exists even without a
        # sequence file.
        if seqfile is not None:
            self.seqidx = GiantFASTAFile(seqfile)
        else:
            self.seqidx = None
        self.zerobase = zerobase
        # Filled from @SQ header lines while iterating: name -> length.
        self.seqlen = {}

    def getsubseq(self, name, seqfrom, seqto): # [from, to) both 0-base
        """Return the reference subsequence of `name` for [seqfrom, seqto).

        Raises ValueError when the parser was created without a sequence
        file.
        """
        if self.seqfile is None:
            raise ValueError('Sequence file is not given.')

        return self.seqidx.get(name, seqfrom, seqto)

    def __iter__(self):
        """Iterate with the default arguments of iteralignments()."""
        return self.iteralignments()

    def _make_hit(self, rname, pos, strand, editdist, cigar):
        """Build one hit tuple from a 1-based leftmost position and a CIGAR."""
        reflen, _ = calculate_cigar_length(cigar)
        stop = pos + reflen - 1
        start = pos - 1 if self.zerobase else pos
        return (rname, start, stop, strand, editdist, cigar)

    def _hit_from_record(self, fields, flags, options, strands):
        """Return the hit of one SAM line, or None if unmapped or filtered."""
        rname = fields[2]
        pos = int(fields[3]) # 1-based leftmost
        cigar = fields[5]
        strand = '-' if flags & F_REVERSE_STRAND else '+'
        # Missing NM is reported as an edit distance of -1.
        editdist = int(options.get('NM', 'i:-1')[2:])

        if rname == '*' or strand not in strands:
            return None
        return self._make_hit(rname, pos, strand, editdist, cigar)

    # WARNING: 'mapped' coordinates are 1-based, both side inclusive by default
    def iteralignments(self, strands='+-', withref=False):
        """Yield one dict per group of consecutive same-named SAM lines.

        strands -- strand characters to keep; hits on other strands are
                   left out of 'mapped'.
        withref -- when true, each hit tuple gets the reference subsequence
                   appended (reverse-complemented for '-' hits); this needs
                   a sequence file.

        Each yielded dict has the keys 'qname', 'flags', 'mapq', 'seq',
        'options' and 'mapped'. The first five come from the first line of
        the group; 'seq' is reverse-complemented when that line has the
        reverse-strand flag set. 'mapped' is a list of
        (rname, start, stop, strand, editdist, cigar[, refseq]) tuples
        sorted by edit distance. @SQ header lines encountered on the way
        are recorded in self.seqlen.
        """
        for line in self.samfile:
            # rstrip rather than line[:-1]: a final line without a newline
            # must not lose its last character.
            fields = line.rstrip('\r\n').split('\t')

            if line[0] == '@':
                if line[:3] != '@SQ':
                    continue
                sqname = fields[1][3:]
                sqlen = [int(fl[3:]) for fl in fields[2:] if fl[:3] == 'LN:'][0]
                self.seqlen[sqname] = sqlen
                continue

            qname = fields[0]
            flags = int(fields[1])
            mapq = int(fields[4]) # phred-scaled
            seq = fields[9]
            options = dict(v.split(':', 1) for v in fields[11:])

            if flags & F_REVERSE_STRAND:
                seq = reverse_complement(seq)

            mapped = []
            primary = self._hit_from_record(fields, flags, options, strands)
            if primary is not None:
                mapped.append(primary)

            # Alternative hits listed in the XA field as
            # "rname,<strand><pos>,cigar,editdist;" entries. Empty pieces are
            # skipped so a missing trailing ';' does not drop the last one.
            for altmatch in options.get('XA', 'Z:')[2:].split(';'):
                if not altmatch:
                    continue
                xafields = altmatch.split(',')
                xastrand = xafields[1][0]
                xahit = self._make_hit(xafields[0], int(xafields[1][1:]),
                                       xastrand, int(xafields[3]),
                                       xafields[2])
                if xastrand in strands:
                    mapped.append(xahit)

            # search for alternative reads
            for altline in self.samfile:
                altfields = altline.rstrip('\r\n').split('\t')
                if altfields[0] != qname:
                    # First line of the next group: hand it back to the
                    # reader so the outer loop sees it.
                    self.samfile.push(altline)
                    break

                altflags = int(altfields[1])
                altoptions = dict(v.split(':', 1) for v in altfields[11:])
                althit = self._hit_from_record(altfields, altflags,
                                               altoptions, strands)
                if althit is not None:
                    mapped.append(althit)

            mapped.sort(key=lambda hit: hit[4])

            if withref:
                # getsubseq() takes a 0-based half-open interval. The start
                # is only 1-based when zerobase is off, so shift it only
                # then.
                shift = 0 if self.zerobase else 1
                newmapped = []
                for hit in mapped:
                    subseq = self.getsubseq(hit[0], hit[1] - shift, hit[2])
                    if hit[3] == '-':
                        subseq = reverse_complement(subseq)
                    newmapped.append(hit + (subseq,))
                mapped = newmapped

            yield {
                'qname': qname,
                'flags': flags,
                'mapq': mapq,
                'seq': seq,
                'options': options,
                'mapped': mapped, # 1-based inclusive unless zerobase is set
            }