Example #1
def oldtails():
    aligns = M5File(primary)
    #where I'm putting the good hits
    mapOut = open(outname, "w")

    #where I'm putting the tails
    tfq = NamedTemporaryFile(prefix="tails_", suffix=".fastq", delete=False, dir=basedir)
    ALLTEMPFILES.append( tfq.name )
    whichEnd = defaultdict(list)
    #extract the tails
    ntails = 0
    for a in aligns:
        if a.qstart >= MINTAIL:
            tseq1 = reads[a.qname].subSeq(None, a.qstart)
            #prolog
            tseq1.name = "%s_::_5_::_%d,%d" % (tseq1.name, a.qstart, a.qseqlength)
            tfq.write(str(tseq1))
            ntails += 1
        if a.qseqlength - a.qend > MINTAIL:  #3' tail length
            tseq2 = reads[a.qname].subSeq(a.qend, None)
            #epilog
            tseq2.name = "%s_::_3_::_%d,%d" % (tseq2.name, a.qend, a.qseqlength)
            tfq.write(str(tseq2))
            ntails += 1
        mapOut.write(str(a)+"\n")
        #don't want redundant hits on a single flank
        whichEnd[a.qname].append(a.tname)
    tfq.close()
    logging.info("%d unmapped tails" % (ntails))
    #map tails
    tailAlign = NamedTemporaryFile(prefix="tails_", suffix=".m5", delete=False, dir=basedir)
    tailAlign = tailAlign.name
    ALLTEMPFILES.append(tailAlign)
    blasr(tfq.name, target, nproc=nproc, bestn=1, outname=tailAlign)
    aligns2 = M5File(tailAlign)
    logging.info("%d tails mapped" % len(aligns2))
    for a in aligns2:
        #get the carryon info
        name, direct, se = a.qname.split("_::_")
        pos, length = map(int, se.split(','))
        #correct its information
        a.qname = name
        a.qseqlength = length
        #prevent redundant flank map
        if a.tname in whichEnd[a.qname]:
            logging.info("%s failed ref map" % a.tname)
            continue
        whichEnd[a.qname].append(a.tname)
        #epilogs need to be updated
        if direct == '3':
            a.qstart += pos
            a.qend += pos
        mapOut.write(str(a)+"\n")
    mapOut.close()

    return
Example #2
def m5ToOvlGraph(readNames, fileName):
    """
    Create the graph
    """
    connector = AlignmentConnector()
    alignments = M5File(fileName)
    graph = nx.Graph()

    #filt = []
    #get only the single best alignment between any two reads
    fdict = {}
    for align in alignments:
        if align.qname == align.tname:
            continue
        name = [align.qname, align.tname]
        name.sort()
        name = ":".join(name)
        if name in fdict:
            if align.score < fdict[name].score:
                fdict[name] = align
        else:
            fdict[name] = align

    alignments = fdict.values()
    #make edges for all overlaps
    for align in alignments:
        if align.qname == align.tname:
            continue
        extend = connector.extendsTarget(align)
        align.support = extend
        if extend != SUPPORTFLAGS.none:
            graph.add_edge(align.qname, align.tname, data=align)

    return graph
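A minimal usage sketch, not part of the original source: it assumes an all-against-all blasr .m5 file; the name "overlaps.m5" is hypothetical, and the readNames argument is unused by the function as written, so None is passed.

graph = m5ToOvlGraph(None, "overlaps.m5")
#every edge stores the winning alignment under the "data" attribute
for qname, tname, attrs in graph.edges(data=True):
    align = attrs["data"]
    print qname, tname, align.score, align.support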
Example #3
def run(argv):
    print argv
    args = parseArgs(argv)
    if args.m4.endswith("m5"):
        aligns = M5File(args.m4)
    else:
        aligns = M4File(args.m4)
    if args.reads.endswith("fasta"):
        reads = FastaFile(args.reads)
    elif args.reads.endswith("fastq"):
        temp = FastqFile(args.reads)
        reads = {}
        for i in temp:
            reads[i] = temp[i].seq
        del (temp)
    else:
        logging.error("Expected Fasta or Fastq for READS (%s)" % args.reads)
        exit(1)

    logging.info("Extracting tails")
    tailfastq = tempfile.NamedTemporaryFile(suffix=".fasta",
                                            delete=False,
                                            dir=args.temp)
    tailfastq.close()
    tailfastq = tailfastq.name
    logging.debug("Tail read tmp file %s " % (tailfastq))
    r, t, m = extractTails(aligns,
                           reads,
                           outFq=tailfastq,
                           minLength=args.minTail)

    logging.info("Parsed %d reads" % (r))
    logging.info("Found %d tails" % (t))
    logging.info("%d reads had double tails" % (m))
    if t == 0:
        logging.info("No tails -- Exiting")
        exit(0)

    logging.info("Mapping Tails")
    tailmap = tempfile.NamedTemporaryFile(suffix=".m4",
                                          delete=False,
                                          dir=args.temp)
    tailmap.close()
    tailmap = tailmap.name
    logging.debug("Read map tmp file %s " % (tailmap))
    mapTails(tailfastq,
             args.ref,
             nproc=args.nproc,
             out=tailmap,
             useSa=args.noSa)

    logging.info("Consolidating alignments")
    logging.debug("Final file %s " % (args.output))
    n = uniteTails(aligns, tailmap, args.output, args.inplace)
    logging.info("%d tails mapped" % (n))
Example #4
def singleOverlapAssembly(alldata, args):
    """

    """
    global ALLTEMPFILES
    data = alldata.stats
    reads = NamedTemporaryFile(prefix="sol_", suffix=".fasta", delete=False, dir=args.tempDir)
    ALLTEMPFILES.append(reads.name)
    e1Seq = data["extendSeq1"]; e2Seq = data["extendSeq2"]
    reads.write(">%s\n%s\n>%s\n%s\n" % ("seq1", e1Seq, "seq2", e2Seq))
    reads.close()

    alignFn = NamedTemporaryFile(prefix="sol_",suffix=".m5", delete=False, dir=args.tempDir)
    ALLTEMPFILES.append(alignFn.name)
    blasr(reads.name, reads.name, nproc=args.nproc, outname=alignFn.name)
    aligns = M5File(alignFn.name)
    # find best hit between the two
    connector = AlignmentConnector()
    bestS = 0  #blasr scores are negative, so any real hit scores below 0
    bestA = None
    for i in aligns:
        if i.qname != i.tname:
            if connector.extendsTarget(i):
                if i.score < bestS:
                    bestA = i
                    bestS = i.score
    if bestA is None:
        logging.info("no overlap between extenders")
        return

    #any of these steps could fail --
    #Ensure the hit is valid
    #(if + + and sameStrand we are okay, if - + and not sameStrand we are okay)
    if data["sameStrand"] == (bestA.tstrand == '0'):
        logging.info("bad overlap between extenders")
        return

    con = consensus([bestA])
    #strand correction...
    if bestA.qname == "seq1":
        if bestA.tstrand == '1':
            e2Seq = e2Seq[:bestA.tstart].translate(revComp)[::-1]
            seq = e1Seq[:bestA.qstart] + con.sequence.translate(revComp)[::-1] + e2Seq
        else:
            seq = e1Seq[:bestA.qstart] + con.sequence + e2Seq[bestA.tend:]
    else:
        if bestA.tstrand == '1':
            e2Seq = e2Seq[:bestA.qstart].translate(revComp)[::-1]
            seq = e1Seq[:bestA.tstart] + con.sequence + e2Seq
        else:
            seq = e1Seq[:bestA.qstart] + con.sequence + e2Seq[bestA.tstart:]

    return seq
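A minimal driving sketch, not from the original: singleOverlapAssembly only reads alldata.stats plus args.tempDir and args.nproc, so hypothetical stand-in objects are enough to call it, assuming blasr, consensus, revComp, AlignmentConnector, M5File, and the ALLTEMPFILES list are in scope as in the module above.

from collections import namedtuple

GapData = namedtuple("GapData", "stats")          #hypothetical stand-in for the gap record
RunArgs = namedtuple("RunArgs", "tempDir nproc")  #hypothetical stand-in for parsed args
flank1Seq = "ACGTACGTGG" * 100                    #hypothetical flank-extension sequences
flank2Seq = "GGTTACGTAC" * 100
data = GapData(stats={"extendSeq1": flank1Seq,
                      "extendSeq2": flank2Seq,
                      "sameStrand": True})
merged = singleOverlapAssembly(data, RunArgs(tempDir="/tmp", nproc=4))
if merged is None:
    print "flank extensions do not overlap"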
Example #5
    def consensusCalling(self, spot, bam, reference, args):
        """
        Make a consensus of all the reads in the region and identify all of the SVs in the region
        """
        #
        MAXNUMREADS = 100  #I don't think we'll need more than this many reads
        MAXATTEMPTS = MAXNUMREADS / 2  #I don't feel like trying 100 times
        SPANBUFFER = 100  #number of bases I want a read to span

        chrom, start, end = spot.chrom, spot.start, spot.end
        buffer = args.buffer

        supportReads = []
        spanReads = []
        #Fetch reads and trim
        totCnt = 0
        for read in bam.fetch(chrom, max(0, start - buffer - SPANBUFFER),
                              end + buffer + SPANBUFFER):
            if read.qname not in spot.varReads:
                continue
            seq, qual = self.readTrim(read, start - buffer, end + buffer)
            if read.pos < start - SPANBUFFER and read.aend > end + SPANBUFFER:
                spanReads.append((len(seq), seq, qual))
            else:
                supportReads.append((seq, qual))
            totCnt += 1

        if len(spanReads) == 0:
            logging.debug("noone spans - consensus aborted. %s" % (str(spot)))
            spot.tags["noSpan"] = True
            return [spot]

        spanReads.sort(reverse=True)
        if len(spanReads) > MAXNUMREADS:
            origSupportReads = [(x[1], x[2]) for x in spanReads[:MAXNUMREADS]]
        elif len(spanReads) + len(supportReads) > MAXNUMREADS:
            origSupportReads = [(x[1], x[2]) for x in spanReads
                                ] + supportReads[:MAXNUMREADS - len(spanReads)]
        else:
            origSupportReads = [(x[1], x[2]) for x in spanReads] + supportReads
        logging.debug("Alt reads: %d total, %d extra support" %
                      (totCnt, len(origSupportReads)))

        mySpots = []
        refReadId = 0
        haveVar = False

        #Attempt each spanRead until we get one that passes
        #while refReadId < len(spanReads) and not haveVar and refReadId < MAXATTEMPTS:
        #refread = spanReads[refReadId]
        #supportReads = origSupportReads[:refReadId] + origSupportReads[refReadId+1:]
        refReadId += 1

        #read that spans most of the region goes first
        #use the rest for cleaning

        #building consensus sequence
        foutreads = NamedTemporaryFile(suffix=".fasta")
        qoutreads = open(foutreads.name + '.qual', 'w')
        for id, i in enumerate(origSupportReads):
            foutreads.write(">%d\n%s\n" % (id, i[0]))
            qoutreads.write(">%d\n%s\n" %
                            (id, " ".join(str(ord(j) - 33) for j in i[1])))
        foutreads.flush()
        qoutreads.flush()

        #foutref = NamedTemporaryFile(suffix=".fasta")
        #foutref.write(">%s:%d-%d\n%s" % (spot.chrom, start, end, refread[1]))
        #foutref.flush()

        logging.debug("Making the contig....")
        #run it through phrap
        #make out.fasta and out.fasta.qual
        #run phrap
        #if asm -- consensus only
        r, o, e = exe("phrap %s -minmatch 6 -minscore 20" % (foutreads.name),
                      timeout=3)

        if r != 0:  #failed
            logging.warning('phrap failed ' + self.name)
            logging.warning(o)
            logging.warning(e)
            return []  #here is where I'd like to add just the no-consensus spot

        results = mergeFastaQual(foutreads.name + ".contigs",
                                 foutreads.name + ".contigs.qual")
        if len(results) == 0:
            logging.warning('no asm made ' + self.name)
            return []  #here is where I'd like to add just the no-consensus spot
        logging.info('%d contigs made %s' % (len(results), self.name))

        #then run it through consensus
        logging.debug("Polishing contigs")

        alignOut = NamedTemporaryFile(suffix=".m5")
        blasr(foutreads.name,
              foutreads.name + ".contigs",
              format="-m 5",
              nproc=1,
              outname=alignOut.name)
        # elif no asm and consensus only (faster)

        if args.polish == "pbbanana":
            aligns = M5File(alignOut.name)
            con = ">con\n%s\n" % consensus(aligns).sequence
            conName = "pbbanana"
        elif args.polish == "pbdagcon":
            logging.debug("pbdagcon is running")
            #using minErrReads - 1 because one of them is already being used as the seed!
            r, con, e = exe("pbdagcon -c %d -t 0 %s" %
                            (max(0, args.minErrReads - 1), alignOut.name),
                            timeout=1)
            #r, con, e = exe("pbdagcon %s" % (alignOut.name), timeout=2)
            logging.debug("back from pbdagcon")
            logging.debug((r, e))
            #raw_input("press ent")
            if con is not None:
                con = con[con.index("\n") + 1:]
            else:
                con = ""
            conName = "pbdagcon"
        alignOut.close()
        #foutref.close()
        foutreads.close()
        #no consensus from the polisher -- fall back to the first phrap contig
        if len(con) == 0:
            logging.debug("No consensus produced; falling back to the raw contig")
            con = ">con\n%s\n" % (results.values()[0].seq)
            conName = "phrap"
        logging.debug("%s %d bp seq" % (conName, len(con.split('\n')[1])))

        #try improving consensus
        conOut = NamedTemporaryFile(suffix=".fasta")
        conOut.write(con)
        #conOut.close()
        conOut.flush()

        refOut = NamedTemporaryFile(suffix=".fasta")
        #j = reference.fetch(chrom, max(0, start-buffer), end+buffer)
        #fout = open("f****e.ref.fasta",'w')
        #fout.write(j)
        #fout.close()
        refOut.write(">%s:%d-%d\n%s\n" % (chrom, start, end, \
                    reference.fetch(chrom, max(0, start-buffer), end+buffer)))
        refOut.flush()

        #map consensus to refregion
        varSam = NamedTemporaryFile(suffix=".sam")
        blasr(conOut.name, refOut.name, format="--sam", outname=varSam.name)
        #consensus=False) -- would this help?
        #or what if I fed it through leftalign?

        sam = pysam.Samfile(varSam.name)

        matches = 0.0
        bases = 0.0
        nReads = 0
        mySpots = []
        for read in sam:
            nReads += 1
            spot.tags["consensusCreated"] = True
            for svstart, svsize, svtype, altseq in expandCigar(
                    read, args.minIndelSize, CONFIRMCOLLAPSE, True):
                newspot = copy.deepcopy(spot)

                if spot.svtype == svtype and svtype == "INS":
                    haveVar = True
                    newspot.start = svstart + start - buffer
                    newspot.end = svstart + start - buffer
                    newspot.tags["seq"] = altseq
                    newspot.size = svsize
                    gt, gq = genotype(newspot)
                    newspot.tags["GT"] = gt
                    newspot.tags["GQ"] = gq
                    mySpots.append(newspot)

                elif spot.svtype == svtype and svtype == "DEL":
                    haveVar = True
                    newspot.start = svstart + start - buffer
                    newspot.end = svstart + svsize + start - buffer
                    newspot.size = -svsize
                    gt, gq = genotype(newspot)
                    newspot.tags["GT"] = gt
                    newspot.tags["GQ"] = gq
                    newspot.tags["seq"] = reference.fetch(
                        chrom, newspot.start, newspot.end)
                    mySpots.append(newspot)
        #identity = matches/bases
        #If no var, nothing is returned.
        #for newspot in mySpots:
        #newspot.tags["alnIdentityEstimate"] = identity
        #Keep reporting the actual contigs out until we
        #find a reason to need it (and also we can get quals...)
        #vbam.reset()
        #for id, read in enumerate(vbam):
        #newspot.tags["contigSeq%d" % (id)] = read.seq
        #newspot.tags["contigQual%d" % (id)] = read.qual

        #vbam.close()
        #varBam.close()
        refOut.close()

        logging.debug("%d consensus reads created %d spots" %
                      (nReads, len(mySpots)))

        return mySpots
Example #6
#!/usr/bin/python
import sys
from pbsuite.utils.FileHandlers import M4File, M5File

if __name__ == '__main__':
    try:
        fn = sys.argv[1]
    except IndexError:
        sys.stderr.write(("Error! Expected One Argument, " \
                          "an m4 or m5 alignment file\n"))
        exit(1)

    if fn.endswith('.m4'):
        aligns = M4File(fn)
    elif fn.endswith('.m5'):
        aligns = M5File(fn)
    else:
        print "Unrecognized File Type (expecting .m4 or .m5)"
        exit(1)

    if len(sys.argv) == 3:
        out = open(sys.argv[2], 'w')
    else:
        out = sys.stdout

    out.write("\n".join(map(lambda x: x.toBed(), file)) + "\n")
Example #7
if __name__ == '__main__':
    args = parseArgs()
    
    alignFile = args.outname+".m5"
    consensusFile = args.outname+".fasta"
    

    #extract the read I'm looking for    
    if args.target is not None:#Name
        tempOut = open("temp.fasta",'w')
        fasta = FastaFile(args.reads)
        tempOut.write(">%s\n%s\n" % (args.target, fasta[args.target]))
        tempOut.close()
        blasr(args.reads, tempOut.name, nproc=args.nproc, outName=alignFile)
        
        aligns = M5File(alignFile)   
        fout = open(consensusFile, 'w')
        results = consensus(aligns)
        fout.write(">pbjpolish_%d_vote_%d_len\n" % (results.contribBases,\
                                     results.fillBases, results.sequence))
        #fout.write(">\n%s\n" % consensus(aligns))
    
        fout.close()    
    elif args.Target is not None:#File
        blasr(args.reads, args.Target, nproc=args.nproc, outName=alignFile)
        
        aligns = M5File(alignFile)   
        fout = open(consensusFile, 'w')
        results = consensus(aligns)
        fout.write(">pbjpolish_%d_vote_%d_len\n%s\n" % (results.contribBases,\
                                     results.fillBases, results.sequence))
Example #8
def preunitereads(inputFastq, args):
    """
    sent query, I'm going to pop all of the united reads onto this
    """
    global ALLTEMPFILES
    alignFile = NamedTemporaryFile(prefix="uni_",
                                   suffix=".m5",
                                   delete=False,
                                   dir=args.tempDir).name
    ALLTEMPFILES.append(alignFile)
    readFile = NamedTemporaryFile(prefix="uni_",
                                  suffix=".fasta",
                                  delete=False,
                                  dir=args.tempDir)
    ALLTEMPFILES.append(readFile.name)

    input = FastqFile(inputFastq)
    for read in input:
        readFile.write(">%s\n%s\n" % (input[read].name, input[read].seq))
    readFile.close()
    readFile = readFile.name
    blasr(readFile,
          readFile,
          bestn=5,
          nCandidates=20,
          nproc=args.nproc,
          outname=alignFile)
    aligns = M5File(alignFile)
    con = AlignmentConnector()
    extenders = []
    for a in aligns:
        if a.tname == a.qname:
            continue
        if a.qend - a.qstart < 500 or a.tend - a.tstart < 500:  #require >=500bp aligned on both reads
            continue
        sup = con.extendsTarget(a, minCovers=500)
        #sup = con.extendsTarget(a, minCovers=100)
        a.support = sup
        if sup in [SUPPORTFLAGS.left, SUPPORTFLAGS.right]:
            extenders.append(a)

    best = {}  #best of queries
    for i in extenders:
        score = 0
        if i.qname in best:
            score = best[i.qname].score

        if i.score < score:
            best[i.qname] = i

    #print "q"
    #for i in best.values():
    #print str(i)

    best2 = {}  #best of targets
    for i in best.values():
        score = 0
        if i.tname in best2:
            score = best2[i.tname].score
        if i.score < score:
            best2[i.tname] = i
    #print "t"
    #for i in best2.values():
    #print str(i)

    best3 = {}  #best of both
    for i in best2.values():
        keys = [i.qname, i.tname]
        keys.sort()
        keys = "".join(keys)
        score = 0
        if keys in best3:
            score = best3[keys].score
        if i.score < score:
            best3[keys] = i
    #print 'b'
    #for i in best3.values():
    #print str(i)

    reads = FastqFile(inputFastq)
    fout = open(inputFastq, 'a')
    count = 0
    for i in best3.values():
        qseq = None
        if i.support == SUPPORTFLAGS.left:
            if i.qstrand == '0':
                qseq = reads[i.qname].seq + reads[i.tname].seq[i.tend:]
            elif i.qstrand == '1':
                qseq = reads[i.qname].seq + reads[
                    i.tname].seq[i.tend:].translate(revComp)
        if i.support == SUPPORTFLAGS.right:
            if i.qstrand == '0':
                qseq = reads[i.tname].seq[:i.tstart] + reads[i.qname].seq
            elif i.qstrand == '1':
                qseq = reads[i.tname].seq[:i.tstart].translate(
                    revComp) + reads[i.qname].seq
        if qseq is not None:
            count += 1
            fout.write("@%s_%s\n%s\n+\n%s\n" %
                       (i.qname, i.tname, qseq, "!" * len(qseq)))
    logging.info("Preunited %d reads" % (count))
    fout.close()
Example #9
def buildFillSeq(data, inputReads, args):
    """
    Using all of the information in the namedtuple returned from getSubSeqs, 
    go through the process of building the filling sequence.

    load the filling sequence into the data
    """
    #try to build span
    if SUPPORTFLAGS.span in data.stats["support"][0]:
        logging.debug("build span")
        alignFile = NamedTemporaryFile(prefix="scon_",
                                       suffix=".m5",
                                       delete=False,
                                       dir=args.tempDir)
        alignFile.close()
        alignFile = alignFile.name
        ALLTEMPFILES.append(alignFile)
        #blasr(data.spanReads, data.spanSeed, bestn = 1, nproc = args.nproc, outname=alignFile)
        blasr(inputReads,
              data.spanSeed,
              bestn=1,
              nproc=args.nproc,
              outname=alignFile)
        aligns = M5File(alignFile)
        if len(aligns) > 0:
            con = consensus(aligns)
            #if successful we're done
            if con.contribBases > 0 and con.fillBases > 0:  #must be
                sequence = con.sequence  #strandCorrector(data.stats["spanSeedStrand1"], con.sequence)
                data.stats["fillSeq"] = sequence
                data.stats["contribSeqs"] = con.contribSeqs
                data.stats["contribBases"] = con.contribBases
                data.stats["fillBases"] = con.fillBases
                return
        else:
            logging.info("no mapping... picking span seq")
            sequence = FastaFile(data.spanSeed).values()[0]
            data.stats["fillSeq"] = sequence
            data.stats["contribSeqs"] = 1
            data.stats["contribBases"] = len(sequence)
            data.stats["fillBases"] = len(sequence)
            return

    #no span -- we need to do flanks
    flank1Success = False
    flank2Success = False
    logging.debug(json.dumps(data.stats, indent=4))
    fl1Flag = SUPPORTFLAGS.left if data.stats["seed1"].endswith(
        "e5") else SUPPORTFLAGS.right
    if data.stats["seed2"] is not None:
        fl2Flag = SUPPORTFLAGS.left if data.stats["seed2"].endswith(
            "e5") else SUPPORTFLAGS.right
    else:
        fl2Flag = None

    logging.debug((fl1Flag, fl2Flag))
    if fl1Flag in data.stats["support"][1]:
        logging.debug("build flank1 %d" % fl1Flag)
        alignFile = NamedTemporaryFile(prefix="f1con_",
                                       suffix=".m5",
                                       delete=False,
                                       dir=args.tempDir)
        alignFile.close()
        alignFile = alignFile.name
        ALLTEMPFILES.append(alignFile)
        #blasr(data.flank1Reads, data.flank1Seed, bestn=1, nproc=args.nproc, outname=alignFile)
        blasr(inputReads,
              data.flank1Seed,
              bestn=1,
              nproc=args.nproc,
              outname=alignFile)
        aligns = M5File(alignFile)
        if len(aligns) > 0:
            con = consensus(aligns)
            if con.contribBases > 0 and con.fillBases > 0:  #must be
                sequence = con.sequence  #strandCorrector(data.stats["extendF1SeedStrand"], con.sequence)
                data.stats["extendSeq1"] = sequence
                data.stats["contribSeqs"] += con.contribSeqs
                data.stats["contribBases"] += con.contribBases
                data.stats["fillBases"] += con.fillBases
                flank1Success = True
        else:
            logging.info("no mapping... picking f1 seq")
            sequence = FastaFile(data.flank1Seed).values()[0]
            data.stats["extendSeq1"] = sequence
            data.stats["contribSeqs"] = 1
            data.stats["contribBases"] = len(sequence)
            data.stats["fillBases"] = len(sequence)
            flank1Success = True

    if fl2Flag in data.stats["support"][2]:
        logging.debug("build flank2 %d" % fl2Flag)
        alignFile = NamedTemporaryFile(prefix="f2con_",
                                       suffix=".m5",
                                       delete=False,
                                       dir=args.tempDir)
        alignFile.close()
        alignFile = alignFile.name
        ALLTEMPFILES.append(alignFile)
        #blasr(data.flank2Reads, data.flank2Seed, bestn=1, nproc=args.nproc, outname=alignFile)
        blasr(inputReads,
              data.flank2Seed,
              bestn=1,
              nproc=args.nproc,
              outname=alignFile)
        aligns = M5File(alignFile)
        if len(aligns) > 0:
            con = consensus(aligns)
            if con.contribBases > 0 and con.fillBases > 0:  #must be
                sequence = con.sequence  #strandCorrector(data.stats["extendF2SeedStrand"], con.sequence)
                data.stats["extendSeq2"] = sequence
                data.stats["contribSeqs"] += con.contribSeqs
                data.stats["contribBases"] += con.contribBases
                data.stats["fillBases"] += con.fillBases
                flank2Success = True
        else:
            logging.info("no mapping... picking f1 seq")
            sequence = FastaFile(data.flank2Seed).values()[0]
            data.stats["extendSeq2"] = sequence
            data.stats["contribSeqs"] = 1
            data.stats["contribBases"] = len(sequence)
            data.stats["fillBases"] = len(sequence)
            flank2Success = True

    if flank1Success and flank2Success:
        logging.debug("mid unite")
        seq = singleOverlapAssembly(data, args)
        if seq is not None:
            data.stats["fillSeq"] = seq

    return
Example #10
import sys
from pbsuite.utils.FileHandlers import FastqFile, M5File
from pbsuite.jelly.Support import AlignmentConnector, SUPPORTFLAGS

"""
Need to do work here
"""
if __name__ == '__main__':
    connector = AlignmentConnector()
    aligns = connector.parseAlignments(M5File(sys.argv[1]))

    reads = FastqFile(sys.argv[2])

    bestScore = None
    best = None
    fout = open("reads.fastq",'w')
    spanCount = 0
    for readGroup in aligns:
        if readGroup[0].qname.startswith("ref"):
            continue
        if len(readGroup) == 2:
            r1, r2 = readGroup
            a = connector.extendsTarget(r1)
            b = connector.extendsTarget(r2)
            if a != SUPPORTFLAGS.none and b != SUPPORTFLAGS.none:
                spanCount += 1
                print r1.qname, "spans"
                
                rStart = min(r1.qend, r2.qend)
                rEnd = max(r1.qstart, r2.qstart)
                t = reads[r1.qname].subSeq(rStart, rEnd)
Example #11
    def consensusCalling(self, spot, bam, reference, args):
        """
        Make a consensus of all the reads in the region and identify all of the SVs in the region
        """
        #
        MAXNUMREADS = 100  #I don't think we'll need more than this many reads
        MAXATTEMPTS = 5  #MAXNUMREADS/2 #I don't feel like trying 100 times
        SPANBUFFER = 100  #number of bases I want a read to span

        chrom, start, end = spot.chrom, spot.start, spot.end
        buffer = args.buffer

        supportReads = []
        spanReads = []
        #Fetch reads and trim
        totCnt = 0
        for read in bam.fetch(chrom, max(0, start - buffer - SPANBUFFER),
                              end + buffer + SPANBUFFER):
            if read.qname not in spot.varReads:
                continue
            seq, qual = self.readTrim(read, start - buffer, end + buffer)
            if read.pos < start - SPANBUFFER and read.aend > end + SPANBUFFER:
                sz = spot.varReadsSize[spot.varReads.index(read.qname)]
                spanReads.append((abs(sz - spot.tags["szMedian"]), seq, qual))
            else:
                supportReads.append((seq, qual))
            totCnt += 1

        if len(spanReads) == 0:
            logging.debug("noone spans - consensus aborted. %s" % (str(spot)))
            spot.tags["noSpan"] = True
            return [spot]

        #spanReads.sort(reverse=True)
        spanReads.sort()
        if len(spanReads) > MAXNUMREADS:
            origSupportReads = [(x[1], x[2]) for x in spanReads[:MAXNUMREADS]]
        elif len(spanReads) + len(supportReads) > MAXNUMREADS:
            origSupportReads = [(x[1], x[2]) for x in spanReads
                                ] + supportReads[:MAXNUMREADS - len(spanReads)]
        else:
            origSupportReads = [(x[1], x[2]) for x in spanReads] + supportReads
        mySpots = []
        refReadId = 0
        haveVar = False
        #Attempt each spanRead until we get one that passes
        while refReadId < len(
                spanReads) and not haveVar and refReadId < MAXATTEMPTS:
            refread = spanReads[refReadId]
            supportReads = origSupportReads[:refReadId] + origSupportReads[
                refReadId + 1:]
            refReadId += 1

            #read that spans most of the region goes first
            #use the rest for cleaning

            #building consensus sequence
            foutreads = NamedTemporaryFile(suffix=".fastq")
            for id, i in enumerate(supportReads):
                foutreads.write("@%d\n%s\n+\n%s\n" % (id, i[0], i[1]))
            foutreads.flush()
            foutref = NamedTemporaryFile(suffix=".fasta")
            foutref.write(">%s:%d-%d\n%s" %
                          (spot.chrom, start, end, refread[1]))
            foutref.flush()

            alignOut = NamedTemporaryFile(suffix=".m5")
            logging.debug("making the contig....")
            #run it through phrap

            #then run it through consensus
            blasr(foutreads.name,
                  foutref.name,
                  format="-m 5",
                  nproc=1,
                  outname=alignOut.name)
            if args.consensus == "pbbanana":
                aligns = M5File(alignOut.name)
                con = ">con\n%s\n" % consensus(aligns).sequence
                conName = "pbbanana"
            elif args.consensus == "pbdagcon":
                logging.debug("pbdagcon is running")
                #using minErrReads - 1 because one of them is already being used as the seed!
                #I want to be sure I get something out... so just require somebody on there
                #r, con, e = exe("pbdagcon -c %d -t 0 %s" % (1, alignOut.name), timeout=1)
                #r, con, e = exe("pbdagcon -m 100 -c %d -t 0 %s" % (max(args.minErrReads - 1, 0), alignOut.name), timeout=1)
                r, con, e = exe("pbdagcon -m 100 -c %d -t 0 %s" %
                                (3, alignOut.name),
                                timeout=1)
                logging.debug("back from pbdagcon")
                logging.debug((r, e))
                #raw_input("press ent")
                if con is not None:
                    con = con[con.index("\n") + 1:]
                else:
                    con = ""
                conName = "pbdagcon"
            alignOut.close()
            foutref.close()
            foutreads.close()
            #we don't have a consensus - retry
            if len(con) == 0:
                logging.debug("Trying another seed read for consensus")
                continue
            logging.debug("%s %d bp seq" % (conName, len(con.split('\n')[1])))

            #try improving consensus
            conOut = NamedTemporaryFile(suffix=".fasta")
            conOut.write(con)
            #conOut.close()
            conOut.flush()

            refOut = NamedTemporaryFile(suffix=".fasta")
            #j = reference.fetch(chrom, max(0, start-buffer), end+buffer)
            #fout = open("f****e.ref.fasta",'w')
            #fout.write(j)
            #fout.close()
            refOut.write(">%s:%d-%d\n%s\n" % (chrom, start, end, \
                        reference.fetch(chrom, max(0, start-(buffer*2)), end+(buffer*2))))
            refOut.flush()

            #map consensus to refregion
            varSam = NamedTemporaryFile(suffix=".sam")
            blasr(conOut.name, refOut.name, format="-sam", outname=varSam.name,\
                consensus=False) #-- would this help?
            #or what if I fed it through leftalign?
            #os.system("cp %s ." % (refOut.name))
            #os.system("cp %s ." % (varSam.name))
            sam = pysam.Samfile(varSam.name)

            matches = 0.0
            bases = 0.0
            nReads = 0
            minVarDiff = 10000
            for read in sam:
                localSpots = []
                nReads += 1
                spot.tags["consensusCreated"] = True
                for svstart, svsize, svtype, altseq in expandCigar(
                        read, args.minIndelSize, CONFIRMCOLLAPSE, True):
                    newspot = copy.deepcopy(spot)

                    if spot.svtype == svtype and svtype == "INS":
                        #haveVar = True
                        newspot.start = svstart + start - (buffer * 2)
                        newspot.end = svstart + start - (buffer * 2)
                        newspot.tags["seq"] = altseq
                        newspot.size = svsize
                        gt, gq = genotype(newspot)
                        newspot.tags["GT"] = gt
                        newspot.tags["GQ"] = gq
                        if abs(spot.tags["szMedian"] -
                               newspot.size) < minVarDiff:
                            minVarDiff = abs(spot.tags["szMedian"] -
                                             newspot.size)
                        if args.reportContig:
                            newspot.tags["contigseq"] = read.seq
                            newspot.tags["contigqual"] = read.qual
                        localSpots.append(newspot)

                    elif spot.svtype == svtype and svtype == "DEL":
                        #haveVar = True
                        newspot.start = svstart + start - (buffer * 2)
                        newspot.end = svstart + svsize + start - (buffer * 2)
                        newspot.size = svsize
                        gt, gq = genotype(newspot)
                        newspot.tags["GT"] = gt
                        newspot.tags["GQ"] = gq
                        newspot.tags["seq"] = reference.fetch(
                            chrom, newspot.start, newspot.end)
                        if abs(spot.tags["szMedian"] -
                               newspot.size) < minVarDiff:
                            minVarDiff = abs(spot.tags["szMedian"] -
                                             newspot.size)
                        if args.reportContig:
                            newspot.tags["contigseq"] = read.seq
                            newspot.tags["contigqual"] = read.qual
                        localSpots.append(newspot)
                if len(localSpots) > 0:
                    mySpots.append((minVarDiff, localSpots))

            #identity = matches/bases
            #If no var, nothing is returned.
            #for newspot in mySpots:
            #newspot.tags["alnIdentityEstimate"] = identity
            #Keep reporting the actual contigs out until we
            #find a reason to need it (and also we can get quals...)
            #vbam.reset()
            #for id, read in enumerate(vbam):
            #newspot.tags["contigSeq%d" % (id)] = read.seq
            #newspot.tags["contigQual%d" % (id)] = read.qual

            #vbam.close()
            #varBam.close()
            refOut.close()

            #logging.debug("%d consensus reads created %d spots" % (nReads, len(localSpots)))

        if len(mySpots) == 0:
            return []

        mySpots.sort()
        return mySpots[0][1]