Beispiel #1
0
def run(argv):
    """
    Extract unmapped read tails, map them to the reference and merge the
    tail alignments with the original alignments.

    argv is handed to parseArgs; the parsed options supply the alignment
    file (m4), the reads file, the reference, temp directory, minimum tail
    length, process count and output destination.

    Exits with status 1 when the reads file is neither fasta nor fastq and
    with status 0 when no tails were extracted.
    """
    print(argv)
    args = parseArgs(argv)

    def reserveTempPath(suffix):
        # Create a named temp file in args.temp, close it right away and
        # keep it on disk (delete=False) so other steps can use the path.
        handle = tempfile.NamedTemporaryFile(suffix=suffix,
                                             delete=False,
                                             dir=args.temp)
        handle.close()
        return handle.name

    # Alignment parser is picked from the file name ending
    alignParser = M5File if args.m4.endswith("m5") else M4File
    aligns = alignParser(args.m4)

    # Reads end up as a name -> sequence mapping either way
    if args.reads.endswith("fasta"):
        reads = FastaFile(args.reads)
    elif args.reads.endswith("fastq"):
        fastq = FastqFile(args.reads)
        reads = dict((name, fastq[name].seq) for name in fastq)
        del fastq
    else:
        logging.error("Expected Fasta or Fastq for READS (%s)" % args.reads)
        exit(1)

    logging.info("Extracting tails")
    tailfastq = reserveTempPath(".fasta")
    logging.debug("Tail read tmp file %s " % (tailfastq))
    numReads, numTails, numDouble = extractTails(aligns,
                                                 reads,
                                                 outFq=tailfastq,
                                                 minLength=args.minTail)

    logging.info("Parsed %d reads" % (numReads))
    logging.info("Found %d tails" % (numTails))
    logging.info("%d reads had double tails" % (numDouble))
    if numTails == 0:
        logging.info("No tails -- Exiting")
        exit(0)

    logging.info("Mapping Tails")
    tailmap = reserveTempPath(".m4")
    logging.debug("Read map tmp file %s " % (tailmap))
    mapTails(tailfastq,
             args.ref,
             nproc=args.nproc,
             out=tailmap,
             useSa=args.noSa)

    logging.info("Consolidating alignments")
    logging.debug("Final file %s " % (args.output))
    numMapped = uniteTails(aligns, tailmap, args.output, args.inplace)
    logging.info("%d tails mapped" % (numMapped))
Beispiel #2
0
class NullDevice():
    """File-like sink whose write() discards whatever it is given."""

    def write(self, s):
        """Accept `s` and drop it; nothing is stored and None is returned."""
        return None

if __name__ == '__main__':
    # Align reads against a target (a single named read, or a target file)
    # and, for the single-read case, write a consensus fasta.
    args = parseArgs()

    alignFile = args.outname + ".m5"
    consensusFile = args.outname + ".fasta"

    if args.target is not None:  # Name of one read inside args.reads
        # Extract the read I'm looking for into its own fasta file
        fasta = FastaFile(args.reads)
        # The with-block closes (and so flushes) temp.fasta before blasr
        # opens it; previously the handle was never flushed or closed.
        with open("temp.fasta", 'w') as tempOut:
            tempOut.write(">%s\n%s\n" % (args.target, fasta[args.target]))
        blasr(args.reads, tempOut.name, nproc=args.nproc, outName=alignFile)

        aligns = M5File(alignFile)
        results = consensus(aligns)
        with open(consensusFile, 'w') as fout:
            # Header carries contribBases and fillBases, the sequence goes
            # on the following line. The old format string had only two
            # placeholders for three values and raised TypeError.
            fout.write(">pbjpolish_%d_vote_%d_len\n%s\n" % (
                results.contribBases, results.fillBases, results.sequence))
    elif args.Target is not None:  # File of targets
        blasr(args.reads, args.Target, nproc=args.nproc, outName=alignFile)
        
Beispiel #3
0
#!/usr/bin/python

import argparse, json
from pbsuite.jelly.Jelly import JellyProtocol
from pbsuite.utils.FileHandlers import FastaFile, FastqFile
from pbsuite.utils.summarizeAssembly import getStats

USAGE = """Get statistics on fasta/fastq sequences recorded in a Protocol.xml"""

if __name__ == '__main__':
    # Command line: a single positional Protocol.xml path
    parser = argparse.ArgumentParser(
        description=USAGE,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("xml", metavar="XML", type=str,
                        help="Protocol.xml with inputs listed")
    args = parser.parse_args()
    protocol = JellyProtocol(args.xml)

    # Gather the length of every sequence in every fasta/fastq input;
    # inputs with any other extension are ignored.
    seqLengths = []
    for inputName in protocol.inputs:
        if inputName.endswith(".fasta"):
            seqLengths.extend(
                len(seq) for seq in FastaFile(inputName).values())
        if inputName.endswith(".fastq"):
            seqLengths.extend(
                len(entry.seq) for entry in FastqFile(inputName).values())

    # Single pre-formatted string so the output is "label<space>json"
    report = json.dumps(getStats(seqLengths), indent=4)
    print("%s %s" % ("Read Stats", report))
Beispiel #4
0
#!/usr/bin/env python

import argparse, json
from pbsuite.jelly.Jelly import JellyProtocol
from pbsuite.utils.FileHandlers import FastaFile, FastqFile
from pbsuite.utils.summarizeAssembly import getStats

USAGE = """Get statistics on fasta/fastq sequences recorded in a Protocol.xml"""

if __name__ == '__main__':
    # The only argument is the Protocol.xml that lists the input files
    parser = argparse.ArgumentParser(
        description=USAGE,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("xml", metavar="XML", type=str,
                        help="Protocol.xml with inputs listed")
    args = parser.parse_args()
    protocol = JellyProtocol(args.xml)

    # One length per sequence, pooled over all fasta and fastq inputs
    seqLengths = []
    for path in protocol.inputs:
        if path.endswith(".fasta"):
            records = FastaFile(path)
            seqLengths += [len(sequence) for sequence in records.values()]
        if path.endswith(".fastq"):
            records = FastqFile(path)
            seqLengths += [len(read.seq) for read in records.values()]

    # Label and JSON summary are emitted on one line start, space separated
    summary = json.dumps(getStats(seqLengths), indent=4)
    print(" ".join(["Read Stats", summary]))
Beispiel #5
0
    def run(self):
        """
        Rename every scaffold in the reference and optionally tabulate gaps.

        Each entry of self.scaffInput is rewritten under the name
        "<original name with spaces replaced by _>|refNNNNNNN". When
        self.qualInput is set, its entries are renamed the same way. The
        rewritten files take the place of the inputs, and the inputs are
        kept with an ".original" suffix.

        When self.opts.gapOutput is set, one line per gap is written there
        (runs of N/n between self.opts.minGap and self.opts.maxGap long that
        have a non-N base on both sides); a scaffold without gaps gets a
        single 'na' line. When it is not set, no gap table is produced.

        When self.opts.index is set, sawriter is run on the new reference;
        the process exits with status 1 if sawriter returns non-zero.
        """
        #Fasta Ref Output
        scaffTempName = self.scaffInput + ".tempFasta"
        scaffOutput = open(scaffTempName, 'w')

        #Qual Ref Output
        hasQual = self.qualInput is not None
        if hasQual:
            qualTempName = self.qualInput + ".tempQual"
            qualOutput = open(qualTempName, 'w')

        #Gaps Output -- None means no gap table was requested
        if self.opts.gapOutput is not None:
            gapTableOut = open(self.opts.gapOutput, 'w')
        else:
            gapTableOut = None

        logging.info(
            "Creating reference sequence index names and identifying gaps")

        refTemplate = "ref%07d"
        # Same pattern for every scaffold, so build it once
        gapPattern = re.compile("[^Nn]([Nn]{%d,%s})[^Nn]" % \
                    (self.opts.minGap, self.opts.maxGap))
        gapLine = "%s\t%i\t%i\t%s_%i_%i\t%d\n"

        #Read References
        reference = FastaFile(self.scaffInput)
        if hasQual:
            qualReference = QualFile(self.qualInput)

        for refId, key in enumerate(reference, 1):
            scaffIndex = refTemplate % refId
            scaffName = key.replace(' ', '_') + "|" + scaffIndex
            sequence = reference[key]

            scaffOutput.write(">" + scaffName + "\n" + wrap(sequence) + "\n")
            if hasQual:
                qualOutput.write(">" + scaffName + "\n" +
                                 qwrap(qualReference[key]) + "\n")

            # The match includes one flanking base on each side; trim them
            # so the coordinates cover only the N run.
            gapCoords = [[gap.start() + 1, gap.end() - 1]
                         for gap in gapPattern.finditer(sequence)]

            if not gapCoords:  #no Gaps
                # Only write when a gap table was requested; this write used
                # to be unconditional and failed when gapOutput was unset.
                if gapTableOut is not None:
                    gapTableOut.write("\t".join(
                        [scaffName, 'na', 'na', scaffIndex + "_0_0", '3'])
                        + '\n')
                logging.debug("Scaffold %s is empty" % scaffName)
                continue

            if gapTableOut is None:
                # Nothing below has any effect without a gap table
                continue

            #Consolidate gaps that are too close -- indicating LQ regions.
            i = 0
            while i < len(gapCoords) - 1:
                if gapCoords[i + 1][0] - gapCoords[i][1] < 25:
                    gapCoords[i + 1][0] = gapCoords[i][0]
                    del (gapCoords[i])
                else:
                    i += 1

            # First gap carries Gap.BEGIN, last gap carries Gap.END (a lone
            # gap carries both), gaps in between carry 0.
            lastIdx = len(gapCoords) - 1
            for idx, (gapStart, gapEnd) in enumerate(gapCoords):
                flag = 0
                if idx == 0:
                    flag += Gap.BEGIN
                if idx == lastIdx:
                    flag += Gap.END
                gapTableOut.write(gapLine % (scaffName, gapStart, gapEnd,
                                             scaffIndex, idx, idx + 1, flag))

        #Close shop
        scaffOutput.close()
        os.rename(self.scaffInput, self.scaffInput + ".original")
        os.rename(scaffTempName, self.scaffInput)

        if hasQual:
            qualOutput.close()
            os.rename(self.qualInput, self.qualInput + ".original")
            os.rename(qualTempName, self.qualInput)

        if gapTableOut is not None:
            gapTableOut.close()

        if self.opts.index:
            logging.info("Creating .sa indexes for references")
            r, o, e = exe("sawriter %s.sa %s" %
                          (self.scaffInput, self.scaffInput))
            if r != 0:
                logging.error("sawriter returned %d" % r)
                logging.error("Ensure it's in your path")
                exit(1)
            logging.debug(str(o) + ' ' + str(e))

        logging.info("Finished!")
from pbsuite.utils.FileHandlers import FastaFile
import json, re, sys
"""
## Arguments

1 - reference.fasta -- input reference created by Setup.py
2 - liftOverTable.json - created at end of Jelly run
3 - jelly.out.fasta -- new reference created by Jelly

"""
# Map each trailing "|"-separated field of a reference entry name (the
# index key) back to the part of the name that precedes it.
fasta = FastaFile(sys.argv[1])
nameLookup = {}
for entry in fasta:
    origName, _, refKey = entry.rpartition('|')
    nameLookup[refKey] = origName

# Close the liftOver table as soon as it has been parsed
with open(sys.argv[2], 'r') as liftOverHandle:
    liftOver = json.load(liftOverHandle)
jellyFasta = FastaFile(sys.argv[3])
# Raw string so \d reaches the regex engine instead of being treated as a
# (invalid) string escape
regex = re.compile(r"ref\d{7}")
for key in liftOver:
    # Collect each referenced index once, in order of first appearance, so
    # the joined name is reproducible from run to run (iterating a set
    # gave an arbitrary order).
    orderedRefs = []
    seenRefs = set()
    for pieceId, strand, size in liftOver[key]:
        for refId in regex.findall(pieceId):
            if refId not in seenRefs:
                seenRefs.add(refId)
                orderedRefs.append(refId)
    newName = [nameLookup[refId] for refId in orderedRefs]
    sys.stdout.write(">%s\n%s\n" % ("_".join(newName), jellyFasta[key]))
Beispiel #7
0
                used.append(node)
                next = getStrongestEdge(node, direction, used)
                if next is None:
                    break
                path.append(next)
                if next.qstrand == '-':
                    if direction == SUPPORTFLAGS.left:
                        direction = SUPPORTFLAGS.right
                    elif direction == SUPPORTFLAGS.right:
                        direction = SUPPORTFLAGS.left
                node = next.qname
            paths.append(path)
    for p in paths:
        if len(p) == 0:
            continue
        for i in p:
            if not i.tname.startswith("ref"):
                sys.stdout.write(i.tname.split('/')[1], i.tstrand, "\t")
            else:
                sys.stdout.write(i.tname, i.tstrand, "\t")
        sys.stdout.write('\n')


if __name__ == '__main__':
    # Single command-line argument: the reads fasta
    reads = sys.argv[1]
    fasta = FastaFile(reads)
    # The alignment step is disabled here (was: blasr(reads, reads, 4)),
    # so "out.m5" presumably has to exist already -- verify before running.
    readNames = fasta.keys()
    ovl = m5ToOvlGraph(readNames, "out.m5")
    ovlSimplify(ovl)
    # Dump the graph for inspection
    nx.write_gml(ovl, "ovl.gml")
Beispiel #8
0
import sys, random
from pbsuite.utils.FileHandlers import FastaFile, revComp, wrap

def getRandomSeq(length):
    """
    Return a string of `length` bases, each picked with random.choice
    from A, T, C and G. A length of zero or less gives an empty string.
    """
    # range instead of xrange: xrange does not exist on Python 3, and
    # range behaves the same here on Python 2.
    return "".join(random.choice("ATCG") for _ in range(length))
    
if __name__ == '__main__':
    fasta = FastaFile(sys.argv[1])
    key = fasta.keys()[0]
    ref = list(fasta[key])
    
    #800bp insertion in the sample (deletion in the reference) 
    ref[5000:5800] = ""
    #5000 Insertion

    #Inversion in the sample (inversion in the reference) tails
    ref[9000:12000] = list("".join(ref[10000:13000]).translate(revComp)[::-1])
    #9000-12000 - INversion
    
    #1kb deletion in sample (insert into the reference) tails
    seq = getRandomSeq(1000)
    ref[20000:20000] = list(seq)
    #20000-21000 -- Deletion 
    
    #100bp insertion in sample (deletion in the reference) spots
    ref[30000:30100] = ""
    #30000 - Insertion

    #200bp deletion in sample (insert into the reference) spots
    seq = getRandomSeq(200)
    ref[35000:35000] = list(seq)