def main(args): logging.info(" ========> parsing GFFFile") logging.info("ARGS %s"%args) gfffilename = args.gfffile fastafilename = args.fastafile gff_factory = GFFFactory(gfffilename) fastafile = FastaFile(fastafilename) seqs = {} seq = fastafile.nextSeq() while seq is not None: seqs[seq['id']] = seq['seq'] seq = fastafile.nextSeq() try: gff = gff_factory.nextGFF() except Exception, e: sys.stderr.write("ERROR: %s\n"%e)
def testCreateMummerDeltaFile(self):
    """ We need the sequences to create the alignments """

    refseqs = FastaFile.getSequenceDict(self.input_files[0])
    qryseqs = FastaFile.getSequenceDict(self.input_files[1])

    self.assertTrue(len(refseqs) == 87)
    self.assertTrue(len(qryseqs) == 34)

    mdf = MummerDeltaFile(self.deltafile, refseqs, qryseqs)
    self.assertTrue(mdf)

    mdf.parse()

    alns = mdf.alns

    self.assertTrue(len(alns) == 54)
    self.assertTrue('GG739631.1' in alns)

    idalns = alns['GG739631.1']

    self.assertTrue(len(idalns) == 14)
    self.assertTrue(idalns[13].qstart == 293765)
def main(args): logging.info(" ========> process_alnfile.py") logging.info("ARGS %s" % args) ff = FastaFile(args.fastafile) seqs = [] seq = ff.nextSeq() # We first want to flag which sequences are present and which are not # get the ids # sort alphbetically # print the ids joined together # We then want to get some stats on the alignment # Number of sequences # Length # Coverage for each sequence # Get consensus # PID to consensus for each sequence # Mismatches for each sequence # We then want some specifics # Positions where 2 sequences are the same and the other is not ids = [] while seq is not None: ids.append(seq['id']) seq = ff.nextSeq() pid = 101 if args.pid is not None: pid = int(args.pid) stats = ff.calcStats() if stats['av_ungapped_percentid'] < pid: print "\nFILESTATS\t%s\tNumber_of_seqs\t%d\tIDS\t%s" % ( args.fastafile, len(ids), ','.join(ids)) print stats['outstr'] print stats['avpercentid']
def main(args):
    blatobj = BlatFile(args.blatfile)
    fastaobj = FastaFile(args.fastafile)

    # Parse the fasta file
    seqs = []
    ids = {}

    seq = fastaobj.nextSeq()
    while seq is not None:
        seqs.append(seq)
        ids[seq['id']] = len(seq['seq'])
        seq = fastaobj.nextSeq()

    feat = blatobj.nextFeature()

    tmpfeat = []
    tmpqid = None

    foundids = {}
    foundhits = {}

    while feat:
        if tmpqid is not None and len(tmpfeat) > 0:
            if tmpqid != feat.qid:
                print
                tophit = getBestHit(tmpfeat)
                foundids[tophit.qid] = 1
                foundhits[tophit.qid] = tophit
                tmpfeat = []

        tmpfeat.append(feat)
        tmpqid = feat.qid
        feat = blatobj.nextFeature()

    # Flush the final group; without this the last query id in the blat file
    # is always reported as MISSINGID even when it has hits
    if len(tmpfeat) > 0:
        tophit = getBestHit(tmpfeat)
        foundids[tophit.qid] = 1
        foundhits[tophit.qid] = tophit

    for id in ids:
        if id not in foundids:
            print "MISSINGID %s LEN %d" % (id, ids[id])
        else:
            tophit = foundhits[id]
            print "FOUNDID\t%d\t%d\t%d\t%s" % (tophit.pid, tophit.qcov, tophit.hcov, tophit)
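# The loop above groups blat features by query id and keeps one top hit per
# group via getBestHit, which lives elsewhere in the repo. A minimal sketch of
# one plausible ranking, using only the attributes the FOUNDID line prints
# (pid, qcov); the ranking criteria here are an assumption, not the repo's
# actual getBestHit.
def best_hit_sketch(features):
    """Pick the feature with the highest percent id, breaking ties on query coverage."""
    return max(features, key=lambda f: (f.pid, f.qcov))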
def main(args): logging.info(" ========> filter_fastafile.py") logging.info("ARGS %s"%args) ff = FastaFile(args.fastafile) stub = args.stub flen = open(stub+".chrlen",'w') blen = open(stub+ '.cytoband.txt','w') seq = ff.nextSeq() while seq is not None: flen.write("%s\t%d\n"%(seq['id'],seq['len'])) blen.write("%s\t0\t%d\tband0\tband0\n"%(seq['id'],seq['len'])) seq = ff.nextSeq() flen.close() blen.close()
def getCommands(self):
    self.commands = []
    self.output_files = []

    self.checkDiskSpace()

    seqs = FastaFile.getSequenceDict(self.refgenome, False)

    if self.checkInputFiles() == False:
        raise Exception("Input files [%s] don't exist = can't continue" % (self.input_files))

    fileparts = FileUtils.getFileParts(self.input_files[0])
    self.basename = fileparts['basename']

    # Need to set dbtype somewhere

    outfile1 = self.working_dir + "/" + self.basename + ".raw.vcf"
    outfile2 = self.working_dir + "/" + self.basename + ".flt.vcf"

    regstr = ""
    if self.regionstr != "":
        regstr = " -r " + self.regionstr
        outfile1 = self.working_dir + "/" + self.basename + "." + self.regionstr + ".raw.vcf"
        outfile2 = self.working_dir + "/" + self.basename + "." + self.regionstr + ".flt.vcf"

    self.expected_output_files.append(outfile1)
    self.expected_output_files.append(outfile2)

    command1 = self.samtools + " mpileup -uf " + self.refgenome + " " + self.input_files[0] + " " + regstr + \
               " | " + self.bcftools + " view " + " -bvcg - > " + outfile1
    command2 = self.bcftools + " view " + outfile1 + " | " + self.vcfutils + " varFilter -D100 > " + outfile2

    print "Command %s" % command1
    print "Command %s" % command2

    self.commands.append(AnalysisCommand(command=command1, command_rank=len(self.commands) + 1))
    self.commands.append(AnalysisCommand(command=command2, command_rank=len(self.commands) + 1))

    return self.commands
def getCommands(self):
    self.commands = []
    self.output_files = []

    self.checkDiskSpace()

    print "Reading genome file"
    seqs = FastaFile.getSequenceDict(self.refgenome, False)

    if self.checkInputFiles() == False:
        raise Exception("Input files [%s] don't exist = can't continue" % (self.input_files))

    fileparts = FileUtils.getFileParts(self.input_files[0])
    self.basename = fileparts['basename']

    # Split each reference sequence into self.chunk sized regions and build a
    # SamtoolsMpileup analysis per region
    for seq in seqs:
        seqlen = seqs[seq]['len']   # renamed from 'len' to avoid shadowing the builtin

        i = 1
        while i < seqlen:
            end = i + self.chunk - 1
            if end > seqlen:
                end = seqlen

            regionstr = "%s:%d-%d" % (seq, i, end)

            tmpana = AnalysisFactory.createAnalysisFromModuleName("SamtoolsMpileup")
            tmpana.setInputFiles(self.input_files, self.input_types)
            tmpana.refgenome = self.refgenome
            tmpana.regionstr = regionstr
            tmpana.init()

            tmpcmds = tmpana.getCommands()
            for cmd in tmpcmds:
                self.commands.append(cmd)

            i = i + self.chunk

    return self.commands
def main(args): logging.info(" ========> filter_fastafile.py") logging.info("ARGS %s" % args) ff = FastaFile(args.fastafile) filterstr = args.str seqs = [] seq = ff.nextSeq() while seq is not None: seqs.append(seq) seq = ff.nextSeq() newseqs = FastaFilter.filterById(seqs, args.str) print FastaFile.toString(newseqs)
def main(args): logging.info(" ========> Converting mummer delta format for %s %s %s" % (args.deltafile, args.reffile, args.queryfile)) logging.info("ARGS %s" % args) ref = FastaFile(args.reffile) qry = FastaFile(args.queryfile) gff = GFFFactory(args.gfffile) g = gff.nextGFF() gffs = {} while g is not None: if g.type2 == "CDS": #print "QID %s %s"%(g.qid,g.type2) if g.qid not in gffs: gffs[g.qid] = [] gffs[g.qid].append(g) g = gff.nextGFF() refseqs = {} qryseqs = {} seq = ref.nextSeq() while seq is not None: refseqs[seq['id']] = seq seq = ref.nextSeq() seq = qry.nextSeq() while seq is not None: qryseqs[seq['id']] = seq seq = qry.nextSeq() fh = open(args.deltafile) alns = {} lnum = 0 line = fh.readline() id1 = None id2 = None len1 = None len2 = None while line != "": # Can't use for line in fh: because we read the alignment in chunks lnum = lnum + 1 line = line.rstrip('\n') ff = line.split(' ') if lnum == 1: """ The first line lists the two original input files separated by a space.""" if1 = ff[0] if2 = ff[1] print "Input files [%s][%s]\n" % (if1, if2) elif lnum == 2: """ The second line specifies the alignment data type, either NUCMER or "PROMER""" alntype = ff[0] if alntype != "NUCMER": raise Exception( "Only NUCMER alignments are currently parsed - we have [%s]" % alntyp) else: """ Every grouping of alignment regions have a header, just like the cluster's header in the .cluster file. This is a FASTA style header and lists the two sequences that produced the following alignments after a '>' and separated by a space. After the two sequences are the lengths of those sequences in the same order. An example header might look like: >tagA1 tagB1 500 2000000 """ if ff[0].startswith(">"): id1 = ff[0].replace(">", '') id2 = ff[1] len1 = int(ff[2]) len2 = int(ff[3]) #print "IDs %s %s %d %d"%(id1,id2,len1,len2) else: #print "Parsing %s"%line """ The four digits are the start and end in the reference sequence respectively and the start and end in the query sequence respectively. These coordinates are always measured in DNA bases regardless of the alignment data type. The three digits after the starts and stops are: the number of errors (non-identities), similarity errors (non- positive match scores) non-alpha characters in the sequence (used to count stop-codons i promer data). An example header might look like: 5198 22885 5389 23089 20 20 0 """ rstart = int(ff[0]) rend = int(ff[1]) qstart = int(ff[2]) qend = int(ff[3]) qstrand = 1 hstrand = 1 if rend < rstart: qstrand = -1 else: qstrand = 1 if qend < qstart: hstrand = -1 else: hstrand = 1 #print "Strands %d %d"%(qstrand,hstrand) errors = int(ff[4]) simerrs = int(ff[5]) nonalpha = int(ff[6]) if id1 not in refseqs: raise Exception( "Can't find reference sequence [%s] in ref file [%s]" % (id1, args.reffile)) if id2 not in qryseqs: raise Exception( "Can't find query sequence [%s] in query file [%s]" % (id2, args.queryfile)) rseq = refseqs[id1] qseq = qryseqs[id2] #print "Found alignment header %s %d %d :: %s %d %d"%(id1,rstart,rend,id2,qstart,qend) """ Each of these headers is followed by a string of signed digits, one per line, with the final line before the next header equaling 0 (zero). Each digit represents the distance to the next insertion in the reference (positive int) or deletion in the reference (negative int), as measured in DNA bases or amino acids depending on the alignment data type. 
For example, with 'nucmer' the delta sequence (1, -3, 4, 0) would represent - an insertion at positions 1 and 7 in the reference sequence and - an insertion at position 3 in the query sequence. Or with letters: A = acgtagctgag$ B = cggtagtgag$ Delta = (1, -3, 4, 0) A = acg.tagctgag$ B = .cggtag.tgag$ """ count = fh.readline() count = count.rstrip('\n') count = int(count) tmprseq = rseq['seq'] tmpqseq = qseq['seq'] if rend > rstart: tmprseq = tmprseq[rstart - 1:rend - 1] else: tmprseq = tmprseq[rend:rstart] tmprseq = reverse_complement(tmprseq) if qend > qstart: tmpqseq = tmpqseq[qstart - 1:qend - 1] else: tmpqseq = tmpqseq[qend:qstart] tmpqseq = reverse_complement(tmpqseq) insertpos = 0 while count != 0: if count < 0: """ This is an insertion in the query sequence so we put a - in the ref""" insertpos = insertpos + abs(count) tmprseq = tmprseq[:insertpos - 1] + "-" + tmprseq[insertpos - 1:] elif count > 0: """ This is an insertion in the reference sequence """ insertpos = insertpos + abs(count) tmpqseq = tmpqseq[:insertpos - 1] + "-" + tmpqseq[insertpos - 1:] count = fh.readline() count = count.rstrip('\n') count = int(count) seq1 = {} seq2 = {} seq1['id'] = id1 seq2['id'] = id2 seq1['seq'] = tmprseq seq2['seq'] = tmpqseq if (seq1 != seq2 and id1 == "GG739696.1"): print prettyPrint([seq1, seq2]) if id1 not in alns: alns[id1] = [] tmpgff = Feature() tmpgff.qid = id1 tmpgff.qstart = rstart tmpgff.qend = rend #print "Strand %d %d"%(qstrand,hstrand) tmpgff.hitattr['qseq'] = seq1 tmpgff.hitattr['hseq'] = seq2 tmpgff.hitattr['hid'] = id1 tmpgff.hitattr['insertpos'] = insertpos #alns[id1].append([seq1,seq2]) alns[id1].append(tmpgff) line = fh.readline() #for id in alns: #print id #for gff in alns[id]: #print "%s - %s"%( tmpgff.qid,tmpgff.hitattr['hid']) gnum = 1 for id in gffs: for g in gffs[id]: outstr = [] name = g.hitattr['Name'] prod = g.hitattr['product'] #for h in g.hitattr: # print "%s %s"%(h,g.hitattr[h]) found = False foundgff = None status = "NEW" if id in alns: for tmpgff in alns[id]: if g.overlaps(tmpgff): if tmpgff.contains(g): #print "Contained Seq qstart/end %d %d"%(tmpgff.qstart,tmpgff.qend) found = True foundgff = tmpgff else: ostart = g.qstart oend = g.qend if tmpgff.qstart > g.qstart: ostart = tmpgff.qstart if tmpgff.qend < g.qend: oend = tmpgff.qend frac = int(100 * (oend - ostart + 1) / (g.qend - g.qstart + 1)) status = "PARTALIGN" outstr.append( "============1 Processing gene %d %s %s" % (gnum, name, prod)) outstr.append( "Contig coords from gff file %s %d-%d" % (g.qid, g.qstart, g.qend)) outstr.append( "Partial overlap of %d percent overlap coords are %d %d" % (frac, ostart, oend)) if not found: if status == "NEW": status = "NOALIGN" outstr.append("============2 Processing gene %d %s %s" % (gnum, name, prod)) outstr.append( "Contig coords from gff file %s %d-%d %s %s" % (g.qid, g.qstart, g.qend, name, prod)) outstr.append( "ERROR: No align for %s %s qstart/end %d %d %s" % (name, tmpgff.qid, tmpgff.qstart, tmpgff.qend, prod)) else: if qstrand == -1: status = "REVSTRAND" outstr.append("===========3 Processing gene %d %s %s" % (gnum, name, prod)) outstr.append( "Contig coords from gff file %s %d-%d %s %s" % (g.qid, g.qstart, g.qend, name, prod)) outstr.append( "ERROR: can't deal with reverse strand reference alignments" ) else: gstrand = g.strand gstart = g.qstart gend = g.qend astrand = foundgff.strand astart = foundgff.qstart aend = foundgff.qend apos1 = findAlnPos(foundgff, gstart) apos2 = findAlnPos(foundgff, gend) if gstrand == 1: qseq = 
foundgff.hitattr['qseq']['seq'][apos1:apos2] hseq = foundgff.hitattr['hseq']['seq'][apos1:apos2] else: qseq = foundgff.hitattr['qseq']['seq'][apos1 + 1:apos2 + 1] hseq = foundgff.hitattr['hseq']['seq'][apos1 + 1:apos2 + 1] qseq = reverse_complement(qseq) hseq = reverse_complement(hseq) if qseq != hseq: status = "MUTATION" outstr.append("===========4 Processing gene %d %s %s" % (gnum, name, prod)) #print "GFF %s %s %d %d %s %s"%(g.qid,g.hid,g.qstart,g.qend,name,prod) outstr.append("DNA alignment\n") tmpstr = prettyPrint([{ 'id': id1, 'seq': qseq }, { 'id': id2, 'seq': hseq }]) tmpff = tmpstr.split('\n') for f in tmpff: outstr.append(f) qpep = translate(qseq) hpep = translate(hseq) tmpstr = prettyPrint([{ 'id': id1, 'seq': qpep }, { 'id': id2, 'seq': hpep }]) outstr.append("PEP alignment\n") tmpff = tmpstr.split('\n') for f in tmpff: outstr.append(f) #print "GFF start-end strand %d-%d %d %s %s"%(gstart,gend,gstrand,name,prod) #print "ALN start-end strand %d-%d %d %s %s"%(astart,aend,astrand,name,prod) #print "POS %d %d",(apos1,apos2) #print "QSEQ %s"%qseq #print "HSEQ %s"%hseq #print "QPEP %s"%qpep #print "HPEP %s"%hpep else: status = "IDENTICAL" outstr.append( "============5 Processing gene %d %s %s" % (gnum, name, prod)) outstr.append("NO CHANGE for this alignment %s %s %s" % (tmpgff.qid, name, prod)) for i in outstr: print "%-15s %s" % (status, i) print "\n" gnum = gnum + 1
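# A standalone sketch of the delta encoding described in the docstrings above,
# checked against the documented example (A = acgtagctgag, B = cggtagtgag,
# Delta = 1, -3, 4, 0). It illustrates the format only; apply_delta is a
# hypothetical helper, not the gap-insertion loop used by the parser itself.
def apply_delta(refseq, qryseq, deltas):
    """Return gapped (ref, qry) strings from a MUMmer delta list.

    A positive delta is a gap in the query, a negative delta a gap in the
    reference; each value is the distance from the previous indel."""
    ref = list(refseq)
    qry = list(qryseq)
    pos = 0
    for d in deltas:
        if d == 0:
            break
        pos = pos + abs(d)
        if d > 0:
            qry.insert(pos - 1, '-')
        else:
            ref.insert(pos - 1, '-')
    return ''.join(ref), ''.join(qry)

# apply_delta("acgtagctgag", "cggtagtgag", [1, -3, 4, 0])
# -> ("acg-tagctgag", "-cggtag-tgag")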
def main(args):
    ff = FastaFile(args.fastafile)

    seqs = []
    lens = []
    bins = {}

    seq = ff.nextSeq()

    binsize = int(args.binsize)
    minlen = int(args.minlen)

    print minlen

    # 1. Number of sequences
    # 2. Array of lengths
    # 3. Median
    # 4. Bins
    # 5. Distribution
    # 6. Translate

    totlen = 0

    while seq is not None:
        if seq['len'] >= minlen:
            #print "LEN\t%d\t%d"%(minlen,seq['len'])
            lens.append(seq['len'])
            totlen = totlen + seq['len']

            bin = int(seq['len'] / binsize)
            if bin not in bins:
                bins[bin] = 0
            bins[bin] = bins[bin] + 1

            #pep = SeqUtils.translate(seq['seq'])
            #pep = re.sub(r'(.{80})',r'\1\n',pep)
            #print ">%s\n%s"%(seq['id'],pep)

            seqs.append(seq)
        else:
            print "MIN\t%d\t%d" % (minlen, seq['len'])

        seq = ff.nextSeq()

    sortedseqs = sorted(seqs, key=lambda k: k['len'])

    median = None
    n50 = None
    tmplen = 0
    seqnum = len(seqs)

    i = 0
    for seq in sortedseqs:
        i = i + 1
        if n50 is None and tmplen > totlen / 2:
            n50 = seq['len']
        if median is None and i > seqnum / 2:
            median = seq['len']
        #print seq['len'],seq['id']
        tmplen = tmplen + seq['len']

    i = 0
    cumul = {}
    tmpcount = 0
    percent = 0

    for key in sorted(bins):
        count = bins[key]
        tmpcount = tmpcount + count
        percent = int(100 * tmpcount / seqnum)
        cumul[percent] = key * binsize
        print tmpcount, seqnum, percent, key * binsize

    mean = int(totlen / seqnum)

    print("Num\t%d\tN50\t%d\tMedian\t%d\tMean\t%d" % (seqnum, n50, median, mean))

    for key in sorted(bins):
        value = bins[key]
        # print("%d\t%d"%(binsize*key,value))

    for key in sorted(cumul):
        value = cumul[key]
        print("%d\t%d" % (key, value))
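# For comparison with the in-loop calculation above, a minimal standalone N50
# sketch under the usual definition: the length of the contig at which the
# cumulative length of the descending-sorted contigs first reaches half the
# total. n50_sketch is a hypothetical helper, not used elsewhere in the repo.
def n50_sketch(lengths):
    total = sum(lengths)
    running = 0
    for l in sorted(lengths, reverse=True):
        running = running + l
        if running * 2 >= total:
            return l
    return 0

# n50_sketch([2, 2, 2, 3, 3, 4, 8, 8]) -> 8
# (total 32; cumulative 8 + 8 = 16, which reaches half the total)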
import re
import os
import sys
import unittest
import importlib

from datamodel.factory.FastaFile import FastaFile

ff = FastaFile(sys.argv[1])
id = sys.argv[2]

seq = ff.nextSeq()
while seq is not None:
    if seq['id'] == id:
        str = FastaFile.toString([seq])
        print str
        exit()
    seq = ff.nextSeq()
def read_sequences(seqfile):
    """ Load all sequences from a fasta file via FastaFile.getSequenceDict. """
    seqs = FastaFile.getSequenceDict(seqfile)
    return seqs
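# Usage note (an assumption inferred from other callers in this repo, e.g. the
# mpileup wrapper's seqs[seq]['len'] and the MummerDeltaFile test): the dict
# returned by FastaFile.getSequenceDict appears to be keyed by sequence id,
# with values carrying at least a 'len' field. A usage sketch under that
# assumption, with "ref.fa" as a placeholder path:
#
#   seqs = read_sequences("ref.fa")
#   for seqid in seqs:
#       print "%s\t%d" % (seqid, seqs[seqid]['len'])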
def main(args): logging.info(" ========> trim_alnfile.py") logging.info("ARGS %s"%args) ff = FastaFile(args.fastafile) seqs = [] seq = ff.nextSeq() while seq is not None: if not seq['id'].startswith('7'): seqs.append(seq) seq = ff.nextSeq() seqlen = len(seqs[0]['seq']) newseqs = [] prof = ff.calcProfile(seqs) print prof if args.trim : j = 0 while j < len(seqs): print ">%s\n%s"%(seqs[j]['id'], seqs[j]['seq']) j = j + 1 exit() j = 0 while j < len(seqs): newseqs.append("") j = j + 1 i = 0 while i < seqlen: j = 0 count = 0 while j < len(seqs): if seqs[j]['seq'][i] == '-': count = count + 1 j = j+1 if count < len(seqs): j = 0; while j < len(seqs): newseqs[j] = newseqs[j] + seqs[j]['seq'][i] j = j + 1 i = i + 1 j = 0 while j < len(seqs): print ">%s\n%s"%(seqs[j]['id'], newseqs[j]) j = j + 1