def _findORFsOnBothStrands( seq, size ):
    """
    Collect the ORFs of 'seq' on both strands, sorted in decreasing order.

    @param seq: Bioseq-like object providing findORF(), complement(),
    getLength() and a 'sequence' attribute
    @param size: minimum ORF length (end - start) to keep

    @return: list of tuples (length, frame, start, end); frames are numbered
    from 1 on the sequence as given and are negative on the sequence returned
    by seq.complement()

    @warning: 'seq.sequence' is replaced by 'seq.complement()' as a side effect
    """
    lORFs = []

    # findORF() is used as a mapping: frame index -> indexable list of positions
    # NOTE(review): positions are presumably stop-codon coordinates and the
    # +4/+3 (resp. -3/-2) offsets convert them to ORF boundaries -- confirm
    # against Bioseq.findORF()
    dFrame2Positions = seq.findORF()
    for frame in dFrame2Positions.keys():
        lPositions = dFrame2Positions[ frame ]
        for j in range( 1, len(lPositions) ):
            start = lPositions[j-1] + 4
            end = lPositions[j] + 3
            if end - start >= size:
                lORFs.append( ( end - start, frame + 1, start, end ) )

    # same search on the complemented sequence; coordinates are mapped back
    # by subtracting from the sequence length, hence start > end here
    seq.sequence = seq.complement()
    dFrame2Positions = seq.findORF()
    seqLength = seq.getLength()
    for frame in dFrame2Positions.keys():
        lPositions = dFrame2Positions[ frame ]
        for j in range( 1, len(lPositions) ):
            start = seqLength - lPositions[j-1] - 3
            end = seqLength - lPositions[j] - 2
            if start - end >= size:
                lORFs.append( ( start - end, -( frame + 1 ), start, end ) )

    # tuples start with the length, so this puts the longest ORFs first
    lORFs.sort( reverse=True )
    return lORFs


def dbORF( inFileName, nb=0, size=0, outFileName="" ):
    """
    Find the ORFs of each sequence in 'inFileName' and save the best ones
    in a tab-delimited file.

    Each output line is: 'ORF|<frame>|<length>', sequence header, start, end.

    @param inFileName: name of the input sequence file
    @param nb: maximum number of ORFs to save per sequence (0: all of them;
    a negative value saves none)
    @param size: minimum ORF length to keep
    @param outFileName: name of the output file (default=inFileName+'.orf.map')

    @return: 0
    """
    inFile = open( inFileName )
    try:
        if outFileName == "":
            outFileName = inFileName + ".orf.map"
        outFile = open( outFileName, "w" )
        try:
            seq = Bioseq()
            numseq = 0
            while True:
                seq.read( inFile )
                if seq.sequence is None:
                    break
                seq.upCase()
                numseq += 1
                print( "sequence # %s = %s [ %s ...]" \
                       % ( numseq, seq.getLength(), seq.header[0:40] ) )

                lBestORFs = _findORFsOnBothStrands( seq, size )

                # compute the limit for THIS sequence only: overwriting 'nb'
                # would carry the ORF count of one sequence over to the next
                # ones and silently truncate their output
                nbToSave = nb
                if nb == 0 or nb > len(lBestORFs):
                    nbToSave = len(lBestORFs)

                # max() keeps a negative 'nb' meaning "save nothing"
                for bestORF in lBestORFs[ :max( nbToSave, 0 ) ]:
                    print( bestORF )
                    length, frame, start, end = bestORF
                    outFile.write( "%s\t%s\t%d\t%d\n" \
                                   % ( "ORF|" + str(frame) + "|" + str(length),
                                       seq.header, start, end ) )
        finally:
            outFile.close()
    finally:
        inFile.close()

    return 0
def dbTraduit(inFileName,phase=0,complement='T',pep_filename=""):
    """
    deprecated

    Translate each sequence of 'inFileName' with seq.traduit() and write the
    results into 'pep_filename'.

    @param inFileName: name of the input sequence file
    @param phase: 1, 2 or 3 to translate a single phase of the sequence as
    given; -1, -2 or -3 for a single phase of the sequence returned by
    seq.complement(); 0 for all phases
    @param complement: when 'T', the complemented sequence is also processed
    (it always is when 'phase' is negative)
    @param pep_filename: name of the output file (default=inFileName+'.pep')
    """
    file_db = open( inFileName )
    try:
        if pep_filename == "":
            pep_filename = inFileName + '.pep'
        file_pep = open( pep_filename, 'w' )
        try:
            seq = Bioseq()
            seq_out = Bioseq()
            numseq = 0
            while True:
                seq.read( file_db )
                if seq.sequence is None:
                    break
                numseq += 1
                print( "sequence # %s = %s [ %s ...]" \
                       % ( numseq, seq.getLength(), seq.header[0:40] ) )

                # phases on the sequence as read from the file
                if phase >= 0:
                    for frame in ( 1, 2, 3 ):
                        if phase == frame or phase == 0:
                            seq_out.sequence = seq.traduit( frame )
                            seq_out.header = seq.header + ( " (phase %d)" % frame )
                            seq_out.write( file_pep )

                # phases on the complemented sequence, labelled with a
                # negative phase number in the output header
                if complement == 'T' or phase < 0:
                    seq.sequence = seq.complement()
                    for frame in ( 1, 2, 3 ):
                        if phase == -frame or phase == 0:
                            seq_out.sequence = seq.traduit( frame )
                            seq_out.header = seq.header + ( " (phase -%d)" % frame )
                            seq_out.write( file_pep )
        finally:
            file_pep.close()
    finally:
        # close the handles even if reading or translating fails
        file_db.close()
def _appendFileContent( srcFile, dstFile ):
    """
    Append the raw content of 'srcFile' at the end of 'dstFile'
    (created if missing), without going through a shell.
    """
    import shutil   # function-scope import: only needed by this helper

    dstHandler = open( dstFile, "ab" )
    try:
        srcHandler = open( srcFile, "rb" )
        try:
            shutil.copyfileobj( srcHandler, dstHandler )
        finally:
            srcHandler.close()
    finally:
        dstHandler.close()


def sortSequencesByIncreasingLength( inFile, outFile, verbose=0 ):
    """
    Save sequences in 'inFile' into 'outFile' sorted by their length in increasing order.

    Sequences of equal length keep the order they have in 'inFile'.
    Each sequence is first saved in its own temporary file, in the current
    working directory, named '<length>bp_<rank>nb'; these files are removed
    once concatenated. 'outFile' is removed first if it already exists.

    @param inFile: name of the input sequence file
    @param outFile: name of the output file
    @param verbose: verbosity level (>0: progress message, >1: one line per sequence)

    @return: 0 (the program exits with status 1 if 'inFile' doesn't exist or
    if the concatenation fails)
    """
    if verbose > 0:
        print( "sort sequences by increasing length" )
        sys.stdout.flush()
    if not os.path.exists( inFile ):
        print( "ERROR: file '%s' doesn't exist" % ( inFile ) )
        sys.exit(1)

    # read each seq one by one
    # save them in distinct temporary files
    # with their length in the name
    # remember exactly which files were created: globbing "*bp_*nb" afterwards
    # would also pick up (and then delete) unrelated files of the directory
    lTmpFiles = []
    inFileHandler = open( inFile, "r" )
    try:
        bs = Bioseq()
        countSeq = 0
        while True:
            bs.read( inFileHandler )
            if bs.header is None:
                break
            countSeq += 1
            seqLength = bs.getLength()
            tmpFile = "%ibp_%inb" % ( seqLength, countSeq )
            bs.save( tmpFile )
            lTmpFiles.append( ( int(seqLength), tmpFile ) )
            if verbose > 1:
                print( "%s (%i bp) saved in '%s'" % ( bs.header, seqLength, tmpFile ) )
            bs.header = ""
            bs.sequence = ""
    finally:
        inFileHandler.close()

    # sort temporary files by sequence length
    # (the sort is stable, hence ties stay in input order)
    # concatenate them into the output file
    if os.path.exists( outFile ):
        os.remove( outFile )
    lTmpFiles.sort( key=lambda lengthAndFile: lengthAndFile[0] )
    for seqLength, tmpFile in lTmpFiles:
        try:
            _appendFileContent( tmpFile, outFile )
        except ( IOError, OSError ):
            print( "ERROR while concatenating '%s' with '%s'" % ( tmpFile, outFile ) )
            sys.exit(1)
        os.remove( tmpFile )

    return 0
def dbComplement(inFileName,comp_filename=""):
    """
    deprecated

    Replace each sequence of 'inFileName' by the result of seq.complement(),
    append ' (complement!)' to its header and write it into 'comp_filename'.

    @param inFileName: name of the input sequence file
    @param comp_filename: name of the output file (default=inFileName+'.comp')
    """
    file_db = open( inFileName )
    try:
        if comp_filename == "":
            comp_filename = inFileName + '.comp'
        file_comp = open( comp_filename, 'w' )
        try:
            seq = Bioseq()
            numseq = 0
            while True:
                seq.read( file_db )
                if seq.sequence is None:
                    break
                numseq += 1
                print( "sequence # %s = %s [ %s ...]" \
                       % ( numseq, seq.getLength(), seq.header[0:40] ) )
                seq.sequence = seq.complement()
                seq.header = seq.header + " (complement!)"
                seq.write( file_comp )
        finally:
            file_comp.close()
    finally:
        # close the handles even if reading or writing a sequence fails
        file_db.close()