def dbTraduit(inFileName,phase=0,complement='T',pep_filename=""):
    """
    deprecated

    Read every sequence of 'inFileName' and write the result of
    Bioseq.traduit() for the requested phase(s) into 'pep_filename'.

    inFileName   -- name of the input sequence file
    phase        -- 0: phases 1, 2 and 3 (and -1, -2, -3 when 'complement'
                    is 'T'); 1, 2 or 3: that single phase; -1, -2 or -3:
                    that single phase on the result of Bioseq.complement()
    complement   -- 'T' to also process the complemented sequence when
                    'phase' is 0
    pep_filename -- name of the output file (default: inFileName + '.pep')

    Each record written has the input header followed by " (phase N)".
    One progress line per input sequence is printed on stdout.
    """
    if pep_filename=="":
        pep_filename=inFileName+'.pep'

    # the input is opened first so that a missing input file does not
    # leave an empty output file behind
    file_db=open(inFileName)
    try:
        file_pep=open(pep_filename,'w')
        try:
            seq=Bioseq()
            seq_out=Bioseq()
            numseq=0
            while 1:
                seq.read(file_db)
                # a sequence left to None marks the end of the input
                if seq.sequence is None:
                    break
                numseq += 1
                print("sequence # %s = %s [ %s ...]"
                      % (numseq, seq.getLength(), seq.header[0:40]))

                # forward phases, always written in the order 1, 2, 3
                for frame in (1, 2, 3):
                    if phase==frame or phase==0:
                        seq_out.sequence=seq.traduit(frame)
                        seq_out.header=seq.header+(" (phase %i)" % frame)
                        seq_out.write(file_pep)

                # same condition as before: the complement is computed as
                # soon as complement=='T', even if no negative phase is
                # written afterwards (phase > 0)
                if complement=='T' or phase<0:
                    seq.sequence=seq.complement()
                    for frame in (1, 2, 3):
                        if phase==-frame or phase==0:
                            seq_out.sequence=seq.traduit(frame)
                            seq_out.header=seq.header+(" (phase -%i)" % frame)
                            seq_out.write(file_pep)
        finally:
            # close the output even when reading or writing failed
            file_pep.close()
    finally:
        file_db.close()
def sortSequencesByIncreasingLength( inFile, outFile, verbose=0 ):
    """
    Save sequences in 'inFile' into 'outFile' sorted by their length in increasing order.

    inFile  -- name of the input sequence file
    outFile -- name of the output file; removed first if it already exists
    verbose -- > 0 prints a start message, > 1 also prints one line per sequence

    Each sequence is first saved in its own temporary file in the current
    working directory (named '<length>bp_<rank>nb'), then the temporary files
    are concatenated by increasing length. Sequences of equal length keep
    their input order. Only the temporary files created by this call are
    read and removed.

    Returns 0. Prints an error and exits with status 1 if 'inFile' doesn't
    exist or if a temporary file cannot be appended to 'outFile'.
    """
    import shutil

    if verbose > 0:
        print( "sort sequences by increasing length" )
        sys.stdout.flush()
    if not os.path.exists( inFile ):
        print( "ERROR: file '%s' doesn't exist" % ( inFile ) )
        sys.exit(1)

    # read each seq one by one
    # save them in distinct temporary files
    # with their length in the name
    lTmpFiles = []   # ( length, rank in the input, temporary file name )
    inFileHandler = open( inFile, "r" )
    try:
        bs = Bioseq()
        countSeq = 0
        while True:
            bs.read( inFileHandler )
            if bs.header is None:
                break
            countSeq += 1
            seqLength = bs.getLength()
            tmpFile = "%ibp_%inb" % ( seqLength, countSeq )
            # a leftover from an interrupted run must not end up in the output
            # (matters if Bioseq.save() appends -- not visible from here)
            if os.path.exists( tmpFile ):
                os.remove( tmpFile )
            bs.save( tmpFile )
            lTmpFiles.append( ( seqLength, countSeq, tmpFile ) )
            if verbose > 1:
                print( "%s (%i bp) saved in '%s'" % ( bs.header, seqLength, tmpFile ) )
            bs.header = ""
            bs.sequence = ""
    finally:
        inFileHandler.close()

    # sort the temporary files by length, then by input rank
    # concatenate them into the output file
    if os.path.exists( outFile ):
        os.remove( outFile )
    lTmpFiles.sort()
    for seqLength, rank, fileName in lTmpFiles:
        try:
            # append mode: the output file is only created once there is
            # something to put in it, as with the former 'cat >>'
            outFileHandler = open( outFile, "ab" )
            try:
                tmpFileHandler = open( fileName, "rb" )
                try:
                    shutil.copyfileobj( tmpFileHandler, outFileHandler )
                finally:
                    tmpFileHandler.close()
            finally:
                outFileHandler.close()
        except ( IOError, OSError ):
            print( "ERROR while concatenating '%s' with '%s'" % ( fileName, outFile ) )
            sys.exit(1)
        os.remove( fileName )

    return 0
def dbComplement(inFileName,comp_filename=""):
    """
    deprecated

    Read every sequence of 'inFileName', replace it by the result of
    Bioseq.complement() and write it into 'comp_filename'.

    inFileName    -- name of the input sequence file
    comp_filename -- name of the output file (default: inFileName + '.comp')

    Each record written has the input header followed by " (complement!)".
    One progress line per input sequence is printed on stdout.
    """
    if comp_filename=="":
        comp_filename=inFileName+'.comp'

    # the input is opened first so that a missing input file does not
    # leave an empty output file behind
    file_db=open(inFileName)
    try:
        file_comp=open(comp_filename,'w')
        try:
            seq=Bioseq()
            numseq=0
            while 1:
                seq.read(file_db)
                # a sequence left to None marks the end of the input
                if seq.sequence is None:
                    break
                numseq += 1
                print("sequence # %s = %s [ %s ...]"
                      % (numseq, seq.getLength(), seq.header[0:40]))
                seq.sequence=seq.complement()
                seq.header=seq.header+" (complement!)"
                seq.write(file_comp)
        finally:
            # close the output even when reading or writing failed
            file_comp.close()
    finally:
        file_db.close()
def dbConsensus(filename,consensus_filename,max_set_size=20,max_len=20000,min_len=50,min_base_nb=1):
    """
    deprecated

    Build consensus sequences from the sequences of 'filename' by running
    external commands on successive subsets, and gather the results in
    'consensus_filename'.

    filename           -- name of the input sequence file
    consensus_filename -- name of the output file (overwritten)
    max_set_size       -- the number of sequences per subset is the size of
                          the bank halved until it is <= max_set_size
    max_len            -- sequences longer than this are not aligned; they
                          are written to the output with " too long, not
                          aligned" appended to their header
    min_len            -- sequences shorter than this are skipped
    min_base_nb        -- value given to the '-n' option of consensusFastaAli.py

    Steps:
    1. 'orienter <filename>' is run; its result is read from
       '<filename>.oriented'.
    2. If that file holds a single sequence, it is written to the output
       with the header "not a consensus", copied to '<filename>.malign.fa'
       and '<filename>.malign.fa.cons', and the process exits with status 1.
    3. Otherwise sequences with min_len < length < max_len are grouped in
       subsets; each subset #N is processed with 'malign' and
       'consensusFastaAli.py' (a subset of one sequence is copied as is),
       its alignment is kept in '<filename>.malignN.fa' and its
       '.fa.cons' file is appended to the output.
    4. Temporary files and '<filename>.oriented' are removed.

    Returns the number of subsets processed.

    NOTE(review): a sequence whose length is exactly min_len or max_len is
    neither aligned, nor reported, nor written -- kept as is, to be confirmed.
    NOTE(review): file names are pasted unquoted into shell commands; they
    must not contain spaces or shell metacharacters.
    """
    os.system("orienter "+filename)
    oriented_filename=filename+".oriented"
    tmp_consensus_filename=filename+".oriented.consensus.tmp"
    size_db=dbSize(oriented_filename)
    seq=Bioseq()
    count_set=0

    file_in=open(oriented_filename)
    try:
        # 'w' empties a consensus file left by a previous run
        file_out=open(consensus_filename,'w')
        try:
            if size_db==1:
                seq.read(file_in)
                seq.header="not a consensus"
                seq.write(file_out)
        finally:
            file_out.close()

        if size_db==1:
            file_in.close()
            os.system("cp "+oriented_filename+" "+filename+".malign.fa")
            os.system("cp "+oriented_filename+" "+filename+".malign.fa.cons")
            # historical behaviour: the single-sequence case ends the process
            sys.exit(1)

        set_size=size_db
        while set_size>max_set_size:
            set_size=set_size//2

        # The shell appends to the same file with 'cat >>' below, so this
        # handle is in append mode and flushed after each write; otherwise
        # its buffered data would land at a stale offset and overwrite the
        # consensus already appended.
        file_out=open(consensus_filename,'a')
        try:
            seq_in_set=0
            tmp_file_out=open(tmp_consensus_filename,'w')
            try:
                while 1:
                    # read subset of sequence
                    seq.read(file_in)
                    last_seq=seq.sequence is None
                    if last_seq:
                        if seq_in_set==0:
                            # nothing left to align: leave through the
                            # common cleanup below
                            break
                    else:
                        seq_length=seq.getLength()
                        if seq_length<max_len and seq_length>min_len:
                            seq.write(tmp_file_out)
                            seq_in_set += 1
                        else:
                            if seq_length>max_len:
                                print(seq.header+" too long!!")
                                # str.find() returns -1 (true) when the text
                                # is absent, so test membership instead
                                if " too long, not aligned" not in seq.header:
                                    seq.header=seq.header+" too long, not aligned"
                                seq.write(file_out)
                                file_out.flush()
                            if seq_length<min_len:
                                print(seq.header+" too short!!")

                    # align subset
                    if seq_in_set==set_size or last_seq:
                        count_set += 1
                        print("aligning the set # %s  of  %s  sequences"
                              % (count_set, seq_in_set))
                        tmp_file_out.close()
                        set_prefix=tmp_consensus_filename+".malign"+str(count_set)
                        if seq_in_set>1:
                            os.system("nice malign "+tmp_consensus_filename
                                      +" 20 -8 16 4 > "+set_prefix+".fa")
                            os.system("nice consensusFastaAli.py -n "
                                      +str(min_base_nb)+" "+set_prefix+".fa ")
                            os.system("cp "+set_prefix+".fa "
                                      +filename+".malign"+str(count_set)+".fa")
                        else:
                            # a single sequence is its own alignment and
                            # its own consensus
                            os.system("cp "+tmp_consensus_filename+" "
                                      +filename+".malign"+str(count_set)+".fa")
                            os.system("cp "+tmp_consensus_filename+" "
                                      +set_prefix+".fa.cons")
                        os.system("cat "+set_prefix+".fa.cons >> "
                                  +consensus_filename)
                        seq_in_set=0
                        tmp_file_out=open(tmp_consensus_filename,'w')
                        if set_size==size_db or last_seq:
                            break
            finally:
                tmp_file_out.close()
        finally:
            file_out.close()
    finally:
        file_in.close()

    os.system("rm "+tmp_consensus_filename+"* "+oriented_filename)
    return count_set