def dbLengthFilter(len_min,inFileName, verbose=0): file_db=open(inFileName) file_dbInf=open(inFileName+".Inf"+str(len_min),'w') file_dbSup=open(inFileName+".Sup"+str(len_min),'w') seq=Bioseq() numseq=0 nbsave=0 while 1: seq.read(file_db) if seq.sequence==None: break l=seq.getLength() numseq=numseq+1 if l>=len_min: seq.write(file_dbSup) if verbose > 0: print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!' nbsave=nbsave+1 else: seq.write(file_dbInf) if verbose > 0: print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!' nbsave=nbsave+1 file_db.close() file_dbInf.close() file_dbSup.close() if verbose > 0: print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
def dbCleanByPattern(pattern,inFileName,db2_filename=""): if pattern=="": return srch=re.compile(pattern) file_db=open(inFileName) if db2_filename=="": db2_filename=inFileName+'.cleaned' file_db2=open(db2_filename,'w') seq=Bioseq() numseq=0 nbsave=0 while 1: seq.read(file_db) if seq.sequence==None: break numseq=numseq+1 if not srch.search(seq.header): seq.write(file_db2) print 'sequence #',numseq,'[',seq.header[0:40],'...] saved !!' nbsave=nbsave+1 file_db.close() file_db2.close() print nbsave,'saved sequences in',db2_filename
def dbExtractByPattern( pattern, inFileName, outFileName="" ): if pattern == "": return srch = re.compile( pattern ) file_db = open( inFileName ) if outFileName == "": outFileName = inFileName + '.extracted' file_db2 = open( outFileName, 'w' ) seq = Bioseq() numseq = 0 nbsave = 0 while 1: seq.read( file_db ) if seq.sequence == None: break numseq = numseq + 1 m = srch.search( seq.header ) if m: seq.write( file_db2 ) print 'seq #',numseq,'matched on',m.group(),'[',seq.header[0:40],'...] saved !!' nbsave = nbsave + 1 file_db.close() file_db2.close() print nbsave,'saved sequences in',outFileName
def dbCleanByFilePattern( file, inFileName, db2_filename="", verbose=0 ): if file == "": print "*** Error: no file of pattern" sys.exit(1) seq = Bioseq() numseq=0 nbsave=0 header=[] file_db = open( inFileName ) while True: seq.read( file_db ) if seq.sequence==None: break numseq=numseq+1 header.append( seq.header ) file_db.close() f=open(file) to_remove=[] for pattern in f: if verbose > 0: print "Pattern: ",pattern[:-1]; sys.stdout.flush() srch=re.compile(pattern[:-1]) for h in header: if srch.search(h): to_remove.append(h) f.close() if db2_filename == "": db2_filename = inFileName + '.cleaned' file_db2 = open( db2_filename, 'w' ) file_db=open( inFileName ) num = 0 while True: seq.read( file_db ) num += 1 if seq.sequence==None: break if seq.header not in to_remove: seq.write( file_db2 ) print 'sequence #',num,'/',numseq,'[',seq.header[0:40],'...] saved !!'; sys.stdout.flush() nbsave=nbsave+1 file_db.close() file_db2.close() print nbsave,'saved sequences in',db2_filename
def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ): if patternFileName=="": print "*** Error: no file of pattern" sys.exit(1) seq = Bioseq() numseq = 0 nbsave = 0 header = [] inFile = open( inFileName ) while True: seq.read( inFile ) if seq.sequence == None: break numseq = numseq + 1 header.append( seq.header ) inFile.close() to_keep = [] patternFile = open( patternFileName ) for pattern in patternFile: if verbose > 0: print "pattern: ",pattern[:-1]; sys.stdout.flush() srch = re.compile(pattern[:-1]) for h in header: if srch.search(h): to_keep.append(h) patternFile.close() if outFileName == "": outFileName = inFileName + ".extracted" file_db2=open( outFileName, "w" ) inFile = open( inFileName ) while 1: seq.read(inFile) if seq.sequence==None: break if seq.header in to_keep: seq.write(file_db2) print 'sequence #',numseq,'[',seq.header[0:40],'...] saved !!'; sys.stdout.flush() nbsave=nbsave+1 inFile.close() file_db2.close() print nbsave,'saved sequences in',outFileName
def dbExtractByNumber(num,inFileName,db2_filename=""): file_db=open(inFileName) if db2_filename=="": db2_filename=inFileName+'.extracted' file_db2=open(db2_filename,'w') seq=Bioseq() numseq=0 while 1: seq.read(file_db) if seq.sequence==None: break numseq=numseq+1 if numseq==num: seq.write(file_db2) print 'sequence #',numseq,'[',seq.header[0:40],'...] saved !!' break file_db.close() file_db2.close()
def dbCleanSeqCluster(filename,list_cluster,filename_out=""): print "reading ",filename, "..." file=open(filename) line="start" srchCl=re.compile('Cl\d+') seq=Bioseq() if filename_out=="": filename_out=filename+".cleaned_cluster" fout=open(filename_out,"w") count=0 while 1: seq.read(file) if seq.sequence==None: break m=srchCl.search(seq.header) gr=int(m.string[m.start(0)+2:m.end(0)]) if gr not in list_cluster: count=count+1 seq.write(fout) file.close() fout.close() print count,"sequences saved in ",filename_out
def dbExtractSeqGroups(inFileName,list_group,filename_out=""): print "reading ",inFileName, "..." file=open(inFileName) line="start" srchGrp=re.compile('Gr\d+') seq=Bioseq() if filename_out=="": filename_out=inFileName+".extracted_group" fout=open(filename_out,"w") count=0 while 1: seq.read(file) if seq.sequence==None: break m=srchGrp.search(seq.header) gr=int(m.string[m.start(0)+2:m.end(0)]) if gr in list_group: count=count+1 seq.write(fout) file.close() fout.close() print count,"sequences saved in ",filename_out
def dbComplement(inFileName,comp_filename=""): """ deprecated """ file_db=open(inFileName) if comp_filename=="": comp_filename=inFileName+'.comp' file_comp=open(comp_filename,'w') seq=Bioseq() numseq=0 while 1: seq.read(file_db) if seq.sequence==None: break numseq=numseq+1 print 'sequence #',numseq,'=',seq.getLength(),'[',seq.header[0:40],'...]' seq.sequence=seq.complement() seq.header=seq.header+" (complement!)" seq.write(file_comp) file_db.close() file_comp.close()
def dbExtractByNumberList(numlist,inFileName,db2_filename=""): file_db=open(inFileName) if db2_filename=="": db2_filename=inFileName+'.extracted' file_db2=open(db2_filename,'w') seq=Bioseq() numseq=0 nbsave=0 while 1: seq.read(file_db) if seq.sequence==None: break numseq=numseq+1 if numseq in numlist: seq.write(file_db2) print 'sequence #',numseq,'[',seq.header[0:40],'...] saved !!' nbsave=nbsave+1 file_db.close() file_db2.close() print nbsave,'saved sequences in ',db2_filename
def dbITRsearch(inFileName,len_min,mismatch,skip_len=20000):
    """ deprecated

    Search each sequence for inverted terminal repeats (ITR) via the
    'reputer' module and write the sequences having one to
    '<inFileName>.stree_itr'.
    Sequences longer than skip_len are skipped.
    """
    # project-local module; only needed by this deprecated function
    import reputer
    n=0  # number of sequences with an ITR found
    s=0  # number of sequences skipped because too long
    file_db=open(inFileName)
    file_out=open(inFileName+".stree_itr",'w')
    seq=Bioseq()
    numseq=0
    while 1:
        seq.read(file_db)
        if seq.sequence==None:
            break
        numseq=numseq+1
        print 'sequence #',numseq,'=',\
              seq.getLength(),'[',\
              seq.header[0:40],'...]'
        if seq.getLength()<skip_len:
            # '-p' = palindromic (inverted) repeats of at least len_min
            # with at most 'mismatch' errors — see reputer docs
            rep=reputer.find(seq,"-p -l "+str(len_min)+\
                             " -e "+str(mismatch))
            for i in rep.rep_list:
                # keep only repeats spanning both extremities (within 5 bp
                # of each end), i.e. terminal repeats
                if i.pos1 < 5 \
                   and i.pos2+i.length2>seq.getLength()-5:
                    i.view()
                    n=n+1
                    seq.write(file_out)
                    # one ITR is enough: stop scanning this sequence
                    break
        else:
            s=s+1
            print ' too long, skipped'
    print n,"found ", s, "skipped"
    file_db.close()
    file_out.close()
def dbTraduit(inFileName,phase=0,complement='T',pep_filename=""): """ deprecated """ file_db=open(inFileName) if pep_filename=="": pep_filename=inFileName+'.pep' file_pep=open(pep_filename,'w') seq=Bioseq() seq_out=Bioseq() numseq=0 while 1: seq.read(file_db) if seq.sequence==None: break numseq=numseq+1 print 'sequence #',numseq,'=',seq.getLength(),\ '[',seq.header[0:40],'...]' if phase>=0 : if phase==1 or phase==0 : seq_out.sequence=seq.traduit(1) seq_out.header=seq.header+" (phase 1)" seq_out.write(file_pep) if phase==2 or phase==0 : seq_out.sequence=seq.traduit(2) seq_out.header=seq.header+" (phase 2)" seq_out.write(file_pep) if phase==3 or phase==0 : seq_out.sequence=seq.traduit(3) seq_out.header=seq.header+" (phase 3)" seq_out.write(file_pep) if complement=='T' or phase<0 : seq.sequence=seq.complement() if phase==-1 or phase==0 : seq_out.sequence=seq.traduit(1) seq_out.header=seq.header+" (phase -1)" seq_out.write(file_pep) if phase==-2 or phase==0 : seq_out.sequence=seq.traduit(2) seq_out.header=seq.header+" (phase -2)" seq_out.write(file_pep) if phase==-3 or phase==0 : seq_out.sequence=seq.traduit(3) seq_out.header=seq.header+" (phase -3)" seq_out.write(file_pep) file_db.close() file_pep.close()
def dbConsensus(filename,consensus_filename,max_set_size=20,max_len=20000,min_len=50,min_base_nb=1):
    """ deprecated

    Build consensus sequences: orient the databank (external 'orienter'
    command), split it into sets of at most max_set_size sequences, align
    each set with 'malign', derive a consensus with 'consensusFastaAli.py'
    and append it to consensus_filename.
    Sequences longer than max_len or shorter than min_len are not aligned.
    Returns the number of aligned sets.
    """
    os.system("orienter "+filename)
    tmp_consensus_filename=filename+".oriented.consensus.tmp"
    size_db=dbSize(filename+".oriented")
    file_in=open(filename+".oriented")
    file_out=open(consensus_filename,'w')
    seq=Bioseq()
    # a single-sequence databank cannot be aligned: copy it through and exit
    if size_db==1:
        seq.read(file_in)
        seq.header="not a consensus"
        seq.write(file_out)
        file_out.close()
        file_in.close()
        os.system("cp "+filename+".oriented"+ " "+filename+ ".malign.fa")
        os.system("cp "+filename+".oriented"+ " "+filename+ ".malign.fa.cons")
        sys.exit(1)
    seq_in_set=0
    nb_consensus=0
    count_set=0
    # halve the set size until it fits under max_set_size
    set_size=size_db
    while set_size>max_set_size:
        set_size=set_size/2
    tmp_file_out=open(tmp_consensus_filename,'w')
    last_seq=0
    while 1:
        #read subset of sequence
        seq.read(file_in)
        if seq.sequence!=None:
            if seq.getLength() < max_len and seq.getLength() > min_len:
                seq.write(tmp_file_out)
                seq_in_set=seq_in_set+1
            else:
                if seq.getLength() > max_len:
                    print seq.header+" too long!!"
                    # NOTE(review): str.find() returns 0 when the tag starts
                    # the header and -1 when absent, so this truthiness test
                    # looks inverted — confirm the intent
                    if not seq.header.find(" too long, not aligned"):
                        seq.header=seq.header+" too long, not aligned"
                    # too-long sequences are copied unaligned to the output
                    seq.write(file_out)
                if seq.getLength() < min_len:
                    # too-short sequences are simply dropped
                    print seq.header+" too short!!"
        else:
            # end of the databank reached
            last_seq=1
            if seq_in_set==0:
                return count_set
        # aligne subset
        if seq_in_set==set_size or last_seq:
            count_set=count_set+1
            print "aligning the set #",count_set," of ",seq_in_set," sequences"
            tmp_file_out.close()
            if seq_in_set>1:
                # align the set, then build the consensus of the alignment
                os.system("nice malign "+tmp_consensus_filename
                          +" 20 -8 16 4 > "
                          +tmp_consensus_filename+".malign"
                          +str(count_set)+".fa")
                os.system("nice consensusFastaAli.py -n "
                          +str(min_base_nb)+" "
                          +tmp_consensus_filename
                          +".malign"+str(count_set)+".fa ")
                os.system("cp "+tmp_consensus_filename+
                          ".malign"+str(count_set)+".fa "
                          +filename+
                          ".malign"+str(count_set)+".fa")
            else:
                # a single sequence is its own alignment and consensus
                os.system("cp "+tmp_consensus_filename+
                          " "+filename+
                          ".malign"+str(count_set)+".fa")
                os.system("cp "+tmp_consensus_filename+
                          " "+tmp_consensus_filename+
                          ".malign"+str(count_set)+".fa.cons")
            # append this set's consensus to the global consensus file
            os.system("cat "+tmp_consensus_filename+
                      ".malign"+str(count_set)+\
                      ".fa.cons >> "+consensus_filename)
            # start a fresh set
            seq_in_set=0
            tmp_file_out=open(tmp_consensus_filename,'w')
            # stop when the whole databank fits in one set or at EOF
            if set_size==size_db or last_seq:
                break
    tmp_file_out.close()
    file_out.close()
    file_in.close()
    os.system("rm "+tmp_consensus_filename+"* "+filename+".oriented" )
    return count_set
def filterClassifConsensus( inFileName, outFileName, filterSSRs, maxLengthToFilterSSRs, filterHostGenes, filterConfused, filterNoCat, nbAlignSeqNoCat, verbose=0 ): """ Filter each consensus according to the classification in its header. @param inFileName: name of the input fasta file @type inFileName: string @param outFileName: name of the output fasta file @type outFileName: string @param filterSSRs: filter the consensus classified as SSR @type filterSSRs: boolean @param maxLengthToFilterSSRs: length below which a SSR is filtered @type maxLengthToFilterSSRs: integer @param filterSSRs: filter the consensus classified as HostGene @type filterSSRs: boolean @param filterConfused: filter the consensus classified as confused @type filterConfused: boolean @param filterNoCat: filter the consensus classified as NoCat @type filterNoCat: boolean @param nbAlignSeqNoCat: minimum number of sequences in the MSA from which the NoCat consensus as been built @type nbAlignSeqNoCat: string @param verbose: verbosity level @type verbose: integer """ if outFileName == "": outFileName = "%s.filtered" % ( inFileName ) nbAlignSeqNoCat = int( nbAlignSeqNoCat ) if verbose > 0: print "input file: %s" % ( inFileName ) print "output file: %s" % ( outFileName ) if filterSSRs: if maxLengthToFilterSSRs == 0: print "filter SSRs" else: print "filter SSRs (<%ibp)" % ( maxLengthToFilterSSRs ) if filterHostGenes: print "filter host's genes" if filterNoCat: print "filter NoCat" if filterConfused: print "filter confused" sys.stdout.flush() inFile = open( inFileName, "r" ) outFile = open( outFileName, "w" ) bs = Bioseq() nbInSeq = 0 nbRmv = 0 while True: bs.read( inFile ) if bs.header == None: break nbInSeq += 1 if verbose > 1: print bs.header if filterSSRs == True and "SSR" in bs.header and ( maxLengthToFilterSSRs == 0 or bs.getLength() <= maxLengthToFilterSSRs ): nbRmv += 1 if verbose > 1: print "filtered !" 
elif filterHostGenes == True and "HostGene" in bs.header: nbRmv += 1 if verbose > 1: print "filtered !" elif filterConfused == True and "confusedness=yes" in bs.header: nbRmv += 1 if verbose > 1: print "filtered !" elif filterNoCat == True and "NoCat" in bs.header: algoMSA = "" for i in ["Map","MAP","Malign","Mafft","Prank","Clustalw","Muscle","Tcoffee"]: if i in bs.header: algoMSA = i regexp = ".*" + algoMSA + "_(\d*)\|.*" header = re.match(regexp, bs.header) nb = header.group(1) nbAlignSeq = int( nb ) if nbAlignSeq <= nbAlignSeqNoCat: nbRmv += 1 if verbose > 1: print "filtered !" else: bs.write( outFile ) else: bs.write( outFile ) inFile.close() outFile.close() if verbose > 0: print "nb of input seq: %i" % ( nbInSeq ) print "nb of filtered seq: %i" % ( nbRmv ) sys.stdout.flush()