def _parse(self):
    """Scan self.lines for the input file name and for motif blocks.

    Side effects on the instance:
      * self.fastafile is set to the token following '-i' on the first line
        that contains '-i', but only while self.fastafile is still falsy.
      * For every line whose first token is 'Motif', the first token of each
        following line is collected until a line whose first token starts
        with '*'.  A Motif is built from the collected tokens, its MAP
        attribute is read from the third token of the line after the '*'
        line, and the Motif is appended to self.motifs.
      * self.nmotifs is set to len(self.motifs) once all lines are consumed.

    Raises IndexError if a motif block is truncated or contains a blank line
    (no bounds checks are performed).
    """
    lines = self.lines
    pos = 0
    while pos < len(lines):
        fields = lines[pos].split()

        # Remember the fasta file given on the command line, first hit only.
        if '-i' in fields and not self.fastafile:
            self.fastafile = fields[fields.index('-i') + 1]

        if fields and fields[0] == 'Motif':
            # Collect site sequences up to the '*' terminator line.
            sites = []
            while True:
                pos += 1
                fields = lines[pos].split()
                if fields[0][0] == '*':
                    break
                sites.append(fields[0])
            motif = Motif(sites)

            # The line right after the terminator carries the MAP score.
            pos += 1
            score = float(lines[pos].split()[2])
            if score > 1000:
                # likely to be an AlignACE error
                score = 0
            motif.MAP = score
            self.motifs.append(motif)

        pos += 1

    self.nmotifs = len(self.motifs)
def generateMotifs(seqGroups, align, outputPrefix, transSeq=False,
                   extendAlphabet=False, clusterMotifs=False, protein=False,
                   threads=2, stream=None):
    """Write PWMs, consensus sequences and logos for every sequence group.

    For each key of ``seqGroups`` (processed in sorted order) a motif is
    obtained from ``generateMotif`` and:
      * its normalised counts are appended to ``<outputPrefix>_pwm.txt``;
      * its consensus is appended to ``<outputPrefix>_consensus.txt``; the
        degenerate consensus is added as well when none of ``transSeq``,
        ``align`` and ``protein`` is set;
      * the logo file name ``<outputPrefix>_logos/<group without '/'>.png``
        is handed to ``generateMotif``.

    If ``generateMotif`` returns ``None`` for any group the function stops
    immediately (motif file found, no further work required).  Both output
    files are closed on every exit path, including that early return and
    any exception.

    Parameters:
        seqGroups: mapping of group name (str) to the sequences of that group.
        align, transSeq, extendAlphabet, protein: forwarded to
            ``createAlphabet`` / ``generateMotif``; ``align``, ``transSeq``
            and ``protein`` also gate the degenerate consensus output.
        outputPrefix: path prefix used for all generated files and folders.
        clusterMotifs: when true, groups with more than 10 motif instances
            are converted to TAMO ``Motif`` objects and passed to
            ``findMotifClusters`` at the end.
        threads: forwarded to ``generateMotif``.
        stream: forwarded to ``printto`` and the helper functions.

    Returns:
        None.
    """
    from TAMO.MotifTools import Motif

    ighvMotifs = []
    # NOTE(review): ighvMotifs is still empty at this point, so this call
    # receives an empty list -- kept as in the original; confirm it is intended.
    if clusterMotifs and 'gene' in outputPrefix:
        findMotifClusters(ighvMotifs, outputPrefix, stream=stream)

    printto(stream, '\t\tPWMs, consensus and logos are being generated for {} motifs ... '.format(len(seqGroups)))

    pwmPath = outputPrefix + '_pwm.txt'
    consensusPath = outputPrefix + '_consensus.txt'
    logosFolder = outputPrefix + '_logos'

    # The context manager guarantees both files are closed even when we
    # return early or a helper raises.
    with open(pwmPath, 'w') as pwmFile, open(consensusPath, 'w') as consensusFile:
        if not os.path.exists(logosFolder):
            os.makedirs(logosFolder)

        # create the sequence alphabet: DNA or Protein
        alphabet = createAlphabet(align, transSeq, extendAlphabet, protein)

        # sorted() works on both Python 2 lists and Python 3 key views.
        for group in sorted(seqGroups):
            filename = os.path.join(logosFolder, group.replace('/', '') + '.png')
            seqs = seqGroups[group]
            m = generateMotif(seqs, group, alphabet, filename, align, transSeq, protein,
                              outDir=logosFolder, threads=threads, stream=stream)
            if m is None:
                # motif file found, no further work required
                return

            motifSeqs = m.instances
            pwm = m.counts.normalize(pseudocounts=None)
            consensusMax = str(m.consensus)

            pwmFile.write('#{} {} sequences\n'.format(group, len(motifSeqs)))
            pwmFile.write(str(pwm))

            consensusFile.write('>{} max_count\n'.format(group))
            consensusFile.write(consensusMax + '\n')

            if not transSeq and not align and not protein:
                # IUPAC ambiguous nucleotides
                consensusIupac = str(m.degenerate_consensus)
                consensusFile.write('>{} degenerate\n'.format(group))
                consensusFile.write(consensusIupac + '\n')

            pwmFile.flush()
            consensusFile.flush()
            gc.collect()

            if clusterMotifs and len(motifSeqs) > 10:
                motif = Motif([str(x) for x in motifSeqs],
                              backgroundD={'A': 0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6},
                              id=group)
                motif.addpseudocounts(0.1)
                ighvMotifs.append(motif)

    gc.collect()

    printto(stream, "\tPosition weight matrices are written to " + os.path.basename(pwmPath))
    printto(stream, "\tConsensus sequences are written to " + os.path.basename(consensusPath))

    if clusterMotifs:
        findMotifClusters(ighvMotifs, outputPrefix, stream=stream)
def ifKmerInAll(kmer, dictOfSeqs, factor=1):
    """Tell whether ``kmer`` is found in every sequence of ``dictOfSeqs``.

    ``kmer`` is wrapped in a ``Motif`` and each value of ``dictOfSeqs`` is
    scanned with it (passing ``factor`` through).  A sequence counts as a hit
    when the first element of the scan result is truthy.  Every sequence is
    scanned, even after a miss.

    Returns True when all sequences are hits (also for an empty
    ``dictOfSeqs``), False otherwise.
    """
    motif = Motif(kmer)
    hits = [
        bool(motif.scan(dictOfSeqs[name], factor=factor)[0])
        for name in dictOfSeqs
    ]
    return all(hits)
def taMotif2MopatMatrix(taMotif):
    """
    Convert ONE tamo motif into the [name, matrix] pair used with MOPAT.

    The name is the motif's ``oneletter`` string with every '.' replaced by
    'n'.  The matrix is built from the ``counts`` of a new Motif created from
    ``taMotif.bogus_kmers()``: one row per position, holding that position's
    values ordered by sorted key.

    RETURNS: [oneLetterName, [list of lists]]
    """
    label = taMotif.oneletter.replace('.', 'n')

    # Rebuild the motif from its bogus k-mers so that .counts is populated.
    rebuilt = Motif(taMotif.bogus_kmers())

    rows = [
        [column[key] for key in sorted(column.keys())]
        for column in rebuilt.counts
    ]
    return [label, rows]
def _parse(self):
    """Parse MDscan file.

    Joins self.lines, splits the text on '\\nMtf ' and prints the number of
    resulting chunks.  Within each chunk:
      * a line starting with 'Final Motif' supplies the score (7th token, as
        float) and the seed number (9th token, as int);
      * the last token of every line starting with '>' is collected as a
        site sequence.
    Each chunk with at least one sequence yields a Motif whose MAP and
    seednum attributes are set and which is appended to self.motifs.
    """
    text = '\n'.join(self.lines)
    chunks = text.split('\nMtf ')
    # Parenthesised form prints the same value under Python 2 and Python 3.
    print(len(chunks))

    for chunk in chunks:
        score = 0
        seednum = 0
        seqs = []
        for row in chunk.split('\n'):
            if row.startswith('Final Motif'):
                fields = row.split()
                score = float(fields[6])
                seednum = int(fields[8])
            if row.startswith('>'):
                seqs.append(row.split()[-1])

        # Chunks without any '>' line carry no motif.
        if not seqs:
            continue

        motif = Motif(seqs)
        motif.MAP = score
        motif.seednum = seednum
        self.motifs.append(motif)
def _parse(self):
    """Parse MDscan file.

    The content of self.lines is joined with newlines and cut at every
    '\\nMtf ' marker; the number of pieces is printed.  For each piece the
    'Final Motif' line (if any) gives the score (token index 6, float) and
    seed number (token index 8, int), and each line beginning with '>'
    contributes its last token as a sequence.  Pieces that produced
    sequences are turned into Motif objects (with MAP and seednum set) and
    appended to self.motifs.
    """
    pieces = '\n'.join(self.lines).split('\nMtf ')
    # Same output as the Python 2 print statement for a single value.
    print(len(pieces))

    for piece in pieces:
        map_score, seed = 0, 0
        sites = []
        for text_line in piece.split('\n'):
            if text_line.startswith('Final Motif'):
                tokens = text_line.split()
                map_score = float(tokens[6])
                seed = int(tokens[8])
            if text_line.startswith('>'):
                sites.append(text_line.split()[-1])

        if sites:
            found = Motif(sites)
            found.MAP = map_score
            found.seednum = seed
            self.motifs.append(found)
from TAMO.MotifTools import Motif
import motility

# Build a TAMO motif from the pattern 'WGATAR', take the k-mers returned by
# bogus_kmers(), and construct both a TAMO Motif and a motility PWM from them.
tM = Motif('WGATAR')
sites = tM.bogus_kmers()
tM = Motif(sites)
mM = motility.make_pwm(sites)

# Search the test sequence with the motility PWM; the second argument is
# 0.75 times the PWM's max_score().
s = 'ATGCATGCTAGCGGCTGATAACGCTTATCATATGC'
mReults = mM.find(s, mM.max_score() * 0.75)