def generateMotifs(seqGroups, align, outputPrefix, transSeq=False,
                   extendAlphabet=False, clusterMotifs=False, protein=False,
                   threads=2, stream=None):
    """Write PWMs, consensus sequences and logos for every sequence group.

    For each key of ``seqGroups`` (visited in sorted order) a motif is built
    with ``generateMotif``; its normalised count matrix is appended to
    ``<outputPrefix>_pwm.txt`` and its consensus to
    ``<outputPrefix>_consensus.txt``.  Logo images are directed to
    ``<outputPrefix>_logos/<group>.png`` ('/' stripped from the group name).

    :param seqGroups: mapping of group name (string) to that group's sequences
    :param align: forwarded to ``createAlphabet`` and ``generateMotif``
    :param outputPrefix: path prefix for all files/folders written here
    :param transSeq: forwarded to ``createAlphabet`` and ``generateMotif``
    :param extendAlphabet: forwarded to ``createAlphabet``
    :param clusterMotifs: when truthy, groups with more than 10 motif
        instances are collected as TAMO motifs and handed to
        ``findMotifClusters`` once all groups are processed
    :param protein: forwarded to ``createAlphabet`` and ``generateMotif``
    :param threads: forwarded to ``generateMotif``
    :param stream: forwarded to ``printto``, ``generateMotif`` and
        ``findMotifClusters``
    :return: None.  Returns early (skipping the summary messages and the final
        clustering) as soon as ``generateMotif`` returns ``None`` for a group.

    The degenerate (IUPAC) consensus is written only when ``transSeq``,
    ``align`` and ``protein`` are all falsy.
    """
    # Function-scope import kept from the original, so TAMO is only required
    # when this function is actually called.
    from TAMO.MotifTools import Motif

    ighvMotifs = []
    if clusterMotifs and 'gene' in outputPrefix:
        # NOTE(review): called with a still-empty list; presumably
        # findMotifClusters reuses previously saved results in that case --
        # confirm against its implementation.
        findMotifClusters(ighvMotifs, outputPrefix, stream=stream)

    printto(stream, '\t\tPWMs, consensus and logos are being generated for {} motifs ... '.format(len(seqGroups)))

    pwmPath = outputPrefix + '_pwm.txt'
    consensusPath = outputPrefix + '_consensus.txt'

    # The context manager guarantees both files are closed on the early
    # return below and on any exception raised while processing a group.
    with open(pwmPath, 'w') as pwmFile, open(consensusPath, 'w') as consensusFile:
        logosFolder = outputPrefix + '_logos'
        if not os.path.exists(logosFolder):
            os.makedirs(logosFolder)

        # create the sequence alphabet: DNA or Protein
        alphabet = createAlphabet(align, transSeq, extendAlphabet, protein)

        # sorted() over the mapping yields its keys in order; unlike
        # keys() followed by .sort() it also works where keys() is a view.
        for group in sorted(seqGroups):
            filename = os.path.join(logosFolder, group.replace('/', '') + '.png')
            seqs = seqGroups[group]
            m = generateMotif(seqs, group, alphabet, filename, align, transSeq,
                              protein, outDir=logosFolder, threads=threads,
                              stream=stream)
            if m is None:
                # motif file found, no further work required
                return

            motifSeqs = m.instances
            pwm = m.counts.normalize(pseudocounts=None)
            consensusMax = str(m.consensus)

            pwmFile.write('#{} {} sequences\n'.format(group, len(motifSeqs)))
            pwmFile.write(str(pwm))
            consensusFile.write('>{} max_count\n'.format(group))
            consensusFile.write(consensusMax + '\n')

            if not transSeq and not align and not protein:
                # IUPAC ambiguous nucleotides
                consensusIupac = str(m.degenerate_consensus)
                consensusFile.write('>{} degenerate\n'.format(group))
                consensusFile.write(consensusIupac + '\n')

            # Flush per group so partial results are on disk, then collect
            # garbage before moving on to the next group.
            pwmFile.flush()
            consensusFile.flush()
            gc.collect()

            if clusterMotifs and len(motifSeqs) > 10:
                motif = Motif([str(x) for x in motifSeqs],
                              backgroundD={'A': 0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6},
                              id=group)
                motif.addpseudocounts(0.1)
                ighvMotifs.append(motif)

    gc.collect()
    printto(stream, "\tPosition weight matrices are written to " + os.path.basename(outputPrefix + '_pwm.txt'))
    printto(stream, "\tConsensus sequences are written to " + os.path.basename(outputPrefix + '_consensus.txt'))
    if clusterMotifs:
        findMotifClusters(ighvMotifs, outputPrefix, stream=stream)
def _parse(self):
    """Parse MDscan output held in ``self.lines`` into ``self.motifs``.

    The joined text is split on ``'\\nMtf '``.  Within each resulting chunk:

    * a line starting with ``'Final Motif'`` supplies the score (7th
      whitespace-separated token, as float) and the seed number (9th token,
      as int);
    * a line starting with ``'>'`` contributes its last whitespace-separated
      token as a motif sequence.

    A chunk that yields at least one sequence becomes a ``Motif`` with
    ``MAP`` and ``seednum`` attributes set, appended to ``self.motifs``.
    Score and seed number default to 0 when no 'Final Motif' line is present.

    Raises ``IndexError``/``ValueError`` if a 'Final Motif' line has fewer
    than nine tokens or non-numeric values in the expected positions.
    """
    alloutput = '\n'.join(self.lines)
    premotifs = alloutput.split('\nMtf ')
    # Diagnostic output kept from the original; the single-argument
    # parenthesised form prints the same text on Python 2 and Python 3.
    print(len(premotifs))
    for pm in premotifs:
        score, seednum = 0, 0
        seqs = []
        for line in pm.split('\n'):
            # startswith() checks only the prefix, where find(...) == 0
            # would scan the entire line on a miss.
            if line.startswith('Final Motif'):
                toks = line.split()
                score = float(toks[6])
                seednum = int(toks[8])
            if line.startswith('>'):
                seqs.append(line.split()[-1])
        if seqs:
            m = Motif(seqs)
            m.MAP = score
            m.seednum = seednum
            self.motifs.append(m)