コード例 #1
0
def generateMotifs(seqGroups, align, outputPrefix, transSeq=False,
                        extendAlphabet=False, clusterMotifs=False, protein=False, threads=2, stream=None):
    """
    Write a position weight matrix, consensus sequence(s) and a logo for every
    sequence group, and optionally cluster the resulting motifs.

    Output locations are all derived from ``outputPrefix``:
      * ``<outputPrefix>_pwm.txt``       - one normalised count matrix per group
      * ``<outputPrefix>_consensus.txt`` - FASTA-like consensus entries per group
      * ``<outputPrefix>_logos/``        - directory handed to generateMotif
                                           (created if it does not exist)

    :param seqGroups: mapping of group name (string) to the sequences of that group
    :param align: forwarded to createAlphabet and generateMotif; the degenerate
                  consensus is only written when this, transSeq and protein are all falsy
    :param outputPrefix: path prefix used to build every output file/folder name
    :param transSeq: forwarded to createAlphabet and generateMotif
    :param extendAlphabet: forwarded to createAlphabet
    :param clusterMotifs: when truthy, a TAMO Motif is built for every group with
                          more than 10 motif instances and findMotifClusters is called
    :param protein: forwarded to createAlphabet and generateMotif
    :param threads: forwarded to generateMotif
    :param stream: forwarded to printto, generateMotif and findMotifClusters
    :return: None. Returns early (after closing the output files) as soon as
             generateMotif returns None for a group.
    """
    from TAMO.MotifTools import Motif
    ighvMotifs = []
    if clusterMotifs and 'gene' in outputPrefix:
        # NOTE(review): this is invoked with an empty motif list before any motif
        # has been built - presumably findMotifClusters can reuse previously saved
        # motifs; confirm against its implementation.
        findMotifClusters(ighvMotifs, outputPrefix, stream=stream)
    printto(stream, '\t\tPWMs, consensus and logos are being generated for {} motifs ... '.format(len(seqGroups)))
    pwmPath = outputPrefix + '_pwm.txt'
    consensusPath = outputPrefix + '_consensus.txt'
    logosFolder = outputPrefix + '_logos'

    # The context manager guarantees both files are closed on the early return
    # below and when any helper raises.
    with open(pwmPath, 'w') as pwmFile, open(consensusPath, 'w') as consensusFile:
        if not os.path.exists(logosFolder):
            os.makedirs(logosFolder)

        # create the sequence alphabet: DNA or Protein
        alphabet = createAlphabet(align, transSeq, extendAlphabet, protein)

        # sorted() works on both Python 2 and 3 (dict.keys().sort() is Python 2 only)
        for group in sorted(seqGroups):
            # '/' is stripped from the group name so it cannot act as a path separator
            filename = os.path.join(logosFolder, group.replace('/', '') + '.png')
            seqs = seqGroups[group]
            m = generateMotif(seqs, group, alphabet, filename, align, transSeq, protein, outDir=logosFolder,
                              threads=threads, stream=stream)
            if m is None:
                # motif file found, no further work required
                return
            motifSeqs = m.instances
            pwm = m.counts.normalize(pseudocounts=None)
            consensusMax = str(m.consensus)

            pwmFile.write('#{} {} sequences\n'.format(group, len(motifSeqs)))
            pwmFile.write(str(pwm))
            consensusFile.write('>{} max_count\n'.format(group))
            consensusFile.write(consensusMax + '\n')
            if not transSeq and not align and not protein:
                # the degenerate consensus is only emitted for the plain
                # (untranslated, unaligned, non-protein) case
                consensusIupac = str(m.degenerate_consensus)
                consensusFile.write('>{} degenerate\n'.format(group))
                consensusFile.write(consensusIupac + '\n')

            # push each group's results to disk before moving on to the next one
            pwmFile.flush()
            consensusFile.flush()
            gc.collect()
            if clusterMotifs and len(motifSeqs) > 10:
                # a real list (not a lazy map object) so the behaviour is the
                # same on Python 2 and 3
                motif = Motif([str(seq) for seq in motifSeqs],
                              backgroundD={'A': 0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6}, id=group)
                motif.addpseudocounts(0.1)
                ighvMotifs.append(motif)

    gc.collect()
    printto(stream, "\tPosition weight matrices are written to " + os.path.basename(pwmPath))
    printto(stream, "\tConsensus sequences are written to " + os.path.basename(consensusPath))
    if clusterMotifs:
        findMotifClusters(ighvMotifs, outputPrefix, stream=stream)