def pick_chunk_score(wdir, TAMO_file, target, genome):
    '''Trim the motifs of one cluster and write out its top motif.

    Takes the TAMO file holding the motifs of a single cluster, trims the
    low-information ends from each motif, and then identifies the motif that
    is most significantly represented in the target genes.  The selected
    motif is written to "<TAMO_file>.TOP"; if no motif is selected, an empty
    "<TAMO_file>.TOP" file is created instead.

    Arguments:
        wdir      -- working directory; the process changes into it.
        TAMO_file -- TAMO file with the motifs of a single cluster.
        target    -- genes of interest, passed to MotifMetrics.py.
        genome    -- FASTA of promoter sequence, passed as -genome.

    Nothing is returned; all results are written to files.
    '''
    # Imported locally so this function does not depend on the module's
    # top-level import list.
    import subprocess

    os.chdir(wdir)

    # Directory containing this script; MotifMetrics.py is expected to sit
    # next to it.
    script_dir = os.path.dirname(os.path.abspath(__file__))

    trimmed_path = "%s_0.1.trim" % TAMO_file
    score_path = "%s_0.1.trim_Cout" % TAMO_file
    top_path = "%s.TOP" % TAMO_file

    ##
    # step 1: trim the TAMO file to eliminate low-information flanking
    # sequence.  trim_motif writes "<TAMO_file>_0.1.trim".
    trim_motif(TAMO_file, 0.1)

    ##
    # step 2: "Group Specificity Score" from the Church lab.
    # python MotifMetrics.py [Genes of interest] -genome [FASTA of promoter
    # sequence] -t [Trimmed TAMO of cluster motifs]
    # MotifMetrics.py checks if the motifs appear disproportionately in the
    # targets compared to the rest of the genes.
    # An argument list (rather than a shell string) keeps paths containing
    # spaces or shell metacharacters intact.
    with open(score_path, "w") as score_out:
        subprocess.call(
            ["python", os.path.join(script_dir, "MotifMetrics.py"),
             target, "-genome", genome, "-t", trimmed_path, "-spec"],
            stdout=score_out)

    ##
    # Gets the motif that is most significantly represented in the target
    # genes.  parse_out_pcs yields the string "None" when no motif passes
    # its significance cut-off (per the original author's note).
    topm = parse_out_pcs(score_path)
    print("topm %s" % (topm,))

    if topm != "None":
        ##
        # Writes the top motif to its own file.  If several trimmed motifs
        # share the winning one-letter representation, the last one is kept.
        matches = [m for m in MotifTools.txt2motifs(trimmed_path)
                   if m.oneletter == topm]
        save_motifs(matches[-1:], top_path)

        # Clean up the intermediate files.
        for leftover in (trimmed_path, score_path):
            if os.path.exists(leftover):
                os.remove(leftover)
    else:
        ##
        # Writes a blank document if there was no top motif.
        # NOTE(review): the intermediate files are left in place on this
        # path, exactly as before -- confirm whether that is intended.
        with open(top_path, "w"):
            pass
def trim_motif(TAMO_file, cut=0.4):
    '''Save a copy of TAMO_file whose motifs have been trimmed at `cut`.

    Every motif loaded from TAMO_file is replaced by motif.trimmed(cut) and
    the resulting list is saved to "<TAMO_file>_<cut>.trim".
    '''
    loaded = MotifTools.load(TAMO_file)
    out_path = TAMO_file + "_%s.trim" % str(cut)
    trimmed = [each.trimmed(cut) for each in loaded]
    save_motifs(trimmed, out_path)
def TAMO_split(TAMO_file, motifs_per_file=190):
    '''Split a TAMO file into smaller files for create_cc.

    The motifs are written, in order, to "<TAMO_file>_n0",
    "<TAMO_file>_n1", ... with `motifs_per_file` motifs each.  The final
    file "<TAMO_file>_n<total>" receives whatever is left over; it is
    written even when nothing is left (the motif count is an exact multiple
    of `motifs_per_file`).

    Arguments:
        TAMO_file       -- path of the TAMO file to split.
        motifs_per_file -- chunk size; an int or anything int() accepts
                           (for example a string taken from the command
                           line).

    Returns the index of the last file written (`total`), i.e. the number
    of full-size chunks.

    Raises ValueError if motifs_per_file is smaller than 1.
    '''
    # Convert once so that the slice arithmetic below always uses an int;
    # previously only the chunk count was converted and a string argument
    # broke the slicing.
    by = int(motifs_per_file)
    if by < 1:
        raise ValueError(
            "motifs_per_file must be a positive integer, got %r"
            % (motifs_per_file,))

    ml = MotifTools.txt2motifs(TAMO_file)
    total = len(ml) // by  # Number of full-size TAMO files to generate

    for i in range(total):
        start = i * by
        chunk_name = TAMO_file + '_n%s' % i
        print(i)
        print("%s %s" % (start + by, chunk_name))
        save_motifs(ml[start:start + by], chunk_name)

    # Remainder (possibly empty) goes into the last file.
    last_name = TAMO_file + '_n%s' % (total)
    print("%s %s %s" % (total * by, len(ml), last_name))
    save_motifs(ml[total * by:], last_name)
    return total
def tf2tamo(tf_file):
    '''Convert the motif records in tf_file to a TAMO file.

    A record starts at a line beginning with "DE"; the second
    whitespace-separated token on that line is taken as the motif name.
    The lines that follow, up to (not including) the next line beginning
    with "XX" or the end of the file, are handed to parse_block together
    with the name.  All parsed motifs are saved to "<tf_file>.tm".

    Raises IndexError if a "DE" line carries no name token.
    '''
    motifs = []
    # The context manager guarantees the input handle is closed, including
    # when parse_block raises.
    with open(tf_file, "r") as inp:
        line = inp.readline()
        while line:
            if not line.startswith("DE"):
                line = inp.readline()
                continue

            name = line.strip().split()[1]
            block = []
            line = inp.readline()
            # An empty string from readline() means end of file.
            while line and not line.startswith("XX"):
                block.append(line)
                line = inp.readline()
            motifs.append(parse_block(name, block))

    save_motifs(motifs, '%s.tm' % (tf_file))
tamoMotifs = [] for jMat in jasparFiles: tempMat = map(lambda l: l.strip().split(), open(jMat, "rU").readlines()) ## eval() inteligently converts text numbers to int or float! # for i in range(len(tempMat)): # for j in range(len(tempMat[i])): # tempMat[i][j] = eval(tempMat[i][j]) # transpose matrix tempMat = transpose(mat(tempMat)).tolist() for i in range(len(tempMat)): tempMat[i] = { "A": eval(tempMat[i][0]), "C": eval(tempMat[i][1]), "G": eval(tempMat[i][2]), "T": eval(tempMat[i][3]), } jasparTAMO_Motif = Motif_from_counts(tempMat, bg=nucBack) jasparTAMO_Motif.sourceFile = jMat.split("/")[-1] tamoMotifs.append(jasparTAMO_Motif) # print to file cPickle.dump(tamoMotifs, open(pklFile, "w")) save_motifs(tamoMotifs, tmoFile, kmer_count=60) print "Done."
def merge_runs_cc(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''Prepare per-cluster merge jobs from the UPGMA clustering of motifs.

    This is used to merge motifs with the PCC matrix of all motifs.  The
    script was originally written by Cheng Zou, and then converted to a
    function by Alex Seddon.

    Reads "<TAMO_file>.dm_UPGMA_Cl_<height>", which has one cluster label
    per line in the same order as the motifs in TAMO_file.  For every
    cluster with more than one motif, the member motifs are saved to
    "<TAMO_file>_sub_<cluster>.tm" and one command line invoking
    pcc_merge_CC.py is appended to the file "merge_runs_cc" in wdir.
    Motifs that are the sole member of their cluster are collected and
    saved to "<TAMO_file>_sub_old.tm".

    Arguments:
        TAMO_file -- TAMO file with all motifs.
        wdir      -- working directory; the process changes into it.
        height    -- tree-cut height, used only to build the cluster file
                     name.
        distance  -- not used by this function.
        ancestor  -- 0 writes merge_runs_no_ancestor commands, 1 writes
                     merge_runs_ancestor commands; for any other value no
                     cluster is processed.
        target    -- passed through to the generated commands (-target).
        genome    -- passed through to the generated commands (-genome).
    '''
    print("Here are the parameters you specified in this run ")
    print("-tamo %s" % TAMO_file)
    print("-wdir %s" % wdir)
    print("-h height to cut the tree, %s" % height)
    print("-ancestor %s" % ancestor)
    print("-target %s" % target)
    print("-genome %s" % genome)

    if TAMO_file == '' or wdir == '':
        # NOTE(review): `help` is whatever is bound at module level
        # (possibly the builtin); execution continues afterwards, as before.
        help()

    os.chdir(wdir)

    # Get the directory where the script is located (pcc_merge_CC.py is
    # expected there).
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # The file TAMO_file.dm_UPGMA_Cl_<height> is in the order of the motifs
    # that appear in the TAMO_file.  If two motifs have the same label,
    # they are considered part of the same cluster.
    # This loop pulls the clustering information out of this file and
    # creates the dictionary cl_dic = {cluster_label: {motif_index: '1'}}.
    cl_dic = {}
    with open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r") as cluster_file:
        for n, line in enumerate(cluster_file):
            cl = line.strip()
            cl_dic.setdefault(cl, {})[n] = "1"

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    print("%s %s" % (ancestor, ancestor == 0))

    # The two modes differ only in the command line written per cluster.
    if ancestor == 0:
        command = ('module load TAMO; python %s/pcc_merge_CC.py '
                   'merge_runs_no_ancestor -t %s/%s -i %s -target %s '
                   '-genome %s\n')
    elif ancestor == 1:
        command = ('module load TAMO; module load STAMPmotif; '
                   'python %s/pcc_merge_CC.py merge_runs_ancestor '
                   '-t %s/%s -i %s -target %s -genome %s\n')
    else:
        command = None

    # The job file is created (possibly empty) in every case and is closed
    # deterministically when the block exits.
    with open('merge_runs_cc', 'w') as cc_output:
        if command is not None:
            for i in cl_dic:
                members = cl_dic[i]
                print("%s %s" % (i, members))

                if len(members) > 1:
                    # Several motifs in the cluster: save them to their own
                    # TAMO file and queue a merge command for it.
                    mlist = [ml[j] for j in members]
                    save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))
                    cc_output.write(
                        command % (script_dir, wdir, TAMO_file, i,
                                   target, genome))
                else:
                    # Only one motif in the cluster: leave it alone and
                    # add it to old.
                    old.append(ml[list(members)[0]])

    # Save the motifs that are in their own cluster.
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
def merge_runs(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''Merge clustered motifs and keep the top motif of every cluster.

    This is used to merge motifs with the PCC matrix of all motifs.  The
    script was originally written by Cheng Zou, and then converted to a
    function by Alex Seddon.

    Reads "<TAMO_file>.dm_UPGMA_Cl_<height>", which has one cluster label
    per line in the same order as the motifs in TAMO_file.  For every
    cluster with more than one motif, pick_chunk_score selects the top
    motif (written to "<TAMO_file>_sub_<cluster>_sum.tm.tf.tm.TOP").
    Motifs that are the sole member of their cluster are saved to
    "<TAMO_file>_sub_old.tm".  Finally the .TOP files are concatenated into
    "<TAMO_file>_sub_new.tm", and old + new into "<TAMO_file>_P1.tm".

    Arguments:
        TAMO_file -- TAMO file with all motifs.
        wdir      -- working directory; the process changes into it.
        height    -- tree-cut height, used only to build the cluster file
                     name.
        distance  -- only echoed; not otherwise used by this function.
        ancestor  -- 0 scores the cluster motifs as they are; 1 additionally
                     runs STAMP and adds its output to the candidates; for
                     any other value no cluster is processed.
        target    -- genes of interest, forwarded to pick_chunk_score.
        genome    -- promoter FASTA, forwarded to pick_chunk_score.
    '''
    print("Here are the parameters you specified in this run ")
    print("-tamo %s" % TAMO_file)
    print("-wdir %s" % wdir)
    print("-h height to cut the tree, %s" % height)
    print("-distance %s" % distance)
    print("-ancestor %s" % ancestor)
    print("-target %s" % target)
    print("-genome %s" % genome)

    if TAMO_file == '' or wdir == '':
        # NOTE(review): `help` is whatever is bound at module level
        # (possibly the builtin); execution continues afterwards, as before.
        help()

    os.chdir(wdir)

    # The distance-matrix and UPGMA steps that used to run here have been
    # replaced by the matrix creation scripts and the run_UPGMA script.

    # The file TAMO_file.dm_UPGMA_Cl_<height> is in the order of the motifs
    # that appear in the TAMO_file.  If two motifs have the same label,
    # they are considered part of the same cluster.
    # This loop pulls the clustering information out of this file and
    # creates the dictionary cl_dic = {cluster_label: {motif_index: '1'}}.
    cl_dic = {}
    with open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r") as cluster_file:
        for n, line in enumerate(cluster_file):
            cl = line.strip()
            cl_dic.setdefault(cl, {})[n] = "1"

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    print("%s %s" % (ancestor, ancestor == 0))

    if ancestor == 0 or ancestor == 1:
        # Look at each cluster and merge its motifs if there are several.
        for i in cl_dic:
            members = cl_dic[i]
            print("%s %s" % (i, members))

            if len(members) <= 1:
                # Only one motif in the cluster: leave it alone and add it
                # to old.
                old.append(ml[list(members)[0]])
                continue

            sub_tm = "%s_sub_%s.tm" % (TAMO_file, i)
            sub_tf = "%s_sub_%s.tm.tf" % (TAMO_file, i)
            sum_tf = "%s_sub_%s_sum.tm.tf" % (TAMO_file, i)
            sum_tm = "%s_sub_%s_sum.tm.tf.tm" % (TAMO_file, i)

            # Save the motifs of this cluster to their own TAMO file and
            # convert it to TF format.
            save_motifs([ml[j] for j in members], sub_tm)
            tamo2tf(sub_tm)

            if ancestor == 1:
                # Run STAMP on the cluster motifs.
                # NOTE(review): the score-distribution path is hard-coded
                # to one user's home directory.
                os.system(
                    "STAMP -tf %s_sub_%s.tm.tf -sd /home/chengzou/bin/STAMP/ScoreDists/JaspRand_PCC_SWU.scores "
                    "-go 1000 -ge 1000 -cc PCC -align SWU -out %s_sub_%s.tm.tf_STout -chp > %s_sub_%s.tm.tf_STout.log"
                    % (TAMO_file, i, TAMO_file, i, TAMO_file, i))
                parse_out_STAMP(TAMO_file, i)

                # Combine the STAMP results with the cluster motifs.
                os.system(
                    "cat %s_sub_%s.tm.tf %s_sub_%s.tm.tf_SToutFBP.txt.mod %s_sub_%s.tm.tf_STout_tree_clusters.txt > %s_sub_%s_sum.tm.tf"
                    % (TAMO_file, i, TAMO_file, i, TAMO_file, i,
                       TAMO_file, i))
            else:
                # Without STAMP the candidate set is just the cluster
                # itself.  The original author suspected that the round
                # trip through TF format only serves to keep motif names
                # consistent.
                os.system("cat %s > %s" % (sub_tf, sum_tf))

            # Convert the candidates back to TAMO format and get the top
            # motif in the cluster.
            tf2tamo(sum_tf)
            pick_chunk_score(wdir, sum_tm, target, genome)

            # Remove the files created during processing.  The shell is
            # kept here because the last pattern relies on globbing.
            os.system("rm %s" % sum_tm)
            os.system("rm %s" % sum_tf)
            os.system("rm -R %s_sub_%s.tm.tf_ST*" % (TAMO_file, i))

    # Combine the top motifs from every cluster, then append them to the
    # motifs that were alone in their cluster.
    os.system("cat %s_sub_*_sum.tm.tf.tm.TOP > %s_sub_new.tm"
              % (TAMO_file, TAMO_file))
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
    os.system("cat %s_sub_old.tm %s_sub_new.tm > %s_P1.tm"
              % (TAMO_file, TAMO_file, TAMO_file))
tar_dir = sys.argv[i + 1].rstrip("/") print tar_dir ml = MotifTools.txt2motifs(file) n = 0 new_list = [] ## # Looks at each motif from the TAMO file. Uses the find function from # motility to find the sequences with that motif. for Ikey in range(len(ml)): #print m.ll time1 = time.time() m = ml[Ikey] # Pull out the motif from the motif list. save_motifs([m], file + '_' + str(Ikey)) # Save the motif as a file. ## # Create the PWM matrix from the TAMO motif. matrix = [] for i in range(len(m.ll)): position = [] for nt in ["A", "C", "G", "T"]: position.append(m.ll[i][nt]) matrix.append(position) #print matrix pwm = motility.PWM(matrix) # ## ##