def main(args):
    """Count co-mutations between adjacent motif positions and report p-values.

    Steps, in order:
      1. Read the PWM file name from ``args[1]``.
      2. Load the pickled 5-way alignment and the GFF file of PWM hits.
      3. Build one motif (a list of aligned sequences) per hit via
         ``buildMotifs``.
      4. Colour one copy of the fixed species tree per motif column and feed
         every pair of adjacent columns to ``countCMs``.
      5. Print one ``chisquarematrix.chisquare`` result per adjacent pair.

    Parameters:
        args: argv-style sequence; must contain exactly two items, the
            second being the PWM file name.

    Exits with status 1 on a wrong argument count and with status 0 (after
    printing "NO HITS") when ``buildMotifs`` returns nothing.

    NOTE(review): this file defines ``main`` a second time further down; the
    later definition rebinds the name, so this version is not the one a
    caller of ``main`` gets unless that is changed.
    """
    # Parse arguments for PWM filename
    if len(args) != 2:
        print("Usage: ./cm_scan.py PWM_FILE")
        sys.exit(1)
    pwm_file = args[1]

    # Hard-coded input locations
    fasta = "../doc/fasta/human1ku-corrected.fasta"

    # Load the 5-way alignment.  Binary mode is what pickle expects, and the
    # with-block closes the handle even if loading fails.
    # NOTE: unpickling can execute arbitrary code; only a trusted, locally
    # generated file should live at this path.
    with open("../doc/pickle/alignment.pickle", "rb") as a_file:
        alignment = pickle.load(a_file)

    # The GFF is expected to exist already, i.e. pwm_scan has been run on the
    # human FASTA beforehand (formerly: pwm_scan -f FASTA -M PWM -p -12.0,
    # once with -s 1 and once with -s 2).
    pwm = pwm_file.replace(".pwm", "")
    gff_path = fasta + "." + pwm.replace("../doc/pwm/", "") + ".gff"

    # For each hit in the GFF, throw out hits that span gaps, build motifs
    with open(gff_path, "r") as gff:
        motifs = buildMotifs(gff, alignment)
    if len(motifs) == 0:
        print("NO HITS")
        sys.exit(0)

    alphabet = [
        "A->A", "A->C", "A->G", "A->T",
        "C->A", "C->C", "C->G", "C->T",
        "G->A", "G->C", "G->G", "G->T",
        "T->A", "T->C", "T->G", "T->T",
    ]
    # Number of adjacent column pairs, taken from the first motif.
    n_pairs = len(motifs[0][0]) - 1

    # One zeroed count table per adjacent pair, keyed "X->Y/U->V".
    pair_keys = []
    for left in alphabet:
        for right in alphabet:
            pair_keys.append(left + "/" + right)
    mega = []
    for _ in range(n_pairs):
        mega.append(dict.fromkeys(pair_keys, 0))

    # Create the fixed tree structure
    supertree = Tree([])
    root = TreeNode("r")
    rodent = TreeNode("")
    primate = TreeNode("")
    highermammal = TreeNode("")
    mouse = TreeNode("3")
    rat = TreeNode("4")
    human = TreeNode("1")
    chimp = TreeNode("2")
    dog = TreeNode("5")
    root.children = [rodent, highermammal]
    rodent.children = [mouse, rat]
    highermammal.children = [dog, primate]
    primate.children = [human, chimp]
    supertree.r = root
    # NOTE(review): `dog` is attached to the tree but missing from this
    # list; kept exactly as it was because what reads `node` is not visible
    # here -- confirm whether the omission is intentional.
    supertree.node = [root, highermammal, rodent, mouse, rat, primate, human, chimp]

    for motif in motifs:
        # Skip motifs with ragged rows or with an "N" in any row.
        if any(len(row) != len(motif[0]) or "N" in row for row in motif):
            continue

        # Write the motif to the scratch file (input for the external tree
        # estimator step, which is currently not run).
        with open("cm_scan.tmp", "w") as tmp:
            writeMotif(motif, tmp)

        # For each position in the hit, color a new tree
        trees = []
        for position in range(len(motif[0])):
            column = [row[position] for row in motif]
            trees.append(color(supertree, column))

        # For every pair of adjacent trees, count CMs
        for pos in range(len(motif[0]) - 1):
            countCMs(trees[pos].r, trees[pos + 1].r, mega[pos])

    # Do a chi-squared analysis on each position matrix
    print("DEGREES OF FREEDOM: %d" % (15 * 15))
    size = len(alphabet)
    for pos in range(n_pairs):
        counts = zeros((size, size))
        for j in range(size):
            for k in range(size):
                counts[j, k] = mega[pos][alphabet[j] + "/" + alphabet[k]]
        # Spacing matches what the former comma-separated print statement
        # produced.
        print("PVALUE at pos ( %d ,  %d ) %s"
              % (pos, pos + 1, chisquarematrix.chisquare(counts)))
# Tree edges as (parent, child), in the order their factors are multiplied
# when scoring; count tables are keyed "parent->child".
_CM_EDGES = (
    ("root", "rodent"),
    ("root", "highermammal"),
    ("rodent", "mouse"),
    ("rodent", "rat"),
    ("highermammal", "dog"),
    ("highermammal", "primate"),
    ("primate", "chimp"),
    ("primate", "human"),
)
_CM_BASES = ("A", "C", "G", "T")
# The 16 "X->Y" labels for one edge at one position.
_CM_SUBSTITUTIONS = tuple(
    src + "->" + dst for src in _CM_BASES for dst in _CM_BASES
)
# Number of random train/test splits whose scores are summed.
_CM_ROUNDS = 200


def _cm_new_tables(n_pairs):
    """Return fresh zeroed (uber, inter, rlist, blist) count tables."""
    pair_keys = [a + "/" + b
                 for a in _CM_SUBSTITUTIONS for b in _CM_SUBSTITUTIONS]
    root_keys = [a + "|" + b for a in _CM_BASES for b in _CM_BASES]
    edge_names = [parent + "->" + child for parent, child in _CM_EDGES]
    uber = [dict((e, dict.fromkeys(_CM_SUBSTITUTIONS, 0)) for e in edge_names)
            for _ in range(n_pairs)]
    inter = [dict((e, dict.fromkeys(pair_keys, 0)) for e in edge_names)
             for _ in range(n_pairs)]
    rlist = [dict.fromkeys(root_keys, 0) for _ in range(n_pairs)]
    blist = dict.fromkeys(_CM_BASES, 0)
    return uber, inter, rlist, blist


def _cm_normalise(counts):
    """Turn a dict of counts into fractions in place; all-zero stays zero."""
    total = sum(counts.values())
    if total == 0:
        return
    for key in counts:
        counts[key] = float(counts[key]) / total


def _cm_nodes(tree):
    """Map node names to the nodes of a coloured tree, by child position."""
    root = tree.r
    rodent = root.children[0]
    highermammal = root.children[1]
    primate = highermammal.children[1]
    return {
        "root": root,
        "rodent": rodent,
        "highermammal": highermammal,
        "mouse": rodent.children[0],
        "rat": rodent.children[1],
        "dog": highermammal.children[0],
        "primate": primate,
        "human": primate.children[0],
        "chimp": primate.children[1],
    }


def _cm_usable(motif):
    """Return True when every row has the first row's length and no "N"."""
    return not any(len(row) != len(motif[0]) or "N" in row for row in motif)


def _cm_color_columns(supertree, motif):
    """Return one coloured tree per motif column."""
    return [color(supertree, [row[pos] for row in motif])
            for pos in range(len(motif[0]))]


def main(args):
    """Score adjacent motif positions over repeated random train/test splits.

    Steps, in order:
      1. Read the PWM file name from ``args[1]``; load the pickled 5-way
         alignment and the GFF of PWM hits; build motifs via ``buildMotifs``.
      2. Repeat 200 times: shuffle the motifs, cut them once into a training
         and a test part, estimate per-edge substitution frequencies from the
         training part, and add ``-log(p_t2 / p_t1)`` for every adjacent
         column pair of every usable test motif.
      3. Print, per adjacent pair, the 1-based positions and the summed score,
         then exit with status 0.

    Changes relative to the previous revision of this function:
      * score accumulators start at zero (they used to start at 0, 1, 2, ...
        because they were initialised from ``range``);
      * training and test sets come from one cut point after shuffling, so
        they are disjoint and together cover every motif;
      * count tables are rebuilt for every split instead of adding new counts
        onto the previous split's normalised values;
      * a table that received no counts stays all-zero instead of raising
        ZeroDivisionError;
      * fewer than two motifs now ends the program instead of looping forever;
      * the chi-squared code that followed the unconditional ``sys.exit(0)``
        (and the count table only it used) was unreachable and is gone.

    Parameters:
        args: argv-style sequence; must contain exactly two items, the
            second being the PWM file name.

    Exits with status 1 on a wrong argument count or when there are too few
    motifs to split, and with status 0 otherwise.
    """
    # Parse arguments for PWM filename
    if len(args) != 2:
        print("Usage: ./cm_scan.py PWM_FILE")
        sys.exit(1)
    pwm_file = args[1]

    fasta = "../doc/fasta/human1ku-corrected.fasta"

    # Load the 5-way alignment (binary mode, as pickle expects).
    # NOTE: unpickling can execute arbitrary code; only a trusted, locally
    # generated file should live at this path.
    with open("../doc/pickle/alignment.pickle", "rb") as a_file:
        alignment = pickle.load(a_file)

    # The GFF is expected to have been produced by pwm_scan beforehand.
    pwm = pwm_file.replace(".pwm", "")
    gff_path = fasta + "." + pwm.replace("../doc/pwm/", "") + ".gff"
    with open(gff_path, "r") as gff:
        motifs = buildMotifs(gff, alignment)
    if len(motifs) == 0:
        print("NO HITS")
        sys.exit(0)
    if len(motifs) < 2:
        # A single motif cannot be divided into non-empty training and test
        # parts.
        print("NOT ENOUGH HITS TO SPLIT")
        sys.exit(1)

    # Create the fixed tree structure
    supertree = Tree([])
    root = TreeNode("r")
    rodent = TreeNode("")
    primate = TreeNode("")
    highermammal = TreeNode("")
    mouse = TreeNode("3")
    rat = TreeNode("4")
    human = TreeNode("1")
    chimp = TreeNode("2")
    dog = TreeNode("5")
    root.children = [rodent, highermammal]
    rodent.children = [mouse, rat]
    highermammal.children = [dog, primate]
    primate.children = [human, chimp]
    supertree.r = root
    # NOTE(review): `dog` is attached to the tree but missing from this
    # list; kept exactly as it was because what reads `node` is not visible
    # here -- confirm whether the omission is intentional.
    supertree.node = [root, highermammal, rodent, mouse, rat, primate, human, chimp]

    # Number of adjacent column pairs, taken from the first motif.
    n_pairs = len(motifs[0][0]) - 1
    logs = [0] * n_pairs

    for _ in range(_CM_ROUNDS):
        # Shuffle first, then cut once; randint keeps both parts non-empty.
        random.shuffle(motifs)
        cut = random.randint(1, len(motifs) - 1)
        training = motifs[:cut]
        test = motifs[cut:]

        uberlist, interlist, rlist, blist = _cm_new_tables(n_pairs)

        # ---- Estimation: count substitutions on the training motifs ----
        for motif in training:
            if not _cm_usable(motif):
                continue

            # Write the motif to the scratch file (input for the external
            # tree estimator step, which is currently not run).
            with open("cm_scan.tmp", "w") as tmp:
                writeMotif(motif, tmp)

            trees = _cm_color_columns(supertree, motif)
            for i in range(len(motif[0]) - 1):
                first = _cm_nodes(trees[i])
                second = _cm_nodes(trees[i + 1])
                for parent, child in _CM_EDGES:
                    edge = parent + "->" + child
                    here = first[parent].color + "->" + first[child].color
                    there = second[parent].color + "->" + second[child].color
                    # Single-position edge counts.
                    uberlist[i][edge][here] += 1
                    # Joint edge counts over the two adjacent positions.
                    interlist[i][edge][here + "/" + there] += 1
                # Root colour at this position paired with the next one.
                rlist[i][first["root"].color + "|" + second["root"].color] += 1
                # Background root colour counts.
                blist[first["root"].color] += 1

        # ---- Turn counts into fractions ----
        # rlist is normalised over all 16 root pairs of a position.
        for i in range(n_pairs):
            for edge in uberlist[i]:
                _cm_normalise(uberlist[i][edge])
            for edge in interlist[i]:
                _cm_normalise(interlist[i][edge])
            _cm_normalise(rlist[i])
        _cm_normalise(blist)

        # ---- Scoring: sum -log(p_t2 / p_t1) over the test motifs ----
        ratios = [0] * n_pairs
        for motif in test:
            if not _cm_usable(motif):
                continue

            trees = _cm_color_columns(supertree, motif)
            for i in range(len(motif[0]) - 1):
                first = _cm_nodes(trees[i])
                second = _cm_nodes(trees[i + 1])

                # p_t1: root background fraction times, per edge, the
                # fraction of training edges ending in the observed child
                # colour (summed over all parent colours).
                p_t1 = blist[first["root"].color]
                # p_t2: root-pair fraction times, per edge, whatever
                # getprobs derives from the joint table for the two observed
                # child colours.
                p_t2 = rlist[i][first["root"].color + "|" + second["root"].color]
                for parent, child in _CM_EDGES:
                    edge = parent + "->" + child
                    child_color = first[child].color
                    p_t1 *= sum(uberlist[i][edge][src + "->" + child_color]
                                for src in ("A", "G", "C", "T"))
                    p_t2 *= getprobs(interlist[i][edge],
                                     child_color, second[child].color)

                # A zero probability contributes nothing instead of a log
                # domain error.
                if p_t1 != 0 and p_t2 != 0:
                    ratios[i] += -1 * math.log(p_t2 / p_t1)

        for i in range(len(ratios)):
            logs[i] += ratios[i]

    # Print the log-sums with 1-based positions of each adjacent pair
    for i in range(len(logs)):
        print("%d %d %s" % (i + 1, i + 2, logs[i]))

    sys.exit(0)