Example #1
0
def main(args):
    """Count correlated mutations between adjacent positions of PWM hits.

    Loads the pickled alignment, builds motifs from the GFF file that
    belongs to the given PWM, colors the fixed species tree once per motif
    position, lets countCMs() tally each pair of adjacent positions into a
    per-pair table, and finally prints the value returned by
    chisquarematrix.chisquare() for every pair.

    args -- argv-style list; args[1] is the PWM file name.

    Calls sys.exit(1) on a usage error and sys.exit(0) when buildMotifs()
    returns no motifs.
    """

    # Parse arguments for PWM filename
    if len(args) != 2:
        print("Usage: ./cm_scan.py PWM_FILE")
        sys.exit(1)
    pwm_file = args[1]

    # Hard-coded values
    p = -12.0  # only referenced by the disabled pwm_scan commands below
    fasta = "../doc/fasta/human1ku-corrected.fasta"

    # Load the 5-way alignment.  The mode stays "r" to match however the
    # pickle was written.
    # NOTE(review): pickle.load() must only be used on trusted files.
    with open("../doc/pickle/alignment.pickle", "r") as a_file:
        alignment = pickle.load(a_file)

    # Run PWM_SCAN on the human FASTA, generate a GFF file
    # (the scan itself is disabled; an existing GFF file is expected)
    pwm = pwm_file.replace(".pwm", "")
    #os.popen("pwm_scan -f "+fasta+" -M "+pwm+" -p "+str(p)+" -s 1")
    #os.popen("pwm_scan -f "+fasta+" -M "+pwm+" -p "+str(p)+" -s 2")
    gff_path = fasta + "." + pwm.replace("../doc/pwm/", "") + ".gff"

    # For each hit in the GFF, throw out hits that span gaps, build motifs.
    # The GFF handle is closed as soon as the motifs have been built.
    with open(gff_path, "r") as gff:
        motifs = buildMotifs(gff, alignment)
    if len(motifs) == 0:
        print("NO HITS")
        sys.exit(0)

    # All 16 "parent->child" substitutions along one tree edge
    alphabet = [ "A->A", "A->C", "A->G", "A->T",
                 "C->A", "C->C", "C->G", "C->T",
                 "G->A", "G->C", "G->G", "G->T",
                 "T->A", "T->C", "T->G", "T->T" ]

    # Create the CM mega-dictionary: one zeroed 16x16 table per pair of
    # adjacent positions, keyed "X->Y/U->V"
    n_pairs = len(motifs[0][0]) - 1
    mega = []
    for _ in range(n_pairs):
        cmuts = dict()
        for first in alphabet:
            for second in alphabet:
                cmuts[first + "/" + second] = 0
        mega.append(cmuts)

    # Create the tree structure
    supertree = Tree([])
    root = TreeNode("r")
    rodent = TreeNode("")
    primate = TreeNode("")
    highermammal = TreeNode("")
    mouse = TreeNode("3")
    rat = TreeNode("4")
    human = TreeNode("1")
    chimp = TreeNode("2")
    dog = TreeNode("5")
    root.children = [ rodent, highermammal ]
    rodent.children = [ mouse, rat ]
    highermammal.children = [ dog, primate ]
    primate.children = [ human, chimp ]
    supertree.r = root
    # NOTE(review): dog is reachable through highermammal.children but is
    # not listed here -- confirm against Tree/color whether that is intended.
    supertree.node = [ root, highermammal, rodent, mouse,
                       rat, primate, human, chimp ]

    # For each motif:
    for m in motifs:

        # Check the motif: skip it when its sequences differ in length or
        # any of them contains an "N"
        if any(len(subm) != len(m[0]) or "N" in subm for subm in m):
            continue

        # Write the motif to the scratch file read by the (disabled)
        # estimator step
        with open("cm_scan.tmp", "w") as f:
            writeMotif(m, f)

        # Run the Ptree estimator to get a tree structure
        #os.popen("./estimator.sh")

        # Load the tree structure from the Estimator output
        #supertree = loadTree("cm_scan.tree")

        # For each position in the hit, color a new tree
        trees = []
        for position in range(len(m[0])):
            submotif = [seq[position] for seq in m]
            trees.append(color(supertree, submotif))

        # For every pair of adjacent trees, count CMs
        for i in range(len(m[0]) - 1):
            countCMs(trees[i].r, trees[i + 1].r, mega[i])

    # Do a chi-squared analysis on each position matrix
    print("DEGREES OF FREEDOM: %d" % (15 * 15))
    for i in range(n_pairs):
        matrix = zeros((len(alphabet), len(alphabet)))
        for j in range(len(alphabet)):
            for k in range(len(alphabet)):
                matrix[j, k] = mega[i][alphabet[j] + "/" + alphabet[k]]
        # Joined with single spaces to reproduce the output of the former
        # multi-item print statement exactly
        print(" ".join([ "PVALUE at pos (", str(i), ", ", str(i + 1), ")",
                         str(chisquarematrix.chisquare(matrix)) ]))
Example #2
0
# Tree edges as (table key, parent index, child index); the indices refer to
# the tuple returned by _cm_nodes().  The order matches the order in which the
# per-edge probabilities are multiplied together when scoring.
_CM_EDGES = (
    ("root->rodent", 0, 1),
    ("root->highermammal", 0, 2),
    ("rodent->mouse", 1, 3),
    ("rodent->rat", 1, 4),
    ("highermammal->dog", 2, 5),
    ("highermammal->primate", 2, 6),
    ("primate->chimp", 6, 8),
    ("primate->human", 6, 7),
)


def _cm_new_tables(n_positions):
    """Return freshly zeroed (uberlist, interlist, rlist, blist) tables."""
    singles = [a + "->" + b for a in "ACGT" for b in "ACGT"]
    doubles = [s + "/" + t for s in singles for t in singles]
    root_pairs = [a + "|" + b for a in "ACGT" for b in "ACGT"]
    edge_names = [edge[0] for edge in _CM_EDGES]
    # Per position: edge -> {"X->Y": count}
    uberlist = [dict((e, dict.fromkeys(singles, 0)) for e in edge_names)
                for _ in range(n_positions)]
    # Per adjacent pair: edge -> {"X->Y/U->V": count}
    interlist = [dict((e, dict.fromkeys(doubles, 0)) for e in edge_names)
                 for _ in range(n_positions - 1)]
    # Per adjacent pair: {"X|Y": count} of the two root colors
    rlist = [dict.fromkeys(root_pairs, 0) for _ in range(n_positions - 1)]
    # Root color counts pooled over all positions
    blist = dict.fromkeys("ATCG", 0)
    return uberlist, interlist, rlist, blist


def _cm_is_usable(m):
    """Return True when all sequences of m share one length and have no N."""
    return not any(len(subm) != len(m[0]) or "N" in subm for subm in m)


def _cm_color_columns(supertree, m):
    """Return one colored tree per alignment column of motif m."""
    return [color(supertree, [seq[position] for seq in m])
            for position in range(len(m[0]))]


def _cm_nodes(tree):
    """Return (root, rodent, highermammal, mouse, rat, dog, primate, human,
    chimp) taken from the child order of a colored tree."""
    roo = tree.r
    rod = roo.children[0]
    hig = roo.children[1]
    pri = hig.children[1]
    return (roo, rod, hig, rod.children[0], rod.children[1],
            hig.children[0], pri, pri.children[0], pri.children[1])


def _cm_normalize(table):
    """Turn the counts of a flat table into fractions, in place."""
    total = sum(table.values())
    if total == 0:
        # Nothing was counted (e.g. motifs of width 1); leave the zeros
        # instead of dividing by zero.
        return
    for key in table:
        table[key] = float(table[key]) / total


def _cm_marginal(table, child_color):
    """Sum a single-edge table over all parent colors for one child color."""
    return sum(table[parent + "->" + child_color] for parent in "AGCT")


def main(args):
    """Score adjacent motif positions by repeated random train/test splits.

    Loads the pickled alignment, builds motifs from the GFF file that
    belongs to the given PWM, and then repeats 200 times: split the usable
    motifs at random into a training and a test set, estimate per-edge
    substitution frequencies from the training set, and add up, for every
    pair of adjacent positions, -log(p_t2 / p_t1) over the test set.  The
    totals are printed as "<pos> <pos+1> <sum>" lines.

    args -- argv-style list; args[1] is the PWM file name.

    Calls sys.exit(1) on a usage error and sys.exit(0) in every other case
    (no hits, fewer than two usable hits, or normal completion).
    """

    # Parse arguments for PWM filename
    if len(args) != 2:
        print("Usage: ./cm_scan.py PWM_FILE")
        sys.exit(1)
    pwm_file = args[1]

    # Hard-coded values
    p = -12.0  # only referenced by the disabled pwm_scan commands below
    fasta = "../doc/fasta/human1ku-corrected.fasta"

    # Load the 5-way alignment.  The mode stays "r" to match however the
    # pickle was written.
    # NOTE(review): pickle.load() must only be used on trusted files.
    with open("../doc/pickle/alignment.pickle", "r") as a_file:
        alignment = pickle.load(a_file)

    # Run PWM_SCAN on the human FASTA, generate a GFF file
    # (the scan itself is disabled; an existing GFF file is expected)
    pwm = pwm_file.replace(".pwm", "")
    #os.popen("pwm_scan -f "+fasta+" -M "+pwm+" -p "+str(p)+" -s 1")
    #os.popen("pwm_scan -f "+fasta+" -M "+pwm+" -p "+str(p)+" -s 2")
    gff_path = fasta + "." + pwm.replace("../doc/pwm/", "") + ".gff"

    # For each hit in the GFF, throw out hits that span gaps, build motifs.
    # The GFF handle is closed as soon as the motifs have been built.
    with open(gff_path, "r") as gff:
        motifs = buildMotifs(gff, alignment)
    if len(motifs) == 0:
        print("NO HITS")
        sys.exit(0)

    # Create the tree structure
    supertree = Tree([])
    root = TreeNode("r")
    rodent = TreeNode("")
    primate = TreeNode("")
    highermammal = TreeNode("")
    mouse = TreeNode("3")
    rat = TreeNode("4")
    human = TreeNode("1")
    chimp = TreeNode("2")
    dog = TreeNode("5")
    root.children = [ rodent, highermammal ]
    rodent.children = [ mouse, rat ]
    highermammal.children = [ dog, primate ]
    primate.children = [ human, chimp ]
    supertree.r = root
    # NOTE(review): dog is reachable through highermammal.children but is
    # not listed here -- confirm against Tree/color whether that is intended.
    supertree.node = [ root, highermammal, rodent, mouse,
                       rat, primate, human, chimp ]

    # Drop unusable motifs once, up front.  A split needs at least one motif
    # on each side; with fewer than two the old retry loop never terminated.
    usable = [m for m in motifs if _cm_is_usable(m)]
    if len(usable) < 2:
        print("NOT ENOUGH USABLE HITS")
        sys.exit(0)

    n_positions = len(motifs[0][0])
    n_pairs = n_positions - 1

    # Running totals per adjacent pair.  These must start at zero; seeding
    # them with range(n) offset every reported total by its own index.
    logs = [0] * n_pairs

    for _ in range(200):

        # Shuffle first, then cut once, so that training and test sets are
        # disjoint, non-empty and together cover every usable motif.
        random.shuffle(usable)
        cut = random.randint(1, len(usable) - 1)
        training = usable[:cut]
        test = usable[cut:]

        # Start every round from zeroed counts; otherwise raw counts would
        # be added on top of the previous round's normalised frequencies.
        uberlist, interlist, rlist, blist = _cm_new_tables(n_positions)

        # For each training instance:
        for m in training:

            # Write the motif to the scratch file read by the (disabled)
            # estimator step
            with open("cm_scan.tmp", "w") as f:
                writeMotif(m, f)

            # Run the Ptree estimator to get a tree structure
            #os.popen("./estimator.sh")
            # Load the tree structure from the Estimator output
            #supertree = loadTree("cm_scan.tree")

            # For each position in the hit, color a new tree
            trees = _cm_color_columns(supertree, m)

            # For every pair of adjacent trees, update the ubers/inters
            for i in range(len(m[0]) - 1):
                n1 = _cm_nodes(trees[i])
                n2 = _cm_nodes(trees[i + 1])
                for name, pa, ch in _CM_EDGES:
                    first = n1[pa].color + "->" + n1[ch].color
                    second = n2[pa].color + "->" + n2[ch].color
                    # Edge counts for the first position of the pair
                    uberlist[i][name][first] += 1
                    # Joint edge counts for both positions of the pair
                    interlist[i][name][first + "/" + second] += 1
                # Root colors of the pair, and the pooled root background
                rlist[i][n1[0].color + "|" + n2[0].color] += 1
                blist[n1[0].color] += 1

        # Now turn those counts into ratios (each table sums to 1)
        for i in range(n_pairs):
            for name, _pa, _ch in _CM_EDGES:
                _cm_normalize(uberlist[i][name])
                _cm_normalize(interlist[i][name])
            _cm_normalize(rlist[i])
        _cm_normalize(blist)

        # Now that we have estimated parameters, calculate the log-likelihood
        # ratio of each tree pair in each test instance
        ratios = [0] * n_pairs
        for m in test:

            # For each position in the hit, color a new tree
            trees = _cm_color_columns(supertree, m)

            for i in range(len(m[0]) - 1):
                n1 = _cm_nodes(trees[i])
                n2 = _cm_nodes(trees[i + 1])

                # Probability of seeing t1: root background times, for each
                # edge, the frequency of the child color summed over all
                # parent colors.
                # NOTE(review): this ignores the actual parent color of
                # each edge -- confirm that the marginal is what is wanted.
                p_t1 = blist[n1[0].color]
                for name, _pa, ch in _CM_EDGES:
                    p_t1 = p_t1 * _cm_marginal(uberlist[i][name],
                                               n1[ch].color)

                # Probability of seeing t2 together with t1: root-pair
                # frequency times getprobs() of the joint table per edge.
                # NOTE(review): rlist holds normalised pair frequencies, not
                # conditional probabilities as its label suggests.
                p_t2 = rlist[i][n1[0].color + "|" + n2[0].color]
                for name, _pa, ch in _CM_EDGES:
                    p_t2 = p_t2 * getprobs(interlist[i][name],
                                           n1[ch].color, n2[ch].color)

                # Compute the log-likelihood ratio; pairs with a zero
                # probability contribute nothing
                if p_t1 != 0 and p_t2 != 0:
                    ratios[i] -= math.log(p_t2 / p_t1)

        for i in range(n_pairs):
            logs[i] += ratios[i]

    # Print the log-sums
    for i in range(n_pairs):
        print("%d %d %s" % (i + 1, i + 2, logs[i]))

    # The chi-squared analysis is not part of this variant: the code that
    # used to follow was unreachable and read a table that was never filled.
    sys.exit(0)