Example #1
0
def pick_chunk_score(wdir, TAMO_file, target, genome):
    '''Trims the motifs of one cluster and saves its top motif.

    The motifs in TAMO_file are trimmed with trim_motif (cut 0.1), scored by
    running MotifMetrics.py (located next to this script) with the -spec
    option, and the motif reported by parse_out_pcs is written to
    "<TAMO_file>.TOP". If parse_out_pcs returns the string "None", an empty
    "<TAMO_file>.TOP" file is created instead.

    Parameters:
        wdir      -- working directory; the process changes into it.
        TAMO_file -- TAMO file holding the motifs of a single cluster.
        target    -- genes of interest, passed as the first positional
                     argument to MotifMetrics.py.
        genome    -- FASTA of promoter sequences, passed as -genome.

    Side effects: changes the current working directory to wdir and writes
    "<TAMO_file>_0.1.trim" and "<TAMO_file>_0.1.trim_Cout". Both intermediate
    files are deleted only when a top motif was found.
    '''
    import subprocess

    # Resolve the script directory BEFORE changing directory: when __file__
    # is a relative path, os.path.abspath() resolves it against the current
    # working directory, which would be wrong once we are inside wdir.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(wdir)

    trimmed_file = "%s_0.1.trim" % TAMO_file
    score_file = "%s_0.1.trim_Cout" % TAMO_file

    ##
    # step 1 trim tamo to eliminate low information flanking sequence
    trim_motif(TAMO_file, 0.1)

    ##
    # step 2 "Group Specificity Score" from the Church lab
    # python MotifMetrics.py [Genes of interest] -genome [FASTA of promoter sequence] -t [Trimmed TAMO of cluster motifs]
    # MotifMetrics.py checks if the motifs appear disproportionately in the
    # targets compared to the rest of the genes.
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; stdout is redirected into the score file.
    with open(score_file, "w") as score_out:
        subprocess.call(
            ["python", os.path.join(script_dir, "MotifMetrics.py"),
             target, "-genome", genome, "-t", trimmed_file, "-spec"],
            stdout=score_out)

    ##
    # Gets the motif that is most significantly represented in the target
    # genes. parse_out_pcs signals "no significant motif" with the string
    # "None" (not the None object).
    topm = parse_out_pcs(score_file)
    print("topm %s" % (topm,))

    if topm != "None":
        ##
        # Writes the top motif to its own file. When several motifs share
        # the same one-letter representation, the last one is kept.
        ml = MotifTools.txt2motifs(trimmed_file)
        matches = [m for m in ml if m.oneletter == topm]
        save_motifs(matches[-1:], "%s.TOP" % TAMO_file)
        os.remove(trimmed_file)
        os.remove(score_file)
    else:
        ##
        # Writes a blank document if there was no top motif.
        open("%s.TOP" % TAMO_file, "w").close()
Example #2
0
def trim_motif(TAMO_file, cut=0.4):
    '''Saves a trimmed copy of every motif in TAMO_file.

    Each motif loaded from TAMO_file is replaced by the result of its
    trimmed(cut) method (intended to drop low-information flanks), and the
    resulting list is saved to "<TAMO_file>_<cut>.trim".
    '''
    trimmed_path = TAMO_file + "_" + str(cut) + ".trim"
    trimmed_motifs = [motif.trimmed(cut) for motif in MotifTools.load(TAMO_file)]
    save_motifs(trimmed_motifs, trimmed_path)
Example #3
0
def TAMO_split(TAMO_file, motifs_per_file=190):
    '''Splits a TAMO file into smaller files for create_cc.

    The motifs are written in consecutive chunks of motifs_per_file motifs
    to "<TAMO_file>_n0", "<TAMO_file>_n1", ... The remainder is written to
    "<TAMO_file>_n<total>"; that last file is always created, even when the
    number of motifs is an exact multiple of motifs_per_file (it is then
    saved with an empty motif list).

    Parameters:
        TAMO_file       -- path of the TAMO file to split.
        motifs_per_file -- chunk size; anything int() accepts.

    Returns:
        The number of full chunks, which is also the index of the final
        (remainder) file.
    '''
    ml = MotifTools.txt2motifs(TAMO_file)

    # Coerce once so the chunk count and the slice bounds always agree,
    # e.g. when the value arrives as a string from the command line.
    by = int(motifs_per_file)
    total = len(ml) // by  # Number of full chunks

    for i in range(total):
        chunk_name = TAMO_file + '_n%s' % i
        print(i)
        print("%s %s" % (i * by + by, chunk_name))
        save_motifs(ml[i * by:i * by + by], chunk_name)

    # Whatever is left over (possibly nothing) goes into the last file.
    remainder_name = TAMO_file + '_n%s' % (total)
    print("%s %s %s" % (total * by, len(ml), remainder_name))
    save_motifs(ml[total * by:len(ml)], remainder_name)
    return (total)
Example #4
0
def tf2tamo(tf_file):
    '''Converts a TF-format motif file into a TAMO file.

    Every line starting with "DE" opens a record: its second
    whitespace-separated token is the motif name, and the following lines,
    up to (not including) the next line starting with "XX" or the end of
    the file, form the record body. Each record is turned into a motif with
    parse_block(name, block) and all motifs are saved to "<tf_file>.tm".
    '''
    motifs = []

    # The context manager guarantees the input file is closed, including
    # when parse_block raises.
    with open(tf_file, "r") as inp:
        line = inp.readline()
        while line:
            if not line.startswith("DE"):
                line = inp.readline()
                continue

            name = line.strip().split()[1]
            block = []
            line = inp.readline()
            # An empty string means end of file: stop collecting so that a
            # record without a closing "XX" line still gets parsed.
            while line and not line.startswith("XX"):
                block.append(line)
                line = inp.readline()
            motifs.append(parse_block(name, block))

    save_motifs(motifs, '%s.tm' % (tf_file))
Example #5
0
def _parse_count(token):
    '''Converts a text number to int when possible, otherwise to float.'''
    # Replaces eval(): same int-or-float result for numeric text, but text
    # read from a matrix file can no longer be executed as code.
    try:
        return int(token)
    except ValueError:
        return float(token)


# Converts every JASPAR matrix file in jasparFiles into a TAMO motif, then
# writes the motif list both as a pickle (pklFile) and as a TAMO file
# (tmoFile).
tamoMotifs = []

for jMat in jasparFiles:
    # One list of whitespace-separated tokens per line of the matrix file.
    with open(jMat, "rU") as jaspar_in:
        tempMat = [l.strip().split() for l in jaspar_in]

    # transpose matrix so that each entry holds the four values of one
    # position, taken in the order A, C, G, T.
    tempMat = transpose(mat(tempMat)).tolist()

    tempMat = [
        {
            "A": _parse_count(position[0]),
            "C": _parse_count(position[1]),
            "G": _parse_count(position[2]),
            "T": _parse_count(position[3]),
        }
        for position in tempMat
    ]
    jasparTAMO_Motif = Motif_from_counts(tempMat, bg=nucBack)
    jasparTAMO_Motif.sourceFile = jMat.split("/")[-1]
    tamoMotifs.append(jasparTAMO_Motif)

# print to file
# Binary mode is the portable way to write pickles; the context manager
# makes sure the data is flushed and the handle closed.
with open(pklFile, "wb") as pkl_out:
    cPickle.dump(tamoMotifs, pkl_out)
save_motifs(tamoMotifs, tmoFile, kmer_count=60)


print("Done.")
Example #6
0
def merge_runs_cc(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''Prepares per-cluster merge jobs from the UPGMA clustering of motifs.

    Reads "<TAMO_file>.dm_UPGMA_Cl_<height>", which holds one cluster label
    per line, in the same order as the motifs in TAMO_file. For every
    cluster with more than one motif, the member motifs are saved to
    "<TAMO_file>_sub_<cluster>.tm" and one command line that runs
    pcc_merge_CC.py on that cluster is appended to the file "merge_runs_cc"
    in wdir. Motifs that are alone in their cluster are saved together to
    "<TAMO_file>_sub_old.tm".

    The script was originally written by Cheng Zou, and then converted to a
    function by Alex Seddon.

    Parameters:
        TAMO_file -- TAMO file with all motifs.
        wdir      -- working directory; the process changes into it.
        height    -- tree-cut height, used only to build the cluster file
                     name.
        distance  -- unused here; kept for signature compatibility.
        ancestor  -- 0 writes merge_runs_no_ancestor commands, 1 writes
                     merge_runs_ancestor commands; with any other value no
                     cluster is processed.
        target    -- forwarded as -target in the generated commands.
        genome    -- forwarded as -genome in the generated commands.
    '''
    print("Here are the parameters you specified in this run ")
    print("-tamo        %s" % TAMO_file)
    print("-wdir        %s" % wdir)
    print("-h        height to cut the tree, %s" % height)
    print("-ancestor    %s" % ancestor)
    print("-target    %s" % target)
    print("-genome    %s" % genome)

    if TAMO_file == '' or wdir == '':
        # NOTE(review): help is not defined in this function; presumably a
        # module-level usage printer -- otherwise this is the builtin help().
        help()

    # Get the directory where the script is located. This must happen before
    # os.chdir(): a relative __file__ would otherwise be resolved against
    # wdir instead of the directory the script was started from.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(wdir)

    # The file, TAMO_file.dm_UPGMA_Cl_<height>, is in order of the motifs
    # that appear in the TAMO_file. If two motifs have the same label, they
    # are considered a part of the same cluster.
    # cl_dic = {cluster_label: [motif_index, ...]}, indices in file order.
    cl_dic = {}
    with open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r") as cluster_in:
        for n, line in enumerate(cluster_in):
            cl_dic.setdefault(line.strip(), []).append(n)

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    print("%s %s" % (ancestor, ancestor == 0))

    # The two modes differ only in the command line written per cluster.
    if ancestor == 0:
        command = 'module load TAMO; python %s/pcc_merge_CC.py merge_runs_no_ancestor -t %s/%s -i %s -target %s -genome %s\n'
    elif ancestor == 1:
        command = 'module load TAMO; module load STAMPmotif; python %s/pcc_merge_CC.py merge_runs_ancestor -t %s/%s -i %s -target %s -genome %s\n'
    else:
        command = None

    # The command file is created (possibly empty) in every case and is
    # closed as soon as all commands are written.
    with open('merge_runs_cc', 'w') as cc_output:
        if command is not None:
            for i in cl_dic:
                members = cl_dic[i]
                print("%s %s" % (i, members))

                if len(members) > 1:
                    # Saves the motifs of this cluster to their own TAMO
                    # file and queues the command that will merge them.
                    save_motifs([ml[j] for j in members],
                                "%s_sub_%s.tm" % (TAMO_file, i))
                    cc_output.write(
                        command
                        % (script_dir, wdir, TAMO_file, i, target, genome))
                else:
                    # A single-motif cluster is left alone.
                    old.append(ml[members[0]])

    # Combine together the motifs that are in their own cluster.
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
Example #7
0
def _merge_cluster(TAMO_file, wdir, cluster, motifs, ancestor, target, genome):
    '''Reduces the motifs of one multi-motif cluster to a single .TOP file.'''
    names = {"p": "%s_sub_%s" % (TAMO_file, cluster)}

    # Saves these motifs to their own TAMO file and converts it to TF format.
    save_motifs(motifs, "%(p)s.tm" % names)
    tamo2tf("%(p)s.tm" % names)

    if ancestor == 1:
        # Runs STAMP on the cluster motifs, then post-processes its output.
        os.system(
            "STAMP -tf  %(p)s.tm.tf  -sd /home/chengzou/bin/STAMP/ScoreDists/JaspRand_PCC_SWU.scores  "
            "-go  1000 -ge 1000 -cc PCC -align SWU -out %(p)s.tm.tf_STout -chp > %(p)s.tm.tf_STout.log"
            % names)
        parse_out_STAMP(TAMO_file, cluster)

        # Combines the cluster motifs with the STAMP results into one TF file.
        os.system(
            "cat  %(p)s.tm.tf %(p)s.tm.tf_SToutFBP.txt.mod %(p)s.tm.tf_STout_tree_clusters.txt > %(p)s_sum.tm.tf"
            % names)
    else:
        # Without ancestor search the summary file is a copy of the TF file.
        os.system("cat  %(p)s.tm.tf > %(p)s_sum.tm.tf" % names)

    # Back to TAMO format, then pick the top motif of the cluster.
    tf2tamo("%(p)s_sum.tm.tf" % names)
    pick_chunk_score(wdir, '%(p)s_sum.tm.tf.tm' % names, target, genome)

    # Removes the files that were created during processing.
    os.system("rm  %(p)s_sum.tm.tf.tm" % names)
    os.system("rm %(p)s_sum.tm.tf" % names)
    os.system("rm -R %(p)s.tm.tf_ST*" % names)


def merge_runs(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''Merges motifs cluster by cluster using the UPGMA clustering.

    Reads "<TAMO_file>.dm_UPGMA_Cl_<height>", which holds one cluster label
    per line, in the same order as the motifs in TAMO_file. Every cluster
    with more than one motif is reduced to its top motif through
    pick_chunk_score (with ancestor == 1, STAMP output is added to the
    candidates first). The resulting .TOP files are concatenated into
    "<TAMO_file>_sub_new.tm", motifs that are alone in their cluster are
    saved to "<TAMO_file>_sub_old.tm", and both are concatenated into
    "<TAMO_file>_P1.tm".

    The script was originally written by Cheng Zou, and then converted to a
    function by Alex Seddon.

    Parameters:
        TAMO_file -- TAMO file with all motifs.
        wdir      -- working directory; the process changes into it.
        height    -- tree-cut height, used only to build the cluster file
                     name.
        distance  -- only printed; not otherwise used here.
        ancestor  -- 0 merges without STAMP, 1 merges with STAMP; with any
                     other value no cluster is processed.
        target    -- forwarded to pick_chunk_score.
        genome    -- forwarded to pick_chunk_score.
    '''
    print("Here are the parameters you specified in this run ")
    print("-tamo        %s" % TAMO_file)
    print("-wdir        %s" % wdir)
    print("-h        height to cut the tree, %s" % height)
    print("-distance    %s" % distance)
    print("-ancestor    %s" % ancestor)
    print("-target    %s" % target)
    print("-genome    %s" % genome)

    if TAMO_file == '' or wdir == '':
        # NOTE(review): help is not defined in this function; presumably a
        # module-level usage printer -- otherwise this is the builtin help().
        help()

    # os.system("cd ...") only changed directory inside a throw-away shell;
    # os.chdir() is what actually moves this process.
    os.chdir(wdir)

    # The file, TAMO_file.dm_UPGMA_Cl_<height>, is in order of the motifs
    # that appear in the TAMO_file. If two motifs have the same label, they
    # are considered a part of the same cluster.
    # cl_dic = {cluster_label: [motif_index, ...]}, indices in file order.
    cl_dic = {}
    with open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r") as cluster_in:
        for n, line in enumerate(cluster_in):
            cl_dic.setdefault(line.strip(), []).append(n)

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    print("%s %s" % (ancestor, ancestor == 0))

    if ancestor in (0, 1):
        for i in cl_dic:
            members = cl_dic[i]
            print("%s %s" % (i, members))

            if len(members) > 1:
                # Multiple motifs in the cluster: merge them.
                _merge_cluster(TAMO_file, wdir, i,
                               [ml[j] for j in members],
                               ancestor, target, genome)
            else:
                # A single-motif cluster is left alone.
                old.append(ml[members[0]])

    # Combine together the top motifs from every merged cluster; the shell
    # is needed here to expand the wildcard.
    os.system("cat %s_sub_*_sum.tm.tf.tm.TOP > %s_sub_new.tm" %
              (TAMO_file, TAMO_file))
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
    os.system("cat %s_sub_old.tm %s_sub_new.tm > %s_P1.tm" %
              (TAMO_file, TAMO_file, TAMO_file))
            tar_dir = sys.argv[i + 1].rstrip("/")
    print tar_dir
    ml = MotifTools.txt2motifs(file)
    n = 0
    new_list = []

    ##
    # Looks at each motif from the TAMO file. Uses the find function from
    # motility to find the sequences with that motif.
    for Ikey in range(len(ml)):
        #print m.ll

        time1 = time.time()

        m = ml[Ikey]  # Pull out the motif from the motif list.
        save_motifs([m], file + '_' + str(Ikey))  # Save the motif as a file.

        ##
        # Create the PWM matrix from the TAMO motif.
        matrix = []
        for i in range(len(m.ll)):
            position = []
            for nt in ["A", "C", "G", "T"]:
                position.append(m.ll[i][nt])
            matrix.append(position)
        #print matrix
        pwm = motility.PWM(matrix)
        #
        ##

        ##