Example 1
def tamo2tf(TAMO_file):
    '''Converts TAMO files to the TRANSFAC format
    '''

    ml = MotifTools.txt2motifs(TAMO_file)
    TAMO_file_name = TAMO_file.split("/")[-1]
    ACGT = ["A", "C", "G", "T"]
    n = 1
    oup = open("%s.tf" % (TAMO_file), "w")
    for m in ml:
        if m.source == "":
            oup.write("DE\t%s_%s\t%s_%s\n" %
                      (TAMO_file_name, n, TAMO_file_name, n))
        else:
            oup.write("DE\t%s\t%s\n" % (m.source, m.source))
        count = 0
        #print m.source
        for i in range(m.width):
            oup.write("%s\t" % count)
            for letter in ACGT:
                if m.logP:
                    Pij = pow(2.0, m.logP[i][letter])
                    oup.write("%s\t" % int(Pij * 100))
            oup.write("\n")
            count += 1
        oup.write("XX\n")
        n += 1
    oup.close()
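A minimal usage sketch for the converter above; the input path is a placeholder, and the module-level imports that tamo2tf() relies on (MotifTools from the TAMO toolkit) are assumed to be in place:

# Convert a TAMO-format motif file (placeholder name) to TRANSFAC format.
# This writes motifs.tm.tf next to the input, with one DE/XX block per motif
# and the position frequencies scaled to percentages.
tamo2tf("motifs.tm")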
Example 2
def parse_opts():
    global GLOBALS
    global DFUNC, DMAX
    short_opts = 'm:d:'
    long_opts  = ['dfunc=']
    try:   opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError, e:
        print e
        usage()
    if not opts: usage()

    GLOBALS['args'] = args
    GLOBALS['motifs'] = []
    DFUNCtxt = None
    for opt,value in opts:
        if opt == '-m':                   GLOBALS['motifs'] = MotifTools.txt2motifs(value)
        if opt == '--dfunc':              DFUNCtxt = value
        if opt == '-d':                   DMAX     = float(value)

    # Deal with DFUNC and DMAX
    _DFUNC = None
    if DFUNCtxt == 'NCB':
        _DFUNC = MotifCompare.negcommonbits
    elif DFUNCtxt:
        try:
            _DFUNC = getattr(MotifCompare, DFUNCtxt)
        except AttributeError:
            usage("No such distance metric: %s"%DFUNCtxt)
    if _DFUNC:  set_dfunc(_DFUNC,DMAX)
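As a small illustration of the name-based metric lookup performed above; the import path is an assumption, and 'pccrange' is a --dfunc value that appears elsewhere in these examples:

from TAMO.Clustering import MotifCompare   # assumed import path for the TAMO toolkit

name = 'pccrange'               # value a user might pass via --dfunc
# Resolve the metric by attribute name; a missing name raises AttributeError,
# which parse_opts turns into a usage() message.
dfunc = getattr(MotifCompare, name)
print dfunc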
Example 3
def pick_chunk_score(wdir, TAMO_file, target, genome):
    '''Trims the motifs in a cluster and writes the top motif to a file.

    This function takes the TAMO file for the motifs in a single cluster. It
    trims the low-information ends from each motif and then identifies the
    motif that is most significantly represented in the target genes of your
    genome. If no motif is significantly represented, a blank top-motif file
    is created.
    '''
    os.system("cd %s" % wdir)
    os.chdir(wdir)

    script_dir = '/'.join(os.path.abspath(__file__).split('/')
                          [:-1])  # path to pcc_merge_CC.py script

    ##
    # step 1: trim the TAMO file to eliminate low-information flanking sequence
    trim_motif(TAMO_file, 0.1)

    ##
    # step 2: "Group Specificity Score" from the Church lab
    # python MotifMetrics.py [Genes of interest] -genome [FASTA of promoter sequence] -t [Trimmed TAMO of cluster motifs]
    # MotifMetrics.py checks whether the motifs appear disproportionately in the
    # target genes compared to the rest of the genes.
    os.system(
        "python %s/MotifMetrics.py %s -genome %s -t %s_0.1.trim -spec > %s_0.1.trim_Cout"
        % (script_dir, target, genome, TAMO_file, TAMO_file))

    ##
    # Gets the motif that is most significantly represented in your target genes.
    # Returns "None" if no motif has a p-value below 0.001.
    topm = parse_out_pcs("%s_0.1.trim_Cout" % TAMO_file)
    print "topm", topm

    ##
    # Writes the top motif to its own directory.
    if topm != "None":

        newdic = {}
        ml = MotifTools.txt2motifs("%s_0.1.trim" % TAMO_file)

        for m in ml:

            if m.oneletter == topm:
                newdic[m.oneletter] = m

        save_motifs(newdic.values(), "%s.TOP" % TAMO_file)
        os.system("rm %s_0.1.trim" % TAMO_file)
        os.system("rm %s_0.1.trim_Cout" % TAMO_file)

    ##
    # Writes a blank document if there was no top motif.
    else:
        oup = open("%s.TOP" % TAMO_file, "w")
        oup.close()
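A hedged usage sketch for pick_chunk_score; the working directory, cluster TAMO file, target gene list, and genome FASTA below are placeholders, not values from the original code:

# Trims the cluster motifs, scores them against the target genes, and writes
# cluster_5.tm.TOP (an empty file if no motif is significantly enriched).
pick_chunk_score("/tmp/merge_run",     # working directory (placeholder)
                 "cluster_5.tm",       # TAMO file for one cluster (placeholder)
                 "target_genes.txt",   # genes of interest (placeholder)
                 "promoters.fa")       # promoter FASTA (placeholder)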
Example 4
def TAMO_split(TAMO_file, motifs_per_file=190):
    '''Splits a TAMO file into smaller files for create_cc.'''
    ml = MotifTools.txt2motifs(TAMO_file)
    total = len(ml) / int(motifs_per_file)  # Number of full-sized chunks; a final file holds the remainder
    by = motifs_per_file
    for i in range(total):
        print i
        print i * by + by, TAMO_file + '_n%s' % i
        save_motifs(ml[i * by:i * by + by], TAMO_file + '_n%s' % i)
    print total * by, len(ml), TAMO_file + '_n%s' % (total)
    save_motifs(ml[total * by:len(ml)], TAMO_file + '_n%s' % (total))
    return (total)
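A worked example of the chunking arithmetic above; the motif counts are hypothetical:

motifs_total, per_file = 400, 190        # hypothetical counts
full_chunks = motifs_total / per_file    # = 2 with Python 2 integer division
remainder = motifs_total - full_chunks * per_file
# TAMO_split would write files _n0 and _n1 with 190 motifs each, a final
# file _n2 with the remaining 20 motifs, and return 2.
print full_chunks, remainder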
Example 5
def combine_distance_matrix_for_2(wdir, TAMO_file_1, TAMO_file_2):
    '''Combines matrices made from two TAMO files.
    
    This script is used to create the final matrix after all jobs from 
    create_cc_for_2 are complete.
    '''

    ml_1 = MotifTools.txt2motifs(TAMO_file_1)
    ml_2 = MotifTools.txt2motifs(TAMO_file_2)

    n_split_1 = len(ml_1) / 100
    n_split_2 = len(ml_2) / 100

    print n_split_1, len(ml_1)
    print n_split_2

    # Change to the working directory.
    os.system("cd %s" % wdir)
    os.chdir(wdir)

    # This loop pastes together the sub-matrices
    for i in range(n_split_1 + 1):
        com = "paste "
        for j in range(n_split_2 + 1):
            com += "%s_n%s-%s_n%s.dm " % (TAMO_file_1, i, TAMO_file_2, j)
        com += "> distance_%s" % i
        print com
        os.system(com)

    #
    com = "cat "
    for i in range(n_split_1 + 1):
        com += "distance_%s " % i
    com += "> %s-%s.dm" % (TAMO_file_1, TAMO_file_2)

    print com
    os.system(com)
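For reference, a small sketch of the sub-matrix file names the function above would paste together; the file names and motif counts are illustrative, not taken from the original code:

# With 250 motifs in A.tm and 120 in B.tm, n_split_1 = 2 and n_split_2 = 1,
# so the pieces pasted for each row block i are A.tm_n<i>-B.tm_n<j>.dm, and
# the concatenated result is written to A.tm-B.tm.dm.
for i in range(2 + 1):
    for j in range(1 + 1):
        print "A.tm_n%s-B.tm_n%s.dm" % (i, j)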
Example 6
def parse_opts():
    global GLOBALS
    short_opts = 'm:g:'
    long_opts  = ['genome=','top=']
    try:   opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError, e:
        print e
        usage()
    if not opts: usage()

    GLOBALS['args'] = args
    for opt,value in opts:
        if opt == '-m':                GLOBALS['motifs']     = MotifTools.txt2motifs(value)
        if opt in ['-g', '--genome']:  GLOBALS['genomefile'] = value
        if opt == '--top':             GLOBALS['top']        = int(value)
Example 7
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:i:", ["help", "output="])  # AD added 'i'
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)
        

    print "#" + ' '.join(sys.argv)
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.75) # AD changed thresh val to 0.75 from 0.7
    ambigs = []

    scale   = 50.0 / 1000.0
    
    motifs = []
    for opt, value in opts:
        #print opt, value
        if   opt ==  '-f':  fastafile = value
        elif opt ==  '-m':  motifs.extend(MotifTools.txt2motifs(value))
        elif opt ==  '-n':  motifnums = [int(x) for x in value.split(',')]
        elif opt ==  '-L':  labels    = list(value)
        elif opt ==  '-t':  thresh    = float(value)
        elif opt ==  '-a':  ambigs.extend(value.split(','))
        elif opt ==  '-S':  scale     = float(value)
        elif opt ==  '-i':  motiffile = value  # AD added this option to actually supply the TAMO motif file at the command line. The code to deal with motif files already existed; there was just no way for the user to supply one.
        
    probes = Fasta.load(fastafile)
    
    if motiffile:
        for f in motiffile.split(','):      # AD added this to allow supplying multiple tamo files at the prompt like you can supply multiple motifs
            motifs.extend(MotifTools.load(f))
    if ambigs:
        for ambig in ambigs:
            motifs.append( MotifTools.Motif_from_text(ambig,0.1) )
    if not motifnums:  motifnums = range(len(motifs))
    print '# %d: %s'%(len(motifs),motifnums)
    for i in range(len(motifnums)):
        motif = motifs[motifnums[i]]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d'%i
        print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh)

    probehits = {}
    for key in probes.keys():
        hits_by_motif = []
        save_flag     = 0
        if re.search('[BDHU]',probes[key]): continue
        for num in motifnums:
            result = motifs[num].scan(probes[key],thresh*motifs[num].maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key]=hits_by_motif

    #scale   = .1
    maxw = 40
    for key in probehits.keys():
        l       = len(probes[key])
        a       = list('-'* int(scale*l) )
        a.extend( list(' '*10 ) )
        desc    = []
        matches = probehits[key]
        for i in range(len(matches)):
            if matches[i]:
                subseqs,endpoints,scores = matches[i]
                for idx in range(len(subseqs)):
                    start,stop = endpoints[idx]
                    subseq     = subseqs[idx]
                    score      = scores[idx]
                    if labels and (i<len(labels)): ID = labels[i]
                    else                         : ID = '%d'%i
                    desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score))
                    start = int(start*scale)
                    for offset in range(10):
                        if a[start+offset] == '-':
                            if labels and (i < len(labels)):
                                a[start+offset] = labels[i]
                            else:
                                a[start+offset] = '%d'%i
                            break
        print '%-14s %s'%(key,''.join(a)),
        print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc])
        
    print
    print "Found matches in %d of %d input probes"%(len(probehits),len(probes))
Example 8
def merge_runs_cc(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''This script is used to merge motifs using the PCC matrix of all motifs.
    
    The script was originally written by Cheng Zou, and then converted to a 
    function by Alex Seddon.
    '''

    print "Here are the parameters you specified in this run "
    print "-tamo        %s" % TAMO_file
    print "-wdir        %s" % wdir
    print "-h        height to cut the tree, %s" % height
    print "-ancestor    %s" % ancestor
    print "-target    %s" % target
    print "-genome    %s" % genome

    if TAMO_file == '' or wdir == '':
        help()

    os.system("cd %s" % wdir)

    os.chdir(wdir)

    # Get the directory where the script is located.
    script_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])

    # This code was in the original clustering script. It has been taken out
    # because the processes involved take too long and have been replaced by
    # the matrix creation scripts and the run_UPGMA script.
    #if distance==0:
    #    os.system("python /mnt/home/seddonal/gil scottscripts/5_motif_merging/3.calculate_distance_matrix.py   -i %s --dfunc pccrange" % TAMO_file)
    #os.system("R --vanilla --slave --args %s.dm  %s< /mnt/home/seddonal/scripts/5_motif_merging/UPGMA_final.R> %s.Rout" % (TAMO_file,height,TAMO_file))

    cl_dic = {}
    n = 0

    # The file, TAMO_file.dm_UPGMA_Cl_0.05, is in the order of the motifs that
    # appear in the TAMO_file. If two motifs have the same number, they are
    # considered part of the same cluster.
    # This loop pulls the clustering information out of this file and creates
    # the dictionary cl_dic = {cluster_index:{motif_index:'1'}}
    for line in open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r"):

        # Gets the cluster index of this motif
        cl = line.strip()

        # Adds the cluster index if it has not been seen yet
        if not cl_dic.has_key(cl):
            cl_dic[cl] = {}

        cl_dic[cl][n] = "1"  # Adds the motif to that cluster
        n += 1  # Increases the motif index for the next motif

    #print cl_dic

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    # I think I can divide up this portion of the code to create a series of jobs
    print ancestor, ancestor == 0

    cc_output = open('merge_runs_cc', 'w')

    if ancestor == 0:

        # This loop looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():

            print i, cl_dic[i]

            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:

                # Adds all of the motifs in the cluster to an object called
                # mlist.
                mlist = []
                for j in cl_dic[i]:
                    mlist.append(ml[j])

                # Saves these motifs to their own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))

                cc_output.write(
                    'module load TAMO; python %s/pcc_merge_CC.py merge_runs_no_ancestor -t %s/%s -i %s -target %s -genome %s\n'
                    % (script_dir, wdir, TAMO_file, i, target, genome))

            # If there is only one motif in the cluster, it leaves it alone
            # and adds it to old.
            else:
                key = cl_dic[i].keys()[0]
                old.append(ml[key])

    if ancestor == 1:

        # This loop looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():

            print i, cl_dic[i]

            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:

                # Adds all of the motifs in the cluster to an object called
                # mlist.
                mlist = []
                for j in cl_dic[i]:
                    mlist.append(ml[j])

                # Saves these motifs to their own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))

                cc_output.write(
                    'module load TAMO; module load STAMPmotif; python %s/pcc_merge_CC.py merge_runs_ancestor -t %s/%s -i %s -target %s -genome %s\n'
                    % (script_dir, wdir, TAMO_file, i, target, genome))

            else:
                key = cl_dic[i].keys()[0]
                old.append(ml[key])

    # Combine together the motifs that are each in their own cluster.
    #os.system("cat %s_sub_*_sum.tm.tf.tm.TOP > %s_sub_new.tm" % (TAMO_file,TAMO_file))
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
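merge_runs_cc only writes the per-cluster commands to the file merge_runs_cc; dispatching them is left to the caller. A minimal, hedged way to run them serially (on a real cluster they would more likely be submitted as separate jobs) could be:

import os

# Execute, one by one, each per-cluster merge command written by merge_runs_cc.
for line in open("merge_runs_cc"):
    cmd = line.strip()
    if cmd:
        os.system(cmd)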
Example 9
def merge_runs(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''This script is used to merge motifs using the PCC matrix of all motifs.
    
    The script was originally written by Cheng Zou, and then converted to a 
    function by Alex Seddon.
    '''

    print "Here are the parameters you specified in this run "
    print "-tamo        %s" % TAMO_file
    print "-wdir        %s" % wdir
    print "-h        height to cut the tree, %s" % height
    print "-distance    %s" % distance
    print "-ancestor    %s" % ancestor
    print "-target    %s" % target
    print "-genome    %s" % genome

    if TAMO_file == '' or wdir == '':
        help()

    os.system("cd %s" % wdir)

    os.chdir(wdir)

    # This code was in the original clustering script. It has been taken out
    # because the processes involved take too long and have been replaced by
    # the matrix creation scripts and the run_UPGMA script.
    #if distance==0:
    #    os.system("python /mnt/home/seddonal/scripts/5_motif_merging/3.calculate_distance_matrix.py   -i %s --dfunc pccrange" % TAMO_file)
    #os.system("R --vanilla --slave --args %s.dm  %s< /mnt/home/seddonal/scripts/5_motif_merging/UPGMA_final.R> %s.Rout" % (TAMO_file,height,TAMO_file))

    cl_dic = {}
    n = 0

    # The file, TAMO_file.dm_UPGMA_Cl_0.05, is in the order of the motifs that
    # appear in the TAMO_file. If two motifs have the same number, they are
    # considered part of the same cluster.
    # This loop pulls the clustering information out of this file and creates
    # the dictionary cl_dic = {cluster_index:{motif_index:'1'}}
    for line in open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r"):

        # Gets the cluster index of this motif
        cl = line.strip()

        # Adds the cluster index if it has not been seen yet
        if not cl_dic.has_key(cl):
            cl_dic[cl] = {}

        cl_dic[cl][n] = "1"  # Adds the motif to that cluster
        n += 1  # Increases the motif index for the next motif

    #print cl_dic

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    # I think I can divide up this portion of the code to create a series of jobs
    print ancestor, ancestor == 0
    if ancestor == 0:

        # This loop looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():

            print i, cl_dic[i]

            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:

                # Adds all of the motifs in the cluster to an object called
                # mlist.
                mlist = []
                for j in cl_dic[i]:
                    mlist.append(ml[j])

                # Saves these motifs to their own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))

                # I am fairly certain that this process of converting to TF and
                # then returning it to TAMO format is only for keeping the names
                # consistent. I need to verify this suspicion
                tamo2tf("%s_sub_%s.tm" % (TAMO_file, i))
                os.system("cat  %s_sub_%s.tm.tf > %s_sub_%s_sum.tm.tf" %
                          (TAMO_file, i, TAMO_file, i))
                tf2tamo("%s_sub_%s_sum.tm.tf" % (TAMO_file, i))

                # Gets the top motif in the cluster.
                pick_chunk_score(wdir,
                                 '%s_sub_%s_sum.tm.tf.tm' % (TAMO_file, i),
                                 target, genome)

                # Removes the files that were created.
                os.system("rm  %s_sub_%s_sum.tm.tf.tm" % (TAMO_file, i))
                os.system("rm %s_sub_%s_sum.tm.tf" % (TAMO_file, i))
                os.system("rm -R %s_sub_%s.tm.tf_ST*" % (TAMO_file, i))

            # If there is only one motif in the cluster, it leaves it alone
            # and adds it to old.
            else:
                key = cl_dic[i].keys()[0]
                old.append(ml[key])

    if ancestor == 1:

        # This loop looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():

            print i, cl_dic[i]

            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:

                # Adds all of the motifs in the cluster to an object called
                # mlist.
                mlist = []
                for j in cl_dic[i]:
                    mlist.append(ml[j])

                # Saves these motifs to their own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))

                # Merges the motifs in the same cluster using STAMP
                tamo2tf("%s_sub_%s.tm" % (TAMO_file, i))

                # Gets the JASPAR motifs that best match the motifs from within
                # the cluster.
                os.system(
                    "STAMP -tf  %s_sub_%s.tm.tf  -sd /home/chengzou/bin/STAMP/ScoreDists/JaspRand_PCC_SWU.scores  \
                 -go  1000 -ge 1000 -cc PCC -align SWU -out %s_sub_%s.tm.tf_STout -chp > %s_sub_%s.tm.tf_STout.log"
                    % (TAMO_file, i, TAMO_file, i, TAMO_file, i))
                parse_out_STAMP(TAMO_file, i)

                # Combines the JASPAR motifs with the cluster motifs and then
                # converts them all into one TAMO file.
                os.system(
                    "cat  %s_sub_%s.tm.tf %s_sub_%s.tm.tf_SToutFBP.txt.mod %s_sub_%s.tm.tf_STout_tree_clusters.txt > %s_sub_%s_sum.tm.tf"
                    % (TAMO_file, i, TAMO_file, i, TAMO_file, i, TAMO_file, i))
                tf2tamo("%s_sub_%s_sum.tm.tf" % (TAMO_file, i))

                # Gets the top motif within the TAMO file.
                pick_chunk_score(wdir,
                                 '%s_sub_%s_sum.tm.tf.tm' % (TAMO_file, i),
                                 target, genome)

                # Removes any files created in the processing.
                os.system("rm  %s_sub_%s_sum.tm.tf.tm" % (TAMO_file, i))
                os.system("rm %s_sub_%s_sum.tm.tf" % (TAMO_file, i))
                os.system("rm -R %s_sub_%s.tm.tf_ST*" % (TAMO_file, i))
            else:
                key = cl_dic[i].keys()[0]
                old.append(ml[key])

    # Combine together the top motifs from every cluster.
    os.system("cat %s_sub_*_sum.tm.tf.tm.TOP > %s_sub_new.tm" %
              (TAMO_file, TAMO_file))
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
    os.system("cat %s_sub_old.tm %s_sub_new.tm > %s_P1.tm" %
              (TAMO_file, TAMO_file, TAMO_file))
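A hedged call sketch for merge_runs; every argument below is a placeholder. The height 0.05 matches the UPGMA cut-off mentioned in the comments, and ancestor=0 selects the branch that merges without STAMP/JASPAR ancestors:

merge_runs("all_motifs.tm",     # TAMO file with all motifs (placeholder)
           "/tmp/merge_run",    # working directory (placeholder)
           0.05,                # height at which the UPGMA tree was cut
           1,                   # distance flag (only echoed; matrix creation is commented out)
           0,                   # ancestor=0: merge without STAMP/JASPAR ancestors
           "target_genes.txt",  # genes of interest (placeholder)
           "promoters.fa")      # promoter FASTA (placeholder)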
Example 10
def combine_distance_matrix(wdir, TAMO_file):
    '''Combines the PCC score matrices and outputs them as a single matrix.

    Originally written by Cheng Zou, and converted to a function by Alex Seddon.
    '''
    ml = MotifTools.txt2motifs(TAMO_file)
    n_split = len(ml) / 100
    ##
    # Change to the working directory.
    os.system("cd %s" % wdir)
    os.chdir(wdir)
    #
    ##
    ##
    # The following loop counts the number of lines in each of the
    # PCC matrices for a comparison of a TAMO file with itself.
    lendic = {}  # Dictionary with the lengths of the PCC matrices.
    for i in range(n_split + 1):
        lendic[i] = line_count("%s_n%s.dm" % (TAMO_file, i))
    print lendic
    #
    ##
    ##
    # This loop creates files filled with blanks. They are used to ensure that
    # the PCC-distance matrix is square, taking the place of the comparison
    # files that were never generated.
    for i in range(n_split + 1):
        for j in range(0, i):
            # Open the file to add blanks.
            oup = open("%s_n%s-%s_n%s.dm" % (TAMO_file, i, TAMO_file, j), "w")
            print lendic[j], lendic[i]
            blanks = []
            # Add a number of "-" entries equal to the number of lines in
            # the self-comparison file for chunk j.
            for y in range(lendic[j]):
                blanks.append("-")
            for x in range(lendic[i]):
                oup.write("%s\n" % "\t".join(blanks))
            oup.close()
    #
    ##

    ##
    # Creates a copy of the self comparison file so that it can be easily picked
    # out by the function.
    for i in range(n_split + 1):
        os.system("cp %s_n%s.dm %s_n%s-%s_n%s.dm" %
                  (TAMO_file, i, TAMO_file, i, TAMO_file, i))
    #
    ##

    ##
    # This loop pastes together the sub-matrices for each row block.
    for i in range(n_split + 1):
        com = "paste "
        for j in range(n_split + 1):
            com += "%s_n%s-%s_n%s.dm " % (TAMO_file, i, TAMO_file, j)
        com += "> distance_%s" % i
        print com
        os.system(com)

    com = "cat "
    for i in range(n_split + 1):
        com += "distance_%s " % i
    com += "> %s.dm" % TAMO_file

    print com
    # Concatenate all the matrices
    os.system(com)
    # My embarrassingly ad hoc way of removing double tabs
    remove_double_tabs("%s.dm" % TAMO_file)
Example 11
    threshold = math.pow(10, -float(sys.argv[2]))
    maxthreshold = float(sys.argv[3])  # for strong score, using 0.9*max score
    ATbias = float(sys.argv[4])  # 0.33
    GCbias = float(sys.argv[5])  # 0.17
    seq_file = sys.argv[6]  # FASTA file of the sequence
    tar_dir = ""  # Target directory for the output file
    #
    ##

    ##
    #
    for i in range(1, len(sys.argv)):
        if sys.argv[i] == "-d":
            tar_dir = sys.argv[i + 1].rstrip("/")
    print tar_dir
    ml = MotifTools.txt2motifs(file)
    n = 0
    new_list = []

    ##
    # Looks at each motif from the TAMO file. Uses the find function from
    # motility to find the sequences with that motif.
    for Ikey in range(len(ml)):
        #print m.ll

        time1 = time.time()

        m = ml[Ikey]  # Pull out the motif from the motif list.
        save_motifs([m], file + '_' + str(Ikey))  # Save the motif as a file.

        ##
Example 12
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:", ["help", "output="])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)
        

    print "#" + ' '.join(sys.argv)
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.7)
    ambigs = []

    scale   = 50.0 / 1000.0
    
    motifs = []
    for opt, value in opts:
        #print opt, value
        if   opt == '-f':  fastafile = value
        elif opt == '-m':  motifs.extend(MotifTools.txt2motifs(value))
        elif opt == '-n':  motifnums = [int(x) for x in value.split(',')]
        elif opt == '-L':  labels    = list(value)
        elif opt == '-t':  thresh    = float(value)
        elif opt == '-a':  ambigs.extend(value.split(','))
        elif opt == '-S':  scale     = float(value)
        
    probes = Fasta.load(fastafile)
    
    if motiffile:
        motifs.extend(TAMO.tamofile2motifs(motiffile))
    if ambigs:
        for ambig in ambigs:
            motifs.append( MotifTools.Motif_from_text(ambig,0.1) )
    if not motifnums:  motifnums = range(len(motifs))
    print '# %d: %s'%(len(motifs),motifnums)
    for i in range(len(motifnums)):
        motif = motifs[motifnums[i]]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d'%i
        print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh)

    probehits = {}
    for key in probes.keys():
        hits_by_motif = []
        save_flag     = 0
        if re.search('[BDHU]',probes[key]): continue
        for num in motifnums:
            result = motifs[num].scan(probes[key],thresh*motifs[num].maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key]=hits_by_motif

    #scale   = .1
    maxw = 40
    for key in probehits.keys():
        l       = len(probes[key])
        a       = list('-'* int(scale*l) )
        a.extend( list(' '*10 ) )
        desc    = []
        matches = probehits[key]
        for i in range(len(matches)):
            if matches[i]:
                subseqs,endpoints,scores = matches[i]
                for idx in range(len(subseqs)):
                    start,stop = endpoints[idx]
                    subseq     = subseqs[idx]
                    score      = scores[idx]
                    if labels and (i<len(labels)): ID = labels[i]
                    else                         : ID = '%d'%i
                    desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score))
                    start = int(start*scale)
                    for offset in range(10):
                        if a[start+offset] == '-':
                            if labels and (i < len(labels)):
                                a[start+offset] = labels[i]
                            else:
                                a[start+offset] = '%d'%i
                            break
        print '%-14s %s'%(key,''.join(a)),
        print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc])
        
    print
    print "Found matches in %d of %d input probes"%(len(probehits),len(probes))