Ejemplo n.º 1
0
def _pop_flag_value(flag):
    """Remove ``flag`` and the token that follows it from ``sys.argv``.

    Returns the token that followed the flag.  Returns None when the flag is
    not present, or when it is the last token (the dangling flag is still
    removed in that case).
    """
    try:
        idx = sys.argv.index(flag)
    except ValueError:
        return None
    del sys.argv[idx]
    if idx >= len(sys.argv):
        return None
    return sys.argv.pop(idx)


def parse():
    """Consume the ``-genome``, ``-letter`` and ``-f`` options from sys.argv.

    Every recognised flag and its value are deleted from ``sys.argv``.  The
    module globals ``probefile``, ``letter`` and ``fsafile`` are assigned only
    when the matching flag carries a value; otherwise they are left untouched.
    ``-genome`` additionally builds ``PROBESET`` from the given file and sets
    its ``factor`` attribute to 0.7.
    """
    global probefile, PROBESET, letter, fsafile

    value = _pop_flag_value('-genome')
    if value is not None:
        probefile = value
        try:
            PROBESET = MotifMetrics.ProbeSet(probefile)
            PROBESET.factor = 0.7
        except Exception as err:
            # Parsing stays best-effort, but a genome that cannot be loaded
            # is reported instead of being silently ignored.
            sys.stderr.write('# Warning: could not load genome %s: %s\n'
                             % (probefile, err))

    value = _pop_flag_value('-letter')
    if value is not None:
        letter = value

    value = _pop_flag_value('-f')
    if value is not None:
        fsafile = value
Ejemplo n.º 2
0
def parse():
    """Consume the ``-genome <file>`` option from sys.argv.

    When the flag is present it is removed from ``sys.argv`` together with
    its value, the value is stored in the global ``probefile``, and the
    global ``PROBESET`` is built from that file with ``factor`` set to 0.65.
    When the flag is absent nothing changes; when it is the last token only
    the dangling flag is removed.
    """
    global probefile, PROBESET
    try:
        idx = sys.argv.index('-genome')
    except ValueError:
        # Flag not given: leave the globals untouched.
        return
    del sys.argv[idx]
    if idx >= len(sys.argv):
        # Flag was the last token, so there is no value to read.
        return
    probefile = sys.argv.pop(idx)
    try:
        PROBESET = MotifMetrics.ProbeSet(probefile)
        PROBESET.factor = 0.65
    except Exception as err:
        # Parsing stays best-effort, but a genome that cannot be loaded is
        # reported instead of being silently ignored.
        sys.stderr.write('# Warning: could not load genome %s: %s\n'
                         % (probefile, err))
Ejemplo n.º 3
0
def tamo2tamo(file, outname):
    global probefile, PROBESET, fsafile

    motifs = MotifTools.load(file)
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = find_fsa(file)

    print '# FSA ', fsaname
    fsaD = MotifMetrics.fasta2seqs(fsaname, 'want_dict')
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq

    print "# %d motifs" % len(motifs)
    for motif in motifs:
        #motif.pvalue, motif.church = 1,1  #Comment this!
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        #if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if motif.frac == None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if motif.numbound == 0:
            matching = PROBESET.matching_ids(motif, [], factor=0.7)
            matchbound = [x for x in matching if x in probes]
            motif.numbound = len(probes)
            motif.nummotif = len(matching)
            motif.numboundmotif = len(matchbound)
        if 0 and motif.CRA == None:
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,
                                                   probes,
                                                   'v',
                                                   tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except:
                pass

    MotifTools.save_motifs(motifs, outname)
Ejemplo n.º 4
0
def main():
    short_opts = 'f:'
    long_opts = ['genome=', 'range=', 'top=', 'pcnt=', 'bgfile=']
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError:
        print getopt.GetoptError.__dict__
        usage()
    if not opts: usage()

    fastafile = ''
    top_count = 10
    top_pcnt = None
    genome = 'YEAST'
    w_start = 8
    w_stop = 15
    bgfile = MDSCAN_DIR + 'yeast_int.bg'
    for opt, value in opts:
        if opt == '-f': fastafile = value
        if opt == '--genome': genome = value
        if opt == '--top': top_count = int(value)
        if opt == '--pcnt': top_pcnt = float(value)
        if opt == '--range':
            w_start, w_stop = [int(x) for x in value.split(',')]

    print "#" + ' '.join(sys.argv)
    probeids = Fasta.keys(fastafile)
    Genome = MotifMetrics.ProbeSet(genome)

    probeids = Genome.filter(probeids)

    if top_pcnt:
        top_count = max(top_count, int(top_pcnt / 100.0 * len(probeids)))

    theMeta = metaMDscan(fastafile, w_start, w_stop, top_count)

    for m in theMeta.motifs:
        m.pvalue = Genome.p_value(m, probeids, 'v')
        m.church = Genome.church(m, probeids, 'v')
        sys.stdout.flush()

    theMeta.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    print_motifs(theMeta.motifs)
Ejemplo n.º 5
0
def ace2tamo(filename, tamoname):
    global probefile, PROBESET
    if   re.search('\.ace$',filename):
        mdobject = AlignAce.AlignAce(filename)
    elif re.search('\.meme$',filename):
        mdobject = Meme.Meme(filename)

    fsaname = find_fsa(mdobject.fastafile)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('HUMAN_250')
        #PROBESET= pick_genome(fsaname)
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in mdobject.motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if re.search('\.meme$',filename):
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        sys.stdout.flush()

    i = 0
    for motif in mdobject.motifs:
        motif.seednum = i ; i=i+1
        kmers = motif.bogus_kmers(100)
        motif.maxscore = -100
        scores = [motif.scan(kmer)[2][0] for kmer in kmers]
        print Arith.avestd(scores)

    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))

    MotifTools.save_motifs(mdobject.motifs,tamoname)
Ejemplo n.º 6
0
def main():
    if len(sys.argv) < 2:
        print "Usage: %s <fasta_file>" % (re.sub('^.*/', '', sys.argv[0]))
        print "     [-genome genomefile.fsa]   Genome file (for computing Enrichment, etc..."
        print "     [-bfile  file          ]   File for Markov Background Model"
        print '     [-bigdata              ]   Adds "-maxsize 2000000" for large datasets'
        sys.exit(1)

    fastafile = sys.argv[1]
    width = 0
    valid_tfs = []
    iter = 10
    genome = 'YEAST'
    xtra = ''
    bfile = None

    for tok, i in zip(sys.argv, range(len(sys.argv))):
        if tok == '-w': width = int(sys.argv[i + 1])
        elif tok == '-human': genome = 'HUMAN'
        elif tok == '-H250': genome = 'HUMAN_250'
        elif tok == '-Ch22': genome = 'Ch22'
        elif tok == '-genome': genome = sys.argv[i + 1]
        elif tok == '-bigdata': xtra = '-maxsize 2000000'
        elif tok == '-bfile': bfile = sys.argv[i + 1]

    theMeme = Meme(fastafile, width, xtra, genome, bfile)
    Genome = MotifMetrics.ProbeSet(genome)
    ids = theMeme.probes
    #ids     = Genome.ids_from_file(fastafile)

    motifs = theMeme.motifs
    for motif in motifs:
        motif.pvalue = Genome.p_value(motif, ids, 'v')
        for valid_tf in valid_tfs:
            motif.valid = Validate.validate(motif, valid_tf, '', 'Want Tuple')

    print_motifs(motifs)

    print '#' * 80
    for line in theMeme.lines:
        print line,
Ejemplo n.º 7
0
def motifs2tamo(motifs, outname):
    global probefile, PROBESET
    
    fsaname = find_fsa(outname)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq

    print "# %d motifs"%len(motifs)
    for motif in motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
    MotifTools.save_motifs(motifs,outname)
Ejemplo n.º 8
0
def random_seqs(numseq=50,genome='YEAST',want_dict=None):
    """Return ``numseq`` distinct, randomly chosen probe sequences.

    Args:
        numseq: number of sequences to draw.
        genome: probe set name; loaded probe sets are cached in the global
            ``PROBESETS``.
        want_dict: when true, return a dict mapping id -> sequence instead
            of a list of sequences.

    Raises:
        ValueError: if ``numseq`` is negative or larger than the number of
            candidate ids (the previous rejection loop never terminated in
            the latter case).

    While the global ``BADPROBES`` is empty, the ids listed in
    ``BADPROBEFILES`` are loaded into it and the global ``ALL_IDS`` is rebuilt
    from the current probe set, excluding those ids and passing the rest
    through ``SimilarFilter(50)``.
    NOTE(review): once BADPROBES is non-empty, ALL_IDS is reused for every
    later call, even with a different ``genome`` -- confirm this is intended.
    """
    global PROBESETS, BADPROBES, BADPROBEFILES, ALL_IDS
    if genome in PROBESETS:
        probeset = PROBESETS[genome]
    else:
        probeset = MotifMetrics.ProbeSet(genome)
        PROBESETS[genome] = probeset

    if not BADPROBES:
        bad = {}
        for path in BADPROBEFILES:
            handle = open(path)
            try:
                for line in handle:
                    bad[line.strip()] = 1
            finally:
                # Close the file even if reading fails.
                handle.close()
        BADPROBES = list(bad.keys())
        simfilter = GenerateFastas.SimilarFilter(50)
        # Membership is tested against the dict for O(1) lookups.
        all_ids = [x for x in probeset.probes.keys() if x not in bad]
        ALL_IDS = simfilter.filter(all_ids)

    ids = ALL_IDS
    if numseq < 0 or numseq > len(ids):
        raise ValueError('cannot draw %d distinct sequences from %d candidates'
                         % (numseq, len(ids)))
    # Uniform draw without replacement.
    randomids = random.sample(ids, numseq)

    if not want_dict:
        seqs = [probeset.probes[randomid] for randomid in randomids]
    else:
        seqs = {}
        for randomid in randomids:
            seqs[randomid] = probeset.probes[randomid]
    return seqs
Ejemplo n.º 9
0
def memefiles2tamo(files, tamoname):
    global probefile, PROBESET

    motifs = []
    for filename in files:
        print ">>>SDFSD>F ", filename
        if re.search('\.ace$', filename):
            mdobject = AlignAce.AlignAce(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = filename.replace('.ace', '.fsa')
        elif re.search('\.meme.*$', filename):
            mdobject = Meme.Meme(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = re.sub('\..\.meme', '.meme',
                                            filename).replace('.meme', '.fsa')
        motifs.extend(mdobject.motifs)

    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in motifs:
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        if motif.E_site == None:
            motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif, probes, 'v')
        if motif.frac == None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if re.search('\.meme$', filename):
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        if 1 and (motif.CRA == None):
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,
                                                   probes,
                                                   'v',
                                                   tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except:
                pass

    if re.search('\.meme$', filename):
        mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church))

    MotifTools.save_motifs(motifs, tamoname)
Ejemplo n.º 10
0
def main():
    print "#" + ' '.join([x.replace(' ', '\ ') for x in sys.argv])
    parse_opts()
    ARGS = getarg('args')

    GLOBALS['GENOME'] = MotifMetrics.ProbeSet(getarg('genome'))
    print '# Loaded %s' % getarg('genome')

    badprobes = []
    for f in BADPROBES:
        b = [x.strip() for x in open(f).readlines()]
        badprobes.extend(b)
    d = getarg('DATA')
    p = getarg('GENOME')
    S = SimilarFilter(50)
    experiments = getarg('expts')
    top = getarg('top')
    THRESH = getarg('pvalue')
    NO_FILTER = getarg('nofilter')
    ratioabove = getarg('ratioabove')

    if not experiments: experiments = d.experiments

    for expt in experiments:
        e = expt

        if top:
            _tups = d.scores(e)
            _tups.sort(lambda x, y: cmp(x[0], y[0]))
            unfiltered = [x[1] for x in _tups[0:top]]
        elif ratioabove:
            unfiltered = d.ratioabove(e, ratioabove)
        else:
            unfiltered = d.bound(e, THRESH)

        badfiltered = [x for x in unfiltered if not (x in badprobes)]
        #badfiltered  = unfiltered # Turn back on for real data
        if len(unfiltered) - len(badfiltered) > 2: unfiltered = badfiltered
        #else:   continue    # Necessary when only wanting to regenerate problemed data

        bound_ids = p.filter(unfiltered)
        filtered_ids = bound_ids

        print '### Removed ', (len(bound_ids) -
                               len(S.filter(bound_ids))), 'from ', expt

        if not NO_FILTER:
            filtered_ids = p.filter(S.filter(bound_ids))

        #filtered_ids = bound_ids # Turn back on for real data
        if NO_FILTER:
            print '#%-15s   %3d    ' % (expt, len(bound_ids))
        else:
            print '#%-15s   Before %3d    After %3d ' % (expt, len(bound_ids),
                                                         len(filtered_ids))
        if len(unfiltered) - len(bound_ids) > 2:
            diff = [x for x in unfiltered
                    if (not x in bound_ids)]  #l_andnot(unfiltered,bound_ids)
            print '%-15s  %3d probes (out of %3d) without predicted sequences ' % (
                expt, len(diff), len(unfiltered))
            for _p in diff:
                print '# Absent in (%s) %s' % (expt, _p)
        #continue #Comment this

        #sort
        final_ids, final_scores = [], []
        _tups = d.scores(e)  #Sometimes redundant, but who cares?
        _tups.sort(lambda x, y: cmp(x[0], y[0]))
        for score, id in _tups:
            #if (score <= THRESH) and (id in filtered_ids):
            if (id in filtered_ids):  #Does this break everything?
                final_ids.append(id)
                final_scores.append('%8.4e' % score)
        if final_scores:
            print "#%% %-15s %s" % (expt, final_scores[-1])
        else:
            print "#%% %-15s None" % (expt)
        s = p.fsa_string_from_ids(final_ids, final_scores)
        if len(s) == 0: continue
        f = expt + '.fsa'
        f = re.sub(' ', '_', f)
        FID = open(f, 'w')
        FID.write(s)
        FID.close()
        sys.stdout.flush()