def sample(hyperparameters, rho, K, F):
    """Draw one joint sample of every variable in the model.

    The scalars ``alpha``, ``beta``, ``gamma`` and ``lambda_`` are sampled
    from ``GammaDist`` objects and ``tau`` and ``omega`` from
    ``DirichletDist`` objects, all parameterised by ``hyperparameters``.
    ``pi`` is derived from the ``pi_bar`` draws by stick-breaking over ``K``
    components.  ``theta`` holds one Dirichlet draw (parameter
    ``alpha * pi``) per entry of ``rho``.  ``phi`` has shape ``(K + 1, F)``:
    row 0 is drawn with parameter ``lambda_ * omega`` and the remaining
    ``K`` rows with parameter ``beta * tau``.

    For every ``rho_g`` in ``rho`` a triple ``(v_g, z_g, x_g)`` is produced
    with one element per entry of ``rho_g``:

    - ``v_g``: the ``bernoulli(rho_i)`` draws,
    - ``z_g``: ``0`` where ``v_g`` is false, otherwise
      ``discrete_sample(theta[g]) + 1``,
    - ``x_g``: ``discrete_sample(phi[z])`` for each ``z`` in ``z_g``.

    Returns a ``Sample`` named tuple with the fields ``alpha beta gamma
    lambda_ tau omega pi_bar pi theta phi sites``.
    """
    num_genes = len(rho)
    hp = hyperparameters

    # The order of the draws below is significant: it fixes how the random
    # stream is consumed.
    alpha = GammaDist(hp.a_alpha, hp.b_alpha).sample()
    beta = GammaDist(hp.a_beta, hp.b_beta).sample()
    gamma = GammaDist(hp.a_gamma, hp.b_gamma).sample()
    lambda_ = GammaDist(hp.a_lambda, hp.b_lambda).sample()
    tau = DirichletDist(hp.a_tau).sample()
    omega = DirichletDist(hp.a_omega).sample()

    # Stick-breaking: each of the first K-1 weights takes its pi_bar share
    # of whatever the earlier components left over.
    pi_bar = BetaDist(numpy.ones(K), gamma * numpy.ones(K)).sample()
    pi = numpy.empty_like(pi_bar)
    for k in xrange(K - 1):
        pi[k] = pi_bar[k] * (1. - pi_bar[:k]).prod()
    # The final weight is the remainder, clamped at zero because rounding
    # can push the remainder slightly negative.
    remainder = 1. - pi[:-1].sum()
    pi[-1] = 0. if remainder < 0. else remainder

    theta = numpy.random.dirichlet(alpha * pi, size=num_genes)

    phi = numpy.empty((K + 1, F))
    phi[0] = numpy.random.dirichlet(lambda_ * omega)
    phi[1:] = numpy.random.dirichlet(beta * tau, size=K)

    # sample the correct number of sites for each gene
    sites = []
    for g, rho_g in enumerate(rho):
        indicators = [bernoulli(rho_i) for rho_i in rho_g]
        components = [
            discrete_sample(theta[g]) + 1 if indicator else 0
            for indicator in indicators
        ]
        features = [discrete_sample(phi[z]) for z in components]
        sites.append((indicators, components, features))

    Sample = namedtuple(
        'Sample',
        'alpha beta gamma lambda_ tau omega pi_bar pi theta phi sites')
    return Sample(
        alpha=alpha,
        beta=beta,
        gamma=gamma,
        lambda_=lambda_,
        tau=tau,
        omega=omega,
        pi_bar=pi_bar,
        pi=pi,
        theta=theta,
        phi=phi,
        sites=sites,
    )
def convert_base(b): if 'a' == b or 'A' == b: return 0 if 'c' == b or 'C' == b: return 1 if 'g' == b or 'G' == b: return 2 if 't' == b or 'T' == b: return 3 raise RuntimeError('Unknown base: %s' % str(b)) def convert_seq(seq): return map(convert_base, seq) sites_filename = os.path.join('c:\\', 'Dev', 'MyProjects', 'Vincent', 'transfac_matrix_sites.txt') TransfacSiteSet = namedtuple('TransfacSiteSet', 'matrix name seqs') def load_transfac_sites(sites_filename=sites_filename): return dict((matrix, TransfacSiteSet(matrix=matrix, name=name, seqs=seqs)) for matrix, name, seqs in read_sites(open(sites_filename))) def convert_transfac_sites(sites): result = dict() for matrix, site_set in sites.iteritems(): try: converted_seqs = map(convert_seq, site_set.seqs) # convert to feature values if not converted_seqs: # if no sequences ignore continue
else: seqs.append(line) def convert_base(b): if 'a' == b or 'A' == b: return 0 if 'c' == b or 'C' == b: return 1 if 'g' == b or 'G' == b: return 2 if 't' == b or 'T' == b: return 3 raise RuntimeError('Unknown base: %s' % str(b)) def convert_seq(seq): return map(convert_base, seq) sites_filename = os.path.join('c:\\', 'Dev', 'MyProjects', 'Vincent', 'transfac_matrix_sites.txt') TransfacSiteSet = namedtuple('TransfacSiteSet', 'matrix name seqs') def load_transfac_sites(sites_filename=sites_filename): return dict( (matrix, TransfacSiteSet(matrix=matrix, name=name, seqs=seqs)) for matrix, name, seqs in read_sites(open(sites_filename)) ) def convert_transfac_sites(sites): result = dict() for matrix, site_set in sites.iteritems(): try: converted_seqs = map(convert_seq, site_set.seqs) # convert to feature values if not converted_seqs: # if no sequences ignore continue
Code to run MEME algorithm. """ import subprocess import re import time import warnings from cookbook.interval import Interval from stempy import ensure_dir_exists, logging, os, parse_options from cookbook.named_tuple import namedtuple logger = logging.getLogger(__name__) Start = namedtuple( 'Start', 'w0 nsites0 cons0 cons w nsites sig em_time niters cons_after_em') if False: warnings.warn('Using debug MEME') _meme_binary = '/home/john/local/debug/bin/meme.bin' else: _meme_binary = '/home/john/local/bin/meme.bin' def run_meme(fasta, options, extra_args=None): """ Runs MEME. """ # set up command line
""" Code to analyse distances (spacings) between pairs of occurrences of TFs. """ import logging logger = logging.getLogger(__name__) from cookbook.named_tuple import namedtuple import pyicl import numpy import pylab from collections import defaultdict from .scan import footprint from scipy.special import gammaln PairOccurrence = namedtuple('PairOccurrence', 'spacing seq pos strand') Spacing = namedtuple( 'Spacing', 'primary secondary same_strand upstream distance') def add_max_distance_option(parser): parser.add_option( "-d", "--max-distance", type=int, default=30, help="Only look for occurrences of motifs up to MAX_DISTANCE " "base pairs apart", metavar="MAX_DISTANCE" )
# """ Test read sequences. """ # # Trickery to find update path to import stempy from # from setup_environment import init_test_env, fasta_dir init_test_env(__file__) import stempy, os from cookbook.named_tuple import namedtuple Start = namedtuple('Start', 'seed num_sites score model best_w_mers') options = stempy.get_default_options() options.output_dir = os.path.join('output', 'test-em') seed = 'CACTTT' W = len(seed) # read the sequences and build STEME object from index fasta = os.path.join(fasta_dir(), 'em-1-test.fa') algorithm = stempy.Algorithm(options) algorithm._initialise(fasta) motif_finder = algorithm.create_motif_finder() model = algorithm.create_model_of_input(W) model.bs.seed(seed, True) start = Start(seed=seed, num_sites=10, score=0., model=model, best_w_mers=stempy.InstanceVec())
logger = logging.getLogger(__name__) from . import html_copy_static import os import pyicl import pylab import numpy import bisect from cookbook.named_tuple import namedtuple from collections import defaultdict from itertools import ifilter from cookbook.pylab_utils import pylab_context_ioff, create_format_cycler # from cookbook.pylab_utils import simple_marker_styles SeqInfo = namedtuple('SeqInfo', 'name length') Occurrence = namedtuple( 'Occurrence', 'motif wmer seq pos strand Z score pvalue') def footprint(occ): """Return the footprint (interval) of the occurrence. """ return pyicl.IntInterval(occ.pos, occ.pos + len(occ.wmer)) def parse_occurrence(line): """Parse one occurrence in the format outputted by steme-pwm-scan. """ fields = line.strip().split(',') if 8 != len(fields):