Beispiel #1
0
def create_data(*seqs):
    """Build a ``stempy.Data`` object from the given sequences.

    Each positional argument is appended, in order, to a fresh
    ``stempy.StringSet``. An index is then built over that set with
    ``stempy.build_index`` and wrapped in ``stempy.Data``. Progress is
    reported through ``logging.info`` at every step.

    :param seqs: The sequences to add, one per positional argument.
    :returns: The ``stempy.Data`` object created from the index.
    """
    sequences = stempy.StringSet()
    for sequence in seqs:
        logging.info('Adding sequence "%s" to string set.', sequence)
        sequences.append(sequence)

    logging.info('Building index.')
    built_index = stempy.build_index(sequences)

    logging.info('Creating data object.')
    data_object = stempy.Data(built_index)
    return data_object
Beispiel #2
0
# Sort the instances in place before reporting them.
instance_finder.instances.sort()

# Reference log output from an earlier run: 13 instances, listed in
# increasing order of Z.
#2012-06-16 11:32:58,686 - INFO - seq=    5; pos=    67; strand=+; W-mer=AACCTCGAGAG; Z=0.749857
#2012-06-16 11:32:58,686 - INFO - seq=    0; pos=    48; strand=+; W-mer=AACCTAAGAAA; Z=0.814953
#2012-06-16 11:32:58,686 - INFO - seq=    3; pos=    51; strand=+; W-mer=AAACTGTGGCT; Z=0.819370
#2012-06-16 11:32:58,686 - INFO - seq=    5; pos=    79; strand=+; W-mer=AAGCTAAAGAG; Z=0.827948
#2012-06-16 11:32:58,687 - INFO - seq=    3; pos=    36; strand=-; W-mer=AAGCTTATCAG; Z=0.862206
#2012-06-16 11:32:58,687 - INFO - seq=    5; pos=    97; strand=-; W-mer=GAACTGGGGAT; Z=0.912242
#2012-06-16 11:32:58,687 - INFO - seq=    2; pos=    47; strand=+; W-mer=AAACTTGGGAA; Z=0.919355
#2012-06-16 11:32:58,687 - INFO - seq=    1; pos=     6; strand=+; W-mer=AACCTTAGACG; Z=0.963969
#2012-06-16 11:32:58,687 - INFO - seq=    6; pos=    46; strand=-; W-mer=AAGCTGGGGAC; Z=0.968255
#2012-06-16 11:32:58,687 - INFO - seq=    9; pos=    73; strand=-; W-mer=GACCTGATGAG; Z=0.968813
#2012-06-16 11:32:58,687 - INFO - seq=    5; pos=    16; strand=-; W-mer=AACCTGAGCCG; Z=0.974733
#2012-06-16 11:32:58,687 - INFO - seq=    6; pos=    73; strand=+; W-mer=AACCTTAGGCG; Z=0.984093
#2012-06-16 11:32:58,687 - INFO - seq=    3; pos=    10; strand=+; W-mer=AACCTTAGGAT; Z=0.984245



#
# Log one line per instance: sequence index, position within the sequence,
# strand, the W-mer itself and its Z value.
#
for instance in instance_finder.instances:
    global_pos = instance.global_pos
    seq, pos = data.pos_localise(global_pos)
    W_mer = data.get_W_mer(W, global_pos)
    strand = '+'
    if instance.rev_comp:
        # Reverse-strand instances are shown as the reverse complement.
        strand = '-'
        W_mer = stempy.reverse_complement(W_mer)
    logging.info('seq=%5d; pos=%6d; strand=%s; W-mer=%s; Z=%4f', seq, pos, strand, W_mer, instance.Z)

# The test expects exactly 13 instances to have been found.
num_instances = len(instance_finder.instances)
logging.info('Found %d instances', num_instances)
assert 13 == num_instances
# Background model: add pseudo-counts (options.back_dist_prior) to the
# frequencies, compute the Markov model likelihoods for `data`, and build the
# background model for width W from both.
freqs_with_pseudo_counts = freqs.add_pseudo_counts(options.back_dist_prior)
lls = mm.calculate_likelihoods(data)
bg_model = stempy.create_bg_model_from_base_likelihoods(W, data, lls, freqs_with_pseudo_counts)

# Binding site model: a PSSM model built from
# stempy.initialise_uniform_pssm(W, alphabet_size), then seeded with `seed`.
bs_model = stempy.PssmBindingSiteModel(stempy.initialise_uniform_pssm(W, options.alphabet_size))
bs_model.seed(seed)

# Whole model: combines the data with the binding site and background models.
# NOTE(review): _lambda is passed as 0. here -- presumably a site prior;
# confirm its meaning against stempy.Model.
model = stempy.Model(data, bs_model, bg_model, _lambda=0.)

# Time a search for instances; Z_threshold is handed to FindInstances and
# also appears in the timer message.
Z_threshold = .3
with Timer(msg='find instances with Z>%f' % Z_threshold):
    instance_finder = stempy.FindInstances(data, model, Z_threshold)
    instance_finder()
    logging.info('Found %d instances', len(instance_finder.instances))


# Time a search with the best-W-mer finder, asking for num_W_mers_to_find
# W-mers; the number actually found is logged afterwards.
num_W_mers_to_find = 10000
with Timer(msg='find %d best W-mers' % num_W_mers_to_find):
    w_mer_finder = stempy.create_best_w_mer_finder(data, model, num_W_mers_to_find)
    w_mer_finder()
    logging.info('Found %d instances', len(w_mer_finder.best_w_mers))
    

def global_overlap(pos1, pos2, W):
    """Return True when two positions are fewer than ``W`` apart.

    :param pos1: First position.
    :param pos2: Second position.
    :param W: Width to compare the separation against.
    :returns: ``True`` if ``abs(pos1 - pos2)`` is strictly less than ``W``.
    """
    separation = abs(pos1 - pos2)
    return separation < W

def get_non_overlapping(instances, W):
    instances.sort()
    instances.reverse()
Beispiel #4
0
    for seq in seqs:
        logging.info('Adding sequence "%s" to string set.', seq)
        string_set.append(seq)
    logging.info('Building index.')
    index = stempy.build_index(string_set)
    logging.info('Creating data object.')
    return stempy.Data(index)

def feq(x, y, eps=1e-4):
    """Return True when ``x`` and ``y`` differ by less than ``eps``.

    :param x: First value.
    :param y: Second value.
    :param eps: Tolerance; the absolute difference must be strictly below it.
    :returns: ``True`` if ``abs(x - y) < eps``, otherwise ``False``.
    """
    # Take the absolute value of the difference *before* comparing with eps.
    # The earlier form applied fabs() to the boolean ``x - y < eps``, which
    # was truthy for any x sufficiently smaller than y.
    return abs(x - y) < eps



# Build the data object for a single short sequence and fit an order-3
# Markov model to it (second argument 1. -- presumably the pseudo-count;
# TODO confirm against stempy.create_markov_model_order_3).
seq = "ACGTACACAC"
data = create_data(seq)
logging.info('Creating Markov model.')
mm, freqs = stempy.create_markov_model_order_3(data, 1.)
logging.info('Calculating likelihoods.')
lls = mm.calculate_likelihoods(data)
# Exponentiate the per-base log likelihoods of the first sequence.
# NOTE(review): base_probs is indexed below, so map() must return a list
# here (Python 2 behaviour).
base_probs = map(exp, base_lls(lls[0]))
logging.info(', '.join('%.5f' % p for p in base_probs))
# Each expected value has the form (count of k-mer + 1) / (count of its
# prefix + 4), computed directly from `seq`.
# NOTE(review): the first check compares the result of
# W_mer_log_likelihood with a probability, while the others compare
# exponentiated values -- confirm this is intended.
assert feq(stempy.W_mer_log_likelihood(lls[0], 0, 1), (seq.count('A')+1.)/(len(seq)+4.))
assert feq(base_probs[1], (seq.count('AC')+1.)/(seq.count('A')+4.))
assert feq(base_probs[2], (seq.count('ACG')+1.)/(seq.count('AC')-1.+4.)) # -1. because last 'AC' has no following character
assert feq(base_probs[3], (seq.count('ACGT')+1.)/(seq.count('ACG')+4.))
assert feq(base_probs[4], (seq.count('CGTA')+1.)/(seq.count('CGT')+4.))

# check the bg model from likelihoods
bg_model = stempy.create_bg_model_from_base_likelihoods(4, data, lls, freqs)

# Next test sequence; the code that uses it is not visible in this excerpt.
seq = "AAAC"
# Initialise the test environment (helper defined elsewhere), requesting
# INFO-level logging.
init_test_env(__file__, level=logging.INFO)

import os, stempy


#
# First run STEME
#
# Fixed motif width of 8, with MEME-like output written to 'meme.out' under
# the configured output directory.
options = stempy.get_default_options()
options.output_dir = os.path.join('output', 'test-meme-like-output')
options.min_w = options.max_w = 8
options.meme_like_output = 'meme.out'
algorithm = stempy.Algorithm(options)
# The input FASTA file lives in the 'fasta' directory next to this script.
fasta = os.path.join(os.path.dirname(__file__), 'fasta', 'T00759-tiny.fa')
algorithm(fasta)
logging.info('Showing MEME output from %s', algorithm.meme_like_output_file)
# Dump the generated file to stdout so it shows up in the test log.
os.system('cat %s' % algorithm.meme_like_output_file)


#
# Test BioPython parser
#
from Bio import Motif
# Parse inside a `with` block so the file handle is closed once the parser's
# output has been fully consumed by list().
with open(algorithm.meme_like_output_file) as meme_output_handle:
    motifs = list(Motif.parse(meme_output_handle, "MEME"))


#
# Doesn't quite work with pycogent yet. Pycogent expects a summary section
# that contains sites in all the sequences 
#
#from cogent import LoadSeqs
Beispiel #6
0
# Configure a run on the 'random-seqs-4-sites' FASTA file: order-0
# background model, fixed motif width W, between 2 and 10 sites, with
# MEME-like output written under the output directory.
fasta_file = os.path.join(fasta_dir(), 'random-seqs-4-sites.fasta')
options = stempy.get_default_options()
options.output_dir = os.path.join('output', 'test-num-sites')
options.bg_model_order = 0
options.min_w = options.max_w = W
options.min_num_sites = 2
options.max_num_sites = 10
options.meme_like_output = 'test-num-sites.txt'
meme_output = os.path.join(options.output_dir, options.meme_like_output)


#
# Run the STEME algorithm
#
algorithm = stempy.Algorithm(options)
algorithm(fasta_file)

#
# Make sure we choose a motif that predicted 4 sites
#
# predicted_sites maps a sequence name to its sites; iteritems() makes this
# Python 2 code.
predicted_sites = parse_meme_output_for_sites(meme_output)
for seq, sites in predicted_sites.iteritems():
    # Find the index i of the first input sequence whose id starts with `seq`.
    # NOTE(review): if no id matches, the loop ends without a break and i is
    # left at the last index, so the sites below would be reported against
    # the wrong sequence.
    for i, _id in enumerate(algorithm.input_sequences.ids):
        if _id.startswith(seq):
            break
    # Log each site's sequence index, local position and W-mer.
    for site in sites:
        global_pos = algorithm.input_sequences.data.pos_globalise(i, site.first)
        logging.info('%2d %3d %s %s', i, site.first, algorithm.input_sequences.data.get_W_mer(W, global_pos), seq)
# NOTE(review): this counts the keys of predicted_sites (one per sequence),
# not the total number of sites -- confirm that matches the intent above.
assert 4 == len(predicted_sites)

Beispiel #7
0
import pkg_resources
from optparse import OptionParser
from stempy.scan import load_occurrences_from_stream
from stempy.spacing import add_max_distance_option, count_all_pairs, spacing_idx

# Take the defaults of an empty OptionParser as the options object and set
# max_distance, the one value used below, by hand.
parser = OptionParser()
options = parser.get_default_values()
options.max_distance = 4


#
# Load the occurrences and associated sequence lengths,
# they will come sorted by position
#
# Both inputs are read from resources packaged with 'stempy'.
# NOTE(review): the two resource streams are not closed explicitly here.
logging.info('Loading occurrences')
occurrences, seq_infos, motifs = load_occurrences_from_stream(
    pkg_resources.resource_stream('stempy', 'test/spacing/steme-pwm-scan.out'),
    pkg_resources.resource_stream('stempy', 'test/spacing/steme-pwm-scan.seqs'),
)


#
# Iterate through the occurrences counting spacings
#
logging.info(
    'Examining spacings of up to %d b.p. between %d occurrences of %d motifs in %d sequences',
     options.max_distance, len(occurrences), len(motifs), len(seq_infos)
)
# The options object (carrying max_distance) is passed through to the counter.
spacings = count_all_pairs(occurrences, seq_infos, ignore_close_to_end=True, options=options)
#
# Load the input sequences. The FASTA path is encoded using stdin's encoding,
# falling back to 'ascii' when sys.stdin.encoding is unset.
input_sequences = stempy.SequenceSet(fasta.encode(sys.stdin.encoding or 'ascii'), options)



#
# Initialise the background
#
bg_manager = stempy.get_background_manager(input_sequences, input_sequences.mm, options)



#
# Create the model
#
# The model is built for width W from the background manager's model, then
# its PSSM log probabilities are overwritten with log_pwm via slice
# assignment and the binding site model is recalculated.
# NOTE(review): assumes log_probs.values() returns a writable view -- confirm.
model = input_sequences.create_model(bg_manager.get_bg_model(W), W)
model.bs.pssm.log_probs.values()[:] = log_pwm
model.bs.recalculate()
model.lambda_ = lambda_




#
# Create the instance finder and find the instances
#
# Instances are sorted in place before the count is logged.
instance_finder = stempy.FindInstances(input_sequences.data, model, Z_threshold)
instance_finder()
instance_finder.instances.sort()
logging.info('Found %d instances', len(instance_finder.instances))
Beispiel #9
0
# Initialise the test environment (helper defined elsewhere), requesting
# INFO-level logging.
init_test_env(__file__, level=logging.INFO)

import stempy, os
from stempy.planted_sites import parse_fasta_for_sites, parse_meme_output_for_sites, calculate_positives_and_negatives
from infpy.roc import RocCalculator
from optparse import OptionParser

# Command line handling: --run-meme is a boolean flag (store_true) asking for
# MEME to be run on the sequences as well.
parser = OptionParser()
parser.add_option("--run-meme", action='store_true', help="Run MEME as well on the sequences")
cmd_line_options, args = parser.parse_args()

#
# The data sets with minimum sensitivity and specificity values required.
#
if is_debug_python():
    logging.info('Detected debug version of Python, only using smallest data set.')
    data_sets = [
        #('random-seqs-03-050'        , .13, .86),
        ('random-seqs-with-Ns-03-050', .13, .86),
    ]
else:
    data_sets = [
        ('random-seqs-03-050'        , .13, .86),
        ('random-seqs-with-Ns-03-050', .13, .86),
        ('random-seqs-05-100'        , .40, .89),
        ('random-seqs-with-Ns-05-100', .60, .90),
        ('random-seqs-05-100'        , .40, .89), # cannot achieve (.6,.9) stats when finding starts up-front
        ('random-seqs-10-100'        , .60, .91),
        ('random-seqs-with-Ns-10-100', .20, .98), # lower specificity with Ns
        ('random-seqs-30-200'        , .46, .99),
        ('random-seqs-with-Ns-30-200', .46, .99),