def load_seqs(filename):
    """
    Load the sequences in a fasta file and convert them.

    Every entry yielded by sequences_from_fasta is unpacked as a
    (description, (sequence, tally)) pair.

    @param filename: The fasta file to read.
    @return: A tuple (numpy_seqs, tally). numpy_seqs maps each description to the result of
    hmm.preprocess_sequence(seq_to_numpy(sequence)); tally is the sum over all sequences of
    N.array(tally) (presumably per-base counts - confirm against sequences_from_fasta).
    """
    logging.info('Loading sequences: %s', filename)
    sequences = dict(sequences_from_fasta(filename))

    # convert each sequence and count the bases in a single pass
    numpy_seqs = {}
    num_bases = 0
    for desc, (seq, _seq_tally) in sequences.iteritems():
        numpy_seqs[desc] = hmm.preprocess_sequence(seq_to_numpy(seq))
        num_bases += len(seq)

    # element-wise total of the per-sequence tallies
    tally = sum(N.array(seq_tally) for _seq, seq_tally in sequences.values())

    logging.info('Loaded %d sequences with %d bases', len(sequences), num_bases)
    return numpy_seqs, tally
def test_traits(self):
    """
    Test a PssmTraits model built from emission distributions that each emit exactly one base.

    For orders 1 and 2 this checks that:
      - every entry B[n, o] of the converted model's emission matrix is close to the entry
        indexed by traits.get_non_reverse_complement(n, o);
      - the Viterbi path for the matching test sequence is num_background_states,
        num_background_states + 1, ... in order.
    """
    from hmm.pssm import create_background_model, PssmTraits, seq_to_numpy
    from infpy import check_is_close_2

    p_binding_site = .01
    num_background_states = 2

    # One distribution per position of the test sequence, with all the mass on that position's
    # base. Index order a, c, g, t is the order under which the distributions match test_seq.
    test_seq = 'accagtttgcact'
    one_hot = {
        'a': [1., 0., 0., 0.],
        'c': [0., 1., 0., 0.],
        'g': [0., 0., 1., 0.],
        't': [0., 0., 0., 1.],
    }
    emission_dists = [list(one_hot[base]) for base in test_seq]
    K = len(emission_dists)
    test_seq_order_0 = seq_to_numpy(test_seq)

    # for various different orders
    for order in (1, 2):

        # build a model of the distributions above
        traits = PssmTraits(
            K,
            p_binding_site,
            order,
            num_background_states,
            create_background_model,
            emission_dists=emission_dists)
        model = traits.new_model()
        converted = hmm.model_states_2_model(model)
        B = converted.B

        # check the reverse complement states are correct
        for n in xrange(model.N):
            for o in xrange(model.M):
                other_n, other_o = traits.get_non_reverse_complement(n, o)
                assert check_is_close_2(B[other_n, other_o], B[n, o]), (
                    '%d,%d %d,%d: %f %f' % (other_n, other_o, n, o, B[other_n, other_o], B[n, o]))

        # check viterbi gives correct result
        test_seq_order_n = converted.converter.to_order_n(test_seq_order_0)
        LL, states = converted.viterbi(test_seq_order_n)
        for position, state in enumerate(states):
            assert state == num_background_states + position
def evaluate_model(model, sequence):
    """
    Evaluates the model against the sequence.

    @param model: A (hmm, traits) pair.
    @param sequence: The sequence to test; it is converted with seq_to_numpy before decoding.
    @return: True if there is at least one hit in the sequence, i.e. if more than
    traits.K / 2 of the states on the Viterbi path are not background states.
    """
    model_hmm, traits = model
    _LL, state_path = model_hmm.viterbi(seq_to_numpy(sequence))

    # the states on the path that do not belong to the background
    site_states = [state for state in state_path if state not in traits.background_states]

    # a hit needs strictly more than K / 2 such states
    return len(site_states) > traits.K / 2
def load_seqs(filename):
    """
    Load the sequences in a fasta file and convert them.

    Every entry yielded by sequences_from_fasta is unpacked as a
    (description, (sequence, tally)) pair.

    @param filename: The fasta file to read.
    @return: A tuple (numpy_seqs, tally). numpy_seqs maps each description to the result of
    hmm.preprocess_sequence(seq_to_numpy(sequence)); tally is the sum over all sequences of
    N.array(tally) (presumably per-base counts - confirm against sequences_from_fasta).
    """
    logging.info('Loading sequences: %s', filename)
    sequences = dict(sequences_from_fasta(filename))

    # convert each sequence and count the bases in a single pass
    numpy_seqs = {}
    num_bases = 0
    for desc, (seq, _seq_tally) in sequences.iteritems():
        numpy_seqs[desc] = hmm.preprocess_sequence(seq_to_numpy(seq))
        num_bases += len(seq)

    # element-wise total of the per-sequence tallies
    tally = sum(N.array(seq_tally) for _seq, seq_tally in sequences.values())

    logging.info('Loaded %d sequences with %d bases', len(sequences), num_bases)
    return numpy_seqs, tally
def test_traits(self):
    """
    Test a PssmTraits model whose emission distributions each emit a single base.

    The model is built for orders 1 and 2. For each order the test asserts that every
    emission matrix entry is close to the entry at the indices returned by
    traits.get_non_reverse_complement, and that decoding the test sequence with viterbi
    yields consecutive states starting at num_background_states.
    """
    from hmm.pssm import create_background_model, PssmTraits, seq_to_numpy
    from infpy import check_is_close_2

    p_binding_site = .01
    num_background_states = 2
    emission_dists = [
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
    ]
    K = len(emission_dists)
    test_seq = 'accagtttgcact'  # matches dist above
    test_seq_order_0 = seq_to_numpy(test_seq)

    # for various different orders
    for order in [1, 2]:

        # build a model of distribution above
        traits = PssmTraits(
            K, p_binding_site, order, num_background_states,
            create_background_model, emission_dists=emission_dists
        )
        model = traits.new_model()
        converted = hmm.model_states_2_model(model)
        B = converted.B

        # check the reverse complement states are correct
        for n in xrange(model.N):
            for o in xrange(model.M):
                partner_state, partner_obs = traits.get_non_reverse_complement(n, o)
                partner_prob = B[partner_state, partner_obs]
                own_prob = B[n, o]
                assert check_is_close_2(partner_prob, own_prob), (
                    '%d,%d %d,%d: %f %f'
                    % (partner_state, partner_obs, n, o, partner_prob, own_prob)
                )

        # check viterbi gives correct result: the i'th decoded state must be the i'th
        # state after the background states
        test_seq_order_n = converted.converter.to_order_n(test_seq_order_0)
        LL, states = converted.viterbi(test_seq_order_n)
        for offset, decoded_state in enumerate(states):
            assert decoded_state == num_background_states + offset
'# site sequence;sequence;start;is reverse complement; state sequence\n' ) for seq_idx, seq_sites in enumerate(sites): for site, states, start, is_rev_comp in seq_sites: if is_rev_comp: site = hmm.pssm.rev_comp(site) f.write('%s;%d;%d;%d;%s\n' % (hmm.pssm.numpy_to_seq(site), seq_idx, start, is_rev_comp, states)) if '__main__' == __name__: from hmm.pssm import PssmTraits, create_background_model, seq_to_numpy, random_sequence, information_content from random import random site = seq_to_numpy('aaactcaa') K = len(site) rev_comp_site = seq_to_numpy('ttgagttt') num_seqs = 60 seq_length = K + 30 start = 20 def gen_sequence(): seq = random_sequence(seq_length) if random() > .5: seq[start:start + K] = site else: seq[start:start + K] = rev_comp_site return seq p_binding_site = .01
option_parser.add_option(
    "--threshold-graph",
    dest="threshold_graph",
    help="file to write an image showing how # seqs with site varies by threshold."
)
# sys.argv='dummy.py -m /home/reid/T00759.pssm -s /home/reid/T00759.fa --threshold-graph test.png'.split()
options, args = option_parser.parse_args()

# log the value and help text of every option that stores a value
for option in option_parser.option_list:
    if option.dest:
        logging.info('%s: %s (%s)', option.dest, str(getattr(options, option.dest)), option.help)

# load the sequences and convert each one with seq_to_numpy
logging.info('Loading sequences: %s', options.sequences_file)
sequences = dict(sequences_from_fasta(options.sequences_file))
numpy_seqs = dict((desc, seq_to_numpy(seq)) for desc, seq in sequences.iteritems())
logging.info('Loaded %d sequences', len(sequences))

# parse the PSSMs; the parse is forced to completion by list() while the file is still
# open, and the file is closed even if parsing fails
logging.info('Parsing PSSMs: %s', options.models_file)
models_file = open(options.models_file)
try:
    pssms = list(parse_models(models_file))
finally:
    models_file.close()

# build one HMM per parsed PSSM
logging.info('Building models')
models = [
    build_hmm_from_semi_parsed(parsed, p_binding_site=options.p_binding_site)
    for parsed in pssms
]

logging.info('Analysing sequences')
p_binding_sites = list()
f.write('# site sequence;sequence;start;is reverse complement; state sequence\n') for seq_idx, seq_sites in enumerate(sites): for site, states, start, is_rev_comp in seq_sites: if is_rev_comp: site = hmm.pssm.rev_comp(site) f.write('%s;%d;%d;%d;%s\n' % (hmm.pssm.numpy_to_seq(site), seq_idx, start, is_rev_comp, states)) if '__main__' == __name__: from hmm.pssm import PssmTraits, create_background_model, seq_to_numpy, random_sequence, information_content from random import random site = seq_to_numpy('aaactcaa') K = len(site) rev_comp_site = seq_to_numpy('ttgagttt') num_seqs = 60 seq_length = K + 30 start = 20 def gen_sequence(): seq = random_sequence(seq_length) if random() > .5: seq[start:start+K] = site else: seq[start:start+K] = rev_comp_site return seq p_binding_site = .01 order = 1