def get_model(args, filename, allow_mask=True):
    """Load an HMM/model description from *filename*, applying CLI bindings.

    @param args: parsed command-line namespace; uses mathType, bind_file,
        bind_constant_file, bind_constant and add_masked_to_distribution
    @param filename: model file to load
    @param allow_mask: when False, skip soft-masking even if requested
    @return: the loaded model object (unwrapped from a {'model': ...} dict)
    """
    loader = HMMLoader(args.mathType)  # TODO: rename HMMLoader to ModelLoader
    register_classifier_states(loader)
    register_annotation_states(loader)
    register_cannotation_states(loader)
    register_annotations(loader)
    # Bindings come as flat [key1, value1, key2, value2, ...] lists.
    for i in range(0, len(args.bind_file), 2):
        loader.addFile(args.bind_file[i], args.bind_file[i + 1])
    for i in range(0, len(args.bind_constant_file), 2):
        loader.addConstant(
            args.bind_constant_file[i],
            loader.load(args.bind_constant_file[i + 1])
        )
    # BUG FIX: this loop previously ranged over bind_constant_file again,
    # so constants passed via --bind_constant were silently ignored
    # (mirrors the corrected loop ordering used by the sampler's main()).
    for i in range(0, len(args.bind_constant), 2):
        loader.addConstant(
            args.bind_constant[i],
            loader.loads(args.bind_constant[i + 1]),
        )
    model = loader.load(filename)
    if type(model) is dict and 'model' in model:
        model = model["model"]
    if args.add_masked_to_distribution and allow_mask:
        model.add_soft_masking_to_distribution()
    return model
def main(args):
    """Find tandem repeats using the 'R' state of an HMM.

    Writes repeat tuples per sequence to args.output (JSON) and, when
    args.stats is given, a stats dump to that file.
    """
    # TODO: build model params
    # FIX: identity comparison with None (was `args.model == None`).
    if args.model is None:
        print("You have to provide model")
        exit(1)
    loader = HMMLoader(LogNum)
    # Pick the repeat-generating state; assumes the model contains exactly
    # one state with onechar == 'R' (NameError below otherwise).
    for state in loader.load(args.model)['model'].states:
        if state.onechar == 'R':
            model = state
    #BEGIN COPY
    D, stats = do_find_repeats(
        args.fasta,
        None,
        model,
        LogNum,
        args.stats,
        '$',
    )
    #END COPY
    # FIX: identity comparison with None (was `args.stats != None`).
    if args.stats is not None:
        out_stats = dict()
        # Keys may be non-string (e.g. tuples); stringify for JSON.
        for k, v in stats.iteritems():
            out_stats[str(k)] = v
        with open(args.stats, 'w') as f:
            json.dump(out_stats, f, indent=4)
    # Flatten repeat objects into plain tuples for JSON serialization.
    for key in D:
        D[key] = [
            (x.start, x.end, x.repetitions, x.consensus, x.sequence)
            for x in D[key]
        ]
    with open(args.output, 'w') as f:
        json.dump(D, f, indent=4)
def train(sequences, original_model, new_model):
    """Train the 'Repeat' state of an HMM on sample sequences and save it.

    @param sequences: path to a JSON file with training sequences
    @param original_model: path to the model file to start from
    @param new_model: path where the trained model (JSON) is written
    """
    hmm = HMMLoader(LogNum).load(original_model)['model']
    # Dump the model topology for debugging.
    with open('mmm.dot', 'w') as f:
        f.write(model_to_dot(hmm))
    repeat_id = hmm.statenameToID['Repeat']
    with open(sequences) as f:
        training_data = json.load(f)
    hmm.states[repeat_id].trainModel(training_data)

    def ln_to_float(value):
        # Recursively replace LogNum leaves with plain floats; dicts are
        # updated in place, lists/tuples rebuilt via map.
        if type(value) in (dict, defaultdict):
            for key, item in value.iteritems():
                value[key] = ln_to_float(item)
        elif type(value) == list:
            value = map(ln_to_float, value)
        elif type(value) == tuple:
            value = tuple(map(ln_to_float, value))
        elif isinstance(value, LogNum):
            return float(value)
        return value

    with open(new_model, 'w') as f:
        json.dump({"model": ln_to_float(hmm.toJSON())}, f,
                  sort_keys=True, indent=4)
def load_model(self, fname):
    """Load an HMM from *fname* and index its states by one-char code.

    Stores the filename, the loaded model, and a mapping from each
    state's `onechar` code to its index in the state list.
    """
    loader = HMMLoader(float)
    register_classifier_states(loader)
    register_annotation_states(loader)
    register_cannotation_states(loader)
    self.fname = fname
    self.model = loader.load(fname)
    # Later states with a duplicate code overwrite earlier ones, exactly
    # as the equivalent assignment loop would.
    self.states_dict = {
        state.onechar: index
        for index, state in enumerate(self.model['model'].states)
    }
def setModel(self, model):
    """Set model or link to the model, so we have HMM generator.

    Accepts a model filename, an HMM instance, or a State; filenames and
    HMMs are reduced to their 'R' (repeat) state.

    @raise TypeError: if no State could be resolved from the argument
    """
    if type(model) == str:
        loader = HMMLoader(self.mathType)
        for state in loader.load(model)['model'].states:
            if state.onechar == 'R':
                model = state
    if isinstance(model, HMM):
        for state in model.states:
            if state.onechar == 'R':
                model = state
    if not isinstance(model, State):
        # BUG FIX: `raise "TODO"` is a string exception, which has been
        # illegal since Python 2.6 and itself raised a bare TypeError.
        # Raise an explicit, descriptive TypeError instead.
        raise TypeError(
            'model must be a filename, an HMM, or a State with an "R" state'
        )
    self.model = model
def __init__(
    self,
    preparer,
    state_class=SimpleMatchState,
    model='data/models/SimpleHMM2.js',
):
    """Store the preparer and copy emissions from the model's match state.

    Scans the states of the loaded model and takes the emissions of the
    first state that is an instance of *state_class*. If none matches,
    `self.emissions` is left unset (as before).
    """
    self._preparer = None
    self.preparer = preparer
    model_states = HMMLoader().load(model)['model'].states
    for candidate in model_states:
        if not isinstance(candidate, state_class):
            continue
        self.emissions = candidate.emissions
        break
def train(sequences, original_model, new_model):
    """Train the model's 'Repeat' state on sequences and write the result.

    @param sequences: JSON file holding the training sequences
    @param original_model: model file used as the starting point
    @param new_model: output path for the trained model JSON
    """
    hmm = HMMLoader(LogNum).load(original_model)['model']
    # Side output: dot rendering of the model for inspection.
    with open('mmm.dot', 'w') as f:
        f.write(model_to_dot(hmm))
    state_id = hmm.statenameToID['Repeat']
    with open(sequences) as f:
        samples = json.load(f)
    hmm.states[state_id].trainModel(samples)

    def ln_to_float(node):
        # Walk the JSON-ish tree converting LogNum leaves to floats.
        if type(node) in (dict, defaultdict):
            for key, child in node.iteritems():
                node[key] = ln_to_float(child)
        elif type(node) == list:
            node = map(ln_to_float, node)
        elif type(node) == tuple:
            node = tuple(map(ln_to_float, node))
        elif isinstance(node, LogNum):
            return float(node)
        return node

    serializable = {"model": ln_to_float(hmm.toJSON())}
    with open(new_model, 'w') as f:
        json.dump(serializable, f, sort_keys=True, indent=4)
def main():
    """Sample pairwise alignments from an HMM and write them as FASTA files.

    Repeats found during sampling are recorded alongside each alignment in
    a '.repeats' JSON file; the list of written files goes to --output_files.

    @return: 0 on success, 1 on invalid arguments
    """
    parser = argparse.ArgumentParser(description='Sample alignments.')
    parser.add_argument('output_file_template', type=str,
                        help="Template for output file. Have to contain " +
                             "string '{id}' as placeholder for sequence number.")
    parser.add_argument('--output_files', type=str, help="File where the " +
                        'list of output files will be written.', default='-')
    parser.add_argument('--model', type=str,
                        default='data/models/repeatHMM.js',
                        help="Model file")
    parser.add_argument('--bind_file', nargs='*', help='Replace filenames in ' +
                        'the input_file model.', default=[])
    parser.add_argument('--bind_constant', nargs='*', help='Replace constants' +
                        ' in the input_file model.', default=[])
    parser.add_argument('--bind_constant_file', nargs='*', help='Replace' +
                        ' constants in the input_file model.', default=[])
    parser.add_argument('n_samples', type=int, help='Number of samples.')
    parser.add_argument('seq1_length', type=int,
                        help='Length of first sequence.')
    parser.add_argument('seq2_length', type=int,
                        help='Length of second sequence.')
    parsed_arg = parser.parse_args()

    # ====== Validate input parameters =======================================
    if parsed_arg.output_file_template.count("{id}") < 1:
        # BUG FIX: the message used to say '"%d"' although the placeholder
        # actually checked for (and used below) is '{id}'.
        sys.stderr.write('ERROR: If sampling, output_file filename has to ' +
                         'contain at least one "{id}".\n')
        return 1
    if len(parsed_arg.bind_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding files, the number of arguments ' +
                         'has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants (as files), the number '
                         'of arguments has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants, the number of' +
                         ' arguments has to be divisible by 2\n')
        return 1

    # ====== Parse parameters ================================================
    output_filename = parsed_arg.output_file_template
    output_files_filename = parsed_arg.output_files
    output_files = list()

    # ====== Load model ======================================================
    loader = HMMLoader()
    # BUG FIX: this loop used to range over bind_constant while indexing
    # bind_file, so file bindings were applied only when an unrelated
    # option happened to be set (and could index out of range).
    for i in range(0, len(parsed_arg.bind_file), 2):
        loader.addFile(parsed_arg.bind_file[i], parsed_arg.bind_file[i + 1])
    for i in range(0, len(parsed_arg.bind_constant_file), 2):
        loader.addConstant(
            parsed_arg.bind_constant_file[i],
            loader.load(parsed_arg.bind_constant_file[i + 1])
        )
    for i in range(0, len(parsed_arg.bind_constant), 2):
        loader.addConstant(
            parsed_arg.bind_constant[i],
            loader.loads(parsed_arg.bind_constant[i + 1]),
        )
    model_filename = parsed_arg.model
    PHMM = loader.load(model_filename)["model"]

    # ====== Sample ==========================================================
    PHMM.buildSampleTransitions()
    n_samples = parsed_arg.n_samples
    X_len = parsed_arg.seq1_length
    Y_len = parsed_arg.seq2_length
    dirname = os.path.dirname(output_filename)
    # ROBUSTNESS: dirname is '' when the template has no directory part;
    # os.makedirs('') would raise, so guard on a non-empty dirname.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    for i in range(n_samples):
        done = False
        # Resample until at least one tandem repeat was emitted.
        while not done:
            tandemRepeats = {'sequence1': [], 'sequence2': []}
            sampled = PHMM.generateSequence((X_len, Y_len))
            X = ""
            Y = ""
            A = ""
            for (emission, state) in sampled:
                ann_data = None
                if len(emission) == 2:
                    x, y = emission
                else:
                    x, y, ann_data = emission
                dx, dy = len(x), len(y)
                # FIX: identity comparison with None (was `!= None`).
                if ann_data is not None:
                    # Positions are counted in ungapped coordinates.
                    xlen = len(X.replace('-', ''))
                    ylen = len(Y.replace('-', ''))
                    if dx > 0:
                        tandemRepeats['sequence1'].append((
                            xlen, xlen + dx, dx / ann_data[1], ann_data[0], x
                        ))
                        done = True
                    if dy > 0:
                        tandemRepeats['sequence2'].append((
                            ylen, ylen + dy, dy / ann_data[2], ann_data[0], y
                        ))
                        done = True
                A += PHMM.states[state].getChar() * max(dx, dy)
                X += x + ('-' * (dy - dx))
                Y += y + ('-' * (dx - dy))
            #if len(X) - X.count('-') > 2 * X_len:
            #    done = False
            #if len(Y) - Y.count('-') > 2 * Y_len:
            #    done = False
            aln = [("sequence1", X), ("alignment", A), ("sequence2", Y)]
            json.dump(tandemRepeats,
                      Open(output_filename.format(id=i) + '.repeats', 'w'),
                      indent=4)
            Fasta.save(aln, output_filename.format(id=i))
        # Record each output file once per sample (retries overwrite the
        # same path, so the final accepted sample is what remains on disk).
        output_files.append(output_filename.format(id=i))
    with Open(output_files_filename, 'w') as output_file_object:
        json.dump(output_files, output_file_object, indent=4)
    return 0
def __init__(self, sequence_regexp, loader=None):
    """Remember the sequence-name regexps and the model loader.

    @param sequence_regexp: pair of regexps (x, y) used to pick the two
        sequences by name
    @param loader: optional pre-configured loader; a fresh HMMLoader with
        annotations registered is created when omitted
    """
    # BUG FIX: a caller-supplied loader was previously ignored —
    # self.loader was only assigned in the `loader is None` branch, so
    # any later self.loader use crashed with AttributeError.
    if loader is None:
        loader = HMMLoader()
        register_annotations(loader)
    self.loader = loader
    self.x_regexp = sequence_regexp[0]
    self.y_regexp = sequence_regexp[1]
class AnnotationLoader:
    """Loads per-sequence annotations (BED interval tracks) for a model.

    Sequences are selected from the model by the two regexps given at
    construction; annotations are exposed as intervalmaps for fast lookup.
    """

    def __init__(self, sequence_regexp, loader=None):
        """
        @param sequence_regexp: pair of regexps (x, y) picking sequence names
        @param loader: optional pre-configured loader; a fresh HMMLoader with
            annotations registered is created when omitted
        """
        # BUG FIX: a caller-supplied loader was previously ignored —
        # self.loader was only set in the `loader is None` branch, so
        # get_annotations() crashed with AttributeError in that case.
        if loader is None:
            loader = HMMLoader()
            register_annotations(loader)
        self.loader = loader
        self.x_regexp = sequence_regexp[0]
        self.y_regexp = sequence_regexp[1]

    @staticmethod
    def get_annotation_at(annotations, i):
        """
        Returns annotations at position i
        @param annotations: mapping of annotation name -> indexable track
        @param i: position to sample; returns {} when annotations is None
        """
        base_annotation = dict()
        if annotations is not None:
            for key in annotations:
                base_annotation[key] = annotations[key][i]
        return base_annotation

    def _intervals_to_interval_map(self, intervals, offset):
        """
        Converts intervals from track to intervalmap, for searching
        currently supports binary annotations only
        """
        m = intervalmap()
        m[:] = 0
        # i[1]/i[2] are the start/end fields of a track row.
        for i in intervals:
            m[i[1] + offset:i[2] + offset] = 1
        return m

    def _get_annotation_from_bed(self, fname, offset):
        """
        Reads intervals from BED file; falls back to an empty map on any
        read failure (deliberate best-effort behavior).
        """
        try:
            with track.load(fname) as ann:
                ann = ann.read(fields=['start', 'end'])
                intervals = self._intervals_to_interval_map(ann, offset)
        except Exception:
            intervals = self._intervals_to_interval_map([], 0)
        return intervals

    def _get_sequence_annotations(self, annotations,
                                  sequence_annotations_config):
        """
        Returns annotations for one sequence
        """
        res = dict()
        for annotation in annotations:
            res[annotation] = self._get_annotation_from_bed(
                *sequence_annotations_config[annotation])
        return res

    def _get_seq_name(self, names, regexp):
        # Exactly one name must match the regexp, otherwise configuration
        # is ambiguous or wrong.
        r = re.compile(regexp)
        matches = [name for name in names if r.match(name)]
        if len(matches) != 1:
            raise RuntimeError('Cannot get name for regexp', regexp,
                               '. Found', len(matches), 'matches.')
        return matches[0]

    def get_annotations_from_model(self, model):
        """Return (all, x, y) annotations resolved from *model*."""
        if not constants.annotations_enabled:
            return None, None, None
        if model is None:
            raise RuntimeError('No annotation model!')
        names = model.sequences.keys()
        x_name = self._get_seq_name(names, self.x_regexp)
        y_name = self._get_seq_name(names, self.y_regexp)
        annotations = model.annotations
        # print 'Using annotations for x:', x_name
        annotations_x = self._get_sequence_annotations(
            annotations, model.sequences[x_name])
        # print 'Using annotations for y:', y_name
        annotations_y = self._get_sequence_annotations(
            annotations, model.sequences[y_name])
        return annotations, annotations_x, annotations_y

    def get_annotations(self, fname):
        """Load a model file and return its annotations."""
        model = self.loader.load(fname)
        return self.get_annotations_from_model(model)
def get_model(args, filename, allow_mask=True):
    """Load a model from *filename*, applying command-line bindings.

    @param args: parsed namespace with mathType, bind_file,
        bind_constant_file, bind_constant, add_masked_to_distribution
    @param filename: model file to load
    @param allow_mask: when False, never add soft masking
    @return: the loaded model (unwrapped from a {'model': ...} dict)
    """
    loader = HMMLoader(args.mathType)  # TODO: rename HMMLoader to ModelLoader
    register_classifier_states(loader)
    register_annotation_states(loader)
    register_cannotation_states(loader)
    register_annotations(loader)
    # Bindings arrive as flat [key, value, key, value, ...] lists.
    for i in range(0, len(args.bind_file), 2):
        loader.addFile(args.bind_file[i], args.bind_file[i + 1])
    for i in range(0, len(args.bind_constant_file), 2):
        loader.addConstant(args.bind_constant_file[i],
                           loader.load(args.bind_constant_file[i + 1]))
    # BUG FIX: previously ranged over bind_constant_file a second time, so
    # --bind_constant values were never applied (copy-paste error; the
    # sampler's main() shows the intended loop over bind_constant).
    for i in range(0, len(args.bind_constant), 2):
        loader.addConstant(
            args.bind_constant[i],
            loader.loads(args.bind_constant[i + 1]),
        )
    model = loader.load(filename)
    if type(model) is dict and 'model' in model:
        model = model["model"]
    if args.add_masked_to_distribution and allow_mask:
        model.add_soft_masking_to_distribution()
    return model
def main(model_file, additional_parameters, emmisions_file, transitions_file,
         repeat_consensus_file, repeat_length_file, trf_cover_file,
         output_file, simple_model):
    """Assemble an HMM model file from measured emission/transition stats.

    Reads TRF-coverage counts, emission and transition frequency files,
    injects them (plus additional_parameters) into the model template at
    model_file, and writes the resulting model JSON to output_file.

    @param simple_model: when True, skip all repeat-state parameters
    @return: output_file
    """
    loader = HMMLoader()
    with Open(trf_cover_file, 'r') as f:
        trf_cover = json.load(f)
    if not simple_model:
        # Probability that a segment is a repeat vs. a match segment.
        repeat_probability = (float(trf_cover['R_segment_count']) /
                              (trf_cover['R_segment_count'] + trf_cover['M_count']))
        repeat_count = sum([trf_cover[x] for x in ['RR', 'RM', 'MR']])
        repeat_repeat_probability = float(trf_cover['RR']) / repeat_count
        nothing_repeat_probability = float(trf_cover['MR']) / repeat_count
        repeat_nothing_probability = float(trf_cover['RM']) / repeat_count
        # NOTE(review): 'RR' is hard-coded to 0.0 here and the computed
        # repeat_repeat_probability above is left unused; the sibling copy
        # of this function uses repeat_repeat_probability instead — confirm
        # which variant is intended.
        loader.addDictionary('trackemi', {"value": {
            'RR': 0.0,  # repeat_repeat_probability,
            'RM': repeat_nothing_probability,
            'MR': nothing_repeat_probability,
        }})
    for k, v in additional_parameters.iteritems():
        loader.addDictionary(k, v)

    # Parse emissions
    with Open(emmisions_file, 'r') as f:
        emm = normalize_dict(json.load(f))
    # Keys are string-encoded tuples, e.g. "('A', 'C')".
    emm = [(ast.literal_eval(k), v) for k, v in emm.iteritems()]
    loader.addDictionary('MatchStateEmissions', {'value': emm})
    # Marginalize pair emissions into single-residue background frequencies.
    background_prob = defaultdict(int)
    for ((r1, r2), v) in emm:
        background_prob[r1] += v
        background_prob[r2] += v
    background_prob = \
        {'value': list(normalize_dict(background_prob).iteritems())}
    loader.addDictionary('background-probability', background_prob)

    # Parse transitions
    with Open(transitions_file, 'r') as f:
        __trans = json.load(f)
    trans = dict()
    for k, v in __trans.iteritems():
        trans[''.join(ast.literal_eval(k))] = v
    trans = normalize_tuple_dict(trans)
    if not simple_model:
        # Rescale base transitions to make room for the repeat state, then
        # wire the repeat state in/out with the measured repeat probability.
        for k in trans:
            trans[k] *= (1 - repeat_probability)
        trans['MR'] = repeat_probability
        trans['XR'] = repeat_probability
        trans['YR'] = repeat_probability
        trans['RR'] = repeat_probability
        trans['RX'] = (1 - repeat_probability) / 3
        trans['RY'] = (1 - repeat_probability) / 3
        trans['RM'] = (1 - repeat_probability) / 3
    loader.addDictionary('trans', trans)

    # Parse emissions from trf
    if not simple_model:
        # consensus.js is referenced relative to the model file's directory.
        loader.addFile('consensus.js',
                       os.path.relpath(os.path.abspath(repeat_consensus_file),
                                       os.path.dirname(model_file)))
        loader.addFile('repeatlength.js', os.path.abspath(repeat_length_file))
    model = loader.load(model_file)
    json_prep = {'model': model['model'].toJSON()}
    with Open(output_file, 'w') as f:
        json.dump(json_prep, f, indent=4)
    return output_file
def main(model_file, additional_parameters, emmisions_file, transitions_file,
         repeat_consensus_file, repeat_length_file, trf_cover_file,
         output_file, simple_model):
    """Assemble an HMM model file from measured emission/transition stats.

    Reads TRF-coverage counts, emission and transition frequency files,
    injects them (plus additional_parameters) into the model template at
    model_file, and writes the resulting model JSON to output_file.

    @param simple_model: when True, skip all repeat-state parameters
    @return: output_file
    """
    loader = HMMLoader()
    with Open(trf_cover_file, 'r') as f:
        trf_cover = json.load(f)
    if not simple_model:
        # Probability that a segment is a repeat vs. a match segment.
        repeat_probability = (float(trf_cover['R_segment_count']) /
                              (trf_cover['R_segment_count'] + trf_cover['M_count']))
        repeat_count = sum([trf_cover[x] for x in ['RR', 'RM', 'MR']])
        repeat_repeat_probability = float(trf_cover['RR']) / repeat_count
        nothing_repeat_probability = float(trf_cover['MR']) / repeat_count
        repeat_nothing_probability = float(trf_cover['RM']) / repeat_count
        # Track-emission probabilities for repeat/repeat, repeat/match and
        # match/repeat column pairs.
        loader.addDictionary('trackemi', {"value": {
            'RR': repeat_repeat_probability,
            'RM': repeat_nothing_probability,
            'MR': nothing_repeat_probability,
        }})
    for k, v in additional_parameters.iteritems():
        loader.addDictionary(k, v)

    # Parse emissions
    with Open(emmisions_file, 'r') as f:
        emm = normalize_dict(json.load(f))
    # Keys are string-encoded tuples, e.g. "('A', 'C')".
    emm = [(ast.literal_eval(k), v) for k, v in emm.iteritems()]
    loader.addDictionary('MatchStateEmissions', {'value': emm})
    # Marginalize pair emissions into single-residue background frequencies.
    background_prob = defaultdict(int)
    for ((r1, r2), v) in emm:
        background_prob[r1] += v
        background_prob[r2] += v
    background_prob = \
        {'value': list(normalize_dict(background_prob).iteritems())}
    loader.addDictionary('background-probability', background_prob)

    # Parse transitions
    with Open(transitions_file, 'r') as f:
        __trans = json.load(f)
    trans = dict()
    for k, v in __trans.iteritems():
        trans[''.join(ast.literal_eval(k))] = v
    trans = normalize_tuple_dict(trans)
    if not simple_model:
        # Rescale base transitions to make room for the repeat state, then
        # wire the repeat state in/out with the measured repeat probability.
        for k in trans:
            trans[k] *= (1 - repeat_probability)
        trans['MR'] = repeat_probability
        trans['XR'] = repeat_probability
        trans['YR'] = repeat_probability
        trans['RR'] = repeat_probability
        trans['RX'] = (1 - repeat_probability) / 3
        trans['RY'] = (1 - repeat_probability) / 3
        trans['RM'] = (1 - repeat_probability) / 3
    loader.addDictionary('trans', trans)

    # Parse emissions from trf
    if not simple_model:
        # consensus.js is referenced relative to the model file's directory.
        loader.addFile('consensus.js',
                       os.path.relpath(os.path.abspath(repeat_consensus_file),
                                       os.path.dirname(model_file)))
        loader.addFile('repeatlength.js', os.path.abspath(repeat_length_file))
    model = loader.load(model_file)
    json_prep = {'model': model['model'].toJSON()}
    with Open(output_file, 'w') as f:
        json.dump(json_prep, f, indent=4)
    return output_file
class AnnotationLoader:
    """Loads per-sequence annotations (BED interval tracks) for a model.

    Sequences are selected from the model by the two regexps given at
    construction; annotations are exposed as intervalmaps for fast lookup.
    """

    def __init__(self, sequence_regexp, loader=None):
        """
        @param sequence_regexp: pair of regexps (x, y) picking sequence names
        @param loader: optional pre-configured loader; a fresh HMMLoader with
            annotations registered is created when omitted
        """
        # BUG FIX: a caller-supplied loader was previously ignored —
        # self.loader was only set in the `loader is None` branch, so
        # get_annotations() crashed with AttributeError in that case.
        if loader is None:
            loader = HMMLoader()
            register_annotations(loader)
        self.loader = loader
        self.x_regexp = sequence_regexp[0]
        self.y_regexp = sequence_regexp[1]

    @staticmethod
    def get_annotation_at(annotations, i):
        """
        Returns annotations at position i
        @param annotations: mapping of annotation name -> indexable track
        @param i: position to sample; returns {} when annotations is None
        """
        base_annotation = dict()
        if annotations is not None:
            for key in annotations:
                base_annotation[key] = annotations[key][i]
        return base_annotation

    def _intervals_to_interval_map(self, intervals, offset):
        """
        Converts intervals from track to intervalmap, for searching
        currently supports binary annotations only
        """
        m = intervalmap()
        m[:] = 0
        # i[1]/i[2] are the start/end fields of a track row.
        for i in intervals:
            m[i[1] + offset:i[2] + offset] = 1
        return m

    def _get_annotation_from_bed(self, fname, offset):
        """
        Reads intervals from BED file; falls back to an empty map on any
        read failure (deliberate best-effort behavior).
        """
        try:
            with track.load(fname) as ann:
                ann = ann.read(fields=['start', 'end'])
                intervals = self._intervals_to_interval_map(ann, offset)
        except Exception:
            intervals = self._intervals_to_interval_map([], 0)
        return intervals

    def _get_sequence_annotations(
        self, annotations, sequence_annotations_config
    ):
        """
        Returns annotations for one sequence
        """
        res = dict()
        for annotation in annotations:
            res[annotation] = self._get_annotation_from_bed(
                *sequence_annotations_config[annotation]
            )
        return res

    def _get_seq_name(self, names, regexp):
        # Exactly one name must match the regexp, otherwise configuration
        # is ambiguous or wrong.
        r = re.compile(regexp)
        matches = [name for name in names if r.match(name)]
        if len(matches) != 1:
            raise RuntimeError(
                'Cannot get name for regexp', regexp,
                '. Found', len(matches), 'matches.'
            )
        return matches[0]

    def get_annotations_from_model(self, model):
        """Return (all, x, y) annotations resolved from *model*."""
        if not constants.annotations_enabled:
            return None, None, None
        if model is None:
            raise RuntimeError('No annotation model!')
        names = model.sequences.keys()
        x_name = self._get_seq_name(names, self.x_regexp)
        y_name = self._get_seq_name(names, self.y_regexp)
        annotations = model.annotations
        # print 'Using annotations for x:', x_name
        annotations_x = self._get_sequence_annotations(
            annotations, model.sequences[x_name]
        )
        # print 'Using annotations for y:', y_name
        annotations_y = self._get_sequence_annotations(
            annotations, model.sequences[y_name]
        )
        return annotations, annotations_x, annotations_y

    def get_annotations(self, fname):
        """Load a model file and return its annotations."""
        model = self.loader.load(fname)
        return self.get_annotations_from_model(model)