def load(self, dictionary):
    """Populate this state from a parsed model dictionary.

    Reads the mandatory keys 'backgroundprob', 'time' and
    'transitionmatrix', plus several optional keys
    ('consensusdistribution', 'repeatlengthdistribution',
    'trackemissions', 'version', 'repprob'), then builds and configures
    the RepeatProfileFactory used for emission computation.

    :param dictionary: dict parsed from the model file
    :raises ParseException: if a mandatory key is missing
    """
    State.load(self, dictionary)
    if 'backgroundprob' not in dictionary:
        # Fixed typo in the error message ("Backround" -> "Background").
        raise ParseException("Background probability was not found in state")
    self.backgroundProbability = [tuple(x) for x in dictionary['backgroundprob']]
    if 'time' not in dictionary:
        raise ParseException('Time was not found in state')
    self.time = dictionary['time']
    if 'transitionmatrix' not in dictionary:
        raise ParseException('Transition matrix not found in state')
    self.transitionMatrix = dictionary['transitionmatrix']
    if 'consensusdistribution' in dictionary:
        self.consensusDistribution = default_dist(normalize_dict(
            dictionary['consensusdistribution'], mathType=self.mathType
        ))
    else:
        # Fallback: every consensus symbol gets weight 1.0.
        self.consensusDistribution = defaultdict(lambda *x: self.mathType(1.0))
    if 'repeatlengthdistribution' in dictionary:
        tp = type(dictionary['repeatlengthdistribution'])
        if tp in [dict, defaultdict]:
            self.repeatLengthDistribution = \
                default_dist(normalize_dict(
                    dictionary['repeatlengthdistribution'],
                    mathType=self.mathType
                ))
        else:
            # Already a distribution object; use it as-is.
            self.repeatLengthDistribution = \
                dictionary['repeatlengthdistribution']
        # NOTE(review): assumes the distribution (including the
        # default_dist result) exposes a `.p` attribute -- confirm.
        self.repProb = self.repeatLengthDistribution.p
    if 'trackemissions' in dictionary:
        self.trackEmissions = dictionary['trackemissions']
    if 'version' in dictionary:
        self.version = dictionary['version']
    else:
        self.version = 'v1'
    if 'repprob' in dictionary:
        # An explicit 'repprob' overrides the value derived from the
        # repeat length distribution above.
        self.repProb = self.mathType(dictionary['repprob'])
    if self.version == 'v2':
        # v2 discards any loaded track emissions / repeat lengths and
        # uses degenerate defaults instead.
        self.trackEmissions = defaultdict(lambda *_: self.mathType(1.0))
        self.trackEmissions['MM'] = self.mathType(1.0)
        self.repeatLengthDistribution = defaultdict(lambda *_: self.mathType(1.0))
        self.repeatLengthDistribution[10] = self.mathType(1.0)
    # NOTE(review): if neither 'repeatlengthdistribution' nor 'repprob'
    # is present, self.repProb must have been set elsewhere (e.g. by
    # State.load) or this raises AttributeError -- confirm.
    self.factory = RepeatProfileFactory(self.mathType,
                                        self.version,
                                        self.repProb)
    # "backgroudProbability" (sic) is the attribute name the factory is
    # accessed by elsewhere in this file; keep the misspelling.
    self.factory.backgroudProbability = self.backgroundProbability
    self.factory.time = self.time
    self.factory.transitionMatrix = self.transitionMatrix
def improveModel(self, transitions, emissions):
    """Re-estimate this state's parameters from expected counts.

    Turns the insert-state emissions into a new background
    distribution, derives an evolutionary time from the match-state
    identical-pair probability (Jukes-Cantor style formula),
    row-normalizes the transition counts in place, and mirrors all
    three into the emission factory.

    :param transitions: dict of transition counts, mutated in place
        into probabilities (key minus its last character names the
        source state)
    :param emissions: dict with 'I' (insert) and 'M' (match) counts
    """
    self.clearCache()

    # Background probability: normalized insert-state emissions.
    background = list(normalize_dict(emissions['I'], self.mathType).iteritems())
    self.backgroundProbability = background
    self.factory.backgroudProbability = background

    # Evolutionary time from the probability of an identical pair.
    match_total = sum(emissions['M'].values())
    eqprob = emissions['M'][1] / match_total
    new_time = -3.0 / 4.0 * (math.log((self.mathType(4.0) * eqprob - 1.0) / 3.0))
    self.time = new_time
    self.factory.time = new_time

    # Row-normalize the transition counts in place so callers observe
    # the updated dict as well.
    row_sums = defaultdict(self.mathType)
    for key, count in transitions.iteritems():
        row_sums[key[:-1]] += count
    for key in transitions:
        transitions[key] /= row_sums[key[:-1]]
    self.transitionMatrix = transitions
    self.factory.transitionMatrix = transitions
def main(model_file, additional_parameters, emmisions_file, transitions_file,
         repeat_consensus_file, repeat_length_file, trf_cover_file,
         output_file, simple_model):
    """Assemble an HMM description and dump it to `output_file` as JSON.

    NOTE(review): this definition is dead code -- a near-identical
    `main` (differing only in the 'RR' track-emission value) is defined
    later in this file and shadows this one at import time.

    :param model_file: path to the HMM template loaded by HMMLoader
    :param additional_parameters: dict of extra loader dictionaries
    :param emmisions_file: (sic) JSON file with match-state emissions
    :param transitions_file: JSON file with transition counts
    :param repeat_consensus_file: repeat consensus distribution file
    :param repeat_length_file: repeat length distribution file
    :param trf_cover_file: JSON file with TRF coverage counts
    :param output_file: destination path for the serialized model
    :param simple_model: if true, skip all repeat-related parameters
    :return: output_file
    """
    loader = HMMLoader()
    # TRF coverage counts drive the repeat-related probabilities below.
    with Open(trf_cover_file, 'r') as f:
        trf_cover = json.load(f)
    if not simple_model:
        repeat_probability = (float(trf_cover['R_segment_count']) /
                              (trf_cover['R_segment_count'] + trf_cover['M_count']))
        repeat_count = sum([trf_cover[x] for x in ['RR', 'RM', 'MR']])
        repeat_repeat_probability = float(trf_cover['RR']) / repeat_count
        nothing_repeat_probability = float(trf_cover['MR']) / repeat_count
        repeat_nothing_probability = float(trf_cover['RM']) / repeat_count
        loader.addDictionary('trackemi', {"value": {
            # NOTE(review): 'RR' is hard-coded to 0.0 here and the
            # computed repeat_repeat_probability is unused; the later
            # duplicate `main` uses the computed value instead.
            'RR': 0.0,
            'RM': repeat_nothing_probability,
            'MR': nothing_repeat_probability,
        }})
    for k, v in additional_parameters.iteritems():
        loader.addDictionary(k, v)
    # Parse emissions: keys arrive as stringified tuples.
    with Open(emmisions_file, 'r') as f:
        emm = normalize_dict(json.load(f))
    emm = [(ast.literal_eval(k), v) for k, v in emm.iteritems()]
    loader.addDictionary('MatchStateEmissions', {'value': emm})
    # Background distribution: marginalize the pair emissions.
    background_prob = defaultdict(int)
    for ((r1, r2), v) in emm:
        background_prob[r1] += v
        background_prob[r2] += v
    background_prob = \
        {'value': list(normalize_dict(background_prob).iteritems())}
    loader.addDictionary('background-probability', background_prob)
    # Parse transitions: keys arrive as stringified tuples as well.
    with Open(transitions_file, 'r') as f:
        __trans = json.load(f)
    trans = dict()
    for k, v in __trans.iteritems():
        trans[''.join(ast.literal_eval(k))] = v
    trans = normalize_tuple_dict(trans)
    if not simple_model:
        # Reserve probability mass for entering the repeat state.
        for k in trans:
            trans[k] *= (1 - repeat_probability)
        trans['MR'] = repeat_probability
        trans['XR'] = repeat_probability
        trans['YR'] = repeat_probability
        trans['RR'] = repeat_probability
        trans['RX'] = (1 - repeat_probability) / 3
        trans['RY'] = (1 - repeat_probability) / 3
        trans['RM'] = (1 - repeat_probability) / 3
    loader.addDictionary('trans', trans)
    # Parse emissions from trf
    if not simple_model:
        # NOTE(review): original formatting was lost; placement of the
        # repeatlength.js call inside this branch matches its grouping
        # with the other repeat-state file -- confirm against history.
        loader.addFile('consensus.js',
                       os.path.relpath(os.path.abspath(repeat_consensus_file),
                                       os.path.dirname(model_file)))
        loader.addFile('repeatlength.js', os.path.abspath(repeat_length_file))
    model = loader.load(model_file)
    json_prep = {'model': model['model'].toJSON()}
    with Open(output_file, 'w') as f:
        json.dump(json_prep, f, indent=4)
    return output_file
def main(model_file, additional_parameters, emmisions_file, transitions_file,
         repeat_consensus_file, repeat_length_file, trf_cover_file,
         output_file, simple_model):
    """Compose an HMM model description and serialize it to JSON.

    Loads TRF coverage counts, match-state emissions and transition
    counts, optionally augments them with repeat-state parameters, and
    writes the loaded model to `output_file`.

    :param model_file: path to the HMM template loaded by HMMLoader
    :param additional_parameters: dict of extra loader dictionaries
    :param emmisions_file: (sic) JSON file with match-state emissions
    :param transitions_file: JSON file with transition counts
    :param repeat_consensus_file: repeat consensus distribution file
    :param repeat_length_file: repeat length distribution file
    :param trf_cover_file: JSON file with TRF coverage counts
    :param output_file: destination path for the serialized model
    :param simple_model: if true, skip all repeat-related parameters
    :return: output_file
    """
    loader = HMMLoader()

    # TRF coverage counts drive the repeat-related probabilities.
    with Open(trf_cover_file, 'r') as handle:
        trf_cover = json.load(handle)

    if not simple_model:
        segment_total = trf_cover['R_segment_count'] + trf_cover['M_count']
        repeat_probability = float(trf_cover['R_segment_count']) / segment_total
        repeat_count = sum(trf_cover[key] for key in ('RR', 'RM', 'MR'))
        repeat_repeat_probability = float(trf_cover['RR']) / repeat_count
        nothing_repeat_probability = float(trf_cover['MR']) / repeat_count
        repeat_nothing_probability = float(trf_cover['RM']) / repeat_count
        loader.addDictionary('trackemi', {"value": {
            'RR': repeat_repeat_probability,
            'RM': repeat_nothing_probability,
            'MR': nothing_repeat_probability,
        }})

    for key, value in additional_parameters.iteritems():
        loader.addDictionary(key, value)

    # Match-state emissions: keys arrive as stringified tuples.
    with Open(emmisions_file, 'r') as handle:
        emissions = normalize_dict(json.load(handle))
    emissions = [(ast.literal_eval(key), prob)
                 for key, prob in emissions.iteritems()]
    loader.addDictionary('MatchStateEmissions', {'value': emissions})

    # Background distribution: marginalize the pair emissions.
    marginal = defaultdict(int)
    for (first, second), prob in emissions:
        marginal[first] += prob
        marginal[second] += prob
    loader.addDictionary('background-probability', {
        'value': list(normalize_dict(marginal).iteritems()),
    })

    # Transitions: keys arrive as stringified tuples as well.
    with Open(transitions_file, 'r') as handle:
        raw_transitions = json.load(handle)
    transition_probs = dict()
    for key, count in raw_transitions.iteritems():
        transition_probs[''.join(ast.literal_eval(key))] = count
    transition_probs = normalize_tuple_dict(transition_probs)
    if not simple_model:
        # Reserve probability mass for entering the repeat state.
        for key in transition_probs:
            transition_probs[key] *= (1 - repeat_probability)
        for src in ('M', 'X', 'Y', 'R'):
            transition_probs[src + 'R'] = repeat_probability
        for dst in ('X', 'Y', 'M'):
            transition_probs['R' + dst] = (1 - repeat_probability) / 3
    loader.addDictionary('trans', transition_probs)

    # Repeat-state emission files (consensus path is stored relative to
    # the model file's directory).
    if not simple_model:
        loader.addFile('consensus.js',
                       os.path.relpath(os.path.abspath(repeat_consensus_file),
                                       os.path.dirname(model_file)))
        loader.addFile('repeatlength.js', os.path.abspath(repeat_length_file))

    model = loader.load(model_file)
    json_prep = {'model': model['model'].toJSON()}
    with Open(output_file, 'w') as handle:
        json.dump(json_prep, handle, indent=4)
    return output_file