def __init__(self, *p):
    """Create the state with empty caches and unset model parameters.

    All positional arguments are forwarded unchanged to ``State.__init__``.
    Model parameters start as ``None`` and counters start at zero.
    """
    State.__init__(self, *p)
    self.hmm = None
    self.factory = None
    self.repeatGeneratorX = None
    self.repeatGeneratorY = None
    self.consensus = ""
    # Memoization tables: nested (dict of dicts) and flat variants for X and Y.
    self.memoizeX = defaultdict(dict)
    self.memoizeY = defaultdict(dict)
    self.memoizeXsimple = dict()
    self.memoizeYsimple = dict()
    self.consensusSampler = None
    self.durationSampler = None
    self.backgroundProbability = None
    self.time = None
    self.transitionMatrix = None
    self.consensusDistribution = None
    self.repeatLengthDistribution = None
    self.repProb = None
    self.modelversion = None
    # ``toJSON`` reads ``self.version``; define it here so serialising an
    # instance that was never loaded does not raise AttributeError.
    self.version = None
    self.trackEmissions = None
    self.x_count = 0
    self.y_count = 0
    self.cons_set = set()
    self.cons_list = list()
def load(self, dictionary):
    """Load the base state fields plus the ``durations`` list.

    ``State.load`` runs first; afterwards every entry of
    ``dictionary["durations"]`` is stored in ``self.durations`` as a tuple.

    Raises:
        ParseException: if ``dictionary`` has no ``"durations"`` key.
    """
    State.load(self, dictionary)
    if "durations" not in dictionary:
        raise ParseException("durations were not found in GeneralizedState")
    self.durations = [tuple(entry) for entry in dictionary["durations"]]
def test_state_loading(self):
    """Loading ``self.inputY`` into a State and dumping it must give ``self.Y``."""
    state = State()
    state.load(self.inputY)
    dumped = state.toJSON()
    expected = self.Y
    failure_message = "".join([
        "Loading and dumping to JSON does not ",
        " work: ",
        str(dumped),
        " != ",
        str(expected),
    ])
    self.assertDictEqual(dumped, expected, failure_message)
def load(self, dictionary):
    """Read the generic state data and the per-state duration entries.

    After delegating to ``State.load``, ``self.durations`` becomes a list
    holding a tuple for each item found under ``dictionary["durations"]``.

    Raises:
        ParseException: when the ``"durations"`` key is missing.
    """
    State.load(self, dictionary)
    has_durations = "durations" in dictionary
    if not has_durations:
        raise ParseException(
            "durations were not found in GeneralizedState")
    self.durations = list(map(tuple, dictionary["durations"]))
def load(self, dictionary):
    """Populate this state from ``dictionary`` and build its profile factory.

    Required keys: ``'backgroundprob'``, ``'time'``, ``'transitionmatrix'``.
    Optional keys: ``'consensusdistribution'``, ``'repeatlengthdistribution'``,
    ``'trackemissions'``, ``'version'`` (``'v1'`` when absent), ``'repprob'``.

    Raises:
        ParseException: if any of the required keys is missing.
    """
    State.load(self, dictionary)
    if 'backgroundprob' not in dictionary:
        raise ParseException("Backround probability was not found in state")
    # Each background entry is stored as a tuple.
    self.backgroundProbability = [tuple(x) for x in dictionary['backgroundprob']]
    if 'time' not in dictionary:
        raise ParseException('Time was not found in state')
    self.time = dictionary['time']
    if 'transitionmatrix' not in dictionary:
        raise ParseException('Transition matrix not found in state')
    self.transitionMatrix = dictionary['transitionmatrix']
    if 'consensusdistribution' in dictionary:
        self.consensusDistribution = default_dist(normalize_dict(
            dictionary['consensusdistribution'],
            mathType=self.mathType
        ))
    else:
        # No distribution supplied: every key looks up as mathType(1.0).
        self.consensusDistribution = defaultdict(lambda *x: self.mathType(1.0))
    if 'repeatlengthdistribution' in dictionary:
        tp = type(dictionary['repeatlengthdistribution'])
        # Exact type match: subclasses of dict other than defaultdict take
        # the else branch.
        if tp in [dict, defaultdict]:
            self.repeatLengthDistribution = \
                default_dist(normalize_dict(
                    dictionary['repeatlengthdistribution'],
                    mathType=self.mathType
                ))
        else:
            # presumably a distribution object exposing a ``p`` attribute;
            # verify against callers.
            self.repeatLengthDistribution = \
                dictionary['repeatlengthdistribution']
            self.repProb = self.repeatLengthDistribution.p
    if 'trackemissions' in dictionary:
        self.trackEmissions = dictionary['trackemissions']
    if 'version' in dictionary:
        self.version = dictionary['version']
    else:
        self.version = 'v1'
    # An explicit 'repprob' overrides any value taken from the distribution.
    if 'repprob' in dictionary:
        self.repProb = self.mathType(dictionary['repprob'])
    if self.version == 'v2':
        # For 'v2' the loaded track emissions and repeat-length distribution
        # are replaced by defaultdicts whose missing keys give mathType(1.0).
        self.trackEmissions = defaultdict(lambda *_: self.mathType(1.0))
        self.trackEmissions['MM'] = self.mathType(1.0)
        self.repeatLengthDistribution = defaultdict(lambda *_: self.mathType(1.0))
        self.repeatLengthDistribution[10] = self.mathType(1.0)
    self.factory = RepeatProfileFactory(self.mathType, self.version, self.repProb)
    # NOTE(review): the attribute is spelled ``backgroudProbability`` (no 'n');
    # confirm that RepeatProfileFactory reads this exact spelling.
    self.factory.backgroudProbability = self.backgroundProbability
    self.factory.time = self.time
    self.factory.transitionMatrix = self.transitionMatrix
def toJSON(self):
    """Return the JSON-ready dict for this state.

    Starts from ``State.toJSON`` and adds ``'backgroundprob'``, ``'time'``,
    ``'transitionmatrix'`` and ``'trackemissions'`` unconditionally. The keys
    ``'consensusdistribution'``, ``'repeatlengthdistribution'``, ``'version'``
    and ``'repprob'`` are added only when the matching attribute is not
    ``None``.
    """
    ret = State.toJSON(self)
    ret['backgroundprob'] = self.backgroundProbability
    ret['time'] = self.time
    ret['transitionmatrix'] = self.transitionMatrix
    # Identity checks against None: ``!=`` would dispatch to whatever
    # comparison the stored object defines.
    if self.consensusDistribution is not None:
        ret['consensusdistribution'] = \
            dist_to_json(self.consensusDistribution)
    if self.repeatLengthDistribution is not None:
        ret['repeatlengthdistribution'] = \
            dist_to_json(self.repeatLengthDistribution)
    ret['trackemissions'] = self.trackEmissions
    # ``version`` may be unset on an instance that was never loaded; treat
    # that the same as None instead of raising AttributeError.
    version = getattr(self, 'version', None)
    if version is not None:
        ret['version'] = version
    if self.repProb is not None:
        ret['repprob'] = float(self.repProb)
    return ret
def toJSON(self):
    """Return the base ``State`` JSON dict extended with ``"durations"``."""
    serialized = State.toJSON(self)
    serialized["durations"] = self.durations
    return serialized
def setUp(self):
    """Prepare the shared fixtures used by the tests.

    ``self.inputY`` and ``self.Y`` refer to the same state description, whose
    second emission key is converted from a list to a tuple. For every type
    in ``self.mathTypes``, ``self.inputHMMData`` gets an HMM description with
    three loaded states (White, Init, Black) and four transitions.
    """
    emission = [("A", 1.0), (["C", "D"], 0.5)]
    symbols, weight = emission[1]
    emission[1] = (tuple(symbols), weight)
    fixture = {
        "__name__": "State",
        "name": "name",
        "startprob": 1.0,
        "emission": emission,
        "endprob": 0.5
    }
    self.inputY = fixture
    self.Y = fixture

    # (name, start probability, emission table), in creation order.
    state_specs = (
        ("Init", 1.0, (("0", 0.5), ("1", 0.5))),
        ("White", 0.0, (("0", 0.05), ("1", 0.95))),
        ("Black", 0.0, (("0", 0.9), ("1", 0.1))),
    )

    self.inputHMMData = dict()
    for mathType in self.mathTypes:
        transitions = [
            {"from": "Init", "to": "White", "prob": mathType(0.3)},
            {"from": "Init", "to": "Black", "prob": mathType(0.7)},
            {"from": "White", "to": "White", "prob": mathType(1.0)},
            {"from": "Black", "to": "Black", "prob": mathType(1.0)}
        ]
        loaded = {}
        for state_name, start_probability, table in state_specs:
            state = State(mathType)
            state.load({
                "__name__": "State",
                "name": state_name,
                "startprob": start_probability,
                "endprob": 1.0,
                # Fresh list per state so nothing is shared between loads.
                "emission": list(table)
            })
            loaded[state_name] = state
        self.inputHMMData[mathType] = {
            "__name__": "HMM",
            "states": [loaded["White"], loaded["Init"], loaded["Black"]],
            "transitions": transitions
        }
def load(self, dictionary):
    """Load the base state fields and the ``'order'`` value.

    Raises:
        ParseException: if ``dictionary`` has no ``'order'`` key.
    """
    State.load(self, dictionary)
    if 'order' in dictionary:
        self.order = dictionary['order']
        return
    raise ParseException('order was not found in state')
def __init__(self, *p):
    """Forward all arguments to ``State.__init__`` and start with order 0."""
    State.__init__(self, *p)
    # Default until ``load`` supplies a value.
    self.order = 0
def toJSON(self):
    """Return the base ``State`` JSON dict extended with ``'order'``."""
    serialized = State.toJSON(self)
    serialized['order'] = self.order
    return serialized
def createProfileHMMv1(mathType, consensus, time, backgroundProb, trans):
    """Build the version-1 profile ``GeneralizedHMM`` for ``consensus``.

    For every position ``i`` of ``consensus`` four states are created:
    match ``m<i>`` (emission from ``JCModel(char, time, "ACGT")``), insert
    ``i<i>`` (emission ``backgroundProb``) and two zero-duration delete
    states ``1d<i>`` / ``2d<i>``. A trailing insert state ``i<len>`` and an
    ``Init`` state are appended, the model is loaded into a
    ``GeneralizedHMM`` and its states are reordered topologically.

    Args:
        mathType: number type passed to every state and to the HMM.
        consensus: indexable sequence of consensus characters.
        time: passed through to ``JCModel``.
        backgroundProb: emission table used by all insert states.
        trans: mapping of transition probabilities; the keys read are
            'MM', 'MI', 'MD', 'DD', 'DM', 'DI', 'II', 'IM', 'ID',
            '_M', '_I' and '_D'.

    Returns:
        The loaded and topologically reordered ``GeneralizedHMM``.
    """
    def edge(source, target, prob):
        # One transition record in the format consumed by the HMM loader.
        return {"from": source, "to": target, "prob": prob}

    def emitting(name, emission):
        # Regular emitting state (match or insert).
        state = State(mathType)
        state.load({
            "__name__": "State",
            "name": name,
            "startprob": 0.0,
            "emission": emission,
            "endprob": 1.0,
        })
        return state

    def silent(name, startprob, endprob):
        # Zero-duration state that emits the empty string.
        state = GeneralizedState(mathType)
        state.load({
            "__name__": "GeneralizedState",
            "name": name,
            "startprob": startprob,
            "emission": [("", 1.0)],
            "endprob": endprob,
            "durations": [(0, 1.0)],
        })
        return state

    length = len(consensus)
    states = []
    transitions = []
    for i, char in enumerate(consensus):
        cur = str(i)
        states.extend([
            emitting("m" + cur, JCModel(char, time, "ACGT")),
            emitting("i" + cur, backgroundProb),
            silent("1d" + cur, 0.0, 0.0),
            silent("2d" + cur, 0.0, 0.0),
        ])
        if i < length - 1:
            # Links from column i to column i + 1.
            nxt = str(i + 1)
            transitions.extend([
                edge("m" + cur, "m" + nxt, trans['MM']),
                edge("m" + cur, "i" + nxt, trans['MI']),
                edge("m" + cur, "1d" + nxt, trans['MD']),
                edge("1d" + cur, "1d" + nxt, trans['DD']),
                edge("1d" + cur, "m" + nxt, trans['DM']),
                edge("1d" + cur, "i" + nxt, trans['DI']),
                edge("2d" + cur, "2d" + nxt, trans['DD']),
                edge("2d" + cur, "m" + nxt, trans['DM']),
                edge("2d" + cur, "i" + nxt, trans['DI']),
            ])
        # Links that leave the insert state of column i.
        transitions.extend([
            edge("i" + cur, "i" + cur, trans['II']),
            edge("i" + cur, "m" + cur, trans['IM']),
            edge("i" + cur, "1d" + cur, trans['ID']),
        ])

    last = str(length - 1)
    tail = str(length)
    transitions.extend([
        edge("Init", "m0", trans['_M']),
        edge("Init", "i0", trans['_I']),
        edge("Init", "1d0", trans['_D']),
        edge("1d" + last, "m0", trans['_M']),
        edge("1d" + last, "i0", trans['_I']),
        edge("1d" + last, "2d0", trans['_D']),
        edge("m" + last, "Init", 1.0 - trans['MI']),
        edge("m" + last, "i" + tail, trans['MI']),
        edge("i" + tail, "i" + tail, trans['II']),
        edge("i" + tail, "Init", 1.0 - trans['II']),
        # The last 2d state has no delete successor, so its mass is split
        # between '_M' and '_I' only.
        edge("2d" + last, "m0", trans['_M'] / (trans['_M'] + trans['_I'])),
        edge("2d" + last, "i0", trans['_I'] / (trans['_M'] + trans['_I'])),
    ])

    states.append(emitting("i" + tail, backgroundProb))
    states.append(silent("Init", 1.0, 1.0))

    hmm = GeneralizedHMM(mathType)
    hmm.load({
        "__name__": "GeneralizedHMM",
        "states": states,
        "transitions": transitions,
    })
    hmm.reorderStatesTopologically()
    return hmm
def computeHints(self, realigner):
    """Run the base hint computation, then both precompute steps.

    ``realigner`` is handed unchanged to ``State.computeHints``,
    ``precomputeRepeatGenerators`` and ``precomputeEmissionCache``, in that
    order.
    """
    State.computeHints(self, realigner)
    # Generators are prepared before the emission cache is filled.
    self.precomputeRepeatGenerators(realigner)
    self.precomputeEmissionCache(realigner)
def test_state(self):
    """Exercise the State API for every number type in ``self.mathTypes``.

    Covers the duration generator, emission lookup, state id accessors,
    forward/reverse transitions with id remapping and clearing, and the
    start/end probabilities.
    """
    for numType in self.mathTypes:
        state = State(numType)
        state.load(self.inputY)

        # Duration
        actual = list(state.durationGenerator())
        expected = [(1, numType(1.0))]
        self.assertEqual(
            actual, expected,
            "HMM.durationGenerator() does not work: " +
            str(actual) + " != " + str(expected))

        # Emission
        expected = numType(1.0)
        actual = state.emission("AC", 0)
        self.assertAlmostEqual(
            actual, expected, delta=1e-7,
            msg="HMM.emission(\"AC\", 0) does not " +
                "work: " + str(actual) + " != " + str(expected))

        # State id round trip
        for state_id in range(4):
            state.setStateID(state_id)
            self.assertEqual(
                state.getStateID(), state_id,
                "HMM.set/getStateID({0}) is broken.".format(state_id))

        # Transitions and remapping
        transitions = [(1, 1.0), (2, 0.4), (3, 0.2), (4, 0.6)]
        id_map = {1: 2, 2: 3, 3: 4, 4: 5}
        for (target, prob) in transitions:
            state.addTransition(target, prob)
            state.addReverseTransition(target, prob)
        self.assertEqual(state.followingIDs(), transitions,
                         "HMM.?transitions are not working.")
        self.assertEqual(state.previousIDs(), transitions,
                         "HMM.?reverse transitions are not working.")

        state.remapIDs(id_map)
        transitions = [(id_map[target], prob)
                       for (target, prob) in transitions]
        self.assertEqual(state.followingIDs(), transitions,
                         "HMM.remapIDs() is not working.")
        self.assertEqual(state.previousIDs(), transitions,
                         "HMM.remapIDs() is not working.")

        state.clearTransitions()
        # The reverse ids are appended onto the list returned by
        # followingIDs(), exactly as a single combined emptiness check.
        leftover = state.followingIDs()
        leftover.extend(state.previousIDs())
        self.assertEqual(leftover, [],
                         "HMM.clearTransitions() is not working.")

        # Start and end probability
        self.assertAlmostEqual(state.getStartProbability(), 1.0, delta=1e-7,
                               msg="HMM.getStartProbability is broken.")
        self.assertAlmostEqual(state.getEndProbability(), 0.5, delta=1e-7,
                               msg="HMM.getEndProbability is broken.")
def __init__(self, *p):
    """Forward all arguments to ``State.__init__`` and start with no durations."""
    State.__init__(self, *p)
    # Empty until ``load`` fills it.
    self.durations = []
def build_model(consensus, modelParam):
    """Wrap the profile HMM for ``consensus`` with background/init states.

    The HMM comes from ``modelParam["modelFactory"].getHMM(consensus)``. A
    ``BackgroundState`` and a silent ``FinderInit`` state are added; the
    original start states become reachable from both of them, the original
    end states lead back to the background state, and the model is reordered
    topologically. The result is also dumped as JSON to
    ``submodels/<name>.js``, where ``<name>`` is the consensus itself or its
    md5 hex digest when the consensus is longer than 20 characters.

    Args:
        consensus: consensus string; also used as the cache key.
        modelParam: mapping with the keys ``"mathType"`` and
            ``"modelFactory"``.

    Returns:
        The extended model, or the cached one if ``consensus`` is already
        present in ``model_cache``.
    """
    global model_cache
    mathType = modelParam["mathType"]
    model_factory = modelParam["modelFactory"]
    if consensus in model_cache:
        return model_cache[consensus]
    model = model_factory.getHMM(consensus)
    repProb = model_factory.repProb
    # NOTE(review): the factory value is overridden by this constant right
    # away; confirm whether the hard-coded 0.01 is intentional.
    repProb = 0.01

    # Remember which states were entry/exit points before rewiring.
    original_init_states = []
    original_end_states = []
    for i, state in enumerate(model.states):
        if state.startProbability > 0:
            original_init_states.append(i)
        if state.endProbability > 0:
            original_end_states.append(i)

    background_state = State(mathType)
    background_state.load({
        "__name__": "State",
        "name": "BackgroundState",
        "startprob": 0.0,
        "emission": model_factory.backgroundProbability,
        "endprob": 1.0,
    })
    background_state_id = model.addState(background_state)

    init_state = GeneralizedState(mathType)
    init_state.load({
        "__name__": "GeneralizedState",
        "name": "FinderInit",
        "startprob": 1.0,
        "emission": [("", 1.0)],
        "endprob": 0.0,
        "durations": [(0, 1.0)]
    })
    init_state_id = model.addState(init_state)

    model.addTransition(
        init_state_id, background_state_id, mathType(1.0) - repProb
    )
    model.addTransition(
        background_state_id, background_state_id, mathType(1.0) - repProb
    )
    for i in original_init_states:
        prob = model.states[i].startProbability * repProb
        model.addTransition(init_state_id, i, prob)
        model.addTransition(background_state_id, i, prob)
        # Only FinderInit may start the model from now on.
        model.states[i].startProbability = mathType(0.0)
    for i in original_end_states:
        model.addTransition(
            i, background_state_id, model.states[i].endProbability
        )
    model.reorderStatesTopologically()
    # NOTE(review): ``model_cache`` is consulted above but is not filled
    # here, so this function itself never produces a cache hit.

    # Long consensus strings are replaced by their md5 digest so that the
    # file name stays short.
    nm = consensus
    if len(nm) > 20:
        raw = consensus if isinstance(consensus, bytes) \
            else consensus.encode("utf-8")
        nm = hashlib.md5(raw).hexdigest()

    def LogNumToJson(obj):
        # JSON fallback: a LogNum becomes "<float value> <raw value>".
        if isinstance(obj, LogNum):
            return '{0} {1}'.format(str(float(obj)), str(obj.value))
        raise TypeError

    with open('submodels/{0}.js'.format(nm), 'w') as f:
        json.dump(model.toJSON(), f, indent=4, sort_keys=True,
                  default=LogNumToJson)
    return model
def createProfileHMMv2(mathType, consensus, time, backgroundProb, trans):
    """Build the version-2 profile ``GeneralizedHMM`` for ``consensus``.

    For every position ``i`` of ``consensus`` four states are created:
    match ``m<i>`` (emission from ``JCModel(char, time, "ACGT")``), insert
    ``i<i>`` (emission ``backgroundProb``) and two zero-duration delete
    states ``1d<i>`` / ``2d<i>``. A trailing insert state ``i<len>``, an
    ``Init`` state and an ``End`` state are appended. The last ``2d`` state
    and every transition touching it are dropped before loading. After
    loading, each state's transitions are normalized and the states are
    reordered topologically.

    Args:
        mathType: number type passed to every state and to the HMM.
        consensus: non-empty indexable sequence of consensus characters.
        time: passed through to ``JCModel``.
        backgroundProb: emission table used by all insert states.
        trans: mapping of transition probabilities; the keys read are
            'MM', 'MI', 'MD', 'DD', 'DM', 'DI', 'II', 'IM', 'ID',
            '_M', '_I', '_D', '_E', 'DRM', 'DRE', 'DRI', 'DRD',
            'MR_', 'MRE', 'MRI', 'IRI', 'IR_' and 'IRE'.

    Returns:
        The loaded, normalized and topologically reordered ``GeneralizedHMM``.

    Raises:
        ValueError: if ``consensus`` is ``None`` or empty.
    """
    if consensus is None or len(consensus) == 0:
        # A bare string cannot be raised; use a real exception type.
        raise ValueError("Wrong consensus: {}".format(consensus))

    def edge(source, target, prob):
        # One transition record in the format consumed by the HMM loader.
        return {"from": source, "to": target, "prob": prob}

    def emitting(name, emission):
        # Regular emitting state (match or insert).
        state = State(mathType)
        state.load({
            "__name__": "State",
            "name": name,
            "startprob": 0.0,
            "emission": emission,
            "endprob": 0.0,
        })
        return state

    def silent(name, startprob, endprob):
        # Zero-duration state that emits the empty string.
        state = GeneralizedState(mathType)
        state.load({
            "__name__": "GeneralizedState",
            "name": name,
            "startprob": startprob,
            "emission": [("", 1.0)],
            "endprob": endprob,
            "durations": [(0, 1.0)],
        })
        return state

    length = len(consensus)
    states = []
    transitions = []
    for i, char in enumerate(consensus):
        cur = str(i)
        states.extend([
            emitting("m" + cur, JCModel(char, time, "ACGT")),
            emitting("i" + cur, backgroundProb),
            silent("1d" + cur, 0.0, 0.0),
            silent("2d" + cur, 0.0, 0.0),
        ])
        if i < length - 1:
            # Links from column i to column i + 1.
            nxt = str(i + 1)
            transitions.extend([
                edge("m" + cur, "m" + nxt, trans['MM']),
                edge("m" + cur, "i" + nxt, trans['MI']),
                edge("m" + cur, "1d" + nxt, trans['MD']),
                edge("1d" + cur, "1d" + nxt, trans['DD']),
                edge("1d" + cur, "m" + nxt, trans['DM']),
                edge("1d" + cur, "i" + nxt, trans['DI']),
                edge("2d" + cur, "2d" + nxt, trans['DD']),
                edge("2d" + cur, "m" + nxt, trans['DM']),
                edge("2d" + cur, "i" + nxt, trans['DI']),
            ])
        # Links that leave the insert state of column i.
        transitions.extend([
            edge("i" + cur, "i" + cur, trans['II']),
            edge("i" + cur, "m" + cur, trans['IM']),
            edge("i" + cur, "1d" + cur, trans['ID']),
        ])

    last = str(length - 1)
    tail = str(length)
    transitions.extend([
        edge("Init", "m0", trans['_M']),
        edge("Init", "i0", trans['_I']),
        edge("Init", "1d0", trans['_D']),
        edge("Init", "End", trans['_E']),
        edge("1d" + last, "m0", trans['DRM']),
        edge("1d" + last, "End", trans['DRE']),
        edge("1d" + last, "i0", trans['DRI']),
        edge("1d" + last, "2d0", trans['DRD']),
        edge("m" + last, "Init", trans['MR_']),
        edge("m" + last, "End", trans['MRE']),
        edge("m" + last, "i" + tail, trans['MRI']),
        edge("i" + tail, "i" + tail, trans['IRI']),
        edge("i" + tail, "Init", trans['IR_']),
        edge("i" + tail, "End", trans['IRE']),
    ])

    states.append(emitting("i" + tail, backgroundProb))
    states.append(silent("Init", 1.0, 0.0))
    states.append(silent("End", 0.0, 1.0))

    # The last 2d state has no outgoing transitions in this layout; remove
    # it together with every transition that references it.
    remstate = '2d' + last
    states = [state for state in states if state.stateName != remstate]
    transitions = [
        tran for tran in transitions
        if tran['to'] != remstate and tran['from'] != remstate
    ]

    hmm = GeneralizedHMM(mathType)
    hmm.load({
        "__name__": "GeneralizedHMM",
        "states": states,
        "transitions": transitions,
    })
    for state in hmm.states:
        state.normalizeTransitions()
    hmm.reorderStatesTopologically()
    return hmm