def __init__(self, model, observations, l, nPrc):
    """Precompute the empirical distribution of observed l-sequences for GOF statistics."""
    self._model = model
    self._nPrc = nPrc

    # Implementation assumes only 2 emissions.
    assert model.nEmissions == 2

    # Histogram the observed l-sequences in parallel, then merge the
    # per-worker histograms into a single summary counter.
    counts = defaultdict(int)
    for workerHist in runParallel(partial(_calcObservedDist, l=l), observations):
        for key, cnt in workerHist.iteritems():
            counts[key] += cnt

    # Two parallel lists:
    # - _obsSequences: every observed l-sequence (as ObservedSequence objects)
    # - _obsProbs:     the matching empirical probabilities
    self._obsSequences = []
    self._obsProbs = []
    total = float(sum(counts.itervalues()))
    M = 2 ** l          # number of possible l-sequences
    halfM = 2 ** (l - 1)  # value of the most significant bit
    for code in counts.keys():
        # normalize the count of this sequence to a probability
        self._obsProbs.append(float(counts[code]) / total)
        # decode 'code' into its l binary emissions, most significant bit first
        bits = []
        assert code < M
        rem = code
        for _ in xrange(l):
            bit = rem / halfM  # py2 integer division: extracts the current msb
            assert bit in [0, 1]
            bits.append(bit)
            rem = (rem * 2) % M  # shift left, dropping the consumed msb
        self._obsSequences.append(ObservedSequence.fromEmissionsList(bits))

    writeOutput(
        'Input sequences contain %d distinct %d-sequences (used for GOF statistics)' % (len(self._obsProbs), l))
def _maximizeQ(self, hiddenState, initThetas):
    """Maximization step of Baum-Welch: numerically maximize Q(theta* | theta).

    Runs self._maxQSingleStartPoint from several start points in parallel and
    returns (maxTheta, maxFound) - the best theta found and its Q value.
    NOTE(review): if the parallel run returns an empty result list, maxTheta
    is never bound and the final return raises NameError - confirm runParallel
    always yields one result per input.
    """
    # TODO nStartPoints 290?
    # number of start points
    nStartPoints = 60 # TODO remove
    # NOTE(review): nStartPoints is currently unused - 'inputs' below is
    # hard-coded to two start points (first and last of initThetas).
    # -Q is a positive measure we're trying to minimize...
    refs = [-self._Q(t, hiddenState) for t in initThetas]
    for r in refs :
        assert r > 0
    # TODO move 'initTheta' to class field and see if it screws up performance.
    # initial points for optimizer; None is later converted to random init point
    # inputs = [self._thetaToVec(theta) for theta in initThetas] + [None for _ in xrange(nStartPoints - len(initThetas))]
    # inputs = [(x0, hiddenState) for x0 in inputs]
    inputs = [(self._thetaToVec(theta), hiddenState) for theta in [initThetas[0], initThetas[-1]]]
    # run self._maxQSingleStartPoint() on all items in inputs
    # Note: using partial(runMemberFunc, ...) to overcome Pool.map limitations on class methods.
    res = runParallel(runMemberFunc(self, '_maxQSingleStartPoint'), inputs)
    # scan the per-start-point results for the maximal Q value
    maxFound = -np.inf
    indices = []
    for i in xrange(len(res)):
        # TODO return to for theta, val in res
        theta, val = res[i]
        if val > maxFound:
            maxFound = val
            maxTheta = theta
        # Q is a (log) likelihood expectation, so every value must be negative
        assert val < 0.0
        # debug record: "start-point-index : value normalized by last reference"
        indices.append('{0}:{1}'.format(i, -val/refs[-1]))
    # TODO remove
    # debug output: reference Q values and rank of each start point's result
    writeOutput('reference vals: ' + ','.join(str(r/refs[-1]) for r in refs), filename = 'DBG')
    writeOutput('indices: ' + ','.join(str(v) for v in ss.rankdata(indices)), filename = 'DBG')
    return maxTheta, maxFound
# TODO make sure everuthing works with nPrc = 1 # read input flags args = parser.parse_args() assert args.iter > 0 assert args.par > 0 if args.gof is not None: assert min(args.gof) > 0 # Init output-writer process and processes pool initParallel(args.par, args.o) # log command line # TODO perhaps printOutput() ? writeOutput(" ".join(sys.argv)) writeOutput('BW steps will be spanned over %d processes' % args.par) # read input dir & match all input files... files = [] for inpPattern in args.input: pathName = os.path.dirname(inpPattern) if pathName == '': pathName = os.curdir for f in os.listdir(pathName): if fnmatch.fnmatch(f, os.path.basename(inpPattern)): files.append(pathName + '/' + f) # TODO proper error message if (a) file doesn't exist (b) file doesn't match format # read all input files (executed in parallel) assert len(files) > 0
def run(self, observations, nIterations, trueTheta = None, initTheta = None, gof = []):
    """Run nIterations of Baum-Welch EM on the observations.

    trueTheta: if given (simulated data), log likelihood/Q sanity checks against it.
    initTheta: starting parameters; inferred via self._initTheta when None.
    gof:       list of sequence lengths l for which GOF statistics are computed.
    NOTE(review): mutable default gof=[] is only rebound (never mutated), so it
    is harmless here, but fragile under future edits.
    """
    # initialize theta
    theta = initTheta
    if theta is None:
        theta = self._initTheta(observations)
    #TODO DBGME
    # debug trail of all thetas visited; passed to _maximizeQ as start points
    DBGME = [theta]
    # we expect the log likelihood at the next iteration to be higher than this
    bound = -np.inf
    # print model specifications (see __str__ below for details):
    writeOutput('Model specifications:', 'loop')
    writeOutput(self, 'loop')
    writeOutput('\n', 'loop')
    # statistics to be collected
    self._statsNames = ['logL', 'Q-Init', 'Q-Max']
    for l in gof:
        self._statsNames.append('G%d'%l)
    # initialize GOF classes (rebinds 'gof' from lengths to GOF instances)
    if len(gof) > 0:
        start = time.time()
        # NOTE(review): 'model' is not defined in this method's scope - it
        # resolves to a global; should this be self._model? Also, the GOF
        # constructor visible in this file takes an extra nPrc argument not
        # passed here - confirm against the actual GOF class used.
        gof = [GOF(model, observations, l) for l in gof]
        writeOutput('initialized gof statistics within %f seconds'%(time.time()-start))
    # print the log-likelihood of the data under the true parameters (if given; simulated data only)
    if trueTheta is not None:
        # use the forward-backward algorithm to calculate the log-likelihood of the observed sequence under trueTheta
        trueL = self._parallelExp(trueTheta, observations).logL
        # log True theta vals and statistics
        self._logVals('True parameters:', trueTheta, [trueL, '.', '.'], gof, target='DBG')
    for i in xrange(nIterations):
        writeOutput('starting BW iteration number %d'%(i + 1))
        # BW expectation step
        start = time.time()
        inferredHiddenState = self._parallelExp(theta, observations)
        writeOutput('finished BW exp step within %f seconds'%(time.time()-start))
        # sanity check: log(O|theta) has increased as expected in the last iteration
        if inferredHiddenState.logL < bound:
            writeOutput('WARNING **** BW error 1 %f %f'%(inferredHiddenState.logL, bound), 'ErrorLog')
        # sanity check (this is just Jensen's inequality...
        # Q(theta | theta) = E( log(P(O,Z|theta) ) <= log( E(P(O,Z|theta)) ) = log( P(O|theta) )
        Qtheta = self._Q(theta, inferredHiddenState)
        if Qtheta > inferredHiddenState.logL:
            writeOutput('WARNING **** BW error 2 %f %f'%(Qtheta, inferredHiddenState.logL), 'ErrorLog')
        # maximization step
        start = time.time()
        newTheta, Qmax = self._maximizeQ(inferredHiddenState, DBGME)
        writeOutput('finished BW max step within %f seconds'%(time.time()-start))
        # sanity check: max_thetaStar Q(thetaStar | theta) >= Q(theta | theta)
        qDiff = Qmax - Qtheta
        if qDiff < 0:
            writeOutput('WARNING **** BW error 3 %f %f'%(Qmax, Qtheta), 'ErrorLog')
        # the log likelihood of newTheta should be higher by at least qDiff
        # (this is the inequality you get in the standard proof showing EM converges to a local maximum)
        bound = inferredHiddenState.logL + qDiff
        # sanity check for simulated data: verify that Qmax > Q(truetheta); This just helps convince us that the maximizer did converge.
        if trueTheta is not None:
            QTrue = self._Q(trueTheta, inferredHiddenState)
            if QTrue > Qmax:
                writeOutput('WARNING **** BW error 4 %f %f'%(QTrue, Qmax), 'ErrorLog')
        # log iteration
        self._logVals('After %d iterations:'%i, theta, [inferredHiddenState.logL, Qtheta, Qmax], gof)
        # update theta
        theta = newTheta
        DBGME.append(theta)
    # log final value of theta (for which some statistics are not calculated)
    self._logVals('After %d iterations:'%nIterations, theta, ['.', '.', '.'], gof)
def _logVals(self, header, theta, stats, gof, target = 'loop'):
    """Write a header, theta, and a labeled row of statistics to the output target.

    Appends one G-statistic per GOF instance onto 'stats' (mutating the
    caller's list) before printing; after that, len(stats) must match
    self._statsNames exactly.
    """
    # header block, followed by the current theta
    writeOutput(header, target)
    writeOutput('\n', target)
    writeOutput(theta, target)
    writeOutput('\n', target)
    # compute goodness-of-fit statistics (timed), extending the stats row
    if len(gof) > 0:
        t0 = time.time()
        for gofCalc in gof:
            stats.append(gofCalc.G(theta))
        writeOutput('calculated gof statistics within %f seconds' % (time.time() - t0))
    assert len(self._statsNames) == len(stats)
    # row format: one left-aligned, 24-char-wide column per statistic
    rowFmt = '\t' + ''.join('{%d:<24}' % col for col in range(len(self._statsNames)))
    writeOutput('Statistics:', target)
    writeOutput(rowFmt.format(*self._statsNames), target)
    writeOutput(rowFmt.format(*stats), target)
    writeOutput('\n', target)