def trainCGandEM(distInit, accumulate,
                 ps = d.getDefaultParamSpec(),
                 createAccEM = d.getDefaultCreateAcc(),
                 estimateTotAux = d.getDefaultEstimateTotAux(),
                 iterations = 5, length = -50, afterEst = None, verbosity = 0):
    """Re-estimates a distribution using conjugate gradients and EM.

    See the note in the docstring for this module for information on how the
    log likelihood is scaled. This scaling is presumed to have only a small
    impact on the dist returned by this function (via its impact on trainCG).
    """
    assert iterations >= 1

    dist = distInit
    for it in range(1, iterations + 1):
        if verbosity >= 1:
            print 'trainCGandEM: starting it =', it, 'of CG and EM'

        dist = (timed(trainCG) if verbosity >= 2 else trainCG)(
            dist, accumulate, ps = ps, length = length, verbosity = verbosity
        )
        dist, _, _, _ = expectationMaximization(
            dist, accumulate,
            createAcc = createAccEM,
            estimateTotAux = estimateTotAux,
            verbosity = verbosity
        )

        if afterEst is not None:
            afterEst(dist = dist, it = it)

        if verbosity >= 1:
            print 'trainCGandEM: finished it =', it, 'of CG and EM'
            print 'trainCGandEM:'

    return dist
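# Example usage (a minimal hypothetical sketch; `distInit` and `accumulate`
# stand in for a real initial dist and a function that feeds training data
# into an accumulator):
#
#     distTrained = trainCGandEM(distInit, accumulate,
#                                iterations = 3, verbosity = 1)
#
# Each iteration runs a conjugate-gradient pass (trainCG) followed by one EM
# re-estimation, and the re-estimated dist is returned.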
def addLayers(self, stateInit, questionGroups):
    """Computes all remaining layers of the decision tree."""
    splitInfoDict = dict()
    state = stateInit
    while True:
        addLayer = self.addLayer
        if self.verbosity >= 3:
            addLayer = timed(addLayer)
        state, splitInfoDictMore = addLayer(state, questionGroups)
        splitInfoDict.update(splitInfoDictMore)
        if self.verbosity >= 2:
            print 'cluster: added %s nodes' % len(splitInfoDictMore)
        if not splitInfoDictMore:
            break
    return splitInfoDict
def decisionTreeClusterDepthBased(clusteringSpec, labels, labelledAccChunks,
                                  createAcc):
    verbosity = clusteringSpec.verbosity
    accSummer1 = DepthBasedFirstLevelAccSummer(labelledAccChunks, createAcc)
    accSummer2 = SecondLevelAccSummer(createAcc)
    minCount = clusteringSpec.minCount
    leafEstimator = LeafEstimator(
        clusteringSpec.estimateTotAux,
        catchEstimationErrors=clusteringSpec.catchEstimationErrors)

    def getProtoRoot():
        return leafEstimator.est(accSummer1.all())
    if verbosity >= 3:
        getProtoRoot = timed(getProtoRoot)
    protoRoot = getProtoRoot()

    splitValuer = clusteringSpec.utilitySpec(protoRoot.dist, protoRoot.count,
                                             verbosity=verbosity)
    clusterer = DepthBasedClusterer(accSummer1, accSummer2, minCount,
                                    leafEstimator, splitValuer,
                                    clusteringSpec.goodThresh,
                                    verbosity=verbosity)
    if verbosity >= 1:
        print ('cluster: decision tree clustering with perLeafPenalty = %s and'
               ' minCount = %s' % (splitValuer.perLeafPenalty, minCount))
    questionGroups = removeTrivialQuestions(labels,
                                            clusteringSpec.questionGroups)

    stateInit = clusterer.getInitialState(labels, protoRoot)
    splitInfoDict = clusterer.addLayers(stateInit, questionGroups)
    dist, (aux, auxRat) = constructTree(splitInfoDict)

    if verbosity >= 1:
        countRoot = protoRoot.count
        # (FIXME : leaf computation relies on specific form of dist)
        print 'cluster: %s leaves' % len(dist.dist.distDict)
        print ('cluster: aux root = %s (%s) -> aux tree = %s (%s) (%s count)' %
               (protoRoot.aux / countRoot, d.Rat.toString(protoRoot.auxRat),
                aux / countRoot, d.Rat.toString(auxRat), countRoot))
    return dist
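# Example usage (hypothetical sketch; argument names are illustrative): given
# a clustering configuration object, the labels to cluster, chunks of
# per-label accumulator statistics in whatever form
# DepthBasedFirstLevelAccSummer expects, and a createAcc factory, the
# layer-by-layer clustering entry point above is called as:
#
#     dist = decisionTreeClusterDepthBased(clusteringSpec, labels,
#                                          labelledAccChunks, createAcc)
#
# and returns the clustered decision tree dist.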
def subTreeSplitInfoIter(self, stateInit):
    agenda = [stateInit]
    while agenda:
        state = agenda.pop()
        labels, questionGroups, answerSeq, protoNoSplit = state
        if self.verbosity >= 2:
            self.printNodeInfo(state)
        if self.verbosity >= 3:
            indent = ' ' + ''.join([('| ' if answer != 0 else ' ')
                                    for answer in answerSeq])
            computeBestSplit = timed(
                self.computeBestSplitAndStateAdj,
                msg='cluster:%schoose and perform split took' % indent)
        else:
            computeBestSplit = self.computeBestSplitAndStateAdj
        splitInfo, stateAdj = computeBestSplit(state)
        nextStates = self.getNextStates(stateAdj, splitInfo)
        agenda.extend(reversed(nextStates))
        yield answerSeq, splitInfo
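# Example usage (hypothetical sketch): subTreeSplitInfoIter is a generator
# that walks the tree depth-first (the agenda is used as a stack), yielding an
# (answerSeq, splitInfo) pair per node, where answerSeq is the sequence of
# question answers leading to that node. A caller could collect the splits as:
#
#     splitInfoDict = dict(clusterer.subTreeSplitInfoIter(stateInit))
#
# (`clusterer` and `stateInit` stand in for an instance of the enclosing
# class and its initial clustering state.)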
def synthComplete(self, dist, uttIds, method, synthOutDir, exptTag,
                  afterSynth=None, verbosity=1):
    synthAcousticSeqIo = feat.AcousticSeqIo(
        synthOutDir,
        [vsio.VecSeqIo(stream.order) for stream in self.streams],
        ["%s.%s" % (exptTag, stream.name) for stream in self.streams],
        [stream.encoder for stream in self.streams],
    )
    if verbosity >= 1:
        print "synth: synthesizing to", synthOutDir, "with tag", exptTag
    for uttId in uttIds:
        synthOutput = self.synth(dist, uttId, method)
        if afterSynth is not None:
            afterSynth(synthOutput=synthOutput, uttId=uttId, exptTag=exptTag)
        synthAcousticSeqIo.writeFiles(uttId, synthOutput)
    (timed(feat.doHtsDemoWaveformGeneration) if verbosity >= 1
     else feat.doHtsDemoWaveformGeneration)(
        self.scriptsDir,
        synthOutDir,
        basenames=[uttId + "." + exptTag for uttId in uttIds],
        logFile=os.path.join(synthOutDir, exptTag + ".log"),
    )
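# Example usage (hypothetical sketch; the receiver object and argument values
# are illustrative): synthComplete synthesizes each utterance, writes the
# per-stream output files via AcousticSeqIo, then runs HTS-demo-style
# waveform generation with its log written to "<exptTag>.log" in synthOutDir:
#
#     experiment.synthComplete(dist, uttIds, method, synthOutDir,
#                              exptTag="cg_em_3", verbosity=1)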