def birth_select_targets_for_next_lap(self, hmodel, SS, BirthResults):
    ''' Create plans for next lap's birth moves

    Increments self.LapsSinceLastBirth for each of the K current components,
    then asks BirthMove.select_birth_component for one target component per
    allowed birth (self.algParams['birth']['birthPerLap'] of them).

    Args
    -------
    hmodel : model object; hmodel.allocModel.K is the current number of comps
    SS : sufficient statistics handed to BirthMove.select_birth_component.
         May be None; when provided, SS.K must equal hmodel.allocModel.K
    BirthResults : results of earlier birth moves, forwarded to
                   self.birth_get_all_new_comps to find comps to exclude

    Returns
    -------
    BirthPlans : list of dicts,
                 each entry represents the plan for one future birth move.
                 On success: dict(ktarget=<selected comp>, Data=None)
                 On BirthProposalError: dict(ktarget=None, Data=None, msg=...)
    '''
    if SS is not None:
        assert hmodel.allocModel.K == SS.K
    K = hmodel.allocModel.K

    # Update counter for which components haven't been updated in a while
    for kk in range(K):
        self.LapsSinceLastBirth[kk] += 1

    # Ignore components that have just been added to the model.
    excludeList = self.birth_get_all_new_comps(BirthResults)

    # For each birth move, create a "plan"
    BirthPlans = list()
    for _ in range(self.algParams['birth']['birthPerLap']):
        try:
            ktarget = BirthMove.select_birth_component(
                SS, K=K,
                randstate=self.PRNG,
                excludeList=excludeList,
                doVerbose=False,
                lapsSinceLastBirth=self.LapsSinceLastBirth,
                **self.algParams['birth'])
            self.LapsSinceLastBirth[ktarget] = 0
            # Never target the same component twice within one lap
            excludeList.append(ktarget)
            Plan = dict(ktarget=ktarget, Data=None)
        except BirthMove.BirthProposalError as e:
            # Keep a placeholder plan so the failure message can be reported
            Plan = dict(ktarget=None, Data=None, msg=str(e))
        BirthPlans.append(Plan)
    # The original fell off the end here and handed None to the caller
    return BirthPlans
def birth_collect_target_subsample(self, Dchunk, LPchunk, BirthPlans):
    ''' Grow each plan's targeted subsample using data from one chunk

    Each plan in BirthPlans accumulates its target data across many batches.
    Data from Dchunk is only collected for plans that still need more.

    Args
    -------
    Dchunk : data for the current batch
    LPchunk : local parameters for the current batch
    BirthPlans : list of plan dicts with fields 'ktarget' and 'Data'

    Returns
    -------
    BirthPlans : the same list of planned births for the next lap,
                 updated in place to include data from Dchunk if needed
    '''
    import BirthMove

    for plan in BirthPlans:
        ktarget = plan['ktarget']
        if ktarget is None:
            # Component selection failed for this move; nothing to collect
            continue

        birthDefaults = self.algParams['birth']
        sampleKwargs = dict(**birthDefaults)

        # Decide whether this plan already holds enough data
        collected = plan['Data']
        if collected is not None:
            if hasattr(collected, 'nDoc'):
                if collected.nDoc >= birthDefaults['maxTargetSize']:
                    continue
                # Only request as many docs as are still missing
                sampleKwargs['maxTargetSize'] -= collected.nDoc
            elif collected.nObs >= birthDefaults['maxTargetObs']:
                continue

        # Draw a fresh subsample from the current batch
        freshData = BirthMove.subsample_data(
            Dchunk, LPchunk, ktarget,
            randstate=self.PRNG,
            **sampleKwargs)

        if freshData is None:
            # Only report "nothing found" if no earlier batch contributed
            if collected is None:
                plan['msg'] = "TargetData: No samples for target comp found."
            continue

        if collected is None:
            plan['Data'] = freshData
        else:
            collected.add_data(freshData)
        plan['msg'] = "TargetData: nObs %d" % (plan['Data'].nObs)
    return BirthPlans
def run_birth_move(self, hmodel, Data, SS, LP, lap):
    ''' Run a single birth move on hmodel, if one is scheduled for this lap

    Resets self.BirthLog to an empty list on every call. When a birth is
    attempted, the ids in MoveInfo['birthCompIDs'] are appended to it.

    Returns
    -------
    hmodel : model after the (possible) birth move
    LP : the input LP when no birth was attempted at this lap,
         None when a birth move was run
    '''
    import BirthMove  # avoid circular import

    self.BirthLog = list()
    if self.do_birth_at_lap(lap):
        birthArgs = self.algParams['birth']
        # Pick which component to target, then gather data assigned to it
        ktarget = BirthMove.select_birth_component(
            SS, randstate=self.PRNG, **birthArgs)
        subsample = BirthMove.subsample_data(
            Data, LP, ktarget, randstate=self.PRNG, **birthArgs)

        hmodel, SS, info = BirthMove.run_birth_move(
            hmodel, subsample, SS,
            ktarget=ktarget, randstate=self.PRNG, **birthArgs)
        self.print_msg(info['msg'])
        self.BirthLog.extend(info['birthCompIDs'])
        # Incoming local params are discarded once a birth has been run
        return hmodel, None
    return hmodel, LP
def birth_collect_target_subsample(self, Dchunk, LPchunk, BirthPlans):
    ''' Grow each plan's targeted subsample using data from one chunk

    Each plan in BirthPlans accumulates its target data across many batches.
    Data from Dchunk is only collected for plans that still need more.

    Args
    -------
    Dchunk : data for the current batch
    LPchunk : local parameters for the current batch
    BirthPlans : list of plan dicts with fields 'ktarget' and 'Data'

    Returns
    -------
    BirthPlans : the same list of planned births for the next lap,
                 updated in place to include data from Dchunk if needed
    '''
    import BirthMove

    for plan in BirthPlans:
        ktarget = plan['ktarget']
        if ktarget is None:
            # Component selection failed for this move; nothing to collect
            continue

        birthDefaults = self.algParams['birth']
        sampleKwargs = dict(**birthDefaults)

        # Decide whether this plan already holds enough data
        collected = plan['Data']
        if collected is not None:
            if hasattr(collected, 'nDoc'):
                if collected.nDoc >= birthDefaults['maxTargetSize']:
                    continue
                # Only request as many docs as are still missing
                sampleKwargs['maxTargetSize'] -= collected.nDoc
            elif collected.nObs >= birthDefaults['maxTargetObs']:
                continue

        # Draw a fresh subsample from the current batch
        freshData = BirthMove.subsample_data(
            Dchunk, LPchunk, ktarget,
            randstate=self.PRNG,
            **sampleKwargs)

        if freshData is None:
            # Only report "nothing found" if no earlier batch contributed
            if collected is None:
                plan['msg'] = "TargetData: No samples for target comp found."
            continue

        if collected is None:
            plan['Data'] = freshData
        else:
            collected.add_data(freshData)
        plan['msg'] = "TargetData: nObs %d" % (plan['Data'].nObs)
    return BirthPlans
def birth_select_targets_for_next_lap(self, hmodel, SS, BirthResults):
    ''' Create plans for next lap's birth moves

    Increments self.LapsSinceLastBirth for each of the K current components,
    then asks BirthMove.select_birth_component for one target component per
    allowed birth (self.algParams['birth']['birthPerLap'] of them).

    Args
    -------
    hmodel : model object; hmodel.allocModel.K is the current number of comps
    SS : sufficient statistics handed to BirthMove.select_birth_component.
         May be None; when provided, SS.K must equal hmodel.allocModel.K
    BirthResults : results of earlier birth moves, forwarded to
                   self.birth_get_all_new_comps to find comps to exclude

    Returns
    -------
    BirthPlans : list of dicts,
                 each entry represents the plan for one future birth move.
                 On success: dict(ktarget=<selected comp>, Data=None)
                 On BirthProposalError: dict(ktarget=None, Data=None, msg=...)
    '''
    if SS is not None:
        assert hmodel.allocModel.K == SS.K
    K = hmodel.allocModel.K

    # Update counter for which components haven't been updated in a while
    for kk in range(K):
        self.LapsSinceLastBirth[kk] += 1

    # Ignore components that have just been added to the model.
    excludeList = self.birth_get_all_new_comps(BirthResults)

    # For each birth move, create a "plan"
    BirthPlans = list()
    for _ in range(self.algParams['birth']['birthPerLap']):
        try:
            ktarget = BirthMove.select_birth_component(
                SS, K=K,
                randstate=self.PRNG,
                excludeList=excludeList,
                doVerbose=False,
                lapsSinceLastBirth=self.LapsSinceLastBirth,
                **self.algParams['birth'])
            self.LapsSinceLastBirth[ktarget] = 0
            # Never target the same component twice within one lap
            excludeList.append(ktarget)
            Plan = dict(ktarget=ktarget, Data=None)
        except BirthMove.BirthProposalError as e:
            # Keep a placeholder plan so the failure message can be reported
            Plan = dict(ktarget=None, Data=None, msg=str(e))
        BirthPlans.append(Plan)
    # The original fell off the end here and handed None to the caller
    return BirthPlans
def birth_create_new_comps(self, hmodel, SS, BirthPlans=None, Data=None):
    ''' Create new components

    Two modes of operation:
    * Data is None : execute each plan in BirthPlans, using the target data
      stored in that plan.
    * Data is not None : ignore BirthPlans and run one birth move on a random
      sample drawn from Data, with ktarget=-1.

    Args
    -------
    hmodel : bnpy HModel
    SS : bnpy SuffStatBag
    BirthPlans : list of plan dicts with fields 'ktarget', 'Data' and
                 (optionally) 'msg'. Defaults to no plans.
    Data : optional dataset to sample target data from directly

    Returns
    -------
    hmodel : bnpy HModel, with (possibly) new components
    SS : bnpy SuffStatBag, with (possibly) new components
    BirthResults : list of dictionaries, one entry per successful birth move.
                   Each entry is the MoveInfo dict returned by
                   BirthMove.run_birth_move with MoveInfo['didAddNew'] true.
    '''
    if Data is not None:
        birthParams = self.algParams['birth']
        if hasattr(Data, 'nDoc'):
            # Document data: optionally sample only from long-enough docs
            wordPerDocThr = birthParams['birthWordsPerDocThr']
            if wordPerDocThr > 0:
                nWordPerDoc = np.asarray(
                    Data.to_sparse_docword_matrix().sum(axis=1))
                candidates = np.flatnonzero(nWordPerDoc >= wordPerDocThr)
            else:
                candidates = None
            targetData = Data.get_random_sample(
                birthParams['maxTargetSize'],
                randstate=self.PRNG,
                candidates=candidates)
        else:
            targetData = Data.get_random_sample(
                birthParams['maxTargetObs'],
                randstate=self.PRNG)
        BirthPlans = [dict(Data=targetData, ktarget=-1)]
    elif BirthPlans is None:
        # None sentinel replaces the former shared mutable default list()
        BirthPlans = []

    nMoves = len(BirthPlans)
    BirthResults = list()
    msgTemplate = "%d/%d %s" if Data is None else "%d/%d BATCH %s"
    for moveID, Plan in enumerate(BirthPlans):
        # Unpack data for current move
        ktarget = Plan['ktarget']
        targetData = Plan['Data']

        if ktarget is None or targetData is None:
            # A plan for which no message was ever recorded should be
            # reported as skipped, not abort the lap with a KeyError
            msg = Plan.get('msg', "BIRTH skipped. No target data collected.")
        elif targetData.nObs < self.algParams['birth']['minTargetObs']:
            # Verify targetData large enough that birth would be productive
            msg = "BIRTH skipped. Target data too small (size %d)"
            msg = msg % (targetData.nObs)
        elif hasattr(targetData, 'nDoc') \
                and targetData.nDoc < self.algParams['birth']['minTargetSize']:
            msg = "BIRTH skipped. Target data too small (size %d)"
            msg = msg % (targetData.nDoc)
        else:
            hmodel, SS, MoveInfo = BirthMove.run_birth_move(
                hmodel, targetData, SS,
                randstate=self.PRNG,
                ktarget=ktarget,
                **self.algParams['birth'])
            msg = MoveInfo['msg']
            if MoveInfo['didAddNew']:
                BirthResults.append(MoveInfo)
                # NOTE(review): -1 presumably lets the per-lap "+= 1" update
                # bring brand-new comps to 0 -- confirm against caller
                for kk in MoveInfo['birthCompIDs']:
                    self.LapsSinceLastBirth[kk] = -1

        self.print_msg(msgTemplate % (moveID + 1, nMoves, msg))
    return hmodel, SS, BirthResults
def birth_create_new_comps(self, hmodel, SS, BirthPlans=None, Data=None):
    ''' Create new components

    Two modes of operation:
    * Data is None : execute each plan in BirthPlans, using the target data
      stored in that plan.
    * Data is not None : ignore BirthPlans and run one birth move on a random
      sample drawn from Data, with ktarget=-1.

    Args
    -------
    hmodel : bnpy HModel
    SS : bnpy SuffStatBag
    BirthPlans : list of plan dicts with fields 'ktarget', 'Data' and
                 (optionally) 'msg'. Defaults to no plans.
    Data : optional dataset to sample target data from directly

    Returns
    -------
    hmodel : bnpy HModel, with (possibly) new components
    SS : bnpy SuffStatBag, with (possibly) new components
    BirthResults : list of dictionaries, one entry per successful birth move.
                   Each entry is the MoveInfo dict returned by
                   BirthMove.run_birth_move with MoveInfo['didAddNew'] true.
    '''
    if Data is not None:
        birthParams = self.algParams['birth']
        if hasattr(Data, 'nDoc'):
            # Document data: optionally sample only from long-enough docs
            wordPerDocThr = birthParams['birthWordsPerDocThr']
            if wordPerDocThr > 0:
                nWordPerDoc = np.asarray(
                    Data.to_sparse_docword_matrix().sum(axis=1))
                candidates = np.flatnonzero(nWordPerDoc >= wordPerDocThr)
            else:
                candidates = None
            targetData = Data.get_random_sample(
                birthParams['maxTargetSize'],
                randstate=self.PRNG,
                candidates=candidates)
        else:
            targetData = Data.get_random_sample(
                birthParams['maxTargetObs'],
                randstate=self.PRNG)
        BirthPlans = [dict(Data=targetData, ktarget=-1)]
    elif BirthPlans is None:
        # None sentinel replaces the former shared mutable default list()
        BirthPlans = []

    nMoves = len(BirthPlans)
    BirthResults = list()
    msgTemplate = "%d/%d %s" if Data is None else "%d/%d BATCH %s"
    for moveID, Plan in enumerate(BirthPlans):
        # Unpack data for current move
        ktarget = Plan['ktarget']
        targetData = Plan['Data']

        if ktarget is None or targetData is None:
            # A plan for which no message was ever recorded should be
            # reported as skipped, not abort the lap with a KeyError
            msg = Plan.get('msg', "BIRTH skipped. No target data collected.")
        elif targetData.nObs < self.algParams['birth']['minTargetObs']:
            # Verify targetData large enough that birth would be productive
            msg = "BIRTH skipped. Target data too small (size %d)"
            msg = msg % (targetData.nObs)
        elif hasattr(targetData, 'nDoc') \
                and targetData.nDoc < self.algParams['birth']['minTargetSize']:
            msg = "BIRTH skipped. Target data too small (size %d)"
            msg = msg % (targetData.nDoc)
        else:
            hmodel, SS, MoveInfo = BirthMove.run_birth_move(
                hmodel, targetData, SS,
                randstate=self.PRNG,
                ktarget=ktarget,
                **self.algParams['birth'])
            msg = MoveInfo['msg']
            if MoveInfo['didAddNew']:
                BirthResults.append(MoveInfo)
                # NOTE(review): -1 presumably lets the per-lap "+= 1" update
                # bring brand-new comps to 0 -- confirm against caller
                for kk in MoveInfo['birthCompIDs']:
                    self.LapsSinceLastBirth[kk] = -1

        self.print_msg(msgTemplate % (moveID + 1, nMoves, msg))
    return hmodel, SS, BirthResults