parser.add_argument('--doPlotELBO', type=int, default=0) parser.add_argument('--doPlotComps', type=int, default=1) parser.add_argument('--ktarget', type=int, default=10) parser.add_argument('--kabsorbList', type=str, default='all') parser.add_argument('--verbose', type=int, default=True) parser.add_argument('--outputdir', type=str, default='/tmp/') parser.add_argument('--nUpdateSteps', type=int, default=25) parser.add_argument('--nELBOSteps', type=int, default=1) parser.add_argument('--d_initWordCounts', type=str, default='none') parser.add_argument('--d_initTargetDocTopicCount', type=str, default="cold_start") args = parser.parse_args() DLogger.configure(args.outputdir, doSaveToDisk=0, doWriteStdOut=1) tryDeleteProposalForSavedTask(**args.__dict__) ''' if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--doPlotELBO', type=int, default=1) parser.add_argument('--doPlotComps', type=int, default=0) parser.add_argument('--ktarget', type=int, default=10) parser.add_argument('--kabsorbList', type=str, default='all') parser.add_argument('--initname', type=str, default='truelabelsandjunk') parser.add_argument('--K', type=int, default=10) parser.add_argument('--nLap', type=int, default=1) args = parser.parse_args() ktarget = args.ktarget
def selectCandidateDeleteComps(
        hmodel, SS,
        MoveRecordsByUID=None,
        MovePlans=None,
        lapFrac=0,
        **DArgs):
    ''' Select specific comps to target with delete move.

    Picks the single largest eligible cluster as the delete target,
    and marks every other cluster (including too-big and past-failure
    clusters) as a possible absorber of its mass.

    Parameters
    ----------
    hmodel : model object (unused here; kept for interface compatibility)
    SS : sufficient-statistics object with fields K, uids and
        methods getCountVec(), uid2k()
    MoveRecordsByUID : dict or None
        Maps uid -> defaultdict(int) of per-uid move history.
        Mutated in place; a fresh dict is created when None.
        NOTE: previously defaulted to a shared mutable ``dict()``,
        which leaked failure records across unrelated calls.
    MovePlans : dict or None
        Mutated in place and returned; a fresh dict is created when None.
    lapFrac : float, current lap (used for reactivation timing)
    **DArgs : delete-move hyperparameters, must include
        d_maxNumAtomsForTargetComp, d_minPercChangeInNumAtomsToReactivate,
        d_nLapToReactivate

    Returns
    -------
    MovePlans : dict, with fields
    * d_targetUIDs : list of ints
    * d_absorbingUIDSet : set of ints, all uids that can absorb target mass
    OR
    * failMsg : string explaining why building list of eligible UIDs failed
    '''
    # Fix for mutable-default-argument bug: allocate fresh dicts per call.
    if MoveRecordsByUID is None:
        MoveRecordsByUID = dict()
    if MovePlans is None:
        MovePlans = dict()

    DLogger.pprint("PLANNING delete at lap %.2f" % (lapFrac))
    K = SS.K
    availableUIDs = set(SS.uids)
    # Need at least one target and one absorber.
    if len(availableUIDs) < 2:
        DLogger.pprint(
            "Delete proposal requires at least 2 available UIDs.\n" +
            " Need 1 uid to target, and at least 1 to absorb." +
            " Only have %d total uids in the model." % (len(availableUIDs)))
        failMsg = "Ineligible. Did not find >= 2 UIDs in entire model."
        return dict(failMsg=failMsg)

    uidsBusyWithOtherMoves = set()
    # NOTE(review): the block below is deliberately disabled (dead string
    # literal, not executed). It used to exclude uids already claimed by
    # merge pairs or birth shortlists. Kept verbatim for reference.
    '''
    if 'm_UIDPairs' in MovePlans:
        for (uidA, uidB) in MovePlans['m_UIDPairs']:
            availableUIDs.discard(uidA)
            availableUIDs.discard(uidB)
            uidsBusyWithOtherMoves.add(uidA)
            uidsBusyWithOtherMoves.add(uidB)
    if 'b_shortlistUIDs' in MovePlans:
        for uid in MovePlans['b_shortlistUIDs']:
            availableUIDs.discard(uid)
            uidsBusyWithOtherMoves.add(uid)
    if len(availableUIDs) < 2:
        DLogger.pprint("Delete requires at least 2 UIDs" + \
            " not occupied by merge or birth.\n" + \
            " Need 1 uid to target, and at least 1 to absorb.\n" + \
            " Only have %d total uids eligible." % (len(availableUIDs)))
        failMsg = "Ineligible. Too many uids occupied by merge or shortlisted for birth."
        return dict(failMsg=failMsg)
    '''

    # Compute score for each eligible state
    countVec = np.maximum(SS.getCountVec(), 1e-100)
    eligibleUIDs = list()
    tooBigUIDs = list()
    failRecordUIDs = list()
    nFailRecord = 0
    nReactivated = 0
    for uid in availableUIDs:
        k = SS.uid2k(uid)
        size = countVec[k]
        if uid not in MoveRecordsByUID:
            MoveRecordsByUID[uid] = defaultdict(int)
        # Skip ahead if this cluster is too big
        if size > DArgs['d_maxNumAtomsForTargetComp']:
            tooBigUIDs.append(uid)
            continue
        # Avoid comps we've failed deleting in the past
        # unless they have changed by a reasonable amount
        # or enough laps have passed to try again
        lapsSinceLastTry = lapFrac - MoveRecordsByUID[uid]['d_latestLap']
        hasFailedRecently = MoveRecordsByUID[uid]['d_nFailRecent'] > 0
        oldsize = MoveRecordsByUID[uid]['d_latestCount']
        if oldsize > 0 and hasFailedRecently:
            nFailRecord += 1
            sizePercDiff = np.abs(size - oldsize) / (1e-100 + np.abs(oldsize))
            if sizePercDiff > DArgs['d_minPercChangeInNumAtomsToReactivate']:
                # Size changed enough since last failure: try again.
                nReactivated += 1
            elif DArgs['d_nLapToReactivate'] > 0 \
                    and lapsSinceLastTry > DArgs['d_nLapToReactivate']:
                # Enough laps have elapsed since last failure: try again.
                nReactivated += 1
            else:
                failRecordUIDs.append(uid)
                continue
        # If we make it here, the uid is eligible
        eligibleUIDs.append(uid)

    # Log which uids were busy with other moves
    msg = "%d/%d UIDs busy with other moves (birth/merge)" % (
        len(uidsBusyWithOtherMoves), K)
    DLogger.pprint(msg)
    if len(uidsBusyWithOtherMoves) > 0:
        DLogger.pprint(
            ' ' + vec2str(uidsBusyWithOtherMoves), 'debug')
    msg = "%d/%d UIDs too large [--d_maxNumAtomsForTargetComp %.2f]" % (
        len(tooBigUIDs), K, DArgs['d_maxNumAtomsForTargetComp'])
    DLogger.pprint(msg)
    if len(tooBigUIDs) > 0:
        DLogger.pprint(
            ' ' + vec2str(tooBigUIDs), 'debug')
    # Log which uids were marked as having a failure record.
    msg = '%d/%d UIDs un-deleteable for past failures. %d reactivated.' % (
        len(failRecordUIDs), K, nReactivated)
    DLogger.pprint(msg)
    if len(failRecordUIDs) > 0:
        DLogger.pprint(
            ' ' + vec2str(failRecordUIDs), 'debug')
    # Log all remaining eligible uids
    msg = '%d/%d UIDs eligible for targeted delete proposal' % (
        len(eligibleUIDs), K)
    DLogger.pprint(msg)
    if len(eligibleUIDs) == 0:
        failMsg = ("Empty plan. 0 UIDs eligible as delete target." +
                   " %d too busy with other moves." +
                   " %d too big." +
                   " %d have past failures.") % (
            len(uidsBusyWithOtherMoves),
            len(tooBigUIDs),
            len(failRecordUIDs))
        return dict(failMsg=failMsg)

    # Log count statistics for each uid
    eligibleCountVec = [countVec[SS.uid2k(u)] for u in eligibleUIDs]
    DLogger.pprint(' uid   ' + vec2str(eligibleUIDs), 'debug')
    DLogger.pprint(' count ' + vec2str(eligibleCountVec), 'debug')

    # Select the single state to target: the eligible uid with largest count.
    targetUID = eligibleUIDs[np.argmax(eligibleCountVec)]
    MovePlans['d_targetUIDs'] = [targetUID]
    # Determine all comps eligible to receive its transfer mass:
    # every other uid, including ones excluded above as targets.
    absorbUIDset = set(eligibleUIDs)
    absorbUIDset.discard(targetUID)
    absorbUIDset.update(tooBigUIDs)
    absorbUIDset.update(failRecordUIDs)
    MovePlans['d_absorbingUIDSet'] = absorbUIDset
    DLogger.pprint('Selecting one single state to target.')
    DLogger.pprint('targetUID ' + str(targetUID))
    DLogger.pprint('absorbingUIDs: ' + vec2str(absorbUIDset))
    return MovePlans
def _run_task_internal(jobname, taskid, nTask,
                       ReqArgs, KwArgs, UnkArgs,
                       dataName, allocModelName, obsModelName, algName,
                       doSaveToDisk, doWriteStdOut):
    """ Internal method (should never be called by end-user!)
        Executes learning for a particular job and particular taskid.

        Loads (or wraps) the dataset, seeds the run deterministically from
        (jobname, taskid), configures on-disk/console logging for the chosen
        learning algorithm and its moves, builds an initialized model, and
        runs the fit.

        Returns
        -------
        hmodel : bnpy HModel, fit to the data
        RunInfo : dict of information about the run, with fields
        - 'loss' : final loss value for algorithm
        - 'loss_history' : vector of loss values over time
    """
    # Make shallow copies of input dicts, so any modifications here
    # do not propagate back to the caller.
    ReqArgs = dict(**ReqArgs)
    KwArgs = dict(**KwArgs)
    UnkArgs = dict(**UnkArgs)

    # Seeds are derived from jobname/taskid so each task is reproducible:
    # algseed drives the algorithm, dataorderseed drives batch ordering.
    algseed = createUniqueRandomSeed(jobname, taskID=taskid)
    dataorderseed = createUniqueRandomSeed('', taskID=taskid)
    KwArgs[algName]['algseed'] = algseed
    KwArgs[algName]['dataorderseed'] = dataorderseed

    if algName in OnlineDataAlgSet:
        # Online algorithms take their lap count from the data prefs.
        KwArgs[algName]['nLap'] = KwArgs['OnlineDataPrefs']['nLap']

    if isinstance(dataName, str):
        if os.path.exists(dataName):
            # dataName is a path to many data files on disk
            Data, InitData = loadDataIteratorFromDisk(dataName, ReqArgs,
                                                      KwArgs, dataorderseed)
            DataArgs = UnkArgs
            # Set the short name for this dataset,
            # so that the filepath for results is informative.
            if not hasattr(Data, 'name'):
                try:
                    Data.name = KwArgs['OnlineDataPrefs']['datasetName']
                except KeyError:
                    Data.name = 'UnknownDatasetName'
        else:
            # dataName is a registered dataset name, not a filesystem path.
            DataArgs = getKwArgsForLoadData(ReqArgs, UnkArgs, KwArgs)
            Data, InitData = loadData(ReqArgs, KwArgs, DataArgs,
                                      dataorderseed)
    else:
        # dataName is already an in-memory data object; use it for both
        # fitting and initialization.
        Data = dataName
        InitData = dataName
        DataArgs = dict()
        assert isinstance(Data, bnpy.data.DataObj)

    if algName in OnlineDataAlgSet:
        # Wrap the full dataset in a minibatch iterator for online learning.
        OnlineDataArgs = KwArgs['OnlineDataPrefs']
        OnlineDataArgs['dataorderseed'] = dataorderseed
        DataArgs = getKwArgsForLoadData(Data, UnkArgs)
        OnlineDataArgs.update(DataArgs)  # add custom args
        Data = Data.to_iterator(**OnlineDataArgs)

    if hasattr(Data, 'name'):
        ReqArgs['dataName'] = Data.name

    if doSaveToDisk:
        task_output_path = make_task_output_path(ReqArgs, KwArgs,
                                                 taskID=taskid)
        createEmptyOutputPathOnDisk(task_output_path)
        writeArgsToFile(ReqArgs, KwArgs, task_output_path, UnkArgs)
    else:
        task_output_path = None
    KwArgs['OutputPrefs']['task_output_path'] = task_output_path
    jobID = configLoggingToConsoleAndFile(task_output_path,
                                          taskid,
                                          doSaveToDisk,
                                          doWriteStdOut)

    # Write descriptions to the log
    # (only for the first task, or when jobID indicates a fresh job,
    #  to avoid duplicated summaries across tasks)
    if taskid == 1 or jobID > 0:
        # Warn user about any unknown keyword arguments
        showWarningForUnknownArgs(UnkArgs, DataArgs)
        Log.info('Dataset Summary:')
        Log.info(Data.get_text_summary())
        Log.info(Data.get_stats_summary())

    # Create and initialize model parameters
    hmodel = make_initialized_model(
        InitData,
        seed=algseed,
        taskid=taskid,
        allocModelName=ReqArgs['allocModelName'],
        obsModelName=ReqArgs['obsModelName'],
        algName=ReqArgs['algName'],
        KwArgs=KwArgs,
        verbose=(taskid == 1 or jobID > 0),
    )

    # Create learning algorithm
    learnAlg = createLearnAlg(Data, hmodel, ReqArgs, KwArgs,
                              algseed=algseed,
                              task_output_path=task_output_path)
    # Configure a dedicated logger for each move the algorithm uses.
    # Imports are done lazily here so unused move modules are never loaded.
    if learnAlg.hasMove('birth'):
        import bnpy.birthmove.BLogger as BirthLogger
        BirthLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('delete'):
        import bnpy.deletemove.DLogger as DeleteLogger
        DeleteLogger.configure(task_output_path,
                               doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('merge'):
        import bnpy.mergemove.MLogger as MergeLogger
        MergeLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('shuffle'):
        import bnpy.mergemove.SLogger as SLogger
        SLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    # String-match on the class name to detect topic models
    # (avoids importing the allocmodel class here just for isinstance).
    if str(type(hmodel.allocModel)).count('TopicModel'):
        import bnpy.allocmodel.topics.LocalStepLogger as LocalStepLogger
        LocalStepLogger.configure(task_output_path,
                                  doSaveToDisk, doWriteStdOut)

    # Set up logging for how long each step of the alg takes.
    import bnpy.learnalg.ElapsedTimeLogger as ElapsedTimeLogger
    ElapsedTimeLogger.configure(task_output_path,
                                KwArgs['MoveNames'],
                                doSaveToDisk, doWriteStdOut)

    Log.info(
        'Learn Alg: %s | task %2d/%d | alg. seed: %d | data order seed: %d'
        % (algName, taskid, nTask, algseed, dataorderseed))
    Log.info('task_output_path: %s' % (task_output_path))

    # Fit the model to the data!
    RunInfo = learnAlg.fit(hmodel, Data)
    # Attach the full argument record so saved results are self-describing.
    RunInfo['UnkArgs'] = UnkArgs
    RunInfo['KwArgs'] = KwArgs
    RunInfo['ReqArgs'] = ReqArgs
    return hmodel, RunInfo