def showTopWordsForTask(taskpath, vocabfile, lap=None,
                        doHTML=1, doCounts=1, sortTopics=False, **kwargs):
    ''' Print top words for each topic from results saved on disk.

    Returns
    -------
    html : str
        Ready-to-print HTML display of the top words per topic.
    '''
    with open(vocabfile, 'r') as f:
        vocabList = [x.strip() for x in f.readlines()]
    if doCounts and (lap is None or lap > 0):
        WordCounts = loadWordCountMatrixForLap(taskpath, lap)
        countVec = WordCounts.sum(axis=1)
        if sortTopics:
            sortIDs = np.argsort(-1 * countVec)  # -1 to get descending order
            countVec = countVec[sortIDs]
            WordCounts = WordCounts[sortIDs]
        if doHTML:
            return htmlTopWordsFromWordCounts(
                WordCounts, vocabList, countVec=countVec, **kwargs)
        else:
            return printTopWordsFromWordCounts(WordCounts, vocabList)
    else:
        hmodel, lap = load_model_at_lap(taskpath, lap)
        if doHTML:
            return htmlTopWordsFromHModel(hmodel, vocabList, **kwargs)
        else:
            return printTopWordsFromHModel(hmodel, vocabList)
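
# Usage sketch (illustration only, not part of the original module): the
# task directory and vocabulary file paths below are hypothetical
# placeholders for a completed bnpy training run.
def _example_showTopWordsForTask():
    html = showTopWordsForTask(
        taskpath='/results/mydata/mymodel/1/',   # hypothetical saved-run dir
        vocabfile='/datasets/mydata/vocab.txt',  # hypothetical vocab file,
                                                 # one token per line
        lap=50,            # checkpoint lap to inspect
        doHTML=1,          # return an HTML string instead of printing
        sortTopics=True)   # order topics by total word count, descending
    print(html)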
def tryBirthForTask(taskoutpath=None, lap=None, lapFrac=0,
                    targetUID=0, batchID=None, **kwargs):
    ''' Try a birth proposal for a specific component of a saved task.

    Post Condition
    --------------
    * Logging messages are printed.
    * HTML report is saved.
    '''
    if lap is not None:
        lapFrac = lap
    curModel, lapFrac = load_model_at_lap(taskoutpath, lapFrac)
    Data = loadDataFromSavedTask(taskoutpath, batchID=batchID)
    LPkwargs = loadLPKwargsFromDisk(taskoutpath)
    SavedBirthKwargs = loadKwargsFromDisk(taskoutpath, 'args-birth.txt')
    if targetUID < 0:
        targetUID = findCompInModelWithLargestMisalignment(curModel, Data)
    BirthArgs = dict(**DefaultBirthArgs)
    BirthArgs.update(SavedBirthKwargs)
    for key, val in list(kwargs.items()):
        if val is not None:
            BirthArgs[key] = val
            print('%s: %s' % (key, str(val)))
    curLP = curModel.calc_local_params(Data, **LPkwargs)
    curSS = curModel.get_global_suff_stats(
        Data, curLP,
        trackDocUsage=1, doPrecompEntropy=1, trackTruncationGrowth=1)
    curLscore = curModel.calc_evidence(SS=curSS)
    print("Target UID: %d" % (targetUID))
    print("Current count: %.2f" % (curSS.getCountForUID(targetUID)))
    xSS = makeSummaryForBirthProposal_HTMLWrapper(
        Data, curModel, curLP,
        curSSwhole=curSS,
        targetUID=int(targetUID),
        newUIDs=list(range(curSS.K, curSS.K + int(BirthArgs['b_Kfresh']))),
        LPkwargs=LPkwargs,
        lapFrac=lapFrac,
        dataName=Data.name,
        **BirthArgs)
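
# Usage sketch (illustration only, not part of the original module): replays
# a birth proposal against a saved checkpoint. The task path is a
# hypothetical placeholder; a negative targetUID lets the function pick the
# most misaligned component automatically.
def _example_tryBirthForTask():
    tryBirthForTask(
        taskoutpath='/results/mydata/mymodel/1/',  # hypothetical task dir
        lap=50,          # checkpoint lap whose model is loaded
        targetUID=-1,    # negative: auto-select most misaligned component
        b_Kfresh=10)     # forwarded into BirthArgs via **kwargs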
def tryDeleteProposalForSavedTask(taskoutpath=None, lap=None, lapFrac=0,
                                  batchID=None, **kwargs):
    ''' Try specific delete proposal for specified saved task.

    Post Condition
    --------------
    * Logging messages are printed.
    '''
    if lap is not None:
        lapFrac = lap
    hmodel, lapFrac = load_model_at_lap(taskoutpath, lapFrac)
    Data = loadDataFromSavedTask(taskoutpath, batchID=batchID)
    kwargs['LPkwargs'] = loadLPKwargsFromDisk(taskoutpath)
    tryDeleteProposalForSpecificTarget_HDPTopicModel(Data, hmodel, **kwargs)
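
# Usage sketch (illustration only, not part of the original module): replays
# a delete proposal for one saved checkpoint. The task path is hypothetical,
# and the targetUID keyword is an assumed pass-through to the underlying
# tryDeleteProposalForSpecificTarget_HDPTopicModel routine.
def _example_tryDeleteProposalForSavedTask():
    tryDeleteProposalForSavedTask(
        taskoutpath='/results/mydata/mymodel/1/',  # hypothetical task dir
        lap=50,
        targetUID=3)  # hypothetical kwarg forwarded via **kwargs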
def tryMergeProposalForSavedTask(taskoutpath=None, lap=None, lapFrac=0,
                                 batchID=None, **kwargs):
    ''' Try specific merge proposal for specified saved task.

    Post Condition
    --------------
    * Logging messages are printed.
    * HTML report is saved.
    '''
    if lap is not None:
        lapFrac = lap
    hmodel, lapFrac = load_model_at_lap(taskoutpath, lapFrac)
    Data = loadDataFromSavedTask(taskoutpath, batchID=batchID)
    kwargs['LPkwargs'] = loadLPKwargsFromDisk(taskoutpath)
    tryMergeProposalForSpecificTarget(Data, hmodel, **kwargs)
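
# Usage sketch (illustration only, not part of the original module): replays
# a merge proposal for a saved checkpoint. The task path is hypothetical,
# and the kA/kB pair is an assumed keyword interface forwarded via **kwargs
# to tryMergeProposalForSpecificTarget.
def _example_tryMergeProposalForSavedTask():
    tryMergeProposalForSavedTask(
        taskoutpath='/results/mydata/mymodel/1/',  # hypothetical task dir
        lap=50,
        kA=0, kB=1)  # hypothetical pair of components to merge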
def evalTopicModelOnTestDataFromTaskpath(
        taskpath='', queryLap=0, nLap=0,
        elapsedTime=None, seed=42, dataSplitName='test',
        fracHeldout=0.2, printFunc=None, **kwargs):
    ''' Evaluate trained topic model saved in specified task on test data.
    '''
    stime = time.time()
    LPkwargs = dict(
        nnzPerRowLP=0,
        nCoordAscentItersLP=100,
        convThrLP=0.01,
        restartLP=0,
        initDocTopicCountLP='setDocProbsToEGlobalProbs')
    for key in kwargs:
        if key in LPkwargs and kwargs[key] is not None:
            LPkwargs[key] = str2val(kwargs[key])
    # Force to be 0, which gives better performance
    # (due to mismatch in objectives)
    if 'restartLP' in LPkwargs:
        LPkwargs['restartLP'] = 0
    # Force to be 0, so we are fair at test time
    if 'nnzPerRowLP' in LPkwargs:
        LPkwargs['nnzPerRowLP'] = 0

    # Load test dataset
    Data = loadDataFromSavedTask(taskpath, dataSplitName=dataSplitName)

    # Check if info is stored in topic-model form
    topicFileList = glob.glob(os.path.join(taskpath, 'Lap*Topic*'))
    if len(topicFileList) > 0:
        topics, probs, alpha = loadTopicModel(
            taskpath, queryLap=queryLap,
            returnTPA=1, normalizeTopics=1, normalizeProbs=1)
        K = probs.size
    else:
        hmodel, foundLap = load_model_at_lap(taskpath, queryLap)
        if hasattr(Data, 'word_count'):
            # Convert to topics 2D array (K x V)
            topics = hmodel.obsModel.getTopics()
            probs = hmodel.allocModel.get_active_comp_probs()
        else:
            hmodel.obsModel.setEstParamsFromPost(hmodel.obsModel.Post)
            hmodel.obsModel.inferType = "EM"  # Point estimate!
        assert np.allclose(foundLap, queryLap)
        if hasattr(hmodel.allocModel, 'alpha'):
            alpha = hmodel.allocModel.alpha
        else:
            try:
                DataKwargs = loadDataKwargsFromDisk(taskpath)
                alpha = float(DataKwargs['alpha'])
            except Exception:
                alpha = 0.5
        K = hmodel.allocModel.K

    # Prepare debugging statements
    if printFunc:
        startmsg = "Heldout Metrics at lap %.3f" % (queryLap)
        filler = '=' * (80 - len(startmsg))
        printFunc(startmsg + ' ' + filler)
        if hasattr(Data, 'word_count'):
            nAtom = Data.word_count.sum()
        else:
            nAtom = Data.nObs
        msg = "%s heldout data. %d documents. %d total atoms." % (
            Data.name, Data.nDoc, nAtom)
        printFunc(msg)
        printFunc("Using trained model from lap %7.3f with %d topics" % (
            queryLap, K))
        printFunc("Using alpha=%.3f for heldout inference." % (alpha))
        printFunc("Local step params:")
        for key in ['nCoordAscentItersLP', 'convThrLP', 'restartLP']:
            printFunc("    %s: %s" % (key, str(LPkwargs[key])))
        msg = "Splitting each doc" + \
            " into %3.0f%% train and %3.0f%% test, with seed %d" % (
                100 * (1 - fracHeldout), 100 * fracHeldout, seed)
        printFunc(msg)

    # Preallocate storage for metrics
    KactivePerDoc = np.zeros(Data.nDoc)
    logpTokensPerDoc = np.zeros(Data.nDoc)
    nTokensPerDoc = np.zeros(Data.nDoc, dtype=np.int32)
    if hasattr(Data, 'word_count'):
        aucPerDoc = np.zeros(Data.nDoc)
        RprecisionPerDoc = np.zeros(Data.nDoc)
    for d in range(Data.nDoc):
        Data_d = Data.select_subset_by_mask([d], doTrackFullSize=0)
        if hasattr(Data, 'word_count'):
            Info_d = calcPredLikForDoc(
                Data_d, topics, probs, alpha,
                fracHeldout=fracHeldout,
                seed=seed + d,
                LPkwargs=LPkwargs)
            logpTokensPerDoc[d] = Info_d['sumlogProbTokens']
            nTokensPerDoc[d] = Info_d['nHeldoutToken']
            aucPerDoc[d] = Info_d['auc']
            RprecisionPerDoc[d] = Info_d['R_precision']
            KactivePerDoc[d] = np.sum(Info_d['DocTopicCount'] >= 1.0)
            avgAUCscore = np.mean(aucPerDoc[:d + 1])
            avgRscore = np.mean(RprecisionPerDoc[:d + 1])
            scoreMsg = "avgLik %.4f avgAUC %.4f avgRPrec %.4f medianKact %d" % (
                np.sum(logpTokensPerDoc[:d + 1]) /
                np.sum(nTokensPerDoc[:d + 1]),
                avgAUCscore, avgRscore,
                np.median(KactivePerDoc[:d + 1]))
            SVars = dict(
                avgRPrecScore=avgRscore,
                avgAUCScore=avgAUCscore,
                avgAUCScorePerDoc=aucPerDoc,
                avgRPrecScorePerDoc=RprecisionPerDoc)
        else:
            Info_d = calcPredLikForDocFromHModel(
                Data_d, hmodel,
                alpha=alpha,
                fracHeldout=fracHeldout,
                seed=seed + d,
                LPkwargs=LPkwargs)
            logpTokensPerDoc[d] = Info_d['sumlogProbTokens']
            nTokensPerDoc[d] = Info_d['nHeldoutToken']
            scoreMsg = "avgLik %.4f" % (
                np.sum(logpTokensPerDoc[:d + 1]) /
                np.sum(nTokensPerDoc[:d + 1]),
            )
            SVars = dict()
        if d == 0 or (d + 1) % 25 == 0 or d == Data.nDoc - 1:
            if printFunc:
                etime = time.time() - stime
                msg = "%5d/%d after %8.1f sec " % (d + 1, Data.nDoc, etime)
                printFunc(msg + scoreMsg)

    # Aggregate results
    meanlogpTokensPerDoc = np.sum(logpTokensPerDoc) / np.sum(nTokensPerDoc)
    '''
    # Compute heldout Lscore
    if not hasattr(Data, 'word_count'):
        if hasattr(hmodel.allocModel, 'gamma'):
            gamma = hmodel.allocModel.gamma
        else:
            gamma = hmodel.allocModel.gamma0
        aParams = dict(gamma=gamma, alpha=alpha)
        oParams = hmodel.obsModel.get_prior_dict()
        del oParams['inferType']
        # Create DP mixture model from current hmodel
        DPmodel = bnpy.HModel.CreateEntireModel(
            'VB', 'DPMixtureModel',
            hmodel.getObsModelName(),
            aParams, oParams, Data)
        DPmodel.set_global_params(hmodel=hmodel)
        LP = DPmodel.calc_local_params(Data, **LPkwargs)
        SS = DPmodel.get_global_suff_stats(Data, LP, doPrecompEntropy=1)
        dpLscore = DPmodel.calc_evidence(SS=SS)
        # Create HDP topic model from current hmodel
        HDPmodel = bnpy.HModel.CreateEntireModel(
            'VB', 'HDPTopicModel',
            hmodel.getObsModelName(),
            aParams, oParams, Data)
        HDPmodel.set_global_params(hmodel=hmodel)
        LP = HDPmodel.calc_local_params(Data, **LPkwargs)
        SS = HDPmodel.get_global_suff_stats(Data, LP, doPrecompEntropy=1)
        hdpLscore = HDPmodel.calc_evidence(SS=SS)
        SVars['dpLscore'] = dpLscore
        SVars['hdpLscore'] = hdpLscore
        printFunc("~~~ dpL=%.6e\n~~~hdpL=%.6e" % (dpLscore, hdpLscore))
    '''
    # Prepare to save results.
    if dataSplitName.count('test'):
        outfileprefix = 'predlik-'
    else:
        outfileprefix = dataSplitName + '-predlik-'
    prefix, lap = getPrefixForLapQuery(taskpath, queryLap)
    outmatfile = os.path.join(
        taskpath, prefix + "Heldout_%s.mat" % (dataSplitName))
    # Collect all quantities to save into giant dict.
    SaveVars = dict(
        version=VERSION,
        outmatfile=outmatfile,
        fracHeldout=fracHeldout,
        predLLPerDoc=logpTokensPerDoc,
        avgPredLL=np.sum(logpTokensPerDoc) / np.sum(nTokensPerDoc),
        K=K,
        KactivePerDoc=KactivePerDoc,
        nTokensPerDoc=nTokensPerDoc,
        **LPkwargs)
    SaveVars.update(SVars)
    scipy.io.savemat(outmatfile, SaveVars, oned_as='row')
    SVars['avgLikScore'] = SaveVars['avgPredLL']
    SVars['lapTrain'] = queryLap
    SVars['K'] = K
    for p in [10, 50, 90]:
        SVars['KactivePercentile%02d' % (p)] = np.percentile(KactivePerDoc, p)
    # Record total time spent doing current work
    timeSpent = time.time() - stime
    if elapsedTime is not None:
        SVars['timeTrainAndEval'] = elapsedTime + timeSpent
    # Load previous time spent on evaluation (not training) from disk
    timeSpentFilepaths = glob.glob(
        os.path.join(taskpath, '*-timeEvalOnly.txt'))
    totalTimeSpent = timeSpent
    splitTimeSpent = timeSpent
    for timeSpentFilepath in timeSpentFilepaths:
        with open(timeSpentFilepath, 'r') as f:
            # Scan to the last line of the file, which holds the latest value
            for line in f.readlines():
                pass
            prevTime = float(line.strip())
        cond1 = dataSplitName.count('valid')
        cond2 = timeSpentFilepath.count('valid')
        if cond1 and cond2:
            splitTimeSpent += prevTime
        elif (not cond1) and (not cond2):
            splitTimeSpent += prevTime
        totalTimeSpent += prevTime
    SVars['timeEvalOnly'] = splitTimeSpent
    # Mark total time spent purely on training
    if elapsedTime is not None:
        SVars['timeTrain'] = SVars['timeTrainAndEval'] - totalTimeSpent
    for key in SVars:
        if key.endswith('PerDoc'):
            continue
        outtxtfile = os.path.join(
            taskpath, outfileprefix + '%s.txt' % (key))
        with open(outtxtfile, 'a') as f:
            f.write("%.6e\n" % (SVars[key]))
    if printFunc:
        printFunc("DONE with heldout inference at lap %.3f" % queryLap)
        printFunc("Wrote per-doc results in MAT file:" +
                  outmatfile.split(os.path.sep)[-1])
        printFunc("Aggregate results in txt files: %s__.txt" % (
            outfileprefix))
    # Write the summary message
    if printFunc:
        etime = time.time() - stime
        curLapStr = '%7.3f' % (queryLap)
        nLapStr = '%d' % (nLap)
        logmsg = ' %s/%s %s metrics | K %4d | %s'
        logmsg = logmsg % (curLapStr, nLapStr,
                           '%5s' % (dataSplitName[:5]),
                           K, scoreMsg)
        printFunc(logmsg, 'info')
    return SaveVars
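
# Usage sketch (illustration only, not part of the original module):
# evaluates one saved checkpoint on the heldout split and prints progress.
# The task path is a hypothetical placeholder.
def _example_evalTopicModelOnTestDataFromTaskpath():
    Results = evalTopicModelOnTestDataFromTaskpath(
        taskpath='/results/mydata/mymodel/1/',  # hypothetical task dir
        queryLap=50,            # trained-model checkpoint to evaluate
        dataSplitName='test',   # or 'valid' for the validation split
        fracHeldout=0.2,        # hold out 20% of each doc's tokens
        printFunc=print)        # route progress messages to stdout
    print('avg heldout log-lik per token: %.4f' % Results['avgPredLL'])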