Example #1
def showTopWordsForTask(taskpath, vocabfile, lap=None, doHTML=1,
                        doCounts=1, sortTopics=False, **kwargs):
    ''' Print top words for each topic from results saved on disk.

    Returns
    -------
    html : string, ready-to-print HTML display of the top words
    '''
    with open(vocabfile, 'r') as f:
        vocabList = [x.strip() for x in f.readlines()]

    if doCounts and (lap is None or lap > 0):
        WordCounts = loadWordCountMatrixForLap(taskpath, lap)
        countVec = WordCounts.sum(axis=1)
        if sortTopics:
            sortIDs = np.argsort(-1 * countVec)  # -1 to get descending order
            countVec = countVec[sortIDs]
            WordCounts = WordCounts[sortIDs]
        if doHTML:
            return htmlTopWordsFromWordCounts(
                WordCounts, vocabList, countVec=countVec, **kwargs)
        else:
            return printTopWordsFromWordCounts(WordCounts, vocabList)

    else:
        hmodel, lap = load_model_at_lap(taskpath, lap)
        if doHTML:
            return htmlTopWordsFromHModel(hmodel, vocabList, **kwargs)
        else:
            return printTopWordsFromHModel(hmodel, vocabList)
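A minimal usage sketch for this helper (the paths below are hypothetical; assumes a completed bnpy training run saved on disk and a vocab file with one word per line):

# Hypothetical paths: point these at a real saved task and vocab file.
html = showTopWordsForTask(
    '/results/mytask/1/',          # taskpath: directory of a saved run
    '/data/mydataset/vocab.txt',   # vocabfile: one vocabulary word per line
    lap=10,                        # which saved lap (checkpoint) to inspect
    doHTML=1,
    sortTopics=True)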
Example #2
def tryBirthForTask(taskoutpath=None,
                    lap=None,
                    lapFrac=0,
                    targetUID=0,
                    batchID=None,
                    **kwargs):
    ''' Try specific birth proposal for specified taskoutpath

    Post Condition
    --------------
    * Logging messages are printed.
    * HTML report is saved.
    '''
    if lap is not None:
        lapFrac = lap

    curModel, lapFrac = load_model_at_lap(taskoutpath, lapFrac)
    Data = loadDataFromSavedTask(taskoutpath, batchID=batchID)

    LPkwargs = loadLPKwargsFromDisk(taskoutpath)
    SavedBirthKwargs = loadKwargsFromDisk(taskoutpath, 'args-birth.txt')

    if targetUID < 0:
        targetUID = findCompInModelWithLargestMisalignment(curModel, Data)

    BirthArgs = dict(**DefaultBirthArgs)
    BirthArgs.update(SavedBirthKwargs)
    for key, val in list(kwargs.items()):
        if val is not None:
            BirthArgs[key] = val
            print('%s: %s' % (key, str(val)))

    curLP = curModel.calc_local_params(Data, **LPkwargs)
    curSS = curModel.get_global_suff_stats(Data,
                                           curLP,
                                           trackDocUsage=1,
                                           doPrecompEntropy=1,
                                           trackTruncationGrowth=1)
    curLscore = curModel.calc_evidence(SS=curSS)

    print("Target UID: %d" % (targetUID))
    print("Current count: %.2f" % (curSS.getCountForUID(targetUID)))

    xSS = makeSummaryForBirthProposal_HTMLWrapper(
        Data,
        curModel,
        curLP,
        curSSwhole=curSS,
        targetUID=int(targetUID),
        newUIDs=list(range(curSS.K, curSS.K + int(BirthArgs['b_Kfresh']))),
        LPkwargs=LPkwargs,
        lapFrac=lapFrac,
        dataName=Data.name,
        **BirthArgs)
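A hedged sketch of how this might be invoked (the path and target UID are hypothetical; assumes the saved task directory contains args-birth.txt and local-step kwargs on disk):

# Hypothetical invocation: try a birth move targeting cluster UID 7
# at lap 5 of a previously saved run. Extra keyword args with non-None
# values (like b_Kfresh) override the birth args saved on disk.
tryBirthForTask(
    taskoutpath='/results/mytask/1/',
    lap=5,
    targetUID=7,
    b_Kfresh=10)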
Example #3
def tryDeleteProposalForSavedTask(taskoutpath=None,
                                  lap=None,
                                  lapFrac=0,
                                  batchID=None,
                                  **kwargs):
    ''' Try specific delete proposal for specified taskoutpath

    Post Condition
    --------------
    * Logging messages are printed.
    '''
    if lap is not None:
        lapFrac = lap

    hmodel, lapFrac = load_model_at_lap(taskoutpath, lapFrac)
    Data = loadDataFromSavedTask(taskoutpath, batchID=batchID)
    kwargs['LPkwargs'] = loadLPKwargsFromDisk(taskoutpath)

    tryDeleteProposalForSpecificTarget_HDPTopicModel(Data, hmodel, **kwargs)
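A usage sketch under assumptions (hypothetical path; targetUID is an assumed kwarg forwarded to tryDeleteProposalForSpecificTarget_HDPTopicModel):

# Hypothetical invocation: try deleting a specific cluster from the
# model saved at lap 5. Unrecognized kwargs are forwarded untouched.
tryDeleteProposalForSavedTask(
    taskoutpath='/results/mytask/1/',
    lap=5,
    targetUID=3)  # assumption: consumed by the target-specific routine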
Example #4
File: TryMerge.py Project: jpfeil/hydra
def tryMergeProposalForSavedTask(taskoutpath=None,
                                 lap=None,
                                 lapFrac=0,
                                 batchID=None,
                                 **kwargs):
    ''' Try specific merge proposal for specified taskoutpath

    Post Condition
    --------------
    * Logging messages are printed.
    * HTML report is saved.
    '''
    if lap is not None:
        lapFrac = lap

    hmodel, lapFrac = load_model_at_lap(taskoutpath, lapFrac)
    Data = loadDataFromSavedTask(taskoutpath, batchID=batchID)
    kwargs['LPkwargs'] = loadLPKwargsFromDisk(taskoutpath)

    tryMergeProposalForSpecificTarget(Data, hmodel, **kwargs)
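And the merge counterpart, again as a sketch (hypothetical path; the UID pair is an assumed kwarg of tryMergeProposalForSpecificTarget):

# Hypothetical invocation: propose merging two clusters of the model
# saved at lap 5. The UID pair is an assumed kwarg of the proposal routine.
tryMergeProposalForSavedTask(
    taskoutpath='/results/mytask/1/',
    lap=5,
    uidA=2,   # assumption: identifies the first cluster in the merge pair
    uidB=6)   # assumption: identifies the second cluster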
Example #5
def evalTopicModelOnTestDataFromTaskpath(taskpath='',
                                         queryLap=0,
                                         nLap=0,
                                         elapsedTime=None,
                                         seed=42,
                                         dataSplitName='test',
                                         fracHeldout=0.2,
                                         printFunc=None,
                                         **kwargs):
    ''' Evaluate trained topic model saved in specified task on test data.

    Returns
    -------
    SaveVars : dict
        Heldout metrics, also written to .mat and .txt files in taskpath.
    '''
    stime = time.time()

    LPkwargs = dict(nnzPerRowLP=0,
                    nCoordAscentItersLP=100,
                    convThrLP=0.01,
                    restartLP=0,
                    initDocTopicCountLP='setDocProbsToEGlobalProbs')
    for key in kwargs:
        if key in LPkwargs and kwargs[key] is not None:
            LPkwargs[key] = str2val(kwargs[key])
    # Force to be 0, which gives better performance
    # (due to mismatch in objectives)
    if 'restartLP' in LPkwargs:
        LPkwargs['restartLP'] = 0
    # Force to be 0, so we are fair at test time
    if 'nnzPerRowLP' in LPkwargs:
        LPkwargs['nnzPerRowLP'] = 0

    # Load test dataset
    Data = loadDataFromSavedTask(taskpath, dataSplitName=dataSplitName)

    # Check if info is stored in topic-model form
    topicFileList = glob.glob(os.path.join(taskpath, 'Lap*Topic*'))
    if len(topicFileList) > 0:
        topics, probs, alpha = loadTopicModel(taskpath,
                                              queryLap=queryLap,
                                              returnTPA=1,
                                              normalizeTopics=1,
                                              normalizeProbs=1)
        K = probs.size
    else:
        hmodel, foundLap = load_model_at_lap(taskpath, queryLap)
        if hasattr(Data, 'word_count'):
            # Convert to topics 2D array (K x V)
            topics = hmodel.obsModel.getTopics()
            probs = hmodel.allocModel.get_active_comp_probs()
        else:
            hmodel.obsModel.setEstParamsFromPost(hmodel.obsModel.Post)
            hmodel.obsModel.inferType = "EM"  # Point estimate!

        assert np.allclose(foundLap, queryLap)
        if hasattr(hmodel.allocModel, 'alpha'):
            alpha = hmodel.allocModel.alpha
        else:
            try:
                DataKwargs = loadDataKwargsFromDisk(taskpath)
                alpha = float(DataKwargs['alpha'])
            except Exception:
                alpha = 0.5
        K = hmodel.allocModel.K
    # Prepare debugging statements
    if printFunc:
        startmsg = "Heldout Metrics at lap %.3f" % (queryLap)
        filler = '=' * (80 - len(startmsg))
        printFunc(startmsg + ' ' + filler)
        if hasattr(Data, 'word_count'):
            nAtom = Data.word_count.sum()
        else:
            nAtom = Data.nObs
        msg = "%s heldout data. %d documents. %d total atoms." % (
            Data.name, Data.nDoc, nAtom)
        printFunc(msg)
        printFunc("Using trained model from lap %7.3f with %d topics" %
                  (queryLap, K))
        printFunc("Using alpha=%.3f for heldout inference." % (alpha))
        printFunc("Local step params:")
        for key in ['nCoordAscentItersLP', 'convThrLP', 'restartLP']:
            printFunc("    %s: %s" % (key, str(LPkwargs[key])))
        msg = "Splitting each doc" + \
            " into %3.0f%% train and %3.0f%% test, with seed %d" % (
            100*(1-fracHeldout), 100*fracHeldout, seed)
        printFunc(msg)

    # Preallocate storage for metrics
    KactivePerDoc = np.zeros(Data.nDoc)
    logpTokensPerDoc = np.zeros(Data.nDoc)
    nTokensPerDoc = np.zeros(Data.nDoc, dtype=np.int32)
    if hasattr(Data, 'word_count'):
        aucPerDoc = np.zeros(Data.nDoc)
        RprecisionPerDoc = np.zeros(Data.nDoc)
    for d in range(Data.nDoc):
        Data_d = Data.select_subset_by_mask([d], doTrackFullSize=0)
        if hasattr(Data, 'word_count'):
            Info_d = calcPredLikForDoc(Data_d,
                                       topics,
                                       probs,
                                       alpha,
                                       fracHeldout=fracHeldout,
                                       seed=seed + d,
                                       LPkwargs=LPkwargs)
            logpTokensPerDoc[d] = Info_d['sumlogProbTokens']
            nTokensPerDoc[d] = Info_d['nHeldoutToken']
            aucPerDoc[d] = Info_d['auc']
            RprecisionPerDoc[d] = Info_d['R_precision']
            KactivePerDoc[d] = np.sum(Info_d['DocTopicCount'] >= 1.0)
            avgAUCscore = np.mean(aucPerDoc[:d + 1])
            avgRscore = np.mean(RprecisionPerDoc[:d + 1])
            scoreMsg = "avgLik %.4f avgAUC %.4f avgRPrec %.4f medianKact %d" % (
                np.sum(logpTokensPerDoc[:d + 1]) /
                np.sum(nTokensPerDoc[:d + 1]), avgAUCscore, avgRscore,
                np.median(KactivePerDoc[:d + 1]))
            SVars = dict(avgRPrecScore=avgRscore,
                         avgAUCScore=avgAUCscore,
                         avgAUCScorePerDoc=aucPerDoc,
                         avgRPrecScorePerDoc=RprecisionPerDoc)
        else:
            Info_d = calcPredLikForDocFromHModel(Data_d,
                                                 hmodel,
                                                 alpha=alpha,
                                                 fracHeldout=fracHeldout,
                                                 seed=seed + d,
                                                 LPkwargs=LPkwargs)
            logpTokensPerDoc[d] = Info_d['sumlogProbTokens']
            nTokensPerDoc[d] = Info_d['nHeldoutToken']
            scoreMsg = "avgLik %.4f" % (np.sum(logpTokensPerDoc[:d + 1]) /
                                        np.sum(nTokensPerDoc[:d + 1]), )
            SVars = dict()

        if d == 0 or (d + 1) % 25 == 0 or d == Data.nDoc - 1:
            if printFunc:
                etime = time.time() - stime
                msg = "%5d/%d after %8.1f sec " % (d + 1, Data.nDoc, etime)
                printFunc(msg + scoreMsg)
    # Aggregate results
    meanlogpTokensPerDoc = np.sum(logpTokensPerDoc) / np.sum(nTokensPerDoc)
    '''
    # Compute heldout Lscore
    if not hasattr(Data, 'word_count'):
        if hasattr(hmodel.allocModel, 'gamma'):
            gamma = hmodel.allocModel.gamma
        else:
            gamma = hmodel.allocModel.gamma0
        aParams = dict(gamma=gamma, alpha=alpha)
        oParams = hmodel.obsModel.get_prior_dict()
        del oParams['inferType']

        # Create DP mixture model from current hmodel
        DPmodel = bnpy.HModel.CreateEntireModel('VB', 'DPMixtureModel',
            hmodel.getObsModelName(),
            aParams, oParams,
            Data)
        DPmodel.set_global_params(hmodel=hmodel)
        LP = DPmodel.calc_local_params(Data, **LPkwargs)
        SS = DPmodel.get_global_suff_stats(Data, LP, doPrecompEntropy=1)
        dpLscore = DPmodel.calc_evidence(SS=SS)

        # Create HDP topic model from current hmodel
        HDPmodel = bnpy.HModel.CreateEntireModel('VB', 'HDPTopicModel',
            hmodel.getObsModelName(),
            aParams, oParams,
            Data)
        HDPmodel.set_global_params(hmodel=hmodel)
        LP = HDPmodel.calc_local_params(Data, **LPkwargs)
        SS = HDPmodel.get_global_suff_stats(Data, LP, doPrecompEntropy=1)
        hdpLscore = HDPmodel.calc_evidence(SS=SS)

        SVars['dpLscore'] = dpLscore
        SVars['hdpLscore'] = hdpLscore
        printFunc("~~~ dpL=%.6e\n~~~hdpL=%.6e" % (dpLscore, hdpLscore))
    '''
    # Prepare to save results.
    if dataSplitName.count('test'):
        outfileprefix = 'predlik-'
    else:
        outfileprefix = dataSplitName + '-predlik-'
    prefix, lap = getPrefixForLapQuery(taskpath, queryLap)
    outmatfile = os.path.join(taskpath,
                              prefix + "Heldout_%s.mat" % (dataSplitName))
    # Collect all quantities to save into giant dict.
    SaveVars = dict(version=VERSION,
                    outmatfile=outmatfile,
                    fracHeldout=fracHeldout,
                    predLLPerDoc=logpTokensPerDoc,
                    avgPredLL=np.sum(logpTokensPerDoc) / np.sum(nTokensPerDoc),
                    K=K,
                    KactivePerDoc=KactivePerDoc,
                    nTokensPerDoc=nTokensPerDoc,
                    **LPkwargs)
    SaveVars.update(SVars)
    scipy.io.savemat(outmatfile, SaveVars, oned_as='row')
    SVars['avgLikScore'] = SaveVars['avgPredLL']
    SVars['lapTrain'] = queryLap
    SVars['K'] = K
    for p in [10, 50, 90]:
        SVars['KactivePercentile%02d' % (p)] = np.percentile(KactivePerDoc, p)

    # Record total time spent doing current work
    timeSpent = time.time() - stime
    if elapsedTime is not None:
        SVars['timeTrainAndEval'] = elapsedTime + timeSpent
    # Load time previously spent on evaluation (not training) from disk
    timeSpentFilepaths = glob.glob(os.path.join(taskpath,
                                                '*-timeEvalOnly.txt'))
    totalTimeSpent = timeSpent
    splitTimeSpent = timeSpent
    for timeSpentFilepath in timeSpentFilepaths:
        with open(timeSpentFilepath, 'r') as f:
            # The last line of the file holds the most recent recorded time
            prevTime = float(f.readlines()[-1].strip())
        # Only accumulate eval time recorded for the same data split
        cond1 = dataSplitName.count('valid')
        cond2 = timeSpentFilepath.count('valid')
        if cond1 and cond2:
            splitTimeSpent += prevTime
        elif (not cond1) and (not cond2):
            splitTimeSpent += prevTime
        totalTimeSpent += prevTime
    SVars['timeEvalOnly'] = splitTimeSpent
    # Mark total time spent purely on training
    if elapsedTime is not None:
        SVars['timeTrain'] = SVars['timeTrainAndEval'] - totalTimeSpent
    for key in SVars:
        if key.endswith('PerDoc'):
            continue
        outtxtfile = os.path.join(taskpath, outfileprefix + '%s.txt' % (key))
        with open(outtxtfile, 'a') as f:
            f.write("%.6e\n" % (SVars[key]))
    if printFunc:
        printFunc("DONE with heldout inference at lap %.3f" % queryLap)
        printFunc("Wrote per-doc results in MAT file:" +
                  outmatfile.split(os.path.sep)[-1])
        printFunc("      Aggregate results in txt files: %s__.txt" %
                  (outfileprefix))

    # Write the summary message
    if printFunc:
        etime = time.time() - stime
        curLapStr = '%7.3f' % (queryLap)
        nLapStr = '%d' % (nLap)
        logmsg = '  %s/%s %s metrics   | K %4d | %s'
        logmsg = logmsg % (curLapStr, nLapStr, '%5s' %
                           (dataSplitName[:5]), K, scoreMsg)
        printFunc(logmsg, 'info')

    return SaveVars
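A usage sketch (hypothetical task path; assumes the task was saved with a 'test' split on disk). Note that plain print works as printFunc, since every call passes the message as the first positional argument:

# Hypothetical evaluation call: score the model saved at lap 10 on the
# saved 'test' split, holding out 20% of each document's tokens.
SaveVars = evalTopicModelOnTestDataFromTaskpath(
    taskpath='/results/mytask/1/',
    queryLap=10,
    dataSplitName='test',
    fracHeldout=0.2,
    printFunc=print)
print('Avg heldout log-lik per token: %.4f' % SaveVars['avgPredLL'])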