Example 1
import argparse

import bnpy.deletemove.DLogger as DLogger
# tryDeleteProposalForSavedTask comes from bnpy's deletemove package
# (exact module path not shown in this excerpt).

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--doPlotELBO', type=int, default=0)
    parser.add_argument('--doPlotComps', type=int, default=1)
    parser.add_argument('--ktarget', type=int, default=10)
    parser.add_argument('--kabsorbList', type=str, default='all')
    parser.add_argument('--verbose', type=int, default=1)
    parser.add_argument('--outputdir', type=str, default='/tmp/')
    parser.add_argument('--nUpdateSteps', type=int, default=25)
    parser.add_argument('--nELBOSteps', type=int, default=1)
    parser.add_argument('--d_initWordCounts',
        type=str, default='none')
    parser.add_argument('--d_initTargetDocTopicCount',
        type=str, default="cold_start")
    args = parser.parse_args()

    DLogger.configure(args.outputdir,
        doSaveToDisk=0,
        doWriteStdOut=1) 
    tryDeleteProposalForSavedTask(**args.__dict__)

# Below: an alternate __main__ block, commented out (and truncated in this excerpt).
'''
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--doPlotELBO', type=int, default=1)
    parser.add_argument('--doPlotComps', type=int, default=0)
    parser.add_argument('--ktarget', type=int, default=10)
    parser.add_argument('--kabsorbList', type=str, default='all')
    parser.add_argument('--initname', type=str, default='truelabelsandjunk')
    parser.add_argument('--K', type=int, default=10)
    parser.add_argument('--nLap', type=int, default=1)
    args = parser.parse_args()
    ktarget = args.ktarget
'''
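The driver above follows a common idiom: parse flags with argparse, then forward the whole namespace as keyword arguments via args.__dict__. A minimal runnable sketch of that idiom, with a hypothetical run_task standing in for tryDeleteProposalForSavedTask:

import argparse

def run_task(ktarget=10, verbose=1, outputdir='/tmp/', **otherKwargs):
    # Hypothetical stand-in for tryDeleteProposalForSavedTask:
    # every parsed flag arrives here as a keyword argument.
    if verbose:
        print('targeting comp %d, writing to %s' % (ktarget, outputdir))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--ktarget', type=int, default=10)
    parser.add_argument('--verbose', type=int, default=1)
    parser.add_argument('--outputdir', type=str, default='/tmp/')
    args = parser.parse_args()
    # vars(args) (equivalently args.__dict__) turns the Namespace into a
    # plain dict, so ** forwards every flag in one call.
    run_task(**vars(args))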
Example 2
from collections import defaultdict

import numpy as np

# bnpy helpers (import paths follow bnpy's module layout):
import bnpy.deletemove.DLogger as DLogger
from bnpy.viz.PrintTopics import vec2str
def selectCandidateDeleteComps(
        hmodel, SS,
        MoveRecordsByUID=None,
        MovePlans=None,
        lapFrac=0,
        **DArgs):
    ''' Select specific comps to target with delete move.

    Returns
    -------
    MovePlans : dict, with fields
    * d_targetUIDs : list of ints
    * d_absorbingUIDSet : set of ints, all uids that can absorb target mass
    OR
    * failMsg : string explaining why building list of eligible UIDs failed
    '''
    DLogger.pprint("PLANNING delete at lap %.2f" % (lapFrac))
    K = SS.K

    availableUIDs = set(SS.uids)
    if len(availableUIDs) < 2:
        DLogger.pprint(
            "Delete proposal requires at least 2 available UIDs.\n" + \
            "   Need 1 uid to target, and at least 1 to absorb.\n" + \
            "   Only have %d total uids in the model." % (len(availableUIDs)))
        failMsg = "Ineligible. Did not find >= 2 UIDs in entire model."
        return dict(failMsg=failMsg)

    uidsBusyWithOtherMoves = set()
    # Disabled below: logic that would exclude uids already claimed
    # by a pending merge pair or shortlisted for a birth move.
    '''
    if 'm_UIDPairs' in MovePlans:
        for (uidA, uidB) in MovePlans['m_UIDPairs']:
            availableUIDs.discard(uidA)
            availableUIDs.discard(uidB)
            uidsBusyWithOtherMoves.add(uidA)
            uidsBusyWithOtherMoves.add(uidB)
    if 'b_shortlistUIDs' in MovePlans:
        for uid in MovePlans['b_shortlistUIDs']:
            availableUIDs.discard(uid)
            uidsBusyWithOtherMoves.add(uid)

    if len(availableUIDs) < 2:
        DLogger.pprint("Delete requires at least 2 UIDs" + \
            " not occupied by merge or birth.\n" + \
            "   Need 1 uid to target, and at least 1 to absorb.\n" + \
            "   Only have %d total uids eligible." % (len(availableUIDs)))
        failMsg = "Ineligible. Too many uids occupied by merge or shortlisted for birth."
        return dict(failMsg=failMsg)
    '''

    # Compute the size of each state; floor at 1e-100 so empty comps
    # report a tiny positive count instead of exactly zero.
    countVec = np.maximum(SS.getCountVec(), 1e-100)
    eligibleUIDs = list()
    tooBigUIDs = list()
    failRecordUIDs = list()
    nFailRecord = 0
    nReactivated = 0
    for uid in availableUIDs:
        k = SS.uid2k(uid)
        size = countVec[k]
        if uid not in MoveRecordsByUID:
            MoveRecordsByUID[uid] = defaultdict(int)

        # Skip ahead if this cluster is too big
        if size > DArgs['d_maxNumAtomsForTargetComp']:
            tooBigUIDs.append(uid)
            continue
        # Avoid comps we've failed deleting in the past
        # unless they have changed by a reasonable amount
        # or enough laps have passed to try again
        lapsSinceLastTry = lapFrac - MoveRecordsByUID[uid]['d_latestLap']
        hasFailedRecently = MoveRecordsByUID[uid]['d_nFailRecent'] > 0
        oldsize = MoveRecordsByUID[uid]['d_latestCount']
        if oldsize > 0 and hasFailedRecently:
            nFailRecord += 1
            sizePercDiff = np.abs(size - oldsize)/(1e-100 + np.abs(oldsize))
            if sizePercDiff > DArgs['d_minPercChangeInNumAtomsToReactivate']:
                nReactivated += 1
            elif DArgs['d_nLapToReactivate'] > 0 \
                    and lapsSinceLastTry > DArgs['d_nLapToReactivate']:
                nReactivated += 1
            else:
                failRecordUIDs.append(uid)
                continue
        # If we make it here, the uid is eligible
        eligibleUIDs.append(uid)

    # Log which uids are busy with other moves (birth/merge)
    msg = "%d/%d UIDs busy with other moves (birth/merge)" % (
       len(uidsBusyWithOtherMoves), K)
    DLogger.pprint(msg)
    if len(uidsBusyWithOtherMoves) > 0:
        DLogger.pprint(
            '  ' + vec2str(uidsBusyWithOtherMoves), 'debug')

    msg = "%d/%d UIDs too large [--d_maxNumAtomsForTargetComp %.2f]" % (
            len(tooBigUIDs), K, DArgs['d_maxNumAtomsForTargetComp'])
    DLogger.pprint(msg)
    if len(tooBigUIDs) > 0:
        DLogger.pprint(
            '  ' + vec2str(tooBigUIDs), 'debug')

    # Log which uids were marked as having a failure record.
    msg = '%d/%d UIDs un-deleteable for past failures. %d reactivated.' % (
        len(failRecordUIDs), K, nReactivated)
    DLogger.pprint(msg)
    if len(failRecordUIDs) > 0:
        DLogger.pprint(
            '  ' + vec2str(failRecordUIDs), 'debug')
    # Log all remaining eligible uids
    msg = '%d/%d UIDs eligible for targeted delete proposal' % (
        len(eligibleUIDs), K)
    DLogger.pprint(msg)
    if len(eligibleUIDs) == 0:
        failMsg = ("Empty plan. 0 UIDs eligible as delete target." + \
            " %d too busy with other moves." + \
            " %d too big." + \
            " %d have past failures.") % (
                len(uidsBusyWithOtherMoves),
                len(tooBigUIDs),
                len(failRecordUIDs))
        return dict(failMsg=failMsg)

    # Log count statistics for each uid
    eligibleCountVec = [countVec[SS.uid2k(u)] for u in eligibleUIDs]
    DLogger.pprint(
        ' uid   ' + vec2str(eligibleUIDs), 'debug')
    DLogger.pprint(
        ' count ' + vec2str(eligibleCountVec), 'debug')

    # Select the single state to target:
    # take the eligible comp with the largest count.

    targetUID = eligibleUIDs[np.argmax(eligibleCountVec)]
    MovePlans['d_targetUIDs'] = [targetUID]

    # Determine all comps eligible to receive its transfer mass
    absorbUIDset = set(eligibleUIDs)
    absorbUIDset.discard(targetUID)
    absorbUIDset.update(tooBigUIDs)
    absorbUIDset.update(failRecordUIDs)
    MovePlans['d_absorbingUIDSet'] = absorbUIDset

    DLogger.pprint('Selecting one single state to target.')
    DLogger.pprint('targetUID ' + str(targetUID))
    DLogger.pprint('absorbingUIDs: ' + vec2str(absorbUIDset))
    return MovePlans
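The core of the planner is its eligibility filter: skip comps that are too large, skip comps with a recent failed delete unless their size changed enough (or enough laps passed to retry), then target the largest eligible comp. A self-contained sketch of that heuristic on toy data; the names, thresholds, and record format here are illustrative, not bnpy's API:

import numpy as np

def pick_delete_target(counts, fail_lap, last_count, cur_lap,
                       max_size=100.0, min_perc_change=0.15, laps_to_retry=10):
    # counts: current size of each comp.
    # fail_lap / last_count: per-comp records of the last failed delete
    # (-1 / 0.0 when no failure has been recorded).
    eligible = []
    for k, size in enumerate(counts):
        if size > max_size:
            continue  # too big to delete
        if last_count[k] > 0:
            perc_diff = abs(size - last_count[k]) / (1e-100 + abs(last_count[k]))
            recently_failed = (cur_lap - fail_lap[k]) <= laps_to_retry
            if perc_diff <= min_perc_change and recently_failed:
                continue  # failed before and nothing has changed: skip
        eligible.append(k)
    if not eligible:
        return None
    # Target the largest eligible comp; the rest may absorb its mass.
    return max(eligible, key=lambda k: counts[k])

counts = np.asarray([50.0, 3.0, 120.0, 8.0])
fail_lap = [-1, 2, -1, -1]
last_count = [0.0, 2.9, 0.0, 0.0]
print(pick_delete_target(counts, fail_lap, last_count, cur_lap=5))  # -> 0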
Example 3
import os

import bnpy
# Log, createUniqueRandomSeed, loadData, and the other helpers below are
# module-level names in bnpy's Run module, where this function is defined.
def _run_task_internal(jobname, taskid, nTask, ReqArgs, KwArgs, UnkArgs,
                       dataName, allocModelName, obsModelName, algName,
                       doSaveToDisk, doWriteStdOut):
    """ Internal method (should never be called by end-user!)
        Executes learning for a particular job and particular taskid.

        Returns
        -------
        hmodel : bnpy HModel, fit to the data
        LP : Local parameter (LP) dict for the specific dataset
        RunInfo : dict of information about the run, with fields
        - 'loss' : final loss value for algorithm
        - 'loss_history' : vector of loss values over time
    """
    # Make shallow copies of the input dicts so top-level edits here do not
    # propagate back to the caller. Note dict(**d) copies only the top level:
    # nested dicts (e.g. KwArgs[algName]) are still shared with the caller.
    ReqArgs = dict(**ReqArgs)
    KwArgs = dict(**KwArgs)
    UnkArgs = dict(**UnkArgs)

    # Derive distinct per-task seeds: one for the algorithm's own randomness,
    # one for the order in which data batches are traversed.
    algseed = createUniqueRandomSeed(jobname, taskID=taskid)
    dataorderseed = createUniqueRandomSeed('', taskID=taskid)
    KwArgs[algName]['algseed'] = algseed
    KwArgs[algName]['dataorderseed'] = dataorderseed

    if algName in OnlineDataAlgSet:
        KwArgs[algName]['nLap'] = KwArgs['OnlineDataPrefs']['nLap']

    if isinstance(dataName, str):
        if os.path.exists(dataName):
            # dataName is a path to many data files on disk
            Data, InitData = loadDataIteratorFromDisk(dataName, ReqArgs,
                                                      KwArgs, dataorderseed)
            DataArgs = UnkArgs
            # Set the short name for this dataset,
            # so that the filepath for results is informative.
            if not hasattr(Data, 'name'):
                try:
                    Data.name = KwArgs['OnlineDataPrefs']['datasetName']
                except KeyError:
                    Data.name = 'UnknownDatasetName'
        else:
            DataArgs = getKwArgsForLoadData(ReqArgs, UnkArgs, KwArgs)
            Data, InitData = loadData(ReqArgs, KwArgs, DataArgs, dataorderseed)
    else:
        Data = dataName
        InitData = dataName
        DataArgs = dict()
        assert isinstance(Data, bnpy.data.DataObj)
        if algName in OnlineDataAlgSet:
            OnlineDataArgs = KwArgs['OnlineDataPrefs']
            OnlineDataArgs['dataorderseed'] = dataorderseed

            DataArgs = getKwArgsForLoadData(Data, UnkArgs)
            OnlineDataArgs.update(DataArgs)  # add custom args
            Data = Data.to_iterator(**OnlineDataArgs)
    if hasattr(Data, 'name'):
        ReqArgs['dataName'] = Data.name
    if doSaveToDisk:
        task_output_path = make_task_output_path(ReqArgs,
                                                 KwArgs,
                                                 taskID=taskid)
        createEmptyOutputPathOnDisk(task_output_path)
        writeArgsToFile(ReqArgs, KwArgs, task_output_path, UnkArgs)
    else:
        task_output_path = None
    KwArgs['OutputPrefs']['task_output_path'] = task_output_path
    jobID = configLoggingToConsoleAndFile(task_output_path, taskid,
                                          doSaveToDisk, doWriteStdOut)

    # Write descriptions to the log
    if taskid == 1 or jobID > 0:
        # Warn user about any unknown keyword arguments
        showWarningForUnknownArgs(UnkArgs, DataArgs)

        Log.info('Dataset Summary:')
        Log.info(Data.get_text_summary())
        Log.info(Data.get_stats_summary())

    # Create and initialize model parameters
    hmodel = make_initialized_model(
        InitData,
        seed=algseed,
        taskid=taskid,
        allocModelName=ReqArgs['allocModelName'],
        obsModelName=ReqArgs['obsModelName'],
        algName=ReqArgs['algName'],
        KwArgs=KwArgs,
        verbose=(taskid == 1 or jobID > 0),
    )

    # Create learning algorithm
    learnAlg = createLearnAlg(Data,
                              hmodel,
                              ReqArgs,
                              KwArgs,
                              algseed=algseed,
                              task_output_path=task_output_path)
    if learnAlg.hasMove('birth'):
        import bnpy.birthmove.BLogger as BirthLogger
        BirthLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('delete'):
        import bnpy.deletemove.DLogger as DeleteLogger
        DeleteLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('merge'):
        import bnpy.mergemove.MLogger as MergeLogger
        MergeLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('shuffle'):
        import bnpy.mergemove.SLogger as SLogger
        SLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if str(type(hmodel.allocModel)).count('TopicModel'):
        import bnpy.allocmodel.topics.LocalStepLogger as LocalStepLogger
        LocalStepLogger.configure(task_output_path, doSaveToDisk,
                                  doWriteStdOut)

    # Set up logging for how long each step of the alg takes.
    import bnpy.learnalg.ElapsedTimeLogger as ElapsedTimeLogger
    ElapsedTimeLogger.configure(task_output_path, KwArgs['MoveNames'],
                                doSaveToDisk, doWriteStdOut)

    Log.info(
        'Learn Alg: %s | task %2d/%d | alg. seed: %d | data order seed: %d' %
        (algName, taskid, nTask, algseed, dataorderseed))
    Log.info('task_output_path: %s' % (task_output_path))

    # Fit the model to the data!
    RunInfo = learnAlg.fit(hmodel, Data)
    RunInfo['UnkArgs'] = UnkArgs
    RunInfo['KwArgs'] = KwArgs
    RunInfo['ReqArgs'] = ReqArgs
    return hmodel, RunInfo
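A subtlety in the copying above: dict(**d) is shallow, so writes into nested dicts (as with KwArgs[algName]['algseed'] = algseed) still reach the caller's objects. A short sketch of the difference, using a hypothetical 'VB' entry; this is plain Python semantics, independent of bnpy:

import copy

KwArgs = {'VB': {'nLap': 5}}

shallow = dict(**KwArgs)
shallow['VB']['algseed'] = 42   # writes through the shared nested dict
print(KwArgs['VB']['algseed'])  # -> 42

deep = copy.deepcopy(KwArgs)
deep['VB']['nLap'] = 999        # fully isolated copy
print(KwArgs['VB']['nLap'])     # -> 5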