Example 1
# The original snippet is truncated; this signature is inferred from the
# function body and the argparse options in the __main__ block below.
def tryMergeProposalForSavedTask(
        taskoutpath=None, lap=None, lapFrac=None, batchID=None, **kwargs):
    '''
    Post Condition
    --------------
    * Logging messages are printed.
    * HTML report is saved.
    '''
    if lap is not None:
        lapFrac = lap

    hmodel, lapFrac = loadModelForLap(taskoutpath, lapFrac)
    Data = loadDataFromSavedTask(taskoutpath, batchID=batchID)
    kwargs['LPkwargs'] = loadLPKwargsFromDisk(taskoutpath)

    tryMergeProposalForSpecificTarget(Data, hmodel, **kwargs)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('taskoutpath', type=str)
    parser.add_argument('--lap', type=float, default=None)
    parser.add_argument('--lapFrac', type=float, default=None)
    parser.add_argument('--outputdir', type=str, default='/tmp/')
    parser.add_argument('--kA', type=int, default=0)
    parser.add_argument('--kB', type=int, default=1)
    parser.add_argument('--batchID', type=int, default=None)
    parser.add_argument('--verbose', type=int, default=1)
    parser.add_argument('--doHeuristicUpdateRho', type=int, default=0)
    args = parser.parse_args()

    MLogger.configure(args.outputdir, doSaveToDisk=0, doWriteStdOut=1)
    tryMergeProposalForSavedTask(**args.__dict__)
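
For reference, the same entry point can be driven directly from Python rather
than through the command line. A minimal sketch, assuming a finished bnpy task
directory (the path below is hypothetical):

MLogger.configure('/tmp/', doSaveToDisk=0, doWriteStdOut=1)
tryMergeProposalForSavedTask(
    taskoutpath='/path/to/saved/task/1/',  # hypothetical task directory
    lap=10.0,    # load the model checkpoint saved at lap 10
    kA=0, kB=1)  # forwarded via **kwargs to tryMergeProposalForSpecificTarget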
Example 2
def _run_task_internal(jobname, taskid, nTask, ReqArgs, KwArgs, UnkArgs,
                       dataName, allocModelName, obsModelName, algName,
                       doSaveToDisk, doWriteStdOut):
    """ Internal method (should never be called by end-user!)
        Executes learning for a particular job and particular taskid.

        Returns
        -------
        hmodel : bnpy HModel, fit to the data
        RunInfo : dict of information about the run, with fields
        - 'loss' : final loss value for algorithm
        - 'loss_history' : vector of loss values over time
    """
    # Make shallow copies of the input dicts so that any modifications here
    # do not propagate back to the caller.
    ReqArgs = dict(**ReqArgs)
    KwArgs = dict(**KwArgs)
    UnkArgs = dict(**UnkArgs)

    algseed = createUniqueRandomSeed(jobname, taskID=taskid)
    dataorderseed = createUniqueRandomSeed('', taskID=taskid)
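    # Note: algseed varies with both jobname and taskid, while dataorderseed
    # uses an empty jobname and so depends only on taskid; jobs sharing a
    # task number therefore see the same data ordering.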
    KwArgs[algName]['algseed'] = algseed
    KwArgs[algName]['dataorderseed'] = dataorderseed

    if algName in OnlineDataAlgSet:
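        # Online algorithms take their lap budget from OnlineDataPrefs.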
        KwArgs[algName]['nLap'] = KwArgs['OnlineDataPrefs']['nLap']

    if isinstance(dataName, str):
        if os.path.exists(dataName):
            # dataName is a path to many data files on disk
            Data, InitData = loadDataIteratorFromDisk(dataName, ReqArgs,
                                                      KwArgs, dataorderseed)
            DataArgs = UnkArgs
            # Set the short name for this dataset,
            # so that the filepath for results is informative.
            if not hasattr(Data, 'name'):
                try:
                    Data.name = KwArgs['OnlineDataPrefs']['datasetName']
                except KeyError:
                    Data.name = 'UnknownDatasetName'
        else:
            DataArgs = getKwArgsForLoadData(ReqArgs, UnkArgs, KwArgs)
            Data, InitData = loadData(ReqArgs, KwArgs, DataArgs, dataorderseed)
    else:
        Data = dataName
        InitData = dataName
        DataArgs = dict()
        assert isinstance(Data, bnpy.data.DataObj)
        if algName in OnlineDataAlgSet:
            OnlineDataArgs = KwArgs['OnlineDataPrefs']
            OnlineDataArgs['dataorderseed'] = dataorderseed

            DataArgs = getKwArgsForLoadData(Data, UnkArgs)
            OnlineDataArgs.update(DataArgs)  # add custom args
            Data = Data.to_iterator(**OnlineDataArgs)
    if hasattr(Data, 'name'):
        ReqArgs['dataName'] = Data.name
    if doSaveToDisk:
        task_output_path = make_task_output_path(ReqArgs,
                                                 KwArgs,
                                                 taskID=taskid)
        createEmptyOutputPathOnDisk(task_output_path)
        writeArgsToFile(ReqArgs, KwArgs, task_output_path, UnkArgs)
    else:
        task_output_path = None
    KwArgs['OutputPrefs']['task_output_path'] = task_output_path
    jobID = configLoggingToConsoleAndFile(task_output_path, taskid,
                                          doSaveToDisk, doWriteStdOut)

    # Write descriptions to the log
    if taskid == 1 or jobID > 0:
        # Warn user about any unknown keyword arguments
        showWarningForUnknownArgs(UnkArgs, DataArgs)

        Log.info('Dataset Summary:')
        Log.info(Data.get_text_summary())
        Log.info(Data.get_stats_summary())

    # Create and initialize model parameters
    hmodel = make_initialized_model(
        InitData,
        seed=algseed,
        taskid=taskid,
        allocModelName=ReqArgs['allocModelName'],
        obsModelName=ReqArgs['obsModelName'],
        algName=ReqArgs['algName'],
        KwArgs=KwArgs,
        verbose=(taskid == 1 or jobID > 0),
    )

    # Create learning algorithm
    learnAlg = createLearnAlg(Data,
                              hmodel,
                              ReqArgs,
                              KwArgs,
                              algseed=algseed,
                              task_output_path=task_output_path)
    if learnAlg.hasMove('birth'):
        import bnpy.birthmove.BLogger as BirthLogger
        BirthLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('delete'):
        import bnpy.deletemove.DLogger as DeleteLogger
        DeleteLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('merge'):
        import bnpy.mergemove.MLogger as MergeLogger
        MergeLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if learnAlg.hasMove('shuffle'):
        import bnpy.mergemove.SLogger as SLogger
        SLogger.configure(task_output_path, doSaveToDisk, doWriteStdOut)
    if 'TopicModel' in str(type(hmodel.allocModel)):
        import bnpy.allocmodel.topics.LocalStepLogger as LocalStepLogger
        LocalStepLogger.configure(task_output_path, doSaveToDisk,
                                  doWriteStdOut)

    # Set up logging for how long each step of the alg takes.
    import bnpy.learnalg.ElapsedTimeLogger as ElapsedTimeLogger
    ElapsedTimeLogger.configure(task_output_path, KwArgs['MoveNames'],
                                doSaveToDisk, doWriteStdOut)

    Log.info(
        'Learn Alg: %s | task %2d/%d | alg. seed: %d | data order seed: %d' %
        (algName, taskid, nTask, algseed, dataorderseed))
    Log.info('task_output_path: %s' % (task_output_path))

    # Fit the model to the data!
    RunInfo = learnAlg.fit(hmodel, Data)
    RunInfo['UnkArgs'] = UnkArgs
    RunInfo['KwArgs'] = KwArgs
    RunInfo['ReqArgs'] = ReqArgs
    return hmodel, RunInfo
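
The two returned values can be inspected after a fit. A minimal sketch,
assuming a completed run (the matplotlib plotting is illustrative, not part
of the function above):

# hmodel and RunInfo are the two values returned by _run_task_internal.
import matplotlib.pyplot as plt

print('final loss: %.5f' % RunInfo['loss'])
plt.plot(RunInfo['loss_history'])  # one entry per recorded checkpoint
plt.xlabel('checkpoint')
plt.ylabel('loss')
plt.show()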
Example 3
def selectCandidateMergePairs(hmodel, SS,
        MovePlans=None,
        MoveRecordsByUID=None,
        lapFrac=None,
        m_maxNumPairsContainingComp=3,
        m_minPercChangeInNumAtomsToReactivate=0.01,
        m_nLapToReactivate=10,
        m_pair_ranking_procedure='total_size',
        m_pair_ranking_direction='descending',
        m_pair_ranking_do_exclude_by_thr=0,
        m_pair_ranking_exclusion_thr=-0.000001,
        **kwargs):
    ''' Select candidate pairs to consider for merge move.

    Returns
    -------
    Info : dict, with fields
        * m_UIDPairs : list of tuples, each defining a pair of uids
        * m_targetUIDSet : set of all uids involved in a proposed merge pair
    '''
    # Guard against shared mutable defaults; MoveRecordsByUID is mutated below.
    if MovePlans is None:
        MovePlans = dict()
    if MoveRecordsByUID is None:
        MoveRecordsByUID = dict()
    MLogger.pprint(
        "PLANNING merges at lap %.2f. K=%d" % (lapFrac, SS.K),
        'debug')

    # Mark any targetUIDs used in births as off-limits for merges
    uidUsageCount = defaultdict(int)
    if 'b_shortlistUIDs' in MovePlans:
        for uid in MovePlans['b_shortlistUIDs']:
            uidUsageCount[uid] = 10 * m_maxNumPairsContainingComp
    nDisqualified = len(uidUsageCount)
    MLogger.pprint(
        "   %d/%d UIDs ineligible because on shortlist for births. " % (
            nDisqualified, SS.K),
        'debug')
    if nDisqualified > 0:
        MLogger.pprint(
            "   Ineligible UIDs:" + \
                vec2str(list(uidUsageCount.keys())),
            'debug')

    uid2k = dict()
    uid2count = dict()
    for uid in SS.uids:
        uid2k[uid] = SS.uid2k(uid)
        uid2count[uid] = SS.getCountForUID(uid)

    EligibleUIDPairs = list()
    EligibleAIDPairs = list()
    nPairTotal = 0
    nPairDQ = 0
    nPairBusy = 0
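    # Enumerate all K*(K-1)/2 unordered pairs. The inner enumerate walks
    # SS.uids[kA+1:], so kB = kA + b + 1 is the absolute index of uidB.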
    for kA, uidA in enumerate(SS.uids):
        for b, uidB in enumerate(SS.uids[kA+1:]):
            kB = kA + b + 1
            assert kA < kB
            nPairTotal += 1
            if uidUsageCount[uidA] > 0 or uidUsageCount[uidB] > 0:
                nPairBusy += 1
                continue
            if uidA < uidB:
                uidTuple = (uidA, uidB)
            else:
                uidTuple = (uidB, uidA)
            aidTuple = (kA, kB)

            if uidTuple not in MoveRecordsByUID:
                EligibleUIDPairs.append(uidTuple)
                EligibleAIDPairs.append(aidTuple)
            else:
                pairRecord = MoveRecordsByUID[uidTuple]
                assert pairRecord['m_nFailRecent'] >= 1
                latestMinCount = pairRecord['m_latestMinCount']
                newMinCount = np.minimum(uid2count[uidA], uid2count[uidB])
                percDiff = np.abs(latestMinCount - newMinCount) / \
                    latestMinCount
                if (lapFrac - pairRecord['m_latestLap']) >= m_nLapToReactivate:
                    EligibleUIDPairs.append(uidTuple)
                    EligibleAIDPairs.append(aidTuple)
                    del MoveRecordsByUID[uidTuple]
                elif percDiff >= m_minPercChangeInNumAtomsToReactivate:
                    EligibleUIDPairs.append(uidTuple)
                    EligibleAIDPairs.append(aidTuple)
                    del MoveRecordsByUID[uidTuple]
                else:
                    nPairDQ += 1
    MLogger.pprint(
        "   %d/%d pairs eligible. %d disqualified by past failures." % (
            len(EligibleAIDPairs), nPairTotal, nPairDQ),
        'debug')
    MLogger.pprint(
        "   Prioritizing elible pairs via ranking procedure: %s" % (
            m_pair_ranking_procedure),
        'debug')
    if m_pair_ranking_procedure == 'random':
        A = len(EligibleAIDPairs)
        prng = np.random.RandomState(int(lapFrac))  # seed must be an integer
        rank_scores_per_pair = prng.permutation(np.arange(A))
    elif m_pair_ranking_procedure == 'total_size':
        A = len(EligibleAIDPairs)
        rank_scores_per_pair = np.asarray(
            [SS.getCountForUID(uidA) + SS.getCountForUID(uidB)
                for (uidA, uidB) in EligibleUIDPairs])
    elif 'elbo' in m_pair_ranking_procedure:
        # Compute Ldata gain for each possible pair of comps
        rank_scores_per_pair = hmodel.obsModel.calcHardMergeGap_SpecificPairs(
            SS, EligibleAIDPairs)
        if hasattr(hmodel.allocModel, 'calcHardMergeGap_SpecificPairs'):
            rank_scores_per_pair = \
                rank_scores_per_pair + hmodel.allocModel.calcHardMergeGap_SpecificPairs(
                    SS, EligibleAIDPairs)
        rank_scores_per_pair /= hmodel.obsModel.getDatasetScale(SS)
    else:
        raise ValueError(
            "Unrecognised --m_pair_ranking_procedure: %s" %
                m_pair_ranking_procedure)

    # Find pairs with positive gains
    if m_pair_ranking_direction == 'ascending':
        if m_pair_ranking_do_exclude_by_thr:
            MLogger.pprint(
                "Keeping only uid pairs with score < %.3e" % (
                    m_pair_ranking_exclusion_thr),
                'debug')
            keep_pair_ids = np.flatnonzero(
                rank_scores_per_pair < m_pair_ranking_exclusion_thr)
            ranked_pair_locs = keep_pair_ids[
                np.argsort(rank_scores_per_pair[keep_pair_ids])]
        else:
            ranked_pair_locs = np.argsort(rank_scores_per_pair)
    else:
        if m_pair_ranking_do_exclude_by_thr:
            MLogger.pprint(
                "Keeping only uid pairs with score > %.3e" % (
                    m_pair_ranking_exclusion_thr),
                'debug')
            keep_pair_ids = np.flatnonzero(
                rank_scores_per_pair > m_pair_ranking_exclusion_thr)
            ranked_pair_locs = keep_pair_ids[
                np.argsort(-1 * rank_scores_per_pair[keep_pair_ids])]
        else:
            ranked_pair_locs = np.argsort(-1 * rank_scores_per_pair)

    nKeep = 0
    mUIDPairs = list()
    mAIDPairs = list()
    mGainVals = list()
    for loc in ranked_pair_locs:
        uidA, uidB = EligibleUIDPairs[loc]
        kA, kB = EligibleAIDPairs[loc]
        if uidUsageCount[uidA] >= m_maxNumPairsContainingComp or \
                uidUsageCount[uidB] >= m_maxNumPairsContainingComp:
            continue
        uidUsageCount[uidA] += 1
        uidUsageCount[uidB] += 1

        mAIDPairs.append((kA, kB))
        mUIDPairs.append((uidA, uidB))
        mGainVals.append(rank_scores_per_pair[loc])
        if nKeep == 0:
            MLogger.pprint("Chosen uid pairs:", 'debug')
        MLogger.pprint(
            "%4d, %4d : pair_score %.3e, size %s %s" % (
                uidA, uidB,
                rank_scores_per_pair[loc],
                count2str(uid2count[uidA]),
                count2str(uid2count[uidB]),
                ),
            'debug')
        nKeep += 1
    Info = dict()
    Info['m_UIDPairs'] = mUIDPairs
    Info['m_GainVals'] = mGainVals
    Info['mPairIDs'] = mAIDPairs
    targetUIDs = set()
    for uidA, uidB in mUIDPairs:
        targetUIDs.add(uidA)
        targetUIDs.add(uidB)
        if 'b_shortlistUIDs' in MovePlans:
            for uid in MovePlans['b_shortlistUIDs']:
                assert uid != uidA
                assert uid != uidB
    Info['m_targetUIDSet'] = targetUIDs
    return Info
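
A minimal usage sketch, assuming an hmodel and its aggregated sufficient
statistics SS obtained elsewhere in bnpy (the ranking-procedure name is
illustrative; any string containing 'elbo' selects the ELBO-gap path):

MLogger.configure('/tmp/', doSaveToDisk=0, doWriteStdOut=1)
Info = selectCandidateMergePairs(
    hmodel, SS,
    lapFrac=2.0,
    m_pair_ranking_procedure='obsmodel_elbo',
    m_pair_ranking_direction='descending')
for (uidA, uidB), gain in zip(Info['m_UIDPairs'], Info['m_GainVals']):
    print('merge uids %d + %d : predicted gain %.3e' % (uidA, uidB, gain))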