Example #1
0
 def set_random_seed_at_lap(self, lap):
   ''' Re-create self.PRNG with a seed derived from the current lap.

       The new seed is self.seed + int(lap), so a given run and lap
       always produce the same generator; this lets a run be reproduced
       from a later lap without replaying the earlier ones.

       The generator is replaced only when lap is evenly divisible
       by 1.0 (as judged by isEvenlyDivisibleFloat); for any other
       value of lap, self.PRNG is left untouched.
   '''
   if not isEvenlyDivisibleFloat(lap, 1.0):
     return
   lapSeed = self.seed + int(lap)
   self.PRNG = np.random.RandomState(lapSeed)
Example #2
0
  def save_state(self, hmodel, iterid, lap, evBound, doFinal=False):
    ''' Record the evidence trace and, at checkpoints, the full model.

    Two independent schedules are consulted:
      * outputParams['traceEvery'] : append lap, evidence, observation
        count and elapsed time (plus K when birth or merge moves are
        active) to plain-text files created via self.mkfile.
      * outputParams['saveEvery'] : write hmodel's parameters to
        self.savedir via ModelWriter.

    The first three iterations (iterid < 3) and any call with
    doFinal=True count as checkpoints for both schedules. Each lap is
    traced at most once and each iterid is saved at most once.
    Nothing is written to disk when self.savedir is None.
    '''
    traceStep = self.outputParams['traceEvery']
    if traceStep <= 0:
      traceStep = -1
    atTraceLap = isEvenlyDivisibleFloat(lap, traceStep) or iterid < 3
    wantTrace = doFinal or atTraceLap

    if traceStep > 0 and wantTrace and lap not in self.TraceLaps:
      # The in-memory trace is kept even when nothing goes to disk
      self.evTrace.append(evBound)
      self.TraceLaps.add(lap)

      if self.savedir is None:
        return

      # Each value is wrapped in a callable so it is computed only
      #  after its file has been opened, in the order listed here.
      traceFiles = (
        ('laps.txt', '%.4f\n', lambda: lap),
        ('evidence.txt', '%.9e\n', lambda: evBound),
        ('nObs.txt', '%d\n', lambda: self.nObsProcessed),
        ('times.txt', '%.3f\n', lambda: self.get_elapsed_time()),
      )
      for fname, fmt, getValue in traceFiles:
        with open(self.mkfile(fname), 'a') as f:
          f.write(fmt % (getValue()))

      # Number of components is only tracked when it can change
      if self.hasMove('birth') or self.hasMove('merge'):
        with open(self.mkfile('K.txt'), 'a') as f:
          f.write('%d\n' % (hmodel.obsModel.K))

    saveStep = self.outputParams['saveEvery']
    if saveStep <= 0 or self.savedir is None:
      return

    atSaveLap = isEvenlyDivisibleFloat(lap, saveStep) or iterid < 3
    if not (doFinal or atSaveLap):
      return
    if iterid in self.SavedIters:
      return

    self.SavedIters.add(iterid)
    with open(self.mkfile('laps-saved-params.txt'), 'a') as f:
      f.write('%.4f\n' % (lap))
    lapPrefix = ModelWriter.makePrefixForLap(lap)
    # Prior info is written only for the very first iteration
    ModelWriter.save_model(hmodel, self.savedir, lapPrefix,
                           doSavePriorInfo=(iterid < 1), doLinkBest=True)
Example #3
0
    def set_random_seed_at_lap(self, lap):
        ''' Reseed the internal random generator for the given lap.

        The seed combines the seed attribute (unique to this run) with
        the provided lap, as self.seed + int(lap). Because the seed is
        a deterministic function of run and lap, exact values from this
        run can be reproduced later without starting over.

        Nothing happens unless lap is evenly divisible by 1.0,
        as judged by isEvenlyDivisibleFloat.

        Post Condition
        ------
        self.PRNG reset to a new RandomState when lap qualifies.
        '''
        if not isEvenlyDivisibleFloat(lap, 1.0):
            return
        seedForLap = self.seed + int(lap)
        self.PRNG = np.random.RandomState(seedForLap)
Example #4
0
 def isSaveParamsCheckpoint(self, lap, nMstepUpdates):
     ''' Decide whether the full model should be saved at this lap.

     Side effect: when outputParams['saveEveryLogScaleFactor'] is
     positive, outputParams['saveEvery'] may be enlarged in place.
     It becomes the largest of sE, sE**s and sE*s once lap has reached
     that value, and is rounded up with np.ceil whenever lap > 1.0.

     Returns
     -------
     False when saving is disabled (saveEvery <= 0 or no
     task_output_path). Otherwise a truthy value if any of these hold:
     lap is a multiple of saveEvery; lap is a whole number no larger
     than outputParams['saveEarly']; nMstepUpdates < 3; or lap is
     (numerically) one of 1, 2, 4, 8.
     '''
     params = self.outputParams
     logScale = params['saveEveryLogScaleFactor']
     baseEvery = params['saveEvery']
     if logScale > 0:
         grownEvery = np.maximum(
             np.maximum(baseEvery, baseEvery**logScale),
             baseEvery * logScale)
         if lap >= grownEvery:
             params['saveEvery'] = grownEvery
         if lap > 1.0:
             params['saveEvery'] = np.ceil(params['saveEvery'])

     saveEvery = params['saveEvery']
     if saveEvery <= 0 or self.task_output_path is None:
         return False

     return (
         isEvenlyDivisibleFloat(lap, saveEvery)
         or (isEvenlyDivisibleFloat(lap, 1.0)
             and lap <= params['saveEarly'])
         or nMstepUpdates < 3
         or any(np.allclose(lap, earlyLap)
                for earlyLap in (1.0, 2.0, 4.0, 8.0))
     )
def preselectPairs(curModel,
                   SS,
                   lapFrac,
                   mergePairSelection='wholeELBO',
                   prevScoreMat=None,
                   mergeScoreRefreshInterval=10,
                   mergeMaxDegree=5,
                   **kwargs):
    ''' Build a ranked list of candidate component pairs for merging.

    The score matrix is rebuilt from zeros for all pairs when no
    previous matrix is given or when lapFrac is evenly divisible by
    mergeScoreRefreshInterval; otherwise prevScoreMat is updated.

    Pairs scoring above -ELBO_GAP_ACCEPT_TOL are split in two groups
    according to SS.getCountVec(): pairs where both components have
    count >= 25, and pairs where at least one count is below 25.
    Within each group, pairs are chosen by
    selectPairsUsingAtMostNOfEachComp and sorted by decreasing score.

    Returns
    -------
    pairs : list, "both big" pairs followed by "tiny" pairs
    ScoreMat : 2D array of size K x K, the updated score matrix
    '''
    needRefresh = isEvenlyDivisibleFloat(lapFrac, mergeScoreRefreshInterval)
    startFresh = prevScoreMat is None or needRefresh
    if startFresh:
        baseMat, doAllPairs = np.zeros((SS.K, SS.K)), 1
    else:
        assert prevScoreMat.shape[0] == SS.K
        baseMat, doAllPairs = prevScoreMat, 0
    ScoreMat = updateScoreMat_wholeELBO(baseMat, curModel, SS, doAllPairs)

    def rankByScore(pairs):
        # Order pairs from highest to lowest score in ScoreMat
        scores = np.asarray([ScoreMat[a, b] for (a, b) in pairs])
        return [pairs[pos] for pos in np.argsort(-1 * scores)]

    isPromising = ScoreMat > -ELBO_GAP_ACCEPT_TOL
    isTiny = SS.getCountVec() < 25
    # Entry [i, j] marks pairs where component i or component j is tiny
    anyTiny = np.add(isTiny, isTiny[:, np.newaxis])
    promisingTiny = np.logical_and(isPromising, anyTiny)
    promisingBig = np.logical_and(isPromising, 1 - anyTiny)

    # Track big pairs first; limiting the degree keeps any single
    # component from appearing in too many tracked pairs
    pairsBig = rankByScore(
        selectPairsUsingAtMostNOfEachComp(promisingBig, N=mergeMaxDegree))

    # Tiny pairs are selected given the big pairs already chosen
    pairsTiny = rankByScore(
        selectPairsUsingAtMostNOfEachComp(promisingTiny,
                                          pairsBig,
                                          N=mergeMaxDegree,
                                          Nextra=2))
    return pairsBig + pairsTiny, ScoreMat
Example #6
0
    def print_state(self,
                    hmodel,
                    iterid,
                    lap,
                    evBound,
                    doFinal=False,
                    status='',
                    rho=None):
        ''' Log a one-line progress summary for the current iteration.

        Does nothing when outputParams['printEvery'] <= 0. Otherwise
        the summary is logged if this is one of the first three
        iterations, lap is a multiple of printEvery, or doFinal is
        set, and only if this iterid has not been printed before.
        With doFinal set, a closing line containing status is also
        logged.

        Always returns None.
        '''
        everyNLaps = self.outputParams['printEvery']
        if everyNLaps <= 0:
            return None
        atPrintLap = iterid < 3 or isEvenlyDivisibleFloat(lap, everyNLaps)

        rhoStr = '' if rho is None else ('%.4f |' % (rho))
        # Whole-number format only when lap coincides with iterid
        lapStr = ('%7d' % (lap)) if iterid == lap else ('%7.3f' % (lap))
        maxLapStr = '%d' % (self.algParams['nLap'] +
                            self.algParams['startLap'])

        template = '  %s/%s after %6.0f sec. | K %4d | ev % .9e %s'
        # Memoized algorithms get an asterisk before "ev" until
        #  one full pass thru the data has been completed
        isMemoAlg = self.__class__.__name__.count('Memo') > 0
        if isMemoAlg and lap < self.algParams['startLap'] + 1.0:
            template = '  %s/%s after %6.0f sec. | K %4d |*ev % .9e %s'

        fields = (lapStr, maxLapStr, self.get_elapsed_time(),
                  hmodel.allocModel.K, evBound, rhoStr)
        logmsg = template % fields

        if doFinal or atPrintLap:
            if iterid not in self.PrintIters:
                self.PrintIters.add(iterid)
                Log.info(logmsg)
        if doFinal:
            Log.info('... done. %s' % (status))
Example #7
0
  def print_state(self, hmodel, iterid, lap, evBound, doFinal=False, status='', rho=None):
    ''' Write a one-line progress report to the log when one is due.

    A report is due during the first three iterations, whenever lap
    is a multiple of outputParams['printEvery'], or when doFinal is
    set; each iterid is reported at most once. With doFinal set,
    a closing line containing status is logged as well.
    Printing is disabled entirely when printEvery <= 0.

    Always returns None.
    '''
    interval = self.outputParams['printEvery']
    if interval <= 0:
      return None
    isDue = iterid < 3 or isEvenlyDivisibleFloat(lap, interval)

    rhoStr = '%.4f |' % (rho) if rho is not None else ''

    # Whole-number format only when lap coincides with iterid
    lapFmt = '%7d' if iterid == lap else '%7.3f'
    lapStr = lapFmt % (lap)

    lastLap = self.algParams['nLap'] + self.algParams['startLap']
    maxLapStr = '%d' % (lastLap)

    # Memoized algorithms mark the evidence with an asterisk
    #  until one full pass thru the data has been made
    inFirstMemoLap = False
    if self.__class__.__name__.count('Memo') > 0:
      inFirstMemoLap = lap < self.algParams['startLap'] + 1.0
    if inFirstMemoLap:
      logmsg = '  %s/%s after %6.0f sec. | K %4d |*ev % .9e %s'
    else:
      logmsg = '  %s/%s after %6.0f sec. | K %4d | ev % .9e %s'

    elapsed = self.get_elapsed_time()
    nComp = hmodel.allocModel.K
    logmsg = logmsg % (lapStr, maxLapStr, elapsed, nComp, evBound, rhoStr)

    if (doFinal or isDue) and iterid not in self.PrintIters:
      self.PrintIters.add(iterid)
      Log.info(logmsg)
    if doFinal:
      Log.info('... done. %s' % (status))
Example #8
0
    def fit(self, hmodel, DataIterator):
        ''' Run moVB learning algorithm, fit parameters of hmodel to Data,
          traversed one batch at a time from DataIterator

        Each pass of the main loop handles one batch and performs, in
        order: M step (global update from SS), optional birth moves,
        optional preselection of merge candidates, E step (local params
        for the batch), update of the aggregated suff stats SS, evidence
        calculation, and an optional merge move when lapFrac is evenly
        divisible by 1. The loop ends when DataIterator has no more
        batches or when the convergence check breaks out early.

        Args
        --------
        hmodel : model whose global parameters get updated
        DataIterator : provider of batches; this method uses its
              nBatch, batchID, has_next_batch() and get_next_batch(),
              and sets lapID / curLapPos when resuming a run

        Returns
        --------
        LP : None type, cannot fit all local params in memory
        Info : run information built by self.buildRunInfo from
              evBound : final ELBO evidence bound
              status : str message indicating reason for termination
                        {'converged.', 'max passes thru data exceeded.'}
        '''
        # Define how much of data we see at each mini-batch
        nBatch = float(DataIterator.nBatch)
        self.lapFracInc = 1.0 / nBatch
        # Set-up progress-tracking variables
        iterid = -1
        lapFrac = np.maximum(0, self.algParams['startLap'] - 1.0 / nBatch)
        if lapFrac > 0:
            # When restarting an existing run,
            #  need to start with last update for final batch from previous lap
            DataIterator.lapID = int(np.ceil(lapFrac)) - 1
            # NOTE(review): nBatch is a float here, so curLapPos becomes
            #  a float too -- confirm DataIterator accepts that
            DataIterator.curLapPos = nBatch - 2
            iterid = int(nBatch * lapFrac) - 1

        # memoLPkeys : keep list of params that should be retained across laps
        self.memoLPkeys = hmodel.allocModel.get_keys_for_memoized_local_params(
        )
        # Merge candidates; stays None unless preselection runs below
        mPairIDs = None

        BirthPlans = list()
        # NOTE(review): BirthResults starts as None and is extended in
        #  place in the current-batch birth branch below. If the first
        #  iteration reaches that branch without the planned-data branch
        #  assigning it first, .extend would fail -- confirm
        BirthResults = None
        prevBirthResults = None

        # SS : aggregated suff stats over all batches seen so far
        SS = None
        isConverged = False
        prevBound = -np.inf
        self.set_start_time_now()
        while DataIterator.has_next_batch():

            # Grab new data
            Dchunk = DataIterator.get_next_batch()
            batchID = DataIterator.batchID

            # Update progress-tracking variables
            #  lapFrac counts batches processed, including this one,
            #  in units of laps
            iterid += 1
            lapFrac = (iterid + 1) * self.lapFracInc
            self.set_random_seed_at_lap(lapFrac)

            # M step
            #  skipped while SS is still None; with doFullPassBeforeMstep
            #  it is also skipped until lapFrac exceeds 1.0
            if self.algParams['doFullPassBeforeMstep']:
                if SS is not None and lapFrac > 1.0:
                    hmodel.update_global_params(SS)
            else:
                if SS is not None:
                    hmodel.update_global_params(SS)

            # Birth move : track birth info from previous lap
            if self.isFirstBatch(lapFrac):
                if self.hasMove('birth') and self.do_birth_at_lap(lapFrac -
                                                                  1.0):
                    prevBirthResults = BirthResults
                else:
                    prevBirthResults = list()

            # Birth move : create new components
            if self.hasMove('birth') and self.do_birth_at_lap(lapFrac):
                if self.doBirthWithPlannedData(lapFrac):
                    hmodel, SS, BirthResults = self.birth_create_new_comps(
                        hmodel, SS, BirthPlans)

                if self.doBirthWithDataFromCurrentBatch(lapFrac):
                    hmodel, SS, BirthRes = self.birth_create_new_comps(
                        hmodel, SS, Data=Dchunk)
                    BirthResults.extend(BirthRes)

                self.BirthCompIDs = self.birth_get_all_new_comps(BirthResults)
                self.ModifiedCompIDs = self.birth_get_all_modified_comps(
                    BirthResults)
            else:
                BirthResults = list()
                self.BirthCompIDs = list()  # no births = no new components
                self.ModifiedCompIDs = list()

            # Select which components to merge
            #  done once per lap (on its first batch), and only when
            #  not every pair is considered (doAllPairs is off)
            if self.hasMove(
                    'merge') and not self.algParams['merge']['doAllPairs']:
                if self.isFirstBatch(lapFrac):
                    if self.hasMove('birth'):
                        compIDs = self.BirthCompIDs
                    else:
                        compIDs = []
                    mPairIDs = MergeMove.preselect_all_merge_candidates(
                        hmodel,
                        SS,
                        randstate=self.PRNG,
                        compIDs=compIDs,
                        **self.algParams['merge'])

            # E step
            #  reuse stored local params for this batch when available
            if batchID in self.LPmemory:
                oldLPchunk = self.load_batch_local_params_from_memory(
                    batchID, prevBirthResults)
                LPchunk = hmodel.calc_local_params(Dchunk, oldLPchunk,
                                                   **self.algParamsLP)
            else:
                LPchunk = hmodel.calc_local_params(Dchunk, **self.algParamsLP)

            # Collect target data for birth
            if self.hasMove('birth') and self.do_birth_at_lap(lapFrac + 1.0):
                if self.isFirstBatch(lapFrac):
                    BirthPlans = self.birth_select_targets_for_next_lap(
                        hmodel, SS, BirthResults)
                BirthPlans = self.birth_collect_target_subsample(
                    Dchunk, LPchunk, BirthPlans)
            else:
                BirthPlans = list()

            # Suff Stat step
            #  remove this batch's stored contribution from SS
            #  before adding the freshly computed one
            if batchID in self.SSmemory:
                SSchunk = self.load_batch_suff_stat_from_memory(batchID, SS.K)
                SS -= SSchunk

            SSchunk = hmodel.get_global_suff_stats(
                Dchunk,
                LPchunk,
                doPrecompEntropy=True,
                doPrecompMergeEntropy=self.hasMove('merge'),
                mPairIDs=mPairIDs,
            )

            if SS is None:
                SS = SSchunk.copy()
            else:
                assert SSchunk.K == SS.K
                SS += SSchunk

            # Store batch-specific stats to memory
            if self.algParams['doMemoizeLocalParams']:
                self.save_batch_local_params_to_memory(batchID, LPchunk)
            self.save_batch_suff_stat_to_memory(batchID, SSchunk)

            # Handle removing "extra mass" of fresh components
            #  to make SS have size exactly consistent with entire dataset
            if self.hasMove('birth') and self.isLastBatch(lapFrac):
                hmodel, SS = self.birth_remove_extra_mass(
                    hmodel, SS, BirthResults)

            # ELBO calc
            #self.verify_suff_stats(Dchunk, SS, lapFrac)
            evBound = hmodel.calc_evidence(SS=SS)

            # Merge move!
            #  attempted only when lapFrac is evenly divisible by 1
            if self.hasMove('merge') and isEvenlyDivisibleFloat(lapFrac, 1.):
                hmodel, SS, evBound = self.run_merge_move(
                    hmodel, SS, evBound, mPairIDs)

            # Save and display progress
            self.add_nObs(Dchunk.nObs)
            self.save_state(hmodel, iterid, lapFrac, evBound)
            self.print_state(hmodel, iterid, lapFrac, evBound)
            self.eval_custom_func(hmodel, iterid, lapFrac)

            # Check for Convergence!
            #  evBound will increase monotonically AFTER first lap of the data
            #  verify_evidence will warn if bound isn't increasing monotonically
            #  Early exit requires lapFrac > 5 and no birth moves
            if lapFrac > self.algParams['startLap'] + 1.0:
                isConverged = self.verify_evidence(evBound, prevBound, lapFrac)
                if isConverged and lapFrac > 5 and not self.hasMove('birth'):
                    break
            prevBound = evBound

        # Finally, save, print and exit
        if isConverged:
            msg = "converged."
        else:
            msg = "max passes thru data exceeded."
        # NOTE(review): evBound is assigned only inside the loop; if
        #  DataIterator yields no batches these calls raise NameError
        self.save_state(hmodel, iterid, lapFrac, evBound, doFinal=True)
        self.print_state(hmodel,
                         iterid,
                         lapFrac,
                         evBound,
                         doFinal=True,
                         status=msg)
        return None, self.buildRunInfo(evBound, msg)
  def fit(self, hmodel, DataIterator):
    ''' Run moVB learning algorithm, fit parameters of hmodel to Data,
          traversed one batch at a time from DataIterator

        Each pass of the main loop handles one batch and performs, in
        order: M step (global update from SS), optional birth moves,
        optional preselection of merge candidates, E step (local params
        for the batch), update of the aggregated suff stats SS, evidence
        calculation, and an optional merge move when lapFrac is evenly
        divisible by 1. The loop ends when DataIterator has no more
        batches or when the convergence check breaks out early.

        Args
        --------
        hmodel : model whose global parameters get updated
        DataIterator : provider of batches; this method uses its
              nBatch, batchID, has_next_batch() and get_next_batch(),
              and sets lapID / curLapPos when resuming a run

        Returns
        --------
        LP : None type, cannot fit all local params in memory
        Info : run information built by self.buildRunInfo from
              evBound : final ELBO evidence bound
              status : str message indicating reason for termination
                        {'converged.', 'max passes thru data exceeded.'}
    '''
    # Define how much of data we see at each mini-batch
    nBatch = float(DataIterator.nBatch)
    self.lapFracInc = 1.0/nBatch
    # Set-up progress-tracking variables
    iterid = -1
    lapFrac = np.maximum(0, self.algParams['startLap'] - 1.0/nBatch)
    if lapFrac > 0:
      # When restarting an existing run,
      #  need to start with last update for final batch from previous lap
      DataIterator.lapID = int(np.ceil(lapFrac)) - 1
      # NOTE(review): nBatch is a float here, so curLapPos becomes
      #  a float too -- confirm DataIterator accepts that
      DataIterator.curLapPos = nBatch - 2
      iterid = int(nBatch * lapFrac) - 1

    # memoLPkeys : keep list of params that should be retained across laps
    self.memoLPkeys = hmodel.allocModel.get_keys_for_memoized_local_params()
    # Merge candidates; stays None unless preselection runs below
    mPairIDs = None

    BirthPlans = list()
    # NOTE(review): BirthResults starts as None and is extended in
    #  place in the current-batch birth branch below. If the first
    #  iteration reaches that branch without the planned-data branch
    #  assigning it first, .extend would fail -- confirm
    BirthResults = None
    prevBirthResults = None

    # SS : aggregated suff stats over all batches seen so far
    SS = None
    isConverged = False
    prevBound = -np.inf
    self.set_start_time_now()
    while DataIterator.has_next_batch():

      # Grab new data
      Dchunk = DataIterator.get_next_batch()
      batchID = DataIterator.batchID
      
      # Update progress-tracking variables
      #  lapFrac counts batches processed, including this one,
      #  in units of laps
      iterid += 1
      lapFrac = (iterid + 1) * self.lapFracInc
      self.set_random_seed_at_lap(lapFrac)

      # M step
      #  skipped while SS is still None; with doFullPassBeforeMstep
      #  it is also skipped until lapFrac exceeds 1.0
      if self.algParams['doFullPassBeforeMstep']:
        if SS is not None and lapFrac > 1.0:
          hmodel.update_global_params(SS)
      else:
        if SS is not None:
          hmodel.update_global_params(SS)
      
      # Birth move : track birth info from previous lap
      if self.isFirstBatch(lapFrac):
        if self.hasMove('birth') and self.do_birth_at_lap(lapFrac - 1.0):
          prevBirthResults = BirthResults
        else:
          prevBirthResults = list()

      # Birth move : create new components
      if self.hasMove('birth') and self.do_birth_at_lap(lapFrac):
        if self.doBirthWithPlannedData(lapFrac):
          hmodel, SS, BirthResults = self.birth_create_new_comps(
                                            hmodel, SS, BirthPlans)

        if self.doBirthWithDataFromCurrentBatch(lapFrac):
          hmodel, SS, BirthRes = self.birth_create_new_comps(
                                            hmodel, SS, Data=Dchunk)
          BirthResults.extend(BirthRes)

        self.BirthCompIDs = self.birth_get_all_new_comps(BirthResults)
        self.ModifiedCompIDs = self.birth_get_all_modified_comps(BirthResults)
      else:
        BirthResults = list()
        self.BirthCompIDs = list() # no births = no new components
        self.ModifiedCompIDs = list()

      # Select which components to merge
      #  done once per lap (on its first batch), and only when
      #  not every pair is considered (doAllPairs is off)
      if self.hasMove('merge') and not self.algParams['merge']['doAllPairs']:
        if self.isFirstBatch(lapFrac):
          if self.hasMove('birth'):
            compIDs = self.BirthCompIDs
          else:
            compIDs = []
          mPairIDs = MergeMove.preselect_all_merge_candidates(hmodel, SS, 
                           randstate=self.PRNG, compIDs=compIDs,
                           **self.algParams['merge'])

      # E step
      #  reuse stored local params for this batch when available
      if batchID in self.LPmemory:
        oldLPchunk = self.load_batch_local_params_from_memory(
                                           batchID, prevBirthResults)
        LPchunk = hmodel.calc_local_params(Dchunk, oldLPchunk,
                                           **self.algParamsLP)
      else:
        LPchunk = hmodel.calc_local_params(Dchunk, **self.algParamsLP)

      # Collect target data for birth
      if self.hasMove('birth') and self.do_birth_at_lap(lapFrac+1.0):
        if self.isFirstBatch(lapFrac):
          BirthPlans = self.birth_select_targets_for_next_lap(
                                hmodel, SS, BirthResults)
        BirthPlans = self.birth_collect_target_subsample(
                                Dchunk, LPchunk, BirthPlans)
      else:
        BirthPlans = list()

      # Suff Stat step
      #  remove this batch's stored contribution from SS
      #  before adding the freshly computed one
      if batchID in self.SSmemory:
        SSchunk = self.load_batch_suff_stat_from_memory(batchID, SS.K)
        SS -= SSchunk

      SSchunk = hmodel.get_global_suff_stats(Dchunk, LPchunk,
                       doPrecompEntropy=True, 
                       doPrecompMergeEntropy=self.hasMove('merge'),
                       mPairIDs=mPairIDs,
                       )

      if SS is None:
        SS = SSchunk.copy()
      else:
        assert SSchunk.K == SS.K
        SS += SSchunk

      # Store batch-specific stats to memory
      if self.algParams['doMemoizeLocalParams']:
        self.save_batch_local_params_to_memory(batchID, LPchunk)          
      self.save_batch_suff_stat_to_memory(batchID, SSchunk)  

      # Handle removing "extra mass" of fresh components
      #  to make SS have size exactly consistent with entire dataset
      if self.hasMove('birth') and self.isLastBatch(lapFrac):
        hmodel, SS = self.birth_remove_extra_mass(hmodel, SS, BirthResults)

      # ELBO calc
      #self.verify_suff_stats(Dchunk, SS, lapFrac)
      evBound = hmodel.calc_evidence(SS=SS)

      # Merge move!      
      #  attempted only when lapFrac is evenly divisible by 1
      if self.hasMove('merge') and isEvenlyDivisibleFloat(lapFrac, 1.):
        hmodel, SS, evBound = self.run_merge_move(hmodel, SS, evBound, mPairIDs)

      # Save and display progress
      self.add_nObs(Dchunk.nObs)
      self.save_state(hmodel, iterid, lapFrac, evBound)
      self.print_state(hmodel, iterid, lapFrac, evBound)
      self.eval_custom_func(hmodel, iterid, lapFrac)

      # Check for Convergence!
      #  evBound will increase monotonically AFTER first lap of the data 
      #  verify_evidence will warn if bound isn't increasing monotonically
      #  Early exit requires lapFrac > 5 and no birth moves
      if lapFrac > self.algParams['startLap'] + 1.0:
        isConverged = self.verify_evidence(evBound, prevBound, lapFrac)
        if isConverged and lapFrac > 5 and not self.hasMove('birth'):
          break
      prevBound = evBound

    # Finally, save, print and exit
    if isConverged:
      msg = "converged."
    else:
      msg = "max passes thru data exceeded."
    # NOTE(review): evBound is assigned only inside the loop; if
    #  DataIterator yields no batches these calls raise NameError
    self.save_state(hmodel, iterid, lapFrac, evBound, doFinal=True) 
    self.print_state(hmodel, iterid, lapFrac,evBound,doFinal=True,status=msg)
    return None, self.buildRunInfo(evBound, msg)