Example #1
0
def gibbs_doc_mp(model, doc, params=None, callback=None):
    """Runs Gibbs iterations on a single document, by sampling with a prior constructed from each sample in the given Model. params applies to each sample, so should probably be much more limited than usual - the default if its undefined is to use 1 run and 1 sample and a burn in of only 500. Returns a DocModel with all the relevant samples in."""

    # Default params: a single cheap run per model sample, since the prior
    # taken from each sample already carries most of the information...
    if params is None:
        params = Params()
        params.runs = 1
        params.samples = 1
        params.burnIn = 500

    state = State(doc, params)
    dm = DocModel()

    # Create a pool of worker processes - one job per sample in the model...
    pool = mp.Pool()

    # Shared counter that sub-processes increment to report their progress,
    # plus the total iteration count it will eventually reach...
    manager = mp.Manager()
    doneIters = manager.Value('i', 0)
    totalIters = model.sampleCount() * params.runs * (
        params.burnIn + params.samples + (params.samples - 1) * params.lag)

    # Completion callback - folds each finished run's model into dm.
    # (apply_async invokes this on a result-handler thread in this process;
    # dm is only read after pool.join(), so no extra locking is done here.)
    def onComplete(s):
        dm.addFrom(s.getModel())

    # Create all the jobs, then wait for their completion, reporting progress...
    try:
        jobs = []
        for sample in model.sampleList():
            tempState = State(state)
            tempState.setGlobalParams(sample)
            tempState.addPrior(sample)
            jobs.append(
                pool.apply_async(gibbs_run_wrap, (tempState, doneIters),
                                 callback=onComplete))
    finally:
        # Close the pool and poll until all jobs have completed...
        pool.close()
        while jobs:
            if jobs[0].ready():
                del jobs[0]
                continue
            time.sleep(0.01)
            if callback is not None:
                callback(doneIters.value, totalIters)
        pool.join()

    # Return...
    return dm
Example #2
0
def gibbs_doc_mp(model, doc, params = None, callback = None):
  """Runs Gibbs iterations on a single document, by sampling with a prior constructed from each sample in the given Model. params applies to each sample, so should probably be much more limited than usual - the default if its undefined is to use 1 run and 1 sample and a burn in of only 500. Returns a DocModel with all the relevant samples in."""

  # Default params: a single cheap run per model sample, since the prior
  # taken from each sample already carries most of the information...
  if params is None:
    params = Params()
    params.runs = 1
    params.samples = 1
    params.burnIn = 500

  state = State(doc, params)
  dm = DocModel()

  # Create a pool of worker processes - one job per sample in the model...
  pool = mp.Pool()

  # Shared counter that sub-processes increment to report their progress,
  # plus the total iteration count it will eventually reach...
  manager = mp.Manager()
  doneIters = manager.Value('i',0)
  totalIters = model.sampleCount() * params.runs * (params.burnIn + params.samples + (params.samples-1)*params.lag)

  # Completion callback - folds each finished run's model into dm.
  # (apply_async invokes this on a result-handler thread in this process;
  # dm is only read after pool.join(), so no extra locking is done here.)
  def onComplete(s):
    dm.addFrom(s.getModel())

  # Create all the jobs, then wait for their completion, reporting progress...
  try:
    jobs = []
    for sample in model.sampleList():
      tempState = State(state)
      tempState.setGlobalParams(sample)
      tempState.addPrior(sample)
      jobs.append(pool.apply_async(gibbs_run_wrap,(tempState,doneIters), callback = onComplete))
  finally:
    # Close the pool and poll until all jobs have completed...
    pool.close()
    while jobs:
      if jobs[0].ready():
        del jobs[0]
        continue
      time.sleep(0.01)
      if callback is not None: callback(doneIters.value,totalIters)
    pool.join()

  # Return...
  return dm
Example #3
0
def gibbs_doc(model, doc, params = None, callback = None):
  """Runs Gibbs iterations on a single document, by sampling with a prior constructed from each sample in the given Model. params applies to each sample, so should probably be much more limited than usual - the default if its undefined is to use 1 run and 1 sample and a burn in of only 500. Returns a DocModel with all the relevant samples in."""

  # Default params: a single cheap run per model sample, since the prior
  # taken from each sample already carries most of the information...
  if params is None:
    params = Params()
    params.runs = 1
    params.samples = 1
    params.burnIn = 500

  # Single-document state, the output container, and a progress reporter
  # that forwards per-iteration progress to the optional callback...
  state = State(doc, params)
  dm = DocModel()
  reporter = ProgReporter(params,callback,model.sampleCount())

  # Run Gibbs once per sample in the model, each time seeding the state's
  # global parameters and prior from that sample, and collect the results...
  for sample in model.sampleList():
    tempState = State(state)
    tempState.setGlobalParams(sample)
    tempState.addPrior(sample)
    gibbs_run(tempState,reporter.next)
    dm.addFrom(tempState.getModel())

  # Return...
  return dm
Example #4
0
def gibbs_doc(model, doc, params=None, callback=None):
    """Runs Gibbs iterations on a single document, by sampling with a prior constructed from each sample in the given Model. params applies to each sample, so should probably be much more limited than usual - the default if its undefined is to use 1 run and 1 sample and a burn in of only 500. Returns a DocModel with all the relevant samples in."""

    # Default params: a single cheap run per model sample, since the prior
    # taken from each sample already carries most of the information...
    if params is None:
        params = Params()
        params.runs = 1
        params.samples = 1
        params.burnIn = 500

    # Single-document state, the output container, and a progress reporter
    # that forwards per-iteration progress to the optional callback...
    state = State(doc, params)
    dm = DocModel()
    reporter = ProgReporter(params, callback, model.sampleCount())

    # Run Gibbs once per sample in the model, each time seeding the state's
    # global parameters and prior from that sample, and collect the results...
    for sample in model.sampleList():
        tempState = State(state)
        tempState.setGlobalParams(sample)
        tempState.addPrior(sample)
        gibbs_run(tempState, reporter.next)
        dm.addFrom(tempState.getModel())

    # Return...
    return dm
Example #5
0
def leftRightNegLogProbWord(sample, doc, cluster, particles, cap):
  """Does a left to right estimate of the negative log probability of the words in the given document, given a sample, the documents abnormalities and a cluster assignment. cap defines a cap on the number of documents resampled before each word is sampled for inclusion - set to a negative number for no cap, but be warned that the algorithm is then O(n^2) with regard to the number of words in the document. Should be set quite high in practise for a reasonable trade off between quality and run-time."""
  # C++ implementation, compiled and executed in-process by scipy.weave below.
  # The variables stateIn/cluster/particles/cap are injected by weave.inline;
  # return_val carries the (positive) summed log probability back to Python.
  code = start_cpp(shared_code) + """
  // Setup - create the state, extract the document, set its cluster...
   State state;
   StatePyToCpp(stateIn, &state);
   Document & doc = state.doc[0];

   if (cluster>=0)
   {
    // Existing cluster...
     doc.SetCluster(state.clusters.Index(cluster));
   }
   else
   {
    // New cluster...
     ItemRef<Cluster,Conc> * newC = state.clusters.Append();
     newC->Body().alpha = state.rho.alpha;
     newC->Body().beta  = state.rho.beta;
     newC->Body().conc  = state.rho.conc;
     float * bmn = new float[state.behCount];
     float bmnDiv = 0.0;
     for (int b=0;b<state.behCount;b++)
     {
      bmn[b] = state.phi[b];
      bmnDiv += state.phi[b];
     }
     for (int b=0;b<state.behCount;b++) bmn[b] /= bmnDiv;
     newC->SetBMN(bmn);
     
     doc.SetCluster(newC);
   }

  // If the cap is negative set it to include all words, otherwise we need some storage...
   int * samIndex = 0;
   if (cap<0) cap = doc.SampleCount();
   else
   {
    samIndex = new int[cap];
   }

  
  // Create some memory for storing the results into, zeroed out...
   float * samProb = new float[doc.SampleCount()];
   for (int s=0;s<doc.SampleCount();s++) samProb[s] = 0.0; 


  // Do all the particles, summing the results into the samProb array...
   for (int p=0;p<particles;p++)
   {
    // Reset the document to have no assignments to words...
     for (int s=0;s<doc.SampleCount();s++)
     {
      doc.GetSample(s).SetDocInst(0);
     }

    // Iterate and factor in the result from each sample...
     for (int s=0;s<doc.SampleCount();s++)
     {
      // Resample preceding samples - 3 scenarios with regards to the cap...
      // (Note that duplication is allowed in the random sample selection - whilst strictly forbidden the situation is such that it can not cause any issues.)
       if (s<=cap)
       {
        // Less or equal number of samples than the cap - do them all...
         for (int s2=0;s2<s;s2++)
         {
          ResampleSample(state, doc, doc.GetSample(s2));
         }
       }
       else
       {
        if (s<=cap*2)
        {
         // Need to miss some samples out, but due to numbers its best to randomly select the ones to miss rather than the ones to do...
          int missCount = s-cap;
          for (int m=0;m<missCount;m++) samIndex[m] = sample_nat(s);
          qsort(samIndex, missCount, sizeof(int), compareInt);

          for (int s2=0;s2<samIndex[0];s2++)
          {
           ResampleSample(state, doc, doc.GetSample(s2));
          }

          for (int m=0;m<missCount-1;m++)
          {
           for (int s2=samIndex[m]+1;s2<samIndex[m+1];s2++)
           {
            ResampleSample(state, doc, doc.GetSample(s2));
           }
          }
          
          for (int s2=samIndex[missCount-1]+1;s2<s;s2++)
          {
           ResampleSample(state, doc, doc.GetSample(s2));
          }
        }
        else
        {
         // Need to select a subset of samples to do...
          for (int m=0;m<cap;m++) samIndex[m] = sample_nat(s);
          qsort(samIndex, cap, sizeof(int), compareInt);

          for (int m=0;m<cap;m++)
          {
           ResampleSample(state, doc, doc.GetSample(samIndex[m]));
          }
        }
       }

      // Calculate the contribution of this sample, whilst simultaneously filling out so we can make a draw from them...
       float pSum = CalcSampleProb(state, doc, doc.GetSample(s));
       samProb[s] += (pSum - samProb[s]) / (p+1);

      // Draw an assignment for the current sample, ready for the next iteration...
       ResampleSample(state, doc, doc.GetSample(s), pSum);
     }
   }


  // Sumarise the results buffer into a single log probability and return it...
   float ret = 0.0;
   for (int s=0;s<doc.SampleCount();s++) ret += log(samProb[s]);
   return_val = ret;


  // Clean up...
   delete[] samIndex;
   delete[] samProb;
  """

  # Build a single-document state seeded with the given sample's global
  # parameters, with the sample itself installed as the prior...
  stateIn = State(doc, Params())
  stateIn.setGlobalParams(sample)
  stateIn.addPrior(sample)

  # Compile (cached after first use) and run the C++ above; ret receives
  # the value written to return_val by the C++ code.
  ret = weave.inline(code,['stateIn','cluster','particles','cap'] , support_code=shared_code)

  return -ret # Convert to negative log on the return - before then stick to positive.
Example #6
0
def leftRightNegLogProbWord(sample, doc, cluster, particles, cap):
    """Does a left to right estimate of the negative log probability of the words in the given document, given a sample, the documents abnormalities and a cluster assignment. cap defines a cap on the number of documents resampled before each word is sampled for inclusion - set to a negative number for no cap, but be warned that the algorithm is then O(n^2) with regard to the number of words in the document. Should be set quite high in practise for a reasonable trade off between quality and run-time."""
    # C++ implementation, compiled and executed in-process by scipy.weave
    # below. The variables stateIn/cluster/particles/cap are injected by
    # weave.inline; return_val carries the (positive) summed log probability
    # back to Python.
    code = start_cpp(shared_code) + """
  // Setup - create the state, extract the document, set its cluster...
   State state;
   StatePyToCpp(stateIn, &state);
   Document & doc = state.doc[0];

   if (cluster>=0)
   {
    // Existing cluster...
     doc.SetCluster(state.clusters.Index(cluster));
   }
   else
   {
    // New cluster...
     ItemRef<Cluster,Conc> * newC = state.clusters.Append();
     newC->Body().alpha = state.rho.alpha;
     newC->Body().beta  = state.rho.beta;
     newC->Body().conc  = state.rho.conc;
     float * bmn = new float[state.behCount];
     float bmnDiv = 0.0;
     for (int b=0;b<state.behCount;b++)
     {
      bmn[b] = state.phi[b];
      bmnDiv += state.phi[b];
     }
     for (int b=0;b<state.behCount;b++) bmn[b] /= bmnDiv;
     newC->SetBMN(bmn);
     
     doc.SetCluster(newC);
   }

  // If the cap is negative set it to include all words, otherwise we need some storage...
   int * samIndex = 0;
   if (cap<0) cap = doc.SampleCount();
   else
   {
    samIndex = new int[cap];
   }

  
  // Create some memory for storing the results into, zeroed out...
   float * samProb = new float[doc.SampleCount()];
   for (int s=0;s<doc.SampleCount();s++) samProb[s] = 0.0; 


  // Do all the particles, summing the results into the samProb array...
   for (int p=0;p<particles;p++)
   {
    // Reset the document to have no assignments to words...
     for (int s=0;s<doc.SampleCount();s++)
     {
      doc.GetSample(s).SetDocInst(0);
     }

    // Iterate and factor in the result from each sample...
     for (int s=0;s<doc.SampleCount();s++)
     {
      // Resample preceding samples - 3 scenarios with regards to the cap...
      // (Note that duplication is allowed in the random sample selection - whilst strictly forbidden the situation is such that it can not cause any issues.)
       if (s<=cap)
       {
        // Less or equal number of samples than the cap - do them all...
         for (int s2=0;s2<s;s2++)
         {
          ResampleSample(state, doc, doc.GetSample(s2));
         }
       }
       else
       {
        if (s<=cap*2)
        {
         // Need to miss some samples out, but due to numbers its best to randomly select the ones to miss rather than the ones to do...
          int missCount = s-cap;
          for (int m=0;m<missCount;m++) samIndex[m] = sample_nat(s);
          qsort(samIndex, missCount, sizeof(int), compareInt);

          for (int s2=0;s2<samIndex[0];s2++)
          {
           ResampleSample(state, doc, doc.GetSample(s2));
          }

          for (int m=0;m<missCount-1;m++)
          {
           for (int s2=samIndex[m]+1;s2<samIndex[m+1];s2++)
           {
            ResampleSample(state, doc, doc.GetSample(s2));
           }
          }
          
          for (int s2=samIndex[missCount-1]+1;s2<s;s2++)
          {
           ResampleSample(state, doc, doc.GetSample(s2));
          }
        }
        else
        {
         // Need to select a subset of samples to do...
          for (int m=0;m<cap;m++) samIndex[m] = sample_nat(s);
          qsort(samIndex, cap, sizeof(int), compareInt);

          for (int m=0;m<cap;m++)
          {
           ResampleSample(state, doc, doc.GetSample(samIndex[m]));
          }
        }
       }

      // Calculate the contribution of this sample, whilst simultaneously filling out so we can make a draw from them...
       float pSum = CalcSampleProb(state, doc, doc.GetSample(s));
       samProb[s] += (pSum - samProb[s]) / (p+1);

      // Draw an assignment for the current sample, ready for the next iteration...
       ResampleSample(state, doc, doc.GetSample(s), pSum);
     }
   }


  // Sumarise the results buffer into a single log probability and return it...
   float ret = 0.0;
   for (int s=0;s<doc.SampleCount();s++) ret += log(samProb[s]);
   return_val = ret;


  // Clean up...
   delete[] samIndex;
   delete[] samProb;
  """

    # Build a single-document state seeded with the given sample's global
    # parameters, with the sample itself installed as the prior...
    stateIn = State(doc, Params())
    stateIn.setGlobalParams(sample)
    stateIn.addPrior(sample)

    # Compile (cached after first use) and run the C++ above; ret receives
    # the value written to return_val by the C++ code.
    ret = weave.inline(code, ['stateIn', 'cluster', 'particles', 'cap'],
                       support_code=shared_code)

    return -ret  # Convert to negative log on the return - before then stick to positive.