def gibbs_doc_mp(model, doc, params=None, callback=None):
    """Runs Gibbs iterations on a single document, sampling with a prior constructed from each sample in the given Model. params applies to each sample, so it should probably be much more limited than usual - if it is undefined the default is 1 run, 1 sample and a burn-in of only 500. Returns a DocModel containing all the relevant samples."""
    # Initialisation stuff - handle params, create the state and the DocModel object, plus a reporter...
    if params is None:
        params = Params()
        params.runs = 1
        params.samples = 1
        params.burnIn = 500

    state = State(doc, params)
    dm = DocModel()

    # Create a pool of worker processes...
    pool = mp.Pool()

    # Create a value for the sub-processes to report back their progress with...
    manager = mp.Manager()
    doneIters = manager.Value('i', 0)
    totalIters = model.sampleCount() * params.runs * (params.burnIn + params.samples + (params.samples-1) * params.lag)

    # Create a callback for when a job completes...
    def onComplete(s):
        dm.addFrom(s.getModel())

    # Create all the jobs, wait for their completion and report progress...
    try:
        jobs = []
        for sample in model.sampleList():
            tempState = State(state)
            tempState.setGlobalParams(sample)
            tempState.addPrior(sample)
            jobs.append(pool.apply_async(gibbs_run_wrap, (tempState, doneIters), callback=onComplete))
    finally:
        # Close the pool and wait for all the jobs to complete...
        pool.close()

        while len(jobs) != 0:
            if jobs[0].ready():
                del jobs[0]
                continue
            time.sleep(0.01)
            if callback is not None:
                callback(doneIters.value, totalIters)

        pool.join()

    # Return...
    return dm
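# Usage sketch (not part of the library proper): driving gibbs_doc_mp with a
# progress callback. 'model' and 'doc' are assumed to be an already-fitted
# Model and a Document from elsewhere in this module.
def _example_gibbs_doc_mp(model, doc):
    import sys

    def progress(done, total):
        # gibbs_doc_mp reports (iterations completed so far, total expected).
        sys.stdout.write('\r%i of %i iterations' % (done, total))
        sys.stdout.flush()

    return gibbs_doc_mp(model, doc, callback=progress)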
def gibbs_doc(model, doc, params=None, callback=None):
    """Runs Gibbs iterations on a single document, sampling with a prior constructed from each sample in the given Model. params applies to each sample, so it should probably be much more limited than usual - if it is undefined the default is 1 run, 1 sample and a burn-in of only 500. Returns a DocModel containing all the relevant samples."""
    # Initialisation stuff - handle params, create the state and the DocModel object, plus a reporter...
    if params is None:
        params = Params()
        params.runs = 1
        params.samples = 1
        params.burnIn = 500

    state = State(doc, params)
    dm = DocModel()
    reporter = ProgReporter(params, callback, model.sampleCount())

    # Iterate and run for each sample in the model...
    for sample in model.sampleList():
        tempState = State(state)
        tempState.setGlobalParams(sample)
        tempState.addPrior(sample)
        gibbs_run(tempState, reporter.next)
        dm.addFrom(tempState.getModel())

    # Return...
    return dm
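# The defaults above are deliberately light; a caller wanting a heavier run can
# pass an explicit Params. A sketch - the attribute names below are the ones
# gibbs_doc/gibbs_doc_mp actually read (runs, samples, burnIn, lag):
def _example_gibbs_doc_params(model, doc):
    params = Params()
    params.runs = 2       # Independent Gibbs restarts per model sample.
    params.samples = 4    # Samples kept per run.
    params.burnIn = 1000  # Iterations discarded before the first kept sample.
    params.lag = 100      # Iterations between consecutive kept samples.
    return gibbs_doc(model, doc, params)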
def leftRightNegLogProbWord(sample, doc, cluster, particles, cap):
    """Does a left-to-right estimate of the negative log probability of the words in the given document, given a sample, the document's abnormalities and a cluster assignment. cap defines a cap on the number of preceding words resampled before each word is sampled for inclusion - set it to a negative number for no cap, but be warned that the algorithm is then O(n^2) in the number of words in the document. In practice it should be set quite high, for a reasonable trade-off between quality and run-time."""
    code = start_cpp(shared_code) + """
    // Setup - create the state, extract the document, set its cluster...
    State state;
    StatePyToCpp(stateIn, &state);
    Document & doc = state.doc[0];

    if (cluster>=0)
    {
      // Existing cluster...
      doc.SetCluster(state.clusters.Index(cluster));
    }
    else
    {
      // New cluster...
      ItemRef<Cluster,Conc> * newC = state.clusters.Append();
      newC->Body().alpha = state.rho.alpha;
      newC->Body().beta = state.rho.beta;
      newC->Body().conc = state.rho.conc;

      float * bmn = new float[state.behCount];
      float bmnDiv = 0.0;
      for (int b=0; b<state.behCount; b++)
      {
        bmn[b] = state.phi[b];
        bmnDiv += state.phi[b];
      }
      for (int b=0; b<state.behCount; b++) bmn[b] /= bmnDiv;
      newC->SetBMN(bmn);

      doc.SetCluster(newC);
    }

    // If the cap is negative set it to include all words, otherwise we need some storage...
    int * samIndex = 0;
    if (cap<0) cap = doc.SampleCount();
    else samIndex = new int[cap];

    // Create some memory for storing the results into, zeroed out...
    float * samProb = new float[doc.SampleCount()];
    for (int s=0; s<doc.SampleCount(); s++) samProb[s] = 0.0;

    // Do all the particles, summing the results into the samProb array...
    for (int p=0; p<particles; p++)
    {
      // Reset the document to have no assignments to words...
      for (int s=0; s<doc.SampleCount(); s++)
      {
        doc.GetSample(s).SetDocInst(0);
      }

      // Iterate and factor in the result from each sample...
      for (int s=0; s<doc.SampleCount(); s++)
      {
        // Resample preceding samples - 3 scenarios with regard to the cap...
        // (Note that duplication is allowed in the random sample selection - whilst strictly this is forbidden, the situation is such that it cannot cause any issues.)
        if (s<=cap)
        {
          // No more preceding samples than the cap - do them all...
          for (int s2=0; s2<s; s2++)
          {
            ResampleSample(state, doc, doc.GetSample(s2));
          }
        }
        else
        {
          if (s<=cap*2)
          {
            // Need to miss some samples out, but given the numbers it is best to randomly select the ones to miss rather than the ones to do...
            int missCount = s-cap;
            for (int m=0; m<missCount; m++) samIndex[m] = sample_nat(s);
            qsort(samIndex, missCount, sizeof(int), compareInt);

            for (int s2=0; s2<samIndex[0]; s2++)
            {
              ResampleSample(state, doc, doc.GetSample(s2));
            }
            for (int m=0; m<missCount-1; m++)
            {
              for (int s2=samIndex[m]+1; s2<samIndex[m+1]; s2++)
              {
                ResampleSample(state, doc, doc.GetSample(s2));
              }
            }
            for (int s2=samIndex[missCount-1]+1; s2<s; s2++)
            {
              ResampleSample(state, doc, doc.GetSample(s2));
            }
          }
          else
          {
            // Need to select a subset of samples to do...
            for (int m=0; m<cap; m++) samIndex[m] = sample_nat(s);
            qsort(samIndex, cap, sizeof(int), compareInt);
            for (int m=0; m<cap; m++)
            {
              ResampleSample(state, doc, doc.GetSample(samIndex[m]));
            }
          }
        }

        // Calculate the contribution of this sample, whilst simultaneously filling out the probabilities so we can make a draw from them...
        float pSum = CalcSampleProb(state, doc, doc.GetSample(s));
        samProb[s] += (pSum - samProb[s]) / (p+1);

        // Draw an assignment for the current sample, ready for the next iteration...
        ResampleSample(state, doc, doc.GetSample(s), pSum);
      }
    }

    // Summarise the results buffer into a single log probability and return it...
    float ret = 0.0;
    for (int s=0; s<doc.SampleCount(); s++) ret += log(samProb[s]);
    return_val = ret;

    // Clean up...
    delete[] samIndex;
    delete[] samProb;
    """

    stateIn = State(doc, Params())
    stateIn.setGlobalParams(sample)
    stateIn.addPrior(sample)

    ret = weave.inline(code, ['stateIn', 'cluster', 'particles', 'cap'], support_code=shared_code)

    return -ret # Convert to negative log on the return - before then it is kept positive.
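# The three cap branches in the C++ above choose which preceding words to
# resample; this pure-Python sketch mirrors just that selection logic, for
# clarity. Illustrative only - sample_nat(s) in the C++ is assumed to draw
# uniformly from [0, s), matched here by random.randrange.
def _resample_indices_sketch(s, cap):
    import random
    if s <= cap:
        # No more preceding samples than the cap - resample them all.
        return list(range(s))
    if s <= cap * 2:
        # Cheaper to draw the indices to *skip*; duplicate draws collapse,
        # which matches the C++ behaviour of harmless duplicates.
        miss = set(random.randrange(s) for _ in range(s - cap))
        return [i for i in range(s) if i not in miss]
    # Otherwise draw the cap indices to resample (duplicates allowed, harmless).
    return sorted(random.randrange(s) for _ in range(cap))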
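# Usage sketch for the estimator above: score a document against an existing
# cluster and against a new one, then convert to a per-word perplexity. The
# particle count and cap are illustrative, and numWords stands in for however
# the Document class exposes its word count (hypothetical here).
def _example_left_right_score(sample, doc, numWords):
    import math
    nlpExisting = leftRightNegLogProbWord(sample, doc, 0, 8, 32)  # Cluster 0.
    nlpNew = leftRightNegLogProbWord(sample, doc, -1, 8, 32)      # New cluster.
    return math.exp(min(nlpExisting, nlpNew) / numWords)          # Perplexity.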