class Sample:
    """Stores a single sample drawn from the model - the topics, clusters and each document being
    sampled over. Stores counts and parameters required to make them into distributions, rather
    than final distributions. Has cloning capability."""

    def __init__(self, state, calcNLL = True, priorsOnly = False):
        """Given a state this draws a sample from it, as a specific parametrisation of the model.
        Also doubles as a copy constructor, with a slight modification - if the priorsOnly flag is
        set it only copies across the priors, and initialises an otherwise empty model."""
        if isinstance(state, Sample):
            # Code for cloning...
            self.alpha = state.alpha
            self.beta = state.beta.copy()
            self.gamma = state.gamma
            self.rho = state.rho
            self.mu = state.mu
            self.phi = state.phi.copy()

            if not priorsOnly:
                self.topicWord = state.topicWord.copy()
                self.topicUse = state.topicUse.copy()
            else:
                self.topicWord = numpy.zeros((0, state.topicWord.shape[1]), dtype=numpy.int32)
                self.topicUse = numpy.zeros(0, dtype=numpy.int32)
            self.topicConc = state.topicConc

            self.abnormTopicWord = state.abnormTopicWord.copy()
            self.abnorms = dict(state.abnorms)
            self.fia = FlagIndexArray(state.fia)

            if not priorsOnly:
                self.cluster = map(lambda t: (t[0].copy(), t[1], t[2].copy(), t[3].copy()), state.cluster)
                self.clusterUse = state.clusterUse.copy()
            else:
                self.cluster = []
                self.clusterUse = numpy.zeros(0, dtype=numpy.int32)
            self.clusterConc = state.clusterConc

            if not priorsOnly:
                self.doc = map(lambda ds: DocSample(ds), state.doc)
            else:
                self.doc = []
        else:
            # Normal initialisation code...
            self.alpha = state.alpha
            self.beta = state.beta.copy()
            self.gamma = state.gamma
            self.rho = state.rho
            self.mu = state.mu
            self.phi = state.phi.copy()

            # Topic stuff...
            self.topicWord = state.topicWord.copy()
            self.topicUse = state.topicUse.copy()
            self.topicConc = state.topicConc

            # Abnormality stuff...
            self.abnormTopicWord = state.abnormTopicWord.copy()
            self.abnorms = dict(state.abnorms)
            self.fia = FlagIndexArray(state.fia)

            # Cluster stuff...
            self.cluster = map(lambda t: (t[0].copy(), t[1], t[2].copy(), t[3].copy()), state.cluster)
            self.clusterUse = state.clusterUse.copy()
            self.clusterConc = state.clusterConc

            # The details for each document...
            self.doc = []
            for d in xrange(len(state.doc)):
                self.doc.append(DocSample(state.doc[d]))

            # Second pass through the documents to fill in the negative log likelihoods - needs the
            # data structures constructed above...
            if calcNLL:
                for d in xrange(len(state.doc)):
                    self.doc[d].calcNLL(state.doc[d], state)

    def merge(self, other):
        """Given a sample this merges it into this sample. Works under the assumption that the new
        sample was learnt with this sample as its only prior, and ends up as though both the prior
        and the sample were drawn whilst simultaneously being modelled. Trashes the given sample -
        do not continue to use it."""
        # Update the old documents - there are potentially more behaviours in the new sample, which
        # means adjusting the behaviour flags...
        if self.fia.getLength()!=other.fia.getLength():
            for doc in self.doc:
                newBehFlags = numpy.zeros(other.fia.getLength(), dtype=numpy.uint8)
                newBehFlags[0] = doc.behFlags[0]
                for abnorm, index in self.abnorms.iteritems(): # Iterate items, not just keys.
                    newIndex = other.abnorms[abnorm]
                    newBehFlags[newIndex] = doc.behFlags[index]
                doc.behFlags = newBehFlags

        # Replace the basic parameters...
        self.alpha = other.alpha
        self.beta = other.beta
        self.gamma = other.gamma
        self.rho = other.rho
        self.mu = other.mu
        self.phi = other.phi

        self.topicWord = other.topicWord
        self.topicUse = other.topicUse
        self.topicConc = other.topicConc

        self.abnormTopicWord = other.abnormTopicWord
        self.abnorms = other.abnorms
        self.fia = other.fia

        self.cluster = other.cluster
        self.clusterUse = other.clusterUse
        self.clusterConc = other.clusterConc

        # Add in the (presumably) new documents...
        for doc in other.doc:
            self.doc.append(doc)

    def getAlphaPrior(self):
        """Returns the PriorConcDP that was used for the alpha parameter, which is the
        concentration parameter for the DP in each document."""
        return self.alpha

    def getBeta(self):
        """Returns the beta prior - a vector representing the Dirichlet distribution from which the
        per-topic multinomials over words are drawn."""
        return self.beta

    def getGammaPrior(self):
        """Returns the PriorConcDP that was used for the gamma parameter, which is the
        concentration parameter for the global DP from which topics are drawn."""
        return self.gamma

    def getRhoPrior(self):
        """Returns the PriorConcDP that was used for the rho parameter, which is the concentration
        parameter for each specific cluster's DP."""
        return self.rho

    def getMuPrior(self):
        """Returns the PriorConcDP that was used for the mu parameter, which is the concentration
        parameter for the DP from which clusters are drawn."""
        return self.mu

    def getPhi(self):
        """Returns phi, the Dirichlet prior on the behavioural multinomial for each cluster."""
        return self.phi

    def getTopicCount(self):
        """Returns the number of topics in the sample."""
        return self.topicWord.shape[0]

    def getWordCount(self):
        """Returns the number of words in each topic multinomial."""
        return self.topicWord.shape[1]

    def getTopicUseWeight(self, t):
        """Returns how many times the given topic has been instanced in a cluster."""
        return self.topicUse[t]

    def getTopicUseWeights(self):
        """Returns an array, indexed by topic id, of how many times each topic has been
        instantiated in a cluster. Do not edit the return value - copy it first."""
        return self.topicUse

    def getTopicConc(self):
        """Returns the sampled concentration parameter for drawing topic instances from the global
        DP."""
        return self.topicConc

    def getTopicWordCount(self, t):
        """Returns the number of samples assigned to each word for the given topic, as an integer
        numpy array. Do not edit the return value - make a copy first."""
        return self.topicWord[t,:]

    def getTopicWordCounts(self): # Unused topic parameter removed - this covers all topics.
        """Returns the number of samples assigned to each word for all topics, indexed
        [topic, word], as an integer numpy array. Do not edit the return value - make a copy
        first."""
        return self.topicWord

    def getTopicMultinomial(self, t):
        """Returns the calculated multinomial for a given topic ident."""
        ret = self.beta.copy()
        ret += self.topicWord[t,:]
        ret /= ret.sum()
        return ret

    def getTopicMultinomials(self):
        """Returns the multinomials for all topics in a single array, indexed by [topic, word] to
        give P(word|topic)."""
        ret = numpy.vstack([self.beta]*self.topicWord.shape[0])
        ret += self.topicWord
        ret = (ret.T / ret.sum(axis=1)).T
        return ret

    def getBehCount(self):
        """Returns the number of behaviours, which is the number of abnormalities plus 1, and hence
        the valid range of the behaviour index used by the relevant methods."""
        return self.abnormTopicWord.shape[0]

    def getAbnormWordCount(self, b):
        """Returns the number of samples assigned to each word for the given abnormal topic. Note
        that entry 0 equates to normal behaviour and is a dummy that should be ignored."""
        return self.abnormTopicWord[b,:]
    def getAbnormWordCounts(self):
        """Returns the number of samples assigned to each word in each abnormal behaviour - an
        integer 2D array indexed by [behaviour, word], noting that behaviour 0 is a dummy for
        normal behaviour. Do not edit the return value - make a copy first."""
        return self.abnormTopicWord

    def getAbnormMultinomial(self, b):
        """Returns the calculated multinomial for a given abnormal behaviour."""
        ret = self.beta.copy()
        ret += self.abnormTopicWord[b,:]
        ret /= ret.sum()
        return ret

    def getAbnormMultinomials(self):
        """Returns the multinomials for all abnormalities in a single array, indexed by
        [behaviour, word] to give P(word|topic associated with behaviour). Entry 0 is a dummy to
        fill in for normal behaviour, and should be ignored."""
        ret = numpy.vstack([self.beta]*self.abnormTopicWord.shape[0])
        ret += self.abnormTopicWord
        ret = (ret.T / ret.sum(axis=1)).T
        return ret

    def getAbnormDict(self):
        """Returns a dictionary that maps each abnormality's user-provided token to the behaviour
        index used for it. Allows the use of the getAbnorm* methods, among other things."""
        return self.abnorms

    def getClusterCount(self):
        """Returns how many clusters there are."""
        return len(self.cluster)

    def getClusterDrawWeight(self, c):
        """Returns how many times the given cluster has been instanced by a document."""
        return self.clusterUse[c]

    def getClusterDrawWeights(self):
        """Returns an array, indexed by cluster id, of how many times each cluster has been
        instantiated by a document. Do not edit the return value - copy it first."""
        return self.clusterUse

    def getClusterDrawConc(self):
        """Returns the sampled concentration parameter for drawing cluster instances for
        documents."""
        return self.clusterConc

    def getClusterInstCount(self, c):
        """Returns how many topic instances exist in the given cluster."""
        return self.cluster[c][0].shape[0]

    def getClusterInstWeight(self, c, ti):
        """Returns how many times the given cluster topic instance has been instanced by a
        document's DP."""
        return self.cluster[c][0][ti,1]

    def getClusterInstTopic(self, c, ti):
        """Returns which topic the given cluster topic instance is an instance of."""
        return self.cluster[c][0][ti,0]

    def getClusterInstDual(self, c):
        """Returns a 2D array where the first dimension is indexed by topic instance and the second
        contains two columns - the first the topic index, the second the weight. Do not edit the
        return value - copy it before use."""
        return self.cluster[c][0]

    def getClusterInstConc(self, c):
        """Returns the sampled concentration of the cluster's DP, from which the members of each
        document's DP are drawn."""
        return self.cluster[c][1]

    def getClusterInstBehMN(self, c):
        """Returns the multinomial on drawing behaviours for the given cluster."""
        return self.cluster[c][2]

    def getClusterInstPriorBehMN(self, c):
        """Returns the prior on the behaviour multinomial, as an array of integer counts aligned
        with the flag set."""
        return self.cluster[c][3]

    def docCount(self):
        """Returns the number of documents stored within. Should be the same as in the corpus from
        which the sample was drawn."""
        return len(self.doc)

    def getDoc(self, d):
        """Given a document index this returns the appropriate DocSample object. These indices
        should line up with the document indices in the Corpus from which this Sample was drawn,
        assuming no documents have been deleted."""
        return self.doc[d]
    def delDoc(self, ident):
        """Given a document ident this finds the document with that ident and removes it from the
        model completely - i.e. all the variables in the sample are also updated. Primarily used to
        remove documents for resampling prior to using the model as a prior. Note that this can
        potentially leave entities with no users - they get culled when the model is loaded into
        the C++ data structure, so as to not cause problems."""
        # Find and remove it from the document list...
        index = None
        for i in xrange(len(self.doc)):
            if self.doc[i].getIdent()==ident:
                index = i
                break
        if index is None:
            return
        victim = self.doc[index]
        self.doc = self.doc[:index] + self.doc[index+1:]

        # Update all the variables left behind by subtracting the relevant terms...
        cluster = self.cluster[victim.cluster]
        self.clusterUse[victim.cluster] -= 1

        ## First pass through the dp and remove its influence; at the same time note the arrays
        ## that need to be updated by each user when looping through...
        dp_ext = []
        for i in xrange(victim.dp.shape[0]):
            beh = victim.dp[i,0]
            if beh==0: # Normal behaviour.
                cluInst = victim.dp[i,1]

                # Update the instance, and topic use counts if necessary...
                topic = cluster[0][cluInst,0]
                cluster[0][cluInst,1] -= 1
                if cluster[0][cluInst,1]==0:
                    self.topicUse[topic] -= 1

                # Store the entity that needs updating in correspondence with this dp instance in
                # the next step...
                dp_ext.append((self.topicWord, topic))
            else: # Abnormal behaviour.
                # Store the entity that needs updating in correspondence with the dp...
                dp_ext.append((self.abnormTopicWord, beh))

        ## Go through the samples array and remove their influence - the hard part was done by the
        ## preceding step...
        for si in xrange(victim.samples.shape[0]):
            inst = victim.samples[si,0]
            word = victim.samples[si,1]
            mat, topic = dp_ext[inst]
            mat[topic,word] -= 1

        # Clean up all zeroed items...
        self.cleanZeros()

    def cleanZeros(self):
        """Goes through and removes anything that has a zero reference count, adjusting all indices
        accordingly."""
        # Remove the zeros from this object, noting the changes...
        ## Topics...
        newTopicCount = 0
        topicMap = dict()
        for t in xrange(self.topicUse.shape[0]):
            if self.topicUse[t]!=0:
                topicMap[t] = newTopicCount
                newTopicCount += 1

        if newTopicCount!=self.topicUse.shape[0]:
            newTopicWord = numpy.zeros((newTopicCount, self.topicWord.shape[1]), dtype=numpy.int32)
            newTopicUse = numpy.zeros(newTopicCount, dtype=numpy.int32)
            for origin, dest in topicMap.iteritems():
                newTopicWord[dest,:] = self.topicWord[origin,:]
                newTopicUse[dest] = self.topicUse[origin]
            self.topicWord = newTopicWord
            self.topicUse = newTopicUse

        ## Clusters...
        newClusterCount = 0
        clusterMap = dict()
        for c in xrange(self.clusterUse.shape[0]):
            if self.clusterUse[c]!=0:
                clusterMap[c] = newClusterCount
                newClusterCount += 1

        if newClusterCount!=self.clusterUse.shape[0]:
            newCluster = [None]*newClusterCount
            newClusterUse = numpy.zeros(newClusterCount, dtype=numpy.int32)
            for origin, dest in clusterMap.iteritems():
                newCluster[dest] = self.cluster[origin]
                newClusterUse[dest] = self.clusterUse[origin]
            self.cluster = newCluster
            self.clusterUse = newClusterUse

        ## Cluster instances...
        # (Each change is noted as a 2-tuple of (new length, dict), where dict maps old indices to
        # new indices.)
        cluInstAdj = []
        for ci in xrange(len(self.cluster)):
            newInstCount = 0
            instMap = dict()
            for i in xrange(self.cluster[ci][0].shape[0]):
                if self.cluster[ci][0][i,1]!=0:
                    instMap[i] = newInstCount
                    newInstCount += 1
            cluInstAdj.append((newInstCount, instMap))

            if newInstCount!=self.cluster[ci][0].shape[0]:
                newInst = numpy.zeros((newInstCount,2), dtype=numpy.int32)
                for origin, dest in instMap.iteritems():
                    newInst[dest,:] = self.cluster[ci][0][origin,:]
                self.cluster[ci] = (newInst, self.cluster[ci][1], self.cluster[ci][2], self.cluster[ci][3])

        # Iterate and update the topic indices of the cluster instances...
        for ci in xrange(len(self.cluster)):
            for i in xrange(self.cluster[ci][0].shape[0]):
                self.cluster[ci][0][i,0] = topicMap[self.cluster[ci][0][i,0]]

        # Now iterate the documents and update their cluster and cluster instance indices...
        for doc in self.doc:
            doc.cluster = clusterMap[doc.cluster]
            _, instMap = cluInstAdj[doc.cluster]
            for di in xrange(doc.dp.shape[0]):
                if doc.dp[di,0]==0:
                    doc.dp[di,1] = instMap[doc.dp[di,1]]

    def nllAllDocs(self):
        """Returns the negative log likelihood of all the documents in the sample - a reasonable
        value with which to compare samples."""
        return sum(map(lambda d: d.getNLL(), self.doc))

    def logNegProbWordsGivenClusterAbnorm(self, doc, cluster, particles = 16, cap = -1):
        """Uses Wallach's 'left to right' method to calculate the negative log probability of the
        words in the document given the rest of the model. Both the cluster (provided as an index)
        and the document's abnormalities vector are fixed for this calculation. Returns the average
        of the results for each sample contained within the model. particles is the number of
        particles to use in the left-to-right estimation algorithm. This is implemented using
        scipy.weave."""
        return solvers.leftRightNegLogProbWord(self, doc, cluster, particles, cap)

    def logNegProbWordsGivenAbnorm(self, doc, particles = 16, cap = -1):
        """Uses logNegProbWordsGivenClusterAbnorm and simply sums out the cluster variable."""
        # Get the probability of each, with the dependence on clusters...
        cluScores = map(lambda c: solvers.leftRightNegLogProbWord(self, doc, c, particles, cap), xrange(self.getClusterCount()))

        # Multiply each by the probability of the cluster, so it can be summed out...
        cluNorm = float(self.clusterUse.sum()) + self.clusterConc
        cluScores = map(lambda c,s: s - math.log(float(self.clusterUse[c])/cluNorm), xrange(len(cluScores)), cluScores)

        # Also need to include the probability of a new cluster, even though it is likely to be a
        # negligible contribution...
        newVal = solvers.leftRightNegLogProbWord(self, doc, -1, particles, cap)
        newVal -= math.log(self.clusterConc/cluNorm)
        cluScores.append(newVal)

        # Sum out the cluster variable, in a numerically stable way, given that we are dealing with
        # negative log likelihood values that will map to extremely low probabilities...
        minScore = min(cluScores)
        cluPropProb = map(lambda s: math.exp(minScore-s), cluScores)
        return minScore - math.log(sum(cluPropProb))
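
# ----------------------------------------------------------------------------
# Illustrative sketch, not part of the original interface: how the count
# arrays exposed by Sample become distributions, and the log-sum-exp trick
# used at the end of logNegProbWordsGivenAbnorm. 'sample' is assumed to be a
# fitted Sample; the helper name is hypothetical. Relies on the module-level
# numpy/math imports this file already requires.
def _exampleSampleQueries(sample):
    # Posterior mean multinomial for topic 0 - the same maths as
    # getTopicMultinomial: add the Dirichlet prior beta to the per-word counts
    # and normalise. Adding creates a new array, so the returned views are
    # never edited in place.
    mn = sample.getBeta() + sample.getTopicWordCount(0)
    mn /= mn.sum()

    # Numerically stable combination of negative log likelihoods, as done when
    # summing out the cluster variable: factor out the smallest value so the
    # exponentials stay in range.
    nlls = [12.0, 14.5, 11.2] # Hypothetical per-cluster -log p values.
    low = min(nlls)
    combined = low - math.log(sum(math.exp(low - v) for v in nlls))
    return mn, combined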
class State:
    """State object, as manipulated by a Gibbs sampler to get samples of the unknown parameters of
    the model."""

    def __init__(self, obj, params = None):
        """Constructs a state object given either another State object (clone), or a Corpus and a
        Params object. If the Params object is omitted it uses the default. Also supports
        construction from a single Document, where it uses lots of defaults but is basically
        identical to a Corpus containing a single Document - used as a shortcut when fitting a
        Document to an already learnt model."""
        if isinstance(obj, State):
            # Cloning time...
            self.dnrDocInsts = obj.dnrDocInsts
            self.dnrCluInsts = obj.dnrCluInsts
            self.seperateClusterConc = obj.seperateClusterConc
            self.seperateDocumentConc = obj.seperateDocumentConc
            self.oneCluster = obj.oneCluster
            self.calcBeta = obj.calcBeta
            self.calcCluBmn = obj.calcCluBmn
            self.calcPhi = obj.calcPhi
            self.resampleConcs = obj.resampleConcs
            self.behSamples = obj.behSamples

            self.alpha = PriorConcDP(obj.alpha)
            self.beta = obj.beta.copy()
            self.gamma = PriorConcDP(obj.gamma)
            self.rho = PriorConcDP(obj.rho)
            self.mu = PriorConcDP(obj.mu)
            self.phi = obj.phi.copy()

            self.topicWord = obj.topicWord.copy()
            self.topicUse = obj.topicUse.copy()
            self.topicConc = obj.topicConc

            self.abnormTopicWord = obj.abnormTopicWord.copy()

            self.cluster = map(lambda t: (t[0].copy(), t[1], t[2].copy()), obj.cluster)
            self.clusterUse = obj.clusterUse.copy()
            self.clusterConc = obj.clusterConc

            self.doc = map(lambda d: DocState(d), obj.doc)
            self.abnorms = dict(obj.abnorms)
            self.fia = FlagIndexArray(obj.fia)

            self.params = Params(obj.params)
            self.model = Model(obj.model)
        elif isinstance(obj, Document):
            # Construct from a single document...
            self.dnrDocInsts = False
            self.dnrCluInsts = False
            self.seperateClusterConc = False
            self.seperateDocumentConc = False
            self.oneCluster = False
            self.calcBeta = False
            self.calcCluBmn = False
            self.calcPhi = False
            self.resampleConcs = False
            self.behSamples = 1024

            wordCount = obj.getWord(obj.getWordCount()-1)[0]

            self.alpha = PriorConcDP()
            self.beta = numpy.ones(wordCount, dtype=numpy.float32)
            self.gamma = PriorConcDP()
            self.rho = PriorConcDP()
            self.mu = PriorConcDP()
            self.phi = numpy.ones(1+len(obj.getAbnorms()), dtype=numpy.float32)
            self.phi[0] *= 10.0
            self.phi /= self.phi.sum()

            self.topicWord = numpy.zeros((0, wordCount), dtype=numpy.int32)
            self.topicUse = numpy.zeros(0, dtype=numpy.int32)
            self.topicConc = self.gamma.conc

            self.abnormTopicWord = numpy.zeros((1+len(obj.getAbnorms()), wordCount), dtype=numpy.int32)

            self.cluster = []
            self.clusterUse = numpy.zeros(0, dtype=numpy.int32)
            self.clusterConc = self.mu.conc

            abnormDict = dict()
            for i, abnorm in enumerate(obj.getAbnorms()):
                abnormDict[abnorm] = i+1
            self.doc = [DocState(obj, self.alpha, abnormDict)]

            self.abnorms = dict()
            for num, abnorm in enumerate(obj.getAbnorms()):
                self.abnorms[abnorm] = num+1

            self.fia = FlagIndexArray(len(self.abnorms)+1)
            self.fia.addSingles()
            for doc in self.doc:
                doc.behFlagsIndex = self.fia.flagIndex(doc.behFlags)

            if params!=None:
                self.params = params
            else:
                self.params = Params()

            self.model = Model()
        else:
            # Construct from a corpus, as that is the only remaining option...

            # Behaviour flags...
            self.dnrDocInsts = obj.getDocInstsDNR()
            self.dnrCluInsts = obj.getCluInstsDNR()
            self.seperateClusterConc = obj.getSeperateClusterConc()
            self.seperateDocumentConc = obj.getSeperateDocumentConc()
            self.oneCluster = obj.getOneCluster()
            self.calcBeta = obj.getCalcBeta()
            self.calcCluBmn = obj.getCalcClusterBMN()
            self.calcPhi = obj.getCalcPhi()
            self.resampleConcs = obj.getResampleConcs()
            self.behSamples = obj.getBehSamples()

            # Concentration parameters - these are all constant...
            self.alpha = PriorConcDP(obj.getAlpha())
            self.beta = numpy.ones(obj.getWordCount(), dtype=numpy.float32)
            self.beta *= obj.getBeta()
            self.gamma = PriorConcDP(obj.getGamma())
            self.rho = PriorConcDP(obj.getRho())
            self.mu = PriorConcDP(obj.getMu())
            self.phi = numpy.ones(1+len(obj.getAbnormDict()), dtype=numpy.float32)
            self.phi[0] *= obj.getPhiRatio()
            self.phi *= obj.getPhiConc()*self.phi.shape[0] / self.phi.sum()

            # The topics in the model - three parts. topicWord is indexed by [topic, word] and
            # gives how many times each word has been drawn from the given topic - alongside beta
            # this determines the relevant Dirichlet posterior. topicUse counts how many times each
            # topic has been instanced in a cluster - alongside topicConc, the sampled
            # concentration, it defines the DP from which topics are drawn for inclusion in
            # clusters...
            self.topicWord = numpy.zeros((0, obj.getWordCount()), dtype=numpy.int32)
            self.topicUse = numpy.zeros(0, dtype=numpy.int32)
            self.topicConc = self.gamma.conc

            # A second topicWord-style matrix, indexed by behaviour, containing the abnormal
            # topics. Entry 0, which is normal, is again an empty dummy...
            self.abnormTopicWord = numpy.zeros((1+len(obj.getAbnormDict()), obj.getWordCount()), dtype=numpy.int32)

            # The clusters, as a list of (inst, conc, bmn, bmnPrior) tuples. inst is a 2D array
            # containing all the topic instances that make up the cluster - the first dimension
            # indexes each instance, whilst the second has just two entries: the index number of
            # the topic and the number of using document instances. conc is the sampled
            # concentration that completes the definition of each cluster's DP. bmn is the
            # multinomial on behaviours associated with the cluster - a 1D array of floats.
            # bmnPrior is the flag-set aligned integer array that is the prior on bmn. Additionally
            # we have the DP from which the specific clusters are drawn - defined by clusterUse and
            # clusterConc, just as for the topics...
            self.cluster = []
            self.clusterUse = numpy.zeros(0, dtype=numpy.int32)
            self.clusterConc = self.mu.conc

            # List of document objects - declared empty immediately below, then filled in with the
            # information from the given Corpus...
            self.doc = []
            for doc in obj.documentList():
                self.doc.append(DocState(doc, self.alpha, obj.getAbnormDict()))

            # The abnormality dictionary - we need a copy so we can convert from flags back to the
            # user-provided codes after fitting the model...
            self.abnorms = dict(obj.getAbnormDict())

            # The flag index array - converts each flag combination to an index, as required for
            # learning the per-cluster behaviour multinomials...
            self.fia = FlagIndexArray(len(self.abnorms)+1)
            self.fia.addSingles()
            for doc in self.doc:
                doc.behFlagsIndex = self.fia.flagIndex(doc.behFlags)

            # Store the parameters...
            if params!=None:
                self.params = params
            else:
                self.params = Params()

            # Create a model object, for storing samples into...
            self.model = Model()

    def setGlobalParams(self, sample):
        """Sets a number of parameters for the State after initialisation, taking them from the
        given Sample object. Designed for use with the addPrior method - between them they let you
        pull all relevant parameters across from a Sample. Must be called before any Gibbs sampling
        takes place."""
        self.alpha = PriorConcDP(sample.alpha)
        self.beta = sample.beta.copy()
        self.gamma = PriorConcDP(sample.gamma)
        self.rho = PriorConcDP(sample.rho)
        self.mu = PriorConcDP(sample.mu)

        # There is no strictly correct way of combining phi - the below seems reasonable enough,
        # and is exact if they have the same entries...
        for key, fromIndex in sample.abnorms.iteritems():
            if key in self.abnorms:
                toIndex = self.abnorms[key]
                self.phi[toIndex] = sample.phi[fromIndex]
        self.phi /= self.phi.sum()

        self.topicConc = sample.topicConc
        self.clusterConc = sample.clusterConc
        for doc in self.doc:
            doc.conc = self.alpha.conc

    def addPrior(self, sample):
        """Given a Sample object this uses it as a prior - primarily used to sample a single
        document, or a small number of documents, using a model already trained on another set of
        documents. It works by adding the topics, clusters and behaviours from the sample into this
        corpus with the counts intact, so they carry the relevant weight and can't be deleted. You
        could in principle add multiple priors, though that would be a strange scenario. If only
        called once then the topic indices will line up. Note that the prior parameters themselves
        are not transferred, though often you would want them to be - setGlobalParams is provided
        to do that. Must be called before any Gibbs sampling takes place."""
        # The below code has evolved into spaghetti, via several other tasty culinary dishes, and
        # needs a rewrite. Or to never be looked at or edited ever again.

        # Do the topics...
        offset = self.topicWord.shape[0]
        if self.topicWord.shape[0]!=0:
            self.topicWord = numpy.vstack((self.topicWord, sample.topicWord))
        else:
            self.topicWord = sample.topicWord.copy()
        self.topicUse = numpy.hstack((self.topicUse, sample.topicUse))

        # Calculate the new abnormalities dictionary...
        newAbnorms = dict(sample.abnorms)
        for key, _ in self.abnorms.iteritems():
            if key not in newAbnorms:
                val = len(newAbnorms)+1
                newAbnorms[key] = val

        # Transfer over the abnormal word counts...
        newAbnormTopicWord = numpy.zeros((1+len(newAbnorms), max((self.abnormTopicWord.shape[1], sample.abnormTopicWord.shape[1]))), dtype=numpy.int32)
        for abnorm, origin in self.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            limit = self.abnormTopicWord.shape[1]
            newAbnormTopicWord[dest,:limit] += self.abnormTopicWord[origin,:limit]
        for abnorm, origin in sample.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            limit = sample.abnormTopicWord.shape[1]
            newAbnormTopicWord[dest,:limit] += sample.abnormTopicWord[origin,:limit]

        # Update the document flags/counts for behaviours...
        for doc in self.doc:
            newFlags = numpy.zeros(1+len(newAbnorms), dtype=numpy.uint8)
            newCounts = numpy.zeros(1+len(newAbnorms), dtype=numpy.int32)
            newFlags[0] = doc.behFlags[0]
            newCounts[0] = doc.behCounts[0]
            for abnorm, origin in self.abnorms.iteritems():
                dest = newAbnorms[abnorm]
                newFlags[dest] = doc.behFlags[origin]
                newCounts[dest] = doc.behCounts[origin]
            doc.behFlags = newFlags
            doc.behCounts = newCounts

        # Update the old clusters' behaviour arrays...
        def mapOldCluster(c):
            c2 = numpy.ones(1+len(newAbnorms), dtype=numpy.float32)
            c2 /= c2.sum()
            c2[0] *= c[2][0]
            for abnorm, origin in self.abnorms.iteritems():
                dest = newAbnorms[abnorm]
                c2[dest] *= c[2][origin]
            c2 /= c2.sum()
            return (c[0], c[1], c2, c[3])
        self.cluster = map(mapOldCluster, self.cluster)
        origCluCount = len(self.cluster)

        # Add the new clusters, updating their behaviour arrays and topic indices, plus getting
        # their priors updated with their associated documents...
        def mapCluster(pair):
            ci, c = pair
            c0 = c[0].copy()
            c0[:,0] += offset

            c2 = numpy.ones(1+len(newAbnorms), dtype=numpy.float32)
            c2 /= c2.sum()
            c2[0] *= c[2][0]
            for abnorm, origin in sample.abnorms.iteritems():
                dest = newAbnorms[abnorm]
                c2[dest] *= c[2][origin]
            c2 /= c2.sum()

            c3 = c[3].copy()
            for doc in filter(lambda doc: doc.cluster==ci, sample.doc):
                fi = sample.fia.flagIndex(doc.behFlags, False)
                if fi>=len(doc.behFlags): # Only bother if the document has abnormalities, of which this is a valid test.
                    total = 0
                    for i in xrange(doc.dp.shape[0]):
                        c3[doc.dp[i,0]] += doc.dp[i,2]
                        total += doc.dp[i,2]
                    c3[fi] -= total + 1

            return (c0, c[1], c2, c3)
        self.cluster += map(mapCluster, enumerate(sample.cluster))
        self.clusterUse = numpy.hstack((self.clusterUse, sample.clusterUse))

        # Update phi...
        newPhi = numpy.ones(len(newAbnorms)+1, dtype=numpy.float32)
        newPhi[0] = 0.5*(self.phi[0]+sample.phi[0])
        for abnorm, origin in self.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            newPhi[dest] = self.phi[origin]
        for abnorm, origin in sample.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            if abnorm not in self.abnorms:
                newPhi[dest] = sample.phi[origin]
            else:
                newPhi[dest] = 0.5*(newPhi[dest] + sample.phi[origin])
        self.phi = newPhi
        self.phi /= self.phi.sum()

        # Recreate the flag index array...
        remapOrig = dict() # Old flag positions to new flag positions.
        remapOrig[0] = 0
        for abnorm, origin in self.abnorms.iteritems():
            remapOrig[origin] = newAbnorms[abnorm]

        remapSam = dict() # Sample flag positions to new flag positions.
        remapSam[0] = 0
        for abnorm, origin in sample.abnorms.iteritems():
            remapSam[origin] = newAbnorms[abnorm]

        newFia = FlagIndexArray(len(newAbnorms)+1)
        newFia.addSingles()
        behIndAdjOrig = newFia.addFlagIndexArray(self.fia, remapOrig)
        behIndAdjSam = newFia.addFlagIndexArray(sample.fia, remapSam)

        for doc in self.doc:
            doc.behFlagsIndex = behIndAdjOrig[doc.behFlagsIndex]

        # Update the cluster priors on the bmn arrays...
        for c in xrange(len(self.cluster)):
            clu = self.cluster[c]
            newBmn = numpy.zeros(newFia.flagCount(), dtype=numpy.int32)
            oldBmn = clu[3].copy()

            # Translate from the old set...
            for b in xrange(oldBmn.shape[0]):
                index = behIndAdjOrig[b] if c<origCluCount else behIndAdjSam[b]
                newBmn[index] += oldBmn[b]

            self.cluster[c] = (clu[0], clu[1], clu[2], newBmn)

        # Replace the old abnormality and fia stuff...
        self.abnormTopicWord = newAbnormTopicWord
        self.abnorms = newAbnorms
        self.fia = newFia

    def sample(self):
        """Samples the current state, storing the current estimate of the model parameters."""
        self.model.sampleState(self)

    def absorbClone(self, clone):
        """Given a clone, absorb all its samples - used for multiprocessing."""
        self.model.absorbModel(clone.model)

    def getParams(self):
        """Returns the parameters object."""
        return self.params

    def getModel(self):
        """Returns the model constructed from all the calls to sample()."""
        return self.model
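
# ----------------------------------------------------------------------------
# Illustrative sketch, not part of the original code: the intended workflow
# for fitting a single new Document against an already learnt model, per the
# setGlobalParams/addPrior docstrings. 'prior' is a Sample from a previous
# run; gibbs(state) stands in for the actual sampling driver, which is not
# shown in this file, so its name here is hypothetical.
def _exampleFitOneDocument(doc, prior, gibbs):
    state = State(doc)            # Single-document corpus, default params.
    state.setGlobalParams(prior)  # Pull the prior's parameters across.
    state.addPrior(prior)         # Inject its topics/clusters/behaviours.
    gibbs(state)                  # Both calls above must precede sampling.
    state.sample()                # Record the current estimate.
    return state.getModel()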
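
# ----------------------------------------------------------------------------
# Illustrative sketch, not part of the original code: the clone / absorbClone
# pattern that State provides for multiprocessing - run independent chains on
# clones, then pull their samples back into the master State. gibbs(state) is
# again a hypothetical stand-in for the sampling driver; in real use each
# clone would run in its own process.
def _exampleParallelChains(state, gibbs, chains = 4):
    clones = [State(state) for _ in xrange(chains)] # Cloning constructor.
    for clone in clones:
        gibbs(clone)
        clone.sample()           # Store a sample in the clone's model.
    for clone in clones:
        state.absorbClone(clone) # Merge the clones' samples back in.
    return state.getModel()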