class Sample:
  """Stores a single sample drawn from the model - the topics, clusters and each document being sampled over. Stores counts and parameters required to make them into distributions, rather than final distributions. Has clonning capability."""
  def __init__(self, state, calcNLL = True, priorsOnly = False):
    """Given a state this draws a sample from it, as a specific parametrisation of the model. Also a copy constructor, with a slight modification - if the priorsOnly flag is set it will only copy across the priors, and initialise to an empty model."""
    if isinstance(state, Sample): # Code for cloning.
      self.alpha = state.alpha
      self.beta = state.beta.copy()
      self.gamma = state.gamma
      self.rho = state.rho
      self.mu = state.mu
      self.phi = state.phi.copy()

      if not priorsOnly:
        self.topicWord = state.topicWord.copy()
        self.topicUse = state.topicUse.copy()
      else:
        self.topicWord = numpy.zeros((0,state.topicWord.shape[1]), dtype=numpy.int32)
        self.topicUse = numpy.zeros(0,dtype=numpy.int32)
      self.topicConc = state.topicConc

      self.abnormTopicWord = state.abnormTopicWord.copy()
      self.abnorms = dict(state.abnorms)
      self.fia = FlagIndexArray(state.fia)

      if not priorsOnly:
        self.cluster = map(lambda t: (t[0].copy(),t[1],t[2].copy(),t[3].copy()), state.cluster)
        self.clusterUse = state.clusterUse.copy()
      else:
        self.cluster = []
        self.clusterUse = numpy.zeros(0,dtype=numpy.int32)
      self.clusterConc = state.clusterConc

      if not priorsOnly:
        self.doc = map(lambda ds: DocSample(ds), state.doc)
      else:
        self.doc = []
    else: # Normal initialisation code.
      self.alpha = state.alpha
      self.beta = state.beta.copy()
      self.gamma = state.gamma
      self.rho = state.rho
      self.mu = state.mu
      self.phi = state.phi.copy()

      # Topic stuff...
      self.topicWord = state.topicWord.copy()
      self.topicUse = state.topicUse.copy()
      self.topicConc = state.topicConc

      # Abnormality stuff...
      self.abnormTopicWord = state.abnormTopicWord.copy()
      self.abnorms = dict(state.abnorms)
      self.fia = FlagIndexArray(state.fia)

      # Cluster stuff...
      self.cluster = map(lambda t: (t[0].copy(),t[1],t[2].copy(),t[3].copy()), state.cluster)
      self.clusterUse = state.clusterUse.copy()
      self.clusterConc = state.clusterConc

      # The details for each document...
      self.doc = []
      for d in xrange(len(state.doc)):
        self.doc.append(DocSample(state.doc[d]))

      # Second pass through documents to fill in the negative log likelihoods - need some data structures for this...
      if calcNLL:
        for d in xrange(len(state.doc)):
          self.doc[d].calcNLL(state.doc[d],state)


  def merge(self, other):
    """Given a sample this merges it into this sample. Works under the assumption that the new sample was learnt with this sample as its only prior, and ends up as though both the prior and the sample were drawn whilst simultaneously being modeled. Trashes the given sample - do not continue to use."""

    # Update the old documents - there are potentially more behaviours in the new sample, which means adjusting the behaviour flags...
    if self.fia.getLength()!=other.fia.getLength():
      for doc in self.doc:
        newBehFlags = numpy.zeros(other.fia.getLength(), dtype=numpy.uint8)
        newBehFlags[0] = doc.behFlags[0]

        for abnorm, index in self.abnorms.iteritems():
          newIndex = other.abnorms[abnorm]
          newBehFlags[newIndex] = doc.behFlags[index]
        
        doc.behFlags = newBehFlags

    # Replace the basic parameters...
    self.alpha = other.alpha
    self.beta = other.beta
    self.gamma = other.gamma
    self.rho = other.rho
    self.mu = other.mu
    self.phi = other.phi

    self.topicWord = other.topicWord
    self.topicUse = other.topicUse
    self.topicConc = other.topicConc

    self.abnormTopicWord = other.abnormTopicWord
    self.abnorms = other.abnorms
    self.fia = other.fia

    self.cluster = other.cluster
    self.clusterUse = other.clusterUse
    self.clusterConc = other.clusterConc

    # Add in the (presumably) new documents...
    for doc in other.doc:
      self.doc.append(doc)


  def getAlphaPrior(self):
    """Returns the PriorConcDP that was used for the alpha parameter, which is the concentration parameter for the DP in each document."""
    return self.alpha

  def getBeta(self):
    """Returns the beta prior, which is a vector representing a Dirichlet distribution from which the multinomials for each topic are drawn, from which words are drawn."""
    return self.beta

  def getGammaPrior(self):
    """Returns the PriorConcDP that was used for the gamma parameter, which is the concentration parameter for the global DP from which topics are drawn."""
    return self.gamma
    
  def getRhoPrior(self):
    """Returns the PriorConcDP that was used for the rho parameter, which is the concentration parameter for each specific clusters DP."""
    return self.rho

  def getMuPrior(self):
    """Returns the PriorConcDP that was used for the mu parameter, which is the concentration parameter for the DP from which clusters are drawn."""
    return self.mu

  def getPhi(self):
    """Returns the phi Dirichlet distribution prior on the behavioural multinomial for each cluster."""
    return self.phi


  def getTopicCount(self):
    """Returns the number of topics in the sample."""
    return self.topicWord.shape[0]

  def getWordCount(self):
    """Returns the number of words in the topic multinomial."""
    return self.topicWord.shape[1]

  def getTopicUseWeight(self, t):
    """Returns how many times the given topic has been instanced in a cluster."""
    return self.topicUse[t]

  def getTopicUseWeights(self):
    """Returns an array, indexed by topic id, that contains how many times each topic has been instanciated in a cluster. Do not edit the return value - copy it first."""
    return self.topicUse

  def getTopicConc(self):
    """Returns the sampled concentration parameter for drawing topic instances from the global DP."""
    return self.topicConc
    
  def getTopicWordCount(self, t):
    """Returns the number of samples assigned to each word for the given topic, as an integer numpy array. Do not edit the return value - make a copy first."""
    return self.topicWord[t,:]

  def getTopicWordCounts(self):
    """Returns the number of samples assigned to each word for all topics, indexed [topic, word], as an integer numpy array. Do not edit the return value - make a copy first."""
    return self.topicWord

  def getTopicMultinomial(self, t):
    """Returns the calculated multinomial for a given topic ident."""
    ret = self.beta.copy()
    ret += self.topicWord[t,:]
    ret /= ret.sum()
    return ret

  def getTopicMultinomials(self):
    """Returns the multinomials for all topics, in a single array - indexed by [topic, word] to give P(word|topic)."""
    ret = numpy.vstack([self.beta]*self.topicWord.shape[0])
    ret += self.topicWord
    ret = (ret.T / ret.sum(axis=1)).T
    return ret


  def getBehCount(self):
    """Returns the number of behaviours, which is the number of abnormalities plus 1, and the entry count for the indexing variable for abnormals in the relevant methods."""
    return self.abnormTopicWord.shape[0]

  def getAbnormWordCount(self, b):
    """Returns the number of samples assigned to each word for the given abnormal topic. Note that entry 0 equates to normal behaviour and is a dummy that should be ignored."""
    return self.abnormTopicWord[b,:]

  def getAbnormWordCounts(self):
    """Returns the number of samples assigned to each word in each abnormal behaviour. An integer 2D array indexed with [behaviour, word], noting that behaviour 0 is a dummy for normal behaviour. Do not edit the return value - make a copy first."""
    return self.abnormTopicWord

  def getAbnormMultinomial(self, b):
    """Returns the calculated multinomial for a given abnormal behaviour."""
    ret = self.beta.copy()
    ret += self.abnormTopicWord[b,:]
    ret /= ret.sum()
    return ret

  def getAbnormMultinomials(self):
    """Returns the multinomials for all abnormalities, in a single array - indexed by [behaviour, word] to give P(word|topic associated with behaviour). Entry 0 is a dummy to fill in for normal behaviour, and should be ignored."""
    ret = numpy.vstack([self.beta]*self.abnormTopicWord.shape[0])
    ret += self.abnormTopicWord
    ret = (ret.T / ret.sum(axis=1)).T
    return ret


  def getAbnormDict(self):
    """Returns a dictionary that takes each abnormalities user provided token to the behaviour index used for it. Allows the use of the getAbnorm* methods, amung other things."""
    return self.abnorms


  def getClusterCount(self):
    """Returns how many clusters there are."""
    return len(self.cluster)

  def getClusterDrawWeight(self, c):
    """Returns how many times the given cluster has been instanced by a document."""
    return self.clusterUse[c]

  def getClusterDrawWeights(self):
    """Returns an array, indexed by cluster id, that contains how many times each cluster has been instanciated by a document. Do not edit the return value - copy it first."""
    return self.clusterUse

  def getClusterDrawConc(self):
    """Returns the sampled concentration parameter for drawing cluster instances for documents."""
    return self.clusterConc

  def getClusterInstCount(self, c):
    """Returns how many instances of topics exist in the given cluster."""
    return self.cluster[c][0].shape[0]
    
  def getClusterInstWeight(self, c, ti):
    """Returns how many times the given cluster topic instance has been instanced by a documents DP."""
    return self.cluster[c][0][ti,1]
    
  def getClusterInstTopic(self, c, ti):
    """Returns which topic the given cluster topic instance is an instance of."""
    return self.cluster[c][0][ti,0]

  def getClusterInstDual(self, c):
    """Returns a 2D array, where the first dimension is indexed by the topic instance, and the second contains two columns - the first the topic index, the second the weight. Do not edit return value - copy before use."""
    return self.cluster[c][0]

  def getClusterInstConc(self, c):
    """Returns the sampled concentration that goes with the DP from which the members of each documents DP are drawn."""
    return self.cluster[c][1]

  def getClusterInstBehMN(self, c):
    """Returns the multinomial on drawing behaviours for the given cluster."""
    return self.cluster[c][2]

  def getClusterInstPriorBehMN(self, c):
    """Returns the prior on the behaviour multinomial, as an array of integer counts aligned with the flag set."""
    return self.cluster[c][3]


  def docCount(self):
    """Returns the number of documents stored within. Should be the same as the corpus from which the sample was drawn."""
    return len(self.doc)

  def getDoc(self,d):
    """Given a document index this returns the appropriate DocSample object. These indices should align up with the document indices in the Corpus from which this Sample was drawn, assuming no documents have been deleted."""
    return self.doc[d]


  def delDoc(self, ident):
    """Given a document ident this finds the document with the ident and removes it from the model, completly - i.e. all the variables in the sample are also updated. Primarilly used to remove documents for resampling prior to using the model as a prior. Note that this can potentially leave entities with no users - they get culled when the model is loaded into the C++ data structure so as to not cause problems."""
    # Find and remove it from the document list...
    index = None
    for i in xrange(len(self.doc)):
      if self.doc[i].getIdent()==ident:
        index = i
        break
    if index is None: return

    victim = self.doc[index]
    self.doc = self.doc[:index] + self.doc[index+1:]
    

    # Update all the variables left behind by subtracting the relevant terms...
    cluster = self.cluster[victim.cluster]
    self.clusterUse[victim.cluster] -= 1

    ## First pass through the dp and remove its influence; at the same time note the arrays that need to be updated by each user when looping through...
    dp_ext = []
    for i in xrange(victim.dp.shape[0]):
      beh = victim.dp[i,0]
      #count = victim.dp[i,2]

      if beh==0: # Normal behaviour
        cluInst = victim.dp[i,1]

        # Update the instance, and topic use counts if necessary...
        topic = cluster[0][cluInst,0]
        cluster[0][cluInst,1] -= 1
        if cluster[0][cluInst,1]==0:
          self.topicUse[topic] -= 1

        # Store the entity that needs updating in correspondence with this dp instance in the next step...
        dp_ext.append((self.topicWord, topic))

      else: # Abnormal behaviour.
        # Store the entity that needs updating in correspondence with the dp...
        dp_ext.append((self.abnormTopicWord, beh))
    
    ## Go through the samples array and remove their influence - the hard part was done by the preceding step...
    for si in xrange(victim.samples.shape[0]):
      inst = victim.samples[si,0]
      word = victim.samples[si,1]
      mat, topic = dp_ext[inst]
      mat[topic,word] -= 1

    # Clean up all zeroed items...
    self.cleanZeros()


  def cleanZeros(self):
    """Goes through and removes anything that has a zero reference count, adjusting all indices accordingly."""

    # Remove the zeros from this object, noting the changes...

    ## Topics...
    newTopicCount = 0
    topicMap = dict()
    for t in xrange(self.topicUse.shape[0]):
      if self.topicUse[t]!=0:
        topicMap[t] = newTopicCount
        newTopicCount += 1

    if newTopicCount!=self.topicUse.shape[0]:
      newTopicWord = numpy.zeros((newTopicCount, self.topicWord.shape[1]), dtype=numpy.int32)
      newTopicUse = numpy.zeros(newTopicCount,dtype=numpy.int32)

      for origin, dest in topicMap.iteritems():
        newTopicWord[dest,:] = self.topicWord[origin,:]
        newTopicUse[dest] = self.topicUse[origin]
      
      self.topicWord = newTopicWord
      self.topicUse = newTopicUse

    ## Clusters...
    newClusterCount = 0
    clusterMap = dict()
    for c in xrange(self.clusterUse.shape[0]):
      if self.clusterUse[c]!=0:
        clusterMap[c] = newClusterCount
        newClusterCount += 1

    if newClusterCount!=self.clusterUse.shape[0]:
      newCluster = [None]*newClusterCount
      newClusterUse = numpy.zeros(newClusterCount, dtype=numpy.int32)

      for origin, dest in clusterMap.iteritems():
        newCluster[dest] = self.cluster[origin]
        newClusterUse[dest] = self.clusterUse[origin]

      self.cluster = newCluster
      self.clusterUse = newClusterUse

    ## Cluster instances...
    # (Each change is noted by a 2-tuple of (new length, dict), where dict maps old indices to new indices.)
    cluInstAdj = []
    for ci in xrange(len(self.cluster)):
      newInstCount = 0
      instMap = dict()
      for i in xrange(self.cluster[ci][0].shape[0]):
        if self.cluster[ci][0][i,1]!=0:
          instMap[i] = newInstCount
          newInstCount += 1

      cluInstAdj.append((newInstCount, instMap))

      if newInstCount!=self.cluster[ci][0].shape[0]:
        newInst = numpy.zeros((newInstCount,2), dtype=numpy.int32)

        for origin, dest in instMap.iteritems():
          newInst[dest,:] = self.cluster[ci][0][origin,:]

        self.cluster[ci] = (newInst, self.cluster[ci][1], self.cluster[ci][2], self.cluster[ci][3])


    # Iterate and update the topic indices of the cluster instances...
    for ci in xrange(len(self.cluster)):
      for i in xrange(self.cluster[ci][0].shape[0]):
        self.cluster[ci][0][i,0] = topicMap[self.cluster[ci][0][i,0]]

    # Now iterate the documents and update their cluster and cluster instance indices...
    for doc in self.doc:
      doc.cluster = clusterMap[doc.cluster]
      _, instMap = cluInstAdj[doc.cluster]

      for di in xrange(doc.dp.shape[0]):
        if doc.dp[di,0]==0:
          doc.dp[di,1] = instMap[doc.dp[di,1]]


  def nllAllDocs(self):
    """Returns the negative log likelihood of all the documents in the sample - a reasonable value to compare various samples with."""
    return sum(map(lambda d: d.getNLL(),self.doc))

  def logNegProbWordsGivenClusterAbnorm(self, doc, cluster, particles = 16, cap = -1):
    """Uses wallach's 'left to right' method to calculate the negative log probability of the words in the document given the rest of the model. Both the cluster (provided as an index) and the documents abnormalities vector are fixed for this calculation. Returns the average of the results for each sample contained within model. particles is the number of particles to use in the left to right estimation algorithm. This is implimented using scipy.weave."""
    return solvers.leftRightNegLogProbWord(self, doc, cluster, particles, cap)

  def logNegProbWordsGivenAbnorm(self, doc, particles = 16, cap = -1):
    """Uses logNegProbWordsGivenClusterAbnorm and simply sums out the cluster variable."""

    # Get the probability of each with the dependence with clusters...
    cluScores = map(lambda c: solvers.leftRightNegLogProbWord(self, doc, c, particles, cap), xrange(self.getClusterCount()))

    # Multiply each by the probability of the cluster, so it can be summed out...
    cluNorm = float(self.clusterUse.sum()) + self.clusterConc
    cluScores = map(lambda c,s: s - math.log(float(self.clusterUse[c])/cluNorm), xrange(len(cluScores)), cluScores)

    # Also need to include the probability of a new cluster, even though it is likely to be a negligible contribution...
    newVal = solvers.leftRightNegLogProbWord(self, doc, -1, particles, cap)
    newVal -= math.log(self.clusterConc/cluNorm)
    cluScores.append(newVal)

    # Sum out the cluster variable, in a numerically stable way given that we are dealing with negative log likelihood values that will map to extremely low probabilities...
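    # (This is the log-sum-exp trick applied to negative log likelihoods - the
    # smallest NLL is factored out so the exponentials stay within range.)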
    minScore = min(cluScores)
    cluPropProb = map(lambda s: math.exp(minScore-s), cluScores)
    return minScore - math.log(sum(cluPropProb))
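

# A minimal usage sketch (not part of the original source): querying a fitted
# Sample for its most probable words per topic. It assumes `sample` is a
# Sample produced by a trained model, and `vocab` is a hypothetical list
# mapping word indices to tokens.
def summariseSample(sample, vocab, topWords = 5):
  """Prints the most probable words of each topic in a fitted Sample."""
  mult = sample.getTopicMultinomials() # [topic, word] -> P(word|topic)
  for t in xrange(sample.getTopicCount()):
    order = mult[t,:].argsort()[::-1][:topWords] # Highest probability first.
    words = ', '.join(vocab[w] for w in order)
    print 'topic %i (used %i times): %s' % (t, sample.getTopicUseWeight(t), words)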
class State:
  """State object, as manipulated by a Gibbs sampler to get samples of the unknown parameters of the model."""
  def __init__(self, obj, params = None):
    """Constructs a state object given either another State object (clone), or a Corpus and a Params object. If the Params object is omitted it uses the default. Also supports construction from a single Document, where it uses lots of defaults but is basically identical to a Corpus with a single Document in - used as a shortcut when fitting a Document to an already learnt model."""
    if isinstance(obj, State):
      # Cloning time...
      self.dnrDocInsts = obj.dnrDocInsts
      self.dnrCluInsts = obj.dnrCluInsts
      self.seperateClusterConc = obj.seperateClusterConc
      self.seperateDocumentConc = obj.seperateDocumentConc
      self.oneCluster = obj.oneCluster
      self.calcBeta = obj.calcBeta
      self.calcCluBmn = obj.calcCluBmn
      self.calcPhi = obj.calcPhi
      self.resampleConcs = obj.resampleConcs
      self.behSamples = obj.behSamples
      
      self.alpha = PriorConcDP(obj.alpha)
      self.beta = obj.beta.copy()
      self.gamma = PriorConcDP(obj.gamma)
      self.rho = PriorConcDP(obj.rho)
      self.mu = PriorConcDP(obj.mu)
      self.phi = obj.phi.copy()

      self.topicWord = obj.topicWord.copy()
      self.topicUse = obj.topicUse.copy()
      self.topicConc = obj.topicConc

      self.abnormTopicWord = obj.abnormTopicWord.copy()

      self.cluster = map(lambda t: (t[0].copy(),t[1],t[2].copy()),obj.cluster)
      self.clusterUse = obj.clusterUse.copy()
      self.clusterConc = obj.clusterConc

      self.doc = map(lambda d: DocState(d), obj.doc)
      self.abnorms = dict(obj.abnorms)

      self.fia = FlagIndexArray(obj.fia)

      self.params = Params(obj.params)
      self.model = Model(obj.model)

    elif isinstance(obj, Document):
      # Construct from a single document...

      self.dnrDocInsts = False
      self.dnrCluInsts = False
      self.seperateClusterConc = False
      self.seperateDocumentConc = False
      self.oneCluster = False
      self.calcBeta = False
      self.calcCluBmn = False
      self.calcPhi = False
      self.resampleConcs = False
      self.behSamples = 1024

      wordCount = obj.getWord(obj.getWordCount()-1)[0]

      self.alpha = PriorConcDP()
      self.beta = numpy.ones(wordCount, dtype=numpy.float32)
      self.gamma = PriorConcDP()
      self.rho = PriorConcDP()
      self.mu = PriorConcDP()
      self.phi = numpy.ones(1+len(obj.getAbnorms()), dtype=numpy.float32)
      self.phi[0] *= 10.0
      self.phi /= self.phi.sum()

      self.topicWord = numpy.zeros((0,wordCount), dtype=numpy.int32)
      self.topicUse = numpy.zeros(0,dtype=numpy.int32)
      self.topicConc = self.gamma.conc

      self.abnormTopicWord = numpy.zeros((1+len(obj.getAbnorms()), wordCount), dtype=numpy.int32)

      self.cluster = []
      self.clusterUse = numpy.zeros(0,dtype=numpy.int32)
      self.clusterConc = self.mu.conc

      abnormDict = dict()
      for i, abnorm in enumerate(obj.getAbnorms()):
        abnormDict[abnorm] = i+1
        
      self.doc = [DocState(obj,self.alpha,abnormDict)]
      self.abnorms = dict()
      for num, abnorm in enumerate(obj.getAbnorms()):
        self.abnorms[abnorm] = num+1

      self.fia = FlagIndexArray(len(self.abnorms)+1)
      self.fia.addSingles()

      for doc in self.doc:
        doc.behFlagsIndex = self.fia.flagIndex(doc.behFlags)

      if params!=None: self.params = params
      else: self.params = Params()

      self.model = Model()
    else:
      # Construct from a corpus, as that is the only remaining option...

      # Behaviour flags...
      self.dnrDocInsts = obj.getDocInstsDNR()
      self.dnrCluInsts = obj.getCluInstsDNR()
      self.seperateClusterConc = obj.getSeperateClusterConc()
      self.seperateDocumentConc = obj.getSeperateDocumentConc()
      self.oneCluster = obj.getOneCluster()
      self.calcBeta = obj.getCalcBeta()
      self.calcCluBmn = obj.getCalcClusterBMN()
      self.calcPhi = obj.getCalcPhi()
      self.resampleConcs = obj.getResampleConcs()
      self.behSamples = obj.getBehSamples()

      # Concentration parameters - these are all constant...
      self.alpha = PriorConcDP(obj.getAlpha())
      self.beta = numpy.ones(obj.getWordCount(),dtype=numpy.float32)
      self.beta *= obj.getBeta()
      self.gamma = PriorConcDP(obj.getGamma())
      self.rho = PriorConcDP(obj.getRho())
      self.mu = PriorConcDP(obj.getMu())

      self.phi = numpy.ones(1+len(obj.getAbnormDict()), dtype=numpy.float32)
      self.phi[0] *= obj.getPhiRatio()
      self.phi *= obj.getPhiConc()*self.phi.shape[0] / self.phi.sum()

      # The topics in the model - consists of three parts - first an array indexed by [topic,word] which gives how many times each word has been drawn from the given topic - this alongside beta allows the relevant Dirichlet posterior to be determined. Additionally we have topicUse, which counts how many times each topic has been instanced in a cluster - this alongside topicConc, which is the sampled concentration, defines the DP from which topics are drawn for inclusion in clusters...
      self.topicWord = numpy.zeros((0,obj.getWordCount()),dtype=numpy.int32)
      self.topicUse = numpy.zeros(0,dtype=numpy.int32)
      self.topicConc = self.gamma.conc

      # A second topicWord-style matrix, indexed by behaviour and containing the abnormal topics. Entry 0, which is normal, is again an empty dummy...
      self.abnormTopicWord = numpy.zeros((1+len(obj.getAbnormDict()), obj.getWordCount()), dtype=numpy.int32)

      # Defines the clusters, as a list of (inst, conc, bmn, bmnPrior). inst is a 2D array, containing all the topic instances that make up the cluster - whilst the first dimension of the array indexes each instance, the second has only two entries: the first the index number of the topic, the second the number of using document instances. conc is the sampled concentration that completes the definition of the DP defined for each cluster. bmn is the multinomial on behaviours associated with the cluster - a 1D array of floats. bmnPrior is the flagSet aligned integer array that is the prior on bmn. Additionally we have the DDP from which the specific clusters are drawn - this is defined by clusterUse and clusterConc, just as for the topics...
      self.cluster = []
      self.clusterUse = numpy.zeros(0, dtype=numpy.int32)
      self.clusterConc = self.mu.conc

      # List of document objects, to contain the documents - whilst declared immediately below as an empty list we then proceed to fill it in with the information from the given Corpus...
      self.doc = []

      for doc in obj.documentList():
        self.doc.append(DocState(doc, self.alpha, obj.getAbnormDict()))

      # The abnormality dictionary - need a copy so we can convert from flags to the user provided codes after fitting the model...
      self.abnorms = dict(obj.getAbnormDict())

      # The flag index array - converts each flag combination to an index - required for learning the per-cluster behaviour multinomials...
      self.fia = FlagIndexArray(len(self.abnorms)+1)
      self.fia.addSingles()

      for doc in self.doc:
        doc.behFlagsIndex = self.fia.flagIndex(doc.behFlags)

      # Store the parameters...
      if params is not None: self.params = params
      else: self.params = Params()

      # Create a model object, for storing samples into...
      self.model = Model()


  def setGlobalParams(self, sample):
    """Sets a number of parameters for the State after initialisation, taking them from the given Sample object. Designed for use with the addPrior method this allows you to extract all relevant parameters from a Sample. Must be called before any Gibbs sampling takes place."""
    self.alpha = PriorConcDP(sample.alpha)
    self.beta = sample.beta.copy()
    self.gamma = PriorConcDP(sample.gamma)
    self.rho = PriorConcDP(sample.rho)
    self.mu = PriorConcDP(sample.mu)

    # No correct way of combining - the below seems reasonable enough however, and is correct if they have the same entries...
    for key,fromIndex in sample.abnorms.iteritems():
      if key in self.abnorms:
        toIndex = self.abnorms[key]
        self.phi[toIndex] = sample.phi[fromIndex]
    self.phi /= self.phi.sum()

    self.topicConc = sample.topicConc
    self.clusterConc = sample.clusterConc
    for doc in self.doc:
      doc.conc = self.alpha.conc
  
  def addPrior(self, sample):
    """Given a Sample object this uses it as a prior - this is primarilly used to sample a single or small number of documents using a model already trainned on another set of documents. It basically works by adding the topics, clusters and behaviours from the sample into this corpus, with the counts all intact so they have the relevant weight and can't be deleted. Note that you could in principle add multiple priors, though that would be quite a strange scenario. If only called once then the topic indices will line up. Note that all the prior parameters are not transfered, though often you would want to - setGlobalParams is provided to do this. Must be called before any Gibbs sampling takes place."""

    # Below code has evolved into spaghetti, via several other tasty culinary dishes, and needs a rewrite. Or to never be looked at or edited ever again. ###################
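    # Outline of what follows: (1) append the sample's topics; (2) merge the
    # abnormality dictionaries and abnormal word counts; (3) rebuild each
    # document's behaviour flags/counts; (4) remap the behaviour multinomials
    # of the old clusters and append the sample's clusters; (5) merge phi;
    # (6) rebuild the FlagIndexArray and remap the per-cluster bmn priors.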
    
    # Do the topics...
    offset = self.topicWord.shape[0]
    if self.topicWord.shape[0]!=0:
      self.topicWord = numpy.vstack((self.topicWord,sample.topicWord))
    else:
      self.topicWord = sample.topicWord.copy()
    self.topicUse = numpy.hstack((self.topicUse,sample.topicUse))

    # Calculate the new abnormalities dictionary...
    newAbnorms = dict(sample.abnorms)
    for key,_ in self.abnorms.iteritems():
      if key not in newAbnorms:
        val = len(newAbnorms)+1
        newAbnorms[key] = val

    # Transfer over the abnormal word counts...
    newAbnormTopicWord = numpy.zeros((1+len(newAbnorms), max((self.abnormTopicWord.shape[1], sample.abnormTopicWord.shape[1]))), dtype=numpy.int32)

    for abnorm,origin in self.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      limit = self.abnormTopicWord.shape[1]
      newAbnormTopicWord[dest,:limit] += self.abnormTopicWord[origin,:limit]

    for abnorm,origin in sample.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      limit = sample.abnormTopicWord.shape[1]
      newAbnormTopicWord[dest,:limit] += sample.abnormTopicWord[origin,:limit]

    # Update the document flags/counts for behaviours...
    for doc in self.doc:
      newFlags = numpy.zeros(1+len(newAbnorms), dtype=numpy.uint8)
      newCounts = numpy.zeros(1+len(newAbnorms), dtype=numpy.int32)
      newFlags[0] = doc.behFlags[0]
      newCounts[0] = doc.behCounts[0]

      for abnorm,origin in self.abnorms.iteritems():
        dest = newAbnorms[abnorm]
        newFlags[dest] = doc.behFlags[origin]
        newCounts[dest] = doc.behCounts[origin]
      
      doc.behFlags = newFlags
      doc.behCounts = newCounts

    # Update the old clusters' behaviour arrays...
    def mapOldCluster(c):
      c2 = numpy.ones(1+len(newAbnorms), dtype=numpy.float32)
      c2 /= c2.sum()
      
      c2[0] *= c[2][0]
      for abnorm,origin in self.abnorms.iteritems():
        dest = newAbnorms[abnorm]
        c2[dest] *= c[2][origin]
      c2 /= c2.sum()
      
      return (c[0],c[1],c2,c[3])
      
    self.cluster = map(mapOldCluster, self.cluster)
    origCluCount = len(self.cluster)
    
    # Add the new clusters, updating their behaviour arrays and topic indices, plus getting their priors updated with their associated documents...
    def mapCluster(pair):
      ci, c = pair
      
      c0 = c[0].copy()
      c0[:,0] += offset

      c2 = numpy.ones(1+len(newAbnorms), dtype=numpy.float32)
      c2 /= c2.sum()

      c2[0] *= c[2][0]
      for abnorm,origin in sample.abnorms.iteritems():
        dest = newAbnorms[abnorm]
        c2[dest] *= c[2][origin]
      c2 /= c2.sum()

      c3 = c[3].copy()
      for doc in filter(lambda doc: doc.cluster==ci, sample.doc):
        fi = sample.fia.flagIndex(doc.behFlags, False)
        if fi>=len(doc.behFlags): # Only bother if the document has abnormalities - this is a valid test of that.
          total = 0
          for i in xrange(doc.dp.shape[0]):
            c3[doc.dp[i,0]] += doc.dp[i,2]
            total += doc.dp[i,2]
          c3[fi] -= total + 1
      
      return (c0,c[1],c2,c3)
      
    self.cluster += map(mapCluster, enumerate(sample.cluster))
    self.clusterUse = numpy.hstack((self.clusterUse, sample.clusterUse))
    
    # Update phi...
    newPhi = numpy.ones(len(newAbnorms)+1,dtype=numpy.float32)
    newPhi[0] = 0.5*(self.phi[0]+sample.phi[0])
    
    for abnorm,origin in self.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      newPhi[dest] = self.phi[origin]
    for abnorm,origin in sample.abnorms.iteritems():
      dest = newAbnorms[abnorm]
      if abnorm not in self.abnorms:
        newPhi[dest] = sample.phi[origin]
      else:
        newPhi[dest] = 0.5*(newPhi[dest] + sample.phi[origin])
      
    self.phi = newPhi
    self.phi /= self.phi.sum()

    # Recreate the flag index array...
    remapOrig = dict() # Old flag positions to new flag positions.
    remapOrig[0] = 0
    for abnorm,origin in self.abnorms.iteritems():
      remapOrig[origin] = newAbnorms[abnorm]

    remapSam = dict() # sample flag positions to new flag positions.
    remapSam[0] = 0
    for abnorm,origin in sample.abnorms.iteritems():
      remapSam[origin] = newAbnorms[abnorm]
    
    newFia = FlagIndexArray(len(newAbnorms)+1)
    newFia.addSingles()
    behIndAdjOrig = newFia.addFlagIndexArray(self.fia,remapOrig)
    behIndAdjSam  = newFia.addFlagIndexArray(sample.fia,remapSam)

    for doc in self.doc:
      doc.behFlagsIndex = behIndAdjOrig[doc.behFlagsIndex]

    # Update cluster priors on bmn arrays...
    for c in xrange(len(self.cluster)):
      clu = self.cluster[c]
      newBmn = numpy.zeros(newFia.flagCount(),dtype=numpy.int32)
      oldBmn = clu[3].copy()

      # Translate from the old set...
      for b in xrange(oldBmn.shape[0]):
        index = behIndAdjOrig[b] if c<origCluCount else behIndAdjSam[b]
        newBmn[index] += oldBmn[b]

      self.cluster[c] = (clu[0], clu[1], clu[2], newBmn)

    # Replace the old abnormality and fia stuff...
    self.abnormTopicWord = newAbnormTopicWord
    self.abnorms = newAbnorms
    self.fia = newFia


  def sample(self):
    """Samples the current state, storing the current estimate of the model parameters."""
    self.model.sampleState(self)

  def absorbClone(self,clone):
    """Given a clone absorb all its samples - used for multiprocessing."""
    self.model.absorbModel(clone.model)


  def getParams(self):
    """Returns the parameters object."""
    return self.params
    
  def getModel(self):
    """Returns the model constructed from all the calls to sample()."""
    return self.model
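

# A minimal sketch (not part of the original source) of the workflow the
# docstrings above describe: fitting a single Document against an already
# learnt model held in a prior Sample. `gibbsPass` is a hypothetical stand-in
# for the actual Gibbs iteration, which lives elsewhere (e.g. in a solver).
def fitDocument(doc, prior, passes = 100):
  """Returns a Model holding samples of the given Document fitted under the given prior Sample."""
  state = State(doc)
  state.setGlobalParams(prior) # Copy the concentrations/priors across.
  state.addPrior(prior) # Add the prior's topics/clusters, counts intact.
  for _ in xrange(passes):
    gibbsPass(state) # Hypothetical - one Gibbs sweep over the state.
  state.sample() # Store the current parametrisation into the model.
  return state.getModel()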
Exemple #6
0
class State:
    """State object, as manipulated by a Gibbs sampler to get samples of the unknown parameters of the model."""
    def __init__(self, obj, params=None):
        """Constructs a state object given either another State object (clone), or a Corpus and a Params object. If the Params object is omitted it uses the default. Also supports construction from a single Document, where it uses lots of defaults but is basically identical to a Corpus with a single Document in - used as a shortcut when fitting a Document to an already learnt model."""
        if isinstance(obj, State):
            # Cloning time...
            self.dnrDocInsts = obj.dnrDocInsts
            self.dnrCluInsts = obj.dnrCluInsts
            self.seperateClusterConc = obj.seperateClusterConc
            self.seperateDocumentConc = obj.seperateDocumentConc
            self.oneCluster = obj.oneCluster
            self.calcBeta = obj.calcBeta
            self.calcCluBmn = obj.calcCluBmn
            self.calcPhi = obj.calcPhi
            self.resampleConcs = obj.resampleConcs
            self.behSamples = obj.behSamples

            self.alpha = PriorConcDP(obj.alpha)
            self.beta = obj.beta.copy()
            self.gamma = PriorConcDP(obj.gamma)
            self.rho = PriorConcDP(obj.rho)
            self.mu = PriorConcDP(obj.mu)
            self.phi = obj.phi.copy()

            self.topicWord = obj.topicWord.copy()
            self.topicUse = obj.topicUse.copy()
            self.topicConc = obj.topicConc

            self.abnormTopicWord = obj.abnormTopicWord.copy()

            self.cluster = map(lambda t: (t[0].copy(), t[1], t[2].copy()),
                               obj.cluster)
            self.clusterUse = obj.clusterUse.copy()
            self.clusterConc = obj.clusterConc

            self.doc = map(lambda d: DocState(d), obj.doc)
            self.abnorms = dict(obj.abnorms)

            self.fia = FlagIndexArray(obj.fia)

            self.params = Params(obj.params)
            self.model = Model(obj.model)

        elif isinstance(obj, Document):
            # Construct from a single document...

            self.dnrDocInsts = False
            self.dnrCluInsts = False
            self.seperateClusterConc = False
            self.seperateDocumentConc = False
            self.oneCluster = False
            self.calcBeta = False
            self.calcCluBmn = False
            self.calcPhi = False
            self.resampleConcs = False
            self.behSamples = 1024

            wordCount = obj.getWord(obj.getWordCount() - 1)[0]

            self.alpha = PriorConcDP()
            self.beta = numpy.ones(wordCount, dtype=numpy.float32)
            self.gamma = PriorConcDP()
            self.rho = PriorConcDP()
            self.mu = PriorConcDP()
            self.phi = numpy.ones(1 + len(obj.getAbnorms()),
                                  dtype=numpy.float32)
            self.phi[0] *= 10.0
            self.phi /= self.phi.sum()

            self.topicWord = numpy.zeros((0, wordCount), dtype=numpy.int32)
            self.topicUse = numpy.zeros(0, dtype=numpy.int32)
            self.topicConc = self.gamma.conc

            self.abnormTopicWord = numpy.zeros(
                (1 + len(obj.getAbnorms()), wordCount), dtype=numpy.int32)

            self.cluster = []
            self.clusterUse = numpy.zeros(0, dtype=numpy.int32)
            self.clusterConc = self.mu.conc

            abnormDict = dict()
            for i, abnorm in enumerate(obj.getAbnorms()):
                abnormDict[abnorm] = i + 1

            self.doc = [DocState(obj, self.alpha, abnormDict)]
            self.abnorms = dict()
            for num, abnorm in enumerate(obj.getAbnorms()):
                self.abnorms[abnorm] = num + 1

            self.fia = FlagIndexArray(len(self.abnorms) + 1)
            self.fia.addSingles()

            for doc in self.doc:
                doc.behFlagsIndex = self.fia.flagIndex(doc.behFlags)

            if params != None: self.params = params
            else: self.params = Params()

            self.model = Model()
        else:
            # Construct from a corpus, as that is the only remaining option...

            # Behaviour flags...
            self.dnrDocInsts = obj.getDocInstsDNR()
            self.dnrCluInsts = obj.getCluInstsDNR()
            self.seperateClusterConc = obj.getSeperateClusterConc()
            self.seperateDocumentConc = obj.getSeperateDocumentConc()
            self.oneCluster = obj.getOneCluster()
            self.calcBeta = obj.getCalcBeta()
            self.calcCluBmn = obj.getCalcClusterBMN()
            self.calcPhi = obj.getCalcPhi()
            self.resampleConcs = obj.getResampleConcs()
            self.behSamples = obj.getBehSamples()

            # Concentration parameters - these are all constant...
            self.alpha = PriorConcDP(obj.getAlpha())
            self.beta = numpy.ones(obj.getWordCount(), dtype=numpy.float32)
            self.beta *= obj.getBeta()
            self.gamma = PriorConcDP(obj.getGamma())
            self.rho = PriorConcDP(obj.getRho())
            self.mu = PriorConcDP(obj.getMu())

            self.phi = numpy.ones(1 + len(obj.getAbnormDict()),
                                  dtype=numpy.float32)
            self.phi[0] *= obj.getPhiRatio()
            self.phi *= obj.getPhiConc() * self.phi.shape[0] / self.phi.sum()

            # The topics in the model - consists of three parts - first an array indexed by [topic,word] which gives how many times each word has been drawn from the given topic - this alongside beta allows the relevant Dirichlet posterior to be determined. Additionally we have topicUse, which counts how many times each topic has been instanced in a cluster - this alongside topicConc, which is the sampled concentration, defines the DP from which topics are drawn for inclusion in clusters...
            self.topicWord = numpy.zeros((0, obj.getWordCount()),
                                         dtype=numpy.int32)
            self.topicUse = numpy.zeros(0, dtype=numpy.int32)
            self.topicConc = self.gamma.conc

            # A second topicWord-style matrix, indexed by behaviour and containing the abnormal topics. Entry 0, which is normal, is again an empty dummy...
            self.abnormTopicWord = numpy.zeros(
                (1 + len(obj.getAbnormDict()), obj.getWordCount()),
                dtype=numpy.int32)

            # Defines the clusters, as a list of (inst, conc, bmn, bmnPrior). inst is a 2D array, containing all the topic instances that make up the cluster - whilst the first dimension of the array indexes each instance the second has two entrys only, the first the index number for the topic, the second the number of using document instances. conc is the sampled concentration that completes the definition of the DP defined for each cluster. bmn is the multinomial on behaviours associated with the cluster - a 1D array of floats. bmnPrior is the flagSet aligned integer array that is the prior on bmn. Additionally we have the DDP from which the specific clusters are drawn - this is defined by clusterUse and clusterConc, just as for the topics...
            self.cluster = []
            self.clusterUse = numpy.zeros(0, dtype=numpy.int32)
            self.clusterConc = self.mu.conc

            # List of document objects, to contain the documents - whilst declared immediatly below as an empty list we then proceed to fill it in with the information from the given Corpus...
            self.doc = []

            for doc in obj.documentList():
                self.doc.append(DocState(doc, self.alpha, obj.getAbnormDict()))

            # The abnormality dictionary - we need a copy so we can convert from flags back to the user-provided codes after fitting the model...
            self.abnorms = dict(obj.getAbnormDict())

            # The flag index array - converts each flag combination to an index - required for learning the per-cluster behaviour multinomials...
            self.fia = FlagIndexArray(len(self.abnorms) + 1)
            self.fia.addSingles()
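            # (Assumption about FlagIndexArray, not verified here: addSingles presumably registers each single-behaviour flag vector up front, so the flagIndex calls below only add entries for documents whose flag combinations are novel.)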

            for doc in self.doc:
                doc.behFlagsIndex = self.fia.flagIndex(doc.behFlags)

            # Store the parameters...
            if params is not None: self.params = params
            else: self.params = Params()

            # Create a model object, for storing samples into...
            self.model = Model()

    def setGlobalParams(self, sample):
        """Sets a number of parameters for the State after initialisation, taking them from the given Sample object. Designed for use with the addPrior method this allows you to extract all relevant parameters from a Sample. Must be called before any Gibbs sampling takes place."""
        self.alpha = PriorConcDP(sample.alpha)
        self.beta = sample.beta.copy()
        self.gamma = PriorConcDP(sample.gamma)
        self.rho = PriorConcDP(sample.rho)
        self.mu = PriorConcDP(sample.mu)

        # There is no single correct way of combining these - the below seems reasonable enough, however, and is exact if both have the same entries...
        for key, fromIndex in sample.abnorms.iteritems():
            if key in self.abnorms:
                toIndex = self.abnorms[key]
                self.phi[toIndex] = sample.phi[fromIndex]
        self.phi /= self.phi.sum()
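        # (E.g. an abnormality key shared by both dictionaries has its phi weight copied across, index-remapped, before the renormalisation - a heuristic, as noted above.)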

        self.topicConc = sample.topicConc
        self.clusterConc = sample.clusterConc
        for doc in self.doc:
            doc.conc = self.alpha.conc

    def addPrior(self, sample):
        """Given a Sample object this uses it as a prior - this is primarilly used to sample a single or small number of documents using a model already trainned on another set of documents. It basically works by adding the topics, clusters and behaviours from the sample into this corpus, with the counts all intact so they have the relevant weight and can't be deleted. Note that you could in principle add multiple priors, though that would be quite a strange scenario. If only called once then the topic indices will line up. Note that all the prior parameters are not transfered, though often you would want to - setGlobalParams is provided to do this. Must be called before any Gibbs sampling takes place."""

        # The code below has evolved into spaghetti, via several other tasty culinary dishes, and needs a rewrite. Or to never be looked at or edited ever again.

        # Do the topics...
        offset = self.topicWord.shape[0]
        if self.topicWord.shape[0] != 0:
            self.topicWord = numpy.vstack((self.topicWord, sample.topicWord))
        else:
            self.topicWord = sample.topicWord.copy()
        self.topicUse = numpy.hstack((self.topicUse, sample.topicUse))

        # Calculate the new abnormalities dictionary...
        newAbnorms = dict(sample.abnorms)
        for key, _ in self.abnorms.iteritems():
            if key not in newAbnorms:
                val = len(newAbnorms) + 1
                newAbnorms[key] = val
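        # (Illustration with invented entries: if sample.abnorms == {'a': 1} and self.abnorms == {'a': 1, 'b': 2} then newAbnorms ends up as {'a': 1, 'b': 2} - index 0 is reserved for normal behaviour, hence the +1 above.)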

        # Transfer over the abnormal word counts...
        newAbnormTopicWord = numpy.zeros(
            (1 + len(newAbnorms),
             max((self.abnormTopicWord.shape[1],
                  sample.abnormTopicWord.shape[1]))),
            dtype=numpy.int32)

        for abnorm, origin in self.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            limit = self.abnormTopicWord.shape[1]
            newAbnormTopicWord[dest, :limit] += self.abnormTopicWord[origin, :limit]

        for abnorm, origin in sample.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            limit = sample.abnormTopicWord.shape[1]
            newAbnormTopicWord[dest, :limit] += sample.abnormTopicWord[origin, :limit]

        # Update the document flags/counts for behaviours...
        for doc in self.doc:
            newFlags = numpy.zeros(1 + len(newAbnorms), dtype=numpy.uint8)
            newCounts = numpy.zeros(1 + len(newAbnorms), dtype=numpy.int32)
            newFlags[0] = doc.behFlags[0]
            newCounts[0] = doc.behCounts[0]

            for abnorm, origin in self.abnorms.iteritems():
                dest = newAbnorms[abnorm]
                newFlags[dest] = doc.behFlags[origin]
                newCounts[dest] = doc.behCounts[origin]

            doc.behFlags = newFlags
            doc.behCounts = newCounts

        # Update the old clusters behaviour arrays...
        def mapOldCluster(c):
            c2 = numpy.ones(1 + len(newAbnorms), dtype=numpy.float32)
            c2 /= c2.sum()

            c2[0] *= c[2][0]
            for abnorm, origin in self.abnorms.iteritems():
                dest = newAbnorms[abnorm]
                c2[dest] *= c[2][origin]
            c2 /= c2.sum()

            return (c[0], c[1], c2, c[3])

        self.cluster = map(mapOldCluster, self.cluster)
        origCluCount = len(self.cluster)

        # Add the new clusters, updating their behaviour arrays and topic indices, plus getting their priors updated with their associated documents...
        def mapCluster(pair):
            ci, c = pair

            c0 = c[0].copy()
            c0[:, 0] += offset

            c2 = numpy.ones(1 + len(newAbnorms), dtype=numpy.float32)
            c2 /= c2.sum()

            c2[0] *= c[2][0]
            for abnorm, origin in sample.abnorms.iteritems():
                dest = newAbnorms[abnorm]
                c2[dest] *= c[2][origin]
            c2 /= c2.sum()

            c3 = c[3].copy()
            for doc in filter(lambda doc: doc.cluster == ci, sample.doc):
                fi = sample.fia.flagIndex(doc.behFlags, False)
                # Only bother if the document has abnormalities, of which this is a valid test.
                if fi >= len(doc.behFlags):
                    total = 0
                    for i in xrange(doc.dp.shape[0]):
                        c3[doc.dp[i, 0]] += doc.dp[i, 2]
                        total += doc.dp[i, 2]
                    c3[fi] -= total + 1

            return (c0, c[1], c2, c3)

        self.cluster += map(mapCluster, enumerate(sample.cluster))
        self.clusterUse = numpy.hstack((self.clusterUse, sample.clusterUse))
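        # (Note the offset added to c0[:, 0] in mapCluster: the sample's clusters reference topics by row, so their indices are shifted to point at the rows appended to topicWord at the start of this method.)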

        # Update phi...
        newPhi = numpy.ones(len(newAbnorms) + 1, dtype=numpy.float32)
        newPhi[0] = 0.5 * (self.phi[0] + sample.phi[0])

        for abnorm, origin in self.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            newPhi[dest] = self.phi[origin]
        for abnorm, origin in sample.abnorms.iteritems():
            dest = newAbnorms[abnorm]
            if abnorm not in self.abnorms:
                newPhi[dest] = sample.phi[origin]
            else:
                newPhi[dest] = 0.5 * (newPhi[dest] + sample.phi[origin])

        self.phi = newPhi
        self.phi /= self.phi.sum()
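        # (Worked with invented values: if self.phi[0] == 2.0 and sample.phi[0] == 1.0 then newPhi[0] == 1.5; shared abnormalities are likewise averaged, unshared ones copied, and the final division makes phi sum to one.)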

        # Recreate the flag index array...
        remapOrig = dict()  # Old flag positions to new flag positions.
        remapOrig[0] = 0
        for abnorm, origin in self.abnorms.iteritems():
            remapOrig[origin] = newAbnorms[abnorm]

        remapSam = dict()  # Sample flag positions to new flag positions.
        remapSam[0] = 0
        for abnorm, origin in sample.abnorms.iteritems():
            remapSam[origin] = newAbnorms[abnorm]

        newFia = FlagIndexArray(len(newAbnorms) + 1)
        newFia.addSingles()
        behIndAdjOrig = newFia.addFlagIndexArray(self.fia, remapOrig)
        behIndAdjSam = newFia.addFlagIndexArray(sample.fia, remapSam)

        for doc in self.doc:
            doc.behFlagsIndex = behIndAdjOrig[doc.behFlagsIndex]
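        # (behIndAdjOrig and behIndAdjSam are assumed to be lookup arrays from old fia indices to newFia indices - that assumption is what lets the document indices here, and the cluster priors below, be remapped by simple indexing.)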

        # Update cluster priors on bmn arrays...
        for c in xrange(len(self.cluster)):
            clu = self.cluster[c]
            newBmn = numpy.zeros(newFia.flagCount(), dtype=numpy.int32)
            oldBmn = clu[3].copy()

            # Translate from the old flag-index set to the new one...
            for b in xrange(oldBmn.shape[0]):
                adj = behIndAdjOrig if c < origCluCount else behIndAdjSam
                newBmn[adj[b]] += oldBmn[b]

            self.cluster[c] = (clu[0], clu[1], clu[2], newBmn)

        # Replace the old abnormality and fia stuff...
        self.abnormTopicWord = newAbnormTopicWord
        self.abnorms = newAbnorms
        self.fia = newFia

    def sample(self):
        """Samples the current state, storing the current estimate of the model parameters."""
        self.model.sampleState(self)

    def absorbClone(self, clone):
        """Given a clone absorb all its samples - used for multiprocessing."""
        self.model.absorbModel(clone.model)

    def getParams(self):
        """Returns the parameters object."""
        return self.params

    def getModel(self):
        """Returns the model constructed from all the calls to sample()."""
        return self.model
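
# A minimal usage sketch - not part of the original source. It assumes the
# surrounding module exposes State, Params and a Corpus with the getters used
# above; every name below is illustrative rather than verified, so it is kept
# as a commented outline:
#
#   corpus = Corpus(...)                  # Build or load the corpus.
#   state = State(corpus, Params())
#   prior = trainedModel.sampleList()[0]  # Hypothetical accessor on a Model.
#   state.addPrior(prior)                 # Must precede any Gibbs sampling.
#   state.setGlobalParams(prior)
#   # ...run the Gibbs sampler (defined elsewhere in the module), calling...
#   state.sample()                        # ...after each pass to store a draw.
#   model = state.getModel()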