Example #1
0
   def update_learner(self,example):
      """
      Performs one stochastic gradient descent step on a single example.

      ``example[0]`` is copied into the input layer and ``example[1]``
      indexes the target entry of the softmax output. The parameters
      ``self.Ws``, ``self.cs``, ``self.U`` and ``self.d`` are updated
      in place and ``self.n_updates`` is incremented.
      """
      n_hidden = self.n_hidden_layers
      # Decayed step size. It is folded into the output gradient below, so
      # every back-propagated quantity is already scaled by it.
      step_size = self.learning_rate/(1.+self.decrease_constant*self.n_updates)

      # fprop: input -> sigmoid hidden layers -> softmax output
      self.layers[0][:] = example[0]
      for lyr in range(n_hidden):
         mllin.product_matrix_vector(self.Ws[lyr],self.layers[lyr],self.layer_acts[lyr+1])
         self.layer_acts[lyr+1] += self.cs[lyr]
         mlnonlin.sigmoid(self.layer_acts[lyr+1],self.layers[lyr+1])

      top_layer = self.layers[-1]
      mllin.product_matrix_vector(self.U,top_layer,self.output_act)
      self.output_act += self.d
      mlnonlin.softmax(self.output_act,self.output)

      # bprop: start from (softmax output - one-hot target), scaled by the
      # step size.
      self.doutput_act[:] = self.output
      self.doutput_act[example[1]] -= 1
      self.doutput_act *= step_size

      mllin.outer(self.doutput_act,top_layer,self.dU)
      self.dd[:] = self.doutput_act
      mllin.product_matrix_vector(self.U.T,self.doutput_act,self.dlayers[-1])
      mlnonlin.dsigmoid(top_layer,self.dlayers[-1],self.dlayer_acts[-1])
      for lyr in reversed(range(n_hidden)):
         grad_act = self.dlayer_acts[lyr+1]
         self.dcs[lyr][:] = grad_act
         mllin.outer(grad_act,self.layers[lyr],self.dWs[lyr])
         mllin.product_matrix_vector(self.Ws[lyr].T,grad_act,self.dlayers[lyr])
         mlnonlin.dsigmoid(self.layers[lyr],self.dlayers[lyr],self.dlayer_acts[lyr])

      # Parameter update (the gradients already include the step size)
      self.U -= self.dU
      self.d -= self.dd
      for lyr in reversed(range(n_hidden)):
         self.Ws[lyr] -= self.dWs[lyr]
         self.cs[lyr] -= self.dcs[lyr]

      self.n_updates += 1
Example #2
0
   def update_learner(self,example):
      """
      Performs one stochastic gradient descent step on a single example.

      ``example[0]`` is copied into the input layer and ``example[1]``
      indexes the target entry of the softmax output. The hidden
      non-linearity is selected by ``self.activation_function``. The
      parameters ``self.Ws``, ``self.cs``, ``self.U`` and ``self.d`` are
      updated in place and ``self.n_updates`` is incremented.

      Raises ``ValueError`` the first time a non-linearity is needed if
      ``self.activation_function`` is not 'sigmoid', 'tanh' or 'reclin'.
      """
      def activate(pre_act,out):
         # Applies the configured hidden non-linearity to ``pre_act``,
         # writing the result into ``out``.
         kind = self.activation_function
         if kind == 'sigmoid':
            mlnonlin.sigmoid(pre_act,out)
         elif kind == 'tanh':
            mlnonlin.tanh(pre_act,out)
         elif kind == 'reclin':
            mlnonlin.reclin(pre_act,out)
         else:
            raise ValueError('activation_function must be either \'sigmoid\', \'tanh\' or \'reclin\'')

      def activate_grad(out,dout,dpre_act):
         # Back-propagates ``dout`` through the configured non-linearity,
         # writing the pre-activation gradient into ``dpre_act``.
         kind = self.activation_function
         if kind == 'sigmoid':
            mlnonlin.dsigmoid(out,dout,dpre_act)
         elif kind == 'tanh':
            mlnonlin.dtanh(out,dout,dpre_act)
         elif kind == 'reclin':
            mlnonlin.dreclin(out,dout,dpre_act)
         else:
            raise ValueError('activation_function must be either \'sigmoid\', \'tanh\' or \'reclin\'')

      self.layers[0][:] = example[0]

      # fprop
      for h in range(self.n_hidden_layers):
         mllin.product_matrix_vector(self.Ws[h],self.layers[h],self.layer_acts[h+1])
         self.layer_acts[h+1] += self.cs[h]
         activate(self.layer_acts[h+1],self.layers[h+1])

      mllin.product_matrix_vector(self.U,self.layers[-1],self.output_act)
      self.output_act += self.d
      mlnonlin.softmax(self.output_act,self.output)

      # bprop: start from (softmax output - one-hot target). The decayed
      # learning rate is folded in here, so every gradient below is
      # already scaled by it.
      self.doutput_act[:] = self.output
      self.doutput_act[example[1]] -= 1
      self.doutput_act *= self.learning_rate/(1.+self.decrease_constant*self.n_updates)

      self.dd[:] = self.doutput_act
      mllin.outer(self.doutput_act,self.layers[-1],self.dU)
      mllin.product_matrix_vector(self.U.T,self.doutput_act,self.dlayers[-1])
      activate_grad(self.layers[-1],self.dlayers[-1],self.dlayer_acts[-1])

      for h in range(self.n_hidden_layers-1,-1,-1):
         self.dcs[h][:] = self.dlayer_acts[h+1]
         mllin.outer(self.dlayer_acts[h+1],self.layers[h],self.dWs[h])
         mllin.product_matrix_vector(self.Ws[h].T,self.dlayer_acts[h+1],self.dlayers[h])
         activate_grad(self.layers[h],self.dlayers[h],self.dlayer_acts[h])

      # Parameter update
      self.U -= self.dU
      self.d -= self.dd
      for h in range(self.n_hidden_layers-1,-1,-1):
         self.Ws[h] -= self.dWs[h]
         self.cs[h] -= self.dcs[h]

      self.n_updates += 1
Example #3
0
   def bprop(self,target):
       """
       Computes the loss derivatives with respect to all parameters
       times the current learning rate.  It assumes that
       ``self.fprop(input)`` was called first. All the derivatives are
       put in their corresponding object attributes (i.e. ``self.d*``).

       ``target`` indexes the entry of ``self.output`` that is decremented
       by 1 to form the output gradient. When
       ``self.autoencoder_regularization`` is non-zero, the gradient of the
       reconstruction term is added to ``self.dWs`` and ``self.dcs``.

       Raises ``ValueError`` if ``self.activation_function`` is not
       'sigmoid', 'tanh' or 'reclin'.
       """
       def activate_grad(out,dout,dpre_act):
           # Back-propagates ``dout`` through the configured non-linearity,
           # writing the pre-activation gradient into ``dpre_act``.
           kind = self.activation_function
           if kind == 'sigmoid':
               mlnonlin.dsigmoid(out,dout,dpre_act)
           elif kind == 'tanh':
               mlnonlin.dtanh(out,dout,dpre_act)
           elif kind == 'reclin':
               mlnonlin.dreclin(out,dout,dpre_act)
           else:
               raise ValueError('activation_function must be either \'sigmoid\', \'tanh\' or \'reclin\'')

       # Denominator of the decayed learning rate, shared by both terms.
       decay = 1.+self.decrease_constant*self.n_updates

       self.doutput_act[:] = self.output
       self.doutput_act[target] -= 1
       self.doutput_act *= self.learning_rate/decay

       self.dd[:] = self.doutput_act
       for k in range(self.n_k_means):
           # Only the parameter set of the selected cluster of each k-means
           # is touched; ``idx`` is its position in the flat parameter lists.
           c = self.cluster_indices[k]
           idx = c + k*self.n_clusters

           mllin.outer(self.doutput_act,self.layers[k],self.dVs[idx])
           mllin.product_matrix_vector(self.Vs[idx].T,self.doutput_act,self.dlayers[k])
           activate_grad(self.layers[k],self.dlayers[k],self.dlayer_acts[k])

           self.dcs[idx][:] = self.dlayer_acts[k]
           mllin.outer(self.dlayer_acts[k],self.input,self.dWs[idx])

       if self.autoencoder_regularization != 0:
           # Reconstruction error (dae_output - input), scaled by twice the
           # regularization weight and by the decayed learning rate.
           self.dae_doutput_act[:] = self.dae_output
           self.dae_doutput_act[:] -= self.input
           self.dae_doutput_act *= 2*self.autoencoder_regularization*self.learning_rate/decay

           self.dae_dd[:] = self.dae_doutput_act
           for k in range(self.n_k_means):
               c = self.cluster_indices[k]
               idx = c + k*self.n_clusters

               # ``Ws[idx]`` is used both here (to map the output gradient
               # back to the hidden layer) and on the encoder side, so it
               # receives two gradient contributions.
               mllin.outer(self.dae_doutput_act,self.dae_layers[k],self.dae_dWsT[idx])
               self.dWs[idx] += self.dae_dWsT[idx].T
               mllin.product_matrix_vector(self.Ws[idx],self.dae_doutput_act,self.dae_dlayers[k])
               activate_grad(self.dae_layers[k],self.dae_dlayers[k],self.dae_dlayer_acts[k])

               self.dcs[idx] += self.dae_dlayer_acts[k]
               mllin.outer(self.dae_dlayer_acts[k],self.dae_input,self.dae_dWs[idx])
               self.dWs[idx] += self.dae_dWs[idx]
Example #4
0
 def update_learner(self,example):
     """
     Performs one contrastive divergence (CD-k) update from one example.

     ``example[0]`` holds the values written into ``self.input`` at the
     positions ``example[1]``; every other input entry is reset to 0.
     ``self.W``, ``self.b`` and ``self.c`` are updated in place and
     ``self.n_updates`` is incremented.
     """
     # Build the input vector; its sum is used to scale the hidden bias
     # and to size the multinomial reconstruction.
     self.input[:] = 0
     self.input[example[1]] = example[0]
     n_words = int(self.input.sum())

     # Positive phase
     mllin.product_matrix_vector(self.W,self.input,self.hidden_act)
     self.hidden_act += self.c*n_words
     mlnonlin.sigmoid(self.hidden_act,self.hidden_prob)

     # Negative phase: the chain starts from the data-driven hidden
     # probabilities.
     self.neg_hidden_prob[:] = self.hidden_prob
     for _step in range(self.k_contrastive_divergence_steps):
         # Hidden state: probabilities (mean-field) or a binary sample
         if self.mean_field:
            self.hidden[:] = self.neg_hidden_prob
         else:
            hidden_noise = self.rng.rand(self.hidden_size)
            np.less(hidden_noise,self.neg_hidden_prob,out=self.hidden)

         # Reconstruct the input through a softmax over its entries
         mllin.product_matrix_vector(self.W.T,self.hidden,self.neg_input_act)
         self.neg_input_act += self.b
         mlnonlin.softmax(self.neg_input_act,self.neg_input_prob)
         if self.mean_field:
            self.neg_input[:] = n_words*self.neg_input_prob
         else:
            self.neg_input[:] = self.rng.multinomial(n_words,self.neg_input_prob)

         # Hidden probabilities given the reconstruction
         mllin.product_matrix_vector(self.W,self.neg_input,self.neg_hidden_act)
         self.neg_hidden_act += self.c*n_words
         mlnonlin.sigmoid(self.neg_hidden_act,self.neg_hidden_prob)

     # Positive minus negative statistics
     np.subtract(self.input,self.neg_input,out=self.deltab)
     np.subtract(self.hidden_prob,self.neg_hidden_prob,out=self.deltac)
     mllin.outer(self.hidden_prob,self.input,self.deltaW)
     mllin.outer(self.neg_hidden_prob,self.neg_input,self.neg_stats)
     self.deltaW -= self.neg_stats

     # Scale by the decayed learning rate (the hidden bias step is
     # additionally multiplied by n_words).
     decay = 1.+self.decrease_constant*self.n_updates
     self.deltaW *= self.learning_rate/decay
     self.deltab *= self.learning_rate/decay
     self.deltac *= n_words*self.learning_rate/decay

     self.W += self.deltaW
     self.b += self.deltab
     self.c += self.deltac

     self.n_updates += 1
Example #5
0
    def update_learner(self, example):
        """
        Run one CD-k parameter update using a single example.

        The input vector is cleared and then filled with ``example[0]`` at
        the indices ``example[1]``. ``self.W``, ``self.b`` and ``self.c``
        are modified in place; ``self.n_updates`` is incremented at the end.
        """
        visible = self.input
        visible[:] = 0
        visible[example[1]] = example[0]
        # The input total scales the hidden bias and sizes the multinomial
        # reconstruction below.
        n_words = int(visible.sum())

        # Data-driven hidden probabilities
        mllin.product_matrix_vector(self.W, visible, self.hidden_act)
        self.hidden_act += self.c * n_words
        mlnonlin.sigmoid(self.hidden_act, self.hidden_prob)

        # The negative chain is seeded with the positive-phase probabilities.
        self.neg_hidden_prob[:] = self.hidden_prob
        for _ in range(self.k_contrastive_divergence_steps):
            if self.mean_field:
                self.hidden[:] = self.neg_hidden_prob
            else:
                uniform_draws = self.rng.rand(self.hidden_size)
                np.less(uniform_draws, self.neg_hidden_prob, out=self.hidden)

            mllin.product_matrix_vector(self.W.T, self.hidden,
                                        self.neg_input_act)
            self.neg_input_act += self.b
            mlnonlin.softmax(self.neg_input_act, self.neg_input_prob)
            if self.mean_field:
                self.neg_input[:] = n_words * self.neg_input_prob
            else:
                self.neg_input[:] = self.rng.multinomial(
                    n_words, self.neg_input_prob)

            mllin.product_matrix_vector(self.W, self.neg_input,
                                        self.neg_hidden_act)
            self.neg_hidden_act += self.c * n_words
            mlnonlin.sigmoid(self.neg_hidden_act, self.neg_hidden_prob)

        # Difference between data statistics and chain statistics
        mllin.outer(self.hidden_prob, visible, self.deltaW)
        mllin.outer(self.neg_hidden_prob, self.neg_input, self.neg_stats)
        self.deltaW -= self.neg_stats
        np.subtract(visible, self.neg_input, out=self.deltab)
        np.subtract(self.hidden_prob, self.neg_hidden_prob, out=self.deltac)

        # Decayed learning rate; the hidden bias step also carries n_words.
        denominator = 1.0 + self.decrease_constant * self.n_updates
        self.deltaW *= self.learning_rate / denominator
        self.deltab *= self.learning_rate / denominator
        self.deltac *= n_words * self.learning_rate / denominator

        self.W += self.deltaW
        self.b += self.deltab
        self.c += self.deltac

        self.n_updates += 1
Example #6
0
    def update_learner(self, example):
        """
        Run one CD-1 parameter update using ``example`` as the input vector.

        ``self.W``, ``self.b`` and ``self.c`` are modified in place. When
        ``self.l1_regularization`` is positive, weights whose magnitude does
        not exceed the scaled threshold are zeroed after the update.
        ``self.n_updates`` is incremented at the end.
        """
        self.input[:] = example

        # Positive phase: hidden probabilities and a binary sample of them
        mllin.product_matrix_vector(self.W, self.input, self.hidden_act)
        self.hidden_act += self.c
        mlnonlin.sigmoid(self.hidden_act, self.hidden_prob)
        hidden_noise = self.rng.rand(self.hidden_size)
        np.less(hidden_noise, self.hidden_prob, out=self.hidden)

        # Negative phase: sample a reconstruction of the input ...
        mllin.product_matrix_vector(self.W.T, self.hidden, self.neg_input_act)
        self.neg_input_act += self.b
        mlnonlin.sigmoid(self.neg_input_act, self.neg_input_prob)
        input_noise = self.rng.rand(self.input_size)
        np.less(input_noise, self.neg_input_prob, out=self.neg_input)

        # ... and the hidden probabilities it induces
        mllin.product_matrix_vector(self.W, self.neg_input,
                                    self.neg_hidden_act)
        self.neg_hidden_act += self.c
        mlnonlin.sigmoid(self.neg_hidden_act, self.neg_hidden_prob)

        # Positive minus negative statistics
        np.subtract(self.input, self.neg_input, out=self.deltab)
        np.subtract(self.hidden_prob, self.neg_hidden_prob, out=self.deltac)
        mllin.outer(self.hidden_prob, self.input, self.deltaW)
        mllin.outer(self.neg_hidden_prob, self.neg_input, self.neg_stats)
        self.deltaW -= self.neg_stats

        # Scale by the decayed learning rate and apply
        decay = 1. + self.decrease_constant * self.n_updates
        self.deltaW *= self.learning_rate / decay
        self.deltab *= self.learning_rate / decay
        self.deltac *= self.learning_rate / decay

        self.W += self.deltaW
        self.b += self.deltab
        self.c += self.deltac

        if self.l1_regularization > 0:
            # Multiplying by the boolean mask zeroes every weight whose
            # magnitude is not above the threshold.
            threshold = self.l1_regularization * self.learning_rate / decay
            self.W *= (np.abs(self.W) > threshold)

        self.n_updates += 1
Example #7
0
   def update_learner(self,example):
      """
      Performs one gradient step on the sigmoid reconstruction of the input.

      ``example`` is written into ``self.input`` at the positions given by
      ``self.input_order``. ``self.W`` and ``self.b`` are updated in place,
      the entries of ``self.W`` listed in ``self.utri_index`` are reset to
      0 and ``self.n_updates`` is incremented.
      """
      self.input[self.input_order] = example

      # fprop: sigmoid reconstruction of the input from itself
      mllin.product_matrix_vector(self.W,self.input,self.recact)
      self.recact += self.b
      mlnonlin.sigmoid(self.recact,self.rec)

      # bprop: the pre-activation gradient is (reconstruction - input)
      np.subtract(self.rec,self.input,out=self.drec)
      mllin.outer(self.drec,self.input,self.dW)
      self.db[:] = self.drec

      # Scale by the decayed learning rate and apply
      decay = 1.+self.decrease_constant*self.n_updates
      self.dW *= self.learning_rate/decay
      self.db *= self.learning_rate/decay
      self.W -= self.dW
      self.b -= self.db

      # Setting back upper diagonal to 0
      self.W.ravel()[self.utri_index] = 0
      self.n_updates += 1
    def update_learner(self, example):
        """
        Run one stochastic gradient descent step on a single example.

        ``example[0]`` is the input copied into the first layer and
        ``example[1]`` indexes the target entry of the softmax output.
        Updates ``self.Ws``, ``self.cs``, ``self.U`` and ``self.d`` in
        place and increments ``self.n_updates``.
        """
        depth = self.n_hidden_layers
        # Decayed learning rate; applied once to the output gradient so that
        # everything back-propagated from it is already scaled.
        rate = self.learning_rate / (
            1. + self.decrease_constant * self.n_updates)

        # fprop
        self.layers[0][:] = example[0]
        for idx in range(depth):
            above = idx + 1
            mllin.product_matrix_vector(self.Ws[idx], self.layers[idx],
                                        self.layer_acts[above])
            self.layer_acts[above] += self.cs[idx]
            mlnonlin.sigmoid(self.layer_acts[above], self.layers[above])

        last_hidden = self.layers[-1]
        mllin.product_matrix_vector(self.U, last_hidden, self.output_act)
        self.output_act += self.d
        mlnonlin.softmax(self.output_act, self.output)

        # bprop: output gradient is (softmax output - one-hot target)
        self.doutput_act[:] = self.output
        self.doutput_act[example[1]] -= 1
        self.doutput_act *= rate

        self.dd[:] = self.doutput_act
        mllin.outer(self.doutput_act, last_hidden, self.dU)
        mllin.product_matrix_vector(self.U.T, self.doutput_act,
                                    self.dlayers[-1])
        mlnonlin.dsigmoid(last_hidden, self.dlayers[-1], self.dlayer_acts[-1])
        for idx in reversed(range(depth)):
            upstream = self.dlayer_acts[idx + 1]
            self.dcs[idx][:] = upstream
            mllin.outer(upstream, self.layers[idx], self.dWs[idx])
            mllin.product_matrix_vector(self.Ws[idx].T, upstream,
                                        self.dlayers[idx])
            mlnonlin.dsigmoid(self.layers[idx], self.dlayers[idx],
                              self.dlayer_acts[idx])

        # Apply the (already scaled) gradients
        self.U -= self.dU
        self.d -= self.dd
        for idx in reversed(range(depth)):
            self.Ws[idx] -= self.dWs[idx]
            self.cs[idx] -= self.dcs[idx]

        self.n_updates += 1
Example #9
0
    def update_learner(self, example):
        """
        Run one gradient step on the sigmoid reconstruction of the input.

        ``example`` is scattered into ``self.input`` through
        ``self.input_order``. ``self.W`` and ``self.b`` are modified in
        place, the weights indexed by ``self.utri_index`` are forced back
        to 0, and ``self.n_updates`` is incremented.
        """
        self.input[self.input_order] = example

        # Forward: reconstruction = sigmoid(W . input + b)
        mllin.product_matrix_vector(self.W, self.input, self.recact)
        self.recact += self.b
        mlnonlin.sigmoid(self.recact, self.rec)

        # Backward: pre-activation gradient is (reconstruction - input)
        np.subtract(self.rec, self.input, out=self.drec)
        mllin.outer(self.drec, self.input, self.dW)
        self.db[:] = self.drec

        # Decayed learning rate, applied to both gradients
        denominator = 1. + self.decrease_constant * self.n_updates
        self.dW *= self.learning_rate / denominator
        self.db *= self.learning_rate / denominator

        self.W -= self.dW
        self.b -= self.db

        # Setting back upper diagonal to 0
        self.W.ravel()[self.utri_index] = 0
        self.n_updates += 1
Example #10
0
   def update_learner(self,example):
      """
      Performs one contrastive divergence (CD-1) update from ``example``.

      ``self.W``, ``self.b`` and ``self.c`` are updated in place. If
      ``self.l1_regularization`` is positive, weights whose magnitude is
      not above the scaled threshold are set to 0 after the update.
      ``self.n_updates`` is incremented.
      """
      self.input[:] = example

      # Positive phase: hidden probabilities and a binary sample of them
      mllin.product_matrix_vector(self.W,self.input,self.hidden_act)
      self.hidden_act += self.c
      mlnonlin.sigmoid(self.hidden_act,self.hidden_prob)
      hidden_noise = self.rng.rand(self.hidden_size)
      np.less(hidden_noise,self.hidden_prob,out=self.hidden)

      # Negative phase: sampled reconstruction of the input
      mllin.product_matrix_vector(self.W.T,self.hidden,self.neg_input_act)
      self.neg_input_act += self.b
      mlnonlin.sigmoid(self.neg_input_act,self.neg_input_prob)
      input_noise = self.rng.rand(self.input_size)
      np.less(input_noise,self.neg_input_prob,out=self.neg_input)

      # Hidden probabilities given the reconstruction
      mllin.product_matrix_vector(self.W,self.neg_input,self.neg_hidden_act)
      self.neg_hidden_act += self.c
      mlnonlin.sigmoid(self.neg_hidden_act,self.neg_hidden_prob)

      # Positive minus negative statistics
      np.subtract(self.input,self.neg_input,out=self.deltab)
      np.subtract(self.hidden_prob,self.neg_hidden_prob,out=self.deltac)
      mllin.outer(self.hidden_prob,self.input,self.deltaW)
      mllin.outer(self.neg_hidden_prob,self.neg_input,self.neg_stats)
      self.deltaW -= self.neg_stats

      # Scale by the decayed learning rate and apply
      decay = 1.+self.decrease_constant*self.n_updates
      self.deltaW *= self.learning_rate/decay
      self.deltab *= self.learning_rate/decay
      self.deltac *= self.learning_rate/decay

      self.W += self.deltaW
      self.b += self.deltab
      self.c += self.deltac

      if self.l1_regularization > 0:
         # The boolean mask zeroes weights not above the threshold
         threshold = self.l1_regularization * self.learning_rate/decay
         self.W *= (np.abs(self.W) > threshold)

      self.n_updates += 1
Example #11
0
    def cond_probs(self, y_set, gamma_set):
        """
        Given the set of gamma variables, outputs the set of
        probabilities p(y_t | y_{t-1}, ... , y_1, gamma_{t-1}, ... , gamma_1)

        ``y_set`` and ``gamma_set`` are iterated in lockstep, one
        (sequence, gammas) pair at a time.

        Returns the tuple ``(cond_probs, map_probs, laplace_probs, y_pred)``.
        Each element is a list with one array per sequence: the first three
        have one entry per time step, ``y_pred`` has one predicted
        observation per time step. The values come from
        ``self.multivariate_norm_log_pdf``, so they are presumably
        log-probabilities -- TODO confirm.

        Side effect: ``gamma_set`` is modified. ``gamma_t[0]`` is overwritten
        for every sequence, and ``gamma_t[n + 1]`` is handed to
        ``self.compute_gamma`` (presumably as an output buffer -- TODO
        confirm).
        """

        # Note (HUGO): this function should probably be implemented in C
        #              to make it much faster, since it requires for loops.

        # Setting variables with friendlier name
        d_y = self.input_size
        d_z = self.latent_size
        A = self.A
        C = self.C
        Sigma = self.Sigma
        E = self.E

        cond_probs = []
        map_probs = []
        laplace_probs = []
        y_pred = []

        # Temporary variable, to avoid memory allocation
        vec_d_z = zeros(d_z)
        vec_d_z2 = zeros(d_z)
        vec_d_y = zeros(d_y)
        mat_d_z_d_z = zeros((d_z, d_z))
        mat_d_z_d_z2 = zeros((d_z, d_z))
        eye_d_z = eye(d_z)
        mat_times_C_trans = zeros((d_z, d_y))
        pred = zeros(d_y)
        cov_pred = zeros((d_y, d_y))
        A_gamma = zeros((d_z, d_z))
        E_gamma = zeros((d_z, d_z))
        K = zeros((d_z, d_y))
        KC = zeros((d_z, d_z))
        J = zeros((d_z, d_z))
        A_times_prev_mu = zeros(d_z)
        Af_d_y_d_y = zeros((d_y, d_y), order='fortran')  # Temporary variables
        Bf_d_y_d_z = zeros((d_y, d_z), order='fortran')  # for calls to
        Af_d_z_d_z = zeros((d_z, d_z),
                           order='fortran')  # math.linalg.solve(...)
        Bf_d_z_d_z = zeros((d_z, d_z), order='fortran')
        pivots_d_y = zeros((d_y), dtype='i', order='fortran')
        pivots_d_z = zeros((d_z), dtype='i', order='fortran')
        z_n_z_n_post = zeros((d_z, d_z))
        next_z_n_z_n_post = zeros((d_z, d_z))

        for y_t, gamma_t in zip(y_set, gamma_set):
            T = len(y_t)
            cond_probs_t = zeros(T)
            map_probs_t = zeros(T)
            laplace_probs_t = zeros(T)
            y_pred_t = zeros((T, d_y))
            mu_kalman_t = zeros((T, d_z))  # Filtering mus
            E_kalman_t = zeros((T, d_z, d_z))  # Filtering Es
            mu_post_t = zeros((T, d_z))
            E_post_t = zeros((T, d_z, d_z))
            P_t = zeros((T - 1, d_z, d_z))

            # Forward pass

            # Initialization at n = 0
            A_times_prev_mu[:] = 0
            multiply(C.T, reshape(gamma_t[0], (-1, 1)), mat_times_C_trans)
            pred[:] = 0
            product_matrix_matrix(C, mat_times_C_trans, cov_pred)
            cov_pred += Sigma
            solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y, Bf_d_y_d_z,
                  pivots_d_y)

            vec_d_y[:] = y_t[0]
            vec_d_y -= pred
            product_matrix_vector(K, vec_d_y, mu_kalman_t[0])

            product_matrix_matrix(K, C, KC)
            mat_d_z_d_z[:] = eye_d_z
            mat_d_z_d_z -= KC
            multiply(mat_d_z_d_z, gamma_t[0], E_kalman_t[0])

            cond_probs_t[0] = self.multivariate_norm_log_pdf(
                y_t[0], pred, cov_pred)
            y_pred_t[0] = pred
            # from n=1 to T-1
            for n in xrange(T - 1):
                divide(1., E, vec_d_z)
                divide(1., gamma_t[n + 1], vec_d_z2)
                vec_d_z += vec_d_z2
                divide(1., vec_d_z, vec_d_z2)
                setdiag(E_gamma, vec_d_z2)
                divide(E, gamma_t[n + 1], vec_d_z)
                vec_d_z += 1
                divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)

                P_tn = P_t[n]
                product_matrix_matrix(E_kalman_t[n], A_gamma.T, mat_d_z_d_z)
                product_matrix_matrix(A_gamma, mat_d_z_d_z, P_tn)
                P_tn += E_gamma
                product_matrix_vector(A_gamma, mu_kalman_t[n], A_times_prev_mu)
                product_matrix_matrix(P_tn, C.T, mat_times_C_trans)
                product_matrix_vector(C, A_times_prev_mu, pred)
                product_matrix_matrix(C, mat_times_C_trans, cov_pred)
                cov_pred += Sigma
                solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y,
                      Bf_d_y_d_z, pivots_d_y)
                vec_d_y[:] = y_t[n + 1]
                vec_d_y -= pred
                product_matrix_vector(K, vec_d_y, mu_kalman_t[n + 1])
                mu_kalman_t[n + 1] += A_times_prev_mu

                product_matrix_matrix(K, C, KC)
                mat_d_z_d_z[:] = eye_d_z
                mat_d_z_d_z -= KC
                product_matrix_matrix(mat_d_z_d_z, P_tn, mat_d_z_d_z2)
                # To ensure symmetry
                E_kalman_t[n + 1] = mat_d_z_d_z2
                E_kalman_t[n + 1] += mat_d_z_d_z2.T
                E_kalman_t[n + 1] /= 2

                cond_probs_t[n + 1] = self.multivariate_norm_log_pdf(
                    y_t[n + 1], pred, cov_pred)
                y_pred_t[n + 1] = pred

            # The posterior at the last step is the last filtered estimate.
            # This has to run once per sequence, after the forward loop:
            # for a length-1 sequence the loop body never executes, and the
            # code below would otherwise read statistics left over from the
            # previous sequence.
            mu_post_t[-1] = mu_kalman_t[-1]
            E_post_t[-1] = E_kalman_t[-1]

            # Compute last step statistics
            outer(mu_post_t[-1], mu_post_t[-1], z_n_z_n_post)
            z_n_z_n_post += E_post_t[-1]

            # Backward pass
            pred[:] = 0
            cov_pred[:] = 0
            for n in xrange(T - 2, -1, -1):
                # Keep the statistics of step n + 1 before overwriting them
                next_z_n_z_n_post[:] = z_n_z_n_post
                divide(E, gamma_t[n + 1], vec_d_z)
                vec_d_z += 1
                divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)

                P_tn = P_t[n]
                solve(P_tn.T, A_gamma, mat_d_z_d_z, Af_d_z_d_z, Bf_d_z_d_z,
                      pivots_d_z)
                product_matrix_matrix(E_kalman_t[n], mat_d_z_d_z.T, J)
                product_matrix_vector(A_gamma, mu_kalman_t[n], vec_d_z)

                vec_d_z *= -1
                vec_d_z += mu_post_t[n + 1]
                product_matrix_vector(J, vec_d_z, mu_post_t[n])
                mu_post_t[n] += mu_kalman_t[n]

                mat_d_z_d_z[:] = E_post_t[n + 1]
                mat_d_z_d_z -= P_tn
                product_matrix_matrix(mat_d_z_d_z, J.T, mat_d_z_d_z2)
                product_matrix_matrix(J, mat_d_z_d_z2, mat_d_z_d_z)
                # To ensure symmetry
                E_post_t[n] = E_kalman_t[n]
                E_post_t[n] += mat_d_z_d_z
                E_post_t[n] += E_kalman_t[n].T
                E_post_t[n] += mat_d_z_d_z.T
                E_post_t[n] /= 2

                outer(mu_post_t[n], mu_post_t[n], z_n_z_n_post)
                z_n_z_n_post += E_post_t[n]

                # The return value is discarded; presumably compute_gamma
                # writes its result into gamma_t[n + 1] -- TODO confirm.
                self.compute_gamma(A, E, z_n_z_n_post, next_z_n_z_n_post,
                                   gamma_t[n + 1])
                log_prior_gamma = self.log_prior_gamma(gamma_t[n + 1])
                log_prior_log_gamma = self.log_prior_log_gamma(gamma_t[n + 1])
                log_det_diff2_log_gamma = self.log_det_diff2_log_gamma(
                    A, E, z_n_z_n_post, next_z_n_z_n_post, gamma_t[n + 1])
                map_probs_t[n + 1] = cond_probs_t[n + 1] + log_prior_gamma
                laplace_probs_t[n + 1] = (
                    cond_probs_t[n + 1] + log_prior_log_gamma +
                    d_z * log(2 * pi) / 2 - 0.5 * log_det_diff2_log_gamma)

            # First-step gamma is set from the diagonal of the first-step
            # statistics and the gamma prior parameters.
            gamma_t[0] = (diag(z_n_z_n_post) + 2 * self.gamma_prior_beta) / (
                2 * self.gamma_prior_alpha + 3)
            log_prior_gamma = self.log_prior_gamma(gamma_t[0])
            log_prior_log_gamma = self.log_prior_log_gamma(gamma_t[0])
            # NOTE(review): this uses the full matrix z_n_z_n_post (not its
            # diagonal) and a plain sum, unlike the other time steps, which
            # call self.log_det_diff2_log_gamma -- confirm this is intended.
            log_det_diff2_log_gamma = sum(
                (z_n_z_n_post / 2 + self.gamma_prior_beta) / gamma_t[0])
            map_probs_t[0] = cond_probs_t[0] + log_prior_gamma
            laplace_probs_t[0] = (
                cond_probs_t[0] + log_prior_log_gamma +
                d_z * log(2 * pi) / 2 - 0.5 * log_det_diff2_log_gamma)

            cond_probs += [cond_probs_t]
            map_probs += [map_probs_t]
            laplace_probs += [laplace_probs_t]
            y_pred += [y_pred_t]

        return cond_probs, map_probs, laplace_probs, y_pred
Example #12
0
    def EM_step(self, y_set, gamma_set, training=False, return_mu_post=False):
        """
        Computes the posterior statistics and outputs the M step
        estimates of the parameters.
        Also outputs the non-parametric, sparsity inducing variances gamma_t.
        Optionally, can output the posterior means of the latent state variables.
        """
        # Setting variables with friendlier name
        d_y = self.input_size
        d_z = self.latent_size
        #V_zero = self.V_zero
        A = self.A
        C = self.C
        Sigma = self.Sigma
        E = self.E

        # Variables for estimating new parameters
        A_new = zeros((d_z, d_z))
        C_new = zeros((d_y, d_z))

        z_n_z_n_1_post_sum = zeros((d_z, d_z))
        z_n_z_n_post_sum = zeros((d_z, d_z))
        A_new_denums = zeros((d_z, d_z, d_z))
        y_n_z_n_post_sum = zeros((d_y, d_z))

        # Temporary variable, to avoid memory allocation
        vec_d_z = zeros(d_z)
        vec_d_z2 = zeros(d_z)
        vec_d_y = zeros(d_y)
        mat_d_z_d_z = zeros((d_z, d_z))
        mat_d_z_d_z2 = zeros((d_z, d_z))
        eye_d_z = eye(d_z)
        mat_times_C_trans = zeros((d_z, d_y))
        pred = zeros(d_y)
        cov_pred = zeros((d_y, d_y))
        A_gamma = zeros((d_z, d_z))
        E_gamma = zeros((d_z, d_z))
        K = zeros((d_z, d_y))
        KC = zeros((d_z, d_z))
        J = zeros((d_z, d_z))
        A_times_prev_mu = zeros(d_z)
        Af_d_y_d_y = zeros((d_y, d_y), order='fortran')  # Temporary variables
        Bf_d_y_d_z = zeros((d_y, d_z), order='fortran')  # for calls to
        Af_d_z_d_z = zeros((d_z, d_z),
                           order='fortran')  # math.linalg.solve(...)
        Bf_d_z_d_z = zeros((d_z, d_z), order='fortran')
        pivots_d_y = zeros((d_y), dtype='i', order='fortran')
        pivots_d_z = zeros((d_z), dtype='i', order='fortran')
        z_n_z_n_1_post = zeros((d_z, d_z))
        z_n_z_n_post = zeros((d_z, d_z))
        weighted_z_n_z_n_post = zeros((d_z, d_z, d_z))
        next_z_n_z_n_post = zeros((d_z, d_z))
        y_n_z_n_post = zeros((d_y, d_z))

        if training == True:
            max_Esteps = self.max_Esteps
            last_Esteps = self.last_Esteps
        else:
            max_Esteps = self.max_test_Esteps
            last_Esteps = self.max_test_Esteps

        Esteps = 0
        have_A_denum = False
        get_A_denum = False
        finished = False
        while not finished:
            T_sum = 0
            gamma_mean_diff = 0
            z_n_z_n_1_post_sum[:] = 0
            z_n_z_n_post_sum[:] = 0
            y_n_z_n_post_sum[:] = 0
            A_new_denums[:] = 0

            Esteps += 1
            if Esteps == max_Esteps:
                get_A_denum = True
                finished = True
            elif Esteps >= last_Esteps:
                get_A_denum = True

            if return_mu_post:
                mu_post = []

            for y_t, gamma_t in zip(y_set, gamma_set):
                T = len(y_t)
                T_sum += T
                mu_kalman_t = zeros((T, d_z))  # Filtering mus
                E_kalman_t = zeros((T, d_z, d_z))  # Filtering Es
                mu_post_t = zeros((T, d_z))
                E_post_t = zeros((T, d_z, d_z))
                P_t = zeros((T - 1, d_z, d_z))

                # Forward pass

                # Initialization at n = 0
                A_times_prev_mu[:] = 0
                multiply(C.T, reshape(gamma_t[0], (-1, 1)), mat_times_C_trans)
                pred[:] = 0

                product_matrix_matrix(C, mat_times_C_trans, cov_pred)
                cov_pred += Sigma
                solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y,
                      Bf_d_y_d_z, pivots_d_y)

                vec_d_y[:] = y_t[0]
                vec_d_y -= pred
                product_matrix_vector(K, vec_d_y, mu_kalman_t[0])

                product_matrix_matrix(K, C, KC)
                mat_d_z_d_z[:] = eye_d_z
                mat_d_z_d_z -= KC
                multiply(mat_d_z_d_z, gamma_t[0], E_kalman_t[0])

                # from n=1 to T-1
                for n in xrange(T - 1):
                    divide(1., E, vec_d_z)
                    divide(1., gamma_t[n + 1], vec_d_z2)
                    vec_d_z += vec_d_z2
                    divide(1., vec_d_z, vec_d_z2)
                    setdiag(E_gamma, vec_d_z2)
                    divide(E, gamma_t[n + 1], vec_d_z)
                    vec_d_z += 1
                    divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)
                    P_tn = P_t[n]
                    product_matrix_matrix(E_kalman_t[n], A_gamma.T,
                                          mat_d_z_d_z)

                    product_matrix_matrix(A_gamma, mat_d_z_d_z, P_tn)
                    P_tn += E_gamma
                    product_matrix_vector(A_gamma, mu_kalman_t[n],
                                          A_times_prev_mu)
                    product_matrix_matrix(P_tn, C.T, mat_times_C_trans)
                    product_matrix_vector(C, A_times_prev_mu, pred)
                    product_matrix_matrix(C, mat_times_C_trans, cov_pred)
                    cov_pred += Sigma
                    solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y,
                          Bf_d_y_d_z, pivots_d_y)
                    vec_d_y[:] = y_t[n + 1]
                    vec_d_y -= pred
                    product_matrix_vector(K, vec_d_y, mu_kalman_t[n + 1])
                    mu_kalman_t[n + 1] += A_times_prev_mu

                    product_matrix_matrix(K, C, KC)
                    mat_d_z_d_z[:] = eye_d_z
                    mat_d_z_d_z -= KC
                    product_matrix_matrix(mat_d_z_d_z, P_tn, mat_d_z_d_z2)
                    # To ensure symmetry
                    E_kalman_t[n + 1] = mat_d_z_d_z2
                    E_kalman_t[n + 1] += mat_d_z_d_z2.T
                    E_kalman_t[n + 1] /= 2

                mu_post_t[-1] = mu_kalman_t[-1]
                E_post_t[-1] = E_kalman_t[-1]

                # Compute last step statistics
                outer(mu_post_t[-1], mu_post_t[-1], z_n_z_n_post)
                z_n_z_n_post += E_post_t[-1]
                outer(y_t[-1], mu_post_t[-1], y_n_z_n_post)
                # Update cumulative statistics
                z_n_z_n_post_sum += z_n_z_n_post
                y_n_z_n_post_sum += y_n_z_n_post

                # Backward pass
                pred[:] = 0
                cov_pred[:] = 0
                for n in xrange(T - 2, -1, -1):
                    next_z_n_z_n_post[:] = z_n_z_n_post
                    divide(E, gamma_t[n + 1], vec_d_z)
                    vec_d_z += 1
                    divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)

                    P_tn = P_t[n]
                    solve(P_tn.T, A_gamma, mat_d_z_d_z, Af_d_z_d_z, Bf_d_z_d_z,
                          pivots_d_z)
                    product_matrix_matrix(E_kalman_t[n], mat_d_z_d_z.T, J)
                    product_matrix_vector(A_gamma, mu_kalman_t[n], vec_d_z)

                    vec_d_z *= -1
                    vec_d_z += mu_post_t[n + 1]
                    product_matrix_vector(J, vec_d_z, mu_post_t[n])
                    mu_post_t[n] += mu_kalman_t[n]

                    mat_d_z_d_z[:] = E_post_t[n + 1]
                    mat_d_z_d_z -= P_tn
                    product_matrix_matrix(mat_d_z_d_z, J.T, mat_d_z_d_z2)
                    product_matrix_matrix(J, mat_d_z_d_z2, mat_d_z_d_z)
                    # To ensure symmetry
                    E_post_t[n] = E_kalman_t[n]
                    E_post_t[n] += mat_d_z_d_z
                    E_post_t[n] += E_kalman_t[n].T
                    E_post_t[n] += mat_d_z_d_z.T
                    E_post_t[n] /= 2

                    # Compute posterior statistics
                    product_matrix_matrix(J, E_post_t[n + 1], z_n_z_n_1_post)
                    outer(mu_post_t[n + 1], mu_post_t[n], mat_d_z_d_z)
                    z_n_z_n_1_post += mat_d_z_d_z

                    outer(mu_post_t[n], mu_post_t[n], z_n_z_n_post)
                    z_n_z_n_post += E_post_t[n]

                    outer(y_t[n], mu_post_t[n], y_n_z_n_post)

                    # Update cumulative statistics
                    z_n_z_n_1_post_sum += z_n_z_n_1_post
                    z_n_z_n_post_sum += z_n_z_n_post
                    y_n_z_n_post_sum += y_n_z_n_post

                    gamma_mean_diff += self.compute_gamma(
                        A, E, z_n_z_n_post, next_z_n_z_n_post, gamma_t[n + 1])
                    #print gamma_t[n+1]
                    if get_A_denum == True:
                        # Compute the denominator of the A update,
                        # which requires d_z matrices of size (d_z,d_z)
                        # (i.e. d_z different weighted sums of the z_n_z_n_post matrices)
                        add(gamma_t[n + 1], E, vec_d_z)
                        divide(gamma_t[n + 1], vec_d_z, vec_d_z2)
                        multiply(reshape(z_n_z_n_post, (1, d_z, d_z)),
                                 reshape(vec_d_z2, (d_z, 1, 1)),
                                 weighted_z_n_z_n_post)
                        A_new_denums += weighted_z_n_z_n_post
                        have_A_denum = True

                new_gamma = (diag(z_n_z_n_post) + 2 * self.gamma_prior_beta
                             ) / (2 * self.gamma_prior_alpha + 3)
                gamma_mean_diff += sum((gamma_t[0] - new_gamma)**2) / d_z

                gamma_t[0] = new_gamma

            gamma_mean_diff /= T_sum
            if gamma_mean_diff < self.gamma_change_tolerance:
                if training == True:
                    if have_A_denum == True:
                        finished = True
                        self.last_Esteps = Esteps
                    else:
                        get_A_denum = True
                else:
                    finished = True
            elif gamma_mean_diff <= 10 * self.gamma_change_tolerance and training == True:
                get_A_denum = True
            if self.verbose:
                print gamma_mean_diff, max_Esteps, Esteps
            if return_mu_post:
                mu_post += [mu_post_t]

        # Compute the M step estimates of the parameters
        if training == True:
            for i in xrange(d_z):
                solve(
                    A_new_denums[i] +
                    eye(d_z) * self.latent_transition_matrix_regularizer,
                    z_n_z_n_1_post_sum[i:(i + 1)].T, A_new[i:(i + 1)].T)

            solve(
                z_n_z_n_post_sum + eye_d_z * self.emission_matrix_regularizer,
                y_n_z_n_post_sum.T, C_new.T)

        if return_mu_post:
            return (A_new, C_new), gamma_set, mu_post
        else:
            return (A_new, C_new), gamma_set
Example #13
0
    def EM_step(self, y_set, return_mu_post=False):
        """
        Computes the posterior statistics and outputs the M step
        estimates of the parameters.
        The set of probabilities p(y_t | y_{t-1}, ... , y_1) are also given.

        For every sequence in ``y_set`` a forward (filtering) pass and a
        backward (smoothing) pass are run with the current parameters
        (``self.A``, ``self.C``, ``self.E``, ``self.Sigma``,
        ``self.mu_zero``, ``self.V_zero``). Posterior statistics are
        accumulated over all sequences and then used to compute the new
        parameter estimates, which are returned (not assigned to ``self``
        here).

        Parameters:
          y_set: collection of sequences. ``len(y_set)`` is taken, each
              sequence ``y_t`` supports ``len`` and integer indexing, and
              ``y_t[n]`` is copied into a vector of size
              ``self.input_size``.
          return_mu_post: if True, the per-sequence posterior means are
              returned as a third output.

        Returns:
          ``(A_new, C_new, E_new, Sigma_new, mu_zero_new, V_zero_new),
          cond_probs`` and, only if ``return_mu_post`` is True,
          ``mu_post``. ``cond_probs`` is a list with one length-T array
          per sequence holding the values returned by
          ``self.multivariate_norm_log_pdf`` for each observation under
          its one-step-ahead prediction (presumably log-densities, given
          the helper's name). ``mu_post`` is a list with one (T, d_z)
          array per sequence.

        NOTE(review): product_matrix_matrix, product_matrix_vector, outer
        and solve are defined outside this block. The comments below
        assume each of them writes its result into its last array
        argument, and that ``solve(M, B, X, ...)`` solves ``M X = B``
        (the extra Af_*/Bf_*/pivots_* arguments being scratch space) --
        this matches the commented-out ``dot(..., inv(...))`` equivalents
        in the M step, but confirm against the helpers' definitions.
        """

        # Setting variables with friendlier name
        d_y = self.input_size
        d_z = self.latent_size
        mu_zero = self.mu_zero
        V_zero = self.V_zero
        A = self.A
        C = self.C
        Sigma = self.Sigma
        E = self.E

        # Variables for estimating new parameters
        A_new = zeros((d_z, d_z))
        C_new = zeros((d_y, d_z))
        E_new = zeros((d_z, d_z))
        Sigma_new = zeros((d_y, d_y))
        mu_zero_new = zeros((d_z))
        V_zero_new = zeros((d_z, d_z))

        # Statistics accumulated over all sequences and time steps.
        # The "_no_last" / "_no_first" / "_first" variants restrict the sum
        # to all steps but the last, all steps but the first, and only the
        # first step of each sequence, respectively.
        z_n_z_n_1_post_sum = zeros((d_z, d_z))
        z_n_z_n_post_sum = zeros((d_z, d_z))
        z_n_z_n_post_sum_no_last = zeros((d_z, d_z))
        z_n_z_n_post_sum_no_first = zeros((d_z, d_z))
        z_n_z_n_post_sum_first = zeros((d_z, d_z))
        outer_z_n_z_n_post_sum_first = zeros((d_z, d_z))
        z_n_post_sum_first = zeros((d_z))
        y_n_z_n_post_sum = zeros((d_y, d_z))
        y_n_y_n_sum = zeros((d_y, d_y))

        cond_probs = []

        # Temporary variable, to avoid memory allocation
        vec_d_z = zeros(d_z)
        vec_d_y = zeros(d_y)
        mat_d_z_d_z = zeros((d_z, d_z))
        mat_d_z_d_z2 = zeros((d_z, d_z))
        eye_d_z = eye(d_z)
        mat_times_C_trans = zeros((d_z, d_y))
        pred = zeros(d_y)
        cov_pred = zeros((d_y, d_y))
        K = zeros((d_z, d_y))
        KC = zeros((d_z, d_z))
        J = zeros((d_z, d_z))
        A_times_prev_mu = zeros(d_z)
        Af_d_y_d_y = zeros((d_y, d_y), order='fortran')  # Temporary variables
        Bf_d_y_d_z = zeros((d_y, d_z), order='fortran')  # for calls to
        Af_d_z_d_z = zeros((d_z, d_z),
                           order='fortran')  # math.linalg.solve(...)
        Bf_d_z_d_z = zeros((d_z, d_z), order='fortran')
        pivots_d_y = zeros((d_y), dtype='i', order='fortran')
        pivots_d_z = zeros((d_z), dtype='i', order='fortran')
        z_n_z_n_1_post = zeros((d_z, d_z))
        z_n_z_n_post = zeros((d_z, d_z))
        y_n_z_n_post = zeros((d_y, d_z))
        y_n_y_n = zeros((d_y, d_y))
        # Total number of time steps over all sequences
        T_sum = 0

        if return_mu_post:
            mu_post = []

        for y_t in y_set:
            T = len(y_t)
            T_sum += T
            mu_kalman_t = zeros((T, d_z))  # Filtering mus
            E_kalman_t = zeros((T, d_z, d_z))  # Filtering Es
            mu_post_t = zeros(
                (T, d_z))  # Posterior mus (could be removed and computed once)
            E_post_t = zeros(
                (T, d_z,
                 d_z))  # Posterior Es  (could be removed and computed once)
            # P_t[n] is filled in the forward pass and read back in the
            # backward pass; there is one entry per transition n -> n+1.
            P_t = zeros((T - 1, d_z, d_z))
            cond_probs_t = zeros(T)

            # Forward pass

            # Initialization at n = 0
            A_times_prev_mu[:] = 0
            # Prediction of y_0 from the prior (mu_zero, V_zero):
            # pred <- C mu_zero, cov_pred <- C V_zero C^T + Sigma
            product_matrix_matrix(V_zero, C.T, mat_times_C_trans)
            product_matrix_vector(C, mu_zero, pred)
            product_matrix_matrix(C, mat_times_C_trans, cov_pred)
            cov_pred += Sigma
            # Gain K, obtained through its transpose:
            # cov_pred K^T = (V_zero C^T)^T
            solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y, Bf_d_y_d_z,
                  pivots_d_y)

            # Filtered mean: mu_zero + K (y_0 - pred)
            vec_d_y[:] = y_t[0]
            vec_d_y -= pred
            product_matrix_vector(K, vec_d_y, mu_kalman_t[0])
            mu_kalman_t[0] += mu_zero

            # Filtered covariance: (I - K C) V_zero
            product_matrix_matrix(K, C, KC)
            mat_d_z_d_z[:] = eye_d_z
            mat_d_z_d_z -= KC
            product_matrix_matrix(mat_d_z_d_z, V_zero, E_kalman_t[0])
            cond_probs_t[0] = self.multivariate_norm_log_pdf(
                y_t[0], pred, cov_pred)
            # from n=1 to T-1
            for n in xrange(T - 1):
                # P_tn <- A E_kalman_t[n] A^T + E (written into P_t[n])
                P_tn = P_t[n]
                product_matrix_matrix(E_kalman_t[n], A.T, mat_d_z_d_z)
                product_matrix_matrix(A, mat_d_z_d_z, P_tn)
                P_tn += E
                # Prediction of y_{n+1}:
                # pred <- C A mu_kalman_t[n], cov_pred <- C P_tn C^T + Sigma
                product_matrix_vector(A, mu_kalman_t[n], A_times_prev_mu)
                product_matrix_matrix(P_tn, C.T, mat_times_C_trans)
                product_matrix_vector(C, A_times_prev_mu, pred)
                product_matrix_matrix(C, mat_times_C_trans, cov_pred)
                cov_pred += Sigma
                solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y,
                      Bf_d_y_d_z, pivots_d_y)
                # Filtered mean: A mu_kalman_t[n] + K (y_{n+1} - pred)
                vec_d_y[:] = y_t[n + 1]
                vec_d_y -= pred
                product_matrix_vector(K, vec_d_y, mu_kalman_t[n + 1])
                mu_kalman_t[n + 1] += A_times_prev_mu

                # Filtered covariance: (I - K C) P_tn, then symmetrized
                product_matrix_matrix(K, C, KC)
                mat_d_z_d_z[:] = eye_d_z
                mat_d_z_d_z -= KC
                product_matrix_matrix(mat_d_z_d_z, P_tn, mat_d_z_d_z2)
                # To ensure symmetry
                E_kalman_t[n + 1] = mat_d_z_d_z2
                E_kalman_t[n + 1] += mat_d_z_d_z2.T
                E_kalman_t[n + 1] /= 2
                cond_probs_t[n + 1] = self.multivariate_norm_log_pdf(
                    y_t[n + 1], pred, cov_pred)

            # At the last time step the posterior equals the filtering
            # estimate (all observations have been seen).
            mu_post_t[-1] = mu_kalman_t[-1]
            E_post_t[-1] = E_kalman_t[-1]

            # Compute last step statistics
            outer(mu_post_t[-1], mu_post_t[-1], z_n_z_n_post)
            z_n_z_n_post += E_post_t[-1]
            outer(y_t[-1], mu_post_t[-1], y_n_z_n_post)
            outer(y_t[-1], y_t[-1], y_n_y_n)
            # Update cumulative statistics
            # (the last step is excluded from the "_no_last" sum)
            z_n_z_n_post_sum += z_n_z_n_post
            z_n_z_n_post_sum_no_first += z_n_z_n_post
            y_n_z_n_post_sum += y_n_z_n_post
            y_n_y_n_sum += y_n_y_n

            # Backward pass
            pred[:] = 0
            cov_pred[:] = 0
            for n in xrange(T - 2, -1, -1):
                P_tn = P_t[n]
                # J <- E_kalman_t[n] A^T P_tn^{-1}, computed by solving
                # P_tn^T X = A and taking J = E_kalman_t[n] X^T
                solve(P_tn.T, A, mat_d_z_d_z, Af_d_z_d_z, Bf_d_z_d_z,
                      pivots_d_z)
                product_matrix_matrix(E_kalman_t[n], mat_d_z_d_z.T, J)
                product_matrix_vector(A, mu_kalman_t[n], vec_d_z)

                # Posterior mean:
                # mu_kalman_t[n] + J (mu_post_t[n+1] - A mu_kalman_t[n])
                vec_d_z *= -1
                vec_d_z += mu_post_t[n + 1]
                product_matrix_vector(J, vec_d_z, mu_post_t[n])
                mu_post_t[n] += mu_kalman_t[n]

                # Posterior covariance:
                # E_kalman_t[n] + J (E_post_t[n+1] - P_tn) J^T, symmetrized
                mat_d_z_d_z[:] = E_post_t[n + 1]
                mat_d_z_d_z -= P_tn
                product_matrix_matrix(mat_d_z_d_z, J.T, mat_d_z_d_z2)
                product_matrix_matrix(J, mat_d_z_d_z2, mat_d_z_d_z)
                # To ensure symmetry
                E_post_t[n] = E_kalman_t[n]
                E_post_t[n] += mat_d_z_d_z
                E_post_t[n] += E_kalman_t[n].T
                E_post_t[n] += mat_d_z_d_z.T
                E_post_t[n] /= 2

                # Compute posterior statistics
                # Cross-time statistic between steps n+1 and n:
                # J E_post_t[n+1] + outer(mu_post_t[n+1], mu_post_t[n])
                product_matrix_matrix(J, E_post_t[n + 1], z_n_z_n_1_post)
                outer(mu_post_t[n + 1], mu_post_t[n], mat_d_z_d_z)
                z_n_z_n_1_post += mat_d_z_d_z

                outer(mu_post_t[n], mu_post_t[n], z_n_z_n_post)
                z_n_z_n_post += E_post_t[n]

                outer(y_t[n], mu_post_t[n], y_n_z_n_post)
                outer(y_t[n], y_t[n], y_n_y_n)

                # Update cumulative statistics
                z_n_z_n_1_post_sum += z_n_z_n_1_post
                z_n_z_n_post_sum += z_n_z_n_post
                if n > 0:
                    z_n_z_n_post_sum_no_first += z_n_z_n_post
                else:
                    # n == 0: statistics of the first step, used for the
                    # mu_zero / V_zero estimates.
                    # NOTE(review): this branch lives inside the backward
                    # loop, which does not run when T == 1, so a length-1
                    # sequence contributes nothing to the "_first" sums
                    # even though it is counted in len(y_set) below.
                    z_n_z_n_post_sum_first += z_n_z_n_post
                    z_n_post_sum_first += mu_post_t[n]
                    outer(mu_post_t[n], mu_post_t[n], mat_d_z_d_z)
                    outer_z_n_z_n_post_sum_first += mat_d_z_d_z
                z_n_z_n_post_sum_no_last += z_n_z_n_post
                y_n_z_n_post_sum += y_n_z_n_post
                y_n_y_n_sum += y_n_y_n

            cond_probs += [cond_probs_t]

            if return_mu_post:
                mu_post += [mu_post_t]

        # Compute the M step estimates of the parameters
        # Each estimate is regularized by adding a multiple of the identity
        # (the self.*_regularizer attributes) before solving / normalizing.
        #A_new = dot(z_n_z_n_1_post_sum,inv(z_n_z_n_post_sum_no_last+
        #                               eye_d_z*self.latent_transition_matrix_regularizer))
        solve(
            z_n_z_n_post_sum_no_last +
            eye_d_z * self.latent_transition_matrix_regularizer,
            z_n_z_n_1_post_sum.T, A_new.T)
        #C_new = dot(y_n_z_n_post_sum, inv(z_n_z_n_post_sum+
        #                                  eye_d_z*self.input_transition_matrix_regularizer))
        solve(
            z_n_z_n_post_sum +
            eye_d_z * self.input_transition_matrix_regularizer,
            y_n_z_n_post_sum.T, C_new.T)

        # Latent noise covariance estimate, normalized by the number of
        # transitions (T - 1 per sequence).
        # NOTE(review): the divisor is 0 if every sequence has length 1.
        E_new[:] = z_n_z_n_post_sum_no_first
        z_n_z_n_1_A_T = dot(z_n_z_n_1_post_sum, A_new.T)
        E_new -= z_n_z_n_1_A_T.T
        E_new -= z_n_z_n_1_A_T  # There is an error in Bishop's equation: the transpose on A is missing
        E_new += dot(A_new, dot(z_n_z_n_post_sum_no_last, A_new.T))
        E_new += eye_d_z * self.latent_covariance_matrix_regularizer
        E_new /= T_sum - len(y_set)
        # Observation noise covariance estimate, normalized by the total
        # number of time steps.
        Sigma_new[:] = y_n_y_n_sum
        C_z_n_y_n = dot(C_new, y_n_z_n_post_sum.T)
        Sigma_new -= C_z_n_y_n
        Sigma_new -= C_z_n_y_n.T  # There is an error in Bishop's equation: the transpose on C is missing
        Sigma_new += dot(C_new, dot(z_n_z_n_post_sum, C_new.T))  # ... idem
        Sigma_new += eye(d_y) * self.input_covariance_matrix_regularizer
        Sigma_new /= T_sum

        # Initial state mean: average of the first-step posterior means.
        mu_zero_new[:] = z_n_post_sum_first
        mu_zero_new /= len(y_set)
        # Initial state covariance.
        # NOTE(review): the term subtracted is the sum of per-sequence
        # outer(mu_post_t[0], mu_post_t[0]), not
        # len(y_set) * outer(mu_zero_new, mu_zero_new); with several
        # sequences these differ -- confirm this is intended.
        V_zero_new[:] = z_n_z_n_post_sum_first
        V_zero_new -= outer_z_n_z_n_post_sum_first
        V_zero_new /= len(y_set)

        if return_mu_post:
            return (A_new, C_new, E_new, Sigma_new, mu_zero_new,
                    V_zero_new), cond_probs, mu_post
        else:
            return (A_new, C_new, E_new, Sigma_new, mu_zero_new,
                    V_zero_new), cond_probs
Example #14
0
    def EM_step(self, y_set, gamma_set, training=False, return_mu_post=False):
        """
        Computes the posterior statistics and outputs the M step
        estimates of the parameters.
        Also outputs the non-parametric, sparsity inducing variances gamma_t.
        Optionally, can output the posterior means of the latent state variables.

        E steps (forward filtering pass, backward smoothing pass and gamma
        update, over every sequence) are repeated until the mean change of
        the gammas falls below ``self.gamma_change_tolerance`` or the
        maximum number of E steps is reached. When training, the last
        E step also accumulates the denominators of the A update.

        Parameters:
          y_set: sequences of observations; iterated in lockstep with
              ``gamma_set``.
          gamma_set: per-sequence gamma arrays, indexed by time step.
              They are updated in place (``gamma_t[0]`` is assigned here;
              ``gamma_t[n+1]`` is passed to ``self.compute_gamma``, which
              presumably updates it in place -- defined outside this
              block, confirm).
          training: if true, use ``self.max_Esteps`` / ``self.last_Esteps``
              and compute ``A_new`` and ``C_new``; otherwise use
              ``self.max_test_Esteps`` and leave both estimates at zero.
              When training and the loop stops on the tolerance
              criterion, ``self.last_Esteps`` is set to the number of
              E steps performed.
          return_mu_post: if true, also return the posterior means.

        Returns:
          ``(A_new, C_new), gamma_set`` and, if ``return_mu_post`` is
          true, ``mu_post``: a list with one (T, d_z) array of posterior
          means per sequence, taken from the last E step.

        NOTE(review): product_matrix_matrix, product_matrix_vector, outer,
        setdiag and solve are defined outside this block; comments assume
        each writes its result into its last array argument (first
        argument for setdiag) and that ``solve(M, B, X, ...)`` solves
        ``M X = B`` -- confirm against their definitions.
        """
        # Setting variables with friendlier name
        d_y = self.input_size
        d_z = self.latent_size
        A = self.A
        C = self.C
        Sigma = self.Sigma
        E = self.E

        # Variables for estimating new parameters
        A_new = zeros((d_z, d_z))
        C_new = zeros((d_y, d_z))

        z_n_z_n_1_post_sum = zeros((d_z, d_z))
        z_n_z_n_post_sum = zeros((d_z, d_z))
        A_new_denums = zeros((d_z, d_z, d_z))
        y_n_z_n_post_sum = zeros((d_y, d_z))

        # Temporary variable, to avoid memory allocation
        vec_d_z = zeros(d_z)
        vec_d_z2 = zeros(d_z)
        vec_d_y = zeros(d_y)
        mat_d_z_d_z = zeros((d_z, d_z))
        mat_d_z_d_z2 = zeros((d_z, d_z))
        eye_d_z = eye(d_z)
        mat_times_C_trans = zeros((d_z, d_y))
        pred = zeros(d_y)
        cov_pred = zeros((d_y, d_y))
        A_gamma = zeros((d_z, d_z))
        E_gamma = zeros((d_z, d_z))
        K = zeros((d_z, d_y))
        KC = zeros((d_z, d_z))
        J = zeros((d_z, d_z))
        A_times_prev_mu = zeros(d_z)
        Af_d_y_d_y = zeros((d_y, d_y), order='fortran')  # Temporary variables
        Bf_d_y_d_z = zeros((d_y, d_z), order='fortran')  # for calls to
        Af_d_z_d_z = zeros((d_z, d_z), order='fortran')  # math.linalg.solve(...)
        Bf_d_z_d_z = zeros((d_z, d_z), order='fortran')
        pivots_d_y = zeros((d_y), dtype='i', order='fortran')
        pivots_d_z = zeros((d_z), dtype='i', order='fortran')
        z_n_z_n_1_post = zeros((d_z, d_z))
        z_n_z_n_post = zeros((d_z, d_z))
        weighted_z_n_z_n_post = zeros((d_z, d_z, d_z))
        next_z_n_z_n_post = zeros((d_z, d_z))
        y_n_z_n_post = zeros((d_y, d_z))

        if training:
            max_Esteps = self.max_Esteps
            last_Esteps = self.last_Esteps
        else:
            max_Esteps = self.max_test_Esteps
            last_Esteps = self.max_test_Esteps

        Esteps = 0
        have_A_denum = False  # A_new_denums was filled during this E step
        get_A_denum = False   # request to fill A_new_denums
        finished = False
        while not finished:
            # Every E step recomputes the statistics from scratch
            T_sum = 0
            gamma_mean_diff = 0
            z_n_z_n_1_post_sum[:] = 0
            z_n_z_n_post_sum[:] = 0
            y_n_z_n_post_sum[:] = 0
            A_new_denums[:] = 0

            Esteps += 1
            if Esteps == max_Esteps:
                get_A_denum = True
                finished = True
            elif Esteps >= last_Esteps:
                # Expected to be close to the last E step (based on the
                # number of E steps used previously)
                get_A_denum = True

            if return_mu_post:
                mu_post = []

            for y_t, gamma_t in zip(y_set, gamma_set):
                T = len(y_t)
                T_sum += T
                mu_kalman_t = zeros((T, d_z))     # Filtering mus
                E_kalman_t = zeros((T, d_z, d_z))  # Filtering Es
                mu_post_t = zeros((T, d_z))
                E_post_t = zeros((T, d_z, d_z))
                # One entry per transition n -> n+1; filled in the forward
                # pass, read back in the backward pass
                P_t = zeros((T - 1, d_z, d_z))

                # Forward pass

                # Initialization at n = 0: zero prior mean and diagonal
                # prior covariance given by gamma_t[0]
                A_times_prev_mu[:] = 0
                multiply(C.T, reshape(gamma_t[0], (-1, 1)), mat_times_C_trans)
                pred[:] = 0

                product_matrix_matrix(C, mat_times_C_trans, cov_pred)
                cov_pred += Sigma
                solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y,
                      Bf_d_y_d_z, pivots_d_y)

                vec_d_y[:] = y_t[0]
                vec_d_y -= pred
                product_matrix_vector(K, vec_d_y, mu_kalman_t[0])

                product_matrix_matrix(K, C, KC)
                mat_d_z_d_z[:] = eye_d_z
                mat_d_z_d_z -= KC
                multiply(mat_d_z_d_z, gamma_t[0], E_kalman_t[0])

                # from n=1 to T-1
                for n in xrange(T - 1):
                    # E_gamma <- diag(1 / (1/E + 1/gamma_t[n+1]))
                    divide(1., E, vec_d_z)
                    divide(1., gamma_t[n + 1], vec_d_z2)
                    vec_d_z += vec_d_z2
                    divide(1., vec_d_z, vec_d_z2)
                    setdiag(E_gamma, vec_d_z2)
                    # A_gamma <- A with row i divided by
                    # (1 + E[i] / gamma_t[n+1][i])
                    divide(E, gamma_t[n + 1], vec_d_z)
                    vec_d_z += 1
                    divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)

                    # P_tn <- A_gamma E_kalman_t[n] A_gamma^T + E_gamma
                    P_tn = P_t[n]
                    product_matrix_matrix(E_kalman_t[n], A_gamma.T,
                                          mat_d_z_d_z)
                    product_matrix_matrix(A_gamma, mat_d_z_d_z, P_tn)
                    P_tn += E_gamma

                    product_matrix_vector(A_gamma, mu_kalman_t[n],
                                          A_times_prev_mu)
                    product_matrix_matrix(P_tn, C.T, mat_times_C_trans)
                    product_matrix_vector(C, A_times_prev_mu, pred)
                    product_matrix_matrix(C, mat_times_C_trans, cov_pred)
                    cov_pred += Sigma
                    solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y,
                          Bf_d_y_d_z, pivots_d_y)
                    vec_d_y[:] = y_t[n + 1]
                    vec_d_y -= pred
                    product_matrix_vector(K, vec_d_y, mu_kalman_t[n + 1])
                    mu_kalman_t[n + 1] += A_times_prev_mu

                    product_matrix_matrix(K, C, KC)
                    mat_d_z_d_z[:] = eye_d_z
                    mat_d_z_d_z -= KC
                    product_matrix_matrix(mat_d_z_d_z, P_tn, mat_d_z_d_z2)
                    # To ensure symmetry
                    E_kalman_t[n + 1] = mat_d_z_d_z2
                    E_kalman_t[n + 1] += mat_d_z_d_z2.T
                    E_kalman_t[n + 1] /= 2

                # At the last step, the posterior is the filtering estimate
                mu_post_t[-1] = mu_kalman_t[-1]
                E_post_t[-1] = E_kalman_t[-1]

                # Compute last step statistics
                outer(mu_post_t[-1], mu_post_t[-1], z_n_z_n_post)
                z_n_z_n_post += E_post_t[-1]
                outer(y_t[-1], mu_post_t[-1], y_n_z_n_post)
                # Update cumulative statistics
                z_n_z_n_post_sum += z_n_z_n_post
                y_n_z_n_post_sum += y_n_z_n_post

                # Backward pass
                pred[:] = 0
                cov_pred[:] = 0
                for n in xrange(T - 2, -1, -1):
                    # Keep the statistics of step n+1: compute_gamma needs
                    # both steps n and n+1
                    next_z_n_z_n_post[:] = z_n_z_n_post
                    divide(E, gamma_t[n + 1], vec_d_z)
                    vec_d_z += 1
                    divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)

                    # J <- E_kalman_t[n] A_gamma^T P_tn^{-1}
                    P_tn = P_t[n]
                    solve(P_tn.T, A_gamma, mat_d_z_d_z, Af_d_z_d_z,
                          Bf_d_z_d_z, pivots_d_z)
                    product_matrix_matrix(E_kalman_t[n], mat_d_z_d_z.T, J)
                    product_matrix_vector(A_gamma, mu_kalman_t[n], vec_d_z)

                    vec_d_z *= -1
                    vec_d_z += mu_post_t[n + 1]
                    product_matrix_vector(J, vec_d_z, mu_post_t[n])
                    mu_post_t[n] += mu_kalman_t[n]

                    mat_d_z_d_z[:] = E_post_t[n + 1]
                    mat_d_z_d_z -= P_tn
                    product_matrix_matrix(mat_d_z_d_z, J.T, mat_d_z_d_z2)
                    product_matrix_matrix(J, mat_d_z_d_z2, mat_d_z_d_z)
                    # To ensure symmetry
                    E_post_t[n] = E_kalman_t[n]
                    E_post_t[n] += mat_d_z_d_z
                    E_post_t[n] += E_kalman_t[n].T
                    E_post_t[n] += mat_d_z_d_z.T
                    E_post_t[n] /= 2

                    # Compute posterior statistics
                    product_matrix_matrix(J, E_post_t[n + 1], z_n_z_n_1_post)
                    outer(mu_post_t[n + 1], mu_post_t[n], mat_d_z_d_z)
                    z_n_z_n_1_post += mat_d_z_d_z

                    outer(mu_post_t[n], mu_post_t[n], z_n_z_n_post)
                    z_n_z_n_post += E_post_t[n]

                    outer(y_t[n], mu_post_t[n], y_n_z_n_post)

                    # Update cumulative statistics
                    z_n_z_n_1_post_sum += z_n_z_n_1_post
                    z_n_z_n_post_sum += z_n_z_n_post
                    y_n_z_n_post_sum += y_n_z_n_post

                    gamma_mean_diff += self.compute_gamma(
                        A, E, z_n_z_n_post, next_z_n_z_n_post, gamma_t[n + 1])
                    if get_A_denum:
                        # Compute the denominator of the A update,
                        # which requires d_z matrices of size (d_z,d_z)
                        # (i.e. d_z different weighted sums of the
                        # z_n_z_n_post matrices)
                        add(gamma_t[n + 1], E, vec_d_z)
                        divide(gamma_t[n + 1], vec_d_z, vec_d_z2)
                        multiply(reshape(z_n_z_n_post, (1, d_z, d_z)),
                                 reshape(vec_d_z2, (d_z, 1, 1)),
                                 weighted_z_n_z_n_post)
                        A_new_denums += weighted_z_n_z_n_post
                        have_A_denum = True

                # Update of the gamma of the first time step, from the
                # statistics of step 0 left in z_n_z_n_post
                new_gamma = (diag(z_n_z_n_post) + 2 * self.gamma_prior_beta
                             ) / (2 * self.gamma_prior_alpha + 3)
                gamma_mean_diff += sum((gamma_t[0] - new_gamma) ** 2) / d_z

                gamma_t[0] = new_gamma

                # Collect the posterior means of *every* sequence. This
                # used to be done once per E step, after this loop, which
                # returned only the last sequence's posterior means.
                if return_mu_post:
                    mu_post.append(mu_post_t)

            gamma_mean_diff /= T_sum
            if gamma_mean_diff < self.gamma_change_tolerance:
                if training:
                    if have_A_denum:
                        finished = True
                        self.last_Esteps = Esteps
                    else:
                        # Converged, but the A denominators are missing:
                        # do one more E step to compute them
                        get_A_denum = True
                else:
                    finished = True
            elif (gamma_mean_diff <= 10 * self.gamma_change_tolerance
                  and training):
                # Close to convergence: start computing the A denominators
                get_A_denum = True
            if self.verbose:
                # Written so that it prints the same text under both the
                # Python 2 print statement and the Python 3 print function
                print("%s %s %s" % (gamma_mean_diff, max_Esteps, Esteps))

        # Compute the M step estimates of the parameters
        if training:
            # Row i of A has its own denominator matrix A_new_denums[i]
            A_regularizer = eye_d_z * self.latent_transition_matrix_regularizer
            for i in xrange(d_z):
                solve(A_new_denums[i] + A_regularizer,
                      z_n_z_n_1_post_sum[i:(i + 1)].T,
                      A_new[i:(i + 1)].T)

            solve(z_n_z_n_post_sum + eye_d_z * self.emission_matrix_regularizer,
                  y_n_z_n_post_sum.T, C_new.T)

        if return_mu_post:
            return (A_new, C_new), gamma_set, mu_post
        else:
            return (A_new, C_new), gamma_set
Example #15
0
    def cond_probs(self, y_set, gamma_set):
        """
        Given the set of gamma variables, outputs the set of
        probabilities p(y_t | y_{t-1}, ... , y_1, gamma_{t-1}, ... , gamma_1)

        For each sequence, a forward (filtering) pass computes the
        one-step-ahead predictions and their scores; a backward
        (smoothing) pass then updates the gammas and computes two
        gamma-corrected variants of the scores.

        Parameters:
          y_set: sequences of observations; iterated in lockstep with
              ``gamma_set``.
          gamma_set: per-sequence gamma arrays, indexed by time step.
              They are modified: ``gamma_t[0]`` is reassigned here and
              ``gamma_t[n+1]`` is passed to ``self.compute_gamma``
              (presumably updated in place -- defined outside this block,
              confirm).

        Returns:
          ``cond_probs, map_probs, laplace_probs, y_pred``: four lists
          with one entry per sequence. The first three are length-T
          arrays: the values of ``self.multivariate_norm_log_pdf`` for
          each observation under its prediction, those values plus
          ``self.log_prior_gamma``, and those values plus
          ``self.log_prior_log_gamma`` and a term based on
          ``log_det_diff2_log_gamma``. ``y_pred`` holds (T, d_y) arrays
          of predicted observation means.

        NOTE(review): product_matrix_matrix, product_matrix_vector, outer,
        setdiag and solve are defined outside this block; comments assume
        each writes its result into its last array argument (first
        argument for setdiag) -- confirm against their definitions.
        """

        # Note (HUGO): this function should probably be implemented in C
        #              to make it much faster, since it requires for loops.

        # Setting variables with friendlier name
        d_y = self.input_size
        d_z = self.latent_size
        A = self.A
        C = self.C
        Sigma = self.Sigma
        E = self.E

        cond_probs = []
        map_probs = []
        laplace_probs = []
        y_pred = []

        # Temporary variable, to avoid memory allocation
        vec_d_z = zeros(d_z)
        vec_d_z2 = zeros(d_z)
        vec_d_y = zeros(d_y)
        mat_d_z_d_z = zeros((d_z, d_z))
        mat_d_z_d_z2 = zeros((d_z, d_z))
        eye_d_z = eye(d_z)
        mat_times_C_trans = zeros((d_z, d_y))
        pred = zeros(d_y)
        cov_pred = zeros((d_y, d_y))
        A_gamma = zeros((d_z, d_z))
        E_gamma = zeros((d_z, d_z))
        K = zeros((d_z, d_y))
        KC = zeros((d_z, d_z))
        J = zeros((d_z, d_z))
        A_times_prev_mu = zeros(d_z)
        Af_d_y_d_y = zeros((d_y, d_y), order='fortran')  # Temporary variables
        Bf_d_y_d_z = zeros((d_y, d_z), order='fortran')  # for calls to
        Af_d_z_d_z = zeros((d_z, d_z), order='fortran')  # math.linalg.solve(...)
        Bf_d_z_d_z = zeros((d_z, d_z), order='fortran')
        pivots_d_y = zeros((d_y), dtype='i', order='fortran')
        pivots_d_z = zeros((d_z), dtype='i', order='fortran')
        z_n_z_n_post = zeros((d_z, d_z))
        next_z_n_z_n_post = zeros((d_z, d_z))

        for y_t, gamma_t in zip(y_set, gamma_set):
            T = len(y_t)
            cond_probs_t = zeros(T)
            map_probs_t = zeros(T)
            laplace_probs_t = zeros(T)
            y_pred_t = zeros((T, d_y))
            mu_kalman_t = zeros((T, d_z))     # Filtering mus
            E_kalman_t = zeros((T, d_z, d_z))  # Filtering Es
            mu_post_t = zeros((T, d_z))
            E_post_t = zeros((T, d_z, d_z))
            # One entry per transition n -> n+1; filled in the forward
            # pass, read back in the backward pass
            P_t = zeros((T - 1, d_z, d_z))

            # Forward pass

            # Initialization at n = 0: zero prior mean and diagonal prior
            # covariance given by gamma_t[0]
            A_times_prev_mu[:] = 0
            multiply(C.T, reshape(gamma_t[0], (-1, 1)), mat_times_C_trans)
            pred[:] = 0
            product_matrix_matrix(C, mat_times_C_trans, cov_pred)
            cov_pred += Sigma
            solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y,
                  Bf_d_y_d_z, pivots_d_y)

            vec_d_y[:] = y_t[0]
            vec_d_y -= pred
            product_matrix_vector(K, vec_d_y, mu_kalman_t[0])

            product_matrix_matrix(K, C, KC)
            mat_d_z_d_z[:] = eye_d_z
            mat_d_z_d_z -= KC
            multiply(mat_d_z_d_z, gamma_t[0], E_kalman_t[0])

            cond_probs_t[0] = self.multivariate_norm_log_pdf(
                y_t[0], pred, cov_pred)
            y_pred_t[0] = pred
            # from n=1 to T-1
            for n in xrange(T - 1):
                # E_gamma <- diag(1 / (1/E + 1/gamma_t[n+1]))
                divide(1., E, vec_d_z)
                divide(1., gamma_t[n + 1], vec_d_z2)
                vec_d_z += vec_d_z2
                divide(1., vec_d_z, vec_d_z2)
                setdiag(E_gamma, vec_d_z2)
                # A_gamma <- A with row i divided by
                # (1 + E[i] / gamma_t[n+1][i])
                divide(E, gamma_t[n + 1], vec_d_z)
                vec_d_z += 1
                divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)

                # P_tn <- A_gamma E_kalman_t[n] A_gamma^T + E_gamma
                P_tn = P_t[n]
                product_matrix_matrix(E_kalman_t[n], A_gamma.T, mat_d_z_d_z)
                product_matrix_matrix(A_gamma, mat_d_z_d_z, P_tn)
                P_tn += E_gamma
                product_matrix_vector(A_gamma, mu_kalman_t[n],
                                      A_times_prev_mu)
                product_matrix_matrix(P_tn, C.T, mat_times_C_trans)
                product_matrix_vector(C, A_times_prev_mu, pred)
                product_matrix_matrix(C, mat_times_C_trans, cov_pred)
                cov_pred += Sigma
                solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y,
                      Bf_d_y_d_z, pivots_d_y)
                vec_d_y[:] = y_t[n + 1]
                vec_d_y -= pred
                product_matrix_vector(K, vec_d_y, mu_kalman_t[n + 1])
                mu_kalman_t[n + 1] += A_times_prev_mu

                product_matrix_matrix(K, C, KC)
                mat_d_z_d_z[:] = eye_d_z
                mat_d_z_d_z -= KC
                product_matrix_matrix(mat_d_z_d_z, P_tn, mat_d_z_d_z2)
                # To ensure symmetry
                E_kalman_t[n + 1] = mat_d_z_d_z2
                E_kalman_t[n + 1] += mat_d_z_d_z2.T
                E_kalman_t[n + 1] /= 2

                cond_probs_t[n + 1] = self.multivariate_norm_log_pdf(
                    y_t[n + 1], pred, cov_pred)
                y_pred_t[n + 1] = pred

            # Last step statistics: at the last step, the posterior is the
            # filtering estimate. This is done once, after the forward
            # loop, so that it is also computed when T == 1 (the loop body
            # never runs then, and gamma_t[0] below would otherwise be
            # derived from the statistics left by the previous sequence).
            mu_post_t[-1] = mu_kalman_t[-1]
            E_post_t[-1] = E_kalman_t[-1]
            outer(mu_post_t[-1], mu_post_t[-1], z_n_z_n_post)
            z_n_z_n_post += E_post_t[-1]

            # Backward pass
            pred[:] = 0
            cov_pred[:] = 0
            for n in xrange(T - 2, -1, -1):
                # Keep the statistics of step n+1: the gamma computations
                # below need both steps n and n+1
                next_z_n_z_n_post[:] = z_n_z_n_post
                divide(E, gamma_t[n + 1], vec_d_z)
                vec_d_z += 1
                divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)

                # J <- E_kalman_t[n] A_gamma^T P_tn^{-1}
                P_tn = P_t[n]
                solve(P_tn.T, A_gamma, mat_d_z_d_z, Af_d_z_d_z, Bf_d_z_d_z,
                      pivots_d_z)
                product_matrix_matrix(E_kalman_t[n], mat_d_z_d_z.T, J)
                product_matrix_vector(A_gamma, mu_kalman_t[n], vec_d_z)

                vec_d_z *= -1
                vec_d_z += mu_post_t[n + 1]
                product_matrix_vector(J, vec_d_z, mu_post_t[n])
                mu_post_t[n] += mu_kalman_t[n]

                mat_d_z_d_z[:] = E_post_t[n + 1]
                mat_d_z_d_z -= P_tn
                product_matrix_matrix(mat_d_z_d_z, J.T, mat_d_z_d_z2)
                product_matrix_matrix(J, mat_d_z_d_z2, mat_d_z_d_z)
                # To ensure symmetry
                E_post_t[n] = E_kalman_t[n]
                E_post_t[n] += mat_d_z_d_z
                E_post_t[n] += E_kalman_t[n].T
                E_post_t[n] += mat_d_z_d_z.T
                E_post_t[n] /= 2

                outer(mu_post_t[n], mu_post_t[n], z_n_z_n_post)
                z_n_z_n_post += E_post_t[n]

                # Called for its effect on gamma_t[n+1]; the returned
                # value is not needed here
                self.compute_gamma(A, E, z_n_z_n_post, next_z_n_z_n_post,
                                   gamma_t[n + 1])
                log_prior_gamma = self.log_prior_gamma(gamma_t[n + 1])
                log_prior_log_gamma = self.log_prior_log_gamma(gamma_t[n + 1])
                log_det_diff2_log_gamma = self.log_det_diff2_log_gamma(
                    A, E, z_n_z_n_post, next_z_n_z_n_post, gamma_t[n + 1])
                map_probs_t[n + 1] = cond_probs_t[n + 1] + log_prior_gamma
                laplace_probs_t[n + 1] = (
                    cond_probs_t[n + 1] + log_prior_log_gamma
                    + d_z * log(2 * pi) / 2 - 0.5 * log_det_diff2_log_gamma)

            # Gamma of the first time step, from the statistics of step 0
            # left in z_n_z_n_post
            gamma_t[0] = ((diag(z_n_z_n_post) + 2 * self.gamma_prior_beta)
                          / (2 * self.gamma_prior_alpha + 3))
            log_prior_gamma = self.log_prior_gamma(gamma_t[0])
            log_prior_log_gamma = self.log_prior_log_gamma(gamma_t[0])
            # NOTE(review): z_n_z_n_post is a full (d_z, d_z) array here,
            # whereas the gamma_t[0] update just above uses only its
            # diagonal, and no log is taken despite the variable's name --
            # kept as is, confirm the intended formula.
            log_det_diff2_log_gamma = sum(
                (z_n_z_n_post / 2 + self.gamma_prior_beta) / gamma_t[0])
            map_probs_t[0] = cond_probs_t[0] + log_prior_gamma
            laplace_probs_t[0] = (
                cond_probs_t[0] + log_prior_log_gamma
                + d_z * log(2 * pi) / 2 - 0.5 * log_det_diff2_log_gamma)

            cond_probs += [cond_probs_t]
            map_probs += [map_probs_t]
            laplace_probs += [laplace_probs_t]
            y_pred += [y_pred_t]

        return cond_probs, map_probs, laplace_probs, y_pred
Example #16
0
    def update_learner(self, example):
        """
        Performs one stochastic gradient update of the network on ``example``.

        ``example[0]`` is copied into the input layer and ``example[1]`` is
        used as the index of the target class among the softmax outputs.

        The delta error at the output pre-activation depends on
        ``self.cost_function`` (``y`` is the softmax output, ``t`` the one-hot
        target):

        - ``'CE'``:  ``y - t``;
        - ``'SSE'``: ``sum_k (y_k - t_k) * y_k * (delta_kj - y_j)``, i.e. the
          error ``y - t`` pushed back through the softmax Jacobian;
        - ``'EXP'``: the ``'SSE'`` delta multiplied by
          ``2 * exp(sum((y - t)**2) / self.tau)``.

        The delta is scaled by the decayed learning rate
        ``learning_rate / (1 + decrease_constant * n_updates)`` before it is
        back-propagated, so all the ``d*`` buffers hold ready-to-subtract
        updates. ``U`` and ``d`` are always updated; ``Ws`` and ``cs`` are
        updated only when ``self.freeze_Ws_cs`` is false.

        Raises ``ValueError`` when ``self.cost_function`` is not one of the
        values listed above, instead of silently reusing whatever delta an
        earlier call left in ``self.doutput_act``.
        """
        cost_function = self.cost_function
        if cost_function not in ('CE', 'SSE', 'EXP'):
            raise ValueError("Unknown cost function %r; expected 'CE', "
                             "'SSE' or 'EXP'" % (cost_function,))

        # apply example to the inputs
        self.layers[0][:] = example[0]

        # forward propagation: compute activation values of all units

        # hidden layers
        for h in range(self.n_hidden_layers):
            mllin.product_matrix_vector(self.Ws[h], self.layers[h], self.layer_acts[h + 1])
            self.layer_acts[h + 1] += self.cs[h]
            mlnonlin.sigmoid(self.layer_acts[h + 1], self.layers[h + 1])

        # output layer
        mllin.product_matrix_vector(self.U, self.layers[-1], self.output_act)
        self.output_act += self.d
        mlnonlin.softmax(self.output_act, self.output)

        # back propagation: compute delta errors and updates to weights and
        # biases

        # delta error at output layer, always written into the existing
        # doutput_act buffer so that it is never rebound to a new array
        if cost_function == 'CE':
            self.doutput_act[:] = self.output
            self.doutput_act[example[1]] -= 1
        else:
            # 'SSE' and 'EXP' share the softmax-Jacobian term
            y = self.output
            t = np.zeros(np.shape(y))
            t[example[1]] = 1

            # nr of classes
            c = np.size(y)

            # weighted_err[k] = (y_k - t_k) * y_k
            weighted_err = (y - t) * y
            # jacobian_factor[k, j] = delta_kj - y_j (y broadcasts over rows)
            jacobian_factor = np.eye(c) - y
            delta = np.sum(weighted_err[:, np.newaxis] * jacobian_factor, axis=0)

            if cost_function == 'EXP':
                delta *= 2 * np.exp(np.sum(np.square(y - t)) / self.tau)

            self.doutput_act[:] = delta

        self.doutput_act *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)
        self.dd[:] = self.doutput_act
        mllin.outer(self.doutput_act, self.layers[-1], self.dU)

        mllin.product_matrix_vector(self.U.T, self.doutput_act, self.dlayers[-1])
        # The description and argument names of dsigmoid() are unclear. In
        # practice, dsigmoid(s,dx,ds) computes s*(1-s)*dx element-wise and puts
        # the result in ds. [TA]
        mlnonlin.dsigmoid(self.layers[-1], self.dlayers[-1], self.dlayer_acts[-1])

        for h in range(self.n_hidden_layers - 1, -1, -1):
            self.dcs[h][:] = self.dlayer_acts[h + 1]
            mllin.outer(self.dlayer_acts[h + 1], self.layers[h], self.dWs[h])
            mllin.product_matrix_vector(self.Ws[h].T, self.dlayer_acts[h + 1], self.dlayers[h])
            mlnonlin.dsigmoid(self.layers[h], self.dlayers[h], self.dlayer_acts[h])

        # update output weights and biases (done whether or not the hidden
        # layers are frozen)
        self.U -= self.dU
        self.d -= self.dd

        # update all hidden weights and biases, unless they are frozen
        if not self.freeze_Ws_cs:
            for h in range(self.n_hidden_layers - 1, -1, -1):
                self.Ws[h] -= self.dWs[h]
                self.cs[h] -= self.dcs[h]

        self.n_updates += 1
Example #17
0
    def EM_step(self,y_set,return_mu_post = False):
        """
        Computes the posterior statistics and outputs the M step
        estimates of the parameters.
        The set of probabilities p(y_t | y_{t-1}, ... , y_1) are also given.

        For every sequence ``y_t`` of ``y_set``, a forward (Kalman
        filtering) pass is followed by a backward (smoothing) pass. The
        posterior moments are accumulated over all time steps and all
        sequences, and the M step estimates are computed from these sums.
        The current parameters are only read from ``self``; the estimates
        are returned in newly allocated arrays.

        Parameters:

        - ``y_set``: collection of sequences. Each sequence must support
          ``len()`` and integer indexing, ``y_t[n]`` being the observation
          at step ``n`` (NOTE(review): the indexing below assumes at least
          2 observations per sequence for the first-step statistics to be
          accumulated -- confirm against callers).
        - ``return_mu_post``: if True, the posterior means of the latent
          states are also returned.

        Returns ``(params, cond_probs)`` or, if ``return_mu_post`` is True,
        ``(params, cond_probs, mu_post)`` where

        - ``params`` is the tuple
          ``(A_new,C_new,E_new,Sigma_new,mu_zero_new,V_zero_new)``;
        - ``cond_probs`` is a list with, for each sequence, an array of the
          values returned by ``self.multivariate_norm_log_pdf`` for each
          observation given the previous ones;
        - ``mu_post`` is a list with, for each sequence, the ``(T,d_z)``
          array of posterior means.
        """

        # Setting variables with friendlier name
        d_y = self.input_size
        d_z = self.latent_size
        mu_zero = self.mu_zero
        V_zero = self.V_zero
        A = self.A
        C = self.C
        Sigma = self.Sigma
        E = self.E

        # Variables for estimating new parameters
        A_new = zeros((d_z,d_z))
        C_new = zeros((d_y,d_z))
        E_new = zeros((d_z,d_z))
        Sigma_new = zeros((d_y,d_y))
        mu_zero_new = zeros((d_z))
        V_zero_new = zeros((d_z,d_z))

        # Cumulative posterior statistics (summed over steps and sequences)
        z_n_z_n_1_post_sum = zeros((d_z,d_z))           # sum of E[z_n z_{n-1}^T]
        z_n_z_n_post_sum = zeros((d_z,d_z))             # sum of E[z_n z_n^T], all steps
        z_n_z_n_post_sum_no_last = zeros((d_z,d_z))     # ... all steps but the last
        z_n_z_n_post_sum_no_first = zeros((d_z,d_z))    # ... all steps but the first
        z_n_z_n_post_sum_first = zeros((d_z,d_z))       # ... first step only
        outer_z_n_z_n_post_sum_first = zeros((d_z,d_z)) # sum of E[z_0] E[z_0]^T
        z_n_post_sum_first = zeros((d_z))               # sum of E[z_0]
        y_n_z_n_post_sum = zeros((d_y,d_z))             # sum of y_n E[z_n]^T
        y_n_y_n_sum = zeros((d_y,d_y))                  # sum of y_n y_n^T

        cond_probs = []

        # Temporary variable, to avoid memory allocation
        vec_d_z = zeros(d_z)
        vec_d_y = zeros(d_y)
        mat_d_z_d_z = zeros((d_z,d_z))
        mat_d_z_d_z2 = zeros((d_z,d_z))
        eye_d_z = eye(d_z)
        mat_times_C_trans = zeros((d_z,d_y))
        pred = zeros(d_y)
        cov_pred = zeros((d_y,d_y))
        K = zeros((d_z,d_y))
        KC = zeros((d_z,d_z))
        J = zeros((d_z,d_z))
        A_times_prev_mu = zeros(d_z)
        Af_d_y_d_y = zeros((d_y,d_y),order='fortran') # Temporary variables
        Bf_d_y_d_z = zeros((d_y,d_z),order='fortran') # for calls to
        Af_d_z_d_z = zeros((d_z,d_z),order='fortran') # math.linalg.solve(...)
        Bf_d_z_d_z = zeros((d_z,d_z),order='fortran')
        pivots_d_y = zeros((d_y),dtype='i',order='fortran')
        pivots_d_z = zeros((d_z),dtype='i',order='fortran')
        z_n_z_n_1_post = zeros((d_z,d_z))
        z_n_z_n_post = zeros((d_z,d_z))
        y_n_z_n_post = zeros((d_y,d_z))
        y_n_y_n = zeros((d_y,d_y))
        T_sum = 0

        if return_mu_post:
            mu_post = []

        for y_t in y_set:
            T = len(y_t)
            T_sum += T
            mu_kalman_t = zeros((T,d_z))     # Filtering mus
            E_kalman_t = zeros((T,d_z,d_z))  # Filtering Es
            mu_post_t = zeros((T,d_z))       # Posterior mus (could be removed and computed once)
            E_post_t = zeros((T,d_z,d_z))    # Posterior Es  (could be removed and computed once)
            P_t = zeros((T-1,d_z,d_z))       # Predicted covariances A E_kalman A^T + E
            cond_probs_t = zeros(T)

            # Forward pass

            # Initialization at n = 0
            A_times_prev_mu[:] = 0
            product_matrix_matrix(V_zero,C.T,mat_times_C_trans)
            product_matrix_vector(C,mu_zero,pred)
            product_matrix_matrix(C,mat_times_C_trans,cov_pred)
            cov_pred += Sigma
            # Gain K = V_zero C^T cov_pred^-1, obtained by solving for K^T
            solve(cov_pred,mat_times_C_trans.T,K.T,Af_d_y_d_y,Bf_d_y_d_z,pivots_d_y)

            # mu_kalman[0] = mu_zero + K (y_0 - C mu_zero)
            vec_d_y[:] = y_t[0]
            vec_d_y -= pred
            product_matrix_vector(K,vec_d_y,mu_kalman_t[0])
            mu_kalman_t[0] += mu_zero

            # E_kalman[0] = (I - K C) V_zero
            product_matrix_matrix(K,C,KC)
            mat_d_z_d_z[:] = eye_d_z
            mat_d_z_d_z -= KC
            product_matrix_matrix(mat_d_z_d_z,V_zero,E_kalman_t[0])
            cond_probs_t[0] = self.multivariate_norm_log_pdf(y_t[0],pred,cov_pred)
            # from n=1 to T-1
            for n in xrange(T-1):
                # P_tn = A E_kalman[n] A^T + E (stored for the backward pass)
                P_tn = P_t[n]
                product_matrix_matrix(E_kalman_t[n],A.T,mat_d_z_d_z)
                product_matrix_matrix(A,mat_d_z_d_z,P_tn)
                P_tn += E
                product_matrix_vector(A,mu_kalman_t[n],A_times_prev_mu)
                product_matrix_matrix(P_tn,C.T,mat_times_C_trans)
                product_matrix_vector(C,A_times_prev_mu,pred)
                product_matrix_matrix(C,mat_times_C_trans,cov_pred)
                cov_pred += Sigma
                solve(cov_pred,mat_times_C_trans.T,K.T,Af_d_y_d_y,Bf_d_y_d_z,pivots_d_y)
                vec_d_y[:] = y_t[n+1]
                vec_d_y -= pred
                product_matrix_vector(K,vec_d_y,mu_kalman_t[n+1])
                mu_kalman_t[n+1] += A_times_prev_mu

                product_matrix_matrix(K,C,KC)
                mat_d_z_d_z[:] = eye_d_z
                mat_d_z_d_z -= KC
                product_matrix_matrix(mat_d_z_d_z,P_tn,mat_d_z_d_z2)
                # To ensure symmetry
                E_kalman_t[n+1] = mat_d_z_d_z2
                E_kalman_t[n+1] += mat_d_z_d_z2.T
                E_kalman_t[n+1] /= 2
                cond_probs_t[n+1] = self.multivariate_norm_log_pdf(y_t[n+1],pred,cov_pred)


            # At the last step, the posterior is the filtering distribution
            mu_post_t[-1] = mu_kalman_t[-1]
            E_post_t[-1] = E_kalman_t[-1]

            # Compute last step statistics
            outer(mu_post_t[-1],mu_post_t[-1],z_n_z_n_post)
            z_n_z_n_post += E_post_t[-1]
            outer(y_t[-1],mu_post_t[-1],y_n_z_n_post)
            outer(y_t[-1],y_t[-1],y_n_y_n)
            # Update cumulative statistics
            z_n_z_n_post_sum += z_n_z_n_post
            z_n_z_n_post_sum_no_first += z_n_z_n_post
            y_n_z_n_post_sum += y_n_z_n_post
            y_n_y_n_sum += y_n_y_n

            # Backward pass
            pred[:] = 0
            cov_pred[:] = 0
            for n in xrange(T-2,-1,-1):
                # J = E_kalman[n] A^T P_tn^-1
                P_tn = P_t[n]
                solve(P_tn.T,A,mat_d_z_d_z,Af_d_z_d_z,Bf_d_z_d_z,pivots_d_z)
                product_matrix_matrix(E_kalman_t[n],mat_d_z_d_z.T,J)
                product_matrix_vector(A,mu_kalman_t[n],vec_d_z)

                # mu_post[n] = mu_kalman[n] + J (mu_post[n+1] - A mu_kalman[n])
                vec_d_z *= -1
                vec_d_z += mu_post_t[n+1]
                product_matrix_vector(J,vec_d_z,mu_post_t[n])
                mu_post_t[n] += mu_kalman_t[n]

                # E_post[n] = E_kalman[n] + J (E_post[n+1] - P_tn) J^T
                mat_d_z_d_z[:] = E_post_t[n+1]
                mat_d_z_d_z -= P_tn
                product_matrix_matrix(mat_d_z_d_z,J.T,mat_d_z_d_z2)
                product_matrix_matrix(J,mat_d_z_d_z2,mat_d_z_d_z)
                # To ensure symmetry
                E_post_t[n] = E_kalman_t[n]
                E_post_t[n] += mat_d_z_d_z
                E_post_t[n] += E_kalman_t[n].T
                E_post_t[n] += mat_d_z_d_z.T
                E_post_t[n] /= 2

                # Compute posterior statistics
                # Cross moment between step n+1 and step n, with step n+1
                # indexing the rows: E_post[n+1] J^T + mu_post[n+1] mu_post[n]^T.
                # The covariance part must be E_post[n+1] J^T and not its
                # transpose J E_post[n+1], so that it has the same orientation
                # as the outer product of the means and as the statistic
                # expected by the A_new and E_new estimates below.
                product_matrix_matrix(E_post_t[n+1],J.T,z_n_z_n_1_post)
                outer(mu_post_t[n+1],mu_post_t[n],mat_d_z_d_z)
                z_n_z_n_1_post += mat_d_z_d_z

                outer(mu_post_t[n],mu_post_t[n],z_n_z_n_post)
                z_n_z_n_post += E_post_t[n]

                outer(y_t[n],mu_post_t[n],y_n_z_n_post)
                outer(y_t[n],y_t[n],y_n_y_n)

                # Update cumulative statistics
                z_n_z_n_1_post_sum += z_n_z_n_1_post
                z_n_z_n_post_sum += z_n_z_n_post
                if n > 0:
                    z_n_z_n_post_sum_no_first += z_n_z_n_post
                else:
                    z_n_z_n_post_sum_first += z_n_z_n_post
                    z_n_post_sum_first += mu_post_t[n]
                    outer(mu_post_t[n],mu_post_t[n],mat_d_z_d_z)
                    outer_z_n_z_n_post_sum_first += mat_d_z_d_z
                z_n_z_n_post_sum_no_last += z_n_z_n_post
                y_n_z_n_post_sum += y_n_z_n_post
                y_n_y_n_sum += y_n_y_n

            cond_probs += [cond_probs_t]

            if return_mu_post:
                mu_post += [mu_post_t]

        # Compute the M step estimates of the parameters
        #A_new = dot(z_n_z_n_1_post_sum,inv(z_n_z_n_post_sum_no_last+
        #                               eye_d_z*self.latent_transition_matrix_regularizer))
        solve(z_n_z_n_post_sum_no_last+eye_d_z*self.latent_transition_matrix_regularizer,
              z_n_z_n_1_post_sum.T,A_new.T)
        #C_new = dot(y_n_z_n_post_sum, inv(z_n_z_n_post_sum+
        #                                  eye_d_z*self.input_transition_matrix_regularizer))
        solve(z_n_z_n_post_sum+eye_d_z*self.input_transition_matrix_regularizer,
              y_n_z_n_post_sum.T,C_new.T)

        E_new[:] = z_n_z_n_post_sum_no_first
        z_n_z_n_1_A_T = dot(z_n_z_n_1_post_sum,A_new.T)
        E_new -= z_n_z_n_1_A_T.T
        E_new -= z_n_z_n_1_A_T # There is an error in Bishop's equation: the transpose on A is missing
        E_new += dot(A_new,dot(z_n_z_n_post_sum_no_last,A_new.T))
        E_new += eye_d_z*self.latent_covariance_matrix_regularizer
        # One transition per step but the first one of each sequence
        E_new /= T_sum - len(y_set)
        Sigma_new[:] = y_n_y_n_sum
        C_z_n_y_n = dot(C_new,y_n_z_n_post_sum.T)
        Sigma_new -= C_z_n_y_n
        Sigma_new -= C_z_n_y_n.T # There is an error in Bishop's equation: the transpose on C is missing
        Sigma_new += dot(C_new,dot(z_n_z_n_post_sum,C_new.T)) # ... idem
        Sigma_new += eye(d_y)*self.input_covariance_matrix_regularizer
        Sigma_new /= T_sum

        # Initial state estimates, averaged over the sequences
        mu_zero_new[:] = z_n_post_sum_first
        mu_zero_new /= len(y_set)
        V_zero_new[:] = z_n_z_n_post_sum_first
        V_zero_new -= outer_z_n_z_n_post_sum_first
        V_zero_new /= len(y_set)

        if return_mu_post:
            return (A_new,C_new,E_new,Sigma_new,mu_zero_new,V_zero_new),cond_probs,mu_post
        else:
            return (A_new,C_new,E_new,Sigma_new,mu_zero_new,V_zero_new),cond_probs