Esempio n. 1
0
class Net:
    """Single neuron: weighted sum of inputs, activation, and weight update.

    The neuron remembers the features of its latest forward pass so that
    ``update_weights`` can apply the correction against those same inputs.
    """

    def __init__(self, method=Method.Sigmoid):
        # NOTE(review): `method` is accepted but never forwarded to
        # Activation(); confirm whether Activation should receive it.
        self.weights = []  # Current weights
        self.old_weights = []  # Weights as they were before the last update
        self.output = 0.0  # Neuron output of the latest forward pass
        self.inputted_features = []  # Features of the latest forward pass
        self.summed_signal = 0.0  # Summed signal (weighted sum of the inputs)
        self.learning_rate = 0.8  # Learning rate
        self.activition = Activation()  # Activation function

    # Sum the weighted inputs and activate the result.
    def summarize_inputs(self, features=None):
        """Run a forward pass and return the activated output.

        Args:
            features: Sequence of input values. ``None`` or an empty sequence
                short-circuits and returns ``0.0`` without touching any state.

        Returns:
            The value produced by ``self.activition.activate`` for the
            weighted sum, or ``0.0`` when there is nothing to sum.
        """
        # An explicit length check is used because the truth value of a
        # multi-element NumPy array is ambiguous and raises ValueError.
        if features is None or len(features) == 0:
            return 0.0
        self.inputted_features = copy.deepcopy(features)
        # a = X * W
        self.summed_signal = np.dot(self.inputted_features, self.weights)
        # b = f(a)
        self.output = self.activition.activate(self.summed_signal)
        return self.output

    # Update the weights.
    def update_weights(self, error_value=0.0):
        """Shift every weight using the features of the latest forward pass.

        new weight = old weight + learning rate * error_value * input

        Args:
            error_value: Error term applied to every weight.
        """
        self.old_weights = copy.deepcopy(self.weights)
        scaled_error = self.learning_rate * error_value
        for index, old_weight in enumerate(self.old_weights):
            # Weights are written in place so the list object is preserved.
            self.weights[index] = (
                old_weight + scaled_error * self.inputted_features[index])

    def differential_activition(self):
        """Return ``self.activition.differentiate`` of the latest output."""
        return self.activition.differentiate(self.output)
Esempio n. 2
0
class Net(object):
    """Neuron of a recurrent network layer.

    Holds the feed-forward weights, the optional recurrent weights and the
    bias, records its outputs in ``self.output`` and collects one
    ``Timestep`` of weight corrections per ``calculate_delta_weights`` call.
    ``renew`` then applies the summed corrections.

    Read/write property: ``activation_method``.
    Read-only properties: ``output_value``, ``output_partial``,
    ``previous_output``, ``previous_output_partial``.
    """

    def __init__(self, has_recurrent=False):
        self.weights = []  # <number>
        self.recurrent_weights = []  # <number>
        self.bias = 0.0
        self.delta_value = 0.0  # Current delta value will be next delta value.
        # Has recurrent inputs? (Hidden nets have them, output nets do not.)
        self.has_recurrent = has_recurrent
        # The activation method is read and written through
        # net.activation.method; the `activation_method` property is a
        # convenience accessor for the same value.
        self.activation = Activation()
        self.output = NetOutput()
        self.timesteps = []  # <Timestep Object>

    # weights<number>
    def reset_weights(self, weights=()):
        """Replace the feed-forward weights with a list copy of ``weights``.

        ``None`` or an empty sequence is ignored. The length is checked
        explicitly so NumPy arrays are accepted as well as lists.
        """
        if weights is None or len(weights) == 0:
            return
        self.weights = np.copy(weights).tolist()

    # recurrent_weights<number>
    def reset_recurrent_weights(self, recurrent_weights=()):
        """Replace the recurrent weights with a list copy of the argument.

        ``None`` or an empty sequence is ignored.
        """
        if recurrent_weights is None or len(recurrent_weights) == 0:
            return
        self.recurrent_weights = np.copy(recurrent_weights).tolist()

    def weight_for_index(self, index=0):
        """Return the feed-forward weight stored at ``index``."""
        return self.weights[index]

    def recurrent_weight_for_index(self, index=0):
        """Return the recurrent weight stored at ``index``."""
        return self.recurrent_weights[index]

    # The bias is reset to zero here as well.
    def remove_all_weights(self):
        """Empty both weight lists in place and reset the bias to 0.0."""
        del self.weights[:]
        del self.recurrent_weights[:]
        self.bias = 0.0

    # Randomizing weights and bias.
    def randomize_weights(self, random_count=1, min=-0.5, max=0.5):
        """Fill the weights with ``random_count`` uniform samples.

        The bias is drawn from the same ``[min, max)`` range afterwards.
        """
        random = np.random
        del self.weights[:]
        self.weights.extend(
            random.uniform(min, max) for _ in range(random_count))
        self.bias = random.uniform(min, max)

    def randomize_recurrent_weights(self, random_count=1, min=-0.5, max=0.5):
        """Fill the recurrent weights with uniform samples.

        Does nothing unless the net was created with ``has_recurrent``.
        """
        if not self.has_recurrent:
            return
        random = np.random
        del self.recurrent_weights[:]
        self.recurrent_weights.extend(
            random.uniform(min, max) for _ in range(random_count))

    # Net output. (Hidden net with recurrent, Output net without recurrent)
    def net_output(self, inputs=(), recurrent_outputs=()):
        """Compute, record and return the activated output of the net.

        Args:
            inputs: Values fed forward into this net.
            recurrent_outputs: Values fed back through the recurrent weights;
                an empty sequence skips the recurrent term.

        Returns:
            The value returned by ``self.activation.activate``.
        """
        # Plain feed-forward part first.
        summed_signal = np.dot(inputs, self.weights) + self.bias
        # Then the recurrent part, when recurrent outputs are supplied.
        if len(recurrent_outputs) > 0:
            summed_signal += np.dot(recurrent_outputs, self.recurrent_weights)

        # Neuron output; both the sum and the output are recorded.
        activated = self.activation.activate(summed_signal)
        self.output.add_sum_input(summed_signal)
        self.output.add_output_value(activated)
        return activated

    def clear(self):
        """Delegate to ``self.output.refresh()``."""
        self.output.refresh()

    # For hidden layer nets to calculate their delta weights with recurrent layer,
    # and for output layer nets to calculate their delta weights without recurrent layer.
    # layer_outputs: hidden layer outputs or output layer outputs.
    def calculate_delta_weights(self,
                                learning_rate=1.0,
                                layer_outputs=(),
                                recurrent_outputs=()):
        """Record one ``Timestep`` holding the corrections for this BP step.

        Args:
            learning_rate: Factor applied to every correction.
            layer_outputs: One value per feed-forward weight.
            recurrent_outputs: One value per recurrent weight.
        """
        # A Timestep is the container for the corrections of one BP timestep.
        timestep = Timestep()
        scaled_delta = learning_rate * self.delta_value
        # For delta bias.
        timestep.delta_bias = scaled_delta
        # For delta of weights.
        # SGD: new w = old w + (-learning rate * delta_value * x)
        #      -> x may be b[t][h] (hidden output), b[t-1][h] (recurrent
        #         output) or x[i] (input feature).
        # Output layer delta_value = dE/dw[hk] = -error value * f'(net)
        # Hidden layer delta_value = dE/dw[ih]
        #      = SUM(delta_value[t][hk] * w[hk]) + SUM(delta_value[t+1][h'h] * w)
        for weight_index in range(len(self.weights)):
            timestep.add_delta_weight(
                scaled_delta * layer_outputs[weight_index])

        # For delta of recurrent weights. (Noted: Output Layer is without Recurrent)
        for recurrent_index in range(len(self.recurrent_weights)):
            timestep.add_recurrent_delta_weight(
                scaled_delta * recurrent_outputs[recurrent_index])

        self.timesteps.append(timestep)

    def renew_weights(self, new_weights=(), new_recurrent_weights=()):
        """Replace both weight lists; the bias is left untouched.

        When both arguments are empty nothing happens. Otherwise both lists
        are cleared and then refilled from whichever argument is non-empty.
        """
        has_weights = new_weights is not None and len(new_weights) > 0
        has_recurrent = (new_recurrent_weights is not None
                         and len(new_recurrent_weights) > 0)
        if not has_weights and not has_recurrent:
            return
        # Copy before clearing: the caller may pass this net's own lists.
        weights_copy = np.copy(new_weights).tolist() if has_weights else []
        recurrent_copy = (np.copy(new_recurrent_weights).tolist()
                          if has_recurrent else [])
        # The lists are cleared directly instead of via remove_all_weights(),
        # which would also zero the bias and make renew() lose the old bias
        # before renew_bias() adds the accumulated corrections to it.
        del self.weights[:]
        del self.recurrent_weights[:]
        self.reset_weights(weights_copy)
        self.reset_recurrent_weights(recurrent_copy)

    def renew_bias(self):
        """Add the summed ``delta_bias`` of all timesteps to the bias."""
        # new b(j) = old b(j) + [-L * -delta(j)]
        self.bias += sum(
            (timestep.delta_bias for timestep in self.timesteps), 0.0)

    # Renew weights and bias.
    def renew(self):
        """Apply all recorded timesteps, then reset the training state."""
        # Sum the delta weights that share an index across every timestep.
        new_weights = [
            weight + sum((timestep.delta_weight(weight_index)
                          for timestep in self.timesteps), 0.0)
            for weight_index, weight in enumerate(self.weights)
        ]
        new_recurrent_weights = [
            recurrent_weight + sum(
                (timestep.recurrent_delta_weight(recurrent_index)
                 for timestep in self.timesteps), 0.0)
            for recurrent_index, recurrent_weight in enumerate(
                self.recurrent_weights)
        ]

        self.renew_weights(new_weights, new_recurrent_weights)
        self.renew_bias()

        del self.timesteps[:]
        # Must be reset to zero because of how BPTT accumulates it.
        self.delta_value = 0.0

    '''
    @ Getters that all Readonly
    '''

    @property
    def output_value(self):
        """The last output value from output_values."""
        return self.output.last_output_value

    @property
    def output_partial(self):
        """Partial derivative of the activation at the last output.

        For activations flagged ``is_linear`` (e.g. SGN, ReLU) the derivative
        is taken at the summed input; otherwise at the output value.
        """
        if self.activation.is_linear == True:
            point = self.output.last_sum_input
        else:
            point = self.output_value
        return self.activation.partial(point)

    @property
    def previous_output(self):
        """The last moment output, e.g. b[t-1][h]."""
        return self.output.previous_output

    @property
    def previous_output_partial(self):
        """Partial derivative of the activation at the previous output."""
        if self.activation.is_linear == True:
            point = self.output.previous_sum_input
        else:
            point = self.previous_output
        return self.activation.partial(point)

    @property
    def activation_method(self):
        """Activation method, stored on ``self.activation.method``."""
        return self.activation.method

    '''
    @ Setter
    '''

    @activation_method.setter
    def activation_method(self, method):
        self.activation.method = method
Esempio n. 3
0
class Neuron:
    """Single neuron trained sample by sample with the delta rule.

    Training repeats full passes over the stored samples until either
    ``max_iteration`` passes were made or the cost of the latest pass is at
    most ``convergence``.
    """

    # Private usage.
    __iteration_times = 0  # Number of iterations performed
    __iteration_error = 0.0  # Summed squared error of the current iteration

    def __init__(self):
        self.tag = self.__class__.__name__
        self.samples = []  # All training samples (feature vectors)
        self.targets = []  # Target output of each sample
        self.weights = []  # Weights
        # NOTE(review): the bias is stored but not used by _net_input().
        self.bias = 0.0  # Bias
        self.learning_rate = 1.0  # Learning rate
        self.max_iteration = 1  # Maximum number of iterations
        self.convergence = 0.001  # Convergence error
        self.activation = Activation()

    # Iteration cost function: after each full iteration, average the cost
    # of every training sample (used to decide whether training converged).
    def _iteration_cost_function(self):
        """Return half of the mean squared error of the current iteration."""
        # 1/2 * (summed cost of all samples / (number of samples * number of
        # target outputs per sample)); this neuron has one output per sample.
        return 0.5 * (self.__iteration_error / (len(self.samples) * 1))

    # Cost of one training sample. The factor 1/2 is applied once in
    # _iteration_cost_function(), so it is not applied here.
    def _cost_function(self, error_value=0.0):
        """Add the squared error of one sample to the iteration total."""
        self.__iteration_error += (error_value**2)

    def _net_input(self, features=()):
        """Return the dot product of ``features`` and the weights."""
        return np.dot(features, self.weights)

    def _net_output(self, net_input=0.0):
        """Return the activation of ``net_input``."""
        return self.activation.activate(net_input)

    def _start(self, iteration, completion):
        """Run training passes until a stop condition is met.

        Args:
            iteration: Optional callback ``(times, weights)`` invoked after
                every pass that does not end the training.
            completion: Optional callback ``(times, weights)`` invoked once,
                after the final pass.
        """
        # A loop is used instead of one recursive call per pass, so a large
        # max_iteration cannot exhaust the interpreter's recursion limit.
        while True:
            self.__iteration_times += 1
            # Start every pass with a clean error total; otherwise the cost
            # would keep growing across passes and could never converge.
            self.__iteration_error = 0.0

            # Every step is deliberately spelled out to keep the flow clear.
            for index, features in enumerate(self.samples):
                # Forward
                target_value = self.targets[index]
                net_output = self._net_output(self._net_input(features))

                # Backward
                error_value = target_value - net_output
                derived_activation = self.activation.partial(net_output)
                # Calculates cost function of the training sample.
                self._cost_function(error_value)
                # Updates all weights, the formula:
                # delta_value = -(target value - net output) * f'(net)
                # delta_weight = -learning rate * delta_value * x1
                #   (the two minus signs cancel each other out)
                # new weights, e.g. new w1 = old w1 + delta_weight w1
                delta_value = error_value * derived_activation
                delta_weights = np.multiply(self.learning_rate * delta_value,
                                            features)
                self.weights = np.add(self.weights, delta_weights)

            # Finished an iteration, then check the stop conditions.
            reached_limit = self.__iteration_times >= self.max_iteration
            if reached_limit or (
                    self._iteration_cost_function() <= self.convergence):
                if completion is not None:
                    completion(self.__iteration_times, self.weights)
                return
            if iteration is not None:
                iteration(self.__iteration_times, self.weights)

    # One training sample: features -> one target
    def add_pattern(self, features=(), target=0):
        """Store one training sample; empty or ``None`` features are ignored."""
        # The length is checked explicitly so NumPy arrays are accepted too.
        if features is None or len(features) == 0:
            return
        # samples[features array]
        # targets[target value]
        self.samples.append(features)
        self.targets.append(target)

    def initialize_weights(self, weights=()):
        """Use ``weights`` (not copied) as the weights; ignore empty input."""
        if weights is None or len(weights) == 0:
            return
        self.weights = weights

    # All-zero initial weights
    def zero_weights(self):
        """Set one 0.0 weight per feature of the first stored sample.

        Does nothing when no sample has been added yet. Existing weights are
        replaced, so calling this repeatedly keeps the length correct.
        """
        if not self.samples:
            return
        self.weights = [0.0] * len(self.samples[0])

    def randomize_weights(self, min=0.0, max=1.0):
        """Draw one uniform weight per feature of the first stored sample.

        Does nothing when no sample has been added yet.
        """
        if not self.samples:
            return
        # Float
        random = np.random
        input_count = len(self.samples[0])
        self.initialize_weights(
            [random.uniform(min, max) for _ in range(input_count)])

    # iteration and completion are callback functions
    def training(self, iteration, completion):
        """Reset the iteration counters and train on the stored samples."""
        self.__iteration_times = 0
        self.__iteration_error = 0.0
        self._start(iteration, completion)

    def predict(self, features=()):
        """Return the activated output for ``features``."""
        return self._net_output(self._net_input(features))
class RNN:
    """Recurrent network with one state layer, trained by gradient descent.

    U - weight matrix from input into state layer.
    W - weight matrix from state layer to state layer.
    V - weight matrix from state layer to output layer.
    """

    def __init__(self,
                 input_layer_size,
                 state_layer_size,
                 state_layer_activation,
                 output_layer_size,
                 output_layer_activation,
                 epochs=100,
                 bptt_truncate=None,
                 learning_rule='bptt',
                 kernel=None,
                 eta=0.001,
                 rand=None,
                 verbose=0):
        """
        Notes:
            U - weight matrix from input into hidden layer.
            W - weight matrix from hidden layer to hidden layer.
            V - weight matrix from hidden layer to output layer.

        Inputs:
            input_layer_size:
                Size of the input vector. We expect a 2D numpy array, so this should be X.shape[1]

            state_layer_size:
                State layer size.

            state_layer_activation:
                A string. Refer to activation.py

            output_layer_size:
                Size of the output vector. We expect a 2D numpy array, so this should be Y.shape[1]

            output_layer_activation:
                A string. Refer to activation.py

            epochs(opt):
                Number of passes over the training data made by fit().

            bptt_truncate(opt):
                If left at None, back propagation through time will be applied for all time steps.

                Otherwise, a value for bptt_truncate means that
                bptt will only be applied for at most bptt_truncate steps.

                Only considered when learning_rule == 'bptt'

            learning_rule(opt):
                Choose between 'bptt' and 'modified' (case-insensitive).

            kernel(opt):
                # TODO - fill this
                Only considered when learning_rule == 'modified'

            eta (opt):
                Learning rate. Initialized to 0.001.

            rand (opt):
                Random seed. Initialized to None (no random seed).
                Note that this seeds NumPy's global random state.

            verbose (opt):
                Verbosity: levels 0 - 2

        Outputs:
            None

        Raises:
            ValueError: if learning_rule is neither 'bptt' nor 'modified'.
        """
        np.random.seed(rand)

        self.learning_rule = learning_rule.lower()

        if self.learning_rule == 'bptt':
            self.gradient_function = self.bptt
        elif self.learning_rule == 'modified':
            # The attribute name must match the one fit() reads.
            self.gradient_function = self.modified_learning_rule
        else:
            raise ValueError(
                "learning_rule must be 'bptt' or 'modified', got %r" %
                (learning_rule, ))

        self.input_layer_size = input_layer_size

        self.state_layer_size = state_layer_size
        self.state_layer_activation = state_layer_activation
        self.state_activation = Activation(state_layer_activation)

        self.output_layer_size = output_layer_size
        self.output_layer_activation = output_layer_activation
        self.output_activation = Activation(output_layer_activation)

        self.epochs = epochs

        self.kernel = kernel
        self.bptt_truncate = bptt_truncate

        # Every matrix is drawn uniformly from +-sqrt(1 / fan-in).
        input_bound = np.sqrt(1. / input_layer_size)
        state_bound = np.sqrt(1. / state_layer_size)
        self.U = np.random.uniform(-input_bound, input_bound,
                                   (state_layer_size, input_layer_size))
        self.V = np.random.uniform(-state_bound, state_bound,
                                   (output_layer_size, state_layer_size))
        self.W = np.random.uniform(-state_bound, state_bound,
                                   (state_layer_size, state_layer_size))
        self.state_bias = np.zeros((state_layer_size, 1))
        self.output_bias = np.zeros((output_layer_size, 1))

        self.eta = eta
        self.verbose = verbose
        self.show_progress_bar = verbose > 0

    def fit(self, X_train, y_train):
        """
        Train on every (x, y) pair, once per epoch.

        Inputs:
            X_train:
                Training inputs: an iterable of sequences, each one passed
                to the gradient function as `x`.
                NOTE(review): forward_propagation() indexes x as
                (T, input_layer_size) - confirm the expected layout.

            y_train:
                Training outputs, paired with X_train element by element.

        Outputs:
            None
        """
        eta = self.eta

        if self.show_progress_bar:
            bar = ProgressBar(max_value=self.epochs)

        for epoch in range(self.epochs):
            for x, y in zip(X_train, y_train):
                dLdU, dLdV, dLdW, dLdOb, dLdSb = self.gradient_function(x, y)
                self.U -= eta * dLdU
                self.V -= eta * dLdV
                self.W -= eta * dLdW
                # The biases take the same learning-rate-scaled step as the
                # weight matrices.
                self.output_bias -= eta * dLdOb
                self.state_bias -= eta * dLdSb

            if self.show_progress_bar:
                bar.update(epoch)

    def forward_propagation(self, x):
        """
        Inputs:
            x:
                Expect size (T, input_layer_size), where T is the length of time.
        Outputs:
            o:
                The activation of the output layer, one row per time step.
            s:
                The activation of the hidden state, (T + 1) rows.
            s_linear:
                The state layer values before activation, (T + 1) rows.
            o_linear:
                The output layer values before activation, one row per time step.
        """
        T = x.shape[0]

        # One extra all-zero row: at t == 0 the index s[t - 1] wraps around
        # to that last row, which serves as the initial state.
        s = np.zeros((T + 1, self.state_layer_size))
        o = np.zeros((T, self.output_layer_size))
        s_linear = np.zeros((T + 1, self.state_layer_size))
        o_linear = np.zeros((T, self.output_layer_size))

        state_bias = Convert2DTo1D(self.state_bias)
        output_bias = Convert2DTo1D(self.output_bias)

        for t in range(T):
            state_linear = (np.dot(self.U, x[t]) + np.dot(self.W, s[t - 1]) +
                            state_bias)
            s_linear[t] = state_linear
            s[t] = self.state_activation.activate(state_linear)
            output_linear = np.dot(self.V, s[t]) + output_bias
            o[t] = self.output_activation.activate(output_linear)
            o_linear[t] = output_linear
        return (o, s, s_linear, o_linear)

    def modified_learning_rule(self, x, y):
        """
            Output:
                dLdU:
                    Gradient for U matrix
                dLdV:
                    Gradient for V matrix
                dLdW:
                    Gradient for W matrix
                dLdOb:
                    Gradient for output layer bias
                dLdSb:
                    Gradient for state layer bias
            TODO - implement this
        """
        raise NotImplementedError

    def bptt(self, x, y):
        """
            Back propagation through time for one sequence.

            Output (a list, in this order):
                dLdU:
                    Gradient for U matrix
                dLdV:
                    Gradient for V matrix
                dLdW:
                    Gradient for W matrix
                dLdOb:
                    Gradient for output layer bias
                dLdSb:
                    Gradient for state layer bias

            Every gradient is averaged over the number of terms that were
            accumulated into it.
        """
        # TODO - numpy likes to provide 1D matrices instead of 2D, and unfortunately
        # we need 2D matrices. Therefore we have a lot of converting 1D to 2D matrices
        # and we might want to clean that later somehow...
        # NOTE(review): Convert1DTo2D presumably yields column vectors, as
        # the accumulators below have shape (n, 1) - confirm.

        T = len(y)
        assert T == len(x)

        if self.bptt_truncate is None:
            bptt_truncate = T
        else:
            bptt_truncate = self.bptt_truncate

        o, s, s_linear, o_linear = self.forward_propagation(x)

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)

        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)

        # dLdV / dLdOb receive one term per time step; dLdU / dLdW / dLdSb
        # receive one term per back-propagated step.
        num_output_additions = 0
        num_state_additions = 0

        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer
            g = Convert1DTo2D(delta_o[t])
            o_linear_val = Convert1DTo2D(o_linear[t])
            state_activation = Convert1DTo2D(s[t])

            g = g * self.output_activation.dactivate(o_linear_val)
            dLdV += np.dot(g, state_activation.T)
            dLdOb += g
            num_output_additions += 1
            g = np.dot(self.V.T, g)

            # Backpropagation through time for at most bptt truncate steps
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])
                # The input that produced the state at this very step.
                x_step = Convert1DTo2D(x[bptt_step])

                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdU += np.dot(g, x_step.T)
                dLdSb += g
                num_state_additions += 1

                # Hand the gradient to the previous state; the activation
                # derivative is applied at the top of the next step.
                g = np.dot(self.W.T, g)
        return [
            dLdU / num_state_additions, dLdV / num_output_additions,
            dLdW / num_state_additions, dLdOb / num_output_additions,
            dLdSb / num_state_additions
        ]

    def predict(self, X):
        """
        Inputs:
            X:
                An iterable of input sequences, each accepted by
                forward_propagation().

        Outputs:
            predictions:
                A list with the output-layer activations of each sequence.
        """
        predictions = []
        for x in X:
            o, _, _, _ = self.forward_propagation(x)
            predictions.append(o)
        return predictions

    def score(self, X, Y):
        """
        Inputs:
            X:
                An iterable of input sequences (see predict()).

            Y:
                The expected outputs, paired with X element by element.

        Outputs:
            MSE: the mean of the per-sequence mean squared errors.
        """
        predictions = self.predict(X)
        # Each prediction is compared with its own target only.
        mses = [
            np.mean((prediction - y)**2)
            for prediction, y in zip(predictions, Y)
        ]
        return np.mean(mses)
Esempio n. 5
0
class Dense:
    """Fully connected layer holding weights, biases and local gradients."""

    # Definition and initialization of the Dense layer
    def __init__(self, num_neurons, input_shape):
        """Create the layer and announce it on stdout.

        Args:
            num_neurons: Number of neurons; sizes the biases, potentials,
                outputs and local gradients.
            input_shape: Shape passed to NumPy for the weight array.
        """
        print(
            'Adding Layer: input_shape: {}, number of neurons: {}, output_shape: {}'
            .format(input_shape, num_neurons, num_neurons))
        # Weights start in the interval [0, 1) for the respective synaptic inputs
        self.weights = np.random.uniform(low=0, high=1, size=input_shape)

        # Biases start at 1 for every neuron in this layer
        self.biases = np.ones(num_neurons)

        # Activation potentials start at 0 for every neuron in this layer
        self.activation_potentials = np.zeros(num_neurons)

        # Outputs of this layer
        self.outputs = np.zeros(num_neurons)

        # Local gradients of all the neurons in this layer
        self.local_gradients = np.zeros(num_neurons)

        # The activation function, for non-linearity of outputs
        self.activation = Activation()

        # Inputs to this layer -> outputs from previous layer
        self.previous_layers_outputs = []
        print('Added Layer ... ')

    def get_gradients(self):
        """Return the local gradients of this layer."""
        return self.local_gradients

    def get_weights(self):
        """Return the weights of this layer."""
        return self.weights

    # The activation potential vector calculator for a given layer
    def activation_potential(self, inputs):
        """Store ``inputs . weights + biases`` as the activation potentials."""
        self.activation_potentials = np.dot(inputs, self.weights) + self.biases

    # Local gradient
    def local_gradient(self, error_at_end, layer, network, next_gradients,
                       next_weights):
        """Compute and store the local gradients of this layer.

        Args:
            error_at_end: Error at the network output; used when ``layer``
                is the last entry of ``network``.
            layer: The layer being processed.
            network: Sequence of layers; its last entry is the output layer.
            next_gradients: Local gradients of the following layer.
            next_weights: Weights of the following layer.
        """
        derivative = self.activation.activate_derivative(
            self.activation_potentials)
        if layer == network[-1]:
            # Output layer = error * derivative of activation function
            # NOTE(review): np.dot on two 1-D vectors yields a scalar; an
            # element-wise product may be intended - confirm the shapes.
            self.local_gradients = np.dot(error_at_end, derivative)
        else:
            # Hidden layers = derivative of activation function * sum of all
            # (local gradient of next layer * weights going from this neuron
            # to all neurons in next layer)
            # NOTE(review): same np.dot vs element-wise question as above.
            self.local_gradients = np.dot(
                derivative, np.dot(next_gradients, next_weights))

    # Forward signal definition
    def forward_signal(self, inputs):
        """Propagate ``inputs`` through the layer and return its outputs."""
        # Calculate activation potentials for all neurons in this layer
        self.activation_potential(inputs)

        # Activate the potentials and keep the result for the next layer
        self.outputs = self.activation.activate(self.activation_potentials)
        return self.outputs

    # Backward signal definition
    def backward_signal(self, error_at_end, layer, network, learning_rate,
                        next_gradients, next_weights, inputs):
        """Refresh the local gradients, then update weights and biases.

        Args:
            error_at_end, layer, network, next_gradients, next_weights:
                Forwarded unchanged to ``local_gradient``.
            learning_rate: Factor applied to both updates.
            inputs: Inputs this layer received in the forward pass.
        """
        # Calculate local_gradients
        self.local_gradient(error_at_end, layer, network, next_gradients,
                            next_weights)

        # Update weights. The terms are added element-wise: np.sum() would
        # treat its second argument as `axis` and raise a TypeError.
        # NOTE(review): for 1-D vectors np.dot gives a scalar; an outer
        # product of inputs and gradients may be intended - confirm.
        self.weights = np.add(
            self.weights, learning_rate * np.dot(self.local_gradients, inputs))
        # Update biases
        self.biases = np.add(self.biases,
                             learning_rate * np.dot(self.local_gradients, 1))
class RNN:
    def __init__(self,
                 input_layer_size,
                 state_layer_size,
                 state_layer_activation,
                 output_layer_size,
                 output_layer_activation,
                 epochs=100,
                 bptt_truncate=None,
                 learning_rule='bptt',
                 tau=None,
                 eta=1e-5,
                 rand=None,
                 verbose=0):
        """
        Notes:
            U - weight matrix from input into hidden layer.
            W - weight matrix from hidden layer to hidden layer.
            V - weight matrix from hidden layer to output layer.

        Inputs:
            input_size:
                Size of the input vector. We expect a 2D numpy array, so this should be X.shape[1]

            state_layer_size:
                State layer size.

            state_layer_activation:
                A string. Refer to activation.py

            output_size:
                Size of the output vector. We expect a 2D numpy array, so this should be Y.shape[1]

            output_layer_activation:
                A string. Refer to activation.py

            epochs(opt):
                Number of epochs for a single training sample.

            learning_rule(opt):
                Choose between 'bptt','fa' or 'modified' 

            bptt_truncate(opt):
                If left at None, back propagation through time will be applied for all time steps. 

                Otherwise, a value for bptt_truncate means that 
                bptt will only be applied for at most bptt_truncate steps.

                Only considered when learning_rule == 'bptt'

            kernel(opt):
                # TODO - fill this
                Only considered when learning_rule == 'modified'

            eta (opt):
                Learning rate. Initialized to 0.001.

            rand (opt):
                Random seed. Initialized to None (no random seed).

            verbose (opt):
                Verbosity: levels 0 - 2

        Outputs:
            None
        """
        np.random.seed(rand)

        self.learning_rule = learning_rule.lower()

        if self.learning_rule == 'bptt':
            self.gradient_function = self.bptt
        elif self.learning_rule == 'fa':
            self.gradient_function = self.feedback_alignment
        elif self.learning_rule == 'modified':
            self.gradient_function = self.modified_learning_rule
        else:
            raise ValueError

        self.input_layer_size = input_layer_size

        self.state_layer_size = state_layer_size
        self.state_layer_activation = state_layer_activation
        self.state_activation = Activation(state_layer_activation)

        self.output_layer_size = output_layer_size
        self.output_layer_activation = output_layer_activation
        self.output_activation = Activation(output_layer_activation)

        self.epochs = epochs

        self.tau = tau
        self.convolutions = None
        if self.tau:
            self.convolutions = np.zeros((state_layer_size, ))
        self.bptt_truncate = bptt_truncate

        # U - weight matrix from input into state layer.
        # W - weight matrix from state layer to state layer.
        # V - weight matrix from state layer to output layer.
        """
        self.U = np.eye(-np.sqrt(1./input_layer_size),
                                    np.sqrt(1./input_layer_size), 
                                    (state_layer_size, input_layer_size))
        self.V = np.random.uniform(-np.sqrt(1./state_layer_size),
                                    np.sqrt(1./state_layer_size),
                                    (output_layer_size, state_layer_size))
        self.W = np.random.uniform(-np.sqrt(1./state_layer_size),
                                    np.sqrt(1./state_layer_size),
                                    (state_layer_size, state_layer_size))
        """
        self.U = np.eye(state_layer_size)
        self.V = np.eye(state_layer_size)

        self.W = np.random.uniform(-np.sqrt(1. / state_layer_size),
                                   np.sqrt(1. / state_layer_size),
                                   (state_layer_size, state_layer_size))

        self.state_bias = np.zeros((state_layer_size, 1))
        self.output_bias = np.zeros((output_layer_size, 1))

        # B - Feedback weight matrix for all layers
        self.B = np.random.uniform(-np.sqrt(1. / state_layer_size),
                                   np.sqrt(1. / state_layer_size),
                                   (state_layer_size, input_layer_size))

        self.eta = eta
        self.verbose = verbose
        self.show_progress_bar = verbose > 0

    def kernel_compute(self, t):
        """
        Exponential decay kernel exp(-t / time_constant) with the time
        constant fixed to 1.
        """
        decay_constant = 1
        return np.exp(-t / decay_constant)

    def fit(self, X_train, y_train):
        """
        Train the recurrent weights W and the state bias.

        Every epoch iterates over the (x, y) pairs, asks
        self.gradient_function for the gradients and takes one step of size
        self.eta. U, V and the output bias are left untouched.

        Inputs:
            X_train:
                Iterable of training input sequences; each one is handed to
                self.gradient_function together with its target.

            y_train:
                Iterable of target sequences, paired with X_train by zip.

        Outputs:
            None
        """
        eta = self.eta

        epochs = range(self.epochs)
        if self.show_progress_bar:
            # Only wrap the epochs when a progress bar was requested; the bar
            # used to be referenced even when it had never been created.
            epochs = ProgressBar()(epochs)

        for epoch in epochs:
            print('epoch {}'.format(epoch))
            if self.convolutions is not None:
                # Restart the running average of state activations.
                self.convolutions *= 0.
            for x, y in zip(X_train, y_train):
                _, _, dLdW, _, dLdSb = self.gradient_function(x, y)
                self.W -= eta * dLdW
                # Scale the bias step by the learning rate as well, like the
                # weight step above.
                self.state_bias -= eta * dLdSb

    def forward_propagation(self, x):
        """
        Inputs:
            x:
                Expect size (T, input_layer_size), where T is the length of time.
        Outputs:
            o:
                The activation of the output layer, size (T, output_layer_size).
            s:
                The activation of the hidden state, size
                (T + 1, state_layer_size). The extra last row stays zero and
                is what s[t - 1] reads at t == 0.
            s_linear:
                Pre-activation values of the state layer, same size as s.
            o_linear:
                Pre-activation values of the output layer, same size as o.

        Side effects:
            When self.convolutions is not None it is updated at every step
            with a running average of the state activations.
        """
        T = x.shape[0]

        s = np.zeros((T + 1, self.state_layer_size))
        o = np.zeros((T, self.output_layer_size))
        s_linear = np.zeros((T + 1, self.state_layer_size))
        o_linear = np.zeros((T, self.output_layer_size))

        state_bias = Convert2DTo1D(self.state_bias)
        output_bias = Convert2DTo1D(self.output_bias)

        for t in range(T):
            # At t == 0 the index -1 selects the zero-filled extra row.
            state_linear = (np.dot(self.U, x[t]) + np.dot(self.W, s[t - 1]) +
                            state_bias)
            s_linear[t] = state_linear
            s[t] = self.state_activation.activate(state_linear)

            output_linear = np.dot(self.V, s[t]) + output_bias
            o[t] = self.output_activation.activate(output_linear)
            o_linear[t] = output_linear

            if self.convolutions is not None:
                # Force float division: with an integer tau, 1 / tau is 0
                # under Python 2 and the average would never move.
                alpha = 1.0 / self.tau
                if all(self.convolutions == 0):
                    self.convolutions = s[t]
                self.convolutions = ((1 - alpha) * self.convolutions +
                                     alpha * s[t])
        return (o, s, s_linear, o_linear)

    def modified_learning_rule(self, x, y):
        """
            Gradient estimate that sends the output error through the
            feedback matrix B and pairs it with the running average of
            state activations (self.convolutions) left by the forward pass.

            Only dLdW is accumulated; the other gradients are returned as
            zeros of the matching shape.

            Output:
                dLdU:
                    Gradient for U matrix (zeros)
                dLdV:
                    Gradient for V matrix (zeros)
                dLdW:
                    Gradient for W matrix, averaged over the T time steps
                dLdOb:
                    Gradient for output layer bias (zeros)
                dLdSb:
                    Gradient for state layer bias (zeros)
        """
        T = len(y)
        assert T == len(x)

        o, _, _, o_linear = self.forward_propagation(x)

        # The running average only exists when tau was given; check once
        # instead of on every time step.
        assert self.convolutions is not None, (
            "modified_learning_rule needs tau to be set")
        # Not modified inside the loop, so convert it a single time.
        averaged_state = Convert1DTo2D(self.convolutions).T

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)

        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer
            e = Convert1DTo2D(delta_o[t])
            e = e * self.output_activation.dactivate(
                Convert1DTo2D(o_linear[t]))
            e = np.dot(self.V.T, e)

            dLdW += self.B.dot(e).dot(averaged_state)
        return [dLdU, dLdV, dLdW / T, dLdOb, dLdSb]

    #raise NotImplementedError

    def feedback_alignment(self, x, y):
        """
            Back propagation through time in which the error is carried
            backwards through the fixed random matrix B instead of W.

            Output:
                dLdU:
                    Gradient for U matrix
                dLdV:
                    Gradient for V matrix
                dLdW:
                    Gradient for W matrix
                dLdOb:
                    Gradient for output layer bias
                dLdSb:
                    Gradient for state layer bias
        """
        T = len(y)
        assert T == len(x)

        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate

        o, s, s_linear, o_linear = self.forward_propagation(x)

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)

        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)

        # Number of output-layer updates (one per time step) and of
        # state-layer updates (one per unrolled step).
        num_dU_additions = 0
        num_dVdW_additions = 0

        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer
            g = Convert1DTo2D(delta_o[t])
            g = g * self.output_activation.dactivate(
                Convert1DTo2D(o_linear[t]))
            dLdV += np.dot(g, Convert1DTo2D(s[t]).T)
            dLdOb += g
            num_dU_additions += 1
            g = np.dot(self.V.T, g)

            # Backpropagation through time for at most bptt truncate steps
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                g = g * self.state_activation.dactivate(
                    Convert1DTo2D(s_linear[bptt_step]))
                # s[-1] is the zero initial state, so bptt_step == 0 is safe.
                dLdW += np.dot(g, Convert1DTo2D(s[bptt_step - 1]).T)
                # The input that produced this state is x[bptt_step], not x[t].
                dLdU += np.dot(g, Convert1DTo2D(x[bptt_step]).T)
                dLdSb += g
                num_dVdW_additions += 1

                # Carry the error one step back through the feedback matrix.
                # It must replace g; multiplying it into g squared the signal.
                g = np.dot(self.B.T, g)

        # NOTE(review): divisors are kept as they were. dLdU is accumulated
        # once per unrolled step but divided by the per-time-step count, and
        # dLdV the other way round - confirm the intended scaling.
        return [
            dLdU / num_dU_additions, dLdV / num_dVdW_additions,
            dLdW / num_dVdW_additions, dLdOb / num_dU_additions,
            dLdSb / num_dVdW_additions
        ]

    def bptt(self, x, y):
        """
            (Truncated) back propagation through time.

            Output:
                dLdU:
                    Gradient for U matrix
                dLdV:
                    Gradient for V matrix
                dLdW:
                    Gradient for W matrix
                dLdOb:
                    Gradient for output layer bias
                dLdSb:
                    Gradient for state layer bias
        """
        T = len(y)
        assert T == len(x)

        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate

        o, s, s_linear, o_linear = self.forward_propagation(x)

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)

        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)

        # Number of output-layer updates (one per time step) and of
        # state-layer updates (one per unrolled step).
        num_dU_additions = 0
        num_dVdW_additions = 0

        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer
            g = Convert1DTo2D(delta_o[t])
            g = g * self.output_activation.dactivate(
                Convert1DTo2D(o_linear[t]))
            dLdV += np.dot(g, Convert1DTo2D(s[t]).T)
            dLdOb += g
            num_dU_additions += 1
            g = np.dot(self.V.T, g)

            # Backpropagation through time for at most bptt truncate steps
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                g = g * self.state_activation.dactivate(
                    Convert1DTo2D(s_linear[bptt_step]))
                # s[-1] is the zero initial state, so bptt_step == 0 is safe.
                dLdW += np.dot(g, Convert1DTo2D(s[bptt_step - 1]).T)
                # The input that produced this state is x[bptt_step], not x[t].
                dLdU += np.dot(g, Convert1DTo2D(x[bptt_step]).T)
                dLdSb += g
                num_dVdW_additions += 1

                # Chain rule through the recurrent connection: the error at
                # the previous step is W^T g. It must replace g; multiplying
                # it into g squared the signal.
                g = np.dot(self.W.T, g)

        # NOTE(review): divisors are kept as they were. dLdU is accumulated
        # once per unrolled step but divided by the per-time-step count, and
        # dLdV the other way round - confirm the intended scaling.
        return [
            dLdU / num_dU_additions, dLdV / num_dVdW_additions,
            dLdW / num_dVdW_additions, dLdOb / num_dU_additions,
            dLdSb / num_dVdW_additions
        ]

    def predict(self, X):
        """
        Inputs:
            X:
                Iterable of input sequences; each one is passed to
                self.forward_propagation.

        Outputs:
            List with the output-layer activations of every sequence, in the
            order of X.
        """
        return [o for o, _, _, _ in map(self.forward_propagation, X)]

    def score(self, X, Y):
        """
        Inputs:
            X:
                Iterable of input sequences, forwarded to self.predict.

            Y:
                Iterable of target sequences, paired with the predictions
                by zip.

        Outputs:
            Mean over all sequences of the per-sequence mean squared error.
        """
        predictions = self.predict(X)
        # Compare every prediction with its own target; the whole list of
        # predictions used to be subtracted from each target instead.
        mses = [
            np.mean((prediction - y)**2)
            for prediction, y in zip(predictions, Y)
        ]
        return np.mean(mses)
Esempio n. 7
0
class RNN(object):
    def __init__(self,
                 input_layer_size,
                 state_layer_size,
                 state_layer_activation,
                 output_layer_size,
                 output_layer_activation,
                 epochs=100,
                 bptt_truncate=None,
                 learning_rule='bptt',
                 tau=None,
                 eta=0.001,
                 rand=None,
                 verbose=0):
        """
        Notes:
            U - weight matrix from input into hidden layer.
            W - weight matrix from hidden layer to hidden layer.
            V - weight matrix from hidden layer to output layer.
        Inputs:
            input_size:
                Size of the input vector. We expect a 2D numpy array, so this should be X.shape[1]
            state_layer_size:
                State layer size.
            state_layer_activation:
                A string. Refer to activation.py
            output_size:
                Size of the output vector. We expect a 2D numpy array, so this should be Y.shape[1]
            output_layer_activation:
                A string. Refer to activation.py
            epochs(opt):
                Number of epochs for a single training sample.
            learning_rule(opt):
                Choose between 'bptt','fa', 'dfa' or 'modified' 

            bptt_truncate(opt):
                If left at None, back propagation through time will be applied for all time steps. 
                Otherwise, a value for bptt_truncate means that 
                bptt will only be applied for at most bptt_truncate steps.
                Only considered when learning_rule == 'bptt'
            kernel(opt):
                # TODO - fill this
                Only considered when learning_rule == 'modified'
            eta (opt):
                Learning rate. Initialized to 0.001.
            rand (opt):
                Random seed. Initialized to None (no random seed).
            verbose (opt):
                Verbosity: levels 0 - 2
        Outputs:
            None
        """
        np.random.seed(rand)

        self.learning_rule = learning_rule.lower()

        if self.learning_rule == 'bptt':
            self.gradient_function = self.bptt
        elif self.learning_rule == 'fa':
            self.gradient_function = self.feedback_alignment
        elif self.learning_rule == 'dfa':
            self.gradient_function = self.direct_feedback_alignment
        elif self.learning_rule == 'modified':
            self.gradient_function = self.modified_learning_rule
        else:
            raise ValueError

        self.input_layer_size = input_layer_size

        self.state_layer_size = state_layer_size
        self.state_layer_activation = state_layer_activation
        self.state_activation = Activation(state_layer_activation)

        self.output_layer_size = output_layer_size
        self.output_layer_activation = output_layer_activation
        self.output_activation = Activation(output_layer_activation)

        self.epochs = epochs

        self.tau = tau
        self.bptt_truncate = bptt_truncate

        self.kernel_convs = None

        # U - weight matrix from input into state layer.
        # W - weight matrix from state layer to state layer.
        # V - weight matrix from state layer to output layer.
        """
        if self.learning_rule == 'bptt':
            self.U = np.random.uniform(-np.sqrt(1./input_layer_size),
                                        np.sqrt(1./input_layer_size), 
                                        (state_layer_size, input_layer_size))
            self.V = np.random.uniform(-np.sqrt(1./state_layer_size),
                                        np.sqrt(1./state_layer_size),
                                        (output_layer_size, state_layer_size))
            self.W = np.random.uniform(-np.sqrt(1./state_layer_size),
                                        np.sqrt(1./state_layer_size),
                                        (state_layer_size, state_layer_size))
            else:
        """
        if state_layer_size == input_layer_size and state_layer_size == output_layer_size:
            print "Using identity matrices for U and V"
            self.U = np.eye(state_layer_size)
            self.V = np.eye(state_layer_size)
        else:
            self.U = np.random.uniform(1, 2.,
                                       (state_layer_size, input_layer_size))
            self.V = np.random.uniform(1, 2.,
                                       (output_layer_size, state_layer_size))

        self.W = np.random.uniform(-0.5, 0.5,
                                   (state_layer_size, state_layer_size))
        # see if W matrix randomization is the cause
        #self.W = np.random.rand(2, 2) - 1/2#np.array([[0.51940038, -0.57702151],[0.64065148, 0.31259335]])
        #self.W = np.array([[0.51940038, -0.57702151],[0.64065148, 0.31259335]])

        self.state_bias = np.zeros((state_layer_size, 1))
        self.output_bias = np.zeros((output_layer_size, 1))

        # B - Feedback weight matrix for all layers
        """
        self.B = np.random.uniform(-np.sqrt(1./state_layer_size),
                                    np.sqrt(1./state_layer_size), 
                                    (state_layer_size, input_layer_size))
                                    """
        self.B = np.random.uniform(0., 0.5, self.W.shape)

        self.eta = eta
        self.verbose = verbose
        self.show_progress_bar = verbose > 0

    def kernel_compute(self, t):
        """
        Exponential decay kernel exp(-t / tau).

        Inputs:
            t:
                Elapsed time (scalar or numpy array).
        Outputs:
            exp(-t / self.tau)
        """
        # True division keeps integer t and tau from being floor-divided
        # under Python 2.
        return np.exp(np.true_divide(-t, self.tau))

    def eWBe(self, x, y):
        """
        Evaluate e^T . W . B . e for the output error e = o - y of every
        time step.

        Outputs:
            List with one value per time step, ordered from the last step
            to the first.
        """
        o, _, _, _ = self.forward_propagation(x)
        delta_o = o - y

        return [
            np.dot(np.dot(np.dot(delta_o[t].T, self.W), self.B), delta_o[t])
            for t in range(len(x) - 1, -1, -1)
        ]

    def fit(self, X, y, validation_size=0.1):
        """
        Train W and the state bias, holding out part of the data for
        validation.

        Inputs:
            X:
                Training input sequences; converted with np.array before
                being split.
            y:
                Training target sequences; converted with np.array before
                being split.
            validation_size (opt):
                Passed to train_test_split as test_size.
        Outputs:
            (training_losses, validation_losses): two lists with one score
            per epoch, each measured before that epoch's updates.
        """
        step_size = self.eta

        X_train, X_test, y_train, y_test = train_test_split(
            np.array(X), np.array(y), test_size=validation_size,
            random_state=0)
        if self.verbose:
            print("Validation size: {0}".format(validation_size))
            print("Training on {0} samples".format(len(X_train)))

        training_losses, validation_losses = [], []

        # Non-online
        if self.show_progress_bar:
            bar = ProgressBar(max_value=len(X_train))

        for epoch in range(self.epochs):
            training_loss = self.score(X_train, y_train)
            validation_loss = self.score(X_test, y_test)
            training_losses.append(training_loss)
            validation_losses.append(validation_loss)

            if self.verbose == 2:
                report = (
                    "--------",
                    "Epoch {0}/{1}".format(epoch, self.epochs),
                    "Training loss: {0}".format(training_loss),
                    "Validation loss: {0}".format(validation_loss),
                    "--------",
                )
                for line in report:
                    print(line)

            for index, (sample, target) in enumerate(zip(X_train, y_train)):
                # Only W and the state bias are trained; the gradients for
                # U, V and the output bias are discarded.
                _, _, dLdW, _, dLdSb = self.gradient_function(sample, target)
                self.W -= step_size * dLdW
                self.state_bias -= step_size * dLdSb
                if self.show_progress_bar:
                    bar.update(index)
            if self.show_progress_bar:
                # Rewind the bar for the next epoch.
                bar.update(0)

        return training_losses, validation_losses

    def forward_propagation(self, x):
        """
        Inputs:
            x:
                Expect size (T, input_layer_size), where T is the length of time.
        Outputs:
            o:
                The activation of the output layer, size (T, output_layer_size).
            s:
                The activation of the hidden state, size
                (T + 1, state_layer_size). The extra last row stays zero and
                is what s[t - 1] reads at t == 0.
            s_linear:
                Pre-activation values of the state layer, same size as s.
            o_linear:
                Pre-activation values of the output layer, same size as o.

        Side effects:
            With the 'modified' learning rule, self.kernel_convs is reset to
            a (state_layer_size, T) array and filled, from t == 1 onwards,
            with a running average of the state activations.
        """
        use_kernel = self.learning_rule == 'modified'
        if use_kernel:
            self.kernel_convs = np.zeros((self.state_layer_size, x.shape[0]))

        T = x.shape[0]

        s = np.zeros((T + 1, self.state_layer_size))
        o = np.zeros((T, self.output_layer_size))
        s_linear = np.zeros((T + 1, self.state_layer_size))
        o_linear = np.zeros((T, self.output_layer_size))

        state_bias = Convert2DTo1D(self.state_bias)
        output_bias = Convert2DTo1D(self.output_bias)

        for t in range(T):
            # At t == 0 the index -1 selects the zero-filled extra row.
            state_linear = (np.dot(self.U, x[t]) + np.dot(self.W, s[t - 1]) +
                            state_bias)
            s_linear[t] = state_linear
            s[t] = self.state_activation.activate(state_linear)

            if use_kernel and t > 0:
                # Force float division: with an integer tau, 1 / tau is 0
                # under Python 2 and the average would stay at zero.
                alpha = 1.0 / self.tau
                self.kernel_convs[:, t] = (
                    alpha * s[t] + (1 - alpha) * self.kernel_convs[:, t - 1])

            output_linear = np.dot(self.V, s[t]) + output_bias
            o[t] = self.output_activation.activate(output_linear)
            o_linear[t] = output_linear
        return (o, s, s_linear, o_linear)

    def modified_learning_rule(self, x, y):
        """
            Gradient estimate that projects the output error back through V
            and the feedback matrix B, and pairs it with the running average
            of state activations (self.kernel_convs) left by the forward
            pass.

            Output:
                dLdU:
                    Gradient for U matrix (zeros)
                dLdV:
                    Gradient for V matrix (zeros)
                dLdW:
                    Gradient for W matrix, averaged over the T time steps
                dLdOb:
                    Gradient for output layer bias (zeros)
                dLdSb:
                    Gradient for state layer bias, averaged over the T time
                    steps
        """
        T = len(y)
        assert T == len(x)

        o, _, _, _ = self.forward_propagation(x)

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)

        num_dW_additions = 0
        delta_o = o - y

        for t in reversed(range(T)):
            # Get the error at the output layer
            e = Convert1DTo2D(self.V.T.dot(delta_o[t]))

            # Both gradients use the same fed-back error; compute it once.
            feedback = self.B.dot(e)
            dLdW += feedback.dot(Convert1DTo2D(self.kernel_convs[:, t]).T)
            dLdSb += feedback
            num_dW_additions += 1
        return [
            dLdU, dLdV, dLdW / num_dW_additions, dLdOb,
            dLdSb / num_dW_additions
        ]

    def direct_feedback_alignment(self, x, y):
        """
            Direct feedback alignment: the output error of a time step is
            sent through B straight to every unrolled state step, without
            being chained from one step to the previous one.

            Only dLdW and dLdSb are accumulated; every returned gradient is
            divided by the number of time steps T.

            Output:
                dLdU:
                    Gradient for U matrix (zeros)
                dLdV:
                    Gradient for V matrix (zeros)
                dLdW:
                    Gradient for W matrix
                dLdOb:
                    Gradient for output layer bias (zeros)
                dLdSb:
                    Gradient for state layer bias
        """
        T = len(y)
        assert T == len(x)

        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate

        o, s, s_linear, _ = self.forward_propagation(x)

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)

        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)

        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer
            e = Convert1DTo2D(self.V.T.dot(delta_o[t]))
            # Identical for every unrolled step of this t, so compute it once.
            feedback = self.B.dot(e)

            # Backpropagation through time for at most bptt truncate steps
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                g = feedback * self.state_activation.dactivate(
                    Convert1DTo2D(s_linear[bptt_step]))
                # s[-1] is the zero initial state, so bptt_step == 0 is safe.
                dLdW += np.dot(g, Convert1DTo2D(s[bptt_step - 1]).T)
                dLdSb += g

        # Every gradient is normalised by the number of time steps.
        return [dLdU / T, dLdV / T, dLdW / T, dLdOb / T, dLdSb / T]

    def feedback_alignment(self, x, y):
        """
            Back propagation through time in which the error is carried
            backwards through the fixed random matrix B instead of W.

            Only dLdW and dLdSb are accumulated; every returned gradient is
            divided by the number of time steps T.

            Output:
                dLdU:
                    Gradient for U matrix (zeros)
                dLdV:
                    Gradient for V matrix (zeros)
                dLdW:
                    Gradient for W matrix
                dLdOb:
                    Gradient for output layer bias (zeros)
                dLdSb:
                    Gradient for state layer bias
        """
        T = len(y)
        assert T == len(x)

        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate

        o, s, s_linear, _ = self.forward_propagation(x)

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)

        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)

        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer
            g = Convert1DTo2D(self.V.T.dot(delta_o[t]))

            # Backpropagation through time for at most bptt truncate steps
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                g = g * self.state_activation.dactivate(
                    Convert1DTo2D(s_linear[bptt_step]))
                # s[-1] is the zero initial state, so bptt_step == 0 is safe.
                dLdW += np.dot(g, Convert1DTo2D(s[bptt_step - 1]).T)
                dLdSb += g

                # Carry the error one step back through the feedback matrix.
                g = np.dot(self.B, g)

        # Every gradient is normalised by the number of time steps.
        return [dLdU / T, dLdV / T, dLdW / T, dLdOb / T, dLdSb / T]

        # online version
        #def bptt(self, x, y):
        """
            Output:
                dLdU:
                    Gradient for U matrix
                dLdV:
                    Gradient for V matrix
                dLdW:
                    Gradient for W matrix
                dLdOb:
                    Gradient for output layer bias
                dLdSb:
                    Gradient for state layer bias
        """
        """
        # TODO - numpy likes to provide 1D matrices instead of 2D, and unfortunately
        # we need 2D matrices. Therefore we have a lot of converting 1D to 2D matrices
        # and we might want to clean that later somehow...
        # TODO - also this can probably be cleaned more.
        t = len(y)
        assert t == len(x)
        
        if self.bptt_truncate is None:
            bptt_truncate = t
        else:
            bptt_truncate = self.bptt_truncate
        o, s, s_linear, o_linear = self.forward_propagation(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)
        num_dU_additions = 0
        num_dVdW_additions = 0
        delta_o = o - y
        # Backprop the error at the output layer
        g = delta_o[t - 1]
        o_linear_val = o_linear[t - 1]
        state_activation = s[t - 1]
        g = Convert1DTo2D(g)
        o_linear_val = Convert1DTo2D(o_linear_val)
        state_activation = Convert1DTo2D(state_activation)
        g = g * self.output_activation.dactivate(o_linear_val)
        dLdV += np.dot(g, state_activation.T)
        dLdOb += g
        num_dU_additions += 1
        g = np.dot(self.V.T, g)
        # Backpropagation through time for at most bptt truncate steps
        for bptt_step in reversed(range(max(0, t - bptt_truncate),  t + 1)):
            state_linear = s_linear[bptt_step]
            state_activation_prev = s[bptt_step - 1]
            x_present = x[t - 1]
            
            state_linear = Convert1DTo2D(state_linear)
            state_activation_prev = Convert1DTo2D(state_activation_prev)
            x_present = Convert1DTo2D(x_present)
            g = g  * self.state_activation.dactivate(state_linear)
            dLdW += np.dot(g, state_activation_prev.T)
            dLdU += np.dot(g, x_present.T)
            dLdSb += g
            num_dVdW_additions += 1
            g = g * np.dot(self.W.T, g)
        return [dLdU/num_dU_additions, 
                dLdV/num_dVdW_additions, 
                dLdW/num_dVdW_additions, 
                dLdOb/num_dU_additions, 
                dLdSb/num_dVdW_additions]
    """

    # Non-online version
    def bptt(self, x, y):
        """
        Compute averaged gradients with (truncated) backpropagation through time.

        A forward pass is run over the whole sequence. For every time step t,
        visited in reverse, the output error ``o[t] - y[t]`` is scaled by the
        output activation derivative, accumulated into dLdV / dLdOb, pushed
        through ``self.V.T`` and then unrolled through at most
        ``self.bptt_truncate`` earlier steps (the whole sequence when it is
        None), accumulating dLdW, dLdU and dLdSb and propagating through
        ``self.W.T``.

        Inputs:
            x:
                Input sequence. Must have the same length as y.
            y:
                Target sequence. Subtracted from the forward-pass outputs.
        Output:
            List [dLdU, dLdV, dLdW, dLdOb, dLdSb], every entry divided by the
            sequence length T:
                dLdU:
                    Gradient for U matrix
                dLdV:
                    Gradient for V matrix
                dLdW:
                    Gradient for W matrix
                dLdOb:
                    Gradient for output layer bias
                dLdSb:
                    Gradient for state layer bias
        Raises:
            AssertionError if x and y differ in length.
        """
        # TODO - numpy likes to provide 1D matrices instead of 2D, and
        # unfortunately we need 2D matrices. Therefore we have a lot of
        # converting 1D to 2D matrices and we might want to clean that later.
        T = len(y)
        assert T == len(x)

        if self.bptt_truncate is None:
            bptt_truncate = T
        else:
            bptt_truncate = self.bptt_truncate

        o, s, s_linear, o_linear = self.forward_propagation(x)

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)

        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)

        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer.
            g = Convert1DTo2D(delta_o[t])
            o_linear_val = Convert1DTo2D(o_linear[t])
            state_activation = Convert1DTo2D(s[t])

            g = g * self.output_activation.dactivate(o_linear_val)
            dLdV += np.dot(g, state_activation.T)
            dLdOb += g
            g = np.dot(self.V.T, g)

            # Backpropagation through time for at most bptt_truncate steps.
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                # NOTE(review): for bptt_step == 0 this reads s[-1]; presumably
                # forward_propagation stores the initial state there - confirm.
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])
                # The state at bptt_step was driven by the input at that same
                # step, so the U gradient must use x[bptt_step], not x[t].
                x_present = Convert1DTo2D(x[bptt_step])

                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdU += np.dot(g, x_present.T)
                dLdSb += g

                # Move the error one step further back in time.
                g = np.dot(self.W.T, g)

        # Every accumulator is averaged over the T time steps.
        return [dLdU / T, dLdV / T, dLdW / T, dLdOb / T, dLdSb / T]

    def predict(self, X):
        """
        Run the forward pass on every input sequence and collect the outputs.

        Inputs:
            X:
                Iterable of input sequences; each element is handed to
                forward_propagation unchanged. (Presumably numpy arrays of
                size (input_layer_size, N), N being the number of samples -
                confirm against forward_propagation.)
        Outputs:
            List holding, for each element of X and in the same order, the
            first of the four values returned by forward_propagation.
        """
        # forward_propagation yields four values; only the first is kept.
        return [
            output
            for output, _, _, _ in map(self.forward_propagation, X)
        ]

    def score(self, X, Y):
        """
        Average the mean squared error of the predictions over all sequences.

        Inputs:
            X:
                Iterable of input sequences, forwarded to predict.
                (Presumably numpy arrays of size (input_layer_size, N),
                N being the number of samples - confirm.)
            Y:
                Iterable of target sequences paired element-wise with the
                predictions for X. (Presumably numpy arrays of size
                (output_layer_size, N) - confirm.)
        Outputs:
            Mean over all (prediction, target) pairs of their
            mean_squared_error.
        """
        per_sequence_errors = [
            mean_squared_error(predicted, target)
            for predicted, target in zip(self.predict(X), Y)
        ]
        return np.mean(per_sequence_errors)