def affine_backward(dout, cache):
    """
    Computes the backward pass for an affine layer.

    Inputs:
    - dout: Upstream derivative, of shape (N, M)
    - cache: Tuple of:
      - x: Input data, of shape (N, d_1, ..., d_k)
      - w: Weights, of shape (D, M)
      - b: Biases, of shape (M,)

    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, d_1, ..., d_k)
    - dw: Gradient with respect to w, of shape (D, M)
    - db: Gradient with respect to b, of shape (M,)
    """
    x, w, b = cache
    # Flatten each example to a row vector of dimension D = d_1 * ... * d_k.
    x_plain = np.reshape(x, (x.shape[0], -1))
    # Bias gradient: sum the upstream derivative over the batch.
    db = np.sum(dout, axis=0)
    # Input gradient: propagate through w, then restore the original input shape.
    dx_plain = np.dot(dout, np.transpose(w))
    dx = np.reshape(dx_plain, x.shape)
    # Weight gradient.
    dw = np.dot(np.transpose(x_plain), dout)
    return dx, dw, db
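# A minimal shape sketch for affine_backward (illustration only; assumes plain
# NumPy). The cache tuple is built by hand here, exactly as the corresponding
# forward pass would store it.
import numpy as np

x = np.random.randn(4, 3, 5)      # N = 4 examples, flattened dimension D = 15
w = np.random.randn(15, 7)        # (D, M)
b = np.random.randn(7)            # (M,)
dout = np.random.randn(4, 7)      # upstream gradient, shape (N, M)
dx, dw, db = affine_backward(dout, (x, w, b))
assert dx.shape == x.shape and dw.shape == w.shape and db.shape == b.shape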
def forward(self, X):
    """Forward pass to obtain the action probabilities for each observation in `X`."""
    a = np.dot(self.params['w1'], X.T)
    h = np.maximum(0, a)
    logits = np.dot(h.T, self.params['w2'].T)
    p = 1.0 / (1.0 + np.exp(-logits))
    return p
def lstm_step(x, prev_h, prev_c, Wx, Wh, b):
    """
    Forward pass for a single timestep of an LSTM.

    The input data has dimension D, the hidden state has dimension H, and we
    use a minibatch size of N.

    Inputs:
    - x: Input data, of shape (N, D)
    - prev_h: Previous hidden state, of shape (N, H)
    - prev_c: Previous cell state, of shape (N, H)
    - Wx: Input-to-hidden weights, of shape (D, 4H)
    - Wh: Hidden-to-hidden weights, of shape (H, 4H)
    - b: Biases, of shape (4H,)

    Returns a tuple of:
    - next_h: Next hidden state, of shape (N, H)
    - next_c: Next cell state, of shape (N, H)
    """
    N, H = prev_c.shape
    # 1. activation vector
    a = np.dot(x, Wx) + np.dot(prev_h, Wh) + b
    # 2. gate functions: input, forget, output gates and candidate cell update
    i = sigmoid(a[:, 0:H])
    f = sigmoid(a[:, H:2 * H])
    o = sigmoid(a[:, 2 * H:3 * H])
    g = np.tanh(a[:, 3 * H:4 * H])
    # 3. next cell and hidden states
    next_c = f * prev_c + i * g
    next_h = o * np.tanh(next_c)
    return next_h, next_c
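# A minimal usage sketch for lstm_step (illustration only; assumes plain NumPy
# and the same module-level `sigmoid` helper the function relies on, e.g.
# sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))).
import numpy as np

N, D, H = 3, 10, 4
x = np.random.randn(N, D)
prev_h = np.random.randn(N, H)
prev_c = np.random.randn(N, H)
Wx = np.random.randn(D, 4 * H)
Wh = np.random.randn(H, 4 * H)
b = np.zeros(4 * H)
next_h, next_c = lstm_step(x, prev_h, prev_c, Wx, Wh, b)
# Both outputs have shape (N, H) = (3, 4).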
def gru_step(x, prev_h, Wx, Wh, b, Wxh, Whh, bh):
    """
    Forward pass for a single timestep of a GRU.

    The input data has dimension D, the hidden state has dimension H, and we
    use a minibatch size of N.

    Parameters
    ----------
    x : Input data, of shape (N, D)
    prev_h : Previous hidden state, of shape (N, H)
    Wx : Input-to-hidden weights for r and z gates, of shape (D, 2H)
    Wh : Hidden-to-hidden weights for r and z gates, of shape (H, 2H)
    b : Biases for r and z gates, of shape (2H,)
    Wxh : Input-to-hidden weights for h', of shape (D, H)
    Whh : Hidden-to-hidden weights for h', of shape (H, H)
    bh : Biases for h', of shape (H,)

    Returns
    -------
    next_h : Next hidden state, of shape (N, H)

    Notes
    -----
    Implementation follows
    http://jmlr.org/proceedings/papers/v37/jozefowicz15.pdf
    """
    N, H = prev_h.shape
    # Reset (r) and update (z) gates, computed with one fused affine transform.
    a = sigmoid(np.dot(x, Wx) + np.dot(prev_h, Wh) + b)
    r = a[:, 0:H]
    z = a[:, H:2 * H]
    # Candidate hidden state h' uses the reset-gated previous state.
    h_m = np.tanh(np.dot(x, Wxh) + np.dot(r * prev_h, Whh) + bh)
    # Interpolate between the previous state and the candidate.
    next_h = z * prev_h + (1 - z) * h_m
    return next_h
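# A minimal usage sketch for gru_step (illustration only; assumes plain NumPy
# and a module-level `sigmoid` helper). Note the split parameterisation:
# Wx/Wh/b drive the r and z gates, while Wxh/Whh/bh drive the candidate h'.
import numpy as np

N, D, H = 3, 10, 4
x = np.random.randn(N, D)
prev_h = np.random.randn(N, H)
Wx, Wh, b = np.random.randn(D, 2 * H), np.random.randn(H, 2 * H), np.zeros(2 * H)
Wxh, Whh, bh = np.random.randn(D, H), np.random.randn(H, H), np.zeros(H)
next_h = gru_step(x, prev_h, Wx, Wh, b, Wxh, Whh, bh)   # shape (N, H)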
def test_cpu_gpu(n, s):
    # Time an s x s matrix multiply on CPU and GPU, averaged over n iterations.
    # n = 10
    # s = 512
    with cpu():
        x_cpu = _randn(s, s)
        y_cpu = _randn(s, s)
        # Warm-up runs; asnumpy() forces the asynchronous computation to finish.
        for i in range(10):
            z_cpu = np.dot(x_cpu, y_cpu)
        z_cpu.asnumpy()

        t0 = time.time()
        for i in range(n):
            z_cpu = np.dot(x_cpu, y_cpu)
        z_cpu.asnumpy()
        t1 = time.time()
    all_cpu_time = t1 - t0

    with gpu(0):
        x_gpu0 = _randn(s, s)
        y_gpu0 = _randn(s, s)
        # Warm-up runs on the GPU as well.
        for i in range(10):
            z_gpu0 = np.dot(x_gpu0, y_gpu0)
        z_gpu0.asnumpy()

        t2 = time.time()
        for i in range(n):
            z_gpu0 = np.dot(x_gpu0, y_gpu0)
        z_gpu0.asnumpy()
        t3 = time.time()
    all_gpu_time = t3 - t2

    print("run on cpu: %.6f s/iter" % (all_cpu_time / n))
    print("run on gpu: %.6f s/iter" % (all_gpu_time / n))
    print("%s cpu_time/gpu_time: %.6f" % (s, all_cpu_time / all_gpu_time))
def test_context():
    set_context(gpu(1))  # set the global context as gpu(1)

    def sigmoid(x):
        return 0.5 * (np.tanh(x / 2) + 1)

    def predict(weights, inputs):
        return sigmoid(np.dot(inputs, weights))

    def training_loss(weights, inputs):
        preds = predict(weights, inputs)
        label_probabilities = preds * targets + (1 - preds) * (1 - targets)
        l = -np.sum(np.log(label_probabilities))
        return l

    def training_accuracy(weights, inputs):
        preds = predict(weights, inputs)
        error = np.count_nonzero(
            np.argmax(preds, axis=1) - np.argmax(targets, axis=1))
        return (256 - error) * 100 / 256.0

    with gpu(0):
        xshape = (256, 500)
        wshape = (500, 250)
        tshape = (256, 250)
        inputs = random.rand(*xshape) - 0.5
        targets = np.zeros(tshape)
        truth = random.randint(0, 250, 256)
        targets[np.arange(256), truth] = 1
        weights = random.rand(*wshape) - 0.5

        training_gradient_fun = grad(training_loss)

        for i in range(20):
            print('Trained loss accuracy #{}: {}%'.format(
                i, training_accuracy(weights, inputs)))
            gr = training_gradient_fun(weights, inputs)
            weights -= gr * 0.01
        print("\nff and bp on {0}".format(weights.context))

    print("\nexecute on cpu")
    with cpu():
        x_cpu = random.rand(32, 64) - 0.5
        y_cpu = random.rand(64, 32) - 0.5
        z_cpu = np.dot(x_cpu, y_cpu)
        print('z_cpu.context = {0}'.format(z_cpu.context))

    print("\nexecute on gpu(0)")
    with gpu(0):
        x_gpu0 = random.rand(32, 64) - 0.5
        y_gpu0 = random.rand(64, 32) - 0.5
        z_gpu0 = np.dot(x_gpu0, y_gpu0)
        z_gpu0.asnumpy()
        print('z_gpu0.context = {0}'.format(z_gpu0.context))

    print("\n[use global context] execute on gpu(1)")
    x_gpu1 = random.rand(32, 64) - 0.5
    y_gpu1 = random.rand(64, 32) - 0.5
    z_gpu1 = np.dot(x_gpu1, y_gpu1)
    z_gpu1.asnumpy()
    print('z_gpu1.context = {0}'.format(z_gpu1.context))
def forward(self, X):
    # Shared ReLU hidden layer.
    a = np.dot(self.params['fc1'], X.T)
    h = np.maximum(0, a)
    # Policy head: softmax over action logits.
    logits = np.dot(h.T, self.params['policy_fc_last'].T)
    ps = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    ps /= np.sum(ps, axis=1, keepdims=True)
    # Value head.
    vs = np.dot(h.T, self.params['vf_fc_last'].T) + self.params['vf_fc_last_bias']
    return ps, vs
def _inner_loop(self, X, h, h0, WX, Wh, previous_h):
    # TODO efficiency
    N, H = h.shape
    gamma, beta = self.params['gamma'], self.params['beta']
    boundary_condition = np.dot(X, WX) + np.dot(h, Wh)
    hs = h0
    for s in range(self._inner_length):
        # Project hs onto the decay-weighted span of the previous hidden states.
        projected_hs = self._learning_rate * sum(
            self._decay_rate ** (len(previous_h) - t - 1)
            * batch_scalar_product(h_prev, hs) * h_prev
            for t, h_prev in enumerate(previous_h))
        hs = boundary_condition + projected_hs
        hs = layer_normalization(hs, gamma, beta)
        hs = self._nonlinear(hs)
    return hs
def affine_forward(x, w, b):
    """
    Computes the forward pass for an affine (fully-connected) layer.

    The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
    examples, where each example x[i] has shape (d_1, ..., d_k). We will
    reshape each input into a vector of dimension D = d_1 * ... * d_k, and
    then transform it to an output vector of dimension M.

    Inputs:
    - x: A numpy array containing input data, of shape (N, d_1, ..., d_k)
    - w: A numpy array of weights, of shape (D, M)
    - b: A numpy array of biases, of shape (M,)

    Returns a tuple of:
    - out: output, of shape (N, M)
    - cache: (x, w, b)
    """
    x_plain = np.reshape(x, (x.shape[0], -1))
    out = np.dot(x_plain, w) + b
    cache = (x, w, b)
    return out, cache
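# A minimal usage sketch for affine_forward (illustration only; assumes plain
# NumPy). A batch of multi-dimensional inputs is flattened to
# D = d_1 * ... * d_k before the matrix multiply.
import numpy as np

x = np.random.randn(2, 3, 4, 4)       # N = 2, D = 3 * 4 * 4 = 48
w = np.random.randn(48, 10)           # (D, M)
b = np.zeros(10)
out, cache = affine_forward(x, w, b)  # out has shape (2, 10)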
def predict(w, x):
    # Equivalent explicit computation:
    #   a = np.exp(np.dot(x, w))
    #   a_sum = np.sum(a, axis=1, keepdims=True)
    #   prob = a / a_sum
    y = np.dot(x, w)
    # `softmax` and `softmax_label` are expected to be defined at module level.
    prob = softmax(x=y, softmax_label=softmax_label)
    return prob
def loss(caffe_layer_specs, X, T):
    # original code:
    # log_prior = -L2_reg * np.dot(W_vect, W_vect)
    log_prior = 0
    for caffe_layer in caffe_layer_specs:
        weights = caffe_layer.get_learnable_params()[0]
        log_prior += -L2_reg * np.dot(weights, weights)

    log_lik = np.sum(predictions(caffe_layer_specs, X) * T)
    return -log_prior - log_lik
def forward_pass(self, inputs, param_vector):
    """Get output of layer for inputs and param vector"""
    # get parameters and biases from vector with parser
    params = self.parser.get(param_vector, self.paramName)
    biases = self.parser.get(param_vector, self.biasName)
    # if inputs.ndim > 2:
    #     #inputs = inputs.reshape((inputs.shape[0], np.prod(inputs.shape[1:])))
    #     inputs = inputs.reshape((inputs.shape[0], inputs.shape[1]))
    # perform layer operation and return result
    return self.nonlinearity(np.dot(inputs[:, :], params) + biases)
def rnn_step(x, prev_h, Wx, Wh, b):
    """
    Run the forward pass for a single timestep of a vanilla RNN that uses a
    tanh activation function.

    The input data has dimension D, the hidden state has dimension H, and we
    use a minibatch size of N.

    Inputs:
    - x: Input data for this timestep, of shape (N, D)
    - prev_h: Hidden state from previous timestep, of shape (N, H)
    - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
    - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
    - b: Biases of shape (H,)

    Returns a tuple of:
    - next_h: Next hidden state, of shape (N, H)
    """
    next_h = np.tanh(np.dot(x, Wx) + np.dot(prev_h, Wh) + b)
    return next_h
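# A minimal sketch of unrolling rnn_step over T timesteps (illustration only;
# assumes plain NumPy). The loop simply reuses the single-step function.
import numpy as np

N, T, D, H = 3, 5, 10, 4
x_seq = np.random.randn(N, T, D)
h = np.zeros((N, H))
Wx, Wh, b = np.random.randn(D, H), np.random.randn(H, H), np.zeros(H)
hs = []
for t in range(T):
    h = rnn_step(x_seq[:, t, :], h, Wx, Wh, b)
    hs.append(h)
h_all = np.stack(hs, axis=1)   # all hidden states, shape (N, T, H)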
def forward(self, inputs, params):
    weight = params[self._weight]
    bias = params[self._bias]
    # weight_mean = np.mean(weight, axis=0).reshape((1, weight.shape[0]))
    # Subtract the per-column mean (computed over axis 0) from the weights.
    weight_mean = np.sum(weight, axis=0) / float(weight.shape[0])
    weight = weight - weight_mean
    X_dot_W = np.dot(inputs, weight)
    if self._no_bias:
        output = X_dot_W
    else:
        output = X_dot_W + bias
    return output
def forward(X, y, *p):
    # Flatten the inputs, apply two affine layers and a softmax, and return
    # the cross-entropy loss.
    N, C, H, W = X.shape
    X = X.reshape((N, C * H * W))
    print('>>', X.shape)
    print('>>', p[0].shape)
    first = np.dot(X, p[0]) + p[1]
    second = np.dot(first, p[2]) + p[3]
    # Normalise per example (axis=1) so each row is a probability distribution.
    exp = np.exp(second)
    pred = exp / np.sum(exp, axis=1, keepdims=True)
    # Cross-entropy: negative log-probability of the correct class.
    loss = -np.sum(np.log(pred[np.arange(N), y]))
    return loss
def hmc(U, dU, epsilon, L, current_q):
    q = current_q
    p = mp.random.randn(1, current_q.shape[0], dtype=mp.float32).T
    current_p = p

    # half step for momentum
    p = p - 0.5 * epsilon * dU(q)

    # full leapfrog steps for position and momentum
    for i in range(L):
        q = q + epsilon * p
        if i != L - 1:
            p = p - epsilon * dU(q)

    # half step for momentum
    p = p - 0.5 * epsilon * dU(q)

    # Negate momentum for symmetry
    p = -p

    # Evaluate potential and kinetic energies
    current_U = U(current_q)
    current_K = 0.5 * mp.dot(current_p.T, current_p)
    proposed_U = U(q)
    proposed_K = 0.5 * mp.dot(p.T, p)

    # Metropolis accept/reject step.
    if math.log(random.random()) < (current_U - proposed_U + current_K - proposed_K)[0]:
        return q
    return current_q
def forward(self, inputs, params):
    weight = params[self._weight]
    bias = params[self._bias]
    gain = params[self._gain]
    # Weight normalisation: rescale each column of the weight matrix to unit
    # L2 norm, then apply a learned per-column gain.
    weight_norm = np.sqrt(np.sum(weight ** 2, axis=0))
    weight_norm = weight_norm.reshape((1, weight.shape[1]))
    weight /= weight_norm
    weight *= gain
    outputs = np.dot(inputs, weight)
    if not self._no_bias:
        outputs += bias
    return outputs
def test_context():
    # set_context(gpu(1))  # set the global context as gpu(1)

    def sigmoid(x):
        return 0.5 * (np.tanh(x / 2) + 1)

    def predict(weights, inputs):
        return sigmoid(np.dot(inputs, weights))

    def training_loss(weights, inputs):
        preds = predict(weights, inputs)
        label_probabilities = preds * targets + (1 - preds) * (1 - targets)
        l = -np.sum(np.log(label_probabilities))
        return l

    def training_accuracy(weights, inputs):
        preds = predict(weights, inputs)
        error = np.count_nonzero(
            np.argmax(preds, axis=1) - np.argmax(targets, axis=1))
        return (256 - error) * 100 / 256.0

    """
    with gpu(0):
        xshape = (256, 500)
        wshape = (500, 250)
        tshape = (256, 250)
        inputs = random.rand(*xshape) - 0.5
        targets = np.zeros(tshape)
        truth = random.randint(0, 250, 256)
        targets[np.arange(256), truth] = 1
        weights = random.rand(*wshape) - 0.5

        training_gradient_fun = grad(training_loss)

        for i in range(20):
            print('Trained loss accuracy #{}: {}%'.format(
                i, training_accuracy(weights, inputs)))
            gr = training_gradient_fun(weights, inputs)
            weights -= gr * 0.01
        print("\nff and bp on {0}".format(weights.context))
    """

    print("\nexecute on cpu")
    with cpu():
        x_cpu = random.rand(32, 64) - 0.5
        y_cpu = random.rand(64, 32) - 0.5
        z_cpu = np.dot(x_cpu, y_cpu)
        print('z_cpu.context = {0}'.format(z_cpu.context))
def test_sum_forward():
    np_x = py_np.zeros((2, 10))
    np_w = py_np.zeros((10, 3))
    np_b = py_np.zeros(3)

    x = NumpyVarToMinpy(np_x)
    w = NumpyVarToMinpy(np_w)
    b = NumpyVarToMinpy(np_b)

    x_plain = np.reshape(x, (x.shape[0], -1))
    out0 = np.dot(x_plain, w)
    out = out0 + b
    np_out = MinpyVarToNumpy(out)

    var = py_np.random.randn(2, 3)
    tmp = NumpyVarToMinpy(var)
    sum_tmp = np.sum(tmp, axis=0)
    sum_py = MinpyVarToNumpy(sum_tmp)
def predict(weights, inputs):
    # Test Slice
    sliced_weights = weights[:, ::2]
    y = sigmoid(np.dot(inputs, sliced_weights))
    return y
def predict(weights, bias, inputs):
    return sigmoid(np.dot(inputs, weights) + bias)
def conv_forward_naive(x, w, b, conv_param):
    """
    A naive implementation of the forward pass for a convolutional layer.

    The input consists of N data points, each with C channels, height H and
    width W. We convolve each input with F different filters, where each
    filter spans all C channels and has height HH and width WW.

    Input:
    - x: Input data of shape (N, C, H, W)
    - w: Filter weights of shape (F, C, HH, WW)
    - b: Biases, of shape (F,)
    - conv_param: A dictionary with the following keys:
      - 'stride': The number of pixels between adjacent receptive fields in
        the horizontal and vertical directions.
      - 'pad': The number of pixels that will be used to zero-pad the input.

    Returns a tuple of:
    - out: Output data, of shape (N, F, H', W') where H' and W' are given by
      H' = 1 + (H + 2 * pad - HH) / stride
      W' = 1 + (W + 2 * pad - WW) / stride
    - cache: (x, w, b, conv_param)
    """
    out = None
    #############################################################################
    # TODO: Implement the convolutional forward pass.                           #
    # Hint: you can use the function np.pad for padding.                        #
    #############################################################################
    pad = conv_param['pad']
    stride = conv_param['stride']
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    # Output spatial dimensions (integer division so the loops below get ints).
    Hout = 1 + (H + 2 * pad - HH) // stride
    Wout = 1 + (W + 2 * pad - WW) // stride
    # print 'N:%d,C:%d,H:%d,W:%d,F:%d,HH:%d,WW:%d,Hout:%d,Wout:%d,pad:%d,stride:%d' \
    #     % (N, C, H, W, F, HH, WW, Hout, Wout, pad, stride)

    # row_w shape: (F, C * HH * WW)
    row_w = w.reshape((F, C * HH * WW))

    # pad_x shape: (N, C, H + 2 * pad, W + 2 * pad). np.pad pads every axis,
    # so the padding added to the N and C axes is sliced away again.
    pad_x = np.pad(x, pad, 'constant', constant_values=0)
    if pad != 0:
        pad_x = pad_x[pad:-pad, pad:-pad]

    out = np.zeros((N, F, Hout, Wout))
    # Slide the flattened filters over every output position.
    for filter_W in range(Wout):
        for filter_H in range(Hout):
            block = pad_x[:, :,
                          filter_H * stride:filter_H * stride + HH,
                          filter_W * stride:filter_W * stride + WW]
            n, c, h, w_ = block.shape
            block = block.reshape((n, c * h * w_))
            o = np.dot(block, row_w.T)
            out[:, :, filter_H, filter_W] = o + b
    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################
    cache = (x, w, b, conv_param)
    return out, cache
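# A minimal usage sketch for conv_forward_naive (illustration only; assumes
# plain NumPy). With a 3x3 filter, pad = 1 and stride = 1 the spatial size is
# preserved: H' = 1 + (H + 2*pad - HH) / stride = H.
import numpy as np

x = np.random.randn(2, 3, 8, 8)    # (N, C, H, W)
w = np.random.randn(4, 3, 3, 3)    # (F, C, HH, WW)
b = np.zeros(4)
out, cache = conv_forward_naive(x, w, b, {'stride': 1, 'pad': 1})
# out has shape (2, 4, 8, 8)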
def U(beta):
    # Potential energy: negative log-posterior of logistic regression with a
    # Gaussian prior of variance alpha (up to an additive constant).
    return (mp.sum(mp.log(1 + mp.exp(mp.dot(X, beta))))
            - mp.dot(y.T, mp.dot(X, beta))
            + (0.5 / alpha) * mp.sum(beta ** 2))
def dU(beta):
    # Gradient of U: X^T (sigmoid(X beta) - y) + beta / alpha.
    return mp.dot(X.T, mp.exp(mp.dot(X, beta)) / (1 + mp.exp(mp.dot(X, beta))) - y) + beta / alpha
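# A minimal sketch of running hmc with the U / dU pair above (illustration
# only). Assumptions: `mp` is a numpy-compatible module such as minpy.numpy,
# the standard-library `math` and `random` modules are imported as hmc
# requires, and the globals X, y and alpha referenced by U and dU are set up
# as below; all shapes are illustrative.
import math
import random
import minpy.numpy as mp

X = mp.random.randn(100, 5)   # design matrix (n x d)
y = mp.zeros((100, 1))        # binary labels in {0, 1}, column vector
alpha = 1.0                   # prior variance
q = mp.zeros((5, 1))          # initial sample
for _ in range(10):
    q = hmc(U, dU, epsilon=0.01, L=20, current_q=q)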
def training_loss(weights):
    preds = sigmoid(np.dot(inputs, weights))
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    l = -np.sum(np.log(label_probabilities))
    return l
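# A minimal gradient-descent sketch around training_loss (illustration only,
# mirroring the loop in test_context above). Assumptions: `grad` is the
# autograd-style gradient transform used there, `random` is the
# numpy-compatible random module, and the globals `inputs`, `targets` and
# `sigmoid` that training_loss closes over are already defined.
training_gradient_fun = grad(training_loss)
weights = random.rand(inputs.shape[1], targets.shape[1]) - 0.5
for i in range(50):
    weights -= 0.01 * training_gradient_fun(weights)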
def activations(weights, *args):
    # Concatenate the state arrays with a column of ones (bias term), then
    # apply the weights.
    cat_state = np.concatenate(args + (np.ones((args[0].shape[0], 1)),), axis=1)
    return np.dot(cat_state, weights)
Dindex1 = 6446
Dindex2 = 98
for i in range(fold):
    # tempResult = []
    for j in range(fold):
        print(i, j)
        mat1Beg = i * Dindex1
        mat1End = (i + 1) * Dindex1
        print('mat1:{}:{}'.format(mat1Beg, mat1End))
        mat2Beg = j * Dindex2
        mat2End = (j + 1) * Dindex2
        print('mat2:{}:{}'.format(mat2Beg, mat2End))
        mat1part = mat[mat1Beg:mat1End, :]
        mat2part = mat2[:, mat2Beg:mat2End]
        partResult = mnp.dot(mat1part, mat2part)
        multiResult[mat1Beg:mat1End, mat2Beg:mat2End] = partResult
        print(type(partResult))
        # tempResult.append(partResult)
        print(partResult)
        print(partResult.shape)
    # multiResult.append(tempResult)
# multiResult = mnp.block(multiResult)
print(multiResult)
print(multiResult.shape)
print(mnp.sum(multiResult))
print(mnp.mean(multiResult))
# multiResult[mat1Beg:mat1End, mat2Beg:mat2End] = partResult
def predict(w, x):
    # Softmax over the class scores, normalised per example.
    a = np.exp(np.dot(x, w))
    a_sum = np.sum(a, axis=1, keepdims=True)
    prob = a / a_sum
    return prob
def loss(W_vect, X, T):
    log_prior = -L2_reg * np.dot(W_vect, W_vect)
    log_lik = np.sum(predictions(W_vect, X) * T)
    return -log_prior - log_lik