def __init__(self, in_dim, hl_dims, bias=False, activation_fun=tf.nn.relu,
             batch_normalization=True):
    self.in_dim = in_dim
    self.hl_dims = hl_dims
    self.bias = bias
    self.activation_func = activation_fun
    self.batch_normalization = batch_normalization
    self.fc_layers = []

    # This must be equal in order to add the residual
    assert in_dim == hl_dims[-1]

    idim = self.in_dim
    for i in range(len(hl_dims)):
        if i == 0:
            # Give the ReLU (or user-supplied) activation function
            self.fc_layers.append(
                FullyConnectedLayer(idim, hl_dims[i], bias=self.bias,
                                    activation_fun=activation_fun))
        else:
            # Give no activation function
            self.fc_layers.append(
                FullyConnectedLayer(idim, hl_dims[i], bias=self.bias,
                                    activation_fun=None))
        idim = hl_dims[i]
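# A minimal sketch (not from the original source) of how this block's forward pass is
# presumably wired, given that in_dim == hl_dims[-1] and that other snippets call
# block.forward(Z): run the stacked FC layers, add the skip connection, and apply the
# stored activation. Whether and where batch normalization is applied is an assumption;
# tf.layers.batch_normalization is used here purely for illustration.
def forward(self, X):
    Z = X
    for layer in self.fc_layers:
        Z = layer.forward(Z)
    if self.batch_normalization:
        Z = tf.layers.batch_normalization(Z)  # assumed placement of batch norm
    Z = Z + X  # residual connection; shapes match because in_dim == hl_dims[-1]
    return self.activation_func(Z)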
def _build_architecture_get_prediction_and_regularization_cost(
        architecture, weight_decay, current_input):
    architecture_built = list()
    regularization_cost = Variable(0.0)
    weight_decay_variable = Variable(weight_decay)  # TODO: constant
    previous_layer_output = architecture[0]['input']

    for layer_dictionary in architecture:
        assert previous_layer_output == layer_dictionary["input"], \
            'Inconsistent architecture: can not feed {} outputs to {} inputs'.format(
                previous_layer_output, layer_dictionary['input'])
        activation_function = activation_function_name_to_class[
            layer_dictionary["nonlinear"]]
        regularization_method = regularization_method_name_to_class[
            layer_dictionary["regularization"]]

        layer = FullyConnectedLayer(layer_dictionary["input"],
                                    layer_dictionary["output"],
                                    activation_function, current_input)
        regularization_cost = Add(
            regularization_cost,
            Multiply(weight_decay_variable,
                     regularization_method(layer.get_weight())))

        architecture_built.append(layer)
        current_input = layer
        previous_layer_output = layer_dictionary['output']

    return architecture_built, current_input, regularization_cost
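# Hedged example (not from the original source) of the architecture description the
# builder above expects: a list of dicts keyed by "input", "output", "nonlinear" and
# "regularization", chained so that each layer's output equals the next layer's input.
# The string values 'relu' and 'l2' are assumed keys of the name-to-class mappings.
example_architecture = [
    {'input': 784, 'output': 100, 'nonlinear': 'relu', 'regularization': 'l2'},
    {'input': 100, 'output': 10,  'nonlinear': 'relu', 'regularization': 'l2'},
]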
def test_forward_backward_1_no_activation(self):
    x = np.arange(6).reshape(3, 2)
    x_variable = Variable(x)
    fc = FullyConnectedLayer(2, 1, Identity, x_variable)
    w = fc._w._value.copy()
    b = fc._b._value.copy()

    wxb_desired = x @ w + b
    wxb_actual = fc.forward()
    np.testing.assert_almost_equal(wxb_actual, wxb_desired)

    fc.backward(np.array([[6.0], [7.0], [8.0]]))
    dl_dw_actual = fc._w.get_gradient()
    dl_dx_actual = x_variable.get_gradient()
    dl_dw_desired = np.array([[0 * 6 + 2 * 7 + 4 * 8],
                              [1 * 6 + 3 * 7 + 5 * 8]])
    dl_dx_desired = np.array([[w[0, 0] * 6, w[1, 0] * 6],
                              [w[0, 0] * 7, w[1, 0] * 7],
                              [w[0, 0] * 8, w[1, 0] * 8]])
    np.testing.assert_allclose(dl_dw_actual, dl_dw_desired)
    np.testing.assert_allclose(dl_dx_actual, dl_dx_desired)

    dl_db_actual = fc._b.get_gradient()
    dl_db_desired = np.array([6 + 7 + 8])
    np.testing.assert_allclose(dl_db_actual, dl_db_desired)
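# Quick illustrative check (not part of the original test) that the hard-coded expected
# gradients above are just the usual dense-layer identities
# dL/dW = X^T @ dL/dY, dL/dX = dL/dY @ W^T and dL/db = sum over the batch of dL/dY.
import numpy as np

x = np.arange(6).reshape(3, 2)
dl_dy = np.array([[6.0], [7.0], [8.0]])
print(x.T @ dl_dy)        # [[46.], [67.]] == dl_dw_desired above
print(dl_dy.sum(axis=0))  # [21.] == dl_db_desired above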
def __init__(self, in_dim, hl_sizes, out_dim, isClassification=True, learning_rate=.0001):
    self.X = tf.placeholder(dtype=tf.float32, shape=[None, in_dim], name="X")
    self.Y = tf.placeholder(dtype=tf.float32, shape=[None, out_dim], name="Y")
    self.blocks = []
    self.learning_rate = learning_rate
    self.isClassification = isClassification

    # Create layers / blocks
    idim = in_dim
    self.blocks.append(FullyConnectedLayer(idim, hl_sizes[0]))  # Non-res FC layer
    idim = hl_sizes[0]
    self.blocks.append(FullyConnectedResNetBlock(idim, hl_sizes, bias=False))  # Res block
    self.blocks.append(FullyConnectedLayer(hl_sizes[-1], out_dim, activation_fun=None))  # Non-res output layer

    # Roll on through those tensors, cowboy
    Z = self.X
    for i in range(len(self.blocks)):
        Z = self.blocks[i].forward(Z)
    self.Y_pred = Z

    # Now set up the cost function
    if self.isClassification:
        self.Yk = tf.nn.softmax(self.Y_pred, axis=1)
        self.cost = tf.reduce_sum(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.Y,
                                                       logits=self.Y_pred,
                                                       dim=1))
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.cost)
        # self.optimizer = tf.train.AdagradDAOptimizer(.001).minimize(self.cost)
    else:  # Regression
        self.cost = tf.reduce_sum(tf.squared_difference(self.Y_pred, self.Y))
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.cost)

    # Start the session
    self.set_session(tf.Session())
    self.sess.run(tf.global_variables_initializer())
    print("Session Initialized: Network Params will be dumped to CNN_Parameters.txt")
def buildNetwork(self):
    """
    Builds the neural network with a fixed structure, and a variable number of outputs.
    """
    self.inputLayer = InputLayer()
    convLayer = ConvolutionalLayer(5, 10)
    poolLayer = PoolingLayer(4)
    reluLayer = ReluLayer()
    convLayer2 = ConvolutionalLayer(4, 20)
    pool2Layer = PoolingLayer(2)
    flattenLayer = FlattenLayer()
    reluLayer2 = ReluLayer()
    fullLayer = FullyConnectedLayer(20)
    self.outputLayer = OutputLayer(self.numOutputs)

    # Chain each layer to its successor (written from the output back to the input)
    fullLayer.connect(self.outputLayer)
    flattenLayer.connect(fullLayer)
    reluLayer2.connect(flattenLayer)
    pool2Layer.connect(reluLayer2)
    convLayer2.connect(pool2Layer)
    reluLayer.connect(convLayer2)
    poolLayer.connect(reluLayer)
    convLayer.connect(poolLayer)
    self.inputLayer.connect(convLayer)
def test_forward_backward_4_no_activation(self):
    x = np.arange(6).reshape(3, 2)
    x_variable = Variable(x)
    fc = FullyConnectedLayer(2, 4, Identity, x_variable)
    w = fc._w._value.copy()
    b = fc._b._value.copy()

    wxb_desired = x @ w + b
    wxb_actual = fc.forward()
    np.testing.assert_almost_equal(wxb_actual, wxb_desired)

    dl_dxwb = np.arange(6, 6 + 3 * 4).reshape(3, 4)
    fc.backward(dl_dxwb)
    dl_dw_actual = fc._w.get_gradient()
    dl_dw_desired = np.array([
        [x[:, 0].T @ dl_dxwb[:, 0], x[:, 0].T @ dl_dxwb[:, 1],
         x[:, 0].T @ dl_dxwb[:, 2], x[:, 0].T @ dl_dxwb[:, 3]],
        [x[:, 1].T @ dl_dxwb[:, 0], x[:, 1].T @ dl_dxwb[:, 1],
         x[:, 1].T @ dl_dxwb[:, 2], x[:, 1].T @ dl_dxwb[:, 3]],
    ])
    np.testing.assert_allclose(dl_dw_actual, dl_dw_desired)
    theano.config.floatX = 'float32'
else:
    print("Running with a CPU. If this is not desired, then modify " +
          "network3.py to set\nthe GPU flag to True.")

training_data, validation_data, test_data = load_data_shared()
mini_batch_size = 10

from Network import Network
from FullyConnectedLayer import FullyConnectedLayer
from SoftmaxLayer import SoftmaxLayer
from ConvPoolLayer import ConvPoolLayer

net = Network([
    FullyConnectedLayer(n_in=784, n_out=100),
    SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
net.SGD(training_data, 1, mini_batch_size, 0.1, validation_data, test_data)

# add a convolutional layer:
net = Network([
    ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
                  filter_shape=(20, 1, 5, 5),
                  poolsize=(2, 2)),
    FullyConnectedLayer(n_in=20*12*12, n_out=100),
    SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
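# Hedged continuation (not in the original snippet): the convolutional network would
# presumably be trained the same way as the fully connected one above; the single
# epoch and learning rate of 0.1 simply mirror the earlier call.
net.SGD(training_data, 1, mini_batch_size, 0.1, validation_data, test_data)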
from Network import Network
from FullyConnectedLayer import FullyConnectedLayer
import numpy as np

X = np.array([[0, 0, 1],
              [0, 1, 1],
              [1, 0, 1],
              [1, 1, 1]])
y = np.array([[0], [1], [1], [0]])

layers = [
    FullyConnectedLayer(X.shape[1], 4, False, 'sigmoid'),
    FullyConnectedLayer(4, 1, True, 'sigmoid')
]

net = Network(8000, layers)
net.train(X, y)
import numpy as np
from FullyConnectedLayer import FullyConnectedLayer
from OutputLayer import OutputLayer

l1 = FullyConnectedLayer(17*17)
l2 = FullyConnectedLayer(50)
l3 = OutputLayer(10)
l1.append(l2).append(l3)

l1.forward(np.ones(17*17))
print l1.getUnits()
print l2.getUnits()
print l3.getUnits()

l3.setTrainData(np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
l3.backward()
    return np.sum(a) / float(X.shape[0]) * 100.


def one_hot(x, size):
    a = np.zeros((x.shape[0], size))
    a[np.arange(x.shape[0]), x] = 1.
    return a


if __name__ == '__main__':
    batch_size = 20

    # A simple strided convnet
    layers = [
        ConvolutionLayer((4, 4, 1, 20), strides=2, activation=lkrelu,
                         filter_init=lambda shp: np.random.normal(size=shp) * np.sqrt(1.0 / (28*28 + 13*13*20))),
        ConvolutionLayer((5, 5, 20, 40), strides=2, activation=lkrelu,
                         filter_init=lambda shp: np.random.normal(size=shp) * np.sqrt(1.0 / (13*13*20 + 5*5*40))),
        FlattenLayer((5, 5, 40)),
        FullyConnectedLayer((5*5*40, 100), activation=sigmoid,
                            weight_init=lambda shp: np.random.normal(size=shp) * np.sqrt(1.0 / (5*5*40 + 100.))),
        FullyConnectedLayer((100, 10), activation=linear,
                            weight_init=lambda shp: np.random.normal(size=shp) * np.sqrt(1.0 / (110.)))
    ]

    lr = 0.001
    k = 2000
    net = CNNetwork(layers, learning_rate=lr, loss_function=cross_entropy)

    (train_data_X, train_data_Y), v, (tx, ty) = mnist_loader.load_data('./data/mnist.pkl.gz')
    train_data_Y = one_hot(train_data_Y, size=10)
    ty = one_hot(ty, size=10)
    train_data_X = np.reshape(train_data_X, [-1, 28, 28, 1])
    tx = np.reshape(tx, [-1, 28, 28, 1])

    for epoch in range(100000):
        shuffled_index = np.random.permutation(train_data_X.shape[0])
        batch_train_X = train_data_X[shuffled_index[:batch_size]]
testdata = []
for i in numbers:
    number_path_train = os.path.join(dataset_path, 'train', str(i))
    for filename in os.listdir(number_path_train):
        traindata.append((os.path.join(dataset_path, 'train', str(i), filename), i))
    number_path_test = os.path.join(dataset_path, 'test', str(i))
    for filename in os.listdir(number_path_test):
        testdata.append((os.path.join(dataset_path, 'test', str(i), filename), i))

random.shuffle(traindata)
random.shuffle(testdata)

lr = 0.01
l1 = FullyConnectedLayer(28*28, lr)
l2 = FullyConnectedLayer(30, lr)
l3 = OutputLayer(10, lr)
l1.append(l2).append(l3)

if mode == 'train':
    n_iter = 3
    for j in range(0, n_iter):
        i = 0
        for (image_path, label) in traindata:
            im_pil = Image.open(image_path)
            im = np.asarray(im_pil, dtype=np.float)
            im = im / 255
class PolicyModel:
    def __init__(self, input_dim=2, output_dim=1, hl_sizes=[64, 64, 64], use_res_net=True,
                 activation_fun=tf.nn.relu, training_rate=1e-3, eps=1.0):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.X = tf.placeholder(tf.float32, shape=[None, input_dim], name="inputs")
        self.actions = tf.placeholder(tf.float32, shape=[None, ], name="actions")
        self.advantages = tf.placeholder(tf.float32, shape=[None, ], name="advantages")
        self.hidden_layers = []
        self.training_rate = training_rate
        self.min_experiences = 100
        self.max_experiences = 1000
        self.batch_size = 32
        self.inner_counter = 0.0
        self.eps = eps
        self.experiences = {'states': [], 'actions': [], 'advantages': []}

        idim = input_dim
        if use_res_net:  # Then fill out the graph with res-net blocks
            for i in range(len(hl_sizes)):
                if i == 0:
                    self.hidden_layers.append(
                        FullyConnectedLayer(idim, hl_sizes[i], activation_fun=activation_fun))
                else:
                    self.hidden_layers.append(
                        FullyConnectedResNetBlock(idim, [hl_sizes[i]],
                                                  activation_fun=activation_fun,
                                                  batch_normalization=False))
                idim = hl_sizes[i]
        else:  # Use regular fully connected layers
            for hl in hl_sizes:
                self.hidden_layers.append(
                    FullyConnectedLayer(idim, hl, activation_fun=activation_fun))
                idim = hl

        # Last layer is regressive to a single node -- one for mean, one for std dev
        self.Y_mean = FullyConnectedLayer(idim, output_dim, activation_fun=None)
        self.Y_std = FullyConnectedLayer(idim, output_dim, activation_fun=tf.nn.relu)

        # Rollout abstraction
        Z = self.X
        for hl in self.hidden_layers:
            Z = hl.forward(Z)

        # Mean and standard deviation
        mean = tf.reshape(self.Y_mean.forward(Z), [-1])
        std = tf.reshape(self.Y_std.forward(Z), [-1]) + 1e-5

        # Sample from the normal distribution
        norm = tf.contrib.distributions.Normal(mean, std)
        self.predict_op = tf.clip_by_value(norm.sample(), -1, 1)

        log_probs = norm.log_prob(self.actions)
        self.cost = -tf.reduce_sum(self.advantages * log_probs + .1 * norm.entropy())
        self.train_op = tf.train.AdamOptimizer(self.training_rate).minimize(self.cost)

    def set_session(self, sess):
        self.session = sess

    def partial_fit(self, X, actions, advantages, printOp=True):
        # X = np.atleast_2d(X)
        actions = np.atleast_1d(actions)
        advantages = np.atleast_1d(advantages)

        self.experiences['actions'].append(actions)
        self.experiences['advantages'].append(advantages)
        self.experiences['states'].append(X)

        if len(self.experiences['advantages']) < self.min_experiences:
            return
        if len(self.experiences['advantages']) > self.max_experiences:
            self.experiences['advantages'].pop(0)
            self.experiences['actions'].pop(0)
            self.experiences['states'].pop(0)

        indxs = np.random.choice(len(self.experiences['advantages']), self.batch_size)
        states = [self.experiences['states'][indx] for indx in indxs]
        states = np.reshape(np.array(states), (self.batch_size, self.input_dim))
        actions = [self.experiences['actions'][indx] for indx in indxs]
        actions = np.reshape(actions, (self.batch_size,))
        advantages = [self.experiences['advantages'][indx] for indx in indxs]
        advantages = np.reshape(advantages, (self.batch_size,))

        loss, _ = self.session.run([self.cost, self.train_op],
                                   feed_dict={self.X: states,
                                              self.actions: actions,
                                              self.advantages: advantages})
        if printOp:
            print("Computed Partial Fit with Loss of: ", loss)

    def predict(self, X):
        X = np.atleast_2d(X)
        return self.session.run(self.predict_op, feed_dict={self.X: X})

    def sample_action(self, X):
        return np.asscalar(self.predict(X)[0])
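# Hedged usage sketch (not from the original source): how this policy model would
# presumably be driven in a TF1-style training script. The 2-dimensional dummy state
# matches the constructor's default input_dim and is purely illustrative.
if __name__ == '__main__':
    policy = PolicyModel(input_dim=2, output_dim=1)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    policy.set_session(session)

    state = np.zeros((1, 2), dtype=np.float32)  # dummy observation
    action = policy.sample_action(state)        # clipped sample from the Gaussian policy
    # After computing an advantage estimate, replay the transition into the buffer;
    # partial_fit only starts training once min_experiences transitions are stored.
    policy.partial_fit(state, action, 1.0, printOp=False)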
import numpy as np

from NeuralNetwork import NeuralNetwork
from FullyConnectedLayer import FullyConnectedLayer
from ActivationLayer import ActivationLayer
from ActivationFunctions import tanh, tanhDerivative
from LossFunction import meanSquaredError, meanSquaredErrorDerivative

# Sample training data
inputData = np.array([[[0, 0]], [[0, 1]], [[1, 0]], [[1, 1]]])
expectedOutput = np.array([[[0]], [[1]], [[1]], [[0]]])

# Creating a neural network with 3 nodes in the first hidden layer, 1 node in the
# final layer, with activation functions after each layer
network = NeuralNetwork()
network.add(FullyConnectedLayer(2, 3))
network.add(ActivationLayer(tanh, tanhDerivative))
network.add(FullyConnectedLayer(3, 1))
network.add(ActivationLayer(tanh, tanhDerivative))

# Training network
network.setLoss(meanSquaredError, meanSquaredErrorDerivative)
network.train(inputData, expectedOutput, epochs=1000, learningRate=.1)

# Test the network
output = network.predict(inputData)
for set in range(len(inputData)):
    print("For set {} my prediction is {}. The correct value is {}".format(
        inputData[set], output[set], expectedOutput[set]))
def __init__(self, input_dims, output_dim, fc_hl_sizes, isClassification=True, use_resnet=True):
    self.isClassification = isClassification
    self.output_dim = output_dim
    self.input_dims = input_dims

    # Implement VGG blocks for a mini-VGG architecture
    # Begin graph abstraction
    self.CNN_Block_Layers = []
    self.FC_Layers = []
    x_dims = [None] + list(input_dims)

    # Make sure the user did not pass a 2D image shape; the input should be a 3D image
    # (if the image is 2D, the third channel should be 1)
    assert len(input_dims) == 3

    self.X = tf.placeholder(dtype=tf.float32, shape=x_dims, name="X")
    self.Y = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="Y")

    # Holds the individual layer's pool size and pool stride settings
    self.pool_stride_block_settings = {'Num_Pools': [], 'Conv_Stride': [], 'Pool_Stride': []}

    # OPTION 1: Stack convolutional blocks -- VGG
    if not use_resnet:
        for block in (CNNBlocks.VGGConvPoolBlock64(),
                      CNNBlocks.VGGConvPoolBlock64(),
                      CNNBlocks.VGGConvPoolBlock128(),
                      CNNBlocks.VGGConvPoolBlock256(),
                      CNNBlocks.VGGConvPoolBlock512(),
                      CNNBlocks.VGGConvPoolBlock512()):
            self.CNN_Block_Layers.append(block)
    # OPTION 2: Stack convolutional blocks -- ResNet
    else:
        for block in (CNNBlocks.VGGConvPoolBlock64(),
                      CNNBlocks.ResNetBlock128(),
                      CNNBlocks.ResNetBlock128(),
                      CNNBlocks.ResNetBlock128(),
                      CNNBlocks.VGGConvPoolBlock128()):
            self.CNN_Block_Layers.append(block)

    # Determine the number of stacked convolutional blocks
    self.num_conv_blocks = len(self.CNN_Block_Layers)

    # Initialize the convolutional weights
    idim = self.input_dims[2]
    for i in range(self.num_conv_blocks):
        # Change the input of a block layer to have the same input channel dimension
        # as the incoming image / feature-map channels
        self.CNN_Block_Layers[i].set_layer_input_channel_dim(idim)
        # Layer 0 or layer 1 (we choose layer 1 to connect to the next block)
        idim = self.CNN_Block_Layers[i].get_layer_output_channel_dim()
        # Store pool stride information
        self.pool_stride_block_settings['Num_Pools'].append(
            self.CNN_Block_Layers[i].get_num_pools())
        self.pool_stride_block_settings['Pool_Stride'].append(
            self.CNN_Block_Layers[i].get_block_pool_stride())

    # Get the input size for the fully connected layers
    FC_INPUT_SIZE = int(self.get_FC_input_size(input_dims,
                                               self.pool_stride_block_settings['Num_Pools'],
                                               self.pool_stride_block_settings['Pool_Stride']))
    idim = FC_INPUT_SIZE

    # Report the FC size for the conv -> NN transition
    print("Fully Connected Input Size: ", idim)

    for i in range(len(fc_hl_sizes)):
        self.FC_Layers.append(FullyConnectedLayer(idim, fc_hl_sizes[i]))
        idim = fc_hl_sizes[i]
    self.FC_Layers.append(FullyConnectedLayer(idim, self.output_dim, activation_fun=None))

    # Roll out the network tensor abstraction
    Z = self.X

    # Convolutional rollout
    for i in range(self.num_conv_blocks):
        Z = self.CNN_Block_Layers[i].forward(Z)

    # Reshape Z for the fully connected rollout
    Z = tf.reshape(Z, (-1, FC_INPUT_SIZE))

    # Fully connected rollout
    for i in range(len(fc_hl_sizes)):
        Z = self.FC_Layers[i].forward(Z)
    self.Y_pred = self.FC_Layers[-1].forward(Z)

    # Either take the linear output unit, or use the CNN for classification.
    # If the classification flag is set to true, use a softmax cross-entropy-with-logits cost.
    if self.isClassification:
        self.Yk = tf.nn.softmax(self.Y_pred, axis=1)
        self.cost = tf.reduce_sum(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.Y,
                                                       logits=self.Y_pred,
                                                       dim=1))
        self.optimizer = tf.train.AdamOptimizer(.001).minimize(self.cost)
        # self.optimizer = tf.train.AdagradDAOptimizer(.001).minimize(self.cost)
    else:  # Regression
        self.cost = tf.reduce_sum(tf.squared_difference(self.Y_pred, self.Y))
        self.optimizer = tf.train.AdamOptimizer(.001).minimize(self.cost)

    # Start the session
    self.set_session(tf.Session())
    self.sess.run(tf.global_variables_initializer())
    print("Session Initialized: Network Params will be dumped to CNN_Parameters.txt")
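# Hedged sketch (assumption, not the original helper) of what get_FC_input_size above
# presumably computes: every pooling step in a block divides the spatial dimensions by
# its stride, and the flattened FC input is the remaining height * width times the last
# block's output channels. The function name and signature here are illustrative only.
def fc_input_size_sketch(input_dims, num_pools, pool_strides, out_channels):
    h, w, _ = input_dims
    for n, s in zip(num_pools, pool_strides):
        h //= s ** n  # each of the n pools in this block shrinks the height by its stride
        w //= s ** n
    return h * w * out_channels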
def testGradient():
    """Test the backprop implementation by checking the gradients on a small network"""
    # load the training data
    images, labels = load_mnist()
    images /= 255.0
    grad_images = images[:, :, 0:10]  # use a 10-image subset for gradient checking
    grad_labels = labels[0, 0:10]     # respective labels for the images -- these still need to be encoded

    # create a small network: 1 conv layer + 1 pooling layer + 1 fully connected softmax
    # convolutional layer, taking in a 28x28 image, using 2 9x9 filters
    # output should be 2 (28-9+1)x(28-9+1) = 2 20x20 feature maps in a (20, 20, 2) form
    layer0 = ConvLayer(grad_images[:, :, 0].reshape((28, 28, 1)), (28, 28, 1), (9, 9, 2, 1))
    print "initialized convolutional layer"
    layer0.forwardprop(grad_images[:, :, 0].reshape((28, 28, 1)))
    print "finished forward pass of convolutional layer"

    # pooling layer, taking in 2 20x20 feature maps
    # output should be 2 10x10 feature maps (though may want to downsample 5x for gradient check)
    layer1 = PoolingLayer(layer0.output, (20, 20, 2))
    print "initialized pooling layer"
    layer1.downsample(layer0.output, (20, 20, 2))
    print "finished forward pass of pooling layer"

    # fully-connected softmax layer, taking in 2 10x10 feature maps (if downsampled by 2)
    # or 2 4x4 feature maps (if downsampled by 5); either way, flattened into a long input vector
    full_conn_input = layer1.output.flatten()
    layer2 = FullyConnectedLayer(full_conn_input.reshape((full_conn_input.size, 1)),
                                 full_conn_input.size, 10)
    print "initialized fully-conn layer"
    layer2.softmax_output(full_conn_input.reshape((full_conn_input.size, 1)))
    print "finished forward pass of fully-conn layer"

    # perform backpropagation
    target = np.zeros((10, 1))
    for i in range(0, 10):
        if grad_labels[i] == 1:
            target[i] = 1
    layer2.backprop(0, 0, target)
    print "finished layer 2 backprop"
    layer1.upsample(layer2, 0)
    print "finished layer 1 backprop"
    layer0.backprop(layer1)
    print "finished layer 0 backprop"

    # check the gradient numerically
    epsilon = 1.0e-4
    layer0_check = layer0
    layer1_check = layer1
    layer2_check = layer2

    layer0_w_vec = layer0.W.flatten()
    layer0_bias_vec = layer0.bias.flatten()
    layer0_gradw = layer0.gradient_w.flatten()
    layer0_gradb = layer0.gradient_b.flatten()
    layer2_w_vec = layer2.W.flatten()
    layer2_bias_vec = layer2.bias.flatten()
    layer2_gradw = layer2.gradient_w.flatten()
    layer2_gradb = layer2.gradient_b.flatten()

    w_vec = np.concatenate((layer0_w_vec, layer0_bias_vec, layer2_w_vec, layer2_bias_vec))
    backprop_vec = np.concatenate((layer0_gradw, layer0_gradb, layer2_gradw, layer2_gradb))
    print layer0_gradw

    gradient_check = np.zeros(w_vec.size)
    for i in range(0, w_vec.size):
        # perturb one parameter at a time; the copies are needed so that modifying
        # pos and neg does not modify w_vec (and each other) in place
        pos = w_vec.copy()
        pos[i] += epsilon
        neg = w_vec.copy()
        neg[i] -= epsilon

        # feed-forward to get J(w+e), J(w-e), subtract and calculate gradient
        # J(w+e)
        layer0_check.W = pos[0:layer0_w_vec.size].reshape(layer0.filter_shape)
        layer0_check.bias = pos[layer0_w_vec.size:layer0_w_vec.size + layer0_bias_vec.size].reshape(layer0.bias_shape)
        layer2_check.W = pos[layer0_w_vec.size + layer0_bias_vec.size:
                             layer0.W.size + layer0.bias.size + layer2_w_vec.size].reshape(layer2.W.shape)
        layer2_check.bias = pos[layer0.W.size + layer0.bias.size + layer2_w_vec.size:].reshape(layer2.bias.shape)
        layer0_check.forwardprop(grad_images[:, :, 0].reshape((28, 28, 1)))
        layer1_check.downsample(layer0_check.output, (20, 20, 2))
        full_conn_input = layer1.output.flatten()
        layer2_check.softmax_output(full_conn_input.reshape((full_conn_input.size, 1)))
        pos_out = J(layer2_check.output, grad_labels[0])

        # J(w-e)
        layer0_check.W = neg[0:layer0_w_vec.size].reshape(layer0.filter_shape)
        layer0_check.bias = neg[layer0_w_vec.size:layer0_w_vec.size + layer0_bias_vec.size].reshape(layer0.bias_shape)
        layer2_check.W = neg[layer0_w_vec.size + layer0_bias_vec.size:
                             layer0.W.size + layer0.bias.size + layer2_w_vec.size].reshape(layer2.W.shape)
        layer2_check.bias = neg[layer0.W.size + layer0.bias.size + layer2_w_vec.size:].reshape(layer2.bias.shape)
        layer0_check.forwardprop(grad_images[:, :, 0].reshape((28, 28, 1)))
        layer1_check.downsample(layer0_check.output, (20, 20, 2))
        full_conn_input = layer1.output.flatten()
        layer2_check.softmax_output(full_conn_input.reshape((full_conn_input.size, 1)))
        neg_out = J(layer2_check.output, grad_labels[0])

        # compute the numerical gradient for parameter i
        gradient_check[i] = (pos_out - neg_out) / (2 * epsilon)

    # print gradient_check
    print gradient_check[0:layer0_w_vec.size]
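    # Hedged follow-up (not in the original): the usual way to compare the analytic and
    # numerical gradients computed above is a relative-error norm; values around 1e-7
    # or smaller typically indicate a correct backprop implementation.
    relative_error = np.linalg.norm(backprop_vec - gradient_check) / \
                     np.linalg.norm(backprop_vec + gradient_check)
    print relative_error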
def SGD_train(minibatch_size, data, labels, alpha, momentum, epochs):
    """Train the network with stochastic gradient descent

    :type minibatch_size: an integer
    :param minibatch_size: the size of the minibatches (usually something like 256)

    :type data: 3D matrix, height x width x num training data pts.
    :param data: A 3D matrix that contains all of the training data points of the set

    :type labels: num training data pts x 1 vector
    :param labels: the labels for each image

    :type alpha: float
    :param alpha: the learning rate

    :type momentum: float
    :param momentum: the momentum

    :type epochs: an integer
    :param epochs: the number of epochs (ie. iterations) through the training
    """
    it = 0

    # convolutional layer, taking in a 28x28 image, using 2 9x9 filters
    # output should be 2 (28-9+1)x(28-9+1) = 2 20x20 feature maps in a (20, 20, 2) form
    layer0 = ConvLayer((28, 28, 1), (9, 9, 2))
    print "initialized convolutional layer"

    # pooling layer, taking in 2 20x20 feature maps
    # output should be 2 10x10 feature maps
    layer1 = PoolingLayer((20, 20, 2))
    print "initialized pooling layer"

    # fully-connected softmax layer, taking in 2 10x10 feature maps,
    # flattened into a long input vector
    layer2 = FullyConnectedLayer(200, 10)
    print "initialized fully-connected layer"

    params = np.concatenate((layer0.W.flatten(), layer0.bias.flatten(),
                             layer2.W.flatten(), layer2.bias.flatten()))
    velocity = np.zeros(params.shape)

    for i in range(0, epochs):
        correct_class = 0
        cost = 0.0
        # shuffle the dataset -- shuffle_vec will be used as indices
        shuffle_vec = rand.permutation(data.shape[2])

        for j in range(0, data.shape[2] - minibatch_size + 1, minibatch_size):
            # perform gradient descent w/ each batch
            it += 1
            if it == 20:  # increase momentum after 20 iterations
                momentum = 0.9

            # gradient should be an unrolled vector of the avg. sum of the minibatch
            # gradients gotten from the forward pass and backprop
            for k in range(0, minibatch_size):
                layer0.forwardprop(data[:, :, shuffle_vec[j+k]].reshape((28, 28, 1)))
                layer1.downsample(layer0.output, (20, 20, 2))
                layer2_input = layer1.output.flatten()
                layer2.softmax_output(layer2_input.reshape((layer2_input.size, 1)))
                cost += J(layer2.output, labels[shuffle_vec[j+k]])
                # print "%d %d" % (np.argmax(layer2.output), labels[shuffle_vec[j+k]])
                if np.argmax(layer2.output) == labels[shuffle_vec[j+k]]:
                    correct_class += 1

                # backprop
                layer2.backprop(0, 0, encode_label(labels[shuffle_vec[j+k]]))
                layer1.upsample(layer2, 0)
                layer0.backprop(layer1)

                # accumulate the flattened gradient vector
                if k == 0:
                    grad = np.concatenate((layer0.gradient_w.flatten(), layer0.gradient_b.flatten(),
                                           layer2.gradient_w.flatten(), layer2.gradient_b.flatten()))
                else:
                    grad += np.concatenate((layer0.gradient_w.flatten(), layer0.gradient_b.flatten(),
                                            layer2.gradient_w.flatten(), layer2.gradient_b.flatten()))

            grad /= minibatch_size

            # update velocity vector
            velocity = momentum*velocity + alpha*grad
            params = params - velocity

            # update the parameters
            layer0.W = params[0:layer0.W.flatten().size].reshape(layer0.W.shape)
            next_begin = layer0.W.flatten().size
            layer0.bias = params[next_begin:next_begin + layer0.bias.flatten().size].reshape(layer0.bias.shape)
            next_begin += layer0.bias.flatten().size
            layer2.W = params[next_begin:next_begin + layer2.W.flatten().size].reshape(layer2.W.shape)
            next_begin += layer2.W.flatten().size
            layer2.bias = params[next_begin:].reshape(layer2.bias.shape)

        # reduce learning rate by half after each epoch
        alpha /= 2.0
        print "%d correct classifications" % correct_class
        print "cost function is ", cost/(minibatch_size*(data.shape[2] - minibatch_size + 1))
class ValueModel:
    def __init__(self, input_dim, output_dim, hidden_layer_sizes=[64],
                 activation_fun=tf.nn.relu, use_res_net_blocks=True, training_rate=1e-3):
        self.X = tf.placeholder(tf.float32, shape=[None, input_dim], name="inputs")
        self.Y = tf.placeholder(tf.float32, shape=[None, ], name="actions")
        self.hidden_layers = []
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.training_rate = training_rate
        idim = input_dim
        self.hl_sizes = hidden_layer_sizes
        self.experiences = {'states': [], 'values': []}
        self.min_experiences = 100
        self.max_experiences = 1000
        self.batch_size = 32
        self.Xs = []
        self.Ys = []

        if use_res_net_blocks:  # Then fill out the graph with res-net blocks
            for i in range(len(self.hl_sizes)):
                if i == 0:
                    self.hidden_layers.append(
                        FullyConnectedLayer(idim, self.hl_sizes[i], activation_fun=activation_fun))
                else:
                    self.hidden_layers.append(
                        FullyConnectedResNetBlock(idim, [self.hl_sizes[i]],
                                                  activation_fun=activation_fun,
                                                  batch_normalization=False))
                idim = self.hl_sizes[i]
        else:  # Use regular fully connected layers
            for hl in self.hl_sizes:
                self.hidden_layers.append(FullyConnectedLayer(idim, hl, activation_fun=activation_fun))
                idim = hl

        # Computes the value of the current state
        self.h_last = FullyConnectedLayer(idim, output_dim, activation_fun=None)

        # Graph abstraction
        Z = self.X
        for hl in self.hidden_layers:
            Z = hl.forward(Z)
        self.Y_pred = self.h_last.forward(Z)
        self.Y_pred = tf.reshape(self.Y_pred, [-1])

        # Cost
        self.cost = tf.reduce_sum(tf.square(self.Y - self.Y_pred))
        self.train_op = tf.train.AdamOptimizer(self.training_rate).minimize(self.cost)

    def set_session(self, sess):
        self.session = sess

    def partial_fit(self, X, Y, printOp=True):
        self.experiences['states'].append(X)
        self.experiences['values'].append(Y)

        if len(self.experiences['states']) < self.min_experiences:
            return
        if len(self.experiences['states']) > self.max_experiences:
            self.experiences['states'].pop(0)
            self.experiences['values'].pop(0)

        indxs = np.random.choice(len(self.experiences['states']), self.batch_size)
        # print("INDEXES: ", indxs)
        X = [self.experiences['states'][indx] for indx in indxs]
        X = np.reshape(np.array(X), (self.batch_size, self.input_dim))
        Y = [self.experiences['values'][indx] for indx in indxs]
        Y = np.reshape(np.array(Y), (self.batch_size,))

        loss, _ = self.session.run([self.cost, self.train_op],
                                   feed_dict={self.X: X, self.Y: Y})
        if printOp:
            print("Partial Fit Loss is: ", loss)

    def predict(self, X):
        return self.session.run(self.Y_pred, feed_dict={self.X: X})
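# Hedged usage sketch (not from the original source), mirroring the PolicyModel example
# above: fit the value network on (state, target value) pairs collected from an episode.
# The 2-dimensional dummy state is illustrative only.
if __name__ == '__main__':
    value_model = ValueModel(input_dim=2, output_dim=1)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    value_model.set_session(session)

    state = np.zeros((1, 2), dtype=np.float32)          # dummy observation
    value_model.partial_fit(state, 0.0, printOp=False)  # buffered until min_experiences is reached
    print(value_model.predict(state))                   # predicted state value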