import theano
import theano.tensor as T
import numpy as np

#: ConvolutionalLayer, HiddenLayer and OutputLayer are the layer classes
#: defined elsewhere in this repository.


class NeuralNet:

    def __init__(self, input_shape, filter_shapes, strides, n_hidden, n_out):
        '''
        Initialize a NeuralNet

        @param input_shape: tuple or list of length 4,
            (batch size, num input feature maps, image height, image width)
        @param filter_shapes: list of 2 (one per conv layer) 4-tuples:
            (number of filters, num input feature maps, filter height, filter width)
        @param strides: list of size 2, stride values for each conv layer
        @param n_hidden: int, number of neurons in the fully connected hidden layer
        @param n_out: int, number of nodes in the output layer
        '''
        #: create Theano variables corresponding to the input batch (x) and the
        #  expected output of the network (y)
        x = T.ftensor4('x')
        y = T.fmatrix('y')

        #: the first hidden layer is convolutional
        self.layer_hidden_conv1 = ConvolutionalLayer(x, filter_shapes[0],
                                                     input_shape, strides[0])

        #: second convolutional hidden layer: the size of its input depends on
        #  the output of the first layer and is defined as
        #  (num_batches, num_input_feature_maps, height_of_input_maps, width_of_input_maps)
        second_conv_input_shape = [input_shape[0], filter_shapes[0][0],
                                   self.layer_hidden_conv1.feature_map_size,
                                   self.layer_hidden_conv1.feature_map_size]
        self.layer_hidden_conv2 = ConvolutionalLayer(
            self.layer_hidden_conv1.output, filter_shapes[1],
            image_shape=second_conv_input_shape, stride=strides[1])

        #: the output of a convolutional layer is 4D, but a regular hidden
        #  layer expects 2D input. Because of the all-to-all connections, the
        #  third hidden layer does not care which feature map or position an
        #  input value comes from.
        flattened_input = self.layer_hidden_conv2.output.flatten(2)

        #: create the third (fully connected) hidden layer
        self.layer_hidden3 = HiddenLayer(flattened_input,
                                         self.layer_hidden_conv2.fan_out,
                                         n_hidden)

        #: create the output layer
        self.layer_output = OutputLayer(self.layer_hidden3.output, n_hidden,
                                        n_out)

        #: collect the parameters of the whole network
        self.params = self.layer_hidden_conv1.params + self.layer_hidden_conv2.params \
            + self.layer_hidden3.params + self.layer_output.params

        #: discount factor
        self.gamma = 0.95

        #: define the regularization terms; note that only the weights are
        #  regularized, not the biases.
        #  L1 (linear) term, useful for driving many weights to zero
        self.l1 = abs(self.layer_hidden_conv1.W).sum() \
            + abs(self.layer_hidden_conv2.W).sum() \
            + abs(self.layer_hidden3.W).sum() \
            + abs(self.layer_output.W).sum()

        #: squared L2 term, useful for keeping the weights small
        self.l2_sqr = (self.layer_hidden_conv1.W ** 2).sum() \
            + (self.layer_hidden_conv2.W ** 2).sum() \
            + (self.layer_hidden3.W ** 2).sum() \
            + (self.layer_output.W ** 2).sum()

        #: define the cost function (both regularization weights are currently zero)
        cost = 0.0 * self.l1 + 0.0 * self.l2_sqr + self.layer_output.errors(y)

        #: define the gradient calculation
        grads = T.grad(cost, self.params)

        #: define how much we change the parameter values on each update
        learning_rate = 0.0001
        updates = []
        for param_i, gparam_i in zip(self.params, grads):
            updates.append((param_i, param_i - learning_rate * gparam_i))

        #: we need another set of Theano variables (other than x and y) to use
        #  in the train and predict functions
        temp_x = T.ftensor4('temp_x')
        temp_y = T.fmatrix('temp_y')

        #: define the training operation as applying the updates calculated
        #  given temp_x and temp_y
        self.train_model = theano.function(inputs=[temp_x, temp_y],
                                           outputs=[cost, self.params[0][0]],
                                           updates=updates,
                                           givens={x: temp_x, y: temp_y})

        self.predict_rewards = theano.function(
            inputs=[temp_x],
            outputs=[self.layer_output.output],
            givens={x: temp_x})

        self.predict_rewards_and_cost = theano.function(
            inputs=[temp_x, temp_y],
            outputs=[self.layer_output.output, cost],
            givens={x: temp_x, y: temp_y})

    def train(self, minibatch):
        """
        Transform (state, action, reward, state) transitions into
        (input, expected_output) pairs for the neural net and train the network

        @param minibatch: array of dictionaries, each dictionary contains one
            transition (prestate, action, reward, poststate)
        """
        #: we have a new, better estimate for the Q-value of the action we
        #  chose: the reward received on the transition plus the discounted
        #  maximum of future rewards. Q-values of the other actions stay the same.
        for i, transition in enumerate(minibatch):
            estimated_Q = self.predict_rewards([transition['prestate']])[0][0]

            #: the next line prints the output of the network; uncomment it to
            #  verify that different inputs give different outputs
            #  (c.f. wiki Basic tests/Issue #10)
            #print "estimated q", estimated_Q

            estimated_Q[transition['action']] = transition['reward'] + self.gamma \
                * np.max(self.predict_rewards([transition['poststate']]))

            #: knowing what estimated_Q looks like, we can train the model
            cost, first_filter = self.train_model([transition['prestate']],
                                                  [estimated_Q])

            #: the next line prints the first row of the first 8x8 filter in
            #  the first conv layer; uncomment it to make sure the weights do
            #  change as a result of learning (c.f. wiki Basic tests/Issue #7)
            #print "first line of filter applied to first img of first layer is: \n", first_filter[0][0]

    def predict_best_action(self, state):
        """
        Return the action with the highest predicted Q-value

        @param state: 4D array, input (game state) for which we want to know
            the best action
        """
        predicted_values_for_actions = self.predict_rewards(state)[0][0]
        #print "predicted best action", predicted_values_for_actions
        return np.argmax(predicted_values_for_actions)
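#: A minimal usage sketch, not part of the original file. The shapes below are
#: assumptions in the spirit of the DQN Atari setup (32-state batches of four
#: stacked 84x84 frames, four actions); the real values live in the code that
#: constructs the network.
if __name__ == '__main__':
    input_shape = (32, 4, 84, 84)
    #: two conv layers: 16 8x8 filters over 4 input maps, then 32 4x4 filters
    filter_shapes = [(16, 4, 8, 8), (32, 16, 4, 4)]
    strides = [4, 2]
    net = NeuralNet(input_shape, filter_shapes, strides, n_hidden=256, n_out=4)

    #: a fabricated one-transition minibatch, just to show the expected format
    transition = {'prestate': np.zeros((4, 84, 84), dtype=np.float32),
                  'action': 0,
                  'reward': 1.0,
                  'poststate': np.zeros((4, 84, 84), dtype=np.float32)}
    net.train([transition])
    print net.predict_best_action([transition['poststate']])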
#: Variant of the network that treats the learning rate as an input and
#: periodically line-searches for a good value with Nelder-Mead.
import os
import scipy.optimize


class NeuralNet:

    #: class-level state: the fallback learning rate and the history of
    #  learning rates chosen so far
    actual_learning_rate = 1e-5
    learning_rates = []

    def __init__(self, input_shape, filter_shapes, strides, n_hidden, n_out):
        '''
        Initialize a NeuralNet

        @param input_shape: tuple or list of length 4,
            (batch size, num input feature maps, image height, image width)
        @param filter_shapes: list of 2 (one per conv layer) 4-tuples:
            (number of filters, num input feature maps, filter height, filter width)
        @param strides: list of size 2, stride values for each conv layer
        @param n_hidden: int, number of neurons in the fully connected hidden layer
        @param n_out: int, number of nodes in the output layer
        '''
        #: create Theano variables corresponding to the input batch (x) and the
        #  expected output of the network (y)
        x = T.ftensor4('x')
        y = T.fmatrix('y')

        #: the first hidden layer is convolutional
        self.layer_hidden_conv1 = ConvolutionalLayer(x, filter_shapes[0],
                                                     input_shape, strides[0])

        #: second convolutional hidden layer: the size of its input depends on
        #  the output of the first layer and is defined as
        #  (num_batches, num_input_feature_maps, height_of_input_maps, width_of_input_maps)
        second_conv_input_shape = [input_shape[0], filter_shapes[0][0],
                                   self.layer_hidden_conv1.feature_map_size,
                                   self.layer_hidden_conv1.feature_map_size]
        self.layer_hidden_conv2 = ConvolutionalLayer(
            self.layer_hidden_conv1.output, filter_shapes[1],
            image_shape=second_conv_input_shape, stride=strides[1])

        #: the output of a convolutional layer is 4D, but a regular hidden
        #  layer expects 2D input. Because of the all-to-all connections, the
        #  third hidden layer does not care which feature map or position an
        #  input value comes from.
        flattened_input = self.layer_hidden_conv2.output.flatten(2)

        #: create the third (fully connected) hidden layer
        self.layer_hidden3 = HiddenLayer(flattened_input,
                                         self.layer_hidden_conv2.fan_out,
                                         n_hidden)

        #: create the output layer
        self.layer_output = OutputLayer(self.layer_hidden3.output, n_hidden,
                                        n_out)

        #: collect the parameters of the whole network
        self.params = self.layer_hidden_conv1.params + self.layer_hidden_conv2.params \
            + self.layer_hidden3.params + self.layer_output.params

        #: discount factor
        self.gamma = 0.95

        #: define the regularization terms; note that only the weights are
        #  regularized, not the biases.
        #  L1 (linear) term, useful for driving many weights to zero
        self.l1 = abs(self.layer_hidden_conv1.W).sum() \
            + abs(self.layer_hidden_conv2.W).sum() \
            + abs(self.layer_hidden3.W).sum() \
            + abs(self.layer_output.W).sum()

        #: squared L2 term, useful for keeping the weights small
        self.l2_sqr = (self.layer_hidden_conv1.W ** 2).sum() \
            + (self.layer_hidden_conv2.W ** 2).sum() \
            + (self.layer_hidden3.W ** 2).sum() \
            + (self.layer_output.W ** 2).sum()

        #: define the cost function (both regularization weights are currently zero)
        self.cost = 0.0 * self.l1 + 0.0 * self.l2_sqr \
            + self.layer_output.errors(y)
        #: compiled function to evaluate the cost without updating anything
        self.cost_function = theano.function([x, y], [self.cost])

        #: define the gradient calculation
        self.grads = T.grad(self.cost, self.params)

        #: the learning rate is a symbolic input so it can be varied per call
        self.learning_rate = T.scalar('lr')
        self.updates = []
        for param_i, gparam_i in zip(self.params, self.grads):
            self.updates.append(
                (param_i, param_i - self.learning_rate * gparam_i))

        self.x = x
        self.y = y

        #: we need another set of Theano variables (other than x and y) to use
        #  in the train and predict functions
        temp_x = T.ftensor4('temp_x')
        temp_y = T.fmatrix('temp_y')

        #: define the training operation as applying the updates calculated
        #  given temp_x and temp_y
        self.train_model = theano.function(
            inputs=[temp_x, temp_y,
                    theano.Param(self.learning_rate, default=0.00001)],
            outputs=[self.cost],
            updates=self.updates,
            givens={x: temp_x, y: temp_y},
            name='train_model')

        #: clone of the cost with the updates substituted in: it evaluates the
        #  cost the network would have *after* one gradient step at a given
        #  learning rate, without actually changing the parameters
        self.cost_clone = theano.clone(self.cost, replace=self.updates)
        self.line_function = theano.function([x, y, self.learning_rate],
                                             [self.cost_clone])

        self.predict_rewards = theano.function(
            inputs=[temp_x],
            outputs=[self.layer_output.output],
            givens={x: temp_x},
            name='predict_rewards')

        self.predict_rewards_and_cost = theano.function(
            inputs=[temp_x, temp_y],
            outputs=[self.layer_output.output, self.cost],
            givens={x: temp_x, y: temp_y},
            name='predict_rewards_and_cost')

    def optimal_learning_rate(self, prestates, new_estimated_Q, lr):
        #: one-dimensional minimization of the post-update cost over the
        #  learning rate, starting the Nelder-Mead search from lr
        objective = lambda rate: self.line_function(
            np.array(prestates), new_estimated_Q, float(rate))[0]
        res = scipy.optimize.minimize(objective, lr, method='Nelder-Mead',
                                      options={'xtol': 1e-1})
        print 'optimization result'
        print res
        self.learning_rates.append(max(1e-6, float(res.x)))

    def train(self, minibatch):
        """
        Transform (state, action, reward, state) transitions into
        (input, expected_output) pairs for the neural net and train the network

        @param minibatch: array of dictionaries, each dictionary contains one
            transition (prestate, action, reward, poststate, game_end)
        """
        prestates = [t['prestate'] for t in minibatch]
        initial_estimated_Q = self.predict_rewards(prestates)[0]
        new_estimated_Q = initial_estimated_Q.copy()

        poststates = [t['poststate'] for t in minibatch]
        post_eQ = [self.predict_rewards([s])[0] if s is not None else None
                   for s in poststates]
        actions = [t['action'] for t in minibatch]
        game_end_ps = [t['game_end'] for t in minibatch]
        rewards = np.array([t['reward'] for t in minibatch])

        #: the target for the chosen action is the transition reward plus, for
        #  non-terminal transitions, the discounted maximum poststate Q-value
        for row, (peQ, action, reward, game_end) in enumerate(
                zip(post_eQ, actions, rewards, game_end_ps)):
            new_estimated_Q[row, action] = \
                reward + (0 if game_end else self.gamma * np.max(peQ))

        initial_cost = self.cost_function(prestates, new_estimated_Q)

        optimal_learning_rate = lambda: self.optimal_learning_rate(
            prestates, new_estimated_Q,
            self.learning_rates[-1] if self.learning_rates
            else self.actual_learning_rate)

        #: re-run the line search every 50 minibatches, otherwise reuse the
        #  previous learning rate
        if (len(self.learning_rates) % 50) == 0:
            print 'computing optimal learning rate'
            optimal_learning_rate()
        else:
            self.learning_rates.append(self.learning_rates[-1])

        self.train_model(np.array(prestates), new_estimated_Q,
                         self.learning_rates[-1])

        final_cost = self.cost_function(prestates, new_estimated_Q)
        final_estimated_Q = self.predict_rewards(prestates)[0]
        print 'initial_cost', initial_cost, 'final_cost', final_cost
        print 'current rewards', (final_estimated_Q
                                  - final_estimated_Q.min(axis=0)).mean(axis=0)
        print 'current rewards absolute'
        for r, a, s in sorted(zip(rewards, actions,
                                  map(list, final_estimated_Q))):
            print r, a, s

        #: if the cost went up, the step was too large: search again
        if final_cost > initial_cost:
            print 'overstepped; computing current optimal learning rate'
            optimal_learning_rate()

        #: drop into the debugger when a stop file exists
        if os.path.exists('/var/tmp/stop'):
            import pdb
            pdb.set_trace()

    def predict_best_action(self, state):
        """
        Return the action with the highest predicted Q-value

        @param state: 4D array, input (game state) for which we want to know
            the best action
        """
        predicted_values_for_actions = self.predict_rewards(state)[0][0]
        return np.argmax(predicted_values_for_actions)
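#: An isolated sketch of the line-search idea used above, not part of the
#: original file. cost_at() is a hypothetical stand-in for self.line_function:
#: it returns the cost the network would have after one SGD step with the given
#: learning rate, and Nelder-Mead picks the rate that minimizes it.
def _line_search_demo():
    def cost_at(rate):
        #: toy quadratic with its minimum at rate = 3e-5, standing in for the
        #  cloned post-update network cost
        return (float(rate) - 3e-5) ** 2

    res = scipy.optimize.minimize(cost_at, 1e-5, method='Nelder-Mead',
                                  options={'xtol': 1e-1})
    print res.x  # Nelder-Mead's estimate of the best learning rate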
#: Variant using double-precision tensors and a fixed 0.01 learning rate;
#: Q-value targets are built for the whole minibatch at once.
class NeuralNet:

    def __init__(self, input_shape, filter_shapes, strides, n_hidden, n_out):
        x = T.dtensor4('x')
        y = T.dmatrix('y')

        self.layer_hidden_conv1 = ConvolutionalLayer(x, filter_shapes[0],
                                                     input_shape, strides[0])
        second_conv_input_shape = [input_shape[0], filter_shapes[0][0],
                                   self.layer_hidden_conv1.feature_map_size,
                                   self.layer_hidden_conv1.feature_map_size]
        self.layer_hidden_conv2 = ConvolutionalLayer(
            self.layer_hidden_conv1.output, filter_shapes[1],
            image_shape=second_conv_input_shape, stride=strides[1])

        flattened_input = self.layer_hidden_conv2.output.flatten(2)
        self.layer_hidden3 = HiddenLayer(flattened_input,
                                         self.layer_hidden_conv2.fan_out,
                                         n_hidden)
        self.layer_output = OutputLayer(self.layer_hidden3.output, n_hidden,
                                        n_out)

        self.params = self.layer_hidden_conv1.params + self.layer_hidden_conv2.params \
            + self.layer_hidden3.params + self.layer_output.params

        #: discount factor
        self.gamma = 0.95

        self.L1 = abs(self.layer_hidden_conv1.W).sum() \
            + abs(self.layer_hidden_conv2.W).sum() \
            + abs(self.layer_hidden3.W).sum() \
            + abs(self.layer_output.W).sum()

        #: square of the L2 norm; one regularization option is to force the
        #  square of the L2 norm to be small
        self.L2_sqr = (self.layer_hidden_conv1.W ** 2).sum() \
            + (self.layer_hidden_conv2.W ** 2).sum() \
            + (self.layer_hidden3.W ** 2).sum() \
            + (self.layer_output.W ** 2).sum()

        cost = 0.0 * self.L1 + 0.0 * self.L2_sqr + self.layer_output.errors(y)
        grads = T.grad(cost, self.params)

        #: define how much we change the parameter values on each update
        learning_rate = 0.01
        updates = []
        for param_i, gparam_i in zip(self.params, grads):
            updates.append((param_i, param_i - learning_rate * gparam_i))

        temp1 = T.dtensor4('temp1')
        temp2 = T.dmatrix('temp2')
        self.train_model = theano.function(inputs=[temp1, temp2],
                                           outputs=[cost],
                                           updates=updates,
                                           givens={x: temp1, y: temp2})

        #: experiment with shared-variable inputs, kept for reference
        #self.shared_q = theano.shared(np.zeros((32, 4)))
        #self.shared_s = theano.shared(np.zeros((32, 4, 84, 84)))
        #self.train_model_shared = theano.function(inputs=[], outputs=[cost],
        #                                          updates=updates,
        #                                          givens={x: self.shared_s,
        #                                                  y: self.shared_q})

        self.predict_rewards = theano.function(
            inputs=[temp1],
            outputs=[self.layer_output.output],
            givens={x: temp1})

        self.predict_rewards_and_cost = theano.function(
            inputs=[temp1, temp2],
            outputs=[self.layer_output.output, cost],
            givens={x: temp1, y: temp2})

    def train(self, minibatch):
        states = []
        expected_Qs = []
        #: predict Q-values for all prestates and all poststates, one batch each
        states1 = [element['prestate'] for element in minibatch]
        states2 = [element['poststate'] for element in minibatch]
        current_predicted_rewards = self.predict_rewards(states1)[0]
        predicted_future_rewards = self.predict_rewards(states2)[0]
        for i, transition in enumerate(minibatch):
            #: replace the chosen action's Q-value with the bootstrapped target
            rewards = current_predicted_rewards[i]
            rewards[transition['action']] = transition['reward'] \
                + self.gamma * np.max(predicted_future_rewards[i])
            states.append(transition['prestate'])
            expected_Qs.append(rewards)
        #self.shared_s = theano.shared(states)
        #self.shared_q = theano.shared(expected_Qs)
        #print "expected", expected_Qs[0]
        #print "expected", self.shared_q.eval()[0]
        #print self.predict_rewards_and_cost(self.shared_s.eval(), self.shared_q.eval())[0][0]
        #return self.train_model_shared()
        self.train_model(states, expected_Qs)
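#: A tiny self-contained check of the target construction in train() above,
#: not part of the original file: each row of the predicted Q matrix gets
#: exactly one entry, the chosen action's, replaced by
#: reward + gamma * max(poststate Q).
def _target_demo():
    gamma = 0.95
    q = np.array([[0.1, 0.4], [0.3, 0.2]])   # predicted Q for two prestates
    post_max = np.array([1.0, 2.0])           # max poststate Q per transition
    actions = [1, 0]
    rewards = [0.0, 1.0]
    for i in range(len(actions)):
        q[i, actions[i]] = rewards[i] + gamma * post_max[i]
    print q  # [[0.1, 0.95], [2.9, 0.2]]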
#: Variant instrumented for profiling; train and predict_best_action carry
#: @profile decorators.
class NeuralNet:

    def __init__(self, input_shape, filter_shapes, strides, n_hidden, n_out):
        '''
        Initialize a NeuralNet

        @param input_shape: tuple or list of length 4,
            (batch size, num input feature maps, image height, image width)
        @param filter_shapes: list of 2 (one per conv layer) 4-tuples:
            (number of filters, num input feature maps, filter height, filter width)
        @param strides: list of size 2, stride values for each conv layer
        @param n_hidden: int, number of neurons in the fully connected hidden layer
        @param n_out: int, number of nodes in the output layer
        '''
        #: create Theano variables corresponding to the input batch (x) and the
        #  expected output of the network (y)
        x = T.ftensor4('x')
        y = T.fmatrix('y')

        #: the first hidden layer is convolutional
        self.layer_hidden_conv1 = ConvolutionalLayer(x, filter_shapes[0],
                                                     input_shape, strides[0])

        #: second convolutional hidden layer: the size of its input depends on
        #  the output of the first layer and is defined as
        #  (num_batches, num_input_feature_maps, height_of_input_maps, width_of_input_maps)
        second_conv_input_shape = [input_shape[0], filter_shapes[0][0],
                                   self.layer_hidden_conv1.feature_map_size,
                                   self.layer_hidden_conv1.feature_map_size]
        self.layer_hidden_conv2 = ConvolutionalLayer(
            self.layer_hidden_conv1.output, filter_shapes[1],
            image_shape=second_conv_input_shape, stride=strides[1])

        #: the output of a convolutional layer is 4D, but a regular hidden
        #  layer expects 2D input. Because of the all-to-all connections, the
        #  third hidden layer does not care which feature map or position an
        #  input value comes from.
        flattened_input = self.layer_hidden_conv2.output.flatten(2)

        #: create the third (fully connected) hidden layer
        self.layer_hidden3 = HiddenLayer(flattened_input,
                                         self.layer_hidden_conv2.fan_out,
                                         n_hidden)

        #: create the output layer
        self.layer_output = OutputLayer(self.layer_hidden3.output, n_hidden,
                                        n_out)

        #: collect the parameters of the whole network
        self.params = self.layer_hidden_conv1.params + self.layer_hidden_conv2.params \
            + self.layer_hidden3.params + self.layer_output.params

        #: discount factor
        self.gamma = 0.95

        #: define the regularization terms; note that only the weights are
        #  regularized, not the biases.
        #  L1 (linear) term, useful for driving many weights to zero
        self.l1 = abs(self.layer_hidden_conv1.W).sum() \
            + abs(self.layer_hidden_conv2.W).sum() \
            + abs(self.layer_hidden3.W).sum() \
            + abs(self.layer_output.W).sum()

        #: squared L2 term, useful for keeping the weights small
        self.l2_sqr = (self.layer_hidden_conv1.W ** 2).sum() \
            + (self.layer_hidden_conv2.W ** 2).sum() \
            + (self.layer_hidden3.W ** 2).sum() \
            + (self.layer_output.W ** 2).sum()

        #: define the cost function (both regularization weights are currently zero)
        cost = 0.0 * self.l1 + 0.0 * self.l2_sqr + self.layer_output.errors(y)

        #: define the gradient calculation
        grads = T.grad(cost, self.params)

        #: define how much we change the parameter values on each update
        learning_rate = 0.0001
        updates = []
        for param_i, gparam_i in zip(self.params, grads):
            updates.append((param_i, param_i - learning_rate * gparam_i))

        #: we need another set of Theano variables (other than x and y) to use
        #  in the train and predict functions
        temp_x = T.ftensor4('temp_x')
        temp_y = T.fmatrix('temp_y')

        #: define the training operation as applying the updates calculated
        #  given temp_x and temp_y
        self.train_model = theano.function(inputs=[temp_x, temp_y],
                                           outputs=[cost],
                                           updates=updates,
                                           givens={x: temp_x, y: temp_y})

        self.predict_rewards = theano.function(
            inputs=[temp_x],
            outputs=[self.layer_output.output],
            givens={x: temp_x})

        self.predict_rewards_and_cost = theano.function(
            inputs=[temp_x, temp_y],
            outputs=[self.layer_output.output, cost],
            givens={x: temp_x, y: temp_y})

    @profile
    def train(self, minibatch):
        """
        Transform (state, action, reward, state) transitions into
        (input, expected_output) pairs for the neural net and train the network

        @param minibatch: array of dictionaries, each dictionary contains one
            transition (prestate, action, reward, poststate)
        """
        #: we have a new, better estimate for the Q-value of the action we
        #  chose: the reward received on the transition plus the discounted
        #  maximum of future rewards. Q-values of the other actions stay the same.
        for i, transition in enumerate(minibatch):
            estimated_Q = self.predict_rewards([transition['prestate']])[0][0]
            #: the maximum future reward is taken over the poststate's Q-values
            estimated_Q[transition['action']] = transition['reward'] + self.gamma \
                * np.max(self.predict_rewards([transition['poststate']]))
            #: knowing what estimated_Q looks like, we can train the model
            self.train_model([transition['prestate']], [estimated_Q])

    @profile
    def predict_best_action(self, state):
        """
        Return the action with the highest predicted Q-value

        @param state: 4D array, input (game state) for which we want to know
            the best action
        """
        predicted_values_for_actions = self.predict_rewards(state)[0][0]
        return np.argmax(predicted_values_for_actions)
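#: The bare @profile decorators above have no import, which (assuming the usual
#: convention) means this file is meant to run under kernprof, line_profiler's
#: runner, which injects `profile` into builtins. A typical invocation, with a
#: hypothetical file name:
#:
#:   kernprof -l neural_net.py
#:   python -m line_profiler neural_net.py.lprof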