def build_model(self):
    # TODO: should we set exp rate as a placeholder?
    self.t_state = [tf.placeholder(dtype=tf.float32,
                                   shape=(None,) + od[:-1] +
                                   (od[-1] * self.config["inner_state_params"].get("num_steps", 1),))
                    for od in self.observation_dims]
    self.t_action = tf.placeholder(dtype=tf.int32, shape=(None,))  # for discrete action space
    self.t_discounted_reward = tf.placeholder(dtype=tf.float32, shape=(None,))
    batch_size = tf.shape(self.t_state[0])[0]  # batch size taken from the first state placeholder
    random_action_probs = tf.fill((batch_size, self.action_dim), 1.0 / self.action_dim)
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    '''
    self.exp_rate = self.config["init_exp_rate"] * (self.config["anneal_base_exp"] **
        tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_exp"]), tf.float32))
    '''
    # linearly anneal the exploration rate from init_exp_rate down to min_exp
    self.exp_rate = tf.cast(tf.maximum(self.config['anneal_step_exp'] - self.global_step, 0),
                            tf.float32) / (1.0 * self.config['anneal_step_exp']) * \
        (self.config["init_exp_rate"] - self.config["min_exp"]) + self.config["min_exp"]
    # exponentially decay the learning rate, with a floor at min_lr
    self.learning_rate = tf.maximum(
        self.config["init_learning_rate"] *
        (self.config["anneal_base_lr"] **
         tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_lr"]), tf.float32)),
        self.config["min_lr"])
    self.actor = Estimator(self.config['estimator_params']['policy_network']['name']).get_estimator(
        inputs=self.t_state, num_out=self.action_dim, scope='policy_network',
        **self.config['estimator_params']['policy_network'])
    self.critic = Estimator(self.config['estimator_params']['value_network']['name']).get_estimator(
        inputs=self.t_state, num_out=1, scope='value_network',
        **self.config['estimator_params']['value_network'])
    policy_network_variables = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy_network")
    self.action_probs = tf.nn.softmax(self.actor)
    # epsilon-greedy style sampling: with probability exp_rate draw a uniformly random action
    self.explore = tf.less(tf.random_uniform([batch_size]), self.exp_rate)
    self.action_sampler = tf.select(self.explore,
                                    tf.multinomial(random_action_probs, num_samples=1),
                                    tf.multinomial(self.action_probs, num_samples=1))
    # TODO: seed?
    # TODO: how to measure global_step?
    value_network_variables = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope="value_network")
    advantage = self.t_discounted_reward - self.critic
    self.actor_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(self.actor, self.t_action) * advantage)
    self.critic_loss = tf.reduce_mean(tf.square(self.t_discounted_reward - self.critic))
    self.add_reg(policy_network_variables)
    self.add_reg(value_network_variables)
    self.loss = (self.actor_loss + self.critic_loss)
    self.loss += self.reg_loss
    self.build_train()
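# A minimal standalone sketch of the linear exploration-rate schedule built above,
# using hypothetical config values: exp_rate falls linearly from init_exp_rate to
# min_exp over anneal_step_exp steps and then stays at min_exp.
def linear_exp_rate(global_step, init_exp_rate=1.0, min_exp=0.1, anneal_step_exp=1000):
    # same arithmetic as the tf expression above, evaluated eagerly for illustration
    frac = max(anneal_step_exp - global_step, 0) / float(anneal_step_exp)
    return frac * (init_exp_rate - min_exp) + min_exp

# e.g. linear_exp_rate(0) == 1.0, linear_exp_rate(500) == 0.55, linear_exp_rate(2000) == 0.1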
def ranked_captains_mode_selected(self):
    self.radiant_pick = SelectionView('Radiant Pick', self)
    self.dire_pick = SelectionView('Dire Pick', self)
    self.radiant_ban = SelectionView('Radiant Ban', self)
    self.dire_ban = SelectionView('Dire Ban', self)

    v_layout = QtGui.QVBoxLayout()

    h_layout = QtGui.QHBoxLayout()
    h_layout.addWidget(self.radiant_ban)
    h_layout.addWidget(self.dire_ban)
    v_layout.addLayout(h_layout)

    h_layout = QtGui.QHBoxLayout()
    h_layout.addWidget(self.radiant_pick)
    h_layout.addWidget(self.dire_pick)
    v_layout.addLayout(h_layout)

    self.layout().addLayout(v_layout, 1, 1, 1, 1)

    selections = {
        self.radiant_pick.name: self.radiant_pick,
        self.dire_pick.name: self.dire_pick,
        self.radiant_ban.name: self.radiant_ban,
        self.dire_ban.name: self.dire_ban
    }
    self.cycle = RankedCaptainsModeCycle(selections)
    self.estimator = Estimator(self.cycle, self.heropool)
def __init__(self, **kwargs):
    super(MNISTFunction, self).__init__(**kwargs)
    self.batch_size = kwargs['batch_size']
    self.mnist = input_data.read_data_sets("MNIST_data/", one_hot=False)
    self.gen_train = self.mnist.train
    X, y = self.gen_train.next_batch(1)
    self.x_dim = X.shape[1]
    self.t_X = tf.placeholder(dtype=tf.float32, shape=(None, self.x_dim), name='MNIST_X')
    self.t_y = tf.placeholder(dtype=tf.int32, shape=(None,), name='MNIST_y')
    self.logits = Estimator('fc').get_estimator(
        inputs=[self.t_X], num_out=10, num_hids=[500, 300],
        trainable=True, scope='mnist_lenet')
    self.loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, self.t_y))
    self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='World')
    self.build_train()
    self.build_state()
def main():
    config_file = "configs/ddpgcont_parallel_quad_func.json"
    max_iter = 1000
    #max_iter = 100
    bsize, num_vars = 40, 200
    max_x = 10
    config = json.load(open(config_file))
    save_file = 'weights/' + config_file.split('/')[-1].split('.')[0] + \
        '/pretrain_%s_%s.ckpt' % (config['env_config']['action'], config['env_config']['state'])
    actor_config = config['estimator_params']['policy_network']
    dim_features = 1 if config['env_config']['state'] in ['gradient', 'variable'] else 2
    inputs = tf.placeholder(dtype=tf.float32,
                            shape=(bsize, actor_config['num_features'], num_vars, dim_features))
    actor = Estimator(config['estimator_params']['policy_network']['name']).get_estimator(
        inputs=[tf.reshape(inputs, (bsize, dim_features, actor_config['num_features'] * num_vars))],
        scope='actor', **actor_config)
    if config['env_config']['action'] == 'step':
        loss = tf.reduce_sum(tf.square(tf.squeeze(actor) - inputs[:, -1, :, -1]))
    elif config['env_config']['action'] == 'coordinate_lr':
        loss = tf.reduce_sum(tf.square(tf.squeeze(actor) - config['env_config']['base_lr']))
    elif config['env_config']['action'] == 'params':
        # note: no pretraining loss is defined for the 'params' action
        if config['env_config']['opt_method'] == 'rmsprop':
            num_actions = 4
        elif config['env_config']['opt_method'] == 'sgd':
            num_actions = 1
    train_op = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9).minimize(loss)
    sess = tf.Session()
    loader = tf.train.Saver()
    if os.path.exists(save_file):
        loader.restore(sess, save_file)
    else:
        sess.run(tf.initialize_all_variables())
    for i in xrange(max_iter):
        X = np.random.uniform(-max_x, max_x,
                              (bsize, actor_config['num_features'], num_vars, dim_features))
        feed = {inputs: X}
        _, loss_v = sess.run([train_op, loss], feed)
        if i % 10 == 0:
            print "iter %d, loss %f" % (i, loss_v)
    '''
    # create variables for actor_target
    variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    new_vars = []
    with tf.variable_scope('actor_target'):
        for v in variables:
            vname = v.name.replace('actor/', '').split(':')[0]
            new_vars.append(tf.get_variable(vname, initializer=v))
    sess.run(tf.initialize_variables(new_vars))
    '''
    saver = tf.train.Saver()
    saver.save(sess, save_file)
    print "save to " + save_file
def test_estimator_init(self):
    es = Estimator()
    rfc = RandomForestClassifier()
    es.estimator = rfc
    assert es.estimator is rfc
    assert es.hash == '83d11c9bf77830ad42c4e93abe9cf397'
    assert es.file_name == 'files/estimators/83d11c9bf77830ad42c4e93abe9cf397'
def build_graph(data, hidden_dims, hidden_dims_extra, activation, optimizer,
                learning_rate, model_path, model_id, model_name):
    model_graph = tf.Graph()
    with model_graph.as_default():
        estimator = Estimator(data, hidden_dims, hidden_dims_extra, activation,
                              optimizer, learning_rate, model_path, model_id, model_name)
    return model_graph, estimator
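# A hedged usage sketch for build_graph. The argument values below are hypothetical
# placeholders, not values taken from this repo (the exact types expected for
# activation/optimizer depend on the Estimator class); the point is only the pattern of
# constructing the Estimator inside its own tf.Graph and running a session pinned to it.
def example_build_graph_usage(data):
    graph, estimator = build_graph(
        data,
        hidden_dims=[100, 50],        # hypothetical layer sizes
        hidden_dims_extra=[50],       # elsewhere in this code: the last hidden size
        activation='relu',            # assumption about the expected format
        optimizer='adam',             # assumption about the expected format
        learning_rate=0.001,
        model_path='models/example',  # hypothetical path
        model_id=None,
        model_name='Estimator')
    with tf.Session(graph=graph) as sess:
        # train / evaluate here, e.g. estimator.train(sess, ...)
        pass
    return graph, estimator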
def ranked_all_pick_selected(self):
    self.radiant_pick = SelectionView('Radiant Pick', self)
    self.dire_pick = SelectionView('Dire Pick', self)

    h_layout = QtGui.QHBoxLayout()
    h_layout.addWidget(self.radiant_pick)
    h_layout.addWidget(self.dire_pick)
    self.layout().addLayout(h_layout, 1, 1, 1, 1)

    selections = {
        self.radiant_pick.name: self.radiant_pick,
        self.dire_pick.name: self.dire_pick
    }
    self.cycle = RankedAllPickCycle(selections)
    self.estimator = Estimator(self.cycle, self.heropool)
def __init__(self, **kwargs):
    super(SimpleNNFunction, self).__init__(**kwargs)
    self.init_data()
    self.t_X = tf.placeholder(dtype=tf.float32, shape=(None, self.X.shape[1]), name='X')
    self.t_y = tf.placeholder(dtype=tf.int32, shape=(None,), name='y')
    estimator_params = {"name": "fc", "num_hids": [20], "trainable": True}
    self.logits = Estimator(estimator_params['name']).get_estimator(
        inputs=[self.t_X], num_out=self.ydim, scope='estimator', **estimator_params)
    self.loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, self.t_y))
    self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='World')
    self.build_train()
    self.build_state()
def main(args):
    if args.explore:
        # sample a bunch of parameters, and run those experiments
        feats, hidds, activs, optims, lrs, drs, bss = sample_parameters(args.explore)
        best_args = []  # store the best combination
        best_valid_acc = -100000.  # store the best validation accuracy
        best_model = None  # store the best model id
        valid_threshold = args.threshold  # accuracy must be higher for model to be saved
        print "Will try %d different configurations..." % args.explore
        for idx in range(args.explore):
            with tf.Session() as sess:
                try:
                    # print sampled parameters
                    print "\n[%d] sampled features:\n%s" % (idx + 1, feats[idx])
                    print "[%d] sampled hidden_sizes: %s" % (idx + 1, hidds[idx])
                    print "[%d] extra hidden_sizes: %s" % (idx + 1, hidds[idx][-1])
                    print "[%d] sampled activation: %s" % (idx + 1, activs[idx])
                    print "[%d] sampled optimizer: %s" % (idx + 1, optims[idx])
                    print "[%d] sampled learning rate: %g" % (idx + 1, lrs[idx])
                    print "[%d] sampled dropout rate: %g" % (idx + 1, drs[idx])
                    print "[%d] sampled batch size: %d" % (idx + 1, bss[idx])
                    # Load datasets
                    data = get_data(args.data, MODE_TO_TARGET[args.mode],
                                    feature_list=feats[idx])
                    n_folds = len(data[0])
                    print "[%d] Building the network..." % (idx + 1, )
                    estimator = Estimator(data, hidds[idx], [hidds[idx][-1]],
                                          activs[idx], optims[idx], lrs[idx],
                                          model_path='models/%s' % args.mode)
                    print "[%d] Training the network..." % (idx + 1, )
                    estimator.train(sess,
                                    MODE_TO_FLAG[args.mode],
                                    args.patience,
                                    bss[idx],
                                    drs[idx],
                                    save=False,  # don't save for now
                                    pretrained=None,
                                    verbose=False)
                    # Only consider last `n_folds` accuracies!
                    max_train = [max(estimator.train_accuracies[i])
                                 for i in range(-n_folds, 0)]
                    max_valid = [max(estimator.valid_accuracies[i])
                                 for i in range(-n_folds, 0)]
                    print "[%d] max train accuracies: %s" % (idx + 1, max_train)
                    print "[%d] max valid accuracies: %s" % (idx + 1, max_valid)
                    train_acc = np.mean(max_train)
                    valid_acc = np.mean(max_valid)
                    print "[%d] best avg. train accuracy: %g" % (idx + 1, train_acc)
                    print "[%d] best avg. valid accuracy: %g" % (idx + 1, valid_acc)
                    # save now if we got a good model
                    if valid_acc > valid_threshold:
                        estimator.save(sess)
                    # update variables if we got better model
                    if valid_acc > best_valid_acc:
                        print "[%d] got better accuracy! new: %g > old: %g" % (
                            idx + 1, valid_acc, best_valid_acc)
                        best_valid_acc = valid_acc
                        best_model = estimator.model_id
                        best_args = [feats[idx], hidds[idx], activs[idx], optims[idx],
                                     lrs[idx], drs[idx], bss[idx]]
                    else:
                        print "[%d] best validation accuracy is still %g" % (
                            idx + 1, best_valid_acc)
                # end of try block, catch CTRL+C errors to print current results
                except KeyboardInterrupt as e:
                    print e
                    print "best model: %s" % best_model
                    print "with parameters:"
                    print " - features:\n%s" % (best_args[0], )
                    print " - hidden_sizes: %s" % (best_args[1], )
                    print " - activation: %s" % (best_args[2], )
                    print " - optimizer: %s" % (best_args[3], )
                    print " - learning rate: %g" % (best_args[4], )
                    print " - dropout rate: %g" % (best_args[5], )
                    print " - batch size: %d" % (best_args[6], )
                    print "with average valid accuracy: %g" % best_valid_acc
                    sys.exit()
            # end of tensorflow session, reset for the next graph
            tf.reset_default_graph()
        # end of exploration, print best results:
        print "done!"
        print "best model: %s" % best_model
        print "with parameters:"
        print " - features:\n%s" % (best_args[0], )
        print " - hidden_sizes: %s" % (best_args[1], )
        print " - activation: %s" % (best_args[2], )
        print " - optimizer: %s" % (best_args[3], )
        print " - learning rate: %g" % (best_args[4], )
        print " - dropout rate: %g" % (best_args[5], )
        print " - batch size: %d" % (best_args[6], )
        print "with average valid accuracy: %g" % best_valid_acc
    else:
        # run one experiment with provided parameters
        # load previously trained model.
        if args.previous_model:
            old_data, \
                hidden_sizes, hidden_sizes_extra, activation, \
                optimizer, learning_rate, \
                model_path, model_id, model_name, \
                batch_size, dropout_rate, pretrained, \
                train_accuracies, valid_accuracies = load_previous_model(args.previous_model)
            # Load provided dataset, but with the same features as previous model
            data = get_data(args.data, MODE_TO_TARGET[args.mode], feature_list=old_data[-1])
            # set pretrained to this model name
            pretrained = (model_path, model_id, model_name)
            # now update this model name to not override previous one
            model_name = "%s" % args.previous_model.replace('models', '').replace(
                '.', '').replace('//', '').replace('/', '.')
            # store previous_accuracies
            previous_accuracies = (train_accuracies, valid_accuracies)
        else:
            # else, build current parameters
            data = get_data(args.data, MODE_TO_TARGET[args.mode])
            hidden_sizes = args.hidden_sizes
            hidden_sizes_extra = [args.hidden_sizes[-1]]
            activation = args.activation
            optimizer = args.optimizer
            learning_rate = args.learning_rate
            model_id = None  # keep the default one
            model_name = 'Estimator'
            batch_size = args.batch_size
            dropout_rate = args.dropout_rate
            pretrained = None
            previous_accuracies = None
        n_folds = len(data[0])
        print "Building the network..."
        estimator = Estimator(data, hidden_sizes, hidden_sizes_extra, activation,
                              optimizer, learning_rate,
                              model_path='models/%s' % args.mode,
                              model_id=model_id,
                              model_name=model_name)
        with tf.Session() as sess:
            print "Training the network..."
            estimator.train(sess,
                            MODE_TO_FLAG[args.mode],
                            args.patience,
                            batch_size,
                            dropout_rate,
                            save=True,
                            pretrained=pretrained,
                            previous_accuracies=previous_accuracies,
                            verbose=True)
            max_train = [max(estimator.train_accuracies[i]) for i in range(-n_folds, 0)]
            max_valid = [max(estimator.valid_accuracies[i]) for i in range(-n_folds, 0)]
            print "max train accuracies: %s" % (max_train, )
            print "max valid accuracies: %s" % (max_valid, )
            train_acc = np.mean(max_train)
            valid_acc = np.mean(max_valid)
            print "best avg. train accuracy: %g" % train_acc
            print "best avg. valid accuracy: %g" % valid_acc
            print "done."
def build_model(self):
    # TODO: should we set exp rate as a placeholder?
    self.t_state = [tf.placeholder(dtype=tf.float32, name='t_state_%d' % i,
                                   shape=(None,) + od[:-1] +
                                   (od[-1] * self.config["inner_state_params"].get("num_steps", 1),))
                    for i, od in enumerate(self.observation_dims)]
    #print self.t_state[0].get_shape(), self.observation_dims, self.action_dim
    self.t_state_new = [tf.placeholder(dtype=tf.float32, name='t_state_new_%d' % i,
                                       shape=(None,) + od[:-1] +
                                       (od[-1] * self.config["inner_state_params"].get("num_steps", 1),))
                        for i, od in enumerate(self.observation_dims)]
    self.t_action = tf.placeholder(dtype=tf.int32, name='t_action',
                                   shape=(None,) + self.action_dim)  # for action space
    self.t_discounted_reward = tf.placeholder(dtype=tf.float32, name='t_discounted_reward',
                                              shape=(None,))
    self.t_reward = tf.placeholder(dtype=tf.float32, name='t_reward', shape=(None,))
    batch_size = tf.shape(self.t_state[0])[0]  # batch size taken from the first state placeholder
    #TODO: only used for discrete action:
    #random_action_probs = tf.fill((batch_size, self.action_dim), 1.0 / self.action_dim)
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    '''
    self.exp_rate = self.config["init_exp_rate"] * (self.config["anneal_base_exp"] **
        tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_exp"]), tf.float32))
    '''
    # linearly anneal the exploration noise scale from init_exp_rate down to min_exp
    self.exp_rate = tf.cast(tf.maximum(self.config['anneal_step_exp'] - self.global_step, 0),
                            tf.float32) / (1.0 * self.config['anneal_step_exp']) * \
        (self.config["init_exp_rate"] - self.config["min_exp"]) + self.config["min_exp"]
    # exponentially decay the learning rate, with a floor at min_lr
    self.learning_rate = tf.maximum(
        self.config["init_learning_rate"] *
        (self.config["anneal_base_lr"] **
         tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_lr"]), tf.float32)),
        self.config["min_lr"])
    #self.config['estimator_params']['policy_network']['trainable'] = self.config['trainable']
    #self.config['estimator_params']['critic_network']['trainable'] = self.config['trainable']
    self.actor = Estimator(self.config['estimator_params']['policy_network']['name']).get_estimator(
        inputs=self.t_state, num_out=np.prod(self.action_dim), scope='actor',
        **self.config['estimator_params']['policy_network'])
    #print self.actor.get_shape(), (-1,) + self.action_dim
    self.actor_id = tf.identity(self.actor)
    self.actor = tf.reshape(self.actor, (-1,) + self.action_dim)
    #TODO:
    #self.action_scale = 1e-3
    self.action_scale = 1
    self.actor *= self.action_scale
    value_network_estimator = Estimator(self.config['estimator_params']['value_network']['name'])
    self.critic = value_network_estimator.get_estimator(
        inputs=self.t_state, actions=self.t_action, num_out=1, scope='critic',
        **self.config['estimator_params']['value_network'])
    self.critic_with_actor = value_network_estimator.get_estimator(
        inputs=self.t_state, actions=self.actor, num_out=1, scope='critic',
        **self.config['estimator_params']['value_network'])
    self.action_sampler_deterministic = self.actor
    # exploration: add Gaussian noise whose scale is the annealed exp_rate
    self.action_sampler = self.actor + tf.random_normal(tf.shape(self.actor),
                                                        stddev=self.exp_rate)
    actor_target_config = deepcopy(self.config['estimator_params']['policy_network'])
    actor_target_config['trainable'] = False
    self.actor_target = Estimator(self.config['estimator_params']['policy_network']['name']).get_estimator(
        inputs=self.t_state_new, num_out=np.prod(self.action_dim), scope='actor_target',
        **actor_target_config)
    #print self.actor_target.get_shape(), (-1,) + self.action_dim
    self.actor_target = tf.reshape(self.actor_target, (-1,) + self.action_dim)
    self.actor_target *= self.action_scale
    # the target critic must mirror the online critic's (value network) configuration
    critic_target_config = deepcopy(self.config['estimator_params']['value_network'])
    critic_target_config['trainable'] = False
    self.critic_target = Estimator(self.config['estimator_params']['value_network']['name']).get_estimator(
        inputs=self.t_state_new, actions=self.actor_target, num_out=1, scope='critic_target',
        **critic_target_config)
    actor_variables = dict([('/'.join(v.name.split('/')[1:]), v)
                            for v in tf.get_collection(tf.GraphKeys.VARIABLES,
                                                       scope="actor")])
    critic_variables = dict([('/'.join(v.name.split('/')[1:]), v)
                             for v in tf.get_collection(tf.GraphKeys.VARIABLES,
                                                        scope="critic")])
    actor_target_variables = dict([('/'.join(v.name.split('/')[1:]), v)
                                   for v in tf.get_collection(tf.GraphKeys.VARIABLES,
                                                              scope="actor_target")])
    critic_target_variables = dict([('/'.join(v.name.split('/')[1:]), v)
                                    for v in tf.get_collection(tf.GraphKeys.VARIABLES,
                                                               scope="critic_target")])
    # hard copy: initialize the target networks with the online weights
    self.target_init_ops = []
    for k, v in actor_variables.iteritems():
        self.target_init_ops.append(actor_target_variables[k].assign(v))
    for k, v in critic_variables.iteritems():
        self.target_init_ops.append(critic_target_variables[k].assign(v))
    # soft (Polyak) updates: target <- tau * online + (1 - tau) * target
    self.target_update_ops = []
    tau = self.config["tau"]
    for k, v in actor_variables.iteritems():
        self.target_update_ops.append(actor_target_variables[k].assign(
            tau * v + (1.0 - tau) * actor_target_variables[k]))
    for k, v in critic_variables.iteritems():
        self.target_update_ops.append(critic_target_variables[k].assign(
            tau * v + (1.0 - tau) * critic_target_variables[k]))
    self.actor_loss = -tf.reduce_mean(self.critic_with_actor)
    self.target = tf.reshape(self.t_reward,
                             [-1] + [1] * (self.critic.get_shape().ndims - 1)) \
        + self.config["discount_rate"] * self.critic_target
    self.critic_loss = tf.reduce_mean(tf.square(self.target - self.critic))
    #print self.critic_target.get_shape(), self.target.get_shape(), self.critic.get_shape()
    self.add_reg(actor_variables.values())
    self.add_reg(critic_variables.values())
    self.loss = (self.actor_loss + self.critic_loss)
    self.loss += self.reg_loss
    if self.learning:
        self.build_train()
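# A small NumPy sketch (hypothetical values) of the target-network updates built above:
# target_init_ops copy the online weights once, then target_update_ops apply the soft
# update target <- tau * online + (1 - tau) * target after each training step.
import numpy as np

def soft_update(online, target, tau=0.001):
    # same arithmetic as the assign ops above, on plain arrays
    return tau * online + (1.0 - tau) * target

# w_online = np.array([1.0, 2.0]); w_target = np.zeros(2)
# one soft_update with tau=0.001 gives w_target == [0.001, 0.002]: the target network
# trails the online network slowly, which keeps the bootstrapped critic targets stable.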
def build_model(self):
    # TODO: should we set exp rate as a placeholder?
    self.t_state = [tf.placeholder(dtype=tf.float32,
                                   shape=(None,) + od[:-1] +
                                   (od[-1] * self.config["inner_state_params"].get("num_steps", 1),))
                    for od in self.observation_dims]
    self.t_state_new = [tf.placeholder(dtype=tf.float32,
                                       shape=(None,) + od[:-1] +
                                       (od[-1] * self.config["inner_state_params"].get("num_steps", 1),))
                        for od in self.observation_dims]
    self.t_action = tf.placeholder(dtype=tf.int32, shape=(None,))  # for discrete action space
    self.t_discounted_reward = tf.placeholder(dtype=tf.float32, shape=(None,))
    self.t_reward = tf.placeholder(dtype=tf.float32, shape=(None,))
    batch_size = tf.shape(self.t_state[0])[0]  # batch size taken from the first state placeholder
    random_action_probs = tf.fill((batch_size, self.action_dim), 1.0 / self.action_dim)
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    '''
    self.exp_rate = self.config["init_exp_rate"] * (self.config["anneal_base_exp"] **
        tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_exp"]), tf.float32))
    '''
    # linearly anneal the exploration rate from init_exp_rate down to min_exp
    self.exp_rate = tf.cast(tf.maximum(self.config['anneal_step_exp'] - self.global_step, 0),
                            tf.float32) / (1.0 * self.config['anneal_step_exp']) * \
        (self.config["init_exp_rate"] - self.config["min_exp"]) + self.config["min_exp"]
    # exponentially decay the learning rate, with a floor at min_lr
    self.learning_rate = tf.maximum(
        self.config["init_learning_rate"] *
        (self.config["anneal_base_lr"] **
         tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_lr"]), tf.float32)),
        self.config["min_lr"])
    self.actor = Estimator(self.config['estimator_params']['policy_network']['name']).get_estimator(
        inputs=self.t_state, num_out=self.action_dim, scope='actor',
        **self.config['estimator_params']['policy_network'])
    self.critic = Estimator(self.config['estimator_params']['value_network']['name']).get_estimator(
        inputs=self.t_state, actions=self.t_action, num_out=1, scope='critic',
        **self.config['estimator_params']['value_network'])
    self.action_sampler_deterministic = tf.argmax(self.actor, dimension=1)
    self.action_probs = tf.nn.softmax(self.actor)
    # epsilon-greedy style sampling: with probability exp_rate draw a uniformly random action
    self.explore = tf.less(tf.random_uniform([batch_size]), self.exp_rate)
    self.action_sampler = tf.select(self.explore,
                                    tf.multinomial(random_action_probs, num_samples=1),
                                    tf.multinomial(self.action_probs, num_samples=1))
    actor_target_config = deepcopy(self.config['estimator_params']['policy_network'])
    actor_target_config['trainable'] = False
    self.actor_target = Estimator(self.config['estimator_params']['policy_network']['name']).get_estimator(
        inputs=self.t_state_new, num_out=self.action_dim, scope='actor_target',
        **actor_target_config)
    # the target critic mirrors the online critic's (value network) configuration
    critic_target_config = deepcopy(self.config['estimator_params']['value_network'])
    critic_target_config['trainable'] = False
    self.critic_target = Estimator(self.config['estimator_params']['value_network']['name']).get_estimator(
        inputs=self.t_state_new, actions=self.action_sampler_deterministic, num_out=1,
        scope='critic_target', **critic_target_config)
    actor_variables = dict([('/'.join(v.name.split('/')[1:]), v)
                            for v in tf.get_collection(tf.GraphKeys.VARIABLES,
                                                       scope="actor")])
    critic_variables = dict([('/'.join(v.name.split('/')[1:]), v)
                             for v in tf.get_collection(tf.GraphKeys.VARIABLES,
                                                        scope="critic")])
    actor_target_variables = dict([('/'.join(v.name.split('/')[1:]), v)
                                   for v in tf.get_collection(tf.GraphKeys.VARIABLES,
                                                              scope="actor_target")])
    critic_target_variables = dict([('/'.join(v.name.split('/')[1:]), v)
                                    for v in tf.get_collection(tf.GraphKeys.VARIABLES,
                                                               scope="critic_target")])
    # hard copy: initialize the target networks with the online weights
    self.target_init_ops = []
    for k, v in actor_variables.iteritems():
        self.target_init_ops.append(actor_target_variables[k].assign(v))
    for k, v in critic_variables.iteritems():
        self.target_init_ops.append(critic_target_variables[k].assign(v))
    # soft (Polyak) updates: target <- tau * online + (1 - tau) * target
    self.target_update_ops = []
    tau = self.config["tau"]
    for k, v in actor_variables.iteritems():
        self.target_update_ops.append(actor_target_variables[k].assign(
            tau * v + (1.0 - tau) * actor_target_variables[k]))
    for k, v in critic_variables.iteritems():
        self.target_update_ops.append(critic_target_variables[k].assign(
            tau * v + (1.0 - tau) * critic_target_variables[k]))
    self.actor_loss = tf.reduce_mean(self.critic_target)
    self.target = self.t_reward + self.config["discount_rate"] * self.critic_target
    # regress the critic onto the bootstrapped target with a squared-error loss
    self.critic_loss = tf.reduce_mean(tf.square(self.target - self.critic))
    self.add_reg(actor_variables.values())
    self.add_reg(critic_variables.values())
    self.loss = (self.actor_loss + self.critic_loss)
    self.loss += self.reg_loss
    self.build_train()
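# A worked NumPy example (hypothetical numbers) of the critic's bootstrapped target used
# above: target = r + discount_rate * Q_target(s', a'), with the critic regressed onto it
# via a squared-error loss.
import numpy as np

r = np.array([1.0, 0.0])               # rewards for a batch of two transitions
q_target_next = np.array([2.0, 4.0])   # target critic's value for the next states
discount_rate = 0.9
target = r + discount_rate * q_target_next        # -> [2.8, 3.6]
q_online = np.array([2.5, 3.0])                   # online critic's current estimates
critic_loss = np.mean((target - q_online) ** 2)   # -> (0.09 + 0.36) / 2 = 0.225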