Code Example #1
File: agents.py Project: tyeah/ReinforceExplorer
    def build_model(self):
        # TODO: should we set exp rate as a placeholder?
        self.t_state = [tf.placeholder(dtype=tf.float32, shape=(None,) + od[:-1] + (od[-1] * self.config["inner_state_params"].get("num_steps", 1),)) for od in self.observation_dims]
        self.t_action = tf.placeholder(dtype=tf.int32, shape=(None,)) # for discrete action space
        self.t_discounted_reward = tf.placeholder(dtype=tf.float32, shape=(None,))
        batch_size = tf.shape(self.t_state[0])[0]
        random_action_probs = tf.fill((batch_size, self.action_dim), 1.0 / self.action_dim)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        '''
        self.exp_rate = self.config["init_exp_rate"] * (self.config["anneal_base_exp"] ** 
                tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_exp"]), tf.float32))
        '''

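        # Linearly anneal the exploration rate from init_exp_rate down to min_exp
        # over the first anneal_step_exp global steps.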
        self.exp_rate = tf.cast(tf.maximum(self.config['anneal_step_exp'] - self.global_step, 0), tf.float32) / (1.0 * self.config['anneal_step_exp']) * (self.config["init_exp_rate"] - self.config["min_exp"]) + self.config["min_exp"]

        self.learning_rate = tf.maximum(self.config["init_learning_rate"] * 
                (self.config["anneal_base_lr"] ** 
                tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_lr"]), 
                    tf.float32)), self.config["min_lr"])

        self.actor = Estimator(self.config['estimator_params']
                ['policy_network']['name']).get_estimator(
                inputs=self.t_state, num_out=self.action_dim, 
                scope='policy_network', 
                **self.config['estimator_params']['policy_network'])
        self.critic = Estimator(self.config['estimator_params']
                ['value_network']['name']).get_estimator(
                inputs=self.t_state, num_out=1, 
                scope='value_network', 
                **self.config['estimator_params']['value_network'])
        policy_network_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy_network")
        self.action_probs = tf.nn.softmax(self.actor)
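        # Epsilon-greedy style sampling: with probability exp_rate draw from a uniform
        # action distribution, otherwise sample from the policy softmax.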
        self.explore = tf.less(tf.random_uniform([batch_size]), self.exp_rate)
        self.action_sampler = tf.select(self.explore, 
                tf.multinomial(random_action_probs, num_samples=1),
                tf.multinomial(self.action_probs, num_samples=1))
        # TODO: seed?
        # TODO: how to measure global_step?
        value_network_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="value_network")

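        # Advantage-weighted policy-gradient loss for the actor and a squared-error
        # regression onto the discounted return for the critic.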
        advantage = self.t_discounted_reward - self.critic
        self.actor_loss =  tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
                self.actor, self.t_action) * advantage)
        self.critic_loss = tf.reduce_mean(tf.square(self.t_discounted_reward - self.critic))
        self.add_reg(policy_network_variables)
        self.add_reg(value_network_variables)
        self.loss = (self.actor_loss + self.critic_loss)
        self.loss += self.reg_loss
        self.build_train()
Code Example #2
File: mainwidget.py Project: Dominik1123/Pocketpick
    def ranked_captains_mode_selected(self):
        self.radiant_pick = SelectionView('Radiant Pick', self)
        self.dire_pick = SelectionView('Dire Pick', self)
        self.radiant_ban = SelectionView('Radiant Ban', self)
        self.dire_ban = SelectionView('Dire Ban', self)
        v_layout = QtGui.QVBoxLayout()

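        # Top row holds the ban views, bottom row the pick views.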
        h_layout = QtGui.QHBoxLayout()
        h_layout.addWidget(self.radiant_ban)
        h_layout.addWidget(self.dire_ban)
        v_layout.addLayout(h_layout)

        h_layout = QtGui.QHBoxLayout()
        h_layout.addWidget(self.radiant_pick)
        h_layout.addWidget(self.dire_pick)
        v_layout.addLayout(h_layout)

        self.layout().addLayout(v_layout, 1, 1, 1, 1)

        selections = {
            self.radiant_pick.name: self.radiant_pick,
            self.dire_pick.name: self.dire_pick,
            self.radiant_ban.name: self.radiant_ban,
            self.dire_ban.name: self.dire_ban
        }

        self.cycle = RankedCaptainsModeCycle(selections)
        self.estimator = Estimator(self.cycle, self.heropool)
Code Example #3
    def __init__(self, **kwargs):
        super(MNISTFunction, self).__init__(**kwargs)
        self.batch_size = kwargs['batch_size']

        self.mnist = input_data.read_data_sets("MNIST_data/", one_hot=False)
        self.gen_train = self.mnist.train
        X, y = self.gen_train.next_batch(1)
        self.x_dim = X.shape[1]

        self.t_X = tf.placeholder(dtype=tf.float32,
                                  shape=(None, self.x_dim),
                                  name='MNIST_X')
        self.t_y = tf.placeholder(dtype=tf.int32,
                                  shape=(None, ),
                                  name='MNIST_y')
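        # Fully connected classifier (hidden sizes 500 and 300, 10 output classes)
        # built through the Estimator factory.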
        self.logits = Estimator('fc').get_estimator(inputs=[self.t_X],
                                                    num_out=10,
                                                    num_hids=[500, 300],
                                                    trainable=True,
                                                    scope='mnist_lenet')
        self.loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                self.logits, self.t_y))
        self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope='World')
        self.build_train()
        self.build_state()
Code Example #4
File: pretrain.py Project: tyeah/ReinforceExplorer
def main():
    config_file = "configs/ddpgcont_parallel_quad_func.json"
    max_iter = 1000
    #max_iter = 100
    bsize, num_vars = 40, 200
    max_x = 10


    config = json.load(open(config_file))
    save_file = 'weights/' + config_file.split('/')[-1].split('.')[0] + '/pretrain_%s_%s.ckpt' % (config['env_config']['action'], config['env_config']['state'])

    actor_config = config['estimator_params']['policy_network']
    dim_features = 1 if config['env_config']['state'] in ['gradient', 'variable'] else 2
    inputs = tf.placeholder(dtype=tf.float32, shape=(bsize, 
        actor_config['num_features'], num_vars, dim_features))
    actor = Estimator(config['estimator_params']
            ['policy_network']['name']).get_estimator(
            inputs=[tf.reshape(inputs, (bsize, dim_features, actor_config['num_features'] * num_vars))],
            scope='actor',
            **actor_config)
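    # The pretraining target depends on the configured action type: for 'step' the
    # actor is regressed onto the last feature slice of its input, for 'coordinate_lr'
    # onto the constant base_lr from the environment config.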
    if config['env_config']['action'] == 'step':
        loss = tf.reduce_sum(tf.square(tf.squeeze(actor) - inputs[:, -1, :, -1]))
    elif config['env_config']['action'] == 'coordinate_lr':
        loss = tf.reduce_sum(tf.square(tf.squeeze(actor) - config['env_config']['base_lr']))
    elif config['env_config']['action'] == 'params':
        # note: no pretraining loss is defined for the 'params' action
        if config['env_config']['opt_method'] == 'rmsprop':
            num_actions = 4
        elif config['env_config']['opt_method'] == 'sgd':
            num_actions = 1
    train_op = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9).minimize(loss)

    sess = tf.Session()
    loader = tf.train.Saver()
    if os.path.exists(save_file):
        loader.restore(sess, save_file)
    else:
        sess.run(tf.initialize_all_variables())
    for i in xrange(max_iter):
        X = np.random.uniform(-max_x, max_x, 
                (bsize, actor_config['num_features'], num_vars, dim_features))
        feed = {inputs: X}
        _, loss_v = sess.run([train_op, loss], feed)
        if i % 10 == 0:
            print "iter %d, loss %f" % (i, loss_v)

    '''
    # create variables for actor_target
    variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    new_vars = []
    with tf.variable_scope('actor_target'):
        for v in variables:
            vname = v.name.replace('actor/', '').split(':')[0]
            new_vars.append(tf.get_variable(vname, initializer=v))
    sess.run(tf.initialize_variables(new_vars))
    '''

    saver = tf.train.Saver()
    saver.save(sess, save_file)
    print "save to " + save_file
Code Example #5
    def test_estimator_init(self):
        es = Estimator()

        rfc = RandomForestClassifier()
        es.estimator = rfc

        assert es.estimator is rfc
        assert es.hash == '83d11c9bf77830ad42c4e93abe9cf397'
        assert es.file_name == 'files/estimators/83d11c9bf77830ad42c4e93abe9cf397'
Code Example #6
File: test.py Project: zhongyunuestc/convai
def build_graph(data, hidden_dims, hidden_dims_extra, activation, optimizer, learning_rate, model_path, model_id, model_name):
    model_graph = tf.Graph()
    with model_graph.as_default():
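        # Construct the Estimator inside its own tf.Graph rather than the default graph.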
        estimator = Estimator(
            data,
            hidden_dims, hidden_dims_extra, activation,
            optimizer, learning_rate,
            model_path, model_id, model_name
        )
    return model_graph, estimator
Code Example #7
File: mainwidget.py Project: Dominik1123/Pocketpick
    def ranked_all_pick_selected(self):
        self.radiant_pick = SelectionView('Radiant Pick', self)
        self.dire_pick = SelectionView('Dire Pick', self)
        h_layout = QtGui.QHBoxLayout()
        h_layout.addWidget(self.radiant_pick)
        h_layout.addWidget(self.dire_pick)
        self.layout().addLayout(h_layout, 1, 1, 1, 1)

        selections = {
            self.radiant_pick.name: self.radiant_pick,
            self.dire_pick.name: self.dire_pick
        }

        self.cycle = RankedAllPickCycle(selections)
        self.estimator = Estimator(self.cycle, self.heropool)
Code Example #8
    def __init__(self, **kwargs):
        super(SimpleNNFunction, self).__init__(**kwargs)
        self.init_data()
        self.t_X = tf.placeholder(dtype=tf.float32,
                                  shape=(None, self.X.shape[1]),
                                  name='X')
        self.t_y = tf.placeholder(dtype=tf.int32, shape=(None, ), name='y')
        estimator_params = {"name": "fc", "num_hids": [20], "trainable": True}
        self.logits = Estimator(estimator_params['name']).get_estimator(
            inputs=[self.t_X],
            num_out=self.ydim,
            scope='estimator',
            **estimator_params)
        self.loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                self.logits, self.t_y))
        self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope='World')
        self.build_train()
        self.build_state()
Code Example #9
def main(args):
    if args.explore:
        # sample a bunch of parameters, and run those experiments
        feats, hidds, activs, optims, lrs, drs, bss = sample_parameters(
            args.explore)
        best_args = []  # store the best combination
        best_valid_acc = -100000.  # store the best validation accuracy
        best_model = None  # store the best model id
        valid_threshold = args.threshold  # accuracy must be higher for model to be saved
        print "Will try %d different configurations..." % args.explore
        for idx in range(args.explore):
            with tf.Session() as sess:
                try:
                    # print sampled parameters
                    print "\n[%d] sampled features:\n%s" % (idx + 1,
                                                            feats[idx])
                    print "[%d] sampled hidden_sizes: %s" % (idx + 1,
                                                             hidds[idx])
                    print "[%d] extra hidden_sizes: %s" % (idx + 1,
                                                           hidds[idx][-1])
                    print "[%d] sampled activation: %s" % (idx + 1,
                                                           activs[idx])
                    print "[%d] sampled optimizer: %s" % (idx + 1, optims[idx])
                    print "[%d] sampled learning rate: %g" % (idx + 1,
                                                              lrs[idx])
                    print "[%d] sampled dropout rate: %g" % (idx + 1, drs[idx])
                    print "[%d] sampled batch size: %d" % (idx + 1, bss[idx])

                    # Load datasets
                    data = get_data(args.data,
                                    MODE_TO_TARGET[args.mode],
                                    feature_list=feats[idx])
                    n_folds = len(data[0])
                    print "[%d] Building the network..." % (idx + 1, )
                    estimator = Estimator(data,
                                          hidds[idx], [hidds[idx][-1]],
                                          activs[idx],
                                          optims[idx],
                                          lrs[idx],
                                          model_path='models/%s' % args.mode)
                    print "[%d] Training the network..." % (idx + 1, )
                    estimator.train(
                        sess,
                        MODE_TO_FLAG[args.mode],
                        args.patience,
                        bss[idx],
                        drs[idx],
                        save=False,  # don't save for now
                        pretrained=None,
                        verbose=False)
                    # Only consider last `n_folds` accuracies!
                    max_train = [
                        max(estimator.train_accuracies[i])
                        for i in range(-n_folds, 0)
                    ]
                    max_valid = [
                        max(estimator.valid_accuracies[i])
                        for i in range(-n_folds, 0)
                    ]
                    print "[%d] max train accuracies: %s" % (idx + 1,
                                                             max_train)
                    print "[%d] max valid accuracies: %s" % (idx + 1,
                                                             max_valid)
                    train_acc = np.mean(max_train)
                    valid_acc = np.mean(max_valid)
                    print "[%d] best avg. train accuracy: %g" % (idx + 1,
                                                                 train_acc)
                    print "[%d] best avg. valid accuracy: %g" % (idx + 1,
                                                                 valid_acc)

                    # save now if we got a good model
                    if valid_acc > valid_threshold:
                        estimator.save(sess)

                    # update variables if we got better model
                    if valid_acc > best_valid_acc:
                        print "[%d] got better accuracy! new: %g > old: %g" % (
                            idx + 1, valid_acc, best_valid_acc)
                        best_valid_acc = valid_acc
                        best_model = estimator.model_id
                        best_args = [
                            feats[idx], hidds[idx], activs[idx], optims[idx],
                            lrs[idx], drs[idx], bss[idx]
                        ]
                    else:
                        print "[%d] best validation accuracy is still %g" % (
                            idx + 1, best_valid_acc)

                # end of try block, catch CTRL+C errors to print current results
                except KeyboardInterrupt as e:
                    print e
                    print "best model: %s" % best_model
                    print "with parameters:"
                    print " - features:\n%s" % (best_args[0], )
                    print " - hidden_sizes: %s" % (best_args[1], )
                    print " - activation: %s" % (best_args[2], )
                    print " - optimizer: %s" % (best_args[3], )
                    print " - learning rate: %g" % (best_args[4], )
                    print " - dropout rate: %g" % (best_args[5], )
                    print " - batch size: %d" % (best_args[6], )
                    print "with average valid accuracy: %g" % best_valid_acc
                    sys.exit()

            # end of tensorflow session, reset for the next graph
            tf.reset_default_graph()

        # end of exploration, print best results:
        print "done!"
        print "best model: %s" % best_model
        print "with parameters:"
        print " - features:\n%s" % (best_args[0], )
        print " - hidden_sizes: %s" % (best_args[1], )
        print " - activation: %s" % (best_args[2], )
        print " - optimizer: %s" % (best_args[3], )
        print " - learning rate: %g" % (best_args[4], )
        print " - dropout rate: %g" % (best_args[5], )
        print " - batch size: %d" % (best_args[6], )
        print "with average valid accuracy: %g" % best_valid_acc

    else:
        # run one experiment with provided parameters
        # load previously trained model.
        if args.previous_model:
            old_data, \
            hidden_sizes, hidden_sizes_extra, activation, \
            optimizer, learning_rate, \
            model_path, model_id, model_name, \
            batch_size, dropout_rate, pretrained, \
            train_accuracies, valid_accuracies = load_previous_model(args.previous_model)
            # Load provided dataset, but with the same features as previous model
            data = get_data(args.data,
                            MODE_TO_TARGET[args.mode],
                            feature_list=old_data[-1])
            # set pretrained to this model name
            pretrained = (model_path, model_id, model_name)
            # now update this model name to not override previous one
            model_name = "%s" % args.previous_model.replace(
                'models', '').replace('.', '').replace('//', '').replace(
                    '/', '.')
            # store previous_accuracies
            previous_accuracies = (train_accuracies, valid_accuracies)
        else:
            # else, build current parameters
            data = get_data(args.data, MODE_TO_TARGET[args.mode])
            hidden_sizes = args.hidden_sizes
            hidden_sizes_extra = [args.hidden_sizes[-1]]
            activation = args.activation
            optimizer = args.optimizer
            learning_rate = args.learning_rate
            model_id = None  # keep the default one
            model_name = 'Estimator'
            batch_size = args.batch_size
            dropout_rate = args.dropout_rate
            pretrained = None
            previous_accuracies = None

        n_folds = len(data[0])
        print "Building the network..."
        estimator = Estimator(data,
                              hidden_sizes,
                              hidden_sizes_extra,
                              activation,
                              optimizer,
                              learning_rate,
                              model_path='models/%s' % args.mode,
                              model_id=model_id,
                              model_name=model_name)

        with tf.Session() as sess:
            print "Training the network..."
            estimator.train(sess,
                            MODE_TO_FLAG[args.mode],
                            args.patience,
                            batch_size,
                            dropout_rate,
                            save=True,
                            pretrained=pretrained,
                            previous_accuracies=previous_accuracies,
                            verbose=True)
            max_train = [
                max(estimator.train_accuracies[i]) for i in range(-n_folds, 0)
            ]
            max_valid = [
                max(estimator.valid_accuracies[i]) for i in range(-n_folds, 0)
            ]
            print "max train accuracies: %s" % (max_train, )
            print "max valid accuracies: %s" % (max_valid, )
            train_acc = np.mean(max_train)
            valid_acc = np.mean(max_valid)
            print "best avg. train accuracy: %g" % train_acc
            print "best avg. valid accuracy: %g" % valid_acc
        print "done."
Code Example #10
File: agents.py Project: tyeah/ReinforceExplorer
    def build_model(self):
        # TODO: should we set exp rate as a placeholder?
        self.t_state = [tf.placeholder(dtype=tf.float32, 
            name='t_state_%d' % i,
            shape=(None,) + od[:-1] + (od[-1] * 
            self.config["inner_state_params"].get("num_steps", 1),)) 
            for i, od in enumerate(self.observation_dims)]
        #print self.t_state[0].get_shape(), self.observation_dims, self.action_dim
        self.t_state_new = [tf.placeholder(dtype=tf.float32, 
            name='t_state_new_%d' % i,
            shape=(None,) + od[:-1] + (od[-1] * self.config["inner_state_params"].
                get("num_steps", 1),)) for i, od in enumerate(self.observation_dims)]
        self.t_action = tf.placeholder(dtype=tf.int32, name='t_action', 
                shape=(None,) + self.action_dim) # for action space
        self.t_discounted_reward = tf.placeholder(dtype=tf.float32, 
                name='t_discounted_reward', shape=(None,))
        self.t_reward = tf.placeholder(dtype=tf.float32, 
                name='t_reward', shape=(None,))
        batch_size = tf.shape(self.t_state)[0]
        #TODO: only used for discrete action:random_action_probs = tf.fill((batch_size, self.action_dim), 1.0 / self.action_dim)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        '''
        self.exp_rate = self.config["init_exp_rate"] * (self.config["anneal_base_exp"] ** 
                tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_exp"]), tf.float32))
        '''

        self.exp_rate = tf.cast(tf.maximum(self.config['anneal_step_exp'] - self.global_step, 0), tf.float32) / (1.0 * self.config['anneal_step_exp']) * (self.config["init_exp_rate"] - self.config["min_exp"]) + self.config["min_exp"]

        self.learning_rate = tf.maximum(self.config["init_learning_rate"] * 
                (self.config["anneal_base_lr"] ** 
                tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_lr"]), 
                    tf.float32)), self.config["min_lr"])

        #self.config['estimator_params']['policy_network']['trainable'] = self.config['trainable']
        #self.config['estimator_params']['critic_network']['trainable'] = self.config['trainable']
        self.actor = Estimator(self.config['estimator_params']
                ['policy_network']['name']).get_estimator(
                inputs=self.t_state, num_out=np.prod(self.action_dim), 
                scope='actor', 
                **self.config['estimator_params']['policy_network'])
        #print self.actor.get_shape(), (-1,) + self.action_dim
        self.actor_id = tf.identity(self.actor)
        self.actor = tf.reshape(self.actor, (-1,) + self.action_dim)
        #TODO:
        #self.action_scale = 1e-3
        self.action_scale = 1
        self.actor *= self.action_scale

        value_network_estimator = Estimator(self.config['estimator_params']
                ['value_network']['name'])
        self.critic = value_network_estimator.get_estimator(
                inputs=self.t_state, actions=self.t_action, num_out=1, 
                scope='critic', 
                **self.config['estimator_params']['value_network'])

        self.critic_with_actor = value_network_estimator.get_estimator(
                inputs=self.t_state, actions=self.actor, num_out=1, 
                scope='critic', 
                **self.config['estimator_params']['value_network'])

        self.action_sampler_deterministic = self.actor

        self.action_sampler = self.actor + tf.random_normal(tf.shape(self.actor), stddev=self.exp_rate)

        actor_target_config = deepcopy(self.config['estimator_params']['policy_network'])
        actor_target_config['trainable'] = False
        self.actor_target = Estimator(self.config['estimator_params']
                ['policy_network']['name']).get_estimator(
                inputs=self.t_state_new, num_out=np.prod(self.action_dim), 
                scope='actor_target',
                **actor_target_config)
        #print self.actor_target.get_shape(), (-1,) + self.action_dim
        self.actor_target = tf.reshape(self.actor_target, (-1,) + self.action_dim)
        self.actor_target *= self.action_scale
        critic_target_config = deepcopy(self.config['estimator_params']['policy_network'])
        critic_target_config['trainable'] = False
        self.critic_target = Estimator(self.config['estimator_params']
                ['value_network']['name']).get_estimator(
                inputs=self.t_state_new, actions=self.actor_target, num_out=1, 
                scope='critic_target',
                **critic_target_config)

        actor_variables = dict([('/'.join(v.name.split('/')[1:]), v) 
                for v in tf.get_collection(
                tf.GraphKeys.VARIABLES, scope="actor")])
        critic_variables = dict([('/'.join(v.name.split('/')[1:]), v) 
                for v in tf.get_collection(
                tf.GraphKeys.VARIABLES, scope="critic")])
        actor_target_variables = dict([('/'.join(v.name.split('/')[1:]), v) 
                for v in tf.get_collection(
                tf.GraphKeys.VARIABLES, scope="actor_target")])
        critic_target_variables = dict([('/'.join(v.name.split('/')[1:]), v) 
                for v in tf.get_collection(
                tf.GraphKeys.VARIABLES, scope="critic_target")])

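        # One-time copy of the online actor/critic weights into the target networks,
        # matched by variable name within each scope.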
        self.target_init_ops = []
        for k, v in actor_variables.iteritems():
            self.target_init_ops.append(actor_target_variables[k].assign(v))
        for k, v in critic_variables.iteritems():
            self.target_init_ops.append(critic_target_variables[k].assign(v))
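        # Soft (Polyak) target updates: theta_target <- tau * theta + (1 - tau) * theta_target.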
        self.target_update_ops = []
        tau = self.config["tau"]
        for k, v in actor_variables.iteritems():
            self.target_update_ops.append(actor_target_variables[k].assign(
                tau * v + (1.0 - tau) * actor_target_variables[k]))
        for k, v in critic_variables.iteritems():
            self.target_update_ops.append(critic_target_variables[k].assign(
                tau * v + (1.0 - tau) * critic_target_variables[k]))

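        # DDPG-style losses: the actor maximizes the critic's value of its own actions,
        # while the critic regresses onto the one-step TD target
        # r + discount_rate * Q_target(s', pi_target(s')).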
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor)
        self.target = tf.reshape(self.t_reward, \
                [-1] + [1] * (self.critic.get_shape().ndims - 1))\
                + self.config["discount_rate"] * self.critic_target
        self.critic_loss = tf.reduce_mean(tf.square(self.target - self.critic))
        #print self.critic_target.get_shape(), self.target.get_shape(), self.critic.get_shape()

        self.add_reg(actor_variables.values())
        self.add_reg(critic_variables.values())
        self.loss = (self.actor_loss + self.critic_loss)
        self.loss += self.reg_loss
        if self.learning:
            self.build_train()
Code Example #11
File: agents.py Project: tyeah/ReinforceExplorer
    def build_model(self):
        # TODO: should we set exp rate as a placeholder?
        self.t_state = [tf.placeholder(dtype=tf.float32, shape=(None,) + od[:-1] + (od[-1] * self.config["inner_state_params"].get("num_steps", 1),)) for od in self.observation_dims]
        self.t_state_new = [tf.placeholder(dtype=tf.float32, shape=(None,) + od[:-1] + (od[-1] * self.config["inner_state_params"].get("num_steps", 1),)) for od in self.observation_dims]
        self.t_action = tf.placeholder(dtype=tf.int32, shape=(None,)) # for discrete action space
        self.t_discounted_reward = tf.placeholder(dtype=tf.float32, shape=(None,))
        self.t_reward = tf.placeholder(dtype=tf.float32, shape=(None,))
        batch_size = tf.shape(self.t_state[0])[0]
        random_action_probs = tf.fill((batch_size, self.action_dim), 1.0 / self.action_dim)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        '''
        self.exp_rate = self.config["init_exp_rate"] * (self.config["anneal_base_exp"] ** 
                tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_exp"]), tf.float32))
        '''

        self.exp_rate = tf.cast(tf.maximum(self.config['anneal_step_exp'] - self.global_step, 0), tf.float32) / (1.0 * self.config['anneal_step_exp']) * (self.config["init_exp_rate"] - self.config["min_exp"]) + self.config["min_exp"]

        self.learning_rate = tf.maximum(self.config["init_learning_rate"] * 
                (self.config["anneal_base_lr"] ** 
                tf.cast(tf.floordiv(self.global_step, self.config["anneal_step_lr"]), 
                    tf.float32)), self.config["min_lr"])

        self.actor = Estimator(self.config['estimator_params']
                ['policy_network']['name']).get_estimator(
                inputs=self.t_state, num_out=self.action_dim, 
                scope='actor', 
                **self.config['estimator_params']['policy_network'])
        self.critic = Estimator(self.config['estimator_params']
                ['value_network']['name']).get_estimator(
                inputs=self.t_state, actions=self.t_action, num_out=1, 
                scope='critic', 
                **self.config['estimator_params']['value_network'])

        self.action_sampler_deterministic = tf.argmax(self.actor, dimension=1)
        self.action_probs = tf.nn.softmax(self.actor)
        self.explore = tf.less(tf.random_uniform([batch_size]), self.exp_rate)
        self.action_sampler = tf.select(self.explore, 
                tf.multinomial(random_action_probs, num_samples=1),
                tf.multinomial(self.action_probs, num_samples=1))

        actor_target_config = deepcopy(self.config['estimator_params']['policy_network'])
        actor_target_config['trainable'] = False
        self.actor_target = Estimator(self.config['estimator_params']
                ['policy_network']['name']).get_estimator(
                inputs=self.t_state_new, num_out=self.action_dim, 
                scope='actor_target',
                **actor_target_config)
        critic_target_config = deepcopy(self.config['estimator_params']['policy_network'])
        critic_target_config['trainable'] = False
        self.critic_target = Estimator(self.config['estimator_params']
                ['value_network']['name']).get_estimator(
                inputs=self.t_state_new, actions=self.action_sampler_deterministic, num_out=1, 
                scope='critic_target',
                **critic_target_config)

        actor_variables = dict([('/'.join(v.name.split('/')[1:]), v) 
                for v in tf.get_collection(
                tf.GraphKeys.VARIABLES, scope="actor")])
        critic_variables = dict([('/'.join(v.name.split('/')[1:]), v) 
                for v in tf.get_collection(
                tf.GraphKeys.VARIABLES, scope="critic")])
        actor_target_variables = dict([('/'.join(v.name.split('/')[1:]), v) 
                for v in tf.get_collection(
                tf.GraphKeys.VARIABLES, scope="actor_target")])
        critic_target_variables = dict([('/'.join(v.name.split('/')[1:]), v) 
                for v in tf.get_collection(
                tf.GraphKeys.VARIABLES, scope="critic_target")])

        self.target_init_ops = []
        for k, v in actor_variables.iteritems():
            self.target_init_ops.append(actor_target_variables[k].assign(v))
        for k, v in critic_variables.iteritems():
            self.target_init_ops.append(critic_target_variables[k].assign(v))
        self.target_update_ops = []
        tau = self.config["tau"]
        for k, v in actor_variables.iteritems():
            self.target_update_ops.append(actor_target_variables[k].assign(
                tau * v + (1.0 - tau) * actor_target_variables[k]))
        for k, v in critic_variables.iteritems():
            self.target_update_ops.append(critic_target_variables[k].assign(
                tau * v + (1.0 - tau) * critic_target_variables[k]))

        self.actor_loss = tf.reduce_mean(self.critic_target)
        self.target = self.t_reward + self.config["discount_rate"] * self.critic_target
        self.critic_loss = tf.reduce_mean(self.target - self.critic)

        self.add_reg(actor_variables.values())
        self.add_reg(critic_variables.values())
        self.loss = (self.actor_loss + self.critic_loss)
        self.loss += self.reg_loss
        self.build_train()