Example 1
    def __init__(self, train_loop):

        self.train_loop = train_loop
        self.graph = train_loop.graph
        self.sess = train_loop.sess
        journalist = train_loop.logger

        num_actions = self.train_loop.num_actions
        observation_size = self.train_loop.observation_size
        observations_in_seq = 1
        input_size = observation_size * observations_in_seq
        learning_rate = 1e-4

        r = tf.nn.relu
        t = tf.nn.tanh

        critic = MLP([input_size, num_actions], [512, 512, 512, 512, 512, 1],
                    [r, r, r, r, t, tf.identity], scope='critic')

        self.actor = MLP([input_size,], [512, 512, 512, 512, 512, num_actions],
                    [r, r, r, r, t, tf.nn.sigmoid], scope='actor')

        # step 1
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        # step 2
        # optimizer = tf.train.GradientDescentOptimizer(learning_rate=5e-5)

        self.controller = ContinuousDeepQ(
            input_size,
            num_actions,
            self.actor,
            critic,
            optimizer,
            self.sess,
            discount_rate=0.99,
            target_actor_update_rate=0.01,
            target_critic_update_rate=0.01,
            exploration_period=5000,
            max_experience=10000,
            store_every_nth=4,
            train_every_nth=4,
            summary_writer=journalist,
            rewards=self.train_loop.dequeued_rewards,
            given_action=self.train_loop.dequeued_actions,
            observation=self.train_loop.dequeued_prev_states,
            next_observation=self.train_loop.dequeued_next_states,
            next_observation_mask=tf.ones(
                self.train_loop.dequeued_rewards.get_shape(), tf.float32)
        )
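
A note on the MLP call pattern used throughout these examples: MLP(input_sizes, layer_sizes, nonlinearities, scope=...) takes a list of input sizes (the critic receives the observation and the action as two inputs, hence [input_size, num_actions], while the actor receives only the observation), one output size per layer, and one nonlinearity per layer. The class itself is not shown in these snippets; the following is only a minimal sketch of a compatible callable, assuming TensorFlow 1.x (the name TinyMLP and all implementation details are ours, not the library's).

import tensorflow as tf


class TinyMLP(object):
    """Minimal sketch of an MLP with the call pattern used above (not the library class)."""

    def __init__(self, input_sizes, layer_sizes, nonlinearities, scope='mlp'):
        assert len(layer_sizes) == len(nonlinearities)
        self.input_sizes = list(input_sizes)
        self.layer_sizes = list(layer_sizes)
        self.nonlinearities = list(nonlinearities)
        self.scope = scope

    def __call__(self, xs):
        # The critic is called with [observation, action]; the actor with a single tensor.
        if not isinstance(xs, list):
            xs = [xs]
        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
            hidden = tf.concat(xs, axis=1)
            in_size = sum(self.input_sizes)
            for i, (out_size, fn) in enumerate(zip(self.layer_sizes, self.nonlinearities)):
                w = tf.get_variable('w%d' % i, [in_size, out_size])
                b = tf.get_variable('b%d' % i, [out_size], initializer=tf.zeros_initializer())
                hidden = fn(tf.matmul(hidden, w) + b)
                in_size = out_size
        return hidden

In this sketch, calling the network with a list, as ContinuousDeepQ does for the critic, simply concatenates observation and action before the first layer.
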
Example 2
def main(desired_iterations, save_path):
    # Define a log file to use with tensorboard
    # Not that we currently make use of tensorboard at all
    LOG_DIR = tempfile.mkdtemp()
    print("Tensorboard Log: " + LOG_DIR + '\n')

    # The directory to save the animations to
    SAVE_DIR = save_path

    # Define the simulation
    sim = Planning(get_noodle_environment())

    # Tensorflow!
    tf.reset_default_graph()
    session = tf.InteractiveSession()
    journalist = tf.train.SummaryWriter(LOG_DIR)
    brain = MLP([
        sim.observation_size,
    ], [200, 200, sim.num_actions], [tf.tanh, tf.tanh, tf.identity])
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

    # DiscreteDeepQ object
    current_controller = DiscreteDeepQ(sim.observation_size,
                                       sim.num_actions,
                                       brain,
                                       optimizer,
                                       session,
                                       random_action_probability=0.2,
                                       discount_rate=0.9,
                                       exploration_period=1000,
                                       max_experience=10000,
                                       store_every_nth=1,
                                       train_every_nth=1,
                                       summary_writer=journalist)

    # Initialize the session
    session.run(tf.initialize_all_variables())
    session.run(current_controller.target_network_update)
    # journalist.add_graph(session.graph)

    # Run the simulation and let the robot learn
    num_simulations = 0

    iterations_needed = []
    total_rewards = []

    try:
        for game_idx in range(desired_iterations + 1):
            current_random_prob = current_controller.random_action_probability
            update_random_prob = game_idx != 0 and game_idx % 200 == 0
            if update_random_prob and 0.01 < current_random_prob <= 0.1:
                current_controller.random_action_probability = current_random_prob - 0.01
            elif update_random_prob and 0.1 < current_random_prob:
                current_controller.random_action_probability = current_random_prob - 0.1
            game = Planning(get_noodle_environment())
            game_iterations = 0

            observation = game.observe()
            while not game.is_over():
                action = current_controller.action(observation)
                reward = game.collect_reward(action)
                new_observation = game.observe()
                current_controller.store(observation, action, reward,
                                         new_observation)
                current_controller.training_step()
                observation = new_observation
                game_iterations += 1
            total_rewards.append(sum(game.collected_rewards))
            iterations_needed.append(game_iterations)
            rewards = []
            if game_idx % 50 == 0:
                print("\rGame %d:\nIterations before end: %d." %
                      (game_idx, game_iterations))
                if game.collected_rewards[-1] == 10:
                    print("Hit target!")
                print("Total Rewards: %s\n" % sum(game.collected_rewards))
                if SAVE_DIR is not None:
                    game.save_path(SAVE_DIR, game_idx)

    except KeyboardInterrupt:
        print("Interrupted")

    # Plot the iterations and reward
    plt.figure(figsize=(12, 8))
    plt.plot(total_rewards, label='Reward')
    # plt.plot(iterations_needed, label='Iterations')
    plt.legend()
    plt.show()
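
main() above takes the number of games to play and an optional directory for the saved animations. A possible command-line entry point (hypothetical; the script description and flag spelling are ours) could look like this:

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train the DiscreteDeepQ agent.')
    parser.add_argument('desired_iterations', type=int, help='number of games to play')
    parser.add_argument('--save-path', default=None, help='directory for saved animations')
    args = parser.parse_args()

    main(args.desired_iterations, args.save_path)
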
Example 3
        b"a": 2,
    })
else:
    # Tensorflow business - it is always good to reset a graph before creating a new controller.
    tf.reset_default_graph()
    session = tf.InteractiveSession()

    # This little guy will let us run tensorboard
    #      tensorboard --logdir [LOG_DIR]
    journalist = tf.train.SummaryWriter(LOG_DIR)

    # Brain maps from observation to Q values for different actions.
    # Here it is done using a multi-layer perceptron with 2 hidden
    # layers
    brain = MLP([
        g.observation_size,
    ], [200, 200, g.num_actions], [tf.tanh, tf.tanh, tf.identity])

    # The optimizer to use. Here we use RMSProp as recommended
    # by the publication
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

    # DiscreteDeepQ object
    current_controller = DiscreteDeepQ(g.observation_size,
                                       g.num_actions,
                                       brain,
                                       optimizer,
                                       session,
                                       discount_rate=0.99,
                                       exploration_period=5000,
                                       max_experience=10000,
Example 4
observation_size = 2
observations_in_seq = 1
input_size = observation_size * observations_in_seq

# actions
num_actions = 3

#brain = MLP([input_size,], [5, 5, 5, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [20, 20, 20, 20, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.tanh, tf.identity])

brain = SeparatedMLP([
    MLP([
        input_size,
    ], [64, 64, 1], [tf.sigmoid, tf.sigmoid, tf.identity],
        scope="mlp_action1"),
    MLP([
        input_size,
    ], [64, 64, 1], [tf.sigmoid, tf.sigmoid, tf.identity],
        scope="mlp_action2"),
    MLP([
        input_size,
    ], [64, 64, 1], [tf.sigmoid, tf.sigmoid, tf.identity],
        scope="mlp_action3")
])

# The optimizer to use. Here we use RMSProp as recommended
# by the publication
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
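
SeparatedMLP is not defined in these snippets. From the way it is used here (one small MLP per action, each ending in a single output unit), it presumably evaluates each per-action network on the same observation and concatenates the results into one [batch, num_actions] Q-value tensor. A rough sketch under that assumption (the class name and details below are guesses, not the library code):

import tensorflow as tf


class SeparatedMLPSketch(object):
    """Guessed stand-in: one sub-network per action, outputs concatenated per batch row."""

    def __init__(self, mlps):
        self.mlps = mlps

    def __call__(self, xs):
        # Each sub-MLP maps the observation to a single Q-value; stack them side by side.
        return tf.concat([mlp(xs) for mlp in self.mlps], axis=1)

    def variables(self):
        # The controllers copy/update target networks through the brain's variable list.
        return [v for mlp in self.mlps for v in mlp.variables()]

The real class presumably also exposes a copy() used to build the target network, as the MULTI_LSTM_MLP wrapper does in the last example below.
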
Example 5
observation_size = 7
observations_in_seq = 1
input_size = observation_size * observations_in_seq

# actions
num_actions = 5

#brain = MLP([input_size,], [5, 5, 5, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [20, 20, 20, 20, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.tanh, tf.identity])

brain = SeparatedMLP([
    MLP([
        input_size,
    ], [8, 8, 1], [tf.nn.relu, tf.nn.relu, tf.identity],
        scope="mlp_action1"),
    MLP([
        input_size,
    ], [8, 8, 1], [tf.nn.relu, tf.nn.relu, tf.identity],
        scope="mlp_action2"),
    MLP([
        input_size,
    ], [8, 8, 1], [tf.nn.relu, tf.nn.relu, tf.identity],
        scope="mlp_action3"),
    MLP([
        input_size,
    ], [8, 8, 1], [tf.nn.relu, tf.nn.relu, tf.identity],
        scope="mlp_action4"),
    MLP([
        input_size,
Example 6
import scipy.io as sio

import copy

N = Quadrotor.num_of_actions

tf.reset_default_graph()
session = tf.InteractiveSession()

LOG_DIR = tempfile.mkdtemp()
print(LOG_DIR)
journalist = tf.train.SummaryWriter(LOG_DIR)

brain = MLP([
    4,
], [32, 64, N], [tf.tanh, tf.tanh, tf.identity])

optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)

current_controller = DiscreteDeepQ(4,
                                   N,
                                   brain,
                                   optimizer,
                                   session,
                                   discount_rate=0.9,
                                   exploration_period=100000,
                                   max_experience=10000,
                                   minibatch_size=64,
                                   random_action_probability=0.05,
                                   store_every_nth=1,
Example 7
observations_in_seq = 4
input_size = observation_size * observations_in_seq

# actions
num_actions = 2

#brain = MLP([input_size,], [5, 5, 5, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [20, 20, 20, 20, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.tanh, tf.identity])

#brain = MLP([input_size,], [32, 32, 32, 32, 32, num_actions],
#            [tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.identity])

critic = MLP([input_size, num_actions * 2], [1024, 512, 1],
             [tf.nn.sigmoid, tf.nn.sigmoid, tf.identity],
             scope='critic')

actor = MLP([
    input_size,
], [1024, 512, num_actions], [tf.nn.sigmoid, tf.nn.sigmoid, tf.identity],
            scope='actor')

# The optimizer to use. Here we use RMSProp as recommended
# by the publication
#optimizer = tf.train.RMSPropOptimizer(learning_rate= 0.0001, decay=0.9)
#optimizer = tf.train.RMSPropOptimizer(learning_rate= 0.0005, decay=0.9)
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
#optimizer = tf.train.GradientDescentOptimizer(learning_rate= 0.001)

# DiscreteDeepQ object
Example 8
# Tensorflow business - it is always good to reset a graph before creating a new controller.
tf.reset_default_graph()
session = tf.InteractiveSession()

# This little guy will let us run tensorboard
#      tensorboard --logdir [LOG_DIR]
journalist = tf.train.SummaryWriter(LOG_DIR)

# Brain maps from observation to Q values for different actions.
# Here it is done using a multi-layer perceptron with 2 hidden
# layers
brain = MLP([
    4,
], [10, 4], [tf.tanh, tf.identity])

# The optimizer to use. Here we use RMSProp as recommended
# by the publication
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

# DiscreteDeepQ object
current_controller = DiscreteDeepQ(4,
                                   4,
                                   brain,
                                   optimizer,
                                   session,
                                   discount_rate=0.9,
                                   exploration_period=100,
                                   max_experience=10000,
Example 9
    'kt': 1.0,
    'd': 1.0,
    'Fmax': 10,
    'Mmax': 5,
    'us': us
}

tf.reset_default_graph()
session = tf.InteractiveSession()

LOG_DIR = tempfile.mkdtemp()
print(LOG_DIR)
journalist = tf.train.SummaryWriter(LOG_DIR)

brain = MLP([
    6,
], [32, 64, N * N], [tf.tanh, tf.tanh, tf.identity])

optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)

current_controller = DiscreteDeepQ(6,
                                   N * N,
                                   brain,
                                   optimizer,
                                   session,
                                   discount_rate=0.9,
                                   exploration_period=500000,
                                   max_experience=10000,
                                   minibatch_size=128,
                                   random_action_probability=0.05,
                                   store_every_nth=1,
Example 10
input_size = observation_size * observations_in_seq

# actions
num_actions = 2

#critic = MLP([input_size, num_actions*2], [2048, 1024, 1],
#            [tf.nn.sigmoid, tf.nn.sigmoid, tf.identity], scope='critic')
#
#actor = MLP([input_size,], [2048, 1024, num_actions],
#            [tf.nn.sigmoid, tf.nn.sigmoid, tf.identity], scope='actor')

r = tf.nn.relu
t = tf.nn.tanh

critic = MLP([input_size, num_actions], [2048, 512, 256, 256, 1],
             [t, t, t, t, tf.identity],
             scope='critic')

actor = MLP([
    input_size,
], [2048, 512, 256, 256, num_actions], [t, t, t, t, tf.identity],
            scope='actor')

# The optimizer to use. Here we use RMSProp as recommended
# by the publication
#optimizer = tf.train.RMSPropOptimizer(learning_rate= 0.0001, decay=0.9)
#optimizer = tf.train.RMSPropOptimizer(learning_rate= 0.0005, decay=0.9)
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
#optimizer = tf.train.GradientDescentOptimizer(learning_rate= 0.001)

# DiscreteDeepQ object
Example 11
    def start(self):
        """Start MLDaemon
        
        This function creates the TensorFlow controller and runs the tuning loop by
        iteratively training and choosing actions.
        """
        if self.debugging_level >= 1:
            import cProfile
            import io
            import pstats
            pr = cProfile.Profile()
            pr.enable()

        logger.info(f"Connected to database {self.conf['replaydb']['dbfile']}")

        # set stopped to False, so daemon can run
        self.stopped = False

        logger.info('Starting MLDaemon...')
        try:
            # TensorFlow business - it is always good to reset a graph before creating a new controller.
            ops.reset_default_graph()
            # ? shall we use InteractiveSession()?
            self.session = tf.Session()  # tf.InteractiveSession()

            # This little guy will let us run tensorboard
            #      tensorboard --logdir [LOG_DIR]
            journalist = tf.summary.FileWriter(self.LOG_DIR)

            # Brain maps from observation to Q values for different actions.
            # Here it is done using a multi-layer perceptron with 2 hidden
            # layers
            hidden_layer_size = max(int(self.observation_size * 1.2), 200)
            logger.info('Observation size {0}, hidden layer size {1}'.format(
                self.observation_size, hidden_layer_size))
            brain = MLP([
                self.observation_size,
            ], [hidden_layer_size, hidden_layer_size, self.opt['num_actions']],
                        [tf.tanh, tf.tanh, tf.identity])

            # The optimizer to use. Here we use RMSProp as recommended
            # by the publication
            optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001,
                                                  decay=0.9)
            # DiscreteDeepQ object
            self.controller = DiscreteDeepQ(
                (self.observation_size, ),
                self.opt['num_actions'],
                brain,
                optimizer,
                self.session,
                discount_rate=0.99,
                start_random_rate=self.start_random_rate,
                exploration_period=self.exploration_period,
                random_action_probability=self.opt['random_action_probability'],
                train_every_nth=1,
                summary_writer=journalist,
                k_action=int(self.opt['k_val']))

            self.session.run(tf.initialize_all_variables())
            self.session.run(self.controller.target_network_update)

            # Check if there is a model to be loaded before updating the graph
            if os.path.isfile(os.path.join(self.save_path, 'model')):
                self.controller.restore(self.save_path)
                logger.info('Loaded saved model from ' + self.save_path)
            else:
                logger.info('No saved model found')
            self.test_number_of_steps_after_restore = self.controller.actions_executed_so_far

            # graph was not available when journalist was created
            journalist.add_graph(self.session.graph)

            last_action_second = 0  # last action timestep
            last_training_step_duration = 0  # last training duration
            last_checkpoint_time = time.time()  # last checkpoint
            while not self.stop_requested:
                begin_time = time.time()  # set begin time to current time

                # Run training step
                logger.info('Start training step...')
                minibatch_size, prediction_error = self._do_training_step()

                if minibatch_size > 0:
                    # Check checkpoint time for every self.checkpoint_time
                    logger.info(
                        f'Time before checkpoint: {self.checkpoint_time - (time.time() - last_checkpoint_time)}'
                    )
                    if time.time() - last_checkpoint_time > self.checkpoint_time:
                        # save controller checkpoint
                        cp_path = os.path.join(
                            self.save_path,
                            'checkpoint_' + time.strftime('%Y-%m-%d_%H-%M-%S'))
                        os.mkdir(cp_path)
                        self.controller.save(cp_path)
                        # update checkpoint time
                        last_checkpoint_time = time.time()
                        logger.info('Checkpoint saved in ' + cp_path)

                    # update last training duration
                    last_training_step_duration = time.time() - begin_time
                    logger.info(
                        'Finished {step}th training step in {time} seconds '
                        'using {mb} samples with prediction error {error}.'.
                        format(step=self.controller.iteration,
                               time=last_training_step_duration,
                               mb=minibatch_size,
                               error=prediction_error))
                else:
                    logger.info('Not enough data for training yet.')

                # Check if it is time for tuning (compare the time elapsed since the
                # last action to the time left before the next action)
                if (time.time() - (last_action_second + 0.5) >=
                        self.delay_between_actions - last_training_step_duration):
                    if self.enable_tuning:
                        logger.debug('Start tuning step...')

                        try:
                            # Update memcache for the next training interval
                            self.db.refresh_memcache()
                        except:
                            pass

                        # Get the sleep time: either 0 or whatever is left until the next action starts
                        sleep_time = max(
                            0, self.delay_between_actions -
                            (time.time() - (last_action_second + 0.5)))
                        if sleep_time > 0.05:
                            # Do garbage collection before a long sleep
                            gc.collect()
                            sleep_time = max(
                                0, self.delay_between_actions -
                                (time.time() - (last_action_second + 0.5)))
                        if sleep_time > 0.0001:
                            logger.debug(f'Sleeping {sleep_time} seconds')
                            # Welp, basically sleep
                            time.sleep(sleep_time)

                        # Do action step
                        ts = int(time.time())
                        self._do_action_step(ts)
                        # Update the last-action timestamp to the current time
                        last_action_second = ts
                    else:
                        logger.debug('Tuning disabled.')
                        # Check for new data every 200 steps to reduce checking overhead
                        if self.controller.number_of_times_train_called % 200 == 0:
                            try:
                                self.db.refresh_memcache()
                                pass
                            except:
                                pass

                    # We always print out the reward to the log for analysis
                    logger.info(f'Cumulative reward: {self.cumulative_reward}')

                    # Clean log at the end for next run
                    flush_log()
        finally:
            # set stopped to True, so daemon can properly stop
            self.stopped = True
            # controller.save should not work here as the controller is still NoneType
            # self.controller.save(self.save_path)
            logger.info('MLDaemon stopped.')

            if self.debugging_level >= 1:
                pr.disable()
                s = io.StringIO()
                sortby = 'cumulative'
                ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
                ps.print_stats()
                print(s.getvalue())
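
start() above loops until self.stop_requested is set and marks self.stopped in its finally block. The excerpt does not include the corresponding shutdown method; a minimal sketch of one, assuming it is called from another thread (the method below is ours, not part of MLDaemon as shown):

import time

def stop(self, timeout=30.0):
    """Hypothetical MLDaemon.stop(): ask start()'s loop to exit and wait for it."""
    self.stop_requested = True              # checked at the top of start()'s while-loop
    deadline = time.time() + timeout
    while not self.stopped and time.time() < deadline:
        time.sleep(0.1)                     # start() sets self.stopped in its finally block
    return self.stopped
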
Example 12
observation_size = 4
observations_in_seq = 1
input_size = observation_size * observations_in_seq

# actions
num_actions = 3

#brain = MLP([input_size,], [5, 5, 5, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [20, 20, 20, 20, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.tanh, tf.identity])

#brain = MLP([input_size,], [32, 32, 32, 32, 32, num_actions],
#            [tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.identity])
brain = MLP([
    input_size,
], [64, 64, num_actions], [tf.sigmoid, tf.sigmoid, tf.identity])

# The optimizer to use. Here we use RMSProp as recommended
# by the publication
#optimizer = tf.train.RMSPropOptimizer(learning_rate= 0.0001, decay=0.9)
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)

# DiscreteDeepQ object
current_controller = DiscreteDeepQ(input_size,
                                   num_actions,
                                   brain,
                                   optimizer,
                                   session,
                                   discount_rate=0.95,
                                   target_network_update_rate=0.01,
Example 13
n_prev_frames = 3

# Tensorflow business - it is always good to reset a graph before creating a new controller.
tf.reset_default_graph()
session = tf.InteractiveSession()

# This little guy will let us run tensorboard
#      tensorboard --logdir [LOG_DIR]
journalist = tf.train.SummaryWriter(LOG_DIR)

# Brain maps from observation to Q values for different actions.
# Here it is done using a multi-layer perceptron with 2 hidden
# layers
brain = MLP([
    n_prev_frames * 4 + n_prev_frames - 1,
], [4], [tf.identity])

# The optimizer to use. Here we use RMSProp as recommended
# by the publication
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

# DiscreteDeepQ object
current_controller = DiscreteDeepQ(n_prev_frames * 4 + n_prev_frames - 1,
                                   4,
                                   brain,
                                   optimizer,
                                   session,
                                   discount_rate=0.9,
                                   exploration_period=100,
                                   max_experience=10000,
Example 14
observation_size = 4
observations_in_seq = 1
input_size = observation_size * observations_in_seq

# actions
num_actions = 1

#brain = MLP([input_size,], [5, 5, 5, num_actions], 
#            [tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [20, 20, 20, 20, num_actions], 
#            [tf.tanh, tf.tanh, tf.tanh, tf.tanh, tf.identity])

#brain = MLP([input_size,], [32, 32, 32, 32, 32, num_actions], 
#            [tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.identity])

critic = MLP([input_size, num_actions], [64, 64, 1], 
            [tf.sigmoid, tf.sigmoid, tf.identity], scope='critic')

actor = MLP([input_size,], [64, 64, num_actions], 
            [tf.sigmoid, tf.sigmoid, tf.identity], scope='actor')

# The optimizer to use. Here we use RMSProp as recommended
# by the publication
#optimizer = tf.train.RMSPropOptimizer(learning_rate= 0.0001, decay=0.9)
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

# DiscreteDeepQ object
current_controller = ContinuousDeepQ(input_size,
                                     num_actions,
                                     actor,
                                     critic,
                                     optimizer,
                                     session,
                                     discount_rate=0.99,
                                     target_actor_update_rate=0.001,
                                     target_critic_update_rate=0.001,
                                     exploration_period=5000,
                                     max_experience=10000,
                                     store_every_nth=4,
                                     train_every_nth=4,
                                     summary_writer=journalist)

#class ContinuousDeepQ
#                       observation_size,
#                       action_size,
Example 15
    def __init__(self, train_loop):

        self.train_loop = train_loop
        self.graph = train_loop.graph
        self.sess = train_loop.sess
        journalist = train_loop.logger

        num_actions = self.train_loop.num_actions
        observation_size = self.train_loop.observation_size
        observations_in_seq = 1
        input_size = observation_size * observations_in_seq
        learning_rate = 1e-4

        r = tf.nn.relu
        t = tf.nn.tanh

        self.lstm_input_size = 41
        self.lstm_layer_size = 64
        self.lstm_layers_count = 2
        self.lstm_steps_count = 5

        class MULTI_LSTM_MLP(object):
            def __init__(self,
                         input_size,
                         step_count=5,
                         layer_size=32,
                         layers_count=2,
                         batch_size=32,
                         mlp=None,
                         scope='lstm_mlp'):
                self.input_size = input_size
                self.step_count = step_count
                self.layer_size = layer_size
                self.layers_count = layers_count
                self.batch_size = batch_size
                self.mlp = mlp
                self.scope = scope

                with tf.variable_scope(scope) as sc:

                    def lstm_cell():
                        # return tf.contrib.rnn.BasicLSTMCell(self.layer_size, reuse=sc.reuse)
                        return tf.contrib.rnn.LSTMCell(
                            self.layer_size,
                            reuse=sc.reuse
                            # initializer=tf.random_uniform_initializer(-0.05, 0.05)
                            # activation=tf.nn.relu
                        )

                    self.stacked_lstm = tf.contrib.rnn.MultiRNNCell(
                        [lstm_cell() for _ in range(self.layers_count)])

                    fake_input = tf.placeholder(
                        tf.float32,
                        [self.batch_size, self.step_count, self.input_size])

                    self.initial_state_batch = self.stacked_lstm.zero_state(
                        train_loop.batch_size, tf.float32)
                    self.initial_state_one = self.stacked_lstm.zero_state(
                        1, tf.float32)
                    self.lstm_output, state = self.stacked_lstm(
                        fake_input[:, 0], self.initial_state_batch)

                    self.model_variables = [
                        v for v in tf.trainable_variables()
                        if v.name.startswith(sc.name)
                    ]
                    for v in self.model_variables:
                        print("--- MULTI_LSTM_MLP v: " + v.name)

            def __call__(self, xs):
                # If this is the critic, feed only the observation part to the LSTM;
                # the action (xs[1]) is appended to the MLP input further below.
                scope_name = self.scope if isinstance(self.scope, str) else self.scope.name
                print('call: ' + scope_name)
                if isinstance(xs, list):
                    lstm_input = xs[0]
                else:
                    lstm_input = xs
                print(lstm_input)

                # convert xs into steps
                lstm_input = tf.reshape(lstm_input,
                                        [-1, self.step_count, self.input_size])
                print(lstm_input.get_shape())

                initial_state = self.initial_state_batch
                if str(lstm_input.get_shape()[0]) == '?':
                    print('--- dynamic shape')
                    initial_state = self.initial_state_one
                print('--- initial state')
                print(lstm_input.get_shape()[0])

                with tf.variable_scope(self.scope, reuse=True):
                    state = initial_state
                    for i in range(self.step_count):
                        print('--- lstm step: {}'.format(i))
                        print(lstm_input[:, i].get_shape())
                        lstm_output, state = self.stacked_lstm(
                            lstm_input[:, i], state)
                    final_state = state

                if isinstance(xs, list):
                    return self.mlp([lstm_output, xs[1]])
                else:
                    return self.mlp(lstm_output)

            def copy(self, scope=None):
                scope = scope or self.scope + "_copy"
                print("--- copy " + scope)
                with tf.variable_scope(scope) as sc:
                    for v in self.model_variables:
                        print("--- bn: " + base_name2(v) + " " + v.name)
                        tf.get_variable(
                            base_name2(v),
                            v.get_shape(),
                            initializer=lambda x, dtype=tf.float32,
                            partition_info=None: v.initialized_value())
                    sc.reuse_variables()
                mlp_copy = self.mlp.copy('mlp_' + scope)
                return MULTI_LSTM_MLP(self.input_size,
                                      self.step_count,
                                      self.layer_size,
                                      self.layers_count,
                                      self.batch_size,
                                      mlp_copy,
                                      scope=sc)

            def variables(self):
                return self.model_variables + self.mlp.variables()

        mlp_critic = MLP([self.lstm_layer_size, num_actions],
                         [256, 256, 256, 256, 1], [r, r, r, t, tf.identity],
                         scope='mlp_critic')

        mlp_actor = MLP([
            self.lstm_layer_size,
        ], [256, 256, 256, 256, num_actions], [r, r, r, t, tf.nn.sigmoid],
                        scope='mlp_actor')

        self.actor = MULTI_LSTM_MLP(self.lstm_input_size,
                                    self.lstm_steps_count,
                                    self.lstm_layer_size,
                                    self.lstm_layers_count,
                                    train_loop.batch_size, mlp_actor, 'actor')

        critic = MULTI_LSTM_MLP(self.lstm_input_size, self.lstm_steps_count,
                                self.lstm_layer_size, self.lstm_layers_count,
                                train_loop.batch_size, mlp_critic, 'critic')

        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

        self.controller = ContinuousDeepQ(
            input_size,
            num_actions,
            self.actor,
            critic,
            optimizer,
            self.sess,
            discount_rate=0.99,
            target_actor_update_rate=0.01,
            target_critic_update_rate=0.01,
            exploration_period=5000,
            max_experience=10000,
            store_every_nth=4,
            train_every_nth=4,
            summary_writer=journalist,
            rewards=self.train_loop.dequeued_rewards,
            given_action=self.train_loop.dequeued_actions,
            observation=self.train_loop.dequeued_prev_states,
            next_observation=self.train_loop.dequeued_next_states,
            next_observation_mask=tf.ones(
                self.train_loop.dequeued_rewards.get_shape(), tf.float32))
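
In this last example ContinuousDeepQ still receives a flat observation vector; MULTI_LSTM_MLP.__call__ reshapes it to [batch, step_count, input_size] before unrolling the LSTM, so each flat observation must carry lstm_steps_count * lstm_input_size = 5 * 41 = 205 features. A tiny NumPy check of that layout assumption:

import numpy as np

steps, features = 5, 41                     # lstm_steps_count, lstm_input_size above
flat_observation = np.zeros((1, steps * features), dtype=np.float32)

# Same reshape the actor and critic perform internally: [batch, steps, features].
sequence = flat_observation.reshape(-1, steps, features)
assert sequence.shape == (1, 5, 41)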