Code example #1
def save_model():
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
    obs_spec = TensorSpec((7,), dtype=tf.float32, name='observation')
    action_spec = BoundedTensorSpec(
        (1,), dtype=tf.int32, minimum=0, maximum=3, name='action')
    actor_net = ActorDistributionRnnNetwork(
        obs_spec, action_spec, lstm_size=(100, 100))
    value_net = ValueRnnNetwork(obs_spec)
    agent = ppo_agent.PPOAgent(
        time_step_spec=time_step_spec(obs_spec),
        action_spec=action_spec,
        optimizer=optimizer,
        actor_net=actor_net,
        value_net=value_net,
        normalize_observations=True,
        normalize_rewards=True,
        use_gae=True,
        num_epochs=1,
    )
    checkpointer = Checkpointer(
        ckpt_dir='checkpoints/policy',
        max_to_keep=1,
        agent=agent,
        policy=agent.policy,
        global_step=tf.compat.v1.train.get_or_create_global_step())
    checkpointer.initialize_or_restore()
    saver = policy_saver.PolicySaver(agent.policy)
    saver.save('final_policy')
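The example above only exports the policy. As a minimal sketch (not part of the original snippet), the SavedModel written by PolicySaver can be restored with tf.saved_model.load; the batch size and the time_step below are assumptions for illustration:

# Hypothetical loading sketch for the policy exported above.
loaded_policy = tf.saved_model.load('final_policy')
# The RNN policy is stateful, so an initial state is needed (batch size of 1 assumed).
policy_state = loaded_policy.get_initial_state(batch_size=1)
# `time_step` is assumed to come from an environment matching obs_spec.
action_step = loaded_policy.action(time_step, policy_state)
policy_state = action_step.state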
Code example #2
def create_algorithm(env, use_rnn=False, learning_rate=1e-1):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()

    if use_rnn:
        actor_net = ActorDistributionRnnNetwork(observation_spec,
                                                action_spec,
                                                input_fc_layer_params=(),
                                                output_fc_layer_params=(),
                                                lstm_size=(4, ))
        value_net = ValueRnnNetwork(observation_spec,
                                    input_fc_layer_params=(),
                                    output_fc_layer_params=(),
                                    lstm_size=(4, ))
    else:
        actor_net = ActorDistributionNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=(),
            continuous_projection_net=StableNormalProjectionNetwork)
        value_net = ValueNetwork(observation_spec, fc_layer_params=())

    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

    return PPOAlgorithm(action_spec=action_spec,
                        actor_network=actor_net,
                        value_network=value_net,
                        loss=PPOLoss(action_spec=action_spec,
                                     gamma=1.0,
                                     debug_summaries=DEBUGGING),
                        optimizer=optimizer,
                        debug_summaries=DEBUGGING)
Code example #3
def create_algorithm(env, use_rnn=False, learning_rate=1e-1):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()

    if use_rnn:
        actor_net = ActorDistributionRnnNetwork(observation_spec,
                                                action_spec,
                                                input_fc_layer_params=(),
                                                output_fc_layer_params=(),
                                                lstm_size=(4, ))
        value_net = ValueRnnNetwork(observation_spec,
                                    input_fc_layer_params=(),
                                    output_fc_layer_params=(),
                                    lstm_size=(4, ))
    else:
        actor_net = ActorDistributionNetwork(observation_spec,
                                             action_spec,
                                             fc_layer_params=())
        value_net = ValueNetwork(observation_spec, fc_layer_params=())

    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

    ac_algorithm = ActorCriticAlgorithm(action_spec=action_spec,
                                        actor_network=actor_net,
                                        value_network=value_net,
                                        loss=PPOLoss(action_spec=action_spec,
                                                     gamma=1.0),
                                        optimizer=optimizer)
    return PPOAlgorithm(ac_algorithm)
Code example #4
def create_networks(observation_spec, action_spec):
    actor_net = ActorDistributionRnnNetwork(observation_spec,
                                            action_spec,
                                            conv_layer_params=[(16, 8, 4),
                                                               (32, 4, 2)],
                                            input_fc_layer_params=(256, ),
                                            lstm_size=(256, ),
                                            output_fc_layer_params=(128, ),
                                            activation_fn=tf.nn.elu)
    value_net = ValueRnnNetwork(observation_spec,
                                conv_layer_params=[(16, 8, 4), (32, 4, 2)],
                                input_fc_layer_params=(256, ),
                                lstm_size=(256, ),
                                output_fc_layer_params=(128, ),
                                activation_fn=tf.nn.elu)

    return actor_net, value_net
Code example #5
def create_networks(tf_env):
    actor_net = ActorDistributionRnnNetwork(tf_env.observation_spec(),
                                            tf_env.action_spec(),
                                            conv_layer_params=[(16, 4, 2),
                                                               (32, 2, 1)],
                                            input_fc_layer_params=(256, ),
                                            lstm_size=(256, ),
                                            output_fc_layer_params=(128, ))

    value_net = ValueRnnNetwork(tf_env.observation_spec(),
                                conv_layer_params=[(16, 4, 2), (32, 2, 1)],
                                input_fc_layer_params=(256, ),
                                lstm_size=(256, ),
                                output_fc_layer_params=(128, ),
                                activation_fn=tf.nn.elu)

    return actor_net, value_net
Code example #6
def _create_ppo_algorithm():
    observation_spec = common.get_observation_spec()
    action_spec = common.get_action_spec()
    optimizer = tf.optimizers.Adam(learning_rate=1e-3)

    actor_net = ActorDistributionRnnNetwork(observation_spec,
                                            action_spec,
                                            input_fc_layer_params=(),
                                            output_fc_layer_params=None)
    value_net = ValueRnnNetwork(observation_spec,
                                input_fc_layer_params=(),
                                output_fc_layer_params=None)

    return PPOAlgorithm(action_spec=action_spec,
                        actor_network=actor_net,
                        value_network=value_net,
                        loss_class=PPOLoss,
                        optimizer=optimizer,
                        debug_summaries=True)
Code example #7
def create_networks(observation_spec, action_spec):

    preprocessing_combiner = tf.keras.layers.Concatenate()
    actor_net = ActorDistributionRnnNetwork(
        observation_spec,
        action_spec,
        conv_layer_params=[(16, 8, 4), (32, 4, 2)],
        input_fc_layer_params=(256, ),
        lstm_size=(256, ),
        preprocessing_combiner=preprocessing_combiner,
        output_fc_layer_params=(128, ),
        activation_fn=tf.nn.elu)
    value_net = ValueRnnNetwork(observation_spec,
                                conv_layer_params=[(16, 8, 4), (32, 4, 2)],
                                input_fc_layer_params=(256, ),
                                preprocessing_combiner=preprocessing_combiner,
                                lstm_size=(256, ),
                                output_fc_layer_params=(128, ),
                                activation_fn=tf.nn.elu)

    return actor_net, value_net
Code example #8
File: on_policy_driver_test.py  Project: runjerry/alf
    def test_actor_critic_rnn_policy(self):
        batch_size = 100
        steps_per_episode = 5
        gap = 3

        env = RNNPolicyUnittestEnv(batch_size, steps_per_episode, gap)
        # We need to wrap env using TFPyEnvironment because the methods of env
        # have side effects (e.g., env._current_time_step can be changed).
        env = TFPyEnvironment(env)
        action_spec = env.action_spec()
        observation_spec = env.observation_spec()
        algorithm = ActorCriticAlgorithm(
            observation_spec=observation_spec,
            action_spec=action_spec,
            actor_network=ActorDistributionRnnNetwork(
                observation_spec,
                action_spec,
                input_fc_layer_params=(),
                output_fc_layer_params=None),
            value_network=ValueRnnNetwork(observation_spec,
                                          input_fc_layer_params=(),
                                          output_fc_layer_params=None),
            optimizer=tf.optimizers.Adam(learning_rate=1e-2))
        driver = OnPolicyDriver(env, algorithm, train_interval=9)
        eval_driver = OnPolicyDriver(env, algorithm, training=False)

        driver.run = tf.function(driver.run)

        t0 = time.time()
        driver.run(max_num_steps=2000 * batch_size)
        logging.info("time=%s" % (time.time() - t0))

        env.reset()
        time_step, _ = eval_driver.run(max_num_steps=4 * batch_size)
        logging.info("reward=%s" % tf.reduce_mean(time_step.reward))
        self.assertAlmostEqual(1.0,
                               float(tf.reduce_mean(time_step.reward)),
                               delta=5e-2)
Code example #9
File: target_navigation.py  Project: ruizhaogit/alf
def get_ac_networks(conv_layer_params=None,
                    num_embedding_dims=None,
                    fc_layer_params=None,
                    num_state_tiles=None,
                    num_sentence_tiles=None):
    """
    Generate the actor and value networks

    Args:
        conv_layer_params (list[int 3 tuple]): optional convolution layers
            parameters, where each item is a length-three tuple indicating
            (filters, kernel_size, stride).
        num_embedding_dims (int): optional number of dimensions of the
            vocabulary embedding space.
        fc_layer_params (list[int]): optional fully_connected parameters, where
            each item is the number of units in the layer.
        num_state_tiles (int): optional number of times to repeat the
            internal state tensor before concatenation with other inputs.
            The rationale is to match the number of dimensions of the image
            input, so that the final concatenation has roughly equal
            representation from each source of input.  Without this, the
            image input, because of its large size, typically dominates all
            of the other low-dimensional inputs.
        num_sentence_tiles (int): optional number of times to repeat the
            sentence embedding tensor before concatenation with other inputs,
            so that the sentence input is not dominated by high-dimensional
            inputs such as the image observation.
    """
    observation_spec = common.get_observation_spec()
    action_spec = common.get_action_spec()

    conv_layers = tf.keras.Sequential(
        tf_agents.networks.utils.mlp_layers(
            conv_layer_params=conv_layer_params))

    preprocessing_layers = {
        'image': conv_layers,
    }
    if common.get_states_shape():
        state_layers = get_identity_layer()
        # [image: (1, 12800), sentence: (1, 16 * 800), states: (1, 16 * 800)]
        # Here, we tile along the last dimension of the input.
        if num_state_tiles:
            state_layers = tf.keras.Sequential([
                tf.keras.layers.Lambda(
                    lambda x: tf.tile(x, multiples=[1, num_state_tiles]))
            ])
        preprocessing_layers['states'] = state_layers

    vocab_size = common.get_vocab_size()
    if vocab_size:
        sentence_layers = tf.keras.Sequential([
            tf.keras.layers.Embedding(vocab_size, num_embedding_dims),
            tf.keras.layers.GlobalAveragePooling1D()
        ])
        if num_sentence_tiles:
            sentence_layers.add(
                tf.keras.layers.Lambda(
                    lambda x: tf.tile(x, multiples=[1, num_sentence_tiles])))
        preprocessing_layers['sentence'] = sentence_layers

    preprocessing_combiner = tf.keras.layers.Concatenate()

    actor = ActorDistributionRnnNetwork(
        input_tensor_spec=observation_spec,
        output_tensor_spec=action_spec,
        preprocessing_layers=preprocessing_layers,
        preprocessing_combiner=preprocessing_combiner,
        input_fc_layer_params=fc_layer_params)

    value = ValueRnnNetwork(input_tensor_spec=observation_spec,
                            preprocessing_layers=preprocessing_layers,
                            preprocessing_combiner=preprocessing_combiner,
                            input_fc_layer_params=fc_layer_params)

    return actor, value
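A hypothetical call of the factory above, assuming common.get_observation_spec() returns a dict with 'image', 'states', and 'sentence' entries matching the preprocessing layers; the parameter values are illustrative only:

actor_net, value_net = get_ac_networks(
    conv_layer_params=[(16, 8, 4), (32, 4, 2)],  # (filters, kernel_size, stride)
    num_embedding_dims=16,
    fc_layer_params=(256, ),
    num_state_tiles=800,
    num_sentence_tiles=800)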
Code example #10
def create_ac_algorithm(env,
                        actor_fc_layers=(200, 100),
                        value_fc_layers=(200, 100),
                        encoding_conv_layers=(),
                        encoding_fc_layers=(),
                        use_rnns=False,
                        use_icm=False,
                        learning_rate=5e-5,
                        algorithm_class=ActorCriticAlgorithm,
                        loss_class=ActorCriticLoss,
                        debug_summaries=False):
    """Create a simple ActorCriticAlgorithm.

    Args:
        env (TFEnvironment): A TFEnvironment
        actor_fc_layers (list[int]): list of fc layer parameters for the actor network
        value_fc_layers (list[int]): list of fc layer parameters for the value network
        encoding_conv_layers (list[int]): list of convolution layer parameters for the encoding network
        encoding_fc_layers (list[int]): list of fc layer parameters for the encoding network
        use_rnns (bool): True if rnn should be used
        use_icm (bool): True if intrinsic curiosity module should be used
        learning_rate (float): learning rate
        algorithm_class (type): class of the algorithm. Can be
            ActorCriticAlgorithm or PPOAlgorithm
        loss_class (type): the class of the loss. The signature of its
            constructor: loss_class(action_spec, debug_summaries)
        debug_summaries (bool): True if debug summaries should be created.
    """
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

    if use_rnns:
        actor_net = ActorDistributionRnnNetwork(
            env.observation_spec(),
            env.action_spec(),
            input_fc_layer_params=actor_fc_layers,
            output_fc_layer_params=None)
        value_net = ValueRnnNetwork(env.observation_spec(),
                                    input_fc_layer_params=value_fc_layers,
                                    output_fc_layer_params=None)
    else:
        actor_net = ActorDistributionNetwork(env.observation_spec(),
                                             env.action_spec(),
                                             fc_layer_params=actor_fc_layers)
        value_net = ValueNetwork(env.observation_spec(),
                                 fc_layer_params=value_fc_layers)

    encoding_net = None
    if encoding_fc_layers or encoding_conv_layers:
        encoding_net = EncodingNetwork(
            input_tensor_spec=env.observation_spec(),
            conv_layer_params=encoding_conv_layers,
            fc_layer_params=encoding_fc_layers)

    icm = None
    if use_icm:
        feature_spec = env.observation_spec()
        if encoding_net:
            feature_spec = tf.TensorSpec((encoding_fc_layers[-1], ),
                                         dtype=tf.float32)
        icm = ICMAlgorithm(env.action_spec(),
                           feature_spec,
                           encoding_net=encoding_net)

    algorithm = algorithm_class(action_spec=env.action_spec(),
                                actor_network=actor_net,
                                value_network=value_net,
                                intrinsic_curiosity_module=icm,
                                loss_class=loss_class,
                                optimizer=optimizer,
                                debug_summaries=debug_summaries)

    return algorithm
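Since the docstring notes that algorithm_class can be PPOAlgorithm and that loss_class must accept (action_spec, debug_summaries), a hypothetical PPO variant of the call could look like this (class names assumed to be imported as in the other examples; the learning rate is illustrative):

ppo_algorithm = create_ac_algorithm(env,
                                    use_rnns=True,
                                    algorithm_class=PPOAlgorithm,
                                    loss_class=PPOLoss,
                                    learning_rate=1e-4)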
Code example #11
def main():
    if len(sys.argv) != 2:
        raise ValueError(f"Usage: ./{sys.argv[0]} experiment_name")
    experiment_name = sys.argv[1]

    tf.compat.v1.enable_v2_behavior()
    # Create train and evaluation environments for TensorFlow
    train_env = tf_py_environment.TFPyEnvironment(
        parallel_py_environment.ParallelPyEnvironment(
            [Environment.Environment] * num_parallel_environments))

    eval_py_env = Environment.Environment()
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    optimizer = Adam(learning_rate=learning_rate, epsilon=1e-5)

    global_step = tf.compat.v1.train.get_or_create_global_step()
    timed_at_step = global_step.numpy()

    # Initialize actor and value networks
    actor_net = ActorDistributionRnnNetwork(
        input_tensor_spec=train_env.observation_spec(),
        output_tensor_spec=train_env.action_spec(),
        conv_layer_params=[(3, 4, 1), (7, 4, 2), (5, 8, 2)],
        input_fc_layer_params=(128, ),
        lstm_size=(128, ),
        output_fc_layer_params=(64, ),
        activation_fn=tf.nn.elu)

    value_net = ValueRnnNetwork(input_tensor_spec=train_env.observation_spec(),
                                conv_layer_params=[(3, 4, 1), (7, 4, 2),
                                                   (5, 8, 2)],
                                input_fc_layer_params=(128, ),
                                lstm_size=(128, ),
                                output_fc_layer_params=(64, ),
                                activation_fn=tf.nn.elu)

    agent = ppo_agent.PPOAgent(train_env.time_step_spec(),
                               train_env.action_spec(),
                               optimizer,
                               actor_net,
                               value_net,
                               num_epochs=num_epochs,
                               train_step_counter=global_step,
                               discount_factor=0.99,
                               gradient_clipping=0.5,
                               entropy_regularization=1e-2,
                               importance_ratio_clipping=0.2,
                               use_gae=True,
                               use_td_lambda_return=True)

    agent.initialize()
    step_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps()
    ]

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        train_env,
        agent.collect_policy,
        observers=[replay_buffer.add_batch] + step_metrics,
        num_episodes=collect_episodes_per_iteration)

    environment_steps_metric = tf_metrics.EnvironmentSteps()

    collect_time = 0
    train_time = 0

    # Reset the train step
    agent.train_step_counter.assign(0)

    saved_model_dir = os.path.join("saved_models", experiment_name)
    checkpoint_dir = os.path.join(saved_model_dir, 'checkpoint')
    train_checkpointer = common.Checkpointer(
        ckpt_dir=checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        policy=agent.policy,
        #replay_buffer=replay_buffer,
        global_step=global_step)

    train_checkpointer.initialize_or_restore()
    global_step = tf.compat.v1.train.get_global_step()
    print(f"Starting training at step: {global_step.numpy()}")
    while environment_steps_metric.result() < num_environment_steps:
        start_time = time.time()
        collect_driver.run()
        collect_time += time.time() - start_time

        start_time = time.time()
        trajectories = replay_buffer.gather_all()
        total_loss, unused_info = agent.train(experience=trajectories)
        replay_buffer.clear()
        train_time += time.time() - start_time

        global_step_val = global_step.numpy()
        if global_step_val % eval_interval == 0:
            with open(
                    eval_path(saved_model_dir, global_step_val, eval_interval),
                    'w') as f:
                avg_return = evaluate_perf(f, eval_env, agent.policy,
                                           num_eval_episodes)
            steps_per_sec = ((global_step_val - timed_at_step) /
                             (collect_time + train_time))
            print(
                f"step = {global_step_val}: loss = {total_loss}, Avg return: {avg_return}, {steps_per_sec:.3f} steps/sec, collect_time = {collect_time}, train_time = {train_time}"
            )
            timed_at_step = global_step_val
            collect_time = 0
            train_time = 0

        if global_step_val % policy_saver_interval == 0:
            train_checkpointer.save(global_step_val)
Code example #12
def main(_):

    # environment serves as the dataset in reinforcement learning
    train_env = tf_py_environment.TFPyEnvironment(
        ParallelPyEnvironment([lambda: suite_mujoco.load('HalfCheetah-v2')] *
                              batch_size))
    eval_env = tf_py_environment.TFPyEnvironment(
        suite_mujoco.load('HalfCheetah-v2'))
    # create agent
    actor_net = ActorDistributionRnnNetwork(train_env.observation_spec(),
                                            train_env.action_spec(),
                                            lstm_size=(100, 100))
    value_net = ValueRnnNetwork(train_env.observation_spec())
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
    tf_agent = ppo_agent.PPOAgent(train_env.time_step_spec(),
                                  train_env.action_spec(),
                                  optimizer=optimizer,
                                  actor_net=actor_net,
                                  value_net=value_net,
                                  normalize_observations=False,
                                  normalize_rewards=False,
                                  use_gae=True,
                                  num_epochs=25)
    tf_agent.initialize()
    # replay buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=1000000)
    # policy saver
    saver = policy_saver.PolicySaver(tf_agent.policy)
    # define trajectory collector
    train_episode_count = tf_metrics.NumberOfEpisodes()
    train_total_steps = tf_metrics.EnvironmentSteps()
    train_avg_reward = tf_metrics.AverageReturnMetric(
        batch_size=train_env.batch_size)
    train_avg_episode_len = tf_metrics.AverageEpisodeLengthMetric(
        batch_size=train_env.batch_size)
    train_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        train_env,
        tf_agent.collect_policy,  # NOTE: use PPOPolicy to collect episodes
        observers=[
            replay_buffer.add_batch, train_episode_count, train_total_steps,
            train_avg_reward, train_avg_episode_len
        ],  # callbacks when an episode is completely collected
        num_episodes=30,  # how many episodes are collected in an iteration
    )
    # training
    eval_avg_reward = tf_metrics.AverageReturnMetric(buffer_size=30)
    eval_avg_episode_len = tf_metrics.AverageEpisodeLengthMetric(
        buffer_size=30)
    while train_total_steps.result() < 25000000:
        train_driver.run()
        trajectories = replay_buffer.gather_all()
        loss, _ = tf_agent.train(experience=trajectories)
        replay_buffer.clear()
        # clear collected episodes right after training
        if tf_agent.train_step_counter.numpy() % 50 == 0:
            print('step = {0}: loss = {1}'.format(
                tf_agent.train_step_counter.numpy(), loss))
        if tf_agent.train_step_counter.numpy() % 500 == 0:
            # save checkpoint
            saver.save('checkpoints/policy_%d' %
                       tf_agent.train_step_counter.numpy())
            # evaluate the updated policy
            eval_avg_reward.reset()
            eval_avg_episode_len.reset()
            eval_driver = dynamic_episode_driver.DynamicEpisodeDriver(
                eval_env,
                tf_agent.policy,
                observers=[
                    eval_avg_reward,
                    eval_avg_episode_len,
                ],
                num_episodes=30,  # how many episodes are collected in an iteration
            )
            eval_driver.run()
            print(
                'step = {0}: Average Return = {1} Average Episode Length = {2}'
                .format(tf_agent.train_step_counter.numpy(),
                        eval_avg_reward.result(),
                        eval_avg_episode_len.result()))
    # play HalfCheetah for the last 3 episodes and visualize
    import cv2
    for _ in range(3):
        status = eval_env.reset()
        policy_state = tf_agent.policy.get_initial_state(eval_env.batch_size)
        while not status.is_last():
            action = tf_agent.policy.action(status, policy_state)
            # NOTE: use greedy policy to test
            status = eval_env.step(action.action)
            policy_state = action.state
            cv2.imshow('halfcheetah', eval_env.pyenv.envs[0].render())
            cv2.waitKey(25)
Code example #13
File: utils.py  Project: zoetsekas/marl_ppo
def make_networks(env,
                  size=(96, 96),
                  num_frames=1,
                  num_channels=3,
                  conv_params=[(16, 8, 4), (32, 3, 2)],
                  in_fc_params=(256, ),
                  out_fc_params=(128, ),
                  use_lstm=False,
                  lstm_size=(256, )):
    """ Creates the actor and critic neural networks of the PPO agent.

    Function for creating the neural networks for the PPO agent, namely the
    actor and value networks.

    Source for network params:
    https://www.arconsis.com/unternehmen/blog/reinforcement-learning-doom-with-tf-agents-and-ppo

    Arguments:
        env (TfPyEnvironment): A TensorFlow environment the agent interacts with.
        size (tuple):  The desired width and height of the observation space.
            Defaults to (96, 96).  Input tuple should preserve the original
            observation aspect ratio.
        num_frames (int):  Number of frames used in the agent's observation.
            Defaults to 1, num_frames > 1 indicates frame stacking.
        num_channels (int):  Number of color channels to include for each frame.
            Defaults to 3 (RGB), and 1 denotes grayscale.
        conv_params (list): A list corresponding to convolutional layer
            parameters for the PPO agent's actor and critic neural networks.
        in_fc_params (tuple): The number of neurons in the input fully
            connected layer of the actor and critic networks of the agent.
        out_fc_params (tuple): The number of neurons in the output fully
            connected layer of the actor and critic networks of the agent.
        use_lstm (bool):  Whether to use LSTM-based actor and critic networks.
        lstm_size (tuple): The number of hidden states inside the LSTM for the
            actor and critic networks of the agents.

    Returns:
        actor_net (ActorDistributionNetwork): A tf-agents Actor Distribution
            Network used for PPO agent action selection.
        value_net (ValueNetwork): A tf-agents Value Network used for
            PPO agent value estimation.
    """
    # Restructure time step spec to match expected processed observations
    processed_shape = tuple(size + (num_channels * num_frames, ))
    obs_spec = env.observation_spec()  # Get old observation spec
    obs_spec = tensor_spec.BoundedTensorSpec(processed_shape,
                                             obs_spec.dtype,
                                             minimum=obs_spec.minimum,
                                             maximum=obs_spec.maximum,
                                             name=obs_spec.name)
    if use_lstm:  # LSTM-based policies
        # Define actor network
        actor_net = ActorDistributionRnnNetwork(
            obs_spec,
            env.action_spec(),
            conv_layer_params=conv_params,
            input_fc_layer_params=in_fc_params,
            lstm_size=lstm_size,
            output_fc_layer_params=out_fc_params)
        # Define value network
        value_net = ValueRnnNetwork(obs_spec,
                                    conv_layer_params=conv_params,
                                    input_fc_layer_params=in_fc_params,
                                    lstm_size=lstm_size,
                                    output_fc_layer_params=out_fc_params)

        print("Created Actor and Value Networks with LSTM...")

    else:  # non-LSTM-based policies
        # Define actor network
        actor_net = ActorDistributionNetwork(obs_spec,
                                             env.action_spec(),
                                             conv_layer_params=conv_params)
        # Define value network
        value_net = ValueNetwork(obs_spec, conv_layer_params=conv_params)

    return actor_net, value_net
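A hypothetical usage sketch, assuming env is a TFPyEnvironment whose processed observations match the (96, 96, 3) shape implied by the defaults documented above:

actor_net, value_net = make_networks(env,
                                      size=(96, 96),
                                      num_frames=1,
                                      num_channels=3,
                                      use_lstm=True,
                                      lstm_size=(256, ))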