def retraining(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # defaults to 500 when total_timesteps is also None
        nb_epoch_cycles=4,  #50
        nb_rollout_steps=3,  #100
        reward_scale=1.0,
        render=False,
        render_eval=False,
        #   noise_type='adaptive-param_0.2',
        noise_type='normal_0.2',
        #   noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-4,
        #   actor_lr=1e-6,
        #   critic_lr=1e-5,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker,  50
        nb_eval_steps=1,  #100
        batch_size=640,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  #50
        **network_kwargs):
    """Continue training a DDPG agent from a saved initialization policy.

    Builds the replay memory, actor, critic and DDPG agent, initializes the
    session, loads the initialization policy with
    ``agent.load_ini(sess, save_path)`` and then runs ``nb_epochs`` epochs.
    Each epoch resets ``env``, saves the agent to ``save_path`` and performs
    ``nb_epoch_cycles`` cycles of: ``nb_rollout_steps`` noisy rollout steps,
    ``nb_train_steps`` training steps and, if ``eval_env`` is given,
    ``nb_eval_steps`` noise-free evaluation steps.

    Side effects per epoch: prints debug output, logs a table of statistics
    through ``logger``, redraws and saves ``ddpg_mean_retrain.png`` in the
    current directory and, on MPI rank 0 with a log directory configured,
    pickles ``env.get_state()`` / ``eval_env.get_state()`` when available.

    Args:
        save_path: Location passed to ``agent.load_ini`` and ``agent.save``.
        network: Network spec forwarded to ``Actor`` and ``Critic``.
        env: Environment exposing ``num_actions``, ``reset()`` (returns an
            observation whose first axis is the number of parallel envs) and
            ``step(action)`` returning ``(new_obs, reward, done)``.
        seed: Unused in this function.
        total_timesteps: If given, ``nb_epochs`` is derived from it as
            ``total_timesteps // (nb_epoch_cycles * nb_rollout_steps)``.
        nb_epochs: Number of epochs. Must be None when ``total_timesteps``
            is given; defaults to 500 when both are None.
        nb_epoch_cycles: Rollout/train/eval cycles per epoch.
        nb_rollout_steps: Environment steps per cycle.
        reward_scale, normalize_returns, normalize_observations,
        critic_l2_reg, actor_lr, critic_lr, popart, gamma, clip_norm,
        batch_size, tau: Forwarded unchanged to ``DDPG``.
        render: Unused in this function.
        render_eval: Call ``eval_env.render()`` after every eval step.
        noise_type: Comma-separated noise specs of the form ``name_stddev``
            (``adaptive-param``, ``normal``, ``ou``) or ``none``; None
            disables noise entirely.
        nb_train_steps: Training steps per cycle.
        nb_eval_steps: Evaluation steps per cycle (only with ``eval_env``).
        eval_env: Optional evaluation environment whose ``step`` returns a
            4-tuple ``(obs, reward, done, info)``.
        param_noise_adaption_interval: Adapt parameter noise every this many
            training steps once the memory holds at least one batch.
        **network_kwargs: Extra keyword arguments for ``Actor``/``Critic``.

    Returns:
        The DDPG agent after the last epoch.

    Raises:
        AssertionError: If both ``total_timesteps`` and ``nb_epochs`` are set.
        RuntimeError: If ``noise_type`` contains an unknown noise spec.
    """
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    elif nb_epochs is None:
        # Only fall back to the historical default when the caller supplied
        # neither budget; an explicit nb_epochs used to be overwritten here.
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    mpi_size = MPI.COMM_WORLD.Get_size()  # loop-invariant, query once
    nb_actions = env.num_actions
    action_shape = np.array(nb_actions * [0]).shape

    # Original author's note: 4 pairs pos + 1 pair target pos.
    # NOTE(review): the observation layout is defined by env — confirm there.
    nb_features = 2 * (env.num_actions + 2)
    observation_shape = np.array(nb_features * [0]).shape

    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    # Parse the comma-separated noise specification; a later entry of the
    # same family overrides an earlier one.
    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    agent = DDPG(actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # one entry per env
    episodes = 0  # never incremented here; reported as-is in the stats
    t = 0  # total rollout steps taken so far
    step_set = []

    start_time = time.time()

    mean_epoch_episode_rewards = []
    epoch_episode_steps = []  # never filled here, so its mean logs as NaN
    epoch_actions = []  # accumulated over ALL epochs (never cleared)
    epoch_qs = []  # accumulated over ALL epochs (never cleared)
    epoch_episodes = 0

    def first_scalar(value):
        # Collapse a stat to the single number sent through allreduce. An
        # empty stat (e.g. no eval episode finished) becomes NaN instead of
        # raising IndexError.
        flat = np.array(value).flatten()
        return flat[0] if flat.size else np.nan

    # Load the initialization policy on top of the freshly initialized agent.
    agent.load_ini(sess, save_path)
    for epoch in range(nb_epochs):
        print(nb_epochs)
        obs = env.reset()
        # NOTE(review): the save happens at the START of each epoch, so the
        # updates made during the final epoch are not written by this
        # function — confirm whether the caller saves the returned agent.
        agent.save(save_path)
        epoch_episode_rewards = []
        # Bound up front so the stats section below cannot hit a NameError
        # when nb_epoch_cycles == 0; they are still reset every cycle, so
        # the logged values reflect the last cycle only.
        epoch_actor_losses = []
        epoch_critic_losses = []
        epoch_adaptive_distances = []
        eval_episode_rewards = []
        eval_qs = []
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action (exploration noise enabled).
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                print('action:', action)

                new_obs, r, done = env.step(action)
                t += 1

                episode_reward += r

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.

                obs = new_obs

            # Each cycle is treated as one "episode" for reward statistics.
            epoch_episode_rewards.append(episode_reward)
            episode_reward = np.zeros(nenvs, dtype=np.float32)

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                eval_episode_reward = np.zeros(eval_obs.shape[0],
                                               dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        combined_stats = agent.get_stats().copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        step_set.append(t)
        # NOTE: the figure is not cleared, so every epoch draws one more
        # (overlapping) curve onto the current matplotlib figure.
        plt.plot(step_set,
                 mean_epoch_episode_rewards,
                 color='r',
                 label='Initialization')
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean_retrain.png')

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        # Sum every stat across MPI workers (positionally), then average.
        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array([first_scalar(x) for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
    print('stepset: ', step_set)
    print('rewards: ', mean_epoch_episode_rewards)

    return agent
コード例 #2
0
def testing(save_path, network, env,
          seed=None,
          total_timesteps=None,
          nb_epochs=None, # defaults to 500 when total_timesteps is also None
          nb_epoch_cycles=50,
          nb_rollout_steps=3,  #100
          reward_scale=1.0,
          render=False,
          render_eval=False,
          # no noise for test, hence no noise_type parameter
          normalize_returns=False,
          normalize_observations=True,
          critic_l2_reg=1e-2,
          actor_lr=1e-4,
          critic_lr=1e-3,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          nb_train_steps=3, # per epoch cycle and MPI worker,  50
          nb_eval_steps=1,  #100
          batch_size=640, # per MPI worker
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=3, #50
          **network_kwargs):
    """Roll out a saved DDPG policy without exploration noise or training.

    Builds the replay memory, actor, critic and DDPG agent, restores the
    agent with ``agent.load(sess, save_path)`` and then runs ``nb_epochs``
    epochs. Each epoch resets ``env`` and performs ``nb_epoch_cycles``
    cycles of ``nb_rollout_steps`` noise-free steps; transitions are still
    stored in the replay memory, but ``agent.train()`` is never called. If
    ``eval_env`` is given, ``nb_eval_steps`` evaluation steps follow each
    cycle.

    Side effects per epoch: prints ``nb_epochs``, logs a table of statistics
    through ``logger``, redraws and saves ``ddpg_mean_test.png`` in the
    current directory and, on MPI rank 0 with a log directory configured,
    pickles ``env.get_state()`` / ``eval_env.get_state()`` when available.

    Args:
        save_path: Location passed to ``agent.load``.
        network: Network spec forwarded to ``Actor`` and ``Critic``.
        env: Environment exposing ``num_actions``, ``reset()`` (returns an
            observation whose first axis is the number of parallel envs) and
            ``step(action)`` returning ``(new_obs, reward, done)``.
        seed: Unused in this function.
        total_timesteps: If given, ``nb_epochs`` is derived from it as
            ``total_timesteps // (nb_epoch_cycles * nb_rollout_steps)``.
        nb_epochs: Number of epochs. Must be None when ``total_timesteps``
            is given; defaults to 500 when both are None.
        nb_epoch_cycles: Rollout/eval cycles per epoch.
        nb_rollout_steps: Environment steps per cycle.
        reward_scale, normalize_returns, normalize_observations,
        critic_l2_reg, actor_lr, critic_lr, popart, gamma, clip_norm,
        batch_size, tau: Forwarded unchanged to ``DDPG``.
        render: Unused in this function.
        render_eval: Call ``eval_env.render()`` after every eval step.
        nb_train_steps: Unused (no training during testing).
        nb_eval_steps: Evaluation steps per cycle (only with ``eval_env``).
        eval_env: Optional evaluation environment whose ``step`` returns a
            4-tuple ``(obs, reward, done, info)``.
        param_noise_adaption_interval: Unused (no noise during testing).
        **network_kwargs: Extra keyword arguments for ``Actor``/``Critic``.

    Returns:
        The restored DDPG agent.

    Raises:
        AssertionError: If both ``total_timesteps`` and ``nb_epochs`` are set.
    """
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    elif nb_epochs is None:
        # Only fall back to the historical default when the caller supplied
        # neither budget; an explicit nb_epochs used to be overwritten here.
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    mpi_size = MPI.COMM_WORLD.Get_size()  # loop-invariant, query once
    nb_actions = env.num_actions
    action_shape = np.array(nb_actions * [0]).shape

    # NOTE(review): presumably (num_actions + 1) 2-D positions plus
    # num_actions link lengths — confirm against the env's observation.
    nb_features = 2 * (env.num_actions + 1) + env.num_actions
    observation_shape = np.array(nb_features * [0]).shape

    memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    # No exploration noise of any kind while testing.
    action_noise = None
    param_noise = None

    agent = DDPG(actor, critic, memory, observation_shape, action_shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Restore the saved agent instead of initializing a fresh one.
    agent.load(sess, save_path)

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # one entry per env
    episodes = 0  # never incremented here; reported as-is in the stats
    t = 0  # total rollout steps taken so far
    step_set = []

    start_time = time.time()

    # NOTE: none of these are cleared per epoch, so 'rollout/return' and the
    # plotted curve are running means over every cycle since the start.
    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []  # never filled here, so its mean logs as NaN
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    def first_scalar(value):
        # Collapse a stat to the single number sent through allreduce. An
        # empty stat (e.g. no eval episode finished) becomes NaN instead of
        # raising IndexError.
        flat = np.array(value).flatten()
        return flat[0] if flat.size else np.nan

    for epoch in range(nb_epochs):
        print(nb_epochs)
        obs = env.reset()
        # No training during testing: the training stats stay empty and are
        # logged as NaN, keeping the table's columns the same as in training.
        epoch_actor_losses = []
        epoch_critic_losses = []
        epoch_adaptive_distances = []
        # Bound up front so the stats section below cannot hit a NameError
        # when nb_epoch_cycles == 0; still reset at every cycle.
        eval_episode_rewards = []
        eval_qs = []
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action (no noise for test).
                action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True)

                # Outputs are batched, one row per parallel env.
                new_obs, r, done = env.step(action)

                t += 1
                episode_reward += r

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append.
                obs = new_obs

            # Each cycle is treated as one "episode" for reward statistics.
            epoch_episode_rewards.append(episode_reward)
            episode_reward = np.zeros(nenvs, dtype=np.float32)

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                eval_episode_reward = np.zeros(eval_obs.shape[0], dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        combined_stats = agent.get_stats().copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        step_set.append(t)
        # NOTE: the figure is not cleared, so every epoch draws one more
        # curve onto the current matplotlib figure.
        plt.plot(step_set, mean_epoch_episode_rewards)
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean_test.png')

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        # Sum every stat across MPI workers (positionally), then average.
        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array([first_scalar(x) for x in combined_stats.values()]))
        combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    return agent
コード例 #3
0
def testing(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=50,
        nb_rollout_steps=3,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        # no noise for test
        #   noise_type='adaptive-param_0.2',
        #   noise_type='normal_0.9',
        #   noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        #   actor_lr=1e-6,
        #   critic_lr=1e-5,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker,  50
        nb_eval_steps=1,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  #
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    # nb_actions = env.action_space.shape[-1]
    # nb_actions = 2*env.grid_size
    nb_actions = env.grid_size
    action_shape = np.array(nb_actions * [0]).shape
    nb_features = (4 + 1) * env.grid_size
    observation_shape = np.array(nb_features * [0]).shape
    grid_x = env.grid_x
    grid_y = env.grid_y
    x = []
    y = []
    for i in range(grid_x):
        x.append(i + 1)
    for i in range(grid_y):
        y.append(i + 1)
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    '''no noise for test'''
    # if noise_type is not None:
    #     for current_noise_type in noise_type.split(','):
    #         current_noise_type = current_noise_type.strip()
    #         if current_noise_type == 'none':
    #             pass
    #         elif 'adaptive-param' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
    #         elif 'normal' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         elif 'ou' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         else:
    #             raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    # agent.initialize(sess)
    # sess.graph.finalize()
    agent.load(sess, save_path)

    agent.reset()

    obs, env_state = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    average_reward = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_state = []
    epoch_episodes = 0
    #record the car numbers in each step
    car_num_set = {}
    t_set = [i for i in range(total_timesteps)]
    for xx in x:
        for yy in y:
            lab = str(xx) + str(yy)
            car_num_set[lab] = [[0 for i in range(total_timesteps)]
                                for j in range(4)]

    for epoch in range(nb_epochs):
        obs, env_state = env.reset()
        epoch_actions = []
        epoch_state = []
        average_car_num_set = []
        last_action = 1
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            action, q, _, _ = agent.step(obs,
                                         apply_noise=False,
                                         compute_Q=True)
            '''random action'''
            # if np.random.rand()>0.5:
            #     action=[1]
            # else:
            #     action=[0]
            '''cycle light state'''
            # action=[0]
            '''cycle action (should cycle state instead of action)'''
            # if last_action==1:
            #     action=[0]
            # else:
            #     action=[1]
            # last_action=action[0]

            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                new_obs, r, env_state, done = env.step(action, env_state)
                epoch_state.append(env_state['11'].light_state)
                for xx in x:
                    for yy in y:
                        lab = str(xx) + str(yy)
                        for i in range(4):
                            car_num_set[lab][i][t] = (
                                env_state['11'].car_nums[i])
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.
                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        print('done')
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            epoch_episode_rewards.append(episode_reward)
            average_reward.append(episode_reward / nb_rollout_steps)

            episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            # for t_train in range(nb_train_steps):
            #     # Adapt param noise, if necessary.
            #     if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
            #         distance = agent.adapt_param_noise()
            #         epoch_adaptive_distances.append(distance)
            #     # print('Train!')
            #     cl, al = agent.train()
            #     epoch_critic_losses.append(cl)
            #     epoch_actor_losses.append(al)
            #     agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0
            step_set.append(t)

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set,mean_epoch_episode_rewards)
        # plt.figure(figsize=(8,5))
        '''plot rewards-steps'''
        ax1 = plt.subplot(2, 1, 1)
        plt.sca(ax1)
        plt.plot(step_set, average_reward, color='b')
        # plt.xlabel('Steps')
        plt.ylabel('Mean Reward', fontsize=12)
        # plt.ylim(-15000,0)
        '''plot queueing car numbers-steps'''
        ax2 = plt.subplot(2, 1, 2)
        plt.sca(ax2)
        print(np.shape(t_set), np.shape(car_num_set['11'][i]))
        for i in range(4):
            if i == 0:
                plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='b')
            elif i == 1:
                plt.plot(t_set,
                         car_num_set['11'][i],
                         '--',
                         label=i,
                         color='orange')
            elif i == 2:
                plt.plot(t_set, car_num_set['11'][i], label=i, color='g')
            else:
                plt.plot(t_set, car_num_set['11'][i], label=i, color='r')
        plt.ylim(0, 100)
        #sum among roads
        sum_car_num = np.sum(car_num_set['11'], axis=0)
        #average among time steps
        average_car_num = np.average(sum_car_num)
        average_car_num_set.append(average_car_num)

        plt.xlabel('Steps', fontsize=12)
        plt.ylabel('Cars Numbers', fontsize=12)
        # set legend
        handles, labels = plt.gca().get_legend_handles_labels()
        by_label = OrderedDict(zip(labels, handles))
        leg = plt.legend(by_label.values(), by_label.keys(), loc=1)
        # leg = plt.legend(loc=4)
        legfm = leg.get_frame()
        legfm.set_edgecolor('black')  # set legend fame color
        legfm.set_linewidth(0.5)  # set legend fame linewidth
        plt.savefig('ddpg_mean_test.pdf')
        plt.show()
        print(epoch_state)

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
    print('average queueing car numbers: ', np.average(average_car_num_set))

    return agent
コード例 #4
0
ファイル: main.py プロジェクト: titi2338432/pytorch-gym
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Build the environments and DDPG components, then launch training.

    Args:
        env_id: Gym environment id handed to ``gym.make``.
        seed: Base random seed; every MPI rank offsets it by
            ``1000000 * rank`` before seeding.
        noise_type: Comma-separated noise specs. Each entry is either
            ``'none'`` or ``'<kind>_<stddev>'`` where ``<kind>`` contains
            ``adaptive-param``, ``normal`` or ``ou``. A later entry of the
            same family overwrites an earlier one.
        layer_norm: Forwarded to the ``Actor`` and ``Critic`` constructors.
        evaluation: When truthy, rank 0 additionally creates an evaluation
            environment.
        **kwargs: Forwarded unchanged to ``training.train``.

    Raises:
        RuntimeError: If ``noise_type`` contains an unrecognised entry.
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Disable logging for rank != 0 to avoid noise.
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        # Guard the log directory the same way as for the training env:
        # os.path.join(None, ...) raises TypeError when no directory is set.
        eval_env = Monitor(eval_env, logger.get_dir() and os.path.join(logger.get_dir(), 'gym_eval'))
        env = Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only rank 0 reports the total runtime, so only rank 0 starts a timer.
    if rank == 0:
        start_time = time.time()
    try:
        training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                       action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    finally:
        # Release the environments even when training raises.
        env.close()
        if eval_env is not None:
            eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
コード例 #5
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Build the environments (with optional BRS engine) and launch training.

    Args:
        env_id: Gym environment id handed to ``gym.make``.
        seed: Base random seed; every MPI rank offsets it by
            ``1000000 * rank`` before seeding.
        noise_type: Comma-separated noise specs. Each entry is either
            ``'none'`` or ``'<kind>_<stddev>'`` where ``<kind>`` contains
            ``adaptive-param``, ``normal`` or ``ou``. A later entry of the
            same family overwrites an earlier one.
        layer_norm: Forwarded to the ``Actor`` and ``Critic`` constructors.
        evaluation: When truthy, rank 0 additionally creates an evaluation
            environment.
        **kwargs: Must contain ``reward_type`` and ``set_additional_goal``;
            both are removed and set as attributes on the training env. The
            remaining entries are forwarded to ``training.train``.

    Raises:
        KeyError: If ``reward_type`` or ``set_additional_goal`` is missing.
        ValueError: If ``reward_type`` is ``'ttr'`` and ``env_id`` is not one
            of the two environments that have a BRS engine.
        RuntimeError: If ``noise_type`` contains an unrecognised entry.
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Disable logging for rank != 0 to avoid noise.
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)

    # ---------- AMEND: specific setting for brsEngine -----------
    print("kwargs", kwargs)
    # These two options configure the env and must not reach training.train,
    # so they are popped from kwargs (a missing key raises KeyError).
    env.reward_type = kwargs.pop('reward_type')
    env.set_additional_goal = kwargs.pop('set_additional_goal')
    brsEngine = None
    if env.reward_type == 'ttr':
        if env_id == 'DubinsCarEnv-v0':
            brsEngine = DubinsCar_brs_engine()
            brsEngine.reset_variables()
        elif env_id == 'PlanarQuadEnv-v0':
            brsEngine = Quadrotor_brs_engine()
            brsEngine.reset_variables()
        else:
            raise ValueError("invalid environment name for ttr reward!")
        # You have to assign the engine!
        env.brsEngine = brsEngine
    # -----------------------------------------------------------

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        # ---------- AMEND: specific setting for brsEngine -----------
        # NOTE(review): only the engine is copied to the eval env;
        # reward_type / set_additional_goal are not set on it - confirm
        # that this is intended.
        eval_env.brsEngine = brsEngine
        # ------------------------------------------------------------

        # Guard the log directory the same way as for the training env:
        # os.path.join(None, ...) raises TypeError when no directory is set.
        eval_env = bench.Monitor(
            eval_env,
            logger.get_dir() and os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only rank 0 reports the total runtime, so only rank 0 starts a timer.
    if rank == 0:
        start_time = time.time()

    try:
        training.train(env=env,
                       eval_env=eval_env,
                       param_noise=param_noise,
                       action_noise=action_noise,
                       actor=actor,
                       critic=critic,
                       memory=memory,
                       **kwargs)
    finally:
        # Release the environments even when training raises.
        env.close()
        if eval_env is not None:
            eval_env.close()

    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
コード例 #6
0
    def __init__(self,
                 observation_shape,
                 action_shape,
                 nb_demo_kine,
                 nb_key_states,
                 batch_size=128,
                 noise_type='',
                 actor=None,
                 critic=None,
                 layer_norm=True,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 normalize_returns=False,
                 normalize_observations=True,
                 reward_scale=1.,
                 clip_norm=None,
                 demo_l2_reg=0.,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 demo_lr=5e-3,
                 gamma=0.99,
                 tau=0.001,
                 enable_popart=False,
                 save_ckpt=True):
        """Build the agent's TensorFlow graph, replay memory and saver.

        Args:
            observation_shape: Tuple appended to ``(None,)`` to form the shape
                of the ``obs0`` / ``obs1`` placeholders; also passed to
                ``Memory`` and to the observation ``RunningMeanStd``.
            action_shape: Tuple appended to ``(None,)`` for the ``actions``
                placeholder; its last entry is taken as the action count.
            nb_demo_kine: Passed to the default ``Actor``; also the number of
                leading observation columns kept in ``obs_delta_kine``.
            nb_key_states: Passed to the default ``Critic`` and to
                ``obs_norm_partial``; also the number of leading observation
                columns kept in ``obs_delta_kstates``.
            batch_size: Stored as ``self.batch_size``.
            noise_type: Passed with the action count to
                ``process_noise_type``, which yields the parameter-noise and
                action-noise objects.
            actor: Optional pre-built actor; a default ``Actor`` is created
                when falsy.
            critic: Optional pre-built critic; a default ``Critic`` is created
                when falsy.
            layer_norm: Forwarded to the default ``Actor`` / ``Critic``.
            observation_range: ``(low, high)`` clip bounds applied to the
                normalized observations.
            action_range: ``(low, high)`` clip bounds applied to the target
                actor's action.
            return_range: ``(low, high)`` clip bounds applied to the critic's
                normalized output before denormalization.
            normalize_returns: When true, a ``RunningMeanStd`` is created
                under the ``ret_rms`` variable scope.
            normalize_observations: When true, a ``RunningMeanStd`` is created
                under the ``obs_rms`` variable scope.
            reward_scale: Stored as ``self.reward_scale``.
            clip_norm: Stored as ``self.clip_norm``.
            demo_l2_reg: Stored as ``self.demo_l2_reg``.
            critic_l2_reg: Stored as ``self.critic_l2_reg``.
            actor_lr: Stored as ``self.actor_lr``.
            critic_lr: Stored as ``self.critic_lr``.
            demo_lr: Stored as ``self.demo_lr``.
            gamma: Discount factor used in the one-step target
                ``target_Q_obs0``.
            tau: Stored as ``self.tau``.
            enable_popart: ``setup_popart`` is called only when this and
                ``normalize_returns`` are both true.
            save_ckpt: When true the saver covers ``tf.global_variables()``
                and keeps up to 20 checkpoints; otherwise a default
                ``tf.train.Saver`` is created.
        """

        # Noise
        nb_actions = action_shape[-1]
        param_noise, action_noise = process_noise_type(noise_type, nb_actions)

        logger.info('param_noise', param_noise)
        logger.info('action_noise', action_noise)

        # States recording
        self.memory = Memory(limit=int(2e5),
                             action_shape=action_shape,
                             observation_shape=observation_shape)

        # Models (a caller-supplied actor/critic takes precedence over the
        # default constructions)
        self.nb_demo_kine = nb_demo_kine
        self.actor = actor or Actor(
            nb_actions, nb_demo_kine, layer_norm=layer_norm)
        self.nb_key_states = nb_key_states
        self.critic = critic or Critic(nb_key_states, layer_norm=layer_norm)
        # Same value as nb_key_states; handed to obs_norm_partial below.
        self.nb_obs_org = nb_key_states

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        # self.critic_target_Q: value assigned by self.target_Q_obs0
        self.critic_target_Q = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name='critic_target_Q')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Change in observations between obs0 and obs1, restricted to the
        # first nb_demo_kine / nb_key_states columns respectively.
        self.obs_delta_kine = (self.obs1 - self.obs0)[:, :self.nb_demo_kine]
        self.obs_delta_kstates = (self.obs1 -
                                  self.obs0)[:, :self.nb_key_states]

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range

        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.demo_lr = demo_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.demo_l2_reg = demo_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Normalize (obs_norm_partial is defined elsewhere) and then clip to
        # observation_range; obs0 is kept on self, obs1 is only used locally.
        self.normalized_obs0 = tf.clip_by_value(
            obs_norm_partial(self.obs0, self.obs_rms, self.nb_obs_org),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(
            obs_norm_partial(self.obs1, self.obs_rms, self.nb_obs_org),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks as renamed shallow copies of the main ones.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across set-up parts.
        # Original author's note: the actor output is in [0, 1] and needs to
        # be normalised to [-1, 1] before being fed into the critic.
        self.actor_tf, self.demo_aprx = self.actor(self.normalized_obs0)

        # critic loss
        # normalized_critic_tf, pred_rwd, pred_obs_delta: critic_loss
        self.normalized_critic_tf, self.pred_rwd, self.pred_obs_delta = self.critic(
            self.normalized_obs0, act_norm(self.actions))
        # self.critic_tf: only in logging [reference_Q_mean/std]
        self.critic_tf = ret_denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)

        # actor loss (critic called again with reuse=True on the actor's
        # action; only the first output is used)
        normalized_critic_with_actor_tf = self.critic(self.normalized_obs0,
                                                      act_norm(self.actor_tf),
                                                      reuse=True)[0]
        # self.critic_with_actor_tf: actor loss, and logging [reference_Q_tf_mean/std]
        self.critic_with_actor_tf = ret_denormalize(
            tf.clip_by_value(normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        # target Q: rewards + (1 - terminal) * gamma * Q_target(obs1, a'),
        # where a' is the target actor's action clipped to action_range.
        self.target_action = tf.clip_by_value(
            target_actor(normalized_obs1)[0], self.action_range[0],
            self.action_range[1])
        self.target_Q_obs1 = ret_denormalize(
            target_critic(normalized_obs1, act_norm(self.target_action))[0],
            self.ret_rms)
        self.target_Q_obs0 = self.rewards + (
            1. - self.terminals1) * gamma * self.target_Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(self.normalized_obs0)

        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()
        self.dbg_vars = self.actor.dbg_vars + self.critic.dbg_vars

        # No session yet; it is assigned outside this constructor.
        self.sess = None
        # Set up checkpoint saver
        self.save_ckpt = save_ckpt
        if save_ckpt:
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
        else:
            # saver for loading ckpt
            self.saver = tf.train.Saver()

        # Merge every summary registered so far; write TensorBoard events to
        # <logdir>/tb only when the logger has a directory.
        self.main_summaries = tf.summary.merge_all()
        logdir = logger.get_dir()
        if logdir:
            self.train_writer = tf.summary.FileWriter(
                os.path.join(logdir, 'tb'), tf.get_default_graph())
        else:
            self.train_writer = None
コード例 #7
0
class DDPG(object):
    def __init__(self,
                 observation_shape,
                 action_shape,
                 nb_demo_kine,
                 nb_key_states,
                 batch_size=128,
                 noise_type='',
                 actor=None,
                 critic=None,
                 layer_norm=True,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 normalize_returns=False,
                 normalize_observations=True,
                 reward_scale=1.,
                 clip_norm=None,
                 demo_l2_reg=0.,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 demo_lr=5e-3,
                 gamma=0.99,
                 tau=0.001,
                 enable_popart=False,
                 save_ckpt=True):
        """Build the agent's TensorFlow graph, replay memory and saver.

        Args:
            observation_shape: Tuple appended to ``(None,)`` to form the shape
                of the ``obs0`` / ``obs1`` placeholders; also passed to
                ``Memory`` and to the observation ``RunningMeanStd``.
            action_shape: Tuple appended to ``(None,)`` for the ``actions``
                placeholder; its last entry is taken as the action count.
            nb_demo_kine: Passed to the default ``Actor``; also the number of
                leading observation columns kept in ``obs_delta_kine``.
            nb_key_states: Passed to the default ``Critic`` and to
                ``obs_norm_partial``; also the number of leading observation
                columns kept in ``obs_delta_kstates``.
            batch_size: Stored as ``self.batch_size``.
            noise_type: Passed with the action count to
                ``process_noise_type``, which yields the parameter-noise and
                action-noise objects.
            actor: Optional pre-built actor; a default ``Actor`` is created
                when falsy.
            critic: Optional pre-built critic; a default ``Critic`` is created
                when falsy.
            layer_norm: Forwarded to the default ``Actor`` / ``Critic``.
            observation_range: ``(low, high)`` clip bounds applied to the
                normalized observations.
            action_range: ``(low, high)`` clip bounds applied to the target
                actor's action.
            return_range: ``(low, high)`` clip bounds applied to the critic's
                normalized output before denormalization.
            normalize_returns: When true, a ``RunningMeanStd`` is created
                under the ``ret_rms`` variable scope.
            normalize_observations: When true, a ``RunningMeanStd`` is created
                under the ``obs_rms`` variable scope.
            reward_scale: Stored as ``self.reward_scale``.
            clip_norm: Stored as ``self.clip_norm``; used as the gradient
                clip norm by ``setup_actor_optimizer``.
            demo_l2_reg: Stored as ``self.demo_l2_reg``.
            critic_l2_reg: Stored as ``self.critic_l2_reg``.
            actor_lr: Stored as ``self.actor_lr``.
            critic_lr: Stored as ``self.critic_lr``.
            demo_lr: Stored as ``self.demo_lr``.
            gamma: Discount factor used in the one-step target
                ``target_Q_obs0``.
            tau: Stored as ``self.tau``; passed to ``get_target_updates`` by
                ``setup_target_network_updates``.
            enable_popart: ``setup_popart`` is called only when this and
                ``normalize_returns`` are both true.
            save_ckpt: When true the saver covers ``tf.global_variables()``
                and keeps up to 20 checkpoints; otherwise a default
                ``tf.train.Saver`` is created.
        """

        # Noise
        nb_actions = action_shape[-1]
        param_noise, action_noise = process_noise_type(noise_type, nb_actions)

        logger.info('param_noise', param_noise)
        logger.info('action_noise', action_noise)

        # States recording
        self.memory = Memory(limit=int(2e5),
                             action_shape=action_shape,
                             observation_shape=observation_shape)

        # Models (a caller-supplied actor/critic takes precedence over the
        # default constructions)
        self.nb_demo_kine = nb_demo_kine
        self.actor = actor or Actor(
            nb_actions, nb_demo_kine, layer_norm=layer_norm)
        self.nb_key_states = nb_key_states
        self.critic = critic or Critic(nb_key_states, layer_norm=layer_norm)
        # Same value as nb_key_states; handed to obs_norm_partial below.
        self.nb_obs_org = nb_key_states

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        # self.critic_target_Q: value assigned by self.target_Q_obs0
        self.critic_target_Q = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name='critic_target_Q')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Change in observations between obs0 and obs1, restricted to the
        # first nb_demo_kine / nb_key_states columns respectively.
        self.obs_delta_kine = (self.obs1 - self.obs0)[:, :self.nb_demo_kine]
        self.obs_delta_kstates = (self.obs1 -
                                  self.obs0)[:, :self.nb_key_states]

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range

        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.demo_lr = demo_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.demo_l2_reg = demo_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Normalize (obs_norm_partial is defined elsewhere) and then clip to
        # observation_range; obs0 is kept on self, obs1 is only used locally.
        self.normalized_obs0 = tf.clip_by_value(
            obs_norm_partial(self.obs0, self.obs_rms, self.nb_obs_org),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(
            obs_norm_partial(self.obs1, self.obs_rms, self.nb_obs_org),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks as renamed shallow copies of the main ones.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across set-up parts.
        # Original author's note: the actor output is in [0, 1] and needs to
        # be normalised to [-1, 1] before being fed into the critic.
        self.actor_tf, self.demo_aprx = self.actor(self.normalized_obs0)

        # critic loss
        # normalized_critic_tf, pred_rwd, pred_obs_delta: critic_loss
        self.normalized_critic_tf, self.pred_rwd, self.pred_obs_delta = self.critic(
            self.normalized_obs0, act_norm(self.actions))
        # self.critic_tf: only in logging [reference_Q_mean/std]
        self.critic_tf = ret_denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)

        # actor loss (critic called again with reuse=True on the actor's
        # action; only the first output is used)
        normalized_critic_with_actor_tf = self.critic(self.normalized_obs0,
                                                      act_norm(self.actor_tf),
                                                      reuse=True)[0]
        # self.critic_with_actor_tf: actor loss, and logging [reference_Q_tf_mean/std]
        self.critic_with_actor_tf = ret_denormalize(
            tf.clip_by_value(normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        # target Q: rewards + (1 - terminal) * gamma * Q_target(obs1, a'),
        # where a' is the target actor's action clipped to action_range.
        self.target_action = tf.clip_by_value(
            target_actor(normalized_obs1)[0], self.action_range[0],
            self.action_range[1])
        self.target_Q_obs1 = ret_denormalize(
            target_critic(normalized_obs1, act_norm(self.target_action))[0],
            self.ret_rms)
        self.target_Q_obs0 = self.rewards + (
            1. - self.terminals1) * gamma * self.target_Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(self.normalized_obs0)

        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()
        self.dbg_vars = self.actor.dbg_vars + self.critic.dbg_vars

        # No session yet; it is assigned outside this constructor.
        self.sess = None
        # Set up checkpoint saver
        self.save_ckpt = save_ckpt
        if save_ckpt:
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
        else:
            # saver for loading ckpt
            self.saver = tf.train.Saver()

        # Merge every summary registered so far; write TensorBoard events to
        # <logdir>/tb only when the logger has a directory.
        self.main_summaries = tf.summary.merge_all()
        logdir = logger.get_dir()
        if logdir:
            self.train_writer = tf.summary.FileWriter(
                os.path.join(logdir, 'tb'), tf.get_default_graph())
        else:
            self.train_writer = None

    def setup_target_network_updates(self):
        """Create the init and soft update ops for both target networks.

        ``get_target_updates`` is called once for the actor pair and once for
        the critic pair, each time with ``self.tau``. The resulting ops are
        stored, actor first, in ``self.target_init_updates`` and
        ``self.target_soft_updates``.
        """
        init_ops = []
        soft_ops = []
        network_pairs = (
            (self.actor, self.target_actor),
            (self.critic, self.target_critic),
        )
        for online_net, target_net in network_pairs:
            init_op, soft_op = get_target_updates(online_net.vars,
                                                  target_net.vars, self.tau)
            init_ops.append(init_op)
            soft_ops.append(soft_op)
        self.target_init_updates = init_ops
        self.target_soft_updates = soft_ops

    def setup_param_noise(self, normalized_obs0):
        """Create the two perturbed copies of the actor used for param noise.

        One copy (``param_noise_actor``) provides ``self.perturbed_actor_tf``
        and ``self.perturb_policy_ops``.  A second copy
        (``adaptive_param_noise_actor``) is only used to measure
        ``self.adaptive_policy_distance``: the root-mean-square difference
        between ``self.actor_tf`` and the adaptive copy's output.

        Args:
            normalized_obs0: observation tensor both copies are applied to.
        """
        assert self.param_noise is not None

        # Perturbed actor.
        perturbed = copy(self.actor)
        perturbed.name = 'param_noise_actor'
        self.perturbed_actor_tf = perturbed(normalized_obs0)[0]
        logger.debug('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, perturbed, self.param_noise_stddev)

        # Independent copy reserved for stddev adaptation.
        adaptive = copy(self.actor)
        adaptive.name = 'adaptive_param_noise_actor'
        adaptive_out = adaptive(normalized_obs0)[0]
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive, self.param_noise_stddev)
        squared_gap = tf.square(self.actor_tf - adaptive_out)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(squared_gap))

    def setup_actor_optimizer(self):
        """Build the actor losses, flattened gradients, optimizers and summaries.

        Two training signals are set up:

        * the main actor loss ``-mean(critic_with_actor_tf)``, differentiated
          with respect to ``self.actor.active_vars`` only and applied through
          ``self.actor_optimizer``;
        * the demonstration loss ``mean((obs_delta_kine - demo_aprx)^2)``
          (plus an optional L2 term when ``self.demo_l2_reg > 0``),
          differentiated with respect to all ``self.actor.trainable_vars`` and
          applied through ``self.demo_optimizer``.

        ``self.actor_params`` holds cumulative parameter counts over
        ``self.actor.trainable_vars`` (``actor_params[i]`` is the offset of
        variable ``i`` in a flattened vector); ``train_demo`` uses it to slice
        the flattened demo gradients.  ``self.mimic_rwd`` is the negated demo
        loss.
        """
        logger.info('setting up actor optimizer')
        # loss_normed = -tf.reduce_mean(self.normalized_critic_with_actor_tf)
        self.actor_Q = tf.reduce_mean(self.critic_with_actor_tf)
        self.actor_loss = -self.actor_Q
        tf.summary.scalar('actor/Q', self.actor_Q)

        # setting up actor vars/grads/optimizer
        self.actor_vars = self.actor.active_vars
        self.actor_grads = tf_util.flatgrad(self.actor_loss,
                                            self.actor_vars,
                                            clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        # Cumulative parameter offsets: actor_params[i + 1] - actor_params[i]
        # is the number of scalars in trainable variable i.
        self.actor_params = actor_params = [0] * (
            len(self.actor.trainable_vars) + 1)
        for i, shape in enumerate(actor_shapes):
            actor_params[i + 1] = actor_params[i] + np.prod(shape)
        # Number of leading trainable variables that are not in active_vars.
        n_inact = len(actor_shapes) - len(self.actor_vars)
        # Re-base the offsets so they index into the flattened *active*
        # gradient vector.  ``actor_params`` is a plain list, so convert the
        # slice to an ndarray first: ``list - int`` raises TypeError, which is
        # what happened when n_inact == 0 (actor_params[0] is the int 0).
        active_params = (np.asarray(actor_params[n_inact:]) -
                         actor_params[n_inact])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_params))
        logger.info('  actor total: {}'.format(actor_params[-1]))
        logger.info('  actor active: {}'.format(active_params))

        # Mean gradient of the first active variable.
        grad = self.actor_grads[active_params[0]:active_params[1]]
        tf.summary.scalar(
            'grads/actor_layer%d_%d' %
            (n_inact // 2, active_params[1] - active_params[0]),
            tf.reduce_mean(grad))

        # Mean gradient of the second-to-last active variable.
        grad = self.actor_grads[active_params[-3]:active_params[-2]]
        tf.summary.scalar(
            'grads/actor_layer%d_%d' %
            (-1, active_params[-2] - active_params[-3]), tf.reduce_mean(grad))

        # for train_demo()
        self.demo_loss = tf.reduce_mean(
            tf.square(self.obs_delta_kine - self.demo_aprx))
        self.demo_max_loss = tf.reduce_max(
            tf.square(self.obs_delta_kine - self.demo_aprx))
        if self.demo_l2_reg > 0.:
            demo_reg_vars = self.actor.demo_reg_vars
            for var in demo_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info(
                '  applying l2 regularization for demo_aprx with {}'.format(
                    self.demo_l2_reg))
            self.demo_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.demo_l2_reg),
                weights_list=demo_reg_vars)
            self.demo_loss += self.demo_reg
        else:
            # train_demo() checks for None to decide whether to evaluate it.
            self.demo_reg = None

        self.demo_grads = tf_util.flatgrad(self.demo_loss,
                                           self.actor.trainable_vars,
                                           clip_norm=self.clip_norm)
        self.demo_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                      beta1=0.9,
                                      beta2=0.999,
                                      epsilon=1e-08)

        # mimic rwd
        self.mimic_rwd = -self.demo_loss
        tf.summary.scalar('actor/mimic_rwd', self.mimic_rwd)

    def setup_critic_optimizer(self):
        """Build the critic loss, its flattened gradients, optimizer and summaries.

        The total ``self.critic_loss`` is the sum of:

        * the mean squared difference between ``self.normalized_critic_tf``
          and the normalized target, clipped to ``self.return_range``;
        * ``self.critic_loss_step``: the squared error of the predicted
          reward plus the squared error of the predicted observation delta,
          where the first ``self.nb_demo_kine`` dimensions are weighted by
          ``critic_kine_factor``;
        * an L2 penalty on ``self.critic.reg_vars`` when
          ``self.critic_l2_reg > 0``.

        Gradients are taken with respect to ``self.critic.trainable_vars`` and
        applied through ``self.critic_optimizer``.
        """
        logger.info('setting up critic optimizer')

        # Q-value regression term against the clipped, normalized target.
        self.normalized_critic_target_tf = tf.clip_by_value(
            ret_normalize(self.critic_target_Q, self.ret_rms),
            self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf -
                      self.normalized_critic_target_tf))
        tf.summary.scalar('critic_loss/Q_diff', self.critic_loss)
        if self.normalize_returns:
            tf.summary.scalar('critic_loss/Q_normed_critic',
                              tf.reduce_mean(self.normalized_critic_tf))
            tf.summary.scalar('critic_loss/Q_normed_target',
                              tf.reduce_mean(self.normalized_critic_target_tf))

        # One-step prediction terms: reward first ...
        self.critic_loss_step = 0
        diff_rwd = tf.reduce_mean(tf.square(self.pred_rwd - self.rewards))
        self.critic_loss_step += diff_rwd
        tf.summary.scalar('critic_loss/step_rwd', self.critic_loss_step)

        # ... then the observation delta.  The error is averaged over axis 0
        # so it can be split per feature; the first nb_demo_kine features get
        # an extra weight of critic_kine_factor.
        critic_kine_factor = 100
        diff_obs = tf.reduce_mean(tf.square(self.pred_obs_delta -
                                            self.obs_delta_kstates),
                                  axis=0)
        diff_obs_kine = tf.reduce_mean(
            diff_obs[:self.nb_demo_kine]) * critic_kine_factor
        diff_obs_rest = tf.reduce_mean(diff_obs[self.nb_demo_kine:])
        self.critic_loss_step += (diff_obs_kine + diff_obs_rest)
        tf.summary.scalar(
            'critic_loss/step_kstates_kine_x%d' % critic_kine_factor,
            diff_obs_kine)
        tf.summary.scalar('critic_loss/step_kstates_rest', diff_obs_rest)
        tf.summary.scalar('critic_loss/step_total', self.critic_loss_step)

        self.critic_loss += self.critic_loss_step

        # Optional L2 regularization of the critic weights.
        if self.critic_l2_reg > 0.:
            critic_reg_vars = self.critic.reg_vars
            for var in critic_reg_vars:
                logger.debug('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
            tf.summary.scalar('critic_loss/reg', critic_reg)

        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]

        # Cumulative parameter offsets: critic_params[i] is where trainable
        # variable i starts in the flattened gradient vector.
        critic_params = [0] * (len(self.critic.trainable_vars) + 1)
        for i, shape in enumerate(critic_shapes):
            critic_params[i + 1] = critic_params[i] + np.prod(shape)

        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_params))
        logger.info('  critic total: {}'.format(critic_params[-1]))
        self.critic_grads = tf_util.flatgrad(self.critic_loss,
                                             self.critic.trainable_vars,
                                             clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

        # todo: make the following general
        # Mean-gradient summaries for three hand-picked variables: the first
        # one, the second-to-last one (tagged "rwd") and the sixth-from-last
        # one (tagged "q").
        # NOTE(review): the negative offsets presumably pick the reward-head
        # and Q-head kernels; this depends on the order of
        # critic.trainable_vars -- confirm against the Critic definition.
        grad = self.critic_grads[critic_params[0]:critic_params[1]]
        tf.summary.scalar(
            'grads/critic_layer%d_%d' %
            (0, critic_params[1] - critic_params[0]), tf.reduce_mean(grad))
        grad = self.critic_grads[critic_params[-3]:critic_params[-2]]
        tf.summary.scalar(
            'grads/critic_layer%d_rwd_%d' %
            (-1, critic_params[-2] - critic_params[-3]), tf.reduce_mean(grad))
        grad = self.critic_grads[critic_params[-7]:critic_params[-6]]
        tf.summary.scalar(
            'grads/critic_layer%d_q_%d' %
            (-1, critic_params[-6] - critic_params[-7]), tf.reduce_mean(grad))

    def setup_popart(self):
        """Create the ops that rescale the Q output layers after a stats update.

        Placeholders ``self.old_std`` and ``self.old_mean`` carry the return
        statistics from before the update; the current ones are read from
        ``self.ret_rms``.  For both the critic and the target critic the
        output kernel and bias are reassigned so that the de-normalized output
        is preserved.  The ops are collected, kernel then bias for each
        network, in ``self.renormalize_Q_outputs_op``.
        """
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        renorm_ops = self.renormalize_Q_outputs_op = []
        output_layers = (self.critic.output_vars,
                         self.target_critic.output_vars)
        for layer_vars in output_layers:
            assert len(layer_vars) == 2
            kernel, bias = layer_vars
            assert 'kernel' in kernel.name
            assert 'bias' in bias.name
            assert kernel.get_shape()[-1] == 1
            assert bias.get_shape()[-1] == 1
            renorm_ops.append(kernel.assign(kernel * self.old_std / new_std))
            renorm_ops.append(
                bias.assign(
                    (bias * self.old_std + self.old_mean - new_mean) /
                    new_std))

    def setup_stats(self):
        """Collect the tensors (and their names) evaluated by ``get_stats``.

        The result is stored as two parallel lists, ``self.stats_ops`` and
        ``self.stats_names``.  Which entries are present depends on
        ``self.normalize_returns``, ``self.normalize_observations`` and
        ``self.param_noise``.
        """
        # (name, op) pairs in registration order; split into two lists below.
        tracked = []

        if self.normalize_returns:
            tracked += [
                ('zrms/ret_mean', self.ret_rms.mean),
                ('zrms/ret_std', self.ret_rms.std),
            ]

        if self.normalize_observations:
            kine = self.nb_demo_kine
            tracked += [
                ('zrms/obs_kine_mean',
                 tf.reduce_mean(self.obs_rms.mean[:kine])),
                ('zrms/obs_kine_std', tf.reduce_mean(self.obs_rms.std[:kine])),
            ]

            kstates = self.nb_key_states
            tracked += [
                ('zrms/obs_kstates_mean',
                 tf.reduce_mean(self.obs_rms.mean[:kstates])),
                ('zrms/obs_kstates_std',
                 tf.reduce_mean(self.obs_rms.std[:kstates])),
            ]

            tracked += [
                ('zrms/obs_mean', tf.reduce_mean(self.obs_rms.mean)),
                ('zrms/obs_std', tf.reduce_mean(self.obs_rms.std)),
            ]

            # for debugging partial normalisation: raw vs. normalized value of
            # the first sample at two adjacent observation indices.
            for o_i in (self.nb_obs_org - 1, self.nb_obs_org):
                tracked += [
                    ('zobs_dbg_%d' % o_i, self.obs0[0, o_i]),
                    ('zobs_dbg_%d_normalized' % o_i,
                     self.normalized_obs0[0, o_i]),
                ]

        tracked.append(('zref/Q_mean', tf.reduce_mean(self.critic_tf)))
        tracked.append(('zref/Q_std', reduce_std(self.critic_tf)))

        tracked.append(
            ('zref/Q_tf_mean', tf.reduce_mean(self.critic_with_actor_tf)))
        tracked.append(('zref/Q_tf_std', reduce_std(self.critic_with_actor_tf)))

        tracked.append(('zref/action_mean', tf.reduce_mean(self.actor_tf)))
        tracked.append(('zref/action_std', reduce_std(self.actor_tf)))

        tracked.append(('zref/mimic_rwd', tf.reduce_mean(self.mimic_rwd)))

        if self.param_noise:
            tracked.append(('zref/action_ptb_mean',
                            tf.reduce_mean(self.perturbed_actor_tf)))
            tracked.append(
                ('zref/action_ptb_std', reduce_std(self.perturbed_actor_tf)))

        self.stats_ops = [op for _, op in tracked]
        self.stats_names = [name for name, _ in tracked]

    def pi(self,
           obs,
           step,
           apply_param_noise=True,
           apply_action_noise=True,
           compute_Q=True,
           rollout_log=False):
        """Run the policy on a single observation and return ``(action, q)``.

        Args:
            obs: one observation; it is wrapped in a list before being fed.
            step: step index forwarded to ``add_list_summary``.
            apply_param_noise: use the perturbed actor when param noise is
                configured.
            apply_action_noise: add a sample of ``self.action_noise`` when it
                is configured.
            compute_Q: also evaluate ``self.critic_with_actor_tf``.
            rollout_log: write per-dimension action (and noise) summaries.

        Returns:
            The flattened action clipped to ``self.action_range`` and the
            critic value, or ``None`` in its place when ``compute_Q`` is false.
        """
        perturbed = self.param_noise is not None and apply_param_noise
        if perturbed:
            policy_tf, tag = self.perturbed_actor_tf, 'ptb'
        else:
            policy_tf, tag = self.actor_tf, 'org'

        feeds = {self.obs0: [obs]}
        if not compute_Q:
            q = None
            action = self.sess.run(policy_tf, feed_dict=feeds)
        else:
            action, q = self.sess.run([policy_tf, self.critic_with_actor_tf],
                                      feed_dict=feeds)
        action = action.flatten()
        # actor output is [0,1], no need to denormalise.
        # action = act_denorm(action)

        # Record the raw policy output before any noise is added.
        rollout_records = []
        if rollout_log:
            rollout_records = [
                ('the_action/%d_rollout_%s' % (idx, tag), val)
                for idx, val in enumerate(action)
            ]

        noise = None
        if self.action_noise is not None and apply_action_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        low, high = self.action_range[0], self.action_range[1]
        action = np.clip(action, low, high)

        if rollout_log:
            if noise is not None:
                rollout_records.extend(
                    ('the_action/%d_rollout_noise' % idx, val)
                    for idx, val in enumerate(noise))
            self.add_list_summary(rollout_records, step)
        return action, q

    def store_transition(self, storage, obs0, action, reward, obs1, terminal1):
        """Store one experience in ``storage`` and update observation stats.

        Args:
            storage: buffer exposing ``append(obs0, action, reward, obs1,
                terminal1)``.
            obs0: observation before the action.
            action: action taken.
            reward: reward received; stored multiplied by
                ``self.reward_scale``.
            obs1: observation after the action.
            terminal1: terminal flag for ``obs1``.
        """
        # Scale into a new value instead of ``reward *= ...``: the in-place
        # form silently modified the caller's array when ``reward`` is an
        # ndarray (and failed outright for integer arrays with a float scale).
        # This also matches what store_multrans does.
        scaled_reward = reward * self.reward_scale
        storage.append(obs0, action, scaled_reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def store_multrans(self, storage, obs0, action, reward, obs1, terminal1):
        """Store a batch of experiences and update observation stats once.

        All arguments except ``storage`` are indexable sequences; element
        ``i`` of each forms one transition.  The number of transitions stored
        is ``len(reward)``.  Rewards are stored multiplied by
        ``self.reward_scale``; the inputs themselves are not modified.
        """
        for i, step_reward in enumerate(reward):
            storage.append(obs0[i], action[i],
                           step_reward * self.reward_scale, obs1[i],
                           terminal1[i])
        # np.vstack raises ValueError on an empty sequence, so an empty batch
        # must skip the statistics update instead of crashing.
        if self.normalize_observations and len(obs0) > 0:
            self.obs_rms.update(np.vstack(obs0))

    def train_demo(self,
                   obs0_pos,
                   obs1_pos,
                   obs0_neg,
                   obs1_neg,
                   step,
                   neg_pct=1.0,
                   lr_decay=1.0):
        """Take one demonstration-loss optimizer step from a pos/neg batch pair.

        Gradients are evaluated separately on the positive and the negative
        batch and combined as ``pos - neg * neg_pct`` before a single
        ``demo_optimizer`` update (only positive data is used in eval modes,
        hence the separate passes).  The step size is
        ``self.demo_lr * lr_decay``.

        Returns:
            The demo loss on the positive batch, evaluated before the update.
        """
        # Losses reported below are the ones measured before the update.
        positive_feed = {self.obs0: obs0_pos, self.obs1: obs1_pos}
        pos_grads, demo_loss, max_loss, actor_Q = self.sess.run(
            [
                self.demo_grads, self.demo_loss, self.demo_max_loss,
                self.actor_Q
            ],
            feed_dict=positive_feed)

        negative_feed = {self.obs0: obs0_neg, self.obs1: obs1_neg}
        neg_grads, neg_loss = self.sess.run(
            [self.demo_grads, self.demo_loss], feed_dict=negative_feed)

        comb_grads = pos_grads - neg_grads * neg_pct
        step_size = self.demo_lr * lr_decay
        self.demo_optimizer.update(comb_grads, stepsize=step_size)

        demo_reg = 0 if self.demo_reg is None else self.sess.run(self.demo_reg)

        # sanity check the training: mean gradient over the flattened slice
        # that belongs to trainable variable index 2.
        probe = slice(self.actor_params[2], self.actor_params[3])
        pos_g, neg_g, comb_g = pos_grads[probe], neg_grads[probe], comb_grads[
            probe]
        records = [
            ('demo_loss/train_pos', demo_loss),
            ('demo_loss/train_max', max_loss),
            ('demo_loss/train_neg', neg_loss),
        ]
        records += [
            ('grads/demo_pos_layer%d_%d' % (1, len(pos_g)), np.mean(pos_g)),
            ('grads/demo_neg_layer%d_%d' % (1, len(neg_g)), np.mean(neg_g)),
            ('grads/demo_comb_layer%d_%d' % (1, len(comb_g)), np.mean(comb_g)),
        ]
        records += [('actor/Q', actor_Q), ('demo_loss/reg', demo_reg)]
        self.add_list_summary(records, step)

        return demo_loss

    def test_demo(self, obs0, obs1):
        loss_mean, loss_max = self.sess.run(
            [self.demo_loss, self.demo_max_loss],
            feed_dict={
                self.obs0: obs0,
                self.obs1: obs1,
            })
        return loss_mean, loss_max

    def eval_demo(self, obs0):
        """Return ``self.demo_aprx`` evaluated on the observations ``obs0``."""
        feeds = {self.obs0: obs0}
        return self.sess.run(self.demo_aprx, feed_dict=feeds)

    def get_mimic_rwd(self, obs0, obs1):
        """Evaluate the mimic reward and the demo approximation for a batch.

        Returns:
            ``(mimic_rwd, demo_aprx)`` from a single session run fed with
            ``obs0`` and ``obs1``.
        """
        feeds = {self.obs0: obs0, self.obs1: obs1}
        reward, approximation = self.sess.run(
            [self.mimic_rwd, self.demo_aprx], feed_dict=feeds)
        return reward, approximation

    def train_main(self, step):
        """Sample a batch from memory and apply one actor and critic update.

        The target Q values are computed first (with the Pop-Art
        renormalization of the Q output layers when both
        ``self.normalize_returns`` and ``self.enable_popart`` are set), then
        the gradients of both networks are evaluated in one session run and
        handed to their optimizers.

        Args:
            step: global step used for the summaries.

        Returns:
            ``(critic_loss, actor_loss)`` evaluated before the update.
        """
        batch = self.memory.sample(batch_size=self.batch_size)

        # Inputs needed to compute the targets; identical in both branches.
        target_feed = {
            self.obs1: batch['obs1'],
            self.rewards: batch['rewards'],
            self.terminals1: batch['terminals1'].astype('float32'),
        }

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q_obs0, target_Q_obs1 = self.sess.run(
                [
                    self.ret_rms.mean, self.ret_rms.std, self.target_Q_obs0,
                    self.target_Q_obs1
                ],
                feed_dict=target_feed)
            self.ret_rms.update(target_Q_obs0.flatten())
            # Rescale the Q output layers using the statistics from before
            # the update.  (A sanity check would re-run target_Q_obs0 here and
            # compare it with the value above; it is left out because it
            # slows things down considerably.)
            popart_feed = {
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            }
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict=popart_feed)
        else:
            target_Q_obs0, target_Q_obs1 = self.sess.run(
                [self.target_Q_obs0, self.target_Q_obs1],
                feed_dict=target_feed)

        target_records = [
            ('critic_loss/Q_target_obs1_mean', np.mean(target_Q_obs1)),
            ('critic_loss/Q_target_obs1_std', np.std(target_Q_obs1)),
            ('critic_loss/Q_target_obs0_mean', np.mean(target_Q_obs0)),
            ('critic_loss/Q_target_obs0_std', np.std(target_Q_obs0)),
        ]
        self.add_list_summary(target_records, step)

        # Get all gradients and perform a synced update.
        train_feed = {
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target_Q: target_Q_obs0,
            self.rewards: batch['rewards'],
            self.obs1: batch['obs1'],
        }
        fetched = self.sess.run(
            [
                self.main_summaries, self.actor_grads, self.actor_loss,
                self.critic_grads, self.critic_loss
            ],
            feed_dict=train_feed)
        main_summaries, actor_grads, actor_loss, critic_grads, critic_loss = fetched
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        if self.train_writer:
            self.train_writer.add_summary(main_summaries, step)

        return critic_loss, actor_loss

    def initialize(self, sess, start_ckpt=None):
        """Attach a session and bring all variables into their starting state.

        Args:
            sess: session stored as ``self.sess`` and used for every later
                run.
            start_ckpt: checkpoint to restore through ``self.saver``; when it
                is falsy the global variables initializer is run instead.
        """
        self.sess = sess
        if not start_ckpt:
            self.sess.run(tf.global_variables_initializer())
        else:
            self.saver.restore(sess, start_ckpt)

        # Sync the three optimizers, then run the target-network init updates.
        for optimizer in (self.actor_optimizer, self.demo_optimizer,
                          self.critic_optimizer):
            optimizer.sync()
        self.sess.run(self.target_init_updates)

    def store_ckpt(self, save_path, epoch):
        """Save a checkpoint tagged with ``epoch``; no-op unless ``save_ckpt`` is set."""
        if not self.save_ckpt:
            return
        self.saver.save(self.sess, save_path, global_step=epoch)

    def update_target_net(self):
        """Run the soft target-network update ops once."""
        soft_updates = self.target_soft_updates
        self.sess.run(soft_updates)

    def get_stats(self, storage):
        """Evaluate the ops registered by ``setup_stats`` on a fixed sample.

        The sample is drawn from ``storage`` on the first call and cached in
        ``self.stats_sample``, so later calls measure the change in value for
        the same set of inputs.

        Returns:
            dict mapping each stats name to its value, extended with
            ``self.param_noise.get_stats()`` when param noise is configured.
        """
        if self.stats_sample is None:
            # Drawn once and kept fixed for all further computations.
            self.stats_sample = storage.sample(batch_size=self.batch_size)
        sample = self.stats_sample

        feeds = {
            self.obs0: sample['obs0'],
            self.obs1: sample['obs1'],
            self.actions: sample['actions'],
        }
        values = self.sess.run(self.stats_ops, feed_dict=feeds)

        names = list(self.stats_names)
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats.update(self.param_noise.get_stats())

        return stats

    def adapt_param_noise(self, step):
        """Adapt the parameter-noise scale from a measured policy distance.

        A separate (adaptive) copy of the policy is perturbed with the current
        stddev, its distance to the unperturbed policy is evaluated on a
        sampled batch, the distance is averaged over all MPI workers and
        passed to ``self.param_noise.adapt``.

        Args:
            step: global step used for the summaries.

        Returns:
            The MPI-averaged distance, or ``0.`` when no param noise is
            configured.
        """
        noise_spec = self.param_noise
        if noise_spec is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        current_stddev = noise_spec.current_stddev
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={self.param_noise_stddev: current_stddev})
        distance_feed = {
            self.obs0: batch['obs0'],
            self.param_noise_stddev: current_stddev,
        }
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict=distance_feed)

        world = MPI.COMM_WORLD
        mean_distance = world.allreduce(distance,
                                        op=MPI.SUM) / world.Get_size()
        noise_spec.adapt(mean_distance)
        self.add_list_summary([('param_noise/distance', mean_distance)], step)
        # Read the stddev again: adapt() may have changed it.
        self.add_list_summary(
            [('param_noise/std', noise_spec.current_stddev)], step)
        return mean_distance

    def reset(self):
        '''Reset internal state after an episode is complete.

        Resets the action noise process (if any) and, when param noise is
        configured, runs the ops that re-perturb the policy with the current
        stddev.
        '''
        action_noise = self.action_noise
        if action_noise is not None:
            action_noise.reset()

        if self.param_noise is None:
            return
        stddev_feed = {
            self.param_noise_stddev: self.param_noise.current_stddev,
        }
        self.sess.run(self.perturb_policy_ops, feed_dict=stddev_feed)

    def add_list_summary(self, summary_raw, step):
        """Write ``(tag, value)`` pairs as simple-value summaries at ``step``.

        Does nothing when ``self.train_writer`` is not set.
        """
        if not self.train_writer:
            return
        values = [
            tf.Summary.Value(tag=tag, simple_value=val)
            for tag, val in summary_raw
        ]
        self.train_writer.add_summary(tf.Summary(value=values), step)
def learn(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=7,  #50
        nb_rollout_steps=3,  #100
        reward_scale=1.0,
        render=False,
        render_eval=False,
        #   noise_type='adaptive-param_0.2',
        #   noise_type='normal_0.2',        # large noise
        #   noise_type='normal_0.02',       # small noise
        noise_type='normal_2.0',

        # action ranges 360, so noise scale should be chosen properly
        #   noise_type='normal_5',        # large noise
        #   noise_type='normal_0.2',       # small noise
        #   noise_type='normal_0.00001',      # no noise
        #   noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,  # large lr
        critic_lr=1e-3,  # large lr
        #   actor_lr=1e-7,      # small lr
        #   critic_lr=1e-3,     # small lr
        #   actor_lr = 1e-10,    # no lr
        #   critic_lr=1e-10,     # no lr
    popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker,  50
        nb_eval_steps=1,  #100
        batch_size=640,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  #50
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    nb_actions = env.num_actions

    action_shape = np.array(nb_actions * [0]).shape

    #4 pairs pos + 3 link length
    # nb_features = 2*(env.num_actions+1)+env.num_actions

    #4 pairs pos + 1 pair target pos
    nb_features = 2 * (env.num_actions + 2)

    observation_shape = np.array(nb_features * [0]).shape
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    # sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    episode_end_distance = []
    epoch_episodes = 0
    SPARSE_REWARD = False
    '''add this line to make non-initialized to be initialized'''
    agent.load_ini(sess, save_path)
    for epoch in range(nb_epochs):
        print('epochs: ', epoch)
        obs = env.reset()
        agent.save(save_path)
        epoch_episode_rewards = []

        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                # print('action:', action)

                if SPARSE_REWARD:
                    new_obs, r, done, end_distance = env.step(
                        action, SPARSE_REWARD)
                else:
                    new_obs, r, done = env.step(action, SPARSE_REWARD)

                t += 1

                episode_reward += r
                episode_step += 1
                # print('episode_re: ', episode_reward) #[1.]

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.
                # print('r: ', r)
                # '''r shape: (1,)'''
                obs = new_obs

            epoch_episode_rewards.append(episode_reward)
            if cycle == nb_epoch_cycles - 1:
                # record the distance from the end position of reacher to the goal for the last step of each episode
                if SPARSE_REWARD:
                    episode_end_distance.append(end_distance)
                else:
                    end_distance = 100.0 / r - 1
                    episode_end_distance.append(end_distance[0])

            episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []

            # filling memory with noised initialized policy & preupdate the critic networks
            preheating_step = 30  #50 episode = 600 steps, 12 steps per episode
            if epoch > preheating_step:
                # print('memory_entries: ',memory.nb_entries)
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)
                    # print('Train!')
                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()
            else:
                # update two critic networks at start
                cl = agent.update_critic()
                epoch_critic_losses.append(cl)
                print('critic loss in initial training: ', cl)
                pass

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set,mean_epoch_episode_rewards)
        step_set.append(t)
        plt.figure(1)
        plt.plot(step_set, mean_epoch_episode_rewards)
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean.png')

        plt.figure(2)
        plt.plot(step_set, episode_end_distance)
        plt.xlabel('Steps')
        plt.ylabel('Distance to Target')
        plt.savefig('ddpgini_distance.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            """Return the single value held by ``x`` as a scalar.

            Args:
                x: Either a NumPy array containing exactly one element (any
                    number of dimensions, including 0-d) or a value for which
                    ``np.isscalar`` is true.

            Returns:
                The lone array element (as a NumPy scalar) when ``x`` is an
                array, otherwise ``x`` itself.

            Raises:
                AssertionError: If ``x`` is an array whose size is not 1.
                ValueError: If ``x`` is neither an array nor a scalar.
            """
            if isinstance(x, np.ndarray):
                assert x.size == 1
                # Flatten before indexing: plain ``x[0]`` fails on 0-d arrays
                # and yields a sub-array (not a scalar) for shapes like (1, 1).
                return x.reshape(-1)[0]
            if np.isscalar(x):
                return x
            # Wrap in a 1-tuple so a tuple-valued ``x`` is formatted as a whole
            # instead of being unpacked as multiple %-format arguments.
            raise ValueError('expected scalar, got %s' % (x,))

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    print('stepset: ', step_set)
    print('rewards: ', mean_epoch_episode_rewards)
    print('distances: ', episode_end_distance)

    return agent