Example 1
    def __init__(self, env, args):
        self.env = env

        # Used to evaluate the algorithm on a sparse-reward environment: +1 for a win, -1 for a loss, 0 for every other step
        '''
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        '''
        self.env_evaluate = MeetEnv()

        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
        if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args

        # Directory for saving plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
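The runner above constructs its buffer as ReplayBuffer(args) and stores whole padded episodes rather than single transitions, which is the usual pattern in QMIX-style StarCraft II codebases. Below is a minimal sketch of what such an episode-level buffer could look like; the field names ('o', 'u', 's', 'r', 'terminated', 'padded') and the args attributes are assumptions for illustration, not the exact API of the repositories quoted in these examples.

import threading
import numpy as np

class EpisodeReplayBuffer:
    """Hypothetical episode-level replay buffer (sketch, assumed interface)."""

    def __init__(self, args):
        self.size = args.buffer_size                  # max number of stored episodes
        self.episode_limit = args.episode_limit       # max steps per episode
        self.current_idx, self.current_size = 0, 0
        # one array per field, shaped [episodes, steps, ...]
        self.buffers = {
            'o': np.zeros([self.size, self.episode_limit, args.n_agents, args.obs_shape]),
            'u': np.zeros([self.size, self.episode_limit, args.n_agents, 1]),
            's': np.zeros([self.size, self.episode_limit, args.state_shape]),
            'r': np.zeros([self.size, self.episode_limit, 1]),
            'terminated': np.zeros([self.size, self.episode_limit, 1]),
            'padded': np.zeros([self.size, self.episode_limit, 1]),
        }
        self.lock = threading.Lock()

    def store_episode(self, episode_batch):
        # overwrite the oldest slots in a circular fashion
        batch_size = episode_batch['o'].shape[0]
        with self.lock:
            idxs = np.arange(self.current_idx, self.current_idx + batch_size) % self.size
            for key in self.buffers:
                self.buffers[key][idxs] = episode_batch[key]
            self.current_idx = (self.current_idx + batch_size) % self.size
            self.current_size = min(self.current_size + batch_size, self.size)

    def sample(self, batch_size):
        # uniform sampling of stored episodes for a training step
        idxs = np.random.randint(0, self.current_size, batch_size)
        return {key: self.buffers[key][idxs] for key in self.buffers}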
Example 2
    def __init__(self, env, args):
        self.env = env

        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these 3 algorithms are on-policy
            if args.use_per:
                self.buffer = PrioritizedReplayBuffer(args)
            else:
                self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Directory for saving plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.map + '/'
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.file_name = self.save_path + str(args.env_name) + '_' + str(
            args.n_agents) + '_' + str(args.map_size) + '_' + args.name_time
Example 3
    def test_len(self):
        rb = ReplayBuffer(5)
        rb.add(Transitions[0]).add(Transitions[1]).add(Transitions[2])
        assert len(rb) == 3
        for i in range(8):
            rb.add(Transitions[i])
        assert len(rb) == 5
Example 4
    def __init__(self, env, args):
        self.env = env
        self.args = args

        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.qmix_pg_learner = QMIX_PG(self.agents, args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these 3 algorithms are on-policy
            self.actor_critic_buffer = ReplayBuffer(args, args.buffer_size)
            # self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Directory for saving plt and pkl files
        tmp = f'clamp2-5_rewardscale10_' + f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_{args.actor_train_steps}_{args.critic_train_steps}_' \
                                           f'{args.actor_update_delay}_{args.critic_lr}_{args.n_epoch}_{args.temp}'  # f'clamp2-5_'+ rewardscale10_
        self.save_path = self.args.result_dir + '/linear_mix/' + 'mcsac' + '/' + tmp + '/' + args.map  # _gradclip0.5

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example 5
    def __init__(self, env, args):
        self.env = env

        # Used to evaluate the algorithm on a sparse-reward environment: +1 for a win, -1 for a loss, 0 for every other step
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)

        if args.alg == 'commnet_coma':
            self.agents = CommNetAgents(args)
            self.rolloutWorker = CommNetRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommNetRolloutWorker(self.env_evaluate,
                                                       self.agents, args)
        else:
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents,
                                                args)
        if args.alg != 'coma' and args.alg != 'commnet_coma':
            self.buffer = ReplayBuffer(args)
        self.args = args

        # Directory for saving plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example 6
    def __init__(self, actor, critic, buffersize, game, player, batch_size, gamma):
        self.actor = actor
        self.critic = critic
        self.replay = ReplayBuffer(buffersize)
        self.game = game
        self.player = player
        self.batch_size = batch_size
        self.gamma = gamma
Example 7
    def __init__(self, env, args):
        self.env = env
        self.args = args

        self.agents = Agents(args)
        self.qmix_pg_learner = QMIX_PG(self.agents, args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find(
                'reinforce') == -1:  # these 3 algorithms are on-policy
            self.critic_buffer = ReplayBuffer(args, args.critic_buffer_size)
            self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        tmp = f'clamp2-5_' + f'{args.loss_coeff_entropy}_' + f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_{args.actor_train_steps}_{args.critic_train_steps}_' \
                                                             f'{args.actor_update_delay}_{args.critic_lr}'  # f'clamp2-5_'+  anneal_epsilon
        self.save_path = self.args.result_dir + '/linear_mix/' + 'qmix_ac_total_cf' + '/' + tmp + '/' + args.map

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example 8
    def test_circular_buffer(self):
        rb = ReplayBuffer(4)
        rb.add(Transitions[0])
        rb.add(Transitions[1])
        rb.add(Transitions[2])
        rb.add(Transitions[3])
        rb.add(Transitions[4])
        rb.add(Transitions[5])

        assert (rb._storage == [
            Transitions[4], Transitions[5], Transitions[2], Transitions[3]
        ]).all()
Example 9
    def test_random_sampling(self):
        rb = ReplayBuffer(3)
        rb.add(Transitions[0]).add(Transitions[1]).add(Transitions[1]).add(
            Transitions[2])

        samples = rb.sample(100)
        n_1, n_2 = 0, 0
        for sample in samples:
            if sample == Transitions[1]:
                n_1 += 1
            elif sample == Transitions[2]:
                n_2 += 1
            else:
                pytest.fail()

        assert n_1 > n_2
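Examples 3, 8, and 9 exercise a transition-level buffer with a chainable add, an internal _storage array, and uniform sampling. A minimal implementation that would satisfy those three tests (a sketch, assuming the items in Transitions are plain comparable objects; not the buffer actually under test) could be:

import numpy as np

class ReplayBuffer:
    """Circular buffer consistent with the tests above (hypothetical sketch)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self._storage = np.empty(capacity, dtype=object)
        self._next_idx = 0   # slot that the next add() will overwrite
        self._size = 0       # number of slots currently filled

    def __len__(self):
        return self._size

    def add(self, transition):
        # once the buffer is full, the oldest entry is overwritten first
        self._storage[self._next_idx] = transition
        self._next_idx = (self._next_idx + 1) % self.capacity
        self._size = min(self._size + 1, self.capacity)
        return self  # returning self makes rb.add(a).add(b) chainable

    def sample(self, n):
        # uniform sampling with replacement over the filled portion
        idxs = np.random.randint(0, self._size, size=n)
        return [self._storage[i] for i in idxs]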
Example 10
    def __init__(self, env, gamma=0.99, tau=0.005, hidden_size=256, device=None):
        super(NAF, self).__init__(env, device=device)  # forward the requested device instead of hard-coding None
        self.action_space = self.act_dim
        self.num_inputs = self.obs_dim
        num_inputs = self.obs_dim
        action_space = self.act_dim
        self.model = Policy(hidden_size, num_inputs, action_space).to(self.device)
        self.target_model = Policy(hidden_size, num_inputs, action_space).to(self.device)
        self.optimizer = Adam(self.model.parameters(), lr=1e-3)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim)
        self.c_loss, self.a_loss = [], []
        self.gamma = gamma
        self.tau = tau

        hard_update(self.target_model, self.model)
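The NAF constructor ends with hard_update(self.target_model, self.model) and keeps tau for later soft updates. A common definition of those two helpers (shown here as a sketch; the snippet does not include the actual functions) is:

def hard_update(target, source):
    # copy every parameter from source into target (typically done once at initialisation)
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)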
Example 11
    def __init__(self, env, args):
        self.env = env

        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Directory for saving plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example 12
    def __init__(self, config):
        self.writer = SummaryWriter() 
        self.device = 'cuda' if T.cuda.is_available() else 'cpu'

        self.dqn_type = config["dqn-type"]
        self.run_title = config["run-title"]
        self.env = gym.make(config["environment"])

        self.num_states  = np.prod(self.env.observation_space.shape)
        self.num_actions = self.env.action_space.n

        layers = [
            self.num_states, 
            *config["architecture"], 
            self.num_actions
        ]

        self.policy_net = Q_Network(self.dqn_type, layers).to(self.device)
        self.target_net = Q_Network(self.dqn_type, layers).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        capacity = config["max-experiences"]
        self.p_replay_eps = config["p-eps"]
        self.prioritized_replay = config["prioritized-replay"]
        self.replay_buffer = PrioritizedReplayBuffer(capacity, config["p-alpha"]) if self.prioritized_replay \
                        else ReplayBuffer(capacity)

        self.beta_scheduler = LinearSchedule(config["episodes"], initial_p=config["p-beta-init"], final_p=1.0)
        self.epsilon_decay = lambda e: max(config["epsilon-min"], e * config["epsilon-decay"])

        self.train_freq = config["train-freq"]
        self.use_soft_update = config["use-soft-update"]
        self.target_update = config["target-update"]
        self.tau = config["tau"]
        self.gamma = config["gamma"]
        self.batch_size = config["batch-size"]
        self.time_step = 0

        self.optim = T.optim.AdamW(self.policy_net.parameters(), lr=config["lr-init"], weight_decay=config["weight-decay"])
        self.lr_scheduler = T.optim.lr_scheduler.StepLR(self.optim, step_size=config["lr-step"], gamma=config["lr-gamma"])
        self.criterion = nn.SmoothL1Loss(reduction="none") # Huber Loss
        self.min_experiences = max(config["min-experiences"], config["batch-size"])

        self.save_path = config["save-path"]
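This agent anneals the prioritized-replay beta with LinearSchedule(config["episodes"], initial_p=config["p-beta-init"], final_p=1.0), and Example 19 uses the same class for exploration through its value(t) method. A minimal sketch of such a schedule, assuming only that value(t) interface, is:

class LinearSchedule:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps (sketch)."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # fraction of the schedule completed, clipped to [0, 1]
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)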
Example 13
    def __init__(self, env, args, itr, seed):
        # Set the random seed
        if seed is not None:
            self.setup_seed(seed)
        self.args = args

        # Get the environment
        self.env = env
        # Process index
        self.pid = itr

        self.replay_buffer = ReplayBuffer(self.args)

        self.win_rates = []
        '''
        Here, episode_reward is the cumulative reward of a single episode,
        episodes_reward collects the cumulative rewards of several episodes,
        and episodes_rewards holds those per-episode cumulative rewards across multiple evaluations.
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []

        self.max_win_rate = 0
        self.time_steps = 0

        # Where results and models are saved; the run index makes it easier to run several instances at once
        alg_dir = self.args.alg + '_' + str(self.args.epsilon_anneal_steps // 10000) + 'w' + '_' + \
                  str(self.args.target_update_period)
        self.alg_tag = '_' + self.args.optim

        if self.args.her:
            self.alg_tag += str(self.args.her)
            alg_dir += '_her=' + str(self.args.her)

        # self.save_path = self.args.result_dir + '/' + alg_dir + '/' + self.args.map + '/' + itr
        self.save_path = self.args.result_dir + '/' + self.args.map + '/' + alg_dir + '/' + str(itr)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.args.model_dir = args.model_dir + '/' + args.map + '/' + alg_dir + '/' + str(itr)

        self.agents = Agents(args, itr=itr)
        print('step runner initialized')
        if self.args.her:
            print('Using HER')
Example 14
    def __init__(self, env, args):
        self.env = env
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.buffer = ReplayBuffer(args)
        self.args = args
        self.epsilon = args.epsilon

        # Used to evaluate the algorithm on a sparse-reward environment: +1 for a win, -1 for a loss, 0 for every other step
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents,
                                            args)
Example 15
    def __init__(self, env, args):
        self.env = env

        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.plt_success = []
        self.episode_rewards = []

        # Directory for saving plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.env_name
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example 16
    def __init__(self, env, args, itr):
        # Get the arguments
        # self.args = get_common_args()
        self.args = args

        # Get the environment
        self.env = env
        # Process index
        self.pid = itr

        self.agents = Agents(args, itr=itr)
        # Without network reuse there are multiple agents; when parameters are shared during training it is effectively a single network
        # if not self.args.reuse_network:
        #     self.agents = []
        #     for i in range(self.args.n_agents):
        #         self.agents.append(Agents(self.args, i))

        # self.rollout = RollOut(self.agents, self.args)

        self.replay_buffer = ReplayBuffer(self.args)

        self.win_rates = []
        '''
        Here, episode_reward is the cumulative reward of a single episode,
        episodes_reward collects the cumulative rewards of several episodes,
        and episodes_rewards holds those per-episode cumulative rewards across multiple evaluations.
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []

        self.max_win_rate = 0

        # Where results and models are saved; the run index makes it easier to run several instances at once
        self.save_path = self.args.result_dir + '/' + self.args.alg + '/' + self.args.map + '/' + str(
            itr)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        print('runner initialized')
Example 17
    quant_idx = quant_idx.cpu().data
    batch_idx = np.arange(batch_size)
    tau = tau_hat[:, quant_idx][batch_idx, batch_idx]
        
    return tau, expected_quant

num_quant = 51
Vmin = -10
Vmax = 10

current_model = QRDQN(env.observation_space.shape[0], env.action_space.n, num_quant)
target_model  = QRDQN(env.observation_space.shape[0], env.action_space.n, num_quant)
    
optimizer = optim.Adam(current_model.parameters())

replay_buffer = ReplayBuffer(10000)

def update_target(current_model, target_model):
    target_model.load_state_dict(current_model.state_dict())
    
update_target(current_model, target_model)

def compute_td_loss(batch_size):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size) 

    state      = autograd.Variable(torch.FloatTensor(np.float32(state)))
    next_state = autograd.Variable(torch.FloatTensor(np.float32(next_state)), volatile=True)
    action     = autograd.Variable(torch.LongTensor(action))
    reward     = torch.FloatTensor(reward)
    done       = torch.FloatTensor(np.float32(done))
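This QR-DQN snippet constructs ReplayBuffer(10000) and later unpacks state, action, reward, next_state, done = replay_buffer.sample(batch_size). A deque-based sketch with that interface (the push method name is an assumption; only sample appears above) might be:

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    """Uniform transition buffer returning stacked numpy batches (hypothetical sketch)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest transitions fall off automatically

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)
        return (np.stack(state), np.array(action), np.array(reward),
                np.stack(next_state), np.array(done, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)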
Example 18
])

# Use soft updates to update the target networks
target_update = tf.group([
    tf.assign(v_targ, DECAY * v_targ + (1 - DECAY) * v_main)
    for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
])

init = tf.global_variables_initializer()
session = tf.Session()
session.run(init)
session.run(target_init)

#%% Replay Buffer

replay_buffer = ReplayBuffer(observation_shape=env.observation_space.shape,
                             action_shape=(1, ))


# %% Play
def sample_action(env, observation, epsilon):
    if np.random.random() < epsilon:
        return env.action_space.sample()
    else:
        q_s_a = session.run(q, feed_dict={x: np.atleast_2d(observation)})[0]
        return np.argmax(q_s_a)


def play_once(env, epsilon, render=False):
    observation = env.reset()
    done = False
    steps = 0
Example 19
def learn(env,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16):
    torch.set_num_threads(num_cpu)
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,
            final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)
    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    screen = player_relative

    obs, xy_per_marine = common.init(env, obs)

    group_id = 0
    reset = True
    dqn = DQN(num_actions, lr, cuda)

    print('\nCollecting experience...')
    checkpoint_path = 'models/deepq/checkpoint.pth.tar'
    if os.path.exists(checkpoint_path):
        dqn, saved_mean_reward = load_checkpoint(dqn, cuda, filename=checkpoint_path)
    for t in range(max_timesteps):
        # Take action and update exploration to the newest value
        # custom process for DefeatZerglingsAndBanelings
        obs, screen, player = common.select_marine(env, obs)
        # action = act(
        #     np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
        action = dqn.choose_action(np.array(screen)[None])
        reset = False
        rew = 0
        new_action = None
        obs, new_action = common.marine_action(env, obs, player, action)
        army_count = env._obs[0].observation.player_common.army_count
        try:
            if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]:
                obs = env.step(actions=new_action)
            else:
                new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                obs = env.step(actions=new_action)
        except Exception as e:
            # print(e)
            pass  # do nothing
        player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
        new_screen = player_relative
        rew += obs[0].reward
        done = obs[0].step_type == environment.StepType.LAST
        selected = obs[0].observation["screen"][_SELECTED]
        player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
        if len(player_y) > 0:
            player = [int(player_x.mean()), int(player_y.mean())]
        if len(player) == 2:
            if player[0] > 32:
                new_screen = common.shift(LEFT, player[0] - 32, new_screen)
            elif player[0] < 32:
                new_screen = common.shift(RIGHT, 32 - player[0],
                                          new_screen)
            if player[1] > 32:
                new_screen = common.shift(UP, player[1] - 32, new_screen)
            elif player[1] < 32:
                new_screen = common.shift(DOWN, 32 - player[1], new_screen)
        # Store transition in the replay buffer.
        replay_buffer.add(screen, action, rew, new_screen, float(done))
        screen = new_screen
        episode_rewards[-1] += rew
        reward = episode_rewards[-1]
        if done:
            print("Episode Reward : %s" % episode_rewards[-1])
            obs = env.reset()
            player_relative = obs[0].observation["screen"][
                _PLAYER_RELATIVE]
            screen = player_relative
            group_list = common.init(env, obs)
            # Select all marines first
            # env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
            episode_rewards.append(0.0)
            reset = True

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(
                    batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            td_errors = dqn.learn(obses_t, actions, rewards, obses_tp1, gamma, batch_size)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes,
                                                new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            dqn.update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("reward", reward)
            logger.record_tabular("mean 100 episode reward",
                                  mean_100ep_reward)
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.dump_tabular()

        if (checkpoint_freq is not None and t > learning_starts
                and num_episodes > 100 and t % checkpoint_freq == 0):
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                if print_freq is not None:
                    logger.log(
                        "Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward,
                            mean_100ep_reward))
                save_checkpoint({
                    'epoch': t + 1,
                    'state_dict': dqn.save_state_dict(),
                    'best_accuracy': mean_100ep_reward
                }, checkpoint_path)
                saved_mean_reward = mean_100ep_reward
Example 20
    def __init__(self, parameters):
        super(Rainbow, self).__init__(parameters)
        self.replay_buffer = ReplayBuffer(self.buffersize)
Example 21
    def __init__(self, config):
        self.config = config

        self.PD_freq = self.config.conf['LLC-frequency']
        self.Physics_freq = self.config.conf['Physics-frequency']
        self.network_freq = self.config.conf['HLC-frequency']
        self.sampling_skip = int(self.PD_freq / self.network_freq)

        self.reward_decay = 1.0
        self.reward_scale = config.conf['reward-scale']
        self.reward_scale = self.reward_scale / float(
            self.sampling_skip)  # /10.0#normalizing reward to 1

        self.max_time_per_train_episode = self.config.conf['max-train-time']
        self.max_step_per_train_episode = int(self.max_time_per_train_episode *
                                              self.network_freq)
        self.max_time_per_test_episode = self.config.conf['max-test-time']  #16
        self.max_step_per_test_episode = int(self.max_time_per_test_episode *
                                             self.network_freq)
        self.train_external_force_disturbance = True
        if self.train_external_force_disturbance:
            path_str = 'with_external_force_disturbance/'
        else:
            path_str = 'without_external_force_disturbance/'
        self.test_external_force_disturbance = True

        self.env = Valkyrie(
            max_time=self.max_time_per_train_episode,
            renders=False,
            initial_gap_time=0.5,
            PD_freq=self.PD_freq,
            Physics_freq=self.Physics_freq,
            Kp=config.conf['Kp'],
            Kd=config.conf['Kd'],
            bullet_default_PD=config.conf['bullet-default-PD'],
            controlled_joints_list=config.conf['controlled-joints'])

        config.conf['state-dim'] = self.env.stateNumber
        self.agent = Agent(self.env, self.config)

        self.episode_count = 0
        self.step_count = 0
        self.train_iter_count = 0

        self.best_reward = 0
        self.best_episode = 0
        self.best_train_iter = 0

        self.control = Control(self.config, self.env)

        # load weight from previous network
        # dir_path = 'record/2017_12_04_15.20.44/no_force'  # '2017_05_29_18.23.49/with_force'

        # create new network
        dir_path = 'TRPO/record/' + '3D_push/' + path_str + datetime.now(
        ).strftime('%Y_%m_%d_%H.%M.%S')
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        if not os.path.exists(dir_path + '/saved_actor_networks'):
            os.makedirs(dir_path + '/saved_actor_networks')
        if not os.path.exists(dir_path + '/saved_critic_networks'):
            os.makedirs(dir_path + '/saved_critic_networks')
        self.logging = logger(dir_path)
        config.save_configuration(dir_path)
        config.record_configuration(dir_path)
        config.print_configuration()
        self.agent.load_weight(dir_path)
        self.dir_path = dir_path

        self.on_policy_paths = []
        self.off_policy_paths = []
        self.buffer = ReplayBuffer(self.config.conf['replay-buffer-size'])

        self.force = [0, 0, 0]
        self.force_chest = [0, 0,
                            0]  # max(0,force_chest[1]-300*1.0 / EXPLORE)]
        self.force_pelvis = [0, 0, 0]
Example 22
    def reset_replay_buffer(self):
        self.replay_buffer = ReplayBuffer(1e6)
Example 23
            noise=config.get("noise", None)
            if m != config["target_map"] else None,
            vsn=config.get("vsn", None) if m != config["target_map"] else None,
            ally_indices=ally_indices,
            enemy_indices=enemy_indices,
        ) for m, d in zip(config["map_names"], difficulties)
    ]

    for env in train_envs:
        env_info = env.get_env_info()
        target_info = target_env.get_env_info()
        env.buffer = ReplayBuffer(
            n_actions=target_info['n_actions'],
            n_agents=env_info['n_agents'],
            obs_shape=target_info['obs_shape'],
            state_shape=target_info['state_shape'],
            episode_limit=env_info['episode_limit'],
            size=args.buffer_size,
            alg=args.alg,
            dtype=np.float16,
        )
        logging.info(env_info)
    # change args to accommodate largest possible env
    # assures the widths of the created neural networks are sufficient
    env_info = target_env.get_env_info()
    args.n_actions = env_info["n_actions"]
    args.n_agents = env_info["n_agents"]
    args.state_shape = env_info["state_shape"]
    args.obs_shape = env_info["obs_shape"]
    args.episode_limit = env_info["episode_limit"]

    runner = Runner(None, args, target_env)
Example 24
    def __init__(self, input_space, act_space, scope, args):
        self.input_shape = input_space
        self.act_space = act_space
        self.scope = scope
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
        self.optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
        self.grad_norm_clipping = 0.5
        with tf.variable_scope(self.scope):
            act_pdtype = make_pdtype(act_space)

            # act_ph = act_pdtype.sample_placeholder([None], name= "action")
            act_ph = tf.placeholder(tf.float32, shape=(None, 1))
            if args.game == "RoboschoolPong-v1":
                obs_ph = tf.placeholder(tf.float32,
                                        shape=(None, input_space.shape[0]))
            elif args.game == "Pong-2p-v0":
                obs_ph = tf.placeholder(tf.float32,
                                        shape=(None, input_space.shape[0],
                                               input_space.shape[1],
                                               input_space.shape[2]))
            q_target = tf.placeholder(tf.float32, shape=(None, ))

            #build the world representation z
            z = conv_model(obs_ph, 20, scope="world_model")
            p_input = z

            p = mlp_model(p_input, 2, scope="p_func")
            p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

            act_pd = act_pdtype.pdfromflat(p)
            act_sample = act_pd.sample()

            p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

            q_input = tf.concat([z, act_sample], -1)
            q = mlp_model(q_input, 1, scope="q_func")
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
            pg_loss = -tf.reduce_mean(q)

            q_loss = tf.reduce_mean(tf.square(q - q_target))
            # q_reg = tf.reduce_mean(tf.square(q))
            q_optimize_expr = U.minimize_and_clip(self.optimizer, q_loss,
                                                  q_func_vars,
                                                  self.grad_norm_clipping)

            p_loss = pg_loss + p_reg * 1e-3

            p_optimize_expr = U.minimize_and_clip(self.optimizer, p_loss,
                                                  p_func_vars,
                                                  self.grad_norm_clipping)

            p_values = U.function([obs_ph], p)

            target_p = mlp_model(z, 2, scope="target_p_func")
            target_p_func_vars = U.scope_vars(
                U.absolute_scope_name("target_p_func"))

            target_q = mlp_model(q_input, 1, scope="target_q_func")
            target_q_func_vars = U.scope_vars(
                U.absolute_scope_name("target_q_func"))
            target_act_sample = act_pdtype.pdfromflat(target_p).sample()

            self.update_target_p = make_update_exp(p_func_vars,
                                                   target_p_func_vars)
            self.update_target_q = make_update_exp(q_func_vars,
                                                   target_q_func_vars)

            self.act = U.function(inputs=[obs_ph], outputs=act_sample)
            self.target_act = U.function(inputs=[obs_ph],
                                         outputs=target_act_sample)
            self.p_train = U.function(inputs=[obs_ph] + [act_ph],
                                      outputs=p_loss,
                                      updates=[p_optimize_expr])
            self.q_train = U.function(inputs=[obs_ph] + [act_ph] + [q_target],
                                      outputs=q_loss,
                                      updates=[q_optimize_expr])
            self.q_values = U.function([obs_ph] + [act_ph], q)
            self.target_q_values = U.function([obs_ph] + [act_ph], target_q)
Example 25
def train(train_env, agent_action_fn, eval_mode=False):
    action_space = train_env.action_space
    obs_space = train_env.observation_space

    ######### instantiate actor,critic, replay buffer, uo-process#########
    ## feed online with state. feed target with next_state.
    online_state_inputs = tf.placeholder(tf.float32,
                                         shape=(None, obs_space.shape[0]),
                                         name="online_state_inputs")

    target_state_inputs = tf.placeholder(tf.float32,
                                         shape=online_state_inputs.shape,
                                         name="target_state_inputs")

    ## inputs to q_net for training q.
    online_action_inputs_training_q = tf.placeholder(
        tf.float32,
        shape=(None, action_space.shape[0]),
        name='online_action_batch_inputs')
    # condition bool scalar to switch action inputs to online q.
    # feed True: training q.
    # feed False: training policy.
    cond_training_q = tf.placeholder(tf.bool, shape=[], name='cond_training_q')

    terminated_inputs = tf.placeholder(tf.float32,
                                       shape=(None),
                                       name='terminated_inputs')
    reward_inputs = tf.placeholder(tf.float32,
                                   shape=(None),
                                   name='rewards_inputs')

    # for summary text
    summary_text_tensor = tf.convert_to_tensor(str('summary_text'),
                                               preferred_dtype=string)
    tf.summary.text(name='summary_text',
                    tensor=summary_text_tensor,
                    collections=[DDPG_CFG.log_summary_keys])

    ##instantiate actor, critic.
    actor = Actor(
        action_dim=action_space.shape[0],
        online_state_inputs=online_state_inputs,
        target_state_inputs=target_state_inputs,
        input_normalizer=DDPG_CFG.actor_input_normalizer,
        input_norm_params=DDPG_CFG.actor_input_norm_params,
        n_fc_units=DDPG_CFG.actor_n_fc_units,
        fc_activations=DDPG_CFG.actor_fc_activations,
        fc_initializers=DDPG_CFG.actor_fc_initializers,
        fc_normalizers=DDPG_CFG.actor_fc_normalizers,
        fc_norm_params=DDPG_CFG.actor_fc_norm_params,
        fc_regularizers=DDPG_CFG.actor_fc_regularizers,
        output_layer_initializer=DDPG_CFG.actor_output_layer_initializer,
        output_layer_regularizer=None,
        output_normalizers=DDPG_CFG.actor_output_layer_normalizers,
        output_norm_params=DDPG_CFG.actor_output_layer_norm_params,
        output_bound_fns=DDPG_CFG.actor_output_bound_fns,
        learning_rate=DDPG_CFG.actor_learning_rate,
        is_training=is_training)

    critic = Critic(
        online_state_inputs=online_state_inputs,
        target_state_inputs=target_state_inputs,
        input_normalizer=DDPG_CFG.critic_input_normalizer,
        input_norm_params=DDPG_CFG.critic_input_norm_params,
        online_action_inputs_training_q=online_action_inputs_training_q,
        online_action_inputs_training_policy=actor.
        online_action_outputs_tensor,
        cond_training_q=cond_training_q,
        target_action_inputs=actor.target_action_outputs_tensor,
        n_fc_units=DDPG_CFG.critic_n_fc_units,
        fc_activations=DDPG_CFG.critic_fc_activations,
        fc_initializers=DDPG_CFG.critic_fc_initializers,
        fc_normalizers=DDPG_CFG.critic_fc_normalizers,
        fc_norm_params=DDPG_CFG.critic_fc_norm_params,
        fc_regularizers=DDPG_CFG.critic_fc_regularizers,
        output_layer_initializer=DDPG_CFG.critic_output_layer_initializer,
        output_layer_regularizer=None,
        learning_rate=DDPG_CFG.critic_learning_rate)

    ## track updates.
    global_step_tensor = tf.train.create_global_step()

    ## build whole graph
    copy_online_to_target_op, train_online_policy_op, train_online_q_op, update_target_op, saver \
      = build_ddpg_graph(actor, critic, reward_inputs, terminated_inputs, global_step_tensor)

    #we save the replay buffer data to files.
    replay_buffer = ReplayBuffer(
        buffer_size=DDPG_CFG.replay_buff_size,
        save_segment_size=DDPG_CFG.replay_buff_save_segment_size,
        save_path=DDPG_CFG.replay_buffer_file_path,
        seed=DDPG_CFG.random_seed)
    if DDPG_CFG.load_replay_buffer_set:
        replay_buffer.load(DDPG_CFG.replay_buffer_file_path)

    sess = tf.Session(graph=tf.get_default_graph())
    summary_writer = tf.summary.FileWriter(logdir=os.path.join(
        DDPG_CFG.log_dir, "train"),
                                           graph=sess.graph)
    log_summary_op = tf.summary.merge_all(key=DDPG_CFG.log_summary_keys)

    sess.run(fetches=[tf.global_variables_initializer()])

    #copy init params from online to target
    sess.run(fetches=[copy_online_to_target_op])

    # Load a previous checkpoint if it exists
    latest_checkpoint = tf.train.latest_checkpoint(DDPG_CFG.checkpoint_dir)
    if latest_checkpoint:
        tf.logging.info(
            "==== Loading model checkpoint: {}".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    elif eval_mode:
        raise FileNotFoundError(
            '=== In evaluation mode a checkpoint file is required, but none was found. ==='
        )

    ####### start training #########
    obs = train_env.reset()
    transition = preprocess_low_dim(obs)

    n_episodes = 1

    if not eval_mode:
        for step in range(1, DDPG_CFG.num_training_steps):
            #replace with new transition
            policy_out = sess.run(fetches=[actor.online_action_outputs_tensor],
                                  feed_dict={
                                      online_state_inputs:
                                      transition.next_state[np.newaxis, :],
                                      is_training:
                                      False
                                  })[0]
            transition = agent_action_fn(policy_out, replay_buffer, train_env)
            if step % 200 == 0:
                tf.logging.info(' +++++++++++++++++++ global_step:{} action:{}'
                                '  reward:{} term:{}'.format(
                                    step, transition.action, transition.reward,
                                    transition.terminated))
            if step < 10:
                #feed some transitions in buffer.
                continue
            ## ++++ sample mini-batch and train.++++
            state_batch, action_batch, reward_batch, next_state_batch, terminated_batch = \
             replay_buffer.sample_batch(DDPG_CFG.batch_size)

            # ---- 1. train policy.-----------
            sess.run(
                fetches=[train_online_policy_op],
                feed_dict={
                    online_state_inputs: state_batch,
                    cond_training_q: False,
                    online_action_inputs_training_q:
                    action_batch,  # feed but not used.
                    is_training: True
                })

            # ---- 2. train q. --------------
            sess.run(fetches=[train_online_q_op],
                     feed_dict={
                         online_state_inputs: state_batch,
                         cond_training_q: True,
                         online_action_inputs_training_q: action_batch,
                         target_state_inputs: next_state_batch,
                         reward_inputs: reward_batch,
                         terminated_inputs: terminated_batch,
                         is_training: True
                     })

            # ----- 3. update target ---------
            sess.run(fetches=[update_target_op], feed_dict=None)

            # do evaluation after eval_freq steps:
            if step % DDPG_CFG.eval_freq == 0:  ##and step > DDPG_CFG.eval_freq:
                evaluate(env=train_env,
                         num_eval_steps=DDPG_CFG.num_eval_steps,
                         preprocess_fn=preprocess_low_dim,
                         estimate_fn=lambda state: sess.run(
                             fetches=[actor.online_action_outputs_tensor],
                             feed_dict={
                                 online_state_inputs: state,
                                 is_training: False
                             }),
                         summary_writer=summary_writer,
                         saver=saver,
                         sess=sess,
                         global_step=step,
                         log_summary_op=log_summary_op,
                         summary_text_tensor=summary_text_tensor)

            if transition.terminated:
                transition = preprocess_low_dim(train_env.reset())
                n_episodes += 1
                continue  # begin new episode

    else:  #eval mode
        evaluate(env=train_env,
                 num_eval_steps=DDPG_CFG.eval_steps_after_training,
                 preprocess_fn=preprocess_low_dim,
                 estimate_fn=lambda state: sess.run(
                     fetches=[actor.online_action_outputs_tensor],
                     feed_dict={
                         online_state_inputs: state,
                         is_training: False
                     }),
                 summary_writer=summary_writer,
                 saver=None,
                 sess=sess,
                 global_step=0,
                 log_summary_op=log_summary_op,
                 summary_text_tensor=summary_text_tensor)

    sess.close()
    train_env.close()
Example 26
def train(train_env, monitor_env, agent_action_fn, noise_process):
    '''
    :return:
    '''
    action_space = train_env.action_space
    obs_space = train_env.observation_space

    ######### instantiate actor,critic, replay buffer, uo-process#########
    ## feed online with state. feed target with next_state.
    online_state_inputs = tf.placeholder(tf.float32,
                                         shape=(None, obs_space.shape[0]),
                                         name="online_state_inputs")

    # tf.logging.info('@@@@ online_state_inputs shape:{}'.format(online_state_inputs.shape))
    target_state_inputs = tf.placeholder(tf.float32,
                                         shape=online_state_inputs.shape,
                                         name="target_state_inputs")

    ## inputs to q_net for training q.
    online_action_inputs_training_q = tf.placeholder(
        tf.float32,
        shape=(None, action_space.shape[0]),
        name='online_action_batch_inputs')
    # condition bool scalar to switch action inputs to online q.
    # feed True: training q.
    # feed False: training policy.
    cond_training_q = tf.placeholder(tf.bool, shape=[], name='cond_training_q')

    # batch_size vector.
    terminated_inputs = tf.placeholder(tf.float32,
                                       shape=(None),
                                       name='terminated_inputs')
    reward_inputs = tf.placeholder(tf.float32,
                                   shape=(None),
                                   name='rewards_inputs')

    #for l_r decay
    actor_l_r = tf.placeholder(tf.float32, shape=[], name='actor_l_r')
    critic_l_r = tf.placeholder(tf.float32, shape=[], name='critic_l_r')

    #for summary text
    summary_text_tensor = tf.convert_to_tensor(str('summary_text'),
                                               preferred_dtype=string)
    tf.summary.text(name='summary_text',
                    tensor=summary_text_tensor,
                    collections=[DDPG_CFG.log_summary_keys])

    ##instantiate actor, critic.
    actor = Actor(
        action_dim=action_space.shape[0],
        online_state_inputs=online_state_inputs,
        target_state_inputs=target_state_inputs,
        input_normalizer=DDPG_CFG.actor_input_normalizer,
        input_norm_params=DDPG_CFG.actor_input_norm_params,
        n_fc_units=DDPG_CFG.actor_n_fc_units,
        fc_activations=DDPG_CFG.actor_fc_activations,
        fc_initializers=DDPG_CFG.actor_fc_initializers,
        fc_normalizers=DDPG_CFG.actor_fc_normalizers,
        fc_norm_params=DDPG_CFG.actor_fc_norm_params,
        fc_regularizers=DDPG_CFG.actor_fc_regularizers,
        output_layer_initializer=DDPG_CFG.actor_output_layer_initializer,
        # output_layer_regularizer=DDPG_CFG.actor_output_layer_regularizer,
        output_layer_regularizer=None,
        output_normalizers=DDPG_CFG.actor_output_layer_normalizers,
        output_norm_params=DDPG_CFG.actor_output_layer_norm_params,
        # output_normalizers=None,
        # output_norm_params=None,
        output_bound_fns=DDPG_CFG.actor_output_bound_fns,
        learning_rate=actor_l_r,
        is_training=is_training)

    critic = Critic(
        online_state_inputs=online_state_inputs,
        target_state_inputs=target_state_inputs,
        input_normalizer=DDPG_CFG.critic_input_normalizer,
        input_norm_params=DDPG_CFG.critic_input_norm_params,
        online_action_inputs_training_q=online_action_inputs_training_q,
        online_action_inputs_training_policy=actor.
        online_action_outputs_tensor,
        cond_training_q=cond_training_q,
        target_action_inputs=actor.target_action_outputs_tensor,
        n_fc_units=DDPG_CFG.critic_n_fc_units,
        fc_activations=DDPG_CFG.critic_fc_activations,
        fc_initializers=DDPG_CFG.critic_fc_initializers,
        fc_normalizers=DDPG_CFG.critic_fc_normalizers,
        fc_norm_params=DDPG_CFG.critic_fc_norm_params,
        fc_regularizers=DDPG_CFG.critic_fc_regularizers,
        output_layer_initializer=DDPG_CFG.critic_output_layer_initializer,
        output_layer_regularizer=DDPG_CFG.critic_output_layer_regularizer,
        # output_layer_regularizer = None,
        learning_rate=critic_l_r)

    ## track updates.
    global_step_tensor = tf.train.create_global_step()

    ## build whole graph
    copy_online_to_target_op, train_online_policy_op, train_online_q_op, update_target_op, saver,q_loss_tensor \
      = build_ddpg_graph(actor, critic, reward_inputs, terminated_inputs, global_step_tensor)

    #we save the replay buffer data to files.
    replay_buffer = ReplayBuffer(
        buffer_size=DDPG_CFG.replay_buff_size,
        save_segment_size=DDPG_CFG.replay_buff_save_segment_size,
        save_path=DDPG_CFG.replay_buffer_file_path,
        seed=DDPG_CFG.random_seed)
    ##TODO test load replay buffer from files.
    if DDPG_CFG.load_replay_buffer_set:
        replay_buffer.load(DDPG_CFG.replay_buffer_file_path)

    # ===  finish building ddpg graph before this =================#

    ##create tf default session
    sess = tf.Session(graph=tf.get_default_graph())
    '''
  # note: will transfer graph to graphdef now. so we must finish all the computation graph
  # before creating summary writer.
  '''
    summary_writer = tf.summary.FileWriter(logdir=os.path.join(
        DDPG_CFG.log_dir, "train"),
                                           graph=sess.graph)
    actor_summary_op = tf.summary.merge_all(key=DDPG_CFG.actor_summary_keys)
    critic_summary_op = tf.summary.merge_all(key=DDPG_CFG.critic_summary_keys)
    log_summary_op = tf.summary.merge_all(key=DDPG_CFG.log_summary_keys)
    ######### initialize computation graph  ############
    '''
  # -------------trace graphdef only
  whole_graph_def = meta_graph.create_meta_graph_def(graph_def=sess.graph.as_graph_def())
  summary_writer.add_meta_graph(whole_graph_def,global_step=1)
  summary_writer.flush()

  run_options = tf.RunOptions(output_partition_graphs=True, trace_level=tf.RunOptions.FULL_TRACE)
  run_metadata = tf.RunMetadata()

  # including copy target -> online
  sess.run(fetches=[init_op],
           options=run_options,
           run_metadata=run_metadata
           )
  graphdef_part1 = run_metadata.partition_graphs[0]
  meta_graph_part1 = meta_graph.create_meta_graph_def(graph_def=graphdef_part1)
  part1_metagraph_writer = tf.summary.FileWriter(DDPG_CFG.log_dir + '/part1_metagraph')
  part1_metagraph_writer.add_meta_graph(meta_graph_part1)
  part1_metagraph_writer.close()

  graphdef_part2 = run_metadata.partition_graphs[1]
  meta_graph_part2 = meta_graph.create_meta_graph_def(graph_def=graphdef_part2)
  part2_metagraph_writer = tf.summary.FileWriter(DDPG_CFG.log_dir + '/part2_metagraph')
  part2_metagraph_writer.add_meta_graph(meta_graph_part2)
  part2_metagraph_writer.close()
  # --------------- end trace
  '''

    sess.run(fetches=[tf.global_variables_initializer()])

    #copy init params from online to target
    sess.run(fetches=[copy_online_to_target_op])

    # Load a previous checkpoint if it exists
    latest_checkpoint = tf.train.latest_checkpoint(DDPG_CFG.checkpoint_dir)
    if latest_checkpoint:
        print("=== Loading model checkpoint: {}".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    ####### start training #########

    if not DDPG_CFG.train_from_replay_buffer_set_only:
        obs = train_env.reset()
        transition = preprocess_low_dim(obs)

    n_episodes = 1
    update_start = 0.0

    for step in range(1, DDPG_CFG.num_training_steps):
        noise_process.reset()

        #replace with new transition
        if not DDPG_CFG.train_from_replay_buffer_set_only:  #no need new samples
            transition = agent_action_fn(step, sess, actor,
                                         online_state_inputs, is_training,
                                         transition.next_state[np.newaxis, :],
                                         replay_buffer, noise_process,
                                         train_env)
        if step % DDPG_CFG.summary_transition_freq == 0:
            summary_transition(summary_writer, action_space.shape[0],
                               transition, step)

        # after fill replay_buffer with some states, we start learn.
        if step > DDPG_CFG.learn_start:
            # test update duration at first 10 update
            if step < (DDPG_CFG.learn_start + 10):
                update_start = time.time()

            ## ++++ sample mini-batch and train.++++
            state_batch, action_batch, reward_batch, next_state_batch, terminated_batch = \
              replay_buffer.sample_batch(DDPG_CFG.batch_size)

            if step % 2000 == 0 and DDPG_CFG.train_from_replay_buffer_set_only:
                tf.logging.info(
                    '@@@@@ train from buffer only -one sample - global_step:{} action:{}'
                    '  reward:{} term:{} @@@@@@@@@@'.format(
                        step, action_batch[0], reward_batch[0],
                        terminated_batch[0]))

            # ---- 1. train policy.-----------
            # no need to feed reward, next_state, terminated which are un-used in policy update.
            # run_options = tf.RunOptions(output_partition_graphs=True, trace_level=tf.RunOptions.FULL_TRACE)
            if 0 == step % DDPG_CFG.summary_freq:
                # run_metadata = tf.RunMetadata()
                _, actor_summary = sess.run(
                    fetches=[train_online_policy_op, actor_summary_op],
                    feed_dict={
                        online_state_inputs: state_batch,
                        cond_training_q: False,
                        online_action_inputs_training_q:
                        action_batch,  # feed but not used.
                        actor_l_r: l_r_decay(DDPG_CFG.actor_learning_rate,
                                             step),
                        is_training: True
                    })
                # options=run_options,
                # run_metadata=run_metadata)
                # summary_writer._add_graph_def(run_metadata.partition_graphs[0])

                # the policy online network is updated above and will not affect training q.
                # ---- 2. train q. --------------
                _, critic_summary = sess.run(
                    fetches=[train_online_q_op, critic_summary_op],
                    feed_dict={
                        online_state_inputs: state_batch,
                        cond_training_q: True,
                        online_action_inputs_training_q: action_batch,
                        target_state_inputs: next_state_batch,
                        reward_inputs: reward_batch,
                        terminated_inputs: terminated_batch,
                        critic_l_r: l_r_decay(DDPG_CFG.critic_learning_rate,
                                              step),
                        is_training: True
                    })

                summary_writer.add_summary(actor_summary)
                summary_writer.add_summary(critic_summary)
                summary_writer.flush()
            else:
                _ = sess.run(
                    fetches=[train_online_policy_op],
                    feed_dict={
                        online_state_inputs: state_batch,
                        cond_training_q: False,
                        online_action_inputs_training_q:
                        action_batch,  # feed but not used.
                        actor_l_r: l_r_decay(DDPG_CFG.actor_learning_rate,
                                             step),
                        is_training: True
                    })

                # the policy online network is updated above and will not affect training q.
                # ---- 2. train q. --------------
                _, q_loss_value = sess.run(
                    fetches=[train_online_q_op, q_loss_tensor],
                    feed_dict={
                        online_state_inputs: state_batch,
                        cond_training_q: True,
                        online_action_inputs_training_q: action_batch,
                        target_state_inputs: next_state_batch,
                        reward_inputs: reward_batch,
                        terminated_inputs: terminated_batch,
                        critic_l_r: l_r_decay(DDPG_CFG.critic_learning_rate,
                                              step),
                        is_training: True
                    })
                if step % 2000 == 0:
                    tf.logging.info('@@ step:{} q_loss:{}'.format(
                        step, q_loss_value))

            # --end of summary --
            # ----- 3. update target ---------
            # including increment global step.
            _ = sess.run(fetches=[update_target_op], feed_dict=None)

            # test update duration at first 10 update
            if step < (DDPG_CFG.learn_start + 10):
                tf.logging.info(
                    ' @@@@ one batch learn duration @@@@:{}'.format(
                        time.time() - update_start))

            # do evaluation after eval_freq steps:
            if step % DDPG_CFG.eval_freq == 0:  # and step > DDPG_CFG.eval_freq:
                evaluate(env=monitor_env,
                         num_eval_steps=DDPG_CFG.num_eval_steps,
                         preprocess_fn=preprocess_low_dim,
                         estimate_fn=lambda state: sess.run(
                             fetches=[actor.online_action_outputs_tensor],
                             feed_dict={
                                 online_state_inputs: state,
                                 is_training: False
                             }),
                         summary_writer=summary_writer,
                         saver=saver,
                         sess=sess,
                         global_step=step,
                         log_summary_op=log_summary_op,
                         summary_text_tensor=summary_text_tensor)
                # if monitor_env is train_env:
                #   #torcs share. we should reset
                #   transition.terminated = True #fall through
        #-- end of learn

        # TODO: temporary solution to toggle vision; use a thread instead.
        if step % 2000 == 0:
            v_on = os.path.exists('/home/yuheng/Desktop/train_vision_on')
            if not train_env.vision_status and v_on:
                train_env.vision_on()  # will display at next reset
                transition = preprocess_low_dim(train_env.reset(relaunch=True))
                n_episodes += 1
                tf.logging.info('@@ episodes: {} @@'.format(n_episodes))
                continue
            elif train_env.vision_status and not v_on:
                train_env.vision_off()
                transition = preprocess_low_dim(train_env.reset(relaunch=True))
                n_episodes += 1
                tf.logging.info('@@ episodes: {} @@'.format(n_episodes))
                continue

            # if os.path.exists('/home/yuheng/Desktop/eval_vision_on'):
            #   monitor_env.vision_on()  # will display next reset
            # else:
            #   monitor_env.vision_off()

        if (not DDPG_CFG.train_from_replay_buffer_set_only) and (
                transition.terminated):
            # relaunch TORCS every 3 episodes because of the memory-leak error.
            # replace with the transition observed after reset; only the state is saved.
            transition = preprocess_low_dim(train_env.reset())
            n_episodes += 1
            tf.logging.info('@@ episodes: {} @@'.format(n_episodes))
            continue  # begin new episode
    # ====end for t.

    sess.close()
    train_env.close()
    monitor_env.close()
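The update_target_op run in step 3 above is defined outside this excerpt. As a minimal sketch of how such an op is commonly constructed for DDPG in TF1-style code, assuming Polyak (soft) target updates with a hypothetical DDPG_CFG.tau and explicit variable lists, it could look like:

import tensorflow as tf

def build_update_target_op(online_vars, target_vars, tau, global_step):
    # Move each target variable a fraction tau towards its online counterpart,
    # then increment the global step, matching the comment in the training loop.
    soft_updates = [t.assign(tau * o + (1.0 - tau) * t)
                    for o, t in zip(online_vars, target_vars)]
    with tf.control_dependencies(soft_updates):
        return tf.assign_add(global_step, 1)

This is only an assumption about the graph construction; the actual implementation may use hard copies or a different update schedule.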
Esempio n. 27
0
def train(env, args, writer):
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    if args.noisy:
        p1_current_model.update_noisy_modules()
        p1_target_model.update_noisy_modules()
        p2_current_model.update_noisy_modules()
        p2_target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(p1_current_model, args, 1)
        load_model(p2_current_model, args, 2)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        p1_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        p2_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        p1_replay_buffer = ReplayBuffer(args.buffer_size)
        p2_replay_buffer = ReplayBuffer(args.buffer_size)
    
    p1_state_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    p1_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    length_list = []
    p1_reward_list, p1_loss_list = [], []
    p2_reward_list, p2_loss_list = [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            p1_current_model.sample_noise()
            p1_target_model.sample_noise()
            p2_current_model.sample_noise()
            p2_target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), epsilon)
        p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), epsilon)

        if args.render:
            env.render()

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)


        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)
        # A constant per-step penalty of 1 is applied when args.negative is set.
        step_penalty = 1 if args.negative else 0
        p1_reward_deque.append(reward[0] - step_penalty)
        p1_action_deque.append(p1_action)
        p2_reward_deque.append(reward[1] - step_penalty)
        p2_action_deque.append(p2_action)

        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state, np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state, np.float32(done))

        (p1_state, p2_state) = (p1_next_state, p2_next_state)
        p1_episode_reward += (reward[0])
        p2_episode_reward += (reward[1])
        if args.negative:
            p1_episode_reward -= 1
            p2_episode_reward -= 1
        episode_length += 1

        if done or episode_length > args.max_episode_length:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/p1_episode_reward", p1_episode_reward, frame_idx)
            writer.add_scalar("data/p2_episode_reward", p2_episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            p1_episode_reward, p2_episode_reward, episode_length = 0, 0, 0
            p1_state_deque.clear()
            p2_state_deque.clear()
            p1_reward_deque.clear()
            p2_reward_deque.clear()
            p1_action_deque.clear()
            p2_action_deque.clear()

        if len(p1_replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(p1_current_model, p1_target_model, p1_replay_buffer, p1_optimizer, args, beta)
            p1_loss_list.append(loss.item())
            writer.add_scalar("data/p1_loss", loss.item(), frame_idx)

            loss = compute_td_loss(p2_current_model, p2_target_model, p2_replay_buffer, p2_optimizer, args, beta)
            p2_loss_list.append(loss.item())
            writer.add_scalar("data/p2_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, p1_reward_list, length_list, p1_loss_list)
            print_log(frame_idx, prev_frame, prev_time, p2_reward_list, length_list, p2_loss_list)
            p1_reward_list.clear()
            p2_reward_list.clear()
            length_list.clear()
            p1_loss_list.clear()
            p2_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(p1_current_model, args, 1)
            save_model(p2_current_model, args, 2)

    save_model(p1_current_model, args, 1)
    save_model(p2_current_model, args, 2)
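The multi_step_reward helper used when pushing transitions above is not shown in this example. Given that it receives a deque of at most args.multi_step rewards together with args.gamma, a minimal sketch consistent with that call site (an assumption, not the original implementation) is:

def multi_step_reward(rewards, gamma):
    # Discounted sum over the n-step window: r_0 + gamma*r_1 + gamma^2*r_2 + ...
    ret = 0.0
    for idx, reward in enumerate(rewards):
        ret += reward * (gamma ** idx)
    return ret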
Esempio n. 28
0
def learn(logger, device, env, number_timesteps, network, optimizer, save_path,
          save_interval, ob_scale, gamma, grad_norm, double_q, param_noise,
          exploration_fraction, exploration_final_eps, batch_size, train_freq,
          learning_starts, target_network_update_freq, buffer_size,
          prioritized_replay, prioritized_replay_alpha,
          prioritized_replay_beta0, atom_num, min_value, max_value):
    """
    Papers:
    Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep
    reinforcement learning[J]. Nature, 2015, 518(7540): 529.
    Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining Improvements
    in Deep Reinforcement Learning[J]. 2017.

    Parameters:
    ----------
    double_q (bool): if True double DQN will be used
    param_noise (bool): whether or not to use parameter space noise
    exploration_fraction (float): fraction of entire training period over which
                                  the exploration rate is annealed
    exploration_final_eps (float): final value of random action probability
    batch_size (int): size of a batch sampled from the replay buffer for training
    train_freq (int): update the model every `train_freq` steps
    learning_starts (int): how many steps of the model to collect transitions
                           for before learning starts
    target_network_update_freq (int): update the target network every
                                      `target_network_update_freq` steps
    buffer_size (int): size of the replay buffer
    prioritized_replay (bool): if True prioritized replay buffer will be used.
    prioritized_replay_alpha (float): alpha parameter for prioritized replay
    prioritized_replay_beta0 (float): beta parameter for prioritized replay
    atom_num (int): number of atoms in distributional RL (enabled when atom_num > 1)
    min_value (float): min value in distributional RL
    max_value (float): max value in distributional RL

    """

    qnet = network.to(device)
    qtar = deepcopy(qnet)
    if prioritized_replay:
        buffer = PrioritizedReplayBuffer(buffer_size, device,
                                         prioritized_replay_alpha,
                                         prioritized_replay_beta0)
    else:
        buffer = ReplayBuffer(buffer_size, device)
    generator = _generate(device, env, qnet, ob_scale, number_timesteps,
                          param_noise, exploration_fraction,
                          exploration_final_eps, atom_num, min_value,
                          max_value)
    if atom_num > 1:
        delta_z = float(max_value - min_value) / (atom_num - 1)
        z_i = torch.linspace(min_value, max_value, atom_num).to(device)

    infos = {'eplenmean': deque(maxlen=100), 'eprewmean': deque(maxlen=100)}
    start_ts = time.time()
    for n_iter in range(1, number_timesteps + 1):
        if prioritized_replay:
            buffer.beta += (1 - prioritized_replay_beta0) / number_timesteps
        *data, info = next(generator)
        buffer.add(*data)
        for k, v in info.items():
            infos[k].append(v)

        # update qnet
        if n_iter > learning_starts and n_iter % train_freq == 0:
            b_o, b_a, b_r, b_o_, b_d, *extra = buffer.sample(batch_size)
            b_o.mul_(ob_scale)
            b_o_.mul_(ob_scale)

            if atom_num == 1:
                with torch.no_grad():
                    if double_q:
                        b_a_ = qnet(b_o_).argmax(1).unsqueeze(1)
                        b_q_ = (1 - b_d) * qtar(b_o_).gather(1, b_a_)
                    else:
                        b_q_ = (1 - b_d) * qtar(b_o_).max(1, keepdim=True)[0]
                b_q = qnet(b_o).gather(1, b_a)
                abs_td_error = (b_q - (b_r + gamma * b_q_)).abs()
                priorities = abs_td_error.detach().cpu().clamp(1e-6).numpy()
                if extra:
                    loss = (extra[0] * huber_loss(abs_td_error)).mean()
                else:
                    loss = huber_loss(abs_td_error).mean()
            else:
                with torch.no_grad():
                    b_dist_ = qtar(b_o_).exp()
                    b_a_ = (b_dist_ * z_i).sum(-1).argmax(1)
                    b_tzj = (gamma * (1 - b_d) * z_i[None, :] + b_r).clamp(
                        min_value, max_value)
                    b_i = (b_tzj - min_value) / delta_z
                    b_l = b_i.floor()
                    b_u = b_i.ceil()
                    b_m = torch.zeros(batch_size, atom_num).to(device)
                    temp = b_dist_[torch.arange(batch_size), b_a_, :]
                    b_m.scatter_add_(1, b_l.long(), temp * (b_u - b_i))
                    b_m.scatter_add_(1, b_u.long(), temp * (b_i - b_l))
                b_q = qnet(b_o)[torch.arange(batch_size), b_a.squeeze(1), :]
                kl_error = -(b_q * b_m).sum(1)
                # use kl error as priorities as proposed by Rainbow
                priorities = kl_error.detach().cpu().clamp(1e-6).numpy()
                loss = kl_error.mean()

            optimizer.zero_grad()
            loss.backward()
            if grad_norm is not None:
                nn.utils.clip_grad_norm_(qnet.parameters(), grad_norm)
            optimizer.step()
            if prioritized_replay:
                buffer.update_priorities(extra[1], priorities)

        # update target net and log
        if n_iter % target_network_update_freq == 0:
            qtar.load_state_dict(qnet.state_dict())
            logger.info('{} Iter {} {}'.format('=' * 10, n_iter, '=' * 10))
            fps = int(n_iter / (time.time() - start_ts))
            logger.info('Total timesteps {} FPS {}'.format(n_iter, fps))
            for k, v in infos.items():
                v = (sum(v) / len(v)) if v else float('nan')
                logger.info('{}: {:.6f}'.format(k, v))
            if n_iter > learning_starts and n_iter % train_freq == 0:
                logger.info('vloss: {:.6f}'.format(loss.item()))

        if save_interval and n_iter % save_interval == 0:
            torch.save(
                [qnet.state_dict(), optimizer.state_dict()],
                os.path.join(save_path, '{}.checkpoint'.format(n_iter)))
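huber_loss above is applied element-wise to the absolute TD error; assuming the usual smooth-L1 form with a hypothetical threshold kappa, a minimal sketch is:

import torch

def huber_loss(abs_td_error, kappa=1.0):
    # Quadratic for errors <= kappa, linear beyond it; the input is already non-negative.
    quadratic = torch.clamp(abs_td_error, max=kappa)
    linear = abs_td_error - quadratic
    return 0.5 * quadratic.pow(2) + kappa * linear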
Esempio n. 29
0
    def __init__(self, config):
        self.config = config

        self.network_freq = 125  # self.config.conf['HLC-frequency']

        self.reward_decay = 1.0
        self.reward_scale = config.conf['reward-scale']

        self.max_time_per_train_episode = 10  # self.config.conf['max-train-time']
        self.max_step_per_train_episode = int(self.max_time_per_train_episode * self.network_freq)
        self.max_time_per_test_episode = 10  # self.config.conf['max-test-time']  # 16
        self.max_step_per_test_episode = int(self.max_time_per_test_episode * self.network_freq)

        env_name = 'Walker2DBulletEnv-v0'  # alternatives: 'AntBulletEnv-v0', 'HumanoidBulletEnv-v0'
        self.env = gym.make(env_name)
        # self.env.render()

        print(self.env.observation_space)
        print(self.env.action_space)
        self.config.conf['state-dim'] = self.env.observation_space.shape[0]
        self.config.conf['action-dim'] = self.env.action_space.shape[0]

        self.config.conf['actor-logstd-initial'] = np.zeros((1, self.config.conf['action-dim']))
        self.config.conf['actor-logstd-bounds'] = np.ones((2,self.config.conf['action-dim']))
        self.config.conf['actor-output-bounds'] = np.ones((2,self.config.conf['action-dim']))
        self.config.conf['actor-output-bounds'][0][:] = -1 * np.ones(self.config.conf['action-dim'])
        self.config.conf['actor-output-bounds'][1][:] = 1 * np.ones(self.config.conf['action-dim'])

        self.config.conf['actor-logstd-initial'] *= np.log(1.0)  # np.log(min(std*0.25, 1.0))#0.5
        self.config.conf['actor-logstd-bounds'][0] *= np.log(0.2)
        self.config.conf['actor-logstd-bounds'][1] *= np.log(1.0)  # 0.6

        self.agent = Agent(self.env, self.config)

        self.episode_count = 0
        self.step_count = 0
        self.train_iter_count = 0

        self.best_reward = 0
        self.best_episode = 0
        self.best_train_iter = 0

        # load weight from previous network
        # dir_path = 'record/2017_12_04_15.20.44/no_force'  # '2017_05_29_18.23.49/with_force'

        # create new network
        dir_path = 'TRPO/record/' + '3D/' + env_name +'/' + datetime.now().strftime('%Y_%m_%d_%H.%M.%S')
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        if not os.path.exists(dir_path + '/saved_actor_networks'):
            os.makedirs(dir_path + '/saved_actor_networks')
        if not os.path.exists(dir_path + '/saved_critic_networks'):
            os.makedirs(dir_path + '/saved_critic_networks')
        self.logging = logger(dir_path)
        config.save_configuration(dir_path)
        config.record_configuration(dir_path)
        config.print_configuration()
        self.agent.load_weight(dir_path)
        self.dir_path = dir_path

        self.on_policy_paths = []
        self.off_policy_paths = []
        self.buffer = ReplayBuffer(self.config.conf['replay-buffer-size'])

        self.force = [0, 0, 0]
        self.force_chest = [0, 0, 0]  # max(0,force_chest[1]-300*1.0 / EXPLORE)]
        self.force_pelvis = [0, 0, 0]
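The ReplayBuffer constructed from conf['replay-buffer-size'] is defined elsewhere. A minimal fixed-capacity buffer consistent with that single-argument constructor (an assumption about this example's helper, not a shared implementation) could look like:

import random
from collections import deque

class ReplayBuffer(object):
    def __init__(self, buffer_size):
        # A bounded deque evicts the oldest transitions once capacity is reached.
        self._storage = deque(maxlen=buffer_size)

    def add(self, transition):
        self._storage.append(transition)

    def sample(self, batch_size):
        # Uniform sampling without replacement from the stored transitions.
        return random.sample(list(self._storage), batch_size)

    def __len__(self):
        return len(self._storage)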
Esempio n. 30
0
    def __init__(self,
                 states_n: tuple,
                 actions_n: int,
                 hidden_layers: list,
                 scope_name: str,
                 sess=None,
                 learning_rate=1e-4,
                 discount=0.98,
                 replay_memory_size=100000,
                 batch_size=32,
                 begin_train=1000,
                 targetnet_update_freq=1000,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay_step=50000,
                 seed=1,
                 logdir='logs',
                 savedir='save',
                 save_freq=10000,
                 use_tau=False,
                 tau=0.001):
        """

        :param states_n: tuple
        :param actions_n: int
        :param hidden_layers: list
        :param scope_name: str
        :param sess: tf.Session
        :param learning_rate: float
        :param discount: float
        :param replay_memory_size: int
        :param batch_size: int
        :param begin_train: int
        :param targetnet_update_freq: int
        :param epsilon_start: float
        :param epsilon_end: float
        :param epsilon_decay_step: int
        :param seed: int
        :param logdir: str
        :param savedir: str
        :param save_freq: int
        :param use_tau: bool
        :param tau: float
        """
        self.states_n = states_n
        self.actions_n = actions_n
        self._hidden_layers = hidden_layers
        self._scope_name = scope_name
        self.lr = learning_rate
        self._target_net_update_freq = targetnet_update_freq
        self._current_time_step = 0
        self._epsilon_schedule = LinearSchedule(epsilon_decay_step,
                                                epsilon_end, epsilon_start)
        self._train_batch_size = batch_size
        self._begin_train = begin_train
        self._gamma = discount

        self._use_tau = use_tau
        self._tau = tau

        self.savedir = savedir
        self.save_freq = save_freq

        self.qnet_optimizer = tf.train.AdamOptimizer(self.lr)

        self._replay_buffer = ReplayBuffer(replay_memory_size)

        self._seed(seed)

        with tf.Graph().as_default():
            self._build_graph()
            self._merged_summary = tf.summary.merge_all()

            if sess is None:
                self.sess = tf.Session()
            else:
                self.sess = sess
            self.sess.run(tf.global_variables_initializer())

            self._saver = tf.train.Saver()

            self._summary_writer = tf.summary.FileWriter(logdir=logdir)
            self._summary_writer.add_graph(tf.get_default_graph())
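LinearSchedule is instantiated above as LinearSchedule(epsilon_decay_step, epsilon_end, epsilon_start). A minimal sketch that matches that argument order (an assumption; the real class may differ) is:

class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Interpolate linearly from initial_p to final_p over schedule_timesteps,
        # then hold final_p afterwards.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)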