Beispiel #1
0
    def test_model(self):
        """End-to-end smoke test of the AttentionSAC API: rollout stepping,
        batching a replay sample, and one critic/policy/target update.

        Two agent types are configured: type 0 with obs dim 5 / action dim 3
        and type 1 with obs dim 5 / action dim 2.
        """
        # Surface NaN/inf producers immediately during backward passes.
        torch.autograd.set_detect_anomaly(True)
        self.algo = AttentionSAC([(5, 3), (5, 2)],
                                 tau=0.01,
                                 pi_lr=0.01,
                                 q_lr=0.01,
                                 gamma=0.95,
                                 pol_hidden_dim=128,
                                 critic_hidden_dim=128,
                                 attend_heads=4,
                                 reward_scale=10.)

        self.algo.prep_rollouts(device='cpu')

        sample: Dict[AgentKey, AgentObservation] = \
            {AgentKey(0, '0-1'): AgentObservation([1, 2, 3, 2, 3]),
             AgentKey(0, '0-2'): AgentObservation([2, 4, 3, 2, 4]),
             AgentKey(0, '0-3'): AgentObservation([2, 4, 3, 2, 4]),
             AgentKey(1, '0-1'): AgentObservation([1, 1, 3, 1, 4]),
             AgentKey(1, '0-2'): AgentObservation([1, 1, 3, 1, 4])}

        results = self.algo.step(sample, explore=True)

        # Action lengths must match each agent type's action dimension.
        self.assertEqual(len(results[AgentKey(0, '0-1')].action), 3)
        self.assertEqual(len(results[AgentKey(1, '0-1')].action), 2)

        # Every observing agent must receive an action.
        for key in sample:
            self.assertTrue(key in results)

        # Repeated stepping should be stable (no state corruption).
        for _ in range(20):
            self.algo.step(sample)

        self.algo.prep_training(device='cpu')

        # Generate random training sample: 3 timesteps of replay frames
        # (obs, one-hot action, reward, done, next_obs) per agent.
        train_sample: List[Dict[AgentKey, AgentReplayFrame]] = \
            [{AgentKey(0, '0-1'): AgentReplayFrame([rval() for _ in range(5)], [0, 1, 0], 5, False, [rval() for _ in range(5)]),
              AgentKey(0, '0-2'): AgentReplayFrame([rval() for _ in range(5)], [1, 0, 0], 5, False, [rval() for _ in range(5)]),
              AgentKey(0, '0-3'): AgentReplayFrame([rval() for _ in range(5)], [0, 1, 0], 5, False, [rval() for _ in range(5)]),
              AgentKey(1, '0-1'): AgentReplayFrame([rval() for _ in range(5)], [0, 1], 5, False, [rval() for _ in range(5)]),
              AgentKey(1, '0-2'): AgentReplayFrame([rval() for _ in range(5)], [0, 1], 5, False, [rval() for _ in range(5)])}
             for _ in range(3)]
        # Use a new name instead of re-annotating `train_sample`, which a
        # type checker rejects as an incompatible redefinition.
        batched_sample: Dict[AgentKey,
                             BatchedAgentReplayFrame] = preprocess_to_batch(
                                 train_sample)
        self.algo.update_critic(batched_sample, logger=None)
        self.algo.update_policies(batched_sample, logger=None)
        self.algo.update_all_targets()
Beispiel #2
0
def run(config):
    """Roll out a saved AttentionSAC model for visual inspection, pacing
    rendering to config.fps and optionally saving each episode as a GIF."""
    run_root = (Path('./models') / config.env_id / config.model_name /
                ('run%i' % config.run_num))
    if config.incremental is None:
        checkpoint = run_root / 'model.pt'
    else:
        checkpoint = (run_root / 'incremental' /
                      ('model_ep%i.pt' % config.incremental))

    if config.save_gifs:
        gif_path = checkpoint.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    model = AttentionSAC.init_from_save(checkpoint)
    env = make_env(config.env_id, discrete_action=True)
    model.prep_rollouts(device='cpu')
    frame_interval = 1 / config.fps  # inter-frame interval in seconds

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = [env.render('rgb_array')[0]]
        env.render('human')
        for _ in range(config.episode_length):
            tick = time.time()
            # One no-grad torch Variable per agent's observation row.
            torch_obs = [
                Variable(torch.Tensor(obs[a]).view(1, -1),
                         requires_grad=False)
                for a in range(model.nagents)
            ]
            # Deterministic actions for evaluation, flattened to numpy.
            torch_actions = model.step(torch_obs, explore=False)
            actions = [act.data.numpy().flatten() for act in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            # Sleep off any leftover time so playback matches the fps.
            spent = time.time() - tick
            if spent < frame_interval:
                time.sleep(frame_interval - spent)
            env.render('human')
        if config.save_gifs:
            # Pick the first unused gif index for this episode.
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames,
                            duration=frame_interval)

    env.close()
Beispiel #3
0
def run(config):
    """Roll out a saved MAAC model in the Google Research Football env,
    converting one-hot agent actions to the integer ids the env expects."""
    env = football_env.create_environment(
        env_name=config["academy_scenario"],
        rewards=config["scoring"],
        render=config["render_mode"],
        number_of_left_players_agent_controls=config["num_to_control"],
        representation='raw')

    model = AttentionSAC.init_from_save(
        "./models/football/MAAC3/run2/model.pt", True)
    # (** EDITED **) Set Replay Buffer
    # Buffer shapes would be derived here by iterating
    # env.action_space / env.observation_space.

    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        obs = make_state(env.reset())
        model.prep_rollouts(device='cpu')

        for et_i in range(config["episode_length"]):
            print("episode : {} | step : {}".format(ep_i, et_i), end='\r')
            # One torch Variable of stacked observations per agent.
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, agent])),
                         requires_grad=False)
                for agent in range(model.nagents)
            ]
            # Exploratory actions, converted to numpy arrays.
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [act.data.numpy() for act in torch_agent_actions]
            # Regroup per-agent actions by environment thread.
            actions = [[act[env_i] for act in agent_actions]
                       for env_i in range(config["n_rollout_threads"])]

            # The football env takes integer action ids rather than one-hot
            # vectors, so take the argmax of each agent's action.
            actions_list = [[np.argmax(onehot) for onehot in per_env]
                            for per_env in actions]

            next_obs, rewards, dones, infos = env.step(actions_list)
            next_obs = make_state(next_obs)

            # Small negative reward offset; without it training diverges
            # to NaN.
            rewards = rewards - 0.000001

            # Done flags would be reshaped for the replay buffer here.
            obs = next_obs

    env.close()
Beispiel #4
0
def run(model_name: str):
    """Load the latest saved checkpoint for `model_name` and simulate it
    with exploration in the Halite helper environment (2 agents)."""
    model_path, _run_num, _run_dir, _log_dir = run_setup(
        model_name, get_latest_model=True)

    if model_path is None:
        print("Couldn't find model!")
        return

    model = AttentionSAC.init_from_save(model_path)
    model.prep_rollouts(device='cpu')

    simulator: HaliteRunHelper = HaliteRunHelper()
    simulator.simulate(lambda obs: model.step(obs, explore=True),
                       agent_count=2)
Beispiel #5
0
def run(config):
    """Evaluate a saved AttentionSAC model over config.n_episodes episodes
    and print the mean per-agent reward sum ("goal difference") at episode
    termination.
    """
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    maac = AttentionSAC.init_from_save(model_path)
    env = MultiAgentEnv(config.env_id, config.n_controlled_lagents,
                        config.n_controlled_ragents, config.reward_type,
                        config.render)
    maac.prep_rollouts(device='cpu')

    goal_diff = 0

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        for _ in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(maac.nagents)
            ]
            # get actions as torch Variables (deterministic for evaluation)
            torch_actions = maac.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if all(dones):
                # Episode finished: accumulate the mean per-agent reward and
                # stop. (The original tested `all(dones)` twice in a row.)
                goal_diff += np.sum(rewards) / (config.n_controlled_lagents +
                                                config.n_controlled_ragents)
                break
    goal_diff /= config.n_episodes
    print(goal_diff)
    env.close()
Beispiel #6
0
def run(config):
    """Train AttentionSAC on a parallel environment and periodically save
    checkpoints under ./models/<env_id>/<model_name>/run<N>/.

    The run number is one greater than any existing `run*` directory so
    repeated invocations never overwrite earlier runs.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    # Seed torch and numpy from the run number: distinct but reproducible.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    envActionSpace = env.action_space
    envObservationSpace = env.observation_space

    model = AttentionSAC.init_from_env(
        envActionSpace,
        envObservationSpace,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)
    # Flat replay buffer sized from each agent's obs/action dimensions.
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0  # total environment steps across all rollout threads
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            # Run num_updates gradient steps whenever the global step
            # counter crosses a steps_per_update boundary.
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads
                ):  # e.g. 100 steps across rollouts -> 4 updates
                model.prep_training(device='cpu')

                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size)
                    model.update_critic(sample)
                    model.update_policies(sample)
                    model.update_all_targets()

                model.prep_rollouts(device='cpu')
        # NOTE: the original computed replay_buffer.get_average_rewards(...)
        # into an unused `ep_rews` here; the dead assignment was removed.

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
Beispiel #7
0
def run(config):
    """Train an AttentionSAC model (restored from a saved checkpoint) on a
    parallel environment, log per-agent mean episode rewards to
    TensorBoard, and record episode lengths to Timesteps_vs_Episodes.csv.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    # Pick the next unused runN directory so earlier runs are preserved.
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    # Seed both torch and numpy from the run number for reproducibility.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)

    # Model used to test with adversarial agent 
    # model= AttentionSAC.init_from_save ("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run140\\model.pt")
    # print("Model instantiated")

    # Model used to test without adversarial agent 
    # NOTE(review): hard-coded absolute Windows path — confirm before reuse.
    model= AttentionSAC.init_from_save ("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run148\\model.pt")
    print("Model instantiated")

    # Flat replay buffer sized from each agent's obs/action dimensions.
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0  # total environment steps taken across all rollout threads

    row_list = []  # (episode number, timestep count) rows for the CSV

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            # print (rewards)
            # print (dones[0])
            # env.render('human')
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            # Run num_updates gradient steps each time the step counter
            # crosses a steps_per_update boundary (buffer permitting).
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    #print(sample)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

            # Stop the episode early when the first environment reports done.
            if (dones[0][0]):
                print("Breakin the epsiodeeeee at timestep", et_i)
                break
        
        # Convert the 0-based loop index into a step count.
        # NOTE(review): relies on `et_i` leaking out of the for-loop; it is
        # undefined if config.episode_length == 0.
        et_i += 1   

        row_list.append((ep_i+1,et_i))   

        # Mean reward over the steps actually taken this episode.
        ep_rews = replay_buffer.get_average_rewards(
            et_i * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * et_i, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    # Dump (episode, timestep-count) pairs collected above.
    with open('Timesteps_vs_Episodes.csv', 'w', newline='') as file:
         writer = csv.writer(file)
         writer.writerow(["Ep No", "Number of Timesteps"])
         for row in row_list:
            writer.writerow(row)

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Beispiel #8
0
def run(config):
    """Train AttentionSAC from scratch on a parallel environment, logging
    per-agent mean episode rewards to TensorBoard and saving periodic
    checkpoints under ./models/<env_id>/<model_name>/run1/.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    # if not model_dir.exists():
    #     run_num = 1
    # else:
    #     exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
    #                      model_dir.iterdir() if
    #                      str(folder.name).startswith('run')]
    #     if len(exst_run_nums) == 0:
    #         run_num = 1
    #     else:
    #         run_num = max(exst_run_nums) + 1
    # NOTE(review): run-number auto-increment is disabled; every invocation
    # reuses 'run1' and overwrites its contents.
    run_num = 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir,exist_ok=True)
    logger = SummaryWriter(str(log_dir))

    # Seed torch and numpy from the run number for reproducibility.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = AttentionSAC.init_from_env(env,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,
                                       critic_hidden_dim=config.critic_hidden_dim,
                                       attend_heads=config.attend_heads,
                                       reward_scale=config.reward_scale)
    # Flat replay buffer sized from each agent's obs/action dimensions.
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0  # total environment steps across all rollout threads
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            # Run num_updates gradient steps each time the step counter
            # crosses a steps_per_update boundary (buffer permitting).
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        # Mean reward over the most recent episode's worth of steps.
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config.episode_length, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Beispiel #9
0
def run(config):
    """Visualize a restored AttentionSAC policy in a single (non-parallel)
    environment, rendering each step at ~10 fps.

    The replay-buffer push, training updates, logging and saving at the
    bottom are all disabled — most of them live inside a dead triple-quoted
    string literal and are never executed.
    """
    cover_ratio = []

    model_dir = Path('./models') / config.env_id / config.model_name
    # Pick the next unused runN directory (kept for the disabled save code).
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)
    # logger = SummaryWriter(str(log_dir))

    #    torch.manual_seed(run_num)
    #    np.random.seed(run_num)
    #env = make_parallel_env(, config.n_rollout_threads, run_num)
    env = make_env(config.env_id,
                   benchmark=BENCHMARK,
                   discrete_action=True,
                   use_handcraft_policy=config.use_handcraft_policy)
    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)

    # Overwrite the freshly initialized weights with a saved checkpoint.
    model.init_from_save_self('./models/swift_scenario/model/run8/model.pt')
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0

    update_count = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(model.nagents)
            ]

            # get actions as torch Variables (deterministic for evaluation)
            torch_agent_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [
                ac.data.numpy().squeeze() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            # agent_actions[0][5]=1
            # agent_actions[1][5]=1
            # agent_actions[2][5]=1
            next_obs, rewards, dones, infos = env.step(
                agent_actions,
                use_handcraft_policy=config.use_handcraft_policy)
            env.render()
            # Slow rendering down to roughly 10 frames per second.
            time.sleep(0.1)

            # # # get actions as torch Variables
            # torch_agent_actions = model.step(torch_obs, explore=True)
            # # convert actions to numpy arrays
            # agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            # next_obs, rewards, dones, infos = env.step(actions)
            # env.render()

            #if et_i == config.episode_length - 1:
            #print(infos)
            #print(type(infos['cover_ratio']))
            #cover_ratio.append(float(infos[0]['n'][0]['cover_ratio']))
            #print(infos)

            #            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            # NOTE(review): everything from the triple-quoted string below to
            # its matching closing quotes is a dead string literal (disabled
            # training/logging/saving code) — it is never executed.
            '''
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):

                    update_count += 1
                    print("episode:", ep_i, ", total steps:", t, " update_count:", update_count)

                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

        logger.export_scalars_to_json(str(log_dir / 'summary.json'))

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    print(cover_ratio)
    '''
    env.close()
Beispiel #10
0
def run(config):
    """Train an AttentionSAC (MAAC) model on a parallel multi-agent env.

    Creates a fresh numbered run directory under
    ``./models/<env_id>/<model_name>``, seeds the RNGs with the run number,
    builds the model and replay buffer from the environment spaces, then
    alternates rollout collection with SAC updates.  Per-agent and global
    rewards go to TensorBoard; checkpoints are written on a reward
    threshold, on new bests, and on a fixed interval.

    Args:
        config: parsed argument namespace.  Attributes read here include
            env_id, model_name, gpu, tau, pi_lr, q_lr, gamma,
            pol_hidden_dim, critic_hidden_dim, attend_heads, reward_scale,
            buffer_length, n_episodes, n_rollout_threads, episode_length,
            epoch_size, batch_size, steps_per_update, num_updates,
            save_interval, n_controlled_lagents, n_controlled_ragents,
            reward_type, render.
    """
    # Use CUDA only when requested on the command line AND actually present.
    USE_CUDA = False
    if config.gpu:
        if torch.cuda.is_available():
            USE_CUDA = True
    # Pick the next free runN directory under models/<env_id>/<model_name>.
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

#     model_run = 'run%i' % max(exst_run_nums)
#     model_path = model_dir / model_run / 'model.pt'

    # Seed with the run number so each run is reproducible but distinct.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num,
                            config.n_controlled_lagents, config.n_controlled_ragents, config.reward_type, config.render)
    model = AttentionSAC.init_from_env(env,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,
                                       critic_hidden_dim=config.critic_hidden_dim,
                                       attend_heads=config.attend_heads,
                                       reward_scale=config.reward_scale)

#     model = AttentionSAC.init_from_save_(model_path, load_critic=False, gpu=USE_CUDA)

    # Per-agent observation and action dimensions drive the buffer layout.
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    best_rewards = 0
    t = 0  # total environment steps taken across all rollout threads
    num_episodes = 0  # exact (in-game) episodes completed, across epochs
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):

        # Start a fresh per-epoch statistics dict at each epoch boundary.
        if ep_i % (config.epoch_size * config.n_rollout_threads) == 0:
            stat = dict()
            stat['epoch'] = int(ep_i / (config.epoch_size * config.n_rollout_threads) + 1)

        obs = env.reset()
        model.prep_rollouts(device='cpu')

        # Per-rollout-thread accounting for this training episode; the
        # *_buffer entries snapshot values at the last completed in-game
        # episode so partial episodes at the cutoff don't skew the stats.
        s = dict()
        s['dones'] = [0 for i in range(config.n_rollout_threads)]
        s['num_episodes'] = [0 for i in range(config.n_rollout_threads)]
        s['reward'] = [0 for i in range(config.n_rollout_threads)]
        s['success'] = [0 for i in range(config.n_rollout_threads)]
        s['steps_taken'] = [0 for i in range(config.n_rollout_threads)]
        s['reward_buffer'] = [0 for i in range(config.n_rollout_threads)]
        s['steps_buffer'] = [0 for i in range(config.n_rollout_threads)]

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            # Update once the buffer can fill a batch and a multiple of
            # steps_per_update has been crossed by this thread batch.
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=USE_CUDA)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

            # Track per-thread episode statistics (reward, steps, success).
            for i in range(config.n_rollout_threads):
                s['reward'][i] += np.mean(rewards[i])
                s['steps_taken'][i] += 1
                if dones[i][0] == True:
                    # In-game episode finished: snapshot its totals.
                    s['dones'][i] += 1
                    s['num_episodes'][i] += 1
                    s['reward_buffer'][i] = s['reward'][i]
                    s['steps_buffer'][i] = s['steps_taken'][i]
                    # 'score_reward' == 1 marks a scored goal / success.
                    if infos[i]['score_reward'] == 1:
                        s['success'][i] += 1
                if et_i == config.episode_length-1:
                    # Episode cutoff: discard the trailing partial episode
                    # by reverting to the last snapshot; if no episode ever
                    # completed, count the partial one so ratios stay defined.
                    if dones[i][0] == False:
                        if s['dones'][i] > 0:
                            s['reward'][i] = s['reward_buffer'][i]
                            s['steps_taken'][i] = s['steps_buffer'][i]
                        else:
                            s['num_episodes'][i] += 1

        # Mean reward per agent over the most recent episode's transitions.
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        global_ep_rews = 0
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalars('agent%i/rewards' % a_i, {'mean_episode_rewards': a_ep_rew}, ep_i)
            # Average the per-agent means over all controlled agents.
            global_ep_rews += a_ep_rew / (config.n_controlled_lagents + config.n_controlled_ragents)
        logger.add_scalars('global', {'global_rewards': global_ep_rews}, ep_i)

        # Checkpoint when the global reward crosses a fixed threshold.
        # NOTE(review): 0.007/0.005 look env-specific magic thresholds.
        if global_ep_rews > 0.007:
            model.save(run_dir / ('model_ep%i.pt' % ep_i))
#             print('model saved at ep%i' % ep_i)
#             print('saved model reward: ', global_ep_rews)

        if global_ep_rews > best_rewards:
            best_rewards = global_ep_rews
            if best_rewards > 0.005:
                model.save(run_dir / ('best_model_ep%i.pt' % ep_i))
#                 print('best model saved at ep%i' % ep_i)
#                 print('best global reward: ', best_rewards)

#         if ep_i%500 == 0:
#             print('episode: ', ep_i)
#             print('global reward: ', global_ep_rews)
#             print('best global reward: ', best_rewards)

        # Periodic incremental checkpoint plus rolling 'model.pt'.
        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

        # An exact episode means a real episode in the game, rather than the episode in a training loop
        # Mean (exact) episode data are only generated from complete exact episodes
        # We calculate the mean (exact) episode data in each epoch
        # (config.epoch_size * config.n_rollout_threads) means the number of training episodes an epoch includes
        # The mean (exact) episode data are used for visualization and comparison
        # Reward, Steps-Taken, Success

        stat['num_episodes'] = stat.get('num_episodes', 0) + np.sum(s['num_episodes'])
        stat['reward'] = stat.get('reward', 0) + np.sum(s['reward'])
        stat['success'] = stat.get('success', 0) + np.sum(s['success'])
        stat['steps_taken'] = stat.get('steps_taken', 0) + np.sum(s['steps_taken'])

        # End of epoch: print mean per-exact-episode statistics.
        if (ep_i+config.n_rollout_threads) % (config.epoch_size * config.n_rollout_threads) == 0:
            num_episodes += stat['num_episodes']
            print('Epoch {}'.format(stat['epoch']))
            print('Episode: {}'.format(num_episodes))
            print('Reward: {}'.format(stat['reward']/stat['num_episodes']))
            print('Success: {:.2f}'.format(stat['success']/stat['num_episodes']))
            print('Steps-Taken: {:.2f}'.format(stat['steps_taken']/stat['num_episodes']))

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Beispiel #11
0
def run(config):
    """Train an AttentionSAC (MAAC) model with a dict-style config.

    Variant of the training driver that reads ``config`` as a mapping
    (``config["key"]``) and hard-codes the buffer layout to 11 agents with
    115-dim observations and 19 discrete actions (presumably a fixed
    11-player football setup — TODO confirm against the env).  Episodes run
    until any sub-env reports done rather than for a fixed length.

    Args:
        config: mapping with keys env_id, model_name, n_rollout_threads,
            tau, pi_lr, q_lr, gamma, pol_hidden_dim, critic_hidden_dim,
            attend_heads, reward_scale, buffer_length, n_episodes,
            batch_size, steps_per_update, use_gpu, num_updates,
            episode_length, save_interval.
    """
    # Pick the next free runN directory under models/<env_id>/<model_name>.
    model_dir = Path('./models') / config["env_id"] / config["model_name"]
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    # Seed with the run number so each run is reproducible but distinct.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config["env_id"], config["n_rollout_threads"],
                            run_num)
    model = AttentionSAC.init_from_env(
        env,
        tau=config["tau"],
        pi_lr=config["pi_lr"],
        q_lr=config["q_lr"],
        gamma=config["gamma"],
        pol_hidden_dim=config["pol_hidden_dim"],
        critic_hidden_dim=config["critic_hidden_dim"],
        attend_heads=config["attend_heads"],
        reward_scale=config["reward_scale"])
    # Hard-coded layout: 11 agents, 115-dim observations, 19 actions each.
    replay_buffer = ReplayBuffer(config["buffer_length"], model.nagents,
                                 [115 for _ in range(11)],
                                 [19 for _ in range(11)])
    t = 0  # total environment steps taken across all rollout threads
    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config["n_rollout_threads"],
               config["n_episodes"]))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        done = [False]
        et_i = 0  # step counter within the current episode

        # Roll out until any environment signals termination.
        while not any(done):
            et_i += 1
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config["n_rollout_threads"])]

            # Convert one-hot action vectors to discrete action indices.
            actions_list = []
            for a in actions:
                temp = []
                for b in a:
                    temp.append(np.argmax(b))
                actions_list.append(temp)

            next_obs, rewards, done, infos = env.step(actions_list)

            # Broadcast the env-level done flag to all 11 agents.
            dones = [done for _ in range(11)]

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config["n_rollout_threads"]
            # Update once the buffer can fill a batch and a multiple of
            # steps_per_update has been crossed by this thread batch.
            if (len(replay_buffer) >= config["batch_size"]
                    and (t % config["steps_per_update"]) <
                    config["n_rollout_threads"]):
                if config["use_gpu"]:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config["num_updates"]):
                    sample = replay_buffer.sample(config["batch_size"],
                                                  to_gpu=config["use_gpu"])
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

            print("ep_i : {} | et_i : {}".format(ep_i, et_i), end='\r')

        # Mean reward per agent over the most recent episode's transitions.
        ep_rews = replay_buffer.get_average_rewards(
            config["episode_length"] * config["n_rollout_threads"])

        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config["episode_length"], ep_i)

        # Periodic incremental checkpoint plus rolling 'model.pt'.
        if ep_i % config["save_interval"] < config["n_rollout_threads"]:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Beispiel #12
0
def train(env):
    """Train an AttentionSAC (MAAC) model on a Flatland rail environment.

    Builds a ``RailEnv`` from the parameter dict, wires up (fast or
    standard) tree observations, constructs one SAC agent per train, then
    runs episodes that interleave rollouts, replay-buffer pushes and
    periodic SAC updates.  Average rewards are logged to wandb and the
    model is checkpointed every ``steps_to_save_model`` episodes.

    Args:
        env: parameter dict with keys "n_agents", "x_dim", "y_dim",
            "n_cities", "max_rails_between_cities", "max_rails_in_city".
            (The name is rebound below to the constructed RailEnv.)
    """
    n_agents = env["n_agents"]
    x_dim = env["x_dim"]
    y_dim = env["y_dim"]
    n_cities = env["n_cities"]
    max_rails_between_cities = env["max_rails_between_cities"]
    max_rails_in_city = env["max_rails_in_city"]
    seed = 0
    use_fast_tree_obs = False

    # Observation parameters
    observation_tree_depth = 4
    observation_radius = 10
    observation_max_path_depth = 30

    # Seed python and numpy RNGs for reproducible env generation.
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder: FastTreeObs is a compact hand-crafted feature
    # set; the standard TreeObs produces the full feature tree.
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    if use_fast_tree_obs:
        tree_observation = FastTreeObs(max_depth=observation_tree_depth)
        print("Using FastTreeObs")
    else:
        tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                             predictor=predictor)
        print("Using StandardTreeObs")

    # All trains run at full speed (fraction -> share of the fleet).
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    rewards = []
    obs, info = env.reset()

    if use_fast_tree_obs:
        state_size = tree_observation.observation_dim
    else:
        # Calculate the state size given the depth of the tree observation and the
        # number of features
        n_features_per_node = env.obs_builder.observation_dim
        n_nodes = 0
        for i in range(observation_tree_depth + 1):
            n_nodes += np.power(4, i)

        state_size = n_features_per_node * n_nodes

    action_size = 5  # Flatland: DO_NOTHING, LEFT, FORWARD, RIGHT, STOP

    DEVICE = 'cpu'
    # if torch.cuda.is_available():
    # 	DEVICE = 'gpu'

    buffer_length = 10000
    steps_to_save_model = 10
    step_size = 100
    num_steps = 100  # update every 100 steps
    avg_steps = 20  # num steps to average and plot rewards
    reward_q = []
    batch_size = 100

    agent_obs = np.array([None] * env.get_num_agents())

    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    num_episodes = 100000

    agent_init_params = []
    sa_size = []

    for i in range(n_agents):
        agent_init_params.append({
            'num_in_pol': state_size,
            'num_out_pol': action_size,
            'init_weights': 'model.pt'
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    # NOTE(review): clearing init_dict presumably shrinks checkpoints, but
    # it also breaks AttentionSAC.init_from_save for them — confirm intent.
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()

    for ep in range(num_episodes):
        print("Episode " + str(ep) + ":", flush=True)
        obs, info = env.reset(True, True)
        model.prep_rollouts(device=DEVICE)
        reward_sum_for_this_episode = 0

        for steps in range(max_steps):
            if steps % step_size == 0:
                print("=", end="", flush=True)
            # Normalize each agent's current observation; agents without an
            # observation (e.g. not yet active) get a zero vector.
            for agent in env.get_agent_handles():
                if obs[agent] is not None:
                    if use_fast_tree_obs:
                        agent_obs[agent] = obs[agent]
                    else:
                        agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    agent_obs[agent] = np.array([0.] * state_size)

            action_dict = {}
            agent_actions = []

            torch_obs = [
                Variable(torch.Tensor([agent_obs[i]]), requires_grad=False)
                for i in range(n_agents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # Decode each agent's one-hot action sample into a discrete
            # action index (position of the first non-zero entry).
            for i in range(n_agents):
                dist = torch_agent_actions[i][0]
                idx = -1
                for j in range(action_size):
                    if dist[j] != 0:
                        idx = j
                        break
                action_dict[i] = idx

            next_obs, all_rewards, done, info = env.step(action_dict)

            rewards = []
            dones = []

            next_agent_obs = np.array([None] * env.get_num_agents())

            for agent in env.get_agent_handles():
                if next_obs[agent] is not None:
                    if use_fast_tree_obs:
                        next_agent_obs[agent] = next_obs[agent]
                    else:
                        # BUGFIX: was normalizing obs[agent] (the *previous*
                        # observation) instead of the new one, so the buffer
                        # stored identical obs/next_obs pairs.
                        next_agent_obs[agent] = normalize_observation(
                            next_obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    next_agent_obs[agent] = np.array([0.] * state_size)

            for i in range(n_agents):
                reward_sum_for_this_episode += all_rewards[i]
                # BUGFIX: the shaping bonus previously used the stale loop
                # variable `agent` (always the last handle) and was applied
                # after the raw reward had already been appended, so it never
                # reached the replay buffer.  Shape the stored reward with
                # the matching agent's own observation instead.
                rewards.append(all_rewards[i] + augment_reward(agent_obs[i]))
                dones.append(done[i])

            replay_buffer.push(np.array([agent_obs]), np.array(agent_actions),
                               np.array([rewards]), np.array([next_agent_obs]),
                               np.array([dones]))

            # Only sample once the buffer holds a full batch (consistent
            # with the other training drivers in this file); previously this
            # fired at steps == 0 on a near-empty buffer.
            if steps % num_steps == 0 and len(replay_buffer) >= batch_size:
                model.prep_training(device=DEVICE)
                sample = replay_buffer.sample(batch_size, norm_rews=False)
                model.update_critic(sample)
                model.update_policies(sample)
                model.update_all_targets()
                model.prep_rollouts(device=DEVICE)

        reward_sum_for_this_episode /= n_agents
        reward_q.append(reward_sum_for_this_episode)

        # Log the rolling mean reward every `avg_steps` episodes.
        if len(reward_q) == avg_steps:
            wandb.log({'reward': np.mean(reward_q)})
            reward_q = []

        print()

        if ep % steps_to_save_model == 0:
            print("\nSaving model")
            model.save(os.getcwd() + "/model.pt")
            cur_time = time.time()
            time_elapsed = (cur_time - start_time) // 60
            print("Time Elapsed: " + str(time_elapsed) + "\n")
Beispiel #13
0
def run(halite_env: BaseEnv, load_latest: bool=False):
    """Drive MAAC training against a Halite environment.

    Builds (or reloads) an AttentionSAC model, then repeatedly simulates
    games to fill the replay buffer and runs batched SAC updates, logging
    per-agent and global rewards and periodically checkpointing into the
    run directory produced by ``run_setup``.

    Args:
        halite_env: environment wrapper exposing ``config``,
            ``agent_type_topologies`` and ``simulate``.
        load_latest: when True, resume from the newest saved model.
    """
    cfg = halite_env.config

    model_path, run_num, run_dir, log_dir = run_setup(
        cfg.model_name, get_latest_model=load_latest)

    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    # Seed torch and numpy with the run number for reproducibility.
    torch.manual_seed(run_num)
    np.random.seed(run_num)

    # Resume from a checkpoint when one was found, else build a fresh model.
    if model_path is not None:
        model = AttentionSAC.init_from_save(model_path, load_critic=True)
    else:
        model = AttentionSAC(halite_env.agent_type_topologies,
                             tau=cfg.tau,
                             pi_lr=cfg.pi_lr,
                             q_lr=cfg.q_lr,
                             gamma=cfg.gamma,
                             pol_hidden_dim=cfg.pol_hidden_dim,
                             critic_hidden_dim=cfg.critic_hidden_dim,
                             attend_heads=cfg.attend_heads,
                             reward_scale=cfg.reward_scale)

    replay_buffer = ReplayBuffer(cfg.buffer_length)

    prev_time = time.perf_counter()
    games_played = 0

    for ep_i in range(0, cfg.n_episodes, cfg.n_rollout_threads):
        curr_time = time.perf_counter()
        print("Episodes %i-%i of %i (%is)" % (ep_i + 1,
                                              ep_i + 1 + cfg.n_rollout_threads,
                                              cfg.n_episodes,
                                              (curr_time - prev_time)))
        model.prep_rollouts(device='cpu')

        # Roll out one batch of games; transitions land in the replay buffer.
        game_reward = halite_env.simulate(
            lambda o: model.step(o, explore=True), replay_buffer)

        games_played += cfg.n_rollout_threads
        buffer_ready = replay_buffer.length() >= cfg.batch_size
        due = (games_played % cfg.games_per_update) < cfg.n_rollout_threads
        if buffer_ready and due:
            print("Training")
            model.prep_training(device='gpu' if cfg.use_gpu else 'cpu')
            for _ in range(cfg.num_updates):
                raw_sample: List[Dict[AgentKey, AgentReplayFrame]] = \
                    replay_buffer.sample(cfg.batch_size)
                # Collate per-frame samples into per-agent batches.
                batched: Dict[AgentKey, BatchedAgentReplayFrame] = \
                    preprocess_to_batch(raw_sample, to_gpu=cfg.use_gpu)
                model.update_critic(batched, logger=logger)
                model.update_policies(batched, logger=logger)
                model.update_all_targets()
            model.prep_rollouts(device='cpu')

        # Log per-agent mean episode rewards and the global game reward.
        ep_rews = replay_buffer.get_average_rewards(
            cfg.episode_length * cfg.n_rollout_threads)
        for k, v in ep_rews.items():
            logger.add_scalar('agent%s/mean_episode_rewards' % str(k), v, ep_i)
        logger.add_scalar("global_env_rewards", game_reward, ep_i)

        # Periodic incremental checkpoint plus rolling 'model.pt'.
        if ep_i % cfg.save_interval < cfg.n_rollout_threads:
            print("Saving")
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
            print("run_dir", run_dir)

        prev_time = curr_time

    model.save(run_dir / 'model.pt')
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Beispiel #14
0
def run(config):
    """Train AttentionSAC wolves and sheep in a custom chasing environment.

    Hand-assembles a 4-wolf / 1-sheep / 2-block particle-chasing world from
    functional building blocks (reward, observation, transition), builds an
    AttentionSAC model over its spaces, and runs the standard
    rollout-then-update MAAC training loop with periodic checkpoints.

    Args:
        config: parsed argument namespace; attributes read here include
            env_id, model_name, tau, pi_lr, q_lr, gamma, pol_hidden_dim,
            critic_hidden_dim, attend_heads, reward_scale, buffer_length,
            n_episodes, n_rollout_threads, episode_length, batch_size,
            steps_per_update, num_updates, save_interval.
    """

    # --- World layout: entity IDs partitioned into wolves, sheep, blocks ---
    numWolves = 4
    numSheep = 1
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    # --- Physical properties per entity (size, speed, movability, mass) ---
    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [
        blockSize
    ] * numBlocks
    sheepMaxSpeed = 1.3
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None  # blocks are static obstacles
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    # --- Rewards: sheep are punished for collisions/out-of-bounds, wolves
    # are rewarded for catching sheep (shared reward, no action cost) ---
    collisionReward = 10
    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID,
                              sheepsID,
                              entitiesSizeList,
                              getPosFromAgentState,
                              isCollision,
                              punishForOutOfBound,
                              collisionPunishment=collisionReward)

    individualRewardWolf = 0  # 0 => wolves share the team reward
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision,
                            collisionReward, individualRewardWolf)
    reshapeAction = ReshapeAction()
    costActionRatio = 0  # 0 => action cost term vanishes
    getActionCost = GetActionCost(costActionRatio,
                                  reshapeAction,
                                  individualCost=True)
    getWolvesAction = lambda action: [action[wolfID] for wolfID in wolvesID]
    rewardWolfWithActionCost = lambda state, action, nextState: np.array(
        rewardWolf(state, action, nextState)) - np.array(
            getActionCost(getWolvesAction(action)))

    # Combined per-agent reward vector: wolves first, then sheep.
    rewardFunc = lambda state, action, nextState: \
        list(rewardWolfWithActionCost(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    # --- Observations: each agent sees positions/velocities of all others ---
    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    # --- Physics: forces, collisions and state integration ---
    # NOTE(review): reshapeAction is re-created here; the earlier instance
    # above would serve — harmless duplication.
    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)

    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)

    # Episodes never terminate early; they run for config.episode_length.
    isTerminal = lambda state: [False] * numAgents

    # Derive observation shapes from an example reset state.
    initObsForParams = observe(reset())
    envObservationSpace = [
        initObsForParams[obsID].shape for obsID in range(len(initObsForParams))
    ]

    # 2-D world: one discrete action per direction plus "stay" (2*2+1 = 5).
    worldDim = 2
    envActionSpace = [
        spaces.Discrete(worldDim * 2 + 1) for agentID in range(numAgents)
    ]

    model_dir = os.path.join(dirName, 'models', config.env_id,
                             config.model_name)
    model = AttentionSAC.init_from_env(
        envActionSpace,
        envObservationSpace,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,  #128
        critic_hidden_dim=config.critic_hidden_dim,  #128
        attend_heads=config.attend_heads,  #4
        reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents, [
        obsp[0] if isinstance(obsp, tuple) else obsp.shape[0]
        for obsp in envObservationSpace
    ], [
        acsp.shape[0] if isinstance(acsp, Box) else acsp.n
        for acsp in envActionSpace
    ])
    t = 0  # total environment steps taken across all rollout threads

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):  #12
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        state = reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            obs = observe(state)
            obs = np.array([obs])
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            action = actions[0]
            # Step the hand-built dynamics instead of a gym-style env.step.
            nextState = transit(state, action)
            next_obs = np.array([observe(nextState)])
            rewards = np.array([rewardFunc(state, action, nextState)])
            dones = np.array([isTerminal(nextState)])

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            state = nextState
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads
                ):  # 100 steps across rollouts -> 4 updates
                model.prep_training(device='cpu')

                for u_i in range(config.num_updates):  #4
                    sample = replay_buffer.sample(config.batch_size)
                    model.update_critic(sample)
                    model.update_policies(sample)
                    model.update_all_targets()

                model.prep_rollouts(device='cpu')

        # Periodic incremental checkpoint.
        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            pathIncremental = os.path.join(model_dir, 'incremental')
            if not os.path.exists(pathIncremental):
                os.makedirs(pathIncremental)
            model.save(
                os.path.join(pathIncremental, ('model_ep%i.pt' % (ep_i + 1))))

    model.save(os.path.join(model_dir, 'model.pt'))
Beispiel #15
0
def run(config):
    """Train an AttentionSAC model on a parallel multi-agent environment.

    Allocates a fresh ``runN`` directory under
    ``./models/<env_id>/<model_name>``, trains for ``config.n_episodes``
    episodes (stepping ``config.n_rollout_threads`` env copies per outer
    iteration), periodically performs SAC updates from a replay buffer, and
    writes reward/collision/success curves (PNG + pickle) plus model
    checkpoints along the way.

    Args:
        config: namespace-like options object; fields read here include
            env_id, model_name, n_rollout_threads, n_episodes,
            episode_length, buffer_length, batch_size, steps_per_update,
            num_updates, use_gpu, save_rate, save_interval, CCR, emergency,
            and the AttentionSAC hyperparameters (tau, pi_lr, q_lr, gamma,
            pol_hidden_dim, critic_hidden_dim, attend_heads, reward_scale).
    """
    # Pick the next unused runN directory under ./models/<env_id>/<model_name>.
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    #log_dir = run_dir / 'logs'
    os.makedirs(run_dir)
    #logger = SummaryWriter(str(log_dir))

    # Initialization of evaluation metrics (seeded with a 0 entry so the
    # rolling np.mean slices below are never taken over an empty list).
    collisions = [0]
    success_nums = [0]
    ccr_activates = [0]
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_collisions = []
    final_ep_activates = []
    final_ep_success_nums = []

    # Seed with the run number so each run is reproducible yet distinct.
    torch.manual_seed(run_num)
    np.random.seed(run_num)

    # A single env is created first only to read env.n (agent count);
    # it is then replaced by the parallel wrapper used for training.
    env = make_env(config.env_id, discrete_action=True)
    num_agents = env.n
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)

    # if config.emergency:
    #     env.switch_emergency()

    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)

    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0  # total env steps taken across all rollout threads

    #### remove all tensorboard methods, replace with print and pickle

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        #print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                ep_i + 1 + config.n_rollout_threads,
        #                                config.n_episodes))
        if config.emergency:
            env.switch_emergency()
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        t_start = time.time()

        # One-step histories used by the CCR intrinsic-reward machinery.
        prev_obs = None
        act_n_t_minus_1 = None

        for et_i in range(config.episode_length):
            if config.CCR:
                if act_n_t_minus_1:
                    # Compare actual observations against an "oracle" rollout
                    # of the previous actions to estimate a per-agent
                    # "emergency" signal.
                    target_obs_n, _, _, _ = env.oracle_step(act_n_t_minus_1)
                    diff_state = obs[:, :, :4] - target_obs_n[:, :, :
                                                              4]  # 12x4x4

                    # Which trailing observation channel carries the
                    # emergency info depends on the scenario's obs layout.
                    if config.env_id == 'wall' or config.env_id == 'strong_wind' or config.env_id == 'wall_expos':
                        diff_obs = obs[:, :, -(model.nagents + 8 + 1)]
                    elif config.env_id == 'turbulence':
                        diff_obs = obs[:, :, -(model.nagents + 2 + 1)]
                    else:
                        assert (False)

                    emerg_n = np.sum(diff_state**2, axis=-1) + diff_obs  # 12x4

                    env.oracle_update()

                    # Broadcast every agent's emergency value into the
                    # trailing observation slots of every agent.
                    # obs: 12x4x20
                    # emerg_n: 12x4
                    # NOTE(review): when agent_j == 0, -agent_j indexes
                    # feature 0 (not a trailing slot) — looks like an
                    # off-by-one; confirm the intended obs layout.
                    for agent_i in range(model.nagents):
                        for agent_j in range(model.nagents):
                            #print(obs[:, agent_i, -agent_j])
                            #print(emerg_n[:, agent_j])
                            obs[:, agent_i, -agent_j] = emerg_n[:, agent_j]
                            #print(obs[:, agent_i, -agent_j])
                            #print(emerg_n[:, agent_j])
            # collect experience from the previous step (skipped on the
            # first iteration, before any action has been taken)
            if prev_obs is not None:
                replay_buffer.push(prev_obs, agent_actions, rewards, obs,
                                   dones)

            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]

            next_obs, rewards, dones, infos = env.step(actions)

            if config.CCR:
                if act_n_t_minus_1:
                    # Shape rewards with an intrinsic term that scales the
                    # change in inter-agent distance by the emergency gap
                    # between each pair of agents.
                    for i in range(model.nagents):
                        for j in range(model.nagents):
                            # ccr_activates[-1] += 1
                            intrinsic_reward = np.linalg.norm(
                                next_obs[:, i, 2:4] - obs[:, j, 2:4],
                                axis=-1) - np.linalg.norm(
                                    obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1)
                            intrinsic_reward /= (1 + np.linalg.norm(
                                obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1))
                            intrinsic_reward *= (emerg_n[:, j] - emerg_n[:, i])
                            rewards[:, i] += 10 * intrinsic_reward / np.sqrt(
                                num_agents)
                            """
                            if (len(episode_rewards) == 2 or len(episode_rewards) == 2000 or len(episode_rewards) == 5000) and episode_step % 5 == 0:
                                Ls[i].append('      intrinsic reward = ' + str(intrinsic_reward) + '\n')
                            """
                            # if i == j: continue
                            # emerg_invalid = ~((emerg_n[:,j] > emerg_n[:,i]) & (emerg_n[:,j] > 0))
                            # ccr_activates[-1] += (~emerg_invalid).sum()
                            # intrinsic_reward = np.linalg.norm(next_obs[:,i,2:4] - obs[:,j,2:4], axis=-1) - np.linalg.norm(obs[:,i,2:4] - obs[:,j,2:4], axis=-1)
                            # intrinsic_reward[emerg_invalid] = 0
                            # rewards[:,i] += 10 * intrinsic_reward

                act_n_t_minus_1 = actions

            prev_obs = obs

            obs = next_obs

            t += config.n_rollout_threads
            # Periodic SAC updates once the buffer holds at least one batch.
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=None)
                    model.update_policies(sample, logger=None)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

        # End-of-episode bookkeeping and console reporting.
        ls_num_collision = env.get_collision_and_zero_out()

        collisions.append(np.array(
            ls_num_collision).mean())  # might need to convert to np.int

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        ep_rews = np.array(ep_rews).mean()
        # save model, display training output

        print(
            "episodes: {}, mean episode reward: {}, mean number of collisions with wall: {}, ccr activates: {}, success numbers: {}, time: {}"
            .format(ep_i, ep_rews, np.mean(collisions[-config.save_rate:]),
                    np.mean(ccr_activates[-config.save_rate:]),
                    np.mean(success_nums[-config.save_rate:]),
                    round(time.time() - t_start, 3)))

        # Keep track of final episode reward
        final_ep_rewards.append(ep_rews)
        # final_ep_activates.append(np.mean(ccr_activates[-config.save_rate:]))
        final_ep_collisions.append(np.mean(collisions[-config.save_rate:]))
        final_ep_success_nums.append(np.mean(success_nums[-config.save_rate:]))
        if ep_i % config.save_rate == 0:
            # NOTE(review): step=12 appears to assume n_rollout_threads == 12;
            # otherwise x_axis length may mismatch the curves — confirm.
            x_axis = np.arange(0, ep_i + 1, step=12)
            # plot reward data
            rew_file_name = run_dir / 'rewards.png'

            plt.plot(x_axis, final_ep_rewards)
            plt.xlabel('training episode')
            plt.ylabel('reward')
            #plt.legend()
            plt.savefig(rew_file_name)

            plt.clf()

            collision_file_name = run_dir / 'collisions.png'

            plt.plot(x_axis, final_ep_collisions)
            plt.xlabel('training episode')
            plt.ylabel('number of collisions')
            #plt.legend()
            plt.savefig(collision_file_name)

            plt.clf()

            # activates_file_name = run_dir / 'activates.png'

            # plt.plot(x_axis, final_ep_activates)
            # plt.xlabel('training episode')
            # plt.ylabel('CCR activates')
            # #plt.legend()
            # plt.savefig(activates_file_name)

            # plt.clf()

            success_file_name = run_dir / 'successes.png'

            plt.plot(x_axis, final_ep_success_nums)
            plt.xlabel('training episode')
            plt.ylabel('success numbers')
            #plt.legend()
            plt.savefig(success_file_name)

            plt.clf()

            # Dump the raw curves alongside the plots for later analysis.
            rew_file_name = run_dir
            collision_file_name = run_dir
            success_nums_file_name = run_dir
            activates_file_name = run_dir

            rew_file_name /= 'rewards.pkl'
            collision_file_name /= 'collisions.pkl'
            success_nums_file_name /= 'success_nums.pkl'
            # activates_file_name /= 'activates.pkl'

            with open(rew_file_name, 'wb') as fp:
                pickle.dump(final_ep_rewards, fp)
            with open(collision_file_name, 'wb') as fp:
                pickle.dump(final_ep_collisions, fp)

            # with open(activates_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_activates, fp)

            with open(success_nums_file_name, 'wb') as fp:
                pickle.dump(final_ep_success_nums, fp)

                plt.clf()

        # Periodic checkpointing (incremental snapshot + rolling model.pt).
        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
Beispiel #16
0
def test(config):
    """Evaluate the newest saved AttentionSAC run in a single environment.

    Loads ``model.pt`` from the highest-numbered ``runN`` directory under
    ``./models/<env_id>/<model_name>`` and rolls out ``config.n_episodes``
    episodes with greedy (non-exploring) actions, optionally rendering
    when ``config.display`` is set.

    Args:
        config: namespace-like options object; fields read here include
            env_id, model_name, n_episodes, episode_length, buffer_length,
            CCR, display.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        # runs the newest
        # NOTE(review): raises ValueError if the directory exists but holds
        # no runN folders; also, run_num = 1 with a missing dir will make
        # init_from_save below fail — confirm intended behavior.
        run_num = max(exst_run_nums)

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    # Initialization of evaluation metrics (collected but mostly unused here)
    collisions = [0]
    success_nums = [0]
    ccr_activates = [0]
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_collisions = []
    final_ep_activates = []
    final_ep_success_nums = []

    torch.manual_seed(run_num)
    np.random.seed(run_num)

    env = make_env(config.env_id, discrete_action=True)
    env.seed(run_num)
    np.random.seed(run_num)
    model = AttentionSAC.init_from_save(run_dir / 'model.pt', True)

    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0  # total environment steps taken

    #### remove all tensorboard methods, replace with print and pickle

    for ep_i in range(0, config.n_episodes):

        # Single env: add a leading batch axis of size 1 so the code below
        # matches the parallel-env (batched) layout used in training.
        obs = np.expand_dims(np.array(env.reset()), 0)
        model.prep_rollouts(device='cpu')

        t_start = time.time()

        # One-step histories used by the CCR machinery.
        prev_obs = None
        act_n_t_minus_1 = None

        for et_i in range(config.episode_length):
            if config.CCR:
                if act_n_t_minus_1:
                    # Oracle rollout of the previous action for the
                    # emergency estimate (same scheme as training).
                    target_obs_n, _, _, _ = env.oracle_step(act_n_t_minus_1[0])

                    target_obs_n = np.expand_dims(np.array(target_obs_n), 0)

                    diff_state = obs[:, :, :4] - target_obs_n[:, :, :
                                                              4]  # 1x4x4

                    # Emergency observation channel depends on the scenario.
                    if config.env_id == 'wall':
                        diff_obs = obs[:, :, -(model.nagents + 8 + 1)]
                    elif config.env_id == 'turbulence':
                        diff_obs = obs[:, :, -(model.nagents + 2 + 1)]
                    else:
                        assert (False)

                    emerg_n = np.sum(diff_state**2, axis=-1) + diff_obs  # 1x4

                    env.oracle_update()

                    # obs: 1x4x20
                    # emerg_n: 1x4
                    # NOTE(review): -agent_j is 0 when agent_j == 0, which
                    # writes obs feature 0 — same suspected off-by-one as in
                    # the training loop; confirm.
                    for agent_i in range(model.nagents):
                        for agent_j in range(model.nagents):
                            obs[:, agent_i, -agent_j] = emerg_n[:, agent_j]

            # collect experience (skipped on the very first iteration,
            # before agent_actions/rewards/dones exist)
            if prev_obs is not None:
                replay_buffer.push(prev_obs, agent_actions, rewards, obs,
                                   dones)

            #print(obs)
            # convert observation to torch Variable
            torch_obs = []
            for i in range(model.nagents):
                torch_obs.append(
                    Variable(torch.Tensor(obs[:, i]), requires_grad=False))
            # print(torch_obs)
            # get actions as torch Variables (greedy: explore=False)
            torch_agent_actions = model.step(torch_obs, explore=False)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[0] for ac in agent_actions]]

            # rearrange actions to be per environment
            #actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]

            next_obs, rewards, dones, infos = env.step(actions[0])

            # Re-add the batch axis stripped by the single (unbatched) env.
            next_obs = np.expand_dims(np.array(next_obs), 0)
            rewards = np.expand_dims(np.array(rewards), 0)
            dones = np.expand_dims(np.array(dones), 0)
            infos = np.expand_dims(np.array(infos), 0)

            if config.CCR:
                act_n_t_minus_1 = actions

            prev_obs = obs

            obs = next_obs

            t += 1

            # for displaying learned policies
            if config.display:
                time.sleep(0.1)
                env.render()
                continue

    env.close()
Beispiel #17
0
def main():
    """Train an AttentionSAC model on the multi-agent wolf/sheep chasing task.

    Builds the simulation from scratch (rewards, transition, observation
    functions), runs the standard collect/update loop for ``n_episodes``
    episodes, and periodically saves checkpoints under
    ``<dirName>/models/chasing``. Configuration is either hard-coded
    (``debug``) or parsed from a JSON condition string in ``sys.argv[1]``.
    """
    debug = 1
    if debug:
        # Hard-coded settings for quick local runs.
        numWolves = 3
        numSheep = 1
        numBlocks = 2
        sheepSpeedMultiplier = 1
        individualRewardWolf = 0
        costActionRatio = 0.0

    else:
        # Experiment condition supplied as a JSON string on the command line.
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = int(condition['numWolves'])
        numSheep = int(condition['numSheeps'])
        numBlocks = int(condition['numBlocks'])

        sheepSpeedMultiplier = float(condition['sheepSpeedMultiplier'])
        individualRewardWolf = float(condition['individualRewardWolf'])
        costActionRatio = float(condition['costActionRatio'])

    modelName = "maac{}wolves{}sheep{}blocksSheepSpeed{}WolfActCost{}individ{}".format(
        numWolves, numSheep, numBlocks, sheepSpeedMultiplier, costActionRatio,
        individualRewardWolf)

    # Training hyperparameters.
    n_rollout_threads = 1
    buffer_length = int(1e6)
    n_episodes = 60000
    episode_length = 75
    steps_per_update = 100
    num_updates = 4
    batch_size = 1024
    save_interval = 1000
    pol_hidden_dim = 128
    critic_hidden_dim = 128
    attend_heads = 4
    pi_lr = 0.001
    q_lr = 0.001
    tau = 0.001
    gamma = 0.99
    reward_scale = 100.

    # Entity bookkeeping: wolves first, then sheep, then static blocks.
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [
        blockSize
    ] * numBlocks

    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    sheepMaxSpeedOriginal = 1.3
    sheepMaxSpeed = sheepMaxSpeedOriginal * sheepSpeedMultiplier
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    # Rewards: sheep are punished for collisions/out-of-bound; wolves are
    # rewarded for catching sheep, minus an action cost.
    collisionReward = 10
    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID,
                              sheepsID,
                              entitiesSizeList,
                              getPosFromAgentState,
                              isCollision,
                              punishForOutOfBound,
                              collisionPunishment=collisionReward)

    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision,
                            collisionReward, individualRewardWolf)
    reshapeAction = ReshapeAction()
    getActionCost = GetActionCost(costActionRatio,
                                  reshapeAction,
                                  individualCost=True)
    getWolvesAction = lambda action: [action[wolfID] for wolfID in wolvesID]
    rewardWolfWithActionCost = lambda state, action, nextState: np.array(
        rewardWolf(state, action, nextState)) - np.array(
            getActionCost(getWolvesAction(action)))

    # Joint reward: wolves (with action cost) followed by sheep.
    rewardFunc = lambda state, action, nextState: \
        list(rewardWolfWithActionCost(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    # Environment dynamics: reset, per-agent observation, and transition.
    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    # NOTE(review): reshapeAction is re-created here, shadowing the instance
    # passed to GetActionCost above — harmless if ReshapeAction is stateless.
    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)

    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)

    # Episodes end only by length, never by a terminal state.
    isTerminal = lambda state: [False] * numAgents

    initObsForParams = observe(reset())
    envObservationSpace = [
        initObsForParams[obsID].shape for obsID in range(len(initObsForParams))
    ]

    worldDim = 2
    envActionSpace = [
        spaces.Discrete(worldDim * 2 + 1) for agentID in range(numAgents)
    ]

    model_dir = os.path.join(dirName, 'models', 'chasing')
    # Fix: ensure the checkpoint directory exists before any model.save()
    # below — on a fresh checkout it does not, and save would raise.
    os.makedirs(model_dir, exist_ok=True)
    model = AttentionSAC.init_from_env(
        envActionSpace,
        envObservationSpace,
        tau=tau,
        pi_lr=pi_lr,
        q_lr=q_lr,
        gamma=gamma,
        pol_hidden_dim=pol_hidden_dim,  #128
        critic_hidden_dim=critic_hidden_dim,  #128
        attend_heads=attend_heads,  #4
        reward_scale=reward_scale)
    replay_buffer = ReplayBuffer(buffer_length, model.nagents, [
        obsp[0] if isinstance(obsp, tuple) else obsp.shape[0]
        for obsp in envObservationSpace
    ], [
        acsp.shape[0] if isinstance(acsp, Box) else acsp.n
        for acsp in envActionSpace
    ])
    t = 0  # total environment steps so far

    for ep_i in range(0, n_episodes, n_rollout_threads):  #12
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + n_rollout_threads, n_episodes))
        state = reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(episode_length):
            obs = observe(state)
            obs = np.array([obs])
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(n_rollout_threads)]
            action = actions[0]
            nextState = transit(state, action)
            next_obs = np.array([observe(nextState)])
            rewards = np.array([rewardFunc(state, action, nextState)])
            dones = np.array([isTerminal(nextState)])

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            state = nextState
            t += n_rollout_threads
            if (len(replay_buffer) >= batch_size
                    and (t % steps_per_update) < n_rollout_threads
                ):  # 100 steps across rollouts -> 4 updates
                model.prep_training(device='cpu')

                for u_i in range(num_updates):  #4
                    sample = replay_buffer.sample(batch_size)
                    model.update_critic(sample)
                    model.update_policies(sample)
                    model.update_all_targets()

                model.prep_rollouts(device='cpu')

        # Periodic checkpointing, then a final save with the bare model name.
        if ep_i % save_interval < n_rollout_threads:
            model.prep_rollouts(device='cpu')
            model.save(os.path.join(model_dir, modelName + 'eps' + str(ep_i)))

    model.save(os.path.join(model_dir, modelName))
Beispiel #18
0
def run(config):
    """Evaluate a saved AttentionSAC model ('model.pt') across parallel envs.

    Rolls out ``config.n_episodes`` episodes with greedy actions, tracking
    per-agent success, step counts, and wall-clock inference time, then
    prints per-agent mean and overall max statistics. No learning updates
    are performed (the update code is intentionally commented out).

    Args:
        config: namespace-like options object; fields read here include
            env_id, model_name, n_rollout_threads, n_episodes,
            episode_length, buffer_length, save_interval.
    """
    # Allocate the next runN directory for this evaluation's logs.
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    # Fixed seed so evaluation rollouts are reproducible.
    torch.manual_seed(1804)
    np.random.seed(1804)
    # initialize E parallel environments with N agents
    env = make_parallel_env(config.env_id, config.n_rollout_threads, 1804)
    model = AttentionSAC.init_from_save('model.pt')
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)
    # initialize replay buffer D (unused during pure evaluation; kept for
    # parity with the training script and the commented-out update block)
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    # T_update
    t = 0
    max_step = 0
    max_time = 0
    total_step = np.zeros(model.nagents)
    total_time = np.zeros(model.nagents)
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        # Per-episode trackers: which (thread, agent) pairs have finished,
        # how many steps each took, and cumulative inference time.
        success = np.zeros((config.n_rollout_threads, model.nagents),
                           dtype=bool)
        steps = np.zeros((config.n_rollout_threads, model.nagents))
        time_cost = np.zeros((config.n_rollout_threads, model.nagents))
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]

            # Fix: time.clock() was deprecated since 3.3 and removed in
            # Python 3.8; perf_counter() is the documented replacement.
            start = time.perf_counter()
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            end = time.perf_counter()
            per_time_cost = end - start

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            # calculate steps: agents still running accumulate a step and a
            # share of the inference time; finished ones stop counting
            success = np.logical_or(success, dones)
            # steps += dones
            steps += np.logical_not(dones)
            time_cost += np.logical_not(dones) * per_time_cost

            # store transitions for all env in replay buffer
            # replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs

            # T_update = T_update + E
            t += config.n_rollout_threads

            # if (len(replay_buffer) >= max(config.pi_batch_size, config.q_batch_size) and
            #     (t % config.steps_per_update) < config.n_rollout_threads):
            #     if config.use_gpu:
            #         model.prep_training(device='gpu')
            #     else:
            #         model.prep_training(device='cpu')
            #     for u_i in range(config.num_critic_updates):
            #         sample = replay_buffer.sample(config.q_batch_size,
            #                                       to_gpu=config.use_gpu)
            #         model.update_critic(sample, logger=logger)
            #     for u_i in range(config.num_pol_updates):
            #         sample = replay_buffer.sample(config.pi_batch_size,
            #                                       to_gpu=config.use_gpu)
            #         model.update_policies(sample, logger=logger)
            #     model.update_all_targets()
            #     # for u_i in range(config.num_updates):
            #     #     sample = replay_buffer.sample(config.batch_size,
            #     #                                   to_gpu=config.use_gpu)
            #     #     model.update_critic(sample, logger=logger)
            #     #     model.update_policies(sample, logger=logger)
            #     #     model.update_all_targets()
            model.prep_rollouts(device='cpu')

        # ep_dones = np.mean(success, axis=0)
        # ep_steps = 1 - np.mean(steps / config.episode_length, axis=0)
        # ep_mean_step

        # ep_rews = replay_buffer.get_average_rewards(
        #     config.episode_length * config.n_rollout_threads)
        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        # for a_i, a_ep_done in enumerate(ep_dones):
        # logger.add_scalar('agent%i/mean_episode_dones' % a_i, a_ep_done, ep_i)
        # for a_i, a_ep_step in enumerate(ep_steps):
        # logger.add_scalar('agent%i/mean_episode_steps' % a_i, a_ep_step, ep_i)

        # Aggregate per-episode stats (mean over threads, max over all).
        total_step += np.mean(steps, axis=0)
        total_time += np.mean(time_cost, axis=0)

        max_step += np.max(steps)
        max_time += np.max(time_cost)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            # os.makedirs(run_dir / 'incremental', exist_ok=True)
            # model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            # model.save(run_dir / 'model.pt')

    # NOTE(review): the divisor assumes 100 evaluation episodes were run —
    # confirm against config.n_episodes.
    mean_step = total_step / (100 / config.n_rollout_threads)
    mean_time = total_time / (100 / config.n_rollout_threads)
    max_time /= 100 / config.n_rollout_threads
    max_step /= 100 / config.n_rollout_threads

    print('; '.join([
        f'{chr(65 + i)} Mean Step:{mean_step[i]}, Mean Time:{mean_time[i]}'
        for i in range(model.nagents)
    ]))
    print('Mean Max Step:{}, Mean Max Time Cost:{}'.format(max_step, max_time))
    # model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Beispiel #19
0
def run(config):
    """Train an AttentionSAC model on the (Google Football style) env.

    Creates the next free ``runN`` directory under
    ``./models/<env_id>/<model_name>``, trains for ``config["n_episodes"]``
    episodes with ``config["n_rollout_threads"]`` parallel envs, performs
    periodic SAC updates from a replay buffer, logs mean episode rewards to
    TensorBoard, and saves incremental plus rolling model checkpoints.

    Args:
        config: dict of hyperparameters (env_id, model_name,
            n_rollout_threads, n_episodes, episode_length, buffer_length,
            batch_size, steps_per_update, num_updates, use_gpu,
            save_interval, and the AttentionSAC constructor arguments).
    """
    # Determine the next unused run number for the output directory.
    model_dir = Path('./models') / config["env_id"] / config["model_name"]
    if not model_dir.exists():
        run_num = 1
    else:
        prior_runs = []
        for folder in model_dir.iterdir():
            folder_name = str(folder.name)
            if folder_name.startswith('run'):
                prior_runs.append(int(folder_name.split('run')[1]))
        run_num = max(prior_runs) + 1 if prior_runs else 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    # Seed everything with the run number for reproducibility.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config["n_rollout_threads"], run_num)
    model = AttentionSAC.init_from_env(
        env,
        tau=config["tau"],
        pi_lr=config["pi_lr"],
        q_lr=config["q_lr"],
        gamma=config["gamma"],
        pol_hidden_dim=config["pol_hidden_dim"],
        critic_hidden_dim=config["critic_hidden_dim"],
        attend_heads=config["attend_heads"],
        reward_scale=config["reward_scale"])
    # (** EDITED **) Set Replay Buffer with fixed per-agent dimensions:
    # 115-dim observations and 19 discrete actions for every agent.
    replay_buffer = ReplayBuffer(config["buffer_length"], model.nagents,
                                 [115] * model.nagents,
                                 [19] * model.nagents)
    t = 0
    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config["n_rollout_threads"],
               config["n_episodes"]))

        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config["episode_length"]):
            print("episode : {} | step : {}".format(ep_i, et_i), end='\r')
            # Build one observation tensor per agent (stacked over envs).
            torch_obs = []
            for a_i in range(model.nagents):
                torch_obs.append(
                    Variable(torch.Tensor(np.vstack(obs[:, a_i])),
                             requires_grad=False))
            # Query the policies, then drop back to numpy.
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = []
            for ac in torch_agent_actions:
                agent_actions.append(ac.data.numpy())
            # Regroup actions so each entry covers one environment.
            actions = []
            for env_i in range(config["n_rollout_threads"]):
                actions.append([ac[env_i] for ac in agent_actions])

            # Reform Actions list to fit on Football Env: it expects integer
            # action ids, not one-hot vectors.
            actions_list = []
            for per_env in actions:
                actions_list.append([np.argmax(one_hot) for one_hot in per_env])

            # Step all environments at once.
            next_obs, rewards, dones, infos = env.step(actions_list)

            # Small constant offset to prevent divergence (NaN) in training.
            rewards = rewards - 0.000001

            # Replicate the shared done flag per agent for the buffer layout.
            dones = np.array([dones] * model.nagents).T

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config["n_rollout_threads"]
            buffer_ready = len(replay_buffer) >= config["batch_size"]
            update_due = (t % config["steps_per_update"]) < config[
                "n_rollout_threads"]
            if buffer_ready and update_due:
                train_device = 'gpu' if config["use_gpu"] else 'cpu'
                model.prep_training(device=train_device)
                for _ in range(config["num_updates"]):
                    sample = replay_buffer.sample(config["batch_size"],
                                                  to_gpu=config["use_gpu"])
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        # Log the mean reward of the episode that just finished.
        ep_rews = replay_buffer.get_average_rewards(
            config["episode_length"] * config["n_rollout_threads"])
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config["episode_length"], ep_i)

        # Periodic checkpointing: incremental snapshot plus rolling model.pt.
        if ep_i % config["save_interval"] < config["n_rollout_threads"]:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Beispiel #20
0
def run(config):
    """Evaluate a saved AttentionSAC checkpoint in a hand-assembled
    wolf/sheep chasing world, report per-episode wolf "bite" totals, and
    render all collected trajectories.

    Unlike the parallel-env variants in this file, the environment here is
    built inline from project helpers (reset / observe / transit /
    rewardFunc) and stepped manually. Actions are greedy (explore=False).

    Args:
        config: namespace-like object; this function reads env_id,
            model_name, n_episodes, episode_length and n_rollout_threads.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    run_num = 1  # always evaluates run1; no scan for the latest run dir

    # --- Scenario layout: 3 wolves chase 1 sheep among 2 static blocks ---
    numWolves = 3
    numSheep = 1
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    # Per-entity radii, ordered wolves -> sheep -> blocks.
    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [
        blockSize
    ] * numBlocks
    # Sheep outrun wolves; blocks are immobile (max speed None).
    sheepMaxSpeed = 1.3
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    # --- Rewards: sheep are punished for collisions / leaving bounds;
    # wolves share a collision reward (individualRewardWolf == 0) ---
    collisionReward = 10
    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID,
                              sheepsID,
                              entitiesSizeList,
                              getPosFromAgentState,
                              isCollision,
                              punishForOutOfBound,
                              collisionPunishment=collisionReward)

    individualRewardWolf = 0
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision,
                            collisionReward, individualRewardWolf)
    reshapeAction = ReshapeAction()
    # costActionRatio == 0, so the action-cost term subtracted below is
    # always zero in this configuration.
    costActionRatio = 0
    getActionCost = GetActionCost(costActionRatio,
                                  reshapeAction,
                                  individualCost=True)
    getWolvesAction = lambda action: [action[wolfID] for wolfID in wolvesID]
    rewardWolfWithActionCost = lambda state, action, nextState: np.array(
        rewardWolf(state, action, nextState)) - np.array(
            getActionCost(getWolvesAction(action)))

    # Per-agent reward vector: wolves first, then sheep.
    rewardFunc = lambda state, action, nextState: \
        list(rewardWolfWithActionCost(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    # --- Observation / dynamics pipeline ---
    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    # NOTE(review): ReshapeAction was already constructed above; this
    # rebinding is redundant but harmless.
    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)

    # Episodes never terminate early; rollouts always run episode_length.
    isTerminal = lambda state: [False] * numAgents

    initObsForParams = observe(reset())
    obsShape = [
        initObsForParams[obsID].shape for obsID in range(len(initObsForParams))
    ]
    worldDim = 2
    # Discrete 5-action space (2 axes * 2 directions + no-op) per agent.
    actionSpace = [
        spaces.Discrete(worldDim * 2 + 1) for agentID in range(numAgents)
    ]

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    # Seed before loading so evaluation rollouts are reproducible.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    model = AttentionSAC.init_from_save(filename=run_dir / 'model.pt')

    biteList = []  # per-episode wolf "bite" totals
    trajListToRender = []  # one list of transition tuples per episode

    for ep_i in range(0, config.n_episodes):
        state = reset()
        model.prep_rollouts(device='cpu')

        trajectory = []

        for et_i in range(config.episode_length):
            obs = observe(state)
            # Add a leading batch axis so indexing matches the parallel-env
            # code path: obs[:, i] selects agent i across the single thread.
            obs = np.array([obs])
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # Greedy (deterministic) actions for evaluation.
            torch_agent_actions = model.step(torch_obs, explore=False)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            action = actions[0]

            nextState = transit(state, action)
            next_obs = observe(nextState)
            rewards = rewardFunc(state, action, nextState)
            # NOTE(review): done flags are computed but never used --
            # isTerminal is all-False, so the episode always runs full length.
            done_n = isTerminal(nextState)
            done = all(done_n)
            trajectory.append((state, action, rewards, nextState))

            state = nextState

        biteNum = calcWolfTrajBiteAmount(trajectory, wolvesID, singleReward=10)
        biteList.append(biteNum)
        trajListToRender.append(list(trajectory))

        print(biteNum)

    # Mean and standard error of bite counts across episodes.
    # NOTE(review): with config.n_episodes == 1 the divisor sqrt(len - 1)
    # is zero, yielding nan/inf plus a RuntimeWarning -- confirm callers
    # always run at least 2 episodes.
    meanTrajBite = np.mean(biteList)
    seTrajBite = np.std(biteList) / np.sqrt(len(biteList) - 1)
    print('meanTrajBite', meanTrajBite, 'seTrajBite ', seTrajBite)

    # Render every recorded trajectory with per-entity colors
    # (red-ish wolves, green-ish sheep, grey blocks).
    wolfColor = np.array([0.85, 0.35, 0.35])
    sheepColor = np.array([0.35, 0.85, 0.35])
    blockColor = np.array([0.25, 0.25, 0.25])
    entitiesColorList = [wolfColor] * numWolves + [sheepColor] * numSheep + [
        blockColor
    ] * numBlocks
    render = Render(entitiesSizeList, entitiesColorList, numAgents,
                    getPosFromAgentState)
    trajToRender = np.concatenate(trajListToRender)
    render(trajToRender)
Beispiel #21
0
def run(config):
    """Evaluate a saved AttentionSAC checkpoint in a parallel gym-style env,
    report per-episode wolf "bite" totals, and render the trajectories.

    Variant of the inline-env evaluator above: here stepping is delegated
    to make_parallel_env / env.step, and only thread 0's transitions are
    recorded. Actions are greedy (explore=False).

    Args:
        config: namespace-like object; this function reads env_id,
            model_name, n_rollout_threads, n_episodes and episode_length.
    """
    model_dir = Path('./models') / config.env_id / config.model_name
    run_num = 1  # always evaluates run1; no scan for the latest run dir

    # --- Scenario layout: 3 wolves chase 1 sheep among 2 static blocks ---
    numWolves = 3
    numSheep = 1
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    # Per-entity radii (used below for rendering), wolves -> sheep -> blocks.
    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [
        blockSize
    ] * numBlocks

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    # Seed before env/model construction so rollouts are reproducible.
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = AttentionSAC.init_from_save(filename=run_dir / 'model.pt')

    biteList = []  # per-episode wolf "bite" totals
    trajListToRender = []  # flat list of transitions across all episodes
    for ep_i in range(0, config.n_episodes):
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        trajectory = []

        for et_i in range(config.episode_length):  #25
            # Rearrange observations to per-agent torch tensors:
            # obs[:, i] selects agent i across rollout threads.
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # Greedy (deterministic) actions for evaluation.
            torch_agent_actions = model.step(torch_obs, explore=False)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # Rearrange actions to per-thread lists.
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            # NOTE(review): this env-derived state (pos+vel for every agent
            # and landmark) is discarded -- the assignment right below
            # overwrites it with obs[0]. Confirm which representation the
            # trajectory is meant to record.
            state = [
                np.append(agent.state.p_pos, agent.state.p_vel)
                for agent in env.agents
            ] + [
                np.append(landmark.state.p_pos, landmark.state.p_vel)
                for landmark in env.world.landmarks
            ]

            # Record only thread 0's transition.
            state = obs[0]
            action = actions[0]
            reward = rewards[0]
            nextState = next_obs[0]
            trajectory.append((state, action, reward, nextState))

            obs = next_obs

        biteNum = calcWolfTrajBiteAmount(trajectory, wolvesID, singleReward=10)
        biteList.append(biteNum)
        # Flat concatenation (unlike the inline-env variant, which keeps
        # one sub-list per episode).
        trajListToRender = trajListToRender + trajectory

        print(biteNum)

    # Mean and standard error of bite counts across episodes.
    # NOTE(review): sqrt(len - 1) is zero when n_episodes == 1, yielding
    # nan/inf plus a RuntimeWarning -- confirm >= 2 episodes are used.
    meanTrajBite = np.mean(biteList)
    seTrajBite = np.std(biteList) / np.sqrt(len(biteList) - 1)
    print('meanTrajBite', meanTrajBite, 'seTrajBite ', seTrajBite)

    # Render all recorded transitions with per-entity colors.
    wolfColor = np.array([0.85, 0.35, 0.35])
    sheepColor = np.array([0.35, 0.85, 0.35])
    blockColor = np.array([0.25, 0.25, 0.25])
    entitiesColorList = [wolfColor] * numWolves + [sheepColor] * numSheep + [
        blockColor
    ] * numBlocks
    render = Render(entitiesSizeList, entitiesColorList, numAgents,
                    getPosFromAgentState)
    trajToRender = np.concatenate(trajListToRender)
    render(trajToRender)

    env.close()
Beispiel #22
0
def run(config):
    """Train one AttentionSAC model per drug/target pair to search for
    counterfactual (drug, protein) pairs against a frozen DTA predictor.

    For every pair in the dataset: builds a fresh run directory and
    SummaryWriter, wraps the pair in an RL environment whose two agents
    mutate the drug SMILES and the protein sequence, trains AttentionSAC
    with a replay buffer, and records the top counterfactuals found.

    Args:
        config: namespace-like object; reads gpu, store_result_dir,
            dataset, tau, pi_lr, q_lr, gamma, pol_hidden_dim,
            critic_hidden_dim, attend_heads, reward_scale, buffer_length,
            n_episodes, n_rollout_threads, batch_size, steps_per_update,
            use_gpu, num_updates and save_interval.
    """
    device = torch.device(
        'cuda:' + str(config.gpu) if torch.cuda.is_available() else 'cpu')
    model_dir = Path('./runs') / config.store_result_dir

    train_loader, train_drugs, train_Y = preprocess(config.dataset, config)

    print("number of data")
    print(len(train_loader))
    # One independent training run per drug/target pair.
    for it, original_pair in enumerate(train_loader):
        # Pick the next free runN directory under model_dir.
        if not model_dir.exists():
            run_num = 1
        else:
            exst_run_nums = [
                int(str(folder.name).split('run')[1])
                for folder in model_dir.iterdir()
                if str(folder.name).startswith('run')
            ]
            if len(exst_run_nums) == 0:
                run_num = 1
            else:
                run_num = max(exst_run_nums) + 1
        curr_run = 'run%i' % run_num
        run_dir = model_dir / curr_run
        log_dir = run_dir / 'logs'
        os.makedirs(log_dir)
        logger = SummaryWriter(str(log_dir))

        torch.manual_seed(run_num)
        np.random.seed(run_num)

        print('Run pair number ', str(it))
        Hyperparams = Args()
        # NOTE(review): two writers are kept -- `logger` (per-run logs dir,
        # fed to model updates) and `writer` (shared plots dir for DTA
        # scalars). Confirm the split is intentional.
        BasePath = './runs/' + config.store_result_dir
        writer = SummaryWriter(BasePath + '/plots')

        original_drug_smile = train_drugs[it]
        original_target_aff = train_Y[it]
        original_drug = original_pair
        original_target = original_pair.target[0]

        print('Original target:')
        print(original_target)
        print('Original molecule:')
        print(original_drug_smile)

        # Frozen DTA model whose predictions the counterfactuals must move.
        model_to_explain = mol_utils.get_graphdta_dgn().to(device)
        pred_aff, drug_original_encoding, prot_original_encoding = model_to_explain(
            original_drug.to(device),
            seq_cat(original_target).to(device))
        # Atom vocabulary available for drug mutations, taken from the
        # original molecule.
        atoms_ = np.unique([
            x.GetSymbol()
            for x in Chem.MolFromSmiles(original_drug_smile).GetAtoms()
        ])
        # Reward-term coefficients passed through to the environment.
        # NOTE(review): meaning/order of the four weights is defined by
        # make_parallel_env -- confirm against its signature.
        cof = [1.0, 0.05, 0.01, 0.05]
        env = make_parallel_env(original_drug_smile, original_target,
                                Hyperparams, atoms_, model_to_explain,
                                original_drug, original_target_aff, pred_aff,
                                device, cof)
        model = AttentionSAC.init_from_env(
            env,
            tau=config.tau,
            pi_lr=config.pi_lr,
            q_lr=config.q_lr,
            gamma=config.gamma,
            pol_hidden_dim=config.pol_hidden_dim,
            critic_hidden_dim=config.critic_hidden_dim,
            attend_heads=config.attend_heads,
            reward_scale=config.reward_scale)
        replay_buffer = ReplayBuffer(
            config.buffer_length, model.nagents,
            [obsp[0] for obsp in env.observation_space],
            [acsp for acsp in env.action_space])

        # Collector for the best counterfactuals of this pair.
        if not os.path.isdir(BasePath + "/counterfacts"):
            os.makedirs(BasePath + "/counterfacts")
        mol_utils.TopKCounterfactualsDTA.init(original_drug_smile, it,
                                              BasePath + "/counterfacts")

        t = 0
        episode_length = 1  # single-step episodes: one joint mutation each
        trg = trange(0, config.n_episodes, config.n_rollout_threads)
        for ep_i in trg:
            obs = env.reset()
            model.prep_rollouts(device='cpu')

            for et_i in range(episode_length):
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [
                    Variable(torch.Tensor(np.vstack(obs[:, i])),
                             requires_grad=False) for i in range(model.nagents)
                ]
                # get actions as torch Variables (stochastic: explore=True)
                torch_agent_actions = model.step(torch_obs, explore=True)
                # convert actions to numpy arrays
                agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
                # rearrange actions to be per environment
                actions = [[ac[i] for ac in agent_actions]
                           for i in range(config.n_rollout_threads)]
                next_obs, results, dones, action_drug, action_prot = env.step(
                    actions)
                # results[0][0] is the drug agent's tuple, results[0][1] the
                # protein agent's.
                # NOTE(review): the second unpack overwrites loss_/gain/
                # drug_sim/prot_sim/qed from the drug row, so the scalars
                # logged below mix drug_reward with protein-row metrics --
                # confirm this is intended.
                drug_reward, loss_, gain, drug_sim, prot_sim, qed = results[0][
                    0]
                prot_reward, loss_, gain, drug_sim, prot_sim, qed = results[0][
                    1]

                writer.add_scalar('DTA/Reward', drug_reward, ep_i)
                writer.add_scalar('DTA/Distance', loss_, ep_i)
                writer.add_scalar('DTA/Drug Similarity', drug_sim, ep_i)
                writer.add_scalar('DTA/Drug QED', qed, ep_i)
                writer.add_scalar('DTA/Protein Similarity', prot_sim, ep_i)

                # Push the joint (drug, protein) rewards as one batch row.
                pair_reward = []
                pair_reward.append(drug_reward)
                pair_reward.append(prot_reward)
                rewards = np.array([pair_reward])
                replay_buffer.push(obs, agent_actions, rewards, next_obs,
                                   dones)
                obs = next_obs
                t += 1
                # Train every steps_per_update env steps once the buffer can
                # fill a batch.
                if (len(replay_buffer) >= config.batch_size
                        and (t % config.steps_per_update) < 1):
                    if config.use_gpu:
                        model.prep_training(device='gpu')
                    else:
                        model.prep_training(device='cpu')
                    for u_i in range(config.num_updates):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=config.use_gpu)
                        model.update_critic(sample, logger=logger)
                        model.update_policies(sample, logger=logger)
                        model.update_all_targets()
                    model.prep_rollouts(device='cpu')
                # When every agent reports done, record the candidate
                # counterfactual pair and update the progress bar.
                if np.all(dones == True):
                    # Positions where the mutated protein differs from the
                    # original sequence.
                    mutate_position = [
                        i for i in range(len(original_target))
                        if original_target[i] != action_prot[i]
                    ]
                    trg.set_postfix(Reward=drug_reward,
                                    DrugSim=drug_sim,
                                    TargetSim=prot_sim,
                                    SMILES=action_drug,
                                    TargetMutatePosition=mutate_position,
                                    refresh=True)
                    mol_utils.TopKCounterfactualsDTA.insert({
                        'smiles':
                        action_drug,
                        'protein':
                        action_prot,
                        'drug_reward':
                        drug_reward,
                        'protein_reward':
                        prot_reward,
                        'loss':
                        loss_,
                        'gain':
                        gain,
                        'drug sim':
                        drug_sim,
                        'drug qed':
                        qed,
                        'prot sim':
                        prot_sim,
                        'mutate position':
                        mutate_position
                    })
            # Log mean episode rewards per agent (episode_length == 1).
            ep_rews = replay_buffer.get_average_rewards(episode_length * 1)
            for a_i, a_ep_rew in enumerate(ep_rews):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  a_ep_rew * episode_length, ep_i)

            # Periodic checkpoints: incremental snapshot plus latest model.
            if ep_i % config.save_interval < config.n_rollout_threads:
                model.prep_rollouts(device='cpu')
                os.makedirs(run_dir / 'incremental', exist_ok=True)
                model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                      (ep_i + 1)))
                model.save(run_dir / 'model.pt')

        # Final save and per-pair cleanup before moving to the next pair.
        model.save(run_dir / 'model.pt')
        env.close()
        logger.export_scalars_to_json(str(log_dir / 'summary.json'))
        logger.close()
Beispiel #23
0
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,  # ddpg soft update
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()