Example #1
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    envActionSpace = env.action_space
    envObservationSpace = env.observation_space

    model = AttentionSAC.init_from_env(
        envActionSpace,
        envObservationSpace,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,  #128
        critic_hidden_dim=config.critic_hidden_dim,  #128
        attend_heads=config.attend_heads,  #4
        reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):  #12
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):  #25
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
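            # note: t advances in increments of n_rollout_threads, so a strict
            # "t % steps_per_update == 0" check could be skipped entirely; the
            # "< n_rollout_threads" comparison below fires once each time t
            # crosses a multiple of steps_per_update, i.e. roughly every
            # steps_per_update environment steps aggregated across threads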
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads
                ):  # 100 steps across rollouts -> 4 updates
                model.prep_training(device='cpu')

                for u_i in range(config.num_updates):  #4
                    sample = replay_buffer.sample(config.batch_size)
                    model.update_critic(sample)
                    model.update_policies(sample)
                    model.update_all_targets()

                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
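
All of these run(config) variants read their hyperparameters from a config object. As a purely hypothetical illustration (not part of any of the projects shown here), a minimal argparse driver matching the attribute names used in Example #1 could look like the sketch below; the defaults are illustrative only, borrowed from the inline comments above and the constants spelled out in Example #7.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('env_id', help='name of the environment')
    parser.add_argument('model_name', help='directory name under ./models')
    parser.add_argument('--n_rollout_threads', default=12, type=int)
    parser.add_argument('--buffer_length', default=int(1e6), type=int)
    parser.add_argument('--n_episodes', default=60000, type=int)
    parser.add_argument('--episode_length', default=25, type=int)
    parser.add_argument('--steps_per_update', default=100, type=int)
    parser.add_argument('--num_updates', default=4, type=int)
    parser.add_argument('--batch_size', default=1024, type=int)
    parser.add_argument('--save_interval', default=1000, type=int)
    parser.add_argument('--pol_hidden_dim', default=128, type=int)
    parser.add_argument('--critic_hidden_dim', default=128, type=int)
    parser.add_argument('--attend_heads', default=4, type=int)
    parser.add_argument('--pi_lr', default=0.001, type=float)
    parser.add_argument('--q_lr', default=0.001, type=float)
    parser.add_argument('--tau', default=0.001, type=float)
    parser.add_argument('--gamma', default=0.99, type=float)
    parser.add_argument('--reward_scale', default=100., type=float)

    config = parser.parse_args()
    run(config)
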
Example #2
def run(config):
    cover_ratio = []

    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)
    # logger = SummaryWriter(str(log_dir))

    #    torch.manual_seed(run_num)
    #    np.random.seed(run_num)
    #env = make_parallel_env(, config.n_rollout_threads, run_num)
    env = make_env(config.env_id,
                   benchmark=BENCHMARK,
                   discrete_action=True,
                   use_handcraft_policy=config.use_handcraft_policy)
    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)

    model.init_from_save_self('./models/swift_scenario/model/run8/model.pt')
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0

    update_count = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(model.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [
                ac.data.numpy().squeeze() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            # agent_actions[0][5]=1
            # agent_actions[1][5]=1
            # agent_actions[2][5]=1
            next_obs, rewards, dones, infos = env.step(
                agent_actions,
                use_handcraft_policy=config.use_handcraft_policy)
            env.render()
            time.sleep(0.1)

            # # # get actions as torch Variables
            # torch_agent_actions = model.step(torch_obs, explore=True)
            # # convert actions to numpy arrays
            # agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            # next_obs, rewards, dones, infos = env.step(actions)
            # env.render()

            #if et_i == config.episode_length - 1:
            #print(infos)
            #print(type(infos['cover_ratio']))
            #cover_ratio.append(float(infos[0]['n'][0]['cover_ratio']))
            #print(infos)

            #            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            '''
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):

                    update_count += 1
                    print("episode:", ep_i, ", total steps:", t, " update_count:", update_count)

                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

        logger.export_scalars_to_json(str(log_dir / 'summary.json'))

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    print(cover_ratio)
    '''
    env.close()
Example #3
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    # if not model_dir.exists():
    #     run_num = 1
    # else:
    #     exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
    #                      model_dir.iterdir() if
    #                      str(folder.name).startswith('run')]
    #     if len(exst_run_nums) == 0:
    #         run_num = 1
    #     else:
    #         run_num = max(exst_run_nums) + 1
    run_num = 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir, exist_ok=True)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = AttentionSAC.init_from_env(env,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,
                                       critic_hidden_dim=config.critic_hidden_dim,
                                       attend_heads=config.attend_heads,
                                       reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config.episode_length, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Example #4
def run(config):

    numWolves = 4
    numSheep = 1
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [
        blockSize
    ] * numBlocks
    sheepMaxSpeed = 1.3
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    collisionReward = 10
    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID,
                              sheepsID,
                              entitiesSizeList,
                              getPosFromAgentState,
                              isCollision,
                              punishForOutOfBound,
                              collisionPunishment=collisionReward)

    individualRewardWolf = 0
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision,
                            collisionReward, individualRewardWolf)
    reshapeAction = ReshapeAction()
    costActionRatio = 0
    getActionCost = GetActionCost(costActionRatio,
                                  reshapeAction,
                                  individualCost=True)
    getWolvesAction = lambda action: [action[wolfID] for wolfID in wolvesID]
    rewardWolfWithActionCost = lambda state, action, nextState: np.array(
        rewardWolf(state, action, nextState)) - np.array(
            getActionCost(getWolvesAction(action)))

    rewardFunc = lambda state, action, nextState: \
        list(rewardWolfWithActionCost(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)

    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)

    isTerminal = lambda state: [False] * numAgents

    initObsForParams = observe(reset())
    envObservationSpace = [
        initObsForParams[obsID].shape for obsID in range(len(initObsForParams))
    ]

    worldDim = 2
    envActionSpace = [
        spaces.Discrete(worldDim * 2 + 1) for agentID in range(numAgents)
    ]
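    # worldDim * 2 + 1 = 5 discrete actions per agent: presumably a no-op plus
    # one action per +/- direction in each of the two world dimensions (the
    # exact mapping is whatever ReshapeAction defines in this project)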

    model_dir = os.path.join(dirName, 'models', config.env_id,
                             config.model_name)
    model = AttentionSAC.init_from_env(
        envActionSpace,
        envObservationSpace,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,  #128
        critic_hidden_dim=config.critic_hidden_dim,  #128
        attend_heads=config.attend_heads,  #4
        reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents, [
        obsp[0] if isinstance(obsp, tuple) else obsp.shape[0]
        for obsp in envObservationSpace
    ], [
        acsp.shape[0] if isinstance(acsp, Box) else acsp.n
        for acsp in envActionSpace
    ])
    t = 0

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):  #12
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        state = reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            obs = observe(state)
            obs = np.array([obs])
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            action = actions[0]
            nextState = transit(state, action)
            next_obs = np.array([observe(nextState)])
            rewards = np.array([rewardFunc(state, action, nextState)])
            dones = np.array([isTerminal(nextState)])

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            state = nextState
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads
                ):  # 100 steps across rollouts -> 4 updates
                model.prep_training(device='cpu')

                for u_i in range(config.num_updates):  #4
                    sample = replay_buffer.sample(config.batch_size)
                    model.update_critic(sample)
                    model.update_policies(sample)
                    model.update_all_targets()

                model.prep_rollouts(device='cpu')

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            pathIncremental = os.path.join(model_dir, 'incremental')
            if not os.path.exists(pathIncremental):
                os.makedirs(pathIncremental)
            model.save(
                os.path.join(pathIncremental, ('model_ep%i.pt' % (ep_i + 1))))

    model.save(os.path.join(model_dir, 'model.pt'))
Example #5
File: main_gpu.py Project: leehe228/TIL
def run(config):
    model_dir = Path('./models') / config["env_id"] / config["model_name"]
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config["n_rollout_threads"], run_num)
    model = AttentionSAC.init_from_env(
        env,
        tau=config["tau"],
        pi_lr=config["pi_lr"],
        q_lr=config["q_lr"],
        gamma=config["gamma"],
        pol_hidden_dim=config["pol_hidden_dim"],
        critic_hidden_dim=config["critic_hidden_dim"],
        attend_heads=config["attend_heads"],
        reward_scale=config["reward_scale"])
    # (** EDITED **) Set Replay Buffer
    # set up the buffer by iterating over the shapes of env.action_space / env.observation_space
    replay_buffer = ReplayBuffer(config["buffer_length"], model.nagents,
                                 [115 for _ in range(model.nagents)],
                                 [19 for _ in range(model.nagents)])
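    # the hard-coded 115 / 19 appear to be Google Research Football's
    # "simple115" observation size and its default 19-action set, repeated for
    # every agent; a more generic (hypothetical) alternative, assuming the env
    # exposes per-agent Box/Discrete spaces as in the other examples, would be
    #   [obsp.shape[0] for obsp in env.observation_space],
    #   [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
    #    for acsp in env.action_space]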
    t = 0
    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config["n_rollout_threads"],
               config["n_episodes"]))

        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config["episode_length"]):
            print("episode : {} | step : {}".format(ep_i, et_i), end='\r')
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config["n_rollout_threads"])]

            # Reform Actions list to fit on Football Env
            # the Google Football environment takes integer actions, not one-hot encoded action lists
            actions_list = [[np.argmax(b) for b in a] for a in actions]

            # Step
            next_obs, rewards, dones, infos = env.step(actions_list)

            # Prevention of divergence
            # without this, training diverges (NaN) and learning becomes impossible
            rewards = rewards - 0.000001

            # Reform Done Flag list
            # rebuild the done flag list to match what the replay buffer expects
            dones = (np.array([dones for _ in range(model.nagents)])).T

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config["n_rollout_threads"]
            if (len(replay_buffer) >= config["batch_size"]
                    and (t % config["steps_per_update"]) <
                    config["n_rollout_threads"]):
                if config["use_gpu"]:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config["num_updates"]):
                    sample = replay_buffer.sample(config["batch_size"],
                                                  to_gpu=config["use_gpu"])
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config["episode_length"] * config["n_rollout_threads"])
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config["episode_length"], ep_i)

        if ep_i % config["save_interval"] < config["n_rollout_threads"]:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Example #6
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    #log_dir = run_dir / 'logs'
    os.makedirs(run_dir)
    #logger = SummaryWriter(str(log_dir))

    # Initialization of evaluation metrics
    collisions = [0]
    success_nums = [0]
    ccr_activates = [0]
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_collisions = []
    final_ep_activates = []
    final_ep_success_nums = []

    torch.manual_seed(run_num)
    np.random.seed(run_num)

    env = make_env(config.env_id, discrete_action=True)
    num_agents = env.n
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)

    # if config.emergency:
    #     env.switch_emergency()

    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)

    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0

    #### remove all tensorboard methods, replace with print and pickle

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        #print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                ep_i + 1 + config.n_rollout_threads,
        #                                config.n_episodes))
        if config.emergency:
            env.switch_emergency()
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        t_start = time.time()

        prev_obs = None
        act_n_t_minus_1 = None

        for et_i in range(config.episode_length):
            if config.CCR:
                if act_n_t_minus_1:
                    target_obs_n, _, _, _ = env.oracle_step(act_n_t_minus_1)
                    diff_state = obs[:, :, :4] - target_obs_n[:, :, :4]  # 12x4x4

                    if config.env_id == 'wall' or config.env_id == 'strong_wind' or config.env_id == 'wall_expos':
                        diff_obs = obs[:, :, -(model.nagents + 8 + 1)]
                    elif config.env_id == 'turbulence':
                        diff_obs = obs[:, :, -(model.nagents + 2 + 1)]
                    else:
                        assert (False)

                    emerg_n = np.sum(diff_state**2, axis=-1) + diff_obs  # 12x4

                    env.oracle_update()

                    # obs: 12x4x20
                    # emerg_n: 12x4
                    for agent_i in range(model.nagents):
                        for agent_j in range(model.nagents):
                            #print(obs[:, agent_i, -agent_j])
                            #print(emerg_n[:, agent_j])
                            obs[:, agent_i, -agent_j] = emerg_n[:, agent_j]
                            #print(obs[:, agent_i, -agent_j])
                            #print(emerg_n[:, agent_j])
            # collect experience
            if prev_obs is not None:
                replay_buffer.push(prev_obs, agent_actions, rewards, obs,
                                   dones)

            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]

            next_obs, rewards, dones, infos = env.step(actions)

            if config.CCR:
                if act_n_t_minus_1:
                    for i in range(model.nagents):
                        for j in range(model.nagents):
                            # ccr_activates[-1] += 1
                            intrinsic_reward = np.linalg.norm(
                                next_obs[:, i, 2:4] - obs[:, j, 2:4],
                                axis=-1) - np.linalg.norm(
                                    obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1)
                            intrinsic_reward /= (1 + np.linalg.norm(
                                obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1))
                            intrinsic_reward *= (emerg_n[:, j] - emerg_n[:, i])
                            rewards[:, i] += 10 * intrinsic_reward / np.sqrt(
                                num_agents)
                            """
                            if (len(episode_rewards) == 2 or len(episode_rewards) == 2000 or len(episode_rewards) == 5000) and episode_step % 5 == 0:
                                Ls[i].append('      intrinsic reward = ' + str(intrinsic_reward) + '\n')
                            """
                            # if i == j: continue
                            # emerg_invalid = ~((emerg_n[:,j] > emerg_n[:,i]) & (emerg_n[:,j] > 0))
                            # ccr_activates[-1] += (~emerg_invalid).sum()
                            # intrinsic_reward = np.linalg.norm(next_obs[:,i,2:4] - obs[:,j,2:4], axis=-1) - np.linalg.norm(obs[:,i,2:4] - obs[:,j,2:4], axis=-1)
                            # intrinsic_reward[emerg_invalid] = 0
                            # rewards[:,i] += 10 * intrinsic_reward

                act_n_t_minus_1 = actions

            prev_obs = obs

            obs = next_obs

            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=None)
                    model.update_policies(sample, logger=None)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

        ls_num_collision = env.get_collision_and_zero_out()

        collisions.append(np.array(
            ls_num_collision).mean())  # might need to convert to np.int

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        ep_rews = np.array(ep_rews).mean()
        # save model, display training output

        print(
            "episodes: {}, mean episode reward: {}, mean number of collisions with wall: {}, ccr activates: {}, success numbers: {}, time: {}"
            .format(ep_i, ep_rews, np.mean(collisions[-config.save_rate:]),
                    np.mean(ccr_activates[-config.save_rate:]),
                    np.mean(success_nums[-config.save_rate:]),
                    round(time.time() - t_start, 3)))

        # Keep track of final episode reward
        final_ep_rewards.append(ep_rews)
        # final_ep_activates.append(np.mean(ccr_activates[-config.save_rate:]))
        final_ep_collisions.append(np.mean(collisions[-config.save_rate:]))
        final_ep_success_nums.append(np.mean(success_nums[-config.save_rate:]))
        if ep_i % config.save_rate == 0:
            x_axis = np.arange(0, ep_i + 1, step=12)
            # plot reward data
            rew_file_name = run_dir / 'rewards.png'

            plt.plot(x_axis, final_ep_rewards)
            plt.xlabel('training episode')
            plt.ylabel('reward')
            #plt.legend()
            plt.savefig(rew_file_name)

            plt.clf()

            collision_file_name = run_dir / 'collisions.png'

            plt.plot(x_axis, final_ep_collisions)
            plt.xlabel('training episode')
            plt.ylabel('number of collisions')
            #plt.legend()
            plt.savefig(collision_file_name)

            plt.clf()

            # activates_file_name = run_dir / 'activates.png'

            # plt.plot(x_axis, final_ep_activates)
            # plt.xlabel('training episode')
            # plt.ylabel('CCR activates')
            # #plt.legend()
            # plt.savefig(activates_file_name)

            # plt.clf()

            success_file_name = run_dir / 'successes.png'

            plt.plot(x_axis, final_ep_success_nums)
            plt.xlabel('training episode')
            plt.ylabel('success numbers')
            #plt.legend()
            plt.savefig(success_file_name)

            plt.clf()

            rew_file_name = run_dir
            collision_file_name = run_dir
            success_nums_file_name = run_dir
            activates_file_name = run_dir

            rew_file_name /= 'rewards.pkl'
            collision_file_name /= 'collisions.pkl'
            success_nums_file_name /= 'success_nums.pkl'
            # activates_file_name /= 'activates.pkl'

            with open(rew_file_name, 'wb') as fp:
                pickle.dump(final_ep_rewards, fp)
            with open(collision_file_name, 'wb') as fp:
                pickle.dump(final_ep_collisions, fp)

            # with open(activates_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_activates, fp)

            with open(success_nums_file_name, 'wb') as fp:
                pickle.dump(final_ep_success_nums, fp)

                plt.clf()

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
Example #7
def main():
    debug = 1
    if debug:
        numWolves = 3
        numSheep = 1
        numBlocks = 2
        sheepSpeedMultiplier = 1
        individualRewardWolf = 0
        costActionRatio = 0.0

    else:
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = int(condition['numWolves'])
        numSheep = int(condition['numSheeps'])
        numBlocks = int(condition['numBlocks'])

        sheepSpeedMultiplier = float(condition['sheepSpeedMultiplier'])
        individualRewardWolf = float(condition['individualRewardWolf'])
        costActionRatio = float(condition['costActionRatio'])

    modelName = "maac{}wolves{}sheep{}blocksSheepSpeed{}WolfActCost{}individ{}".format(
        numWolves, numSheep, numBlocks, sheepSpeedMultiplier, costActionRatio,
        individualRewardWolf)

    n_rollout_threads = 1
    buffer_length = int(1e6)
    n_episodes = 60000
    episode_length = 75
    steps_per_update = 100
    num_updates = 4
    batch_size = 1024
    save_interval = 1000
    pol_hidden_dim = 128
    critic_hidden_dim = 128
    attend_heads = 4
    pi_lr = 0.001
    q_lr = 0.001
    tau = 0.001
    gamma = 0.99
    reward_scale = 100.

    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [
        blockSize
    ] * numBlocks

    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    sheepMaxSpeedOriginal = 1.3
    sheepMaxSpeed = sheepMaxSpeedOriginal * sheepSpeedMultiplier
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    collisionReward = 10
    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID,
                              sheepsID,
                              entitiesSizeList,
                              getPosFromAgentState,
                              isCollision,
                              punishForOutOfBound,
                              collisionPunishment=collisionReward)

    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision,
                            collisionReward, individualRewardWolf)
    reshapeAction = ReshapeAction()
    getActionCost = GetActionCost(costActionRatio,
                                  reshapeAction,
                                  individualCost=True)
    getWolvesAction = lambda action: [action[wolfID] for wolfID in wolvesID]
    rewardWolfWithActionCost = lambda state, action, nextState: np.array(
        rewardWolf(state, action, nextState)) - np.array(
            getActionCost(getWolvesAction(action)))

    rewardFunc = lambda state, action, nextState: \
        list(rewardWolfWithActionCost(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)

    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)

    isTerminal = lambda state: [False] * numAgents

    initObsForParams = observe(reset())
    envObservationSpace = [
        initObsForParams[obsID].shape for obsID in range(len(initObsForParams))
    ]

    worldDim = 2
    envActionSpace = [
        spaces.Discrete(worldDim * 2 + 1) for agentID in range(numAgents)
    ]

    model_dir = os.path.join(dirName, 'models', 'chasing')
    os.makedirs(model_dir, exist_ok=True)  # ensure the save directory exists
    model = AttentionSAC.init_from_env(
        envActionSpace,
        envObservationSpace,
        tau=tau,
        pi_lr=pi_lr,
        q_lr=q_lr,
        gamma=gamma,
        pol_hidden_dim=pol_hidden_dim,  #128
        critic_hidden_dim=critic_hidden_dim,  #128
        attend_heads=attend_heads,  #4
        reward_scale=reward_scale)
    replay_buffer = ReplayBuffer(buffer_length, model.nagents, [
        obsp[0] if isinstance(obsp, tuple) else obsp.shape[0]
        for obsp in envObservationSpace
    ], [
        acsp.shape[0] if isinstance(acsp, Box) else acsp.n
        for acsp in envActionSpace
    ])
    t = 0

    for ep_i in range(0, n_episodes, n_rollout_threads):  #12
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + n_rollout_threads, n_episodes))
        state = reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(episode_length):
            obs = observe(state)
            obs = np.array([obs])
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(n_rollout_threads)]
            action = actions[0]
            nextState = transit(state, action)
            next_obs = np.array([observe(nextState)])
            rewards = np.array([rewardFunc(state, action, nextState)])
            dones = np.array([isTerminal(nextState)])

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            state = nextState
            t += n_rollout_threads
            if (len(replay_buffer) >= batch_size
                    and (t % steps_per_update) < n_rollout_threads
                ):  # 100 steps across rollouts -> 4 updates
                model.prep_training(device='cpu')

                for u_i in range(num_updates):  #4
                    sample = replay_buffer.sample(batch_size)
                    model.update_critic(sample)
                    model.update_policies(sample)
                    model.update_all_targets()

                model.prep_rollouts(device='cpu')

        if ep_i % save_interval < n_rollout_threads:
            model.prep_rollouts(device='cpu')
            model.save(os.path.join(model_dir, modelName + 'eps' + str(ep_i)))

    model.save(os.path.join(model_dir, modelName))
Example #8
def run(config):
    USE_CUDA = False
    if config.gpu:
        if torch.cuda.is_available():
            USE_CUDA = True
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    
#     model_run = 'run%i' % max(exst_run_nums)
#     model_path = model_dir / model_run / 'model.pt'

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num,
                            config.n_controlled_lagents, config.n_controlled_ragents, config.reward_type, config.render)
    model = AttentionSAC.init_from_env(env,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,
                                       critic_hidden_dim=config.critic_hidden_dim,
                                       attend_heads=config.attend_heads,
                                       reward_scale=config.reward_scale)
    
#     model = AttentionSAC.init_from_save_(model_path, load_critic=False, gpu=USE_CUDA)
    
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    best_rewards = 0
    t = 0
    num_episodes = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        
        if ep_i % (config.epoch_size * config.n_rollout_threads) == 0:
            stat = dict()
            stat['epoch'] = int(ep_i / (config.epoch_size * config.n_rollout_threads) + 1)
            
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        
        s = dict()
        s['dones'] = [0 for i in range(config.n_rollout_threads)]
        s['num_episodes'] = [0 for i in range(config.n_rollout_threads)]
        s['reward'] = [0 for i in range(config.n_rollout_threads)]
        s['success'] = [0 for i in range(config.n_rollout_threads)]
        s['steps_taken'] = [0 for i in range(config.n_rollout_threads)]
        s['reward_buffer'] = [0 for i in range(config.n_rollout_threads)]
        s['steps_buffer'] = [0 for i in range(config.n_rollout_threads)]

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=USE_CUDA)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
                
            for i in range(config.n_rollout_threads):
                s['reward'][i] += np.mean(rewards[i])
                s['steps_taken'][i] += 1
                if dones[i][0] == True:
                    s['dones'][i] += 1
                    s['num_episodes'][i] += 1
                    s['reward_buffer'][i] = s['reward'][i]
                    s['steps_buffer'][i] = s['steps_taken'][i]
                    if infos[i]['score_reward'] == 1:
                        s['success'][i] += 1
                if et_i == config.episode_length-1:
                    if dones[i][0] == False:
                        if s['dones'][i] > 0:
                            s['reward'][i] = s['reward_buffer'][i]
                            s['steps_taken'][i] = s['steps_buffer'][i]
                        else:
                            s['num_episodes'][i] += 1
                            
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        global_ep_rews = 0
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalars('agent%i/rewards' % a_i, {'mean_episode_rewards': a_ep_rew}, ep_i)
            global_ep_rews += a_ep_rew / (config.n_controlled_lagents + config.n_controlled_ragents)
        logger.add_scalars('global', {'global_rewards': global_ep_rews}, ep_i)
        
        if global_ep_rews > 0.007:
            model.save(run_dir / ('model_ep%i.pt' % ep_i))
#             print('model saved at ep%i' % ep_i)   
#             print('saved model reward: ', global_ep_rews)
        
        if global_ep_rews > best_rewards:
            best_rewards = global_ep_rews
            if best_rewards > 0.005:
                model.save(run_dir / ('best_model_ep%i.pt' % ep_i))
#                 print('best model saved at ep%i' % ep_i)
#                 print('best global reward: ', best_rewards)
                
#         if ep_i%500 == 0:
#             print('episode: ', ep_i)
#             print('global reward: ', global_ep_rews)
#             print('best global reward: ', best_rewards)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
            
        # An exact episode means a real episode in the game, rather than the episode in a training loop
        # Mean (exact) episode data are only generated from complete exact episodes
        # We calculate the mean (exact) episode data in each epoch
        # (config.epoch_size * config.n_rollout_threads) means the number of training episodes an epoch includes
        # The mean (exact) episode data are used for visualization and comparison
        # Reward, Steps-Taken, Success
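        # For example (values hypothetical): with epoch_size=10 and
        # n_rollout_threads=4, each printed epoch aggregates the statistics of
        # 40 training episodes.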

        stat['num_episodes'] = stat.get('num_episodes', 0) + np.sum(s['num_episodes'])
        stat['reward'] = stat.get('reward', 0) + np.sum(s['reward'])
        stat['success'] = stat.get('success', 0) + np.sum(s['success'])
        stat['steps_taken'] = stat.get('steps_taken', 0) + np.sum(s['steps_taken'])

        if (ep_i+config.n_rollout_threads) % (config.epoch_size * config.n_rollout_threads) == 0:
            num_episodes += stat['num_episodes']
            print('Epoch {}'.format(stat['epoch']))
            print('Episode: {}'.format(num_episodes))
            print('Reward: {}'.format(stat['reward']/stat['num_episodes']))
            print('Success: {:.2f}'.format(stat['success']/stat['num_episodes']))
            print('Steps-Taken: {:.2f}'.format(stat['steps_taken']/stat['num_episodes']))

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Example #9
File: MAAC.py Project: leehe228/TIL
def run(config):
    model_dir = Path('./models') / config["env_id"] / config["model_name"]
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config["env_id"], config["n_rollout_threads"],
                            run_num)
    model = AttentionSAC.init_from_env(
        env,
        tau=config["tau"],
        pi_lr=config["pi_lr"],
        q_lr=config["q_lr"],
        gamma=config["gamma"],
        pol_hidden_dim=config["pol_hidden_dim"],
        critic_hidden_dim=config["critic_hidden_dim"],
        attend_heads=config["attend_heads"],
        reward_scale=config["reward_scale"])
    replay_buffer = ReplayBuffer(config["buffer_length"], model.nagents,
                                 [115 for _ in range(11)],
                                 [19 for _ in range(11)])
    t = 0
    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config["n_rollout_threads"],
               config["n_episodes"]))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        done = [False]
        et_i = 0

        while not any(done):
            et_i += 1
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config["n_rollout_threads"])]

            actions_list = []
            for a in actions:
                temp = []
                for b in a:
                    temp.append(np.argmax(b))
                actions_list.append(temp)

            next_obs, rewards, done, infos = env.step(actions_list)

            dones = [done for _ in range(11)]

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config["n_rollout_threads"]
            if (len(replay_buffer) >= config["batch_size"]
                    and (t % config["steps_per_update"]) <
                    config["n_rollout_threads"]):
                if config["use_gpu"]:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config["num_updates"]):
                    sample = replay_buffer.sample(config["batch_size"],
                                                  to_gpu=config["use_gpu"])
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

            print("ep_i : {} | et_i : {}".format(ep_i, et_i), end='\r')

        ep_rews = replay_buffer.get_average_rewards(
            config["episode_length"] * config["n_rollout_threads"])

        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config["episode_length"], ep_i)

        if ep_i % config["save_interval"] < config["n_rollout_threads"]:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Example #10
0
def run(config):
    device = torch.device(
        'cuda:' + str(config.gpu) if torch.cuda.is_available() else 'cpu')
    model_dir = Path('./runs') / config.store_result_dir

    train_loader, train_drugs, train_Y = preprocess(config.dataset, config)

    print("number of data")
    print(len(train_loader))
    for it, original_pair in enumerate(train_loader):
        if not model_dir.exists():
            run_num = 1
        else:
            exst_run_nums = [
                int(str(folder.name).split('run')[1])
                for folder in model_dir.iterdir()
                if str(folder.name).startswith('run')
            ]
            if len(exst_run_nums) == 0:
                run_num = 1
            else:
                run_num = max(exst_run_nums) + 1
        curr_run = 'run%i' % run_num
        run_dir = model_dir / curr_run
        log_dir = run_dir / 'logs'
        os.makedirs(log_dir)
        logger = SummaryWriter(str(log_dir))

        torch.manual_seed(run_num)
        np.random.seed(run_num)

        print('Run pair number ', str(it))
        Hyperparams = Args()
        BasePath = './runs/' + config.store_result_dir
        writer = SummaryWriter(BasePath + '/plots')

        original_drug_smile = train_drugs[it]
        original_target_aff = train_Y[it]
        original_drug = original_pair
        original_target = original_pair.target[0]

        print('Original target:')
        print(original_target)
        print('Original molecule:')
        print(original_drug_smile)

        model_to_explain = mol_utils.get_graphdta_dgn().to(device)
        pred_aff, drug_original_encoding, prot_original_encoding = model_to_explain(
            original_drug.to(device),
            seq_cat(original_target).to(device))
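        # Affinity prediction (and intermediate encodings) of the original pair
        # from the model being explained; pred_aff is passed to the environment
        # below, presumably as the reference value the counterfactual rewards
        # are measured against.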
        atoms_ = np.unique([
            x.GetSymbol()
            for x in Chem.MolFromSmiles(original_drug_smile).GetAtoms()
        ])
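        # cof appears to be a set of weighting coefficients for the environment's
        # reward terms; the exact weighting is defined inside make_parallel_env.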
        cof = [1.0, 0.05, 0.01, 0.05]
        env = make_parallel_env(original_drug_smile, original_target,
                                Hyperparams, atoms_, model_to_explain,
                                original_drug, original_target_aff, pred_aff,
                                device, cof)
        model = AttentionSAC.init_from_env(
            env,
            tau=config.tau,
            pi_lr=config.pi_lr,
            q_lr=config.q_lr,
            gamma=config.gamma,
            pol_hidden_dim=config.pol_hidden_dim,
            critic_hidden_dim=config.critic_hidden_dim,
            attend_heads=config.attend_heads,
            reward_scale=config.reward_scale)
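        # Here the per-agent observation/action sizes are taken directly from the
        # custom environment's observation_space / action_space (apparently plain
        # tuples and ints rather than gym spaces, hence no .shape / .n access).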
        replay_buffer = ReplayBuffer(
            config.buffer_length, model.nagents,
            [obsp[0] for obsp in env.observation_space],
            [acsp for acsp in env.action_space])

        if not os.path.isdir(BasePath + "/counterfacts"):
            os.makedirs(BasePath + "/counterfacts")
        mol_utils.TopKCounterfactualsDTA.init(original_drug_smile, it,
                                              BasePath + "/counterfacts")

        t = 0
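        # Each episode is a single environment step; the agents propose one joint
        # drug/protein modification per episode (inferred from the loop below).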
        episode_length = 1
        trg = trange(0, config.n_episodes, config.n_rollout_threads)
        for ep_i in trg:
            obs = env.reset()
            model.prep_rollouts(device='cpu')

            for et_i in range(episode_length):
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [
                    Variable(torch.Tensor(np.vstack(obs[:, i])),
                             requires_grad=False) for i in range(model.nagents)
                ]
                # get actions as torch Variables
                torch_agent_actions = model.step(torch_obs, explore=True)
                # convert actions to numpy arrays
                agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
                # rearrange actions to be per environment
                actions = [[ac[i] for ac in agent_actions]
                           for i in range(config.n_rollout_threads)]
                next_obs, results, dones, action_drug, action_prot = env.step(
                    actions)
                # Each agent's result tuple is (reward, loss, gain, drug_sim,
                # prot_sim, qed); the second unpacking reuses the same diagnostic
                # names, so the values logged below come from the protein entry.
                drug_reward, loss_, gain, drug_sim, prot_sim, qed = results[0][0]
                prot_reward, loss_, gain, drug_sim, prot_sim, qed = results[0][1]

                writer.add_scalar('DTA/Reward', drug_reward, ep_i)
                writer.add_scalar('DTA/Distance', loss_, ep_i)
                writer.add_scalar('DTA/Drug Similarity', drug_sim, ep_i)
                writer.add_scalar('DTA/Drug QED', qed, ep_i)
                writer.add_scalar('DTA/Protein Similarity', prot_sim, ep_i)

                # Pack the drug-agent and protein-agent rewards into a
                # (1 env, 2 agents) array, the layout the replay buffer expects.
                rewards = np.array([[drug_reward, prot_reward]])
                replay_buffer.push(obs, agent_actions, rewards, next_obs,
                                   dones)
                obs = next_obs
                t += 1
                if (len(replay_buffer) >= config.batch_size
                        and (t % config.steps_per_update) < 1):
                    if config.use_gpu:
                        model.prep_training(device='gpu')
                    else:
                        model.prep_training(device='cpu')
                    for u_i in range(config.num_updates):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=config.use_gpu)
                        model.update_critic(sample, logger=logger)
                        model.update_policies(sample, logger=logger)
                        model.update_all_targets()
                    model.prep_rollouts(device='cpu')
                if np.all(dones):
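                    # Positions where the proposed protein sequence differs from
                    # the original target.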
                    mutate_position = [
                        i for i in range(len(original_target))
                        if original_target[i] != action_prot[i]
                    ]
                    trg.set_postfix(Reward=drug_reward,
                                    DrugSim=drug_sim,
                                    TargetSim=prot_sim,
                                    SMILES=action_drug,
                                    TargetMutatePosition=mutate_position,
                                    refresh=True)
                    mol_utils.TopKCounterfactualsDTA.insert({
                        'smiles': action_drug,
                        'protein': action_prot,
                        'drug_reward': drug_reward,
                        'protein_reward': prot_reward,
                        'loss': loss_,
                        'gain': gain,
                        'drug sim': drug_sim,
                        'drug qed': qed,
                        'prot sim': prot_sim,
                        'mutate position': mutate_position
                    })
            ep_rews = replay_buffer.get_average_rewards(episode_length * 1)
            for a_i, a_ep_rew in enumerate(ep_rews):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  a_ep_rew * episode_length, ep_i)

            if ep_i % config.save_interval < config.n_rollout_threads:
                model.prep_rollouts(device='cpu')
                os.makedirs(run_dir / 'incremental', exist_ok=True)
                model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                      (ep_i + 1)))
                model.save(run_dir / 'model.pt')

        model.save(run_dir / 'model.pt')
        env.close()
        logger.export_scalars_to_json(str(log_dir / 'summary.json'))
        logger.close()