Code Example #1
def config_log(FLAGS):
    logdir = "tensorboard/%s/hrl_a2c_svib/%s_lr%s_%s/%s_%s_%s" % (
        FLAGS.env,FLAGS.num_timesteps, '0.0007',FLAGS.policy, start_time, FLAGS.train_option, str(FLAGS.beta))
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(dir=logdir, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(dir=logdir, output_formats=[HumanOutputFormat(sys.stdout)])
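All the examples on this page share the same setup: bind Logger.DEFAULT and Logger.CURRENT to a Logger whose output_formats list decides where records go. Below is a minimal, self-contained sketch of that pattern, assuming the OpenAI-baselines-style logger module these snippets use; make_logger is an illustrative name, not part of the projects above.

import sys

from baselines.logger import (Logger, HumanOutputFormat,
                              TensorBoardOutputFormat)


def make_logger(log_mode, logdir):
    """Route log records to TensorBoard event files or to stdout."""
    if log_mode == "tensorboard":
        fmt = TensorBoardOutputFormat(logdir)  # event files under logdir
    else:
        fmt = HumanOutputFormat(sys.stdout)  # human-readable key/value tables
    # Binding both DEFAULT and CURRENT makes the module-level helpers
    # (record_tabular, dump_tabular, ...) write through this instance.
    Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[fmt])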
Code Example #2
def main():
    FLAGS(sys.argv)
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])
    print("env : %s" % FLAGS.env)
    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("lr : %s" % FLAGS.lr)
    # Choose which RL algorithm to train.
    if FLAGS.algorithm == "deepq":  # Use DQN
        train_dqn(env_id=FLAGS.env, num_timesteps=FLAGS.timesteps)
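FLAGS(sys.argv) is the absl-style flag-parsing idiom; the flag definitions themselves live at module level, outside these excerpts. The sketch below shows definitions matching the names used above; the defaults are illustrative assumptions, not the projects' actual values.

from absl import flags

FLAGS = flags.FLAGS

flags.DEFINE_string("env", "CartPole-v1", "Environment name.")
flags.DEFINE_string("algorithm", "deepq", "RL algorithm to train.")
flags.DEFINE_string("log", "tensorboard", "Log backend: tensorboard or stdout.")
flags.DEFINE_integer("timesteps", 2000000, "Total training timesteps.")
flags.DEFINE_float("exploration_fraction", 0.5,
                   "Fraction of training over which epsilon is annealed.")
flags.DEFINE_boolean("prioritized", True, "Use prioritized experience replay.")
flags.DEFINE_boolean("dueling", True, "Use a dueling network head.")
flags.DEFINE_float("lr", 5e-4, "Learning rate.")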
Code Example #3
def main():
  FLAGS(sys.argv)

  logdir = "tensorboard"
  if(FLAGS.algorithm == "deepq"):
    logdir = "./tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.exploration_fraction,
      FLAGS.prioritized,
      FLAGS.dueling,
      FLAGS.lr,
      start_time
    )

  if(FLAGS.log == "tensorboard"):
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir='log.txt',
               output_formats=[TensorBoardOutputFormat(logdir)])

  elif(FLAGS.log == "stdout"):
    os.mkdir(logdir)
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[HumanOutputFormat(logdir+"/log.txt")])

  with sc2_env.SC2Env(
      map_name="DefeatZerglingsAndBanelings",
      minimap_size_px=(FLAGS.minimap_size_px, FLAGS.minimap_size_px),
      step_mul=FLAGS.step_mul,
      visualize=FLAGS.visualize,
      game_steps_per_episode=FLAGS.episode_steps) as env:

    model = deepq.models.cnn_to_mlp(
      convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1), (64, 3, 1), (64, 3, 1), (32, 3, 1)],
      hiddens=[256],
      dueling=True
    )

    act = dqfd.learn(
      env,
      q_func=model,
      num_actions=FLAGS.num_actions,
      lr=FLAGS.lr,
      print_freq=FLAGS.print_freq,
      max_timesteps=FLAGS.timesteps,
      buffer_size=FLAGS.buffer_size,
      exploration_fraction=FLAGS.exploration_fraction,
      exploration_final_eps=FLAGS.exploration_final_eps,
      train_freq=FLAGS.train_freq,
      learning_starts=FLAGS.learning_starts,
      target_network_update_freq=FLAGS.target_network_update_freq,
      gamma=FLAGS.gamma,
      prioritized_replay=FLAGS.prioritized,
      callback=deepq_callback
    )
    act.save("defeat_zerglings.pkl")
Code Example #4
def main():
    FLAGS(sys.argv)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            visualize=True,
            agent_interface_format=sc2_env.AgentInterfaceFormat(
                feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)),
            game_steps_per_episode=steps * step_mul) as env:
        obs = env.reset()
        #print(obs[0].observation)
        model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2),
                                               (64, 3, 1)],
                                        hiddens=[256],
                                        dueling=True)
        demo_replay = []
        act = dqfd.learn(env,
                         q_func=model,
                         num_actions=3,
                         lr=1e-4,
                         max_timesteps=10000000,
                         buffer_size=100000,
                         exploration_fraction=0.5,
                         exploration_final_eps=0.01,
                         train_freq=2,
                         learning_starts=100000,
                         target_network_update_freq=1000,
                         gamma=0.99,
                         prioritized_replay=True,
                         callback=deepq_callback)
        act.save("defeat_zerglings.pkl")
Code Example #5
File: main.py  Project: yaqingwang/mlsh-gpu
def main():
    if osp.exists(LOGDIR):
        shutil.rmtree(LOGDIR)
    os.makedirs(LOGDIR)
    if not osp.exists(CKPTDIR):
        os.makedirs(CKPTDIR)
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None,
        output_formats=[
            HumanOutputFormat(sys.stdout),
            CSVOutputFormat(osp.join(LOGDIR, 'log.csv'))
        ])
    train()
Code Example #6
File: my_test.py  Project: shockley/pysc2-examples
def main():
    FLAGS(sys.argv)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            visualize=True,
            agent_interface_format=sc2_env.AgentInterfaceFormat(
                feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)),
            game_steps_per_episode=steps * step_mul) as env:

        print(env.observation_spec())
        screen_dim = env.observation_spec()[0]['feature_screen'][1:3]
        print(screen_dim)
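For reference: pysc2's env.observation_spec() returns one spec per agent, and the feature_screen entry is shaped (channels, height, width), so the slice [1:3] above yields the spatial resolution. A small illustrative helper, assuming that spec layout:

def screen_resolution(env):
    """Return (height, width) of feature_screen for the first agent."""
    spec = env.observation_spec()[0]  # one spec per agent
    channels, height, width = spec['feature_screen']
    return height, width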
Code Example #7
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if (FLAGS.lr == 0):
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if (FLAGS.algorithm == "deepq-4way"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "a2c"):
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):

        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            visualize=True,
                            screen_size_px=(16, 16),
                            minimap_size_px=(16, 16)) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "deepq-4way"):

        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            screen_size_px=(32, 32),
                            minimap_size_px=(32, 32),
                            visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "a2c"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
Code Example #8
File: train.py  Project: amyzhang/level-replay
def train(args, seeds):
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")
    if 'cuda' in device.type:
        print('Using CUDA\n')

    torch.set_num_threads(1)

    utils.seed(args.seed)

    # Configure logging
    if args.xpid is None:
        args.xpid = "lr-%s" % time.strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.expandvars(os.path.expanduser(args.log_dir))
    plogger = FileWriter(
        xpid=args.xpid, xp_args=args.__dict__, rootdir=log_dir,
        seeds=seeds,
    )
    stdout_logger = HumanOutputFormat(sys.stdout)

    checkpointpath = os.path.expandvars(
        os.path.expanduser("%s/%s/%s" % (log_dir, args.xpid, "model.tar"))
    )

    global last_checkpoint_time
    current_update_count = 0
    initial_update_count = 0
    last_logged_update_count_at_restart = -1

    start_level = 0
    num_levels = 1
    level_sampler_args = dict(
        num_actors=args.num_processes,
        strategy=args.level_replay_strategy,
        max_score_coef=args.level_replay_max_score_coef,
        replay_schedule=args.level_replay_schedule,
        score_transform=args.level_replay_score_transform,
        temperature=args.level_replay_temperature,
        eps=args.level_replay_eps,
        rho=args.level_replay_rho,
        replay_prob=args.level_replay_prob, 
        alpha=args.level_replay_alpha,
        staleness_coef=args.staleness_coef,
        staleness_transform=args.staleness_transform,
        staleness_temperature=args.staleness_temperature,
        sample_full_distribution=args.train_full_distribution,
        seed_buffer_size=args.level_replay_seed_buffer_size,
        seed_buffer_priority=args.level_replay_seed_buffer_priority,
        tscl_window_size=args.tscl_window_size)

    level_sampler_secondary_args = {}
    if args.level_replay_secondary_strategy:
        level_sampler_secondary_args = dict(
            strategy=args.level_replay_secondary_strategy,
            score_transform=args.level_replay_secondary_score_transform,
            temperature=args.level_replay_secondary_temperature,
            eps=args.level_replay_secondary_eps,
            staleness_coef=args.secondary_staleness_coef,
            staleness_transform=args.secondary_staleness_transform,
            staleness_temperature=args.secondary_staleness_temperature,)
        args_tmp = level_sampler_args.copy()
        args_tmp.update(level_sampler_secondary_args)
        level_sampler_secondary_args = args_tmp

    envs, level_sampler, secondary_level_sampler = make_lr_venv(
            num_envs=args.num_processes, env_name=args.env_name,
            seeds=seeds, device=device,
            num_levels=num_levels, 
            start_level=start_level,
            no_ret_normalization=args.no_ret_normalization,
            distribution_mode=args.distribution_mode,
            paint_vel_info=args.paint_vel_info,
            level_sampler_args=level_sampler_args,
            level_sampler_secondary_args=level_sampler_secondary_args,
            level_replay_strategy_mix_coef=args.level_replay_strategy_mix_coef)
    
    is_minigrid = args.env_name.startswith('MiniGrid')

    actor_critic = model_for_env_name(args, envs)       
    actor_critic.to(device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                envs.observation_space.shape, envs.action_space,
                                actor_critic.recurrent_hidden_state_size)
        
    batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch)

    aug_id = None
    if args.algo == 'ucb' or args.use_ucb:
        print('Using UCB')
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size) 
            for t in list(aug_to_func.keys())]

        mix_alpha = None if not args.use_mixreg else args.mixreg_alpha

        agent = algo.UCBDrAC(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            aug_list=aug_list,
            aug_id=aug_id,
            aug_coef=args.aug_coef,
            num_aug_types=len(list(aug_to_func.keys())),
            ucb_exploration_coef=args.ucb_exploration_coef,
            ucb_window_length=args.ucb_window_length,
            mix_alpha=mix_alpha,
            log_grad_norm=args.log_grad_norm)
    elif args.algo == 'mixreg' or args.use_mixreg:
        agent = algo.MixRegPPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            log_grad_norm=args.log_grad_norm,
            mix_alpha=args.mixreg_alpha)
    else:
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            log_grad_norm=args.log_grad_norm)

    level_seeds = torch.zeros(args.num_processes)
    if level_sampler:
        obs, level_seeds = envs.reset()
    else:
        obs = envs.reset()
    level_seeds = level_seeds.unsqueeze(-1)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    def checkpoint():
        if args.disable_checkpoint:
            return
        logging.info("Saving checkpoint to %s", checkpointpath)

        checkpoint_states = {
            "model_state_dict": actor_critic.state_dict(),
            "optimizer_state_dict": agent.optimizer.state_dict(),
            "rollouts": rollouts,
            "episode_rewards": episode_rewards,
            "level_sampler": level_sampler,
            "current_update_count": current_update_count
        }

        if hasattr(agent, 'bandit_state_dict'):
            checkpoint_states.update({
                "bandit_state_dict": agent.bandit_state_dict()
            })

        torch.save(
            checkpoint_states,
            checkpointpath
        )

    # Load checkpoint
    if args.checkpoint and os.path.exists(checkpointpath):
        checkpoint_states = torch.load(checkpointpath)

        actor_critic.load_state_dict(checkpoint_states['model_state_dict'])
        agent.optimizer.load_state_dict(checkpoint_states["optimizer_state_dict"])
        rollouts = checkpoint_states["rollouts"]
        episode_rewards = checkpoint_states["episode_rewards"]
        level_sampler = checkpoint_states["level_sampler"]
        current_update_count = checkpoint_states["current_update_count"]
        initial_update_count = current_update_count

        last_logged_update_count_at_restart = plogger.latest_tick() + 1 # ticks are 0-indexed updates

        if hasattr(agent, 'load_bandit_state_dict'):
            agent.load_bandit_state_dict(checkpoint_states["bandit_state_dict"])

        logging.info(f"Resuming preempted job from update: {current_update_count}\n")

    timer = timeit.default_timer
    update_start_time = timer()
    agent_id = 0 # np.random.choice(range(actor_critic.ensemble_size))
    for j in range(initial_update_count, num_updates):
        actor_critic.train()
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                obs_id = rollouts.obs[step]
                if aug_id:
                    obs_id = aug_id(obs_id)
                value, action, action_log_dist, recurrent_hidden_states = actor_critic.act(
                    obs_id, rollouts.recurrent_hidden_states[step], rollouts.masks[step], agent_id=agent_id)
                action_log_prob = action_log_dist.gather(-1, action)
                uncertainties = actor_critic.get_uncertainty(obs_id, rollouts.recurrent_hidden_states[step], rollouts.masks[step])

            # Observed reward and next obs
            obs, reward, done, infos = envs.step(action)

            # Reset all done levels by sampling from level sampler
            for i, info in enumerate(infos):
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

                if level_sampler:
                    level_seeds[i][0] = info['level_seed']

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(
                obs, recurrent_hidden_states, 
                action, action_log_prob, action_log_dist, 
                value, reward, masks, bad_masks, uncertainties, level_seeds)

        with torch.no_grad():
            obs_id = rollouts.obs[-1]
            if aug_id:
                obs_id = aug_id(obs_id)
            next_value = actor_critic.get_value(
                obs_id, rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
        
        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda)

        # Update level sampler
        if level_sampler:
            level_sampler.update_with_rollouts(rollouts)

        if secondary_level_sampler:
            secondary_level_sampler.update_with_rollouts(rollouts)

        if args.use_ucb and j > 0:
            agent.update_ucb_values(rollouts)
        value_loss, action_loss, dist_entropy, info = agent.update(rollouts)
        rollouts.after_update()
        if level_sampler:
            level_sampler.after_update()

        if secondary_level_sampler:
            secondary_level_sampler.after_update()

        current_update_count = j + 1

        # ==== Everything below here is for logging + checkpointing ====
        if current_update_count <= last_logged_update_count_at_restart:
            continue    

        # Log stats every log_interval updates or if it is the last update
        if (j % args.log_interval == 0 and len(episode_rewards) > 1) or j == num_updates - 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps

            update_end_time = timer()
            num_interval_updates = 1 if j == 0 else args.log_interval
            sps = num_interval_updates*(args.num_processes * args.num_steps) / (update_end_time - update_start_time)
            update_start_time = update_end_time

            logging.info(f"\nUpdate {j} done, {total_num_steps} steps\n  ")
            logging.info(f"\nEvaluating on {args.num_test_seeds} test levels...\n  ")
            eval_episode_rewards = evaluate(
                args, 
                actor_critic, 
                args.num_test_seeds, 
                device, 
                aug_id=aug_id)

            logging.info(f"\nEvaluating on {args.num_test_seeds} train levels...\n  ")
            train_eval_episode_rewards = evaluate(
                args, 
                actor_critic, 
                args.num_test_seeds, # Use same number of levels for evaluating train and test seeds
                device, 
                start_level=0, 
                num_levels=args.num_train_seeds, 
                seeds=seeds, 
                aug_id=aug_id)

            stats = { 
                "step": total_num_steps,
                "pg_loss": action_loss,
                "value_loss": value_loss,
                "dist_entropy": dist_entropy,
                "train:mean_episode_return": np.mean(episode_rewards),
                "train:median_episode_return": np.median(episode_rewards),
                "test:mean_episode_return": np.mean(eval_episode_rewards),
                "test:median_episode_return": np.median(eval_episode_rewards),
                "train_eval:mean_episode_return": np.mean(train_eval_episode_rewards),
                "train_eval:median_episode_return": np.median(train_eval_episode_rewards),
                "sps": sps,
            }

            if args.log_grad_norm:
                stats.update({
                    "mean_grad_norm": np.mean(info['grad_norms'])
                })

            if is_minigrid:
                stats["train:success_rate"] = np.mean(np.array(episode_rewards) > 0)
                stats["train_eval:success_rate"] = np.mean(np.array(train_eval_episode_rewards) > 0)
                stats["test:success_rate"] = np.mean(np.array(eval_episode_rewards) > 0)

            if j == num_updates - 1:
                logging.info(f"\nLast update: Evaluating on {args.num_test_seeds} test levels...\n  ")
                final_eval_episode_rewards = evaluate(args, actor_critic, args.final_num_test_seeds, device)

                mean_final_eval_episode_rewards = np.mean(final_eval_episode_rewards)
                median_final_eval_episode_rewards = np.median(final_eval_episode_rewards)
                
                plogger.log_final_test_eval({
                    'num_test_seeds': args.final_num_test_seeds,
                    'mean_episode_return': mean_final_eval_episode_rewards,
                    'median_episode_return': median_final_eval_episode_rewards
                })

            plogger.log(stats)
            if args.verbose:
                stdout_logger.writekvs(stats)

        # Log level weights
        if level_sampler and j % args.weight_log_interval == 0:
            plogger.log_level_weights(level_sampler.sample_weights(), level_sampler.seeds)

        # Checkpoint 
        timer = timeit.default_timer
        if last_checkpoint_time is None:
            last_checkpoint_time = timer()
        try:
            if j == num_updates - 1 or \
                (args.save_interval > 0 and timer() - last_checkpoint_time > args.save_interval * 60):  # Save every args.save_interval minutes.
                checkpoint()
                last_checkpoint_time = timer()
                logging.info(f"\nSaved checkpoint after update {current_update_count}")
        except KeyboardInterrupt:
            return
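Example #8's checkpoint() saves the optimizer, rollouts, and level-sampler state alongside the model weights so a preempted job can resume from current_update_count instead of restarting. A stripped-down sketch of the same save/resume round trip follows; the path and function names are illustrative.

import os

import torch


def save_checkpoint(path, model, optimizer, update_count):
    """Persist everything needed to resume training mid-run."""
    torch.save({
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "current_update_count": update_count,
    }, path)


def load_checkpoint(path, model, optimizer):
    """Return the update count to resume from (0 when no checkpoint exists)."""
    if not os.path.exists(path):
        return 0
    state = torch.load(path)
    model.load_state_dict(state["model_state_dict"])
    optimizer.load_state_dict(state["optimizer_state_dict"])
    return state["current_update_count"]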
Code Example #9
def main():
    FLAGS(sys.argv)

    steps = 0  #Test steps

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":

        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=16, minimap=16))
        # temp solution - sc2_env.Agent(sc2_env.Race.terran) might be too restricting
        # We need this change because sc2 now requires specifying players.
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[
                    sc2_env.Agent(race=sc2_env.Race.terran),
                    sc2_env.Agent(race=sc2_env.Race.terran)
                ],
                step_mul=step_mul,
                visualize=True,
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)

            acts = deepq_nexus_wars.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)

            agent = random_agent.RandomAgent()
            run_loop.run_loop([agent], env, steps)

            acts[0].save("mineral_shards_x.pkl")
            acts[1].save("mineral_shards_y.pkl")

    elif FLAGS.algorithm == "deepq-4way":

        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(map_name="Simple64",
                            players=[
                                sc2_env.Agent(race=sc2_env.Race.terran),
                                sc2_env.Agent(race=sc2_env.Race.terran)
                            ],
                            step_mul=step_mul,
                            agent_interface_format=AGENT_INTERFACE_FORMAT,
                            visualize=True) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
Code Example #10
File: train.py  Project: dibyaghosh/level-replay
def train(args, seeds):
    global last_checkpoint_time
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")
    if 'cuda' in device.type:
        print('Using CUDA\n')

    torch.set_num_threads(1)

    utils.seed(args.seed)

    # Configure logging
    if args.xpid is None:
        args.xpid = "lr-%s" % time.strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.expandvars(os.path.expanduser(args.log_dir))
    plogger = FileWriter(
        xpid=args.xpid,
        xp_args=args.__dict__,
        rootdir=log_dir,
        seeds=seeds,
    )
    stdout_logger = HumanOutputFormat(sys.stdout)

    checkpointpath = os.path.expandvars(
        os.path.expanduser("%s/%s/%s" % (log_dir, args.xpid, "model.tar")))

    # Configure actor envs
    start_level = 0
    if args.full_train_distribution:
        num_levels = 0
        level_sampler_args = None
        seeds = None
    else:
        num_levels = 1
        level_sampler_args = dict(
            num_actors=args.num_processes,
            strategy=args.level_replay_strategy,
            replay_schedule=args.level_replay_schedule,
            score_transform=args.level_replay_score_transform,
            temperature=args.level_replay_temperature,
            eps=args.level_replay_eps,
            rho=args.level_replay_rho,
            nu=args.level_replay_nu,
            alpha=args.level_replay_alpha,
            staleness_coef=args.staleness_coef,
            staleness_transform=args.staleness_transform,
            staleness_temperature=args.staleness_temperature)
    envs, level_sampler = make_lr_venv(
        num_envs=args.num_processes,
        env_name=args.env_name,
        seeds=seeds,
        device=device,
        num_levels=num_levels,
        start_level=start_level,
        no_ret_normalization=args.no_ret_normalization,
        distribution_mode=args.distribution_mode,
        paint_vel_info=args.paint_vel_info,
        level_sampler_args=level_sampler_args)

    is_minigrid = args.env_name.startswith('MiniGrid')

    actor_critic = model_for_env_name(args, envs)
    actor_critic.to(device)
    print(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch)

    def checkpoint():
        if args.disable_checkpoint:
            return
        logging.info("Saving checkpoint to %s", checkpointpath)
        torch.save(
            {
                "model_state_dict": actor_critic.state_dict(),
                "optimizer_state_dict": agent.optimizer.state_dict(),
                "args": vars(args),
            },
            checkpointpath,
        )

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm,
                     env_name=args.env_name)

    level_seeds = torch.zeros(args.num_processes)
    if level_sampler:
        obs, level_seeds = envs.reset()
    else:
        obs = envs.reset()
    level_seeds = level_seeds.unsqueeze(-1)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    timer = timeit.default_timer
    update_start_time = timer()
    for j in range(num_updates):
        actor_critic.train()
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                obs_id = rollouts.obs[step]
                value, action, action_log_dist, recurrent_hidden_states = actor_critic.act(
                    obs_id, rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                action_log_prob = action_log_dist.gather(-1, action)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            # Reset all done levels by sampling from level sampler
            for i, info in enumerate(infos):
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

                if level_sampler:
                    level_seeds[i][0] = info['level_seed']

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, action_log_dist, value, reward,
                            masks, bad_masks, level_seeds)

        with torch.no_grad():
            obs_id = rollouts.obs[-1]
            next_value = actor_critic.get_value(
                obs_id, rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.gae_lambda)

        # Update level sampler
        if level_sampler:
            level_sampler.update_with_rollouts(rollouts)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()
        if level_sampler:
            level_sampler.after_update()

        # Log stats every log_interval updates or if it is the last update
        if (j % args.log_interval == 0
                and len(episode_rewards) > 1) or j == num_updates - 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps

            update_end_time = timer()
            num_interval_updates = 1 if j == 0 else args.log_interval
            sps = num_interval_updates * (args.num_processes *
                                          args.num_steps) / (update_end_time -
                                                             update_start_time)
            update_start_time = update_end_time

            logging.info(f"\nUpdate {j} done, {total_num_steps} steps\n  ")
            logging.info(
                f"\nEvaluating on {args.num_test_seeds} test levels...\n  ")
            eval_episode_rewards, transitions = evaluate(
                args, actor_critic, args.num_test_seeds, device)
            plogger._save_data(transitions, f'test_trajectories_{j}.pkl')

            logging.info(
                f"\nEvaluating on {args.num_test_seeds} train levels...\n  ")
            train_eval_episode_rewards, transitions = evaluate(
                args,
                actor_critic,
                args.num_test_seeds,
                device,
                start_level=0,
                num_levels=args.num_train_seeds,
                seeds=seeds,
                level_sampler=level_sampler)

            stats = {
                "step": total_num_steps,
                "pg_loss": action_loss,
                "value_loss": value_loss,
                "dist_entropy": dist_entropy,
                "train:mean_episode_return": np.mean(episode_rewards),
                "train:median_episode_return": np.median(episode_rewards),
                "test:mean_episode_return": np.mean(eval_episode_rewards),
                "test:median_episode_return": np.median(eval_episode_rewards),
                "train_eval:mean_episode_return": np.mean(train_eval_episode_rewards),
                "train_eval:median_episode_return": np.median(train_eval_episode_rewards),
                "sps": sps,
            }
            if is_minigrid:
                stats["train:success_rate"] = np.mean(
                    np.array(episode_rewards) > 0)
                stats["train_eval:success_rate"] = np.mean(
                    np.array(train_eval_episode_rewards) > 0)
                stats["test:success_rate"] = np.mean(
                    np.array(eval_episode_rewards) > 0)

            if j == num_updates - 1:
                logging.info(
                    f"\nLast update: Evaluating on {args.num_test_seeds} test levels...\n  "
                )
                final_eval_episode_rewards, transitions = evaluate(
                    args, actor_critic, args.final_num_test_seeds, device)

                mean_final_eval_episode_rewards = np.mean(
                    final_eval_episode_rewards)
                median_final_eval_episode_rewards = np.median(
                    final_eval_episode_rewards)

                plogger.log_final_test_eval({
                    'num_test_seeds': args.final_num_test_seeds,
                    'mean_episode_return': mean_final_eval_episode_rewards,
                    'median_episode_return': median_final_eval_episode_rewards
                })

            plogger.log(stats)
            if args.verbose:
                stdout_logger.writekvs(stats)

        # Log level weights
        if level_sampler and j % args.weight_log_interval == 0:
            plogger.log_level_weights(level_sampler.sample_weights())

        # Checkpoint
        timer = timeit.default_timer
        if last_checkpoint_time is None:
            last_checkpoint_time = timer()
        try:
            if j == num_updates - 1 or \
                (args.save_interval > 0 and timer() - last_checkpoint_time > args.save_interval * 60):  # Save every args.save_interval minutes.
                checkpoint()
                last_checkpoint_time = timer()
        except KeyboardInterrupt:
            return
Code Example #11
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_cpu : %s" % FLAGS.num_cpu)
    print("lr : %s" % FLAGS.lr)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/mineral/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):

        with sc2_env.SC2Env("CollectMineralShards",
                            step_mul=step_mul,
                            visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_shards.learn(env,
                                             q_func=model,
                                             num_actions=64,
                                             lr=1e-3,
                                             max_timesteps=20000000,
                                             buffer_size=10000,
                                             exploration_fraction=0.5,
                                             exploration_final_eps=0.01,
                                             train_freq=4,
                                             learning_starts=10000,
                                             target_network_update_freq=1000,
                                             gamma=0.99,
                                             prioritized_replay=True,
                                             callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "acktr"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        # def make_env(rank):
        #   # env = sc2_env.SC2Env(
        #   #   "CollectMineralShards",
        #   #   step_mul=step_mul)
        #   # return env
        #   #env.seed(seed + rank)
        #   def _thunk():
        #     env = sc2_env.SC2Env(
        #         map_name=FLAGS.map,
        #         step_mul=step_mul,
        #         visualize=True)
        #     #env.seed(seed + rank)
        #     if logger.get_dir():
        #      env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
        #     return env
        #   return _thunk

        # agents = [Agent()
        #           for _ in range(num_cpu)]
        #
        # for agent in agents:
        #   time.sleep(1)
        #   agent.daemon = True
        #   agent.start()

        # agent_controller = AgentController(agents)

        #set_global_seeds(seed)
        env = SubprocVecEnv(FLAGS.num_cpu, FLAGS.map)

        policy_fn = CnnPolicy
        acktr_disc.learn(policy_fn,
                         env,
                         seed,
                         total_timesteps=num_timesteps,
                         nprocs=FLAGS.num_cpu,
                         ent_coef=0.1,
                         callback=acktr_callback)
Code Example #12
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if (FLAGS.lr == 0):
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if (FLAGS.algorithm == "deepq-4way"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "a2c"):
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            # Sets interface.feature_layer.resolution and
            # interface.feature_layer.minimap_resolution.
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)  # previously 16, 16
        )
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,  # game speed: how fast the game advances, roughly a human player's effective actions per second
                visualize=True,
                # screen_size_px=(16, 16),
                # minimap_size_px=(16, 16)) as env:
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = deepq.models.cnn_to_mlp(
                # This model takes an observation as input and returns values
                # for all actions; note how deepq_mineral_shards.learn uses it.
                convs=[(16, 8, 4), (32, 4, 2)],  # (num_filters, kernel_size, stride)
                hiddens=[256],
                dueling=True)
            # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[512], dueling=True
            act = deepq_mineral_shards.learn(  # train the model, then save it
                # act = deepq_ActSeparate.learn(
                # act = deepq_actSeparateWith4Directions.learn(
                # act = deepq_actionGroup_4way.learn(
                # act = deep_DiffActInSameTime.learn(
                env,
                q_func=model,
                num_actions=4,  # default 16; other values tried: 256, 3, 4
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_actSeparateWith4Directions_callback
            )  # alternatives: deepq_callback, deepq_ActSeperate_callback, deep_DiffActInSameTime_callback
            # After all training steps, save the trained model to
            # mineral_shards.pkl for use by enjoy_mineral_shards.py.
            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "deepq-4way"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(  #
                map_name="CollectMineralShards",
                step_mul=step_mul,
                # screen_size_px=(32, 32),
                # minimap_size_px=(32, 32),
                save_replay_episodes=2,
                replay_dir="D:/StarCraft II/StarCraft II/video",
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)
            # model = deepq.models.mlp(hiddens=[256,128,4])
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "a2c"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
Code Example #13
def main():

  # tf.reset_default_graph()
  # config = tf.ConfigProto()
  # config.gpu_options.allow_growth = True

  FLAGS(sys.argv)
  # steps_left = FLAGS.timesteps

  logdir = "tensorboard"
  if(FLAGS.algorithm == "deepq"):
    logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.exploration_fraction,
      FLAGS.prioritized,
      FLAGS.dueling,
      FLAGS.lr,
      start_time
    )
  elif(FLAGS.algorithm == "acktr"):
    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.num_cpu,
      FLAGS.lr,
      start_time
    )
  elif(FLAGS.algorithm == "BicNet"):
    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.num_cpu,
      FLAGS.lr,
      start_time
    )

  if(FLAGS.log == "tensorboard"):
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[TensorBoardOutputFormat(logdir)])

  elif(FLAGS.log == "stdout"):
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[HumanOutputFormat(sys.stdout)])

  AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
    feature_dimensions=sc2_env.Dimensions(screen=64, minimap=64),  # alternatively Dimensions(screen=84, minimap=64); both get processed into 32*32 matrices
    use_feature_units=True
  )

  lr = FLAGS.lr
  batch_size = 32  # 32
  gamma = 0.99
  num_agents = 9
  vector_obs_len = 33   #4096  # 32*32  1024
  output_len = 3
  hidden_vector_len = 128   #1
  tau = 0.001
  # stddev = 0.1


  sess = U.make_session()
  sess.__enter__()
  actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents, vector_obs_len, output_len, hidden_vector_len)
  sess.run(tf.global_variables_initializer())

  # while(steps_left > 0):
  with sc2_env.SC2Env(
      map_name="DefeatZerglingsAndBanelings",  #DefeatZerglingsAndBanelings
      step_mul=step_mul,
      save_replay_episodes=1,
      replay_dir="D:/StarCraft II/StarCraft II/Replays/video/0722",
      agent_interface_format=AGENT_INTERFACE_FORMAT,
      visualize=False, #True
      game_steps_per_episode=steps * step_mul) as env:

    learn(
      env,
      sess=sess,
      max_timesteps=FLAGS.timesteps,
      # callback=BicNet_callback,
      actor=actor,
      num_agents=num_agents
    )
Code Example #14
def main():

    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True

    FLAGS(sys.argv)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif (FLAGS.algorithm == "BicNet"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(
            screen=64, minimap=64
        )  # alternatively Dimensions(screen=84, minimap=64); both get processed into 32*32 matrices
    )
    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",  #DefeatZerglingsAndBanelings
            step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=True,  #True
            game_steps_per_episode=steps * step_mul) as env:

        model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2),
                                               (64, 3, 1)],
                                        hiddens=[256],
                                        dueling=True)
        # The model needs to be changed to an LSTM form.
        demo_replay = []
        # act = dqfd.learn(
        #   env,
        #   q_func=model,
        #   num_actions=3,
        #   lr=1e-4,
        #   max_timesteps=10000000,
        #   buffer_size=100000,
        #   exploration_fraction=0.5,
        #   exploration_final_eps=0.01,
        #   train_freq=2,
        #   learning_starts=100000,
        #   target_network_update_freq=1000,
        #   gamma=0.99,
        #   prioritized_replay=True,
        #   callback=deepq_callback
        # )
        # act.save("defeat_zerglings.pkl")
        BicNet_findAndDefeatZergling.learn(
            env,
            lr=FLAGS.lr,
            max_timesteps=FLAGS.timesteps,
            buffer_size=100000,
            train_freq=1,
            learning_starts=1000,  #100000,
            target_network_update_freq=1000,
            gamma=0.99,
            callback=BicNet_callback)
Code Example #15
def main():

    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True

    FLAGS(sys.argv)
    # steps_left = FLAGS.timesteps

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif (FLAGS.algorithm == "BicNet"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(
            screen=32, minimap=32
        ),  # alternatively Dimensions(screen=84, minimap=64); both get processed into 32*32 matrices
        use_feature_units=True)

    lr = FLAGS.lr
    buffer_size = 60000  # was 50000; reduce it, ideally to about 1/10 of the training steps (70000; test 200; 70000)
    batch_size = 32  # 32
    gamma = 0.99
    num_agents = 2  #9
    vector_obs_len = 736  #33   #4096  # 32*32  1024
    output_len = 4  #3

    hidden_vector_len = 128  #128   #1
    tau = 0.001
    # stddev = 0.1

    sess = U.make_session()
    sess.__enter__()
    actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents,
                            vector_obs_len, output_len, hidden_vector_len)
    critic = tb.CriticNetwork(sess, lr, tau, gamma,
                              actor.get_num_trainable_vars(), num_agents,
                              vector_obs_len, output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(1), sigma=float(stddev) * np.ones(1))
    action_noise = noise_OU.OU_noise(decay_period=FLAGS.timesteps -
                                     buffer_size)

    # while(steps_left > 0):
    with sc2_env.SC2Env(
            map_name="CollectMineralShards",  #DefeatZerglingsAndBanelings
            # step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=False,  #True
            game_steps_per_episode=steps * step_mul) as env:

        learn(
            env,
            sess=sess,
            max_timesteps=FLAGS.timesteps,
            train_freq=1,
            save_freq=10000,
            target_network_update_freq=1,  #1000
            gamma=gamma,
            # callback=BicNet_callback,
            actor=actor,
            critic=critic,
            replay_buffer=replay_buffer,
            num_agents=num_agents,
            action_noise=action_noise,
            output_len=output_len,
            num_exploring=buffer_size  #buffer_size
        )
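Examples #13 and #15 drive the continuous BicNet actor with Ornstein-Uhlenbeck exploration noise through the projects' noise_OU.OU_noise, which is not shown on this page. For reference, a generic OU-process sketch in the standard formulation; this is not the projects' implementation.

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: x += theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = self.mu.copy()

    def sample(self):
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(*self.state.shape)
        self.state = self.state + dx
        return self.state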