def config_log(FLAGS):
    logdir = "tensorboard/%s/hrl_a2c_svib/%s_lr%s_%s/%s_%s_%s" % (
        FLAGS.env, FLAGS.num_timesteps, '0.0007', FLAGS.policy,
        start_time, FLAGS.train_option, str(FLAGS.beta))
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=logdir, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=logdir, output_formats=[HumanOutputFormat(sys.stdout)])
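# Context sketch (assumption, not from the original repository): these entry
# points parse command-line flags with FLAGS(sys.argv), i.e. absl-py style.
# The definitions below illustrate the flags config_log relies on; the names
# come from the usage above, the defaults are illustrative.
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string("env", "CartPole-v0", "Environment id.")
flags.DEFINE_integer("num_timesteps", 1000000, "Total training timesteps.")
flags.DEFINE_string("policy", "cnn", "Policy architecture name.")
flags.DEFINE_string("train_option", "default", "Training variant tag.")
flags.DEFINE_float("beta", 0.01, "SVIB beta coefficient.")
flags.DEFINE_string("log", "tensorboard", "Logging backend: tensorboard | stdout.")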
def main():
    FLAGS(sys.argv)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    print("env : %s" % FLAGS.env)
    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("lr : %s" % FLAGS.lr)

    # Choose which RL algorithm to train.
    if FLAGS.algorithm == "deepq":  # Use DQN
        train_dqn(env_id=FLAGS.env, num_timesteps=FLAGS.timesteps)
def main():
    FLAGS(sys.argv)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "./tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        # Despite the flag name, this branch writes to a text file under logdir.
        os.makedirs(logdir, exist_ok=True)
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None,
            output_formats=[HumanOutputFormat(logdir + "/log.txt")])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            minimap_size_px=(FLAGS.minimap_size_px, FLAGS.minimap_size_px),
            step_mul=FLAGS.step_mul,
            visualize=FLAGS.visualize,
            game_steps_per_episode=FLAGS.episode_steps) as env:

        model = deepq.models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1), (64, 3, 1),
                   (64, 3, 1), (32, 3, 1)],
            hiddens=[256],
            dueling=True)

        act = dqfd.learn(
            env,
            q_func=model,
            num_actions=FLAGS.num_actions,
            lr=FLAGS.lr,
            print_freq=FLAGS.print_freq,
            max_timesteps=FLAGS.timesteps,
            buffer_size=FLAGS.buffer_size,
            exploration_fraction=FLAGS.exploration_fraction,
            exploration_final_eps=FLAGS.exploration_final_eps,
            train_freq=FLAGS.train_freq,
            learning_starts=FLAGS.learning_starts,
            target_network_update_freq=FLAGS.target_network_update_freq,
            gamma=FLAGS.gamma,
            prioritized_replay=FLAGS.prioritized,
            callback=deepq_callback)
        act.save("defeat_zerglings.pkl")
def main():
    FLAGS(sys.argv)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            visualize=True,
            agent_interface_format=sc2_env.AgentInterfaceFormat(
                feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)),
            game_steps_per_episode=steps * step_mul) as env:

        obs = env.reset()
        # print(obs[0].observation)

        model = deepq.models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=True)

        demo_replay = []
        act = dqfd.learn(
            env,
            q_func=model,
            num_actions=3,
            lr=1e-4,
            max_timesteps=10000000,
            buffer_size=100000,
            exploration_fraction=0.5,
            exploration_final_eps=0.01,
            train_freq=2,
            learning_starts=100000,
            target_network_update_freq=1000,
            gamma=0.99,
            prioritized_replay=True,
            callback=deepq_callback)
        act.save("defeat_zerglings.pkl")
def main():
    if osp.exists(LOGDIR):
        shutil.rmtree(LOGDIR)
    os.makedirs(LOGDIR)
    if not osp.exists(CKPTDIR):
        os.makedirs(CKPTDIR)

    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None,
        output_formats=[
            HumanOutputFormat(sys.stdout),
            CSVOutputFormat(osp.join(LOGDIR, 'log.csv'))
        ])

    train()
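# Context sketch (assumption): the snippet above relies on module-level
# constants and imports defined elsewhere in the script. The paths below are
# illustrative, not the repository's actual values.
import os
import os.path as osp
import shutil
import sys

from baselines.logger import Logger, HumanOutputFormat, CSVOutputFormat

LOGDIR = "logs/run0"      # hypothetical log directory, wiped on each run
CKPTDIR = "checkpoints"   # hypothetical checkpoint directory, kept across runs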
def main():
    FLAGS(sys.argv)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            visualize=True,
            agent_interface_format=sc2_env.AgentInterfaceFormat(
                feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)),
            game_steps_per_episode=steps * step_mul) as env:

        print(env.observation_spec())
        screen_dim = env.observation_spec()[0]['feature_screen'][1:3]
        print(screen_dim)
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)
    print("random lr : %s" % FLAGS.lr)
    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        with sc2_env.SC2Env(
                map_name="CollectMineralGas",
                step_mul=step_mul,
                visualize=True,
                screen_size_px=(16, 16),
                minimap_size_px=(16, 16)) as env:
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "deepq-4way":
        with sc2_env.SC2Env(
                map_name="CollectMineralGas",
                step_mul=step_mul,
                screen_size_px=(32, 32),
                minimap_size_px=(32, 32),
                visualize=True) as env:
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.map)
        policy_fn = CnnPolicy
        a2c.learn(
            policy_fn,
            env,
            seed,
            total_timesteps=num_timesteps,
            nprocs=FLAGS.num_agents + FLAGS.num_scripts,
            nscripts=FLAGS.num_scripts,
            ent_coef=0.5,
            nsteps=FLAGS.nsteps,
            max_grad_norm=0.01,
            callback=a2c_callback)
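# Reference sketch: in OpenAI Baselines' deepq, the callback passed to learn()
# receives the training loop's locals()/globals() and may return True to stop
# training early. The window size and reward threshold below are illustrative
# assumptions, not the project's deepq_callback.
import numpy as np

def example_stop_callback(lcl, _glb):
    episode_rewards = lcl.get('episode_rewards', [])
    if len(episode_rewards) >= 100:
        # Stop once the 100-episode mean reward clears a chosen threshold.
        return np.mean(episode_rewards[-100:]) >= 20.0
    return False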
def train(args, seeds):
    global last_checkpoint_time

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")
    if 'cuda' in device.type:
        print('Using CUDA\n')
    torch.set_num_threads(1)

    utils.seed(args.seed)

    # Configure logging
    if args.xpid is None:
        args.xpid = "lr-%s" % time.strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.expandvars(os.path.expanduser(args.log_dir))
    plogger = FileWriter(
        xpid=args.xpid,
        xp_args=args.__dict__,
        rootdir=log_dir,
        seeds=seeds,
    )
    stdout_logger = HumanOutputFormat(sys.stdout)

    checkpointpath = os.path.expandvars(
        os.path.expanduser("%s/%s/%s" % (log_dir, args.xpid, "model.tar")))

    current_update_count = 0
    initial_update_count = 0
    last_logged_update_count_at_restart = -1

    # Configure actor envs
    start_level = 0
    num_levels = 1
    level_sampler_args = dict(
        num_actors=args.num_processes,
        strategy=args.level_replay_strategy,
        max_score_coef=args.level_replay_max_score_coef,
        replay_schedule=args.level_replay_schedule,
        score_transform=args.level_replay_score_transform,
        temperature=args.level_replay_temperature,
        eps=args.level_replay_eps,
        rho=args.level_replay_rho,
        replay_prob=args.level_replay_prob,
        alpha=args.level_replay_alpha,
        staleness_coef=args.staleness_coef,
        staleness_transform=args.staleness_transform,
        staleness_temperature=args.staleness_temperature,
        sample_full_distribution=args.train_full_distribution,
        seed_buffer_size=args.level_replay_seed_buffer_size,
        seed_buffer_priority=args.level_replay_seed_buffer_priority,
        tscl_window_size=args.tscl_window_size)

    # An optional secondary sampler inherits the primary args, overridden by
    # the secondary-specific settings.
    level_sampler_secondary_args = {}
    if args.level_replay_secondary_strategy:
        level_sampler_secondary_args = dict(
            strategy=args.level_replay_secondary_strategy,
            score_transform=args.level_replay_secondary_score_transform,
            temperature=args.level_replay_secondary_temperature,
            eps=args.level_replay_secondary_eps,
            staleness_coef=args.secondary_staleness_coef,
            staleness_transform=args.secondary_staleness_transform,
            staleness_temperature=args.secondary_staleness_temperature)
        args_tmp = level_sampler_args.copy()
        args_tmp.update(level_sampler_secondary_args)
        level_sampler_secondary_args = args_tmp

    envs, level_sampler, secondary_level_sampler = make_lr_venv(
        num_envs=args.num_processes,
        env_name=args.env_name,
        seeds=seeds,
        device=device,
        num_levels=num_levels,
        start_level=start_level,
        no_ret_normalization=args.no_ret_normalization,
        distribution_mode=args.distribution_mode,
        paint_vel_info=args.paint_vel_info,
        level_sampler_args=level_sampler_args,
        level_sampler_secondary_args=level_sampler_secondary_args,
        level_replay_strategy_mix_coef=args.level_replay_strategy_mix_coef)

    is_minigrid = args.env_name.startswith('MiniGrid')

    actor_critic = model_for_env_name(args, envs)
    actor_critic.to(device)

    rollouts = RolloutStorage(
        args.num_steps, args.num_processes,
        envs.observation_space.shape, envs.action_space,
        actor_critic.recurrent_hidden_state_size)

    batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch)

    aug_id = None
    if args.algo == 'ucb' or args.use_ucb:
        print('Using UCB')
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size)
                    for t in list(aug_to_func.keys())]
        mix_alpha = None if not args.use_mixreg else args.mixreg_alpha
        agent = algo.UCBDrAC(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            aug_list=aug_list,
            aug_id=aug_id,
            aug_coef=args.aug_coef,
            num_aug_types=len(list(aug_to_func.keys())),
            ucb_exploration_coef=args.ucb_exploration_coef,
            ucb_window_length=args.ucb_window_length,
            mix_alpha=mix_alpha,
            log_grad_norm=args.log_grad_norm)
    elif args.algo == 'mixreg' or args.use_mixreg:
        agent = algo.MixRegPPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            log_grad_norm=args.log_grad_norm,
            mix_alpha=args.mixreg_alpha)
    else:
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            log_grad_norm=args.log_grad_norm)

    level_seeds = torch.zeros(args.num_processes)
    if level_sampler:
        obs, level_seeds = envs.reset()
    else:
        obs = envs.reset()
    level_seeds = level_seeds.unsqueeze(-1)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    def checkpoint():
        if args.disable_checkpoint:
            return
        logging.info("Saving checkpoint to %s", checkpointpath)
        checkpoint_states = {
            "model_state_dict": actor_critic.state_dict(),
            "optimizer_state_dict": agent.optimizer.state_dict(),
            "rollouts": rollouts,
            "episode_rewards": episode_rewards,
            "level_sampler": level_sampler,
            "current_update_count": current_update_count
        }
        if hasattr(agent, 'bandit_state_dict'):
            checkpoint_states.update({
                "bandit_state_dict": agent.bandit_state_dict()
            })
        torch.save(checkpoint_states, checkpointpath)

    # Load checkpoint
    if args.checkpoint and os.path.exists(checkpointpath):
        checkpoint_states = torch.load(checkpointpath)
        actor_critic.load_state_dict(checkpoint_states['model_state_dict'])
        agent.optimizer.load_state_dict(checkpoint_states["optimizer_state_dict"])
        rollouts = checkpoint_states["rollouts"]
        episode_rewards = checkpoint_states["episode_rewards"]
        level_sampler = checkpoint_states["level_sampler"]
        current_update_count = checkpoint_states["current_update_count"]
        initial_update_count = current_update_count
        # Ticks are 0-indexed updates.
        last_logged_update_count_at_restart = plogger.latest_tick() + 1
        if hasattr(agent, 'load_bandit_state_dict'):
            agent.load_bandit_state_dict(checkpoint_states["bandit_state_dict"])
        logging.info(f"Resuming preempted job from update: {current_update_count}\n")

    timer = timeit.default_timer
    update_start_time = timer()
    agent_id = 0  # np.random.choice(range(actor_critic.ensemble_size))
    for j in range(initial_update_count, num_updates):
        actor_critic.train()
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                obs_id = rollouts.obs[step]
                if aug_id:
                    obs_id = aug_id(obs_id)
                value, action, action_log_dist, recurrent_hidden_states = \
                    actor_critic.act(
                        obs_id,
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step],
                        agent_id=agent_id)
                action_log_prob = action_log_dist.gather(-1, action)
                uncertainties = actor_critic.get_uncertainty(
                    obs_id,
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            # Reset all done levels by sampling from level sampler
            for i, info in enumerate(infos):
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                if level_sampler:
                    level_seeds[i][0] = info['level_seed']

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            rollouts.insert(
                obs, recurrent_hidden_states, action, action_log_prob,
                action_log_dist, value, reward, masks, bad_masks,
                uncertainties, level_seeds)

        with torch.no_grad():
            obs_id = rollouts.obs[-1]
            if aug_id:
                obs_id = aug_id(obs_id)
            next_value = actor_critic.get_value(
                obs_id,
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda)

        # Update level sampler
        if level_sampler:
            level_sampler.update_with_rollouts(rollouts)
        if secondary_level_sampler:
            secondary_level_sampler.update_with_rollouts(rollouts)

        if args.use_ucb and j > 0:
            agent.update_ucb_values(rollouts)
        value_loss, action_loss, dist_entropy, info = agent.update(rollouts)
        rollouts.after_update()
        if level_sampler:
            level_sampler.after_update()
        if secondary_level_sampler:
            secondary_level_sampler.after_update()

        current_update_count = j + 1

        # ==== Everything below here is for logging + checkpointing ====
        if current_update_count <= last_logged_update_count_at_restart:
            continue

        # Log stats every log_interval updates or if it is the last update
        if (j % args.log_interval == 0
                and len(episode_rewards) > 1) or j == num_updates - 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            update_end_time = timer()
            num_interval_updates = 1 if j == 0 else args.log_interval
            sps = num_interval_updates * (
                args.num_processes * args.num_steps) / (
                update_end_time - update_start_time)
            update_start_time = update_end_time

            logging.info(f"\nUpdate {j} done, {total_num_steps} steps\n ")
            logging.info(f"\nEvaluating on {args.num_test_seeds} test levels...\n ")
            eval_episode_rewards = evaluate(
                args, actor_critic, args.num_test_seeds, device, aug_id=aug_id)

            logging.info(f"\nEvaluating on {args.num_test_seeds} train levels...\n ")
            train_eval_episode_rewards = evaluate(
                args, actor_critic,
                args.num_test_seeds,  # Use same number of levels for evaluating train and test seeds
                device,
                start_level=0,
                num_levels=args.num_train_seeds,
                seeds=seeds,
                aug_id=aug_id)

            stats = {
                "step": total_num_steps,
                "pg_loss": action_loss,
                "value_loss": value_loss,
                "dist_entropy": dist_entropy,
                "train:mean_episode_return": np.mean(episode_rewards),
                "train:median_episode_return": np.median(episode_rewards),
                "test:mean_episode_return": np.mean(eval_episode_rewards),
                "test:median_episode_return": np.median(eval_episode_rewards),
                "train_eval:mean_episode_return": np.mean(train_eval_episode_rewards),
                "train_eval:median_episode_return": np.median(train_eval_episode_rewards),
                "sps": sps,
            }
            if args.log_grad_norm:
                stats.update({"mean_grad_norm": np.mean(info['grad_norms'])})
            if is_minigrid:
                stats["train:success_rate"] = np.mean(
                    np.array(episode_rewards) > 0)
                stats["train_eval:success_rate"] = np.mean(
                    np.array(train_eval_episode_rewards) > 0)
                stats["test:success_rate"] = np.mean(
                    np.array(eval_episode_rewards) > 0)

            if j == num_updates - 1:
                logging.info(
                    f"\nLast update: Evaluating on {args.num_test_seeds} test levels...\n ")
                final_eval_episode_rewards = evaluate(
                    args, actor_critic, args.final_num_test_seeds, device)
                mean_final_eval_episode_rewards = np.mean(final_eval_episode_rewards)
                median_final_eval_episode_rewards = np.median(final_eval_episode_rewards)
                plogger.log_final_test_eval({
                    'num_test_seeds': args.final_num_test_seeds,
                    'mean_episode_return': mean_final_eval_episode_rewards,
                    'median_episode_return': median_final_eval_episode_rewards
                })

            plogger.log(stats)
            if args.verbose:
                stdout_logger.writekvs(stats)

        # Log level weights
        if level_sampler and j % args.weight_log_interval == 0:
            plogger.log_level_weights(level_sampler.sample_weights(),
                                      level_sampler.seeds)

        # Checkpoint
        timer = timeit.default_timer
        if last_checkpoint_time is None:
            last_checkpoint_time = timer()
        try:
            # Save on the last update or every args.save_interval minutes.
            if j == num_updates - 1 or \
                    (args.save_interval > 0
                     and timer() - last_checkpoint_time > args.save_interval * 60):
                checkpoint()
                last_checkpoint_time = timer()
                logging.info(f"\nSaved checkpoint after update {current_update_count}")
        except KeyboardInterrupt:
            return
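# Offline inspection sketch (assumption): FileWriter here follows the
# torchbeast-style convention of writing one row of stats per logged update
# under <log_dir>/<xpid>/. Assuming it produces a CSV named logs.csv, training
# curves can be inspected with pandas; the path below is illustrative.
import pandas as pd

df = pd.read_csv("logs/lr-20240101-000000/logs.csv")
print(df[["step", "test:mean_episode_return"]].tail())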
def main():
    FLAGS(sys.argv)

    steps = 0  # Test steps

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)
    print("random lr : %s" % FLAGS.lr)
    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=16, minimap=16))
        # Temp solution: sc2_env.Agent(sc2_env.Race.terran) might be too
        # restricting. This change is needed because SC2 now requires
        # specifying players.
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[
                    sc2_env.Agent(race=sc2_env.Race.terran),
                    sc2_env.Agent(race=sc2_env.Race.terran)
                ],
                step_mul=step_mul,
                visualize=True,
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:
            model = cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            acts = deepq_nexus_wars.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            agent = random_agent.RandomAgent()
            run_loop.run_loop([agent], env, steps)
            acts[0].save("mineral_shards_x.pkl")
            acts[1].save("mineral_shards_y.pkl")

    elif FLAGS.algorithm == "deepq-4way":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[
                    sc2_env.Agent(race=sc2_env.Race.terran),
                    sc2_env.Agent(race=sc2_env.Race.terran)
                ],
                step_mul=step_mul,
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:
            model = cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)
        policy_fn = CnnPolicy
        a2c.learn(
            policy_fn,
            env,
            seed,
            total_timesteps=num_timesteps,
            nprocs=FLAGS.num_agents + FLAGS.num_scripts,
            nscripts=FLAGS.num_scripts,
            ent_coef=0.5,
            nsteps=FLAGS.nsteps,
            max_grad_norm=0.01,
            callback=a2c_callback)
def train(args, seeds):
    global last_checkpoint_time

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")
    if 'cuda' in device.type:
        print('Using CUDA\n')
    torch.set_num_threads(1)

    utils.seed(args.seed)

    # Configure logging
    if args.xpid is None:
        args.xpid = "lr-%s" % time.strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.expandvars(os.path.expanduser(args.log_dir))
    plogger = FileWriter(
        xpid=args.xpid,
        xp_args=args.__dict__,
        rootdir=log_dir,
        seeds=seeds,
    )
    stdout_logger = HumanOutputFormat(sys.stdout)

    checkpointpath = os.path.expandvars(
        os.path.expanduser("%s/%s/%s" % (log_dir, args.xpid, "model.tar")))

    # Configure actor envs
    start_level = 0
    if args.full_train_distribution:
        num_levels = 0
        level_sampler_args = None
        seeds = None
    else:
        num_levels = 1
        level_sampler_args = dict(
            num_actors=args.num_processes,
            strategy=args.level_replay_strategy,
            replay_schedule=args.level_replay_schedule,
            score_transform=args.level_replay_score_transform,
            temperature=args.level_replay_temperature,
            eps=args.level_replay_eps,
            rho=args.level_replay_rho,
            nu=args.level_replay_nu,
            alpha=args.level_replay_alpha,
            staleness_coef=args.staleness_coef,
            staleness_transform=args.staleness_transform,
            staleness_temperature=args.staleness_temperature)

    envs, level_sampler = make_lr_venv(
        num_envs=args.num_processes,
        env_name=args.env_name,
        seeds=seeds,
        device=device,
        num_levels=num_levels,
        start_level=start_level,
        no_ret_normalization=args.no_ret_normalization,
        distribution_mode=args.distribution_mode,
        paint_vel_info=args.paint_vel_info,
        level_sampler_args=level_sampler_args)

    is_minigrid = args.env_name.startswith('MiniGrid')

    actor_critic = model_for_env_name(args, envs)
    actor_critic.to(device)
    print(actor_critic)

    rollouts = RolloutStorage(
        args.num_steps, args.num_processes,
        envs.observation_space.shape, envs.action_space,
        actor_critic.recurrent_hidden_state_size)

    batch_size = int(args.num_processes * args.num_steps / args.num_mini_batch)

    def checkpoint():
        if args.disable_checkpoint:
            return
        logging.info("Saving checkpoint to %s", checkpointpath)
        torch.save(
            {
                "model_state_dict": actor_critic.state_dict(),
                "optimizer_state_dict": agent.optimizer.state_dict(),
                "args": vars(args),
            },
            checkpointpath,
        )

    agent = algo.PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm,
        env_name=args.env_name)

    level_seeds = torch.zeros(args.num_processes)
    if level_sampler:
        obs, level_seeds = envs.reset()
    else:
        obs = envs.reset()
    level_seeds = level_seeds.unsqueeze(-1)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    timer = timeit.default_timer
    update_start_time = timer()
    for j in range(num_updates):
        actor_critic.train()
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                obs_id = rollouts.obs[step]
                value, action, action_log_dist, recurrent_hidden_states = \
                    actor_critic.act(
                        obs_id,
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])
                action_log_prob = action_log_dist.gather(-1, action)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            # Reset all done levels by sampling from level sampler
            for i, info in enumerate(infos):
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                if level_sampler:
                    level_seeds[i][0] = info['level_seed']

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, action_log_dist, value, reward,
                            masks, bad_masks, level_seeds)

        with torch.no_grad():
            obs_id = rollouts.obs[-1]
            next_value = actor_critic.get_value(
                obs_id,
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.gae_lambda)

        # Update level sampler
        if level_sampler:
            level_sampler.update_with_rollouts(rollouts)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()
        if level_sampler:
            level_sampler.after_update()

        # Log stats every log_interval updates or if it is the last update
        if (j % args.log_interval == 0
                and len(episode_rewards) > 1) or j == num_updates - 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            update_end_time = timer()
            num_interval_updates = 1 if j == 0 else args.log_interval
            sps = num_interval_updates * (
                args.num_processes * args.num_steps) / (
                update_end_time - update_start_time)
            update_start_time = update_end_time

            logging.info(f"\nUpdate {j} done, {total_num_steps} steps\n ")
            logging.info(
                f"\nEvaluating on {args.num_test_seeds} test levels...\n ")
            eval_episode_rewards, transitions = evaluate(
                args, actor_critic, args.num_test_seeds, device)
            plogger._save_data(transitions, f'test_trajectories_{j}.pkl')

            logging.info(
                f"\nEvaluating on {args.num_test_seeds} train levels...\n ")
            train_eval_episode_rewards, transitions = evaluate(
                args, actor_critic, args.num_test_seeds, device,
                start_level=0,
                num_levels=args.num_train_seeds,
                seeds=seeds,
                level_sampler=level_sampler)

            stats = {
                "step": total_num_steps,
                "pg_loss": action_loss,
                "value_loss": value_loss,
                "dist_entropy": dist_entropy,
                "train:mean_episode_return": np.mean(episode_rewards),
                "train:median_episode_return": np.median(episode_rewards),
                "test:mean_episode_return": np.mean(eval_episode_rewards),
                "test:median_episode_return": np.median(eval_episode_rewards),
                "train_eval:mean_episode_return": np.mean(train_eval_episode_rewards),
                "train_eval:median_episode_return": np.median(train_eval_episode_rewards),
                "sps": sps,
            }
            if is_minigrid:
                stats["train:success_rate"] = np.mean(
                    np.array(episode_rewards) > 0)
                stats["train_eval:success_rate"] = np.mean(
                    np.array(train_eval_episode_rewards) > 0)
                stats["test:success_rate"] = np.mean(
                    np.array(eval_episode_rewards) > 0)

            if j == num_updates - 1:
                logging.info(
                    f"\nLast update: Evaluating on {args.num_test_seeds} test levels...\n ")
                final_eval_episode_rewards, transitions = evaluate(
                    args, actor_critic, args.final_num_test_seeds, device)
                mean_final_eval_episode_rewards = np.mean(
                    final_eval_episode_rewards)
                median_final_eval_episode_rewards = np.median(
                    final_eval_episode_rewards)
                plogger.log_final_test_eval({
                    'num_test_seeds': args.final_num_test_seeds,
                    'mean_episode_return': mean_final_eval_episode_rewards,
                    'median_episode_return': median_final_eval_episode_rewards
                })

            plogger.log(stats)
            if args.verbose:
                stdout_logger.writekvs(stats)

        # Log level weights
        if level_sampler and j % args.weight_log_interval == 0:
            plogger.log_level_weights(level_sampler.sample_weights())

        # Checkpoint
        timer = timeit.default_timer
        if last_checkpoint_time is None:
            last_checkpoint_time = timer()
        try:
            # Save on the last update or every args.save_interval minutes.
            if j == num_updates - 1 or \
                    (args.save_interval > 0
                     and timer() - last_checkpoint_time > args.save_interval * 60):
                checkpoint()
                last_checkpoint_time = timer()
        except KeyboardInterrupt:
            return
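# Resume sketch for the checkpoint format written by checkpoint() above,
# assuming actor_critic and agent have been constructed as in train().
# The map_location argument is an assumption for device portability.
import os
import torch

if os.path.exists(checkpointpath):
    states = torch.load(checkpointpath, map_location=device)
    actor_critic.load_state_dict(states["model_state_dict"])
    agent.optimizer.load_state_dict(states["optimizer_state_dict"])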
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_cpu : %s" % FLAGS.num_cpu)
    print("lr : %s" % FLAGS.lr)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/mineral/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        with sc2_env.SC2Env(
                "CollectMineralShards",
                step_mul=step_mul,
                visualize=True) as env:
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=64,
                lr=1e-3,
                max_timesteps=20000000,
                buffer_size=10000,
                exploration_fraction=0.5,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "acktr":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0

        # def make_env(rank):
        #     def _thunk():
        #         env = sc2_env.SC2Env(
        #             map_name=FLAGS.map,
        #             step_mul=step_mul,
        #             visualize=True)
        #         # env.seed(seed + rank)
        #         if logger.get_dir():
        #             env = bench.Monitor(
        #                 env, os.path.join(logger.get_dir(),
        #                                   "{}.monitor.json".format(rank)))
        #         return env
        #     return _thunk

        # agents = [Agent() for _ in range(num_cpu)]
        # for agent in agents:
        #     time.sleep(1)
        #     agent.daemon = True
        #     agent.start()
        # agent_controller = AgentController(agents)

        # set_global_seeds(seed)
        env = SubprocVecEnv(FLAGS.num_cpu, FLAGS.map)
        policy_fn = CnnPolicy
        acktr_disc.learn(
            policy_fn,
            env,
            seed,
            total_timesteps=num_timesteps,
            nprocs=FLAGS.num_cpu,
            ent_coef=0.1,
            callback=acktr_callback)
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)
    print("random lr : %s" % FLAGS.lr)
    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        # Sets interface.feature_layer.resolution and
        # interface.feature_layer.minimap_resolution (previously 16x16).
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))

        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,  # game speed: roughly the effective actions per second of a human player
                visualize=True,
                # screen_size_px=(16, 16),    # old pysc2 API
                # minimap_size_px=(16, 16),
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            # This model takes an observation as input and returns values of
            # all actions; see how it is used in deepq_mineral_shards.learn.
            # convs is a list of (num_filters, kernel_size, stride).
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[512], dueling=True)

            # Train the model and save it. Alternative learners tried here:
            # deepq_ActSeparate, deepq_actSeparateWith4Directions,
            # deepq_actionGroup_4way, deep_DiffActInSameTime (with matching
            # callbacks: deepq_callback, deepq_ActSeperate_callback,
            # deep_DiffActInSameTime_callback).
            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=4,  # previously 16, 256, 3
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_actSeparateWith4Directions_callback)
            # Save the trained model to mineral_shards.pkl after training,
            # for use by enjoy_mineral_shards.py.
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "deepq-4way":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,
                # screen_size_px=(32, 32),    # old pysc2 API
                # minimap_size_px=(32, 32),
                save_replay_episodes=2,
                replay_dir="D:/StarCraft II/StarCraft II/video",
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            # model = deepq.models.mlp(hiddens=[256, 128, 4])
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)
            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)
        policy_fn = CnnPolicy
        a2c.learn(
            policy_fn,
            env,
            seed,
            total_timesteps=num_timesteps,
            nprocs=FLAGS.num_agents + FLAGS.num_scripts,
            nscripts=FLAGS.num_scripts,
            ent_coef=0.5,
            nsteps=FLAGS.nsteps,
            max_grad_norm=0.01,
            callback=a2c_callback)
def main():
    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    FLAGS(sys.argv)
    # steps_left = FLAGS.timesteps

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif FLAGS.algorithm == "BicNet":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    # Screen and minimap features are rendered as 64x64 layers (previously
    # screen=84, minimap=64; both get processed into 32x32 matrices).
    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(screen=64, minimap=64),
        use_feature_units=True)

    lr = FLAGS.lr
    batch_size = 32
    gamma = 0.99
    num_agents = 9
    vector_obs_len = 33  # previously 4096 (= 64*64), 1024 (= 32*32)
    output_len = 3
    hidden_vector_len = 128  # previously 1
    tau = 0.001
    # stddev = 0.1

    sess = U.make_session()
    sess.__enter__()
    actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents,
                            vector_obs_len, output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())

    # while steps_left > 0:
    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            save_replay_episodes=1,
            replay_dir="D:/StarCraft II/StarCraft II/Replays/video/0722",
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=False,
            game_steps_per_episode=steps * step_mul) as env:
        learn(
            env,
            sess=sess,
            max_timesteps=FLAGS.timesteps,
            # callback=BicNet_callback,
            actor=actor,
            num_agents=num_agents)
def main():
    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    FLAGS(sys.argv)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif FLAGS.algorithm == "BicNet":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    # Screen and minimap features are rendered as 64x64 layers (previously
    # screen=84, minimap=64; both get processed into 32x32 matrices).
    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(screen=64, minimap=64))

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=True,
            game_steps_per_episode=steps * step_mul) as env:

        # Q-network; should eventually be changed to an LSTM form.
        model = deepq.models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=True)
        demo_replay = []

        # Previous DQfD baseline:
        # act = dqfd.learn(
        #     env,
        #     q_func=model,
        #     num_actions=3,
        #     lr=1e-4,
        #     max_timesteps=10000000,
        #     buffer_size=100000,
        #     exploration_fraction=0.5,
        #     exploration_final_eps=0.01,
        #     train_freq=2,
        #     learning_starts=100000,
        #     target_network_update_freq=1000,
        #     gamma=0.99,
        #     prioritized_replay=True,
        #     callback=deepq_callback)
        # act.save("defeat_zerglings.pkl")

        BicNet_findAndDefeatZergling.learn(
            env,
            lr=FLAGS.lr,
            max_timesteps=FLAGS.timesteps,
            buffer_size=100000,
            train_freq=1,
            learning_starts=1000,  # previously 100000
            target_network_update_freq=1000,
            gamma=0.99,
            callback=BicNet_callback)
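# Hypothetical sketch of the LSTM-form Q-network the comment above asks for;
# layer sizes and names are illustrative, not the repository's code.
import tensorflow as tf

def lstm_q_network(seq_len, obs_dim, num_actions, hidden=128):
    # Input: a short history of flattened observations per agent.
    inputs = tf.keras.Input(shape=(seq_len, obs_dim))
    x = tf.keras.layers.LSTM(hidden)(inputs)            # summarize the history
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    q_values = tf.keras.layers.Dense(num_actions)(x)    # one Q-value per action
    return tf.keras.Model(inputs, q_values)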
def main():
    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    FLAGS(sys.argv)
    # steps_left = FLAGS.timesteps

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif FLAGS.algorithm == "BicNet":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    # Screen and minimap features are rendered as 32x32 layers (previously
    # screen=84, minimap=64; both get processed into 32x32 matrices).
    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32),
        use_feature_units=True)

    lr = FLAGS.lr
    buffer_size = 60000  # previously 50000/70000; aim for ~1/10 of the training steps
    batch_size = 32
    gamma = 0.99
    num_agents = 2  # previously 9
    vector_obs_len = 736  # previously 33, 4096 (= 64*64), 1024 (= 32*32)
    output_len = 4  # previously 3
    hidden_vector_len = 128  # previously 1
    tau = 0.001
    # stddev = 0.1

    sess = U.make_session()
    sess.__enter__()
    actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents,
                            vector_obs_len, output_len, hidden_vector_len)
    critic = tb.CriticNetwork(sess, lr, tau, gamma,
                              actor.get_num_trainable_vars(), num_agents,
                              vector_obs_len, output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())

    replay_buffer = ReplayBuffer(buffer_size)
    # action_noise = OrnsteinUhlenbeckActionNoise(
    #     mu=np.zeros(1), sigma=float(stddev) * np.ones(1))
    action_noise = noise_OU.OU_noise(
        decay_period=FLAGS.timesteps - buffer_size)

    # while steps_left > 0:
    with sc2_env.SC2Env(
            map_name="CollectMineralShards",  # previously DefeatZerglingsAndBanelings
            # step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=False,
            game_steps_per_episode=steps * step_mul) as env:
        learn(
            env,
            sess=sess,
            max_timesteps=FLAGS.timesteps,
            train_freq=1,
            save_freq=10000,
            target_network_update_freq=1,  # previously 1000
            gamma=gamma,
            # callback=BicNet_callback,
            actor=actor,
            critic=critic,
            replay_buffer=replay_buffer,
            num_agents=num_agents,
            action_noise=action_noise,
            output_len=output_len,
            num_exploring=buffer_size)
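# Reference sketch of Ornstein-Uhlenbeck action noise, the process behind the
# commented-out OrnsteinUhlenbeckActionNoise line above. Parameters are
# illustrative; this is not the project's noise_OU.OU_noise implementation.
import numpy as np

class OUNoise:
    def __init__(self, dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(dim) * mu

    def reset(self):
        self.state[:] = self.mu

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): mean-reverting noise, added
        # to deterministic-policy actions for temporally correlated exploration.
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state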