def config_log(FLAGS):
    # NOTE: the learning rate in the log path is hard-coded to '0.0007'
    # rather than read from FLAGS.
    logdir = "tensorboard/%s/hrl_a2c_svib/%s_lr%s_%s/%s_%s_%s" % (
        FLAGS.env, FLAGS.num_timesteps, '0.0007', FLAGS.policy,
        start_time, FLAGS.train_option, str(FLAGS.beta))
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=logdir, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=logdir, output_formats=[HumanOutputFormat(sys.stdout)])
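# A minimal sketch of how config_log might be driven. The flag names below
# are assumptions inferred from the format string above (not a confirmed
# CLI), and start_time is assumed to be a module-level timestamp string.
from absl import flags
import sys

FLAGS = flags.FLAGS
flags.DEFINE_string("env", "CartPole-v1", "environment id")
flags.DEFINE_integer("num_timesteps", 1000000, "training steps")
flags.DEFINE_string("policy", "cnn", "policy architecture")
flags.DEFINE_string("train_option", "svib", "training variant")
flags.DEFINE_float("beta", 0.001, "information bottleneck coefficient")
flags.DEFINE_string("log", "stdout", "'tensorboard' or 'stdout'")

if __name__ == "__main__":
    FLAGS(sys.argv)
    config_log(FLAGS)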
def main():
    FLAGS(sys.argv)
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    print("env : %s" % FLAGS.env)
    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("lr : %s" % FLAGS.lr)

    # Choose which RL algorithm to train.
    if FLAGS.algorithm == "deepq":
        # Use DQN
        train_dqn(env_id=FLAGS.env, num_timesteps=FLAGS.timesteps)
def main():
    FLAGS(sys.argv)
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "./tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    if FLAGS.log == "tensorboard":
        # dir=None: the output format owns the directory; passing a file
        # name ('log.txt') as dir here was a bug.
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        os.makedirs(logdir, exist_ok=True)  # os.mkdir fails if parents are missing
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None,
            output_formats=[HumanOutputFormat(logdir + "/log.txt")])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            minimap_size_px=(FLAGS.minimap_size_px, FLAGS.minimap_size_px),
            step_mul=FLAGS.step_mul,
            visualize=FLAGS.visualize,
            game_steps_per_episode=FLAGS.episode_steps) as env:
        model = deepq.models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1), (64, 3, 1), (64, 3, 1),
                   (32, 3, 1)],
            hiddens=[256],
            dueling=True)
        act = dqfd.learn(
            env,
            q_func=model,
            num_actions=FLAGS.num_actions,
            lr=FLAGS.lr,
            print_freq=FLAGS.print_freq,
            max_timesteps=FLAGS.timesteps,
            buffer_size=FLAGS.buffer_size,
            exploration_fraction=FLAGS.exploration_fraction,
            exploration_final_eps=FLAGS.exploration_final_eps,
            train_freq=FLAGS.train_freq,
            learning_starts=FLAGS.learning_starts,
            target_network_update_freq=FLAGS.target_network_update_freq,
            gamma=FLAGS.gamma,
            prioritized_replay=FLAGS.prioritized,
            callback=deepq_callback)
        act.save("defeat_zerglings.pkl")
def main():
    FLAGS(sys.argv)
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            visualize=True,
            agent_interface_format=sc2_env.AgentInterfaceFormat(
                feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)),
            game_steps_per_episode=steps * step_mul) as env:
        obs = env.reset()
        # print(obs[0].observation)
        model = deepq.models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=True)
        demo_replay = []
        act = dqfd.learn(
            env,
            q_func=model,
            num_actions=3,
            lr=1e-4,
            max_timesteps=10000000,
            buffer_size=100000,
            exploration_fraction=0.5,
            exploration_final_eps=0.01,
            train_freq=2,
            learning_starts=100000,
            target_network_update_freq=1000,
            gamma=0.99,
            prioritized_replay=True,
            callback=deepq_callback)
        act.save("defeat_zerglings.pkl")
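# These scripts pass deepq_callback (and siblings) into learn() but never
# define it here. Below is a hypothetical sketch, assuming the baselines
# convention that the callback receives the learn() frame's locals() and
# globals() and that returning True stops training; the logged keys are
# illustrative only.
from baselines import logger  # assumed: the same logger used elsewhere here

def deepq_callback(locals_dict, globals_dict):
    if locals_dict.get("done"):
        episode_rewards = locals_dict.get("episode_rewards", [])
        if episode_rewards:
            logger.record_tabular("episodes", len(episode_rewards))
            logger.record_tabular("reward_episode", episode_rewards[-1])
            logger.dump_tabular()
    return False  # True would stop training early under this convention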
def main():
    if osp.exists(LOGDIR):
        shutil.rmtree(LOGDIR)
    os.makedirs(LOGDIR)
    if not osp.exists(CKPTDIR):
        os.makedirs(CKPTDIR)
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None,
        output_formats=[
            HumanOutputFormat(sys.stdout),
            CSVOutputFormat(osp.join(LOGDIR, 'log.csv'))
        ])
    train()
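# LOGDIR and CKPTDIR are module-level constants not shown above. A plausible
# header for this script follows; the exact paths are illustrative guesses,
# only the names LOGDIR/CKPTDIR come from the code.
import os
import os.path as osp
import shutil
import sys

LOGDIR = "log"           # wiped and recreated on every run
CKPTDIR = "checkpoints"  # created once; checkpoints accumulate across runs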
def main():
    FLAGS(sys.argv)
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            visualize=True,
            agent_interface_format=sc2_env.AgentInterfaceFormat(
                feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)),
            game_steps_per_episode=steps * step_mul) as env:
        print(env.observation_spec())
        screen_dim = env.observation_spec()[0]['feature_screen'][1:3]
        print(screen_dim)
def configure(dir, format_strs=None, custom_output_formats=None):
    if not dir:
        return
    assert isinstance(dir, str)
    os.makedirs(dir, exist_ok=True)
    if format_strs is None:
        strs = os.getenv('OPENAI_LOG_FORMAT')
        format_strs = strs.split(',') if strs else LOG_OUTPUT_FORMATS
    output_formats = [make_output_format(f, dir) for f in format_strs]
    if custom_output_formats is not None:
        assert isinstance(custom_output_formats, list)
        for custom_output_format in custom_output_formats:
            assert isinstance(custom_output_format, KVWriter)
        output_formats.extend(custom_output_formats)
    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
    log('Logging to %s' % dir)
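# A quick usage sketch for configure(). The 'stdout' and 'csv' format
# strings follow the baselines logger convention; MyWriter is a
# hypothetical custom KVWriter, not part of this module.
class MyWriter(KVWriter):
    def writekvs(self, kvs):
        print("custom:", sorted(kvs.items()))

configure("logs/run1",
          format_strs=["stdout", "csv"],
          custom_output_formats=[MyWriter()])
log("Hello from the configured logger")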
def main():
    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
    env = StarCraft2Env(map_name="8m",
                        reward_only_positive=False,
                        reward_scale_rate=200,
                        state_last_action=True,
                        obs_last_action=True,
                        obs_timestep_number=True,
                        state_timestep_number=True)  # reward_defeat=-200
    env_info = env.get_env_info()

    n_episodes = 2500  # 4000 # 2000
    timesteps = 500000
    n_agents = env_info["n_agents"]
    n_actions = env_info["n_actions"]
    output_len = n_actions
    lr = 0.002
    buffer_size = 70000  # int(timesteps * 0.1); aim for ~1/10 of the training steps
    batch_size = 32
    gamma = 0.99
    num_agents = 8
    local_obs_len = 179     # local obs: 80; global state: 168
    global_state_len = 348  # 179 + 169
    hidden_vector_len = 256  # 128
    tau = 0.001
    num_exploring = buffer_size
    action_low = -1
    action_high = 1
    save_freq = 10000
    critic_output_len = 1

    logdir = "tensorboard/%s/%s_lr%s/%s" % ("BicNet", timesteps, lr, start_time)
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()
    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents,
                         local_obs_len, output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(),
                           num_agents, global_state_len, critic_output_len,
                           hidden_vector_len, n_actions)
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    action_noise = OU_noise(decay_period=timesteps - buffer_size)
    action_noise.reset()
    # model_file_load = os.path.join(str(350000) + "_" + "model_segment_training2/", "defeat_zerglings")
    # U.load_state(model_file_load, sess)
    U.initialize()

    t = 0
    step_train = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        local_obs = np.array(env.get_obs())
        global_state = env.get_state()
        global_state_expand = np.zeros(
            [local_obs.shape[0], local_obs.shape[1] + global_state.shape[0]])
        reward_hl_own_old = []
        reward_hl_en_old = []
        episode_reward_agent = [0 for n in range(n_agents)]
        for i in range(local_obs.shape[0]):
            global_state_expand[i] = np.append(local_obs[i],
                                               global_state.flatten())
            reward_hl_own_old.append(env.get_agent_health(i))
            reward_hl_en_old.append(env.get_enemy_health(i))

        while not terminated:
            t = t + 1
            critic_input = np.expand_dims(global_state_expand, axis=0)
            actor_input = np.expand_dims(local_obs, axis=0)
            action = actor.predict(actor_input)[0]
            act_with_noise = action  # np.clip(action + action_noise.get_noise(step_train), action_low, action_high)
            act_mat_norm = (act_with_noise + 1) / 2
            actions = []
            dead_unit = []
            rew_expand = np.zeros((n_agents, 1))

            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]
                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]
                if sum_avail_act == 0:
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act
                index = np.random.choice(np.array(avail_actions_ind),
                                         p=act_prob.ravel())
                actions.append(index)
                if len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0:
                    dead_unit.append(agent_id)

            reward_base, terminated, info = env.step(actions)
            new_local_obs = np.array(env.get_obs())
            new_global_state = env.get_state()
            new_global_state_expand = np.zeros([
                new_local_obs.shape[0],
                new_local_obs.shape[1] + new_global_state.shape[0]
            ])
            reward_hl_own_new = []
            reward_hl_en_new = []
            for i in range(new_local_obs.shape[0]):
                new_global_state_expand[i] = np.append(
                    new_local_obs[i], new_global_state.flatten())
                reward_hl_own_new.append(env.get_agent_health(i))
                reward_hl_en_new.append(env.get_enemy_health(i))

            for i in range(n_agents):
                if i in dead_unit:
                    rew_expand[i] = 0
                else:
                    rew_expand[i] = -0.05
                    if actions[i] > 5:
                        target_id = actions[i] - 6
                        health_reduce_en = (reward_hl_en_old[target_id] -
                                            reward_hl_en_new[target_id])
                        if health_reduce_en > 0:
                            rew_expand[i] += 2 + health_reduce_en * 5
                            # if reward_base > 50:
                            #     rew_expand[i] += 20
                        else:
                            rew_expand[i] += 1
                    else:
                        rew_expand[i] += (reward_hl_own_new[i] -
                                          reward_hl_own_old[i]) * 5
                    if terminated:
                        if info["battle_won"] is False:
                            rew_expand[i] += -10
                        else:
                            rew_expand[i] += 10
                episode_reward_agent[i] += rew_expand[i]

            replay_buffer.add(local_obs, global_state_expand, act_with_noise,
                              rew_expand, terminated, new_local_obs,
                              new_global_state_expand)
            episode_reward += reward_base
            local_obs = new_local_obs
            global_state_expand = new_global_state_expand

            if t == num_exploring:
                print("training starts")
            if t >= num_exploring:
                (local_s_batch, global_s_batch, a_batch, r_batch, done_batch,
                 local_s2_batch, global_s2_batch) = replay_buffer.sample_batch(
                     batch_size)  # [group0: [batch_size, trace.dimension], group1, ... group8]
                target_q = r_batch + gamma * critic.predict_target(
                    global_s2_batch, actor.predict_target(local_s2_batch))
                predicted_q_value, _ = critic.train(
                    global_s_batch, a_batch,
                    np.reshape(target_q,
                               (batch_size, num_agents, critic_output_len)))
                a_outs = actor.predict(local_s_batch)  # a_outs and a_batch are identical
                grads = critic.action_gradients(global_s_batch, a_outs)  # gradient of Q w.r.t. the actions
                actor.train(local_s_batch, grads)
                step_train = step_train + 1
                actor.update_target_network()
                critic.update_target_network()
                if t % save_freq == 0:
                    model_file_save = os.path.join(
                        "model/" + str(step_train) + "_" + "training_steps_model/",
                        "8m")
                    U.save_state(model_file_save)
                    print("Model has been trained for %s steps" % step_train)
                    # replay_buffer.save()

        print("steps until now : %s, episode: %s, episode reward: %s" %
              (t, e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward_episode", episode_reward)
        for i in range(n_agents):
            logger.record_tabular("reward_agent_" + str(i),
                                  episode_reward_agent[i])
        logger.dump_tabular()
        # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
        # U.save_state(model_file_save)

    env.close()
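# OU_noise (constructed with decay_period and used via reset()/get_noise(t))
# is referenced throughout these scripts but not defined here. A minimal
# sketch under those assumptions; all parameter values are illustrative.
import numpy as np

class OU_noise:
    def __init__(self, size=1, mu=0.0, theta=0.15, sigma=0.3,
                 min_sigma=0.01, decay_period=100000):
        self.size, self.mu = size, mu
        self.theta, self.max_sigma, self.min_sigma = theta, sigma, min_sigma
        self.decay_period = decay_period
        self.reset()

    def reset(self):
        self.state = np.ones(self.size) * self.mu

    def get_noise(self, t=0):
        # Ornstein-Uhlenbeck step; sigma is annealed linearly over decay_period.
        frac = min(1.0, max(0.0, t) / self.decay_period)
        sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * frac
        dx = self.theta * (self.mu - self.state) + sigma * np.random.randn(self.size)
        self.state = self.state + dx
        return self.state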
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor,
        evaluation, bind_to_core, **kwargs):
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    # Configure things. Disable logging for rank != 0 to avoid noise.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Write to temp directory for all non-master workers.
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)

    # Create envs.
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'),
                                       force=True)
        env = SimpleMonitor(env)
        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(
                    eval_env, os.path.join(logdir, 'gym_eval'), force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        if evaluation:
            eval_env = gym.make(env_id)
        else:
            eval_env = None

    # Parse noise_type.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
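# Examples of noise_type specs accepted by the parsing loop in run();
# these follow directly from the branches above (underscore separates the
# noise kind from its stddev, commas combine several specs):
#   'none'                      -> no noise
#   'normal_0.1'                -> Gaussian action noise, stddev 0.1
#   'ou_0.2'                    -> Ornstein-Uhlenbeck action noise, stddev 0.2
#   'adaptive-param_0.2'        -> adaptive parameter noise, target stddev 0.2
#   'adaptive-param_0.2,ou_0.1' -> parameter noise plus OU action noise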
def main():
    FLAGS(sys.argv)
    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_cpu : %s" % FLAGS.num_cpu)
    print("lr : %s" % FLAGS.lr)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/mineral/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        with sc2_env.SC2Env("CollectMineralShards",
                            step_mul=step_mul,
                            visualize=True) as env:
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=64,
                lr=1e-3,
                max_timesteps=20000000,
                buffer_size=10000,
                exploration_fraction=0.5,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")
    elif FLAGS.algorithm == "acktr":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0
        # def make_env(rank):
        #     def _thunk():
        #         env = sc2_env.SC2Env(map_name=FLAGS.map,
        #                              step_mul=step_mul,
        #                              visualize=True)
        #         # env.seed(seed + rank)
        #         if logger.get_dir():
        #             env = bench.Monitor(env, os.path.join(
        #                 logger.get_dir(), "{}.monitor.json".format(rank)))
        #         return env
        #     return _thunk
        # agents = [Agent() for _ in range(num_cpu)]
        # for agent in agents:
        #     time.sleep(1)
        #     agent.daemon = True
        #     agent.start()
        # agent_controller = AgentController(agents)
        # set_global_seeds(seed)
        env = SubprocVecEnv(FLAGS.num_cpu, FLAGS.map)
        policy_fn = CnnPolicy
        acktr_disc.learn(policy_fn,
                         env,
                         seed,
                         total_timesteps=num_timesteps,
                         nprocs=FLAGS.num_cpu,
                         ent_coef=0.1,
                         callback=acktr_callback)
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = StarCraft2Env(map_name=arglist.scenario,
                            reward_only_positive=False,
                            obs_last_action=True,
                            obs_timestep_number=True,
                            reward_scale_rate=200)
        # Create agent trainers
        env_info = env.get_env_info()
        num_agents = env_info["n_agents"]
        num_adversaries = num_agents
        obs_shape_n = [(env_info["obs_shape"], )
                       for i in range(num_adversaries)]
        action_space_n = [env_info["n_actions"]
                          for i in range(num_adversaries)]
        buffer_size = arglist.buffer_size
        trainers = get_trainers(num_adversaries, obs_shape_n, action_space_n,
                                arglist, buffer_size)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()
        logdir = "./tensorboard/"
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(num_agents)]  # individual agent reward
        saver = tf.train.Saver(max_to_keep=100000000)
        n_actions_no_attack = 6

        env.reset()
        obs_n = []
        reward_hl_own_old = []
        reward_hl_en_old = []
        # This first loop collects the initial observations and health values.
        for agent_id in range(num_agents):
            obs_n.append(env.get_obs_agent(agent_id))
            reward_hl_own_old.append(env.get_agent_health(agent_id))
            reward_hl_en_old.append(env.get_enemy_health(agent_id))

        episode_step = 0
        step = 0
        print('Starting iterations...')
        while True:
            # get action
            action_set_actual = []
            action_set_execute = []
            action_n = []
            dead_unit = []
            for agent_id in range(num_agents):
                action_output = trainers[agent_id].action(obs_n[agent_id])
                action_n.append(action_output)
                action_prob = action_output
                action_to_choose = np.argmax(action_prob)
                action_set_actual.append(action_to_choose)
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                if action_to_choose in avail_actions_ind:
                    action_set_execute.append(action_to_choose)
                elif avail_actions[0] == 1:
                    # Chosen action unavailable and the agent is dead:
                    # substitute NO_OP.
                    action_set_execute.append(0)
                else:
                    # Chosen action unavailable: substitute STOP.
                    action_set_execute.append(1)
                if len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0:
                    # Only NO_OP is available, so this agent is dead.
                    dead_unit.append(agent_id)

            rew_base, done, _ = env.step(action_set_execute)
            episode_rewards[-1] += rew_base

            new_obs_n = []
            reward_hl_own_new = []
            reward_hl_en_new = []
            rew_n = []
            for agent_id in range(num_agents):
                new_obs_n.append(env.get_obs_agent(agent_id=agent_id))
                reward_hl_own_new.append(env.get_agent_health(agent_id))
                reward_hl_en_new.append(env.get_enemy_health(agent_id))

            for agent_id in range(num_agents):
                if agent_id in dead_unit:
                    reward = 0
                elif action_set_execute[agent_id] != action_set_actual[agent_id]:
                    # The network's action could not be executed; a substitute
                    # ran instead, but the original action is stored with a
                    # negative reward.
                    reward = -2
                elif action_set_execute[agent_id] > 5:
                    target_id = (action_set_execute[agent_id] -
                                 n_actions_no_attack)
                    health_reduce_en = (reward_hl_en_old[target_id] -
                                        reward_hl_en_new[target_id])
                    if health_reduce_en > 0:
                        if rew_base > 0:
                            reward = 2 + rew_base
                        else:
                            reward = 2
                    else:
                        reward = 1
                else:
                    reward = (reward_hl_own_new[agent_id] -
                              reward_hl_own_old[agent_id]) * 5
                rew_n.append(reward)

            episode_step += 1
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done)
            obs_n = new_obs_n
            reward_hl_own_old = reward_hl_own_new
            reward_hl_en_old = reward_hl_en_new

            for i, rew in enumerate(rew_n):
                agent_rewards[i][-1] += rew

            if done:
                print("steps until now : %s, episode: %s, episode reward: %s"
                      % (step, len(episode_rewards), episode_rewards[-1]))
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("episode reward", episode_rewards[-1])
                for i in range(num_agents):
                    logger.record_tabular(
                        "agent" + str(i) + " episode reward",
                        agent_rewards[i][-1])
                logger.dump_tabular()

                env.reset()
                obs_n = []
                reward_hl_own_old = []
                reward_hl_en_old = []
                # Collect the initial observations and health values again.
                for agent_id in range(num_agents):
                    obs_n.append(env.get_obs_agent(agent_id))
                    reward_hl_own_old.append(env.get_agent_health(agent_id))
                    reward_hl_en_old.append(env.get_enemy_health(agent_id))
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)

            # increment global step counter
            step += 1
            if step == arglist.buffer_size:
                print("Training starts.")

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, step)

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                save_dir = (arglist.save_dir + "/model_" + str(step) +
                            "steps/" + arglist.exp_name)
                U.save_state(save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}"
                          .format(step, len(episode_rewards),
                                  np.mean(episode_rewards[-arglist.save_rate:])))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}".format(
                              step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:])
                               for rew in agent_rewards]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards) - 1))
                break
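# get_trainers() is called above but not shown. A minimal sketch, assuming a
# MADDPG-style trainer class; MADDPGAgentTrainer and mlp_model are
# placeholder names following the openai/maddpg convention -- only the call
# signature comes from the training loop above.
def get_trainers(num_adversaries, obs_shape_n, action_space_n, arglist,
                 buffer_size):
    trainers = []
    for i in range(num_adversaries):
        trainers.append(
            MADDPGAgentTrainer(
                "agent_%d" % i, mlp_model, obs_shape_n, action_space_n, i,
                arglist, local_q_func=(arglist.adv_policy == "ddpg")))
    return trainers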
def main():
    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    FLAGS(sys.argv)
    # steps_left = FLAGS.timesteps
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif FLAGS.algorithm == "BicNet":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        # alternative: sc2_env.Dimensions(screen=84, minimap=64); both get
        # processed into fixed-size feature matrices
        feature_dimensions=sc2_env.Dimensions(screen=64, minimap=64),
        use_feature_units=True)

    lr = FLAGS.lr
    batch_size = 32
    gamma = 0.99
    num_agents = 9
    vector_obs_len = 33  # 4096 # 32*32 1024
    output_len = 3
    hidden_vector_len = 128  # 1
    tau = 0.001
    # stddev = 0.1

    sess = U.make_session()
    sess.__enter__()
    actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents,
                            vector_obs_len, output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())

    # while steps_left > 0:
    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            save_replay_episodes=1,
            replay_dir="D:/StarCraft II/StarCraft II/Replays/video/0722",
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=False,  # True
            game_steps_per_episode=steps * step_mul) as env:
        learn(
            env,
            sess=sess,
            max_timesteps=FLAGS.timesteps,
            # callback=BicNet_callback,
            actor=actor,
            num_agents=num_agents)
_ATTACK_SCREEN = actions.FUNCTIONS.Attack_screen.id
_SELECT_ARMY = actions.FUNCTIONS.select_army.id
_SELECT_UNIT = actions.FUNCTIONS.select_unit.id
_SELECT_POINT = actions.FUNCTIONS.select_point.id
_NOT_QUEUED = [0]
_SELECT_ALL = [0]
UP, DOWN, LEFT, RIGHT = 'up', 'down', 'left', 'right'

# to record the output
start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
logdir = "./tensorboard/enjoy/%s" % start_time
Logger.DEFAULT = Logger.CURRENT = Logger(
    dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

FLAGS = flags.FLAGS
flags.DEFINE_string("map_name", "DefeatZerglingsAndBanelings",
                    "the map you want to see")
flags.DEFINE_string(
    "trained_model",
    "/home/tld/PycharmProjects/DeepQ_StarCraft2/models/deepq/zergling_45.6.pkl",
    "the model you have trained")
flags.DEFINE_bool("visualize", True, "whether to render the game")
flags.DEFINE_integer("num_actions", 4, "number of available actions")
flags.DEFINE_integer("step_mul", 5, "game steps per agent step")
flags.DEFINE_integer("episode_steps", 2800, "game steps per episode")


def main():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name=FLAGS.map_name,
def main():
    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    FLAGS(sys.argv)
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif FLAGS.algorithm == "BicNet":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        # alternative: sc2_env.Dimensions(screen=84, minimap=64); both get
        # processed into fixed-size feature matrices
        feature_dimensions=sc2_env.Dimensions(screen=64, minimap=64))

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=True,
            game_steps_per_episode=steps * step_mul) as env:
        # This model would need to be changed into an LSTM form for BicNet.
        model = deepq.models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=True)
        demo_replay = []
        # act = dqfd.learn(
        #     env,
        #     q_func=model,
        #     num_actions=3,
        #     lr=1e-4,
        #     max_timesteps=10000000,
        #     buffer_size=100000,
        #     exploration_fraction=0.5,
        #     exploration_final_eps=0.01,
        #     train_freq=2,
        #     learning_starts=100000,
        #     target_network_update_freq=1000,
        #     gamma=0.99,
        #     prioritized_replay=True,
        #     callback=deepq_callback)
        # act.save("defeat_zerglings.pkl")
        BicNet_findAndDefeatZergling.learn(
            env,
            lr=FLAGS.lr,
            max_timesteps=FLAGS.timesteps,
            buffer_size=100000,
            train_freq=1,
            learning_starts=1000,  # 100000
            target_network_update_freq=1000,
            gamma=0.99,
            callback=BicNet_callback)
def main():
    FLAGS(sys.argv)
    steps = 0  # test steps
    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)
        print("random lr : %s" % FLAGS.lr)
    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=16, minimap=16))
        # Temp solution - sc2_env.Agent(sc2_env.Race.terran) might be too
        # restricting. We need this change because sc2 now requires
        # specifying players.
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[
                    sc2_env.Agent(race=sc2_env.Race.terran),
                    sc2_env.Agent(race=sc2_env.Race.terran)
                ],
                step_mul=step_mul,
                visualize=True,
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:
            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)
            acts = deepq_nexus_wars.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            agent = random_agent.RandomAgent()
            run_loop.run_loop([agent], env, steps)
            acts[0].save("mineral_shards_x.pkl")
            acts[1].save("mineral_shards_y.pkl")
    elif FLAGS.algorithm == "deepq-4way":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[
                    sc2_env.Agent(race=sc2_env.Race.terran),
                    sc2_env.Agent(race=sc2_env.Race.terran)
                ],
                step_mul=step_mul,
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:
            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)
            act.save("mineral_shards.pkl")
    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)
        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
def main():
    FLAGS(sys.argv)
    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)
        print("random lr : %s" % FLAGS.lr)
    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        # interface.feature_layer.resolution and
        # interface.feature_layer.minimap_resolution
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))  # 16 16
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,  # roughly the agent's effective actions per second
                visualize=True,
                # screen_size_px=(16, 16),
                # minimap_size_px=(16, 16),
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:
            # This model takes an observation as input and returns values of
            # all actions; note how it is used in deepq_mineral_shards.learn.
            # Each convs entry is (num filters, kernel size, stride).
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[512], dueling=True
            # Train the model and save it. Alternative learners tried here:
            # act = deepq_ActSeparate.learn(
            # act = deepq_actSeparateWith4Directions.learn(
            # act = deepq_actionGroup_4way.learn(
            # act = deep_DiffActInSameTime.learn(
            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=4,  # default 16; also tried 256, 3, 4
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                # matching callbacks: deepq_callback,
                # deepq_ActSeperate_callback,
                # deepq_actSeparateWith4Directions_callback,
                # deep_DiffActInSameTime_callback
                callback=deepq_actSeparateWith4Directions_callback)
            # After training, save the model to mineral_shards.pkl for use
            # by enjoy_mineral_shards.py.
            act.save("mineral_shards.pkl")
    elif FLAGS.algorithm == "deepq-4way":
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,
                # screen_size_px=(32, 32),
                # minimap_size_px=(32, 32),
                save_replay_episodes=2,
                replay_dir="D:/StarCraft II/StarCraft II/video",
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            # model = deepq.models.mlp(hiddens=[256, 128, 4])
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)
            act.save("mineral_shards.pkl")
    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)
        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
def main():
    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")

    lr = 0.002
    buffer_size = 80000  # aim for ~1/10 of the training steps
    batch_size = 32
    gamma = 0.99
    num_agents = 8
    vector_obs_len = 248  # local obs: 80; global state: 168
    output_len = 14
    hidden_vector_len = 256  # 128
    tau = 0.001
    num_exploring = buffer_size
    action_low = -1
    action_high = 1
    save_freq = 10000
    # min_life = 45

    env = StarCraft2Env(map_name="8m", reward_only_positive=False,
                        reward_scale_rate=200)  # also: DefeatZerglingsAndBanelings
    env_info = env.get_env_info()
    n_episodes = 4000  # 2000
    # n_actions = env_info["n_actions"]
    n_agents = env_info["n_agents"]
    episode_len = env_info["episode_limit"]
    timesteps = n_episodes * episode_len

    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
        "BicNet", timesteps, 16, lr, start_time)
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()
    # state_dim = (n_agents, vector_obs_len)
    # action_dim = (n_agents, output_len)
    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents,
                         vector_obs_len, output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(),
                           num_agents, vector_obs_len, output_len,
                           hidden_vector_len)
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    action_noise = OU_noise(decay_period=timesteps - buffer_size)
    action_noise.reset()
    # model_file_load = os.path.join(str(350000) + "_" + "model_segment_training2/", "defeat_zerglings")
    # U.load_state(model_file_load, sess)
    U.initialize()

    t = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        obs = np.array(env.get_obs())
        # state, target_attack = env.get_state()
        state, min_health = env.get_state()  # renamed from `min`, which shadows the builtin
        screen_expand = np.zeros([obs.shape[0], obs.shape[1] + state.shape[0]])
        for i in range(obs.shape[0]):
            screen_expand[i] = np.append(obs[i], state.flatten())
        # screen_expand = state_transform(obs)
        # screen_expand = state_expand(state, n_agents)

        while not terminated:
            t = t + 1
            screen_input = np.expand_dims(screen_expand, axis=0)
            action = actor.predict(screen_input)[0]
            act_with_noise = np.clip(
                action + action_noise.get_noise(t - num_exploring),
                action_low, action_high)
            act_mat_norm = (act_with_noise + 1) / 2
            actions = []
            dead_unit = []
            rew_expand = np.zeros((n_agents, 1))
            # punish = []
            # health_agent = []
            # health_enemy = []
            agent_group = []

            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]
                # act_prob = act_unit_norm / np.sum(act_unit_norm, axis=0)
                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]
                if sum_avail_act == 0:
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act
                # index = np.random.choice(np.arange(0, 14), p=act_prob.ravel())
                index = np.random.choice(np.array(avail_actions_ind),
                                         p=act_prob.ravel())
                # punish.append(index not in avail_actions_ind)
                actions.append(index)
                # health_agent.append(state[4 * agent_id])
                # health_enemy.append(state[4 * n_agents + 3 * agent_id])
                if len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0:
                    dead_unit.append(agent_id)

            reward, terminated, _ = env.step(actions)
            # rew_expand = np.ones((n_agents, 1)) * reward
            for i in range(n_agents):
                if i not in dead_unit:
                    rew_expand[i] += reward
                    if actions[i] > 5:
                        enemy_id = actions[i] - 6
                        rew_expand[i] += 1
                        # bonus when another agent focuses the same,
                        # lowest-health enemy
                        for j in range(n_agents):
                            if actions[j] == actions[i] and i != j:
                                if state[4 * n_agents + 3 * enemy_id] == min_health:
                                    rew_expand[i] += 1

            new_obs = np.array(env.get_obs())
            # new_state, target_attack = env.get_state()
            new_state, min_health = env.get_state()
            new_screen_expand = np.zeros(
                [new_obs.shape[0], new_obs.shape[1] + new_state.shape[0]])
            for i in range(new_obs.shape[0]):
                new_screen_expand[i] = np.append(new_obs[i],
                                                 new_state.flatten())
            # An earlier reward-shaping variant based on per-agent health
            # deltas (life_reduce_agent / life_reduce_enemy) and punish
            # flags is omitted here.

            replay_buffer.add(screen_expand, act_with_noise, rew_expand,
                              terminated, new_screen_expand)
            episode_reward += reward
            screen_expand = new_screen_expand
            # state = new_state

            if t == num_exploring:
                print("training starts")  # originally printed on every step past this point
            if t >= num_exploring:
                s_batch, a_batch, r_batch, done_batch, s2_batch = \
                    replay_buffer.sample_batch(batch_size)  # [group0: [batch_size, trace.dimension], group1, ... group8]
                target_q = r_batch + gamma * critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(target_q, (batch_size, num_agents, output_len)))
                a_outs = actor.predict(s_batch)  # a_outs and a_batch are identical
                grads = critic.action_gradients(s_batch, a_outs)  # gradient of Q w.r.t. the actions
                actor.train(s_batch, grads)
                actor.update_target_network()
                critic.update_target_network()
                # if t % save_freq == 0:
                #     model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
                #     U.save_state(model_file_save)
                #     replay_buffer.save()

        print("Total reward in episode {} = {}".format(e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward", episode_reward)
        logger.dump_tabular()

    # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
    # U.save_state(model_file_save)
    env.close()
def main():
    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")

    lr = 0.002
    batch_size = 32
    num_agents = 8
    vector_obs_len = 248  # local obs: 80; global state: 168
    output_len = 14
    hidden_vector_len = 256  # 128
    tau = 0.001

    env = StarCraft2Env(map_name="8m", reward_only_positive=False,
                        reward_scale_rate=200)  # also: DefeatZerglingsAndBanelings
    env_info = env.get_env_info()
    n_episodes = 4000  # 2000
    n_agents = env_info["n_agents"]
    episode_len = env_info["episode_limit"]
    timesteps = n_episodes * episode_len

    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
        "BicNet", timesteps, 16, lr, start_time)
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()
    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents,
                         vector_obs_len, output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(),
                           num_agents, vector_obs_len, output_len,
                           hidden_vector_len)
    sess.run(tf.global_variables_initializer())
    model_file_load = os.path.join(
        str(300000) + "_" + "model_segment_training/", "defeat_zerglings")
    U.load_state(model_file_load, sess)

    t = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        obs = np.array(env.get_obs())
        state, min_health = env.get_state()  # renamed from `min`, which shadows the builtin
        screen_expand = np.zeros([obs.shape[0], obs.shape[1] + state.shape[0]])
        for i in range(obs.shape[0]):
            screen_expand[i] = np.append(obs[i], state.flatten())

        while not terminated:
            t = t + 1
            screen_input = np.expand_dims(screen_expand, axis=0)
            action = actor.predict(screen_input)[0]
            act_with_noise = action  # no exploration noise at test time
            act_mat_norm = (act_with_noise + 1) / 2
            actions = []
            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]
                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]
                if sum_avail_act == 0:
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act
                index = np.random.choice(np.array(avail_actions_ind),
                                         p=act_prob.ravel())
                actions.append(index)

            reward, terminated, _ = env.step(actions)
            new_obs = np.array(env.get_obs())
            new_state, min_health = env.get_state()
            new_screen_expand = np.zeros(
                [new_obs.shape[0], new_obs.shape[1] + new_state.shape[0]])
            for i in range(new_obs.shape[0]):
                new_screen_expand[i] = np.append(new_obs[i],
                                                 new_state.flatten())
            episode_reward += reward
            screen_expand = new_screen_expand

        print("Total reward in episode {} = {}".format(e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward", episode_reward)
        logger.dump_tabular()

    env.close()
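# The same masked-renormalization action selection recurs in several of the
# loops above. A hedged standalone refactoring of that pattern; the helper
# name is ours, the logic mirrors the inline code.
import numpy as np

def sample_masked_action(act_unit_norm, avail_actions):
    """Renormalize one agent's [0,1] action scores over its available
    actions and sample an action index from the resulting distribution."""
    avail_ind = np.nonzero(avail_actions)[0]
    scores = np.array([act_unit_norm[i] for i in avail_ind])
    total = scores.sum()
    if total == 0:
        probs = (scores + 1) / len(scores)  # all scores zero: fall back to uniform
    else:
        probs = scores / total
    return np.random.choice(avail_ind, p=probs.ravel())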
def main():
    FLAGS(sys.argv)
    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)
        print("random lr : %s" % FLAGS.lr)
    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":
        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            visualize=True,
                            screen_size_px=(16, 16),
                            minimap_size_px=(16, 16)) as env:
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")
    elif FLAGS.algorithm == "deepq-4way":
        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            screen_size_px=(32, 32),
                            minimap_size_px=(32, 32),
                            visualize=True) as env:
            model = deepq.models.cnn_to_mlp(
                convs=[(16, 8, 4), (32, 4, 2)], hiddens=[256], dueling=True)
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)
            act.save("mineral_shards.pkl")
    elif FLAGS.algorithm == "a2c":
        num_timesteps = int(40e6)
        num_timesteps //= 4
        seed = 0
        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.map)
        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
def main():
    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    FLAGS(sys.argv)
    # steps_left = FLAGS.timesteps
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif FLAGS.algorithm == "acktr":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif FLAGS.algorithm == "BicNet":
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(
            dir=None, output_formats=[HumanOutputFormat(sys.stdout)])

    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        # alternative: sc2_env.Dimensions(screen=84, minimap=64); both get
        # processed into fixed-size feature matrices
        feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32),
        use_feature_units=True)

    lr = FLAGS.lr
    buffer_size = 60000  # 50000; aim for ~1/10 of the training steps
    batch_size = 32
    gamma = 0.99
    num_agents = 2  # 9
    vector_obs_len = 736  # 33 # 4096 # 32*32 1024
    output_len = 4  # 3
    hidden_vector_len = 128  # 1
    tau = 0.001
    # stddev = 0.1

    sess = U.make_session()
    sess.__enter__()
    actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents,
                            vector_obs_len, output_len, hidden_vector_len)
    critic = tb.CriticNetwork(sess, lr, tau, gamma,
                              actor.get_num_trainable_vars(), num_agents,
                              vector_obs_len, output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(1), sigma=float(stddev) * np.ones(1))
    action_noise = noise_OU.OU_noise(decay_period=FLAGS.timesteps - buffer_size)

    # while steps_left > 0:
    with sc2_env.SC2Env(
            map_name="CollectMineralShards",  # DefeatZerglingsAndBanelings
            step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=False,  # True
            game_steps_per_episode=steps * step_mul) as env:
        learn(
            env,
            sess=sess,
            max_timesteps=FLAGS.timesteps,
            train_freq=1,
            save_freq=10000,
            target_network_update_freq=1,  # 1000
            gamma=gamma,
            # callback=BicNet_callback,
            actor=actor,
            critic=critic,
            replay_buffer=replay_buffer,
            num_agents=num_agents,
            action_noise=action_noise,
            output_len=output_len,
            num_exploring=buffer_size)
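# Every ActorNetwork/CriticNetwork in these scripts takes a tau for Polyak
# target updates. Below is a hedged TF1-style sketch of what an
# update_target_network() built on tau typically does -- the standard DDPG
# soft update, not these classes' confirmed internals.
import tensorflow as tf

def make_soft_update_op(online_vars, target_vars, tau=0.001):
    ops = []
    for w, w_target in zip(online_vars, target_vars):
        # w_target <- tau * w + (1 - tau) * w_target
        ops.append(w_target.assign(tau * w + (1.0 - tau) * w_target))
    return tf.group(*ops)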