def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=True) as env:
        sess = tf.Session()
        actor = Actor(sess, n_features=2, n_actions=4, lr=0.001)
        critic = Critic(sess, n_features=2, lr=0.001)
        sess.run(tf.global_variables_initializer())
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            # wait until Move_screen (id 331) is available, i.e. the marine is selected
            while 331 not in obs[0].observation["available_actions"]:
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = np.array(obs2state(obs))
            #print('episode start')
            global_step = 0
            reward = 0
            while not done:
                global_step += 1
                action = actor.choose_action(state)
                actions = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[actions])
                for i in range(3):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = np.array(obs2state(obs))
                reward = -(distance - pre_distance) * 400
                if distance < 0.03 or global_step == 200:  # end of the episode
                    if distance < 0.03:
                        reward = 10
                    if global_step == 200:
                        reward = -10
                    done = True
                td_error = critic.learn(state, reward, next_state)
                actor.learn(state, action, td_error)
                if distance < 0.03 or global_step == 200:  # end of the episode
                    break
                state = next_state
                pre_distance = distance
def worker(remote, visualize):
    env = sc2_env.SC2Env(
        map_name='CollectMineralShards',
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=32,
            feature_minimap=32,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=False),
        step_mul=4,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=True)  # note: the visualize argument is ignored here
    done = False
    pre_num_mineral = 20
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 400
        if cmd == 'step':
            if action != 'done':
                available_action, state = obs
                a = actAgent2Pysc2(action, state)
                obs = env.step(actions=[a])
                state = obs2state(obs)
                obs = (obs[0].observation.available_actions, state)
                # reward shaping
                mineral_map = state.reshape(32, 32, 2)
                num_mineral = np.sum(mineral_map[:, :, 1])
                reward = -0.1
                if num_mineral != pre_num_mineral:  # a mineral shard was collected
                    reward = -num_mineral + pre_num_mineral
                if num_mineral <= 2 or global_step == end_step - 1:  # all shards collected, or the step limit was reached
                    done = True
                pre_num_mineral = num_mineral
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))
        if cmd == 'reset':
            pre_num_mineral = 20
            done = False
            obs = env.reset()  # reset the environment
            while 331 not in obs[0].observation.available_actions:  # select the marines
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            obs = (obs[0].observation.available_actions, state)
            remote.send((obs, state, 0, 0, False))
        if cmd == 'close':
            remote.close()
            break
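The worker above talks to a learner process through remote.recv() and remote.send(), which suggests a multiprocessing.Pipe-based setup with one SC2Env per process. The launcher below is only a minimal sketch under that assumption; the process count and the helper name make_workers are illustrative, not the project's actual runner.

# Assumed sketch of a Pipe-based launcher for the worker above; not the
# repository's actual runner. Each worker process owns its own SC2Env.
from multiprocessing import Pipe, Process

NUM_WORKERS = 4  # illustrative value

def make_workers():
    # Spawn one worker process per environment; each gets the child end of a Pipe.
    parent_conns, processes = [], []
    for _ in range(NUM_WORKERS):
        parent, child = Pipe()
        p = Process(target=worker, args=(child, False), daemon=True)
        p.start()
        parent_conns.append(parent)
        processes.append(p)
    return parent_conns, processes

# Usage sketch: commands follow the (cmd, action, obs, global_step) tuple the worker expects.
# conns, procs = make_workers()
# for conn in conns:
#     conn.send(('reset', 0, 0, 0))
# results = [conn.recv() for conn in conns]  # each is (obs, state, action, reward, done)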
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=True) as env:
        with tf.Session() as sess:
            Policy = Policy_net('policy', 2, 4)
            Old_Policy = Policy_net('old_policy', 2, 4)
            PPO = PPOTrain(Policy, Old_Policy)
            D = Discriminator()
            saver = tf.train.Saver()
            saver.restore(sess, './model/gail.cpkt')
            c = 0
            for episodes in range(100000):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation.available_actions:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                observations = []
                actions_list = []
                rewards = []
                v_preds = []
                reward = 0
                global_step = 0
                while not done:
                    global_step += 1
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    observations.append(state)
                    actions_list.append(act)
                    rewards.append(reward)
                    v_preds.append(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])
                    next_state = obs2state(obs)
                    distance = obs2distance(obs)
                    if distance < 0.03 or global_step == 100:
                        done = True
                    if done:
                        v_preds_next = v_preds[1:] + [0]
                        break
                    state = next_state
def worker(remote, visualize):
    env = sc2_env.SC2Env(
        map_name='MoveToBeacon',
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=32,
            feature_minimap=32,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=False),
        step_mul=4,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=False)  # note: the visualize argument is ignored here
    done = False
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 400
        if cmd == 'step':
            if action != 'done':
                available_action, state = obs
                a = actAgent2Pysc2(action, state)
                obs = env.step(actions=[a])
                distance = obs2distance(obs)
                state = obs2state(obs)
                obs = (obs[0].observation.available_actions, state)
                # reward shaping
                reward = -0.1
                if distance < 0.03 or global_step == end_step - 1:
                    if distance < 0.03:
                        reward = 1
                    if global_step == end_step - 1:
                        reward = -1
                    done = True
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))
        if cmd == 'reset':
            pre_num_mineral = 20  # unused here; leftover from the CollectMineralShards worker
            done = False
            obs = env.reset()  # reset the environment
            while 331 not in obs[0].observation.available_actions:  # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            obs = (obs[0].observation.available_actions, state)
            remote.send((obs, state, 0, 0, False))
        if cmd == 'close':
            remote.close()
            break
def worker(remote, visualize):
    env = sc2_env.SC2Env(
        map_name='MoveToBeacon',
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=64,
            feature_minimap=64,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=False),
        step_mul=4,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=True)  # note: the visualize argument is ignored here
    done = False
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 100
        if cmd == 'step':
            if action != 'done':
                # while 331 not in obs[0].observation['available_actions']:  # select the marine
                #     actions = actAgent2Pysc2(100, obs)
                #     obs = env.step(actions=[actions])
                a = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[a])
                for i in range(1):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                distance = obs2distance(obs)
                reward = -0.1
                obs = obs[0].observation.feature_screen.base[4]  # raw screen feature layer returned to the learner
                if distance < 0.03 or global_step == end_step - 1:
                    if distance < 0.03:
                        reward = 1
                    done = True
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))
        if cmd == 'reset':
            done = False
            obs = env.reset()  # reset the environment
            while 331 not in obs[0].observation['available_actions']:  # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            obs = obs[0].observation.feature_screen.base[4]
            remote.send((obs, state, 0, 0, False))
        if cmd == 'close':
            remote.close()
            break
def test(): FLAGS(sys.argv) with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env: sess = tf.Session() mainDQN = DQN(sess, 2, 4, name='main') targetDQN = DQN(sess, 2, 4, name='target') #sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() saver.restore(sess, './Move2Beacon(DQN)/model.cpkt') copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main") sess.run(copy_ops) for episodes in range(EPISODES): done = False obs = env.reset() while not 331 in obs[0].observation["available_actions"]: actions = actAgent2Pysc2(100, obs) obs = env.step(actions=[actions]) state = obs2state(obs) print('episode start') global_step = 0 random_rate = 0 e = 1. / ((episodes / 10) + 1) reward = 0 while not done: time.sleep(0.13) global_step += 1 action = np.argmax(mainDQN.predict(state)) actions = actAgent2Pysc2(action, obs) obs = env.step(actions=[actions]) for i in range(1): actions = no_operation(obs) obs = env.step(actions=[actions]) distance = obs2distance(obs) if global_step == 1: pre_distance = distance next_state = obs2state(obs) reward = -(distance - pre_distance) * 400 if distance < 0.015 or global_step == 200: # 게임 종료시 done = True if distance < 0.015 or global_step == 200: # 게임 종료시 print(reward, episodes, random_rate / global_step) break state = next_state pre_distance = distance
def worker(remote, visualize):
    env = sc2_env.SC2Env(map_name='MoveToBeacon',
                         step_mul=4,
                         visualize=visualize,
                         screen_size_px=(64, 64),
                         minimap_size_px=(64, 64))
    done = False
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 100
        if cmd == 'step':
            if action != 'done':
                while 331 not in obs[0].observation['available_actions']:  # select the marine
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                a = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[a])
                for i in range(1):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                distance = obs2distance(obs)
                reward = -0.01
                if distance < 0.03 or global_step == end_step - 1:
                    if distance < 0.03:
                        reward = 1
                    if global_step == end_step - 1:
                        reward = -1
                    done = True
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))
        if cmd == 'reset':
            done = False
            obs = env.reset()  # reset the environment
            while 331 not in obs[0].observation['available_actions']:  # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            remote.send((obs, state, 0, 0, False))
        if cmd == 'close':
            remote.close()
            break
def train(): FLAGS(sys.argv) with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env: sess = tf.Session() actor = Actor(sess, n_features=2, n_actions=4, lr=0.001) critic = Critic(sess, n_features=2, lr=0.001) sess.run(tf.global_variables_initializer()) for episodes in range(EPISODES): done = False obs = env.reset() while not 331 in obs[0].observation["available_actions"]: actions = actAgent2Pysc2(100, obs) obs = env.step(actions=[actions]) state = np.array(obs2state(obs)) print('episode start') global_step = 0 reward = 0 while not done: global_step += 1 time.sleep(0.2) action = actor.choose_action(state) actions = actAgent2Pysc2(action, obs) obs = env.step(actions=[actions]) for i in range(3): actions = no_operation(obs) obs = env.step(actions=[actions]) distance = obs2distance(obs) if global_step == 1: pre_distance = distance next_state = np.array(obs2state(obs)) reward = -(distance - pre_distance) * 400 if distance < 0.03 or global_step == 200: # 게임 종료시 if distance < 0.03: reward = 10 if global_step == 200: reward = -10 done = True td_error = critic.learn(state, reward, next_state) actor.learn(state, action, td_error) if distance < 0.03 or global_step == 200: # 게임 종료시 break state = next_state pre_distance = distance
def train(): FLAGS(sys.argv) with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul, screen_size_px=(16, 16), minimap_size_px=(16, 16)) as env: Policy = Policy_net('policy', 16 * 16 * 2, 4) Old_Policy = Policy_net('old_policy', 16 * 16 * 2, 4) PPO = PPOTrain(Policy, Old_Policy, gamma=0.95) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for episodes in range(EPISODES): done = False obs = env.reset() while not 331 in obs[0].observation["available_actions"]: actions = actAgent2Pysc2(100, obs) obs = env.step(actions=[actions]) state = np.array(obs2state(obs)) print('episode start') global_step = 0 reward = 0 observations = [] actions_list = [] v_preds = [] rewards = [] while not done: global_step += 1 time.sleep(0.05) state = np.stack([state]).astype(dtype=np.float32) act, v_pred = Policy.act(obs=state, stochastic=True) act, v_pred = np.asscalar(act), np.asscalar(v_pred) actions = actAgent2Pysc2(act, obs) obs = env.step(actions=[actions]) for i in range(1): actions = no_operation(obs) obs = env.step(actions=[actions]) distance = obs2distance(obs) if global_step == 1: pre_distance = distance next_state = np.array(obs2state(obs)) reward = -10 * (distance - pre_distance) #if reward < 0 : # reward = -0.01 #if reward <= 0: # reward = 0 #elif reward > 0: # reward = 0 reward = -0.01 if distance < 0.03 or global_step == 100: # 게임 종료시 if distance < 0.03: reward = 1 if global_step == 200: reward = -1 done = True observations.append(state) actions_list.append(act) v_preds.append(v_pred) rewards.append(reward) if distance < 0.03 or global_step == 100: # 게임 종료시 v_preds_next = v_preds[1:] + [0] gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) observations = np.reshape(observations, newshape=[-1, 16 * 16 * 2]) actions = np.array(actions_list).astype(dtype=np.int32) rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype( dtype=np.float32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) PPO.assign_policy_parameters() inp = [ observations, actions, rewards, v_preds_next, gaes ] for epoch in range(5): sample_indices = np.random.randint( low=0, high=observations.shape[0], size=64) # indices are in [low, high) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], rewards=sampled_inp[2], v_preds_next=sampled_inp[3], gaes=sampled_inp[4]) print(episodes, global_step) break state = next_state pre_distance = distance
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=False) as env:
        r = tf.placeholder(tf.float32)  ########
        rr = tf.summary.scalar('reward', r)
        merged = tf.summary.merge_all()  ########
        expert_observations = np.genfromtxt('trajectory/observations.csv')
        expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)
        with tf.Session() as sess:
            Policy = Policy_net('policy', 2, 4)
            Old_Policy = Policy_net('old_policy', 2, 4)
            PPO = PPOTrain(Policy, Old_Policy)
            D = Discriminator()
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            writer = tf.summary.FileWriter('./board/gail', sess.graph)  ########
            c = 0
            for episodes in range(100000):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation.available_actions:
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                observations = []
                actions_list = []
                rewards = []
                v_preds = []
                reward = 0
                global_step = 0
                while not done:
                    global_step += 1
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    observations.append(state)
                    actions_list.append(act)
                    rewards.append(reward)
                    v_preds.append(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])
                    next_state = obs2state(obs)
                    distance = obs2distance(obs)
                    if distance < 0.03 or global_step == 100:
                        done = True
                    if done:
                        v_preds_next = v_preds[1:] + [0]
                        break
                    state = next_state
                observations = np.reshape(observations, newshape=[-1, 2])
                actions_list = np.array(actions_list).astype(dtype=np.int32)
                # train the discriminator to separate expert transitions from agent transitions
                for i in range(2):
                    sample_indices = np.random.randint(expert_observations.shape[0],
                                                       size=observations.shape[0])
                    inp = [expert_observations, expert_actions]
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0) for a in inp
                    ]  # sample training data
                    D.train(expert_s=sampled_inp[0],
                            expert_a=sampled_inp[1],
                            agent_s=observations,
                            agent_a=actions_list)
                # replace environment rewards with discriminator-based rewards for PPO
                d_rewards = D.get_rewards(agent_s=observations, agent_a=actions_list)
                d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)
                gaes = PPO.get_gaes(rewards=d_rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)
                gaes = np.array(gaes).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
                inp = [observations, actions_list, gaes, d_rewards, v_preds_next]
                PPO.assign_policy_parameters()
                for epoch in range(15):
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0],
                        size=32)  # indices are in [low, high)
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0) for a in inp
                    ]  # sample training data
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              gaes=sampled_inp[2],
                              rewards=sampled_inp[3],
                              v_preds_next=sampled_inp[4])
                summary = sess.run(merged, feed_dict={r: global_step})
                writer.add_summary(summary, episodes)
                if global_step < 50:
                    c += 1
                else:
                    c = 0
                if c > 10:
                    saver.save(sess, './model/gail.cpkt')
                    print('save model')
                    break
                print(episodes, global_step, c)
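The GAIL trainer above reads expert demonstrations from trajectory/observations.csv and trajectory/actions.csv with np.genfromtxt. How those files are produced is not shown here; one hypothetical way to record them, assuming the same 2-dimensional state and integer action used above, is sketched below (the function name and paths are illustrative).

import numpy as np

def save_expert_trajectory(states, actions,
                           obs_path='trajectory/observations.csv',
                           act_path='trajectory/actions.csv'):
    # Hypothetical recorder: writes whitespace-delimited rows that the
    # np.genfromtxt calls in train() can read back. states is a list of
    # 2-element state vectors, actions a list of integer action ids.
    np.savetxt(obs_path, np.asarray(states, dtype=np.float32))
    np.savetxt(act_path, np.asarray(actions, dtype=np.int32), fmt='%d')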
def train(): FLAGS(sys.argv) with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env: replay_buffer = deque(maxlen=1000) sess = tf.Session() mainDQN = DQN(sess, 2, 4, name='main') targetDQN = DQN(sess, 2, 4, name='target') sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() #saver.restore(sess, './Move2Beacon/model.cpkt') copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main") sess.run(copy_ops) for episodes in range(EPISODES): done = False obs = env.reset() while not 331 in obs[0].observation["available_actions"]: actions = actAgent2Pysc2(100, obs) obs = env.step(actions=[actions]) state = obs2state(obs) print('episode start') global_step = 0 random_rate = 0 e = 1. / ((episodes / 10) + 1) reward = 0 while not done: global_step += 1 time.sleep(0.05) if np.random.rand() < e: random_rate += 1 action = random.randrange(4) else: action = np.argmax(mainDQN.predict(state)) #action = np.argmax(mainDQN.predict(state)) actions = actAgent2Pysc2(action, obs) obs = env.step(actions=[actions]) for i in range(3): actions = no_operation(obs) obs = env.step(actions=[actions]) distance = obs2distance(obs) if global_step == 1: pre_distance = distance next_state = obs2state(obs) reward = -(distance - pre_distance) * 400 #print(reward) if distance < 0.03 or global_step == 200: # 게임 종료시 if distance < 0.03: reward = 10 if global_step == 200: reward = -10 done = True #print(next_state, reward) replay_buffer.append((state, action, reward, next_state, done)) if distance < 0.03 or global_step == 200: # 게임 종료시 if len(replay_buffer) > BATCH_SIZE: minibatch = random.sample(replay_buffer, BATCH_SIZE) loss, _ = replay_train(mainDQN, targetDQN, minibatch) sess.run(copy_ops) print('model trained') saver.save(sess, './Move2Beacon/model.cpkt') print(reward, episodes, random_rate / global_step) break state = next_state pre_distance = distance
def train(): FLAGS(sys.argv) with sc2_env.SC2Env(map_name="CollectMineralShards", step_mul=step_mul, screen_size_px=(32, 32), minimap_size_px=(32, 32)) as env: Policy = Policy_net('policy', 32*32*2, 4) Old_Policy = Policy_net('old_policy', 32*32*2, 4) PPO = PPOTrain(Policy, Old_Policy, gamma=0.95) saver = tf.train.Saver() with tf.Session() as sess: print('a') saver.restore(sess, './model/model.ckpt') print('a') #sess.run(tf.global_variables_initializer()) for episodes in range(EPISODES): done = False obs = env.reset() while not 331 in obs[0].observation["available_actions"]: actions = actAgent2Pysc2(100, obs) obs = env.step(actions=[actions]) actions = gather(obs) obs = env.step(actions=[actions]) end_step = 200 global_step = 0 score = 0 reward = 0 for i in range(100): time.sleep(0.01) actions = no_operation(obs) obs = env.step(actions=[actions]) state = obs2state(obs) observations = [] actions_list = [] v_preds = [] rewards = [] print('episode start') while not done: global_step += 1 time.sleep(0.05) state = np.stack([state]).astype(dtype=np.float32) act, v_pred = Policy.act(obs=state, stochastic=True) act, v_pred = np.asscalar(act), np.asscalar(v_pred) actions = actAgent2Pysc2(act, obs) #while not 331 in obs[0].observation["available_actions"]: # actions = actAgent2Pysc2(100, obs) # obs = env.step(actions=[actions]) obs = env.step(actions=[actions]) if global_step == end_step or obs2done(obs) >= 1900 : # 게임 time을 다 사용하거나 미네랄을 다 먹었을 경우 게임이 끝난다. done = True next_state = obs2state(obs) reward = obs[0].reward if reward == 0: reward = -0.1 if done: if obs2done(obs) >= 1900: # 게임이 종료되었는데 미네랄을 다 먹었으면 reward = 3 else: # 게임이 종료되었는데 미네랄을 다 못먹으면 reward = -3 score += reward observations.append(state) actions_list.append(act) v_preds.append(v_pred) rewards.append(reward) if done: # 게임 종료시 v_preds_next = v_preds[1:] + [0] gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) observations = np.reshape(observations, newshape=[-1, 32*32*2]) actions = np.array(actions_list).astype(dtype=np.int32) rewards = np.array(rewards).astype(np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) PPO.assign_policy_parameters() inp = [observations, actions, rewards, v_preds_next, gaes] for epoch in range(5): sample_indices = np.random.randint(low=0, high=observations.shape[0], size=64) sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], rewards=sampled_inp[2], v_preds_next=sampled_inp[3], gaes=sampled_inp[4]) print(episodes, score) save_path = saver.save(sess, './model/model.ckpt') if episodes == 0: f = open('test2.csv', 'w', encoding='utf-8', newline='') else: f = open('test2.csv', 'a', encoding='utf-8', newline='') wr = csv.writer(f) wr.writerow([episodes, score]) f.close() break state = next_state