def main():
    args = parser.get_args()
    path = args["path"]
    if not os.path.exists(path):
        print("File does not exist")
        sys.exit(1)
    todos = utils.load_todos(path)
    utils.get_action(args, todos)
def assign_tasks(data):
    # TODO: Ask Sadegh about node structure
    current_state = unpack(data)

    pub = rospy.Publisher('task_assigner/assignment', TaskAssignment)
    rospy.init_node('task_assigner')

    with open('mdp_info.json', 'r') as mdp_info_file:
        mdp_info = json.load(mdp_info_file)

    # TODO: Make a map service
    with open(WORLDS_DIRECTORY + "world.json", "r") as world_file:
        world = json.load(world_file)

    while not rospy.is_shutdown():
        # TODO: Query state from some source
        current_action = utils.get_action(mdp_info, current_state)

        for t, r in current_action:
            msg = TaskAssignment()
            msg.robot_id = r.get_id()
            msg.problem = problem_generator.generate_escort_problem(r, t, world)

            rospy.loginfo(msg)
            pub.publish(msg)

        rospy.sleep(TASK_DURATION)
def extract_tree_expression(self, node, index_mark='_'):
    if node is None or node.data is None:
        return self.expression_str

    (feature, parentheses, action, wl_scalar, wl_power, parentheses_bias,
     wl_activation, parentheses_activation, wl_bias,
     parentheses_power) = Individual.get_all_merged_values(node.data)

    if parentheses == 1:
        self.expression_str += utils.get_activation(parentheses_activation) + '('

    self.expression_str += utils.get_activation(wl_activation)
    self.expression_str += '({}'.format(wl_scalar) + '*'  # add wl scalar
    self.expression_str += '{}{}{}'.format(index_mark, feature, index_mark)
    self.expression_str += '**{}'.format(wl_power) + '+{}'.format(wl_bias)
    self.expression_str += ')'
    self.expression_str += utils.get_action(action)

    self.expression_str = self.extract_tree_expression(node.left, index_mark)
    self.expression_str = self.extract_tree_expression(node.right, index_mark)

    if parentheses == 1:
        # put closing parenthesis before the trailing action
        self.expression_str = (self.expression_str[:-1]
                               + '+{})'.format(parentheses_bias)
                               + '**{}'.format(parentheses_power)
                               + self.expression_str[-1])

    return self.expression_str
def _get_path(net, dataset, map, map_index, start_pos, goal_pos, max_number_steps):
    with torch.no_grad():
        success = True
        path = [start_pos]
        pos = start_pos

        for idx in range(max_number_steps):
            # ensure that the whole perceptive area lies within the grid world
            if (pos[0] >= 3 * map.size()[0] // 4 or pos[0] < map.size()[0] // 4
                    or pos[1] >= 3 * map.size()[1] // 4 or pos[1] < map.size()[1] // 4):
                return (path, False)

            # reached goal
            if pos[0] == goal_pos[0] and pos[1] == goal_pos[1] and pos[2] == goal_pos[2]:
                return (path, success)

            if idx > 0:
                # get indices of the cells that contain the wheels
                fl, fr, bl, br = get_wheel_coord(pos, net.rotation_step_size,
                                                 net.leg_x, net.leg_y)
                fl, fr, bl, br = (fl.round().long(), fr.round().long(),
                                  bl.round().long(), br.round().long())

                # check collision for each wheel
                if (map[fl[0], fl[1]] == 1 or map[fr[0], fr[1]] == 1
                        or map[bl[0], bl[1]] == 1 or map[br[0], br[1]] == 1):
                    success = False

            # get net input for the current position
            start_orientation = pos[2].to(net.device)
            occ_map, goal_map = dataset.get_inputs((map_index, pos, goal_pos))
            occ_map, goal_map = (occ_map.unsqueeze_(0).to(net.device),
                                 goal_map.unsqueeze_(0).to(net.device))

            # predict next action
            action_vector = net.forward(occ_map, goal_map, start_orientation)
            action = get_action(action_vector[0], dim=3)

            # update position and orientation
            new_pos = pos + action
            if new_pos[2] < 0:
                new_pos[2] += net.num_orientations
            elif new_pos[2] >= net.num_orientations:
                new_pos[2] -= net.num_orientations

            path.append(new_pos)
            pos = new_pos

        if pos[0] == goal_pos[0] and pos[1] == goal_pos[1] and pos[2] == goal_pos[2]:
            # reached goal
            return (path, success)
        else:
            # did not reach goal
            return (path, False)
def choose_action(self, obs):
    action_id = get_action(self.qval[obs], self.player_lambda)
    self.probs[obs] = np.array(current_probs)
    chosen_action = ID_TO_ACTION[self.player_type][action_id]

    if self.save_history:
        self.history['states'].append(obs)
        self.history['actions'].append(chosen_action)

    return chosen_action
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))

    net.to(device)
    net.eval()

    epsilon = 0
    steps = 0

    for e in range(5):
        done = False

        score = 0
        state = env.reset()
        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1

            qvalue = net(history.unsqueeze(0))
            action = get_action(epsilon, qvalue, num_actions)

            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            score += reward
            history = next_history

        print('{} episode | score: {:.2f}'.format(e, score))
def run_caesar():
    action = utils.get_action()
    encrypting = action == 'E'
    data = clean_caesar(utils.get_input(binary=False))

    print("* Transform *")
    print("{}crypting {} using Caesar cipher...".format(
        'En' if encrypting else 'De', data))

    output = (encrypt_caesar if encrypting else decrypt_caesar)(data)
    utils.set_output(output)
def test_validate_mask(self):
    env = tf_py_environment.TFPyEnvironment(self.env)
    policy = random_tf_policy.RandomTFPolicy(
        time_step_spec=env.time_step_spec(),
        action_spec=env.action_spec(),
        observation_and_action_constraint_splitter=GameEnv.obs_and_mask_splitter)
    driver = dynamic_step_driver.DynamicStepDriver(env, policy, num_steps=1)

    for i in range(10):
        time_step, _ = driver.run()
        action_step = policy.action(time_step)
        print(utils.get_action(action_step.action.numpy()[0], 3))
def run_vigenere():
    action = utils.get_action()
    encrypting = action == 'E'
    data = clean_vigenere(utils.get_input(binary=False))

    print("* Transform *")
    keyword = clean_vigenere(input("Keyword? "))
    print("{}crypting {} using Vigenere cipher and keyword {}...".format(
        'En' if encrypting else 'De', data, keyword))

    output = (encrypt_vigenere if encrypting else decrypt_vigenere)(data, keyword)
    utils.set_output(output)
def choose_action(self, obs):
    features = []
    for action_id in range(2):
        features.append(self.features[-1] + [STATE_TO_ID[obs], action_id])

    self.qval[obs] = self.model(
        torch.tensor(features, dtype=torch.float32)).data.numpy().ravel()
    action_id = get_action(self.qval[obs], self.player_lambda)
    chosen_action = ID_TO_ACTION[self.player_type][action_id]

    if self.save_history:
        self.history['states'].append(obs)
        self.history['actions'].append(chosen_action)

    self.rounds += 1
    return chosen_action
def _get_path(net, dataset, map, map_index, start_pos, goal_pos, max_number_steps):
    with torch.no_grad():
        success = True
        path = [start_pos]
        pos = start_pos

        for idx in range(max_number_steps):
            # ensure that the whole perceptive area lies within the grid world
            if (pos[0] >= 3 * map.size()[0] // 4 or pos[0] < map.size()[0] // 4
                    or pos[1] >= 3 * map.size()[1] // 4 or pos[1] < map.size()[1] // 4):
                return (path, False)

            # reached goal
            if pos[0] == goal_pos[0] and pos[1] == goal_pos[1]:
                return (path, success)

            # check collision
            if map[pos[0], pos[1]] == 1:
                success = False

            # get input maps for the current position
            occ_map, goal_map = dataset.get_inputs((map_index, pos, goal_pos))
            occ_map, goal_map = (occ_map.unsqueeze_(0).to(net.device),
                                 goal_map.unsqueeze_(0).to(net.device))

            # predict next action
            action_vector = net.forward(occ_map, goal_map)
            action = get_action(action_vector[0], dim=2)

            # update position
            new_pos = pos + action
            path.append(new_pos)
            pos = new_pos

        if pos[0] == goal_pos[0] and pos[1] == goal_pos[1]:
            # reached goal
            return (path, success)
        else:
            # did not reach goal
            return (path, False)
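# NOTE: the two _get_path variants above rely on a get_action(action_vector, dim)
# helper that turns the network output into a grid displacement. Its real
# definition is not part of these excerpts; the following is only an illustrative
# sketch that assumes a 4-connected 2D action set decoded by argmax.
import torch

_MOVES_2D = torch.tensor([[1, 0], [-1, 0], [0, 1], [0, -1]])  # assumed action order


def get_action_sketch(action_vector, dim=2):
    """Pick the highest-scoring action and return it as a position offset."""
    assert dim == 2, "sketch only covers the 2D case"
    idx = torch.argmax(action_vector).item()
    return _MOVES_2D[idx]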
def run_merkle_hellman():
    action = utils.get_action()

    print("* Seed *")
    seed = input("Set Seed [enter for random]: ")
    import random
    random.seed(seed)

    print("* Building private key...")
    private_key = generate_private_key()
    public_key = create_public_key(private_key)

    if action == 'E':  # Encrypt
        data = utils.get_input(binary=True)
        print("* Transform *")
        chunks = encrypt_mh(data, public_key)
        output = ' '.join(map(str, chunks))
    else:  # Decrypt
        data = utils.get_input(binary=False)
        chunks = [int(line.strip()) for line in data.split() if line.strip()]
        print("* Transform *")
        output = decrypt_mh(chunks, private_key)

    utils.set_output(output)
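# NOTE: the three cipher runners above call utils.get_action() with no arguments
# and branch on the result being 'E'. The helper itself is not shown in these
# excerpts; the following is only a hypothetical sketch of such a prompt, not the
# project's actual utils.get_action implementation.
def get_action_sketch():
    """Ask the user whether to encrypt or decrypt and return 'E' or 'D'."""
    choice = ''
    while choice not in ('E', 'D'):
        choice = input("Encrypt or decrypt? [E/D] ").strip().upper()
    return choice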
def _rollout(net, batch_size=128, validation=True, num_workers=4):
    with torch.no_grad():
        diff = 0.
        net_length = 0.
        expert_length = 0.

        # load dataset and make it available to all workers
        global rollout_data
        if validation:
            rollout_data = GridDataset_2d(net.size, data_type='validation', full_paths=True)
        else:
            rollout_data = GridDataset_2d(net.size, data_type='evaluation', full_paths=True)

        iterations = rollout_data.num_examples

        # list of all tasks (describes task through map and path indices)
        open_paths = [(i, j) for i in range(rollout_data.num_examples)
                      for j in range(rollout_data.num_paths_per_map)]
        paths = [[[rollout_data.expert_paths[map_id][path_id][0]]
                  for path_id in range(rollout_data.num_paths_per_map)]
                 for map_id in range(rollout_data.num_examples)]
        success = [[False for path_id in range(rollout_data.num_paths_per_map)]
                   for map_id in range(rollout_data.num_examples)]

        path_length = 0

        if not validation:
            print("Starting Rollout-Test.")
            print("Max expert path length:", rollout_data.max_path_length)
            start_time = time.time()

        pool = Pool(processes=num_workers)

        while len(open_paths) != 0 and path_length < 2 * rollout_data.max_path_length:
            parameters = []

            # get map indices and current positions for all open paths
            for map_id, path_id in open_paths:
                parameters.append((map_id, paths[map_id][path_id][-1],
                                   rollout_data.expert_paths[map_id][path_id][-1]))

            # get inputs for all open paths
            inputs = pool.map(_get_inputs, parameters)

            path_length += 1
            current_open_task_id = 0

            # predict next step for each open path
            for input_batch in batch(inputs, batch_size):
                # unpack inputs
                occ_maps, goal_maps = zip(*input_batch)
                occ_maps, goal_maps = (torch.stack(occ_maps, dim=0).to(net.device),
                                       torch.stack(goal_maps, dim=0).to(net.device))

                # predict next action
                action_vectors = net.forward(occ_maps, goal_maps)

                for i in range(action_vectors.size(0)):
                    # update positions and paths
                    map_id, path_id = open_paths[current_open_task_id]
                    action = get_action(action_vectors[i], dim=2)
                    pos = paths[map_id][path_id][-1] + action
                    paths[map_id][path_id].append(pos)
                    goal_pos = rollout_data.expert_paths[map_id][path_id][-1]

                    # reached goal
                    if pos[0] == goal_pos[0] and pos[1] == goal_pos[1]:
                        success[map_id][path_id] = True
                        del open_paths[current_open_task_id]
                        continue

                    # check upper border for path length (to detect oscillation)
                    if path_length > 2 * len(rollout_data.expert_paths[map_id][path_id]):
                        del open_paths[current_open_task_id]
                        continue

                    # ensure that the perceptive area lies completely within the grid world
                    if (pos[0] >= 3 * rollout_data.grids[map_id].size()[0] // 4
                            or pos[0] < rollout_data.grids[map_id].size()[0] // 4
                            or pos[1] >= 3 * rollout_data.grids[map_id].size()[1] // 4
                            or pos[1] < rollout_data.grids[map_id].size()[1] // 4):
                        del open_paths[current_open_task_id]
                        continue

                    # check collision
                    if rollout_data.grids[map_id][pos[0], pos[1]] == 1:
                        del open_paths[current_open_task_id]
                        continue

                    current_open_task_id += 1

            if not validation:
                if path_length % 20 == 0:
                    print("Computed paths up to length ", path_length)

        pool.close()

        # count successful paths
        num_successful = 0
        for i in range(rollout_data.num_examples):
            for j in range(rollout_data.num_paths_per_map):
                paths[i][j] = torch.stack(paths[i][j], dim=0)
                if success[i][j]:
                    num_successful += 1
                    if not validation:
                        # compare length of network and expert paths
                        diff += get_path_length(paths[i][j], dim=2) - get_path_length(
                            rollout_data.expert_paths[i][j], dim=2)
                        net_length += get_path_length(paths[i][j], dim=2)
                        expert_length += get_path_length(
                            rollout_data.expert_paths[i][j], dim=2)

        if not validation:
            print("Success: ", num_successful / len(rollout_data))
            print("Path length (network): ", net_length)
            print("Path length (expert): ", expert_length)
            print("Average absolute path difference: ", diff / num_successful)
            print("Average relative path difference: ", net_length / expert_length)
            print("Duration: ", time.time() - start_time)
            print("")

        return num_successful / len(rollout_data)
def train_dqn(episode, rand_obs=0, rand_act=0, noise_obs_level=0.01, noise_act_level=0.1):
    loss = []
    agent = DQN(env.action_space.n, env.observation_space.shape[0])
    all_actions = []
    all_rand_acts = []
    all_rewards = []

    for e in range(episode):
        curr_acts = []
        curr_rand_acts = []
        curr_rewards = []

        state = env.reset()
        state = np.reshape(state, (1, 8))
        score = 0
        max_steps = 5000
        for i in range(max_steps):
            if rand_obs == 1:
                state = get_observation(state, option=0, noise_obs_level=noise_obs_level)

            action = agent.act(state)
            if rand_act == 1:
                action, is_rand = get_action(action)
            else:
                action, is_rand = action, 0

            curr_acts.append(action)
            curr_rand_acts.append(is_rand)

            # env.render()
            next_state, reward, done, _ = env.step(action)
            curr_rewards.append(reward)

            score += reward
            next_state = np.reshape(next_state, (1, 8))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break

        loss.append(score)
        all_actions.append(np.array(curr_acts))
        all_rand_acts.append(np.array(curr_rand_acts))
        all_rewards.append(np.array(curr_rewards))

        # Average score of the last 100 episodes
        is_solved = np.mean(loss[-100:])
        # if is_solved > 50:
        #     print('\n Task Completed! \n')
        #     break
        print("Average over last 100 episodes: {0:.2f} \n".format(is_solved))

    # np.savez("./saved/dqn_rand_act_" + str(rand_act) + "_rand_obs_" + str(rand_obs) + ".npz",
    #          acts=np.array(all_actions),
    #          rand_actions=np.array(all_rand_acts),
    #          rewards=np.array(all_rewards),
    #          scores=np.array(loss))

    # np.savez("./saved_dqn/dqn_rand_act_" + str(rand_act) + "_rand_obs_" + str(rand_obs) +
    #          "_noise_obs_lvl_" + str(noise_obs_level) + ".npz",
    #          acts=np.array(all_actions),
    #          rand_actions=np.array(all_rand_acts),
    #          rewards=np.array(all_rewards),
    #          scores=np.array(loss))

    np.savez("./saved_dqn/dqn_rand_act_" + str(rand_act) + "_rand_obs_" + str(rand_obs) +
             "_noise_act_lvl_" + str(noise_act_level) + ".npz",
             acts=np.array(all_actions),
             rand_actions=np.array(all_rand_acts),
             rewards=np.array(all_rewards),
             scores=np.array(loss))

    return loss
def train_a3c(episode, rand_obs=0, rand_act=0):
    # Default parameters:
    #   gamma = 0.99
    #   lr = 0.02
    #   betas = (0.9, 0.999)
    #   random_seed = 543
    render = False
    gamma = 0.99
    lr = 0.02
    betas = (0.9, 0.999)
    random_seed = 543

    torch.manual_seed(random_seed)
    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    print(lr, betas)

    running_reward = 0
    loss_ls = []
    all_actions = []
    all_rand_acts = []
    all_rewards = []

    for i_episode in range(0, episode):
        curr_acts = []
        curr_rand_acts = []
        curr_rewards = []

        state = env.reset()
        score = 0
        for t in range(10000):
            if rand_obs == 1:
                state = get_observation(state, option=1)
            # action = agent.act(state)
            # state = get_observation(state, option=1)
            action = policy(state)
            if rand_act == 1:
                action, is_rand = get_action(action)
            else:
                action, is_rand = action, 0

            curr_acts.append(action)
            curr_rand_acts.append(is_rand)

            # action = get_action(action)
            state, reward, done, _ = env.step(action)
            curr_rewards.append(reward)
            policy.rewards.append(reward)
            running_reward += reward
            score += reward
            if render and i_episode > 1000:
                env.render()
            if done:
                break

        loss_ls.append(score)

        # Updating the policy:
        optimizer.zero_grad()
        loss = policy.calculateLoss(gamma)
        loss.backward()
        optimizer.step()
        policy.clearMemory()

        all_actions.append(np.array(curr_acts))
        all_rand_acts.append(np.array(curr_rand_acts))
        all_rewards.append(np.array(curr_rewards))

        # # saving the model if episodes > 999 OR avg reward > 200
        # if i_episode > 999:
        #     torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
        # if running_reward > 4000:
        #     torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
        #     print("########## Solved! ##########")
        #     test(name='LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
        #     break

        if i_episode % 20 == 0:
            running_reward = running_reward / 20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0

    np.savez("./saved/a3c_rand_act_" + str(rand_act) + "_rand_obs_" + str(rand_obs) + ".npz",
             acts=np.array(all_actions),
             rand_actions=np.array(all_rand_acts),
             rewards=np.array(all_rewards),
             scores=np.array(loss_ls))

    return loss_ls
# training loop
losses = []
ti = time()
for e in range(epochs):
    n_hands = 0
    winner = None
    actions = np.zeros((2)).astype(int)
    og_states = np.zeros((2, state_cards + 2))
    aux_state = None
    game_over = False

    while winner is None:
        n_hands += 1

        og_states[env.turn, :] = env.get_state().flatten()
        action_A = get_action(model, og_states[env.turn][np.newaxis],
                              env.legal_moves(), epsilon[e])
        actions[env.turn] = action_A
        hand_over, new_state, rewards, winner = env.play_card(action_A)

        if aux_state is not None:
            exp_replay.remember(aux_state[np.newaxis], aux_action, env.get_state(),
                                rewards[int(not env.hand_winner)], game_over)

        og_states[env.turn, :] = env.get_state().flatten()
        action_B = get_action(model, og_states[env.turn][np.newaxis],
                              env.legal_moves(), epsilon[e])
        actions[env.turn] = action_B
        hand_over, new_state, rewards, winner = env.play_card(action_B)

        aux_state = og_states[int(not env.hand_winner)]
def do_episode(self, config):
    """
    :param config:
    :return:
    """
    # Initial values
    done = False
    score_e = 0
    step_e = 0

    # Get epsilon for initial state
    self.update_epsilon_step()

    # Episodic decay (only after linear decay)
    self.update_alpha_episode()
    self.update_epsilon_episode()

    # Get current state s, act based on s
    state = self.discretize_state(self.env.reset())
    action = self.act(state)

    # Continue while not crashed
    all_acts = []
    rand_acts = []
    all_rewards = []
    while not done:
        # Update for other steps
        self.update_alpha_step()
        self.update_epsilon_step()

        # Get next state s' and reward, act based on s'
        state_, reward, done, _ = self.env.step(action)
        if config['rand_obs'] == 1:
            state_ = get_observation(state_, option=1)
        state_ = self.discretize_state(state_)
        action_ = self.act(state_)

        if config['rand_act'] == 1:
            action, is_rand = get_action(action)
        else:
            action, is_rand = action, 0
        all_acts.append(action)
        if is_rand:
            rand_acts.append(1)
        else:
            rand_acts.append(0)

        # Learn
        self.learn(done, state, action, reward, state_, action_)
        all_rewards.append(reward)

        # Set next state and action to current
        state = state_
        action = action_

        # Increment score and steps
        score_e += reward
        step_e += 1
        self.step += 1

    # Append score
    self.score.append(score_e)
    self.score_100.append(score_e)
    self.actions.append(np.array(all_acts))
    self.rand_actions.append(np.array(rand_acts))
    self.rewards.append(np.array(all_rewards))
    mean_score = np.mean(self.score_100)

    # Increment episode
    self.episode += 1
    outputs={'pi': opt_action, 'q': opt_action_value})

# Main loop
start_time = time.time()
ep_len, rewd, s = 0, 0.0, env.reset()

for t in range(num_epochs * ep_per_epoch):
    if random_steps >= t:
        a = action_space.sample()
        if random_steps == t:
            print("Finished pure random episodes")
    else:
        a = get_action({s_ph: s.reshape(1, -1)}, opt_action, sess, max_act, action_dim)

    s2, r, done, _ = env.step(a)
    rewd += r
    ep_len += 1
    # env.render()

    # Ignore the done signal when it comes from hitting the end of the episode:
    # d should mean the agent failed because of a very bad action, not that the
    # episode simply ran out of time.
    # TODO: could send wrong information
    d = False if ep_len == max_ep_len else done

    # Store transition
    buf.store(s, a, r, s2, d)
    s = s2
# parameters
datetime_minute_cached = None
position = 1  # order

while ws.ws.sock.connected:
    try:
        if datetime_minute_cached != datetime.now().minute:
            cur_time = datetime.now(pytz.timezone('Asia/Seoul'))
            df = get_minute_data(client, args.symbol, minutes=1, cur_time=cur_time)
            action = get_action(df)

            if action == 1 and position == 1:
                # market_order(client, args.symbol, "buy", args.amount)
                position = -1
                print("BUY")
            elif action == -1 and position == -1:
                # market_order(client, args.symbol, "sell", args.amount)
                position = 1
                print("SELL")
            else:
                print("HOLD")
def train_for_n(nb_epoch=5000, BATCH_SIZE=32):
    for e in tqdm(range(nb_epoch)):

        ### Shuffle and Batch the data
        _random = np.random.randint(0, emb_cs.shape[0], size=BATCH_SIZE)
        _random2 = np.random.randint(0, emb_zh.shape[0], size=BATCH_SIZE)
        if not WORD_ONLY:
            pos_seq_cs_batch = pos_seq_cs[_random]
            pos_seq_zh_batch = pos_seq_zh[_random2]
        emb_cs_batch = emb_cs[_random]
        emb_zh_batch = emb_zh[_random2]
        noise_g = np.random.normal(0, 1, size=(BATCH_SIZE, MAX_SEQUENCE_LENGTH, NOISE_SIZE))
        reward_batch = np.zeros((BATCH_SIZE, 1))

        #############################################
        ### Train generator
        #############################################
        for ep in range(1):  # G v.s. D training ratio
            if not WORD_ONLY:
                output_g = generator.predict(
                    [emb_zh_batch, pos_seq_zh_batch, noise_g, reward_batch])
            else:
                output_g = generator.predict([emb_zh_batch, noise_g, reward_batch])

            action_g, action_one_hot_g = get_action(output_g)
            emb_g = translate(emb_zh_batch, action_g)
            text_g = translate_output(emb_zh_batch, action_g)

            # tag POS
            if not WORD_ONLY:
                pos_seq_g = []
                for line in text_g:
                    words = pseg.cut(line)
                    sub_data = []
                    idx = 0
                    for w in words:
                        if w.flag == "x":
                            idx = 0
                        elif idx == 0:
                            sub_data.append(postag[w.flag])
                            idx = 1
                    pos_seq_g.append(sub_data)
                pos_seq_g = pad_sequences(pos_seq_g,
                                          maxlen=MAX_SEQUENCE_LENGTH,
                                          padding='post',
                                          truncating='post',
                                          value=0)

            one_hot_action = action_one_hot_g.reshape(BATCH_SIZE, MAX_SEQUENCE_LENGTH, 2)

            make_trainable(generator, True)
            if not WORD_ONLY:
                reward_batch = discriminator.predict([emb_g, pos_seq_g])[:, 0]
                g_loss = generator.train_on_batch(
                    [emb_zh_batch, pos_seq_zh_batch, noise_g, reward_batch],
                    one_hot_action)
            else:
                reward_batch = discriminator.predict([emb_g])[:, 0]
                g_loss = generator.train_on_batch(
                    [emb_zh_batch, noise_g, reward_batch], one_hot_action)

            losses["g"].append(g_loss)
            write_log(callbacks, log_g, g_loss, len(losses["g"]))
            if g_loss < 0.15:  # early stop
                break

        #############################################
        ### Train discriminator on generated sentence
        #############################################
        X_emb = np.concatenate((emb_cs_batch, emb_g))
        if not WORD_ONLY:
            X_pos = np.concatenate((pos_seq_cs_batch, pos_seq_g))
        y = np.zeros([2 * BATCH_SIZE])
        y[0:BATCH_SIZE] = 0.7 + np.random.random([BATCH_SIZE]) * 0.3
        y[BATCH_SIZE:] = 0 + np.random.random([BATCH_SIZE]) * 0.3

        make_trainable(discriminator, True)
        model.embedding_word.trainable = False
        if not WORD_ONLY:
            model.embedding_pos.trainable = False
        model.g_bi.trainable = False

        for ep in range(1):  # G v.s. D training ratio
            if not WORD_ONLY:
                d_loss = discriminator.train_on_batch([X_emb, X_pos], y)
            else:
                d_loss = discriminator.train_on_batch([X_emb], y)
            losses["d"].append(d_loss)
            write_log(callbacks, log_d, d_loss, len(losses["d"]))
            if d_loss < 0.6:  # early stop
                break

        ### Save model
        generator.save_weights(MODEL_PATH + "gen.mdl")
        discriminator.save_weights(MODEL_PATH + "dis.mdl")
callbacks.set_model(generator)
earlystopper = EarlyStopping(monitor='val_loss', patience=2, verbose=1)

### Pre-train the discriminator network ...
print("========== Pretraining Discriminator START!")
t1 = time.time()
noise_g = np.random.normal(0, 1, size=(ntrain, MAX_SEQUENCE_LENGTH, NOISE_SIZE))
if not WORD_ONLY:
    output_g = generator.predict([input_g_emb, input_g_pos, noise_g, reward])
else:
    output_g = generator.predict([input_g_emb, noise_g, reward])

action_g, action_one_hot_g = get_action(output_g)
emb_g = translate(input_g_emb, action_g)
text_g = translate_output(input_g_emb, action_g)

if not WORD_ONLY:
    pos_seq_g = []
    for line in text_g:
        words = pseg.cut(line)
        sub_data = []
        idx = 0
        for w in words:
            if w.flag == "x":
                idx = 0
            elif idx == 0:
                sub_data.append(postag[w.flag])
                idx = 1
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = env.action_space.n
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = FuN(num_actions)
    optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    net.train()

    epsilon = 1.0
    steps = 0

    for e in range(10000):
        memory = Memory(capacity=400)
        done = False
        dead = False

        score = 0
        avg_loss = []
        start_life = 6
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        state = state.permute(2, 0, 1)

        m_hx = torch.zeros(1, 288).to(device)
        m_cx = torch.zeros(1, 288).to(device)
        m_lstm = (m_hx, m_cx)

        w_hx = torch.zeros(1, 288).to(device)
        w_cx = torch.zeros(1, 288).to(device)
        w_lstm = (w_hx, w_cx)

        goals = torch.zeros(1, 288, 1).to(device)

        while not done:
            if args.render:
                env.render()

            steps += 1

            net_output = net(state.unsqueeze(0), m_lstm, w_lstm, goals)
            policy, goal, goals, m_lstm, w_lstm, m_value, w_value, m_state = net_output
            action = get_action(policy, num_actions)

            next_state, reward, done, info = env.step(action)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.permute(2, 0, 1)

            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            score += reward
            reward = np.clip(reward, -1, 1)

            mask = 0 if dead else 1
            memory.push(action, reward, mask, goal, policy, m_lstm, w_lstm,
                        m_value, w_value, m_state)

            if dead:
                batch = memory.sample()
                loss = train_model(net, optimizer, batch, args.gamma)
                avg_loss.append(loss.cpu().data)

                dead = False

                m_hx = torch.zeros(1, 288).to(device)
                m_cx = torch.zeros(1, 288).to(device)
                m_lstm = (m_hx, m_cx)

                w_hx = torch.zeros(1, 288).to(device)
                w_cx = torch.zeros(1, 288).to(device)
                w_lstm = (w_hx, w_cx)

                goals = torch.zeros(1, 288, 1).to(device)

            state = next_state

        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f} | steps: {} | loss: {:.4f}'.format(
                e, score, steps, np.mean(avg_loss)))
            writer.add_scalar('log/score', float(score), steps)
            writer.add_scalar('log/loss', np.mean(avg_loss), steps)

        if score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds the goal score, so training ends')
            break
memory = deque()
steps = 0
scores = []

while steps < 2048:
    episodes += 1
    state = env.reset()
    state = running_state(state)
    score = 0
    for _ in range(10000):
        if episodes % 50 == 0:
            env.render()

        steps += 1
        mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
        action = get_action(mu, std)[0]
        next_state, reward, done, _ = env.step(action)
        next_state = running_state(next_state)

        if done:
            mask = 0
        else:
            mask = 1

        memory.append([state, action, reward, mask])

        score += reward
        state = next_state

        if done:
            break
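# NOTE: the rollout above samples a continuous action via get_action(mu, std)[0].
# That helper lives outside this excerpt; the following is only a minimal sketch,
# assuming it draws from the Gaussian policy defined by the actor's outputs and
# returns the sample as a NumPy array.
import torch


def get_action_sketch(mu, std):
    """Sample an action from N(mu, std) and detach it to NumPy."""
    action = torch.normal(mu, std)
    return action.data.numpy()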
def self_play(agent, cur_memory, rank=0):
    agent.model.eval()

    state_black = deque()
    state_white = deque()
    pi_black = deque()
    pi_white = deque()

    episode = 0
    while True:
        if (episode + 1) % 10 == 0:
            logging.info('Playing Episode {:3}'.format(episode + 1))

        env = game.GameState('text')
        board = np.zeros((BOARD_SIZE, BOARD_SIZE), 'float')
        turn = 0
        root_id = (0,)
        win_index = 0
        time_steps = 0
        action_index = None

        while win_index == 0:
            if PRINT_SELFPLAY and rank == 0:
                utils.render_str(board, BOARD_SIZE, action_index)

            # ====================== start MCTS ============================ #
            if time_steps < TAU_THRES:
                tau = 1
            else:
                tau = 0

            pi = agent.get_pi(root_id, tau, rank)

            # ===================== collect samples ======================== #
            state = utils.get_state_pt(root_id, BOARD_SIZE, IN_PLANES)

            if turn == 0:
                state_black.appendleft(state)
                pi_black.appendleft(pi)
            else:
                state_white.appendleft(state)
                pi_white.appendleft(pi)

            # ======================== get action ========================== #
            action, action_index = utils.get_action(pi)
            root_id += (action_index,)

            # ====================== print evaluation ====================== #
            if PRINT_SELFPLAY and rank == 0:
                with torch.no_grad():
                    state_input = torch.tensor([state]).to(device).float()
                    p, v = agent.model(state_input)
                    p = p.cpu().numpy()[0]
                    v = v.item()

                print('\nPi:\n{}'.format(
                    pi.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))
                print('\nPolicy:\n{}'.format(
                    p.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))

                if turn == 0:
                    print("\nBlack's win%: {:.2f}%".format((v + 1) / 2 * 100))
                else:
                    print("\nWhite's win%: {:.2f}%".format((v + 1) / 2 * 100))

            # =========================== step ============================= #
            board, _, win_index, turn, _ = env.step(action)
            time_steps += 1

            # ========================== result ============================ #
            if win_index != 0:
                if win_index == 1:
                    reward_black = 1.
                    reward_white = -1.
                    result['Black'] += 1
                elif win_index == 2:
                    reward_black = -1.
                    reward_white = 1.
                    result['White'] += 1
                else:
                    reward_black = 0.
                    reward_white = 0.
                    result['Draw'] += 1

                # ====================== store in memory ======================= #
                while state_black or state_white:
                    if state_black:
                        cur_memory.append(
                            (state_black.pop(), pi_black.pop(), reward_black))
                    if state_white:
                        cur_memory.append(
                            (state_white.pop(), pi_white.pop(), reward_white))

                # ========================= result =========================== #
                if PRINT_SELFPLAY and rank == 0:
                    utils.render_str(board, BOARD_SIZE, action_index)

                    bw, ww, dr = result['Black'], result['White'], result['Draw']

                    print('')
                    print('=' * 20, " {:3} Game End ".format(episode + 1), '=' * 20)
                    print('Black Win: {:3} '
                          'White Win: {:3} '
                          'Draw: {:2} '
                          'Win%: {:.2f}%'.format(
                              bw, ww, dr,
                              (bw + 0.5 * dr) / (bw + ww + dr) * 100))
                    print('current memory size:', len(cur_memory))

        episode += 1
        agent.reset()

        if len(cur_memory) >= MEMORY_SIZE:
            return utils.augment_dataset(cur_memory, BOARD_SIZE)
def self_play(n_selfplay):
    global cur_memory, rep_memory
    global Agent

    state_black = deque()
    state_white = deque()
    pi_black = deque()
    pi_white = deque()

    if RESIGN_MODE:
        resign_val_black = []
        resign_val_white = []
        resign_val = []
        resign_v = -1.0
        n_resign_thres = N_SELFPLAY // 4

    for episode in range(n_selfplay):
        if (episode + 1) % 10 == 0:
            logging.warning('Playing Episode {:3}'.format(episode + 1))

        env = game.GameState('text')
        board = np.zeros((BOARD_SIZE, BOARD_SIZE), 'float')
        turn = 0
        root_id = (0,)
        win_index = 0
        time_steps = 0
        action_index = None

        if RESIGN_MODE:
            resign_index = 0

        while win_index == 0:
            if PRINT_SELFPLAY:
                utils.render_str(board, BOARD_SIZE, action_index)

            # ====================== start MCTS ============================ #
            if time_steps < TAU_THRES:
                tau = 1
            else:
                tau = 0

            pi = Agent.get_pi(root_id, tau)

            # ===================== collect samples ======================== #
            state = utils.get_state_pt(root_id, BOARD_SIZE, IN_PLANES)

            if turn == 0:
                state_black.appendleft(state)
                pi_black.appendleft(pi)
            else:
                state_white.appendleft(state)
                pi_white.appendleft(pi)

            # ======================== get action ========================== #
            action, action_index = utils.get_action(pi)
            root_id += (action_index,)

            # ====================== print evaluation ====================== #
            if PRINT_SELFPLAY:
                Agent.model.eval()
                with torch.no_grad():
                    state_input = torch.tensor([state]).to(device).float()
                    p, v = Agent.model(state_input)
                    p = p.cpu().numpy()[0]
                    v = v.item()

                print('\nPi:\n{}'.format(
                    pi.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))
                print('\nPolicy:\n{}'.format(
                    p.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))

                if turn == 0:
                    print("\nBlack's win%: {:.2f}%".format((v + 1) / 2 * 100))
                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            resign_val_black.append(v)
                        elif v < resign_v:
                            resign_index = 2
                            if PRINT_SELFPLAY:
                                print('"Black Resign!"')
                else:
                    print("\nWhite's win%: {:.2f}%".format((v + 1) / 2 * 100))
                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            resign_val_white.append(v)
                        elif v < resign_v:
                            resign_index = 1
                            if PRINT_SELFPLAY:
                                print('"White Resign!"')

            # =========================== step ============================= #
            board, _, win_index, turn, _ = env.step(action)
            time_steps += 1

            # ========================== result ============================ #
            if RESIGN_MODE:
                if resign_index != 0:
                    win_index = resign_index
                    result['Resign'] += 1

            if win_index != 0:
                if win_index == 1:
                    reward_black = 1.
                    reward_white = -1.
                    result['Black'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_black:
                                resign_val.append(val)
                        resign_val_black.clear()
                        resign_val_white.clear()

                elif win_index == 2:
                    reward_black = -1.
                    reward_white = 1.
                    result['White'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_white:
                                resign_val.append(val)
                        resign_val_white.clear()
                        resign_val_black.clear()

                else:
                    reward_black = 0.
                    reward_white = 0.
                    result['Draw'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_black:
                                resign_val.append(val)
                            for val in resign_val_white:
                                resign_val.append(val)
                        resign_val_black.clear()
                        resign_val_white.clear()

                if RESIGN_MODE:
                    if episode + 1 == n_resign_thres:
                        resign_v = min(resign_val)
                        resign_val.clear()

                        if PRINT_SELFPLAY:
                            print('Resign win%: {:.2f}%'.format(
                                (resign_v + 1) / 2 * 100))

                # ====================== store in memory ======================= #
                while state_black or state_white:
                    if state_black:
                        cur_memory.append(
                            (state_black.pop(), pi_black.pop(), reward_black))
                    if state_white:
                        cur_memory.append(
                            (state_white.pop(), pi_white.pop(), reward_white))

                # ========================= result =========================== #
                if PRINT_SELFPLAY:
                    utils.render_str(board, BOARD_SIZE, action_index)

                    bw, ww, dr, rs = result['Black'], result['White'], \
                        result['Draw'], result['Resign']

                    print('')
                    print('=' * 20, " {:3} Game End ".format(episode + 1), '=' * 20)
                    print('Black Win: {:3} '
                          'White Win: {:3} '
                          'Draw: {:2} '
                          'Win%: {:.2f}%'
                          '\nResign: {:2}'.format(
                              bw, ww, dr,
                              (bw + 0.5 * dr) / (bw + ww + dr) * 100, rs))
                    print('current memory size:', len(cur_memory))

        Agent.reset()

    rep_memory.extend(utils.augment_dataset(cur_memory, BOARD_SIZE))
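# NOTE: both self_play loops above obtain their move via utils.get_action(pi),
# which returns an (action, action_index) pair drawn from the MCTS policy pi.
# The real helper is not part of these excerpts; the sketch below only assumes
# that pi is a flat probability vector over BOARD_SIZE * BOARD_SIZE points and
# that the environment accepts a one-hot action array.
import numpy as np


def get_action_sketch(pi):
    """Sample a board point from pi and return a one-hot action plus its index."""
    action_index = int(np.random.choice(len(pi), p=pi))
    action = np.zeros(len(pi), dtype='float')
    action[action_index] = 1.0
    return action, action_index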
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    target_net = QNet(num_actions)
    update_target_model(net, target_net)

    optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()

    memory = Memory(100000)
    running_score = 0
    epsilon = 1.0
    steps = 0

    for e in range(10000):
        done = False
        dead = False

        score = 0
        avg_loss = []
        start_life = 5
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1

            qvalue = net(history.unsqueeze(0))
            action = get_action(epsilon, qvalue, num_actions)

            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            score += reward
            reward = np.clip(reward, -1, 1)

            mask = 0 if dead else 1
            memory.push(history.cpu(), next_history.cpu(), action, reward, mask)

            if dead:
                dead = False

            if steps > args.initial_exploration:
                epsilon -= 1e-6
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                loss = train_model(net, target_net, optimizer, batch)

                if steps % args.update_target == 0:
                    update_target_model(net, target_net)
            else:
                loss = 0

            avg_loss.append(loss)

            history = next_history

        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.4f} | steps: {} | loss: {:.4f}'
                  .format(e, score, epsilon, steps, np.mean(avg_loss)))
            writer.add_scalar('log/score', float(score), steps)
            writer.add_scalar('log/loss', np.mean(avg_loss), steps)

        if score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds the goal score, so training ends')
            break
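# NOTE: the two Atari DQN scripts above (evaluation and training) call
# get_action(epsilon, qvalue, num_actions). The helper is defined elsewhere in
# that project; the following is only a plausible epsilon-greedy sketch of it,
# assuming qvalue is a (1, num_actions) torch tensor of Q-values.
import random
import torch


def get_action_sketch(epsilon, qvalue, num_actions):
    """Return a random action with probability epsilon, else the greedy one."""
    if random.random() <= epsilon:
        return random.randrange(num_actions)
    return int(torch.argmax(qvalue, dim=1).item())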