def step(self, actionWanted):
    # Map the requested action onto the closest action index that is actually
    # placeable according to the validity section of the previous observation.
    current_lowest_dist = 190
    action = 0
    for i in range(0, 190):
        if len(self.prev_obs) == 0:
            action_possible = True
        else:
            newIndex = MAX_FOR_BLOCKS + i
            action_possible = self.prev_obs[newIndex // 10][newIndex % 10] == 1
        distance = abs(actionWanted - i)
        if action_possible and distance < current_lowest_dist:
            current_lowest_dist = distance
            action = i
    # Shaping term for whether the requested action was usable as-is
    # (currently computed but not added to the reward).
    reward_modifier = 0.5 if action == actionWanted else -0.5

    if self.invalid_tries > self.invalid_try_limit:
        self.amount_limitsurpass += 1
        if self.max_invalid_tries != -1 and self.amount_limitsurpass >= self.max_invalid_tries:
            print("ABORTING LEARNING DUE TO TOO MANY WRONG TRIES", self.amount_limitsurpass)
            sys.exit(-1)
        print("Invalid-try limit exceeded again", self.force_progression)
        self.invalid_tries = 0
        self.reward_finding_right = True
        if self.force_progression:
            # Let the built-in heuristic make a move so the game cannot stall.
            print("Forcing game to progress")
            rustLib.field_counter_action(self.field, 1)
            is_over = rustLib.field_is_game_over(self.field) == 1
            if not is_over:
                rustLib.field_counter_action(self.field, 0)
                is_over = rustLib.field_is_game_over(self.field) == 1
            return field_to_array(self.field), -2.0, is_over, {}

    answer = rustLib.field_do_action_with_answer(self.field, action, 2)
    placed = answer.placed
    reward = answer.reward
    done = answer.done == 0
    if placed == 0:
        # The block was placed successfully.
        self.invalid_tries = 0
        self.amount_limitsurpass = 0
        if self.reward_finding_right:
            self.reward_finding_right = False
            reward += 1
            print("Gave an extra bonus for finding the right combo after a lot of invalid tries", reward)
    else:
        self.invalid_tries += 1
        reward = -1

    if done:
        winner = answer.winner
        reward += 10.0 if winner == 0 else -10.0
        nprew = np.array(self.rewards)
        print("game is over, average reward is", np.average(nprew),
              "median is", np.median(nprew),
              "min and max are", np.min(nprew), np.max(nprew))

    new_observation = field_to_array(self.field)
    self.prev_obs = new_observation
    self.rewards.append(reward)
    return new_observation, reward, done, {}

def reset(self):
    rustLib.field_reset(self.field)
    self.invalid_tries = 0
    self.reward_finding_right = False
    self.rewards = []
    self.amount_limitsurpass = 0
    # Clear the cached observation so the first step treats every action as placeable.
    self.prev_obs = []
    return field_to_array(self.field)

def render(self, mode='human'):
    outfile = StringIO() if mode == 'ansi' else sys.stdout
    # field_to_array returns an array, so convert it to text before writing.
    outfile.write(str(field_to_array(self.field)) + "\n")
    # No need to return anything for human mode; it was already written to stdout.
    if mode != 'human':
        with closing(outfile):
            return outfile.getvalue()

def get_obs(self):
    player = 1 if self.self_play_is_second_player else 0
    if self.use_cnn:
        return field_to_cnn_array(self.field, player)
    else:
        return field_to_array(self.field, player)

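# A minimal smoke test for the environment above, written as a sketch: the function
# name smoke_test_random_game is illustrative and assumes the environment is
# registered under the "rustydiscrete-v0" id used elsewhere in this file. It plays
# one game with randomly sampled actions; step() remaps each request onto a nearby
# placeable action, so the game should finish on its own.
def smoke_test_random_game():
    env = gym.make("rustydiscrete-v0")
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        total_reward += reward
    print("random game finished with total reward", total_reward)
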
dirname = "D:\\4-System\\rusty\\" filename = "50000_heutistic_pretrain_" filename += "box" if is_box_space else "discrete" np.seterr(all='raise') origEnv = gym.make("rustybox-v0" if is_box_space else "rustydiscrete-v0") origEnv.max_invalid_tries = 7 env = VecCheckNan(DummyVecEnv([lambda: origEnv])) # Instantiate the agent model = PPO2.load("models/ppo2boxbestparam/2e4-30.pkl", env=env) # model.load("models/pretrain/"+filename) rustLib.field_restore_log(origEnv.field, oldLog.encode('utf-8')) obs = field_to_array(origEnv.field) actions, _states = model.predict(obs) if is_box_space: action = 0 current_max = -1 for i in range(0, len(actions)): action_probability = actions[i] newIndex = MAX_FOR_BLOCKS + i action_possible = obs[int(newIndex / 10)][newIndex % 10] == 1 if action_probability > current_max and action_possible: current_max = action_probability action = i rustLib.field_do_action(origEnv.field, action) else: rustLib.field_do_action(origEnv.field, actions) print("1;" + field_to_log(origEnv.field))
def generate_pretraindata_heuristics(save_interval, amount_of_games, is_box_space):
    env = gym.make("rustydiscrete-v0")
    # The Box-space env is only needed for its action_space.shape when the
    # one-hot actions are reshaped below.
    boxEnv = gym.make("rustybox-v0")
    actions = []
    boxActions = []
    observations = []
    rewards = []
    episode_returns = []
    episode_starts = []
    lastFile = ""
    startAt = 0
    directory = "D:\\4-System\\rusty\\"
    for f in os.listdir(directory):
        if f.endswith("npz"):
            lastFile = f
    # if len(lastFile) > 0:
    #     print("LOADING FILE", directory + lastFile, os.listdir(directory))
    #     loadedData = np.load(directory + lastFile)
    #     actions = loadedData["actions"].tolist()
    #     observations = loadedData["obs"].tolist()
    #     rewards = loadedData["rewards"].tolist()
    #     episode_returns = loadedData["episode_returns"].tolist()
    #     episode_starts = loadedData["episode_starts"].tolist()
    #     startAt = int(lastFile.split("_")[0]) + 1
    #     print("LOADED DATA IS", lastFile, len(episode_returns), startAt)
    #     loadedData = None

    gameField = rustLib.field_new()
    print("Beginning Games")
    for i in range(startAt, amount_of_games):
        game_over = False
        reward_sum = 0
        while not game_over:
            # Let the built-in heuristic choose and play the next move.
            action = rustLib.field_counter_action_index(gameField)
            answer = rustLib.field_do_action_with_answer(gameField, action, 1)
            if answer.placed != 0:
                print("Error a block couldn't be placed", action, answer)
                sys.exit(-1)
            actionArr = np.zeros(190, dtype=float)
            actionArr[action] = 1
            boxActions.append(actionArr)
            actions.append(action)
            observations.append(field_to_array(gameField))
            game_over = answer.done == 0
            reward = answer.reward
            episode_starts.append(game_over)
            if game_over:
                winner = answer.winner
                reward += 3.0 if winner == 0 else -3.0
            rewards.append(reward)
            reward_sum += reward
            if game_over:
                episode_returns.append(reward_sum)
                print("DONE WITH GAME NUMBER", i, reward_sum)
                rustLib.field_reset(gameField)
        if i > 0 and i % save_interval == 0:
            # Periodically dump both the discrete and the one-hot (Box) variants.
            numpy_dict = {
                'actions': np.array(actions).reshape((-1, 1)),
                'obs': np.array(observations),
                'rewards': np.array(rewards),
                'episode_returns': np.array(episode_returns),
                'episode_starts': np.array(episode_starts[:-1])
            }
            np.savez(directory + str(i) + "_heutistic_pretrain_discrete.npz", **numpy_dict)
            numpy_dict = {
                'actions': np.concatenate(boxActions).reshape((-1,) + boxEnv.action_space.shape),
                'obs': np.array(observations),
                'rewards': np.array(rewards),
                'episode_returns': np.array(episode_returns),
                'episode_starts': np.array(episode_starts[:-1])
            }
            np.savez(directory + str(i) + "_heutistic_pretrain_box.npz", **numpy_dict)
            numpy_dict = None

    rustLib.field_free(gameField)
    numpy_dict = {
        'actions': np.concatenate(boxActions).reshape((-1,) + boxEnv.action_space.shape)
                   if is_box_space else np.array(actions).reshape((-1, 1)),
        'obs': np.array(observations),
        'rewards': np.array(rewards),
        'episode_returns': np.array(episode_returns),
        'episode_starts': np.array(episode_starts[:-1])
    }
    return numpy_dict

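# Sketch of one way the dictionary returned by generate_pretraindata_heuristics
# could be used for behavioural cloning: its keys match the .npz layout expected by
# stable-baselines' ExpertDataset. The function name, batch size and epoch count are
# illustrative, and passing the dict directly via traj_data (rather than loading one
# of the saved .npz files via expert_path) is an assumption about that API.
from stable_baselines import PPO2
from stable_baselines.gail import ExpertDataset

def pretrain_from_heuristics(is_box_space=False):
    traj_data = generate_pretraindata_heuristics(save_interval=1000,
                                                 amount_of_games=5000,
                                                 is_box_space=is_box_space)
    dataset = ExpertDataset(traj_data=traj_data, batch_size=64)
    env = gym.make("rustybox-v0" if is_box_space else "rustydiscrete-v0")
    model = PPO2("MlpPolicy", env, verbose=1)
    model.pretrain(dataset, n_epochs=10)
    model.save("models/pretrain/heuristic_pretrain")
    return model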