def close(self): """Make sure to manually close, or else you'll leak the encoder process""" if not self.enabled: return if self.encoder: logger.debug('Closing video encoder: path=%s', self.path) self.encoder.close() self.encoder = None else: # No frames captured. Set metadata, and remove the empty output file. os.remove(self.path) if self.metadata is None: self.metadata = {} self.metadata['empty'] = True # If broken, get rid of the output file, otherwise we'd leak it. if self.broken: logger.info('Cleaning up paths for broken video recorder: path=%s metadata_path=%s', self.path, self.metadata_path) # Might have crashed before even starting the output file, don't try to remove in that case. if os.path.exists(self.path): os.remove(self.path) if self.metadata is None: self.metadata = {} self.metadata['broken'] = True self.write_metadata()
def _start(self, directory, video_callable=None, force=False, resume=False,
           write_upon_reset=False, uid=None, mode=None):
    """Start monitoring.

    Args:
        directory (str): A per-training run directory where to record stats.
        video_callable (Optional[function, False]): function that takes in the index of the episode and outputs a boolean, indicating whether we should record a video on this episode. The default (for video_callable is None) is to take perfect cubes, capped at 1000. False disables video recording.
        force (bool): Clear out existing training data from this directory (by deleting every file prefixed with "openaigym.").
        resume (bool): Retain the training data already in this directory, which will be merged with our new data
        write_upon_reset (bool): Write the manifest file on each reset. (This is currently a JSON file, so writing it is somewhat expensive.)
        uid (Optional[str]): A unique id used as part of the suffix for the file. By default, uses os.getpid().
        mode (['evaluation', 'training']): Whether this is an evaluation or training episode.
    """
    if self.env.spec is None:
        logger.warn("Trying to monitor an environment which has no 'spec' set. This usually means you did not create it via 'gym.make', and is recommended only for advanced users.")
        env_id = '(unknown)'
    else:
        env_id = self.env.spec.id

    if not os.path.exists(directory):
        logger.info('Creating monitor directory %s', directory)
        if six.PY3:
            os.makedirs(directory, exist_ok=True)
        else:
            os.makedirs(directory)

    if video_callable is None:
        video_callable = capped_cubic_video_schedule
    elif video_callable == False:
        video_callable = disable_videos
    elif not callable(video_callable):
        raise error.Error('You must provide a function, None, or False for video_callable, not {}: {}'.format(type(video_callable), video_callable))
    self.video_callable = video_callable

    # Check on whether we need to clear anything
    if force:
        clear_monitor_files(directory)
    elif not resume:
        training_manifests = detect_training_manifests(directory)
        if len(training_manifests) > 0:
            raise error.Error('''Trying to write to monitor directory {} with existing monitor files: {}. You should use a unique directory for each training run, or use 'force=True' to automatically clear previous monitor files.'''.format(directory, ', '.join(training_manifests[:5])))

    self._monitor_id = monitor_closer.register(self)

    self.enabled = True
    self.directory = os.path.abspath(directory)
    # We use the 'openai-gym' prefix to determine if a file is
    # ours
    self.file_prefix = FILE_PREFIX
    self.file_infix = '{}.{}'.format(self._monitor_id, uid if uid else os.getpid())

    self.stats_recorder = stats_recorder.StatsRecorder(directory, '{}.episode_batch.{}'.format(self.file_prefix, self.file_infix), autoreset=self.env_semantics_autoreset, env_id=env_id)

    if not os.path.exists(directory):
        os.mkdir(directory)
    self.write_upon_reset = write_upon_reset

    if mode is not None:
        self._set_mode(mode)
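A minimal usage sketch of the monitor above (hypothetical output directory; the wrappers.Monitor entry point and the video_callable argument are the ones used elsewhere in this collection):

import gym
from gym import wrappers

env = gym.make('CartPole-v0')
# Record a video on every 10th episode instead of the default capped cubic schedule.
env = wrappers.Monitor(env, directory='/tmp/cartpole-monitor', force=True,
                       video_callable=lambda episode_id: episode_id % 10 == 0)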
def test_env_semantics(spec):
    logger.warn("Skipping this test. Existing hashes were generated in a bad way")
    return
    with open(ROLLOUT_FILE) as data_file:
        rollout_dict = json.load(data_file)

    if spec.id not in rollout_dict:
        if not spec.nondeterministic:
            logger.warn("Rollout does not exist for {}, run generate_json.py to generate rollouts for new envs".format(spec.id))
        return

    logger.info("Testing rollout for {} environment...".format(spec.id))

    observations_now, actions_now, rewards_now, dones_now = generate_rollout_hash(spec)

    errors = []
    if rollout_dict[spec.id]['observations'] != observations_now:
        errors.append('Observations not equal for {} -- expected {} but got {}'.format(spec.id, rollout_dict[spec.id]['observations'], observations_now))
    if rollout_dict[spec.id]['actions'] != actions_now:
        errors.append('Actions not equal for {} -- expected {} but got {}'.format(spec.id, rollout_dict[spec.id]['actions'], actions_now))
    if rollout_dict[spec.id]['rewards'] != rewards_now:
        errors.append('Rewards not equal for {} -- expected {} but got {}'.format(spec.id, rollout_dict[spec.id]['rewards'], rewards_now))
    if rollout_dict[spec.id]['dones'] != dones_now:
        errors.append('Dones not equal for {} -- expected {} but got {}'.format(spec.id, rollout_dict[spec.id]['dones'], dones_now))
    if len(errors):
        for error in errors:
            logger.warn(error)
        raise ValueError(errors)
def clear_monitor_files(training_dir):
    files = detect_monitor_files(training_dir)
    if len(files) == 0:
        return

    logger.info('Clearing %d monitor files from previous run (because force=True was provided)', len(files))
    for file in files:
        os.unlink(file)
def __init__(self, env, path=None, metadata=None, enabled=True, base_path=None):
    modes = env.metadata.get('render.modes', [])
    self._async = env.metadata.get('semantics.async')
    self.enabled = enabled

    # Don't bother setting anything else if not enabled
    if not self.enabled:
        return

    self.ansi_mode = False
    if 'rgb_array' not in modes:
        if 'ansi' in modes:
            self.ansi_mode = True
        else:
            logger.info('Disabling video recorder because {} neither supports video mode "rgb_array" nor "ansi".'.format(env))
            # Whoops, turns out we shouldn't be enabled after all
            self.enabled = False
            return

    if path is not None and base_path is not None:
        raise error.Error("You can pass at most one of `path` or `base_path`.")

    self.last_frame = None
    self.env = env

    required_ext = '.json' if self.ansi_mode else '.mp4'
    if path is None:
        if base_path is not None:
            # Base path given, append ext
            path = base_path + required_ext
        else:
            # Otherwise, just generate a unique filename
            with tempfile.NamedTemporaryFile(suffix=required_ext, delete=False) as f:
                path = f.name
    self.path = path

    path_base, actual_ext = os.path.splitext(self.path)

    if actual_ext != required_ext:
        hint = " HINT: The environment is text-only, therefore we're recording its text output in a structured JSON format." if self.ansi_mode else ''
        raise error.Error("Invalid path given: {} -- must have file extension {}.{}".format(self.path, required_ext, hint))

    # Touch the file in any case, so we know it's present. (This corrects
    # for platform differences: using ffmpeg on OS X, the file is
    # precreated, but not on Linux.)
    touch(path)

    self.frames_per_sec = env.metadata.get('video.frames_per_second', 30)
    self.encoder = None  # lazily start the process
    self.broken = False

    # Dump metadata
    self.metadata = metadata or {}
    self.metadata['content_type'] = 'video/vnd.openai.ansivid' if self.ansi_mode else 'video/mp4'
    self.metadata_path = '{}.meta.json'.format(path_base)
    self.write_metadata()

    logger.info('Starting new video recorder writing to %s', self.path)
    self.empty = True
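A minimal sketch of driving the recorder above by hand (hypothetical env and path; assumes the recorder exposes capture_frame() alongside the close() shown earlier, and note the import path varies across gym versions):

import gym
from gym.wrappers.monitoring.video_recorder import VideoRecorder

env = gym.make('CartPole-v0')
recorder = VideoRecorder(env, base_path='/tmp/cartpole-demo')  # writes /tmp/cartpole-demo.mp4
env.reset()
for _ in range(100):
    env.step(env.action_space.sample())
    recorder.capture_frame()
recorder.close()  # flush the encoder; see close() above for the leak caveat
env.close()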
def make(self, id):
    logger.info('Making new env: %s', id)
    spec = self.spec(id)
    env = spec.make()
    if (env.spec.timestep_limit is not None) and not spec.tags.get('vnc'):
        from gym.wrappers.time_limit import TimeLimit
        env = TimeLimit(env,
                        max_episode_steps=env.spec.max_episode_steps,
                        max_episode_seconds=env.spec.max_episode_seconds)
    return env
def close(self): """Flush all monitor data to disk and close any open rending windows.""" if not self.enabled: return self.stats_recorder.close() if self.video_recorder is not None: self._close_video_recorder() self._flush(force=True) # Stop tracking this for autoclose monitor_closer.unregister(self._monitor_id) self.enabled = False logger.info('''Finished writing results. You can upload them to the scoreboard via gym.upload(%r)''', self.directory)
def make(self, id):
    logger.info('Making new env: %s', id)
    spec = self.spec(id)
    env = spec.make()
    # We used to have people override _reset/_step rather than
    # reset/step. Set _gym_disable_underscore_compat = True on
    # your environment if you use these methods and don't want
    # compatibility code to be invoked.
    if hasattr(env, "_reset") and hasattr(env, "_step") and not getattr(env, "_gym_disable_underscore_compat", False):
        patch_deprecated_methods(env)
    if (env.spec.timestep_limit is not None) and not spec.tags.get('vnc'):
        from gym.wrappers.time_limit import TimeLimit
        env = TimeLimit(env,
                        max_episode_steps=env.spec.max_episode_steps,
                        max_episode_seconds=env.spec.max_episode_seconds)
    return env
def make(self, path, **kwargs):
    if len(kwargs) > 0:
        logger.info('Making new env: %s (%s)', path, kwargs)
    else:
        logger.info('Making new env: %s', path)
    spec = self.spec(path)
    env = spec.make(**kwargs)
    # We used to have people override _reset/_step rather than
    # reset/step. Set _gym_disable_underscore_compat = True on
    # your environment if you use these methods and don't want
    # compatibility code to be invoked.
    if hasattr(env, "_reset") and hasattr(env, "_step") and not getattr(
            env, "_gym_disable_underscore_compat", False):
        patch_deprecated_methods(env)
    if (env.spec.max_episode_steps is not None) and not spec.tags.get('vnc'):
        from gym.wrappers.time_limit import TimeLimit
        env = TimeLimit(env, max_episode_steps=env.spec.max_episode_steps)
    return env
def do_episode(self, config):
    """Run a single episode and record its score.

    :param config: configuration dict; the 'VERBOSE' key controls rendering and logging
    :return: None
    """
    # Initial values
    done = False
    score_e = 0
    step_e = 0

    # Reset environment
    self.env.reset()

    # Continue while not crashed
    while not done:
        # Show on screen
        if config['VERBOSE'] > 1:
            self.env.render()

        # Act
        action = self.act()
        _, reward, done, _ = self.env.step(action)

        # Increment score and steps
        score_e += reward
        step_e += 1
        self.step += 1

    # Append score
    self.score.append(score_e)
    self.score_100.append(score_e)
    mean_score = np.mean(self.score_100)

    # Increment episode
    self.episode += 1

    if config['VERBOSE'] > 0:
        logger.info(
            f'[Episode {self.episode}] - score: {score_e:.2f}, steps: {step_e}, '
            f'100-score: {mean_score:.2f}.')
def close(self): """Flush all monitor data to disk and close any open rending windows.""" super().close() if not self.enabled: return self.stats_recorder.close() if self.video_recorder is not None: self._close_video_recorder() self._flush(force=True) # Stop tracking this for autoclose monitor_closer.unregister(self._monitor_id) self.enabled = False logger.info( """Finished writing results. You can upload them to the scoreboard via gym.upload(%r)""", self.directory, )
def sample_minibatch(self):
    """Sample a batch of transitions from memory.

    This only happens:
        - when the memory is full
        - at some intermediate memory lengths

    Otherwise, no batch is returned (None).

    :return: a batch of the whole memory
    """
    if self.memory.is_full():
        logger.info("Memory is full, switching to evaluation mode.")
        self.eval()
        transitions = self.memory.sample(len(self.memory))
        return Transition(*zip(*transitions))
    elif len(self.memory) % self.config["batch_size"] == 0:
        transitions = self.memory.sample(len(self.memory))
        return Transition(*zip(*transitions))
    else:
        return None
def learn(self):
    batch_s, batch_a, batch_r, batch_t, batch_s_ = self.buffer.sample_batch(BATCH_SIZE)
    self.optimizer.zero_grad()

    batch_s = torch.FloatTensor(batch_s).to(self.device)
    batch_a = torch.LongTensor(batch_a).to(self.device)
    batch_r = torch.FloatTensor(batch_r).to(self.device)
    batch_s_ = torch.FloatTensor(batch_s_).to(self.device)

    q_eval = self.eval_net(batch_s).gather(1, batch_a.view((-1, 1)))
    # print(f"q_eval {q_eval.shape}", )
    q_next = self.target_net(batch_s_).detach()
    # print(f"q_next {q_next.shape}", )

    # use double Q
    if self.use_double_q:
        q_action = self.eval_net(batch_s_).max(1)[1]
        # print(f"q_action {q_action.shape}")
        q_target = batch_r.view((-1, 1)) + self.gamma * \
            q_next.gather(1, q_action.view((-1, 1)))
        # print(f"batch_r {batch_r.shape}")
        # print(f"q_target {q_target.shape}")
    else:
        q_target = batch_r + self.gamma * q_next.max(1)[0]

    loss = self.loss_func(q_eval, q_target)
    self.optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_value_(self.eval_net.parameters(), 1.0)
    self.optimizer.step()

    if self.epsilon > EPSILON_FINAL:
        self.epsilon = self.epsilon * EPSILON_DECAY

    # target parameter update
    if self.learn_iterations % TARGET_UPDATE_ITER == 0:
        self.target_net.load_state_dict(self.eval_net.state_dict())
        logger.info(" == update target network")
    self.learn_iterations += 1
def __init__(self):
    self.__version__ = VERSION
    logger.set_level(logger.INFO)
    logger.info("carmunk {}".format(self.__version__))

    screen = pygame.display.set_mode((width, height))
    # Turn off alpha since we don't use it.
    screen.set_alpha(None)

    # Carmunk game object
    self.__game = _GameState(screen)

    # Define the action space: move left or right
    self.action_space = spaces.Discrete(2)

    # Observation space
    self.observation_space = spaces.Box(low=0, high=39, shape=(3,), dtype=int)
def test_env_semantics(spec):
    logger.warn("Skipping this test. Existing hashes were generated in a bad way")
    return
    with open(ROLLOUT_FILE) as data_file:
        rollout_dict = json.load(data_file)

    if spec.id not in rollout_dict:
        if not spec.nondeterministic:
            logger.warn(
                "Rollout does not exist for {}, run generate_json.py to generate rollouts for new envs"
                .format(spec.id))
        return

    logger.info("Testing rollout for {} environment...".format(spec.id))

    observations_now, actions_now, rewards_now, dones_now = generate_rollout_hash(spec)

    errors = []
    if rollout_dict[spec.id]['observations'] != observations_now:
        errors.append(
            'Observations not equal for {} -- expected {} but got {}'.format(
                spec.id, rollout_dict[spec.id]['observations'], observations_now))
    if rollout_dict[spec.id]['actions'] != actions_now:
        errors.append(
            'Actions not equal for {} -- expected {} but got {}'.format(
                spec.id, rollout_dict[spec.id]['actions'], actions_now))
    if rollout_dict[spec.id]['rewards'] != rewards_now:
        errors.append(
            'Rewards not equal for {} -- expected {} but got {}'.format(
                spec.id, rollout_dict[spec.id]['rewards'], rewards_now))
    if rollout_dict[spec.id]['dones'] != dones_now:
        errors.append(
            'Dones not equal for {} -- expected {} but got {}'.format(
                spec.id, rollout_dict[spec.id]['dones'], dones_now))
    if len(errors):
        for error in errors:
            logger.warn(error)
        raise ValueError(errors)
def step(self, action): assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action)) state = self.state # Create a prey with probability 1% if not self.prey.tolist(): px = np.float(random.randint(-.5*self.world_width, .5*self.world_width - 1)) py = np.float(random.randint(-.5*self.world_height, .5*self.world_height - 1)) while [px, py] in self.snake.blocks.tolist(): px = np.float(random.randint(-.5*self.world_width, .5*self.world_width - 1)) py = np.float(random.randint(-.5*self.world_height, .5*self.world_height - 1)) self.prey = np.array([px, py]) logger.info("[INFO] -- New Prey at {}, {} ".format(px,py)) # print(self.snake.blocks[0].tolist()) if self.snake.blocks[0].tolist() in [self.prey.tolist()]: self.snake.eat_and_move(action) self.state = np.array([self.get_state()]) self.prey = np.array([]) logger.info("[INFO] -- Manger") reward = 500. else: self.snake.move(action) reward = -.5 self.state = np.array([self.get_state()]) done = self.snake.is_dead or self.oob(*self.snake.blocks[0]) if done: logger.warn("DONE") if self.steps_beyond_done is None: self.steps_beyond_done = 0 reward = -1000 else: if self.steps_beyond_done == 0: logger.warn("You are calling 'step()' but it's already done !") self.steps_beyond_done += 1 return self.state, reward, done, {}
def _preprocess(self):
    data = pd.read_csv(Generator.dataset_path)
    message = 'Columns found in the dataset {}'.format(data.columns)
    logger.info(message)
    data = data.dropna()
    start_time_stamp = data['Timestamp'][0]
    timestamps = data['Timestamp'].apply(lambda x: (x - start_time_stamp) / 60)
    timestamps = timestamps - range(timestamps.shape[0])
    data.insert(0, 'blocks', timestamps)
    blocks = data.groupby('blocks')
    message = 'Number of blocks of continuous prices found are {}'.format(len(blocks))
    logger.info(message)
    self._data_blocks = []
    distinct_episodes = 0

    for name, indices in blocks.indices.items():
        '''
        Length of the block should exceed the history length and horizon by 1.
        Extra 1 is required to normalize each price block by previous time stamp
        '''
        if len(indices) > (self.history_length + self.horizon + 1):
            self._data_blocks.append(blocks.get_group(name))
            # similarly, we subtract an extra 1 to calculate the number of distinct episodes
            distinct_episodes = distinct_episodes + (
                len(indices) - (self.history_length + self.horizon) + 1 + 1)

    data = None
    message_list = [
        'Number of usable blocks obtained from the dataset are {}'.format(
            len(self._data_blocks))
    ]
    message_list.append(
        'Number of distinct episodes for the current configuration are {}'.format(
            distinct_episodes))
    # map() is lazy on Python 3, so iterate explicitly to make sure the messages are logged.
    for message in message_list:
        logger.info(message)
def main():
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-b', '--base-dir', default='blackjack-1', help='Set base dir.')
    parser.add_argument('-v', '--verbose', action='count', dest='verbosity', default=0, help='Set verbosity.')
    args = parser.parse_args()

    if args.verbosity == 0:
        logger.setLevel(logging.INFO)
    elif args.verbosity >= 1:
        logger.setLevel(logging.DEBUG)

    num_episodes = 100000
    epsilon_decay = 8000
    policy, Q = learn(args.base_dir, num_episodes, epsilon_decay)
    final_average_return = score(policy)
    logger.info("final average returns: {}".format(final_average_return))
    plot_policy(policy, "diag_{}_{}_{}.png".format(num_episodes, epsilon_decay, final_average_return))

    return 0
def record(self, state, action, reward, next_state, done, info):
    """
    Record a transition by performing a Fitted-Q iteration

    - push the transition into memory
    - when enough experience is acquired, sample a batch
    - perform N value iteration steps Qk -> Qk+1, ie:
        - compute the Bellman residual loss over the batch
        - minimize it through M gradient descent steps

    :param state: a state
    :param action: an action
    :param reward: a reward
    :param next_state: a next state
    :param done: whether state is terminal
    :param info: information about the environment
    """
    if not self.training:
        return
    # Store transition to memory
    self.memory.push(state, action, reward, next_state, done, info)
    batch = self.sample_minibatch()
    if not batch:
        return
    batch = self._add_constraint_penalty(batch)

    # Optimize model on batch
    value_iteration_epochs = self.config["value_iteration_epochs"] or int(3 / (1 - self.config["gamma"]))
    self.initialize_model()
    for epoch in range(value_iteration_epochs):
        self.update_target_network()
        delta, target = self.compute_bellman_residual(batch)
        self.initialize_model()
        logger.info("Bellman residual at iteration {} on batch {} is {}".format(
            epoch, len(batch.reward), delta))
        for _ in range(self.config["regression_epochs"]):
            loss, _ = self.compute_bellman_residual(batch, target)
            self.step_optimizer(loss)
def move(self, a):
    """Move the snake one step in direction 'a'."""
    assert a in [0, 1, 2]
    if not self.is_dead:
        diff = np.array(self.blocks[0] - self.old_head)
        if a == 0:
            dxdy = diff
        elif a == 1:
            dxdy = np.array(rotate(*diff, .5*math.pi))
        else:
            dxdy = np.array(rotate(*diff, -.5*math.pi))
        self.blocks = np.roll(self.blocks, 2)
        self.blocks[0] = self.blocks[1] + dxdy
        self.old_head = self.blocks[1]
    else:
        return
    if self.blocks[0].tolist() in self.blocks[1:].tolist():
        self.is_dead = True
    logger.info("[INFO] -- Head moved to {}".format(self.blocks[0]))
def step(self, action):
    err_msg = "%r (%s) invalid" % (action, type(action))
    assert self.action_space.contains(action), err_msg

    logger.info("\n\n#####action: %s", action)
    logger.info("step: %s", self.number_of_steps)
    logger.info("window bounds: %s", (self.window_x_bounds, self.window_y_bounds))
    logger.info("ball center coordinate: %s", (self.ball_center_x, self.ball_center_y))
    logger.info("velocity: %s", (self.velocity_x, self.velocity_y))

    self.number_of_steps += 1

    self.ball_center_x += int(self.velocity_x * self.dt)
    self.ball_center_y += int(self.velocity_y * self.dt + 1 / 2 * self.gravity * self.dt**2)
    self.velocity_y = self.velocity_y + self.gravity * self.dt

    self.check_wall_collision_and_update_state()
    reward = self.apply_force(action)
    self.state = (self.ball_center_x, self.ball_center_y, self.ball_radius)

    return np.array(self.state), reward, False, {}
def add_new_rollouts(spec_ids, overwrite):
    environments = [spec for spec in envs.registry.all() if spec.entry_point is not None]
    if spec_ids:
        environments = [spec for spec in environments if spec.id in spec_ids]
        assert len(environments) == len(spec_ids), "Some specs not found"
    with open(ROLLOUT_FILE) as data_file:
        rollout_dict = json.load(data_file)
    modified = False
    for spec in environments:
        if not overwrite and spec.id in rollout_dict:
            logger.debug("Rollout already exists for {}. Skipping.".format(spec.id))
        else:
            modified = update_rollout_dict(spec, rollout_dict) or modified

    if modified:
        logger.info("Writing new rollout file to {}".format(ROLLOUT_FILE))
        with open(ROLLOUT_FILE, "w") as outfile:
            json.dump(rollout_dict, outfile, indent=2, sort_keys=True)
    else:
        logger.info("No modifications needed.")
def close(self): """Flush all data to disk and close any open frame encoders.""" if not self.enabled or self._closed: return if self.encoder: logger.debug("Closing video encoder: path=%s", self.path) self.encoder.close() self.encoder = None else: # No frames captured. Set metadata, and remove the empty output file. os.remove(self.path) if self.metadata is None: self.metadata = {} self.metadata["empty"] = True # If broken, get rid of the output file, otherwise we'd leak it. if self.broken: logger.info( "Cleaning up paths for broken video recorder: path=%s metadata_path=%s", self.path, self.metadata_path, ) # Might have crashed before even starting the output file, don't try to remove in that case. if os.path.exists(self.path): os.remove(self.path) if self.metadata is None: self.metadata = {} self.metadata["broken"] = True self.write_metadata() # Stop tracking this for autoclose self._closed = True
def make(self, path: str, **kwargs) -> Env:
    if len(kwargs) > 0:
        logger.info("Making new env: %s (%s)", path, kwargs)
    else:
        logger.info("Making new env: %s", path)

    # We need to manually parse the ID so we can check
    # the version without error-ing out in self.spec
    namespace, name, version = parse_env_id(path)

    # Get all versions of this spec.
    versions = self.env_specs.versions(namespace, name)

    # We check what the latest version of the environment is and display
    # a warning if the user is attempting to initialize an older version
    # or an unversioned one.
    latest_versioned_spec = max(
        filter(lambda spec: spec.version, versions),
        key=lambda spec: cast(int, spec.version),
        default=None,
    )
    if (latest_versioned_spec
            and version is not None
            and version < cast(int, latest_versioned_spec.version)):
        logger.warn(
            f"The environment {path} is out of date. You should consider "
            f"upgrading to version `v{latest_versioned_spec.version}` "
            f"with the environment ID `{latest_versioned_spec.id}`.")
    elif latest_versioned_spec and version is None:
        logger.warn(
            f"Using the latest versioned environment `{latest_versioned_spec.id}` "
            f"instead of the unversioned environment `{path}`")
        path = latest_versioned_spec.id

    # Lookup our path
    spec = self.spec(path)
    # Construct the environment
    return spec.make(**kwargs)
def check_wall_collision_and_update_state(self):
    collision_with_ground = self.ball_center_y <= self.window_y_bounds[0]
    collision_with_left_wall = self.ball_center_x <= self.window_x_bounds[0]
    collision_with_right_wall = self.ball_center_x >= self.window_x_bounds[1]

    if collision_with_ground:
        logger.info("collision with ground")
        logger.info("initial velocity: %s", (self.velocity_x, self.velocity_y))
        self.velocity_y = -self.velocity_y * self.damping_factor
        self.ball_center_y = int(self.window_y_bounds[0])
        logger.info("final velocity: %s", (self.velocity_x, self.velocity_y))
    elif collision_with_left_wall:
        logger.info("collision with left wall")
        self.velocity_x = -self.velocity_x * self.damping_factor
        self.ball_center_x = int(self.window_x_bounds[0])
    elif collision_with_right_wall:
        logger.info("collision with right wall")
        self.velocity_x = -self.velocity_x * self.damping_factor
        self.ball_center_x = int(self.window_x_bounds[1])
def compare_experiments(experiments: dict, full_memory=True):
    """Train, test, and compare deep Q-network experiments for differential robot control.

    Each experiment learns to control the robot in the PathFollower environment,
    where the actions are the forward and rotational velocity.
    """
    logger.info('Train new experiments')
    for name, experiment in experiments.items():
        experiment.train(render=False, full_memory=full_memory)
        experiment.target_network.model.save(f'model_{name}.h5')

    rewards = [experiment.rewards_train for experiment in experiments.values()]
    names = list(experiments.keys())
    plot_rewards(rewards, names, tag='Training')

    smooth_rewards = []
    for reward in rewards:
        smooth_rewards.append(list(pd.Series(reward).rolling(100).mean()))
    plot_rewards(smooth_rewards, names, tag='Training rolling mean')

    test_rewards = list()
    mean_test_rewards = list()
    for name, experiment in experiments.items():
        reward = experiment.test(render=False)
        test_rewards.append(reward)
        mean_test_rewards.append(np.mean(reward))
    plot_rewards(test_rewards, names, tag='Test')
    print(list(zip(names, mean_test_rewards)))

    smooth_rewards_test = []
    for reward in test_rewards:
        smooth_rewards_test.append(list(pd.Series(reward).rolling(5).mean()))
    plot_rewards(smooth_rewards_test, names, tag='Test rolling mean')
def update_rollout_dict(spec, rollout_dict):
    """
    Takes as input the environment spec for which the rollout is to be generated,
    and the existing dictionary of rollouts. Returns True iff the dictionary was modified.
    """
    # Skip platform-dependent
    if should_skip_env_spec_for_tests(spec):
        logger.info("Skipping tests for {}".format(spec.id))
        return False

    # Skip environments that are nondeterministic
    if spec.nondeterministic:
        logger.info("Skipping tests for nondeterministic env {}".format(spec.id))
        return False

    logger.info("Generating rollout for {}".format(spec.id))

    try:
        (
            observations_hash,
            actions_hash,
            rewards_hash,
            dones_hash,
        ) = generate_rollout_hash(spec)
    except:
        # If running the env generates an exception, don't write to the rollout file
        logger.warn(
            "Exception {} thrown while generating rollout for {}. Rollout not added."
            .format(sys.exc_info()[0], spec.id))
        return False

    rollout = {}
    rollout["observations"] = observations_hash
    rollout["actions"] = actions_hash
    rollout["rewards"] = rewards_hash
    rollout["dones"] = dones_hash

    existing = rollout_dict.get(spec.id)
    if existing:
        differs = False
        for key, new_hash in rollout.items():
            differs = differs or existing[key] != new_hash
        if not differs:
            logger.debug("Hashes match with existing for {}".format(spec.id))
            return False
        else:
            logger.warn("Got new hash for {}. Overwriting.".format(spec.id))

    rollout_dict[spec.id] = rollout
    return True
def __init__(self, capacity: int, alpha: float) -> None:
    self._type = ReplayBufferTypes.Prioritized
    self.capacity = 1
    while self.capacity < capacity:
        self.capacity = self.capacity << 1
    self._buffer = []
    self._idx = 0
    self._sum_tree = SumSegmentTree(self.capacity)
    self._min_tree = MinSegmentTree(self.capacity)
    self._max_priority = 1.0
    self._alpha = alpha
    logger.info("prioritized replay buffer init")
    logger.info("\t alpha is %f", self._alpha)
    logger.info("\t capacity is %d", self.capacity)
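The constructor above rounds the requested capacity up to the next power of two, presumably because the sum/min segment trees expect a power-of-two size. A small standalone illustration of that rounding (hypothetical helper name, same doubling loop as in __init__):

def next_power_of_two(n):
    # Double until we reach or exceed the requested size.
    c = 1
    while c < n:
        c <<= 1
    return c

assert next_power_of_two(1000) == 1024
assert next_power_of_two(1024) == 1024
assert next_power_of_two(1) == 1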
def run_test(env: gym.Env, agents: Tuple[Agent, Agent], epoch: int) -> List[str]:
    """
    Run N matches as test

    :param env: The gym.Env
    :param agents: The two agents (Defender, Attacker)
    :param epoch: The current training epoch the agents are at
    """
    logger.info(f"Starting {SETTINGS.TEST_MATCHES} test match(es)")
    logger.info(
        f"video recording is {'enabled' if SETTINGS.RECORD_TEST_MATCHES else 'disabled'}"
    )
    test_env = env if not SETTINGS.RECORD_TEST_MATCHES else Monitor(
        env=env,
        directory=os.path.join(videos_dir, str(epoch)),
        video_callable=lambda episode_id: (episode_id + 1) % SETTINGS.TEST_MATCHES_RECORD_INTERVAL == 0)
    winners = []
    moves_len = []
    for ep in range(SETTINGS.TEST_MATCHES):
        moves = []
        obs = test_env.reset()
        if SETTINGS.RENDER_TEST_MATCHES:
            test_env.render()
        curr_agent = 0
        while True:
            action, _ = agents[curr_agent].choose_action(
                obs,
                test_env.env.action_space if SETTINGS.RECORD_TEST_MATCHES else env.action_space)
            moves.append(test_env.env.actions[action] if SETTINGS.RECORD_TEST_MATCHES else env.actions[action])
            obs, _, done, info = test_env.step(action)
            if SETTINGS.RENDER_TEST_MATCHES:
                test_env.render()
            captures = info.get('captured')
            if len(captures) > 0:
                moves[-1] += 'x' + 'x'.join(captures)
            if done:
                write_match_infos(info, moves, f'match_{epoch}_{ep}')
                winners.append(info.get('winner', None))
                moves_len.append(len(moves))
                break
            curr_agent = 0 if curr_agent == 1 else 1
    test_env.close()
    update_summary(winners, moves_len, epoch)
    logger.info('Test match(es) completed and results saved')
    return winners
def train(self, env, episodes):
    max_score = -514229
    total_step = 0
    for eps in range(self.cur_episode, episodes):
        state = env.reset()
        score = 0
        done = False
        while not done:
            if total_step < MIN_STEP_TO_TRAIN:
                action = env.action_space.sample()
            else:
                action = self.act(state)
            state_, reward, done, _ = env.step(action)
            total_step += 1
            score += reward
            reward = check_reward(self.env_name, state, action, reward, state_, done)
            self.buffer.add(state, action, reward, done, state_)
            if self.buffer.size > MIN_STEP_TO_TRAIN:
                self.learn()
            state = state_
        max_score = score if score > max_score else max_score
        self.score_history.append(score)
        logger.info(
            f" == episode: {eps+1:05d} | total step: {total_step:7d} | score: {score:8.2f} | max score: {max_score:8.2f}"
        )
        if (eps + 1) % 100 == 0:
            ckpt_name = os.path.join(self.ckpt_save_path, f"ckpt_{eps}.pth")
            self.save_model(ckpt_name, eps)
            logger.info(f" == model {ckpt_name} saved")
    ckpt_name = os.path.join(self.ckpt_save_path, "ckpt_final.pth")
    self.save_model(ckpt_name, eps)
    logger.info(f" == model {ckpt_name} saved")
    figure_name = os.path.join(self.ckpt_save_path, f"{self.agent_name}.png")
    plot_figure(figure_name, self.score_history)
def build_dataset(matches: MatchesCollection, epoch: int):
    """
    Build the dataset for both Attacker and Defender and at the various last-moves windows

    :param matches: The matches as a collection
    :param epoch: The current epoch for training
    """
    logger.info('Starting building dataset...')
    for player in ['ATK', 'DEF']:
        logger.info(f'Building dataset for {player}...')
        ms = matches.matches.get(player)
        for lm in LAST_MOVES:
            if lm <= matches.shortest_match(player):
                samples = np.empty(shape=(matches.n_matches * lm, SHAPE_STATE[0],
                                          SHAPE_STATE[1], SHAPE_STATE[2]),
                                   dtype=np.float)
                labels = np.empty(shape=(matches.n_matches * lm))
                with_limit = True
            else:
                n_samples = 0
                for m in ms:
                    n_samples += len(m[0])
                samples = np.empty(shape=(n_samples, SHAPE_STATE[0],
                                          SHAPE_STATE[1], SHAPE_STATE[2]),
                                   dtype=np.float)
                labels = np.empty(shape=n_samples)
                with_limit = False
            i = 0
            for m in ms:
                for s, l in zip(m[0][-(lm if with_limit else 0):],
                                m[1][-(lm if with_limit else 0):]):
                    samples[i] = s
                    labels[i] = l
                    i += 1
            dataset = TablutDataset(
                samples=samples,
                labels=labels,
                name=f"{player}_{epoch}_{matches.n_matches}_{(lm if with_limit else 'full')}",
                transform=transforms.Compose([transforms.ToTensor()]))
            save_dataset(dataset, datasets_dir)
    logger.info('All datasets built')
def train(self, env, episodes):
    max_score = -514229
    total_step = 0
    for eps in range(self.cur_episode, episodes):
        state = env.reset()
        score = 0
        done = False
        episode_step = 0
        while not done:
            action = self.predict(state)
            state_, reward, done, _ = env.step(action)
            episode_step += 1
            total_step += 1
            score += reward
            reward = check_reward(self.env_name, state, action, reward, state_, done)
            self.store_rewards(reward)
            state = state_
        self.score_history.append(score)
        max_score = score if score > max_score else max_score
        if score > -1.0 * episode_step:
            self.learn()
            logger.info(
                f" == episode: {eps+1}, score: {score}, max score: {max_score}"
            )
        else:
            self.clear_memory()
        if (eps + 1) % 100 == 0:
            ckpt_name = os.path.join(self.ckpt_save_path, f"ckpt_{eps}.pth")
            self.save_model(ckpt_name, eps)
            logger.info(f" == model {ckpt_name} saved")
    ckpt_name = os.path.join(self.ckpt_save_path, "ckpt_final.pth")
    self.save_model(ckpt_name, eps)
    logger.info(f" == model {ckpt_name} saved")
    figure_name = os.path.join(self.ckpt_save_path, f"{self.agent_name}.png")
    plot_figure(figure_name, self.score_history)
def get_transactions():
    if not Generator.dataset_path:
        Generator.set_dataset_path()
    message = 'Getting latest transactions from {}.'.format(URL) + \
        '\nThis might take a few minutes depending upon your internet speed.'
    logger.info(message)
    path = os.path.join(Generator.temp_dir, 'coinbaseUSD.csv.gz')
    f = urllib2.urlopen(URL)
    with open(path, 'w') as buffer:
        buffer.write(f.read())
    message = 'Latest transactions saved to {}'.format(path)
    logger.info(message)

    # Read the transactions into pandas dataframe
    with gzip.open(path, 'r') as f:
        d = pd.read_table(f, sep=',', header=None, index_col=0, names=['price', 'volume'])
    os.remove(path)
    d.index = d.index.map(lambda ts: datetime.datetime.fromtimestamp(int(ts)))
    d.index.names = ['DateTime_UTC']
    p = pd.DataFrame(d['price'].resample('1Min').ohlc())
    p.columns = ['price_open', 'price_high', 'price_low', 'price_close']
    v = pd.DataFrame(d['volume'].resample('1Min').sum())
    v.columns = ['volume']
    p['volume'] = v['volume']
    unix_timestamps = p.index.map(lambda ts: int(time.mktime(ts.timetuple())))
    p.insert(0, 'Timestamp', unix_timestamps)
    p.to_csv(Generator.dataset_path, sep=',')
    message = 'Dataset sampled and saved to {}'.format(Generator.dataset_path)
    logger.info(message)
def train(self, env, episodes):
    max_score = -514229
    total_step = 0
    for eps in range(self.cur_episode, episodes):
        state = env.reset()
        score = 0
        done = False
        while not done:
            action = self.predict(state)
            state_, reward, done, _ = env.step(action)
            total_step += 1
            score += reward
            reward = check_reward(self.env_name, state, action, reward, state_, done)
            self.buffer.add(state, action, reward, done, state_)
            if self.buffer.size > INIT_REPLAY_SIZE:
                self.learn()
            elif self.buffer.size % 500 == 0:
                print(f' == populate the replay buffer ... ... ')
            state = state_
        max_score = score if score > max_score else max_score
        self.score_history.append(score)
        logger.info(
            f" == episode: {eps+1}, total step: {total_step}, score: {score}, max score: {max_score}"
        )
        if (eps + 1) % 100 == 0:
            ckpt_name = os.path.join(self.ckpt_save_path, f"ckpt_{eps}.pth")
            self.save_model(ckpt_name, eps)
            logger.info(f" == model {ckpt_name} saved")
    ckpt_name = os.path.join(self.ckpt_save_path, "ckpt_final.pth")
    self.save_model(ckpt_name, eps)
    logger.info(f" == model {ckpt_name} saved")
    figure_name = os.path.join(self.ckpt_save_path, f"{self.agent_name}.png")
    plot_figure(figure_name, self.score_history)
observation = env.reset()
for frame_idx in range(1, config.MAX_FRAMES + 1):
    epsilon = config.epsilon_by_frame(frame_idx)

    action = model.get_action(observation, epsilon)
    prev_observation = observation
    observation, reward, done, _ = env.step(action)
    #observation = None if done else observation

    episode_reward += reward

    model.update(prev_observation, action, reward, observation, frame_idx)

    if done:
        logger.info('Finished episode at frame {} with a reward of {}.'.format(frame_idx, episode_reward))
        model.finish_nstep()
        model.reset_hx()
        observation = env.reset()
        model.save_reward(episode_reward)
        episode_reward = 0

    if frame_idx % 10 == 0:
        print('')
        print('')
        print('FRAME_IDX: {}'.format(frame_idx))
        print('')
        print('')

model.save_w()
plot_all_data(log_dir, env_id, 'DeepDip', config.MAX_FRAMES,
              bin_size=(10, 100, 100, 1), smooth=1,
              time=timedelta(seconds=int(timer()-start)),
              save_filename='./results.png', ipynb=False)
def _start(self, directory, force=False, resume=False,
           write_upon_reset=False, uid=None, mode=None):
    """Start monitoring.

    Args:
        directory (str): A per-training run directory where to record stats.
        video_flg (Optional[function, False]): function that takes in the index of the episode and outputs a boolean, indicating whether we should record a video on this episode. The default (for video_flg is None) is to take perfect cubes, capped at 1000. False disables video recording.
        force (bool): Clear out existing training data from this directory (by deleting every file prefixed with "openaigym.").
        resume (bool): Retain the training data already in this directory, which will be merged with our new data
        write_upon_reset (bool): Write the manifest file on each reset. (This is currently a JSON file, so writing it is somewhat expensive.)
        uid (Optional[str]): A unique id used as part of the suffix for the file. By default, uses os.getpid().
        mode (['evaluation', 'training']): Whether this is an evaluation or training episode.
    """
    if self.env.spec is None:
        logger.warn(
            "Trying to monitor an environment which has no 'spec' set. This usually means you did not create it via 'gym.make', and is recommended only for advanced users."
        )
        env_id = '(unknown)'
    else:
        env_id = self.env.spec.id

    if not os.path.exists(directory):
        logger.info('Creating monitor directory %s', directory)
        if six.PY3:
            os.makedirs(directory, exist_ok=True)
        else:
            os.makedirs(directory)

    self.video_flg = False

    # Check on whether we need to clear anything
    if force:
        clear_monitor_files(directory)
    elif not resume:
        training_manifests = detect_training_manifests(directory)
        if len(training_manifests) > 0:
            raise error.Error(
                '''Trying to write to monitor directory {} with existing monitor files: {}. You should use a unique directory for each training run, or use 'force=True' to automatically clear previous monitor files.'''
                .format(directory, ', '.join(training_manifests[:5])))

    self._monitor_id = monitor_closer.register(self)

    self.enabled = True
    self.directory = os.path.abspath(directory)
    # We use the 'openai-gym' prefix to determine if a file is
    # ours
    self.file_prefix = FILE_PREFIX
    self.file_infix = '{}.{}'.format(self._monitor_id, uid if uid else os.getpid())

    self.stats_recorder = stats_recorder.StatsRecorder(
        directory,
        '{}.episode_batch.{}'.format(self.file_prefix, self.file_infix),
        autoreset=self.env_semantics_autoreset,
        env_id=env_id)

    if not os.path.exists(directory):
        os.mkdir(directory)
    self.write_upon_reset = write_upon_reset

    if mode is not None:
        self._set_mode(mode)
def set_monitor_mode(self, mode):
    logger.info("Setting the monitor mode is deprecated and will be removed soon")
    self._set_mode(mode)
def set_monitor_mode(self, mode): logger.info("Setting the monitor mode is deprecated and will be removed soon") self._set_mode(mode)
info['env_id'] = env.spec.id

# ------------------------------------------

def noisy_evaluation(theta):
    agent = BinaryActionLinearPolicy(theta)
    rew, T = do_rollout(agent, env, num_steps)
    return rew

# Train the agent, and snapshot each stage
for (i, iterdata) in enumerate(
        cem(noisy_evaluation, np.zeros(env.observation_space.shape[0] + 1), **params)):
    print('Iteration %2i. Episode mean reward: %7.3f' % (i, iterdata['y_mean']))
    agent = BinaryActionLinearPolicy(iterdata['theta_mean'])
    if args.display:
        do_rollout(agent, env, 200, render=True)
    writefile('agent-%.4i.pkl' % i, str(pickle.dumps(agent, -1)))

# Write out the env at the end so we store the parameters of this
# environment.
writefile('info.json', json.dumps(info))

env.close()

logger.info(
    "Successfully ran cross-entropy method. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results."
)
gym.upload(outdir)
# directory, including one with existing data -- all monitor files
# will be namespaced). You can also dump to a tempdir if you'd
# like: tempfile.mkdtemp().
outdir = '/tmp/random-agent-results'
env = wrappers.Monitor(env, directory=outdir, force=True)
env.seed(0)
agent = RandomAgent(env.action_space)

episode_count = 100
reward = 0
done = False

for i in range(episode_count):
    ob = env.reset()
    while True:
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        if done:
            break
        # Note there's no env.render() here. But the environment still can open window and
        # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
        # Video is not recorded every episode, see capped_cubic_video_schedule for details.

# Close the env and write monitor result info to disk
env.close()

# Upload to the scoreboard. We could also do this from another
# process if we wanted.
logger.info("Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
gym.upload(outdir)
# ----------------------------------------

def writefile(fname, s):
    with open(path.join(outdir, fname), 'w') as fh:
        fh.write(s)

info = {}
info['params'] = params
info['argv'] = sys.argv
info['env_id'] = env.spec.id

# ------------------------------------------

def noisy_evaluation(theta):
    agent = BinaryActionLinearPolicy(theta)
    rew, T = do_rollout(agent, env, num_steps)
    return rew

# Train the agent, and snapshot each stage
for (i, iterdata) in enumerate(
        cem(noisy_evaluation, np.zeros(env.observation_space.shape[0]+1), **params)):
    print('Iteration %2i. Episode mean reward: %7.3f' % (i, iterdata['y_mean']))
    agent = BinaryActionLinearPolicy(iterdata['theta_mean'])
    if args.display:
        do_rollout(agent, env, 200, render=True)
    writefile('agent-%.4i.pkl' % i, str(pickle.dumps(agent, -1)))

# Write out the env at the end so we store the parameters of this
# environment.
writefile('info.json', json.dumps(info))

env.close()

logger.info("Successfully ran cross-entropy method. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
gym.upload(outdir)