class Learner(object):
    def __init__(self, args):
        """ Construct a Learner from parsed arguments
        """
        self.total_timesteps = 0
        self.total_episodes = 0

        self._datetime = datetime.datetime.now()

        self._render = args.render
        self._learn_loops = args.loops
        self._learn_freq = args.erfreq
        self._offpolicy_noise = args.offpolicy_noise
        self._temp = float(args.temp)
        self._retro = args.retro
        self._filter_actions = args.filter_actions

        # Make environment
        if args.retro:
            import retro
            self._env = retro.make(game=args.env)
        else:
            self._env = gym.make(args.env)

        # Observations
        self._discrete_obs = isinstance(self._env.observation_space,
                                        gym.spaces.Discrete)

        if self._discrete_obs:
            self._state_vars = self._env.observation_space.n  # Prepare for one-hot encoding
        else:
            self._state_vars = int(
                np.product(self._env.observation_space.shape))

        # Primitive actions
        aspace = self._env.action_space

        if isinstance(aspace, gym.spaces.Tuple):
            aspace = aspace.spaces
        else:
            aspace = [aspace]  # Ensure that the action space is a list for all the environments

        if isinstance(aspace[0], gym.spaces.Discrete):
            # Discrete actions
            self._num_actions = int(np.prod([a.n for a in aspace]))
        elif isinstance(aspace[0], gym.spaces.MultiBinary):
            # Retro actions are binary vectors of pressed buttons. Quick HACK,
            # only press one button at a time
            self._num_actions = int(np.prod([a.n for a in aspace]))
        else:
            # Continuous actions
            raise NotImplementedError('Continuous actions are not supported')

        self._aspace = aspace

        # BDPI algorithm instance
        self._bdpi = BDPI(self._state_vars, self._num_actions, args, None)

        # Summary
        print('Number of primitive actions:', self._num_actions)
        print('Number of state variables', self._state_vars)

    def loadstore(self, filename, load=True):
        """ Load or store weights from/to a file
        """
        self._bdpi.loadstore(filename, load)

    def encode_state(self, state):
        """ Encode a raw state from Gym to a Numpy vector
        """
        if self._discrete_obs:
            # One-hot encode discrete variables
            rs = np.zeros(shape=(self._state_vars, ), dtype=np.float32)
            rs[state] = 1.0
        elif isinstance(state, np.ndarray):
            rs = state.flatten().astype(np.float32)
        else:
            rs = np.array(state, dtype=np.float32)

        return rs

    def reset(self, last_reward):
        self._last_experience = None
        self._first_experience = None

        self._bdpi.reset(last_reward)
        self.total_episodes += 1

    def save_episode(self, name):
        states = []
        actions = []
        rewards = []
        entropies = []

        e = self._first_experience

        while e:
            states.append(e.state())
            actions.append(e.action)
            rewards.append(e.reward)
            entropies.append(e.entropy)

            e = e.next_experience

        s = pickle.dumps((states, actions, rewards, entropies))
        s = lzo.compress(s)

        f = open(name + '.episode', 'wb')
        f.write(s)
        f.close()

    def execute(self, env_state, f_probs, f_probs_n, f_actions, f_Q_values):
        """ Execute one episode in the environment.
        """
        done = False
        cumulative_reward = 0.0
        seen_reward = 0.0
        i = 0
        show_actions = random.random() < 0.05

        while (not done) and (i < 300):
            # Select an action based on the current state
            self.total_timesteps += 1

            old_env_state = env_state
            state = self.encode_state(env_state)

            if self._filter_actions:
                possible_actions = label_decoder_reverse(
                    self._env._get_not_empty_tracks(0))
            else:
                possible_actions = list(range(self._num_actions))

            action, experience = self._bdpi.select_action(
                state, env_state, possible_actions, f_probs, f_probs_n,
                f_Q_values)

            # Change the action if off-policy noise is to be used
            if self._offpolicy_noise and random.random() < self._temp:
                if self._filter_actions:
                    possible_actions = label_decoder_reverse(
                        self._env._get_not_empty_tracks(0))

                    if 52 in possible_actions:
                        r = random.randrange(4)

                        if r == 0:
                            action = 52
                        else:
                            ind = random.randrange(len(possible_actions))
                            experience.action = possible_actions[ind]
                else:
                    action = random.randrange(self._num_actions)
                    experience.action = action

            # Manage the experience chain
            if self._first_experience is None:
                self._first_experience = experience

            if self._last_experience is not None:
                self._last_experience.next_experience = experience

            self._last_experience = experience

            # Execute the action
            if len(self._aspace) > 1:
                # Choose each of the factored action depending on the composite action
                actions = [0] * len(self._aspace)

                for j in range(len(actions)):
                    actions[j] = action % self._aspace[j].n
                    action //= self._aspace[j].n

                env_state, reward, done, __ = self._env.step(actions)
            else:
                # Simple scalar action
                if self._retro:
                    # Binary action
                    a = np.zeros((self._num_actions, ), dtype=np.int8)
                    a[action] = 1
                    action = a

                env_state, reward, done, __ = self._env.step(action)

            if show_actions:
                if i == 0:
                    print('start', file=f_actions)

                if action != 52:
                    print(label_decoder(action).start_track, 'to',
                          label_decoder(action).end_track, file=f_actions)
                else:
                    print('wait', file=f_actions)

                if reward > 3:
                    print('solved', file=f_actions)

                if done:
                    print('done', file=f_actions)

            i += 1
            public_reward = reward

            # Render the environment if needed
            if self._render > 0 and self.total_episodes >= self._render:
                self._env.render()

            # Add the reward of the action
            experience.reward = reward

            cumulative_reward += public_reward
            seen_reward += experience.reward

            # Learn from the experience buffer
            if self._learn_freq == 0:
                do_learn = done
            else:
                do_learn = (self.total_timesteps % self._learn_freq == 0)

            if do_learn:
                s = datetime.datetime.now()
                d = (s - self._datetime).total_seconds()
                #print('Start Learning, in-between is %.3f seconds...' % d)

                count = self._bdpi.train(f_Q_values)

                ns = datetime.datetime.now()
                d = (ns - s).total_seconds()

                ## try:
                ##     print('Learned %i steps in %.3f seconds, %.2f timesteps per second' % (count, d, count / d))
                ##     print('S', count / d, file=sys.stderr)
                ## except ZeroDivisionError:
                ##     pass

                sys.stderr.flush()
                sys.stdout.flush()

                self._datetime = ns

        return (env_state, cumulative_reward, seen_reward, done, i)
class Learner(object):
    def __init__(self, args, task):
        """ Construct a Learner from parsed arguments
        """
        self.total_timesteps = 0
        self.total_episodes = 0

        self._datetime = datetime.datetime.now()

        self._async_actor = args.async_actor
        self._render = args.render
        self._learn_loops = args.loops
        self._learn_freq = args.erfreq
        self._atari = args.atari
        self._retro = args.retro
        self._offpolicy_noise = args.offpolicy_noise
        self._temp = float(args.temp.split('_')[0])
        self._task = task

        # Make environment
        self._env = gym.make('RPiLEDEnv-v0',
                             resizeCamImagePct=50,
                             ledHSVLower=np.array([0, 0, 252]),
                             ledHSVHigher=np.array([31, 9, 255]),
                             rPiIP='192.168.0.183',
                             rPiPort=50000,
                             episodeLength=100,
                             bullseye=10)

        # callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-20, verbose=1)
        #
        # eval_callback = EvalCallback(env, best_model_save_path='./logs/best',
        #                              log_path='./logs/', eval_freq=5000,
        #                              deterministic=True, render=False, callback_on_new_best=callback_on_best)
        #
        # # Added checkpoint because I lost model data after a crash when the webcam shutdown because the screen went to sleep :(
        # checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/',
        #                                          name_prefix='ppo1_model')
        #
        # cb = CallbackList([checkpoint_callback, eval_callback])

        if isinstance(self._env.action_space, gym.spaces.Box):
            # Wrap continuous-action environments
            self._env = gym_envs.contwrapper.ContWrapper(self._env)

        # Observations
        ob = self._env.observation_space
        self._discrete_obs = isinstance(ob, gym.spaces.Discrete)

        if self._discrete_obs:
            self._state_shape = (ob.n, )  # Prepare for one-hot encoding
        else:
            self._state_shape = ob.shape

            if len(self._state_shape) > 1:
                # Fix 2D shape for PyTorch
                s = self._state_shape
                self._state_shape = (s[2], s[0], s[1])

        # Primitive actions
        aspace = self._env.action_space

        if isinstance(aspace, gym.spaces.Tuple):
            aspace = aspace.spaces
        else:
            aspace = [aspace]  # Ensure that the action space is a list for all the environments

        if isinstance(aspace[0], gym.spaces.Discrete):
            # Discrete actions
            self._num_actions = int(np.prod([a.n for a in aspace]))
        elif isinstance(aspace[0], gym.spaces.MultiBinary):
            # Retro actions are binary vectors of pressed buttons. Quick HACK,
            # only press one button at a time
            self._num_actions = int(np.prod([a.n for a in aspace]))

        self._aspace = aspace

        # BDPI algorithm instance
        self._bdpi = BDPI(self._state_shape, self._num_actions, args)

        # Summary
        print('Number of primitive actions:', self._num_actions)
        print('State shape:', self._state_shape)

    def loadstore(self, filename, load=True):
        """ Load or store weights from/to a file
        """
        self._bdpi.loadstore(filename, load)

    def encode_state(self, state):
        """ Encode a raw state from Gym to a Numpy vector
        """
        if self._discrete_obs:
            # One-hot encode discrete variables
            rs = np.zeros(shape=self._state_shape, dtype=np.float32)
            rs[state] = 1.0

            return rs
        elif len(state.shape) > 1:
            # Atari, retro and other image-based are NHWC, PyTorch is NCHW
            return np.swapaxes(state, 2, 0)
        else:
            return np.asarray(state, dtype=np.float32)

    def reset(self, last_reward):
        self._last_experience = None
        self._first_experience = None

        self._bdpi.reset(last_reward)
        self.total_episodes += 1

    def save_episode(self, name):
        states = []
        actions = []
        rewards = []
        entropies = []

        e = self._first_experience
        index = self._bdpi._experiences.index(e)

        for e in list(self._bdpi._experiences)[index:]:
            states.append(e.state())
            actions.append(e.action)
            rewards.append(e.reward)
            entropies.append(e.entropy)

        with open(name + '.episode', 'wb') as f:
            f.write(
                lzo.compress(
                    pickle.dumps((states, actions, rewards, entropies))))

        with open('/tmp/' + name + '-buffer.picklez', 'wb') as f:
            f.write(lzo.compress(pickle.dumps(list(self._bdpi._experiences))))

    def execute(self, env_state):
        """ Execute one episode in the environment.
        """
        done = False
        cumulative_reward = 0.0
        i = 0

        while (not done) and (i < 108000):
            # Select an action based on the current state
            self.total_timesteps += 1

            old_env_state = env_state
            state = self.encode_state(env_state)
            action, experience = self._bdpi.select_action(state)

            # Change the action if off-policy noise is to be used
            if self._offpolicy_noise and random.random() < self._temp:
                action = random.randrange(self._num_actions)
                experience.action = action

            # Manage the experience chain
            if self._first_experience is None:
                self._first_experience = experience

            if self._last_experience is not None:
                self._last_experience.set_next(experience)

            self._last_experience = experience

            # Execute the action
            if len(self._aspace) > 1:
                # Choose each of the factored action depending on the composite action
                actions = [0] * len(self._aspace)

                for j in range(len(actions)):
                    actions[j] = action % self._aspace[j].n
                    action //= self._aspace[j].n

                env_state, reward, done, _ = self._env.step(actions)
            else:
                # Simple scalar action
                if self._retro:
                    # Binary action
                    a = np.zeros((self._num_actions, ), dtype=np.int8)
                    a[action] = 1
                    action = a

                env_state, reward, done, _ = self._env.step(action)

            i += 1

            # Render the environment if needed
            if self._render > 0 and self.total_episodes >= self._render:
                self._env.render()

            # Use the taskfile to modify reward and done
            additional_reward, additional_done = self._task(
                old_env_state, action, env_state)

            reward += additional_reward

            if additional_done is not None:
                done = additional_done

            # Add the reward of the action
            experience.reward = reward
            cumulative_reward += reward

            # Learn from the experience buffer
            if self._learn_freq == 0:
                do_learn = done
            else:
                do_learn = (self.total_timesteps % self._learn_freq == 0)

            if do_learn and not self._async_actor:
                s = datetime.datetime.now()
                d = (s - self._datetime).total_seconds()
                print('Start Learning, in-between is %.3f seconds...' % d)

                count = self._bdpi.train()

                ns = datetime.datetime.now()
                d = (ns - s).total_seconds()

                print(
                    'Learned %i steps in %.3f seconds, %.2f timesteps per second'
                    % (count, d, count / d))

                sys.stderr.flush()
                sys.stdout.flush()

                self._datetime = ns

        return (env_state, cumulative_reward, done, i)
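# --- Illustration (not from the original code) ---
# The factored-action branch in execute() decodes a single composite integer
# into one sub-action per Tuple dimension using a mixed-radix expansion. This
# standalone helper restates that logic for clarity; `sizes` corresponds to
# [space.n for space in self._aspace].
def decode_composite_action(action, sizes):
    """Split a composite discrete action into per-dimension actions."""
    actions = []
    for n in sizes:
        actions.append(action % n)  # Index within this action dimension
        action //= n                # Move on to the next dimension
    return actions

# Example: with sizes [3, 4], composite action 7 maps to [1, 2],
# because 7 = 1 + 2 * 3.
assert decode_composite_action(7, [3, 4]) == [1, 2]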
def __init__(self, args, task):
    """ Construct a Learner from parsed arguments
    """
    self.total_timesteps = 0
    self.total_episodes = 0

    self._datetime = datetime.datetime.now()

    self._async_actor = args.async_actor
    self._render = args.render
    self._learn_loops = args.loops
    self._learn_freq = args.erfreq
    self._atari = args.atari
    self._retro = args.retro
    self._offpolicy_noise = args.offpolicy_noise
    self._temp = float(args.temp.split('_')[0])
    self._task = task

    # Make environment
    if args.retro:
        import retro
        self._env = retro.make(game=args.env)
    elif args.atari:
        self._env = make_atari(args.env)
        self._env = wrap_deepmind(self._env)
    else:
        self._env = gym.make(args.env)

    if isinstance(self._env.action_space, gym.spaces.Box):
        # Wrap continuous-action environments
        self._env = gym_envs.contwrapper.ContWrapper(self._env)

    # Observations
    ob = self._env.observation_space
    self._discrete_obs = isinstance(ob, gym.spaces.Discrete)

    if self._discrete_obs:
        self._state_shape = (ob.n, )  # Prepare for one-hot encoding
    else:
        self._state_shape = ob.shape

        if len(self._state_shape) > 1:
            # Fix 2D shape for PyTorch
            s = self._state_shape
            self._state_shape = (s[2], s[0], s[1])

    # Primitive actions
    aspace = self._env.action_space

    if isinstance(aspace, gym.spaces.Tuple):
        aspace = aspace.spaces
    else:
        aspace = [aspace]  # Ensure that the action space is a list for all the environments

    if isinstance(aspace[0], gym.spaces.Discrete):
        # Discrete actions
        self._num_actions = int(np.prod([a.n for a in aspace]))
    elif isinstance(aspace[0], gym.spaces.MultiBinary):
        # Retro actions are binary vectors of pressed buttons. Quick HACK,
        # only press one button at a time
        self._num_actions = int(np.prod([a.n for a in aspace]))

    self._aspace = aspace

    # BDPI algorithm instance
    self._bdpi = BDPI(self._state_shape, self._num_actions, args)

    # Summary
    print('Number of primitive actions:', self._num_actions)
    print('State shape:', self._state_shape)
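# --- Illustration (not from the original code) ---
# The shape handling above prepares the two encodings that the companion
# encode_state() methods in this file rely on: discrete observations become
# one-hot vectors of length ob.n, and image observations are re-ordered from
# gym's channels-last layout to a channels-first layout for PyTorch. A small
# self-contained sketch of both conversions (the 84x84x4 stacked frame is an
# assumed example, not taken from the original code):
import numpy as np

def one_hot(index, n):
    """One-hot encode a discrete observation (index in [0, n))."""
    rs = np.zeros((n, ), dtype=np.float32)
    rs[index] = 1.0
    return rs

def channels_first(image):
    """Swap a (height, width, channels) frame to (channels, width, height),
    matching the np.swapaxes(state, 2, 0) call used by encode_state()."""
    return np.swapaxes(image, 2, 0)

frame = np.zeros((84, 84, 4), dtype=np.float32)   # e.g. a stacked square frame
assert channels_first(frame).shape == (4, 84, 84)
assert one_hot(3, 16)[3] == 1.0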
class Learner(object):
    def __init__(self, args):
        """ Construct a Learner from parsed arguments
        """
        self.total_timesteps = 0
        self.total_episodes = 0

        self._datetime = datetime.datetime.now()

        self._render = args.render
        self._learn_loops = args.loops
        self._learn_freq = args.erfreq
        self._offpolicy_noise = args.offpolicy_noise
        self._temp = float(args.temp)
        self._retro = args.retro

        # Make environment
        if args.retro:
            import retro
            self._env = retro.make(game=args.env)
        else:
            self._env = gym.make(args.env)

        # Wrap Atari with the DeepMind cheats
        if hasattr(self._env, 'unwrapped') and isinstance(
                self._env.unwrapped, gym.envs.atari.atari_env.AtariEnv):
            assert 'NoFrameskip' in self._env.spec.id

            self._env = atariwrap.NoopResetEnv(self._env, noop_max=30)
            self._env = atariwrap.MaxAndSkipEnv(self._env, skip=4)
            self._env = atariwrap.wrap_deepmind(self._env)

        # Observations
        self._discrete_obs = isinstance(self._env.observation_space,
                                        gym.spaces.Discrete)

        if self._discrete_obs:
            self._state_shape = (self._env.observation_space.n, )  # Prepare for one-hot encoding
        else:
            self._state_shape = self._env.observation_space.shape

            if len(self._state_shape) > 1:
                # Fix 2D shape for PyTorch
                s = self._state_shape
                self._state_shape = (s[2], s[0], s[1])

        # Primitive actions
        aspace = self._env.action_space

        if isinstance(aspace, gym.spaces.Tuple):
            aspace = aspace.spaces
        else:
            aspace = [aspace]  # Ensure that the action space is a list for all the environments

        if isinstance(aspace[0], gym.spaces.Discrete):
            # Discrete actions
            self._num_actions = int(np.prod([a.n for a in aspace]))
        elif isinstance(aspace[0], gym.spaces.MultiBinary):
            # Retro actions are binary vectors of pressed buttons. Quick HACK,
            # only press one button at a time
            self._num_actions = int(np.prod([a.n for a in aspace]))
        else:
            # Continuous actions
            print(aspace)
            raise NotImplementedError('Continuous actions are not supported')

        self._aspace = aspace

        # BDPI algorithm instance
        self._bdpi = BDPI(self._state_shape, self._num_actions, args, None)

        # Summary
        print('Number of primitive actions:', self._num_actions)
        print('State shape:', self._state_shape)

    def loadstore(self, filename, load=True):
        """ Load or store weights from/to a file
        """
        self._bdpi.loadstore(filename, load)

    def encode_state(self, state):
        """ Encode a raw state from Gym to a Numpy vector
        """
        if self._discrete_obs:
            # One-hot encode discrete variables
            rs = np.zeros(shape=self._state_shape, dtype=np.float32)
            rs[state] = 1.0
        elif len(self._state_shape) > 1:
            # Atari, retro and other image-based are NHWC, PyTorch is NCHW
            rs = np.float32(np.swapaxes(state, 2, 0))
        else:
            rs = np.asarray(state, dtype=np.float32)

        return rs

    def reset(self, last_reward):
        self._last_experience = None
        self._first_experience = None

        self._bdpi.reset(last_reward)
        self.total_episodes += 1

    def save_episode(self, name):
        states = []
        actions = []
        rewards = []
        entropies = []

        e = self._first_experience

        while e:
            states.append(e.state())
            actions.append(e.action)
            rewards.append(e.reward)
            entropies.append(e.entropy)

            e = e.next_experience

        s = pickle.dumps((states, actions, rewards, entropies))
        s = lzo.compress(s)

        f = open(name + '.episode', 'wb')
        f.write(s)
        f.close()

    def execute(self, env_state):
        """ Execute one episode in the environment.
        """
        done = False
        cumulative_reward = 0.0
        seen_reward = 0.0
        i = 0

        while (not done) and (i < 108000):
            # Select an action based on the current state
            self.total_timesteps += 1

            old_env_state = env_state
            state = self.encode_state(env_state)
            action, experience = self._bdpi.select_action(state, env_state)

            # Change the action if off-policy noise is to be used
            if random.random() < self._offpolicy_noise:
                action = random.randrange(self._num_actions)
                experience.action = action

            # Manage the experience chain
            if self._first_experience is None:
                self._first_experience = experience

            if self._last_experience is not None:
                self._last_experience.next_experience = experience

            self._last_experience = experience

            # Execute the action
            if len(self._aspace) > 1:
                # Choose each of the factored action depending on the composite action
                actions = [0] * len(self._aspace)

                for j in range(len(actions)):
                    actions[j] = action % self._aspace[j].n
                    action //= self._aspace[j].n

                env_state, reward, done, __ = self._env.step(actions)
            else:
                # Simple scalar action
                if self._retro:
                    # Binary action
                    a = np.zeros((self._num_actions, ), dtype=np.int8)
                    a[action] = 1
                    action = a

                env_state, reward, done, __ = self._env.step(action)

            i += 1
            public_reward = reward

            # Render the environment if needed
            if self._render > 0 and self.total_episodes >= self._render:
                self._env.render()

            # Add the reward of the action
            experience.reward = reward

            cumulative_reward += public_reward
            seen_reward += experience.reward

            # Learn from the experience buffer
            if self._learn_freq == 0:
                do_learn = done
            else:
                do_learn = (self.total_timesteps % self._learn_freq == 0)

            if do_learn:
                s = datetime.datetime.now()
                d = (s - self._datetime).total_seconds()
                print('Start Learning, in-between is %.3f seconds...' % d)

                count = self._bdpi.train()

                ns = datetime.datetime.now()
                d = (ns - s).total_seconds()

                print(
                    'Learned %i steps in %.3f seconds, %.2f timesteps per second'
                    % (count, d, count / d))
                print('S', count / d, file=sys.stderr)

                sys.stderr.flush()
                sys.stdout.flush()

                self._datetime = ns

        return (env_state, cumulative_reward, seen_reward, done, i)