Example #1
def make_atari_env(env_id, seed, monitor=False, monitor_dir=None):
    r"""Create Atari environment with all necessary preprocessings. 
    
    Args:
        env_id (str): Atari game name without version, e.g. Pong, Breakout
        seed (int): random seed for the environment
        monitor (bool, optional): If ``True``, then wrap the environment with Monitor for video recording.
        monitor_dir (str, optional): directory to save all data from Monitor. 
        
    Returns
    -------
    env : Env
        lagom-compatible environment
    """
    env = gym.make(env_id + 'NoFrameskip-v4')
    # remove gym's TimeLimit wrapper (caps episodes at 100k frames); we want a 108k-frame (30 min) cap instead
    env = env.env
    if monitor:
        env = Monitor(env, monitor_dir)
    env = GymWrapper(env)
    env = ResizeObservation(env, 84)
    env = GrayScaleObservation(env, keep_dim=False)
    env = AtariPreprocessing(env)
    env = ScaleImageObservation(env)
    env = ClipReward(env)
    env = FrameStack(env, 4)
    env.seed(seed)

    return env
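A hypothetical usage sketch for the helper above (not part of the original source); it assumes gym and the lagom-style wrappers referenced in the function are installed and importable:

# Hypothetical call; 'Breakout' resolves to 'BreakoutNoFrameskip-v4' inside the helper.
env = make_atari_env('Breakout', seed=0, monitor=False)
obs = env.reset()
# With 84x84 grayscale frames and 4-frame stacking, obs is expected to be
# roughly of shape (4, 84, 84); the exact type/layout depends on the FrameStack used.
env.close()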
Example #2
def setup_env_agent(env, monitor, reward_shaping, frame_stack, train):

    env = gym.make(env)
    if monitor:
        if not os.path.exists('./monitor_dir'):
            os.makedirs('./monitor_dir')
        env = Monitor(env, './monitor_dir/', force=True)
    # only shape reward when training and when stipulated
    reward_shaping = reward_shaping and train
    env = RewardClipWrapper(env, reward_shaping)
    if len(env.observation_space.shape) == 1:
        # a rank-1 shape means a 1D observation space, so no convolutions are needed
        conv = False
        input_dim = int(env.observation_space.shape[0])
    else:
        # otherwise the observation is an image, so convolutional layers are needed
        conv = True
        input_dim = 84
    env.seed(0)
    agent = DQNAgent(env.action_space,
                     frame_stack=frame_stack,
                     conv=conv,
                     input_dim=input_dim)
    return env, agent
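A minimal, hypothetical call of the helper above (argument values are illustrative; DQNAgent and RewardClipWrapper come from the snippet's parent project):

# 'CartPole-v1' has a rank-1 observation space, so the helper should select
# conv=False and input_dim=4 before building the DQNAgent.
env, agent = setup_env_agent('CartPole-v1',
                             monitor=False,
                             reward_shaping=True,
                             frame_stack=1,
                             train=True)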
Example #3
def make_gym_env(env_id, seed, monitor=False, monitor_dir=None):
    r"""Create an OpenAI Gym environment, and wrap it into lagom-compatible :class:`Env`. 
    
    Example::
    
        >>> env = make_gym_env(env_id='CartPole-v1', seed=1, monitor=False)
        >>> env
        <GymWrapper, <TimeLimit<CartPoleEnv<CartPole-v1>>>>
        
        >>> env.reset()
        array([ 0.03073904,  0.00145001, -0.03088818, -0.03131252])
    
    Args:
        env_id (str): OpenAI Gym environment ID, e.g. 'Pendulum-v0', 'Ant-v2'
        seed (int): random seed for the environment
        monitor (bool, optional): If ``True``, then wrap the environment with Monitor for video recording.
        monitor_dir (str, optional): directory to save all data from Monitor. 
        
    Returns
    -------
    env : Env
        lagom-compatible environment
    """
    env = gym.make(env_id)
    if monitor:
        env = Monitor(env, monitor_dir)
    env = GymWrapper(env)
    env.seed(seed)

    return env
Example #4
def main():
    """ Orchestrates agent and environment interactions. """
    # Create environment
    environment = gym.make(ENVIRONMENT)
    if RECORD:
        environment = Monitor(env=environment,
                              directory=VIDEO_DIRECTORY,
                              video_callable=lambda episode_id: True,
                              force=True)
    # Set random seeds
    environment.seed(0)
    np.random.seed(0)
    # Get action and state space sizes
    action_space = environment.action_space.n
    state_space = environment.observation_space.shape[0]
    # Instantiate agent
    agent = Agent(action_space, state_space)
    # Load model weights
    if path.exists(CHECKPOINT_DIRECTORY):
        agent.load(CHECKPOINT_DIRECTORY)
    # Initialise list of all rewards
    rewards = []
    for episode in range(EPISODES):
        # Get initial state
        state = environment.reset()
        state = np.reshape(state, (1, state_space))
        # Reset score for this episode
        score = 0
        for _ in range(STEPS):
            if RENDER:
                environment.render()
            # Agent selects action from state
            action = agent.act(state)
            # Agent performs action and makes an observation of the environment
            next_state, reward, done, _ = agent.observe(environment, action)
            next_state = np.reshape(next_state, (1, state_space))
            observation = (state, action, reward, next_state, done)
            # Agent remembers parameters of this time step
            agent.remember(observation)
            state = next_state
            # Agent retrains model
            agent.learn()
            score += reward
            if done:
                print("Episode: {}/{}. Reward: {:.2f}".format(
                    episode + 1, EPISODES, score))
                break
        rewards.append(score)
        # Average reward over the last 100 episodes
        average_reward = np.mean(rewards[-100:])
        print("Average reward: {:.2f}\n".format(average_reward))
    # Terminate environment
    environment.close()
    # Save model
    agent.save(CHECKPOINT_DIRECTORY)
    # Display performance over time
    summary(rewards)
Example #5
 def thunk():
     env = gym.make(gym_id)
     env = gym.wrappers.RecordEpisodeStatistics(env)
     if args.capture_video:
         if idx == 0:
             env = Monitor(env, f'videos/{experiment_name}')
     env.seed(seed)
     env.action_space.seed(seed)
     env.observation_space.seed(seed)
     return env
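The zero-argument thunk above is the usual building block for vectorized environments. A self-contained sketch of that pattern, assuming a pre-0.26 gym where env.seed still exists:

import gym

def make_env(gym_id, seed):
    # return a zero-argument callable, mirroring the thunk pattern above
    def thunk():
        env = gym.make(gym_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env.seed(seed)
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env
    return thunk

# each callable builds one sub-environment; reset() returns batched observations
envs = gym.vector.SyncVectorEnv([make_env('CartPole-v1', seed=i) for i in range(4)])
obs = envs.reset()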
Example #6
def make_env(seed=None, monitor=False, monitor_dir=None):
    env = gym.make('Acrobot-v1')
    if monitor:
        env = Monitor(env, directory=monitor_dir)
    env = GymEnv(env)

    if seed is not None:
        env.seed(seed)

    return env
Example #7
def make_env(env_id, use_monitor=False, monitor_dir='recordings', seed=None):
    """Instantiates the OpenAI Gym environment
    
    Args:
        env_id (string): OpenAI Gym environment ID
        use_monitor (bool): whether or not to use gym.wrappers.Monitor
        monitor_dir (string): directory where Monitor saves its recordings
        seed (int): random seed for the environment
    """

    env = gym.make(env_id)  # instantiate the environment

    if use_monitor:
        env = Monitor(env, monitor_dir)

    env.seed(seed)

    return env
Example #8
def main():
    config = EvolutionConfig()

    env = gym.make('LunarLander-v2')
    env = Monitor(env, '/tmp/evolution', force=True)
    env.seed(config.seed)
    np.random.seed(config.seed)

    policy = EvolutionPolicy(env, config)
    # Continue training
    #policy.W = np.load('weights.npy')

    rewards, n_generations = train(policy, config)

    env.close()

    create_plot(rewards, n_generations)
Example #9
def get_new_env(env_name, cmdl):
    """Configure the training environment and return an instance."""
    import logging
    import gym
    import gym_fast_envs  # noqa
    from gym.wrappers import Monitor

    # Undo the default logger and configure a new one.
    gym.undo_logger_setup()
    logger = logging.getLogger()
    logger.setLevel(logging.WARNING)

    # Configure environment
    outdir = '/tmp/nec/%s-results' % cmdl.label
    env = gym.make(env_name)
    env = Monitor(env, directory=outdir, force=True, video_callable=False)
    env.seed(cmdl.seed)
    return env
Example #10
 def thunk():
     env = gym.make(gym_id)
     env = wrap_atari(env)
     env = gym.wrappers.RecordEpisodeStatistics(env)
     if args.capture_video:
         if idx == 0:
             env = Monitor(env, f'videos/{experiment_name}')
     env = wrap_pytorch(
         wrap_deepmind(
             env,
             clip_rewards=True,
             frame_stack=True,
             scale=False,
         ))
     env.seed(seed)
     env.action_space.seed(seed)
     env.observation_space.seed(seed)
     return env
Example #11
        def _thunk():
            random_seed(seed)
            if env_id.startswith('bsuite'):
                id = env_id.split('bsuite-')[1]
                self.video_enabled = False
                bsuite_env = bsuite.load_from_id(id)
                env = gym_wrapper.GymFromDMEnv(bsuite_env)

            elif env_id.startswith("dm"):
                import dm_control2gym
                _, domain, task = env_id.split('-')
                env = dm_control2gym.make(domain_name=domain, task_name=task)

            else:
                if special_args is not None and 'NChain' in special_args[0]:
                    print('starting chain N = ', special_args[1])
                    env = gym.make(env_id, n=special_args[1])
                else:
                    env = gym.make(env_id)

            if self.video_enabled:
                env = Monitor(env,
                              self.log_dir,
                              video_callable=self.video_callable)

            is_atari = hasattr(gym.envs, 'atari') and isinstance(
                env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
            if is_atari:
                env = make_atari(env_id)
            env.seed(seed + rank)
            env = OriginalReturnWrapper(env)
            if is_atari:
                env = wrap_deepmind(env,
                                    episode_life=episode_life,
                                    clip_rewards=False,
                                    frame_stack=False,
                                    scale=False)
                obs_shape = env.observation_space.shape
                if len(obs_shape) == 3:
                    env = TransposeImage(env)
                env = FrameStack(env, 4)
            return env
Example #12
 def thunk():
     env = gym.make(gym_id)
     env = wrap_atari(env, sticky_action=args.sticky_action)
     env = gym.wrappers.RecordEpisodeStatistics(env)
     if args.capture_video:
         if idx == 0:
             env = Monitor(env, f'videos/{experiment_name}', video_callable=lambda episode_id: episode_id % args.video_interval == 0)
     env = wrap_pytorch(
         wrap_deepmind(
             env,
             episode_life=True,
             clip_rewards=True,
             frame_stack=True,
             scale=False,
         )
     )
     env.seed(seed)
     env.action_space.seed(seed)
     env.observation_space.seed(seed)
     return env
Example #13
    def _thunk():
        if env_id.startswith("dm"):
            _, domain, task = env_id.split('.')
            env = dm_control2gym.make(domain_name=domain, task_name=task)
        else:
            env = gym.make(env_id)
        if save_video is not None:
            env = Monitor(env, save_video, force=True)
        env.seed(seed + rank)

        obs_shape = env.observation_space.shape
        if add_timestep and len(obs_shape) == 1 and str(env).find('TimeLimit') > -1:
            env = AddTimestep(env)

        # If the input has shape (W,H,3), wrap for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 2, 3]:
            env = WrapPyTorch(env)

        return env
Example #14
def make_gym_env(env_id, seed, monitor=False, monitor_dir=None):
    """
    Create a gym environment, wrap it with GymEnv and seed it. 
    
    Args:
        env_id (str): environment ID, e.g. 'Pendulum-v0'
        seed (int): random seed
        monitor (bool, optional): Whether to wrap the environment with Monitor for video recording. 
        monitor_dir (str, optional): directory to save all data from Monitor. 
        
    Returns:
        env (GymEnv): created environment
    """
    # Create gym environment
    env = gym.make(env_id)
    # Wrap the environment with Monitor if required
    if monitor:
        env = Monitor(env, monitor_dir)
    # Wrap the environment with GymEnv
    env = GymEnv(env)
    # Seed the environment
    env.seed(seed)

    return env
Example #15
        makedirs(output_dir)

    # register environment in Gym according to env_config
    helper.register_gym_environment(True, FPS, SHOW_SCORE_BAR)

    # saves / copies configs to file
    config.save_json(join(output_dir, 'config.json'))
    helper.save_state_features(join(output_dir, 'state_features.csv'))

    # create environment and monitor
    env = gym.make(config.gym_env_id)
    env = Monitor(env,
                  directory=output_dir,
                  force=True,
                  video_callable=lambda _: True)
    env.seed(config.seed)

    # adds reference to monitor to allow for gym environments to update video frames
    env.env.env.monitor = env

    # create the agent
    agent, exploration_strategy = create_agent(helper, AgentType.Manual, None)
    behavior_tracker = BehaviorTracker(config.num_episodes)

    # tries to load agent info
    if agent_t != AgentType.Manual and exists(agent_dir):
        agent.load(agent_dir)

    window_still_open = True
    e = 0
    save_features = save_environment = False
Example #16
def collect_data(cfg, plot=False):  # Creates horizon^2/2 points
    """
    Collect data for environment model
    :param nTrials:
    :param horizon:
    :return: an array of DotMaps, where each DotMap contains info about a trajectory
    """

    env_model = cfg.env.name
    env = gym.make(env_model)
    if (cfg.video):
        env = Monitor(env, hydra.utils.get_original_cwd() + '/trajectories/reacher/video',
                      video_callable=lambda episode_id: episode_id == 1, force=True)
    log.info('Initializing env: %s' % env_model)

    # Logs is an array of dotmaps, each dotmap contains 2d np arrays with data
    # about <horizon> steps with actions, rewards and states
    logs = []
    if (cfg.PID_test):
        target = np.random.rand(5) * 2 - 1
    for i in range(cfg.num_trials):
        log.info('Trial %d' % i)
        if (cfg.PID_test):
            env.seed(0)
        else:
            env.seed(i)
        s0 = env.reset()

        # P = np.array([4, 4, 1, 1, 1])
        P = np.random.rand(5) * 5
        I = np.zeros(5)
        # D = np.array([0.2, 0.2, 2, 0.4, 0.4])
        D = np.random.rand(5)

        # Samples target uniformly from [-1, 1]
        if (not cfg.PID_test):
            target = np.random.rand(5) * 2 - 1

        policy = PID(dX=5, dU=5, P=P, I=I, D=D, target=target)
        # print(type(env))
        dotmap = run_controller(env, horizon=cfg.trial_timesteps, policy=policy, video=cfg.video)

        dotmap.target = target
        dotmap.P = P / 5
        dotmap.I = I
        dotmap.D = D
        logs.append(dotmap)

    if plot:
        import plotly.graph_objects as go

        fig = go.Figure()

        fig.update_layout(
            width=1500,
            height=800,
            autosize=False,
            scene=dict(
                camera=dict(
                    up=dict(
                        x=0,
                        y=0,
                        z=1
                    ),
                    eye=dict(
                        x=0,
                        y=1.0707,
                        z=1,
                    )
                ),
                aspectratio=dict(x=1, y=1, z=0.7),
                aspectmode='manual'
            ),
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)'
        )
        for d in logs:
            states = d.states
            actions = d.actions
            plot_reacher(states, actions)

    return logs
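For reference, a hypothetical sketch of the Hydra/OmegaConf config consumed by collect_data; the key names are inferred from the attribute accesses above and the values are placeholders:

from omegaconf import OmegaConf

# Placeholder config mirroring the fields read in collect_data:
# cfg.env.name, cfg.video, cfg.PID_test, cfg.num_trials, cfg.trial_timesteps.
cfg = OmegaConf.create({
    'env': {'name': 'Reacher-v2'},
    'video': False,
    'PID_test': False,
    'num_trials': 10,
    'trial_timesteps': 100,
})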
Example #17
class Task():
    """Problem domain to be solved by neural network. Uses OpenAI Gym patterns.
  """
    def __init__(self,
                 game,
                 paramOnly=False,
                 nReps=1,
                 record_path="./test_videos"):
        """Initializes task environment
  
    Args:
      game - (string) - dict key of task to be solved (see domain/config.py)
  
    Optional:
      paramOnly - (bool)  - only load parameters instead of launching task?
      nReps     - (int)   - number of trials to get average fitness
    """
        # Network properties
        self.nInput = game.input_size
        self.nOutput = game.output_size
        self.actRange = game.h_act
        self.absWCap = game.weightCap
        self.layers = game.layers
        self.activations = np.r_[np.full(1, 1), game.i_act, game.o_act]

        # Environment
        self.maxEpisodeLength = game.max_episode_length
        self.actSelect = game.actionSelect

        if not paramOnly:
            self.env = make_env(game.env_name)
            if record_path:
                self.env_to_wrap = self.env
                self.env = Monitor(self.env_to_wrap, record_path, force=True)

        # Special needs...
        self.needsClosed = (game.env_name.startswith("CartPoleSwingUp"))

    def testInd(self, wVec, aVec, view=False, seed=-1):
        """Evaluate individual on task
    Args:
      wVec    - (np_array) - weight matrix as a flattened vector
                [N**2 X 1]
      aVec    - (np_array) - activation function of each node 
                [N X 1]    - stored as ints (see applyAct in ann.py)
  
    Optional:
      view    - (bool)     - view trial?
      seed    - (int)      - starting random seed for trials
  
    Returns:
      fitness - (float)    - reward earned in trial
    """
        if seed >= 0:
            random.seed(seed)
            np.random.seed(seed)
            self.env.seed(seed)

        state = self.env.reset()
        self.env.t = 0

        annOut = act(wVec, aVec, self.nInput, self.nOutput, state)
        action = selectAct(annOut, self.actSelect)

        state, reward, done, info = self.env.step(action)
        if self.maxEpisodeLength == 0:
            return reward
        else:
            totalReward = reward

        for tStep in range(self.maxEpisodeLength):
            annOut = act(wVec, aVec, self.nInput, self.nOutput, state)
            action = selectAct(annOut, self.actSelect)
            state, reward, done, info = self.env.step(action)
            totalReward += reward
            if view:
                #time.sleep(0.01)
                if self.needsClosed:
                    self.env.render(close=done)
                else:
                    self.env.render()
            if done:
                break

        return totalReward


# -- 'Weight Agnostic Network' evaluation -------------------------------- -- #

    def setWeights(self, wVec, wVal):
        """Set single shared weight of network
  
    Args:
      wVec    - (np_array) - weight matrix as a flattened vector
                [N**2 X 1]
      wVal    - (float)    - value to assign to all weights
  
    Returns:
      wMat    - (np_array) - weight matrix with single shared weight
                [N X N]
    """
        # Create connection matrix
        wVec[np.isnan(wVec)] = 0
        dim = int(np.sqrt(np.shape(wVec)[0]))
        cMat = np.reshape(wVec, (dim, dim))
        cMat[cMat != 0] = 1.0

        # Assign value to all weights
        wMat = np.copy(cMat) * wVal
        return wMat


    def getDistFitness(self, wVec, aVec, hyp, \
                      seed=-1,nRep=False,nVals=6,view=False,returnVals=False):
        """Get fitness of a single individual with distribution of weights
  
    Args:
      wVec    - (np_array) - weight matrix as a flattened vector
                [N**2 X 1]
      aVec    - (np_array) - activation function of each node 
                [N X 1]    - stored as ints (see applyAct in ann.py)
      hyp     - (dict)     - hyperparameters
        ['alg_wDist']        - weight distribution  [standard;fixed;linspace]
        ['alg_absWCap']      - absolute value of highest weight for linspace
  
    Optional:
      seed    - (int)      - starting random seed for trials
      nRep    - (int)      - number of trials to get average fitness
      nVals   - (int)      - number of weight values to test

  
    Returns:
      fitness - (float)    - mean reward over all trials
    """
        if nRep is False:
            nRep = hyp['alg_nReps']

        # Set weight values to test WANN with
        if hyp['alg_wDist'] == "standard" and nVals == 6:
            # Double, constant, and half signal
            wVals = np.array((-2, -1.0, -0.5, 0.5, 1.0, 2))
        else:
            wVals = np.linspace(-self.absWCap, self.absWCap, nVals)

        # Get reward from 'reps' rollouts -- test population on same seeds
        reward = np.empty((nRep, nVals))
        for iRep in range(nRep):
            for iVal in range(nVals):
                monitor_name = "./cartpole_{}".format(iVal)
                self.env = Monitor(self.env_to_wrap, monitor_name)
                wMat = self.setWeights(wVec, wVals[iVal])
                if seed == -1:
                    reward[iRep, iVal] = self.testInd(wMat,
                                                      aVec,
                                                      seed=seed,
                                                      view=view)
                else:
                    reward[iRep, iVal] = self.testInd(wMat,
                                                      aVec,
                                                      seed=seed + iRep,
                                                      view=view)

        if returnVals is True:
            return np.mean(reward, axis=0), wVals
        return np.mean(reward, axis=0)
Example #18
def main(argv=None):
    try:
        options, args = getopt.getopt(sys.argv[1:], "s:x:b:u:mh", [
                                      "step=", "max_eps=", "buffer_size=", "hidden_unit=","monitor", "help"])
    except getopt.GetoptError as err:
        print(str(err))
        print(usage.__doc__)
        sys.exit(1)

    GAME_NAME = 'CartPole-v1'
    AGENT_NAME = 'DQN-lr_1_e-3'
    MONITOR = False
    print_step = 10
    max_eps = 500
    buffer_size=1000000
    hidden_unit = 16
    lr=1e-3

    print(options)
    for o, v in options:
        if o in ("-h", "--help"):
            print(usage.__doc__)
            sys.exit()
        elif o in ("-m", "--monitor"):
            MONITOR = True
        elif o in ("-s", "--step"):
            print_step = int(v)
        elif o in ("-x", "--max_eps"):
            max_eps = int(v)
        elif o in ("-b", "--buffer_size"):
            buffer_size = int(v)
        elif o in ("-u", "--hidden_unit"):
            hidden_unit = int(v)
        else:
            print(usage.__doc__)
            sys.exit()

    print('process game: %s\tusing agent: %s' % (GAME_NAME, AGENT_NAME))

    # -------------------------------- loop for training -----------------------------
    # preparing env
    output_dir = '%s/%s' % (GAME_NAME, AGENT_NAME)
    cmd = 'mkdir -p %s && mkdir -p %s/%s' % (GAME_NAME, GAME_NAME, AGENT_NAME)
    os.system(cmd)

    env = gym.make(GAME_NAME)
    if MONITOR:
        env = Monitor(env, directory=output_dir, force=True, video_callable=lambda ep: ep % 10 == 0, write_upon_reset=True, mode='training')
    
    env.seed(0)

    state_num = len(env.reset())
    print(state_num)
    action_sample = env.action_space.sample()
    action_num = env.action_space.n if isinstance(action_sample, int) else len(action_sample)
    print('state_num: %d\taction_num: %d' % (state_num, action_num))
    
    device = torch.device('cpu')
    agent = DQNAgent(state_num, action_num, buffer_size=buffer_size, batch_size=128, device=device, hidden_unit=hidden_unit, lr=lr)

    scores_window = deque(maxlen=print_step)  # last 10 scores
    avg_scores = []

    for i_episode in range(max_eps):
        score = 0
        state = env.reset()
        
        while True:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        scores_window.append(score)

        print('\rEpisode {}\tAverage Score: {:.2f} '.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % print_step == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
            # save model
            agent.save_model_params(output_dir, i_episode)

        avg_scores.append(np.mean(scores_window))
        sys.stdout.flush()

    env.close()
Example #19
def act(args, i, rb, q_network, lock, queues, queue, stats_queue, global_step,
        device, writer):
    env = gym.make(args.gym_id)
    env = wrap_atari(env)
    env = gym.wrappers.RecordEpisodeStatistics(
        env)  # records episode reward in `info['episode']['r']`
    if args.capture_video:
        env = Monitor(env, f'videos/{experiment_name}')
    env = wrap_pytorch(
        wrap_deepmind(
            env,
            clip_rewards=True,
            frame_stack=True,
            scale=False,
        ))
    env.seed(args.seed + i)
    env.action_space.seed(args.seed + i)
    # TRY NOT TO MODIFY: start the game
    obs = env.reset()
    episode_reward = 0
    while global_step < (args.total_timesteps):
        # global_step *= args.num_actor
        # ALGO LOGIC: put action logic here
        epsilon = linear_schedule(
            args.start_e, args.end_e,
            args.exploration_fraction * args.total_timesteps, global_step)
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            logits = q_network.forward(
                torch.Tensor(obs.reshape((1, ) + obs.shape)))
            action = torch.argmax(logits, dim=1).tolist()[0]
        # action = env.action_space.sample()

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, reward, done, info = env.step(action)
        episode_reward += reward

        # TRY NOT TO MODIFY: record rewards for plotting purposes
        with lock:
            global_step += 1
            if 'episode' in info.keys():
                stats_queue.put((info['episode']['r'], info['episode']['l']))
                # writer.add_scalar("charts/episode_reward", info['episode']['r'], global_step)
                # writer.add_scalar("charts/epsilon", epsilon, global_step)

            # ALGO LOGIC: training.
            rb.put((obs, action, reward, next_obs, done))
            if global_step > args.learning_starts and global_step % args.train_frequency == 0:
                s = rb.sample(args.batch_size)
                queue.put([torch.Tensor(item) for item in s])
                # for idx, queue in enumerate(queues):
                #     queue.put(torch.Tensor(s[idx]))

        # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
        obs = next_obs
        if done:
            # important to note that because `EpisodicLifeEnv` wrapper is applied,
            # the real episode reward is actually the sum of episode reward of 5 lives
            # which we record through `info['episode']['r']` provided by gym.wrappers.RecordEpisodeStatistics
            obs, episode_reward = env.reset(), 0
Example #20
def act(args, experiment_name, i, worker_models, lock, rollouts_queue,
        stats_queue, global_step, device):
    actor, qf1, qf2, qf1_target, qf2_target, target_actor = worker_models
    env = gym.make(args.gym_id)
    env = gym.wrappers.RecordEpisodeStatistics(
        env)  # records episode reward in `info['episode']['r']`
    if args.capture_video:
        if i == 0:
            env = Monitor(env, f'videos/{experiment_name}')
    env.seed(args.seed + i)
    env.action_space.seed(args.seed + i)
    # TRY NOT TO MODIFY: start the game
    obs = env.reset()
    max_action = float(env.action_space.high[0])
    storage = []
    episode_reward = 0
    update_step = 0
    while True:
        update_step += 1
        # ALGO LOGIC: put action logic here
        if global_step < args.learning_starts // args.num_actors:
            action = env.action_space.sample()
        else:
            action = actor.forward(obs.reshape((1, ) + obs.shape))
            action = (action.tolist()[0] +
                      np.random.normal(0,
                                       max_action * args.exploration_noise,
                                       size=env.action_space.shape[0])).clip(
                                           env.action_space.low,
                                           env.action_space.high)

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, reward, done, info = env.step(action)
        episode_reward += reward
        storage += [(obs, action, reward, next_obs, float(done))]
        with lock:
            global_step += 1
        if 'episode' in info.keys():
            stats_queue.put(("charts/episode_reward", info['episode']['r'],
                             info['episode']['l']))

        if len(storage) == args.actor_buffer_size:
            obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
            for data in storage:
                obs_t, action, reward, obs_tp1, done = data
                obses_t.append(np.array(obs_t, copy=False))
                actions.append(np.array(action, copy=False))
                rewards.append(reward)
                obses_tp1.append(np.array(obs_tp1, copy=False))
                dones.append(done)
            s_obs, s_actions, s_rewards, s_next_obses, s_dones = np.array(
                obses_t), np.array(actions), np.array(rewards), np.array(
                    obses_tp1), np.array(dones)

            with torch.no_grad():
                clipped_noise = (torch.randn_like(torch.Tensor(action)) *
                                 args.policy_noise).clamp(
                                     -args.noise_clip, args.noise_clip)

                next_state_actions = (target_actor.forward(s_next_obses) +
                                      clipped_noise.to(device)).clamp(
                                          env.action_space.low[0],
                                          env.action_space.high[0])
                qf1_next_target = qf1_target.forward(s_next_obses,
                                                     next_state_actions)
                qf2_next_target = qf2_target.forward(s_next_obses,
                                                     next_state_actions)
                min_qf_next_target = torch.min(qf1_next_target,
                                               qf2_next_target)
                next_q_value = torch.Tensor(s_rewards).to(device) + (
                    1 - torch.Tensor(s_dones).to(device)) * args.gamma * (
                        min_qf_next_target).view(-1)

                qf1_a_values = qf1.forward(
                    s_obs,
                    torch.Tensor(s_actions).to(device)).view(-1)
                td_errors = qf1_a_values - next_q_value

            new_priorities = np.abs(td_errors.tolist()) + args.pr_eps
            rollouts_queue.put((storage, new_priorities))
            storage = []

        # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
        obs = next_obs
        if done:
            # reset the environment and the running episode return at the end of an episode
            obs, episode_reward = env.reset(), 0
Example #21
def run_trial(args):
    # tries to get agent type
    agent_t = args.agent
    if agent_t == AgentType.Testing:
        # tries to load a pre-trained agent configuration file
        config, results_dir = load_agent_config(args.results, args.trial)
    else:
        # tries to load env config from provided file path
        config_file = args.config_file_path
        config = args.default_frogger_config if config_file is None or not exists(config_file) \
            else EnvironmentConfiguration.load_json(config_file)
    # creates env helper
    helper = create_helper(config)
    # checks for provided output dir
    output_dir = args.output if args.output is not None else \
        get_agent_output_dir(config, agent_t, args.trial)
    if not exists(output_dir):
        makedirs(output_dir)
    # saves / copies configs to file
    config.save_json(join(output_dir, 'config.json'))
    helper.save_state_features(join(output_dir, 'state_features.csv'))
    # register environment in Gym according to env config
    env_id = '{}-{}-v0'.format(config.gym_env_id, args.trial)
    helper.register_gym_environment(env_id, False, args.fps,
                                    args.show_score_bar)
    # create environment and monitor
    env = gym.make(env_id)
    config.num_episodes = args.num_episodes
    video_callable = video_schedule(config, args.record)
    env = Monitor(env,
                  directory=output_dir,
                  force=True,
                  video_callable=video_callable)
    # adds reference to monitor to allow for gym environments to update video frames
    if video_callable(0):
        env.env.monitor = env
    # initialize seeds (one for the environment, another for the agent)
    env.seed(config.seed + args.trial)
    agent_rng = np.random.RandomState(config.seed + args.trial)
    # creates the agent
    agent, exploration_strategy = create_agent(helper, agent_t, agent_rng)
    # if testing, loads tables from file (some will be filled by the agent during the interaction)
    if agent_t == AgentType.Testing:
        agent.load(results_dir)
    # runs episodes
    behavior_tracker = BehaviorTracker(config.num_episodes)
    recorded_episodes = []
    for e in range(config.num_episodes):
        # checks whether to activate video monitoring
        env.env.monitor = env if video_callable(e) else None
        # reset environment
        old_obs = env.reset()
        old_s = helper.get_state_from_observation(old_obs, 0, False)
        if args.verbose:
            print(f'Episode: {e}')
            # helper.update_stats_episode(e)
        exploration_strategy.update(e)  # update for learning agent
        t = 0
        done = False
        while not done:
            # select action
            a = agent.act(old_s)
            # observe transition
            obs, r, done, _ = env.step(a)
            s = helper.get_state_from_observation(obs, r, done)
            r = helper.get_reward(old_s, a, r, s, done)
            # update agent and stats
            agent.update(old_s, a, r, s)
            behavior_tracker.add_sample(old_s, a)
            helper.update_stats(e, t, old_obs, obs, old_s, a, r, s)
            old_s = s
            old_obs = obs
            t += 1
        # adds to recorded episodes list
        if video_callable(e):
            recorded_episodes.append(e)
        # signals new episode to tracker
        behavior_tracker.new_episode()
    # writes results to files
    agent.save(output_dir)
    behavior_tracker.save(output_dir)
    write_table_csv(recorded_episodes, join(output_dir, 'rec_episodes.csv'))
    helper.save_stats(join(output_dir, 'results'), args.clear_results)
    print('\nResults of trial {} written to:\n\t\'{}\''.format(
        args.trial, output_dir))
    env.close()
Example #22
params.log_dir = "../../logs/logs/self_R_DDPG/{}-mu{}".format(
    str(params.env_name.split("-")[0]), mu)
params.actor_model_dir = "../../logs/models/self_R_DDPG/{}/actor-mu{}/".format(
    str(params.env_name.split("-")[0]), mu)
params.critic_model_dir = "../../logs/models/self_R_DDPG/{}/critic-mu{}/".format(
    str(params.env_name.split("-")[0]), mu)
params.video_dir = "../../logs/video/self_R/{}-mu{}".format(
    str(params.env_name.split("-")[0]), mu)
params.plot_path = "../../logs/plots/self_R/{}-mu{}/".format(
    str(params.env_name.split("-")[0]), mu)

env = gym.make(params.env_name)
env = Monitor(env, params.video_dir)

# set seed
env.seed(params.seed)
tf.random.set_random_seed(params.seed)

replay_buffer = ReplayBuffer(params.memory_size)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)
# random_process = OrnsteinUhlenbeckProcess(size=env.action_space.shape[0], theta=0.15, mu=0.9, sigma=0.05)
random_process = GaussianNoise(mu=params.mu, sigma=params.sigma)
agent = DDPG(Actor, Critic, env.action_space.shape[0], random_process, params)

get_ready(agent.params)

global_timestep = tf.compat.v1.train.get_or_create_global_step()
time_buffer = deque(maxlen=agent.params.reward_buffer_ep)
log = logger(agent.params)
action_buffer, distance_buffer, eval_epochs = list(), list(), list()
Example #23
class GymTask():
    """Problem domain to be solved by neural network. Uses OpenAI Gym patterns.
  """
    def __init__(self, game, paramOnly=False, nReps=1, record=False):
        """Initializes task environment
  
    Args:
      game - (string) - dict key of task to be solved (see domain/config.py)
  
    Optional:
      paramOnly - (bool)  - only load parameters instead of launching task?
      nReps     - (int)   - number of trials to get average fitness
    """
        # Network properties
        self.nInput = game.input_size
        self.nOutput = game.output_size
        self.actRange = game.h_act
        self.absWCap = game.weightCap
        self.layers = game.layers
        self.activations = np.r_[np.full(1, 1), game.i_act, game.o_act]

        # Environment
        self.nReps = nReps
        self.maxEpisodeLength = game.max_episode_length
        self.actSelect = game.actionSelect
        if not paramOnly:
            if record:
                env_to_wrap = make_env(game.env_name)
                self.env = Monitor(env_to_wrap, "trial_recording/", force=True)
            else:
                self.env = make_env(game.env_name)

        # Special needs...
        self.needsClosed = (game.env_name.startswith("CartPoleSwingUp"))

    def getFitness(self, wVec, aVec, view=False, nRep=False, seed=-1):
        """Get fitness of a single individual.
  
    Args:
      wVec    - (np_array) - weight matrix as a flattened vector
                [N**2 X 1]
      aVec    - (np_array) - activation function of each node 
                [N X 1]    - stored as ints (see applyAct in ann.py)
  
    Optional:
      view    - (bool)     - view trial?
      nRep    - (int)      - number of trials to get average fitness
      seed    - (int)      - starting random seed for trials
  
    Returns:
      fitness - (float)    - mean reward over all trials
    """
        if nRep is False:
            nRep = self.nReps
        wVec[np.isnan(wVec)] = 0
        reward = np.empty(nRep)
        for iRep in range(nRep):
            if seed > 0:
                seed = seed + iRep
            reward[iRep] = self.testInd(wVec, aVec, view=view, seed=seed)
        fitness = np.mean(reward)
        return fitness

    def testInd(self, wVec, aVec, hyp=None, view=False, seed=-1):
        """Evaluate individual on task
    Args:
      wVec    - (np_array) - weight matrix as a flattened vector
                [N**2 X 1]
      aVec    - (np_array) - activation function of each node 
                [N X 1]    - stored as ints (see applyAct in ann.py)
  
    Optional:
      view    - (bool)     - view trial?
      seed    - (int)      - starting random seed for trials
  
    Returns:
      fitness - (float)    - reward earned in trial
    """
        if seed >= 0:
            random.seed(seed)
            np.random.seed(seed)
            self.env.seed(seed)

        state = self.env.reset()
        self.env.t = 0
        annOut = act(wVec, aVec, self.nInput, self.nOutput, state)
        action = selectAct(annOut, self.actSelect)
        state, reward, done, info = self.env.step(action)

        if self.maxEpisodeLength == 0:
            if view:
                if self.needsClosed:
                    self.env.render(close=done)
                else:
                    self.env.render()
            return reward
        else:
            totalReward = reward

        for tStep in range(self.maxEpisodeLength):
            annOut = act(wVec, aVec, self.nInput, self.nOutput, state)
            action = selectAct(annOut, self.actSelect)
            state, reward, done, info = self.env.step(action)
            totalReward += reward
            if view:
                if self.needsClosed:
                    self.env.render(close=done)
                else:
                    self.env.render()
            if done:
                break
        return totalReward
Example #24
for test_mode in test_modes:

    # Generate environment

    if "_n" in args.env:
        env = gym.make(args.env,
                       pairs_dict=pairs_dict,
                       test_instr_mode=test_mode,
                       num_dists=args.num_dists)
    else:
        env = gym.make(args.env)

    demo_path = os.path.join(model_path, test_mode)
    env = Monitor(env, demo_path, _check_log_this, force=True)
    env.seed(args.seed)

    # Define agent
    agent = utils.load_agent(env=env,
                             model_name=args.model,
                             argmax=args.argmax,
                             env_name=args.env,
                             instr_arch=args.instr_arch)
    utils.seed(args.seed)

    print('\n')
    print(f'=== EVALUATING MODE: {test_mode} ===')

    # Run the agent
    done = False
    action = None
Example #25
    device = torch.device(
        'cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
    env = gym.make(args.gym_id)
    env = wrap_atari(env)
    env = gym.wrappers.RecordEpisodeStatistics(
        env)  # records episode reward in `info['episode']['r']`
    if args.capture_video:
        env = Monitor(env, f'videos/{experiment_name}')
    env = wrap_pytorch(
        wrap_deepmind(
            env,
            clip_rewards=True,
            frame_stack=True,
            scale=False,
        ))
    env.seed(args.seed)
    env.action_space.seed(args.seed)
    env.observation_space.seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    # respect the default timelimit
    assert isinstance(env.action_space,
                      Discrete), "only discrete action space is supported"

    rb = ReplayBuffer(args.buffer_size)
    q_network = QNetwork()
    q_network.share_memory()
    target_network = QNetwork().to(device)
Example #26
import gym
from gym.wrappers import Monitor
env = Monitor(gym.make('LunarLanderContinuous-v2'),
              './video',
              force=True,
              video_callable=lambda episode_id: True)
env.seed(9756745635)
fitness = 0

for _ in range(10):
    observation = env.reset()
    done = False
    while not done:

        #################################

        x = observation[0]
        y = observation[1]
        vel_x = observation[2]
        vel_y = observation[3]
        ang = observation[4]
        vel_ang = observation[5]
        l_left = observation[6]
        l_right = observation[7]
        input = [0., 0.]

        #################################

        input = [
            max(min(i, 1.0), -1.0)
            for i in [(((observation[3] * -78.0698466187944) -
Example #27
def runExperiment(experiment):
    import numpy as np
    from collections import deque
    import gym
    from gym.wrappers import Monitor
    from agents.dqnagent import DQNAgent

    #environment parameters
    gym_id = experiment["gym_id"]
    sliding_window_solved_score = experiment["sliding_window_solved_score"]
    sliding_window_score_length = experiment["sliding_window_score_length"]
    env_seed = experiment["env_seed"]
    max_episode = experiment["max_episode"]

    env = gym.make(gym_id)
    env = Monitor(env,
                  "{}".format(experiment['folder']),
                  video_callable=False,
                  force=True,
                  resume=False,
                  write_upon_reset=False,
                  uid=None,
                  mode=None)

    env.seed(env_seed)
    scores = deque()
    sw_scores = deque(maxlen=sliding_window_score_length)

    #agent parameters
    agent_seed = experiment["agent_seed"]
    activation = experiment["activation"]
    min_episode_before_acting = experiment["min_episode_before_acting"]
    epsilon = experiment["epsilon"]
    nb_hidden_layer = experiment["nb_hidden_layer"]
    layer_width = experiment["layer_width"]
    memory_length = experiment["memory_length"]
    batch_size = experiment["batch_size"]
    agent = DQNAgent(env.observation_space, env.action_space, agent_seed,
                     min_episode_before_acting, activation, epsilon,
                     layer_width, nb_hidden_layer, memory_length)

    current_episode = 0
    while (len(sw_scores) == 0
           or np.mean(sw_scores) < sliding_window_solved_score) and (
               max_episode is None or current_episode < max_episode):
        state = env.reset()

        current_episode += 1
        reward = 0
        done = False
        episode_score = 0

        while not done:
            action = agent.act(state)

            next_state, reward, done, _ = env.step(action)

            agent.remember(state, action, reward, next_state, done)

            state = next_state

            episode_score += reward

            # if np.mean(sw_scores) > 180:
            #     env.render()

            if done:
                scores.append(episode_score)
                sw_scores.append(episode_score)

                print(
                    'Episode: {}\t Epsilon: {}\t Score: {}\t Mean Score:{}\t Sliding Score:{}\t'
                    .format(current_episode, agent.epsilon, episode_score,
                            np.mean(scores), np.mean(sw_scores)))
                agent.train(batch_size=batch_size)
    env.close()
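A hypothetical experiment dict for the runner above; the keys are exactly those read inside runExperiment, while the values are placeholders:

experiment = {
    'gym_id': 'CartPole-v1',
    'folder': './monitor_output',
    'sliding_window_solved_score': 195.0,
    'sliding_window_score_length': 100,
    'env_seed': 0,
    'max_episode': 1000,
    'agent_seed': 0,
    'activation': 'relu',
    'min_episode_before_acting': 10,
    'epsilon': 1.0,
    'nb_hidden_layer': 2,
    'layer_width': 64,
    'memory_length': 100000,
    'batch_size': 64,
}
runExperiment(experiment)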
Example #28
parser.add_argument('--sd_min', type=float, default=0.01)
parser.add_argument('--sd_steps', type=int, default=50000000)
parser.add_argument('--gpu_memory', type=float, default=0.1)
parser.add_argument('--loss_type', type=str, default='kl')
parser.add_argument('--device', type=str, default='/cpu:0')
parser.add_argument('--alg', choices=['dqn','adfq'], default='dqn')
parser.add_argument('--record',type=int, default=0)

args = parser.parse_args()
    
# Get the environment and extract the number of actions.
env = gym.make(args.env_name)
if args.record == 1:
    env = Monitor(env, directory=args.log_dir)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE

with tf.device(args.device):
    model = Sequential()
    if K.image_dim_ordering() == 'tf':
        # (width, height, channels)
        model.add(Permute((2, 3, 1), input_shape=input_shape))
    elif K.image_dim_ordering() == 'th':
        # (channels, width, height)
        model.add(Permute((1, 2, 3), input_shape=input_shape))
    else:
        raise RuntimeError('Unknown image_dim_ordering.')
Example #29
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    env = gym.make('FlappyBird-v0' if len(sys.argv)<2 else sys.argv[1])

    # You provide the directory to write to (can be an existing
    # directory, including one with existing data -- all monitor files
    # will be namespaced). You can also dump to a tempdir if you'd
    # like: tempfile.mkdtemp().
    outdir = '/tmp/random-agent-results'
    env = Monitor(env, directory=outdir, force=True)

    # This declaration must go *after* the monitor call, since the
    # monitor's seeding creates a new action_space instance with the
    # appropriate pseudorandom number generator.
    env.seed(0)
    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()

        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
            # Note there's no env.render() here. But the environment still can open window and
Example #30
import os

import gym
import numpy as np
import torch

import BCQ
import DDPG
import utils

from gym.wrappers import Monitor
# %%
ENV_NAME = "LunarLanderContinuous-v2"
SEED = 0
# %%
env = gym.make(ENV_NAME)
env = Monitor(env, 'videos/', force=True)
# %%
env.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
# %%
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
# %%
# Loading

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bcq = BCQ.BCQ(state_dim, action_dim, max_action, device, 0.99, 0.005, 0.75,
              0.05)

bcq.load(f"./models/bcq_{ENV_NAME}_{SEED}")