Code Example #1
File: play.py Project: ramon-oliveira/deeptitans
def game_config(args):
    if args.game == 'pong':
        actions = [0, 2, 3]
        meanings = ['NOOP', 'UP', 'DOWN']
        environment = gym.make('Pong-v0')
    elif args.game == 'breakout':
        actions = [0, 1, 2, 3]
        meanings = ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
        environment = gym.make('Breakout-v0')
    elif args.game == 'space-invaders':
        actions = [0, 1, 2, 3]
        meanings = ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
        environment = gym.make('SpaceInvaders-v0')
    else:
        raise Exception('Unknown game')

    shape = environment.observation_space.shape
    screen = args.nb_frame_state, shape[0]//2, shape[1]//2
    return {
        'actions': actions,
        'meanings': meanings,
        'environment': environment,
        'state_shape': screen,
        'preprocessing': utils.preprocessing,
    }
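
A quick arithmetic check of the state shape computed above, assuming the standard Pong-v0 observation shape of (210, 160, 3) and an illustrative value of 4 for args.nb_frame_state (both values are assumptions, not the project's defaults):

shape = (210, 160, 3)       # Pong-v0 observation_space.shape
nb_frame_state = 4          # illustrative args.nb_frame_state
print((nb_frame_state, shape[0] // 2, shape[1] // 2))  # -> (4, 105, 80)
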
Code Example #2
File: train.py Project: wbaek/reinforcement_learning
def main(args):
    logging.info( args )
    device = 'gpu' if args.gpu else 'cpu'

    devices = device_lib.list_local_devices()
    num_gpus = len([d for d in devices if '/gpu' in d.name])
 
    env = gym.make(args.game)
    env = Env(env, resized_width=84, resized_height=84, agent_history_length=4)
    num_actions = len(env.gym_actions)

    global_net = Network(num_actions, -1, 'cpu')
    actor_networks = []
    for t in range(args.threads):
        device_index = 0 if device == 'cpu' else (t if args.threads <= num_gpus else 0)
        n = Network(num_actions, t, device, device_index)
        n.tie_global_net(global_net)
        actor_networks.append(n)

    sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=args.threads, inter_op_parallelism_threads=args.threads))
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    if not os.path.exists(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)

    threads = []
    for t, net in enumerate(actor_networks):
        e = Env(gym.make(args.game), net.width, net.height, net.depth)
        w = Worker(t, e, net, sess, saver, args.checkpoint_dir)
        w.start()
        threads.append(w)

    for t in threads:
        t.join()
Code Example #3
def worker_func(input_queue, output_queue, device="cpu"):
    env_pool = [gym.make("RoboschoolHalfCheetah-v1")]

    # first generation -- just evaluate given single seeds
    parents = input_queue.get()
    for seed in parents:
        net = build_net(env_pool[0], seed).to(device)
        net.zero_noise(batch_size=1)
        reward, steps = evaluate(env_pool[0], net, device)
        output_queue.put((seed, reward, steps))

    while True:
        parents = input_queue.get()
        if parents is None:
            break
        parents.sort()
        for parent_seeds, children_iter in itertools.groupby(parents, key=lambda s: s[:-1]):
            batch = list(children_iter)
            children_seeds = [b[-1] for b in batch]
            net = build_net(env_pool[0], parent_seeds).to(device)
            net.set_noise_seeds(children_seeds)
            batch_size = len(children_seeds)
            while len(env_pool) < batch_size:
                env_pool.append(gym.make("RoboschoolHalfCheetah-v1"))
            rewards, steps = evaluate_batch(env_pool[:batch_size], net, device)
            for seeds, reward, step in zip(batch, rewards, steps):
                output_queue.put((seeds, reward, step))
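
The grouping step above assumes each entry in parents is a tuple of seeds, so children that share every seed except the last can reuse one rebuilt parent network. A minimal standalone sketch of that grouping (the seed tuples are made up for illustration):

import itertools

parents = [(1, 7), (1, 9), (2, 5)]   # (parent seeds..., child seed)
parents.sort()
for parent_seeds, children_iter in itertools.groupby(parents, key=lambda s: s[:-1]):
    print(parent_seeds, [c[-1] for c in children_iter])
# -> (1,) [7, 9]
#    (2,) [5]
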
Code Example #4
File: environment.py Project: joshiatul/game_playing
    def __init__(self, name, grid_size=None, last_n=None, delta_preprocessing=False):
        # self.base_folder_name = os.path.dirname(os.path.realpath(__file__)).replace('environments', 'solved_environments') + '/' + name
        # # TODO simplify for all atari games
        self.name = name
        if name == 'breakout':
            self.env = gym.make('Breakout-v0')
        elif name == 'pong':
            self.env = gym.make('Pong-v0')
        elif name == 'gridworld':
            pass
        else:
            self.env = gym.make(name)

        # gym returns 6 possible actions for breakout and pong.
        # I think only 3 are used for both. So making life easier
        # with "LEFT", "RIGHT", "NOOP" actions space.
        # env.unwrapped.get_action_meanings()
        if name in {'breakout', 'pong'}:
            self.action_space = [2, 3]
        elif name == 'gridworld':
            pass
        else:
            self.action_space = self.env.action_space

        self.resize = tuple(grid_size)
        self.history_length = last_n
        self.history = deque(maxlen=last_n)
        self.prev_x = None
        self.delta_preprocessing = delta_preprocessing
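
The comment in this example points to env.unwrapped.get_action_meanings() for inspecting the raw Atari action set; a minimal sketch of that inspection with the classic gym API (assuming an Atari-enabled gym install):

import gym

env = gym.make('Pong-v0')
# Typically prints ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE'];
# the class above keeps only indices 2 and 3 (RIGHT/LEFT, which move the paddle).
print(env.unwrapped.get_action_meanings())
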
Code Example #5
File: test_catalog.py Project: adgirish/ray
    def testGymPreprocessors(self):
        p1 = ModelCatalog.get_preprocessor(
            get_registry(), gym.make("CartPole-v0"))
        self.assertEqual(type(p1), NoPreprocessor)

        p2 = ModelCatalog.get_preprocessor(
            get_registry(), gym.make("FrozenLake-v0"))
        self.assertEqual(type(p2), OneHotPreprocessor)
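
The two assertions above follow from the observation spaces involved: CartPole-v0 exposes a Box space (used as-is, hence NoPreprocessor), while FrozenLake-v0 exposes a Discrete space that gets one-hot encoded. A small check of those spaces, assuming the classic gym environment IDs:

import gym

print(gym.make("CartPole-v0").observation_space)    # Box(4,)
print(gym.make("FrozenLake-v0").observation_space)  # Discrete(16)
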
Code Example #6
def get_env(name):
    if 'Acrobot-v0' == name:
        return gym.make('Acrobot-v0')
    elif 'MountainCar-v0' == name:
        return gym.make('MountainCar-v0')
    elif 'CartPole-v0' == name:
        return gym.make('CartPole-v0')
    else:
        raise Exception('Not %s env found'%(name))
Code Example #7
File: mdp.py Project: yuhsh24/RLlearning
def get_env(name):
    if "Acrobot-v0" == name:
        return gym.make("Acrobot-v0")
    elif "MountainCar-v0" == name:
        return gym.make("MountainCar-v0")
    elif "CartPole-v0" == name:
        return gym.make("CartPole-v0")
    else:
        raise Exception('Not %s env found' % (name))
Code Example #8
def run(args, parser):
    def create_environment(env_config):
        # This import must happen inside the method so that worker processes import this code
        import roboschool
        return gym.make(args.env)

    if not args.config:
        # Load configuration from file
        config_dir = os.path.dirname(args.checkpoint)
        # params.json is saved in the model directory during ray training by default
        config_path = os.path.join(config_dir, "params.json")
        with open(config_path) as f:
            args.config = json.load(f)

    if not args.env:
        if not args.config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = args.config.get("env")

    ray.init()

    register_env(args.env, create_environment)

    cls = get_agent_class(args.algorithm)
    config = args.config
    config["monitor"] = False
    config["num_workers"] = 1
    config["num_gpus"] = 0
    agent = cls(env=args.env, config=config)
    agent.restore(args.checkpoint)
    num_episodes = int(args.evaluate_episodes)

    if args.algorithm == "DQN":
        env = gym.make(args.env)
        env = wrap_dqn(env, args.config.get("model", {}))
    else:
        env = ModelCatalog.get_preprocessor_as_wrapper(gym.make(args.env))
    env = wrappers.Monitor(env, OUTPUT_DIR, force=True, video_callable=lambda episode_id: True)
    all_rewards = []
    for episode in range(num_episodes):
        steps = 0
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done:
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            steps += 1
            state = next_state
        all_rewards.append(reward_total)
        print("Episode reward: %s. Episode steps: %s" % (reward_total, steps))
    print("Mean Reward:", np.mean(all_rewards))
    print("Max Reward:", np.max(all_rewards))
    print("Min Reward:", np.min(all_rewards))
Code Example #9
def main():
    import roboschool
    import gym
    import chainer
    env = gym.make('CartPole-v0')
    env.reset()
    env.step(env.action_space.sample())
    env = gym.make('RoboschoolHalfCheetah-v1')
    env.reset()
    env.step(env.action_space.sample())
    print("Your environment has been successfully set up!")
Code Example #10
File: main.py Project: megvii-rl/pytorch-gym
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Create envs.
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(0)))

    if evaluation:
        eval_env = gym.make(env_id)
        eval_env = Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    logger.info('total runtime: {}s'.format(time.time() - start_time))
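
The noise_type argument parsed above is a comma-separated list of "<kind>_<stddev>" tokens such as 'adaptive-param_0.2', 'normal_0.1', 'ou_0.2', or 'none'. A standalone sketch of the same parsing convention, with made-up values:

import numpy as np

noise_type = 'adaptive-param_0.2,ou_0.2'   # illustrative value
nb_actions = 3                             # illustrative action dimension
for token in noise_type.split(','):
    kind, stddev = token.strip().split('_')
    print(kind, float(stddev) * np.ones(nb_actions))
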
Code Example #11
File: test_optimizers.py Project: jamescasbon/ray
    def _make_evs(self):
        def make_sess():
            return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))

        local = PolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=PPOPolicyGraph,
            tf_session_creator=make_sess)
        remotes = [
            PolicyEvaluator.as_remote().remote(
                env_creator=lambda _: gym.make("CartPole-v0"),
                policy_graph=PPOPolicyGraph,
                tf_session_creator=make_sess)
        ]
        return local, remotes
Code Example #12
File: ddqn.py Project: tatsuyaokubo/dqn
def main():
    env = gym.make(ENV_NAME)
    agent = Agent(num_actions=env.action_space.n)

    if TRAIN:  # Train mode
        for _ in range(NUM_EPISODES):
            terminal = False
            observation = env.reset()
            for _ in range(random.randint(1, NO_OP_STEPS)):
                last_observation = observation
                observation, _, _, _ = env.step(0)  # Do nothing
            state = agent.get_initial_state(observation, last_observation)
            while not terminal:
                last_observation = observation
                action = agent.get_action(state)
                observation, reward, terminal, _ = env.step(action)
                # env.render()
                processed_observation = preprocess(observation, last_observation)
                state = agent.run(state, action, reward, terminal, processed_observation)
    else:  # Test mode
        # env.monitor.start(ENV_NAME + '-test')
        for _ in range(NUM_EPISODES_AT_TEST):
            terminal = False
            observation = env.reset()
            for _ in range(random.randint(1, NO_OP_STEPS)):
                last_observation = observation
                observation, _, _, _ = env.step(0)  # Do nothing
            state = agent.get_initial_state(observation, last_observation)
            while not terminal:
                last_observation = observation
                action = agent.get_action_at_test(state)
                observation, _, terminal, _ = env.step(action)
                env.render()
                processed_observation = preprocess(observation, last_observation)
                state = np.append(state[1:, :, :], processed_observation, axis=0)
Code Example #13
def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in range(EPISODE):
        state = env.reset()  # initialize task
        for step in range(STEP):  # Train; STEP=300
            action = agent.egreedy_action(state)  # epsilon-greedy action for training: includes randomness for exploration
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)  # store the transition; once enough samples for a batch have accumulated, the network starts training
            state = next_state
            if done: break

        if episode % 100 == 0:  # Test every 100 episodes
            total_reward = 0
            for i in range(TEST):  # TEST = 20
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    # Differences at test time: 1) agent.action(state) picks the action purely from the network output, with no randomness or exploration; 2) perceive() is no longer called, so no training happens here
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done: break
            ave_reward = total_reward/TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
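
The comments above contrast epsilon-greedy action selection during training with purely greedy selection at test time; a minimal sketch of that rule (the helper name and Q-values are illustrative, not this project's API):

import numpy as np

def epsilon_greedy(q_values, epsilon):
    # With probability epsilon explore randomly, otherwise act greedily.
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))

q = np.array([0.1, 0.5, 0.2])
print(epsilon_greedy(q, epsilon=0.1))  # mostly 1, occasionally random
print(epsilon_greedy(q, epsilon=0.0))  # always 1: the greedy test-time choice
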
Code Example #14
def main():
  env = gym.make('MountainCarContinuous-v0')
  ft = FeatureTransformer(env, n_components=100)
  D = ft.dimensions
  pmodel = PolicyModel(ft, D, [], [])
  # init = tf.global_variables_initializer()
  session = tf.InteractiveSession()
  # session.run(init)
  pmodel.set_session(session)
  pmodel.init_vars()
  gamma = 0.99

  if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)

  totalrewards, pmodel = random_search(env, pmodel, gamma)
  print("max reward:", np.max(totalrewards))

  # play 100 episodes and check the average
  avg_totalrewards = play_multiple_episodes(env, 100, pmodel, gamma, print_iters=True)
  print("avg reward over 100 episodes with best models:", avg_totalrewards)

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()
Code Example #15
 def __init__(self, thread_id, master):
     self.thread_id = thread_id
     threading.Thread.__init__(self, name="thread_%d" % thread_id)
     self.env = AtariEnv(gym.make(flags.game))
     self.master = master
     # local network
     if flags.use_lstm:
         self.local_net = A3CLSTMNet(self.env.state_shape, self.env.action_dim, scope="local_net_%d" % thread_id)
     else:
         self.local_net = A3CNet(self.env.state_shape, self.env.action_dim, scope="local_net_%d" % thread_id)
     # sync network
     self.sync = self.sync_network(master.shared_net)
     # accumulate gradients
     self.accum_grads = self.create_accumulate_gradients()
     self.do_accum_grads_ops = self.do_accumulate_gradients()
     self.reset_accum_grads_ops = self.reset_accumulate_gradients()
     # collect summaries for debugging
     summaries = list()
     summaries.append(tf.scalar_summary("entropy/%d" % self.thread_id, self.local_net.entropy))
     summaries.append(tf.scalar_summary("policy_loss/%d" % self.thread_id, self.local_net.policy_loss))
     summaries.append(tf.scalar_summary("value_loss/%d" % self.thread_id, self.local_net.value_loss))
     summaries.append(tf.scalar_summary("total_loss/%d" % self.thread_id, self.local_net.total_loss))
     # apply accumulated gradients
     with tf.device("/gpu:%d" % flags.gpu):
         self.apply_gradients = master.shared_opt.apply_gradients(
             zip(self.accum_grads, master.shared_net.get_vars()), global_step=master.global_step)
         self.summary_op = tf.merge_summary(summaries)
Code Example #16
File: envs.py Project: chriscremer/Other_Code
    def _thunk():

        env = gym.make(env_id)

        is_atari = hasattr(gym.envs, 'atari') and isinstance(env.unwrapped, gym.envs.atari.atari_env.AtariEnv)

        # For Atari environments, apply the NoopReset and MaxAndSkip wrappers
        # (adapted from make_atari), wrapping the env rather than recreating it.
        if is_atari:
            # env = make_atari(env_id)
            #took this from make_atari
            assert 'NoFrameskip' in env.spec.id
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env, skip=4)

        env.seed(seed + rank)

        if log_dir != '':
            env = bench.Monitor(env, os.path.join(log_dir, str(rank)))

        if is_atari:
            warp = False
            env = wrap_deepmind(env, warp=warp)
            env = WrapPyTorch(env)

        return env
Code Example #17
File: PlayGame.py Project: abri-simond/RL
    def play_game(self,env=None):
        if env is None:
            env = gym.make(self.gamename)
            
        obs = env.reset()
        agent = self.agent
        obs_hist = []
        reward_hist = []
        action_hist = []
        while True:
            # Execute
            action = agent.predict(obs)
            obs, reward, done, info = env.step(action)

            # Collect variables
            obs_hist.append(obs)
            reward_hist.append(reward)
            action_hist.append(action)
            if done:
                break

        obs_hist = np.array(obs_hist)
        reward_hist = np.array(reward_hist)
        action_hist = np.array(action_hist)
        #print('Game done.')
        full_result = {'obs' : obs_hist,'action': action_hist, 'reward' : reward_hist}
        
        # Post-process the result according to the agent's own logic
        processed_result = self.agent.process_one_game(full_result)
        return processed_result
Code Example #18
def main():
    env = gym.make("InvertedPendulumSwingupBulletEnv-v0")
    env.render(mode="human")
   
    pi = SmallReactivePolicy(env.observation_space, env.action_space)

    while 1:
        frame = 0
        score = 0
        restart_delay = 0
        obs = env.reset()

        while 1:
            time.sleep(0.05)
            a = pi.act(obs)
            obs, r, done, _ = env.step(a)
            score += r
            frame += 1
            still_open = env.render("human")
            if still_open==False:
                return
            if not done: continue
            if restart_delay==0:
                print("score=%0.2f in %i frames" % (score, frame))
                restart_delay = 60*2  # 2 sec at 60 fps
            else:
                restart_delay -= 1
                if restart_delay > 0: continue
                break
Code Example #19
def test_coexistence(learn_fn, network_fn):
    '''
    Test if more than one model can exist at a time
    '''

    if learn_fn == 'deepq':
        # TODO enable multiple DQN models to be useable at the same time
        # github issue https://github.com/openai/baselines/issues/656
        return

    if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
    make_session(make_default=True, graph=tf.Graph())
    model1 = learn(seed=1)
    make_session(make_default=True, graph=tf.Graph())
    model2 = learn(seed=2)

    model1.step(env.observation_space.sample())
    model2.step(env.observation_space.sample())
Code Example #20
File: ddpg.py Project: shehroze37/deep-rl
def main(_):
    with tf.Session() as sess:

        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        # Ensure action bound is symmetric
        assert (env.action_space.high == -env.action_space.low)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars())

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env = wrappers.Monitor(
                    env, MONITOR_DIR, video_callable=False, force=True)
            else:
                env = wrappers.Monitor(env, MONITOR_DIR, force=True)

        train(sess, env, actor, critic)

        if GYM_MONITOR_EN:
            env.monitor.close()
Code Example #21
File: agent.py Project: robertnishihara/ray
    def _setup(self, config):
        env = self._env_id
        if env:
            config["env"] = env
            if _global_registry.contains(ENV_CREATOR, env):
                self.env_creator = _global_registry.get(ENV_CREATOR, env)
            else:
                import gym  # soft dependency
                self.env_creator = lambda env_config: gym.make(env)
        else:
            self.env_creator = lambda env_config: None

        # Merge the supplied config with the class default
        merged_config = copy.deepcopy(self._default_config)
        merged_config = deep_update(merged_config, config,
                                    self._allow_unknown_configs,
                                    self._allow_unknown_subkeys)
        self.raw_user_config = config
        self.config = merged_config
        Agent._validate_config(self.config)
        if self.config.get("log_level"):
            logging.getLogger("ray.rllib").setLevel(self.config["log_level"])

        # TODO(ekl) setting the graph is unnecessary for PyTorch agents
        with tf.Graph().as_default():
            self._init()
Code Example #22
File: ars.py Project: zhan0903/ARS
    def __init__(self, env_name='HalfCheetah-v1',
                 policy_params=None,
                 num_workers=32, 
                 num_deltas=320, 
                 deltas_used=320,
                 delta_std=0.02, 
                 logdir=None, 
                 rollout_length=1000,
                 step_size=0.01,
                 shift='constant zero',
                 params=None,
                 seed=123):

        logz.configure_output_dir(logdir)
        logz.save_params(params)
        
        env = gym.make(env_name)
        
        self.timesteps = 0
        self.action_size = env.action_space.shape[0]
        self.ob_size = env.observation_space.shape[0]
        self.num_deltas = num_deltas
        self.deltas_used = deltas_used
        self.rollout_length = rollout_length
        self.step_size = step_size
        self.delta_std = delta_std
        self.logdir = logdir
        self.shift = shift
        self.params = params
        self.max_past_avg_reward = float('-inf')
        self.num_episodes_used = float('inf')

        
        # create shared table for storing noise
        print("Creating deltas table.")
        deltas_id = create_shared_noise.remote()
        self.deltas = SharedNoiseTable(ray.get(deltas_id), seed = seed + 3)
        print('Created deltas table.')

        # initialize workers with different random seeds
        print('Initializing workers.') 
        self.num_workers = num_workers
        self.workers = [Worker.remote(seed + 7 * i,
                                      env_name=env_name,
                                      policy_params=policy_params,
                                      deltas=deltas_id,
                                      rollout_length=rollout_length,
                                      delta_std=delta_std) for i in range(num_workers)]


        # initialize policy 
        if policy_params['type'] == 'linear':
            self.policy = LinearPolicy(policy_params)
            self.w_policy = self.policy.get_weights()
        else:
            raise NotImplementedError
            
        # initialize optimization algorithm
        self.optimizer = optimizers.SGD(self.w_policy, self.step_size)        
        print("Initialization of ARS complete.")
Code Example #23
File: tuto_rl.py Project: thbeucher/DQN
def QTable_algo():
    env = gym.make('FrozenLake-v0')

    #initialize table with all zeros
    Q = np.zeros([env.observation_space.n, env.action_space.n])
    #set learning parameters
    lr = .85
    y = .99
    num_episodes = 2000
    #create lists to contain total rewards and steps per episode
    rList = []
    for i in range(num_episodes):
        #reset environment and get first new observation
        s = env.reset()
        rAll = 0
        d = False
        j = 0
        #the Q-Table learning algorithm
        while j < 99:
            j+=1
            #choose an action by greedily (with noise) picking from Q-Table
            a = np.argmax(Q[s,:] + np.random.randn(1,env.action_space.n)*(1./(i+1)))
            #get new state and reward from environment
            s1, r, d,_ = env.step(a)
            #update Q-Table with new knowledge
            Q[s,a] = Q[s,a] + lr*(r + y*np.max(Q[s1,:]) - Q[s,a])
            rAll += r
            s = s1
            if d == True:
                break
        rList.append(rAll)
    print("Score over time: " + str(sum(rList)/num_episodes))
    print("Final Q-Table Values")
    print(Q)
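
The table update inside the loop above is the standard tabular Q-learning rule, Q[s,a] += lr * (r + y * max(Q[s1,:]) - Q[s,a]); the same step as a small standalone helper (names are illustrative):

import numpy as np

def q_learning_update(Q, s, a, r, s1, lr, y):
    # One tabular Q-learning step toward the bootstrapped target.
    Q[s, a] += lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
    return Q

Q = np.zeros([16, 4])                        # FrozenLake-v0 table shape
q_learning_update(Q, s=0, a=1, r=1.0, s1=4, lr=0.85, y=0.99)
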
Code Example #24
def main():
  env = gym.make('CartPole-v0')
  ft = FeatureTransformer(env)
  model = Model(env, ft)
  gamma = 0.99

  if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)


  N = 500
  totalrewards = np.empty(N)
  costs = np.empty(N)
  for n in range(N):
    eps = 1.0/np.sqrt(n+1)
    totalreward = play_one(env, model, eps, gamma)
    totalrewards[n] = totalreward
    if n % 100 == 0:
      print("episode:", n, "total reward:", totalreward, "eps:", eps, "avg reward (last 100):", totalrewards[max(0, n-100):(n+1)].mean())

  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
  print("total steps:", totalrewards.sum())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()

  plot_running_avg(totalrewards)
Code Example #25
File: modelEvaluator.py Project: rgarzonj/PhD_repo
    def runEpisodesForAgent (self,num_episodes,numBlocks):
        ''' Runs numEpisodes of the agent
        ''' 
        
        #numBlocks = 3
        env = gym.make('BlocksWorld-v0')
        env.seed(0)
        env.reset()       
        done = False
#        num_episodes = 1000
        ep_lengths = []
        n = 0
        while (n<num_episodes):    
            steps =1
            done = False
            env.reset()
            next_action = [random.randint(0,numBlocks),random.randint(0,numBlocks)]
            while (done == False):
                obs, reward, done, empty = env.step (next_action)
                print ('Next action ' + str(next_action))
                print ('Obs ' + str(obs))
                next_action = self.agent.sampleAction(obs)
                #env.render()
                steps +=1    
            print (done)
            print ('New episode')
            ep_lengths.append(steps)
            n+=1
        
        print ("Average episode length " + str(sum(ep_lengths) / float(len(ep_lengths))))
            #input("Press Enter to continue...")
        self.ep_lengths = ep_lengths
        return ep_lengths
Code Example #26
def main():
  env = gym.make('MountainCarContinuous-v0')
  ft = FeatureTransformer(env, n_components=100)
  D = ft.dimensions
  pmodel = PolicyModel(D, ft, [])
  vmodel = ValueModel(D, ft, [])
  init = tf.global_variables_initializer()
  session = tf.InteractiveSession()
  session.run(init)
  pmodel.set_session(session)
  vmodel.set_session(session)
  gamma = 0.95

  if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)

  N = 50
  totalrewards = np.empty(N)
  costs = np.empty(N)
  for n in range(N):
    totalreward, num_steps = play_one_td(env, pmodel, vmodel, gamma)
    totalrewards[n] = totalreward
    if n % 1 == 0:
      print("episode:", n, "total reward: %.1f" % totalreward, "num steps: %d" % num_steps, "avg reward (last 100): %.1f" % totalrewards[max(0, n-100):(n+1)].mean())

  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()

  plot_running_avg(totalrewards)
  plot_cost_to_go(env, vmodel)
Code Example #27
File: environment.py Project: Deanout/simple_dqn
 def __init__(self, env_id, args):
   import gym
   self.gym = gym.make(env_id)
   self.obs = None
   self.terminal = None
   # OpenCV expects width first and height second
   self.dims = (args.screen_width, args.screen_height)
Code Example #28
File: run_atari.py Project: IcarusTan/baselines
def train(env_id, num_frames, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
Code Example #29
 def __init__(self):
     env = gym.make(ENV)
     self.env = wrappers.Monitor(env, '/tmp/gym/cartpole_dqn', force=True)
     self.num_states = self.env.observation_space.shape[0]
     self.num_actions = self.env.action_space.n
     self.agent = Agent(self.num_states, self.num_actions)
     self.total_step = np.zeros(10)
Code Example #30
def evaluation(session, graph_ops, saver):
    """
    Evaluate a model.
    """
    ckpt = tf.train.get_checkpoint_state('./model')
    if ckpt and ckpt.model_checkpoint_path:
        print (ckpt.model_checkpoint_path)
    else:
        print ("exit")
    saver.restore(session, ckpt.model_checkpoint_path)
    print("Restored model weights from ", test_model_path)
    monitor_env = gym.make(game)
    monitor_env.monitor.start("qlearning/eval")

    # Unpack graph ops
    s = graph_ops["s"]
    q_values = graph_ops["q_values"]

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=monitor_env,
                           action_repeat=action_repeat)

    for i_episode in xrange(num_eval_episodes):
        s_t = env.get_initial_state()
        ep_reward = 0
        terminal = False
        while not terminal:
            monitor_env.render()
            readout_t = q_values.eval(session=session, feed_dict={s : [s_t]})
            action_index = np.argmax(readout_t)
            s_t1, r_t, terminal, info = env.step(action_index)
            s_t = s_t1
            ep_reward += r_t
        print(ep_reward)
    monitor_env.monitor.close()
Code Example #31
        return self.action_space.sample()

class BiasedAgent(object):
    def __init__(self, action_space):
        self.action_space = action_space
        self.action_always = self.action_space.sample()
    def act(self, observation, reward, done):
        return self.action_always


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('game', nargs="?", default="CartPole-v0")
    args = parser.parse_args()

    env = gym.make(args.game)
    num_episodes = 20
    num_maxstep = 100

    agent_id = 1
    if agent_id == 1:
        agent = RandomAgent(env.action_space)
    elif agent_id == 2:
        agent = BiasedAgent(env.action_space)

    reward = 0
    done = False

    for i_episode in range(num_episodes):
        observation = env.reset()
        for t in range(num_maxstep):
Code Example #32
File: train.py Project: wobushihuair/Paddle-RLBooks
import paddle
import paddle.nn.functional as F
import numpy as np
import gym

batch_size = 256
num_episodes = 100000
memory_size = 1000000
policy_delay = 2
learning_rate = 0.1
gamma = 0.99
ratio = 0.005
exploration_noise = 1e-3
epoch = 0

env = gym.make('Pendulum-v0')
env.seed(1)
paddle.seed(1)
np.random.seed(1)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
min_val = paddle.to_tensor(1e-7).astype('float32')

actor = Actor(state_dim, action_dim, max_action)
actor_optimizer = paddle.optimizer.RMSProp(parameters=actor.parameters(),
                                  learning_rate=learning_rate)

Q_net = Q(state_dim, action_dim)
Q_optimizer = paddle.optimizer.RMSProp(parameters=Q_net.parameters(),
Code Example #33
def test_spec_with_kwargs():
    map_name_value = "8x8"
    env = gym.make("FrozenLake-v1", map_name=map_name_value)
    assert env.spec.kwargs["map_name"] == map_name_value
Code Example #34
 def __init__(self, name, globalAC):
     self.env = gym.make(GAME).unwrapped
     self.name = name
     self.AC = ACNet(name, globalAC)
Code Example #35
File: pong.py Project: BaibhaVatsa/learn-ml
    probs = tf.nn.softmax(logits).numpy()
    action = np.random.choice(6, size = 1, p = probs.flatten())[0]
    return action

def pre_process(image):
    img = image[35:195]
    img = img[::2, ::2, 0]
    img[img == 144] = 0
    img[img == 109] = 0
    img[img != 0] = 1
    return img.astype(np.float).ravel()

if __name__ == "__main__":
    import time
    start = time.process_time()
    env = gym.make('Pong-v4')
    print("Number of obswervations: {}".format(env.observation_space))
    print("Number of allowed actions: {}".format(env.action_space))
    print(tf.__version__)
    print(tf.keras.__version__)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    model = create_model()
    # model.load_weights('model/agentcycle1750-agent99gamma1kepochs')
    # print(model.summary())
    # print('Model loaded successfully!')
    memory = Memory()

    import skvideo.io
    from pyvirtualdisplay import Display
    display = Display(visible=0)
    display.start()
Code Example #36
File: main.py Project: RiddlerQ/LunarLander-V2
import copy

import numpy as np
import pandas as pd
import torch
import random
from matplotlib import pylab as plt
import gym
from collections import deque
import Box2D

env = gym.make('LunarLander-v2')
env.reset()
#-------------------------------------------------------------------------------------------------

def discretize(val,bounds,n_states):
    if val <= bounds[0]:
        discrete_val = 0
    elif val >= bounds[1]:
        discrete_val = n_states-1
    else:
        discrete_val = int(round((n_states-1)*((val-bounds[0])/(bounds[1]-bounds[0]))))
    return discrete_val

def discretize_state(vals,s_bounds,n_s):
    discrete_vals = []
    for i in range(len(n_s)):
        discrete_vals.append(discretize(vals[i],s_bounds[i],n_s[i]))
    return np.array(discrete_vals,dtype=np.int)
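
A usage sketch for the two helpers above, continuing the snippet; the bounds and bucket counts are illustrative, not this project's actual configuration:

s_bounds = [(-1.0, 1.0), (-1.0, 1.0), (-2.0, 2.0), (-2.0, 2.0)]
n_s = [10, 10, 10, 10]
vals = env.reset()[:len(n_s)]                 # first few LunarLander observations
print(discretize_state(vals, s_bounds, n_s))  # e.g. [5 5 4 5]
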
Code Example #37
    n_action = 2
    actions = np.array([0, 1])
    # ----------------------------------------
    # Observation
    # Type: Box(4)
    # Num | Observation   | Min    | Max
    # 0   | Cart Position | -2.4   | 2.4
    # 1   | Cart Velocity | -Inf   | Inf
    # 2   | Pole Angle    | -41.8  | 41.8
    # 3   | Pole Velocity | -Inf   | Inf
    n_input = 4
    observation = []
    # ----------------------------------------
    # Define environment/game
    env_name = 'CartPole-v0'
    env = gym.make(env_name)
    # ----------------------------------------
    # Initialize Neural Q-Learn object
    AI = NeuralQLearner(n_input, actions, batch_size, epsilon, alpha, gamma)
    #AI.plotQ()
    # Initialize experience replay object
    exp = Experience(max_memory)
    # ----------------------------------------
    # Train
    for e in range(epoch):
        # Get initial input
        observation = env.reset()
        observation_init = observation

        # Training for single episode
        step = 0
Code Example #38
File: main.py Project: gongjue/pytorch-gym
    parser.add_argument('--vis', action='store_true', help='visualize each action or not')
    parser.add_argument('--discrete', dest='discrete', action='store_true', help='the actions are discrete or not')
    parser.add_argument('--cuda', dest='cuda', action='store_true')
    # parser.add_argument('--l2norm', default=0.01, type=float, help='l2 weight decay') # TODO

    args = parser.parse_args()
    # StrCat args.output with args.env
    if args.resume is None:
        args.output = get_output_folder(args.output, args.env)
    else:
        args.output = args.resume

    if args.env == "KukaGym":
        env = KukaGymEnv(renders=False, isDiscrete=True)
    elif args.discrete:
        env = gym.make(args.env)
        env = env.unwrapped
    else:
        env = NormalizedEnv(gym.make(args.env))

    # input random seed
    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    # input states count & actions count
    print(env.observation_space.shape, env.action_space.shape)
    nb_states = env.observation_space.shape[0]
    if args.discrete:
        nb_actions = env.action_space.n
    else:
Code Example #39
File: ddqn.py Project: wwongkamjan/pytorch-DQN
    config.gamma = 0.99
    config.epsilon = 1
    config.epsilon_min = 0.01
    config.eps_decay = 500
    config.frames = 160000
    config.use_cuda = True
    config.learning_rate = 1e-3
    config.max_buff = 1000
    config.update_tar_interval = 100
    config.batch_size = 128
    config.print_interval = 200
    config.log_interval = 200
    config.win_reward = 198  # CartPole-v0
    config.win_break = True

    env = gym.make(config.env)
    config.action_dim = env.action_space.n
    config.state_dim = env.observation_space.shape[0]
    agent = DDQNAgent(config)

    if args.train:
        trainer = Trainer(agent, env, config)
        trainer.train()

    elif args.test:
        if args.model_path is None:
            print('please add the model path:', '--model_path xxxx')
            exit(0)
        tester = Tester(agent, env, args.model_path)
        tester.test()
Code Example #40
import gym
import numpy as np
import matplotlib.pyplot as plt

env = gym.make("CartPole-v0")
gamma = 0.99
beta = 0.00001
alpha = 0.000001
sigma = 0.001
w = np.array([0, 0, 0, 0, 0, 0, 0, 0])
delta_w = np.array([0, 0, 0, 0, 0, 0, 0, 0])
v = np.array([0, 0, 0, 0, 0, 0, 0, 0])
delta_v = np.array([0, 0, 0, 0, 0, 0, 0, 0])


def log(log_message):
    """
    
    DESCRIPTION:
    - Adds a log message "log_message" to a log file.
    
    """

    # open the log file and make sure that it's closed properly at the end of the
    # block, even if an exception occurs:
    with open("C:/Users/Fregus/log2.txt", "a") as log_file:
        # write the log message to logfile:
        log_file.write(log_message)
        log_file.write("\n")  # (so the next message is put on a new line)

Code Example #41
    disk_roll_vel = observations[0]
    # roll_angle = observations[2]
    y_linear_speed = observations[4]
    yaw_angle = observations[5]

    state_converted = [disk_roll_vel, y_linear_speed, yaw_angle]

    return state_converted


if __name__ == '__main__':

    rospy.init_node('j2n6s300_gym', anonymous=True, log_level=rospy.WARN)

    # Create the Gym environment
    env = gym.make('j2n6s300Test-v3')
    rospy.loginfo("Gym environment done")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('j2n6s300_ml')
    outdir = pkg_path + '/training_results'
    env = wrappers.Monitor(env, outdir, force=True)
    rospy.loginfo("Monitor Wrapper started")

    last_time_steps = numpy.ndarray(0)

    # Loads parameters from the ROS param server
    # Parameters are stored in a yaml file inside the config directory
    # They are loaded at runtime by the launch file
    Alpha = rospy.get_param("/j2n6s300/alpha")
Code Example #42
File: pendulum_ddpg.py Project: ouj/RL
Q_LEARNING_RATE = 1e-3
GAMMA = 0.99
DECAY = 0.995
ACTION_NOISE = 0.1
MINIMAL_SAMPLES = 10000
MAXIMAL_SAMPLES = 1000000
ITERATIONS = 100000
BATCH_SIZE = 64

MAX_EPISODE_LENGTH = 200

SAVE_CHECKPOINT_EVERY = 100
DEMO_EVERY = 100

# Environment
env = gym.make("Pendulum-v0")

def create_mu_network(
    name,
    output_dim,
    action_max,
    activation=tf.nn.relu,
    output_activation=tf.nn.tanh,
    trainable=True,
):
    return MLPNetwork([
        tf.layers.Dense(
            units=512,
            activation=activation,
            trainable=trainable,
            name="W",
Code Example #43
File: test_main.py Project: sangkeun00/CMU-10703
import copy
from numpy import array
from PIL import Image

from improcess import AtariProcessor
from improcess import HistoryStore
from policy import GreedyPolicy
from policy import UniformRandomPolicy
from memhelpers import NNMemStore
IMAGE_SIZE = (84, 84)
HISTORY_LENGTH = 4

MEM_SIZE = 2000
INIT_MEM_RATIO = 0.5

env = gym.make('BreakoutDeterministic-v0')
observation = env.reset()
num_actions = env.action_space.n

atari_processor = AtariProcessor(IMAGE_SIZE)
history_store = HistoryStore(HISTORY_LENGTH, IMAGE_SIZE)
greedy_selector = GreedyPolicy()
random_selector = UniformRandomPolicy(num_actions)
episode_end_flag = False
mem_store = NNMemStore(MEM_SIZE, (84, 84, 4))
observation = env.reset()
state = atari_processor.state_for_mem(observation)
history_store.add_history(state)
i = 0
life = False
first_step = True
Code Example #44
    is_save_raw_data: bool = False
    is_save_analytica_data: bool = True
    is_save_chart_data: bool = True

    # program execution settings
    is_print_episode_idx: bool = True
    user_input_next_operation: str = "fly"

    # I frequently ran into RAM issues with my GeForce 1050, so the GPU is disabled.
    device = "cpu"  #torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # core procedure start point
    is_ipython = 'inline' in matplotlib.get_backend()
    if is_ipython: pass

    env = gym.make('CartPole-v0').unwrapped
    env.reset()

    rgb_array = env.render('rgb_array')
    processed_screen = CommonUtil.process_screen(rgb_array, device)
    screen_height = processed_screen.shape[2]
    screen_width = processed_screen.shape[3]
    num_of_nn_input_node: int = screen_height * screen_width * 3

    tmp_torch_policy_network = BinaryOutputDeepQNetwork_5096_1024_512_128_64_16_8(
        num_of_nn_input_node).to(device)
    tmp_torch_target_network = BinaryOutputDeepQNetwork_5096_1024_512_128_64_16_8(
        num_of_nn_input_node).to(device)

    deep_q_agent: DeepQAgent = DeepQAgent(epsilon_start, epsilon_end,
                                          epsilon_decay, nn_learning_rate,
Code Example #45
import gym
import gym_reinmav

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

env = gym.make('quadrotor2d-v0')
# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
# env = DummyVecEnv([lambda: env])

model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log="/home/jaeyoung/dev/reinmav-gym/ppo2_quadrotor2d_tensorboard/")
model.learn(total_timesteps=300000, tb_log_name="first_run")

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Code Example #46
File: td3.py Project: Tubbz-alt/TEAC
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='HalfCheetah-v2')
    parser.add_argument('--hid', type=int, default=256)
    parser.add_argument('--l', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--exp_name', type=str, default='td3')
    args = parser.parse_args()

    from spinup.utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed, args.env)

    td3(lambda: gym.make(args.env), actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=[args.hid]*args.l),
        gamma=args.gamma, seed=args.seed, epochs=args.epochs,
        logger_kwargs=logger_kwargs)
Code Example #47
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=50)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--replay-capacity', type=int, default=5000)
    parser.add_argument('--replay-start-size', type=int, default=10**3)
    parser.add_argument('--disable-online-update', action='store_true')
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-2)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--truncation-threshold', type=float, default=5)
    parser.add_argument('--trust-region-delta', type=float, default=0.1)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    if isinstance(action_space, spaces.Box):
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(obs_space.low.size,
                                      n_hidden_channels=args.n_hidden_channels,
                                      n_hidden_layers=args.n_hidden_layers),
            adv=q_functions.FCSAQFunction(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels // 4,
                n_hidden_layers=args.n_hidden_layers),
        )
    else:
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels, action_space.n, wscale=1e-3),
                SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels, action_space.n, wscale=1e-3),
                DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      disable_online_update=args.disable_online_update,
                      use_trust_region=True,
                      trust_region_delta=args.trust_region_delta,
                      truncation_threshold=args.truncation_threshold,
                      beta=args.beta,
                      phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)
Code Example #48
# -*- coding: utf-8 -*-
"""
Created on Thu May 31 10:43:03 2018

@author: vw1586
"""

import gym
import numpy as np
import random
import math

## Initialize the "Cart-Pole" environment
env = gym.make('CartPole-v0')

## Defining the environment related constants

# Number of discrete states (bucket) per state dimension
NUM_BUCKETS = (1, 1, 6, 3)  # (x, x', theta, theta')
# Number of discrete actions
NUM_ACTIONS = env.action_space.n  # (left, right)
# Bounds for each discrete state
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))
STATE_BOUNDS[1] = [-0.5, 0.5]
STATE_BOUNDS[3] = [-math.radians(50), math.radians(50)]
# Index of the action
ACTION_INDEX = len(NUM_BUCKETS)

## Creating a Q-Table for each state-action pair
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS, ))
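With the settings above, the table holds one value per combination of state buckets and action; continuing the snippet:

print(q_table.shape)   # (1, 1, 6, 3, 2): (x, x', theta, theta') buckets x 2 actions
print(NUM_ACTIONS)     # 2 for CartPole-v0 (push left / push right)
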
Code Example #49
import gym
import numpy as np
env = gym.make('SafeUfoReachEnv2-v1')

obs = env.reset()

for i in range(50):

    action = np.array([20])  #env.action_space.sample()

    # print(type(env.ACs))# a tuple
    # print(f"action{action}")
    # print(f"env.ACs{env.ACs}")

    obs, reward, done, info = env.step(action)
    # print(obs)
    # print(reward)
    # print(done)
    # print(info)

    if done:

        # print(f"env.ACs{env.ACs}")\
        print(obs)

        break
print(obs)
print(np.random.randint(50, int(2 / 0.02) - 1))
# print result
#  $ obs = env.reset()
# {'observation': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.60000000e+02,
Code Example #50
File: run_tests.py Project: human-ui/gym-minigrid
import numpy as np
import gym

import gym_minigrid
# from gym_minigrid import wrappers

rng = np.random.RandomState(1337)

env_list = [e for e in gym.envs.registry.env_specs if e.startswith('MiniGrid')]
print(f'{len(env_list)} environments registered')

for env_name in env_list:
    print(f'testing {env_name}')

    # Load the gym environment
    env = gym.make(env_name, seed=1337)
    print(env)

    env.max_steps = min(env.max_steps, 200)
    env.reset()
    env.render('rgb_array')

    # Verify that the same seed always produces the same environment
    for i in range(0, 5):
        seed = 1337 + i
        env.seed(seed)
        grid1 = env.grid
        env.seed(seed)
        grid2 = env.grid
        assert grid1 == grid2
Code Example #51
    SCREEN_SIZE = 1000
    SPARSE_REWARD = False
    SCREEN_SHOT = False
    action_range = 10.0

    env=Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths = LINK_LENGTH, \
    ini_joint_angles=INI_JOING_ANGLES, target_pos = [369,430], render=True, change_goal=False)
    action_space = spaces.Box(low=-1.0,
                              high=1.0,
                              shape=(env.num_actions, ),
                              dtype=np.float32)
    state_space = spaces.Box(low=-np.inf,
                             high=np.inf,
                             shape=(env.num_observations, ))
else:
    env = NormalizedActions(gym.make(ENV))
    action_space = env.action_space
    state_space = env.observation_space
    action_range = 1.

replay_buffer_size = 5e5
replay_buffer = ReplayBufferLSTM2(replay_buffer_size)

# hyper-parameters for RL training
max_episodes = 1000
max_steps = 20 if ENV == 'Reacher' else 150  # Pendulum needs 150 steps per episode to learn well, cannot handle 20
frame_idx = 0
batch_size = 2  # each sample contains an episode for lstm policy
explore_steps = 0  # for random action sampling in the beginning of training
update_itr = 1
hidden_dim = 512
Code Example #52
Generate Expert Trajectories from a model
"""

# env_id = 'NovelGridworld-v2'
# model = DQN('MlpPolicy', env_id, verbose=1)
#
# # Train a DQN agent for 1e5 timesteps and generate 10 trajectories
# # data will be saved in a numpy archive named `expert_+env_id.npz`
# generate_expert_traj(model, 'expert_'+env_id, n_timesteps=int(10), n_episodes=5)

"""
Generate Expert Trajectories from a human expert player
"""

env_id = 'NovelGridworld-v5'
env = gym.make(env_id)

KEY_ACTION_DICT = ENV_KEY[env_id]


def print_play_keys(action_str):
    print("Press a key to play: ")
    for key, key_id in KEY_ACTION_DICT.items():
        print(key, ": ", action_str[key_id])


def human_expert(_obs):
    """
    Random agent. It samples actions randomly
    from the action space of the environment.
Code Example #53
import sys
sys.path.insert(0, "../../")
import numpy as np
import gym
import algorithms as alg
from evaluate import *

env = gym.make("FrozenLake-v0")

print("\nSARSA")
alg.utils.random_seed(env, 1)
Q, history_sarsa = alg.sarsa(
    env, alpha=0.1, gamma=1, epsilon=0.4, N_episodes=10000,
    epsilon_decay=alg.utils.decay_linear)
pi = alg.utils.create_greedy_policy(Q)
print(np.array(
    [np.argmax(pi[s]) for s in range(env.nS)]).reshape(env.nrow, env.ncol))
evaluate_policy(env, pi, 10000, env.nS - 1)

print("\nQ-Learning")
alg.utils.random_seed(env, 1)
Q, history_qlearning = alg.qlearning(
    env, alpha=0.1, gamma=0.99, epsilon=0.5, N_episodes=10000,
    epsilon_decay=alg.utils.decay_linear)
pi = alg.utils.create_greedy_policy(Q)
print(np.array(
    [np.argmax(pi[s]) for s in range(env.nS)]).reshape(env.nrow, env.ncol))
evaluate_policy(env, pi, 10000, env.nS - 1)

print("\nExpected SARSA")
alg.utils.random_seed(env, 1)
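# (The Expected SARSA call itself is cut off in this excerpt. For reference,
#  and independent of the `alg` package used above, the tabular update that
#  Expected SARSA performs looks roughly like the hypothetical helper below,
#  with an epsilon-greedy target policy.)
def expected_sarsa_update(Q, s, a, r, s_next, alpha, gamma, epsilon):
    """One tabular Expected SARSA step on a (nS, nA) array Q."""
    n_actions = Q.shape[1]
    # expectation of Q[s_next, .] under the epsilon-greedy policy derived from Q
    probs = np.full(n_actions, epsilon / n_actions)
    probs[np.argmax(Q[s_next])] += 1.0 - epsilon
    expected_q = np.dot(probs, Q[s_next])
    Q[s, a] += alpha * (r + gamma * expected_q - Q[s, a])
    return Q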
コード例 #54
    def __init__(self, params):

        #############
        ## INIT
        #############

        # Get params, create logger, create TF session
        self.params = params
        self.logger = Logger(self.params['logdir'])
        self.sess = create_tf_session(self.params['use_gpu'],
                                      which_gpu=self.params['which_gpu'])

        # Set random seeds
        seed = self.params['seed']
        tf.set_random_seed(seed)
        np.random.seed(seed)

        #############
        ## ENV
        #############

        # Make the gym environment
        self.env = gym.make(self.params['env_name'])
        self.env.seed(seed)

        # Maximum length for episodes
        self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps

        # Is this env continuous, or self.discrete?
        discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.params['agent_params']['discrete'] = discrete

        # Observation and action sizes
        ob_dim = self.env.observation_space.shape[0]
        ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
        self.params['agent_params']['ac_dim'] = ac_dim
        self.params['agent_params']['ob_dim'] = ob_dim

        # simulation timestep, will be used for video saving
        if 'model' in dir(self.env):
            self.fps = 1 / self.env.model.opt.timestep
        else:
            self.fps = self.env.env.metadata['video.frames_per_second']

        #############
        ## AGENT
        #############

        agent_class = self.params['agent_class']
        self.agent = agent_class(self.sess, self.env,
                                 self.params['agent_params'])

        #############
        ## INIT VARS
        #############

        ## Initialize all of the TF variables (that were created by the agent, etc.)
        self.sess.run(tf.global_variables_initializer())
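# A quick standalone illustration of the discrete/continuous branching above,
# using two classic-control envs (assumes an older gym where the v0 ids used
# throughout this file are still registered):
import gym

for name in ['CartPole-v0', 'Pendulum-v0']:
    e = gym.make(name)
    discrete = isinstance(e.action_space, gym.spaces.Discrete)
    ob_dim = e.observation_space.shape[0]
    ac_dim = e.action_space.n if discrete else e.action_space.shape[0]
    print(name, 'discrete' if discrete else 'continuous', ob_dim, ac_dim)
    # CartPole-v0: discrete,   ob_dim=4, ac_dim=2
    # Pendulum-v0: continuous, ob_dim=3, ac_dim=1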
コード例 #55
parser.add_argument('--lr_start', default=0.001, type=float)
parser.add_argument('--lr_end', default=0.0005, type=float)
parser.add_argument('--eps_start', default=1, type=float)
parser.add_argument('--eps_end', default=0.1, type=float)
parser.add_argument('--nsteps', default=100000, type=int, help='total steps')

parser.add_argument('--framehistory', default=1, type=int, help='number of images into network')
parser.add_argument('--buffersize', default=1000, type=int, help='replay buffer size')
parser.add_argument('--batchsize', default=4, type=int, help='minibatch size')


args = parser.parse_args()
print(args)

env = gym.make(args.env)


lr = args.lr_start

dqn = DQN(6400, env.action_space.n)
target_dqn = DQN(6400, env.action_space.n)
eps_vals = np.linspace(args.eps_end, args.eps_start, args.nsteps)

total_rewards = []
count = 0
run_avg = 0
running_avg_rew = []
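# Note: np.linspace(args.eps_end, args.eps_start, args.nsteps) above runs *up*
# from eps_end to eps_start, so a decaying schedule presumably reads it from
# the back. A hypothetical helper (not part of the original script) doing that:
def epsilon_at(step, schedule):
    idx = min(step, len(schedule) - 1)
    return schedule[len(schedule) - 1 - idx]
# epsilon_at(0, eps_vals) -> args.eps_start; epsilon_at(args.nsteps - 1, eps_vals) -> args.eps_end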

for episode in range(args.episodes):
コード例 #56

def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = []
    for rewards in all_rewards:
        all_discounted_rewards.append(
            helper_discount_rewards(rewards, discount_rate))

    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]
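# helper_discount_rewards() is referenced above but defined outside this
# excerpt; a standard implementation (an assumption about the original) is the
# discounted cumulative sum over one episode's rewards:
def helper_discount_rewards(rewards, discount_rate):
    discounted_rewards = np.zeros(len(rewards))
    cumulative_rewards = 0.0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards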


env = gym.make("CartPole-v0")

num_game_rounds = 100
max_game_steps = 1000
num_iterations = 100
discount_rate = 0.95

with tf.Session() as sess:
    new_saver = tf.train.import_meta_graph('.models/my-650-step-model.meta')
    new_saver.restore(sess, '.models/my-650-step-model')
    # sess.run(init)

    for iteration in range(num_iterations):
        print("Currently on Iteration: {} \n".format(iteration))
        all_rewards = []
        all_gradients = []
コード例 #57
ファイル: ppo.py プロジェクト: ViRu-ThE-ViRuS/elvis_is_alive
            total_reward += reward
            n_steps += 1

            if timestep % agent.update_interval == 0:
                loss = agent.learn()
                losses.extend(loss)

        rewards.append(total_reward)
        steps.append(n_steps)

        if episode % (episodes // 10) == 0 and episode != 0:
            print(f'{episode:5d} : {np.mean(rewards):06.2f} '
                  f': {np.mean(losses):06.4f} : {np.mean(steps):06.2f}')
            rewards = []
            # losses = [0]
            steps = []

    print(f'{episode:5d} : {np.mean(rewards):06.2f} '
          f': {np.mean(losses):06.4f} : {np.mean(steps):06.2f}')
    return losses, rewards
# }}}


if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    # env = gym.make('LunarLander-v2')
    agent = Agent(0.99, env.observation_space.shape, [env.action_space.n],
                  update_interval=2000, K=4, c1=1.0)

    learn(env, agent, 1000)
コード例 #58
def run(seed, episodes, evaluation_episodes, batch_size, gamma, inverting_gradients, initial_memory_threshold,
        replay_memory_size, save_dir,
        epsilon_steps, epsilon_final, tau_actor, tau_critic, use_ornstein_noise,
        learning_rate_actor, learning_rate_critic, clip_grad, layers, initialise_params, title):
    env = gym.make('Platform-v0')
    env = ScaledStateWrapper(env)

    initial_params_ = [3., 10., 400.]
    for a in range(env.action_space.spaces[0].n):
        initial_params_[a] = 2. * (initial_params_[a] - env.action_space.spaces[1].spaces[a].low) / (
                env.action_space.spaces[1].spaces[a].high - env.action_space.spaces[1].spaces[a].low) - 1.

    env = PlatformFlattenedActionWrapper(env)
    env = ScaledParameterisedActionWrapper(env)

    dir = os.path.join(save_dir, title)
    env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False, force=True)
    env.seed(seed)
    np.random.seed(seed)

    agent = PADDPGAgent(observation_space=env.observation_space.spaces[0],
                        action_space=env.action_space,
                        batch_size=batch_size,
                        learning_rate_actor=learning_rate_actor,
                        learning_rate_critic=learning_rate_critic,
                        epsilon_steps=epsilon_steps,
                        epsilon_final=epsilon_final,
                        gamma=gamma,
                        clip_grad=clip_grad,
                        tau_actor=tau_actor,
                        tau_critic=tau_critic,
                        initial_memory_threshold=initial_memory_threshold,
                        use_ornstein_noise=use_ornstein_noise,
                        replay_memory_size=replay_memory_size,
                        inverting_gradients=inverting_gradients,
                        adam_betas=(0.9, 0.999),
                        critic_kwargs={'hidden_layers': layers, 'init_type': "kaiming"},
                        actor_kwargs={'hidden_layers': layers, 'init_type': "kaiming", 'init_std': 0.0001,
                                      'squashing_function': False},
                        seed=seed)
    print(agent)
    if initialise_params:
        initial_weights = np.zeros((env.action_space.spaces[0].n, env.observation_space.spaces[0].shape[0]))
        initial_bias = np.zeros(env.action_space.spaces[0].n)
        for a in range(env.action_space.spaces[0].n):
            initial_bias[a] = initial_params_[a]
        agent.set_action_parameter_passthrough_weights(initial_weights, initial_bias)

    max_steps = 250
    total_reward = 0.
    returns = []
    start_time = time.time()
    for i in range(episodes):
        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_actions, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_actions, next_all_action_parameters = agent.act(next_state)
            next_action = pad_action(next_act, next_act_param)
            agent.step(state, (act, act_param, all_actions, all_action_parameters), reward, next_state,
                       (next_act, next_act_param, next_all_actions, next_all_action_parameters), terminal, steps)
            act, act_param, all_actions, all_action_parameters = next_act, next_act_param, next_all_actions, next_all_action_parameters
            action = next_action
            state = next_state  # .copy()

            episode_reward += reward

            if terminal:
                break
        agent.end_episode()

        returns.append(episode_reward)
        total_reward += episode_reward
        if (i + 1) % 100 == 0:
            print('{0:5s} R:{1:.5f}'.format(str(i + 1), total_reward / (i + 1)))
    end_time = time.time()
    print("Took %.2f seconds" % (end_time - start_time))

    env.close()

    returns = env.get_episode_rewards()
    print("Ave. return =", sum(returns) / len(returns))
    print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.)
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =", sum(evaluation_returns) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_returns)
コード例 #59
    def env(self):
        return gym.make('orthogonal-single-boundary-v0')
コード例 #60
LOG_DIR = './log'
N_WORKERS = 2  #multiprocessing.cpu_count()
MAX_EP_STEP = 200
MAX_GLOBAL_EP = 2000
MAX_R = -1600
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 10
GAMMA = 0.9
ENTROPY_BETA = 0.01
LR_A = 0.0001  # learning rate for actor
LR_C = 0.001  # learning rate for critic
GLOBAL_MEAN_R = []
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

env = gym.make(GAME)

N_S = env.observation_space.shape[0]
N_A = env.action_space.shape[0]
A_BOUND = [env.action_space.low, env.action_space.high]


class ACNet(object):
    def __init__(self, scope, globalAC=None):

        if scope == GLOBAL_NET_SCOPE:  # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:  # local net, calculate losses
            with tf.variable_scope(scope):