def test_predict(self):
        """
        Test case for AC-learning and Q-learning predictions
        """
        num_actions = 4

        def test(input, ct, max):
            action_counter = [0] * num_actions
            total = 3000
            for i in range(total):
                actions, states = ct.predict(inputs=input)
                assert not states, "states should be empty"
                ## actions["action"] is a batch of actions
                for a in actions["action"]:
                    action_counter[a[0]] += 1

            if max:
                ### if max, some action will always be chosen (which action is
                ### chosen depends on the network initialization)
                count = 0
                for i in range(num_actions):
                    prob = action_counter[i] / float(sum(action_counter))
                    if abs(prob - 1.0) < 1e-1:
                        count = count + 1
                self.assertEqual(count, 1)
            else:
                ### the actions should be uniform
                for i in range(num_actions):
                    prob = action_counter[i] / float(sum(action_counter))
                    self.assertAlmostEqual(prob, 1.0 / num_actions, places=1)

        dims = 100

        q_cnn = SimpleQ(model=TestModelCNN(
            width=84, height=84, num_actions=num_actions))

        q = SimpleQ(model=SimpleModelQ(
            dims=[dims],
            num_actions=num_actions,
            perception_net=nn.Sequential(
                nn.Linear(dims, 32, bias=False),
                nn.ReLU(),
                nn.Linear(32, 16, bias=False),
                nn.ReLU())))

        batch_size = 10
        height, width = 84, 84
        sensor = np.zeros([batch_size, dims]).astype("float32")
        image = np.zeros([batch_size, 1, height, width]).astype("float32")

        ct0 = ComputationTask("test", algorithm=q_cnn)
        ct1 = ComputationTask("test", algorithm=q)

        test(dict(image=image), ct0, max=False)
        test(dict(sensor=sensor), ct1, max=True)
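
## A minimal self-contained sketch (plain NumPy, independent of the framework
## above) of the check that `test` performs: a greedy (max) policy collapses
## the empirical action distribution onto a single action, while a random
## policy stays roughly uniform. The helper below is illustrative only.
import numpy as np

def action_histogram(select_action, num_actions=4, total=3000):
    counter = np.zeros(num_actions)
    for _ in range(total):
        counter[select_action()] += 1
    return counter / counter.sum()

rng = np.random.RandomState(0)
q_values = rng.randn(4)  ## a fixed "network output" for a constant input
greedy = action_histogram(lambda: int(np.argmax(q_values)))
uniform = action_histogram(lambda: rng.randint(4))
assert abs(greedy.max() - 1.0) < 1e-1         ## one action dominates
assert np.allclose(uniform, 0.25, atol=1e-1)  ## close to uniform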
Example #2
    def test_predict(self):
        """
        Test case for AC-learning and Q-learning predictions
        """
        num_actions = 4

        def test(input, ct, max):
            action_counter = [0] * num_actions
            total = 2000
            for i in range(total):
                actions, states = ct.predict(inputs=input)
                assert not states, "states should be empty"
                ## actions["action"] is a batch of actions
                for a in actions["action"]:
                    action_counter[a[0]] += 1

            if max:
                ### if max, the first action will always be chosen
                for i in range(num_actions):
                    prob = action_counter[i] / float(sum(action_counter))
                    self.assertAlmostEqual(prob,
                                           1.0 if i == 0 else 0.0,
                                           places=1)
            else:
                ### the actions should be uniform
                for i in range(num_actions):
                    prob = action_counter[i] / float(sum(action_counter))
                    self.assertAlmostEqual(prob, 1.0 / num_actions, places=1)

        dims = 100

        q_cnn = SimpleQ(
            model=TestModelCNN(width=84, height=84, num_actions=num_actions))

        q = SimpleQ(model=SimpleModelQ(
            dims=dims,
            num_actions=num_actions,
            mlp=nn.Sequential(nn.Linear(dims, 32, bias=False), nn.ReLU(),
                              nn.Linear(32, 16, bias=False), nn.ReLU(),
                              nn.Linear(16, num_actions, bias=False))))

        batch_size = 10
        height, width = 84, 84
        sensor = np.zeros([batch_size, dims]).astype("float32")
        image = np.zeros([batch_size, 1, height, width]).astype("float32")

        ct0 = ComputationTask(algorithm=q_cnn)
        ct1 = ComputationTask(algorithm=q)

        test(dict(image=image), ct0, max=False)
        test(dict(sensor=sensor), ct1, max=True)
Example #3
    def __init__(self, ct_settings, log_settings=dict()):
        """
            Initialize `Manager`. `ct_settings` is used to create
            `ComputationTask`; The parameters in `ct_settings` are for each
            `ComputationTask`.
        """
        ## default settings
        log_settings_ = dict(print_interval=100,
                             load_model=False,
                             model_dir="",
                             pass_num=0,
                             model_save_interval=10,
                             log_file="")
        ## update with the user provided ones
        log_settings_.update(log_settings)
        log_settings = log_settings_

        self.agents = []
        self.logger = GameLogger(
            timeout=1,
            print_interval=log_settings["print_interval"],
            model_save_interval=log_settings["model_save_interval"],
            log_file=log_settings["log_file"])
        self.cts = {}
        self.CDPs = {}
        for name, setting in ct_settings.items():
            setting["model_dir"] = log_settings["model_dir"]
            setting["pass_num"] = log_settings["pass_num"]
            setting["load_model"] = log_settings["load_model"]
            self.cts[name] = ComputationTask(name, **setting)
            self.CDPs[name] = self.cts[name].CDP
            self.logger.model_save_signals.append(
                self.cts[name].model_save_signal)
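
## The constructor above merges user-provided `log_settings` into a dict of
## defaults. A minimal sketch of the same pattern with a `None` default, which
## avoids re-using one mutable default dict across calls (the helper name is
## hypothetical, not part of the framework):
def merge_log_settings(log_settings=None):
    defaults = dict(print_interval=100, load_model=False, model_dir="",
                    pass_num=0, model_save_interval=10, log_file="")
    defaults.update(log_settings or {})  ## user-provided values win
    return defaults

## e.g. merge_log_settings(dict(print_interval=10))["model_dir"] == ""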
Example #4
    def __init__(self,
                 ct_settings,
                 log_settings=dict(
                     print_interval=100, model_dir="",
                     model_save_interval=10)):
        """
            Initialize `Manager`. `ct_settings` is used to create
            `ComputationTask` instances; the parameters in `ct_settings`
            are passed to each `ComputationTask`.
        """
        self.agents = []
        self.cts = {}
        self.CDPs = {}
        for name, setting in ct_settings.items():
            setting["model_dir"] = log_settings["model_dir"]
            self.cts[name] = ComputationTask(name, **setting)
            self.CDPs[name] = self.cts[name].CDP
        self.logger = GameLogger(
            timeout=1,
            print_interval=log_settings["print_interval"],
            model_save_interval=log_settings["model_save_interval"],
            cts=self.cts)
Example #5
    def test_ct_para_sharing(self):
        """
        Test case for two CTs sharing parameters
        """
        alg = TestAlgorithm(
            model=SimpleModelDeterministic(dims=10, mlp=nn.Linear(10, 10)))
        ct0 = ComputationTask(algorithm=alg)
        ct1 = ComputationTask(algorithm=alg)

        batch_size = 10
        sensor = np.random.uniform(
            0, 1, [batch_size, alg.model.dims]).astype("float32")

        outputs0, _ = ct0.predict(inputs=dict(sensor=sensor))
        outputs1, _ = ct1.predict(inputs=dict(sensor=sensor))
        self.assertEqual(np.sum(outputs0["continuous_action"].flatten()),
                         np.sum(outputs1["continuous_action"].flatten()))
    def test_ct_para_copy(self):
        """
        Test case for two CTs copying parameters
        """

        alg = TestAlgorithm(model=SimpleModelDeterministic(
            dims=[10], perception_net=nn.Linear(10, 10)))

        ct0 = ComputationTask("test", algorithm=alg)
        ct1 = ComputationTask("test", algorithm=deepcopy(alg))

        batch_size = 10
        sensor = np.random.uniform(
            0, 1, [batch_size] + ct0.alg.model.dims).astype("float32")

        outputs0, _ = ct0.predict(inputs=dict(sensor=sensor))
        outputs1, _ = ct1.predict(inputs=dict(sensor=sensor))
        self.assertEqual(
            np.sum(outputs0["action"].flatten()),
            np.sum(outputs1["action"].flatten()))
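
## A PyTorch-only sketch of the property these two tests rely on: two wrappers
## holding the same nn.Module share parameters, while deepcopy yields an
## independent copy (the variables below are illustrative, not framework APIs).
import torch
import torch.nn as nn
from copy import deepcopy

model = nn.Linear(10, 10)
shared_a, shared_b = model, model      ## two "CTs" built from the same model
copied = deepcopy(model)               ## a second CT built from a deep copy

with torch.no_grad():
    model.weight.add_(1.0)             ## update the original parameters

assert torch.equal(shared_a.weight, shared_b.weight)  ## sharing: both see the update
assert not torch.equal(model.weight, copied.weight)   ## copy: keeps the old values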
Example #7
    def test_ct_learning(self):
        """
        Test training
        """
        num_actions = 2
        dims = 100
        batch_size = 8
        sensor = np.ones(
            [batch_size, dims]).astype("float32") / dims  # normalize
        next_sensor = np.zeros([batch_size, dims]).astype("float32")

        for on_policy in [True, False]:
            if on_policy:
                alg = SimpleAC(model=SimpleModelAC(
                    dims=dims,
                    num_actions=num_actions,
                    mlp=nn.Sequential(
                        nn.Linear(dims, 64, bias=False),
                        nn.ReLU(),
                        nn.Linear(64, 32, bias=False),
                        nn.ReLU())))
                ct = ComputationTask(
                    "test", algorithm=alg, hyperparas=dict(lr=1e-1))
            else:
                alg = SimpleQ(
                    model=SimpleModelQ(
                        dims=dims,
                        num_actions=num_actions,
                        mlp=nn.Sequential(
                            nn.Linear(dims, 64, bias=False),
                            nn.ReLU(),
                            nn.Linear(64, 32, bias=False),
                            nn.ReLU(),
                            nn.Linear(32, num_actions, bias=False))),
                    update_ref_interval=100)
                ct = ComputationTask(
                    "test", algorithm=alg, hyperparas=dict(lr=1e-1))

            for i in range(1000):
                if on_policy:
                    outputs, _ = ct.predict(inputs=dict(sensor=sensor))
                    actions = outputs["action"]
                else:
                    ## randomly assemble a batch
                    actions = np.random.choice(
                        [0, 1], size=(batch_size, 1),
                        p=[0.5, 0.5]).astype("int")
                rewards = (1.0 - actions).astype("float32")
                cost = ct.learn(
                    inputs=dict(sensor=sensor),
                    next_inputs=dict(sensor=next_sensor),
                    next_alive=dict(alive=np.zeros(
                        (batch_size, 1)).astype("float32")),
                    actions=dict(action=actions),
                    rewards=dict(reward=rewards))

            ### the policy should bias towards the first action
            outputs, _ = ct.predict(inputs=dict(sensor=sensor))
            for a in outputs["action"]:
                self.assertEqual(a[0], 0)
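
## A tabular sketch of why the learned policy above should prefer action 0:
## the reward is `1 - action`, so action 0 earns +1 and action 1 earns 0, and
## a plain value update on terminal transitions drives Q(0) above Q(1).
import numpy as np

q = np.zeros(2)
lr = 0.1
rng = np.random.RandomState(0)
for _ in range(1000):
    a = rng.randint(2)
    reward = 1.0 - a
    q[a] += lr * (reward - q[a])  ## next state is terminal, so the target is just the reward
assert np.argmax(q) == 0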
Example #8
    def test_gym_games(self):
        """
        Test games in OpenAI gym.
        """

        games = ["MountainCar-v0", "CartPole-v0", "Pendulum-v0"]
        final_rewards_thresholds = [
            -1.5,  ## drive to the right top in 150 steps (timeout is -2.0)
            1.5,  ## hold the pole for at least 150 steps
            -3.0  ## can swing the pendulum to the top most of the time
        ]
        on_policies = [False, True, False]
        discrete_actions = [True, True, False]

        for game, threshold, on_policy, discrete_action in \
            zip(games, final_rewards_thresholds, on_policies, discrete_actions):

            env = gym.make(game)
            state_shape = env.observation_space.shape[0]
            if discrete_action:
                num_actions = env.action_space.n
            else:
                num_actions = env.action_space.shape[0]

            hidden_size = 256

            mlp = nn.Sequential(
                nn.Linear(state_shape, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU())

            q_model = SimpleModelQ(
                dims=state_shape,
                num_actions=num_actions,
                mlp=nn.Sequential(mlp, nn.Linear(hidden_size, num_actions)))

            if on_policy:
                alg = SimpleSARSA(model=q_model, epsilon=0.1)
                # alg = SuccessorRepresentationQ(
                #     ## much slower than SARSA because of more things to learn
                #     model=SimpleSRModel(
                #         dims=state_shape,
                #         hidden_size=hidden_size,
                #         num_actions=num_actions, ),
                #     exploration_end_steps=20000)
            else:
                if discrete_action:
                    alg = SimpleQ(
                        model=q_model,
                        exploration_end_steps=200000,
                        update_ref_interval=100)
                else:
                    alg = OffPolicyAC(
                        model=GaussianPolicyModel(
                            dims=state_shape,
                            action_dims=num_actions,
                            mlp=mlp,
                            std=1.0),
                        epsilon=0.2)

            glog.info("algorithm: " + alg.__class__.__name__)

            ct = ComputationTask("RL", algorithm=alg, hyperparas=dict(lr=1e-4))
            batch_size = 32
            if not on_policy:
                train_every_steps = batch_size // 4
                buffer_size_limit = 200000

            max_episode = 10000

            average_episode_reward = []
            past_exps = []
            max_steps = env._max_episode_steps
            for n in range(max_episode):
                ob = env.reset()
                episode_reward = 0
                alive = 1
                for t in range(max_steps):
                    inputs = dict(sensor=np.array([ob]).astype("float32"))
                    res, _ = ct.predict(inputs=inputs)

                    ## when discrete_action is True, this is a scalar
                    ## otherwise it's a floating vector
                    pred_action = res["action"][0]

                    ## end before the env wrongly gives game_over=True for a timeout case
                    if t == max_steps - 1:
                        past_exps.append(
                            (inputs, res, dict(reward=[[0]]),
                             dict(alive=[[-1]])))  ## -1 denotes timeout
                        break
                    elif (not alive):
                        past_exps.append((inputs, res, dict(reward=[[0]]),
                                          dict(alive=[[alive]])))
                        break
                    else:
                        next_ob, reward, next_is_over, _ = env.step(
                            pred_action[0] if discrete_action else pred_action)
                        reward /= 100
                        episode_reward += reward
                        past_exps.append((inputs, res, dict(reward=[[reward]]),
                                          dict(alive=[[alive]])))

                    ## only for off-policy training we use a circular buffer
                    if (not on_policy) and len(past_exps) > buffer_size_limit:
                        past_exps.pop(0)

                    ## compute the learning condition
                    learn_cond = False
                    if on_policy:
                        learn_cond = (len(past_exps) >= batch_size)
                    else:
                        learn_cond = (
                            t % train_every_steps == train_every_steps - 1)

                    if learn_cond:
                        exps = sample(past_exps, batch_size)
                        sampled_inputs, next_sampled_inputs, sampled_actions, \
                            next_sampled_actions, reward, next_alive = unpack_exps(exps)
                        cost = ct.learn(
                            inputs=sampled_inputs,
                            next_inputs=next_sampled_inputs,
                            next_alive=next_alive,
                            actions=sampled_actions,
                            next_actions=next_sampled_actions,
                            rewards=reward)
                        ## we clear the exp buffer for on-policy
                        if on_policy:
                            past_exps = []

                    ob = next_ob
                    ### bool must be converted to int for correct computation
                    alive = 1 - int(next_is_over)

                if n % 50 == 0:
                    glog.info("episode reward: %f" % episode_reward)

                average_episode_reward.append(episode_reward)
                if len(average_episode_reward) > 20:
                    average_episode_reward.pop(0)

                ### once the threshold is hit, we don't bother running further
                if sum(average_episode_reward) / len(
                        average_episode_reward) > threshold:
                    glog.info(
                        "Test terminates early due to threshold satisfied!")
                    break

            ### compare the average episode reward to reduce variance
            self.assertGreater(
                sum(average_episode_reward) / len(average_episode_reward),
                threshold)
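
## The gym tests above and below keep a sliding window of the last 20 episode
## rewards by appending to a list and popping the front, then compare the
## window mean to the threshold. A minimal equivalent with collections.deque
## (illustrative values only):
from collections import deque

average_episode_reward = deque(maxlen=20)   ## old entries fall out automatically
for episode_reward in [0.1, 0.2, 0.3]:      ## illustrative rewards
    average_episode_reward.append(episode_reward)
mean_reward = sum(average_episode_reward) / len(average_episode_reward)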
    def test_gym_games(self):
        """
        Test games in OpenAI gym.
        """

        games = ["MountainCar-v0", "CartPole-v0"]
        final_rewards_thresholds = [
            -1.8,  ## drive to the right top in 180 steps (timeout is -2.0)
            1.5  ## hold the pole for at least 150 steps
        ]

        for game, threshold in zip(games, final_rewards_thresholds):
            for on_policy in [False, True]:

                if on_policy and game != "CartPole-v0":
                    ## SimpleAC has difficulty training mountain-car and acrobot
                    continue

                env = gym.make(game)
                state_shape = env.observation_space.shape[0]
                num_actions = env.action_space.n

                mlp = nn.Sequential(nn.Linear(state_shape, 128), nn.ReLU(),
                                    nn.Linear(128, 128), nn.ReLU(),
                                    nn.Linear(128, 128), nn.ReLU())

                if on_policy:
                    alg = SimpleAC(model=SimpleModelAC(dims=state_shape,
                                                       num_actions=num_actions,
                                                       mlp=mlp),
                                   hyperparas=dict(lr=1e-3))
                else:
                    alg = SimpleQ(
                        model=SimpleModelQ(
                            dims=state_shape,
                            num_actions=num_actions,
                            mlp=nn.Sequential(mlp, nn.Linear(128, num_actions))),
                        hyperparas=dict(lr=1e-4),
                        exploration_end_batches=25000,
                        update_ref_interval=100)

                print "algorithm: " + alg.__class__.__name__

                ct = ComputationTask(algorithm=alg)
                batch_size = 16
                if not on_policy:
                    train_every_steps = batch_size // 4
                    buffer_size_limit = 100000

                max_episode = 5000

                average_episode_reward = []
                past_exps = []
                max_steps = env._max_episode_steps
                for n in range(max_episode):
                    ob = env.reset()
                    episode_reward = 0
                    for t in range(max_steps):
                        res, _ = ct.predict(inputs=dict(
                            sensor=np.array([ob]).astype("float32")))
                        pred_action = res["action"][0][0]

                        next_ob, reward, next_is_over, _ = env.step(
                            pred_action)
                        reward /= 100
                        episode_reward += reward

                        past_exps.append((ob, next_ob, [pred_action], [reward],
                                          [not next_is_over]))
                        ## only for off-policy training we use a circular buffer
                        if (not on_policy) and len(past_exps) > buffer_size_limit:
                            past_exps.pop(0)

                        ## compute the learning condition
                        learn_cond = False
                        if on_policy:
                            learn_cond = (len(past_exps) >= batch_size)
                            exps = past_exps  ## directly use all exps in the buffer
                        else:
                            learn_cond = (
                                t % train_every_steps == train_every_steps - 1)
                            exps = sample(past_exps,
                                          batch_size)  ## sample some exps

                        if learn_cond:
                            sensor, next_sensor, action, reward, next_episode_end \
                                = unpack_exps(exps)
                            cost = ct.learn(
                                inputs=dict(sensor=sensor),
                                next_inputs=dict(next_sensor=next_sensor),
                                next_episode_end=dict(
                                    next_episode_end=next_episode_end),
                                actions=dict(action=action),
                                rewards=dict(reward=reward))
                            ## we clear the exp buffer for on-policy
                            if on_policy:
                                past_exps = []

                        ob = next_ob

                        ## end before the Gym wrongly gives game_over=True for a timeout case
                        if t == max_steps - 2 or next_is_over:
                            break

                    if n % 50 == 0:
                        print("episode reward: %f" % episode_reward)

                    average_episode_reward.append(episode_reward)
                    if len(average_episode_reward) > 20:
                        average_episode_reward.pop(0)

                ### compare the average episode reward to reduce variance
                self.assertGreater(
                    sum(average_episode_reward) / len(average_episode_reward),
                    threshold)
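
## Both gym tests above implement the replay buffer as a plain list, calling
## `pop(0)` once a size limit is exceeded and sampling minibatches from it.
## A minimal sketch of the same behaviour behind a small class (a hypothetical
## helper, not a framework component):
import random
from collections import deque

class ReplayBuffer(object):
    def __init__(self, capacity):
        self.exps = deque(maxlen=capacity)  ## oldest experiences drop off first

    def add(self, exp):
        self.exps.append(exp)

    def sample(self, batch_size):
        return random.sample(list(self.exps), batch_size)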