Example 1
def weight_example():
    import matplotlib.pyplot as plt
    # s2_means = np.array([-100.0, 10.0])
    # s2_vars = np.array([1.0, 10.0])
    # c_mean = 5.0
    # hist = []
    # var_avg = []
    # mean_disp = []
    # for log_c_var in np.arange(-2.5, 5.0, 0.1):
    #     outputs = adfq_fun.posterior_adfq(s2_means, s2_vars, c_mean, np.exp(log_c_var), 0.0, 0.9, terminal=0, varTH=1e-10)
    #     hist.append(outputs[1])
    #     var_avg.append(np.sum(outputs[2][1]*outputs[2][2]))

    # f, ax = plt.subplots()
    # ax.plot(np.exp(np.arange(-2.5, 5.0, 0.1)), np.exp(np.arange(-2.5, 5.0, 0.1)), 'k--')
    # ax.plot(np.exp(np.arange(-2.5, 5.0, 0.1)), hist)
    # ax.plot(np.exp(np.arange(-2.5, 5.0, 0.1)), var_avg)
    # ax.plot(np.exp(np.arange(-2.5, 5.0, 0.1)), np.array(hist)-np.array(var_avg))
    # ax.legend(['prior', 'new variance', 'avg variance', 'mean dispersion'])
    # plt.show()
    # pdb.set_trace()
    hist = []

    X = np.arange(-20, 20, 0.1)
    Y = np.arange(-20, 20, 0.1)
    n_vars = np.array([100.0, 100.0])
    for n1 in X:
        for n2 in Y:
            n_means = np.array([n1, n2])
            outputs = adfq_fun.posterior_adfq(n_means,
                                              n_vars,
                                              0.0,
                                              1000.0,
                                              0.0,
                                              0.9,
                                              terminal=0,
                                              varTH=1e-10)
            hist.append(outputs[1])
    hist = np.reshape(hist, (X.shape[0], Y.shape[0]))
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401, registers the 3d projection
    # indexing='ij' matches the fill order of hist (n1 over rows, n2 over columns)
    X, Y = np.meshgrid(X, Y, indexing='ij')
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')  # fig.gca(projection=...) no longer works on recent matplotlib
    surf = ax.plot_wireframe(X, Y, hist, rstride=10, cstride=10)
    ax.set_xlabel('s_tp1 mean 1')
    ax.set_ylabel('s_tp1 mean 2')
    ax.set_zlabel('Variance update')
    #fig.colorbar(surf, shrink=0.5, aspect=5)
    plt.show()

    pdb.set_trace()
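
These examples call adfq_fun.posterior_adfq with the next-state means and variances, the current (s, a) mean and variance, the reward, and the discount; the return value provides the updated mean, the updated variance, and per-action statistics. A minimal non-batch sketch, assuming numpy and the project's adfq_fun module are importable (the numbers are illustrative only):

import numpy as np
import adfq_fun  # assumed importable from the coco66/ADFQ project

n_means = np.array([0.0, 5.0])     # means of Q(s', .) over actions
n_vars = np.array([10.0, 10.0])    # variances of Q(s', .)
c_mean, c_var = 0.0, 10.0          # current belief on Q(s, a)
reward, discount = 1.0, 0.9        # illustrative values

new_mean, new_var, stats = adfq_fun.posterior_adfq(
    n_means, n_vars, c_mean, c_var, reward, discount,
    terminal=0, varTH=1e-10)
# stats holds the per-action quantities printed in Example 3:
# stats[0] = mu_bs, stats[1] = var_bs, stats[2] = k_bs (the mixture weights).
print(new_mean, new_var)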
Example 2
def fun(rewards, test_num):
    alpha = 0.5
    print("Test%d: r=%.2f, r=%.2f" % (test_num, rewards[0], rewards[1]))
    s2_means = np.array([0.0, 0.0])
    s2_vars = np.array([10.0, 10.0])
    c_mean = s2_means[0]
    c_var = s2_vars[0]
    Q = np.zeros((3, 2))

    outputs = adfq_fun.posterior_adfq(s2_means,
                                      s2_vars,
                                      c_mean,
                                      c_var,
                                      rewards[0],
                                      0.9,
                                      terminal=0,
                                      varTH=1e-10)
    s2_means[0] = outputs[0]
    s2_vars[0] = outputs[1]
    print("t=1 mean: ", s2_means)
    print("t=1 var: ", s2_vars)
    outputs = adfq_fun.posterior_adfq(s2_means,
                                      s2_vars,
                                      c_mean,
                                      c_var,
                                      rewards[1],
                                      0.9,
                                      terminal=0,
                                      varTH=1e-10)
    s2_means[0] = outputs[0]
    s2_vars[0] = outputs[1]
    print("t=2 mean: ", s2_means)
    print("t=2 var: ", s2_vars)

    print("Before Q: ", Q[1, :])
    for r in rewards:
        Q[1, 0] = (1 - alpha) * Q[1, 0] + alpha * (r + 0.9 * max(Q[1, :]))
    print("After Q: ", Q[1, :])
Example 3
def adfq_update(q_means, q_vars, sars_tuples):
    print("\nInitial ADFQ Mean and Variance")
    print("means:")
    print(q_means[:2, :])
    print("vars:")
    print(q_vars[:2, :])
    visits = np.zeros(q_means.shape)
    for (i, sars) in enumerate(sars_tuples):
        visits[sars[0], sars[1]] += 1.0
        print("t=%d" % i)
        prior = [q_means[sars[0], sars[1]], q_vars[sars[0], sars[1]]]
        targets = [
            sars[2] + GAMMA * q_means[sars[-1], :],
            GAMMA * GAMMA * q_vars[sars[-1], :]
        ]
        td_err = sars[2] + GAMMA * q_means[sars[-1], :] - q_means[sars[0],
                                                                  sars[1]]
        outputs = adfq_fun.posterior_adfq(q_means[sars[-1], :],
                                          q_vars[sars[-1], :],
                                          q_means[sars[0], sars[1]],
                                          q_vars[sars[0], sars[1]],
                                          sars[2],
                                          GAMMA,
                                          terminal=0,
                                          varTH=1e-10)

        print("sars:", sars)
        q_means[sars[0], sars[1]] = outputs[0]
        q_vars[sars[0], sars[1]] = outputs[1]
        print("means:")
        print(q_means[:2, :])
        print("vars:")
        print(q_vars[:2, :])
        print("mu_bs:", outputs[2][0].astype(np.float16))
        print("target:", sars[2] + GAMMA * q_means[sars[-1], :])
        print("var_bs:", outputs[2][1].astype(np.float16))
        print("k_bs:", outputs[2][2].astype(np.float16))
        print("TD error:", td_err)
        if i > 11:
            display_update(prior, targets, outputs)
            q_tab[sars[0],
                  sars[1]] = q_update(sars,
                                      q_tab,
                                      alpha=1. / (visits[sars[0], sars[1]]))
            print("If Q-learning with alpha:%.2f" %
                  (1. / (visits[sars[0], sars[1]])))
            print(q_tab)
        else:
            q_tab = np.copy(q_means)
    return q_means, q_vars, q_tab
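
From the indexing above, each element of sars_tuples is a (state, action, reward, next_state) tuple and q_means / q_vars are state-by-action arrays. A minimal sketch of calling adfq_update with hypothetical sizes and transitions (GAMMA, q_update and display_update are assumed to be defined at module level, as in the project):

import numpy as np

n_states, n_actions = 5, 2                       # hypothetical sizes
q_means = np.zeros((n_states, n_actions))        # prior means
q_vars = 100.0 * np.ones((n_states, n_actions))  # wide prior variances

# (state, action, reward, next_state) transitions, hypothetical values
sars_tuples = [(0, 1, 0.0, 1), (1, 0, 1.0, 2), (2, 1, 0.0, 0)]

q_means, q_vars, q_tab = adfq_update(q_means, q_vars, sars_tuples)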
Example 4
def td_err_effect():
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    s2_means = np.array([0.0, 5.0])
    s2_vars = np.array([10.0, 10.0])
    c_var = 1.0
    td_errs = []
    var_hist = []
    comp = []
    for c_mean in np.arange(-10.0, 20.0, 0.5):
        #s2_means[1] = c_mean -10.0
        # if len(comp) > 1:
        #     if not(comp[-1][0]) and not(comp[-1][1]):
        #         pdb.set_trace()
        outputs = adfq_fun.posterior_adfq(s2_means,
                                          s2_vars,
                                          c_mean,
                                          c_var,
                                          0.0,
                                          0.9,
                                          terminal=0,
                                          varTH=1e-10)
        td_errs.append(0.0 + 0.9 * s2_means - c_mean)
        td_targets = 0.9 * s2_means
        comp.append([
            outputs[2][0][0] < td_targets[1], outputs[2][0][1] < td_targets[0]
        ])
        var_hist.append(outputs[1])

    #td_errs = np.abs(td_errs)
    td_errs = np.array(td_errs)
    pdb.set_trace()
    ax.plot(td_errs[:, 0], td_errs[:, 1], var_hist, 'bo-')
    ax.set_xlabel('TD error 1')
    ax.set_ylabel('TD error 2')
    ax.set_zlabel('Variance update')
    plt.show()
Example 5
File: adfq.py Project: coco66/ADFQ
    def learning(self,
                 actionPolicy,
                 actionParam=None,
                 updatePolicy='adfq',
                 eval_greedy=False,
                 draw=False,
                 varTH=1e-5,
                 updateParam=None,
                 asymptotic=False,
                 asymptotic_trigger=1e-8,
                 useScale=False,
                 noise=0.0,
                 noise_c=0.0,
                 batch_size=0):
        """train with ADFQ
		Parameters
		----------
			actionPolicy : action policy. See "action_selection" function below.
			actionParam : a hyperparameter for the chosen action policy if necessary.
			updatePolicy : 'adfq' for the ADFQ algorithm. 'numeric' for the ADFQ-Numeric update. 'adfq-v2' for the ADFQ V2 update (appendix).
			eval_greedy : True to evaluate the current policy during learning.
			draw : True to print out the simulation (for grid and maze domains)
			varTH : variance thereshold
			asymptotic : True to use the asymptotic update
			asymptotic_trigger : a value to decide when to start the asymptotic update if "asymptotic==True"
			useScale : use the scaling trick.
			noise : for stochastic case, you can add a small noise to the variance[s,a]
			batch_size : batch size. 0 if you don't use experience replay.
		"""
        if len(self.rewards) == self.env.timeH:
            print("The object has already learned")
            return None

        if (actionPolicy
                == 'offline') and (len(actionParam) != self.env.timeH):
            print(len(actionParam), self.env.timeH)
            raise ValueError(
                'The given action trajectory does not match with the number of learning steps.'
            )

        np.random.seed()
        self.varTH = varTH

        if batch_size > 0:
            s = self.env.reset(self.np_random)
            while (len(self.replayMem[(0, 0)]) < self.memory_size):
                a = np.random.choice(self.env.anum)
                r, s_n, done = self.env.observe(s, a, self.np_random)
                self.store({
                    'state': s,
                    'action': a,
                    'reward': r,
                    'state_n': s_n,
                    'terminal': done
                })

        s = self.env.reset(self.np_random)
        self.log_scale = 0.0

        while (self.step < self.env.timeH):
            if self.step % (int(self.env.timeH / util.EVAL_NUM)) == 0:
                self.Q_err.append(self.err())

            a = self.action_selection(s, actionPolicy, actionParam)

            # Observation
            r, s_n, done = self.env.observe(s, a, self.np_random)

            self.rewards.append(r)
            self.visits[s][a] += 1
            if batch_size > 0:
                self.store({
                    'state': s,
                    'action': a,
                    'reward': r,
                    'state_n': s_n,
                    'terminal': done
                })
                batch = self.get_batch(s, a, batch_size)
                n_means = self.means[batch['state_n'], :]
                n_vars = self.vars[batch['state_n'], :]
                c_mean = self.means[batch['state'], batch['action']]
                c_var = self.vars[batch['state'], batch['action']]
                reward = batch['reward']
                terminal = batch['terminal']
            else:
                # Record
                self.states.append(s)
                self.actions.append(a)
                n_means = self.means[s_n]
                n_vars = self.vars[s_n]
                c_mean = self.means[s][a]
                c_var = self.vars[s][a]
                reward = r
                terminal = done
            # Update
            self.varTH = varTH / np.exp(self.log_scale, dtype=util.DTYPE)
            if (updatePolicy == 'adfq'):
                new_mean, new_var, _ = adfq_fun.posterior_adfq(
                    n_means,
                    n_vars,
                    c_mean,
                    c_var,
                    reward,
                    self.discount,
                    terminal,
                    scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                    varTH=self.varTH,
                    asymptotic=asymptotic,
                    asymptotic_trigger=asymptotic_trigger,
                    noise=noise / (1. + self.visits[s][a]),
                    noise_c=noise_c / (1. + self.visits[s][a]),
                    batch=(batch_size > 0))

            elif updatePolicy == 'numeric':
                new_mean, new_var, _ = adfq_fun.posterior_numeric(
                    n_means,
                    n_vars,
                    c_mean,
                    c_var,
                    reward,
                    self.discount,
                    terminal,
                    scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                    varTH=self.varTH,
                    noise=noise / (1. + self.visits[s][a]),
                    noise_c=noise_c / (1. + self.visits[s][a]),
                    batch=(batch_size > 0))

            elif (updatePolicy == 'adfq-v2'):
                new_mean, new_var, _ = adfq_fun.posterior_adfq_v2(
                    n_means,
                    n_vars,
                    c_mean,
                    c_var,
                    reward,
                    self.discount,
                    terminal,
                    scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                    varTH=self.varTH,
                    asymptotic=asymptotic,
                    asymptotic_trigger=asymptotic_trigger,
                    noise=noise,
                    batch=(batch_size > 0))

            elif updatePolicy == 'hybrid':
                new_mean, new_var, _ = adfq_fun.posterior_hybrid(
                    n_means,
                    n_vars,
                    c_mean,
                    c_var,
                    reward,
                    self.discount,
                    terminal,
                    scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                    varTH=self.varTH,
                    noise=noise,
                    batch=(batch_size > 0))

            else:
                raise ValueError("No such update policy")

            self.means[s][a] = np.mean(new_mean)
            self.vars[s][a] = np.mean(
                new_var)  #np.maximum(self.varTH, new_var)

            if useScale:
                delta = np.log(np.mean(self.vars[self.env.eff_states, :]))
                self.vars[self.env.eff_states, :] = np.exp(
                    np.log(self.vars[self.env.eff_states, :]) - delta,
                    dtype=np.float64)
                self.log_scale = np.maximum(-100.0, self.log_scale + delta)

            if draw:
                self.draw(s, a, self.step, r)

            if eval_greedy and ((self.step + 1) %
                                (int(self.env.timeH / util.EVAL_NUM)) == 0):
                count, rew, _, _ = self.greedy_policy(
                    lambda x: self.get_action_egreedy(x, util.EVAL_EPS))
                self.test_counts.append(count)
                self.test_rewards.append(rew)
            s = self.env.reset(self.np_random) if done else s_n
            self.step += 1
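
Based on the docstring and the dispatch above, a call to learning() might look like the sketch below. The policy name and numeric values are placeholders (the valid actionPolicy options live in action_selection, which is not shown here); the updatePolicy values 'adfq', 'numeric', 'adfq-v2' and 'hybrid' are the ones the method actually accepts.

# `agent` is assumed to be an already-constructed instance of the class in
# adfq.py that defines learning(); all argument values below are placeholders.
agent.learning(actionPolicy='egreedy',   # assumed policy name
               actionParam=0.1,          # e.g. an exploration rate
               updatePolicy='adfq',      # or 'numeric', 'adfq-v2', 'hybrid'
               eval_greedy=True,
               useScale=True,
               batch_size=0)             # 0 disables experience replay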
Example 6
def batch_test():
    # Test example
    n_means = np.array(
        [[
            0.8811533, 0.9458812, 0.78800523, 0.7855228, 0.8548022, 1.0071998,
            0.8347546, 0.8921166, 0.9238528, 0.92809606, 0.9720104, 0.8862572
        ],
         [
             0.818557, 0.7337841, 1.1270436, 1.239617, 1.5188323, 1.0134017,
             1.0220736, 1.0529686, 1.1472604, 1.0060027, 0.79905474, 1.0601841
         ],
         [
             8.18557, 0.7337841, 1.1270436, -1.239617, 1.5188323, 1.0134017,
             -10.220736, 1.0529686, -11.472604, 1.0060027, 79.905474, 1.0601841
         ]],
        dtype=np.float32)
    n_vars = np.array(
        [[
            56.72953, 50.670547, 57.65268, 57.328403, 58.38417, 57.436073,
            56.893364, 54.29608, 57.82789, 51.035233, 50.11123, 56.001694
        ],
         [
             62.661198, 69.064156, 48.007538, 44.541973, 28.289375, 49.62694,
             56.438583, 51.90718, 51.234684, 46.22086, 53.456276, 41.784817
         ],
         [
             0.626611, 69.064156, 48.007538, 0.44541973, 0.282893, 49.62694,
             56.438583, 51.90718, 0.051234, 46.22086, 0.053456, 41.784817
         ]],
        dtype=np.float32)
    c_mean = np.array([0.9727963, 0.78518265, 6.2386910], dtype=np.float32)
    c_var = np.array([50.026917, 69.16801, 10.0012345], dtype=np.float64)
    reward = np.array([-0.0119087, -0., 1.0], dtype=np.float64)
    discount = 0.99
    terminal = [0, 1, 0]

    out_batch = adfq_fun.posterior_adfq(n_means,
                                        n_vars,
                                        c_mean,
                                        c_var,
                                        reward,
                                        discount,
                                        terminal=terminal,
                                        batch=True)
    for i in range(len(n_means)):
        out = adfq_fun.posterior_adfq(n_means[i],
                                      n_vars[i],
                                      c_mean[i],
                                      c_var[i],
                                      reward[i],
                                      discount,
                                      terminal=terminal[i])
        if np.abs(out[0] - out_batch[0][i]) > 1e-5:
            print("MISMATCH Mean for ENTRY.%d:" % i, out[0], out_batch[0][i])
        else:
            print("PASS Mean for ENTRY.%d" % i)

        if np.abs(out[1] - out_batch[1][i]) > 1e-5:
            print("MISMATCH Variance for ENTRY.%d:" % i, out[1],
                  out_batch[1][i])
        else:
            print("PASS Variance for ENTRY.%d" % i)
Example 7
    def learning(self,
                 actionPolicy,
                 actionParam,
                 updatePolicy='adfq',
                 eval_greedy=False,
                 draw=False,
                 varTH=1e-10,
                 updateParam=None,
                 asymptotic=False,
                 asymptotic_trigger=1e-8,
                 useScale=False,
                 noise=0.0,
                 batch_size=0,
                 change=True,
                 beta=0.0):
        """train with ADFQ
		Parameters
		----------
			actionPolicy : action policy. See "action_selection" function below.
			actionParam : a hyperparameter for the chosen action policy if necessary.
			updatePolicy : 'adfq' for the ADFQ algorithm. 'numeric' for the ADFQ-Numeric update. 'adfq-v2' for the ADFQ V2 update (appendix).
			eval_greedy : True to evaluate the current policy during learning.
			draw : True to print out the simulation (for grid and maze domains)
			varTH : variance thereshold
			asymptotic : True to use the asymptotic update
			asymptotic_trigger : a value to decide when to start the asymptotic update if "asymptotic==True"
			useScale : use the scaling trick.
			noise : for stochastic case, you can add a small noise to the variance[s,a]
			batch_size : batch size. 0 if you don't use experience replay.
		"""
        if len(self.rewards) == self.env.timeH:
            print("The object has already learned")
            return None

        if (actionPolicy
                == 'offline') and (len(actionParam) != self.env.timeH):
            print(len(actionParam), self.env.timeH)
            raise ValueError(
                'The given action trajectory does not match with the number of learning steps.'
            )

        np.random.seed()
        self.Q_target = np.array(self.env.optQ(self.discount))
        self.varTH = varTH

        records = {'t': [], 'k': [], 'var': [], 'mean': []}

        if batch_size > 0:
            s = self.env.reset(self.np_random)
            while (len(self.replayMem[(0, 0)]) < self.memory_size):
                a = np.random.choice(self.env.anum)
                r, s_n, done = self.env.observe(s, a, self.np_random)
                self.store({
                    'state': s,
                    'action': a,
                    'reward': r,
                    'state_n': s_n,
                    'terminal': done
                })

        s = self.env.reset(self.np_random)
        self.log_scale = 0.0

        temp = []
        while (self.step < self.env.timeH):
            if change and (self.step
                           == self.env.changePt):  # 0.5*self.env.timeH):
                self.env.change()
                self.Q_target = np.array(
                    self.env.optQ(self.discount, changed=True))

            if self.step % (int(self.env.timeH / util.EVAL_NUM)) == 0:
                self.Q_err.append(self.err())

            a = self.action_selection(s, actionPolicy, actionParam)

            # Observation
            r, s_n, done = self.env.observe(s, a, self.np_random)

            self.rewards.append(r)
            self.visits[s][a] += 1
            if batch_size > 0:
                self.store({
                    'state': s,
                    'action': a,
                    'reward': r,
                    'state_n': s_n,
                    'terminal': done
                })
                batch = self.get_batch(s, a, batch_size)
                n_means = self.means[batch['state_n'], :]
                n_vars = self.vars[batch['state_n'], :]
                c_mean = self.means[batch['state'], batch['action']]
                c_var = self.vars[batch['state'], batch['action']]
                reward = batch['reward']
                terminal = batch['terminal']
            else:
                # Record
                self.states.append(s)
                self.actions.append(a)
                n_means = self.means[s_n]
                n_vars = self.vars[s_n]
                c_mean = self.means[s][a]
                c_var = self.vars[s][a]
                reward = r
                terminal = done
            # Update
            self.varTH = varTH / np.exp(self.log_scale, dtype=util.DTYPE)
            if (updatePolicy == 'adfq'):
                new_mean, new_var, stats = adfq_fun.posterior_adfq(
                    n_means,
                    n_vars,
                    c_mean,
                    c_var,
                    reward,
                    self.discount,
                    terminal,
                    scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                    varTH=self.varTH,
                    asymptotic=asymptotic,
                    asymptotic_trigger=asymptotic_trigger,
                    noise=noise,
                    batch=(batch_size > 0))

            elif updatePolicy == 'numeric':
                new_mean, new_var, _ = adfq_fun.posterior_numeric(
                    n_means,
                    n_vars,
                    c_mean,
                    c_var,
                    reward,
                    self.discount,
                    terminal,
                    scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                    varTH=self.varTH,
                    noise=noise,
                    batch=(batch_size > 0))

            elif (updatePolicy == 'adfq-v2'):
                new_mean, new_var, stats = adfq_fun.posterior_adfq_v2(
                    n_means,
                    n_vars,
                    c_mean,
                    c_var,
                    reward,
                    self.discount,
                    terminal,
                    scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                    varTH=self.varTH,
                    asymptotic=asymptotic,
                    asymptotic_trigger=asymptotic_trigger,
                    noise=noise,
                    batch=(batch_size > 0))

            elif updatePolicy == 'hybrid':
                new_mean, new_var, _ = adfq_fun.posterior_hybrid(
                    n_means,
                    n_vars,
                    c_mean,
                    c_var,
                    reward,
                    self.discount,
                    terminal,
                    scale_factor=np.exp(self.log_scale, dtype=util.DTYPE),
                    varTH=self.varTH,
                    noise=noise,
                    batch=(batch_size > 0))

            else:
                raise ValueError("No such update policy")

            # Note: `stats` below comes from the 'adfq' / 'adfq-v2' branches above;
            # the 'numeric' and 'hybrid' branches discard it.
            td_err = reward + self.discount * n_means - c_mean  #np.clip(np.abs(reward + self.discount*n_means - c_mean), 0.1, 10.0)
            add_vars = c_var + self.discount**2 * n_vars
            #penalty = np.dot(stats[2], norm.cdf(td_err,0.0, 0.001*np.sqrt(add_vars)))-0.5
            #penalty = 50*(np.tanh(0.1*(np.dot(stats[2],td_err**2/add_vars)-50.0))+1.0)
            gate_bound = 1.0
            penalty = np.dot(stats[2], td_err**2 / add_vars)
            gate_const = 1.0 if penalty > gate_bound else 0.0
            #penalty *= gate_const
            steepness = 0.01
            midpoint = 5.0
            penalty = gate_const * 30.0 / (1. + np.exp(-steepness *
                                                       (penalty - midpoint)))
            temp.append([np.dot(stats[2], td_err**2 / add_vars), penalty])
            if s == 1 and a == 3:
                records['t'].append(self.step)
                records['k'].append(stats[2])
            records['mean'].append(copy.deepcopy(self.means))
            records['var'].append(copy.deepcopy(self.vars))
            #print("t:%d, var:%.4f, penalty:%.4f"%(self.step,new_var, penalty))

            self.means[s][a] = np.mean(new_mean)
            self.vars[s][a] = np.mean(
                new_var) + beta * penalty  #np.maximum(self.varTH, new_var)

            if useScale:
                delta = np.log(np.mean(self.vars[self.env.eff_states, :]))
                self.vars[self.env.eff_states, :] = np.exp(
                    np.log(self.vars[self.env.eff_states, :]) - delta,
                    dtype=np.float64)
                self.log_scale = np.maximum(-100.0, self.log_scale + delta)

            if draw:
                #self.var_plot()
                self.draw(s, a, self.step, r)

            if eval_greedy and ((self.step + 1) %
                                (int(self.env.timeH / util.EVAL_NUM)) == 0):
                count, rew, _, _ = self.greedy_policy(
                    lambda x: self.get_action_egreedy(x, util.EVAL_EPS))
                self.test_counts.append(count)
                self.test_rewards.append(rew)
            s = self.env.reset(self.np_random) if done else s_n
            self.step += 1
        return records, temp
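
The variance penalty inside the loop is a gated logistic function of the mixture-weighted, normalized squared TD errors. The helper below re-expresses that computation in isolation; it is a sketch that assumes stats[2] holds the ADFQ mixture weights k_bs, as in the earlier examples.

import numpy as np

def variance_penalty(k, td_err, add_vars, beta,
                     gate_bound=1.0, steepness=0.01, midpoint=5.0, scale=30.0):
    """Mirrors the penalty term in Example 7.

    k        : mixture weights over next-state actions (stats[2])
    td_err   : reward + discount * n_means - c_mean
    add_vars : c_var + discount**2 * n_vars
    """
    raw = np.dot(k, td_err ** 2 / add_vars)   # weighted, normalized squared TD error
    gate = 1.0 if raw > gate_bound else 0.0   # suppress the penalty for small errors
    penalty = gate * scale / (1.0 + np.exp(-steepness * (raw - midpoint)))
    return beta * penalty                     # Example 7 adds this to the new variance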