class DPendulum:
    """Discretized wrapper around the continuous Pendulum model.

    States and controls are handled as discrete indices; the module-level
    helpers (x2i, i2x, c2d, d2c, d2cq, d2cu) convert between index and
    continuous representations.
    """

    def __init__(self):
        # Single-joint pendulum, integrated with the module-level DT
        # using 5 internal sub-steps.
        self.pendulum = Pendulum(1)
        self.pendulum.DT = DT
        self.pendulum.NDT = 5

    @property
    def nqv(self):
        """Discretization resolution for position and velocity."""
        return [NQ, NV]

    @property
    def nx(self):
        """Total number of discrete states."""
        return NQ * NV

    @property
    def nu(self):
        """Number of discrete controls."""
        return NU

    @property
    def goal(self):
        """Index of the goal state (zero angle, zero velocity)."""
        return x2i(c2d([0., 0.]))

    def reset(self, x=None):
        """Reset to discrete state index *x*, or to a random state.

        Returns the index of the new current state.
        Raises AssertionError if the resulting state is not a 2-vector.
        """
        if x is None:
            x = [np.random.randint(0, NQ), np.random.randint(0, NV)]
        else:
            x = i2x(x)
        # Explicit check instead of a bare `assert`, which is silently
        # stripped when Python runs with -O.
        if len(x) != 2:
            raise AssertionError('state must be a [iq, iv] pair')
        self.x = x
        return x2i(self.x)

    def step(self, iu):
        """Apply discrete control index *iu*; return (next_index, reward).

        Reward is sparse: 1 exactly at the goal index, else 0.
        """
        self.x = self.dynamics(self.x, iu)
        reward = 1 if x2i(self.x) == self.goal else 0
        return x2i(self.x), reward

    def render(self):
        """Display the current configuration and pause one time step."""
        q = d2cq(self.x[0])
        self.pendulum.display(np.matrix([q, ]))
        time.sleep(self.pendulum.DT)

    def dynamics(self, ix, iu):
        """One integration step from discrete state *ix* under control *iu*.

        Returns the next discrete state index.
        NOTE(review): the return maps `x` back to indices, which presumes
        pendulum.dynamics updates `x` in place; the raw continuous result
        is kept in self.xc — confirm against the Pendulum class.
        """
        x = np.matrix(d2c(ix)).T
        u = d2cu(iu)
        self.xc, _ = self.pendulum.dynamics(x, u)
        return c2d(x.T.tolist()[0])
class DPendulum:
    """Pendulum environment over a discretized state/control space."""

    def __init__(self):
        # Build and configure the one-joint continuous model.
        model = Pendulum(1)
        model.DT = DT
        model.NDT = 5
        self.pendulum = model

    @property
    def nqv(self):
        # Grid resolution along position and velocity.
        return [NQ, NV]

    @property
    def nx(self):
        # Size of the discrete state space.
        return NQ * NV

    @property
    def nu(self):
        # Size of the discrete control space.
        return NU

    @property
    def goal(self):
        # Index of the upright, motionless configuration.
        return x2i(c2d([0., 0.]))

    def reset(self, x=None):
        """Start a new episode from index *x* (random when omitted)."""
        if x is not None:
            x = i2x(x)
        else:
            iq = np.random.randint(0, NQ)
            iv = np.random.randint(0, NV)
            x = [iq, iv]
        assert (len(x) == 2)
        self.x = x
        return x2i(self.x)

    def step(self, iu):
        """Advance one step under control index *iu*; return (index, reward)."""
        self.x = self.dynamics(self.x, iu)
        ix = x2i(self.x)
        # Unit reward only when the goal index is hit exactly.
        reward = 1 if ix == self.goal else 0
        return ix, reward

    def render(self):
        """Draw the pendulum at its current angle, then wait one DT."""
        angle = d2cq(self.x[0])
        self.pendulum.display(np.matrix([angle, ]))
        time.sleep(self.pendulum.DT)

    def dynamics(self, ix, iu):
        """Integrate the continuous model one step from discrete state *ix*."""
        xc = np.matrix(d2c(ix)).T
        uc = d2cu(iu)
        self.xc, _ = self.pendulum.dynamics(xc, uc)
        # Map the (in-place updated) continuous state back to indices.
        return c2d(xc.T.tolist()[0])
# Fields of one replay-buffer transition:
#   x      -- observation at the current state
#   u      -- control applied
#   reward -- per-step reward (here the time step env.DT)
#   done   -- terminal flag (True only on the final stored transition)
#   x2     -- observation after the transition
#   value  -- cost-to-go along the demonstrated trajectory
ReplayItem = namedtuple('ReplayItem', 'x u reward done x2 value')
ReplayItem.__new__.__defaults__ = (None, )  # make the trailing `value` optional

replayDeque = deque()

### Data
for d in data:
    T = d.cost
    for x, u, t in zip(d.X, d.U, d.T):
        # Roll the dynamics one step to get the successor state, then
        # store the transition in observation space.
        x2 = np.asarray(env.dynamics(np.matrix(x).T, np.matrix(u).T)[0].flat)
        o = env.obs(np.matrix(x).T).flat
        o2 = env.obs(np.matrix(x2).T).flat
        replayDeque.append(
            ReplayItem(x=o, u=u.copy(), reward=env.DT, done=False,
                       x2=o2, value=T - t))
        # if t > T*.9: break                  # avoid trajectory ends
        # if len(replayDeque) > BATCH_SIZE: break

# NOTE(review): only the very last transition of the whole library is
# flagged done; if each trajectory end should be terminal, this belongs
# inside the outer loop — confirm intent.
replayDeque[-1] = replayDeque[-1]._replace(done=True)

# print() with a single argument behaves identically under Python 2 and 3
# (the original `print '...'` statement was Python-2-only).
print('Done loading the motion lib')