Esempio n. 1
0
    def iterate(self, XA, X1, u, A_low, A_high, ITER=50, Ascaled=False, plot=True, xargs=None,
                output=True, a=0, b=0, pc_samp=1, maxT=60000, eta=0.8, tilesg=False,
                sg_prop=0.96, sg_samp=1, sg_points=100, sgmem_max=0.4, plotiter=False,
                test=False, NS=False):
        """Run ``ITER`` rounds of fitted Q-iteration, then refit the policy function.

        Steps, in order:

        1. Build a grid of state points from ``X1`` (``buildgrid`` or, when
           ``tilesg`` is true, ``TilecodeSamplegrid``).
        2. On the first call (``self.first``) fit ``self.W_f`` and
           ``self.V_f`` to zeros on that grid.
        3. Evaluate ``A_low`` / ``A_high`` at every grid point.
        4. Round 0: ``Q = u + beta * V_f.predict(X1)``, full ``Q_f.fit``,
           then ``self.maximise``.  Rounds 1..ITER-1 reuse stored work via
           ``V_f.fast_values()`` and ``Q_f.partial_fit``.
        5. Refit ``self.W_f`` on the ``(state, W_opt)`` pair returned by the
           last ``self.maximise`` call and record the relative change of
           its predictions in ``self.pe``.

        Parameters
        ----------
        XA : array
            Training inputs handed to ``self.Q_f.fit``; ``XA.shape[0]`` is
            the sample count (presumably state-action samples - confirm).
        X1 : array
            Inputs handed to ``self.V_f.predict`` and used to build the
            state grid (presumably next-period states - confirm).
        u : array or scalar
            Added to ``self.beta * V`` to form the Q targets.
        A_low, A_high : callable
            Called as ``f(x)`` for each grid row ``x`` or, when ``Ascaled``
            is true, as ``f(x, Ws)`` with ``Ws = self.W_f.predict(x)``.
        ITER : int
            Number of rounds.  Must be at least 1: round 0 always runs and
            writes ``self.value_error[0]``.
        Ascaled : bool
            Selects the two-argument bound call and fixes the ``pa``/``pb``
            arguments of ``Q_f.fit`` to 0 and 1.
        plot : bool
            When true, plot ``self.W_f`` and ``self.V_f`` at the end.
        xargs : list, optional
            Forwarded to ``self.maximise`` and to the ``plot`` calls.
            ``None`` (the default) means a fresh empty list.
        output, plotiter, NS :
            Forwarded unchanged to ``self.maximise`` (``NS`` also to the
            ``W_f`` / ``V_f`` fits).
        a, b, pc_samp, eta :
            Forwarded unchanged to ``self.Q_f.fit``.
        maxT : int
            Upper bound on the sample count used in the ``scale`` argument
            of ``Q_f.fit`` (``1 / min(T, maxT)``).
        tilesg, sg_prop, sg_samp, sg_points, sgmem_max :
            State-grid construction options (see step 1).
        test : bool
            Not used by this method; kept so existing callers still work.

        Side effects
        ------------
        Resets ``self.v_e`` and ``self.p_e`` to 0, replaces
        ``self.value_error``, clears ``self.first``, sets ``self.pe``,
        prints timings and (optionally) shows plots.  Returns ``None``.
        """
        # Fresh list per call: a shared ``[]`` default would carry any
        # in-place change made by maximise/plot over into later calls.
        if xargs is None:
            xargs = []

        solve_start = time()

        self.v_e = 0        # Value function error
        # NOTE(review): the policy change computed below is stored in
        # ``self.pe``; ``self.p_e`` is only ever reset here.
        self.p_e = 0        # Policy function error

        T = XA.shape[0]

        self.value_error = np.zeros(ITER)

        # ------------------
        #   State grid
        # ------------------
        if not tilesg:
            grid, _ = buildgrid(X1, sg_points, self.radius, scale=True, stopnum=X1.shape[0])
        else:
            nn = int(X1.shape[0] * sg_samp)
            # Separate timer so the overall solve timer is not overwritten.
            grid_start = time()
            tile = TilecodeSamplegrid(X1.shape[1], 25, mem_max=sgmem_max, cores=self.CORES)
            grid = tile.fit(X1[0:nn], self.radius, prop=sg_prop)
            grid_time = time() - grid_start
            print('State grid points: ' + str(grid.shape[0]) + ', of maximum: ' + str(tile.max_points) + ', Time taken: ' + str(grid_time))
            del tile

        points = grid.shape[0]

        ticfit = time()
        if self.first:
            self.W_f.fit(grid, np.zeros(points), NS=NS)
            self.V_f.fit(grid, np.zeros(points), NS=NS)
            self.first = False

        # ------------------
        #   Action bounds on the grid
        # ------------------
        Al = np.zeros(points)
        Ah = np.zeros(points)
        if Ascaled:
            for i in range(points):
                Ws = self.W_f.predict(grid[i, :])
                Al[i] = A_low(grid[i, :], Ws)
                Ah[i] = A_high(grid[i, :], Ws)
            minpol = 0
            maxpol = 1
        else:
            for i in range(points):
                Al[i] = A_low(grid[i, :])
                Ah[i] = A_high(grid[i, :])
            minpol = np.min(Al)
            maxpol = np.max(Ah)

        print('Constraint time: ' + str(time() - ticfit))

        # With a single round there is nothing to reuse, so skip the
        # store/precompute options.
        precompute = ITER != 1

        # ------------------
        #   Q-learning
        # ------------------

        # First iteration: full prediction and full fit.
        ticfit = time()
        Q = u + self.beta * self.V_f.predict(X1, store_XS=precompute)
        print('V prediction time: ' + str(time() - ticfit))

        ticfit = time()
        self.Q_f.fit(
            XA, Q, pa=minpol, pb=maxpol, copy=True, unsupervised=precompute,
            sgd=self.asgd, asgd=self.asgd, eta=eta, n_iters=1,
            # 1.0 forces true division: with two ints Python 2 would
            # truncate 1 / n to 0 for every n > 1.
            scale=1.0 / min(T, maxT),
            storeindex=(self.asgd and precompute), a=a, b=b, pc_samp=pc_samp)
        print('Q Fitting time: ' + str(time() - ticfit))

        # Optimise Q function
        self.value_error[0], W_opt, state = self.maximise(grid, Al, Ah, Ascaled, output=output, plotiter=plotiter, xargs=xargs, NS=NS)

        for j in range(1, ITER):
            # Q values from the values stored during round 0
            Q = u + self.beta * self.V_f.fast_values()

            # Update the Q function in place
            self.Q_f.partial_fit(Q, 0)

            # Optimise Q function
            self.value_error[j], W_opt, state = self.maximise(grid, Al, Ah, Ascaled, output=output, plotiter=plotiter, xargs=xargs, NS=NS)

        # ------------------
        #   Policy function
        # ------------------
        ticfit = time()
        NN = min(X1.shape[0], 20000)    # cap the comparison sample
        W_opt_old = self.W_f.predict(X1[0:NN, :])
        self.W_f.fit(state, W_opt, sgd=0, eta=0.1, n_iters=5, scale=0, NS=NS)
        W_opt_new = self.W_f.predict(X1[0:NN, :])
        self.pe = np.mean((W_opt_old - W_opt_new)) / np.mean(W_opt_old)
        toc = time()
        print('Policy time: ' + str(toc - ticfit))

        print('Solve time: ' + str(toc - solve_start) + ', Policy change: ' + str(self.pe))

        if plot:
            self.W_f.plot(xargs, showdata=True)
            pylab.show()
            self.V_f.plot(xargs, showdata=True)
            pylab.show()
Esempio n. 2
0
    def iterate(self, XA, X1, u, A_low, A_high, ITER=50, Ascaled=False, plot=True, xargs=None, output=True, gridsamp=1):
        """Run fitted Q-iteration with a full refit of ``self.Q_f`` every round.

        A state grid is built with ``buildgrid`` from the first
        ``int(gridsamp * X1.shape[0])`` rows of ``X1``.  On the first call
        (``self.first``) ``self.W_f`` and ``self.V_f`` are fitted to zeros
        on that grid.  The action bounds are evaluated on the grid, then
        one initial round plus ``ITER`` further rounds are run, i.e.
        ``ITER + 1`` rounds in total.  Each round computes the Q targets,
        calls ``self.Q_f.fit(XA, Q)`` and then ``self.maximise``.

        Parameters
        ----------
        XA : array
            Training inputs handed to ``self.Q_f.fit`` (presumably
            state-action samples - confirm).
        X1 : array
            Inputs handed to ``self.V_f.predict`` and used to build the
            state grid.
        u : array or scalar
            Added to ``self.beta * V`` to form the Q targets.
        A_low, A_high : callable
            Called as ``f(x)`` for each grid row ``x`` or, when ``Ascaled``
            is true, as ``f(x, Ws)`` with
            ``Ws = self.W_f_old.predict(x)``.  ``self.W_f_old`` is not set
            in this method and must already exist in that case.
        ITER : int
            Number of rounds run after the initial one.
        Ascaled : bool
            Selects the two-argument bound call; also forwarded to
            ``self.maximise``.
        plot : bool
            When true, plot ``self.W_f`` at the end.
        xargs : list, optional
            Forwarded to ``self.W_f.plot``.  ``None`` (the default) means
            a fresh empty list.
        output :
            Forwarded unchanged to ``self.maximise``.
        gridsamp : float
            Fraction of the rows of ``X1`` used to build the grid.

        Side effects
        ------------
        Resets ``self.v_e`` and ``self.p_e`` to 0, clears ``self.first``,
        prints timings and (optionally) shows a plot.  Returns ``None``.
        """
        # Fresh list per call instead of one ``[]`` shared by every call.
        if xargs is None:
            xargs = []

        # Dedicated timer for the whole solve; the step timers below use
        # their own names so the final "Solve time" covers the full run.
        solve_start = time()

        self.v_e = 0        # Value function error
        self.p_e = 0        # Policy function error

        # ------------------
        #   State grid
        # ------------------
        grid_start = time()
        N = int(gridsamp * X1.shape[0])
        grid, m = buildgrid(X1[0:N, :], self.maxgrid, self.radius, scale=True)
        points = grid.shape[0]
        grid_time = time() - grid_start
        print('State grid points: ' + str(points) + ', of maximum: ' + str(m) + ', Time taken: ' + str(grid_time))

        if self.first:
            self.W_f.fit(grid, np.zeros(points))
            self.V_f.fit(grid, np.zeros(points))
            self.first = False

        # ------------------
        #   Action bounds on the grid
        # ------------------
        Al = np.zeros(points)
        Ah = np.zeros(points)
        if Ascaled:
            for i in range(points):
                Ws = self.W_f_old.predict(grid[i, :])
                Al[i] = A_low(grid[i, :], Ws)
                Ah[i] = A_high(grid[i, :], Ws)
        else:
            for i in range(points):
                Al[i] = A_low(grid[i, :])
                Ah[i] = A_high(grid[i, :])

        # ------------------
        #   Q-learning
        # ------------------

        # First iteration: full prediction, keeping the work that
        # fast_values() reuses in later rounds.
        Q = u + self.beta * self.V_f.predict(X1, store_XS=True)

        # Fit Q function
        self.Q_f.fit(XA, Q)

        # Optimise Q function (return value is not used here)
        self.maximise(grid, Al, Ah, Ascaled, output=output)

        for _ in range(ITER):

            # Q values
            Q = u + self.beta * self.V_f.fast_values()

            # Fit Q function
            fit_start = time()
            self.Q_f.fit(XA, Q)
            print('Fit time: ' + str(time() - fit_start))

            # Optimise Q function
            self.maximise(grid, Al, Ah, Ascaled, output=output)

        print('Solve time: ' + str(time() - solve_start))

        if plot:
            self.W_f.plot(xargs, showdata=True)
            pylab.show()
Esempio n. 3
0
    def iterate(self, XA, X1, u, A_low, A_high, ITER=50, plot=True, xargs=None, output=True,
                a=0, b=0, pc_samp=1, maxT=60000, eta=0.8, tilesg=False, sg_prop=0.96,
                sg_samp=1, sg_points=100, sgmem_max=0.4, plotiter=False, test=False, NS=False):
        """Run fitted Q-iteration for two coupled problems indexed ``m = 0, 1``.

        ``XA``, ``X1``, ``u`` and the model attributes ``self.Q_f``,
        ``self.V_f`` and ``self.W_f`` are all indexed by ``m``.  Within each
        round problem 0 is processed first, then problem 1, and the Q
        targets of problem ``m`` are built from the value function of the
        *other* index: ``Q[m] = u[m] + beta * V_f[1 - m](...)``.

        Steps, in order:

        1. Build one state grid per index from ``X1[m]`` (``buildgrid``
           or, when ``tilesg`` is true, ``TilecodeSamplegrid``).
        2. On the first call (``self.first``) fit every ``W_f[m]`` and
           ``V_f[m]`` to zeros on its grid.
        3. Evaluate ``A_low`` / ``A_high`` at every grid point.
        4. Round 0 uses ``predict`` and a full ``Q_f[m].fit``; rounds
           1..ITER-1 use ``fast_values()`` and ``partial_fit``.  Every
           step is followed by ``self.maximise(m, ...)``.
        5. Refit each ``W_f[m]`` on the last ``(state[m], W_opt[m])`` and
           store the mean relative prediction change in ``self.pe[m]``.

        Parameters
        ----------
        XA, X1, u : sequence of length 2
            Per-index sample arrays; ``XA[m].shape[0]`` is the sample
            count for index ``m``.
        A_low, A_high : callable
            Called as ``f(x)`` for every grid row ``x`` of either grid.
        ITER : int
            Number of rounds; round 0 always runs.
        plot : bool
            When true, plot ``W_f[m]`` and ``V_f[m]`` for both indices.
        xargs : list, optional
            Forwarded to ``self.maximise`` and the ``plot`` calls.
            ``None`` (the default) means a fresh empty list.
        output, plotiter, NS :
            Forwarded unchanged to ``self.maximise`` (``NS`` also to the
            ``W_f`` / ``V_f`` fits).
        a, b, pc_samp, eta :
            Forwarded unchanged to ``Q_f[m].fit``.
        maxT : int
            Upper bound on the sample count used in the ``scale``
            argument of ``Q_f[m].fit``.
        tilesg, sg_prop, sg_samp, sg_points, sgmem_max :
            State-grid construction options (see step 1).
        test : bool
            When true, drop into ``pdb`` after every ``maximise`` call.

        Side effects
        ------------
        Resets ``self.v_e`` and ``self.p_e`` to 0, replaces
        ``self.value_error`` with zeros, clears ``self.first``, sets
        ``self.pe``, prints timings and (optionally) shows plots.
        Returns ``None``.
        """
        # Fresh list per call instead of one ``[]`` shared by every call.
        if xargs is None:
            xargs = []

        solve_start = time()
        # NOTE(review): the code below mixes ``M`` and ``range(2)``; it
        # assumes ``self.M`` enumerates the indices 0 and 1 - confirm.
        M = self.M
        self.v_e = 0        # Value function error
        self.p_e = 0        # Policy function error

        T = [XA[m].shape[0] for m in M]

        # NOTE(review): the value error returned by maximise is not written
        # into this array in this variant, so it stays all zeros.
        self.value_error = np.zeros(ITER)

        # ------------------
        #   State grids
        # ------------------
        grid = [0, 0]
        if not tilesg:
            for m in range(2):
                grid[m], _ = buildgrid(X1[m], sg_points, self.radius, scale=True, stopnum=X1[m].shape[0])
        else:
            for m in range(2):
                nn = int(X1[m].shape[0] * sg_samp)
                # Separate timer so the overall solve timer is not overwritten.
                grid_start = time()
                sampler = TilecodeSamplegrid(X1[m].shape[1], 25, mem_max=sgmem_max, cores=self.CORES)
                grid[m] = sampler.fit(X1[m][0:nn], self.radius, prop=sg_prop)
                grid_time = time() - grid_start
                print('State grid points: ' + str(grid[m].shape[0]) + ', of maximum: ' + str(sampler.max_points) + ', Time taken: ' + str(grid_time))
                del sampler

        points = [grid[m].shape[0] for m in M]

        ticfit = time()
        if self.first:
            # Plain loops: the fits are run for their side effects only.
            for m in M:
                self.W_f[m].fit(grid[m], np.zeros(points[m]), NS=NS)
            for m in M:
                self.V_f[m].fit(grid[m], np.zeros(points[m]), NS=NS)
            self.first = False

        # ------------------
        #   Action bounds on the grids
        # ------------------
        Al = [np.zeros(points[m]) for m in M]
        Ah = [np.zeros(points[m]) for m in M]
        for m in range(2):
            for i in range(points[m]):
                Al[m][i] = A_low(grid[m][i, :])
                Ah[m][i] = A_high(grid[m][i, :])
        minpol = [np.min(Al[m]) for m in M]
        maxpol = [np.max(Ah[m]) for m in M]

        print('Constraint time: ' + str(time() - ticfit))

        # With a single round there is nothing to reuse.
        precompute = ITER != 1

        W_opt = [0, 0]
        state = [0, 0]
        Q = [0, 0]

        # ------------------
        #   Q-learning
        # ------------------

        # First iteration
        for m in range(2):
            other = 1 - m   # index whose value function feeds Q[m]

            # Q values (timer restarted per index so each reading covers
            # only this prediction)
            ticfit = time()
            Q[m] = u[m] + self.beta * self.V_f[other].predict(X1[m], store_XS=precompute)
            print('V prediction time: ' + str(time() - ticfit))

            # Fit Q function
            ticfit = time()
            self.Q_f[m].fit(
                XA[m], Q[m], pa=minpol[m], pb=maxpol[m], copy=True,
                unsupervised=precompute, sgd=self.asgd, asgd=self.asgd, eta=eta,
                n_iters=1,
                # 1.0 forces true division: with two ints Python 2 would
                # truncate 1 / n to 0 for every n > 1.
                scale=1.0 / min(T[m], maxT),
                storeindex=(self.asgd and precompute), a=a, b=b, pc_samp=pc_samp)
            print('Q Fitting time: ' + str(time() - ticfit))

            # Optimise Q function
            _, W_opt[m], state[m] = self.maximise(m, grid[m], Al[m], Ah[m], output=output, plotiter=plotiter, xargs=xargs, NS=NS)
            if test:
                import pdb; pdb.set_trace()

        for j in range(1, ITER):
            for m in range(2):
                other = 1 - m

                # Q values
                Q[m] = u[m] + self.beta * self.V_f[other].fast_values()

                # Fit Q function
                self.Q_f[m].partial_fit(Q[m], 0)

                # Optimise Q function
                _, W_opt[m], state[m] = self.maximise(m, grid[m], Al[m], Ah[m], output=output, plotiter=plotiter, xargs=xargs, NS=NS)
                if test:
                    import pdb; pdb.set_trace()

        # ------------------
        #   Policy functions
        # ------------------
        self.pe = [0, 0]
        for m in range(2):
            ticfit = time()
            NN = min(X1[m].shape[0], 20000)     # cap the comparison sample
            W_opt_old = self.W_f[m].predict(X1[m][0:NN, :])
            self.W_f[m].fit(state[m], W_opt[m], sgd=0, eta=0.1, n_iters=5, scale=0, NS=NS)
            W_opt_new = self.W_f[m].predict(X1[m][0:NN, :])
            self.pe[m] = np.mean((W_opt_old - W_opt_new) / W_opt_old)
            toc = time()
            print('Policy time: ' + str(toc - ticfit))

        print('Solve time: ' + str(toc - solve_start) + ', Policy change: ' + str(self.pe))

        if plot:
            for m in range(2):
                self.W_f[m].plot(xargs, showdata=True)
                pylab.show()
                self.V_f[m].plot(xargs, showdata=True)
                pylab.show()
Esempio n. 4
0
    def iterate(self,
                XA,
                X1,
                u,
                A_low,
                A_high,
                ITER=50,
                Ascaled=False,
                plot=True,
                xargs=None,
                output=True,
                a=0,
                b=0,
                pc_samp=1,
                maxT=60000,
                eta=0.8,
                tilesg=False,
                sg_prop=0.96,
                sg_samp=1,
                sg_points=100,
                sgmem_max=0.4,
                plotiter=False,
                test=False,
                NS=False):
        """Run ``ITER`` rounds of fitted Q-iteration, then refit the policy.

        Steps, in order:

        1. Build a grid of state points from ``X1`` (``buildgrid`` or, when
           ``tilesg`` is true, ``TilecodeSamplegrid``).
        2. On the first call (``self.first``) fit ``self.W_f`` and
           ``self.V_f`` to zeros on that grid.
        3. Evaluate ``A_low`` / ``A_high`` at every grid point.
        4. Round 0: ``Q = u + beta * V_f.predict(X1)``, full ``Q_f.fit``,
           then ``self.maximise``.  Rounds 1..ITER-1 reuse stored work via
           ``V_f.fast_values()`` and ``Q_f.partial_fit``.
        5. Refit ``self.W_f`` on the ``(state, W_opt)`` pair returned by the
           last ``self.maximise`` call and record the relative change of
           its predictions in ``self.pe``.

        Parameters
        ----------
        XA : array
            Training inputs handed to ``self.Q_f.fit``; ``XA.shape[0]`` is
            the sample count (presumably state-action samples - confirm).
        X1 : array
            Inputs handed to ``self.V_f.predict`` and used to build the
            state grid (presumably next-period states - confirm).
        u : array or scalar
            Added to ``self.beta * V`` to form the Q targets.
        A_low, A_high : callable
            Called as ``f(x)`` for each grid row ``x`` or, when ``Ascaled``
            is true, as ``f(x, Ws)`` with ``Ws = self.W_f.predict(x)``.
        ITER : int
            Number of rounds.  Must be at least 1: round 0 always runs and
            writes ``self.value_error[0]``.
        Ascaled : bool
            Selects the two-argument bound call and fixes the ``pa``/``pb``
            arguments of ``Q_f.fit`` to 0 and 1.
        plot : bool
            When true, plot ``self.W_f`` and ``self.V_f`` at the end.
        xargs : list, optional
            Forwarded to ``self.maximise`` and to the ``plot`` calls.
            ``None`` (the default) means a fresh empty list.
        output, plotiter, NS :
            Forwarded unchanged to ``self.maximise`` (``NS`` also to the
            ``W_f`` / ``V_f`` fits).
        a, b, pc_samp, eta :
            Forwarded unchanged to ``self.Q_f.fit``.
        maxT : int
            Upper bound on the sample count used in the ``scale`` argument
            of ``Q_f.fit`` (``1 / min(T, maxT)``).
        tilesg, sg_prop, sg_samp, sg_points, sgmem_max :
            State-grid construction options (see step 1).
        test : bool
            Not used by this method; kept so existing callers still work.

        Side effects
        ------------
        Resets ``self.v_e`` and ``self.p_e`` to 0, replaces
        ``self.value_error``, clears ``self.first``, sets ``self.pe``,
        prints timings and (optionally) shows plots.  Returns ``None``.
        """
        # Fresh list per call: a shared ``[]`` default would carry any
        # in-place change made by maximise/plot over into later calls.
        if xargs is None:
            xargs = []

        solve_start = time()

        self.v_e = 0  # Value function error
        # NOTE(review): the policy change computed below is stored in
        # ``self.pe``; ``self.p_e`` is only ever reset here.
        self.p_e = 0  # Policy function error

        T = XA.shape[0]

        self.value_error = np.zeros(ITER)

        # ------------------
        #   State grid
        # ------------------
        if not tilesg:
            grid, _ = buildgrid(X1,
                                sg_points,
                                self.radius,
                                scale=True,
                                stopnum=X1.shape[0])
        else:
            nn = int(X1.shape[0] * sg_samp)
            # Separate timer so the overall solve timer is not overwritten.
            grid_start = time()
            tile = TilecodeSamplegrid(X1.shape[1],
                                      25,
                                      mem_max=sgmem_max,
                                      cores=self.CORES)
            grid = tile.fit(X1[0:nn], self.radius, prop=sg_prop)
            grid_time = time() - grid_start
            print('State grid points: ' + str(grid.shape[0]) +
                  ', of maximum: ' + str(tile.max_points) +
                  ', Time taken: ' + str(grid_time))
            del tile

        points = grid.shape[0]

        ticfit = time()
        if self.first:
            self.W_f.fit(grid, np.zeros(points), NS=NS)
            self.V_f.fit(grid, np.zeros(points), NS=NS)
            self.first = False

        # ------------------
        #   Action bounds on the grid
        # ------------------
        Al = np.zeros(points)
        Ah = np.zeros(points)
        if Ascaled:
            for i in range(points):
                Ws = self.W_f.predict(grid[i, :])
                Al[i] = A_low(grid[i, :], Ws)
                Ah[i] = A_high(grid[i, :], Ws)
            minpol = 0
            maxpol = 1
        else:
            for i in range(points):
                Al[i] = A_low(grid[i, :])
                Ah[i] = A_high(grid[i, :])
            minpol = np.min(Al)
            maxpol = np.max(Ah)

        print('Constraint time: ' + str(time() - ticfit))

        # With a single round there is nothing to reuse, so skip the
        # store/precompute options.
        precompute = ITER != 1

        # ------------------
        #   Q-learning
        # ------------------

        # First iteration: full prediction and full fit.
        ticfit = time()
        Q = u + self.beta * self.V_f.predict(X1, store_XS=precompute)
        print('V prediction time: ' + str(time() - ticfit))

        ticfit = time()
        self.Q_f.fit(XA,
                     Q,
                     pa=minpol,
                     pb=maxpol,
                     copy=True,
                     unsupervised=precompute,
                     sgd=self.asgd,
                     asgd=self.asgd,
                     eta=eta,
                     n_iters=1,
                     # 1.0 forces true division: with two ints Python 2
                     # would truncate 1 / n to 0 for every n > 1.
                     scale=1.0 / min(T, maxT),
                     storeindex=(self.asgd and precompute),
                     a=a,
                     b=b,
                     pc_samp=pc_samp)
        print('Q Fitting time: ' + str(time() - ticfit))

        # Optimise Q function
        self.value_error[0], W_opt, state = self.maximise(grid,
                                                          Al,
                                                          Ah,
                                                          Ascaled,
                                                          output=output,
                                                          plotiter=plotiter,
                                                          xargs=xargs,
                                                          NS=NS)

        for j in range(1, ITER):
            # Q values from the values stored during round 0
            Q = u + self.beta * self.V_f.fast_values()

            # Update the Q function in place
            self.Q_f.partial_fit(Q, 0)

            # Optimise Q function
            self.value_error[j], W_opt, state = self.maximise(
                grid,
                Al,
                Ah,
                Ascaled,
                output=output,
                plotiter=plotiter,
                xargs=xargs,
                NS=NS)

        # ------------------
        #   Policy function
        # ------------------
        ticfit = time()
        NN = min(X1.shape[0], 20000)  # cap the comparison sample
        W_opt_old = self.W_f.predict(X1[0:NN, :])
        self.W_f.fit(state, W_opt, sgd=0, eta=0.1, n_iters=5, scale=0, NS=NS)
        W_opt_new = self.W_f.predict(X1[0:NN, :])
        self.pe = np.mean((W_opt_old - W_opt_new)) / np.mean(W_opt_old)
        toc = time()
        print('Policy time: ' + str(toc - ticfit))

        print('Solve time: ' + str(toc - solve_start) +
              ', Policy change: ' + str(self.pe))

        if plot:
            self.W_f.plot(xargs, showdata=True)
            pylab.show()
            self.V_f.plot(xargs, showdata=True)
            pylab.show()
Esempio n. 5
0
    def iterate(self,
                XA,
                X1,
                u,
                A_low,
                A_high,
                ITER=50,
                Ascaled=False,
                plot=True,
                xargs=None,
                output=True,
                gridsamp=1):
        """Run fitted Q-iteration with a full refit of ``self.Q_f`` every round.

        A state grid is built with ``buildgrid`` from the first
        ``int(gridsamp * X1.shape[0])`` rows of ``X1``.  On the first call
        (``self.first``) ``self.W_f`` and ``self.V_f`` are fitted to zeros
        on that grid.  The action bounds are evaluated on the grid, then
        one initial round plus ``ITER`` further rounds are run, i.e.
        ``ITER + 1`` rounds in total.  Each round computes the Q targets,
        calls ``self.Q_f.fit(XA, Q)`` and then ``self.maximise``.

        Parameters
        ----------
        XA : array
            Training inputs handed to ``self.Q_f.fit`` (presumably
            state-action samples - confirm).
        X1 : array
            Inputs handed to ``self.V_f.predict`` and used to build the
            state grid.
        u : array or scalar
            Added to ``self.beta * V`` to form the Q targets.
        A_low, A_high : callable
            Called as ``f(x)`` for each grid row ``x`` or, when ``Ascaled``
            is true, as ``f(x, Ws)`` with
            ``Ws = self.W_f_old.predict(x)``.  ``self.W_f_old`` is not set
            in this method and must already exist in that case.
        ITER : int
            Number of rounds run after the initial one.
        Ascaled : bool
            Selects the two-argument bound call; also forwarded to
            ``self.maximise``.
        plot : bool
            When true, plot ``self.W_f`` at the end.
        xargs : list, optional
            Forwarded to ``self.W_f.plot``.  ``None`` (the default) means
            a fresh empty list.
        output :
            Forwarded unchanged to ``self.maximise``.
        gridsamp : float
            Fraction of the rows of ``X1`` used to build the grid.

        Side effects
        ------------
        Resets ``self.v_e`` and ``self.p_e`` to 0, clears ``self.first``,
        prints timings and (optionally) shows a plot.  Returns ``None``.
        """
        # Fresh list per call instead of one ``[]`` shared by every call.
        if xargs is None:
            xargs = []

        # Dedicated timer for the whole solve; the step timers below use
        # their own names so the final "Solve time" covers the full run.
        solve_start = time()

        self.v_e = 0  # Value function error
        self.p_e = 0  # Policy function error

        # ------------------
        #   State grid
        # ------------------
        grid_start = time()
        N = int(gridsamp * X1.shape[0])
        grid, m = buildgrid(X1[0:N, :], self.maxgrid, self.radius, scale=True)
        points = grid.shape[0]
        grid_time = time() - grid_start
        print('State grid points: ' + str(points) + ', of maximum: ' +
              str(m) + ', Time taken: ' + str(grid_time))

        if self.first:
            self.W_f.fit(grid, np.zeros(points))
            self.V_f.fit(grid, np.zeros(points))
            self.first = False

        # ------------------
        #   Action bounds on the grid
        # ------------------
        Al = np.zeros(points)
        Ah = np.zeros(points)
        if Ascaled:
            for i in range(points):
                Ws = self.W_f_old.predict(grid[i, :])
                Al[i] = A_low(grid[i, :], Ws)
                Ah[i] = A_high(grid[i, :], Ws)
        else:
            for i in range(points):
                Al[i] = A_low(grid[i, :])
                Ah[i] = A_high(grid[i, :])

        # ------------------
        #   Q-learning
        # ------------------

        # First iteration: full prediction, keeping the work that
        # fast_values() reuses in later rounds.
        Q = u + self.beta * self.V_f.predict(X1, store_XS=True)

        # Fit Q function
        self.Q_f.fit(XA, Q)

        # Optimise Q function (return value is not used here)
        self.maximise(grid, Al, Ah, Ascaled, output=output)

        for _ in range(ITER):

            # Q values
            Q = u + self.beta * self.V_f.fast_values()

            # Fit Q function
            fit_start = time()
            self.Q_f.fit(XA, Q)
            print('Fit time: ' + str(time() - fit_start))

            # Optimise Q function
            self.maximise(grid, Al, Ah, Ascaled, output=output)

        print('Solve time: ' + str(time() - solve_start))

        if plot:
            self.W_f.plot(xargs, showdata=True)
            pylab.show()
Esempio n. 6
0
    def iterate(self,
                XA,
                X1,
                u,
                A_low,
                A_high,
                ITER=50,
                plot=True,
                xargs=None,
                output=True,
                a=0,
                b=0,
                pc_samp=1,
                maxT=60000,
                eta=0.8,
                tilesg=False,
                sg_prop=0.96,
                sg_samp=1,
                sg_points=100,
                sgmem_max=0.4,
                plotiter=False,
                test=False,
                NS=False):
        """Run fitted Q-iteration for two coupled problems indexed ``m = 0, 1``.

        ``XA``, ``X1``, ``u`` and the model attributes ``self.Q_f``,
        ``self.V_f`` and ``self.W_f`` are all indexed by ``m``.  Within each
        round problem 0 is processed first, then problem 1, and the Q
        targets of problem ``m`` are built from the value function of the
        *other* index: ``Q[m] = u[m] + beta * V_f[1 - m](...)``.

        Steps, in order:

        1. Build one state grid per index from ``X1[m]`` (``buildgrid``
           or, when ``tilesg`` is true, ``TilecodeSamplegrid``).
        2. On the first call (``self.first``) fit every ``W_f[m]`` and
           ``V_f[m]`` to zeros on its grid.
        3. Evaluate ``A_low`` / ``A_high`` at every grid point.
        4. Round 0 uses ``predict`` and a full ``Q_f[m].fit``; rounds
           1..ITER-1 use ``fast_values()`` and ``partial_fit``.  Every
           step is followed by ``self.maximise(m, ...)``.
        5. Refit each ``W_f[m]`` on the last ``(state[m], W_opt[m])`` and
           store the mean relative prediction change in ``self.pe[m]``.

        Parameters
        ----------
        XA, X1, u : sequence of length 2
            Per-index sample arrays; ``XA[m].shape[0]`` is the sample
            count for index ``m``.
        A_low, A_high : callable
            Called as ``f(x)`` for every grid row ``x`` of either grid.
        ITER : int
            Number of rounds; round 0 always runs.
        plot : bool
            When true, plot ``W_f[m]`` and ``V_f[m]`` for both indices.
        xargs : list, optional
            Forwarded to ``self.maximise`` and the ``plot`` calls.
            ``None`` (the default) means a fresh empty list.
        output, plotiter, NS :
            Forwarded unchanged to ``self.maximise`` (``NS`` also to the
            ``W_f`` / ``V_f`` fits).
        a, b, pc_samp, eta :
            Forwarded unchanged to ``Q_f[m].fit``.
        maxT : int
            Upper bound on the sample count used in the ``scale``
            argument of ``Q_f[m].fit``.
        tilesg, sg_prop, sg_samp, sg_points, sgmem_max :
            State-grid construction options (see step 1).
        test : bool
            When true, drop into ``pdb`` after every ``maximise`` call.

        Side effects
        ------------
        Resets ``self.v_e`` and ``self.p_e`` to 0, replaces
        ``self.value_error`` with zeros, clears ``self.first``, sets
        ``self.pe``, prints timings and (optionally) shows plots.
        Returns ``None``.
        """
        # Fresh list per call instead of one ``[]`` shared by every call.
        if xargs is None:
            xargs = []

        solve_start = time()
        # NOTE(review): the code below mixes ``M`` and ``range(2)``; it
        # assumes ``self.M`` enumerates the indices 0 and 1 - confirm.
        M = self.M
        self.v_e = 0  # Value function error
        self.p_e = 0  # Policy function error

        T = [XA[m].shape[0] for m in M]

        # NOTE(review): the value error returned by maximise is not written
        # into this array in this variant, so it stays all zeros.
        self.value_error = np.zeros(ITER)

        # ------------------
        #   State grids
        # ------------------
        grid = [0, 0]
        if not tilesg:
            for m in range(2):
                grid[m], _ = buildgrid(X1[m],
                                       sg_points,
                                       self.radius,
                                       scale=True,
                                       stopnum=X1[m].shape[0])
        else:
            for m in range(2):
                nn = int(X1[m].shape[0] * sg_samp)
                # Separate timer so the overall solve timer is not
                # overwritten.
                grid_start = time()
                sampler = TilecodeSamplegrid(X1[m].shape[1],
                                             25,
                                             mem_max=sgmem_max,
                                             cores=self.CORES)
                grid[m] = sampler.fit(X1[m][0:nn], self.radius, prop=sg_prop)
                grid_time = time() - grid_start
                print('State grid points: ' + str(grid[m].shape[0]) +
                      ', of maximum: ' + str(sampler.max_points) +
                      ', Time taken: ' + str(grid_time))
                del sampler

        points = [grid[m].shape[0] for m in M]

        ticfit = time()
        if self.first:
            # Plain loops: the fits are run for their side effects only.
            for m in M:
                self.W_f[m].fit(grid[m], np.zeros(points[m]), NS=NS)
            for m in M:
                self.V_f[m].fit(grid[m], np.zeros(points[m]), NS=NS)
            self.first = False

        # ------------------
        #   Action bounds on the grids
        # ------------------
        Al = [np.zeros(points[m]) for m in M]
        Ah = [np.zeros(points[m]) for m in M]
        for m in range(2):
            for i in range(points[m]):
                Al[m][i] = A_low(grid[m][i, :])
                Ah[m][i] = A_high(grid[m][i, :])
        minpol = [np.min(Al[m]) for m in M]
        maxpol = [np.max(Ah[m]) for m in M]

        print('Constraint time: ' + str(time() - ticfit))

        # With a single round there is nothing to reuse.
        precompute = ITER != 1

        W_opt = [0, 0]
        state = [0, 0]
        Q = [0, 0]

        # ------------------
        #   Q-learning
        # ------------------

        # First iteration
        for m in range(2):
            other = 1 - m  # index whose value function feeds Q[m]

            # Q values (timer restarted per index so each reading covers
            # only this prediction)
            ticfit = time()
            Q[m] = u[m] + self.beta * self.V_f[other].predict(
                X1[m], store_XS=precompute)
            print('V prediction time: ' + str(time() - ticfit))

            # Fit Q function
            ticfit = time()
            self.Q_f[m].fit(XA[m],
                            Q[m],
                            pa=minpol[m],
                            pb=maxpol[m],
                            copy=True,
                            unsupervised=precompute,
                            sgd=self.asgd,
                            asgd=self.asgd,
                            eta=eta,
                            n_iters=1,
                            # 1.0 forces true division: with two ints
                            # Python 2 would truncate 1 / n to 0 for
                            # every n > 1.
                            scale=1.0 / min(T[m], maxT),
                            storeindex=(self.asgd and precompute),
                            a=a,
                            b=b,
                            pc_samp=pc_samp)
            print('Q Fitting time: ' + str(time() - ticfit))

            # Optimise Q function
            _, W_opt[m], state[m] = self.maximise(m,
                                                  grid[m],
                                                  Al[m],
                                                  Ah[m],
                                                  output=output,
                                                  plotiter=plotiter,
                                                  xargs=xargs,
                                                  NS=NS)
            if test:
                import pdb
                pdb.set_trace()

        for j in range(1, ITER):
            for m in range(2):
                other = 1 - m

                # Q values
                Q[m] = u[m] + self.beta * self.V_f[other].fast_values()

                # Fit Q function
                self.Q_f[m].partial_fit(Q[m], 0)

                # Optimise Q function
                _, W_opt[m], state[m] = self.maximise(
                    m,
                    grid[m],
                    Al[m],
                    Ah[m],
                    output=output,
                    plotiter=plotiter,
                    xargs=xargs,
                    NS=NS)
                if test:
                    import pdb
                    pdb.set_trace()

        # ------------------
        #   Policy functions
        # ------------------
        self.pe = [0, 0]
        for m in range(2):
            ticfit = time()
            NN = min(X1[m].shape[0], 20000)  # cap the comparison sample
            W_opt_old = self.W_f[m].predict(X1[m][0:NN, :])
            self.W_f[m].fit(state[m],
                            W_opt[m],
                            sgd=0,
                            eta=0.1,
                            n_iters=5,
                            scale=0,
                            NS=NS)
            W_opt_new = self.W_f[m].predict(X1[m][0:NN, :])
            self.pe[m] = np.mean((W_opt_old - W_opt_new) / W_opt_old)
            toc = time()
            print('Policy time: ' + str(toc - ticfit))

        print('Solve time: ' + str(toc - solve_start) +
              ', Policy change: ' + str(self.pe))

        if plot:
            for m in range(2):
                self.W_f[m].plot(xargs, showdata=True)
                pylab.show()
                self.V_f[m].plot(xargs, showdata=True)
                pylab.show()