def showLearning(self, representation):
    allStates = np.arange(0, self.chainSize)
    X = np.arange(self.chainSize) * 2.0 / 10.0 - self.SHIFT
    Y = np.ones(self.chainSize) * self.Y
    DY = np.zeros(self.chainSize)
    DX = np.zeros(self.chainSize)
    C = np.zeros(self.chainSize)

    if self.value_function_fig is None:
        self.value_function_fig = plt.subplot(3, 1, 2)
        self.V_star_line = self.value_function_fig.plot(
            allStates, self.V_star)
        V = [representation.V(s, False, self.possibleActions(s=s))
             for s in allStates]

        # Note the comma below, since a tuple of line objects is returned
        self.V_approx_line, = self.value_function_fig.plot(
            allStates, V, 'r-', linewidth=3)
        self.V_star_line = self.value_function_fig.plot(
            allStates, self.V_star, 'b--', linewidth=3)
        # Maximum value function is sum of all possible rewards
        plt.ylim([0, self.GOAL_REWARD * (len(self.GOAL_STATES) + 1)])

        self.policy_fig = plt.subplot(3, 1, 3)
        self.policy_fig.set_xlim(0, self.chainSize * 2 / 10.0)
        self.policy_fig.set_ylim(0, 2)
        self.arrows = plt.quiver(
            X, Y, DX, DY, C,
            cmap='fiftyChainActions', units='x', width=0.05,
            scale=.008, alpha=.8)
        # headwidth=.05, headlength=.03, headaxislength=.02
        self.policy_fig.xaxis.set_visible(False)
        self.policy_fig.yaxis.set_visible(False)

    V = [representation.V(s, False, self.possibleActions(s=s))
         for s in allStates]
    pi = [representation.bestAction(s, False, self.possibleActions(s=s))
          for s in allStates]
    # pi = [self.optimal_policy[s] for s in allStates]

    DX = [(2 * a - 1) * self.SHIFT * .1 for a in pi]

    self.V_approx_line.set_ydata(V)
    self.arrows.set_UVC(DX, DY, pi)
    plt.draw()
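# The following is an illustrative, standalone sketch (not part of the domain
# code above): it shows the matplotlib idiom that showLearning() relies on,
# i.e. unpacking the single Line2D returned by ax.plot() with a trailing comma
# and then refreshing it in place with set_ydata(), followed by a redraw.
# The variable names and data are made up for the example.
import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
x = np.arange(10)
# The trailing comma unpacks the one-element list returned by plot()
approx_line, = ax.plot(x, np.zeros(10), 'r-', linewidth=3)
ax.plot(x, x * 0.1, 'b--', linewidth=3)  # fixed reference line
ax.set_ylim(0, 1.5)

for step in range(5):
    # Stand-in for freshly estimated values; only the y-data changes
    approx_line.set_ydata(np.minimum(x * 0.1, 0.1 * (step + 1)))
    plt.draw()
    plt.pause(0.05)  # give the GUI event loop a chance to redraw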
def showLearning(self, representation):
    if self.valueFunction_fig is None:
        plt.figure("Value Function")
        self.valueFunction_fig = plt.imshow(
            self.map, cmap='ValueFunction', interpolation='nearest',
            vmin=self.MIN_RETURN, vmax=self.MAX_RETURN)
        plt.xticks(np.arange(self.COLS), fontsize=12)
        plt.yticks(np.arange(self.ROWS), fontsize=12)

        # Create quivers for each action (4 in total)
        X = np.arange(self.ROWS) - self.SHIFT
        Y = np.arange(self.COLS)
        X, Y = np.meshgrid(X, Y)
        DX = DY = np.ones(X.shape)
        C = np.zeros(X.shape)
        C[0, 0] = 1  # Making sure C has both 0 and 1
        # Arrow length / box width. Kept below 0.5 because each arrow is
        # offset; 0.4 looks nice but could be better / auto-generated.
        arrow_ratio = 0.4
        Max_Ratio_ArrowHead_to_ArrowLength = 0.25
        ARROW_WIDTH = 0.5 * Max_Ratio_ArrowHead_to_ArrowLength / 5.0
        self.upArrows_fig = plt.quiver(
            Y, X, DY, DX, C,
            units='y', cmap='Actions', scale_units="height",
            scale=self.ROWS / arrow_ratio, width=-1 * ARROW_WIDTH)
        self.upArrows_fig.set_clim(vmin=0, vmax=1)

        X = np.arange(self.ROWS) + self.SHIFT
        Y = np.arange(self.COLS)
        X, Y = np.meshgrid(X, Y)
        self.downArrows_fig = plt.quiver(
            Y, X, DY, DX, C,
            units='y', cmap='Actions', scale_units="height",
            scale=self.ROWS / arrow_ratio, width=-1 * ARROW_WIDTH)
        self.downArrows_fig.set_clim(vmin=0, vmax=1)

        X = np.arange(self.ROWS)
        Y = np.arange(self.COLS) - self.SHIFT
        X, Y = np.meshgrid(X, Y)
        self.leftArrows_fig = plt.quiver(
            Y, X, DY, DX, C,
            units='x', cmap='Actions', scale_units="width",
            scale=self.COLS / arrow_ratio, width=ARROW_WIDTH)
        self.leftArrows_fig.set_clim(vmin=0, vmax=1)

        X = np.arange(self.ROWS)
        Y = np.arange(self.COLS) + self.SHIFT
        X, Y = np.meshgrid(X, Y)
        self.rightArrows_fig = plt.quiver(
            Y, X, DY, DX, C,
            units='x', cmap='Actions', scale_units="width",
            scale=self.COLS / arrow_ratio, width=ARROW_WIDTH)
        self.rightArrows_fig.set_clim(vmin=0, vmax=1)
        plt.show()

    plt.figure("Value Function")
    V = np.zeros((self.ROWS, self.COLS))
    # Boolean 3-dimensional arrays; the third dimension indexes the action.
    # This mask records which actions exist in which cells.
    Mask = np.ones((self.COLS, self.ROWS, self.actions_num), dtype='bool')
    arrowSize = np.zeros((self.COLS, self.ROWS, self.actions_num), dtype='float')
    # 0 = suboptimal action, 1 = optimal action
    arrowColors = np.zeros((self.COLS, self.ROWS, self.actions_num), dtype='uint8')
    for r in xrange(self.ROWS):
        for c in xrange(self.COLS):
            if self.map[r, c] == self.BLOCKED:
                V[r, c] = 0
            if self.map[r, c] == self.GOAL:
                V[r, c] = self.MAX_RETURN
            if self.map[r, c] == self.PIT:
                V[r, c] = self.MIN_RETURN
            if self.map[r, c] == self.EMPTY or self.map[r, c] == self.START:
                s = np.array([r, c])
                As = self.possibleActions(s)
                terminal = self.isTerminal(s)
                Qs = representation.Qs(s, terminal)
                bestA = representation.bestActions(s, terminal, As)
                V[r, c] = max(Qs[As])
                Mask[c, r, As] = False
                arrowColors[c, r, bestA] = 1

                for i in xrange(len(As)):
                    a = As[i]
                    Q = Qs[i]
                    value = linearMap(Q, self.MIN_RETURN, self.MAX_RETURN, 0, 1)
                    arrowSize[c, r, a] = value

    # Show Value Function
    self.valueFunction_fig.set_data(V)
    # Show Policy Up Arrows
    DX = arrowSize[:, :, 0]
    DY = np.zeros((self.ROWS, self.COLS))
    DX = np.ma.masked_array(DX, mask=Mask[:, :, 0])
    DY = np.ma.masked_array(DY, mask=Mask[:, :, 0])
    C = np.ma.masked_array(arrowColors[:, :, 0], mask=Mask[:, :, 0])
    self.upArrows_fig.set_UVC(DY, DX, C)
    # Show Policy Down Arrows
    DX = -arrowSize[:, :, 1]
    DY = np.zeros((self.ROWS, self.COLS))
    DX = np.ma.masked_array(DX, mask=Mask[:, :, 1])
    DY = np.ma.masked_array(DY, mask=Mask[:, :, 1])
    C = np.ma.masked_array(arrowColors[:, :, 1], mask=Mask[:, :, 1])
    self.downArrows_fig.set_UVC(DY, DX, C)
    # Show Policy Left Arrows
    DX = np.zeros((self.ROWS, self.COLS))
    DY = -arrowSize[:, :, 2]
    DX = np.ma.masked_array(DX, mask=Mask[:, :, 2])
    DY = np.ma.masked_array(DY, mask=Mask[:, :, 2])
    C = np.ma.masked_array(arrowColors[:, :, 2], mask=Mask[:, :, 2])
    self.leftArrows_fig.set_UVC(DY, DX, C)
    # Show Policy Right Arrows
    DX = np.zeros((self.ROWS, self.COLS))
    DY = arrowSize[:, :, 3]
    DX = np.ma.masked_array(DX, mask=Mask[:, :, 3])
    DY = np.ma.masked_array(DY, mask=Mask[:, :, 3])
    C = np.ma.masked_array(arrowColors[:, :, 3], mask=Mask[:, :, 3])
    self.rightArrows_fig.set_UVC(DY, DX, C)
    plt.draw()
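# The following is an illustrative, standalone sketch (not part of the domain
# code above): it demonstrates the masked-quiver idiom used for the policy
# arrows, where a boolean mask hides arrows for unavailable actions and
# set_UVC() updates arrow lengths and colors without recreating the quiver.
# The grid size and variable names are made up for the example.
import numpy as np
import matplotlib.pyplot as plt

rows, cols = 3, 4
X, Y = np.meshgrid(np.arange(cols), np.arange(rows))
U = np.zeros((rows, cols))            # no horizontal component
V = np.full((rows, cols), 0.4)        # arrow length for the "up" action
C = np.zeros((rows, cols))
C[1, 2] = 1                           # mark one arrow as "optimal"

quiv = plt.quiver(X, Y, U, V, C, units='x', scale=1.0)
quiv.set_clim(vmin=0, vmax=1)

mask = np.zeros((rows, cols), dtype=bool)
mask[0, :] = True                     # hide the arrows in the first row
quiv.set_UVC(np.ma.masked_array(U, mask=mask),
             np.ma.masked_array(V, mask=mask),
             np.ma.masked_array(C, mask=mask))
plt.draw()
plt.show()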