def forward(policy, transition, start, time_steps, discount=None):
    """Propagate a start-state distribution forward under ``policy``.

    The mass placed uniformly on the entries of ``start`` is pushed through
    the transition model for ``time_steps`` steps. The per-step state and
    state-action occupancies are then reduced over time.

    Args:
        policy: 2-D array indexed as ``policy[action, state]``.
            NOTE(review): presumably the probability of taking ``action``
            in ``state`` -- confirm against the caller.
        transition: Object exposing ``tot_actions``, ``tot_states`` and
            ``dense_forward``. ``dense_forward[j]`` must be a 2-D array
            whose rows 0, 1 and 2 hold, column by column, an action index,
            a state index and a weight. Each column contributes
            ``occupancy[state] * policy[action, state] * weight`` to the
            next-step occupancy of state ``j``.
        start: Non-empty sized iterable of state indices. A state listed
            several times receives proportionally more initial mass.
        time_steps: Number of time steps to track (at least 1).
        discount: ``None`` to sum occupancies over time. Any other value is
            forwarded to ``discounted_sum`` together with the time axis.

    Returns:
        Tuple ``(state_freq, state_action_freq, dt_states)``.
        ``dt_states`` has shape ``(num_states, time_steps)``. With
        ``discount=None``, ``state_freq`` has shape ``(num_states,)`` and
        ``state_action_freq`` has shape ``(num_actions, num_states)``.
        Otherwise both are whatever ``discounted_sum`` returns.
    """
    num_actions = transition.tot_actions
    num_states = transition.tot_states

    dt_states = np.zeros([num_states, time_steps])
    dt_states_actions = np.zeros([num_actions, num_states, time_steps])

    for state in start:
        dt_states[state, 0] += 1
    dt_states[:, 0] /= len(start)

    # Build integer index arrays once per state. NumPy cannot index with a
    # ``map`` iterator on Python 3, and the indices do not depend on time.
    incoming = []
    for j in range(num_states):
        tr = transition.dense_forward[j]
        actions = np.asarray(tr[0, :]).astype(int)
        sources = np.asarray(tr[1, :]).astype(int)
        incoming.append((actions, sources, tr[2, :]))

    last_step = time_steps - 1
    for t in range(time_steps):
        # policy is (actions, states); broadcasting scales column s by the
        # occupancy of state s at time t.
        dt_states_actions[:, :, t] = policy * dt_states[:, t]
        if t == last_step:
            continue
        for j, (actions, sources, weights) in enumerate(incoming):
            dt_states[j, t + 1] = np.sum(
                dt_states[sources, t] * policy[actions, sources] * weights)

    if discount is None:
        state_action_freq = np.sum(dt_states_actions, axis=2)
        state_freq = np.sum(dt_states, axis=1)
    else:
        state_action_freq = discounted_sum(dt_states_actions, discount, ax=2)
        state_freq = discounted_sum(dt_states, discount, ax=1)
    return state_freq, state_action_freq, dt_states
def forward_sparse(policy, transition_forward, start, time_steps, discount=None):
    """Propagate a start-state distribution forward using one matrix product per step.

    The mass placed uniformly on the entries of ``start`` is pushed forward
    for ``time_steps`` steps. At each step the state occupancy is expanded
    into a flat state-action occupancy, which is multiplied by the
    transposed transition matrix to give the next state occupancy.

    Args:
        policy: Array broadcastable against a length-``num_states`` vector
            and reshaped as ``(num_actions, num_states)``, i.e. indexed as
            ``policy[action, state]``.
            NOTE(review): presumably action probabilities per state --
            confirm against the caller.
        transition_forward: 2-D array of shape
            ``(num_actions * num_states, num_states)``. Because the
            state-action vector is flattened in Fortran order, row
            ``action + num_actions * state`` weights the contribution of
            that pair to each next state.
        start: Non-empty sized iterable of state indices. A state listed
            several times receives proportionally more initial mass.
        time_steps: Number of time steps to track (at least 1).
        discount: ``None`` to sum occupancies over time. Any other value is
            forwarded to ``discounted_sum`` together with the time axis.

    Returns:
        Tuple ``(state_freq, state_action_freq, dt_states)``.
        ``dt_states`` has shape ``(num_states, time_steps)``.
        ``state_action_freq`` is reshaped to ``(num_actions, num_states)``.
        With ``discount=None``, ``state_freq`` has shape ``(num_states,)``.
    """
    num_states = transition_forward.shape[1]
    # Floor division keeps the action count an int on Python 3 as well; a
    # float here would be rejected by np.zeros and reshape.
    num_actions = transition_forward.shape[0] // num_states
    flat_size = num_actions * num_states

    dt_states = np.zeros((num_states, time_steps))
    dt_states_actions = np.zeros((flat_size, time_steps))

    for state in start:
        dt_states[state, 0] += 1
    dt_states[:, 0] /= len(start)

    transition_t = transition_forward.T
    last_step = time_steps - 1
    for t in range(time_steps):
        # Fortran order puts the action index first, so flat index
        # action + num_actions * state matches transition_forward's rows.
        dt_states_actions[:, t] = (dt_states[:, t] * policy).reshape(
            flat_size, order="F")
        if t != last_step:
            dt_states[:, t + 1] = np.dot(transition_t, dt_states_actions[:, t])

    if discount is None:
        state_action_freq = np.sum(dt_states_actions, axis=1)
        state_freq = np.sum(dt_states, axis=1)
    else:
        state_action_freq = discounted_sum(dt_states_actions, discount, ax=1)
        state_freq = discounted_sum(dt_states, discount, ax=1)

    state_action_freq = state_action_freq.reshape(
        num_actions, num_states, order="F")
    return state_freq, state_action_freq, dt_states
def forward(policy, transition, start, time_steps, discount=None):
    """Propagate a start-state distribution forward under ``policy``.

    The mass placed uniformly on the entries of ``start`` is pushed through
    the transition model for ``time_steps`` steps. The per-step state and
    state-action occupancies are then reduced over time.

    Args:
        policy: 2-D array indexed as ``policy[action, state]``.
            NOTE(review): presumably the probability of taking ``action``
            in ``state`` -- confirm against the caller.
        transition: Object exposing ``tot_actions``, ``tot_states`` and
            ``dense_forward``. ``dense_forward[j]`` must be a 2-D array
            whose rows 0, 1 and 2 hold, column by column, an action index,
            a state index and a weight. Each column contributes
            ``occupancy[state] * policy[action, state] * weight`` to the
            next-step occupancy of state ``j``.
        start: Non-empty sized iterable of state indices. A state listed
            several times receives proportionally more initial mass.
        time_steps: Number of time steps to track (at least 1).
        discount: ``None`` to sum occupancies over time. Any other value is
            forwarded to ``discounted_sum`` together with the time axis.

    Returns:
        Tuple ``(state_freq, state_action_freq, dt_states)``.
        ``dt_states`` has shape ``(num_states, time_steps)``. With
        ``discount=None``, ``state_freq`` has shape ``(num_states,)`` and
        ``state_action_freq`` has shape ``(num_actions, num_states)``.
        Otherwise both are whatever ``discounted_sum`` returns.
    """
    num_actions = transition.tot_actions
    num_states = transition.tot_states

    dt_states = np.zeros([num_states, time_steps])
    dt_states_actions = np.zeros([num_actions, num_states, time_steps])

    for state in start:
        dt_states[state, 0] += 1
    dt_states[:, 0] /= len(start)

    # Build integer index arrays once per state. NumPy cannot index with a
    # ``map`` iterator on Python 3, and the indices do not depend on time.
    incoming = []
    for j in range(num_states):
        tr = transition.dense_forward[j]
        actions = np.asarray(tr[0, :]).astype(int)
        sources = np.asarray(tr[1, :]).astype(int)
        incoming.append((actions, sources, tr[2, :]))

    last_step = time_steps - 1
    for t in range(time_steps):
        # policy is (actions, states); broadcasting scales column s by the
        # occupancy of state s at time t.
        dt_states_actions[:, :, t] = policy * dt_states[:, t]
        if t == last_step:
            continue
        for j, (actions, sources, weights) in enumerate(incoming):
            dt_states[j, t + 1] = np.sum(
                dt_states[sources, t] * policy[actions, sources] * weights)

    if discount is None:
        state_action_freq = np.sum(dt_states_actions, axis=2)
        state_freq = np.sum(dt_states, axis=1)
    else:
        state_action_freq = discounted_sum(dt_states_actions, discount, ax=2)
        state_freq = discounted_sum(dt_states, discount, ax=1)
    return state_freq, state_action_freq, dt_states
def forward_sparse(policy, transition_forward, start, time_steps, discount=None):
    """Propagate a start-state distribution forward using one matrix product per step.

    The mass placed uniformly on the entries of ``start`` is pushed forward
    for ``time_steps`` steps. At each step the state occupancy is expanded
    into a flat state-action occupancy, which is multiplied by the
    transposed transition matrix to give the next state occupancy.

    Args:
        policy: Array broadcastable against a length-``num_states`` vector
            and reshaped as ``(num_actions, num_states)``, i.e. indexed as
            ``policy[action, state]``.
            NOTE(review): presumably action probabilities per state --
            confirm against the caller.
        transition_forward: 2-D array of shape
            ``(num_actions * num_states, num_states)``. Because the
            state-action vector is flattened in Fortran order, row
            ``action + num_actions * state`` weights the contribution of
            that pair to each next state.
        start: Non-empty sized iterable of state indices. A state listed
            several times receives proportionally more initial mass.
        time_steps: Number of time steps to track (at least 1).
        discount: ``None`` to sum occupancies over time. Any other value is
            forwarded to ``discounted_sum`` together with the time axis.

    Returns:
        Tuple ``(state_freq, state_action_freq, dt_states)``.
        ``dt_states`` has shape ``(num_states, time_steps)``.
        ``state_action_freq`` is reshaped to ``(num_actions, num_states)``.
        With ``discount=None``, ``state_freq`` has shape ``(num_states,)``.
    """
    num_states = transition_forward.shape[1]
    # Floor division keeps the action count an int on Python 3 as well; a
    # float here would be rejected by np.zeros and reshape.
    num_actions = transition_forward.shape[0] // num_states
    flat_size = num_actions * num_states

    dt_states = np.zeros((num_states, time_steps))
    dt_states_actions = np.zeros((flat_size, time_steps))

    for state in start:
        dt_states[state, 0] += 1
    dt_states[:, 0] /= len(start)

    transition_t = transition_forward.T
    last_step = time_steps - 1
    for t in range(time_steps):
        # Fortran order puts the action index first, so flat index
        # action + num_actions * state matches transition_forward's rows.
        dt_states_actions[:, t] = (dt_states[:, t] * policy).reshape(
            flat_size, order="F")
        if t != last_step:
            dt_states[:, t + 1] = np.dot(transition_t, dt_states_actions[:, t])

    if discount is None:
        state_action_freq = np.sum(dt_states_actions, axis=1)
        state_freq = np.sum(dt_states, axis=1)
    else:
        state_action_freq = discounted_sum(dt_states_actions, discount, ax=1)
        state_freq = discounted_sum(dt_states, discount, ax=1)

    state_action_freq = state_action_freq.reshape(
        num_actions, num_states, order="F")
    return state_freq, state_action_freq, dt_states