def learner(self, gamma=0.9, alpha=0.1, epsilon=1e-5, display=False, lambda_=None):
    '''On-policy SARSA control for a single episode.'''
    self.state = self.env.reset()
    s0 = self.state
    if display:
        self.env.render()
    a0 = self.perform_policy(s0, epsilon=epsilon)
    time_in_episode, total_reward = 0, 0
    is_done = False
    while not is_done:
        s1, r1, is_done, info, total_reward = self.act(a0)
        if display:
            self.env.render()
        a1 = self.perform_policy(s1, epsilon=epsilon)
        old_q = get_dict(self.Q, s0, a0)
        q_prime = get_dict(self.Q, s1, a1)
        # SARSA update: Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
        td_target = r1 + gamma * q_prime
        new_q = old_q + alpha * (td_target - old_q)
        set_dict(self.Q, new_q, s0, a0)
        s0, a0 = s1, a1
        time_in_episode += 1
    if display:
        print(self.experience.last_episode)
    return time_in_episode, total_reward
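The RL snippets in this section all rely on `get_dict`/`set_dict` helpers whose definitions are not included here. A minimal sketch of the assumed behavior, keying a flat dict on stringified state-action pairs with a default of 0 for unseen pairs:

import random

def str_key(*args):
    '''Flatten states/actions (scalars, tuples, or lists) into a "_"-joined string key.'''
    parts = []
    for arg in args:
        if isinstance(arg, (tuple, list)):
            parts.extend(str(i) for i in arg)
        else:
            parts.append(str(arg))
    return "_".join(parts)

def set_dict(target_dict, value, *args):
    target_dict[str_key(*args)] = value

def get_dict(target_dict, *args):
    return target_dict.get(str_key(*args), 0)  # unseen pairs default to 0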
def learning_method(self, gamma=0.9, alpha=0.1, epsilon=1e-5, display=False, lambda_=None):
    '''Off-policy Q-learning: behave epsilon-greedily, bootstrap from the greedy action.'''
    self.state = self.env.reset()
    s0 = self.state
    if display:
        self.env.render()
    time_in_episode, total_reward = 0, 0
    is_done = False
    while not is_done:
        a0 = self.perform_policy(s0, epsilon)  # behavior policy: epsilon-greedy
        s1, r1, is_done, info, total_reward = self.act(a0)
        if display:
            self.env.render()
        self.policy = greedy_policy            # target policy: greedy
        a1 = greedy_policy(self.A, s1, self.Q)
        old_q = get_dict(self.Q, s0, a0)
        q_prime = get_dict(self.Q, s1, a1)     # max_a' Q(s', a')
        td_target = r1 + gamma * q_prime
        new_q = old_q + alpha * (td_target - old_q)
        set_dict(self.Q, new_q, s0, a0)
        s0 = s1                                # a0 is re-chosen from s1 next iteration
        time_in_episode += 1
    if display:
        print(self.experience.last_episode)
    return time_in_episode, total_reward
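The Q-learning variant above bootstraps from `greedy_policy` while behaving epsilon-greedily via `perform_policy`. A sketch of what those policies are assumed to look like, built on the `get_dict` helper sketched earlier (not the corpus's actual definitions):

def greedy_policy(A, s, Q):
    '''Return an action with maximal Q(s, a); break ties uniformly at random.'''
    max_q, best = -float('inf'), []
    for a in A:
        q = get_dict(Q, s, a)
        if q > max_q:
            max_q, best = q, [a]
        elif q == max_q:
            best.append(a)
    return random.choice(best)

def epsilon_greedy_policy(A, s, Q, epsilon=0.1):
    '''Explore with probability epsilon, otherwise act greedily.'''
    if random.random() < epsilon:
        return random.choice(A)
    return greedy_policy(A, s, Q)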
def policy_evaluate(episodes, V, Ns):
    for episode, r in episodes:
        for s, a in episode:
            ns = get_dict(Ns, s)
            v = get_dict(V, s)
            set_dict(Ns, ns + 1, s)
            set_dict(V, v + (r - v) / (ns + 1), s)
def learn_Q(self, episode, r):
    '''Learn Q values from an episode's state-action sequence and its return r.'''
    for s, a in episode:
        nsa = get_dict(self.Nsa, s, a)
        set_dict(self.Nsa, nsa + 1, s, a)
        q = get_dict(self.Q, s, a)
        set_dict(self.Q, q + (r - q) / (nsa + 1), s, a)
    self.total_learning_times += 1
def policy_evaluate(episodes, V, Ns):
    '''Incremental Monte Carlo policy evaluation of state values, with discount
    factor 1 and zero immediate reward for intermediate states. V and Ns hold
    the running value estimates and visit counts accumulated during evaluation;
    every-visit counting is used.'''
    for episode, r in episodes:
        for s, a in episode:
            ns = get_dict(Ns, s)
            v = get_dict(V, s)
            set_dict(Ns, ns + 1, s)
            set_dict(V, v + (r - v) / (ns + 1), s)
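A small worked example of the incremental update, assuming the `get_dict`/`set_dict` sketch above (the episode data here is made up):

V, Ns = {}, {}
episodes = [([("s1", "a"), ("s2", "b")], 1.0),  # episode with return 1.0
            ([("s1", "b")], 0.0)]               # episode with return 0.0
policy_evaluate(episodes, V, Ns)
# "s1" was visited twice with returns 1.0 and 0.0, so V["s1"] == 0.5;
# "s2" was visited once, so V["s2"] == 1.0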
def learning_method(self, lambda_=0.9, gamma=0.9, alpha=0.1, epsilon=1e-5, display=False):
    '''SARSA(lambda) with accumulating eligibility traces.'''
    self.state = self.env.reset()
    s0 = self.state
    if display:
        self.env.render()
    a0 = self.perform_policy(s0, epsilon)
    time_in_episode, total_reward = 0, 0
    is_done = False
    E = {}  # eligibility traces
    while not is_done:
        s1, r1, is_done, info, total_reward = self.act(a0)
        if display:
            self.env.render()
        a1 = self.perform_policy(s1, epsilon)
        q = get_dict(self.Q, s0, a0)       # old q
        q_prime = get_dict(self.Q, s1, a1) # new q
        delta = r1 + gamma * q_prime - q   # delta = R + gamma * Q(s',a') - Q(s,a)
        e = get_dict(E, s0, a0)
        e += 1                             # accumulating trace for the visited pair
        set_dict(E, e, s0, a0)
        for s in self.S:                   # for all s in S, a in A
            for a in self.A:
                e_value = get_dict(E, s, a)
                old_q = get_dict(self.Q, s, a)
                new_q = old_q + alpha * delta * e_value  # Q(s,a) += alpha * delta * E(s,a)
                new_e = gamma * lambda_ * e_value        # E(s,a) *= gamma * lambda
                set_dict(self.Q, new_q, s, a)
                set_dict(E, new_e, s, a)
        s0, a0 = s1, a1                    # s = s', a = a'
        time_in_episode += 1
    if display:
        print(self.experience.last_episode)
    return time_in_episode, total_reward
def learning_method(self, gamma=0.9, alpha=0.1, epsilon=1e-5, display=False, lambda_=0.9):
    self.state = self.env.reset()
    s0 = self.state
    if display:
        self.env.render()
    a0 = self.perform_policy(s0, self.Q, epsilon)
    time_in_episode, total_reward = 0, 0
    is_done = False
    E = {}  # eligibility traces
    while not is_done:
        s1, r1, is_done, info, total_reward = self.act(a0)
        if display:
            self.env.render()
        a1 = self.perform_policy(s1, self.Q, epsilon)
        q = get_dict(self.Q, s0, a0)
        q_prime = get_dict(self.Q, s1, a1)
        delta = r1 + gamma * q_prime - q  # TD error: r + gamma * Q(s',a') - Q(s,a)
        e = get_dict(E, s0, a0)
        e += 1
        set_dict(E, e, s0, a0)
        for s in self.S:
            for a in self.A:
                e_value = get_dict(E, s, a)
                old_q = get_dict(self.Q, s, a)
                new_q = old_q + alpha * delta * e_value
                new_e = gamma * lambda_ * e_value
                set_dict(self.Q, new_q, s, a)
                set_dict(E, new_e, s, a)
        s0, a0 = s1, a1
        time_in_episode += 1
    if display:
        print(self.experience.last_episode)
    return time_in_episode, total_reward
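A self-contained illustration of one trace-update sweep from the two SARSA(lambda) methods above, using a made-up TD error and a two-state, two-action table (the names here are local stand-ins, not the agent's attributes):

gamma, lambda_, alpha = 0.9, 0.9, 0.1
S, A = ["s0", "s1"], ["left", "right"]
Q, E = {}, {}
delta = 1.0                                               # pretend TD error for this step
set_dict(E, get_dict(E, "s0", "left") + 1, "s0", "left")  # bump trace of the visited pair
for s in S:
    for a in A:
        e = get_dict(E, s, a)
        set_dict(Q, get_dict(Q, s, a) + alpha * delta * e, s, a)  # credit by trace
        set_dict(E, gamma * lambda_ * e, s, a)                    # decay the trace
# Only ("s0", "left") has a nonzero trace, so only Q["s0_left"] moves: 0.0 -> 0.1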
def add_datauser(request):
    errors = {}
    if request.POST:
        companies = {}
        storage = ""
        for i in request.POST.keys():
            if ("food" in i) or ("weapon" in i):
                quality_v = i[len(i) - 1:]  # trailing character encodes the quality
                key = i[:len(i) - 1]
                util.set_dict(companies, key, quality_v, request.POST[i])
        if "q_storage" in request.POST:
            storage = request.POST["q_storage"]
        name = request.POST["name"]
        er = request.POST["url_or_id"]
        er_id = 0
        url_citizen = ""
        error = False
        val = URLValidator()
        try:
            val(er)
            if er.index("erepublik.com/en/citizen/profile/") > -1:
                url_citizen = er
                er_id = er.split("/").pop()
        except Exception:
            errors['url'] = "url invalid"
            if util.is_int(er) and int(er) > 0:  # fall back to a numeric citizen id
                er_id = er
                url_citizen = "http://www.erepublik.com/en/citizen/profile/" + er_id
            else:
                errors['id citizen'] = "id invalid"
        if er_id == 0 and len(url_citizen) > 0:
            error = True
        else:
            error = False
            errors = {}
        if not error:
            try:
                cit = Citizen(name=name, citizen_id_er=er_id, url_citizen=url_citizen)
                cit.save()
                Tg = TraningGround(owner_citizen=cit)
                Tg.weights_room = request.POST['TG1']
                Tg.climbing_center = request.POST['TG2']
                Tg.shooting_range = request.POST['TG3']
                Tg.special_forces = request.POST['TG4']
                Tg.save()
                sto = Storage(owner_citizen=cit)
                if storage != "" and util.is_int(storage):
                    sto.quantity = int(storage)
                sto.save()
                for key in companies:
                    company = util.get_company(companies[key])
                    company.owner_citizen = cit
                    company.save()
            except DatabaseError as e:
                transaction.rollback()
                errors[e.errno] = e.strerror
            except ValidationError as e:
                transaction.rollback()
                errors[e.errno] = e.strerror
    previous_y = int(round(inputs['previous_mouse']['repr']['ydata']))
    if current_x >= 0 and current_y >= 0 and previous_x >= 0 and previous_y >= 0:
        rr, cc = line(previous_x, previous_y, current_x, current_y)
        dd = np.zeros(len(rr), dtype=np.int64)
        # print(rr.dtype, cc.dtype, dd.dtype)
        # NEED TO EITHER COPY THIS IMAGE, OR ASSUME NON-SHARING (CURRENTLY THIS HOLDS)
        inputs['accum']['repr'][cc, rr, dd] = 1.0  # 255
    return {'current_image_mouse': inputs['accum']}

dmms.type_functions['image_mouse_type'] = draw_a_line_on_image

initial_output = {}
set_dict(initial_output, ['self', 'current matrix'],
         {'kind': 'matrix', 'repr': {}})  # this is where the network matrix sits
set_dict(initial_output['self']['current matrix']['repr'],
         ['self', 'accum', 'self', 'current matrix'], 1)  # this is the "main 1" of that matrix
set_dict(initial_output['self']['current matrix']['repr'],
         ['main_mouse', 'previous', 'main_mouse', 'current_mouse'], 1)  # mouse neuron in the network matrix
set_dict(initial_output['self']['current matrix']['repr'],
         ['image_mouse', 'accum', 'image_mouse', 'current_image_mouse'], 1)  # accum connection for the image_mouse neuron
set_dict(
dmms.type_functions['accum matrix'] = lambda inputs: {
    'current matrix': {
        'kind': 'matrix',
        'repr': dmms.add_nested_dict(
            copy.deepcopy(inputs['accum']['repr']),
            inputs['delta']['repr'] if 'delta' in inputs else {})
    }
}

initial_output = {}
set_dict(initial_output, ['self', 'current matrix'],
         {'kind': 'matrix', 'repr': {}})  # this is where the network matrix sits
set_dict(initial_output['self']['current matrix']['repr'],
         ['self', 'accum', 'self', 'current matrix'], 1)  # this is the "main 1" of that matrix

print('initial_output: ', initial_output)
initial_input = dmms.down_movement(initial_output)
print('initial input: ', initial_input)
second_output = dmms.up_movement(initial_input)
print('second output: ', second_output)
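The last two snippets use a different `set_dict` convention from the RL code earlier: the second argument is a list of keys forming a path into a nested dict. A minimal sketch of that behavior (an assumption; the dmms project's actual helper may differ):

def set_dict(target, path, value):
    '''Walk/create nested dicts along path and set the final key to value.'''
    for key in path[:-1]:
        target = target.setdefault(key, {})
    target[path[-1]] = value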