def test(env, n_episodes, policy, exp, exp_name, agent, render=True):
    for episode in range(n_episodes):
        obs = env.reset()
        state = utils.get_state(obs)
        total_reward = 0.0
        for _ in count():
            action = policy(state.to('cuda')).max(1)[1].view(1, 1)
            if render:
                env.render()
                time.sleep(0.02)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if not done:
                next_state = utils.get_state(obs)
            else:
                next_state = None
            state = next_state
            if done:
                out_str = "Finished Episode {} (test) with reward {}".format(episode, total_reward)
                exp.log(out_str)
                with open(agent.CONSTANTS.TEST_LOG_FILE_PATH, 'wt') as f:
                    f.write(out_str)
                break
    env.close()
def train_model(agent, episode, data, ep_count=100, batch_size=32, window_size=10):
    data_length = len(data) - 1
    total_profit = 0
    agent.inventory = []
    avg_loss = []
    start = clock()
    state = get_state(data, 0, window_size + 1)
    for t in tqdm(range(data_length), total=data_length, leave=True,
                  desc='Episode {}/{}'.format(episode, ep_count)):
        action = agent.act(state)  # SIT
        next_state = get_state(data, t + 1, window_size + 1)
        reward = 0
        # BUY
        if action == 1:
            agent.inventory.append(data[t])
        # SELL
        elif action == 2 and len(agent.inventory) > 0:
            bought_price = agent.inventory.pop(0)
            reward = max(data[t] - bought_price, 0)
            total_profit += data[t] - bought_price
        done = True if t == data_length - 1 else False
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if len(agent.memory) > batch_size:
            loss = agent.train_experience_replay(batch_size)
            avg_loss.append(loss)
        if done:
            end = clock() - start
    if episode % 20 == 0:
        agent.save(episode)
    return (episode, ep_count, total_profit, np.mean(np.array(avg_loss)), end)
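# train_model above and evaluate_model later in this collection call a get_state(data, t, n)
# helper that is not shown here. The sketch below is one plausible implementation under the
# assumption that the state is an n-day window of sigmoid-squashed price differences; the
# actual helper may differ.
import numpy as np

def get_state(data, t, n_days):
    """Hypothetical sketch: return a (1, n_days - 1) window of squashed price deltas."""
    d = t - n_days + 1
    # Pad with the first price when the window would start before the series does.
    block = data[d:t + 1] if d >= 0 else [data[0]] * -d + data[0:t + 1]
    res = [1.0 / (1.0 + np.exp(-(block[i + 1] - block[i]))) for i in range(n_days - 1)]
    return np.array([res])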
def play(self, env):
    print(f'[I] - Set Online Net to evaluation mode ... ', end='')
    self.online_net.eval()
    print('Done.')
    for i in range(5):
        stackedstates = StackedStates()
        env.reset()
        state = get_state(env, stackedstates, self.device)
        for t in count():
            env.render()
            # time.sleep(0.04)
            with torch.no_grad():
                action = torch.argmax(self.online_net(state), dim=1).item()
            _, reward, done, _ = env.step(action)
            state = get_state(env, stackedstates, self.device)
            if done or t > 1000:
                env.close()
                break
def create_components(initial, bindings):
    """From the bindings list, create components and return the component list."""
    components = []
    for binding in bindings:
        # r, p, n = binding.require, binding.provide, binding.arity
        # First, we create components that provide something.
        state = get_state(binding.provide_xpath)
        c = get_component(components, binding.provide_xpath)
        s = c.get_state(state)
        if binding.type == 'local':
            arity = 1
        else:
            arity = common.INFINITY
        s.provides.append(Provide(binding.provide_xpath, arity))

        # Secondly, we create components that require something.
        state = get_state(binding.require_xpath)
        c = get_component(components, binding.require_xpath)
        s = c.get_state(state)
        s.requires.append(Require(binding.provide_xpath, binding.arity))

    # We manually add the state that contains the provide required by the user.
    # It is not necessarily added via Bindings because, if this state doesn't
    # contain any require, it doesn't appear in bindings.
    c = get_component(components, initial.xpath)
    s = c.get_state(get_state(initial.xpath))
    return components
def eval(self, env, verbose=False, display=False):
    if verbose:
        print('Start evaluation.')
    rewards = []
    for i_episode in range(self.episode):
        reward = []
        time_step = env.reset()
        state = torch.tensor([utils.get_state(time_step.observation)], device=self.device)
        while not time_step.last():
            action_ID = self.select_action(state, random_choose=False)
            time_step = env.step(self.action_space[action_ID])
            reward.append(time_step.reward)
            state = torch.tensor([utils.get_state(time_step.observation)], device=self.device)
        reward = np.mean(reward)
        rewards.append(reward)
        if verbose:
            print('Episode {} average reward: {}'.format(i_episode, reward))
    if verbose:
        print('End evaluation.')
        print('Average reward: {}'.format(np.mean(rewards)))
def init_reward_matrix():
    reward = []
    # Set the reward of unconnected state pairs to -1000.
    for i in range(0, 27):
        row = []
        for j in range(0, 27):
            row.append(-1000)
        reward.append(row)
    # Set the reward of connected states to 0.
    for state in range(0, 27):  # enumerate states
        pos_s = state % 3
        pos_m = (state // 3) % 3  # integer division: peg positions must be ints
        pos_l = (state // 9) % 3
        for i in range(0, 3):  # the small disc can always move
            reward[state][get_state(i, pos_m, pos_l)] = 0
        if pos_m != pos_s:  # the middle disc can move if it is not under the small one
            for i in range(0, 3):
                if pos_s != i:
                    reward[state][get_state(pos_s, i, pos_l)] = 0
        if pos_l != pos_m and pos_l != pos_s:  # the large disc can move if it is uncovered
            for i in range(0, 3):
                if pos_m != i and pos_s != i:
                    reward[state][get_state(pos_s, pos_m, i)] = 0
        reward[state][state] = -1000  # prevent self-loops
    return reward
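# init_reward_matrix decodes a state index as pos_s + 3 * pos_m + 9 * pos_l, so the
# get_state helper it calls is presumably the inverse encoding. A minimal sketch under
# that assumption:
def get_state(pos_s, pos_m, pos_l):
    """Encode the peg positions of the small, middle and large discs into one index."""
    return pos_s + 3 * pos_m + 9 * pos_l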
def evaluate_model(agent, data, window_size, debug):
    data_length = len(data) - 1
    state = get_state(data, 0, window_size + 1)
    total_profit = 0
    agent.inventory = []
    for t in range(data_length):
        action = agent.act(state, is_eval=True)  # SIT
        next_state = get_state(data, t + 1, window_size + 1)
        reward = 0
        # BUY
        if action == 1:
            agent.inventory.append(data[t])
            if debug:
                logging.debug('Buy at: {}'.format(format_currency(data[t])))
        # SELL
        elif action == 2 and len(agent.inventory) > 0:
            bought_price = agent.inventory.pop(0)
            reward = max(data[t] - bought_price, 0)
            total_profit += data[t] - bought_price
            if debug:
                logging.debug('Sell at: {} | Position: {}'.format(
                    format_currency(data[t]),
                    format_position(data[t] - bought_price)))
        done = True if t == data_length - 1 else False
        agent.memory.append((state, action, reward, next_state, done))
        state = next_state
        if done:
            return total_profit
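# A minimal driver for the two trading routines above, given only as a sketch: the Agent
# constructor, the price lists and the 20-episode count are assumptions, not part of the
# original code.
def run_training(train_prices, val_prices, window_size=10):
    agent = Agent(state_size=window_size)  # hypothetical agent constructor
    for episode in range(1, 21):
        stats = train_model(agent, episode, train_prices,
                            ep_count=20, batch_size=32, window_size=window_size)
        val_profit = evaluate_model(agent, val_prices, window_size, debug=False)
        print('episode {}: train stats {}, validation profit {}'.format(
            episode, stats, val_profit))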
def exp_rep_pretrain(self, env):
    i = 0
    print('Pretrain Filling Experience Replay Memory ... ', end='')
    while i < self.exp_rep_pretrain_size:
        # Initialize the environment and state.
        stackedstates = StackedStates()
        env.reset()
        state = get_state(env, stackedstates, self.device)
        for t in count():
            i += 1
            action = env.action_space.sample()
            _, reward, done, _ = env.step(action)
            reward = torch.tensor([reward], device=self.device)
            done = torch.tensor([done], device=self.device)
            action = torch.tensor([action], device=self.device)
            # Observe the new state.
            next_state = get_state(env, stackedstates, self.device)
            # Store the transition in memory.
            self.memory.push(state, action, next_state, reward, done)
            if done:
                print("{} ".format(t + 1), end='')
                break
            else:
                # Move to the next state.
                state = next_state
    print('Done.')
def dashboard():
    if utils.needs_user():
        return flask.redirect(flask.url_for('setup'))
    if ('logged_in' not in flask.session) or (not flask.session['logged_in']):
        return flask.redirect(flask.url_for('login'))
    running = utils.check_pid(Config["pidfile"])
    tstate = utils.get_state("alarm_thread")
    tutime_s = ""
    trunning = False
    if tstate:
        trunning, tutime = tstate
        tutime_s = time.strftime("%c", time.localtime(tutime))
    thread_state = {"running": bool(trunning), "utime": tutime_s}
    utime = time.strftime("%c", time.localtime())
    state_text = "Not Running"
    flags = {
        "alarm": False,
        "armed": False,
        "disarmed": False,
        "tripped": False,
        "faulted": False
    }
    state_data = None
    alarm_state_d = utils.get_state("alarm")
    if alarm_state_d:
        alarm_state, state_time_i = alarm_state_d
        if alarm_state is not None:
            utime = time.strftime("%c", time.localtime(state_time_i))
            state_text = Alarm.ALARM_STATES[alarm_state]
            flags["alarm"] = alarm_state == Alarm.ALARMED
            flags["disarmed"] = alarm_state == Alarm.DISARMED
            flags["tripped"] = alarm_state == Alarm.TRIPPED
            flags["faulted"] = alarm_state == Alarm.FAULT
            flags["armed"] = alarm_state == Alarm.ARMED
    states = utils.get_states_not("alarm", "alarm_thread")
    state_data = {
        state['key']: {
            'data': state['data'],
            'time': time.strftime("%c", time.localtime(state['state_time']))
        }
        for state in states
    }
    interfaces = utils.get_interfaces()
    return flask.render_template('dashboard.j2',
                                 flags=flags,
                                 running=running,
                                 thread_state=thread_state,
                                 state_text=state_text,
                                 state_data=state_data,
                                 utime=utime,
                                 interfaces=interfaces,
                                 smbio=smbio)
def get_exp_context(code):
    ctx = {
        'user_type': 'researcher',
        'today_date': datetime.now().strftime('%Y-%m-%d'),
        'experiment': Experiment.query.filter_by(code=code).first(),
        'protocols': Protocol.query.filter_by(exp_code=code).all(),
        'dashboard_page': True
    }
    ctx['show_pam'] = utils.get_state("pam", ctx['protocols'])
    ctx['show_survey'] = utils.get_state("push_survey", ctx['protocols'])
    return ctx
def experiment_options(code):
    ctx = {
        'user_type': 'researcher',
        'today_date': datetime.now().strftime('%Y-%m-%d'),
        'experiment': Experiment.query.filter_by(code=code).first(),
        'protocols': Protocol.query.filter_by(exp_code=code).all(),
        'experiment_page': True
    }
    ctx['show_pam'] = utils.get_state("pam", ctx['protocols'])
    ctx['show_survey'] = utils.get_state("push_survey", ctx['protocols'])
    return render_template('experiment/create-edit-experiment.html', **ctx)
def evaluate(env, load_path='agent.pt'):
    """ Evaluate a trained model and compute your leaderboard scores

    NO CHANGES SHOULD BE MADE TO THIS FUNCTION

    Parameters
    -------
    env: gym.Env
        environment to evaluate on
    load_path: str
        path to load the model (.pt) from
    """
    episode_rewards = [0.0]
    actions = get_action_set()
    action_size = len(actions)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # These are not the final evaluation seeds, do not overfit on these tracks!
    seeds = [
        22597174, 68545857, 75568192, 91140053, 86018367,
        49636746, 66759182, 91294619, 84274995, 31531469
    ]

    # Build & load network
    policy_net = DQN(action_size, device).to(device)
    checkpoint = torch.load(load_path, map_location=device)
    policy_net.load_state_dict(checkpoint)
    policy_net.eval()

    # Iterate over a number of evaluation episodes
    for i in range(10):
        env.seed(seeds[i])
        obs, done = env.reset(), False
        obs = get_state(obs)
        t = 0
        # Run each episode until episode has terminated or 600 time steps have been reached
        while not done and t < 600:
            env.render()
            action_id = select_greedy_action(obs, policy_net, action_size)
            action = actions[action_id]
            obs, rew, done, _ = env.step(action)
            obs = get_state(obs)
            episode_rewards[-1] += rew
            t += 1
        print('episode %d \t reward %f' % (i, episode_rewards[-1]))
        episode_rewards.append(0.0)

    print('---------------------------')
    print(' total score: %f' % np.mean(np.array(episode_rewards)))
    print('---------------------------')
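# A plausible entry point for the evaluation routine above. The environment id
# ('CarRacing-v0') and the checkpoint path are assumptions made for illustration only.
if __name__ == '__main__':
    import gym
    env = gym.make('CarRacing-v0')
    evaluate(env, load_path='agent.pt')
    env.close()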
def get_non_local_provide(initial, bindings):
    """Return the list of lifecycles and states that are not local provides.
    They will be used by the specification.

    :rtype: [{"component": lifecycle, "state": state}]
    """
    non_local = []
    i = initial.xpath
    non_local.append({"component": get_lifecycle(i), "state": get_state(i)})
    for b in bindings:
        if b.type == "external":
            non_local.append({"component": get_lifecycle(b.provide_xpath),
                              "state": get_state(b.provide_xpath)})
    return non_local
def evaluate(self, env, render=False):
    success_count = 0
    for i_episode in range(self.evaluate_episode):
        obs = env.reset()
        s = get_state(obs)
        for i_step in range(self.max_timestep):
            if render:
                env.render()
            a = self.select_action(s, train_mode=False)
            obs_, r_e, done, info = env.step(a)
            s_ = get_state(obs_)
            s = s_
        success_count += info['is_success']
    return success_count / self.evaluate_episode
def _simulate_truncated(agent, node, frac_occupied=.5):
    # Roll out until (board.size ** 2) * frac_occupied positions are occupied by players' stones.
    node.increment_visits()
    board = node.board.copy()
    curr_color = node.color
    while (np.sum(board.scores()) / (board.size ** 2)) < frac_occupied:
        state, valid_positions, valid_positions_mask = get_state(board, curr_color)
        if len(valid_positions) == 0:
            break
        action_p, _ = agent(tf.convert_to_tensor([state], dtype=tf.float32))
        action_p = action_p[0].numpy() * valid_positions_mask.reshape((-1,))
        action_idx = np.argmax(action_p)
        position_key = (int(action_idx / board.size), int(action_idx % board.size))
        board.apply_position(curr_color, valid_positions[position_key])
        curr_color = 1 - curr_color
    valid_positions = board.valid_positions(node.color)
    if len(valid_positions) == 0:
        return -_get_value_from_scores(board.scores(), node.color)
    _, v = node.get_p_v(agent)
    return -v
def visualisation_plan(workspace_path):
    file_replay = workspace_path + "/" + aeolus.common.FILE_ARMONIC_REPLAY_FILLED
    file_metis_plan = workspace_path + "/" + aeolus.common.FILE_METIS_PLAN_JSON
    with open(file_replay, 'r') as f:
        replay = json.load(f)
    with open(file_metis_plan, 'r') as f:
        metis_plan = json.load(f)
    plan = aeolus.launcher.metis_to_armonic(metis_plan, replay)

    def is_final_state(jid, cpt, plan):
        # Used to know whether a component state change is the last one or not.
        for p in plan:
            if p.type == 'state-goto' and p.jid == jid and utils.get_lifecycle(p.xpath) == cpt:
                return False
        return True

    ret = []
    for idx, action in enumerate(plan):
        if type(action) == StateGoto:
            action.location = action.jid
            action.component_type = utils.get_lifecycle(action.xpath)
            action.state = utils.get_state(action.xpath)
            action.final = is_final_state(action.location, action.component_type, plan[idx + 1:])
            action.last_one = (idx == len(plan) - 1)
        else:
            pass
    plan.insert(0, Start(len(plan)))
    plan.append(End())
    return plan
def mode_solid_sparkly(key, pixels):
    #####
    # To make John happy (he's worried you'll think he wrote this):
    # This function was written by Caitlin, who has not coded in rather
    # a long time. And who thinks all this formatting bullshit is exactly that.
    # But it means this code is not nearly as elegant as he would have it be.
    # And he can f*****g deal with it. <3
    ######
    state = get_state()
    primary = state.get("primary")
    sparkles = state.get("sparkles")
    primary_color = (
        primary.get("r", 255),
        primary.get("g", 255),
        primary.get("b", 255),
    )
    sparkles_color = (
        sparkles.get("r", 255),
        sparkles.get("g", 255),
        sparkles.get("b", 255),
    )
    pixels.fill(primary_color)
    selected_pixels = random.sample(range(0, 300), 10)
    while key == state_key():
        pixels[selected_pixels[0]] = primary_color
        selected_pixels.pop(0)
        new_pixel = random.randint(0, 299)
        selected_pixels.append(new_pixel)
        pixels[new_pixel] = sparkles_color
        pixels.show()
        time.sleep(.25)
def mode_chaos_colors(key, pixels):
    #####
    # A bunch of flashing colors - by Caitlin
    #####
    state = get_state()

    def rand_color():
        rand_1 = random.randint(0, 254)
        rand_2 = random.randint(0, (254 - rand_1))
        rand_3 = 254 - (rand_1 + rand_2)
        return rand_1, rand_2, rand_3
        # red = (255, 0, 0)
        # green = (0, 255, 0)
        # blue = (0, 0, 255)
        # return random.choice([red, green, blue])

    # Init by setting all pixels to random colors
    for i in range(0, len(pixels)):
        pixels[i] = rand_color()
        time.sleep(.01)

    # Every .5s, change 20 pixels
    while key == state_key():
        selected_pixels = random.sample(range(0, 300), 20)
        for i in selected_pixels:
            pixels[i] = rand_color()
        time.sleep(.5)
        pixels.show()
def get_component(components, xpath):
    name = get_lifecycle(xpath)
    # First, find a lifecycle manager whose OS can reach the required state.
    os = [armonic.utils.OsTypeMBS(), armonic.utils.OsTypeDebian()]
    lfms = []
    for o in os:
        l = armonic.serialize.Serialize(os_type=o)
        lf = name
        state = get_state(xpath)
        state_xpath = "%s/%s" % (lf, state)
        path = l.state_goto_path(state_xpath)[0]['paths']
        if len(path) > 1:
            raise Exception("Number of paths to reach %s must not be greater than 1" % state_xpath)
        if len(path) == 1:
            lfms.append(l)
    if len(lfms) > 1:
        logger.error("%s is available on the following OS:" % xpath)
        for l in lfms:
            logger.error("  %s" % l.lf_manager.os_type)
        raise Exception("%s is available on several OS and this is not supported (yet)." % xpath)
    lfm = lfms[0]
    c = None
    for i in components:
        if i.name == name:
            c = i
            break
    if c is None:
        c = Component(name, lfm)
        components.append(c)
    return c
def mode_fading(key, pixels):
    state = get_state()
    colors = state.get("colors", [
        {"r": 255, "g": 255, "b": 255},
        {"r": 0, "g": 0, "b": 0},
    ])
    c1 = _color_tuple(colors[0])
    c2 = _color_tuple(colors[1])
    i = 0
    delta = 1
    while key == state_key():
        color = _color_between(c1, c2, float(i) / 100.0)
        print(color)
        pixels.fill(color)
        pixels.show()
        time.sleep(0.01)
        if i >= 100:
            delta = -1
        if i <= 0:
            delta = 1
        i += delta
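# mode_fading relies on two small helpers that are not included in this collection. A
# minimal sketch under the obvious assumptions (dict-to-tuple conversion and linear
# interpolation between two RGB tuples) is given below; the real helpers may differ.
def _color_tuple(color):
    """Hypothetical: turn a {"r": .., "g": .., "b": ..} dict into an (r, g, b) tuple."""
    return (color.get("r", 255), color.get("g", 255), color.get("b", 255))

def _color_between(c1, c2, fraction):
    """Hypothetical: linearly interpolate between two RGB tuples, fraction in [0, 1]."""
    return tuple(int(a + (b - a) * fraction) for a, b in zip(c1, c2))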
def file_view():
    state = request.files
    print(state)
    import pdb
    pdb.set_trace()
    # set_state(state)
    # print(state)
    return jsonify(get_state())
def show_table():
    project_id = int(request.args.get('project_id'))
    summary_dict, df = utils.get_state(project_id)
    return render_template(
        "show_table.html",
        summary_dict=summary_dict,
        table=df.to_html(index=False, header="true", classes="display",
                         table_id="example", border=0),
    )
def train(self, env, logger=None):
    total_step = 0
    loss_pi, loss_q, loss_forward, loss_inverse = 0., 0., 0., 0.
    for i_episode in range(self.max_episode):
        obs = env.reset()
        s = get_state(obs)
        cumulative_r = 0.
        for i_step in range(self.max_timestep):
            a = self.select_action(s)
            obs_, r_e, done, info = env.step(a)
            s_ = get_state(obs_)
            r_i = self.get_intrisic_reward(s, a, s_)
            r = r_e + r_i
            self.memory.store(s, a, r, s_)
            s = s_
            if len(self.memory) > self.batch_size:
                loss_pi, loss_q, loss_forward, loss_inverse = self.learn()
            cumulative_r += r_e
            total_step += 1
        print('i_episode: {} total step: {} cumulative reward: {:.4f} is_success: {} '
              .format(i_episode, total_step, cumulative_r, info['is_success']))
        if logger is not None and i_episode % self.log_interval == 0:
            logger.add_scalar('Indicator/cumulative reward', cumulative_r, i_episode)
            logger.add_scalar('Loss/pi_loss', loss_pi, i_episode)
            logger.add_scalar('Loss/q_loss', loss_q, i_episode)
            logger.add_scalar('Loss/forward_loss', loss_forward, i_episode)
            logger.add_scalar('Loss/inverse_loss', loss_inverse, i_episode)
        if i_episode % self.evaluate_interval == 0:
            success_rate = self.evaluate(env)
            if logger is not None:
                logger.add_scalar('Indicator/success rate', success_rate, i_episode)
        if i_episode > self.save_model_start and i_episode % self.save_model_interval == 0:
            self.save_model(remarks='{}_{}'.format(env.spec.id, i_episode))
def mode_solid(key, pixels):
    state = get_state()
    color = state.get("color")
    print(f"solid: {color}")
    pixels.fill((
        color.get("r", 255),
        color.get("g", 255),
        color.get("b", 255),
    ))
    pixels.show()
    time.sleep(5)
def main():
    # time_step = 0.0002  # TODO: context.get_continuous_state_vector() fails
    time_step = 2e-3

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--cfree', action='store_true',
                        help='Disables collisions when planning')
    parser.add_argument('-d', '--deterministic', action='store_true',
                        help='Manually sets the random seeds used by the stream generators')
    parser.add_argument('-s', '--simulate', action='store_true',
                        help='Simulates the system')
    args = parser.parse_args()
    if args.deterministic:
        # TODO: still not fully deterministic
        random.seed(0)
        np.random.seed(0)

    import meshcat
    meshcat_vis = meshcat.Visualizer()

    task, diagram, state_machine = load_station(time_step=time_step)
    print(task)
    plant = task.mbp
    # dump_plant(plant)
    # dump_models(plant)
    RenderSystemWithGraphviz(diagram)  # Useful for getting port names
    context = diagram.GetMutableSubsystemContext(plant, task.diagram_context)

    task.publish()
    initial_state = get_state(plant, context)
    trajectories = plan_trajectories(task, context, collisions=not args.cfree)
    if trajectories is None:
        return

    ##################################################

    set_state(plant, context, initial_state)
    if args.simulate:
        from manipulation_station.robot_plans import JointSpacePlan
        splines, gripper_setpoints = convert_splines(plant, task.robot, task.gripper,
                                                     context, trajectories)
        sim_duration = compute_duration(splines)
        plan_list = [JointSpacePlan(spline) for spline in splines]
        print('Splines: {}\nDuration: {:.3f} seconds'.format(len(splines), sim_duration))
        task, diagram, state_machine = load_station(time_step=time_step,
                                                    plan=(plan_list, gripper_setpoints))
        task.set_initial()
        # set_state(plant, context, initial_state)
        # state_machine.Load(plan_list, gripper_setpoints)
        simulate_splines(task.diagram, task.diagram_context, sim_duration)
    else:
        step_trajectories(diagram, task.diagram_context, context, trajectories,
                          time_step=0.001)
def table_csv():
    project_id = int(request.args.get('project_id'))
    summary_dict, df = utils.get_state(project_id)
    with tempfile.TemporaryDirectory() as tempdir:
        filepath = os.path.join(tempdir, 'out.csv')
        df.to_csv(filepath, index=False)
        with open(filepath, 'r') as f:
            csv_txt = f.read()
    tstamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    return Response(
        csv_txt,
        mimetype="text/csv",
        headers={"Content-disposition": f"attachment; filename={tstamp}.csv"})
def create_visualisation_attributes(self):
    def is_final_state(jid, cpt, actions):
        # Used to know whether a component state change is the last one or not.
        for p in actions:
            if p.type == 'state-goto' and p.jid == jid and utils.get_lifecycle(p.xpath) == cpt:
                return False
        return True

    for idx, action in enumerate(self.actions):
        if type(action) == StateGoto:
            action.location = action.jid
            action.component_type = utils.get_lifecycle(action.xpath)
            action.state = utils.get_state(action.xpath)
            action.final = is_final_state(action.location, action.component_type,
                                          self.actions[idx + 1:])
            action.last_one = (idx == len(self.actions) - 1)
        else:
            pass

    # Add a header and a footer used to tell the interface that the deployment
    # has started and has ended.
    self.actions.insert(0, Start(len(self.actions)))
    self.actions.append(End())
def mode_solid_rough(key, pixels):
    state = get_state()
    color = state.get("color")
    variance = int(state.get("variance", 10))
    print(f"solid rough: {color}")
    for i in range(0, len(pixels)):
        r = color.get("r", 255) + random.randint(-variance, variance)
        g = color.get("g", 255) + random.randint(-variance, variance)
        b = color.get("b", 255) + random.randint(-variance, variance)
        r = min(255, max(0, r))
        g = min(255, max(0, g))
        b = min(255, max(0, b))
        pixels[i] = (r, g, b)
        time.sleep(.01)
    pixels.show()
def tianbiao_my(person, n):
    base_row_index = n * 6

    r = 0 + base_row_index
    set_value(r, 2, person['成员姓名'])
    set_value(r, 5, person['性别'])
    set_value(r, 8, '汉')
    set_value(r, 10, get_birth_day(person['身份证号码']))
    set_font_size(r, 10, 9)
    set_value(r, 12, person['文化程度'])

    r = 1 + base_row_index
    set_value(r, 2, str(person['身份证号码']))
    set_value(r, 8, person['兵役状况'])  # military service status
    # set_value(r, 10, phone_numbers[person['成员姓名']])
    set_value(r, 10, person['联系电话'])
    set_value(r, 12, person['与户主关系'])

    r = 2 + base_row_index
    set_value(
        r, 2,
        get_power(person[[
            '土地(共有)使用权', '保留型土地使用权', '承包经营权', '集体资产管理权', '集体收益分配权'
        ]]))
    set_value(r, 7, get_state(person['存在状态']))
    set_align(r, 2, WD_ALIGN_PARAGRAPH.JUSTIFY)
    set_align(r, 7, WD_ALIGN_PARAGRAPH.LEFT)

    r = 3 + base_row_index
    # set_value(r, 7, get_state(person['存在状态']))
    set_value(r, 10, person['现住地址'])

    r = 4 + base_row_index
    # set_value(r, 2, get_power(person[['土地(共有)使用权', '保留型土地使用权', '承包经营权', '集体资产管理权', '集体收益分配权']]))
    set_value(r, 10, str(person['户籍号']))

    r = 5 + base_row_index
    set_value(r, 2, person['婚姻状况'])  # marital status
def _simulate(agent, node):
    # Roll out until a terminal state is reached and return the true win/loss value.
    node.increment_visits()
    board = node.board.copy()
    curr_color = node.color
    while True:
        state, valid_positions, valid_positions_mask = get_state(board, curr_color)
        if len(valid_positions) == 0:
            break
        action_p, _ = agent(tf.convert_to_tensor([state], dtype=tf.float32))
        action_p = action_p[0].numpy() * valid_positions_mask.reshape((-1,))
        action_idx = np.random.choice(len(action_p), p=action_p / np.sum(action_p))
        position_key = (int(action_idx / board.size), int(action_idx % board.size))
        board.apply_position(curr_color, valid_positions[position_key])
        curr_color = 1 - curr_color
    return -_get_value_from_scores(board.scores(), node.color)
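# Both rollout helpers above call _get_value_from_scores, which is not included here.
# A minimal sketch, assuming scores is indexable by color (0 or 1) and the value is +1
# for a win, -1 for a loss and 0 for a draw from that color's point of view:
def _get_value_from_scores(scores, color):
    own, other = scores[color], scores[1 - color]
    if own > other:
        return 1.0
    if own < other:
        return -1.0
    return 0.0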
def move(self, board):
    if self.mcts is not None:
        try:
            _, mcts_p, _, _ = self.mcts.search(board, self.color)
            # _, mcts_p, _, _ = mcts(board, self.agent, self.color, n_iter=self._mcts_iter, c=1)
        except TerminalStateException:
            return
        action_idx = np.argmax(mcts_p)
    else:
        state, valid_positions, valid_positions_mask = get_state(board, self.color)
        if len(valid_positions) == 0:
            return
        action_p, _ = self.agent(tf.convert_to_tensor([state], dtype=tf.float32))
        action_p = action_p[0].numpy() * valid_positions_mask.reshape((-1,))
        action_idx = np.argmax(action_p)
        # action_idx = np.random.choice(len(action_p), p=action_p / np.sum(action_p))
    return (int(action_idx / board.size), int(action_idx % board.size))
def drive_leds():
    global done
    key = ""
    while not done:
        new_key = state_key()
        if new_key != key:
            key = new_key
            state = get_state()
            mode = state.get("mode", "solid")
            print(f"change in state!: {state}")
            func = {
                "off": mode_off,
                "solid": mode_solid,
                "solid_rough": mode_solid_rough,
                "fading": mode_fading,
                "solid_rainbow": mode_solid_rainbow,
                "sliding_rainbow": mode_sliding_circle_rainbow,
                "halloween": mode_halloween,
                "per_step": mode_per_step,
                "nyan_cat": mode_nyan_cat,
                "nyan_cats": mode_nyan_cats,
                "solid_sparkly": mode_solid_sparkly,
                "chaos_colors": mode_chaos_colors,
            }.get(mode, mode_solid)
            worker_thread = threading.Thread(target=func, args=(key, pixels))
            worker_thread.daemon = True
            worker_thread.start()
            # kill_thread = threading.Thread(target=delay_off, args=(60*2,))
            # kill_thread.daemon = True
            # kill_thread.start()
        time.sleep(0.1)
    pixels.fill((0, 0, 0))
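# drive_leds dispatches to several mode_* handlers that are not part of this collection
# (mode_off, mode_solid_rainbow, ...). As a sketch only, here is a minimal mode_off under
# the assumption that it simply blanks the strip until the state key changes; the real
# handler may behave differently.
def mode_off(key, pixels):
    pixels.fill((0, 0, 0))
    pixels.show()
    while key == state_key():
        time.sleep(0.5)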
def think(self, game):
    legal_moves = game.board.get_legal_nearby_moves(2) or [(7, 7)]
    values_dict = {}
    tmp_board = game.board.board
    pattern_array = []
    white_will_win = 0
    black_will_win = 0
    max_point = (-1, -1)
    max_eval_move = (-1, -1)
    if game.current_player.stone_color == 'b':
        max_eval = -10000
    else:
        max_eval = 10000
    occurence = utils.pattern_occurrence(game.board.board, self.load_pattern)
    od_value = sum([a * b for a, b in zip(occurence, self.mul_values)])
    for x, y in legal_moves:
        tmp_board[x][y] = game.current_player.stone_color
        pattern = utils.extract_features(tmp_board, config.pattern_file_name)
        pattern_array.append(pattern)
        state = utils.get_state(tmp_board)
        self_occurence = utils.pattern_occurrence(tmp_board, self.load_pattern)
        self_value = sum([a * b for a, b in zip(self_occurence, self.mul_values)])
        if game.current_player.stone_color == 'b':
            if self_value > max_eval:
                max_eval = self_value
                max_eval_move = (x, y)
            elif self_value == max_eval:
                if random.randint(0, 9) >= 4:
                    max_eval_move = (x, y)
        elif game.current_player.stone_color == 'w':
            if self_value < max_eval:
                max_eval = self_value
                max_eval_move = (x, y)
            elif self_value == max_eval:
                if random.randint(0, 9) >= 4:
                    max_eval_move = (x, y)
        if state == 1:
            print('b win')
            black_will_win = 1
            max_point = (x, y)
        elif state == 2:
            print('w win')
            white_will_win = 1
            max_point = (x, y)
        tmp_board[x][y] = '.'
    if max_eval_move == (-1, -1):
        max_eval_move = random.choice(legal_moves)
    values = self.CNN.run_value(pattern_array)
    value_set = set()
    for index, (x, y) in enumerate(legal_moves):
        values_dict[(x, y)] = values[index]
        value_set.add(values[index][0])
    if black_will_win == 0 and white_will_win == 0:
        if random.randint(0, 9) >= 3 and len(value_set) >= 5:
            # print("set len:", len(value_set))
            if game.current_player.stone_color == 'b':
                max_point = max(values_dict.items(), key=operator.itemgetter(1))[0]
            else:
                max_point = min(values_dict.items(), key=operator.itemgetter(1))[0]
        else:
            max_point = max_eval_move
            # max_point = random.choice(legal_moves)
    tmp_board[max_point[0]][max_point[1]] = game.current_player.stone_color
    self._feature = utils.extract_features(game.board.board, config.pattern_file_name)
    new_pattern = utils.extract_features(tmp_board, config.pattern_file_name)
    print(max_point)
    # print(values_dict[max_point])
    # print("new_pattern", new_pattern)
    # reward
    if black_will_win == 1:
        print("learning...reward 1")
        print(self.CNN.run_learning([[1.]], [self._feature], [new_pattern]))
    elif white_will_win == 1:
        print("learning...reward -1")
        print(self.CNN.run_learning([[-1.]], [self._feature], [new_pattern]))
    else:
        new_occurence = utils.pattern_occurrence(tmp_board, self.load_pattern)
        print("new_occur", new_occurence)
        self_occurence = utils.pattern_occurrence(game.board.board, self.load_pattern)
        self_value = sum([a * b for a, b in zip(self_occurence, self.mul_values)])
        new_value = sum([a * b for a, b in zip(new_occurence, self.mul_values)])
        print("self value:", self_value)
        print("new value:", new_value)
        if new_value > self_value:
            print("learning...reward 0.x")
            print(self.CNN.run_learning([[0.00001 * (new_value - self_value)]],
                                        [self._feature], [new_pattern]))
        elif new_value < self_value:
            print("learning...reward -0.x")
            print(self.CNN.run_learning([[0.00001 * (new_value - self_value)]],
                                        [self._feature], [new_pattern]))
        else:
            print("reward 0")
            print(self.CNN.run_learning([[0.]], [self._feature], [new_pattern]))
    return max_point
def get_state_name(self):
    """Returns the current workflow state name of the object."""
    return str(self.current_state or utils.get_state(self))
def current_state(self):
    """Return the state of the current object."""
    return get_state(self.context)
def state_view():
    if request.method == "POST":
        state = request.json.get("state")
        set_state(state)
        print(state)
    return jsonify(get_state())
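# A minimal sketch of how the state endpoint above might be exercised with Flask's test
# client. The route path '/state' and the surrounding app object are assumptions, not part
# of the original code.
def example_state_roundtrip(app):
    with app.test_client() as client:
        client.post('/state', json={"state": {"mode": "solid",
                                              "color": {"r": 0, "g": 128, "b": 255}}})
        current = client.get('/state').get_json()
        print(current)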
def single_train(envs, agents, core_env, core_agent, n_episodes, agent_n, exp, exp_name, render=False):
    """
    Training step for single-agent settings.

    Parameters
    ----------
    envs: list of Environment
        List of environments for the multi-agent setting
    agents: list of Agent
        List of agents used to create candidates for core_agent
    core_env: Environment
        Main environment of this training step
    core_agent: Agent
        Main agent of this training step
    n_episodes: int
        The number of episodes
    agent_n : int
        The number of agents
    exp: Experiment
        The Experiment object used by hyperdash
    exp_name: str
        The name of the experiment
    render: boolean, default False
        Whether to render the environment
    """
    print("INFO: Single mode...")
    for episode in range(n_episodes):
        # 0. Initialize the environment, state and agent params
        obs = core_env.reset()
        core_state = utils.get_state(obs)
        core_agent.reset_total_reward()
        core_agent.set_state(core_state)
        for t in count():
            if episode == 0 and t == 0:
                core_agent_action = core_agent.select_action(core_agent.get_state(), is_first=True)
            else:
                core_agent_action = core_agent.select_action(core_agent.get_state())
            core_agent.set_action(core_agent_action)
            core_obs, core_reward, core_done, core_info = core_agent.get_env().step(core_agent.get_action())
            core_agent.set_step_retrun_value(core_obs, core_done, core_info)
            core_agent.set_done_state(core_done)
            core_agent.set_total_reward(core_reward)
            if not core_done:
                core_next_state = utils.get_state(core_obs)
            else:
                core_next_state = None
            core_reward = torch.tensor([core_reward], device=core_agent.CONSTANTS.DEVICE)
            core_agent.memory.push(core_agent.get_state(), core_agent.get_action().to('cpu'),
                                   core_next_state, core_reward.to('cpu'))
            core_agent.set_state(core_next_state)
            if core_agent.steps_done > core_agent.CONSTANTS.INITIAL_MEMORY:
                core_agent.optimize_model()
                if core_agent.steps_done % core_agent.CONSTANTS.TARGET_UPDATE == 0:
                    core_agent.target_net.load_state_dict(core_agent.policy_net.state_dict())
            if core_agent.is_done():
                print("\n")
                break
            exp.log("{}: Current core_agent reward: {} | Episode:{}\n".format(
                t, core_agent.get_total_reward(), episode))
            core_agent.writer.add_scalar("core/reward/all_step",
                                         core_agent.get_total_reward(), core_agent.steps_done)
            # print("Current core_agent reward: {}".format(core_agent.get_total_reward()))
        if episode % core_agent.CONSTANTS.MODEL_SAVING_FREQUENCY == 0:
            with open(core_agent.CONSTANTS.OUTPUT_DIRECTORY_PATH
                      + "/model_tmp/{}-policy".format(core_agent.get_name()), 'wb') as f:
                cloudpickle.dump(core_agent.target_net, f)
            with open(core_agent.CONSTANTS.OUTPUT_DIRECTORY_PATH
                      + "/model_tmp/{}-target".format(core_agent.get_name()), 'wb') as f:
                cloudpickle.dump(core_agent.target_net, f)
        t_reward = core_agent.get_total_reward()
        o_reward = core_agent.get_obtained_reward()
        exp.metric("total_reward", t_reward)
        exp.metric("steps", t)
        exp.metric("obtained_reward", o_reward)
        out_str = 'Total steps: {} \t Episode: {}/{} \t Total reward: {}'.format(
            core_agent.steps_done, episode, t, core_agent.get_total_reward())
        if episode % 20 == 0:
            print(out_str)
            out_str = str("\n" + out_str + "\n")
            exp.log(out_str)
        else:
            exp.log(out_str)
        with open(core_agent.CONSTANTS.TRAIN_LOG_FILE_PATH, 'a') as f:
            f.write(str(out_str) + "\n")
        core_agent.writer.add_scalar("core/reward/total", core_agent.get_total_reward(), episode)
        core_agent.writer.add_scalar("core/steps/total", t, episode)
        core_agent.writer.add_scalars("telemetry",
                                      {"steps": t, "reward": core_agent.get_total_reward()}, episode)
        core_agent.writer.add_scalar("core/obtained_reward/",
                                     core_agent.get_obtained_reward(), episode)
    core_env.close()
    core_agent.writer.close()
def dashboard():
    return render_template("index.html", sensors=get_state(app.sensors_path))


@app.route("/id_pub")
def train(envs, agents, core_env, core_agent, n_episodes, agent_n, exp, exp_name, render=False):
    """
    Training step.

    In this code, we use multiple agents to create candidates for the core agent.
    The core agent and its environment are the main RL pair. In addition, each agent
    has its own environment and a durability value. Each agent's reward is checked for
    the specified number of episodes, and if an agent is not selected as the best agent,
    that agent's durability is reduced.

    Parameters
    ----------
    envs: list of Environment
        List of environments for the multi-agent setting
    agents: list of Agent
        List of agents used to create candidates for core_agent
    core_env: Environment
        Main environment of this training step
    core_agent: Agent
        Main agent of this training step
    n_episodes: int
        The number of episodes
    agent_n : int
        The number of agents
    exp: Experiment
        The Experiment object used by hyperdash
    exp_name: str
        The name of the experiment
    render: boolean, default False
        Whether to render the environment
    """
    _count = 0
    for episode in range(n_episodes):
        # 0. Initialize the environment, state and agent params
        obs = core_env.reset()
        core_state = utils.get_state(obs)
        core_agent.reset_total_reward()
        core_agent.set_state(core_state)
        for agent in agents:
            obs = agent.get_env().reset()
            state = utils.get_state(obs)
            agent.set_state(state)
            agent.reset_total_reward()
            # agent.durability = DEFAULT_DURABILITY
        for t in count():
            # if t % 20 != 0:
            #     print(str(t) + " ", end='')
            # else:
            #     print("\n")
            exp.log("agent_durability:{}".format([agent.get_durability() for agent in agents]))
            for agent in agents:
                if agent.get_state() is not None and len(agents) > 1:
                    agent.writer.add_scalar("internal/durability/{}".format(agent.get_name()),
                                            agent.get_durability(), _count)
                    utils.write_csv([_count, agent.get_durability()],
                                    core_agent.CONSTANTS.OUTPUT_DIRECTORY_PATH
                                    + "/{}-durability.csv".format(agent.get_name()))
                else:
                    utils.write_csv([_count, 0],
                                    core_agent.CONSTANTS.OUTPUT_DIRECTORY_PATH
                                    + "/{}-durability.csv".format(agent.get_name()))
            _count += 1
            # print(str(t) + " ", end='')

            # 1. Select an action from each agent's environment
            for agent in agents:
                if agent.get_state() is not None and len(agents) > 1:
                    # if agent.get_state() is not None:
                    # agent.set_env(core_agent.get_env())
                    # agent.set_state(core_agent.get_state())
                    # agent.set_init_state(core_agent.get_state())
                    agent.set_init_state(agent.get_state())
                    if episode == 0 and t == 0:
                        action = agent.select_action(agent.get_state(), is_first=True)
                    else:
                        action = agent.select_action(agent.get_state())
                    agent.set_action(action)

            # 2. Advance each agent by one step
            for agent in agents:
                if agent.get_state() is not None:
                    # if agent.get_state() is not None and len(agents) > 1:
                    obs, reward, done, info = agent.get_env().step(agent.get_action())
                    agent.set_step_retrun_value(obs, done, info)
                    agent.set_total_reward(reward)  # Agent reward value
                    # print("Agent:{}, Reward:{}, State:{}".format(agent.name, reward, agent.get_state()))
                    # print("Agent:{}, Reward:{}".format(agent.name, reward))
                    if not done:
                        next_state = utils.get_state(obs)
                    else:
                        next_state = None
                    reward = torch.tensor([reward], device=agent.CONSTANTS.DEVICE)
                    agent.memory.push(agent.get_state(), agent.get_action().to('cpu'),
                                      next_state, reward.to('cpu'))
                    agent.set_state(next_state)
                    if agent.steps_done > agent.CONSTANTS.INITIAL_MEMORY:
                        agent.optimize_model()
                        if agent.steps_done % agent.CONSTANTS.TARGET_UPDATE == 0:
                            agent.target_net.load_state_dict(agent.policy_net.state_dict())
            # print("\n")
            # print([agent.get_total_reward() for agent in agents])
            exp.log([agent.get_total_reward() for agent in agents])
            # print(str(t) + " ", end='')

            # ---------------
            # Proposal method
            # ---------------
            # 3. Select the best agent in this step
            if len(agents) > 1:
                best_agent = utils.select_best_agent(agents, core_agent.CONSTANTS.ROULETTE_MODE,
                                                     max_reward=core_agent.CONSTANTS.MAX_REWARD,
                                                     min_reward=core_agent.CONSTANTS.MIN_REWARD)
                # best_agent.best_counter()
                [agent.best_counter() for agent in agents if agent.get_name() == best_agent.get_name()]
                # for agent in agents:
                #     if agent.get_name() == best_agent.get_name():
                #         agent.best_counter()
                core_agent.memory.push(best_agent.get_init_state(),
                                       best_agent.get_action().to('cpu'),
                                       best_agent.get_next_state(),
                                       torch.tensor([best_agent.get_reward()],
                                                    device=best_agent.CONSTANTS.DEVICE).to('cpu'))
                for agent in agents:
                    agent.writer.add_scalar("internal/reward/{}/all_step".format(agent.get_name()),
                                            agent.get_total_reward(), core_agent.steps_done)
                    agent.writer.add_scalar("internal/obtained_reward/{}".format(agent.get_name()),
                                            agent.get_obtained_reward(), episode)
                # core_agent_action = best_agent.get_action()
                # best_agent_state = best_agent.get_state()
                # policy_net_flag = best_agent.get_policy_net_flag()
                # best_agent_action = best_agent.get_action()

            # 3.5 Only the best agent can heal its own durability, at the specified interval
            if t % core_agent.CONSTANTS.DURABILITY_HEALING_FREQUENCY == 0 and len(agents) > 1:
                # best_agent.heal_durability(core_agent.CONSTANTS.DEFAULT_DURABILITY_INCREASED_LEVEL)
                [agent.heal_durability(core_agent.CONSTANTS.DEFAULT_DURABILITY_INCREASED_LEVEL)
                 for agent in agents if agent.get_name() == best_agent.get_name()]

            # Best agent information
            # exp.log("{}: Current best agent: {}, Disabilities:{}".format(t, best_agent.name,
            #         [agent.durability() for agent in agents]))
            # print("{}: Current best agent: {}, Reward:{}".format(t, best_agent.name, best_agent.get_total_reward()))
            exp.log("{}: Current best agent: {}, Reward:{}".format(
                t, best_agent.get_name(), best_agent.get_total_reward()))

            # 4. Check agent durability at the specified interval
            if t % core_agent.CONSTANTS.DURABILITY_CHECK_FREQUENCY == 0:
                if len(agents) > 1:
                    # index = [i for i in range(len(agents)) if i not in best_agents]
                    index = [i for i, agent in enumerate(agents)
                             if agent.get_name() != best_agent.get_name()]
                    for i in index:
                        if agents[i].get_state() is not None:
                            agents[i].reduce_durability(core_agent.CONSTANTS.DEFAULT_DURABILITY_DECREASED_LEVEL)

            # 5. Kill exhausted agents
            if len(agents) > 1:
                for i, agent in enumerate(agents):
                    if agent.get_durability() <= 0:
                        del agents[i]

            # 6. Main step of the core agent
            # core_agent_action = core_agent.select_core_action(best_agent_state, policy_net_flag, best_agent_action)
            if episode == 0 and t == 0:
                core_agent_action = core_agent.select_action(core_agent.get_state(), is_first=True)
            else:
                core_agent_action = core_agent.select_action(core_agent.get_state())
            core_agent.set_action(core_agent_action)
            core_obs, core_reward, core_done, core_info = core_agent.get_env().step(core_agent.get_action())
            core_agent.set_step_retrun_value(core_obs, core_done, core_info)
            core_agent.set_done_state(core_done)
            core_agent.set_total_reward(core_reward)
            if not core_done:
                core_next_state = utils.get_state(core_obs)
            else:
                core_next_state = None
            core_reward = torch.tensor([core_reward], device=core_agent.CONSTANTS.DEVICE)
            core_agent.memory.push(core_agent.get_state(), core_agent.get_action().to('cpu'),
                                   core_next_state, core_reward.to('cpu'))
            core_agent.set_state(core_next_state)
            if core_agent.steps_done > core_agent.CONSTANTS.INITIAL_MEMORY:
                core_agent.optimize_model()
                if core_agent.steps_done % core_agent.CONSTANTS.TARGET_UPDATE == 0:
                    core_agent.target_net.load_state_dict(core_agent.policy_net.state_dict())
            if core_agent.is_done():
                print("\n")
                break
            exp.log("{} steps | Current core_agent reward: {} | Episode:{}\n".format(
                t, core_agent.get_total_reward(), episode))
            core_agent.writer.add_scalar("core/reward/all_step",
                                         core_agent.get_total_reward(), core_agent.steps_done)
            for agent in agents:
                agent.writer.add_scalar("internal/reward/{}/episode".format(agent.get_name()),
                                        agent.get_total_reward(), episode)
            # print("Current core_agent reward: {}".format(core_agent.get_total_reward()))
            # ----------------------
            # End of proposal method
            # ----------------------

        if episode % core_agent.CONSTANTS.MODEL_SAVING_FREQUENCY == 0:
            for agent in agents:
                with open(core_agent.CONSTANTS.OUTPUT_DIRECTORY_PATH
                          + "/model_tmp/{}-policy".format(agent.get_name()), 'wb') as f:
                    cloudpickle.dump(agent.policy_net, f)
                with open(core_agent.CONSTANTS.OUTPUT_DIRECTORY_PATH
                          + "/model_tmp/{}-target".format(agent.get_name()), 'wb') as f:
                    cloudpickle.dump(agent.target_net, f)
                agent.writer.add_scalar("internal/obtained_reward/{}".format(agent.get_name()),
                                        agent.get_obtained_reward(), episode)
            with open(core_agent.CONSTANTS.OUTPUT_DIRECTORY_PATH
                      + "/model_tmp/{}-policy".format(core_agent.get_name()), 'wb') as f:
                cloudpickle.dump(core_agent.target_net, f)
            with open(core_agent.CONSTANTS.OUTPUT_DIRECTORY_PATH
                      + "/model_tmp/{}-target".format(core_agent.get_name()), 'wb') as f:
                cloudpickle.dump(core_agent.target_net, f)

        t_reward = core_agent.get_total_reward()
        o_reward = core_agent.get_obtained_reward()
        exp.metric("total_reward", t_reward)
        exp.metric("steps", t)
        exp.metric("obtained_reward", o_reward)
        out_str = 'Total steps: {} \t Episode: {}/{} \t Total reward: {}'.format(
            core_agent.steps_done, episode, t, core_agent.get_total_reward())
        if episode % 20 == 0:
            print(out_str)
            out_str = str("\n" + out_str + "\n")
            exp.log(out_str)
        else:
            # print(out_str)
            exp.log(out_str)
        with open(core_agent.CONSTANTS.TRAIN_LOG_FILE_PATH, 'a') as f:
            f.write(str(out_str) + "\n")
        core_agent.writer.add_scalar("core/reward/total", core_agent.get_total_reward(), episode)
        core_agent.writer.add_scalar("core/steps/total", t, episode)
        core_agent.writer.add_scalars("telemetry",
                                      {"steps": t, "reward": core_agent.get_total_reward()}, episode)
        core_agent.writer.add_scalar("core/obtained_reward/",
                                     core_agent.get_obtained_reward(), episode)

    core_env.close()
    core_agent.writer.close()
    for agent in agents:
        agent.writer.close()
    for agent in agents:
        agent.get_env().close()
    del best_agent