def __init__(self):
    self.n_episode = []
    self.n_epsilon = []
    self.n_dist = []
    self.avg_err = []

    # Parameters
    self.n_episodes = rospy.get_param("/n_episodes")
    self.avg_err_fre = rospy.get_param('/avg_err_fre')

    # create environment
    self.env = Env()
    self.n_states = self.env.observation_space
    self.n_actions = self.env.action_space.n

    # create Deep Q-Network
    self.dqn = DQN(self.n_states, self.n_actions)

    # load DQN weight
    rospack = rospkg.RosPack()
    data_path = rospack.get_path("pioneer_dragging") + "/data"
    username = rospy.get_param("/username")
    n_folder = rospy.get_param("/n_folder")

    self.dqn.file_models = "{0}/{1}-{2}/{2}-pytorch-RL.tar".format(data_path, username, n_folder)
    self.dqn.load_model()

    # plot
    self.color1 = 'tab:green'
    self.color2 = 'tab:blue'
    self.color3 = 'tab:orange'
    self.color4 = 'tab:red'

    self.style_plot = random.choice(plt.style.available)
    plt.style.use(self.style_plot)
    plt.ion()

    ###########
    # Figure 1 - Rewards
    self.fig1 = plt.figure(1)
    self.ax1 = self.fig1.add_subplot(1, 1, 1)
    self.ax2 = self.ax1.twinx()

    title_1 = 'Rewards - (Mode: Testing)'
    self.ax1.set_title(title_1)
    self.ax1.set_xlabel('Episode')
    self.ax1.set_ylabel('Reward', color=self.color1)
    self.ax2.set_ylabel('Epsilon', color=self.color2)
    self.ax1.tick_params(axis='y', labelcolor=self.color1)
    self.ax2.tick_params(axis='y', labelcolor=self.color2)

    ###########
    # Figure 2 - Error
    self.fig2 = plt.figure(2)
    self.ax3 = self.fig2.add_subplot(1, 1, 1)

    title_2 = 'Error Distance - (Mode: Testing)'
    self.ax3.set_title(title_2)
    self.ax3.set_xlabel('Episode')
    self.ax3.set_ylabel('Meter')
def UCTPlayGame():
    """ Play a sample game between two UCT players where each player gets a different number
        of UCT iterations (= simulations = tree nodes).
        The state is the environment information shared by both players;
        a different player acts depending on state.playerJustMoved.
    """
    # state = OthelloState(4)  # uncomment to play Othello on a square board of the given size
    state = Env()  # uncomment to play OXO
    # state = NimState(15)  # uncomment to play Nim with the given number of starting chips
    while state.board.finished == False:
        print(str(state))
        if state.playerJustMoved == 1:
            # play with values for itermax and verbose = True
            m = UCT(rootstate=state, itermax=100, verbose=True)
        else:
            # play with values for itermax and verbose = True
            m = UCT(rootstate=state, itermax=100, verbose=True)
        print("Best Move: " + str(m) + "\n")
        state.DoMove(m)
        # time.sleep(5)
    if state.GetResult(state.playerJustMoved) == 1.0:
        print("Player " + str(state.playerJustMoved) + " wins!")
    elif state.GetResult(state.playerJustMoved) == 0.0:
        print("Player " + str(3 - state.playerJustMoved) + " wins!")
    else:
        print("Nobody wins!")
def __init__(self, pp, logger, pid="", gui=None, *args, **kwargs):
    self.rows, self.cols, self.n_channels = self.dims = pp['dims']
    self.pid = pid
    self.save = pp['save_exp_data']
    self.batch_size = pp['batch_size']
    self.n_hours, self.n_events = pp['n_hours'], pp['n_events']
    self.pp = pp
    self.logger = logger
    self.grid = np.zeros(pp['dims'], np.bool)
    # Contains hand-offs only
    self.hgrid = np.zeros(pp['dims'], np.bool)
    self.env = Env(pp, self.grid, self.hgrid, logger, pid)

    if (self.save or self.batch_size > 1):
        size = self.n_events if self.save else pp['buffer_size']
        self.exp_buffer = PrioritizedReplayBuffer(
            size=size,
            rows=self.rows,
            cols=self.cols,
            n_channels=self.n_channels,
            alpha=0.6)  # original: 0.6
        self.pri_beta_schedule = LinearSchedule(
            self.n_events,
            initial_p=0.4,  # pp['prioritized_replay_beta'],
            final_p=1.0)  # original: 1.0
        self.prioritized_replay_eps = float(1e-6)

    self.quit_sim, self.invalid_loss, self.exceeded_bthresh = False, False, False
    self.i, self.t = 0, 0.1  # Iteration, time
    signal.signal(signal.SIGINT, self.exit_handler)
def test(self, num_actions):
    self.saver.restore(self.session, FLAGS.checkpoint_path)
    print "Restored model weights from ", FLAGS.checkpoint_path

    monitor_env = gym.make(FLAGS.game)
    monitor_env.monitor.start("/tmp/" + FLAGS.game, force=True)
    # wrap the monitored gym environment with the preprocessing wrapper
    env = Env(monitor_env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)

    for i_episode in xrange(FLAGS.num_eval_episodes):
        state = env.get_initial_state()
        episode_reward = 0
        done = False

        # create state sequence
        state_sequence = np.zeros((t_max, FLAGS.history_length, FLAGS.width, FLAGS.height))
        state_sequence[t_max - 1, :, :, :] = state

        while not done:
            monitor_env.render()
            q_values = self.q_values.eval(session=self.session,
                                          feed_dict={self.state: [state_sequence]})
            action_index = np.argmax(q_values)
            new_state, reward, done = env.step(action_index)
            state = new_state

            # update state sequence
            state_sequence = np.delete(state_sequence, 0, 0)
            state_sequence = np.insert(state_sequence, t_max - 1, state, 0)
            episode_reward += reward

        print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)
    monitor_env.monitor.close()
def __init__(self, is_training, is_obstacle=True):
    Env.__init__(self, is_training)
    Gazebo.__init__(self)
    self.is_obstacle = is_obstacle
    spp = self.get_physics_properties()
    self.set_physics_properties(time_step=spp.time_step,
                                max_update_rate=0.0,
                                gravity=spp.gravity,
                                ode_config=spp.ode_config)
    self._spawn_models()
def main(args):
    embedder_hidden_sizes = args.embedder_hidden_sizes
    embedded_dim = args.embedded_dim
    lstm_size = args.lstm_size
    n_shuffle = args.n_shuffle
    clf_hidden_sizes = args.clf_hidden_sizes
    policy_hidden_sizes = args.policy_hidden_sizes
    shared_dim = args.shared_dim
    nsteps = args.nsteps
    n_envs = args.n_envs
    data_type = args.data_type
    r_cost = args.r_cost

    # TODO data load first, classifier defining and declare env
    traindata, valdata, testdata = data_load(data_type=args.data_type,
                                             random_seed=args.random_seed)
    input_dim = traindata.n_features + 1
    clf_output_size = traindata.n_classes if traindata.n_classes > 2 else 1

    encoder = SetEncoder(input_dim,
                         traindata.n_features,
                         embedder_hidden_sizes,
                         embedded_dim,
                         lstm_size,
                         n_shuffle,
                         normalize=args.normalize,
                         dropout=args.dropout,
                         p=args.p)
    dfsnet = DFSNet(encoder=encoder,
                    classifier=MLP(lstm_size + embedded_dim,
                                   clf_hidden_sizes,
                                   clf_output_size,
                                   dropout=args.dropout,
                                   p=args.p,
                                   batch_norm=args.batchnorm),
                    policy=DuelingNet(lstm_size + embedded_dim,
                                      policy_hidden_sizes,
                                      shared_dim,
                                      traindata.n_actions))
    dfsnet.to(args.device)

    step_runner = StepRunner(dfsnet, args)

    env = Env(args, n_envs, r_cost, traindata, step_runner.classify)
    valenv = Env(args, n_envs, r_cost, valdata, step_runner.classify)
    testenv = Env(args, n_envs, r_cost, testdata, step_runner.classify)
    env.classify = step_runner.classify
    valenv.classify = step_runner.classify
    testenv.classify = step_runner.classify

    learn_start = time()
    learn(step_runner, args, env, valenv, nsteps=nsteps, total_steps=int(5e6),
          scheduler=args.scheduler)
    learn_elapsed = time() - learn_start

    dfsnet.eval()
    test_and_record(step_runner, args, env, valenv, testenv)
    print(learn_elapsed)
def start(self):
    self.env = Env(length=10, height=2, Nstep=10)
    self.agent = QLearningAgent(
        alpha=0.5,
        epsilon=0.1,
        discount=0.99,
        get_legal_actions=lambda s: range(len(self.env.actions)))
    self.agent.load_param("qtable.pickle")
    print("================Qtable:================")
    for m in self.agent._qvalues:
        print(m, self.agent._qvalues[m])

    rewards = []
    replay = ReplayBuffer(1000)
    for i in range(500):
        rewards.append(self.one_episode(2000, replay=replay))
        # OPTIONAL YOUR CODE: adjust epsilon
        self.agent.epsilon *= 0.9999
        print("+++++++++++++++++++++++++++++++++++++++++++++++")
        print("episode = ", i, 'eps =', self.agent.epsilon,
              'mean reward =', np.mean(rewards[-10:]))
        print("+++++++++++++++++++++++++++++++++++++++++++++++")
        # if i % 2 == 0:
        self.agent.save_param("qtable.pickle")
        self.agent.save_param("bak.pickle")
def start(self):
    self.env = Env(height=3, discrete=True)
    self.action_lib = [[-1, -1], [-1, 1], [-1, 0],
                       [0, -1], [0, 1], [0, 0],
                       [1, -1], [1, 1], [1, 0]]
    self.agent = QLearningAgent(
        alpha=0.5,
        epsilon=0.2,
        discount=0.9,
        get_legal_actions=lambda s: range(len(self.action_lib)))
    self.agent.load_param("q_table3.pickle")
    # print("================Qtable:================")
    # for m in self.agent._qvalues:
    #     print(m, self.agent._qvalues[m])

    rewards = []
    replay = ReplayBuffer(10000)
    for i in range(500):
        mr, me = self.one_episode(replay, t_max=5000)
        rewards.append(mr)
        # OPTIONAL YOUR CODE: adjust epsilon
        # self.agent.epsilon *= 0.99
        print("+++++++++++++++++++++++++++++++++++++++++++++++")
        print("episode = ", i, 'mean reward =', np.mean(rewards[-10:]))
        print("+++++++++++++++++++++++++++++++++++++++++++++++")
        # if i % 2 == 0:
        title = "mr:" + str(mr) + "me:" + str(me)
        self.env.ROSNode.log_showfigure(title)
        self.agent.save_param("q_table3.pickle")
        self.agent.save_param("q_table_bak.pickle")
def main():
    # num = sys.argv[1]
    num = '81'
    print(num)
    env_name = 'SuperMarioBros-{}-{}-v0'.format(num[0], num[1])
    print(env_name)

    communication = Communication(child_num=process_num)
    brain = ACBrain(talker=communication.master, env_name=env_name)
    envs_p = []
    seed = get_seed()
    for i in range(process_num):
        agent = Agent(talker=communication.children[i])
        env_temp = Env(agent, i, seed=seed + i, env_name=env_name)
        envs_p.append(Process(target=env_temp.run, args=()))

    for i in envs_p:
        i.start()

    tfb_p = subprocess.Popen(['tensorboard', '--logdir', "./logs/scalars"])
    brain.run()

    for p in envs_p:
        p.terminate()
    tfb_p.kill()
def learn(self, env, max_episodes=MAX_EPISODES):
    for episode in range(0, max_episodes):
        print(episode)
        state = env.init()
        done = False
        while not done:
            goal = self.ac_agent.choose_goal(state)

            # low-level learn action according to sub-goal
            best_action, r_in_his, train_limit = [], [], 500
            while len(r_in_his) < train_limit:
                action = self.ts_agent.choose_action(state, [goal])
                r_in = Env.intrinsic_reward(state, goal, action)
                if r_in_his == [] or r_in > max(r_in_his):
                    best_action = action
                r_in_his.append(r_in)
                self.ts.memory.push(state, action, r_in)
                self.ts.train()

            # high-level learn sub-goal
            action = best_action
            next_state, reward, done, env_state = env.step(state, action)
            print(reward, env_state)
            if not done:
                self.ac.memory.push(state, action, reward, next_state)
                self.ac.train()
            state = next_state

        # anneal exploration probability
        self.ac.anneal_epsilon()
        self.ts.anneal_epsilon()

    # save model
    self.save_model()
def main():
    """
    1. Read and parse PDF files into month-wise DF
    2. Create CSV by combining all the DFs
    3. Create visualizations from the CSV
    :return: None
    """
    logger = Env.setup_logging()
    path_base = os.path.join(os.getcwd(), "PDF")
    csv_path = os.path.join(os.getcwd(), "output", "Salary_Slips_Merged.csv")
    pdf_path_list = sorted([os.path.join(path_base, file_name)
                            for file_name in os.listdir(path_base)
                            if (os.path.isfile(os.path.join(path_base, file_name))
                                and os.path.basename(file_name).endswith(".pdf")
                                and ("Pay" in file_name))])

    combined_payslip_df = pd.DataFrame()
    for pdf_path in pdf_path_list:
        monthly_df = pdf_to_df(pdf_path, logger)
        combined_payslip_df = combined_payslip_df.append(monthly_df)

    # ensure the output directory exists before writing the CSV
    if not os.path.exists(os.path.dirname(csv_path)):
        os.mkdir(os.path.dirname(csv_path))
    combined_payslip_df.to_csv(csv_path, index=None)

    side_by_side_bar_plot(csv_path)
    line_plot(csv_path)
def experiment1_test(
        output_folder,
        word_vectors,
        agent,
        episode_index,
        testset_path='./dataset/conll2003/en/eng.testb',
):
    # initialize the environment
    env = Env(testset_path, word_vectors)
    step = 0
    s = env.reset()
    print('[' + util.now_time() + "] start testing...")
    while True:
        # check whether the task has ended
        if env.end():
            print('[' + util.now_time() + "] testing...done")
            result_file = '%03d_episode_test.txt' % (episode_index + 1)
            env.save_all_newlines_to_file(output_folder, result_file)
            return evaluate.conlleval(output_folder, result_file)

        # Choose Action a
        a = agent.choose_action(s)
        # Execute action
        s_, r = env.step(a)
        # Next status
        step += 1
        s = s_
def test(self, env):
    # initialize environment
    env = Env(env, 84, 84, 4)
    terminal = False

    # Get initial game observation
    state = env.get_initial_state()

    # episode's reward and cost
    episode_reward = 0

    for _ in range(100):
        while not terminal:
            # forward pass of network. Get probability of all actions
            probs, v = self.sess.run((self.policy, self.state_value),
                                     feed_dict={self.input_state: [state]})
            probs = probs[0]
            v = v[0][0]
            if random.random() < 0.01:
                action_index = random.choice([0, 1, 2, 3])
            else:
                action_index = np.argmax(probs)

            # Gym executes action in game environment on behalf of actor-learner
            new_state, reward, terminal = env.step(action_index)
            env.env.render()

            # clip reward to -1, 1
            # Update the state and global counters
            state = new_state

            # update episode's counter
            episode_reward += reward

        if terminal:
            terminal = False
            print "THREAD:", self.thread_id, "/ TIME", T, "/ REWARD", \
                episode_reward, "/ COST"
            episode_reward = 0
            counter = 0

            # Get initial game observation
            state = env.get_initial_state()
class CarEnvironment(Environment):
    def __init__(self):
        self.action = [0.0, 0.0]
        self.delay = False
        self.grid = Grid()
        self.env = Env(self.grid)
        self.reset()

    def step(self):
        # Simulate a step in the environment
        self.agent.brain.stored_action = self.action
        self.env.tick()
        self.env.calculate_moves()
        self.env.do_moves()
        self.env.print_env(self.env.tick_number)

    def reset(self):
        self.env.reset()
        self.agent = self.env.cars[0]
        # (x, y, vel, deg, goal_x, goal_y, dist_to_goal, dir_of_goal, direction_diff)
        self.sensors = self.agent.brain.get_state_tuple()
        self.distance_to_goal = self.agent.brain.distance_to_goal()

    def getSensors(self):
        # (x, y, vel, deg, goal_x, goal_y, dist_to_goal, dir_of_goal, direction_diff)
        return self.agent.brain.get_state_tuple()

    def getCarState(self):
        return self.agent.brain.get_state()

    def in_goal_state(self):
        return self.agent.brain.get_state().reached_goal

    def performAction(self, action):
        self.action = action
        self.step()

    def indim(self):
        return 2

    def outdim(self):
        return len(self.getSensors())
def objective(self, params):
    """
    Computes the mean squared error between the network's predictions and the
    true Q-values. The network is trained based on the input parameters.

    Arguments
    ---------
    :param params : dict
        Dictionary containing values for hyperparameters to be optimized.

    Returns
    -------
    :returns : dict
        Dictionary containing the mean squared error between true and estimated
        (using the parameter configuration from params) Q-values and the status
        of the optimization.
    """
    a = params["lr"]
    b = params["lr decay"]
    c = params["batch size"]
    d = params["target update"]

    # initialize the RL-agent:
    agent = AgentDQN(dim_state=self.dim_state,
                     dim_actions=self.dim_actions,
                     hidden_dims=self.hidden_dims,
                     optimizer=Adam(lr=a, decay=b),
                     gamma=self.gamma,
                     eps=self.eps,
                     eps_decay=self.eps_decay,
                     frozen=self.frozen,
                     pretrained=self.pretrained)

    # initialize the environment:
    env = Env(start=self.start,
              tcost=self.tcost,
              horizon=self.horizon,
              w=self.w,
              theta=self.theta,
              regimes=self.regimes)

    trained_agent, _, _, _, _, _, _ = train_dqn(agent, env, self.train_episodes,
                                                c, self.init_d_size, self.max_d_size,
                                                d, self.freeze_after)

    pred = trained_agent.qnn.predict(self.x)
    true = self.y
    mse = np.mean((pred - true)**2)

    return {"loss": mse, "status": STATUS_OK}
def experiment1_train(
        output_folder,
        word_vectors,
        n_episodes=300,
        trainset_path='./dataset/conll2003/en/eng.train',
):
    # initialize the environment
    print('[' + util.now_time() + "] init environment...")
    env = Env(trainset_path, word_vectors)
    print('[' + util.now_time() + "] environment initialized")

    # initialize the DQN
    print('[' + util.now_time() + "] init agent...")
    agent = DQN(n_actions=env.n_actions,
                status_dim=env.status_dim,
                action_dim=env.action_dim,
                reward_dim=env.reward_dim)
    print('[' + util.now_time() + "] agent initialized")

    # iterate over episodes
    for i in range(n_episodes):
        print('[' + util.now_time() + "] start episode %03d of learning..." % (i + 1))
        step = 0
        s = env.reset()
        while True:
            # check whether the task has ended
            if env.end():
                print('[' + util.now_time() + "] episode %03d of learning...done" % (i + 1))
                result_file = '%03d_episode_train.txt' % (i + 1)
                env.save_all_newlines_to_file(output_folder, result_file)
                train_eval = evaluate.conlleval(output_folder, result_file)
                test_eval = experiment1_test(output_folder, word_vectors, agent, i)
                break

            # Choose Action a
            a = agent.choose_action(s)
            # Execute action
            # print('step %d' % step)
            s_, r = env.step(a)
            agent.store_transition(s, a, r, s_)
            step += 1
            s = s_
            if step > 200 and step % 5 == 0:
                agent.learn()

    # plot and compare train and test set TODO
    # plot(train_evals, test_evals)
    agent.eval_network.save(output_folder + os.path.sep + 'ex1_eval_model', overwrite=True)
def evaluate_file(fname):
    with open(fname) as f:
        env = Env()
        inst_q = parser.parse(f.read())  # TODO
        term = inst_q.execute(env)  # assuming this will be a Queue for now
        # break on the top level will cause execution to stop, but should really be
        # an error. ret can be used to return early.
        if term == command.LOOP_TERMINATE:
            raise exceptions.QQError("Can't break out of the main program body.")
def execute(self, env):
    fname = env.qframe.popleft()
    subqframe = env.qframe.popleft()
    body = env.fnqueue[fname].copy()
    term = body.execute(
        Env(qframe=subqframe.statements, rqueue=None, fnqueue=env.fnqueue))
    if term == command.LOOP_TERMINATE:
        raise exceptions.QQError("Can't break from a function")
    env.qframe.append(subqframe)
def test(self, num_actions):
    self.saver.restore(self.session, FLAGS.checkpoint_path)
    print "Restored model weights from ", FLAGS.checkpoint_path

    monitor_env = gym.make(FLAGS.game)
    monitor_env.monitor.start("/tmp/" + FLAGS.game, force=True)
    env = Env(monitor_env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)

    for i_episode in xrange(FLAGS.num_eval_episodes):
        state = env.get_initial_state()
        episode_reward = 0
        done = False
        while not done:
            monitor_env.render()
            probs = self.session.run(self.policy_values,
                                     feed_dict={self.state: [state]})[0]
            action_index = sample_policy_action(num_actions, probs)
            new_state, reward, done = env.step(action_index)
            state = new_state
            episode_reward += reward
        print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)
    monitor_env.monitor.close()
def cartpole():
    env = Env('localhost:32822')
    env.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.box.shape[0]
    action_space = env.action_space.discrete.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # env.render()
            print("acting on state: ", state)
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " +
                      str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                plt.plot(dqn_solver.loss)
                plt.title('Model loss')
                plt.ylabel('Loss')
                plt.xlabel('Episode')
                plt.savefig("loss.png")
                break
            dqn_solver.experience_replay()
def __init__(self, _config_path):
    self.__config = Config(_config_path)
    self.__envs = {}
    self.__timer = Timer()
    self.__repeat_times = self.__config.repeat_times
    Buffer.set_policy(self.__config.q_policy)
    for _V in self.__config.Vs:
        for _win_size in self.__config.win_sizes:
            _config = self.__config.copy()
            setattr(_config, "win_size", _win_size)
            self.__envs[(_V, _win_size)] = Env(_V, _config, self.__repeat_times)
            delattr(_config, "win_size")
def main():
    args = parse_arguments()

    # dataset, dataloader
    transforms = get_transform()
    train_dataset = Dataset.TrackData_RL(args.train_data, transform=transforms)
    train_loader = DataLoader(train_dataset,
                              num_workers=args.num_workers,
                              shuffle=True,
                              batch_size=1)

    # model, environment
    R = Reinforce(train_loader, transforms)
    env = Env(args)

    start_epoch = 1
    if args.init_sl:
        if os.path.isfile(args.init_sl):
            print("=> loading checkpoint '{}'".format(args.init_sl))
            checkpoint = torch.load(args.init_sl)
            R.agent.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.init_sl, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.init_sl))
    elif args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            R.agent.load_state_dict(checkpoint['state_dict'])
            R.optim.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    for epoch in range(start_epoch, args.max_epochs + 1):
        R.train(env, epoch, args.gamma, logging=True)
        if epoch % args.save_freq == 0:
            # save model
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': R.agent.state_dict(),
                    'optimizer': R.optim.state_dict(),
                },
                dir='cv/%s/' % args.name)
def main():
    communication = Communication(child_num=process_num)
    brain = ACBrain(talker=communication.master)
    envs_p = []
    for i in range(process_num):
        agent = Agent(talker=communication.children[i], seed=i)
        env_temp = Env(agent, i + 1)
        envs_p.append(Process(target=env_temp.run, args=()))

    for i in envs_p:
        i.start()

    brain.run()
def __init__(self, model, is_training=False, var=1.):
    self.max_step = 200
    self.exploration_decay_start_step = 50000
    state_dim = 366
    action_dim = 2
    self.action_linear_max = 0.25  # m/s
    self.action_angular_max = 0.5  # rad/s
    rospy.init_node('ddpg_stage_1')
    rospy.on_shutdown(self.clear_vel)
    self.is_training = is_training

    if ['/gazebo/model_states', 'gazebo_msgs/ModelStates'] in rospy.get_published_topics():
        self.env = SimEnv(self.is_training)
        print("Gazebo mode")
    else:
        self.env = Env(self.is_training)
        print("Real world mode")

    self.agent = DDPG(model, self.env, state_dim, action_dim)
    self.past_action = np.array([0., 0.])
    print('State Dimensions: ' + str(state_dim))
    print('Action Dimensions: ' + str(action_dim))
    print('Action Max: ' + str(self.action_linear_max) + ' m/s and ' +
          str(self.action_angular_max) + ' rad/s')
    self.var = var
def __call__(self, parser, namespace, values, option_string=None):
    environments = []
    for env_name_list in values:
        for env_name in env_name_list:
            try:
                env = Env(env_name)
            except:
                msg = "can't load environment from '{}'".format(env_name)
                raise argparse.ArgumentTypeError(msg)
            else:
                environments.append(env)
    namespace.environments = environments
def play_deterministic(self, n_tot):
    self.model.eval()
    env = Env()
    render = args.render
    n_human = 60
    humans_trajectories = iter(self.data)
    reverse_excitation_index = consts.reverse_excitation_index

    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        print("Observation %s" % observation)
        trajectory = self.data[observation]
        j = 0
        ims = []
        # fig = plt.figure()

        while not env.t:
            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                # im = plt.imshow(np.rollaxis(env.s.numpy().squeeze(0)[:3], 0, 3), animated=True)
                # ims.append([im])
                if self.cuda:
                    s = Variable(env.s.cuda(), requires_grad=False)
                else:
                    s = Variable(env.s, requires_grad=False)
                _, _, beta, _, _, _ = self.model(s)
                beta = beta.squeeze(0)
                beta = beta.sign().int() * (beta.abs() > 0.5).int()
                a = reverse_excitation_index[tuple(beta.data)]

            env.step(a)
            j += 1

        # if render:
        #     ani = animation.ArtistAnimation(fig, ims, interval=10, blit=True,
        #                                     repeat=False)
        #     plt.show()

        yield env.score
def load_env():
    # load file and data
    x_data, y_data = Acquire_data.data_aquire()
    X_train, X_test, y_train, y_test = xdata_split(x_data, y_data, test_size=.3)

    # preprocessing
    X_train, X_test = preprocessing(X_train, X_test)
    print(X_train.shape, X_test.shape)

    # save data to mat
    scio.savemat(dataNew, {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    })

    # build an env for testing
    env = Env(X_train, y_train, X_test, y_test)
    return env
def eval_args(stack, function, code):
    # Evaluate arguments
    args = code.args
    frame = StackFrame([], stack[-1].env)
    stack.append(frame)
    frame.new_env = Env(function.bindings, frame.env)
    frame.push(Instruction(Instruction.APPLY, function))
    for index, arg in enumerate(function.lambda_list):
        if isinstance(arg, List):
            if arg.items[0].symbol == "quote":
                frame.new_env.set(arg.items[1].symbol, args[index])
            else:
                print "OH NO BAD LIST ARG", arg
        else:
            frame.push(Instruction(Instruction.ARGUMENT, symbol=arg.symbol))
            frame.push(Instruction(Instruction.CODE, args[index]))
def play_episode_deterministic(self, n_tot):
    self.model.eval()
    env = Env()
    n_human = 300
    humans_trajectories = iter(self.data)
    reverse_excitation_index = consts.reverse_excitation_index

    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        trajectory = self.data[observation]
        j = 0

        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            v, q, beta, r, p, phi = self.model(s)
            beta = beta.squeeze(0)

            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                beta_index = (beta.sign().int() * (beta.abs() > 0.5).int()).data.cpu().numpy()
                beta_index[0] = abs(beta_index[0])
                a = reverse_excitation_index[tuple(beta_index.data)]

            env.step(a)
            # x = phi.squeeze(0).data.cpu().numpy()
            # print(np.mean(abs(x)))
            # yield v, q, beta, r, p, s
            yield {
                'o': env.s.cpu().numpy(),
                'v': v.data.cpu().numpy(),
                's': phi.data.cpu().numpy(),
                'score': env.score,
                'beta': beta.data.cpu().numpy(),
                'phi': phi.squeeze(0).data.cpu().numpy()
            }
            j += 1

    raise StopIteration
def play_deterministic(self, n_tot):
    self.model.eval()
    env = Env()
    render = args.render
    n_human = 60
    humans_trajectories = iter(self.data)
    reverse_excitation_index = consts.reverse_excitation_index

    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        print("Observation %s" % observation)
        trajectory = self.data[observation]
        j = 0

        while not env.t:
            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                if self.cuda:
                    s = Variable(env.s.cuda(), requires_grad=False)
                else:
                    s = Variable(env.s, requires_grad=False)
                _, _, beta, _, _, _ = self.model(s)
                beta = beta.squeeze(0)
                beta = (beta.sign().int() * (beta.abs() > 0.5).int()).data
                if self.cuda:
                    beta = beta.cpu().numpy()
                else:
                    beta = beta.numpy()
                beta[0] = abs(beta[0])
                a = reverse_excitation_index[tuple(beta)]

            env.step(a)
            j += 1

        yield {'o': env.s.cpu().numpy(), 'score': env.score}
def main():
    communication = Communication(child_num=process_num)
    brain = ACBrain(talker=communication.master)
    envs_p = []
    seed = get_seed()
    for i in range(process_num):
        agent = Agent(talker=communication.children[i])
        env_temp = Env(agent, i, seed=seed + i)
        envs_p.append(Process(target=env_temp.run, args=()))

    for i in envs_p:
        i.start()

    tfb_p = subprocess.Popen(['tensorboard', '--logdir', "./logs/scalars"])
    brain.run()

    for p in envs_p:
        p.terminate()
    tfb_p.kill()
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)

    # update policy neural network
    def train_model(self):
        discounted_rewards = np.float32(self.discount_rewards(self.rewards))
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards)
        self.optimizer([self.states, self.actions, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []


if __name__ == "__main__":
    env = Env()
    agent = ReinforceAgent()

    global_step = 0
    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        # fresh env
        state = env.reset()
        state = np.reshape(state, [1, 15])

        while not done:
            global_step += 1
            # get action for the current state and go one step in environment
    @staticmethod
    def arg_max(state_action):
        max_index_list = []
        max_value = state_action[0]
        for index, value in enumerate(state_action):
            if value > max_value:
                max_index_list.clear()
                max_value = value
                max_index_list.append(index)
            elif value == max_value:
                max_index_list.append(index)
        return random.choice(max_index_list)


if __name__ == "__main__":
    env = Env()
    agent = SARSAgent(actions=list(range(env.n_actions)))

    for episode in range(1000):
        # reset environment and initialize state
        state = env.reset()
        # get action of state from agent
        action = agent.get_action(str(state))

        while True:
            env.render()
            # take action and proceed one step in the environment
            next_state, reward, done = env.step(action)
            next_action = agent.get_action(str(next_state))
def __init__(self):
    self.action = [0.0, 0.0]
    self.delay = False
    self.grid = Grid()
    self.env = Env(self.grid)
    self.reset()
def main():
    # Config
    data_dir = '../data/data 1'
    test_set_iter = [['s1'], ['s2'], ['s3'], ['s4'], ['s5']]

    # Parameters
    alpha = 0.25
    nit = 10
    eps = 6.0

    # Initialize
    print '[ Initialize ] start'
    Env.set_data_set(data_dir)
    Env.load_sw_set()
    Env.load_all_doc()
    Env.init_idf()
    print '[ Initialize ] initialize ok'

    avg_f1 = 0.0
    for test_set_ids in test_set_iter:
        print '[ GLOBAL ] test_set %s' % (json.dumps(test_set_ids))

        # Construct DT matrix and Y vector
        print '[ Construct DT matrix and Y vector ] start'
        train_docs = []
        test_docs = []
        for doc in Env.all_docs:
            if doc.set_id in test_set_ids:
                test_docs.append(doc)
            else:
                train_docs.append(doc)
        V = Env.w_size
        DT = np.zeros((len(train_docs), V), dtype=float)
        Y = np.zeros(len(train_docs), dtype=float)
        print '[ Construct DT matrix and Y vector ] ok'

        # Load Train DT and Y
        print '[ Load Train DT and Y ] start'
        for itd in range(len(train_docs)):
            doc = train_docs[itd]
            tfidf_dic = doc.get_tfidf()
            vec = np.zeros(V, dtype=float)
            for _k, _v in tfidf_dic.items():
                vec[int(_k)] = _v
            DT[itd] = vec
            Y[itd] = doc.label
        print '[ Load Train DT and Y ] ok'

        # Train
        print '[ Train ] start'
        mod = Perceptron(DT, Y, alpha, nit, eps)
        mod.train()
        print '[ Train ] ok'

        # Load test DT and Y
        print '[ Load Test DT and Y ] start'
        DT_test = np.zeros((len(test_docs), V), dtype=float)
        Y_test = np.zeros((len(test_docs)))
        for itd in range(len(test_docs)):
            doc = test_docs[itd]
            tfidf_dic = doc.get_tfidf()
            vec = np.zeros(V, dtype=float)
            for _k, _v in tfidf_dic.items():
                vec[int(_k)] = _v
            DT_test[itd] = vec
            Y_test[itd] = doc.label
        print '[ Load Test DT and Y ] ok'

        # Test
        print '[ Test ] start'
        pre, rec, f1 = mod.test(DT_test, Y_test)
        print '[ Test ] ok'
        print '[ Test ] pre = %f rec = %f f1 = %f' % (pre, rec, f1)
        avg_f1 += f1

    avg_f1 /= 5
    print '[ Average ] average_f1 = %f' % avg_f1