def __init__(self):
        self.n_episode = []
        self.n_epsilon = []
        self.n_dist = []
        self.avg_err = []

        # Parameters
        self.n_episodes = rospy.get_param("/n_episodes")
        self.avg_err_fre = rospy.get_param('/avg_err_fre')

        # create environment
        self.env = Env()
        self.n_states = self.env.observation_space
        self.n_actions = self.env.action_space.n

        # create Deep Q-Network
        self.dqn = DQN(self.n_states, self.n_actions)

        # load DQN weight
        rospack = rospkg.RosPack()
        data_path = rospack.get_path("pioneer_dragging") + "/data"
        username = rospy.get_param("/username")
        n_folder = rospy.get_param("/n_folder")
        self.dqn.file_models = "{0}/{1}-{2}/{2}-pytorch-RL.tar".format(
            data_path, username, n_folder)
        self.dqn.load_model()

        # plot
        self.color1 = 'tab:green'
        self.color2 = 'tab:blue'
        self.color3 = 'tab:orange'
        self.color4 = 'tab:red'

        self.style_plot = random.choice(plt.style.available)
        plt.style.use(self.style_plot)
        plt.ion()

        ###########
        # Figure 1 - Rewards
        self.fig1 = plt.figure(1)
        self.ax1 = self.fig1.add_subplot(1, 1, 1)
        self.ax2 = self.ax1.twinx()

        title_1 = 'Rewards - (Mode: Testing)'
        self.ax1.set_title(title_1)
        self.ax1.set_xlabel('Episode')
        self.ax1.set_ylabel('Reward', color=self.color1)
        self.ax2.set_ylabel('Epsilon', color=self.color2)
        self.ax1.tick_params(axis='y', labelcolor=self.color1)
        self.ax2.tick_params(axis='y', labelcolor=self.color2)

        ###########
        # Figure 2 - Error
        self.fig2 = plt.figure(2)
        self.ax3 = self.fig2.add_subplot(1, 1, 1)

        title_2 = 'Error Distance - (Mode: Testing)'
        self.ax3.set_title(title_2)
        self.ax3.set_xlabel('Episode')
        self.ax3.set_ylabel('Meter')
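
A plausible per-episode refresh for the interactive figures set up above. This helper is not part of the snippet; the method name and the rewards/epsilons arguments are illustrative assumptions about how the accumulated data could be redrawn.

    def refresh_figure1(self, rewards, epsilons):
        # hypothetical helper: redraw the reward/epsilon curves after each episode
        episodes = list(range(len(rewards)))
        self.ax1.plot(episodes, rewards, color=self.color1)
        self.ax2.plot(episodes, epsilons, color=self.color2)
        self.fig1.canvas.draw_idle()
        plt.pause(0.05)  # give the interactive backend (plt.ion) time to render
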
Example #2
def UCTPlayGame():
    """
		Play a sample game between two UCT players where each player gets a different number of UCT iterations (= simulations = tree nodes).
		state는 양 플레이어가 공통으로 공유하는 env 정보다
		state의 playerJustMoved에 따라서 서로 다른 플레이어가 행동한다
	"""
    # state = OthelloState(4) # uncomment to play Othello on a square board of the given size
    state = Env()  # uncomment to play OXO
    # state = NimState(15) # uncomment to play Nim with the given number of starting chips
    while not state.board.finished:
        print(str(state))
        if state.playerJustMoved == 1:
            # play with values for itermax and verbose = True
            m = UCT(rootstate=state, itermax=100, verbose=True)
        else:
            # play with values for itermax and verbose = True
            m = UCT(rootstate=state, itermax=100, verbose=True)
        print("Best Move: " + str(m) + "\n")
        state.DoMove(m)
        #time.sleep(5)
    if state.GetResult(state.playerJustMoved) == 1.0:
        print("Player " + str(state.playerJustMoved) + " wins!")
    elif state.GetResult(state.playerJustMoved) == 0.0:
        print("Player " + str(3 - state.playerJustMoved) + " wins!")
    else:
        print("Nobody wins!")
Example #3
    def __init__(self, pp, logger, pid="", gui=None, *args, **kwargs):
        self.rows, self.cols, self.n_channels = self.dims = pp['dims']
        self.pid = pid
        self.save = pp['save_exp_data']
        self.batch_size = pp['batch_size']
        self.n_hours, self.n_events = pp['n_hours'], pp['n_events']
        self.pp = pp
        self.logger = logger

        self.grid = np.zeros(pp['dims'], dtype=bool)
        # Contains hand-offs only
        self.hgrid = np.zeros(pp['dims'], dtype=bool)
        self.env = Env(pp, self.grid, self.hgrid, logger, pid)
        if self.save or self.batch_size > 1:
            size = self.n_events if self.save else pp['buffer_size']
            self.exp_buffer = PrioritizedReplayBuffer(
                size=size,
                rows=self.rows,
                cols=self.cols,
                n_channels=self.n_channels,
                alpha=0.6)  # original: 0.6
            self.pri_beta_schedule = LinearSchedule(
                self.n_events,
                initial_p=0.4,  # pp['prioritized_replay_beta'],
                final_p=1.0)  # original: 1.0
            self.prioritized_replay_eps = float(1e-6)

        self.quit_sim, self.invalid_loss, self.exceeded_bthresh = False, False, False
        self.i, self.t = 0, 0.1  # Iteration, time
        signal.signal(signal.SIGINT, self.exit_handler)
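
A minimal, self-contained sketch of how a baselines-style LinearSchedule is usually consumed during prioritized-replay training; the value() accessor and the commented sampling call are illustrative assumptions, not code from this snippet.

from dataclasses import dataclass

@dataclass
class LinearScheduleSketch:
    """Linearly anneal from initial_p to final_p over schedule_timesteps."""
    schedule_timesteps: int
    initial_p: float = 0.4
    final_p: float = 1.0

    def value(self, t):
        frac = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + frac * (self.final_p - self.initial_p)

# usage sketch: anneal the importance-sampling exponent beta towards 1.0
schedule = LinearScheduleSketch(schedule_timesteps=100_000)
for step in (0, 50_000, 100_000):
    beta = schedule.value(step)  # 0.4 -> 0.7 -> 1.0
    # batch = exp_buffer.sample(batch_size, beta=beta)  # hypothetical call
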
Example #4
    def test(self, num_actions):
        self.saver.restore(self.session, FLAGS.checkpoint_path)
        print "Restored model weights from ", FLAGS.checkpoint_path
        monitor_env = gym.make(FLAGS.game)
        monitor_env.monitor.start("/tmp/" + FLAGS.game, force=True)
        env = Env(monitor_env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)
        
        for i_episode in xrange(FLAGS.num_eval_episodes):
            state = env.get_initial_state()
            episode_reward = 0
            done = False
            
            # create state sequence
            state_sequence = np.zeros((t_max, FLAGS.history_length, FLAGS.width, FLAGS.height))
            state_sequence[t_max -1, :, :, :] = state
            
            while not done:
                monitor_env.render()
                q_values = self.q_values.eval(session = self.session, feed_dict = {self.state : [state_sequence]})
                action_index = np.argmax(q_values)
                new_state, reward, done = env.step(action_index)
                state = new_state

                # update state sequence
                state_sequence = np.delete(state_sequence, 0, 0)
                state_sequence = np.insert(state_sequence, t_max-1, state, 0)
                episode_reward += reward
            print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)
        
        monitor_env.monitor.close()
Example #5
 def __init__(self, is_training, is_obstacle=True):
     Env.__init__(self, is_training)
     Gazebo.__init__(self)
     self.is_obstacle = is_obstacle
     spp = self.get_physics_properties()
     self.set_physics_properties(time_step=spp.time_step,
                                 max_update_rate=0.0,
                                 gravity=spp.gravity,
                                 ode_config=spp.ode_config)
     self._spawn_models()
Example #6
def main(args):
    embedder_hidden_sizes = args.embedder_hidden_sizes
    embedded_dim = args.embedded_dim
    lstm_size = args.lstm_size
    n_shuffle = args.n_shuffle
    clf_hidden_sizes = args.clf_hidden_sizes
    policy_hidden_sizes = args.policy_hidden_sizes
    shared_dim = args.shared_dim
    nsteps = args.nsteps
    n_envs = args.n_envs
    data_type = args.data_type
    r_cost = args.r_cost

    # TODO: load the data first, define the classifier, and declare the env
    traindata, valdata, testdata = data_load(data_type=args.data_type,
                                             random_seed=args.random_seed)
    input_dim = traindata.n_features + 1
    clf_output_size = traindata.n_classes if traindata.n_classes > 2 else 1
    encoder = SetEncoder(input_dim,
                         traindata.n_features,
                         embedder_hidden_sizes,
                         embedded_dim,
                         lstm_size,
                         n_shuffle,
                         normalize=args.normalize,
                         dropout=args.dropout,
                         p=args.p)

    dfsnet = DFSNet(encoder=encoder,
                    classifier=MLP(lstm_size + embedded_dim,
                                   clf_hidden_sizes,
                                   clf_output_size,
                                   dropout=args.dropout,
                                   p=args.p,
                                   batch_norm=args.batchnorm),
                    policy=DuelingNet(lstm_size + embedded_dim,
                                      policy_hidden_sizes, shared_dim,
                                      traindata.n_actions))
    dfsnet.to(args.device)
    step_runner = StepRunner(dfsnet, args)
    env = Env(args, n_envs, r_cost, traindata, step_runner.classify)
    valenv = Env(args, n_envs, r_cost, valdata, step_runner.classify)
    testenv = Env(args, n_envs, r_cost, testdata, step_runner.classify)

    env.classify = step_runner.classify
    valenv.classify = step_runner.classify
    testenv.classify = step_runner.classify
    learn_start = time()
    learn(step_runner,
          args,
          env,
          valenv,
          nsteps=nsteps,
          total_steps=int(5e6),
          scheduler=args.scheduler)
    learn_elapsed = time() - learn_start
    dfsnet.eval()
    test_and_record(step_runner, args, env, valenv, testenv)
    print(learn_elapsed)
Example #7
    def start(self):
        self.env = Env(length=10, height=2, Nstep=10)

        self.agent = QLearningAgent(
            alpha=0.5,
            epsilon=0.1,
            discount=0.99,
            get_legal_actions=lambda s: range(len(self.env.actions)))

        self.agent.load_param("qtable.pickle")
        print("================Qtable:================")
        for m in self.agent._qvalues:
            print(m, self.agent._qvalues[m])

        rewards = []
        replay = ReplayBuffer(1000)
        for i in range(500):

            rewards.append(self.one_episode(2000, replay=replay))

            #OPTIONAL YOUR CODE: adjust epsilon
            self.agent.epsilon *= 0.9999
            print("+++++++++++++++++++++++++++++++++++++++++++++++")
            print("episode = ", i, 'eps =', self.agent.epsilon,
                  'mean reward =', np.mean(rewards[-10:]))
            print("+++++++++++++++++++++++++++++++++++++++++++++++")
            # if i %2 ==0:

            self.agent.save_param("qtable.pickle")
            self.agent.save_param("bak.pickle")
Example #8
    def start(self):
        self.env = Env(height=3, discrete=True)
        self.action_lib = [[-1, -1], [-1, 1], [-1, 0], [0, -1], [0, 1], [0, 0],
                           [1, -1], [1, 1], [1, 0]]
        self.agent = QLearningAgent(
            alpha=0.5,
            epsilon=0.2,
            discount=0.9,
            get_legal_actions=lambda s: range(len(self.action_lib)))

        self.agent.load_param("q_table3.pickle")
        # print("================Qtable:================")
        # for m in self.agent._qvalues:
        # 	print(m,self.agent._qvalues[m])

        rewards = []
        replay = ReplayBuffer(10000)
        for i in range(500):
            mr, me = self.one_episode(replay, t_max=5000)
            rewards.append(mr)

            #OPTIONAL YOUR CODE: adjust epsilon
            # self.agent.epsilon *= 0.99
            print("+++++++++++++++++++++++++++++++++++++++++++++++")
            print("episode = ", i, 'mean reward =', np.mean(rewards[-10:]))
            print("+++++++++++++++++++++++++++++++++++++++++++++++")
            # if i %2 ==0:
            title = "mr:" + str(mr) + "me:" + str(me)
            self.env.ROSNode.log_showfigure(title)
            self.agent.save_param("q_table3.pickle")
            self.agent.save_param("q_table_bak.pickle")
Example #9
def main():
    # num = sys.argv[1]
    num = '81'
    print(num)
    env_name = 'SuperMarioBros-{}-{}-v0'.format(num[0], num[1])
    print(env_name)
    communication = Communication(child_num=process_num)

    brain = ACBrain(talker=communication.master, env_name=env_name)

    envs_p = []

    seed = get_seed()
    for i in range(process_num):
        agent = Agent(talker=communication.children[i])
        env_temp = Env(agent, i, seed=seed + i, env_name=env_name)
        envs_p.append(Process(target=env_temp.run, args=()))

    for i in envs_p:
        i.start()

    tfb_p = subprocess.Popen(['tensorboard', '--logdir', "./logs/scalars"])

    brain.run()

    for p in envs_p:
        p.terminate()
    tfb_p.kill()
Example #10
 def learn(self, env, max_episodes=MAX_EPISODES):
     for episode in range(0, max_episodes):
         print(episode)
         state = env.init()
         done = False
         while not done:
             goal = self.ac_agent.choose_goal(state)
             # low-level learn action according to sub-goal
             best_action, r_in_his, train_limit = [], [], 500
             while len(r_in_his) < train_limit:
                 action = self.ts_agent.choose_action(state, [goal])
                 r_in = Env.intrinsic_reward(state, goal, action)
                 if r_in_his == [] or r_in > max(r_in_his):
                     best_action = action
                 r_in_his.append(r_in)
                 self.ts.memory.push(state, action, r_in)
                 self.ts.train()
             # high-level learn sub-goal
             action = best_action
             next_state, reward, done, env_state = env.step(state, action)
             print(reward, env_state)
             if not done:
                 self.ac.memory.push(state, action, reward, next_state)
                 self.ac.train()
             state = next_state
         # anneal exploration probability
         self.ac.anneal_epsilon()
         self.ts.anneal_epsilon()
     # save model
     self.save_model()
Example #11
def main():
    """
    1. Read and parse PDF files into month-wise DF
    2. Create CSV by combining all the DFs
    3. Create visualizations from the CSV
    :return: None
    """

    logger = Env.setup_logging()

    path_base = os.path.join(os.getcwd(), "PDF")
    csv_path = os.path.join(os.getcwd(), "output", "Salary_Slips_Merged.csv")

    pdf_path_list = sorted([os.path.join(path_base, file_name) for file_name in os.listdir(path_base) if
                            (os.path.isfile(os.path.join(path_base, file_name)) and
                             os.path.basename(file_name).endswith(".pdf") and ("Pay" in file_name))])

    combined_payslip_df = pd.DataFrame()

    for pdf_path in pdf_path_list:
        monthly_df = pdf_to_df(pdf_path, logger)
        # DataFrame.append was removed in pandas 2.x; use concat instead
        combined_payslip_df = pd.concat([combined_payslip_df, monthly_df], ignore_index=True)

    # create the output directory before writing the CSV, not after
    if not os.path.exists(os.path.dirname(csv_path)):
        os.mkdir(os.path.dirname(csv_path))

    combined_payslip_df.to_csv(csv_path, index=False)

    side_by_side_bar_plot(csv_path)

    line_plot(csv_path)
Example #12
def experiment1_test(
    output_folder,
    word_vectors,
    agent,
    episode_index,
    testset_path='./dataset/conll2003/en/eng.testb',
):
    # Initialize the environment
    env = Env(testset_path, word_vectors)
    step = 0
    s = env.reset()
    print('[' + util.now_time() + "] start testing...")
    while True:
        # check whether the task has ended
        if env.end():
            print('[' + util.now_time() + "] testing...done")
            result_file = '%03d_episode_test.txt' % (episode_index + 1)
            env.save_all_newlines_to_file(output_folder, result_file)
            return evaluate.conlleval(output_folder, result_file)

        # Choose Action a
        a = agent.choose_action(s)

        # Execute action
        s_, r = env.step(a)

        # Next status
        step += 1
        s = s_
Example #13
    def test(self, env):

        # initialize environment
        env = Env(env, 84, 84, 4)

        terminal = False
        # Get initial game observation
        state = env.get_initial_state()

        # episode's reward and cost
        episode_reward = 0

        for _ in range(100):
            while not terminal:

                # forward pass of network. Get probability of all actions
                probs, v = self.sess.run((self.policy, self.state_value),
                                         feed_dict={self.input_state: [state]})

                probs = probs[0]
                v = v[0][0]

                if random.random() < 0.01:
                    action_index = random.choice([0, 1, 2, 3])
                else:
                    action_index = np.argmax(probs)

                # Gym executes action in game environment on behalf of actor-learner
                new_state, reward, terminal = env.step(action_index)
                env.env.render()
                # clip reward to -1, 1
                # Update the state and global counters
                state = new_state
                # update episode's counter
                episode_reward += reward

            if terminal:

                terminal = False
                print "THREAD:", self.thread_id, "/ TIME", T, "/ REWARD", \
                    episode_reward, "/ COST"
                episode_reward = 0
                counter = 0
                # Get initial game observation
                state = env.get_initial_state()
Example #14
class CarEnvironment(Environment):
    
    def __init__(self):
        self.action = [0.0, 0.0]
        self.delay = False
        self.grid = Grid()
        self.env = Env(self.grid)
        self.reset()

    def step(self):
        # Simulate a step in the environment
        self.agent.brain.stored_action = self.action
        self.env.tick()
        self.env.calculate_moves()
        self.env.do_moves()
        self.env.print_env(self.env.tick_number)
    
    def reset(self):
        self.env.reset()
        self.agent = self.env.cars[0]
        # (x, y, vel, deg, goal_x, goal_y, dist_to_goal, dir_of_goal, direction_diff)
        self.sensors = self.agent.brain.get_state_tuple()
        self.distance_to_goal = self.agent.brain.distance_to_goal()
        
    def getSensors(self):
        # (x, y, vel, deg, goal_x, goal_y, dist_to_goal, dir_of_goal, direction_diff)
        return self.agent.brain.get_state_tuple()
    
    def getCarState(self):
        return self.agent.brain.get_state()
    
    def in_goal_state(self):
        return self.agent.brain.get_state().reached_goal
    
    def performAction(self, action):
        self.action = action
        self.step()
    
    def indim(self):
        return 2
    
    def outdim(self):
        return len(self.getSensors())
Example #15
    def objective(self, params):
        """
        Computes the mean squared error between the network's predictions and
        the true Q-values. Network is trained based on the input parameters.

        Arguments
        ---------
        :param params : dict
               Dictionary containing values for hyperparameters to be
               optimized.

        Returns
        -------
        :returns : dict
                 Dictionary containing the mean squared error between true and
                 estimated (using the parameter configuration from params)
                 Q-values and the status of the optimization.
        """

        a = params["lr"]
        b = params["lr decay"]
        c = params["batch size"]
        d = params["target update"]

        # initialize the RL-agent:
        agent = AgentDQN(dim_state=self.dim_state,
                         dim_actions=self.dim_actions,
                         hidden_dims=self.hidden_dims,
                         optimizer=Adam(lr=a, decay=b),
                         gamma=self.gamma,
                         eps=self.eps,
                         eps_decay=self.eps_decay,
                         frozen=self.frozen,
                         pretrained=self.pretrained)

        # initialize the environment:
        env = Env(start=self.start,
                  tcost=self.tcost,
                  horizon=self.horizon,
                  w=self.w,
                  theta=self.theta,
                  regimes=self.regimes)

        trained_agent, _, _, _, _, _, _ = train_dqn(agent, env,
                                                    self.train_episodes, c,
                                                    self.init_d_size,
                                                    self.max_d_size, d,
                                                    self.freeze_after)

        pred = trained_agent.qnn.predict(self.x)
        true = self.y

        mse = np.mean((pred - true)**2)

        return {"loss": mse, "status": STATUS_OK}
Example #16
def experiment1_train(
    output_folder,
    word_vectors,
    n_episodes=300,
    trainset_path='./dataset/conll2003/en/eng.train',
):
    # Initialize the environment
    print('[' + util.now_time() + "] init environment...")
    env = Env(trainset_path, word_vectors)
    print('[' + util.now_time() + "] environment initialized")

    # Initialize the DQN agent
    print('[' + util.now_time() + "] init agent...")
    agent = DQN(n_actions=env.n_actions,
                status_dim=env.status_dim,
                action_dim=env.action_dim,
                reward_dim=env.reward_dim)
    print('[' + util.now_time() + "] agent initialized")

    # Iterate over episodes
    for i in range(n_episodes):
        print('[' + util.now_time() + "] start episode %03d of learning..." %
              (i + 1))
        step = 0
        s = env.reset()

        while True:
            # check whether the task has ended
            if env.end():
                print('[' + util.now_time() +
                      "] episode %03d of learning...done" % (i + 1))
                result_file = '%03d_episode_train.txt' % (i + 1)
                env.save_all_newlines_to_file(output_folder, result_file)
                train_eval = evaluate.conlleval(output_folder, result_file)
                test_eval = experiment1_test(output_folder, word_vectors,
                                             agent, i)
                break

            # Choose Action a
            a = agent.choose_action(s)

            # Execute action
            # print('step %d' % step)
            s_, r = env.step(a)

            agent.store_transition(s, a, r, s_)

            step += 1
            s = s_

            if step > 200 and step % 5 == 0:
                agent.learn()

    # plot and compare train and test set TODO
    # plot(train_evals,test_evals)
    agent.eval_network.save(output_folder + os.path.sep + 'ex1_eval_model',
                            overwrite=True)
Example #17
def evaluate_file(fname):
    with open(fname) as f:
        env = Env()
        inst_q = parser.parse(f.read())  # TODO
        term = inst_q.execute(env)  # assuming this will be a Queue for now

        # A break at the top level will cause execution to stop, but should really be
        # an error; `ret` can be used to return early.
        if term == command.LOOP_TERMINATE:
            raise exceptions.QQError(
                "Can't break out of the main program body.")
Example #18
    def execute(self, env):
        fname = env.qframe.popleft()
        subqframe = env.qframe.popleft()
        body = env.fnqueue[fname].copy()

        term = body.execute(
            Env(qframe=subqframe.statements, rqueue=None, fnqueue=env.fnqueue))

        if term == command.LOOP_TERMINATE:
            raise exceptions.QQError("Can't break from a function")

        env.qframe.append(subqframe)
Example #19
    def test(self, num_actions):
        self.saver.restore(self.session, FLAGS.checkpoint_path)
        print "Restored model weights from ", FLAGS.checkpoint_path
        monitor_env = gym.make(FLAGS.game)
        monitor_env.monitor.start("/tmp/" + FLAGS.game, force=True)
        env = Env(monitor_env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)

        for i_episode in xrange(FLAGS.num_eval_episodes):
            state = env.get_initial_state()
            episode_reward = 0
            done = False
            while not done:
                monitor_env.render()
                probs = self.session.run(self.policy_values, feed_dict={self.state: [state]})[0]
                action_index = sample_policy_action(num_actions, probs)
                new_state, reward, done = env.step(action_index)
                state = new_state
                episode_reward += reward
            print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)
        
        monitor_env.monitor.close()
Example #20
def cartpole():
    env = Env('localhost:32822')
    env.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.box.shape[0]
    action_space = env.action_space.discrete.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # env.render()
            print("acting on state: ", state)
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                plt.plot(dqn_solver.loss)
                plt.title('Model loss')
                plt.ylabel('Loss')
                plt.xlabel('Episode')
                plt.savefig("loss.png")
                break
            dqn_solver.experience_replay()
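
DQNSolver.act is not shown in this snippet; the usual epsilon-greedy shape of such a method is sketched below as a free function, since the model, exploration rate, and action-count attribute names on DQNSolver are assumptions.

import random
import numpy as np

def epsilon_greedy_action(model, state, exploration_rate, n_actions):
    """Pick a random action with probability exploration_rate, else the argmax-Q action."""
    if np.random.rand() < exploration_rate:
        return random.randrange(n_actions)
    q_values = model.predict(state)  # assumes a Keras-style model returning shape (1, n_actions)
    return int(np.argmax(q_values[0]))
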
Example #21
    def __init__(self, _config_path):
        self.__config = Config(_config_path)
        self.__envs = {}
        self.__timer = Timer()
        self.__repeat_times = self.__config.repeat_times
        Buffer.set_policy(self.__config.q_policy)

        for _V in self.__config.Vs:
            for _win_size in self.__config.win_sizes:
                _config = self.__config.copy()
                setattr(_config, "win_size", _win_size)
                self.__envs[(_V, _win_size)] = Env(_V, _config,
                                                   self.__repeat_times)
                delattr(_config, "win_size")
Example #22
def main():
    args = parse_arguments()

    # dataset, dataloader
    transforms = get_transform()
    train_dataset = Dataset.TrackData_RL(args.train_data, transform=transforms)
    train_loader = DataLoader(train_dataset,
                              num_workers=args.num_workers,
                              shuffle=True,
                              batch_size=1)

    # model, environment
    R = Reinforce(train_loader, transforms)
    env = Env(args)

    start_epoch = 1

    if args.init_sl:
        if os.path.isfile(args.init_sl):
            print("=> loading checkpoint '{}'".format(args.init_sl))
            checkpoint = torch.load(args.init_sl)
            R.agent.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.init_sl))
    elif args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            R.agent.load_state_dict(checkpoint['state_dict'])
            R.optim.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    for epoch in range(start_epoch, args.max_epochs + 1):
        R.train(env, epoch, args.gamma, logging=True)
        if epoch % args.save_freq == 0:
            # save model
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': R.agent.state_dict(),
                    'optimizer': R.optim.state_dict(),
                },
                dir='cv/%s/' % args.name)
Example #23
def main():
    communication = Communication(child_num=process_num)

    brain = ACBrain(talker=communication.master)

    envs_p = []
    for i in range(process_num):
        agent = Agent(talker=communication.children[i], seed=i)
        env_temp = Env(agent, i + 1)
        envs_p.append(Process(target=env_temp.run, args=()))

    for i in envs_p:
        i.start()

    brain.run()
Example #24
    def __init__(self, model, is_training=False, var=1.):
        self.max_step = 200
        self.exploration_decay_start_step = 50000
        state_dim = 366
        action_dim = 2
        self.action_linear_max = 0.25  # m/s
        self.action_angular_max = 0.5  # rad/s
        rospy.init_node('ddpg_stage_1')
        rospy.on_shutdown(self.clear_vel)
        self.is_training = is_training
        if ['/gazebo/model_states', 'gazebo_msgs/ModelStates'] in rospy.get_published_topics():
            self.env = SimEnv(self.is_training)
            print("Gazebo mode")
        else:
            self.env = Env(self.is_training)
            print("Real world mode")

        self.agent = DDPG(model, self.env, state_dim, action_dim)
        self.past_action = np.array([0., 0.])
        print('State Dimensions: ' + str(state_dim))
        print('Action Dimensions: ' + str(action_dim))
        print('Action Max: ' + str(self.action_linear_max) + ' m/s and ' + str(self.action_angular_max) + ' rad/s')

        self.var = var
Example #25
    def __call__(self, parser, namespace, values, option_string=None):
        environments = []

        for env_name_list in values:
            for env_name in env_name_list:
                try:
                    env = Env(env_name)

                except Exception:
                    msg = "can't load environment from '{}'".format(env_name)
                    raise argparse.ArgumentTypeError(msg)

                else:
                    environments.append(env)

        namespace.environments = environments
Example #26
    def play_deterministic(self, n_tot):

        self.model.eval()
        env = Env()
        render = args.render

        n_human = 60
        humans_trajectories = iter(self.data)
        reverse_excitation_index = consts.reverse_excitation_index

        for i in range(n_tot):

            env.reset()

            observation = next(humans_trajectories)
            print("Observation %s" % observation)
            trajectory = self.data[observation]

            j = 0

            ims = []
            # fig = plt.figure()
            while not env.t:

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:

                    # im = plt.imshow(np.rollaxis(env.s.numpy().squeeze(0)[:3], 0, 3), animated=True)
                    # ims.append([im])
                    if self.cuda:
                        s = Variable(env.s.cuda(), requires_grad=False)
                    else:
                        s = Variable(env.s, requires_grad=False)
                    _, _, beta, _, _, _ = self.model(s)

                    beta = beta.squeeze(0)
                    beta = beta.sign().int() * (beta.abs() > 0.5).int()
                    a = reverse_excitation_index[tuple(beta.data)]

                env.step(a)

                j += 1

            # if render:
            #     ani = animation.ArtistAnimation(fig, ims, interval=10, blit=True,
            #                                     repeat=False)
            #     plt.show()

            yield env.score
Example #27
def load_env():
    #load file and data
    x_data, y_data = Acquire_data.data_aquire()
    X_train, X_test, y_train, y_test = xdata_split(x_data,
                                                   y_data,
                                                   test_size=.3)
    #preprocessing
    X_train, X_test = preprocessing(X_train, X_test)
    print(X_train.shape, X_test.shape)
    #save data to mat
    scio.savemat(dataNew, {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    })
    # build an env for testing
    env = Env(X_train, y_train, X_test, y_test)
    return env
Example #28
def eval_args(stack, function, code):
    # Evaluate arguments
    args = code.args

    frame = StackFrame([], stack[-1].env)
    stack.append(frame)
    frame.new_env = Env(function.bindings, frame.env)

    frame.push(Instruction(Instruction.APPLY, function))

    for index, arg in enumerate(function.lambda_list):
        if isinstance(arg, List):
            if arg.items[0].symbol == "quote":
                frame.new_env.set(arg.items[1].symbol, args[index])
            else:
                print "OH NO BAD LIST ARG", arg
        else:
            frame.push(Instruction(Instruction.ARGUMENT, symbol=arg.symbol))
            frame.push(Instruction(Instruction.CODE, args[index]))
Example #29
    def play_episode_deterministic(self, n_tot):
        self.model.eval()
        env = Env()

        n_human = 300
        humans_trajectories = iter(self.data)
        reverse_excitation_index = consts.reverse_excitation_index

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)
                v, q, beta, r, p, phi = self.model(s)
                beta = beta.squeeze(0)

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:

                    beta_index = (beta.sign().int() *
                                  (beta.abs() > 0.5).int()).data.cpu().numpy()
                    beta_index[0] = abs(beta_index[0])
                    a = reverse_excitation_index[tuple(beta_index)]

                env.step(a)

                # x = phi.squeeze(0).data.cpu().numpy()
                # print(np.mean(abs(x)))
                # yield v, q, beta, r, p, s
                yield {
                    'o': env.s.cpu().numpy(),
                    'v': v.data.cpu().numpy(),
                    's': phi.data.cpu().numpy(),
                    'score': env.score,
                    'beta': beta.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy()
                }

                j += 1

        return  # raising StopIteration inside a generator is an error under PEP 479
Example #30
    def play_deterministic(self, n_tot):

        self.model.eval()
        env = Env()
        render = args.render

        n_human = 60
        humans_trajectories = iter(self.data)
        reverse_excitation_index = consts.reverse_excitation_index

        for i in range(n_tot):

            env.reset()

            observation = next(humans_trajectories)
            print("Observation %s" % observation)
            trajectory = self.data[observation]

            j = 0

            while not env.t:

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:

                    if self.cuda:
                        s = Variable(env.s.cuda(), requires_grad=False)
                    else:
                        s = Variable(env.s, requires_grad=False)
                    _, _, beta, _, _, _ = self.model(s)

                    beta = beta.squeeze(0)
                    beta = (beta.sign().int() * (beta.abs() > 0.5).int()).data
                    if self.cuda:
                        beta = beta.cpu().numpy()
                    else:
                        beta = beta.numpy()
                    beta[0] = abs(beta[0])
                    a = reverse_excitation_index[tuple(beta)]

                env.step(a)

                j += 1

            yield {'o': env.s.cpu().numpy(), 'score': env.score}
Example #31
def main():
    communication = Communication(child_num=process_num)

    brain = ACBrain(talker=communication.master)

    envs_p = []

    seed = get_seed()
    for i in range(process_num):
        agent = Agent(talker=communication.children[i])
        env_temp = Env(agent, i, seed=seed + i)
        envs_p.append(Process(target=env_temp.run, args=()))

    for i in envs_p:
        i.start()

    tfb_p = subprocess.Popen(['tensorboard', '--logdir', "./logs/scalars"])

    brain.run()

    for p in envs_p:
        p.terminate()
    tfb_p.kill()
Example #32
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)

    # update policy neural network
    def train_model(self):
        discounted_rewards = np.float32(self.discount_rewards(self.rewards))
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards)

        self.optimizer([self.states, self.actions, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []
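
discount_rewards is called by train_model above but not included in this fragment; a common REINFORCE-style implementation is sketched here. The discount_factor attribute name is an assumption, and numpy is assumed to be imported as np.

    def discount_rewards(self, rewards):
        # accumulate returns backwards: G_t = r_t + gamma * G_{t+1}
        discounted = np.zeros_like(rewards, dtype=np.float32)
        running_add = 0.0
        for t in reversed(range(len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted[t] = running_add
        return discounted
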


if __name__ == "__main__":
    env = Env()
    agent = ReinforceAgent()

    global_step = 0
    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        # fresh env
        state = env.reset()
        state = np.reshape(state, [1, 15])

        while not done:
            global_step += 1
            # get action for the current state and go one step in environment
Example #33
    @staticmethod
    def arg_max(state_action):
        max_index_list = []
        max_value = state_action[0]
        for index, value in enumerate(state_action):
            if value > max_value:
                max_index_list.clear()
                max_value = value
                max_index_list.append(index)
            elif value == max_value:
                max_index_list.append(index)
        return random.choice(max_index_list)

if __name__ == "__main__":
    env = Env()
    agent = SARSAgent(actions=list(range(env.n_actions)))

    for episode in range(1000):
        # reset environment and initialize state

        state = env.reset()
        # get action of state from agent
        action = agent.get_action(str(state))

        while True:
            env.render()

            # take action and proceed one step in the environment
            next_state, reward, done = env.step(action)
            next_action = agent.get_action(str(next_state))
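
The fragment breaks off before the agent is updated; the standard on-policy SARSA update that such an agent applies next is sketched below as a standalone function, since the agent's own method name and signature are not shown.

def sarsa_update(q_table, state, action, reward, next_state, next_action,
                 alpha=0.01, gamma=0.9):
    """On-policy TD update: Q(s,a) += alpha * (r + gamma * Q(s',a') - Q(s,a))."""
    q_sa = q_table[state][action]
    td_target = reward + gamma * q_table[next_state][next_action]
    q_table[state][action] = q_sa + alpha * (td_target - q_sa)
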
Example #34
 def __init__(self):
     self.action = [0.0, 0.0]
     self.delay = False
     self.grid = Grid()
     self.env = Env(self.grid)
     self.reset()
Example #35
def main():
  # Config
  data_dir = '../data/data 1'
  test_set_iter = [['s1'], ['s2'], ['s3'], ['s4'], ['s5']]

  # Parameters
  alpha = 0.25
  nit = 10
  eps = 6.0

  # Initialize
  print '[ Initialize ] start'
  Env.set_data_set(data_dir)
  Env.load_sw_set()
  Env.load_all_doc()
  Env.init_idf()
  print '[ Initialize ] initialize ok'

  avg_f1 = 0.0
  for test_set_ids in test_set_iter:
    print '[ GLOBAL ] test_set %s' % (json.dumps(test_set_ids))
    # Construct DT matrix and Y vector
    print '[ Construct DT matrix and Y vector ] start'
    train_docs = []
    test_docs = []

    for doc in Env.all_docs:
      if doc.set_id in test_set_ids:
        test_docs.append(doc)
      else:
        train_docs.append(doc)

    V = Env.w_size
    DT = np.zeros((len(train_docs), V), dtype=float)
    Y = np.zeros(len(train_docs), dtype=float)
    print '[ Construct DT matrix and Y vector ] ok'

    # Load Train DT and Y
    print '[ Load Train DT and Y ] start'
    for itd in range(len(train_docs)):
      doc = train_docs[itd]
      tfidf_dic = doc.get_tfidf()
      vec = np.zeros(V, dtype=float)
      for _k, _v in tfidf_dic.items():
        vec[int(_k)] = _v

      DT[itd] = vec
      Y[itd] = doc.label
    print '[ Load Train DT and Y ] ok'

    # Train
    print '[ Train ] start'
    mod = Perceptron(DT, Y, alpha, nit, eps)
    mod.train()
    print '[ Train ] ok'

    # Load test DT and Y
    print '[ Load Test DT and Y ] start'
    DT_test = np.zeros((len(test_docs), V), dtype=float)
    Y_test = np.zeros((len(test_docs)))

    for itd in range(len(test_docs)):
      doc = test_docs[itd]
      tfidf_dic = doc.get_tfidf()
      vec = np.zeros(V, dtype=float)
      for _k, _v in tfidf_dic.items():
        vec[int(_k)] = _v
      DT_test[itd] = vec
      Y_test[itd] = doc.label
    print '[ Load Test DT and Y ] ok'

    # Test
    print '[ Test ] start'
    pre, rec, f1 = mod.test(DT_test, Y_test)
    print '[ Test ] ok'
    print '[ Test ] pre = %f rec = %f f1 = %f' % (pre, rec, f1)
    avg_f1 += f1

  avg_f1 /= 5
  print '[ Average ]  average_f1 = %f' % avg_f1