Example #1
def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
Example #2
def test(game_size, norm):
    #  start_pprof_server(port=8081)
    env = gym.make('game2048-v0', size=game_size, norm=norm)
    obs = env.reset()
    rewards = 0
    step = 0

    for _ in range(1):
        start = time.time() * 1000
        while True:
            # if render for every step
            #  env.render()
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            rewards += reward
            step += 1
            if done:
                elapsed = time.time() * 1000 - start
                env.render()
                print(f'obs: {obs}')
                print(f'play games steps: {step} reward: {rewards} info: {info}'
                      f' took {elapsed:.3f}ms speed: {(step * 1000 / elapsed):.3f}ops/s')
                time.sleep(0.5)

                step = 0
                rewards = 0
                start = time.time() * 1000
                env.reset()
Example #3
def run_env(env, n_runs=100):
    """
    Plots simulated games in an environment for visualization
    :param env: environment to be run
    :param n_runs: how many episodes should be run
    :return: plot of each step in the environment
    """
    for i in range(n_runs):
        env.reset()
        env.show()
        done = False
        while not done:
            # for the reinforcement agent convert board to state input
            state = env.agents[0].board_to_state()
            action = env.agents[0].select_action(state, 0.00)
            action = action[0, 0]  # action is unwrapped from the LongTensor
            # e.g. action = 1 -> move = ((0, 0), (0, 1))
            move = env.agents[0].action_to_move(action)
            _, done, won = env.step(move)
            env.show()
            if done and won:
                print("Won!")
            elif (done and not won) or env.steps > 20:
                print("Lost")
                break
Example #4
def monte_carlo_control():
    action_value_function = defaultdict(float)
    n_s = defaultdict(int)
    n_s_a = defaultdict(int)

    n_zero = 1E5
    episodes = range(int(1E8))

    pbar = ProgressBar(maxval=len(episodes)).start()
    for episode in episodes:
        state = State()
        while not state.terminal:
            player = state.player
            dealer = state.dealer

            epsilon = float(n_zero) / (n_zero + n_s[(dealer, player)])
            action = epsilon_greedy_policy(action_value_function, state, epsilon)

            n_s[(dealer, player)] += 1
            n_s_a[(dealer, player, action)] += 1

            reward = step(state, action)

            # update the action value function
            alpha = 1.0 / n_s_a[(dealer, player, action)]
            current_estimate = action_value_function[(dealer, player, action)]
            action_value_function[(dealer, player, action)] += alpha * (reward - current_estimate)

        pbar.update(episode)
    pbar.finish()
    value_function = action_value_to_value_function(action_value_function)
    plot_value_function(value_function, "Optimal Value Function: Question 2")

    return action_value_function
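The helpers `State`, `step` and `epsilon_greedy_policy` come from the surrounding blackjack-style (Easy21) project and are not shown; a minimal sketch of the epsilon-greedy part, assuming a HIT/STICK action encoding, could look like this:

import random

HIT, STICK = 0, 1  # assumed action encoding


def epsilon_greedy_policy(action_value_function, state, epsilon):
    # With probability epsilon explore, otherwise act greedily with respect
    # to the tabular Q-values keyed by (dealer, player, action).
    actions = [HIT, STICK]
    if random.random() < epsilon:
        return random.choice(actions)
    return max(actions,
               key=lambda a: action_value_function[(state.dealer, state.player, a)])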
Example #5
def train_sl(size, lr, rd):
    env = gym.make('game2048-v0', size=size)
    agent = model.SarsaLambda(env.action_space)
    trials = 1 * 10000 * (size ** 2)

    for trial in range(trials):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            action_ = agent.choose_action(obs_)
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs = obs_
            action = action_
            rewards += reward
            if done:
                break

        env.render()
        print(f'Trial {trial} completed in {stepno} steps '
              f'highest: {env.highest()} rewards: {rewards}')
        stepno = 0
        rewards = 0

    print(len(agent.q_table))
Example #6
    def behaviour(self, candidate):
        obs = env.reset()
        done = False
        while not done:
            action = get_action(ns, obs)
            obs, reward, done, _ = env.step(action)
        return obs
Example #7
def self_play(env, agent, return_trajectory=False, verbose=False):
    if return_trajectory:
        trajectory = []
    observation = env.reset()
    for step in itertools.count():
        board,_,player,_,_ = observation
        action, prob = agent.decide(observation, return_prob=True)
        if verbose:
            print(strfboard(observation))
            logging.info('The {} step: player {}, action {}'.format(step, player,
                    action))
        observation, winner, done, _ = env.step(action[0])
        if return_trajectory:
            m,n = board.shape
            board = np.reshape(board, m*n)
            trajectory.append((player, board, prob))
        if done:
            if verbose:
                print(strfboard(observation))
                logging.info('Winner {}'.format(winner))
            break
    if return_trajectory:
        df_trajectory = pd.DataFrame(trajectory,
                columns=['player', 'board', 'prob'])
        df_trajectory['winner'] = winner
        return df_trajectory
    else:
        return winner
Example #8
def some_random_games_first():
    for episode in range(10):
        env.reset()
        for t in range(goal_steps):
            action = env.action_space()
            observation, reward, done, info = env.step(action)
            if done:
                break
Example #9
    def respond(self, env):
        mask = env.get_mask()
        for i in range(len(action_space)):
            if mask[i]:
                # print('taking action, ', action_space[i])
                return env.step(action_space[i])
        # every action was masked out; this should never happen
        raise Exception("should not be here")
Example #10
    def respond(self, env):
        mask = env.get_mask()
        valid_actions = np.take(np.arange(len(action_space)), mask.nonzero())
        valid_actions = valid_actions.reshape(-1)
        a = np.random.choice(valid_actions)

        # print('taking action, ', action_space[a])
        return env.step(action_space[a])
Example #11
def initial_population():
    training_data = []
    scores = []
    accepted_scores = []
    for game in range(initial_games):
        env.reset()
        if game % 100 == 0:
            print(game)
        score = 0
        game_memory = []
        prev_observation = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        for _ in range(goal_steps):
            #print(prev_observation)
            action = env.action_space()
            observation, reward, done, info = env.step(action)
            #print(action)

            if len(prev_observation) > 0:
                game_memory.append([prev_observation, action])

            prev_observation = observation
            score += reward
            #if done:
            #    break
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                if data[1] == 1:
                    output = [1, 0, 0, 0, 0, 0, 0, 0, 0]
                elif data[1] == 2:
                    output = [0, 1, 0, 0, 0, 0, 0, 0, 0]
                elif data[1] == 3:
                    output = [0, 0, 1, 0, 0, 0, 0, 0, 0]
                elif data[1] == 4:
                    output = [0, 0, 0, 1, 0, 0, 0, 0, 0]
                elif data[1] == 5:
                    output = [0, 0, 0, 0, 1, 0, 0, 0, 0]
                elif data[1] == 6:
                    output = [0, 0, 0, 0, 0, 1, 0, 0, 0]
                elif data[1] == 7:
                    output = [0, 0, 0, 0, 0, 0, 1, 0, 0]
                elif data[1] == 8:
                    output = [0, 0, 0, 0, 0, 0, 0, 1, 0]
                elif data[1] == 9:
                    output = [0, 0, 0, 0, 0, 0, 0, 0, 1]

                training_data.append([data[0], output])

        scores.append(score)

    training_data_save = np.array(training_data)
    np.save('saved2.npy', training_data_save)

    print('Average accepted score:', mean(accepted_scores))
    print('Median accepted score: ', median(accepted_scores))
    print(Counter(accepted_scores))

    return training_data
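The if/elif ladder above just one-hot encodes the recorded action (1-9) into a length-9 vector; assuming the action always falls in that range, an equivalent compact version would be:

for data in game_memory:
    # one-hot encode action k (1..9) at index k-1 (assumes data[1] is always valid)
    output = [0] * 9
    output[data[1] - 1] = 1
    training_data.append([data[0], output])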
Example #12
def main(args):
    param_str = (
        f'{args.env}_{args.algo}_rep={args.repeat}_hor={args.horizon}_prop={args.proposals}'
        f'_iter={args.iterations}_sigma={args.sigma}')

    env = gym.make(args.env)
    env = ActionRepeat(env, args.repeat)

    # Pool of workers, each has its own copy of global environment variable
    pool = Pool(32, initializer, [env])

    if args.algo == 'gaussian':
        planner = partial(gaussian_cem,
                          pool=pool,
                          action_space=env.action_space,
                          horizon=args.horizon,
                          proposals=args.proposals,
                          topk=args.topk,
                          iterations=args.iterations)
    elif args.algo == 'nonparametric':
        planner = partial(nonparametric_cem,
                          pool=pool,
                          action_space=env.action_space,
                          horizon=args.horizon,
                          proposals=args.proposals,
                          topk=args.topk,
                          iterations=args.iterations,
                          sigma=args.sigma)

    scores = np.zeros(args.episodes)
    observations = np.zeros((args.episodes, env.num_steps + 1) +
                            env.observation_space.shape)
    actions = np.zeros((args.episodes, env.num_steps) + env.action_space.shape)

    for i in range(args.episodes):
        logger = Logger(os.path.join(args.logdir, f'{param_str}_run{i}'))
        observations[i, 0] = env.reset()

        for t in range(env.num_steps):
            state = env.sim.get_state()
            actions[i, t] = planner(state)
            observations[i, t + 1], reward, _, _ = env.step(actions[i, t])
            scores[i] += reward
            logger.log_scalar('reward', scores[i], t)

        print(scores[i])

    print(param_str)
    print('Mean score:         ', scores.mean())
    print('Standard deviation: ', scores.std())

    if args.save:
        path = os.path.join(args.savedir, args.env)
        if not os.path.exists(path):
            os.makedirs(path)
        np.save(os.path.join(path, 'obs'), observations)
        np.save(os.path.join(path, 'act'), actions)
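`initializer` is not shown in this example; a plausible sketch (an assumption about this code, not the original) simply stores the environment in a module-level global so every pool worker can roll out action proposals independently:

def initializer(shared_env):
    # Assumed helper: each worker process keeps its own module-level `env`
    # copy, which the CEM planners use to simulate candidate action sequences.
    global env
    env = shared_env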
Example #13
def test(env):
    action = env.action_space.sample()
    obs, r, done, info = env.step(action)
    env.render()
    print('action:', action)
    print('reward:', r)
    print('done:', done)
    print('info:', info)
    print('nb_actions', env.action_space.n)
Example #14
    def evaluate(self):
        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = get_action(ns, obs)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
        return total_reward
Example #15
def dqn(n_runs,
        n_episodes,
        max_t=300,
        eps_start=0.05,
        eps_end=1e-4,
        eps_decay=0.996):
    steps = np.zeros(n_episodes)
    acc_rewards = []
    scores = []
    eps = eps_start

    map_vec = env.init_map_vec()
    probMap = np.full((8, 8), 0)
    for num in map_vec:
        loc = util.num_to_loc(num, 8)
        probMap[loc[0]][loc[1]] = 1

    print(agent.probMap)

    for i_run in range(0, n_runs):
        # train
        print("run: ", i_run)
        # provide the learned map
        #agent.reset()
        for i_episode in range(0, n_episodes):
            if i_episode % 500 == 0:
                print(i_episode)
            state = env.reset()
            #score = 0
            #agent.probMap = probMap
            #agent.visitMap = np.full((8, 8), 0)
            success = False
            for t in range(max_t):
                action = agent.act(state, eps)
                next_state, reward, done = env.step(action)
                agent.step(state, action, reward, next_state, done, False,
                           True)  # not update the map
                state = next_state
                eps = max(eps * eps_decay, eps_end)
                #score += reward
                if done:
                    #print(env.map)
                    #print("t",t,"score",score)
                    steps[i_episode] = steps[i_episode] + t
                    success = True
                    #print(t)
                    break

            if not success:
                steps[i_episode] = steps[i_episode] + max_t
                #print(t)

        #agent.reset()

    return scores, steps, agent.probMap
Example #16
def rollout(sentence_generator, vae, sentences, inst_to_one_hot, dict_goals, valid_goals, env, policy, env_params,
            inits, goals, self_eval, true_eval, biased_init=False, animated=False):

    expressions = get_list_of_expressions()

    scores = []
    np.random.shuffle(expressions)
    for expression in expressions:
        print('\nAttempting expression: ', expression)
        observation = env.unwrapped.reset_goal(np.array(goals[i]), biased_init=biased_init)
        config_initial = observation['achieved_goal'].copy()
        trial_counter = 0
        success = False
        while trial_counter < 5:
            trial_counter += 1
            goals_str = sample_vae_logic(vae, inst_to_one_hot, observation['achieved_goal'], expression, valid_goals)
            if len(goals_str) > 0:
                goal = dict_goals[np.random.choice(list(goals_str))]
                # goal = dict_goals[np.random.choice(list(goals_str))]
                env.unwrapped.target_goal = goal.copy()
                observation = env.unwrapped._get_obs()
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']

                # start to collect samples
                for t in range(env_params['max_timesteps']):
                    # run policy
                    no_noise = self_eval or true_eval
                    action = policy.act(obs.copy(), ag.copy(), g.copy(), no_noise)
                    # feed the actions into the environment
                    if animated:
                        env.render()
                    observation_new, _, _, info = env.step(action)
                    obs = observation_new['observation']
                    ag = observation_new['achieved_goal']
                config_final = ag.copy()
                true_sentences = sentence_generator(config_initial, config_final)

                if check_sentence(true_sentences, expression):
                    scores.append(trial_counter)
                    success = True
                    print('Success!')
                    break
                else:
                    print('\tFailed. Trying again.')

        if not success:
            scores.append(0)
            print('\tFailed 5 times, Moving On.')


    return scores.copy()
Example #17
def q_learning(size, num_episodes, alpha, gamma=1.0, plot_every=100):
    """Q-Learning - TD Control

    Params
    ======
        size (int): board size passed to the game2048 environment
        num_episodes (int): number of episodes to run the algorithm
        alpha (float): learning rate
        gamma (float): discount factor
        plot_every (int): number of episodes to use when calculating average score
    """
    env = gym.make('game2048-v0', size=size)
    nA = env.action_space.n                # number of actions
    Q = defaultdict(lambda: np.zeros(nA))  # initialize empty dictionary of arrays

    # monitor performance
    tmp_scores = deque(maxlen=plot_every)     # deque for keeping track of scores
    avg_scores = deque(maxlen=num_episodes)   # average scores over every plot_every episodes

    for i_episode in range(1, num_episodes+1):
        # monitor progress
        score = 0                                              # initialize score
        state = env.reset()                                    # start episode
        state = str(state.reshape(size ** 2).tolist())
        eps = 1.0 / i_episode                                  # set value of epsilon

        while True:
            action = epsilon_greedy(env, Q, state, nA, eps)         # epsilon-greedy action selection
            next_state, reward, done, info = env.step(action)  # take action A, observe R, S'
            next_state = str(next_state.reshape(size ** 2).tolist())
            score += reward                                    # add reward to agent's score
            Q[state][action] = update_Q_sarsamax(alpha, gamma, Q, \
                    state, action, reward, next_state)
            state = next_state                                 # S <- S'
            if done:
                tmp_scores.append(score)                       # append score
                break

        print("\rEpisode {}/{}\t Average Score: {:.2f}".format(i_episode, num_episodes, np.mean(tmp_scores)), end="")
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes))
            sys.stdout.flush()
        if (i_episode % plot_every == 0):
            avg_scores.append(np.mean(tmp_scores))

    # plot performance
    plt.plot(np.linspace(0,num_episodes,len(avg_scores),endpoint=False), np.asarray(avg_scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print('Best Average Reward over %d Episodes: %.2f (Q table size: %d)' % (plot_every, np.max(avg_scores), len(Q)))
    return Q
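`epsilon_greedy` and `update_Q_sarsamax` are project helpers not shown here; sketches of the standard versions, matching the call signatures used above (an assumption, not the original code):

import numpy as np


def epsilon_greedy(env, Q, state, nA, eps):
    # Random action with probability eps, otherwise greedy w.r.t. Q[state];
    # nA is kept only to match the call signature used above.
    if np.random.random() < eps:
        return env.action_space.sample()
    return int(np.argmax(Q[state]))


def update_Q_sarsamax(alpha, gamma, Q, state, action, reward, next_state=None):
    # One-step Q-learning (sarsamax) update: bootstrap from max_a Q(S', a).
    Qsa_next = np.max(Q[next_state]) if next_state is not None else 0.0
    target = reward + gamma * Qsa_next
    return Q[state][action] + alpha * (target - Q[state][action])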
Example #18
def test_env(model, vis=False):
    state = env.reset()
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis: env.render()
        total_reward += reward
    return total_reward, env.get_score()
Example #19
def test():
    training_data = np.load('saved2.npy')
    X = np.array([i[0]
                  for i in training_data]).reshape(-1,
                                                   len(training_data[0][0]), 1)
    model = neural_network_model(input_size=len(X[0]))

    model.load("model2.model")
    scores = []
    choices = []

    for each_game in range(1000):
        score = 0
        game_memory = []
        prev_obs = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        env.reset()
        for _ in range(goal_steps):
            #if len(prev_obs) == 0:
            #    action = random.randrange(1,10)
            #else:
            action = np.argmax(
                model.predict(
                    np.array(prev_obs).reshape(-1, len(prev_obs), 1))[0])
            #if(action==9):
            #print(action)
            #print(prev_obs)
            #print(action)
            choices.append(action)

            new_observation, reward, done, info = env.step(action)
            print(new_observation)
            prev_obs = new_observation
            game_memory.append([new_observation, action])
            score += reward
            if done:
                break
        scores.append(score)

    print('Average Score', sum(scores) / len(scores))
    print(
        'Choice 1: {}, Choice 2: {}, Choice 3: {}, Choice 4: {}, Choice 5: {}, Choice 6: {}, Choice 7: {}, Choice 8: {}, Choice 9: {}'
        .format(
            choices.count(1) / len(choices),
            choices.count(2) / len(choices),
            choices.count(3) / len(choices),
            choices.count(4) / len(choices),
            choices.count(5) / len(choices),
            choices.count(6) / len(choices),
            choices.count(7) / len(choices),
            choices.count(8) / len(choices),
            choices.count(9) / len(choices)))
Example #20
def train_ql(size, lr, rd, eps_start=1.0, eps_end=0.05, eps_decay=0.999):
    env = gym.make('game2048-v0', size=size)
    agent = model.QLearning(env.action_space, learning_rate=lr, reward_decay=rd)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    #  trials = 1 * 100000 * (size ** 2)
    trials = 400000
    rewards_window = deque(maxlen=100)
    scores_window = deque(maxlen=100)
    eps = eps_start

    for trial in range(1, trials+1):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            total_steps += 1
            action = agent.choose_action(str(obs), eps)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_)
            obs = obs_
            rewards += reward
            if done:
                break

        #env.render()
        eps = max(eps_end, eps_decay * eps)
        rewards_window.append(rewards)
        scores_window.append(env.get_score())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        print('\rEpisode {}\t total_steps: {}\t Average Rewards: {:.2f}\t Average Scores: {:.2f} {}'.
                format(trial, total_steps, np.mean(rewards_window), np.mean(scores_window), eps), end="")
        if trial % 100 == 0:
            print('\rEpisode {}\t total_steps: {}\t Average Rewards: {:.2f}\t Average Scores: {:.2f} {}'.
                    format(trial, total_steps, np.mean(rewards_window), np.mean(scores_window), eps))

    eval(env, agent, 1000, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} \
highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
    print(f'table_len: {len(agent.q_table)} steps: {total_steps}')
Example #21
def train_sarsa(size, lr, rd):
    env = gym.make('game2048-v0', size=size)
    agent = model.Sarsa(env.action_space, learning_rate=lr, reward_decay=rd)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    trials = 1 * 1000 * (size ** 2)

    for trial in range(trials):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            total_steps += 1
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            action_ = agent.choose_action(obs_, True)
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs = obs_
            action = action_
            rewards += reward
            if done:
                break

        #env.render()
        print(f'Trial {trial} completed in {stepno} steps '
              f'highest: {env.highest()} rewards: {rewards}', end="")
        if env.highest() >= 2 ** (size ** 2 - 1):
            highest[trial] = env.highest()
            if env.highest() >= 2 ** (size ** 2):
                targets[trial] = env.highest()
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        stepno = 0
        rewards = 0

    eval(env, agent, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} \
highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
    print(f'highest len: {len(highest)} prob: {len(highest) * 1.0 / trials} \
target len: {len(targets)} prob: {len(targets) * 1.0 / trials}')
Example #22
    def respond(self, env):
        mask = get_mask(to_char(self.env.get_curr_cards()), self.action_space, to_char(self.env.get_last_cards()))
        s = env.get_state()
        s = np.reshape(s, [1, -1])
        policy, val = self.sess.run([
            self.agents[0].network.valid_policy,
            self.agents[0].network.val_pred],
            feed_dict={
                self.agents[0].network.input: s,
                self.agents[0].network.mask: np.reshape(mask, [1, -1])
            })
        policy = policy[0]
        valid_actions = np.take(np.arange(self.a_dim), mask.nonzero())
        valid_actions = valid_actions.reshape(-1)
        # a = np.random.choice(valid_actions, p=policy)
        a = valid_actions[np.argmax(policy)]
        # print("taking action: ", self.action_space[a])
        return env.step(self.action_space[a])
Example #23
def train(RL):
    acc_r = [0]
    total_steps = 0
    episode = 0
    all_reward = 0
    # observation = env.reset()
    while True:
        # if total_steps-MEMORY_SIZE > 9000: env.render()
        s, t = env.reset()
        observation = s + list(t.reshape(-1, ))
        for i in range(200):
            action = RL.choose_action(observation)

            # f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4)   # [-2 ~ 2] float actions
            (s_, t), reward, done, info = env.step(actions[action])
            observation_ = s_ + list(t.reshape(-1, ))
            acc_r.append(reward + acc_r[-1])  # accumulated reward

            RL.store_transition(observation, action, reward, observation_)

            observation = observation_
            total_steps += 1
            all_reward += reward

            if total_steps > MEMORY_SIZE:
                RL.learn()

            if done:
                break

        # if total_steps-MEMORY_SIZE > 15000:
        #     break
        episode += 1

        if (episode % 100 == 0):
            info = {'averageTotalReward': all_reward / 100}
            all_reward = 0
            for tag, value in info.items():
                logger.scalar_summary(tag, value, episode)
            saver.save(sess, './ddpg.ckpt', global_step=episode + 1)
        if (episode > 2000):
            break
    return RL.cost_his, acc_r
Example #24
def main():
    env_name = "dobro-CartPole-v0"
    env = gym.make(env_name)

    time_horizon = 20
    agent_args = {
        'discount_factor': 0.99,
        'time_horizon': time_horizon,
        'time_step': 0.02,
    }
    agent = Agent(env, agent_args)

    max_steps = 1000
    max_ep_len = min(500, env.spec.max_episode_steps)
    episodes = int(max_steps / max_ep_len)
    epochs = int(1e5)

    for epoch in range(epochs):
        ep_step = 0

        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            step = 0

            while True:
                step += 1
                ep_step += 1

                action = agent.get_action(state)
                next_state, reward, done, info = env.step(action)
                env.render()
                #time.sleep(0.01)

                state = next_state
                score += reward

                if done or step >= max_ep_len:
                    break

            print(score)
Example #25
def objective(space):
    env = gym.make(ENV)
    env = ActionRepeat(env, int(space['repeat']))

    proposals = 1000
    iterations = 10

    # Pool of workers, each has its own copy of global environment variable
    pool = Pool(32, initializer, [env])

    cost = 0
    env.reset()
    for _ in range(env.num_steps):
        state = env.sim.get_state()
        action = cem_planner(pool, env.action_space, state,
                             int(space['horizon']), proposals,
                             int(space['topk']), iterations)
        _, reward, _, _ = env.step(action)
        cost -= reward
    return {'loss': cost, 'status': STATUS_OK}
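`cem_planner` is defined elsewhere; a minimal cross-entropy-method sketch, assuming the worker-global `env` set by an `initializer` like the one sketched after Example #12 and a MuJoCo-style `env.sim` (this is an illustration, not the original implementation):

import numpy as np


def _evaluate(args):
    # Runs inside a pool worker: restore the simulator state (assumed worker
    # global `env`), roll out one candidate action sequence, return its return.
    state, action_sequence = args
    env.sim.set_state(state)
    env.sim.forward()
    total_reward = 0.0
    for action in action_sequence:
        _, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward


def cem_planner(pool, action_space, state, horizon, proposals, topk, iterations):
    # Fit a diagonal Gaussian over action sequences to the top-k rollouts,
    # then execute only the first action of the final mean plan (MPC-style).
    mean = np.zeros((horizon,) + action_space.shape)
    std = np.ones_like(mean)
    for _ in range(iterations):
        samples = np.clip(mean + std * np.random.randn(proposals, *mean.shape),
                          action_space.low, action_space.high)
        returns = np.array(pool.map(_evaluate, [(state, s) for s in samples]))
        elite = samples[np.argsort(returns)[-topk:]]
        mean, std = elite.mean(axis=0), elite.std(axis=0)
    return mean[0]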
Example #26
def eval(env, agent, times=1000, render=False):
    if False:
        write_explore(agent, 'explore_old.file')

    highest_score = 0
    total_scores = 0
    size = env.get_size()
    scores = []
    max_tiles = []

    for i in range(times):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())

        while True:
            action = agent.choose_action(obs)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if render:
                print(f'action is: {action} {obs} {obs_}')
                env.render()
            if obs_ == obs:
                #  env.render()
                agent.learn(obs, action, reward, obs_)
            obs = obs_
            if done:
                break

        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()

    if times > 0:
        plot_score(scores, max_tiles)
        print(f'eval avg_score: {total_scores / times} highest_score: {highest_score}')

    if False:
        write_explore(agent, 'explore_new.file')
Example #27
def evaluate(time, env, agent, render=False):
    eval_reward = []
    for i in range(time):
        obs = env.reset()
        episode_reward = 0
        step = 0
        while True:
            step += 1
            action = agent.predict(obs)  # pick the greedy action
            action = np.clip(action, -1, 1)
            obs, reward, isOver, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if isOver or step >= 200:
                break
        eval_reward.append(episode_reward)
    mean_reward = np.mean(eval_reward)
    print("evaluating on {} episodes with mean reward {}.".format(time, mean_reward))
    logging.warning("evaluating on {} episodes with mean reward {}.".format(time, mean_reward))
    return mean_reward
Example #28
def run_episode(env, agent, rpm):
    obs = env.reset()
    step = 0
    total_reward = 0
    while True:
        action = agent.predict(obs)  # sample an action from the policy
        action = np.clip(np.random.normal(action, opt["NOISE"]), -1.0, 1.0)
        next_obs, reward, done, info = env.step(action)
        rpm.append((obs, action, opt["REWARD_SCALE"] * reward, next_obs, done))

        if len(rpm) > opt["MEMORY_WARMUP_SIZE"] and (step % opt["LEARN_FREQ"]) == 0:
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(opt["BATCH_SIZE"])
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_done)

        obs = next_obs
        total_reward += reward
        step += 1
        if done or step >= 200:
            break
    return step, total_reward
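`rpm` is the project's replay buffer; a minimal sketch of what its `append`, `sample`, and `len` are assumed to do (uniform sampling of transition tuples):

import collections
import random

import numpy as np


class ReplayMemory:
    def __init__(self, max_size):
        # FIFO buffer of (obs, action, reward, next_obs, done) tuples.
        self.buffer = collections.deque(maxlen=max_size)

    def append(self, exp):
        self.buffer.append(exp)

    def sample(self, batch_size):
        # Uniformly sample a batch and stack each field into an array.
        batch = random.sample(self.buffer, batch_size)
        obs, action, reward, next_obs, done = zip(*batch)
        return (np.array(obs, dtype='float32'), np.array(action, dtype='float32'),
                np.array(reward, dtype='float32'), np.array(next_obs, dtype='float32'),
                np.array(done, dtype='float32'))

    def __len__(self):
        return len(self.buffer)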
Example #29
def test_agent(fname, agent, avg=100, seed=43):
    _, env_args = load_args(CONFIG_PATH)
    if fname is not None:
        # if map is specified, use the map without random map
        env_args["fname"] = fname
        env_args["random_map"] = False
    env = gym.make("ScavengerHuntMap-v0", **env_args)
    env.seed(seed)
    dist_list = []
    a = agent(env)
    for i in range(avg):
        print("Running %d/%d" % ((i + 1), avg), end="\r")
        obs = env.reset()
        done = False
        dist = 0
        while not done:
            act = a.next_node(obs)
            cl = env.env.map.get_current_loc()
            obs, _, done, info = env.step(act)
            dist += info["cost"]
        dist_list.append(dist)
    return sum(dist_list) / avg, np.std(dist_list)
Example #30
def eval(env, agent, times=1000, render=False):
    highest_score = 0
    scores = []
    max_tiles = []
    eps = 0.0

    random = False
    for i in range(times):
        obs = env.reset()
        while True:
            action, action_values = agent.choose_action(obs, eps, rand=random)
            obs_, reward, done, _ = env.step(action)
            if render:
                env.render()
            if str(obs_) == str(obs):
                random = True
                #env.render()
                #  print(f'action is: {action} {reward} {action_values} {obs} {obs_}')
                print(
                    f'action is: {action} {reward} {action_values} {obs} {obs_}'
                )
            else:
                random = False
            obs = obs_
            if done:
                break

        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()

    if times > 0:
        plot_score(scores, max_tiles)
        print(
            f'eval avg_score: {np.mean(scores)} highest_score: {highest_score}'
        )
Example #31
def sarsa(lambd):
    n_episodes = 1000
    epi_batch = 100
    episodes = range(n_episodes)
    action_value_function = defaultdict(float)
    n_zero = 100
    n_s = defaultdict(int)
    n_s_a = defaultdict(int)

    if lambd == 0.0 or lambd == 1.0:
        mses = []

    for episode in episodes:
        if episode%epi_batch == 0:
            if lambd == 0.0 or lambd == 1.0:
                mses.append(compute_mse(action_value_function))

        # initialize state, action, epsilon, and eligibility-trace
        state = State()
        current_dealer = state.dealer
        current_player = state.player

        epsilon = float(n_zero) / (n_zero + n_s[(current_dealer, current_player)])
        current_action = epsilon_greedy_policy(action_value_function, state, epsilon)
        eligibility_trace = defaultdict(int)

        while not state.terminal:
            n_s[(current_dealer, current_player)] += 1
            n_s_a[(current_dealer, current_player, current_action)] += 1

            reward = step(state, current_action)
            new_dealer = state.dealer
            new_player = state.player

            epsilon = float(n_zero) / (n_zero + n_s[(new_dealer, new_player)])

            new_action = epsilon_greedy_policy(action_value_function, state, epsilon)

            alpha = 1.0 / n_s_a[(current_dealer, current_player, current_action)]
            prev_action_value = action_value_function[(current_dealer, current_player, current_action)]
            new_action_value = action_value_function[(new_dealer, new_player, new_action)]

            delta = reward + new_action_value - prev_action_value
            eligibility_trace[(current_dealer, current_player, current_action)] += 1

            for key in action_value_function.keys():
                dealer, player, action = key

                # update the action value function
                action_value_function[(dealer, player, action)] \
                    += alpha * delta * eligibility_trace[(dealer, player, action)]

                # update eligibility-trace
                eligibility_trace[(dealer, player, action)] *= lambd

            # update state and action
            current_dealer = new_dealer
            current_player = new_player
            current_action = new_action


    if lambd == 0.0 or lambd == 1.0:
        mses.append(compute_mse(action_value_function))

    # plot mses curve
    if lambd == 0.0 or lambd == 1.0:
        print "Plotting learning curve for $\lambda$=",lambd
        x = range(0, n_episodes + 1, epi_batch)
        fig = plt.figure()
        plt.title('Learning curve of MSE against episode number: $\lambda$ = ' + str(lambd))
        plt.xlabel("episode number")
        plt.xlim([0, n_episodes])
        plt.xticks(range(0, n_episodes + 1, epi_batch))
        plt.ylabel("Mean-Squared Error (MSE)")
        plt.plot(x, mses)
        fname = "mse_lambda%f_%s.png" % (lambd, str(datetime.now()))
        plt.savefig(fname)
        # plt.show()

    mse = compute_mse(action_value_function)

    return mse
Example #32
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Flatten

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

ENV_NAME = 'timetable-case0001-v0001'

# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)

print('observation space:', env.observation_space)
print('action space:', env.action_space)
env.render()
action = env.action_space.sample()
print(action)
obs, r, done, info = env.step(action)
print('next observation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)
print('nb_actions', env.action_space.n)

env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(300))
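The snippet breaks off while the network is being built; a typical keras-rl continuation (a sketch based on the library's standard example, not the original code) finishes the model and wires up the imported DQNAgent:

from keras.layers import Activation
from keras.optimizers import Adam

model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=100, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)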
Example #33
def sarsa(lambd):
    n_episodes = 1000
    epi_batch = 100
    episodes = range(n_episodes)
    action_value_function = defaultdict(float)
    linear_function = LinearFunction()
    params_hit = np.array([0 for i in range(18)])
    params_stick = np.array([0 for i in range(18)])
    n_zero = 10
    epsilon = 0.05
    alpha = 0.01

    if lambd == 0.0 or lambd == 1.0:
        mses = []

    for episode in episodes:
        if episode%epi_batch == 0:
            if lambd == 0.0 or lambd == 1.0:
                mses.append(calculate_mse(action_value_function))

        # initialize state, action, epsilon, and eligibility-trace
        state = State()
        linear_function.update(state)
        current_feats = linear_function.get_features()
        action = epsilon_greedy_policy(action_value_function, state, epsilon, current_feats)
        eligibility_hit = np.array([0 for i in range(18)])
        eligibility_stick = np.array([0 for i in range(18)])

        while not state.terminal:
            np_feats = np.array(current_feats)
            if action == HIT:
                eligibility_hit = np.add(eligibility_hit, np_feats)
            else:
                eligibility_stick = np.add(eligibility_stick, np_feats)

            reward = step(state, action)
            linear_function.update(state)
            new_features = linear_function.get_features()

            # update delta
            delta_hit = reward - np.array(tuple(new_features)).dot(params_hit)
            delta_stick = reward - np.array(tuple(new_features)).dot(params_stick)

            # update Action Value Function
            if action == HIT:
                update_action_value_function(action_value_function, (new_features, action), params_hit)
            else:
                update_action_value_function(action_value_function, (new_features, action), params_stick)

            # update delta, parameters, and eligibility-trace
            if action == HIT:
                delta_hit += action_value_function[(tuple(new_features), HIT)]
            else:
                delta_stick += action_value_function[(tuple(new_features), STICK)]

            params_hit = np.add(params_hit, alpha * delta_hit * eligibility_hit)
            params_stick = np.add(params_stick, alpha * delta_stick * eligibility_stick)
            eligibility_hit = eligibility_hit * lambd
            eligibility_stick = eligibility_stick * lambd

            # decide an action
            action = epsilon_greedy_policy(action_value_function, state, epsilon, new_features)

            # carry the new features over to the next iteration
            current_feats = new_features


    if lambd == 0.0 or lambd == 1.0:
        mses.append(calculate_mse(action_value_function))

    # plot mses curve
    if lambd == 0.0 or lambd == 1.0:
        print "Plotting learning curve for $\lambda$=",lambd
        x = range(0, n_episodes + 1, epi_batch)
        fig = plt.figure()
        plt.title('Learning curve of MSE against Episodes @ $\lambda$ = ' + str(lambd))
        plt.xlabel("episode number")
        plt.xlim([0, n_episodes])
        plt.xticks(range(0, n_episodes + 1, epi_batch))
        plt.ylabel("Mean-Squared Error (MSE)")
        plt.plot(x, mses)
        fname = "lapprox_mse_lambda%f_%s.png" % (lambd, str(datetime.now()))
        plt.savefig(fname)
        # plt.show()

    mse = calculate_mse(action_value_function)

    return mse
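`LinearFunction` supplies the 18 binary features the parameter vectors above expect; a sketch assuming the classic Easy21 coarse coding (3 overlapping dealer intervals x 6 overlapping player intervals):

class LinearFunction:
    # Assumed coarse coding: 3 dealer intervals x 6 player intervals = 18
    # overlapping binary features, recomputed from the current state.
    DEALER_INTERVALS = [(1, 4), (4, 7), (7, 10)]
    PLAYER_INTERVALS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]

    def __init__(self):
        self.features = [0] * 18

    def update(self, state):
        feats = []
        for d_lo, d_hi in self.DEALER_INTERVALS:
            for p_lo, p_hi in self.PLAYER_INTERVALS:
                active = (d_lo <= state.dealer <= d_hi
                          and p_lo <= state.player <= p_hi)
                feats.append(1 if active else 0)
        self.features = feats

    def get_features(self):
        return list(self.features)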