Example 1
def run_env(env, n_runs=100):
    """
    Plots simulated games in an environment for visualization
    :param env: environment to be run
    :param n_runs: how many episodes should be run
    :return: plot of each step in the environment
    """
    for i in range(n_runs):
        env.reset()
        env.show()
        done = False
        while not done:
            # for the reinforcement agent, convert the board to a state input
            state = env.agents[0].board_to_state()
            action = env.agents[0].select_action(state, 0.00)
            action = action[0, 0]  # action is unwrapped from the LongTensor
            # e.g. action = 1 -> move = ((0, 0), (0, 1))
            move = env.agents[0].action_to_move(action)
            _, done, won = env.step(move)
            env.show()
            if done and won:
                print("Won!")
            elif done and not won or env.steps > 20:
                print("Lost")
                break
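Note: the agent's select_action used in Example 1 is not shown. Below is a minimal epsilon-greedy sketch consistent with the interface above (the action is unwrapped with action[0, 0], so it returns a 1x1 LongTensor); q_net and n_actions are hypothetical names, and PyTorch is assumed.

import random
import torch

def select_action(q_net, state, eps, n_actions):
    """Epsilon-greedy selection returning a 1x1 LongTensor (hypothetical sketch)."""
    if random.random() > eps:
        with torch.no_grad():
            return q_net(state).max(1)[1].view(1, 1)  # greedy action index
    return torch.tensor([[random.randrange(n_actions)]], dtype=torch.long)  # random action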
Example 2
def test(game_size, norm):
    #  start_pprof_server(port=8081)
    env = gym.make('game2048-v0', size=game_size, norm=norm)
    obs = env.reset()
    rewards = 0
    step = 0

    for _ in range(1):
        start = time.time() * 1000
        while True:
            # uncomment to render every step
            #  env.render()
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            rewards += reward
            step += 1
            if done:
                elapsed = time.time() * 1000 - start
                env.render()
                print(f'obs: {obs}')
                print(
                    f'episode steps: {step} reward: {rewards} info: {info}'
                    f' took {elapsed:.3f}ms speed: {(step * 1000 / elapsed):.3f}ops/s'
                )
                time.sleep(0.5)

                step = 0
                rewards = 0
                start = time.time() * 1000
                env.reset()
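Note: several examples (2, 8, 16, 18, 19) call gym.make('game2048-v0', ...), which requires the custom environment to be registered first. A typical registration sketch with the classic Gym API follows; the entry_point path and default kwargs are assumptions, not taken from the source.

from gym.envs.registration import register

register(
    id='game2048-v0',
    entry_point='game2048.envs:Game2048Env',  # assumed module path
    kwargs={'size': 4, 'norm': False},        # assumed defaults; gym.make(..., size=..., norm=...) can override them
)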
Example 3
def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
Example 4
def some_random_games_first():
    for episode in range(10):
        env.reset()
        for t in range(goal_steps):
            action = env.action_space()
            observation, reward, done, info = env.step(action)
            if done:
                break
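Note: Example 4 (and Example 5 below) call env.action_space(), which only works if the custom environment exposes a callable action sampler. With the standard Gym API the equivalent random rollout uses env.action_space.sample(); a minimal sketch (the environment id is illustrative):

import gym

def random_rollouts(env_id='CartPole-v1', episodes=10, goal_steps=500):
    """Random rollouts using the standard Gym API."""
    env = gym.make(env_id)
    for _ in range(episodes):
        env.reset()
        for _ in range(goal_steps):
            action = env.action_space.sample()  # sample a random action
            _, _, done, _ = env.step(action)
            if done:
                break
    env.close()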
Example 5
def initial_population():
    training_data = []
    scores = []
    accepted_scores = []
    for _ in range(initial_games):
        env.reset()
        if (_ % 100 == 0):
            print(_)
        score = 0
        game_memory = []
        prev_observation = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        for _ in range(goal_steps):
            #print(prev_observation)
            action = env.action_space()
            observation, reward, done, info = env.step(action)
            #print(action)

            if len(prev_observation) > 0:
                game_memory.append([prev_observation, action])

            prev_observation = observation
            score += reward
            #if done:
            #    break
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                # one-hot encode the action (actions are numbered 1-9)
                output = [0] * 9
                output[data[1] - 1] = 1
                training_data.append([data[0], output])

        scores.append(score)

    training_data_save = np.array(training_data)
    np.save('saved2.npy', training_data_save)

    print('Average accepted score:', mean(accepted_scores))
    print('Median accepted score: ', median(accepted_scores))
    print(Counter(accepted_scores))

    return training_data
Example 6
def test():
    training_data = np.load('saved2.npy')
    X = np.array([i[0]
                  for i in training_data]).reshape(-1,
                                                   len(training_data[0][0]), 1)
    model = neural_network_model(input_size=len(X[0]))

    model.load("model2.model")
    scores = []
    choices = []

    for each_game in range(1000):
        score = 0
        game_memory = []
        prev_obs = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        env.reset()
        for _ in range(goal_steps):
            #if len(prev_obs) == 0:
            #    action = random.randrange(1,10)
            #else:
            action = np.argmax(
                model.predict(
                    np.array(prev_obs).reshape(-1, len(prev_obs), 1))[0])
            #if(action==9):
            #print(action)
            #print(prev_obs)
            #print(action)
            choices.append(action)

            new_observation, reward, done, info = env.step(action)
            print(new_observation)
            prev_obs = new_observation
            game_memory.append([new_observation, action])
            score += reward
            if done:
                break
        scores.append(score)

    print('Average Score', sum(scores) / len(scores))
    print(
        'Choice 1: {}, Choice 2: {}, Choice 3: {}, Choice 4: {}, Choice 5: {}, Choice 6: {}, Choice 7: {}, Choice 8: {}, Choice 9: {}'
        .format(
            choices.count(1) / len(choices),
            choices.count(2) / len(choices),
            choices.count(3) / len(choices),
            choices.count(4) / len(choices),
            choices.count(5) / len(choices),
            choices.count(6) / len(choices),
            choices.count(7) / len(choices),
            choices.count(8) / len(choices),
            choices.count(9) / len(choices)))
Example 7
def self_play(env, agent, return_trajectory=False, verbose=False):
    if return_trajectory:
        trajectory = []
    observation = env.reset()
    for step in itertools.count():
        board, _, player, _, _ = observation
        action, prob = agent.decide(observation, return_prob=True)
        if verbose:
            print(strfboard(observation))
            logging.info('Step {}: player {}, action {}'.format(step, player, action))
        observation, winner, done, _ = env.step(action[0])
        if return_trajectory:
            m,n = board.shape
            board = np.reshape(board, m*n)
            trajectory.append((player, board, prob))
        if done:
            if verbose:
                print(strfboard(observation))
                logging.info('Winner {}'.format(winner))
            break
    if return_trajectory:
        df_trajectory = pd.DataFrame(trajectory,
                columns=['player', 'board', 'prob'])
        df_trajectory['winner'] = winner
        return df_trajectory
    else:
        return winner
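Note: Example 7 formats positions with a strfboard helper that is not included here. A minimal sketch consistent with the call site, assuming observation[0] is a 2-D numpy board with 0 for empty cells and +1/-1 for the two players:

import numpy as np

def strfboard(observation):
    """Render the board part of an observation as text (hypothetical sketch)."""
    board = np.asarray(observation[0])
    symbols = {0: '.', 1: 'X', -1: 'O'}
    return '\n'.join(' '.join(symbols.get(int(v), '?') for v in row) for row in board)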
Example 8
def train_sl(size, lr, rd):
    env = gym.make('game2048-v0', size=size)
    agent = model.SarsaLambda(env.action_space)
    trials = 1 * 10000 * (size ** 2)

    for trial in range(trials):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            action_ = agent.choose_action(obs_)
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs = obs_
            action = action_
            rewards += reward
            if done:
                break

        env.render()
        print(f'Trial {trial} finished in {stepno} steps, highest tile: \
{env.highest()} rewards: {rewards}')
        stepno = 0
        rewards = 0

    print(len(agent.q_table))
Example 9
def behaviour(self, candidate):
    obs = env.reset()
    done = False
    while not done:
        action = get_action(ns, obs)
        obs, reward, done, _ = env.step(action)
    return obs
Example 10
def get_env_params(env):
    obs = env.reset()

    # collect observation/goal/action dimensions and the episode length
    params = {'obs': obs['observation'].shape[0], 'goal': obs['desired_goal'].shape[0],
              'action': env.action_space.shape[0], 'action_max': env.action_space.high[0],
              'max_timesteps': env._max_episode_steps}
    return params
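Note: a usage sketch for get_env_params, assuming a goal-conditioned environment with dict observations; the environment id is an assumption:

import gym

env = gym.make('FetchReach-v1')   # any goal-based env with 'observation'/'desired_goal' keys works
env_params = get_env_params(env)
print(env_params)                 # sizes of observation, goal and action plus the episode length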
Example 11
def main(args):
    param_str = (
        f'{args.env}_{args.algo}_rep={args.repeat}_hor={args.horizon}_prop={args.proposals}'
        f'_iter={args.iterations}_sigma={args.sigma}')

    env = gym.make(args.env)
    env = ActionRepeat(env, args.repeat)

    # Pool of workers; each worker gets its own copy of the global environment variable
    pool = Pool(32, initializer, [env])

    if args.algo == 'gaussian':
        planner = partial(gaussian_cem,
                          pool=pool,
                          action_space=env.action_space,
                          horizon=args.horizon,
                          proposals=args.proposals,
                          topk=args.topk,
                          iterations=args.iterations)
    elif args.algo == 'nonparametric':
        planner = partial(nonparametric_cem,
                          pool=pool,
                          action_space=env.action_space,
                          horizon=args.horizon,
                          proposals=args.proposals,
                          topk=args.topk,
                          iterations=args.iterations,
                          sigma=args.sigma)

    scores = np.zeros(args.episodes)
    observations = np.zeros((args.episodes, env.num_steps + 1) +
                            env.observation_space.shape)
    actions = np.zeros((args.episodes, env.num_steps) + env.action_space.shape)

    for i in range(args.episodes):
        logger = Logger(os.path.join(args.logdir, f'{param_str}_run{i}'))
        observations[i, 0] = env.reset()

        for t in range(env.num_steps):
            state = env.sim.get_state()
            actions[i, t] = planner(state)
            observations[i, t + 1], reward, _, _ = env.step(actions[i, t])
            scores[i] += reward
            logger.log_scalar('reward', scores[i], t)

        print(scores[i])

    print(param_str)
    print('Mean score:         ', scores.mean())
    print('Standard deviation: ', scores.std())

    if args.save:
        path = os.path.join(args.savedir, args.env)
        if not os.path.exists(path):
            os.makedirs(path)
        np.save(os.path.join(path, 'obs'), observations)
        np.save(os.path.join(path, 'act'), actions)
Example 12
def evaluate(self):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        action = get_action(ns, obs)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward
Example 13
def dqn(n_runs,
        n_episodes,
        max_t=300,
        eps_start=0.05,
        eps_end=1e-4,
        eps_decay=0.996):
    steps = np.zeros(n_episodes)
    acc_rewards = []
    scores = []
    eps = eps_start

    map_vec = env.init_map_vec()
    probMap = np.full((8, 8), 0)
    for num in map_vec:
        loc = util.num_to_loc(num, 8)
        probMap[loc[0]][loc[1]] = 1

    print(agent.probMap)

    for i_run in range(0, n_runs):
        # train
        print("run: ", i_run)
        # provide the learned map
        #agent.reset()
        for i_episode in range(0, n_episodes):
            if i_episode % 500 == 0:
                print(i_episode)
            state = env.reset()
            #score = 0
            #agent.probMap = probMap
            #agent.visitMap = np.full((8, 8), 0)
            for t in range(max_t):
                success = False
                action = agent.act(state, eps)
                next_state, reward, done = env.step(action)
                agent.step(state, action, reward, next_state, done, False,
                           True)  # not update the map
                state = next_state
                eps = max(eps * eps_decay, eps_end)
                #score += reward
                if done:
                    #print(env.map)
                    #print("t",t,"score",score)
                    steps[i_episode] = steps[i_episode] + t
                    success = True
                    #print(t)
                    break

            if not success:
                steps[i_episode] = steps[i_episode] + max_t
                #print(t)

        #agent.reset()

    return scores, steps, agent.probMap
Example 14
def objective(space):
    env = gym.make(ENV)
    env = ActionRepeat(env, int(space['repeat']))

    proposals = 1000
    iterations = 10

    # Pool of workers; each worker gets its own copy of the global environment variable
    pool = Pool(32, initializer, [env])

    cost = 0
    env.reset()
    for _ in range(env.num_steps):
        state = env.sim.get_state()
        action = cem_planner(pool, env.action_space, state,
                             int(space['horizon']), proposals,
                             int(space['topk']), iterations)
        _, reward, _, _ = env.step(action)
        cost -= reward
    return {'loss': cost, 'status': STATUS_OK}
Example 15
def get_benchmark(cards, target):
    env = Env()
    episodes = 0
    rewards = 0
    total_episodes = 500
    while episodes < total_episodes:
        if episodes % 100 == 0:
            print('running %d' % episodes)
            print(rewards / (episodes + 1))
        end = False
        env.reset()
        env.prepare(cards)
        while not end:
            r, end = target.respond(env)
            rewards += r
        if r == 1.:
            print('you win!')
        else:
            print('you lose!')
        episodes += 1
    return rewards / total_episodes
Example 16
def q_learning(size, num_episodes, alpha, gamma=1.0, plot_every=100):
    """Q-Learning - TD Control

    Params
    ======
        size (int): side length of the 2048 board
        num_episodes (int): number of episodes to run the algorithm
        alpha (float): learning rate
        gamma (float): discount factor
        plot_every (int): number of episodes to use when calculating average score
    """
    env = gym.make('game2048-v0', size=size)
    nA = env.action_space.n                # number of actions
    Q = defaultdict(lambda: np.zeros(nA))  # initialize empty dictionary of arrays

    # monitor performance
    tmp_scores = deque(maxlen=plot_every)     # deque for keeping track of scores
    avg_scores = deque(maxlen=num_episodes)   # average scores over every plot_every episodes

    for i_episode in range(1, num_episodes+1):
        # monitor progress
        score = 0                                              # initialize score
        state = env.reset()                                    # start episode
        state = str(state.reshape(size ** 2).tolist())
        eps = 1.0 / i_episode                                  # set value of epsilon

        while True:
            action = epsilon_greedy(env, Q, state, nA, eps)         # epsilon-greedy action selection
            next_state, reward, done, info = env.step(action)  # take action A, observe R, S'
            next_state = str(next_state.reshape(size ** 2).tolist())
            score += reward                                    # add reward to agent's score
            Q[state][action] = update_Q_sarsamax(alpha, gamma, Q, \
                    state, action, reward, next_state)
            state = next_state                                 # S <- S'
            if done:
                tmp_scores.append(score)                       # append score
                break

        print("\rEpisode {}/{}\t Average Score: {:.2f}".format(i_episode, num_episodes, np.mean(tmp_scores)), end="")
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes))
            sys.stdout.flush()
        if (i_episode % plot_every == 0):
            avg_scores.append(np.mean(tmp_scores))

    # plot performance
    plt.plot(np.linspace(0,num_episodes,len(avg_scores),endpoint=False), np.asarray(avg_scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print('Best Average Reward over %d Episodes (Q-table size: %d): ' % (plot_every, len(Q)), np.max(avg_scores))
    return Q
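Note: Example 16 relies on epsilon_greedy and update_Q_sarsamax, which are not shown. Minimal sketches consistent with how they are called above (Q is a defaultdict mapping a state string to an array of action values):

import numpy as np

def epsilon_greedy(env, Q, state, nA, eps):
    """With probability eps pick a random action, otherwise the greedy one."""
    if np.random.random() > eps:
        return int(np.argmax(Q[state]))
    return np.random.randint(nA)

def update_Q_sarsamax(alpha, gamma, Q, state, action, reward, next_state=None):
    """One-step Q-learning (Sarsamax) update; a terminal next_state contributes 0."""
    target = reward + (gamma * np.max(Q[next_state]) if next_state is not None else 0.0)
    return Q[state][action] + alpha * (target - Q[state][action])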
Example 17
def test_env(model, vis=False):
    state = env.reset()
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis: env.render()
        total_reward += reward
    return total_reward, env.get_score()
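Note: Example 17 expects model(state) to return a distribution and a value estimate. A minimal PyTorch actor-critic sketch consistent with that interface, assuming a discrete action space (layer sizes are illustrative):

import torch.nn as nn
from torch.distributions import Categorical

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden=128):
        super().__init__()
        self.actor = nn.Sequential(nn.Linear(num_inputs, hidden), nn.ReLU(),
                                   nn.Linear(hidden, num_actions))
        self.critic = nn.Sequential(nn.Linear(num_inputs, hidden), nn.ReLU(),
                                    nn.Linear(hidden, 1))

    def forward(self, state):
        dist = Categorical(logits=self.actor(state))  # policy over discrete actions
        value = self.critic(state)                    # state-value estimate
        return dist, value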
Example 18
def train_ql(size, lr, rd, eps_start=1.0, eps_end=0.05, eps_decay=0.999):
    env = gym.make('game2048-v0', size=size)
    agent = model.QLearning(env.action_space, learning_rate=lr, reward_decay=rd)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    #  trials = 1 * 100000 * (size ** 2)
    trials = 400000
    rewards_window = deque(maxlen=100)
    scores_window = deque(maxlen=100)
    eps = eps_start

    for trial in range(1, trials+1):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            total_steps += 1
            action = agent.choose_action(str(obs), eps)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_)
            obs = obs_
            rewards += reward
            if done:
                break

        #env.render()
        eps = max(eps_end, eps_decay * eps)
        rewards_window.append(rewards)
        scores_window.append(env.get_score())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        print('\rEpisode {}\t total_steps: {}\t Average Rewards: {:.2f}\t Average Scores: {:.2f} {}'.
                format(trial, total_steps, np.mean(rewards_window), np.mean(scores_window), eps), end="")
        if trial % 100 == 0:
            print('\rEpisode {}\t total_steps: {}\t Average Rewards: {:.2f}\t Average Scores: {:.2f} {}'.
                    format(trial, total_steps, np.mean(rewards_window), np.mean(scores_window), eps))

    eval(env, agent, 1000, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} \
highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
    print(f'table_len: {len(agent.q_table)} steps: {total_steps}')
Example 19
def train_sarsa(size, lr, rd):
    env = gym.make('game2048-v0', size=size)
    agent = model.Sarsa(env.action_space, learning_rate=lr, reward_decay=rd)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    trials = 1 * 1000 * (size ** 2)

    for trial in range(trials):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            total_steps += 1
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            action_ = agent.choose_action(obs_, True)
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs = obs_
            action = action_
            rewards += reward
            if done:
                break

        #env.render()
        print(f'Trial {trial} finished in {stepno} steps, highest tile: \
{env.highest()} rewards: {rewards}', end="")
        if env.highest() >= 2 ** (size ** 2 - 1):
            highest[trial] = env.highest()
            if env.highest() >= 2 ** (size ** 2):
                targets[trial] = env.highest()
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        stepno = 0
        rewards = 0

    eval(env, agent, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} \
highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
    print(f'highest len: {len(highest)} prob: {len(highest) * 1.0 / trials} \
target len: {len(targets)} prob: {len(targets) * 1.0 / trials}')
Example 20
def train(RL):
    acc_r = [0]
    total_steps = 0
    episode = 0
    all_reward = 0
    # observation = env.reset()
    while True:
        # if total_steps-MEMORY_SIZE > 9000: env.render()
        s, t = env.reset()
        observation = s + list(t.reshape(-1, ))
        for i in range(200):
            action = RL.choose_action(observation)

            # f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4)   # [-2 ~ 2] float actions
            (s_, t), reward, done, info = env.step(actions[action])
            observation_ = s_ + list(t.reshape(-1, ))
            acc_r.append(reward + acc_r[-1])  # accumulated reward

            RL.store_transition(observation, action, reward, observation_)

            observation = observation_
            total_steps += 1
            all_reward += reward

            if total_steps > MEMORY_SIZE:
                RL.learn()

            if done:
                break

        # if total_steps-MEMORY_SIZE > 15000:
        #     break
        episode += 1

        if episode % 100 == 0:
            info = {'averageTotalReward': all_reward / 100}
            all_reward = 0
            for tag, value in info.items():
                logger.scalar_summary(tag, value, episode)  # log against the episode index
            saver.save(sess, './ddpg.ckpt', global_step=episode + 1)
        if episode > 2000:
            break
    return RL.cost_his, acc_r
Example 21
def main():
    env_name = "dobro-CartPole-v0"
    env = gym.make(env_name)

    time_horizon = 20
    agent_args = {
        'discount_factor': 0.99,
        'time_horizon': time_horizon,
        'time_step': 0.02,
    }
    agent = Agent(env, agent_args)

    max_steps = 1000
    max_ep_len = min(500, env.spec.max_episode_steps)
    episodes = int(max_steps / max_ep_len)
    epochs = int(1e5)

    for epoch in range(epochs):
        ep_step = 0

        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            step = 0

            while True:
                step += 1
                ep_step += 1

                action = agent.get_action(state)
                next_state, reward, done, info = env.step(action)
                env.render()
                #time.sleep(0.01)

                state = next_state
                score += reward

                if done or step >= max_ep_len:
                    break

            print(score)
Example 22
def eval(env, agent, times=1000, render=False):
    if False:
        write_explore(agent, 'explore_old.file')

    highest_score = 0
    total_scores = 0
    size = env.get_size()
    scores = []
    max_tiles = []

    for i in range(times):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())

        while True:
            action = agent.choose_action(obs)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if render:
                print(f'action is: {action} {obs} {obs_}')
                env.render()
            if obs_ == obs:
                #  env.render()
                agent.learn(obs, action, reward, obs_)
            obs = obs_
            if done:
                break

        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()

    if times > 0:
        plot_score(scores, max_tiles)
        print(f'eval avg_score: {total_scores / times} highest_score: {highest_score}')

    if False:
        write_explore(agent, 'explore_new.file')
Example 23
def evaluate(time, env, agent, render=False):
    eval_reward = []
    for i in range(time):
        obs = env.reset()
        episode_reward = 0
        step = 0
        while True:
            step += 1
            action = agent.predict(obs)  # select the greedy (optimal) action
            action = np.clip(action, -1, 1)
            obs, reward, isOver, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if isOver or step >= 200:
                break
        eval_reward.append(episode_reward)
    mean_reward = np.mean(eval_reward)
    print("evaluating on {} episodes with mean reward {}.".format(time, mean_reward))
    logging.warning("evaluating on {} episodes with mean reward {}.".format(time, mean_reward))
    return mean_reward
Example 24
def run_episode(env, agent, rpm):
    obs = env.reset()
    step = 0
    total_reward = 0
    while True:
        action = agent.predict(obs)  # sample an action (exploration noise added below)
        action = np.clip(np.random.normal(action, opt["NOISE"]), -1.0, 1.0)
        next_obs, reward, done, info = env.step(action)
        rpm.append((obs, action, opt["REWARD_SCALE"] * reward, next_obs, done))

        if len(rpm) > opt["MEMORY_WARMUP_SIZE"] and (step % opt["LEARN_FREQ"]) == 0:
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(opt["BATCH_SIZE"])
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_done)

        obs = next_obs
        total_reward += reward
        step += 1
        if done or step >= 200:
            break
    return step, total_reward
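Note: Example 24 assumes a replay memory rpm that supports append, sample and len. A minimal sketch matching how it is used (the tuple layout is taken from the call sites; the class name is an assumption):

import random
from collections import deque
import numpy as np

class ReplayMemory:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def append(self, exp):
        # exp = (obs, action, reward, next_obs, done)
        self.buffer.append(exp)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        obs, action, reward, next_obs, done = zip(*batch)
        return (np.array(obs), np.array(action), np.array(reward),
                np.array(next_obs), np.array(done))

    def __len__(self):
        return len(self.buffer)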
Example 25
def test_agent(fname, agent, avg=100, seed=43):
    _, env_args = load_args(CONFIG_PATH)
    if fname is not None:
        # if map is specified, use the map without random map
        env_args["fname"] = fname
        env_args["random_map"] = False
    env = gym.make("ScavengerHuntMap-v0", **env_args)
    env.seed(seed)
    dist_list = []
    a = agent(env)
    for i in range(avg):
        print("Running %d/%d" % ((i + 1), avg), end="\r")
        obs = env.reset()
        done = False
        dist = 0
        while not done:
            act = a.next_node(obs)
            cl = env.env.map.get_current_loc()
            obs, _, done, info = env.step(act)
            dist += info["cost"]
        dist_list.append(dist)
    return sum(dist_list) / avg, np.std(dist_list)
Example 26
def eval(env, agent, times=1000, render=False):
    highest_score = 0
    scores = []
    max_tiles = []
    eps = 0.0

    random = False
    for i in range(times):
        obs = env.reset()
        while True:
            action, action_values = agent.choose_action(obs, eps, rand=random)
            obs_, reward, done, _ = env.step(action)
            if render:
                env.render()
            if str(obs_) == str(obs):
                random = True
                #env.render()
                #  print(f'action is: {action} {reward} {action_values} {obs} {obs_}')
                print(
                    f'action is: {action} {reward} {action_values} {obs} {obs_}'
                )
            else:
                random = False
            obs = obs_
            if done:
                break

        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()

    if times > 0:
        plot_score(scores, max_tiles)
        print(
            f'eval avg_score: {np.mean(scores)} highest_score: {highest_score}'
        )
Example 27
# with open('./Center/log.csv', 'w') as myfile:
#     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#     wr.writerow(['Episode','Length','Reward','IMG','LOG','SAL'])

# wr = csv.writer(open('./Center/log.csv', 'a'), quoting=csv.QUOTE_ALL)
with tf.Session(config=config) as sess:
    if load_model == True:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(init)

    for i in range(num_episodes):
        episodeBuffer = []
        sP = env.reset()
        s = processState(sP)
        d = False
        rAll = 0
        j = 0
        state = (np.zeros([1, h_size]), np.zeros([1, h_size]))
        # The Q-Network
        while j < max_epLength:
            j += 1

            if np.random.rand(1) < e:
                state1 = sess.run(mainQN.rnn_state, \
                                  feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1,
                                             mainQN.state_in: state, mainQN.batch_size: 1})
                a = np.random.randint(0, 4)
            else:
Example 28
    a_bound = env.action_space.high

    print('s_dim', s_dim)
    print('a_dim', a_dim)

    # use the DDPG algorithm
    ddpg = DDPG(a_dim, s_dim, a_bound)

    # training section:
    if args.train:  # train

        reward_buffer = []  # record each episode's reward to track progress
        t0 = time.time()  # track overall training time
        for i in range(MAX_EPISODES):
            t1 = time.time()
            s = env.reset()
            ep_reward = 0  # accumulate the reward of the current episode
            for j in range(MAX_EP_STEPS):
                # Add exploration noise
                a = ddpg.choose_action(s)  # simply let the actor estimate the action a

                # To keep exploiting what the actor has learned while still exploring,
                # a different way of adding exploration is used here: build a normal
                # distribution with mean a and standard deviation VAR, then sample the
                # action from it. Since a is the mean it is the most likely value; how
                # far samples stray from it is controlled by VAR (which could also be
                # updated over time to adjust how deterministic a is).
                # Finally the sampled action is clipped to the valid range.

                # Question: the original text uses N(0, \sigma^2); per the reference paper it should be a + N(0, \sigma^2)
                a = np.clip(np.random.normal(loc=a, scale=sigma), 0, 1)
                # interact with the environment
                s_, r, done, info = env.step(a)
Example 29
            kk_seq.append(kk)
        k_seq.reverse()
        kk_seq.reverse()
        return k_seq, kk_seq

    def forward(self, x_seq, u_seq, k_seq, kk_seq):
        x_seq_hat = np.array(x_seq)
        u_seq_hat = np.array(u_seq)
        for t in range(len(u_seq)):
            control = k_seq[t] + np.matmul(kk_seq[t], (x_seq_hat[t] - x_seq[t]))
            u_seq_hat[t] = np.clip(u_seq[t] + control, -self.umax, self.umax)
            x_seq_hat[t + 1] = self.f(x_seq_hat[t], u_seq_hat[t])
        return x_seq_hat, u_seq_hat

env = gym.make('CartPoleContinuous-v0').env
obs = env.reset()
ilqr = ILqr(lambda x, u: env._state_eq(x, u),  # x(i+1) = f(x(i), u)
            lambda x, u: 0.5 * np.sum(np.square(u)),  # l(x, u)
            lambda x: 0.5 * (np.square(1.0 - np.cos(x[2])) + np.square(x[1]) + np.square(x[3])),  # lf(x)
            env.max_force,
            env.observation_space.shape[0])
u_seq = [np.zeros(1) for _ in range(ilqr.pred_time)]
x_seq = [obs.copy()]
for t in range(ilqr.pred_time):
    x_seq.append(env._state_eq(x_seq[-1], u_seq[t]))

cnt = 0
while True:
    env.render(mode="rgb_array")
    #import pyglet
    #pyglet.image.get_buffer_manager().get_color_buffer().save('frame_%04d.png' % cnt)
Example 30
        self.sim_counter += 1
        self.name = f'tour{self.sim_counter:03}'
        if self.verbose:
            print(
                f'[*] Starting a new simulation with noisy travel times-{self.name}'
            )


if __name__ == '__main__':
    from env_rl import EnvRL

    env = EnvRL(5, seed=123456, adaptive=False)
    print('name', env.name)
    env.step(2)
    env.step(4)
    env.step(5)
    env.step(1)
    env.step(3)
    print('tour', env.tour)
    print('tour time', env.tour_time)
    print(50 * '-')
    env.reset()
    print('name', env.name)
    env.step(2)
    env.step(4)
    env.step(5)
    env.step(1)
    env.step(3)
    print('tour', env.tour)
    print('tour time', env.tour_time)