Example #1
def test(game_size, norm):
    #  start_pprof_server(port=8081)
    env = gym.make('game2048-v0', size=game_size, norm=norm)
    obs = env.reset()
    rewards = 0
    step = 0

    for _ in range(1):
        start = time.time() * 1000
        while True:
            # uncomment to render every step
            #  env.render()
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            rewards += reward
            step += 1
            if done:
                escape = time.time() * 1000 - start
                env.render()
                print(f'obs: {obs}')
                print(
                    f'play games steps: {step} reward: {rewards} info: {info}'
                    +
                    f' use {escape:.3f}ms speed: {(step * 1000 / escape):.3f}ops/s'
                )
                time.sleep(0.5)

                step = 0
                rewards = 0
                start = time.time() * 1000
                env.reset()
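A minimal sketch of how this benchmark might be driven, assuming an installed package registers `game2048-v0` with the `size`/`norm` keyword arguments used above (the module name below is a placeholder, not the original project's):

import time

import gym
import gym_game2048  # hypothetical: whichever module registers 'game2048-v0'

if __name__ == '__main__':
    # random-policy speed benchmark on a 4x4 board; argument values are illustrative
    test(game_size=4, norm=True)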
Example #2
def train_sl(size, lr, rd):
    env = gym.make('game2048-v0', size=size)
    agent = model.SarsaLambda(env.action_space)
    trials = 1 * 10000 * (size ** 2)

    for trial in range(trials):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            action_ = agent.choose_action(obs_)
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs = obs_
            action = action_
            rewards += reward
            if done:
                break

        env.render()
        print(f'Completed trial {trial} in {stepno} steps '
              f'highest: {env.highest()} rewards: {rewards}')
        stepno = 0
        rewards = 0

    print(len(agent.q_table))
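`model.SarsaLambda` itself is not part of the snippet. A minimal tabular SARSA(lambda) agent with the same `choose_action`/`learn` interface and string-keyed Q-table might look like the sketch below; the class name, hyperparameters, and defaultdict layout are assumptions, not the original implementation.

import random
from collections import defaultdict

class SarsaLambdaSketch:
    def __init__(self, action_space, alpha=0.1, gamma=0.9, lam=0.9, epsilon=0.1):
        self.actions = list(range(action_space.n))
        self.alpha, self.gamma, self.lam, self.epsilon = alpha, gamma, lam, epsilon
        self.q_table = defaultdict(lambda: [0.0] * len(self.actions))  # state -> action values
        self.traces = defaultdict(lambda: [0.0] * len(self.actions))   # eligibility traces

    def choose_action(self, obs):
        # epsilon-greedy over the tabular action values
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        q = self.q_table[obs]
        return q.index(max(q))

    def learn(self, obs, action, reward, obs_, action_):
        # SARSA(lambda) update with accumulating eligibility traces
        q_next = 0.0 if obs_ == 'terminal' else self.q_table[obs_][action_]
        delta = reward + self.gamma * q_next - self.q_table[obs][action]
        self.traces[obs][action] += 1.0
        for state, trace in self.traces.items():
            for a in self.actions:
                self.q_table[state][a] += self.alpha * delta * trace[a]
                trace[a] *= self.gamma * self.lam
        if obs_ == 'terminal':
            self.traces.clear()  # episode ended: reset all traces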
Example #3
def main():
	import time
	st = time.time()
	env = stage_1()
	state_size = 5
	num_actions = 9
	
	solver = DeepQSolver(state_size, num_actions, 2000, 100)
	epsilon = 1
	train_rewards = []
	for i in range(750):
		res = train(env, solver, epsilon)
		print("Train: Episode", i, "epsilon", epsilon, "time", (time.time() - st) / 60, ": Reward =", res)
		epsilon = max(epsilon * 0.90, 0.05)
		train_rewards.append(res)
	visualize(train_rewards, 'DeepQ', 'DeepQ_stage1.png')
	
	# st = time.time()
	# test_rewards = []
	# for i in range(100):
	# 	res = train(env, solver, 0)
	# 	print("Test: Episode", i, "time", (time.time() - st) / 60, ": Reward =", res)
	# 	test_rewards.append(res)
	# print(f'Test: average {np.mean(test_rewards)}')
	
	render(env, save_path='DeepQ_stage1.mp4')
Example #4
def main():
    import time
    st = time.time()

    env = stage_2()  # environment
    state_size = 5
    num_actions = 9

    model = Reinforce(state_size, num_actions)

    train_rewards = []
    for i in range(2500):
        res = train(env, model)
        print(f'Train: Episode {i} time {(time.time() - st) / 60}: {res}')
        train_rewards.append(res)
    visualize(train_rewards, 'Reinforce', 'Reinforce_stage2.png')

    # st = time.time()
    # test_rewards = []
    # for i in range(100):
    # 	res = test(env, model)
    # 	print(f'Test: Episode {i} time {(time.time() - st) / 60}: {res}')
    # 	test_rewards.append(res)
    # print(f'Test: average {np.mean(test_rewards)}')

    render(env, save_path='Reinforce_stage2.mp4')
Example #5
def local_test(index, opt, global_model):
    # note: `opt` is unused here; this variant reads the module-level `args` namespace instead
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(args.world, args.stage, args.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=args.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        if curr_step > args.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
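For context, an evaluation worker like this is usually launched with `torch.multiprocessing` alongside the training workers. A hedged sketch of such a launcher, assuming `args`, `create_train_env`, and `ActorCritic` come from the example's original project:

import torch.multiprocessing as mp

if __name__ == '__main__':
    _, num_states, num_actions = create_train_env(args.world, args.stage, args.action_type)
    global_model = ActorCritic(num_states, num_actions)
    global_model.share_memory()  # expose parameters to the child process
    tester = mp.Process(target=local_test, args=(0, args, global_model))
    tester.start()
    tester.join()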
Example #6
def test(env):
    action = env.action_space.sample()
    obs, r, done, info = env.step(action)
    env.render()
    print('action:', action)
    print('reward:', r)
    print('done:', done)
    print('info:', info)
    print('nb_actions', env.action_space.n)
Example #7
def rollout(sentence_generator, vae, sentences, inst_to_one_hot, dict_goals,
            valid_goals, env, policy, env_params, inits, goals, self_eval,
            true_eval, biased_init=False, animated=False):

    expressions = get_list_of_expressions()

    scores = []
    np.random.shuffle(expressions)
    for expression in expressions:
        print('\nAttempting expression: ', expression)
        # NOTE: `i` is not defined in this snippet; it comes from the example's original module.
        observation = env.unwrapped.reset_goal(np.array(goals[i]), biased_init=biased_init)
        config_initial = observation['achieved_goal'].copy()
        trial_counter = 0
        success = False
        while trial_counter < 5:
            trial_counter += 1
            goals_str = sample_vae_logic(vae, inst_to_one_hot, observation['achieved_goal'], expression, valid_goals)
            if len(goals_str) > 0:
                goal = dict_goals[np.random.choice(list(goals_str))]
                # goal = dict_goals[np.random.choice(list(goals_str))]
                env.unwrapped.target_goal = goal.copy()
                observation = env.unwrapped._get_obs()
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']

                # start to collect samples
                for t in range(env_params['max_timesteps']):
                    # run policy
                    no_noise = self_eval or true_eval
                    action = policy.act(obs.copy(), ag.copy(), g.copy(), no_noise)
                    # feed the actions into the environment
                    if animated:
                        env.render()
                    observation_new, _, _, info = env.step(action)
                    obs = observation_new['observation']
                    ag = observation_new['achieved_goal']
                config_final = ag.copy()
                true_sentences = sentence_generator(config_initial, config_final)

                if check_sentence(true_sentences, expression):
                    scores.append(trial_counter)
                    success = True
                    print('Success!')
                    break
                else:
                    print('\tFailed. Trying again.')

        if not success:
            scores.append(0)
            print('\tFailed 5 times, Moving On.')


    return scores.copy()
Example #8
def test_env(model, vis=False):
    state = env.reset()
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis: env.render()
        total_reward += reward
    return total_reward, env.get_score()
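`model` and `env` are globals here: the function expects `model(state)` to return an action distribution plus a value estimate, and `env` to expose `get_score()`. A minimal sketch of such a model for a continuous action space (class name, layer sizes, and the state-independent std are illustrative assumptions):

import torch
import torch.nn as nn
from torch.distributions import Normal

class ActorCriticSketch(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden=64):
        super().__init__()
        self.actor = nn.Sequential(nn.Linear(num_inputs, hidden), nn.Tanh(),
                                   nn.Linear(hidden, num_actions))
        self.critic = nn.Sequential(nn.Linear(num_inputs, hidden), nn.Tanh(),
                                    nn.Linear(hidden, 1))
        self.log_std = nn.Parameter(torch.zeros(num_actions))  # state-independent log std

    def forward(self, state):
        mu = self.actor(state)
        dist = Normal(mu, self.log_std.exp().expand_as(mu))  # action distribution
        value = self.critic(state)                           # state-value estimate
        return dist, value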
Example #9
def main():
    env_name = "dobro-CartPole-v0"
    env = gym.make(env_name)

    time_horizon = 20
    agent_args = {
        'discount_factor': 0.99,
        'time_horizon': time_horizon,
        'time_step': 0.02,
    }
    agent = Agent(env, agent_args)

    max_steps = 1000
    max_ep_len = min(500, env.spec.max_episode_steps)
    episodes = int(max_steps / max_ep_len)
    epochs = int(1e5)

    for epoch in range(epochs):
        ep_step = 0

        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            step = 0

            while True:
                step += 1
                ep_step += 1

                action = agent.get_action(state)
                next_state, reward, done, info = env.step(action)
                env.render()
                #time.sleep(0.01)

                state = next_state
                score += reward

                if done or step >= max_ep_len:
                    break

            print(score)
Example #10
def eval(env, agent, times=1000, render=False):
    if False:
        write_explore(agent, 'explore_old.file')

    highest_score = 0
    total_scores = 0
    size = env.get_size()
    scores = []
    max_tiles = []

    for i in range(times):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())

        while True:
            action = agent.choose_action(obs)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if render:
                print(f'action is: {action} {obs} {obs_}')
                env.render()
            if obs_ == obs:
                #  env.render()
                agent.learn(obs, action, reward, obs_)
            obs = obs_
            if done:
                break

        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()

    if times > 0:
        plot_score(scores, max_tiles)
        print(f'eval avg_score: {total_scores / times} highest_score: {highest_score}')

    if False:
        write_explore(agent, 'explore_new.file')
Example #11
def evaluate(time, env, agent, render=False):
    eval_reward = []
    for i in range(time):
        obs = env.reset()
        episode_reward = 0
        step = 0
        while True:
            step += 1
            action = agent.predict(obs)  # select the greedy action
            action = np.clip(action, -1, 1)
            obs, reward, isOver, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if isOver or step >= 200:
                break
        eval_reward.append(episode_reward)
    mean_reward = np.mean(eval_reward)
    print("evaluating on {} episodes with mean reward {}.".format(time, mean_reward))
    logging.warning("evaluating on {} episodes with mean reward {}.".format(time, mean_reward))
    return mean_reward
Example #12
def main():
    env = stage_1()
    state_size = 5
    num_actions = 9

    solver = DeepQSolver(env, state_size, num_actions, 2000, 100)

    epsilon = 0.5
    # solver.model.load()
    for i in range(500):
        res = train(solver, epsilon, replay=False)
        print("Episode :{:4d} Reward: {:6d}".format(i, res), end='\r')
        # render(env, None)
        if ((i + 1) % 100 == 0):
            print()
            solver.model.save()

        epsilon = max(epsilon * 0.99, 0.05)

    # test(solver, 0.1)
    # animate_game(env)
    render(env, 'deepfourier.mp4')
Example #13
def eval(env, agent, times=1000, render=False):
    highest_score = 0
    scores = []
    max_tiles = []
    eps = 0.0

    random = False
    for i in range(times):
        obs = env.reset()
        while True:
            action, action_values = agent.choose_action(obs, eps, rand=random)
            obs_, reward, done, _ = env.step(action)
            if render:
                env.render()
            if str(obs_) == str(obs):
                random = True
                #env.render()
                #  print(f'action is: {action} {reward} {action_values} {obs} {obs_}')
                print(
                    f'action is: {action} {reward} {action_values} {obs} {obs_}'
                )
            else:
                random = False
            obs = obs_
            if done:
                break

        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()

    if times > 0:
        plot_score(scores, max_tiles)
        print(
            f'eval avg_score: {np.mean(scores)} highest_score: {highest_score}'
        )
Example #14
            # (snippet starts mid-method: tail of the iLQR forward pass, applying the
            #  feedforward gains k and feedback gains K, then rolling the dynamics forward)
            control = k_seq[t] + np.matmul(kk_seq[t], (x_seq_hat[t] - x_seq[t]))
            u_seq_hat[t] = np.clip(u_seq[t] + control, -self.umax, self.umax)
            x_seq_hat[t + 1] = self.f(x_seq_hat[t], u_seq_hat[t])
        return x_seq_hat, u_seq_hat

env = gym.make('CartPoleContinuous-v0').env
obs = env.reset()
ilqr = ILqr(lambda x, u: env._state_eq(x, u),  # x(i+1) = f(x(i), u)
            lambda x, u: 0.5 * np.sum(np.square(u)),  # l(x, u)
            lambda x: 0.5 * (np.square(1.0 - np.cos(x[2])) + np.square(x[1]) + np.square(x[3])),  # lf(x)
            env.max_force,
            env.observation_space.shape[0])
u_seq = [np.zeros(1) for _ in range(ilqr.pred_time)]
x_seq = [obs.copy()]
for t in range(ilqr.pred_time):
    x_seq.append(env._state_eq(x_seq[-1], u_seq[t]))

cnt = 0
while True:
    env.render(mode="rgb_array")
    #import pyglet
    #pyglet.image.get_buffer_manager().get_color_buffer().save('frame_%04d.png' % cnt)
    for _ in range(3):
        k_seq, kk_seq = ilqr.backward(x_seq, u_seq)
        x_seq, u_seq = ilqr.forward(x_seq, u_seq, k_seq, kk_seq)

    print(np.asarray(u_seq).T)  # asarray in case forward() returns a plain list
    obs, _, _, _ = env.step(u_seq[0])
    x_seq[0] = obs.copy()
    cnt += 1
Example #15
import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

ENV_NAME = 'timetable-case0001-v0001'

# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)

print('observation space:', env.observation_space)
print('action space:', env.action_space)
env.render()
action = env.action_space.sample()
print(action)
obs, r, done, info = env.step(action)
print('next observation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)
print('nb_actions', env.action_space.n)

env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
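The snippet stops just before the model is built. A minimal continuation in the usual keras-rl style (layer sizes, memory limit, and step counts below are illustrative, not taken from the original example):

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16, activation='relu'))
model.add(Dense(nb_actions, activation='linear'))

# Configure and train the DQN agent on the timetable environment.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)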
Example #16
def rollout(sentence_generator,
            vae,
            sentences,
            inst_to_one_hot,
            dict_goals,
            env,
            policy,
            env_params,
            inits,
            goals,
            self_eval,
            true_eval,
            biased_init=False,
            animated=False):

    score = []

    for sentence in sentences:
        sentence = sentence.lower()
        print('\nNew instruction: ', sentence)
        reached = False
        # NOTE: `i` is not defined in this snippet; it comes from the example's original module.
        observation = env.unwrapped.reset_goal(np.array(goals[i]),
                                               biased_init=biased_init)

        config_initial = observation['achieved_goal'].copy()
        if sentence.lower() in inst_to_one_hot.keys():
            counter = 0
            while counter < 5:
                goal = sample_vae(vae, inst_to_one_hot,
                                  observation['achieved_goal'],
                                  sentence).flatten()

                # goal = dict_goals[np.random.choice(list(goals_str))]
                env.unwrapped.target_goal = goal.copy()
                observation = env.unwrapped._get_obs()
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']

                # start to collect samples
                for t in range(env_params['max_timesteps']):
                    # run policy
                    no_noise = self_eval or true_eval
                    action = policy.act(obs.copy(), ag.copy(), g.copy(),
                                        no_noise)
                    # feed the actions into the environment
                    if animated:
                        env.render()
                    observation_new, _, _, info = env.step(action)
                    obs = observation_new['observation']
                    ag = observation_new['achieved_goal']
                counter += 1
                config_final = ag.copy()
                true_sentences = sentence_generator(config_initial,
                                                    config_final)
                if sentence in true_sentences:
                    score.append(counter)
                    reached = True
                    print('\tSuccess!')
                    break
                else:
                    print('\tFailed. Trying again.')

        else:
            print('Wrong sentence.')

        if not reached:
            score.append(0)
            print('\tFailed 5 times, Moving On.')

    return np.array(score)
Example #17
def rollout(sentence_generator,
            vae,
            sentences,
            inst_to_one_hot,
            dict_goals,
            env,
            policy,
            env_params,
            inits,
            goals,
            self_eval,
            true_eval,
            biased_init=False,
            animated=False):
    # NOTE: `i` is not defined in this snippet; it comes from the example's original module.
    observation = env.unwrapped.reset_goal(np.array(goals[i]),
                                           init=inits[i],
                                           biased_init=biased_init)

    counter = 0
    while counter < 50:
        sentence = np.random.choice(sentences).lower()
        reached = False
        # print(sentence)
        # env.render()
        if sentence.lower() in inst_to_one_hot.keys():
            trial_counter = 0

            config_initial = observation['achieved_goal'].copy()
            while trial_counter < 5:
                goal = sample_vae(vae, inst_to_one_hot,
                                  observation['achieved_goal'],
                                  sentence).flatten()

                # goal = dict_goals[np.random.choice(list(goals_str))]
                env.unwrapped.target_goal = goal.copy()
                observation = env.unwrapped._get_obs()
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']

                # start to collect samples
                for t in range(env_params['max_timesteps']):
                    # run policy
                    no_noise = self_eval or true_eval
                    action = policy.act(obs.copy(), ag.copy(), g.copy(),
                                        no_noise)
                    # feed the actions into the environment
                    if animated:
                        env.render()
                    observation_new, _, _, info = env.step(action)
                    obs = observation_new['observation']
                    ag = observation_new['achieved_goal']
                config_final = ag.copy()
                true_sentences = sentence_generator(config_initial,
                                                    config_final)
                if sentence in true_sentences:
                    reached = True
                    counter += 1
                    break
                else:
                    trial_counter += 1

            if not reached:
                break

        else:
            print('Wrong sentence.')
    print('Counter', counter)
    return counter
Example #18
def main():
    env_name = "dobro-CartPole-v0"
    env = gym.make(env_name)
    # dt, x_dim, u_dim, obj_func, obj_jacobian and eq_cons are module-level
    # definitions from the original example that are not shown in this snippet.
    x_list = []
    u_list = []
    steps = 500
    N = 10

    state = env.reset()
    sim_t = 0
    cnt = 0

    start_t = time.time()
    for i in range(steps):
        if sim_t >= cnt * dt:
            init_state = list(state)
            init_action_list = np.zeros((N + 1, u_dim))
            init_state_list = np.zeros((N + 1, x_dim))
            init_state_list[:x_dim] = init_state
            x_init = np.concatenate(
                [init_action_list.ravel(),
                 init_state_list.ravel()])
            lowers = np.array([-np.inf] * ((N + 1) * u_dim) + init_state +
                              [-np.inf] * (N * x_dim))
            uppers = np.array([np.inf] * ((N + 1) * u_dim) + init_state +
                              [np.inf] * (N * x_dim))
            bounds = Bounds(lowers, uppers)
            res = minimize(obj_func, x_init, method="SLSQP", jac=obj_jacobian, bounds=bounds, constraints=[eq_cons], \
                        options={'ftol':1e-5, 'disp':False, 'maxiter':20, 'eps':1e-10})
            cnt += 1

        weight = (sim_t - (cnt - 1) * dt) / dt
        action = np.array([res.x[0] * (1 - weight) + res.x[1] * weight])

        state, reward, done, info = env.step(action)
        env.render()

        x_list.append(state)
        u_list.append(action)
        sim_t += env.unwrapped.tau

    env.close()
    print("elapsed time : {:.3f}s, simulation time : {:.3f}".format(
        time.time() - start_t, sim_t))

    x_list = np.array(x_list)
    u_list = np.array(u_list)

    fig_size = 6
    fig, ax_list = plt.subplots(nrows=2,
                                ncols=1,
                                figsize=(fig_size * 1.5, fig_size * 1.5))
    ax_list[0].plot(x_list[:, 0], label="pos")
    ax_list[0].plot(x_list[:, 1], label="pos_dot")
    ax_list[0].plot(x_list[:, 2], label="theta")
    ax_list[0].plot(x_list[:, 3], label="thtta_dot")
    ax_list[0].grid()
    ax_list[0].legend()
    ax_list[0].set_title('x : state')

    ax_list[1].plot(u_list[:, 0])
    ax_list[1].grid()
    ax_list[1].set_title('u : input')

    fig.tight_layout()
    plt.savefig('result.png')
    plt.show()
Example #19
def main():
    env_name = "dobro-CartPole-v0"
    env = gym.make(env_name)

    x_dim = env.observation_space.shape[0]
    u_dim = env.action_space.shape[0]
    time_horizon = 200

    #####################################
    ##### set A, B, R, Q, Qf matrix #####
    m1 = env.unwrapped.masscart
    m2 = env.unwrapped.masspole
    L = env.unwrapped.length
    g = env.unwrapped.gravity
    dt = env.unwrapped.tau
    temp_A_mat = np.eye(x_dim)
    temp_A_mat[0, 1] = dt
    temp_A_mat[2, 3] = dt
    temp_B_mat = np.array([[0.5 * dt**2, 0.0], [dt, 0.0], [0.0, 0.5 * dt**2],
                           [0.0, dt]])
    A_mat = np.array(
        [[0, 0, -(m2 / (m1 + m2)) * (g / (4.0 / 3.0 - m2 / (m1 + m2))), 0],
         [0, 0, g / (L * (4.0 / 3.0 - m2 / (m1 + m2))), 0]])
    B_mat = np.array([[(1.0 / (m1 + m2)) * (1 + 3.0 * m2 / (4.0 * m1 + m2))],
                      [-3.0 / (L * (4.0 * m1 + m2))]])
    A_mat = temp_A_mat + np.matmul(temp_B_mat, A_mat)
    B_mat = np.matmul(temp_B_mat, B_mat)

    R_mat = np.eye(u_dim) * 0.01
    Q_mat = np.eye(x_dim) * 1.0
    Qf_mat = np.eye(x_dim) * 100.0
    #####################################

    #declare LQR solver
    agent = Agent(x_dim, u_dim, time_horizon, A_mat, B_mat, R_mat, Q_mat,
                  Qf_mat)
    x_list = []
    u_list = []

    state = env.reset()
    action, P_mat_list = agent.get_action(state)

    for i in range(time_horizon):
        action = -np.matmul(P_mat_list[i], state).ravel()
        state, reward, done, info = env.step(action)
        env.render()
        time.sleep(dt)

        x_list.append(state)
        u_list.append(action)

    env.close()
    x_list = np.array(x_list)
    u_list = np.array(u_list)

    fig_size = 6
    fig, ax_list = plt.subplots(nrows=2,
                                ncols=1,
                                figsize=(fig_size * 1.5, fig_size * 1.5))
    ax_list[0].plot(x_list[:, 0], label="pos")
    ax_list[0].plot(x_list[:, 1], label="pos_dot")
    ax_list[0].plot(x_list[:, 2], label="theta")
    ax_list[0].plot(x_list[:, 3], label="thtta_dot")
    ax_list[0].grid()
    ax_list[0].legend()
    ax_list[0].set_title('x : state')

    ax_list[1].plot(u_list[:, 0])
    ax_list[1].grid()
    ax_list[1].set_title('u : input')

    fig.tight_layout()
    plt.savefig('result.png')
    plt.show()
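The LQR `Agent` itself is not shown; its gain sequence is presumably obtained from the standard finite-horizon discrete-time Riccati recursion. A self-contained sketch (function and variable names are illustrative):

import numpy as np

def finite_horizon_lqr(A, B, Q, R, Qf, T):
    """Backward Riccati recursion; returns gains K_0..K_{T-1} for u_t = -K_t x_t."""
    P = Qf
    gains = []
    for _ in range(T):
        S = R + B.T @ P @ B                      # input-weighting term
        K = np.linalg.solve(S, B.T @ P @ A)      # K_t = S^{-1} B^T P_{t+1} A
        P = Q + A.T @ P @ A - A.T @ P @ B @ K    # Riccati update for P_t
        gains.append(K)
    return gains[::-1]  # reverse so gains[t] is the gain for time step t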
Example #20
    # (snippet starts mid-function; `model`, `env`, and `num_timesteps` come from
    #  the example's original module, which is not shown)
    # print('Training Reward:{}'.format(reward))

    # visualize(model.rewards, 'SARSA-lambda', 'try.png')
    # plt.show()

    # """
    # Test model
    # """
    # for i in range(num_test_episodes):
    #     model.reset_state()
    # reward = model.test(num_timesteps, render =  False)
    #     print('test episode: {}/{} reward: {}'.format(i+1, num_test_episodes, reward), end = '\r')
    #     if ((i+1)%int(num_test_episodes/10)==0):
    #         print()

    # print('Training Reward:{}'.format(reward))
    # print('[', end = '')
    # rwd = model.test(num_timesteps, render =  False)
    # print(']', end = '')

    # """
    # Save model for later use
    # """
    # model.save('weight4.npy')
    model.test(num_timesteps, render=False)
    render(env, 'stage1.mp4')
    # animate_game(env)

    # plotV(model, 'w3.npy')