Example #1
class Environment(threading.Thread):
    def __init__(self,
                 brain,
                 environment,
                 eps_start=0,
                 eps_end=0,
                 eps_steps=0,
                 render=False):
        threading.Thread.__init__(self)
        self.env = gym.make(environment)
        self.stop_signal = False
        self.render = render
        self.agent = Agent(brain, eps_start, eps_end, eps_steps)

    def runGame(self):
        R = 0
        s = utils.process(self.env.reset(), self.env.spec.id)

        n_a = 0
        old_a = None

        while True:
            time.sleep(THREAD_DELAY)

            if self.render: self.env.render()

            if n_a > MAX_REPEAT_ACTION:
                a = self.agent.act(s, old_a)
            else:
                a = self.agent.act(s)

            if a == old_a:
                n_a += 1
            else:
                n_a = 0

            old_a = a

            s_, r, done, info = self.env.step(a)
            s_ = utils.process(s_, self.env.spec.id)
            R += r
            self.agent.train(s, a, r, s_, done, R)

            s = s_

            if done or self.stop_signal:
                break
        print("Score:", R)

    def run(self):
        while not self.stop_signal:
            self.runGame()

    def stop(self):
        self.stop_signal = True
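
The worker above expects an Agent whose act() takes an optional second argument (the action to steer away from once it has repeated more than MAX_REPEAT_ACTION times) and whose exploration is annealed between eps_start and eps_end over eps_steps. Below is a minimal sketch of that interface, assuming an epsilon-greedy policy with a linear schedule and a hypothetical brain.predict(s) / brain.n_actions; the real Agent may sample from its policy instead.

import random
import numpy as np

class SketchAgent:
    """Illustrative only: the act()/epsilon interface the worker above relies on."""

    def __init__(self, brain, eps_start, eps_end, eps_steps):
        self.brain = brain
        self.eps_start, self.eps_end, self.eps_steps = eps_start, eps_end, eps_steps
        self.steps = 0

    def epsilon(self):
        # Linear annealing from eps_start down to eps_end over eps_steps calls.
        if self.steps >= self.eps_steps:
            return self.eps_end
        return self.eps_start + self.steps * (self.eps_end - self.eps_start) / self.eps_steps

    def act(self, s, avoid_action=None):
        self.steps += 1
        n_actions = self.brain.n_actions                # assumed attribute, not shown in the source
        if random.random() < self.epsilon():
            a = random.randrange(n_actions)
        else:
            a = int(np.argmax(self.brain.predict(s)))   # assumed: per-action scores from the shared brain
        if avoid_action is not None and a == avoid_action:
            a = random.randrange(n_actions)             # re-draw to break a long run of repeated actions
        return a
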
Example #2
def initial_log(agent: Agent,
                env: ContinuousSimulation,
                writer: tf.summary.SummaryWriter = None,
                **kwargs) -> None:
    writer = optional_writer(writer)

    state = env.reset()
    context = env.unwrapped.state()

    agent.act(state, context, network='q', log_graph=True)
    agent.act(state, context, network='target', log_graph=True)
Example #3
    def masterprocess(self):
        env, state_size, action_size = self.env, self.state_size, self.action_size
        agent = Agent(state_size,
                      action_size,
                      number_of_agents=self.num_agents,
                      is_master=True,
                      args=self.args,
                      device="cpu")

        scores_deque = deque(maxlen=100)
        scores = []
        tqdm_bar = trange(1, self.n_trajectories, desc="Trajectories")
        episode_bar = tqdm(total=self.max_t)
        train_mode = True
        for i in tqdm_bar:

            state = env.reset(
                train_mode=train_mode)[self.brain_name].vector_observations
            score = 0
            for t in range(self.max_t):
                action, prob, q_value = agent.act(state[0])
                action2, prob2, q_value2 = agent.act(state[1])
                env_info = env.step([
                    action.detach().cpu().data.numpy(),
                    action2.detach().cpu().data.numpy()
                ])[self.brain_name]
                next_state, reward, done = env_info.vector_observations, env_info.rewards, env_info.local_done
                agent.step(action, reward, prob, done, q_value)
                state = next_state
                score += np.mean(reward)
                episode_bar.set_description(
                    "Time Step T: {}, Score: {:.2f}".format(t, score))
                episode_bar.update()
                # if done:
                #     break

            episode_bar.reset()
            tqdm_bar.set_description("Episode: {}, Score: {:.2f}".format(
                i, score))
            scores_deque.append(score)
            scores.append(score)
            # train_mode = score < 10.0
            if i % 100 == 0:
                torch.save(agent.TwoHeadModel.state_dict(), 'checkpoint.pth')

                if np.mean(scores_deque) > self.termination_threshold:
                    print(
                        '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                        .format(i - 100, np.mean(scores_deque)))
                    break

        self.scores = scores
        env.close()
Example #4
    def test_agent_act(self):
        """Test how an agent can act"""

        agent = Agent(3, 5, 1)
        states = np.array([[1.0, 2.0, 3.0], [0.3, 2.0, 1.0]])

        actions1 = agent.act(states, False)
        self.assertEqual((2, 5), actions1.shape)

        actions2 = agent.act(states, False)
        self.assertTrue(np.allclose(actions2, actions1))

        actions3 = agent.act(states, True)
        self.assertFalse(np.allclose(actions2, actions3))
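
The test above pins down an act(states, add_noise) contract: with the flag off the output is deterministic and shaped (batch, action_size); with it on, noise makes the output differ. A minimal sketch of that behaviour, with a random linear map and Gaussian noise standing in for whatever actor network and noise process the real Agent uses:

import numpy as np

class SketchNoisyAgent:
    """Illustrative only: reproduces the act(states, add_noise) contract tested above."""

    def __init__(self, state_size, action_size, seed):
        rng = np.random.default_rng(seed)
        self.W = rng.standard_normal((state_size, action_size))  # stand-in for an actor network
        self.noise_rng = np.random.default_rng(seed + 1)

    def act(self, states, add_noise):
        actions = np.tanh(states @ self.W)                        # deterministic policy output
        if add_noise:
            actions = actions + 0.1 * self.noise_rng.standard_normal(actions.shape)
        return actions

sketch = SketchNoisyAgent(3, 5, 1)
states = np.array([[1.0, 2.0, 3.0], [0.3, 2.0, 1.0]])
assert sketch.act(states, False).shape == (2, 5)
assert np.allclose(sketch.act(states, False), sketch.act(states, False))      # repeatable without noise
assert not np.allclose(sketch.act(states, False), sketch.act(states, True))   # noise changes the output
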
Example #5
def main(train, action_bias=0):
    environment = Environment(tickers,
                              initial_deposit=100000,
                              from_date=datetime(2004, 1, 1),
                              to_date=datetime(2010, 1, 1),
                              min_days_to_hold=min_days_to_hold,
                              max_days_to_hold=max_days_to_hold)
    agent = Agent(environment.state_size(),
                  environment.action_size(),
                  epochs=epochs,
                  gamma=0.2,
                  replay_buffer=64,
                  memory_queue_length=32)

    if train:
        for i in range(epochs):
            state = environment.reset()
            done = False

            while not done:
                action = agent.act(state)
                next_state, reward, done = environment.step(action)
                agent.remember(state, action, reward, next_state, done)
                state = next_state
            agent.decrease_epsilon()
            LOGGER.info('Balance for current game: %d', environment.deposit)

        pprint(environment.actions)
        agent.save(environment.main_ticker + '.h5')
    else:
        agent.load(environment.main_ticker + '.h5')

    # Test on!
    test_environment = Environment(tickers,
                                   initial_deposit=100000,
                                   from_date=datetime(2010, 1, 1),
                                   to_date=datetime(2013, 1, 1),
                                   min_days_to_hold=min_days_to_hold,
                                   max_days_to_hold=max_days_to_hold,
                                   scaler=environment.scaler)

    state = test_environment.reset()
    done = False

    while not done:
        action = agent.act(state, False, action_bias)
        next_state, _, done = test_environment.step(action)
        state = next_state
    print_results_on_test_environment(test_environment)
    export_to_file(test_environment.actions)
Example #6
class CartPole:
    def __init__(self):
        self.replay_batch_size = 500
        self.training_episodes = 500
        self.show_episodes = 3
        self.env = gym.make("CartPole-v1")
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.agent = Agent(self.state_size, self.action_size)

    def train(self):
        for episode in range(self.training_episodes):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])

            done = False
            score = 0
            while not done:
                action = self.agent.act(state)
                next_state, reward, done, info = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                self.agent.remember(state, action, reward, next_state, done)
                state = next_state
                score += 1

            print("Episode #{} Score: {}".format(episode, score))
            self.agent.replay(self.replay_batch_size)

    def show(self):
        self.agent = Agent(self.state_size, self.action_size)
        self.agent.load_model()

        self.env = gym.wrappers.Monitor(self.env, 'video')

        for index_episode in range(self.show_episodes):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])

            done = False
            score = 0
            while not done:
                self.env.render()
                time.sleep(0.01)
                action = self.agent.act(state)
                state, reward, done, info = self.env.step(action)
                state = np.reshape(state, [1, self.state_size])
                score += 1

            print("The score was: {}".format(score))
Example #7
def turn(agent: Agent,
         agent_policy_step: str,
         foe: Foe,
         rand="random") -> (str, str):
    # Not currently very extensible. Oh well.

    agent_action = agent.act(agent_policy_step)

    new_states = (agent_action.resolve_action(foe) if rand == "random" else
                  agent_action.action_expectation(foe))

    if agent_action.target_id == "self":
        agent.update_states(new_states["target"])
    else:
        foe.update_states(new_states["target"])

    foe_action = foe.act(rand)

    new_states = (foe_action.resolve_action(agent) if rand == "random" else
                  foe_action.action_expectation(agent))

    if foe_action.target_id == "self":
        foe.update_states(new_states["target"])
    else:
        agent.update_states(new_states["target"])

    foe.decrement_cooldowns()
    foe_reaction = foe.react()

    return (agent_action, foe_reaction)
Example #8
def fitnessFunction2(genotype):
    """Second version of the fitness function. This one optimizes for the greatest delay to action."""

    agent = Agent(genotype, Size, WeightRange, BiasRange, TimeConstMin,
                  TimeConstMax, InputWeightRange, Dt)
    # Changed to zeros for the sake of fitness eval in the event no action is taken
    first_actions = np.zeros((len(Stimuli), Trials))
    for s in range(len(Stimuli)):  # This loop runs each stimulus condition
        for t in range(Trials):  # This loop runs all trials of the task
            actions = np.empty(Duration)
            agent.sense(Stimuli[s])  # Initial stimulus
            for step in range(Duration):  # Runtime
                agent.think()
                actions[step] = agent.act()  # Record agent action
                # Under the current experimental design, no stimulus is available for the rest of the trial
                agent.sense(0)

            acted = np.where(actions == 1)
            # Index 0 because np.where returns a tuple of index arrays
            if len(acted[0]) > 0:
                first_actions[s, t] = acted[0][0]  # Record the step of the agent's first action
            else:
                first_actions[s, t] = 0  # Not acting at all is the worst fitness

    return np.average(first_actions) / Duration
Example #9
def run_simulation(title="", num_plants=20, episodes=DEFAULT_EPISODES, episode_length=DEFAULT_EPISODE_LENGTH):
    print("START SIMULATION")

    env = Environment(num_plants=num_plants)
    state_size = env.observation_space
    action_size = env.action_space
    agent = Agent(state_size, action_size)
    batch_size = 32

    for e in range(episodes):
        state = env.reset()
        state = numpy.reshape(state, [1, state_size])
        for time in range(episode_length):
            pour_amount = agent.act(state)
            next_state, reward, done = env.step(pour_amount)
            next_state = numpy.reshape(next_state, [1, state_size])
            agent.remember(state, pour_amount, reward, next_state, done)

            state = next_state
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)

        print("Episode {} of {} done...\n".format(e, episodes))

    generate_graphs(agent, env, title, num_plants, episodes, episode_length)

    print("END SIMULATION")
Example #10
File: main.py Project: trtd56/Atari
def main(env_name, monitor=True, load=False, seed=0, gpu=-1):

    env = gym.make(env_name)
    view_path = "./video/" + env_name
    model_path = "./model/" + env_name + "_"

    n_st = env.observation_space.shape[0]
    n_act = env.action_space.n

    agent = Agent(n_act, seed, gpu)
    if load:
        agent.load_model(model_path)

    if monitor:
        env.monitor.start(view_path, video_callable=None, force=True, seed=seed)
    for i_episode in xrange(10000):
        observation = env.reset()
        agent.reset_state(observation)
        ep_end = False
        q_list = []
        r_list = []
        while not ep_end:
            action = agent.act()
            observation, reward, ep_end, _ = env.step(action)
            agent.update_experience(observation, action, reward, ep_end)
            agent.train()
            q_list.append(agent.Q)
            r_list.append(reward)
            if ep_end:
                agent.save_model(model_path)
                break
        print('%i\t%i\t%f\t%i\t%f' % (i_episode, agent.step, agent.eps, sum(r_list), sum(q_list)/float(len(q_list))))
    if monitor:
        env.monitor.close()
Example #11
def run_agent(num_episodes=1):
    env = UnityEnvironment(file_name="env/Reacher20.app")

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]

    action_size = brain.vector_action_space_size
    state_size = env_info.vector_observations.shape[1]
    num_agents = len(env_info.agents)

    agent = Agent(state_size=state_size, action_size=action_size, random_seed=2)
    agent.actor_local.load_state_dict(torch.load("model/checkpoint_actor.pth", map_location='cpu'))
    agent.critic_local.load_state_dict(torch.load("model/checkpoint_critic.pth", map_location='cpu'))

    for i in range(num_episodes):
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            scores += env_info.rewards
            states = next_states
            if np.any(env_info.local_done):
                break
        print(f"{i + 1} episode, averaged score: {np.mean(scores)}")
Example #12
def ddpg():
    scores = []
    env = gym.make(ENV_NAME)
    agent = Agent(state_size=2, action_size=1)

    for i_episode in range(n_episodes):
        score = 0
        done = False
        state = env.reset()

        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            modified_reward = reward + \
                              POTENTIAL_FUNCTION_COEF * (GAMMA * abs(next_state[1]) - abs(state[1]))

            agent.step(state, action, modified_reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        scores.append(score)

        if i_episode % rate_of_print == 0:
            print("Episode: {}. Score: {}, Done: {}".format(i_episode / rate_of_print, score, done))

    return agent, scores
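
The modified_reward line is potential-based reward shaping with the potential taken to be the absolute velocity (state[1] of a MountainCar-style observation): POTENTIAL_FUNCTION_COEF * (GAMMA * |v'| - |v|) is added on top of the raw reward. A quick numeric check with illustrative constants (the real GAMMA and POTENTIAL_FUNCTION_COEF are defined elsewhere in that script, and the rewards here are arbitrary):

GAMMA, POTENTIAL_FUNCTION_COEF = 0.99, 10.0   # illustrative values only

def shaped_reward(reward, velocity, next_velocity):
    # F(s, s') = coef * (gamma * |v'| - |v|), added on top of the environment reward
    return reward + POTENTIAL_FUNCTION_COEF * (GAMMA * abs(next_velocity) - abs(velocity))

print(shaped_reward(0.0, 0.01, 0.03))   # gaining speed earns a shaping bonus:   ~ +0.197
print(shaped_reward(0.0, 0.03, 0.01))   # losing speed incurs a shaping penalty: ~ -0.201
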
Example #13
def main():
    window_size = 5
    episode_count = 10
    stock_name = "GSPC_10"
    batch_size = 3
    agent = Agent(window_size)
    market = Market(window_size=window_size, stock_name=stock_name)
    start_time = time.time()
    for e in range(episode_count + 1):
        print("Episode {0}/{1}".format(e, episode_count))
        agent.reset()
        state, price_data = market.reset()
        for t in range(market.last_index):
            action, bought_price = agent.act(state, price_data)
            next_state, next_price_data, reward, done = market.get_next_state_reward(
                action, bought_price)
            agent.memory.append([state, action, reward, next_state, done])
            if len(agent.memory) > batch_size:
                agent.experience_replay(batch_size)
            state = next_state
            price_data = next_price_data
            if done:
                print("----------------------")
                print("Total Profit: {0}".format(agent.get_total_profit()))
                print("----------------------")
        if e % 10 == 0:
            if not os.path.exists("models"):
                os.mkdir("models")
            agent.model.save("models/model_ep" + str(e))
        end_time = time.time()
        training_time = end_time - start_time
        print("Training time {0}".format(training_time))
Example #14
def main():
    stock_name = "GSPC_2011-03"
    model_name = "model_ep10"

    model = load_model("models/" + model_name)
    window_size = model.layers[0].input.shape.as_list()[1]

    agent = Agent(window_size, True, model_name)
    market = Market(window_size, stock_name)

    state, price_data = market.reset()

    for t in range(market.last_data_index):
        action, bought_price = agent.act(state, price_data)
        next_state, next_price_data, reward, done = market.get_next_state_reward(
            action, bought_price)

        state = next_state
        price_data = next_price_data

        if done:
            print("----------------------------")
            print("{0} Total profit: {1}".format(stock_name,
                                                 agent.get_total_profit))
            print("----------------------------")

    plot_action_profit(market.data, agent.action_history,
                       agent.get_total_profit())
Example #15
def run(episode=100000000, is_training=True):
    env = gym.make('FlappyBird-v0')
    agent = Agent(env)
    # agent.load_net('./tb/checkpoints/2190000')
    for e in range(episode):
        ob = env.reset()
        ob = agent.preproc(ob)
        done = False
        score = step = 0
        start_time = time.time()
        while not done:
            if is_training is False:
                env.render()
            ac = agent.act(ob, is_training)
            next_ob, rew, done, _ = env.step(ac)
            # if rew == 0:
            #     rew = 0.1
            if is_training:
                ob = agent.memory(ob, ac, next_ob, rew, done)
            else:
                ob = agent.preproc(next_ob)
            score += rew
            step += 1

        agent.get_score(score)
        print('episode: {} | score: {} | fps: {}'.format(e, score, step/(time.time() - start_time)))
Example #16
def run():
    student = Agent(17, 16, model_name=name)
    game_n = 0
    while True:
        games = 16
        for _ in range(games):
            board = TTT4()
            end = False
            game_n += 1
            turn = 0
            while not end:
                player_turn = int(board.player)
                current_board = board.board[:]
                state = get_state(current_board, player_turn)
                if turn == 0:
                    action = game_n % 16
                    ret = board.play(action)
                else:
                    action = student.act(np.array(state))
                    ret = board.play(action)
                events = get_events(state, current_board, player_turn, student.model)
                student.memory.append(events)
                reward = events[action][2]
                print(f'{game_n}: {action} {ret} reward: {reward}')
                board.print_board()
                if 'invalid' in ret:
                    break

                if 'win' in ret or 'draw' in ret:
                    end = True
                turn += 1
        student.exp_replay()
        if game_n % 160 == 0:
            student.model.save(f'keras_model/{name}_{str(int(game_n))}')
Example #17
def main():
    config = Config(
        n_episodes=10000,
        max_episode_length=200,
        n_actions=2,
        n_inp_dim=4,
        n_hidden_dim=64,
        batch_size=1,
        gamma=0.99,
    )

    env = gym.make('CartPole-v0').unwrapped
    memory = Memory(config)
    agent = Agent(config)

    for _ in range(config.n_episodes):
        episode: List[Step] = []
        s = env.reset()
        final_v = 0
        for _ in range(config.max_episode_length):
            a = agent.act(s)
            s2, r, t, _ = env.step(a)
            episode.append(Step(state=s, action=a, reward=r, terminal=t))
            s = s2
            if t:
                break
        else:
            # If no break
            final_v = agent.q(s).max()
        memory.store(episode, final_v)

        print(f"Reward: {sum(step.reward for step in episode)}")

        # Always train on last episode:
        agent.train(memory.episodes[-1:])
Example #18
def run(env_file, model_file, num_episodes=5):
    env = UnityEnvironment(file_name=env_file)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations[0]
    state_shape = state.shape
    agent = Agent(state_shape=state_shape, action_size=action_size, seed=0)

    agent.qnetwork_local.load_state_dict(torch.load(model_file))

    for i in range(num_episodes):
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        while True:
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            score += reward
            state = next_state
            if done:
                break

        print("Score: {}".format(score))
    env.close()
Example #19
class CartPole:
    def __init__(self):
        # Number of steps we select to learn from while replaying from memory
        self.sample_batch_size = 32
        self.episodes = 10000
        self.env = gym.make('CartPole-v1')
        # Configure model based on the environment
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        # Initialize the Agent
        self.agent = Agent(self.state_size, self.action_size)

    def run(self, render: bool = False):
        try:
            for index_episode in range(self.episodes):
                state = self.env.reset()
                state = np.reshape(state, [1, self.state_size])
                done = False
                index = 0
                while not done:
                    if render:
                        self.env.render()
                    action = self.agent.act(state)
                    next_state, reward, done, _ = self.env.step(action)
                    next_state = np.reshape(next_state, [1, self.state_size])
                    self.agent.remember(state, action, reward, next_state,
                                        done)
                    state = next_state
                    index += 1
                print("Episode {}# Score: {}".format(index_episode, index + 1))
                self.agent.replay(self.sample_batch_size)
        finally:
            self.agent.save_model()
Example #20
def main():
    now = time.localtime()
    dir_name = '{0:04d}-{1:02d}-{2:02d}_{3:02d}-{4:02d}'.format(now.tm_year, now.tm_mon, now.tm_mday, now.tm_hour, now.tm_min)
    summary = SummaryWriter(os.path.join(ROOT, 'logs/{}'.format(dir_name)))
    output_dir = os.path.join(ROOT, 'trained_models/{}'.format(dir_name))
    os.makedirs(output_dir, exist_ok=True)

    env = gym.make('Walker2DBulletEnv-v0')
    env.seed(0)
    env.render()
    agent = Agent()

    # seed = 0
    # repeat = 10000

    best_reward = 0.0

    seed_ = 0
    while True:
    # for seed_ in range(seed, seed + repeat):
        observation = env.reset()
        agent.ounoise.reset()
        done = False

        actor_loss = 0.0
        critic_loss = 0.0
        reward_sum = 0.0

        agent.decay_epsilon()

        step = 0
        while not done:
            action = agent.act(observation, is_training=True)
            next_observation, reward, done, _ = env.step(action)

            agent.push_memory(observation, action, reward, next_observation, done)
            loss_a, loss_c = agent.train()
            actor_loss += loss_a
            critic_loss += loss_c
            reward_sum += reward

            observation = next_observation
            step += 1

        summary.add_scalar('actor/model_loss', actor_loss/step, seed_)
        summary.add_scalar('critic/model_loss', critic_loss/step, seed_)
        summary.add_scalar('reward', reward_sum, seed_)

        if reward_sum >= best_reward:
            torch.save(agent.actor.model.state_dict(), '{}/actor.pkl'.format(output_dir, seed_))
            torch.save(agent.critic.model.state_dict(), '{}/critic.pkl'.format(output_dir, seed_))
            torch.save(agent.actor.target_model.state_dict(), '{}/actor_t.pkl'.format(output_dir, seed_))
            torch.save(agent.critic.target_model.state_dict(), '{}/critic_t.pkl'.format(output_dir, seed_))

            with open(os.path.join(ROOT, 'logs/{}.txt'.format(dir_name)), 'a') as f:
                f.write("(Episode {}: Reward {}) The best model parameters were saved.\n".format(seed_, reward_sum))

            best_reward = reward_sum

        seed_ += 1
Example #21
def main():

    stock_name = "GSPC_2011-03"
    model_name = "model_ep10"

    model = load_model("models/" + model_name)
    window_size = model.layers[0].input.shape.as_list()[1]

    agent = Agent(window_size, True, model_name)
    market = Market(window_size, stock_name)

    state, price_data = market.reset() #ToDo: Start from an initial state

    for t in range(market.last_data_index):
        action, bought_price = agent.act(state, price_data) # ToDo: Get action for the current state

        # Check the action to get reward and observe next state
        next_state, next_price_data, reward, done = market.get_next_state_reward(action, bought_price) #ToDo: get next state

        state = next_state
        price_data = next_price_data

        if done:
            print("--------------------------------")
            print("{0} Total Profit: {1}".format(stock_name, agent.get_total_profit()))
            print("--------------------------------")

    plot_action_profit(market.data, agent.action_history, agent.get_total_profit())
Example #22
def main_eval():
    stock_name = "BABA"
    model_name = "model_ep0"

    model = load_model("models/" + model_name)
    window_size = model.layers[0].input.shape.as_list()[1]

    agent = Agent(window_size, True, model_name)
    market = Market(window_size, stock_name)

    state, price_data, date_data = market.reset()
    date = []

    for t in range(market.last_data_index):

        action, bought_price = agent.act(state, price_data, date_data)

        next_state, next_price_data, next_date_data, reward, done = market.get_next_state_reward(
            action, bought_price)

        state = next_state
        price_data = next_price_data
        date_data = next_date_data

        if done:
            print("--------------------")
            print("{0} Total profit: {1}".format(stock_name,
                                                 agent.get_total_profit()))

            print("--------------------")
    plot_action_profit(market.data, agent.action_history,
                       agent.get_total_profit())
    return agent.book, agent.initial_investment, agent.dates
Example #23
def main():

    stock_name = "GSPC_2011-03"
    model_name = "model_ep30"

    window_size = 5

    agent = Agent(window_size, True, model_name)
    market = Market(window_size, stock_name)

    state, price_data = market.reset()  # Start from an initial state

    for t in range(market.last_data_index):
        action, bought_price = agent.act(
            state, price_data)  # Get action for the current state

        # Check the action to get reward and observe next state
        next_state, next_price_data, reward, done = market.get_next_state_reward(
            action, bought_price)

        state = next_state
        price_data = next_price_data

        if done:
            print("--------------------------------")
            print("{0} Total Profit: {1}".format(stock_name,
                                                 agent.get_total_profit()))
            print("--------------------------------")

    #toDo: change data
    plot_action_profit(market.data["Close"].values, agent.action_history,
                       agent.get_total_profit())
Example #24
def main():


    logger = Logger()
    #------------------------------------ENVIRONMENT---------------------------------------------
    a = Workspace(conversion_a)
    b = Workspace(conversion_b)

    workspaces = []
    workspaces.append(a)
    workspaces.append(b)

    env = Environment(workspaces)
    
    #-------------------------------------------------------------------------------------------
    agent = Agent().build_agent(len(workspaces))
    sess = agent.get_session()

    logger.create_dataholder("Target")
    logger.create_dataholder("Workspace_A")
    logger.create_dataholder("Workspace_B")

  
    #sess = tf_debug.LocalCLIDebugWrapperSession(sess)
    for i in range(config.nb_timesteps):
        Logger.write("INFO", "TIMESTEP " + str(i))
        logger.add_datapoint("Workspace_A", i, distribution_a(i))
        logger.add_datapoint("Workspace_B", i, distribution_b(i))

        actions_tensor = np.zeros((config.training_size, 1))
        rewards_tensor = np.zeros((config.training_size,1))
        
        for j in range(config.training_size):
            action_elem = np.zeros(1)
            reward_elem = np.zeros(1)
            action_elem = agent.act()
            reward_elem = env.act(action_elem, i)
            actions_tensor[j][0] = action_elem
            rewards_tensor[j][0] = reward_elem
        
        for j in range(config.nb_batches):
            action_batch, reward_batch = utils.shuffle_batch(actions_tensor, rewards_tensor)
            loss_value,upd,resp,ww = agent.train(action_batch, reward_batch)
        
        Logger.write("INFO", str(loss_value))
        Logger.write("INFO", str(ww))

        total_reward = np.sum(rewards_tensor)
        reward_mean = float(total_reward)/float(config.training_size)
        
        Logger.write("INFO", "Total Reward of timestep " + str(i) + ': ' + str(reward_mean))
        
        logger.add_datapoint("Target", i, 100.0*reward_mean)             
    
    logger.init_plot()
    logger.plot("Target", 'o')
    logger.plot("Workspace_A", linestyle = None)
    logger.plot("Workspace_B", linestyle = None)
    logger.show()
Example #25
def dqn(n_episodes=2000,
        max_t=1000,
        eps_start=1.0,
        eps_end=0.01,
        eps_decay=0.995):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []  # list containing scores from each episode
    agent = Agent(state_size=37, action_size=4, seed=0)
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]  # get the current state
        # print(state.shape)
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[
                brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 15.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './ckpt/checkpoint.pth')
            print('model saved')
            break
    torch.save(agent.qnetwork_local.state_dict(), './checkpoint.pth')

    env.close()

    return scores
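
A quick sanity check on the schedule above: starting from eps_start=1.0 and multiplying by eps_decay=0.995 once per episode, epsilon only reaches the eps_end=0.01 floor after roughly 900 episodes, so a large share of the default 2000-episode budget runs with epsilon at its floor:

import math

eps_start, eps_end, eps_decay = 1.0, 0.01, 0.995
episodes_to_floor = math.ceil(math.log(eps_end / eps_start) / math.log(eps_decay))
print(episodes_to_floor)   # 919
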
Example #26
class PB18151853(RL_alg):
    def __init__(self, ob_space, ac_space):
        super().__init__()
        assert isinstance(ac_space, Discrete)

        self.team = ['PB17121707', 'PB17121732', 'PB18151853']  # record team members' student IDs
        # self.config = get_params_from_file('src.alg.PB00000000.rl_configs',params_name='params')  # pass in parameters

        self.ac_space = ac_space
        self.state_dim = ob_space.shape
        print(self.state_dim)
        self.action_dim = ac_space.n
        # ----------------------------------------------------------
        # initialize implemented DQN models and weight
        self.agent = Agent()
        # details about the api of Model in pytrace.nn.Model
        self.agent.qnetwork_local.load_seq_list(
            pytrace.load(join(root_path, './riverraid/best_list.pth')))
        pytrace.prYellow(
            f"load weights from: {join(root_path, './riverraid/best_list.pth')}"
        )
        self.state = np.zeros([4, 84, 84])
        #self.state = self.WarpFrame(self.state)
        #self.state = np.stack([self.state] * 4, axis=0)
        # self.state = deque([np.zeros([84, 84, 4])], maxlen=4)

    def step(self, state):
        self.state = self.FrameStack(state, self.state)
        action = self.agent.act(self.state)
        return action

    def explore(self, obs):
        raise NotImplementedError

    def test(self):
        print('??')

    def WarpFrame(self, obs):
        """
        :param obs: The raw observation returned by env, it should be a (210 * 160 * 3) RGB frame
        :return: ans: A (84 * 84) compressed gray style frame normalized in [0, 1]
        """
        frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)
        #return frame[:, :, None]
        return frame / 255.0

    def FrameStack(self, new_obs, obs):
        """
        :param new_obs: A raw observation returned by env, it should be a (210 * 160 * 3) RGB frame
        :param obs: The stack of past 4 (84 * 84) compressed gray style frames
        :return: A new stack of past 4 (84 * 84) compressed gray style frames
        """
        new_obs = self.WarpFrame(new_obs)
        obs[0:3, :, :] = obs[1:, :, :]
        obs[3, :, :] = new_obs
        return obs
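
FrameStack above keeps a sliding window of the four most recent 84x84 frames: the stack is shifted down one slot and the newest frame is written into the last slot. A standalone toy check of that update (constant-valued frames stand in for real preprocessed observations):

import numpy as np

def frame_stack(new_frame, stack):
    # Same sliding-window update as FrameStack above: drop the oldest frame, append the newest.
    stack = stack.copy()
    stack[0:3] = stack[1:]
    stack[3] = new_frame
    return stack

stack = np.zeros((4, 84, 84))
for i in range(5):
    stack = frame_stack(np.full((84, 84), float(i)), stack)
print(stack[:, 0, 0])   # [1. 2. 3. 4.] -> only the last four frames remain
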
Example #27
def train(par):
    """ There are other hyperparameters, but I'll just 
        look at these for now.
    """

    #Environment
    seed = 0
    env = gym.make('CartPole-v0')
    env.seed(seed)  # for comparison
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.n

    #Agent
    gamma, lr, tau = par
    agent = Agent(num_states, num_actions, lr, gamma, seed_num=seed)
    agent.memory_size = 10**4
    agent.batchsize = 32
    learning_start = 2000
    agent.tau = tau

    #Train
    EPISODES = 500
    scores = []
    t1 = time.time()
    for e in range(1, EPISODES + 1):
        state = env.reset()
        reward_sum = 0
        done = False
        steps = 0
        actions = []

        while not done:
            #env.render()
            state = np.reshape(state, [1, num_states])  #reshape for keras
            action_onehot = agent.act(state)
            action_scalar = np.dot(action_onehot, range(num_actions))
            actions.append(action_scalar)
            next_state, reward, done, _ = env.step(action_scalar)
            reward_sum += reward
            agent.remember(state[0], action_onehot, reward, next_state, done)
            state = next_state

            if len(agent.memory) > learning_start:
                agent.train_models()
                agent.actor.gumbel_temperature = max(
                    0.999 * agent.actor.gumbel_temperature, 0.1)
            steps += 1

        #Learn & print results
        scores.append(reward_sum)

    #agent.save_target_weights()
    plt.plot(scores)
    figname = 'gamma_' + str(gamma) + '_lr_' + str(lr) + '_tau_' + str(
        tau) + '.png'
    plt.title('gamma_' + str(gamma) + '_lr_' + str(lr) + '_tau_' + str(tau))
    plt.savefig('figs/' + figname)
Example #28
def create_dataset(size=5000):
    start_time = time.time()
    env = ExternalEnviroment()
    agent = Agent(n_irrelevant_actions=0)
    dataset = []
    samples_counter = 0
    max_actions = 50
    counter_per_class = np.zeros(len(AVAILABLE_ACTIONS[:-1]))
    max_per_class = int(size / (len(counter_per_class) + 1))
    counter_zero = 0
    max_zero = max_per_class
    zeros = np.zeros(140)
    g_i = 0
    while samples_counter < int(size):
        if g_i % 5 == 0:
            agent = Agent(n_irrelevant_actions=0)
            print g_i, samples_counter, counter_per_class, counter_zero
            if g_i % 25 == 0:
                env.reset()
            else:
                env.reset_random()
        n_tried_actions = 0
        while n_tried_actions < max_actions:
            executed = agent.act(env)
            n_tried_actions += 1
            if executed:
                inp = calc_mirror_system_input(agent.current_state,
                                               agent.next_state, agent.hunger)
                action_i = np.nonzero(agent.training_signal)
                if counter_per_class[action_i] < max_per_class and \
                   ((not np.all(inp[:-1] == 0)) or inp[-1] == 1):
                    dataset.append([inp, agent.training_signal])
                    samples_counter += 1
                    counter_per_class[action_i] += 1
            elif counter_zero < max_zero:
                dataset.append([
                    np.append(zeros, 0),
                    np.zeros(len(AVAILABLE_ACTIONS[:-1]))
                ])
                samples_counter += 1
                counter_zero += 1
            if agent.hunger == 0:
                agent.hunger = 1
                env.reset()
                break
        g_i += 1
    for i in range(max_zero * 9):
        dataset.append(
            [np.append(zeros, 0),
             np.zeros(len(AVAILABLE_ACTIONS[:-1]))])
    print "Dataset creation time: {} sec.".format(time.time() - start_time)
    dataset = np.asarray(dataset)
    inconsistent = check_dataset(dataset)
    print "Found %d inconsistent data" % len(inconsistent)
    for i in inconsistent:
        dataset = np.delete(dataset, i, axis=0)
    return np.asarray(dataset)
Example #29
def train(stock_name, window_size, episode_count): 
    agent = Agent(window_size)
    data = getStockDataVec(stock_name)
    l = len(data) - 1
    batch_size = 32
    punishment = -500

    for e in range(episode_count + 1):
        print("Episode " + str(e) + "/" + str(episode_count))
        state = getState(data, 0, window_size + 1)

        total_profit = 0
        agent.inventory = []
        history = []

        for t in range(l):
            action = agent.act(state)
            history.append(action)

            # sit
            next_state = getState(data, t + 1, window_size + 1)
            reward = 0

            if action == 0 and len(history) >= 50 and history[-50:] == [0] * 50:
                print("PUNISHED: 50 consecutive snoozes")
                reward = punishment

            elif action == 1:  # buy
                if len(history) >= 20 and history[-20:] == [1]*20:
                    reward = punishment
                    print("PUNISHED: 20 consecutive buys")
                    
                else:
                    agent.inventory.append(data[t])
                    print("Buy: " + formatPrice(data[t]))

            elif action == 2 and len(agent.inventory) > 0:  # sell
                bought_price = agent.inventory.pop(0)
                reward = (data[t] - bought_price) * 100
                total_profit += data[t] - bought_price
                print("Sell: " + formatPrice(data[t]) + " | Profit: " + formatPrice(data[t] - bought_price))

            done = True if t == l - 1 else False
            agent.memory.append((state, action, reward, next_state, done))
            state = next_state

            if done:
                print("--------------------------------")
                print("Total Profit: " + formatPrice(total_profit))
                print("--------------------------------")

            if len(agent.memory) > batch_size:
                agent.expReplay(batch_size)

        if e % 10 == 0:
            agent.model.save("../models/SR_models/model_ep" + str(e))
Example #30
def train(pars):
    """ There are other hyperparameters, but I'll just 
        look at these for now.
    """
    
    alpha,tau, batchsize = pars

    #Environment
    env = gym.make('CartPole-v0')
    env.seed(0)
    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n


    #Agent
    lr,gamma = 3*10**-4, 0.99
    clipnorm, verbose = False, False
    agent = Agent(input_dim, output_dim, lr, gamma, tau, alpha, clipnorm, verbose)
    agent.memory_size = batchsize
    agent.batchsize = batchsize

    #Train
    EPISODES = 10**4
    scores = []
    t1 = time.time()
    for e in range(1,EPISODES+1):
        state = env.reset()
        state = agent.make_tensor(state)
        reward_sum = 0
        done = False
        while not done:

            #Do main step
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward
            next_state = agent.make_tensor(next_state)
            agent.remember(state[0],action,reward,next_state[0],done) #want to remember state as a vec
            state = next_state
            if e >= 2:
                agent.learn()

        #Print results
        scores.append(reward_sum)
 
    plt.figure()
    string = 'alpha_'+str(alpha)+'_tau_'+str(tau)+'_batchsize_'+str(batchsize)
    plt.title(string)
    plt.plot(scores,alpha=0.5)
    plt.plot(agent.window_average(scores,100),'r-')
    plt.savefig('figs/' + string + '.png')
    t2 = time.time()
    print 'took ' + str( (t2-t1) / 60.0 / 60.0) + ' hours'
    return 
Example #31
def ddpg(n_episodes=250, max_t=1000, print_every=25):
    env = UnityEnvironment(file_name="env/Reacher20.app")

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]

    action_size = brain.vector_action_space_size
    state_size = env_info.vector_observations.shape[1]
    num_agents = len(env_info.agents)

    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=2)

    scores_deque = deque(maxlen=print_every)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = np.zeros(num_agents)
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            for i in range(num_agents):
                agent.step(states[i], actions[i], rewards[i], next_states[i],
                           dones[i], t)
            states = next_states
            score += rewards
            if np.any(dones):
                break

        scores_deque.append(score.mean())
        scores.append(score.mean())
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)),
              end="")
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque) > 30:
            print("Model trained successfully")
            torch.save(agent.actor_local.state_dict(),
                       "model/checkpoint_actor.pth")
            torch.save(agent.critic_local.state_dict(),
                       "model/checkpoint_critic.pth")
            break

    return scores