Example #1
    def __init__(self, width, height, rows, window, offx, offy, idx=""):
        self.SETTINGS = {}
        self.SETTINGS['w'] = width
        self.SETTINGS['h'] = height
        self.SETTINGS['r'] = rows
        self.SETTINGS['sB'] = width // rows
        self.SETTINGS['ox'] = offx * width
        self.SETTINGS['oy'] = offy * height
        self.idx = idx

        self.window = window

        self.snake = Snake((255, 0, 0),
                           (self.SETTINGS['r'] // 2, self.SETTINGS['r'] // 2),
                           self.SETTINGS)
        self.snack = Cube(self.randomSnack(), self.SETTINGS, color=(0, 255, 0))

        self.dist = self.get_snack_distance()

        self.walls = self.get_wall_pos()

        self.model = Model(len(self.get_observation()), 4)
        self.tgt = Model(len(self.get_observation()), 4)
        self.agent = DQNAgent(self.model, self.tgt)
        self.reward = 0.0
        self.setp_reward = 0.0
        self.rewards = []
        self.finished = False

        self.points = 0
        self.points_ls = []
Example #2
def main():
    epi_file = open('../files/episode.txt')
    episode = epi_file.readline()
    epi_file.close()
    episode = int(episode) - 1
    qagent = DQNAgent(episode - 1)
    qagent.load_memory_of_episode(episode)
    qys = []
    qds = []
    for k in range(50):
        for j in range(5):
            # for i in range(0,len(qagent.memory),qagent.batch_size):
            qy, qd = qagent.memory_replay()
        qagent.update_targer_model()
        qys.append(qy)
        qds.append(qd)
    qagent.save_model(episode)
    res = time.strftime('%Y/%m/%d-%H:%M:%S', time.localtime(
        time.time())) + " Average of episode: %d Q_y: %f Q_d: %f" % (
            episode, np.mean(qys), np.mean(qds))
    epi_file = open('../files/avg_Q.txt', 'a')
    epi_file.write(res + '\n')
    epi_file.close()

    if forward:
        epi_file = open('../files/episode.txt', 'w')
        epi_file.write(str(episode + 2))
        epi_file.close()
Example #3
def main():
    USE_CUDA = torch.cuda.is_available()

    env = gym.make('CartPole-v0')
    dqn = DQN(env.observation_space.shape[0], env.action_space.n)
    if USE_CUDA:
        dqn = dqn.cuda()
    optimizer = optim.RMSprop(dqn.parameters(),
                              lr=0.00025,
                              momentum=0.95,
                              alpha=0.95,
                              eps=0.01)
    epsilon_schedule = get_epsilon_schedule(start=1.0,
                                            end=0.01,
                                            endt=1000,
                                            learn_start=50)
    replay_buffer = ReplayBuffer(capacity=1000)
    agent = DQNAgent(env,
                     dqn,
                     optimizer,
                     epsilon_schedule,
                     replay_buffer,
                     discount_factor=0.99,
                     target_update_rate=10,
                     batch_size=32,
                     learn_start=50)

    agent.train(5000)
    total_reward = agent.play(render=True)
    agent.env.close()
    print('Total Reward: ', total_reward)
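
The get_epsilon_schedule helper used above is not shown in this snippet. A minimal sketch, assuming it produces an exploration rate that stays at start for learn_start steps and then decays linearly to end over endt steps (names and behavior are assumptions, not the repository's actual implementation):

def get_epsilon_schedule(start=1.0, end=0.01, endt=1000, learn_start=50):
    # hypothetical linear-decay schedule with a constant warm-up period
    def epsilon_at(step):
        if step < learn_start:
            return start
        fraction = min((step - learn_start) / float(endt), 1.0)
        return start + fraction * (end - start)
    return epsilon_at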
Example #4
    def __init__(self, kwargs):
        kwargs["env_cls"] = Atari
        env = kwargs["env_cls"](kwargs["env_id"])
        kwargs["state_shape"] = env.observation_space.shape
        kwargs["state_dtype"] = np.uint8
        kwargs["n_actions"] = env.action_space.n
        kwargs["device"] = torch.device(kwargs["device_id"])
        env.close()
        self.__dict__.update(kwargs)
        self.agent = DQNAgent(**kwargs)
        self.writer = SummaryWriter("./log/")
        self.cuda_eval = torch.cuda.Stream(self.device)

        mem_kwargs = dict(
            capacity=self.mem_capacity,
            history_len=self.history_len,
            state_shape=self.state_shape,
            state_dtype=self.state_dtype,
            batch_sz=self.batch_sz,
            alpha=self.mem_alpha,
            beta=LinearScheduler(self.mem_beta, 1., self.train_steps),
            priority_eps=self.mem_priority_eps,
            priority_upper=self.mem_priority_upper,
            prioritized_replay=self.prioritized_replay,
            device=self.device,
        )
        mem_cls = PrioritizedReplayMemory if self.prioritized_replay else UniformReplayMemory
        self.mem = mem_cls(**mem_kwargs)
        self.mem_lock = Lock()
        self.sync = Queue(maxsize=1)
        self.sync.put(None)
Example #5
def main(argv):
    # Pretrained network to use
    inputfile = None
    # Whether to train or to test
    train = False
    # Trained network
    outputfile = None

    try:
        opts, args = getopt.getopt(argv, "hrl:s:", ["loadckpt=", "saveckpt="])
    except getopt.GetoptError:
        print 'Incorrect usage. For more information: test.py -h'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'python test.py -r -l <ckptfile> -s <ckptfile>'
            print '-r for enabling training'
            print '-l for loading pre-existing model'
            print '-s for saving  model to file'
            sys.exit()
        elif opt == '-r':
            train = True
        elif opt in ("-l", "--loadckpt"):
            inputfile = arg
        elif opt in ("-s", "--saveckpt"):
            outputfile = arg

    with tf.Session() as sess:
        env = Environment()
        agent = DQNAgent(env, sess, inputfile)
        if train:
            agent.train(6000000, outputfile)
        else:
            agent.test(2000)
Example #6
def run(novis, env_dir, env_file, n_episodes, seed, prioritized, cpu):
    if novis:
        env_dir = "{}_NoVis".format(env_dir)

    env = UnityEnvironment(file_name="environments/{}/{}".format(env_dir, env_file))

    # get default brain
    brain_name = env.brain_names[0]
    brain      = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    # print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    # print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    # print('States look like:', state)
    state_size = len(state)
    # print('States have length:', state_size)

    agent = DQNAgent(state_size=state_size, action_size=action_size,
                     seed=seed, prioritized=prioritized, cpu=cpu)
    report = Report(agent).run(dqn, env=env, brain_name=brain_name,
                               n_episodes=n_episodes)
    print(report)
Example #7
def purge_round():
    candidate_leaders_map = {}  # {filename --> agent}

    # Load in all of the leaders
    for leader_checkpoint in os.listdir(LEADER_DIR):
        path = os.path.join(LEADER_DIR, leader_checkpoint)
        candidate_leader = try_gpu(
            DQNAgent(6,
                     LinearSchedule(0.05, 0.05, 1),
                     OBSERVATION_MODE,
                     lr=LR,
                     max_grad_norm=GRAD_CLIP_NORM,
                     name=leader_checkpoint))
        candidate_leader.load_state_dict(
            torch.load(path, map_location=lambda storage, loc: storage))
        candidate_leaders_map[leader_checkpoint] = candidate_leader

    candidate_scores = []  # list[(filename, score)]
    filenames, candidate_leaders = zip(*candidate_leaders_map.items())
    for i, (filename,
            candidate_leader) in enumerate(zip(filenames, candidate_leaders)):
        print "EVALUATING {}".format(candidate_leader.name)
        leaders = EnsembleDQNAgent(candidate_leaders[:i] +
                                   candidate_leaders[i + 1:])
        candidate_scores.append((filename,
                                 evaluate(candidate_leader, leaders,
                                          EPISODES_EVALUATE_PURGE)))
    sorted_scores = sorted(candidate_scores, key=lambda x: x[1], reverse=True)

    print "SCORES: {}".format(sorted_scores)
    for filename, score in sorted_scores[NUM_LEADERS:]:
        print "PURGING ({}, {})".format(filename, score)
        leader_path = os.path.join(LEADER_DIR, filename)
        graveyard_path = os.path.join(GRAVEYARD_DIR, filename)
        os.rename(leader_path, graveyard_path)
Example #8
def td_learning(args):
    agent = DQNAgent(args)
    replay_memory = PrioritizedReplayBuffer(1000000, args.alpha)
    #eval_game(agent, 500)
    outer = tqdm(range(args.total_steps), desc='Total steps', position=0)
    game = init_game()
    ave_score = 0
    count = 0
    for step in outer:
        board = copy.deepcopy(game.gameboard.board)
        if step < args.start_learn:
            avail_choices = game.gameboard.get_available_choices()
            index = np.random.randint(len(avail_choices))
            choice = avail_choices[index]
        else:
            choice = agent.greedy_policy(
                board, game.gameboard.get_available_choices())

        next_board, reward = game.input_pos(choice[0], choice[1])
        next_board = copy.deepcopy(next_board)
        #####

        replay_memory.add(board, choice, reward, next_board)
        #####
        if game.termination():
            ave_score += game.gameboard.score
            count += 1
            game = init_game()

        if step >= args.start_learn and step % args.train_freq == 0:
            if count > 0:
                message = "ave score of " + str(count) + " game: " + str(
                    ave_score / count)
                out_fd.write("{} {}\n".format(step, ave_score / count))
                outer.write(message)
                ave_score = 0
                count = 0
            if step == args.start_learn:
                experience = replay_memory.sample(args.start_learn,
                                                  beta=agent.beta)
            else:
                experience = replay_memory.sample(args.train_data_size,
                                                  beta=agent.beta)

            boards, choices, rewards, next_boards, weights, batch_idxes = experience

            td_errors = agent.train(
                (boards, choices, rewards, next_boards, weights))
            new_priorities = np.abs(td_errors) + prioritized_replay_eps
            replay_memory.update_priorities(batch_idxes, new_priorities)

            agent.update_target(args.soft_tau)
            agent.update_epsilon()
            agent.update_beta()

    eval_game(agent, 500)
    out_fd.close()
Example #9
def test_target_model():
    agent = DQNAgent()
    agent.load('models/model.h5')
    state = np.zeros([6, 7])
    state[5][3] = 1
    state = state.reshape(1, 6, 7, 1)
    p1 = agent.policy_model.predict(state)
    p2 = agent.target_model.predict(state)
    print(p1)
    print(p2)
    if not np.array_equal(p1, p2):
        print('FAIL')
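
This test passes only when the target model mirrors the policy model. The agent's sync method is not shown here; a minimal Keras-style sketch, under the assumption that both models expose get_weights/set_weights:

def update_target_model(agent):
    # hypothetical sync: copy the online (policy) weights into the frozen target model
    agent.target_model.set_weights(agent.policy_model.get_weights())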
Example #10
    def __init__(self,
                 env_creator,
                 device,
                 buffer_size,
                 save_dir,
                 timesteps_per_epoch=1,
                 batch_size=32,
                 total_steps=5 * 10 ** 5,
                 decay_rate=0.1,
                 init_epsilon=1,
                 final_epsilon=0.02,
                 loss_freq=50,
                 refresh_target_network_freq=500,
                 eval_freq=500,
                 max_grad_norm=50):

        self.env_creator = env_creator
        self.env = env_creator()
        n_actions = self.env.action_space.n
        state_shape = self.env.observation_space.shape

        self.save_dir = save_dir
        self.buffer_size = buffer_size
        self.timesteps_per_epoch = timesteps_per_epoch
        self.batch_size = batch_size
        self.total_steps = total_steps
        self.decay_steps = decay_rate * total_steps
        self.init_epsilon = init_epsilon
        self.final_epsilon = final_epsilon
        self.loss_freq = loss_freq
        self.refresh_target_network_freq = refresh_target_network_freq
        self.eval_freq = eval_freq
        self.max_grad_norm = max_grad_norm
        self.device = device

        self.writer = SummaryWriter('runs')

        self.agent = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)
        self.target_network = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)
        self.target_network.load_state_dict(self.agent.state_dict())
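
The constructor above hard-copies the online weights into the target network with load_state_dict. Other examples on this page call update_target(soft_tau) instead; a minimal PyTorch sketch of such a soft (Polyak-averaged) update, with illustrative names only:

import torch

def soft_update(target_network, agent, tau=0.01):
    # hypothetical soft update: blend each target parameter toward the online parameter
    with torch.no_grad():
        for target_param, param in zip(target_network.parameters(), agent.parameters()):
            target_param.mul_(1.0 - tau).add_(param, alpha=tau)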
Example #11
def dqn_run(episodes=2500,
            eps_start=1.0,
            eps_end=0.01,
            eps_decay=0.995,
            double_dqn=False,
            dueling_dqn=False,
            seed=42):
    env = start_env()
    env_info = reset_env_info(env)

    state_size = get_state_size(env_info)
    action_size = get_action_size(env)

    print('Seed used:', seed)
    agent = DQNAgent(state_size, action_size, double_dqn, dueling_dqn, seed)

    scores = []
    scores_window = deque(maxlen=100)
    eps = eps_start

    for episode in range(1, episodes + 1):
        env_info = reset_env_info(env)
        score = 0.0
        done = False
        while not done:
            state = env_info.vector_observations[0]
            action = agent.act(state, epsilon=eps)
            env_info = env_step(env, action)
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            score += reward

        scores_window.append(score)
        scores.append(score)
        eps = max(eps * eps_decay, eps_end)
        print('\rEpisode {}/{}\tAverage Score: {:.2f}, epsilon: {:.3f}'.format(
            episode, episodes, np.mean(scores_window), eps),
              end='     ')
        if episode % 100 == 0:
            print('\rEpisode {}/{}\tAverage Score: {:.2f}, epsilon: {:.3f}'.
                  format(episode, episodes, np.mean(scores_window), eps))
        if np.mean(scores_window) > 13.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break

    env.close()
    return scores
Example #12
def test_model(filename):
	env = gym.make("CartPole-v1")
	agent = DQNAgent(4, 2)
	agent.load_model(filename)

	state = env.reset()

	for _ in range(1000):
		env.render()
		state, _, done, _ = env.step(agent.act(state, explore=False))
		if done:
			break

	env.close()
Example #13
    def __init__(self):
        pygame.init()
        self.window = pygame.display.set_mode((500, 800))
        pygame.display.set_caption("Racing AI")

        self.clock = pygame.time.Clock()
        self.execute = True

        self.car = Car(250, 650, self.window)
        self.agent = DQNAgent(inputs=4, n_actions=2)
        self.episode_durations = []

        self.update_agent = pygame.USEREVENT + 1
        update_timer = 100
        pygame.time.set_timer(self.update_agent, update_timer)
Example #14
def eval():
    env = Tetris()
    max_steps = None
    epsilon_stop_episode = 1500
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    replay_start_size = 2000
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons,
                     activations=activations,
                     epsilon=0,
                     epsilon_stop_episode=epsilon_stop_episode,
                     mem_size=mem_size,
                     discount=discount,
                     replay_start_size=replay_start_size,
                     train=False)
    agent.load("ckpts/591_model.ckpt")

    current_state = env.reset()
    done = False
    steps = 0

    # Game
    while not done and (not max_steps or steps < max_steps):
        next_states = env.get_next_states()
        best_state = agent.best_state(next_states.values())

        best_action = None
        for action, state in next_states.items():
            if state == best_state:
                best_action = action
                break

        reward, done = env.play(best_action[0],
                                best_action[1],
                                render=True,
                                render_delay=render_delay)

        agent.add_to_memory(current_state, next_states[best_action], reward,
                            done)
        current_state = next_states[best_action]
        steps += 1
Example #15
def main():

    print("Creating model...")
    model = create_model()
    model.summary()

    print("Creating environment...")
    environment = gym.make("CartPole-v0")
    environment._max_episode_steps = 500

    print("Creating agent...")
    if agent_type == "dqn":
        agent = DQNAgent(name="cartpole-dqn",
                         model=model,
                         environment=environment,
                         observation_frames=1,
                         observation_transformation=observation_transformation,
                         reward_transformation=reward_transformation,
                         gamma=0.95,
                         final_epsilon=0.01,
                         initial_epsilon=1.0,
                         number_of_iterations=1000000,
                         replay_memory_size=2000,
                         minibatch_size=32)
    elif agent_type == "ddqn":
        agent = DDQNAgent(
            name="cartpole-ddqn",
            model=model,
            environment=environment,
            observation_frames=1,
            observation_transformation=observation_transformation,
            reward_transformation=reward_transformation,
            gamma=0.95,
            final_epsilon=0.01,
            initial_epsilon=1.0,
            number_of_iterations=1000000,
            replay_memory_size=2000,
            minibatch_size=32,
            model_copy_interval=100)
    agent.enable_rewards_tracking(rewards_running_means_length=10000)
    agent.enable_episodes_tracking(episodes_running_means_length=10000)
    agent.enable_maxq_tracking(maxq_running_means_length=10000)
    agent.enable_model_saving(model_save_frequency=100000)
    agent.enable_tensorboard_for_tracking()

    print("Training ...")
    agent.fit(verbose=True, headless="render" not in sys.argv)
Example #16
def main():

    print("Creating environment...")
    environment = gym_tetris.make('Tetris-v0')

    print("Creating model...")
    model = modelutils.create_model(number_of_actions)
    model.summary()

    print("Creating agent...")
    if agent_type == "dqn":
        agent = DQNAgent(
            name="tetris-dqn",
            environment=environment,
            model=model,
            observation_transformation=utils.resize_and_bgr2gray,
            observation_frames=4,
            number_of_iterations=1000000,
            gamma=0.95,
            final_epsilon=0.01,
            initial_epsilon=1.0,
            replay_memory_size=2000,
            minibatch_size=32
        )
    elif agent_type == "ddqn":
        agent = DDQNAgent(
            name="tetris-ddqn",
            environment=environment,
            model=model,
            observation_transformation=utils.resize_and_bgr2gray,
            observation_frames=4,
            number_of_iterations=1000000,
            gamma=0.95,
            final_epsilon=0.01,
            initial_epsilon=1.0,
            replay_memory_size=2000,
            minibatch_size=32,
            model_copy_interval=100
        )
    agent.enable_rewards_tracking(rewards_running_means_length=10000)
    agent.enable_episodes_tracking(episodes_running_means_length=100)
    agent.enable_maxq_tracking(maxq_running_means_length=10000)
    agent.enable_model_saving(model_save_frequency=10000)
    agent.enable_plots_saving(plots_save_frequency=10000)

    print("Training ...")
    agent.fit(verbose=True, headless="headless" in sys.argv, render_states=True)
Example #17
def train(model_path='models/model.h5',
          opponent_policy=random_choice,
          num_episodes=1000,
          agent_params={},
          **kwargs):
    stats = statistics.default_stats()
    plt_data = statistics.plot_stats(stats, data=None)

    agent = DQNAgent(**agent_params)

    for episode in range(num_episodes):
        print('Episode {}/{}'.format(episode, num_episodes))
        env = Environment(opponent_policy=opponent_policy,
                          agent_color=board.RED,
                          agent_first_turn=True)
        done = False
        episode_length = 0
        while not done:
            state = env.get_state()
            action = agent.act_epsilon_greedy(state)
            next_state, reward, event = env.step(action)
            done = event != board.EVENT_IN_GAME
            agent.remember(state, action, reward, next_state, done)
            agent.replay(stats=stats)

            if event == board.EVENT_WIN:
                print('Won Game!')

            episode_length += 1

        stats['episode_results'].append(event)
        stats['episode_lengths'].append(episode_length)

        plt_data = statistics.plot_stats(stats, data=plt_data)
        plt.pause(0.0001)

        if episode % 100 == 0:
            agent.save(model_path)

    agent.save(model_path)
    saved_params = {'agent_params': agent_params, 'num_episodes': num_episodes}
    statistics.save_stats(
        stats, saved_params,
        "stats/stats-{}.json".format(time.strftime("%Y%m%d-%H%M%S")))
    statistics.plot_stats(stats, data=plt_data)
    plt.show()
Example #18
def main():

    print("Creating model...")
    model = modelutils.create_model(number_of_actions=4)
    model.summary()

    print("Creating agent...")
    if agent_type == "dqn":
        agent = DQNAgent(name="doom-dqn",
                         model=model,
                         number_of_actions=4,
                         gamma=0.99,
                         final_epsilon=0.0001,
                         initial_epsilon=0.1,
                         number_of_iterations=200000,
                         replay_memory_size=10000,
                         minibatch_size=32)
    elif agent_type == "ddqn":
        agent = DDQNAgent(name="doom-ddqn",
                          model=model,
                          number_of_actions=4,
                          gamma=0.99,
                          final_epsilon=0.0001,
                          initial_epsilon=0.1,
                          number_of_iterations=200000,
                          replay_memory_size=10000,
                          minibatch_size=32,
                          model_copy_interval=100)
    agent.enable_rewards_tracking(rewards_running_means_length=1000)
    agent.enable_episodes_tracking(episodes_running_means_length=1000)
    agent.enable_maxq_tracking(maxq_running_means_length=1000)
    agent.enable_model_saving(model_save_frequency=10000)
    agent.enable_plots_saving(plots_save_frequency=10000)

    print("Creating game...")
    #environment = Environment(headless=("headless" in sys.argv))
    # Create an instance of the Doom game.
    environment = DoomGame()
    environment.load_config("scenarios/basic.cfg")
    environment.set_screen_format(ScreenFormat.GRAY8)
    environment.set_window_visible("headless" not in sys.argv)
    environment.init()

    print("Training ...")
    train(agent, environment, verbose="verbose" in sys.argv)
Example #19
def DQN(episodes, epsilon, epsilonDeca):
    env = Env()
    agent = DQNAgent()
    #window=pygame.display.set_mode((windowWidth,windowHeight))
    episodeRewards = []
    for episode in range(episodes):
        episode_reward = 0
        step = 1
        current_state = env.reset()
        done = False
        while not done:

            # This part stays mostly the same, the change is to query a model for Q values
            if np.random.random() > epsilon:
                # Get action from Q table

                action = np.argmax(agent.getQs(np.array(current_state)))
            else:
                # Get random action
                action = np.random.randint(0, env.ACTION_SPACE_SIZE)

            new_state, reward, done = env.step(action)
            episode_reward += reward

            #drawWindow(window,[env.blob,env.enemyBlob],[env.ball],env.wall)

            # Every step we update replay memory and train main network
            agent.updateReplyMemory(
                (current_state, action, reward, new_state, done))
            agent.train(done, step)
            current_state = new_state
            step += 1
        episodeRewards.append(episode_reward)
        if episode % 10 == 0:
            averageReward = sum(episodeRewards) / len(episodeRewards)
            minReward = min(episodeRewards)
            maxReward = max(episodeRewards)
            print(
                f"replayMemo:{len(agent.replayMemory)}  avg:{averageReward} \n  min:{minReward}  \n  max:{maxReward} "
            )
        if epsilon > MIN_EPSILON:
            epsilon *= EPSILON_DECAY
            epsilon = max(MIN_EPSILON, epsilon)

    pygame.quit()
Example #20
def advise():
    n1 = float(request.form['n1'])
    n2 = float(request.form['n2'])
    n3 = float(request.form['n3'])
    cash = float(request.form['cash'])
    print(n1)
    print(cash)

    agent = DQNAgent(state_size, action_size)
    scaler = get_scaler(env)
    agent.load("202005011635-dqn.h5")

    state = env.reset()
    state[0] = n1
    state[1] = n2
    state[2] = n3
    state[-1] = cash
    state = scaler.transform([state])

    action = agent.act(state)
    # action_combo = list(map(list, itertools.product([0, 1, 2], repeat=3)))
    action_vec = action_combo[action]
    # action_map = {0: "sell", 1: "hold", 2: "buy"}

    # print(action_map[action_vec[0]], action_map[action_vec[1]], action_map[action_vec[2]])

    ans = []
    tmp = 1 if action_vec[0] == 0 and n1 == 0 else action_vec[0]
    if cash == 0 and tmp == 2: tmp = 1
    ans.append(action_map[tmp])
    tmp = 1 if action_vec[1] == 0 and n2 == 0 else action_vec[1]
    if cash == 0 and tmp == 2: tmp = 1
    ans.append(action_map[tmp])
    tmp = 1 if action_vec[2] == 0 and n3 == 0 else action_vec[2]
    if cash == 0 and tmp == 2: tmp = 1
    ans.append(action_map[tmp])

    print(ans)
    return render_template('index.html',
                           ans=ans,
                           n1=n1,
                           n2=n2,
                           n3=n3,
                           cash=cash)
Example #21
def td_learning(args):
    agent = DQNAgent(args)
    replay_memory = deque(maxlen=args.MAX_REPLAY_MEMORY_SIZE)
    #eval_game(agent, 500)
    outer = tqdm(range(args.total_steps), desc='Total steps', position=0)
    game = init_game()
    ave_score = 0
    count = 0
    for step in outer:
        board = copy.deepcopy(game.gameboard.board)
        if step < args.start_learn:
            avail_choices = game.gameboard.get_available_choices()
            index = np.random.randint(len(avail_choices))
            choice = avail_choices[index]
        else:
            choice = agent.greedy_policy(
                board, game.gameboard.get_available_choices())

        next_board, reward = game.input_pos(choice[0], choice[1])
        next_board = copy.deepcopy(next_board)
        replay_memory.append((board, choice, reward, next_board))

        if game.termination():
            ave_score += game.gameboard.score
            count += 1
            game = init_game()

        if step >= args.start_learn and step % args.train_freq == 0:
            if count > 0:
                message = "ave score of " + str(count) + " game: " + str(
                    ave_score / count)
                #out_fd.write("{} {}\n".format(step, ave_score/count))
                outer.write(message)
                ave_score = 0
                count = 0
            if step == args.start_learn:
                if len(replay_memory) > 0:
                    agent.train(replay_memory)
            else:
                agent.train(random.sample(replay_memory, args.train_data_size))
            agent.update_target(args.soft_tau)
            agent.update_epsilon()

    eval_game(agent, 500)
Example #22
def main():

    print("Creating model...")
    model = modelutils.create_model(number_of_actions)
    model.summary()

    print("Creating agent...")
    if agent_type == "dqn":
        agent = DQNAgent(name="supermario-dqn",
                         model=model,
                         number_of_actions=number_of_actions,
                         gamma=0.95,
                         final_epsilon=0.01,
                         initial_epsilon=1.0,
                         number_of_iterations=1000000,
                         replay_memory_size=2000,
                         minibatch_size=32)
    elif agent_type == "ddqn":
        agent = DDQNAgent(name="supermario-ddqn",
                          model=model,
                          number_of_actions=number_of_actions,
                          gamma=0.95,
                          final_epsilon=0.01,
                          initial_epsilon=1.0,
                          number_of_iterations=1000000,
                          replay_memory_size=2000,
                          minibatch_size=32,
                          model_copy_interval=100)
    agent.enable_rewards_tracking(rewards_running_means_length=10000)
    agent.enable_episodes_tracking(episodes_running_means_length=100)
    agent.enable_maxq_tracking(maxq_running_means_length=10000)
    agent.enable_model_saving(model_save_frequency=10000)
    agent.enable_plots_saving(plots_save_frequency=10000)

    print("Creating game...")
    environment = gym_super_mario_bros.make("SuperMarioBros-v0")
    environment = BinarySpaceToDiscreteSpaceEnv(environment, actions)

    print("Training ...")
    train(agent,
          environment,
          verbose="verbose" in sys.argv,
          headless="headless" in sys.argv)
Example #23
def main(cfg: omegaconf.DictConfig):

	# create the environment
	env = atari_wrappers.make_env(cfg.exp.env)
	env = gym.wrappers.Monitor(env, "recording/", force=True)
	obs = env.reset()

	# TensorBoard
	writer = SummaryWriter()
	writer.add_hparams(flatten_dict(cfg), {})
	logger.info('Hyperparams: %s', cfg)

	# create the agent
	agent = DQNAgent(env, device=cfg.train.device, summary_writer=writer, cfg=cfg)

	n_games = 0
	max_mean_40_reward = -sys.maxsize

	# Play up to cfg.train.max_episodes games
	while n_games < cfg.train.max_episodes:
		# act epsilon-greedily
		action = agent.act_eps_greedy(obs)

		# one step on the environment
		new_obs, reward, done, _ = env.step(action)

		# add the environment feedback to the agent
		agent.add_env_feedback(obs, action, new_obs, reward, done)

		# sample and optimize (the agent may wait until it has enough memories)
		agent.sample_and_optimize(cfg.train.batch_size)

		obs = new_obs
		if done:
			n_games += 1
			agent.print_info()
			agent.reset_stats()
			obs = env.reset()
			if agent.rewards:
				current_mean_40_reward = np.mean(agent.rewards[-40:])
				# save a checkpoint only when the running 40-episode mean improves
				if current_mean_40_reward > max_mean_40_reward:
					max_mean_40_reward = current_mean_40_reward
					agent.save_model(cfg.train.best_checkpoint)
	writer.close()
Example #24
def main():

    env = gym.make('carla-v0')
    state_size = env.image_size_net_chans
    action_size = len(env.action_space)
    agent = DQNAgent(state_size, action_size)

    done = False
    batch_size = 10

    try:

        for episode in range(EPISODES):
            state = env.reset(render=True)
            score = 0.0
            for time in range(10000):
                env.render()
                action = agent.act(state)
                next_state, reward, done = env.step(action)

                if done:
                    reward = -15
                else:
                    if abs(reward) < 0.5:
                        continue

                score += reward
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                if done:
                    agent.update_target_model()
                    print('episode: {}/{}, score: {:.5}, e: {}'.format(
                        episode, EPISODES, score, agent.epsilon))
                    break
                if len(agent.memory) > batch_size:
                    agent.replay(batch_size)
            if episode % 10 == 0:
                agent.save(os.path.join('..', 'models', 'carla-ddqn.h5'))

    finally:

        env.world.destroy()
Example #25
def main():
    parser = argparse.ArgumentParser(description='DQN')
    parser.add_argument('--env', type=str,
                        default='MsPacman-v0')  # 'Breakout-v0'
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--eps', type=float, default=1.0)
    parser.add_argument('--exploration_decay_speed', type=int, default=1000000)
    parser.add_argument('--eps_min', type=float, default=0.1)
    parser.add_argument('--log_size', type=int, default=100)
    parser.add_argument('--buffer_size', type=int, default=100000)
    parser.add_argument('--buffer_init_size', type=int, default=50000)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--sync_period', type=int, default=10000)
    parser.add_argument('--learn_freq', type=int, default=4)
    parser.add_argument('--save_freq', type=int, default=100)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('--exp-dir', type=str, default='exp')
    args = parser.parse_args()
    args.device = torch.device(args.device if torch.cuda.is_available() \
        and args.device.startswith('cuda') else 'cpu')

    work_dir = mkdir(args.exp_dir, args.env)  # save models

    # logging infos
    logging.basicConfig(filename=args.env + '.log',
                        filemode='w',
                        level=logging.INFO)

    env = gym.make(args.env)

    # set seed
    env.seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    torch.manual_seed(args.seed)
    if args.device.type == 'cuda':
        torch.set_default_tensor_type('torch.cuda.FloatTensor')

    agent = DQNAgent(env, args, work_dir)
    agent.run()
Example #26
def main():
    if len(sys.argv) > 1:
        host = sys.argv[1]
    epi_file = open('../files/episode.txt')
    episode = epi_file.readline()
    epi_file.close()
    episode = int(episode)
    qagent = DQNAgent(14)
    data = 'x'
    while data != '9':
        data = send_action(9)
    ys, ds = qagent.get_data(episode, 0)
    state = np.concatenate((ys, ds), axis=0)

    for step in range(1, t_steps + 1):
        action = qagent.get_action(state)
        reward = send_action(action)
        ys, ds = qagent.get_data(episode, step)
        n_state = np.concatenate((ys, ds), axis=0)
        state = n_state
Example #27
def main():
    if len(sys.argv) > 1:
        host = sys.argv[1]
    ep_reward_file = memory_path + 'ep_reward.dat'
    epi_file = open('../files/episode.txt')
    episode = epi_file.read(1)
    epi_file.close()
    qagent = DQNAgent()
    data = 'x'
    while (data != '9'):
        data = send_action(9)
    ys, ds = qagent.get_data(episode, 0)
    state = np.concatenate((ys, ds), axis=0)
    actions = []
    rewards = []
    for step in range(1, t_steps + 1):
        action = qagent.e_get_action(state)
        # action = qagent.get_action(state)
        reward = send_action(action)
        ys, ds = qagent.get_data(episode, step)
        n_state = np.concatenate((ys, ds), axis=0)
        actions.append(action)
        rewards.append(reward)
        state = n_state
    # save actions and rewards
    actions = map(str, actions)
    rewards = map(str, rewards)
    r_file = open(reward_file, 'a')
    a_file = open(action_file, 'a')
    r_str = ','.join(rewards)
    a_str = ','.join(actions)
    r_file.write(r_str + '\n')
    a_file.write(a_str + '\n')
    r_file.close()
    a_file.close()
    print("episode : ", episode, " finished.")
    episode = str(int(episode) + 1)
    epi_file = open('../files/episode.txt', 'w')
    epi_file.write(episode)
    epi_file.close()
Example #28
def main():
    # enable GPU memory growth
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # model
    model_name = input("Model name -> ")
    model_file = input("Model file -> ")
    my_model = "models/{}/{}.h5".format(model_name, model_file)

    epsilon = float(input("Epsilon -> "))
    episode_count = int(input("Episode count -> "))

    print("Loading", my_model, "with epsilon", epsilon)
    agent = DQNAgent(my_model, float(epsilon))

    # information
    resizeScale = (40, 30)
    frame_n = 3
    max_cte = 4.35

    # statistics
    score = []
    rewards = []
    highest_score = 0
    highest_reward = 0
    max_score = None

    # velocity
    max_velocity = 10.0
    max_acceleration = 0.75

    # steering
    max_steering = 0.75
    steering_step = 2 * max_steering / (agent.action_space - 1)
    steering_table = [
        i * steering_step - max_steering for i in range(agent.action_space)
    ]

    # setup donkey environment
    conf = {
        # "exe_path":"remote",
        "exe_path": "D:/sdsandbox/build2/donkey_sim.exe",
        "host": "127.0.0.1",
        "port": 9094,
        "body_style": "donkey",
        "body_rgb": (128, 128, 128),
        "car_name": "rl",
        "font_size": 100
    }

    # env = gym.make("donkey-generated-roads-v0", conf=conf)
    env = gym.make("donkey-generated-track-v0", conf=conf)
    env.viewer.handler.max_cte = max_cte
    cv2.namedWindow("camera")

    start = time.time()
    first_start = start

    for e in range(episode_count):
        # at each episode, reset environment to starting position
        state = env.reset()
        states = np.empty((frame_n, resizeScale[1], resizeScale[0], 3))
        states[0] = preprocessImage(state, resizeScale)
        need_frames = frame_n - 1

        done = False
        score.append(0)
        rewards.append(0.0)
        last_velocity = [0.0]
        laps = 0
        start = time.time()

        while not done and (score[-1] < max_score if max_score else True):
            if need_frames > 0:
                next_state, reward, done, info = env.step([
                    steering_table[random.randint(0, agent.action_space - 1)],
                    0.15
                ])

                states[frame_n - need_frames] = preprocessImage(
                    next_state, resizeScale)
                need_frames -= 1

                last_velocity.append(info["speed"])
                continue

            # select action, observe environment, calculate reward
            action, Q = agent.act(np.asarray([states]))
            steering = steering_table[action]
            throttle = calculateThrottle(last_velocity[-1], max_velocity,
                                         max_acceleration)

            next_state, reward, done, info = env.step([steering, throttle])

            img = cv2.resize(next_state, (320, 240),
                             interpolation=cv2.INTER_AREA)
            cv2.imshow("camera", img)

            last_velocity.append(round(info["speed"], 4))
            if abs(info["cte"]) >= max_cte:
                done = True
                reward = -1.0

            # for track
            else:
                reward = (1.0 - (abs(info["cte"]) / max_cte))

            # for roads
            # if not done:
            # reward = (1.0 - (abs(info["cte"]) / max_cte));

            if info["lap_finished"]:
                laps += 1

            score[-1] += 1
            rewards[-1] += reward

            # for roads
            # if self.score[-1] > 1500:
            # laps = max_laps

            next_states = np.roll(states, -1, axis=0)
            next_states[-1] = preprocessImage(next_state, resizeScale)
            states = next_states

            cv2.waitKey(1)

        env.step([0.0, -0.03])

        if len(score) > 20: score = score[-20:]
        if len(rewards) > 20: rewards = rewards[-20:]

        if score[-1] >= highest_score:
            highest_score = score[-1]

        if rewards[-1] >= highest_reward:
            highest_reward = rewards[-1]

        print(
            "episode: {}/{}, score: {}, reward: {}, laps: {}, e: {:.2}".format(
                e + 1, episode_count, score[-1], round(rewards[-1], 2), laps,
                round(agent.epsilon, 2)))

        if (e + 1) % 5 == 0:
            print("Took", round((time.time() - start) / 60, 2), "minutes\n")
            start = time.time()

    print("Showcase time:", round((time.time() - first_start) / 60, 2),
          "minutes")
Example #29
from utils import get_args


# Take argument
arg = get_args()

# Build env (first level, right only)
env = gym_super_mario_bros.make(arg.env)
env = JoypadSpace(env, RIGHT_ONLY)
env = wrapper(env)
# Parameters
states = (84, 84, 4)
actions = env.action_space.n

# Agent
agent = DQNAgent(states=states, actions=actions, max_memory=100000, double_q=True)

# Episodes
# episodes = 100001
episodes = 101
rewards = []

# Timing
start = time.time()
step = 0

# Main loop
for e in range(episodes):

    # Reset env
    state = env.reset()
Example #30
world = World(args.config_file, thread_num=args.thread)

# create agents
agents = []

#parameters['buffer_size'] = parameters['buffer_size']*len(world.intersections)
#parameters['batch_size'] = parameters['batch_size']*len(world.intersections)

for i in world.intersections:
    action_space = gym.spaces.Discrete(len(i.phases))
    agents.append(
        DQNAgent(
            action_space,
            LaneVehicleGenerator(world,
                                 i, ["lane_count"],
                                 in_only=True,
                                 average=None,
                                 scale=.025),
            PressureRewardGenerator(world, i, scale=0.005, negative=True),
            i.id, parameters, world))
    if args.load_model:
        agents[-1].load_model(args.save_dir)

# Create metric
metric = [
    TravelTimeMetric(world),
    ThroughputMetric(world),
    SpeedScoreMetric(world),
    MaxWaitingTimeMetric(world)
]