Example #1

import datetime
from collections import deque

import numpy as np
import torch

# Env, Net, ExperienceReplay, AgentDDQN, and MAX_SCORE_COUNT are assumed
# to be provided by the surrounding project.


def learning(episode_count):
    env = Env()

    actions = 8
    input_dim = (2, 84, 84)

    scores = []
    avg_scores = deque([])

    gamma = 0.9
    lr = 0.0001
    exp_buffer_size = 25000
    batch_size = 256

    buffer = ExperienceReplay(exp_buffer_size, input_dim)

    agent = AgentDDQN(gamma,
                      actions,
                      Net(),
                      buffer,
                      lr,
                      update_steps=1000,
                      batch_size=batch_size,
                      path="trained_models/sc2_beacon",
                      epsilon_dec=1e-5)

    d_now = datetime.datetime.now()
    for i in range(0, episode_count):
        score = 0
        terminal = False
        state = env.reset()
        state = torch.from_numpy(state).double()

        # One episode: act, observe, store the transition, learn every step.
        while not terminal:
            action = agent.choose_action(state)
            state_, reward, terminal = env.step(action)
            state_ = torch.from_numpy(state_).double()
            score += reward
            agent.store(state, action, reward, state_, terminal)
            agent.learn()
            state = state_

        avg_scores.append(score)
        if len(avg_scores) > MAX_SCORE_COUNT:
            avg_scores.popleft()

        scores.append(score)

        print('episode: ', i, '\t\tscore: ',
              round(score, 3), '\t\tcumulative score:',
              round(np.average(scores), 3), '\t\tavg-100 score:',
              round(np.average(avg_scores), 3), '\t\tepsilon: ',
              round(agent.epsilon, 4))
        if i % 100 == 0:
            # Report elapsed time and checkpoint the model every 100 episodes.
            d_end = datetime.datetime.now()
            d = d_end - d_now
            print('time: ', d)
            agent.save_model()
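
The ExperienceReplay object passed to AgentDDQN above is project code that is
not shown here. Below is a minimal sketch of the interface the loop relies on
(store one transition, sample a uniform batch), assuming a fixed-capacity
circular NumPy buffer; the class body is illustrative, not the project's
actual implementation.

import numpy as np

class ExperienceReplay:
    # Hypothetical sketch: fixed-capacity circular buffer of transitions.
    def __init__(self, capacity, input_dim):
        self.capacity = capacity
        self.counter = 0  # total transitions stored so far
        self.states = np.zeros((capacity, *input_dim), dtype=np.float64)
        self.next_states = np.zeros((capacity, *input_dim), dtype=np.float64)
        self.actions = np.zeros(capacity, dtype=np.int64)
        self.rewards = np.zeros(capacity, dtype=np.float64)
        self.terminals = np.zeros(capacity, dtype=bool)

    def store(self, state, action, reward, next_state, terminal):
        idx = self.counter % self.capacity  # overwrite the oldest entry
        self.states[idx] = state
        self.next_states[idx] = next_state
        self.actions[idx] = action
        self.rewards[idx] = reward
        self.terminals[idx] = terminal
        self.counter += 1

    def sample(self, batch_size):
        # Caller should only sample once counter >= batch_size.
        size = min(self.counter, self.capacity)
        idx = np.random.choice(size, batch_size, replace=False)
        return (self.states[idx], self.actions[idx], self.rewards[idx],
                self.next_states[idx], self.terminals[idx])
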
Example #2

from collections import deque

import numpy as np
import torch

# Env, Net, ExperienceReplay, AgentDDQN, and MAX_SCORE_COUNT are assumed
# to be provided by the surrounding project.


def animation():
    env = Env()

    actions = 2 * 64 * 64
    input_dim = (3, 64, 64)

    scores = []
    avg_scores = deque([])

    gamma = 0.9
    lr = 0.0001
    # Playback only: the agent never stores transitions or calls learn(),
    # so the replay buffer and batch size can stay at zero.
    exp_buffer_size = 0
    batch_size = 0

    buffer = ExperienceReplay(exp_buffer_size, input_dim)

    agent = AgentDDQN(gamma,
                      actions,
                      Net(),
                      buffer,
                      lr,
                      update_steps=1000,
                      batch_size=batch_size,
                      epsilon=0.01,
                      path="trained_models/sc2_collect_shards")

    agent.load_model()

    i = 0
    while True:  # play episodes indefinitely for visualization
        score = 0
        terminal = False
        state = env.reset()
        state = torch.from_numpy(state).double()
        i += 1

        while not terminal:
            action = agent.choose_action(state)
            state_, reward, terminal = env.step(action)
            state = torch.from_numpy(state_).double()
            score += reward

        avg_scores.append(score)
        if len(avg_scores) > MAX_SCORE_COUNT:
            avg_scores.popleft()

        scores.append(score)
        print('episode: ', i, '\t\tcumulative score:',
              round(np.average(scores), 3), '\t\tavg-100 score:',
              round(np.average(avg_scores), 3))
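
During playback the agent is constructed with epsilon=0.01, so choose_action
is almost always greedy. A minimal sketch of the epsilon-greedy selection
these examples imply, assuming the network maps a state tensor to one Q-value
per action (function and argument names are assumptions, not the project's
API):

import numpy as np
import torch

def choose_action(net, state, epsilon, n_actions):
    # With probability epsilon take a uniformly random action (explore).
    if np.random.random() < epsilon:
        return np.random.randint(n_actions)
    # Otherwise act greedily on the network's Q-value estimates.
    with torch.no_grad():
        q_values = net(state.unsqueeze(0))  # add a batch dimension
    return int(torch.argmax(q_values).item())
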
Example #3

import datetime

import gym
import numpy as np
import torch

# Net, ExperienceReplay, and AgentDDQN are assumed to be provided by the
# surrounding project.


def learning():
    env = gym.make('LunarLander-v2')
    actions = 4
    state_dim = (8, )

    games_count = 1000
    scores = []

    experience_replay = ExperienceReplay(10000, state_dim)
    agent = AgentDDQN(0.99, actions, Net(), experience_replay, 0.001)

    d_now = datetime.datetime.now()
    for i in range(1, games_count + 1):  # episodes numbered from 1
        score = 0
        terminal = False
        state = env.reset()
        state = torch.from_numpy(state).double()

        while not terminal:
            action = agent.choose_action(state)
            state_, reward, terminal, _ = env.step(action)
            state_ = torch.from_numpy(state_).double()
            agent.store(state, action, reward, state_, terminal)
            agent.learn()
            state = state_
            score += reward

        scores.append(score)

        print('episode: ', i, '\t\tscore: ', score, '\t\taverage score:',
              np.average(scores[-100:]), '\t\tepsilon: ', agent.epsilon)
        if i % 10 == 0:
            d_end = datetime.datetime.now()
            d = d_end - d_now
            print('time: ', d)
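
What makes AgentDDQN "double" is how learn() builds its bootstrap target:
the online network picks the best next action, while a periodically refreshed
copy of it (presumably what the update_steps argument in the other examples
controls) evaluates that action. A hedged sketch of the target computation in
PyTorch; the function and tensor names are assumptions, not the project's
actual code:

import torch

def ddqn_targets(online_net, target_net, rewards, next_states, terminals, gamma):
    with torch.no_grad():
        # The online network selects the argmax action in the next state...
        next_actions = online_net(next_states).argmax(dim=1, keepdim=True)
        # ...and the target network supplies that action's value estimate.
        next_q = target_net(next_states).gather(1, next_actions).squeeze(1)
        next_q[terminals] = 0.0  # no bootstrapping past terminal states
    return rewards + gamma * next_q
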
Example #4

import datetime

import numpy as np
import torch

# make_env, Net, ExperienceReplay, AgentDDQN, and write_to_file are assumed
# to be provided by the surrounding project.


def learning(num):
    env = make_env('PongNoFrameskip-v4')
    actions = 6
    input_dim = (4, 80, 80)
    scores = []

    experience_replay = ExperienceReplay(25000, input_dim)
    agent = AgentDDQN(0.99,
                      actions,
                      Net(),
                      experience_replay,
                      0.0001,
                      update_steps=1000,
                      batch_size=32,
                      epsilon_min=0.02,
                      epsilon=1.0,
                      epsilon_dec=1e-4)

    i = 0
    step = 0
    d_start = datetime.datetime.now()
    print('start ', d_start)
    d_end = datetime.datetime.now() + datetime.timedelta(minutes=180)  # train for 3 hours
    text = 'episode,score,time,step,epsilon'

    while d_end > datetime.datetime.now():
        score = 0
        terminal = False
        state = env.reset()
        state = torch.from_numpy(state).double()
        i += 1

        while not terminal:
            step += 1
            action = agent.choose_action(state)
            state_, reward, terminal, _ = env.step(action)
            state_ = torch.from_numpy(state_).double()
            score += reward

            agent.store(state, action, reward, state_, terminal)
            agent.learn()
            state = state_

        scores.append(score)
        avg = np.average(scores[-100:])
        d = datetime.datetime.now() - d_start
        # Append one CSV row per episode to the log.
        text += f'\n{i},{avg},{d},{step},{agent.epsilon}'

    write_to_file(text, f'logs/pong/log_1_{num}_DDQN.txt')
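
write_to_file is a small project helper that is not shown here; a minimal
stand-in consistent with how it is called above (creating the log directory
and overwriting the file are assumptions):

import os

def write_to_file(text, path):
    # Make sure the target directory exists, then write the whole log.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        f.write(text)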