Example 1
    step = 0
    while step < arg.game_num:
        obs = env.reset()
        done = False
        total_reward = 0
        step = agent.step_move()  # advance the agent's global game counter
        # linearly anneal epsilon down to the configured floor
        epsilon = max(1 - step * arg.epsilon_decrease, arg.epsilon_min)
        while not done:
            # epsilon-greedy: explore with probability epsilon, otherwise exploit
            if np.random.uniform(0, 1) < epsilon:
                action = agent.random_action()
            else:
                action = agent.choose_action(obs)
            # only three actions are used, so shift into the env's action space
            obs_, reward, done, _ = env.step(action + 1)
            replay_buffer.store_transition(obs, obs_, action, reward, done)
            total_reward += reward
            obs = obs_

        print('in {}, game {}: reward {}'.format(
            arg.run_name, step, total_reward))

        if step % train_period == 0:
            s1, s2, a, r, d = replay_buffer.sample(batch_size=train_batch)
            if step % record_period == 0:
                # every record_period games: train, log the reward, and checkpoint
                loss = agent.train(s1, s2, a, r, d, True)
                agent.log_reward(total_reward)
                agent.save()
            else:
                loss = agent.train(s1, s2, a, r, d, False)
            print('game {}: training loss {}'.format(step, loss))
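
The loop above touches the replay buffer only through store_transition and sample. Below is a minimal sketch of a buffer exposing that interface, assuming FIFO eviction and uniform sampling; the class name and internals are illustrative, not the project's actual ReplayBuffer.

    import numpy as np

    class UniformReplayBuffer:  # hypothetical stand-in for the ReplayBuffer used above
        def __init__(self, max_size=100000):
            self.storage = []  # holds (obs, obs_, action, reward, done) tuples
            self.max_size = max_size

        def store_transition(self, obs, obs_, action, reward, done):
            # evict the oldest transition once the buffer is full (FIFO)
            if len(self.storage) >= self.max_size:
                self.storage.pop(0)
            self.storage.append((obs, obs_, action, reward, done))

        def sample(self, batch_size):
            # draw a uniform random batch and return it column-wise as s1, s2, a, r, d
            idx = np.random.randint(0, len(self.storage), size=batch_size)
            s1, s2, a, r, d = map(np.array, zip(*(self.storage[i] for i in idx)))
            return s1, s2, a, r, d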
Example 2
        memory = []
        total_reward = 0

        obs = env.reset()
        frame_processor = FrameProcessor(obs)
        obs, reward, done, _ = env.step(agent.random_action() + 1)  # take a random action on the first frame
        total_reward += reward
        # play one game
        while not done:
            input_frame = frame_processor.process(obs)
            # sample an action from the policy's predicted probabilities
            prob = agent.get_action_prob(input_frame)
            action = np.random.choice(3, p=prob)
            obs, reward, done, _ = env.step(action + 1)

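            # A nonzero reward marks the end of a point (Pong-style scoring):
            # zero-reward frames are buffered, and once the point ends,
            # back_trace_reward presumably propagates the reward backward
            # through the buffered transitions with the given decay factor,
            # after which the finished point's buffer is archived in `memory`.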
            if reward == 0:
                replay_buffer.store_transition(input_frame, action)
            else:
                total_reward += reward
                if reward == 1:
                    # a won point: full credit along the trajectory (decay 1)
                    replay_buffer.back_trace_reward(reward, 1)
                else:
                    # a lost point: credit decays backward (decay 0.9)
                    replay_buffer.back_trace_reward(reward, 0.9)

                memory.append(replay_buffer)
                # start a fresh buffer for the next point
                replay_buffer = ReplayBuffer(input_shape=[160, 160], start_size=32, max_size=10000000)

        # train on the finished points stored in memory
        step = agent.step_move()
        loss = 0
        for j in range(len(memory)):