Code example #1
def run_dqn(config, gym_wrapper, summaries_collector_traj,
            summaries_collector):
    q_network = DeepQNetwork(config, gym_wrapper, trajectory=1)
    initial_time = round(time(), 3)
    q_network.train(summaries_collector)
    reward = q_network.test(summaries_collector, episodes=10, render=True)  # evaluate over 10 rendered episodes
    summaries_collector.read_summaries('test')
    total_time_traj = round(time(), 3) - initial_time  # wall-clock time for training plus evaluation
    print("tested avg reward: {0} in: {1}".format(reward, total_time_traj))
Code example #2
from config_utils import read_main_config
from deep_q_network import DeepQNetwork
from gym_wrapper import GymWrapper


from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

config = read_main_config()  # load the experiment configuration
gym_wrapper = GymWrapper(config['general']['scenario'])  # wrap the Gym scenario named in the config
deep_q_network = DeepQNetwork(config, gym_wrapper)
deep_q_network.train()
deep_q_network.test(episodes=3)  # evaluate the trained network over 3 episodes
Code example #3
def train(opt):
    """Train the deep Q-network on the Tetris environment."""
    if torch.cuda.is_available():  # use CUDA when a GPU is available
        torch.cuda.manual_seed(125)  # fix the CUDA seed for reproducible results
    else:
        torch.manual_seed(125)  # fix PyTorch's CPU random number generator
    if os.path.isdir(opt.log_path):  # remove any logs left from a previous run
        shutil.rmtree(opt.log_path)  # delete the whole log_path directory
    os.makedirs(opt.log_path)  # recreate an empty log directory
    new_writer2 = SummaryWriter(opt.log_path)  # TensorBoard summary writer for the training metrics
    environment = Tetris(width=opt.width, height=opt.height,
                         block_size=opt.block_size)  # Tetris environment with the board size from the parser
    deepQ_model = DeepQNetwork()  # the deep Q-network defined earlier
    my_optim = torch.optim.Adam(deepQ_model.parameters(),
                                lr=opt.lr)  # Adam optimizer over the model parameters, learning rate from the parser
    cn = nn.MSELoss()  # mean squared error loss, i.e. ((input - target) ** 2).mean()
    state = environment.reset()  # initial state from a freshly reset environment
    if torch.cuda.is_available():
        deepQ_model.cuda()  # move the model to the GPU
        state = state.cuda()  # move the state tensor to the GPU
    r_memory = deque(maxlen=opt.mem_size)  # replay memory; the oldest transitions are dropped once it is full
    epoch = 0
    output_training_video = cv2.VideoWriter(
        opt.result, cv2.VideoWriter_fourcc(*'FMP4'), opt.fps,
        (int(1.5 * opt.width * opt.block_size),
         opt.height * opt.block_size))  # video writer that records the rendered training frames
    while epoch < opt.num_epochs:  # loop until the requested number of epochs is reached

        next_steps = environment.get_next_states()  # every candidate placement and the state it leads to
        # Epsilon-greedy exploration: epsilon decays linearly from initialEpsilon
        # to finalEpsilon over the first opt.decay_epochs epochs.
        epsilon = opt.finalEpsilon + (
            max(opt.decay_epochs - epoch, 0) *
            (opt.initialEpsilon - opt.finalEpsilon) / opt.decay_epochs)
        pp = random()  # uniform random number in [0, 1)
        rand_action = pp <= epsilon  # explore (take a random action) with probability epsilon
        nextActions, next_states = zip(*next_steps.items())  # split the dict into action and state tuples
        next_states = torch.stack(next_states)  # stack the candidate states along a new batch dimension
        if torch.cuda.is_available():
            next_states = next_states.cuda()  # move the candidate states to the GPU
        deepQ_model.eval()  # evaluation mode while scoring the candidate states
        with torch.no_grad():  # no gradients are needed for action selection
            dqm_p = deepQ_model(next_states)[:, 0]  # predicted value of each candidate state
        deepQ_model.train()  # back to training mode
        if rand_action:  # explore
            idx = randint(0, len(next_steps) - 1)  # pick a random candidate
        else:  # exploit
            idx = torch.argmax(dqm_p).item()  # pick the candidate with the highest predicted value
        next_state = next_states[idx, :]  # state reached by the chosen action
        action = nextActions[idx]  # the chosen action
        reward, done = environment.make_step(action, cv2_rend=True)  # apply the action and render with OpenCV
        if torch.cuda.is_available():
            next_state = next_state.cuda()  # move the next state to the GPU
        r_memory.append([state, reward, next_state, done])  # store the transition in the replay memory
        if done:  # the episode has ended
            output_training_video.release()
            episode_durations.append(epoch + 1)
            #plot_durations()
            final_total_score = environment.player_score  # final score of the finished episode
            tot_reward.append(final_total_score)
            plot_reward()
            final_total_blocks = environment.tetris_blocks  # number of pieces placed in the episode
            final_total_completed_lines = environment.completed_lines  # number of lines cleared in the episode
            state = environment.reset()  # start a new episode
            if torch.cuda.is_available():
                state = state.cuda()  # move the new state to the GPU
        else:
            state = next_state  # keep playing from the next state
            continue
        if len(r_memory) < opt.mem_size / 10:  # wait until the replay memory is at least 10% full
            continue
        epoch += 1
        batch = sample(r_memory,
                       min(len(r_memory), opt.mini_batch_size))  # sample a mini-batch of stored transitions
        stateBatch, batchReward, nextB_state, completed_batch = zip(*batch)  # unzip the batch into its components
        stateBatch = torch.stack(tuple(state for state in stateBatch))  # stack the sampled states into one tensor
        batchReward = torch.from_numpy(
            np.array(batchReward, dtype=np.float32)[:, None])  # rewards as a float32 column tensor
        nextB_state = torch.stack(tuple(state for state in nextB_state))  # stack the sampled next states into one tensor
        if torch.cuda.is_available():
            stateBatch = stateBatch.cuda()  # move the state batch to the GPU
            batchReward = batchReward.cuda()  # move the reward batch to the GPU
            nextB_state = nextB_state.cuda()  # move the next-state batch to the GPU
        q_values = deepQ_model(stateBatch)  # current value estimates for the sampled states
        deepQ_model.eval()  # evaluation mode while computing the targets
        with torch.no_grad():  # no gradients are needed for the target values
            nextPred_batch = deepQ_model(nextB_state)  # value estimates for the sampled next states
        deepQ_model.train()  # back to training mode
        # Bellman targets: the reward alone for terminal transitions, otherwise
        # reward + gamma * predicted value of the next state.
        batch_Y = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(
                      batchReward, completed_batch, nextPred_batch)))[:, None]
        my_optim.zero_grad()  # clear the accumulated gradients before this mini-batch
        loss = cn(q_values, batch_Y)  # MSE between the current estimates and the targets
        loss.backward()  # backpropagate the loss through the network
        my_optim.step()  # update the network parameters
        print(
            "Epoch Num: {}/{}, Action: {}, Score: {}, TPieces {}, Cleared lines: {}"
            .format(epoch, opt.num_epochs, action, final_total_score,
                    final_total_blocks,
                    final_total_completed_lines))  # per-epoch training summary
        new_writer2.add_scalar('Train/Score', final_total_score,
                               epoch - 1)  # log the episode score to TensorBoard
        new_writer2.add_scalar('Train/TPieces', final_total_blocks,
                               epoch - 1)  # log the number of placed pieces
        new_writer2.add_scalar('Train/Cleared lines', final_total_completed_lines,
                               epoch - 1)  # log the number of cleared lines
        if epoch > 0 and epoch % opt.store_interval == 0:  # periodically checkpoint the model
            torch.save(deepQ_model, "{}/tetris_{}".format(opt.saved_path, epoch))
    torch.save(deepQ_model, "{}/tetris".format(opt.saved_path))  # save the final trained model
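The training loop above calls DeepQNetwork() and reads its output as one value per state (hence the [:, 0] indexing and the (batch, 1) targets). A minimal sketch of such a module, assuming each state is encoded as a small feature vector; the input size and layer widths are illustrative assumptions, not the original architecture:

import torch.nn as nn


class DeepQNetwork(nn.Module):
    """Illustrative stand-in for the DeepQNetwork used above (sizes are assumptions)."""

    def __init__(self, in_features=4, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, x):
        # x: (batch, in_features) -> (batch, 1) value estimates
        return self.net(x)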
Code example #4
            cur_episode_reward += reward

            if buf.size() > MIN_BUFFER:
                states, actions, rewards, next_states, dones = buf.sample(MINI_BATCH)
                # Bootstrap term: max_a' Q_target(s', a') from the target network
                next_state_action_values = np.max(
                    target_dqn.predict(next_states / 255.0), axis=1)
                # y_true.shape: (MINI_BATCH, num_actions), i.e., (32, 6)
                y_true = dqn.predict(states / 255.0)
                # Overwrite the taken actions with the TD targets; np.invert(dones)
                # drops the bootstrap term for terminal transitions.
                y_true[range(MINI_BATCH), actions] = (
                    rewards + GAMMA * next_state_action_values * np.invert(dones))
                dqn.train(states / 255.0, y_true)
            step += 1
        total_episode_rewards.append(cur_episode_reward)
        if episode % 100 == 0:
            dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))
        if np.mean(total_episode_rewards[-30:]) > 19:
            dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))
            break
    np.save(os.path.join(RES_DIR, 'episode_rewards.npy'),
            np.array(total_episode_rewards))

    # Plot the episode rewards
    plt.figure()
    plt.title('EPISODE - REWARD')
    plt.plot(range(len(total_episode_rewards)),
             total_episode_rewards,
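The fragment above assumes a replay buffer exposing size() and sample(n) that returns state, action, reward, next-state, and done arrays. A minimal sketch of such a buffer; the class name and internals are illustrative assumptions, not the original project's code:

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Illustrative replay buffer matching the buf.size() / buf.sample(n) interface above."""

    def __init__(self, capacity):
        self.storage = deque(maxlen=capacity)  # oldest transitions are dropped when full

    def add(self, state, action, reward, next_state, done):
        self.storage.append((state, action, reward, next_state, done))

    def size(self):
        return len(self.storage)

    def sample(self, n):
        batch = random.sample(self.storage, n)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones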
Code example #5
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()

    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()

    replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    while epoch < opt.num_epochs:
        next_steps = env.get_next_states()
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (
            max(opt.num_decay_epochs - epoch, 0) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()
        # if random_action:
        #     index = randint(0, len(next_steps) - 1)
        # else:
        index = torch.argmax(predictions).item()

        next_state = next_states[index, :]
        action = next_actions[index]

        reward, done = env.step(action, render=True)

        if torch.cuda.is_available():
            next_state = next_state.cuda()
        replay_memory.append([state, reward, next_state, done])
        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue
        if len(replay_memory) < opt.replay_memory_size / 10:
            continue
        epoch += 1
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(
            tuple(state for state in next_state_batch))

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        q_values = model(state_batch)
        model.eval()
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()

        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(
                      reward_batch, done_batch, next_prediction_batch)))[:, None]

        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()

        print(
            "Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}"
            .format(epoch, opt.num_epochs, action, final_score,
                    final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines,
                          epoch - 1)

        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))

    torch.save(model, "{}/tetris2".format(opt.saved_path))
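A minimal sketch of how train(opt) above might be invoked. The argument names match the opt.* attributes the function reads; the default values are illustrative placeholders, not the author's original settings:

import argparse


def get_args():
    # Each flag corresponds to an opt.* attribute read by train(); defaults are placeholders.
    parser = argparse.ArgumentParser(description="Train a DQN agent to play Tetris")
    parser.add_argument("--width", type=int, default=10)
    parser.add_argument("--height", type=int, default=20)
    parser.add_argument("--block_size", type=int, default=30)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--initial_epsilon", type=float, default=1.0)
    parser.add_argument("--final_epsilon", type=float, default=1e-3)
    parser.add_argument("--num_decay_epochs", type=int, default=2000)
    parser.add_argument("--num_epochs", type=int, default=3000)
    parser.add_argument("--batch_size", type=int, default=512)
    parser.add_argument("--replay_memory_size", type=int, default=30000)
    parser.add_argument("--save_interval", type=int, default=1000)
    parser.add_argument("--log_path", type=str, default="tensorboard")
    parser.add_argument("--saved_path", type=str, default="trained_models")
    return parser.parse_args()


if __name__ == "__main__":
    train(get_args())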
Code example #6
File: main.py Project: rtv313/DQN-SpaceInvaders
            # Prepare data batch
            for i in range(batch_size):
                states[i] = experiences_batch[i][0]
                actions.append(experiences_batch[i][1])
                next_states[i] = experiences_batch[i][2]
                rewards.append(experiences_batch[i][3])

            current_q_values = policy_net.predict(states)
            target_q_values = target_net.predict(next_states)

            # Create Q_targets
            for i in range(batch_size):
                # Q_max = max_a' Q_target(s', a')
                target_q_values[i][actions[i]] = rewards[i] + gamma * np.amax(
                    target_q_values[i])

            # Train Policy Network
            policy_net.train(states, target_q_values)

        if environment_manager.done:
            max_reward = max_reward if max_reward > max_episode_reward else max_episode_reward
            print("Episode: " + str(episode) + " Episode reward: " +
                  str(max_episode_reward) + " Max Reward: " + str(max_reward) +
                  " Epsilon value " +
                  str(strategy.get_actual_exploration_rate()))
            break
    # update target network and save network
    if episode % target_update == 0:
        target_net.copy_weights_from_nn(policy_net)
        policy_net.save(episode, strategy.get_actual_exploration_rate())
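The snippet above hard-updates the target network every target_update episodes via target_net.copy_weights_from_nn(policy_net). A minimal sketch of what such a method could look like if the networks wrap Keras-style models with get_weights()/set_weights(); the wrapper class and attribute names are illustrative assumptions, not the project's actual code:

class QNetworkWrapper:
    """Illustrative wrapper showing one way copy_weights_from_nn could be implemented."""

    def __init__(self, keras_model):
        self.model = keras_model  # hypothetical attribute holding a Keras-style model

    def copy_weights_from_nn(self, other):
        # Hard update: overwrite this network's weights with the other network's weights.
        self.model.set_weights(other.model.get_weights())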