Example #1
priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

# Construct validation memory
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size:
    if done:
        state, done = env.reset(), False

    action = random.randint(0, action_space - 1)
    if args.env == 'peg1-v0':
        action = np.array([action // 16, action % 16 // 4, action % 4])
        next_state, _, done, _ = env.step(action)
    else:
        next_state, _, done = env.step(action)
    val_mem.append(state, None, None, done)
    state = next_state
    T += 1

if args.evaluate:
    dqn.eval()  # Set DQN (online network) to evaluation mode
    avg_reward, avg_Q = test(args, 0, dqn, val_mem, evaluate=True)  # Test
    print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
else:
    # Training loop
    dqn.train()
    T, done = 0, True
    while T < args.T_max:
        if done:
            state, done = env.reset(), False
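The 'peg1-v0' branch in Example #1 unpacks a single flat random integer into a three-component action using integer division and modulo. A minimal sketch of that mapping, assuming only the 4 x 4 x 4 factorisation visible in the snippet (the helper names decode_action/encode_action and the round-trip check are illustrative, not part of the original):

import numpy as np

def decode_action(index):
    # Split a flat index in [0, 64) into three base-4 digits,
    # exactly as the peg1-v0 branch above does.
    return np.array([index // 16, index % 16 // 4, index % 4])

def encode_action(components):
    # Inverse mapping back to the flat index.
    a, b, c = components
    return a * 16 + b * 4 + c

# Round-trip check over the whole 64-action space.
for i in range(64):
    assert encode_action(decode_action(i)) == i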
Example #2
          'online_net': dqn.online_net.state_dict()
        }, ckptdir / 'last_ckpt.tar')
  save_mem(mem, ckptdir)
  log("Checkpoint successfully saved")

priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

# Construct validation memory
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size:
  if done:
    state = env.reset()

  next_state, _, done = env.step(np.random.randint(0, action_space))
  val_mem.append(state, -1, 0.0, done)
  state = next_state
  T += 1

  done = True  # Treat every validation sample as a one-step episode; the env is reset on the next iteration
    
if args.evaluate:
  dqn.eval()  # Set DQN (online network) to evaluation mode
  avg_reward, avg_Q = test(args, 0, dqn, val_mem, metrics, results_dir, evaluate=True)  # Test
  print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
else:
  # Training loop
  dqn.train()
  T, done = 0, True
  for T in trange(T_init, args.T_max + 1):
    if done:
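Every example above computes priority_weight_increase the same way: the importance-sampling exponent of prioritised replay starts at args.priority_weight and is annealed linearly to 1 between learn_start and T_max. A self-contained sketch with made-up numbers (0.4, 1_000 and 100_000 are assumptions, not values from the snippets):

priority_weight = 0.4   # assumed initial importance-sampling exponent
learn_start = 1_000     # assumed
T_max = 100_000         # assumed

priority_weight_increase = (1 - priority_weight) / (T_max - learn_start)

for T in range(learn_start, T_max + 1):
    # One increment per step once learning has started, capped at 1.
    priority_weight = min(priority_weight + priority_weight_increase, 1.0)

print(priority_weight)  # 1.0 by the time T reaches T_max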
Example #3
# Agent
dqn = Agent(args, env)
mem = ReplayMemory(args, args.memory_capacity)
priority_weight_increase = (1 - args.priority_weight) / (args.T_max -
                                                         args.learn_start)

# Construct validation memory
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size - args.history_length + 1:
    if done:
        state, done = env.reset(), False
        val_mem.preappend()  # Set up memory for beginning of episode

    val_mem.append(state, None, None)
    state, _, done = env.step(random.randint(0, action_space - 1))
    T += 1
    # No need to postappend on done in validation memory

if args.evaluate:
    dqn.eval()  # Set DQN (policy network) to evaluation mode
    avg_reward, avg_Q = test(args, 0, dqn, val_mem, evaluate=True)  # Test
    print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
else:
    # Training loop
    dqn.train()
    T, done = 0, True
    while T < args.T_max:
        if done:
            state, done = Variable(env.reset()), False
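The validation memory built in Examples #1-#3 holds states gathered by a purely random policy; test(...) later reports the online network's average Q-value over this fixed held-out set next to the average evaluation reward, which is the 'Avg. Q' printed above. A rough sketch of that computation, assuming a plain value-based network whose forward pass returns per-action Q-values (online_net and states are placeholders; the actual Rainbow agent derives Q from a return distribution):

import torch

@torch.no_grad()
def average_max_q(online_net, states):
    # Mean of max_a Q(s, a) over a fixed batch of held-out states.
    q_values = online_net(states)  # shape: [batch, num_actions]
    return q_values.max(dim=1).values.mean().item()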
Example #4
def train_agent(env, args, config):
    """Train a Rainbow-style agent on env, logging progress to TensorBoard.

    Args:
        env: environment with a gym-like reset()/step() interface.
        args: parsed command-line arguments (device, replay and schedule settings).
        config: dict providing at least "seed" and "locexp".
    """

    # Create a CNN that converts a [1, 3, 84, 84] observation into a [1, 200] feature vector
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(config["seed"])
    np.random.seed(config["seed"])
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        torch.backends.cudnn.enabled = args.enable_cudnn
    pathname = dt_string + "_seed" + str(config["seed"])
    print("save tensorboard {}".format(config["locexp"]))
    tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
    agent = Agent(args, env)
    memory = ReplayMemory(args, args.memory_capacity)
    #memory =  ReplayBuffer((3, config["size"], config["size"]), (1,), config["expert_buffer_size"], int(config["image_pad"]), config["device"])
    priority_weight_increase = (1 - args.priority_weight) / (args.T_max -
                                                             args.learn_start)
    writer = SummaryWriter(tensorboard_name)
    results_dir = os.path.join(str(config["locexp"]), args.id)
    mkdir("", results_dir)
    scores_window = deque(maxlen=100)
    scores = []
    t0 = time.time()
    # Training loop
    agent.train()
    T, done = 0, True
    print("result dir ", results_dir)
    agent.save(results_dir, 'checkpoint-{}.pth'.format(T))
    #eval_policy(env, agent, writer, T, config)
    episode = -1
    steps = 0
    score = 0
    print("save policy ", args.checkpoint_interval)
    # eval_policy(env, agent, writer, 0, config)
    for T in range(1, args.T_max + 1):
        # print("\r {} of {}".format(T, args.T_max), end='')
        if done:
            episode += 1
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            print(
                '\rTime steps {}  episode {} score {} Average Score: {:.2f} time: {}'
                .format(T, episode, score, np.mean(scores_window),
                        time_format(time.time() - t0)),
                end="")
            writer.add_scalar('Episode_reward ', score, T)
            average_reward = np.mean(scores_window)
            writer.add_scalar('Average_reward ', average_reward, T)
            state, done = env.reset(), False
            steps = 0
            score = 0
        if T % args.replay_frequency == 0:
            agent.reset_noise()  # Draw a new set of noisy weights

        action = agent.act(
            state)  # Choose an action greedily (with noisy weights)
        next_state, reward, done, _ = env.step(action)  # Step
        score += reward
        steps += 1
        if steps == 30:
            done = True
        memory.append(state, action, reward,
                      done)  # Append transition to memory

        # Train and test
        if T >= args.learn_start:
            memory.priority_weight = min(
                memory.priority_weight + priority_weight_increase,
                1)  # Anneal importance sampling weight β to 1

            if T % args.replay_frequency == 0:
                agent.learn(
                    memory
                )  # Train with n-step distributional double-Q learning

            # Update target network
            if T % args.target_update == 0:
                agent.update_target_net()

            # Checkpoint the network
            if (args.checkpoint_interval !=
                    0) and (T % args.checkpoint_interval == 0):
                print("Eval policy")
                eval_policy(env, agent, writer, T, config)
                agent.save(results_dir, 'checkpoint-{}.pth'.format(T))
        state = next_state
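train_agent drives everything off the single step counter T: learning, target-network syncs and checkpoints all fire on modulo tests once T has passed learn_start. A compressed sketch of that schedule with illustrative values (the numbers below are assumptions, not the defaults of the original argument parser):

replay_frequency = 4          # assumed: learn every 4 environment steps
target_update = 8_000         # assumed: sync the target network every 8k steps
checkpoint_interval = 50_000  # assumed: evaluate and save every 50k steps
learn_start = 1_600           # assumed: no learning before this many steps

def events_at_step(T):
    # Which bookkeeping actions the loop in Example #4 would trigger at step T.
    if T < learn_start:
        return []
    events = []
    if T % replay_frequency == 0:
        events.append("learn")
    if T % target_update == 0:
        events.append("update_target")
    if checkpoint_interval != 0 and T % checkpoint_interval == 0:
        events.append("evaluate_and_checkpoint")
    return events

print(events_at_step(1_600))   # ['learn']
print(events_at_step(48_000))  # ['learn', 'update_target']
print(events_at_step(50_000))  # ['learn', 'evaluate_and_checkpoint']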
Example #5
def train(args, env):
    action_space = env.action_space.n
    print("show action space", action_space)
    print("state space", env.observation_space)
    # Agent
    dqn_1 = Agent(args, env)
    dqn_2 = Agent(args, env)

    results_dir = os.path.join('results', args.id)
    print("result dir", results_dir)

    T, done = 0, True
    # If a model is provided and evaluate is false, presumably we want to resume, so try to load memory
    print(" ags training", args.continue_training)

    args.continue_training = False  # NOTE: hard-coded override of the command-line flag
    if args.continue_training:
        print("Continue Training Load buffer 1 ...")

        args.memory = results_dir + "/val_mem_1/memory.pkl"
        mem_1 = load_memory(args.memory, args.disable_bzip_memory)
        val_mem_1 = ReplayMemory(args, args.evaluation_size)
        print("loaded memory buffer 1")
        print("Continue Training Load buffer 2 ...")
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        mem_2 = load_memory(args.memory, args.disable_bzip_memory)
        val_mem_2 = ReplayMemory(args, args.evaluation_size)
        print("loaded memory buffer 2")

    else:
        print("use empty Buffers")
        args.memory = results_dir + "/val_mem_1/memory.pkl"
        path = results_dir + "/val_mem_1"
        print("save memory", args.memory)
        os.makedirs(path, exist_ok=True)
        val_mem_1 = ReplayMemory(args, args.evaluation_size)
        mem_1 = ReplayMemory(args, args.memory_capacity)
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        path = results_dir + "/val_mem_2"
        print("save memory", args.memory)
        os.makedirs(path, exist_ok=True)
        val_mem_2 = ReplayMemory(args, args.evaluation_size)
        mem_2 = ReplayMemory(args, args.memory_capacity)

    priority_weight_increase = (1 - args.priority_weight) / (args.T_max -
                                                             args.learn_start)
    metrics = {
        'steps': [],
        'rewards': [],
        'Qs': [],
        'step_rewards': [],
        'train_rewards': [],
        'best_avg_reward': -float('inf')
    }

    args.continue_training = True

    def write_into_file(text, file_name='document.csv'):
        """
        """
        with open(file_name, 'a', newline='\n') as fd:
            fd.write(str(text) + "\n")

    def log(s):
        text = '[' + str(
            datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s
        write_into_file(text)
        print(text)

    if torch.cuda.is_available():
        print("cuda")

    def save_memory(memory, memory_path, disable_bzip):
        if disable_bzip:
            with open(memory_path, 'wb') as pickle_file:
                pickle.dump(memory, pickle_file)
        else:
            with bz2.open(memory_path, 'wb') as zipped_pickle_file:
                pickle.dump(memory, zipped_pickle_file)

    ("Create eval memory of size {} ".format(args.evaluation_size))
    # Construct validation memory

    size = 84
    print("Fill eval memory")

    # fill both memories at same time
    # use the reward function for each
    try:
        while T < args.evaluation_size:
            T += 1
            print("steps ", T)
            if done:
                t = 0
                done = False
                state = env.reset()
                state = torch.tensor(state,
                                     dtype=torch.float32,
                                     device=args.device).div_(255)
                zeros = torch.zeros_like(state)
                state_buffer = deque([], maxlen=args.history_length)
                state_buffer.append(zeros)
                state_buffer.append(zeros)
                state_buffer.append(zeros)
                state_buffer.append(state)
                state = torch.stack(list(state_buffer), 0)
            t += 1
            if t == args.max_episode_length:
                #if t == 5:
                t = 0
                done = True
            next_state, _, _, _ = env.step(np.random.randint(0, action_space))

            val_mem_1.append(state, None, None, done)
            val_mem_2.append(state, None, None, done)

            next_state = torch.tensor(next_state,
                                      dtype=torch.float32,
                                      device=args.device).div_(255)
            state_buffer.append(next_state)
            state = torch.stack(list(state_buffer), 0)
        eps_1 = 1
        eps_end_1 = 0.05
        eps_decay_1 = 0.999978  # reaches 10% at 105000

        eps_2 = 1
        eps_end_2 = 0.05
        eps_decay_2 = 0.999978  # reaches 10% at 105000
        #args.evaluate = True
        if args.evaluate:
            print("Test")
            dqn_1.eval()  # Set DQN (online network) to evaluation mode
            #avg_reward, avg_Q = test(args, 0, dqn_1, val_mem_1, metrics, results_dir, env, evaluate=True)  # Test
            avg_reward, avg_Q = test(args, T, dqn_1, val_mem_1, metrics,
                                     results_dir, env)  # Test
            print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                  str(avg_Q))
        else:
            if args.continue_training:
                print("Start Training")
                T = args.learn_start + 500
            # Training loop
            dqn_1.train()
            dqn_2.train()
            episode = 0
            episode_reward = 0
            mean_reward = deque(maxlen=100)
            plot_rewards = []
            print("Fill both memory buffers ")
            while T < args.learn_start:
                if T % args.max_episode_length == 0:
                    state, done = env.reset(), False
                    state = torch.tensor(state,
                                         dtype=torch.float32,
                                         device=args.device).div_(255)
                    zeros = torch.zeros_like(state)
                    state_buffer = deque([], maxlen=args.history_length)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(state)
                    state = torch.stack(list(state_buffer), 0)
                # choose action at random
                action = np.random.randint(0, action_space)
                next_state, reward, done, reward_2 = env.step(action)  # Step
                text = "Step {} of {} ".format(T, args.learn_start)
                print(text, end='\r', file=sys.stdout, flush=True)
                # set done on the last transition
                if (T + 1) % args.max_episode_length == 0:
                    done = True
                mem_1.append(state, action, reward, done)
                mem_2.append(state, action, reward_2, done)
                next_state = torch.tensor(next_state,
                                          dtype=torch.float32,
                                          device=args.device).div_(255)
                state_buffer.append(next_state)
                state = torch.stack(list(state_buffer), 0)
                T += 1
                if T >= args.learn_start:
                    args.memory = results_dir + "/val_mem_1/memory.pkl"
                    print("save memory 1", args.memory)
                    save_memory(mem_1, args.memory, args.disable_bzip_memory)
                    args.memory = results_dir + "/val_mem_2/memory.pkl"
                    print("save memory 2", args.memory)
                    save_memory(mem_2, args.memory, args.disable_bzip_memory)
                    break
            print("Start Training")
            #for T in tqdm.trange(args.learn_start, args.T_max + 1):
            for T in tqdm.trange(0, args.T_max + 1):
                if T % args.max_episode_length == 0:
                    mean_reward.append(episode_reward)
                    print("Epiosde: {}  Reward: {} Mean Reward: {}  Goal1 {}".
                          format(episode, episode_reward, np.mean(mean_reward),
                                 env.goal_counter_1))
                    plot_rewards.append(np.mean(mean_reward))
                    save_and_plot(T, plot_rewards)
                    episode_reward = 0
                    episode += 1
                    state, done = env.reset(), False
                    state = torch.tensor(state,
                                         dtype=torch.float32,
                                         device=args.device).div_(255)
                    zeros = torch.zeros_like(state)
                    state_buffer = deque([], maxlen=args.history_length)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(state)
                    state = torch.stack(list(state_buffer), 0)
                    g = 0
                    set_input = True
                    secondTask = False

                if T % args.replay_frequency == 0:
                    pass
                    #dqn.reset_noise()  # Draw a new set of noisy weights
                """
                if env.task_one_complete or secondTask:
                    action = dqn_2.act_e_greedy(state, eps_2)  # Choose an action greedily (with noisy weights)
                    secondTask = True
                else:
                    action = dqn_1.act_e_greedy(state, eps_1)  # Choose an action greedily (with noisy weights)
                """
                if set_input:
                    set_input = False
                    g = input("Enter action : ")
                    action = int(g)
                    g = input("Enter steps : ")
                    g = int(g)
                if g <= 0:
                    set_input = True
                g -= 1

                #print("step : {} action: {} eps: {}".format(T, action, eps))
                next_state, reward, done, reward_2 = env.step(action)  # Step

                if args.reward_clip > 0:
                    reward = max(min(reward, args.reward_clip),
                                 -args.reward_clip)  # Clip rewards
                    reward_2 = max(min(reward_2, args.reward_clip),
                                   -args.reward_clip)  # Clip rewards

                if env.task_one_complete or secondTask:
                    episode_reward += reward_2
                    eps_2 = max(eps_end_2, eps_decay_2 * eps_2)
                    mem_2.priority_weight = min(
                        mem_2.priority_weight + priority_weight_increase,
                        1)  # Anneal importance sampling weight β to 1
                else:
                    episode_reward += reward
                    eps_1 = max(eps_end_1, eps_decay_1 * eps_1)
                    mem_1.priority_weight = min(
                        mem_1.priority_weight + priority_weight_increase,
                        1)  # Anneal importance sampling weight β to 1

                #print(reward)
                #print(reward_2)
                # In case the last action set done to True
                if (T + 1) % args.max_episode_length == 0:
                    done = True

                mem_1.append(state, action, reward,
                             done)  # Append transition to memory
                mem_2.append(state, action, reward_2,
                             done)  # Append transition to memory

                # Train and test

                next_state = torch.tensor(next_state,
                                          dtype=torch.float32,
                                          device=args.device).div_(255)
                # print("Main shape of  next_state", next_state.shape)
                state_buffer.append(next_state)
                state = torch.stack(list(state_buffer), 0)
                continue  # NOTE: skips the learning, evaluation and checkpointing code below
                # print("Main shape of  state", state.shape)
                if T % args.replay_frequency == 0:
                    dqn_1.learn(
                        mem_1
                    )  # Train with n-step distributional double-Q learning
                    dqn_2.learn(
                        mem_2
                    )  # Train with n-step distributional double-Q learning

                if T % args.evaluation_interval == 0:
                    dqn_1.eval()  # Set DQN (online network) to evaluation mode
                    print("Eval epsilon 1 {} epsilon 2 {} ".format(
                        eps_1, eps_2))
                    avg_reward, avg_Q = test(args, T, dqn_1, val_mem_1,
                                             metrics, results_dir, env,
                                             1)  # Test
                    log('T = ' + str(T) + ' / ' + str(args.T_max) +
                        ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                        str(avg_Q))
                    dqn_1.train(
                    )  # Set DQN (online network) back to training mode
                    dqn_2.eval()  # Set DQN (online network) to evaluation mode
                    avg_reward, avg_Q = test(args, T, dqn_2, val_mem_2,
                                             metrics, results_dir, env,
                                             2)  # Test
                    log('T = ' + str(T) + ' / ' + str(args.T_max) +
                        ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                        str(avg_Q))
                    dqn_2.train(
                    )  # Set DQN (online network) back to training mode

                # Update target network
                if T % args.target_update == 0:
                    dqn_1.update_target_net()
                    dqn_2.update_target_net()

                # checkpoint the network
                if (args.checkpoint_interval !=
                        0) and (T % args.checkpoint_interval == 0):
                    #print("save memory", args.memory)
                    #save_memory(mem, args.memory, args.disable_bzip_memory)
                    print("epsilon 1: ", eps_1)
                    print("epsilon 2: ", eps_2)
                    print("Save model at ", results_dir)
                    dqn_1.save(results_dir, '{}-checkpoint.pth'.format(T))
                    dqn_2.save(results_dir, '{}-2-checkpoint.pth'.format(T))
    except KeyboardInterrupt:
        print("Keybaord error")
    finally:
        print("save state....")
        print("Save model at ", results_dir)
        dqn_1.save(results_dir, '{}-checkpoint.pth'.format(T))
        dqn_2.save(results_dir, '{}-2-checkpoint.pth'.format(T))
        args.memory = results_dir + "/val_mem_1/memory.pkl"
        print("save memory 1  ...", args.memory)
        save_memory(mem_1, args.memory, args.disable_bzip_memory)
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        print("save memory 2 ...", args.memory)
        save_memory(mem_2, args.memory, args.disable_bzip_memory)
        print("Save model at ", results_dir)
        dqn_1.save(results_dir, '{}-checkpoint.pth'.format(T))
        dqn_2.save(results_dir, '{}-2-checkpoint.pth'.format(T))
        print("... done Saving State")
        sys.exit()
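A quick check of the epsilon schedule in Example #5: each step multiplies epsilon by eps_decay = 0.999978 (with a floor of eps_end), so epsilon falls from 1.0 to roughly 10% after about ln(0.1) / ln(0.999978) ≈ 105,000 decay steps, matching the comment next to eps_decay_1. A standalone sketch:

import math

eps, eps_end, eps_decay = 1.0, 0.05, 0.999978

# Closed-form estimate of how many decays are needed to reach 10%.
steps_to_10_percent = math.log(0.1) / math.log(eps_decay)
print(round(steps_to_10_percent))  # ~104662

# The schedule as Example #5 applies it, one multiplication per step.
for _ in range(105_000):
    eps = max(eps_end, eps_decay * eps)
print(round(eps, 3))               # ~0.099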