def train():
    global env_name, save_name, agent_args, env_real, env_sim, nets
    # instantiate the real and simulated environments, the GAT model bridging
    # them, and the agent that is trained in the simulated environment
    env_real = env_real.Env_real(False)
    env_sim = env_sim.Env_sim(True)
    GAT_model = nets.GAT_net(env_real, env_sim, GAT_args)
    agent = Agent(env_sim, agent_args)

    # wandb.init(project=save_name)
    accum_step = 0
    avg_temp_cost = 0

    v_loss_logger = Logger(save_name, 'v_loss')
    cost_v_loss_logger = Logger(save_name, 'cost_v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    cost_logger = Logger(save_name, 'cost')
    max_steps = 2000
    max_ep_len = 1000
    episodes = int(max_steps / max_ep_len)
    epochs = 2  #50
    save_freq = 1

    log_length = 10
    p_objectives = deque(maxlen=log_length)
    c_objectives = deque(maxlen=log_length)
    v_losses = deque(maxlen=log_length)
    cost_v_losses = deque(maxlen=log_length)
    kl_divergence = deque(maxlen=log_length)
    scores = deque(maxlen=log_length * episodes)
    costs = deque(maxlen=log_length * episodes)

    is_backup = False
    backup_name = '{}/backup.pkl'.format(save_name)
    if os.path.isfile(backup_name):
        #input_value = raw_input('backup file exists. wanna continue the last work?( y/n )')
        #if input_value != 'n':
        #    is_backup = True
        is_backup = True
    if is_backup:
        with open(backup_name, 'rb') as f:
            backup_list = pickle.load(f)
        start_iter = backup_list[0]
    else:
        start_iter = 0
        backup_list = [start_iter]

    for epoch in range(start_iter, epochs):
        #continue?
        print("=" * 20)
        print("Epoch : {}".format(epoch + 1))
        #input_value = raw_input("wanna continue episodes?( y/n )")
        #if input_value == 'n':
        #    break

        states = []
        actions = []
        targets = []
        cost_targets = []
        gaes = []
        cost_gaes = []
        avg_costs = []
        ep_step = 0
        while ep_step < max_steps:
            #input_value = raw_input("ready?")

            state = env_sim.reset()
            done = False
            score = 0
            cost = 0
            step = 0
            temp_rewards = []
            temp_costs = []
            values = []
            cost_values = []
            while True:
                if rospy.is_shutdown():
                    sys.exit()
                step += 1
                ep_step += 1
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
                # GAT action transformation: the forward model predicts the next
                # state for the clipped action, and the backward model recovers
                # the action that reproduces that state in the simulator
                transformed_next_state = GAT_model.forward_transform(
                    state, clipped_action)
                transformed_action = GAT_model.backward_transform(
                    state, transformed_next_state)
                next_state, reward, done, info = env_sim.step(
                    transformed_action)

                predict_cost = info['continuous_cost']

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                temp_costs.append(predict_cost)
                values.append(value)
                cost_values.append(cost_value)

                state = next_state
                score += reward
                cost += info.get('cost', 0)

                if done or step >= max_ep_len:
                    break

            print("step : {}, score : {}".format(step, score))
            if step >= max_ep_len:
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
            else:
                # episode terminated before max_ep_len, so bootstrap with zero values
                value = 0
                cost_value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            next_cost_values = cost_values[1:] + [cost_value]
            temp_cost_gaes, temp_cost_targets = agent.get_gaes_targets(
                temp_costs, cost_values, next_cost_values)
            avg_costs.append(np.mean(temp_costs))
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            cost_targets += list(temp_cost_targets)
            cost_gaes += list(temp_cost_gaes)

            score_logger.write([step, score])
            cost_logger.write([step, cost])
            scores.append(score)
            costs.append(cost)

            accum_step += step
            avg_temp_cost = np.mean(temp_costs)
            # wandb.log({'step': accum_step, 'score':score, 'cost':cost, 'avg_temp_cost':avg_temp_cost})

        trajs = [
            states, actions, targets, cost_targets, gaes, cost_gaes, avg_costs
        ]
        v_loss, cost_v_loss, p_objective, cost_objective, kl = agent.train(
            trajs)

        v_loss_logger.write([ep_step, v_loss])
        cost_v_loss_logger.write([ep_step, cost_v_loss])
        kl_logger.write([ep_step, kl])

        p_objectives.append(p_objective)
        c_objectives.append(cost_objective)
        v_losses.append(v_loss)
        cost_v_losses.append(cost_v_loss)
        kl_divergence.append(kl)

        print(np.mean(scores), np.mean(costs), np.mean(v_losses),
              np.mean(cost_v_losses), np.mean(kl_divergence),
              np.mean(c_objectives))
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            cost_v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
            cost_logger.save()

        #backup
        backup_list[0] = epoch + 1
        with open(backup_name, 'wb') as f:
            pickle.dump(backup_list, f)
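
The loop above bootstraps the final value (zero on early termination, the critic's estimate otherwise) and feeds per-episode rewards and values into `agent.get_gaes_targets`. A minimal GAE-style sketch of such a routine is given below; the function body, `gamma`, and `lam` are illustrative assumptions, not the repository's `Agent` implementation.

# Hypothetical sketch of a GAE-style advantage/target computation; gamma and lam
# are assumed hyperparameters, and Agent.get_gaes_targets in this repository may differ.
import numpy as np

def get_gaes_targets(rewards, values, next_values, gamma=0.99, lam=0.97):
    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = [r + gamma * nv - v for r, v, nv in zip(rewards, values, next_values)]
    gaes = np.array(deltas, dtype=np.float64)
    # backward accumulation: A_t = delta_t + gamma * lam * A_{t+1}
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] += gamma * lam * gaes[t + 1]
    # value targets: A_t + V(s_t)
    targets = gaes + np.array(values, dtype=np.float64)
    return gaes, targets

The same routine is reused for the cost stream, with `temp_costs` and the cost critic's values in place of the rewards and reward values.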
Example #2
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    cost_v_loss_logger = Logger(save_name, 'cost_v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    cost_logger = Logger(save_name, 'cost')
    graph = Graph(
        1000, save_name,
        ['score', 'cost', 'value loss', 'cost value loss', 'kl divergence'])
    max_steps = 4000
    max_ep_len = 1000
    episodes = int(max_steps / max_ep_len)
    epochs = 500
    save_freq = 10

    log_length = 10
    p_objectives = deque(maxlen=log_length)
    c_objectives = deque(maxlen=log_length)
    v_losses = deque(maxlen=log_length)
    cost_v_losses = deque(maxlen=log_length)
    kl_divergence = deque(maxlen=log_length)
    scores = deque(maxlen=log_length * episodes)
    costs = deque(maxlen=log_length * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        cost_targets = []
        gaes = []
        cost_gaes = []
        avg_costs = []
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            cost = 0
            step = 0
            temp_rewards = []
            temp_costs = []
            values = []
            cost_values = []
            while True:
                step += 1
                ep_step += 1
                assert env.observation_space.contains(state)
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
                assert env.action_space.contains(clipped_action)
                next_state, reward, done, info = env.step(clipped_action)

                # continuous cost predicted from the robot's distance to the hazards
                h_dist = hazard_dist(env.hazards_pos, env.world.robot_pos())
                predict_cost = get_cost(h_dist)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                temp_costs.append(predict_cost)
                values.append(value)
                cost_values.append(cost_value)

                state = next_state
                score += reward
                cost += info.get('cost', 0)  # keep the actual env cost in the log so it can be compared against the discrete cost

                if done or step >= max_ep_len:
                    break

            if step >= max_ep_len:
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
            else:
                value = 0
                cost_value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            next_cost_values = cost_values[1:] + [cost_value]
            temp_cost_gaes, temp_cost_targets = agent.get_gaes_targets(
                temp_costs, cost_values, next_cost_values)
            avg_costs.append(np.mean(temp_costs))
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            cost_targets += list(temp_cost_targets)
            cost_gaes += list(temp_cost_gaes)

            score_logger.write([step, score])
            cost_logger.write([step, cost])
            scores.append(score)
            costs.append(cost)

        trajs = [
            states, actions, targets, cost_targets, gaes, cost_gaes, avg_costs
        ]
        v_loss, cost_v_loss, p_objective, cost_objective, kl = agent.train(
            trajs)

        v_loss_logger.write([ep_step, v_loss])
        cost_v_loss_logger.write([ep_step, cost_v_loss])
        kl_logger.write([ep_step, kl])

        p_objectives.append(p_objective)
        c_objectives.append(cost_objective)
        v_losses.append(v_loss)
        cost_v_losses.append(cost_v_loss)
        kl_divergence.append(kl)

        print(np.mean(scores), np.mean(costs), np.mean(v_losses),
              np.mean(cost_v_losses), np.mean(kl_divergence),
              np.mean(c_objectives))
        graph.update([
            np.mean(scores),
            np.mean(costs),
            np.mean(v_losses),
            np.mean(cost_v_losses),
            np.mean(kl_divergence)
        ])
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            cost_v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
            cost_logger.save()

    graph.update(None, finished=True)
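
This variant derives its continuous cost signal from `hazard_dist` and `get_cost`, which are defined elsewhere in the script. A hedged sketch of what such helpers could look like is shown below; the hazard radius and the sigmoid sharpness are illustrative assumptions, not values taken from the repository.

# Hypothetical helpers for a continuous cost proxy based on the distance from the
# robot to the nearest hazard; hazard_size and sharpness are assumed constants.
import numpy as np

def hazard_dist(hazards_pos, robot_pos):
    # distance from the robot to the closest hazard center in the x-y plane
    dists = [np.linalg.norm(np.array(pos[:2]) - np.array(robot_pos[:2]))
             for pos in hazards_pos]
    return min(dists)

def get_cost(h_dist, hazard_size=0.2, sharpness=30.0):
    # smooth cost that approaches 1 inside a hazard and decays toward 0 far away
    return 1.0 / (1.0 + np.exp(sharpness * (h_dist - hazard_size)))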
Example #3
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    p_loss_logger = Logger(save_name, 'p_loss')
    v_loss_logger = Logger(save_name, 'v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    graph = Graph(
        1000, save_name,
        ['score', 'policy loss', 'value loss', 'kl divergence', 'entropy'])
    episodes = 10
    max_steps = 4000
    max_ep_len = min(1000, env.spec.max_episode_steps)
    epochs = int(1e5)
    save_freq = 10

    save_period = 10
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    kl_divergence = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        next_states = []
        rewards = []
        gaes = []
        ep_step = 0
        #for episode in range(episodes):
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            values = []
            while True:
                step += 1
                ep_step += 1
                action, clipped_action, value = agent.get_action(state, True)
                next_state, reward, done, info = env.step(clipped_action)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                next_states.append(next_state)
                rewards.append(reward)
                values.append(value)

                state = next_state
                score += reward

                if done or step >= max_ep_len:
                    break

            if step >= max_ep_len:
                action, clipped_action, value = agent.get_action(state, True)
            else:  # ended early: the agent died before max_ep_len, so the bootstrap value must be 0
                value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            targets += list(temp_targets)
            gaes += list(temp_gaes)

            score_logger.write([step, score])
            scores.append(score)

        trajs = [states, actions, targets, next_states, rewards, gaes]
        p_loss, v_loss, kl, entropy = agent.train(trajs)

        p_loss_logger.write([ep_step, p_loss])
        v_loss_logger.write([ep_step, v_loss])
        kl_logger.write([ep_step, kl])
        p_losses.append(p_loss)
        v_losses.append(v_loss)
        kl_divergence.append(kl)
        entropies.append(entropy)

        print(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
              np.mean(kl_divergence), np.mean(entropies))
        graph.update([
            np.mean(scores),
            np.mean(p_losses),
            np.mean(v_losses),
            np.mean(kl_divergence),
            np.mean(entropies)
        ])
        if (epoch + 1) % save_freq == 0:
            agent.save()
            p_loss_logger.save()
            v_loss_logger.save()
            kl_logger.save()
            score_logger.save()

    graph.update(None, finished=True)
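
All three variants log scalar pairs through `Logger(save_name, tag)` objects that expose `write` and `save`. A minimal file-backed sketch with that interface is shown below; the directory layout and pickle format are assumptions for illustration, not the repository's actual Logger.

# Minimal Logger sketch with the write/save interface used above; file names and
# storage format are assumed, not taken from the repository.
import os
import pickle

class Logger:
    def __init__(self, save_name, tag):
        self.dir_name = save_name
        self.tag = tag
        self.records = []  # list of [x, y] rows, e.g. [step, score]
        if not os.path.isdir(self.dir_name):
            os.makedirs(self.dir_name)

    def write(self, record):
        # buffer one [x, value] pair
        self.records.append(list(record))

    def save(self):
        # flush all buffered records to <save_name>/<tag>_log.pkl
        path = os.path.join(self.dir_name, '{}_log.pkl'.format(self.tag))
        with open(path, 'wb') as f:
            pickle.dump(self.records, f)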