Example #1
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    if env_name == 'DobroHalfCheetah-v0':
        env.unwrapped.initialize(is_render=False)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    p_loss_logger = Logger(save_name, 'p_loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), agent.name)
    episodes = int(5e5)
    save_freq = 1

    save_period = 1000
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period)

    for episode in range(episodes):
        state = env.reset()
        agent.actor_noise.reset()
        done = False
        score = 0
        step = 0

        while not done:
            step += 1
            action = agent.get_action(state, True)
            next_state, reward, done, info = env.step(action)
            agent.replay_memory.append([
                np.array(state, np.float32), action, reward, done,
                np.array(next_state, np.float32)
            ])
            ########################

            if len(agent.replay_memory) > agent.train_start:
                v_loss, p_loss = agent.train()
                v_loss_logger.write([1, v_loss])
                p_loss_logger.write([1, p_loss])
                p_losses.append(p_loss)
                v_losses.append(v_loss)
                value = agent.get_value(state, action)
                entropies.append(value)  # note: the Q-value estimate is logged in the graph's entropy slot
                scores.append(reward)
                graph.update(np.mean(scores), np.mean(p_losses),
                             np.mean(v_losses), np.mean(entropies))
            state = next_state
            score += reward

        print(episode, score, agent.epsilon)
        score_logger.write([step, score])
        if (episode + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            p_loss_logger.save()
            score_logger.save()

    graph.update(0, 0, 0, 0, finished=True)
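
Example #1 assumes module-level globals (env_name, save_name, agent_args) and project-local Agent, Logger, and Graph classes that are not shown. A minimal, hypothetical sketch of the surrounding setup the function relies on; the run name and agent hyperparameters below are placeholders, not values from the original script:

import gym
import numpy as np
from collections import deque

env_name = 'DobroHalfCheetah-v0'     # environment id referenced in the snippet
save_name = 'halfcheetah_ddpg'       # hypothetical run/log name
agent_args = {'train_start': 10000}  # hypothetical agent hyperparameters

if __name__ == '__main__':
    train()
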
Example #2
def train():
    global env, env_name
    env_name = env_name.split('-')[0]
    agent = Agent(env, env_name)
    loss_logger = Logger(env_name, 'loss')
    score_logger = Logger(env_name, 'score')
    action_low = env.action_space.low[0]
    action_high = env.action_space.high[0]
    episodes = int(5e2)
    avg_Q = deque(maxlen=200)

    for episode in range(episodes):
        state = env.reset()
        done = False
        score = 0
        step = 0

        while not done:
            step += 1
            action = agent.get_action(state)
            # map the discrete action index into [action_low, action_high]
            a_t = (action/(agent.n_action-1))
            a_t = a_t*(action_high - action_low) + action_low
            next_state, reward, done, info = env.step([a_t])

            agent.replay_memory.append([np.array(state, np.float32), action, reward, done, np.array(next_state, np.float32)])
            ########################

            # start training once the replay memory has accumulated enough samples
            if len(agent.replay_memory) > agent.train_start:
                Q, loss = agent.train()
                loss_logger.write([1, loss])
                avg_Q.append(Q)
            state = next_state
            score += reward

        #print(episode, accumulate+100, self.epsilon)
        print(episode, score, agent.epsilon, np.mean(avg_Q))
        agent.update_target_model()
        score_logger.write([step, score])
        if (episode+1)%agent.save_freq == 0:
            agent.save()
            loss_logger.save()
            score_logger.save()
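
In Example #2 the agent outputs a discrete action index that is rescaled into the environment's continuous action range before env.step. A small worked check of that mapping with illustrative values (to_continuous is not part of the original code):

def to_continuous(action, n_action, action_low, action_high):
    # index 0 maps to action_low, index n_action - 1 maps to action_high
    a_t = action / (n_action - 1)
    return a_t * (action_high - action_low) + action_low

print(to_continuous(0, 5, -2.0, 2.0))  # -2.0
print(to_continuous(2, 5, -2.0, 2.0))  #  0.0
print(to_continuous(4, 5, -2.0, 2.0))  #  2.0
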
Example #3
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    cost_v_loss_logger = Logger(save_name, 'cost_v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    cost_logger = Logger(save_name, 'cost')
    graph = Graph(
        1000, save_name,
        ['score', 'cost', 'value loss', 'cost value loss', 'kl divergence'])
    max_steps = 4000
    max_ep_len = 1000
    episodes = int(max_steps / max_ep_len)
    epochs = 500
    save_freq = 10

    log_length = 10
    p_objectives = deque(maxlen=log_length)
    c_objectives = deque(maxlen=log_length)
    v_losses = deque(maxlen=log_length)
    cost_v_losses = deque(maxlen=log_length)
    kl_divergence = deque(maxlen=log_length)
    scores = deque(maxlen=log_length * episodes)
    costs = deque(maxlen=log_length * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        cost_targets = []
        gaes = []
        cost_gaes = []
        avg_costs = []
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            cost = 0
            step = 0
            temp_rewards = []
            temp_costs = []
            values = []
            cost_values = []
            while True:
                step += 1
                ep_step += 1
                assert env.observation_space.contains(state)
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
                assert env.action_space.contains(clipped_action)
                next_state, reward, done, info = env.step(clipped_action)

                # predicted (continuous) cost from the hazard distance
                h_dist = hazard_dist(env.hazards_pos, env.world.robot_pos())
                predict_cost = get_cost(h_dist)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                temp_costs.append(predict_cost)
                values.append(value)
                cost_values.append(cost_value)

                state = next_state
                score += reward
                cost += info.get('cost', 0)  # log the real environment cost so it can be compared with the discretized predicted cost

                if done or step >= max_ep_len:
                    break

            if step >= max_ep_len:
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
            else:
                value = 0
                cost_value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            next_cost_values = cost_values[1:] + [cost_value]
            temp_cost_gaes, temp_cost_targets = agent.get_gaes_targets(
                temp_costs, cost_values, next_cost_values)
            avg_costs.append(np.mean(temp_costs))
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            cost_targets += list(temp_cost_targets)
            cost_gaes += list(temp_cost_gaes)

            score_logger.write([step, score])
            cost_logger.write([step, cost])
            scores.append(score)
            costs.append(cost)

        trajs = [
            states, actions, targets, cost_targets, gaes, cost_gaes, avg_costs
        ]
        v_loss, cost_v_loss, p_objective, cost_objective, kl = agent.train(
            trajs)

        v_loss_logger.write([ep_step, v_loss])
        cost_v_loss_logger.write([ep_step, cost_v_loss])
        kl_logger.write([ep_step, kl])

        p_objectives.append(p_objective)
        c_objectives.append(cost_objective)
        v_losses.append(v_loss)
        cost_v_losses.append(cost_v_loss)
        kl_divergence.append(kl)

        print(np.mean(scores), np.mean(costs), np.mean(v_losses),
              np.mean(cost_v_losses), np.mean(kl_divergence),
              np.mean(c_objectives))
        graph.update([
            np.mean(scores),
            np.mean(costs),
            np.mean(v_losses),
            np.mean(cost_v_losses),
            np.mean(kl_divergence)
        ])
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            cost_v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
            cost_logger.save()

    graph.update(None, finished=True)
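
Examples #3 through #5 call agent.get_gaes_targets(rewards, values, next_values) to turn one episode's rewards and value estimates into advantages and value targets. The Agent implementation is not shown here; the sketch below is a standard GAE(lambda) computation under that assumption, with hypothetical gamma and lam hyperparameters:

import numpy as np

def get_gaes_targets(rewards, values, next_values, gamma=0.99, lam=0.97):
    # one-step TD residuals; the caller already placed the bootstrap value
    # (or 0 for a true terminal state) in the last entry of next_values
    deltas = np.array(rewards) + gamma * np.array(next_values) - np.array(values)
    gaes = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        gaes[t] = running
    targets = gaes + np.array(values)  # value target = advantage + baseline
    return gaes, targets
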
Example #4
def train():
    global env_name, save_name, agent_args, env_real, env_sim, nets
    env_real = env_real.Env_real(False)
    env_sim = env_sim.Env_sim(True)
    GAT_model = nets.GAT_net(env_real, env_sim, GAT_args)
    agent = Agent(env_sim, agent_args)

    # wandb.init(project=save_name)
    accum_step = 0
    avg_temp_cost = 0

    v_loss_logger = Logger(save_name, 'v_loss')
    cost_v_loss_logger = Logger(save_name, 'cost_v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    cost_logger = Logger(save_name, 'cost')
    max_steps = 2000
    max_ep_len = 1000
    episodes = int(max_steps / max_ep_len)
    epochs = 2  #50
    save_freq = 1

    log_length = 10
    p_objectives = deque(maxlen=log_length)
    c_objectives = deque(maxlen=log_length)
    v_losses = deque(maxlen=log_length)
    cost_v_losses = deque(maxlen=log_length)
    kl_divergence = deque(maxlen=log_length)
    scores = deque(maxlen=log_length * episodes)
    costs = deque(maxlen=log_length * episodes)

    is_backup = False
    backup_name = '{}/backup.pkl'.format(save_name)
    if os.path.isfile(backup_name):
        #input_value = raw_input('backup file exists. wanna continue the last work?( y/n )')
        #if input_value != 'n':
        #    is_backup = True
        is_backup = True
    if is_backup:
        with open(backup_name, 'rb') as f:
            backup_list = pickle.load(f)
        start_iter = backup_list[0]
    else:
        start_iter = 0
        backup_list = [start_iter]

    for epoch in range(start_iter, epochs):
        #continue?
        print("=" * 20)
        print("Epoch : {}".format(epoch + 1))
        #input_value = raw_input("wanna continue episodes?( y/n )")
        #if input_value == 'n':
        #    break

        states = []
        actions = []
        targets = []
        cost_targets = []
        gaes = []
        cost_gaes = []
        avg_costs = []
        ep_step = 0
        while ep_step < max_steps:
            #input_value = raw_input("ready?")

            state = env_sim.reset()
            done = False
            score = 0
            cost = 0
            step = 0
            temp_rewards = []
            temp_costs = []
            values = []
            cost_values = []
            while True:
                if rospy.is_shutdown():
                    sys.exit()
                step += 1
                ep_step += 1
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
                # action transformer by GAT
                transformed_next_state = GAT_model.forward_transform(
                    state, clipped_action)
                transformed_action = GAT_model.backward_transform(
                    state, transformed_next_state)
                next_state, reward, done, info = env_sim.step(
                    transformed_action)

                predict_cost = info['continuous_cost']

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                temp_costs.append(predict_cost)
                values.append(value)
                cost_values.append(cost_value)

                state = next_state
                score += reward
                cost += info.get('cost', 0)

                if done or step >= max_ep_len:
                    break

            print("step : {}, score : {}".format(step, score))
            if step >= max_ep_len:
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
            else:
                value = 0
                cost_value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            next_cost_values = cost_values[1:] + [cost_value]
            temp_cost_gaes, temp_cost_targets = agent.get_gaes_targets(
                temp_costs, cost_values, next_cost_values)
            avg_costs.append(np.mean(temp_costs))
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            cost_targets += list(temp_cost_targets)
            cost_gaes += list(temp_cost_gaes)

            score_logger.write([step, score])
            cost_logger.write([step, cost])
            scores.append(score)
            costs.append(cost)

            accum_step += step
            avg_temp_cost = np.mean(temp_costs)
            # wandb.log({'step': accum_step, 'score':score, 'cost':cost, 'avg_temp_cost':avg_temp_cost})

        trajs = [
            states, actions, targets, cost_targets, gaes, cost_gaes, avg_costs
        ]
        v_loss, cost_v_loss, p_objective, cost_objective, kl = agent.train(
            trajs)

        v_loss_logger.write([ep_step, v_loss])
        cost_v_loss_logger.write([ep_step, cost_v_loss])
        kl_logger.write([ep_step, kl])

        p_objectives.append(p_objective)
        c_objectives.append(cost_objective)
        v_losses.append(v_loss)
        cost_v_losses.append(cost_v_loss)
        kl_divergence.append(kl)

        print(np.mean(scores), np.mean(costs), np.mean(v_losses),
              np.mean(cost_v_losses), np.mean(kl_divergence),
              np.mean(c_objectives))
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            cost_v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
            cost_logger.save()

        #backup
        backup_list[0] = epoch + 1
        with open(backup_name, 'wb') as f:
            pickle.dump(backup_list, f)
Example #5
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    p_loss_logger = Logger(save_name, 'p_loss')
    v_loss_logger = Logger(save_name, 'v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    graph = Graph(
        1000, save_name,
        ['score', 'policy loss', 'value loss', 'kl divergence', 'entropy'])
    episodes = 10
    max_steps = 4000
    max_ep_len = min(1000, env.spec.max_episode_steps)
    epochs = int(1e5)
    save_freq = 10

    save_period = 10
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    kl_divergence = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        next_states = []
        rewards = []
        gaes = []
        ep_step = 0
        #for episode in range(episodes):
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            values = []
            while True:
                step += 1
                ep_step += 1
                action, clipped_action, value = agent.get_action(state, True)
                next_state, reward, done, info = env.step(clipped_action)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                next_states.append(next_state)
                rewards.append(reward)
                values.append(value)

                state = next_state
                score += reward

                if done or step >= max_ep_len:
                    break

            if step >= max_ep_len:
                action, clipped_action, value = agent.get_action(state, True)
            else:  # the episode terminated before max_ep_len, so bootstrap with value = 0
                value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            targets += list(temp_targets)
            gaes += list(temp_gaes)

            score_logger.write([step, score])
            scores.append(score)

        trajs = [states, actions, targets, next_states, rewards, gaes]
        p_loss, v_loss, kl, entropy = agent.train(trajs)

        p_loss_logger.write([ep_step, p_loss])
        v_loss_logger.write([ep_step, v_loss])
        kl_logger.write([ep_step, kl])
        p_losses.append(p_loss)
        v_losses.append(v_loss)
        kl_divergence.append(kl)
        entropies.append(entropy)

        print(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
              np.mean(kl_divergence), np.mean(entropies))
        graph.update([
            np.mean(scores),
            np.mean(p_losses),
            np.mean(v_losses),
            np.mean(kl_divergence),
            np.mean(entropies)
        ])
        if (epoch + 1) % save_freq == 0:
            agent.save()
            p_loss_logger.save()
            v_loss_logger.save()
            kl_logger.save()
            score_logger.save()

    graph.update(None, finished=True)
Example #6
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=False)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    p_loss_logger = Logger(save_name, 'p_loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), agent.name)
    episodes = 10
    epochs = int(1e5)
    save_freq = 10

    save_period = 100
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        ep_step = 0
        for episode in range(episodes):
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            while not done:
                step += 1
                ep_step += 1
                action, clipped_action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(clipped_action)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)

                state = next_state
                score += reward

            score_logger.write([step, score])
            scores.append(score)
            temp_targets = np.zeros_like(temp_rewards)
            ret = 0
            for t in reversed(range(len(temp_rewards))):
                ret = temp_rewards[t] + agent.discount_factor * ret
                temp_targets[t] = ret
            targets += list(temp_targets)

        trajs = [states, actions, targets]
        v_loss, p_objective, kl = agent.train(trajs)

        v_loss_logger.write([ep_step, v_loss])
        p_loss_logger.write([ep_step, p_objective])
        p_losses.append(p_objective)
        v_losses.append(v_loss)
        entropies.append(kl)

        #print(v_loss, p_objective, kl)
        print(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
              np.mean(entropies))
        graph.update(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
                     np.mean(entropies))
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            p_loss_logger.save()
            score_logger.save()

    graph.update(0, 0, 0, 0, finished=True)
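
The inner loop of Example #6 computes discounted returns-to-go as the policy-gradient targets. A quick check of that loop on a toy reward sequence with a hypothetical discount factor of 0.9:

import numpy as np

temp_rewards = [1.0, 0.0, 2.0]
discount_factor = 0.9
temp_targets = np.zeros_like(temp_rewards)
ret = 0
for t in reversed(range(len(temp_rewards))):
    ret = temp_rewards[t] + discount_factor * ret
    temp_targets[t] = ret
print(temp_targets)  # [2.62 1.8  2.  ]
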
Example #7
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name,
                  ['score', 'policy loss', 'Q value loss', 'entropy'])
    max_steps = 4000
    max_ep_len = min(1000, env.spec.max_episode_steps)
    start_training_after_steps = 1000
    step_per_training = 50
    epochs = 1000
    save_freq = 1

    record_length = 10
    p_losses = deque(maxlen=record_length *
                     int(max_ep_len / step_per_training))
    q_losses = deque(maxlen=record_length *
                     int(max_ep_len / step_per_training))
    entropies = deque(maxlen=record_length *
                      int(max_ep_len / step_per_training))
    scores = deque(maxlen=record_length)

    total_step = 0
    for epoch in range(epochs):
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            score = 0
            step = 0
            while True:
                step += 1
                ep_step += 1
                total_step += 1
                action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(action)
                done = False if step >= max_ep_len else done

                agent.replay_memory.append(
                    [state, action, reward, float(done), next_state])

                if len(agent.replay_memory) > start_training_after_steps and (
                        total_step + 1) % step_per_training == 0:
                    for _ in range(step_per_training):
                        p_loss, q_loss, entropy = agent.train()
                    p_losses.append(p_loss)
                    q_losses.append(q_loss)
                    entropies.append(entropy)
                    print(np.mean(scores), np.mean(p_losses),
                          np.mean(q_losses), np.mean(entropies))

                state = next_state
                score += reward

                if done or step >= max_ep_len:
                    break

            score_logger.write([step, score])
            scores.append(score)

            graph.update([
                np.mean(scores),
                np.mean(p_losses),
                np.mean(q_losses),
                np.mean(entropies)
            ])

        if (epoch + 1) % save_freq == 0:
            agent.save()
            score_logger.save()

    graph.update(None, finished=True)
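
Example #7 forces done to False once step reaches max_ep_len, so time-limit truncations are stored as non-terminal transitions. A minimal illustration of why that matters for a bootstrapped Q target (q_target and its arguments are illustrative, not the agent's actual API):

def q_target(reward, done, next_q, gamma=0.99):
    # done == 1.0 only for true terminal states; truncated transitions
    # keep the bootstrapped next-state value
    return reward + gamma * (1.0 - done) * next_q

print(q_target(1.0, 0.0, 5.0))  # 5.95 -> bootstraps through the time limit
print(q_target(1.0, 1.0, 5.0))  # 1.0  -> would incorrectly cut the return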