Example #1
    def build_model(self):
        """
        Constructs the model architecture
        :return:
        """

        # Set seeds
        utils.set_global_seed(self.config.SEED,
                              use_parallelism=self.config.USE_PARALLELISM)

        # Input layer
        inputs = KL.Input(shape=self.config.INPUT_SHAPE)
        X = inputs

        # Set regularizer
        if self.config.REGULARIZER == "L1":
            reg_func = keras.regularizers.l1(
                self.config.REGULARIZATION_COEFFICIENT)
        elif self.config.REGULARIZER == "L2":
            reg_func = keras.regularizers.l2(
                self.config.REGULARIZATION_COEFFICIENT)
        else:
            raise ValueError(
                "Unknown regularizer: {}".format(self.config.REGULARIZER))

        # Hidden layers
        for L in self.config.ARCHITECTURE:
            if L[0] == "conv2d":
                X = KL.Conv2D(**L[1], kernel_regularizer=reg_func)(X)
                if L[2]["pooling"] is not None:
                    X = KL.MaxPool2D(pool_size=(2, 2))(X)
            elif L[0] == "dense":
                X = KL.Dense(**L[1], kernel_regularizer=reg_func)(X)

            # Activation functions
            if self.config.THETA_TRAINABLE:
                X = KL.PReLU(alpha_initializer='ones', shared_axes=[1, 2,
                                                                    3])(X)
            else:
                X = KL.Lambda(tunable_relu, arguments={"theta": self.theta})(X)

        # Output layer
        X = KL.Flatten()(X)
        outputs = KL.Dense(self.config.OUTPUT_SHAPE,
                           activation=self.config.OUTPUT_ACTIVATION)(X)

        # Create model, specify loss function, optimizer and metrics
        self.model = KM.Model(inputs=inputs, outputs=outputs)

        # Specify optimizer, learning rate schedule, etc and compile model
        opt = keras.optimizers.Adam()
        self.model.compile(optimizer=opt,
                           loss=self.config.LOSS,
                           metrics=self.config.METRICS)
    def test_kernel_computation():
        set_global_seed(10)
        X = np.asarray([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
        D = np.asarray([1, -0.5]).reshape(2, 1)

        with tf.Session() as sess:
            K_op = KAFNet.gauss_kernel(x=tf.convert_to_tensor(X),
                                       D=tf.convert_to_tensor(D))
            K = sess.run(K_op)

        K_true = np.exp(-np.asarray([[[0.9**2, 0.6**2], [0.8**2, 0.7**2]],
                                     [[0.7**2, 0.8**2], [0.6**2, 0.9**2]],
                                     [[0.5**2, 1.0**2], [0.4**2, 1.1**2]]]))

        try:
            np.testing.assert_array_almost_equal(K, K_true, decimal=4)
        except Exception as e:
            print(e)
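Both snippets above, like Example #7 below, call a project-level utils.set_global_seed(seed, use_parallelism=...) that is not shown. Below is a minimal sketch of what such a helper might look like in this TF1/Keras setting; the signature comes from the calls in the examples, but the body is an assumption, not the repository's actual implementation.

# Hypothetical sketch of set_global_seed for a TF1/Keras code base (assumption).
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras


def set_global_seed(seed, use_parallelism=True):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)  # TF1-style graph-level seed
    if not use_parallelism:
        # Single-threaded ops trade speed for run-to-run reproducibility.
        config = tf.ConfigProto(intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        keras.backend.set_session(tf.Session(config=config))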
Example #3
            log_prob = max(min_logprob,
                           np.log(prob) if prob != 0.0 else min_logprob)
            log_likelihood += log_prob

    perplexity = np.exp(-1 / N * log_likelihood)

    return perplexity


if __name__ == "__main__":

    # argparse
    args = get_validate_args()

    # set seed and device
    set_global_seed(args.seed)
    device = torch.device(args.device)

    # load data
    data = load_data(path=args.path_to_data, verbose=args.verbose)

    # max_length
    if args.max_length is not None:
        data = [sentence[:args.max_length] for sentence in data]

    # load vocab (char2idx)
    path = os.path.join(args.path_to_model_folder, "vocab.json")
    with open(path, mode="r") as fp:
        char2idx = json.load(fp)
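The fragment at the top of this example computes perplexity as exp(-(1/N) * sum_i log p_i), clipping zero probabilities to a floor. A tiny self-contained illustration of the same formula (names are illustrative, not taken from the repository):

# Minimal illustration of the perplexity formula used above (illustrative only).
import numpy as np


def perplexity_from_probs(probs, min_logprob=np.log(1e-10)):
    log_likelihood = sum(max(min_logprob, np.log(p)) if p > 0.0 else min_logprob
                         for p in probs)
    return np.exp(-log_likelihood / len(probs))


print(perplexity_from_probs([0.25, 0.25, 0.25, 0.25]))  # uniform over 4 tokens -> 4.0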
Example #4
def run_trials():
    '''Run multiple trials to obtain more statistically relevant results
    and make best use of our small dataset (using cross-validation).
    For each trial, train a new classifier on a random training sample.
    '''

    catastrophic_failures = 0
    seed = None

    t0 = time.time()

    print('Load datasets')
    df = prepare_dataset()

    preds = []
    for i in range(settings.TRIALS):
        if settings.SAMPLE_SEED:
            seed = settings.SAMPLE_SEED + i
            utils.set_global_seed(seed)

        print('trial {}/{}{}'.format(i + 1, settings.TRIALS,
                                     f' ({seed} seed)' if seed else ''))
        classifier_key, accuracy, df_train = train_and_test(df, preds, seed)
        if accuracy < 0.4:
            catastrophic_failures += 1
        print('-' * 40)

    t1 = time.time()

    preds = pd.DataFrame(preds)

    if 1:
        df_confusion = utils.get_confusion_matrix(preds, df_train)
        utils.render_confusion(classifier_key, df_confusion, preds)

    if 1:
        utils.render_confidence_matrix(classifier_key, preds)

    # summary - F1
    acc = len(preds.loc[preds['pred'] == preds['cat']]) / len(preds)

    conf = settings.MIN_CONFIDENCE
    positive = len(preds.loc[preds['conf'] >= conf])
    true_positive = len(preds.loc[(preds['conf'] >= conf)
                                  & (preds['pred'] == preds['cat'])])
    if positive < 1:
        positive = 0.001
        precision = 0
    else:
        precision = true_positive / positive
    recall = true_positive / len(preds)
    f1 = 0
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)

    if catastrophic_failures:
        catastrophic_failures = f'; {catastrophic_failures} fails.'
    else:
        catastrophic_failures = ''

    utils.log(
        '{}; {:.2f} acc; {:.2f} prec, {:.2f} rec, {:.2f} f1 for {:.2f} conf.; {:.0f} mins.{}'
        .format(
            classifier_key,
            acc,
            precision,
            recall,
            f1,
            conf,
            (t1 - t0) / 60,
            catastrophic_failures,
        ))
    def __init__(self,
                 env,
                 network,
                 n_quantiles=50,
                 kappa=1,
                 replay_start_size=50000,
                 replay_buffer_size=1000000,
                 gamma=0.99,
                 update_target_frequency=10000,
                 minibatch_size=32,
                 learning_rate=1e-4,
                 update_frequency=1,
                 prior=0.01,
                 initial_exploration_rate=1,
                 final_exploration_rate=0.1,
                 final_exploration_step=1000000,
                 adam_epsilon=1e-8,
                 logging=False,
                 log_folder=None,
                 seed=None):

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.replay_start_size = replay_start_size
        self.replay_buffer_size = replay_buffer_size
        self.gamma = gamma
        self.update_target_frequency = update_target_frequency
        self.minibatch_size = minibatch_size
        self.learning_rate = learning_rate
        self.update_frequency = update_frequency
        self.initial_exploration_rate = initial_exploration_rate
        self.epsilon = self.initial_exploration_rate
        self.final_exploration_rate = final_exploration_rate
        self.final_exploration_step = final_exploration_step
        self.adam_epsilon = adam_epsilon
        self.logging = logging
        self.logger = []
        self.timestep = 0
        self.log_folder = log_folder

        self.env = env
        self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
        self.seed = random.randint(0, 1_000_000) if seed is None else seed
        set_global_seed(self.seed, self.env)

        self.n_quantiles = n_quantiles

        self.network = network(self.env.observation_space,
                               self.env.action_space.n * self.n_quantiles,
                               self.env.action_space.n * self.n_quantiles).to(
                                   self.device)
        self.target_network = network(
            self.env.observation_space,
            self.env.action_space.n * self.n_quantiles,
            self.env.action_space.n * self.n_quantiles).to(self.device)
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learning_rate,
                                    eps=self.adam_epsilon)

        self.anchor1 = [
            p.data.clone() for p in list(self.network.output_1.parameters())
        ]
        self.anchor2 = [
            p.data.clone() for p in list(self.network.output_2.parameters())
        ]

        self.loss = quantile_huber_loss
        self.kappa = kappa
        self.prior = prior
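The agent above plugs quantile_huber_loss in as its loss function. The standard QR-DQN quantile Huber loss with threshold kappa looks roughly like the sketch below; this is the textbook form, not necessarily the repository's exact implementation.

# Sketch of a standard quantile Huber loss (textbook QR-DQN form, assumption).
import torch


def quantile_huber_loss(pred, target, kappa=1.0):
    # pred, target: (batch, n_quantiles) tensors of predicted / target quantiles.
    n = pred.shape[-1]
    tau = (torch.arange(n, dtype=pred.dtype, device=pred.device) + 0.5) / n
    # Pairwise TD errors u[b, i, j] = target[b, j] - pred[b, i].
    u = target.unsqueeze(1) - pred.unsqueeze(2)
    huber = torch.where(u.abs() <= kappa,
                        0.5 * u.pow(2),
                        kappa * (u.abs() - 0.5 * kappa))
    # Asymmetric quantile weight |tau_i - 1{u < 0}|.
    weight = (tau.view(1, -1, 1) - (u < 0).to(pred.dtype)).abs()
    return (weight * huber / kappa).mean()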
Example #6
import collections

import torch

import criterions as module_criterion
import data_loader.data_loaders as module_data
import metrics as module_metric
import models as module_arch
import optimizers as module_optim
import utils
from parse_config import ConfigParser
from trainer import Trainer

# fix random seeds for reproducibility
SEED = 123
utils.set_global_seed(SEED)
utils.prepare_cudnn(deterministic=True, benchmark=False)


def main(config: ConfigParser):
    logger = config.get_logger("train")

    # setup data_loader instances
    data_loader = config.init_obj("data_loader", module_data)
    valid_data_loader = data_loader.split_validation()

    # build model architecture, then print to console
    model = config.init_obj("arch", module_arch)
    logger.info(model)

    # get function handles of loss and metrics
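utils.prepare_cudnn(deterministic=True, benchmark=False) is called above but not shown. A plausible sketch (an assumption, not the project's actual code) is that it simply toggles the two cuDNN flags that trade autotuning speed for reproducibility:

# Plausible sketch of prepare_cudnn (assumption).
import torch


def prepare_cudnn(deterministic=True, benchmark=False):
    torch.backends.cudnn.deterministic = deterministic  # repeatable convolution kernels
    torch.backends.cudnn.benchmark = benchmark          # disable kernel autotuning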
Example #7
"""

import os
import random as rn
import numpy as np
import tensorflow as tf
from tensorflow import keras
K = keras.backend
KU = keras.utils
from config import Config
c = Config()
from model_keras import Model
import utils

# Set seeds
utils.set_global_seed(c.SEED, use_parallelism=c.USE_PARALLELISM)

# Download the MNIST dataset
# X_train.shape = (60000, 28, 28)
# y_train.shape = (60000,) (the elements are the actual labels)
# X_test.shape = (10000, 28, 28)
# y_test.shape = (10000,)
MNIST = keras.datasets.mnist
(X_train, y_train), (X_test, y_test) = MNIST.load_data()

# Preprocess the data (reshape, rescale, etc.)
X_train = X_train.reshape(X_train.shape[0], 28, 28, 1).astype('float32') / 255
X_test = X_test.reshape(X_test.shape[0], 28, 28, 1).astype('float32') / 255

# Preprocess class labels
y_train = KU.to_categorical(y_train, num_classes=10)
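The snippet is cut off after one-hot encoding the training labels; the natural continuation, an assumption here since it is not shown, is to encode the test labels the same way before building and fitting the model.

# Presumed continuation (not shown in the snippet above):
y_test = KU.to_categorical(y_test, num_classes=10)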
Example #8
    def __init__(
        self,
        env,
        network,
        replay_start_size=50000,
        replay_buffer_size=1000000,
        gamma=0.99,
        update_target_frequency=10000,
        minibatch_size=32,
        learning_rate=1e-3,
        update_frequency=1,
        initial_exploration_rate=1,
        final_exploration_rate=0.1,
        final_exploration_step=1000000,
        adam_epsilon=1e-8,
        logging=False,
        log_folder=None,
        seed=None,
        loss="huber",
    ):

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.replay_start_size = replay_start_size
        self.replay_buffer_size = replay_buffer_size
        self.gamma = gamma
        self.update_target_frequency = update_target_frequency
        self.minibatch_size = minibatch_size
        self.learning_rate = learning_rate
        self.update_frequency = update_frequency
        self.initial_exploration_rate = initial_exploration_rate
        self.epsilon = self.initial_exploration_rate
        self.final_exploration_rate = final_exploration_rate
        self.final_exploration_step = final_exploration_step
        self.adam_epsilon = adam_epsilon
        self.logging = logging
        self.log_folder = log_folder
        if callable(loss):
            self.loss = loss
        else:
            try:
                self.loss = {
                    'huber': F.smooth_l1_loss,
                    'mse': F.mse_loss
                }[loss]
            except KeyError:
                raise ValueError("loss must be 'huber', 'mse' or a callable")

        self.env = env
        self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
        self.seed = random.randint(0, 1_000_000) if seed is None else seed
        set_global_seed(self.seed, self.env)

        self.network = network(self.env.observation_space,
                               self.env.action_space.n).to(self.device)
        self.target_network = network(self.env.observation_space,
                                      self.env.action_space.n).to(self.device)
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learning_rate,
                                    eps=self.adam_epsilon)
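This DQN constructor, like the QR-DQN one in Example #4, seeds everything with set_global_seed(self.seed, self.env). A plausible sketch of such a helper for a PyTorch + Gym project is shown below; only the call signature comes from the snippets, the body is an assumption.

# Hypothetical sketch of set_global_seed(seed, env) for PyTorch + Gym (assumption).
import random
import numpy as np
import torch


def set_global_seed(seed, env=None):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    if env is not None:
        env.seed(seed)               # older Gym seeding API, as used in these snippets
        env.action_space.seed(seed)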
Example #9
def run_benchmarks(time_steps=4000, single_model_name=None, single_env_name=None, project_name="rl-benchmarks",
                   run_tag="mlp", log_interval=1000, tensorboard_log="./tensorboard-logs", seed=123,
                   policy_type="MlpPolicy"):
    wandb.tensorboard.patch(save=False, tensorboardX=True)

    set_global_seed(seed)

    envs_id = ["Pendulum-v0", "ReacherBulletEnv-v0", "Hopper-v2", "Humanoid-v2",
               "HumanoidStandup-v2", "HalfCheetah-v2"]

    if single_env_name is not None and single_model_name is not None:
        init_wandb_run(
            project_name, single_env_name, single_model_name, f"{single_env_name}/{single_model_name}-{run_tag}",
            dir="."
        )

        models[single_model_name](
            single_env_name, time_steps, log_interval=log_interval, tensorboard_log=tensorboard_log, seed=seed
        )

    else:
        # raise NotImplementedError
        for env_name in envs_id:

            if single_model_name is not None:
                # init_wandb_run(project_name, env_name, single_model_name, f"{env_name}/{single_model_name}-{run_tag}")
                # models[single_model_name](env_name, time_steps, log_interval=log_interval,
                #                           tensorboard_log=tensorboard_log, seed=seed)

                weights_path = f"/home/ionelia/weights-benchmark-master/{single_model_name}_{env_name}_{policy_type}"
                if os.path.exists(f"{weights_path}.zip"):
                    models[single_model_name](
                        env_name,
                        time_steps,
                        log_interval=log_interval,
                        tensorboard_log=tensorboard_log,
                        seed=seed,
                        policy=policy_type,
                        load_weights=weights_path
                    )

            else:
                raise NotImplementedError

                # NOTE: the code below is unreachable while the raise above is active.
                runs = api.runs("ionelia/rl-benchmarks").objects

                for run in runs:
                    id_run = run.id
                    name_run = run.name
                    env_name_run, model_name_run = name_run.split("/")

                    if "NOPE" in env_name_run or name_run == 'HalfCheetah-v2/trpo':  # or  "trpo" not in model_name_run:
                        continue
                    else:
                        try:
                            wandb.init(id=id_run, project="rl-benchmarks", resume="must", monitor_gym=True, reinit=True)
                            weights_path = f"/home/ionelia/weights-benchmark-master/{model_name_run}_{env_name_run}_{policy_type}"
                            if os.path.exists(f"{weights_path}.zip"):
                                print(weights_path)
                                models[model_name_run](
                                    env_name_run,
                                    time_steps,
                                    log_interval=log_interval,
                                    tensorboard_log=tensorboard_log,
                                    seed=seed,
                                    policy=policy_type,
                                    load_weights=weights_path
                                )
                        except Exception as e:
                            print(e)
Example #10
def main():
    args = parser.parse_args()
    env_name = args.env_name
    input_file = args.input_file
    checkpoint_file = args.resume
    test_only = args.test_only
    seed = args.seed
    no_gpu = args.no_gpu
    dir_name = args.dir_name
    visualize = args.visualize
    n_test_steps = args.n_test_steps
    log_perf_file = args.log_perf_file
    min_distance = args.min_distance
    max_distance = args.max_distance
    threshold = args.threshold
    y_range = args.y_range
    n_training_samples = args.n_training_samples
    start_index = args.start_index
    exp_name = args.exp_name
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    n_epochs = args.n_epochs

    # Specific to Humanoid - Pybullet
    if visualize and env_name == 'HumanoidBulletEnv-v0':
        spec = gym.envs.registry.env_specs[env_name]
        class_ = gym.envs.registration.load(spec._entry_point)
        env = class_(**{**spec._kwargs}, **{'render': True})
    else:
        env = gym.make(env_name)

    set_global_seed(seed)
    env.seed(seed)

    input_shape = env.observation_space.shape[0] + 3
    output_shape = env.action_space.shape[0]
    net = Policy(input_shape, output_shape)
    if not no_gpu:
        net = net.cuda()
    optimizer = Adam(net.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    epochs = 0

    if checkpoint_file:
        epochs, net, optimizer = load_checkpoint(checkpoint_file, net,
                                                 optimizer)

    if not checkpoint_file and test_only:
        print('ERROR: You have not entered a checkpoint file.')
        return

    if not test_only:
        if not os.path.isfile(input_file):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    input_file)

        training_file = open(input_file, 'rb')
        old_states = []
        norms = []
        goals = []
        actions = []
        n_samples = -1

        while n_samples - start_index < n_training_samples:
            try:
                old_s, old_g, new_s, new_g, action = pickle.load(training_file)
                n_samples += 1

                if n_samples < start_index:
                    continue

                old_states.append(np.squeeze(np.array(old_s)))
                norms.append(
                    find_norm(np.squeeze(np.array(new_g) - np.array(old_g))))
                goals.append(
                    preprocess_goal(
                        np.squeeze(np.array(new_g) - np.array(old_g))))
                actions.append(np.squeeze(np.array(action)))
            except (EOFError, ValueError):
                break

        old_states = np.array(old_states)
        norms = np.array(norms)
        goals = np.array(goals)
        actions = np.array(actions)

        normalization_factors = {
            'state': [old_states.mean(axis=0),
                      old_states.std(axis=0)],
            'distance_per_step': [norms.mean(axis=0),
                                  norms.std(axis=0)]
        }
        n_file = open(env_name + '_normalization_factors.pkl', 'wb')
        pickle.dump(normalization_factors, n_file)
        n_file.close()

        old_states = normalize(old_states,
                               env_name + '_normalization_factors.pkl',
                               'state')

        # Summary writer for tensorboardX
        writer = {}
        writer['writer'] = SummaryWriter()

        # Split data into training and validation
        indices = np.arange(old_states.shape[0])
        shuffle(indices)
        val_data = np.concatenate(
            (old_states[indices[:int(old_states.shape[0] / 5)]],
             goals[indices[:int(old_states.shape[0] / 5)]]),
            axis=1)
        val_labels = actions[indices[:int(old_states.shape[0] / 5)]]
        training_data = np.concatenate(
            (old_states[indices[int(old_states.shape[0] / 5):]],
             goals[indices[int(old_states.shape[0] / 5):]]),
            axis=1)
        training_labels = actions[indices[int(old_states.shape[0] / 5):]]
        del old_states, norms, goals, actions, indices

        checkpoint_dir = os.path.join(env_name, 'naive_gcp_checkpoints')
        if dir_name:
            checkpoint_dir = os.path.join(checkpoint_dir, dir_name)
        prepare_dir(checkpoint_dir)

        for e in range(epochs, n_epochs):
            ep_loss = []
            # Train network
            for i in range(int(len(training_data) / batch_size) + 1):
                inp = training_data[batch_size * i:batch_size * (i + 1)]
                out = net(
                    convert_to_variable(inp, grad=False, gpu=(not no_gpu)))
                target = training_labels[batch_size * i:batch_size * (i + 1)]
                target = convert_to_variable(np.array(target),
                                             grad=False,
                                             gpu=(not no_gpu))
                loss = criterion(out, target)
                optimizer.zero_grad()
                ep_loss.append(loss.item())
                loss.backward()
                optimizer.step()

            # Validation
            val_loss = []
            for i in range(int(len(val_data) / batch_size) + 1):
                inp = val_data[batch_size * i:batch_size * (i + 1)]
                out = net(
                    convert_to_variable(inp, grad=False, gpu=(not no_gpu)))
                target = val_labels[batch_size * i:batch_size * (i + 1)]
                target = convert_to_variable(np.array(target),
                                             grad=False,
                                             gpu=(not no_gpu))
                loss = criterion(out, target)
                val_loss.append(loss.item())

            writer['iter'] = e + 1
            writer['writer'].add_scalar('data/val_loss',
                                        np.array(val_loss).mean(), e + 1)
            writer['writer'].add_scalar('data/training_loss',
                                        np.array(ep_loss).mean(), e + 1)

            save_checkpoint(
                {
                    'epochs': (e + 1),
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                filename=os.path.join(checkpoint_dir,
                                      str(e + 1) + '.pth.tar'))

            print('Epoch:', e + 1)
            print('Training loss:', np.array(ep_loss).mean())
            print('Val loss:', np.array(val_loss).mean())
            print('')

    # Now we use the trained net to see how the agent reaches a different
    # waypoint from the current one.

    success = 0
    failure = 0

    closest_distances = []
    time_to_closest_distances = []

    f = open(env_name + '_normalization_factors.pkl', 'rb')
    normalization_factors = pickle.load(f)
    average_distance = normalization_factors['distance_per_step'][0]

    for i in range(n_test_steps):
        state = env.reset()
        if env_name == 'Ant-v2':
            obs = env.unwrapped.get_body_com('torso')
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(target_obs, env.unwrapped.angle)
            env.unwrapped.sim.model.body_pos[-1] = target_obs
        elif env_name == 'MinitaurBulletEnv-v0':
            obs = env.unwrapped.get_minitaur_position()
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(
                target_obs, env.unwrapped.get_minitaur_rotation_angle())
            env.unwrapped.set_target_position(target_obs)
        elif env_name == 'HumanoidBulletEnv-v0':
            obs = env.unwrapped.robot.get_robot_position()
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(target_obs, env.unwrapped.robot.yaw)
            env.unwrapped.robot.set_target_position(target_obs[0],
                                                    target_obs[1])
        steps = 0
        done = False
        closest_d = distance(obs, target_obs)
        closest_t = 0
        while distance(obs, target_obs) > threshold and not done:
            goal = preprocess_goal(target_obs - obs)
            state = normalize(np.array(state),
                              env_name + '_normalization_factors.pkl')
            inp = np.concatenate([np.squeeze(state), goal])
            inp = convert_to_variable(inp, grad=False, gpu=(not no_gpu))
            action = net(inp).cpu().detach().numpy()
            state, _, done, _ = env.step(action)
            steps += 1
            if env_name == 'MinitaurBulletEnv-v0':
                obs = env.unwrapped.get_minitaur_position()
            elif env_name == 'HumanoidBulletEnv-v0':
                obs = env.unwrapped.robot.get_robot_position()
            if distance(obs, target_obs) < closest_d:
                closest_d = distance(obs, target_obs)
                closest_t = steps
            if visualize:
                env.render()

        if distance(obs, target_obs) <= threshold:
            success += 1
        elif done:
            failure += 1

        if visualize:
            time.sleep(2)

        closest_distances.append(closest_d)
        time_to_closest_distances.append(closest_t)

    print('Successes: %d, Failures: %d, '
          'Closest distance: %f, Time to closest distance: %d' %
          (success, failure, np.mean(closest_distances),
           np.mean(time_to_closest_distances)))

    if log_perf_file:
        f = open(log_perf_file, 'a+')
        f.write(exp_name + ':Seed-' + str(seed) + ',Success-' + str(success) +
                ',Failure-' + str(failure) + ',Closest_distance-' +
                str(closest_distances) + ',Time_to_closest_distance-' +
                str(time_to_closest_distances) + '\n')
        f.close()
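The test loop above relies on a distance() helper that is not shown. A minimal sketch consistent with how it is called (straight-line distance between two 3-D positions) could be:

# Minimal sketch of the distance() helper assumed by the script above.
import numpy as np


def distance(a, b):
    return float(np.linalg.norm(np.asarray(a) - np.asarray(b)))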
Example #11
def main(exp_name, output_dir, do_train, do_test, n_seeds, seed_val):

    if exp_name is None:
        raise ValueError(
            "Please specify the experiment name. Run '$ experiment_wrapper -h' for info"
        )
    if not (do_train or do_test):
        raise ValueError(
            "Please specify if you want to do training or testing. Run '$ experiment_wrapper -h' for info"
        )

    exp = experiment_registration.get_experiment(exp_name)

    for task in exp['tasks']:
        # decide seed
        if seed_val is not None and n_seeds > 1:
            raise ValueError(
                "You cannot both provide a specific seed value {} and require n_seeds={} random values"
                .format(seed_val, n_seeds))

        # override seed value with the one provided as arg
        if seed_val is not None:
            task['seed'] = seed_val

        if n_seeds > 1 or 'seed' not in task:
            np.random.seed(2)
            seeds = np.random.randint(0, 20000, size=n_seeds)
        else:
            seeds = np.array([task['seed']])

        # a different training for each seed
        for ns in range(n_seeds):

            seed = int(seeds[ns])

            # Seed everything to make things reproducible.
            tf.compat.v1.reset_default_graph()
            set_global_seed(seed)

            # Read experiment conf variables
            rl_library = exp['algo']['RLlibrary']
            algo_name = exp['algo']['name']
            algo_params = exp['algo']['params']

            # Set path for outputdata
            output_exp_dir = os.path.join(output_dir, exp_name,
                                          task['sub_name'],
                                          'seed_' + str(seed))
            os.makedirs(output_exp_dir, exist_ok=True)

            # Set Gym environment
            renders = bool(do_test)

            task['env_params']['renders'] = renders
            if 'log_file' in task['env_params']:
                task['env_params']['log_file'] = output_exp_dir

            # Create environment as normalized vectorized environment
            with_vecnorm = False
            env, eval_env = robot_agents.ALGOS[rl_library]['make_env'](
                task['env_id'], task['env_params'], seed, do_train,
                with_vecnorm)

            # Run algorithm
            csv_file = os.path.join(output_exp_dir, "exp_param.csv")
            try:
                with open(csv_file, 'w') as f:
                    for key in exp.keys():
                        f.write("%s,%s\n" % (key, exp[key]))
            except IOError:
                print("I/O error")

            if do_train:
                model = robot_agents.ALGOS[rl_library][algo_name](
                    env, eval_env, output_exp_dir, seed, **algo_params)

                if model is not None:
                    print("Saving model to ", output_exp_dir)
                    model.save(os.path.join(output_exp_dir, "final_model"))

            elif do_test:
                algo_name = algo_name + '_test'
                model = robot_agents.ALGOS[rl_library][algo_name](
                    env, output_exp_dir, seed, **algo_params)

            del env
            del eval_env
            del model
Example #12
def main():
    args = parser.parse_args()
    num_training_steps = args.train_steps
    lr = args.learning_rate
    gamma = args.discount_factor
    n_test_episodes = args.n_test_episodes
    checkpoint_file = args.resume
    test_only = args.test_only
    env_name = args.environment
    seed = args.seed
    batch_size = args.batch_size
    horizon = args.horizon
    lam = args.gae
    visualize = args.visualize
    entropy_coeff = args.entropy_coeff
    use_lr_decay = args.use_lr_decay

    env = gym.make(env_name)
    set_global_seed(seed)
    env.seed(seed)

    input_shape = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    net = Network(input_shape, action_dim).to(device)
    total_steps = 0
    total_episodes = 0

    optimizer = Adam(net.parameters(), lr=lr)
    adv_rms = RunningMeanStd(dim=1)
    return_rms = RunningMeanStd(dim=1)
    state_rms = RunningMeanStd(dim=input_shape)

    if checkpoint_file:
        (total_steps, total_episodes, net, optimizer, state_info, adv_info,
         return_info) = load_checkpoint(checkpoint_file, net, optimizer,
                                        'state', 'adv', 'return')
        state_mean, state_var, state_min, state_max = state_info
        adv_mean, adv_var, adv_min, adv_max = adv_info
        return_mean, return_var, return_min, return_max = return_info
        state_rms.set_state(state_mean, state_var, state_min, state_max,
                            total_steps)
        adv_rms.set_state(adv_mean, adv_var, adv_min, adv_max, total_steps)
        return_rms.set_state(return_mean, return_var, return_min, return_max,
                             total_steps)

    checkpoint_dir = os.path.join(env_name, 'a2c_checkpoints_lr2e-3-b32-decay')
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    if test_only:
        avg_reward = test(env, action_dim, net, state_rms, n_test_episodes,
                          visualize)
        print('Average episode reward:', avg_reward)
        return

    # Summary writer for tensorboardX
    writer = {}
    writer['writer'] = SummaryWriter()

    s = env.reset()

    reward_buf = []
    ep_reward = 0
    ep_len = 0
    niter = 0
    done = False

    mean_indices = torch.LongTensor([2 * x for x in range(action_dim)])
    logstd_indices = torch.LongTensor([2 * x + 1 for x in range(action_dim)])
    mean_indices = mean_indices.to(device)
    logstd_indices = logstd_indices.to(device)

    prev_best = 0

    total_epochs = int(num_training_steps / batch_size) + 1

    while total_steps < num_training_steps:
        values = []
        rewards = []
        dones = []
        logps = []
        entropies = []
        niter += 1
        for _ in range(batch_size):
            s = state_rms.normalize(s, mode=MEAN_STD)
            out, v = net(prepare_input(s))
            mean = torch.index_select(out, 0, mean_indices)
            logstd = torch.index_select(out, 0, logstd_indices)
            action_dist = Normal(mean, torch.exp(logstd))
            a = action_dist.sample()
            s, r, done, _ = env.step(a.cpu().numpy())
            logp = action_dist.log_prob(a)
            entropy = action_dist.entropy()
            ep_reward += r
            ep_len += 1
            total_steps += 1

            if done:
                writer['iter'] = total_steps + 1
                writer['writer'].add_scalar('data/ep_reward', ep_reward,
                                            total_steps)
                writer['writer'].add_scalar('data/ep_len', ep_len, total_steps)
                reward_buf.append(ep_reward)
                ep_reward = 0
                ep_len = 0
                total_episodes += 1
                if len(reward_buf) > 100:
                    reward_buf = reward_buf[-100:]
                done = False
                s = env.reset()

            values.append(v)
            rewards.append(r)
            dones.append(done)
            logps.append(logp)
            entropies.append(entropy.sum())

        policy_loss, value_loss = batch_actor_critic(logps, rewards, values,
                                                     dones, gamma, lam,
                                                     horizon, adv_rms,
                                                     return_rms)
        optimizer.zero_grad()
        policy_entropy = torch.stack(entropies).mean()
        loss = policy_loss + 0.5 * value_loss - entropy_coeff * policy_entropy
        loss.backward()
        optimizer.step()

        if use_lr_decay:
            for param_group in optimizer.param_groups:
                lr = param_group['lr']
                param_group['lr'] = (
                    lr - lr *
                    (total_steps / num_training_steps) / total_epochs)

        writer['iter'] = total_steps
        writer['writer'].add_scalar('data/last_100_ret',
                                    np.array(reward_buf).mean(), total_steps)
        writer['writer'].add_scalar('data/policy_loss', policy_loss,
                                    total_steps)
        writer['writer'].add_scalar('data/value_loss', value_loss, total_steps)
        writer['writer'].add_scalar('data/loss', loss, total_steps)

        print(total_episodes, 'episodes,', total_steps, 'steps,',
              np.array(reward_buf).mean(), 'reward')

        save_checkpoint(
            {
                'total_steps': total_steps,
                'total_episodes': total_episodes,
                'state_dict': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'state': [state_rms.mean, state_rms.var,
                          state_rms.min, state_rms.max],
                'adv': [adv_rms.mean, adv_rms.var, adv_rms.min, adv_rms.max],
                'return': [return_rms.mean, return_rms.var,
                           return_rms.min, return_rms.max]
            },
            filename=os.path.join(checkpoint_dir, str(niter) + '.pth.tar'))

        if np.array(reward_buf).mean() > prev_best:
            save_checkpoint(
                {
                    'total_steps': total_steps,
                    'total_episodes': total_episodes,
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                filename=os.path.join(checkpoint_dir, 'best.pth.tar'))
                    ROLLING_EVENT.wait()
                    GLOBAL_QUEUE.put((batch_s, batch_a, batch_r))
                    if GLOBAL_QUEUE.qsize(
                    ) >= MAX_QSIZE and not TERM_EVENT.is_set():
                        UPDATE_EVENT.set()
                        ROLLING_EVENT.clear()
                    # Clear buffer after model update
                    buffer_s.clear()
                    buffer_a.clear()
                    buffer_r.clear()
        print(' [*] Worker {} finish and exit'.format(self.wid))


if __name__ == '__main__':
    args = add_arguments()
    set_global_seed(1)
    if args.method == 'kl_pen':
        METHOD = dict(name='kl_pen', kl_target=0.01, lam=0.5)
    elif args.method == 'clip':
        METHOD = dict(name='clip', epsilon=0.2)
    else:
        raise NotImplementedError

    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)
    if not os.path.exists(args.logdir):
        os.makedirs(args.logdir)
    else:
        files = os.listdir(args.logdir)
        files = [os.path.join(args.logdir, fn) for fn in files]
        for f in files:
Example #14
import os
import time
import numpy as np
import tensorflow as tf
import logger
from config import CONFIG as C
from model import Model
from runner import Runner
from utils import create_session, set_global_seed
from wrappers import SubprocVecEnv, make_atari
set_global_seed(113)
time_stamp = time.strftime("%m-%d-%y-%H:%M:%S")
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Only run on GPU 0


def evaluate(env, policy, nb_episodes):
    # rewards[-1] accumulates the return of the episode currently being played;
    # a fresh 0 is appended whenever a real episode finishes.
    rewards = [0]
    for i in range(nb_episodes):
        s = env.reset()
        while True:
            a = policy.get_best_action(s)
            s, r, d, info = env.step(a)
            rewards[-1] += r
            # Peel through the stacked Atari wrappers to check whether the
            # underlying game (not just a single life) actually ended.
            if env.env.env.env.env.was_real_done:
                rewards.append(0)
                break
            if d:
                s = env.reset()
    # Drop the trailing placeholder appended after the final episode ended.
    return rewards[:nb_episodes]
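A hypothetical way to call evaluate() once training has produced a policy; env and model here are assumed to be built elsewhere (e.g. via make_atari/SubprocVecEnv and Model) and are not defined in this snippet.

# Hypothetical usage of evaluate() (names assumed, not taken from the snippet):
episode_rewards = evaluate(env, model, nb_episodes=10)
print("mean reward over {} episodes: {:.1f}".format(
    len(episode_rewards), np.mean(episode_rewards)))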