def build_model(self):
    """
    Constructs the model architecture.
    :return:
    """
    # Set seeds
    utils.set_global_seed(self.config.SEED,
                          use_parallelism=self.config.USE_PARALLELISM)

    # Input layer
    inputs = KL.Input(shape=self.config.INPUT_SHAPE)
    X = inputs

    # Set regularizer
    if self.config.REGULARIZER == "L1":
        reg_func = keras.regularizers.l1(
            self.config.REGULARIZATION_COEFFICIENT)
    elif self.config.REGULARIZER == "L2":
        reg_func = keras.regularizers.l2(
            self.config.REGULARIZATION_COEFFICIENT)
    else:
        raise ValueError("Unknown regularizer: {}".format(
            self.config.REGULARIZER))

    # Hidden layers
    for L in self.config.ARCHITECTURE:
        if L[0] == "conv2d":
            X = KL.Conv2D(**L[1], kernel_regularizer=reg_func)(X)
            if L[2]["pooling"] is not None:
                X = KL.MaxPool2D(pool_size=(2, 2))(X)
        elif L[0] == "dense":
            X = KL.Dense(**L[1], kernel_regularizer=reg_func)(X)

        # Activation functions
        if self.config.THETA_TRAINABLE:
            X = KL.PReLU(alpha_initializer='ones', shared_axes=[1, 2, 3])(X)
        else:
            X = KL.Lambda(tunable_relu, arguments={"theta": self.theta})(X)

    # Output layer
    X = KL.Flatten()(X)
    outputs = KL.Dense(self.config.OUTPUT_SHAPE,
                       activation=self.config.OUTPUT_ACTIVATION)(X)

    # Create model, then specify loss function, optimizer and metrics
    self.model = KM.Model(inputs=inputs, outputs=outputs)

    # Specify optimizer, learning rate schedule, etc. and compile the model
    opt = keras.optimizers.Adam()
    self.model.compile(optimizer=opt,
                       loss=self.config.LOSS,
                       metrics=self.config.METRICS)
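
# Hedged sketch: tunable_relu is not defined in this excerpt. One plausible
# reading, mirroring the trainable PReLU branch above, is a leaky ReLU whose
# negative slope theta is fixed rather than learned; the project's real
# implementation may differ.
import tensorflow as tf


def tunable_relu_sketch(x, theta):
    # max(0, x) + theta * min(0, x): a fixed-slope counterpart of PReLU
    return tf.maximum(x, 0.0) + theta * tf.minimum(x, 0.0)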
def test_kernel_computation():
    set_global_seed(10)
    X = np.asarray([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
    D = np.asarray([1, -0.5]).reshape(2, 1)
    with tf.Session() as sess:
        K_op = KAFNet.gauss_kernel(x=tf.convert_to_tensor(X),
                                   D=tf.convert_to_tensor(D))
        K = sess.run(K_op)
    K_true = np.exp(-np.asarray([[[0.9**2, 0.6**2], [0.8**2, 0.7**2]],
                                 [[0.7**2, 0.8**2], [0.6**2, 0.9**2]],
                                 [[0.5**2, 1.0**2], [0.4**2, 1.1**2]]]))
    # Let a mismatch fail the test instead of swallowing the assertion error
    np.testing.assert_array_almost_equal(K, K_true, decimal=4)
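
# Hedged sketch of a Gaussian kernel consistent with K_true above (assumption:
# gamma = 1; KAFNet's actual implementation may differ). For x of shape (N, d)
# and dictionary D of shape (D_size, 1), entry (n, i, j) is
# exp(-(x[n, i] - D[j])**2), e.g. exp(-(0.1 - 1)**2) = exp(-0.9**2).
import tensorflow as tf


def gauss_kernel_sketch(x, D, gamma=1.0):
    diff = x[:, :, None] - tf.reshape(D, [1, 1, -1])  # (N, d, D_size)
    return tf.exp(-gamma * tf.square(diff))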
        log_prob = max(min_logprob,
                       np.log(prob) if prob != 0.0 else min_logprob)
        log_likelihood += log_prob

    perplexity = np.exp(-1 / N * log_likelihood)
    return perplexity


if __name__ == "__main__":

    # argparse
    args = get_validate_args()

    # set seed and device
    set_global_seed(args.seed)
    device = torch.device(args.device)

    # load data
    data = load_data(path=args.path_to_data, verbose=args.verbose)

    # max_length
    if args.max_length is not None:
        data = [sentence[:args.max_length] for sentence in data]

    # load vocab char2idx
    path = os.path.join(args.path_to_model_folder, "vocab.json")
    with open(path, mode="r") as fp:
        char2idx = json.load(fp)
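
# Worked example of the perplexity formula above (illustrative numbers only):
# for N = 2 tokens with probabilities 0.5 and 0.25,
#     perplexity = exp(-(log(0.5) + log(0.25)) / 2) = sqrt(8) ~ 2.83,
# i.e. the geometric mean of the inverse token probabilities.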
def run_trials():
    '''Run multiple trials to obtain more statistically relevant results and
    make best use of our small dataset (using cross-validation).

    For each trial, train a new classifier on a random training sample.
    '''
    catastrophic_failures = 0
    seed = None
    t0 = time.time()

    print('Load datasets')
    df = prepare_dataset()

    preds = []
    for i in range(0, settings.TRIALS):
        if settings.SAMPLE_SEED:
            seed = settings.SAMPLE_SEED + i
            utils.set_global_seed(seed)
        print('trial {}/{}{}'.format(i + 1, settings.TRIALS,
                                     f' ({seed} seed)' if seed else ''))
        classifier_key, accuracy, df_train = train_and_test(df, preds, seed)
        if accuracy < 0.4:
            catastrophic_failures += 1
        print('-' * 40)
    t1 = time.time()

    preds = pd.DataFrame(preds)

    if 1:
        df_confusion = utils.get_confusion_matrix(preds, df_train)
        utils.render_confusion(classifier_key, df_confusion, preds)
    if 1:
        utils.render_confidence_matrix(classifier_key, preds)

    # summary - F1
    acc = len(preds.loc[preds['pred'] == preds['cat']]) / len(preds)
    conf = settings.MIN_CONFIDENCE
    positive = len(preds.loc[preds['conf'] >= conf])
    true_positive = len(preds.loc[(preds['conf'] >= conf)
                                  & (preds['pred'] == preds['cat'])])
    if positive < 1:
        precision = 0
    else:
        precision = true_positive / positive
    recall = true_positive / len(preds)
    f1 = 0
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)

    if catastrophic_failures:
        catastrophic_failures = f'; {catastrophic_failures} fails.'
    else:
        catastrophic_failures = ''

    utils.log(
        '{}; {:.2f} acc; {:.2f} prec, {:.2f} rec, {:.2f} f1 for {:.2f} conf.; {:.0f} mins.{}'
        .format(
            classifier_key,
            acc,
            precision,
            recall,
            f1,
            conf,
            (t1 - t0) / 60,
            catastrophic_failures,
        ))
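
# Hedged helper equivalent to the summary block above (the name is
# illustrative, not part of the project): precision counts only predictions
# whose confidence clears the threshold, recall is over all predictions.
def confident_precision_recall_f1(preds, conf):
    positive = preds.loc[preds['conf'] >= conf]
    true_positive = len(positive.loc[positive['pred'] == positive['cat']])
    precision = true_positive / len(positive) if len(positive) else 0.0
    recall = true_positive / len(preds)
    f1 = 0.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1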
def __init__(self,
             env,
             network,
             n_quantiles=50,
             kappa=1,
             replay_start_size=50000,
             replay_buffer_size=1000000,
             gamma=0.99,
             update_target_frequency=10000,
             minibatch_size=32,
             learning_rate=1e-4,
             update_frequency=1,
             prior=0.01,
             initial_exploration_rate=1,
             final_exploration_rate=0.1,
             final_exploration_step=1000000,
             adam_epsilon=1e-8,
             logging=False,
             log_folder=None,
             seed=None):

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.replay_start_size = replay_start_size
    self.replay_buffer_size = replay_buffer_size
    self.gamma = gamma
    self.update_target_frequency = update_target_frequency
    self.minibatch_size = minibatch_size
    self.learning_rate = learning_rate
    self.update_frequency = update_frequency
    self.initial_exploration_rate = initial_exploration_rate
    self.epsilon = self.initial_exploration_rate
    self.final_exploration_rate = final_exploration_rate
    self.final_exploration_step = final_exploration_step
    self.adam_epsilon = adam_epsilon
    self.logging = logging
    self.logger = []
    self.timestep = 0
    self.log_folder = log_folder

    self.env = env
    self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
    # Use an integer bound: random.randint rejects floats like 1e6
    self.seed = random.randint(0, 10**6) if seed is None else seed
    set_global_seed(self.seed, self.env)

    self.n_quantiles = n_quantiles
    self.network = network(
        self.env.observation_space,
        self.env.action_space.n * self.n_quantiles,
        self.env.action_space.n * self.n_quantiles).to(self.device)
    self.target_network = network(
        self.env.observation_space,
        self.env.action_space.n * self.n_quantiles,
        self.env.action_space.n * self.n_quantiles).to(self.device)
    self.target_network.load_state_dict(self.network.state_dict())
    self.optimizer = optim.Adam(self.network.parameters(),
                                lr=self.learning_rate,
                                eps=self.adam_epsilon)

    self.anchor1 = [
        p.data.clone() for p in self.network.output_1.parameters()
    ]
    self.anchor2 = [
        p.data.clone() for p in self.network.output_2.parameters()
    ]

    self.loss = quantile_huber_loss
    self.kappa = kappa
    self.prior = prior
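
# Hedged sketch of a quantile Huber loss in the spirit of the
# quantile_huber_loss assigned above; the project's actual signature and
# reduction may differ. u holds the pairwise TD errors between target and
# predicted quantiles, tau the quantile midpoints.
import torch


def quantile_huber_loss_sketch(pred, target, kappa=1.0):
    # pred, target: (batch, n_quantiles)
    n = pred.shape[-1]
    tau = (torch.arange(n, dtype=pred.dtype, device=pred.device) + 0.5) / n
    u = target.unsqueeze(-2) - pred.unsqueeze(-1)  # (batch, n_pred, n_target)
    huber = torch.where(u.abs() <= kappa,
                        0.5 * u ** 2,
                        kappa * (u.abs() - 0.5 * kappa))
    weight = (tau.view(1, -1, 1) - (u.detach() < 0).float()).abs()
    # Sum over predicted quantiles, average over targets and batch
    return (weight * huber / kappa).sum(dim=1).mean()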
import collections

import torch

import criterions as module_criterion
import data_loader.data_loaders as module_data
import metrics as module_metric
import models as module_arch
import optimizers as module_optim
import utils
from parse_config import ConfigParser
from trainer import Trainer

# fix random seeds for reproducibility
SEED = 123
utils.set_global_seed(SEED)
utils.prepare_cudnn(deterministic=True, benchmark=False)


def main(config: ConfigParser):
    logger = config.get_logger("train")

    # setup data_loader instances
    data_loader = config.init_obj("data_loader", module_data)
    valid_data_loader = data_loader.split_validation()

    # build model architecture, then print to console
    model = config.init_obj("arch", module_arch)
    logger.info(model)

    # get function handles of loss and metrics
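
# Hedged sketch of the utils helpers used above; the project's real versions
# may seed additional libraries or take extra arguments.
import random

import numpy as np


def set_global_seed_sketch(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def prepare_cudnn_sketch(deterministic: bool = True,
                         benchmark: bool = False) -> None:
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = benchmark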
""" import os import random as rn import numpy as np import tensorflow as tf from tensorflow import keras K = keras.backend KU = keras.utils from config import Config c = Config() from model_keras import Model import utils # Set seeds utils.set_global_seed(c.SEED, use_parallelism=c.USE_PARALLELISM) # Download the MNIST dataset # X_train.shape = (60000, 28, 28) # y_train.shape = (60000,) (the elements are the actual labels) # X_test.shape = (10000, 28, 28) # y_test.shape = (10000,) MNIST = keras.datasets.mnist (X_train, y_train), (X_test, y_test) = MNIST.load_data() # Preprocess the data (reshape, rescale, etc.) X_train = X_train.reshape(X_train.shape[0], 28, 28, 1).astype('float32') / 255 X_test = X_test.reshape(X_test.shape[0], 28, 28, 1).astype('float32') / 255 # Preprocess class labels y_train = KU.to_categorical(y_train, num_classes=10)
def __init__(
    self,
    env,
    network,
    replay_start_size=50000,
    replay_buffer_size=1000000,
    gamma=0.99,
    update_target_frequency=10000,
    minibatch_size=32,
    learning_rate=1e-3,
    update_frequency=1,
    initial_exploration_rate=1,
    final_exploration_rate=0.1,
    final_exploration_step=1000000,
    adam_epsilon=1e-8,
    logging=False,
    log_folder=None,
    seed=None,
    loss="huber",
):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.replay_start_size = replay_start_size
    self.replay_buffer_size = replay_buffer_size
    self.gamma = gamma
    self.update_target_frequency = update_target_frequency
    self.minibatch_size = minibatch_size
    self.learning_rate = learning_rate
    self.update_frequency = update_frequency
    self.initial_exploration_rate = initial_exploration_rate
    self.epsilon = self.initial_exploration_rate
    self.final_exploration_rate = final_exploration_rate
    self.final_exploration_step = final_exploration_step
    self.adam_epsilon = adam_epsilon
    self.logging = logging
    self.log_folder = log_folder

    if callable(loss):
        self.loss = loss
    else:
        try:
            self.loss = {
                'huber': F.smooth_l1_loss,
                'mse': F.mse_loss
            }[loss]
        except KeyError:
            raise ValueError("loss must be 'huber', 'mse' or a callable")

    self.env = env
    self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
    # Use an integer bound: random.randint rejects floats like 1e6
    self.seed = random.randint(0, 10**6) if seed is None else seed
    set_global_seed(self.seed, self.env)

    self.network = network(self.env.observation_space,
                           self.env.action_space.n).to(self.device)
    self.target_network = network(self.env.observation_space,
                                  self.env.action_space.n).to(self.device)
    self.target_network.load_state_dict(self.network.state_dict())
    self.optimizer = optim.Adam(self.network.parameters(),
                                lr=self.learning_rate,
                                eps=self.adam_epsilon)
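
# Usage sketch (assumption: this __init__ belongs to a DQN-style agent class,
# written as Agent below; the environment and network names are illustrative):
#
#     agent = Agent(env=gym.make("CartPole-v1"), network=MLP, loss="mse")
#     agent = Agent(env, network,
#                   loss=lambda pred, tgt: (pred - tgt).abs().mean())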
def run_benchmarks(time_steps=4000,
                   single_model_name=None,
                   single_env_name=None,
                   project_name="rl-benchmarks",
                   run_tag="mlp",
                   log_interval=1000,
                   tensorboard_log="./tensorboard-logs",
                   seed=123,
                   policy_type="MlpPolicy"):
    wandb.tensorboard.patch(save=False, tensorboardX=True)
    set_global_seed(seed)

    envs_id = ["Pendulum-v0", "ReacherBulletEnv-v0", "Hopper-v2",
               "Humanoid-v2", "", "HumanoidStandup-v2", "HalfCheetah-v2"]

    if single_env_name is not None and single_model_name is not None:
        init_wandb_run(project_name, single_env_name, single_model_name,
                       f"{single_env_name}/{single_model_name}-{run_tag}",
                       dir=".")
        models[single_model_name](single_env_name,
                                  time_steps,
                                  log_interval=log_interval,
                                  tensorboard_log=tensorboard_log,
                                  seed=seed)
    else:
        for env_name in envs_id:
            if single_model_name is not None:
                weights_path = f"/home/ionelia/weights-benchmark-master/{single_model_name}_{env_name}_{policy_type}"
                if os.path.exists(f"{weights_path}.zip"):
                    models[single_model_name](env_name,
                                              time_steps,
                                              log_interval=log_interval,
                                              tensorboard_log=tensorboard_log,
                                              seed=seed,
                                              policy=policy_type,
                                              load_weights=weights_path)
            else:
                raise NotImplementedError

        runs = api.runs("ionelia/rl-benchmarks").objects
        for run in runs:
            id_run = run.id
            name_run = run.name
            env_name_run, model_name_run = name_run.split("/")
            if "NOPE" in env_name_run or name_run == 'HalfCheetah-v2/trpo':
                continue
            try:
                wandb.init(id=id_run,
                           project="rl-benchmarks",
                           resume="must",
                           monitor_gym=True,
                           reinit=True)
                weights_path = f"/home/ionelia/weights-benchmark-master/{model_name_run}_{env_name_run}_{policy_type}"
                if os.path.exists(f"{weights_path}.zip"):
                    print(weights_path)
                    models[model_name_run](env_name_run,
                                           time_steps,
                                           log_interval=log_interval,
                                           tensorboard_log=tensorboard_log,
                                           seed=seed,
                                           policy=policy_type,
                                           load_weights=weights_path)
            except Exception as e:
                print(e)
def main():
    args = parser.parse_args()
    env_name = args.env_name
    input_file = args.input_file
    checkpoint_file = args.resume
    test_only = args.test_only
    seed = args.seed
    no_gpu = args.no_gpu
    dir_name = args.dir_name
    visualize = args.visualize
    n_test_steps = args.n_test_steps
    log_perf_file = args.log_perf_file
    min_distance = args.min_distance
    max_distance = args.max_distance
    threshold = args.threshold
    y_range = args.y_range
    n_training_samples = args.n_training_samples
    start_index = args.start_index
    exp_name = args.exp_name
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    n_epochs = args.n_epochs

    # Specific to Humanoid - PyBullet
    if visualize and env_name == 'HumanoidBulletEnv-v0':
        spec = gym.envs.registry.env_specs[env_name]
        class_ = gym.envs.registration.load(spec._entry_point)
        env = class_(**{**spec._kwargs}, **{'render': True})
    else:
        env = gym.make(env_name)

    set_global_seed(seed)
    env.seed(seed)

    input_shape = env.observation_space.shape[0] + 3
    output_shape = env.action_space.shape[0]
    net = Policy(input_shape, output_shape)
    if not no_gpu:
        net = net.cuda()
    optimizer = Adam(net.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    epochs = 0

    if checkpoint_file:
        epochs, net, optimizer = load_checkpoint(checkpoint_file, net,
                                                 optimizer)

    if not checkpoint_file and test_only:
        print('ERROR: You have not entered a checkpoint file.')
        return

    if not test_only:
        if not os.path.isfile(input_file):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    input_file)
        training_file = open(input_file, 'rb')

        old_states = []
        norms = []
        goals = []
        actions = []
        n_samples = -1

        while n_samples - start_index < n_training_samples:
            try:
                old_s, old_g, new_s, new_g, action = pickle.load(
                    training_file)
                n_samples += 1
                if n_samples < start_index:
                    continue
                old_states.append(np.squeeze(np.array(old_s)))
                norms.append(
                    find_norm(np.squeeze(np.array(new_g) - np.array(old_g))))
                goals.append(
                    preprocess_goal(
                        np.squeeze(np.array(new_g) - np.array(old_g))))
                actions.append(np.squeeze(np.array(action)))
            except (EOFError, ValueError):
                break

        old_states = np.array(old_states)
        norms = np.array(norms)
        goals = np.array(goals)
        actions = np.array(actions)

        normalization_factors = {
            'state': [old_states.mean(axis=0), old_states.std(axis=0)],
            'distance_per_step': [norms.mean(axis=0), norms.std(axis=0)]
        }
        n_file = open(env_name + '_normalization_factors.pkl', 'wb')
        pickle.dump(normalization_factors, n_file)
        n_file.close()

        old_states = normalize(old_states,
                               env_name + '_normalization_factors.pkl',
                               'state')

        # Summary writer for tensorboardX
        writer = {}
        writer['writer'] = SummaryWriter()

        # Split data into training and validation
        indices = np.arange(old_states.shape[0])
        shuffle(indices)
        val_data = np.concatenate(
            (old_states[indices[:int(old_states.shape[0] / 5)]],
             goals[indices[:int(old_states.shape[0] / 5)]]),
            axis=1)
        val_labels = actions[indices[:int(old_states.shape[0] / 5)]]
        training_data = np.concatenate(
            (old_states[indices[int(old_states.shape[0] / 5):]],
             goals[indices[int(old_states.shape[0] / 5):]]),
            axis=1)
        training_labels = actions[indices[int(old_states.shape[0] / 5):]]
        del old_states, norms, goals, actions, indices

        checkpoint_dir = os.path.join(env_name, 'naive_gcp_checkpoints')
        if dir_name:
            checkpoint_dir = os.path.join(checkpoint_dir, dir_name)
        prepare_dir(checkpoint_dir)

        for e in range(epochs, n_epochs):
            ep_loss = []

            # Train network
            for i in range(int(len(training_data) / batch_size) + 1):
                inp = training_data[batch_size * i:batch_size * (i + 1)]
                out = net(
                    convert_to_variable(inp, grad=False, gpu=(not no_gpu)))
                target = training_labels[batch_size * i:batch_size * (i + 1)]
                target = convert_to_variable(np.array(target),
                                             grad=False,
                                             gpu=(not no_gpu))
                loss = criterion(out, target)
                optimizer.zero_grad()
                ep_loss.append(loss.item())
                loss.backward()
                optimizer.step()

            # Validation
            val_loss = []
            for i in range(int(len(val_data) / batch_size) + 1):
                inp = val_data[batch_size * i:batch_size * (i + 1)]
                out = net(
                    convert_to_variable(inp, grad=False, gpu=(not no_gpu)))
                target = val_labels[batch_size * i:batch_size * (i + 1)]
                target = convert_to_variable(np.array(target),
                                             grad=False,
                                             gpu=(not no_gpu))
                loss = criterion(out, target)
                val_loss.append(loss.item())

            writer['iter'] = e + 1
            writer['writer'].add_scalar('data/val_loss',
                                        np.array(val_loss).mean(), e + 1)
            writer['writer'].add_scalar('data/training_loss',
                                        np.array(ep_loss).mean(), e + 1)

            save_checkpoint(
                {
                    'epochs': (e + 1),
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                filename=os.path.join(checkpoint_dir,
                                      str(e + 1) + '.pth.tar'))

            print('Epoch:', e + 1)
            print('Training loss:', np.array(ep_loss).mean())
            print('Val loss:', np.array(val_loss).mean())
            print('')

    # Now we use the trained net to see how the agent reaches a different
    # waypoint from the current one.
    success = 0
    failure = 0
    closest_distances = []
    time_to_closest_distances = []

    f = open(env_name + '_normalization_factors.pkl', 'rb')
    normalization_factors = pickle.load(f)
    average_distance = normalization_factors['distance_per_step'][0]

    for i in range(n_test_steps):
        state = env.reset()
        if env_name == 'Ant-v2':
            obs = env.unwrapped.get_body_com('torso')
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(target_obs, env.unwrapped.angle)
            env.unwrapped.sim.model.body_pos[-1] = target_obs
        elif env_name == 'MinitaurBulletEnv-v0':
            obs = env.unwrapped.get_minitaur_position()
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(
                target_obs, env.unwrapped.get_minitaur_rotation_angle())
            env.unwrapped.set_target_position(target_obs)
        elif env_name == 'HumanoidBulletEnv-v0':
            obs = env.unwrapped.robot.get_robot_position()
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(target_obs, env.unwrapped.robot.yaw)
            env.unwrapped.robot.set_target_position(target_obs[0],
                                                    target_obs[1])

        steps = 0
        done = False
        closest_d = distance(obs, target_obs)
        closest_t = 0

        while distance(obs, target_obs) > threshold and not done:
            goal = preprocess_goal(target_obs - obs)
            state = normalize(np.array(state),
                              env_name + '_normalization_factors.pkl')
            inp = np.concatenate([np.squeeze(state), goal])
            inp = convert_to_variable(inp, grad=False, gpu=(not no_gpu))
            action = net(inp).cpu().detach().numpy()
            state, _, done, _ = env.step(action)
            steps += 1
            if env_name == 'MinitaurBulletEnv-v0':
                obs = env.unwrapped.get_minitaur_position()
            elif env_name == 'HumanoidBulletEnv-v0':
                obs = env.unwrapped.robot.get_robot_position()
            if distance(obs, target_obs) < closest_d:
                closest_d = distance(obs, target_obs)
                closest_t = steps
            if visualize:
                env.render()

        if distance(obs, target_obs) <= threshold:
            success += 1
        elif done:
            failure += 1
        if visualize:
            time.sleep(2)
        closest_distances.append(closest_d)
        time_to_closest_distances.append(closest_t)

    print('Successes: %d, Failures: %d, '
          'Closest distance: %f, Time to closest distance: %d' %
          (success, failure, np.mean(closest_distances),
           np.mean(time_to_closest_distances)))
    if log_perf_file:
        f = open(log_perf_file, 'a+')
        f.write(exp_name + ':Seed-' + str(seed) + ',Success-' + str(success) +
                ',Failure-' + str(failure) + ',Closest_distance-' +
                str(closest_distances) + ',Time_to_closest_distance-' +
                str(time_to_closest_distances) + '\n')
        f.close()
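
# Hedged sketches of the goal helpers used above (assumption: find_norm
# returns the Euclidean norm of the per-step goal displacement and
# preprocess_goal its unit direction; the project's real helpers may differ).
import numpy as np


def find_norm_sketch(v):
    return np.linalg.norm(v)


def preprocess_goal_sketch(v):
    # Direction toward the goal, independent of its distance
    return v / (np.linalg.norm(v) + 1e-8)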
def main(exp_name, output_dir, do_train, do_test, n_seeds, seed_val):
    if exp_name is None:
        raise ValueError(
            "Please specify the experiment name. Run '$ experiment_wrapper -h' for info"
        )
    if not (do_train or do_test):
        raise ValueError(
            "Please specify if you want to do training or testing. Run '$ experiment_wrapper -h' for info"
        )

    exp = experiment_registration.get_experiment(exp_name)

    for task in exp['tasks']:
        # decide seed
        if seed_val is not None and n_seeds > 1:
            raise ValueError(
                "You cannot both provide a specific seed value {} and require n_seeds={} random values"
                .format(seed_val, n_seeds))

        # override seed value with the one provided as arg
        if seed_val is not None:
            task['seed'] = seed_val

        if n_seeds > 1 or 'seed' not in task:
            np.random.seed(2)
            seeds = np.random.randint(0, 20000, size=n_seeds)
        else:
            seeds = np.array([task['seed']])

        # a different training for each seed
        for ns in range(n_seeds):
            seed = int(seeds[ns])

            # Seed everything to make things reproducible.
            tf.compat.v1.reset_default_graph()
            set_global_seed(seed)

            # Read experiment conf variables
            rl_library, algo_name, algo_params = (exp['algo']['RLlibrary'],
                                                  exp['algo']['name'],
                                                  exp['algo']['params'])

            # Set path for output data
            output_exp_dir = os.path.join(output_dir, exp_name,
                                          task['sub_name'],
                                          'seed_' + str(seed))
            os.makedirs(output_exp_dir, exist_ok=True)

            # Set Gym environment
            renders = True if do_test else False
            task['env_params']['renders'] = renders

            if 'log_file' in task['env_params']:
                task['env_params']['log_file'] = output_exp_dir

            # Create environment as normalized vectorized environment
            with_vecnorm = False
            env, eval_env = robot_agents.ALGOS[rl_library]['make_env'](
                task['env_id'], task['env_params'], seed, do_train,
                with_vecnorm)

            # Run algorithm
            csv_file = os.path.join(output_exp_dir, "exp_param.csv")
            try:
                with open(csv_file, 'w') as f:
                    for key in exp.keys():
                        f.write("%s,%s\n" % (key, exp[key]))
            except IOError:
                print("I/O error")

            if do_train:
                model = robot_agents.ALGOS[rl_library][algo_name](
                    env, eval_env, output_exp_dir, seed, **algo_params)
                if model is not None:
                    print("Saving model to ", output_exp_dir)
                    model.save(os.path.join(output_exp_dir, "final_model"))
            elif do_test:
                algo_name = algo_name + '_test'
                model = robot_agents.ALGOS[rl_library][algo_name](
                    env, output_exp_dir, seed, **algo_params)

            del env
            del eval_env
            del model
def main():
    args = parser.parse_args()
    num_training_steps = args.train_steps
    lr = args.learning_rate
    gamma = args.discount_factor
    n_test_episodes = args.n_test_episodes
    checkpoint_file = args.resume
    test_only = args.test_only
    env_name = args.environment
    seed = args.seed
    batch_size = args.batch_size
    horizon = args.horizon
    lam = args.gae
    visualize = args.visualize
    entropy_coeff = args.entropy_coeff
    use_lr_decay = args.use_lr_decay

    env = gym.make(env_name)
    set_global_seed(seed)
    env.seed(seed)

    input_shape = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    net = Network(input_shape, action_dim).to(device)
    total_steps = 0
    total_episodes = 0
    optimizer = Adam(net.parameters(), lr=lr)
    adv_rms = RunningMeanStd(dim=1)
    return_rms = RunningMeanStd(dim=1)
    state_rms = RunningMeanStd(dim=input_shape)

    if checkpoint_file:
        (total_steps, total_episodes, net, optimizer, state_info, adv_info,
         return_info) = load_checkpoint(checkpoint_file, net, optimizer,
                                        'state', 'adv', 'return')
        state_mean, state_var, state_min, state_max = state_info
        adv_mean, adv_var, adv_min, adv_max = adv_info
        return_mean, return_var, return_min, return_max = return_info
        state_rms.set_state(state_mean, state_var, state_min, state_max,
                            total_steps)
        adv_rms.set_state(adv_mean, adv_var, adv_min, adv_max, total_steps)
        return_rms.set_state(return_mean, return_var, return_min, return_max,
                             total_steps)

    checkpoint_dir = os.path.join(env_name,
                                  'a2c_checkpoints_lr2e-3-b32-decay')
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    if test_only:
        avg_reward = test(env, action_dim, net, state_rms, n_test_episodes,
                          visualize)
        print('Average episode reward:', avg_reward)
        return

    # Summary writer for tensorboardX
    writer = {}
    writer['writer'] = SummaryWriter()

    s = env.reset()
    reward_buf = []
    ep_reward = 0
    ep_len = 0
    niter = 0
    done = False
    mean_indices = torch.LongTensor([2 * x for x in range(action_dim)])
    logstd_indices = torch.LongTensor([2 * x + 1 for x in range(action_dim)])
    mean_indices = mean_indices.to(device)
    logstd_indices = logstd_indices.to(device)
    prev_best = 0
    total_epochs = int(num_training_steps / batch_size) + 1

    while total_steps < num_training_steps:
        values = []
        rewards = []
        dones = []
        logps = []
        entropies = []
        niter += 1

        for _ in range(batch_size):
            s = state_rms.normalize(s, mode=MEAN_STD)
            out, v = net(prepare_input(s))
            mean = torch.index_select(out, 0, mean_indices)
            logstd = torch.index_select(out, 0, logstd_indices)
            action_dist = Normal(mean, torch.exp(logstd))
            a = action_dist.sample()
            s, r, done, _ = env.step(a.cpu().numpy())
            logp = action_dist.log_prob(a)
            entropy = action_dist.entropy()
            ep_reward += r
            ep_len += 1
            total_steps += 1

            if done:
                writer['iter'] = total_steps + 1
                writer['writer'].add_scalar('data/ep_reward', ep_reward,
                                            total_steps)
                writer['writer'].add_scalar('data/ep_len', ep_len,
                                            total_steps)
                reward_buf.append(ep_reward)
                ep_reward = 0
                ep_len = 0
                total_episodes += 1
                if len(reward_buf) > 100:
                    reward_buf = reward_buf[-100:]
                done = False
                s = env.reset()

            values.append(v)
            rewards.append(r)
            dones.append(done)
            logps.append(logp)
            entropies.append(entropy.sum())

        policy_loss, value_loss = batch_actor_critic(logps, rewards, values,
                                                     dones, gamma, lam,
                                                     horizon, adv_rms,
                                                     return_rms)

        optimizer.zero_grad()
        policy_entropy = torch.stack(entropies).mean()
        loss = policy_loss + 0.5 * value_loss - entropy_coeff * policy_entropy
        loss.backward()
        optimizer.step()

        if use_lr_decay:
            for param_group in optimizer.param_groups:
                lr = param_group['lr']
                param_group['lr'] = (
                    lr - lr * (total_steps / num_training_steps) /
                    total_epochs)
        writer['iter'] = total_steps
        writer['writer'].add_scalar('data/last_100_ret',
                                    np.array(reward_buf).mean(), total_steps)
        writer['writer'].add_scalar('data/policy_loss', policy_loss,
                                    total_steps)
        writer['writer'].add_scalar('data/value_loss', value_loss,
                                    total_steps)
        writer['writer'].add_scalar('data/loss', loss, total_steps)
        print(total_episodes, 'episodes,', total_steps, 'steps,',
              np.array(reward_buf).mean(), 'reward')

        save_checkpoint(
            {
                'total_steps': total_steps,
                'total_episodes': total_episodes,
                'state_dict': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'state': [state_rms.mean, state_rms.var, state_rms.min,
                          state_rms.max],
                'adv': [adv_rms.mean, adv_rms.var, adv_rms.min, adv_rms.max],
                'return': [return_rms.mean, return_rms.var, return_rms.min,
                           return_rms.max]
            },
            filename=os.path.join(checkpoint_dir, str(niter) + '.pth.tar'))

        if np.array(reward_buf).mean() > prev_best:
            # Track the best running return so 'best.pth.tar' is only
            # overwritten on an actual improvement
            prev_best = np.array(reward_buf).mean()
            save_checkpoint(
                {
                    'total_steps': total_steps,
                    'total_episodes': total_episodes,
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                filename=os.path.join(checkpoint_dir, 'best.pth.tar'))
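
# Hedged sketch of GAE(lambda), which batch_actor_critic above presumably
# computes from `lam`, `gamma` and the collected rollout; the project's real
# routine also involves `horizon` and the adv/return running statistics, so
# treat this only as the core recursion.
def gae_sketch(rewards, values, dones, gamma, lam, last_value=0.0):
    advantages = [0.0] * len(rewards)
    running = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        mask = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * mask - values[t]
        running = delta + gamma * lam * mask * running
        advantages[t] = running
        next_value = values[t]
    return advantages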
                ROLLING_EVENT.wait()
                GLOBAL_QUEUE.put((batch_s, batch_a, batch_r))
                if GLOBAL_QUEUE.qsize() >= MAX_QSIZE and not TERM_EVENT.is_set():
                    UPDATE_EVENT.set()
                    ROLLING_EVENT.clear()
                    # Clear buffer after model update
                    buffer_s.clear()
                    buffer_a.clear()
                    buffer_r.clear()

        print(' [*] Worker {} finish and exit'.format(self.wid))


if __name__ == '__main__':
    args = add_arguments()
    set_global_seed(1)

    if args.method == 'kl_pen':
        METHOD = dict(name='kl_pen', kl_target=0.01, lam=0.5)
    elif args.method == 'clip':
        METHOD = dict(name='clip', epsilon=0.2)
    else:
        raise NotImplementedError

    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    if not os.path.exists(args.logdir):
        os.makedirs(args.logdir)
    else:
        files = os.listdir(args.logdir)
        files = [os.path.join(args.logdir, fn) for fn in files]
        for f in files:
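
# Hedged sketch of the two PPO surrogate objectives selected via METHOD above
# (tensor names `ratio`, `adv`, `kl` are illustrative, not the script's own):
#
#     ratio = pi.prob(a) / (oldpi.prob(a) + 1e-8)
#     # 'clip': clipped surrogate
#     clip_loss = -tf.reduce_mean(tf.minimum(
#         ratio * adv,
#         tf.clip_by_value(ratio, 1. - METHOD['epsilon'],
#                          1. + METHOD['epsilon']) * adv))
#     # 'kl_pen': KL-penalized surrogate; METHOD['lam'] is adapted toward
#     # METHOD['kl_target'] during training
#     kl_pen_loss = -tf.reduce_mean(ratio * adv - METHOD['lam'] * kl)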
import os
import time

import numpy as np
import tensorflow as tf

import logger
from config import CONFIG as C
from model import Model
from runner import Runner
from utils import create_session, set_global_seed
from wrappers import SubprocVecEnv, make_atari

set_global_seed(113)
time_stamp = time.strftime("%m-%d-%y-%H:%M:%S")

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Only run on GPU 0


def evaluate(env, policy, nb_episodes):
    rewards = [0]
    for i in range(nb_episodes):
        s = env.reset()
        while True:
            a = policy.get_best_action(s)
            s, r, d, info = env.step(a)
            rewards[-1] += r
            if env.env.env.env.env.was_real_done:
                rewards.append(0)
                break
            if d:
                s = env.reset()
    return rewards
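
# Hedged helper: the env.env.env.env.env chain in evaluate() reaches the
# wrapper exposing was_real_done (an EpisodicLife-style wrapper). Walking the
# wrapper chain by attribute is more robust to wrapper ordering; the name
# below is illustrative, not part of the project.
def find_wrapper_attr(env, name):
    e = env
    while not hasattr(e, name) and hasattr(e, "env"):
        e = e.env
    return getattr(e, name)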