def mytest(env_name, eval_episode=10, num_init_traj=1, max_horizon=15,
           ensemble=1, gt=0, finetune=False, finetune_iter=41,
           finetune_proc=10, cem_iter=20):
    NUMS = {
        'HalfCheetahPT-v2': 6,
        'HopperPT-v2': 5,
        'Walker2dPT-v2': 8,
    }
    num = NUMS[env_name]
    if not finetune:
        policy_net = get_awr_network(env_name, num)
    else:
        policy_net = get_finetune_network(env_name, num,
                                          num_iter=finetune_iter,
                                          num_proc=finetune_proc)
    model = make_parallel(10, env_name, num=num, stochastic=False)
    env = make(env_name, num=num, resample_MP=True, stochastic=False)
    params = get_params(env)
    mean_params = np.array([0.5] * len(params))
    osi = CEMOSI(model, mean_params, iter_num=cem_iter, num_mutation=100,
                 num_elite=10, std=0.3)
    rewards, dist = online_osi(env, osi, policy_net,
                               num_init_traj=num_init_traj,
                               max_horizon=max_horizon,
                               eval_episodes=eval_episode, use_state=False,
                               print_timestep=10000, resample_MP=True,
                               ensemble=ensemble, online=0, gt=gt)
    rewards = np.array(rewards)
    print('l2 distance', dist)
    print('rewards', rewards)
    return {
        'mean': rewards.mean(),
        'std': rewards.std(),
        'min': rewards.min(),
        'max': rewards.max(),
        'dist': dist.mean(),
    }
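# Usage sketch (hypothetical values; assumes the checkpoints loaded by
# get_awr_network are available for the chosen environment):
#
#   stats = mytest('HopperPT-v2', eval_episode=5, ensemble=3)
#   print(stats['mean'], stats['dist'])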
def main():
    env_name = 'DartHopperPT-v1'
    num = 5
    policy_net = get_up_network(env_name, num)
    model = make_parallel(10, env_name, num=num, stochastic=False, done=False)
    env = make(env_name, num=num, resample_MP=True, stochastic=False)
    params = get_params(env)
    mean_params = np.array([0.5] * len(params))
    osi = CEMOSI(model, mean_params, iter_num=1, num_mutation=100,
                 num_elite=10, std=0.3)
    policy_net.set_params(mean_params)
    for ensemble_num in range(5, 6):
        osi_model = OSIModel(model, osi, ensemble=ensemble_num)
        osi.cem.iter_num = 10
        print(f'with {ensemble_num} ensemble(s)')
        evaluate(env, policy_net, osi_model, 30, 1, 15, use_state=False)
def osi_eval(eval_env, osi, policy, num_init_traj, max_horizon, eval_episodes,
             use_state=True, print_timestep=1000, resample_MP=True):
    # remember the env's resample flag so it can be restored afterwards
    resample_MP_init = eval_env.env.resample_MP
    rewards = []
    for episode in range(eval_episodes):
        osi.reset()
        # resample the model parameters once per episode, then freeze them
        eval_env.env.resample_MP = resample_MP
        eval_env.reset()
        eval_env.env.resample_MP = False
        for init_state, observations, actions, masks in collect_trajectories(
                eval_env, policy, num_init_traj, max_horizon, use_state):
            osi.update(init_state, observations, actions, masks)
        params = osi.get_params()
        print('found params', params)
        policy.set_params(params)
        rewards += eval_policy(policy, eval_env, eval_episodes=1,
                               use_state=use_state, set_gt_params=False,
                               timestep=1000,
                               print_timestep=print_timestep)[1]
        print(get_params(eval_env), params)
    mean, std = np.mean(rewards), np.std(rewards)
    print('mean, std', mean, std)
    eval_env.env.resample_MP = resample_MP_init
    return mean, std
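# Usage sketch (hypothetical setup mirroring the tests below):
#
#   mean, std = osi_eval(env, osi, policy_net, num_init_traj=1,
#                        max_horizon=15, eval_episodes=10, use_state=False)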
def test_model():
    env_name = 'DartHopperPT-v1'
    env = make_parallel(1, env_name, num=2)
    env2 = make(env_name, num=2, stochastic=False)
    batch_size = 30
    horizon = 100
    # collect a batch of initial states from the single-process env
    s = []
    for i in range(batch_size):
        env2.reset()
        s.append(get_state(env2))
    param = get_params(env2)
    params = np.array([param for _ in range(batch_size)])
    env2.env.noisy_input = False
    s = np.array(s)
    # random action sequences, one per batch element
    a = [[env2.action_space.sample() for _ in range(horizon)]
         for _ in range(batch_size)]
    a = np.array(a)
    # roll the single env forward a few steps for comparison
    for i in range(3):
        obs, _, done, _ = env2.step(a[-1][i])
        if done:
            break
    for i in tqdm.trange(1):
        r, obs, mask = env(params, s, a)
        print(obs[-1][:3])
def test_up_diff():
    env_name = 'HopperPT-v2'
    num = 5
    policy_net = get_awr_network(env_name, num)
    model = make_parallel(30, env_name, num=num, stochastic=False)
    env = make(env_name, num=num, resample_MP=True, stochastic=False)
    params = get_params(env)
    # leftover test parameters; only the last call takes effect
    #set_params(env, [0.55111654, 0.55281674, 0.46355396, 0.84531834, 0.58944066])
    set_params(env, [0.31851129, 0.93941556, 0.02147825, 0.43523052, 1.02611646])
    set_params(env, [0.94107358, 0.77519005, 0.44055224, 0.9369426, -0.03846457])
    set_params(env, [0.05039606, 0.14680257, 0.56502066, 0.25723492, 0.73810709])
    mean_params = np.array([0.5] * len(params))
    osi = DiffOSI(model, mean_params, 0.001, iter=100, momentum=0.9, eps=1e-3)
    policy_net.set_params(mean_params)
    # settings from the last run; the online update proved very useful
    online_osi(env, osi, policy_net, num_init_traj=5, max_horizon=15,
               eval_episodes=20, use_state=False, print_timestep=10000,
               resample_MP=True, online=0)
def eval_policy(policy, eval_env, eval_episodes=10, save_video=0,
                video_path="video{}.avi", timestep=int(1e9), use_state=True,
                set_gt_params=False, print_timestep=10000):
    avg_reward = 0.
    acc = []
    trajectories = []
    rewards = []
    for episode_id in tqdm.trange(eval_episodes):
        state, done = eval_env.reset(), False
        out = None  # video writer; only released if it was ever created
        if hasattr(policy, 'reset'):
            policy.reset()
        if set_gt_params:
            policy.set_params(get_params(eval_env))
        states = []
        actions = []
        for i in tqdm.trange(timestep):
            if i % print_timestep == print_timestep - 1:
                print('\n\n', avg_reward, "past: ", rewards, '\n\n')
            if use_state:
                state = get_state(eval_env)
            states.append(state.tolist())
            action = policy(state)
            actions.append(action.tolist())
            state, reward, done, info = eval_env.step(action)
            avg_reward += reward
            if done:
                break
        states.append(state.tolist())
        if out is not None:
            out.release()
        trajectories.append([states, actions])
        rewards.append(avg_reward)
        avg_reward = 0
    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: "
          f"{np.mean(rewards):.3f}, std: {np.std(rewards):.3f}")
    if len(acc) > 0:
        print(f"Evaluation success rate over {eval_episodes} episodes: "
              f"{np.mean(acc):.3f}")
    print("---------------------------------------")
    return trajectories, rewards
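# Usage sketch (assumes an env/policy pair as built in the tests below):
#
#   trajectories, rewards = eval_policy(policy_net, env, eval_episodes=5,
#                                       use_state=False)
#   print(np.mean(rewards))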
def collect_trajectories(eval_env, policy, num_traj, max_horizon, use_state,
                         use_done=True, random_policy=True):
    gt_params = get_params(eval_env)
    for i in range(num_traj):
        obs = eval_env.reset()
        set_params(eval_env, gt_params)
        init_state = get_state(eval_env)
        observations = None
        actions = None
        masks = None
        if use_state:
            obs = init_state
        policy.reset()
        for j in range(max_horizon):
            if random_policy:
                # explore with random actions
                action = eval_env.action_space.sample()
            else:
                action = policy(obs)
            obs, _, done, _ = eval_env.step(action)
            if observations is None:
                observations = np.zeros((max_horizon, len(obs)))
                actions = np.zeros((max_horizon, len(action)))
                actions -= 10000000  # sentinel value for unused action slots
                masks = np.zeros(max_horizon)
            observations[j] = obs
            actions[j] = action
            masks[j] = 1
            if use_state:
                # the trajectory always stores observations; only the
                # policy input is swapped for the full state
                obs = get_state(eval_env)
            if done and use_done:
                break
        if j == 0:
            # trajectory terminated after one step; too short to be useful
            continue
        yield init_state, observations, actions, masks
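# Each yielded tuple has a fixed layout:
#   init_state    (state_dim,)              state right after reset
#   observations  (max_horizon, obs_dim)    zero-padded past termination
#   actions       (max_horizon, act_dim)    sentinel-filled past termination
#   masks         (max_horizon,)            1 for valid steps, else 0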
def test_up_osi():
    #env_name = 'DartHopperPT-v1'
    env_name = 'HopperPT-v2'
    num = 5
    #policy_net = get_up_network(env_name, num)
    policy_net = get_awr_network(env_name, num)
    model = make_parallel(10, env_name, num=num, stochastic=False)
    env = make(env_name, num=num, resample_MP=True, stochastic=False)
    params = get_params(env)
    # leftover test parameters; only the last call takes effect
    #set_params(env, [0.55111654, 0.55281674, 0.46355396, 0.84531834, 0.58944066])
    set_params(env, [0.31851129, 0.93941556, 0.02147825, 0.43523052, 1.02611646])
    set_params(env, [0.94107358, 0.77519005, 0.44055224, 0.9369426, -0.03846457])
    set_params(env, [0.05039606, 0.14680257, 0.56502066, 0.25723492, 0.73810709])
    mean_params = np.array([0.5] * len(params))
    osi = CEMOSI(model, mean_params, iter_num=20, num_mutation=100,
                 num_elite=10, std=0.3)
    policy_net.set_params(mean_params)
    online_osi(env, osi, policy_net, num_init_traj=5, max_horizon=15,
               eval_episodes=30, use_state=False, print_timestep=10000,
               resample_MP=True, ensemble=1, online=0, gt=0)
def learn_with_dataset():
    env_name = 'DartHopperPT-v1'
    num = 5
    env = make(env_name, num)
    dataset = Dataset(env_name, num=5)
    params = get_params(env)
    mean_params = np.array([0.5] * len(params))
    model = make_parallel(10, env_name, num=num, stochastic=False, done=False)
    osi = CEMOSI(model, mean_params, iter_num=20, num_mutation=100,
                 num_elite=10, std=0.3)
    learner = OSIModel(model, osi, ensemble=3)
    osi.cem.iter_num = 10
    for test, train, train_online, params in zip(*dataset.data):
        # offline trajectories: (state, observations, actions) triples,
        # each with an all-ones mask appended
        learner.reset()
        trajs = [[train[j][i] for j in range(3)] + [np.ones((1,))]
                 for i in range(train[0].shape[0])]
        learner.fit(trajs)
        print('===========')
        #print(learner.osi.get_params(), params)
        print(cost(learner, [list(test) + [np.ones((1,))]]))
        # online trajectories: sampled near the test segment
        learner.reset()
        trajs = [[train_online[j][i] for j in range(3)] + [np.ones((1,))]
                 for i in range(train_online[0].shape[0])]
        learner.fit(trajs)
        #print(learner.osi.get_params(), params)
        print(cost(learner, [list(test) + [np.ones((1,))]]))
def main():
    load_dotenv('.env.general')
    config = load_config('config.yml')
    Path(config.logging.handlers.debug_file_handler.filename).parent.mkdir(
        parents=True, exist_ok=True)
    Path(config.logging.handlers.info_file_handler.filename).parent.mkdir(
        parents=True, exist_ok=True)
    logging.config.dictConfig(config.logging)

    _logger.info("Loading the data")
    x, y = load_training_data()
    x_train, x_test, y_train, y_test = split_data(x, y)

    with tempfile.TemporaryDirectory() as td:
        temp_dir = Path(td)
        mlflow.set_experiment(config.experiment.name)
        params = {}
        tags = {}
        metrics = {}
        artifacts = {}
        with mlflow.start_run():
            _logger.info("Fitting the preprocessor")
            preprocessor = get_preprocessor()
            preprocessor.fit(x_train, y_train)

            _logger.info("Preprocessing the training data")
            x_train_prep = preprocessor.transform(x_train)
            x_test_prep = preprocessor.transform(x_test)

            estimator_params, search_space = get_params()
            if search_space is None:
                estimator, estimator_tags, estimator_metrics, estimator_artifacts = train_run(
                    estimator_params=estimator_params,
                    x_train_prep=x_train_prep,
                    y_train=y_train,
                    x_test_prep=x_test_prep,
                    y_test=y_test,
                    temp_dir=temp_dir)
                model = make_pipeline(preprocessor, estimator)
                params.update(
                    {f"estimator_{k}": v for k, v in estimator_params.items()})
                tags.update(
                    {f"estimator_{k}": v for k, v in estimator_tags.items()})
                metrics.update(estimator_metrics)
                artifacts.update(estimator_artifacts)
            else:
                def hyperopt_objective(search_params):
                    # This function is called for each set of
                    # hyper-parameters being tested by HyperOpt.
                    run_name = str(len(trials) - 1)
                    ho_params = {}
                    ho_tags = {}
                    ho_metrics = {}
                    ho_artifacts = {}
                    search_params = flatten_params(search_params)
                    search_params = prep_params(search_params)
                    ho_estimator_params = estimator_params.copy()
                    ho_estimator_params.update(search_params)
                    with mlflow.start_run(nested=True, run_name=run_name):
                        ho_estimator, ho_estimator_tags, ho_estimator_metrics, ho_estimator_artifacts = train_run(
                            estimator_params=ho_estimator_params,
                            x_train_prep=x_train_prep,
                            y_train=y_train,
                            x_test_prep=x_test_prep,
                            y_test=y_test,
                            temp_dir=temp_dir / run_name)
                        ho_model = make_pipeline(preprocessor, ho_estimator)
                        ho_params.update({
                            f"estimator_{k}": v
                            for k, v in ho_estimator_params.items()
                        })
                        ho_tags.update({
                            f"estimator_{k}": v
                            for k, v in ho_estimator_tags.items()
                        })
                        ho_metrics.update(ho_estimator_metrics)
                        ho_artifacts.update(ho_estimator_artifacts)
                        ho_tags['hyperopt'] = True
                        log_sk_model(ho_model,
                                     registered_model_name=None,
                                     params=ho_params,
                                     tags=ho_tags,
                                     metrics=ho_metrics,
                                     artifacts=ho_artifacts)
                    # hyperopt minimizes, so turn the primary metric into a loss
                    loss = 1 - ho_metrics[config.evaluation.primary_metric]
                    return {
                        'loss': loss,
                        'status': STATUS_OK,
                        'model': ho_model,
                        'params': ho_params,
                        'tags': ho_tags,
                        'metrics': ho_metrics,
                        'artifacts': ho_artifacts
                    }

                trials = Trials()
                fmin(fn=hyperopt_objective,
                     space=search_space,
                     algo=tpe.suggest,
                     trials=trials,
                     max_evals=config.training.max_evals,
                     rstate=np.random.RandomState(1),
                     show_progressbar=False)
                model = trials.best_trial['result']['model']
                params = trials.best_trial['result']['params']
                tags = trials.best_trial['result']['tags']
                metrics = trials.best_trial['result']['metrics']
                artifacts = trials.best_trial['result']['artifacts']

            if config.evaluation.shap_analysis:
                _logger.info("Starting shap analysis")
                shap_tags, shap_artifacts = shap_analyse(
                    model=model, x=x_train, temp_dir=Path(temp_dir) / 'shap')
                tags.update(shap_tags)
                artifacts.update(shap_artifacts)
            else:
                _logger.info("Shap analysis skipped")

            log_sk_model(model,
                         registered_model_name=None,
                         params=params,
                         tags=tags,
                         metrics=metrics,
                         artifacts=artifacts)
    return (x_train, y_train, x_test, y_test), model, params, tags, metrics, artifacts
X = T.ftensor4()
Y = T.ftensor4()

# Network architecture
f1 = (5, channels, 3, 3)  # 5 filters of shape 3 x 3
filters = [f1]

# More layers can be added. With f2 and f3 enabled (and filters set to
# [f1, f2, f3]), the following would yield a network with 3 convolutional
# layers, followed by 3 deconvolutional layers:
f1 = (5, channels, 3, 3)
f2 = (20, f1[0], 3, 3)
# f3 = (10, f2[0], 3, 3)
filters = [f1]

filter_params, bias_params = model.get_params(img_x, filters)

# Model with dropout for training
# Note: dropout is implemented but not used (rate 0.0)
noise_out = model.model(X, filter_params, bias_params, 0.0, srng)
noise_out_flat = noise_out.flatten(2)

# Model without dropout for validating
pred_out = model.model(X, filter_params, bias_params, 0.0, srng)
pred_out_flat = pred_out.flatten(2)

flat_y = Y.flatten(2)

# Squared error, summed over pixels for each sample
L_noise = T.sum((flat_y - noise_out_flat)**2, axis=1)
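# A minimal sketch of compiling a training step from L_noise (assumes
# `theano` is imported alongside `theano.tensor as T`, that the params
# returned by model.get_params are shared variables, and a hypothetical
# fixed learning rate of 1e-3):
#
#   cost = L_noise.mean()
#   all_params = filter_params + bias_params
#   grads = T.grad(cost, all_params)
#   updates = [(p, p - 1e-3 * g) for p, g in zip(all_params, grads)]
#   train_fn = theano.function([X, Y], cost, updates=updates)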
def _get_data(self, path):
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    gt_params = []
    train_traj_offline = []  # trajs starting at random positions
    train_traj_online = []   # recent trajs
    test_traj = []
    eval_env = self.eval_env
    eval_env.env.resample_MP = False
    for i in tqdm.trange(self.total):
        # resample the model parameters once, then freeze them
        eval_env.env.resample_MP = True
        eval_env.reset()
        eval_env.env.resample_MP = False
        gt_param = get_params(eval_env)
        gt_params.append(gt_param)
        self.policy.set_params(gt_param)
        self.policy.reset()

        # collect a full training trajectory with the policy
        states = []
        observations = []
        actions = []
        obs = eval_env.reset()
        while True:
            action = self.policy(obs)
            states.append(get_state(eval_env))
            actions.append(action)
            obs, r, done, _ = eval_env.step(action)
            observations.append(obs)
            if done:
                break
        if len(observations) < self.max_horizon * 2:
            continue

        # traj: states, observations, actions, mask
        test_idx = np.random.randint(self.max_horizon,
                                     len(states) - self.max_horizon)
        test_traj.append(
            (states[test_idx],
             np.array(observations[test_idx:test_idx + self.max_horizon]),
             np.array(actions[test_idx:test_idx + self.max_horizon])))

        train = []
        train_online = []
        #for i in range(self.num_train):
        #    idx = np.random.randint(len(states))
        for idx in range(self.num_train):
            train.append((states[idx], observations[idx:idx + 1],
                          np.array(actions[idx:idx + 1])))
        #for idx in range(test_idx - self.max_horizon, test_idx - 1):
        for i in range(self.num_train):
            idx = np.random.randint(test_idx)
            train_online.append((states[idx], observations[idx:idx + 1],
                                 actions[idx:idx + 1]))
        train = [np.array([j[i] for j in train]) for i in range(3)]
        train_online = [np.array([j[i] for j in train_online])
                        for i in range(3)]
        train_traj_offline.append(train)
        train_traj_online.append(train_online)
    print(np.array(gt_params).shape)
    data = [test_traj, train_traj_offline, train_traj_online, gt_params]
    with open(path, 'wb') as f:
        pickle.dump(data, f)
    return data
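# Layout of the pickled data, as constructed above:
#   data[0]  test_traj           list of (state, observations, actions)
#   data[1]  train_traj_offline  per-env list of three stacked arrays
#                                (states, observations, actions)
#   data[2]  train_traj_online   same layout, sampled before test_idx
#   data[3]  gt_params           ground-truth model parameters per env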
def main(args):
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    torch.manual_seed(1234)

    save_dir_root = os.path.join(os.path.dirname(os.path.abspath(__file__)))
    if args.resume_epoch != 0:
        runs = sorted(glob.glob(os.path.join(save_dir_root, 'run', 'run_*')))
        run_id = int(runs[-1].split('_')[-1]) if runs else 0
    else:
        runs = sorted(glob.glob(os.path.join(save_dir_root, 'run', 'run_*')))
        run_id = int(runs[-1].split('_')[-1]) + 1 if runs else 0
    if args.run_id >= 0:
        run_id = args.run_id
    save_dir = os.path.join(save_dir_root, 'run', 'run_' + str(run_id))
    log_dir = os.path.join(
        save_dir, 'models',
        datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
    writer = SummaryWriter(log_dir=log_dir)

    net = DSSNet()
    # load the VGG16 encoder or a pretrained DSS model
    if args.load_pretrain is not None:
        pretrain_weights = torch.load(args.load_pretrain)
        pretrain_keys = list(pretrain_weights.keys())
        net_keys = list(net.state_dict().keys())
        for key in pretrain_keys:
            _key = key
            if _key in net_keys:
                net.state_dict()[_key].copy_(pretrain_weights[key])
            else:
                print('missing key: ', _key)
    print('created and initialized a DSS model.')
    net.cuda()

    lr_ = args.lr
    optimizer = optim.SGD(get_params(net, args.lr), momentum=args.momentum,
                          weight_decay=args.weight_decay)
    # optimizer = optim.Adam(get_params(net, 1e-6))
    criterion = dssloss()

    composed_transforms_tr = transforms.Compose([
        # trforms.FixedResize(size=(args.input_size, args.input_size)),
        trforms.Normalize_caffevgg(
            mean=(104.00698793, 116.66876762, 122.67891434),
            std=(1.0, 1.0, 1.0)),
        trforms.ToTensor()])
    composed_transforms_ts = transforms.Compose([
        # trforms.FixedResize(size=(args.input_size, args.input_size)),
        trforms.Normalize_caffevgg(
            mean=(104.00698793, 116.66876762, 122.67891434),
            std=(1.0, 1.0, 1.0)),
        trforms.ToTensor()])

    train_data = msrab.MSRAB(max_num_samples=-1, split="train",
                             transform=composed_transforms_tr)
    val_data = msrab.MSRAB(max_num_samples=-1, split="val",
                           transform=composed_transforms_ts)

    trainloader = DataLoader(train_data, batch_size=args.batch_size,
                             shuffle=False, num_workers=0)
    testloader = DataLoader(val_data, batch_size=1, shuffle=False,
                            num_workers=0)
    num_iter_tr = len(trainloader)
    num_iter_ts = len(testloader)

    nitrs = args.resume_epoch * num_iter_tr
    nsamples = args.resume_epoch * len(train_data)
    print('nitrs: %d num_iter_tr: %d' % (nitrs, num_iter_tr))
    print('nsamples: %d tot_num_samples: %d' % (nsamples, len(train_data)))

    aveGrad = 0
    global_step = 0
    epoch_losses = []
    recent_losses = []
    start_t = time.time()
    print('Training Network')
    best_f, cur_f = 0.0, 0.0

    for epoch in range(args.resume_epoch, args.nepochs):
        ### do validation
        if args.use_test == 1:
            cnt = 0
            sum_testloss = 0.0
            avg_mae = 0.0
            avg_prec, avg_recall = 0.0, 0.0
            if args.use_eval == 1:
                net.eval()
            for ii, sample_batched in enumerate(testloader):
                inputs, labels = sample_batched['image'], sample_batched['label']
                # forward pass of the mini-batch
                inputs, labels = Variable(inputs, requires_grad=True), Variable(labels)
                inputs, labels = inputs.cuda(), labels.cuda()
                with torch.no_grad():
                    outputs = net.forward(inputs)
                    loss = criterion(outputs, labels)
                sum_testloss += loss.item()
                predictions = [torch.nn.Sigmoid()(outputs_i) for outputs_i in outputs]
                if len(predictions) >= 7:
                    # average the selected side outputs
                    predictions = (predictions[2] + predictions[3]
                                   + predictions[4] + predictions[6]) / 4.0
                else:
                    predictions = predictions[0]
                # min-max normalize to [0, 1]
                predictions = ((predictions - predictions.min() + 1e-8)
                               / (predictions.max() - predictions.min() + 1e-8))
                avg_mae += eval_mae(predictions, labels).cpu().item()
                prec, recall = eval_pr(predictions, labels, 100)
                avg_prec, avg_recall = avg_prec + prec, avg_recall + recall
                cnt += predictions.size(0)
                if ii % num_iter_ts == num_iter_ts - 1:
                    mean_testloss = sum_testloss / num_iter_ts
                    avg_mae = avg_mae / num_iter_ts
                    avg_prec = avg_prec / num_iter_ts
                    avg_recall = avg_recall / num_iter_ts
                    # F-measure with beta^2 = 0.3
                    f = (1 + 0.3) * avg_prec * avg_recall / (0.3 * avg_prec + avg_recall)
                    f[f != f] = 0  # replace NaNs from zero division
                    maxf = f.max()
                    print('Validation:')
                    print('epoch: %d, numImages: %d testloss: %.2f mmae: %.4f maxf: %.4f' % (
                        epoch, cnt, mean_testloss, avg_mae, maxf))
                    writer.add_scalar('data/validloss', mean_testloss, nsamples)
                    writer.add_scalar('data/validmae', avg_mae, nsamples)
                    writer.add_scalar('data/validmaxf', maxf, nsamples)
                    cur_f = maxf
                    if cur_f > best_f:
                        save_path = os.path.join(save_dir, 'models',
                                                 args.model_name + '_best' + '.pth')
                        torch.save(net.state_dict(), save_path)
                        print("Save model at {}\n".format(save_path))
                        best_f = cur_f

        ### train one epoch
        net.train()
        epoch_losses = []
        for ii, sample_batched in enumerate(trainloader):
            inputs, labels = sample_batched['image'], sample_batched['label']
            inputs, labels = Variable(inputs, requires_grad=True), Variable(labels)
            global_step += inputs.data.shape[0]
            inputs, labels = inputs.cuda(), labels.cuda()

            outputs = net.forward(inputs)
            loss = criterion(outputs, labels)
            trainloss = loss.item()
            epoch_losses.append(trainloss)
            if len(recent_losses) < args.log_every:
                recent_losses.append(trainloss)
            else:
                recent_losses[nitrs % len(recent_losses)] = trainloss

            # backward the averaged gradient
            loss /= args.naver_grad
            loss.backward()
            aveGrad += 1
            nitrs += 1
            nsamples += args.batch_size

            # update the weights once every naver_grad forward passes
            if aveGrad % args.naver_grad == 0:
                optimizer.step()
                optimizer.zero_grad()
                aveGrad = 0

            if nitrs % args.log_every == 0:
                meanloss = sum(recent_losses) / len(recent_losses)
                print('epoch: %d ii: %d trainloss: %.2f timecost: %.2f secs' % (
                    epoch, ii, meanloss, time.time() - start_t))
                writer.add_scalar('data/trainloss', meanloss, nsamples)

            # show 10 * 3 image results each epoch
            if (ii < 50 and ii % 10 == 0) or (ii % max(1, (num_iter_tr // 10)) == 0):
                # if ii % 10 == 0:
                # undo the caffe-VGG normalization and flip BGR -> RGB
                tmp = inputs[:1].clone().cpu().data.numpy()
                tmp += np.array((104.00698793, 116.66876762, 122.67891434)).reshape(1, 3, 1, 1)
                tmp = np.ascontiguousarray(tmp[:, ::-1, :, :])
                tmp = torch.tensor(tmp).float()
                grid_image = make_grid(tmp, 3, normalize=True)
                writer.add_image('Image', grid_image, global_step)

                predictions = [nn.Sigmoid()(outputs_i)[:1] for outputs_i in outputs]
                final_prediction = (predictions[2] + predictions[3]
                                    + predictions[4] + predictions[6]) / 4.0
                predictions.append(final_prediction)
                predictions = torch.cat(predictions, dim=0)
                grid_image = make_grid(
                    utils.decode_seg_map_sequence(
                        predictions.narrow(1, 0, 1).detach().cpu().numpy()),
                    2, normalize=False, range=(0, 255))
                writer.add_image('Predicted label', grid_image, global_step)
                grid_image = make_grid(
                    utils.decode_seg_map_sequence(
                        torch.squeeze(labels[:1], 1).detach().cpu().numpy()),
                    3, normalize=False, range=(0, 255))
                writer.add_image('Groundtruth label', grid_image, global_step)

        meanloss = sum(epoch_losses) / len(epoch_losses)
        print('epoch: %d meanloss: %.2f' % (epoch, meanloss))
        writer.add_scalar('data/epochloss', meanloss, nsamples)

        ### save model
        if epoch % args.save_every == args.save_every - 1:
            save_path = os.path.join(save_dir, 'models',
                                     args.model_name + '_epoch-' + str(epoch) + '.pth')
            torch.save(net.state_dict(), save_path)
            print("Save model at {}\n".format(save_path))

        ### adjust lr
        if epoch % args.update_lr_every == args.update_lr_every - 1:
            lr_ = lr_ * 0.1
            print('current learning rate: ', lr_)
            optimizer = optim.SGD(get_params(net, lr_), momentum=args.momentum,
                                  weight_decay=args.weight_decay)
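# An alternative lr decay (a sketch): mutate the existing optimizer's
# param_groups in place, which keeps the SGD momentum buffers instead of
# discarding them when the optimizer is rebuilt as above, and preserves
# any per-group lr ratios set up by get_params:
#
#   for param_group in optimizer.param_groups:
#       param_group['lr'] *= 0.1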
def test_cem_osi():
    env_name = 'HopperPT-v3'
    num = 5
    from networks import get_td3_value
    #value_net = get_td3_value(env_name)
    value_net = None
    from policy import POLO, add_parser
    import argparse
    parser = argparse.ArgumentParser()
    add_parser(parser)
    args = parser.parse_args()
    args.num_proc = 20
    model = make_parallel(args.num_proc, env_name, num=num, stochastic=True)
    env = make(env_name, num=num, resample_MP=True)

    #args.iter_num = 2
    args.num_mutation = 500
    #args.num_mutation = 100
    args.iter_num = 5
    args.num_elite = 10
    policy_net = POLO(value_net, model, action_space=env.action_space,
                      add_actions=args.add_actions, horizon=args.horizon,
                      std=args.std, iter_num=args.iter_num,
                      initial_iter=args.initial_iter,
                      num_mutation=args.num_mutation,
                      num_elite=args.num_elite, alpha=0.1, trunc_norm=True,
                      lower_bound=env.action_space.low,
                      upper_bound=env.action_space.high)

    resample_MP = True
    env = make(env_name, num=num, resample_MP=resample_MP, stochastic=False)
    params = get_params(env)
    print("FIX PARAMETERS")
    # leftover test parameters; only the last call takes effect
    set_params(env, np.array([0.58093299, 0.05418986, 0.93399553, 0.1678795, 1.04150952]))
    set_params(env, [0.55111654, 0.55281674, 0.46355396, 0.84531834, 0.58944066])
    set_params(env, [0.31851129, 0.93941556, 0.02147825, 0.43523052, 1.02611646])
    set_params(env, [0.58589476, 0.11078934, 0.348238, 0.68130195, 0.98376274])
    mean_params = np.array([0.5] * len(params))
    osi = CEMOSI(model, mean_params, iter_num=20, num_mutation=100,
                 num_elite=10, std=0.3, ensemble_num=5)
    policy_net.set_params(mean_params)
    print(get_params(env))
    online_osi(env, osi, policy_net, num_init_traj=1, max_horizon=15,
               eval_episodes=10, use_state=True, print_timestep=10,
               resample_MP=resample_MP, online=0, ensemble=5)
def online_osi(eval_env, osi, policy, num_init_traj, max_horizon,
               eval_episodes, use_state=True, print_timestep=1000,
               resample_MP=True, online=True, ensemble=1, gt=False):
    # fix the seed so every run sees the same sequence of parameters
    from osi import seed
    seed(eval_env, 0)
    parameters = []
    for i in range(100):
        eval_env.reset()
        parameters.append(get_params(eval_env))

    resample_MP_init = eval_env.env.resample_MP
    rewards = []
    for episode in tqdm.trange(eval_episodes):
        osi.reset()
        eval_env.env.resample_MP = resample_MP
        eval_env.reset()
        eval_env.env.resample_MP = False
        # use the pre-sampled parameters for this episode
        set_params(eval_env, parameters[episode])

        for init_state, observations, actions, masks in collect_trajectories(
                eval_env, policy, num_init_traj, max_horizon, use_state):
            osi.update(init_state, observations, actions, masks)
        print('gt', get_params(eval_env))
        if gt:
            # oracle baseline: use the ground-truth parameters directly
            params = get_params(eval_env)
        else:
            params = osi.find_min(ensemble, method='all')  # get a good initialization
        policy.set_params(params)
        dist = np.linalg.norm(params - get_params(eval_env), axis=-1)

        # five evaluation runs per sampled parameter set
        for trial in range(5):
            reward = 0
            obs, state = eval_env.reset(), get_state(eval_env)
            policy.reset()
            states = []
            observations = []
            actions = []
            states.append(state)
            for i in range(1000):
                if use_state:
                    action = policy(state)
                else:
                    action = policy(obs)
                obs, r, done, _ = eval_env.step(action)
                state = get_state(eval_env)
                states.append(state)
                observations.append(obs)
                actions.append(action)
                if i % print_timestep == print_timestep - 1:
                    print('\n\n', reward, "past: ", rewards[-10:], len(rewards), '\n\n')
                if (i % max_horizon == max_horizon - 1
                        and i > max_horizon + 3 and online):
                    k = i // max_horizon
                    if k % online == online - 1:
                        # refine the parameter estimate with the latest segment
                        idx = i - max_horizon - 1
                        osi.update(states[idx],
                                   observations[idx:idx + max_horizon],
                                   actions[idx:idx + max_horizon],
                                   1, maxlen=3)
                        tmp = osi.cem.iter_num  # saved, though never restored
                        #osi.cem.iter_num = 5
                        osi.cem.iter_num = 10  # we need at least 10 iterations??
                        osi.cem.std = 0.1
                        osi.cem.num_mutation = 100
                        osi.cem.num_elite = 5
                        # blend old and new estimates; unclear if this is optimal
                        params = params * 0.5 + osi.get_params() * 0.5
                        policy.set_params(params)
                        print(params, get_params(eval_env))
                        print('\n\n', reward, "past: ", rewards[-10:], len(rewards), '\n\n')
                reward += r
                if done:
                    break
            rewards.append(reward)

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes (5 runs each): "
          f"{np.mean(rewards):.3f}, std: {np.std(rewards):.3f}")
    print("---------------------------------------")
    # restore the env's original resample flag
    eval_env.env.resample_MP = resample_MP_init
    return rewards, dist
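# Note: online_osi returns the per-episode rewards together with `dist`,
# the L2 error between the identified and ground-truth parameters of the
# final evaluation episode only.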