Code Example #1
File: ablation.py Project: hzaskywalker/AWR-Python
def mytest(env_name,
           eval_episode=10,
           num_init_traj=1,
           max_horizon=15,
           ensemble=1,
           gt=0,
           finetune=False,
           finetune_iter=41,
           finetune_proc=10,
           cem_iter=20):
    NUMS = {
        'HalfCheetahPT-v2': 6,
        'HopperPT-v2': 5,
        'Walker2dPT-v2': 8,
    }

    num = NUMS[env_name]

    if not finetune:
        policy_net = get_awr_network(env_name, num)
    else:
        policy_net = get_finetune_network(env_name,
                                          num,
                                          num_iter=finetune_iter,
                                          num_proc=finetune_proc)

    model = make_parallel(10, env_name, num=num, stochastic=False)
    env = make(env_name, num=num, resample_MP=True, stochastic=False)

    params = get_params(env)
    mean_params = np.array([0.5] * len(params))
    osi = CEMOSI(model,
                 mean_params,
                 iter_num=cem_iter,
                 num_mutation=100,
                 num_elite=10,
                 std=0.3)

    rewards, dist = online_osi(env,
                               osi,
                               policy_net,
                               num_init_traj=num_init_traj,
                               max_horizon=max_horizon,
                               eval_episodes=eval_episode,
                               use_state=False,
                               print_timestep=10000,
                               resample_MP=True,
                               ensemble=ensemble,
                               online=0,
                               gt=gt)
    rewards = np.array(rewards)
    print('l2 distance', dist)
    print('rewards', rewards)
    return {
        'mean': rewards.mean(),
        'std': rewards.std(),
        'min': rewards.min(),
        'max': rewards.max(),
        'dist': dist.mean(),
    }
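A possible driver for mytest() (not part of the original file; the environment name is one of the keys in NUMS, every other value is purely illustrative) could sweep the ensemble size and report the returned summary dict:

# Hypothetical usage sketch for mytest(); argument values are illustrative only.
if __name__ == '__main__':
    for ensemble in (1, 3, 5):
        stats = mytest('HopperPT-v2',
                       eval_episode=10,
                       num_init_traj=1,
                       max_horizon=15,
                       ensemble=ensemble)
        print(f"ensemble={ensemble}: reward {stats['mean']:.1f} "
              f"+/- {stats['std']:.1f}, param l2 error {stats['dist']:.3f}")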
Code Example #2
def main():

    env_name = 'DartHopperPT-v1'
    num = 5
    policy_net = get_up_network(env_name, num)

    model = make_parallel(10, env_name, num=num, stochastic=False, done=False)
    env = make(env_name, num=num, resample_MP=True, stochastic=False)

    params = get_params(env)
    mean_params = np.array([0.5] * len(params))

    osi = CEMOSI(model,
                 mean_params,
                 iter_num=1,
                 num_mutation=100,
                 num_elite=10,
                 std=0.3)
    policy_net.set_params(mean_params)

    for ensemble_num in range(5, 6):
        haha = OSIModel(model, osi, ensemble=ensemble_num)
        osi.cem.iter_num = 10
        evaluate(env, policy_net, haha, 30, 1, 15, use_state=False)
        print(f'with {ensemble_num} ensemble')
Code Example #3
def osi_eval(eval_env, osi, policy, num_init_traj, max_horizon, eval_episodes, use_state=True, print_timestep=1000, resample_MP=True):


    resample_MP_init = eval_env.env.resample_MP
    rewards = []
    for episode in range(eval_episodes):
        osi.reset()

        eval_env.env.resample_MP = resample_MP
        eval_env.reset()
        eval_env.env.resample_MP = False

        for init_state, observations, actions, masks in collect_trajectories(eval_env, policy, num_init_traj, max_horizon, use_state):
            osi.update(init_state, observations, actions, masks)

        params = osi.get_params()
        print('find params', params)
        policy.set_params(params)

        rewards += eval_policy(policy, eval_env, eval_episodes=1, use_state=use_state, set_gt_params=False, timestep=1000, print_timestep=print_timestep)[1]
        print(get_params(eval_env), params)

    mean, std = np.mean(rewards), np.std(rewards)
    print('mean, std', mean, std)

    eval_env.env.resample_MP = resample_MP_init
    return mean, std
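osi_eval only touches the osi argument through reset(), update(...) and get_params(); a minimal stand-in that satisfies this duck-typed interface (useful for exercising the evaluation loop, an assumption rather than code from the repository) might look like:

class ConstantOSI:
    # Toy stand-in: always reports the same parameters, ignoring the data.
    def __init__(self, params):
        self._params = np.asarray(params, dtype=np.float64)

    def reset(self):
        pass  # a real identifier would clear its trajectory buffer here

    def update(self, init_state, observations, actions, masks):
        pass  # a real identifier would refit its parameter estimate here

    def get_params(self):
        return self._params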
Code Example #4
def test_model():
    env_name = 'DartHopperPT-v1'
    env = make_parallel(1, env_name, num=2)

    env2 = make(env_name, num=2, stochastic=False)
    batch_size = 30
    horizon = 100

    s = []
    for i in range(batch_size):
        env2.reset()
        s.append(get_state(env2))

    param = get_params(env2)
    params = np.array([param for i in range(batch_size)])
    env2.env.noisy_input = False

    s = np.array(s)
    a = [[env2.action_space.sample() for j in range(horizon)]
         for i in range(batch_size)]
    a = np.array(a)

    for i in range(3):
        obs, _, done, _ = env2.step(a[-1][i])
        if done:
            break

    for i in tqdm.trange(1):
        r, obs, mask = env(params, s, a)
    print(obs[-1][:3])
Code Example #5
File: osi.py Project: hzaskywalker/AWR-Python
def test_up_diff():
    env_name = 'HopperPT-v2'
    num = 5

    policy_net = get_awr_network(env_name, num)

    model = make_parallel(30, env_name, num=num, stochastic=False)
    env = make(env_name, num=num, resample_MP=True, stochastic=False)

    params = get_params(env)
    #set_params(env, [0.55111654,0.55281674,0.46355396,0.84531834,0.58944066])
    set_params(env,
               [0.31851129, 0.93941556, 0.02147825, 0.43523052, 1.02611646])
    set_params(env,
               [0.94107358, 0.77519005, 0.44055224, 0.9369426, -0.03846457])
    set_params(env,
               [0.05039606, 0.14680257, 0.56502066, 0.25723492, 0.73810709])

    mean_params = np.array([0.5] * len(params))
    osi = DiffOSI(model, mean_params, 0.001, iter=100, momentum=0.9, eps=1e-3)
    policy_net.set_params(mean_params)

    # This was the configuration used in the final run;
    # the online updates turned out to be very useful.
    online_osi(env,
               osi,
               policy_net,
               num_init_traj=5,
               max_horizon=15,
               eval_episodes=20,
               use_state=False,
               print_timestep=10000,
               resample_MP=True,
               online=0)
Code Example #6
def eval_policy(policy, eval_env, eval_episodes=10, save_video=0, video_path="video{}.avi", timestep=int(1e9), use_state=True, set_gt_params=False, print_timestep=10000):

    avg_reward = 0.
    acc = []

    trajectories = []
    rewards = []
    for episode_id in tqdm.trange(eval_episodes):
        state, done = eval_env.reset(), False

        out = None
        if hasattr(policy, 'reset'):
            policy.reset()

        if set_gt_params:
            policy.set_params(get_params(eval_env))

        #while not done:
        states = []
        actions = []
        for i in tqdm.trange(timestep):
            if i % print_timestep == print_timestep-1:
                print('\n\n', avg_reward, "past: ", rewards, '\n\n')

            if use_state:
                state = get_state(eval_env)
            states.append(state.tolist())
            action = policy(state)
            actions.append(action.tolist())
            state, reward, done, info = eval_env.step(action)
            avg_reward += reward
            if done:
                break
        states.append(state.tolist())

        if out is not None:
            out.release()
        trajectories.append([states, actions])

        rewards.append(avg_reward)
        avg_reward = 0


    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {np.mean(rewards):.3f},  std: {np.std(rewards)}")
    if len(acc) > 0:
        print(f"Evaluation success rate over {eval_episodes} episodes: {np.mean(acc):.3f}")
    print("---------------------------------------")
    return trajectories, rewards
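eval_policy only requires the policy to be callable and, optionally, to expose reset() and set_params(); a random-action baseline (a sketch assuming a Gym-style eval_env, not part of the project) can therefore be dropped in directly:

class RandomPolicy:
    # Memoryless baseline that ignores the observation entirely.
    def __init__(self, action_space):
        self.action_space = action_space

    def reset(self):
        pass  # nothing to reset

    def __call__(self, state):
        return np.asarray(self.action_space.sample())

# trajectories, rewards = eval_policy(RandomPolicy(eval_env.action_space),
#                                     eval_env, eval_episodes=5,
#                                     use_state=False)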
Code Example #7
def collect_trajectories(eval_env, policy, num_traj, max_horizon, use_state, use_done=True, random_policy=True):
    gt_params = get_params(eval_env)

    for i in range(num_traj):
        obs = eval_env.reset()
        set_params(eval_env, gt_params)

        init_state = get_state(eval_env)
        observations = None
        actions = None
        masks = None

        if use_state:
            obs = init_state

        policy.reset()
        for j in range(max_horizon):
            if np.random.random() > 0 and random_policy: # explore
                action = eval_env.action_space.sample()
            else:
                action = policy(obs)
            obs, _, done, _ = eval_env.step(action)

            if observations is None:
                observations = np.zeros((max_horizon, len(obs)))
                actions = np.zeros((max_horizon, len(action)))
                actions -= 10000000
                masks = np.zeros(max_horizon)

            observations[j] = obs
            actions[j] = action
            masks[j] = 1


            if use_state:
                # the recorded trajectory always stores raw observations;
                # only the policy input is swapped to the full state
                obs = get_state(eval_env)
            if done and use_done:
                break

        if j == 0:
            continue

        yield init_state, observations, actions, masks
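collect_trajectories is a generator; osi_eval and online_osi consume it by passing each yielded tuple to osi.update. A stand-alone consumer that simply stacks the yielded trajectories (a sketch, not repository code) could look like:

def stack_trajectories(eval_env, policy, num_traj=5, max_horizon=15):
    # Collect the yielded tuples into batched arrays.
    init_states, all_obs, all_actions, all_masks = [], [], [], []
    for init_state, observations, actions, masks in collect_trajectories(
            eval_env, policy, num_traj, max_horizon, use_state=False):
        init_states.append(init_state)
        all_obs.append(observations)
        all_actions.append(actions)
        all_masks.append(masks)
    # note: trajectories that terminate at the very first step are skipped,
    # so the leading dimension may be smaller than num_traj
    return (np.array(init_states), np.array(all_obs),
            np.array(all_actions), np.array(all_masks))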
Code Example #8
File: osi.py Project: hzaskywalker/AWR-Python
def test_up_osi():
    #env_name = 'DartHopperPT-v1'
    env_name = 'HopperPT-v2'
    num = 5

    #policy_net = get_up_network(env_name, num)
    policy_net = get_awr_network(env_name, num)

    model = make_parallel(10, env_name, num=num, stochastic=False)
    env = make(env_name, num=num, resample_MP=True, stochastic=False)

    params = get_params(env)
    #set_params(env, [0.55111654,0.55281674,0.46355396,0.84531834,0.58944066])
    set_params(env,
               [0.31851129, 0.93941556, 0.02147825, 0.43523052, 1.02611646])
    set_params(env,
               [0.94107358, 0.77519005, 0.44055224, 0.9369426, -0.03846457])
    set_params(env,
               [0.05039606, 0.14680257, 0.56502066, 0.25723492, 0.73810709])

    mean_params = np.array([0.5] * len(params))
    osi = CEMOSI(model,
                 mean_params,
                 iter_num=20,
                 num_mutation=100,
                 num_elite=10,
                 std=0.3)
    policy_net.set_params(mean_params)

    online_osi(env,
               osi,
               policy_net,
               num_init_traj=5,
               max_horizon=15,
               eval_episodes=30,
               use_state=False,
               print_timestep=10000,
               resample_MP=True,
               ensemble=1,
               online=0,
               gt=0)
Code Example #9
def learn_with_dataset():

    env_name = 'DartHopperPT-v1'
    num = 5

    env = make(env_name, num)

    dataset = Dataset(env_name, num=5)
    params = get_params(env)
    mean_params = np.array([0.5] * len(params))

    model = make_parallel(10, env_name, num=num, stochastic=False, done=False)
    osi = CEMOSI(model,
                 mean_params,
                 iter_num=20,
                 num_mutation=100,
                 num_elite=10,
                 std=0.3)

    learner = OSIModel(model, osi, ensemble=3)
    osi.cem.iter_num = 10

    for test, train, train_online, params in zip(*dataset.data):
        learner.reset()
        trajs = [[train[j][i] for j in range(3)] + [np.ones((1, ))]
                 for i in range(train[0].shape[0])]
        learner.fit(trajs)
        print('===========')
        #print(learner.osi.get_params(), params)
        print(cost(learner, [list(test) + [np.ones((1, ))]]))

        learner.reset()
        trajs = [[train_online[j][i] for j in range(3)] + [np.ones((1, ))]
                 for i in range(train_online[0].shape[0])]
        learner.fit(trajs)
        #print(learner.osi.get_params(), params)
        print(cost(learner, [list(test) + [np.ones((1, ))]]))
Code Example #10
def main():
    load_dotenv('.env.general')
    config = load_config('config.yml')
    Path(config.logging.handlers.debug_file_handler.filename).parent.mkdir(
        parents=True, exist_ok=True)
    Path(config.logging.handlers.info_file_handler.filename).parent.mkdir(
        parents=True, exist_ok=True)
    logging.config.dictConfig(config.logging)

    _logger.info("Loading the data")
    x, y = load_training_data()
    x_train, x_test, y_train, y_test = split_data(x, y)

    with tempfile.TemporaryDirectory() as td:
        temp_dir = Path(td)
        mlflow.set_experiment(config.experiment.name)

        params = {}
        tags = {}
        metrics = {}
        artifacts = {}

        with mlflow.start_run():
            _logger.info("Fitting the preprocessor")
            preprocessor = get_preprocessor()
            preprocessor.fit(x_train, y_train)

            _logger.info("Preprocessing the training data")
            x_train_prep = preprocessor.transform(x_train)
            x_test_prep = preprocessor.transform(x_test)

            estimator_params, search_space = get_params()

            if search_space is None:
                estimator, estimator_tags, estimator_metrics, estimator_artifacts = train_run(
                    estimator_params=estimator_params,
                    x_train_prep=x_train_prep,
                    y_train=y_train,
                    x_test_prep=x_test_prep,
                    y_test=y_test,
                    temp_dir=temp_dir)

                model = make_pipeline(preprocessor, estimator)
                params.update(
                    {f"estimator_{k}": v
                     for k, v in estimator_params.items()})
                tags.update(
                    {f"estimator_{k}": v
                     for k, v in estimator_tags.items()})
                metrics.update(estimator_metrics)
                artifacts.update(estimator_artifacts)

            else:

                def hyperopt_objective(search_params):
                    # This function is called for each set of hyper-parameters being tested by HyperOpt.
                    run_name = str(len(trials) - 1)
                    ho_params = {}
                    ho_tags = {}
                    ho_metrics = {}
                    ho_artifacts = {}

                    search_params = flatten_params(search_params)
                    search_params = prep_params(search_params)
                    ho_estimator_params = estimator_params.copy()
                    ho_estimator_params.update(search_params)

                    with mlflow.start_run(nested=True, run_name=run_name):
                        ho_estimator, ho_estimator_tags, ho_estimator_metrics, ho_estimator_artifacts = train_run(
                            estimator_params=ho_estimator_params,
                            x_train_prep=x_train_prep,
                            y_train=y_train,
                            x_test_prep=x_test_prep,
                            y_test=y_test,
                            temp_dir=temp_dir / run_name)

                        ho_model = make_pipeline(preprocessor, ho_estimator)
                        ho_params.update({
                            f"estimator_{k}": v
                            for k, v in ho_estimator_params.items()
                        })
                        ho_tags.update({
                            f"estimator_{k}": v
                            for k, v in ho_estimator_tags.items()
                        })
                        ho_metrics.update(ho_estimator_metrics)
                        ho_artifacts.update(ho_estimator_artifacts)

                        ho_tags['hyperopt'] = True

                        log_sk_model(ho_model,
                                     registered_model_name=None,
                                     params=ho_params,
                                     tags=ho_tags,
                                     metrics=ho_metrics,
                                     artifacts=ho_artifacts)

                    loss = 1 - ho_metrics[config.evaluation.primary_metric]

                    return {
                        'loss': loss,
                        'status': STATUS_OK,
                        'model': ho_model,
                        'params': ho_params,
                        'tags': ho_tags,
                        'metrics': ho_metrics,
                        'artifacts': ho_artifacts
                    }

                trials = Trials()
                fmin(fn=hyperopt_objective,
                     space=search_space,
                     algo=tpe.suggest,
                     trials=trials,
                     max_evals=config.training.max_evals,
                     rstate=np.random.RandomState(1),
                     show_progressbar=False)

                model = trials.best_trial['result']['model']
                params = trials.best_trial['result']['params']
                tags = trials.best_trial['result']['tags']
                metrics = trials.best_trial['result']['metrics']
                artifacts = trials.best_trial['result']['artifacts']

            if config.evaluation.shap_analysis:
                _logger.info("Starting shap analysis")
                shap_tags, shap_artifacts = shap_analyse(
                    model=model, x=x_train, temp_dir=Path(temp_dir) / 'shap')
                tags.update(shap_tags)
                artifacts.update(shap_artifacts)
            else:
                _logger.info("Shap analysis skipped")

            log_sk_model(model,
                         registered_model_name=None,
                         params=params,
                         tags=tags,
                         metrics=metrics,
                         artifacts=artifacts)

    return (x_train, y_train, x_test,
            y_test), model, params, tags, metrics, artifacts
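For reference, the hyperopt pattern used above (an objective returning a loss dict, TPE search over a space, best-trial lookup) reduces to the following self-contained sketch with a toy objective:

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

def objective(params):
    # toy loss: squared distance of x from 3, just to exercise the search
    return {'loss': (params['x'] - 3.0) ** 2, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective,
            space={'x': hp.uniform('x', -10.0, 10.0)},
            algo=tpe.suggest,
            trials=trials,
            max_evals=50,
            show_progressbar=False)
print(best, trials.best_trial['result']['loss'])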
Code Example #11
File: train.py Project: MaGold/Caustics
X = T.ftensor4()
Y = T.ftensor4()

# Network architecture
f1 = (5, channels, 3, 3)  # 5 filters of shape 3 x 3
filters = [f1]

# More layers can be added.
# The following would yield a network with 3 convolutional layers,
# followed by 3 deconvolutional layers
f1 = (5, channels, 3, 3)
f2 = (20, f1[0], 3, 3)
# f3 = (10, f2[0], 3, 3)
filters = [f1]

filter_params, bias_params = model.get_params(img_x, filters)

# Model with dropout for training
# Note: dropout is implemented but not used
noise_out = model.model(X, filter_params, bias_params, 0.0, srng)
noise_out_flat = noise_out.flatten(2)

# Model without dropout for validating
pred_out = model.model(X, filter_params, bias_params, 0.0, srng)
pred_out_flat = pred_out.flatten(2)

flat_y = Y.flatten(2)

# Mean Squared Error
L_noise = T.sum((flat_y - noise_out_flat)**2, axis=1)
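The excerpt stops at the per-sample squared error; to train, this is typically reduced to a scalar and compiled into an update function. A minimal continuation, assuming filter_params and bias_params are lists of Theano shared variables (which is how model.get_params appears to be used), might be:

# Hypothetical continuation: scalar cost plus a plain SGD training function.
import theano

cost = T.mean(L_noise)
params = filter_params + bias_params
grads = T.grad(cost, params)
lr = 0.01
updates = [(p, p - lr * g) for p, g in zip(params, grads)]
train_fn = theano.function([X, Y], cost, updates=updates,
                           allow_input_downcast=True)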
Code Example #12
    def _get_data(self, path):
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return pickle.load(f)

        gt_params = []

        train_traj_offline = []  # trajs at random position..
        train_traj_online = []  # recent trajs
        test_traj = []

        eval_env = self.eval_env
        eval_env.env.resample_MP = False

        for i in tqdm.trange(self.total):
            eval_env.env.resample_MP = True
            eval_env.reset()
            eval_env.env.resample_MP = False
            gt_param = get_params(eval_env)
            gt_params.append(gt_param)
            self.policy.set_params(gt_param)
            self.policy.reset()

            # collect_train_traj
            states = []
            observations = []
            actions = []

            obs = eval_env.reset()

            while True:
                # collect the whole trajectories
                # policy 1
                action = self.policy(obs)
                states.append(get_state(eval_env))
                actions.append(action)

                obs, r, done, _ = eval_env.step(action)
                observations.append(obs)
                if done:
                    break

            if len(observations) < self.max_horizon * 2:
                continue

            # traj: states, observation, actions, mask
            test_idx = np.random.randint(self.max_horizon,
                                         len(states) - self.max_horizon)
            test_traj.append(
                (states[test_idx],
                 np.array(observations[test_idx:test_idx + self.max_horizon]),
                 np.array(actions[test_idx:test_idx + self.max_horizon])))

            train = []
            train_online = []
            #for i in range(self.num_train):
            #idx = np.random.randint(len(states))
            for idx in range(self.num_train):
                train.append((states[idx], observations[idx:idx + 1],
                              np.array(actions[idx:idx + 1])))

            #for idx in range(test_idx - self.max_horizon, test_idx-1):
            for i in range(self.num_train):
                idx = np.random.randint(test_idx)
                train_online.append((states[idx], observations[idx:idx + 1],
                                     actions[idx:idx + 1]))
            train = [np.array([j[i] for j in train]) for i in range(3)]
            train_online = [
                np.array([j[i] for j in train_online]) for i in range(3)
            ]

            train_traj_offline.append(train)
            train_traj_online.append(train_online)

        print(np.array(gt_params).shape)
        data = [test_traj, train_traj_offline, train_traj_online, gt_params]
        with open(path, 'wb') as f:
            pickle.dump(data, f)
        return data
Code Example #13
File: train.py Project: lhaof/DSS-Pytorch
def main(args):
	# os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
	torch.manual_seed(1234)
	save_dir_root = os.path.join(os.path.dirname(os.path.abspath(__file__)))
	if args.resume_epoch != 0:
		runs = sorted(glob.glob(os.path.join(save_dir_root, 'run', 'run_*')))
		run_id = int(runs[-1].split('_')[-1]) if runs else 0
	else:
		runs = sorted(glob.glob(os.path.join(save_dir_root, 'run', 'run_*')))
		run_id = int(runs[-1].split('_')[-1]) + 1 if runs else 0

	if args.run_id >= 0:
		run_id = args.run_id

	save_dir = os.path.join(save_dir_root, 'run', 'run_' + str(run_id))
	log_dir = os.path.join(save_dir, 'models', datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
	writer = SummaryWriter(log_dir=log_dir)

	net = DSSNet()
	# load VGG16 encoder or pretrained DSS
	if args.load_pretrain is not None:
		pretrain_weights = torch.load(args.load_pretrain)
		pretrain_keys = list(pretrain_weights.keys())
		net_keys = list(net.state_dict().keys())
		for key in pretrain_keys:
			_key = key 
			if _key in net_keys:
				net.state_dict()[_key].copy_(pretrain_weights[key])
			else:
				print('missing key: ',_key)
	print('created and initialized a DSS model.')
	net.cuda()

	lr_ = args.lr
	optimizer = optim.SGD(get_params(net, args.lr),momentum=args.momentum,weight_decay=args.weight_decay)

	# optimizer = optim.Adam(get_params(net, 1e-6))

	criterion = dssloss()

	composed_transforms_tr = transforms.Compose([
		# trforms.FixedResize(size=(args.input_size, args.input_size)),
		trforms.Normalize_caffevgg(mean=(104.00698793,116.66876762,122.67891434), std=(1.0,1.0,1.0)),
		trforms.ToTensor()])
	
	composed_transforms_ts = transforms.Compose([
		# trforms.FixedResize(size=(args.input_size, args.input_size)),
		trforms.Normalize_caffevgg(mean=(104.00698793,116.66876762,122.67891434), std=(1.0,1.0,1.0)),
		trforms.ToTensor()])

	train_data = msrab.MSRAB(max_num_samples=-1, split="train", transform=composed_transforms_tr)
	val_data = msrab.MSRAB(max_num_samples=-1, split="val", transform=composed_transforms_ts)

	trainloader = DataLoader(train_data, batch_size=args.batch_size, shuffle=False, num_workers=0)
	testloader = DataLoader(val_data, batch_size=1, shuffle=False, num_workers=0)

	num_iter_tr = len(trainloader)
	num_iter_ts = len(testloader)
	nitrs = args.resume_epoch * num_iter_tr
	nsamples = args.resume_epoch * len(train_data) 
	print('nitrs: %d num_iter_tr: %d'%(nitrs, num_iter_tr))
	print('nsamples: %d tot_num_samples: %d'%(nsamples, len(train_data)))

	aveGrad = 0
	global_step = 0
	epoch_losses = []
	recent_losses = []
	start_t = time.time()
	print('Training Network')

	best_f, cur_f = 0.0, 0.0
	lr_ = args.lr
	for epoch in range(args.resume_epoch,args.nepochs):

		### do validation
		if args.use_test == 1:
			cnt = 0
			sum_testloss = 0.0

			avg_mae = 0.0
			avg_prec, avg_recall = 0.0, 0.0

			if args.use_eval == 1:
				net.eval()
			for ii, sample_batched in enumerate(testloader):
				inputs, labels = sample_batched['image'], sample_batched['label']

				# Forward pass of the mini-batch
				inputs, labels = Variable(inputs, requires_grad=True), Variable(labels)
				inputs, labels = inputs.cuda(), labels.cuda()

				with torch.no_grad():
					outputs = net.forward(inputs)
					loss = criterion(outputs, labels)
				sum_testloss += loss.item()
				
				predictions = [torch.nn.Sigmoid()(outputs_i) for outputs_i in outputs]
				if len(predictions) >= 7: 
					predictions = (predictions[2]+predictions[3]+predictions[4]+predictions[6]) / 4.0
				else:
					predictions = predictions[0]
				predictions = (predictions-predictions.min()+1e-8) / (predictions.max()-predictions.min()+1e-8)

				avg_mae += eval_mae(predictions, labels).cpu().item()
				prec, recall = eval_pr(predictions, labels, 100)
				avg_prec, avg_recall = avg_prec + prec, avg_recall + recall

				cnt += predictions.size(0)
				
				if ii % num_iter_ts == num_iter_ts-1:
					mean_testloss = sum_testloss / num_iter_ts
					avg_mae = avg_mae / num_iter_ts
					avg_prec = avg_prec / num_iter_ts
					avg_recall = avg_recall / num_iter_ts
					f = (1+0.3) * avg_prec * avg_recall / (0.3 * avg_prec + avg_recall)
					f[f != f] = 0 # delete the nan
					maxf = f.max()

					print('Validation:')
					print('epoch: %d, numImages: %d testloss: %.2f mmae: %.4f maxf: %.4f' % (
						epoch, cnt, mean_testloss, avg_mae, maxf))
					writer.add_scalar('data/validloss', mean_testloss, nsamples)
					writer.add_scalar('data/validmae', avg_mae, nsamples)
					writer.add_scalar('data/validmaxf', maxf, nsamples)

					cur_f = maxf
					if cur_f > best_f:
						save_path = os.path.join(save_dir, 'models', args.model_name + '_best' + '.pth')
						torch.save(net.state_dict(), save_path)
						print("Save model at {}\n".format(save_path))
						best_f = cur_f


		### train one epoch
		net.train()
		epoch_losses = []
		for ii, sample_batched in enumerate(trainloader):
			
			inputs, labels = sample_batched['image'], sample_batched['label']
			inputs, labels = Variable(inputs, requires_grad=True), Variable(labels) 
			global_step += inputs.data.shape[0] 
			inputs, labels = inputs.cuda(), labels.cuda()

			outputs = net.forward(inputs)
			loss = criterion(outputs, labels)
			trainloss = loss.item()
			epoch_losses.append(trainloss)
			if len(recent_losses) < args.log_every:
				recent_losses.append(trainloss)
			else:
				recent_losses[nitrs % len(recent_losses)] = trainloss

			# Backward the averaged gradient
			loss /= args.naver_grad
			loss.backward()
			aveGrad += 1
			nitrs += 1
			nsamples += args.batch_size

			# Update the weights once in p['nAveGrad'] forward passes
			if aveGrad % args.naver_grad == 0:
				optimizer.step()
				optimizer.zero_grad()
				aveGrad = 0

			if nitrs % args.log_every == 0:
				meanloss = sum(recent_losses) / len(recent_losses)
				print('epoch: %d ii: %d trainloss: %.2f timecost:%.2f secs'%(
					epoch,ii,meanloss,time.time()-start_t))
				writer.add_scalar('data/trainloss',meanloss,nsamples)

			# Show 10 * 3 images results each epoch
			if (ii < 50 and ii % 10 == 0) or (ii % max(1, (num_iter_tr // 10)) == 0):
			# if ii % 10 == 0:
				tmp = inputs[:1].clone().cpu().data.numpy()
				tmp += np.array((104.00698793,116.66876762,122.67891434)).reshape(1, 3, 1, 1)
				tmp = np.ascontiguousarray(tmp[:, ::-1, :, :])
				tmp = torch.tensor(tmp).float()
				grid_image = make_grid(tmp, 3, normalize=True)
				writer.add_image('Image', grid_image, global_step)
				
				predictions = [nn.Sigmoid()(outputs_i)[:1] for outputs_i in outputs]
				final_prediction = (predictions[2]+predictions[3]+predictions[4]+predictions[6]) / 4.0
				predictions.append(final_prediction)
				predictions = torch.cat(predictions, dim=0)

				grid_image = make_grid(utils.decode_seg_map_sequence(predictions.narrow(1, 0, 1).detach().cpu().numpy()), 2, normalize=False, range=(0, 255))
				writer.add_image('Predicted label', grid_image, global_step)

				grid_image = make_grid(utils.decode_seg_map_sequence(torch.squeeze(labels[:1], 1).detach().cpu().numpy()), 3, normalize=False, range=(0, 255))
				writer.add_image('Groundtruth label', grid_image, global_step)


		meanloss = sum(epoch_losses) / len(epoch_losses)
		print('epoch: %d meanloss: %.2f'%(epoch,meanloss))
		writer.add_scalar('data/epochloss', meanloss, nsamples)


		### save model
		if epoch % args.save_every == args.save_every - 1:
			save_path = os.path.join(save_dir, 'models', args.model_name + '_epoch-' + str(epoch) + '.pth')
			torch.save(net.state_dict(), save_path)
			print("Save model at {}\n".format(save_path))


		### adjust lr
		if epoch % args.update_lr_every == args.update_lr_every - 1:
			lr_ = lr_ * 0.1
			print('current learning rate: ', lr_)
			optimizer = optim.SGD(get_params(net, lr_),momentum=args.momentum,weight_decay=args.weight_decay)
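The learning-rate schedule above rebuilds the SGD optimizer, which discards its momentum buffers. An alternative sketch (not from the repository) scales the learning rate of the existing parameter groups in place:

def decay_learning_rate(optimizer, factor=0.1):
    # Scale the lr of every param group while keeping optimizer state.
    for group in optimizer.param_groups:
        group['lr'] *= factor
    return optimizer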
Code Example #14
File: osi.py Project: hzaskywalker/AWR-Python
def test_cem_osi():
    env_name = 'HopperPT-v3'
    num = 5

    from networks import get_td3_value
    #value_net = get_td3_value(env_name)
    value_net = None

    from policy import POLO, add_parser
    import argparse

    parser = argparse.ArgumentParser()
    add_parser(parser)
    args = parser.parse_args()
    args.num_proc = 20

    model = make_parallel(args.num_proc, env_name, num=num, stochastic=True)
    env = make(env_name, num=num, resample_MP=True)

    #args.iter_num = 2
    args.num_mutation = 500
    #args.num_mutation = 100
    args.iter_num = 5
    args.num_elite = 10

    policy_net = POLO(value_net,
                      model,
                      action_space=env.action_space,
                      add_actions=args.add_actions,
                      horizon=args.horizon,
                      std=args.std,
                      iter_num=args.iter_num,
                      initial_iter=args.initial_iter,
                      num_mutation=args.num_mutation,
                      num_elite=args.num_elite,
                      alpha=0.1,
                      trunc_norm=True,
                      lower_bound=env.action_space.low,
                      upper_bound=env.action_space.high)

    resample_MP = True
    env = make(env_name, num=num, resample_MP=resample_MP, stochastic=False)

    params = get_params(env)

    print("FIXXXXXXXXXXXXXXXXXXXXXXPARAMETERS")
    set_params(
        env,
        np.array([0.58093299, 0.05418986, 0.93399553, 0.1678795, 1.04150952]))
    set_params(env,
               [0.55111654, 0.55281674, 0.46355396, 0.84531834, 0.58944066])
    set_params(env,
               [0.31851129, 0.93941556, 0.02147825, 0.43523052, 1.02611646])
    set_params(env, [0.58589476, 0.11078934, 0.348238, 0.68130195, 0.98376274])

    mean_params = np.array([0.5] * len(params))
    osi = CEMOSI(model,
                 mean_params,
                 iter_num=20,
                 num_mutation=100,
                 num_elite=10,
                 std=0.3,
                 ensemble_num=5)
    policy_net.set_params(mean_params)
    print(get_params(env))

    online_osi(env,
               osi,
               policy_net,
               num_init_traj=1,
               max_horizon=15,
               eval_episodes=10,
               use_state=True,
               print_timestep=10,
               resample_MP=resample_MP,
               online=0,
               ensemble=5)
Code Example #15
def online_osi(eval_env, osi, policy, num_init_traj, max_horizon, eval_episodes, use_state=True, print_timestep=1000, resample_MP=True, online=True, ensemble=1, gt=False):
    # fix the seed...
    from osi import seed 
    seed(eval_env, 0)
    parameters = []
    for i in range(100):
        eval_env.reset()
        parameters.append(get_params(eval_env))

    resample_MP_init = eval_env.env.resample_MP
    rewards = []
    for episode in tqdm.trange(eval_episodes):
        osi.reset()

        eval_env.env.resample_MP = resample_MP
        eval_env.reset()
        eval_env.env.resample_MP = False
        if parameters is not None:
            set_params(eval_env, parameters[episode])

        for init_state, observations, actions, masks in collect_trajectories(eval_env, policy, num_init_traj, max_horizon, use_state):
            osi.update(init_state, observations, actions, masks)

        #params = osi.get_params()
        print('gt', get_params(eval_env))
        if gt:
            params = get_params(eval_env)
        else:
            params = osi.find_min(ensemble, method='all') # get a good initialization
        policy.set_params(params)
        #print(params, get_params(eval_env))
        dist = np.linalg.norm((params - get_params(eval_env)), axis=-1)

        total_rewards = []

        for xx in range(5):
            reward = 0
            obs, state = eval_env.reset(), get_state(eval_env)
            policy.reset()

            states = []
            observations = []
            actions = []
            states.append(state)  # record the initial full state

            for i in range(1000):
                if use_state:
                    action = policy(state)
                else:
                    action = policy(obs)

                obs, r, done, _ = eval_env.step(action)
                state = get_state(eval_env)
                states.append(state)


                observations.append(obs)
                actions.append(action)

                if i % print_timestep == print_timestep - 1:
                    print('\n\n', reward, "past: ", rewards[-10:], len(rewards), '\n\n')

                if i % max_horizon == max_horizon - 1 and i > max_horizon + 3 and online:
                    xx = i//max_horizon
                    if xx % online == online - 1:
                        idx = i - max_horizon - 1
                        osi.update(states[idx], observations[idx:idx+max_horizon], actions[idx:idx+max_horizon], 1, maxlen=3)
                        tmp = osi.cem.iter_num
                        #osi.cem.iter_num = 5 # we need at least 10 iterations??
                        osi.cem.iter_num = 10 # we need at least 10 iterations??
                        osi.cem.std = 0.1
                        osi.cem.num_mutation = 100
                        osi.cem.num_elite = 5
                        params = params * 0.5 + osi.get_params() * 0.5 # don't know if this is ok
                        policy.set_params(params)
                    print(params, get_params(eval_env))
                    print('\n\n', reward, "past: ", rewards[-10:], len(rewards), '\n\n')


                reward += r
                #if i % print_timestep == print_timestep-1 or done:
                #    print('\n\n', reward, "past: ", rewards[-10:], len(rewards), '\n\n')
                if done:
                    break
            rewards.append(reward)


    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {np.mean(rewards):.3f},  std: {np.std(rewards)}")
    print("---------------------------------------")
    return rewards, dist
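online_osi returns the per-episode rewards together with the parameter-estimation distance; a caller can aggregate them the same way mytest() in Code Example #1 does (note that, as written, dist holds only the value computed for the last episode). The call values below are illustrative assumptions:

# Illustrative aggregation of online_osi's return values.
rewards, dist = online_osi(env, osi, policy_net,
                           num_init_traj=1, max_horizon=15,
                           eval_episodes=10, use_state=False)
rewards = np.array(rewards)
print('reward mean/std:', rewards.mean(), rewards.std())
print('parameter l2 error (last episode):', np.mean(dist))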