Code example #1
File: bc.py  Project: slin70/OfflineRL
def algo_init(args):
    logger.info('Run algo_init function')

    setup_seed(args['seed'])
    
    if args["obs_shape"] and args["action_shape"]:
        obs_shape, action_shape = args["obs_shape"], args["action_shape"]
        max_action = args["max_action"]
    elif "task" in args.keys():
        from offlinerl.utils.env import get_env_shape, get_env_action_range
        obs_shape, action_shape = get_env_shape(args['task'])
        max_action, _ = get_env_action_range(args["task"])
        args["obs_shape"], args["action_shape"] = obs_shape, action_shape
    else:
        raise NotImplementedError
    
    net_a = Net(layer_num=args['actor_layers'], 
                state_shape=obs_shape, 
                hidden_layer_size=args['actor_features'])

    actor = TanhGaussianPolicy(preprocess_net=net_a,
                               action_shape=action_shape,
                               hidden_layer_size=args['actor_features'],
                               conditioned_sigma=True).to(args['device'])

    actor_optim = torch.optim.Adam(actor.parameters(), lr=args['actor_lr'])

    return {
        "actor" : {"net" : actor, "opt" : actor_optim},
    }
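
For reference, a minimal call into the algo_init above might look like the following; the hyperparameter values are illustrative assumptions, not defaults shipped with the OfflineRL project:

args = {
    'seed': 42,
    'obs_shape': 17, 'action_shape': 6, 'max_action': 1.0,
    # alternatively set 'obs_shape'/'action_shape' to None and provide 'task'
    'actor_layers': 2, 'actor_features': 256,
    'actor_lr': 3e-4,
    'device': 'cuda',
}
nets = algo_init(args)
actor, actor_optim = nets['actor']['net'], nets['actor']['opt']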
Code example #2
def launch_ope(config):
    ''' run on a seed '''
    setup_seed(config['seed'])
    if config['ope'] == 'fqe':
        evaluator = FQEEvaluator()
    elif config['ope'] == 'is':
        evaluator = ISEvaluator()
    else:
        # guard against an unknown OPE method leaving `evaluator` undefined
        raise NotImplementedError

    train_dataset, val_dataset = get_neorl_datasets(config["domain"],
                                                    config['level'],
                                                    config['amount'])

    evaluator.initialize(train_dataset=train_dataset, val_dataset=val_dataset)

    exp_folder = os.path.join(config['task_folder'], config['exp_name'])
    with open(os.path.join(exp_folder, 'metric_logs.json'), 'r') as f:
        metrics = json.load(f)

    max_step = str(max(map(int, metrics.keys())))
    gt = metrics[max_step]['Reward_Mean_Env']

    policy_file = os.path.join(exp_folder, 'models', f'{max_step}.pt')
    policy = torch.load(policy_file)
    ope = evaluator(policy)

    return {
        'gt': gt,
        'ope': ope,
        'policy_file': policy_file,
        'exp_name': config['exp_name'],
        'seed': config['seed'],
    }
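
A sketch of the config dict that launch_ope reads, based only on the keys used above; the concrete values (domain, data level and amount, folder layout) are assumptions for illustration:

config = {
    'seed': 7,
    'ope': 'fqe',                    # or 'is'
    'domain': 'HalfCheetah-v3',      # forwarded to get_neorl_datasets
    'level': 'low',
    'amount': 1000,
    'task_folder': './logs/halfcheetah-low-1000',
    'exp_name': 'cql_seed7',
}
result = launch_ope(config)
print(result['gt'], result['ope'])   # ground-truth return vs. off-policy estimate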
Code example #3
File: bcqd.py  Project: slin70/OfflineRL
def algo_init(args):
    logger.info('Run algo_init function')

    setup_seed(args['seed'])

    if args["obs_shape"] and args["action_shape"]:
        obs_shape, action_shape = args["obs_shape"], args["action_shape"]
    elif "task" in args.keys():
        from offlinerl.utils.env import get_env_shape
        obs_shape, action_shape = get_env_shape(args['task'])
        args["obs_shape"], args["action_shape"] = obs_shape, action_shape
    else:
        raise NotImplementedError

    if isinstance(args["obs_shape"], int):
        state_dim = (4, 84, 84)

        critic = Conv_Q(state_dim[0], args["action_shape"]).to(args['device'])
    else:
        critic = FC_Q(np.prod(args["obs_shape"]),
                      args["action_shape"]).to(args['device'])

    critic_opt = optim.Adam(critic.parameters(),
                            **args["optimizer_parameters"])

    nets = {
        "critic": {
            "net": critic,
            "opt": critic_opt
        },
    }

    return nets
Code example #4
File: bcq.py  Project: slin70/OfflineRL
def algo_init(args):
    logger.info('Run algo_init function')

    setup_seed(args['seed'])

    if args["obs_shape"] and args["action_shape"]:
        obs_shape, action_shape = args["obs_shape"], args["action_shape"]
        max_action = args["max_action"]
    elif "task" in args.keys():
        from offlinerl.utils.env import get_env_shape, get_env_action_range
        obs_shape, action_shape = get_env_shape(args['task'])
        max_action, _ = get_env_action_range(args["task"])
        args["obs_shape"], args["action_shape"] = obs_shape, action_shape
    else:
        raise NotImplementedError

    vae = VAE(obs_shape, action_shape, args['vae_features'],
              args['vae_layers'], max_action).to(args['device'])
    vae_optim = torch.optim.Adam(vae.parameters(), lr=args['vae_lr'])

    jitter = Jitter(obs_shape, action_shape, args['jitter_features'],
                    args['jitter_layers'], max_action,
                    args['phi']).to(args['device'])
    jitter_optim = torch.optim.Adam(jitter.parameters(), lr=args['jitter_lr'])

    q1 = MLP(obs_shape + action_shape,
             1,
             args['value_features'],
             args['value_layers'],
             hidden_activation='relu').to(args['device'])
    q2 = MLP(obs_shape + action_shape,
             1,
             args['value_features'],
             args['value_layers'],
             hidden_activation='relu').to(args['device'])
    critic_optim = torch.optim.Adam([*q1.parameters(), *q2.parameters()],
                                    lr=args['critic_lr'])

    return {
        "vae": {
            "net": vae,
            "opt": vae_optim
        },
        "jitter": {
            "net": jitter,
            "opt": jitter_optim
        },
        "critic": {
            "net": [q1, q2],
            "opt": critic_optim
        },
    }
Code example #5
def algo_init(args):
    logger.info('Run algo_init function')

    setup_seed(args['seed'])
    
    if args["obs_shape"] and args["action_shape"]:
        obs_shape, action_shape = args["obs_shape"], args["action_shape"]
    elif "task" in args.keys():
        from offlinerl.utils.env import get_env_shape
        obs_shape, action_shape = get_env_shape(args['task'])
        args["obs_shape"], args["action_shape"] = obs_shape, action_shape
    else:
        raise NotImplementedError
    
    transition = EnsembleTransition(obs_shape, action_shape,
                                    args['hidden_layer_size'],
                                    args['transition_layers'],
                                    args['transition_init_num']).to(args['device'])
    transition_optim = torch.optim.Adam(transition.parameters(),
                                        lr=args['transition_lr'],
                                        weight_decay=0.000075)

    net_a = Net(layer_num=args['hidden_layers'], 
                state_shape=obs_shape, 
                hidden_layer_size=args['hidden_layer_size'])

    actor = TanhGaussianPolicy(preprocess_net=net_a,
                               action_shape=action_shape,
                               hidden_layer_size=args['hidden_layer_size'],
                               conditioned_sigma=True).to(args['device'])

    actor_optim = torch.optim.Adam(actor.parameters(), lr=args['actor_lr'])

    log_alpha = torch.zeros(1, requires_grad=True, device=args['device'])
    alpha_optimizer = torch.optim.Adam([log_alpha], lr=args["actor_lr"])

    q1 = MLP(obs_shape + action_shape, 1, args['hidden_layer_size'],
             args['hidden_layers'], norm=None,
             hidden_activation='swish').to(args['device'])
    q2 = MLP(obs_shape + action_shape, 1, args['hidden_layer_size'],
             args['hidden_layers'], norm=None,
             hidden_activation='swish').to(args['device'])
    critic_optim = torch.optim.Adam([*q1.parameters(), *q2.parameters()],
                                    lr=args['actor_lr'])

    return {
        "transition" : {"net" : transition, "opt" : transition_optim},
        "actor" : {"net" : actor, "opt" : actor_optim},
        "log_alpha" : {"net" : log_alpha, "opt" : alpha_optimizer},
        "critic" : {"net" : [q1, q2], "opt" : critic_optim},
    }
Code example #6
File: pretrain_dynamics.py  Project: ssimonc/NeoRL
def training_dynamics(config):
    if config["task"] == 'finance' and config["amount"] == 10000:
        return {
            'performance': [],
            'path': '',
        }

    seed = config['seed']
    setup_seed(seed)

    train_buffer, val_buffer = load_data_from_neorl(config["task"],
                                                    config["level"],
                                                    config["amount"])

    obs_shape = train_buffer['obs'].shape[-1]
    action_shape = train_buffer['act'].shape[-1]

    device = 'cuda'

    hidden_units = 1024 if config["task"] in ['ib', 'finance', 'citylearn'
                                              ] else 256
    transition = EnsembleTransition(obs_shape, action_shape, hidden_units, 4,
                                    7).to(device)
    transition_optim = torch.optim.AdamW(transition.parameters(),
                                         lr=1e-3,
                                         weight_decay=0.000075)

    data_size = len(train_buffer)
    val_size = min(int(data_size * 0.2) + 1, 1000)
    train_size = data_size - val_size
    train_splits, val_splits = torch.utils.data.random_split(
        range(data_size), (train_size, val_size))
    valdata = train_buffer[val_splits.indices]
    train_buffer = train_buffer[train_splits.indices]

    batch_size = 256

    # one running validation loss per ensemble member (7 members initialised above)
    val_losses = [float('inf') for _ in range(7)]

    epoch = 0
    cnt = 0

    # train until no ensemble member has improved on the validation set
    # for 5 consecutive epochs
    while True:
        epoch += 1
        # each of the 7 members trains on its own bootstrap resample of the data
        idxs = np.random.randint(train_buffer.shape[0],
                                 size=[7, train_buffer.shape[0]])
        for batch_num in range(int(np.ceil(idxs.shape[-1] / batch_size))):
            batch_idxs = idxs[:, batch_num * batch_size:(batch_num + 1) *
                              batch_size]
            batch = train_buffer[batch_idxs]
            _train_transition(transition, batch, transition_optim, device)
        new_val_losses = _eval_transition(transition, valdata, device)

        indexes = []
        for i, new_loss, old_loss in zip(range(len(val_losses)),
                                         new_val_losses, val_losses):
            if new_loss < old_loss:
                indexes.append(i)
                val_losses[i] = new_loss

        if len(indexes) > 0:
            transition.update_save(indexes)
            cnt = 0
        else:
            cnt += 1

        if cnt >= 5:
            break

    # keep the ensemble members with the lowest validation loss
    indexes = _select_best_indexes(val_losses, n=5)
    transition.set_select(indexes)
    performance = _eval_transition(transition, valdata, device)
    transition_path = os.path.join(
        config['dynamics_path'],
        f'{config["task"]}-{config["level"]}-{config["amount"]}-{seed}.pt')

    torch.save(transition, transition_path)

    return {
        'performance': performance,
        'path': transition_path,
    }
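
_select_best_indexes is not shown in this snippet. A minimal sketch of what it could look like, assuming it simply returns the indices of the n ensemble members with the lowest validation loss (a guess for readability, not the project's actual implementation):

def _select_best_indexes(metrics, n):
    # rank ensemble members by validation loss and keep the n best
    ranked = sorted(range(len(metrics)), key=lambda i: metrics[i])
    return ranked[:n]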
Code example #7
File: cql.py  Project: slin70/OfflineRL
def algo_init(args):
    logger.info('Run algo_init function')

    setup_seed(args['seed'])

    if args["obs_shape"] and args["action_shape"]:
        obs_shape, action_shape = args["obs_shape"], args["action_shape"]
    elif "task" in args.keys():
        from offlinerl.utils.env import get_env_shape
        obs_shape, action_shape = get_env_shape(args['task'])
        args["obs_shape"], args["action_shape"] = obs_shape, action_shape
    else:
        raise NotImplementedError

    net_a = Net(layer_num=args['layer_num'],
                state_shape=obs_shape,
                hidden_layer_size=args['hidden_layer_size'])

    actor = TanhGaussianPolicy(
        preprocess_net=net_a,
        action_shape=action_shape,
        hidden_layer_size=args['hidden_layer_size'],
        conditioned_sigma=True,
    ).to(args['device'])

    actor_optim = optim.Adam(actor.parameters(), lr=args['actor_lr'])

    net_c1 = Net(layer_num=args['layer_num'],
                 state_shape=obs_shape,
                 action_shape=action_shape,
                 concat=True,
                 hidden_layer_size=args['hidden_layer_size'])
    critic1 = Critic(
        preprocess_net=net_c1,
        hidden_layer_size=args['hidden_layer_size'],
    ).to(args['device'])
    critic1_optim = optim.Adam(critic1.parameters(), lr=args['critic_lr'])

    net_c2 = Net(layer_num=args['layer_num'],
                 state_shape=obs_shape,
                 action_shape=action_shape,
                 concat=True,
                 hidden_layer_size=args['hidden_layer_size'])
    critic2 = Critic(
        preprocess_net=net_c2,
        hidden_layer_size=args['hidden_layer_size'],
    ).to(args['device'])
    critic2_optim = optim.Adam(critic2.parameters(), lr=args['critic_lr'])

    if args["use_automatic_entropy_tuning"]:
        if args["target_entropy"]:
            target_entropy = args["target_entropy"]
        else:
            target_entropy = -np.prod(args["action_shape"]).item()
        log_alpha = torch.zeros(1, requires_grad=True, device=args['device'])
        alpha_optimizer = optim.Adam(
            [log_alpha],
            lr=args["actor_lr"],
        )

    nets = {
        "actor": {
            "net": actor,
            "opt": actor_optim
        },
        "critic1": {
            "net": critic1,
            "opt": critic1_optim
        },
        "critic2": {
            "net": critic2,
            "opt": critic2_optim
        },
        "log_alpha": {
            "net": log_alpha,
            "opt": alpha_optimizer,
            "target_entropy": target_entropy
        },
    }

    if args["lagrange_thresh"] >= 0:
        target_action_gap = args["lagrange_thresh"]
        log_alpha_prime = torch.zeros(1,
                                      requires_grad=True,
                                      device=args['device'])
        alpha_prime_optimizer = optim.Adam(
            [log_alpha_prime],
            lr=args["critic_lr"],
        )

        nets.update({
            "log_alpha_prime": {
                "net": log_alpha_prime,
                "opt": alpha_prime_optimizer
            }
        })

    return nets
Code example #8
def algo_init(args):
    logger.info('Run algo_init function')
    
    setup_seed(args['seed'])

    if args["obs_shape"] and args["action_shape"]:
        obs_shape, action_shape = args["obs_shape"], args["action_shape"]
        max_action = args["max_action"]
    elif "task" in args.keys():
        from offlinerl.utils.env import get_env_shape, get_env_action_range
        obs_shape, action_shape = get_env_shape(args['task'])
        max_action, _ = get_env_action_range(args["task"])
        args["obs_shape"], args["action_shape"] = obs_shape, action_shape
    else:
        raise NotImplementedError
        
    latent_dim = action_shape * 2
    vae = VAE(state_dim=obs_shape,
              action_dim=action_shape,
              latent_dim=latent_dim,
              max_action=max_action,
              hidden_size=args["vae_hidden_size"]).to(args['device'])

    vae_opt = optim.Adam(vae.parameters(), lr=args["vae_lr"])

    if args["latent"]:
        actor = ActorPerturbation(obs_shape, 
                                  action_shape, 
                                  latent_dim, 
                                  max_action,
                                  max_latent_action=2, 
                                  phi=args['phi']).to(args['device'])
        
    else:
        net_a = Net(layer_num = args["layer_num"], 
                    state_shape = obs_shape, 
                    hidden_layer_size = args["hidden_layer_size"])
        actor = Actor(preprocess_net = net_a,
                     action_shape = latent_dim,
                     max_action = max_action,
                     hidden_layer_size = args["hidden_layer_size"]).to(args['device'])

    
    actor_opt = optim.Adam(actor.parameters(), lr=args["actor_lr"])
    
    net_c1 = Net(layer_num=args['layer_num'],
                 state_shape=obs_shape,
                 action_shape=action_shape,
                 concat=True,
                 hidden_layer_size=args['hidden_layer_size'])
    critic1 = Critic(preprocess_net=net_c1,
                     hidden_layer_size=args['hidden_layer_size']).to(args['device'])
    critic1_opt = optim.Adam(critic1.parameters(), lr=args['critic_lr'])

    net_c2 = Net(layer_num=args['layer_num'],
                 state_shape=obs_shape,
                 action_shape=action_shape,
                 concat=True,
                 hidden_layer_size=args['hidden_layer_size'])
    critic2 = Critic(preprocess_net=net_c2,
                     hidden_layer_size=args['hidden_layer_size']).to(args['device'])
    critic2_opt = optim.Adam(critic2.parameters(), lr=args['critic_lr'])

    return {
        "vae" : {"net" : vae, "opt" : vae_opt},
        "actor" : {"net" : actor, "opt" : actor_opt},
        "critic1" : {"net" : critic1, "opt" : critic1_opt},
        "critic2" : {"net" : critic2, "opt" : critic2_opt},
    }