Code Example #1
def experiment(args):
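    # DDPG on a vectorized environment: seed everything, build a
    # BaseReplayBuffer sized per parallel env, a fixed-variance Gaussian
    # policy and a Q-network, then collect with VecCollector and train.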

    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    env = get_vec_env(params["env_name"], params["env"], args.vec_env_nums)

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    buffer_param = params['replay_buffer']

    experiment_name = os.path.split(
        os.path.splitext(args.config)[0])[-1] if args.id is None \
        else args.id
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir, args.overwrite)

    params['general_setting']['env'] = env

    replay_buffer = BaseReplayBuffer(
        env_nums=args.vec_env_nums,
        max_replay_buffer_size=int(buffer_param['size']),
        time_limit_filter=buffer_param['time_limit_filter'])
    params['general_setting']['replay_buffer'] = replay_buffer

    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase
    params['net']['activation_func'] = torch.nn.Tanh

    pf = policies.FixGuassianContPolicy(
        input_shape=env.observation_space.shape[0],
        output_shape=env.action_space.shape[0],
        **params['net'],
        **params['policy'])
    qf = networks.QNet(input_shape=env.observation_space.shape[0] +
                       env.action_space.shape[0],
                       output_shape=1,
                       **params['net'])

    print(pf)
    print(qf)
    params['general_setting']['collector'] = VecCollector(
        env=env,
        pf=pf,
        replay_buffer=replay_buffer,
        device=device,
        train_render=False,
        **params["collector"])
    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")
    agent = DDPG(pf=pf, qf=qf, **params["ddpg"], **params["general_setting"])
    agent.train()
Code Example #2
File: example.py  Project: lgh0504/torchrl
def experiment(args):
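    # SAC on a single environment: value, Q and Gaussian policy networks,
    # a uniform pretraining policy, and transition collection with
    # BaseCollector.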

    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    env = get_env(params['env_name'], params['env'])

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.cuda:
        torch.backends.cudnn.deterministic = True

    buffer_param = params['replay_buffer']
    replay_buffer = BaseReplayBuffer(int(buffer_param['size']))

    experiment_name = os.path.split(
        os.path.splitext(args.config)[0])[-1] if args.id is None \
        else args.id
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)

    params['general_setting']['env'] = env
    params['general_setting']['replay_buffer'] = replay_buffer
    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase
    # agent = get_agent( params )
    # print(env)
    # params['general_setting']['collector'] = BaseCollector(
    #     env, pf, replay_buffer
    # )

    pf = policies.GuassianContPolicy(
        input_shape=env.observation_space.shape[0],
        output_shape=2 * env.action_space.shape[0],
        **params['net'])
    vf = networks.Net(input_shape=env.observation_space.shape[0],
                      output_shape=1,
                      **params['net'])
    qf = networks.FlattenNet(input_shape=env.observation_space.shape[0] +
                             env.action_space.shape[0],
                             output_shape=1,
                             **params['net'])
    pretrain_pf = policies.UniformPolicyContinuous(env.action_space.shape[0])

    params['general_setting']['collector'] = BaseCollector(env,
                                                           pf,
                                                           replay_buffer,
                                                           device=device)
    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")
    agent = SAC(pf=pf,
                vf=vf,
                qf=qf,
                pretrain_pf=pretrain_pf,
                **params['sac'],
                **params['general_setting'])
    agent.train()
Code Example #3
def experiment(args):
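    # Multi-task SAC on a meta-environment: a modular gated-cascade policy and
    # twin Q-networks conditioned on the task one-hot embedding, an
    # AsyncSharedReplayBuffer built from an example transition, and an
    # asynchronous multi-task parallel collector.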
    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    env, cls_dicts, cls_args = get_meta_env(params['env_name'], params['env'],
                                            params['meta_env'])

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.backends.cudnn.deterministic = True

    buffer_param = params['replay_buffer']

    experiment_name = os.path.split(os.path.splitext(args.config)[0])[-1] if args.id is None \
        else args.id
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)

    params['general_setting']['env'] = env
    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase

    import torch.multiprocessing as mp
    mp.set_start_method('spawn', force=True)

    # from torchrl.networks.init import normal_init

    example_ob = env.reset()  # reset task_id as well
    example_embedding = env.active_task_one_hot

    pf = policies.ModularGuassianGatedCascadeCondContPolicy(
        input_shape=env.observation_space.shape[0],
        em_input_shape=np.prod(example_embedding.shape),
        output_shape=2 * env.action_space.shape[0],
        **params['net'])

    if args.pf_snap is not None:
        pf.load_state_dict(torch.load(args.pf_snap, map_location='cpu'))

    qf1 = networks.FlattenModularGatedCascadeCondNet(
        input_shape=env.observation_space.shape[0] + env.action_space.shape[0],
        em_input_shape=np.prod(example_embedding.shape),
        output_shape=1,
        **params['net'])
    qf2 = networks.FlattenModularGatedCascadeCondNet(
        input_shape=env.observation_space.shape[0] + env.action_space.shape[0],
        em_input_shape=np.prod(example_embedding.shape),
        output_shape=1,
        **params['net'])

    if args.qf1_snap is not None:
        qf1.load_state_dict(torch.load(args.qf1_snap, map_location='cpu'))
    if args.qf2_snap is not None:
        qf2.load_state_dict(torch.load(args.qf2_snap, map_location='cpu'))

    example_dict = {
        "obs": example_ob,
        "next_obs": example_ob,
        "acts": env.action_space.sample(),
        "rewards": [0],
        "terminals": [False],
        "task_idxs": [0],
        "embedding_inputs": example_embedding
    }

    replay_buffer = AsyncSharedReplayBuffer(int(buffer_param['size']),
                                            args.worker_nums)
    replay_buffer.build_by_example(example_dict)

    params['general_setting']['replay_buffer'] = replay_buffer

    epochs = params['general_setting']['pretrain_epochs'] + \
             params['general_setting']['num_epochs']

    print(env.action_space)
    print(env.observation_space)
    params['general_setting'][
        'collector'] = AsyncMultiTaskParallelCollectorUniform(
            env=env,
            pf=pf,
            replay_buffer=replay_buffer,
            env_cls=cls_dicts,
            env_args=[params["env"], cls_args, params["meta_env"]],
            device=device,
            reset_idx=True,
            epoch_frames=params['general_setting']['epoch_frames'],
            max_episode_frames=params['general_setting']['max_episode_frames'],
            eval_episodes=params['general_setting']['eval_episodes'],
            worker_nums=args.worker_nums,
            eval_worker_nums=args.eval_worker_nums,
            train_epochs=epochs,
            eval_epochs=params['general_setting']['num_epochs'])
    params['general_setting']['batch_size'] = int(
        params['general_setting']['batch_size'])
    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")
    agent = MTSAC(pf=pf,
                  qf1=qf1,
                  qf2=qf2,
                  task_nums=env.num_tasks,
                  **params['sac'],
                  **params['general_setting'])
    agent.train()
Code Example #4
def experiment(args):
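    # PPO on a discrete-action environment: categorical policy, value network,
    # on-policy replay buffer and a single-process on-policy collector.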

    import torch.multiprocessing as mp
    mp.set_start_method('spawn')

    device = torch.device("cuda:{}".format(args.device) if args.cuda else "cpu")

    env = get_env(params['env_name'], params['env'])

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.cuda:
        torch.backends.cudnn.deterministic = True
    
    buffer_param = params['replay_buffer']

    experiment_name = os.path.split(
        os.path.splitext(args.config)[0])[-1] if args.id is None \
        else args.id
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)

    params['general_setting']['env'] = env

    # replay_buffer = OnPolicyReplayBuffer(int(buffer_param['size']))

    # example_ob = env.reset()
    # example_dict = { 
    #     "obs": example_ob,
    #     "next_obs": example_ob,
    #     "acts": env.action_space.sample(),
    #     "values": [0],
    #     "rewards": [0],
    #     "terminals": [False]
    # }
    replay_buffer = OnPolicyReplayBuffer(int(buffer_param['size']))
    # replay_buffer.build_by_example(example_dict)

    params['general_setting']['replay_buffer'] = replay_buffer

    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase
    pf = policies.CategoricalDisPolicy(
        input_shape=env.observation_space.shape[0],
        output_shape=env.action_space.n,
        **params['net'],
        **params['policy'])
    vf = networks.Net(
        input_shape=env.observation_space.shape,
        output_shape=1,
        **params['net'])
    params['general_setting']['collector'] = OnPlicyCollectorBase(
        vf,
        env=env,
        pf=pf,
        replay_buffer=replay_buffer,
        device=device,
        train_render=False)

    # params['general_setting']['collector'] = ParallelOnPlicyCollector(
    #     vf, env = env, pf = pf, replay_buffer = replay_buffer, device=device, worker_nums=2
    # )

    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")
    agent = PPO(pf=pf,
                vf=vf,
                **params["ppo"],
                **params["general_setting"])
    agent.train()
Code Example #5
File: example_para_sac.py  Project: lgh0504/torchrl
def experiment(args):
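    # SAC with a parallel collector: the same networks as the single-process
    # SAC example, plus a SharedBaseReplayBuffer built from an example
    # transition so it can be shared with the (single) collector worker.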

    device = torch.device("cuda:{}".format(args.device) if args.cuda else "cpu")

    env = get_env(params['env_name'], params['env'])

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.cuda:
        torch.backends.cudnn.deterministic = True
    
    buffer_param = params['replay_buffer']

    experiment_name = os.path.split(
        os.path.splitext(args.config)[0])[-1] if args.id is None \
        else args.id
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)

    params['general_setting']['env'] = env
    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase

    import torch.multiprocessing as mp
    mp.set_start_method('spawn')

    pf = policies.GuassianContPolicy(
        input_shape=env.observation_space.shape[0],
        output_shape=2 * env.action_space.shape[0],
        **params['net'])
    vf = networks.Net(
        input_shape=env.observation_space.shape[0],
        output_shape=1,
        **params['net'])
    qf = networks.FlattenNet(
        input_shape=env.observation_space.shape[0] + env.action_space.shape[0],
        output_shape=1,
        **params['net'])
    # pretrain_pf = policies.UniformPolicyContinuous(env.action_space.shape[0])
    
    example_ob = env.reset()
    example_dict = {
        "obs": example_ob,
        "next_obs": example_ob,
        "acts": env.action_space.sample(),
        "rewards": [0],
        "terminals": [False]
    }
    replay_buffer = SharedBaseReplayBuffer(int(buffer_param['size']), 1)
    replay_buffer.build_by_example(example_dict)

    params['general_setting']['replay_buffer'] = replay_buffer

    params['general_setting']['collector'] = ParallelCollector(
        env, pf, replay_buffer, device=device, worker_nums=1
    )

    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")
    agent = SAC(pf=pf,
                vf=vf,
                qf=qf,
                **params['sac'],
                **params['general_setting'])
    agent.train()
Code Example #6
def experiment(args):
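    # PPO on a VecEnv of four parallel environments: orthogonally initialized
    # Gaussian policy and value network, a vectorized on-policy replay buffer
    # and a vectorized on-policy collector.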

    import torch.multiprocessing as mp
    mp.set_start_method('spawn')

    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    # env = get_env(params['env_name'], params['env'])
    env = VecEnv(4, get_env, [params['env_name'], params['env']])

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    buffer_param = params['replay_buffer']

    experiment_name = os.path.split(os.path.splitext(args.config)[0])[-1] if args.id is None \
        else args.id
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)

    params['general_setting']['env'] = env

    # replay_buffer = OnPolicyReplayBuffer(int(buffer_param['size']))

    # example_ob = env.reset()
    # example_dict = {
    #     "obs": example_ob,
    #     "next_obs": example_ob,
    #     "acts": env.action_space.sample(),
    #     "values": [0],
    #     "rewards": [0],
    #     "terminals": [False]
    # }
    # replay_buffer = SharedOnPolicyReplayBuffer( int(buffer_param['size']),
    #         args.worker_nums
    # )
    # replay_buffer.build_by_example(example_dict)

    replay_buffer = VecOnPolicyReplayBuffer(
        env_nums=2,
        max_replay_buffer_size=int(buffer_param['size']),
        time_limit_filter=buffer_param['time_limit_filter'])
    params['general_setting']['replay_buffer'] = replay_buffer

    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase
    params['net']['activation_func'] = torch.tanh
    pf = policies.GuassianContPolicyBasicBias(
        input_shape=env.observation_space.shape[0],
        output_shape=env.action_space.shape[0],
        init_func=lambda x: init.orthogonal_init(
            x, scale=np.sqrt(2), constant=0),
        net_last_init_func=lambda x: init.orthogonal_init(
            x, scale=0.01, constant=0),
        **params['net'],
        **params['policy'])
    vf = networks.Net(input_shape=env.observation_space.shape,
                      output_shape=1,
                      init_func=lambda x: init.orthogonal_init(
                          x, scale=np.sqrt(2), constant=0),
                      net_last_init_func=lambda x: init.orthogonal_init(
                          x, scale=1, constant=0),
                      **params['net'])
    # params['general_setting']['collector'] = OnPlicyCollectorBase(
    #     vf, env=env, pf=pf, replay_buffer=replay_buffer, device=device,
    #     train_render=False
    # )
    params['general_setting']['collector'] = VecOnPlicyCollector(
        vf,
        env=env,
        pf=pf,
        replay_buffer=replay_buffer,
        device=device,
        train_render=False,
        epoch_frames=params["general_setting"]["epoch_frames"] // 2)
    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")
    agent = PPO(pf=pf, vf=vf, **params["ppo"], **params["general_setting"])
    agent.train()
Code Example #7
def experiment(args):
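    # TRPO on a single environment: the same orthogonally initialized Gaussian
    # policy and value network as the PPO example, collected on-policy with
    # OnPlicyCollectorBase.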
    # import torch.multiprocessing as mp
    # mp.set_start_method('spawn')

    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    env = get_env(params['env_name'], params['env'])

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    buffer_param = params['replay_buffer']

    experiment_name = os.path.split(os.path.splitext(args.config)[0])[-1] if args.id is None \
        else args.id
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)

    params['general_setting']['env'] = env

    replay_buffer = OnPolicyReplayBuffer(
        int(buffer_param['size']),
        time_limit_filter=buffer_param['time_limit_filter'])

    params['general_setting']['replay_buffer'] = replay_buffer

    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase
    params['net']['activation_func'] = nn.Tanh
    pf = policies.GuassianContPolicyBasicBias(
        input_shape=env.observation_space.shape[0],
        output_shape=env.action_space.shape[0],
        init_func=lambda x: init.orthogonal_init(
            x, scale=np.sqrt(2), constant=0),
        net_last_init_func=lambda x: init.orthogonal_init(
            x, scale=0.01, constant=0),
        **params['net'],
        **params['policy'])
    vf = networks.Net(input_shape=env.observation_space.shape,
                      output_shape=1,
                      init_func=lambda x: init.orthogonal_init(
                          x, scale=np.sqrt(2), constant=0),
                      net_last_init_func=lambda x: init.orthogonal_init(
                          x, scale=1, constant=0),
                      **params['net'])
    params['general_setting']['collector'] = OnPlicyCollectorBase(
        vf,
        env=env,
        pf=pf,
        replay_buffer=replay_buffer,
        device=device,
        train_render=False,
        **params["collector"])

    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")
    agent = TRPO(pf=pf, vf=vf, **params["trpo"], **params["general_setting"])
    print(params["general_setting"])
    print(agent.epoch_frames)
    agent.train()
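All seven snippets read a module-level params dict and an args namespace that are created outside the functions shown here. Below is a minimal sketch of the assumed entry point: the argument names mirror the fields referenced above, but the defaults and the JSON config loading are assumptions, not the repository's exact API.

import argparse
import json


def get_args():
    # Only the flags referenced by the snippets above; defaults are assumptions.
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, required=True)   # experiment config file
    parser.add_argument("--id", type=str, default=None)        # overrides the experiment name
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--device", type=int, default=0)       # CUDA device index
    parser.add_argument("--cuda", action="store_true")
    parser.add_argument("--log_dir", type=str, default="./log")
    parser.add_argument("--overwrite", action="store_true")
    parser.add_argument("--vec_env_nums", type=int, default=1)
    parser.add_argument("--worker_nums", type=int, default=4)
    parser.add_argument("--eval_worker_nums", type=int, default=1)
    parser.add_argument("--pf_snap", type=str, default=None)   # policy snapshot to resume from
    parser.add_argument("--qf1_snap", type=str, default=None)
    parser.add_argument("--qf2_snap", type=str, default=None)
    return parser.parse_args()


if __name__ == "__main__":
    args = get_args()
    with open(args.config) as f:
        params = json.load(f)  # the snippets read params as a module-level dict
    experiment(args)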