def experiment(args):
    """Build and train a DDPG agent on a vectorized environment.

    Reads the module-level ``params`` config dict; ``args`` supplies CLI
    options (device, seed, env count, config path, log dir, overwrite flag).
    """
    # Pick the compute device from the CLI flags.
    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    env = get_vec_env(params["env_name"], params["env"], args.vec_env_nums)
    env.seed(args.seed)

    # Seed every RNG source so runs are reproducible.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    buffer_cfg = params['replay_buffer']

    # Experiment name defaults to the config file's basename.
    experiment_name = args.id if args.id is not None else \
        os.path.split(os.path.splitext(args.config)[0])[-1]
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir, args.overwrite)
    params['general_setting']['env'] = env

    replay_buffer = BaseReplayBuffer(
        env_nums=args.vec_env_nums,
        max_replay_buffer_size=int(buffer_cfg['size']),
        time_limit_filter=buffer_cfg['time_limit_filter'])
    params['general_setting']['replay_buffer'] = replay_buffer

    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase
    params['net']['activation_func'] = torch.nn.Tanh

    # Deterministic (fixed-variance) Gaussian policy and a Q network.
    pf = policies.FixGuassianContPolicy(
        input_shape=env.observation_space.shape[0],
        output_shape=env.action_space.shape[0],
        **params['net'], **params['policy'])
    qf = networks.QNet(
        input_shape=env.observation_space.shape[0] +
        env.action_space.shape[0],
        output_shape=1,
        **params['net'])
    print(pf)
    print(qf)

    params['general_setting']['collector'] = VecCollector(
        env=env, pf=pf, replay_buffer=replay_buffer, device=device,
        train_render=False, **params["collector"])
    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")

    agent = DDPG(pf=pf, qf=qf, **params["ddpg"], **params["general_setting"])
    agent.train()
def experiment(args):
    """Build and train a SAC agent on a single environment.

    Uses the module-level ``params`` config; ``args`` carries the CLI
    options (device, seed, config path, log dir, optional run id).
    """
    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    env = get_env(params['env_name'], params['env'])
    env.seed(args.seed)

    # Seed RNGs for reproducibility (python `random` is not seeded here).
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.backends.cudnn.deterministic = True

    buffer_cfg = params['replay_buffer']
    replay_buffer = BaseReplayBuffer(int(buffer_cfg['size']))

    # Experiment name defaults to the config file's basename.
    experiment_name = args.id if args.id is not None else \
        os.path.split(os.path.splitext(args.config)[0])[-1]
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)

    params['general_setting']['env'] = env
    params['general_setting']['replay_buffer'] = replay_buffer
    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase

    # Policy emits 2 * action_dim outputs (mean and log-std per action dim).
    pf = policies.GuassianContPolicy(
        input_shape=env.observation_space.shape[0],
        output_shape=2 * env.action_space.shape[0],
        **params['net'])
    vf = networks.Net(
        input_shape=env.observation_space.shape[0],
        output_shape=1,
        **params['net'])
    qf = networks.FlattenNet(
        input_shape=env.observation_space.shape[0] +
        env.action_space.shape[0],
        output_shape=1,
        **params['net'])
    # Uniform random policy used for the pretrain/warm-up phase.
    pretrain_pf = policies.UniformPolicyContinuous(env.action_space.shape[0])

    params['general_setting']['collector'] = BaseCollector(
        env, pf, replay_buffer, device=device)
    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")

    agent = SAC(pf=pf, vf=vf, qf=qf, pretrain_pf=pretrain_pf,
                **params['sac'], **params['general_setting'])
    agent.train()
def experiment(args):
    """Build and train a multi-task SAC (MTSAC) agent on a meta environment.

    Uses the module-level ``params`` config; ``args`` carries CLI options
    including optional snapshot paths (``pf_snap``, ``qf1_snap``,
    ``qf2_snap``) and async worker counts.
    """
    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    env, cls_dicts, cls_args = get_meta_env(
        params['env_name'], params['env'], params['meta_env'])

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.backends.cudnn.deterministic = True

    buffer_param = params['replay_buffer']

    # Experiment name defaults to the config file's basename.
    experiment_name = os.path.split(os.path.splitext(args.config)[0])[-1] \
        if args.id is None else args.id
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)

    params['general_setting']['env'] = env
    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase

    # Collector workers are separate processes; 'spawn' is required to share
    # CUDA tensors across processes.
    import torch.multiprocessing as mp
    mp.set_start_method('spawn', force=True)

    example_ob = env.reset()  # also resets task_id
    example_embedding = env.active_task_one_hot

    pf = policies.ModularGuassianGatedCascadeCondContPolicy(
        input_shape=env.observation_space.shape[0],
        em_input_shape=np.prod(example_embedding.shape),
        output_shape=2 * env.action_space.shape[0],
        **params['net'])
    if args.pf_snap is not None:
        pf.load_state_dict(torch.load(args.pf_snap, map_location='cpu'))

    qf1 = networks.FlattenModularGatedCascadeCondNet(
        input_shape=env.observation_space.shape[0] +
        env.action_space.shape[0],
        em_input_shape=np.prod(example_embedding.shape),
        output_shape=1,
        **params['net'])
    qf2 = networks.FlattenModularGatedCascadeCondNet(
        input_shape=env.observation_space.shape[0] +
        env.action_space.shape[0],
        em_input_shape=np.prod(example_embedding.shape),
        output_shape=1,
        **params['net'])
    # BUG FIX: qf1 previously loaded from args.qf2_snap (copy-paste error);
    # each Q network now restores from its own snapshot path.
    if args.qf1_snap is not None:
        qf1.load_state_dict(torch.load(args.qf1_snap, map_location='cpu'))
    if args.qf2_snap is not None:
        qf2.load_state_dict(torch.load(args.qf2_snap, map_location='cpu'))

    # One example transition so the shared buffer can pre-allocate storage
    # with the right shapes/keys.
    example_dict = {
        "obs": example_ob,
        "next_obs": example_ob,
        "acts": env.action_space.sample(),
        "rewards": [0],
        "terminals": [False],
        "task_idxs": [0],
        "embedding_inputs": example_embedding
    }
    replay_buffer = AsyncSharedReplayBuffer(int(buffer_param['size']),
                                            args.worker_nums)
    replay_buffer.build_by_example(example_dict)
    params['general_setting']['replay_buffer'] = replay_buffer

    # Total collector epochs = pretrain + training epochs.
    epochs = params['general_setting']['pretrain_epochs'] + \
        params['general_setting']['num_epochs']

    print(env.action_space)
    print(env.observation_space)
    params['general_setting'][
        'collector'] = AsyncMultiTaskParallelCollectorUniform(
        env=env, pf=pf, replay_buffer=replay_buffer,
        env_cls=cls_dicts,
        env_args=[params["env"], cls_args, params["meta_env"]],
        device=device,
        reset_idx=True,
        epoch_frames=params['general_setting']['epoch_frames'],
        max_episode_frames=params['general_setting']['max_episode_frames'],
        eval_episodes=params['general_setting']['eval_episodes'],
        worker_nums=args.worker_nums,
        eval_worker_nums=args.eval_worker_nums,
        train_epochs=epochs,
        eval_epochs=params['general_setting']['num_epochs'])
    params['general_setting']['batch_size'] = int(
        params['general_setting']['batch_size'])
    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")

    agent = MTSAC(pf=pf, qf1=qf1, qf2=qf2,
                  task_nums=env.num_tasks,
                  **params['sac'], **params['general_setting'])
    agent.train()
def experiment(args):
    """Build and train a PPO agent with a categorical (discrete) policy.

    Uses the module-level ``params`` config; ``args`` carries CLI options
    (device, seed, config path, log dir, optional run id).
    """
    import torch.multiprocessing as mp
    mp.set_start_method('spawn')

    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    env = get_env(params['env_name'], params['env'])
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.backends.cudnn.deterministic = True

    buffer_param = params['replay_buffer']

    # Experiment name defaults to the config file's basename.
    experiment_name = os.path.split(os.path.splitext(args.config)[0])[-1] \
        if args.id is None else args.id
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)
    params['general_setting']['env'] = env

    replay_buffer = OnPolicyReplayBuffer(int(buffer_param['size']))
    params['general_setting']['replay_buffer'] = replay_buffer

    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase

    pf = policies.CategoricalDisPolicy(
        input_shape=env.observation_space.shape[0],
        output_shape=env.action_space.n,
        **params['net'],
        **params['policy'])
    # NOTE(review): vf receives the full shape tuple while pf gets shape[0];
    # confirm networks.Net accepts a tuple input_shape.
    vf = networks.Net(
        input_shape=env.observation_space.shape,
        output_shape=1,
        **params['net'])

    # BUG FIX: the collector was hard-coded to device="cuda", ignoring the
    # device derived from args.cuda (broken on CPU-only runs); pass the
    # selected device instead.
    params['general_setting']['collector'] = OnPlicyCollectorBase(
        vf, env=env, pf=pf, replay_buffer=replay_buffer, device=device,
        train_render=False)
    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")

    agent = PPO(pf=pf, vf=vf, **params["ppo"], **params["general_setting"])
    agent.train()
def experiment(args):
    """Build and train a SAC agent using a single-worker parallel collector.

    Uses the module-level ``params`` config; ``args`` carries CLI options
    (device, seed, config path, log dir, optional run id).
    """
    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    env = get_env(params['env_name'], params['env'])
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.backends.cudnn.deterministic = True

    buffer_cfg = params['replay_buffer']

    # Experiment name defaults to the config file's basename.
    experiment_name = args.id if args.id is not None else \
        os.path.split(os.path.splitext(args.config)[0])[-1]
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)

    params['general_setting']['env'] = env
    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase

    # Collector workers are separate processes; 'spawn' is required to share
    # CUDA tensors across processes.
    import torch.multiprocessing as mp
    mp.set_start_method('spawn')

    # Policy emits 2 * action_dim outputs (mean and log-std per action dim).
    pf = policies.GuassianContPolicy(
        input_shape=env.observation_space.shape[0],
        output_shape=2 * env.action_space.shape[0],
        **params['net'])
    vf = networks.Net(
        input_shape=env.observation_space.shape[0],
        output_shape=1,
        **params['net'])
    qf = networks.FlattenNet(
        input_shape=env.observation_space.shape[0] +
        env.action_space.shape[0],
        output_shape=1,
        **params['net'])

    # One example transition so the shared buffer can pre-allocate storage.
    example_ob = env.reset()
    example_dict = {
        "obs": example_ob,
        "next_obs": example_ob,
        "acts": env.action_space.sample(),
        "rewards": [0],
        "terminals": [False]
    }
    replay_buffer = SharedBaseReplayBuffer(int(buffer_cfg['size']), 1)
    replay_buffer.build_by_example(example_dict)
    params['general_setting']['replay_buffer'] = replay_buffer

    params['general_setting']['collector'] = ParallelCollector(
        env, pf, replay_buffer, device=device, worker_nums=1)
    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")

    agent = SAC(pf=pf, vf=vf, qf=qf,
                **params['sac'], **params['general_setting'])
    agent.train()
def experiment(args):
    """Build and train a PPO agent on a 4-way vectorized environment.

    Uses the module-level ``params`` config; ``args`` carries CLI options
    (device, seed, config path, log dir, optional run id).
    """
    import torch.multiprocessing as mp
    mp.set_start_method('spawn')

    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    env = VecEnv(4, get_env, [params['env_name'], params['env']])
    env.seed(args.seed)

    # Seed every RNG source so runs are reproducible.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    buffer_cfg = params['replay_buffer']

    # Experiment name defaults to the config file's basename.
    experiment_name = args.id if args.id is not None else \
        os.path.split(os.path.splitext(args.config)[0])[-1]
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)
    params['general_setting']['env'] = env

    # NOTE(review): the buffer is sized for 2 envs while VecEnv above runs 4,
    # and the collector halves epoch_frames below — confirm this pairing is
    # intentional.
    replay_buffer = VecOnPolicyReplayBuffer(
        env_nums=2,
        max_replay_buffer_size=int(buffer_cfg['size']),
        time_limit_filter=buffer_cfg['time_limit_filter'])
    params['general_setting']['replay_buffer'] = replay_buffer

    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase
    params['net']['activation_func'] = torch.tanh

    # Orthogonal init: sqrt(2) gain on hidden layers, small (0.01) gain on
    # the policy head, unit gain on the value head.
    pf = policies.GuassianContPolicyBasicBias(
        input_shape=env.observation_space.shape[0],
        output_shape=env.action_space.shape[0],
        init_func=lambda x: init.orthogonal_init(
            x, scale=np.sqrt(2), constant=0),
        net_last_init_func=lambda x: init.orthogonal_init(
            x, scale=0.01, constant=0),
        **params['net'], **params['policy'])
    vf = networks.Net(
        input_shape=env.observation_space.shape,
        output_shape=1,
        init_func=lambda x: init.orthogonal_init(
            x, scale=np.sqrt(2), constant=0),
        net_last_init_func=lambda x: init.orthogonal_init(
            x, scale=1, constant=0),
        **params['net'])

    params['general_setting']['collector'] = VecOnPlicyCollector(
        vf, env=env, pf=pf, replay_buffer=replay_buffer, device=device,
        train_render=False,
        epoch_frames=params["general_setting"]["epoch_frames"] // 2)
    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")

    agent = PPO(pf=pf, vf=vf, **params["ppo"], **params["general_setting"])
    agent.train()
def experiment(args):
    """Build and train a TRPO agent on a single environment.

    Uses the module-level ``params`` config; ``args`` carries CLI options
    (device, seed, config path, log dir, optional run id).
    """
    device = torch.device(
        "cuda:{}".format(args.device) if args.cuda else "cpu")

    env = get_env(params['env_name'], params['env'])
    env.seed(args.seed)

    # Seed every RNG source so runs are reproducible.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    buffer_cfg = params['replay_buffer']

    # Experiment name defaults to the config file's basename.
    experiment_name = args.id if args.id is not None else \
        os.path.split(os.path.splitext(args.config)[0])[-1]
    logger = Logger(experiment_name, params['env_name'], args.seed, params,
                    args.log_dir)
    params['general_setting']['env'] = env

    replay_buffer = OnPolicyReplayBuffer(
        int(buffer_cfg['size']),
        time_limit_filter=buffer_cfg['time_limit_filter'])
    params['general_setting']['replay_buffer'] = replay_buffer

    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type'] = networks.MLPBase
    params['net']['activation_func'] = nn.Tanh

    # Orthogonal init: sqrt(2) gain on hidden layers, small (0.01) gain on
    # the policy head, unit gain on the value head.
    pf = policies.GuassianContPolicyBasicBias(
        input_shape=env.observation_space.shape[0],
        output_shape=env.action_space.shape[0],
        init_func=lambda x: init.orthogonal_init(
            x, scale=np.sqrt(2), constant=0),
        net_last_init_func=lambda x: init.orthogonal_init(
            x, scale=0.01, constant=0),
        **params['net'], **params['policy'])
    vf = networks.Net(
        input_shape=env.observation_space.shape,
        output_shape=1,
        init_func=lambda x: init.orthogonal_init(
            x, scale=np.sqrt(2), constant=0),
        net_last_init_func=lambda x: init.orthogonal_init(
            x, scale=1, constant=0),
        **params['net'])

    params['general_setting']['collector'] = OnPlicyCollectorBase(
        vf, env=env, pf=pf, replay_buffer=replay_buffer, device=device,
        train_render=False, **params["collector"])
    params['general_setting']['save_dir'] = osp.join(logger.work_dir, "model")

    agent = TRPO(pf=pf, vf=vf, **params["trpo"], **params["general_setting"])
    print(params["general_setting"])
    print(agent.epoch_frames)
    agent.train()