def train(self):
    args = self.args
    torch.manual_seed(args.seed)
    env = grid2op.make(args.env_name, test=args.for_test, reward_class=L2RPNReward)

    shared_model = ActorCritic(env.observation_space.size(), self.action_space, args.hidden_size)
    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    processes = []
    counter = mp.Value('i', 0)
    lock = mp.Lock()

    p = mp.Process(target=self.do_test, args=(args.num_processes, args, shared_model, counter))
    p.start()
    processes.append(p)

    for rank in range(0, args.num_processes):
        p = mp.Process(target=self.do_train, args=(rank, args, shared_model, counter, lock, optimizer))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
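# Every snippet in this listing constructs the optimizer with my_optim.SharedAdam
# and then calls share_memory() on it, but none of them shows that class. The
# sketch below is an assumption based on the widely used A3C pattern (subclass
# torch.optim.Adam, create the per-parameter state eagerly, and move it into
# shared memory so all worker processes update the same Adam statistics); it is
# not the original my_optim implementation.
import torch
import torch.optim as optim


class SharedAdam(optim.Adam):
    """Adam whose per-parameter state lives in shared memory (illustrative sketch)."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # Create the state before forking so it can be shared across processes.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        # Standard Adam update, reading from and writing to the shared state tensors.
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * (bias_correction2 ** 0.5) / bias_correction1
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss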
def __init__(self, args_, logger_):
    self.args = args_
    self.logger = logger_
    self.env = AtariEnv(gym.make(self.args.game), args_.frame_seq, args_.frame_skip, render=True)
    self.shared_model = A3CLSTMNet(self.env.state_shape, self.env.action_dim)
    self.shared_model.share_memory()
    self.optim = my_optim.SharedAdam(self.shared_model.parameters(), lr=self.args.lr)
    self.optim.share_memory()
    # visdom
    self.vis = visdom.Visdom()
    self.main_update_step = Value('d', 0)
    # load model
    if self.args.load_weight != 0:
        self.load_model(self.args.load_weight)
    self.jobs = []
    if self.args.t_flag:
        for process_id in xrange(self.args.jobs):
            job = A3CSingleProcess(process_id, self, logger_)
            self.jobs.append(job)
    self.test_win = None
def main():
    # env
    args = config()
    mp.set_start_method("spawn")
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    env = create_atari_env(args.env_name)
    shared_model = AcotrCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()

    optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
    optimizer.share_memory()

    processes = []
    counter = mp.Value('i', 0)
    lock = mp.Lock()

    p = mp.Process(target=test, args=(args.num_processes, args, shared_model, counter, "./log/"))
    p.start()
    processes.append(p)

    for rank in range(0, args.num_processes):
        p = mp.Process(target=train, args=(rank, args, shared_model, counter, lock, optimizer))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
class Params():
    def __init__(self):
        self.lr = 0.0001
        self.gamma = 0.99
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'

# Main run
os.environ['OMP_NUM_THREADS'] = '1'
params = Params()
torch.manual_seed(params.seed)
env = create_atari_env(params.env_name)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
shared_model.share_memory()
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
optimizer.share_memory()
processes = []
p = mp.Process(target=test, args=(params.num_processes, params, shared_model))
p.start()
processes.append(p)
for rank in range(0, params.num_processes):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:
    p.join()
loader = DataLoader(opt)  # not used in the training procedure, only to set vocab_size and seq_length
opt.vocab_size = loader.vocab_size
opt.seq_length = loader.seq_length

model = models.setup(opt)
model.train()
num_parameter = get_num_params(model)
print('number of parameters: ' + str(num_parameter))

if opt.async_opt:
    if opt.use_cuda:
        model.cuda()
    model.share_memory()
    optimizer = my_optim.SharedAdam(model.parameters(), lr=opt.optim_lr,
                                    betas=(opt.optim_adam_beta1, opt.optim_adam_beta2),
                                    weight_decay=opt.optim_weight_decay)
    optimizer.share_memory()
    processes = []
    for rank in range(opt.num_processes):
        p = mp.Process(target=train, args=(rank, model, opt, optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
else:
    if opt.use_cuda:
        model.cuda()
    rank = 0
    optimizer = None
shared_model.share_memory()

if not args.no_curiosity:
    # <---ICM---
    shared_curiosity = IntrinsicCuriosityModule(
        # env.observation_space.shape[0], env.action_space)
        args.num_stack, env.action_space)
    shared_curiosity.share_memory()
    # ---ICM--->

if args.no_shared:
    optimizer = None
else:
    if args.no_curiosity:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
    elif not args.no_curiosity:
        if not args.curiosity_only:
            optimizer = my_optim.SharedAdam(  # ICM
                chain(shared_model.parameters(), shared_curiosity.parameters()),
                lr=args.lr)
        elif args.curiosity_only:
            optimizer = my_optim.SharedAdam(shared_curiosity.parameters(), lr=args.lr)
    optimizer.share_memory()

if (args.model_file is not None) and (args.optimizer_file is not None):
    logging.info("Start with a pretrained model")
    shared_model.load_state_dict(torch.load(args.model_file))
    optimizer.load_state_dict(torch.load(args.optimizer_file))
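# The ICM branch above passes the policy and the curiosity module to a single
# SharedAdam by concatenating their parameter iterators; chain here is
# itertools.chain. A minimal, self-contained illustration of the same idea
# (the Linear modules are placeholders, not the project's networks):
from itertools import chain

import torch
import torch.nn as nn

policy = nn.Linear(4, 2)
icm = nn.Linear(4, 4)
joint_optimizer = torch.optim.Adam(chain(policy.parameters(), icm.parameters()), lr=1e-3)
# A single joint_optimizer.step() now updates the weights of both modules.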
def main(method):
    args = built_parser(method=method)
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  # + sum(np.log(delta_a/2))

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()

    actor1 = PolicyNet(args)
    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(), lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(), lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(), lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(), lr=args.actor_lr)
    actor2_optimizer.share_memory()

    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()

    share_net = [Q_net1, Q_net1_target, Q_net2, Q_net2_target,
                 actor1, actor1_target, actor2, actor2_target, log_alpha]
    share_optimizer = [Q_net1_optimizer, Q_net2_optimizer,
                       actor1_optimizer, actor2_optimizer, alpha_optimizer]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]

    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []

    if args.code_model == "train":
        for i in range(args.num_actors):
            procs.append(Process(target=actor_agent,
                                 args=(args, shared_queue, shared_value, [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(Process(target=buffer, args=(args, shared_queue, shared_value, i)))
        procs.append(Process(target=evaluate_agent, args=(args, shared_value, share_net)))
        for i in range(args.num_learners):
            # device = torch.device("cuda")
            device = torch.device("cpu")
            procs.append(Process(target=leaner_agent,
                                 args=(args, shared_queue, shared_value, share_net,
                                       share_optimizer, device, lock, i)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
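# log_alpha above is the learnable SAC entropy temperature (alpha = exp(log_alpha)),
# kept as a single shared tensor with its own SharedAdam. The learner process is
# not part of this excerpt; the sketch below shows the usual automatic temperature
# update as an assumption about what the learner does, not code from the project.
def update_alpha(log_alpha, alpha_optimizer, log_prob, target_entropy):
    # log_prob: log-probabilities of actions sampled from the current policy.
    # Minimizing -log_alpha * (log_prob + target_entropy) drives the policy
    # entropy toward target_entropy.
    alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
    alpha_optimizer.zero_grad()
    alpha_loss.backward()
    alpha_optimizer.step()
    return log_alpha.exp().detach()  # alpha used to weight the entropy bonus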
if args.planning:
    d_module = load_d_module(env.action_space.shape[0], args)

shared_model = R_Module(env.action_space.shape[0], args.dim,
                        discrete=args.discrete, baseline=args.baseline,
                        state_space=env.observation_space.shape[0])

# shared reward module for everyone
shared_model.share_memory()

if args.no_shared:
    optimizer = None
else:
    optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
    optimizer.share_memory()

processes = []

train_agent_method = None
total_args = args
train_agent_method = train_rewards

for rank in range(0, args.num_processes):
    if rank == 0:
        p = mp.Process(target=train_agent_method,
                       args=(rank, total_args, shared_model, enc, optimizer, tb_log_dir, d_module))
    else:
def main(method):
    params = {
        'obs_size': (160, 100),  # screen size of cv2 window
        'dt': 0.025,  # time interval between two frames
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'train',
        'max_time_episode': 100,  # maximum timesteps per episode
        'desired_speed': 15,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }

    args = built_parser(method=method)
    env = gym.make(args.env_name, params=params)
    state_dim = env.state_space.shape
    action_dim = env.action_space.shape[0]
    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()
    num_cpu = mp.cpu_count()
    print(state_dim, action_dim, action_high, num_cpu)

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  # + sum(np.log(delta_a/2))

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()

    actor1 = PolicyNet(args)
    print("Network inited")
    if args.code_model == "eval":
        actor1.load_state_dict(
            torch.load('./' + args.env_name + '/method_' + str(args.method) +
                       '/model/policy_' + str(args.max_train) + '.pkl'))
    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()
    print("Network set")

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())
    print("Network loaded!")

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(), lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(), lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(), lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(), lr=args.actor_lr)
    actor2_optimizer.share_memory()

    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()
    print("Optimizer done")

    share_net = [Q_net1, Q_net1_target, Q_net2, Q_net2_target,
                 actor1, actor1_target, actor2, actor2_target, log_alpha]
    share_optimizer = [Q_net1_optimizer, Q_net2_optimizer,
                       actor1_optimizer, actor2_optimizer, alpha_optimizer]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]

    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []
    if args.code_model == "train":
        for i in range(args.num_learners):
            if i % 2 == 0:
                device = torch.device("cuda:1")
            else:
                device = torch.device("cuda:0")
            # device = torch.device("cpu")
            procs.append(Process(target=leaner_agent,
                                 args=(args, shared_queue, shared_value, share_net,
                                       share_optimizer, device, lock, i)))
        for i in range(args.num_actors):
            procs.append(Process(target=actor_agent,
                                 args=(args, shared_queue, shared_value, [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(Process(target=buffer, args=(args, shared_queue, shared_value, i)))
        procs.append(Process(target=evaluate_agent, args=(args, shared_value, share_net)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'

# Main run
os.environ['OMP_NUM_THREADS'] = '1'  # 1 thread per core
params = Params()  # the Params object gathers all the hyperparameters
torch.manual_seed(params.seed)  # setting the seed (not essential)
env = create_atari_env(params.env_name)  # create the wrapped/optimized Atari environment
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)  # the model shared by all agents (one process per core)
shared_model.share_memory()  # move the model's tensors to shared memory so every process can access them
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)  # the optimizer is shared as well, because it acts on the shared model
optimizer.share_memory()  # keep the optimizer state in shared memory too
processes = []  # list of worker processes
p = mp.Process(target=test, args=(params.num_processes, params, shared_model))  # the 'test' process evaluates the shared model without updating it; mp.Process runs the target function in a separate process
p.start()  # start the test process
processes.append(p)  # keep track of it
for rank in range(0, params.num_processes):  # launch the training processes, each of which updates the shared model
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:  # wait for every process; join() blocks until that process exits, so the main script stops only when all workers are done
    print('working')
    p.join()
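# The launchers in this listing only spawn the worker processes; the shared
# optimizer is actually used inside the train() targets, which are not shown
# here. The sketch below is an assumed minimal version of that worker loop
# (train_worker, ensure_shared_grads, compute_rollout_loss, make_env, make_model
# and args.max_grad_norm are illustrative names, not taken from these projects):
# each worker syncs its local network with the shared model, computes gradients
# on its own rollout, copies them onto the shared parameters, and steps the
# shared optimizer.
def ensure_shared_grads(local_model, shared_model):
    # Point each shared parameter's gradient at the worker's local gradient once;
    # afterwards the shared optimizer reads the worker's gradients directly.
    for param, shared_param in zip(local_model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad


def train_worker(rank, args, shared_model, optimizer, make_env, make_model):
    torch.manual_seed(args.seed + rank)
    env = make_env()
    local_model = make_model()
    while True:
        # Start every rollout from the latest shared weights.
        local_model.load_state_dict(shared_model.state_dict())
        loss = compute_rollout_loss(local_model, env, args)  # rollout and loss omitted
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(local_model.parameters(), args.max_grad_norm)
        ensure_shared_grads(local_model, shared_model)
        optimizer.step()  # the Adam state is in shared memory, so all workers share it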
parser.add_argument('--test', action='store_true', help='test')
parser.add_argument('--feature', type=int, default=96, help='features num')

if __name__ == '__main__':
    args = parser.parse_args()
    os.environ['OMP_NUM_THREADS'] = '1'
    torch.manual_seed(args.seed)

    num_inputs = args.feature
    num_actions = 9

    ac_net = ActorCritic(num_inputs, num_actions)
    opt_ac = my_optim.SharedAdam(ac_net.parameters(), lr=args.lr)

    if args.resume:
        print("=> loading checkpoint ")
        checkpoint = torch.load('../models/kankan/best.t7')
        # args.start_epoch = checkpoint['epoch']
        # best_prec1 = checkpoint['best_prec1']
        ac_net.load_state_dict(checkpoint['state_dict'])
        # opt_ac.load_state_dict(checkpoint['optimizer'])
        print(ac_net)
        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))

    ac_net.share_memory()
    # opt_ac = my_optim.SharedAdam(ac_net.parameters(), lr=args.lr)
    opt_ac.share_memory()
operative_temp = [all_parameter[8]] + all_parameter[10:21]
cost_flex = all_parameter[2:8] + [all_parameter[9]]
state_num = (all_parameter[10:109] + all_parameter[0:6] + [all_parameter[7]]
             + predictionFlat(params.file_path_prediction, time_step_update % 8760))
state = np.array(state_normalization(params.file_path_norm, state_num))
state = torch.from_numpy(state).float()

cx = torch.zeros(1, params.hidden_layer)  # the cell state of the LSTM is reinitialized to zero
hx = torch.zeros(1, params.hidden_layer)  # the hidden state of the LSTM is reinitialized to zero

model = ActorCritic(178, params.output_space)
optimizer = my_optim.SharedAdam(model.parameters(), lr=params.lr)

value, action_values, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))  # the critic's value V(S), the actor's action scores (logits), and the new hidden & cell states
prob = F.softmax(action_values, dim=1)  # softmax over the action scores: prob(a) = exp(score(a)) / sum_b exp(score(b))
log_prob = F.log_softmax(action_values, dim=1)  # log probabilities: log_prob(a) = log(prob(a))
entropy = -(log_prob * prob).sum(1)  # entropy H(p) = -sum_x p(x) * log(p(x))
action = prob.multinomial(1).data  # sample an action from the probability distribution
log_prob = log_prob.gather(
        self.env_name = 'Pendulum-v0'

if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'
    params = Params()
    torch.manual_seed(params.seed)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]

    shared_p = Policy(num_inputs, num_outputs)
    shared_v = Value(num_inputs)
    shared_p.share_memory()
    shared_v.share_memory()

    optimizer_p = my_optim.SharedAdam(shared_p.parameters(), lr=params.lr)
    optimizer_v = my_optim.SharedAdam(shared_v.parameters(), lr=params.lr)

    processes = []
    p = mp.Process(target=test, args=(params.num_processes, params, shared_p))
    p.start()
    processes.append(p)

    for rank in range(0, params.num_processes):
        p = mp.Process(target=train, args=(rank, params, shared_p, shared_v, optimizer_p, optimizer_v))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
cx = torch.zeros(1, 256)
hx = torch.zeros(1, 256)

state = env.reset()
state = torch.from_numpy(state)

# <---ICM---
shared_curiosity = IntrinsicCuriosityModule2(args.num_stack, env.action_space, args.epsilon)
shared_curiosity.share_memory()
# ---ICM--->

if args.no_shared:
    optimizer = None
else:
    optimizer = my_optim.SharedAdam(shared_curiosity.parameters(), lr=args.lr)
    optimizer.share_memory()

if args.curiosity_file is not None:
    logging.info("Load curiosity")
    shared_curiosity.load_state_dict(torch.load(args.curiosity_file), strict=False)

if args.optimizer_file is not None:
    logging.info("Load optimizer")
    optimizer.load_state_dict(torch.load(args.optimizer_file))

if args.new_curiosity:
    logging.info("Bayesian curiosity")

processes = []