import logging
import os
import time
from random import randint

import psutil
import ray


def main():
    timeout = 30
    num_cpus = psutil.cpu_count()
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                        filename='app.log',
                        level=logging.DEBUG)
    print(os.getpid())
    print(os.getppid())

    if not ray.is_initialized():
        ray.init(include_webui=True)

    files = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']
    # `db` and the remote function `worker` are defined elsewhere in the application.
    db.setup()
    print(os.getpid())
    print(os.getppid())

    with ray.profile('Event'):
        for i in range(10):
            time.sleep(randint(0, 4))
            try:
                ray.get(worker.remote(i))
            except Exception as e:
                # Log the failure before re-raising; in Python 3 the exception
                # text is str(e), not e.message.
                print(e)
                raise
            finally:
                print('finally')
def gradient_worker(ps, X, y, batch_size):
    # Walk through the data set one mini-batch at a time.
    n_batches = X.shape[0] // batch_size
    start_idx = 0
    for batch_idx in range(n_batches):
        X_b = X[start_idx:start_idx + batch_size]
        y_b = y[start_idx:start_idx + batch_size]

        # Pull the current parameters from the parameter server.
        cur_theta = ray.get(ps.get_params.remote())

        # Time the gradient computation so it shows up in the Ray timeline.
        with ray.profile("Calculate Grad"):
            cur_grad = calc_grad(X_b, y_b, cur_theta)

        # Push the gradient back to the parameter server.
        ps.update_params.remote(cur_grad)
        start_idx += batch_size
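gradient_worker pulls parameters from and pushes gradients to a parameter-server handle ps; that actor and the calc_grad helper are defined elsewhere in the source. As a rough sketch of what such a parameter server could look like (the class name, plain SGD update, and learning rate below are assumptions, not taken from the original code):

import numpy as np
import ray


@ray.remote
class ParameterServer:
    # Hypothetical parameter server matching the ps.get_params / ps.update_params
    # calls used by gradient_worker above; not taken from the original code.

    def __init__(self, dim, lr=0.01):
        self.theta = np.zeros(dim)  # model parameters
        self.lr = lr                # step size for the SGD update

    def get_params(self):
        # Return the current parameter vector to a worker.
        return self.theta

    def update_params(self, grad):
        # Apply a plain SGD step with the gradient received from a worker.
        self.theta -= self.lr * grad

A driver would then create the actor with something like ps = ParameterServer.remote(X.shape[1]) and pass the handle into gradient_worker.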
def f():
    with ray.profile("custom_event", extra_data={"name": "custom name"}):
        pass
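In typical use, a span like the one in f is recorded inside a Ray task and then inspected in the timeline view. A hedged sketch of that flow follows; the profiled_task wrapper and the output filename are assumptions, not part of the original, and the exact dump call depends on the Ray version:

@ray.remote
def profiled_task():
    # Hypothetical task body that runs the custom span defined in f() above.
    f()


ray.get([profiled_task.remote() for _ in range(4)])

# Dump the recorded profiling events to a Chrome-tracing JSON file,
# which can be opened at chrome://tracing.
ray.timeline(filename="/tmp/ray_timeline.json")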
def collect_experience(self):
    with ray.profile("Actor collection loop",
                     extra_data={'Actor id': str(self.id)}):
        # Collection loop: collect episodes of experience until training is done.
        while True:
            cassieEnv = True

            if self.actor_timesteps % self.load_freq == 0:
                # Query the learner for the latest model and the termination flag.
                # Putting a wait on this call would make it equivalent to the
                # non-distributed case when using a single actor.
                self.policy, self.training_done = ray.get(
                    self.learner_id.get_global_policy.remote())
                print("loaded global model")

            if self.training_done:
                break

            obs = self.env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0

            # Nested collection loop: collect timesteps until the episode is over.
            while episode_timesteps < self.max_traj_len and not done:
                # Select an action randomly or according to the policy.
                if self.actor_timesteps < self.start_timesteps:
                    action = torch.randn(
                        self.env.action_space.shape[0]
                    ) if cassieEnv else self.env.action_space.sample()
                    action = action.numpy()
                else:
                    action = select_action(self.policy, np.array(obs), device)
                    if self.act_noise != 0:
                        action = (action + np.random.normal(
                            0, self.act_noise,
                            size=self.env.action_space.shape[0])).clip(
                                self.env.action_space.low,
                                self.env.action_space.high)

                # Perform the action.
                new_obs, reward, done, _ = self.env.step(action)
                done_bool = 1.0 if episode_timesteps + 1 == self.max_traj_len \
                    else float(done)
                episode_reward += reward

                # Store the transition in the replay buffer.
                self.memory_id.add.remote(
                    (obs, new_obs, action, reward, done_bool))

                # Trigger an update on the model server.
                self.learner_id.update_and_evaluate.remote()

                # Update state and step counts.
                obs = new_obs
                episode_timesteps += 1
                self.actor_timesteps += 1

                # Increment the global step count.
                self.learner_id.increment_step_count.remote()

            # Episode is over: increment the episode count and log episode info.
            self.episode_num += 1

            # Pass episode details to the visdom logger on the memory server.
            self.memory_id.plot_actor_results.remote(
                self.id, self.actor_timesteps, episode_reward)

            ray.wait([self.learner_id.increment_episode_count.remote()],
                     num_returns=1)

            if self.taper_load_freq and self.taper_timesteps >= 2000:
                self.load_freq = self.load_freq // 2
                print("Increased load frequency")
def update_eval_model(self, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
    with ray.profile("Learner optimization loop",
                     extra_data={'Episode count': str(self.episode_count)}):
        start_time = time.time()

        if ray.get(self.memory.storage_size.remote()) < self.batch_size:
            print("not enough experience yet")
            return

        # Randomly sample a mini-batch of transitions from the memory server.
        x, y, u, r, d = ray.get(self.memory.sample.remote(self.batch_size))
        state = torch.FloatTensor(x).to(self.device)
        action = torch.FloatTensor(u).to(self.device)
        next_state = torch.FloatTensor(y).to(self.device)
        done = torch.FloatTensor(1 - d).to(self.device)
        reward = torch.FloatTensor(r).to(self.device)

        # Select the next action according to the target policy and add clipped noise.
        noise = torch.FloatTensor(u).data.normal_(
            0, policy_noise).to(self.device)
        noise = noise.clamp(-noise_clip, noise_clip)
        next_action = (self.actor_target(next_state) + noise).clamp(
            -self.max_action, self.max_action)

        # Compute the target Q value.
        target_Q1, target_Q2 = self.critic_target(next_state, next_action)
        target_Q = torch.min(target_Q1, target_Q2)
        target_Q = reward + (done * self.discount * target_Q).detach()

        # Get the current Q estimates.
        current_Q1, current_Q2 = self.critic(state, action)

        # Compute the critic loss.
        critic_loss = F.mse_loss(current_Q1, target_Q) + \
            F.mse_loss(current_Q2, target_Q)

        # Optimize the critic.
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.update_counter += 1

        # Delayed policy updates.
        if self.update_counter % policy_freq == 0:
            print("optimizing at timestep {} | time = {} | replay size = {} | "
                  "episode count = {} | update count = {}".format(
                      self.step_count,
                      time.time() - start_time,
                      ray.get(self.memory.storage_size.remote()),
                      self.episode_count,
                      self.update_counter))

            # Compute the actor loss.
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

            # Optimize the actor.
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models.
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(
                    self.tau * param.data + (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(
                    self.tau * param.data + (1 - self.tau) * target_param.data)