def __init__(self, args):
    self.args = args

    ######### Initialize the multiagent team of agents ########
    if self.args.ps == 'full' or self.args.ps == 'trunk':
        # TODO: confirm whether a single shared agent is sufficient here (all parameters shared)
        self.agents = [Agent(self.args, 0)]  # single shared agent (id 0)
    elif self.args.ps == 'none':
        # No parameter sharing: one neural network per agent
        self.agents = [Agent(self.args, id) for id in range(self.args.config.num_agents)]
    else:
        sys.exit('Incorrect PS choice')
    self.test_agent = TestAgent(self.args, 991)

    ###### Buffer and model buckets as references to the corresponding agent's attributes ####
    if args.ps == "trunk":
        self.buffer_bucket = [buffer.tuples for buffer in self.agents[0].buffer]
    else:
        self.buffer_bucket = [ag.buffer.tuples for ag in self.agents]

    # Three different sets of networks: evolutionary population, PG rollouts, and test rollouts
    self.popn_bucket = [ag.popn for ag in self.agents]
    self.rollout_bucket = [ag.rollout_actor for ag in self.agents]
    self.test_bucket = self.test_agent.rollout_actor

    ######### EVOLUTIONARY WORKERS ############
    if self.args.popn_size > 0:
        # One task/result pipe pair per (population member, evaluation) used to compute fitness
        self.evo_task_pipes = [Pipe() for _ in range(args.popn_size * args.num_evals)]
        self.evo_result_pipes = [Pipe() for _ in range(args.popn_size * args.num_evals)]
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(self.args, i, 'evo', self.evo_task_pipes[i][1],
                          self.evo_result_pipes[i][0], self.buffer_bucket,
                          self.popn_bucket, True, RANDOM_BASELINE))
            for i in range(args.popn_size * args.num_evals)
        ]  # popn_size * num_evals rollouts; popn_bucket holds the evolutionary networks
        for worker in self.evo_workers:
            worker.start()

    ######### POLICY GRADIENT WORKERS ############
    if self.args.rollout_size > 0:
        self.pg_task_pipes = Pipe()
        self.pg_result_pipes = Pipe()
        self.pg_workers = [
            Process(target=rollout_worker,
                    args=(self.args, 0, 'pg', self.pg_task_pipes[1],
                          self.pg_result_pipes[0], self.buffer_bucket,
                          self.rollout_bucket, self.args.rollout_size > 0, RANDOM_BASELINE))
        ]  # rollout_bucket holds the networks used for policy-gradient rollouts
        for worker in self.pg_workers:
            worker.start()

    ######### TEST WORKERS ############
    self.test_task_pipes = Pipe()
    self.test_result_pipes = Pipe()
    self.test_workers = [
        Process(target=rollout_worker,
                args=(self.args, 0, 'test', self.test_task_pipes[1],
                      self.test_result_pipes[0], None, self.test_bucket, False,
                      RANDOM_BASELINE))
    ]  # test_bucket holds the networks used for test rollouts
    for worker in self.test_workers:
        worker.start()

    #### Stats and tracking of which rollouts are done ######
    self.best_score = -999
    self.total_frames = 0
    self.gen_frames = 0
    self.test_trace = []
def meta_fit(self, meta_dataset_generator): with tf.device('/cpu:0'): LOGGER.debug('My PID: %s' % os.getpid()) self.timer.begin('main training') mp.set_start_method('spawn', force=True) self.timer.begin('build data pipeline') # these reservoirs are used to send data to sub-process train_data_process_reservoir = [queue.Queue(self.train_cache_size) for i in range(len(self.devices))] valid_data_process_reservoir = [queue.Queue(self.valid_cache_size) for i in range(len(self.devices))] meta_valid_reservoir = [queue.Queue(self.eval_tasks) for i in range(self.total_exp)] # these reserviors are used to only store the extracted data train_data_extract_reservoir = [queue.Queue(self.train_cache_size) for i in range(len(self.devices))] valid_data_extract_reservoir = [queue.Queue(self.valid_cache_size) for i in range(len(self.devices))] if self.fix_valid: valid_data_cache = [[] for _ in range(len(self.devices))] valid_data_pointer = [0 for _ in range(len(self.devices))] train_recv, valid_recv = [], [] train_send, valid_send = [], [] for i in range(len(self.devices)): recv, send = Pipe(True) # activate the first handshake recv.send(True) train_recv.append(recv) train_send.append(send) recv, send = Pipe(True) # activate the first handshake recv.send(True) valid_recv.append(recv) valid_send.append(send) def apply_device_to_hp(hp, device): hp['device'] = 'cuda:{}'.format(device) return hp self.timer.end('build data pipeline') self.timer.begin('build main proc pipeline') clsnum = get_base_class_number(meta_dataset_generator) LOGGER.info('base class number detected', clsnum) procs = [mp.Process( target=run_exp, args=( self.modules[i].MyMetaLearner, apply_device_to_hp(self.hp[i], dev), train_recv[i], valid_recv[i], clsnum, self.modules[i].process_data if self.process_protocol != 'process-in-main' else None ) ) for i, dev in enumerate(self.devices)] for p in procs: p.daemon = True; p.start() self.timer.end('build main proc pipeline') LOGGER.info('build data', self.timer.query_time_by_name('build data pipeline'), 'build proc', self.timer.query_time_by_name('build main proc pipeline')) label_meta_valid = [] data_generation = True self.timer.begin('prepare dataset') meta_train_dataset = meta_dataset_generator.meta_train_pipeline.batch(1) meta_train_generator = iter(meta_train_dataset) meta_valid_dataset = meta_dataset_generator.meta_valid_pipeline.batch(1) meta_valid_generator = iter(meta_valid_dataset) self.timer.end('prepare dataset') LOGGER.info('prepare dataset', self.timer.query_time_by_name('prepare dataset')) global valid_ens_data_load_number valid_ens_data_load_number = 0 def train_pipe_fill(): while data_generation: data_train = process_task_batch(next(meta_train_generator), device=torch.device('cpu'), with_origin_label=True) for dr in train_data_extract_reservoir: try: dr.put_nowait(data_train) except: pass time.sleep(0.001) def valid_pipe_fill(): global valid_ens_data_load_number while data_generation: data_valid = process_task_batch(next(meta_valid_generator), device=torch.device('cpu'), with_origin_label=False) for dr in valid_data_extract_reservoir: try: dr.put_nowait(data_valid) except: pass if random.random() < 0.1 and valid_ens_data_load_number < self.eval_tasks: # fill the meta-valid valid_ens_data_load_number += 1 label_meta_valid.extend(data_valid[1][1].tolist()) for dr in meta_valid_reservoir: dr.put([data_valid[0][0], data_valid[0][1], data_valid[1][0]]) time.sleep(0.001) def put_data_train_passive(i): while data_generation: try: if train_send[i].recv(): 
train_send[i].send(train_data_process_reservoir[i].get()) else: return except: pass def put_data_valid_passive(i): while data_generation: try: if valid_send[i].recv(): if self.fix_valid: if len(valid_data_cache[i]) == self.hp[i]['eval_tasks']: # retrieve the ith element data = valid_data_cache[i][valid_data_pointer[i]] valid_data_pointer[i] = (valid_data_pointer[i] + 1) % self.hp[i]['eval_tasks'] valid_send[i].send(data) else: # fill the cache data = valid_data_process_reservoir[i].get() valid_data_cache[i].append(data) valid_send[i].send(data) else: valid_send[i].send(valid_data_process_reservoir[i].get()) else: return except: pass def process_data(i, train=True): while data_generation: extract_ = train_data_extract_reservoir[i] if train else valid_data_extract_reservoir[i] process_ = train_data_process_reservoir[i] if train else valid_data_process_reservoir[i] data = extract_.get() if data == False: break if self.process_protocol == 'process-in-main': data = self.modules[i].process_data(data[0], data[1], train, apply_device_to_hp(self.hp[i], self.devices[i])) process_.put(data) thread_pool = [threading.Thread(target=train_pipe_fill), threading.Thread(target=valid_pipe_fill)] + \ [threading.Thread(target=put_data_train_passive, args=(i,)) for i in range(self.total_exp)] + \ [threading.Thread(target=put_data_valid_passive, args=(i,)) for i in range(self.total_exp)] + \ [threading.Thread(target=process_data, args=(i, train)) for i, train in itertools.product(range(self.total_exp), [True, False])] for th in thread_pool: th.daemon = True; th.start() try: # we leave about 20 min for decoding of test for p in procs: p.join(max(self.timer.time_left() - 60 * 20, 0.1)) self.timer.begin('clear env') # terminate proc that is out-of-time LOGGER.info('Main meta-train is done', '' if self.timer.time_left() > 60 else 'time out exit') LOGGER.info('time left', self.timer.time_left(), 's') for p in procs: if p.is_alive(): p.terminate() LOGGER.info('all process terminated') data_generation = False LOGGER.info('send necessary messages in case of block') # solve the pipe block try: for s in train_recv + valid_recv: s.send(False) for s in train_send + train_recv + valid_send + valid_recv: s.close() except: LOGGER.error('wired, it should not fire any errors, but it just did') # solve the block of extract reservoir for q in train_data_extract_reservoir + valid_data_extract_reservoir: if q.empty(): q.put(False) for q in train_data_process_reservoir + valid_data_process_reservoir: if q.full(): q.get() elif q.empty(): q.put(False) LOGGER.info('wait for all data thread') for p in thread_pool: p.join() LOGGER.info('wait for sub process to exit') for p in procs: p.join() self.timer.end('clear env') LOGGER.info('clear env', self.timer.query_time_by_name('clear env')) self.timer.end('main training') except Exception: LOGGER.info('error occured in main process') traceback.print_exc() LOGGER.info('spawn total {} meta valid tasks. 
main training time {}'.format(valid_ens_data_load_number, self.timer.query_time_by_name('main training'))) self.timer.begin('load learner') self.meta_learners = [None] * self.total_exp def load_model(args): module, hp, i = args self.meta_learners[i] = module.load_model(hp) pool = [threading.Thread(target=load_model, args=((self.modules[i], self.hp[i], i), )) for i in range(self.total_exp)] for p in pool: p.daemon=True; p.start() for p in pool: p.join() self.timer.end('load learner') LOGGER.info('load learner done, time spent', self.timer.query_time_by_name('load learner')) if not isinstance(self.ensemble, int): # auto-ensemble by exhaustive search procs = [] reses = [None] * len(self.meta_learners) self.timer.begin('validation') recv_list, sent_list = [], [] for i in range(self.total_exp): r, s = Pipe(True) r.send(True) recv_list.append(r) sent_list.append(s) processes = [mp.Process(target=predict, args=( self.meta_learners[i], recv_list[i], self.eval_tasks, self.hp[i]['device'], { 'time_fired': time.time(), 'taskid': i } )) for i in range(self.total_exp)] for p in processes: p.daemon = True; p.start() # start sub thread to pass data def pass_meta_data(i): for _ in range(self.eval_tasks): if sent_list[i].recv(): sent_list[i].send(meta_valid_reservoir[i].get()) threads = [threading.Thread(target=pass_meta_data, args=(i, )) for i in range(self.total_exp)] for t in threads: t.daemon = True; t.start() for _ in range(self.eval_tasks - valid_ens_data_load_number): data_valid = next(meta_valid_generator) data_valid = process_task_batch(data_valid, device=torch.device('cpu'), with_origin_label=False) label_meta_valid.extend(data_valid[1][1].tolist()) for dr in meta_valid_reservoir: dr.put([data_valid[0][0], data_valid[0][1], data_valid[1][0]]) # LOGGER.info('put data!') LOGGER.info('all data done!') LOGGER.info(len(label_meta_valid)) # now we can receive data for t in threads: t.join() reses = [sent_list[i].recv()['res'] for i in range(self.total_exp)] for send in sent_list: send.send(True) # for p in processes: p.join() # every res in reses is a np.array of shape (eval_task * WAY * QUERY) * WAY ENS_VALID_TASK = 100 ENS_VALID_ELEMENT = ENS_VALID_TASK * 5 * 19 reses_test_list = [deepcopy(res[-ENS_VALID_ELEMENT:]) for res in reses] self.timer.end('validation') LOGGER.info('valid data predict done', self.timer.query_time_by_name('validation')) weight = [1.] 
* len(self.meta_learners) labels = np.array(label_meta_valid, dtype=np.int) # 19000 acc_o = ((np.array(weight)[:,None, None] / sum(weight) * np.array(reses)).sum(axis=0).argmax(axis=1) == labels).astype(np.float).mean() reses = np.array(reses, dtype=np.float).transpose((1, 0, 2)) reses_test = reses[-ENS_VALID_ELEMENT:].reshape(ENS_VALID_ELEMENT, -1) reses = reses[:-ENS_VALID_ELEMENT] reses = reses.reshape(len(reses), -1) labels_test = labels[-ENS_VALID_ELEMENT:] labels = labels[:-ENS_VALID_ELEMENT] LOGGER.info('voting result', acc_o) self.timer.begin('ensemble') # mp.set_start_method('fork', True) pool = mp.Pool(3) result = pool.map(ensemble_on_data, [ # (GBMEnsembler(), reses, labels, 'gbm'), # currently, gbm has some problems when save/load (GLMEnsembler(), reses, labels, 'glm'), (NBEnsembler(), reses, labels, 'nb'), (RFEnsembler(), reses, labels, 'rf') # too over-fit on simple dataset ]) # test the ensemble model def acc(logit, label): return (logit.argmax(axis=1) == label).mean() res_test = [x[0]._predict(reses_test) for x in result] acc_test = [acc(r, labels_test) for r in res_test] acc_single_test = [acc(np.array(r), labels_test) for r in reses_test_list] LOGGER.info('ensemble test', 'glm', 'nb', 'rf', acc_test) LOGGER.info('single test', acc_single_test) if max(acc_test) > max(acc_single_test): LOGGER.info("will use ensemble model") #idx_acc_max = np.argmax([x[1] for x in result]) idx_acc_max = np.argmax(acc_test) self.timer.end('ensemble') print('best ensembler', ['glm', 'nb', 'rf'][idx_acc_max], 'acc', acc_test[idx_acc_max]) print('ensemble done, time cost', self.timer.query_time_by_name('ensemble')) return MyLearner(self.meta_learners, result[idx_acc_max][0], timers=self.timer) else: LOGGER.info("will use single model") idx_acc_max = np.argmax(acc_single_test) self.timer.end('ensemble') print('best single model id', idx_acc_max) print('ensemble done, time cost', self.timer.query_time_by_name('ensemble')) # return only the best meta learners return MyLearner([self.meta_learners[idx_acc_max]], 0, self.timer) return MyLearner([self.meta_learners[self.ensemble]], 0, timers=self.timer)
                           lr=0.0007, alpha=0.99, eps=0.1, momentum=0.0)
# Optimizer.share_memory()
CriticOptimizer.share_memory()
ActorOptimizer.share_memory()

lock = Lock()
num_cpu = 4
agents = []
for cpu in range(num_cpu):
    agents.append(Agent(cpu))

receiver, sender = Pipe()
agent_threads = []
for agent in agents:
    thread = Process(target=agent.letsgo,
                     args=(GlobalModel, CriticOptimizer, ActorOptimizer, lock, sender,
                           MAX_EPISODES, MAX_ACTIONS, DISCOUNT_FACTOR, STEPS, Optimizer,
class OnlineVaeAlgorithm(TorchBatchRLAlgorithm): def __init__(self, vae, vae_trainer, *base_args, vae_save_period=1, vae_training_schedule=vae_schedules.never_train, oracle_data=False, parallel_vae_train=True, vae_min_num_steps_before_training=0, uniform_dataset=None, **base_kwargs): super().__init__(*base_args, **base_kwargs) assert isinstance(self.replay_buffer, ReplayBuffer) self.vae = vae self.vae_trainer = vae_trainer self.vae_trainer.model = self.vae self.vae_save_period = vae_save_period self.vae_training_schedule = vae_training_schedule self.oracle_data = oracle_data self.parallel_vae_train = parallel_vae_train self.vae_min_num_steps_before_training = vae_min_num_steps_before_training self.uniform_dataset = uniform_dataset self._vae_training_process = None self._update_subprocess_vae_thread = None self._vae_conn_pipe = None def _train(self): super()._train() self._cleanup() def _end_epoch(self, epoch): self._train_vae(epoch) gt.stamp('vae training') super()._end_epoch(epoch) def _log_stats(self, epoch): self._log_vae_stats() super()._log_stats(epoch) def to(self, device): self.vae.to(device) super().to(device) def _get_snapshot(self): snapshot = super()._get_snapshot() assert 'vae' not in snapshot snapshot['vae'] = self.vae snapshot['replay_buffer'] = dict( _obs=self.replay_buffer._obs, _actions=self.replay_buffer._actions, _next_obs=self.replay_buffer._next_obs, _terminals=self.replay_buffer._terminals, _size=self.replay_buffer._size, _top=self.replay_buffer._top, _idx_to_future_obs_idx=self.replay_buffer._idx_to_future_obs_idx) return snapshot """ VAE-specific Code """ def _train_vae(self, epoch): if self.parallel_vae_train and self._vae_training_process is None: self.init_vae_training_subprocess() should_train, amount_to_train = self.vae_training_schedule(epoch) rl_start_epoch = int(self.min_num_steps_before_training / (self.num_expl_steps_per_train_loop * self.num_train_loops_per_epoch)) if should_train or epoch <= (rl_start_epoch - 1): if self.parallel_vae_train: assert self._vae_training_process.is_alive() # Make sure the last vae update has finished before starting # another one if self._update_subprocess_vae_thread is not None: self._update_subprocess_vae_thread.join() self._update_subprocess_vae_thread = Thread( target=OnlineVaeAlgorithm. 
update_vae_in_training_subprocess, args=(self, epoch, ptu.device)) self._update_subprocess_vae_thread.start() self._vae_conn_pipe.send((amount_to_train, epoch)) else: _train_vae(self.vae_trainer, self.replay_buffer, epoch, amount_to_train) self.replay_buffer.refresh_latents(epoch) _test_vae( self.vae_trainer, epoch, self.replay_buffer, vae_save_period=self.vae_save_period, uniform_dataset=self.uniform_dataset, ) def _log_vae_stats(self): logger.record_dict( self.vae_trainer.get_diagnostics(), prefix='vae_trainer/', ) def _cleanup(self): if self.parallel_vae_train: self._vae_conn_pipe.close() self._vae_training_process.terminate() def init_vae_training_subprocess(self): assert isinstance(self.replay_buffer, SharedObsDictRelabelingBuffer) self._vae_conn_pipe, process_pipe = Pipe() self._vae_training_process = Process( target=subprocess_train_vae_loop, args=( process_pipe, self.vae, self.vae.state_dict(), self.replay_buffer, self.replay_buffer.get_mp_info(), ptu.device, )) self._vae_training_process.start() self._vae_conn_pipe.send(self.vae_trainer) def update_vae_in_training_subprocess(self, epoch, device): self.vae.__setstate__(self._vae_conn_pipe.recv()) self.vae.to(device) _test_vae( self.vae_trainer, epoch, self.replay_buffer, vae_save_period=self.vae_save_period, uniform_dataset=self.uniform_dataset, )
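# Note: `subprocess_train_vae_loop` is referenced above but not shown in this snippet.
# Judging from the pipe traffic (the trainer object is sent right after the process
# starts, then `(amount_to_train, epoch)` tuples arrive, and the parent later reads a
# state back via `vae.__setstate__`), a hypothetical consumer loop could look roughly
# like the sketch below. The names and the elided shared-memory re-attachment are
# assumptions for illustration, not the repository's actual implementation.
def subprocess_train_vae_loop(conn_pipe, vae, vae_params, replay_buffer, mp_info, device):
    vae_trainer = conn_pipe.recv()          # the trainer object is shipped over first
    vae.load_state_dict(vae_params)
    vae.to(device)
    vae_trainer.model = vae
    # (re-attaching `replay_buffer` to the shared-memory arrays described by `mp_info` is elided here)
    while True:
        amount_to_train, epoch = conn_pipe.recv()
        _train_vae(vae_trainer, replay_buffer, epoch, amount_to_train)
        conn_pipe.send(vae.__getstate__())  # the parent applies this via vae.__setstate__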
def main(): args = get_args() device = torch.device('cuda' if args.cuda else 'cpu') env = gym.make(args.env_name) input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 if 'Breakout' in args.env_name: output_size -= 1 env.close() is_render = False if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) model_path = os.path.join(args.save_dir, args.env_name + '.model') predictor_path = os.path.join(args.save_dir, args.env_name + '.pred') target_path = os.path.join(args.save_dir, args.env_name + '.target') writer = SummaryWriter(log_dir=args.log_dir) reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 1, 84, 84)) discounted_reward = RewardForwardFilter(args.ext_gamma) model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net) rnd = RNDModel(input_size, output_size) model = model.to(device) rnd = rnd.to(device) optimizer = optim.Adam(list(model.parameters()) + list(rnd.predictor.parameters()), lr=args.lr) if args.load_model: if args.cuda: model.load_state_dict(torch.load(model_path)) else: model.load_state_dict(torch.load(model_path, map_location='cpu')) works = [] parent_conns = [] child_conns = [] for idx in range(args.num_worker): parent_conn, child_conn = Pipe() work = AtariEnvironment(args.env_name, is_render, idx, child_conn, sticky_action=args.sticky_action, p=args.sticky_action_prob, max_episode_steps=args.max_episode_steps) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([args.num_worker, 4, 84, 84]) sample_env_index = 0 # Sample Environment index to log sample_episode = 0 sample_rall = 0 sample_step = 0 sample_i_rall = 0 global_update = 0 global_step = 0 # normalize observation print('Initializes observation normalization...') next_obs = [] for step in range(args.num_step * args.pre_obs_norm_steps): actions = np.random.randint(0, output_size, size=(args.num_worker, )) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) for parent_conn in parent_conns: next_state, reward, done, realdone, log_reward = parent_conn.recv() next_obs.append(next_state[3, :, :].reshape([1, 84, 84])) if len(next_obs) % (args.num_step * args.num_worker) == 0: next_obs = np.stack(next_obs) obs_rms.update(next_obs) next_obs = [] print('Training...') while True: total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_action_probs = [], [], [], [], [], [], [], [], [], [] global_step += (args.num_worker * args.num_step) global_update += 1 # Step 1. n-step rollout for _ in range(args.num_step): actions, value_ext, value_int, action_probs = get_action( model, device, np.float32(states) / 255.) 
for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], [] for parent_conn in parent_conns: next_state, reward, done, real_done, log_reward = parent_conn.recv( ) next_states.append(next_state) rewards.append(reward) dones.append(done) real_dones.append(real_done) log_rewards.append(log_reward) next_obs.append(next_state[3, :, :].reshape([1, 84, 84])) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) next_obs = np.stack(next_obs) # total reward = int reward + ext Reward intrinsic_reward = compute_intrinsic_reward( rnd, device, ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5)) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_index] total_next_obs.append(next_obs) total_int_reward.append(intrinsic_reward) total_state.append(states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_ext_values.append(value_ext) total_int_values.append(value_int) total_action_probs.append(action_probs) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_index] sample_step += 1 if real_dones[sample_env_index]: sample_episode += 1 writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.) total_ext_values.append(value_ext) total_int_values.append(value_int) # -------------------------------------------------- total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape( [-1, 4, 84, 84]) total_reward = np.stack(total_reward).transpose().clip(-1, 1) total_action = np.stack(total_action).transpose().reshape([-1]) total_done = np.stack(total_done).transpose() total_next_obs = np.stack(total_next_obs).transpose( [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84]) total_ext_values = np.stack(total_ext_values).transpose() total_int_values = np.stack(total_int_values).transpose() total_logging_action_probs = np.vstack(total_action_probs) # Step 2. calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([ discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T ]) mean, std, count = np.mean(total_reward_per_env), np.std( total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std**2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / args.num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / args.num_worker, global_update) # ------------------------------------------------------------------------------------------- # logging Max action probability writer.add_scalar('data/max_prob', total_logging_action_probs.max(1).mean(), sample_episode) # Step 3. 
make target and advantage # extrinsic reward calculate ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values, args.ext_gamma, args.gae_lambda, args.num_step, args.num_worker, args.use_gae) # intrinsic reward calculate # None Episodic int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_int_values, args.int_gamma, args.gae_lambda, args.num_step, args.num_worker, args.use_gae) # add ext adv and int adv total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef # ----------------------------------------------- # Step 4. update obs normalize param obs_rms.update(total_next_obs) # ----------------------------------------------- # Step 5. Training! train_model(args, device, output_size, model, rnd, optimizer, np.float32(total_state) / 255., ext_target, int_target, total_action, total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5), total_action_probs) if global_step % (args.num_worker * args.num_step * args.save_interval) == 0: print('Now Global Step :{}'.format(global_step)) torch.save(model.state_dict(), model_path) torch.save(rnd.predictor.state_dict(), predictor_path) torch.save(rnd.target.state_dict(), target_path)
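# The observation/reward statistics above depend on `RunningMeanStd.update` and
# `update_from_moments`, which are defined elsewhere in this repository. A minimal
# sketch of the usual parallel-variance (Chan et al.) formulation is given below for
# reference; it illustrates the technique and is not necessarily the exact class used here.
import numpy as np

class RunningMeanStd:
    """Running mean/variance tracker updated from batch moments."""
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # x holds a batch of samples along axis 0
        self.update_from_moments(x.mean(axis=0), x.var(axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(delta) * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total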
class OnlineVaeOffpolicyAlgorithm(TorchBatchRLAlgorithm): def __init__(self, vae, vae_trainer, *base_args, vae_save_period=1, vae_training_schedule=vae_schedules.never_train, oracle_data=False, parallel_vae_train=True, vae_min_num_steps_before_training=0, uniform_dataset=None, dataset_path=None, rl_offpolicy_num_training_steps=0, **base_kwargs): super().__init__(*base_args, **base_kwargs) assert isinstance(self.replay_buffer, OnlineVaeRelabelingBuffer) self.vae = vae self.vae_trainer = vae_trainer self.vae_trainer.model = self.vae self.vae_save_period = vae_save_period self.vae_training_schedule = vae_training_schedule self.oracle_data = oracle_data self.parallel_vae_train = parallel_vae_train self.vae_min_num_steps_before_training = vae_min_num_steps_before_training self.uniform_dataset = uniform_dataset self._vae_training_process = None self._update_subprocess_vae_thread = None self._vae_conn_pipe = None self.dataset_path = dataset_path if self.dataset_path: self.load_dataset(dataset_path) # train Q and policy rl_offpolicy_num_training_steps times self.rl_offpolicy_num_training_steps = rl_offpolicy_num_training_steps def pretrain(self): for _ in range(self.rl_offpolicy_num_training_steps): train_data = self.replay_buffer.random_batch(self.batch_size) self.trainer.train(train_data) def load_dataset(self, dataset_path): dataset = load_local_or_remote_file(dataset_path) dataset = dataset.item() observations = dataset['observations'] actions = dataset['actions'] # dataset['observations'].shape # (2000, 50, 6912) # dataset['actions'].shape # (2000, 50, 2) # dataset['env'].shape # (2000, 6912) N, H, imlength = observations.shape self.vae.eval() for n in range(N): x0 = ptu.from_numpy(dataset['env'][n:n + 1, :] / 255.0) x = ptu.from_numpy(observations[n, :, :] / 255.0) latents = self.vae.encode(x, x0, distrib=False) r1, r2 = self.vae.latent_sizes conditioning = latents[0, r1:] goal = torch.cat( [ptu.randn(self.vae.latent_sizes[0]), conditioning]) goal = ptu.get_numpy(goal) # latents[-1, :] latents = ptu.get_numpy(latents) latent_delta = latents - goal distances = np.zeros((H - 1, 1)) for i in range(H - 1): distances[i, 0] = np.linalg.norm(latent_delta[i + 1, :]) terminals = np.zeros((H - 1, 1)) # terminals[-1, 0] = 1 path = dict( observations=[], actions=actions[n, :H - 1, :], next_observations=[], rewards=-distances, terminals=terminals, ) for t in range(H - 1): # reward = -np.linalg.norm(latent_delta[i, :]) obs = dict( latent_observation=latents[t, :], latent_achieved_goal=latents[t, :], latent_desired_goal=goal, ) next_obs = dict( latent_observation=latents[t + 1, :], latent_achieved_goal=latents[t + 1, :], latent_desired_goal=goal, ) path['observations'].append(obs) path['next_observations'].append(next_obs) # import ipdb; ipdb.set_trace() self.replay_buffer.add_path(path) def _end_epoch(self): self._train_vae(self.epoch) timer.stamp('vae training') super()._end_epoch() def _get_diagnostics(self): vae_log = self._get_vae_diagnostics().copy() vae_log.update(super()._get_diagnostics()) return vae_log def to(self, device): self.vae.to(device) super().to(device) """ VAE-specific Code """ def _train_vae(self, epoch): if self.parallel_vae_train and self._vae_training_process is None: self.init_vae_training_subprocess() should_train, amount_to_train = self.vae_training_schedule(epoch) rl_start_epoch = int(self.min_num_steps_before_training / (self.num_expl_steps_per_train_loop * self.num_train_loops_per_epoch)) if should_train: # or epoch <= (rl_start_epoch - 1): if self.parallel_vae_train: assert 
self._vae_training_process.is_alive() # Make sure the last vae update has finished before starting # another one if self._update_subprocess_vae_thread is not None: self._update_subprocess_vae_thread.join() self._update_subprocess_vae_thread = Thread( target=OnlineVaeAlgorithm. update_vae_in_training_subprocess, args=(self, epoch, ptu.device)) self._update_subprocess_vae_thread.start() self._vae_conn_pipe.send((amount_to_train, epoch)) else: _train_vae(self.vae_trainer, epoch, self.replay_buffer, amount_to_train) self.replay_buffer.refresh_latents(epoch) _test_vae( self.vae_trainer, epoch, self.replay_buffer, vae_save_period=self.vae_save_period, uniform_dataset=self.uniform_dataset, ) def _get_vae_diagnostics(self): return add_prefix( self.vae_trainer.get_diagnostics(), prefix='vae_trainer/', ) def _cleanup(self): if self.parallel_vae_train: self._vae_conn_pipe.close() self._vae_training_process.terminate() def init_vae_training_subprocess(self): assert isinstance(self.replay_buffer, SharedObsDictRelabelingBuffer) self._vae_conn_pipe, process_pipe = Pipe() self._vae_training_process = Process( target=subprocess_train_vae_loop, args=( process_pipe, self.vae, self.vae.state_dict(), self.replay_buffer, self.replay_buffer.get_mp_info(), ptu.device, )) self._vae_training_process.start() self._vae_conn_pipe.send(self.vae_trainer) def update_vae_in_training_subprocess(self, epoch, device): self.vae.__setstate__(self._vae_conn_pipe.recv()) self.vae.to(device) _test_vae( self.vae_trainer, epoch, self.replay_buffer, vae_save_period=self.vae_save_period, uniform_dataset=self.uniform_dataset, )
def __init__(self, args, model_constructor, env_constructor): self.args = args self.policy_string = self.compute_policy_type() #Evolution self.evolver = SSNE(self.args) #MP TOOLS self.manager = Manager() #Genealogy tool self.genealogy = Genealogy() #Initialize population self.population = self.manager.list() seed = True for _ in range(args.pop_size): self.population.append( model_constructor.make_model(self.policy_string, seed=seed)) seed = False #SEED #self.population[0].load_state_dict(torch.load('Results/Auxiliary/_bestcerl_td3_s2019_roll10_pop10_portfolio10')) #Save best policy self.best_policy = model_constructor.make_model(self.policy_string) #Turn off gradients and put in eval mod for actor in self.population: actor = actor.cpu() actor.eval() #Init BUFFER self.replay_buffer = Buffer(args.buffer_size) self.data_bucket = self.replay_buffer.tuples #Intialize portfolio of learners self.portfolio = [] self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, args.portfolio_id, model_constructor) #Initialize Rollout Bucket self.rollout_bucket = self.manager.list() for _ in range(len(self.portfolio)): self.rollout_bucket.append( model_constructor.make_model(self.policy_string)) ############## MULTIPROCESSING TOOLS ################### #Evolutionary population Rollout workers self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)] self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)] self.evo_workers = [ Process(target=rollout_worker, args=(id, 'evo', self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], self.data_bucket, self.population, env_constructor)) for id in range(args.pop_size) ] for worker in self.evo_workers: worker.start() self.evo_flag = [True for _ in range(args.pop_size)] #Learner rollout workers self.task_pipes = [Pipe() for _ in range(args.rollout_size)] self.result_pipes = [Pipe() for _ in range(args.rollout_size)] self.workers = [ Process(target=rollout_worker, args=(id, 'pg', self.task_pipes[id][1], self.result_pipes[id][0], self.data_bucket, self.rollout_bucket, env_constructor)) for id in range(args.rollout_size) ] for worker in self.workers: worker.start() self.roll_flag = [True for _ in range(args.rollout_size)] #Test bucket self.test_bucket = self.manager.list() self.test_bucket.append( model_constructor.make_model(self.policy_string)) #5 Test workers self.test_task_pipes = [ Pipe() for _ in range(env_constructor.dummy_env.test_size) ] self.test_result_pipes = [ Pipe() for _ in range(env_constructor.dummy_env.test_size) ] self.test_workers = [ Process(target=rollout_worker, args=(id, 'test', self.test_task_pipes[id][1], self.test_result_pipes[id][0], None, self.test_bucket, env_constructor)) for id in range(env_constructor.dummy_env.test_size) ] for worker in self.test_workers: worker.start() self.test_flag = False #Meta-learning controller (Resource Distribution) self.allocation = [ ] #Allocation controls the resource allocation across learners for i in range(args.rollout_size): self.allocation.append( i % len(self.portfolio)) #Start uniformly (equal resources) #Trackers self.best_score = 0.0 self.gen_frames = 0 self.total_frames = 0 self.test_score = None self.test_std = None self.best_r1_score = 0.0 self.ep_len = 0 self.r1_reward = 0 self.num_footsteps = 0 self.test_trace = []
if is_load_model:
    if use_cuda:
        agent.model.load_state_dict(torch.load(load_model_path))
    else:
        agent.model.load_state_dict(
            torch.load(load_model_path, map_location='cpu'))

if not is_training:
    agent.model.eval()

works = []
parent_conns = []
child_conns = []
for idx in range(num_worker):
    parent_conn, child_conn = Pipe()
    work = MarioEnvironment(env_id, is_render, idx, child_conn)
    work.start()
    works.append(work)
    parent_conns.append(parent_conn)
    child_conns.append(child_conn)

states = np.zeros([num_worker, 4, 84, 84])

sample_episode = 0
sample_rall = 0
sample_step = 0
sample_env_idx = 0
global_step = 0
recent_prob = deque(maxlen=10)
def ars(env_name, policy, n_epochs, env_config={}, n_workers=8, step_size=.02,
        n_delta=32, n_top=16, exp_noise=0.03, zero_policy=True, learn_means=True,
        postprocess=postprocess_default):
    """
    Augmented Random Search
    https://arxiv.org/pdf/1803.07055

    Args:
    Returns:
    Example:
    """
    torch.autograd.set_grad_enabled(False)

    proc_list = []
    master_pipe_list = []
    for i in range(n_workers):
        master_con, worker_con = Pipe()
        proc = Process(target=worker_fn,
                       args=(worker_con, env_name, env_config, policy, postprocess))
        proc.start()
        proc_list.append(proc)
        master_pipe_list.append(master_con)

    W = torch.nn.utils.parameters_to_vector(policy.parameters())
    n_param = W.shape[0]

    if zero_policy:
        W = torch.zeros_like(W)

    env = gym.make(env_name, **env_config)
    s_mean = policy.state_means
    s_std = policy.state_std
    total_steps = 0
    env.close()

    r_hist = []
    lr_hist = []

    exp_dist = torch.distributions.Normal(torch.zeros(n_delta, n_param),
                                          torch.ones(n_delta, n_param))

    for epoch in range(n_epochs):
        deltas = exp_dist.sample()
        pm_W = torch.cat((W + (deltas * exp_noise), W - (deltas * exp_noise)))

        for i, Ws in enumerate(pm_W):
            master_pipe_list[i % n_workers].send((Ws, s_mean, s_std))

        results = []
        for i, _ in enumerate(pm_W):
            results.append(master_pipe_list[i % n_workers].recv())

        states = torch.empty(0)
        p_returns = []
        m_returns = []
        l_returns = []
        top_returns = []

        for p_result, m_result in zip(results[:n_delta], results[n_delta:]):
            ps, pr, plr = p_result
            ms, mr, mlr = m_result
            states = torch.cat((states, ms, ps), dim=0)
            p_returns.append(pr)
            m_returns.append(mr)
            l_returns.append(plr)
            l_returns.append(mlr)
            top_returns.append(max(pr, mr))

        top_idx = sorted(range(len(top_returns)), key=lambda k: top_returns[k], reverse=True)[:n_top]
        p_returns = torch.stack(p_returns)[top_idx]
        m_returns = torch.stack(m_returns)[top_idx]
        l_returns = torch.stack(l_returns)[top_idx]

        lr_hist.append(l_returns.mean())
        r_hist.append((p_returns.mean() + m_returns.mean()) / 2)

        ep_steps = states.shape[0]
        s_mean = update_mean(states, s_mean, total_steps)
        s_std = update_std(states, s_std, total_steps)
        total_steps += ep_steps

        if epoch % 5 == 0:
            print(f"epoch: {epoch}, reward: {lr_hist[-1].item()}, processed reward: {r_hist[-1].item()}")

        W = W + (step_size / (n_delta * torch.cat((p_returns, m_returns)).std() + 1e-6)) * \
            torch.sum((p_returns - m_returns) * deltas[top_idx].T, dim=1)

    for pipe in master_pipe_list:
        pipe.send("STOP")

    policy.state_means = s_mean
    policy.state_std = s_std
    torch.nn.utils.vector_to_parameters(W, policy.parameters())
    return policy, r_hist, lr_hist
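# `worker_fn` is spawned above but not shown in this snippet. Based on the master-side
# protocol (workers receive `(W, state_mean, state_std)` tuples, reply with
# `(states, processed_return, raw_return)`, and exit on the "STOP" sentinel), a
# hypothetical worker could look like the sketch below. The rollout loop and the way
# `postprocess` condenses rewards into a scalar are assumptions for illustration only
# (the class-based variant later in this file also passes a seed, omitted here).
def worker_fn(worker_con, env_name, env_config, policy, postprocess):
    torch.autograd.set_grad_enabled(False)
    env = gym.make(env_name, **env_config)
    while True:
        msg = worker_con.recv()
        if msg == "STOP":
            worker_con.close()
            return
        W, s_mean, s_std = msg
        torch.nn.utils.vector_to_parameters(W, policy.parameters())
        policy.state_means, policy.state_std = s_mean, s_std
        obs, done = env.reset(), False
        states, rewards = [], []
        while not done:
            action = policy(torch.as_tensor(obs, dtype=torch.float32))
            obs, r, done, _ = env.step(action.numpy())
            states.append(torch.as_tensor(obs, dtype=torch.float32))
            rewards.append(r)
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        # processed return is used for ranking the deltas, raw return for logging
        worker_con.send((torch.stack(states), postprocess(rewards), rewards.sum()))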
def main(): print({section: dict(config[section]) for section in config.sections()}) env_id = default_config['EnvID'] env_type = default_config['EnvType'] if env_type == 'mario': env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT) elif env_type == 'atari': env = gym.make(env_id) else: raise NotImplementedError input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 if 'Breakout' in env_id: output_size -= 1 env.close() is_render = True model_path = 'models/{}.model'.format(env_id) predictor_path = 'models/{}.pred'.format(env_id) target_path = 'models/{}.target'.format(env_id) use_cuda = False use_gae = default_config.getboolean('UseGAE') #use_noisy_net = default_config.getboolean('UseNoisyNet') lam = float(default_config['Lambda']) num_worker = 1 num_step = int(default_config['NumStep']) ppo_eps = float(default_config['PPOEps']) epoch = int(default_config['Epoch']) mini_batch = int(default_config['MiniBatch']) batch_size = int(num_step * num_worker / mini_batch) learning_rate = float(default_config['LearningRate']) entropy_coef = float(default_config['Entropy']) gamma = float(default_config['Gamma']) clip_grad_norm = float(default_config['ClipGradNorm']) sticky_action = False action_prob = float(default_config['ActionProb']) life_done = default_config.getboolean('LifeDone') agent = RNDAgent if default_config['EnvType'] == 'atari': env_type = AtariEnvironment elif default_config['EnvType'] == 'mario': env_type = MarioEnvironment else: raise NotImplementedError agent = agent(input_size, output_size, num_worker, num_step, gamma, lam=lam, learning_rate=learning_rate, ent_coef=entropy_coef, clip_grad_norm=clip_grad_norm, epoch=epoch, batch_size=batch_size, ppo_eps=ppo_eps, use_cuda=use_cuda, use_gae=use_gae) print('Loading Pre-trained model....') if use_cuda: agent.model.load_state_dict(torch.load(model_path)) agent.rnd.predictor.load_state_dict(torch.load(predictor_path)) agent.rnd.target.load_state_dict(torch.load(target_path)) else: agent.model.load_state_dict(torch.load(model_path, map_location='cpu')) agent.rnd.predictor.load_state_dict( torch.load(predictor_path, map_location='cpu')) agent.rnd.target.load_state_dict( torch.load(target_path, map_location='cpu')) print('End load...') works = [] parent_conns = [] child_conns = [] for idx in range(num_worker): parent_conn, child_conn = Pipe() work = env_type(env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob, life_done=life_done) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([num_worker, 4, 84, 84]) steps = 0 rall = 0 rd = False intrinsic_reward_list = [] while not rd: steps += 1 actions, value_ext, value_int, policy = agent.get_action( np.float32(states) / 255.) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], [] for parent_conn in parent_conns: s, r, d, rd, lr = parent_conn.recv() rall += r next_states = s.reshape([1, 4, 84, 84]) next_obs = s[3, :, :].reshape([1, 1, 84, 84]) # total reward = int reward + ext Reward intrinsic_reward = agent.compute_intrinsic_reward(next_obs) intrinsic_reward_list.append(intrinsic_reward) states = next_states[:, :, :, :] if rd: intrinsic_reward_list = ( intrinsic_reward_list - np.mean(intrinsic_reward_list)) / np.std(intrinsic_reward_list) with open('int_reward', 'wb') as f: pickle.dump(intrinsic_reward_list, f) steps = 0 rall = 0
def __init__(self, args):
    self.args = args
    self.evolver = SSNE(self.args)

    #MP TOOLS
    self.manager = Manager()

    #Genealogy tool
    self.genealogy = Genealogy()

    #Initialize population
    self.pop = self.manager.list()
    for _ in range(args.pop_size):
        wwid = self.genealogy.new_id('evo')
        if ALGO == 'SAC':
            self.pop.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid))
        else:
            self.pop.append(Actor(args.state_dim, args.action_dim, wwid))

    if ALGO == "SAC":
        self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)
    else:
        self.best_policy = Actor(args.state_dim, args.action_dim, -1)

    #Turn off gradients and put in eval mode
    for actor in self.pop:
        actor = actor.cpu()
        actor.eval()

    #Init BUFFER
    self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

    #Initialize portfolio of learners
    self.portfolio = []
    self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID)
    self.rollout_bucket = self.manager.list()
    for _ in range(len(self.portfolio)):
        if ALGO == 'SAC':
            self.rollout_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
        else:
            self.rollout_bucket.append(Actor(args.state_dim, args.action_dim, -1))

    # Initialize shared data bucket
    self.data_bucket = self.replay_buffer.tuples

    ############## MULTIPROCESSING TOOLS ###################

    #Evolutionary population rollout workers
    self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
    self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
    self.evo_workers = [Process(target=rollout_worker,
                                args=(id, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0],
                                      False, self.data_bucket, self.pop, ENV_NAME, None, ALGO))
                        for id in range(args.pop_size)]
    for worker in self.evo_workers: worker.start()
    self.evo_flag = [True for _ in range(args.pop_size)]

    #Learner rollout workers
    self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
    self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
    self.workers = [Process(target=rollout_worker,
                            args=(id, self.task_pipes[id][1], self.result_pipes[id][0],
                                  True, self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO))
                    for id in range(args.rollout_size)]
    for worker in self.workers: worker.start()
    self.roll_flag = [True for _ in range(args.rollout_size)]

    #Test bucket
    self.test_bucket = self.manager.list()
    if ALGO == 'SAC':
        self.test_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
    else:
        self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1))

    #Test workers
    self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
    self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
    self.test_workers = [Process(target=rollout_worker,
                                 args=(id, self.test_task_pipes[id][1], self.test_result_pipes[id][0],
                                       False, None, self.test_bucket, ENV_NAME, args.noise_std, ALGO))
                         for id in range(TEST_SIZE)]
    for worker in self.test_workers: worker.start()
    self.test_flag = False

    #Meta-learning controller (resource distribution)
    self.allocation = []  #Allocation controls the resource allocation across learners
    for i in range(args.rollout_size):
        self.allocation.append(i % len(self.portfolio))  #Start uniformly (equal resources)
    #self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count': 0} for _ in range(len(self.portfolio))]  #Track node statistics (each node is a learner) to compute UCB scores

    #Trackers
    self.best_score = 0.0; self.gen_frames = 0; self.total_frames = 0
    self.best_shaped_score = None; self.test_score = None; self.test_std = None
def learn(self, n_epochs):
    torch.autograd.set_grad_enabled(False)

    proc_list = []
    master_pipe_list = []
    learn_start_idx = copy.copy(self.total_epochs)

    for i in range(self.n_workers):
        master_con, worker_con = Pipe()
        proc = Process(target=worker_fn,
                       args=(worker_con, self.env_name, self.env_config,
                             self.policy, self.postprocessor, self.seed))
        proc.start()
        proc_list.append(proc)
        master_pipe_list.append(master_con)

    W = torch.nn.utils.parameters_to_vector(self.policy.parameters())
    n_param = W.shape[0]

    torch.manual_seed(self.seed)
    exp_dist = torch.distributions.Normal(torch.zeros(self.n_delta, n_param),
                                          torch.ones(self.n_delta, n_param))

    for _ in range(n_epochs):
        deltas = exp_dist.sample()
        pm_W = torch.cat((W + (deltas * self.exp_noise), W - (deltas * self.exp_noise)))

        for i, Ws in enumerate(pm_W):
            master_pipe_list[i % self.n_workers].send(
                (Ws, self.policy.state_means, self.policy.state_std))

        results = []
        for i, _ in enumerate(pm_W):
            results.append(master_pipe_list[i % self.n_workers].recv())

        states = torch.empty(0)
        p_returns = []
        m_returns = []
        l_returns = []
        top_returns = []

        for p_result, m_result in zip(results[:self.n_delta], results[self.n_delta:]):
            ps, pr, plr = p_result
            ms, mr, mlr = m_result
            states = torch.cat((states, ms, ps), dim=0)
            p_returns.append(pr)
            m_returns.append(mr)
            l_returns.append(plr)
            l_returns.append(mlr)
            top_returns.append(max(pr, mr))

        top_idx = sorted(range(len(top_returns)), key=lambda k: top_returns[k], reverse=True)[:self.n_top]
        p_returns = torch.stack(p_returns)[top_idx]
        m_returns = torch.stack(m_returns)[top_idx]
        l_returns = torch.stack(l_returns)[top_idx]

        self.lr_hist.append(l_returns.mean())
        self.r_hist.append((p_returns.mean() + m_returns.mean()) / 2)

        ep_steps = states.shape[0]
        self.policy.state_means = update_mean(states, self.policy.state_means, self.total_steps)
        self.policy.state_std = update_std(states, self.policy.state_std, self.total_steps)
        self.total_steps += ep_steps
        self.total_epochs += 1

        W = W + (self.step_size / (self.n_delta * torch.cat((p_returns, m_returns)).std() + 1e-6)) * \
            torch.sum((p_returns - m_returns) * deltas[top_idx].T, dim=1)

    for pipe in master_pipe_list:
        pipe.send("STOP")
    for proc in proc_list:
        proc.join()

    torch.nn.utils.vector_to_parameters(W, self.policy.parameters())
    return self.lr_hist[learn_start_idx:]
def train(self, training_steps, no_actors, learning_rate, epsilons, n_step, beta, alpha, batch_size, policy_update, discount_factor, max_actions_per_episode, size_local_memory_buffer, eval_freq, replay_size_before_sample=None, no_envs=1): if batch_size > self.replay_mem_size: raise ValueError( "Please make sure replay memory size is larger than batch size." ) if 1 > n_step: raise ValueError("Please have n_step >= 1.") if 1 >= size_local_memory_buffer: raise ValueError("Please let size_local_memory_buffer > 1.") if not isinstance(epsilons, list): raise ValueError("Please provide epsilons as a list.") if len(epsilons) != no_envs * no_actors: raise ValueError( "Mismatch in epsilons and no_envs*no_actors. Please let len(epsilons) == no_envs*no_actors." ) world_size = no_actors + 2 #(+ Learner proces and Memory process) actor_processes = [] # Communication channels between processes transition_queue_to_memory = Queue() transition_queue_from_memory = Queue() update_priorities_queue_to_memory = Queue() # Communication pipes from learner to actors, one for each actor # For sending new network weights to the actors # The pipes are one way comunication (duplex = False) con_learner_actor = [] con_actor_learner = [] for a in range(no_actors): con_1, con_2 = Pipe(duplex=True) con_learner_actor.append(con_1) con_actor_learner.append(con_2) con_learner_memory, con_memory_learner = Pipe(duplex=True) """ Learner Process """ learner_args = { "no_actors": no_actors, "train_steps": training_steps, "batch_size": batch_size, "learning_rate": learning_rate, "policy_update": policy_update, "discount_factor": discount_factor, "optimizer": self.optimizer, "policy_net": self.policy_net, "policy_config": self.policy_config, "device": self.device, "transition_queue_from_memory": transition_queue_from_memory, "update_priorities_queue_to_memory": update_priorities_queue_to_memory, "con_actors": con_learner_actor, "con_replay_memory": con_learner_memory, "eval_freq": eval_freq, "env": self.env, "env_config": self.env_config, "tb_log_dir": self.tb_log_dir, "update_tb": self.update_tb } learner_process = Process(target=self._init_process, args=(0, world_size, learner, learner_args)) learner_process.start() """ Memory Process """ mem_args = { "capacity": self.replay_mem_size, "alpha": alpha, "beta": beta, "batch_size": batch_size, "transition_queue_to_memory": transition_queue_to_memory, "transition_queue_from_memory": transition_queue_from_memory, "update_priorities_queue_to_memory": update_priorities_queue_to_memory, "con_learner": con_memory_learner, "replay_size_before_sampling": replay_size_before_sample if not (replay_size_before_sample is None) else min( batch_size, int(self.replay_memory * 0.25)), "tb_log_dir": self.tb_log_dir, "update_tb": self.update_tb } print("Memory Process") memory_process = Process(target=self._init_process, args=(1, world_size, experienceReplayBuffer, mem_args)) memory_process.start() """ Actor Processes """ actor_args = { "train_steps": training_steps, "max_actions_per_episode": max_actions_per_episode, "update_policy": policy_update, "size_local_memory_buffer": size_local_memory_buffer, "model": self.policy_net, "model_config": self.policy_config, "env": self.env, "env_config": self.env_config, "no_envs": no_envs, "device": self.device, "discount_factor": discount_factor, "transition_queue_to_memory": transition_queue_to_memory, "n_step": n_step } split = 0 for rank in range(no_actors): next_split = split + no_envs actor_args["epsilon"] = epsilons[split:next_split] 
actor_args["con_learner"] = con_actor_learner[rank] split = next_split actor_process = Process(target=self._init_process, args=(rank + 2, world_size, actor, actor_args)) actor_process.start() print("starting actor ", (rank + 2)) actor_processes.append(actor_process) for a in actor_processes: a.join() print(a, "joined") memory_process.join() learner_process.join()
def main(): print({section: dict(config[section]) for section in config.sections()}) train_method = default_config['TrainMethod'] env_id = default_config['EnvID'] env_type = default_config['EnvType'] if env_type == 'mario': env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT) elif env_type == 'atari': env = gym.make(env_id) else: raise NotImplementedError input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 if 'Breakout' in env_id: output_size -= 1 env.close() is_load_model = False is_render = False model_path = 'models/{}.model'.format(env_id) icm_path = 'models/{}.icm'.format(env_id) writer = SummaryWriter() use_cuda = default_config.getboolean('UseGPU') use_gae = default_config.getboolean('UseGAE') use_noisy_net = default_config.getboolean('UseNoisyNet') lam = float(default_config['Lambda']) num_worker = int(default_config['NumEnv']) num_step = int(default_config['NumStep']) ppo_eps = float(default_config['PPOEps']) epoch = int(default_config['Epoch']) mini_batch = int(default_config['MiniBatch']) batch_size = int(num_step * num_worker / mini_batch) learning_rate = float(default_config['LearningRate']) entropy_coef = float(default_config['Entropy']) gamma = float(default_config['Gamma']) eta = float(default_config['ETA']) clip_grad_norm = float(default_config['ClipGradNorm']) reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 4, 84, 84)) pre_obs_norm_step = int(default_config['ObsNormStep']) discounted_reward = RewardForwardFilter(gamma) agent = ICMAgent if default_config['EnvType'] == 'atari': env_type = AtariEnvironment elif default_config['EnvType'] == 'mario': env_type = MarioEnvironment else: raise NotImplementedError agent = agent(input_size, output_size, num_worker, num_step, gamma, lam=lam, learning_rate=learning_rate, ent_coef=entropy_coef, clip_grad_norm=clip_grad_norm, epoch=epoch, batch_size=batch_size, ppo_eps=ppo_eps, eta=eta, use_cuda=use_cuda, use_gae=use_gae, use_noisy_net=use_noisy_net) if is_load_model: if use_cuda: agent.model.load_state_dict(torch.load(model_path)) else: agent.model.load_state_dict( torch.load(model_path, map_location='cpu')) works = [] parent_conns = [] child_conns = [] for idx in range(num_worker): parent_conn, child_conn = Pipe() work = env_type(env_id, is_render, idx, child_conn) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([num_worker, 4, 84, 84]) sample_episode = 0 sample_rall = 0 sample_step = 0 sample_env_idx = 0 sample_i_rall = 0 global_update = 0 global_step = 0 # normalize obs print('Start to initailize observation normalization parameter.....') next_obs = [] steps = 0 while steps < pre_obs_norm_step: steps += num_worker actions = np.random.randint(0, output_size, size=(num_worker, )) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) for parent_conn in parent_conns: s, r, d, rd, lr = parent_conn.recv() next_obs.append(s[:]) next_obs = np.stack(next_obs) obs_rms.update(next_obs) print('End to initalize...') while True: total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_values, total_policy = \ [], [], [], [], [], [], [], [], [] global_step += (num_worker * num_step) global_update += 1 # Step 1. 
n-step rollout for _ in range(num_step): actions, value, policy = agent.get_action( (states - obs_rms.mean) / np.sqrt(obs_rms.var)) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], [] for parent_conn in parent_conns: s, r, d, rd, lr = parent_conn.recv() next_states.append(s) rewards.append(r) dones.append(d) real_dones.append(rd) log_rewards.append(lr) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) # total reward = int reward intrinsic_reward = agent.compute_intrinsic_reward( (states - obs_rms.mean) / np.sqrt(obs_rms.var), (next_states - obs_rms.mean) / np.sqrt(obs_rms.var), actions) sample_i_rall += intrinsic_reward[sample_env_idx] total_int_reward.append(intrinsic_reward) total_state.append(states) total_next_state.append(next_states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_values.append(value) total_policy.append(policy) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_idx] sample_step += 1 if real_dones[sample_env_idx]: sample_episode += 1 writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value, _ = agent.get_action( (states - obs_rms.mean) / np.sqrt(obs_rms.var)) total_values.append(value) # -------------------------------------------------- total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape( [-1, 4, 84, 84]) total_next_state = np.stack(total_next_state).transpose( [1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84]) total_action = np.stack(total_action).transpose().reshape([-1]) total_done = np.stack(total_done).transpose() total_values = np.stack(total_values).transpose() total_logging_policy = torch.stack(total_policy).view( -1, output_size).cpu().numpy() # Step 2. calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([ discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T ]) mean, std, count = np.mean(total_reward_per_env), np.std( total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std**2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update) # ------------------------------------------------------------------------------------------- # logging Max action probability writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode) # Step 3. make target and advantage target, adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_values, gamma, num_step, num_worker) adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8) # ----------------------------------------------- # Step 5. Training! 
agent.train_model( (total_state - obs_rms.mean) / np.sqrt(obs_rms.var), (total_next_state - obs_rms.mean) / np.sqrt(obs_rms.var), target, total_action, adv, total_policy) if global_step % (num_worker * num_step * 100) == 0: print('Now Global Step :{}'.format(global_step)) torch.save(agent.model.state_dict(), model_path) torch.save(agent.icm.state_dict(), icm_path)
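# `make_train_data` is imported from the training utilities and not shown here. The
# targets/advantages it produces follow the standard GAE recursion; the function below
# is an illustrative stand-in with a hypothetical name and signature, not the
# repository's exact helper (which also has a non-GAE branch).
import numpy as np

def gae_targets(rewards, dones, values, gamma, lam, num_step, num_worker):
    # rewards/dones: [num_worker, num_step]; values: [num_worker, num_step + 1]
    # (the bootstrap value for the last state is appended as the extra column)
    adv = np.zeros((num_worker, num_step), dtype=np.float32)
    gae = np.zeros(num_worker, dtype=np.float32)
    for t in reversed(range(num_step)):
        delta = rewards[:, t] + gamma * values[:, t + 1] * (1 - dones[:, t]) - values[:, t]
        gae = delta + gamma * lam * (1 - dones[:, t]) * gae
        adv[:, t] = gae
    target = adv + values[:, :-1]
    return target.reshape(-1), adv.reshape(-1)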
def __init__(self, args, model_constructor, env_constructor):
    self.args = args

    #MP TOOLS
    self.manager = Manager()

    #Algo
    self.algo = TD3(model_constructor, actor_lr=args.actor_lr, critic_lr=args.critic_lr,
                    gamma=args.gamma, tau=args.tau, polciy_noise=0.1,
                    policy_noise_clip=0.2, policy_ups_freq=2)

    #Save best policy
    self.best_policy = model_constructor.make_model('Gaussian_FF')
    self.best_policy.stochastic = False

    #Init BUFFER
    self.replay_buffer = Buffer(args.buffer_size)
    self.data_bucket = self.replay_buffer.tuples

    #Initialize Rollout Bucket
    self.rollout_bucket = self.manager.list()
    self.rollout_bucket.append(model_constructor.make_model('Gaussian_FF'))
    for actor in self.rollout_bucket:
        actor.stochastic = False
        actor.eval()

    ############## MULTIPROCESSING TOOLS ###################

    #Learner rollout workers
    self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
    self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
    self.workers = [Process(target=rollout_worker,
                            args=(id, 'pg', self.task_pipes[id][1], self.result_pipes[id][0],
                                  self.data_bucket, self.rollout_bucket, env_constructor))
                    for id in range(args.rollout_size)]
    for worker in self.workers: worker.start()
    self.roll_flag = [True for _ in range(args.rollout_size)]

    #Test bucket
    self.test_bucket = self.manager.list()
    self.test_bucket.append(model_constructor.make_model('Gaussian_FF'))
    for actor in self.test_bucket:
        actor.stochastic = False
        actor.eval()

    #Test workers
    self.test_task_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
    self.test_result_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
    self.test_workers = [Process(target=rollout_worker,
                                 args=(id, 'test', self.test_task_pipes[id][1], self.test_result_pipes[id][0],
                                       None, self.test_bucket, env_constructor))
                         for id in range(env_constructor.dummy_env.test_size)]
    for worker in self.test_workers: worker.start()
    self.test_flag = False

    #Trackers
    self.best_score = 0.0
    self.gen_frames = 0
    self.total_frames = 0
    self.test_score = None
    self.test_std = None
    self.test_trace = []
    self.ep_len = 0
    self.r1_reward = 0
    self.num_footsteps = 0
def main():
    total_steps = int(sys.argv[1])
    env_id = str(sys.argv[2])
    int_coef = float(sys.argv[3])
    print("steps: ", total_steps)
    print(env_id)
    print(int_coef)
    print({section: dict(config[section]) for section in config.sections()})

    train_method = default_config['TrainMethod']
    # env_id = default_config['EnvID']
    env_type = default_config['EnvType']

    if env_type == 'mario':
        env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
    elif env_type == 'atari':
        env = gym.make(env_id)
    else:
        raise NotImplementedError

    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2
    if 'Breakout' in env_id:
        output_size -= 1
    env.close()

    is_load_model = False
    is_render = False
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)

    writer = SummaryWriter()

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])
    num_step = int(default_config['NumStep'])
    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    int_gamma = float(default_config['IntGamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])
    ext_coef = float(default_config['ExtCoef'])
    # int_coef = float(default_config['IntCoef'])
    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    agent = RNDAgent

    if default_config['EnvType'] == 'atari':
        env_type = AtariEnvironment
    elif default_config['EnvType'] == 'mario':
        env_type = MarioEnvironment
    else:
        raise NotImplementedError

    agent = agent(input_size,
                  output_size,
                  num_worker,
                  num_step,
                  gamma,
                  lam=lam,
                  learning_rate=learning_rate,
                  ent_coef=entropy_coef,
                  clip_grad_norm=clip_grad_norm,
                  epoch=epoch,
                  batch_size=batch_size,
                  ppo_eps=ppo_eps,
                  use_cuda=use_cuda,
                  use_gae=use_gae,
                  use_noisy_net=use_noisy_net)

    if is_load_model:
        print('load model...')
        if use_cuda:
            agent.model.load_state_dict(torch.load(model_path))
            agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
            agent.rnd.target.load_state_dict(torch.load(target_path))
        else:
            agent.model.load_state_dict(torch.load(model_path, map_location='cpu'))
            agent.rnd.predictor.load_state_dict(torch.load(predictor_path, map_location='cpu'))
            agent.rnd.target.load_state_dict(torch.load(target_path, map_location='cpu'))
        print('load finished!')

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id, is_render, idx, child_conn,
                        sticky_action=sticky_action, p=action_prob, life_done=life_done)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize obs
    print('Start to initialize observation normalization parameter.....')
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.randint(0, output_size, size=(num_worker,))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    print('End to initialize...')

    for i in range(total_steps):
        total_state, total_reward, total_done, total_next_state, total_action, \
            total_int_reward, total_next_obs, total_ext_values, total_int_values, \
            total_policy, total_policy_np = [], [], [], [], [], [], [], [], [], [], []
        global_step += (num_worker * num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(num_step):
            actions, value_ext, value_int, policy = agent.get_action(np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)
            total_policy_np.append(policy.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]
            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_policy = np.vstack(total_policy_np)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([
            discounted_reward.update(reward_per_step)
            for reward_per_step in total_int_reward.T
        ])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging max action probability
        writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculation
        ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values,
                                              gamma, num_step, num_worker)
        # intrinsic reward calculation (non-episodic)
        int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward),
                                              total_int_values, int_gamma, num_step, num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        agent.train_model(np.float32(total_state) / 255.,
                          ext_target, int_target, total_action, total_adv,
                          ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                          total_policy)

        if global_step % (num_worker * num_step * 100) == 0:
            print('Now Global Step :{}'.format(global_step))
            torch.save(agent.model.state_dict(), model_path)
            torch.save(agent.rnd.predictor.state_dict(), predictor_path)
            torch.save(agent.rnd.target.state_dict(), target_path)
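Step 2 above divides intrinsic rewards by the standard deviation of a discounted running return that discounted_reward.update maintains per environment. The repo's own RewardForwardFilter is not shown here; this is a minimal sketch of a tracker consistent with that usage.

class RewardForwardFilterSketch:
    """Keeps an exponentially discounted sum of rewards per environment (illustrative stand-in)."""
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None  # running discounted return, one entry per environment

    def update(self, rews):
        # rews: array of shape (num_worker,) holding one rollout step of intrinsic rewards
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems

The returns it produces over the whole rollout are what feed reward_rms.update_from_moments, so np.sqrt(reward_rms.var) approximates the scale of the non-episodic intrinsic return rather than of a single step's reward.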
def meta_fit(self, meta_dataset_generator):
    catchable_sigs = set(signal.Signals) - {signal.SIGKILL, signal.SIGSTOP}
    for sig in catchable_sigs:
        signal.signal(sig, receive_signal)  # Substitute handler of choice for `print`
    LOGGER.debug('My PID: %s' % os.getpid())
    self.timer.begin('main training')
    mp.set_start_method('spawn', force=True)

    # >>> BUG: OS Error: Too many opened files
    # >>> SOLVED: by `ulimit -HSn 4096`
    # Now, we change all the queues to pipes
    self.timer.begin('build data pipeline')
    # every 10 epochs will produce one valid
    train_data_reservoir = [queue.Queue(32 * 10) for i in range(len(self.devices))]
    valid_data_reservoir = [queue.Queue(200) for i in range(len(self.devices))]
    meta_valid_reservoir = [queue.Queue(self.eval_tasks) for i in range(self.total_exp)]

    train_recv, valid_recv = [], []
    train_send, valid_send = [], []
    for i in range(len(self.devices)):
        recv, send = Pipe(True)
        # activate the first handshake
        recv.send(True)
        train_recv.append(recv)
        train_send.append(send)

        recv, send = Pipe(True)
        # activate the first handshake
        recv.send(True)
        valid_recv.append(recv)
        valid_send.append(send)

    def apply_device_to_hp(hp, device):
        hp['device'] = 'cuda:{}'.format(device)
        return hp

    self.timer.end('build data pipeline')
    self.timer.begin('build main proc pipeline')

    clsnum = get_base_class_number(meta_dataset_generator)
    LOGGER.info('base class number detected', clsnum)

    procs = [
        mp.Process(target=run_exp,
                   args=(self.modules[i].MyMetaLearner,
                         apply_device_to_hp(self.hp[i], dev),
                         train_recv[i], valid_recv[i], clsnum))
        for i, dev in enumerate(self.devices)
    ]
    for p in procs:
        p.daemon = True
        p.start()
    self.timer.end('build main proc pipeline')
    LOGGER.info('build data', self.timer.query_time_by_name('build data pipeline'),
                'build proc', self.timer.query_time_by_name('build main proc pipeline'))

    label_meta_valid = []
    data_generation = True

    self.timer.begin('prepare dataset')
    meta_train_dataset = meta_dataset_generator.meta_train_pipeline.batch(1)
    meta_train_generator = cycle(iter(meta_train_dataset))
    meta_valid_dataset = meta_dataset_generator.meta_valid_pipeline.batch(1)
    meta_valid_generator = cycle(iter(meta_valid_dataset))
    self.timer.end('prepare dataset')
    LOGGER.info('prepare dataset', self.timer.query_time_by_name('prepare dataset'))

    valid_ens_data_load_number = 0

    def generate_data():
        # manage data globally
        # nonlocal is needed so the counter updated here is visible to the main flow below
        nonlocal valid_ens_data_load_number
        while data_generation:
            for i in range(32 * 10):
                # load train
                if not data_generation:
                    break
                data_train = process_task_batch(next(meta_train_generator),
                                                device=torch.device('cpu'),
                                                with_origin_label=True)
                for dr in train_data_reservoir:
                    try:
                        dr.put_nowait(data_train)
                    except:
                        pass
                time.sleep(0.0001)

            for i in range(200):
                # load valid
                if not data_generation:
                    break
                data_valid = process_task_batch(next(meta_valid_generator),
                                                device=torch.device('cpu'),
                                                with_origin_label=False)
                for dr in valid_data_reservoir:
                    try:
                        dr.put_nowait(data_valid)
                    except:
                        pass
                if random.random() < 0.1:
                    for dr in meta_valid_reservoir:
                        try:
                            if dr.qsize() < self.eval_tasks:
                                valid_ens_data_load_number += 1
                                dr.put_nowait([data_valid[0][0], data_valid[0][1], data_valid[1][0]])
                                label_meta_valid.extend(data_valid[1][1].tolist())
                        except:
                            pass
                time.sleep(0.0001)

    def put_data_train_passive(i):
        while data_generation:
            try:
                if train_send[i].recv():
                    supp, quer = train_data_reservoir[i].get()
                    data = self.modules[i].process_data(supp, quer, True, self.hp[i])
                    train_send[i].send(data)
                else:
                    return
            except:
                pass

    def put_data_valid_passive(i):
        while data_generation:
            try:
                if valid_send[i].recv():
                    supp, quer = valid_data_reservoir[i].get()
                    data = self.modules[i].process_data(supp, quer, False, self.hp[i])
                    valid_send[i].send(data)
                else:
                    return
            except:
                pass

    thread_pool = [threading.Thread(target=generate_data)] + \
                  [threading.Thread(target=put_data_train_passive, args=(i,)) for i in range(self.total_exp)] + \
                  [threading.Thread(target=put_data_valid_passive, args=(i,)) for i in range(self.total_exp)]
    for th in thread_pool:
        th.daemon = True
        th.start()

    try:
        # we leave about 20 min for decoding of test
        for p in procs:
            p.join(max(self.timer.time_left() - 60 * 10, 0.1))

        self.timer.begin('clear env')
        # terminate procs that are out of time
        LOGGER.info('Main meta-train is done',
                    '' if self.timer.time_left() > 60 else 'time out exit')
        LOGGER.info('time left', self.timer.time_left(), 's')
        for p in procs:
            if p.is_alive():
                p.terminate()

        data_generation = False
        # in case there is blocking
        for q in train_data_reservoir + valid_data_reservoir:
            if q.empty():
                q.put(False)
        for s in train_recv + valid_recv:
            s.send(False)
        for s in train_send + train_recv + valid_send + valid_recv:
            s.close()
        for p in thread_pool:
            p.join()
        self.timer.end('clear env')
        LOGGER.info('clear env', self.timer.query_time_by_name('clear env'))
        self.timer.end('main training')
    except Exception:
        LOGGER.info('error occurred in main process')
        traceback.print_exc()

    LOGGER.info('spawn total {} meta valid tasks. main training time {}'.format(
        valid_ens_data_load_number, self.timer.query_time_by_name('main training')))

    self.timer.begin('load learner')
    self.meta_learners = [None] * self.total_exp

    def load_model(args):
        module, hp, i = args
        self.meta_learners[i] = module.load_model(hp)

    pool = [
        threading.Thread(target=load_model, args=((self.modules[i], self.hp[i], i),))
        for i in range(self.total_exp)
    ]
    for p in pool:
        p.daemon = True
        p.start()
    for p in pool:
        p.join()
    self.timer.end('load learner')
    LOGGER.info('load learner done, time spent', self.timer.query_time_by_name('load learner'))

    if not isinstance(self.ensemble, int):
        # instead of just a weighted sum, we plan to use stacking
        procs = []
        reses = [None] * len(self.meta_learners)
        self.timer.begin('validation')

        recv_list, sent_list = [], []
        for i in range(self.total_exp):
            r, s = Pipe(True)
            r.send(True)
            recv_list.append(r)
            sent_list.append(s)

        pool = mp.Pool(self.total_exp)
        procs = pool.starmap_async(
            predict,
            [(self.meta_learners[i], recv_list[i], self.eval_tasks, self.hp[i]['device'],
              {'time_fired': time.time(), 'taskid': i}) for i in range(self.total_exp)])

        # start sub-threads to pass data
        def pass_meta_data(i):
            for _ in range(self.eval_tasks):
                if sent_list[i].recv():
                    # LOGGER.info(i, 'fire data signal get')
                    sent_list[i].send(meta_valid_reservoir[i].get())
                    # LOGGER.info(i, 'data is sent')

        threads = [
            threading.Thread(target=pass_meta_data, args=(i,))
            for i in range(self.total_exp)
        ]
        for t in threads:
            t.daemon = True
            t.start()

        for _ in range(self.eval_tasks - valid_ens_data_load_number):
            data_valid = next(meta_valid_generator)
            data_valid = process_task_batch(data_valid,
                                            device=torch.device('cpu'),
                                            with_origin_label=False)
            label_meta_valid.extend(data_valid[1][1].tolist())
            for dr in meta_valid_reservoir:
                dr.put([data_valid[0][0], data_valid[0][1], data_valid[1][0]])
            # LOGGER.info('put data!')
        # LOGGER.info('all data done!')

        # now we can receive data
        for t in threads:
            t.join()
        reses = [sent_list[i].recv()['res'] for i in range(self.total_exp)]
        # every res in reses is a np.array of shape (eval_task * WAY * QUERY) * WAY

        ENS_VALID_TASK = 50
        ENS_VALID_ELEMENT = ENS_VALID_TASK * 5 * 19
        reses_test_list = [deepcopy(res[-ENS_VALID_ELEMENT:]) for res in reses]
        self.timer.end('validation')
        LOGGER.info('valid data predict done', self.timer.query_time_by_name('validation'))

        weight = [1.] * len(self.meta_learners)
        labels = np.array(label_meta_valid, dtype=np.int64)  # 19000
        acc_o = ((np.array(weight)[:, None, None] / sum(weight) *
                  np.array(reses)).sum(axis=0).argmax(axis=1) == labels).mean()

        reses = np.array(reses, dtype=np.float64).transpose((1, 0, 2))
        reses_test = reses[-ENS_VALID_ELEMENT:].reshape(ENS_VALID_ELEMENT, -1)
        reses = reses[:-ENS_VALID_ELEMENT]
        reses = reses.reshape(len(reses), -1)
        labels_test = labels[-ENS_VALID_ELEMENT:]
        labels = labels[:-ENS_VALID_ELEMENT]
        LOGGER.info('voting result', acc_o)

        self.timer.begin('ensemble')
        # mp.set_start_method('fork', True)
        result = pool.map(ensemble_on_data, [
            (GBMEnsembler(), reses, labels, 'gbm'),
            (GLMEnsembler(), reses, labels, 'glm'),
            (NBEnsembler(), reses, labels, 'nb'),
            (RFEnsembler(), reses, labels, 'rf'),  # tends to over-fit on simple datasets
        ])

        # test the ensemble models
        def acc(logit, label):
            return (logit.argmax(axis=1) == label).mean()

        res_test = [x[0]._predict(reses_test) for x in result]
        acc_test = [acc(r, labels_test) for r in res_test]
        acc_single_test = [acc(np.array(r), labels_test) for r in reses_test_list]
        LOGGER.info('ensemble test', 'gbm', 'glm', 'nb', 'rf', acc_test)
        LOGGER.info('single test', acc_single_test)

        if max(acc_test) > max(acc_single_test):
            LOGGER.info("will use ensemble model")
            # idx_acc_max = np.argmax([x[1] for x in result])
            idx_acc_max = np.argmax(acc_test)
            self.timer.end('ensemble')
            print('best ensembler', ['gbm', 'glm', 'nb', 'rf'][idx_acc_max],
                  'acc', acc_test[idx_acc_max])
            print('ensemble done, time cost', self.timer.query_time_by_name('ensemble'))
            # currently we use the mean of outputs as the ensemble
            return MyLearner(self.meta_learners, result[idx_acc_max][0], timers=self.timer)
        else:
            LOGGER.info("will use single model")
            idx_acc_max = np.argmax(acc_single_test)
            self.timer.end('ensemble')
            print('best single model id', idx_acc_max)
            print('ensemble done, time cost', self.timer.query_time_by_name('ensemble'))
            # return only the best meta learner
            return MyLearner([self.meta_learners[idx_acc_max]], 0, self.timer)

    return MyLearner([self.meta_learners[self.ensemble]], 0, timers=self.timer)
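Each feeder thread above follows the same duplex-Pipe handshake: it blocks on recv() until the learner side requests work by sending True, then answers with a processed batch, and stops serving when it receives False. The learner side of that protocol lives in run_exp / predict, which are not shown; the following is a minimal sketch of what that side is assumed to look like (train_on is a placeholder, not the repo's API).

def learner_side_sketch(pipe, steps):
    """Consumer half of the True/data handshake used by put_data_*_passive."""
    for _ in range(steps):
        pipe.send(True)       # request the next task batch from the feeder thread
        task = pipe.recv()    # blocks until the feeder answers
        if task is False:     # the main process is tearing the pipeline down
            break
        train_on(task)        # hypothetical per-task training step
    pipe.send(False)          # tell the feeder thread to stop serving this learner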
def main_read():
    log_start(level=logging.INFO)
    read_conn, write_conn = Pipe()
    daemon_process_run(read_conn=read_conn, write_conn=write_conn, testcase=1)  # 0 for gaussian, 1 for PCIE
    canvas.show()
    app.run()


if __name__ == '__main__':
    import sys
    if sys.flags.interactive != 1:
        # main_read()
        log_start(level=logging.INFO)
        # multiprocessing.freeze_support()
        read_conn, write_conn = Pipe()
        daemon_process_run(read_conn=read_conn, write_conn=write_conn, lock=lock, testcase=0)  # 0 for gaussian, 1 for PCIE
        wview.show()
        app.run()

# To see the saved file: #####################
# from Binload import Binload
# bf = Binload()
# bf.load('PCIE.bin', file_format='float32')
# bf.plot(n=(0, 10000), chNo=4)
#############################################
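The launcher above hands one end of a Pipe to a daemon acquisition process and keeps the other end for the display loop. daemon_process_run itself is not shown; this self-contained sketch illustrates the assumed producer/consumer pattern with stand-in data (the producer, sentinel, and read_loop names are introduced here for illustration only).

import numpy as np
from multiprocessing import Pipe, Process

def _producer(conn, n_blocks=10, block_size=1024):
    # push a few blocks of fake samples, then a None sentinel to signal end of stream
    for _ in range(n_blocks):
        conn.send(np.random.randn(block_size).astype('float32'))
    conn.send(None)

def read_loop(read_conn):
    # drain the pipe until the sentinel arrives
    while True:
        block = read_conn.recv()
        if block is None:
            break
        print('got block with mean', float(block.mean()))

if __name__ == '__main__':
    read_conn, write_conn = Pipe()
    p = Process(target=_producer, args=(write_conn,), daemon=True)
    p.start()
    read_loop(read_conn)
    p.join()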