def __init__(self, args, data, opts):
    """
    Three tasks need to be completed here:
    1. handle the initial task and how it is set up,
    2. build the lookup table and run the sampling process,
    3. from the sampling results, derive the model size and select the parameters to use.
    :param args:
    :param data:
    :param opts:
    """
    self.args = args
    self.data = data
    self.opts = opts
    self.controller = Controller(args=self.args, task_num=self.opts.num_task)
    self.controller_optim = Adam(self.controller.parameters(), lr=args.controller_lr)

    cuda_condition = torch.cuda.is_available() and args.with_cuda
    self.device = torch.device("cuda" if cuda_condition else "cpu")
    self.controller = self.controller.to(self.device)

    self.tasks_config = []
    self.task_acc = []
    self.model_dict = []
    self.task_scope = 1     # => reuse
    self.general_scope = 1  # => new
    if self.args.adapt:
        self.task_scope += 1
    if self.args.fuse:
        self.general_scope += 1

    self.tensorboard_writer = SummaryWriter()
    self.iter = 0
def get_data(self, request, id):
    response = api.get_controller(request, id)
    data = json.loads(response.text)
    controller = Controller(data["id"], data["controller_name"],
                            data["class_name"], data["enabled"])
    return controller
def __init__(self, mdir, device, time_limit, explorer=False):
    """ Build vae, rnn, controller and environment. """
    self.explorer = explorer

    # Load controllers
    vae_file, rnn_file, ctrl_file = \
        [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]
    if self.explorer:
        ctrl_file = join(mdir, 'exp', 'best.tar')

    assert exists(vae_file) and exists(rnn_file), \
        "Either vae or mdrnn is untrained."

    vae_state, rnn_state = [
        torch.load(fname, map_location={'cuda:0': str(device)})
        for fname in (vae_file, rnn_file)
    ]

    for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
        print("Loading {} at epoch {} "
              "with test loss {}".format(m, s['epoch'], s['precision']))

    self.vae = VAE(3, LSIZE).to(device)
    self.vae.load_state_dict(vae_state['state_dict'])

    # MDRNNCell
    self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
    self.mdrnn.load_state_dict(
        {k.strip('_l0'): v for k, v in rnn_state['state_dict'].items()})

    self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

    # load controller if it was previously saved
    if exists(ctrl_file):
        ctrl_state = torch.load(ctrl_file,
                                map_location={'cuda:0': str(device)})
        print("Loading Controller with reward {}".format(
            ctrl_state['reward']))
        self.controller.load_state_dict(ctrl_state['state_dict'])

    self.env = gym.make('CarRacing-v0')
    self.device = device
    self.time_limit = time_limit

    self.mdrnn_notcell = MDRNN(LSIZE, ASIZE, RSIZE, 5)
    self.mdrnn_notcell.to(device)
    self.mdrnn_notcell.load_state_dict(rnn_state['state_dict'])
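# For context, the pieces loaded above are usually wired together inside the
# generator's rollout loop: the VAE encodes the current frame into a latent,
# the controller maps (latent, hidden) to an action, and the MDRNN cell
# advances its hidden state. The sketch below shows one such step. It is an
# illustration only: the `transform` preprocessing helper and the exact return
# signatures of VAE / Controller / MDRNNCell are assumptions inferred from how
# they are used elsewhere in these snippets, not this repository's own code.
def get_action_and_transition_sketch(self, obs, hidden):
    """One rollout step: encode obs, pick an action, advance the MDRNN cell."""
    obs = transform(obs).unsqueeze(0).to(self.device)  # assumed frame preprocessing
    _, latent_mu, _ = self.vae(obs)                    # use the latent mean
    action = self.controller(latent_mu, hidden[0])     # (latent, h) -> action
    _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
    return action.squeeze().cpu().numpy(), next_hidden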
def __init__(self, device, time_limit, discrete_VAE):
    """ Build vae, rnn, controller and environment. """
    self.env = gym.make('CarRacing-v0')
    self.device = device
    self.time_limit = time_limit
    self.discrete_VAE = discrete_VAE

    # Because the representation is discrete, we increase the size of the latent
    # vector. Use a local name instead of assigning to the module-level LSIZE
    # constant, which would make it a local variable and raise UnboundLocalError
    # when discrete_VAE is False.
    latent_size = LSIZE
    if self.discrete_VAE:
        latent_size = 128

    self.vae = VAE(3, latent_size, 1024)
    self.mdrnn = MDRNNCell(latent_size, ASIZE, RSIZE, 5)
    self.controller = Controller(latent_size, RSIZE, ASIZE)
    for s_id in range(rollouts):
        p_queue.put((s_id, best_guess))

    print("Evaluating...")
    for _ in tqdm(range(rollouts)):
        while r_queue.empty():
            sleep(.1)
        restimates.append(r_queue.get()[1])

    return best_guess, np.mean(restimates), np.std(restimates)


################################################################################
#                           Launch CMA                                         #
################################################################################
controller = Controller(LSIZE, RSIZE, ASIZE)  # dummy instance

# define current best and load parameters
cur_best = None
ctrl_file = join(ctrl_dir, 'best.tar')
print("Attempting to load previous best...")
if exists(ctrl_file):
    state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
    cur_best = -state['reward']
    controller.load_state_dict(state['state_dict'])
    print("Previous best was {}...".format(-cur_best))

parameters = controller.parameters()
es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                              {'popsize': pop_size})
def train_explorer(logdir, epochs=10, n_samples=4, pop_size=4,
                   display=True, max_workers=10):
    results = {}
    results['best'] = []

    # multiprocessing variables
    num_workers = min(max_workers, n_samples * pop_size)
    time_limit = 1000

    # create tmp dir if non existent and clean it if existent
    tmp_dir = join(logdir, 'tmp_exp')
    if not exists(tmp_dir):
        mkdir(tmp_dir)
    else:
        for fname in listdir(tmp_dir):
            unlink(join(tmp_dir, fname))

    # create exp dir if non existent
    explore_dir = join(logdir, 'explore')
    if not exists(explore_dir):
        mkdir(explore_dir)

    ################################################################################
    #                           Thread routines                                    #
    ################################################################################
    def slave_routine(p_queue, r_queue, e_queue, p_index):
        """ Thread routine.

        Threads interact with p_queue, the parameters queue, r_queue, the result
        queue and e_queue, the end queue. They pull parameters from p_queue,
        execute the corresponding rollout, then place the result in r_queue.

        Each parameter has its own unique id. Parameters are pulled as tuples
        (s_id, params) and results are pushed as (s_id, result). The same
        parameter can appear multiple times in p_queue, displaying the same id
        each time.

        As soon as e_queue is non empty, the thread terminates.

        When multiple gpus are involved, the assigned gpu is determined by the
        process index p_index (gpu = p_index % n_gpus).

        :args p_queue: queue containing couples (s_id, parameters) to evaluate
        :args r_queue: where to place results (s_id, results)
        :args e_queue: as soon as not empty, terminate
        :args p_index: the process index
        """
        # init routine
        gpu = p_index % torch.cuda.device_count()
        device = torch.device(
            'cuda:{}'.format(gpu) if torch.cuda.is_available() else 'cpu')

        # redirect streams
        sys.stdout = open(join(tmp_dir, str(getpid()) + '.out'), 'a')
        sys.stderr = open(join(tmp_dir, str(getpid()) + '.err'), 'a')

        with torch.no_grad():
            r_gen = RolloutGenerator(logdir, device, time_limit)

            while e_queue.empty():
                if p_queue.empty():
                    sleep(.1)
                else:
                    s_id, params = p_queue.get()
                    r_queue.put((s_id, r_gen.rollout(params)))

    ################################################################################
    #                Define queues and start workers                               #
    ################################################################################
    p_queue = Queue()
    r_queue = Queue()
    e_queue = Queue()

    for p_index in range(num_workers):
        Process(target=slave_routine,
                args=(p_queue, r_queue, e_queue, p_index)).start()

    ################################################################################
    #                           Evaluation                                         #
    ################################################################################
    def evaluate(solutions, results, rollouts=100):
        """ Give current controller evaluation.

        Evaluation is minus the cumulated reward averaged over rollout runs.
        :args solutions: CMA set of solutions
        :args results: corresponding results
        :args rollouts: number of rollouts

        :returns: minus averaged cumulated reward
        """
        index_min = np.argmin(results)
        best_guess = solutions[index_min]
        restimates = []

        for s_id in range(rollouts):
            p_queue.put((s_id, best_guess))

        print("Evaluating...")
        for _ in tqdm(range(rollouts)):
            while r_queue.empty():
                sleep(.1)
            restimates.append(r_queue.get()[1])

        return best_guess, np.mean(restimates), np.std(restimates)

    ################################################################################
    #                           Launch CMA                                         #
    ################################################################################
    controller = Controller(LSIZE, RSIZE, ASIZE)  # dummy instance

    # define current best and load parameters
    cur_best = None
    ctrl_file = join(explore_dir, 'best.tar')
    print("Attempting to load previous best...")
    if exists(ctrl_file):
        state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
        cur_best = -state['reward']
        controller.load_state_dict(state['state_dict'])
        print("Previous best was {}...".format(-cur_best))

    parameters = controller.parameters()
    es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                                  {'popsize': pop_size})

    epoch = 0
    log_step = 3
    while not es.stop():
        if cur_best is not None and -cur_best > target_return:
            print("Already better than target, breaking...")
            break

        r_list = [0] * pop_size  # result list
        solutions = es.ask()

        # push parameters to queue
        for s_id, s in enumerate(solutions):
            for _ in range(n_samples):
                p_queue.put((s_id, s))

        # retrieve results
        if display:
            pbar = tqdm(total=pop_size * n_samples)
        for _ in range(pop_size * n_samples):
            while r_queue.empty():
                sleep(.1)
            r_s_id, r = r_queue.get()
            r_list[r_s_id] += r / n_samples
            if display:
                pbar.update(1)
        if display:
            pbar.close()

        es.tell(solutions, r_list)
        es.disp()

        # evaluation and saving
        if epoch % log_step == log_step - 1:
            best_params, best, std_best = evaluate(solutions, r_list)

            # log the best
            results['best'].append(best)

            print("Current evaluation: {}".format(best))
            if not cur_best or cur_best > best:
                cur_best = best
                print("Saving new best with value {}+-{}...".format(
                    -cur_best, std_best))
                load_parameters(best_params, controller)
                torch.save(
                    {
                        'epoch': epoch,
                        'reward': -cur_best,
                        'state_dict': controller.state_dict()
                    },
                    join(explore_dir, 'best.tar'))
            if -best > target_return:
                print("Terminating controller training with value {}...".format(
                    best))
                break

        epoch += 1

    es.result_pretty()
    e_queue.put('EOP')

    return results
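# The CMA-ES launch above relies on two helpers, flatten_parameters and
# load_parameters, to convert between a module's parameters and the flat numpy
# vector that cma operates on. A minimal sketch of such helpers is given below;
# the repository's own implementations may differ in detail.
import numpy as np
import torch


def flatten_parameters(params):
    """Concatenate an iterable of parameter tensors into one flat numpy vector."""
    return np.concatenate([p.detach().cpu().numpy().ravel() for p in params])


def load_parameters(flat_params, model):
    """Copy a flat parameter vector (e.g. a CMA-ES solution) back into `model`."""
    flat_params = torch.as_tensor(np.asarray(flat_params), dtype=torch.float32)
    idx = 0
    for p in model.parameters():
        numel = p.numel()
        with torch.no_grad():
            p.copy_(flat_params[idx:idx + numel].view_as(p))
        idx += numel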
def __init__(self, mdir, device, time_limit, number_goals, Forward_model,
             hiddengoals: bool, curiosityreward: bool, static: bool):
    """ Build vae, rnn, controller and environment. """
    # Loading world model and vae
    vae_file, rnn_file, ctrl_file, Dtild_file, hiddenvae_file = [
        join(mdir, m, 'best.tar')
        for m in ['vae', 'mdrnn', 'ctrl', 'dtild', 'hiddenvae']
    ]

    assert exists(vae_file) and exists(rnn_file), \
        "Either vae or mdrnn is untrained."

    vae_state, rnn_state, hiddenvae_state = [
        torch.load(fname, map_location={'cuda:0': str(device)})
        for fname in (vae_file, rnn_file, hiddenvae_file)
    ]

    for m, s in (('VAE', vae_state), ('MDRNN', rnn_state),
                 ('HiddenVAE', hiddenvae_state)):
        print("Loading {} at epoch {} "
              "with test loss {}".format(m, s['epoch'], s['precision']))

    self.vae = VAE(3, LSIZE).to(device)
    self.vae.load_state_dict(vae_state['state_dict'])

    self.HiddenVAE = HiddenVAE(256, LSIZE).to(device)
    self.HiddenVAE.load_state_dict(hiddenvae_state['state_dict'])

    self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
    self.mdrnn.load_state_dict(
        {k.strip('_l0'): v for k, v in rnn_state['state_dict'].items()})

    self.mdrnnBIG = MDRNN(LSIZE, ASIZE, RSIZE, 5).to(device)
    self.mdrnnBIG.load_state_dict(rnn_state["state_dict"])

    self.controller = Controller(256, 256, 6).to(device)

    self.env = gym.make('MiniGrid-MultiRoom-N6-v0')
    self.device = device
    self.number_goals = number_goals
    self.time_limit = time_limit
    self.vae_state = vae_state
    self.rnn_state = rnn_state
    self.hiddenvae_state = hiddenvae_state
    self.hiddengoals = hiddengoals
    self.curiosityreward = curiosityreward
    self.static = static
    self.Forward_model = Forward_model
    self.fmodel = Dtild(32, 256, 1, 32).to(device)
    for s_id in range(rollouts):
        p_queue.put((s_id, best_guess))

    print("Evaluating...")
    for _ in tqdm(range(rollouts)):
        while r_queue.empty():
            sleep(.1)
        restimates.append(r_queue.get()[1])

    return best_guess, np.mean(restimates), np.std(restimates)


################################################################################
#                           Launch CMA                                         #
################################################################################
controller = Controller(LSIZE, RSIZE, ASIZE, is_gate=args.is_gate)  # dummy instance

# define current best and load parameters
cur_best = None
ctrl_file = join(ctrl_dir, 'best.tar')
print("Attempting to load previous best...")
if exists(ctrl_file):
    state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
    cur_best = -state['reward']
    # changed so that previous models can still be loaded even if the controller
    # source code has changed (e.g. new parameters were added)
    load_model_safe_(controller, state['state_dict'])
    try:
        print(controller.gates, controller.is_gate)
    except AttributeError:
        pass  # older controllers may not define gates / is_gate
def train_C_given_M(mdrnnCell, latent_dim, hidden_dim, action_dim):
    # Parameters
    num_episode = 1
    batch_size = 1
    learning_rate = 0.01
    gamma = 0.99
    done_threshold = np.log(0.5)

    interim_policy = Controller(latent_dim, hidden_dim, action_dim)
    optimizer = torch.optim.RMSprop(interim_policy.parameters(), lr=learning_rate)

    # Batch history
    state_pool = []
    action_pool = []
    reward_pool = []
    steps = 0

    for e in range(num_episode):
        # initial latent and hidden states
        z_t = torch.randn(1, LSIZE)
        h_t = 2 * [torch.zeros(1, RSIZE)]

        for t in range(1000):
            # pick action using policy net given z_t, h_t
            mean_a_t = interim_policy(z_t, h_t[0])
            action_policy_std = 0.1
            cov = action_policy_std * torch.eye(action_dim)
            stochastic_policy = MultivariateNormal(loc=mean_a_t,
                                                   covariance_matrix=cov)
            a_t = stochastic_policy.sample()

            mu, sigma, pi, r, d, n_h = mdrnnCell(a_t, z_t, h_t)

            # sample next z_t from N(mu, sigma)
            pi = pi.squeeze()
            mixt = Categorical(torch.exp(pi)).sample().item()
            z_t = mu[:, mixt, :]  # + sigma[:, mixt, :] * torch.randn_like(mu[:, mixt, :])
            h_t = n_h

            reward = -0.1
            if d >= done_threshold:
                done = True
            else:
                done = False

            state_pool.append((z_t, h_t))
            action_pool.append(a_t)
            reward_pool.append(reward)
            steps += 1

            if done:
                break

        # Update policy
        if e > 0 and e % batch_size == 0:
            # Discount rewards
            running_add = 0
            for i in reversed(range(steps)):
                if reward_pool[i] == 0:
                    running_add = 0
                else:
                    running_add = running_add * gamma + reward_pool[i]
                reward_pool[i] = running_add

            # Normalize rewards
            reward_mean = np.mean(reward_pool)
            reward_std = np.std(reward_pool)
            for i in range(steps):
                reward_pool[i] = (reward_pool[i] - reward_mean) / reward_std

            # Gradient descent
            optimizer.zero_grad()
            for i in range(steps):
                z_t, h_t = state_pool[i]
                action = action_pool[i]
                reward = reward_pool[i]

                mean_a_t = interim_policy(z_t, h_t[0])
                action_policy_std = 0.1
                cov = action_policy_std * torch.eye(action_dim)
                stochastic_policy = MultivariateNormal(loc=mean_a_t,
                                                       covariance_matrix=cov)
                # negative score function x reward (REINFORCE)
                loss = -stochastic_policy.log_prob(action) * reward
                # retain_graph is needed because the stored z_t / h_t tensors
                # share parts of the rollout graph across the per-step backward passes
                loss.backward(retain_graph=True)

            optimizer.step()

            state_pool = []
            action_pool = []
            reward_pool = []
            steps = 0

    return interim_policy
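# Hedged usage sketch for train_C_given_M: the constants (LSIZE=32, ASIZE=3,
# RSIZE=256, matching the CarRacing setup used elsewhere in these snippets)
# and the checkpoint path are illustrative assumptions, not taken from the
# original code.
LSIZE, ASIZE, RSIZE = 32, 3, 256

mdrnn_cell = MDRNNCell(LSIZE, ASIZE, RSIZE, 5)
rnn_state = torch.load('logs/mdrnn/best.tar', map_location='cpu')
mdrnn_cell.load_state_dict(
    {k.strip('_l0'): v for k, v in rnn_state['state_dict'].items()})

policy = train_C_given_M(mdrnn_cell, LSIZE, RSIZE, ASIZE)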
def __init__(self, mdir, device, time_limit, iteration_num=None, video_dir=None):
    """ Build vae, rnn, controller and environment. """
    # Loading world model and vae
    vae_file, rnn_file, ctrl_file = [
        join(mdir, m, "best.tar") for m in ["vae", "mdrnn", "ctrl"]
    ]
    if iteration_num is not None:
        vae_file, rnn_file, ctrl_file = [
            join(mdir, m, "iter_{}".format(iteration_num), "best.tar")
            for m in ["vae", "mdrnn", "ctrl"]
        ]

    assert exists(vae_file) and exists(rnn_file), \
        "Either vae or mdrnn is untrained."

    print("\nRollout Generator")
    vae_state, rnn_state = [
        torch.load(fname, map_location={"cuda:0": str(device)})
        for fname in (vae_file, rnn_file)
    ]
    print("Loading VAE from {}".format(vae_file))
    print("Loading RNN from {}".format(rnn_file))

    for m, s in (("VAE", vae_state), ("MDRNN", rnn_state)):
        print("Loading {} at epoch {} "
              "with test loss {}".format(m, s["epoch"], s["precision"]))

    self.vae = VAE(3, LSIZE).to(device)
    self.vae.load_state_dict(vae_state["state_dict"])

    self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
    self.mdrnn.load_state_dict(
        {k.strip("_l0"): v for k, v in rnn_state["state_dict"].items()})

    self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

    # load controller if it was previously saved
    if exists(ctrl_file):
        print("Loading Controller from {}".format(ctrl_file))
        ctrl_state = torch.load(ctrl_file,
                                map_location={"cuda:0": str(device)})
        print("Loading Controller with reward {}".format(
            ctrl_state["reward"]))
        self.controller.load_state_dict(ctrl_state["state_dict"])

    self.env = gym.make("BipedalWalkerHardcore-v2")
    self.device = device
    self.time_limit = time_limit
        elif arg == 'fuzzy':
            controller_fuzzy = True
            mono = True
            simulations /= 2
        else:
            raise GetoptError()
except GetoptError:
    usage()
    sys.exit(-1)

# simulations = 5 if len(sys.argv) == 1 else int(sys.argv[1])

for i in range(int(simulations * 2)):
    # create a controller
    if mono:
        control = FuzzyLogicController(log=log) if controller_fuzzy \
            else Controller(log=log)
    else:
        control = Controller(log=log) if i % 2 == 0 \
            else FuzzyLogicController(log=log)

    # create North-to-South and West-to-East lanes
    north2south = Lane(control, S=15, D=7, name='North to South',
                       init_state=State.green)
    west2east = Lane(control, S=15, D=7, name='West to East',
                     init_state=State.red)
M = buildMemory('weights/2019.12.07/mdn_rnn_weights')
get_hidden = K.function(M.layers[0].input, M.layers[0].output)

# In[ ]:

print(M.summary())

# In[ ]:

controller = Controller(32 + 256, 3)
controller.set_weights(np.load('./weights/C_weights.npy'))

# $$\text{Controller}: \mathbb R^{288} \rightarrow \mathbb R^3 $$

# In[ ]:

print('controller shape:')

# In[ ]:

print(controller.shape)
# Fix numeric divergence due to bug in Cudnn
torch.backends.cudnn.benchmark = True

device = torch.device("cuda" if cuda else "cpu")

trained = 0

# model = VAE(3, LSIZE).to(device)
vae_model = VAE(3, LSIZE)
vae_model = torch.nn.DataParallel(vae_model, device_ids=[7])
vae_model.cuda(7)
vae_model.eval()

mdrnn_model = MDRNNCell(LSIZE, ASIZE, RSIZE, 5)
mdrnn_model = torch.nn.DataParallel(mdrnn_model, device_ids=[7])
mdrnn_model.cuda(7)
mdrnn_model.eval()

controller = torch.nn.DataParallel(Controller(LSIZE, RSIZE, ASIZE)).cuda()

vis = visdom.Visdom(env='dream')
image_window = vis.image(
    np.random.rand(RED_SIZE * 10, RED_SIZE * 10),
    opts=dict(title='dream!', caption='dream.'),
)

# check vae dir exists, if not, create it
dream_dir = join(args.logdir, 'dream')
vae_dir = join(args.logdir, 'vae')

reload_file = join(vae_dir, 'best.tar')
state = torch.load(reload_file)
print("Reloading model at epoch {}"
      ", with test error {}".format(state['epoch'], state['precision']))
vae_model.load_state_dict(state['state_dict'])
def ctrl_exp_gen_data(rollouts, datadir, logdir, noise_type, device,
                      use_ctrl_exp, exp_prob=.5, randomness_factor=.1):
    """ Generate rollouts using a mix of controller and explorer policies.

    `randomness_factor` is the multiple of the current parameter standard
    deviation used as the std of the Gaussian parameter noise. This is more
    reasonable than a fixed std; ideally the noise scale would be updated
    based on parameter distances across updates (see the OpenAI
    parameter-space noise work).

    Uses fixed parameters for the vae and mdrnn. All the `if use_ctrl_exp:`
    branches should eventually live inside a module, or at least be made
    consistent with the explorer code path.
    """
    assert exists(logdir), "The directory does not exist..."
    exp_prob = float(exp_prob)

    env = gym.make("CarRacing-v0")
    seq_len = 1000

    if use_ctrl_exp:
        a_rollout = []

        # Load controller and explorer
        ctrl_file = join(logdir, 'ctrl', 'best.tar')
        exp_file = join(logdir, 'exp', 'best.tar')

        controller = Controller(LSIZE, RSIZE, ASIZE).to(device)
        explorer = Controller(LSIZE, RSIZE, ASIZE).to(device)

        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file,
                                    map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            controller.load_state_dict(ctrl_state['state_dict'])

        if exists(exp_file):
            exp_state = torch.load(exp_file,
                                   map_location={'cuda:0': str(device)})
            print("Loading Explorer with reward {}".format(
                exp_state['reward']))
            explorer.load_state_dict(exp_state['state_dict'])

        # Make the generators (this is unnecessary; should be organized some other way)
        ctrl_gen = RolloutGeneratorSingle(logdir, device, controller)
        exp_gen = RolloutGeneratorSingle(logdir, device, explorer)

        # parameter-noise exploration: perturb every parameter of a copy of the
        # model with Gaussian noise whose std is a fraction of the current
        # parameter std (requires `import copy` at module level)
        def update_params_noise(model, randomness_factor):
            all_params = np.concatenate(
                [p.detach().cpu().numpy().ravel() for p in model.parameters()])
            print('Parameter mean: ', np.mean(all_params))
            print('Parameter std: ', np.std(all_params))
            std = np.std(all_params) * randomness_factor

            model_new = copy.deepcopy(model)
            with torch.no_grad():
                for param in model_new.parameters():
                    param.add_(torch.randn_like(param) * std)
            return model_new

    for i in range(rollouts):
        obs = env.reset()
        env.env.viewer.window.dispatch_events()

        s_rollout = []
        r_rollout = []
        d_rollout = []

        if use_ctrl_exp:
            # randomize the explorer and controller
            explorer_new = update_params_noise(explorer, randomness_factor)
            controller_new = update_params_noise(controller, randomness_factor)
            # initialize the hidden state for the model
            hidden = [torch.zeros(1, RSIZE).to(device) for _ in range(2)]
        else:
            if noise_type == 'white':
                a_rollout = [env.action_space.sample() for _ in range(seq_len)]
            elif noise_type == 'brown':
                a_rollout = sample_continuous_policy(env.action_space, seq_len,
                                                     1. / 50)

        t = 0
        while True:
            if use_ctrl_exp:
                # explore or exploit
                if random.uniform(0, 1) < exp_prob:
                    action, obs, hidden = ctrl_gen(obs, hidden)
                else:
                    action, obs, hidden = exp_gen(obs, hidden)
                a_rollout.append(action)
            else:
                action = a_rollout[t]
            t += 1

            s, r, done, _ = env.step(action)
            env.env.viewer.window.dispatch_events()
            s_rollout += [s]
            r_rollout += [r]
            d_rollout += [done]
            if done:
                print("> End of rollout {}, {} frames...".format(
                    i, len(s_rollout)))
                np.savez(join(datadir, 'rollout_{}'.format(i)),
                         observations=np.array(s_rollout),
                         rewards=np.array(r_rollout),
                         actions=np.array(a_rollout),
                         terminals=np.array(d_rollout))
                break