def main():
    # Parse the JSON arguments, e.g. from ".\config\breakout.json"
    config_args = parse_args()

    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=config_args.num_envs,
                            inter_op_parallelism_threads=config_args.num_envs)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Prepare the experiment directories
    config_args.experiment_dir, config_args.summary_dir, config_args.checkpoint_dir, \
        config_args.output_dir, config_args.test_dir = \
        create_experiment_dirs(config_args.experiment_dir)

    use_VAE = True
    a2c = A2C(sess, config_args, use_VAE)

    if config_args.to_train:
        a2c.train()
    if config_args.to_test:
        a2c.test(total_timesteps=10000000)
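
# For reference, a minimal sketch of what create_experiment_dirs could look
# like. The real helper lives elsewhere in this repo; the directory names and
# return order below are assumptions inferred from how main() unpacks the
# five returned paths.
import os

def create_experiment_dirs_sketch(experiment_dir):
    """Create the experiment directory tree and return the five paths."""
    sub_dirs = ['summaries', 'checkpoints', 'output', 'test']
    paths = [experiment_dir] + [os.path.join(experiment_dir, d) for d in sub_dirs]
    for path in paths:
        os.makedirs(path, exist_ok=True)  # no-op if the directory already exists
    return tuple(paths)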
def gen_data(gen_args, render=False):
    """Roll out the trained A2C policy and record a batch of transitions.

    Saved format: (obs, obs_mask, action, value, done)
    """
    batch_num, postfix, max_steps, frame_skip = gen_args
    file_name = 'Breakout_raw_{}_{:04d}'.format(postfix, batch_num)

    observation_list = []
    obs_mask_list = []
    actions_list = []
    values_list = []
    dones_list = []

    # Configuration set-up
    config_args = parse_args()
    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=config_args.num_envs,
                            inter_op_parallelism_threads=config_args.num_envs)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Prepare the experiment directories
    config_args.experiment_dir, config_args.summary_dir, config_args.checkpoint_dir, \
        config_args.output_dir, config_args.test_dir = \
        create_experiment_dirs(config_args.experiment_dir)

    a2c = A2C(sess, config_args)

    # Restore the environment spec pickled during training
    with open(a2c.args.experiment_dir + a2c.args.env_name + '.pkl', 'rb') as f:
        observation_space_shape, action_space_n = pickle.load(f)

    env = a2c.make_all_environments(num_envs=1, env_class=a2c.env_class,
                                    env_name=a2c.args.env_name,
                                    seed=a2c.args.env_seed)

    a2c.model.build(observation_space_shape, action_space_n)
    a2c.trainer._init_model()
    a2c.trainer._load_model()

    states = a2c.trainer.model.step_policy.initial_state
    dones = [False for _ in range(env.num_envs)]
    observation_s = np.zeros(
        (env.num_envs, a2c.trainer.model.img_height, a2c.trainer.model.img_width,
         a2c.trainer.model.num_classes * a2c.trainer.model.num_stack),
        dtype=np.uint8)
    observation_s = __observation_update(env.reset(), observation_s)
    mask_s = np.zeros_like(observation_s)

    i = 0
    while len(observation_list) < max_steps:
        actions, values, states = a2c.model.step_policy.step(
            observation_s, states, dones)
        observation, rewards, dones, _ = env.step(actions)
        for n, done in enumerate(dones):
            if done:
                observation_s[n] *= 0  # clear the frame stack on episode end
        print(file_name, i, len(observation_list), max_steps, end='\r')

        # Motion mask: positive difference between the new frame and the most
        # recent frame in the stack.
        obs_mask = observation.astype(int) - observation_s[:, :, :, -1, None]
        obs_mask = obs_mask * (obs_mask > 0)
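        # Worked example of the mask arithmetic above: a pixel going from 52
        # in the last stacked frame to 87 in the new frame keeps 87 - 52 = 35,
        # while a pixel that darkens (87 -> 20) yields a negative difference
        # that the (obs_mask > 0) factor zeroes out, so only newly lit pixels
        # (the moving ball and paddle) survive in the mask.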
        observation_s = __observation_update(observation, observation_s)
        mask_s = __observation_update(obs_mask, mask_s)

        # Keep every frame_skip-th step
        if i % frame_skip == 0:
            observation_list.append(observation_s)
            obs_mask_list.append(mask_s)
            actions_list.append(actions)
            values_list.append(values)
            dones_list.append(dones)

        if render:
            env.render()

        i += 1
    print()

    data_as_lists = [
        np.vstack(observation_list),
        np.vstack(obs_mask_list),
        np.asarray(actions_list),
        np.asarray(values_list),
        np.asarray(dones_list)
    ]

    # Measured earlier: roughly 25 KB (24.6 KiB) of memory per raw observation.
    data_utils.save_lists_as_h5(file_name, data_as_lists)
    print('Saved batch: {:4}'.format(batch_num), '-', file_name)
    env.close()
    return file_name
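
# gen_data and both main() entry points call __observation_update without
# defining it here. A minimal sketch, assuming it maintains a rolling stack of
# the last num_stack frames along the channel axis (the helper name and the
# single-channel frame layout are assumptions):
def observation_update_sketch(new_obs, observation_s):
    """Shift the frame stack one slot left and append the newest frame."""
    # np.roll returns a new array, so the snapshots appended to
    # observation_list above do not all alias the same buffer.
    updated = np.roll(observation_s, shift=-1, axis=3)
    updated[:, :, :, -1] = new_obs[:, :, :, 0]
    return updated

# Example call, matching the (batch_num, postfix, max_steps, frame_skip)
# tuple unpacked at the top of gen_data:
#   gen_data((0, 'train', 10000, 4), render=False)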
def main():
    model_name = 'VAEModel'
    model_path = 'C:\\Users\\Vlad-PC\\Desktop\\'
    model_path += model_name

    # Stacked 84x84x4 Atari frames; the network input keeps the raw shape.
    raw_dim = (84, 84, 4)
    net_dim = (84, 84, 4)

    exp_param = ExpParam(
        lat_type="discrete",
        dataset='breakout',
        latent=[[32 * 128, 2]],
        raw_type=tf.uint8,
        raw_dim=raw_dim,
        net_dim=net_dim,  # very close to the original aspect ratio
        batch_size=2,  # for testing
    )

    # Load (or create) the VAE in its own graph and session
    sess_ae, AE, saver = create_or_load_vae(
        model_path, exp_param=exp_param, critical_load=True)

    # Build the A2C model in a second, separate graph
    graph_a2c = tf.Graph()
    with graph_a2c.as_default():
        config_args = parse_args()
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=config_args.num_envs,
                                inter_op_parallelism_threads=config_args.num_envs)
        config.gpu_options.allow_growth = True
        sess_a2c = tf.Session(config=config)

        config_args.experiment_dir, config_args.summary_dir, config_args.checkpoint_dir, \
            config_args.output_dir, config_args.test_dir = \
            create_experiment_dirs(config_args.experiment_dir)

        a2c = A2C(sess_a2c, config_args, True)
        env = A2C.make_all_environments(a2c.args.num_envs, a2c.env_class,
                                        a2c.args.env_name, a2c.args.env_seed)

        print("\n\nBuilding the model...")
        if a2c.useVAE:
            a2c.model.buildForVAE(env.observation_space.shape,
                                  env.action_space.n, a2c.latent_size)
        print("Model is built successfully\n")

        # Training
        print('Training...')
        if a2c.args.to_train:
            a2c.trainer.trainFromVAE(env, sess_ae, AE)

        # Testing: restore the environment spec pickled during training
        with open(a2c.args.experiment_dir + a2c.args.env_name + '.pkl', 'rb') as f:
            observation_space_shape, action_space_n = pickle.load(f)

        env = a2c.make_all_environments(num_envs=1, env_class=a2c.env_class,
                                        env_name=a2c.args.env_name,
                                        seed=a2c.args.env_seed)
        a2c.model.buildForVAE(observation_space_shape, action_space_n,
                              a2c.latent_size)
        a2c.trainer._init_model()
        a2c.trainer._load_model()

        states = a2c.trainer.model.step_policy.initial_state
        dones = [False for _ in range(env.num_envs)]
        observation_s = np.zeros(
            (env.num_envs, a2c.trainer.model.img_height, a2c.trainer.model.img_width,
             a2c.trainer.model.num_classes * a2c.trainer.model.num_stack),
            dtype=np.uint8)
        observation = env.reset()
        observation_s = __observation_update(observation, observation_s)

        i = 0
        max_steps = 1e3
        while i < max_steps:
            i += 1
            # Encode the stacked frames into the VAE latent before stepping
            observation_z = encode_data(AE, sess_ae, observation_s)
            # TODO: change a2c.model.step_policy.step
            actions, values, states = a2c.model.step_policy.step(
                observation_z, states, dones)
            observation, rewards, dones, _ = env.step(actions)
            for n, done in enumerate(dones):
                if done:
                    observation_s[n] *= 0  # clear the frame stack on episode end
            # Refresh the frame stack with the newest frame
            observation_s = __observation_update(observation, observation_s)
            env.render()
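
# main() above keeps the VAE and the A2C model in separate tf.Graph objects so
# their variables and name scopes cannot collide. A minimal standalone sketch
# of the same two-graph pattern, with toy variables standing in for the real
# models:
import tensorflow as tf

graph_vae = tf.Graph()
with graph_vae.as_default():
    z = tf.get_variable('latent', shape=[4096])
    sess_vae = tf.Session(graph=graph_vae)
    sess_vae.run(tf.global_variables_initializer())

graph_policy = tf.Graph()
with graph_policy.as_default():
    # Reusing the name 'latent' here would also be fine: the graphs are isolated.
    pi = tf.get_variable('policy_logits', shape=[4])
    sess_policy = tf.Session(graph=graph_policy)
    sess_policy.run(tf.global_variables_initializer())

# Each session only evaluates tensors from its own graph.
assert sess_vae.run(z).shape == (4096,)
assert sess_policy.run(pi).shape == (4,)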