def main():
    now = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    model_name = 'simpleCNN_' + now + '.h5'
    batch_size = 256
    num_epochs = 30
    lr = .001

    num_train_samples = len(os.listdir('./data/train/cancer')) + len(os.listdir('./data/train/healthy'))
    num_valid_samples = len(os.listdir('./data/validation/cancer')) + len(os.listdir('./data/validation/healthy'))

    # Build our model
    input_tensor = Input(shape=(96, 96, 3))
    x = layers.Conv2D(32, (3, 3))(input_tensor)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(64, (3, 3))(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(128, (3, 3))(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(128, (3, 3))(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Flatten()(x)
    x = layers.Dropout(.5)(x)
    x = layers.Dense(512, activation='relu')(x)
    output_tensor = layers.Dense(1, activation='sigmoid')(x)

    model = Model(input_tensor, output_tensor)
    model.summary()

    # Get things ready to train: adjust the learning rate, etc. here
    model.compile(optimizer=Adam(lr), loss='binary_crossentropy', metrics=['acc'])

    train_generator = train_gen(batch_size)
    validation_generator = valid_gen(batch_size)

    # Keras expects integer step counts, so round up to cover every sample
    steps_per_epoch = (num_train_samples + batch_size - 1) // batch_size
    validation_steps = (num_valid_samples + batch_size - 1) // batch_size

    # Basic callbacks
    checkpoint = callbacks.ModelCheckpoint(filepath='./models/' + model_name,
                                           monitor='val_loss', save_best_only=True)
    early_stop = callbacks.EarlyStopping(monitor='val_acc', patience=3)
    csv_logger = callbacks.CSVLogger('./logs/' + model_name.split('.')[0] + '.csv')
    callback_list = [checkpoint, early_stop, csv_logger]

    # Training begins
    history = model.fit_generator(train_generator,
                                  steps_per_epoch=steps_per_epoch,
                                  epochs=num_epochs,
                                  verbose=1,
                                  callbacks=callback_list,
                                  validation_data=validation_generator,
                                  validation_steps=validation_steps)

    model.save('./models/' + model_name)
    make_plots(history, model_name)
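# Both training scripts here rely on `train_gen` and `valid_gen` helpers defined
# elsewhere, which yield (image, label) batches from ./data/train and
# ./data/validation. A minimal sketch of what such helpers might look like, using
# Keras' ImageDataGenerator; the rescaling and class_mode are assumptions, not
# taken from this file:
from keras.preprocessing.image import ImageDataGenerator

def train_gen(batch_size):
    # assumed: scale pixels to [0,1]; binary labels come from the two class subfolders
    datagen = ImageDataGenerator(rescale=1. / 255)
    return datagen.flow_from_directory('./data/train', target_size=(96, 96),
                                       batch_size=batch_size, class_mode='binary')

def valid_gen(batch_size):
    datagen = ImageDataGenerator(rescale=1. / 255)
    return datagen.flow_from_directory('./data/validation', target_size=(96, 96),
                                       batch_size=batch_size, class_mode='binary')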
def main():
    now = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    model_name = 'pretrain_NASNet_' + now + '.h5'
    batch_size = 32
    num_epochs = 30
    lr = .0001

    num_train_samples = len(os.listdir('./data/train/cancer')) + len(os.listdir('./data/train/healthy'))
    num_valid_samples = len(os.listdir('./data/validation/cancer')) + len(os.listdir('./data/validation/healthy'))

    # Build our model
    input_tensor = Input(shape=(96, 96, 3))
    NASNet = NASNetMobile(include_top=False, input_shape=(96, 96, 3))
    x = NASNet(input_tensor)
    x1 = layers.GlobalMaxPooling2D()(x)
    x2 = layers.GlobalAveragePooling2D()(x)
    x3 = layers.Flatten()(x)
    z = layers.Concatenate(axis=-1)([x1, x2, x3])
    z = layers.Dropout(.5)(z)
    output_tensor = layers.Dense(1, activation='sigmoid')(z)

    model = Model(input_tensor, output_tensor)
    model.summary()

    # Get things ready to train: tweak the learning rate, etc. here
    model.compile(optimizer=Adam(lr), loss='binary_crossentropy', metrics=['acc'])

    train_generator = train_gen(batch_size)
    validation_generator = valid_gen(batch_size)

    # Keras expects integer step counts, so round up to cover every sample
    steps_per_epoch = (num_train_samples + batch_size - 1) // batch_size
    validation_steps = (num_valid_samples + batch_size - 1) // batch_size

    # Basic callbacks
    checkpoint = callbacks.ModelCheckpoint(filepath='./models/' + model_name,
                                           monitor='val_loss', save_best_only=True)
    early_stop = callbacks.EarlyStopping(monitor='val_acc', patience=4)
    csv_logger = callbacks.CSVLogger('./logs/' + model_name.split('.')[0] + '.csv')
    callback_list = [checkpoint, early_stop, csv_logger]

    # Training begins
    history = model.fit_generator(train_generator,
                                  steps_per_epoch=steps_per_epoch,
                                  epochs=num_epochs,
                                  verbose=1,
                                  callbacks=callback_list,
                                  validation_data=validation_generator,
                                  validation_steps=validation_steps)

    model.save('./models/' + model_name)
    make_plots(history, model_name)
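# The NASNet script above fine-tunes the whole backbone end-to-end. A common
# alternative is plain feature extraction: freeze the pretrained backbone and
# train only the new head. This is a hedged sketch of that variant, not what
# the script above does; the function name and pooling choice are assumptions.
def build_frozen_nasnet(lr=1e-4):
    input_tensor = Input(shape=(96, 96, 3))
    base = NASNetMobile(include_top=False, input_shape=(96, 96, 3))
    base.trainable = False  # freeze all pretrained NASNetMobile weights
    x = base(input_tensor)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(.5)(x)
    output_tensor = layers.Dense(1, activation='sigmoid')(x)
    model = Model(input_tensor, output_tensor)
    model.compile(optimizer=Adam(lr), loss='binary_crossentropy', metrics=['acc'])
    return model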
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1: the first stack*channels - channels frames become the last ones, i.e. slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # set it to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']
    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)
    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)
    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)
    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)
    if grad_var_:
        print('env for grad_var_')
        envs_grad_var = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0
    model_dict['action_size'] = envs.action_space.n
    print(envs.action_space.n, 'actions')

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    discriminator = CNN_Discriminator(model_dict).cuda()
    print('init discriminator')

    # Note: do_prob_state below needs a VAE; enable this block before setting vae_
    # if vae_:
    #     vae = VAE()
    #     vae.cuda()

    buffer_ = 1
    if buffer_:
        buffer_states = deque(maxlen=200)
        buffer_actions = deque(maxlen=200)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest, since it's a stack
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        for step in range(num_steps):

            # Act: V [P,1], action [P,1], log-probs [P,1], entropy [P]
            state_pytorch = Variable(agent.rollouts.states[step])
            value, action, action_log_probs, dist_entropy = agent.act(state_pytorch)

            # Apply to environment, S:[P,C,H,W], R:[P], D:[P]
            # Atari action meanings, e.g. Breakout: ['NOOP', 'FIRE', 'RIGHT', 'LEFT'];
            # Montezuma uses the full 18-action set, where FIRE = JUMP
            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]
            frame, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, frame, shape_dim0)
            agent.insert_data(step, current_state, action.data, value, reward, masks,
                              action_log_probs, dist_entropy, 0)

        buffer_steps = 500
        if buffer_:
            # Insert this rollout into the buffer
            buffer_states.append(agent.rollouts.states)
            buffer_actions.append(agent.rollouts.actions)

            # Once the buffer is full enough: sample, predict, optimize
            if len(buffer_actions) == 100:
                # Number of optimization steps
                for i in range(buffer_steps):
                    # Sample a stored rollout
                    ind = np.random.randint(len(buffer_actions))
                    states_batch = buffer_states[ind]
                    actions_batch = buffer_actions[ind]
                    # Optimize action-predictor
                    discrim_errors = discrim_predictions(model_dict, states_batch, actions_batch, discriminator)
                    discriminator.optimize(discrim_errors)
                    if i % 20 == 0:
                        print(i)

            # Optimize agent
            discrim_errors = discrim_predictions(model_dict, agent.rollouts.states,
                                                 agent.rollouts.actions, discriminator)
            discrim_errors_reverse = discrim_predictions(model_dict, agent.rollouts.states,
                                                         agent.rollouts.actions, discriminator, reverse=True)
            if len(buffer_actions) > 100:
                discriminator.optimize(discrim_errors)
                agent.update2(discrim_errors, discrim_errors_reverse)
        else:
            discrim_errors = discrim_predictions(model_dict, agent.rollouts.states,
                                                 agent.rollouts.actions, discriminator)
            discrim_errors_reverse = discrim_predictions(model_dict, agent.rollouts.states,
                                                         agent.rollouts.actions, discriminator, reverse=True)
            # Optimize discriminator
            discriminator.optimize(discrim_errors)
            # Optimize agent
            agent.update2(discrim_errors, discrim_errors_reverse)

        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)
            # Make vae prob gif (requires the VAE block above to be enabled)
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.3f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start, end - start2,
                torch.mean(discrim_errors).data.cpu().numpy()[0])
            print(to_print_info_string)
            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, discrim_E"
            start2 = time.time()

            if j % (log_interval * 30) == 0:
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                if grad_var_ and j % (log_interval * 30) == 0:
                    # writes to file
                    do_grad_var(envs_grad_var, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # Update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    if grad_var_ and j % (log_interval * 30) == 0:
                        update_grad_plot(model_dict)
                        to_print_legend_string += ' grad_var_plot updated '
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise  # pass
                    print(to_print_legend_string + " problem with plot")  # unreachable while the raise above is active

    try:
        make_plots(model_dict)
    except:
        print()
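# `discrim_predictions` and `CNN_Discriminator` come from elsewhere in this repo.
# A hedged sketch of the interface the training loop above assumes: given rollout
# states [Steps+1, P, ...] and actions [Steps, P, 1], score how well an
# inverse-dynamics model predicts each taken action from consecutive frames.
# The name, shapes, and per-step loop below are illustrative assumptions, not
# the repo's actual implementation.
def discrim_predictions_sketch(model_dict, states, actions, discriminator, reverse=False):
    errors = []
    for step in range(actions.size(0)):
        prev_frame, next_frame = states[step], states[step + 1]
        if reverse:
            # reversed frame order, used above as a contrastive/regularizing signal
            prev_frame, next_frame = next_frame, prev_frame
        # discriminator.forward is assumed to return a per-process action-prediction
        # error (e.g. cross-entropy of the predicted action), shape [P,1]
        errors.append(discriminator.forward(prev_frame, next_frame, actions[step]))
    return torch.stack(errors)  # [Steps, P, 1]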
description="Replace all Bokeh images with interactivet Bokeh plots." ) parser.add_argument( "--readme_file", dest="readme_file", default="README.md", help="Path to README.md file", ) readme_file = parser.parse_args().readme_file # Read in README: with open(readme_file) as f: readme = f.read() # Create Bokeh plots: plots = make_plots() # Replace pictures with HTML plots: for plotname, plot in plots.items(): print(f"Replace image {plotname}") if re.search(r"!\[{plotname}\]\(.+\)".format(plotname=plotname), readme): to_replace = re.search( r"!\[{plotname}\]\(.+\)".format(plotname=plotname), readme ).group() readme = readme.replace(to_replace, f'<div align="center">\n\n{plot}\n\n</div>') else: raise KeyError( f"No image with name '{plotname}' has been found in the README file '{readme_file}'." ) # Replace path to remaining pictures:
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1: the first stack*channels - channels frames become the last ones, i.e. slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # set it to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']
    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)
    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)
    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')

    # Load pretrained params into the actor-critic
    # param_file = home + '/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
    param_file = home + '/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/seed1/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    agent.actor_critic.load_state_dict(param_dict)
    agent.actor_critic.cuda()
    print('loaded', param_file)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # list of lists, where the inner lists are trajectories; trajectories hold actions and states
    dataset = []
    tmp_trajs = [[] for x in range(num_processes)]
    dataset_count = 0
    done = [0] * num_processes

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]))
            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            states_ = agent.rollouts.states[step].cpu().numpy()  # [P,S,84,84]
            actions_ = action.data.cpu().numpy()  # [P,1]

            # Store the step of every process; flush a process's trajectory once its episode is done
            for proc in range(num_processes):
                state_t = states_[proc]
                action_t = actions_[proc]
                tmp_trajs[proc].append([action_t, state_t])
                if done[proc]:
                    dataset.append(tmp_trajs[proc])
                    dataset_count += len(tmp_trajs[proc])
                    tmp_trajs[proc] = []
                    for ii in range(len(dataset)):
                        print(len(dataset[ii]))

            if dataset_count > 10000:
                # pickle.dump(dataset, open(home + '/Documents/tmp/breakout_2frames/breakout_trajectories_10000.pkl', "wb"))
                pickle.dump(dataset, open(home + '/Documents/tmp/RoadRunner/trajectories_10000.pkl', "wb"))
                print('saved')
                STOP  # intentional NameError: halt the run once enough trajectories are saved

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value, reward, masks,
                              action_log_probs, dist_entropy)

        # No agent optimization here: this run only collects trajectories
        # agent.update()
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start, end - start2)
            print(to_print_info_string)
            start2 = time.time()
            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"

            if j % (log_interval * 30) == 0:
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # Update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise  # pass
                    print(to_print_legend_string)  # unreachable while the raise above is active

    try:
        make_plots(model_dict)
    except:
        print()
def combine_and_resample_px4_nogui(input_path, file_prefix=''):

    # list of file keywords to identify the files we want
    keywords_in_wanted_files = [
        'gps_position', 'combined', 'magnetometer_0', 'input_rc',
        'battery_status', 'attitude', 'control', 'air_data', 'vehicle_status_0'
    ]

    # list of keywords to identify the columns we want
    keywords_for_columns = [
        'lat', 'lon', 'alt', 'hdop', 'vdop', 'mode_slot',
        'accelerometer_m_s2[0]', 'accelerometer_m_s2[1]', 'accelerometer_m_s2[2]',
        'gyro_rad[0]', 'gyro_rad[1]', 'gyro_rad[2]',
        'magnetometer_ga[0]', 'magnetometer_ga[1]', 'magnetometer_ga[2]',
        'rc_failsafe', 'voltage_v', 'rssi',
        'q[0]', 'q[1]', 'q[2]', 'q[3]',
        'baro_alt', 'nav_state',
        'values[0]', 'values[1]', 'values[2]', 'values[3]', 'values[4]',
        'values[5]', 'values[6]', 'values[7]', 'values[8]', 'values[9]',
        'values[10]', 'values[11]', 'values[12]', 'values[13]', 'values[14]',
        'values[15]', 'values[16]', 'values[17]', 'values[18]'
    ]

    # save the current path so we can return to it later
    original_path = os.getcwd()
    os.chdir(input_path)

    # get the list of filenames in the directory ending with .csv
    list_of_filenames = [f for f in os.listdir('.') if f.endswith('.csv')]

    # remove the files we know we do not want from the list
    list_of_filenames = [
        item for item in list_of_filenames
        if any(word in item for word in keywords_in_wanted_files)
    ]

    # exit if there are no files left to process
    if len(list_of_filenames) == 0:
        sys.exit("There are no files to process in " + input_path)

    # create empty lists so we can append to them
    list_of_df = []
    reject_column_list = []
    new_headers = []

    # iterate through the csvs in the current directory, create a df for each
    # filename, and put the df into a list of dfs
    for current_filename in list_of_filenames:
        # this is the important read: the index is stored in column 0, the
        # header in row 0, and pandas names the columns for us from the header row
        df = pd.read_csv(current_filename, index_col=0, header=0)

        # get a list of all the headers in the csv
        column_headers = df.columns

        columns_to_keep = []

        # decide which columns to keep for each csv by checking every header
        # against the keyword list
        for header in column_headers:
            for keyword in keywords_for_columns:
                if keyword in header:
                    # the words 'lat' and 'alt' appear inside other headers, so
                    # those two keywords require an exact match
                    if keyword == 'lat' and header == 'lat':
                        columns_to_keep.append(header)
                        assign_names(keyword, new_headers)
                    if keyword == 'alt' and header == 'alt':
                        columns_to_keep.append(header)
                        assign_names(keyword, new_headers)
                    # for any other keyword, keep the header and record its new
                    # name via assign_names
                    if keyword != 'lat' and keyword != 'alt':
                        columns_to_keep.append(header)
                        assign_names(keyword, new_headers)

        # reduce the df to just the columns we want
        reject_column_list = [header for header in column_headers if header not in columns_to_keep]
        df = df.drop(columns=reject_column_list)

        # store the current df (from a single csv) into the big list of dfs
        list_of_df.append(df)

    # create the big df by concatenating the list of small dfs
    big_df = pd.concat(list_of_df, axis=0, ignore_index=False, sort=False)

    # sort on the timestamp column; otherwise the small dfs are stuck together
    # end-to-end, which isn't what we want
    big_df = big_df.sort_values(by='timestamp')

    # some px4 data files start with 0 for the timestamp; we don't want this,
    # so just discard those rows for now
    big_df = big_df.drop(0, errors="ignore")

    # offset time to zero, just because we can
    big_df.index = big_df.index - big_df.index[0]

    # fill the missing spaces: ffill carries the most recent valid observation forward
    big_df = big_df.fillna(method='ffill')

    # fill the remaining NaNs with 0; these only occur at the beginning, where
    # there were no previous observations to carry forward
    big_df = big_df.fillna(0)

    # get rid of duplicate rows: the first line removes duplicate time entries
    # (definitely needed), the second removes duplicated output data (kept just in case)
    big_df = big_df[~big_df.index.duplicated()]
    big_df = big_df.drop_duplicates()

    # create a time-delta column in the proper format; note the 10**6, since the
    # px4 timestamp is in microseconds and 'S' means this function expects the
    # time formatted in seconds, so the easiest fix is to convert before passing it
    big_df['time_properformat'] = pd.to_timedelta(big_df.index / 10.0**6, 'S')

    # switch the index of big_df to the proper time-delta column
    big_df.index = pd.to_datetime(big_df.time_properformat.astype('int64'))

    # fix this at a 250 Hz rate, since the automatic calculation was causing issues
    min_sampletime = .004
    freq_arg = int(min_sampletime * 10**6)
    freq_type = 'U'  # microseconds

    # create the resampled df
    resampled_df = big_df.asfreq(str(freq_arg) + freq_type, method='ffill')

    # drop the datetime index, switching back to delta time in seconds
    resampled_df.index = resampled_df.index.values.astype(np.uint64) / 1000000

    # get rid of the now-unused column before writing to csv
    resampled_df = resampled_df.drop(columns=['time_properformat'])

    # assign the new headers to the columns
    resampled_df.columns = new_headers

    # call quat2eul to convert the attitude quaternion to Euler angles
    r, p, y = quat2eul(resampled_df['Att.Qx'], resampled_df['Att.Qy'],
                       resampled_df['Att.Qz'], resampled_df['Att.Qw'])

    # add roll, pitch and yaw to the resampled_df
    resampled_df['Att.roll'] = r
    resampled_df['Att.pitch'] = p
    resampled_df['Att.yaw'] = y

    # drop the columns holding the quaternion data
    resampled_df = resampled_df.drop(columns=['Att.Qx', 'Att.Qy', 'Att.Qz', 'Att.Qw'])

    # check whether the output folder exists and create it if needed; this avoids
    # the script trying to read its own results.csv as one of the constituent
    # files when run on a directory it has processed at least once before
    if not os.path.exists('combined'):
        os.mkdir('combined')

    # write the result; we want to write and label the index column
    # (feel free to change the name)
    resampled_df.to_csv(path_or_buf=os.path.join('combined', file_prefix + '_results.csv'),
                        index=True, index_label='Time')

    print('Resampling complete.')
    print('Minimum sample time is:\t%f s\nThe corresponding frequency is:\t%f Hz\nOutput saved to: %s'
          % (min_sampletime, 1 / min_sampletime,
             os.path.join('combined', file_prefix + '_results.csv')))

    # call make_plots on the results file
    make_plots('combined' + '/' + file_prefix + '_results.csv')

    # go back to the original directory so we can iterate through the rest of the files
    os.chdir(original_path)
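# `quat2eul` and `assign_names` are helpers defined elsewhere in this module.
# For reference, a minimal quaternion-to-Euler conversion matching the
# (roll, pitch, yaw) return order used above. The (x, y, z, w) argument order
# mirrors the Att.Qx..Att.Qw columns, but that mapping is an assumption (px4
# logs store q[0] as w), so check it against the real helper before use.
def quat2eul_sketch(qx, qy, qz, qw):
    roll = np.arctan2(2 * (qw * qx + qy * qz), 1 - 2 * (qx**2 + qy**2))
    pitch = np.arcsin(np.clip(2 * (qw * qy - qz * qx), -1.0, 1.0))
    yaw = np.arctan2(2 * (qw * qz + qx * qy), 1 - 2 * (qy**2 + qz**2))
    return roll, pitch, yaw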
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1: the first stack*channels - channels frames become the last ones, i.e. slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # set it to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']
    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)
    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)
    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)
    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    next_state_pred_ = 0
    model_dict['next_state_pred_'] = next_state_pred_

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    elif algo == 'a2c_list_rollout':
        agent = a2c_list_rollout(envs, model_dict)
        print('init a2c_list_rollout agent')
    elif algo == 'a2c_with_var':
        agent = a2c_with_var(envs, model_dict)
        print('init a2c_with_var agent')

    if vae_:
        vae = VAE()
        vae.cuda()

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            state_pytorch = Variable(agent.rollouts.states[step])
            value, action, action_log_probs, dist_entropy = agent.act(state_pytorch)

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)
            reward_numpy = reward

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            if next_state_pred_:
                agent.insert_data(step, current_state, action.data, value, reward, masks,
                                  action_log_probs, dist_entropy, next_state_prediction)
                agent.rollouts.insert_state_pred(next_state_prediction)
            else:
                agent.insert_data(step, current_state, action.data, value, reward, masks,
                                  action_log_probs, dist_entropy, 0)

        # Optimize agent
        agent.update()  # agent.update(j, num_updates)

        # Fit the VAE on this rollout's frames (guarded on vae_, since vae only exists then)
        if vae_:
            batch = agent.rollouts.states  # [Steps+1, Processes, Stack, 84, 84]
            # remove the first state since it's repeated (the last state of the previous rollout),
            # take the first frame of the stack for each step, and reshape to [Steps*Processes, 84, 84]
            batch = batch[1:]  # [Steps, Processes, Stack, 84, 84]
            batch = batch[:, :, 0]  # [Steps, Processes, 84, 84]
            batch = batch.contiguous().view(-1, 84, 84)  # [Steps*Processes, 84, 84]
            elbo = vae.update(batch)

        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)
            # Make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start, end - start2)
            elbo_str = "{:.2f}".format(elbo.data.cpu().numpy()[0]) if vae_ else ''
            if next_state_pred_:
                state_pred_error_print = "{:.2f}".format(agent.state_pred_error.data.cpu().numpy()[0])
                print(to_print_info_string + ' ' + state_pred_error_print + ' ' + elbo_str)
                to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo"
            else:
                print(to_print_info_string + ' ' + elbo_str)
                to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, elbo"
            start2 = time.time()

            if j % (log_interval * 30) == 0:
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # Update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise  # pass
                    print(to_print_legend_string)  # unreachable while the raise above is active

    try:
        make_plots(model_dict)
    except:
        print()
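# `VAE` is defined elsewhere in this repo; the loop above only relies on
# vae.update(batch) taking [N, 84, 84] frames, doing one optimization step, and
# returning an ELBO-like scalar. A hedged sketch of that interface, written in
# current PyTorch (the surrounding code predates it), with an assumed
# architecture and frames assumed scaled to [0, 1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VAESketch(nn.Module):
    def __init__(self, z_dim=32):
        super().__init__()
        self.enc = nn.Linear(84 * 84, 2 * z_dim)  # outputs mean and log-variance
        self.dec = nn.Linear(z_dim, 84 * 84)
        self.opt = torch.optim.Adam(self.parameters(), lr=1e-4)

    def update(self, batch):
        x = batch.view(batch.size(0), -1)
        mu, logvar = self.enc(x).chunk(2, dim=1)
        z = mu + torch.exp(.5 * logvar) * torch.randn_like(mu)  # reparameterization trick
        recon = torch.sigmoid(self.dec(z))
        rec = F.binary_cross_entropy(recon, x, reduction='sum') / x.size(0)
        kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / x.size(0)
        loss = rec + kl  # negative ELBO
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return -loss  # ELBO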
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1: the first stack*channels - channels frames become the last ones, i.e. slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # set it to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    def do_vid():
        n_vids = 3
        for i in range(n_vids):
            done = False
            state = envs_video.reset()
            current_state = torch.zeros(1, *obs_shape)
            current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
            # print('Recording')
            while not done:
                # Act
                state_var = Variable(current_state, volatile=True)
                action, value = agent.act(state_var)
                cpu_actions = action.data.squeeze(1).cpu().numpy()
                # Observe reward and next state; state: [nProcesses, ndims, height, width]
                state, reward, done, info = envs_video.step(cpu_actions)
                current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
        state = envs_video.reset()

        # rename the recorded videos and remove the json logs
        vid_path = save_dir + '/videos/'
        count = 0
        for aaa in os.listdir(vid_path):
            if 'openaigym' in aaa and '.mp4' in aaa:
                # os.rename(vid_path+aaa, vid_path+'vid_t'+str(total_num_steps)+'.mp4')
                subprocess.call("(cd " + vid_path + " && mv " + vid_path + aaa + " " +
                                vid_path + env_name + '_' + algo + '_vid_t' + str(total_num_steps) +
                                '_' + str(count) + ".mp4)", shell=True)
                count += 1
            if '.json' in aaa:
                os.remove(vid_path + aaa)

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    # print("#######")
    # print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    # print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    num_updates = int(num_frames) // num_steps // num_processes

    if cuda:
        torch.cuda.manual_seed(seed)
    else:
        torch.manual_seed(seed)

    # Create environments
    print(num_processes, 'processes')
    envs = SubprocVecEnv([make_env(env_name, seed, i, save_dir) for i in range(num_processes)])

    print('env for video')
    # envs_video = gym.make(env_name)
    # envs_video = gym.wrappers.Monitor(envs_video, save_dir+'/videos/', video_callable=lambda x: True, force=True)
    envs_video = make_env_monitor(env_name, save_dir)  # +'/videos/'

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape

    if cuda:
        dtype = torch.cuda.FloatTensor
    else:
        dtype = torch.FloatTensor

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    # Begin training
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P,1]
            action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # I need to figure out when the env is being reset. Maybe when it's done,
            # the returned state is actually the first state of the next episode; if so,
            # this ordering is all right. BUT if that is true, the agent never sees the
            # last frame: when done arrives, the env is already giving the next frame.
            # So increasing frame_skip could cause problems, because the last frame
            # could be far from the current frame.

            # Record rewards first: masks must be computed before insert_data uses them
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)

            # Update state
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step (saves all of those values)
            agent.insert_data(step, current_state, action.data, value.data, reward, masks)

        # Optimize agent
        agent.update()
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        # Save model
        if total_num_steps % save_interval == 0 and save_dir != "":
            save_path = os.path.join(save_dir, 'model_params')
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = agent.actor_critic
            if cuda:
                save_model = copy.deepcopy(agent.actor_critic).cpu()
            save_to = os.path.join(save_path, "model_params" + str(total_num_steps) + ".pt")
            torch.save(save_model, save_to)
            print('saved', save_to)

            # make video
            do_vid()

        # Print updates
        if j % log_interval == 0:
            end = time.time()
            if j % (log_interval * 30) == 0:
                # update plots
                try:
                    make_plots(model_dict)
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated")
                except:
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time")
            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start))

    try:
        make_plots(model_dict)
    except:
        pass  # raise
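# `make_env_monitor` lives in envs.py; the commented-out lines above show the
# idea. A minimal sketch using the gym Monitor wrapper of this code's era
# (treat the exact wrapper API as an assumption for newer gym versions):
import gym

def make_env_monitor_sketch(env_name, save_dir):
    env = gym.make(env_name)
    # record every episode as an mp4 under save_dir/videos/
    return gym.wrappers.Monitor(env, save_dir + '/videos/',
                                video_callable=lambda x: True, force=True)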
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy( state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += ( 1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] ls_ = model_dict['ls_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype'] = dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype'] = dtype # Create environments print(num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([ make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes) ]) if vid_: print('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print('env for gif') envs_gif = make_env_basic(env_name) if ls_: print('env for ls') envs_ls = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:] ) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape'] = obs_shape model_dict['shape_dim0'] = shape_dim0 action_size = envs.action_space.n # Create agent if algo == 'a2c': agent = a2c(envs, model_dict) print('init a2c agent') elif algo == 'ppo': agent = ppo(envs, model_dict) print('init ppo agent') elif algo == 'a2c_minibatch': agent = a2c_minibatch(envs, model_dict) print('init a2c_minibatch agent') elif algo == 'a2c_list_rollout': agent = a2c_list_rollout(envs, model_dict) print('init a2c_list_rollout agent') elif algo == 'a2c_with_var': agent = a2c_with_var(envs, model_dict) print('init a2c_with_var agent') # elif algo == 
'a2c_bin_mask': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, model_dict) #Load model if model_dict['load_params']: # agent.actor_critic = torch.load(os.path.join(args.load_path)) # agent.actor_critic = torch.load(args.load_path).cuda() # print ('loaded ', args.load_path) if model_dict['load_number'] == 3: load_params_v2( home + '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict) elif model_dict['load_number'] == 6: load_params_v2( home + '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict) elif model_dict['load_number'] == 9: load_params_v2( home + '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict) # else: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict) else: PROBLEM ls_path = save_dir + '/V_and_Q_errors/' ls_file = ls_path + 'error_monitor.csv' if not os.path.exists(ls_path): os.makedirs(ls_path) # if print_: print('Made dir', ls_path) # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros( num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state( current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state( current_state ) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros( [num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes save_interval_num_updates = int(save_interval / num_processes / num_steps) #Begin training # count =0 start = time.time() start2 = time.time() for j in range(num_updates): Vs = [] Qs = [] for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action, action_log_probs, dist_entropy = agent.act( Variable(agent.rollouts.states[step])) #, volatile=True)) # print (action_log_probs.size()) # print (dist_entropy.size()) one_hot_action = torch.FloatTensor(num_processes, action_size) one_hot_action.zero_() one_hot_action.scatter_(1, action.data.cpu(), 1) # print (action) # print (one_hot_action) # fdsfa V, Q = agent.actor_critic.get_V_and_Q( Variable(agent.rollouts.states[step]), one_hot_action) Vs.append(V) Qs.append(Q) cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards( reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done) #Optimize agent # agent.update() #agent.update(j,num_updates) V_loss, Q_loss = agent.update2(Vs, Qs) #agent.update(j,num_updates) V_loss = V_loss.data.cpu().numpy()[0] Q_loss = 
Q_loss.data.cpu().numpy()[0] # print (V_loss) # fasd agent.insert_first_state(agent.rollouts.states[-1]) # print ('save_interval_num_updates', save_interval_num_updates) # print ('num_updates', num_updates) # print ('j', j) total_num_steps = (j + 1) * num_processes * num_steps # if total_num_steps % save_interval == 0 and save_dir != "": if j % save_interval_num_updates == 0 and save_dir != "" and j != 0: #Save model if save_params: do_params(save_dir, agent, total_num_steps, model_dict) save_params_v2(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #Print updates if j % log_interval == 0: # and j!=0: end = time.time() to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format( j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start, end - start2) print(to_print_info_string) start2 = time.time() to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time" if j % (log_interval * 30) == 0: if total_num_steps > 5000: with open(ls_file, 'a') as f: writer = csv.writer(f) writer.writerow([total_num_steps, V_loss, Q_loss]) if ls_: do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards) # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated") # print(to_print_info_string + ' LS recorded')#, agent.current_lr) # else: #update plots try: if ls_: update_ls_plot(model_dict) make_plots(model_dict) if total_num_steps > 5000: update_error_plot(model_dict) print(to_print_legend_string + " Plot updated") except: raise #pass print(to_print_legend_string) try: make_plots(model_dict) except: print()
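# A minimal, self-contained sketch of the frame-stacking update used by the
# train() functions in this file: update_current_state() drops the oldest
# `channels` worth of frames and appends the newest observation at the end of
# the stack. The toy shapes and values below are illustrative assumptions.
import numpy as np
import torch

def stack_frames(current_state, state, channels):
    # current_state: [processes, channels*stack, H, W]; state: numpy [processes, channels, H, W]
    state = torch.from_numpy(state).float()
    current_state[:, :-channels] = current_state[:, channels:]  # slide stack forward
    current_state[:, -channels:] = state                        # newest frame goes last
    return current_state

if __name__ == '__main__':
    P, C, S, H, W = 2, 1, 4, 84, 84
    stacked = torch.zeros(P, C * S, H, W)
    for t in range(5):
        obs = np.full((P, C, H, W), t, dtype=np.float32)
        stacked = stack_frames(stacked, obs, C)
    print(stacked[0, :, 0, 0])  # tensor([1., 2., 3., 4.]) -- the last 4 observations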
def minimize_scipy(self, params0, output=False, plots=False, hires=False):
    """Minimize residual using scipy.optimize Levenberg-Marquardt.

    Inputs:
    params0 -- initial guesses for parameters:
               mn0 -- Mn abundance
               smoothivar0 -- if applicable, inverse variance to use for smoothing

    Keywords:
    plots  -- if 'True', also plot final fit & residual
    output -- if 'True', also output a file (default='False')
    hires  -- if 'True', zoom in a bit on plots to better display hi-res spectra

    Outputs:
    best_mn -- best-fit parameters
    error   -- 1-sigma errors on the best-fit parameters
    """

    # Do minimization
    print('Starting minimization! Initial guesses: ', params0)
    best_mn, covar = scipy.optimize.curve_fit(
        self.synthetic, self.obswvl_final, self.obsflux_final,
        p0=[params0],
        sigma=np.sqrt(np.reciprocal(self.ivar_final)),
        epsfcn=0.01)
    error = np.sqrt(np.diag(covar))

    print('Answer: ', best_mn)
    print('Error: ', error)

    # Do some checks
    if len(np.atleast_1d(best_mn)) == 1:
        finalsynth = self.synthetic(self.obswvl_final, best_mn, full=True)
    else:
        finalsynth = self.synthetic(self.obswvl_final, best_mn[0], best_mn[1], full=True)

    # Output the final data
    if output:
        if len(np.atleast_1d(best_mn)) == 1:
            #finalsynthup = self.synthetic(self.obswvl_final, best_mn + error, full=True)
            #finalsynthdown = self.synthetic(self.obswvl_final, best_mn - error, full=True)
            finalsynthup = self.synthetic(self.obswvl_final, best_mn + 0.15, full=True)
            finalsynthdown = self.synthetic(self.obswvl_final, best_mn - 0.15, full=True)
        else:
            #finalsynthup = self.synthetic(self.obswvl_final, best_mn[0] + error[0], best_mn[1], full=True)
            #finalsynthdown = self.synthetic(self.obswvl_final, best_mn[0] - error[0], best_mn[1], full=True)
            finalsynthup = self.synthetic(self.obswvl_final, best_mn[0] + 0.15, best_mn[1], full=True)
            finalsynthdown = self.synthetic(self.obswvl_final, best_mn[0] - 0.15, best_mn[1], full=True)

        # Create file
        filename = self.outputname + '/' + self.specname + '_data.csv'

        # Define columns
        columnstr = ['wvl', 'obsflux', 'synthflux', 'synthflux_up', 'synthflux_down', 'ivar']
        columns = np.asarray([self.obswvl_final, self.obsflux_final, finalsynth,
                              finalsynthup, finalsynthdown, self.ivar_final])

        with open(filename, 'w') as csvfile:
            datawriter = csv.writer(csvfile, delimiter=',')

            # Write header
            datawriter.writerow(['[Mn/H]', best_mn[0]])
            if len(np.atleast_1d(best_mn)) > 1:
                datawriter.writerow(['dlam', best_mn[1]])
            datawriter.writerow(columnstr)

            # Write data
            for i in range(len(finalsynth)):
                datawriter.writerow(columns[:, i])

        # Make plots, including the perturbed-abundance synthetic spectra
        if plots:
            make_plots(self.lines, self.specname + '_', self.obswvl_final,
                       self.obsflux_final, finalsynth, self.outputname,
                       ivar=self.ivar_final, synthfluxup=finalsynthup,
                       synthfluxdown=finalsynthdown, hires=hires)

    elif plots:
        make_plots(self.lines, self.specname + '_', self.obswvl_final,
                   self.obsflux_final, finalsynth, self.outputname,
                   ivar=self.ivar_final, hires=hires)

    return best_mn, error
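# Hedged sketch of the scipy.optimize.curve_fit pattern used by minimize_scipy()
# above: per-point weights enter through `sigma` as standard deviations, here
# 1/sqrt(ivar), and parameter errors come from the diagonal of the covariance
# matrix. The exponential model and synthetic data below are made up for
# illustration; they are not from the source.
import numpy as np
import scipy.optimize

def model(x, a):
    return a * np.exp(-x)

x = np.linspace(0.0, 5.0, 50)
ivar = np.full_like(x, 25.0)  # inverse variance -> sigma = 0.2 per point
y = model(x, 1.3) + np.random.default_rng(0).normal(0.0, 0.2, x.size)

best, covar = scipy.optimize.curve_fit(model, x, y, p0=[1.0],
                                       sigma=np.sqrt(np.reciprocal(ivar)))
error = np.sqrt(np.diag(covar))
print('best-fit:', best, '+/-', error)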
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy(state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += (1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype']=dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype']=dtype # Create environments print (num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print ('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)]) if vid_: print ('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print ('env for gif') envs_gif = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape']=obs_shape model_dict['shape_dim0']=shape_dim0 # Create agent if algo == 'a2c': agent = a2c(envs, model_dict) print ('init a2c agent') elif algo == 'ppo': agent = ppo(envs, model_dict) print ('init ppo agent') elif algo == 'a2c_minibatch': agent = a2c_minibatch(envs, model_dict) print ('init a2c_minibatch agent') elif algo == 'a2c_list_rollout': agent = a2c_list_rollout(envs, model_dict) print ('init a2c_list_rollout agent') elif algo == 'a2c_with_var': agent = a2c_with_var(envs, model_dict) print ('init a2c_with_var agent') # elif algo == 'a2c_bin_mask': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, 
model_dict) # #Load model # if args.load_path != '': # # agent.actor_critic = torch.load(os.path.join(args.load_path)) # agent.actor_critic = torch.load(args.load_path).cuda() # print ('loaded ', args.load_path) # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros(num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes #Begin training # count =0 start = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]))#, volatile=True)) # print (action_log_probs.size()) # print (dist_entropy.size()) cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy, done) #Optimize agent agent.update() #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) total_num_steps = (j + 1) * num_processes * num_steps if total_num_steps % save_interval == 0 and save_dir != "": #Save model if save_params: do_params(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #Print updates if j % log_interval == 0: end = time.time() if j % (log_interval*30) == 0: #update plots try: make_plots(model_dict) print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated") except: # raise print("Upts, n_timesteps, min/med/mean/max, FPS, Time") print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}". format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start))#, agent.current_lr) try: make_plots(model_dict) except: print ()
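# Minimal sketch of the per-process episode bookkeeping in update_rewards()
# above: `episode_rewards` accumulates within an episode; when a process
# reports done, its running total is copied into `final_rewards` via the mask
# trick and the running total is reset. The step data below are assumptions.
import numpy as np
import torch

P = 3
episode_rewards = torch.zeros(P, 1)
final_rewards = torch.zeros(P, 1)

for reward, done in [([1., 0., 2.], [False, False, False]),
                     ([0., 1., 3.], [True, False, False])]:
    r = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
    episode_rewards += r
    masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])   # [P,1]
    final_rewards *= masks                          # clear slots that just finished
    final_rewards += (1 - masks) * episode_rewards  # record their episode return
    episode_rewards *= masks                        # restart finished episodes

print(final_rewards.squeeze(1).tolist())  # [1.0, 0.0, 0.0]: process 0 finished with return 1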
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--labels", type=str, help="Experiments to launch.", default="inc_dec_1", ) parser.add_argument( "--n_jobs", type=int, help="Number of jobs to launch in parallel.", default=1 ) parser.add_argument( "--n_discretization", type=int, help="Discretization of the time-horzion.", default=100, ) parser.add_argument( "--n_seeds", type=int, help="Number of random seeds to run per experiment.", default=30, ) parser.add_argument("--optimal", action="store_true") parser.add_argument("--randomize_bandit", action="store_true") parser.add_argument("--compute_regret", action="store_true") args = parser.parse_args() pickle_path = os.path.join(RESULTS_FOLDER, PICKLE_FOLDER) os.makedirs(pickle_path, exist_ok=True) labels = args.labels.split(",") print("Running experiments:", ", ".join(labels)) # Collect selected experiments jobs = [] for experiment_label in labels: experiment_bandits, solvers = EXPERIMENTS[experiment_label] if args.optimal: solvers = [OptimalSolver] for bandit_n, bandit in enumerate(experiment_bandits): if args.optimal: T_array = np.arange(1, bandit.Tmax + 1) else: T_array = np.linspace( 2 * bandit.n + 1, bandit.Tmax, args.n_discretization, dtype=np.int ) for solver in solvers: if bandit.stochastic or solver.stochastic: n_runs = args.n_seeds else: n_runs = 1 for _ in range(n_runs): for T in T_array: jobs.append( { "bandit": bandit, "solver": solver, "T": T, "randomize_bandit": args.randomize_bandit, "compute_regret": args.compute_regret, } ) # Run experiments start_time = time.time() if args.n_jobs == 1: results = [] for job in jobs: result = run_bandit_solver(job) results.append(result) else: with mp.get_context("spawn").Pool(args.n_jobs) as p: results = p.map(run_bandit_solver, jobs, chunksize=1) # Aggregate results results_aggregated = dict() for res in results: bandit = res["bandit"] solver = res["solver"] T = res["T"] policy = res["policy"] cumulative_reward = res["cumulative_reward"] single_peaked_bandits = res["single_peaked_bandits"] if (bandit, solver) not in results_aggregated: results_aggregated[(bandit, solver)] = dict() if T not in results_aggregated[(bandit, solver)]: results_aggregated[(bandit, solver)][T] = [] results_aggregated[(bandit, solver)][T].append( (policy, cumulative_reward, single_peaked_bandits) ) # Write results for (bandit, solver), results in results_aggregated.items(): T_list = [] policy_list = [[] for _ in range(args.n_seeds)] cumulative_reward_list = [[] for _ in range(args.n_seeds)] for T in sorted(results.keys()): T_list.append(T) for i in range(len(results[T])): policy, cumulative_reward, single_peaked_bandits = results[T][i] policy_list[i].append(tuple(policy)) if args.compute_regret: cumulative_reward_list[i].append(single_peaked_bandits) else: cumulative_reward_list[i].append(cumulative_reward) pickle_file = os.path.join(pickle_path, f"{bandit}_{solver}_result.p") with open(pickle_file, "wb") as f: pickle.dump( ( bandit, solver, tuple(T_list), tuple([tuple(x) for x in policy_list]), tuple([tuple(x) for x in cumulative_reward_list]), ), f, ) make_plots() print("Done in", time.time() - start_time)
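# Sketch of the aggregation pattern used in main() above: results are grouped
# first by (bandit, solver), then by horizon T, so repeated seeds append to the
# same bucket. The string keys and toy records below are illustrative
# assumptions, not values from the source.
results = [
    {"bandit": "b0", "solver": "greedy", "T": 10, "cumulative_reward": 4.2},
    {"bandit": "b0", "solver": "greedy", "T": 10, "cumulative_reward": 4.0},
    {"bandit": "b0", "solver": "greedy", "T": 20, "cumulative_reward": 9.1},
]

aggregated = {}
for res in results:
    key = (res["bandit"], res["solver"])
    aggregated.setdefault(key, {}).setdefault(res["T"], []).append(
        res["cumulative_reward"])

print(aggregated)  # {('b0', 'greedy'): {10: [4.2, 4.0], 20: [9.1]}}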
def main():
    now = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    model_name = 'pretrain_vgg16_' + now + '.h5'
    batch_size = 64
    num_epochs = 30
    lr = .0001
    num_train_samples = len(os.listdir('./data/train/cancer')) + len(os.listdir('./data/train/healthy'))
    num_valid_samples = len(os.listdir('./data/validation/cancer')) + len(os.listdir('./data/validation/healthy'))

    # Build our model
    input_tensor = Input(shape=(96, 96, 3))
    vgg = VGG16(include_top=False, input_shape=(96, 96, 3))
    x = vgg(input_tensor)
    z = layers.Flatten()(x)
    z = layers.Dropout(.5)(z)
    z = layers.Dense(256, activation='relu')(z)
    output_tensor = layers.Dense(1, activation='sigmoid')(z)

    # Freeze everything before block5_conv1: only the last conv block is fine-tuned
    vgg.trainable = True
    set_trainable = False
    for layer in vgg.layers:
        if layer.name == 'block5_conv1':
            set_trainable = True
        layer.trainable = set_trainable
    vgg.summary()

    model = Model(input_tensor, output_tensor)
    model.summary()

    # Get things ready to train: tweak learning rate, etc.
    model.compile(optimizer=Adam(lr), loss='binary_crossentropy', metrics=['acc'])
    train_generator = train_gen(batch_size)
    validation_generator = valid_gen(batch_size)
    # Ceil division so step counts are integers and the final partial batch is kept
    steps_per_epoch = (num_train_samples + batch_size - 1) // batch_size
    validation_steps = (num_valid_samples + batch_size - 1) // batch_size

    # Basic callbacks
    checkpoint = callbacks.ModelCheckpoint(filepath='./models/' + model_name, monitor='val_loss', save_best_only=True)
    early_stop = callbacks.EarlyStopping(monitor='val_acc', patience=10)
    csv_logger = callbacks.CSVLogger('./logs/' + model_name.split('.')[0] + '.csv')
    callback_list = [checkpoint, early_stop, csv_logger]

    # Training begins
    history = model.fit_generator(train_generator, steps_per_epoch=steps_per_epoch, epochs=num_epochs,
                                  verbose=1, callbacks=callback_list, validation_data=validation_generator,
                                  validation_steps=validation_steps)
    model.save('./models/' + model_name)
    make_plots(history, model_name)
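# Hedged sketch of the fine-tuning pattern used in main() above, assuming the
# standalone-Keras import path `from keras.applications import VGG16`: every
# layer up to (but not including) 'block5_conv1' is frozen, leaving only the
# last conv block trainable. The layer names match the stock VGG16 application.
from keras.applications import VGG16

vgg = VGG16(include_top=False, input_shape=(96, 96, 3))
set_trainable = False
for layer in vgg.layers:
    if layer.name == 'block5_conv1':
        set_trainable = True
    layer.trainable = set_trainable

print([l.name for l in vgg.layers if l.trainable])  # block5_conv1 ... block5_pool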
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy(state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += (1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda # print (current_state) # fdsf if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] ls_ = model_dict['ls_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype']=dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype']=dtype # Create environments print (num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print ('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)]) if vid_: print ('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print ('env for gif') envs_gif = make_env_basic(env_name) if ls_: print ('env for ls') envs_ls = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape']=obs_shape model_dict['shape_dim0']=shape_dim0 action_size = envs.action_space.n model_dict['action_size']=action_size # Create agent if algo == 'a2c': agent = a2c(model_dict) print ('init a2c agent') # elif algo == 'ppo': # agent = ppo(envs, model_dict) # print ('init ppo agent') # elif algo == 'a2c_minibatch': # agent = a2c_minibatch(envs, model_dict) # print ('init a2c_minibatch agent') # elif algo == 'a2c_list_rollout': # agent = a2c_list_rollout(envs, model_dict) # print ('init a2c_list_rollout agent') # elif algo == 'a2c_with_var': # agent = 
a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # elif algo == 'a2c_bin_mask': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, model_dict) #Load model if model_dict['load_params']: # agent.actor_critic = torch.load(os.path.join(args.load_path)) # agent.actor_critic = torch.load(args.load_path).cuda() # print ('loaded ', args.load_path) if model_dict['load_number'] == 3: load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict) elif model_dict['load_number'] == 6: load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict) elif model_dict['load_number'] == 9: load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict) # else: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict) else: raise RuntimeError('load_params is False, but this script expects to load pretrained parameters') # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros(num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes save_interval_num_updates = int(save_interval /num_processes/num_steps) #Begin training # count =0 start = time.time() start2 = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]/255.))#, volatile=True)) # print (action_log_probs.size()) # print (dist_entropy.size()) cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # if np.sum(reward) > 0.: # print (reward) # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done) #Optimize agent agent.update() #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) # print ('save_interval_num_updates', save_interval_num_updates) # print ('num_updates', num_updates) # print ('j', j) total_num_steps = (j + 1) * num_processes * num_steps # if total_num_steps % save_interval == 0 and save_dir != "": if j % save_interval_num_updates == 0 and save_dir != "" and j != 0: #Save model if save_params: # do_params(save_dir, agent, total_num_steps, model_dict) # save_params_v2(save_dir, agent, total_num_steps, model_dict) 
save_params_v3(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #Print updates if j % log_interval == 0:# and j!=0: end = time.time() to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start, end - start2) print(to_print_info_string) start2 = time.time() to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time" if j % (log_interval*30) == 0: if ls_: do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards) # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated") # print(to_print_info_string + ' LS recorded')#, agent.current_lr) # else: #update plots try: if ls_: update_ls_plot(model_dict) make_plots(model_dict) print(to_print_legend_string + " Plot updated") except: raise #pass print(to_print_legend_string) try: make_plots(model_dict) except: print ()
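# Minimal sketch of the checkpoint round-trip assumed by the load_params_v2 /
# save_params_v3 calls above: parameters are saved as a state_dict and restored
# into a freshly built network of the same architecture. The tiny model and the
# /tmp path are illustrative assumptions.
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
torch.save(net.state_dict(), '/tmp/model_params.pt')

restored = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
restored.load_state_dict(torch.load('/tmp/model_params.pt'))
restored.eval()  # eval mode if the restored net is only used for acting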
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy( state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += ( 1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] ls_ = model_dict['ls_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype'] = dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype'] = dtype # Create environments print(num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([ make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes) ]) if vid_: print('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print('env for gif') envs_gif = make_env_basic(env_name) if ls_: print('env for ls') envs_ls = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:] ) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape'] = obs_shape model_dict['shape_dim0'] = shape_dim0 # Create agent if algo == 'a2c': agent = a2c(envs, model_dict) print('init a2c agent') # elif algo == 'ppo': # agent = ppo(envs, model_dict) # print ('init ppo agent') # elif algo == 'a2c_minibatch': # agent = a2c_minibatch(envs, model_dict) # print ('init a2c_minibatch agent') # elif algo == 'a2c_list_rollout': # agent = a2c_list_rollout(envs, model_dict) # print ('init a2c_list_rollout agent') # elif algo == 'a2c_with_var': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # elif algo == 
'a2c_bin_mask': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, model_dict) # #Load model # if model_dict['load_params']: # # agent.actor_critic = torch.load(os.path.join(args.load_path)) # # agent.actor_critic = torch.load(args.load_path).cuda() # # print ('loaded ', args.load_path) # if model_dict['load_number'] == 3: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict) # elif model_dict['load_number'] == 6: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict) # elif model_dict['load_number'] == 9: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict) # # else: # # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict) # else: # PROBLEM #load model # if model_dict['load_params']: # load_params(thigns) # param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt' param_file = home + '/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/seed1/model_params3/model_params9999360.pt' # pretrained_dict = torch.load(param_file) # object # print (pretrained_dict) # agent_dict = agent.actor_critic.state_dict() #dict # print (agent_dict.keys()) # agent_dict.update(pretrained_dict) # # agent_dict.update(agent.actor_critic) # agent.actor_critic.load_state_dict(agent_dict) param_dict = torch.load(param_file) agent.actor_critic.load_state_dict(param_dict) # agent.actor_critic = torch.load(param_file) agent.actor_critic.cuda() print('loaded', param_file) # afdsa # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros( num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state( current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state( current_state ) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros( [num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes save_interval_num_updates = int(save_interval / num_processes / num_steps) # list of lists, where lists are trajectories. 
trajectories have actinos and states dataset = [] tmp_trajs = [[] for x in range(num_processes)] dataset_count = 0 done = [0] * num_processes #Begin training # count =0 start = time.time() start2 = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action, action_log_probs, dist_entropy = agent.act( Variable(agent.rollouts.states[step])) #, volatile=True)) # print (action_log_probs.size()) # print (dist_entropy.size()) cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # y = torch.LongTensor(batch_size,1).random_() % nb_digits # # One hot encoding buffer that you create out of the loop and just keep reusing # y_onehot = torch.FloatTensor(batch_size, nb_digits) # # In your for loop # y_onehot.zero_() # y_onehot.scatter_(1, y, 1) states_ = agent.rollouts.states[step].cpu().numpy() #[P,S,84,84] # print (state_t.shape) actions_ = action.data.cpu().numpy() #[P,1] # print (action) # fdsaf #store step for proc in range(num_processes): #add states state_t = states_[proc] action_t = actions_[proc] tmp_trajs[proc].append([action_t, state_t]) if done[proc]: dataset.append(tmp_trajs[proc]) dataset_count += len(tmp_trajs[proc]) tmp_trajs[proc] = [] for ii in range(len(dataset)): print(len(dataset[ii])) if dataset_count > 10000: # pickle.dump( dataset, open(home+'/Documents/tmp/breakout_2frames/breakout_trajectories_10000.pkl', "wb" ) ) pickle.dump( dataset, open( home + '/Documents/tmp/RoadRunner/trajectories_10000.pkl', "wb")) print('saved') # pickle.save(dataset) STOP # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards( reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done) # print (len(dataset)) # print () #Optimize agent # agent.update() #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) # print ('save_interval_num_updates', save_interval_num_updates) # print ('num_updates', num_updates) # print ('j', j) total_num_steps = (j + 1) * num_processes * num_steps # if total_num_steps % save_interval == 0 and save_dir != "": if j % save_interval_num_updates == 0 and save_dir != "" and j != 0: #Save model if save_params: do_params(save_dir, agent, total_num_steps, model_dict) # save_params_v2(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #Print updates if j % log_interval == 0: # and j!=0: end = time.time() to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format( j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start, end - start2) print(to_print_info_string) start2 = time.time() to_print_legend_string = "Upts, n_timesteps, 
min/med/mean/max, FPS, Time" if j % (log_interval * 30) == 0: if ls_: do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards) # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated") # print(to_print_info_string + ' LS recorded')#, agent.current_lr) # else: #update plots try: if ls_: update_ls_plot(model_dict) make_plots(model_dict) print(to_print_legend_string + " Plot updated") except: raise #pass print(to_print_legend_string) try: make_plots(model_dict) except: print()
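# Sketch of the trajectory-collection logic in the train() above: per-process
# step tuples are buffered until that process finishes an episode, the finished
# trajectory then moves to the dataset, and once enough steps accumulate the
# whole dataset is pickled. The buffer sizes, step data, and output path are
# assumptions for illustration.
import pickle

num_processes = 2
dataset, dataset_count = [], 0
tmp_trajs = [[] for _ in range(num_processes)]

def collect(step_data, done):
    global dataset_count
    for proc in range(num_processes):
        tmp_trajs[proc].append(step_data[proc])  # (action, state) pairs
        if done[proc]:
            dataset.append(tmp_trajs[proc])      # episode complete
            dataset_count += len(tmp_trajs[proc])
            tmp_trajs[proc] = []

collect([('a0', 's0'), ('a1', 's1')], [False, True])
collect([('a2', 's2'), ('a3', 's3')], [True, False])
if dataset_count >= 3:
    with open('/tmp/trajectories.pkl', 'wb') as f:
        pickle.dump(dataset, f)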
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy(state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += (1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] ls_ = model_dict['ls_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype']=dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype']=dtype # Create environments print (num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print ('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)]) if vid_: print ('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print ('env for gif') envs_gif = make_env_basic(env_name) if ls_: print ('env for ls') envs_ls = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape']=obs_shape model_dict['shape_dim0']=shape_dim0 # # Create agent # if algo == 'a2c': # agent = a2c(envs, model_dict) # print ('init a2c agent') # elif algo == 'ppo': # agent = ppo(envs, model_dict) # print ('init ppo agent') # elif algo == 'a2c_minibatch': # agent = a2c_minibatch(envs, model_dict) # print ('init a2c_minibatch agent') # elif algo == 'a2c_list_rollout': # agent = a2c_list_rollout(envs, model_dict) # print ('init a2c_list_rollout agent') # elif algo == 'a2c_with_var': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # elif algo == 
'a2c_bin_mask': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, model_dict) # #Load model # if model_dict['load_params']: # # agent.actor_critic = torch.load(os.path.join(args.load_path)) # # agent.actor_critic = torch.load(args.load_path).cuda() # # print ('loaded ', args.load_path) # if model_dict['load_number'] == 3: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict) # elif model_dict['load_number'] == 6: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict) # elif model_dict['load_number'] == 9: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict) # # else: # # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict) # else: # PROBLEM print ('Init expert agent') expert_agent = a2c(envs, model_dict) param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt' param_dict = torch.load(param_file) expert_agent.actor_critic.load_state_dict(param_dict) print ('loaded params', param_file) expert_agent.actor_critic.cuda() print ('Init imitator agent') imitator_agent = a2c(envs, model_dict) # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params.ckpt' # param_dict = torch.load(param_file) # imitator_agent.actor_critic.load_state_dict(param_dict) # print ('loaded params', param_file) imitator_agent.actor_critic.cuda() agent = expert_agent expert_policy = expert_agent.actor_critic imitator_policy = imitator_agent.actor_critic optimizer = optim.Adam(imitator_policy.parameters(), lr=.0005, weight_decay=.00001) total_steps = 0 display_step = 50 # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros(num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes save_interval_num_updates = int(save_interval /num_processes/num_steps) #Begin training # count =0 start = time.time() start2 = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) state__ = Variable(agent.rollouts.states[step]) / 255. 
value, action, action_log_probs, dist_entropy = agent.act(state__) #, requires_grad=False)#, volatile=True)) # print (action_log_probs.size()) # print (dist_entropy.size()) batch = state__ optimizer.zero_grad() log_dist_expert = expert_policy.action_logdist(batch) log_dist_imitator = imitator_policy.action_logdist(batch) action_dist_kl = torch.sum((log_dist_expert - log_dist_imitator)*torch.exp(log_dist_expert), dim=1) #[B] # elbo, logpx, logpz, logqz, action_dist_kl = self.forward(batch, policy, k=k) loss = torch.mean(action_dist_kl) loss.backward() # nn.utils.clip_grad_norm(self.parameters(), .5) optimizer.step() # if total_steps%display_step==0: # and batch_idx == 0: # # print ('Train Epoch: {}/{}'.format(epoch+1, epochs), # # 'total_epochs {}'.format(total_epochs), # print('LL:{:.4f}'.format(loss.data[0]) # # 'logpx:{:.4f}'.format(logpx.data[0]), # # 'logpz:{:.5f}'.format(logpz.data[0]), # # 'logqz:{:.5f}'.format(logqz.data[0]), # # 'action_kl:{:.4f}'.format(action_dist_kl.data[0]) # ) # total_steps+=1 cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done) #Optimize agent agent.no_update() #agent.update(j,num_updates) # agent.update() #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) # print ('save_interval_num_updates', save_interval_num_updates) # print ('num_updates', num_updates) # print ('j', j) total_num_steps = (j + 1) * num_processes * num_steps # if total_num_steps % save_interval == 0 and save_dir != "": if j % save_interval_num_updates == 0 and save_dir != "" and j != 0: save_to = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params_env.ckpt' torch.save(imitator_policy.state_dict(), save_to) print ('saved imitator_policy', save_to) # #Save model # if save_params: # do_params(save_dir, agent, total_num_steps, model_dict) # # save_params_v2(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #Print updates if j % log_interval == 0:# and j!=0: end = time.time() to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}, {:.4f}".format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start, end - start2, loss.data[0]) # to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps, # final_rewards.min(), # final_rewards.median(), # final_rewards.mean(), # final_rewards.max(), # int(total_num_steps / (end - start)), # end - start, # end - start2) print(to_print_info_string) start2 = time.time() to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time" if j % (log_interval*30) == 0: if ls_: 
do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards) # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated") # print(to_print_info_string + ' LS recorded')#, agent.current_lr) # else: #update plots try: if ls_: update_ls_plot(model_dict) make_plots(model_dict) print(to_print_legend_string + " Plot updated") except: raise #pass print(to_print_legend_string) try: make_plots(model_dict) except: print ()
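# Hedged sketch of the policy-distillation objective in the train() above: the
# imitator minimizes KL(expert || imitator) over action distributions,
# KL = sum_a p_e(a) * (log p_e(a) - log p_i(a)). The two log-softmax heads
# below stand in for expert_policy.action_logdist / imitator_policy.action_logdist,
# whose definitions are not shown in the source.
import torch
import torch.nn.functional as F

batch, n_actions = 5, 4
expert_logits = torch.randn(batch, n_actions)                       # frozen expert
imitator_logits = torch.randn(batch, n_actions, requires_grad=True)

log_p_e = F.log_softmax(expert_logits, dim=1)
log_p_i = F.log_softmax(imitator_logits, dim=1)
kl = torch.sum((log_p_e - log_p_i) * torch.exp(log_p_e), dim=1)  # [batch]
loss = torch.mean(kl)
loss.backward()  # gradients flow only into the imitator's parameters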
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy( state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one # if see_frames: # #Grayscale # save_frame(state, count) # count+=1 # if done[0]: # ffsdfa # #RGB # state = envs.render() # print(state.shape) # fdsafa return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += ( 1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state def do_vid(): n_vids = 3 for i in range(n_vids): done = False state = envs_video.reset() # state = torch.from_numpy(state).float().type(dtype) current_state = torch.zeros(1, *obs_shape) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) # print ('Recording') # count=0 while not done: # print (count) # count +=1 # Act state_var = Variable(current_state, volatile=True) # print (state_var.size()) action, value = agent.act(state_var) cpu_actions = action.data.squeeze(1).cpu().numpy() # Observe reward and next state state, reward, done, info = envs_video.step( cpu_actions) # state:[nProcesss, ndims, height, width] # state = torch.from_numpy(state).float().type(dtype) # current_state = torch.zeros(1, *obs_shape) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) state = envs_video.reset() vid_path = save_dir + '/videos/' count = 0 for aaa in os.listdir(vid_path): if 'openaigym' in aaa and '.mp4' in aaa: #os.rename(vid_path+aaa, vid_path+'vid_t'+str(total_num_steps)+'.mp4') subprocess.call("(cd " + vid_path + " && mv " + vid_path + aaa + " " + vid_path + env_name + '_' + algo + '_vid_t' + str(total_num_steps) + '_' + str(count) + ".mp4)", shell=True) count += 1 if '.json' in aaa: os.remove(vid_path + aaa) def save_frame(state, count): frame_path = save_dir + '/frames/' if not os.path.exists(frame_path): os.makedirs(frame_path) print('Made dir', frame_path) state1 = np.squeeze(state[0]) # print (state1.shape) fig = plt.figure(figsize=(4, 4), facecolor='white') plt.imshow(state1, cmap='gray') plt.savefig(frame_path + 'frame' + str(count) + '.png') print('saved', frame_path + 'frame' + str(count) + '.png') plt.close(fig) num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = 
model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor else: torch.manual_seed(seed) dtype = torch.FloatTensor # Create environments print(num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([ make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes) ]) vid_ = 1 see_frames = 0 if vid_: print('env for video') envs_video = make_env_monitor(env_name, save_dir) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:] ) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape'] = obs_shape # Create agent if algo == 'a2c': agent = a2c(envs, model_dict) print('init a2c agent') elif algo == 'ppo': agent = ppo(envs, model_dict) print('init ppo agent') elif algo == 'a2c_minibatch': agent = a2c_minibatch(envs, model_dict) print('init a2c_minibatch agent') # agent = model_dict['agent'](envs, model_dict) # #Load model # if args.load_path != '': # # agent.actor_critic = torch.load(os.path.join(args.load_path)) # agent.actor_critic = torch.load(args.load_path).cuda() # print ('loaded ', args.load_path) # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros( num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state( current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state( current_state ) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. 
episode_rewards = torch.zeros( [num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes #Begin training # count =0 start = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P,1] action, value = agent.act( Variable(agent.rollouts.states[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # Record rewards reward, masks, final_rewards, episode_rewards, current_state = update_rewards( reward, done, final_rewards, episode_rewards, current_state) # Update state current_state = update_current_state(current_state, state, shape_dim0) # Agent record step agent.insert_data(step, current_state, action.data, value.data, reward, masks) #Optimize agent agent.update() #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) total_num_steps = (j + 1) * num_processes * num_steps #Save model if total_num_steps % save_interval == 0 and save_dir != "": save_path = os.path.join(save_dir, 'model_params') try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = agent.actor_critic if cuda: save_model = copy.deepcopy(agent.actor_critic).cpu() # torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) # steps_sci_nota = '{e}'.format(total_num_steps) save_to = os.path.join( save_path, "model_params" + str(total_num_steps) + ".pt") # save_to=os.path.join(save_path, "model_params" + steps_sci_nota+".pt") torch.save(save_model, save_to) print('saved', save_to) #make video if vid_: do_vid() #Print updates if j % log_interval == 0: end = time.time() if j % (log_interval * 30) == 0: #update plots try: make_plots(model_dict) print( "Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated" ) except: raise print("Upts, n_timesteps, min/med/mean/max, FPS, Time") print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".format( j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start)) #, agent.current_lr) try: make_plots(model_dict) except: print()
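# Sketch of the "save a CUDA model from CPU" workaround used above: deep-copy
# the network, move the copy to CPU, and save that, so loading never requires a
# GPU. Saving the state_dict (shown second) is the more portable alternative.
# The tiny model and /tmp paths are illustrative assumptions.
import copy
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
if torch.cuda.is_available():
    model = model.cuda()

save_model = copy.deepcopy(model).cpu() if next(model.parameters()).is_cuda else model
torch.save(save_model, '/tmp/model_full.pt')        # whole-module pickle, as above
torch.save(model.state_dict(), '/tmp/model_sd.pt')  # portable state_dict alternative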
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # Slide the stack forward so the newest frame can take the last slot
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # the last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # tracks the current episode's cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # record the finished episodes' returns
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by zeroing the state
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']
    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)
    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)
    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)
    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)
    if grad_var_:
        print('env for grad_var_')
        envs_grad_var = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    action_space = envs.action_space
    model_dict['action_space'] = action_space
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    next_state_pred_ = 0
    model_dict['next_state_pred_'] = next_state_pred_

    n_contexts = 2
    model_dict['n_contexts'] = n_contexts

    # Create agent and discriminator
    agent = a2c(model_dict)
    print('init a2c agent')
    discriminator = CNN_Discriminator(num_steps, n_contexts, model_dict).cuda()
    print('init discriminator')

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest, since it's a stack
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape); set the first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # tracks the current episode's cumulative reward
    final_rewards = torch.zeros([num_processes, 1])  # when an episode completes, its return is stored here

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    context_probs = torch.ones(n_contexts) / n_contexts

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        # Sample one context per process for this rollout and one-hot encode it
        context_np = np.random.choice(n_contexts, num_processes)
        context = torch.from_numpy(context_np).view(num_processes, 1)
        context_onehot = torch.FloatTensor(num_processes, n_contexts).zero_()
        context_onehot.scatter_(1, context, 1)  # [P,C]

        list_frames = []
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            state_pytorch = Variable(agent.rollouts.states[step])
            value, action, action_log_probs, dist_entropy = agent.act(state_pytorch, context_onehot)

            # ACTIONS: ['NOOP', 'FIRE', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'UPRIGHT', 'UPLEFT',
            #  'DOWNRIGHT', 'DOWNLEFT', 'UPFIRE', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE',
            #  'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']
            # (disabled: debug blocks that overrode the sampled actions per context)

            # Apply to environment, S:[P,C,H,W], R:[P], D:[P]
            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]
            state, reward, done, info = envs.step(cpu_actions)
            list_frames.append(torch.FloatTensor(state))  # state: [P,1,84,84]

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)
            agent.insert_data(step, current_state, action.data, value, reward, masks,
                              action_log_probs, dist_entropy, 0)

        # Optimize discriminator: predict the context from the rollout's frames
        if j % 2 == 0:
            discriminator_error = discriminator.update(list_frames, context)  # [P]
            if j == 0:
                print('multiple updates')
                for jj in range(20):
                    discriminator_error = discriminator.update(list_frames, context)

        grad_sum = agent.actor_critic.graddd(state_pytorch, context_onehot)

        # Optimize agent
        agent.update2(context_onehot, discriminator_error, grad_sum)
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)
            # Make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.3f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)),
                end - start,
                end - start2,
                torch.mean(discriminator_error).data.cpu().numpy()[0])
            print(to_print_info_string)
            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, D_error"
            start2 = time.time()

            if j % (log_interval * 30) == 0:
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                if grad_var_ and j % (log_interval * 30) == 0:
                    # writes to file
                    do_grad_var(envs_grad_var, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                # (disabled: a long block that plotted one rollout's n-step frames per
                #  context next to the discriminator's softmax context predictions)

                # Update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    if grad_var_ and j % (log_interval * 30) == 0:
                        update_grad_plot(model_dict)
                        to_print_legend_string += ' grad_var_plot updated '
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string + " problem with plot")
                    raise

    try:
        make_plots(model_dict)
    except:
        print()
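# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): a minimal stand-in
# for the repo's CNN_Discriminator, showing the signal the loop above relies
# on. Given the num_steps frames a process produced under its sampled
# context, predict the context and hand the per-process cross-entropy back to
# the policy as a diversity signal. Layer sizes and shapes are guesses; only
# torch is assumed.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyContextDiscriminator(nn.Module):
    def __init__(self, num_steps, n_contexts):
        super().__init__()
        # Treat the num_steps frames as the channels of one 84x84 "image"
        self.conv = nn.Conv2d(num_steps, 8, kernel_size=8, stride=4)
        self.fc = nn.Linear(8 * 20 * 20, n_contexts)
        self.opt = torch.optim.Adam(self.parameters(), lr=1e-4)

    def update(self, list_frames, context):
        # list_frames: num_steps tensors of [P,1,84,84]; context: [P,1] int64
        x = torch.stack(list_frames).transpose(0, 1).squeeze(2)     # [P,N,84,84]
        logits = self.fc(F.relu(self.conv(x)).view(x.size(0), -1))  # [P,C]
        per_proc_error = F.cross_entropy(logits, context.view(-1),
                                         reduction='none')  # [P]
        self.opt.zero_grad()
        per_proc_error.mean().backward()
        self.opt.step()
        return per_proc_error.detach()

# Example: 5-step rollouts from 4 processes of 84x84 single-channel frames.
disc = TinyContextDiscriminator(num_steps=5, n_contexts=2)
frames = [torch.randn(4, 1, 84, 84) for _ in range(5)]
ctx = torch.randint(0, 2, (4, 1))
print(disc.update(frames, ctx))  # per-process error, as consumed by agent.update2
# ---------------------------------------------------------------------------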
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # Slide the stack forward so the newest frame can take the last slot
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # the last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # tracks the current episode's cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # record the finished episodes' returns
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by zeroing the state
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']
    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)
    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)
    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    # (disabled: alternative agent constructors -- ppo, a2c_minibatch,
    #  a2c_list_rollout, a2c_with_var, a2c_bin_mask -- and load_params_v2
    #  checkpoint loading for load_number 3/6/9)

    # PROBLEM
    print('Init expert agent')
    expert_agent = a2c(envs, model_dict)
    param_file = home + '/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    expert_agent.actor_critic.load_state_dict(param_dict)
    print('loaded params', param_file)
    expert_agent.actor_critic.cuda()

    print('Init imitator agent')
    imitator_agent = a2c(envs, model_dict)
    # param_file = home + '/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params.ckpt'
    # param_dict = torch.load(param_file)
    # imitator_agent.actor_critic.load_state_dict(param_dict)
    # print ('loaded params', param_file)
    imitator_agent.actor_critic.cuda()

    agent = expert_agent
    expert_policy = expert_agent.actor_critic
    imitator_policy = imitator_agent.actor_critic

    optimizer = optim.Adam(imitator_policy.parameters(), lr=.0005, weight_decay=.00001)

    total_steps = 0
    display_step = 50

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape); set the first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # tracks the current episode's cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            state__ = Variable(agent.rollouts.states[step]) / 255.
            value, action, action_log_probs, dist_entropy = agent.act(state__)

            # Distill the expert's action distribution into the imitator
            batch = state__
            optimizer.zero_grad()
            log_dist_expert = expert_policy.action_logdist(batch)
            log_dist_imitator = imitator_policy.action_logdist(batch)
            action_dist_kl = torch.sum((log_dist_expert - log_dist_imitator)
                                       * torch.exp(log_dist_expert), dim=1)  # [B]
            loss = torch.mean(action_dist_kl)
            loss.backward()
            # nn.utils.clip_grad_norm(imitator_policy.parameters(), .5)
            optimizer.step()

            # (disabled: periodic print of the distillation loss every display_step steps)

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent records the step
            agent.insert_data(step, current_state, action.data, value, reward, masks,
                              action_log_probs, dist_entropy)

        # The expert is frozen: gather rollouts but do not update it
        agent.no_update()
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            save_to = home + '/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params_env.ckpt'
            torch.save(imitator_policy.state_dict(), save_to)
            print('saved imitator_policy', save_to)

            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}, {:.4f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)),
                end - start,
                end - start2,
                loss.data[0])
            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                # Update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string)
                    raise

    try:
        make_plots(model_dict)
    except:
        print()
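# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the distillation
# objective above is KL(expert || imitator), computed from each policy's log
# action distribution. Only torch is assumed; action_dist_kl here is a
# hypothetical standalone version of the expression in the loop, where the
# repo obtains its inputs from actor_critic.action_logdist(batch).
import torch

def action_dist_kl(log_dist_expert, log_dist_imitator):
    # Inputs: [B,A] log-probabilities over A actions.
    # KL(p||q) = sum_a p(a) * (log p(a) - log q(a)), computed per row -> [B]
    return torch.sum((log_dist_expert - log_dist_imitator)
                     * torch.exp(log_dist_expert), dim=1)

expert = torch.log_softmax(torch.randn(3, 6), dim=1)    # frozen teacher
imitator = torch.log_softmax(torch.randn(3, 6), dim=1)  # student being trained
loss = action_dist_kl(expert, imitator).mean()          # what the optimizer minimizes
print(float(loss))  # non-negative; zero iff the two distributions match
# ---------------------------------------------------------------------------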
def plot_fits_postfacto(filename, paramfilename, galaxyname, slitmaskname,
                        startstar=0, globular=False, lines='new', mn_cluster=None):
    """Plot fits, residuals, and ivar for stars whose [Mn/H] abundances have already been measured.

    Inputs:
    filename      -- file with observed spectra
    paramfilename -- file with parameters of observed spectra
    galaxyname    -- galaxy name, options: 'scl'
    slitmaskname  -- slitmask name, options: 'scl1'

    Keywords:
    startstar  -- if 0 (default), start at the beginning of the file and write a new datafile;
                  else, start at #startstar and append to the existing datafile
    globular   -- if False (default), use the output path of the galaxy;
                  else, use the globular cluster path
    lines      -- if 'new' (default), use the new revised linelist;
                  else, use the original linelist from Judy's code
    mn_cluster -- if not None (default), also plot a spectrum with [Mn/H] = mean [Mn/H] of the cluster
    """

    # Input filename
    if globular:
        file = '/raid/madlr/glob/'+galaxyname+'/'+slitmaskname+'.csv'
    else:
        file = '/raid/madlr/dsph/'+galaxyname+'/'+slitmaskname+'.csv'

    # Output filepath
    if globular:
        outputname = '/raid/madlr/glob/'+galaxyname+'/'+slitmaskname
    else:
        outputname = '/raid/madlr/dsph/'+galaxyname+'/'+slitmaskname

    name = np.genfromtxt(file, delimiter='\t', skip_header=1, usecols=0, dtype='str')
    mn = np.genfromtxt(file, delimiter='\t', skip_header=1, usecols=8)
    mnerr = np.genfromtxt(file, delimiter='\t', skip_header=1, usecols=9)

    # Get number of stars in file with observed spectra
    Nstars = open_obs_file(filename)

    # Open file to store reduced chi-sq values
    chisqfile = outputname+'_chisq.txt'
    with open(chisqfile, 'w+') as f:
        f.write('Star'+'\t'+'Line'+'\t'+'redChiSq (best[Mn/H])'+'\t'+'redChiSq (best[Mn/H]+0.15)'+'\t'+'redChiSq (best[Mn/H]-0.15)'+'\n')

    # Plot spectra for each star
    for i in range(startstar, Nstars):
        try:
            # Check if parameters are measured
            temp, logg, fe, alpha, fe_err = open_obs_file(filename, retrievespec=i, specparams=True)
            if np.isclose(1.5, logg) and np.isclose(fe, -1.5) and np.isclose(fe_err, 0.0):
                print('Bad parameter measurement! Skipped #'+str(i+1)+'/'+str(Nstars)+' stars')
                continue

            # Open star
            star = chi_sq.obsSpectrum(filename, paramfilename, i, False, galaxyname,
                                      slitmaskname, globular, lines, plot=True)

            # Check if star has already had [Mn/H] measured
            if star.specname in name:

                # If so, open data file for star
                if globular:
                    datafile = '/raid/madlr/glob/'+galaxyname+'/'+slitmaskname+'/'+str(star.specname)+'_data.csv'
                else:
                    datafile = '/raid/madlr/dsph/'+galaxyname+'/'+slitmaskname+'/'+str(star.specname)+'_data.csv'

                # Get observed and synthetic spectra and inverse variance array
                obswvl = np.genfromtxt(datafile, delimiter=',', skip_header=2, usecols=0)
                obsflux = np.genfromtxt(datafile, delimiter=',', skip_header=2, usecols=1)
                synthflux = np.genfromtxt(datafile, delimiter=',', skip_header=2, usecols=2)
                # Columns 3 and 4 hold saved synthfluxup/synthfluxdown; they are recomputed below instead
                ivar = np.genfromtxt(datafile, delimiter=',', skip_header=2, usecols=5)

                idx = np.where(name == star.specname)

                # Synthetic spectra at best [Mn/H] +/- 0.15 dex, and with no Mn
                synthfluxup = star.synthetic(obswvl, mn[idx] + 0.15, full=True)
                synthfluxdown = star.synthetic(obswvl, mn[idx] - 0.15, full=True)
                synthflux_nomn = star.synthetic(obswvl, -10.0, full=True)

                if mn_cluster is not None:
                    synthflux_cluster = [mn_cluster, star.synthetic(obswvl, mn_cluster, full=True)]
                else:
                    synthflux_cluster = None

                if mnerr[idx][0] < 1:
                    # Run code to make plots
                    make_plots(lines, star.specname+'_', obswvl, obsflux, synthflux, outputname,
                               ivar=ivar, resids=True, synthfluxup=synthfluxup,
                               synthfluxdown=synthfluxdown, synthflux_nomn=synthflux_nomn,
                               synthflux_cluster=synthflux_cluster, title=None, savechisq=chisqfile)

                    # Write all plotting data to a file
                    hdr = 'Star '+str(star.specname)+'\n'+'obswvl\tobsflux\tsynthflux\tsynthfluxup\tsynthfluxdown\tsynthflux_nomn\n'
                    np.savetxt(outputname+'/'+str(star.specname)+'_finaldata.csv',
                               np.asarray((obswvl, obsflux, synthflux, synthfluxup,
                                           synthfluxdown, synthflux_nomn)).T, header=hdr)

        except Exception as e:
            print(repr(e))
            print('Skipped star #'+str(i+1)+'/'+str(Nstars)+' stars')
            continue

        print('Finished star '+star.specname, '#'+str(i+1)+'/'+str(Nstars)+' stars')

    return
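# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the per-column
# np.genfromtxt calls above re-read the same CSV once per column. If the
# column layout is as described (wavelength, observed flux, synthetic flux,
# then ivar in column 5), a single read with unpack=True is equivalent.
# read_star_datafile is a hypothetical helper, not a function in this repo.
import numpy as np

def read_star_datafile(datafile):
    # One pass over the file; usecols selects the same four columns as above
    obswvl, obsflux, synthflux, ivar = np.genfromtxt(
        datafile, delimiter=',', skip_header=2, usecols=(0, 1, 2, 5), unpack=True)
    return obswvl, obsflux, synthflux, ivar
# ---------------------------------------------------------------------------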
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # Slide the stack forward so the newest frame can take the last slot
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # the last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # tracks the current episode's cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # record the finished episodes' returns
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by zeroing the state
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']
    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir)
                          for i in range(num_processes)])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)
    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    # These viewer environments are disabled in this variant; re-enable them
    # before using do_ls / do_prob_state / do_grad_var below.
    # if ls_: envs_ls = make_env_basic(env_name)
    # if vae_: envs_vae = make_env_basic(env_name)
    # if grad_var_: envs_grad_var = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0
    model_dict['action_size'] = envs.action_space.n
    print(envs.action_space.n, 'actions')

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'dqn':
        agent = DQN(envs, model_dict)
        print('init DQN agent')
        print(agent.q_net)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest, since it's a stack
    # agent.insert_first_state(current_state)  # not needed: DQN stores transitions in the replay buffer

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # tracks the current episode's cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # Epsilon-greedy schedule: decay exponentially from epsilon_start toward
    # epsilon_final (lower epsilon means fewer random actions)
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 50000
    epsilon_by_frame = lambda frame_idx: epsilon_final + \
        (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        dqn_epsilon = epsilon_by_frame(j)

        # The inner num_steps loop is disabled: one environment step per update.
        # Act, [P,1]
        state_pytorch = Variable(current_state)
        action = agent.act(state_pytorch, epsilon=dqn_epsilon)

        # Apply to environment, S:[P,C,H,W], R:[P], D:[P]
        frame, reward, done, info = envs.step(action)

        # Record rewards and update state
        reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
            reward, done, final_rewards, episode_rewards, current_state)
        new_current_state = update_current_state(current_state, frame, shape_dim0)

        agent.replay_buffer.push(current_state, action, reward, new_current_state, done.astype(int))
        current_state = new_current_state

        if len(agent.replay_buffer) > 100:
            agent.update()

        # Note: counts num_steps frames per update, as in the on-policy variants above
        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)
            # Make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.2f}, {:.5f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)),
                end - start,
                end - start2,
                dqn_epsilon,
                agent.loss.data.cpu().numpy()[0])
            print(to_print_info_string)

            # (disabled: extra prints for the vae elbo / next-state prediction error)
            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, epsilon, loss"
            start2 = time.time()

            if j % (log_interval * 30) == 0:
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                if grad_var_ and j % (log_interval * 30) == 0:
                    # writes to file
                    do_grad_var(envs_grad_var, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                # Update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    if grad_var_ and j % (log_interval * 30) == 0:
                        update_grad_plot(model_dict)
                        to_print_legend_string += ' grad_var_plot updated '
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string + " problem with plot")
                    raise

    try:
        make_plots(model_dict)
    except:
        print()
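# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the exploration
# schedule above decays epsilon exponentially from epsilon_start toward
# epsilon_final with time constant epsilon_decay. A quick self-contained
# check of the values it produces (only the standard library is assumed):
import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 50000
epsilon_by_frame = lambda t: epsilon_final + \
    (epsilon_start - epsilon_final) * math.exp(-1. * t / epsilon_decay)

for t in [0, 10000, 50000, 200000]:
    print(t, round(epsilon_by_frame(t), 3))
# -> 1.0, 0.821, 0.374, 0.028: mostly random at first, nearly greedy by 200k
# ---------------------------------------------------------------------------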
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # Slide the stack forward so the newest frame can take the last slot
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # the last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # tracks the current episode's cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # record the finished episodes' returns
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by zeroing the state
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']
    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    explore_ = model_dict['explore_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)
    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)
    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)
    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    next_state_pred_ = 0
    model_dict['next_state_pred_'] = next_state_pred_

    # (disabled: alternative agent constructors, checkpoint loading via
    #  args.load_path, and a Montezuma's Revenge reward-episode viewer)

    print('init exploit a2c agent')
    agent_exploit = a2c(envs, model_dict)
    if explore_:
        print('init explore a2c agent')
        agent_explore = a2c(envs, model_dict)
    print('init vae')
    vae = VAE()
    vae.cuda()

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest
    agent_exploit.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape); set the first step
    if explore_:
        agent_explore.insert_first_state(current_state)

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # tracks the current episode's cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # Running statistics for normalizing the explore agent's intrinsic reward
    B = .99
    m = torch.FloatTensor([-100.]).cuda()   # running mean of the elbo
    v = torch.FloatTensor([10000.]).cuda()  # running second moment, used as a scale

    if model_dict['init_exploit_processes'] == -1:
        init_exploit_processes = num_processes
    else:
        init_exploit_processes = model_dict['init_exploit_processes']
    exploit_processes = init_exploit_processes

    all_frames = []  # only used by the disabled frame-pickling block below

    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        start3 = time.time()
        for step in range(num_steps):

            state_pytorch = Variable(agent_exploit.rollouts.states[step])  # [P,S,84,84]

            # Act: the exploit agent controls the first exploit_processes
            # processes, the explore agent controls the rest
            u_value, u_action, u_action_log_probs, u_dist_entropy = agent_exploit.act(state_pytorch)
            if explore_:
                r_value, r_action, r_action_log_probs, r_dist_entropy = agent_explore.act(state_pytorch)

            u_cpu_actions = u_action.data.squeeze(1).cpu().numpy()  # [P]
            if explore_:
                r_cpu_actions = r_action.data.squeeze(1).cpu().numpy()  # [P]
                # Choose how many processes act under each agent
                cpu_actions = np.concatenate((u_cpu_actions[:exploit_processes],
                                              r_cpu_actions[exploit_processes:]), 0)  # [P]
            else:
                cpu_actions = u_cpu_actions

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # (disabled: block that pickled 10000 raw frames to
            #  ~/Documents/tmp/montezum_frames.pkl for offline VAE training)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Insert data for the exploit agent (extrinsic reward)
            agent_exploit.insert_data(step, current_state, u_action.data, u_value,
                                      reward, masks, u_action_log_probs, u_dist_entropy, 0)

            if explore_:
                # Intrinsic reward for the explore agent: negated, normalized elbo
                # of the newest frame under the VAE (rare frames -> high reward)
                batch = state_pytorch[:, -1]  # last frame of the stack
                batch = batch.contiguous()    # [P,84,84]
                elbo = vae.forward2(batch, k=10)  # [P]
                elbo = elbo.view(-1, 1).data  # [P,1]
                elbo = (elbo - m) / torch.sqrt(v)
                elbo = torch.clamp(elbo, max=.01)

                agent_explore.insert_data(step, current_state, r_action.data, r_value,
                                          -elbo, masks, r_action_log_probs, r_dist_entropy, 0)

                # Update the running statistics (on the normalized, clamped value)
                m = B * m + (1. - B) * elbo.mean()
                v = B * v + (1. - B) * elbo.pow(2).mean()

                if elbo.mean() < -9000.:
                    print(elbo)
                    print(reward)
                    print(elbo.mean())
                    print(elbo.pow(2).mean())
                    raise RuntimeError('elbo diverged')

            # (disabled: Montezuma's Revenge viewer that replayed the frames of
            #  any episode that produced a positive reward)

        steps_time = time.time() - start3
        start3 = time.time()

        # Optimize agents
        agent_exploit.update()
        if explore_:
            agent_explore.update()

        # Optimize vae on the frames just collected
        batch = agent_exploit.rollouts.states
        batch = batch[1:]  # [Steps,Processes,Stack,84,84]
        batch = batch[:, :, 0]  # [Steps,Processes,84,84]
        batch = batch.contiguous().view(-1, 84, 84)  # [Steps*Processes,84,84]
        elbo = vae.update(batch)

        # Insert state
        agent_exploit.insert_first_state(agent_exploit.rollouts.states[-1])
        if explore_:
            agent_explore.insert_first_state(agent_explore.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        # Gradually convert explore processes into exploit processes
        if model_dict['init_exploit_processes'] != -1 and model_dict['inc_exploiters_over'] != -1:
            frac_step = np.minimum((total_num_steps + 1.) / float(model_dict['inc_exploiters_over']), 1.)  # fraction of steps elapsed
            n_new_exploiters = int((num_processes - init_exploit_processes) * frac_step)
            exploit_processes = np.minimum(init_exploit_processes + n_new_exploiters + 1, num_processes)

        update_time = time.time() - start3

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            # Save model
            if save_params:
                do_params(save_dir, agent_exploit, total_num_steps, model_dict)
            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent_exploit, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent_exploit, model_dict, update_current_state, update_rewards, total_num_steps)
            # Make vae prob gif
            if vae_:
                do_gifs3(envs_vae, agent_exploit, vae, model_dict, update_current_state, update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)),
                end - start,
                end - start2)
            elbo_str = "{:.2f}".format(elbo.data.cpu().numpy()[0])
            steps_time = "{:.3f}".format(steps_time)
            update_time = "{:.3f}".format(update_time)
            print(to_print_info_string + ' ' + elbo_str + ' ' + str(exploit_processes))
            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, elbo, Exploit_Procs"
            start2 = time.time()

            if j % (log_interval * 30) == 0:
                if ls_:
                    do_ls_2(envs_ls, agent_explore, model_dict, total_num_steps,
                            update_current_state, update_rewards, vae)
                    update_ls_plot_2(model_dict)
                    print('updated ls')

                # Update plots
                try:
                    start3 = time.time()
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string)
                    raise

    try:
        make_plots(model_dict)
    except:
        print()
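# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the intrinsic
# reward used by the explore agent above, in isolation. Only torch is
# assumed; names are hypothetical. The ELBO of the newest frame under the
# density model is whitened by running statistics, clamped, and negated, so
# frames the VAE models poorly (novel frames) yield high reward. Here the
# statistics track the raw ELBO; the loop above updates them with the
# normalized value instead.
import torch

B = 0.99                     # decay of the running statistics
m = torch.tensor([-100.0])   # running mean
v = torch.tensor([10000.0])  # running second moment, used as a scale

def intrinsic_reward(elbo, m, v):
    # elbo: [P,1] per-process ELBO values from the density model
    z = (elbo - m) / torch.sqrt(v)      # whiten with the running statistics
    z = torch.clamp(z, max=.01)         # cap so familiar frames give at most a tiny negative reward
    m = B * m + (1. - B) * elbo.mean()  # update the running statistics
    v = B * v + (1. - B) * elbo.pow(2).mean()
    return -z, m, v                     # negate: rare frames -> high reward

elbo = torch.randn(8, 1) * 30. - 100.   # fake per-process ELBOs
reward, m, v = intrinsic_reward(elbo, m, v)
print(reward.view(-1))
# ---------------------------------------------------------------------------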