Example #1
def main():
    now = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    model_name = 'simpleCNN_' + now + '.h5'
    batch_size = 256
    num_epochs = 30
    lr = .001

    num_train_samples = len(os.listdir('./data/train/cancer')) + len(os.listdir('./data/train/healthy'))
    num_valid_samples = len(os.listdir('./data/validation/cancer')) + len(os.listdir('./data/validation/healthy'))

    # Build our model
    input_tensor = Input(shape=(96, 96, 3))
    x = layers.Conv2D(32, (3, 3))(input_tensor)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(64, (3, 3))(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(128, (3, 3))(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(128, (3, 3))(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Flatten()(x)
    x = layers.Dropout(.5)(x)
    x = layers.Dense(512, activation='relu')(x)
    output_tensor = layers.Dense(1, activation='sigmoid')(x)

    model = Model(input_tensor, output_tensor)
    model.summary()

    # Get things ready to train: should adjust learning rate, etc.
    model.compile(optimizer=Adam(lr), loss='binary_crossentropy', metrics=['acc'])

    train_generator = train_gen(batch_size)
    validation_generator = valid_gen(batch_size)

    steps_per_epoch = num_train_samples / batch_size
    validation_steps = num_valid_samples / batch_size

    # Basic callbacks
    checkpoint = callbacks.ModelCheckpoint(filepath='./models/' + model_name,
                                           monitor='val_loss',
                                           save_best_only=True)
    early_stop = callbacks.EarlyStopping(monitor='val_acc',
                                         patience=3)
    csv_logger = callbacks.CSVLogger('./logs/' + model_name.split('.')[0] + '.csv')

    callback_list = [checkpoint, early_stop, csv_logger]

    # Training begins
    history = model.fit_generator(train_generator,
                                  steps_per_epoch=steps_per_epoch,
                                  epochs=num_epochs,
                                  verbose=1,
                                  callbacks=callback_list,
                                  validation_data=validation_generator,
                                  validation_steps=validation_steps)

    model.save('./models/' + model_name)

    make_plots(history, model_name)
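The train_gen and valid_gen helpers used above are not shown in this listing. A minimal sketch of what they might look like, assuming the ./data/train and ./data/validation directories follow the cancer/healthy layout implied above and that Keras' ImageDataGenerator is an acceptable stand-in for the real helpers:

from keras.preprocessing.image import ImageDataGenerator

def train_gen(batch_size):
    # rescale pixel values to [0, 1]; augmentation could be added here
    datagen = ImageDataGenerator(rescale=1. / 255)
    return datagen.flow_from_directory('./data/train',
                                       target_size=(96, 96),
                                       batch_size=batch_size,
                                       class_mode='binary')

def valid_gen(batch_size):
    datagen = ImageDataGenerator(rescale=1. / 255)
    return datagen.flow_from_directory('./data/validation',
                                       target_size=(96, 96),
                                       batch_size=batch_size,
                                       class_mode='binary')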
Example #2
def main():
    now = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    model_name = 'pretrain_NASNet_' + now + '.h5'
    batch_size = 32
    num_epochs = 30
    lr = .0001

    num_train_samples = len(os.listdir('./data/train/cancer')) + len(os.listdir('./data/train/healthy'))
    num_valid_samples = len(os.listdir('./data/validation/cancer')) + len(os.listdir('./data/validation/healthy'))

    # Build our model
    input_tensor = Input(shape=(96, 96, 3))
    NASNet = NASNetMobile(include_top=False, input_shape=(96, 96, 3))
    x = NASNet(input_tensor)
    x1 = layers.GlobalMaxPooling2D()(x)
    x2 = layers.GlobalAveragePooling2D()(x)
    x3 = layers.Flatten()(x)
    z = layers.Concatenate(axis=-1)([x1, x2, x3])
    z = layers.Dropout(.5)(z)
    output_tensor = layers.Dense(1, activation='sigmoid')(z)

    model = Model(input_tensor, output_tensor)
    model.summary()

    # Get things ready to train: tweak learning rate, etc.
    model.compile(optimizer=Adam(lr), loss='binary_crossentropy', metrics=['acc'])

    train_generator = train_gen(batch_size)
    validation_generator = valid_gen(batch_size)

    steps_per_epoch = num_train_samples / batch_size
    validation_steps = num_valid_samples / batch_size

    # Basic callbacks
    checkpoint = callbacks.ModelCheckpoint(filepath='./models/' + model_name,
                                           monitor='val_loss',
                                           save_best_only=True)
    early_stop = callbacks.EarlyStopping(monitor='val_acc',
                                         patience=4)
    csv_logger = callbacks.CSVLogger('./logs/' + model_name.split('.')[0] + '.csv')

    callback_list = [checkpoint, early_stop, csv_logger]

    # Training begins
    history = model.fit_generator(train_generator,
                                  steps_per_epoch=steps_per_epoch,
                                  epochs=num_epochs,
                                  verbose=1,
                                  callbacks=callback_list,
                                  validation_data=validation_generator,
                                  validation_steps=validation_steps)

    model.save('./models/' + model_name)

    make_plots(history, model_name)
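A common variant, not used in the example above, is to freeze the pretrained NASNet base so that only the new classification head is trained at first; a sketch, assuming the same NASNet and model objects as above:

# Optional: freeze the pretrained base so only the new head is updated,
# then unfreeze some layers later for fine-tuning.
for layer in NASNet.layers:
    layer.trainable = False
model.compile(optimizer=Adam(lr), loss='binary_crossentropy', metrics=['acc'])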
Example #3
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #slide the stack forward: drop the oldest frame's channels and shift the rest toward the front
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)

    if grad_var_:
        print('env for grad_var_')
        envs_grad_var = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0
    model_dict['action_size'] = envs.action_space.n
    print(envs.action_space.n, 'actions')

    # next_state_pred_ = 0
    # model_dict['next_state_pred_'] = next_state_pred_

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')

    discriminator = CNN_Discriminator(model_dict).cuda()
    print('init discriminator')

    # elif algo == 'a2c_over':
    #     agent = a2c_over(envs, model_dict)
    #     print ('init a2c_over agent')
    # elif algo == 'a2c_under':
    #     agent = a2c_under(envs, model_dict)
    #     print ('init a2c_under agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if args.load_path != '':
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     agent.actor_critic = torch.load(args.load_path).cuda()
    #     print ('loaded ', args.load_path)

    # see_reward_episode = 0
    # if 'Montez' in env_name and see_reward_episode:
    #     states_list = [[] for i in range(num_processes)]

    # view_reward_episode(model_dict=model_dict, frames=[])
    # dfasddsf

    # if vae_:
    #     vae = VAE()
    #     vae.cuda()

    buffer_ = 1

    if buffer_:
        buffer_states = deque(maxlen=200)
        buffer_actions = deque(maxlen=200)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state, shape_dim0).type(
            dtype)  #add the new frame, remove the oldest, since it's a stack
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        # discrim_errors = []
        # discrim_errors_reverse = []
        # discrim_errors_2step = []
        # frames = []
        for step in range(num_steps):

            # Act, [P,1], [P,1], [P,1], [P]
            state_pytorch = Variable(agent.rollouts.states[step])
            value, action, action_log_probs, dist_entropy = agent.act(
                state_pytorch)  #, volatile=True))

            # print (action)

            # fsdaf

            # Apply to Environment, S:[P,C,H,W], R:[P], D:[P]
            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]
            frame, reward, done, info = envs.step(cpu_actions)

            # frames.append(torch.FloatTensor(frame)) #[P,1,84,84]

            # # current_frame = torch.from_numpy(frame)  #[P,1,84,84]
            # current_frame = torch.FloatTensor(frame)  #[P,1,84,84]
            # if step ==0:
            #     prev_frame = torch.FloatTensor(state)  #[P,1,84,84]

            # #Pred action and get error
            # discrim_error = discriminator.forward(prev_frame, current_frame, action)
            # discrim_errors.append(discrim_error)

            # discrim_error_reverse = discriminator.forward(current_frame, prev_frame, action)
            # discrim_errors_reverse.append(discrim_error_reverse)

            # # THIS IS TO SEE PREDICTIONS

            # if step==0:
            #     f =  np.reshape(prev_frame[0].numpy(), [84,84])
            # f =np.concatenate([f,np.reshape(current_frame[0].numpy(),[84,84])], axis=0)

            # # f1 = prev_frame[0].numpy()
            # # f2 = current_frame[0].numpy()
            # # f = np.reshape(np.concatenate([f1,f2], axis=1), [168,84])
            # # print (f.shape)
            # print (cpu_actions[0])
            # # ['NOOP', 'FIRE', 'RIGHT', 'LEFT'] for breakout
            # #for montezuma
            # #['NOOP', 'FIRE', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'UPRIGHT', 'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT',
            #     #'UPFIRE', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']
            # I think FIRE = JUMP

            # if step ==2:
            #     print (torch.mean(current_frame-prev_frame))
            #     fdafds

            # prev_frame_2step = prev_frame

            # prev_frame = current_frame

            # # print (torch.sum(prev_frame_2step), torch.sum(prev_frame))

            # fadsa

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, frame,
                                                 shape_dim0)
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy,
                              0)  #, done)

        # print (f.shape)
        # rows = 1
        # cols = 1
        # fig = plt.figure(figsize=(1+cols,5+rows), facecolor='white')
        # ax = plt.subplot2grid((rows,cols), (0,0), frameon=False) #, rowspan=7)
        # ax.imshow(f, cmap=plt.get_cmap('gray'))
        # ax.set_yticks([])
        # ax.set_xticks([])
        # plt.tight_layout()
        # plt.savefig(model_dict['exp_path']+'plot.png')
        # print ('plotted')
        # fadsfad

        # if buffer_:
        #     if len(buffer_actions) <100:
        #         buffer_steps = 10
        #     else:
        #         buffer_steps = 1

        buffer_steps = 500

        if buffer_:
            #Insert into buffer
            buffer_states.append(agent.rollouts.states)
            buffer_actions.append(agent.rollouts.actions)

            # print (agent.rollouts.states)
            # print (agent.rollouts.actions)
            # fda
            # print (len(buffer_actions))

            #If the buffer is full enough: sample, predict, and optimize
            # if len(buffer_actions) > 10:

            if len(buffer_actions) == 100:

                # if 1:
                #Num of optimization steps
                for i in range(buffer_steps):
                    # #Sample batch
                    # states_batch = []
                    # actions_batch = []
                    # for bb in range(num_processes):
                    #     ind = np.random.randint(len(buffer_actions))
                    #     print (buffer_states[ind].size())
                    #     fadas
                    #     states_batch.append(buffer_states[ind])
                    #     actions_batch.append(buffer_actions[ind])
                    # states_batch = torch.stack(states_batch, dim=1)
                    # actions_batch = torch.stack(actions_batch, dim=1)

                    ind = np.random.randint(len(buffer_actions))
                    states_batch = buffer_states[ind]
                    actions_batch = buffer_actions[ind]

                    #Optimize action-predictor
                    discrim_errors = discrim_predictions(
                        model_dict, states_batch, actions_batch, discriminator)
                    discriminator.optimize(discrim_errors)

                    if i % 20 == 0:
                        print(i)

                # print (len(buffer_actions), torch.mean(discrim_errors).data.cpu().numpy()[0])

            #Optimize agent
            discrim_errors = discrim_predictions(model_dict,
                                                 agent.rollouts.states,
                                                 agent.rollouts.actions,
                                                 discriminator)
            discrim_errors_reverse = discrim_predictions(
                model_dict,
                agent.rollouts.states,
                agent.rollouts.actions,
                discriminator,
                reverse=True)

            if len(buffer_actions) > 100:
                discriminator.optimize(discrim_errors)

            agent.update2(discrim_errors,
                          discrim_errors_reverse)  #agent.update(j,num_updates)
            # agent.update2(discrim_errors)  #agent.update(j,num_updates)

        else:

            discrim_errors = discrim_predictions(model_dict,
                                                 agent.rollouts.states,
                                                 agent.rollouts.actions,
                                                 discriminator)
            discrim_errors_reverse = discrim_predictions(
                model_dict,
                agent.rollouts.states,
                agent.rollouts.actions,
                discriminator,
                reverse=True)

            #Optimize discriminator
            discriminator.optimize(discrim_errors)

            #Optimize agent
            agent.update2(discrim_errors,
                          discrim_errors_reverse)  #agent.update(j,num_updates)
            # agent.update2(discrim_errors)  #agent.update(j,num_updates)

        agent.insert_first_state(agent.rollouts.states[-1])

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)
            #make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae,
                              update_current_state, total_num_steps)
            # #make vae prob gif
            # if grad_var_:
            #     do_grad_var(envs_grad_var, agent, model_dict, update_current_state, total_num_steps)

        #Print updates
        if j % log_interval == 0:  # and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.3f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2,
                torch.mean(discrim_errors).data.cpu().numpy()[0])

            print(to_print_info_string)

            # if vae_:
            #     elbo =  "{:.2f}".format(elbo.data.cpu().numpy()[0])

            # if next_state_pred_:
            #     state_pred_error_print =  "{:.2f}".format(agent.state_pred_error.data.cpu().numpy()[0])
            #     print(to_print_info_string+' '+state_pred_error_print+' '+elbo)
            #     to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo"

            # else:
            # if vae_:
            #     print(to_print_info_string+' '+elbo)
            # else:
            # print(to_print_info_string)

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, discrim_E"  #, elbo"
            start2 = time.time()

            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)

                # if grad_var_  and j % (log_interval*300) == 0:
                if grad_var_ and j % (log_interval * 30) == 0:
                    #writes to file
                    do_grad_var(envs_grad_var, agent, model_dict,
                                total_num_steps, update_current_state,
                                update_rewards)

                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)

                    # if grad_var_ and j % (log_interval*300) == 0:
                    if grad_var_ and j % (log_interval * 30) == 0:
                        update_grad_plot(model_dict)
                        to_print_legend_string += ' grad_var_plot updated '

                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise  #pass
                    print(to_print_legend_string + " problem with plot")

    try:
        make_plots(model_dict)
    except:
        print()
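To make the frame-stacking in update_current_state concrete, here is a small standalone sketch with toy sizes (2 processes, 1 channel, a stack of 4, 84x84 frames); it is illustrative only and not part of the training script:

import numpy as np
import torch

num_processes, channels, stack = 2, 1, 4
current_state = torch.zeros(num_processes, channels * stack, 84, 84)

new_frame = np.ones((num_processes, channels, 84, 84), dtype=np.float32)
# slide the stack: drop the oldest frame's channels, append the new frame at the end
current_state[:, :-channels] = current_state[:, channels:]
current_state[:, -channels:] = torch.from_numpy(new_frame).float()
print(current_state[0, -1].mean())  # 1.0: the newest slot now holds the new frame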
Example #4
parser = argparse.ArgumentParser(
    description="Replace all Bokeh images with interactive Bokeh plots."
)
parser.add_argument(
    "--readme_file",
    dest="readme_file",
    default="README.md",
    help="Path to README.md file",
)
readme_file = parser.parse_args().readme_file

# Read in README:
with open(readme_file) as f:
    readme = f.read()

# Create Bokeh plots:
plots = make_plots()

# Replace pictures with HTML plots:
for plotname, plot in plots.items():
    print(f"Replace image {plotname}")
    if re.search(r"!\[{plotname}\]\(.+\)".format(plotname=plotname), readme):
        to_replace = re.search(
            r"!\[{plotname}\]\(.+\)".format(plotname=plotname), readme
        ).group()
        readme = readme.replace(to_replace, f'<div align="center">\n\n{plot}\n\n</div>')
    else:
        raise KeyError(
            f"No image with name '{plotname}' has been found in the README file '{readme_file}'."
        )

# Replace path to remaining pictures:
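A quick illustration of the image-replacement pattern used above, run on a toy README string (the plot name and HTML snippet here are made up for the example):

import re

readme = "Intro\n![loss_curve](images/loss_curve.png)\nOutro"
plotname, plot = "loss_curve", "<!-- bokeh html would go here -->"

match = re.search(r"!\[{plotname}\]\(.+\)".format(plotname=plotname), readme)
if match:
    readme = readme.replace(match.group(),
                            '<div align="center">\n\n{}\n\n</div>'.format(plot))
print(readme)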
Example #5
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #slide the stack forward: drop the oldest frame's channels and shift the rest toward the front
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0



    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if model_dict['load_params']:
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     # agent.actor_critic = torch.load(args.load_path).cuda()
        
    #     # print ('loaded ', args.load_path)

    #     if model_dict['load_number'] == 3:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

    #     elif model_dict['load_number'] == 6:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
    #     elif model_dict['load_number'] == 9:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

    #     # else:
    #     #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
    #     else:
    #         PROBLEM






    #load model
    # if model_dict['load_params']:

    # load_params(thigns)
    # param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
    param_file = home+'/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/seed1/model_params3/model_params9999360.pt'


    # pretrained_dict = torch.load(param_file)  # object
    # print (pretrained_dict)
    # agent_dict = agent.actor_critic.state_dict()  #dict
    # print (agent_dict.keys())
    # agent_dict.update(pretrained_dict)
    # # agent_dict.update(agent.actor_critic)
    # agent.actor_critic.load_state_dict(agent_dict)


    param_dict = torch.load(param_file)
    agent.actor_critic.load_state_dict(param_dict)


    # agent.actor_critic = torch.load(param_file)
    agent.actor_critic.cuda()
    print ('loaded', param_file)

    # afdsa







    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)

    # list of lists, where each inner list is a trajectory; trajectories contain actions and states
    dataset = []
    tmp_trajs = [[] for x in range(num_processes)]


    dataset_count = 0


    done = [0]*num_processes

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]))#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())






            # y = torch.LongTensor(batch_size,1).random_() % nb_digits
            # # One hot encoding buffer that you create out of the loop and just keep reusing
            # y_onehot = torch.FloatTensor(batch_size, nb_digits)
            # # In your for loop
            # y_onehot.zero_()
            # y_onehot.scatter_(1, y, 1)



            states_ = agent.rollouts.states[step].cpu().numpy()  #[P,S,84,84]
            # print (state_t.shape)
            actions_ = action.data.cpu().numpy() #[P,1]
            # print (action)
            # fdsaf


            #store step
            for proc in range(num_processes):

                #add states
                state_t = states_[proc]
                action_t = actions_[proc]
                tmp_trajs[proc].append([action_t, state_t])

                if done[proc]:

                    dataset.append(tmp_trajs[proc])
                    dataset_count += len(tmp_trajs[proc])
                    tmp_trajs[proc] = []

                    for ii in range(len(dataset)):
                        print (len(dataset[ii]))


            if dataset_count > 10000:

                # pickle.dump( dataset, open(home+'/Documents/tmp/breakout_2frames/breakout_trajectories_10000.pkl', "wb" ) )
                pickle.dump( dataset, open(home+'/Documents/tmp/RoadRunner/trajectories_10000.pkl', "wb" ) )

                print('saved')
                # pickle.save(dataset)
                STOP





            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 







            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done)






        # print (len(dataset))
        # print ()





        #Optimize agent
        # agent.update()  #agent.update(j,num_updates)
        agent.insert_first_state(agent.rollouts.states[-1])


        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
                # save_params_v2(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2)
            print(to_print_info_string) 
            start2 = time.time()



            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise #pass
                    print(to_print_legend_string)



    try:
        make_plots(model_dict)
    except:
        print ()
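The pickled trajectory dataset written above can be loaded back with pickle; a minimal sketch, assuming the same RoadRunner path used in the script (each dataset entry is one trajectory, i.e. a list of [action, state] pairs):

import os
import pickle

home = os.path.expanduser('~')
path = home + '/Documents/tmp/RoadRunner/trajectories_10000.pkl'
with open(path, 'rb') as f:
    dataset = pickle.load(f)

print(len(dataset), 'trajectories')
print(len(dataset[0]), 'steps in the first trajectory')
action_t, state_t = dataset[0][0]  # each step stores [action, state]
print(action_t.shape, state_t.shape)  # e.g. (1,) and (stack, 84, 84)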
Example #6
def combine_and_resample_px4_nogui(input_path, file_prefix=''):

    #list of file keywords to identify the files we want
    keywords_in_wanted_files = [
        'gps_position', 'combined', 'magnetometer_0', 'input_rc',
        'battery_status', 'attitude', 'control', 'air_data', 'vehicle_status_0'
    ]

    #list of key words to identify the columns we want
    keywords_for_columns = [
        'lat', 'lon', 'alt', 'hdop', 'vdop', 'mode_slot',
        'accelerometer_m_s2[0]', 'accelerometer_m_s2[1]',
        'accelerometer_m_s2[2]', 'gyro_rad[0]', 'gyro_rad[1]', 'gyro_rad[2]',
        'magnetometer_ga[0]', 'magnetometer_ga[1]', 'magnetometer_ga[2]',
        'rc_failsafe', 'voltage_v', 'rssi', 'q[0]', 'q[1]', 'q[2]', 'q[3]',
        'baro_alt', 'nav_state', 'values[0]', 'values[1]', 'values[2]',
        'values[3]', 'values[4]', 'values[5]', 'values[6]', 'values[7]',
        'values[8]', 'values[9]', 'values[10]', 'values[11]', 'values[12]',
        'values[13]', 'values[14]', 'values[15]', 'values[16]', 'values[17]',
        'values[18]'
    ]
    # save the current path to a variable so we can return to it later
    original_path = os.getcwd()

    os.chdir(input_path)

    # get list of filenames in directory ending with csv
    list_of_filenames = [f for f in os.listdir('.') if f.endswith('.csv')]

    #keep only the files whose names contain one of the wanted keywords
    list_of_filenames = [
        item for item in list_of_filenames
        if any(word in item for word in keywords_in_wanted_files)
    ]

    #exit if no matching files were found
    if len(list_of_filenames) == 0:
        sys.exit("There are no files to process in " + input_path)

    # create empty list so we can append to it
    list_of_df = []

    #empty list so we can append to it
    reject_column_list = []

    #empty list to append to
    new_headers = []

    # iterate through the csv files in the current directory, create a df for each
    # filename, and collect the dfs into a list
    for current_filename in list_of_filenames:

        # this is the important read, read in the data we care about, the index
        # is stored in column 0, the header is stored in row 0, pandas will
        # name columns automatically for us using the header row
        df = pd.read_csv(current_filename, index_col=0, header=0)

        # get a list of all the headers in the csv
        column_headers = df.columns

        # create an empty list to append
        columns_to_keep = []

        # get a list of the columns we want to keep for each csv by iterating
        # through the file and getting the headers

        for header in column_headers:
            #get the list of keywords
            for keyword in keywords_for_columns:
                #check if the keyword is on the header
                if keyword in header:
                    #the words 'lat' and 'alt' also appear inside other headers;
                    #matching the exact header name avoids those false positives
                    if keyword == 'lat' and header == 'lat':
                        columns_to_keep.append(header)
                        assign_names(keyword, new_headers)
                    if keyword == 'alt' and header == 'alt':
                        columns_to_keep.append(header)
                        assign_names(keyword, new_headers)

                    # add the header to the list of headers to keep if it contains
                    # the keyword, and record its new name via the assign_names function
                    if keyword != 'lat' and keyword != 'alt':
                        columns_to_keep.append(header)
                        assign_names(keyword, new_headers)

        #build the list of columns we do not want to keep
        reject_column_list = [
            header for header in column_headers
            if header not in columns_to_keep
        ]

        #drop all columns that we do not want
        df = df.drop(columns=reject_column_list)

        # store the current df (from a single csv) into the big list of dfs
        list_of_df.append(df)

    # create the big df by using the concat method called on a list of small df
    big_df = pd.concat(list_of_df, axis=0, ignore_index=False, sort=False)

    # sort on the timestamp column, otherwise the small df are stuck together
    # end-to-end which isn't what we want
    big_df = big_df.sort_values(by='timestamp')

    # for px4 some data files start with 0 for timestamp, we don't want this,
    # so we will just discard these rows for now
    big_df = big_df.drop(0, errors="ignore")

    # offset time to zero just because we can
    big_df.index = big_df.index - big_df.index[0]

    # fill the missing spaces, use ffill to move the most recent valid observation
    # forward
    big_df = big_df.fillna(method='ffill')

    # fill the remaining na with 0, these only happen at the beginning where we
    # previously did not have any observations to pass forward
    big_df = big_df.fillna(0)

    # get rid of duplicate rows, not sure this is needed but keeping just in case
    # UPDATE:  definitely needed, first line gets rid of duplicate time entries
    big_df = big_df[~big_df.index.duplicated()]

    # this one gets rid of duplicated output data, not sure this is required
    big_df = big_df.drop_duplicates()

    # create a time delta column with the proper format; the px4 timestamp is in
    # microseconds and the 'S' unit tells pd.to_timedelta to expect seconds, so we
    # divide by 10**6 to convert to seconds before passing it
    big_df['time_properformat'] = pd.to_timedelta(big_df.index / 10.0**6, 'S')

    # switch the index of the big_df to proper time delta column
    big_df.index = pd.to_datetime(big_df.time_properformat.astype('int64'))

    # fix this for 250Hz rate since the auto calculator is causing issues
    min_sampletime = .004
    freq_arg = int(min_sampletime * 10**6)
    freq_type = 'U'

    # create the resampled  df
    resampled_df = big_df.asfreq(str(freq_arg) + freq_type, method='ffill')

    # get rid of the annoying time index, switch back to delta time in seconds
    resampled_df.index = resampled_df.index.values.astype(np.uint64) / 1000000

    # get rid of the unused column before we send it to csv
    resampled_df = resampled_df.drop(columns=['time_properformat'])

    #assign the new headers to the columns
    resampled_df.columns = new_headers

    #call the quat2eul function to change the attitude to eul
    r, p, y = quat2eul(resampled_df['Att.Qx'], resampled_df['Att.Qy'],
                       resampled_df['Att.Qz'], resampled_df['Att.Qw'])

    # add the pitch roll and yaw to the resampled_df
    resampled_df['Att.roll'] = r
    resampled_df['Att.pitch'] = p
    resampled_df['Att.yaw'] = y

    #drop the columns with the quat data in them
    resampled_df = resampled_df.drop(
        columns=['Att.Qx', 'Att.Qy', 'Att.Qz', 'Att.Qw'])

    # check if the folder exists and create it if needed; this avoids the script
    # trying to read its own results.csv as one of the constituent files if it is
    # run on a directory where it has already been run at least once
    if not os.path.exists('combined'):
        os.mkdir('combined')

    # write the result; we want to write the index column and give it a label
    # (feel free to change the name)
    resampled_df.to_csv(path_or_buf=os.path.join('combined',
                                                 file_prefix + '_results.csv'),
                        index=True,
                        index_label='Time')

    print('Resampling complete.')
    print(
        'Minimum sample time is:\t%f s\nThe corresponding frequency is:\t%f Hz\nOutput saved to:  %s'
        % (min_sampletime, 1 / min_sampletime,
           os.path.join('combined', file_prefix + '_results.csv')))

    # call the make_plot function and pass the location of the results file
    make_plots('combined' + '/' + file_prefix + '_results.csv')

    # go back to the original directory so we can iterate through the rest of
    # the files
    os.chdir(original_path)
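quat2eul is called above but not defined in this listing. A minimal sketch of a quaternion-to-Euler conversion, assuming a unit quaternion, the ZYX (roll/pitch/yaw) convention, and the (Qx, Qy, Qz, Qw) argument order implied by the call above; the project's actual helper may differ:

import numpy as np

def quat2eul(qx, qy, qz, qw):
    # roll (x), pitch (y), yaw (z) in radians, ZYX convention
    roll = np.arctan2(2 * (qw * qx + qy * qz), 1 - 2 * (qx**2 + qy**2))
    pitch = np.arcsin(np.clip(2 * (qw * qy - qz * qx), -1.0, 1.0))
    yaw = np.arctan2(2 * (qw * qz + qx * qy), 1 - 2 * (qy**2 + qz**2))
    return roll, pitch, yaw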
Example #7
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #slide the stack forward: drop the oldest frame's channels and shift the rest toward the front
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    next_state_pred_ = 0
    model_dict['next_state_pred_'] = next_state_pred_

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    elif algo == 'a2c_list_rollout':
        agent = a2c_list_rollout(envs, model_dict)
        print('init a2c_list_rollout agent')
    elif algo == 'a2c_with_var':
        agent = a2c_with_var(envs, model_dict)
        print('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if args.load_path != '':
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     agent.actor_critic = torch.load(args.load_path).cuda()
    #     print ('loaded ', args.load_path)

    # see_reward_episode = 0
    # if 'Montez' in env_name and see_reward_episode:
    #     states_list = [[] for i in range(num_processes)]

    # view_reward_episode(model_dict=model_dict, frames=[])
    # dfasddsf

    if vae_:
        vae = VAE()
        vae.cuda()

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  #add the new frame, remove oldest
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # prev_action = Variable(torch.zeros([num_processes, 1]).type(torch.LongTensor)).cuda()

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            state_pytorch = Variable(agent.rollouts.states[step])

            value, action, action_log_probs, dist_entropy = agent.act(
                state_pytorch)  #, volatile=True))

            # if next_state_pred_:
            #     next_state_prediction = agent.actor_critic.predict_next_state2(state_pytorch, prev_action)
            # next_state_prediction = 0

            # print (action_log_probs.size())
            # print (dist_entropy.size())

            # prev_action = action

            # print (next_state_prediction.size()) # [P,1,84,84]
            # fasd

            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            reward_numpy = reward

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)

            if next_state_pred_:

                agent.insert_data(step, current_state, action.data, value,
                                  reward, masks, action_log_probs,
                                  dist_entropy,
                                  next_state_prediction)  #, done)
                agent.rollouts.insert_state_pred(next_state_prediction)

            else:
                agent.insert_data(step, current_state, action.data, value,
                                  reward, masks, action_log_probs,
                                  dist_entropy, 0)  #, done)

            # if 'Montez' in env_name and see_reward_episode:

            #     for state_i in range(len(state)):
            #         if done[state_i]:
            #             states_list[state_i] = []
            #         else:
            #             states_list[state_i].append(np.squeeze(state[state_i]))

            #             # print (state[state_i].shape)
            #             # fasdf

            #         # print (reward)

            #         if reward_numpy[state_i] >0:
            #             #plot the states of state_i
            #             print (len(states_list[state_i]))
            #             # view_reward_episode(model_dict=model_dict, frames=states_list[state_i][len(states_list[state_i])-100:])
            #             # view_reward_episode(model_dict=model_dict, frames=states_list[state_i][len(states_list[state_i])-100:])
            #             view_reward_episode(model_dict=model_dict, frames=states_list[state_i])

            #             fadsa

            #      # and np.sum(agent.rollouts.rewards.cpu().numpy()) > 0

            #     # print (np.sum(agent.rollouts.rewards.cpu().numpy()))
            #     # print (j)

        #Optimize agent
        agent.update()  #agent.update(j,num_updates)

        batch = agent.rollouts.states

        # print (batch.size())   # [Steps+1,Processes,Stack,84,84]
        # remove the first state since it's repeated (it's the last state of the previous rollout)
        # take the first state of the stack for each step
        #reshape to [P*S,84,84]
        batch = batch[1:]  # [Steps,Processes,Stack,84,84]
        batch = batch[:, :, 0]  # [Steps,Processes,84,84]
        batch = batch.contiguous().view(-1, 84, 84)  # [Steps*Processes,84,84]

        # print (batch.size())

        # fadsa
        # print (vae)
        elbo = vae.update(batch)

        agent.insert_first_state(agent.rollouts.states[-1])

        # print (agent.state_pred_error.data.cpu().numpy())

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)
            #make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae,
                              update_current_state, total_num_steps)

        #Print updates
        if j % log_interval == 0:  # and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2)

            elbo = "{:.2f}".format(elbo.data.cpu().numpy()[0])

            if next_state_pred_:
                state_pred_error_print = "{:.2f}".format(
                    agent.state_pred_error.data.cpu().numpy()[0])
                print(to_print_info_string + ' ' + state_pred_error_print +
                      ' ' + elbo)
                to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo"

            else:
                print(to_print_info_string + ' ' + elbo)
                to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, elbo"

            start2 = time.time()

            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string)
                    raise  #change to `pass` to keep training when plotting fails

    try:
        make_plots(model_dict)
    except:
        print()
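
The reshaping that feeds the rollout states into the VAE update above can be checked in isolation. A minimal sketch with dummy tensors, assuming the [steps+1, processes, stack, 84, 84] layout noted in the comments (the sizes below are made up):

import torch

steps, processes, stack = 5, 4, 4
rollout_states = torch.rand(steps + 1, processes, stack, 84, 84)  # stand-in for agent.rollouts.states

batch = rollout_states[1:]                    # drop the repeated first state -> [S, P, stack, 84, 84]
batch = batch[:, :, 0]                        # keep the first frame of each stack -> [S, P, 84, 84]
batch = batch.contiguous().view(-1, 84, 84)   # merge steps and processes -> [S*P, 84, 84]

assert batch.shape == (steps * processes, 84, 84)
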
Example #8
0
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one
        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward

        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    def do_vid():



        n_vids=3
        for i in range(n_vids):
            done=False
            state = envs_video.reset()
            # state = torch.from_numpy(state).float().type(dtype)
            current_state = torch.zeros(1, *obs_shape)
            current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
            # print ('Recording')
            # count=0
            while not done:
                # print (count)
                # count +=1
                # Act
                state_var = Variable(current_state, volatile=True) 
                # print (state_var.size())
                action, value = agent.act(state_var)
                cpu_actions = action.data.squeeze(1).cpu().numpy()

                # Observe reward and next state
                state, reward, done, info = envs_video.step(cpu_actions) # state:[nProcesss, ndims, height, width]
                # state = torch.from_numpy(state).float().type(dtype)
                # current_state = torch.zeros(1, *obs_shape)
                current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
        state = envs_video.reset()
        
        vid_path = save_dir+'/videos/'
        count =0
        for aaa in os.listdir(vid_path):

            if 'openaigym' in aaa and '.mp4' in aaa:
                #os.rename(vid_path+aaa, vid_path+'vid_t'+str(total_num_steps)+'.mp4')
                subprocess.call("(cd "+vid_path+" && mv "+ vid_path+aaa +" "+ vid_path+env_name+'_'+algo+'_vid_t'+str(total_num_steps)+'_'+str(count) +".mp4)", shell=True) 
                count+=1
            if '.json' in aaa:
                os.remove(vid_path+aaa)




    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    # print("#######")
    # print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    # print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    num_updates = int(num_frames) // num_steps // num_processes

    
    if cuda:
        torch.cuda.manual_seed(seed)
    else:
        torch.manual_seed(seed)


    # Create environments
    print (num_processes, 'processes')
    envs = SubprocVecEnv([make_env(env_name, seed, i, save_dir) for i in range(num_processes)])

    print ('env for video')
    # envs_video = gym.make(env_name)
    # envs_video = gym.wrappers.Monitor(envs_video, save_dir+'/videos/', video_callable=lambda x: True, force=True)
    envs_video = make_env_monitor(env_name, save_dir)#+'/videos/')


    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape

    if cuda:
        dtype = torch.cuda.FloatTensor
    else:
        dtype = torch.FloatTensor


    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)

    # #Load model
    # if args.load_path != '':
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     agent.actor_critic = torch.load(args.load_path).cuda()
    #     print ('loaded ', args.load_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])
    masks = torch.ones([num_processes, 1]).type(dtype)  #init so insert_data below has a mask on the very first step


    #Begin training
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P,1]
            action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 

            # I need to figure out when the env is actually reset.
            # I might be wrong about the placement of this update: maybe when `done` is True,
            # the returned state is already the first state of the next episode,
            # in which case this ordering is all right.
            # BUT if that is true, the agent never sees the terminal frame, since the step that
            # returns done is already giving the next episode's frame.
            # So increasing frame_skip could cause problems, because the last observed frame
            # could be far from the terminal one.


            # Update state. I think this should go before recording rewards, because otherwise
            # the last state of the previous episode is added to the done current_states
            # (which I just set to 0), unless that state is really the first of the next episode, but I doubt it.
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent records the step (it just saves all those values). This should probably also go
            # before recording rewards, but some numpy would need converting to pytorch.
            # One issue is the mask: here it still holds the previous step's value, so it really
            # needs to be updated before this call.
            agent.insert_data(step, current_state, action.data, value.data, reward, masks)




            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            


            # print (step, 'value_preds', agent.rollouts.value_preds.cpu().numpy().reshape(6,2))
            # self.rewards = self.rewards.cuda()
            # self.value_preds = self.value_preds.cuda()
            # self.returns = self.returns.cuda()

        #Optimize agent
        agent.update()
        agent.insert_first_state(agent.rollouts.states[-1])







        total_num_steps = (j + 1) * num_processes * num_steps

        #Save model
        if total_num_steps % save_interval == 0 and save_dir != "":
            save_path = os.path.join(save_dir, 'model_params')
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = agent.actor_critic
            if cuda:
                save_model = copy.deepcopy(agent.actor_critic).cpu()
            # torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            save_to=os.path.join(save_path, "model_params" + str(total_num_steps)+".pt")
            torch.save(save_model, save_to)
            print ('saved', save_to)

            #make video
            do_vid()


        #Print updates
        if j % log_interval == 0:
            end = time.time()

            if j % (log_interval*30) == 0:

                #update plots
                try:
                    make_plots(model_dict)
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated")
                except:
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time")

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".
                    format(j, total_num_steps,
                           final_rewards.min(),
                           final_rewards.median(),
                           final_rewards.mean(),
                           final_rewards.max(),
                           int(total_num_steps / (end - start)),
                           end - start))

    
    try:
        make_plots(model_dict)
    except:
        pass #raise
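
The "really ugly" CPU save in Example #8 amounts to deep-copying the network off the GPU before calling torch.save. A minimal sketch of that pattern, with a throwaway nn.Linear and a hypothetical ./model_params_demo directory standing in for agent.actor_critic and save_path:

import copy
import os
import torch
import torch.nn as nn

model = nn.Linear(4, 2)                      # stand-in for agent.actor_critic
if torch.cuda.is_available():
    model = model.cuda()

save_model = model
if next(model.parameters()).is_cuda:
    save_model = copy.deepcopy(model).cpu()  # copy the weights off the GPU before serializing

os.makedirs('./model_params_demo', exist_ok=True)
torch.save(save_model, os.path.join('./model_params_demo', 'model_params0.pt'))
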
Example #9
0
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    action_size = envs.action_space.n

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    elif algo == 'a2c_list_rollout':
        agent = a2c_list_rollout(envs, model_dict)
        print('init a2c_list_rollout agent')
    elif algo == 'a2c_with_var':
        agent = a2c_with_var(envs, model_dict)
        print('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    #Load model
    if model_dict['load_params']:
        # agent.actor_critic = torch.load(os.path.join(args.load_path))
        # agent.actor_critic = torch.load(args.load_path).cuda()

        # print ('loaded ', args.load_path)

        if model_dict['load_number'] == 3:
            load_params_v2(
                home +
                '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/',
                agent, 3000160, model_dict)

        elif model_dict['load_number'] == 6:
            load_params_v2(
                home +
                '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/',
                agent, 6000160, model_dict)
        elif model_dict['load_number'] == 9:
            load_params_v2(
                home +
                '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/',
                agent, 9000160, model_dict)

        # else:
        #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
        else:
            raise ValueError('unexpected load_number: {}'.format(model_dict['load_number']))

    ls_path = save_dir + '/V_and_Q_errors/'
    ls_file = ls_path + 'error_monitor.csv'

    if not os.path.exists(ls_path):
        os.makedirs(ls_path)
        # if print_:
        print('Made dir', ls_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  #add the new frame, remove oldest
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        Vs = []
        Qs = []

        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action, action_log_probs, dist_entropy = agent.act(
                Variable(agent.rollouts.states[step]))  #, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            one_hot_action = torch.FloatTensor(num_processes, action_size)
            one_hot_action.zero_()
            one_hot_action.scatter_(1, action.data.cpu(), 1)

            # print (action)
            # print (one_hot_action)
            # fdsfa

            V, Q = agent.actor_critic.get_V_and_Q(
                Variable(agent.rollouts.states[step]), one_hot_action)
            Vs.append(V)
            Qs.append(Q)

            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy)  #, done)

        #Optimize agent
        # agent.update()  #agent.update(j,num_updates)

        V_loss, Q_loss = agent.update2(Vs, Qs)  #agent.update(j,num_updates)

        V_loss = V_loss.data.cpu().numpy()[0]
        Q_loss = Q_loss.data.cpu().numpy()[0]
        # print (V_loss)
        # fasd

        agent.insert_first_state(agent.rollouts.states[-1])

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
                save_params_v2(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        #Print updates
        if j % log_interval == 0:  # and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2)
            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:

                if total_num_steps > 5000:
                    with open(ls_file, 'a') as f:
                        writer = csv.writer(f)
                        writer.writerow([total_num_steps, V_loss, Q_loss])

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)

                    if total_num_steps > 5000:
                        update_error_plot(model_dict)

                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string)
                    raise  #change to `pass` to keep training when plotting fails

    try:
        make_plots(model_dict)
    except:
        print()
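
The one-hot action encoding in Example #9 (zero_ followed by scatter_) is easy to verify on its own. A minimal sketch with made-up action indices of shape [P, 1]:

import torch

num_processes, action_size = 3, 6
actions = torch.tensor([[2], [0], [5]])   # [P, 1] integer action indices, as returned by agent.act

one_hot_action = torch.FloatTensor(num_processes, action_size)
one_hot_action.zero_()
one_hot_action.scatter_(1, actions, 1)    # write a 1 at each chosen action index

print(one_hot_action)
# tensor([[0., 0., 1., 0., 0., 0.],
#         [1., 0., 0., 0., 0., 0.],
#         [0., 0., 0., 0., 0., 1.]])
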
Example #10
0
    def minimize_scipy(self, params0, output=False, plots=False, hires=False):
        """Minimize residual using scipy.optimize Levenberg-Marquardt.

		Inputs:
		params0 -- initial guesses for parameters:
			mn0 -- Mn abundance
			smoothivar0 -- if applicable, inverse variance to use for smoothing

		Keywords:
		plots  -- if 'True', also plot final fit & residual
		output -- if 'True', also output a file (default='False')
		hires  -- if 'True', zoom in a bit on plots to better display hi-res spectra

		Outputs:
		fitparams -- best-fit parameters
		error     -- 1-sigma uncertainties from the fit covariance matrix
		"""

        # Do minimization
        print('Starting minimization! Initial guesses: ', params0)
        best_mn, covar = scipy.optimize.curve_fit(
            self.synthetic,
            self.obswvl_final,
            self.obsflux_final,
            p0=[params0],
            sigma=np.sqrt(np.reciprocal(self.ivar_final)),
            epsfcn=0.01)
        error = np.sqrt(np.diag(covar))

        print('Answer: ', best_mn)
        print('Error: ', error)

        # Do some checks
        if len(np.atleast_1d(best_mn)) == 1:
            finalsynth = self.synthetic(self.obswvl_final, best_mn, full=True)
        else:
            finalsynth = self.synthetic(self.obswvl_final,
                                        best_mn[0],
                                        best_mn[1],
                                        full=True)

        # Output the final data
        if output:

            if len(np.atleast_1d(best_mn)) == 1:
                #finalsynthup 	= self.synthetic(self.obswvl_final, best_mn + error, full=True)
                #finalsynthdown 	= self.synthetic(self.obswvl_final, best_mn - error, full=True)
                finalsynthup = self.synthetic(self.obswvl_final,
                                              best_mn + 0.15,
                                              full=True)
                finalsynthdown = self.synthetic(self.obswvl_final,
                                                best_mn - 0.15,
                                                full=True)
            else:
                #finalsynthup = self.synthetic(self.obswvl_final, best_mn[0] + error[0], best_mn[1], full=True)
                #finalsynthdown = self.synthetic(self.obswvl_final, best_mn[0] - error[0], best_mn[1], full=True)
                finalsynthup = self.synthetic(self.obswvl_final,
                                              best_mn[0] + 0.15,
                                              best_mn[1],
                                              full=True)
                finalsynthdown = self.synthetic(self.obswvl_final,
                                                best_mn[0] - 0.15,
                                                best_mn[1],
                                                full=True)

            # Create file
            filename = self.outputname + '/' + self.specname + '_data.csv'

            # Define columns
            columnstr = [
                'wvl', 'obsflux', 'synthflux', 'synthflux_up',
                'synthflux_down', 'ivar'
            ]
            columns = np.asarray([
                self.obswvl_final, self.obsflux_final, finalsynth,
                finalsynthup, finalsynthdown, self.ivar_final
            ])

            with open(filename, 'w') as csvfile:
                datawriter = csv.writer(csvfile, delimiter=',')

                # Write header
                datawriter.writerow(['[Mn/H]', best_mn[0]])
                if len(np.atleast_1d(best_mn)) > 1:
                    datawriter.writerow(['dlam', best_mn[1]])
                datawriter.writerow(columnstr)

                # Write data
                for i in range(len(finalsynth)):
                    datawriter.writerow(columns[:, i])

            # Make plots
            if plots:
                make_plots(self.lines,
                           self.specname + '_',
                           self.obswvl_final,
                           self.obsflux_final,
                           finalsynth,
                           self.outputname,
                           ivar=self.ivar_final,
                           synthfluxup=finalsynthup,
                           synthfluxdown=finalsynthdown,
                           hires=hires)

        elif plots:
            make_plots(self.lines,
                       self.specname + '_',
                       self.obswvl_final,
                       self.obsflux_final,
                       finalsynth,
                       self.outputname,
                       ivar=self.ivar_final,
                       hires=hires)

        return best_mn, error
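
The fit above weights each point by converting inverse variance into per-point sigmas for scipy.optimize.curve_fit. A self-contained sketch of the same weighting on synthetic data; the linear model, noise level, and absolute_sigma flag are illustrative assumptions, not part of the original listing:

import numpy as np
import scipy.optimize

def model(x, a, b):
    return a * x + b

rng = np.random.default_rng(0)
x = np.linspace(0.0, 10.0, 50)
ivar = np.full_like(x, 4.0)                    # inverse variance, i.e. sigma = 0.5 per point
y = model(x, 1.5, -2.0) + rng.normal(0.0, 0.5, x.size)

popt, pcov = scipy.optimize.curve_fit(
    model, x, y,
    p0=[1.0, 0.0],
    sigma=np.sqrt(np.reciprocal(ivar)),        # same inverse-variance-to-sigma conversion as the listing
    absolute_sigma=True)
perr = np.sqrt(np.diag(pcov))                  # 1-sigma parameter uncertainties
print(popt, perr)
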
Example #11
0
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0



    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print ('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print ('init a2c_minibatch agent')
    elif algo == 'a2c_list_rollout':
        agent = a2c_list_rollout(envs, model_dict)
        print ('init a2c_list_rollout agent')
    elif algo == 'a2c_with_var':
        agent = a2c_with_var(envs, model_dict)
        print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if args.load_path != '':
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     agent.actor_critic = torch.load(args.load_path).cuda()
    #     print ('loaded ', args.load_path)












    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    #Begin training
    # count =0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]))#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy, done)





        #Optimize agent
        agent.update()  #agent.update(j,num_updates)
        agent.insert_first_state(agent.rollouts.states[-1])





        total_num_steps = (j + 1) * num_processes * num_steps
        
        if total_num_steps % save_interval == 0 and save_dir != "":

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        #Print updates
        if j % log_interval == 0:
            end = time.time()

            if j % (log_interval*30) == 0:

                #update plots
                try:
                    make_plots(model_dict)
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated")
                except:
                    # raise
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time")

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".
                    format(j, total_num_steps,
                           final_rewards.min(),
                           final_rewards.median(),
                           final_rewards.mean(),
                           final_rewards.max(),
                           int(total_num_steps / (end - start)),
                           end - start))#, agent.current_lr)
    
    try:
        make_plots(model_dict)
    except:
        print ()
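
The reward bookkeeping in update_rewards above can be traced with toy numbers: masks is 0 for processes whose episode just ended and 1 otherwise. A minimal sketch for two processes where only the second one has finished:

import torch

reward = torch.tensor([[1.0], [0.0]])          # [P, 1]
done = [False, True]

episode_rewards = torch.tensor([[3.0], [5.0]])
final_rewards = torch.zeros(2, 1)

episode_rewards += reward
masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])  # 0 where done

final_rewards *= masks                          # clear slots whose episode just ended
final_rewards += (1 - masks) * episode_rewards  # record the finished episode's return
episode_rewards *= masks                        # reset the finished episode's counter

print(final_rewards)    # tensor([[0.], [5.]])
print(episode_rewards)  # tensor([[4.], [0.]])
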
Example #12
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--labels",
        type=str,
        help="Experiments to launch.",
        default="inc_dec_1",
    )
    parser.add_argument(
        "--n_jobs", type=int, help="Number of jobs to launch in parallel.", default=1
    )
    parser.add_argument(
        "--n_discretization",
        type=int,
        help="Discretization of the time-horizon.",
        default=100,
    )
    parser.add_argument(
        "--n_seeds",
        type=int,
        help="Number of random seeds to run per experiment.",
        default=30,
    )
    parser.add_argument("--optimal", action="store_true")
    parser.add_argument("--randomize_bandit", action="store_true")
    parser.add_argument("--compute_regret", action="store_true")
    args = parser.parse_args()

    pickle_path = os.path.join(RESULTS_FOLDER, PICKLE_FOLDER)
    os.makedirs(pickle_path, exist_ok=True)
    labels = args.labels.split(",")
    print("Running experiments:", ", ".join(labels))

    # Collect selected experiments
    jobs = []
    for experiment_label in labels:
        experiment_bandits, solvers = EXPERIMENTS[experiment_label]
        if args.optimal:
            solvers = [OptimalSolver]
        for bandit_n, bandit in enumerate(experiment_bandits):
            if args.optimal:
                T_array = np.arange(1, bandit.Tmax + 1)
            else:
                T_array = np.linspace(
                    2 * bandit.n + 1, bandit.Tmax, args.n_discretization, dtype=int
                )
            for solver in solvers:
                if bandit.stochastic or solver.stochastic:
                    n_runs = args.n_seeds
                else:
                    n_runs = 1
                for _ in range(n_runs):
                    for T in T_array:
                        jobs.append(
                            {
                                "bandit": bandit,
                                "solver": solver,
                                "T": T,
                                "randomize_bandit": args.randomize_bandit,
                                "compute_regret": args.compute_regret,
                            }
                        )

    # Run experiments
    start_time = time.time()
    if args.n_jobs == 1:
        results = []
        for job in jobs:
            result = run_bandit_solver(job)
            results.append(result)
    else:
        with mp.get_context("spawn").Pool(args.n_jobs) as p:
            results = p.map(run_bandit_solver, jobs, chunksize=1)

    # Aggregate results
    results_aggregated = dict()
    for res in results:
        bandit = res["bandit"]
        solver = res["solver"]
        T = res["T"]
        policy = res["policy"]
        cumulative_reward = res["cumulative_reward"]
        single_peaked_bandits = res["single_peaked_bandits"]

        if (bandit, solver) not in results_aggregated:
            results_aggregated[(bandit, solver)] = dict()
        if T not in results_aggregated[(bandit, solver)]:
            results_aggregated[(bandit, solver)][T] = []
        results_aggregated[(bandit, solver)][T].append(
            (policy, cumulative_reward, single_peaked_bandits)
        )

    # Write results
    for (bandit, solver), results in results_aggregated.items():
        T_list = []
        policy_list = [[] for _ in range(args.n_seeds)]
        cumulative_reward_list = [[] for _ in range(args.n_seeds)]
        for T in sorted(results.keys()):
            T_list.append(T)
            for i in range(len(results[T])):
                policy, cumulative_reward, single_peaked_bandits = results[T][i]
                policy_list[i].append(tuple(policy))
                if args.compute_regret:
                    cumulative_reward_list[i].append(single_peaked_bandits)
                else:
                    cumulative_reward_list[i].append(cumulative_reward)

        pickle_file = os.path.join(pickle_path, f"{bandit}_{solver}_result.p")
        with open(pickle_file, "wb") as f:
            pickle.dump(
                (
                    bandit,
                    solver,
                    tuple(T_list),
                    tuple([tuple(x) for x in policy_list]),
                    tuple([tuple(x) for x in cumulative_reward_list]),
                ),
                f,
            )

    make_plots()
    print("Done in", time.time() - start_time)
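
The aggregation step above groups the flat list of per-job results into a dict keyed first by (bandit, solver) and then by horizon T. A stripped-down sketch of the same grouping, with strings standing in for the bandit and solver objects and a made-up reward field:

results = [
    {"bandit": "b1", "solver": "greedy", "T": 10, "reward": 3.0},
    {"bandit": "b1", "solver": "greedy", "T": 10, "reward": 2.5},
    {"bandit": "b1", "solver": "greedy", "T": 20, "reward": 7.0},
]

aggregated = {}
for res in results:
    key = (res["bandit"], res["solver"])
    aggregated.setdefault(key, {}).setdefault(res["T"], []).append(res["reward"])

print(aggregated)
# {('b1', 'greedy'): {10: [3.0, 2.5], 20: [7.0]}}
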
def main():
    now = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    model_name = 'pretrain_vgg16_' + now + '.h5'
    batch_size = 64
    num_epochs = 30
    lr = .0001

    num_train_samples = len(os.listdir('./data/train/cancer')) + len(os.listdir('./data/train/healthy'))
    num_valid_samples = len(os.listdir('./data/validation/cancer')) + len(os.listdir('./data/validation/healthy'))

    # Build our model
    input_tensor = Input(shape=(96, 96, 3))
    vgg = VGG16(include_top=False, input_shape=(96, 96, 3))
    x = vgg(input_tensor)
    z = layers.Flatten()(x)
    z = layers.Dropout(.5)(z)
    z = layers.Dense(256, activation='relu')(z)

    output_tensor = layers.Dense(1, activation='sigmoid')(z)

    vgg.trainable = True
    set_trainable = False
    for layer in vgg.layers:
        if layer.name == 'block5_conv1':
            set_trainable = True
        if set_trainable:
            layer.trainable = True
        else:
            layer.trainable = False
    vgg.summary()
    model = Model(input_tensor, output_tensor)
    model.summary()

    # Get things ready to train: tweak learning rate, etc.
    model.compile(optimizer=Adam(lr), loss='binary_crossentropy', metrics=['acc'])

    train_generator = train_gen(batch_size)
    validation_generator = valid_gen(batch_size)

    steps_per_epoch = num_train_samples / batch_size
    validation_steps = num_valid_samples / batch_size

    # Basic callbacks
    checkpoint = callbacks.ModelCheckpoint(filepath='./models/' + model_name,
                                           monitor='val_loss',
                                           save_best_only=True)
    early_stop = callbacks.EarlyStopping(monitor='val_acc',
                                         patience=10)
    csv_logger = callbacks.CSVLogger('./logs/' + model_name.split('.')[0] + '.csv')

    callback_list = [checkpoint, early_stop, csv_logger]

    # Training begins
    history = model.fit_generator(train_generator,
                                  steps_per_epoch=steps_per_epoch,
                                  epochs=num_epochs,
                                  verbose=1,
                                  callbacks=callback_list,
                                  validation_data=validation_generator,
                                  validation_steps=validation_steps)

    model.save('./models/' + model_name)

    make_plots(history, model_name)
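
The freezing loop above flips set_trainable once it reaches block5_conv1, so only VGG16's last convolutional block is fine-tuned. The same idea written as a small helper; unfreeze_from is a hypothetical name, not part of the original code:

def unfreeze_from(conv_base, first_trainable_layer):
    """Freeze every layer before `first_trainable_layer`; unfreeze it and all later layers."""
    conv_base.trainable = True
    set_trainable = False
    for layer in conv_base.layers:
        if layer.name == first_trainable_layer:
            set_trainable = True
        layer.trainable = set_trainable

# e.g. unfreeze_from(vgg, 'block5_conv1') before compiling the model
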
Example #14
0
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda

        # print (current_state)
        # fdsf

        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0

    action_size = envs.action_space.n
    model_dict['action_size']=action_size



    # Create agent
    if algo == 'a2c':
        agent = a2c(model_dict)
        print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    #Load model
    if model_dict['load_params']:
        # agent.actor_critic = torch.load(os.path.join(args.load_path))
        # agent.actor_critic = torch.load(args.load_path).cuda()
        
        # print ('loaded ', args.load_path)

        if model_dict['load_number'] == 3:
            load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

        elif model_dict['load_number'] == 6:
            load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
        elif model_dict['load_number'] == 9:
            load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

        # else:
        #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
        else:
            raise ValueError('unexpected load_number: {}'.format(model_dict['load_number']))















    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]/255.))#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 


            # if np.sum(reward) > 0.:
            #     print (reward)
            #     afdas

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done)





        #Optimize agent
        agent.update()  #agent.update(j,num_updates)
        agent.insert_first_state(agent.rollouts.states[-1])


        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                # do_params(save_dir, agent, total_num_steps, model_dict)
                # save_params_v2(save_dir, agent, total_num_steps, model_dict)
                save_params_v3(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2)
            print(to_print_info_string) 
            start2 = time.time()



            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string)
                    raise  #change to `pass` to keep training when plotting fails



    try:
        make_plots(model_dict)
    except:
        print ()
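
Example #14 checkpoints by update index rather than by raw step count: save_interval, given in environment steps, is converted into a number of updates. A small sketch of that bookkeeping with made-up values:

num_processes = 16
num_steps = 5              # env steps collected per process per update
save_interval = 100000     # desired checkpoint interval, in environment steps

steps_per_update = num_processes * num_steps
save_interval_num_updates = int(save_interval / steps_per_update)

checkpoint_steps = [
    (j + 1) * steps_per_update
    for j in range(5000)
    if j != 0 and j % save_interval_num_updates == 0
]
print(checkpoint_steps[:3])   # [100080, 200080, 300080] -- roughly every save_interval env steps
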
Example #15
0
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if model_dict['load_params']:
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     # agent.actor_critic = torch.load(args.load_path).cuda()

    #     # print ('loaded ', args.load_path)

    #     if model_dict['load_number'] == 3:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

    #     elif model_dict['load_number'] == 6:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
    #     elif model_dict['load_number'] == 9:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

    #     # else:
    #     #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
    #     else:
    #         PROBLEM

    #load model
    # if model_dict['load_params']:

    # load_params(thigns)
    # param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
    param_file = home + '/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/seed1/model_params3/model_params9999360.pt'

    # pretrained_dict = torch.load(param_file)  # object
    # print (pretrained_dict)
    # agent_dict = agent.actor_critic.state_dict()  #dict
    # print (agent_dict.keys())
    # agent_dict.update(pretrained_dict)
    # # agent_dict.update(agent.actor_critic)
    # agent.actor_critic.load_state_dict(agent_dict)

    param_dict = torch.load(param_file)
    agent.actor_critic.load_state_dict(param_dict)
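    # Note: load_state_dict() is strict by default, so param_dict must cover every key
    # in agent.actor_critic.state_dict(); the commented-out block above sketches a
    # looser merge (dict.update) that would tolerate a partial checkpoint.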

    # agent.actor_critic = torch.load(param_file)
    agent.actor_critic.cuda()
    print('loaded', param_file)



    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  #add the new frame, remove oldest
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)
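    # Worked example with hypothetical settings: num_frames=10e6, num_steps=5,
    # num_processes=16 gives num_updates = 10_000_000 // 5 // 16 = 125000 updates,
    # and save_interval=1e6 gives save_interval_num_updates = int(1e6 / 16 / 5) = 12500,
    # i.e. a checkpoint roughly every 1M environment steps.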

    # list of trajectories; each trajectory is a list of [action, state] pairs
    dataset = []
    tmp_trajs = [[] for x in range(num_processes)]

    dataset_count = 0

    done = [0] * num_processes

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action, action_log_probs, dist_entropy = agent.act(
                Variable(agent.rollouts.states[step]))  #, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # y = torch.LongTensor(batch_size,1).random_() % nb_digits
            # # One hot encoding buffer that you create out of the loop and just keep reusing
            # y_onehot = torch.FloatTensor(batch_size, nb_digits)
            # # In your for loop
            # y_onehot.zero_()
            # y_onehot.scatter_(1, y, 1)

            states_ = agent.rollouts.states[step].cpu().numpy()  #[P,S,84,84]
            # print (state_t.shape)
            actions_ = action.data.cpu().numpy()  #[P,1]
            # print (action)

            #store step
            for proc in range(num_processes):

                #add states
                state_t = states_[proc]
                action_t = actions_[proc]
                tmp_trajs[proc].append([action_t, state_t])

                if done[proc]:

                    dataset.append(tmp_trajs[proc])
                    dataset_count += len(tmp_trajs[proc])
                    tmp_trajs[proc] = []

                    for ii in range(len(dataset)):
                        print(len(dataset[ii]))

            if dataset_count > 10000:

                # pickle.dump( dataset, open(home+'/Documents/tmp/breakout_2frames/breakout_trajectories_10000.pkl', "wb" ) )
                pickle.dump(
                    dataset,
                    open(
                        home +
                        '/Documents/tmp/RoadRunner/trajectories_10000.pkl',
                        "wb"))

                print('saved')
                # pickle.save(dataset)
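                # Usage sketch (not executed here): the pickle written above can be
                # read back as a list of trajectories, one per completed episode,
                # where each trajectory is a list of [action, state] pairs, e.g.:
                #   with open(home + '/Documents/tmp/RoadRunner/trajectories_10000.pkl', 'rb') as f:
                #       trajectories = pickle.load(f)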
                return  #stop once the trajectory dataset has been saved

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy)  #, done)

        # print (len(dataset))
        # print ()

        #Optimize agent
        # agent.update()  #agent.update(j,num_updates)
        agent.insert_first_state(agent.rollouts.states[-1])

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
                # save_params_v2(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        #Print updates
        if j % log_interval == 0:  # and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2)
            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string)
                    raise  #pass

    try:
        make_plots(model_dict)
    except:
        print()
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #shift the stack: the oldest frame's channels are dropped and the remaining frames slide forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0



    # # Create agent
    # if algo == 'a2c':
    #     agent = a2c(envs, model_dict)
    #     print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')

    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if model_dict['load_params']:
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     # agent.actor_critic = torch.load(args.load_path).cuda()
        
    #     # print ('loaded ', args.load_path)

    #     if model_dict['load_number'] == 3:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

    #     elif model_dict['load_number'] == 6:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
    #     elif model_dict['load_number'] == 9:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

    #     # else:
    #     #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
    #     else:
    #         PROBLEM


    print ('Init expert agent')
    expert_agent = a2c(envs, model_dict)
    param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'    
    param_dict = torch.load(param_file)
    expert_agent.actor_critic.load_state_dict(param_dict)
    print ('loaded params', param_file)
    expert_agent.actor_critic.cuda()



    print ('Init imitator agent')
    imitator_agent = a2c(envs, model_dict)
    # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params.ckpt'  
    # param_dict = torch.load(param_file)
    # imitator_agent.actor_critic.load_state_dict(param_dict)
    # print ('loaded params', param_file)
    imitator_agent.actor_critic.cuda()







    agent = expert_agent
    expert_policy = expert_agent.actor_critic

    imitator_policy = imitator_agent.actor_critic
    optimizer = optim.Adam(imitator_policy.parameters(), lr=.0005, weight_decay=.00001)

    total_steps = 0

    display_step = 50






    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)




    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            state__ = Variable(agent.rollouts.states[step]) / 255.
            value, action, action_log_probs, dist_entropy = agent.act(state__) #, requires_grad=False)#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())





            batch = state__

            optimizer.zero_grad()

            log_dist_expert = expert_policy.action_logdist(batch)
            log_dist_imitator = imitator_policy.action_logdist(batch)

            action_dist_kl = torch.sum((log_dist_expert - log_dist_imitator)*torch.exp(log_dist_expert), dim=1) #[B]

            # elbo, logpx, logpz, logqz, action_dist_kl = self.forward(batch, policy, k=k)
            loss = torch.mean(action_dist_kl)

            loss.backward()
            # nn.utils.clip_grad_norm(self.parameters(), .5)
            optimizer.step()
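            # The loss above is the forward KL D_KL(expert || imitator) per state,
            # sum_a p_expert(a|s) * (log p_expert(a|s) - log p_imitator(a|s)),
            # averaged over the batch; action_logdist() is assumed to return
            # log-probabilities over the discrete action set with shape [B, A].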

            # if total_steps%display_step==0: # and batch_idx == 0:
            #     # print ('Train Epoch: {}/{}'.format(epoch+1, epochs),
            #         # 'total_epochs {}'.format(total_epochs),
            #         print('LL:{:.4f}'.format(loss.data[0])
            #         # 'logpx:{:.4f}'.format(logpx.data[0]),
            #         # 'logpz:{:.5f}'.format(logpz.data[0]),
            #         # 'logqz:{:.5f}'.format(logqz.data[0]),
            #         # 'action_kl:{:.4f}'.format(action_dist_kl.data[0])
            #         )

            # total_steps+=1






            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done)





        #Optimize agent


        agent.no_update()  #agent.update(j,num_updates)
        # agent.update()  #agent.update(j,num_updates)
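        # The expert policy is kept frozen: no_update() is assumed to skip the
        # policy-gradient step and only advance rollout bookkeeping, while the
        # imitator is trained above by optimizer.step() on the KL loss.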


        agent.insert_first_state(agent.rollouts.states[-1])


        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            save_to = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params_env.ckpt'
            torch.save(imitator_policy.state_dict(), save_to)
            print ('saved imitator_policy', save_to)

            # #Save model
            # if save_params:
            #     do_params(save_dir, agent, total_num_steps, model_dict)
            #     # save_params_v2(save_dir, agent, total_num_steps, model_dict)

                
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}, {:.4f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2, 
                                       loss.data[0])


            # to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
            #                            final_rewards.min(),
            #                            final_rewards.median(),
            #                            final_rewards.mean(),
            #                            final_rewards.max(),
            #                            int(total_num_steps / (end - start)),
            #                            end - start,
            #                            end - start2)


            print(to_print_info_string) 
            start2 = time.time()



            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string)
                    raise #pass



    try:
        make_plots(model_dict)
    except:
        print ()
Example #17
0
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #shift the stack: the oldest frame's channels are dropped and the remaining frames slide forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        # if see_frames:
        #     #Grayscale
        #     save_frame(state, count)
        #     count+=1
        #     if done[0]:
        #         ffsdfa
        #     #RGB
        #     state = envs.render()
        #     print(state.shape)
        #     fdsafa

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    def do_vid():
        n_vids = 3
        for i in range(n_vids):
            done = False
            state = envs_video.reset()
            # state = torch.from_numpy(state).float().type(dtype)
            current_state = torch.zeros(1, *obs_shape)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0).type(dtype)
            # print ('Recording')
            # count=0
            while not done:
                # print (count)
                # count +=1
                # Act
                state_var = Variable(current_state, volatile=True)
                # print (state_var.size())
                action, value = agent.act(state_var)
                cpu_actions = action.data.squeeze(1).cpu().numpy()

                # Observe reward and next state
                state, reward, done, info = envs_video.step(
                    cpu_actions)  # state:[nProcesss, ndims, height, width]
                # state = torch.from_numpy(state).float().type(dtype)
                # current_state = torch.zeros(1, *obs_shape)
                current_state = update_current_state(current_state, state,
                                                     shape_dim0).type(dtype)
        state = envs_video.reset()

        vid_path = save_dir + '/videos/'
        count = 0
        for aaa in os.listdir(vid_path):

            if 'openaigym' in aaa and '.mp4' in aaa:
                #os.rename(vid_path+aaa, vid_path+'vid_t'+str(total_num_steps)+'.mp4')
                subprocess.call("(cd " + vid_path + " && mv " + vid_path +
                                aaa + " " + vid_path + env_name + '_' + algo +
                                '_vid_t' + str(total_num_steps) + '_' +
                                str(count) + ".mp4)",
                                shell=True)
                count += 1
            if '.json' in aaa:
                os.remove(vid_path + aaa)

    def save_frame(state, count):

        frame_path = save_dir + '/frames/'
        if not os.path.exists(frame_path):
            os.makedirs(frame_path)
            print('Made dir', frame_path)

        state1 = np.squeeze(state[0])
        # print (state1.shape)
        fig = plt.figure(figsize=(4, 4), facecolor='white')
        plt.imshow(state1, cmap='gray')
        plt.savefig(frame_path + 'frame' + str(count) + '.png')
        print('saved', frame_path + 'frame' + str(count) + '.png')
        plt.close(fig)

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    vid_ = 1
    see_frames = 0

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if args.load_path != '':
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     agent.actor_critic = torch.load(args.load_path).cuda()
    #     print ('loaded ', args.load_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  #add the new frame, remove oldest
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    #Begin training
    # count =0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P,1]
            action, value = agent.act(
                Variable(agent.rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)

            # Update state
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data,
                              reward, masks)

        #Optimize agent
        agent.update()  #agent.update(j,num_updates)
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        #Save model
        if total_num_steps % save_interval == 0 and save_dir != "":
            save_path = os.path.join(save_dir, 'model_params')
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = agent.actor_critic
            if cuda:
                save_model = copy.deepcopy(agent.actor_critic).cpu()
            # torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            # steps_sci_nota = '{e}'.format(total_num_steps)
            save_to = os.path.join(
                save_path, "model_params" + str(total_num_steps) + ".pt")
            # save_to=os.path.join(save_path, "model_params" + steps_sci_nota+".pt")
            torch.save(save_model, save_to)
            print('saved', save_to)

            #make video
            if vid_:
                do_vid()

        #Print updates
        if j % log_interval == 0:
            end = time.time()

            if j % (log_interval * 30) == 0:

                #update plots
                try:
                    make_plots(model_dict)
                    print(
                        "Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated"
                    )
                except:
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time")
                    raise

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".format(
                j, total_num_steps, final_rewards.min(),
                final_rewards.median(), final_rewards.mean(),
                final_rewards.max(), int(total_num_steps / (end - start)),
                end - start))  #, agent.current_lr)

    try:
        make_plots(model_dict)
    except:
        print()
Example #18
0
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #shift the stack: the oldest frame's channels are dropped and the remaining frames slide forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)

    if grad_var_:
        print('env for grad_var_')
        envs_grad_var = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels
    action_space = envs.action_space

    model_dict['action_space'] = action_space
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    next_state_pred_ = 0
    model_dict['next_state_pred_'] = next_state_pred_

    n_contexts = 2
    model_dict['n_contexts'] = n_contexts

    # Create agent
    # if algo == 'a2c':
    agent = a2c(model_dict)
    print('init a2c agent')

    discriminator = CNN_Discriminator(num_steps, n_contexts, model_dict).cuda()
    print('init discriminator')

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state, shape_dim0).type(
            dtype)  #add the new frame, remove oldest, since its a stack
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes,
                                 1])  #when an episode completes, its return is stored here

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    context_probs = torch.ones(n_contexts) / n_contexts

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        context_np = np.random.choice(n_contexts, num_processes)
        context = torch.from_numpy(context_np).view(num_processes, 1)
        context_onehot = torch.FloatTensor(num_processes, n_contexts).zero_()
        context_onehot.scatter_(1, context, 1)  # [P,C]
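        # e.g. with n_contexts=2 and context = [[0],[1],[0]], scatter_ yields
        # context_onehot = [[1,0],[0,1],[1,0]] (one row per process).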

        list_frames = []
        for step in range(num_steps):

            #Sample context
            # context = torch.unsqueeze(context_probs.multinomial(num_processes), dim=1) # [P,1]

            # print (torch.multinomial.sample(context_probs, num_processes))

            # print (np.random.multinomial(num_processes, [1./n_contexts]*n_contexts))
            # print (np.random.choice(n_contexts, num_processes)) #[1./n_contexts]*n_contexts))

            # Act, [P,1], [P], [P,1], [P]
            state_pytorch = Variable(agent.rollouts.states[step])
            value, action, action_log_probs, dist_entropy = agent.act(
                state_pytorch, context_onehot)  #, volatile=True))

            # print (context_np)
            # print (action)

            #ACTIONS
            #['NOOP', 'FIRE', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'UPRIGHT', 'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT',
            #'UPFIRE', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']

            # TO FIX THE ACTIONS
            # action = action.data
            # # print (action)
            # # fdasf
            # for i in range(len(context_np)):
            #     if context_np[i] == 0:
            #         action[i] = 4
            #     else:
            #         action[i] = 3
            # action = Variable(action)
            # # print (action)
            # # print (action)
            # # fadsf

            # # TO FIX THE ACTIONS 2
            # action = action.data
            # # print (action)
            # # fdasf
            # for i in range(len(action)):
            #     if action[i].cpu().numpy() >= 8:
            #         action[i] = 0
            #     # else:
            #     #     action[i] = 3
            # action = Variable(action)
            # # print (action)
            # # print (action)
            # # fadsf

            # Apply to Environment, S:[P,C,H,W], R:[P], D:[P]
            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]
            # print (cpu_actions)
            state, reward, done, info = envs.step(cpu_actions)

            # print (state.shape) #[P,1,84,84]
            list_frames.append(torch.FloatTensor(state))

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy,
                              0)  #, done)

        #Optimize discriminator
        if j % 2 == 0:
            discriminator_error = discriminator.update(list_frames,
                                                       context)  #[P]
            if j == 0:
                print('multiple updates')
                for jj in range(20):
                    discriminator_error = discriminator.update(
                        list_frames, context)
                    # print (torch.mean(discriminator_error).data.cpu().numpy()[0])
                # fasds
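        # The j == 0 branch above appears to be a warm-up: the discriminator gets 20
        # extra updates on the first rollout so its error signal is informative
        # before it is fed into the agent's update below.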

        grad_sum = agent.actor_critic.graddd(state_pytorch, context_onehot)

        #Optimize agent
        # agent.update(context_onehot, discriminator_error)  #agent.update(j,num_updates)
        agent.update2(context_onehot, discriminator_error,
                      grad_sum)  #agent.update(j,num_updates)

        agent.insert_first_state(agent.rollouts.states[-1])

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)
            #make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae,
                              update_current_state, total_num_steps)
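            # NOTE: `vae` passed to do_prob_state above is not defined in this
            # function; it is assumed to be provided by the enclosing module.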
            # #make vae prob gif
            # if grad_var_:
            #     do_grad_var(envs_grad_var, agent, model_dict, update_current_state, total_num_steps)

        #Print updates
        if j % log_interval == 0:  # and j!=0:
            end = time.time()

            # print (torch.mean(discriminator_error).data.cpu().numpy()[0])
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.3f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2,
                torch.mean(discriminator_error).data.cpu().numpy()[0])

            print(to_print_info_string)

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, D_error"  #, elbo"
            start2 = time.time()

            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)

                # if grad_var_  and j % (log_interval*300) == 0:
                if grad_var_ and j % (log_interval * 30) == 0:
                    #writes to file
                    do_grad_var(envs_grad_var, agent, model_dict,
                                total_num_steps, update_current_state,
                                update_rewards)

                # # THIS IS TO SEE PREDICTIONS

                # nstep_frames = torch.stack(list_frames)  #[N, P, 1,84,84]
                # nstep_frames = torch.transpose(nstep_frames, 0,1)
                # nstep_frames = torch.squeeze(nstep_frames) #its [P,N,84,84] so its like a batch of N dimensional images
                # nstep_frames = Variable(nstep_frames).cuda()
                # pred = F.softmax(discriminator.predict(nstep_frames), dim=1)
                # print (pred, context_np)

                # rows = 1
                # cols = 2
                # fig = plt.figure(figsize=(1+cols,15+rows), facecolor='white')

                # zero_comp = 0
                # one_comp = 0
                # for ii in range(len(context_np)):

                #     if context_np[ii] == 0 and not zero_comp:
                #         print (ii)
                #         imgg = nstep_frames[ii].view(num_steps*84,84).data.cpu().numpy()

                #         # imgg = nstep_frames[ii].view(num_steps*84//2,84*2)
                #         # # imgg = nstep_frames[ii].view(num_steps*84,84)
                #         # imgg = imgg.data.cpu().numpy()

                #         ax = plt.subplot2grid((rows,cols), (0,0), frameon=False) #, rowspan=7)

                #         # print (imgg.shape)
                #         # plt.imshow(imgg, cmap=plt.get_cmap('gray'))
                #         ax.imshow(imgg, cmap=plt.get_cmap('gray'))
                #         ax.set_yticks([])
                #         # plt.savefig(model_dict['exp_path']+'img0.pdf')
                #         # print (model_dict['exp_path']+'img.png')
                #         zero_comp =1
                #     if context_np[ii] == 1 and not one_comp:
                #         print (ii)
                #         imgg = nstep_frames[ii].view(num_steps*84,84).data.cpu().numpy()

                #         # imgg = nstep_frames[ii].view(num_steps*84//2,84*2)
                #         # # imgg = nstep_frames[ii].view(num_steps*84,84)
                #         # imgg = imgg.data.cpu().numpy()

                #         ax = plt.subplot2grid((rows,cols), (0,1), frameon=False) #, rowspan=7)
                #         # print (imgg.shape)
                #         # plt.imshow(imgg, cmap=plt.get_cmap('gray'))
                #         ax.imshow(imgg, cmap=plt.get_cmap('gray'))
                #         ax.set_yticks([])

                #         # plt.savefig(model_dict['exp_path']+'img1.pdf')
                #         # print (model_dict['exp_path']+'img.png')
                #         one_comp =1
                #     if zero_comp and one_comp:
                #         print ('plotted both')

                #         # imgg = nstep_frames[20].view(num_steps*84,84).data.cpu().numpy()
                #         # plt.imshow(imgg, cmap=plt.get_cmap('gray'))
                #         # plt.savefig(model_dict['exp_path']+'img_20.pdf')

                #         # fdfaa  fig = plt.figure(figsize=(4+cols,1+rows), facecolor='white')

                #         plt.savefig(model_dict['exp_path']+'img_both.pdf')

                #         ffasd

                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)

                    # if grad_var_ and j % (log_interval*300) == 0:
                    if grad_var_ and j % (log_interval * 30) == 0:
                        update_grad_plot(model_dict)
                        to_print_legend_string += ' grad_var_plot updated '

                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string + " problem with plot")
                    raise  #pass

    try:
        make_plots(model_dict)
    except:
        print()
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #shift the stack: the oldest frame's channels are dropped and the remaining frames slide forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    # # Create agent
    # if algo == 'a2c':
    #     agent = a2c(envs, model_dict)
    #     print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')

    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if model_dict['load_params']:
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     # agent.actor_critic = torch.load(args.load_path).cuda()

    #     # print ('loaded ', args.load_path)

    #     if model_dict['load_number'] == 3:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

    #     elif model_dict['load_number'] == 6:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
    #     elif model_dict['load_number'] == 9:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

    #     # else:
    #     #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
    #     else:
    #         PROBLEM

    print('Init expert agent')
    expert_agent = a2c(envs, model_dict)
    param_file = home + '/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    expert_agent.actor_critic.load_state_dict(param_dict)
    print('loaded params', param_file)
    expert_agent.actor_critic.cuda()

    print('Init imitator agent')
    imitator_agent = a2c(envs, model_dict)
    # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params.ckpt'
    # param_dict = torch.load(param_file)
    # imitator_agent.actor_critic.load_state_dict(param_dict)
    # print ('loaded params', param_file)
    imitator_agent.actor_critic.cuda()

    agent = expert_agent
    expert_policy = expert_agent.actor_critic

    imitator_policy = imitator_agent.actor_critic
    optimizer = optim.Adam(imitator_policy.parameters(),
                           lr=.0005,
                           weight_decay=.00001)

    total_steps = 0

    display_step = 50

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  #add the new frame, remove oldest
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            state__ = Variable(agent.rollouts.states[step]) / 255.
            value, action, action_log_probs, dist_entropy = agent.act(
                state__)  #, requires_grad=False)#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            batch = state__

            optimizer.zero_grad()

            log_dist_expert = expert_policy.action_logdist(batch)
            log_dist_imitator = imitator_policy.action_logdist(batch)

            action_dist_kl = torch.sum((log_dist_expert - log_dist_imitator) *
                                       torch.exp(log_dist_expert),
                                       dim=1)  #[B]

            # elbo, logpx, logpz, logqz, action_dist_kl = self.forward(batch, policy, k=k)
            loss = torch.mean(action_dist_kl)

            loss.backward()
            # nn.utils.clip_grad_norm(self.parameters(), .5)
            optimizer.step()

            # if total_steps%display_step==0: # and batch_idx == 0:
            #     # print ('Train Epoch: {}/{}'.format(epoch+1, epochs),
            #         # 'total_epochs {}'.format(total_epochs),
            #         print('LL:{:.4f}'.format(loss.data[0])
            #         # 'logpx:{:.4f}'.format(logpx.data[0]),
            #         # 'logpz:{:.5f}'.format(logpz.data[0]),
            #         # 'logqz:{:.5f}'.format(logqz.data[0]),
            #         # 'action_kl:{:.4f}'.format(action_dist_kl.data[0])
            #         )

            # total_steps+=1

            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy)  #, done)

        #Optimize agent

        agent.no_update()  #agent.update(j,num_updates)
        # agent.update()  #agent.update(j,num_updates)

        agent.insert_first_state(agent.rollouts.states[-1])

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            save_to = home + '/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params_env.ckpt'
            torch.save(imitator_policy.state_dict(), save_to)
            print('saved imitator_policy', save_to)

            # #Save model
            # if save_params:
            #     do_params(save_dir, agent, total_num_steps, model_dict)
            #     # save_params_v2(save_dir, agent, total_num_steps, model_dict)

            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        #Print updates
        if j % log_interval == 0:  # and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}, {:.4f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2, loss.data[0])

            # to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
            #                            final_rewards.min(),
            #                            final_rewards.median(),
            #                            final_rewards.mean(),
            #                            final_rewards.max(),
            #                            int(total_num_steps / (end - start)),
            #                            end - start,
            #                            end - start2)

            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, KL_loss"
            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string)
                    raise

    try:
        make_plots(model_dict)
    except:
        print()
Example #20
0
def plot_fits_postfacto(filename, paramfilename, galaxyname, slitmaskname, startstar=0, globular=False, lines='new', mn_cluster=None):
	""" Plot fits, residuals, and ivar for stars whose [Mn/H] abundances have already been measured.

	Inputs:
	filename 		-- file with observed spectra
	paramfilename 	-- file with parameters of observed spectra
	galaxyname		-- galaxy name, options: 'scl'
	slitmaskname 	-- slitmask name, options: 'scl1'

	Keywords:
	startstar		-- if 0 (default), start at beginning of file and write new datafile;
						else, start at #startstar and just append to datafile
	globular 		-- if False (default), write output to the dwarf galaxy path;
						else, write output to the globular cluster path
	lines 			-- if 'new' (default), use the new revised linelist;
						else, use the original linelist from Judy's code
	mn_cluster 		-- default None; if set, also plot the synthetic spectrum with [Mn/H] = mean [Mn/H] of the cluster

	"""

	# Input filename
	if globular:
		file = '/raid/madlr/glob/'+galaxyname+'/'+slitmaskname+'.csv'
	else:
		file = '/raid/madlr/dsph/'+galaxyname+'/'+slitmaskname+'.csv'

	# Output filepath
	if globular:
		outputname = '/raid/madlr/glob/'+galaxyname+'/'+slitmaskname
	else:
		outputname = '/raid/madlr/dsph/'+galaxyname+'/'+slitmaskname

	name  = np.genfromtxt(file, delimiter='\t', skip_header=1, usecols=0, dtype='str')
	mn    = np.genfromtxt(file, delimiter='\t', skip_header=1, usecols=8)
	mnerr = np.genfromtxt(file, delimiter='\t', skip_header=1, usecols=9)

	# Get number of stars in file with observed spectra
	Nstars = open_obs_file(filename)

	# Open file to store reduced chi-sq values
	chisqfile = outputname+'_chisq.txt'
	with open(chisqfile, 'w+') as f:
		# write the header row for the reduced chi-squared table
		f.write('Star'+'\t'+'Line'+'\t'+'redChiSq (best[Mn/H])'+'\t'+'redChiSq (best[Mn/H]+0.15)'+'\t'+'redChiSq (best[Mn/H]-0.15)'+'\n')

	# Plot spectra for each star
	for i in range(startstar, Nstars):

		try:

			# Check if parameters are measured
			temp, logg, fe, alpha, fe_err = open_obs_file(filename, retrievespec=i, specparams=True)
			if np.isclose(1.5,logg) and np.isclose(fe,-1.5) and np.isclose(fe_err, 0.0):
				print('Bad parameter measurement! Skipped #'+str(i+1)+'/'+str(Nstars)+' stars')
				continue

			# Open star
			star = chi_sq.obsSpectrum(filename, paramfilename, i, False, galaxyname, slitmaskname, globular, lines, plot=True)

			# Check if star has already had [Mn/H] measured
			if star.specname in name:

				# If so, open data file for star
				if globular:
					datafile = '/raid/madlr/glob/'+galaxyname+'/'+slitmaskname+'/'+str(star.specname)+'_data.csv'
				else:
					datafile = '/raid/madlr/dsph/'+galaxyname+'/'+slitmaskname+'/'+str(star.specname)+'_data.csv'

				# Get observed and synthetic spectra and inverse variance array
				obswvl 		= np.genfromtxt(datafile, delimiter=',', skip_header=2, usecols=0)
				obsflux 	= np.genfromtxt(datafile, delimiter=',', skip_header=2, usecols=1)
				synthflux 	= np.genfromtxt(datafile, delimiter=',', skip_header=2, usecols=2)
				#synthfluxup = np.genfromtxt(datafile, delimiter=',', skip_header=2, usecols=3)
				#synthfluxdown = np.genfromtxt(datafile, delimiter=',', skip_header=2, usecols=4)
				ivar 		= np.genfromtxt(datafile, delimiter=',', skip_header=2, usecols=5)

				idx = np.where(name == star.specname)

				synthfluxup 	= star.synthetic(obswvl, mn[idx] + 0.15, full=True)
				synthfluxdown 	= star.synthetic(obswvl, mn[idx] - 0.15, full=True)
				synthflux_nomn 	= star.synthetic(obswvl, -10.0, full=True)
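				# Synthetic spectra at the best-fit [Mn/H] +/- 0.15 dex bracket the fit, and
				# [Mn/H] = -10.0 gives an effectively Mn-free synthetic spectrum for comparison.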

				if mn_cluster is not None:
					synthflux_cluster = [mn_cluster, star.synthetic(obswvl, mn_cluster, full=True)]
				else:
					synthflux_cluster=None

				if mnerr[idx][0] < 1:
					# Run code to make plots
					make_plots(lines, star.specname+'_', obswvl, obsflux, synthflux, outputname, ivar=ivar, resids=True, synthfluxup=synthfluxup, synthfluxdown=synthfluxdown, synthflux_nomn=synthflux_nomn, synthflux_cluster=synthflux_cluster, title=None, savechisq=chisqfile)

					# Write all plotting data to a file
					hdr = 'Star '+str(star.specname)+'\n'+'obswvl\tobsflux\tsynthflux\tsynthfluxup\tsynthfluxdown\tsynthflux_nomn\n'
					np.savetxt(outputname+'/'+str(star.specname)+'_finaldata.csv', np.asarray((obswvl,obsflux,synthflux,synthfluxup,synthfluxdown,synthflux_nomn)).T, header=hdr)

		except Exception as e:
			print(repr(e))
			print('Skipped star #'+str(i+1)+'/'+str(Nstars)+' stars')
			continue

		print('Finished star '+star.specname, '#'+str(i+1)+'/'+str(Nstars)+' stars')

	return
Example #21
0
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']


    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    if vae_:
        print ('env for vae')
        envs_vae = make_env_basic(env_name)
        vae = VAE()  # VAE instance passed to do_prob_state below
        vae.cuda()

    if grad_var_:
        print ('env for grad_var_')
        envs_grad_var = make_env_basic(env_name)



    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0
    model_dict['action_size'] = envs.action_space.n
    print (envs.action_space.n, 'actions')



    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')

    elif algo == 'dqn':
        agent = DQN(envs, model_dict)
        print ('init DQN agent')  
        print (agent.q_net)   



    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest, since its a stack
    # agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)


    # dqn_epsilon = .1 #lower means less likely to do random .9 # .1

    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 50000
    epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)
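    # Exponential epsilon-greedy schedule: epsilon decays from 1.0 toward 0.01 with a time
    # constant of 50000 updates, e.g. epsilon_by_frame(50000) ~= 0.01 + 0.99*e**-1 ~= 0.37.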

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        dqn_epsilon = epsilon_by_frame(j)

        #Num steps till agent update
        # for step in range(num_steps):

        # Act, [P,1], [P,1], [P,1], [P]
        # state_pytorch = Variable(agent.rollouts.states[step])
        state_pytorch = Variable(current_state)
        # value, action, action_log_probs, dist_entropy = agent.act(state_pytorch, epsilon=dqn_epsilon)#, volatile=True))
        action = agent.act(state_pytorch, epsilon=dqn_epsilon)#, volatile=True))
        
        # Apply to Environment, S:[P,C,H,W], R:[P], D:[P]
        # cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
        frame, reward, done, info = envs.step(action) 

        # Record rewards and update state
        reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
        new_current_state = update_current_state(current_state, frame, shape_dim0)


        agent.replay_buffer.push(current_state, action, reward, new_current_state, done.astype(int))
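        # Store the (state, action, reward, next_state, done) transition for off-policy
        # DQN updates, which begin once the buffer holds more than 100 transitions (below).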

        current_state = new_current_state


        if len(agent.replay_buffer) > 100:
            agent.update()

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)
            #make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps)
            # #make vae prob gif
            # if grad_var_:
            #     do_grad_var(envs_grad_var, agent, model_dict, update_current_state, total_num_steps)

        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()


            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.2f}, {:.5f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2,
                                       dqn_epsilon,
                                       agent.loss.data.cpu().numpy()[0])
                                       # torch.mean(discrim_errors).data.cpu().numpy()[0])

            print(to_print_info_string)


            # if vae_:
            #     elbo =  "{:.2f}".format(elbo.data.cpu().numpy()[0])


            # if next_state_pred_:
            #     state_pred_error_print =  "{:.2f}".format(agent.state_pred_error.data.cpu().numpy()[0])
            #     print(to_print_info_string+' '+state_pred_error_print+' '+elbo)
            #     to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo"

            # else:
            # if vae_:
            #     print(to_print_info_string+' '+elbo)
            # else:
            # print(to_print_info_string)


            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, epsilon, loss"
            start2 = time.time()

            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                # if grad_var_  and j % (log_interval*300) == 0:
                if grad_var_  and j % (log_interval*30) == 0:
                    #writes to file
                    do_grad_var(envs_grad_var, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)

                    # if grad_var_ and j % (log_interval*300) == 0:
                    if grad_var_ and j % (log_interval*30) == 0:
                        update_grad_plot(model_dict)
                        to_print_legend_string += ' grad_var_plot updated '

                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")

                    # print (len(agent.replay_buffer))
                except:
                    print(to_print_legend_string + " problem with plot")
                    raise



    try:
        make_plots(model_dict)
    except:
        print ()
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    explore_ = model_dict['explore_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    next_state_pred_ = 0
    model_dict['next_state_pred_'] = next_state_pred_

    # Create agent
    # if algo == 'a2c':

    # agent = a2c(envs, model_dict)

    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if args.load_path != '':
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     agent.actor_critic = torch.load(args.load_path).cuda()
    #     print ('loaded ', args.load_path)

    # see_reward_episode = 0
    # if 'Montez' in env_name and see_reward_episode:
    #     states_list = [[] for i in range(num_processes)]

    # view_reward_episode(model_dict=model_dict, frames=[])

    # if vae_:
    #     vae = VAE()
    #     vae.cuda()

    print('init exploit a2c agent')
    agent_exploit = a2c(envs, model_dict)

    if explore_:
        print('init explore a2c agent')
        agent_explore = a2c(envs, model_dict)
        print('init vae')
        vae = VAE()
        vae.cuda()

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  #add the new frame, remove oldest

    agent_exploit.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step
    if explore_:
        agent_explore.insert_first_state(
            current_state
        )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # prev_action = Variable(torch.zeros([num_processes, 1]).type(torch.LongTensor)).cuda()

    # For normalizing the logprobs
    B = .99
    m = torch.FloatTensor([-100.]).cuda()
    v = torch.FloatTensor([10000.]).cuda()
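    # m and v are running estimates of the ELBO's mean and scale; they are updated below with an
    # exponential moving average (decay B = .99) and used to normalize the VAE ELBO before it
    # becomes the explore agent's intrinsic reward.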

    # prev_reward = torch.ones(num_processes,1).cuda()
    if model_dict['init_exploit_processes'] == -1:
        init_exploit_processes = num_processes
    else:
        init_exploit_processes = model_dict['init_exploit_processes']
    exploit_processes = init_exploit_processes
    # explore_processes = 16

    all_frames = []

    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        start3 = time.time()
        for step in range(num_steps):

            # start3 = time.time()
            state_pytorch = Variable(agent_exploit.rollouts.states[step]
                                     )  #, volatile=True) # [P,S,84,84]

            # exploit_state = state_pytorch[:exploit_processes]
            # explore_state = state_pytorch[exploit_processes:]

            u_value, u_action, u_action_log_probs, u_dist_entropy = agent_exploit.act(
                state_pytorch)
            if explore_:
                r_value, r_action, r_action_log_probs, r_dist_entropy = agent_explore.act(
                    state_pytorch)

            u_cpu_actions = u_action.data.squeeze(1).cpu().numpy()  #[P]
            if explore_:
                r_cpu_actions = r_action.data.squeeze(1).cpu().numpy()  #[P]

            #Choose how many you want from each
            cpu_actions = np.concatenate((u_cpu_actions[:exploit_processes],
                                          r_cpu_actions[exploit_processes:]),
                                         0)  #[P]
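            # The first exploit_processes environments take the exploit agent's actions;
            # the remaining processes take the explore agent's actions.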
            # cpu_actions = u_cpu_actions

            # before_step_time = time.time() - start3

            # Step, S:[P,C,H,W], R:[P], D:[P]
            # start3 = time.time()
            state, reward, done, info = envs.step(cpu_actions)
            # step_time = time.time() - start3
            # reward_numpy = reward
            # print (reward)

            # # for trainign vae.
            # for p in range(len(state)):
            #     # print (state[p].shape) #[1,84,84]
            #     # fasad
            #     all_frames.append(state[p])
            #     print (len(all_frames))
            #     if len(all_frames) == 10000:
            #         pickle.dump( all_frames, open(home + '/Documents/tmp/montezum_frames.pkl' , "wb" ) )
            #         print ('saved pkl')
            #         fafaadsfs

            # start3 = time.time()
            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)
            # current_state_u = current_state[:exploit_processes]
            # current_state_r = current_state[exploit_processes:]

            #Insert data for exploit agent
            agent_exploit.insert_data(step, current_state, u_action.data,
                                      u_value, reward, masks,
                                      u_action_log_probs, u_dist_entropy,
                                      0)  #, done)

            if explore_:
                # Insert log prob for explore agent
                batch = state_pytorch[:, -1]  #last of stack
                batch = batch.contiguous()  # [P,84,84]
                elbo = vae.forward2(batch, k=10)  #[P]
                elbo = elbo.view(-1, 1).data  #[P,1]
                elbo = (elbo - m) / torch.sqrt(v)
                elbo = torch.clamp(elbo, max=.01)
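                # The explore agent's reward is the negative normalized ELBO (clamped at .01),
                # i.e. frames the VAE finds unlikely yield a larger intrinsic reward.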
                agent_explore.insert_data(step, current_state, r_action.data,
                                          r_value, -elbo, masks,
                                          r_action_log_probs, r_dist_entropy,
                                          0)  #, done)

                #update m and v
                m = B * m + (1. - B) * elbo.mean()
                v = B * v + (1. - B) * elbo.pow(2).mean()
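                # Exponential moving averages of the normalized ELBO's mean and second moment,
                # so the normalization above adapts as the VAE trains.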

                if elbo.mean() < -9000.:
                    print(elbo)
                    print(reward)
                    print(elbo.mean())
                    print(elbo.pow(2).mean())
                    raise RuntimeError('ELBO normalization diverged')

            # after_step_time = time.time() - start3

            # if 'Montez' in env_name and see_reward_episode:

            #     for state_i in range(len(state)):
            #         if done[state_i]:
            #             states_list[state_i] = []
            #         else:
            #             states_list[state_i].append(np.squeeze(state[state_i]))

            #             # print (state[state_i].shape)
            #             # fasdf

            #         # print (reward)

            #         if reward_numpy[state_i] >0:
            #             #plot the states of state_i
            #             print (len(states_list[state_i]))
            #             # view_reward_episode(model_dict=model_dict, frames=states_list[state_i][len(states_list[state_i])-100:])
            #             # view_reward_episode(model_dict=model_dict, frames=states_list[state_i][len(states_list[state_i])-100:])
            #             view_reward_episode(model_dict=model_dict, frames=states_list[state_i])

            #             fadsa

            #      # and np.sum(agent.rollouts.rewards.cpu().numpy()) > 0

            #     # print (np.sum(agent.rollouts.rewards.cpu().numpy()))
            #     # print (j)

        steps_time = time.time() - start3
        start3 = time.time()

        #Optimize agents
        agent_exploit.update()  #agent.update(j,num_updates)
        if explore_:
            agent_explore.update()  #agent.update(j,num_updates)

            #Optimize vae
            batch = agent_exploit.rollouts.states
            batch = batch[1:]  # [Steps,Processes,Stack,84,84]
            batch = batch[:, :, 0]  # [Steps,Processes,84,84]
            batch = batch.contiguous().view(-1, 84,
                                            84)  # [Steps*Processes,84,84]
            elbo = vae.update(batch)
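            # Fit the VAE on the latest rollout: drop the initial state, keep a single channel
            # of each stacked observation, and flatten to a [Steps*Processes, 84, 84] batch.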

        #Insert state
        agent_exploit.insert_first_state(agent_exploit.rollouts.states[-1])
        if explore_:
            agent_explore.insert_first_state(agent_explore.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        #Change number of explore vs exploit
        if model_dict['init_exploit_processes'] != -1 and model_dict[
                'inc_exploiters_over'] != -1:
            frac_step = np.minimum((total_num_steps + 1.) /
                                   float(model_dict['inc_exploiters_over']),
                                   1.)  #fraction of steps
            aaa = int((num_processes - init_exploit_processes) * frac_step)
            exploit_processes = np.minimum(init_exploit_processes + aaa + 1,
                                           num_processes)
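            # Grow the number of exploiting processes linearly from init_exploit_processes up to
            # num_processes over the first 'inc_exploiters_over' environment steps.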

        update_time = time.time() - start3

        # agent_exploit.rollouts.reset_lists()
        # agent_explore.rollouts.reset_lists()

        # print ('init ', init_exploit_processes)
        # print ('cur ', exploit_processes)
        # print ('frac_step', frac_step)
        # print ('aaa', aaa)

        # print (agent.state_pred_error.data.cpu().numpy())

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent_exploit, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent_exploit, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent_exploit, model_dict, update_current_state,
                        update_rewards, total_num_steps)
            # #make vae prob gif
            if vae_:
                # do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps)
                # do_gifs2(envs_vae, agent_exploit, vae, model_dict, update_current_state, update_rewards, total_num_steps)
                do_gifs3(envs_vae, agent_exploit, vae, model_dict,
                         update_current_state, update_rewards, total_num_steps)

        #Print updates
        if j % log_interval == 0:  # and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2)

            if explore_:
                elbo = "{:.2f}".format(elbo.data.cpu().numpy()[0])
            else:
                elbo = 'n/a'

            steps_time = "{:.3f}".format(steps_time)
            update_time = "{:.3f}".format(update_time)

            # if next_state_pred_:
            #     state_pred_error_print =  "{:.2f}".format(agent.state_pred_error.data.cpu().numpy()[0])
            #     print(to_print_info_string+' '+state_pred_error_print+' '+elbo)
            #     to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo"

            # else:
            # print(to_print_info_string+' '+elbo)
            # print(to_print_info_string+' '+elbo+' '+str(exploit_processes)+' '+str(before_step_time)+' '+str(step_time)+' '+str(after_step_time))#, value[0].data.cpu().numpy(), m.cpu().numpy(), v.cpu().numpy())
            print(
                to_print_info_string + ' ' + elbo + ' ' +
                str(exploit_processes)
            )  #+' '+steps_time+' '+update_time)#, value[0].data.cpu().numpy(), m.cpu().numpy(), v.cpu().numpy())

            # print (value[0].data.cpu().numpy(), m.cpu().numpy(), v.cpu().numpy())
            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, elbo, Exploit_Procs"

            start2 = time.time()

            if j % (log_interval * 30) == 0:

                if ls_:
                    # do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                    do_ls_2(envs_ls, agent_explore, model_dict,
                            total_num_steps, update_current_state,
                            update_rewards, vae)

                    # update_ls_plot(model_dict)
                    update_ls_plot_2(model_dict)
                    print('updated ls')

                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots

            # if ls_:

                try:

                    start3 = time.time()

                    make_plots(model_dict)
                    print(to_print_legend_string +
                          " Plot updated ")  #+str(time.time() - start3))
                except:
                    print(to_print_legend_string)
                    raise

    try:
        make_plots(model_dict)
    except:
        print()