model = SiameseCNN(nb_features=39,
                   device=device,
                   embedding_dim=embedding_dim,
                   dropout=dropout,
                   problem=problem,
                   non_linearity=non_linearity,
                   margin=margin,
                   normalize=normalize,
                   show_every=1000).to(device)
logging.info(model)

optimizer = torch.optim.Adadelta(model.parameters())

# Resume from a saved checkpoint if one exists; otherwise start from scratch.
try:
    best_prec1, cum_epochs, model, optimizer = load_checkpoint(
        where_to_save, args.file, model, optimizer)
except FileNotFoundError:
    cum_epochs = 0
    best_prec1 = 0

for epoch in range(epochs):
    # train for one epoch
    model.trainer(train_loader, criterion, optimizer, epoch, cum_epochs)

    # evaluate on the validation set
    auc_score = model.validate(val_loader=val_loader)

    # track the best validation AUC seen so far and checkpoint the model
    is_best = auc_score > best_prec1
    best_prec1 = max(auc_score, best_prec1)
    save_checkpoint(
        {
            'epoch': epoch + cum_epochs + 1,
            # the original snippet is truncated here; the remaining fields
            # below follow the usual PyTorch checkpoint pattern (assumed)
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        },
        is_best, where_to_save, args.file)
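# load_checkpoint / save_checkpoint are not defined in this snippet. Below is
# a minimal sketch of the assumed helpers, matching the call sites above (the
# on-disk layout and the 'best_' filename prefix are assumptions, not the
# verified implementation):
import os
import shutil
import torch

def save_checkpoint(state, is_best, where_to_save, filename):
    path = os.path.join(where_to_save, filename)
    torch.save(state, path)
    if is_best:
        # keep a separate copy of the best-scoring checkpoint
        shutil.copyfile(path, os.path.join(where_to_save, 'best_' + filename))

def load_checkpoint(where_to_save, filename, model, optimizer):
    # raises FileNotFoundError when no checkpoint exists, which the
    # training snippet above catches to start from scratch
    checkpoint = torch.load(os.path.join(where_to_save, filename))
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['best_prec1'], checkpoint['epoch'], model, optimizer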
def run(sess, env, algo, checkpoints_dir, n_episodes=100000, gui=False):
    fmt = '\n\n\n[*] Successfully loaded: {}\n\n\n'.format(algo)
    if algo == 'dqn':
        from models.dqn.dqn import DQN, preprocessing
        dqn = DQN(env)
        print(fmt)
        dqn.loss()
    elif algo == 'ddqn':
        from models.ddqn.ddqn import DDQN, preprocessing
        dqn = DDQN(env)
        print(fmt)
        dqn.loss()

    # reward automaton over the (key, door) subgoal propositions
    RA = parser()

    # checkpoints load
    try:
        episode_to_restore = load_checkpoint(checkpoints_dir, sess)
        print("[*] restoring from episode {}".format(episode_to_restore))
    except Exception:
        episode_to_restore = 0
        print("[*] failed to load checkpoints")
        sess.run(tf.global_variables_initializer())

    for each_episode in range(5):  # number of evaluation episodes (videos)
        obs = preprocessing(env.reset()["image"])
        total_reward_per_episode = 0
        in_trinsic = 0
        steps = 0
        done = False
        while not done:
            if gui:
                plt.imshow(env.render())
            action = dqn.act(obs, sess)
            open_door = env.door.is_open
            key = env.carrying is not None
            intrinsic_reward, failDfa = RA.trace(key, open_door)
            in_trinsic += intrinsic_reward
            nextObservation, reward, done, _ = env.step(action)
            nextObservation = nextObservation["image"]
            # combine environment and intrinsic (automaton) reward;
            # the episode also ends when the automaton fails
            total_reward = reward + intrinsic_reward
            done = done or failDfa
            dqn.rememember(preprocessing(nextObservation), action,
                           total_reward, obs, done)
            obs = preprocessing(nextObservation)
            total_reward_per_episode += reward
            print("ep %d done: %s \tIn %d steps we got %.3f total reward "
                  "and %.3f intrinsic reward"
                  % (each_episode, done, steps, total_reward_per_episode,
                     intrinsic_reward))
            steps += 1
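# parser() above builds the reward automaton (a DFA over the "has key" and
# "door open" propositions) that shapes the reward. A minimal sketch of the
# assumed interface -- the states, reward values, and failure condition below
# are illustrative, not the verified implementation:
class parser:
    def __init__(self):
        self.state = 0  # 0: start, 1: key picked up, 2: door opened

    def trace(self, key, open_door):
        """Advance the automaton on the current propositions.

        Returns (intrinsic_reward, failDfa)."""
        if self.state == 0 and key:
            self.state = 1
            return 1.0, False   # first subgoal reached: key in hand
        if self.state == 1 and open_door:
            self.state = 2
            return 1.0, False   # second subgoal reached: door opened
        if self.state == 0 and open_door:
            return 0.0, True    # door opened out of order: automaton fails
        return 0.0, False       # no progress, no failure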
def run_a2c(sess, env, algo, checkpoints_dir, n_episodes=100000, gui=False,
            BATCH_SIZE=32):
    fmt = '\n\n\n[*] Successfully loaded: {}\n\n\n'.format(algo)
    from models.a2c.a2c import actorCritic, preprocessing
    ac_network = actorCritic(env)
    print(fmt)
    ac_network.actor_critic_loss()

    # reward automaton over the (key, door) subgoal propositions
    RA = parser()

    # checkpoints load
    try:
        episode_to_restore = load_checkpoint(checkpoints_dir, sess)
        print("[*] restoring from episode {}".format(episode_to_restore))
    except Exception:
        episode_to_restore = 0
        print("[*] failed to load checkpoints")
        sess.run(tf.global_variables_initializer())
        print("[*] creating new graphs")

    summary_writer = tf.summary.FileWriter('./graphs', sess.graph)
    saver = tf.train.Saver()

    total_reward = 0
    last_n_rewards = []
    for each_episode in tqdm(range(episode_to_restore,
                                   episode_to_restore + n_episodes),
                             total=n_episodes):
        # Monte Carlo style: record a batch of transitions first,
        # then update the actor-critic on the whole batch.
        done = False
        in_trinsic = 0
        state = preprocessing(env.reset()["image"])

        # temporary per-batch storage
        save_actions, save_rewards, save_dones = [
            np.empty([0]) for _ in range(3)
        ]
        save_states, save_nextStates = [], []
        total_loss_actor_critic = 0
        count_batch = 0
        total_reward_per_episode = 0

        while not done:
            if gui:
                plt.imshow(env.render())
            # choose an action from the current policy
            action = ac_network.chooseAction(state, sess)
            # perform the step in the environment
            next_state, reward, done, _ = env.step(action)
            next_state = next_state["image"]
            open_door = env.door.is_open
            key = env.carrying is not None
            intrinsic_reward, failDfa = RA.trace(key, open_door)
            in_trinsic += intrinsic_reward
            t_reward = reward + intrinsic_reward

            # store the transition
            save_states.append(state)
            state = preprocessing(next_state)
            save_nextStates.append(state)
            save_actions = np.append(save_actions, action)
            save_rewards = np.append(save_rewards, t_reward)
            save_dones = np.append(save_dones, done)
            total_reward += reward
            total_reward_per_episode += reward

            if count_batch % BATCH_SIZE == 0 and count_batch != 0:
                save_nextStates = np.array(save_nextStates)
                save_states = np.array(save_states)
                # bootstrap the critic's value of the next states
                value_predicted = ac_network.act_value.eval(
                    session=sess,
                    feed_dict={ac_network.x_input: save_nextStates})
                # normalize rewards within the batch for stability
                save_rewards = (save_rewards - save_rewards.mean()) / (
                    save_rewards.std() + np.finfo(np.float32).eps)
                # one-step TD targets: r + gamma * V(s') * (1 - done)
                target = np.array([
                    save_rewards[i] +
                    ac_network.discount * value_predicted[i] *
                    (1 - save_dones[i]) for i in range(len(value_predicted))
                ])
                action_convert = save_actions.astype(int).reshape(-1)
                onehotEncodeAction = np.eye(
                    ac_network.action_size)[action_convert]
                total_loss_actor_critic, _ = sess.run(
                    [ac_network.total_loss, ac_network.total_optimizer],
                    feed_dict={
                        ac_network.x_input: save_states,
                        ac_network.target: target,
                        ac_network.actions: onehotEncodeAction
                    })
                summary = tf.Summary()
                summary.value.add(tag='loss_a2c',
                                  simple_value=np.sum(total_loss_actor_critic))
                summary_writer.add_summary(summary, each_episode)
                summary_writer.flush()
                # reset the batch buffers
                count_batch = 0
                save_actions, save_rewards, save_dones = [
                    np.empty([0]) for _ in range(3)
                ]
                save_states, save_nextStates = [], []
            count_batch += 1

        if each_episode % 20 == 0 and each_episode != 0:
            print("After episode", each_episode, "the total loss is",
                  np.sum(total_loss_actor_critic), "and the reward is",
                  total_reward)
            print("Intrinsic reward after episode", each_episode, "is",
                  in_trinsic)
            total_reward = 0
            summary = tf.Summary()
            summary.value.add(tag='20_ep_average',
                              simple_value=np.mean(last_n_rewards))
            summary.value.add(tag='20_ep_sum',
                              simple_value=np.sum(last_n_rewards))
            summary_writer.add_summary(summary, each_episode)
            summary_writer.flush()
            last_n_rewards = []

        if each_episode % 100 == 0:
            checkpoint_save_path = saver.save(
                sess, '{}/Episode_{}.ckpt'.format(checkpoints_dir,
                                                  each_episode))
            print('Model is saved at {}!'.format(checkpoint_save_path))

        last_n_rewards.append(total_reward_per_episode)
        summary = tf.Summary()
        summary.value.add(tag='Episode_reward',
                          simple_value=total_reward_per_episode)
        summary.value.add(tag='intrinsic_reward',
                          simple_value=intrinsic_reward)
        summary_writer.add_summary(summary, each_episode)
        summary_writer.flush()
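# The scalar summaries written above (loss_a2c, 20_ep_average, 20_ep_sum,
# Episode_reward, intrinsic_reward) can be inspected with TensorBoard:
#   tensorboard --logdir ./graphs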
def run_a2c(sess, env, algo, checkpoints_dir, n_episodes=100000, gui=False,
            BATCH_SIZE=32):
    fmt = '\n\n\n[*] Successfully loaded: {}\n\n\n'.format(algo)
    from models.a2c.a2c import actorCritic, preprocessing
    ac_network = actorCritic(env)
    print(fmt)
    ac_network.actor_critic_loss()

    # reward automaton over the (key, door) subgoal propositions
    RA = parser()

    # checkpoints load
    try:
        episode_to_restore = load_checkpoint(checkpoints_dir, sess)
        print("[*] restoring from episode {}".format(episode_to_restore))
    except Exception:
        episode_to_restore = 0
        print("[*] failed to load checkpoints")
        sess.run(tf.global_variables_initializer())

    for each_episode in range(20):  # number of evaluation episodes (videos)
        in_trinsic = 0
        state = preprocessing(env.reset()["image"])
        done = False
        steps = 0
        while not done:
            steps += 1
            if gui:
                plt.imshow(env.render())
            # greedy action from the trained policy
            action = ac_network.chooseActionTest(state, sess)
            # perform the step in the environment
            next_state, reward, done, _ = env.step(action)
            next_state = next_state["image"]
            open_door = env.door.is_open
            key = env.carrying is not None
            intrinsic_reward, failDfa = RA.trace(key, open_door)
            in_trinsic += intrinsic_reward
            t_reward = reward + intrinsic_reward
            # advance to the next observation
            state = preprocessing(next_state)
            print(action)
def run(sess, env, algo, checkpoints_dir, n_episodes=100000, gui=False):
    fmt = '\n\n\n[*] Successfully loaded: {}\n\n\n'.format(algo)
    if algo == 'dqn':
        from models.dqn.dqn import DQN, preprocessing
        dqn = DQN(env)
        print(fmt)
        dqn.loss()
    elif algo == 'ddqn':
        from models.ddqn.ddqn import DDQN, preprocessing
        dqn = DDQN(env)
        print(fmt)
        dqn.loss()

    # reward automaton over the (key, door) subgoal propositions
    RA = parser()

    # checkpoints load
    try:
        episode_to_restore = load_checkpoint(checkpoints_dir, sess)
        print("[*] restoring from episode {}".format(episode_to_restore))
    except Exception:
        episode_to_restore = 0
        print("[*] failed to load checkpoints")
        sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter('./graphs', sess.graph)
    saver = tf.train.Saver()

    updateNetwork = 4  # train every `updateNetwork` stored transitions
    game_loss = 0
    last_n_rewards = []
    for each_episode in tqdm(range(episode_to_restore,
                                   episode_to_restore + n_episodes),
                             total=n_episodes):
        obs = preprocessing(env.reset()["image"])
        done = False
        in_trinsic = 0
        total_reward_per_episode = 0
        counter = 0
        while not done:
            if gui:
                plt.imshow(env.render())
            action = dqn.act(obs, sess)
            open_door = env.door.is_open
            key = env.carrying is not None
            intrinsic_reward, failDfa = RA.trace(key, open_door)
            in_trinsic += intrinsic_reward
            nextObservation, reward, done, _ = env.step(action)
            nextObservation = nextObservation["image"]
            # combine environment and intrinsic (automaton) reward;
            # the episode also ends when the automaton fails
            total_reward = reward + intrinsic_reward
            done = done or failDfa
            dqn.rememember(preprocessing(nextObservation), action,
                           total_reward, obs, done)
            obs = preprocessing(nextObservation)
            total_reward_per_episode += reward

            total_priority = dqn.memory_class.sumtree.total_priority()
            if total_priority > 1000 and total_priority % updateNetwork == 0:
                # sample a prioritized minibatch and unpack it
                idx, minibatch, ISWeights = dqn.sampleData()
                t_rewards = np.array([i[1] for i in minibatch])
                t_dones = np.array([i[4] for i in minibatch])
                t_actions = np.array([i[2] for i in minibatch])
                t_obs = np.squeeze(np.array([i[0] for i in minibatch]))
                t_nextObservations = np.squeeze(
                    np.array([i[3] for i in minibatch]))

                # Q-learning targets from the target network:
                # r + gamma * max_a' Q_target(s', a') * (1 - done)
                t_output = dqn.target_network_output.eval(
                    session=sess,
                    feed_dict={dqn.x_input: t_nextObservations})
                target = np.array([
                    t_rewards[i] +
                    dqn.discount * np.max(t_output[i]) * (1 - t_dones[i])
                    for i in range(len(t_output))
                ])
                game_loss, _ = sess.run([dqn.loss, dqn.optimizer],
                                        feed_dict={
                                            dqn.x_input: t_obs,
                                            dqn.actions: t_actions,
                                            dqn.target: target
                                        })
                summary = tf.Summary()
                summary.value.add(tag='loss_{}'.format(algo),
                                  simple_value=game_loss)
                summary_writer.add_summary(summary, each_episode)
                summary_writer.flush()

            # periodically copy the online weights into the target network
            if counter > 500:
                sess.run(dqn.copy_weight)
                counter = 0
            counter += 1

        if each_episode != 0 and each_episode % 20 == 0:
            print("After episode", each_episode, "the game loss is",
                  game_loss, "and the average reward is",
                  np.mean(last_n_rewards))
            print("Intrinsic reward", in_trinsic)
            summary = tf.Summary()
            summary.value.add(tag='20_ep_average',
                              simple_value=np.mean(last_n_rewards))
            summary.value.add(tag='20_ep_sum',
                              simple_value=np.sum(last_n_rewards))
            summary_writer.add_summary(summary, each_episode)
            summary_writer.flush()
            last_n_rewards = []

        if each_episode % 100 == 0:
            checkpoint_save_path = saver.save(
                sess, '{}/Episode_{}.ckpt'.format(checkpoints_dir,
                                                  each_episode))
            print('Model is saved at {}!'.format(checkpoint_save_path))

        last_n_rewards.append(total_reward_per_episode)
        summary = tf.Summary()
        summary.value.add(tag='Episode_reward',
                          simple_value=total_reward_per_episode)
        summary.value.add(tag='intrinsic_reward',
                          simple_value=intrinsic_reward)
        summary_writer.add_summary(summary, each_episode)
        summary_writer.flush()
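# Example entry point for the DQN trainer. A minimal sketch: the MiniGrid
# environment id and the TF1-style session setup are assumptions, not the
# project's verified launcher -- adjust to the actual configuration:
import gym
import gym_minigrid  # noqa: F401  (registers the MiniGrid environments)
import tensorflow as tf

if __name__ == '__main__':
    env = gym.make('MiniGrid-DoorKey-8x8-v0')
    with tf.Session() as sess:
        run(sess, env, 'dqn', './checkpoints_dqn',
            n_episodes=100000, gui=False)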