def train():
    tf.reset_default_graph()
    policy_nn = SupervisedPolicy()

    # Read the positive training triples for the target relation.
    f = open(relationPath)
    train_data = f.readlines()
    f.close()

    num_samples = len(train_data)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Cap the number of supervised episodes at 500.
        if num_samples > 500:
            num_samples = 500

        for episode in range(num_samples):
            print("Episode %d" % episode)
            print('Training Sample:', train_data[episode % num_samples][:-1])
            env = Env(dataPath, train_data[episode % num_samples])
            sample = train_data[episode % num_samples].split()

            # Use BFS teacher paths as supervision; skip samples with no reachable path.
            try:
                good_episodes = teacher(sample[0], sample[1], 5, env, graphpath)
            except Exception:
                print('Cannot find a path')
                continue

            for item in good_episodes:
                state_batch = []
                action_batch = []
                for t, transition in enumerate(item):
                    state_batch.append(transition.state)
                    action_batch.append(transition.action)
                state_batch = np.squeeze(state_batch)
                state_batch = np.reshape(state_batch, [-1, state_dim])
                policy_nn.update(state_batch, action_batch)

        saver.save(sess, 'models/policy_supervised_' + relation)
        print('Model saved')
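# Both train() above and REINFORCE() below rely on module-level configuration and
# helpers (Transition, state_dim, action_space, max_steps, dataPath, relationPath,
# graphpath, relation) plus the Env, teacher, SupervisedPolicy, and path_clean
# utilities, all defined elsewhere in the project. The block below is only a minimal
# sketch of that shared setup so the two functions can be read on their own; every
# concrete value here is a placeholder assumption, not the project's actual config.
import sys
import time
import collections
from collections import namedtuple
from itertools import count

import numpy as np
import tensorflow as tf

# One step of an episode: (state vector, chosen action, next state vector, step reward).
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])

# Placeholder hyperparameters and paths (assumed values, for illustration only).
state_dim = 200                  # dimensionality of the state vector fed to the policy
action_space = 400               # number of candidate relations (actions)
max_steps = 50                   # hard limit on the length of one rollout
dataPath = 'NELL-995/'           # root directory of the knowledge-graph dataset (assumed)
relation = sys.argv[1]           # target relation name passed on the command line (assumed)
graphpath = dataPath + 'tasks/' + relation + '/' + 'graph.txt'           # assumed layout
relationPath = dataPath + 'tasks/' + relation + '/' + 'train_pos'        # positive pairs (name assumed)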
def REINFORCE(training_pairs, policy_nn, num_episodes):
    train_data = training_pairs
    success = 0

    # path_found = set()
    path_found_entity = []
    path_relation_found = []

    for i_episode in range(num_episodes):
        start = time.time()
        print('Episode %d' % i_episode)
        print('Training sample: ', train_data[i_episode][:-1])

        env = Env(dataPath, train_data[i_episode])
        sample = train_data[i_episode].split()
        state_idx = [env.entity2id_[sample[0]], env.entity2id_[sample[1]], 0]

        episode = []
        state_batch_negative = []
        action_batch_negative = []
        # Roll out one episode, recording every transition.
        for t in count():
            state_vec = env.idx_state(state_idx)
            action_probs = policy_nn.predict(state_vec)
            action_chosen = np.random.choice(np.arange(action_space), p=np.squeeze(action_probs))
            reward, new_state, done = env.interact(state_idx, action_chosen)

            if reward == -1:  # the action fails for this step
                state_batch_negative.append(state_vec)
                action_batch_negative.append(action_chosen)

            new_state_vec = env.idx_state(new_state)
            episode.append(Transition(state=state_vec, action=action_chosen,
                                      next_state=new_state_vec, reward=reward))

            if done or t == max_steps:
                break

            state_idx = new_state

        # Discourage the agent when it chooses an invalid step.
        if len(state_batch_negative) != 0:
            print('Penalty to invalid steps:', len(state_batch_negative))
            policy_nn.update(np.reshape(state_batch_negative, (-1, state_dim)), -0.05, action_batch_negative)

        print('----- FINAL PATH -----')
        print('\t'.join(env.path))
        print('PATH LENGTH', len(env.path))
        print('----- FINAL PATH -----')

        # If the agent succeeds, do one optimization step with a positive reward.
        if done == 1:
            print('Success')

            path_found_entity.append(path_clean(' -> '.join(env.path)))

            success += 1
            path_length = len(env.path)
            length_reward = 1 / path_length
            global_reward = 1

            # if len(path_found) != 0:
            #     path_found_embedding = [env.path_embedding(path.split(' -> ')) for path in path_found]
            #     curr_path_embedding = env.path_embedding(env.path_relations)
            #     path_found_embedding = np.reshape(path_found_embedding, (-1, embedding_dim))
            #     cos_sim = cosine_similarity(path_found_embedding, curr_path_embedding)
            #     diverse_reward = -np.mean(cos_sim)
            #     print('diverse_reward', diverse_reward)
            #     total_reward = 0.1 * global_reward + 0.8 * length_reward + 0.1 * diverse_reward
            # else:
            #     total_reward = 0.1 * global_reward + 0.9 * length_reward
            # path_found.add(' -> '.join(env.path_relations))

            total_reward = 0.1 * global_reward + 0.9 * length_reward
            state_batch = []
            action_batch = []
            for t, transition in enumerate(episode):
                if transition.reward == 0:
                    state_batch.append(transition.state)
                    action_batch.append(transition.action)
            policy_nn.update(np.reshape(state_batch, (-1, state_dim)), total_reward, action_batch)
        else:
            # Failed episode: penalize the valid steps with a small negative reward.
            global_reward = -0.05
            # length_reward = 1 / len(env.path)

            state_batch = []
            action_batch = []
            total_reward = global_reward
            for t, transition in enumerate(episode):
                if transition.reward == 0:
                    state_batch.append(transition.state)
                    action_batch.append(transition.action)
            policy_nn.update(np.reshape(state_batch, (-1, state_dim)), total_reward, action_batch)

            print('Failed, do one teacher guideline')
            # Fall back on a BFS teacher path so the agent still receives a positive update.
            try:
                good_episodes = teacher(sample[0], sample[1], 1, env, graphpath)
                for item in good_episodes:
                    teacher_state_batch = []
                    teacher_action_batch = []
                    total_reward = 0.0 * 1 + 1 * 1 / len(item)  # length-based reward (unused; the update below uses a fixed reward of 1)
                    for t, transition in enumerate(item):
                        teacher_state_batch.append(transition.state)
                        teacher_action_batch.append(transition.action)
                    policy_nn.update(np.squeeze(teacher_state_batch), 1, teacher_action_batch)
            except Exception:
                print('Teacher guideline failed')

        print('Episode time: ', time.time() - start)
        print('\n')

    print('Success percentage:', success / num_episodes)
    # Keep only the relations (every other element) from each found path and save their frequencies.
    for path in path_found_entity:
        rel_ent = path.split(' -> ')
        path_relation = []
        for idx, item in enumerate(rel_ent):
            if idx % 2 == 0:
                path_relation.append(item)
        path_relation_found.append(' -> '.join(path_relation))

    relation_path_stats = collections.Counter(path_relation_found).items()
    relation_path_stats = sorted(relation_path_stats, key=lambda x: x[1], reverse=True)

    f = open(dataPath + 'tasks/' + relation + '/' + 'path_stats.txt', 'w')
    for item in relation_path_stats:
        f.write(item[0] + '\t' + str(item[1]) + '\n')
    f.close()
    print('Path stats saved')

    return
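# REINFORCE() expects an already-initialized policy network inside an open TensorFlow
# session; all it requires of policy_nn is predict(state) and a reward-weighted
# update(states, reward, actions), as used above. The driver below is only a sketch of
# how the two stages could be wired together: restore the checkpoint written by train()
# and continue with REINFORCE on the same training pairs. The name retrain(), the
# PolicyNetwork class, and the episode cap of 300 are assumptions; only the checkpoint
# path 'models/policy_supervised_' + relation comes from the code above.
def retrain():
    tf.reset_default_graph()
    # Assumed: a policy class whose variables match the supervised checkpoint and which
    # exposes predict() and the three-argument update() used by REINFORCE().
    policy_nn = PolicyNetwork()

    with open(relationPath) as f:
        training_pairs = f.readlines()

    saver = tf.train.Saver()
    with tf.Session() as sess:
        # Warm-start from the supervised checkpoint saved by train().
        saver.restore(sess, 'models/policy_supervised_' + relation)
        num_episodes = min(len(training_pairs), 300)  # assumed cap on RL episodes
        REINFORCE(training_pairs, policy_nn, num_episodes)
        saver.save(sess, 'models/policy_retrained_' + relation)
    print('Retrained model saved')


if __name__ == '__main__':
    train()      # supervised warm-up
    retrain()    # RL fine-tuning with REINFORCE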