Example #1
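# Assumed imports for this excerpt: only the standard-library and framework
# modules the code visibly uses. Project-specific helpers (SupervisedPolicy,
# Env, teacher, Transition, path_clean) and globals (dataPath, relationPath,
# graphpath, relation, state_dim, action_space, max_steps) are expected to be
# provided by the surrounding project modules and are not defined here.
import collections
import time
from itertools import count

import numpy as np
import tensorflow as tf
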
def train():
    tf.reset_default_graph()
    policy_nn = SupervisedPolicy()

    with open(relationPath) as f:
        train_data = f.readlines()

    num_samples = len(train_data)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # cap the number of supervised-training episodes at 500
        if num_samples > 500:
            num_samples = 500

        for episode in range(num_samples):
            print("Episode %d" % episode)
            print('Training Sample:', train_data[episode % num_samples][:-1])

            env = Env(dataPath, train_data[episode % num_samples])
            sample = train_data[episode % num_samples].split()

            try:
                good_episodes = teacher(sample[0], sample[1], 5, env,
                                        graphpath)
            except Exception as e:
                print('Cannot find a path')
                continue

            for item in good_episodes:
                state_batch = []
                action_batch = []
                for t, transition in enumerate(item):
                    state_batch.append(transition.state)
                    action_batch.append(transition.action)
                state_batch = np.squeeze(state_batch)
                state_batch = np.reshape(state_batch, [-1, state_dim])
                policy_nn.update(state_batch, action_batch)

        saver.save(sess, 'models/policy_supervised_' + relation)
        print('Model saved')
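

# REINFORCE(): after supervised pre-training, roll the policy out in the
# environment and apply policy-gradient updates. Invalid steps receive a small
# negative update, successful episodes get a length-shaped reward, and failed
# episodes fall back to a teacher-generated episode. Note that the local
# variable `train` below shadows the train() function defined above.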
def REINFORCE(training_pairs, policy_nn, num_episodes):
    train = training_pairs

    success = 0

    # path_found = set()
    path_found_entity = []
    path_relation_found = []

    for i_episode in range(num_episodes):
        start = time.time()
        print('Episode %d' % i_episode)
        print('Training sample: ', train[i_episode][:-1])

        env = Env(dataPath, train[i_episode])

        sample = train[i_episode].split()
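        # the state index pairs the current entity with the target entity;
        # env.idx_state() maps it to the vector fed to the policy network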
        state_idx = [env.entity2id_[sample[0]], env.entity2id_[sample[1]], 0]

        episode = []
        state_batch_negative = []
        action_batch_negative = []
        for t in count():
            state_vec = env.idx_state(state_idx)
            action_probs = policy_nn.predict(state_vec)
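            # sample the next action from the policy's probability
            # distribution over the action space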
            action_chosen = np.random.choice(np.arange(action_space),
                                             p=np.squeeze(action_probs))
            reward, new_state, done = env.interact(state_idx, action_chosen)

            if reward == -1:  # the action fails for this step
                state_batch_negative.append(state_vec)
                action_batch_negative.append(action_chosen)

            new_state_vec = env.idx_state(new_state)
            episode.append(
                Transition(state=state_vec,
                           action=action_chosen,
                           next_state=new_state_vec,
                           reward=reward))

            if done or t == max_steps:
                break

            state_idx = new_state

        # Discourage the agent when it chooses an invalid step
        if len(state_batch_negative) != 0:
            print('Penalty to invalid steps:', len(state_batch_negative))
            policy_nn.update(np.reshape(state_batch_negative, (-1, state_dim)),
                             -0.05, action_batch_negative)

        print('----- FINAL PATH -----')
        print('\t'.join(env.path))
        print('PATH LENGTH', len(env.path))
        print('----- FINAL PATH -----')

        # If the agent succeeds, do one optimization step
        if done == 1:
            print('Success')

            path_found_entity.append(path_clean(' -> '.join(env.path)))

            success += 1
            path_length = len(env.path)
            length_reward = 1 / path_length
            global_reward = 1

            # if len(path_found) != 0:
            # 	path_found_embedding = [env.path_embedding(path.split(' -> ')) for path in path_found]
            # 	curr_path_embedding = env.path_embedding(env.path_relations)
            # 	path_found_embedding = np.reshape(path_found_embedding, (-1,embedding_dim))
            # 	cos_sim = cosine_similarity(path_found_embedding, curr_path_embedding)
            # 	diverse_reward = -np.mean(cos_sim)
            # 	print('diverse_reward', diverse_reward)
            # 	total_reward = 0.1*global_reward + 0.8*length_reward + 0.1*diverse_reward
            # else:
            # 	total_reward = 0.1*global_reward + 0.9*length_reward
            # path_found.add(' -> '.join(env.path_relations))
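            # shaped reward: a small constant bonus for reaching the target
            # plus a larger term that favors shorter paths (1 / path_length)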

            total_reward = 0.1 * global_reward + 0.9 * length_reward
            state_batch = []
            action_batch = []
            for t, transition in enumerate(episode):
                if transition.reward == 0:
                    state_batch.append(transition.state)
                    action_batch.append(transition.action)
            policy_nn.update(np.reshape(state_batch, (-1, state_dim)),
                             total_reward, action_batch)
        else:
            global_reward = -0.05
            # length_reward = 1/len(env.path)

            state_batch = []
            action_batch = []
            total_reward = global_reward
            for t, transition in enumerate(episode):
                if transition.reward == 0:
                    state_batch.append(transition.state)
                    action_batch.append(transition.action)
            policy_nn.update(np.reshape(state_batch, (-1, state_dim)),
                             total_reward, action_batch)

            print('Failed, do one teacher-guided update')
            try:
                good_episodes = teacher(sample[0], sample[1], 1, env,
                                        graphpath)
                for item in good_episodes:
                    teacher_state_batch = []
                    teacher_action_batch = []
                    # note: this shaped reward is computed but not used; the
                    # update below applies a fixed reward of 1
                    total_reward = 0.0 * 1 + 1 * 1 / len(item)
                    for t, transition in enumerate(item):
                        teacher_state_batch.append(transition.state)
                        teacher_action_batch.append(transition.action)
                    policy_nn.update(np.squeeze(teacher_state_batch), 1,
                                     teacher_action_batch)

            except Exception as e:
                print('Teacher guideline failed')
        print('Episode time: ', time.time() - start)
        print('\n')
    print('Success percentage:', success / num_episodes)

    for path in path_found_entity:
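        # a stored path string alternates relations and entities; keep the
        # even-indexed items (the relations) for the relation-path statistics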
        rel_ent = path.split(' -> ')
        path_relation = []
        for idx, item in enumerate(rel_ent):
            if idx % 2 == 0:
                path_relation.append(item)
        path_relation_found.append(' -> '.join(path_relation))

    relation_path_stats = collections.Counter(path_relation_found).items()
    relation_path_stats = sorted(relation_path_stats,
                                 key=lambda x: x[1],
                                 reverse=True)

    with open(dataPath + 'tasks/' + relation + '/' + 'path_stats.txt', 'w') as f:
        for item in relation_path_stats:
            f.write(item[0] + '\t' + str(item[1]) + '\n')
    print('Path stats saved')

    return
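

# A minimal sketch of how the two phases above might be wired together,
# assuming the same globals (relationPath, relation) and that a policy object
# exposing predict() / update() is rebuilt or restored before retraining
# (hypothetical wiring, not part of the code above):
#
# if __name__ == '__main__':
#     train()  # supervised pre-training; saves 'models/policy_supervised_' + relation
#     with open(relationPath) as fh:
#         training_pairs = fh.readlines()
#     tf.reset_default_graph()
#     policy = ...          # restore or rebuild the policy network here
#     REINFORCE(training_pairs, policy, num_episodes=len(training_pairs))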