import re

from termcolor import colored

from episode import Episode


def label_set():
    """Labels the set via user input."""
    # all_episodes stores records as flat triples, so every third element
    # starts a new episode; pair up the titles and descriptions.
    for title, description in zip(all_episodes[0::3], all_episodes[1::3]):
        ep = Episode(title, description)
        ep_tokenized = ep.tokenize()
        for tokenized, name in ep_tokenized:
            # Print the episode text with every occurrence of the name
            # highlighted in green, so the labeler sees it in context.
            text = title + "\n" + description
            last_pos = 0
            name_len = len(name)
            # Escape the name so regex metacharacters match literally.
            for it in re.finditer(re.escape(name), text):
                pos = it.start()
                print(text[last_pos:pos], end="")
                print(colored(name.title(), 'green'), end="")
                last_pos = pos + name_len
            print(text[last_pos:])
            print("Is " + colored(name.title(), 'green') + " a topic (t) or a guest (g)?")
            i = input()
            # Append the tokenized sample with its label: "T" for topic,
            # "G" for guest, separated from the next record by a blank line.
            with open("data/labeled_test.txt", "a") as f:
                if i == "t":
                    f.write(tokenized)
                    f.write("\n")
                    f.write("T\n\n")
                elif i == "g":
                    f.write(tokenized)
                    f.write("\n")
                    f.write("G\n\n")
            print()
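
# A minimal sketch (not part of the original script) of reading the labeled
# records back, assuming the format written above: a tokenized text line,
# a "T"/"G" label line, then a blank separator line. The helper name
# load_labeled is hypothetical.
def load_labeled(path="data/labeled_test.txt"):
    samples = []
    with open(path) as f:
        records = f.read().split("\n\n")
    for record in records:
        lines = record.strip().split("\n")
        if len(lines) >= 2:
            # The last line is the label; everything before it is the text.
            samples.append(("\n".join(lines[:-1]), lines[-1]))
    return samples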
import trie
from episode import Episode


def label_set(all_names, dead_names, fict_names):
    """Labels the set heuristically using name tries."""
    topic_names = {}
    guest_names = {}
    for title, description in zip(all_episodes[0::3], all_episodes[1::3]):
        ep = Episode(title, description)
        ep_tokenized = ep.tokenize()
        for tokenized, name in ep_tokenized:
            # A name seen before keeps its earlier classification.
            if name in topic_names:
                topic_names[name].append(tokenized)
                continue
            if name in guest_names:
                guest_names[name].append(tokenized)
                continue
            # Fictional or dead people cannot appear as guests,
            # so their mentions are labeled as topics.
            in_fict, _ = trie.find_prefix(fict_names, name)
            in_dead, _ = trie.find_prefix(dead_names, name)
            if in_fict or in_dead:
                topic_names[name] = [tokenized]
                continue
            # A name missing from the list of all famous names is
            # most likely a (non-famous) guest.
            in_all, _ = trie.find_prefix(all_names, name)
            if not in_all:
                guest_names[name] = [tokenized]
    # Print topic samples in the same record format as the manual labeler.
    for episode_list in topic_names.values():
        for text in episode_list:
            print(text)
            print("T")
            print()
    # Only keep guest names with a moderate number of mentions (6-24),
    # dropping rare one-offs and suspiciously frequent names.
    for episode_list in guest_names.values():
        n_mentions = len(episode_list)
        if 5 < n_mentions < 25:
            for text in episode_list:
                print(text)
                print("G")
                print()
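
# A minimal usage sketch (assumed workflow, not in the original file):
# since label_set prints records in the same text/label/blank-line format
# as the manual labeler, its output can be captured into a training file
# by redirecting stdout. The file name labeled_train.txt is a placeholder.
import contextlib

with open("data/labeled_train.txt", "w") as f, contextlib.redirect_stdout(f):
    label_set(all_names, dead_names, fict_names)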
"""Used for demonstrating the model by typing made-up samples."""
import pickle
import sys

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from episode import Episode, window
from tag_names import tag_names

# Load the trained model together with the tokenizer it was fitted with.
model_file_name = sys.argv[1]
with open(model_file_name, 'rb') as f:
    model, tokenizer = pickle.load(f)

win_size = 10
while True:
    text = input("Type an episode description: \n")
    # Tag person names in the raw text, then tokenize it once per name.
    tagged_text = tag_names(text)
    ep = Episode("", tagged_text)
    for text, name in ep.tokenize():
        # Clip the tokens to a fixed window around the name, map them to
        # integer sequences, and pad to the model's input length.
        text = window(text, win_size)
        text = tokenizer.texts_to_sequences([text])
        text = pad_sequences(text, maxlen=2 * win_size + 1)
        y = model.predict(text)
        # The model outputs P(guest); anything above 0.5 counts as a guest.
        if y[0][0] > 0.5:
            print(name, "is a Guest", str(np.round(y[0][0] * 100)), "%")
        else:
            print(name, "is a Topic", str(np.round((1 - y[0][0]) * 100)), "%")
        print()
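
# A minimal sketch of the matching save step, assumed to live in the
# training script: the demo above unpickles a (model, tokenizer) tuple,
# so training must have dumped one. The file name model.pkl is a
# placeholder; pass the same path as argv[1] when running the demo,
# e.g.  python demo.py model.pkl  (script name assumed).
import pickle

with open("model.pkl", "wb") as f:
    pickle.dump((model, tokenizer), f)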