Code example #1
import re

from episode import Episode
from termcolor import colored  # assuming colored() comes from termcolor

# all_episodes is assumed to be a module-level list of lines loaded
# elsewhere; see the loading sketch after this example.


def label_set():
    """
    Labels the set via user input.
    """
    for title, description in zip(all_episodes[0::3], all_episodes[1::3]):
        ep = Episode(title, description)
        ep_tokenized = ep.tokenize()

        for tokenized, name in ep_tokenized:
            text = title + "\n" + description

            # Print the episode text, highlighting every occurrence of
            # the name in green. re.escape() keeps names containing
            # regex metacharacters from breaking the search.
            last_pos = 0
            name_len = len(name)
            for it in re.finditer(re.escape(name), text):
                pos = it.start()
                print(text[last_pos:pos], end="")
                print(colored(name.title(), 'green'), end="")
                last_pos = pos + name_len
            print(text[last_pos:])
            
            print("Is " + colored(name.title(), 'green') + "  a topic (t) or a guest (g)?")
            answer = input()
            if answer in ("t", "g"):
                # Append the mention and its label in the training-data
                # format: text, label line, blank line.
                with open("data/labeled_test.txt", "a") as f:
                    f.write(tokenized)
                    f.write("\n")
                    f.write("T\n\n" if answer == "t" else "G\n\n")
            print()
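
Both versions of label_set iterate over a module-level all_episodes list in strides of three, which suggests the episode dump is a flat text file where each episode occupies three lines: title, description, and a separator. A minimal loading sketch under that assumption (the path data/episodes.txt is hypothetical):

# Hypothetical loader for the all_episodes global used by both
# label_set() versions. Assumes three lines per episode: title,
# description, separator.
with open("data/episodes.txt") as f:
    all_episodes = [line.rstrip("\n") for line in f]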
Code example #2
import trie  # assuming a module exposing find_prefix(root, word)
from episode import Episode


def label_set(all_names, dead_names, fict_names):
    """
    Labels the set automatically, without user input.
    """
    topic_names = {}
    guest_names = {}
    for title, description in zip(all_episodes[0::3], all_episodes[1::3]):
        ep = Episode(title, description)
        ep_tokenized = ep.tokenize()

        for tokenized, name in ep_tokenized:

            # A name keeps whichever label it was first assigned.
            if name in topic_names:
                topic_names[name].append(tokenized)
                continue

            if name in guest_names:
                guest_names[name].append(tokenized)
                continue

            # Names found in the fictional or dead-people tries are
            # labeled as topics.
            in_fict, _ = trie.find_prefix(fict_names, name)
            in_dead, _ = trie.find_prefix(dead_names, name)

            if in_fict or in_dead:
                topic_names[name] = [tokenized]
                continue

            # Names absent from the full name list are labeled as guests.
            in_all, _ = trie.find_prefix(all_names, name)
            if not in_all:
                guest_names[name] = [tokenized]

    # Emit every collected mention in the labeled-data format
    # (text, label line, blank line) used by code example #1.
    for episode_list in topic_names.values():
        for text in episode_list:
            print(text)
            print("T")
            print()

    for episode_list in guest_names.values():
        # Only print guest guesses for names with a moderate number
        # of mentions (6 to 24).
        n_mentions = len(episode_list)
        if 5 < n_mentions < 25:
            for text in episode_list:
                print(text)
                print("G")
                print()
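
The trie module itself is not shown; the code above only relies on find_prefix(root, word) returning a truthy flag as its first element. A minimal dict-based sketch that satisfies that contract, assuming the name tries are built with an add() helper (hypothetical beyond the find_prefix signature):

# Minimal dict-based trie compatible with the find_prefix() calls
# above. The module the original code uses may differ.
_END = object()  # sentinel key marking the end of a complete word

def add(root, word):
    """Insert word into the trie rooted at the dict root."""
    node = root
    for ch in word:
        node = node.setdefault(ch, {})
    node[_END] = True

def find_prefix(root, word):
    """Return (found, node): whether word is a path in the trie, and
    the node it ends at (None when the walk falls off the trie)."""
    node = root
    for ch in word:
        if ch not in node:
            return False, None
        node = node[ch]
    return True, node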
Code example #3
"""
Demonstrates the model by typing made-up samples.
"""
import pickle
import sys

import numpy as np
from keras.preprocessing.text import Tokenizer  # type of the unpickled tokenizer
from keras.preprocessing.sequence import pad_sequences

from episode import Episode, window
from tag_names import tag_names

model_file_name = sys.argv[1]

# The pickle is assumed to hold a (model, tokenizer) tuple; see the
# saving sketch after this example.
with open(model_file_name, 'rb') as f:
    model, tokenizer = pickle.load(f)

win_size = 10

while True:
    text = input("Type an episode description: \n")
    tagged_text = tag_names(text)
    ep = Episode("", tagged_text)
    for tokenized, name in ep.tokenize():
        # Cut a window of up to win_size tokens on each side of the
        # name, encode it with the training tokenizer, and pad it to
        # the fixed input length the model was trained on.
        tokenized = window(tokenized, win_size)
        seq = tokenizer.texts_to_sequences([tokenized])
        seq = pad_sequences(seq, maxlen=2 * win_size + 1)
        y = model.predict(seq)
        if y[0][0] > 0.5:
            print(name, "is a Guest", str(np.round(y[0][0] * 100)), "%")
        else:
            print(name, "is a Topic", str(np.round((1 - y[0][0]) * 100)), "%")
    print()
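
The script assumes the pickle file given on the command line holds the trained model and its fitted tokenizer as a single tuple. A minimal sketch of the saving side under that assumption (the path model.pkl is hypothetical):

import pickle

# Save the trained Keras model together with the tokenizer that was
# fitted on the training text, as the tuple the demo script unpickles.
with open("model.pkl", "wb") as f:
    pickle.dump((model, tokenizer), f)

The demo is then started with that file as its only argument, e.g. python demo.py model.pkl (the script name here is hypothetical).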