# Settings for the noun-extraction / embedding pipeline.
number_of_words = 100   # top-N most common nouns kept per show
max_len = 20000         # NOTE(review): not used in the visible span — presumably a document-length cap; confirm downstream
emb_dir = 'embedding/'  # directory holding the pretrained embedding files
emb_file = 'wiki.en'    # embedding file name (fastText wiki.en naming — TODO confirm loader)

# Load names and paths of shows from chosen directory
# NOTE(review): `data` is not defined anywhere in this snippet — it is
# presumably the shows directory (sibling examples use 'data/shows'); confirm
# it is defined earlier in the full file.
names, paths = load_paths_from_dir(data)
print(names)


# Choose most common nouns from shows subtitles
# For each show, read its subtitle files (only the first 10, per
# how_many=10) into one string and keep the top `number_of_words` nouns.
movies_common_words = []
print("Most common words.")
for path in paths:
    print(path)
    movies_common_words.append(most_common_nouns(load_files_to_string(path, how_many=10), number_of_words))

# Make lists of most common nouns
# Each entry of `movies_common_words` looks like a list of (word, count)
# pairs (t[0] is the word); build one word list per show plus a combined
# `sum_list` across all shows.
movies_words_lists = []
sum_list = []
print("Words lists.")
for m in movies_common_words:
    word_list = [t[0] for t in m]
    movies_words_lists.append(word_list)
    sum_list += word_list

# Create  embeddings for documents
# NOTE(review): the snippet is truncated here — the loop that fills
# `mean_vectors_list` (using counter `i`) is not visible.
mean_vectors_list = []
print(len(names))
i = 1
print("Mean vectors.")
# --- Ejemplo n.º 2 (Example 2) --- separator left by the snippet source
# Settings for per-show common-noun extraction.
data = 'data/shows'     # root directory containing one subdirectory per show
number_of_words = 100   # top-N most common nouns kept per show
max_len = 20000         # NOTE(review): unused in the visible span — confirm downstream use

emb_dir = 'embedding/'  # directory holding the pretrained embedding files
emb_file = 'wiki.en'    # embedding file name

# Top `number_of_words` nouns per show, computed over each show's whole
# subtitle directory.  (Earlier commented-out variants that processed a
# single hand-picked episode file were dead code and have been removed.)
batman_beyond = most_common_nouns(
    load_files_to_string(data + "/Batman Beyond"), number_of_words)
batman_animated = most_common_nouns(
    load_files_to_string(data + "/Batman: The Animated Series"),
    number_of_words)
dharma_greg = most_common_nouns(load_files_to_string(data + "/Dharma & Greg"),
                                number_of_words)

teen_titans = most_common_nouns(load_files_to_string(data + "/Teen Titans"),
                                number_of_words)

will_grace = most_common_nouns(load_files_to_string(data + "/Will & Grace"),
                               number_of_words)

superman = most_common_nouns(load_files_to_string(data + "/Superman"),
                             number_of_words)
# --- Ejemplo n.º 3 (Example 3) --- separator left by the snippet source
from utils import most_common_nouns
from files import load_files_to_string, load_paths_from_dir

# Settings
data = 'data/shows'                       # root directory of show subtitles
number_of_words = 100                     # top-N most common nouns per show
show_list = ["Taboo", "Game of Thrones"]  # only these shows are processed

# Load names and paths of shows from chosen directory
names, paths = load_paths_from_dir(data, show_list)
print(names)

# Most common words in tv show
# For each selected show, read its subtitle files (only the first 10,
# per how_many=10) and print the top `number_of_words` nouns.
# NOTE(review): `movies_common_words` is never used in the visible span
# (results are printed, not collected) — kept in case later code needs it.
movies_common_words = []
print("Most common words.")
for path in paths:
    print(path)
    print(most_common_nouns(load_files_to_string(path, how_many=10),
                            number_of_words))