# meaning_space.py
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import string
import numpy as np
from utils import random_idx
from utils import utils
from utils import lang_vectors_utils as lvu

tokenizer = RegexpTokenizer(r'\w+')

# English stop-word list
en_stop = get_stop_words('en')

# Porter stemmer for normalizing tokens
p_stemmer = PorterStemmer()

# LDA parameters
num_topics = 50
passes = 20
topn = 10

# random-indexing parameters: N is the index-vector dimensionality,
# k is passed through to random_idx (likely the sparsity of each vector)
k = 5000
N = 10000

# cluster_sizes maps to n-gram size; cluster_sz in random_idx refers to a
# specific element (an int) of the cluster_sizes array
cluster_sizes = [1, 2, 3, 4, 5, 6, 7, 8]
ordered = 1

# assuming this is the alphabet because of precedent in generate_text.py
# alph = 'abc'
alphabet = string.ascii_lowercase + ' '  # string.lowercase is Python 2 only
RI_letters = random_idx.generate_letter_id_vectors(N, k, alphabet)

def create_doc_set(path, files):
    """Read each file under path and return the contents as a list of strings."""
    doc_set = []
    for filename in files:
        with open(path + filename, "r") as f:
            doc_set.append(f.read())
    return doc_set

def tokenize(doc_set):
    """Lowercase, tokenize, stop-word filter, and Porter-stem each document."""
    texts = []
    for doc in doc_set:
        # clean and tokenize the document string
        raw = doc.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words
        stopped_tokens = [tok for tok in tokens if tok not in en_stop]
        # stem what remains
        stemmed_tokens = [p_stemmer.stem(tok) for tok in stopped_tokens]
        texts.append(stemmed_tokens)
    return texts
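
# Illustrative check (not from the original file):
#   tokenize(["The cats are running"]) -> [['cat', 'run']]
# "the"/"are" fall out as stop words and the survivors are Porter-stemmed.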

def create_meaning_matrix(ldamodel, topicid, topn, dictionary):
    """Embed the topn terms of one LDA topic as the first topn rows of an N x N matrix."""
    matrix = np.zeros((N, N))
    # gensim builds Dictionary.id2token lazily, so index into the dictionary
    # once to force the reverse mapping to be populated
    dictionary[0]
    id2token = dictionary.id2token
    topic_terms = []
    for term_id, _prob in ldamodel.get_topic_terms(topicid, topn):
        topic_terms.append(str(id2token[term_id]))
    # encode each topic term as a random index vector and store it as a row
    for i in range(topn):
        matrix[i] = random_idx.id_vector(N, topic_terms[i], alphabet, RI_letters, ordered)
    return matrix

def create_meaning_matrices(ldamodel, num_topics, topn, dictionary):
    matrices = np.zeros((num_topics, N, N))
    for topicid in range(num_topics):
        matrices[topicid] = create_meaning_matrix(ldamodel, topicid, topn, dictionary)
    return matrices

def run():
    # sample documents
    raw_path = "raw_texts/texts_english/"
    preprocessed_path = "preprocessed_texts/english/"
    training_preprocessed_path = "preprocessed_texts/english/with_spaces/"
    training_files = ["a_christmas_carol.txt", "alice_in_wonderland.txt"]
    # held out for measuring accuracy against the actual stream
    # that will be the test input
    test_files = ["hamlet_english.txt", "percy_addleshaw.txt"]
    training_doc_set = create_doc_set(training_preprocessed_path, training_files)
    test_doc_set = create_doc_set(preprocessed_path, test_files)
    tokenized_training_documents = tokenize(training_doc_set)
    tokenized_test_documents = tokenize(test_doc_set)
    # turn the tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(tokenized_training_documents)
    # convert tokenized documents into a document-term (bag-of-words) corpus
    corpus = [dictionary.doc2bow(text) for text in tokenized_training_documents]
    # train the LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                                               id2word=dictionary, passes=passes)
    # for i in range(num_topics):
    #     print(ldamodel.show_topic(i, topn))
    # build one meaning matrix per topic
    # (see also https://radimrehurek.com/gensim/models/word2vec.html)
    meaning_matrices = create_meaning_matrices(ldamodel, num_topics, topn, dictionary)
    # TODO: fix configs to make working meaning matrices
    # accuracy was going to be measured by multiplying or dot-producting a
    # token vector with a meaning matrix -- see the sketch below
    return meaning_matrices

# run()
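
# The TODO above describes the intended evaluation: multiply a token's index
# vector with a topic's meaning matrix. The function below is a minimal sketch
# of that idea, not part of the original file; it assumes random_idx.id_vector
# returns an N-dimensional vector compatible with the (N, N) matrices built by
# create_meaning_matrices.
def score_token_against_topics(token, meaning_matrices):
    """Return one dot-product score per topic for a single token (sketch)."""
    token_vector = np.asarray(
        random_idx.id_vector(N, token, alphabet, RI_letters, ordered)).reshape(N)
    scores = np.zeros(num_topics)
    for topicid in range(num_topics):
        # each row of a meaning matrix is one topic term's index vector, so the
        # matrix-vector product scores the token against every stored term;
        # summing aggregates those similarities into a single topic score
        scores[topicid] = np.sum(meaning_matrices[topicid].dot(token_vector))
    return scores

# Example usage (hypothetical): highest-scoring topic for the token 'ghost'
# meaning_matrices = run()
# print(np.argmax(score_token_against_topics('ghost', meaning_matrices)))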