#
# There are broadly two ways to cluster documents:
# 1. Calculate document distances, then run distance-based clustering
# 2. Create vector document representations, then run clustering
# We support the following ways of calculating document distances:
# 1. Spacy similarity measure (based on the average of token vectors)
# 2. Word-pairs distance: the total distance is the average distance of the closest word pairs
# Note that we support three distance metrics (cosine, euclidean, spacy)
# We support the following ways of generating document representations:
# 1. tfidf weighted bag of words
# 2. tfidf weighted bag-of-verbs plus bag-of-nouns
#
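# Illustrative mapping (comments only, not executed): the module's entry points
# roughly correspond to the two approaches above. Exact input shapes depend on
# Corpus/TaggedQuestionCorpus, which are defined elsewhere in this repo.
#   clusterDocuments(docs)   # vector representations (tfidf weighted bag of words) + DBSCAN
#   cluster_questions(docs)  # in active use: greedy clustering over a tagged question corpus
#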
import numpy as np
import spacy
from collections import OrderedDict
import regex as re
from errors import UNKNOWN_KEYWORDS
from beagleError import BeagleError
from corpus import Corpus
from tagged_question_corpus import TaggedQuestionCorpus
from build_tag_cluster import buildTagCluster
import tfidf as tfidf
import document_vector as docVec
import cluster
import textrank
import time
from spacy.tokens import Doc
from sklearn.feature_extraction.text import TfidfVectorizer
from wordfreq import word_frequency
from clean_text import clean_text
from greedy_clusterer import cluster_greedily
from cluster_on_keywords import clusterOnKeywords, findKeywordClusters
Doc.set_extension("vector", default={})
Doc.set_extension("vocabulary_", default={})
Doc.set_extension("idf_", default={})
# nlp = spacy.load("./model/googlenews-spacy");
import en_core_web_lg
nlp = en_core_web_lg.load()
nlp.add_pipe(tfidf.generate_text_frequency, name="text_frequency")
# Keep interrogative words out of the stop list, since they help convey the meaning of a question
nlp.Defaults.stop_words -= {"what", "when", "who", "where", "why", "how"}


def buildCorpus(docs):
corpus = Corpus(docs, nlp)
corpus = tfidf.tfidf(corpus)
return docVec.tfidfWeightedBagOfWords(corpus, useGlobalIDF=True)


def clusterDocuments(docs):
corpus = buildCorpus(docs)
return cluster.dbscanWithWordVectorDistances(corpus)


def buildTaggedCorpus(docs):
corpus = TaggedQuestionCorpus(docs, nlp)
corpus = tfidf.tfidf(corpus)
return docVec.tfidfWeightedBagOfWords(corpus, useGlobalIDF=True)


def tagAndVectorizeCorpus(docs):
    """
    Builds a tagged question corpus and TF-IDF vectorizes it.
    Parameters:
        docs (list): List of strings (questions)
    Returns:
        corpus (TaggedQuestionCorpus): a tagged corpus with .idf_ and .vocabulary_ properties.
References:
TfidfVectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
"""
corpus = TaggedQuestionCorpus(docs, nlp)
vectorizer = TfidfVectorizer(
analyzer='word',
ngram_range=(1, 3),
use_idf=True,
sublinear_tf=True,
smooth_idf=True,
stop_words='english'
)
vectors = vectorizer.fit_transform([doc._.lemmatized for doc in corpus.documents]).toarray()
corpus.vocabulary_ = vectorizer.vocabulary_
corpus.idf_ = vectorizer.idf_
for ix, doc in enumerate(corpus.documents):
doc._.vector = vectors[ix]
return corpus
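
# Illustrative usage (comments only, not executed). The input is assumed to be a
# list of question strings, per the docstring above; the exact shape is defined by
# TaggedQuestionCorpus elsewhere in this repo.
#   corpus = tagAndVectorizeCorpus(["what is the sun", "how far away is the moon"])
#   corpus.vocabulary_             # term -> column index from the TfidfVectorizer
#   corpus.idf_                    # per-term inverse document frequencies
#   corpus.documents[0]._.vector   # dense tfidf vector for the first question
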
#Currently unused
def customClusterQuestions(docs, algorithm, parameters, removeOutliers=True):
"""
    Customized question clustering methods, implemented separately so as not to interfere with production builds.
Parameters:
docs (list): List of strings (questions)
algorithm (str): The clustering algorithm that will be used
(DBSCAN, K Means, Gaussian K Means, Agglomerative Clustering)
        parameters (list[dict]): Parameters used to initialize the clustering algorithm
(epsilon, clusters, strictness, threshold)
removeOutliers (bool): A flag to determine whether to remove outliers or not
(default is True)
Returns:
corpus: Corpus that has clusters list attached to it
"""
corpus = tagAndVectorizeCorpus(docs)
    params = {param["param"]: param["value"] for param in parameters}
if algorithm == "DBSCAN":
return cluster.dbscan(corpus, float(params["epsilon"]))
if algorithm == "K Means":
return cluster.kmeans(corpus, int(params["clusters"]), removeOutliers)
if algorithm == "Gaussian K Means":
return cluster.g_kmeans(corpus, min(4, int(params["strictness"])), removeOutliers)
if algorithm == "Agglomerative Clustering":
return cluster.agglomerate(corpus, float(params["threshold"]), removeOutliers)
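
# Illustrative calls (comments only, not executed). The expected shape of
# `parameters` is inferred from the dict comprehension above: a list of
# {"param": <name>, "value": <value>} entries.
#   customClusterQuestions(docs, "DBSCAN", [{"param": "epsilon", "value": "0.5"}])
#   customClusterQuestions(docs, "K Means", [{"param": "clusters", "value": "5"}])
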
#Currently unused
def cluster_questions_with_agglomeration(docs):
corpus = tagAndVectorizeCorpus(docs)
return cluster.agglomerate(corpus)


#In active use
def cluster_questions(docs):
return cluster_greedily(TaggedQuestionCorpus(docs, nlp))


#Currently unused
def clusterQuestionsOnKeywords(questions, keywords, do_agglomeration):
"""
    A three-step process for finding clusters in a set of questions when some keywords are provided. We assume the list of keywords is non-exhaustive, i.e. the provided keywords will not cover the entire corpus.
Step 1:
a. Use the complement Naive Bayes algorithm to determine which questions have the highest probability of generating each keyword - the higher the probability, the more likely the keyword and the question are in the same cluster.
        b. If one question is likely to come from two separate clusters, pick the cluster with the highest probability.
c. Return the clusters and the list of questions that do not belong to any of these clusters.
Step 2:
        a. Using the preferred clustering algorithm (agglomerative clustering), cluster the remaining unclustered questions.
b. Name the clusters identified and return the question clusters
Step 3:
a. Find any 2 clusters that share the same keyword and merge those clusters into one.
Parameters:
questions (list[object]): A list of objects with the id (key) and the question (value) string
keywords (list[string]): A list of keyword strings.
        **Note: We assume the keywords provided are the ones generated or accepted by the instructor. Keywords generated by the algorithm in the past are not expected.**
Returns:
tagged_question_clusters (object{keyword: list[qnId]}): An object with the keyword (key) and a list of question ids(value) that belong to that keyword.
"""
keyword_clusters, uncategorized_questions = findKeywordClusters(questions, keywords)
print("\n\n Keyword Clusters", keyword_clusters)
tagged_question_clusters = {}
if len(uncategorized_questions) > 0:
if do_agglomeration:
            question_clusters = cluster_questions_with_agglomeration(uncategorized_questions)
tagged_question_clusters = buildTagCluster(question_clusters)
else:
tagged_question_clusters["uncategorized"] = [qn["id"] for qn in uncategorized_questions]
# Consolidate the two cluster objects, looking for clusters with the same keyword
for key in keyword_clusters:
        if key in tagged_question_clusters:
tagged_question_clusters[key] += keyword_clusters[key]
else:
tagged_question_clusters[key] = keyword_clusters[key]
return tagged_question_clusters
# questionsCorpus = buildTaggedCorpus(uncategorized_questions)
# keywordsCorpus = buildCorpus(keywords)
#removed = keywordsCorpus.remove_unknown_docs()
# if removed:
# raise BeagleError(UNKNOWN_KEYWORDS)
# else:
# return clusterOnKeywords(questionsCorpus, keywordsCorpus)
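
# Illustrative input/output shapes (comments only, not executed), following the
# docstring above; the exact clusters returned depend on findKeywordClusters:
#   questions = [{"id": 1, "question": "How hot is the sun?"},
#                {"id": 2, "question": "What is photosynthesis?"}]
#   clusterQuestionsOnKeywords(questions, ["sun"], do_agglomeration=False)
#   # -> e.g. {"sun": [1], "uncategorized": [2]}
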
#In active use
def match_questions_with_categories(questions, clusters):
"""A simple matching algorithm that places questions into a pre-created cluster if:
1. The question's lemmatized form contains the cluster's keyword
2. The question contains no rarer English words that are also cluster keywords
Parameters:
questions (list[dict]): A list of dictionaries with an id and question (text) field
        clusters (list[string]): A list of pre-created cluster keywords
    Returns:
        cluster_additions (dict): Maps each matched keyword (plus "uncategorized") to a list of question ids
    """
cluster_additions = { "uncategorized": [] }
for question in questions:
clean_question = clean_text(question["question"].replace("\n", ""))
cluster_options = set()
for token in nlp(clean_question):
if token.lemma_ in clusters:
cluster_options.add(token.lemma_)
if len(cluster_options) == 0:
cluster_additions["uncategorized"].append(question["id"])
continue
best_keyword = None
rarest_freq = 1
for keyword in cluster_options:
if word_frequency(keyword, "en") < rarest_freq:
rarest_freq = word_frequency(keyword, "en")
best_keyword = keyword
if best_keyword in cluster_additions:
cluster_additions[best_keyword].append(question["id"])
else:
cluster_additions[best_keyword] = [question["id"]]
return cluster_additions
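
# Illustrative call (comments only, not executed), showing the "rarest keyword wins"
# rule. Per wordfreq, "neutrino" is a rarer English word than "sun", so the question
# below would be filed under "neutrino":
#   match_questions_with_categories(
#       [{"id": 7, "question": "How does the sun produce neutrinos?"}],
#       ["sun", "neutrino"])
#   # -> {"uncategorized": [], "neutrino": [7]}
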
#Currently unused
def match_questions_tfidf(questions, clusters):
# clusters = {
# "questions": [ 'and the moon too', 'lets show some' ],
# "clusterIds": [ 4, 4 ]
# }
# questions = [ {
# "id": 11, "question": 'Another one about the sun?'
# },
# {
# "id": 33,
# "question": 'What is the distance from the sun though?' },
# {
# "id": 37,
# "question": 'what\'s the changing factors of the sun and moon together?'
# } ]
# Clean and Lemmatize all the questions in the clusters
for idx, question in enumerate(clusters["questions"]):
question = clean_text(question.replace("\n", ""))
clusters["questions"][idx] = " ".join([d.lemma_ for d in nlp(question)])
# Clean and Lemmatize all the question in the orphan group
for idx, question in enumerate(questions):
question = clean_text(questions[idx]["question"].replace("\n", ""))
questions[idx]["question"] = " ".join([d.lemma_ for d in nlp(question)])
# Create corpus
completeCorpus = clusters["questions"] + [q["question"] for q in questions]
# completeCorpus = " ".join(clusters["questions"])
# completeCorpus += " ".join([q["question"] for q in questions])
vectorizer = TfidfVectorizer(
analyzer='char',
ngram_range=(1, 3),
use_idf=True,
sublinear_tf=True,
smooth_idf=True,
stop_words='english'
)
    vectorizer.fit(completeCorpus)
# add tfidf vectors to the clusters object for each question
clusters["question_vectors"] = vectorizer.transform(clusters["questions"]).toarray()
# for idx, question in enumerate(clusters["questions"]):
# clusters["question_vectors"].append(vectorizer.transform(question).toarray())
#
for idx, question in enumerate(questions):
question = questions[idx]["question"]
questions[idx]["question_vector"] = vectorizer.transform([question]).toarray()[0]
# print("\n Clusters: ", clusters)
# print("\n Questions: ", questions)
# print("\n Complete Corpus: ", completeCorpus)
cluster.findBestFitCluster(questions, clusters)
return True
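
# Note: despite its name, dist() below returns spaCy similarity (higher means more
# similar), not a distance; callers that need a true distance would have to convert it.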
def dist(wordOne, wordTwo):
one = nlp(wordOne)
two = nlp(wordTwo)
if (np.linalg.norm(one.vector) == 0
or np.linalg.norm(two.vector) == 0):
raise Exception("Don't have both words")
return one.similarity(two)


def createCompleteCorpus(questions, clusters):
    completeCorpus = [q["question"] for q in questions]
    completeCorpus = completeCorpus + [c["question"] for idx in clusters for c in clusters[idx]]
    completeCorpus = nlp(" ".join(completeCorpus))
    completeCorpus = " ".join([d.lemma_ for d in completeCorpus])
    # print("completeCorpus: ", completeCorpus)
    return completeCorpus


def getVector(word):
return nlp(word).vector