#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This module sets up the classification system (the semantic index)
Call it by
topic_classification [tags]
The corpus name and number of posts per topic are set in config.py
The corpus_name is your choice. I use something like stackoverflow_xml for the stackoverflow database and xml tag.
tags are the tags to include (Note: the more you include, the more memory you need for the matrix!). If it is not given,
only the top tags are included, but note that the top ten tags are 50% of the posts.
"""
import sys
import os
import logging
import re
import time
import cPickle as pickle

import gensim
import nltk
import MySQLdb
from BeautifulSoup import BeautifulSoup

from config import Config
import util

# tokenization resources: stop lists, punctuation, and the WordNet lemmatizer
linkstops = [u"http", u"com", u"org", u"www", u":", u"://", u"/", u"."]
punctuators = [".", ",", "-", "/", "&", "\\", "'", '"', ":", ";", "(", ")", "?", "*", "!", "$", "%", "#", "|", "{", "}", "[", "]", "://", "...", "<", ">"]
punctuatorSet = set(punctuators)
domainStops = ["gt", "lt"]  # leftovers from HTML entities (&gt;, &lt;)
stopwords = set(nltk.corpus.stopwords.words('english') + punctuators + domainStops)
stemmer = nltk.WordNetLemmatizer().lemmatize

def tokenizeText(postText, useStemmer=False):
    """ tokenize some text """
    text = nltk.wordpunct_tokenize(postText)
    words = [stemmer(w.lower()) if useStemmer else w.lower()
             for w in text if w.lower() not in stopwords]
    nText = nltk.Text(words)
    return nText.tokens

def tokenizePost(title, question, answers, tags):
    """ tokenize a post, separating text and code """
    # note: a better way to do this would be separating out code and text
    # completely and indexing them separately
    textTokens = []
    codeTokens = []
    linkTokens = []
    for postPart in [title, question] + answers:
        if postPart:
            postSoup = BeautifulSoup(postPart)
            postCode = util.extractCode(postSoup)
            if postCode:
                nCode = nltk.Text([ctoken.lower()
                                   for ctoken in nltk.wordpunct_tokenize(postCode)
                                   if ctoken not in punctuatorSet])
                codeTokens += nCode.tokens
            postText = util.extractText(postSoup)
            if postText:
                textTokens += tokenizeText(postText, useStemmer=True)
            #postLinks = util.extractLinks(postSoup)
            #linkParts = nltk.wordpunct_tokenize(postLinks)
            #links = [link for link in linkParts if link not in linkstops]
            #nLinks = nltk.Text(links)
            #linkTokens += nLinks.tokens
    tagTokens = re.split("[<>]+", tags)
    return (textTokens + codeTokens + linkTokens + tagTokens[1:-1])
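
# Worked example of the tag handling above:
#   re.split("[<>]+", "<python><mysql>") == ['', 'python', 'mysql', '']
# so tagTokens[1:-1] drops the empty boundary strings and keeps only the tag names.
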
class StackOverflowCorpus(object):
    """ abstract corpus for gensim
    A corpus is a set of documents containing a numerical word list and a dictionary
    linking the numbers to words.
    This class also keeps track of the mapping between corpus "documents" and post ids.
    """
    def __init__(self, db, dictionary, topic=None, postList=None):
        self.t0 = time.time()
        self.tbegin = time.time()
        self.ctr = 0
        self.dictionary = dictionary
        self.topic = topic
        self.postList = postList
        self.db = db
        self.corpusToPost = {}

    def unicodifyTokens(self, lst):
        """ yield tokens coerced to unicode, silently dropping any that cannot be decoded """
        for item in lst:
            try:
                yield unicode(item)
            except UnicodeError:
                pass

    def __iter__(self):
        for question in util.iterateQuestions(self.db, self.topic, self.postList):
            answers = [answer for answer in util.iterateAnswers(self.db, [question.id])]
            tokens = tokenizePost(question.title, question.body,
                                  [answer.body for answer in answers], question.tags)
            if Config.debug and self.ctr > 0 and (self.ctr % 5000) == 0:
                now = time.time()
                print >>sys.stderr, "Posts imported:", self.ctr, "(in %0.1fs, %0.2f posts/s)" % (
                    (now - self.t0), self.ctr / (now - self.tbegin)
                )
                self.t0 = now
            self.corpusToPost[self.ctr] = question.id
            self.ctr += 1
            yield self.dictionary.doc2bow([utoken for utoken in self.unicodifyTokens(tokens)],
                                          allow_update=True)

    def saveCorpusToPost(self, fileName):
        with open(fileName, "wb") as f:
            pickle.dump(self.corpusToPost, f)

    @staticmethod
    def loadCorpusToPost(fileName):
        with open(fileName, "rb") as f:
            return pickle.load(f)

def testIterate():
    # connect before the try block, so the finally clause never sees an unbound db
    db = util.makeDbConnection("stackoverflow")
    try:
        dictionary = gensim.corpora.dictionary.Dictionary()
        for doc in StackOverflowCorpus(db, dictionary):
            print "doc=", doc, "dictionary=", dictionary
            raw_input()
    finally:
        db.close()

def frequencyDistributionToSQL(db, topic, freqDist):
    """ store an n-gram frequency distribution in a per-topic table """
    # note: topic is interpolated into the statements, so it must be a trusted identifier
    c = db.cursor()
    c.execute("""CREATE TABLE IF NOT EXISTS %s_frequency_table (
        n_gram varchar(100),
        frequency int,
        INDEX (n_gram)
    ) ENGINE=MyISAM DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci""" % (topic))
    c.fetchall()
    if Config.debug:
        print >>sys.stderr, "Importing frequency distribution to SQL"
    c.executemany(("""INSERT INTO %s_frequency_table (""" % topic) + """
            n_gram,
            frequency
        ) VALUES (
            %s,
            %s
        )""",
        [(ngram, frequency) for (ngram, frequency) in freqDist.items()]
    )
    c.fetchall()
    c.close()
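
# Sketch of the intended call (freqDist is anything with dict-style .items(),
# e.g. an nltk.FreqDist over tokenizeText output):
#   frequencyDistributionToSQL(db, "python", nltk.FreqDist(tokenizeText(text)))
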
def getWholePost(db, postId):
    """ get a (title, post and all its answers) """
    title = None
    wholePost = None
    for question in util.iterateQuestions(db, postList=[postId]):
        # iterateAnswers takes a list of post ids, as at the other call sites
        answers = "\n\n".join([answer.body for answer in util.iterateAnswers(db, [postId])])
        title = question.title
        wholePost = "\n\n".join([question.title, question.body, answers, question.tags])
    return (title, wholePost)

def makeStackOverflowCorpus(fileName, topic=None, usePostList=False, useTags=[]):
    db = util.makeDbConnection(Config.myDb)
    if usePostList:
        print "Using post list"
        if not useTags:
            useTags = util.topTags(db, 200)
        postList = util.tagPosts(db, useTags)
        print "Need to import", len(postList), "posts"
    else:
        postList = None
    dictionary = gensim.corpora.dictionary.Dictionary()
    soCorpus = StackOverflowCorpus(db, dictionary, topic, postList)
    try:
        gensim.corpora.MmCorpus.serialize(fileName + ".mm", soCorpus)
    finally:
        dictionary.save(fileName + ".dict")
        soCorpus.saveCorpusToPost(fileName + ".c2p")
        db.close()
    # return the number of posts actually imported (postList may be None)
    return soCorpus.ctr

def makeTfIdf(fileName):
    """ make TFIDF from a corpus """
    corpus = gensim.corpora.MmCorpus(fileName + ".mm")
    tfidf = gensim.models.TfidfModel(corpus)
    tfidf.save(fileName + ".tfidf")
    return tfidf

def makeLSI(fileName, nTopics, fromTfidf=False):
    """ make LSI given a corpus filename """
    corpus = gensim.corpora.MmCorpus(fileName + ".mm")
    if fromTfidf:
        print >>sys.stderr, "Converting corpus to TFIDF representation"
        tfidf = gensim.models.TfidfModel.load(fileName + ".tfidf")
        useCorpus = tfidf[corpus]
    else:
        useCorpus = corpus
    dictionary = gensim.corpora.Dictionary.load(fileName + ".dict")
    lsi = gensim.models.LsiModel(useCorpus, id2word=dictionary, num_topics=nTopics)
    lsi.save(fileName + ".lsi")
    return lsi

def makeLDA(fileName, nTopics, fromTfidf=False):
    """ make LDA given a corpus filename """
    corpus = gensim.corpora.MmCorpus(fileName + ".mm")
    if fromTfidf:
        print >>sys.stderr, "Converting corpus to TFIDF representation"
        tfidf = gensim.models.TfidfModel.load(fileName + ".tfidf")
        useCorpus = tfidf[corpus]
    else:
        useCorpus = corpus
    dictionary = gensim.corpora.Dictionary.load(fileName + ".dict")
    lda = gensim.models.LdaModel(useCorpus, id2word=dictionary, num_topics=nTopics)
    lda.save(fileName + ".lda")
    return lda

def ldaTopics(fileName, query):
    """ given a query document, find lda topics """
    lda = gensim.models.LdaModel.load(fileName + ".lda")
    dictionary = gensim.corpora.Dictionary.load(fileName + ".dict")
    queryBow = dictionary.doc2bow(tokenizeText(query, useStemmer=True))
    topics = lda[queryBow]
    return topics
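
# lda[queryBow] yields (topic_id, probability) pairs, so a caller might do
# (query string and values illustrative):
#   for topicId, p in ldaTopics(Config.corpusName, u"parse xml in python"):
#       print topicId, p   # -> e.g. 3 0.61
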
def makeSimilarityIndex(fileName, fromTfIdf=False):
    corpus = gensim.corpora.MmCorpus(fileName + ".mm")
    if fromTfIdf:
        print >>sys.stderr, "Converting corpus to TFIDF representation"
        tfidf = gensim.models.TfidfModel.load(fileName + ".tfidf")
        useCorpus = tfidf[corpus]
    else:
        useCorpus = corpus
    lsi = gensim.models.LsiModel.load(fileName + ".lsi")
    # num_features must match the dimensionality of the LSI space
    index = gensim.similarities.Similarity(fileName, lsi[useCorpus], num_features=lsi.num_topics)
    index.save(fileName + ".index")
    return index

def displayMatches(db, matches, start=0, maxresults=5):
    for match in matches[start:(start + maxresults)]:
        print >>sys.stderr, "Working on match:", match
        post = util.Post.fromPostId(db, match[0])
        print post.title, " SIMILARITY:", match[1]

class QueryResult:
    def __init__(self, db, match, post=None):
        """ convert a similarityQuery match or post to a QueryResult structure;
        if post is given, match should be the similarity """
        if post:
            self.id = post.id
            self.post = post
            self.similarity = match
        else:
            self.id = int(match[0])
            self.post = util.Post.fromPostId(db, self.id)
            self.similarity = match[1]

class TopicModeling(object):
    """ class to keep references to all the parts of the topic model (aka index) in memory """
    def __init__(self, corpusName):
        self.corpusName = corpusName
        self.index = gensim.similarities.Similarity.load(corpusName + ".index")
        self.lsi = gensim.models.LsiModel.load(corpusName + ".lsi")
        self.corpus = gensim.corpora.MmCorpus(corpusName + ".mm")
        self.dictionary = gensim.corpora.Dictionary.load(corpusName + ".dict")
        self.corpusToPost = StackOverflowCorpus.loadCorpusToPost(corpusName + ".c2p")
        self.tfidf = gensim.models.TfidfModel.load(corpusName + ".tfidf")
        self.corpusTfidf = self.tfidf[self.corpus]

    def similarityQuery(self, query, cutoff=0):
        """ perform a similarity query. return matching post ids, similarity score, and corpus id """
        logging.debug("tokenizing query...")
        queryBow = self.dictionary.doc2bow(tokenizeText(query, useStemmer=True))
        logging.debug("converting query to LSI space...")
        # go through TFIDF first, matching how the index is built (see main()
        # and QuerySimilarity below)
        queryLsi = self.lsi[self.tfidf[queryBow]]
        logging.debug("querying the index...")
        results = self.index[queryLsi]
        logging.debug("filtering %d results..." % len(results))
        matchingPosts = [(self.corpusToPost[corpusDoc], similarity, corpusDoc)
                         for corpusDoc, similarity in enumerate(results)
                         if similarity >= cutoff]
        logging.debug("sorting %d results..." % len(matchingPosts))
        matchingPosts.sort(key=lambda match: -match[1])
        logging.debug("sorting complete...")
        return matchingPosts
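
# Minimal usage sketch (assumes main() has already built the .mm/.dict/.tfidf/
# .lsi/.index/.c2p files for Config.corpusName; the query string is illustrative):
#   tm = TopicModeling(Config.corpusName)
#   for postId, score, corpusDoc in tm.similarityQuery(u"sort a dict by value", cutoff=0.2)[:5]:
#       print postId, score
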
class QuerySimilarity:
    """ allow comparisons of the same query to multiple other documents """
    def __init__(self, topicModel, query):
        self.topicModel = topicModel
        self.queryBow = topicModel.dictionary.doc2bow(tokenizeText(unicode(query), useStemmer=True))
        self.queryLsi = topicModel.lsi[topicModel.tfidf[self.queryBow]]

    def similarity(self, document):
        docBow = self.topicModel.dictionary.doc2bow(tokenizePost("", document, [], ""))
        if docBow:
            docLsi = self.topicModel.lsi[self.topicModel.tfidf[docBow]]
            docIndex = gensim.similarities.MatrixSimilarity([docLsi])
            similarity = docIndex[self.queryLsi]
            return similarity[0]
        else:
            print >>sys.stderr, "WARNING: Document did not bow:", document
            return 0.5
    def similarityToDocument(self, document, query):
        """ like similarity(), but for a one-off query rather than the one bound in __init__ """
        docBow = self.topicModel.dictionary.doc2bow(tokenizePost("", document, [], ""))
        if docBow:
            docLsi = self.topicModel.lsi[self.topicModel.tfidf[docBow]]
            docIndex = gensim.similarities.MatrixSimilarity([docLsi])
            queryBow = self.topicModel.dictionary.doc2bow(tokenizeText(unicode(query), useStemmer=True))
            queryLsi = self.topicModel.lsi[self.topicModel.tfidf[queryBow]]
            similarity = docIndex[queryLsi]
            return similarity[0]
        else:
            print >>sys.stderr, "WARNING: Document did not bow:", document
            return 0.5
    def queryResults(self, db, query, cutoff=0.5):
        """ return query results as a list of QueryResult instances """
        matchingPosts = self.topicModel.similarityQuery(query, cutoff)
        # link id -> similarity
        postMatches = {match[0]: match[1] for match in matchingPosts}
        # get the actual posts, but remove the closed ones
        posts = util.Post.fromPostIds(db, postMatches, removeClosed=True)
        logging.debug("returning %d open posts" % len(posts))
        return [QueryResult(db, postMatches[post.id], post=post) for post in posts]
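
# Sketch of comparing one bound query against ad-hoc documents (the HTML body
# below is hypothetical; any post-style markup that util.extractText handles works):
#   qs = QuerySimilarity(tm, u"how do I join two tables in SQL")
#   print qs.similarity(u"<p>Use an <code>INNER JOIN</code> on the shared key.</p>")
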
def main():
    """ generate a dictionary, corpus and index from the Stack Overflow dump """
    corpusName = Config.corpusName
    print >>sys.stderr, "Generating corpus..."
    if os.path.isfile(corpusName + ".mm"):
        corpus = gensim.corpora.MmCorpus(corpusName + ".mm")
        nPosts = len(corpus)
        print >>sys.stderr, "Corpus exists with %d posts. skipping." % nPosts
    else:
        if len(sys.argv) > 1:
            useTags = sys.argv[1:]
        else:
            useTags = []
        nPosts = makeStackOverflowCorpus(corpusName, None, usePostList=True, useTags=useTags)
    print >>sys.stderr, "Making TFIDF representation..."
    if os.path.isfile(corpusName + ".tfidf"):
        print >>sys.stderr, "exists, skipping."
    else:
        makeTfIdf(corpusName)
    print >>sys.stderr, "Making LSI..."
    if os.path.isfile(corpusName + ".lsi"):
        print >>sys.stderr, "exists, skipping."
    else:
        makeLSI(corpusName, nPosts // Config.postsPerTopic, True)
    print >>sys.stderr, "Making similarity index..."
    if os.path.isfile(corpusName + ".index"):
        print >>sys.stderr, "exists, skipping."
    else:
        makeSimilarityIndex(corpusName, True)
    #print >>sys.stderr, "Making LDA topic model..."
    #makeLDA(corpusName, nPosts//100, True)

if __name__ == "__main__":
print >>sys.stderr , "Using corpus name: %s" % Config.corpusName
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
main()