# blog.py
import pymongo
import logging
import datetime
from indexer import Indexer
from bson.objectid import ObjectId
class Blog(object):
    """Blog post store with an inverted index kept in MongoDB collections.

    Posts are inserted into a ``posts`` collection.  For every word the
    indexer extracts from a post, the post's id is pushed onto the ``docs``
    array of that word's entry in an ``invidx`` collection.  ``search``
    resolves a query string to posts through that inverted index.
    """

    def __init__(self):
        # Indexer supplies index() for posts and tokenize() for queries.
        self.Idx = Indexer()
        # Collection handles; both stay None until set_db() is called.
        self.Post = None
        self.InvIdx = None
        # Names of the post fields to index; must be set before save_post().
        self.index_fields = []

    def set_db(self, Blog_DB):
        """Bind the ``posts`` and ``invidx`` collections of *Blog_DB*."""
        self.Post = Blog_DB.posts
        self.InvIdx = Blog_DB.invidx

    def set_index_fields(self, fields):
        """Set the list of post fields to index.

        Raises:
            Exception: if *fields* is not a list.
        """
        if not isinstance(fields, list):
            raise Exception("Fields must be a list")
        self.index_fields = fields
        self.Idx.set_idx_fields(fields)

    def save_post(self, post):
        """Insert one post and add its words to the inverted index.

        Returns:
            The id returned by the posts collection's ``insert``.

        Raises:
            Exception: if no index fields have been set, if *post* is a
                list, or if the insert returned ``None``.
        """
        logging.debug('save_post: %s', post)
        if not self.index_fields:
            raise Exception("No fields to index. Please set it first!")
        if isinstance(post, list):
            raise Exception("Only accept 1 post")

        # Timestamps are always taken (cheap) so the timing report below
        # never depends on which log level was active at which point.
        post_start_time = datetime.datetime.utcnow()
        obj_id = self.Post.insert(post)
        post_end_time = datetime.datetime.utcnow()
        if obj_id is None:
            raise Exception("Error saving to mongodb")
        logging.debug('Saving post to mongo is OK')

        idx_start_time = datetime.datetime.utcnow()
        words = self.Idx.index(post)
        # TODO: change to bulk update
        for word in words:
            # Third positional argument is the upsert flag: the entry for
            # a word is created the first time the word is seen.
            self.InvIdx.update(
                {"word": word}, {"$push": {"docs": obj_id}}, True)
        idx_end_time = datetime.datetime.utcnow()

        post_time = post_end_time - post_start_time
        idx_time = idx_end_time - idx_start_time
        total_time = post_time + idx_time
        logging.debug('time to save post: %s', post_time.total_seconds())
        logging.debug('time to save idx: %s', idx_time.total_seconds())
        logging.debug('total time: %s', total_time.total_seconds())
        return obj_id

    def get_dummy_post(self, number):
        """Return one of four canned posts, selected by *number* (1..4).

        The returned dict has ``title``, ``content`` and ``time`` keys;
        ``time`` is the current UTC time rendered with ``str``.

        Raises:
            Exception: if *number* is not one of 1, 2, 3, 4.
        """
        posts = {
            1: "Six people have been shot dead after a Russian lawyer opened fire on his colleagues at a pharmacy company",
            2: "Water and Venice usually go together like bees and honey. But not when there's as much rain",
            3: "Two men inside the utility truck have a lucky escape after a passing freight train collides with their vehicle",
            4: "Super storm Sandy gives New York a historic drenching.\nBattery Park in lower Manhattan floods as record high water",
        }
        # Membership test (rather than a range test) also rejects 0 and
        # non-integer values that would otherwise surface as a KeyError.
        if number not in posts:
            raise Exception("Choose 1..4")
        return {
            "title": "Dummy post " + str(number),
            "content": posts[number],
            "time": str(datetime.datetime.utcnow()),
        }

    def clear(self):
        """Remove every document from both collections."""
        self.Post.remove()
        self.InvIdx.remove()

    def search(self, input_text):
        """Return the posts that contain any word of *input_text*.

        The query is tokenized by the indexer, the matching inverted-index
        entries are fetched, and every post id listed in their ``docs``
        arrays is looked up in the posts collection.

        Returns:
            The result of ``find`` on the posts collection, or an empty
            list when the query has no words or nothing in the index
            matches.
        """
        query_idx_start_time = datetime.datetime.utcnow()

        # Tokenize the argument itself, not a module-level global.
        words = list(self.Idx.tokenize(input_text))
        if not words:
            # An empty condition list is not a valid query; nothing to find.
            return []

        # Collect *all* post ids of every matching word; the set removes
        # ids reached through more than one word.
        doc_ids = set()
        index_entries = self.InvIdx.find({"word": {"$in": words}}, {"docs": 1})
        for entry in index_entries:
            for doc_id in entry.get("docs", []):
                doc_ids.add(ObjectId(doc_id))

        query_idx_end_time = datetime.datetime.utcnow()
        query_col_start_time = query_idx_end_time

        if not doc_ids:
            return []

        docs = self.Post.find({"_id": {"$in": list(doc_ids)}})
        query_col_end_time = datetime.datetime.utcnow()

        query_idx_time = query_idx_end_time - query_idx_start_time
        query_col_time = query_col_end_time - query_col_start_time
        total_time = query_idx_time + query_col_time
        logging.debug('time to query invidx: %s', query_idx_time.total_seconds())
        logging.debug('time to query posts: %s', query_col_time.total_seconds())
        logging.debug('total query time: %s', total_time.total_seconds())
        return docs
if __name__ == '__main__':
    # Manual smoke test: needs a MongoDB server reachable on localhost.
    logging.root.setLevel(logging.DEBUG)

    logging.debug('Create Connection')
    Con = pymongo.Connection('localhost')
    # set db name: myblog
    Blog_DB = Con.myblog

    logging.debug('Create Blog')
    # Lower-case instance name so the Blog class itself is not rebound.
    blog = Blog()
    # set database for blog
    blog.set_db(Blog_DB)
    # set fields to be indexed
    blog.set_index_fields(['title', 'content'])

    # =============================
    # Test inserting document
    # =============================
    # start from empty collections
    blog.clear()
    # insert the first three dummy posts
    for number in (1, 2, 3):
        logging.debug("insert post%d to db", number)
        blog.save_post(blog.get_dummy_post(number))

    # =============================
    # Test querying document
    # =============================
    print("\n## MongoDB Real Time Full Text Search - Python Driver ##\n")
    text_input = raw_input('Input Full Text Search : ')
    docs = blog.search(text_input)
    for doc in docs:
        print(doc)