Example #1
0
class Blog(object):

	
	def __init__(self):
		
		# create indexer 
		self.Idx = Indexer()

		# create two connection instances
		self.Post = None
		self.InvIdx = None

		self.index_fields = []

	def set_db(self,Blog_DB):
		self.Post = Blog_DB.posts
		self.InvIdx = Blog_DB.invidx

	def set_index_fields(self,fields):

		if not isinstance(fields, list): 
			raise Exception("Fields must be a list")

		self.index_fields = fields
		self.Idx.set_idx_fields(fields)


	def save_post(self,post):
		logging.debug('save_post: ' + str(post))
		if self.index_fields == []:
			raise Exception("No fields to index. Please set it first!")
		if isinstance(post,list): 
			raise Exception("Only accept 1 post")

		
		if logging.root.level == logging.DEBUG:
			post_start_time = datetime.datetime.utcnow()

		# inserting post to posts collection
		obj_id = self.Post.insert(post)

		if logging.root.level == logging.DEBUG:
			post_end_time = datetime.datetime.utcnow()

		
		if obj_id == None:
			raise Exception("Error saving to mongodb")
		logging.debug('Saving post to mongo is OK')

		# strip unnecessary string
		#obj_id_strip = str(obj_id).strip('ObjectId("').rstrip('")')
		#logging.debug('strip object_id to: ' + obj_id_strip)
		
		if logging.root.level == logging.DEBUG:
			idx_start_time = datetime.datetime.utcnow()

		# get word
		words = self.Idx.index(post)		

		# updating words to inverted index
		# using loop
		# TODO: change to bulk update
		for word in words:
			#print word
			#self.InvIdx.update({"word":word},{"$push":{"docs":obj_id_strip}},True)
			self.InvIdx.update({"word":word},{"$push":{"docs":obj_id}},True)
		
		if logging.root.level == logging.DEBUG:
			idx_end_time = datetime.datetime.utcnow()
			# print info
			post_time = post_end_time-post_start_time
			idx_time = idx_end_time-idx_start_time
			total_time = post_time + idx_time
			
			logging.debug('time to save post: ' +str(post_time.total_seconds()))
			logging.debug('time to save idx: ' +str(idx_time.total_seconds()))
			logging.debug('total time: ' +str(total_time.total_seconds()))

		return obj_id
	
	def get_dummy_post(self,number):
		
		if (number<0) or (number>4): 
				raise Exception("Choose 1..4")

		posts = {}
		posts[1] = "Six people have been shot dead after a Russian lawyer opened fire on his colleagues at a pharmacy company"
		posts[2] = "Water and Venice usually go together like bees and honey. But not when there's as much rain"
		posts[3] = "Two men inside the utility truck have a lucky escape after a passing freight train collides with their vehicle"
		posts[4] = "Super storm Sandy gives New York a historic drenching.\nBattery Park in lower Manhattan floods as record high water"

		return {"title":"Dummy post "+str(number) ,"content": posts[number], "time":str(datetime.datetime.utcnow())}

	def clear(self):
		self.Post.remove()
		self.InvIdx.remove()

	def search(self,input_text):


		# get time: start first query
		if logging.root.level == logging.DEBUG:
			query_idx_start_time = datetime.datetime.utcnow()

		# tokenize query
		words_text_input = self.Idx.tokenize(text_input)

		# build query to get doc_ids
		list_words_text_input = []
		for word_text_input in words_text_input:
			#print word_text_input
			cond_words_text_input = {"word": word_text_input}
			list_words_text_input.append(cond_words_text_input)
		final_words_text_input = {"$or":list_words_text_input}
		
		# get doc_ids from inverted index
		doc_ids = [queryIdx.values()[0] for queryIdx in self.InvIdx.find( final_words_text_input, {"docs" :1 })]
		# remove duplicate doc_id
		doc_ids = set([doc_id[0] for doc_id in doc_ids])

		# get time: end first query & start second query
		if logging.root.level == logging.DEBUG:
			query_idx_end_time = datetime.datetime.utcnow()
			query_col_start_time = query_idx_end_time

		# build query to get documents by doc_ids
		list_doc = []
		for doc_id in doc_ids:
			cond_doc = {"_id": ObjectId(doc_id)}
			list_doc.append(cond_doc)
		final_doc = {"$or":list_doc}

		# get post from posts collection
		docs = self.Post.find(final_doc)


		if logging.root.level == logging.DEBUG:
			query_col_end_time = datetime.datetime.utcnow()
			
			# print info
			
			query_idx_time = query_idx_end_time - query_idx_start_time
			query_col_time = query_col_end_time - query_col_start_time
			total_time = query_idx_time + query_col_time
						
			logging.debug('time to query invidx: ' +str(query_idx_time.total_seconds()))
			logging.debug('time to query posts: ' +str(query_col_time.total_seconds()))
			logging.debug('total query time: ' +str(total_time.total_seconds()))

		return docs