Example 1
class SearchServer:

	def __init__(self):
		self.text_corpus_filepath = 'text_corpus'
		self.doc_index_filepath = 'main_index'
		self.title_index_filepath = 'title_index'
		self.page_ranks_filepath = 'page_ranks'
		self.page_rank_dict = {}
		self.numDocs = None
		self.corpus = {}
		self.doc_index = {}
		self.title_index = {}
		self.doc_tf = {}
		self.doc_idf = {}
		self.title_tf = {}
		self.title_idf = {}
		self.snippetizer = Snippetizer()
		self.stemmer = SimpleStemmer()
		self.stopwords = dict.fromkeys(['a','able','about','across','after','all','almost','also','am','among','an','and','any','are','as','at','be','because','been','but','by','can','cannot','could','dear','did','do','does','either','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','like','likely','may','me','might','most','must','my','neither','no','nor','not','of','off','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','why','will','with','would','yet','you','your'])

	def tokenize(self, line):
		line = line.lower()
		line = re.sub(r'[^a-z0-9 ]', ' ', line) # replace non-alphanumeric characters with spaces
		line = line.split()
		line = [word for word in line if word not in self.stopwords]	# eliminate stopwords
		line = [self.stemmer.stem(word) for word in line]
		return line

	def read_text_corpus(self, filepath):
		corpus = {}
		corpus_file = open(filepath, 'r')
		for line in corpus_file:
			line = line.rstrip()
			(docID, title, url, doc) = line.split('\x03')
			corpus[docID] = {'title': title, 'url': url, 'doc': doc}
		corpus_file.close()
		return corpus
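
	# Illustrative corpus line (made-up values), with the four fields separated
	# by the '\x03' control character, exactly as split above:
	#   42\x03Some page title\x03http://example.com/page\x03full page text ...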

	def read_index(self, index_filepath):
		index_file = open(index_filepath, 'r')
		self.numDocs = int(index_file.readline().rstrip().split('.')[0])	# first line holds the document count (may have been written as a float)
		index = {}
		tf = {}
		idf = {}
		for line in index_file:
			line = line.rstrip()
			# each line: term|docID:pos,pos;docID:pos,...|tf,tf,...|idf
			(term, postings, line_tf, line_idf) = line.split('|')
			postings = postings.split(';')
			postings = [post.split(':') for post in postings]
			postings = [ [int(post[0]), map(int, post[1].split(','))] for post in postings ]
			index[term] = postings
			line_tf = line_tf.split(',')
			tf[term] = map(float, line_tf)
			idf[term] = float(line_idf)
		index_file.close()
		return (index, tf, idf)

	def read_indexes(self):
		print "Reading text corpus..."
		self.corpus = self.read_text_corpus(self.text_corpus_filepath)
		print "Reading page rank info..."
		pagerank_fp = open(self.page_ranks_filepath, "r")
		self.page_rank_dict = pickle.load(pagerank_fp)
		pagerank_fp.close()
		print "Reading document index (may take a few minutes)..."
		(self.doc_index, self.doc_tf, self.doc_idf) = self.read_index(self.doc_index_filepath)
		# print "Reading title index..."
		# (self.title_index, self.title_tf, self.title_idf) = self.read_index(self.title_index_filepath)
		print "Ready! Listening on localhost:20001"
 
	def intersect_lists(self, lists):
		if len(lists) == 0:
			return []
		lists.sort(key = len)
		return list(reduce(lambda x, y: set(x)&set(y), lists))

	def get_postings(self, terms):
		return [self.doc_index[term] for term in terms]

	def get_docs_from_postings(self, postings):
		return [ [x[0] for x in post] for post in postings]

	def dot_product(self, vector1, vector2):
		if len(vector1) != len(vector2):
			return 0
		return sum([ x*y for (x,y) in zip(vector1, vector2) ])

	def rank_documents(self, terms, docs):
		doc_vectors = defaultdict(lambda: [0]*len(terms))
		query_vector = [0]*len(terms)
		for term_index, term in enumerate(terms):
			if term not in self.doc_index:
				continue
			query_vector[term_index] = self.doc_idf[term]
			for doc_index, (doc, postings) in enumerate(self.doc_index[term]):
				if doc in docs:
					doc_vectors[doc][term_index] = self.doc_tf[term][doc_index]

		# score = dot product of the doc's tf vector with the query's idf vector,
		# plus a heavily weighted PageRank term so well-linked pages rank first
		doc_scores = [ [self.dot_product(cur_doc_vector, query_vector) + self.page_rank_dict[doc]*1E6, doc]
				for doc, cur_doc_vector in doc_vectors.iteritems() ]
		doc_scores.sort(reverse=True)
		intermediate_docs = [str(x[1]) for x in doc_scores][:100]
		seen_titles = set()
		result_docs = []
		for document in intermediate_docs:
			if self.corpus[document]['title'] not in seen_titles:
				result_docs.append([self.corpus[document]['title'], self.corpus[document]['url'], self.snippetizer.get_snippet(self.corpus[document]['doc'], ' '.join(terms).strip())])
				seen_titles.add(self.corpus[document]['title'])
#		result_docs = [(self.corpus[x]['title'], self.corpus[x]['url'], self.snippetizer.get_snippet(self.corpus[x]['doc'], ' '.join(terms).strip())) for x in result_docs]
		return result_docs[:10]
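
	# Illustrative scoring example (made-up numbers): for the query terms
	# ['world', 'cup'] with idf weights 2.0 and 3.0, a document whose tf vector
	# is [0.1, 0.2] scores 0.1*2.0 + 0.2*3.0 = 0.8, plus its PageRank * 1E6.
	# Candidates are then sorted by score, deduplicated by title, and the top
	# ten (title, url, snippet) triples are returned.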

	def one_term_query(self, q):
		original_query = q
		q = self.tokenize(q)
		if len(q)==0:
			print ''
			return ''
		elif len(q) > 1:
			return self.free_term_query(original_query)
		term = q[0]
		if term not in self.doc_index:
			print ''
			return ''
		else:
			postings = self.doc_index[term]
			docs = [x[0] for x in postings]
			return self.rank_documents(q, docs)

	def free_term_query(self, q):
		q = self.tokenize(q)
		if len(q)==0:
			print ''
			return ''
		li = set()
		for term in q:
			try:
				postings = self.doc_index[term]
				docs = [x[0] for x in postings]
				li = li|set(docs)
			except KeyError:	# term not in index
				pass
		li = list(li)
		return self.rank_documents(q, li)

	def phrase_query(self, q):
		original_query = q
		q = self.tokenize(q)
		if len(q) == 0:
			print ''
			return ''
		elif len(q) == 1:
			return self.one_term_query(original_query)
		phrase_docs = self.phrase_query_docs(q)
		return self.rank_documents(q, phrase_docs)

	def phrase_query_docs(self, q):
		# every term of the phrase must appear in the index, otherwise no doc can match
		for term in q:
			if term not in self.doc_index:
				return []
		postings = self.get_postings(q)
		docs = self.get_docs_from_postings(postings)
		docs = self.intersect_lists(docs)
		for i in xrange(len(postings)):
			postings[i] = [x for x in postings[i] if x[0] in docs]
		postings = copy.deepcopy(postings)
		for i in xrange(len(postings)):
			for j in xrange(len(postings[i])):
				postings[i][j][1] = [x - i for x in postings[i][j][1]]
		result = []
		for i in xrange(len(postings[0])):
			li = self.intersect_lists([x[i][1] for x in postings])
			if li == []:
				continue
			else:
				result.append(postings[0][i][0])
		return result
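
	# Worked example of the position-shift trick above (illustrative numbers):
	# for the phrase "champions league", suppose doc 12 contains "champions" at
	# positions [5, 40] and "league" at positions [6, 55]. Shifting each term's
	# positions left by its index in the phrase gives [5, 40] and [5, 54];
	# their intersection is [5], which is non-empty, so doc 12 has the two
	# terms in consecutive positions and matches the phrase.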

	def parse_query(self, query):
		if '"' in query:
			return self.phrase_query(query)
		elif len(query.split()) > 1:
			return self.free_term_query(query)
		else:
			return self.one_term_query(query)

	def listen(self):
		s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
		s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
		host = socket.gethostname()
		port = PORT
		s.bind((host, port))
		s.listen(5)	# start listening once; accept connections in the loop below
		while True:
			c, addr = s.accept()
			msg = recv_delim(c, 512, '\x01')
			query = pickle.loads(msg)
			print addr, ' >> ', query
			if query!='':
				serp = self.parse_query(query)
				msg = pickle.dumps(serp)
				send_msg(c, msg, '\x01')
			c.close()
			print "Done."
Example 2
class Indexer:

	def __init__(self):

		self.doc_index_filepath = 'main_index'
		self.title_index_filepath = 'title_index'
		self.stemmer = SimpleStemmer()
		self.doc_index = defaultdict(list)
		self.title_index = defaultdict(list)
		self.doc_tf = defaultdict(list)
		self.doc_df = defaultdict(int)
		self.title_tf = defaultdict(list)
		self.title_df = defaultdict(int)
		self.numDocs = 0
		self.stopwords = dict.fromkeys(['a','able','about','across','after','all','almost','also','am','among','an','and','any','are','as','at','be','because','been','but','by','can','cannot','could','dear','did','do','does','either','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','like','likely','may','me','might','most','must','my','neither','no','nor','not','of','off','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','why','will','with','would','yet','you','your'])

	def parse_doc(self, html_file):
		title = ""
		doc = ""
		html = lxml.html.fromstring(html_file)
		title_el = html.xpath('//title')
		if title_el:
			title = title_el[0].text_content()
		div_el = html.find_class('freestyle-text')
		if div_el:
			doc = div_el[0].text_content()
		return (title, doc)

	def tokenize(self, line):
		line = line.lower()
		line = re.sub(r'[^a-z0-9 ]', ' ', line) # replace non-alphanumeric characters with spaces
		line = line.split()
		line = [word for word in line if word not in self.stopwords]	# eliminate stopwords
		line = [self.stemmer.stem(word) for word in line]
		return line

	def write_index(self, filepath, index, tf, df):
		index_file = open(filepath, 'w')
		print >> index_file, self.numDocs	# first line: total number of documents
		numDocs = float(self.numDocs)	# use a float locally so the idf ratio below is not truncated
		for term in index.iterkeys():
			postings_list = []
			for posting in index[term]:
				docID = posting[0]
				positions = posting[1]
				postings_list.append(':'.join([str(docID), ','.join(map(str,positions))]))
			postings_data = ';'.join(postings_list)
			tf_data = ','.join(map(str, tf[term]))
			idf_data = '%.4f' % (numDocs / df[term])
			print >> index_file, '|'.join((term, postings_data, tf_data, idf_data))
		index_file.close()
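
	# Illustrative sample of the on-disk format produced by write_index above
	# (term and numbers are made up): the first line of the file holds numDocs,
	# and every following line looks like
	#   soccer|3:0,14;7:2|0.1250,0.0833|3.5000
	# i.e. term | docID:positions;docID:positions | per-doc tf values | idf,
	# which is exactly what SearchServer.read_index parses back.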

	def build_term_index(self, docID, terms):
		term_dict_page = {}
		for position, term in enumerate(terms):
			try:
				term_dict_page[term][1].append(position)
			except KeyError:
				# first occurrence of the term on this page: start a new postings entry
				term_dict_page[term] = [docID, array('I', [position])]
		return term_dict_page

	def tf_idf_weights(self, term_dict_page, tf, df):
		# normalize the doc vector
		norm = 0
		for term, posting in term_dict_page.iteritems():
			norm += len(posting[1])**2
		norm = math.sqrt(norm)

		# calculate tf and idf weights
		for term, posting in term_dict_page.iteritems():
			tf[term].append('%.4f' % (len(posting[1])/norm))	# length-normalized term frequency for this doc
			df[term] += 1	# document frequency: one more doc contains this term
		return (tf, df)
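
	# Illustrative norm/tf example (made-up counts): if a page contains "goal"
	# 3 times and "match" 4 times, the norm is sqrt(3**2 + 4**2) = 5.0, so the
	# stored tf values are 3/5 = 0.6000 for "goal" and 4/5 = 0.8000 for "match",
	# while df for each of the two terms is incremented by one.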

	def merge_to_index(self, term_dict_page, index):
		for term_page, posting_page in term_dict_page.iteritems():
			index[term_page].append(posting_page)
		return index

	def create_indexes(self):
		gc.disable()

		docID = 0
		self.numDocs = 0

		# main loop creating the index

		connection = pymongo.Connection("localhost", 27017)	# legacy pymongo API; newer versions use MongoClient
		db = connection.final_espn_corpus_new
		entries = db.pages.find()
		for entry in entries:
			(title, doc) = self.parse_doc(entry['content'])
			docID += 1
			print "DocID", docID, ": ", title.encode('utf-8'), " [", entry['url'].encode('utf-8'), "]"
			self.numDocs += 1
			lines = '\n'.join ((title, doc))
			doc_terms = self.tokenize(lines)
			title_terms = self.tokenize(title)

			doc_dict_page = self.build_term_index(docID, doc_terms)
			title_dict_page = self.build_term_index(docID, title_terms)

			(self.doc_tf, self.doc_df) = self.tf_idf_weights(doc_dict_page, self.doc_tf, self.doc_df)
			(self.title_tf, self.title_df) = self.tf_idf_weights(title_dict_page, self.title_tf, self.title_df)

			self.doc_index = self.merge_to_index(doc_dict_page, self.doc_index)
			self.title_index = self.merge_to_index(title_dict_page, self.title_index)

			'''
			# build the index for the current page
			term_dict_page = {}
			for position, term in enumerate(terms):
				try:
					term_dict_page[term][1].append(position)
				except:
					term_dict_page[term] = [docID, array('I', [position])]
			'''
			'''
			# normalize the document vector
			norm = 0
			for term, posting in term_dict_page.iteritems():
				norm += len(posting[1])**2
			norm = math.sqrt(norm)

			# calculate tf and idf weights	
			for term, posting in term_dict_page.iteritems():
				self.tf[term].append('%.4f' % (len(posting[1])/norm))
				self.df[term] += 1
			'''
			'''
			# merge the current page index with the main index
			for term_page, posting_page in term_dict_page.iteritems():
				self.main_index[term_page].append(posting_page)
			'''
		gc.enable()
		self.write_index(self.doc_index_filepath, self.doc_index, self.doc_tf, self.doc_df)
		self.write_index(self.title_index_filepath, self.title_index, self.title_tf, self.title_df)
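
A minimal sketch of how the Indexer above might be run; the original snippet shows no entry point, so the __main__ guard is an assumption. create_indexes() walks the MongoDB collection and then writes the 'main_index' and 'title_index' files that SearchServer.read_indexes expects.

if __name__ == '__main__':
	indexer = Indexer()
	indexer.create_indexes()	# writes 'main_index' and 'title_index' into the working directory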