# Imports used by this section. SimpleStemmer, Snippetizer, the socket helpers
# recv_delim/send_msg and the PORT constant are assumed to be defined elsewhere
# in the project (some of these imports may already appear earlier in the file).
import copy
import gc
import math
import pickle
import re
import socket
from array import array
from collections import defaultdict

import lxml.html
import pymongo


class SearchServer:

    def __init__(self):
        self.text_corpus_filepath = 'text_corpus'
        self.doc_index_filepath = 'main_index'
        self.title_index_filepath = 'title_index'
        self.page_ranks_filepath = 'page_ranks'
        self.page_rank_dict = {}
        self.numDocs = None
        self.corpus = {}
        self.doc_index = {}
        self.title_index = {}
        self.doc_tf = {}
        self.doc_idf = {}
        self.title_tf = {}
        self.title_idf = {}
        self.snippetizer = Snippetizer()
        self.stemmer = SimpleStemmer()
        self.stopwords = dict.fromkeys(['a','able','about','across','after','all','almost','also','am','among','an','and','any','are','as','at','be','because','been','but','by','can','cannot','could','dear','did','do','does','either','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','like','likely','may','me','might','most','must','my','neither','no','nor','not','of','off','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','why','will','with','would','yet','you','your'])

    def tokenize(self, line):
        line = line.lower()
        line = re.sub(r'[^a-z0-9 ]', ' ', line)  # replace non-alphanumeric characters with spaces
        line = line.split()
        line = [word for word in line if word not in self.stopwords]  # eliminate stopwords
        line = [self.stemmer.stem(word) for word in line]
        return line

    def read_text_corpus(self, filepath):
        # One document per line: docID, title, url and body separated by '\x03'.
        corpus = {}
        corpus_file = open(filepath, 'r')
        for line in corpus_file:
            line = line.rstrip()
            (docID, title, url, doc) = line.split('\x03')
            corpus[docID] = {'title': title, 'url': url, 'doc': doc}
        corpus_file.close()
        return corpus

    def read_index(self, index_filepath):
        # First line holds the document count; every other line has the form
        # term|docID:pos,pos;docID:pos|tf,tf|idf
        index_file = open(index_filepath, 'r')
        self.numDocs = int(index_file.readline().rstrip().split('.')[0])
        index = {}
        tf = {}
        idf = {}
        for line in index_file:
            line = line.rstrip()
            (term, postings, line_tf, line_idf) = line.split('|')
            postings = postings.split(';')
            postings = [post.split(':') for post in postings]
            postings = [[int(post[0]), map(int, post[1].split(','))] for post in postings]
            index[term] = postings
            line_tf = line_tf.split(',')
            tf[term] = map(float, line_tf)
            idf[term] = float(line_idf)
        index_file.close()
        return (index, tf, idf)

    def read_indexes(self):
        print "Reading text corpus..."
        self.corpus = self.read_text_corpus(self.text_corpus_filepath)
        print "Reading page rank info..."
        pagerank_fp = open(self.page_ranks_filepath, "r")
        self.page_rank_dict = pickle.load(pagerank_fp)
        pagerank_fp.close()
        print "Reading document index (may take a few minutes)..."
        (self.doc_index, self.doc_tf, self.doc_idf) = self.read_index(self.doc_index_filepath)
        # print "Reading title index..."
        # (self.title_index, self.title_tf, self.title_idf) = self.read_index(self.title_index_filepath)
        print "Ready! Listening on localhost:20001"

    def intersect_lists(self, lists):
        if len(lists) == 0:
            return []
        lists.sort(key=len)  # intersect the shortest lists first
        return list(reduce(lambda x, y: set(x) & set(y), lists))

    def get_postings(self, terms):
        return [self.doc_index[term] for term in terms]

    def get_docs_from_postings(self, postings):
        return [[x[0] for x in post] for post in postings]

    def dot_product(self, vector1, vector2):
        if len(vector1) != len(vector2):
            return 0
        return sum([x * y for (x, y) in zip(vector1, vector2)])

    def rank_documents(self, terms, docs):
        # Score each candidate document by the dot product of its tf vector with
        # the query's idf vector, plus a scaled PageRank boost.
        doc_vectors = defaultdict(lambda: [0] * len(terms))
        query_vector = [0] * len(terms)
        for term_index, term in enumerate(terms):
            if term not in self.doc_index:
                continue
            query_vector[term_index] = self.doc_idf[term]
            for doc_index, (doc, postings) in enumerate(self.doc_index[term]):
                if doc in docs:
                    doc_vectors[doc][term_index] = self.doc_tf[term][doc_index]
        doc_scores = [[self.dot_product(cur_doc_vector, query_vector) + self.page_rank_dict[doc] * 1E6, doc]
                      for doc, cur_doc_vector in doc_vectors.iteritems()]
        doc_scores.sort(reverse=True)
        intermediate_docs = [str(x[1]) for x in doc_scores][:100]
        # Deduplicate by title and attach a query-based snippet to each result.
        seen_titles = set()
        result_docs = []
        for document in intermediate_docs:
            if self.corpus[document]['title'] not in seen_titles:
                result_docs.append([self.corpus[document]['title'],
                                    self.corpus[document]['url'],
                                    self.snippetizer.get_snippet(self.corpus[document]['doc'],
                                                                 ' '.join(terms).strip())])
                seen_titles.add(self.corpus[document]['title'])
        return result_docs[:10]

    def one_term_query(self, q):
        original_query = q
        q = self.tokenize(q)
        if len(q) == 0:
            print ''
            return ''
        elif len(q) > 1:
            return self.free_term_query(original_query)
        term = q[0]
        if term not in self.doc_index:
            print ''
            return ''
        else:
            postings = self.doc_index[term]
            docs = [x[0] for x in postings]
            return self.rank_documents(q, docs)

    def free_term_query(self, q):
        q = self.tokenize(q)
        if len(q) == 0:
            print ''
            return ''
        li = set()
        for term in q:
            try:
                postings = self.doc_index[term]
                docs = [x[0] for x in postings]
                li = li | set(docs)
            except KeyError:  # term not in index
                pass
        li = list(li)
        return self.rank_documents(q, li)

    def phrase_query(self, q):
        original_query = q
        q = self.tokenize(q)
        if len(q) == 0:
            print ''
            return ''
        elif len(q) == 1:
            return self.one_term_query(original_query)
        phrase_docs = self.phrase_query_docs(q)
        return self.rank_documents(q, phrase_docs)

    def phrase_query_docs(self, q):
        # Positional intersection: a document matches only if the query terms
        # appear at consecutive positions.
        for term in q:
            if term not in self.doc_index:
                return []
        postings = self.get_postings(q)
        docs = self.get_docs_from_postings(postings)
        docs = self.intersect_lists(docs)
        for i in xrange(len(postings)):
            postings[i] = [x for x in postings[i] if x[0] in docs]
        postings = copy.deepcopy(postings)
        # Shift each term's positions by its offset in the phrase, so terms that
        # form the phrase end up sharing a common position.
        for i in xrange(len(postings)):
            for j in xrange(len(postings[i])):
                postings[i][j][1] = [x - i for x in postings[i][j][1]]
        result = []
        for i in xrange(len(postings[0])):
            li = self.intersect_lists([x[i][1] for x in postings])
            if li:
                result.append(postings[0][i][0])
        return result

    def parse_query(self, query):
        if '"' in query:
            return self.phrase_query(query)
        elif len(query.split()) > 1:
            return self.free_term_query(query)
        else:
            return self.one_term_query(query)

    def listen(self):
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        host = socket.gethostname()
        port = PORT
        s.bind((host, port))
        s.listen(5)  # listen once, then accept connections in the loop below
        while True:
            c, addr = s.accept()
            msg = recv_delim(c, 512, '\x01')
            query = pickle.loads(msg)
            print addr, ' >> ', query
            if query != '':
                serp = self.parse_query(query)
                msg = pickle.dumps(serp)
                send_msg(c, msg, '\x01')
            c.close()
            print "Done."
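
# --- Hypothetical usage sketch (not part of the original code) ---
# A minimal client for the socket protocol used by SearchServer.listen(): the
# query is pickled and framed with a '\x01' delimiter, and the reply is a
# pickled list of [title, url, snippet] results. The function name
# query_search_server, the default host/port and the framing details are
# assumptions inferred from the recv_delim/send_msg calls above.
def query_search_server(query, host='localhost', port=20001):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect((host, port))
    s.sendall(pickle.dumps(query) + '\x01')   # frame the pickled query with the delimiter
    data = ''
    while '\x01' not in data:                 # read until the reply delimiter arrives
        chunk = s.recv(512)
        if not chunk:
            break
        data += chunk
    s.close()
    return pickle.loads(data.split('\x01')[0])
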
class Indexer:

    def __init__(self):
        self.doc_index_filepath = 'main_index'
        self.title_index_filepath = 'title_index'
        self.stemmer = SimpleStemmer()
        self.doc_index = defaultdict(list)
        self.title_index = defaultdict(list)
        self.doc_tf = defaultdict(list)
        self.doc_df = defaultdict(int)
        self.title_tf = defaultdict(list)
        self.title_df = defaultdict(int)
        self.numDocs = 0
        self.stopwords = dict.fromkeys(['a','able','about','across','after','all','almost','also','am','among','an','and','any','are','as','at','be','because','been','but','by','can','cannot','could','dear','did','do','does','either','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','like','likely','may','me','might','most','must','my','neither','no','nor','not','of','off','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','why','will','with','would','yet','you','your'])

    def parse_doc(self, html_file):
        # Extract the page title and the article body (the 'freestyle-text' div).
        title = ""
        doc = ""
        html = lxml.html.fromstring(html_file)
        title_el = html.xpath('//title')
        if title_el:
            title = title_el[0].text_content()
        div_el = html.find_class('freestyle-text')
        if div_el:
            doc = div_el[0].text_content()
        return (title, doc)

    def tokenize(self, line):
        line = line.lower()
        line = re.sub(r'[^a-z0-9 ]', ' ', line)  # replace non-alphanumeric characters with spaces
        line = line.split()
        line = [word for word in line if word not in self.stopwords]  # eliminate stopwords
        line = [self.stemmer.stem(word) for word in line]
        return line

    def write_index(self, filepath, index, tf, df):
        # On-disk format: first line is the document count, then one line per term:
        # term|docID:pos,pos;docID:pos|tf,tf|idf
        index_file = open(filepath, 'w')
        print >> index_file, int(self.numDocs)
        num_docs = float(self.numDocs)
        for term in index.iterkeys():
            postings_list = []
            for posting in index[term]:
                docID = posting[0]
                positions = posting[1]
                postings_list.append(':'.join([str(docID), ','.join(map(str, positions))]))
            postings_data = ';'.join(postings_list)
            tf_data = ','.join(map(str, tf[term]))
            idf_data = '%.4f' % (num_docs / df[term])  # note: stores N/df without the log
            print >> index_file, '|'.join((term, postings_data, tf_data, idf_data))
        index_file.close()

    def build_term_index(self, docID, terms):
        # Build the positional index for a single page: term -> [docID, positions].
        term_dict_page = {}
        for position, term in enumerate(terms):
            try:
                term_dict_page[term][1].append(position)
            except KeyError:
                term_dict_page[term] = [docID, array('I', [position])]
        return term_dict_page

    def tf_idf_weights(self, term_dict_page, tf, df):
        # normalize the doc vector
        norm = 0
        for term, posting in term_dict_page.iteritems():
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)
        # calculate tf and idf weights
        for term, posting in term_dict_page.iteritems():
            tf[term].append('%.4f' % (len(posting[1]) / norm))
            df[term] += 1
        return (tf, df)

    def merge_to_index(self, term_dict_page, index):
        # merge the current page index into the main index
        for term_page, posting_page in term_dict_page.iteritems():
            index[term_page].append(posting_page)
        return index

    def create_indexes(self):
        gc.disable()
        docID = 0
        self.numDocs = 0
        # main loop creating the index
        connection = pymongo.Connection("localhost", 27017)
        db = connection.final_espn_corpus_new
        entries = db.pages.find()
        for entry in entries:
            (title, doc) = self.parse_doc(entry['content'])
            docID += 1
            print "DocID", docID, ": ", title.encode('utf-8'), " [", entry['url'].encode('utf-8'), "]"
            self.numDocs += 1
            lines = '\n'.join((title, doc))
            doc_terms = self.tokenize(lines)
            title_terms = self.tokenize(title)
            doc_dict_page = self.build_term_index(docID, doc_terms)
            title_dict_page = self.build_term_index(docID, title_terms)
            (self.doc_tf, self.doc_df) = self.tf_idf_weights(doc_dict_page, self.doc_tf, self.doc_df)
            (self.title_tf, self.title_df) = self.tf_idf_weights(title_dict_page, self.title_tf, self.title_df)
            self.doc_index = self.merge_to_index(doc_dict_page, self.doc_index)
            self.title_index = self.merge_to_index(title_dict_page, self.title_index)
        gc.enable()
        self.write_index(self.doc_index_filepath, self.doc_index, self.doc_tf, self.doc_df)
        self.write_index(self.title_index_filepath, self.title_index, self.title_tf, self.title_df)
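
# --- Hypothetical usage sketch (not part of the original code) ---
# Driving the two components end to end: build the on-disk indexes from the
# MongoDB corpus, then load them and serve queries. The 'text_corpus' and
# 'page_ranks' files read by SearchServer are assumed to be produced by other
# parts of the pipeline; wiring both steps into one script is an assumption
# for illustration only.
if __name__ == '__main__':
    indexer = Indexer()
    indexer.create_indexes()    # writes 'main_index' and 'title_index'

    server = SearchServer()
    server.read_indexes()       # loads corpus, PageRank scores and the document index
    server.listen()             # serve queries on the socket until interrupted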