def __init__(
    self,
    filename,
    saver,
    loader,
    create_lexicon_flag=False,
    cluster_results=False,
    num_clusters=3,
    tf_idf_flag=True,
    ah_flag=False,
    pr_flag=False,
    normalize=True,
    create_page_rank_flag=False,
    directory="../index",
    linksFile="../index/IntLinks.txt",
    citationsFile="../index/IntCitations.txt",
    n_retrieves=10,
    root_set_size=10,
    maxIter=10,
    verbose=False
):
    """
    Set up the search engine: store the flags, open the index and obtain
    the lexicon, the document norms and (optionally) the link-analysis data.

    Args:
        filename: prefix handed to ``saver``/``loader``; the suffixes
            ``_norms`` and ``_lexicon`` are appended to it.
        saver: callable invoked as ``saver(filename=..., obj=...)``.
        loader: callable invoked as ``loader(name)`` and
            ``loader(filename=name)``.
        create_lexicon_flag: build (and save) the lexicon, and the norms
            when ``normalize`` is set, instead of loading them.
        cluster_results: stored on the instance as ``cluster_results``.
        num_clusters: stored on the instance as ``num_clusters``.
        tf_idf_flag: stored on the instance and forwarded to
            ``createLexicon``.
        ah_flag: enable the authorities-and-hubs part (builds the graph).
        pr_flag: enable the page-rank part (builds the graph and obtains
            the page-rank values).
        normalize: compute/load per-document norms; when false ``norm``
            is ``None``.
        create_page_rank_flag: compute page rank with ``page_rank``
            instead of loading it with ``loader("rank")``.
        directory: path of the index to open.
        linksFile: forwarded to ``LinkAnalysis``.
        citationsFile: forwarded to ``LinkAnalysis``.
        n_retrieves: stored on the instance; must not exceed
            ``root_set_size``.
        root_set_size: stored on the instance.
        maxIter: stored on the instance and forwarded to ``page_rank``.
        verbose: print progress and timing messages.

    Raises:
        AssertionError: if ``root_set_size < n_retrieves``.
    """
    self.normalize = normalize
    self.tf_idf_flag = tf_idf_flag
    self.ah_flag = ah_flag
    self.pr_flag = pr_flag
    self.n_retrieves = n_retrieves
    self.root_set_size = root_set_size
    self.maxIter = maxIter
    self.cluster_results = cluster_results
    self.num_clusters = num_clusters
    assert self.root_set_size >= self.n_retrieves, (
        "root_set_size must be at least n_retrieves"
    )

    # Honour the caller-supplied index location; the path used to be
    # hard-coded to "../index", which silently ignored `directory`.
    self.reader = IndexReader.open(SimpleFSDirectory(File(directory)))
    self.numDocs = self.reader.maxDoc()

    # Stays None unless normalisation is requested below.
    self.norm = None

    # TF and/or TF-IDF part.
    if create_lexicon_flag:
        if normalize:
            if verbose:
                print("extracting all the norms of docs")
                start_time = time.time()
            self.norm = calculateNormalizer(reader=self.reader, verbose=verbose)
            if verbose:
                end_time = time.time()
                print("time taken for calculating norms is : "
                      + str(end_time - start_time) + " seconds")
            saver(filename=filename + "_norms", obj=self.norm)
        self.lexicon = createLexicon(
            filename=filename,
            reader=self.reader,
            norm=self.norm if self.normalize else None,
            tf_idf_flag=self.tf_idf_flag,
            verbose=verbose
        )
        saver(filename=filename + "_lexicon", obj=self.lexicon)
    else:
        self.lexicon = loader(filename + "_lexicon")
        if normalize:
            if verbose:
                print("loading norms")
                start_time = time.time()
            self.norm = loader(filename=filename + "_norms")
            if verbose:
                end_time = time.time()
                print("time taken for loading norms is : "
                      + str(end_time - start_time) + " seconds")

    # Authorities-and-hubs / page-rank part: both need the link graph.
    if self.ah_flag or self.pr_flag:
        self.graph = LinkAnalysis(
            linksFile=linksFile,
            citationsFile=citationsFile,
            reader=self.reader,
            verbose=verbose
        )

    if pr_flag:
        if create_page_rank_flag:
            # NOTE(review): verbose is hard-coded to True here in the
            # original; kept as-is -- confirm whether it should follow
            # the `verbose` argument.
            self.pr_values = page_rank(
                alpha=0.1,
                maxIter=self.maxIter,
                numDocs=self.numDocs,
                graph=self.graph,
                saver=saver,
                verbose=True
            )
        else:
            # Already calculated earlier: simply load it.
            self.pr_values = loader("rank")
def __init__(
    self,
    filename,
    saver,
    loader,
    create_lexicon_flag=False,
    cluster_results=False,
    num_clusters=3,
    tf_idf_flag=True,
    ah_flag=False,
    pr_flag=False,
    normalize=True,
    create_page_rank_flag=False,
    directory='../index',
    linksFile="../index/IntLinks.txt",
    citationsFile="../index/IntCitations.txt",
    n_retrieves=10,
    root_set_size=10,
    maxIter=10,
    verbose=False
):
    """
    Set up the search engine: store the flags, open the index and obtain
    the lexicon, the document norms and (optionally) the link-analysis data.

    Args:
        filename: prefix handed to ``saver``/``loader``; the suffixes
            ``_norms`` and ``_lexicon`` are appended to it.
        saver: callable invoked as ``saver(filename=..., obj=...)``.
        loader: callable invoked as ``loader(name)`` and
            ``loader(filename=name)``.
        create_lexicon_flag: build (and save) the lexicon, and the norms
            when ``normalize`` is set, instead of loading them.
        cluster_results: stored on the instance as ``cluster_results``.
        num_clusters: stored on the instance as ``num_clusters``.
        tf_idf_flag: stored on the instance and forwarded to
            ``createLexicon``.
        ah_flag: enable the authorities-and-hubs part (builds the graph).
        pr_flag: enable the page-rank part (builds the graph and obtains
            the page-rank values).
        normalize: compute/load per-document norms; when false ``norm``
            is ``None``.
        create_page_rank_flag: compute page rank with ``page_rank``
            instead of loading it with ``loader('rank')``.
        directory: path of the index to open.
        linksFile: forwarded to ``LinkAnalysis``.
        citationsFile: forwarded to ``LinkAnalysis``.
        n_retrieves: stored on the instance; must not exceed
            ``root_set_size``.
        root_set_size: stored on the instance.
        maxIter: stored on the instance and forwarded to ``page_rank``.
        verbose: print progress and timing messages.

    Raises:
        AssertionError: if ``root_set_size < n_retrieves``.
    """
    self.normalize = normalize
    self.tf_idf_flag = tf_idf_flag
    self.ah_flag = ah_flag
    self.pr_flag = pr_flag
    self.n_retrieves = n_retrieves
    self.root_set_size = root_set_size
    self.maxIter = maxIter
    self.cluster_results = cluster_results
    self.num_clusters = num_clusters
    assert self.root_set_size >= self.n_retrieves, (
        "root_set_size must be at least n_retrieves"
    )

    # Honour the caller-supplied index location; the path used to be
    # hard-coded to '../index', which silently ignored `directory`.
    self.reader = IndexReader.open(SimpleFSDirectory(File(directory)))
    self.numDocs = self.reader.maxDoc()

    # Stays None unless normalisation is requested below.
    self.norm = None

    # TF and/or TF-IDF part.
    if create_lexicon_flag:
        if normalize:
            if verbose:
                print("extracting all the norms of docs")
                start_time = time.time()
            self.norm = calculateNormalizer(reader=self.reader, verbose=verbose)
            if verbose:
                end_time = time.time()
                print("time taken for calculating norms is : "
                      + str(end_time - start_time) + " seconds")
            saver(filename=filename + '_norms', obj=self.norm)
        self.lexicon = createLexicon(
            filename=filename,
            reader=self.reader,
            norm=self.norm if self.normalize else None,
            tf_idf_flag=self.tf_idf_flag,
            verbose=verbose
        )
        saver(filename=filename + '_lexicon', obj=self.lexicon)
    else:
        self.lexicon = loader(filename + '_lexicon')
        if normalize:
            if verbose:
                print("loading norms")
                start_time = time.time()
            self.norm = loader(filename=filename + '_norms')
            if verbose:
                end_time = time.time()
                print("time taken for loading norms is : "
                      + str(end_time - start_time) + " seconds")

    # Authorities-and-hubs / page-rank part: both need the link graph.
    if self.ah_flag or self.pr_flag:
        self.graph = LinkAnalysis(
            linksFile=linksFile,
            citationsFile=citationsFile,
            reader=self.reader,
            verbose=verbose
        )

    if pr_flag:
        if create_page_rank_flag:
            # NOTE(review): verbose is hard-coded to True here in the
            # original; kept as-is -- confirm whether it should follow
            # the `verbose` argument.
            self.pr_values = page_rank(
                alpha=0.1,
                maxIter=self.maxIter,
                numDocs=self.numDocs,
                graph=self.graph,
                saver=saver,
                verbose=True
            )
        else:
            # Already calculated earlier: simply load it.
            self.pr_values = loader('rank')
class search(object):
    """
    Main class of the search engine: vector-space retrieval over an index,
    optionally re-ranked by authorities/hubs or by page rank.
    """

    def __init__(
        self,
        filename,
        saver,
        loader,
        create_lexicon_flag=False,
        cluster_results=False,
        num_clusters=3,
        tf_idf_flag=True,
        ah_flag=False,
        pr_flag=False,
        normalize=True,
        create_page_rank_flag=False,
        directory="../index",
        linksFile="../index/IntLinks.txt",
        citationsFile="../index/IntCitations.txt",
        n_retrieves=10,
        root_set_size=10,
        maxIter=10,
        verbose=False
    ):
        """
        Store the flags, open the index and obtain the lexicon, the
        document norms and (optionally) the link-analysis data.

        Args:
            filename: prefix handed to ``saver``/``loader``; the suffixes
                ``_norms`` and ``_lexicon`` are appended to it.
            saver: callable invoked as ``saver(filename=..., obj=...)``.
            loader: callable invoked as ``loader(name)`` and
                ``loader(filename=name)``.
            create_lexicon_flag: build (and save) the lexicon, and the
                norms when ``normalize`` is set, instead of loading them.
            cluster_results: run ``cluster`` on the top results in
                ``retrieve``.
            num_clusters: forwarded to ``cluster``.
            tf_idf_flag: use tf-idf query features instead of plain tf.
            ah_flag: enable authorities-and-hubs re-ranking.
            pr_flag: enable page-rank re-ranking.
            normalize: divide similarities by the per-document norm.
            create_page_rank_flag: compute page rank with ``page_rank``
                instead of loading it with ``loader("rank")``.
            directory: path of the index to open.
            linksFile: forwarded to ``LinkAnalysis``.
            citationsFile: forwarded to ``LinkAnalysis``.
            n_retrieves: number of results printed by ``run``; must not
                exceed ``root_set_size``.
            root_set_size: number of top documents used as the root set.
            maxIter: iteration cap forwarded to the link-analysis helpers.
            verbose: print progress and timing messages.

        Raises:
            AssertionError: if ``root_set_size < n_retrieves``.
        """
        self.normalize = normalize
        self.tf_idf_flag = tf_idf_flag
        self.ah_flag = ah_flag
        self.pr_flag = pr_flag
        self.n_retrieves = n_retrieves
        self.root_set_size = root_set_size
        self.maxIter = maxIter
        self.cluster_results = cluster_results
        self.num_clusters = num_clusters
        assert self.root_set_size >= self.n_retrieves, (
            "root_set_size must be at least n_retrieves"
        )

        # Honour the caller-supplied index location; the path used to be
        # hard-coded to "../index", which silently ignored `directory`.
        self.reader = IndexReader.open(SimpleFSDirectory(File(directory)))
        self.numDocs = self.reader.maxDoc()

        # Stays None unless normalisation is requested below.
        self.norm = None

        # TF and/or TF-IDF part.
        if create_lexicon_flag:
            if normalize:
                if verbose:
                    print("extracting all the norms of docs")
                    start_time = time.time()
                self.norm = calculateNormalizer(reader=self.reader, verbose=verbose)
                if verbose:
                    end_time = time.time()
                    print("time taken for calculating norms is : "
                          + str(end_time - start_time) + " seconds")
                saver(filename=filename + "_norms", obj=self.norm)
            self.lexicon = createLexicon(
                filename=filename,
                reader=self.reader,
                norm=self.norm if self.normalize else None,
                tf_idf_flag=self.tf_idf_flag,
                verbose=verbose
            )
            saver(filename=filename + "_lexicon", obj=self.lexicon)
        else:
            self.lexicon = loader(filename + "_lexicon")
            if normalize:
                if verbose:
                    print("loading norms")
                    start_time = time.time()
                self.norm = loader(filename=filename + "_norms")
                if verbose:
                    end_time = time.time()
                    print("time taken for loading norms is : "
                          + str(end_time - start_time) + " seconds")

        # Authorities-and-hubs / page-rank part: both need the link graph.
        if self.ah_flag or self.pr_flag:
            self.graph = LinkAnalysis(
                linksFile=linksFile,
                citationsFile=citationsFile,
                reader=self.reader,
                verbose=verbose
            )

        if pr_flag:
            if create_page_rank_flag:
                # NOTE(review): verbose is hard-coded to True here in the
                # original; kept as-is -- confirm whether it should follow
                # the `verbose` argument.
                self.pr_values = page_rank(
                    alpha=0.1,
                    maxIter=self.maxIter,
                    numDocs=self.numDocs,
                    graph=self.graph,
                    saver=saver,
                    verbose=True
                )
            else:
                # Already calculated earlier: simply load it.
                self.pr_values = loader("rank")

    def _grow_base_set(self, root_set, verbose=False):
        """
        Expand ``root_set`` with the forward links and citations of each
        root node.

        Returns:
            ``(base_set, link_adj, citation_adj)``: ``base_set`` lists the
            root nodes followed by newly discovered neighbours in discovery
            order; ``link_adj[root]`` holds the forward links of ``root``
            and ``citation_adj[root]`` its citations.  Every node of the
            base set has an (possibly empty) entry in both dictionaries.
        """
        link_adj = {}
        citation_adj = {}
        base_set = list(root_set)
        # Set mirror of base_set so membership tests are O(1).
        members = set(base_set)

        for count, root_node in enumerate(root_set, 1):
            if verbose:
                print("growing base for node " + str(count) + " - " + str(root_node))
            link_adj.setdefault(root_node, [])
            citation_adj.setdefault(root_node, [])

            fwd_links = list(self.graph.getLinks(root_node))
            if verbose:
                print("number of forward links for " + str(root_node)
                      + " is " + str(len(fwd_links)))
            for fwd_link in fwd_links:
                if verbose:
                    print("processing link " + str(fwd_link))
                if fwd_link not in members:
                    members.add(fwd_link)
                    base_set.append(fwd_link)
                citation_adj.setdefault(fwd_link, [])
                link_adj.setdefault(fwd_link, [])
                # Only the root -> neighbour direction is recorded.
                link_adj[root_node].append(fwd_link)

            back_links = list(self.graph.getCitations(root_node))
            if verbose:
                print("number of backward links for " + str(root_node)
                      + " is " + str(len(back_links)))
            for back_link in back_links:
                if verbose:
                    print("processing link " + str(back_link))
                if back_link not in members:
                    members.add(back_link)
                    base_set.append(back_link)
                link_adj.setdefault(back_link, [])
                citation_adj.setdefault(back_link, [])
                citation_adj[root_node].append(back_link)

        return base_set, link_adj, citation_adj

    def retrieve(self, pr_weight=0.4, verbose=True):
        """
        Score every document against ``self.query`` and rank them.

        The similarity of a document is the sum, over the query terms found
        in the lexicon, of ``query feature * document feature``; when
        ``self.normalize`` is set each scored document is divided once by
        its norm.  Terms missing from the lexicon are skipped.

        Args:
            pr_weight: weight forwarded to ``page_rank_score``.
            verbose: print progress messages.

        Returns:
            ``(idx, sim)`` by default, where ``idx`` lists document ids by
            decreasing similarity and ``sim`` is indexed by document id;
            ``(idx, sim, auth_idx, auth_score, hub_idx, hub_score)`` when
            ``ah_flag`` is set; ``(idx, sim, idx_new, sim_new)`` when
            ``pr_flag`` is set.

        Raises:
            AssertionError: if both ``ah_flag`` and ``pr_flag`` are set.
        """
        start_time = time.time()
        sim = [float(0)] * self.reader.maxDoc()  # all similarities start at 0

        if verbose:
            print("estimating query features")
        q_feat = self.query.tf_idf() if self.tf_idf_flag else self.query.tf()

        if verbose:
            print("calculating similarities")
        scored_docs = set()
        # q_feat is indexed by term position, so the position is kept in
        # step with the term even when a term is skipped.
        for position, term in enumerate(self.query.query):
            text = str(term.text())
            if verbose:
                print("calculating similarities for term " + text)
            if text not in self.lexicon:
                # Skip (not abort): the remaining terms must still count.
                print(text + " is not in the index, so skipping it.")
                continue
            for doc_id, doc_feat in self.lexicon[text]:
                doc = int(doc_id)
                sim[doc] = sim[doc] + q_feat[position] * doc_feat
                scored_docs.add(doc)

        if self.normalize:
            if verbose:
                print("normalizing ")
            # Normalise every scored document exactly once; unscored
            # documents are 0 and need no division.
            for doc in scored_docs:
                doc_norm = self.norm[doc]
                if doc_norm:  # guard against a zero norm
                    sim[doc] = sim[doc] / doc_norm

        if verbose:
            print("sorting")
        idx = sorted(range(len(sim)), key=lambda k: sim[k], reverse=True)
        end_time = time.time()
        print("vector spcae time " + str(end_time - start_time) + " seconds")

        if self.ah_flag:
            # Re-rank by authorities and hubs.
            start_time = time.time()
            if verbose:
                print("estimating root set")
            # Was `assert pr_flag is False`, a NameError on an undefined
            # local that made this whole branch unusable.
            assert not self.pr_flag, "ah_flag and pr_flag are mutually exclusive"
            root_set = list(idx[0:self.root_set_size])
            if verbose:
                print("growing base set")
            base_set, link_adj, citation_adj = self._grow_base_set(
                root_set, verbose=verbose
            )
            if verbose:
                print("size of base set is " + str(len(base_set)))
                print("size of citation adjacency is " + str(len(citation_adj.keys())))
                print("size of link adjacency is " + str(len(link_adj.keys())))

            auth_score, hub_score = authorities_hubs(
                numDocs=self.numDocs,
                adj=(link_adj, citation_adj),
                nodes=base_set,
                maxIter=self.maxIter,
                verbose=False
            )
            auth_idx = sorted(
                range(len(auth_score)), key=lambda k: auth_score[k], reverse=True
            )
            hub_idx = sorted(
                range(len(hub_score)), key=lambda k: hub_score[k], reverse=True
            )
            end_time = time.time()
            print("authorities and hubs took " + str(end_time - start_time) + " seconds")
        elif self.pr_flag:
            start_time = time.time()
            sim_new = page_rank_score(
                weight=pr_weight,
                similarities=sim,
                pr_val=self.pr_values,
                verbose=verbose
            )
            idx_new = sorted(
                range(len(sim_new)), key=lambda k: sim_new[k], reverse=True
            )
            end_time = time.time()
            print("page rank took " + str(end_time - start_time) + " seconds")

        if self.cluster_results:
            # The return value was never used; the call is kept.
            cluster(
                doc_ids=idx[0:self.n_retrieves],
                lexicon=self.lexicon,
                r=self.reader,
                num_clusters=self.num_clusters,
                verbose=verbose
            )

        if self.ah_flag:
            return (idx, sim, auth_idx, auth_score, hub_idx, hub_score)
        elif self.pr_flag:
            return (idx, sim, idx_new, sim_new)
        else:
            return (idx, sim)

    def run(self, query, print_urls=True, pr_weight=0.4, verbose=False):
        """
        Parse ``query``, retrieve the rankings and optionally print them.

        Args:
            query: raw query handed to ``parse_query``.
            print_urls: print the top ``n_retrieves`` entries of every
                ranking that was computed.
            pr_weight: forwarded to ``retrieve`` in page-rank mode.
            verbose: forwarded to ``retrieve``.

        Returns:
            The last document fetched from the reader while printing, or
            ``None`` when nothing was printed.
        """
        self.query = parse_query(query, self.reader)
        start_time = time.time()
        rankings = []
        if self.ah_flag:
            doc_ids, score, auth_ids, auth_score, hub_ids, hub_score = self.retrieve(
                verbose=verbose
            )
            rankings.append(("authorities based retreival", auth_ids, auth_score))
            rankings.append(("hubs based retreival", hub_ids, hub_score))
        elif self.pr_flag:
            doc_ids, score, pr_ids, pr = self.retrieve(
                pr_weight=pr_weight, verbose=verbose
            )
            rankings.append(("page rank based retreival", pr_ids, pr))
        else:
            doc_ids, score = self.retrieve(verbose=verbose)
        rankings.insert(0, ("vector space retreival", doc_ids, score))
        end_time = time.time()
        print("in total " + str(end_time - start_time) + " seconds for retrieval")

        # Bound up front: used to be unbound (NameError at the return) when
        # print_urls was false.
        d = None
        if print_urls:
            for title, ids, scores in rankings:
                print(title)
                # Slicing keeps this safe when fewer than n_retrieves
                # entries exist.
                for doc_id in ids[:self.n_retrieves]:
                    d = self.reader.document(doc_id)
                    print("doc: [" + str(doc_id) + "], score: ["
                          + str(scores[doc_id]) + "]")
        print("retrieval complete. ")
        print("...........................................................................")
        return d
class search(object):
    """
    Main class of the search engine: vector-space retrieval over an index,
    optionally re-ranked by authorities/hubs or by page rank.
    """

    def __init__(
        self,
        filename,
        saver,
        loader,
        create_lexicon_flag=False,
        cluster_results=False,
        num_clusters=3,
        tf_idf_flag=True,
        ah_flag=False,
        pr_flag=False,
        normalize=True,
        create_page_rank_flag=False,
        directory='../index',
        linksFile="../index/IntLinks.txt",
        citationsFile="../index/IntCitations.txt",
        n_retrieves=10,
        root_set_size=10,
        maxIter=10,
        verbose=False
    ):
        """
        Store the flags, open the index and obtain the lexicon, the
        document norms and (optionally) the link-analysis data.

        Args:
            filename: prefix handed to ``saver``/``loader``; the suffixes
                ``_norms`` and ``_lexicon`` are appended to it.
            saver: callable invoked as ``saver(filename=..., obj=...)``.
            loader: callable invoked as ``loader(name)`` and
                ``loader(filename=name)``.
            create_lexicon_flag: build (and save) the lexicon, and the
                norms when ``normalize`` is set, instead of loading them.
            cluster_results: run ``cluster`` on the top results in
                ``retrieve``.
            num_clusters: forwarded to ``cluster``.
            tf_idf_flag: use tf-idf query features instead of plain tf.
            ah_flag: enable authorities-and-hubs re-ranking.
            pr_flag: enable page-rank re-ranking.
            normalize: divide similarities by the per-document norm.
            create_page_rank_flag: compute page rank with ``page_rank``
                instead of loading it with ``loader('rank')``.
            directory: path of the index to open.
            linksFile: forwarded to ``LinkAnalysis``.
            citationsFile: forwarded to ``LinkAnalysis``.
            n_retrieves: number of results printed by ``run``; must not
                exceed ``root_set_size``.
            root_set_size: number of top documents used as the root set.
            maxIter: iteration cap forwarded to the link-analysis helpers.
            verbose: print progress and timing messages.

        Raises:
            AssertionError: if ``root_set_size < n_retrieves``.
        """
        self.normalize = normalize
        self.tf_idf_flag = tf_idf_flag
        self.ah_flag = ah_flag
        self.pr_flag = pr_flag
        self.n_retrieves = n_retrieves
        self.root_set_size = root_set_size
        self.maxIter = maxIter
        self.cluster_results = cluster_results
        self.num_clusters = num_clusters
        assert self.root_set_size >= self.n_retrieves, (
            "root_set_size must be at least n_retrieves"
        )

        # Honour the caller-supplied index location; the path used to be
        # hard-coded to '../index', which silently ignored `directory`.
        self.reader = IndexReader.open(SimpleFSDirectory(File(directory)))
        self.numDocs = self.reader.maxDoc()

        # Stays None unless normalisation is requested below.
        self.norm = None

        # TF and/or TF-IDF part.
        if create_lexicon_flag:
            if normalize:
                if verbose:
                    print("extracting all the norms of docs")
                    start_time = time.time()
                self.norm = calculateNormalizer(reader=self.reader, verbose=verbose)
                if verbose:
                    end_time = time.time()
                    print("time taken for calculating norms is : "
                          + str(end_time - start_time) + " seconds")
                saver(filename=filename + '_norms', obj=self.norm)
            self.lexicon = createLexicon(
                filename=filename,
                reader=self.reader,
                norm=self.norm if self.normalize else None,
                tf_idf_flag=self.tf_idf_flag,
                verbose=verbose
            )
            saver(filename=filename + '_lexicon', obj=self.lexicon)
        else:
            self.lexicon = loader(filename + '_lexicon')
            if normalize:
                if verbose:
                    print("loading norms")
                    start_time = time.time()
                self.norm = loader(filename=filename + '_norms')
                if verbose:
                    end_time = time.time()
                    print("time taken for loading norms is : "
                          + str(end_time - start_time) + " seconds")

        # Authorities-and-hubs / page-rank part: both need the link graph.
        if self.ah_flag or self.pr_flag:
            self.graph = LinkAnalysis(
                linksFile=linksFile,
                citationsFile=citationsFile,
                reader=self.reader,
                verbose=verbose
            )

        if pr_flag:
            if create_page_rank_flag:
                # NOTE(review): verbose is hard-coded to True here in the
                # original; kept as-is -- confirm whether it should follow
                # the `verbose` argument.
                self.pr_values = page_rank(
                    alpha=0.1,
                    maxIter=self.maxIter,
                    numDocs=self.numDocs,
                    graph=self.graph,
                    saver=saver,
                    verbose=True
                )
            else:
                # Already calculated earlier: simply load it.
                self.pr_values = loader('rank')

    def _grow_base_set(self, root_set, verbose=False):
        """
        Expand ``root_set`` with the forward links and citations of each
        root node.

        Returns:
            ``(base_set, link_adj, citation_adj)``: ``base_set`` lists the
            root nodes followed by newly discovered neighbours in discovery
            order; ``link_adj[root]`` holds the forward links of ``root``
            and ``citation_adj[root]`` its citations.  Every node of the
            base set has an (possibly empty) entry in both dictionaries.
        """
        link_adj = {}
        citation_adj = {}
        base_set = list(root_set)
        # Set mirror of base_set so membership tests are O(1).
        members = set(base_set)

        for count, root_node in enumerate(root_set, 1):
            if verbose:
                print("growing base for node " + str(count) + " - " + str(root_node))
            link_adj.setdefault(root_node, [])
            citation_adj.setdefault(root_node, [])

            fwd_links = list(self.graph.getLinks(root_node))
            if verbose:
                print("number of forward links for " + str(root_node)
                      + " is " + str(len(fwd_links)))
            for fwd_link in fwd_links:
                if verbose:
                    print("processing link " + str(fwd_link))
                if fwd_link not in members:
                    members.add(fwd_link)
                    base_set.append(fwd_link)
                citation_adj.setdefault(fwd_link, [])
                link_adj.setdefault(fwd_link, [])
                # Only the root -> neighbour direction is recorded.
                link_adj[root_node].append(fwd_link)

            back_links = list(self.graph.getCitations(root_node))
            if verbose:
                print("number of backward links for " + str(root_node)
                      + " is " + str(len(back_links)))
            for back_link in back_links:
                if verbose:
                    print("processing link " + str(back_link))
                if back_link not in members:
                    members.add(back_link)
                    base_set.append(back_link)
                link_adj.setdefault(back_link, [])
                citation_adj.setdefault(back_link, [])
                citation_adj[root_node].append(back_link)

        return base_set, link_adj, citation_adj

    def retrieve(self, pr_weight=0.4, verbose=True):
        """
        Score every document against ``self.query`` and rank them.

        The similarity of a document is the sum, over the query terms found
        in the lexicon, of ``query feature * document feature``; when
        ``self.normalize`` is set each scored document is divided once by
        its norm.  Terms missing from the lexicon are skipped.

        Args:
            pr_weight: weight forwarded to ``page_rank_score``.
            verbose: print progress messages.

        Returns:
            ``(idx, sim)`` by default, where ``idx`` lists document ids by
            decreasing similarity and ``sim`` is indexed by document id;
            ``(idx, sim, auth_idx, auth_score, hub_idx, hub_score)`` when
            ``ah_flag`` is set; ``(idx, sim, idx_new, sim_new)`` when
            ``pr_flag`` is set.

        Raises:
            AssertionError: if both ``ah_flag`` and ``pr_flag`` are set.
        """
        start_time = time.time()
        sim = [float(0)] * self.reader.maxDoc()  # all similarities start at 0

        if verbose:
            print("estimating query features")
        q_feat = self.query.tf_idf() if self.tf_idf_flag else self.query.tf()

        if verbose:
            print("calculating similarities")
        scored_docs = set()
        # q_feat is indexed by term position, so the position is kept in
        # step with the term even when a term is skipped.
        for position, term in enumerate(self.query.query):
            text = str(term.text())
            if verbose:
                print("calculating similarities for term " + text)
            if text not in self.lexicon:
                # Skip (not abort): the remaining terms must still count.
                print(text + " is not in the index, so skipping it.")
                continue
            for doc_id, doc_feat in self.lexicon[text]:
                doc = int(doc_id)
                sim[doc] = sim[doc] + q_feat[position] * doc_feat
                scored_docs.add(doc)

        if self.normalize:
            if verbose:
                print("normalizing ")
            # Normalise every scored document exactly once; unscored
            # documents are 0 and need no division.
            for doc in scored_docs:
                doc_norm = self.norm[doc]
                if doc_norm:  # guard against a zero norm
                    sim[doc] = sim[doc] / doc_norm

        if verbose:
            print("sorting")
        idx = sorted(range(len(sim)), key=lambda k: sim[k], reverse=True)
        end_time = time.time()
        print("vector spcae time " + str(end_time - start_time) + " seconds")

        if self.ah_flag:
            # Re-rank by authorities and hubs.
            start_time = time.time()
            if verbose:
                print("estimating root set")
            # Was `assert pr_flag is False`, a NameError on an undefined
            # local that made this whole branch unusable.
            assert not self.pr_flag, "ah_flag and pr_flag are mutually exclusive"
            root_set = list(idx[0:self.root_set_size])
            if verbose:
                print("growing base set")
            base_set, link_adj, citation_adj = self._grow_base_set(
                root_set, verbose=verbose
            )
            if verbose:
                print("size of base set is " + str(len(base_set)))
                print("size of citation adjacency is " + str(len(citation_adj.keys())))
                print("size of link adjacency is " + str(len(link_adj.keys())))

            auth_score, hub_score = authorities_hubs(
                numDocs=self.numDocs,
                adj=(link_adj, citation_adj),
                nodes=base_set,
                maxIter=self.maxIter,
                verbose=False
            )
            auth_idx = sorted(
                range(len(auth_score)), key=lambda k: auth_score[k], reverse=True
            )
            hub_idx = sorted(
                range(len(hub_score)), key=lambda k: hub_score[k], reverse=True
            )
            end_time = time.time()
            print("authorities and hubs took " + str(end_time - start_time) + " seconds")
        elif self.pr_flag:
            start_time = time.time()
            sim_new = page_rank_score(
                weight=pr_weight,
                similarities=sim,
                pr_val=self.pr_values,
                verbose=verbose
            )
            idx_new = sorted(
                range(len(sim_new)), key=lambda k: sim_new[k], reverse=True
            )
            end_time = time.time()
            print("page rank took " + str(end_time - start_time) + " seconds")

        if self.cluster_results:
            # The return value was never used; the call is kept.
            cluster(
                doc_ids=idx[0:self.n_retrieves],
                lexicon=self.lexicon,
                r=self.reader,
                num_clusters=self.num_clusters,
                verbose=verbose
            )

        if self.ah_flag:
            return (idx, sim, auth_idx, auth_score, hub_idx, hub_score)
        elif self.pr_flag:
            return (idx, sim, idx_new, sim_new)
        else:
            return (idx, sim)

    def run(self, query, print_urls=True, pr_weight=0.4, verbose=False):
        """
        Parse ``query``, retrieve the rankings and optionally print them.

        Args:
            query: raw query handed to ``parse_query``.
            print_urls: print the top ``n_retrieves`` entries of every
                ranking that was computed.
            pr_weight: forwarded to ``retrieve`` in page-rank mode.
            verbose: forwarded to ``retrieve``.

        Returns:
            The last document fetched from the reader while printing, or
            ``None`` when nothing was printed.
        """
        self.query = parse_query(query, self.reader)
        start_time = time.time()
        rankings = []
        if self.ah_flag:
            doc_ids, score, auth_ids, auth_score, hub_ids, hub_score = self.retrieve(
                verbose=verbose
            )
            rankings.append(("authorities based retreival", auth_ids, auth_score))
            rankings.append(("hubs based retreival", hub_ids, hub_score))
        elif self.pr_flag:
            doc_ids, score, pr_ids, pr = self.retrieve(
                pr_weight=pr_weight, verbose=verbose
            )
            rankings.append(("page rank based retreival", pr_ids, pr))
        else:
            doc_ids, score = self.retrieve(verbose=verbose)
        rankings.insert(0, ("vector space retreival", doc_ids, score))
        end_time = time.time()
        print("in total " + str(end_time - start_time) + " seconds for retrieval")

        # Bound up front: used to be unbound (NameError at the return) when
        # print_urls was false.
        d = None
        if print_urls:
            for title, ids, scores in rankings:
                print(title)
                # Slicing keeps this safe when fewer than n_retrieves
                # entries exist.
                for doc_id in ids[:self.n_retrieves]:
                    d = self.reader.document(doc_id)
                    print("doc: [" + str(doc_id) + "], score: ["
                          + str(scores[doc_id]) + "]")
        print("retrieval complete. ")
        print("...........................................................................")
        return d