class Search(object):
    """Search facade that ranks results by a blend of BM25 relevance and PageRank."""

    def __init__(self, generate_pr_score=True):
        """Load all content from Mongo, build the link graph, and prepare PageRank.

        When generate_pr_score is True the PageRank scores are computed from
        scratch and persisted to the database; otherwise previously saved
        scores are reloaded instead of recomputed.
        """
        self.db = MongoPipeline('AnveshanDB')
        self.content = self.db.get_content()
        self.graph = Graph(self.content)
        if not generate_pr_score:
            # Reuse the scores persisted by an earlier run.
            self.pr = PageRank(graph=self.graph, score=self.db.get_pr_score())
        else:
            self.pr = PageRank(self.graph)
            print("calc pr")
            fresh_score = self.pr.get_score()
            print("Saving pr to db")
            self.db.save_pr_score(fresh_score)

    def search(self, query, user_resource=None):
        """Return matching results sorted by combined BM25 + PageRank score.

        user_resource, when given, supplies a user-specific "pr_score" map
        that replaces the global PageRank scores.
        """
        self.index_search_result = {}
        self.content_search_result = {}
        query_tokens = Tokenizer().processItem(query)
        self.index_search_result, self.content_search_result = (
            self.db.get_content_by_index(query_tokens))
        merged = combine_index_content_result(
            self.index_search_result,
            self.content_search_result,
        )
        relevance = BM25(query_tokens).get_relevance_score(merged)
        if user_resource is not None:
            pr_component = PageRank.filter_score_from_pr_score(
                self.content_search_result, user_resource["pr_score"])
        else:
            pr_component = self.pr.get_score_for_search(self.content_search_result)
        total = combine_score(relevance, pr_component)

        def rank_key(content):
            # Debug trace of the components that feed the final ordering.
            print(content['url'], ": BM25 : ", relevance[content['url']],
                  "PR: ", pr_component[content['url']])
            return total[content['url']]

        return sorted(self.content_search_result, key=rank_key, reverse=True)

    def personalized_search(self, query, user_resource):
        """Search with the caller-supplied per-user PageRank resources."""
        # user resource contains user-specific pr_score and personalization_vector
        return self.search(query, user_resource)
def get_page_rank():
    """Return the module-wide PageRank singleton, bootstrapping it on first use."""
    global __page_rank
    if __page_rank is None:
        print(' * Bootstrap PageRank')
        __page_rank = PageRank()
        __page_rank.get()
    return __page_rank
def __init__(self, generate_pr_score=True):
    """Fetch content from Mongo, build the link graph, and set up PageRank.

    generate_pr_score=True recomputes PageRank and saves the scores to the
    database; False reloads the scores saved by a previous run.
    """
    self.db = MongoPipeline('AnveshanDB')
    self.content = self.db.get_content()
    self.graph = Graph(self.content)
    if generate_pr_score:
        self.pr = PageRank(self.graph)
        print("calc pr")
        computed_scores = self.pr.get_score()
        print("Saving pr to db")
        self.db.save_pr_score(computed_scores)
    else:
        # Skip the expensive recomputation and load persisted scores.
        self.pr = PageRank(graph=self.graph, score=self.db.get_pr_score())
def search(self, query, user_resource=None):
    """Rank content matching *query* by combined BM25 and PageRank scores.

    A non-None user_resource provides a per-user "pr_score" mapping that is
    filtered down to the current result set instead of the global scores.
    """
    self.index_search_result = {}
    self.content_search_result = {}
    tok = Tokenizer()
    query_tokens = tok.processItem(query)
    self.index_search_result, self.content_search_result = (
        self.db.get_content_by_index(query_tokens))
    merged_results = combine_index_content_result(
        self.index_search_result,
        self.content_search_result,
    )
    scorer = BM25(query_tokens)
    # BM25 relevance over the merged index/content results.
    relevance = scorer.get_relevance_score(merged_results)
    # PageRank component: global scores, or the user's own filtered scores.
    if user_resource is not None:
        pr_component = PageRank.filter_score_from_pr_score(
            self.content_search_result, user_resource["pr_score"])
    else:
        pr_component = self.pr.get_score_for_search(self.content_search_result)
    blended = combine_score(relevance, pr_component)

    def rank_key(content):
        # Debug trace of each score component per result.
        print(content['url'], ": BM25 : ", relevance[content['url']],
              "PR: ", pr_component[content['url']])
        return blended[content['url']]

    return sorted(self.content_search_result, key=rank_key, reverse=True)
def __init__(self, generate_pr_score=True):
    """Build the content graph and a personalized PageRank model.

    With generate_pr_score=True, a personalization vector is derived from
    the graph, PageRank is computed with it, and both the scores and the
    vector are persisted. With False, previously saved scores are reloaded.
    """
    self.db = MongoPipeline('AnveshanDB')
    self.content = self.db.get_content()
    self.graph = Graph(self.content)
    if not generate_pr_score:
        # Load the scores persisted by an earlier run.
        self.pr = PageRank(graph=self.graph, score=self.db.get_pr_score())
    else:
        p_vector = get_personalization_vector(self.graph.graph, self.content)
        self.pr = PageRank(self.graph, personalization=p_vector)
        print("calc pr")
        computed_scores = self.pr.get_score()
        print("Saving pr to db")
        self.db.save_pr_score(computed_scores)
        # Persist the personalization vector alongside the scores.
        save_personalization_vector(p_vector)
def search(self, query, user_resource=None):
    """Rank results for a pre-tokenized *query* object.

    Expects query.true_tokens and query.token_weights (tokenization already
    done upstream). A non-None user_resource switches to the logged-in
    user's tag-weighted BM25 and per-user PageRank scores.
    """
    self.index_search_result = {}
    self.content_search_result = {}
    query_tokens = query.true_tokens
    print(query.token_weights)
    self.index_search_result, self.content_search_result = (
        self.db.get_content_by_index(query_tokens, query.token_weights))
    print(len(self.index_search_result), len(self.content_search_result))
    merged_results = combine_index_content_result(
        self.index_search_result,
        self.content_search_result,
    )
    scorer = BM25(query_tokens)
    if user_resource is None:
        relevance = scorer.get_relevance_score(merged_results)
        pr_component = self.pr.get_score_for_search(self.content_search_result)
        blended = combine_score(relevance, pr_component)
    else:
        # Personalized path: weight BM25 by the user's tags and blend with
        # the user's own PageRank scores using their pr/bm25 preferences.
        user = current_user
        relevance = scorer.get_relevance_score(merged_results, user.tags)
        pr_component = PageRank.filter_score_from_pr_score(
            self.content_search_result, user_resource["pr_score"])
        blended = combine_score(relevance, pr_component,
                                pr=user.pr, bm25=user.bm25)

    def rank_key(content):
        # Debug trace of each score component per result.
        print(content['url'], ": BM25 : ", relevance[content['url']],
              "PR: ", pr_component[content['url']])
        return blended[content['url']]

    candidates = [entry[0] for entry in self.content_search_result]
    return sorted(candidates, key=rank_key, reverse=True)
def build(self, directory=config.DUMP_FOLDER):
    """Index every 'enwiki' dump file in *directory*, write the link-adjacency
    list, then rebuild the adjacency assets and PageRank scores.

    Raises FileNotFoundError if the search index was never initialized.

    Fixes over the previous version:
    - outbound-link dedup used ``not in`` on a growing list (O(n^2) per
      article); a companion set makes membership O(1) while the list keeps
      first-seen order for the adjacency line.
    - per-article state (categories / links) was only cleared on success,
      so a failed article leaked its partial links into the next one; the
      clearing now runs in ``finally``.
    - ``title`` could be unbound inside ``except`` when ``xml_parser.get``
      itself raised, crashing the skip path with NameError.
    - the adjlist file handle leaked on any unexpected exception; it is now
      managed by ``with``.
    """
    if not self.index:
        raise FileNotFoundError('Index not initialized')
    writer = self.index.writer(limitmb=2048, procs=N_PROC, multisegment=True)
    compiler = Compiler()
    categories = []        # categories of the article currently being parsed
    reverse_graph = []     # outbound links, first-seen order
    seen_links = set()     # O(1) membership companion to reverse_graph

    def parse_link(node, article):
        # Record either the category name or a deduplicated outbound link.
        category = node.value.category()
        if category:
            categories.append(category.group())
        else:
            article_link = normalize_title(node.value.text)
            if article_link not in seen_links:
                seen_links.add(article_link)
                reverse_graph.append(article_link)

    miss = 0
    count = 0
    with open(ASSETS_DATA / 'graphs' / 'graph.adjlist.tmp', "w") as file_adjlist:
        for wiki in directory.iterdir():
            if not (wiki.is_file() and wiki.stem.startswith('enwiki')):
                continue
            for root in self.xml_parser.from_xml(str(wiki)):
                count += 1
                if count > 20000:
                    # Periodic commit so a crash doesn't lose the whole run.
                    writer.commit()
                    writer = self.index.writer(limitmb=2048, procs=N_PROC,
                                               multisegment=True)
                    count = 0
                listener = None
                title = None  # so the except path can't hit an unbound name
                try:
                    article_id, title, text = self.xml_parser.get(root)
                    listener = compiler.on(
                        lambda node: parse_link(node, title.text),
                        ParseTypes.LINK)
                    logger.info(f'{title.text} compiling')
                    article = compiler.compile(text.text)
                    writer.add_document(title=title.text, text=article,
                                        categories=','.join(categories),
                                        id=f'{article_id}')
                    logger.info(f'{title.text} indexed')
                    listener and listener()  # Remove listeners
                    article_title = normalize_title(title.text)
                    adj_graph_str = " ".join(reverse_graph)
                    file_adjlist.write(article_title + " " + adj_graph_str + "\n")
                except (ParseError, MalformedTag, RedirectFound) as e:
                    miss += 1
                    listener and listener()  # Remove listeners
                    name = title.text if title is not None else '<unknown>'
                    logger.warning(f'{name} {e.type}, skipping')
                    continue
                finally:
                    # Always reset per-article state, even on failure,
                    # so one article's links never leak into the next.
                    categories.clear()
                    reverse_graph.clear()
                    seen_links.clear()
        if miss > 0:
            logger.warning(f'{miss} articles ignored')
        writer.commit()
    # Rebuild the downstream graph assets from the freshly written index.
    adj = Adjacency()
    adj.load_from_index()
    adj.write_adjlist_clean()
    pr = PageRank()
    pr.load_adjlist()
    pr.load_graphml()
    pr.generate_rank()
class Search(object):
    """Search facade blending tag-aware BM25 relevance with (optionally
    personalized) PageRank scores."""

    def __init__(self, generate_pr_score=True):
        """Build the content graph and a personalized PageRank model.

        generate_pr_score=True derives a personalization vector from the
        graph, computes PageRank with it, and persists both the scores and
        the vector; False reloads previously saved scores instead.
        """
        self.db = MongoPipeline('AnveshanDB')
        self.content = self.db.get_content()
        self.graph = Graph(self.content)
        if not generate_pr_score:
            # Reuse the scores persisted by an earlier run.
            self.pr = PageRank(graph=self.graph, score=self.db.get_pr_score())
        else:
            p_vector = get_personalization_vector(self.graph.graph, self.content)
            self.pr = PageRank(self.graph, personalization=p_vector)
            print("calc pr")
            computed_scores = self.pr.get_score()
            print("Saving pr to db")
            self.db.save_pr_score(computed_scores)
            # Persist the personalization vector alongside the scores.
            save_personalization_vector(p_vector)

    def search(self, query, user_resource=None):
        """Rank results for a pre-tokenized *query* object.

        Expects query.true_tokens and query.token_weights. A non-None
        user_resource switches to the logged-in user's tag-weighted BM25
        and per-user PageRank scores.
        """
        self.index_search_result = {}
        self.content_search_result = {}
        query_tokens = query.true_tokens
        print(query.token_weights)
        self.index_search_result, self.content_search_result = (
            self.db.get_content_by_index(query_tokens, query.token_weights))
        print(len(self.index_search_result), len(self.content_search_result))
        merged_results = combine_index_content_result(
            self.index_search_result,
            self.content_search_result,
        )
        scorer = BM25(query_tokens)
        if user_resource is None:
            relevance = scorer.get_relevance_score(merged_results)
            pr_component = self.pr.get_score_for_search(self.content_search_result)
            blended = combine_score(relevance, pr_component)
        else:
            # Personalized path: tag-weighted BM25 blended with the user's
            # own PageRank scores via their pr/bm25 preference weights.
            user = current_user
            relevance = scorer.get_relevance_score(merged_results, user.tags)
            pr_component = PageRank.filter_score_from_pr_score(
                self.content_search_result, user_resource["pr_score"])
            blended = combine_score(relevance, pr_component,
                                    pr=user.pr, bm25=user.bm25)

        def rank_key(content):
            # Debug trace of each score component per result.
            print(content['url'], ": BM25 : ", relevance[content['url']],
                  "PR: ", pr_component[content['url']])
            return blended[content['url']]

        candidates = [entry[0] for entry in self.content_search_result]
        return sorted(candidates, key=rank_key, reverse=True)

    def personalized_search(self, query, user_resource, personalization=True):
        """Search on behalf of the current user.

        When personalization is True the user_resource (per-user pr_score
        and personalization vector) is applied; otherwise falls back to the
        global, non-personalized search.
        """
        user = current_user
        print("pr: ", user.pr, "bm25: ", user.bm25)
        print("personalization", personalization)
        if not personalization:
            return self.search(query)
        return self.search(query, user_resource)

    def search_contributions(self, query):
        """Not implemented yet: search within user contributions."""
        pass