Exemple #1
0
class Search(object):
    """Query entry point that orders stored content by BM25 and PageRank scores."""

    def __init__(self, generate_pr_score=True):
        """Load the content, build its graph and prepare the PageRank helper.

        When ``generate_pr_score`` is truthy the score is computed from the
        graph and saved through the database pipeline; otherwise the score
        already stored in the database is handed to ``PageRank``.
        """
        self.db = MongoPipeline('AnveshanDB')
        self.content = self.db.get_content()
        self.graph = Graph(self.content)

        if not generate_pr_score:
            # Reuse the score persisted in the database instead of recomputing.
            self.pr = PageRank(graph=self.graph, score=self.db.get_pr_score())
            return

        self.pr = PageRank(self.graph)
        print("calc pr")
        fresh_score = self.pr.get_score()
        print("Saving pr to db")
        self.db.save_pr_score(fresh_score)

    def search(self, query, user_resource=None):
        """Return the content hits for ``query`` sorted by combined score.

        The query is tokenized, matching entries are fetched from the database,
        and every hit gets a BM25 relevance score plus a PageRank score. The
        PageRank part comes from ``self.pr`` unless ``user_resource`` is given,
        in which case its ``"pr_score"`` entry is filtered instead. Hits are
        returned highest combined score first.
        """
        self.index_search_result = {}
        self.content_search_result = {}

        query_tokens = Tokenizer().processItem(query)
        index_hits, content_hits = self.db.get_content_by_index(query_tokens)
        self.index_search_result = index_hits
        self.content_search_result = content_hits

        merged_hits = combine_index_content_result(
            self.index_search_result, self.content_search_result)

        # BM25 relevance of the merged hits with respect to the query tokens.
        score = BM25(query_tokens).get_relevance_score(merged_hits)

        # PageRank contribution: user-specific when a resource is supplied.
        if user_resource is not None:
            pr_score = PageRank.filter_score_from_pr_score(
                self.content_search_result, user_resource["pr_score"])
        else:
            pr_score = self.pr.get_score_for_search(self.content_search_result)

        combined_score = combine_score(score, pr_score)

        def ranking_key(content):
            # Prints both partial scores for each hit while sorting.
            url = content['url']
            print(url, ": BM25 : ", score[url], "PR: ", pr_score[url])
            return combined_score[url]

        ranked = list(self.content_search_result)
        ranked.sort(key=ranking_key, reverse=True)
        return ranked

    def personalized_search(self, query, user_resource):
        """Run :meth:`search` with a user-specific resource.

        ``user_resource`` carries the user's own ``pr_score`` (and, per the
        original note, a personalization vector).
        """
        ranked = self.search(query, user_resource)
        return ranked
Exemple #2
0
def get_page_rank():
    """Return the module-wide ``PageRank`` instance, creating it on first use.

    The instance is kept in the module global ``__page_rank``. On the first
    call a new ``PageRank`` is created and its ``get()`` method is invoked once
    (presumably loading or computing the ranks -- confirm against ``PageRank``);
    later calls return the cached object directly.
    """
    global __page_rank

    if __page_rank is not None:
        return __page_rank

    print(' * Bootstrap PageRank')

    # Bootstrap a local instance and publish it only afterwards: if get()
    # raises, nothing is cached and the next call retries instead of handing
    # out an instance whose bootstrap never completed.
    page_rank = PageRank()
    page_rank.get()
    __page_rank = page_rank

    return __page_rank
Exemple #3
0
 def __init__(self, generate_pr_score=True):
     """Load the content, build its graph and prepare the PageRank helper.

     A truthy ``generate_pr_score`` computes the score from the graph and
     saves it through the database pipeline; otherwise the score stored in
     the database is passed to ``PageRank`` as-is.
     """
     self.db = MongoPipeline('AnveshanDB')
     self.content = self.db.get_content()
     self.graph = Graph(self.content)

     if not generate_pr_score:
         # Reuse the persisted score instead of recomputing it.
         self.pr = PageRank(graph=self.graph, score=self.db.get_pr_score())
         return

     self.pr = PageRank(self.graph)
     print("calc pr")
     fresh_score = self.pr.get_score()
     print("Saving pr to db")
     self.db.save_pr_score(fresh_score)
Exemple #4
0
    def search(self, query, user_resource=None):
        """Return the content hits for ``query`` sorted by combined score.

        The query is tokenized, matching entries are fetched from the database,
        and each hit receives a BM25 relevance score and a PageRank score. The
        PageRank part comes from ``self.pr`` unless ``user_resource`` is given,
        in which case its ``"pr_score"`` entry is filtered instead. The hits
        are returned highest combined score first.
        """
        self.index_search_result = {}
        self.content_search_result = {}

        query_tokens = Tokenizer().processItem(query)
        index_hits, content_hits = self.db.get_content_by_index(query_tokens)
        self.index_search_result = index_hits
        self.content_search_result = content_hits

        merged_hits = combine_index_content_result(
            self.index_search_result, self.content_search_result)

        # BM25 relevance of the merged hits with respect to the query tokens.
        score = BM25(query_tokens).get_relevance_score(merged_hits)

        # PageRank contribution: user-specific when a resource is supplied.
        if user_resource is not None:
            pr_score = PageRank.filter_score_from_pr_score(
                self.content_search_result, user_resource["pr_score"])
        else:
            pr_score = self.pr.get_score_for_search(self.content_search_result)

        combined_score = combine_score(score, pr_score)

        def ranking_key(content):
            # Prints both partial scores for each hit while sorting.
            url = content['url']
            print(url, ": BM25 : ", score[url], "PR: ", pr_score[url])
            return combined_score[url]

        ranked = list(self.content_search_result)
        ranked.sort(key=ranking_key, reverse=True)
        return ranked
Exemple #5
0
 def __init__(self, generate_pr_score=True):
     """Load the content, build its graph and prepare the PageRank helper.

     A truthy ``generate_pr_score`` derives a personalization vector from the
     graph and content, computes the PageRank score with it, and saves both
     the score and the vector. Otherwise the score stored in the database is
     passed to ``PageRank`` as-is.
     """
     self.db = MongoPipeline('AnveshanDB')
     self.content = self.db.get_content()
     self.graph = Graph(self.content)

     if not generate_pr_score:
         # Reuse the persisted score instead of recomputing it.
         self.pr = PageRank(graph=self.graph, score=self.db.get_pr_score())
         return

     vector = get_personalization_vector(self.graph.graph, self.content)
     self.pr = PageRank(self.graph, personalization=vector)
     print("calc pr")
     fresh_score = self.pr.get_score()
     print("Saving pr to db")
     self.db.save_pr_score(fresh_score)
     # Keep the personalization vector alongside the saved score.
     save_personalization_vector(vector)
Exemple #6
0
    def search(self, query, user_resource=None):
        """Return the content records for ``query`` sorted by combined score.

        ``query`` must expose ``true_tokens`` and ``token_weights``; both are
        forwarded to the database lookup. Without ``user_resource`` the hits
        are scored with plain BM25 and the PageRank score held by ``self.pr``.
        With it, BM25 also receives ``current_user.tags``, the PageRank score
        is filtered from ``user_resource["pr_score"]``, and the two scores are
        combined using the user's ``pr`` and ``bm25`` settings. The first item
        of every content search entry is returned, best score first.
        """
        self.index_search_result = {}
        self.content_search_result = {}

        query_tokens = query.true_tokens
        print(query.token_weights)
        index_hits, content_hits = self.db.get_content_by_index(
            query_tokens, query.token_weights)
        self.index_search_result = index_hits
        self.content_search_result = content_hits
        print(len(self.index_search_result), len(self.content_search_result))

        merged_hits = combine_index_content_result(
            self.index_search_result, self.content_search_result)
        ranker = BM25(query_tokens)

        if user_resource is not None:
            # Personalized path: scores depend on the logged-in user.
            account = current_user
            score = ranker.get_relevance_score(merged_hits, account.tags)
            pr_score = PageRank.filter_score_from_pr_score(
                self.content_search_result, user_resource["pr_score"])
            combined_score = combine_score(
                score, pr_score, pr=account.pr, bm25=account.bm25)
        else:
            score = ranker.get_relevance_score(merged_hits)
            pr_score = self.pr.get_score_for_search(self.content_search_result)
            combined_score = combine_score(score, pr_score)

        def ranking_key(content):
            # Prints both partial scores for each record while sorting.
            url = content['url']
            print(url, ": BM25 : ", score[url], "PR: ", pr_score[url])
            return combined_score[url]

        ranked = [entry[0] for entry in self.content_search_result]
        ranked.sort(key=ranking_key, reverse=True)
        return ranked
Exemple #7
0
    def build(self, directory=config.DUMP_FOLDER):
        """Index every ``enwiki*`` dump in ``directory`` and regenerate the rank.

        Each article yielded by ``self.xml_parser`` is compiled and added to
        the index together with the categories found among its links. One line
        per article is written to ``graph.adjlist.tmp``: the normalized title
        followed by the normalized titles of the articles it links to.
        Articles raising ``ParseError``, ``MalformedTag`` or ``RedirectFound``
        are counted and skipped. Afterwards ``Adjacency`` reloads and cleans
        the adjacency list and ``PageRank`` loads it and generates the rank.

        Args:
            directory: Path-like object (must support ``iterdir``) containing
                the dump files.

        Raises:
            FileNotFoundError: If ``self.index`` is not initialized.
        """
        if not self.index:
            raise FileNotFoundError('Index not initialized')

        def new_writer():
            # Single place for the writer settings used by every batch.
            return self.index.writer(limitmb=2048,
                                     procs=N_PROC,
                                     multisegment=True)

        writer = new_writer()
        compiler = Compiler()

        # Per-article scratch buffers, filled by the link listener below.
        categories = []
        reverse_graph = []

        def parse_link(node):
            # Category links feed the document's categories; every other link
            # becomes a (de-duplicated) outgoing edge of the current article.
            category = node.value.category()

            if category:
                categories.append(category.group())
            else:
                article_link = normalize_title(node.value.text)
                if article_link not in reverse_graph:
                    reverse_graph.append(article_link)

        miss = 0
        count = 0
        adjlist_path = ASSETS_DATA / 'graphs' / 'graph.adjlist.tmp'

        # The context manager guarantees the file is closed even when an
        # unexpected exception aborts the build.
        with open(adjlist_path, "w") as file_adjlist:
            for wiki in directory.iterdir():
                if not (wiki.is_file() and wiki.stem.startswith('enwiki')):
                    continue

                for root in self.xml_parser.from_xml(str(wiki)):
                    count += 1

                    # Commit in batches and continue with a fresh writer.
                    if count > 20000:
                        writer.commit()
                        writer = new_writer()
                        count = 0

                    listener = None
                    # Reset so the warning below never reports the title of a
                    # previous article when xml_parser.get() itself fails.
                    title = None
                    try:
                        doc_id, title, text = self.xml_parser.get(root)
                        listener = compiler.on(parse_link, ParseTypes.LINK)
                        logger.info(f'{title.text} compiling')
                        article = compiler.compile(text.text)
                        writer.add_document(title=title.text,
                                            text=article,
                                            categories=','.join(categories),
                                            id=f'{doc_id}')
                        logger.info(f'{title.text} indexed')
                        article_title = normalize_title(title.text)
                        adj_graph_str = " ".join(reverse_graph)
                        file_adjlist.write(article_title + " " +
                                           adj_graph_str + "\n")
                    except (ParseError, MalformedTag, RedirectFound) as e:
                        miss += 1
                        name = title.text if title is not None else '<unknown>'
                        logger.warning(f'{name} {e.type}, skipping')
                    finally:
                        if listener:
                            listener()  # Remove listeners
                        # Always reset the buffers: links collected from a
                        # skipped article must not leak into the next one.
                        categories.clear()
                        reverse_graph.clear()

        if miss > 0:
            logger.warning(f'{miss} articles ignored')

        writer.commit()

        adj = Adjacency()
        adj.load_from_index()
        adj.write_adjlist_clean()

        pr = PageRank()
        pr.load_adjlist()
        pr.load_graphml()
        pr.generate_rank()
Exemple #8
0
class Search(object):
    """Query entry point that orders stored content by BM25 and PageRank scores."""

    def __init__(self, generate_pr_score=True):
        """Load the content, build its graph and prepare the PageRank helper.

        A truthy ``generate_pr_score`` derives a personalization vector from
        the graph and content, computes the PageRank score with it, and saves
        both the score and the vector. Otherwise the score stored in the
        database is passed to ``PageRank`` as-is.
        """
        self.db = MongoPipeline('AnveshanDB')
        self.content = self.db.get_content()
        self.graph = Graph(self.content)

        if not generate_pr_score:
            # Reuse the persisted score instead of recomputing it.
            self.pr = PageRank(graph=self.graph, score=self.db.get_pr_score())
            return

        vector = get_personalization_vector(self.graph.graph, self.content)
        self.pr = PageRank(self.graph, personalization=vector)
        print("calc pr")
        fresh_score = self.pr.get_score()
        print("Saving pr to db")
        self.db.save_pr_score(fresh_score)
        # Keep the personalization vector alongside the saved score.
        save_personalization_vector(vector)

    def search(self, query, user_resource=None):
        """Return the content records for ``query`` sorted by combined score.

        ``query`` must expose ``true_tokens`` and ``token_weights``; both are
        forwarded to the database lookup. Without ``user_resource`` the hits
        are scored with plain BM25 and the PageRank score held by ``self.pr``.
        With it, BM25 also receives ``current_user.tags``, the PageRank score
        is filtered from ``user_resource["pr_score"]``, and the two scores are
        combined using the user's ``pr`` and ``bm25`` settings. The first item
        of every content search entry is returned, best score first.
        """
        self.index_search_result = {}
        self.content_search_result = {}

        query_tokens = query.true_tokens
        print(query.token_weights)
        index_hits, content_hits = self.db.get_content_by_index(
            query_tokens, query.token_weights)
        self.index_search_result = index_hits
        self.content_search_result = content_hits
        print(len(self.index_search_result), len(self.content_search_result))

        merged_hits = combine_index_content_result(
            self.index_search_result, self.content_search_result)
        ranker = BM25(query_tokens)

        if user_resource is not None:
            # Personalized path: scores depend on the logged-in user.
            account = current_user
            score = ranker.get_relevance_score(merged_hits, account.tags)
            pr_score = PageRank.filter_score_from_pr_score(
                self.content_search_result, user_resource["pr_score"])
            combined_score = combine_score(
                score, pr_score, pr=account.pr, bm25=account.bm25)
        else:
            score = ranker.get_relevance_score(merged_hits)
            pr_score = self.pr.get_score_for_search(self.content_search_result)
            combined_score = combine_score(score, pr_score)

        def ranking_key(content):
            # Prints both partial scores for each record while sorting.
            url = content['url']
            print(url, ": BM25 : ", score[url], "PR: ", pr_score[url])
            return combined_score[url]

        ranked = [entry[0] for entry in self.content_search_result]
        ranked.sort(key=ranking_key, reverse=True)
        return ranked

    def personalized_search(self, query, user_resource, personalization=True):
        """Search on behalf of the current user.

        ``user_resource`` holds the user's own ``pr_score`` (and, per the
        original note, a personalization vector). It is forwarded to
        :meth:`search` only while ``personalization`` is truthy; otherwise a
        plain search is run.
        """
        account = current_user
        print("pr: ", account.pr, "bm25: ", account.bm25)
        print("personalization", personalization)

        if not personalization:
            return self.search(query)
        return self.search(query, user_resource)

    def search_contributions(self, query):
        """Placeholder: not implemented yet, always returns ``None``."""
        return None