Ejemplo n.º 1
0
 def load_graph(self, path=None):
     """Build the page list and the link graph from a tree of HTML files.

     Walks ``path`` recursively, parses every file whose name ends in
     ``.html``, appends one ``Page`` per file to ``self.pages`` and then
     mirrors ``self.pages`` into ``self.graph``: one vertex per page and one
     edge per (page, link) pair.

     Args:
         path: Root directory to scan. When omitted, the original
             hard-coded documentation directory is used, so existing
             ``load_graph()`` calls keep working.
     """
     if path is None:
         # Historical default; machine-specific, so callers should normally
         # pass an explicit directory.
         path = "C:\\Users\\Gudli\\Desktop\\OISISI Drugi projekat\\python-2.7.7-docs-html"

     parser = Parser()

     # For each html file under the directory, create an object holding the
     # page path and whatever ``parser.links`` contains after parsing it.
     for root, _dirs, files in os.walk(path, topdown=True):
         for filename in files:
             # Suffix test instead of a substring test, so names such as
             # "index.html.bak" are not treated as HTML pages.
             if not filename.endswith(".html"):
                 continue
             full_path = os.path.join(root, filename)
             parser.parse(full_path)
             self.pages.append(Page(full_path, parser.links))

     # Add every known page (including any already in self.pages before
     # this call) to the graph as a vertex.
     for page in self.pages:
         self.graph.insert_vertex(Graph.Vertex(page.path))

     # Connect each page to every page it links to.
     for page in self.pages:
         for link in page.links:
             self.graph.insert_edge(Graph.Vertex(page.path),
                                    Graph.Vertex(link))
Ejemplo n.º 2
0
def page_rank(max_iter, pages, graph, loader, words, result_set):
    """Rank pages with a word-weighted PageRank variant and return the sorted result.

    Every page starts at rank ``1 / len(pages)``. On each of the ``max_iter``
    passes a page's rank is recomputed as::

        (1 - d) + d * link_sum * (1 + log2(hits / word_count + 1))

    where ``link_sum`` adds ``rank[q] / out_degree(q)`` for every page ``q``
    at the origin of an edge returned by ``graph.incident_edges(v, False)``,
    ``hits`` is the summed ``getWordCountForPage`` value over the keys of
    ``words.set``, and ``word_count`` is ``page.word_count``. Ranks are updated
    in place, so pages later in ``pages`` see the already-updated ranks of
    earlier pages within the same pass.

    Args:
        max_iter: Number of passes over ``pages``.
        pages: Sized, re-iterable collection of objects exposing ``path`` and
            ``word_count``.
        graph: Link graph exposing ``incident_edges``.
        loader: Object exposing ``trie.getWordCountForPage`` and ``getPageNum``.
        words: Object whose ``set`` attribute exposes ``keys()`` yielding the
            words to weight by.
        result_set: Passed unchanged to ``sort_ranks``.

    Returns:
        Whatever ``sort_ranks(rank, result_set)`` returns, where ``rank`` maps
        page path to its final score.
    """
    start = time.time()
    print("Ranking pages...")

    d = 0.85  # damping factor
    n = len(pages)
    rank = {page.path: 1 / n for page in pages}

    # Link structure and word statistics are not modified while ranking, so
    # gather them once per page instead of once per page per iteration.
    sources = {}
    boost = {}
    for page in pages:
        incoming_edges = graph.incident_edges(Graph.Vertex(page.path), False)
        origins = [edge.origin() for edge in incoming_edges]
        sources[page.path] = [
            (origin.element(), len(graph.incident_edges(origin)))
            for origin in origins
        ]

        page_num = loader.getPageNum(page.path)
        hits = sum(
            loader.trie.getWordCountForPage(word, page_num)
            for word in words.set.keys()
        )
        # A page without any words would otherwise divide by zero; treat its
        # hit ratio as 0, which yields a neutral boost of 1.
        ratio = hits / page.word_count if page.word_count else 0
        boost[page.path] = 1 + log(ratio + 1, 2)

    for _ in range(max_iter):
        for page in pages:
            rank_sum = 0
            for source_path, out_degree in sources[page.path]:
                rank_sum += rank[source_path] / out_degree
            rank[page.path] = (1 - d) + d * (rank_sum * boost[page.path])

    end = time.time()
    print("Ranked all pages in " + str(round(end - start, 2)) + " seconds.")

    return sort_ranks(rank, result_set)
Ejemplo n.º 3
0
    def loadTrieViaHTML(self, path):
        """
            Fills the Trie, the page list and the Graph from the files gathered for 'path'.

            'self.getHtmlFiles(path)' is called first; the files processed afterwards are the entries of
            'self.files' (presumably populated by that call - confirm in getHtmlFiles). Each file gets a
            sequential page number starting at 0, which is stored in 'self.dict' and used when inserting the
            file's words into the Trie. Once all files are parsed, every page in 'self.pages' becomes a vertex
            and every link of a page becomes an edge. The elapsed time is printed at the end.
        """
        parser = Parser()
        started_at = time.time()

        self.getHtmlFiles(path)

        for page_number, html_file in enumerate(self.files):
            # Remember which file belongs to this page number
            self.dict[page_number] = html_file

            parser.parse(html_file)

            # Page object used later on for building the Graph
            self.pages.append(Page(html_file, parser.links, len(parser.words)))

            # Every word of the page goes into the Trie, tagged with the page number
            for term in parser.words:
                self.trie.insertWord(term, page_number)

        # Graph creation: one Vertex per page ...
        for known_page in self.pages:
            self.graph.insert_vertex(Graph.Vertex(known_page.path))

        # ... and one edge per link between pages
        for known_page in self.pages:
            for target in known_page.links:
                self.graph.insert_edge(Graph.Vertex(known_page.path), Graph.Vertex(target))

        elapsed = time.time() - started_at
        print("Parsed files, loaded Trie and formed a Graph in  " + str(round(elapsed, 2)) + " seconds.")