def load_graph(self, path="C:\\Users\\Gudli\\Desktop\\OISISI Drugi projekat\\python-2.7.7-docs-html"):
    """
    Build the page graph from the '.html' files found under *path*.

    For each html file reached by walking the directory tree, a Page object
    (the file's absolute path plus the pages it links to) is created and
    appended to ``self.pages``.  Every page then becomes a vertex in
    ``self.graph``, and a directed edge is inserted for each link from a
    page to another page.

    :param path: root directory to scan for '.html' files.  Defaults to the
        previously hard-coded location so existing callers are unaffected.
    """
    parser = Parser()
    # Collect a Page object for every html file in the tree.
    for root, dirs, files in os.walk(path, topdown=True):
        for filename in files:
            # endswith() instead of a substring test: the old check
            # (r".html" in filename) also accepted names like 'x.html.bak'.
            if filename.endswith(".html"):
                full_path = os.path.join(root, filename)
                parser.parse(full_path)
                self.pages.append(Page(full_path, parser.links))
    # One vertex per page.
    for page in self.pages:
        self.graph.insert_vertex(Graph.Vertex(page.path))
    # One directed edge per link from a page to a linked page.
    for page in self.pages:
        for link in page.links:
            self.graph.insert_edge(Graph.Vertex(page.path), Graph.Vertex(link))
def page_rank(max_iter, pages, graph, loader, words, result_set):
    """
    Rank *pages* with a keyword-weighted PageRank and return the sorted result.

    Each page starts with rank 1/n.  On every iteration a page's rank is
    recomputed from the ranks of the pages linking to it (divided by their
    out-degree), damped by d = 0.85, and scaled by a logarithmic factor that
    rewards pages where the query words make up a larger share of the text.

    :param max_iter: number of PageRank iterations to run.
    :param pages: list of Page objects (each with .path and .word_count).
    :param graph: link graph; incident_edges(v, False) is read as the
        incoming edges of v, incident_edges(v) as its outgoing edges
        (same calls as before — TODO confirm against the Graph class).
    :param loader: provides trie.getWordCountForPage and getPageNum.
    :param words: query words; words.set.keys() is iterated.
    :param result_set: forwarded unchanged to sort_ranks().
    :return: whatever sort_ranks(rank, result_set) returns.
    """
    start = time.time()
    print("Ranking pages...")

    n = len(pages)
    if n == 0:
        # Nothing to rank; avoid the 1/n division below.
        return sort_ranks({}, result_set)

    d = 0.85  # damping factor
    rank = {page.path: 1 / n for page in pages}

    # Everything that does not change between iterations is computed once
    # up front.  The original recomputed the incoming-edge lists, the
    # out-degrees and the query word counts inside the max_iter loop even
    # though the graph and the word counts are static while ranking runs.
    incoming = {}    # page path -> list of (source path, source out-degree)
    multiplier = {}  # page path -> keyword-weight factor
    for page in pages:
        sources = []
        for edge in graph.incident_edges(Graph.Vertex(page.path), False):
            origin = edge.origin()
            sources.append((origin.element(), len(graph.incident_edges(origin))))
        incoming[page.path] = sources

        query_hits = 0
        for word in words.set.keys():
            query_hits += loader.trie.getWordCountForPage(word, loader.getPageNum(page.path))
        # Guard against an empty page (word_count == 0), which previously
        # raised ZeroDivisionError; an empty page gets no keyword boost.
        if page.word_count:
            multiplier[page.path] = 1 + log(query_hits / page.word_count + 1, 2)
        else:
            multiplier[page.path] = 1

    for _ in range(max_iter):
        for page in pages:
            rank_sum = sum(rank[src] / out_degree
                           for src, out_degree in incoming[page.path])
            rank[page.path] = (1 - d) + d * rank_sum * multiplier[page.path]

    end = time.time()
    print("Ranked all pages in " + str(round(end - start, 2)) + " seconds.")
    return sort_ranks(rank, result_set)
def loadTrieViaHTML(self, path):
    """
    Parse every '.html' file under *path*, fill the Trie and build the Graph.

    self.getHtmlFiles(path) collects the absolute paths of all '.html'
    files into self.files.  Each file is parsed; its words go into
    self.trie keyed by a sequential page number (also recorded in
    self.dict), and a Page object is kept in self.pages.  Afterwards every
    page becomes a Graph vertex and every link becomes a directed edge.

    :param path: root directory to scan for '.html' files.
    """
    parser = Parser()
    start = time.time()

    self.getHtmlFiles(path)  # fills self.files with absolute '.html' paths
    # enumerate() replaces the old manual page_counter = -1 / += 1 pattern.
    for page_counter, file in enumerate(self.files):
        self.dict[page_counter] = file   # page number -> file path
        parser.parse(file)               # parse the page at the given path
        # Page object kept for the graph-building pass below.
        page = Page(file, parser.links, len(parser.words))
        self.pages.append(page)
        for word in parser.words:        # insert every word into the Trie
            self.trie.insertWord(word, page_counter)

    # Graph creation: one vertex per page...
    # (the old bare string literals used as "comments" were actually
    # no-op expression statements; real comments replace them)
    for page in self.pages:
        self.graph.insert_vertex(Graph.Vertex(page.path))
    # ...and one directed edge per link between pages.
    for page in self.pages:
        for link in page.links:
            self.graph.insert_edge(Graph.Vertex(page.path), Graph.Vertex(link))

    end = time.time()
    print("Parsed files, loaded Trie and formed a Graph in " + str(round(end - start, 2)) + " seconds.")