def generate_adjacency_matrix(self, drop_static=False):
    """Dump the page-to-page link graph as JSON adjacency lists.

    Every selected ``Page`` row becomes a key (its ``id``) whose value is
    the list of ids of the selected pages it links to.  Links whose URL
    does not belong to a selected page are ignored, since there is no id
    to record for them.

    Args:
        drop_static: When true, only pages whose ``content_type`` contains
            ``"text/html"`` take part in the graph, both as sources and
            as link targets.

    Returns:
        None.  The result is written to the matrix JSON file; JSON
        serialisation turns the page-id keys into strings.
    """
    # One filter up front instead of re-checking the content type in every
    # loop.  ``content_type`` is only inspected when drop_static is set.
    pages = [
        page
        for page in Page.select()
        if not drop_static or "text/html" in page.content_type
    ]

    # URL -> numeric id, used to translate link URLs into graph nodes.
    ids = {page.url: int(page.id) for page in pages}
    matrix = {page.id: set() for page in pages}

    for page in pages:
        targets = matrix[page.id]
        for link in json.loads(page.links):
            # Links to pages without a stored row have no id; skipping
            # them (regardless of drop_static) avoids a KeyError.
            target_id = ids.get(link)
            if target_id is not None:
                targets.add(target_id)

    # Sets are not JSON-serialisable, so convert each one to a list.
    adjacency = {page_id: list(targets) for page_id, targets in matrix.items()}

    # NOTE(review): the backslash is a path separator only on Windows; on
    # other platforms this names a single file in the working directory.
    # Left unchanged because readers of this file are not visible here.
    with open("data\\matrix.json", "w") as w:
        w.write(json.dumps(adjacency))
def load(self):
    """Rebuild in-memory state from the stored ``Page`` rows.

    After the call:

    * ``self.pages`` maps the URL of every stored page to ``None``.
    * ``self.queue`` is the set of link URLs found on stored pages that
      are not themselves the URL of a stored page.

    Returns:
        The largest ``id`` among the stored pages, or ``1`` when there
        are no pages or no id exceeds ``1``.
    """
    highest_id = 1
    seen_urls = set()
    discovered = set()

    for record in Page.select():
        highest_id = max(highest_id, record.id)
        seen_urls.add(record.url)
        # ``links`` is stored as a JSON-encoded collection of URLs.
        discovered.update(json.loads(record.links))

    self.pages = dict.fromkeys(seen_urls)
    self.queue = discovered - seen_urls
    return highest_id