from urllib.request import urlopen
import pickle


def gather_links_save_page(page_url):
    """Download page_url, optionally save its HTML to disk, and return the set of links found in it."""
    html_string = ''
    try:
        response = urlopen(page_url, timeout=10)
        # getheader() returns None when the header is missing, so fall back to ''.
        if 'text/html' in (response.getheader('Content-Type') or ''):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
            if Crawler.save_html_pages:
                # NOTE: this read-then-increment is not atomic; when several
                # crawler threads run concurrently, two pages can be assigned
                # the same code (see the thread-safe sketch below).
                code = Crawler.count_code
                Crawler.count_code += 1
                write_file(Crawler.pages_folder + str(code), html_string)
                Crawler.code_from_url[page_url] = code
                Crawler.url_from_code[code] = page_url
                # Checkpoint both mappings to disk every 100 pages.
                if code % 100 == 0:
                    print('storing with pickle: code_from_url and url_from_code')
                    with open('code_from_url_dict.pickle', 'wb') as handle:
                        pickle.dump(Crawler.code_from_url, handle, protocol=pickle.HIGHEST_PROTOCOL)
                    with open('url_from_code_dict.pickle', 'wb') as handle:
                        pickle.dump(Crawler.url_from_code, handle, protocol=pickle.HIGHEST_PROTOCOL)
        link_extractor = LinkExtractor(Crawler.base_url, page_url, True, Crawler.domain_name)
        link_extractor.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return link_extractor.page_links()
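# The NOTE above flags a real race: Crawler.count_code is read and then
# incremented in two separate steps, so two threads can observe the same
# value and save two pages under one code. A minimal sketch of a fix,
# assuming a lock guards the counter (the `_code_lock` name and `next_code`
# helper are hypothetical, not part of the original code):
import threading

_code_lock = threading.Lock()  # hypothetical: serializes code assignment


def next_code():
    # Read and increment Crawler.count_code inside the lock, so each
    # concurrent thread is guaranteed a distinct code.
    with _code_lock:
        code = Crawler.count_code
        Crawler.count_code += 1
    return code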
def preprocess_documents(self):
    """Re-read every saved page, index each one, and rebuild the link graph between page codes."""
    web_graph = graph.OptimizedDirectedGraph()
    with open('code_from_url_dict.pickle', 'rb') as handle:
        code_from_url = pickle.load(handle)
    # Pages were saved under consecutive integer codes, so iterating over
    # range(self.n_pages) visits exactly the files the crawler wrote
    # (equivalent to listing self.FOLDER + '/pages/').
    for code in range(self.n_pages):
        with open(self.FOLDER + '/pages/' + str(code)) as f:
            doc_text = f.read()
        if not doc_text:  # read() returns '' for an empty file, never None
            print('empty doc:', code)
            continue
        self.process_page(code, doc_text)
        link_extractor = LinkExtractor(self.FOLDER, self.HOMEPAGE, self.DOMAIN_NAME)
        link_extractor.feed(doc_text)
        links = link_extractor.page_links()
        web_graph.add_node(code)
        # Add an edge only for links that point at a page we actually crawled.
        for url in links:
            if url in code_from_url:
                web_graph.add_edge(code, code_from_url[url])
    return web_graph
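# For context, a minimal sketch of the directed-graph interface that
# preprocess_documents relies on; the real graph.OptimizedDirectedGraph may
# be implemented differently. add_node and add_edge are the calls used above;
# get_pointing_to is an assumed accessor (the name comes from the original's
# commented-out debug code) taken here to return a node's out-neighbours.
class OptimizedDirectedGraph:
    def __init__(self):
        self.out_links = {}  # node -> set of nodes it links to
        self.in_links = {}   # node -> set of nodes linking to it

    def add_node(self, node):
        self.out_links.setdefault(node, set())
        self.in_links.setdefault(node, set())

    def add_edge(self, source, target):
        # Ensure both endpoints exist, then store the edge in both
        # directions so in- and out-neighbourhoods are O(1) lookups,
        # which a PageRank-style pass over the graph would need.
        self.add_node(source)
        self.add_node(target)
        self.out_links[source].add(target)
        self.in_links[target].add(source)

    def get_pointing_to(self, node):
        return self.out_links.get(node, set())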