Example 1
from urllib.request import urlopen
import pickle

# Crawler, LinkExtractor and write_file are defined elsewhere in the project.

def gather_links_save_page(page_url):
    html_string = ''
    try:
        response = urlopen(page_url, timeout=10)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            if Crawler.save_html_pages:
                # NOTE: when several crawler threads run this concurrently,
                # the unsynchronized counter below can assign the same code
                # to different pages.
                code = Crawler.count_code
                Crawler.count_code += 1
                write_file(Crawler.pages_folder + str(code), html_string)
                Crawler.code_from_url[page_url] = code
                Crawler.url_from_code[code] = page_url
                # Persist both lookup dictionaries every 100 pages.
                if code % 100 == 0:
                    print('storing with pickle: code_from_url and url_from_code')
                    with open('code_from_url_dict.pickle', 'wb') as handle:
                        pickle.dump(Crawler.code_from_url,
                                    handle,
                                    protocol=pickle.HIGHEST_PROTOCOL)
                    with open('url_from_code_dict.pickle', 'wb') as handle:
                        pickle.dump(Crawler.url_from_code,
                                    handle,
                                    protocol=pickle.HIGHEST_PROTOCOL)
                    # To load them back later:
                    # with open('filename.pickle', 'rb') as handle:
                    #     b = pickle.load(handle)
        link_extractor = LinkExtractor(Crawler.base_url, page_url, True,
                                       Crawler.domain_name)
        link_extractor.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return link_extractor.page_links()
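
The in-code comment in Example 1 points at a race on Crawler.count_code when several crawler threads call this function at once. A minimal sketch of one way to serialize the counter update, assuming a hypothetical count_lock added to the class (not part of the original snippet):

import threading

class Crawler:
    # Hypothetical additions for illustration; the real class lives in the
    # crawler project and has many more attributes.
    count_code = 0
    count_lock = threading.Lock()

    @staticmethod
    def next_code():
        # Serialize the read-and-increment pair so concurrent threads
        # never hand out the same code twice.
        with Crawler.count_lock:
            code = Crawler.count_code
            Crawler.count_code += 1
            return code

With such a helper, the body of gather_links_save_page could call Crawler.next_code() instead of touching the counter directly.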
Example 2
    # Requires `import pickle` plus the project's `graph` module and
    # LinkExtractor class at module level.
    def preprocess_documents(self):
        global link_extractor
        web_graph = graph.OptimizedDirectedGraph()

        # Mapping from URL to the numeric code assigned while crawling.
        with open('code_from_url_dict.pickle', 'rb') as handle:
            code_from_url = pickle.load(handle)

        # Pages were saved under their numeric codes, so iterating over the
        # range of codes is equivalent to listing the folder:
        # for filename in os.listdir(self.FOLDER + '/pages/'):
        for filename in range(self.n_pages):
            with open(self.FOLDER + '/pages/' + str(filename)) as f:
                doc_text = f.read()

            # f.read() returns '' (not None) for an empty file.
            if not doc_text:
                print('empty doc')
                print(filename)
                continue

            self.process_page(int(filename), doc_text)

            link_extractor = LinkExtractor(self.FOLDER, self.HOMEPAGE,
                                           self.DOMAIN_NAME)
            link_extractor.feed(doc_text)
            links = link_extractor.page_links()
            # print('document number ' + str(filename))
            # print('total links: ' + str(len(links)))
            count = 0
            web_graph.add_node(int(filename))
            for url in links:
                if url in code_from_url:
                    # if code_from_url[url] == 6:
                    #     print(url)
                    #     exit()
                    count += 1
                    web_graph.add_edge(int(filename), code_from_url[url])
            # print('node ' + str(filename) + str(web_graph.get_pointing_to(int(filename))))
        return web_graph
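
Example 2 assumes a graph module providing OptimizedDirectedGraph with add_node, add_edge and (in the commented debug line) get_pointing_to. The real implementation belongs to the crawler project; the following is only a minimal stand-in sketching the interface the snippet relies on:

class OptimizedDirectedGraph:
    # Minimal stand-in for graph.OptimizedDirectedGraph, covering only the
    # methods used in Example 2; the real class is part of the project.

    def __init__(self):
        self._out = {}   # node -> set of nodes it links to
        self._in = {}    # node -> set of nodes linking to it

    def add_node(self, node):
        self._out.setdefault(node, set())
        self._in.setdefault(node, set())

    def add_edge(self, src, dst):
        self.add_node(src)
        self.add_node(dst)
        self._out[src].add(dst)
        self._in[dst].add(src)

    def get_pointing_to(self, node):
        # In-neighbours of `node`, i.e. the pages that link to it.
        return self._in.get(node, set())

With this interface, the graph returned by preprocess_documents maps each page code to the codes it links to and is linked from.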