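# The crawler below distinguishes three download outcomes: the raw page bytes,
# None (transient failure, retry later) and the NotResolvable sentinel
# (permanent failure, never retry). The sentinel's real definition is not part
# of this excerpt; a minimal sketch of what it could look like, assuming the
# crawler only ever compares against it by identity ("is"):
NotResolvable = object()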
import logging
import threading
import urllib.error
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from http.client import RemoteDisconnected
from queue import Empty, Queue
from socket import timeout as socket_timeout

from pyoogle.preprocessing.web.net import WebNet
from pyoogle.preprocessing.web.nodestore import WebNodeStore

# WebParser and WebNode are defined elsewhere in the project; their import
# paths are not shown in this excerpt.


class Crawler:
    # Initializes the Crawler. If max_sites is greater than zero, it will only
    # download this many sites and stop afterwards; else it runs until no new
    # site is found.
    def __init__(self, store_path, link_constraint, max_sites=0, max_workers=2, timeout=30):
        self.store_path = store_path
        self.pending_links = Queue()
        self.pending_websites = Queue()
        self.web_net = None
        self.link_constraint = link_constraint
        if self.link_constraint is None:
            raise ValueError("No link constraint given!")
        self.already_processed_links = set()
        self.already_processed_websites = set()
        self.is_crawling = False
        self.max_sites = max_sites
        self.processed_sites_count = 0
        self.max_workers = max_workers
        self.timeout = timeout
        self.starting_processor = None
        self.links_processor = None
        self.websites_processor = None

    def _is_finished(self):
        return not self.is_crawling or self.has_maximum_sites_processed()

    def has_maximum_sites_processed(self):
        return 0 < self.max_sites <= self.processed_sites_count

    def process_link(self, link):
        if self._is_finished():
            return
        website = Crawler.download_website(link, self.timeout)
        if website is None:
            logging.debug("Website %s not downloaded", link)
        if website is NotResolvable:
            logging.debug("Website %s not resolvable and not trying again.", link)
            return
        return self, link, website

    @staticmethod
    def link_got_processed(future):
        if future.done() and future.result() is not None:
            self, link, website = future.result()
            if self._is_finished():
                return
            if website is None:
                # Revert and try later; the link must leave the processed set,
                # or obtain_new_link would skip it on the retry
                logging.debug("Website %s not downloaded, retrying later", link)
                self.already_processed_links.discard(link)
                self.add_link(link)
                return
            if not self.has_maximum_sites_processed():
                self.pending_websites.put((link, website))

    def obtain_new_link(self):
        link = None
        while link is None and not self._is_finished():
            try:
                link = self.pending_links.get(timeout=self.timeout)
            except Empty:
                logging.info("No more links found to process!")
                return
            if link in self.already_processed_links:
                link = None  # already processed, wait for the next one
        if link is not None:
            self.already_processed_links.add(link)
        return link

    def process_links(self):
        logging.info("Starting to process links")
        try:
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                while not self._is_finished():
                    # This may submit many futures when testing with limited
                    # max_sites (> 0), but they will be ignored!
                    link = self.obtain_new_link()
                    if link is None:
                        return
                    future = executor.submit(self.process_link, link)
                    future.add_done_callback(Crawler.link_got_processed)
        finally:
            self.stop()  # ensure the crawler is really stopped

    def process_website(self, link, website):
        logging.debug("Starting to parse %s, pending links %d", link, self.pending_links.qsize())
        try:
            webparser = WebParser(link, website)
        except ValueError:
            logging.debug("Website %s not parsable, ignored but out link kept", link)
            return
        web_hash = hash(webparser)
        if web_hash in self.already_processed_websites:
            # Already processed but with a different url; add this url to the
            # node so we know this in the future!
            logging.debug("Website %s already processed (with different url)!", link)
            node = self.web_net.get_by_content_hash(web_hash)
            if node is not None:
                node.add_url(link)
            return
        logging.info("Processed %d. link %s, pending websites %d",
                     self.processed_sites_count + 1, link, self.pending_websites.qsize())
        self.already_processed_websites.add(web_hash)
        self.processed_sites_count += 1
        builder = WebNode.Builder(self.link_constraint)
        builder.init_from_webparser(webparser)
        webnode = builder.make_node()
        self.web_net.add_node(webnode)
        for out_link in webnode.get_out_links():
            self.add_link(out_link)

    def process_websites(self, clear_store):
        # We are required to open the store in the same thread the store is modified in
        logging.info("Starting to process websites")
        with WebNodeStore(self.store_path, clear_store) as node_store:
            try:
                while not self._is_finished():
                    data = self.pending_websites.get(block=True)
                    if data is None:
                        break
                    link, website = data
                    self.process_website(link, website)
                node_store.save_webnodes(self.web_net.get_nodes())
            finally:
                self.stop()  # ensure the crawler is really stopped

    def _init_net(self, clear_store):
        self.web_net = WebNet()
        if not clear_store:
            # Do not clear the store but add new nodes to it: load the
            # existing nodes and add them to the webnet
            with WebNodeStore(self.store_path, clear=False) as node_store:
                for node in node_store.load_webnodes(True):
                    self.already_processed_websites.add(node.get_content_hash())
                    for link in node.get_urls():
                        self.already_processed_links.add(link)
                    self.web_net.add_node(node)
            # After we marked all already processed links, add the new
            # outgoing links to restart
            restart_link_count = 0
            total_link_out = 0
            for node in self.web_net:
                for link in node.get_out_links():
                    total_link_out += 1
                    if link not in self.already_processed_links:
                        self.add_link(link)
                        restart_link_count += 1
            logging.info("Restarting with %d links of %d", restart_link_count, total_link_out)

    def _start_async(self, clear_store):
        self._init_net(clear_store)
        self.links_processor = threading.Thread(target=self.process_links)
        self.links_processor.start()
        self.websites_processor = threading.Thread(target=Crawler.process_websites,
                                                   args=[self, clear_store])
        self.websites_processor.start()

    def join(self):
        try:
            # If this stops blocking, the other processors are valid
            self.starting_processor.join()
            self.websites_processor.join()
            self.links_processor.join()
        except KeyboardInterrupt:
            self.stop()

    def start(self, start_url, clear_store=True):
        logging.info("Starting crawling at %s", start_url)
        self.is_crawling = True
        self.add_link(start_url)
        self.starting_processor = threading.Thread(target=Crawler._start_async,
                                                   args=[self, clear_store])
        self.starting_processor.start()

    def add_link(self, link):
        link = self.link_constraint.get_valid(link)
        if link is None:
            return
        self.pending_links.put(link)

    def stop(self):
        if self.is_crawling:  # Race condition safe (could be executed multiple times)
            logging.info("Stopping crawling")
            self.is_crawling = False
            self.pending_websites.put(None)  # Ensure threads do not wait forever and exit
            self.pending_links.put(None)

    @staticmethod
    def download_website(url, timeout):
        # Download and read the website
        logging.debug("Downloading website %s", url)
        try:
            website = urllib.request.urlopen(url, timeout=timeout).read()
        except socket_timeout:
            logging.debug("Timeout error when downloading %s", url)
            website = None
        except urllib.error.HTTPError as err:
            if err.code // 100 == 4:
                logging.debug("Client http error when downloading %s %s", url, err)
                # 404 Not Found or other client error, ignore link in future
                website = NotResolvable
            else:
                logging.debug("HTTP error %d when downloading %s %s", err.code, url, err)
                website = None
        except urllib.error.URLError as err:
            logging.debug("Url error when downloading %s %s", url, err)
            website = None
        except RemoteDisconnected as disc:
            logging.debug("(RemoteDisconnected) error when downloading %s %s", url, disc)
            website = NotResolvable
        except UnicodeEncodeError:
            logging.debug("(UnicodeEncodeError) error when downloading %s", url)
            website = NotResolvable
        return website
# Note: this excerpt starts inside TeleportRanker.__init__; the class header
# and the start of __init__ (which stores teleport_prop) are not shown. The
# base class provides self.matrix, _build_matrix and _power_method_step.
        self._matrix_size = 1

    def __str__(self):
        return "Teleport Ranker (" + str(self.teleport_prop) + ")"

    # noinspection PyUnresolvedReferences
    def _build_matrix(self):
        super()._build_matrix()
        self._matrix_size = self.matrix.shape[0]  # Matrix is square

    def _power_method_step(self, x):
        # The teleport matrix is the ones((k, k)) matrix scaled by
        # (self.teleport_prop / k), where k = self._matrix_size.
        # We do not compute this full rank-1 matrix for performance reasons
        # and use the fact that norm(x, 1) = sum(x) = 1.
        return ((1. - self.teleport_prop) * super()._power_method_step(x)
                + self.teleport_prop / self._matrix_size)


if __name__ == "__main__":
    ranker = TeleportRanker(0.1)
    print("Starting ranking webnet.")
    from pyoogle.preprocessing.web.nodestore import WebNodeStore
    from pyoogle.preprocessing.web.net import WebNet
    from pyoogle.config import DATABASE_PATH

    with WebNodeStore(database_path=DATABASE_PATH) as store:
        loaded_webnet = WebNet()
        for loaded_node in store.load_webnodes(load_content=False):
            loaded_webnet.add_node(loaded_node)
        ranker.rank(loaded_webnet)
        store.save_webnodes(loaded_webnet.get_nodes())
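# A minimal sketch (assuming numpy, which the ranker's use of matrix.shape
# suggests) verifying the identity exploited in _power_method_step: for a
# probability vector x with sum(x) == 1, multiplying by the dense teleport
# matrix (t / k) * ones((k, k)) equals adding the scalar t / k to every
# component, so the O(k^2) rank-1 product can be replaced by an O(k) addition.
import numpy as np

k, t = 4, 0.1
x = np.full(k, 1.0 / k)                  # probability vector, sum(x) == 1
dense = ((t / k) * np.ones((k, k))) @ x  # explicit teleport matrix, O(k^2)
cheap = np.full(k, t / k)                # scalar shortcut used by the ranker, O(k)
assert np.allclose(dense, cheap)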