def scrape_root(self, root, helper):
    """ Scrape a root URL """
    t0 = time.time()
    # Fetch the root URL and scrape all child URLs that refer
    # to the same domain suffix and that we haven't seen before
    logging.info("Fetching root {0}".format(root.url))
    # Read the HTML document at the root URL
    html_doc = Fetcher.raw_fetch_url(root.url)
    if not html_doc:
        logging.warning("Unable to fetch root {0}".format(root.url))
        return
    # Parse the HTML document
    soup = Fetcher.make_soup(html_doc)
    # Obtain the set of child URLs to fetch
    fetch_set = Fetcher.children(root, soup)
    # Add the children whose URLs we don't already have to the
    # scraper articles table
    with SessionContext() as session:
        for url in fetch_set:
            if helper and helper.skip_url(url):
                # The helper doesn't want this URL
                continue
            # noinspection PyBroadException
            try:
                article = ArticleRow(url=url, root_id=root.id)
                # Leave article.scraped as NULL for later retrieval
                session.add(article)
                session.commit()
            except IntegrityError:
                # Article URL already exists in the database:
                # roll back and continue
                session.rollback()
            except Exception as e:
                logging.warning(
                    "Roll back due to exception in scrape_root: {0}"
                    .format(e)
                )
                session.rollback()
    t1 = time.time()
    logging.info("Root scrape completed in {0:.2f} seconds".format(t1 - t0))
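# For reference, below is a minimal, self-contained sketch of the
# insert-then-rollback pattern used in scrape_root() above, built on plain
# SQLAlchemy. The DemoArticle model, its table name, and the in-memory
# SQLite engine are illustrative stand-ins, not the project's actual
# ArticleRow schema or SessionContext machinery.

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class DemoArticle(Base):
    __tablename__ = "demo_articles"
    id = Column(Integer, primary_key=True)
    # The unique constraint on the URL is what makes a duplicate
    # insert raise IntegrityError at commit time
    url = Column(String, unique=True)


def demo_insert_pattern():
    engine = create_engine("sqlite://")
    Base.metadata.create_all(engine)
    with Session(engine) as session:
        for url in ("https://example.com/a", "https://example.com/a"):
            try:
                session.add(DemoArticle(url=url))
                # Committing per row lets a single duplicate fail
                # without discarding the rest of the batch
                session.commit()
            except IntegrityError:
                # The second insert of the same URL lands here:
                # roll back and continue, mirroring scrape_root()
                session.rollback()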
def urls2fetch(self, root, helper):
    """ Returns a set of URLs to fetch. If the scraper helper class
        has associated RSS feed URLs, these are used to acquire
        article URLs. Otherwise, the URLs are found by scraping the
        root website and searching for links to subpages. """
    fetch_set = set()
    feeds = None if helper is None else helper.feeds
    if feeds:
        for feed_url in feeds:
            logging.info("Fetching feed {0}".format(feed_url))
            try:
                d = feedparser.parse(feed_url)
            except Exception as e:
                logging.warning(
                    "Error fetching/parsing feed {0}: {1}"
                    .format(feed_url, str(e))
                )
                continue
            for entry in d.entries:
                if entry.link and not helper.skip_rss_entry(entry):
                    fetch_set.add(entry.link)
    else:
        # Fetch the root URL and scrape all child URLs
        # that refer to the same domain suffix
        logging.info("Fetching root {0}".format(root.url))
        # Read the HTML document at the root URL
        html_doc = Fetcher.raw_fetch_url(root.url)
        if not html_doc:
            logging.warning("Unable to fetch root {0}".format(root.url))
            # Return the empty set rather than None so that callers
            # can always iterate over the result
            return fetch_set
        # Parse the HTML document
        soup = Fetcher.make_soup(html_doc)
        # Obtain the set of child URLs to fetch
        fetch_set = Fetcher.children(root, soup)
    return fetch_set
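# A small sketch of how the RSS branch above consumes a feed with
# feedparser. Note that feedparser.parse() rarely raises; parse problems
# are normally reported via the result's `bozo` flag, so the try/except
# in urls2fetch() is a belt-and-braces guard. The feed URL and function
# name here are illustrative only.

import feedparser


def demo_feed_links(feed_url="https://example.com/rss.xml"):
    d = feedparser.parse(feed_url)
    if d.bozo:
        # bozo_exception describes what went wrong during parsing
        logging.warning("Feed problem: {0}".format(d.bozo_exception))
    links = set()
    for entry in d.entries:
        # Entries are not guaranteed to carry a link attribute;
        # FeedParserDict supports dict-style .get() with a default
        link = entry.get("link")
        if link:
            links.add(link)
    return links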