def parse_urls(self, html):
    """Produce a list of unvisited URLs found in the given html.

    :type html: str
    :rtype: list
    """
    soup = BeautifulSoup(html, "html.parser")
    urls = []

    # Section headings are (presumably) only on the main page; only
    # headings that actually wrap an anchor contribute a URL.
    for element in soup.find_all("h2", {"class": "section-heading"}):
        if element.a:
            self._collect_if_new(element.a.get("href"), urls)

    # Story links appear on the main page and as relevant articles.
    for element in soup.find_all("a", {"class": "story-link"}):
        self._collect_if_new(element.get("href"), urls)

    return urls

def _collect_if_new(self, url, urls):
    """Append the cleaned form of *url* to *urls* if not yet visited.

    NOTE(review): the visited check uses the raw href while the stored
    value is cleaned — confirm self.visited_urls holds raw hrefs.
    Also, an anchor without an href yields None here; presumably
    Utility.clean_url tolerates that — verify.
    """
    if url not in self.visited_urls:
        urls.append(Utility.clean_url(url))
def parse_urls(self, html):
    """Append new URLs present in the given html to the URL queue.

    :type html: str
    """
    soup = BeautifulSoup(html, "html.parser")

    # Section headings are (presumably) only on the main page; only
    # headings that actually wrap an anchor contribute a URL.
    for element in soup.find_all("h2", {"class": "section-heading"}):
        if element.a:
            self._enqueue_if_new(element.a.get("href"))

    # Story links appear on the main page and as relevant articles.
    for element in soup.find_all("a", {"class": "story-link"}):
        self._enqueue_if_new(element.get("href"))

def _enqueue_if_new(self, url):
    """Enqueue the cleaned form of *url* if it was not yet visited.

    NOTE(review): the visited check uses the raw href while the queued
    value is cleaned — confirm self.visited_urls holds raw hrefs.
    Also, an anchor without an href yields None here; presumably
    Utility.clean_url tolerates that — verify.
    """
    if url not in self.visited_urls:
        self.url_queue.append(Utility.clean_url(url))