def extract_links_bs(self, url, html):
    '''
    Extract all outlinks from html using beautiful soup.

    Every <a> tag that carries an href attribute is resolved against
    `url`, then passed through URLUtility.validate_link and
    URLUtility.normalize. A link is dropped when joining raises or when
    either helper returns a falsy value. Duplicates are removed.

    Args:
        - url: url of the html source, used to construct absolute url
               from relative url
        - html: html source

    Returns:
        - links: list of unique outlinks, in no particular order; an
                 empty list if the html could not be parsed
    '''
    try:
        soup = BeautifulSoup(html, 'lxml')
    except Exception:
        # Best effort: an unparsable page simply yields no links.
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        print("Parsing with beautiful soup failed")
        return []

    links = set()  # set de-duplicates repeated hrefs
    for tag in soup.findAll('a', href=True):
        href = tag['href']
        try:
            link = urlparse.urljoin(url, href)
        except Exception:
            # Malformed href; skip this anchor and keep going.
            continue

        link = URLUtility.validate_link(link)
        if not link:
            continue

        link = URLUtility.normalize(link)
        if link:
            links.add(link)

    return list(links)