def test_beautifulsoup_constructor_does_lookup(self): with warnings.catch_warnings(record=True) as w: # This will create a warning about not explicitly # specifying a parser, but we'll ignore it. # You can pass in a string. BeautifulSoup("", features="html") # Or a list of strings. BeautifulSoup("", features=["html", "fast"]) # You'll get an exception if BS can't find an appropriate # builder. self.assertRaises(ValueError, BeautifulSoup, "", features="no-such-feature")
def test_formatter_processes_script_tag_for_xml_documents(self): doc = """ <script type="text/javascript"> </script> """ soup = BeautifulSoup(doc, "lxml-xml") # lxml would have stripped this while parsing, but we can add # it later. soup.script.string = 'console.log("< < hey > > ");' encoded = soup.encode() self.assertTrue(b"< < hey > >" in encoded)
def parse_page(self, page_content): try: parsed_html = BeautifulSoup(page_content, "html.parser") # html = list(parsed_html.children)[2] # body = list(html.children)[3] images_tag = parsed_html.find_all('img') for tag in images_tag: url = tag.get('src') if url is not None: self.image_url = self.page_url.split("astropix")[0] + url break except Exception as e: # print(e) pass
def parseAndStoreDoc(self): for e in self.doc.entries: title = jinja2.Markup(e.title).unescape() link = e.link imgLink = e.imgurl soup = BeautifulSoup(e.description) description = jinja2.Markup(soup.getText()).unescape() srcName = self.srcName labelName = self.srcName.split("_")[1] article = Article(title=title, description=description, imgLink=imgLink, link=link, srcName=srcName, labelName=labelName) article.put() logging.info("Storing data from %s" % srcName)
def benchmark_parsers(num_elements=100000): """Very basic head-to-head performance benchmark.""" print "Comparative parser benchmark on Beautiful Soup %s" % __version__ data = rdoc(num_elements) print "Generated a large invalid HTML document (%d bytes)." % len(data) for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: success = False try: a = time.time() soup = BeautifulSoup(data, parser) b = time.time() success = True except Exception, e: print "%s could not parse the markup." % parser traceback.print_exc() if success: print "BS4+%s parsed the markup in %.2fs." % (parser, b - a)
def parse_page(self, page_content): try: parsed_html = BeautifulSoup(page_content, "html.parser") html = list(parsed_html.children)[1] body = list(html.children)[1] # for link in body.find_all('div'): wall_tag = body.find('div', class_='img_cont') style = wall_tag.get('style') if style is not None and "background-image" in style: image_address = style.split("/")[1].split("jpg")[0] self.image_url = self.page_url + image_address + "jpg" # image_name_id = "iotd_title" self.extract_image_name = body.find('a', class_='title').getText() except Exception as e: pass
def parseAndStoreDoc(self): for e in self.doc.entries: title = jinja2.Markup(e.title).unescape() link = e.links[0]["href"] soup = BeautifulSoup(e.description) description = jinja2.Markup(soup.getText()).unescape() #retrieve the img link if not exit use logo instead try: imgLink = e.media_thumbnail[0]["url"] except AttributeError as e: imgLink = BBC.bbcLogo srcName = self.srcName labelName = self.srcName.split("_")[1] article = Article(title=title, description=description, imgLink=imgLink, link=link, srcName=srcName, labelName=labelName) article.put() logging.info("Storing data from %s" % srcName)
def parseAndStoreDoc(self): for e in self.doc.entries: title = jinja2.Markup(e.title).unescape() desc = BeautifulSoup(e.description).findAll("p")[0].getText() description = jinja2.Markup(desc).unescape() #truncate the length if len(description) > 500: description = description[:500] link = e.links[0]["href"] srcName = self.srcName labelName = self.srcName.split("_")[1] #img from feed imgLink = WashingtonPost.logoSrc article = Article(title=title, description=description, imgLink=imgLink, link=link, srcName=srcName, labelName=labelName) article.put() logging.info("storing data from %s" % srcName)
def __init__(self, namespaceHTMLElements, soup=None, store_line_numbers=True, **kwargs): if soup: self.soup = soup else: from lib.bs4 import BeautifulSoup # TODO: Why is the parser 'html.parser' here? To avoid an # infinite loop? self.soup = BeautifulSoup("", "html.parser", store_line_numbers=store_line_numbers, **kwargs) # TODO: What are **kwargs exactly? Should they be passed in # here in addition to/instead of being passed to the BeautifulSoup # constructor? super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) # This will be set later to an html5lib.html5parser.HTMLParser # object, which we can use to track the current line number. self.parser = None self.store_line_numbers = store_line_numbers
def GEN(book=None, prov=None, test=False): errmsg = '' provider = "libgen.io" if prov is None: prov = 'GEN' host = lazylibrarian.CONFIG[prov + '_HOST'] if not host.startswith('http'): host = 'http://' + host search = lazylibrarian.CONFIG[prov + '_SEARCH'] if not search or not search.endswith('.php'): search = 'search.php' if 'index.php' not in search and 'search.php' not in search: search = 'search.php' if search[0] == '/': search = search[1:] sterm = makeUnicode(book['searchterm']) page = 1 results = [] next_page = True while next_page: if 'index.php' in search: params = { "s": book['searchterm'], "f_lang": "All", "f_columns": 0, "f_ext": "All" } else: params = { "view": "simple", "open": 0, "phrase": 0, "column": "def", "res": 100, "req": book['searchterm'] } if page > 1: params['page'] = page providerurl = url_fix(host + "/%s" % search) searchURL = providerurl + "?%s" % urlencode(params) next_page = False result, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in result: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True elif '111' in result: # looks like libgen has ip based access limits logger.error( 'Access forbidden. Please wait a while before trying %s again.' % provider) errmsg = result else: logger.debug(searchURL) logger.debug('Error fetching page data from %s: %s' % (provider, result)) errmsg = result result = False if test: return success if result: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) try: soup = BeautifulSoup(result, 'html5lib') try: table = soup.find_all('table')[2] # un-named table if table: rows = table.find_all('tr') except IndexError: # no results table in result page rows = [] if 'search.php' in search and len(rows) > 1: rows = rows[1:] for row in rows: author = '' title = '' size = '' extn = '' link = '' td = row.find_all('td') if 'index.php' in search and len(td) > 3: try: author = formatAuthorName(td[0].text) title = td[2].text newsoup = BeautifulSoup(str(td[4]), 'html5lib') data = newsoup.find('a') link = data.get('href') extn = data.text.split('(')[0] size = data.text.split('(')[1].split(')')[0] size = size.upper() except IndexError as e: logger.debug( 'Error parsing libgen index.php results: %s' % str(e)) elif 'search.php' in search and len(td) > 8: try: author = formatAuthorName(td[1].text) title = td[2].text size = td[7].text.upper() extn = td[8].text newsoup = BeautifulSoup(str(td[2]), 'html5lib') link = newsoup.get('href') except IndexError as e: logger.debug( 'Error parsing libgen search.php results; %s' % str(e)) if not size: size = 0 else: try: mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 if link and title: if author: title = author.strip() + ' ' + title.strip() if extn: title = title + '.' + extn if not link.startswith('http'): if "/ads.php?" in link: url = url_fix(host + link) else: url = url_fix(host + "/ads.php?" + link) else: url = redirect_url(host, link) bookresult, success = fetchURL(url) if not success: # may return 404 if no results, not really an error if '404' in bookresult: logger.debug( "No results found from %s for %s" % (provider, sterm)) else: logger.debug(url) logger.debug( 'Error fetching link data from %s: %s' % (provider, bookresult)) errmsg = bookresult bookresult = False if bookresult: url = None try: new_soup = BeautifulSoup( bookresult, 'html5lib') for link in new_soup.find_all('a'): output = link.get('href') if output: if output.startswith( 'http' ) and '/get.php' in output: url = output break elif '/get.php' in output: url = '/get.php' + output.split( '/get.php')[1] break elif '/download/book' in output: url = '/download/book' + output.split( '/download/book')[1] break if url and not url.startswith('http'): url = url_fix(host + url) else: url = redirect_url(host, url) except Exception as e: logger.error( '%s parsing bookresult for %s: %s' % (type(e).__name__, link, str(e))) url = None if url: results.append({ 'bookid': book['bookid'], 'tor_prov': provider + '/' + search, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': 'direct', 'priority': lazylibrarian.CONFIG[prov + '_DLPRIORITY'] }) logger.debug('Found %s, Size %s' % (title, size)) next_page = True except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) page += 1 if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page: logger.warn( 'Maximum results page search reached, still more results available' ) next_page = False logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
def diagnose(data): """Diagnostic suite for isolating common problems. :param data: A string containing markup that needs to be explained. :return: None; diagnostics are printed to standard output. """ print(("Diagnostic running on Beautiful Soup %s" % __version__)) print(("Python version %s" % sys.version)) basic_parsers = ["html.parser", "html5lib", "lxml"] for name in basic_parsers: for builder in builder_registry.builders: if name in builder.features: break else: basic_parsers.remove(name) print(( "I noticed that %s is not installed. Installing it may help." % name)) if 'lxml' in basic_parsers: basic_parsers.append("lxml-xml") try: from lxml import etree print(("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))) except ImportError as e: print("lxml is not installed or couldn't be imported.") if 'html5lib' in basic_parsers: try: import html5lib print(("Found html5lib version %s" % html5lib.__version__)) except ImportError as e: print("html5lib is not installed or couldn't be imported.") if hasattr(data, 'read'): data = data.read() elif data.startswith("http:") or data.startswith("https:"): print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)) print( "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." ) return else: try: if os.path.exists(data): print(( '"%s" looks like a filename. Reading data from the file.' % data)) with open(data) as fp: data = fp.read() except ValueError: # This can happen on some platforms when the 'filename' is # too long. Assume it's data and not a filename. pass print("") for parser in basic_parsers: print(("Trying to parse your markup with %s" % parser)) success = False try: soup = BeautifulSoup(data, features=parser) success = True except Exception as e: print(("%s could not parse the markup." % parser)) traceback.print_exc() if success: print(("Here's what %s did with the markup:" % parser)) print((soup.prettify())) print(("-" * 80))
def TPB(book=None, test=False): errmsg = '' provider = "TPB" host = lazylibrarian.CONFIG['TPB_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host + "/s/?") cat = 0 # 601=ebooks, 102=audiobooks, 0=all, no mag category if 'library' in book: if book['library'] == 'AudioBook': cat = 102 elif book['library'] == 'eBook': cat = 601 elif book['library'] == 'magazine': cat = 0 sterm = makeUnicode(book['searchterm']) page = 0 results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 next_page = True while next_page: params = { "q": book['searchterm'], "category": cat, "page": page, "orderby": "99" } searchURL = providerurl + "?%s" % urlencode(params) next_page = False result, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in result: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) errmsg = result result = False if test: return success if result: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) soup = BeautifulSoup(result, 'html5lib') # tpb uses a named table table = soup.find('table', id='searchResult') if table: rows = table.find_all('tr') else: rows = [] if len(rows) > 1: rows = rows[1:] # first row is headers for row in rows: td = row.find_all('td') if len(td) > 2: try: new_soup = BeautifulSoup(str(td[1]), 'html5lib') link = new_soup.find("a") magnet = link.get("href") title = link.text size = td[1].text.split(', Size ')[1].split('iB')[0] size = size.replace(' ', '') mult = 1 try: if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(td[2].text) except ValueError: seeders = 0 if minimumseeders < int(seeders): # no point in asking for magnet link if not enough seeders magurl = '%s/%s' % (host, magnet) result, success = fetchURL(magurl) if not success: logger.debug('Error fetching url %s, %s' % (magurl, result)) else: magnet = None new_soup = BeautifulSoup(result, 'html5lib') for link in new_soup.find_all('a'): output = link.get('href') if output and output.startswith('magnet'): magnet = output break if not magnet or not title: logger.debug('Missing magnet or title') else: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': magnet, 'tor_size': str(size), 'tor_type': 'magnet', 'priority': lazylibrarian.CONFIG['TPB_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) next_page = True else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) page += 1 if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page: logger.warn( 'Maximum results page search reached, still more results available' ) next_page = False logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
def soup(self, markup, **kwargs): """Build a Beautiful Soup object from markup.""" builder = kwargs.pop('builder', self.default_builder) return BeautifulSoup(markup, builder=builder, **kwargs)
def test_no_warning_if_explicit_parser_specified(self): with warnings.catch_warnings(record=True) as w: soup = BeautifulSoup("<a><b></b></a>", "html.parser") self.assertEqual([], w)
def test_warning_if_parser_specified_too_vague(self): with warnings.catch_warnings(record=True) as w: soup = BeautifulSoup("<a><b></b></a>", "html") self._assert_no_parser_specified(w)
def KAT(book=None, test=False): errmsg = '' provider = "KAT" host = lazylibrarian.CONFIG['KAT_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host + "/usearch/" + quote(book['searchterm'])) params = {"category": "books", "field": "seeders", "sorder": "desc"} searchURL = providerurl + "/?%s" % urlencode(params) sterm = makeUnicode(book['searchterm']) result, success = fetchURL(searchURL) if not success: # seems KAT returns 404 if no results, not really an error if '404' in result: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) errmsg = result result = False if test: return success results = [] if result: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 soup = BeautifulSoup(result, 'html5lib') rows = [] try: table = soup.find_all('table')[1] # un-named table if table: rows = table.find_all('tr') except IndexError: # no results table in result page rows = [] if len(rows) > 1: rows = rows[1:] # first row is headers for row in rows: td = row.find_all('td') if len(td) > 3: try: title = unaccented(td[0].text) # kat can return magnet or torrent or both. magnet = '' url = '' mode = 'torrent' try: magnet = 'magnet' + str( td[0]).split('href="magnet')[1].split('"')[0] mode = 'magnet' except IndexError: pass try: url = 'http' + str(td[0]).split('href="http')[1].split( '.torrent?')[0] + '.torrent' mode = 'torrent' except IndexError: pass if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']): url = magnet mode = 'magnet' try: size = str(td[1].text).replace(' ', '').upper() mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(td[3].text) except ValueError: seeders = 0 if not url or not title: logger.debug('Missing url or title') elif minimumseeders < int(seeders): results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': mode, 'priority': lazylibrarian.CONFIG['KAT_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
def WWT(book=None, test=False): errmsg = '' provider = "WorldWideTorrents" host = lazylibrarian.CONFIG['WWT_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host + "/torrents-search.php") sterm = makeUnicode(book['searchterm']) cat = 0 # 0=all, 36=ebooks, 52=mags, 56=audiobooks if 'library' in book: if book['library'] == 'AudioBook': cat = 56 elif book['library'] == 'eBook': cat = 36 elif book['library'] == 'magazine': cat = 52 page = 0 results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 next_page = True while next_page: params = {"search": book['searchterm'], "page": page, "cat": cat} searchURL = providerurl + "/?%s" % urlencode(params) next_page = False result, success = fetchURL(searchURL) if not success: # might return 404 if no results, not really an error if '404' in result: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) errmsg = result result = False if test: return success if result: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) soup = BeautifulSoup(result, 'html5lib') try: tables = soup.find_all('table') # un-named table table = tables[2] if table: rows = table.find_all('tr') except IndexError: # no results table in result page rows = [] if len(rows) > 1: rows = rows[1:] # first row is headers for row in rows: td = row.find_all('td') if len(td) > 3: try: title = unaccented(td[0].text) # can return magnet or torrent or both. magnet = '' url = '' mode = 'torrent' try: magnet = 'magnet' + str( td[0]).split('href="magnet')[1].split('"')[0] mode = 'magnet' except IndexError: pass try: url = url_fix(host + '/download.php') + \ str(td[0]).split('href="download.php')[1].split('.torrent"')[0] + '.torrent' mode = 'torrent' except IndexError: pass if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']): url = magnet mode = 'magnet' try: size = str(td[1].text).replace(' ', '').upper() mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(td[2].text) except ValueError: seeders = 0 if not url or not title: logger.debug('Missing url or title') elif minimumseeders < int(seeders): results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': mode, 'priority': lazylibrarian.CONFIG['WWT_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) next_page = True else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) page += 1 if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page: logger.warn( 'Maximum results page search reached, still more results available' ) next_page = False logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
data = data.read() elif os.path.exists(data): print '"%s" looks like a filename. Reading data from the file.' % data with open(data) as fp: data = fp.read() elif data.startswith("http:") or data.startswith("https:"): print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." return print for parser in basic_parsers: print "Trying to parse your markup with %s" % parser success = False try: soup = BeautifulSoup(data, parser) success = True except Exception, e: print "%s could not parse the markup." % parser traceback.print_exc() if success: print "Here's what %s did with the markup:" % parser print soup.prettify() print "-" * 80 def lxml_trace(data, html=True, **kwargs): """Print out the lxml events that occur during parsing. This lets you see how lxml parses a document when no Beautiful
def TDL(book=None, test=False): errmsg = '' provider = "torrentdownloads" host = lazylibrarian.CONFIG['TDL_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host) params = {"type": "search", "cid": "2", "search": book['searchterm']} searchURL = providerurl + "/rss.xml?%s" % urlencode(params) sterm = makeUnicode(book['searchterm']) data, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in data: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, data)) errmsg = data data = False if test: return success results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 if data: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) d = feedparser.parse(data) if len(d.entries): for item in d.entries: try: title = item['title'] seeders = int(item['seeders']) link = item['link'] size = int(item['size']) url = None if link and minimumseeders < int(seeders): # no point requesting the magnet link if not enough seeders # TDL gives us a relative link result, success = fetchURL(providerurl + link) if success: new_soup = BeautifulSoup(result, 'html5lib') for link in new_soup.find_all('a'): output = link.get('href') if output and output.startswith('magnet'): url = output break if not url or not title: logger.debug('Missing url or title') else: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': 'magnet', 'priority': lazylibrarian.CONFIG['TDL_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
def fragmentClass(self): from lib.bs4 import BeautifulSoup self.soup = BeautifulSoup("", "html.parser") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None)