Example #1
 def test_find_all_by_tag_strainer(self):
     self.assertSelects(
         self.tree.find_all(SoupStrainer('a')),
         ['First tag.', 'Nested tag.'])
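assertSelects and self.tree above are helpers from the Beautiful Soup test harness; the assertion checks the text of the matched tags. A minimal standalone sketch of the same lookup (the markup below is an illustrative assumption, not the harness's tree):

from bs4 import BeautifulSoup, SoupStrainer

soup = BeautifulSoup('<a>First tag.</a><b><a>Nested tag.</a></b>', 'html.parser')
only_a_tags = SoupStrainer('a')
# find_all() accepts a SoupStrainer in place of a tag name
print([tag.get_text() for tag in soup.find_all(only_a_tags)])
# expected: ['First tag.', 'Nested tag.']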
Example #2
 def test_soupstrainer(self):
     """Parsers should be able to work with SoupStrainers."""
     strainer = SoupStrainer("b")
     soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                      parse_only=strainer)
     self.assertEqual(soup.decode(), "<b>bold</b>")
Example #3
 def test_parse_with_soupstrainer(self):
     markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
     strainer = SoupStrainer("b")
     soup = self.soup(markup, parse_only=strainer)
     self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
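A rough equivalent outside the test harness (the html.parser tree builder is an assumption here; it supports parse_only, and output can vary slightly between builders):

from bs4 import BeautifulSoup, SoupStrainer

markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
# parse_only keeps only the tags matched by the strainer, plus their descendants
soup = BeautifulSoup(markup, "html.parser", parse_only=SoupStrainer("b"))
print(soup.decode())
# expected: <b>Yes</b><b>Yes <c>Yes</c></b>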
Example #4
 def test_soupstrainer(self):
     # The html5lib tree builder does not support SoupStrainers.
     strainer = SoupStrainer("b")
     markup = "<p>A <b>bold</b> statement.</p>"
     soup = self.soup(markup, parse_only=strainer)
     self.assertEqual(soup.decode(), self.document_for(markup))
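The comment above is the point of this test: html5lib always builds a complete document and ignores parse_only, so the assertion compares against the full document that self.document_for(markup) produces. A hedged standalone sketch (assumes html5lib is installed):

from bs4 import BeautifulSoup, SoupStrainer

soup = BeautifulSoup("<p>A <b>bold</b> statement.</p>", "html5lib",
                     parse_only=SoupStrainer("b"))
# the strainer is ignored: the whole <html><head></head><body>...</body></html>
# document comes back despite parse_only
print(soup.decode())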
Example #5
 def test_soupstrainer(self):
     strainer = SoupStrainer("b")
     soup = self.soup("A <b>bold</b> <meta /> <i>statement</i>",
                      parse_only=strainer)
     self.assertEqual(soup.decode(), "<b>bold</b>")
Example #6
import click
import regex
import requests
from bs4 import BeautifulSoup, SoupStrainer
from requests.exceptions import HTTPError, Timeout

# conv_rel_abs_addr(url, link) is assumed to be defined elsewhere in this
# module; it converts a relative href/src into an absolute URL.


def soupify_links(url, file_extension=None):
    """
    Return a list of URLs linked from the <a> and <img> tags on a page.

    Args:
        url (str): the target URL
        file_extension (str, optional): if given, only return links whose
            href/src contains this file extension

    Returns:
        list[str]: absolute URLs taken from <a> href and <img> src attributes,
        or False if the request fails
    """

    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
    }

    with requests.session() as session:

        try:
            # run a GET request on the supplied URL
            r = session.get(url, headers=headers, stream=True, timeout=1)
            r.raise_for_status()
        except HTTPError as http_err:
            click.secho(f"\nHTTP error occurred: {http_err}\n",
                        fg="red",
                        bold=False)
            return False
        except Timeout as timeout_err:
            click.secho(f"\nRequest timed out: {timeout_err}\n",
                        fg="red",
                        bold=False)
            return False
        except Exception as err:
            click.secho(f"\nOther error occurred: {err}\n",
                        fg="red",
                        bold=False)
            return False
        else:
            # no errors... continue
            # parse just the <img> and <a> tags
            soup_a = BeautifulSoup(r.content,
                                   "lxml",
                                   parse_only=SoupStrainer("a"))
            soup_img = BeautifulSoup(r.content,
                                     "lxml",
                                     parse_only=SoupStrainer("img"))

    # build the list of hrefs
    hrefs = []

    if file_extension is not None:
        print(f"Getting links for {file_extension} files...")
        # Looking for a specific file_extension
        for img_link in soup_img(src=regex.compile(rf"\.{file_extension}")):
            if img_link.get("src") is not None:
                hrefs.append(conv_rel_abs_addr(url, img_link.get("src")))
        for a_link in soup_a(href=regex.compile(rf"\.{file_extension}")):
            if a_link.get("href") is not None:
                hrefs.append(conv_rel_abs_addr(url, a_link.get("href")))
    else:
        print("Getting links...")
        for img_link in soup_img.find_all("img"):
            if img_link.get("src") is not None:
                hrefs.append(conv_rel_abs_addr(url, img_link.get("src")))
        for a_link in soup_a.find_all("a"):
            if a_link.get("href") is not None:
                hrefs.append(conv_rel_abs_addr(url, a_link.get("href")))

    return hrefs
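
A quick usage sketch (the URL and extension are hypothetical; it assumes the imports above and the conv_rel_abs_addr() helper are available):

if __name__ == "__main__":
    # collect every .pdf link from a page and print the absolute URLs
    links = soupify_links("https://example.com/downloads", "pdf")
    if links:
        for link in links:
            print(link)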