Example #1
 def test_find_all_by_tag_strainer(self):
     self.assertSelects(
         self.tree.find_all(SoupStrainer('a')),
         ['First tag.', 'Nested tag.'])
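assertSelects and self.tree above are helpers from the Beautiful Soup test harness; the assertion checks the text of the matched tags. A minimal standalone sketch of the same lookup (the markup below is an illustrative assumption, not the harness's tree):

from bs4 import BeautifulSoup, SoupStrainer

soup = BeautifulSoup('<a>First tag.</a><b><a>Nested tag.</a></b>', 'html.parser')
only_a_tags = SoupStrainer('a')
# find_all() accepts a SoupStrainer in place of a tag name
print([tag.get_text() for tag in soup.find_all(only_a_tags)])
# expected: ['First tag.', 'Nested tag.']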
Example #2
 def test_soupstrainer(self):
     """Parsers should be able to work with SoupStrainers."""
     strainer = SoupStrainer("b")
     soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                      parse_only=strainer)
     self.assertEqual(soup.decode(), "<b>bold</b>")
Example #3
 def test_parse_with_soupstrainer(self):
     markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
     strainer = SoupStrainer("b")
     soup = self.soup(markup, parse_only=strainer)
     self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
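A rough equivalent outside the test harness (the html.parser tree builder is an assumption here; it supports parse_only, and output can vary slightly between builders):

from bs4 import BeautifulSoup, SoupStrainer

markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
# parse_only keeps only the tags matched by the strainer, plus their descendants
soup = BeautifulSoup(markup, "html.parser", parse_only=SoupStrainer("b"))
print(soup.decode())
# expected: <b>Yes</b><b>Yes <c>Yes</c></b>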
Example #4
 def test_soupstrainer(self):
     # The html5lib tree builder does not support SoupStrainers.
     strainer = SoupStrainer("b")
     markup = "<p>A <b>bold</b> statement.</p>"
     soup = self.soup(markup, parse_only=strainer)
     self.assertEqual(soup.decode(), self.document_for(markup))
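The comment above is the point of this test: html5lib always builds a complete document and ignores parse_only, so the assertion compares against the full document that self.document_for(markup) produces. A hedged standalone sketch (assumes html5lib is installed):

from bs4 import BeautifulSoup, SoupStrainer

soup = BeautifulSoup("<p>A <b>bold</b> statement.</p>", "html5lib",
                     parse_only=SoupStrainer("b"))
# the strainer is ignored: the whole <html><head></head><body>...</body></html>
# document comes back despite parse_only
print(soup.decode())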
Example #5
 def test_soupstrainer(self):
     strainer = SoupStrainer("b")
     soup = self.soup("A <b>bold</b> <meta /> <i>statement</i>",
                      parse_only=strainer)
     self.assertEqual(soup.decode(), "<b>bold</b>")
Example #6
import click
import regex
import requests
from bs4 import BeautifulSoup, SoupStrainer
from requests.exceptions import HTTPError, Timeout

# conv_rel_abs_addr(url, link) is assumed to be defined elsewhere in this
# module; it converts a relative href/src into an absolute URL.


def soupify_links(url, file_extension=None):
    """
    Return a list of URLs linked from the <a> and <img> tags on a page.

    Args:
        url (str): the target URL
        file_extension (str, optional): if given, only return links whose
            href/src contains this file extension

    Returns:
        list[str]: absolute URLs taken from <a> href and <img> src attributes,
        or False if the request fails
    """

    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
    }

    with requests.session() as session:

        try:
            # run a GET request on the supplied URL
            r = session.get(url, headers=headers, stream=True, timeout=1)
            r.raise_for_status()
        except HTTPError as http_err:
            click.secho(f"\nHTTP error occurred: {http_err}\n",
                        fg="red",
                        bold=False)
            return False
        except Timeout as timeout_err:
            click.secho(f"\nRequest timed out: {timeout_err}\n",
                        fg="red",
                        bold=False)
            return False
        except Exception as err:
            click.secho(f"\nOther error occurred: {err}\n",
                        fg="red",
                        bold=False)
            return False
        else:
            # no errors... continue
            # parse just the <img> and <a> tags
            soup_a = BeautifulSoup(r.content,
                                   "lxml",
                                   parse_only=SoupStrainer("a"))
            soup_img = BeautifulSoup(r.content,
                                     "lxml",
                                     parse_only=SoupStrainer("img"))

    # build the list of hrefs
    hrefs = []

    if file_extension is not None:
        print(f"Getting links for {file_extension} files...")
        # Looking for a specific file_extension
        for img_link in soup_img(src=regex.compile(rf"\.{file_extension}")):
            if img_link.get("src") is not None:
                hrefs.append(conv_rel_abs_addr(url, img_link.get("src")))
        for a_link in soup_a(href=regex.compile(rf"\.{file_extension}")):
            if a_link.get("href") is not None:
                hrefs.append(conv_rel_abs_addr(url, a_link.get("href")))
    else:
        print("Getting links...")
        for img_link in soup_img.find_all("img"):
            if img_link.get("src") is not None:
                hrefs.append(conv_rel_abs_addr(url, img_link.get("src")))
        for a_link in soup_a.find_all("a"):
            if a_link.get("href") is not None:
                hrefs.append(conv_rel_abs_addr(url, a_link.get("href")))

    return hrefs
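
A quick usage sketch (the URL and extension are hypothetical; it assumes the imports above and the conv_rel_abs_addr() helper are available):

if __name__ == "__main__":
    # collect every .pdf link from a page and print the absolute URLs
    links = soupify_links("https://example.com/downloads", "pdf")
    if links:
        for link in links:
            print(link)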