def test_find_all_links():
    """Run find_all_links over every FAL_TESTS case wrapped in filler text.

    Each case's content is sandwiched between a fixed lead-in and trailer.
    The links found must render (without full quoting) to the expected
    strings, and the ``with_text=True`` result must begin with the lead-in
    text and end with the trailer text.
    """
    lead_in = "a little something before, "
    trailer = " a bit of another after."

    for content, expected_links in FAL_TESTS:
        text = "".join((lead_in, content, trailer))

        found = find_all_links(text)
        assert len(found) == len(expected_links)

        # Lengths already match, so one list comparison covers every pair.
        rendered = [link.to_text(full_quote=False) for link in found]
        assert rendered == list(expected_links)

        tokens = find_all_links(text, with_text=True)
        assert tokens[0].startswith(lead_in)
        assert tokens[-1].endswith(trailer)
def links(res: requests.models.Response,
          search: str = None,
          pattern: str = None) -> list:
    """Get the links of the page.

    Absolute links are taken from the raw response text via
    ``find_all_links``; ``<a href>`` values that are not already absolute
    http(s) URLs are resolved against the URL of the page itself.

    Args:
        res (requests.models.Response): The response of the page.
        search (str, optional): Defaults to None. Keep only links that
            contain this substring.
        pattern (str, optional): Defaults to None. Keep only links that
            match this regex pattern anywhere in the link.

    Returns:
        list: The de-duplicated links you want, in no particular order.
    """
    absolute_hrefs = [link.to_text() for link in find_all_links(res.text)]

    anchor_hrefs = Selector(text=res.text).css('a::attr(href)').extract()
    # Only explicit http(s) URLs count as absolute here; a relative href
    # such as "http-guide.html" must still be resolved.
    relative_hrefs = [
        href for href in anchor_hrefs
        if not href.startswith(('http://', 'https://'))
    ]

    # Resolve against the full page URL rather than a hard-coded
    # "https://<host>" root: this keeps the page's real scheme and makes
    # path-relative hrefs ("foo.html", "../bar") land where a browser would.
    hrefs = [
        *absolute_hrefs,
        *[urljoin(res.url, href) for href in relative_hrefs],
    ]

    if search:
        hrefs = [href for href in hrefs if search in href]
    if pattern:
        hrefs = [href for href in hrefs if re.search(pattern, href)]
    return list(set(hrefs))
def test_find_all_links_basic():
    """Expect exactly two links to be found in a sample block of text."""
    message = """hi my name is prince nigeria, please visit my website http://richprince.biz or if that's blocked try https://getprince.ly! Thanks for your attention.bye! PS if those ports are blocked, how about trying https://crownbux.afamilycompany:broken/affiliate PPS if all else fails you can always mailto:[email protected] """

    found = find_all_links(message)
    assert len(found) == 2
def prepare_urls(self, msg_or_text, direct_urls=False):
    """Extract URLs from a message or text and map each one to a source.

    Args:
        msg_or_text: Either a ``Message`` (its URL and text-link entities
            are read) or text that is scanned with ``find_all_links``.
        direct_urls: When true, supported-site URLs are resolved through
            ``get_direct_urls`` instead of being marked ``"http"``.

    Returns:
        dict: URL text mapped to the result of ``get_direct_urls``, the
        string ``"http"``, or the ``status`` of a caught ``URLError``.
        URLs whose lookup raised ``ProcessExecutionError`` are left out.
    """
    if isinstance(msg_or_text, Message):
        urls = []

        plain_urls = msg_or_text.parse_entities(types=[MessageEntity.URL])
        for raw_url in plain_urls.values():
            logger.debug("Entity URL Parsed: %s", raw_url)
            if "://" not in raw_url:
                raw_url = "http://{}".format(raw_url)
            urls.append(URL(raw_url))

        text_links = msg_or_text.parse_entities(
            types=[MessageEntity.TEXT_LINK])
        for entity in text_links:
            link_target = entity.url
            logger.debug("Entity Text Link Parsed: %s", link_target)
            urls.append(URL(link_target))
    else:
        urls = find_all_links(msg_or_text, default_scheme="http")

    urls_dict = {}
    for url in urls:
        url_text = url.to_text(True)
        # Number of non-empty path segments.
        url_parts_num = sum(1 for part in url.path_parts if part)
        try:
            # The checks run in the same order as an ``or`` chain would:
            # the first matching site wins and later ones are not evaluated.
            if (self.SITES["sc"] in url.host
                    and (2 <= url_parts_num <= 3
                         or self.SITES["scapi"] in url_text)
                    and "you" not in url.path_parts):
                # SoundCloud: tracks, sets and widget pages, no /you/ pages
                supported = True
            elif self.SITES["bc"] in url.host and url_parts_num == 2:
                # Bandcamp: tracks and albums
                supported = True
            elif (self.SITES["yt"] in url.host
                    and ("youtu.be" in url.host
                         or "watch" in url.path
                         or "playlist" in url.path)):
                # YouTube: videos and playlists
                supported = True
            else:
                supported = False

            if supported:
                if direct_urls or self.SITES["yt"] in url.host:
                    urls_dict[url_text] = get_direct_urls(url_text)
                else:
                    urls_dict[url_text] = "http"
            elif all(site not in url.host for site in self.SITES.values()):
                # Host matches none of the known sites: try a direct lookup.
                urls_dict[url_text] = get_direct_urls(url_text)
        except ProcessExecutionError:
            logger.debug("youtube-dl get url failed: %s", url_text)
        except URLError as exc:
            urls_dict[url_text] = exc.status
    return urls_dict
def links(res: requests.models.Response,
          search: str = None,
          pattern: str = None) -> list:
    """Get the links of the page.

    Args:
        res (requests.models.Response): The response of the page.
        search (str, optional): Defaults to None. Keep only links that
            contain this substring.
        pattern (str, optional): Defaults to None. Keep only links in
            which this regex pattern matches somewhere.

    Returns:
        list: The de-duplicated links of the page that pass the filters.
    """
    kept = []
    for link in find_all_links(res.text):
        href = link.to_text()
        if search and search not in href:
            continue
        if pattern and not re.search(pattern, href):
            continue
        kept.append(href)
    return list(set(kept))
def prepare_urls(self, msg=None, text=None, get_direct_urls=False):
    """Collect URLs from text or a message and map each one to a source.

    ``text`` takes precedence over ``msg`` when both are given.

    Args:
        msg: Object whose ``parse_entities(types=["url"])`` values are the
            URL strings to use.
        text: Text scanned with ``find_all_links``.
        get_direct_urls: When true, supported-site URLs are resolved via
            ``self.youtube_dl_get_direct_urls`` instead of being marked
            ``"http"``.

    Returns:
        dict or None: URL text mapped to the direct URLs found or to the
        string ``"http"``; ``None`` when neither ``text`` nor ``msg`` is
        given.
    """
    if not (text or msg):
        logger.debug("Text or msg is required")
        return

    if text:
        urls = find_all_links(text, default_scheme="http")
    else:
        urls = [
            URL(raw if "://" in raw else "http://" + raw)
            for raw in msg.parse_entities(types=["url"]).values()
        ]

    urls_dict = {}
    for url in urls:
        url_text = url.to_text(True)
        # Number of non-empty path segments.
        url_parts_num = sum(1 for part in url.path_parts if part)

        # The checks run in the same order as an ``or`` chain would:
        # the first matching site wins and later ones are not evaluated.
        if (self.SITES["sc"] in url.host
                and (2 <= url_parts_num <= 3
                     or self.SITES["scapi"] in url_text)):
            # SoundCloud: tracks, sets and widget pages
            supported = True
        elif self.SITES["bc"] in url.host and url_parts_num == 2:
            # Bandcamp: tracks and albums
            supported = True
        elif (self.SITES["yt"] in url.host
                and ("youtu.be" in url.host
                     or "watch" in url.path
                     or "playlist" in url.path)):
            # YouTube: videos and playlists
            supported = True
        else:
            supported = False

        if supported:
            if not (get_direct_urls or self.SITES["yt"] in url.host):
                urls_dict[url_text] = "http"
                continue
        elif any(site in url.host for site in self.SITES.values()):
            # Known site, but not a page shape handled above: skip it.
            continue

        # Either a supported URL that needs direct links, or a URL from
        # an unknown site.
        direct_urls = self.youtube_dl_get_direct_urls(url_text)
        if direct_urls:
            urls_dict[url_text] = direct_urls

    if not urls_dict:
        logger.info("No supported URLs found")
    return urls_dict
def resolve_links(self, info):
    """Yield each link found in ``self.text``, logging it via ``DBG`` first.

    ``info`` is accepted but not used by this method.
    """
    found_links = urlutils.find_all_links(self.text)
    for found in found_links:
        DBG(f'Found link: {found}')
        yield found