def get_social_media_links(self, link, DEFAULT_SM, default_dict):
    """Collect social-media URLs found on *link* into *default_dict*.

    - link: URL of the page to scan.
    - DEFAULT_SM: iterable of social-network name substrings
      (e.g. 'twitter', 'facebook') to look for inside each extracted URL.
    - default_dict: dict mapping network name -> list of matching URLs;
      it is mutated in place and also returned.
    """
    # NOTE(review): verify=False disables TLS certificate validation,
    # exposing the request to man-in-the-middle attacks — confirm this
    # is intentional before keeping it.
    res = requests.get(link, verify=False, timeout=30)
    tree = parse_html_bytes(res.content, res.headers.get('content-type'))
    # Iterate the extractor lazily — no need to materialize a list first.
    for candidate in find_links_tree(tree):
        for sm in DEFAULT_SM:
            if sm in candidate:
                # setdefault guards against a network name missing from
                # default_dict (the original `+= [i]` raised KeyError),
                # and append avoids building a throwaway one-item list.
                default_dict.setdefault(sm, []).append(candidate)
    return default_dict
def find_social_links(url) -> set:
    """Return the set of social media links found on a webpage.

    - url: The url of the webpage to search (String)
    """
    page = requests.get(url)
    parsed = parse_html_bytes(page.content, page.headers.get('content-type'))
    return {link for link in find_links_tree(parsed)}
def test_href():
    """All five href / data-href attributes should be extracted."""
    markup = """
    <a href="http://feeds.feedburner.com/TnsGlobalPressReleases">
    <fb:like href="http://www.facebook.com/elDiarioEs">
    <a class="twitter-follow-button" href="https://twitter.com/NASA">
    <a class="github-button" href="https://github.com/igrigorik/githubarchive.org" data-count-href="/igrigorik/githubarchive.org/stargazers">
    <div class="fb-page" data-href="https://www.facebook.com/facebook" data-tabs="timeline" data-small-header="false">
    """
    tree = etree.HTML(markup)
    found = list(find_links_tree(tree))
    assert len(found) == 5, tree
def get_social_media(url):
    """Return a dict mapping social-network names to one matching link on *url*.

    Fetches the page, extracts all links, and records a link for each
    network whose name appears as a substring of a link. If several links
    mention the same network, an arbitrary one is kept (the links are held
    in a set, whose iteration order is unspecified) — same as the original.
    """
    media = [
        'facebook', 'linkedin', 'twitter', 'youtube', 'github',
        'google plus', 'pinterest', 'instagram', 'snapchat', 'flipboard',
        'flickr', 'weibo', 'periscope', 'telegram', 'soundcloud',
        'feedburner', 'vimeo', 'slideshare', 'vkontakte', 'xing',
    ]
    # timeout prevents the request from hanging forever (requests has no
    # default timeout); 30s matches get_social_media_links elsewhere in
    # this codebase.
    res = requests.get(url, timeout=30)
    tree = parse_html_bytes(res.content, res.headers.get('content-type'))
    links = set(find_links_tree(tree))
    social = {}
    # Iterate the names directly instead of indexing via range(len(media)).
    for name in media:
        for link in links:
            if name in link:
                social[name] = link
    return social
def test_broken_href():
    """An <a> tag with a valueless href must yield no links."""
    tree = etree.HTML(""" <a href> """)
    extracted = list(find_links_tree(tree))
    assert len(extracted) == 0, tree
def test_twitter():
    """twitter:site and twitter:creator meta tags each count as a link."""
    markup = """
    <meta name="twitter:site" content="@fluquid_ds">
    <meta name="twitter:creator" content="@fluquid_ds">
    """
    tree = etree.HTML(markup)
    results = list(find_links_tree(tree))
    assert len(results) == 2, tree
import requests
from html_to_etree import parse_html_bytes
from extract_social_media import find_links_tree


def main():
    """Fetch a page and print the set of social-media links found on it."""
    # timeout keeps the script from hanging indefinitely on a stalled
    # connection (requests has no default timeout).
    res = requests.get('https://github.com/HarshCasper/Rotten-Scripts',
                       timeout=30)
    tree = parse_html_bytes(res.content, res.headers.get('content-type'))
    links = set(find_links_tree(tree))
    print(links)


if __name__ == "__main__":
    # Guarded entry point: the original issued the network request at
    # import time, which would fire on any `import` of this module.
    main()