コード例 #1
0
ファイル: __init__.py プロジェクト: chclxds/thg-framework-1
def _from_url(url):  # pragma: no cover
    """Fetch *url* and return the set of social-media links/handles on it."""
    import requests
    from html_to_etree import parse_html_bytes

    response = requests.get(url)
    content_type = response.headers.get('content-type')
    document = parse_html_bytes(response.content, content_type)
    return set(find_links_tree(document))
コード例 #2
0
 def get_social_media_links(self, link, DEFAULT_SM, default_dict):
     """Fetch *link* and group the social-media links found on the page.

     Each discovered link is appended to default_dict[platform] for every
     platform name in DEFAULT_SM that occurs as a substring of the link.
     Returns the (mutated) default_dict.

     NOTE(review): verify=False disables TLS certificate verification —
     confirm this is intentional.
     """
     response = requests.get(link, verify=False, timeout=30)
     tree = parse_html_bytes(response.content,
                             response.headers.get('content-type'))
     found_links = list(find_links_tree(tree))
     for found in found_links:
         for platform in DEFAULT_SM:
             if platform in found:
                 default_dict[platform] += [found]
     return default_dict
コード例 #3
0
def find_social_links(url) -> set:
    """Return the set of social-media links present on a webpage.

    - url: The url of the webpage to search (String)
    """
    page = requests.get(url)
    ctype = page.headers.get('content-type')
    parsed = parse_html_bytes(page.content, ctype)
    return set(find_links_tree(parsed))
コード例 #4
0
    def extract_social_media_from_response(self, content, header):
        """Map each entry of self.metas to a matching social-media link.

        Parameters:
            content: raw HTML bytes of the response body.
            header: mapping holding the response headers (reads
                'content-type').

        Returns a dict {meta: link}. When several links match the same
        meta, the last one encountered in the (unordered) link set wins —
        same behavior as before.
        """
        tree = parse_html_bytes(content, header.get('content-type'))
        # Fix: the link extraction is loop-invariant; the original rebuilt
        # list(set(find_links_tree(tree))) on every iteration of self.metas.
        links = list(set(find_links_tree(tree)))
        result = {}

        for m in self.metas:
            for link in links:
                if m in link:
                    result[m] = link
        return result
コード例 #5
0
ファイル: company.py プロジェクト: DevTotti/Company-Details
def get_social_media(url):
    """Fetch *url* and return {platform_name: link} for known platforms.

    A platform matches a link when its name occurs as a substring of the
    link. When several links match the same platform, the last one seen
    in the (unordered) link set wins — same behavior as before.
    """
    media = [
        'facebook', 'linkedin', 'twitter', 'youtube', 'github', 'google plus',
        'pinterest', 'instagram', 'snapchat', 'flipboard', 'flickr', 'weibo',
        'periscope', 'telegram', 'soundcloud', 'feedburner', 'vimeo',
        'slideshare', 'vkontakte', 'xing'
    ]
    res = requests.get(url)
    tree = parse_html_bytes(res.content, res.headers.get('content-type'))
    links = set(find_links_tree(tree))
    social = {}
    # Fix: iterate the platform names directly instead of the
    # `for i in range(len(media))` index anti-idiom.
    for name in media:
        for link in links:
            if name in link:
                social[name] = link
    return social
コード例 #6
0
def test_tree():
    """Every recorded page's parsed text must contain its expected snippet."""
    for record in get_data():
        meta = record['meta']
        url = meta['url']
        contains = meta['contains']
        webdata = record['webdata']
        body = webdata['byte_body']

        # Pages whose cleaned HTML is tiny are not worth asserting on.
        if len(clean_html(body)) < 100:
            logging.warning('skipping %s', url)
            continue

        tree = parse_html_bytes(body=body,
                                content_type=webdata['content-type'])
        text = extract_text(tree)
        assert contains in text, (url, contains,
                                  etree.tostring(tree, encoding='utf-8'))
コード例 #7
0
 def _request_html(self, url):
     """GET *url* and return its body parsed into an etree.

     No content-type is passed to parse_html_bytes, so it falls back to
     its own charset handling.
     """
     # Fix: removed a leftover debug print(html) that dumped the entire
     # raw response body to stdout on every call.
     html = requests.get(url).content
     return parse_html_bytes(html)
コード例 #8
0
ファイル: __init__.py プロジェクト: guoyu07/email-audit
def audit_html_bytes(body, content_type=''):
    """Audit an HTML document given its raw bytes and header content-type.

    Parses the bytes into an etree and delegates to audit_etree.
    """
    logging.debug('parse_html_bytes')
    return audit_etree(parse_html_bytes(body, content_type))
コード例 #9
0
import requests
from html_to_etree import parse_html_bytes
from extract_social_media import find_links_tree

# Fetch the page, parse it, and print the set of social-media links found.
response = requests.get('https://github.com/HarshCasper/Rotten-Scripts')
content_type = response.headers.get('content-type')
parsed = parse_html_bytes(response.content, content_type)

links = set(find_links_tree(parsed))
print(links)