Esempio n. 1
0
def resolve(web_url):
    '''
    Turn the address of a web page into a direct media stream URL.

    Typical usage::

        import urlresolver
        media_url = urlresolver.resolve(web_url)

    Here ``web_url`` is the page associated with a media file and
    ``media_url`` is the direct URL of that media.

    This function only wraps :class:`HostedMediaFile`: it builds one from
    ``web_url`` and hands back whatever its ``resolve()`` method returns.
    Behind that call, :mod:`urlresolver` asks each available resolver plugin,
    in priority order (lowest priority number first), whether it accepts
    ``web_url``; the first plugin willing to handle it performs the
    resolution.

    .. seealso::

        :class:`HostedMediaFile`

    Args:
        web_url (str): URL of a web page associated with a piece of media
            content.

    Returns:
        A string holding the direct URL to the media file when ``web_url``
        could be resolved, otherwise ``False``.
    '''
    return HostedMediaFile(url=web_url).resolve()
Esempio n. 2
0
def resolve(web_url):
    '''
    Look up the direct media URL behind a web page.

    Calling it is normally all that is needed::

        media_url = resolve(web_url)

    ``web_url`` is the address of a page that is associated with a media
    file; ``media_url`` is the direct URL to that media.

    Internally a :class:`HostedMediaFile` is created for ``web_url`` and its
    ``resolve()`` result is returned as-is. That step lets :mod:`urlresolver`
    try the available resolver plugins in priority order (lowest priority
    number first) until one accepts the URL and resolves it.

    .. seealso::

        :class:`HostedMediaFile`

    Args:
        web_url (str): A URL to a web page associated with a piece of media
            content.

    Returns:
        The direct URL to the media file (a string) if ``web_url`` could be
        resolved; ``False`` if it could not.
    '''
    hosted_file = HostedMediaFile(url=web_url)
    media_url = hosted_file.resolve()
    return media_url
Esempio n. 3
0
def scrape_supported(html, regex=None, host_only=False):
    r'''
    Return the links scraped from ``html`` that are supported by urlresolver.

    Args:
        html: the html to be scraped
        regex: optional pattern overriding the default, which is
            ``href\s*=\s*['"]([^'"]+)``; group 1 of every match is taken as
            the candidate link
        host_only: if true, validate only the host of each link instead of
            the full url (default False)

    Returns:
        a list of the scraped links that passed validation, in match order

    Notes:
        Host-only results are remembered in ``host_cache`` (a mapping defined
        outside this function) so each host is validated at most once.
        Full-url results are deliberately not stored there: a single
        unsupported url says nothing about its host, and caching it would
        make later host-only lookups return wrong answers.
    '''
    if regex is None:
        # Raw string: ``\s`` is a regex escape, not a Python string escape.
        regex = r'''href\s*=\s*['"]([^'"]+)'''

    links = []
    for match in re.finditer(regex, html):
        stream_url = match.group(1)

        if not host_only:
            # Full validation is specific to this exact url.
            if HostedMediaFile(url=stream_url).valid_url():
                links.append(stream_url)
            continue

        host = urlparse.urlparse(stream_url).hostname
        if host is None:
            # Relative links, fragments, etc. have no host to validate.
            continue

        if host in host_cache:
            is_valid = host_cache[host]
        else:
            # use dummy media_id to allow host validation
            hmf = HostedMediaFile(host=host, media_id='dummy')
            is_valid = hmf.valid_url()
            host_cache[host] = is_valid

        if is_valid:
            links.append(stream_url)
    return links
Esempio n. 4
0
def scrape_supported(html, regex=None, host_only=False):
    r'''
    Scrape ``html`` for links and keep those that urlresolver supports.

    args:
        html: the html to be scraped
        regex: an optional argument to override the default regex, which is
            ``href\s*=\s*['"]([^'"]+)``; the first capture group of each
            match is used as the link
        host_only: an optional argument; if true, only the host of each link
            is validated rather than the full url (default False)

    Returns:
        a list of links scraped from the html that passed validation

    Host-only validation results are stored in ``host_cache`` (defined
    outside this function) keyed by hostname and reused on later hits.
    Full-url validation never touches that cache, because its outcome
    describes one url and not the host as a whole.
    '''
    if regex is None:
        # Raw literal keeps ``\s`` as a regex escape without relying on
        # Python passing unknown string escapes through unchanged.
        regex = r'''href\s*=\s*['"]([^'"]+)'''

    links = []
    for match in re.finditer(regex, html):
        stream_url = match.group(1)
        if host_only:
            host = urlparse.urlparse(stream_url).hostname
            if host is None:
                # Nothing to validate for links without a hostname.
                continue
            if host not in host_cache:
                # use dummy media_id to allow host validation
                host_cache[host] = HostedMediaFile(host=host, media_id='dummy').valid_url()
            is_valid = host_cache[host]
        else:
            is_valid = HostedMediaFile(url=stream_url).valid_url()

        if is_valid:
            links.append(stream_url)
    return links