Example #1
def get_canonical_url(url, whitelist, expandlist, extract,
                      timeout=REQ_TIMEOUT):
    '''Get the canonical (or open graph) URL.

    Returns a dict with keys 'url_original', 'url_retrieved', 'method'
    and 'reason', where method is one of ['canonical', 'redirect',
    'original'] (or 'bad url' when a redirect target fails validation).
    '''
    method = 'original'
    ret_url = url

    # if it's not unicode, it must be utf8, otherwise fail
    url_new = url_or_error(url)
    if url_new is None:
        return {'url_original': url,
                'url_retrieved': None,
                'method': method,
                'reason': 'invalid url'}
    url = url_new

    # Only download URLs that are in the whitelist or in the expandlist
    if not (check_whitelist(url, method, extract, expandlist) or
            check_whitelist(url, method, extract, whitelist)):
        return {'url_original': url,
                'url_retrieved': ret_url,
                'method': method,
                'reason': 'not in lists'}

    #
    # fetch page
    #
    page, enc, final_url, err = get_web_page(url, timeout)

    # check final url from download attempt
    if final_url is not None:
        if not isinstance(final_url, unicode):
            final_url = final_url.decode('utf8')

        if final_url != url:
            ret_url = url_or_error(final_url)
            if ret_url is not None:
                method = 'redirect'
                logging.debug('got redirect')
            else:
                method = 'bad url'
        else:
            ret_url = final_url

    # the (possibly redirected) URL must still be in the whitelist
    if (ret_url is None or
            not check_whitelist(ret_url, method, extract, whitelist)):
        return {'url_original': url,
                'url_retrieved': ret_url,
                'method': method,
                'reason': 'not in whitelist'}

    return process_page(page, enc, url, ret_url, method)
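
A minimal usage sketch, assuming the two lists are iterables of allowed hosts and that url_or_error, check_whitelist and process_page are the project's helpers (the argument values below are illustrative, not from the source):

result = get_canonical_url(u'http://example.com/article',
                           whitelist=['example.com'],
                           expandlist=['bit.ly', 't.co'],
                           extract=True)
if result.get('reason'):
    print('failed: %s' % result['reason'])
else:
    print('%s -> %s (%s)' % (result['url_original'],
                             result['url_retrieved'],
                             result['method']))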
Example #2
    def processed_handler(data):
        url, page, enc, final_url, err = data
        ret_url = None
        method = 'original'
        # check final url from download attempt
        if final_url is None:
            result = {'url_original': url,
                      'url_retrieved': None,
                      'method': None,
                      'reason': 'unreachable'}
            canonical_handler(result)
            return

        if final_url != url:
            ret_url = url_or_error(final_url)
            method = 'redirect'
            msg = 'got redirect: {} -> {}'.format(url, final_url)
            logging.debug(msg)
        else:
            ret_url = url

        # the (possibly redirected) URL must still be in the whitelist
        if not check_whitelist(ret_url, method, extract, whitelist):
            result = {'url_original': url,
                      'url_retrieved': ret_url,
                      'method': None,
                      'reason': 'not in whitelist'}
            canonical_handler(result)
            return

        result = process_page(page, enc, url, ret_url, method)
        canonical_handler(result)
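
Example #2 is only the inner closure; the enclosing factory, make_processed_handler, is referenced but not shown in this listing. A plausible sketch of its shape (an assumption, not the project's actual code):

def make_processed_handler(canonical_handler, whitelist, extract):
    # capture the callback, whitelist and extract flag in a closure
    def processed_handler(data):
        pass  # body exactly as in Example #2
    return processed_handler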
Example #3
def get_web_page(url, timeout):
    ''' Fetches content at a given URL.
    Requests implementation.
    Args:
        url - unicode string

    Returns: (data, enc, final_url, None) on success, or
             (None, enc, final_url, reason) on error, where enc and
             final_url may themselves be None.
    '''

    url = url_or_error(url)
    if url is None:
        return (None, None, None, 'url')

    reason = 'unk'  # default error reason

    # Download and process
    try:
        req = requests.get(url, timeout=timeout, allow_redirects=True,
                           stream=True)
        req.raise_for_status()

        # Get Response URL
        final_url = req.url

        # Get Encoding
        enc = req.encoding

        # Get content-type
        content_type = req.headers.get('content-type')

        if content_type and 'text/html' not in content_type:
            msg = 'content type not supported %s for %s' % (content_type, url)
            logging.debug(msg)
            return (None, enc, final_url, 'content-type')

        # get data
        data = req.content

        return (data, enc, final_url, None)

    except requests.exceptions.Timeout:
        msg = 'timedout: {}'.format(url)
        logging.debug(msg)
        reason = 'timeout'

    except requests.exceptions.HTTPError:
        msg = 'download failed: url=%s reason: %d' % (url, req.status_code)
        logging.debug(msg)
        reason = str(req.status_code)

    except Exception as ex:
        msg = 'download failed: url=%s with %s' % (url, repr(ex))
        logging.debug(msg)
        reason = 'download'

    return (None, None, None, reason)
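
A quick synchronous usage sketch (the timeout value is an assumption; the module-level REQ_TIMEOUT seen in Example #1 would be the usual default):

data, enc, final_url, err = get_web_page(u'http://example.com/', timeout=10)
if err is None:
    print('fetched %d bytes (%s) from %s' % (len(data), enc, final_url))
else:
    print('fetch failed: %s' % err)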
Example #4
def extract_canonical(unicode_content):
    """Extracts canonical URL or Open Graph URL from the content"""
    try:
        soup = BeautifulSoup(unicode_content, "html5lib")
    except FeatureNotFound:
        logging.exception("missing html5lib?")
        raise
    except Exception as ex:
        # any other parse failure: log it and give up on this page
        logging.exception(ex)
        return None

    # Try Canonical URL
    try:
        url_can = soup.find("link", rel="canonical")
        if url_can:
            url_new = url_can.get("href")
            if url_new:
                return url_or_error(url_new)
    except Exception:
        pass

    # Try Open Graph
    try:
        url_can = soup.find("meta", attrs={"property": "og:url", "content": True})
        if url_can:
            u = url_can["content"]
            if u:
                return url_or_error(u)

    except Exception:
        pass

    # logging.debug('no canonical url found')
    return None
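
A self-contained check of the extraction order: the <link rel="canonical"> tag wins over the og:url meta tag. This assumes url_or_error passes well-formed URLs through unchanged:

html = u'''<html><head>
<link rel="canonical" href="http://example.com/canonical"/>
<meta property="og:url" content="http://example.com/og"/>
</head></html>'''
print(extract_canonical(html))  # -> http://example.com/canonical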
Example #5
def get_web_page_async(url, timeout, maxsize, maxclients, processed_handler):
    ''' Fetches content at a given URL, asynchronously.
    Tornado AsyncHTTPClient implementation.
    Args:
        url - unicode string

    Returns nothing; the result is delivered to processed_handler as a
    (url, page, enc, final_url, err) tuple, with err set on failure.
    '''

    url_new = url_or_error(url)
    if url_new is None:
        # match the 5-tuple shape that processed_handler unpacks
        processed_handler((url, None, None, None, 'url'))
        return
    url = url_new

    http_client = AsyncHTTPClient(max_clients=maxclients,
                                  max_buffer_size=maxsize)
    # Download and process
    handle_request = make_request_handler(processed_handler)
    http_client.fetch(url, handle_request)
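
make_request_handler is not shown in this listing. A sketch of what it might look like, adapting a tornado.httpclient.HTTPResponse to the (url, page, enc, final_url, err) tuple that Example #2 unpacks (its actual contract is an assumption):

def make_request_handler(processed_handler):
    def handle_request(response):
        # response is a tornado.httpclient.HTTPResponse
        if response.error:
            processed_handler((response.request.url, None, None, None,
                               str(response.error)))
            return
        # derive the charset from the Content-Type header, if any
        enc = None
        ctype = response.headers.get('Content-Type', '')
        if 'charset=' in ctype:
            enc = ctype.split('charset=')[-1].strip()
        processed_handler((response.request.url, response.body, enc,
                           response.effective_url, None))
    return handle_request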
Example #6
def get_canonical_url_async(url, whitelist, expandlist, extract,
                            timeout, maxsize, maxclients, canonical_handler):
    '''Get the canonical (or open graph) URL, asynchronously.

    Returns nothing; a dict with keys 'url_original', 'url_retrieved',
    'method' and 'reason' is passed to canonical_handler, where method
    is one of ['canonical', 'redirect', 'original'].
    '''
    method = 'original'
    ret_url = url

    # if it's not unicode, it must be utf8, otherwise fail
    url_new = url_or_error(url)
    if url_new is None:
        result = {'url_original': url,
                  'url_retrieved': None,
                  'method': method,
                  'reason': 'invalid url'}
        canonical_handler(result)
        return

    url = url_new
    # Only download URLs that are in the whitelist or in the expandlist
    if not (check_whitelist(url, method, extract, expandlist) or
            check_whitelist(url, method, extract, whitelist)):
        result = {'url_original': url,
                  'url_retrieved': ret_url,
                  'method': method,
                  'reason': 'not in lists'}
        logging.debug('url not in whitelist or expandlist: {}'.format(url))
        canonical_handler(result)
        return

    # fetch page
    processed_handler = make_processed_handler(canonical_handler, whitelist,
                                               extract)
    get_web_page_async(url, timeout, maxsize, maxclients, processed_handler)
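
A hypothetical end-to-end wiring, assuming the legacy callback-style Tornado API used in Example #5 and an IOLoop driving the callbacks (all argument values are illustrative):

from tornado.ioloop import IOLoop

def on_canonical(result):
    print(result)
    IOLoop.current().stop()

get_canonical_url_async(u'http://example.com/x', ['example.com'], [],
                        True, timeout=10, maxsize=10 * 1024 * 1024,
                        maxclients=10, canonical_handler=on_canonical)
IOLoop.current().start()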