Example #1
import re
from urllib.parse import urljoin

# download, get_robots_parser, get_links and the Throttle class are helper
# utilities defined elsewhere in the project.

def link_crawler(start_url,
                 link_regex,
                 robots_url=None,
                 user_agent='wswp',
                 proxy=None,
                 delay=3,
                 max_depth=4,
                 scrape_callback=None):
    """ Crawl from the given start URL following links matched by link_regex. In the current
        implementation, we do not actually scrapy any information.

        args:
            start_url (str): web site to start crawl
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt)
            user_agent (str): user agent (default: wswp)
            proxy (str): proxy url, ex 'http://IP' (default: None)
            delay (int): seconds to throttle between requests to one domain (default: 3)
            max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
            scrape_callback (function): function to call after each download (default: None)
    """
    crawl_queue = [start_url]
    # keep track of which URLs have been seen before
    seen = {}
    data = []
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxy=proxy)
            if not html:
                continue
            if scrape_callback:
                data.extend(scrape_callback(url, html) or [])
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
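
A minimal usage sketch for the crawler above. The start URL and the link regex are placeholder values, and the helper utilities (download, get_robots_parser, get_links, Throttle) are assumed to be importable from the surrounding project:

# Hypothetical invocation: crawl a site, following only links whose path
# matches the given regex. Both values below are placeholders.
link_crawler('http://example.webscraping.com',
             link_regex='/(index|view)/',
             delay=3,
             max_depth=2)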
Example #2

import re
from urllib.parse import urljoin

def link_crawler(start_url,
                 link_regex,
                 robots_url=None,
                 user_agent='wswp',
                 proxy=None,
                 delay=3,
                 max_depth=4,
                 scrape_callback=None):
    """
    从给定的起始URL开始爬取所有正则表达式匹配到的链接
    """
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)

    crawl_queue = [start_url]
    # seen used to be a set holding every link already crawled:
    # seen = set(crawl_queue)
    # it is now a dict that also records the depth at which each link was found
    seen = {}
    data = []
    # throttle downloads to avoid hammering any single domain
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check whether the URL may be crawled according to robots.txt
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            # maximum depth reached, so stop adding this page's links to the queue
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url=url, user_agent=user_agent, proxy=proxy)
            if not html:
                continue
            if scrape_callback:
                data.extend(scrape_callback(url, html) or [])
            # filter the page's HTML for links matching our regular expression
            for link in get_links(html):
                if re.search(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    # check whether this link has already been seen
                    if abs_link not in seen:
                        # seen.add(abs_link)
                        # increase the crawl depth for the newly discovered link
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
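
Every example in this section relies on a Throttle helper that is not shown. The sketch below is one plausible implementation, assuming the usual approach of remembering the last request time per domain and sleeping off the remaining delay; the project's actual class may differ:

from time import sleep
from datetime import datetime
from urllib.parse import urlparse

class Throttle:
    """Delay successive requests to the same domain by at least `delay` seconds."""
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # domain -> datetime of the last request

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_secs > 0:
                sleep(sleep_secs)
        self.domains[domain] = datetime.now()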
Example #3
import requests
from random import choice

# Throttle is a helper class defined elsewhere in the project.

class Downloader:
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={}):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = None  # we will set this per request
        self.cache = cache

    def __call__(self, url, num_retries=2):
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error, so ignore the result loaded from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, so we still need to download it
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            if self.cache:
                # save the result to the cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
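
A minimal usage sketch for this Downloader; the URL is a placeholder and the Throttle helper is assumed to be available:

downloader = Downloader(delay=3, user_agent='wswp')
# __call__ first checks the cache, then throttles and downloads
html = downloader('http://example.webscraping.com')
if html:
    print('Retrieved %d characters of HTML' % len(html))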
Example #4
import re
from urllib.parse import urljoin

def link_crawler(start_url,
                 link_regex,
                 robots_url=None,
                 user_agent='wswp',
                 proxy=None,
                 delay=3,
                 max_depth=4,
                 scrape_callback=None):
    crawl_queue = [start_url]
    seen = {}
    data = []

    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)

    throttle = Throttle(delay)

    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue

            throttle.wait(url)

            html = download(url, user_agent=user_agent, proxy=proxy)

            if not html:
                continue

            if scrape_callback:
                data.extend(scrape_callback(url, html) or [])

            for link in get_links(html):
                if re.search(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
Example #5
import requests
from random import choice

class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For the constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={},
                 timeout=60):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = cache
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it
            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        """ Download a and return the page content
            args:
                url (str): URL
                headers (dict): dict of headers (like user_agent)
                proxies (dict): proxy dict w/ keys 'http'/'https', values
                    are strs (i.e. 'http(s)://IP') (default: None)
        """
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
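
A minimal usage sketch for this cached Downloader. The proxy address and URL are placeholders, and the cache is a plain dict, which satisfies the dict-like interface described in the docstring:

cache = {}  # maps url -> {'html': ..., 'code': ...}
downloader = Downloader(delay=5,
                        user_agent='wswp',
                        proxies=[{'http': 'http://203.0.113.10:8080',
                                  'https': 'http://203.0.113.10:8080'}],
                        cache=cache,
                        timeout=30)
html = downloader('http://example.webscraping.com', num_retries=1)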