Example #1
@gen.coroutine
def worker():
    """Typical Consumer
    Get spider.model.SpiderResult from spider.queue.spider_queue
    and process the spider_result
    """
    spider_log.debug("queue_size: {0}, hash_size: {1}, fetching: {2}".format(spider_queue.qsize(), len(hash_list), fetching))
    if len(hash_list) >= settings.LIMIT or (spider_queue.qsize() == 0 and fetching == 0):
        from spider.model import spider_session
        spider_log.info("limit touched!")
        spider_log.info("stopping spider now.")
        spider_session.close()
        ioloop.IOLoop.current().stop()
    record = yield spider_queue.get()
    url = record.url
    depth = record.depth
    try:
        if url_hash(url) in hash_list:
            # just in case...
            return
        # spider_log.debug("queue_size: {0}, hash_size: {1}, fetching: {2}".format(spider_queue.qsize(), len(hash_list), fetching))
        links = yield get_links_from_url(record)
        for link in links:
            if url_hash(link) in hash_list or record.depth >= settings.DEPTH:
                continue
            new_record = SpiderResult(url=urlparse.urljoin(url, link), depth=depth + 1)
            new_record.refer = url
            spider_queue.put(new_record)
    finally:
        spider_queue.task_done()
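
The worker above is a Tornado coroutine that consumes exactly one record per call from spider_queue and stops the IOLoop once the crawl limit is reached. A minimal sketch of how such consumers could be driven, assuming spider_queue is a tornado.queues.Queue and worker is decorated with @gen.coroutine as shown; the names consumer, run and the concurrency value are illustrative, not taken from the project:

from tornado import gen, ioloop

@gen.coroutine
def consumer():
    # keep pulling one record per worker() call until worker() stops the loop
    while True:
        yield worker()

def run(concurrency=10):
    loop = ioloop.IOLoop.current()
    for _ in range(concurrency):
        # spawn_callback schedules each consumer coroutine on the loop
        loop.spawn_callback(consumer)
    # worker() calls IOLoop.current().stop() once LIMIT is hit or the queue drains
    loop.start()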
Example #2
def __init__(self, settings):
    init_log(settings)
    init_queue()
    init_db()
    spider_log.debug("delay: %s" % settings.DELAY)
    spider_log.debug("limit: %s" % settings.LIMIT)
    self.settings = settings
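
This constructor only wires up logging, the URL queue, and the database, then keeps a reference to the settings object; elsewhere the code reads settings.DELAY, settings.LIMIT, settings.DEPTH, settings.HEADER and settings.KEYWORD. A rough usage sketch, where the Settings container, its values, and the Spider class name are assumptions for illustration:

class Settings(object):
    # hypothetical values; only the attribute names come from the spider code
    DELAY = 1                                # seconds between fetches
    LIMIT = 100                              # stop after this many stored pages
    DEPTH = 3                                # maximum link depth to follow
    HEADER = {"User-Agent": "spider/0.1"}    # request headers
    KEYWORD = ""                             # only store pages containing this text (empty = keep all)

spider = Spider(Settings())  # "Spider" is the assumed name of the class this __init__ belongs to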
Example #3
@gen.coroutine
def get_links_from_url(spider_result):
    """Get url links in page spider_result.url

    :param spider_result: spider.model.SpiderResult
    :return: list, a list of formatted urls
    """
    global fetching
    url = spider_result.url
    client = httpclient.AsyncHTTPClient()
    # parse HEADER in settings
    try:
        header = settings.HEADER
    except AttributeError:
        header = {}

    request = httpclient.HTTPRequest(url=url, headers=header, follow_redirects=True, max_redirects=5, request_timeout=30)
    # parse KEYWORD in settings
    try:
        keyword = settings.KEYWORD
        if not isinstance(keyword, str):
            keyword = ""
    except AttributeError:
        keyword = ""

    try:
        fetching += 1
        response = yield client.fetch(request)
        fetching -= 1
        spider_log.debug(u"{0} fetching {1}".format(response.request_time, url))
        # decode the body to text if Tornado returned it as bytes
        html = response.body if isinstance(response.body, str) else response.body.decode()
        # get links
        doc = fromstring(html)
        links = filter(None, map(url_formatter, doc.xpath("//a/@href")))

        if keyword and html.find(keyword) == -1:
            # keyword set in settings
            # but the page does not contain the keyword
            # do nothing
            pass
        else:
            # compress response body
            spider_result.content = zlib.compress(html)
            try:
                insert_record(spider_result)
                hash_list.append(url_hash(url))
            except DataError:
                spider_log.warn("insert fail: {0}".format(url))

        raise gen.Return(list(set(links)))
    except httpclient.HTTPError:
        fetching -= 1
        spider_log.debug(u"failed fetching {0}".format(url))
        raise gen.Return([])
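
Since get_links_from_url is a coroutine that hands its result back through gen.Return, it has to be run on the IOLoop. A small standalone sketch for fetching a single page and printing its outgoing links; fetch_once and the seed URL are illustrative, while SpiderResult is constructed the same way as in the worker above:

from tornado import ioloop

def fetch_once(url):
    # drive one fetch to completion and return the list of extracted links
    record = SpiderResult(url=url, depth=0)
    links = ioloop.IOLoop.current().run_sync(lambda: get_links_from_url(record))
    print("found {0} links on {1}".format(len(links), url))
    return links

links = fetch_once("http://example.com/")  # illustrative seed URL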