Example #1
    # Requires at module level: import sys, json; from time import sleep;
    # plus the Downloader helper sketched after this method.
    def __crawljobs(self, keyword, city=u'全国'):

        page_num = 1

        url = u'http://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false' % (city,)

        # Upper bound for the page counter; the loop exits via break
        # once the API reports an empty page
        page_size = sys.maxint

        errors = []

        while page_num < page_size:
            # POST body: pn is the page number, kd the search keyword
            data = u'first=false&pn=%d&kd=%s' % (page_num, keyword)
            # cache flags whether the response came from the local cache
            cache, content = Downloader.get_html(url, data, city)
            if content:
                try:
                    result = json.loads(content)
                    page_no = result['content']['pageNo']
                    if page_no > 0:
                        print '{kv}: {page_num}'.format(kv=keyword, page_num=page_num)
                        page_num += 1
                        if not cache:
                            # Throttle live requests by 0.1s; cached responses skip the delay
                            sleep(0.1)
                    else:
                        # pageNo of 0 means we are past the last page
                        break
                except Exception, e:
                    page_num += 1
                    # Parse error: drop the cached file so this page can be re-fetched
                    Downloader.remove_file(url, data, city)
                    errors.append((1, url, data, city, e))
            else:
                # Empty response: no exception object exists here, so record None,
                # and advance the counter so the loop cannot spin on one page
                errors.append((2, url, data, city, None))
                page_num += 1