def crawler(self, key_words = None, hy = None, city = None):
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    # Step 1: request the home page to obtain cookies
    retry_count = 10
    while True:
        try:
            html_src = webutil.request(zhilian_crawler_data.first_url,
                                       headers = zhilian_crawler_data.first_url_request_header,
                                       timeout = 60, encoding = 'utf-8', proxy = None,
                                       cookie = cookieJar, ua = ua)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'home page response too small or too large')
            break
        except Exception as e:
            print u'error downloading home page: %s' % e
            retry_count -= 1
            if retry_count <= 0:
                raise Exception(u'home page download failed after 10 retries, giving up')
            time.sleep(10)
            continue
    # Step 2: search by keyword and walk the result pages
    search_url = zhilian_crawler_data.get_search_url(key_words, hy, city, page_num = 1)
    while True:
        try:
            html_src = self.get_result_page_by_page_num(search_url, cookieJar, ua)
            # produce the detail URLs before checking for a next page,
            # so the last result page is not silently dropped
            self.get_and_product_detail_url(html_src)
            search_url = self.get_next_page_url(html_src)
            if search_url is None:
                break
        except Exception as e:
            raise Exception(u'error downloading search result page: %s' % e)
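# A minimal sketch of the get_next_page_url helper the loop above relies on,
# assuming the Zhilian result page carries a "next page" anchor. The selector
# below is hypothetical; the real class name or label must be read from the
# live markup.
from lxml import etree

def get_next_page_url(self, html_src):
    tree = etree.HTML(html_src)
    # hypothetical selector: an <a> element whose text is the "next page" label
    next_links = tree.xpath(u'.//a[contains(text(), "下一页")]/@href')
    if not next_links:
        return None  # no link found: treat as the last result page
    return next_links[0]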
def crawl(self, url):
    if url is None:
        return
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    try:
        html_src = webutil.request(url, headers = lagou_crawler_data.get_jobs_url_header(),
                                   ua = ua, cookie = cookieJar, timeout = 60,
                                   encoding = 'utf-8', retry = 5, savefile = None)
        if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
            raise Exception(u'downloaded page too small or too large')
    except Exception as e:
        print u'error crawling %s: %s' % (url, e)
        raise Exception(u'error crawling data')
    save_data = {}
    # extract the fields with xpath
    tree = etree.HTML(html_src)
    try:
        # the job title lives in an attribute, so handle it separately
        job_title_list = tree.xpath('.//*[@class="clearfix join_tc_icon"]/h1')
        save_data['job_title'] = job_title_list[0].get('title') if job_title_list else ''
        # the remaining fields are plain text nodes; xpath() always returns a
        # list (never None), so a truthiness check is enough
        field_xpaths = {
            'work_place': './/*[@class="job_request"]/span[2]/text()',
            'publish_time': './/*[@class="job_request"]/div[1]/text()',
            'work_request': './/*[@class="job_bt"]/p/text()',
        }
        for field, xpath in field_xpaths.items():
            nodes = tree.xpath(xpath)
            # keep the extracted unicode as-is: str() on non-ASCII text raises
            # UnicodeEncodeError in Python 2
            save_data[field] = nodes[0] if nodes else ''
    except Exception as e:
        print u'error parsing page: %s' % e
    # persist the extracted fields
    try:
        self.save_data(url, save_data)
    except Exception as e:
        print u'error saving data: %s' % e
        raise Exception(u'error saving data')
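# The save_data call above is not defined in this section; a minimal sketch,
# assuming one JSON object per line appended to a local file. The file name
# 'lagou_jobs.jsonl' and the record layout are hypothetical.
import json

def save_data(self, url, save_data):
    record = dict(save_data)
    record['url'] = url
    with open('lagou_jobs.jsonl', 'a') as f:
        # ensure_ascii keeps the output plain ASCII, which sidesteps Python 2
        # encoding pitfalls when writing Chinese field values
        f.write(json.dumps(record, ensure_ascii = True) + '\n')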
def crawl(self, url):
    if url is None or len(url) < 1:
        return
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    data_dict = {
        'type': 'zhilian',
        'version': 1,
        'url': url,
    }
    try:
        html_src = webutil.request(url, headers = zhilian_crawler_data.get_search_url_header(),
                                   ua = ua, cookie = cookieJar, timeout = 60, retry = 5,
                                   encoding = 'utf-8', proxy = None)
        if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
            raise Exception(u'detail page too small or too large')
        data_dict['html'] = html_src
        self.parse_html(html_src, data_dict)
        self.save_data(url, data_dict)
    except Exception as e:
        print u'error downloading detail page: %s' % e
        raise Exception(u'error downloading detail page')
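# parse_html is called above but defined elsewhere; a minimal sketch, assuming
# it pulls a few fields out of the detail page and stores them in data_dict
# next to the raw html. Both xpath expressions are hypothetical placeholders,
# not taken from the real Zhilian markup.
from lxml import etree

def parse_html(self, html_src, data_dict):
    tree = etree.HTML(html_src)
    titles = tree.xpath('.//h1/text()')       # hypothetical: job title heading
    data_dict['job_title'] = titles[0] if titles else ''
    company = tree.xpath('.//h2/a/text()')    # hypothetical: company name link
    data_dict['company'] = company[0] if company else ''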
def crawler(self, key_words = None, hy = None, city = None):
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    # Step 1: request the Lagou home page to obtain cookies
    retry_count = 10
    while True:
        try:
            html_src = webutil.request(lagou_crawler_data.lagou_url,
                                       headers = lagou_crawler_data.get_lagou_header(),
                                       timeout = 60, encoding = 'utf-8',
                                       cookie = cookieJar, ua = ua)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'home page response too small or too large')
            break
        except Exception as e:
            print u'error fetching home page: %s' % e
            retry_count -= 1
            if retry_count > 0:
                time.sleep(5)
                continue
            raise Exception(u'failed to fetch home page; consider rotating proxies or other measures')
    # Step 2: submit the search request
    search_url, query_data = lagou_crawler_data.get_lagou_search_url(key_words, hy, city)
    if search_url is None:
        raise Exception(u'search keyword is empty')
    try:
        html_src = webutil.request(search_url,
                                   headers = lagou_crawler_data.get_lagou_search_header(),
                                   data = query_data, cookie = cookieJar, ua = ua,
                                   proxy = None, encoding = 'utf-8', retry = 5, timeout = 60)
        if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
            raise Exception(u'error searching %s' % search_url)
    except Exception as e:
        print u'error downloading search result page: %s' % e
        raise Exception(u'error downloading search result page')
    # Step 3: POST for the actual result data, page by page
    # fetch the first result page
    post_data = lagou_crawler_data.get_lagou_position_post_data(first = 'true',
                                                                keyword = key_words,
                                                                page_num = 1)
    position_id_list = []
    # initialize here so the pagination check below cannot hit a NameError
    # when the response carries no "content" block
    total_page_count = 0
    try:
        html_src = self.get_result_page(search_url, hy, city, post_data, cookieJar, ua, proxy = None)
        data_dict = self.json_to_dict(html_src)
        if "success" in data_dict:
            if data_dict["success"] != "true" and data_dict["success"] != True:
                return
            if "content" in data_dict:
                content = data_dict["content"]
                total_page_count = int(content["totalPageCount"])
                if total_page_count == 0:
                    return
                search_results = content["result"]
                if search_results:
                    for result in search_results:
                        self.product(str(result['positionId']))
                    position_id_list.extend([result['positionId'] for result in search_results])
    except Exception as e:
        print u'error fetching first result page: %s' % e
        raise Exception(u'error fetching first result page')
    if total_page_count > 1:
        post_data['first'] = 'false'
        # pages are numbered from 1 and page 1 was fetched above, so walk
        # 2..total_page_count inclusive (the original range dropped the last page)
        for i in xrange(2, total_page_count + 1):
            post_data['pn'] = i
            try:
                html_src = self.get_result_page(search_url, hy, city, post_data, cookieJar, ua, proxy = None)
                data_dict = self.json_to_dict(html_src)
                if "content" in data_dict:
                    content = data_dict["content"]
                    search_results = content["result"]
                    if search_results:
                        for result in search_results:
                            self.product(str(result['positionId']))
                        position_id_list.extend([result['positionId'] for result in search_results])
            except Exception as e:
                print u'error fetching result page %s: %s' % (i, e)
                time.sleep(2)
                continue
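# json_to_dict wraps the JSON decode of Lagou's position-search response; a
# minimal sketch. It assumes the response body is JSON text and surfaces a
# clearer error when the site returns an HTML block page instead of JSON.
import json

def json_to_dict(self, html_src):
    try:
        return json.loads(html_src)
    except ValueError as e:
        raise Exception(u'response is not valid JSON (possibly blocked): %s' % e)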