def crawler(self, key_words = None, hy = None, city = None):
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    # Step 1: request the home page to obtain cookies
    retry_count = 10
    while True:
        try:
            html_src = webutil.request(zhilian_crawler_data.first_url,
                                       headers = zhilian_crawler_data.first_url_request_header,
                                       timeout = 60, encoding = 'utf-8', proxy = None,
                                       cookie = cookieJar, ua = ua)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'home page response too small or too large')
            break
        except Exception as e:
            print u'error downloading home page: %s' % e
            retry_count -= 1
            if retry_count <= 0:
                raise Exception(u'home page download failed after 10 retries, giving up')
            time.sleep(10)
            continue
    # Step 2: search by keyword and walk the result pages
    search_url = zhilian_crawler_data.get_search_url(key_words, hy, city, page_num = 1)
    while True:
        try:
            html_src = self.get_result_page_by_page_num(search_url, cookieJar, ua)
            # produce the detail URLs before checking for a next page,
            # so the last result page is not silently dropped
            self.get_and_product_detail_url(html_src)
            search_url = self.get_next_page_url(html_src)
            if search_url is None:
                break
        except Exception as e:
            raise Exception(u'error downloading search result page: %s' % e)
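# A minimal sketch of the get_next_page_url helper the loop above relies on,
# assuming the Zhilian result page carries a "next page" anchor. The selector
# below is hypothetical; the real class name or label must be read from the
# live markup.
from lxml import etree

def get_next_page_url(self, html_src):
    tree = etree.HTML(html_src)
    # hypothetical selector: an <a> element whose text is the "next page" label
    next_links = tree.xpath(u'.//a[contains(text(), "下一页")]/@href')
    if not next_links:
        return None  # no link found: treat as the last result page
    return next_links[0]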
def crawl(self, url):
    if url is None:
        return
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    try:
        html_src = webutil.request(url, headers = lagou_crawler_data.get_jobs_url_header(),
                                   ua = ua, cookie = cookieJar, timeout = 60,
                                   encoding = 'utf-8', retry = 5, savefile = None)
        if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
            raise Exception(u'downloaded page too small or too large')
    except Exception as e:
        print u'error crawling %s: %s' % (url, e)
        raise Exception(u'error crawling data')
    save_data = {}
    # extract the fields with xpath
    tree = etree.HTML(html_src)
    try:
        # the job title lives in an attribute, so handle it separately
        job_title_list = tree.xpath('.//*[@class="clearfix join_tc_icon"]/h1')
        save_data['job_title'] = job_title_list[0].get('title') if job_title_list else ''
        # the remaining fields are plain text nodes; xpath() always returns a
        # list (never None), so a truthiness check is enough
        field_xpaths = {
            'work_place': './/*[@class="job_request"]/span[2]/text()',
            'publish_time': './/*[@class="job_request"]/div[1]/text()',
            'work_request': './/*[@class="job_bt"]/p/text()',
        }
        for field, xpath in field_xpaths.items():
            nodes = tree.xpath(xpath)
            # keep the extracted unicode as-is: str() on non-ASCII text raises
            # UnicodeEncodeError in Python 2
            save_data[field] = nodes[0] if nodes else ''
    except Exception as e:
        print u'error parsing page: %s' % e
    # persist the extracted fields
    try:
        self.save_data(url, save_data)
    except Exception as e:
        print u'error saving data: %s' % e
        raise Exception(u'error saving data')
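# The save_data call above is not defined in this section; a minimal sketch,
# assuming one JSON object per line appended to a local file. The file name
# 'lagou_jobs.jsonl' and the record layout are hypothetical.
import json

def save_data(self, url, save_data):
    record = dict(save_data)
    record['url'] = url
    with open('lagou_jobs.jsonl', 'a') as f:
        # ensure_ascii keeps the output plain ASCII, which sidesteps Python 2
        # encoding pitfalls when writing Chinese field values
        f.write(json.dumps(record, ensure_ascii = True) + '\n')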
def crawl(self, url):
    if url is None or len(url) < 1:
        return
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    data_dict = {
        'type': 'zhilian',
        'version': 1,
        'url': url,
    }
    try:
        html_src = webutil.request(url, headers = zhilian_crawler_data.get_search_url_header(),
                                   ua = ua, cookie = cookieJar, timeout = 60, retry = 5,
                                   encoding = 'utf-8', proxy = None)
        if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
            raise Exception(u'detail page too small or too large')
        data_dict['html'] = html_src
        self.parse_html(html_src, data_dict)
        self.save_data(url, data_dict)
    except Exception as e:
        print u'error downloading detail page: %s' % e
        raise Exception(u'error downloading detail page')
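# parse_html is called above but defined elsewhere; a minimal sketch, assuming
# it pulls a few fields out of the detail page and stores them in data_dict
# next to the raw html. Both xpath expressions are hypothetical placeholders,
# not taken from the real Zhilian markup.
from lxml import etree

def parse_html(self, html_src, data_dict):
    tree = etree.HTML(html_src)
    titles = tree.xpath('.//h1/text()')       # hypothetical: job title heading
    data_dict['job_title'] = titles[0] if titles else ''
    company = tree.xpath('.//h2/a/text()')    # hypothetical: company name link
    data_dict['company'] = company[0] if company else ''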
def crawler(self, key_words = None, hy = None, city = None):
    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    # Step 1: request the Lagou home page to obtain cookies
    retry_count = 10
    while True:
        try:
            html_src = webutil.request(lagou_crawler_data.lagou_url,
                                       headers = lagou_crawler_data.get_lagou_header(),
                                       timeout = 60, encoding = 'utf-8',
                                       cookie = cookieJar, ua = ua)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'home page response too small or too large')
            break
        except Exception as e:
            print u'error fetching home page: %s' % e
            retry_count -= 1
            if retry_count > 0:
                time.sleep(5)
                continue
            raise Exception(u'failed to fetch home page; consider rotating proxies or other measures')
    # Step 2: submit the search request
    search_url, query_data = lagou_crawler_data.get_lagou_search_url(key_words, hy, city)
    if search_url is None:
        raise Exception(u'search keyword is empty')
    try:
        html_src = webutil.request(search_url,
                                   headers = lagou_crawler_data.get_lagou_search_header(),
                                   data = query_data, cookie = cookieJar, ua = ua,
                                   proxy = None, encoding = 'utf-8', retry = 5, timeout = 60)
        if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
            raise Exception(u'error searching %s' % search_url)
    except Exception as e:
        print u'error downloading search result page: %s' % e
        raise Exception(u'error downloading search result page')
    # Step 3: POST for the actual result data, page by page
    # fetch the first result page
    post_data = lagou_crawler_data.get_lagou_position_post_data(first = 'true',
                                                                keyword = key_words,
                                                                page_num = 1)
    position_id_list = []
    # initialize here so the pagination check below cannot hit a NameError
    # when the response carries no "content" block
    total_page_count = 0
    try:
        html_src = self.get_result_page(search_url, hy, city, post_data, cookieJar, ua, proxy = None)
        data_dict = self.json_to_dict(html_src)
        if "success" in data_dict:
            if data_dict["success"] != "true" and data_dict["success"] != True:
                return
            if "content" in data_dict:
                content = data_dict["content"]
                total_page_count = int(content["totalPageCount"])
                if total_page_count == 0:
                    return
                search_results = content["result"]
                if search_results:
                    for result in search_results:
                        self.product(str(result['positionId']))
                    position_id_list.extend([result['positionId'] for result in search_results])
    except Exception as e:
        print u'error fetching first result page: %s' % e
        raise Exception(u'error fetching first result page')
    if total_page_count > 1:
        post_data['first'] = 'false'
        # pages are numbered from 1 and page 1 was fetched above, so walk
        # 2..total_page_count inclusive (the original range dropped the last page)
        for i in xrange(2, total_page_count + 1):
            post_data['pn'] = i
            try:
                html_src = self.get_result_page(search_url, hy, city, post_data, cookieJar, ua, proxy = None)
                data_dict = self.json_to_dict(html_src)
                if "content" in data_dict:
                    content = data_dict["content"]
                    search_results = content["result"]
                    if search_results:
                        for result in search_results:
                            self.product(str(result['positionId']))
                        position_id_list.extend([result['positionId'] for result in search_results])
            except Exception as e:
                print u'error fetching result page %s: %s' % (i, e)
                time.sleep(2)
                continue
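# json_to_dict wraps the JSON decode of Lagou's position-search response; a
# minimal sketch. It assumes the response body is JSON text and surfaces a
# clearer error when the site returns an HTML block page instead of JSON.
import json

def json_to_dict(self, html_src):
    try:
        return json.loads(html_src)
    except ValueError as e:
        raise Exception(u'response is not valid JSON (possibly blocked): %s' % e)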