コード例 #1
0
 def parse_job(self, response):
     """Build a job item from a job-detail response and return it.

     Each field is handed to a ``LagouJobItemLoader`` wrapping a new
     ``LagouJobItem``; the loaded item is returned to the caller.
     """
     loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

     loader.add_css("title", ".job-name::attr(title)")
     loader.add_value("url", response.url)
     loader.add_value("url_object_id", get_md5(response.url))

     # The job_request fields are read from consecutive <span> elements.
     # XPath positions are 1-based, so the salary is span[1].
     span_text = "//*[@class='job_request']/p/span[%d]/text()"
     loader.add_xpath("salary", span_text % 1)
     loader.add_xpath("job_city", span_text % 2)
     loader.add_xpath("work_years", span_text % 3)
     loader.add_xpath("degree_need", span_text % 4)
     loader.add_xpath("job_type", span_text % 5)

     loader.add_css("tags", ".position-label li::text")
     loader.add_css("publish_time", ".publish_time::text")
     loader.add_css("job_advantage", ".job-advantage p::text")
     # No ::text here: the whole matched elements are passed on.
     loader.add_css("job_desc", ".job_bt div")
     loader.add_css("job_addr", ".work_addr")
     loader.add_css("company_name", "#job_company dt a div h2::text")
     loader.add_css("company_url", "#job_company dt a::attr(href)")
     loader.add_value("crawl_time", datetime.now())

     return loader.load_item()
コード例 #2
0
    def parse_job(self, response):
        """Parse a Lagou job-detail page into a job item.

        Args:
            response: Page response; its ``url`` and ``css()`` are used here
                and it is handed to ``LagouJobItemLoader`` for the selectors.

        Returns:
            The item produced by ``LagouJobItemLoader.load_item()``.
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']/p/span[5]/text()")

        item_loader.add_css("tags", '.position-label li::text')

        # Keep only the part of the publish-time text before the first space.
        # extract_first() yields None when the node is absent, so a page
        # without a publish time no longer aborts the callback with an
        # IndexError (which would discard every other field as well).
        publish_text = response.css(".publish_time::text").extract_first()
        if publish_text is not None:
            item_loader.add_value("publish_time",
                                  publish_text.split(" ")[0].strip())

        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        # NOTE(review): this sleep blocks the calling thread for 3 seconds per
        # page. If this callback runs inside Scrapy's reactor, that stalls all
        # other in-flight requests; the DOWNLOAD_DELAY setting is presumably
        # the intended throttle. Kept as-is - confirm before removing.
        time.sleep(3)
        job_item = item_loader.load_item()

        return job_item
コード例 #3
0
    def parse_job(self, response):
        """Parse a Lagou job posting page and return the loaded item."""
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name::attr(title)")
        page_url = response.url
        loader.add_value("url", page_url)
        loader.add_value("url_object_id", get_md5(page_url))
        loader.add_css("salary", ".job_request .salary::text")

        # These four fields are picked out by <span> position, which is
        # easier to express in XPath (spans 2 through 5).
        span_fields = ("job_city", "work_years", "degree_need", "job_type")
        for position, field in enumerate(span_fields, start=2):
            loader.add_xpath(
                field,
                "//*[@class='job_request']/p/span[{}]/text()".format(position))

        loader.add_css("tags", ".position-label li::text")
        loader.add_css("publish_time", ".publish_time::text")
        loader.add_css("job_advantage", ".job-advantage p::text")
        # The full HTML of the description is extracted here.
        loader.add_css("job_desc", ".job_bt div")
        # Some addresses sit under an <a>, so the text cannot be taken
        # directly; grab everything now and clean it up later.
        loader.add_css("job_addr", ".work_addr")
        # Note: job_company is an id, hence "#" rather than ".".
        loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        loader.add_css("company_url", "#job_company dt a::attr(href)")
        loader.add_value("crawl_time", datetime.now())

        # TODO: should this be a yield instead of a return?
        return loader.load_item()
コード例 #4
0
ファイル: lagou.py プロジェクト: zhangatle/spider
    def parse_job(self, response):
        """Load a job item from a job-detail response.

        Args:
            response: Page response; its ``url`` is stored on the item and it
                is handed to ``LagouJobItemLoader`` for the selectors.

        Returns:
            The item produced by ``LagouJobItemLoader.load_item()``.
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_css("tags", ".job_request ul li::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/h3/span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//*[@class='job_request']/h3/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']/h3/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']/h3/span[5]/text()")

        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_address", ".work_addr")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_css("company_name", ".job_company_content em::text")

        # Read the clock once so both timestamps of this crawl are identical;
        # two separate now() calls differ by a few microseconds.
        crawled_at = datetime.datetime.now()
        item_loader.add_value("crawl_time", crawled_at)
        item_loader.add_value("crawl_update_time", crawled_at)
        job_item = item_loader.load_item()
        return job_item
コード例 #5
0
    def parse_job(self, response):
        """Parse a job posting response and return the loaded item."""
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        # Short aliases for the three loader entry points used below.
        css = loader.add_css
        xpath = loader.add_xpath
        value = loader.add_value

        css("title", ".job-name::attr(title)")
        value("url", response.url)
        value("url_object_id", get_md5(response.url))
        css("salary", ".job_request .salary::text")

        xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
        xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
        xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
        xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")

        css("tags", ".position-label li::text")
        css("publish_time", ".publish_time::text")
        css("job_advantage", ".job-advantage p::text")
        css("job_desc", ".job_bt div")
        css("job_addr", ".work_addr")
        css("company_name", "#job_company dt a img::attr(alt)")
        css("company_url", "#job_company dt a::attr(href)")
        value("crawl_time", datetime.datetime.now())

        return loader.load_item()

    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results
コード例 #6
0
ファイル: lagou.py プロジェクト: HongweiY/ArticleSpider
    def parse_job(self, response):
        """Load a job item from a job-detail response and return it."""
        # NOTE(review): `LagouJonItem` looks like a misspelling of
        # `LagouJobItem`; it is kept because the class is defined outside
        # this view - confirm against the items module.
        loader = LagouJobItemLoader(item=LagouJonItem(), response=response)

        loader.add_css('title', '.job-name::attr(title)')
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        loader.add_css('salary', '.job_request .salary::text')

        loader.add_xpath('job_city',
                         "//*[@class='job_request']/p/span[2]/text()")
        loader.add_xpath('work_years',
                         "//*[@class='job_request']/p/span[3]/text()")
        loader.add_xpath('degree_need',
                         "//*[@class='job_request']/p/span[4]/text()")
        loader.add_xpath('job_type',
                         "//*[@class='job_request']/p/span[5]/text()")

        # Remaining CSS-selected fields, applied in listing order.
        css_fields = (
            ('publish_time', '.publish_time::text'),
            ('tags', '.position-label li::text'),
            ('job_advantage', '.job-advantage p::text'),
            ('job_desc', '.job_bt div'),
            ('job_addr', '.work_addr'),
            ('company_url', '#job_company dt a::attr(href)'),
            ('company_name', '#job_company dt a img::attr(alt)'),
        )
        for field, selector in css_fields:
            loader.add_css(field, selector)

        loader.add_value('crawl_time', datetime.now())

        return loader.load_item()
コード例 #7
0
    def parse_job(self, response):
        """Load a job item from a job-detail response and return it.

        Unlike a plain dict build-up, every field goes through
        ``LagouJobItemLoader`` so the loader decides the final item values.
        """
        item = LagouJobItem()
        loader = LagouJobItemLoader(item=item, response=response)

        loader.add_css("title", ".job-name span::text")
        loader.add_value("url", response.url)
        loader.add_css("salary", ".salary::text")

        # City, experience, degree and job type are read from spans 2-5.
        span_text = "//*[@class='job_request']/p/span[{}]/text()"
        loader.add_xpath("job_city", span_text.format(2))
        loader.add_xpath("work_years", span_text.format(3))
        loader.add_xpath("degree_need", span_text.format(4))
        loader.add_xpath("job_type", span_text.format(5))

        loader.add_css("publish_time", ".publish_time::text")
        loader.add_css("job_advantage", ".job-advantage p::text")
        loader.add_css("job_desc", ".job_bt div")
        loader.add_css("job_addr", ".work_addr")

        loader.add_css("company_url", "#job_company dt a::attr(href)")
        loader.add_css("company_name", "#job_company dt a div h2::text")

        return loader.load_item()
コード例 #8
0
    def parse_job(self, response):
        """
        Parse a job-information page and return the loaded item.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", '.job-name::attr(title)')
        # Min/max salary and min/max years of experience are split out later,
        # in the item.
        loader.add_css("salary", '.job_request .salary::text')

        span_fields = ("job_city", "work_years", "degree_need", "job_type")
        for field, position in zip(span_fields, range(2, 6)):
            loader.add_xpath(
                field,
                '//*[@class="job_request"]/p/span[%d]/text()' % position)

        loader.add_css("tags", '.position-label li::text')
        loader.add_css("publish_time", '.publish_time::text')
        loader.add_css("job_advantage", '.job-advantage p::text')
        loader.add_css("job_desc", '.job_bt div')
        # Extraction of the address is finished in the item.
        loader.add_css("job_address", '.work_addr')
        loader.add_css("company_name", '.job_company dt a img::attr(alt)')
        loader.add_css("company_url", '.job_company dt a::attr(href)')

        loader.add_value("crawl_time", datetime.now())
        page_url = response.url
        loader.add_value("url_object_id", get_md5(page_url))
        loader.add_value("url", page_url)

        return loader.load_item()
コード例 #9
0
    def parse_item(self, response):
        """Parse a Lagou job page, print the loaded item and return it."""
        job_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        job_loader.add_css('title', '.job-name span::text')
        job_loader.add_value('url', response.url)
        job_loader.add_value('url_obj_id', get_md5(response.url))
        job_loader.add_css('salary', '.job_request .salary ::text')

        # City, experience, degree and job type come from spans 2-5.
        span_fields = ('job_city', 'work_years', 'degree_need', 'job_type')
        for position, field in enumerate(span_fields, start=2):
            job_loader.add_xpath(
                field,
                '//dd[@class="job_request"]/p/span[%d]/text()' % position)

        job_loader.add_css('tags', '.position-label li::text')
        job_loader.add_css('publish_time', '.publish_time::text')
        job_loader.add_css('job_advantage', '.job-advantage p::text')
        job_loader.add_css('job_desc', '.job_bt div')
        job_loader.add_css('job_addr', '.work_addr')
        job_loader.add_css('company_url', '#job_company a::attr(href)')
        job_loader.add_css('company_name', '#job_company img::attr(alt)')

        # Only the date part of the current time is stored.
        today = datetime.datetime.now().date()
        job_loader.add_value('craw_time', today)
        job_loader.add_value('craw_update_time', '')

        job_item = job_loader.load_item()
        print(job_item)
        return job_item
コード例 #10
0
    def parse_job(self, response):
        """Parse a Lagou job posting page and return the loaded item."""
        # Keep processing logic here to a minimum; data cleaning belongs in
        # the ItemLoader. CSS reminder: ".name" is a class, "#name" an id.
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        css = loader.add_css
        xpath = loader.add_xpath
        value = loader.add_value

        css("title", ".job-name::attr(title)")
        page_url = response.url
        value("url", page_url)
        # See the md5 usage in jobbole4.
        value("url_object_id", get_md5(page_url))
        css("salary", ".job_request .salary::text")

        xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
        xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
        xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
        xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")

        css("tags", ".position-label li::text")
        # The raw text still needs converting to str and splitting.
        css("publish_time", ".publish_time::text")
        css("job_advantage", ".job-advantage p::text")
        css("job_desc", ".job_bt div")
        css("job_addr", ".work_addr")
        css("company_name", "#job_company dt a img::attr(alt)")
        css("company_url", "#job_company a::attr(href)")
        value("crawl_time", datetime.now())

        return loader.load_item()
コード例 #11
0
 def parse_job(self, response):
     """Load a job item from a job-detail response and return it."""
     loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

     loader.add_css("title", ".job-name::attr(title)")
     loader.add_value("url", response.url)
     loader.add_value("url_object_id", get_md5(response.url))
     loader.add_css("salary", ".job_request p span.salary::text")

     # Fields read by <span> position inside the job_request <dd>.
     span_fields = (
         ("job_city", 2),
         ("work_years", 3),
         ("degree_need", 4),
         ("job_type", 5),
     )
     for field, position in span_fields:
         loader.add_xpath(
             field,
             "//dd[@class='job_request']/p/span[%d]/text()" % position)

     # Remaining CSS-selected fields, applied in listing order.
     css_fields = (
         ("publish_time", ".job_request p.publish_time::text"),
         ("job_advantage", ".job-advantage p::text"),
         ("job_desc", ".job_bt div p"),
         ("job_addr", ".work_addr"),
         ("tags", ".position-label.clearfix li::text"),
         ("company_name", ".job_company dt a img::attr(alt)"),
         ("company_url", ".job_company dt a::attr(href)"),
     )
     for field, selector in css_fields:
         loader.add_css(field, selector)

     loader.add_value("crawl_time", datetime.datetime.now())
     return loader.load_item()
コード例 #12
0
ファイル: lagou.py プロジェクト: duyanyong2017/ArticleSpider
    def parse_job(self, response):
        """Load a job item from a job-detail response and return it.

        The crawl time is stored as a ``'%Y-%m-%d %H:%M:%S'`` string rather
        than a datetime object.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_value("url", response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        loader.add_css("salary", ".job_request .salary::text")

        # City, experience, degree and job type are read from spans 2-5.
        span_text = "//*[@class='job_request']/h3/span[%d]/text()"
        loader.add_xpath("job_city", span_text % 2)
        loader.add_xpath("work_years", span_text % 3)
        loader.add_xpath("degree_need", span_text % 4)
        loader.add_xpath("job_type", span_text % 5)

        loader.add_css("tags", '.position-label li::text')
        loader.add_css('publish_time', '.publish_time::text')
        loader.add_css('job_advantage', '.job-advantage p::text')
        loader.add_css('job_desc', '.job_bt div')
        loader.add_css('job_addr', '.work_addr')
        loader.add_css('company_name', '#job_company dt a img::attr(alt)')
        loader.add_css('company_url', '#job_company dt a::attr(href)')

        crawl_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        loader.add_value('crawl_time', crawl_stamp)

        return loader.load_item()
コード例 #13
0
    def parse(self, response):
        """Load a job item from the response and return it."""
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name::attr(title)")
        page_url = response.url
        loader.add_value("url", page_url)
        loader.add_value('url_object_id', get_md5(page_url))
        loader.add_css("salary", ".job_request .salary::text")

        # City, experience, degree and job type are read from spans 2-5.
        span_fields = ("job_city", "work_years", "degree_need", "job_type")
        for position, field in enumerate(span_fields, start=2):
            loader.add_xpath(
                field,
                "//*[@class='job_request']/h3/span[{}]/text()".format(position))

        # Remaining CSS-selected fields, applied in listing order.
        for field, selector in (
            ("tags", '.position-label li::text'),
            ('publish_time', '.publish_time::text'),
            ('job_advantage', '.job-advantage p::text'),
            ('job_desc', '.job_bt div'),
            ('job_addr', '.work_addr'),
            ('company_name', '#job_company dt a img::attr(alt)'),
            ('company_url', '#job_company dt a::attr(href)'),
        ):
            loader.add_css(field, selector)

        # The crawl time is stored as a formatted string.
        loader.add_value('crawl_time',
                         datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

        return loader.load_item()