コード例 #1
0
    def parse_job(self, response):
        """Populate a ``LagouJobItem`` from ``response`` through the loader.

        Args:
            response: Response whose ``url`` is stored and whose markup is
                queried with the CSS/XPath selectors below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name span::text")
        loader.add_value("url", response.url)
        loader.add_css("salary", ".salary::text")

        # These four fields are selected by span position under job_request.
        span_fields = (
            ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
            ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
            ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
            ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
        )
        for field, xpath in span_fields:
            loader.add_xpath(field, xpath)

        css_fields = (
            ("publish_time", ".publish_time::text"),
            ("job_advantage", ".job-advantage p::text"),
            ("job_desc", ".job_bt div"),
            ("job_addr", ".work_addr"),
            ("company_url", "#job_company dt a::attr(href)"),
            ("company_name", "#job_company dt a div h2::text"),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        return loader.load_item()
コード例 #2
0
    def parse_job(self, response):
        """Parse a Lagou job page into a ``LagouJobItem``.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried with the
                selectors below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']/p/span[5]/text()")

        item_loader.add_css("tags", '.position-label li::text')

        # extract_first() returns None when nothing matches, whereas the
        # previous extract()[0] raised IndexError and dropped the whole item.
        raw_publish_time = response.css(".publish_time::text").extract_first()
        if raw_publish_time is not None:
            # Keep only the part before the first space.
            publish_time = raw_publish_time.split(" ")[0].strip()
            item_loader.add_value("publish_time", publish_time)

        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        # NOTE(review): time.sleep() blocks this callback for 3 seconds on
        # every page. Scrapy's DOWNLOAD_DELAY setting is the usual throttling
        # mechanism -- confirm whether this sleep is still required.
        time.sleep(3)

        job_item = item_loader.load_item()

        return job_item
コード例 #3
0
    def parse_job(self, response):
        """Parse a job posting response into a ``LagouJobItem``.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_css("salary", ".job_request .salary::text")

        # Fields selected by span position under job_request.
        span_fields = (
            ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
            ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
            ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
            ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
        )
        for field, xpath in span_fields:
            loader.add_xpath(field, xpath)

        css_fields = (
            ("tags", ".position-label li::text"),
            ("publish_time", ".publish_time::text"),
            ("job_advantage", ".job-advantage p::text"),
            ("job_desc", ".job_bt div"),
            ("job_addr", ".work_addr"),
            ("company_name", "#job_company dt a img::attr(alt)"),
            ("company_url", "#job_company dt a::attr(href)"),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        loader.add_value("crawl_time", datetime.datetime.now())
        return loader.load_item()

    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results
コード例 #4
0
ファイル: lagou.py プロジェクト: zhangatle/spider
    def parse_job(self, response):
        """Collect the job fields from ``response`` into a ``LagouJobItem``.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_css("tags", ".job_request ul li::text")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_css("salary", ".salary::text")

        # Fields selected by span position inside the job_request <h3>.
        span_fields = (
            ("job_city", "//*[@class='job_request']/h3/span[2]/text()"),
            ("work_years", "//*[@class='job_request']/h3/span[3]/text()"),
            ("degree_need", "//*[@class='job_request']/h3/span[4]/text()"),
            ("job_type", "//*[@class='job_request']/h3/span[5]/text()"),
        )
        for field, xpath in span_fields:
            loader.add_xpath(field, xpath)

        css_fields = (
            ("publish_time", ".publish_time::text"),
            ("job_advantage", ".job-advantage p::text"),
            ("job_desc", ".job_bt div"),
            ("job_address", ".work_addr"),
            ("company_url", "#job_company dt a::attr(href)"),
            ("company_name", ".job_company_content em::text"),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        # Both timestamps are taken separately at load time.
        loader.add_value("crawl_time", datetime.datetime.now())
        loader.add_value("crawl_update_time", datetime.datetime.now())
        return loader.load_item()
コード例 #5
0
    def parse_job(self, response):
        """Parse a Lagou job posting into a ``LagouJobItem`` using XPath only.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouItemLoader(item=LagouJobItem(), response=response)

        loader.add_xpath('title', "//div[@class='job-name']/@title")
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))

        xpath_fields = (
            ('salary',
             "//dd[@class='job_request']//span[@class='salary']/text()"),
            ('job_city', "//dd[@class='job_request']/p/span[2]/text()"),
            ('work_years', "//dd[@class='job_request']/p/span[3]/text()"),
            ('degree_need', "//dd[@class='job_request']/p/span[4]/text()"),
            ('job_type', "//dd[@class='job_request']/p/span[5]/text()"),
            ('publish_time', "//p[@class='publish_time']/text()"),
            ('tags', "//ul[contains(@class,'position-label')]/li/text()"),
            ('job_advantage',
             "//span[@class='advantage']/following-sibling::p/text()"),
            ('job_desc', "string(//dd[@class='job_bt']/div)"),
            ('job_addr', "//div[@class='work_addr']"),
            ('company_name', "//dl[@id='job_company']/dt/a/img/@alt"),
            ('company_url', "//ul[@class='c_feature']/li[last()]/a/@href"),
        )
        for field, xpath in xpath_fields:
            loader.add_xpath(field, xpath)

        loader.add_value('crawl_time', datetime.now())
        return loader.load_item()
コード例 #6
0
 def parse_job(self, response):
     """Extract the job detail fields from ``response`` into a ``LagouJobItem``.

     Args:
         response: Response whose ``url`` is stored (raw and as an md5
             digest via ``get_md5``) and whose markup is queried below.

     Returns:
         Whatever the loader's ``load_item()`` produces.
     """
     loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

     loader.add_css("title", ".job-name::attr(title)")
     loader.add_value("url", response.url)
     loader.add_value("url_object_id", get_md5(response.url))

     # XPath positions are 1-based: span[1] is the first span.
     span_fields = (
         ("salary", "//*[@class='job_request']/p/span[1]/text()"),
         ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
         ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
         ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
         ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
     )
     for field, xpath in span_fields:
         loader.add_xpath(field, xpath)

     css_fields = (
         ("tags", '.position-label li::text'),
         ("publish_time", ".publish_time::text"),
         ("job_advantage", ".job-advantage p::text"),
         ("job_desc", ".job_bt div"),
         ("job_addr", ".work_addr"),
         ("company_name", "#job_company dt a div h2::text"),
         ("company_url", "#job_company dt a::attr(href)"),
     )
     for field, css in css_fields:
         loader.add_css(field, css)

     loader.add_value("crawl_time", datetime.now())
     return loader.load_item()
コード例 #7
0
    def parse_job(self, response):
        """Parse the job information page into a ``LagouJobItem``.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", '.job-name::attr(title)')
        # Splitting salary into min/max (and likewise the work-year range)
        # is deferred to the item.
        loader.add_css("salary", '.job_request .salary::text')

        span_fields = (
            ("job_city", '//*[@class="job_request"]/p/span[2]/text()'),
            ("work_years", '//*[@class="job_request"]/p/span[3]/text()'),
            ("degree_need", '//*[@class="job_request"]/p/span[4]/text()'),
            ("job_type", '//*[@class="job_request"]/p/span[5]/text()'),
        )
        for field, xpath in span_fields:
            loader.add_xpath(field, xpath)

        # The address is passed on as-is; its extraction is handled in the
        # item.
        css_fields = (
            ("tags", '.position-label li::text'),
            ("publish_time", '.publish_time::text'),
            ("job_advantage", '.job-advantage p::text'),
            ("job_desc", '.job_bt div'),
            ("job_address", '.work_addr'),
            ("company_name", '.job_company dt a img::attr(alt)'),
            ("company_url", '.job_company dt a::attr(href)'),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        loader.add_value("crawl_time", datetime.now())
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("url", response.url)

        return loader.load_item()
コード例 #8
0
 def parse_job(self, response):
     """Load the job fields found in ``response`` into a ``LagouJobItem``.

     Args:
         response: Response whose ``url`` is stored (raw and as an md5
             digest via ``get_md5``) and whose markup is queried below.

     Returns:
         Whatever the loader's ``load_item()`` produces.
     """
     loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

     loader.add_css("title", ".job-name::attr(title)")
     loader.add_value("url", response.url)
     loader.add_value("url_object_id", get_md5(response.url))
     loader.add_css("salary", ".job_request p span.salary::text")

     # Fields selected by span position under <dd class="job_request">.
     span_fields = (
         ("job_city", "//dd[@class='job_request']/p/span[2]/text()"),
         ("work_years", "//dd[@class='job_request']/p/span[3]/text()"),
         ("degree_need", "//dd[@class='job_request']/p/span[4]/text()"),
         ("job_type", "//dd[@class='job_request']/p/span[5]/text()"),
     )
     for field, xpath in span_fields:
         loader.add_xpath(field, xpath)

     css_fields = (
         ("publish_time", ".job_request p.publish_time::text"),
         ("job_advantage", ".job-advantage p::text"),
         ("job_desc", ".job_bt div p"),
         ("job_addr", ".work_addr"),
         ("tags", ".position-label.clearfix li::text"),
         ("company_name", ".job_company dt a img::attr(alt)"),
         ("company_url", ".job_company dt a::attr(href)"),
     )
     for field, css in css_fields:
         loader.add_css(field, css)

     loader.add_value("crawl_time", datetime.datetime.now())
     return loader.load_item()
コード例 #9
0
ファイル: lagou.py プロジェクト: hanxingyu/ArticleSpider
 def parse_job(self, response):
     """Parse a Lagou job posting into a ``LagouJobItem``.

     Only the ``title`` field is collected so far.

     Args:
         response: Response whose markup is queried for the job title.

     Returns:
         Whatever the loader's ``load_item()`` produces.
     """
     item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                      response=response)
     item_loader.add_css('title', '.job-name::attr(title)')
     # Return the loaded item, not the loader itself: a loader object is
     # neither an item nor a request.
     return item_loader.load_item()
コード例 #10
0
ファイル: lagou.py プロジェクト: evahere/JobSpider
    def parse_job(self, response):
        """Load a reduced set of job fields from ``response``.

        Only title, url, url_object_id, salary, job_desc and job_addr are
        collected; the remaining selectors are kept below as disabled
        comments.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_css("salary", ".job_request .salary::text")

        # Currently disabled fields:
        # loader.add_css("job_city", ".job_request span:nth-child(2)::text")
        # loader.add_css("work_years", ".job_request span:nth-child(3)::text")
        # loader.add_css("degree_need", ".job_request span:nth-child(4)::text")
        # loader.add_css("job_type", ".job_request span:nth-child(5)::text")
        # loader.add_css("tags", ".position-label li::text")
        # loader.add_css("publish_time", ".publish_time::text")
        # loader.add_css("job_advantage", ".job-advantage p::text")

        for field, css in (("job_desc", ".job-detail"),
                           ("job_addr", ".work_addr")):
            loader.add_css(field, css)

        # Currently disabled fields:
        # loader.add_css("company_name", "#job_company img::attr(alt)")
        # loader.add_xpath("company_url", "//*[@id='job_company']/dt/a/@href")
        # loader.add_value("crawl_time", datetime.now())

        return loader.load_item()
コード例 #11
0
    def parse_item(self, response):
        """Parse a Lagou job posting, print the loaded item and return it.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css('title', '.job-name span::text')
        loader.add_value('url', response.url)
        loader.add_value('url_obj_id', get_md5(response.url))
        loader.add_css('salary', '.job_request .salary ::text')

        # Fields selected by span position under <dd class="job_request">.
        span_fields = (
            ('job_city', '//dd[@class="job_request"]/p/span[2]/text()'),
            ('work_years', '//dd[@class="job_request"]/p/span[3]/text()'),
            ('degree_need', '//dd[@class="job_request"]/p/span[4]/text()'),
            ('job_type', '//dd[@class="job_request"]/p/span[5]/text()'),
        )
        for field, xpath in span_fields:
            loader.add_xpath(field, xpath)

        css_fields = (
            ('tags', '.position-label li::text'),
            ('publish_time', '.publish_time::text'),
            ('job_advantage', '.job-advantage p::text'),
            ('job_desc', '.job_bt div'),
            ('job_addr', '.work_addr'),
            ('company_url', '#job_company a::attr(href)'),
            ('company_name', '#job_company img::attr(alt)'),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        # Only the date part of the current timestamp is stored.
        loader.add_value('craw_time', datetime.datetime.now().date())
        loader.add_value('craw_update_time', '')

        job_item = loader.load_item()
        print(job_item)
        return job_item
コード例 #12
0
    def parse_job(self, response):
        """Load the job fields from ``response``, print the item, return it.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css('title', '.job-name::attr("title")')
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))

        css_fields = (
            ('salary', '.job_request .salary::text'),
            ('job_city',
             '.job_request > p:nth-child(1) > span:nth-child(2)::text'),
            ('work_years',
             '.job_request > p:nth-child(1) > span:nth-child(3)::text'),
            ('degree_need',
             '.job_request > p:nth-child(1) > span:nth-child(4)::text'),
            ('job_type',
             '.job_request > p:nth-child(1) > span:nth-child(5)::text'),
            ('publish_time', '.publish_time::text'),
            ('tags', '.position-label .labels::text'),
            ('job_advantage', '.job-advantage p::text'),
            ('job_desc', '.job_bt div'),
            ('job_addr', '.work_addr'),
            ('company_name', '.b2::attr("alt")'),
            ('company_url', '#job_company dt a::attr("href")'),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        # The crawl time is stored as a string in SQL_DATETIME_FORMAT.
        crawl_stamp = datetime.now().strftime(SQL_DATETIME_FORMAT)
        loader.add_value('crawl_time', crawl_stamp)

        loaded = loader.load_item()
        print('parse job 函数返回:', loaded)
        return loaded
コード例 #13
0
    def parse_job(self, response):
        """Parse a Lagou job posting into a ``LagouJobItem``.

        Parsing logic is kept minimal here; data cleaning is meant to live
        in the ItemLoader rather than in this callback.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        # CSS reminder: ".name" selects a class, "#name" selects an id.
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_value("url", response.url)
        # Same md5 usage as in jobbole4.
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_css("salary", ".job_request .salary::text")

        span_fields = (
            ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
            ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
            ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
            ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
        )
        for field, xpath in span_fields:
            loader.add_xpath(field, xpath)

        css_fields = (
            ("tags", ".position-label li::text"),
            # publish_time still needs to be converted to str and split.
            ("publish_time", ".publish_time::text"),
            ("job_advantage", ".job-advantage p::text"),
            ("job_desc", ".job_bt div"),
            ("job_addr", ".work_addr"),
            ("company_name", "#job_company dt a img::attr(alt)"),
            ("company_url", "#job_company a::attr(href)"),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        loader.add_value("crawl_time", datetime.now())
        return loader.load_item()
コード例 #14
0
    def parse_job(self, response):
        """Parse a Lagou job posting into a ``LagouJobItem``.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_css("salary", ".job_request .salary::text")

        # The next four fields are picked by span position, which is easier
        # to express in XPath.
        span_fields = (
            ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
            ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
            ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
            ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
        )
        for field, xpath in span_fields:
            loader.add_xpath(field, xpath)

        css_fields = (
            ("tags", '.position-label li::text'),
            ("publish_time", ".publish_time::text"),
            ("job_advantage", ".job-advantage p::text"),
            # The whole HTML of the description is extracted here.
            ("job_desc", ".job_bt div"),
            # Some address parts sit under <a>, so the text cannot be taken
            # directly; grab everything now and process it later.
            ("job_addr", ".work_addr"),
            # Note: job_company is an id, hence "#" rather than ".".
            ("company_name", "#job_company dt a img::attr(alt)"),
            ("company_url", "#job_company dt a::attr(href)"),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        loader.add_value("crawl_time", datetime.now())

        # TODO: should this be a yield instead?
        return loader.load_item()
コード例 #15
0
    def parse_job(self, response):
        """Load the job fields from ``response`` into a ``LagouJobItem``.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouItemLoader(item=LagouJobItem(), response=response)

        loader.add_css('title', '.job-name .name::text')
        loader.add_value('url', response.url)
        loader.add_value('url_md5', get_md5(response.url))
        loader.add_css('salary', '.job_request .salary::text')

        # Fields selected by span position under <dd class="job_request">.
        span_fields = (
            ('job_city', '//dd[@class="job_request"]/p/span[2]/text()'),
            ('work_years', '//dd[@class="job_request"]/p/span[3]/text()'),
            ('degree_need', '//dd[@class="job_request"]/p/span[4]/text()'),
            ('job_type', '//dd[@class="job_request"]/p/span[5]/text()'),
        )
        for field, xpath in span_fields:
            loader.add_xpath(field, xpath)

        # Without an ItemLoader, job_desc could instead be obtained with:
        # response.css('.job_bt')[0].xpath('string()').extract_first()
        css_fields = (
            ('publish_time', '.publish_time::text'),
            ('job_advantage', '.job-advantage p::text'),
            ('job_desc', '.job_bt div'),
            ('job_addr', '.work_addr'),
            ('company_name', '.job_company dt a img::attr(alt)'),
            ('company_url', '.job_company dt a::attr(href)'),
            ('tags', '.position-label li::text'),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        loader.add_value('crawl_time', datetime.datetime.now())

        return loader.load_item()
コード例 #16
0
ファイル: lagou.py プロジェクト: cMinzel-Z/Web-crawler
    def parse_job(self, response):
        """Parse a Lagou job posting into a ``LagouJobItem``.

        Only the job name heading is collected so far.

        Args:
            response: Response whose markup is queried for the job name.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        # add_css() takes the target field name first and the selector
        # second; the field name was missing, so the call raised TypeError.
        # NOTE(review): "title" is assumed to be the intended field for the
        # job-name heading -- confirm against LagouJobItem.
        item_loader.add_css("title", ".job-name h1::text")

        job_item = item_loader.load_item()

        return job_item
コード例 #17
0
ファイル: lagou.py プロジェクト: duyanyong2017/ArticleSpider
    def parse_job(self, response):
        """Load the job fields from ``response`` into a ``LagouJobItem``.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_value("url", response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        loader.add_css("salary", ".job_request .salary::text")

        # Fields selected by span position inside the job_request <h3>.
        span_fields = (
            ("job_city", "//*[@class='job_request']/h3/span[2]/text()"),
            ("work_years", "//*[@class='job_request']/h3/span[3]/text()"),
            ("degree_need", "//*[@class='job_request']/h3/span[4]/text()"),
            ("job_type", "//*[@class='job_request']/h3/span[5]/text()"),
        )
        for field, xpath in span_fields:
            loader.add_xpath(field, xpath)

        css_fields = (
            ("tags", '.position-label li::text'),
            ('publish_time', '.publish_time::text'),
            ('job_advantage', '.job-advantage p::text'),
            ('job_desc', '.job_bt div'),
            ('job_addr', '.work_addr'),
            ('company_name', '#job_company dt a img::attr(alt)'),
            ('company_url', '#job_company dt a::attr(href)'),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        # The crawl time is stored as a formatted string, not a datetime.
        crawl_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        loader.add_value('crawl_time', crawl_stamp)

        return loader.load_item()
コード例 #18
0
    def parse_item(self, response):
        """Load the job fields from ``response`` into a ``LagouJobItem``.

        Uses the plain ``ItemLoader`` with CSS selectors only.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        itemloader = ItemLoader(item=LagouJobItem(), response=response)
        itemloader.add_value("url", response.url)
        itemloader.add_value("url_object_id", get_md5(response.url))
        itemloader.add_css("title", ".job-name span::text")
        itemloader.add_css("salary", ".job_request span::text")
        itemloader.add_css("publish_time", "ul.position-label p::text")
        itemloader.add_css("tags", "ul.position li::text")
        itemloader.add_css("job_advantage", ".job-advantage p::text")
        itemloader.add_css("job_desc", ".job_bt p::text")
        itemloader.add_css("job_addr", ".job-address a::text")
        itemloader.add_css("company_url", ".job_company ul a::attr(href)")
        # The leading "." was missing, so the selector looked for a
        # <job_company> element and never matched; company_url above already
        # addresses the same container by class.
        itemloader.add_css("company_name", ".job_company dt a img::attr(alt)")
        itemloader.add_value("crawl_time", datetime.datetime.now())
        job_item = itemloader.load_item()
        return job_item
コード例 #19
0
    def parse(self, response):
        """Load the job fields from ``response`` into a ``LagouJobItem``.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_value("url", response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        loader.add_css("salary", ".job_request .salary::text")

        # Fields selected by span position inside the job_request <h3>.
        span_fields = (
            ("job_city", "//*[@class='job_request']/h3/span[2]/text()"),
            ("work_years", "//*[@class='job_request']/h3/span[3]/text()"),
            ("degree_need", "//*[@class='job_request']/h3/span[4]/text()"),
            ("job_type", "//*[@class='job_request']/h3/span[5]/text()"),
        )
        for field, xpath in span_fields:
            loader.add_xpath(field, xpath)

        css_fields = (
            ("tags", '.position-label li::text'),
            ('publish_time', '.publish_time::text'),
            ('job_advantage', '.job-advantage p::text'),
            ('job_desc', '.job_bt div'),
            ('job_addr', '.work_addr'),
            ('company_name', '#job_company dt a img::attr(alt)'),
            ('company_url', '#job_company dt a::attr(href)'),
        )
        for field, css in css_fields:
            loader.add_css(field, css)

        # The crawl time is stored as a formatted string, not a datetime.
        crawl_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        loader.add_value('crawl_time', crawl_stamp)

        return loader.load_item()
コード例 #20
0
ファイル: lagou.py プロジェクト: sunzhongyuan/ArticleSpider
    def parse_job(self, response):
        """Load the job fields from ``response`` into a ``LagouJobItem``.

        Args:
            response: Response whose ``url`` is stored (raw and as an md5
                digest via ``get_md5``) and whose markup is queried below.

        Returns:
            Whatever the loader's ``load_item()`` produces.
        """
        # Pass the arguments by keyword: ItemLoader's second positional
        # parameter is ``selector``, so a positional ``response`` was bound
        # to the wrong parameter and the loader context had no response.
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_value('url', response.url)
        # NOTE(review): 'url_object_url' and 'work_year' below look like they
        # may be meant as 'url_object_id' / 'work_years' -- verify against
        # the fields declared on LagouJobItem.
        item_loader.add_value('url_object_url', get_md5(response.url))
        item_loader.add_css('title', '.job-name::attr(title)')
        item_loader.add_css('salary', '.job_request span.salary::text')
        # The XPath equivalent for picking the second span would be
        # '//*[@class="job_request"]/p/span[2]/text()'.
        item_loader.add_css('job_city', '.job_request span:nth-child(2)::text')
        item_loader.add_css('work_year', '.job_request span:nth-child(3)::text')
        item_loader.add_css('degree_need', '.job_request span:nth-child(4)::text')
        item_loader.add_css('job_type', '.job_request span:nth-child(5)::text')
        item_loader.add_css('publish_time', '.publish_time::text')
        item_loader.add_css('tags', '.position-label li::text')
        item_loader.add_css('job_advantage', '.job-advantage p::text')
        item_loader.add_css('job_desc', '.job_bt div')
        # Address text comes from the links (except #mapPreview) plus the
        # container's own text nodes.
        item_loader.add_css('job_addr', '.work_addr a:not(#mapPreview)::text, .work_addr::text')
        item_loader.add_css('company_url', '.job_company dt a::attr(href)')
        item_loader.add_css('company_name', '.job_company dt a img::attr(alt)')
        item_loader.add_value('crawl_time', datetime.datetime.now())

        job_item = item_loader.load_item()
        return job_item