Example #1
0
    def parse_content(response):
        """Build and return a LagouJobItem scraped from a Lagou job-detail page."""
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        # Values taken directly from the response / crawl context.
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("crawl_time", datetime.now())

        # CSS extraction rules, applied uniformly.
        for field, selector in (
            ("title", ".job-name::attr(title)"),
            ("salary_min", ".job_request .salary::text"),
            ("tags", '.position-label li::text'),
            ("publish_time", ".publish_time::text"),
            ("job_advantage", ".job-advantage p::text"),
            ("job_desc", ".job_bt div"),
            ("job_addr", ".work_addr"),
            ("company_name", "#job_company dt a img::attr(alt)"),
            ("company_url", "#job_company dt a::attr(href)"),
        ):
            loader.add_css(field, selector)

        # Requirement fields live in the job_request header row, addressed by
        # span position via XPath.
        for field, xpath in (
            ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
            ("work_years_min", "//*[@class='job_request']/p/span[3]/text()"),
            ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
            ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
        ):
            loader.add_xpath(field, xpath)

        return loader.load_item()
    def parse_content(response):
        """Yield a JobboleBlogItem populated from a Jobbole article page."""
        # Cover-image URL is forwarded by the caller via the request meta.
        cover_image_url = response.meta.get("front_image_url", "")

        loader = JobboleBlogItemLoader(item=JobboleBlogItem(),
                                       response=response)

        # Values known without parsing the page body.
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("front_image_url", [cover_image_url])

        # CSS extraction rules for the article page, applied uniformly.
        for field, selector in (
            ("title", ".entry-header h1::text"),
            ("create_date", "p.entry-meta-hide-on-mobile::text"),
            ("praise_nums", ".vote-post-up h10::text"),
            ("comment_nums", "a[href='#article-comment'] span::text"),
            ("fav_nums", ".bookmark-btn::text"),
            ("tags", "p.entry-meta-hide-on-mobile a::text"),
            ("content", "div.entry"),
        ):
            loader.add_css(field, selector)

        # load_item() runs the loader's processors and fills the item, which is
        # then yielded downstream to the pipelines.
        yield loader.load_item()
    def parse_question(self, response):
        """Parse a zhihu question page into a ZhihuQuestionItem.

        Supports both the new page layout (detected via the
        "QuestionHeader-title" class) and the legacy layout, then kicks off
        the first answer-API request for the question.
        """
        # The question id is embedded in the URL for both layouts, so extract
        # it exactly once up front.  Default to None so an unexpected URL
        # cannot raise NameError below (the original code left question_id
        # unbound when the regex failed).
        question_id = None
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))

        if "QuestionHeader-title" in response.text:
            # New page layout.
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)

            item_loader.add_value("url_object_id", get_md5(response.url))
            item_loader.add_value("question_id", question_id)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_xpath(
                "content",
                "//*[@id='root']/div/main/div/div[1]/div[2]"
                "/div[1]/div[1]/div[2]/div/div/div/span/text()",
            )
            item_loader.add_css(
                "topics", ".QuestionHeader-topics .Tag.QuestionTopic .Popover div::text"
            )
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
            # watch_user_num holds both watch and click counts; they are
            # separated later during data cleaning.
            item_loader.add_css("watch_user_num", ".NumberBoard-itemValue ::text")
            item_loader.add_value("url", response.url)
            question_item = item_loader.load_item()
        else:
            # Legacy layout (probably no longer served; kept as a safety net).
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_xpath(
                "title",
                "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()",
            )
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css(
                "comments_num", "#zh-question-meta-wrap a[name='addcomment']::text"
            )
            item_loader.add_xpath(
                "watch_user_num",
                "//*[@id='zh-question-side-header-wrap']/text()|"
                "//*[@class='zh-question-followers-sidebar']/div/a/strong/text()",
            )
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()
        # Request the first page of answers from the backend answer API.
        yield scrapy.Request(
            self.start_answer_url.format(question_id, 20, 0),
            headers=self.headers,
            callback=self.parse_answer,
        )
        yield question_item
Example #4
0
    def parse_content(response):
        """Build and return a ZhilianItem scraped from a Zhaopin job-detail page."""
        loader = ZhilianItemLoader(item=ZhilianItem(), response=response)

        # Values taken directly from the response / crawl context.
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("crawl_time", datetime.now())

        # CSS extraction rules, applied uniformly.
        for field, selector in (
            ("title", "div.main1-stat h1.info-h3::text"),
            ("salary_min", "div.main1-stat div.info-money>strong::text"),
            ("job_city", "div.main1-stat .info-three>span:nth-child(1)>a::text"),
            ("work_years_min", "div.main1-stat .info-three>span:nth-child(2)::text"),
            ("degree_need", "div.main1-stat .info-three>span:nth-child(3)::text"),
            ("job_desc", "div.responsibility > div.pos-ul"),
            ("job_addr", "div.work-add > p.add-txt::text"),
            ("company_name", "div.company > a::text"),
            ("company_url", "div.company > a::attr(href)"),
        ):
            loader.add_css(field, selector)

        # job_advantage is embedded in a script tag rather than visible markup.
        loader.add_xpath("job_advantage", "//script[1]")

        return loader.load_item()
Example #5
0
    def parse_content(response):
        """Build and return a LagouJobItem scraped from a Lagou job-detail page."""
        loader = LagouJobItemLoader(item=LagouJobItem(),
                                    response=response)

        # Values taken directly from the response / crawl context.
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("crawl_time", datetime.now())

        # CSS extraction rules, applied uniformly.
        for field, selector in (
            ("title", ".job-name::attr(title)"),
            ("tags", ".position-label li::text"),
            ("publish_time", ".publish_time::text"),
            ("job_advantage", ".job-advantage p::text"),
            ("job_desc", ".job_bt div"),
            ("job_addr", ".work_addr"),
            ("company_name", "#job_company dt a img::attr(alt)"),
            ("company_url", "#job_company dt a::attr(href)"),
        ):
            loader.add_css(field, selector)

        return loader.load_item()
    def parse_content(response):
        """Build and return a W51JobItem scraped from a 51job job-detail page."""
        loader = W51JobItemLoader(item=W51JobItem(), response=response)

        # Values taken directly from the response / crawl context.
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_value("crawl_time", datetime.now())

        # CSS extraction rules, applied uniformly.
        for field, selector in (
            ("title", "div.cn > h1::text"),
            ("salary_min", "div.cn > strong::text"),
            ("job_city", "p.msg.ltype::text"),
            ("job_advantage", "div.tHeader.tHjob div.t1>.sp4::text"),
            ("job_desc", "div.tCompany_main div.job_msg"),
            ("company_name", "p.cname>a.catn::attr(title)"),
            ("company_url", "p.cname>a.catn::attr(href)"),
        ):
            loader.add_css(field, selector)

        # The address block has no usable class hook, so address it by
        # position with XPath.
        loader.add_xpath(
            "job_addr",
            "//*[@class='tBorderTop_box'][2]/div/p[@class='fp']/text()")

        return loader.load_item()
    def parse_answer(self, response):
        """Parse one page of the zhihu answer-list API response.

        Yields a detail-page request per answer (carrying the extracted
        fields in the request meta) and, while the API reports more pages,
        a follow-up request for the next page.
        """
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]
        next_url = ans_json["paging"]["next"]

        # Extract the fields of interest from every answer on this page.
        for answer in ans_json["data"]:
            url_object_id = get_md5(url=answer["url"])
            answer_id = answer["id"]
            question_id = answer["question"]["id"]
            author = answer["author"]
            # Anonymous authors may lack id/name, and "excerpt" may be
            # absent; dict.get handles both without membership tests.
            author_id = author.get("id")
            author_name = author.get("name")
            content = answer.get("excerpt", "")
            really_url = "https://www.zhihu.com/question/{0}/answer/{1}".format(
                question_id, answer_id
            )
            create_time = answer["created_time"]
            updated_time = answer["updated_time"]

            yield scrapy.Request(
                really_url,
                headers=self.headers,
                callback=self.parse_answer_end,
                meta={
                    "url_object_id": url_object_id,
                    "answer_id": answer_id,
                    "question_id": question_id,
                    "author_id": author_id,
                    "author_name": author_name,
                    "content": content,
                    "create_time": create_time,
                    "updated_time": updated_time,
                },
            )
        # Follow pagination until the API signals the final page.
        if not is_end:
            yield scrapy.Request(
                next_url, headers=self.headers, callback=self.parse_answer
            )