def parse_detail(self, response):
        # Instantiate the item
        article_item = JobBoleArticleItem()

        # Load the item via an item loader
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        if "/Index/newslist" in response.url or "/index" in response.url:
            # Parse the fields below with the CSS selector rules.
            item_loader.add_css("title", ".article-title::text")
            item_loader.add_value("url", response.url)
            item_loader.add_value("url_object_id", get_md5(response.url))
            item_loader.add_css("create_date", "#date-topic::text")
            item_loader.add_css("content", ".article-content")

        else:
            item_loader.add_css("title", "title::text")
            item_loader.add_value("url", response.url)
            item_loader.add_value("url_object_id", get_md5(response.url))
            item_loader.add_css("create_date", "title::text")
            item_loader.add_css("content", "body::text")

        # load_item() applies the rules above and builds the item object
        article_item = item_loader.load_item()
        # Values are filled in; yield the item to the pipeline
        yield article_item
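All of these examples fingerprint the page URL with a get_md5 helper before storing it as url_object_id. A minimal sketch of such a helper, assuming a typical utils implementation (the real module is not shown here):

import hashlib

def get_md5(url):
    # hash the URL to a fixed-length hex digest usable as a primary key
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()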
Example #2
    def parse_detail(self, response):
        article_item = JobBoleArticleItem()

        # Extract the fields with CSS selectors
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        title = response.css(".entry-header h1::text").extract()[0]
        create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·","").strip()
        praise_nums = response.css(".vote-post-up h10::text").extract()[0]
        fav_nums = response.css(".bookmark-btn::text").extract()[0]
        match_re = re.match(".*?(\d+).*", fav_nums)
        if match_re:
            fav_nums = int(match_re.group(1))
        else:
            fav_nums = 0

        comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
        match_re = re.match(".*?(\d+).*", comment_nums)
        if match_re:
            comment_nums = int(match_re.group(1))
        else:
            comment_nums = 0

        content = response.css("div.entry").extract()[0]

        tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        tags = ",".join(tag_list)

        article_item["url_object_id"] = get_md5(response.url)
        article_item["title"] = title
        article_item["url"] = response.url
        try:
            create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        except Exception as e:
            create_date = datetime.datetime.now().date()
        article_item["create_date"] = create_date
        article_item["front_image_url"] = [front_image_url]
        article_item["praise_nums"] = praise_nums
        article_item["comment_nums"] = comment_nums
        article_item["fav_nums"] = fav_nums
        article_item["tags"] = tags
        article_item["content"] = content

        # Load the item via an item loader
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
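The loader-based variant relies on ArticleItemLoader to collapse each list returned by add_css/add_value into a single value, which is why no extract_first calls appear in it. A minimal sketch of such a loader, assuming it only overrides the default output processor (the project's real items module is not shown here):

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class ArticleItemLoader(ItemLoader):
    # reduce every extracted list to its first element
    default_output_processor = TakeFirst()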
Example #3
    def parse_detail(self, response):
        article_item = CnBlogsArticleItem()

        title = response.css("#news_title a::text").extract_first()

        create_date = response.css(".time::text").extract_first()
        # create_time = response.css(".time::text").extract_first()

        match_pattern = r".*(\d{4}-\d{2}-\d{2}).*?(\d+:\d+)"
        match_re = re.match(match_pattern, create_date)
        if match_re:
            create_date = match_re.group(1)
            # create_time = match_re.group(2)

        content = response.css('#news_body').extract_first()

        article_item["url_object_id"] = get_md5(response.url)
        article_item["title"] = title
        article_item["url"] = response.url
        try:
            create_date = datetime.datetime.strptime(create_date,
                                                     "%Y-%m-%d").date()
        except Exception as e:
            create_date = datetime.datetime.now().date()

        article_item["create_date"] = create_date
        # article_item["create_time"] = create_time
        article_item["content"] = content

        yield article_item
Example #4
    def parse_job(self, response):
        item_load = LagouJobItemLoader(item=LagouJobItem(), response=response)
        item_load.add_value("url", response.url)
        item_load.add_value("url_object_id", get_md5(response.url))
        item_load.add_css("title", "div.job-name::attr(title)")
        item_load.add_css("salary", ".salary::text")
        item_load.add_xpath("job_city",
                            "//*[@class='job_request']/p/span[2]/text()")
        item_load.add_xpath("work_years",
                            "//*[@class='job_request']/p/span[3]/text()")
        item_load.add_xpath("degree_need",
                            "//*[@class='job_request']/p/span[4]/text()")
        item_load.add_xpath("job_type",
                            "//*[@class='job_request']/p/span[5]/text()")
        item_load.add_css("pulish_time", ".publish_time::text")
        item_load.add_xpath("tags",
                            "//*[@class='position-label clearfix']/li/text()")
        item_load.add_xpath("job_advantage",
                            "//*[@class='job-advantage']/p/text()")
        item_load.add_xpath("job_desc", "//*[@class='job_bt']/div")
        item_load.add_xpath("job_addr", "//*[@class='work_addr']/a/text()")
        item_load.add_xpath("company_url",
                            "//*[@class='c_feature']/li/a/@title")
        item_load.add_css("company_name", ".job_company dt img::attr(alt)")
        item_load.add_value("crawl_time", datetime.datetime.now())
        item_load.add_value("crawl_update_time", datetime.datetime.now())

        lagou_item = item_load.load_item()
        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        return lagou_item
Example #5
    def parse_detail(self, response):
        try:
            # Use the crawler stats API to record a successful detail-page request
            self.crawler.stats.inc_value("ArticleDetail_Success_Reqeust")
        except Exception as e:
            _ = e
        article_item = JobBoleArticleItem()

        # Load the item via an item loader
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
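The hand-written examples parse create_date with strptime and fall back to today's date on failure; with an ItemLoader the same cleanup can live in the field's input processor. A hedged sketch of what such a field definition could look like (field names follow the examples, but the project's actual items module is not shown):

import datetime

import scrapy
from scrapy.loader.processors import MapCompose

def parse_create_date(value):
    # "2017/03/18 ·" -> date; fall back to today if the format does not match
    try:
        return datetime.datetime.strptime(value.strip().replace("·", "").strip(),
                                          "%Y/%m/%d").date()
    except ValueError:
        return datetime.datetime.now().date()

class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    create_date = scrapy.Field(input_processor=MapCompose(parse_create_date))
    # ... remaining fields omitted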
Example #6
    def parse_job(self, response):
        item_loader = ArticleItemLoader(item=LaGouItem(),
                                        response=response)
        item_loader.add_css("job_name", '.job-name::attr(title)')
        item_loader.add_css("salary", ".salary::text")
        item_loader.add_xpath(
            "job_exp", "//dd[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath(
            "edu", "//dd[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath(
            "job_type", "//dd[@class='job_request']/p/span[5]/text()")
        item_loader.add_xpath(
            "work_city", "//dd[@class='job_request']/p/span[2]/text()")
        item_loader.add_css("company_name", "#job_company .b2::attr(alt)")
        item_loader.add_css("company_url",
                            ".job_company dt a::attr(href)")
        item_loader.add_css("work_addr", ".work_addr")
        #item_loader.add_xpath("feedback","//div[@class='publisher_data']/div[2]/span[@class='tip']/i/text()")
        item_loader.add_css("create_date", ".publish_time::text")
        item_loader.add_value("job_url", response.url)
        item_loader.add_value("job_url_id", get_md5(response.url))
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("tag", ".position-label li")
        job_item = item_loader.load_item()

        return job_item
Example #7
    def parse_job(self, response):
        # Parse a Lagou job posting

        # i = {}
        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()

        # Keep processing logic here to a minimum; data cleaning belongs in the ItemLoader (css: .class, #id)
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id",
                              get_md5(response.url))  # 参考jobbole4中的md5用法
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']/p/span[5]/text()")
        item_loader.add_css("tags", ".position-label li::text")
        item_loader.add_css(
            "publish_time",
            ".publish_time::text")  # need to convert str and split
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()
        return job_item
Example #8
    def parse_detail(self, response):
        article_item = response.meta.get("article_item","")
        re_url = response.url
        re_title = response.xpath('//div[@id="left_content_pages"]/h1[@class="contents_header"]/a/text()').extract_first("")
        re_info = response.xpath('//div[@id="left_content_pages"]/div[@class="contents_info"]//text()').extract()
        re_info = ''.join(re_info)
        if re.search(r'作者:(.*?)来源',re_info):
            re_author = re.search(r'作者:(.*?)来源',re_info).group().replace('作者:','').replace('来源','').split()[0]
            re_source = re.search(r'来源:(.*?)发布时间', re_info).group().replace('来源:', '').replace('发布时间', '').split()[0]
            release_time = re.search(r'发布时间:(.*?)阅读', re_info).group().replace('发布时间:', '').replace('阅读', '').split()[0]
            re_read = re.search(r'阅读:(.*?)推荐', re_info).group().replace('阅读:', '').replace('推荐', '').split()[0]
            re_recommend = response.xpath('//div[@id="btnDetailDigg"]/span/text()').extract()[0]
        else:
            re_author = re.search(r'作者:(.*?)发布时间', re_info).group().replace('作者:', '').replace('发布时间', '').split()[0]
            re_source = ""
            release_time = re.search(r'发布时间:(.*?)阅读', re_info).group().replace('发布时间:', '').replace('阅读', '').split()[0]
            re_read = re.search(r'阅读:(.*?)推荐', re_info).group().replace('阅读:', '').replace('推荐', '').split()[0]
            re_recommend = response.xpath('//div[@id="btnDetailDigg"]/span/text()').extract()[0]

        re_text = response.xpath('//div[@id="ArticleCnt"]//text()').extract()
        re_images_url = response.xpath('//div[@id="ArticleCnt"]/p/img/@src').extract_first("")

        article_item["re_title"] = re_title
        article_item["re_url"] = re_url
        article_item["re_author"] = re_author
        article_item["re_source"] = re_source
        article_item["release_time"] = release_time
        article_item["re_read"] = re_read
        article_item["re_text"] = "".join(re_text)
        article_item["url_object_id"] = common.get_md5(re_url)
        article_item["re_images_url"] = [re_images_url]
        article_item["re_recommend"] = re_recommend
        # print(re_url,re_info)
        yield article_item
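The search/replace/split chain repeated above can be factored into one small helper; the sketch below is illustrative only and not part of the original spider:

import re

def field_between(text, start, end):
    # return the first whitespace-delimited token between two labels, or ""
    match = re.search(re.escape(start) + r"(.*?)" + re.escape(end), text)
    if not match:
        return ""
    tokens = match.group(1).split()
    return tokens[0] if tokens else ""

# e.g. re_author = field_between(re_info, "作者:", "来源")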
Example #9
    def parse_detail(self, response):
        article_item = LvChaSoftItem()

        # Load the item via an item loader
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=LvChaSoftItem(),
                                        response=response)
        item_loader.add_xpath("title", "//div[@id='soft_title']/text()")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_xpath("type", "//*[@id='main1k']/div[3]/a[3]/text()")
        item_loader.add_xpath("size", "//em[@id='ljdx']/text()")
        item_loader.add_xpath(
            "update_time",
            "//*[@id='main1k']/div[4]/div[2]/div[2]/div[1]/p[6]/em/text()")
        item_loader.add_xpath("content", "//*[@class='rjjsbox']/p/text()")
        item_loader.add_xpath("tag",
                              "//*[@class='fllist clearfix']/p[4]/em/text()")
        item_loader.add_xpath("fav_nums",
                              "//*[@class='fllist clearfix']/p[5]/em/@class")
        item_loader.add_xpath(
            "download_urls", "//*[@class='clearfix count_down']/dd/a[1]/@href")

        article_item = item_loader.load_item()
        yield article_item
Example #10
    def detail(self, response):
        # item = ArticlespiderItem()
        # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first()
        # pubtime = response.xpath('//*[@id="post-113778"]/div[2]/p/text()').extract_first().strip().split(" ")[0]
        # tag_list = response.xpath('//*[@id="post-113778"]/div[2]/p/a/text()').extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = "·".join(tag_list)
        # print("xpath选择器", title, pubtime, tags)
        #
        # title = response.css('div.entry-header h1::text').extract_first()
        # pubtime = response.css('.entry-meta-hide-on-mobile::text').extract()[0].strip().split(" ")[0]
        # try:
        #     pubtime = datetime.datetime.strptime(pubtime, '%Y/%m/%d').date()
        # except:
        #     pubtime = datetime.datetime.now().date()
        # tag_list = response.css('.entry-meta-hide-on-mobile a::text').extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = "·".join(tag_list)
        # # print("css  选择器", title, pubtime, tags)
        #
        # all_num = response.css('.post-adds')
        # praise_num = all_num.css('#113778votetotal::text').extract_first(0)
        # fav_num = all_num.css('.bookmark-btn::text').extract_first("0")
        # match_re = re.match('.*?(\d+).*', fav_num)
        # if match_re:
        #     fav_num = int(match_re.group(1))
        # else:
        #     fav_num = 0
        # comment_num = all_num.css('.fa.fa-comments-o::text').extract_first(0)
        # content = response.css('.entry p::text').extract()
        # contents = "__".join([element for element in content if element])
        # item['title'] = title
        # item['pubtime'] = pubtime
        # item['tags'] = tags
        # item['praise_num'] = praise_num
        # item['fav_num'] = fav_num
        # item['comment_num'] = comment_num
        # item['contents'] = contents
        # item['image_urls'] = response.meta.get('image_urls', "")
        # item['url_object_id'] = get_md5(response.url)
        # item['url'] = response.url

        # ItemLoader
        item_loader = ArticleItemLoder(item=ArticlespiderItem(),
                                       response=response)
        item_loader.add_css('title', 'div.entry-header h1::text')
        item_loader.add_css('pubtime', '.entry-meta-hide-on-mobile::text')
        item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
        item_loader.add_css('praise_num', '.post-adds span h10::text')
        item_loader.add_css('fav_num', '.bookmark-btn::text')
        item_loader.add_css('comment_num',
                            'a[href="#article-comment"] span::text')
        item_loader.add_css('content', 'div.entry p::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_value('image_urls',
                              [response.meta.get("image_urls", '')])

        article_item = item_loader.load_item()
        return article_item
Example #11
    def parse_job(self, response):
        i = {}
        item_loader = LagouItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_xpath('title', '//div[@class="job-name"]/@title')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(url=response.url))
        item_loader.add_xpath('salary', '//span[@class="salary"]/text()')
        item_loader.add_xpath('job_city',
                              '//*[@class="job_request"]/p/span[2]/text()')
        item_loader.add_xpath('work_years',
                              '//*[@class="job_request"]/p/span[3]/text()')
        item_loader.add_xpath('degree_need',
                              '//*[@class="job_request"]/p/span[4]/text()')
        item_loader.add_xpath('job_type',
                              '//*[@class="job_request"]/p/span[5]/text()')
        item_loader.add_xpath('publish_time',
                              '//*[@class="publish_time"]/text()')
        item_loader.add_css('tags', '.position-label li::text')
        item_loader.add_xpath('job_advantage',
                              '//*[@class="job-advantage"]/p/text()')
        item_loader.add_xpath('job_desc', '//*[@class="job_bt"]/div')
        item_loader.add_xpath('company_url',
                              '//*[@class="job_company"]/dt/a/@href')
        item_loader.add_xpath('company_name',
                              '//*[@class="job_company"]/dt/a/img/@alt')

        job_item = item_loader.load_item()

        return job_item
Example #12
    def parse_job(self, response):
        # Parse a Lagou job posting
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_css("title", "")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text")
        item_loader.add_css("work_years",
                            ".job_request p span:nth-child(3)::text"
                            )  # 这里使用css ,是为了在学习时,熟悉css选择器用法
        item_loader.add_xpath("degree_need",
                              "//dd[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//dd[@class='job_request']/p/span[5]/text()")

        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("tags", ".position-label.clearfix li::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_value("crawl_time", datetime.datetime.now())
        # item_loader.add_css("crawl_update_time", datetime.datetime.now())

        # assigned to a variable first for debugging convenience and readability,
        # rather than returning the result directly
        job_item = item_loader.load_item()

        return job_item
Example #13
    def parse_detail(self, response):
        article_item = JobBoleArticleItem()

        # Extract the article details
        # xpath parsing
        # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract()[0]
        # create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract()[0].strip().replace('·',                                                                                                           '').strip()
        # praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0]
        # fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0]
        # fav_nums = response.css("span.bookmark-btn::text").extract()[0]
        # match_re = re.match(r".*(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = match_re.group(1)
        # comment_num = response.xpath("//a[@href='#article-comment']/span").extract()[0]
        # match_re = re.match(r".*(\d+).*", comment_num)
        # if match_re:
        #     comment_num = match_re.group(1)
        # contetn = response.xpath("//div[@class='entry']").extract()[0]
        # print(contetn)

        # CSS selectors
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        title = response.css(".entry-header h1::text").extract_first()
        create_date = response.css(".entry-meta-hide-on-mobile::text").extract()[0].strip().replace('·', '').strip()
        praise_nums = response.css(".vote-post-up h10::text").extract_first().strip()
        fav_nums = response.css("span.bookmark-btn::text").extract_first()
        match_re = re.match(r".*(\d+).*", fav_nums)
        if match_re:
            fav_nums = int(match_re.group(1))
        else:
            fav_nums = 0

        comment_nums = response.css('a[href="#article-comment"] span::text').extract_first()
        match_re = re.match(r".*(\d+).*", comment_nums)
        if match_re:
            comment_nums = int(match_re.group(1))
        else:
            comment_nums = 0

        content = response.css("div.entry").extract_first()
        tags = response.css("p.entry-meta-hide-on-mobile a::text").extract_first()
        tag_list = [element for element in tags if not element.strip().endswith('\0')]
        tag_list = ",".join(tag_list)

        article_item["url_object_id"] = get_md5(response.url)
        article_item["title"] = title
        article_item["url"] = response.url
        try:
            create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        except Exception as e:
            create_date = datetime.datetime.now().date()
        article_item["create_date"] = create_date
        article_item["front_image_url"] = [front_image_url]
        article_item["praise_nums"] = praise_nums
        article_item["fav_nums"] = fav_nums
        article_item["comment_nums"] = comment_nums
        article_item["tags"] = tags
        article_item["content"] = content

        yield article_item  # send to pipelines.py
Example #14
    def parse_detail(self, response):

        # create an article_item instance
        article_item = JobBoleArticleItem()

        # use an item loader to fill in every field, then send the item to the pipeline
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        # assign the loaded values to the item
        article_item = item_loader.load_item()

        yield article_item
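front_image_url is consistently wrapped in a one-element list because Scrapy's ImagesPipeline expects an iterable of URLs in the configured field. A minimal settings sketch for that wiring (the storage directory is an assumption):

# settings.py
ITEM_PIPELINES = {
    "scrapy.pipelines.images.ImagesPipeline": 1,
}
IMAGES_URLS_FIELD = "front_image_url"  # item field the pipeline reads URLs from
IMAGES_STORE = "images"                # local directory for downloaded covers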
Example #15
    def parse_details(self, response):
        news = ArticlespiderItem()

        img_url = response.meta.get("img_url", "")
        title = response.css(".entry-header h1::text").extract_first("")
        create_date = response.css(".entry-meta-hide-on-mobile::text").extract_first("").replace(" ·", "").strip()
        praise_num_str = response.css("#114676votetotal::text").extract_first("")
        if praise_num_str:
            praise_num = int(praise_num_str)
        else:
            praise_num = 0
        content = response.css(".entry p::text").extract()[0]

        news['title'] = title
        news['datetime'] = create_date
        news['praise_num'] = praise_num
        news['content'] = content
        news['img_url'] = [img_url]
        news['url_object_id'] = get_md5(img_url)


        # Unfinished ItemLoader variant of the manual extraction above; note the
        # item must be instantiated and each call needs a field name and a rule.
        item_loader = ItemLoader(item=ArticlespiderItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("content", ".entry p::text")
        item_loader.add_value("img_url", [img_url])

        yield news
Example #16
    def parse_detail(self, response):
        match_re = re.match(".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)

            article_item = JobBoleArticleItem()
            title = response.css("#news_title a::text").extract_first("")
            create_date = response.css("#news_info .time::text").extract_first(
                "")
            match_re = re.match(".*?(\d+.*)", create_date)
            if match_re:
                create_date = match_re.group(1)
            content = response.css("#news_content").extract()[0]
            tag_list = response.css(".news_tags a::text").extract()
            tags = ",".join(tag_list)
            article_item["title"] = title
            article_item["create_date"] = create_date
            article_item["content"] = content
            article_item["tags"] = tags
            article_item["url"] = response.url
            if response.meta.get("front_image_url", ""):
                article_item["front_image_url"] = [
                    response.meta.get("front_image_url", "")
                ]
            else:
                article_item["front_image_url"] = []

            article_item["url_object_id"] = common.get_md5(response.url)

            yield article_item
Example #17
    def parse_job(self, response):
        # Parse a Lagou job posting
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']/p/span[5]/text()")

        item_loader.add_xpath("tags", '//li[@class="labels"]/text()')
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()

        return job_item
Example #18
    def parse_nums(self, response):
        # j_data = json.loads(response.text)
        page_source = response.text
        j_str = remove_tags(page_source)
        j_data = json.loads(j_str)
        # article_item = response.meta.get("article_item", "")
        item_loader = response.meta.get('article_item', '')

        # praise_nums = j_data["DiggCount"]
        # fav_nums = j_data["TotalView"]
        # comment_nums = j_data["CommentCount"]
        #
        # article_item["praise_nums"] = praise_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["comment_nums"] = comment_nums
        # article_item["url_object_id"] = common.get_md5(article_item["url"])

        item_loader.add_value('praise_nums', j_data['DiggCount'])
        item_loader.add_value('fav_nums', j_data['TotalView'])
        item_loader.add_value('comment_nums', j_data['CommentCount'])
        item_loader.add_value('url_object_id',
                              common.get_md5(response.meta.get('url', '')))

        article_item = item_loader.load_item()

        yield article_item
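parse_nums above receives a partially filled ItemLoader through response.meta and only finishes it once the AJAX counters arrive. A hedged sketch of the parse_detail side that would set this up (the AJAX URL format follows Example #19; selectors, names and the spider's module-level imports of re, scrapy.Request, ArticleItemLoader and JobBoleArticleItem are assumptions):

    def parse_detail(self, response):
        # fill the fields that are available in the HTML itself
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", "#news_title a::text")
        item_loader.add_css("content", "#news_content")
        item_loader.add_value("url", response.url)

        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            ajax_url = ("https://news.cnblogs.com/NewsAjax/GetAjaxNewsInfo"
                        "?contentId={}".format(match_re.group(1)))
            # pass the half-built loader along so parse_nums can finish it
            yield Request(url=ajax_url,
                          meta={"article_item": item_loader, "url": response.url},
                          callback=self.parse_nums)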
Example #19
    def parse_detail(self, response):
        matches = re.match(".*/(?P<contentId>[0-9]+)", response.url)
        if matches:
            article_item = CnblogsArticleItem()
            title = response.xpath('//div[@id="news_title"]/a/text()').extract_first("")
            news_info_node = response.xpath('//div[@id="news_info"]')
            create_time = news_info_node.xpath('span[@class="time"]/text()').extract_first("")
            matches2 = re.match(r".*?(?P<create_time>[0-9:-]+[\s]*?[0-9:-]+).*", create_time)
            # .*?(?P<create_time>\d.*)
            # .*?(?P<create_time>[0-9:-]+[\s]*?[0-9:-]+).*
            if matches2:
                create_time = matches2["create_time"]
            content = response.xpath('//*[@id="news_content"]').extract_first("")

            tag_list = response.xpath('//*[@id="news_more_info"]/div[@class="news_tags"]/a/text()').extract()
            tags = ",".join(tag_list)

            article_item["title"] = title
            article_item["create_time"] = create_time
            article_item["content"] = content
            article_item["tags"] = tags
            article_item["url"] = response.url
            article_item["url_object_id"] = common.get_md5(response.url)
            article_item["front_image_url"] = response.meta.get("front_image_url", "")

            contentId = matches["contentId"]
            url = parse.urljoin(response.url, "https://news.cnblogs.com/NewsAjax/GetAjaxNewsInfo?contentId={}".format(contentId))
            yield Request(url=url, meta={"article_item": article_item}, callback=self.parse_news_info)
Example #20
    def parse_job(self, response):
        # Parse a Lagou job posting
        item_loader = LaGouJobItemLoader(item=LaGouJobItem(), response=response)
        item_loader.add_css('title', '.job-name::attr(title)')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('salary', '.job_request .salary::text')
        item_loader.add_xpath('job_city',
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath('work_years',
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath('degree_need',
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath('job_type',
                              "//*[@class='job_request']/p/span[5]/text()")

        item_loader.add_css('publish_time', '.publish_time::text')
        item_loader.add_css('job_advantage', '.job-advantage p::text')
        item_loader.add_css('job_desc', '.job_bt div')
        item_loader.add_css('job_addr', '.work_addr')
        item_loader.add_css('company_url', '#job_company dt a::attr(href)')
        item_loader.add_css('company_name', '#job_company dt a img::attr(alt)')
        item_loader.add_css('tags', '.position-label li::text')
        item_loader.add_value('crawl_time', datetime.now())
        item_loader.add_value('crawl_update_time', datetime.now())

        lagou_job_item = item_loader.load_item()

        return lagou_job_item
Example #21
    def parse_detail(self,response):
        # article_item = JobBoleArticleItem()
        # front_image_url = response.meta.get("front_image_url", "")  # 文章封面图
        # title = response.css('div.entry-header h1::text').extract_first()
        # create_data = response.css('p.entry-meta-hide-on-mobile::text').extract_first().strip().replace("·","").strip()
        # praise_nums = response.css('span.vote-post-up h10::text').extract_first()
        # fav_nums = response.css(".bookmark-btn::text").extract_first()
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        # comment_nums = response.css("a[href='#article-comment'] span::text").extract_first()
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        # content = response.css('div.entry').extract_first()
        # tag_list = response.css('p.entry-meta-hide-on-mobile a::text').extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)
        #
        # article_item['url_object_id'] = get_md5(response.url)
        # article_item['title'] = title
        # article_item['url'] = response.url
        # try:
        #     create_data = datetime.datetime.strptime(create_data, "%Y/%m/%d").date()
        # except Exception as e:
        #     create_data = datetime.datetime.now().date()
        # article_item['create_date'] = create_data
        # article_item['front_image_url'] = [front_image_url]
        # article_item['praise_nums'] = praise_nums
        # article_item['comment_nums'] = comment_nums
        # article_item['fav_nums'] = fav_nums
        # article_item['tags'] = tags
        # article_item['content'] = content

        # Load the item via an item loader
        front_image_url = response.meta.get("front_image_url", "")  # article cover image

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)

        # Parse the fields below with the CSS selector rules.
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        # load_item() applies the rules above and builds the item object
        article_item = item_loader.load_item()

        # Values are filled in; yield the item to the pipeline
        yield article_item
Example #22
    def parse_job(self, response):
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_xpath("title", '//div/span[@class="name"]/text()')
        item_loader.add_xpath("url", response.url)
        item_loader.add_xpath("url_object_id", get_md5(response.url))
        item_loader.add_xpath("salary", '//dd/p/span[@class="salary"]/text()')
        item_loader.add_xpath("job_city", '//dd/p/span[2]/text()')
        item_loader.add_xpath("work_years", '//dd/p/span[3]/text()')
        item_loader.add_xpath("degree_need", '//dd/p/span[4]/text()')
        item_loader.add_xpath("job_type", '//dd/p/span[5]/text()')
        item_loader.add_xpath("tags",
                              '//dd[@class="job_request"]/ul/li/text()')
        item_loader.add_xpath("publish_time",
                              '//dd/p[@class="publish_time"]/text()')
        item_loader.add_xpath("job_advantage", '//dl/dd/p/text()')
        item_loader.add_xpath("job_desc", '//dd/div/p/text()')
        item_loader.add_xpath("job_addr", '//dd/div[@class="work_addr"]')
        item_loader.add_xpath("company_url", '//dl/dt/a/@href')
        item_loader.add_xpath("company_name", '//dl/dt/a/img/@alt')
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()

        return job_item
Example #23
    def parse_job(self, response):
        item_loader = LagouItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request p .salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']/p/span[5]/text()")
        item_loader.add_css("tags", ".position-label li::text")
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_url", ".c_feature li a::text ")
        item_loader.add_css("company_name", "img.b2::attr(alt)")
        item_loader.add_value("crawl_time", datetime.datetime.now())

        lagoujob_item = item_loader.load_item()
        yield lagoujob_item
Example #24
    def parse_detail(self, response):
        # Load the item via an item loader
        type_name = response.meta.get("type_name", "")
        publish_date = response.meta.get("publish_date", "")  # publish time
        item_loader = kjjysItemLoader(item=kjjysItem(), response=response)
        image_url = response.css("#UCAP-CONTENT img::attr(src)").extract()
        content = response.css(".Zoom").extract_first("")
        title = response.meta.get("title", "")

        new_image_url = []
        if len(image_url) > 0:
            for in_url in image_url:
                in_url = parse.urljoin(response.url, in_url)
                new_image_url.append(in_url)
        else:
            item_loader.add_value("front_image_path", '--')

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        if len(new_image_url) > 0:
            item_loader.add_value("front_image_url", new_image_url)
        # else:
        #     item_loader.add_value("front_image_url", [""])
        item_loader.add_value("source_net", self.start_urls[0])
        item_loader.add_value("source_name", '中华人民共和国科学技术部')
        item_loader.add_value("type_name", type_name)
        item_loader.add_value("title", title)
        item_loader.add_value("content", content)

        item_loader.add_value("publish_time", publish_date)
        item_loader.add_value("crawl_time", datetime.datetime.now())
        article_item = item_loader.load_item()

        yield article_item
Example #25
    def parse_nums(self, response):
        j_data = json.loads(response.text)
        if j_data:

            # callback-based version
            article_item = response.meta.get('article_item', "")

            praise_nums = int(j_data["DiggCount"])
            fav_nums = j_data['TotalView']
            comment_nums = j_data['CommentCount']

            # deferred-call alternative (kept separate):
            # item_loader = response.meta.get("article_item", "")
            # item_loader.add_value("praise_nums", j_data["DiggCount"])
            article_item["praise_nums"] = praise_nums
            article_item['fav_nums'] = fav_nums
            article_item['comment_nums'] = comment_nums
            article_item['url_obj_id'] = common.get_md5(article_item['url'])

            yield article_item
        else:
            print("Error here")
Example #26
    def parse_detail(self, response):
        """
        提取文章信息
        """
        # 通过自定义的item_loader加载item
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_xpath("title",
                              '//div[@class="entry-header"]/h1/text()')
        front_image_url = response.meta.get("front_image_url", "")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_xpath(
            "create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_xpath(
            "praise_nums",
            "//span[contains(@class, 'vote-post-up')]/h10/text()")
        item_loader.add_xpath("comment_nums",
                              "//a[@href='#article-comment']/span/text()")
        item_loader.add_xpath(
            "fav_nums", "//span[contains(@class, 'bookmark-btn')]/text()")
        item_loader.add_xpath(
            "tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
        item_loader.add_xpath("content", "//div[@class='entry']")

        article_item = item_loader.load_item()

        yield article_item
Example #27
    def parse_item(self, response):

        # i = {}
        # #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # #i['name'] = response.xpath('//div[@id="name"]').extract()
        # #i['description'] = response.xpath('//div[@id="description"]').extract()
        # return i

        item_loader = LagouItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", '.job-name > .name::text')
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", '.job_request .salary::text')
        item_loader.add_xpath("job_city",
                              '//*[@class="job_request"]/p/span[2]/text()')
        item_loader.add_xpath("work_years",
                              '//*[@class="job_request"]/p/span[3]/text()')
        item_loader.add_xpath("degree_need",
                              '//*[@class="job_request"]/p/span[4]/text()')
        item_loader.add_xpath("job_type",
                              '//*[@class="job_request"]/p/span[5]/text()')
        item_loader.add_css("publish_time", '.publish_time::text')
        item_loader.add_css("job_advantage", '.job-advantage p::text')
        item_loader.add_css("job_desc", '.job_bt div::text')
        item_loader.add_css("job_addr", '.work_addr')
        item_loader.add_css("company_name", '#job_company dt a img::attr(alt)')
        item_loader.add_css("company_url", '#job_company dt a::attr(href)')
        item_loader.add_css("tags", '.position-label li::text ')
        item_loader.add_value("crawl_time", datetime.datetime.now())

        item = item_loader.load_item()
        return item
Example #28
    def parse_nums(self, response):
        j_data = json.loads(response.text)
        ''' Code simplified out of this version:
        article_item = response.meta.get("article_item", "")
        commentCount = j_data["CommentCount"]
        totalView = j_data["TotalView"]
        diggCount = j_data["DiggCount"]
        buryCount = j_data["BuryCount"]

        # article_item = CdnBlogArtcleItem()
        article_item["praise_nums"] = diggCount
        article_item["fav_nums"] = totalView
        article_item["comment_nums"] = commentCount
        article_item["url_object_id"] = common.get_md5(article_item["url"])
        '''

        item_loader = response.meta.get("article_item", "")
        item_loader.add_value("praise_nums", j_data["DiggCount"])
        item_loader.add_value("fav_nums", j_data["TotalView"])
        item_loader.add_value("comment_nums", j_data["CommentCount"])
        item_loader.add_value("url_object_id", common.get_md5(response.meta.get("url", "")))

        article_item = item_loader.load_item()

        yield article_item
Example #29
    def parse_detail(self, response):
        """
        提取文章的具体字段
        :type response: HtmlResponse
        :param response:
        :return:
        """
        # 通过Item loader加载item
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_value('url', response.url)
        item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
        item_loader.add_value('front_image_url',
                              response.meta.get("front_image_url", ""))
        item_loader.add_css('praise_nums', '.vote-post-up h10::text')
        item_loader.add_css('comment_nums',
                            "a[href='#article-comment'] span::text")
        item_loader.add_css('fav_nums', '.bookmark-btn::text')
        item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
        item_loader.add_css('content', 'div.entry')
        article_item = item_loader.load_item()

        yield article_item  # pass to pipelines.py
Example #30
    def parse_detail(self, response):
        article_item = JobBoleArticleItem()
        # Extract the article's fields
        # front_image_url = response.meta.get('front_image_url','') #文章封面图
        # title = response.css('.entry-header h1::text').extract_first('')
        # create_date = response.css('p.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('·','').strip()
        # praise_nums = response.css('.vote-post-up h10::text').extract()[0]
        # fav_nums = response.css('.bookmark-btn::text').extract()[0]
        # match_re = re.match('.*?(\d+).*',fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        #
        # comment_nums =  response.css("a[href='#article-comment'] span::text").extract()[0]
        # match_re = re.match('.*?(\d+).*', comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        #
        # content = response.css("div.entry").extract()[0]
        # taglist = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        # taglist = [element for element in taglist if not element.strip().endswith('评论')]
        # tags = ','.join(taglist)
        #
        # article_item['url_object_id'] = get_md5(response.url)
        # article_item['title'] = title
        # article_item['url'] = response.url
        # try:
        #     create_date = datetime.datetime.strptime(create_date,'%Y/%m/%d').date()
        # except Exception as e:
        #     create_date = datetime.datetime.now().date()
        # article_item['create_date'] = create_date
        # article_item['front_image_url'] = [front_image_url]
        # article_item['praise_nums'] = praise_nums
        # article_item['comment_nums'] =comment_nums
        # article_item['content'] = content
        # article_item['fav_nums'] =fav_nums
        # article_item['tags'] =tags

        # Load the item via item_loader
        front_image_url = response.meta.get('front_image_url', '')  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
        item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_css('praise_nums', '.vote-post-up h10::text')
        item_loader.add_css('comment_nums',
                            "a[href='#article-comment'] span::text")
        item_loader.add_css('fav_nums', '.bookmark-btn::text')
        item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
        item_loader.add_css('content', "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Example #31
    def parse_detail(self, response):
        # article_item = JobBoleArticleItem()
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        front_image_url = response.meta.get("front_image_url", "")
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
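For context, every parse_detail above expects front_image_url (or img_url) in response.meta, which the listing-page callback has to supply. A hedged sketch of such a listing callback, with placeholder selectors that are not taken from these examples (assumes module-level imports: from urllib import parse; from scrapy import Request):

    def parse(self, response):
        # placeholder selectors: each post node exposes a cover image and a detail link
        post_nodes = response.css("#archive .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": parse.urljoin(response.url, image_url)},
                          callback=self.parse_detail)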