コード例 #1
0
ファイル: lagou.py プロジェクト: zicen/ArticleSpider
    def parse_item(self, response):
        """Build a job item from a job-detail page response.

        Fields are registered on a ``LagouJobItemLoader`` through CSS and
        XPath selectors plus a few literal values, and the loaded item is
        returned.

        Args:
            response: Response for the job page; it is handed to the loader
                and its ``url`` is stored on the item.

        Returns:
            The item produced by the loader's ``load_item()``.
        """
        loader = LagouJobItemLoader(LagouJobItem(), response)

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_css("salary", ".job_request .salary::text")

        # These fields are selected purely by the position of their <span>
        # inside the job_request paragraph.
        positional_fields = (
            ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
            ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
            ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
            ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
        )
        for field_name, xpath in positional_fields:
            loader.add_xpath(field_name, xpath)

        css_fields = (
            ("tags", '.position-label li::text'),
            ("publish_time", ".publish_time::text"),
            ("job_advantage", ".job-advantage p::text"),
            # No ::text here: the whole matched element is selected.
            ("job_desc", ".job_bt div"),
            ("job_addr", ".work_addr"),
            ("company_name", "#job_company dt a img::attr(alt)"),
            ("company_url", "#job_company dt a::attr(href)"),
        )
        for field_name, selector in css_fields:
            loader.add_css(field_name, selector)

        # Each timestamp field gets its own clock reading.
        loader.add_value("crawl_time", datetime.now())
        loader.add_value("crawl_update_time", datetime.now())

        return loader.load_item()
コード例 #2
0
 def parse_detail(self, response):
     """Populate an article item from an article detail page.

     The title, creation time, category and content are collected with CSS
     selectors; the page URL and the value ``get_md5`` returns for it are
     added directly. The loaded item is then yielded.

     Args:
         response: Response for the article page; it is handed to the item
             loader and its ``url`` is stored on the item.

     Yields:
         The item produced by the loader's ``load_item()``.
     """
     # The item is created once, for the loader. No separate throwaway
     # instance is built up front.
     item_loader = JobBoleArticleItemLoader(item=ItcodemonkeyArticleItem(),
                                            response=response)

     # Fields located with CSS selectors. The three text fields share the
     # same article container, so the common prefix is spelled once.
     article_root = 'body > div.container.tc-main > div.row > div.span9 > div'
     item_loader.add_css('title', article_root + ' > h2::text')
     item_loader.add_css(
         'create_time',
         article_root + ' > div.article-infobox > span::text'
     )
     item_loader.add_css(
         'classify',
         article_root + ' > div.article-infobox > span > a::text'
     )
     item_loader.add_css('content', '#article_content')

     # Fields whose value is supplied directly rather than selected.
     item_loader.add_value('url', response.url)
     item_loader.add_value('url_object_id', get_md5(response.url))

     yield item_loader.load_item()
コード例 #3
0
ファイル: jobbole.py プロジェクト: Thpffcj/Python-Learning
    def parse_detail(self, response):
        """Populate an article item from an article detail page.

        All fields are declared on an ``ArticleItemLoader``: most come from
        CSS selectors, while the URL, the value ``get_md5`` returns for the
        URL, and the cover image URL are added as literal values.

        Args:
            response: Response for the article page. ``response.meta`` may
                carry a ``front_image_url`` entry; an empty string is used
                when it is absent.

        Yields:
            The item produced by the loader's ``load_item()``.
        """
        # Cover image URL passed along in the request meta.
        front_image_url = response.meta.get("front_image_url", "")

        # The item is created once, for the loader. No separate throwaway
        # instance is built up front.
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        # The image URL is added wrapped in a single-element list.
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        # No ::text here: the whole matched element is selected.
        item_loader.add_css("content", "div.entry")

        yield item_loader.load_item()
コード例 #4
0
    def parse_detail(self, response):
        """Populate an article item from an article detail page.

        Fields are declared on an ``ArticleItemLoader`` rather than being
        extracted one by one with hand-written XPath calls.

        Args:
            response: Response for the article page. ``response.meta`` may
                carry a ``front_image_url`` entry; an empty string is used
                when it is absent.

        Yields:
            The item produced by the loader's ``load_item()``.
        """
        cover_url = response.meta.get("front_image_url", "")
        loader = ArticleItemLoader(item=JobBoleArticle(), response=response)

        # Fields selected from the page, registered in this order.
        page_fields = (
            ('title', '//div[@class="entry-header"]/h1/text()'),
            ('create_time', '//p[@class="entry-meta-hide-on-mobile"]/text()'),
            ('praise_nums', '//div[@class="post-adds"]/span/h10/text()'),
            ('fav_nums', '//span[contains(@class,"bookmark-btn")]/text()'),
            ('comment_nums', '//a[contains(@href,"comment")]/span/text()'),
            ('content', '//div[@class="entry"]//p/text()'),
            ('tags', '//p[contains(@class,"hide-on-mobile")]/a/text()'),
        )
        for field_name, xpath in page_fields:
            loader.add_xpath(field_name, xpath)

        loader.add_value('url_object_id', get_md5(response.url))
        # The image URL is added wrapped in a single-element list.
        loader.add_value('image_url', [cover_url])
        loader.add_xpath('author', '//div[@class="copyright-area"]/a/text()')
        loader.add_value('url', response.url)

        yield loader.load_item()
コード例 #5
0
ファイル: items.py プロジェクト: Vaskka/ArticleSpider
def to_md5url(value):
    """Return the result of ``common.get_md5`` applied to the URL *value*."""
    return common.get_md5(value)