コード例 #1
0
ファイル: jobbole.py プロジェクト: BattlesSymphony/home
    def parse_article(self, response):
        """Fill the item carried in ``response.meta`` from an article page.

        Reads the publication date, category, tags, the vote / bookmark /
        comment counters and the article body from ``response`` and stores
        them on the item found under ``response.meta['item']``, together with
        an id computed by ``get_md5(response.url)``.

        A selector that matches nothing makes ``.get()`` return ``None``. Such
        fields fall back to ``''`` (text fields) or ``0`` (counters) instead of
        raising ``AttributeError`` and losing the whole item.

        Args:
            response: the downloaded article page; ``response.meta['item']``
                must hold the item to be completed.

        Yields:
            The same item object with the article fields populated.

        Raises:
            KeyError: if ``response.meta`` has no ``'item'`` entry.
            ValueError: if the vote text is non-blank but not an integer.
        """
        article_item = response.meta['item']

        def leading_int(text):
            # re.match only looks at the start of the text, so this picks up a
            # leading run of digits and gives 0 when the text is missing or
            # does not start with a digit.
            found = re.match(r'\d+', (text or '').strip())
            return int(found.group()) if found else 0

        pub_date = response.css('.entry-meta-hide-on-mobile::text').get()
        cate = response.xpath('//a[@rel="category tag"]/text()').get()
        tags = response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]/a[contains(@href,"tag")]/text()'
        ).getall()
        votetotal = response.xpath(
            '//h10[contains(@id,"votetotal")]/text()').get()
        booktotal = response.css('span.bookmark-btn::text').get()
        commenttotal = response.xpath(
            '//a[@href="#article-comment"]//text()').get()
        content = response.xpath('//div[@class="entry"]').get()

        # Only the first whitespace-separated token of the meta line is kept;
        # an absent or blank line yields '' rather than an IndexError.
        date_tokens = (pub_date or '').split()
        # Blank vote text counts as "no votes", same as missing text.
        votes_text = (votetotal or '').strip()

        article_item['url_md5_id'] = get_md5(response.url)
        article_item['pub_date'] = date_tokens[0] if date_tokens else ''
        article_item['cate'] = cate
        article_item['tags'] = tags
        article_item['votetotal'] = int(votes_text) if votes_text else 0
        article_item['booktotal'] = leading_int(booktotal)
        article_item['commenttotal'] = leading_int(commenttotal)
        article_item['content'] = (content or '').strip()
        yield article_item
コード例 #2
0
    def parse_article(self, response):
        """Build an article item from a detail page through the item loader.

        The title, URL and front image URL are copied from the item passed in
        ``response.meta['item']``; the remaining fields are registered on the
        loader as CSS / XPath selectors evaluated against ``response``. An id
        computed by ``get_md5(response.url)`` is added as ``url_md5_id``.

        The earlier hand-written extraction (strip / regex / int conversion)
        is no longer part of this method.
        NOTE(review): the equivalent clean-up is presumably done by the
        loader's field processors -- confirm in the item definitions.

        Args:
            response: the downloaded article page carrying the partially
                filled item in ``response.meta['item']``.

        Yields:
            The item produced by the loader's ``load_item()``.
        """
        carried = response.meta['item']

        # Load the instance through the ItemLoader.
        loader = ArticleItemloader(item=BoleArticle(), response=response)

        # Values already collected by the caller are passed through as-is.
        for field_name in ('title', 'url', 'front_img_url'):
            loader.add_value(field_name, carried[field_name])

        # The category and tag links both live in the same meta paragraph.
        meta_paragraph = '//p[@class="entry-meta-hide-on-mobile"]'

        loader.add_css('pub_date', '.entry-meta-hide-on-mobile::text')
        loader.add_xpath(
            'cate', meta_paragraph + '/a[@rel="category tag"]/text()')
        loader.add_xpath(
            'tags', meta_paragraph + '/a[contains(@href,"tag")]/text()')
        loader.add_xpath(
            'votetotal', '//h10[contains(@id,"votetotal")]/text()')
        loader.add_css('booktotal', 'span.bookmark-btn::text')
        loader.add_xpath(
            'commenttotal', '//a[@href="#article-comment"]//text()')
        loader.add_xpath('content', '//div[@class="entry"]')
        loader.add_value('url_md5_id', get_md5(response.url))

        yield loader.load_item()
コード例 #3
0
ファイル: lagou.py プロジェクト: BattlesSymphony/home
    def parse_item(self, response):
        """Build a job item from a job detail page through the item loader.

        Populates these fields on a ``LagoujobItem``: url, url_md5_id, title,
        salary, job_city, work_years, degree_need, job_type, pub_time, tags,
        job_advantage, job_desc, job_address, company_name, company_url and
        crawl_time.

        Side effect: the page text is written to ``lagou.html`` in the current
        working directory, overwriting the file on every call.

        Args:
            response: the downloaded job detail page.

        Returns:
            The item produced by the loader's ``load_item()``.
        """
        # NOTE(review): this dump looks like a debugging leftover -- confirm
        # whether it is still wanted before removing it.
        with open('lagou.html', 'w', encoding='utf-8') as page_dump:
            page_dump.write(response.text)

        loader = LaGouItemloader(item=LagoujobItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('url_md5_id', get_md5(response.url))

        # (field, CSS selector) pairs, registered in this order.
        # job_desc and job_address select whole elements, not text nodes.
        # NOTE(review): presumably the loader's processors strip the markup
        # and whitespace -- confirm in the item definitions.
        css_rules = (
            ('title', '.job-name::attr(title)'),
            ('salary', 'span.salary::text'),
            ('job_city', 'dd.job_request > p >span:nth-child(2)::text'),
            ('work_years', 'dd.job_request > p >span:nth-child(3)::text'),
            ('degree_need', 'dd.job_request > p >span:nth-child(4)::text'),
            ('job_type', 'dd.job_request > p >span:nth-child(5)::text'),
            ('pub_time', 'p.publish_time::text'),
            ('tags', '.labels::text'),
            ('job_advantage', '.job-advantage > p::text'),
            ('job_desc', '.job-detail'),
            ('job_address', '.work_addr'),
            ('company_name', 'img.b2::attr(alt)'),
            ('company_url', '.job_company > dt > a::attr(href)'),
        )
        for field_name, selector in css_rules:
            loader.add_css(field_name, selector)

        loader.add_value('crawl_time', datetime.datetime.now())
        return loader.load_item()