コード例 #1
0
    def get_detail(self, response):
        """Fill in the detail-page fields of the item carried in ``response.meta``.

        Reads the summary, tags, episode links and play count from the page,
        stores them on the item found under ``response.meta['item']`` and
        yields that item.

        Args:
            response: Response for the detail page. Its ``meta`` must contain
                the partially populated item under the key ``'item'``.

        Yields:
            The same item with ``url_object_id``, ``jishu``, ``tags``,
            ``description`` and ``play_time`` set.

        Raises:
            KeyError: If ``response.meta`` has no ``'item'`` entry.
        """
        item = response.meta['item']

        # extract_first('') rather than extract()[0]: a page without a summary
        # paragraph produces an empty description instead of an IndexError.
        description = response.xpath("//p[@class='summary']/text()").extract_first('')

        # Joining an empty list already gives '', so no emptiness check is needed.
        tags = ','.join(
            response.xpath("//div[@class='video_tags _video_tags']/a/text()").extract()
        )

        # Episode links are looked up under two different selectors; the first
        # one that matches anything is used.
        episode_links = (
            response.css('.item_detail_half a::attr(href)').extract()
            or response.css('.mod_episode .item a::attr(href)').extract()
        )
        # Episodes are numbered from 1. Every entry, including the last one,
        # ends with a comma; no links at all yields ''.
        jishu = ''.join(
            "第%s集:https://v.qq.com%s," % (number, href)
            for number, href in enumerate(episode_links, 1)
        )

        url_object_id = get_md5(response.url)
        # play_time stays None when the counter element is absent.
        play_time = response.xpath(
            "//div[@class='figure_count']/span[@class='num']/text()"
        ).extract_first()

        item['url_object_id'] = url_object_id
        item['jishu'] = jishu
        item['tags'] = tags
        item['description'] = description
        item['play_time'] = play_time
        yield item
コード例 #2
0
ファイル: movie.py プロジェクト: 152056215/ArticalProject
    def parse_item(self, response):
        """Load a ``MovieItem`` from a movie page response.

        Args:
            response: The page to extract the movie fields from.

        Returns:
            The item produced by the loader.
        """
        # (field, CSS selector) pairs, fed to the loader in this order.
        css_fields = (
            ('main_title', '.video_title a::text'),
            ('title', '.video_title::text'),
            ('tags', '.video_info a::text'),
            # The score is made of two parts: units and decimal.
            ('score1', '.video_score .units::text'),
            ('score2', '.video_score .decimal::text'),
            ('info', '.summary::text'),
            ('role', '.director a::text'),
            ('image_url', '.figure_pic::attr(style)'),
        )

        loader = ItemLoader(item=MovieItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        for field_name, selector in css_fields:
            loader.add_css(field_name, selector)
        loader.add_value('movie_url', self.movie_detail_url + response.url)

        return loader.load_item()
コード例 #3
0
ファイル: zhilian.py プロジェクト: 152056215/ArticalProject
    def parse_detail(self, response):
        """Build a ``ZhilianzhaopinItem`` for a job detail page.

        Only ``url`` and ``url_object_id`` are populated at the moment; the
        selector-based fields are disabled (see the commented lines below).

        Args:
            response: The detail page response.

        Returns:
            The loaded item, or ``None`` if loading raised an exception
            (best-effort: the failure is logged, not propagated).
        """
        import logging

        try:
            loader = ItemLoader(item=ZhilianzhaopinItem(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('url_object_id', get_md5(response.url))
            # Selector-based fields, currently disabled:
            # loader.add_css('title','.l.info-h3::text')
            # loader.add_css('all','strong::text') # salary work_years degree_need job_addvantage scale company_type address
            # loader.add_css('tags','.icon-promulgator-person a::text')
            # loader.add_css('job_info','.pos-ul span::text')
            # loader.add_css('company_name','.companny a::text')
            return loader.load_item()
        except Exception:
            # Keep the original best-effort contract (no exception escapes),
            # but record the full traceback instead of printing the message.
            logging.getLogger(__name__).exception(
                'Failed to parse detail page %s', response.url
            )
            return None
コード例 #4
0
ファイル: lagou.py プロジェクト: 152056215/ArticalProject
 def parse_item(self, response):
     """Load a ``LagouJobItem`` from a job page response.

     Args:
         response: The job page to extract fields from.

     Returns:
         The item produced by the loader.
     """
     loader = ItemLoader(item=LagouJobItem(), response=response)
     # Each step is (loader method, field name, selector or value); the
     # steps run top to bottom.
     steps = (
         (loader.add_css, 'title', '.job-name .name::text'),
         (loader.add_value, 'url', response.url),
         (loader.add_value, 'url_object_id', get_md5(response.url)),
         # Salary range, e.g. 1k-2k.
         (loader.add_css, 'salary_min', '.salary::text'),
         # The city text is wrapped in slashes, e.g. /Shanghai/.
         (loader.add_xpath, 'job_city', '/html/body/div[3]/div/div[1]/dd/p[1]/span[2]/text()'),
         (loader.add_xpath, 'work_years_min', '/html/body/div[3]/div/div[1]/dd/p[1]/span[3]/text()'),
         (loader.add_xpath, 'degree_need', '/html/body/div[3]/div/div[1]/dd/p[1]/span[4]/text()'),
         (loader.add_xpath, 'work_type', '/html/body/div[3]/div/div[1]/dd/p[1]/span[5]/text()'),
         # Yields a list of tag strings.
         (loader.add_xpath, 'tags', '/html/body/div[3]/div/div[1]/dd/ul/li/text()'),
         # Text looks like "14:46  published on Lagou".
         (loader.add_css, 'publish_time', '.publish_time::text'),
         (loader.add_xpath, 'job_addvantage', '//*[@id="job_detail"]/dd[1]/p/text()'),
         # A list; needs ','.join().
         (loader.add_css, 'job_desc', '.job_bt div p'),
         # Needs whitespace handling.
         (loader.add_xpath, 'company_name', '//*[@id="job_company"]/dt/a/div/h2/text()'),
         # Needs whitespace handling.
         (loader.add_xpath, 'company_area', '//*[@id="job_detail"]/dd[3]/div[1]/a/text()'),
         (loader.add_xpath, 'company_develop_state', '//*[@id="job_company"]/dd/ul/li[2]/text()'),
         (loader.add_css, 'company_url', '#job_company dt a::attr(href)'),
         (loader.add_xpath, 'company_scale', '//*[@id="job_company"]/dd/ul/li[4]/text()'),
     )
     for add, field_name, source in steps:
         add(field_name, source)
     return loader.load_item()
コード例 #5
0
    def parse_item(self, response):
        """Load a ``ShixisengItem`` from an internship page response.

        Args:
            response: The page to extract fields from.

        Returns:
            The item produced by the loader.
        """
        # (field, CSS selector) pairs added after the URL fields, in order.
        css_fields = (
            ('upgrade_time', '.job_date span::text'),
            # A range rather than a single value.
            ('salary_min', '.job_money::text'),
            ('job_city', '.job_position::attr(title)'),
            ('degree_need', '.job_academic::text'),
            ('work_per_week', '.job_week::text'),
            ('shixi_time', '.job_time::text'),
            ('job_addvantage', '.job_good::text'),
            ('job_info', '.job_part ::text'),
            ('company_name', '.job_com_name::text'),
            # NOTE(review): reads the link's text, not its href - confirm intended.
            ('company_url', '.job_link::text'),
            ('work_address', '.com_position::text'),
            ('tags', '.job_detail_msg span::text'),
            ('end_time', '.deadline .job_detail::text'),
        )

        loader = ItemLoader(item=ShixisengItem(), response=response)
        loader.add_css('title', '.new_job_name::text')
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        for field_name, selector in css_fields:
            loader.add_css(field_name, selector)

        return loader.load_item()
コード例 #6
0
    def parse_detail(self, response):
        """Extract the fields of a single article and yield the loaded item.

        Args:
            response: The article page. ``response.meta`` may carry the
                cover image URL under ``'front_image_url'``.

        Yields:
            The ``JobboleArticalItem`` produced by ``ArticleItemLoder``.
        """
        # An earlier hand-written version of this method (regex parsing of the
        # counters, strptime on the date, manual tag filtering) was replaced by
        # the loader below. Presumably that per-field clean-up now lives in the
        # item / loader definitions - verify there.

        # Article cover image; empty string when the caller passed none.
        cover_url = response.meta.get("front_image_url", "")

        # (field, CSS selector) pairs, added in this order.
        css_fields = (
            ('title', '.entry-header h1::text'),
            ('create_date', 'p.entry-meta-hide-on-mobile::text'),
            ('praise_nums', '.vote-post-up h10::text'),
            ('fav_nums', '.bookmark-btn::text'),
            ('comment_nums', "a[href='#article-comment'] span::text"),
            ('content', '.entry p::text'),
            ('tags', 'p.entry-meta-hide-on-mobile a::text'),
        )

        # Project-specific loader used in place of the stock ItemLoader.
        loader = ArticleItemLoder(item=JobboleArticalItem(), response=response)
        for field_name, selector in css_fields:
            loader.add_css(field_name, selector)
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        loader.add_value('front_image_url', [cover_url])

        # The collected values are only turned into an item by load_item().
        yield loader.load_item()