Esempio n. 1
0
    def parse(self, response):
        """Collect every detail-page URL from the listing, fetch each page
        synchronously with urllib, and yield one item per detail page.

        NOTE(review): doing blocking urllib I/O inside a Scrapy callback
        stalls the reactor; yielding scrapy.Request objects would be the
        idiomatic fix, kept out of scope here to preserve the interface.
        """
        zhuye_url = []
        content = response.body.decode("utf-8")
        data = etree.HTML(content)

        # Gather the link of every gallery tile on the index page.
        for each in data.xpath("//div/ul[@id='pins']/li"):
            zhuyemian = each.xpath(".//a/@href")[0]
            zhuye_url.append(zhuyemian)

        # Loop-invariant headers: build once instead of per iteration.
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6823.400 QQBrowser/10.3.3117.400"
        }

        for each_zhu_url in zhuye_url:
            rsp = request.Request(each_zhu_url, headers=headers)
            rep = request.urlopen(rsp)
            html = etree.HTML(rep.read().decode())

            pages = html.xpath("//div[2]/div[1]/div[4]/a[5]/span/text()")[0]
            names = html.xpath("//div[@class='content']/h2/text()")[0]
            imgsss = html.xpath("//div[@class='main-image']/p/a/img/@src")[0]

            item = MeizituItem()
            item["name"] = names
            item["page"] = pages
            item["img_down"] = imgsss
            # Bug fix: store *this* detail page's URL, not the whole list
            # of every collected URL (the original assigned zhuye_url).
            item["img_zhu"] = each_zhu_url

            yield item
Esempio n. 2
0
 def get_every_image_url(self, response):
     """Yield one item carrying the gallery name (passed via meta) and
     the first image URL found on this detail page."""
     srcs = response.xpath('//div[@class="content"]/a/img/@src').extract()
     item = MeizituItem()
     item['name'] = response.meta['name']
     item['img_url'] = srcs[0]
     yield item
Esempio n. 3
0
 def img_parse(self, response):
     """Extract the main image's alt text and source URL into an item."""
     img = MeizituItem()
     alt = response.xpath('//div[@class="main-image"]/p/a/img/@alt')
     src = response.xpath('//div[@class="main-image"]/p/a/img/@src')
     img['title'] = alt.extract_first()
     img['imgurl'] = src.extract_first()
     yield img
Esempio n. 4
0
 def parse(self, response):
     """Yield an item with every image URL on the page, then follow the
     "next" pagination link recursively."""
     item = MeizituItem()
     item['image_urls'] = response.xpath('//img//@src').extract()
     yield item
     new_url = response.xpath('.//li[@class="next"]//@href').extract_first()
     if new_url:
         # Robustness: the href may be relative — resolve it against the
         # current page URL before scheduling the request.
         yield scrapy.Request(response.urljoin(new_url), callback=self.parse)
Esempio n. 5
0
 def parse_second_page(self, response):
     """Collect all image URLs in the post body plus the folder name from
     the post's meta line, and yield them as one item."""
     folder = response.css('div.postmeta a::text').extract_first()
     urls = response.css('div.postContent img::attr(src)').extract()
     item = MeizituItem()
     item['image_folder_name'] = folder
     item['image_urls'] = urls
     yield item
Esempio n. 6
0
    def parse_item(self, response):
        """Load the image URL(s) and the page URL into a MeizituItem via
        an ItemLoader and return the populated item."""
        loader = ItemLoader(item=MeizituItem(), response=response)
        # Identity() keeps the extracted list untouched by processors.
        loader.add_xpath('img_url', '//div[@class="text"]/p/br/img/@src', Identity())
        loader.add_value('url', response.url)
        return loader.load_item()
Esempio n. 7
0
    def parse_item(self, response):
        """Populate name, tags, image URLs and page URL via an ItemLoader."""
        loader = ItemLoader(item=MeizituItem(), response=response)
        loader.add_xpath('name', '//h2/a/text()')
        loader.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
        # Identity() keeps the URL list untouched by output processors.
        loader.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
        loader.add_value('url', response.url)
        return loader.load_item()
Esempio n. 8
0
    def parse_item(self, response):
        """Yield one item per image URL found inside the #picture block."""
        for src in response.xpath('//div[@id="picture"]//img/@src').extract():
            item = MeizituItem()
            item['url'] = src
            yield item
Esempio n. 9
0
 def parse(self, response):
     """For each year heading, pair album names with their links from the
     adjacent list and schedule a detail request carrying a partial item."""
     for year in response.xpath('//div[@class="year"]'):
         links = year.xpath('./following-sibling::ul[1]//li/p[2]/a/@href').getall()
         titles = year.xpath('./following-sibling::ul[1]//li/p[2]/a/text()').getall()
         year_text = year.xpath('./text()').get()
         for title, link in zip(titles, links):
             item = MeizituItem()
             item['year'] = year_text
             item['name'] = title
             yield scrapy.Request(link, callback=self.parse_url, meta={'item': item})
Esempio n. 10
0
 def parse(self, response):
     """Parse a JSON API response and yield one item per image entry.

     Bug fixes: the original mutated a single item instance across
     yields (every yielded reference pointed at the same object); a fresh
     item is now created per entry. The blanket ``except Exception`` is
     narrowed to ``KeyError`` (missing 'middleURL' key).
     """
     imgs = json.loads(response.body)['data']
     for eachImage in imgs:
         try:
             item = MeizituItem()  # fresh item per image entry
             item['image_urls'] = [eachImage['middleURL']]
             yield item
         except KeyError as e:
             print(e)
Esempio n. 11
0
 def parse_item(self, response):
     """Extract gallery directory, image URL/alt text and derive the
     image file name from its URL.

     Bug fix: ``re.match(...).group(1)`` raised ``AttributeError`` when
     the image URL was missing or did not match the pattern; the match is
     now guarded and falls back to an empty name.
     """
     item = MeizituItem()
     item["img_dir"] = response.xpath(
         '//li[@class="current-menu-parent"]/a/text()').extract_first()
     item['img_url'] = response.xpath(
         '//div[@class="main-image"]/p/a/img/@src').extract_first()
     item['img_dir_2'] = response.xpath(
         '//div[@class="main-image"]/p/a/img/@alt').extract_first()
     match = re.match(r'.*?/\d{2}/(.*\.\w+g)', item["img_url"] or '')
     item["img_name"] = match.group(1) if match else ''
     yield item
Esempio n. 12
0
 def parse(self, response):
     """Yield one item per gallery tile: detail URL, alt-text name and
     thumbnail URL, all read from the #pins list."""
     result = response.selector.xpath('//ul[@id="pins"]')
     detail_urls = result.css('li a::attr(href)').extract()
     names = result.css('a img::attr(alt)').extract()
     img_urls = result.css('a img::attr(src)').extract()
     # Idiom: iterate the parallel lists with zip instead of indexing by
     # range(len(...)); zip also avoids IndexError on length mismatch.
     for detail_url, name, img_url in zip(detail_urls, names, img_urls):
         item = MeizituItem()
         item['detail_url'] = detail_url
         item['name'] = name
         item['img_url'] = img_url
         yield item
Esempio n. 13
0
 def parse(self, response):
     """Return one item per <p> inside the post content, each holding
     that paragraph's image URL(s) together with the page title."""
     selector = Selector(response)
     page_title = selector.xpath('/html/head/title/text()').extract()
     collected = []
     for paragraph in selector.xpath('//div[@class="postContent"]/p'):
         entry = MeizituItem()
         entry['link'] = paragraph.xpath('img/@src').extract()
         entry['title'] = page_title
         collected.append(entry)
     return collected
Esempio n. 14
0
 def parse_pai(self, response):
     """On pages that have a comment section, yield one item per
     lazy-loaded image.

     Fixes: the category lookup is hoisted out of the loop (it is
     loop-invariant), and the file-name extraction is guarded so a
     missing image URL no longer raises TypeError/IndexError.
     """
     if response.xpath('//div[@id="comments"]'):
         img_dir = response.xpath(
             '//li[@class="current-menu-item"]/a/text()').extract_first()
         for img in response.xpath('//img[@class="lazy"]'):
             item = MeizituItem()
             item["img_dir"] = img_dir
             item['img_dir_2'] = ''
             item["img_url"] = img.xpath('./@data-original').extract_first()
             # File name = everything after the last '/' of the URL.
             names = re.findall(r'.*/(.*)', item["img_url"] or '')
             item["img_name"] = names[-1] if names else ''
             yield item
Esempio n. 15
0
 def parse(self, response):
     """Read the top-navigation category links and schedule a listing
     request for the first one only (the trailing ``break`` is
     deliberate — only one category is crawled)."""
     for anchor in response.xpath("//ul[@id='menu-nav']/li/a"):
         item = MeizituItem()
         item['category_1_title'] = anchor.xpath('./text()').extract_first()
         item['category_1_href'] = anchor.xpath('./@href').extract_first()
         yield scrapy.Request(url=item['category_1_href'],
                              callback=self.parse_list,
                              meta={'item': item},
                              dont_filter=True)
         break
Esempio n. 16
0
 def fenye(self, response):
     """Yield the main image URL(s) and title of this page, then follow
     the 6th pagination anchor (the "next" link) recursively.

     Fixes: ``extract_first()`` can return ``None`` when the alt text is
     missing, which crashed the original on ``.strip()`` — an empty
     string is used as fallback. The dead ``url = response.url``
     assignment (immediately shadowed by the loop variable) and the
     trailing ``pass`` were removed.
     """
     item = MeizituItem()
     item['img_url'] = response.xpath('//div[@class="main-image"]//img/@src').extract()
     title = response.xpath('//div[@class="main-image"]//img/@alt').extract_first()
     item['title'] = (title or '').strip()
     yield item
     # Follow the bottom navigation-bar link(s).
     for next_url in response.xpath('//div[@class="pagenavi"]/a[6]/@href').extract():
         yield Request(url=next_url, callback=self.fenye)
Esempio n. 17
0
    def parse_item(self, response):
        """Yield one item per image link in the #picture block.

        Bug fix: the original mutated a single item instance across
        yields, so every previously yielded reference pointed at the last
        link; a fresh item is now created per link. The nick name is
        still derived by stripping ':' and '/' and slicing [39:-4].
        """
        image_links = response.xpath(
            '//div[@id="picture"]/p/img/@src').extract()
        dir_name = response.xpath(
            '//div[@class="metaRight"]/h2/a/text()').extract()[0]

        for link in image_links:
            item = MeizituItem()
            item['dir_name'] = dir_name
            item['image_link'] = link
            # NOTE(review): the 39:-4 slice assumes a fixed URL prefix
            # length and extension length — confirm against real URLs.
            item['nick_name'] = link.replace(':', '').replace('/', '')[39:-4]
            yield item
Esempio n. 18
0
	def parse_xijie(self, response):
		"""Yield one item per picture, each carrying the post date, title
		and a single-element image-URL list.

		Bug fix: the original mutated one item instance for every picture
		and re-yielded it; a fresh item is now built per iteration.
		"""
		sel = Selector(response)
		rawdate1 = sel.xpath('//div[@class="month_Year"]/text()').extract()[0]
		rawdate2 = sel.xpath('//div[@class="day"]/text()').extract()[0]
		# Reassemble the month/year fragment plus day into YYYY-MM-DD.
		date = rawdate1[-4:] + '-' + rawdate1[:2] + '-' + rawdate2
		title = sel.xpath('//div[@class="metaRight"]/h2/a/text()').extract()[0]
		for yige in sel.xpath('//div[@id="picture"]//img'):
			item = MeizituItem()
			item['date'] = date
			item['title'] = title
			item['image_urls'] = [yige.xpath('./@src').extract()[0]]
			yield item
Esempio n. 19
0
 def parse_body(self, response):
     """Extract the gallery name and its image URLs, falling back to the
     scroll-loading markup when the primary #picture block is empty.

     Cleanup: the primary XPath was evaluated twice in the original; it
     is now extracted once and reused.
     """
     item = MeizituItem()
     item['name'] = response.xpath(
         '//div[@class="metaRight"]/h2/a/text()').extract_first()
     urls = response.xpath('//div[@id="picture"]/p/img/@src').extract()
     if not urls:
         # Fallback: images delivered via the lazy-load markup.
         urls = response.xpath('//img[@class="scrollLoading"]/@src').extract()
     item['image_urls'] = urls
     yield item
Esempio n. 20
0
 def parse_picture(self, response):
     """Download the current picture, then follow the "next" pagination
     anchor until the link labelled '下一组»' (next set) is reached.

     Bug fix: guard the pagination selections before indexing ``[0]`` —
     the original raised IndexError when no pagination text/link existed.
     """
     item = MeizituItem()
     item['pic_name'] = response.selector.xpath(
         "//div[@ class='main-image']/p/a/img/@alt").extract()
     item['pic_url'] = response.selector.xpath(
         "//div[@ class='main-image']/p/a/img/@src").extract()
     download(item['pic_url'][0], item['pic_name'][0])
     next_pic_text = response.selector.xpath(
         "//div[@ class='pagenavi']/span[not(@class='dots')]/following-sibling::a[1]/span/text()"
     ).extract()
     if next_pic_text and next_pic_text[0] != '下一组»':
         next_pic = response.selector.xpath(
             "//div[@ class='pagenavi']/span[not(@class='dots')]/following-sibling::a[1]/@href"
         ).extract()
         if next_pic:
             yield scrapy.Request(next_pic[0], callback=self.parse_picture)
Esempio n. 21
0
    def parse_next(self, response):
        """Yield the image on this page, then follow the last pagination
        link while the page number stays below 500.

        NOTE(review): the absolute positional XPaths are brittle — they
        break on any layout change; confirm against the live page.
        """
        item = MeizituItem()
        item['name'] = response.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@alt').extract()[0]
        item['img_url'] = response.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@src').extract()
        yield item

        # Last anchor in the pagination bar is taken as the "next" link.
        next_page = response.xpath('/html/body/div[2]/div[1]/div[4]/a/@href').extract()[-1]
        # Assumes the URL's final path segment is a numeric page index —
        # int() below raises ValueError otherwise; TODO confirm.
        maxp = next_page.split('/')[-1]
        print(maxp)

        if int(maxp) < 500:
            yield scrapy.Request(next_page, callback=self.parse_next)
Esempio n. 22
0
    def parse_info(self, response):
        """Yield the main image URL and its title, then keep walking the
        pagination via its last anchor."""
        src = response.xpath(
            '//div[@class="main-image"]/p/a/img/@src').extract_first()
        name = response.xpath(
            '//h2[@class="main-title"]/text()').extract_first()
        item = MeizituItem()
        item["image_url"] = src
        item["image_name"] = name
        yield item

        next_page = response.xpath(
            '//div[@class="pagenavi"]/a[last()]/@href').extract_first()
        if next_page:
            yield scrapy.Request(next_page,
                                 callback=self.parse_info,
                                 meta={},
                                 dont_filter=True)
Esempio n. 23
0
 def parse_item(response):
     """Collect image URLs from non-male authors' entries of a JSON feed
     and load them into a MeizituItem.

     Bug fixes: ``is not 'M'`` compared object *identity* with a string
     literal (implementation-dependent and a SyntaxWarning on modern
     Python) — it is now ``!= 'M'``. The ``for/else`` whose ``else``
     always ran (the loop has no ``break``) was flattened into a plain
     statement after the loop, and the JSON body is parsed once.
     """
     il = ItemLoader(item=MeizituItem(), response=response)
     data = json.loads(response.body)
     entries = [e for e in data['data']['entries']
                if e['author']['gender'] != 'M']
     images = []
     for entry in entries:
         try:
             images += entry["images"]
             images.append(entry['photo'])
         except KeyError:
             # Entry lacking images/photo: dump it for inspection.
             print(json.dumps(entry, indent=2))
     print(data['now'], len(images))
     il.add_value('image_urls', images)
     return il.load_item()
Esempio n. 24
0
 def parse(self, response):
     """Yield title/thumbnail items for every cell on the page, then
     follow the page-navigation links.

     Bug fixes: the pagination loop was nested inside the per-cell loop,
     re-yielding every nav link once per cell — it is hoisted out so each
     nav URL is scheduled exactly once. Python-2 ``print x`` statements
     were replaced with the parenthesized form (valid on Py2 and Py3).
     """
     sel = scrapy.selector.Selector(response)
     sites = sel.xpath('//*[@class="cell first-cell"]')
     for site in sites:
         item = MeizituItem()
         title = site.xpath('a/@title').extract()
         pic_url = site.xpath('a[1]/img/@src').extract()
         item['title'] = [t.encode('utf-8') for t in title]
         item['pic_url'] = pic_url
         yield item

     # Pagination is identical for every cell: schedule it once.
     urls = sel.xpath('//*[@class="page-nav"]/a/@href').extract()
     for url in urls:
         url = "http://www.tooopen.com" + url
         print(url)
         yield scrapy.http.Request(url, callback=self.parse)
Esempio n. 25
0
    def parse_page2(self, response):
        """Yield one item per image in the gallery, then follow the
        gallery's own page-number links.

        Bug fixes: ``print('siteUrl')`` printed the literal string rather
        than the URL, and a single item instance was mutated across
        yields — a fresh item is now created per image.
        """
        sel = Selector(response)
        sites = sel.css(
            "div.content div.main article img::attr(src)").extract()
        names = sel.css("div.content div.main h1::text").extract()
        for siteUrl in sites:
            print(siteUrl)
            item = MeizituItem()
            # Scheme-relative src needs an explicit scheme prefix.
            item['image_urls'] = ['https:' + siteUrl]
            item['name'] = names
            yield item

        # Pagination within this image set.
        hrefs = sel.css(
            "div.content div.main a.page-num::attr(href)").extract()
        for href in hrefs:
            url = response.urljoin('https:' + href)
            yield scrapy.Request(url, self.parse_page2)
Esempio n. 26
0
 def parse_item(self, response):
     """Load name, tags, image URLs and page URL into a MeizituItem via
     an ItemLoader and return the populated item.

     Cleanup: the original kept a debug list named ``re`` — shadowing the
     stdlib ``re`` module name — and a Python-2 ``print re`` of it; both
     were dead weight and are removed.
     """
     l = ItemLoader(item=MeizituItem(), response=response)
     # Gallery name
     l.add_xpath('name', '//h2/a/text()')
     # Tag block
     l.add_xpath(
         'tags',
         "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p"
     )
     # Image links (Identity keeps the extracted list as-is)
     l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src",
                 Identity())
     # Source page URL
     l.add_value('url', response.url)
     return l.load_item()
Esempio n. 27
0
 def parse_detail(self, response):
     """Accumulate image URLs across a gallery's paginated detail pages.

     The first page (no page number in the URL) creates the item; later
     pages retrieve it from ``response.meta`` and keep appending. The
     item is yielded only once no "下一页" (next page) link remains.
     """
     match = re.search(r'\/\d{1,6}\/(\d{1,6})', response.url)
     if match is None:
         # First page of the gallery: start a fresh item.
         page = 1
         item = MeizituItem()
         item['url'] = response.url
         item['title'] = response.meta.get('title', '')
         item['image_urls'] = []
     else:
         item = response.meta.get('item')
         page = match.group(1)
     selector = Selector(response)
     for img_src in selector.xpath('//div[@class="main-image"]//img/@src').extract():
         item['image_urls'].append(img_src)
     next_page = selector.xpath('//span[contains(text(), "下一页")]/parent::a/@href').extract_first(default=None)
     if next_page:
         yield Request(next_page, callback=self.parse_detail,
                       meta={'item': item, 'referer': response.url},
                       priority=int(page))
     else:
         yield item
Esempio n. 28
0
    def parse_detail(self, response):
        """Yield an item for the image on this page and schedule the next
        pagination page.

        Fixes: the regex is now a raw string; ``title`` may be ``None``
        (guarded before ``re.sub``); the next-page request is only
        yielded when a link was found (``Request(url=None)`` raised on
        the last page); and the unused ``postTime`` extraction is gone.
        """
        item = MeizituItem()

        imgUrl = response.xpath(
            '//div[@class="main-image"]//img/@src').extract_first()
        nextPage = response.xpath(
            '*//div[@class="pagenavi"]/a[last()]/@href').extract_first()
        title = response.xpath('*//h2/text()').extract_first()
        # Strip the trailing picture counter digits from the title.
        title = re.sub(r'(\d+)', '', title or '')
        item['url'] = imgUrl
        item['refeUrl'] = response.url
        item['name'] = title
        item['md5'] = common.get_md5(imgUrl)

        if nextPage:
            yield Request(url=nextPage,
                          callback=self.parse_detail,
                          priority=20,
                          headers={'referer': response.url})
        yield item
Esempio n. 29
0
    def parse(self, response):
        """Yield an item (and a download request) for every tile in the
        #pins list, then request the next listing page up to page 3.

        Bug fix: the original mutated one item instance across yields, so
        all previously yielded references pointed at the last tile; a
        fresh item is now created per tile.
        """
        for oli in response.xpath(r'//ul[@id="pins"]/li'):
            item = MeizituItem()
            item["image_link"] = oli.xpath(r'.//@data-original').extract_first()
            item["image_name"] = oli.xpath(r'.//@alt').extract_first()
            yield item
            yield scrapy.Request(url=item['image_link'],
                                 callback=self.downloader,
                                 dont_filter=True)

        if self.page < 3:
            self.page += 1
            url = self.url.format(self.page)
            # dont_filter=True: bypass the duplicate-request filter.
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 dont_filter=True)
Esempio n. 30
0
 def parse_item(response):
     """Load every data-src image URL inside the js_content container and
     return the populated item."""
     loader = ItemLoader(item=MeizituItem(), response=response)
     loader.add_css('image_urls', 'div[id="js_content"] img::attr(data-src)')
     return loader.load_item()