def parse(self, response):
    """Collect album links from a listing page, then fetch each album
    synchronously with urllib and yield one item per album.

    NOTE(review): the inner loop blocks the Scrapy reactor with
    request.urlopen — the idiomatic fix is yielding scrapy.Request per
    album URL; left unchanged here.
    """
    zhuye_url = []
    # Parse the listing page body with lxml directly (bypasses response.xpath).
    content = response.body.decode("utf-8")
    data = etree.HTML(content)
    for each in data.xpath("//div/ul[@id='pins']/li"):
        # First <a href> inside each <li> is the album's landing page.
        zhuyemian = each.xpath(".//a/@href")[0]
        zhuye_url.append(zhuyemian)
    for each_zhu_url in zhuye_url:
        # each_zhu_url = random.choice(zhuye_url)
        # print(each_zhu_url)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6823.400 QQBrowser/10.3.3117.400"
        }
        # Blocking fetch of the album page (urllib, not Scrapy's scheduler).
        rsp = request.Request(each_zhu_url, headers=headers)
        rep = request.urlopen(rsp)
        html = rep.read().decode()
        html = etree.HTML(html)
        # Positional XPaths: page count, album title, first image URL.
        # NOTE(review): these IndexError if the layout shifts — no guard.
        pages = html.xpath("//div[2]/div[1]/div[4]/a[5]/span/text()")[0]
        names = html.xpath("//div[@class='content']/h2/text()")[0]
        imgsss = html.xpath("//div[@class='main-image']/p/a/img/@src")[0]
        item = MeizituItem()
        item["name"] = names
        item["page"] = pages
        item["img_down"] = imgsss
        print(imgsss)
        # Every item carries the full list of album URLs, not just its own.
        item["img_zhu"] = zhuye_url
        yield item
def get_every_image_url(self, response):
    """Yield one item carrying the gallery name (from request meta)
    and the first image URL found in the content block."""
    result = MeizituItem()
    result['name'] = response.meta['name']
    result['img_url'] = response.xpath(
        '//div[@class="content"]/a/img/@src').extract()[0]
    yield result
def img_parse(self, response):
    """Yield one item holding the main image's alt text and source URL."""
    main_img = response.xpath('//div[@class="main-image"]/p/a/img')
    entry = MeizituItem()
    entry['title'] = main_img.xpath('./@alt').extract_first()
    entry['imgurl'] = main_img.xpath('./@src').extract_first()
    yield entry
def parse(self, response):
    """Yield one item with every image URL on the page, then follow
    the "next" pagination link if present."""
    item = MeizituItem()
    item['image_urls'] = response.xpath('//img//@src').extract()
    yield item
    new_url = response.xpath('.//li[@class="next"]//@href').extract_first()
    if new_url:
        # Bug fix: resolve against the current page — the href is often
        # relative, and scrapy.Request rejects non-absolute URLs.
        yield scrapy.Request(response.urljoin(new_url), callback=self.parse)
def parse_second_page(self, response):
    """Yield a single item: the post's folder name plus all image URLs."""
    item = MeizituItem()
    item['image_folder_name'] = response.css(
        'div.postmeta a::text').extract_first()
    item['image_urls'] = response.css(
        'div.postContent img::attr(src)').extract()
    yield item
def parse_item(self, response):
    """Load the image URLs (kept as a raw list via Identity) and the
    source page URL into a MeizituItem."""
    loader = ItemLoader(item=MeizituItem(), response=response)
    loader.add_xpath('img_url', '//div[@class="text"]/p/br/img/@src', Identity())
    loader.add_value('url', response.url)
    return loader.load_item()
def parse_item(self, response):
    """Populate a MeizituItem with title, tag block, image URLs
    (list preserved via Identity) and the page URL."""
    loader = ItemLoader(item=MeizituItem(), response=response)
    loader.add_xpath('name', '//h2/a/text()')
    loader.add_xpath(
        'tags',
        "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    loader.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    loader.add_value('url', response.url)
    return loader.load_item()
def parse_item(self, response):
    """Yield one item per image URL found inside the picture container."""
    for img_src in response.xpath('//div[@id="picture"]//img/@src').extract():
        entry = MeizituItem()
        entry['url'] = img_src
        yield entry
def parse(self, response):
    """For each year heading, pair the titles and hrefs from the list
    that follows it, and request every detail page with the item in meta."""
    for year in response.xpath('//div[@class="year"]'):
        hrefs = year.xpath('./following-sibling::ul[1]//li/p[2]/a/@href').getall()
        titles = year.xpath('./following-sibling::ul[1]//li/p[2]/a/text()').getall()
        for title, href in zip(titles, hrefs):
            entry = MeizituItem()
            entry['year'] = year.xpath('./text()').get()
            entry['name'] = title
            yield scrapy.Request(href, callback=self.parse_url,
                                 meta={'item': entry})
def parse(self, response):
    """Extract image URLs from the JSON API payload; yield one item per image.

    Entries lacking a 'middleURL' key are reported and skipped.
    """
    imgs = json.loads(response.body)['data']
    for eachImage in imgs:
        try:
            # Bug fix: build a fresh item per image. The original reused one
            # MeizituItem instance, so later iterations overwrote items still
            # queued in the pipeline.
            item = MeizituItem()
            item['image_urls'] = [eachImage['middleURL']]
            yield item
        except KeyError as e:
            # Narrowed from `except Exception`: only a missing key is expected.
            print(e)
def parse_item(self, response):
    """Yield one item describing the page's main image: category, URL,
    alt text and the filename parsed from the URL.

    Pages whose image URL is missing or doesn't match the expected
    ``.../NN/<name>.<ext>g`` shape are skipped instead of crashing.
    """
    item = MeizituItem()
    item["img_dir"] = response.xpath(
        '//li[@class="current-menu-parent"]/a/text()').extract_first()
    item['img_url'] = response.xpath(
        '//div[@class="main-image"]/p/a/img/@src').extract_first()
    item['img_dir_2'] = response.xpath(
        '//div[@class="main-image"]/p/a/img/@alt').extract_first()
    # Bug fix: the original called .group(1) unconditionally — that raises
    # AttributeError when img_url is None or the pattern doesn't match.
    match = re.match(r'.*?/\d{2}/(.*\.\w+g)', item['img_url'] or '')
    if match is None:
        return
    item["img_name"] = match.group(1)
    yield item
def parse(self, response):
    """Yield one item per listing entry: detail URL, name, thumbnail URL.

    Idiom fix: iterate the three parallel lists with zip instead of
    indexing by ``range(len(...))``.
    """
    result = response.selector.xpath('//ul[@id="pins"]')
    detail_urls = result.css('li a::attr(href)').extract()
    names = result.css('a img::attr(alt)').extract()
    img_urls = result.css('a img::attr(src)').extract()
    for detail_url, name, img_url in zip(detail_urls, names, img_urls):
        item = MeizituItem()
        item['detail_url'] = detail_url
        item['name'] = name
        item['img_url'] = img_url
        yield item
def parse(self, response):
    """Return a list of items — one per <p> in postContent — each carrying
    that paragraph's image links plus the page title."""
    hxs = Selector(response)
    page_title = hxs.xpath('/html/head/title/text()').extract()
    collected = []
    for paragraph in hxs.xpath('//div[@class="postContent"]/p'):
        entry = MeizituItem()
        entry['link'] = paragraph.xpath('img/@src').extract()
        entry['title'] = page_title
        collected.append(entry)
    return collected
def parse_pai(self, response):
    """On pages that have a comments section, yield one item per
    lazy-loaded image (URL taken from data-original)."""
    if response.xpath('//div[@id="comments"]'):
        # Hoisted out of the loop: the category title is the same for
        # every image on the page (loop-invariant xpath).
        img_dir = response.xpath(
            '//li[@class="current-menu-item"]/a/text()').extract_first()
        for img in response.xpath('//img[@class="lazy"]'):
            item = MeizituItem()
            item["img_dir"] = img_dir
            item['img_dir_2'] = ''
            item["img_url"] = img.xpath('./@data-original').extract_first()
            # Last path segment of the URL is used as the filename.
            item["img_name"] = re.findall(r'.*/(.*)', item["img_url"])[-1]
            yield item
def parse(self, response):
    """Request the listing page of the first top-level nav category.

    The ``break`` at the bottom means only the first <a> is processed —
    behavior preserved from the original.
    """
    for nav_link in response.xpath("//ul[@id='menu-nav']/li/a"):
        item = MeizituItem()
        item['category_1_title'] = nav_link.xpath('./text()').extract_first()
        item['category_1_href'] = nav_link.xpath('./@href').extract_first()
        yield scrapy.Request(url=item['category_1_href'],
                             callback=self.parse_list,
                             meta={'item': item},
                             dont_filter=True)
        break
def fenye(self, response):
    """Yield the page's main image URL(s) and stripped title, then follow
    the pagenavi link to the next page of the same gallery.

    Cleanup: removed the unused local ``url = response.url`` and the
    dead trailing ``pass``.
    """
    item = MeizituItem()
    item['img_url'] = response.xpath(
        '//div[@class="main-image"]//img/@src').extract()
    item['title'] = response.xpath(
        '//div[@class="main-image"]//img/@alt').extract_first().strip()
    yield item
    # Sixth <a> in pagenavi points to the next page of the gallery.
    xhs = response.xpath('//div[@class="pagenavi"]/a[6]/@href').extract()
    for nav_url in xhs:
        yield Request(url=nav_url, callback=self.fenye)
def parse_item(self, response):
    """Yield one item per image link in the picture block, each carrying
    the album's directory name and a nickname sliced from the URL."""
    image_links = response.xpath(
        '//div[@id="picture"]/p/img/@src').extract()
    dir_name = response.xpath(
        '//div[@class="metaRight"]/h2/a/text()').extract()[0]
    for link in image_links:
        # Bug fix: build a fresh item per link. The original mutated one
        # shared instance, so every yielded item could end up with the
        # values of the final iteration.
        item = MeizituItem()
        item['dir_name'] = dir_name
        item['image_link'] = link
        # Strip scheme punctuation, then slice host prefix and extension.
        item['nick_name'] = link.replace(':', '').replace('/', '')[39:-4]
        yield item
def parse_xijie(self, response):
    """Yield one item per image in the picture block, each stamped with
    the page's date (rebuilt as YYYY-MM-DD) and title."""
    sel = Selector(response)
    rawdate1 = sel.xpath('//div[@class="month_Year"]/text()').extract()[0]
    rawdate2 = sel.xpath('//div[@class="day"]/text()').extract()[0]
    # rawdate1 appears to be "MM<sep>YYYY": last 4 chars = year,
    # first 2 = month — TODO confirm against a live page.
    date = rawdate1[-4:] + '-' + rawdate1[:2] + '-' + rawdate2
    title = sel.xpath('//div[@class="metaRight"]/h2/a/text()').extract()[0]
    for yige in sel.xpath('//div[@id="picture"]//img'):
        # Bug fix: fresh item per image — the original reused a single
        # instance across yields, letting later iterations overwrite
        # items still queued in the pipeline.
        item = MeizituItem()
        item['date'] = date
        item['title'] = title
        item['image_urls'] = [yige.xpath('./@src').extract()[0]]
        yield item
def parse_body(self, response):
    """Yield one item with the gallery name and its image URLs.

    Tries the standard layout first; falls back to the lazy-loading
    (`scrollLoading`) layout when the first xpath matches nothing.
    Fix: the original extracted the primary xpath twice.
    """
    item = MeizituItem()
    item['name'] = response.xpath(
        '//div[@class="metaRight"]/h2/a/text()').extract_first()
    urls = response.xpath('//div[@id="picture"]/p/img/@src').extract()
    if not urls:
        # Fallback layout for lazily loaded images.
        urls = response.xpath('//img[@class="scrollLoading"]/@src').extract()
    item['image_urls'] = urls
    yield item
def parse_picture(self, response):
    """Download the current picture, then follow the next nav link unless
    its label is the next-album marker ('下一组»')."""
    sel = response.selector
    item = MeizituItem()
    item['pic_name'] = sel.xpath(
        "//div[@ class='main-image']/p/a/img/@alt").extract()
    item['pic_url'] = sel.xpath(
        "//div[@ class='main-image']/p/a/img/@src").extract()
    download(item['pic_url'][0], item['pic_name'][0])
    nav_label = sel.xpath(
        "//div[@ class='pagenavi']/span[not(@class='dots')]/following-sibling::a[1]/span/text()"
    ).extract()
    if nav_label[0] != '下一组»':
        nav_href = sel.xpath(
            "//div[@ class='pagenavi']/span[not(@class='dots')]/following-sibling::a[1]/@href"
        ).extract()
        yield scrapy.Request(nav_href[0], callback=self.parse_picture)
def parse_next(self, response):
    """Yield the current page's image name/URL, then follow the last
    pagination link while its trailing page number stays below 500.

    NOTE(review): the absolute /html/body/... XPaths are brittle and
    break on any layout change; int(maxp) raises if the last href's
    final path segment is not numeric — TODO confirm.
    """
    item = MeizituItem()
    item['name'] = response.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@alt').extract()[0]
    item['img_url'] = response.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@src').extract()
    yield item
    # max_page = response.xpath('//div[@divclass="pagenavi"]/a/text()')[-2].extract()
    # Last <a> in the nav block; its final path segment is the page number.
    next_page = response.xpath('/html/body/div[2]/div[1]/div[4]/a/@href').extract()[-1]
    maxp = next_page.split('/')[-1]
    print(maxp)
    # now_page = response.url.split('/')[-1]
    # Hard cap at page 500 to stop the follow chain.
    if int(maxp) < 500:
        yield scrapy.Request(next_page, callback=self.parse_next)
def parse_info(self, response):
    """Yield the main image's URL and title, then follow the last
    pagenavi link (dont_filter allows revisiting already-seen URLs)."""
    item = MeizituItem()
    item["image_name"] = response.xpath(
        '//h2[@class="main-title"]/text()').extract_first()
    item["image_url"] = response.xpath(
        '//div[@class="main-image"]/p/a/img/@src').extract_first()
    yield item
    next_page = response.xpath(
        '//div[@class="pagenavi"]/a[last()]/@href').extract_first()
    if next_page:
        yield scrapy.Request(next_page, callback=self.parse_info,
                             meta={}, dont_filter=True)
def parse_item(response):
    """Collect image URLs from the JSON feed, skipping entries whose
    author's gender is 'M'; entries without an images/photo key are
    dumped for inspection."""
    il = ItemLoader(item=MeizituItem(), response=response)
    # Bug fix: the original used `is not 'M'` — identity comparison with a
    # string literal is implementation-dependent (and a SyntaxWarning on
    # CPython >= 3.8). Equality is what was meant.
    entries = filter(lambda x: x['author']['gender'] != 'M',
                     json.loads(response.body)['data']['entries'])
    images = []
    for entry in entries:
        try:
            images += entry["images"]
            images.append(entry['photo'])
        except KeyError:
            # Unexpected entry shape: dump it for debugging.
            print(json.dumps(entry, indent=2))
        else:
            print(json.loads(response.body)['now'], len(images))
    il.add_value('image_urls', images)
    return il.load_item()
def parse(self, response):
    """Yield a title/pic_url item for each listing cell, then follow every
    page-nav link back into this callback.

    Fix: converted the Python-2 `print url` statements to print() calls —
    the rest of the file already uses the function form.
    """
    sel = scrapy.selector.Selector(response)
    for site in sel.xpath('//*[@class="cell first-cell"]'):
        item = MeizituItem()
        title = site.xpath('a/@title').extract()
        pic_url = site.xpath('a[1]/img/@src').extract()
        item['title'] = [t.encode('utf-8') for t in title]
        item['pic_url'] = pic_url
        yield item
    urls = sel.xpath('//*[@class="page-nav"]/a/@href').extract()
    for url in urls:
        print(url)
        # Nav hrefs are site-relative; prefix the host.
        url = "http://www.tooopen.com" + url
        print(url)
        yield scrapy.http.Request(url, callback=self.parse)
def parse_page2(self, response):
    """Yield one item per image on a gallery page, then follow the
    gallery's own pagination links back into this callback."""
    sel = Selector(response)
    sites = sel.css(
        "div.content div.main article img::attr(src)").extract()
    names = sel.css("div.content div.main h1::text").extract()
    for siteUrl in sites:
        # Bug fix: print the URL itself — the original printed the literal
        # string 'siteUrl'.
        print(siteUrl)
        # Bug fix: fresh item per image. Reusing one instance across yields
        # lets later iterations overwrite items still queued downstream.
        item = MeizituItem()
        # src is scheme-less; the https: prefix is required downstream.
        item['image_urls'] = ['https:' + siteUrl]
        item['name'] = names
        yield item
    # Follow the gallery's numbered pages.
    hrefs = sel.css(
        "div.content div.main a.page-num::attr(href)").extract()
    for href in hrefs:
        url = response.urljoin('https:' + href)
        yield scrapy.Request(url, self.parse_page2)
def parse_item(self, response):
    """Load name, tags, image URLs and the page URL into a MeizituItem.

    Fixes: the old local list named ``re`` shadowed the `re` module and was
    printed with a Python-2 print statement; load_item() was also called
    twice. The printed debug output (a one-element list with the item) is
    preserved.
    """
    loader = ItemLoader(item=MeizituItem(), response=response)
    # name
    loader.add_xpath('name', '//h2/a/text()')
    # tags
    loader.add_xpath(
        'tags',
        "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p"
    )
    # image links (Identity keeps the list as-is)
    loader.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    # source url
    loader.add_value('url', response.url)
    item = loader.load_item()
    print([item])
    return item
def parse_detail(self, response):
    """Accumulate a gallery's image URLs into one item across its pages.

    The first page (URL without a page suffix) creates the item; later
    pages receive it via meta and keep appending. The item is yielded only
    when no "下一页" link remains.
    """
    page_match = re.search(r'\/\d{1,6}\/(\d{1,6})', response.url)
    if page_match is None:
        # Entry page of the gallery: start a fresh item.
        page = 1
        item = MeizituItem()
        item['url'] = response.url
        item['title'] = response.meta.get('title', '')
        item['image_urls'] = []
    else:
        # Continuation page: keep filling the item carried in meta.
        item = response.meta.get('item')
        page = page_match.group(1)
    selector = Selector(response)
    for img_src in selector.xpath('//div[@class="main-image"]//img/@src').extract():
        item['image_urls'].append(img_src)
    next_page = selector.xpath(
        '//span[contains(text(), "下一页")]/parent::a/@href'
    ).extract_first(default=None)
    if next_page:
        yield Request(next_page, callback=self.parse_detail,
                      meta={'item': item, 'referer': response.url},
                      priority=int(page))
    else:
        yield item
def parse_detail(self, response):
    """Yield one item per gallery page (image URL, referer, digit-stripped
    title, URL md5) and follow the last pagenavi link to the next page.

    Fixes: guard the next-page Request — extract_first() returns None on
    the final page and Request(url=None) raises; removed the unused
    ``postTime`` local.
    """
    item = MeizituItem()
    imgUrl = response.xpath(
        '//div[@class="main-image"]//img/@src').extract_first()
    nextPage = response.xpath(
        '*//div[@class="pagenavi"]/a[last()]/@href').extract_first()
    title = response.xpath('*//h2/text()').extract_first()
    # Strip digits from the title (page counters etc.).
    title = re.sub(r'(\d+)', '', title)
    item['url'] = imgUrl
    item['refeUrl'] = response.url
    item['name'] = title
    item['md5'] = common.get_md5(imgUrl)
    if nextPage:
        yield Request(url=nextPage, callback=self.parse_detail, priority=20,
                      headers={'referer': response.url})
    yield item
def parse(self, response):
    """Yield one item per listing entry (image link + alt name), request
    the image for download, then follow the next listing page up to page 3.

    Fix: the original built a single MeizituItem outside the loop and
    mutated it every iteration, so queued items could be overwritten.
    """
    for oli in response.xpath(r'//ul[@id="pins"]/li'):
        item = MeizituItem()
        item["image_link"] = oli.xpath(r'.//@data-original').extract_first()
        item["image_name"] = oli.xpath(r'.//@alt').extract_first()
        yield item
        yield scrapy.Request(url=item['image_link'],
                             callback=self.downloader,
                             dont_filter=True)
    if self.page < 3:
        self.page += 1
        # dont_filter: revisit listing URLs even if already seen.
        yield scrapy.Request(url=self.url.format(self.page),
                             callback=self.parse,
                             dont_filter=True)
def parse_item(response):
    """Load every lazy-loaded image URL (data-src) from the article body."""
    loader = ItemLoader(item=MeizituItem(), response=response)
    loader.add_css('image_urls', 'div[id="js_content"] img::attr(data-src)')
    return loader.load_item()