def parse_item(self, response):
    """Parse one album detail page into a fully-populated MeinvItem.

    Extracts title, cover image and publish time, then expands the album's
    paginated URLs / image URLs from the on-page page counter.

    :param response: Scrapy response for the album's first page.
    :return: a MeinvItem with url_arrs/img_arrs covering every page.
    """
    item = MeinvItem()
    sel = Selector(response)
    item['title'] = sel.xpath(
        '//div[@class="content"]/h5/text()').extract()[0]
    item['img'] = sel.xpath(
        '//div[@class="content-pic"]/a/img/@src').extract()[0]
    # Skip the 5-char label prefix before the timestamp text.
    item['origin_time'] = sel.xpath(
        '//div[@class="content-msg"]/text()').extract()[0][5:]
    item['url'] = response.url
    item['url_arrs'] = [item['url']]
    item['img_arrs'] = [item['img']]
    item['catalogue'] = self.catalogue(item['url'])
    item['create_time'] = datetime.now()
    item['is_show'] = True
    count_text = sel.xpath(
        '//div[@class="content-page"]/span[@class="page-ch"]/text()'
    ).extract()[0]
    # Counter text wraps the number in one char on each side; strip both.
    count = int(count_text[1:-1])
    # FIX: original used range(2, count), which silently dropped the last
    # page; pages run 1..count and page 1 is already in the lists.
    for num in range(2, count + 1):
        # Derived URLs follow the site's "<base>_<n>.html" / "<img><n>.jpg"
        # naming; slice off the ".html" / "1.jpg" suffix before appending.
        item['url_arrs'].append(response.url[0:-5] + '_' + str(num) + '.html')
        item['img_arrs'].append(item['img'][0:-5] + str(num) + '.jpg')
    return item
def parse(self, response):
    """Yield one MeinvItem per listing column: its heading plus all
    lazy-loaded image URLs (data-original attributes) in that column."""
    for block in response.xpath('//div[@class="list_cont list_cont2 w1180"]'):
        heading = block.xpath('.//h2/text()').get()
        image_urls = block.xpath(
            './/ul[@class="clearfix"]/li/a/img/@data-original').getall()
        yield MeinvItem(headline=heading, urls=image_urls)
def load_item(self, d):
    """Build a MeinvItem from one listing-entry selector.

    :param d: selector wrapping a single <a>/<img> listing entry.
    :return: a MeinvItem with title, absolute url, normalized img url,
             empty accumulator lists, catalogue and creation timestamp.
    """
    item = MeinvItem()
    item['title'] = d.css('a::attr(title)').extract_first()
    item['url'] = 'http://m.92mntu.com' + d.css('a::attr(href)').extract_first()
    item['img'] = d.css('img::attr(src)').extract_first()
    # Drop the "www." prefix so the image host matches the mobile site.
    item['img'] = item['img'].replace('www.', '')
    item['img_arrs'] = []
    item['url_arrs'] = []
    item['catalogue'] = self.catalogue(item['url'])
    item['create_time'] = time.time()
    # FIX: Python 2 `print d...` statement is a SyntaxError on Python 3;
    # use the print() function (same value: the entry's title).
    print(d.css('a::attr(title)').extract_first())
    return item
def parse(self, response):
    """Emit title + image URL for this detail page, then follow the
    "next" link (if present) back into this same callback."""
    record = MeinvItem()
    record['title'] = response.xpath(
        "/html/body/div[2]/div[2]/h1/text()").extract_first()
    record['imgurl'] = response.xpath(
        "//*[@id='picBody']/p/a[1]/img/@src").extract_first()
    yield record
    links = response.xpath("//*[@id='nl']/a/@href").extract()
    if links:
        yield scrapy.Request(
            "https://www.27270.com/ent/meinvtupian/2019/" + links[0],
            callback=self.parse)
def parse(self, response):
    """Collect all content-image URLs on this page into one item and
    schedule the next page through the same callback.

    :param response: Scrapy response for a gallery page.
    """
    item = MeinvItem()
    # All images on the page; may legitimately be empty.
    item['imgurl'] = response.css('img.content_img::attr(src)').extract()
    # FIX: original did extract()[1], which raises IndexError when fewer
    # than two pagination links exist (making its `is not None` check
    # unreachable). Guard the length instead.
    nav_links = response.css('a.a1::attr(href)').extract()
    if len(nav_links) > 1:
        yield scrapy.Request(response.urljoin(nav_links[1]),
                             callback=self.parse)
    yield item
    # Removed dead debug code: an always-empty `img` list was appended to
    # 'a.txt' on every call, writing only "[]".
def parse_album(self, response):
    """Extract a single image's url and title from an album page.

    The owning album's title/url/tag arrive via ``response.meta`` (set by
    the list-page callback). Yields the item, schedules the image download,
    then follows the album's "next page" link back into this callback.

    :param response: album-page response carrying meta keys
        'album_title', 'album_url', 'tag'.
    """
    item = MeinvItem()
    # First image inside the article body; raises IndexError if absent.
    img_url = response.xpath(
        '//div[@class="articleBody"]/p/a/img/@src')[0].extract()
    img_title = response.xpath(
        '//div[@class="articleBody"]/p/a/img/@alt')[0].extract()
    item['image_title'] = img_title
    item['image_url'] = img_url
    item['album_title'] = response.meta['album_title']
    item['album_url'] = response.meta['album_url']
    item['tag'] = response.meta['tag']
    yield item
    # NOTE(review): 'img_title' is set to the ALBUM title, not the image
    # title extracted above — looks like a copy/paste slip; confirm what
    # SaveImage actually expects before changing.
    yield Request(img_url, callback=self.SaveImage, meta={
        'album_title': item['album_title'],
        'img_title': item['album_title'],
    })
    # Pagination: only follow when a pages bar exists, a next link exists,
    # and the link is a real URL (a bare '#' marks the last page).
    status = response.xpath('//div[@class="pages"]/ul').extract()
    next_page = response.xpath(
        '//div[@class="pages"]/ul/li[last()]/a/@href').extract()
    if status:
        if next_page:
            if '#' not in next_page[0]:
                next_url = response.urljoin(next_page[0])
                yield Request(next_url, callback=self.parse_album, meta={
                    'album_title': item['album_title'],
                    'album_url': item['album_url'],
                    'tag': item['tag']
                })
            else:
                print('*************最后一页了 别翻了 ***********')
    else:
        print('******* 找不到下一页 %s图集只有一张图片*********'
              % response.meta['album_url'])
def parse_item(self, response):
    """Parse a post page's metadata and request its first image.

    :param response: Scrapy response for a single post page.
    """
    item = MeinvItem()
    # Title
    item['title'] = response.xpath(
        '//h2[@class="main-title"]/text()').extract_first('')
    # Category
    item['classify'] = response.xpath(
        '//div[@class="main-meta"]/span[1]/a/text()').extract_first('')
    # Publish time, with the "发布于" label prefix removed.
    item['time'] = response.xpath(
        '//div[@class="main-meta"]/span[2]/text()'
    ).extract_first('').replace('发布于', '')
    # Page views
    item['page_view'] = response.xpath(
        '//div[@class="main-meta"]/span[3]/text()').extract_first('')
    # Image links
    item['image_link'] = response.xpath(
        '//div[@class="main-image"]/p/a/img/@src').extract()
    # FIX: original indexed [0] unconditionally — IndexError on pages
    # without an image. Also pass the item via meta so the callback can
    # reach it; the original built the item and then dropped it entirely.
    if item['image_link']:
        yield scrapy.Request(item['image_link'][0], callback=self.a,
                             meta={'item': item})
def parse_item(self, response):
    """Parse one page of an album, accumulating one image per page.

    On the first page a fresh MeinvItem is created; on subsequent pages
    the partially-filled item arrives via ``response.meta['item']``.
    Follows in-album pagination (next href contains the album url) and
    yields the finished item when pagination leaves the album.
    """
    sel = Selector(response)
    # FIX: dict.has_key() was removed in Python 3; use `in` membership.
    if 'item' not in response.meta:
        item = MeinvItem()
        item['url'] = response.url
        item['title'] = sel.xpath(
            '//h2[@class="main-title"]/text()').extract_first()
        item['img'] = sel.xpath(
            '//div[@class="main-image"]/p/a/img/@src').extract_first()
        item['img_arrs'] = []
        item['catalogue'] = sel.xpath(
            '//a[@rel="category tag"]/text()').extract_first()
    else:
        item = response.meta['item']
    item['img_arrs'].append({
        'img_title': sel.xpath(
            '//h2[@class="main-title"]/text()').extract_first(),
        'img_url': sel.xpath(
            '//div[@class="main-image"]/p/a/img/@src').extract_first(),
    })
    item['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                        time.localtime())
    item['origin_time'] = sel.xpath(
        '//span[contains(text(), "-")]').extract_first()
    next_page = sel.xpath(
        '//div[@class="main-image"]/p/a/@href').extract_first()
    # FIX: next_page may be None (extract_first on a miss), which crashed
    # the original's .find() call; treat that as end-of-album.
    if next_page and next_page.find(item['url']) != -1:
        yield Request(next_page, meta={'item': item},
                      callback=self.parse_item, dont_filter=False,
                      errback=self.errback_httpbin)
    else:
        yield item
def parse(self, response):
    """List-page callback: schedule every album on the page, follow
    pagination, and finally report the running album count.

    :param response: Scrapy response for a tag's listing page.
    """
    items = MeinvItem()
    # Derive the tag from the URL path once, before the loop, so it is
    # defined even when the page lists no albums.
    tag = response.url.split('/')[3]
    for a_tag in response.xpath('//div[@class="listBox"]/ul/li/a'):
        album_url = a_tag.xpath('@href')[0].extract()
        album_title = a_tag.xpath('@title')[0].extract()
        items['album_url'] = album_url
        items['album_title'] = album_title
        items['tag'] = tag
        self.album_count += 1
        yield Request(album_url, callback=self.parse_album, meta={
            'album_url': items['album_url'],
            'album_title': items['album_title'],
            'tag': items['tag']
        })
    # FIX: guard pagination lookups — the original's [0].extract() raised
    # IndexError when the pages bar was missing.
    next_page_status = response.xpath(
        '//div[@class="pages"]/ul/li[last()-1]/a/text()').extract_first('')
    if "下一页" in next_page_status:
        next_page = response.xpath(
            '//div[@class="pages"]/ul/li[last()-1]/a/@href').extract_first()
        yield scrapy.Request(response.urljoin(next_page),
                             callback=self.parse)
        # Removed time.sleep(1): blocking sleeps stall Scrapy's single
        # event loop; throttle via DOWNLOAD_DELAY instead.
    items['album_count'] = self.album_count
    # FIX: original did `yield items['album_count']` — a bare int, which
    # Scrapy rejects as a callback result. Yield the item itself.
    yield items
    print('*******************%s共有 %s 套 ' % (tag, items['album_count']))