def parse_detail(self, response):
    """Build a ``DesignItem`` from an article detail page.

    Collects the first article image URL, the tag link texts, the ``<h1>``
    title and the article body text. The first tag is replaced by the author
    link text when one is present and dropped otherwise. The body text has
    all whitespace removed and is capped at 480 characters. Finally every
    entry of the externally defined ``data`` mapping is copied onto the item.

    Args:
        response: Scrapy response for the detail page.

    Yields:
        The populated ``DesignItem``.

    Raises:
        IndexError: if the page has no article image or no ``<h1>`` text.
    """
    item = DesignItem()
    url = response.url
    img_url = response.xpath(
        '//div[@class="artcon"]//img/@src').extract()[0]

    tags = response.xpath('//div[@id="loat6"]/a/text()').extract()
    author = response.xpath('//div[@class="zuozhe1"]/a/text()').extract()
    # An empty tag list is left alone; previously this path raised an
    # uncaught IndexError from inside the bare ``except`` handler.
    if tags:
        if author:
            tags[0] = author[0]
        else:
            tags.pop(0)
    tags = ','.join(tags)

    title = response.xpath('//h1/text()').extract()[0]

    remark_parts = response.xpath(
        '//div[@class="articlebox"]/div[@class="artcon"]//text()').extract()
    # Remove every whitespace character, then keep at most 480 characters.
    remark = ''.join(''.join(part.split()) for part in remark_parts)[:480]

    item['tags'] = tags
    item['title'] = title.strip()
    item['remark'] = remark.strip()
    item['url'] = url
    item['img_url'] = img_url
    print(item)
    for key, value in data.items():
        item[key] = value
    yield item
def parse(self, response):
    """Parse a listing page and schedule detail and next-page requests.

    For every ``<article>`` in the listing container a ``DesignItem`` is
    pre-filled with title, excerpt, URL and the fixed tag ``'design'`` and
    passed to ``parse_detail`` through ``meta['item']``. While ``self.page``
    is below 500 the next listing page is requested as well.

    Args:
        response: Scrapy response for a listing page.

    Yields:
        ``scrapy.Request`` objects for detail pages and the next listing page.

    Raises:
        IndexError: if an article has no title text or link.
    """
    articles = response.xpath(
        '//div[@class="dboom-container-block"]//article')
    for article in articles:
        item = DesignItem()
        title = article.xpath(
            './h3[@class="dboom-title"]/a/text()').extract()[0]
        url = article.xpath(
            './h3[@class="dboom-title"]/a/@href').extract()[0]
        # The excerpt is optional; fall back to an empty string instead of
        # relying on a bare ``except`` that would hide unrelated errors.
        excerpts = article.xpath(
            './p[@class="dboom-excerpt flip-other"]/text()').extract()
        remark = excerpts[0] if excerpts else ''

        item['title'] = title
        item['remark'] = remark
        item['url'] = url
        item['tags'] = 'design'
        yield scrapy.Request(url, callback=self.parse_detail,
                             meta={'item': item})

    if self.page < 500:
        print(self.page)
        self.page += 1
        yield scrapy.Request(
            'https://www.designboom.com/design/page' + str(self.page) + '/',
            callback=self.parse)
def parse(self, response):
    """Parse one JSON listing page and request the next one.

    The response body is decoded as JSON and each entry under ``'data'``
    becomes a ``DesignItem`` with title, tags, URL and image URL. The
    externally defined ``data`` mapping is copied onto the item and the raw
    entry is stored under ``'info'``. While ``self.p`` is below 9 a form
    request for the next page is issued.

    Args:
        response: Scrapy response whose body is a JSON document.

    Yields:
        ``DesignItem`` objects, then at most one ``scrapy.FormRequest``.

    Raises:
        KeyError: if an entry lacks ``'title'``, ``'litpic'`` or ``'id'``.
    """
    payload = json.loads(response.body)
    for entry in payload['data']:
        item = DesignItem()
        title = entry['title']
        try:
            # Each category name is followed by a comma (including the last
            # one) to keep the tag format this spider has always emitted.
            tags = ''.join(
                cate['name'] + ',' for cate in entry['showcateList'])
        except (KeyError, TypeError):
            # Missing or malformed category list: emit no tags.
            tags = ''
        img_url = 'http://www.perdesigncn.com' + entry['litpic']
        url = 'http://www.perdesigncn.com/Home/info/' + entry['id']

        item['title'] = title
        item['tags'] = tags
        item['url'] = url
        item['img_url'] = img_url
        for key, value in data.items():
            item[key] = value
        item['info'] = entry
        yield item

    if self.p < 9:
        self.p += 1
        try:
            request = scrapy.FormRequest(url=self.url,
                                         formdata={'p': str(self.p)},
                                         callback=self.parse)
        except (TypeError, ValueError):
            print(self.p, '*' * 50)
        else:
            # Yield outside the ``try`` so generator shutdown is never
            # swallowed by the error handler.
            yield request
def parse_detail(self, response):
    """Build a ``DesignItem`` from an award project page.

    Reads the prize level from ``response.meta`` and extracts year, tags,
    designer, optional company, title, image URL and the first description
    paragraph. The description has newlines, spaces and carriage returns
    removed and is capped at 450 characters. The externally defined ``data``
    mapping is then copied onto the item.

    Args:
        response: Scrapy response carrying ``meta['prize_level']``.

    Yields:
        The populated ``DesignItem``.

    Raises:
        KeyError: if ``meta`` has no ``'prize_level'``.
        IndexError: if a mandatory element (year, designer, title, image,
            description) is missing from the page.
    """
    item = DesignItem()
    prize_level = response.meta['prize_level']
    prize_time = response.xpath(
        '//li[@class="project-year project-term"]/h4/text()').extract()[0]

    raw_tags = response.xpath(
        '//li[@class="project-discipline project-term"]//div/text()'
    ).extract()
    # Drop the single-space text nodes in one pass.
    tags = [tag for tag in raw_tags if tag != ' ']

    designer = response.xpath(
        '//div[@class="columns project-details"]/div[2]//li/text()'
    ).extract()[0]

    # The company block is optional on the page.
    companies = response.xpath(
        '//div[@class="columns project-details"]/div[3]/div/p/text()'
    ).extract()
    company = companies[0] if companies else ''

    title = response.xpath('//h1/text()').extract()[0]
    img_url = response.xpath(
        '//div[@class="project-main-content__inner-wrapper"]'
        '/figure[1]/img/@src').extract()[0]
    if not img_url.startswith('https://good-design.org'):
        img_url = 'https://good-design.org' + img_url

    remark = response.xpath(
        '//div[@class="project-description"]/p[1]/text()').extract()[0]
    remark = remark.replace('\n', '').replace(' ', '').replace('\r', '')
    remark = remark.strip()[:450]

    item['prize_level'] = prize_level
    item['prize_time'] = prize_time
    item['tags'] = tags
    item['designer'] = designer
    item['company'] = company
    item['title'] = title
    item['img_url'] = img_url
    item['remark'] = remark
    for key, value in data.items():
        item[key] = value
    yield item
def parse_detail(self, response):
    """Build a ``DesignItem`` from a case detail page.

    Extracts the active category link text as the tag, the first content
    image (prefixed with the site host when it does not start with
    ``http``), an optional remark and the case title, then copies the
    externally defined ``data`` mapping onto the item.

    Args:
        response: Scrapy response for the detail page.

    Yields:
        The populated ``DesignItem``.

    Raises:
        IndexError: if the tag, image or title element is missing.
    """
    item = DesignItem()
    url = response.url
    tags = response.xpath(
        '//li[contains(@class," li_active1")]/a/text()').extract()[0]
    img_url = response.xpath(
        '//div[@class="view_content"]/p/img/@src').extract()[0]
    if not img_url.startswith('http'):
        img_url = 'http://www.designdo.cn' + img_url

    # The remark is addressed by an absolute path and is absent on many
    # pages; default to '' rather than catching every exception.
    remarks = response.xpath(
        '/html/body/div[9]/div[1]/table/tr/td[1]/p[2]/span/text()'
    ).extract()
    remark = remarks[0] if remarks else ''

    title = response.xpath('//p[@class="case_title"]/text()').extract()[0]

    item['title'] = title
    item['remark'] = remark
    item['img_url'] = img_url
    item['url'] = url
    item['tags'] = tags
    for key, value in data.items():
        item[key] = value
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one product detail page.

    The tag is looked up from the spider's current category index. The
    banner image URL is prefixed with the site host unless it starts with
    ``http``. The banner text has all whitespace removed and is limited to
    500 characters. Entries of the external ``data`` mapping are copied onto
    the item last.
    """
    item = DesignItem()
    print(self.category_index)
    page_url = response.url
    category_tag = self.category[self.category_list[self.category_index]]

    banner_src = response.xpath(
        '//div[@class="product-banner product-banner-video"]/img/@src'
    ).extract()[0]
    image = (banner_src if banner_src.startswith('http')
             else 'http://www.lkkdesign.com' + banner_src)

    text_nodes = response.xpath(
        '//div[@class="product-banner-txt"]//text()').extract()
    summary = ''.join(''.join(node.split()) for node in text_nodes)[:500]

    heading = response.xpath('//h1/text()').extract()[0]

    item['title'] = heading
    item['remark'] = summary
    item['img_url'] = image
    item['url'] = page_url
    item['tags'] = category_tag
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one award entry page.

    Prize level and company are the second text node of the first and second
    ``eparams`` list items. The first thumbnail URL has ``c200x200`` replaced
    by ``a768`` and is prefixed with the CDN host unless it starts with
    ``http``. The description has all whitespace removed and is limited to
    480 characters. The prize time is ``self.year``.
    """
    item = DesignItem()
    level = response.xpath(
        '//ul[@class="eparams"]/li[1]/text()').extract()[1].strip()
    year = self.year
    page_url = response.url

    thumb = response.xpath(
        '//div[@class="eimglist"]/a[1]/img/@src').extract()[0]
    image = thumb.replace('c200x200', 'a768')
    if not image.startswith('http'):
        image = 'http://cdn.di-award.org' + image

    text_nodes = response.xpath('//p[@class="econtent"]//text()').extract()
    description = ''.join(''.join(node.split()) for node in text_nodes)

    heading = response.xpath('//h3/text()').extract()[0]
    firm = response.xpath(
        '//ul[@class="eparams"]/li[2]/text()').extract()[1].strip()
    description = description[:480]

    item['title'] = heading
    item['remark'] = description
    item['url'] = page_url
    item['img_url'] = image
    item['company'] = firm
    item['prize_level'] = level
    item['prize_time'] = year
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse(self, response):
    """Parse a listing page and schedule detail and next-page requests.

    For every ``div.post`` a ``DesignItem`` is pre-filled with URL, title,
    full-size image URL (the ``-326x246`` thumbnail suffix is removed) and
    category tags, overlaid with the externally defined ``data`` mapping and
    passed to ``parse_detail`` through ``meta['item']``. While ``self.page``
    is below 30 the next listing page is requested.

    Args:
        response: Scrapy response for a listing page.

    Yields:
        ``scrapy.Request`` objects for detail pages and the next listing page.

    Raises:
        IndexError: if a post lacks a link, title or image.
    """
    posts = response.xpath('//div[@class="post"]')
    for post in posts:
        item = DesignItem()
        url = post.xpath('./div[1]/a/@href').extract()[0]
        title = post.xpath('./div[2]/h2//text()').extract()[0]
        img_url = post.xpath('./div[1]/a/img/@src').extract()[0]

        category_texts = post.xpath(
            './/div[@class="category"]//text()').extract()
        # Keep only real category names. The old count/remove loops reused
        # the outer loop variable and overwrote the post selector.
        tags = [text for text in category_texts
                if text not in (' ', ', ')]

        img_url = img_url.replace('-326x246', '')

        item['tags'] = tags
        item['img_url'] = img_url
        item['url'] = url
        item['title'] = title
        for key, value in data.items():
            item[key] = value
        yield scrapy.Request(url, callback=self.parse_detail,
                             meta={'item': item})

    if self.page < 30:
        self.page += 1
        yield scrapy.Request(
            url='http://www.pplock.com/industrial-design/page/'
                + str(self.page),
            callback=self.parse)
def parse_category(self, response):
    """Parse a category page of award winners and request each detail page.

    The active sub-navigation link text is used as the tag for every winner.
    Each winner yields a request to ``parse_detail`` carrying a pre-filled
    ``DesignItem`` in ``meta['item']``.

    Args:
        response: Scrapy response for a category page.

    Yields:
        One ``scrapy.Request`` per winner entry.

    Raises:
        IndexError: if the active category link or a mandatory field of an
            entry (title, award, agency, link) is missing.
    """
    design_list = response.xpath(
        '//ul[@class="gpWinnersInCategory gp itemList"]//div[@class="in"]')
    # Tag: the active category name.
    tags = response.xpath(
        '//ul[@id="sub-nav"]//a[@class="active"]/text()').extract()[0]

    for design in design_list:
        item = DesignItem()
        title = design.xpath(
            './/h3[@class="projectTitle"]//a/text()').extract()[0]
        prize_level = design.xpath(
            './/p[@class="award"]/text()').extract()[0]

        # Prefer the second agency text node and fall back to the first.
        # The query now runs once and no exception is swallowed.
        agency_texts = design.xpath('.//p[@class="agency"]/text()').extract()
        if len(agency_texts) > 1:
            designer_name = agency_texts[1].strip()
        else:
            designer_name = agency_texts[0].strip()

        detail_url = design.xpath('.//a[1]/@href').extract()[0]

        item['title'] = title  # title
        item['tags'] = tags
        item['prize_level'] = prize_level  # prize level
        item['designer'] = designer_name  # designer
        item['prize_time'] = str(self.year + 1)  # prize time
        for key, value in data.items():
            item[key] = value
        yield scrapy.Request(
            url='http://www.effectivedesign.org.uk' + detail_url,
            callback=self.parse_detail,
            meta={'item': item})
def parse(self, response):
    """Parse one page of the JSON case feed and request the next one.

    Every entry of the decoded JSON array becomes a ``DesignItem`` tagged
    with the spider's current category. A non-empty page triggers a request
    for the next page of the same category. An empty page moves on to the
    next category (while ``self.cate_index`` is below 4) starting at page 1.

    Args:
        response: Scrapy response whose text is a JSON array.

    Yields:
        ``DesignItem`` objects, then at most one follow-up
        ``scrapy.Request``.

    Raises:
        KeyError: if an entry lacks ``'title1'``, ``'pic1'`` or ``'id'``.
    """
    content = response.text
    result = json.loads(content)
    for case in result:
        item = DesignItem()
        item['title'] = case['title1']
        item['img_url'] = urllib.parse.unquote(
            'http://www.spark-design.cn' + case['pic1'])
        item['url'] = ('http://www.spark-design.cn/zh-cn/caseshow/?id='
                       + case['id'])
        item['tags'] = self.cate_dict[self.cate_list[self.cate_index]]
        for key, value in data.items():
            item[key] = value
        yield item

    # Test the decoded value, not the raw text: a body such as '[ ]' or
    # '[]\n' is also an empty page and must not keep pagination going.
    if result:
        self.page += 1
    elif self.cate_index < 4:
        self.cate_index += 1
        self.page = 1
    else:
        return

    yield scrapy.Request(
        'http://www.spark-design.cn/ajax.asp?rnd=.7055475&s=case&page=%s&cid=%s'
        % (self.page, self.cate_list[self.cate_index]),
        callback=self.parse)
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one award case page.

    Prize level and year come from spider state. The first gallery image is
    prefixed with the site host unless it already starts with it. The
    description has all whitespace removed and is limited to 480 characters.
    Designer text nodes have whitespace removed and are joined with spaces;
    company text nodes are joined with spaces unchanged.
    """
    site = 'http://www.chinagooddesignaward.com'
    item = DesignItem()
    level = self.prize_level[self.prize_index]
    year = self.year
    page_url = response.url

    image = response.xpath(
        '//div[@class="main_image"]/ul/li[1]/img/@src').extract()[0]
    if not image.startswith(site):
        image = site + image

    text_nodes = response.xpath('//div[@class="ct_cn"]//text()').extract()
    description = ''.join(''.join(node.split()) for node in text_nodes)

    heading = response.xpath('//h2/text()').extract()[0]

    designer_nodes = response.xpath(
        '//div[@class="case_text"]/dl[2]/dd/p//text()').extract()
    designers = ' '.join(''.join(node.split()) for node in designer_nodes)

    firm = ' '.join(response.xpath(
        '//div[@class="case_text"]/dl[1]/dd/p[1]//text()').extract())

    description = description[:480]

    item['title'] = heading
    item['remark'] = description
    item['url'] = page_url
    item['img_url'] = image
    item['designer'] = designers
    item['company'] = firm
    item['prize_level'] = level
    item['prize_time'] = year
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse_detail(self, response):
    """Build a ``DesignItem`` from a works detail page.

    The first works image URL is made absolute, and the span texts of the
    works table supply title (index 0), tag (index 1), designer (index 3)
    and an optional company (index 4). The remark is always empty and the
    prize time is ``str(self.year)``. The externally defined ``data``
    mapping is then copied onto the item.

    Args:
        response: Scrapy response for the detail page.

    Yields:
        The populated ``DesignItem``.

    Raises:
        IndexError: if the image is missing or fewer than four span texts
            are present.
    """
    item = DesignItem()
    img_url = response.xpath(
        '//div[@id="works_list"]//a[1]/img/@src').extract()[0]
    if not img_url.startswith('http://www.dgdesign.org.cn'):
        # Drops the first two characters of the path before prefixing the
        # host. NOTE(review): presumably a leading '..' - confirm.
        img_url = 'http://www.dgdesign.org.cn' + img_url[2:]

    message = response.xpath(
        '//div[@id="works_list"]//td[2]//div[1]/span/text()').extract()
    remark = ''
    print(img_url)

    item['img_url'] = img_url.strip()
    item['title'] = message[0].strip()
    # The company entry is optional; check the length instead of catching
    # every exception.
    item['company'] = message[4].strip() if len(message) > 4 else ''
    item['prize_time'] = str(self.year)
    item['remark'] = remark
    item['tags'] = [message[1]]
    item['designer'] = message[3].strip()
    for key, value in data.items():
        item[key] = value
    yield item
def parse_list(self, response):
    """Request every detail page linked from a designer's listing page.

    Each ``<h2>`` link yields a request to ``parse_detail`` carrying a
    ``DesignItem`` whose ``designer`` is the page's first ``<h1>`` text.

    Args:
        response: Scrapy response for the listing page.

    Yields:
        One ``scrapy.Request`` per detail link.

    Raises:
        IndexError: if links are present but the page has no ``<h1>`` text.
    """
    detail_urls = response.xpath('//h2/a/@href').extract()
    # The heading is the same for every link, so query it once. Indexing
    # stays inside the loop so a page without links never raises.
    heading_texts = response.xpath('//h1/text()').extract()
    for detail_url in detail_urls:
        item = DesignItem()
        item['designer'] = heading_texts[0]
        yield scrapy.Request(detail_url, callback=self.parse_detail,
                             meta={'item': item})
def parse_detail(self, response):
    """Build a ``DesignItem`` from an award work page.

    Increments ``self.total``, then reads the image URL and the text of the
    numbered ``zuopin_h`` blocks: title (1), designer (2), company (3),
    prize level (4) and remark (6). Missing blocks default to ``''``. The
    remark is cut to 450 characters before newlines, spaces and carriage
    returns are removed. The externally defined ``data`` mapping is copied
    onto the item last.

    Args:
        response: Scrapy response for the detail page.

    Yields:
        The populated ``DesignItem``.

    Raises:
        IndexError: if the page has no work image.
    """

    def block_text(position):
        # Text of the ``position``-th ``zuopin_h`` block, or '' if absent.
        values = response.xpath(
            '//div[@class="zuopin_h"][%d]/div/text()' % position).extract()
        return values[0] if values else ''

    url = response.url
    print(url)
    self.total += 1
    item = DesignItem()

    img_url = response.xpath('//div[@class="only"]/img/@src').extract()[0]
    if not img_url.startswith('http://www.redstaraward.org'):
        img_url = 'http://www.redstaraward.org/' + img_url

    title = block_text(1)
    designer = block_text(2)
    company = block_text(3)
    remark = block_text(6)[:450]
    prize_level = block_text(4)

    item['img_url'] = img_url.strip()
    item['title'] = title.strip()
    item['company'] = company.strip()
    item['prize_time'] = str(self.year)
    item['remark'] = remark.replace('\n', '').replace(' ', '').replace(
        '\r', '').strip()
    item['prize_level'] = prize_level.strip()
    item['designer'] = designer.strip()
    item['url'] = url.strip()
    for key, value in data.items():
        item[key] = value
    print("总数", self.total)
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one detail page.

    Image URL and tags are taken from ``response.meta`` (``None`` when
    absent) and the title from the ``h1.title`` element. Entries of the
    external ``data`` mapping are copied onto the item last.
    """
    item = DesignItem()
    meta = response.meta
    image = meta.get('img_url')
    page_url = response.url
    labels = meta.get('tags')
    heading = response.xpath('//h1[@class="title"]/text()').extract()[0]

    item['title'] = heading
    item['img_url'] = image
    item['url'] = page_url
    item['tags'] = labels
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse(self, response):
    """Parse a discover page and schedule detail and follow-up requests.

    Every ``data-project`` attribute is decoded as JSON and yields a request
    for the project page with a pre-filled ``DesignItem`` in
    ``meta['item']``. Tags are ``"<group>,<category name>"``, where the
    group (Technology, Design or Crafts) is chosen from ``self.category_id``.
    A non-empty page below page 200 is followed by the next page of the same
    category; otherwise crawling moves to the next entry of
    ``self.index_ids`` and stops when there is none.

    Args:
        response: Scrapy response for a discover listing page.

    Yields:
        ``scrapy.Request`` objects for project pages and the next listing
        page.

    Raises:
        KeyError: if a project lacks the expected URL or photo keys, or
            ``self.category_id`` is missing from ``self.category_ids``.
    """
    technology_ids = ('332', '333', '334', '335', '336', '337', '52',
                      '362', '338', '51', '339', '340', '341', '342')
    design_ids = ('25', '259', '27', '260', '28', '261')

    projects = response.xpath(
        '//div[@class="grid-row flex flex-wrap"]'
        '/div[@data-project]/@data-project').extract()

    for raw in projects:
        item = DesignItem()
        project = json.loads(raw)
        url = project['urls']['web']['project']
        img_url = project['photo']['1536x864']

        if self.category_id in technology_ids:
            group = 'Technology'
        elif self.category_id in design_ids:
            group = 'Design'
        else:
            group = 'Crafts'
        # Every group is comma-separated from the category name; 'Design'
        # and 'Crafts' used to be glued straight onto it.
        tags = group + ',' + self.category_ids[self.category_id]

        item['img_url'] = img_url.strip()
        item['tags'] = tags
        item['url'] = url
        item['info'] = raw
        yield scrapy.Request(url, callback=self.parse_detail,
                             meta={'item': item})

    if projects and self.page < 200:
        self.page += 1
    else:
        # Category finished (empty page or page limit): move to the next
        # one, or stop cleanly once every category has been crawled.
        try:
            next_category = self.index_ids[self.index_id + 1]
        except (IndexError, KeyError):
            return
        self.page = 1
        self.index_id += 1
        self.category_id = next_category

    yield scrapy.Request(
        'https://www.kickstarter.com/discover/advanced?category_id='
        + self.category_id + '&sort=magic&seed=2573000&page='
        + str(self.page))
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one product page.

    Tag, image URL and title are each the first match of their XPath query.
    Entries of the external ``data`` mapping are copied onto the item last.
    """

    def first(query):
        # First extracted match of ``query``; IndexError when none.
        return response.xpath(query).extract()[0]

    item = DesignItem()
    page_url = response.url
    label = first('//span[@class="n_r_wz6"]/a/text()')
    image = first('//a[@onclick]/img/@src')
    heading = first('//*[@id="xn_c_prodv_60_nameText"]/text()')

    item['title'] = heading
    item['img_url'] = image
    item['url'] = page_url
    item['tags'] = label
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one case page.

    The title is the first bold text of the left column. The first content
    image URL is prefixed with the site host unless it starts with ``http``.
    Entries of the external ``data`` mapping are copied onto the item last.
    """
    item = DesignItem()
    page_url = response.url
    heading = response.xpath(
        '//div[@class="c_left"]/div/p[1]/b/text()').extract()[0]
    src = response.xpath(
        '//div[@class="case_content"]//img/@src').extract()[0]
    image = src if src.startswith('http') else 'http://www.yu-kangyuan.com' + src

    item['title'] = heading
    item['img_url'] = image
    item['url'] = page_url
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one detail page.

    The tag is looked up from the spider's current category index; image URL
    and title are the first matches of their XPath queries. Entries of the
    external ``data`` mapping are copied onto the item last.
    """
    item = DesignItem()
    page_url = response.url
    category_key = self.category_list[self.category_index]
    label = self.category[category_key]
    image = response.xpath(
        '//div[@class="limitimg"]/p[1]/img/@src').extract()[0]
    heading = response.xpath('//h1/text()').extract()[0]

    item['title'] = heading
    item['img_url'] = image
    item['url'] = page_url
    item['tags'] = label
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse(self, response):
    """Request the detail page of every case on a listing page.

    Each list entry yields a request to ``parse_detail`` carrying a
    ``DesignItem`` pre-filled with URL, tags and title in ``meta['item']``.
    """
    entries = response.xpath(
        '//div[@class="d_case_list"]/ul[@class="clearfix"]/li')
    for entry in entries:
        item = DesignItem()
        heading = entry.xpath('.//div[@class="h3"]/text()').extract()[0]
        subtitle = entry.xpath('.//div[@class="p"]/text()').extract()[0]
        labels = '工业设计,' + subtitle
        href = entry.xpath('./a/@href').extract()[0]
        link = 'http://www.a-fourdesign.com' + href

        item['url'] = link
        item['tags'] = labels
        item['title'] = heading
        yield scrapy.Request(link, callback=self.parse_detail,
                             meta={'item': item})
def parse_detail(self, response):
    """Build a ``DesignItem`` from an award winner page.

    Tags and prize time are parsed out of the ``<h2>`` award line with
    regular expressions, and ``' and '`` inside the tags becomes a comma.
    The remark is the table cell following the "UNIQUE PROPERTIES / PROJECT
    DESCRIPTION:" marker, with CRLF line breaks turned into spaces and the
    result capped at 480 characters. It is ``''`` when the marker is absent.
    The title is the ``<h1>`` text minus its last four characters and the
    designer is the ``<h1>`` link text. The externally defined ``data``
    mapping is copied onto the item last.

    Args:
        response: Scrapy response for the winner page.

    Yields:
        The populated ``DesignItem``.

    Raises:
        IndexError: if the award line, image, title or designer is missing
            or the award line does not match the expected patterns.
    """
    item = DesignItem()
    award_line = response.xpath(
        '/html/body/table[1]/tr[4]/td/span[4]/table/tr[2]/td[2]/h2/text()'
    ).extract()[0]
    tags = re.findall(r'Winner in (.*?)Design Category,', award_line)[0]
    tags = tags.replace(' and ', ',')
    prize_time = re.findall(r'Category, (.*?) -', award_line)[0]

    url = response.url
    img_url = response.xpath(
        '/html/body/table[1]/tr[3]/td/a/img/@src').extract()[0]
    if not img_url.startswith('http'):
        img_url = 'https://competition.adesignaward.com/' + img_url

    # Keep the extracted cells in their own variable so a failed lookup
    # leaves ``remark`` as '' instead of the whole list of cell texts.
    remark = ''
    cells = response.xpath(
        '/html/body/table[1]/tr[3]/td/table/tr[3]/td[1]/text()').extract()
    try:
        marker = cells.index(" \r\nUNIQUE PROPERTIES / PROJECT DESCRIPTION:")
        remark = ' '.join(cells[marker + 1].split('\r\n'))
    except (ValueError, IndexError):
        print("*" * 100, cells, response.url)

    title = response.xpath(
        '/html/body/table[1]/tr[2]/td/table/tr/td[1]/h1/text()'
    ).extract()[0][:-4]
    designer = response.xpath(
        '/html/body/table[1]/tr[2]/td/table/tr/td[1]/h1/a/text()'
    ).extract()[0].strip()

    remark = remark[:480]

    item['title'] = title
    item['remark'] = remark
    item['url'] = url
    item['img_url'] = img_url
    item['designer'] = designer
    item['tags'] = tags
    item['prize_time'] = prize_time
    for key, value in data.items():
        item[key] = value
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one article page.

    Title and first gallery image are the first matches of their XPath
    queries; the article tag texts are joined with commas behind a fixed
    leading tag. Entries of the external ``data`` mapping are copied onto
    the item, which is printed before being yielded.
    """
    item = DesignItem()
    page_url = response.url
    heading = response.xpath('//h1/text()').extract()[0]
    image = response.xpath(
        '//ul[@id="picInGG"]/li[1]/img/@src').extract()[0]
    tag_texts = response.xpath(
        '//div[@class="fleft article_tags"]/a/text()').extract()
    labels = '工业设计,' + ','.join(tag_texts)

    item['tags'] = labels
    item['img_url'] = image
    item['url'] = page_url
    item['title'] = heading
    for field, extra in data.items():
        item[field] = extra
    print(item)
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one case page.

    The first slider image URL is prefixed with the site host unless it
    already starts with it. The tag is looked up from the spider's current
    category index and the title is stripped of surrounding whitespace. The
    item is printed before the external ``data`` mapping is copied onto it.
    """
    site = 'http://www.yxidea.com.cn'
    item = DesignItem()
    page_url = response.url
    image = response.xpath(
        '//div[@id="focus"]/ul/li/img/@src').extract()[0]
    if not image.startswith(site):
        image = site + '/' + image
    label = self.category[self.category_list[self.category_index]]
    heading = response.xpath('//div[@class="pt10"]/text()').extract()[0]

    item['tags'] = label
    item['title'] = heading.strip()
    item['url'] = page_url
    item['img_url'] = image
    print(item)
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one case page.

    The tag is looked up from the spider's current category index. An image
    URL not starting with ``http`` loses its first character and gets the
    site host prepended. The title is the header span text without its
    first three characters. Entries of the external ``data`` mapping are
    copied onto the item last.
    """
    item = DesignItem()
    page_url = response.url
    label = self.category[self.category_list[self.category_index]]
    src = response.xpath('//div[@class="ci_p"]//img/@src').extract()[0]
    image = src if src.startswith('http') else 'http://www.ctdesign.cn' + src[1:]
    header_text = response.xpath(
        '//div[@class="ci_head clearfix"]/span/text()').extract()[0]
    heading = header_text[3:]

    item['title'] = heading
    item['img_url'] = image
    item['url'] = page_url
    item['tags'] = label
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one case page.

    The tag is the spider's current category name, or ``''`` when that name
    is '其他案例'. The table image URL is prefixed with the site host unless
    it starts with ``http``. The title comes from ``response.meta``
    (``None`` when absent). Entries of the external ``data`` mapping are
    copied onto the item last.
    """
    item = DesignItem()
    page_url = response.url
    category_name = self.category[self.category_list[self.category_index]]
    label = category_name if category_name != '其他案例' else ''
    src = response.xpath(
        '//div[@class="nr"]/table/tbody/tr[2]/td/img/@src').extract()[0]
    image = src if src.startswith('http') else 'http://www.ico-id.com' + src
    heading = response.meta.get('title')

    item['title'] = heading
    item['img_url'] = image
    item['url'] = page_url
    item['tags'] = label
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one case page.

    The tag is the spider's current category name, or ``''`` when that name
    is '其他行业'. The first picture URL is prefixed with the site host
    unless it starts with ``http``. The title is the first text of the
    ``p2`` element. Entries of the external ``data`` mapping are copied onto
    the item last.
    """
    item = DesignItem()
    page_url = response.url
    category_name = self.category[self.category_list[self.category_index]]
    label = category_name if category_name != '其他行业' else ''
    src = response.xpath('//ul[@class="pic"]/li/img/@src').extract()[0]
    image = src if src.startswith('http') else 'http://www.bfitdesign.com/' + src
    heading = response.xpath('//*[@class="p2"]/text()').extract()[0]

    item['title'] = heading
    item['img_url'] = image
    item['url'] = page_url
    item['tags'] = label
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one product page.

    Tag and title are the third and fourth link texts of the ``plc``
    element. The zoom image URL is prefixed with the site host unless it
    starts with ``http``. Entries of the external ``data`` mapping are
    copied onto the item last.
    """
    item = DesignItem()
    page_url = response.url
    crumbs = response.xpath('//div[@class="plc"]/a/text()').extract()
    label = crumbs[2]
    heading = crumbs[3]
    src = response.xpath('//a[@class="jqzoom"]/img/@src').extract()[0]
    image = src if src.startswith('http') else 'http://www.hx-design.com' + src

    item['title'] = heading
    item['img_url'] = image
    item['url'] = page_url
    item['tags'] = label
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one detail page.

    Tags come from ``response.meta['tags']``. The first detail image URL is
    prefixed with the site host unless it starts with ``http``. The title is
    the text of the last child of the ``guide`` element. Entries of the
    external ``data`` mapping are copied onto the item last.
    """
    item = DesignItem()
    page_url = response.url
    labels = response.meta['tags']
    src = response.xpath(
        '//div[@class="detail detail_p"]/p//img/@src').extract()[0]
    image = src if src.startswith('http') else 'http://www.newplan.com.cn' + src
    heading = response.xpath(
        '//div[@class="guide"]/*[last()]/text()').extract()[0]

    item['title'] = heading
    item['img_url'] = image
    item['url'] = page_url
    item['tags'] = labels
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one case page.

    The first slider image URL is prefixed with the site host unless it
    starts with ``http``. The title comes from ``response.meta['title']``
    and is stripped. Paragraph texts have all whitespace removed and are
    joined with commas to form the remark. Entries of the external ``data``
    mapping are copied onto the item last.
    """
    item = DesignItem()
    page_url = response.url
    src = response.xpath(
        '//ul[@class="bxslider"]/li[1]/img/@src').extract()[0]
    image = src if src.startswith('http') else 'http://www.siwei-id.com' + src
    heading = response.meta['title']
    paragraphs = response.xpath('//div[@class="w-text"]/p/text()').extract()
    summary = ','.join(''.join(text.split()) for text in paragraphs)

    item['title'] = heading.strip()
    item['url'] = page_url
    item['remark'] = summary
    item['img_url'] = image
    for field, extra in data.items():
        item[field] = extra
    yield item
def parse_detail(self, response):
    """Yield a ``DesignItem`` for one case page.

    The first full-width image URL is prefixed with the site host unless it
    starts with ``http``. Text nodes of the ``page_text`` element have all
    whitespace removed and are joined with single spaces to form the remark.
    The title comes from ``response.meta['title']``. Entries of the external
    ``data`` mapping are copied onto the item last.
    """
    item = DesignItem()
    page_url = response.url
    src = response.xpath(
        '//div[@class="col-md-12 col-sm-12 col-xs-12"]/img/@src'
    ).extract()[0]
    image = src if src.startswith('http') else 'http://www.kcandesign.com/' + src
    text_nodes = response.xpath(
        '//div[contains(@class,"page_text")]//text()').extract()
    summary = ' '.join(''.join(node.split()) for node in text_nodes)
    heading = response.meta['title']

    item['title'] = heading
    item['img_url'] = image
    item['url'] = page_url
    item['remark'] = summary
    for field, extra in data.items():
        item[field] = extra
    yield item