def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//span[@id="thread_subject"]/text()').extract())
        l.add_value('date', response.xpath('//div[@class="authi"]/em/text()').extract())
        r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}"
        date0 = re.compile(r1)
        date = ''.join(l.get_collected_values('date'))
        date1 = date0.findall(date)
        l.replace_value('date', date1[0])
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/br/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/p/font/font/text()').extract())
        l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/div/div/font/font/strong/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
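
# The function above is the first instance of a pattern repeated throughout
# this file: collect raw date strings on the loader, regex out the first
# timestamp, then overwrite the field with replace_value(). A minimal sketch
# of that step in isolation; extract_first_date and its fallback value are
# hypothetical and not part of the original spiders.
import re

def extract_first_date(raw, pattern=r"\d{4}-\d{1,2}-\d{1,2}\s\d{2}:\d{2}",
                       fallback='1970-01-01 00:00'):
    """Return the first timestamp in `raw` matching `pattern`, else `fallback`."""
    matches = re.findall(pattern, raw)
    return matches[0] if matches else fallback

# Usage against a loader that already holds raw date fragments:
#   l.replace_value('date', extract_first_date(''.join(l.get_collected_values('date'))))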
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//div[@id="contentwrap"]/h1/text()').extract())
    l.add_value('date', response.xpath('//div[@class="infos"]/p/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}\:\d{2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//div[@class="content"]/text()').extract())
    l.add_value('content', response.xpath('//div[@class="description"]/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/div/p/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    return l.load_item()
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value('title', response.xpath('//p[@class="title1"]/text()').extract_first())
        date = response.xpath('//pre[@class="f_title"]/text()').extract_first()
        # Take the ten characters that follow the "日期:" ("date:") label.
        loader.replace_value('date', date[date.find(u"日期:") + 3:][0:10])
        loader.add_value('content', ''.join(response.xpath('//div[@class="contents"]/descendant-or-self::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//table/tr[3]/td[2]/text()').extract())
    l.add_value('date', response.xpath('//table/tr[4]/td/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//td[@class="tdbg"]/div/font/text()').extract())
    l.add_value('content', response.xpath('//td[@class="tdbg"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//td[@class="tdbg"]/p/span/text()').extract())
    l.add_value('content', response.xpath('//td[@class="tdbg"]/p/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//div[@id="biaoti"]/text()').extract())
        l.add_value('title', response.xpath('//h1[@id="biaoti"]/text()').extract())
        l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
        l.add_value('date', response.xpath('//div[@class="center lh32 grey12a"]/text()').extract())
        l.add_value('date', response.xpath('//div[@id="left"]/h2/text()').extract())
        l.add_value('content', response.xpath('//div[@id="zw"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="zw"]/strong/p/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        url = response.url
        # Pages on the shzfzz host publish dates like "2018年01月01日";
        # convert those to the "YYYY-MM-DD HH:MM:SS" form used elsewhere.
        if url[11:17] == "shzfzz":
            date = ''.join(l.get_collected_values('date'))
            date = time.strptime(date.split()[0], u'%Y年%m月%d日')
            l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value("title", response.xpath('//h1[@id="title"]/text()').extract_first())
        loader.add_value("title", response.xpath('//span[@id="title"]/text()').extract_first())
        loader.add_value("date", response.xpath('//span[@class="time"]/text()').extract_first())
        loader.add_value("date", response.xpath('//span[@id="pubtime"]/text()').extract_first())
        date = ''.join(loader.get_collected_values("date")).strip()
        date = time.strptime(date, '%Y年%m月%d日 %H:%M:%S')
        loader.replace_value("date", time.strftime("%Y-%m-%d %H:%M:%S", date))
        loader.add_value("content", ''.join(response.xpath('//div[@id="content"]/descendant-or-self::text()').extract()))
        loader.add_value("content", ''.join(response.xpath('//div[@class="article"]/descendant-or-self::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def parse(self, response):
    filename = response.url.split('/')[-1]
    if filename == 'teacher_list.js':
        # The teacher list sits in the last eight "var x = [...];" lines.
        contents = response.text.strip().split('\n')[-8:]
        for content in contents:
            content = content.split('=')[1].strip().replace(';', '')
            teachers = eval(content)  # each line holds a JS array literal
            teacher_loader = ItemLoader(item=TeacherItem())
            for teacher in teachers:
                # One shared loader: replace_value overwrites the fields per row.
                teacher_loader.replace_value('number', teacher[0])
                teacher_loader.replace_value('name', teacher[1])
                yield teacher_loader.load_item()
    else:
        content = response.text.strip().split('\n')[-4]
        content = content.split('=')[1].strip().replace(';', '')
        class_numbers = eval(content)
        class_number_loader = ItemLoader(item=ClassesItem())
        for class_number in class_numbers:
            class_number_loader.replace_value('number', class_number[0])
            class_number_loader.replace_value('full_name', class_number[1])
            class_number_loader.replace_value('name', class_number[1])
            yield class_number_loader.load_item()
def parse(self, response):
    all_row = response.css(".empty_html tr")
    for i in range(1, len(all_row)):
        row = all_row[i]
        course_loader = ItemLoader(item=CourseItem(), selector=row)
        # https://aisap.nutc.edu.tw/public/day/course_list.aspx?sem=1081&clsno=1120170121&_p=2 -> 1120170121
        class_id = re.search(r'clsno=[\w\d]*', response.url)[0][6:]
        course_loader.replace_css('number', 'td:nth-child(2)::text')
        course_loader.replace_value('class_id', class_id)
        course_loader.replace_css('name', 'td:nth-child(4)::text, td:nth-child(4) > strong::text')
        course_loader.replace_css('time', 'td:nth-child(6)::text')
        course_loader.replace_css('location', 'td:nth-child(6)::text')
        course_loader.replace_css('compulsory', 'td:nth-child(7)::text')
        course_loader.replace_css('credit', 'td:nth-child(8)::text')
        course_loader.replace_css('popular', 'td:nth-child(9) > strong::text')
        course_loader.replace_css('teacher_name', 'td:nth-child(10)::text')
        course_loader.replace_css('popular_limit', 'td:nth-child(11)::text')
        yield course_loader.load_item()
    next_page = response.css('.page > b:last-child > a::attr(href)').get()
    if next_page:
        yield response.follow(next_page)
def get_news(self, response):
    try:
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', response.xpath('//div[@class="left"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//h1[@class="h1"]/text()').extract_first())
        loader.add_value('date', response.xpath('//div[@class="zuoze"]/text()').extract_first())
        loader.add_value('date', response.xpath('//span[@class="post-time"]/text()').extract_first())
        date = ''.join(loader.get_collected_values('date'))
        if date == '':
            return
        loader.replace_value('date', date.strip() + ":00")
        loader.add_value('content', ''.join(response.xpath('//span[@id="zoom"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//p[@class="summary"]/descendant-or-self::text()').extract()))
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        yield loader.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        yield l.load_item()
def parse_art_url(self, response):
    # ItemLoader provides the mechanism for populating a container (the item,
    # in dict form); the constructor returns a loader object such as
    # <scrapy.loader.ItemLoader object at 0x7f3f1d459f28>.
    items = ItemLoader(item=CrawlSpiderItem(), response=response)
    # add_xpath would append whatever the expression matches to a field, e.g.:
    # items.add_xpath('art_content', '//div[@class="entry"]/p/text()')
    # items.add_xpath('art_create_time', '//div[@class="entry-meta"]/p/text()')
    # Parse the page and pull out the article body.
    art_content = response.xpath('//div[@class="entry"]')
    art_content_list = art_content.xpath('string(.)').extract()[0].strip().split('\r\n')
    art_contents = ''
    # Clean up the extracted text.
    for i in art_content_list:
        art_contents += i
    art_contents = art_contents.strip().replace('\n', '').replace(' ', '').replace('\t', '')
    # Grab the publication time.
    art_create_time = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first()
    # replace_value overwrites any value a field already holds.
    items.replace_value('art_img', response.meta['art_img'])
    items.replace_value('art_url', response.meta['art_url'])
    items.replace_value('art_title', response.meta['art_title'])
    items.replace_value('art_content', art_contents)
    items.replace_value('art_create_time', art_create_time.strip()[0:-2])
    # Hand the item to the pipelines.
    return items.load_item()
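
# The comments above describe ItemLoader as a fill-then-load container. Much
# of the manual joining and stripping done in these callbacks can instead live
# in input/output processors declared on the loader. A minimal sketch, assuming
# Scrapy 2.x (older versions expose the same processors under
# scrapy.loader.processors) and a hypothetical ArticleItem with art_content
# and art_create_time fields:
from itemloaders.processors import Join, MapCompose, TakeFirst
from scrapy.loader import ItemLoader

class ArticleLoader(ItemLoader):
    default_output_processor = TakeFirst()
    # Strip whitespace from every extracted fragment, then join them.
    art_content_in = MapCompose(str.strip)
    art_content_out = Join('')
    art_create_time_in = MapCompose(str.strip)

# A callback would then reduce to:
#   loader = ArticleLoader(item=ArticleItem(), response=response)
#   loader.add_xpath('art_content', '//div[@class="entry"]//text()')
#   return loader.load_item()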
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value('title', response.xpath('//div[@id="title_tex"]/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="dc-title"]/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="xl-tit"]/text()').extract_first())
        url = response.url
        url = url[url.rfind('/') + 2:url.rfind('_')]
        loader.replace_value('date', url[0:4] + '-' + url[4:6] + '-' + url[6:8])
        loader.add_value('content', ''.join(response.xpath('//div[@id="tex"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="tex"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="dc-text02"]/descendant-or-self::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//div[@class="article_title"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="article_title1"]/text()').extract())
    # The date appears as separate runs of digits (year, month, day);
    # pull the first three and rejoin them as YYYY-MM-DD.
    r1 = r"\d{1,4}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    date1 = date1[0] + '-' + date1[1] + '-' + date1[2]
    l.replace_value('date', date1)
    l.add_value('content', response.xpath('//div[@id="MyContent"]/p/span/text()').extract())
    l.add_value('content', response.xpath('//div[@id="MyContent"]/p/font/span/text()').extract())
    l.add_value('content', response.xpath('//p[@class="MsoNormal"]/span/span/font/span/text()').extract())
    l.add_value('content', response.xpath('//p[@class="MsoNormal"]/span/span/font/text()').extract())
    l.add_value('content', response.xpath('//div[@class="article_intro"]/text()').extract())
    l.add_value('content', response.xpath('//div[@id="MyContent"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//p[@id="MsoNormal"]/span/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//div[@id="lbyright_xwxq_title"]/text()').extract())
    l.add_value('date', response.xpath('//div[@id="lbyright_xwxq_xxx"]/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//div[@id="lbyright_xwxq_txt"]/p/span/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//h2[@class="titleH2"]/text()').extract())
    l.add_value('title', response.xpath('//div[@class="Article-Left"]/h3/text()').extract())
    l.add_value('title', response.xpath('//div[@class="tit"]/h1/text()').extract())
    l.add_value('date', response.xpath('//div[@class="from"]/span/text()').extract())
    l.add_value('date', response.xpath('//div[@class="CopyFrom"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="auther-from"]/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}\:\d{2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//div[@class="content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//div[@class="layout"]/h2/text()').extract())
        l.add_value('title', response.xpath('//div[@id="wrapper"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="top"]/h1/text()').extract())
        l.add_value('date', response.xpath('//div[@class="layout"]/div/text()').extract())
        l.add_value('date', response.xpath('//div[@class="left"]/span/text()').extract())
        l.add_value('title', response.xpath('//div[@class="top"]/p/text()').extract())
        date = ''.join(l.get_collected_values('date'))
        date = time.strptime(date.split()[0], '%Y-%m-%d')
        l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
        l.add_value('content', response.xpath('//div[@class="news-con"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="news-con"]/div/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="news-con"]/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="news-con"]/div/div/div/font/text()').extract())
        l.add_value('content', response.xpath('//div[@id="news-con"]/div/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="news-con"]/div/font/font/p/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def parse(self, response):
    items = ItemLoader(item=ScrapyamazonItem())
    all_div_quotes = response.css('div.a-section.a-spacing-medium')
    for quote in all_div_quotes:
        product_name = quote.css('.a-color-base.a-text-normal::text').extract()
        product_author = quote.css('.a-color-secondary .a-size-base.a-link-normal').css('::text').extract()
        product_price = quote.css('.a-spacing-top-small .a-price:nth-child(1) span.a-offscreen').css('::text').extract()
        product_imagelink = quote.css('.s-image::attr(src)').extract()
        items.replace_value('product_name', product_name)
        items.replace_value('product_author', product_author)
        items.replace_value('product_price', product_price)
        items.replace_value('product_imagelink', product_imagelink)
        yield items.load_item()
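
# The loader above is created once outside the loop, which is why
# replace_value (not add_value) is required: it overwrites the previous
# product's fields before each yield. Creating a fresh loader per row, bound
# to the row's selector, is the more conventional shape. A sketch under the
# same assumptions (ScrapyamazonItem comes from the project's items module;
# parse_per_row is a hypothetical name):
from scrapy.loader import ItemLoader

def parse_per_row(self, response):
    for quote in response.css('div.a-section.a-spacing-medium'):
        loader = ItemLoader(item=ScrapyamazonItem(), selector=quote)
        loader.add_css('product_name', '.a-color-base.a-text-normal::text')
        loader.add_css('product_author', '.a-color-secondary .a-size-base.a-link-normal::text')
        loader.add_css('product_price', '.a-spacing-top-small .a-price:nth-child(1) span.a-offscreen::text')
        loader.add_css('product_imagelink', '.s-image::attr(src)')
        yield loader.load_item()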
def parse_price(self, response):
    try:
        lproduct = response.meta["lproduct"]
        lprice = ItemLoader(item=AsosPrice(), response=response)
        pricejson = json.loads(response.body)
        outletprice = pricejson[0]["productPrice"]["current"]["value"]
        if str(outletprice) == "0.0":
            outletprice = pricejson[0]["productPrice"]["xrp"]["value"]
        retailprice = pricejson[0]["productPrice"]["rrp"]["value"]
        if str(retailprice) == "0.0":
            retailprice = pricejson[0]["productPrice"]["previous"]["value"]
        currency = pricejson[0]["productPrice"]["currency"]
        lprice.replace_value("outlet", float(outletprice))
        lprice.replace_value("retail", float(retailprice))
        lprice.replace_value("currency", str(currency).upper())
        lproduct.replace_value("price", dict(lprice.load_item()))
        yield lproduct.load_item()
    # Exception for products that have other products inside, like suits and vests
    except AttributeError as e:
        self.logger.info(str(e))
def get_news(self, response):
    try:
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', response.xpath('//div[@class="text"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="text_c clearfix"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="text_c"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="d2_left wb_left fl"]/h1/text()').extract_first())
        loader.add_value('date', response.xpath('//p[@class="text_tools"]/text()').extract_first())
        loader.add_value('date', response.xpath('//div[@class="text_c clearfix"]/h5/text()').extract_first())
        loader.add_value('date', response.xpath('//p[@class="sou"]/text()').extract_first())
        loader.add_value('date', response.xpath('//span[@id="p_publishtime"]/text()').extract_first())
        date = ''.join(loader.get_collected_values('date'))
        date = time.strptime(date.split()[0], '%Y年%m月%d日%H:%M')
        loader.replace_value('date', time.strftime('%Y-%m-%d', date))
        loader.add_value('content', ''.join(response.xpath('//div[@class="text_c"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="text_show"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="show_text"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@id="p_content"]/descendant-or-self::text()').extract()))
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def parse(self, response):
    api = self.api_keys[self.flag]
    yt = build('youtube', 'v3', developerKey=api, cache_discovery=False)
    for city in self.cities[0]:
        logging.info('++++++++++++++++++ {} ++++++++++++++++++'.format(city))
        # If the current API key has run out of quota the search raises an
        # error; rotate to the next key and retry once.
        try:
            search = yt.search().list(q='{}'.format(city), part='snippet',
                                      maxResults=self.max_videos, type='video',
                                      publishedBefore=self.date).execute()
        except Exception:
            self.flag += 1
            yt = build('youtube', 'v3', developerKey=self.api_keys[self.flag],
                       cache_discovery=False)
            print(""" | quotaExceeded | {} """.format(self.flag))
            search = yt.search().list(q='{}'.format(city), part='snippet',
                                      maxResults=self.max_videos, type='video',
                                      publishedBefore=self.date).execute()
        for i in range(len(search['items'])):
            # Basic snippet data for each video.
            l = ItemLoader(item=YoutubeItem(), response=response)
            l.add_value('city', city)
            l.add_value('videoId', str(search['items'][i]['id']['videoId']))
            l.add_value('title', str(search['items'][i]['snippet']['title']))
            l.add_value('datetime', str(search['items'][i]['snippet']['publishedAt']))
            l.add_value('description', str(search['items'][i]['snippet']['description']))
            l.add_value('channelId', str(search['items'][i]['snippet']['channelId']))
            statistic = yt.videos().list(id='{}'.format(search['items'][i]['id']['videoId']),
                                         part='statistics').execute()
            # Default to '0' when a video has likes or dislikes hidden.
            try:
                l.add_value('like', statistic['items'][0]['statistics']['likeCount'])
            except Exception:
                l.add_value('like', '0')
            try:
                l.add_value('dislike', statistic['items'][0]['statistics']['dislikeCount'])
            except Exception:
                l.add_value('dislike', '0')
            yield l.load_item()
            try:
                # Some videos have comments disabled; skip those.
                try:
                    comments = yt.commentThreads().list(videoId=str(search['items'][i]['id']['videoId']),
                                                        part='snippet',
                                                        maxResults=self.max_comments).execute()
                    c = ItemLoader(item=Youtubecomments(), response=response)
                    c.add_value('videoId', str(search['items'][i]['id']['videoId']))
                    if len(comments['items']) != 0:
                        # Fill the loader from the first comment, then reuse it
                        # via replace_value for each remaining comment.
                        c.add_value('c_id', str(comments['items'][0]['id']))
                        c.add_value('authorDisplayName', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['authorDisplayName']))
                        c.add_value('authorChannelUrl', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['authorChannelUrl']))
                        c.add_value('textOriginal', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['textOriginal']))
                        c.add_value('publishedAt', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['publishedAt']))
                        c.add_value('updatedAt', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['updatedAt']))
                        c.add_value('likeCount', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['likeCount']))
                        c.add_value('totalReplyCount', str(comments['items'][0]['snippet']['totalReplyCount']))
                        yield c.load_item()
                        for j in range(1, len(comments['items'])):
                            c.replace_value('c_id', str(comments['items'][j]['id']))
                            c.replace_value('authorDisplayName', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['authorDisplayName']))
                            c.replace_value('authorChannelUrl', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['authorChannelUrl']))
                            c.replace_value('textOriginal', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['textOriginal']))
                            c.replace_value('publishedAt', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['publishedAt']))
                            c.replace_value('updatedAt', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['updatedAt']))
                            c.replace_value('likeCount', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['likeCount']))
                            c.replace_value('totalReplyCount', str(comments['items'][j]['snippet']['totalReplyCount']))
                            yield c.load_item()
                # API exceptions: comments disabled, or quota exhausted.
                except Exception as e:
                    if 'parameter has disabled comments' in str(e):
                        print(''' | parameter has disabled comments | ''')
                    elif 'quota' in str(e):
                        print('+-+-+-+-+-+-+-+-++-+-+-++-+-{}-+-+-+-+-+-+-+-++--+-+'.format(e))
                        print('+-+-+-+-+-+-+-+-++-+-+-++-+-{}-+-+-+-+-+-+-+-++--+-+'.format(self.flag))
                        print(""" | quotaExceeded | """)
                        self.flag += 1
                        yt = build('youtube', 'v3', developerKey=self.api_keys[self.flag],
                                   cache_discovery=False)
                        comments = yt.commentThreads().list(videoId=str(search['items'][i]['id']['videoId']),
                                                            part='snippet',
                                                            maxResults=self.max_comments).execute()
                        c = ItemLoader(item=Youtubecomments(), response=response)
                        c.add_value('videoId', str(search['items'][i]['id']['videoId']))
                        if len(comments['items']) != 0:
                            c.add_value('c_id', str(comments['items'][0]['id']))
                            c.add_value('authorDisplayName', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['authorDisplayName']))
                            c.add_value('authorChannelUrl', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['authorChannelUrl']))
                            c.add_value('textOriginal', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['textOriginal']))
                            c.add_value('publishedAt', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['publishedAt']))
                            c.add_value('updatedAt', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['updatedAt']))
                            c.add_value('likeCount', str(comments['items'][0]['snippet']['topLevelComment']['snippet']['likeCount']))
                            c.add_value('totalReplyCount', str(comments['items'][0]['snippet']['totalReplyCount']))
                            yield c.load_item()
                            for j in range(1, len(comments['items'])):
                                c.replace_value('c_id', str(comments['items'][j]['id']))
                                c.replace_value('authorDisplayName', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['authorDisplayName']))
                                c.replace_value('authorChannelUrl', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['authorChannelUrl']))
                                c.replace_value('textOriginal', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['textOriginal']))
                                c.replace_value('publishedAt', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['publishedAt']))
                                c.replace_value('updatedAt', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['updatedAt']))
                                c.replace_value('likeCount', str(comments['items'][j]['snippet']['topLevelComment']['snippet']['likeCount']))
                                c.replace_value('totalReplyCount', str(comments['items'][j]['snippet']['totalReplyCount']))
                                yield c.load_item()
            except Exception:
                pass
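
# The quota handling above repeats the "advance self.flag, rebuild the client,
# retry once" dance in several places. A small helper can centralize it; this
# is a sketch, not part of the original spider, and rotate_client is a
# hypothetical name. It assumes the same self.api_keys and self.flag.
from googleapiclient.discovery import build

def rotate_client(self):
    """Advance to the next API key and rebuild the YouTube client."""
    self.flag += 1
    if self.flag >= len(self.api_keys):
        raise RuntimeError('all YouTube API keys exhausted')
    return build('youtube', 'v3', developerKey=self.api_keys[self.flag],
                 cache_discovery=False)

# Usage: on a quota error, call `yt = self.rotate_client()` and retry the request.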
def parse_movie_info(self, response, tpp_id):
    logger.info(f"crawled movie info of {response.url}")
    item_loader = ItemLoader(item=MovieInfoItem(), response=response)
    person_item_loader = ItemLoader(item=PersonInfoItem(), response=response)
    data = response.xpath("//script[@type='application/ld+json']/text()").extract()[0]
    try:
        text = json.loads(data, strict=False)
    except json.decoder.JSONDecodeError as de:
        logger.error(f"json decode error {de} in url = {response.url}")
        return
    logger.info(f"len of movie info = {len(text)}")
    item_loader.replace_value('movieName', text.get('name', ''))
    item_loader.replace_value('dbMovieID', text.get('url', '')[9:-1])
    item_loader.replace_value('tppMovieID', tpp_id)
    base_column = {
        'director': 'directors',
        'author': 'writers',
        'actor': 'actors'
    }

    def get_name_list(parent):
        # Keep at most ten names; fall back to a single empty string.
        result = [
            child.get('name', '').split(' ')[0]
            for child in text.get(parent, [])
        ][:10]
        return result if len(result) else [""]

    def get_person_info(parent):
        logger.info("start to crawl person info")
        for detail in text.get(parent, []):
            person_item_loader.replace_value('name', detail.get('name', ''))
            person_item_loader.replace_value('url', detail.get('url', ''))
            person_item_loader.replace_value('identity', parent)
            logger.info(f"get person info with identity {parent}")
            yield person_item_loader

    for column, item_name in base_column.items():
        item_loader.replace_value(item_name, get_name_list(column))
    item_loader.replace_value('genre', text.get('genre', []))
    info = response.xpath('//*[@id="info"]').get()
    # "制片国家/地区" is the "country/region of production" label on the page.
    pattern = '<span class="pl">制片国家/地区:</span>(.*?)<br>'
    item_loader.replace_value('area', re.findall(pattern, info))
    item_loader.replace_value('duration', text.get('duration', ''))
    item_loader.replace_value('publishedDate', text.get('datePublished', ''))
    rating = text.get('aggregateRating', {})
    item_loader.replace_value('rateCount', rating.get('ratingCount', '0.0'))
    item_loader.replace_value('doubanRate', rating.get('ratingValue', '0.0'))
    logger.info("finish parse one movie info, ready to parse person")
    for column in base_column.keys():
        for item in get_person_info(column):
            yield item.load_item()
    yield item_loader.load_item()
    time.sleep(random.uniform(1, 2))
def parse_box_info(self, response):
    """Fetch the daily per-movie box-office figures."""
    time.sleep(random.uniform(0, 1))
    logger.info(f"now crawl url for boxOffice: {response.url}")
    item_loader = ItemLoader(item=BoxOfficeItem(), response=response)
    text = json.loads(response.text, strict=False)
    query_date = text.get('calendar', {}).get('selectDate', "")
    for i, movie_info in enumerate(text.get('movieList', {}).get('list', [])):
        if i == 30:
            break
        # Map item fields to the attribute names used in the JSON payload.
        field_map = {
            'seatRate': 'avgSeatView',
            'boxRate': 'boxRate',
            'showRate': 'showCountRate',
            'splitSumBoxInfo': 'sumSplitBoxDesc',
            'sumBoxInfo': 'sumBoxDesc',
            'showView': 'avgShowView'
        }
        movie_name = movie_info.get('movieInfo', {}).get('movieName')
        movie_id = movie_info.get('movieInfo', {}).get('movieId')
        for field, json_attr in field_map.items():
            item_loader.replace_value(field, movie_info.get(json_attr, ''))
        item_loader.replace_value('movieID', movie_id)
        item_loader.replace_value('movieName', movie_name)
        item_loader.replace_value('releaseInfo', movie_info.get('movieInfo', {}).get('releaseInfo', ""))
        item_loader.replace_value('showInfo', movie_info.get('showCount', 0))
        item_loader.replace_value('boxInfo', movie_info.get('boxSplitUnit', {}).get('num', ""))
        item_loader.replace_value('splitBoxInfo', movie_info.get('splitBoxSplitUnit', {}).get('num', ""))
        item_loader.replace_value('crawlDate', query_date)
        item_loader.replace_value('yearRate', get_year_rate(query_date, i + 1))
        logger.info(f"get {i + 1} boxOffice, named {movie_info.get('movieName', '')}.")
        yield item_loader.load_item()
        # Look up the movie's Douban detail-page link by its name.
        search_url = self.search_base_url + movie_name
        time.sleep(random.uniform(0, 1))
        yield scrapy.Request(url=search_url,
                             cookies=self.cookies,
                             callback=self.parse_movie_info_url,
                             dont_filter=True,
                             cb_kwargs=dict(movie_name=movie_name,
                                            movie_year=query_date,
                                            tpp_id=movie_id))
def getInfo(self, res):
    if not mch(res):
        return
    response = etree.HTML(res.text)
    loader = ItemLoader(item=booking.Booking(), response=res)
    supplier_obj_id = res.meta.get('statics.hotels.id')
    supplier_name = res.meta.get('statics.hotels.supplier')
    if supplier_obj_id:
        loader.add_value('statics_hotels_id', supplier_obj_id)
        loader.add_value('statics_hotels_supplier', supplier_name)
    pic = []
    for e in self.allXpath:
        xpath = eval('bk.' + e)  # look up the expression constant on the bk module
        # Expression names are encoded as "<field>_<label>"; the label picks
        # the extraction strategy below.
        field_name, label = '_'.join(e.split('_')[:-1]), e.split('_')[-1]
        temp_result = ''
        if label == 'non':  # plain xpath, first match
            if response.xpath(xpath):
                temp_result = response.xpath(xpath)[0].strip()
        elif label == 'ren':  # regex, first match
            if re.findall(xpath, res.text):
                temp_result = re.findall(xpath, res.text)[0].strip()
        elif label == 'rea':  # regex, all matches concatenated
            for each in re.findall(xpath, res.text):
                temp_result += each.strip()
        elif label == 'sub':  # xpath subtree flattened to text
            if response.xpath(xpath):
                temp_result = re.sub('\\n+', '\\n', response.xpath(xpath)[0].xpath('string(.)')).strip()
        elif label == 'sua':  # nested xpath: outer selects nodes, inner extracts
            selects, sub_selects = xpath.split('weego')[0], xpath.split('weego')[1]
            for each in response.xpath(selects):
                temp = each.xpath(sub_selects)
                if isinstance(temp, list):
                    temp_result += temp[0]
                elif isinstance(temp, str):
                    temp_result += temp
            temp_result = re.sub('\\n+', '\\n', temp_result).strip()
        elif label == 'pic':  # collect image urls via xpath
            selects, sub_selects = xpath.split('weego')[0], xpath.split('weego')[1]
            for each in response.xpath(selects):
                temp = each.xpath(sub_selects)
                pic.append(temp[0])
            temp_result = pic
        elif label == 'pir':  # collect image urls via regex
            for each in re.findall(xpath, res.text):
                pic.append(each)
            temp_result = pic
        elif label == 'xpl':  # list field: one entry per outer node
            selects, sub_selects = xpath.split('weego')[0], xpath.split('weego')[1]
            tl = []
            for each in response.xpath(selects):
                temp = re.sub('\\n+', ' - ', each.xpath(sub_selects).strip())
                tl.append(temp)
            loader.add_value(field_name.lower(), tl)
        if label != 'xpl':
            # Overwrite an earlier empty value, otherwise append a new one.
            if loader.get_collected_values(field_name.lower()):
                if loader.get_collected_values(field_name.lower())[0] == '':
                    loader.replace_value(field_name.lower(), temp_result)
            else:
                loader.add_value(field_name.lower(), temp_result)
    yield loader.load_item()
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value('title', response.xpath('//div[@class="text"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="text_c clearfix"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="text_c"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="d2_left wb_left fl"]/h1/text()').extract_first())
        loader.add_value('title', response.xpath('//div[@class="clearfix w1000_320 text_title"]/h1/text()').extract_first())
        url = response.url
        url = url[url.find('n1') + 3:url.rfind('/')]
        loader.replace_value('date', url[0:4] + '-' + url[5:7] + '-' + url[7:])
        loader.add_value('content', ''.join(response.xpath('//div[@class="text_c"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="text_show"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@class="show_text"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@id="p_content"]/descendant-or-self::text()').extract()))
        loader.add_value('content', ''.join(response.xpath('//div[@id="rwb_zw"]/descendant-or-self::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//h1[@class="title"]/text()').extract())
    l.add_value('title', response.xpath('//span[@class="articletitle_p22"]/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="tit_h2"]/text()').extract())
    l.add_value('title', response.xpath('//span[@class="gog_title"]/text()').extract())
    l.add_value('title', response.xpath('//td[@class="gog_title"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="info"]/text()').extract())
    l.add_value('date', response.xpath('//span[@class="p12 LightGray2"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="articletime"]/text()').extract())
    l.add_value('date', response.xpath('//body/table[5]/tr[5]/td[2]/div/text()').extract())
    l.add_value('date', response.xpath('//body/table[6]/tr/td/table/tr/td/table[3]/tr/td/text()').extract())
    r1 = r"\d{4}.\d{1,2}.\d{1,2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0])
    l.add_value('content', response.xpath('//div[@class="content"]/p/text()').extract())
    l.add_value('content', response.xpath('//td[@class="p16"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content01 p16"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/div/p/text()').extract())
    l.add_value('content', response.xpath('//span[@class="gog_content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="content"]/p/a/text()').extract())
    l.add_value('content', response.xpath('//td[@class="gog_content"]/p/text()').extract())
    l.add_value('content', response.xpath('//td[@class="gog_content"]/font/p/text()').extract())
    l.add_value('content', response.xpath('//td[@class="p16"]/div/p/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    l.add_value('website', self.website)
    return l.load_item()
def get_news(self, response):
    try:
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//div[@class="post_content_main"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="endContent"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="theTitle"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="ep-main-bg"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="ep-content-main"]/h1/text()').extract())
        l.add_value('title', response.xpath('//div[@class="endContent bg_endPage_Lblue"]/h1/text()').extract())
        l.add_value('date', response.xpath('//div[@class="post_time_source"]/text()').extract())
        l.add_value('date', response.xpath('//span[@class="info"]/text()').extract())
        l.add_value('date', response.xpath('//div[@class="text"]/text()').extract())
        l.add_value('date', response.xpath('//div[@class="ep-time-soure cDGray"]/text()').extract())
        date = ''.join(l.get_collected_values('date'))
        date = time.strptime(date.split()[0], u'%Y-%m-%d')
        l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
        l.add_value('content', response.xpath('//div[@class="post_text"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="endText"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="endText"]/text()').extract())
        l.add_value('content', response.xpath('//div[@class="end-text"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@class="end-text"]/div/text()').extract())
        l.add_value('content', response.xpath('//div[@id="content"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="endText"]/p/text()').extract())
        l.add_value('content', response.xpath('//div[@id="endText"]/div/p/text()').extract())
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        return l.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def get_news(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    l.add_value('title', response.xpath('//div[@class="crumbs"]/h1/text()').extract())
    l.add_value('title', response.xpath('//div[@class="headBox"]/h1/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="artTitle"]/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="artiTitle"]/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="artiTitle clearB"]/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="c_title"]/text()').extract())
    l.add_value('title', response.xpath('//h1[@class="c_title"]/span/text()').extract())
    l.add_value('title', response.xpath('//td[@class="a4"]/text()').extract())
    l.add_value('title', response.xpath('//div[@class="Left"]/div/h1/text()').extract())
    l.add_value('title', response.xpath('//div[@class="wrapl"]/h1/text()').extract())
    l.add_value('title', response.xpath('//div[@class="big_img2"]/h1/text()').extract())
    l.add_value('title', response.xpath('//div[@id="contit"]/h1/text()').extract())
    l.add_value('title', response.xpath('//div[@class="headBox"]/div/h1/text()').extract())
    l.add_value('date', response.xpath('//div[@class="pub_date"]/span/text()').extract())
    l.add_value('date', response.xpath('//div[@class="artiInfo pub_date fl"]/span/text()').extract())
    l.add_value('date', response.xpath('//td[@class="a5"]/span/text()').extract())
    l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
    l.add_value('date', response.xpath('//div[@class="wrapl"]/h3/text()').extract())
    l.add_value('date', response.xpath('//div[@class="more"]/text()').extract())
    r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
    date0 = re.compile(r1)
    date = ''.join(l.get_collected_values('date'))
    date1 = date0.findall(date)
    l.replace_value('date', date1[0] + " " + "00:00:00")
    l.add_value('content', response.xpath('//div[@id="articleBody"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@id="articleBody"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//div[@id="artiContent"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@id="artiContent"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//div[@id="c_body"]/p/text()').extract())
    l.add_value('content', response.xpath('//td[@class="a1"]/p/text()').extract())
    l.add_value('content', response.xpath('//td[@class="a1"]/p/font/text()').extract())
    l.add_value('content', response.xpath('//div[@id="artbody"]/p/span/text()').extract())
    l.add_value('content', response.xpath('//div[@id="artbody"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="artCon"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="artCon"]/text()').extract())
    l.add_value('content', response.xpath('//div[@id="box3"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="c_content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@class="c_content"]/span/p/text()').extract())
    l.add_value('content', response.xpath('//div[@id="cc"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@id="content"]/p/text()').extract())
    l.add_value('content', response.xpath('//div[@id="bigpic"]/p/text()').extract())
    l.add_value('url', response.url)
    l.add_value('collection_name', self.name)
    return l.load_item()
def get_news(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value("title", response.xpath('//h1[@id="title"]/text()').extract_first())
        loader.add_value("title", response.xpath('//span[@id="title"]/text()').extract_first())
        loader.add_value("title", response.xpath('//div[@id="Title"]/text()').extract_first())
        loader.add_value("date", response.xpath('//span[@class="time"]/text()').extract_first())
        loader.add_value("date", response.xpath('//span[@id="pubtime"]/text()').extract_first())
        try:
            date = ''.join(loader.get_collected_values("date")).strip()
            if date != '':
                date = time.strptime(date, u'%Y年%m月%d日 %H:%M:%S')
                loader.replace_value("date", time.strftime("%Y-%m-%d %H:%M:%S", date))
            else:
                date = ''.join(response.xpath('//div[@class="info"]/text()').extract()).strip()
                if date != '':
                    loader.replace_value("date", date.strip()[:16] + ':00')
        except ValueError:
            # Fall back to the date embedded in the URL path before "/_c".
            end = response.url.find('/_c')
            loader.replace_value("date", response.url[end - 10:end].replace('/', '-') + " 00:00:00")
        loader.add_value("content", ''.join(response.xpath('//div[@id="content"]/descendant-or-self::text()').extract()))
        loader.add_value("content", ''.join(response.xpath('//div[@class="article"]/descendant-or-self::text()').extract()))
        loader.add_value("content", ''.join(response.xpath('//td[@class="p1"]//p/descendant::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    yield loader.load_item()
def parse_product(self, response):
    try:
        sessionid = response.meta["sessionid"]
        sessionname = response.meta["sessionname"]
        gender = response.meta["gender"]
        productlink = response.meta["productlink"]
        lproduct = ItemLoader(item=AsosProduct(), response=response)
        name = response.css("div.product-hero>h1::text").extract_first()
        code = response.css("div.product-code>span::text").extract_first()
        kind = response.css("div.product-description span strong:nth-child(1)::text").extract_first()
        brand = response.css("div.brand-description span strong::text").extract_first()
        details = response.css("div.product-description span ul>li::text").extract()
        # Fixing discrepancies in the ASOS store webpage structure
        if (kind is None) or (brand is None):
            temp = getByPos(details)
            details = temp.details
            if (kind is None) or (kind == " "):
                kind = temp.kind
            if brand is None:
                brand = temp.brand
        # Further fixing for the Parisian Petite brand
        if re.search(r"\s+$", str(kind)) is not None:
            kind = str(kind) + str(response.css("div.product-description span a>strong::text").extract_first())
        care = response.css("div.care-info span::text").extract_first()
        # Fixing discrepancies in the care section
        if care is None:
            care = response.css("div.care-info span>*::text").extract_first()
        lstimages = response.css("div.product-gallery li.image-thumbnail img::attr(src)").extract()
        images = [(img.split("?", 1)[0] + "?wid=" + str(self.imgwidth)) for img in lstimages]
        about = response.css("div.about-me span::text").extract_first()
        # Fixing discrepancies in the about section
        if about is None:
            about = response.css("div.about-me span>*::text").extract_first()
        # TODO: keep only fully populated products in order to deploy
        if name is None:
            raise AttributeError("No name")
        elif code is None:
            raise AttributeError("No code")
        elif productlink is None:
            raise AttributeError("No product link")
        elif kind is None:
            raise AttributeError("No kind")
        elif brand is None:
            raise AttributeError("No brand")
        elif details is None:
            raise AttributeError("No details")
        elif care is None:
            raise AttributeError("No care")
        elif about is None:
            raise AttributeError("No about")
        elif images is None:
            raise AttributeError("No images")
        lproduct.replace_value("name", stripSpaces(str(name)).title())
        lproduct.replace_value("code", stripSpaces(str(code)))
        lproduct.replace_value("link", stripSpaces(str(productlink)))
        lproduct.replace_value("kind", stripSpaces(str(kind)).title())
        lproduct.replace_value("brand", stripSpaces(str(brand)).title())
        lproduct.replace_value("details", [str(x).strip() for x in details])
        lproduct.replace_value("care", stripSpaces(str(care)))
        lproduct.replace_value("about", stripSpaces(str(about)))
        lproduct.replace_value("images", images)
        lproduct.replace_value("storename", self.storename.title())
        lproduct.replace_value("sessionid", str(sessionid))
        lproduct.replace_value("sessionname", stripSpaces(str(sessionname)).title())
        lproduct.replace_value("gender", stripSpaces(str(gender)).title())
        iid = re.findall("iid=[0-9]+", productlink)[0][4:]
        pricelink = ("http://www.asos.com/api/product/catalogue/v2/stockprice?"
                     "productIds=" + iid + "&store=COM&currency=GBP")
        requestprice = response.follow(pricelink, callback=self.parse_price)
        requestprice.meta["lproduct"] = lproduct
        yield requestprice
    # Exception for products that have other products inside, like suits and vests
    except AttributeError as e:
        self.logger.info(str(e))
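
# parse_product hands its half-filled loader to parse_price through
# request.meta (parse_price above reads response.meta["lproduct"]). In current
# Scrapy the same hand-off is usually written with cb_kwargs, which keeps the
# loader out of the meta dict. A sketch under the same names; request_price is
# a hypothetical helper, and parse_price would then accept lproduct as a
# keyword argument instead of reading response.meta.
def request_price(self, response, lproduct, iid):
    """Chain the price request, passing the loader via cb_kwargs."""
    pricelink = ("http://www.asos.com/api/product/catalogue/v2/stockprice?"
                 "productIds=" + iid + "&store=COM&currency=GBP")
    return response.follow(pricelink, callback=self.parse_price,
                           cb_kwargs={"lproduct": lproduct})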