def test_cached_and_stale(self):
    sampledata = [
        (200, {'Date': self.today, 'Expires': self.yesterday}),
        (200, {'Date': self.today, 'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
        (200, {'Expires': self.yesterday}),
        (200, {'Expires': self.yesterday, 'ETag': 'foo'}),
        (200, {'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
        (200, {'Expires': self.tomorrow, 'Age': '86405'}),
        (200, {'Cache-Control': 'max-age=86400', 'Age': '86405'}),
        # no-cache forces expiration, and also revalidation if validators
        # exist
        (200, {'Cache-Control': 'no-cache'}),
        (200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}),
        (200, {'Cache-Control': 'no-cache', 'Last-Modified': self.yesterday}),
        (200, {'Cache-Control': 'no-cache,must-revalidate', 'Last-Modified': self.yesterday}),
        (200, {'Cache-Control': 'must-revalidate', 'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
        (200, {'Cache-Control': 'max-age=86400,must-revalidate', 'Age': '86405'}),
    ]
    with self._middleware() as mw:
        for idx, (status, headers) in enumerate(sampledata):
            req0 = Request('http://example-%d.com' % idx)
            res0a = Response(req0.url, status=status, headers=headers)

            # cache expired response
            res1 = self._process_requestresponse(mw, req0, res0a)
            self.assertEqualResponse(res1, res0a)
            assert 'cached' not in res1.flags

            # Same request, but as the cached response is stale a new
            # response must be returned
            res0b = res0a.replace(body=b'bar')
            res2 = self._process_requestresponse(mw, req0, res0b)
            self.assertEqualResponse(res2, res0b)
            assert 'cached' not in res2.flags
            cc = headers.get('Cache-Control', '')

            # Previous response expired too, so a subsequent request to the
            # same resource must revalidate and succeed on 304 if validators
            # are present
            if 'ETag' in headers or 'Last-Modified' in headers:
                res0c = res0b.replace(status=304)
                res3 = self._process_requestresponse(mw, req0, res0c)
                self.assertEqualResponse(res3, res0b)
                assert 'cached' in res3.flags

                # get the cached response on server errors unless
                # must-revalidate is in the cached response
                res0d = res0b.replace(status=500)
                res4 = self._process_requestresponse(mw, req0, res0d)
                if 'must-revalidate' in cc:
                    assert 'cached' not in res4.flags
                    self.assertEqualResponse(res4, res0d)
                else:
                    assert 'cached' in res4.flags
                    self.assertEqualResponse(res4, res0b)

            # Requests with max-stale can fetch expired cached responses
            # unless the cached response has must-revalidate
            req1 = req0.replace(headers={'Cache-Control': 'max-stale'})
            res5 = self._process_requestresponse(mw, req1, res0b)
            self.assertEqualResponse(res5, res0b)
            if 'no-cache' in cc or 'must-revalidate' in cc:
                assert 'cached' not in res5.flags
            else:
                assert 'cached' in res5.flags
def get_media_requests(self, item, info):
    request = Request(item['img_url'])
    request.meta['item'] = item
    yield request
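# Hedged companion sketch (not in the source): a custom item_completed()
# counterpart that a media pipeline like the one above typically pairs with
# its get_media_requests() hook. The 'img_path' field name is an assumption,
# not taken from the original item definition.
def item_completed(self, results, item, info):
    # results is a list of (success, file_info_or_failure) tuples,
    # one per request yielded by get_media_requests()
    paths = [x['path'] for ok, x in results if ok]
    if paths:
        item['img_path'] = paths[0]  # hypothetical field; adjust to the real item
    return item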
def wr(self, response):
    selector = scrapy.Selector(response)
    deeps = selector.xpath('//li[@class="active current activeandsub"]/ul/li/a')
    if deeps:
        for deep in deeps:
            url = self.preurl + deep.xpath('@href').extract()[0]
            yield Request(url, callback=self.wr)
        return
    if response.url == 'http://www.unido.org/internship/internships-in-field-offices.html':
        return
    item = self._inititem()
    item['englishname'] = 'UNIDO'  # English abbreviation of the organization
    item['chinesename'] = '联合国工业发展组'  # Chinese abbreviation of the organization
    item['incontinent'] = '欧洲'  # continent of the organization
    item['incountry'] = '奥地利'  # country of the organization
    item['type'] = '经济'  # organization category
    item['url'] = 'www.unido.org'  # organization homepage
    item['alljoburl'] = 'http://www.unido.org/employment.html'
    url = response.url
    item['joburl'] = url
    num = 0
    main_content = selector.xpath('//div[@class="csc-default"]/p')
    itemname = ''
    text = ''
    tips = ''
    total = len(main_content)
    while num < total:
        target = ''
        content = main_content[num]
        try:
            target = content.xpath('b/text()').extract()[0]
            text = content.xpath('text()').extract()[0]
        except Exception:
            pass
        num += 1
        if target == 'Duration:' or target == 'Duration: ':
            itemname = 'ExpectedDurationofAssignment'
        elif target == 'Duty Station:' or target == 'Duty Station: ':
            itemname = 'Location'
        elif target == 'Tasks:' or target == 'Tasks: ':
            itemname = 'responsibilities'
            num2 = num
            for content in main_content[num2:]:
                target = content.xpath('b/text()')
                if not target:
                    num += 1
                    text += content.xpath('text()').extract()[0]
                else:
                    break
        elif target == 'Qualification requirements:' or target == 'Qualification requirements: ':
            num2 = num
            for content in main_content[num2:]:
                test = content.xpath('text()').extract()[0]
                if 'Education' in test:
                    item['education'] = test
                elif 'Experience' in test:
                    item['experience'] = test
                elif 'Language' in test:
                    item['language'] = test
                    break
        else:
            try:
                cont = content.xpath('b')
                tips += cont.xpath('string(.)').extract()[0]
            except Exception:
                pass
            continue
        item[itemname] = StrUtil.delWhiteSpace(text)
        logger.debug("UNIDO-->job-->%s" % url + '-->' + itemname + '-->' + item[itemname])
    Work = selector.xpath('//div[@id="header-content"]/div/h1/text()').extract()[0]
    item['work'] = StrUtil.delWhiteSpace(Work)
    logger.debug("UNIDO-->job-->%s" % url + '-->Work-->' + item['work'])
    itemname = 'addition'
    item[itemname] = StrUtil.delWhiteSpace(tips)
    logger.debug("UNIDO-->job-->%s" % url + '-->' + itemname + '-->' + item[itemname])
    self.insert(item, spiderName=self.name)
def parse(self, response):
    book_urls = response.xpath(
        '//*[@id="content"]/div/div[2]/div/ul/li/span[3]/a/@href').extract()
    for book_url in book_urls:
        if '/230.htm' not in response.url:
            yield Request(book_url, callback=self.parse_read)
def get_media_requests(self, item, info):
    for image_url in item['image_urls']:
        # item['origin_page_url'] = response.url
        # media requests are handled by the pipeline itself; the original
        # callback='parse' string would raise TypeError (callback must be callable)
        yield Request(image_url)
def parse(self, response):
    # re-requests the same URL; note the default dupe filter will drop it
    # unless dont_filter=True is set
    request = Request(response.url, callback=self.parse_table)
    yield request
def start_requests(self):
    for url in self.start_urls:
        yield Request(url=url, callback=self.parse)
def start_requests(self):
    return [
        Request("https://www.zhihu.com/login",
                meta={'cookiejar': 1},
                callback=self.post_login)
    ]
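# Hedged sketch (assumption, not the source's post_login): a typical
# cookiejar-aware login callback for the request above, built with
# FormRequest.from_response. The form field names, credentials, and the
# after_login callback are illustrative placeholders.
from scrapy.http import FormRequest

def post_login(self, response):
    return FormRequest.from_response(
        response,
        formdata={'email': 'user@example.com', 'password': 'secret'},  # placeholders
        meta={'cookiejar': response.meta['cookiejar']},  # keep using the same jar
        callback=self.after_login,
    )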
def start_requests(self):
    for url in self.sitemap_urls:
        yield Request(url, self._parse_sitemap)
def parse_page_plan_0(self, response):
    # jump to an arbitrarily high page number so the pager on the resulting
    # page reveals the real last page (extracted in parse_page_plan_0_1)
    max_page_url_temp = re.sub(r'-\d+\.html', '-1000.html', response.url)
    yield Request(url=max_page_url_temp, callback=self.parse_page_plan_0_1)
def parse_page_plan_0_1(self, response):
    max_page_rule = '//span[@class="pager-pageindex"]/a/@href'
    max_page_url = self.prefix_url + response.xpath(max_page_rule).extract()[-1]
    yield Request(url=max_page_url, callback=self.parse_detail)
def parse(self, response):
    cat_urls = [
        'http://mall.autohome.com.cn/list/2-500100-0-0-0-0-0-0-0-1.html?factoryId=0&minPrice=-1&maxPrice=-1&stageTag=0&importTag=0&double11Tag=0&prefix=&dataSource=',
        'http://mall.autohome.com.cn/list/1-110100-0-0-0-0-0-0-0-1.html?factoryId=0&minPrice=-1&maxPrice=-1&stageTag=0&importTag=0&double11Tag=0&prefix=&dataSource=',
        'http://mall.autohome.com.cn/list/3-110100-0-0-0-0-0-0-0-1.html?factoryId=0&minPrice=-1&maxPrice=-1&stageTag=0&importTag=0&double11Tag=0&prefix=&dataSource=',
    ]
    # for cat_url in cat_urls:
    yield Request(url=cat_urls[0], callback=self.parse_address)
def parse(self, response):
    page_urls = response.css(".nlist>a::attr(href)").extract()
    for page_url in page_urls[0:-1]:
        yield Request(url=parse.urljoin(response.url, page_url),
                      callback=self.messageParse)
def start_requests(self):
    for channel in self.post_params:
        url = channel['url']
        # if self.doredis and self.expire_redis.exists(url):
        #     continue
        yield Request(url=url, method='get',
                      meta={"ba_type": channel["ba_type"], "page_parse": True},
                      callback=self.parse_link)
def parse_about_page(self, response):
    """Parse about page."""
    loader = response.meta['loader']
    name = response.xpath('//title/text()').extract_first()
    if not name:
        print(response.url)
    loader.add_value('name', name)
    # add user_id
    try:
        query_include_id = parse_qs(
            urlparse(
                response.xpath(
                    '(//img[@alt="' + name + '"])[1]/ancestor::a/@href'
                ).extract_first()).query)
    except Exception as e:
        print(e)
        raise CloseSpider('Cannot find id in: ' + response.url)
    try:
        user_id = query_include_id['id'][0]
    except Exception:
        user_id = query_include_id['profile_id'][0]
    loader.add_value('user_id', user_id)
    # get work
    loader.add_value(
        'works',
        self.extract_work(
            response.xpath('//div[@id="work"]//div[contains(@id, "u_0")]'),
            response.meta['driver']))
    # education
    loader.add_value(
        'colleges',
        self.extract_edu(
            response.xpath('//div[@id="education"]//div[contains(@id, "u_0")]'),
            response.meta['driver']))
    response.meta['driver'].quit()
    # skills
    skills_str = response.xpath(
        'string((//div[@id="skills"]/div/div)[2])').extract_first()
    if skills_str:
        skills = skills_str.split(', ')
        skills = skills[:-1] + skills[-1].split(' and ')
        loader.add_value('professional_skills', skills)
    # current_city
    living_div_select = response.xpath('(//div[@id="living"]/div/div)[2]')
    loader.add_value(
        'current_city', {
            'city': living_div_select.xpath(
                './/div[@title="Current City"]//a/text()').extract_first(),
            'page_url': living_div_select.xpath(
                './/div[@title="Current City"]//a/@href').extract_first()
        })
    # hometown
    loader.add_value(
        'hometown', {
            'hometown': living_div_select.xpath(
                './/div[@title="Hometown"]//a/text()').extract_first(),
            'page_url': living_div_select.xpath(
                './/div[@title="Hometown"]//a/@href').extract_first()
        })
    # places_lived
    places_lived_selectors = response.xpath(
        '(//div[@id="living"]/div/div)[2]/div[@title and not(@id)]')
    for places_lived_selector in places_lived_selectors:
        loader.add_value(
            'places_lived', {
                'destination': places_lived_selector.xpath(
                    '(.//a)[2]/text()').extract_first(),
                'page_url': places_lived_selector.xpath(
                    '(.//a)[2]/@href').extract_first(),
                'post_url': places_lived_selector.xpath(
                    '(.//a)[1]/@href').extract_first(),
                'description': places_lived_selector.xpath(
                    '(.//a)[1]/text()').extract_first()
            })
    # contact info
    loader.add_value(
        'websites',
        response.xpath('//div[@id="contact-info"]//div'
                       '[@title="Websites"]//a/text()').extract())
    loader.add_value(
        'mobile_numbers',
        response.xpath(
            '(//div[@id="contact-info"]//div[@title="Mobile"]//td)'
            '[2]//span[@dir]/text()').extract())
    # basic info
    loader.add_value(
        'birth_date',
        response.xpath(
            '(//div[@id="basic-info"]//div[@title="Birthday"]//td)'
            '[2]/div/text()').extract_first())
    loader.add_value(
        'gender',
        response.xpath(
            '(//div[@id="basic-info"]//div[@title="Gender"]//td)'
            '[2]/div/text()').extract_first())
    loader.add_value(
        'interested_in',
        response.xpath(
            '(//div[@id="basic-info"]//div[@title="Interested In"]//td)'
            '[2]/div/text()').extract_first())
    loader.add_value(
        'languages',
        filter(
            None,
            re.split(
                ', | and| language',
                response.xpath(
                    'string((//div[@id="basic-info"]//div'
                    '[@title="Languages"]//td)[2])').extract_first())))
    loader.add_value(
        'religion', {
            'religious_type': response.xpath(
                'string((//div[@id="basic-info"]//div'
                '[@title="Religious Views"]//td)[2])').extract_first(),
            'page_url': response.xpath(
                '(//div[@id="basic-info"]//div[@title="Religious Views"]'
                '//td)[2]//a/@href').extract_first()
        })
    loader.add_value(
        'political', {
            'political_type': response.xpath(
                'string((//div[@id="basic-info"]//div'
                '[@title="Political Views"]//td)[2])').extract_first(),
            'page_url': response.xpath(
                '(//div[@id="basic-info"]//div[@title="Political Views"]'
                '//td)[2]//a/@href').extract_first()
        })
    # nickname
    for nickname_selector in response.xpath(
            '//div[@id="nicknames"]//div[@title]'):
        loader.add_value(
            'other_names', {
                'type': nickname_selector.xpath(
                    'string((.//td)[1])').extract_first(),
                'name': nickname_selector.xpath(
                    'string((.//td)[2])').extract_first()
            })
    # relationship
    loader.add_value(
        'relationship', {
            'text': response.xpath('string((//div[@id="relationship"]'
                                   '/div/div)[2])').extract_first(),
            'link': response.xpath('(//div[@id="relationship"]/div/div)'
                                   '[2]//a/@href').extract_first()
        })
    # family
    for member_selector in response.xpath(
            '(//div[@id="family"]/div/div)[2]/div/div[1]'):
        loader.add_value(
            'family_members', {
                'name': member_selector.xpath(
                    'string((.//h3)[1])').extract_first(),
                'relationship': member_selector.xpath(
                    '(.//h3)[2]/text()').extract_first(),
                'link': member_selector.xpath('.//a/@href').extract_first()
            })
    # about
    loader.add_value(
        'about',
        response.xpath(
            'string((//div[@id="bio"]/div/div)[2])').extract_first())
    # life event
    for event_selector in response.xpath('//div[@id="year-overviews"]//a'):
        loader.add_value(
            'life_events', {
                'headline': event_selector.xpath('./text()').extract_first(),
                'link': event_selector.xpath('./@href').extract_first()
            })
    # parse quotes
    loader.add_value(
        'fav_quotes',
        response.xpath(
            'string((//div[@id="quote"]/div/div)[2]/div)').extract_first())
    # parse friends page
    if response.meta['search_friends_depth']:
        yield Request(response.meta['base_url'] + 'v=friends',
                      callback=self.parse_friends_page,
                      meta={
                          'loader': loader,
                          'base_url': response.meta['base_url'],
                          'search_friends_depth':
                              response.meta['search_friends_depth'] - 1,
                          'friend_with': response.meta['id']
                      })
    else:
        loader.add_value('friend_with', response.meta.get('friend_with', None))
    # parse timeline
    yield Request(response.meta['base_url'] + 'v=timeline',
                  callback=self.parse_timeline,
                  meta={
                      'id': response.meta['id'],
                      'user_id': user_id,
                      'base_url': response.meta['base_url']
                  })
    # parse likes
    yield Request(response.meta['base_url'] + 'v=likes',
                  callback=self.parse_likes,
                  meta={'id': response.meta['id']})
    loader.add_value('timestamp', datetime.datetime.now())
    # print(loader.load_item())
    yield loader.load_item()
def start_requests(self):
    for i in range(0, self.distinct_urls):
        for j in range(0, self.dupe_factor):
            url = self.mockserver.url("/echo?headers=1&body=test%d" % i)
            yield Request(url, dont_filter=self.dont_filter)
def parse(self, response):
    # Finding Product Status From Scraping.
    product = None  # guard against NameError if the lookup below fails
    try:
        productName = SpiderHelper.get_product_name(response.request.url)
        product = response.xpath(".//*[@class='btn btn-primary btn-wide']")
        if product:
            log.warning(f"{productName} is Currently: Available.")
        else:
            log.info(f"{productName} is Out of Stock.")
    except NoSuchElementException as e:
        log.error(e)

    # Start selenium driver
    if product:
        log.info(f"Found {productName} to add to cart.")
        # Booting WebDriver.
        driver = SpiderHelper.get_driver()
        wait = WebDriverWait(driver, 15)
        # Starting Webpage.
        driver.get(response.url)
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, ".//*[@class='btn btn-primary btn-wide']")))
        # Click Add to Cart.
        log.info("Clicking Add To Cart Button.")
        driver.find_element_by_xpath(
            ".//*[@class='btn btn-primary btn-wide']").click()
        time.sleep(2)
        if len(driver.find_elements_by_xpath(
                "//button[contains(text(), 'not interested')]")) != 0:
            # find_elements returns a list, so click the first match
            driver.find_elements_by_xpath(
                "//button[contains(text(), 'not interested')]")[0].click()
        # Click Cart.
        log.info("Going to Shopping Cart.")
        driver.get("https://secure.newegg.com/shop/cart")
        time.sleep(5)
        # Click Check-out Button.
        log.info("Clicking Checkout Button.")
        if len(driver.find_elements_by_xpath(
                "//button[contains(text(), 'not interested')]")) != 0:
            driver.find_elements_by_xpath(
                "//button[contains(text(), 'not interested')]")[0].click()
        driver.find_element_by_xpath(
            ".//*[@class='btn btn-primary btn-wide']").click()
        # Giving Website Time To Login.
        log.info("Giving Website Time To Login..")
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//*[@id='signInSubmit']")))
        # ARE YOU READY TO BUY?
        log.info(f"Buying {productName}.")
        # Click past Shipping
        driver.find_elements_by_xpath(
            ".//*[@class='btn btn-primary checkout-step-action-done layout-quarter']"
        )[0].click()
        # Click past delivery
        driver.find_elements_by_xpath(
            ".//*[@class='btn btn-primary checkout-step-action-done layout-quarter']"
        )[1].click()
        log.info("Bot has Completed Checkout.")
        time.sleep(180000)
    else:
        log.info("Retrying Bot In 15 Seconds.")
        time.sleep(15)
        yield Request(url=response.url, callback=self.parse, dont_filter=True)
def start_requests(self):
    body = b"a" * self.full_response_length
    url = self.mockserver.url("/alpayload")
    yield Request(url, method="POST", body=body, errback=self.errback)
def parse_brands(self, response):
    for link in self.brands.extract_links(response):
        yield Request(link.url, self.parse_category,
                      meta={'link_text': link.text})
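# Hedged sketch (assumption): how the `self.brands` extractor used in
# parse_brands above might be declared. The restrict_css selector is an
# illustrative guess, not the original spider's value.
from scrapy.linkextractors import LinkExtractor

brands = LinkExtractor(restrict_css='div.brand-list')  # hypothetical selector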
def parse(self, response):
    self.urls_visited.append(response.url)
    self.times.append(time.time())
    for link in self.link_extractor.extract_links(response):
        yield Request(link.url, callback=self.parse)
def parse(self, response):
    informationItems = InformationItem()
    selector = Selector(response)
    ID = re.findall(r'weibo\.cn/(\d+)', response.url)[0]
    text1 = ";".join(
        selector.xpath('body/div[@class="c"]/text()').extract())  # all text() inside the tags
    nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1)  # nickname
    gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1)  # gender
    place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1)  # region (province and city)
    signature = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1)  # personal signature
    birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1)  # birthday
    sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1)  # sexual orientation
    marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1)  # marital status
    url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1)  # homepage link
    informationItems["_id"] = ID
    if nickname:
        informationItems["NickName"] = nickname[0]
    if gender:
        informationItems["Gender"] = gender[0]
    if place:
        place = place[0].split(" ")
        informationItems["Province"] = place[0]
        if len(place) > 1:
            informationItems["City"] = place[1]
    if signature:
        informationItems["Signature"] = signature[0]
    if birthday:
        try:
            birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d")
            informationItems["Birthday"] = birthday - datetime.timedelta(hours=8)
        except Exception:
            pass
    if sexorientation and gender:  # guard: gender may be missing
        if sexorientation[0] == gender[0]:
            informationItems["Sex_Orientation"] = "gay"
        else:
            informationItems["Sex_Orientation"] = "Heterosexual"
    if marriage:
        informationItems["Marriage"] = marriage[0]
    if url:
        informationItems["URL"] = url[0]
    urlothers = "https://weibo.cn/attgroup/opening?uid=%s" % ID
    r = requests.get(urlothers, cookies=response.request.cookies)
    if r.status_code == 200:
        selector = etree.HTML(r.content)
        texts = ";".join(
            selector.xpath('//body//div[@class="tip2"]/a//text()'))
        if texts:
            num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', texts)  # number of posts
            num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', texts)  # number followed
            num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', texts)  # number of followers
            if num_tweets:
                informationItems["Num_Tweets"] = int(num_tweets[0])
            if num_follows:
                informationItems["Num_Follows"] = int(num_follows[0])
            if num_fans:
                informationItems["Num_Fans"] = int(num_fans[0])
    yield informationItems
    urlFollows = "https://weibo.cn/%s/follow" % ID
    # crawl the first page of followed accounts and add them to the queue
    idFollows = self.getNextID(urlFollows, response.request.cookies)
    for ID in idFollows:
        url = "https://weibo.cn/%s/profile?filter=1&page=1" % ID
        yield Request(url=url, callback=self.parse)
def start_requests(self):
    self.t1 = time.time()
    url = self.mockserver.url("/delay?n=%s&b=%s" % (self.n, self.b))
    yield Request(url, callback=self.parse, errback=self.errback)
def parse_read(self, response):
    read_url_slice = response.xpath('//html/body/div[2]/div[4]/div[2]')
    read_url = read_url_slice.xpath('a/@href').extract()[0]
    yield Request(read_url, callback=self.parse_chapter)
def create_request(self, item, retries=3):
    yield Request(url=item.link, callback=self.parse,
                  meta={'item': item, 'retries': retries})
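# Hedged sketch (not in the source): one conventional way to consume the
# 'retries' counter stashed in meta by create_request() above -- re-enqueue
# the same request with a decremented counter when processing fails.
# `retry_request` is a hypothetical helper name.
def retry_request(self, response):
    retries = response.meta.get('retries', 0)
    if retries > 0:
        meta = dict(response.meta, retries=retries - 1)
        # dont_filter=True bypasses the dupe filter so the retry is scheduled
        yield response.request.replace(meta=meta, dont_filter=True)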
def get_media_requests(self, item, info):
    yield Request(item['image_url'])
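# Hedged sketch (assumption): the settings under which a get_media_requests()
# hook like the one above runs -- Scrapy's stock ImagesPipeline. The priority
# and store path are illustrative.
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/tmp/images'  # hypothetical download directory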
def start_requests(self):
    return [Request('https://m.facebook.com/login/', callback=self.login)]
def parse(self, response: Response) -> Generator[Request, None, None]:
    sel = Selector(response, type="html")
    for url in sel.xpath('//div[@id="defaultResultPage"]//a/@href').getall():
        yield Request(url=url, callback=self.product_page_cb)
def parse_likes(self, response):
    # print(response.meta.get('type'))
    like_types_selectors = response.xpath('//div[@id="root"]/div/div')
    for like_type_selector in like_types_selectors:
        if response.meta.get('type', False):
            type = response.meta['type']
            if not like_type_selector.xpath(
                    './/*[(self::h3 or self::h4)]/text()').extract_first():
                # filter the redundant div
                continue
        else:
            # first-time crawl of the likes page
            type = like_type_selector.xpath(
                './/*[(self::h3 or self::h4)]/text()').extract_first()
        if type:
            type = type.strip()
        for page_selector in like_type_selector.xpath('.//img'):
            loader = ItemLoader(item=Page(), response=response)
            loader.add_value('id', response.meta['id'])
            loader.add_value('type', type)
            loader.add_value(
                'url',
                page_selector.xpath(
                    '((./following-sibling::div)'
                    '[1]//a)[1]/@href').extract_first())
            loader.add_value(
                'name',
                page_selector.xpath(
                    '((./following-sibling::div)'
                    '[1]//a)[1]//span/text()').extract_first())
            # print(page_selector.xpath(
            #     '((./following-sibling::div)'
            #     '[1]//a)[1]//span/text()').extract_first())
            # extract page id
            try:
                loader.add_value(
                    'facebook_page_id',
                    parse_qs(
                        urlparse(
                            page_selector.xpath(
                                '(./following-sibling::div)[1]//a'
                                '[text()="Like"]/@href').extract_first()
                        ).query)['id'][0])
            except Exception:
                # cannot find the Like button
                try:
                    loader.add_value(
                        'facebook_page_id',
                        page_selector.xpath(
                            './parent::div/parent::div[@id]/@id'
                        ).extract_first().split(':')[-1])
                except Exception:
                    # need to process the page itself to get the id
                    print('Cannot find id on ' + response.url)
                    continue
            loader.add_value('timestamp', datetime.datetime.now())
            yield loader.load_item()
        see_more_url = like_type_selector.xpath(
            './/span[text()="See More"]'
            '/parent::a/@href').extract_first() or \
            like_type_selector.xpath(
                './/a[text()="See More"]/@href').extract_first()
        if see_more_url:
            yield Request('https://m.facebook.com' + see_more_url,
                          callback=self.parse_likes,
                          meta={'id': response.meta['id'], 'type': type})
from __future__ import absolute_import
from frontera.contrib.scrapy.schedulers.frontier import FronteraScheduler
from tests.mocks.frontier_manager import FakeFrontierManager
from tests.mocks.crawler import FakeCrawler
from frontera.core.models import Request as FRequest
from frontera.core.models import Response as FResponse
from scrapy.http import Request, Response
from scrapy.spiders import Spider
from scrapy.settings import Settings
from six.moves import range

# test requests
r1 = Request('http://www.example.com')
r2 = Request('https://www.example.com/some/page')
r3 = Request('http://example1.com')

# test requests with redirects
rr1 = Request('http://www.example.com', meta={b'redirect_times': 1})
rr2 = Request('https://www.example.com/some/page', meta={b'redirect_times': 4})
rr3 = Request('http://example1.com', meta={b'redirect_times': 0})

# test frontier requests (FRequest, not scrapy Request, as the comment implies)
fr1 = FRequest('http://www.example.com')
fr2 = FRequest('https://www.example.com/some/page')
fr3 = FRequest('http://example1.com')


class TestFronteraScheduler(object):

    def test_enqueue_requests(self):
        crawler = FakeCrawler()
        fs = FronteraScheduler(crawler, manager=FakeFrontierManager)
def test_response_cacheability(self):
    responses = [
        # 304 is not cacheable no matter what the server sends
        (False, 304, {}),
        (False, 304, {'Last-Modified': self.yesterday}),
        (False, 304, {'Expires': self.tomorrow}),
        (False, 304, {'Etag': 'bar'}),
        (False, 304, {'Cache-Control': 'max-age=3600'}),
        # Always obey no-store cache control
        (False, 200, {'Cache-Control': 'no-store'}),
        (False, 200, {'Cache-Control': 'no-store, max-age=300'}),  # invalid
        (False, 200, {'Cache-Control': 'no-store', 'Expires': self.tomorrow}),  # invalid
        # Ignore responses missing expiration and/or validation headers
        (False, 200, {}),
        (False, 302, {}),
        (False, 307, {}),
        (False, 404, {}),
        # Cache responses with expiration and/or validation headers
        (True, 200, {'Last-Modified': self.yesterday}),
        (True, 203, {'Last-Modified': self.yesterday}),
        (True, 300, {'Last-Modified': self.yesterday}),
        (True, 301, {'Last-Modified': self.yesterday}),
        (True, 308, {'Last-Modified': self.yesterday}),
        (True, 401, {'Last-Modified': self.yesterday}),
        (True, 404, {'Cache-Control': 'public, max-age=600'}),
        (True, 302, {'Expires': self.tomorrow}),
        (True, 200, {'Etag': 'foo'}),
    ]
    with self._middleware() as mw:
        for idx, (shouldcache, status, headers) in enumerate(responses):
            req0 = Request('http://example-%d.com' % idx)
            res0 = Response(req0.url, status=status, headers=headers)
            res1 = self._process_requestresponse(mw, req0, res0)
            res304 = res0.replace(status=304)
            res2 = self._process_requestresponse(
                mw, req0, res304 if shouldcache else res0)
            self.assertEqualResponse(res1, res0)
            self.assertEqualResponse(res2, res0)
            resc = mw.storage.retrieve_response(self.spider, req0)
            if shouldcache:
                self.assertEqualResponse(resc, res1)
                assert 'cached' in res2.flags and res2.status != 304
            else:
                self.assertFalse(resc)
                assert 'cached' not in res2.flags

    # cache unconditionally unless the response contains no-store or is a 304
    with self._middleware(HTTPCACHE_ALWAYS_STORE=True) as mw:
        for idx, (_, status, headers) in enumerate(responses):
            shouldcache = ('no-store' not in headers.get('Cache-Control', '')
                           and status != 304)
            req0 = Request('http://example2-%d.com' % idx)
            res0 = Response(req0.url, status=status, headers=headers)
            res1 = self._process_requestresponse(mw, req0, res0)
            res304 = res0.replace(status=304)
            res2 = self._process_requestresponse(
                mw, req0, res304 if shouldcache else res0)
            self.assertEqualResponse(res1, res0)
            self.assertEqualResponse(res2, res0)
            resc = mw.storage.retrieve_response(self.spider, req0)
            if shouldcache:
                self.assertEqualResponse(resc, res1)
                assert 'cached' in res2.flags and res2.status != 304
            else:
                self.assertFalse(resc)
                assert 'cached' not in res2.flags
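# Hedged sketch (assumption): the settings that enable the middleware
# behavior exercised by the two cache tests in this section -- Scrapy's
# HTTP cache with the RFC 2616 policy that honors Cache-Control headers.
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'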