def parseAnimals(self, response):
    html = HtmlParser(response)
    for url in html.extract_urls('//div[@id="search_results"]/div/a'):
        yield Request(url, callback="parseAnimal")
    # extract_urls returns nothing when there is no "next" link,
    # so pagination simply stops on the last page.
    for url in html.extract_urls('//div[@class="pagination"]/a[@class="next"]'):
        yield Request(url, callback="parseAnimals")

def parseCat(self, response):
    parser = HtmlParser(response)
    for quote in parser.xpath('//span[@class="bqQuoteLink"]/a//text()'):
        self.mydb.quotes.insert({'quote': quote})
    # Matches nothing on the last page, so the spider stops by itself.
    for url in parser.extract_urls('//li/a[contains(text(),"Next")]'):
        yield Request(url, callback="parseCat")

def test_unique(self):
    req1 = Request("http://www.google.com", method="POST",
                   form_data={"test1": "abcd", "abcd": "test1"})
    req2 = Request("http://www.google.com", method="POST",
                   form_data={"abcd": "test1", "test1": "abcd"})
    req3 = Request("http://www.google.com", method="POST",
                   form_data={"abcd": "test1", "test1": "abdc"})
    self.assertEqual(req1.get_unique_id(), req2.get_unique_id())
    self.assertNotEqual(req1.get_unique_id(), req3.get_unique_id())

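# test_unique above relies on get_unique_id() being order-insensitive over the
# form fields. A minimal sketch of such a fingerprint, assuming it hashes the
# method, URL and sorted form data; this is an illustration, not dragline's
# actual scheme.
import hashlib
import json

def get_unique_id_sketch(url, method="GET", form_data=None):
    # Sorting the items makes {"a": "1", "b": "2"} and {"b": "2", "a": "1"}
    # produce the same digest, while changing any value changes it.
    payload = json.dumps(sorted((form_data or {}).items()))
    return hashlib.sha1((method + url + payload).encode("utf-8")).hexdigest()
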
def test_cookie(self):
    # Set a cookie server-side, then replay it on a second request.
    response = Request(httpbin('cookies', 'set?name=dragline')).send()
    response = Request(httpbin('cookies'), cookies=response.cookies).send()
    self.assertEqual(response.json()['cookies']['name'], 'dragline')
    # Delete the cookie server-side, then send an explicit cookie dict.
    response = Request(httpbin('cookies', 'delete?name')).send()
    response = Request(httpbin('cookies'), cookies={'name': 'dragline2'}).send()
    self.assertEqual(response.json()['cookies']['name'], 'dragline2')

def parse(self, response):
    parser = HtmlParser(response)
    todaysdate = datetime.date.today()
    for days in range(0, 60):
        # NOTE: '%-m' / '%-d' (no zero padding) is a glibc extension and
        # is not available on Windows.
        startdate = (todaysdate + datetime.timedelta(days=days)).strftime('%-m/%-d/%Y')
        checkindate = (todaysdate + datetime.timedelta(days=days)).strftime('%Y/%m/%d')
        enddate = (todaysdate + datetime.timedelta(days=days + 1)).strftime('%-m/%-d/%Y')
        # startdate = '8/29/2018'
        # enddate = '8/30/2018'
        inputdata = csv.DictReader(
            open(os.path.join(os.path.dirname(__file__), 'hotel_input.csv')))
        roomdata = {}
        nonavailablelist = []
        for inp in inputdata:
            if inp["property"]:
                url = inp["link"] + "&chkin=%s&chkout=%s" % (startdate, enddate)
                if inp["room_id"]:
                    roomdata[inp["room_id"]] = {
                        "hotel_name": inp["hotel_name"],
                        "ID": inp["ID"],
                        "property_id": inp["property_id"],
                        "checkindate": checkindate,
                        "saved": False
                    }
                else:
                    nonavailablelist.append({
                        "hotel_name": inp["hotel_name"],
                        "ID": inp["ID"],
                        "property_id": inp["property_id"],
                        "checkindate": checkindate
                    })
                if inp["lastcheck"]:
                    # Flush the accumulated rows for this property, then reset.
                    yield Request(url,
                                  callback=self.parse_booking_page,
                                  meta={
                                      "roomdata": roomdata,
                                      "nonavailablelist": nonavailablelist
                                  })
                    roomdata = {}
                    nonavailablelist = []

def parseCat(self, response):
    parser = HtmlParser(response)
    dbname = response.meta['u']
    for i in parser.xpath('//div[@class="quoteText"]'):
        quote = i.text
        for j in i.iterfind('a'):
            author = j.text
            self.mydb[dbname].insert({'quote': quote, 'author': author})
    # Empty on the last page, so pagination stops by itself.
    for url in parser.extract_urls('//a[@class="next_page"]'):
        yield Request(url, callback="parseCat", meta={'u': dbname})

def parse(url):
    request_headers = {
        "Accept-Encoding": "gzip",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36"
    }
    response = Request(url, headers=request_headers).send().text
    print("processed:", url)
    parser = HtmlParser(response)
    raw_price1 = parser.xpath("//span[@id='priceblock_saleprice']/text()")
    raw_price2 = parser.xpath("//span[@id='priceblock_ourprice']/text()")
    raw_availability = parser.xpath("//div[@id='availability']//text()")
    raw_name = parser.xpath("//h1[@id='title']//text()")
    raw_brand = parser.xpath("//a[@id='brand']//text()")
    raw_img = parser.xpath("//div[@class='imgTagWrapper']//img/@data-a-dynamic-image")
    # dict.keys() is not indexable in Python 3; take the first key explicitly.
    img = next(iter(json.loads(raw_img[0]))) if raw_img else ''
    name = ' '.join(''.join(raw_name).split()).strip()
    price = 0.0
    if raw_price1:
        price = float(raw_price1[0].replace(',', '').strip())
    if raw_price2:
        price = float(raw_price2[0].replace(',', '').strip())
    brand = ''.join(raw_brand).strip()
    availability = ''.join(raw_availability).strip()
    # Try the three URL shapes in order: /dp/<ASIN>, /product/<id>/ref..., /product/<id>.
    raw_product_id = re.findall("dp/([A-Z0-9]{,10})", url)
    if raw_product_id:
        product_id = raw_product_id[0]
    elif re.findall("product/(.*)/ref", url):
        product_id = re.findall("product/(.*)/ref", url)[0]
    else:
        product_id = re.findall("product/(.*)", url)[0]
    data = {
        "price": price,
        "availability": availability,
        "product_id": product_id,
        "name": name,
        "image": img,
        "brand": brand
    }
    return data

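# Illustrative checks (assumed URLs) for the three product-id patterns that
# parse() tries in order:
#
# >>> re.findall("dp/([A-Z0-9]{,10})", "https://www.amazon.com/dp/B000MFNB5O")
# ['B000MFNB5O']
# >>> re.findall("product/(.*)/ref", "https://www.amazon.com/gp/product/B000MFNB5O/ref=ox_sc_act_title_1")
# ['B000MFNB5O']
# >>> re.findall("product/(.*)", "https://www.amazon.com/gp/product/B000MFNB5O")
# ['B000MFNB5O']
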
def parse(self, response):
    if len(response.text) < 10000:
        raise RequestError('Invalid Response')
    with open(os.path.join(os.path.dirname(__file__), 'need_to_scrape.csv')) as f:
        reader = list(csv.DictReader(f))
    # reader = ["B000MFNB5O"]
    for asins in reader:
        asin_id = asins.get('October_ASIN_List')
        asin_id = asin_id.strip() if asin_id else None
        # asin_id = asins
        if not asin_id:
            continue
        urls = 'https://www.amazon.com/dp/' + asin_id
        searchurl = ("https://www.amazon.com/s/ref=nb_sb_noss"
                     "?url=search-alias=aps&field-keywords=" + asin_id)
        meta = {
            'resultsurl': searchurl,
            'input_asin': asin_id,
            'retried': 0,
        }
        yield Request(urls, callback=self.parse_product, meta=meta)

def process(req_or_url):
    global data
    if not req_or_url:
        return help()
    if isinstance(req_or_url, Request):
        data["request"] = req_or_url
    else:
        data["request"] = Request(req_or_url)
    try:
        data["response"] = data["request"].send()
    except Exception:
        data["response"] = None
        print(traceback.format_exc())
        print("Failed to fetch")
    try:
        data["parser"] = HtmlParser(data["response"])
    except Exception:
        data["parser"] = None
        print("Failed to parse response")
    return help()

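# A hypothetical interactive session using process() above; the shape of the
# module-level "data" dict is assumed from the assignments in the function.
#
# >>> process("http://www.example.org")
# >>> data["response"].status_code
# 200
# >>> data["parser"].xpath("//title/text()")
# ['Example Domain']
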
def parse_product(self, response):
    response.meta['Referer'] = response.url
    meta = response.meta
    input_asin = meta.get('input_asin')
    if "Sorry! We couldn't" in response.text or response.status_code == 404:
        # self.logger.warning("Page Not Found %s" % response.url)
        data = {}
        data['Asin'] = response.meta['input_asin']
        data['Url'] = response.url
        Missing(**data).save()
    elif 'captcha' in response.text.lower():
        # self.logger.warning("Captcha Page url is %s, length is %d" % (response.url, len(response.text)))
        headers = {}
        headers['User-Agent'] = get_ua()
        headers['Host'] = "www.amazon.com"
        headers['Referer'] = response.url
        if response.meta['retried'] <= settings.MAX_RETRY:
            response.meta['retried'] += 1
            meta = {
                'resultsurl': response.meta['resultsurl'],
                'input_asin': response.meta['input_asin'],
                "Referer": response.url,
                "headers": headers,
                "retried": response.meta['retried'],
            }
            yield Request(response.url, callback=self.parse_product,
                          headers=headers, meta=meta, dont_filter=True)
        else:
            self.logger.warning("Rejecting %s" % response.url)
    elif response.status_code == 503:
        self.logger.warning("Blocked page url is %s, length is %d" %
                            (response.url, len(response.text)))
        raise RequestError("Page Blocked")
    elif len(response.text) < 10000:
        # Suspiciously short page: retry with fresh headers.
        headers = {}
        headers['User-Agent'] = get_ua()
        headers['Host'] = "www.amazon.com"
        headers['Referer'] = response.meta['Referer']
        if response.meta['retried'] <= settings.MAX_RETRY:
            response.meta['retried'] += 1
            meta = {
                'resultsurl': response.meta['resultsurl'],
                'input_asin': input_asin,
                "Referer": response.meta['Referer'],
                "headers": headers,
                "retried": response.meta['retried'],
            }
            yield Request(response.url, callback=self.parse_product,
                          headers=headers, meta=meta, dont_filter=True)
        else:
            self.logger.warning("Rejecting %s" % response.url)
    else:
        parser = HtmlParser(response)
        namepath = './/h1[@id="title"]/span[@id="productTitle"]/text()'
        brandpath1 = './/th[contains(text(),"Brand")]/following-sibling::td/text()'
        brandpath2 = './/table//td[contains(text(),"Brand")]/following-sibling::td/text()'
        brandpath3 = './/li[contains(b/text(),"Brand")]/text()'
        brandpath4 = './/a[@id="bylineInfo"]/text()'
        pricepath1 = './/span[@id="priceblock_dealprice"]/text()'
        pricepath2 = './/span[@id="priceblock_ourprice"]/text()'
        pricepath3 = './/span[@id="priceblock_saleprice"]/text()'
        pricepath4 = './/span[@class="a-color-price"]/text()'
        partnumberpath1 = './/th[contains(text(),"Part Number")]/following-sibling::td/text()'
        partnumberpath2 = './/table//td[contains(text(),"Part Number")]/following-sibling::td/text()'
        partnumberpath3 = './/th[contains(text(),"Item model number")]/following-sibling::td/text()'
        partnumberpath4 = './/li[contains(b/text(),"Item model number")]/text()'
        partnumberpath5 = './/li[contains(b/text(),"Part Number")]/text()'
        partnumberpath6 = './/li[contains(b/text(),"Model")]/text()'
        partnumberpath7 = './/th[contains(text(),"Model")]/following-sibling::td/text()'
        partnumberpath8 = './/table//td[contains(text(),"Model")]/following-sibling::td/text()'
        packagedimensionpath1 = './/th[contains(text(),"Package Dimension")]/following-sibling::td/text()'
        packagedimensionpath2 = './/table//td[contains(text(),"Package Dimension")]/following-sibling::td/text()'
        packagedimensionpath3 = './/li[contains(b/text(),"Package Dimension")]/text()'
        packagedimensionpath4 = './/li//span[contains(text(),"Package Dimension")]/following-sibling::span/text()'
        packagedimensionpath5 = './/th[contains(span/text(),"Package Dimension")]/following-sibling::td[1]//text()'
        asinpath1 = './/input[@id="ASIN"]/@value'
        asinpath2 = './/li[contains(b/text(),"ASIN")]/text()'
        asinpath3 = './/th[contains(text(),"ASIN")]/following-sibling::td/text()'
        ratingpath1 = './/div[@id="averageCustomerReviews"]//span[contains(@class,"reviewCount")][contains(@title,"out of 5 stars")]/@title'
        ratingpath2 = './/th[contains(text(),"Customer Reviews")]/following-sibling::td/text()'
        oempath1 = './/th[contains(text(),"OEM Part Number")]/following-sibling::td/text()'
        oempath2 = './/table//td[contains(text(),"OEM Part Number")]/following-sibling::td/text()'
        oempath3 = './/li[contains(b/text(),"OEM Part Number")]/text()'
        oempath4 = './/table//tr[contains(th//a/text(),"OEM Part Number")]/td/text()'
        oempath5 = './/table//tr[contains(td//a/text(),"OEM Part Number")]/td/text()'

        name = parser.xpath(namepath)
        if name:
            name = name[0].strip()
        else:
            # Retry when the page has no product title.
            self.logger.warning("Bad output (parse_product): url is %s" % response.url)
            raise RequestError('Invalid Page Response')

        # For each field, take the first xpath that matches anything.
        brand = (parser.xpath(brandpath1) or parser.xpath(brandpath2)
                 or parser.xpath(brandpath3) or parser.xpath(brandpath4))
        brand = brand[0].strip() if brand else None

        price = (parser.xpath(pricepath1) or parser.xpath(pricepath2)
                 or parser.xpath(pricepath3) or parser.xpath(pricepath4))
        price = price[0].strip() if price else None

        partnumber = (parser.xpath(partnumberpath1) or parser.xpath(partnumberpath2)
                      or parser.xpath(partnumberpath3) or parser.xpath(partnumberpath4)
                      or parser.xpath(partnumberpath5) or parser.xpath(partnumberpath6)
                      or parser.xpath(partnumberpath7) or parser.xpath(partnumberpath8))
        partnumber = partnumber[0].strip() if partnumber else None

        packagedimensiontext = (parser.xpath(packagedimensionpath1)
                                or parser.xpath(packagedimensionpath2)
                                or parser.xpath(packagedimensionpath3)
                                or parser.xpath(packagedimensionpath4)
                                or parser.xpath(packagedimensionpath5))
        packagedimension = ''.join(i.strip() for i in packagedimensiontext if i.strip())
        packagedimension = packagedimension.strip() if packagedimension else None

        asin = (parser.xpath(asinpath1) or parser.xpath(asinpath2)
                or parser.xpath(asinpath3))
        asin = asin[0].strip()

        rating1 = parser.xpath(ratingpath1)
        rating2 = parser.xpath(ratingpath2)
        if rating1:
            rating = rating1[0].replace('out of 5 stars', '').strip()
        elif rating2:
            rating = ' '.join(''.join(rating2).split()).replace('out of 5 stars', '').strip()
        else:
            rating = None

        oem = (parser.xpath(oempath1) or parser.xpath(oempath2)
               or parser.xpath(oempath3) or parser.xpath(oempath4)
               or parser.xpath(oempath5))
        oem = oem[0].strip() if oem else None

        seller_rank_list = []
        seller_rank_xpath = './/th[contains(text(),"Sellers Rank")]/following-sibling::td/span//span'
        for rank in parser.xpath(seller_rank_xpath):
            seller_rank = rank.xpath('.//text()')
            seller_rank = ' '.join(''.join(seller_rank).split()) if seller_rank else None
            if seller_rank:
                seller_rank_list.append(seller_rank.replace('(See top 100)', ''))
        if not seller_rank_list:
            sell_t2 = parser.xpath('//li[@id="SalesRank"]')
            sell_t2 = sell_t2[0].extract_text() if sell_t2 else None
            # Strip category boilerplate and inline CSS before splitting ranks.
            sell_t2 = sell_t2.replace(
                '(See Top 100 in Beauty & Personal Care)', '').replace(
                ' (See Top 100 in Sports & Outdoors)', '').replace(
                ' .zg_hrsr { margin: 0; padding: 0; list-style-type: none; } .zg_hrsr_item { margin: 0 0 0 10px; } .zg_hrsr_rank { display: inline-block; width: 80px; text-align: right; } ',
                ' | ').replace('Amazon Best Sellers Rank: ', '').split('| ') if sell_t2 else None
            seller_rank_list = sell_t2 if sell_t2 else None

        searched_asin = input_asin
        # resultsurl = response.meta["resultsurl"]
        resultsurl = ("https://www.amazon.com/s/ref=nb_sb_noss"
                      "?url=search-alias=aps&field-keywords=" + input_asin)

        item_package_quantity = parser.xpath(
            '//div[contains(label//text(),"Item Package Quantity:")]//following-sibling::span')
        item_package_quantity = item_package_quantity[0].extract_text() if item_package_quantity else None

        # New fields:
        ship_check = parser.xpath(
            '//div[contains(@data-feature-name,"shipsFromSoldByInsideBuyBox")'
            ' or contains(@id,"shipsFromSoldByInsideBuyBox_feature_div")]')
        raw_ship_info = parser.xpath('//div[@id="merchant-info"]')
        raw_ship_info2 = parser.xpath('//span[contains(@id,"merchant-info")]')
        raw_ship_info3 = parser.xpath('//div[contains(@id,"merchant-info")]')
        ship_info = raw_ship_info[0].extract_text() if raw_ship_info else None
        ship_info = ship_info or (raw_ship_info2[0].extract_text() if raw_ship_info2 else None)
        ship_info = ship_info or (raw_ship_info3[0].extract_text() if raw_ship_info3 else None)
        # if not ship_info and ship_check:
        #     ship_info = "Ships from and sold by Amazon.com.."

        raw_inventory = parser.xpath('//div[@id="availability"]')
        inventory = raw_inventory[0].extract_text() if raw_inventory else None
        raw_item_description = parser.xpath('.//h1[@id="title"]/span[@id="productTitle"]')
        item_description = raw_item_description[0].extract_text() if raw_item_description else None
        raw_no_of_cust = parser.xpath('.//a[contains(@id,"acrCustomerReviewLink")]')
        raw_no_of_cust = raw_no_of_cust[0].extract_text() if raw_no_of_cust else None
        date_first_available = parser.xpath(
            '//th[contains(text(),"Date First Available")]//following-sibling::td')
        date_first_available = date_first_available[0].extract_text() if date_first_available else None
        item_weight = parser.xpath('//th[contains(text(),"Item Weight")]//following-sibling::td')
        item_weight_t2 = parser.xpath(
            '//b[contains(text(),"Item Weight") or contains(text(),"Shipping Weight:")]//following-sibling::text()')
        item_weight_t2 = ', '.join(item_weight_t2).replace('(', '').replace(
            ')', '').strip().strip(', ') if item_weight_t2 else None
        item_weight = item_weight[0].extract_text() if item_weight else item_weight_t2
        pro_dimen = parser.xpath(
            '//th[contains(text(),"Product Dimensions") or contains(text(),"Package Dimensions")]//following-sibling::td')
        pro_dimen = pro_dimen[0].extract_text() if pro_dimen else None
        pro_dimen_t2 = parser.xpath(
            '//b[contains(text(),"Product Dimensions:") or contains(text(),"Package Dimensions")]//following-sibling::text()')
        pro_dimen_t2 = ', '.join(pro_dimen_t2).strip().strip(', ') if pro_dimen_t2 else None
        pro_dimen = pro_dimen if pro_dimen else pro_dimen_t2

        # ----- Second parse over the "Product information" section of the page -----
        second_response = response.text.split('Product information')
        second_parser = second_response[1] if len(second_response) >= 2 else None
        checkreviews_p2 = ''
        if second_parser:
            second_parser = HtmlParser(second_parser)
            seller_rank_list_p2 = []
            for rank_p2 in second_parser.xpath(seller_rank_xpath):
                seller_rank_p2 = rank_p2.xpath('.//text()')
                seller_rank_p2 = ' '.join(''.join(seller_rank_p2).split()) if seller_rank_p2 else None
                if seller_rank_p2:
                    seller_rank_list_p2.append(seller_rank_p2.replace('(See top 100)', ''))
            partnumber_p2 = (second_parser.xpath(partnumberpath1)
                             or second_parser.xpath(partnumberpath2)
                             or second_parser.xpath(partnumberpath3)
                             or second_parser.xpath(partnumberpath4)
                             or second_parser.xpath(partnumberpath5)
                             or second_parser.xpath(partnumberpath6)
                             or second_parser.xpath(partnumberpath7)
                             or second_parser.xpath(partnumberpath8))
            partnumber_p2 = partnumber_p2[0].strip() if partnumber_p2 else None
            oem_p2 = (second_parser.xpath(oempath1)
                      or second_parser.xpath(oempath2)
                      or second_parser.xpath(oempath3)
                      or second_parser.xpath(oempath4)
                      or second_parser.xpath(oempath5))
            oem_p2 = oem_p2[0].strip() if oem_p2 else None
            raw_item_description2 = second_parser.xpath('.//h1[@id="title"]/span[@id="productTitle"]')
            item_description2 = raw_item_description2[0].extract_text() if raw_item_description2 else None
            raw_no_of_cust2 = second_parser.xpath('.//a[contains(@id,"acrCustomerReviewLink")]')
            raw_no_of_cust2 = raw_no_of_cust2[0].extract_text() if raw_no_of_cust2 else None
            date_first_available2 = second_parser.xpath(
                '//th[contains(text(),"Date First Available")]//following-sibling::td')
            date_first_available2 = date_first_available2[0].extract_text() if date_first_available2 else None
            item_weight2 = second_parser.xpath('//th[contains(text(),"Item Weight")]//following-sibling::td')
            item_weight2 = item_weight2[0].extract_text() if item_weight2 else None
            pro_dimen2 = second_parser.xpath(
                '//th[contains(text(),"Product Dimensions") or contains(text(),"Package Dimensions")]//following-sibling::td')
            pro_dimen2 = pro_dimen2[0].extract_text() if pro_dimen2 else None
            checkreviews_p2 = second_parser.xpath('.//div[@data-hook="top-customer-reviews-widget"]')
        else:
            self.logger.warning("No second parser %s, length is %d" %
                                (response.url, len(response.text)))
            oem_p2 = None
            partnumber_p2 = None
            seller_rank_list_p2 = None
            item_description2 = None
            item_weight2 = None
            pro_dimen2 = None
            raw_no_of_cust2 = None
            date_first_available2 = None

        product = {
            "Brand": brand,
            "Part_No": partnumber if partnumber else partnumber_p2,
            "ASIN": asin,
            "Rating_Numbers": rating,
            "OE_NO": oem if oem else oem_p2,
            "Price": price,
            "Customer_Name": None,
            "Commented_Date": None,
            "Comments": None,
            "Url": response.url,
            "Name": name,
            "Searched_ASIN": searched_asin,
            "Results_Url": resultsurl,
            "Best_Seller_Rank": seller_rank_list if seller_rank_list else seller_rank_list_p2,
            "Review_Url": None,
            "Item_Description": item_description if item_description else item_description2,
            "Item_Package_Quantity": item_package_quantity,
            "Inventory": inventory,
            "Ship_Info": ship_info,
            "Item_Weight": item_weight if item_weight else item_weight2,
            "Product_Dimensions": pro_dimen if pro_dimen else pro_dimen2,
            "Number_of_Customer_Reviews": raw_no_of_cust if raw_no_of_cust else raw_no_of_cust2,
            "Date_First_Available": date_first_available if date_first_available else date_first_available2,
        }

        checkreviews = parser.xpath('.//div[@data-hook="top-customer-reviews-widget"]')
        if not checkreviews_p2:
            checkreviews_p2 = None
        checkreviews = checkreviews if checkreviews else checkreviews_p2
        # If the product has reviews, build the "sort by recent" review URL and
        # crawl it; otherwise save the record with blank review fields.
        if checkreviews or raw_no_of_cust:
            reviewsurl = ('https://www.amazon.com/product-reviews/' + asin +
                          "?ie=UTF8&reviewerType=avp_only_reviews&sortBy=recent")
            yield Request(reviewsurl, callback="parse_reviews", meta=product)
        else:
            Product(**product).save()

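# The repeated "first xpath that matches, stripped, else None" pattern in
# parse_product() could be factored into a helper; a refactoring sketch
# (first_match is a hypothetical name, not part of the original spider):
def first_match(parser, *xpaths):
    """Return the first non-empty xpath result, stripped, or None."""
    for xp in xpaths:
        result = parser.xpath(xp)
        if result:
            return result[0].strip()
    return None

# e.g. brand = first_match(parser, brandpath1, brandpath2, brandpath3, brandpath4)
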
def test_post_raw(self):
    data = 'dragline'
    response = Request(httpbin('post'), data=data).send()
    self.assertEqual(data, response.json()['data'])

def test_request(self):
    # Fetch the same page with httplib2 and with Request; bodies should match.
    req = Http()
    headers, content = req.request("http://www.example.org")
    reqst = Request("http://www.example.org")
    response = reqst.send()
    self.assertEqual(content, response.body)

def parse(self, response):
    parser = HtmlParser(response)
    for url in parser.extract_urls('//a[@class="actionLinkLite serif"]'):
        dbname = url.split('/')[-1]
        yield Request(url, callback="parseCat", meta={'u': dbname})

def test_get(self):
    response = Request(httpbin('get')).send()
    self.assertEqual(httpbin('get'), response.json()['url'])

def test_post_form(self):
    data = {'name': 'dragline'}
    response = Request(httpbin('post'), data=data).send()
    self.assertEqual(data, response.json()['form'])

def test_redirect(self):
    response = Request(httpbin('redirect', '4')).send()
    self.assertEqual(httpbin('get'), response.json()['url'])
    self.assertEqual(response.status_code, 200)
    response = Request(httpbin('redirect', '4'), allow_redirects=False).send()
    self.assertEqual(response.status_code, 302)

def parse(self, response):
    parser = HtmlParser(response)
    for url in parser.extract_urls('//div[@class="bqLn"]/div[@class="bqLn"]/a'):
        yield Request(url, callback="parseCat")

def test_headers(self):
    headers = {"user-agent": "dragline"}
    response = Request(httpbin('user-agent'), headers=headers).send()
    self.assertEqual(headers, response.json())

def parse(self, response):
    html = HtmlParser(response)
    photo_list = ['//li[@class="first"]', '//li[@class=" "]', '//li[@class="last"]']
    for item in photo_list:
        for url in html.extract_urls(item):
            yield Request(url, callback="parseAnimals")