def parse_user(self, response):
    item = MFWItem()
    item['uid'] = response.meta['uid']
    item['name'] = response.xpath('//div[@class="MAvaName"]/text()').extract_first()
    item['level'] = int(response.xpath(
        '//span[@class="MAvaLevel flt1"]/a/@title').extract_first().split('.')[-1])
    if item['level'] <= 3:
        return
    item['tags'] = response.xpath(
        '//div[@class="its_tags"]//i[contains(@class, "on")]/../@title').extract()
    item['attention'] = [int(i) for i in response.xpath(
        '//div[@class="MAvaMore clearfix"]//a/text()').extract()]
    item['groups'] = response.xpath(
        '//div[@class="MGroupDetail"]//a[@class="name"]/text()').extract()
    item['dynamic'] = response.xpath('//span[@class="time"]/text()').extract()
    item['download'] = []
    infos = response.xpath('//div[@class="common_block relative_info"]/p')
    for info in infos:
        if u'刚刚下载了' in ''.join(info.xpath('text()').extract()):
            item['download'].append({
                'time': info.xpath('span[@class="time"]/text()').extract_first(),
                'name': info.xpath('a/text()').extract()[-1],
            })
    item['note'] = {}
    item['path'] = []
    item['review'] = []
    item['together'] = []
    note = response.xpath(u'//a[@title="TA的游记"]/@href').extract_first()
    req = Request(urljoin(response.url, note), callback=self.parse_note)
    req.meta['item'] = item
    yield req
def _parse_symptom_question(self, response):
    symptom_question_item = response.meta.get('symptom_questions')
    if not symptom_question_item:
        symptom_question_item = SymptomQuestionItem()
        symptom_question_item['symptom_name'] = response.meta['symptom_item']['name']
        symptom_question_item['qids'] = []
    # Collect the question ids on this list page
    urls = response.xpath('//div[@class="p_list_li"]/div[@class="p_list_cent"]/div[@class="p_list_centt"]/dl/dt/a/@href').extract()
    symptom_question_item['qids'] += [u.split('/')[-1].split('.')[0] for u in urls]
    next_url = response.xpath('//div[@class="portlet-content"]/a[text()="下一页 >"]/@href').extract()
    if not next_url:
        # All pages have been processed
        print symptom_question_item
        yield symptom_question_item
    else:
        url = next_url[0]
        request = Request(url, dont_filter=True, callback=self._parse_symptom_question)
        request.meta['symptom_questions'] = symptom_question_item
        yield request
def parse(self, response):
    tabs = []
    tab_selector = response.xpath('//div[@id="siteDirectory"]')
    # Loop over all tabs
    for tab in tab_selector.xpath('.//div[@class="popover-grouping"]'):
        tabNameSel = tab.xpath("h2/text()").extract()
        if tabNameSel:
            tabName = tabNameSel[0]
            fobj = open(tabName + ".txt", "a+")
            cat_selector = tab.xpath(".//ul")
            # Loop over all categories
            for category in cat_selector.xpath("li"):
                catNameSel = category.xpath("a/text()").extract()
                if catNameSel:
                    catName = catNameSel[0]
                    catLinkSel = category.xpath("a/@href").extract()
                    if catLinkSel:
                        catLink = "http://www.amazon.in" + catLinkSel[0]
                        request = Request(catLink, callback=self.parse_subcatpage)
                        request.meta["fobj"] = fobj
                        request.meta["tabName"] = tabName
                        request.meta["catName"] = catName
                        yield request
            fobj.close()
def getItem(self, school):
    item = SchoolItem()
    logo = school.xpath('div/div[contains(@class,"school_m_img fl")]/a/img/@src').extract()
    item["logo"] = logo[0] if logo else ""
    # name, province, city and area live under school_m_main
    school_main = school.xpath('div/div[contains(@class,"school_m_main fl")]')
    name = school_main.xpath("li/h3/a/text()").extract()
    item["name"] = name[0] if name else ""
    item["province"] = ""
    item["city"] = ""
    item["area"] = ""
    tempLocation = school_main.xpath("li[2]/b/text()").extract()
    if tempLocation:
        location = tempLocation[0].split()
        item["province"] = location[0] if len(location) > 0 else ""
        item["city"] = location[1] if len(location) > 1 else ""
        item["area"] = location[2] if len(location) > 2 else ""
    catagery = school_main.xpath("li[3]/b/text()").extract()
    schoolType = school_main.xpath("li[4]/ol[1]/b/text()").extract()
    level = school_main.xpath("li[4]/ol[2]/b/text()").extract()
    item["level"] = level[0] if level else ""
    item["catagery"] = catagery[0] if catagery else ""
    item["schoolType"] = schoolType[0] if schoolType else ""
    # address and phone live under school_m_lx
    addressAndPhone = school.xpath('ul[contains(@class,"school_m_lx")]')
    address = addressAndPhone.xpath("li[1]/b/text()").extract()
    item["address"] = address[0] if address else ""
    item["phone"] = addressAndPhone.xpath("li[2]/b/text()").extract()
    schoollUrl = school_main.xpath("li/h3/a/@href").extract()[0]
    request = Request(schoollUrl, callback=self.parse_schoolIntroUrl)
    request.meta["item"] = item
    return request
def parseJsonProduct(self, response):
    item = response.meta["item"]
    # Make valid JSON out of the body and drop the unneeded trailing data
    prodResponse = response.body.split("$+$")[0].strip().replace("'", '"')
    prodDict = {}
    sizeWidthDict = {}
    jsonresponse = json.loads(prodResponse)
    for product, value in jsonresponse.iteritems():
        if item["sku"] not in prodDict:
            prodDict[item["sku"]] = {}
        if value["c"] not in prodDict[item["sku"]]:
            prodDict[item["sku"]][value["c"]] = {}
        if value["w"] not in prodDict[item["sku"]][value["c"]]:
            prodDict[item["sku"]][value["c"]][value["w"]] = {}
        if value["s"] not in sizeWidthDict:
            sizeWidthDict[value["s"]] = []
        if value["w"] not in sizeWidthDict[value["s"]]:
            sizeWidthDict[value["s"]].append(value["w"])
        prodDict[item["sku"]][value["c"]][value["w"]][value["s"]] = value["sku"]
    item["variant"] = prodDict
    item["size_width_list"] = sizeWidthDict
    # Request the first image set
    if item["imageSetUrls"]:
        color, href = item["imageSetUrls"].popitem()
        if len(href) > 1:
            item["imageSetUrls"][color] = href[1:]
        request = Request(href[0], callback=self.parseJsonImageSet)
        request.meta["item"] = item
        return request
    self.to_csv(item)
    return item
def parse_page(self, response):
    if 'crawldepth' in response.meta:
        depth = response.meta['crawldepth']
    else:
        # Set search depth here
        depth = 1
    log.msg('Depth = %s' % str(depth), level=log.INFO)
    if not isinstance(response, HtmlResponse):
        log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
        return
    log.msg('Response from: %s' % response.url, level=log.INFO)
    url_bf.add(response.url)
    # TODO: Extract page title
    extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
    cleaned_text = extractor.getText()
    # Eliminate duplicates
    keywordset = set(keywordlist)
    found_list = []
    for keyword in keywordset:
        # TODO: Is there a more efficient way to do this?
        # Look at word boundaries to match entire words only
        if re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text):
            found_list.append(keyword)
    # Parse this page
    item = BiffleItem()
    if len(found_list) > 0:
        item['url'] = response.url
        item['body'] = cleaned_text
        item['keywords'] = ', '.join(found_list)
        item['process_date'] = datetime.today()
        log.msg("Keyword(s) found: %s" % ', '.join(found_list), level=log.INFO)
        self.map_keyword_count(found_list)
        yield item
    if depth > 0:
        # Find the next requests and yield those
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a/@href').extract()
        log.msg('Links on page: %s' % len(links), level=log.INFO)
        # Decrement depth for the next layer of links
        depth -= 1
        log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
        for l in links:
            l = urlparse.urljoin(response.url, l)
            if l in url_bf:
                pass  # duplicate URL, already queued
            else:
                url_bf.add(l)
                request = Request(l, callback=self.parse_page)
                request.meta['crawldepth'] = depth
                yield request
def amazon_marketplace(self, response):
    sel = Selector(response)
    item = response.meta['item']
    try:
        sp = sel.xpath("//span[@style='text-decoration: inherit; white-space: nowrap;']/text()").extract()[0].replace(",", "")
        shippingcost = sel.xpath("//span[@class='olpShippingPrice']/span/text()").extract()
        if shippingcost:
            sp = str(float(sp) + float(shippingcost[0].replace(",", "")))
        if sp > item['SP']:
            sp = item['SP']
    except:
        try:
            flipkart_url = flipkart_urls[item['index']]
            request = Request(flipkart_url, callback=self.flipkart_scraper)
            request.meta['item'] = item
            # request.meta['proxy'] = "http://111.161.126.100:80"
            yield request
        except:
            try:
                paytm_url = paytm_urls[item['index']]
                request = Request(paytm_url, callback=self.paytm_scraper)
                request.meta['item'] = item
                request.meta['proxy'] = "http://111.161.126.100:80"
                yield request
            except:
                self.to_csv(item)
def parse_monuments_en(self, response):
    sel = Selector(response)
    monument = sel.xpath('//div[@class="col-50 content-desc"]')
    title = monument.xpath("h2[@class='big sec-color']/text()").extract()
    summary = ''.join(monument.xpath("div[@id='idContentScroll']/span/p//text()").extract())
    informationLink = monument.xpath("div[@id='idContentScroll']/span/a/@href").extract()
    item = response.meta['item']
    if len(informationLink) > 0:
        item['informationLink_en'] = informationLink.pop()
    else:
        item['informationLink_en'] = response.url
    if len(title) > 0:
        item['name_en'] = title.pop()
    else:
        item['name_en'] = ''
    if len(summary) > 0:
        item['description_en'] = summary
    else:
        item['description_en'] = ''
    if len(informationLink) > 0:
        item['informationLink'] = informationLink.pop()
    else:
        item['informationLink'] = response.url
    euLink = sel.xpath('//*[@id="eu"]/@href').extract()
    request = Request(self.BASE + str(euLink.pop()), callback=self.parse_monuments_eu)
    request.meta['item'] = item
    yield request
def parse_restaurants_en(self, response):
    sel = Selector(response)
    item = response.meta['item']
    descriptionpath = sel.xpath("//*[@id='idContentScroll']")
    description = descriptionpath.xpath("span[@itemprop='description']/p//text()").extract()
    timetable = descriptionpath.xpath("span[@itemprop='description']/p[2]//text()").extract()
    timetable2 = descriptionpath.xpath("span[@itemprop='description']/p[3]//text()").extract()
    categoryPath = sel.xpath("//*[@id='gastronomy-content']/section[2]/div/section[1]/section/div/ul/li[2]/p[2]")
    category = categoryPath.xpath("a/strong/text()").extract()
    if len(description) > 0:
        item['description_en'] = ' '.join(description)
    else:
        item['description_en'] = ''
    if len(category) > 0:
        item['category_en'] = ['Restaurant', category.pop()]
    else:
        item['category_en'] = ['Restaurant', 'Others']
    if len(timetable) > 0:
        if len(timetable2) > 0:
            item['timetable_en'] = ' '.join([timetable.pop(), timetable2.pop()])
        else:
            item['timetable_en'] = timetable.pop()
    else:
        item['timetable_en'] = ''
    link = response.url.replace("/en/", "/eu/")
    request = Request(link, callback=self.parse_restaurants_eu)
    request.meta['item'] = item
    yield request
def parse_disease(self, response):
    """Parse a disease page."""
    disease_item = DiseaseItem()
    disease_item['url'] = response.url
    _name = response.xpath('//div[@class="p_lbox1"]/div[@class="p_lboxti"]/h3')
    disease_item['name'] = _name.xpath('text()').extract()[0]
    _other_name = _name.xpath('var/text()').extract()
    if _other_name:
        begin = _other_name[0].find(':') + 1
        end = _other_name[0].rfind(')')
        disease_item['aliases'] = re.split(',|,', _other_name[0][begin:end])
    _related = response.xpath('//div[@id="yw4"]/div/div/div')
    disease_item['related_diseases'] = _related.xpath('ul/li/a[contains(@href, "/jibing/")]/@title').extract()
    disease_item['related_symptoms'] = _related.xpath('ul/li/a[contains(@href, "/zhengzhuang/")]/@title').extract()
    yield disease_item
    # Go on parsing the detail pages
    detail_urls = response.xpath('//div[@class="p_lbox1_ab"]/a/@href').extract()
    detail_urls += response.xpath('//ul[@class="p_sibox2ul clears"]/li/a/@href').extract()
    for url in detail_urls:
        request = Request(url=url, dont_filter=True, callback=self._parse_disease_detail)
        request.meta['disease_item'] = disease_item
        yield request
    # Go on parsing the question list
    question_url = response.xpath('//div[@class="p_lbox5"]/div[@class="p_lboxti"]/a/@href').extract()[0]
    request = Request(url=question_url, dont_filter=True, callback=self._parse_disease_question)
    request.meta['disease_item'] = disease_item
    yield request
def parse_symptom(self, response):
    """Parse a symptom page."""
    symptom_item = SymptomItem()
    symptom_item['url'] = response.url
    symptom_item['name'] = response.xpath('//div[@id="m_1"]/div[@class="p_sibox1 p_siboxbor"]/div[@class="p_sititile"]/span/h1/text()').extract()[0]
    _related = response.xpath('//div[@id="yw3"]/div/div')
    symptom_item['related_diseases'] = _related.xpath('ul/li/a[contains(@href, "/jibing/")]/@title').extract()
    yield symptom_item
    # Go on parsing the detail pages
    detail_urls = response.xpath('//dl[@class="p_sibox1dl clears"]/dt/a/@href').extract()
    detail_urls += response.xpath('//ul[@class="p_sibox2ul clears"]/li/a[1]/@href').extract()
    for url in detail_urls:
        request = Request(url=url, dont_filter=True, callback=self._parse_symptom_detail)
        request.meta['symptom_item'] = symptom_item
        yield request
    # Go on parsing the question list
    question_url = response.xpath('//div[@class="p_sibox4 p_siboxbor"]/div[@class="p_sititile"]/a/@href').extract()[0]
    request = Request(url=question_url, dont_filter=True, callback=self._parse_symptom_question)
    request.meta['symptom_item'] = symptom_item
    yield request
def parse(self, response):
    '''
    Parse the response from the start urls (/channels).

    Channels are grouped by category, so this spider extracts the category of
    each channel and constructs a request carrying the category as meta
    information (it would not be available from the channel page otherwise).
    '''
    self.logger.debug("Parse url {}".format(response.url))
    cat_container = response.xpath('/html/body/div[1]/div/article/div')
    # Channels are grouped by category in containers with class '.channel-category'
    for cat in cat_container.css('.channel-category'):
        # Extract the title of the category
        cat_title = cat.xpath('h2/text()').extract_first()
        # Extract the links to the channel pages
        for channel in cat.css('ul.channel-grid li'):
            link = channel.xpath('a//@href').extract_first()
            full_link = loaders.contextualize(link, base_url=response.url)
            # Construct the request
            request = Request(full_link, callback=self.parse_channel)
            request.meta['category'] = cat_title
            yield request
def parse_history(self, response):
    # Parse the price-history table
    house = response.meta['item']
    tax_url = house['tax_url']
    price_history = []
    pattern = r' { "html": "(.*)" }'
    html = re.search(pattern, response.body).group(1)
    html = re.sub(r'\\"', r'"', html)  # unescape quotes
    html = re.sub(r'\\/', r'/', html)  # unescape forward slashes
    if html != "":
        soup = BeautifulSoup(html)
        table = soup.find('table')
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele for ele in cols]
            cols = cols[:3]
            if cols[2].find('span') is not None:
                date = cols[0].get_text()
                event = cols[1].get_text()
                price = cols[2].find('span').get_text()
                price_history.append([date, event, price])
    # Store the history as a JSON string
    house['price_history'] = json.dumps(price_history)
    tax_request = Request(tax_url, callback=self.parse_taxes)
    tax_request.meta['item'] = house
    return tax_request
def parse(self, response): """First step of Mon/gr parsing.""" try: # Connect to Beanstalkd server self.beanstalk = beanstalkc.Connection(host=self.host_beanstalkd, port=11301) # See all tubes: self.beanstalk.tubes() # Switch to the default (tube): self.beanstalk.use("default") # self.makedirResults() self.nodes = json.loads(response.body_as_unicode()) for node in self.nodes: link_node = self.domain + self.nodes[node] request = Request(link_node, callback=self.parseDomain) # Pass metadata to the next wave of parsing request.meta["node"] = node yield request except: print "Please run the beanstalkc" return
def parse(self, response):
    sel = Selector(response)
    item = DicksItem()
    if "&page=" in response.url:
        # Extract the page number and use it to compute the sort-order offset
        pagenumber = float(response.url.split("&page=")[-1])
    else:
        pagenumber = 1
    t = 0 + ((pagenumber - 1) * 48)
    item["Sort_Order"] = {}
    producturls = sel.xpath("//div[@class='prod-details']/h2/a/@href").extract()
    productnames = sel.xpath("//div[@class='prod-details']/h2/a/@title").extract()
    for url, name in zip(producturls, productnames):
        item["Sort_Order"]["http://www.dickssportinggoods.com" + url] = t
        t = t + 1
    # Compare the category URL and assign the LYS categorization
    for i in range(len(urllist)):
        if urllist[i] == response.url:
            item['Category'] = lyscat[i]
            item['id1'] = priceid[i]
            break
    for url, name in zip(producturls, productnames):
        if "Fitbit" not in name:
            request = Request("http://www.dickssportinggoods.com" + url, self.product_page)
            request.meta["item"] = item
            yield request
def parseJsonImageSet(self, response):
    item = response.meta["item"]
    imageSetResponse = response.body
    # Make valid JSON out of the jsonp response and normalize "item" so it is
    # always a list (a single default image comes back without the list wrapper)
    imageSetResponse = imageSetResponse.replace("/*jsonp*/s7jsonResponse(", "")
    imageSetResponse = ",".join(imageSetResponse.split(",")[:-1])
    imageSetResponse = imageSetResponse.replace('"item":[', '"item":')
    imageSetResponse = imageSetResponse.replace('"item":', '"item":[')
    imageSetResponse = imageSetResponse.replace("}]}}", "}}}")
    imageSetResponse = imageSetResponse[::-1].replace("}}}", "}}]}")[::-1]
    color = response.url.split("-")[1].split("?")[0]
    isImageSet = False
    if len(response.url.split("-")) > 2:
        isImageSet = True
    item["Product_Image_File1"][color] = []
    jsonresponse = json.loads(imageSetResponse)
    for index, imageItem in enumerate(jsonresponse["set"]["item"]):
        # Check whether there is an image set or only one image
        if "isDefault" not in imageItem["i"]:
            imageUrl = ("http://roadrunnersports.scene7.com/is/image/"
                        + imageItem["i"]["n"] + "?iv=" + imageItem["iv"])
            # The response url is an image set => the image can be scaled
            if isImageSet:
                imageUrl += "&scl=1"
            item["Product_Image_File1"][color].append(imageUrl)
        else:
            # There is no image set; append a request for the default image
            if item["color"][color] not in item["imageSetUrls"]:
                item["imageSetUrls"][item["color"][color]] = []
            if item["color"][color] not in item["imageSetUrls2"]:
                item["imageSetUrls2"][item["color"][color]] = []
            defaultImageUrl = ("http://roadrunnersports.scene7.com/is/image/roadrunnersports/"
                               + item["sku"] + "-" + color + "?req=set,json&scl=1")
            item["imageSetUrls"][item["color"][color]].append(defaultImageUrl)
            item["imageSetUrls2"][item["color"][color]].append(defaultImageUrl)
    if item["imageSetUrls"]:
        color, href = item["imageSetUrls"].popitem()
        if len(href) > 1:
            item["imageSetUrls"][color] = href[1:]
        request = Request(href[0], callback=self.parseJsonImageSet)
        request.meta["item"] = item
        return request
    self.to_csv(item)
    return item
def parse(self, response):
    dirname = os.getcwd()
    tabs = []
    tab_selector = response.xpath('//div[contains(@id, "SMWrapr")]')
    # Loop over all tabs
    for tab in tab_selector.xpath('.//div[contains(@id, "Tab")]'):
        tabNameSel = tab.xpath('div/span[2]/text()').extract()
        if tabNameSel:
            tabName = tabNameSel[0]
            os.chdir(dirname)
            if not os.path.exists(currDir + "/" + tabName):
                os.makedirs(currDir + "/" + tabName)
            fobj = open(currDir + "/" + tabName + ".txt", 'w')
        cat_selector = tab.xpath('div[2]/div[contains(@class, "SMSubCat")]')
        # Loop over all categories
        for category in cat_selector.xpath('div'):
            catNameSel = category.xpath('div/a/@title').extract()
            if catNameSel:
                catName = catNameSel[0]
            subcat_selector = category.xpath('.//ul')
            # Loop over all subcategories
            for subcat in subcat_selector.xpath('.//li'):
                subcatNameSel = subcat.xpath('.//a/@title').extract()
                if subcatNameSel:
                    subcatName = subcatNameSel[0]
                subcatLinkSel = subcat.xpath('.//a/@href').extract()
                if subcatLinkSel:
                    subcatLink = subcatLinkSel[0] + "?sort=plrty"
                    request = Request(subcatLink, callback=self.parse_subcatpage)
                    request.meta['fobj'] = fobj
                    request.meta['tabName'] = tabName
                    request.meta['catName'] = catName
                    request.meta['subcatName'] = subcatName
                    yield request
    os.chdir(dirname)
    fobj.close()
def parse(self, response):
    nums = Tag.remain_items()
    for i in nums:
        request_url = DOMAIN + '/view/' + str(i) + '.htm'
        request = Request(request_url, callback=self.parse_page)
        request.meta['view_num'] = str(i)
        yield request
        time.sleep(0.1)
def parse_list_detail(self, response):
    hxs = HtmlXPathSelector(response)
    shops = hxs.select('//li[@class="shopname"]/a/@href').extract()
    for shop in shops:
        url = base_url + shop
        request = Request(url, callback=self.parse_detail)
        request.meta["num"] = response.request.meta["num"]
        request.meta["need_js"] = True
        yield request
def parse_solutions(self, response):
    hxs = HtmlXPathSelector(response)
    x = hxs.select("//tr[@class='kol']//td[8]/ul/li/a/@href").extract()
    filename = response.meta['name']
    for i in range(10):
        request = Request('http://www.codechef.com/viewplaintext/' + x[i].split('/')[-1],
                          callback=self.parse_ptsol)
        request.meta['name'] = filename
        request.meta['count'] = str(i)
        yield request
def make_amazon_request(self, response, asin, amazonprice=None):
    request = Request('https://www.amazon.co.uk/gp/offer-listing/%s/' % asin,
                      callback=self.parse_offers)
    request.meta['ASIN'] = asin
    request.meta['Original_URL'] = response.url
    if amazonprice:
        try:
            request.meta["Amazon_Price"] = float(getprice.sub(r'', amazonprice))
        except:
            print "ERROR %s - %s" % (amazonprice, str(request))
    return request
def test_request_response_converters():
    spider = TestSpider()
    rc = RequestConverter(spider)
    rsc = ResponseConverter(spider, rc)

    url = "http://test.com/test?param=123"
    request = ScrapyRequest(url=url, callback=spider.callback, errback=spider.errback,
                            body=REQUEST_BODY)
    request.meta[b'test_param'] = b'test_value'
    request.headers.appendlist(b"TestKey", b"test value")
    request.cookies[b'MyCookie'] = b'CookieContent'

    frontier_request = rc.to_frontier(request)
    assert frontier_request.meta[b'scrapy_callback'] == b'callback'
    assert frontier_request.meta[b'scrapy_errback'] == b'errback'
    assert frontier_request.body == to_bytes(REQUEST_BODY)
    assert frontier_request.url == url
    assert frontier_request.method == b'GET'
    assert frontier_request.headers[b'Testkey'] == b'test value'
    assert frontier_request.cookies[b'MyCookie'] == b'CookieContent'
    assert b'frontier_request' not in frontier_request.meta[b'scrapy_meta']

    request_converted = rc.from_frontier(frontier_request)
    assert request_converted.meta[b'test_param'] == b'test_value'
    assert request_converted.body == to_bytes(REQUEST_BODY)
    assert request_converted.url == url
    assert request_converted.method == 'GET'
    assert request_converted.cookies[b'MyCookie'] == b'CookieContent'
    assert request_converted.headers.get(b'Testkey') == b'test value'
    assert request_converted.callback == spider.callback
    assert request_converted.errback == spider.errback

    # Some middleware could change .meta contents
    request_converted.meta[b'middleware_stuff'] = b'appeared'

    response = ScrapyResponse(url=url, request=request_converted, body=RESPONSE_BODY,
                              headers={b'TestHeader': b'Test value'})
    frontier_response = rsc.to_frontier(response)
    assert frontier_response.body == RESPONSE_BODY
    assert frontier_response.meta[b'scrapy_meta'][b'test_param'] == b'test_value'
    assert frontier_response.meta[b'scrapy_meta'][b'middleware_stuff'] == b'appeared'
    assert frontier_response.status_code == 200
    assert b'frontier_request' not in frontier_response.meta[b'scrapy_meta']

    response_converted = rsc.from_frontier(frontier_response)
    assert response_converted.body == RESPONSE_BODY
    assert response_converted.meta[b'test_param'] == b'test_value'
    assert response_converted.url == url
    assert response_converted.status == 200
    assert response_converted.headers[b'TestHeader'] == b'Test value'

    frontier_request = FrontierRequest(url)
    request_converted = rc.from_frontier(frontier_request)
    assert frontier_request.url == url
def start_requests(self):
    with open('imageURLs.csv') as csvFile:
        reader = csv.DictReader(csvFile)
        for row in reader:
            item = GetimagesprojectItem()
            image_url = row['URL']
            item['image_urls'] = [image_url]
            item['pid'] = row['ID']
            request = Request(image_url, callback=self.parse)
            request.meta['item'] = item
            yield request
def start_requests(self):
    for cate in cate_array:
        item_list_interface_url = (
            u'http://api.youzibuy.com/brand_area_catalog/item_list'
            u'?v=1.2.2&size=0&catalog_id=%s&group_id=%s'
            % (cate['catalog_id'], cate['group_id'])
        )
        for i in xrange(1, load_page_count_per_api + 1):
            request = Request('%s&page=%d' % (item_list_interface_url, i))
            request.meta['cate'] = cate
            yield request
def parseLineType(self, response):
    hxs = HtmlXPathSelector(response)
    lineType = hxs.select('//*[@id="pagebar"]/h1/text()').extract()[0].strip()
    self.log('Processing %s...' % lineType, level=log.DEBUG)
    items = hxs.select('//*[@id="tbl_fahrplaene"]/tbody/tr/td[2]/a')
    for item in items:
        url = urljoin(response.url, item.select('./@href').extract()[0])
        req = Request(url, callback=self.parseFahrplan)
        req.meta['lineType'] = lineType
        req.meta['lineName'] = item.select('./text()').extract()[0].strip()
        self.log("Following URL %s" % url, level=log.DEBUG)
        yield req
def parse_surgery(self, response):
    print response.url
    surgery_item = SurgeryItem()
    surgery_item['url'] = response.url
    surgery_item['name'] = response.xpath('//div[@class="w_n"]/h3/text()').extract()[0]
    surgery_item['summary'] = response.xpath('//dd[@class="w_d3"]/text()').extract()[0]
    # Go on parsing the detail pages
    _next = response.xpath('//div[@class="w_n"]/div[@class="w_na clears"]/a[@class="hover"]/following-sibling::a[not(@class="w_la")][1]/@href').extract()
    next_detail_url = urljoin(response.url, _next[0])
    request = Request(url=next_detail_url, dont_filter=True, callback=self._parse_surgery_detail)
    request.meta['surgery_item'] = surgery_item
    yield request
def parse_schoolIntroUrl(self, response):
    sel = Selector(response)
    item = response.meta["item"]
    schoolIntroUrl = sel.xpath('//div[@class="school_kz fr"]/a/@href').extract()
    link = self.start_urls[0]
    if schoolIntroUrl:
        link = schoolIntroUrl[0]
        request = Request(link, callback=self.parse_items)
        request.meta["item"] = item
        return request
    else:
        return item
def parseDomain(self, response):
    """Second step of Mon/rg parsing (Domains)."""
    node = response.meta["node"]
    self.domains = json.loads(response.body_as_unicode())
    for dom in self.domains:
        link_dom = self.domain + self.domains[dom]
        request = Request(link_dom, callback=self.parseStatements)
        # Pass metadata to the next wave of parsing
        request.meta["node"] = node
        request.meta["domain"] = dom
        yield request
    return
def snapdeal_scraper(self, response):
    item = response.meta['item']
    sel = Selector(response)
    item['Snapdeal_URL'] = response.url
    try:
        if sel.xpath("//div[@class='notifyMe-soldout']"):
            ProductName = sel.xpath("//h1[@itemprop='name']/text()").extract()[0].replace(",", "")
            item['Snapdeal__ProductName'] = ProductName
            item['Snapdeal_MRP'] = item['Snapdeal_SP'] = ''
            item['Snapdeal_Stock'] = 'Out of Stock'
        else:
            mrp = sel.xpath("//span[@id='original-price-id']/text()").extract()
            if mrp:
                item['Snapdeal_SP'] = sel.xpath("//span[@id='selling-price-id']/text()").extract()[0]
                item['Snapdeal_MRP'] = sel.xpath("//span[@id='original-price-id']/text()").extract()[0]
            else:
                item['Snapdeal_MRP'] = sel.xpath("//span[@id='selling-price-id']/text()").extract()[0]
                item['Snapdeal_SP'] = ''
            item['Snapdeal__ProductName'] = sel.xpath("//h1[@itemprop='name']/text()").extract()[0].replace(",", "")
            stock = sel.xpath("//div[@class='notifyMe-soldout']").extract()
            discntnd = sel.xpath("//div[@class='noLongerProduct']").extract()
            if stock or discntnd:
                item['Snapdeal_Stock'] = "Out Of Stock"
            else:
                item['Snapdeal_Stock'] = "In Stock"
    except:
        item['Snapdeal__ProductName'] = item['Snapdeal_MRP'] = item['Snapdeal_SP'] = ''
        item['Snapdeal_Stock'] = 'Not Found'
    try:
        amazon_url = amazon_urls[item['index']]
        request = Request(amazon_url, headers={'Referer': 'http://amazon.in'},
                          callback=self.amazon_scraper)
        request.meta['item'] = item
        request.meta['proxy'] = "http://111.161.126.100:80"
        yield request
    except:
        try:
            flipkart_url = flipkart_urls[item['index']]
            request = Request(flipkart_url, callback=self.flipkart_scraper)
            request.meta['item'] = item
            # request.meta['proxy'] = "http://111.161.126.100:80"
            yield request
        except:
            try:
                paytm_url = paytm_urls[item['index']]
                request = Request(paytm_url, callback=self.paytm_scraper)
                request.meta['item'] = item
                request.meta['proxy'] = "http://111.161.126.100:80"
                yield request
            except:
                self.to_csv(item)
def parse_jsp(self, response):
    # Extract the country data from the jsp file
    country_data = []
    log.msg("Looking for all the countries flags", level=log.INFO)
    for data in re.findall(r'countryCodeArray\[\d{1,3}\]="(.+)"', response.body):
        country_data.append(data.split("|"))
    log.msg("Found {} countries".format(len(country_data)))
    request = Request(self.start_urls[0], callback=self.parse_countries_page, dont_filter=True)
    request.meta['country_data'] = country_data
    yield request
def parse_api_data(self, response):
    # Crawl loop: each pass is equivalent to scrolling to the bottom and
    # loading more posts. The number of collected posts should stay below
    # max_posts_num, and the number of loop iterations is also capped.
    loop_times = response.meta.get("loop_times", 1)
    max_posts_num = response.meta.get("max_posts_num", 100)
    range = response.meta.get("range", 60)
    fb_api_req_access_token = response.meta.get("fb_api_req_access_token", None)
    request_tail = response.meta.get("request_tail", None)
    data_header = response.meta.get("data_header", None)
    # The maximum number of "scroll to the bottom" iterations
    max_loop_times = 10
    if loop_times <= max_loop_times and len(self.post_ids) <= max_posts_num:
        loop_times += 1
        api_code = response.body.replace('for (;;);', '')
        request_url, data_header = self.structure_api_request(
            api_code, request_tail=request_tail, data_header=data_header)
        if len(request_url) == 0:
            # No more request url
            return self.structure_fbapi_request_url(fb_api_req_access_token)
        elif request_url == "error":
            # Request error
            self.logger.info("request url: %s search post error." % response.url)
            return self.structure_fbapi_request_url(fb_api_req_access_token)
        else:
            # Request url is normal; post_ids contains (post_id, post_time, post_type) tuples
            post_ids = self.parse_post(json.loads(api_code)["payload"], range)
            if post_ids:
                # post_ids lists all the post ids that need to be crawled
                self.post_ids.extend(post_ids)
                return Request(
                    url=request_url,
                    callback=self.parse_api_data,
                    priority=1,
                    dont_filter=False,
                    meta={
                        "loop_times": loop_times,
                        "request_tail": request_tail,
                        "max_posts_num": max_posts_num,
                        "data_header": data_header,
                        "fb_api_req_access_token": fb_api_req_access_token,
                    }
                )
            else:
                # No posts in the wanted time range; stop looping here
                return self.structure_fbapi_request_url(fb_api_req_access_token)
def parse(self, response):
    # Extract SIGN from the response body
    sign = re.search("var SIGN = \'(.+)\';", response.text).group(1)
    items = response.xpath('//tbody/tr')
    # If the current page has no entries, move on to the next page
    if not len(items):
        # Check whether there is a next page
        nextpage = response.xpath(
            '//div[@class="vd_page"]/a[@class="vd_bt_v2 vd_page_btn"]/span["下一页"]/text()'
        )
        print('next page', nextpage, len(nextpage))
        if len(nextpage):
            yield self.my_process_next(response)
    for item in items:
        info = item.xpath('.//th/span/a[1]/@data-info').extract_first()
        print(info)
        info = json.loads(info)
        # If the entry is a directory
        if info['is_dir']:
            # By default, only crawl directories on the first page
            if self.page['/'] <= self.dirpage:
                yield self.process_dir(info, response)
            continue
        href = self.get_down_info.format(link=info['copy_ref'], sign=sign,
                                         time=int(round(time.time() * 1000)))
        yield Request(href,
                      meta={
                          'cookiejar': response.meta['cookiejar'],
                          'filepath': response.meta['filepath']
                      },
                      callback=self.next)
    # Check whether there is a next page
    nextpage = response.xpath(
        '//div[@class="vd_page"]/a[@class="vd_bt_v2 vd_page_btn"]/span["下一页"]/text()'
    )
    print('next page', nextpage, len(nextpage))
    if len(nextpage):
        yield self.my_process_next(response)
def follow(self, url, callback=None, method='GET', headers=None, body=None,
           cookies=None, meta=None, encoding='utf-8', priority=0,
           dont_filter=False, errback=None, cb_kwargs=None, flags=None):
    # type: (...) -> Request
    """
    Return a :class:`~.Request` instance to follow a link ``url``.
    It accepts the same arguments as the ``Request.__init__`` method,
    but ``url`` can be a relative URL or a ``scrapy.link.Link`` object,
    not only an absolute URL.

    :class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
    method which supports selectors in addition to absolute/relative URLs
    and Link objects.

    .. versionadded:: 2.0
       The *flags* parameter.
    """
    if isinstance(url, Link):
        url = url.url
    elif url is None:
        raise ValueError("url can't be None")
    url = self.urljoin(url)

    return Request(
        url=url,
        callback=callback,
        method=method,
        headers=headers,
        body=body,
        cookies=cookies,
        meta=meta,
        encoding=encoding,
        priority=priority,
        dont_filter=dont_filter,
        errback=errback,
        cb_kwargs=cb_kwargs,
        flags=flags,
    )
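# A minimal usage sketch (not part of the original code) showing how the
# follow() helper above is typically called from a spider callback. The
# spider name, start URL and CSS selector are illustrative assumptions.
import scrapy


class FollowExampleSpider(scrapy.Spider):
    name = "follow_example"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        # follow() accepts relative URLs (and Link objects), so there is no
        # need to call urljoin() by hand before building the Request.
        for href in response.css("li.next a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)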
def start_requests(self):
    base_url = 'http://yanbao.stock.hexun.com/xgq/%s.aspx'
    page_section_dict = {
        'gsyj': u"公司研究",   # company research
        'hyyj': u"行业研究",   # industry research
        'yjyc': u"业绩预测",   # earnings forecasts
        'qsch': u"券商晨会",   # broker morning meetings
        'clbg': u"策略报告",   # strategy reports
    }
    for section_short_name in page_section_dict:
        url = base_url % section_short_name
        yield Request(
            url=url,
            meta={'section': page_section_dict[section_short_name]},
            callback=self.parse_index_page_item)
def parse(self, response):
    self.logger.info("------------ response 4 start")
    # Each follow-up request carries an "expire" timestamp two seconds ahead
    for wd in (2, 3, 4):
        yield Request(url='https://www.baidu.com/s?wd=%d' % wd,
                      callback=self.parse_e,
                      meta={
                          "expire": datetime.datetime.now() + datetime.timedelta(seconds=2)
                      })
def deal_with_pager(self, item, pager, cur_pager, url):
    if pager is None or pager == cur_pager:
        return item
    else:
        if cur_pager == 1:
            url_pattern = r'(.*)\.shtml'
        else:
            url_pattern = r'(.*)-[\d]+\.shtml'
        m = re.match(url_pattern, url)
        url = m.group(1) + '-%d.shtml' % cur_pager
        return Request(url, meta={'item': item, 'is_page': True},
                       callback=self.parse_news_item)
def parse(self, response):
    self.page += 1
    # All articles on the page
    articles = response.xpath('//*[@class="list"]/ul/li/div[@class="box"]')
    for article in articles:
        url = article.xpath('.//div[@class="word"]/h3/a/@href').extract_first()
        title = ''.join(article.xpath('.//div[@class="word"]/h3/a/text()').re(r'\w'))
        summary = ''.join(article.xpath('.//div[@class="des"]/text()').re(r'\w'))
        author = ''.join(article.xpath('.//div[@class="msg clr"]/a/text()[2]').re(r'\w'))
        tag = article.xpath('.//div[@class="tags"]/a/text()').extract()
        title_img = article.xpath('.//div[@class="img"]/a[2]/img/@src').extract_first()
        leiphone_item = leiphoneItem(url=url, title=title, author=author,
                                     title_img=title_img, tag=tag, summary=summary)
        # Request the article body and stash the item in meta; meta is propagated
        # through redirects and retries, so the original value stays available.
        request = Request(url=url, callback=self.parse_body)
        request.meta['item'] = leiphone_item
        yield request
    # Pagination: follow the next page if there is one
    next_page = response.xpath('.//a[@class="next"]/@href').extract_first()
    if next_page:
        # The callback specifies which method parses the response of this request
        yield Request(url=next_page, callback=self.parse)
def parse(self, response):
    sel = Selector(response)
    sites = sel.css('td[class="txt-container"] strong')
    items = []
    for site in sites:
        '''item = Mamba()
        item["user"] = site.xpath("a/text()").extract()
        item["number"] = self.i
        items.append(item)
        self.i = self.i + 1'''
        user_url = "http://m.mamba.ru" + site.xpath("a/@href").extract()[0]
        # Append the request so it is actually scheduled when the list is returned
        items.append(Request(user_url, self.get_user_info))
    return items
def start_requests(self):
    source = '国研网'
    source_url = 'http://www.drcnet.com.cn/'
    for url in self.countrysUrls:
        country = url.split("=")[1]
        for (k, v) in self.subjects.items():
            search_url = url + "&uid=" + v
            meta = {
                'source': source,
                'source_url': source_url,
                'search_url': search_url,
                'subject': k,
                'subject country': country
            }
            yield Request(search_url, self.parseUsefulUrl, meta=meta)
def start_requests(self):
    allowed_domains = ['www.huajiao.com/']
    urls = ['1', '2', '3', '5', '999', '1000', '1001']
    for url in urls:
        newUrl = 'http://www.huajiao.com/category/' + url
        request = Request(
            url=newUrl,
            callback=self.filterPages,
            headers={
                'Referral': 'http://www.huajiao.com/',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
            })
        yield request
def parse(self, response):
    detail_page_list = re.findall('document.write.*?a href="(.*?\.html)">', response.text)
    if detail_page_list and isinstance(detail_page_list, list):
        detail_page_list = list(set(detail_page_list))
        self.logger1.log_more('Current url: {}, detail length:{}'.format(
            response.request.url, len(detail_page_list)))
        for detail_url in detail_page_list:
            final_detail_url = response.urljoin(detail_url)
            yield Request(url=final_detail_url, callback=self.parse_detail)
    page_info_list = re.findall('setPage\(.*?(\d+),(\d+),(\d+)', response.text)
    if page_info_list:
        page_info_list = page_info_list[0]
        if len(page_info_list) >= 3:
            total_page = int(page_info_list[0])
            cur_page = int(page_info_list[2])
            if cur_page < total_page:
                next_page_url = self.next_page_tpl.format(cur_page + 1)
                next_page_url = response.urljoin(next_page_url)
                yield Request(url=next_page_url, callback=self.parse, dont_filter=True)
def next_request(self):
    """Return a request to be scheduled, or None."""
    use_set = self.settings.getbool('REDIS_SET')
    if use_set:
        url = self.server.spop(self.redis_key)
    else:
        url = self.server.lpop(self.redis_key)
    if url:
        t = pickle.loads(url)
        print t['link_hash']
        print t['product_code']
        return Request(t['url'],
                       cookies=eval(t['cookies']),
                       meta={'product_code': t['product_code'],
                             'link_hash': t['link_hash']},
                       dont_filter=True)
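# Hypothetical producer-side sketch (not part of the original code) showing the
# payload shape next_request() above expects to pop from Redis: a pickled dict
# with 'url', 'cookies', 'product_code' and 'link_hash' keys. The Redis key
# name and all field values here are illustrative assumptions.
import pickle

import redis

server = redis.StrictRedis(host="localhost", port=6379)
payload = pickle.dumps({
    "url": "http://example.com/product/123",
    "cookies": "{'session': 'abc'}",  # stored as a string; the scheduler eval()s it
    "product_code": "123",
    "link_hash": "deadbeef",
})
# rpush feeds the lpop() branch; use sadd instead when REDIS_SET is enabled.
server.rpush("myspider:start_urls", payload)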
def start_requests(self): url = "https://kyfw.12306.cn/otn/queryTrainInfo/getTrainName?" t = (datetime.datetime.now() + datetime.timedelta(days=3)).strftime("%Y-%m-%d") params = {"date": t} s_url = url + urllib.parse.urlencode(params) self.logger.debug("start url " + s_url) yield Request(s_url, callback=self.parse, meta={ "t": t, "turn": self.turn })
def start_requests(self):
    # Debugging:
    # meta = {"journal_url": "url"}
    # url = "http://www.satnt.ac.za/index.php/satnt/article/view/686"
    # yield Request(url, self.crawl_issue_info, meta=meta, dont_filter=True)
    # return
    with open(self.url_file, "rb") as f:
        for line in f:
            meta = {"journal_url": line.strip()}
            journal_issue_url = "%s/issue/archive/" % line.strip()
            yield Request(journal_issue_url, self.crawl_homepage, meta=meta, dont_filter=True)
def parse(self, response):
    cookies = CookieJar()
    cookies.extract_cookies(response, response.request)
    self.cookie_dict = cookies._cookies
    yield Request(
        url='http://dig.chouti.com/login',
        method='POST',
        headers={
            'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8"
        },
        body='phone=8618279816872&password=18279816872&oneMonth:1',
        cookies=self.cookie_dict,
        callback=self.check_login)
def get_request_object(self, params):
    """Build a Request object from a params dict."""
    formdata = params.get('formdata', {})
    if formdata:
        if isinstance(formdata, dict):
            return FormRequest(**params)
        else:
            s = json.dumps(formdata, ensure_ascii=False)
            log.warning("formdata {} is malformed; cannot build a FormRequest".format(s))
            return None
    else:
        temp_params = copy.deepcopy(params)
        if 'formdata' in temp_params:
            del temp_params['formdata']
        return Request(**temp_params)
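# Illustrative call (not part of the original code): the dict keys below mirror
# the standard Request/FormRequest keyword arguments; the URL, form fields and
# callback are made-up assumptions.
def build_search_request(self):
    params = {
        "url": "http://example.com/search",
        "formdata": {"q": "scrapy", "page": "1"},  # dict -> a FormRequest is returned
        "callback": self.parse,
    }
    # With 'formdata' empty or missing, the same params yield a plain Request.
    return self.get_request_object(params)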
def parse(self, response):
    # --- get the total number of companies
    max_page = ComSpider.count
    # --- loop over the pages, one per stock code
    for page in range(0, max_page + 1):
        # --- get the stock code
        code = ComSpider.Comdata[page]
        # --- call the scrape callback for each code
        yield Request(url=add_or_replace_parameter(self.single_url, 'securityCode', code),
                      callback=self.scrape,
                      meta={'code': code})
def getUserId(self, response):
    bsObj = BeautifulSoup(response.text, 'html.parser')
    user = bsObj.find_all("a", href=re.compile("^(/user/[0-9]+)"))[0]
    if 'href' in user.attrs:
        newPage = 'http://www.huajiao.com' + user.attrs['href']
        request = Request(
            url=newPage,
            callback=self.parseUserStat,
            headers={
                'Referral': 'http://www.huajiao.com/',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
            })
        yield request
def after_login(self, response):
    # Check that login succeeded before going on. The original condition
    # or-ed the two message strings together, which only ever tested the
    # first one; each message must be checked against the body separately.
    login_failed = (
        "ERROR: Invalid username" in response.body
        or "The username/password combination you have entered is invalid" in response.body
        or response.url == self.start_urls[0]
    )
    if login_failed:
        self.log("Login failed", level=log.ERROR)
        return
    # Continue scraping with the authenticated session...
    self.log("Login succeeded!", level=log.DEBUG)
    print response.url
    print "response end!!" + response.url
    return Request(url=response.url, callback=self.parse1)
def parse(self, response):
    print(response)
    req = Request(
        url='https://dig.chouti.com/login',
        method='POST',
        headers={
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'referer': 'https://dig.chouti.com/'
        },
        body='phone=************&password=************&oneMonth=1',
        meta={"cookiejar": True},
        callback=self.check_login,
    )
    yield req
def parse3(self, response):
    """Upvote every link on the page."""
    hxs = Selector(response)
    linkid_list = hxs.xpath('//div[@class="news-pic"]/img/@lang').extract()
    print(linkid_list)
    for link_id in linkid_list:
        # Upvote each link id
        base_url = "https://dig.chouti.com/link/vote?linksId={0}".format(link_id)
        yield Request(url=base_url, method='POST', cookies=self.cookie_jar,
                      callback=self.parse4)
    # Find all pagination pages, e.g. https://dig.chouti.com/all/hot/recent/2
    page_list = hxs.xpath('//a[@class="ct_pagepa"]/@href').extract()
    for page in page_list:
        page_url = "https://dig.chouti.com%s" % page
        yield Request(url=page_url, method='GET', cookies=self.cookie_jar,
                      callback=self.parse3)
def parse_comments(self, response):
    for comments in response.css('.comment-item'):
        username = comments.css('span.comment-info > a::text').extract_first()
        comment = comments.css('span.short::text').extract_first()
        yield {
            'movie': response.meta['movie'],
            'username': username,
            'comment': comment
        }
    next_url = response.css('a.next::attr(href)').extract_first()
    if next_url:
        yield Request(url=response.url[:response.url.find('?')] + next_url,
                      callback=self.parse_comments,
                      meta=response.meta)
def parse_songs(self, response):
    songs_links = response.xpath('//a[contains(@target,"blank")]/text()').extract()
    reggaetonLyricsScrapperItem = response.meta['item']
    if len(songs_links) > 0:
        for song in songs_links:
            reggaetonLyricsScrapperItem = response.meta['item'].copy()
            reggaetonLyricsScrapperItem['name'] = song
            yield Request(url=self.get_lyric_url(reggaetonLyricsScrapperItem['author'], song),
                          meta={'item': reggaetonLyricsScrapperItem},
                          callback=self.parse_lyric)
    return
def parse(self, response):
    self.driver.get(response.url)
    time.sleep(2)
    inputs = self.driver.find_elements_by_xpath("//div[@id='results_nav_by_year']/a")
    links = []
    for i in inputs:
        link = i.get_attribute('href')
        if link is not None:
            links.append(link)
    for link in links:
        yield Request(url=link, callback=self.parse_page)
def parse_subcategory(self, response):
    hxs = HtmlXPathSelector(response)
    for product in hxs.select('//li[contains(@class, "prd_listing_prod")]'):
        product = self._parse_product_el(product, get_base_url(response))
        yield Request(product['url'], callback=self.parse_product,
                      meta={'product': product})
    # Go to page 2
    search_term = urlopen(
        'https://www.buyagift.co.uk/navigation/GetBNNumber?url=%s' % response.url).read()
    if not search_term:
        msg = "[BuyAGift] Error extracting search term from: %s" % response.url
        self.log(msg)
        return
    search_term = 'BN-' + search_term
    page2_url = ("http://www.buyagift.co.uk/navigation/GetPartialRecordsList"
                 "?searchTerm=%(search_term)s&page=%(page_num)s&pageSize=24&sortTerm=SalesRank&")
    meta = {'search_term': search_term, 'page_num': 2}
    page2_url = page2_url % meta
    yield Request(page2_url, callback=self.parse_pages, meta=meta)
def turn_to_next_page(self, response):
    this_func_name = sys._getframe().f_code.co_name
    self.logger.debug("%s(): current page\t%s" % (this_func_name, response.url))
    sel = Selector(response)
    next_page_list = sel.xpath(
        u'//div[@class="pagination"]/a[@class="next_page" and @rel="next" and text()="下一页 ›"]/@href'
    ).extract()
    if len(next_page_list) == 0:
        return
    link = self.base_url + next_page_list[0]
    self.logger.debug("%s(): next page\t\t%s" % (this_func_name, link))
    return Request(url=link, meta=response.meta, callback=self.parse_proxy_list)
def parse(self, response):
    exhibits = response.css('main.site-main > div.row > #isotope-container > div')
    for exhibit in exhibits:
        url = exhibit.css('.mb-image > a::attr(href)').get()
        title = exhibit.css('.mb-image > a::attr(title)').get()
        date = exhibit.css('.date.details::text').get()
        image_link = exhibit.css('.mb-image > a > img::attr(src)').get()
        yield Request(url=url,
                      callback=self.parse_exhibit,
                      meta={
                          'title': title,
                          'date': date,
                          'image_link': image_link
                      })
def crawl_issue(self, response):
    issues = response.xpath(".//a[@class='green issueTitle']")
    for issue in issues:
        url = urlparse.urljoin(response.url, issue.xpath("./@href").extract_first())
        meta = {"journal_url": response.meta["journal_url"]}
        yield Request(url, self.crawl_issue_info, meta=meta)
    if response.meta["is_first"]:
        identifier = response.xpath(
            "//h1[@class='issue_title_identifier']/text()").extract_first()
        total_issue_num = Utils.regex_extract(identifier, ".*-(\d+) issues.*")
        total_issue_num = int(total_issue_num)
        total_page = total_issue_num / 12 + 1
        for i in range(1, total_page):
            next_page_url = "%s&p=%d" % (response.url, i)
            meta = {
                "journal_url": response.meta["journal_url"],
                "is_first": False
            }
            yield Request(next_page_url, self.crawl_issue, meta=meta)
def parse_item_requests_callback(self, response, item_xpath_selector=''):
    requests = []
    for job_item in response.xpath(item_xpath_selector):
        job_crawler_item = JobItem()
        self.populate_job_crawler_item(job_item, job_crawler_item)
        if self.should_load_details(job_crawler_item):
            requests.append(
                Request(url=job_crawler_item.job_details_link,
                        callback=self.retrieve_job_details,
                        meta={'item': job_crawler_item},
                        dont_filter=True))
    return requests
def parse(self, response):
    try:
        content = json.loads(response.body.decode())
        page_count = content.get('total', 1)
        for page in range(1, page_count + 1):
            url = self.page_url.format(page)
            yield Request(url, callback=self.parse_link,
                          errback=self.error_parse, dont_filter=True)
    except:
        err_msg = traceback.format_exc()
        self.logger1.warning(
            "Exception occurred while getting the page count [{url}], error: {err_msg}"
            .format(url=response.url, err_msg=err_msg))