def parse(self, response): """ Default callback used by Scrapy to process download response Testing contracts: @url http://www.livingsocial.com/cities/15-san-francisco @returns items 1 @scrapes title link :param response: :return: """ selector = HtmlXPathSelector(response) # iterate over deals for deal in selector.select(self.deals_list_xpath): loader = XPathItemLoader(LivingSocialDeal(), selector=deal) # define processors loader.default_input_processor = TakeFirst() loader.default_input_processor = MapCompose(unicode.strip) loader.default_input_processor = Join() loader.defalut_output_processor = TakeFirst() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) yield loader.load_item()
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses """ # with open('polydata/'+response.url.split('=')[1], 'wb') as f: # f.write(response.body) # scraped_url_list = list() selector = HtmlXPathSelector(response) # iterate over deals for deal in selector.select(self.products_list_xpath): loader = XPathItemLoader(PolyvoreData(), selector=deal) # define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) # adding the request URL to the loader loader.add_value("requestURL", unicode(response.request.url, "utf-8")) # scraped_url_list.append(loader.load_item()['requestURL']) for item in deal.xpath('//*[@id="content"]/ul[1]/li'): ll = XPathItemLoader(PolyvoreData(), selector=item) # define processors ll.default_input_processor = MapCompose(unicode.strip) ll.default_output_processor = Join() for field, xpath in self.item_items.iteritems(): ll.add_xpath(field, xpath) ll.add_value("requestURL", loader.load_item()['requestURL']) ll.add_value("name", loader.load_item()['name']) ll.add_value("numlikes", loader.load_item()['numlikes']) yield ll.load_item() for item in deal.xpath('//*[@id="content"]/ul[2]/li'): ll = XPathItemLoader(PolyvoreData(), selector=item) # define processors ll.default_input_processor = MapCompose(unicode.strip) ll.default_output_processor = Join() for field, xpath in self.item_items.iteritems(): ll.add_xpath(field, xpath) ll.add_value("requestURL", loader.load_item()['requestURL']) ll.add_value("name", loader.load_item()['name']) ll.add_value("numlikes", loader.load_item()['numlikes']) yield ll.load_item()
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses """ # with open('polydata/'+response.url.split('=')[1], 'wb') as f: # f.write(response.body) # scraped_url_list = list() selector = HtmlXPathSelector(response) # iterate over deals for deal in selector.select(self.products_list_xpath): loader = XPathItemLoader(PolyvoreData(), selector=deal) # define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) # adding the request URL to the loader loader.add_value("requestURL",unicode(response.request.url, "utf-8")) # scraped_url_list.append(loader.load_item()['requestURL']) for item in deal.xpath('//*[@id="content"]/ul[1]/li'): ll = XPathItemLoader(PolyvoreData(), selector=item) # define processors ll.default_input_processor = MapCompose(unicode.strip) ll.default_output_processor = Join() for field, xpath in self.item_items.iteritems(): ll.add_xpath(field, xpath) ll.add_value("requestURL", loader.load_item()['requestURL']) ll.add_value("name", loader.load_item()['name']) ll.add_value("numlikes", loader.load_item()['numlikes']) yield ll.load_item() for item in deal.xpath('//*[@id="content"]/ul[2]/li'): ll = XPathItemLoader(PolyvoreData(), selector=item) # define processors ll.default_input_processor = MapCompose(unicode.strip) ll.default_output_processor = Join() for field, xpath in self.item_items.iteritems(): ll.add_xpath(field, xpath) ll.add_value("requestURL", loader.load_item()['requestURL']) ll.add_value("name", loader.load_item()['name']) ll.add_value("numlikes", loader.load_item()['numlikes']) yield ll.load_item()
def parse(self, response):  # actually a method
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    # instantiate HtmlXPathSelector() with the response
    selector = HtmlXPathSelector(response)

    # iterate over deals (there are multiple deals per page)
    for deal in selector.xpath(self.deals_list_xpath):
        # one loader per deal
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

        # define processors: an Item Loader has one input processor and one
        # output processor per item field
        loader.default_input_processor = MapCompose(unicode.strip)  # strip whitespace from unicode strings
        loader.default_output_processor = Join()  # join data with a space

        # iterate over fields and add xpaths to the loader;
        # iteritems() yields the (key, value) pairs of a dict
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)  # add this field's xpath to the loader

        # load_item: grab each item field (link, title, etc.), get its xpath,
        # run the data through the input/output processors, yield the item,
        # then move on to the next deal
        yield loader.load_item()
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses Testing contracts: @url http://www.livingsocial.com/cities/15-san-francisco @returns items 1 @scrapes title link """ # Gives ability to select parts of response defined in deals_list_xpath selector = HtmlXPathSelector(response) # Iterate through found deals for deal in selector.xpath(self.deals_list_xpath): # Loads data into item fields defined in items.py loader = XPathItemLoader(LivingSocialDeal(), selector=deal) # Define processors for clean up and joining elements loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # Iterate over item_fields dict and add xpaths to loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) yield loader.load_item()
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses """ selector = HtmlXPathSelector(response) details=urlparse(response.request.url) queryStr={x.split('=')[0]:(x.split('=')[1]) for x in details.query.split("&")} print "\n",queryStr['page'] # iterate over deals for deal in selector.select(self.products_list_xpath): loader = XPathItemLoader(JabongData(), selector=deal) # define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) # adding the request URL to the loader loader.add_value("requestURL",unicode(response.request.url, "utf-8")) # adding the category for the request loader.add_value("category",unicode(self.category)) yield loader.load_item()
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses Testing contracts: @url http://www.livingsocial.com/cities/15-san-francisco @returns items 1 @scrapes title link """ selector = HtmlXPathSelector(response) # iterate over deals for deal in selector.xpath(self.deals_list_xpath): loader = XPathItemLoader(LivingSocialDeal(), selector=deal) # define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) yield loader.load_item()
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses """ selector = HtmlXPathSelector(response) details=urlparse(response.request.url) queryStr={x.split('=')[0]:(x.split('=')[1]) for x in details.query.split("&")} print "\n",(urllib.unquote(queryStr['p%5B%5D']).split("=")[1]),queryStr['start'] for deal in selector.select(self.deals_list_xpath): loader = XPathItemLoader(flipkartData(), selector=deal) # define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) # adding the request URL to the loader loader.add_value("requestURL",unicode(response.request.url, "utf-8")) # adding the category for the request loader.add_value("category",unicode(self.category)) yield loader.load_item()
def parse(self, response):  # actually a method
    """Default callback used by Scrapy to process a downloaded response"""
    # instantiate HtmlXPathSelector() with the response
    selector = HtmlXPathSelector(response)

    # iterate over content entries (multiple per page)
    for content in selector.xpath(self.content_list_xpath):
        # one loader per entry
        loader = XPathItemLoader(RedditLearnPython(), selector=content)

        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)  # strip whitespace from unicode strings
        loader.default_output_processor = Join()  # join data with a space

        # iterate over fields and add xpaths to the loader;
        # iteritems() yields the (key, value) pairs of a dict
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)  # add this field's xpath to the loader

        # load_item: grab each field's xpath, run the data through the
        # input/output processors, yield the item, then move on
        yield loader.load_item()
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses # Testing contracts: # @url http://www.livingsocial.com/cities/15-san-francisco # @returns items 1 # @scrapes title link """ selector = HtmlXPathSelector(response) # iterate over deals for entry in selector.xpath(self.entries_list_xpath): loader = XPathItemLoader(WGGesuchtEntry(), selector=entry) # define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) yield loader.load_item() cur_index = response.meta.get("cur_index", 1) new_url = re.sub("\d+.html", str(cur_index) + ".html", response.url) print("\n" + str(response.url) + "\n" + new_url + "\n") if cur_index < 59: yield Request(new_url, callback=self.parse, meta={"cur_index": cur_index + 1})
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses Testing contracts: @url http://www.livingsocial.com/cities/15-san-francisco @returns items 1 @scrapes title link """ selector = HtmlXPathSelector(response) for deal in selector.xpath(self.deals_list_xpath): loader = XPathItemLoader(LivingSocialDeal(),selector=deal) # define processors loader.default_input_processor = MapCompose(unicode.strip) # stripe out white-space of unicode strings loader.default_output_processor = Join() # join the data together by a space # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): # iteritems() iterate the (key,value) of items in a dictionary. There are also iterkeys() and itervalues() functions. loader.add_xpath(field, xpath) yield loader.load_item() # yield each other and move on to the next # output as json file: scrapy crawl livingsocial -o items.json
def parse(self, response):
    selector = HtmlXPathSelector(response)

    for link in selector.select(self.links_list_xpath):
        loader = XPathItemLoader(iWatchOnline(), selector=link)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        for field, xpath in self.episodes_field.iteritems():
            loader.add_xpath(field, xpath)

        yield loader.load_item()
def parse(self, response): """"Call back used by Scrapy to download and process response """ selector = HtmlXPathSelector(response) # Go through art statements for statement in selector.select(self.description_xpath): loader = XPathItemLoader(MonetInformation(), selelctor=statement) loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor
def parse(self, response):
    selector = HtmlXPathSelector(response)

    for deal in selector.select(self.deal_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)

        yield loader.load_item()
def parse(self, response):
    selector = HtmlXPathSelector(response)

    for startup in selector.select(self.startup_results_xpath):
        loader = XPathItemLoader(SearchResults(), selector=startup)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)

        yield loader.load_item()
def parse(self, response):
    selector = HtmlXPathSelector(response)

    # looking for deals
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)

        yield loader.load_item()
def myparse(self,response): print "myParse" selector = HtmlXPathSelector(response) # l = selector.select(self.deals_list_xpath) l = selector.select('//div[@id="detailed"]') ll = l.select('.//div[@class="title4"]/a/text()').extract() open(ll[0].strip()+'.html','wb').write(response.body) print ll[0].strip() for deal in l: #loader = XPathItemLoader(LivingSocialDeal(),selector=deal) loader = XPathItemLoader(MoviesClass() , selector=deal) loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() loader.default_output_processor = TakeFirst() for field,xpath in self.mov_fields.iteritems(): loader.add_xpath(field,xpath) x = deal.select(field).extract() yield loader.load_item()
def parse(self, response): """Get response from start_urls""" selector = HtmlXPathSelector(response) for deal in selector.xpath(self.xpath_for_deals): loader = XPathItemLoader(LivingSocial(), selector=deal) # define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath.strip()) yield loader.load_item()
def parse(self, response):
    selector = HtmlXPathSelector(response)

    # iterate over deals
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

        # define processors: remove whitespace on input, join on output
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)

        yield loader.load_item()
def parse(self, response):
    selector = HtmlXPathSelector(response)

    # iterate over data_list
    for data in selector.select(self.data_list):
        loader = XPathItemLoader(TeoniteItem(), selector=data)
        loader.default_input_processor = MapCompose(str.strip)
        loader.default_output_processor = Join()

        # add xpaths to the loader
        for field, xpath in self.item_fields.items():
            loader.add_xpath(field, xpath)

        yield loader.load_item()

    # follow pagination links
    for nextp in selector.select(self.next_page):
        yield response.follow(nextp, callback=self.parse)
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses """ selector = HtmlXPathSelector(response) #iterate over events for event in selector.select(self.events_list_xpath): loader = XPathItemLoader(CrunchBaseEvent(), selector=event) #define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() #iterate over fields and add xpaths to the loader. for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) yield loader.load_item()
def parse(self, response): """ # """ selector = HtmlXPathSelector(response) # iterate over tickets for ticket in selector.select(self.tickets_list_xpath): loader = XPathItemLoader(ComparatorItem(), selector=ticket) # define loader loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader loader.add_xpath('eventname' , './/span[@class="summary listingEventName"]/text()') loader.add_xpath('eventlocation' , './/div[@class="divVenue location"]/text()') loader.add_xpath('ticketslink' , './/a[@class="divEventDetails url"]/@href') print "Here is ticket link \n" + loader.get_output_value("ticketslink") ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketslink") ticketsURL = urljoin(response.url, ticketsURL) yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price, dont_filter = True)
def parse(self, response): """ Default callback used by Scrapy to process downloaded responses """ #Instantiate a HtmlXPathSelector selector = HtmlXPathSelector(response) #Iterate over reviews for review in selector.select(self.reviews_list_xpath): loader = XPathItemLoader(YelpReview(), selector=review) #Define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() #Iterate over fields and add xpaths to the loader for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) yield loader.load_item()
def myparse(self, response): print "myParse" selector = HtmlXPathSelector(response) # l = selector.select(self.deals_list_xpath) l = selector.select('//div[@id="detailed"]') ll = l.select('.//div[@class="title4"]/a/text()').extract() open(ll[0].strip() + '.html', 'wb').write(response.body) print ll[0].strip() for deal in l: #loader = XPathItemLoader(LivingSocialDeal(),selector=deal) loader = XPathItemLoader(MoviesClass(), selector=deal) loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() loader.default_output_processor = TakeFirst() for field, xpath in self.mov_fields.iteritems(): loader.add_xpath(field, xpath) x = deal.select(field).extract() yield loader.load_item()
def parse(self, response): """ # """ selector = HtmlXPathSelector(response) # iterate over tickets for ticket in selector.select(self.tickets_list_xpath): loader = XPathItemLoader(ComparatorItem(), selector=ticket) # define loader loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader loader.add_xpath('eventname' , './/*[@class="productionsEvent"]/text()') loader.add_xpath('eventlocation' , './/*[@class = "productionsVenue"]/span[@itemprop = "name"]/text()') loader.add_xpath('ticketslink' , './/*/a[@class = "btn btn-primary"]/@href') loader.add_xpath('eventdate' , './/*[@class = "productionsDate"]/text()') loader.add_xpath('eventcity' , './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressLocality"]/text()') loader.add_xpath('eventstate' , './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressRegion"]/text()') loader.add_xpath('eventtime' , './/*[@class = "productionsTime"]/text()') print "Here is ticket link \n" + loader.get_output_value("ticketslink") ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketslink") ticketsURL = urljoin(response.url, ticketsURL) yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price, dont_filter = True)
def parse(self, response): """ Default CB used by Scrapy to process download responses Testing Contracts: @url 'https://angel.co/companies?teches[]=Big+Data' @returns items 1 @scrapes title link """ #get response after making request to some site (data passed to our cb) selector = HtmlXPathSelector(response) for company in selector.select(self.companies_list_xpath): loader = XPathItemLoader(AngelJobs(), selector=company) loader.default_input_processor = MapCompose(unicode.strip) loader.deffault_input_processor = Join() for field, xpath in self.item_fields.iteritems(): loader.add_xpath(field, xpath) yield loader.load_items()