Example #1
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses

        """
        # with open('polydata/'+response.url.split('=')[1], 'wb') as f:
        #     f.write(response.body)
        # scraped_url_list = list()

        selector = HtmlXPathSelector(response)

        # iterate over deals
        for deal in selector.select(self.products_list_xpath):
            loader = XPathItemLoader(PolyvoreData(), selector=deal)
            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)

            # adding the request URL to the loader
            loader.add_value("requestURL",
                             unicode(response.request.url, "utf-8"))

            # scraped_url_list.append(loader.load_item()['requestURL'])

            # note: this absolute '//' xpath searches the whole document, not
            # just the current deal (the ul[2] loop below behaves the same way)
            for item in deal.xpath('//*[@id="content"]/ul[1]/li'):
                ll = XPathItemLoader(PolyvoreData(), selector=item)
                # define processors
                ll.default_input_processor = MapCompose(unicode.strip)
                ll.default_output_processor = Join()

                for field, xpath in self.item_items.iteritems():
                    ll.add_xpath(field, xpath)

                ll.add_value("requestURL", loader.load_item()['requestURL'])
                ll.add_value("name", loader.load_item()['name'])
                ll.add_value("numlikes", loader.load_item()['numlikes'])
                yield ll.load_item()

            for item in deal.xpath('//*[@id="content"]/ul[2]/li'):
                ll = XPathItemLoader(PolyvoreData(), selector=item)
                # define processors
                ll.default_input_processor = MapCompose(unicode.strip)
                ll.default_output_processor = Join()

                for field, xpath in self.item_items.iteritems():
                    ll.add_xpath(field, xpath)

                ll.add_value("requestURL", loader.load_item()['requestURL'])
                ll.add_value("name", loader.load_item()['name'])
                ll.add_value("numlikes", loader.load_item()['numlikes'])
                yield ll.load_item()
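
The snippets on this page target the old Scrapy 0.x API (HtmlXPathSelector, XPathItemLoader) and Python 2 idioms (unicode, iteritems), all long since removed. For reference, a minimal sketch of the same select/load/yield pattern against current Scrapy; PolyvoreData, products_list_xpath and item_fields are carried over from the example above, and the itemloaders import path assumes Scrapy 2.2+ (older versions expose the same processors under scrapy.loader.processors):

from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, Join

# inside the spider class:
    def parse(self, response):
        for deal in response.xpath(self.products_list_xpath):
            # ItemLoader replaces XPathItemLoader and wraps a selector directly
            loader = ItemLoader(item=PolyvoreData(), selector=deal)
            loader.default_input_processor = MapCompose(str.strip)
            loader.default_output_processor = Join()
            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)
            loader.add_value("requestURL", response.request.url)
            yield loader.load_item()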
Example #3
    def myparse(self, response):
        print "myParse"
        selector = HtmlXPathSelector(response)
        # l = selector.select(self.deals_list_xpath)
        l = selector.select('//div[@id="detailed"]')
        ll = l.select('.//div[@class="title4"]/a/text()').extract()
        # write the raw page to disk, named after the first title found
        with open(ll[0].strip() + '.html', 'wb') as f:
            f.write(response.body)
        print ll[0].strip()
        for deal in l:
            # loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
            loader = XPathItemLoader(MoviesClass(), selector=deal)
            loader.default_input_processor = MapCompose(unicode.strip)
            # TakeFirst() keeps only the first extracted value per field
            loader.default_output_processor = TakeFirst()

            for field, xpath in self.mov_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
Example #4
    def parse(self, response):  # actually a method
        """
        Default callback used by Scrapy to process downloaded responses

        """
        # instantiate HtmlXPathSelector() w/ the response parameter
        selector = HtmlXPathSelector(response)

        # iterate over deals -- multiple deals per page
        for content in selector.xpath(self.content_list_xpath):
            # one loader per deal
            loader = XPathItemLoader(RedditLearnPython(), selector=content)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)  # strip whitespace from unicode strings
            loader.default_output_processor = Join()  # join data by a space

            # iteritems() iterates over the (key, value) pairs of a dict
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)  # add each field's xpath to the loader
            # load_item() populates each field (link, title, etc.) through the
            # input/output processors; yield the item, then move on to the next deal
            yield loader.load_item()
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses

        """
        selector = HtmlXPathSelector(response)


        details = urlparse(response.request.url)
        queryStr = {x.split('=')[0]: x.split('=')[1] for x in details.query.split("&")}
        print "\n", urllib.unquote(queryStr['p%5B%5D']).split("=")[1], queryStr['start']

        
        for deal in selector.select(self.deals_list_xpath):
            loader = XPathItemLoader(flipkartData(), selector=deal)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)

            # adding the request URL to the loader
            loader.add_value("requestURL", unicode(response.request.url, "utf-8"))

            # adding the category for the request
            loader.add_value("category", unicode(self.category))

            yield loader.load_item()
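
The manual split on '=' above breaks on query parameters that lack a value. A sketch of the same extraction with the standard library's parse_qs (from the urlparse module on Python 2, matching the example); note that parse_qs percent-decodes key names, so 'p%5B%5D' comes back as 'p[]':

from urlparse import urlparse, parse_qs

def query_params(url):
    # returns the query string as a dict mapping names to lists of values
    return parse_qs(urlparse(url).query)

# usage, mirroring the print above:
# params = query_params(response.request.url)
# print params.get('p[]'), params.get('start')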
    def parse_article(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        
        selector = Selector(response)
        loader = XPathItemLoader(LeMondeArt(), selector=selector)
        
        self.log('\n\nA response from %s just arrived!' % response.url)
        
        # define processors
        text_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        
        # Populate the LeMonde Item with the item loader
        for field, xpath in self.article_item_fields.iteritems():
            try:
                loader.add_xpath(field, xpath, text_input_processor)
            except ValueError:
                self.log("XPath %s not found at url %s" % (xpath, response.url))
            
        #loader.add_value("Url",response.url)
        

        yield loader.load_item()
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses

        """

        selector = HtmlXPathSelector(response)


        details = urlparse(response.request.url)
        queryStr = {x.split('=')[0]: x.split('=')[1] for x in details.query.split("&")}
        print "\n", queryStr['page']

        # iterate over deals
        for deal in selector.select(self.products_list_xpath):
            loader = XPathItemLoader(JabongData(), selector=deal)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)

            # adding the request URL to the loader
            loader.add_value("requestURL", unicode(response.request.url, "utf-8"))

            # adding the category for the request
            loader.add_value("category", unicode(self.category))

            yield loader.load_item()
	def parse(self, response):
		"""
		Default callback used by Scrapy to process downloaded responses
		Testing contracts:
		@url http://www.livingsocial.com/cities/15-san-francisco
		@returns items 1
		@scrapes title link
		"""

		selector = HtmlXPathSelector(response)

		for deal in selector.xpath(self.deals_list_xpath):
			loader = XPathItemLoader(LivingSocialDeal(),selector=deal)

			# define processors
			loader.default_input_processor = MapCompose(unicode.strip)
			# strip out whitespace from unicode strings
			loader.default_output_processor = Join()
			# join the data together with a space

			# iterate over fields and add xpaths to the loader
			for field, xpath in self.item_fields.iteritems(): 
			# iteritems() iterate the (key,value) of items in a dictionary. There are also iterkeys() and itervalues() functions. 
				loader.add_xpath(field, xpath) 
			yield loader.load_item() 
			# yield each item and move on to the next

# output as json file: scrapy crawl livingsocial -o items.json
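
The command in the comment above exports the scraped items as JSON from the command line. Equivalently the feed can be configured in settings.py; a sketch using the FEED_URI/FEED_FORMAT settings from Scrapy versions contemporary with these snippets (the file name is just the one from the comment, not the project's actual config):

# settings.py
FEED_URI = 'items.json'
FEED_FORMAT = 'json'

# then simply: scrapy crawl livingsocial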
	def parse(self, response): # actually a method
		"""
		Default callback used by Scrapy to process downloaded responses
		
		Testing contracts:
		@url http://www.livingsocial.com/cities/15-san-francisco
		@returns items 1
		@scrapes title link
		
		"""
		         
		selector = HtmlXPathSelector(response) # instantiate HtmlXPathSelector() w/ response parameter
		
		# iterate over deals
		for deal in selector.xpath(self.deals_list_xpath): #multiple deals per page
			loader = XPathItemLoader(LivingSocialDeal(), selector=deal) #iterate over each deal
			
			# define processors
			# An Item Loader contains one input processor and one output processor for each (item) field.
			loader.default_input_processor = MapCompose(unicode.strip) #strip out white-space of unicode strings
			loader.default_output_processor = Join() #join data by a space
			
			# iterate over fields and add xpaths to the loader
			for field, xpath in self.item_fields.iteritems(): #iteritems() method allows you to iterate over the (k, v) pairs of a dict
				loader.add_xpath(field, xpath) #add specific field xpath to loader
			yield loader.load_item() # load_item: grabs each item field (link, title, etc), gets xpath, process data
			# w/ input output processor. Yield each item, then move onto next deal
Example #10
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses

        Testing contracts:
        @url http://www.livingsocial.com/cities/15-san-francisco
        @returns items 1
        @scrapes title link
        """
        # Gives ability to select parts of response defined in deals_list_xpath
        selector = HtmlXPathSelector(response)

        # Iterate through found deals
        for deal in selector.xpath(self.deals_list_xpath):
            # Loads data into item fields defined in items.py
            loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

            # Define processors for clean up and joining elements
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # Iterate over item_fields dict and add xpaths to loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
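
None of the snippets on this page show the item definitions the loaders populate. A minimal hypothetical sketch of what the LivingSocialDeal item in items.py might look like, assuming only the title and link fields named in the @scrapes contract above:

from scrapy.item import Item, Field

class LivingSocialDeal(Item):
    # fields filled in by the XPathItemLoader in parse()
    title = Field()
    link = Field()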
Example #11
    def parse_article(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """

        selector = Selector(response)
        loader = XPathItemLoader(LeMondeArt(), selector=selector)

        self.log('\n\nA response from %s just arrived!' % response.url)

        # define processors
        text_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        # Populate the LeMonde Item with the item loader
        for field, xpath in self.article_item_fields.iteritems():
            try:
                loader.add_xpath(field, xpath, text_input_processor)
            except ValueError:
                self.log("XPath %s not found at url %s" %
                         (xpath, response.url))

        #loader.add_value("Url",response.url)

        yield loader.load_item()
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses
        Testing contracts:
        @url http://www.livingsocial.com/cities/15-san-francisco
        @returns items 1
        @scrapes title link
        """
        selector = HtmlXPathSelector(response)

        # iterate over deals
        for deal in selector.xpath(self.deals_list_xpath):
            loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()

    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses

        # Testing contracts:
        # @url http://www.livingsocial.com/cities/15-san-francisco
        # @returns items 1
        # @scrapes title link

        """
        selector = HtmlXPathSelector(response)

        # iterate over deals
        for entry in selector.xpath(self.entries_list_xpath):
            loader = XPathItemLoader(WGGesuchtEntry(), selector=entry)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()

        cur_index = response.meta.get("cur_index", 1)
        # use a raw string and escape the dot so it only matches ".html"
        new_url = re.sub(r"\d+\.html", str(cur_index) + ".html", response.url)

        print("\n" + str(response.url) + "\n" + new_url + "\n")

        if cur_index < 59:
            yield Request(new_url, callback=self.parse, meta={"cur_index": cur_index + 1})
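
Hard-coding 59 pages (as above) is brittle. A sketch of the same pagination driven by a next-page link taken from the page itself; the next_page_xpath attribute is hypothetical, not part of the original spider, and urljoin comes from the urlparse module on Python 2:

        # hypothetical: next_page_xpath = '//a[contains(@class, "next")]/@href'
        next_href = selector.select(self.next_page_xpath).extract()
        if next_href:
            yield Request(urljoin(response.url, next_href[0]), callback=self.parse)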
Example #14
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses

        Testing contracts:
        @url http://www.livingsocial.com/cities/15-san-francisco
        @returns items 1
        @scrapes title link

        """
        selector = HtmlXPathSelector(response)

        # iterate over deals
        for deal in selector.select(self.deals_list_xpath):
            loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
	def parse(self, response):
		selector = HtmlXPathSelector(response)
		for link in selector.select(self.links_list_xpath):
			loader = XPathItemLoader(iWatchOnline(), selector=link)
			loader.default_input_processor = MapCompose(unicode.strip)
			loader.default_output_processor = Join()
			for field, xpath in self.episodes_field.iteritems():
				loader.add_xpath(field,xpath)
			yield loader.load_item()
Example #16
    def myparse(self, response):
        print "myParse"
        selector = HtmlXPathSelector(response)
        # l = selector.select(self.deals_list_xpath)
        l = selector.select('//div[@id="detailed"]')
        ll = l.select('.//div[@class="title4"]/a/text()').extract()
        # write the raw page to disk, named after the first title found
        with open(ll[0].strip() + '.html', 'wb') as f:
            f.write(response.body)
        print ll[0].strip()
        for deal in l:

            #loader = XPathItemLoader(LivingSocialDeal(),selector=deal)
            loader = XPathItemLoader(MoviesClass(), selector=deal)
            loader.default_input_processor = MapCompose(unicode.strip)
            # TakeFirst() keeps only the first extracted value per field
            loader.default_output_processor = TakeFirst()

            for field, xpath in self.mov_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
    def parse(self,response):
        selector = HtmlXPathSelector(response)

        for deal in selector.select(self.deal_list_xpath):
            loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            for field,xpath in self.item_fields.iteritems():
                loader.add_xpath(field,xpath)
            yield loader.load_item()
    def parse(self, response):

        selector = HtmlXPathSelector(response)

        for startup in selector.select(self.startup_results_xpath):
            loader = XPathItemLoader(SearchResults(), selector=startup)

            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
    def parse(self, response):
        selector = HtmlXPathSelector(response)
        # look for deals
        for deal in selector.select(self.deals_list_xpath):
            loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)

            yield loader.load_item()
    def parse(self, response):
        selector = HtmlXPathSelector(response)

        #iterate over deals
        for deal in selector.select(self.deals_list_xpath):
            loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

            # define processors
            # remove whitespace
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            #iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
Example #22
    def parse(self, response):
        """Get response from start_urls"""

        selector = HtmlXPathSelector(response)

        for deal in selector.xpath(self.xpath_for_deals):
            loader = XPathItemLoader(LivingSocial(), selector=deal)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath.strip())
            yield loader.load_item()
Example #23
    def parse(self, response):

        selector = HtmlXPathSelector(response)

        # iterate over data_list
        for data in selector.select(self.data_list):
            loader = XPathItemLoader(TeoniteItem(), selector=data)

            loader.default_input_processor = MapCompose(str.strip)
            loader.default_output_processor = Join()

            # add xpath to loader
            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)
            yield loader.load_item()

        for nextp in selector.select(self.next_page):
            yield response.follow(nextp, callback=self.parse)
    def parse(self, response):
        """
        Parse the ticket listing page and follow each ticket's detail link.
        """
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # add the xpath for each field to the loader
            loader.add_xpath('eventname', './/span[@class="summary listingEventName"]/text()')
            loader.add_xpath('eventlocation', './/div[@class="divVenue location"]/text()')
            loader.add_xpath('ticketslink', './/a[@class="divEventDetails url"]/@href')

            print "Here is ticket link \n" + loader.get_output_value("ticketslink")
            ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketslink")
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback=self.parse_price, dont_filter=True)
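
Both ComparatorItem spiders on this page hand their loader to a parse_price callback through the request meta, but that callback is not shown anywhere here. A hypothetical sketch of what it might look like; the price xpath and the 'ticketprice' field are invented for illustration:

    def parse_price(self, response):
        # recover the loader carried over from parse() via the request meta
        loader = response.meta['loader']
        # placeholder xpath -- the real expression depends on the detail page
        price = HtmlXPathSelector(response).select('//div[@class="price"]/text()').extract()
        if price:
            loader.add_value('ticketprice', price[0].strip())
        yield loader.load_item()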
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses
        """
        selector = HtmlXPathSelector(response)

        #iterate over events
        for event in selector.select(self.events_list_xpath):
            loader = XPathItemLoader(CrunchBaseEvent(), selector=event)

            #define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            #iterate over fields and add xpaths to the loader.
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)

            yield loader.load_item()
Example #26
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses
        """
        #Instantiate a HtmlXPathSelector 
        selector = HtmlXPathSelector(response)

        #Iterate over reviews
        for review in selector.select(self.reviews_list_xpath):
            loader = XPathItemLoader(YelpReview(), selector=review)

            #Define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            #Iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
	def parse(self, response): # actually a method
		"""
		Default callback used by Scrapy to process downloaded response
		
		"""
		         
		selector = HtmlXPathSelector(response) # instantiate HtmlXPathSelector() w/ response parameter
		
		# iterate over deals
		for content in selector.xpath(self.content_list_xpath): #multiple deals per page
			loader = XPathItemLoader(RedditLearnPython(), selector=content) #iterate over each deal
			
			# define processors
			loader.default_input_processor = MapCompose(unicode.strip) #strip out white-space of unicode strings
			loader.default_output_processor = Join() #join data by a space
			
			# iterate over fields and add xpaths to the loader
			for field, xpath in self.item_fields.iteritems(): #iteritems() method allows you to iterate over the (k, v) pairs of a dict
				loader.add_xpath(field, xpath) #add specific field xpath to loader
			yield loader.load_item() # load_item: grabs each item field (link, title, etc), gets xpath, process data
			# w/ input output processor. Yield each item, then move onto next deal
    def parse(self, response):
        """
        # """
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate over fields and add xpaths to the loader
            loader.add_xpath('eventname', './/*[@class="productionsEvent"]/text()')
            loader.add_xpath('eventlocation', './/*[@class="productionsVenue"]/span[@itemprop="name"]/text()')
            loader.add_xpath('ticketslink', './/*/a[@class="btn btn-primary"]/@href')
            loader.add_xpath('eventdate', './/*[@class="productionsDate"]/text()')
            loader.add_xpath('eventcity', './/*[@class="productionsVenue"]/span[@itemprop="address"]/span[@itemprop="addressLocality"]/text()')
            loader.add_xpath('eventstate', './/*[@class="productionsVenue"]/span[@itemprop="address"]/span[@itemprop="addressRegion"]/text()')
            loader.add_xpath('eventtime', './/*[@class="productionsTime"]/text()')

            print "Here is ticket link \n" + loader.get_output_value("ticketslink")
            # NOTE: bandname must be defined elsewhere (e.g. a module-level constant)
            ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketslink")
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback=self.parse_price, dont_filter=True)