def parse_view(self, response):
    sel = Selector(response)
    place = Place()
    # "[@id=...]" alone is not valid XPath; each step needs a node test.
    place['area'] = sel.xpath(
        "//*[@id='places_area__row']/*[@class='w2p_fw']/text()").extract_first()
    place['country'] = sel.xpath(
        "//*[@id='places_iso__row']/*[@class='w2p_fw']/text()").extract_first()
    yield place
def parse_items(self, response):
    item = TwitterVizItem()
    self.browser.get(response.url)
    # Let the JavaScript load before reading the rendered page.
    time.sleep(3)
    # Scrape the dynamically generated HTML out of the Selenium browser.
    hxs = Selector(text=self.browser.page_source)
    item['Follower'] = hxs.xpath(
        '(//span[@class="u-linkComplex-target"])[position()>2]').extract()
    item['Follows'] = hxs.xpath(
        '(//span[@class="u-linkComplex-target"])[position()=1]').extract()
    print(item)
    return item
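# A fixed time.sleep(3) is fragile: the page may need more (or less) time to
# render. A minimal sketch of an explicit wait instead, assuming self.browser
# is a Selenium WebDriver; the class name "u-linkComplex-target" is taken from
# the selectors above.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_targets(browser, timeout=10):
    # Block until at least one matching span is present, or raise TimeoutException.
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "span.u-linkComplex-target"))
    )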
def parse(self, response):
    # Open the file in append mode so results from every page accumulate.
    csv_file = open('amazon_search.csv', 'a')
    # Extract all the result divs.
    hxs = Selector(response)
    divs = hxs.xpath('//div[@class="sg-row"]')
    for div in divs:
        name = div.xpath(
            './/span[contains(@class,"a-text-normal")]/text()').get()
        price = div.xpath(
            './/span[contains(@class, "a-price")]/span[@class="a-offscreen"]/text()'
        ).get()
        stars = div.xpath(
            './/i[contains(@class, "a-icon-star-small")]/span/text()').get()
        csv_file.write('%s;%s;%s\n' % (name, price, stars))
    csv_file.close()
    # Extract the next page's URL and follow it.
    next_url = response.xpath('//li[@class="a-last"]/a/@href').get()
    if next_url is not None:
        yield response.follow(next_url, callback=self.parse)
def parse(self, response):
    # Open the file in write mode to create it (or overwrite a previous run).
    csv_file = open('amazon_games.csv', 'w')
    # Extract all the result divs.
    hxs = Selector(response)
    divs = hxs.xpath('//div[@class="s-item-container"]')
    for div in divs:
        name = div.xpath('.//h2/text()').get()
        price = div.xpath(
            './/span[contains(@class, "s-price")]/text()').get()
        stars = div.xpath(
            './/i[contains(@class, "a-icon-star")]/span/text()').get()
        csv_file.write('%s;%s;%s\n' % (name, price, stars))
    csv_file.close()
    # Extract the next page's URL and issue a new request.
    next_url = response.xpath('//a[@id="pagnNextLink"]/@href').get()
    if next_url is not None:
        yield response.follow(next_url, callback=self.parseNextPage)
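# Hand-rolled CSV writing breaks on names that contain ";". A minimal sketch of
# the same loop using Scrapy's built-in feed export instead: yield plain dicts
# and let the framework serialize them (run with
# `scrapy crawl <spider> -o amazon_games.csv`). The selectors are the ones
# assumed above; parse_with_feed_export is a hypothetical callback name.
def parse_with_feed_export(self, response):
    for div in response.xpath('//div[@class="s-item-container"]'):
        yield {
            'name': div.xpath('.//h2/text()').get(),
            'price': div.xpath('.//span[contains(@class, "s-price")]/text()').get(),
            'stars': div.xpath('.//i[contains(@class, "a-icon-star")]/span/text()').get(),
        }
    next_url = response.xpath('//a[@id="pagnNextLink"]/@href').get()
    if next_url is not None:
        yield response.follow(next_url, callback=self.parse_with_feed_export)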
def parse5(self, response):
    hxs = Selector(response)
    items = []
    item = self.generate_item_dict()
    item["file_urls"] = []
    item["title"] = hxs.xpath("//div[@class='breadcrumbs']/b/text()").extract_first()
    print(item["title"])
    item["image_urls"] = hxs.xpath("//div[@class='scroller-view']/div/div/a/@href").extract()
    try:
        # The PDF URL is embedded in the onclick handler, e.g. fn('<url>','...').
        onclicks = hxs.xpath("//div[@class='itemPDF']/div/a/@onclick").extract()
        for onclick in onclicks:
            url = onclick.split("('")[1].split("','")[0]
            item["file_urls"].append(url)
        print(item["file_urls"])
    except IndexError:
        item["file_urls"] = ""
        print("No PDF there")
    item["name"] = hxs.xpath("//h1[@id='itemName']/text()").extract()[0]
    item["price"] = hxs.xpath("//div[@id='mss-top-price']/em/text()").extract()[0]
    brand = ""
    try:
        # Look for an info tab labelled "<brand> info"; the prefix is the brand.
        tabs = hxs.xpath("//div[@class='ytInfoTab']/text()").extract()
        for tab in tabs:
            tab = tab.lower()
            if "info" in tab.split(" "):
                brand = tab.capitalize()[:-5]  # drop the trailing " info"
                item["pbrand"] = brand
        if brand.strip() == "":
            # Fall back to the last "view more" link when no info tab matched.
            item["pbrand"] = hxs.xpath("//div[@class='viewMoreLinks']/a/text()").extract()[-1]
    except IndexError:
        print("Brand not found")
    # Last breadcrumb is the category (currently unused).
    category = hxs.xpath("//div[@id='breadcrumbs']//a/text()").extract()[-1]
    items.append(item)
    return items
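# Slicing on "('" and "','" is brittle when the handler's quoting varies. A
# regex sketch for pulling the first quoted argument out of an onclick value,
# assuming the fn('<url>','...') format seen above; first_onclick_arg is a
# hypothetical helper name.
import re

def first_onclick_arg(onclick):
    # Capture everything between the opening ('  and the next single quote.
    match = re.search(r"\('([^']+)'", onclick)
    return match.group(1) if match else None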
def parse(self, response):
    hxs = Selector(response)
    items = []
    item = AndrewItem()
    item["file_urls"] = []
    item["title"] = hxs.xpath("//div[@class='breadcrumbs']/b/text()").extract_first()
    item["image_urls"] = hxs.xpath("//div[@class='scroller-view']/div/div/a/@href").extract()
    try:
        # The PDF URL is embedded in the onclick handler, e.g. fn('<url>','...').
        onclicks = hxs.xpath("//div[@class='itemPDF']/div/a/@onclick").extract()
        for onclick in onclicks:
            url = onclick.split("('")[1].split("','")[0]
            item["file_urls"].append(url)
        print(item["file_urls"])
    except IndexError:
        item["file_urls"] = ""
        print("No PDF there")
    items.append(item)
    return items
def parse(self, response):
    # Save the raw page for offline inspection.
    page = response.url.split("/")[-2]
    filename = '../../bundestag-%s.html' % page
    with open(filename, 'wb') as f:
        f.write(response.body)
    hxs = Selector(response)
    # Extract the text and target of every link; the paths must be relative
    # to the <a> node, not absolute.
    for anchor in hxs.xpath('//a'):
        title = anchor.xpath('text()').extract()
        link = anchor.xpath('@href').extract()
        print(title)
        print(link)
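# For crawling rather than printing, Scrapy ships a LinkExtractor that handles
# relative URLs and de-duplication. A minimal sketch, under the assumption
# that every on-page link should be followed back into this callback;
# parse_follow_links is a hypothetical method name.
from scrapy.linkextractors import LinkExtractor

def parse_follow_links(self, response):
    for link in LinkExtractor().extract_links(response):
        # link.text and link.url replace the manual xpath pairs above.
        yield response.follow(link.url, callback=self.parse_follow_links)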
def parse_page(self, response):
    # Given a page response, parse all the images and grab the title.
    sel = Selector(response=response)
    title = response.css('title::text').extract()
    text = ''.join(sel.xpath("//body//text()").extract()).strip()
    url = response.request.url
    base = get_base_url(response)
    image_urls = [
        urljoin(response.url, src)
        for src in response.xpath('//img/@src').extract()
    ]
    # Key each image URL by the SHA-1 hash of the URL itself; a plain dict
    # suffices (the original defaultdict had no default factory).
    image_hashes = {
        hashlib.sha1(img.encode()).hexdigest(): img for img in image_urls
    }
    yield ElectionItem(title=title, base_url=base, url=url, text=text,
                       image_hash_ids=image_hashes, image_urls=image_urls)
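# ElectionItem is not defined in this snippet; a minimal sketch of what it
# could look like, assuming one scrapy.Field per keyword argument used above.
import scrapy

class ElectionItem(scrapy.Item):
    title = scrapy.Field()
    base_url = scrapy.Field()
    url = scrapy.Field()
    text = scrapy.Field()
    image_hash_ids = scrapy.Field()
    image_urls = scrapy.Field()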