def parse_question(self, response):
    """Extract one question's title, URL and raw body HTML from its page."""
    question = StackItem()
    # First anchor inside the question header holds the title text.
    question["title"] = response.css("#question-header h1 a::text").extract()[0]
    question["url"] = response.url
    # Raw HTML of the first post body node.
    question["content"] = response.css(".question .post-text").extract()[0]
    yield question
def parse(self, response):
    """Yield one StackItem of concatenated text per article paragraph.

    Fixes vs. the original:
    - ``result_string`` was initialised once before the loop, so every
      yielded item accumulated the text of ALL previous paragraphs; it is
      now reset per paragraph.
    - ``str(sentence.encode('utf-8').strip())`` produces a ``b'...'`` repr
      under Python 3, corrupting the stored text; the decoded string is
      used directly.
    - Per-sentence debug prints removed; the count/separator prints kept.
    """
    questions = Selector(response).xpath('//*[@class="toanvancontent"]/p')
    print("Question Len = ", len(questions))
    for question in questions:
        item = StackItem()
        result_string = ""  # reset per paragraph (was cumulative across items)
        # NOTE(review): './/text()' already yields the <strong>/<em> text,
        # so those fragments are appended more than once — preserved from
        # the original; confirm whether the duplication is intended.
        for sentence in question.xpath('.//strong/text()').extract():
            result_string += sentence.strip()
        for sentence in question.xpath('.//em/text()').extract():
            result_string += sentence.strip()
        for sentence in question.xpath('.//text()').extract():
            result_string += sentence.strip()
        print("---------------------")
        item['textContent'] = result_string
        yield item
def parse_product(self, response):
    """Scrape a single product page into a StackItem."""

    def first_text(query):
        # Text of the first CSS match, '' when nothing matches.
        return response.css(query).get(default='').strip()

    product = StackItem()
    product['brand'] = first_text(
        'p.Text.Text--body-1.Text--left.Text--bold.Text--small.Text--\$magenta-50::text'
    )
    product['name'] = first_text(
        'span.Text.Text--subtitle-1.Text--left.Text--small.Text--text-20::text'
    )
    product['price'] = first_text(
        'span.Text.Text--title-6.Text--left.Text--bold.Text--small.Text--neutral-80::text'
    )
    # Details keep every descendant text fragment, joined into one string.
    detail_fragments = response.css(
        'div.ProductDetail__ProductRow div#productDetails.ProductDetail__productDetails div.ProductDetail__productContent *::text'
    ).getall()
    product['details'] = "".join(detail_fragments)
    product['how_to_use'] = first_text(
        'div.ProductDetail__howToUse div.Collapsible div.Collapsible__contentOuter div.Collapsible__contentInner div.ProductDetail__productContent *::text'
    )
    product['ingredients'] = first_text(
        'div.ProductDetail__ingredients div.Collapsible div.Collapsible__contentOuter div.Collapsible__contentInner div.ProductDetail__productContent *::text'
    )
    product['image_url'] = first_text('img::attr(src)')
    product['product_url'] = response.request.url
    yield product
def parse(self, response):
    """Yield title and URL for each question summary header."""
    headers = Selector(response).xpath('//div[@class="summary"]/h3')
    for header in headers:
        entry = StackItem()
        entry['title'] = header.xpath('a[@class="question-hyperlink"]/text()').extract()[0]
        entry['url'] = header.xpath('a[@class="question-hyperlink"]/@href').extract()[0]
        yield entry
def parse_page(self, response):
    """Yield one StackItem per <p> under any <div>, with its text fragments."""
    for paragraph in Selector(response).xpath('//div/p'):
        entry = StackItem()
        entry['url'] = response.url
        # List of direct text nodes (may be empty).
        entry['desc'] = paragraph.xpath('text()').extract()
        yield entry
def parse(self, response):
    """Scrape title/href from each post-title anchor."""
    anchors = Selector(response).xpath('//p[@class="title"]/a')
    for anchor in anchors:
        post = StackItem()
        post['title'] = anchor.xpath('text()').extract()[0]
        post['url'] = anchor.xpath('@href').extract()[0]
        yield post
def parse(self, response):
    """Pair question titles with their hrefs from the summary headers."""
    # Renamed locals so the titles list is no longer shadowed by the
    # loop variable (behaviour was unaffected, readability was not).
    titles = response.css('div.summary >h3>a::text').getall()
    links = response.css('div.summary >h3>a::attr(href)').getall()
    for text, href in zip(titles, links):
        entry = StackItem()
        entry['title'] = text
        entry['url'] = href
        yield entry
def parse(self, response):
    """Yield title and URL for each question summary header."""
    for header in Selector(response).xpath("//div[@class='summary']/h3"):
        record = StackItem()
        record["title"] = header.xpath("a[@class='question-hyperlink']/text()").extract()[0]
        record["url"] = header.xpath("a[@class='question-hyperlink']/@href").extract()[0]
        yield record
def parse(self, response):
    """Extract story links from each title cell (Hacker-News-style markup)."""
    for cell in Selector(response).xpath('//td[@class="title"]'):
        story = StackItem()
        story['title'] = cell.xpath('a[@class="storylink"]/text()').extract()[0]
        story['url'] = cell.xpath('a[@class="storylink"]/@href').extract()[0]
        yield story
def parse_item(self, response):
    """Yield name/url for each anchor inside the list group."""
    for anchor in response.xpath('//div[@class="list-group"]/a'):
        entry = StackItem()
        entry['name'] = anchor.xpath('text()').extract()[0]
        entry['url'] = anchor.xpath('@href').extract()[0]
        yield entry
def parse(self, response):
    """Scrape course code and prerequisite links from each course block."""
    for block in Selector(response).xpath('//div[@class="courseblock"]'):
        course = StackItem()
        raw_code = block.xpath('button/h3/span[@class="code"]/text()').extract()[0]
        # Non-breaking spaces in the code become plain spaces.
        course['code'] = raw_code.replace("\u00a0", " ")
        course['prerequisites'] = block.xpath(
            'div/div/div/p/a[@class="bubblelink code"]/text()').extract()
        yield course
def parse(self, response):
    """Extract Bloomberg search-result headlines into StackItems."""
    headlines = Selector(response).xpath(
        '//div[@class="search-result-story__container"]/h1[@class="search-result-story__headline"]'
    )
    for headline in headlines:
        story = StackItem()
        story['domain'] = "bloomberg.com"
        story['title'] = headline.xpath('a/text()').extract()[0]
        story['url'] = headline.xpath('a/@href').extract()[0]
        yield story
def parse(self, response):
    """Yield url/title per result summary; question body is filled later."""
    for summary in Selector(response).xpath('//div[@class="summary"]'):
        record = StackItem()
        # extract_first() gives None rather than raising when nothing matches.
        record["url"] = summary.xpath('h3/a[@class="question-hyperlink"]/@href').extract_first()
        record["title"] = summary.xpath('h3/a[@class="question-hyperlink"]/text()').extract_first()
        record["question"] = ""
        yield record
def parse_node(self, response, node):
    """Parse one RSS feed node into a StackItem (XML namespaces stripped)."""
    # Namespaces must be removed before the dc:/content: prefixed queries work.
    response.selector.remove_namespaces()
    feed_entry = StackItem()
    field_queries = (
        ('title', 'normalize-space(title/text())'),
        ('url', 'link/text()'),
        ('authorname', 'dc:creator/text()'),
        ('publicationdate', 'pubdate/text()'),
        ('desc', 'description/text()'),
        ('content', 'content:encoded/text()'),
    )
    for field, query in field_queries:
        feed_entry[field] = node.xpath(query).extract()
    yield feed_entry
def parse(self, response):
    """Print and yield the title/URL of each question summary header.

    Fixes vs. the original:
    - ``questionxpath(...)`` (missing ``.``) raised NameError on the URL
      extraction; now ``question.xpath(...)``.
    - Python 2 ``print`` statement replaced with the function form used
      elsewhere in this file.
    - The StackItem was instantiated but never populated or yielded, so
      the spider produced no items; it is now filled and yielded.
    """
    questions = HtmlXPathSelector(response).select(
        '//div[@class="summury"]/h3')
    # NOTE(review): the class name "summury" looks like a typo for
    # "summary" — left untouched; confirm against the target page markup.
    for question in questions:
        item = StackItem()
        title = question.xpath(
            'a[@class="question-hyperlink"]/text()').extract()[0]
        url = question.xpath(
            'a[@class="question-hyperlink"]/@href').extract()[0]
        print(title, "\n", url, "\n")
        item['title'] = title
        item['url'] = url
        yield item
def parse(self, response):
    """Yield title/absolute-URL per //div/h4 anchor, then follow all page links.

    Fix: ``print site`` is Python 2 print-statement syntax — a SyntaxError
    under Python 3, which the rest of this file targets via ``print(...)``;
    replaced with the function form. No other behaviour changed.
    """
    sites = Selector(response).xpath('//div/h4')
    for site in sites:
        item = StackItem()
        print(site)
        item['title'] = site.xpath("a/text()").extract()
        # str() preserves the original behaviour of concatenating 'None'
        # when extract_first() finds no href.
        item['url'] = 'http://reason.org' + str(site.xpath("a/@href").extract_first())
        #item['description'] = site.xpath("following-sibling::div/text()").extract_first('').strip()
        yield item
    # Crawl every extracted link with the page-level parser.
    for link in LxmlLinkExtractor(allow=()).extract_links(response):
        yield Request(link.url, callback=self.parse_page)
def parse_node(self, response, node):
    """Parse one RSS item node; the feed description doubles as content."""
    field_queries = (
        ('title', 'title/text()'),
        ('url', 'link/text()'),
        ('authorname', 'author/text()'),
        ('publicationdate', 'pubDate/text()'),
        ('content', 'description/text()'),
    )
    entry = StackItem()
    for field, query in field_queries:
        entry[field] = node.xpath(query).extract()
    entry['desc'] = ''  # intentionally blank for this feed
    yield entry
def parse(self, response):
    """Scrape url/title/price from each product block on a listing page."""
    blocks = Selector(response).xpath(
        '//div[@class="product-container text-left product-block"]')
    for block in blocks:
        product = StackItem()
        product['url'] = block.xpath(
            'div[@class="product-image-container image"]/a[@class="product_img_link"]/@href'
        ).extract()[0]
        product['title'] = block.xpath(
            'div[@class="product-image-container image"]/a[@class="product_img_link"]/@title'
        ).extract()[0]
        # Note the trailing space in the price span's class attribute.
        product['price'] = block.xpath(
            'div[@class="product-meta"]/div[@class="clearfix"]/div[@class="content_price"]/span[@class="price product-price "]/text()'
        ).extract()[0]
        yield product
def parse_items(self, response):
    """Build a tuCasa listing item from fixed absolute XPaths."""
    fx = self.format_xpath  # local alias; every scalar field goes through it
    item = StackItem()
    item["company"] = 'tuCasa'
    item["url"] = response.url
    item["title"] = fx(response, '/html/body/div[5]/h1/text()')
    # split()[n] picks a single token out of the raw node text.
    item["price"] = fx(response, '/html/body/div[5]/div[1]/span[1]/text()').split()[0]
    item["update_date"] = fx(response, '/html/body/div[6]/div/div[3]/span[2]/text()').split()[2]
    item["rooms"] = fx(response, '/html/body/div[6]/div/div[1]/ul/li[4]/text()').split()[0]
    item["surface"] = fx(response, '/html/body/div[6]/div/div[1]/ul/li[2]/span/text()').split()[0]
    item["location"] = fx(response, '/html/body/div[5]/span/text()')
    # Description joins every direct text fragment of the container div.
    fragments = response.xpath('/html/body/div[6]/div/div[1]/text()').extract()
    item["description"] = ''.join(fragments).strip()
    yield item
def parse_item(self, response):
    """Yield url/title for each question summary header."""
    for header in response.xpath('//div[@class="summary"]/h3'):
        entry = StackItem()
        entry['url'] = header.xpath('a[@class="question-hyperlink"]/@href').extract()[0]
        entry['title'] = header.xpath('a[@class="question-hyperlink"]/text()').extract()[0]
        yield entry
def parse_items(self, response):
    """Build a minimal idealista item — only company tag and URL are live.

    Extraction of title / price / update_date / rooms / surface / location /
    description is currently disabled.
    """
    item = StackItem()
    item["company"] = 'idealista'
    item["url"] = response.url
    yield item
def parse(self, response):
    """Yield question title/url plus asker name/profile link per summary."""
    for summary in Selector(response).xpath('//div[@class="summary"]'):
        record = StackItem()
        record['title'] = summary.xpath(
            'h3/a[@class="question-hyperlink"]/text()').extract()[0]
        record['url'] = summary.xpath(
            'h3/a[@class="question-hyperlink"]/@href').extract()[0]
        # User details live in a nested div anywhere under the summary.
        record['username'] = summary.xpath(
            './/div[@class="user-details"]/a/text()').extract()[0]
        record['useraddress'] = summary.xpath(
            './/div[@class="user-details"]/a/@href').extract()[0]
        yield record
def parse(self, response):
    """Yield title/url per post header, then crawl every outgoing link."""
    for post in Selector(response).xpath('//div[@class="post-title"]'):
        entry = StackItem()
        entry['title'] = post.xpath('a/text()').extract()  # full list, no [0]
        entry['url'] = post.xpath('a/@href').extract()[0]
        yield entry
    for link in LxmlLinkExtractor(allow=()).extract_links(response):
        yield Request(link.url, callback=self.parse_page)
def parse(self, response):
    """Scrape StackOverflow question links; follow rel=next pagination."""
    for header in response.xpath('//div[@class="summary"]/h3'):
        entry = StackItem()
        entry['title'] = header.xpath(
            'a[@class="question-hyperlink"]/text()').extract()[0]
        entry['url'] = "http://stackoverflow.com" + header.xpath(
            'a[@class="question-hyperlink"]/@href').extract()[0]
        yield entry
    # Keep paginating while a rel=next anchor exists.
    if response.xpath("//a[@rel='next']"):
        next_path = response.xpath("//a[@rel='next']/@href").extract()[0]
        yield scrapy.Request("http://stackoverflow.com" + next_path,
                             callback=self.parse)
def parse(self, response):
    """Queue one detail-page request per list item, carrying the partial
    StackItem along in request.meta for parse_content to finish."""
    for link in Selector(response).xpath('//a[@class="list-item-link"]'):
        href = link.xpath('@href').extract()[0]  # extracted once, reused
        entry = StackItem()
        entry['url'] = href
        entry['title'] = link.xpath(
            'div[@class="info-wrap"]/p[@class="list-title"]/text()'
        ).extract()[0]
        entry['desc'] = link.xpath(
            'div[@class="info-wrap"]/div[@class="list-desc"]/text()'
        ).extract()[0]
        detail_request = scrapy.Request(href, callback=self.parse_content)
        detail_request.meta['item'] = entry
        yield detail_request
def parse(self, response):
    """Extract the question header link/title and the joined body text."""
    content = Selector(response).xpath('//div[@id="content"]')
    item = StackItem()
    item["url"] = content.xpath(
        '//div[@id="question-header"]/h1/a[@class="question-hyperlink"]/@href'
    ).extract_first()
    item["title"] = content.xpath(
        '//div[@id="question-header"]/h1/a[@class="question-hyperlink"]/text()'
    ).extract_first()
    paragraphs = response.xpath(
        '//div[@class="postcell post-layout--right"]/div[@class="post-text"]/p'
    )
    joined = " ".join(p.extract() for p in paragraphs)
    # `tag` is defined elsewhere in this module — presumably a compiled
    # regex that strips markup from the joined paragraph HTML; verify.
    item["question"] = tag.sub("", joined)
    yield item
def parse(self, response):
    """Parse WordPress-style article entries into StackItems."""
    articles = Selector(response).xpath('//main[@class="site-main"]/article')
    field_queries = (
        ('title', 'header/h2/a/text()'),
        ('url', 'header/h2/a/@href'),
        ('authorname', 'footer/span/span/a[@class="url fn n"]/text()'),
        ('authorurl', 'footer/span/span/a[@class="url fn n"]/@href'),
        ('publicationdate',
         'footer/span/a/time[@class="entry-date published updated"]/text()'),
        ('content', 'div[@class="entry-content"]/p/text()'),
    )
    for article in articles:
        entry = StackItem()
        for field, query in field_queries:
            entry[field] = article.xpath(query).extract()
        yield entry
def parse(self, response):
    """Scrape question links; follow the explicit 'go to page 2' link once."""
    for header in Selector(response).xpath('//div[@class="summary"]/h3'):
        entry = StackItem()
        entry['title'] = header.xpath(
            'a[@class="question-hyperlink"]/text()').extract()[0]
        entry['url'] = header.xpath(
            'a[@class="question-hyperlink"]/@href').extract()[0]
        yield entry
    next_page = response.xpath("//a[@title='go to page 2']/@href")
    if next_page:
        nextpage = response.urljoin(next_page.extract_first())
        print("Found url: {}".format(nextpage))
        yield scrapy.Request(nextpage, callback=self.parse)
def parse(self, response):
    """Scrape product tiles, request each detail page with the partial item
    in request.meta, and follow pagination.

    Fix: the original created ONE StackItem before the loop and mutated it
    every iteration while earlier detail requests (still pending in the
    scheduler) held a reference to the same object in their meta — every
    parse_item callback would see the last tile's data. A fresh StackItem
    is now created per tile.
    """
    for tile in response.css("li.item"):
        stackitem = StackItem()  # fresh item per tile (was shared/mutated)
        stackitem['sku'] = tile.xpath("@data-item-sku").extract()
        stackitem['productName'] = tile.xpath(".//h2/a/text()").extract_first()
        stackitem['price'] = tile.xpath(
            './/span[@class="price"]/text()').extract_first()
        stackitem['imageUrl'] = tile.xpath('.//img/@src').extract_first()
        yield scrapy.Request(
            tile.xpath('.//a[@class="product-image"]/@href').extract_first(),
            callback=self.parse_item,
            meta={'stackitem': stackitem})
    next_page_url = response.xpath(".//a[@class='next']/@href").extract_first()
    if next_page_url:
        # No explicit callback: Scrapy dispatches back to self.parse.
        yield scrapy.Request(response.urljoin(next_page_url))
def parse_items(self, response):
    """Build a habitaclia listing item; rooms/surface are placeholders.

    update_date / rooms / surface / location / description extraction is
    currently disabled.
    """
    fx = self.format_xpath  # local alias; every scalar field goes through it
    item = StackItem()
    item["company"] = 'habitaclia'
    item["url"] = response.url
    item["title"] = fx(response, '//*[@class="h1ficha"]/text()')
    # First whitespace-separated token of the price text.
    item["price"] = fx(response, '//*[@class="precio-ficha"]/span/text()').split()[0]
    item["detailReference"] = fx(response, '//*[@class="referencia-ficha"]/span/text()')
    item["rooms"] = ""
    item["surface"] = ""
    yield item