def parse(self, response):
    for sel in response.css("ul#channels-browse-content-grid > li"):
        loader = ItemLoader(YoutubeVideo(), selector=sel)
        loader.add_xpath('link', './/h3/a/@href')
        yield loader.load_item()
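# The snippets in this collection reference item classes defined elsewhere.
# A minimal sketch of what the YoutubeVideo item above could look like
# (assumed, not part of the original sources): every field an ItemLoader
# writes to must be declared, and per-field processors can live on the Field.
import scrapy
from scrapy.loader.processors import TakeFirst

class YoutubeVideo(scrapy.Item):
    link = scrapy.Field(output_processor=TakeFirst())
    title = scrapy.Field(output_processor=TakeFirst())
    views = scrapy.Field(output_processor=TakeFirst())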
def parse(self, response):
    l = ItemLoader(item=MyItem(), response=response)
    l.add_xpath(
        "title",
        '//div[@class="carousel"]'
        '/div[@class="songlist-slides slide-page"]'
        '/ul[@class="list-songlist slide-item"]'
        '/li[@class="songlist-item"]'
        '/a[@class="lnk-songlist"]/@title',
    )
    return l.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatTOIItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # collect TOI stats (even-numbered columns 6-12) after converting
        # from m,mmm:ss to seconds
        CATEG = ["es_toi", "sh_toi", "pp_toi", "toi"]
        for i in range(6, 13, 2):
            temp = row.xpath("td[" + str(i) + "]/text()").extract()[0]
            sTemp = temp.split(":")
            sTemp[0] = sTemp[0].replace(",", "")
            # integer division keeps the index an int on Python 3
            loader.add_value(CATEG[(i - 6) // 2],
                             str(60 * int(sTemp[0]) + int(sTemp[1])))
        # feed item to pipeline
        yield loader.load_item()
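# The inline "m,mmm:ss" -> seconds conversion above could also be factored
# into a reusable processor. A sketch under that assumption (the function
# name is illustrative, not from the original project):
from scrapy.loader.processors import MapCompose

def toi_to_seconds(value):
    # "1,234:56" -> "74096"
    minutes, seconds = value.replace(",", "").split(":")
    return str(60 * int(minutes) + int(seconds))

# usage: loader.add_xpath("toi", "td[12]/text()", MapCompose(toi_to_seconds))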
def process_row(self, row, task):
    stats = self.crawler.stats
    l = ItemLoader(WV_DrillingPermit())
    l.add_value(None, row)
    item = l.load_item()
    if item['API'] and item['permit_activity_type'] and item['permit_activity_date']:
        existing_item = self.db.loadItem(item, {
            'API': item['API'],
            'permit_activity_type': item['permit_activity_type'],
            'permit_activity_date': item['permit_activity_date']
        })
        if existing_item:
            stats.inc_value('_existing_count', spider=self)
        else:
            stats.inc_value('_new_count', spider=self)
            yield item
            dt = datetime.strptime(item['permit_activity_date'], '%Y-%m-%d %H:%M:%S')
            # if item['permit_activity_type'] in ('Permit Issued', 'Permit Commenced', 'Permit Completed'):
            if (item['permit_activity_type'] in ('Permit Issued', 'Permits Issued')
                    and datetime.now() - dt < timedelta(days=365)):
                for item in self.create_feed_entry(item, task):
                    yield item
def parse_product(self, response):
    product_url = response.url
    # sel = self.selenium
    # sel.open(response.url)
    # time.sleep(2.5)
    selector = Selector(response)
    # //*[@id="product_detail_view_1"]/div/div[6]/div[2]/span[2]
    price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[7]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[6]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[5]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[4]/div[2]/span[2]/text()').extract()
    l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
    l.add_xpath('product_name', '//*[@id="inner"]/div[1]/div[1]/div/div/text()')
    l.add_xpath('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()')
    l.add_xpath('category', '//*[@id="inner"]/div[1]/div[1]/div/a[1]/text()')
    l.add_xpath('product', '//*[@id="inner"]/div[1]/div[1]/div/a[2]/text()')
    item = l.load_item()
    item['product_url'] = product_url
    item['price'] = price
    item['vendor'] = 'Local Banya'
    item['city'] = 'Mumbai'
    item['state'] = 'Maharashtra'
    item['country'] = 'India'
    item['date'] = str(time.strftime("%d/%m/%Y"))
    return item
def parse_by_product(self, response):
    """
    For the 'Bundles' category, grab the product details
    for the first product listed.
    """
    self.selector = Selector(response)
    self.results = self.selector.xpath('//*[@id="ctl00_tdMainPanel"]')
    loader = ItemLoader(item=VisionsProduct(), selector=self.results[0])
    self.field_xpaths = {
        'product': ('div[contains(@class, "catalogueTitle")]'
                    '/h3/text()'),
        'price': ('div[@id="ctl00_ContentPlaceHolder1_pnl'
                  'Bundle"]/div[@id="divProductDetails"]/div'
                  '[contains(@class, "priceAddToCart")]/div[1]/span'
                  '[contains(@id, "SalePrice")]/text()')
    }
    # Extract and load product details
    loader.add_xpath('product', self.field_xpaths['product'])
    loader.add_xpath('price', self.field_xpaths['price'],
                     re=r'\$[\d]*[,]*[\d]*\.[\d]*')
    loader.add_value('availability', 'Not Limited/Clearance Item')
    # Because it's an individual product page, manually set the category
    self.category = '/'.join(['Home', response.url.split('/')[4]])
    loader.add_value('category', self.category)
    yield loader.load_item()
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
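# Passing Identity() above keeps every matched image URL rather than a single
# value. Spiders like this one are usually paired with ImagesPipeline, which
# expects image_urls to stay a list. A sketch of a compatible item definition
# (assumed; the real MeizituItem is not shown in this collection):
import scrapy
from scrapy.loader.processors import Identity

class MeizituItem(scrapy.Item):
    name = scrapy.Field()
    tags = scrapy.Field()
    url = scrapy.Field()
    image_urls = scrapy.Field(output_processor=Identity())  # stays a list
    images = scrapy.Field()  # populated by ImagesPipeline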
def parse(self, response):
    item_list = []
    for a in response.css(".menu_box .menu_main h2"):
        l = ItemLoader(item=CategoryItem(), response=response)
        # l.add_css('category', ".menu_box .menu_main h2")
        l.add_value("category", a.extract(), self.get_text)
        item_list.append(l.load_item())
    return item_list
def parse_content(self, response):
    bbsItem_loader = ItemLoader(item=BbsItem(), response=response)
    url = str(response.url)
    bbsItem_loader.add_value("url", url)
    bbsItem_loader.add_xpath("forum", self._x_query["forum"])
    bbsItem_loader.add_xpath("poster", self._x_query["poster"])
    bbsItem_loader.add_xpath("content", self._x_query["page_content"])
    return bbsItem_loader.load_item()
def parse2(self, response):
    item = json.loads(response.body_as_unicode())
    for data_tmp in item['list']:
        loader = ItemLoader(item=XqtestItem())
        loader.add_value('title', data_tmp['data'])
        org = loader.load_item()
        yield org
def test_load_item_using_default_loader(self): i = TestItem() i["summary"] = u"lala" il = ItemLoader(item=i) il.add_value("name", u"marta") item = il.load_item() assert item is i self.assertEqual(item["summary"], u"lala") self.assertEqual(item["name"], [u"marta"])
def parse_content(self, response):
    bbsItem_loader = ItemLoader(item=ScrapyspiderItem(), response=response)
    url = str(response.url)
    bbsItem_loader.add_value('url', url)
    bbsItem_loader.add_xpath('forum', self._x_query['forum'])
    bbsItem_loader.add_xpath('poster', self._x_query['poster'])
    bbsItem_loader.add_xpath('content', self._x_query['page_content'])
    return bbsItem_loader.load_item()
def parse_item(self, response):
    self.logger.info("parse_item url %s" % response.url)
    l = ItemLoader(item=ImgDownloadItem(), response=response)
    l.add_xpath('name', '//h1[@class="c333 subTitle"]/text()')
    l.add_xpath('desc', '//div[@class="txtmod"]/p/text()')
    l.add_value('url', response.url)
    l.add_xpath('image_urls', "//p[@class='tc mb10']/img/@src", Identity())
    return l.load_item()
def parsePost(self, response):
    l = ItemLoader(item=Post(), response=response)
    d = pyq(response.body)
    l.add_value('url', response.url)
    l.add_css('title', 'h1.entry-title::text')
    l.add_css('date', 'span.entry-date::text')
    l.add_css('author', 'span.author.vcard > a::text')
    l.add_value('content', d('div.entry-content').text())
    return l.load_item()
def _set_loader(self, response, xs, item):
    if not xs:
        self.from_detail_page = True
        item = response.request.meta['item']
        self.loader = ItemLoader(item=item, response=response)
    else:
        self.from_detail_page = False
        self.loader = ItemLoader(item=item, selector=xs)
    self.loader.default_output_processor = TakeFirst()
def parse_detail(self, response):
    url = response.url
    item = ItemLoader(item=MeizituItem(), response=response)
    item.add_xpath("title", "//h2/a/text()")
    item.add_xpath("image_urls", '//div[@id="picture"]//img/@src')
    item.add_value('url', url)
    return item.load_item()
def detail(self, response):
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    # //*[@id="vip_content_section"]/div[2]/h1
    product_name = hxs.xpath('//*[@id="vip_content_section"]/div[2]/h1/text()').extract()
    if len(product_name) != 0:
        product_name = product_name[0]
    product_price = hxs.xpath('//*[@id="price-val"]/text()').extract()
    if len(product_price) != 0:
        product_price = product_price[0]
    # only build an item when both the name and the price were found
    if product_name and product_price:
        l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
        l.add_xpath('product_name', '//*[@id="vip_content_section"]/div[2]/h1/text()')
        # l.add_xpath('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()')
        l.add_xpath('category', '//*[@id="cat_crum"]/@value')
        l.add_xpath('product', '//*[@id="overview_tab"]/div/div/p/text()')
        item = l.load_item()
        item['product_url'] = response.url
        item['price'] = product_price
        item['vendor'] = 'PepperFry'
        item['city'] = 'Mumbai'
        item['state'] = 'Maharashtra'
        item['country'] = 'India'
        item['date'] = str(time.strftime("%d/%m/%Y"))
        return item
def create_tag(self, feed_entry_id, tag, comment=''):
    # TODO: create tags
    l = ItemLoader(FeedEntryTag())
    l.add_value('feed_entry_id', feed_entry_id)
    l.add_value('tag', tag)
    l.add_value('comment', comment)
    return l.load_item()
def parsePage(self, response):
    rentHouse = ItemLoader(item=RentItem(), response=response)
    rentHouse.add_value('id', self.name + '-' + response.url.split('/')[-1].split('.')[0])
    rentHouse.add_value('link', response.url)
    rentHouse.add_xpath('title', "//dl[@class = 'title']/dt/p/text()")
    return rentHouse.load_item()
def parse_item(self, response): for e in response.xpath( '//table[@id="ip_list"]//tr[contains(td[6],"HTTP")]'): l = ItemLoader(ProxyHunterItem(), selector=e) l.add_xpath('prot', 'td[6]/text()') l.add_xpath('ip', 'td[2]/text()') l.add_xpath('port', 'td[3]/text()') yield l.load_item()
def parse(self, response): for e in response.xpath( '//table[@id="tbl_proxy_list"]//tr[count(td)=6]'): l = ItemLoader(ProxyHunterItem(), selector=e) l.add_value('prot', 'http') l.add_xpath('ip', 'td[1]', TakeFirst(), remove_tags, unicode.strip) l.add_xpath('port', 'td[2]', TakeFirst(), remove_tags, unicode.strip) yield l.load_item()
def parse_item2(self, response):
    l = ItemLoader(item=DmozItem(), response=response)
    l.add_xpath('type', '//div[@class="location ask_main_location"]/span[@class="fl"]/a[last()]/text()')
    l.add_xpath('type', '//div[@class="question"]/h2/text()')
    l.add_xpath('answer', '//div[@class="anwser"]/h2/text()')
    l.add_value('answer', '牛逼')
    yield l.load_item()
def parse(self, response): for e in response.xpath('//tr[contains(@class,"row")]'): l = ItemLoader(ProxyHunterItem(), selector=e) l.add_xpath( 'prot', 'td[5]/a/text()', lambda xs: 'https' if xs[0].strip() == 'true' else 'http') l.add_xpath('ip', 'td[2]/a/text()', lambda xs: xs[0].strip()) l.add_xpath('port', 'td[3]/a/text()') yield l.load_item()
def parse_final(self, response):
    # sel = Selector(response)
    # item = MeiziScrapyItem()
    # item["image_name"] = sel.xpath('/html/body/div[2]/div[1]/h2/text()').extract()[0]
    # return item
    l = ItemLoader(item=MeiziScrapyItem(), response=response)
    l.add_xpath('image_name', '/html/body/div[2]/div[1]/h2/text()')
    l.add_xpath('image_url', '//*[@id="content"]/a/img/@src')
    l.add_xpath('page_total_num', '//*[@id="page"]/a[7]/text()')
    return l.load_item()
def parse_news(self, response):
    item = ItemLoader(item=NewsItem(), response=response)
    item.add_value('url', response.url)
    item.add_value('title', response.xpath("//h1[@class='single-post__title']/text()").extract()[0])
    item.add_value('content', response.xpath("//section[@class='article']/p/text()").extract())
    return item.load_item()
def store_pdf(self, response):
    # print response.body
    # with open('result.pdf','w') as f:
    #     f.write(response.body)
    stats = self.crawler.stats
    params = response.meta['scrape']
    params['Content-Type'] = response.headers.get('Content-Type', '')
    params['Content-Disposition'] = response.headers.get('Content-Disposition', '')
    params['task_id'] = response.meta['task_id']
    if 'application/pdf' != params['Content-Type'][:15]:
        self.log('Task_id %(task_id)s Unexpected content type. Expected "application/pdf", '
                 'got "%(Content-Type)s" for PDF download API: %(api)s JobDate: %(job_date)s ' % params,
                 log.WARNING)
        stats.inc_value('_pdf_store_fail', spider=self)
        self.item_dropped(response.meta['task_id'])
    elif 'attachment; filename=' != params['Content-Disposition'][:21]:
        self.log('Task_id %(task_id)s Unexpected content disposition header. Expected '
                 '"attachment; filename=*", got "%(Content-Disposition)s" for PDF download '
                 'API: %(api)s JobDate: %(job_date)s ' % params,
                 log.WARNING)
        stats.inc_value('_pdf_store_fail', spider=self)
        self.item_dropped(response.meta['task_id'])
    else:
        l = ItemLoader(FracFocusPDF())
        l.add_value('seqid', response.meta['scrape']['seqid'])
        l.add_value('pdf', psycopg2.Binary(response.body))
        l.add_value('filename', params['Content-Disposition'][21:])
        item = l.load_item()
        self.log('Task_id %(task_id)s Storing PDF download for API: %(api)s '
                 'JobDate: %(job_date)s ' % params, log.INFO)
        stats.inc_value('_pdf_store', spider=self)
        yield item
        self.item_completed(response.meta['task_id'])
def parse(self, response):
    items = []
    for everyday in response.xpath('//ul/li/strong/a'):
        loader = ItemLoader(ProductItem(), everyday)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        loader.add_xpath('name', 'text()')
        loader.add_xpath('price', '@href')
        loader.add_xpath('stock', '@mon')
        loader.add_value('last_updated', 'today')  # you can also use literal values
        item = self.to_utf8(loader.load_item(), *['name', 'price', 'stock', 'last_updated'])
        self.log(item['name'], log.INFO)
        items.append(item)
    return items
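# What the MapCompose(unicode.strip) / Join() defaults above do, shown on
# plain values (Python 2, matching the snippet):
from scrapy.loader.processors import MapCompose, Join

mc = MapCompose(unicode.strip)
assert mc([u'  foo ', u'bar ']) == [u'foo', u'bar']  # strip each value
assert Join()([u'foo', u'bar']) == u'foo bar'        # join output with spaces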
def parse_book_url(self, response):
    book_item = BookDetails(book_id="", book_type="pdf")
    bil = ItemLoader(item=book_item, response=response)
    bil.add_xpath("book_id", "/*//script/text()", re=r'bookId\s*:\s*(.*),.*')
    bil.add_xpath("book_path", "/*//script/text()", re=r'getDownloadUrl\s*:\s*\"(.*)\".*')
    # bil.get_xpath()
    bil.load_item()
    download_url = self.base_url + book_item['book_path'][0]
    post_data = ("book_id=" + book_item['book_id'][0] +
                 "&" + "type=" + book_item['book_type'])
    # post_data = "book_id=" + "2759" + "&" + "type=" + book_item['book_type']
    # set header
    post_header = {}
    post_header["Content-Type"] = "application/x-www-form-urlencoded; charset=UTF-8"
    post_header["User-Agent"] = "Mozilla/5.0"
    # print post_header
    # print curl_cmd
    yield Request(download_url, self.get_book_link, headers=post_header,
                  method='POST', body=post_data)
def parse(self, response):
    l = ItemLoader(item=Problem(), response=response)
    d = pyq(response.body)
    l.add_value('id', response.url[-4:])
    l.add_value('title', d('#content_body > center:nth-child(1) > span').text())
    l.add_value('body', d('#content_body').text())
    return l.load_item()
def parse_item(self, response):
    item_loader = ItemLoader(item=MeiziItem(), response=response)
    # title
    item_loader.add_xpath('title', '//h2/a/text()')
    # image links
    item_loader.add_xpath('image', "//div[@id='picture']/p/img/@src", Identity())
    # post link: a literal value, so add_value rather than add_xpath
    item_loader.add_value('link', response.url)
    return item_loader.load_item()
def parse(self, response):
    item = PdfItem()
    loader = ItemLoader(response=response)
    pdf_path = '//*[contains(text(), "[PDF]")]'
    pdf_url_path = '%s//following-sibling::*' % pdf_path
    item['url'] = loader.get_xpath('%s' % pdf_url_path)
    item['title'] = loader.get_xpath('%s/text()' % pdf_url_path, TakeFirst())
    summary_path = '%s//parent::*//parent::*/*[@class="s"]/*' % pdf_url_path
    description_path = '%s/*[@class="st"]/*' % summary_path
    item['description'] = loader.get_xpath('%s/text()|%s/*/text()' % (description_path, description_path))
    similar_path = '%s/*[contains(@class, "f")]//a[contains(@href, "q=related:")]' % summary_path
    # similar_url = loader.get_xpath('%s/@href' % similar_path, TakeFirst())
    # yield Request(
    #     url=urlparse.urljoin(response.url, similar_url),
    #     callback=self.parse,
    #     meta=response.meta,
    #     dont_filter=True
    # )
    #
    # next_path = '//*[@class="pn"]'
    # next_url = loader.get_xpath('%s/@href' % next_path, TakeFirst())
    # yield Request(
    #     url=urlparse.urljoin(response.url, next_url),
    #     callback=self.parse,
    #     meta=response.meta,
    #     dont_filter=True
    # )
    pdf_url = item['url']
    print item
    if pdf_url:
        pdf_filename = os.path.basename(pdf_url)
        pdf_filepath = '%s/%s/%s' % (DOWNLOAD_DIR, SEARCH_TERM, pdf_filename)
        if self.download_files:
            self.download_file(pdf_url, pdf_filepath, response.url)
    yield item
def parse(self, response):
    # crawl all display pages
    for link in self.link_extractor['page_down'].extract_links(response):
        yield Request(url=link.url, callback=self.parse)
    # browser
    self.browser.get(response.url)
    time.sleep(5)
    # get the data and write it to scrapy items
    etaoItem_loader = ItemLoader(item=EtaoItem(), response=response)
    url = str(response.url)
    etaoItem_loader.add_value('url', url)
    etaoItem_loader.add_xpath('title', self._x_query['title'])
    etaoItem_loader.add_xpath('name', self._x_query['name'])
    etaoItem_loader.add_xpath('price', self._x_query['price'])
    yield etaoItem_loader.load_item()
def parse(self, response): for sel in response.css("ul#channels-browse-content-grid > li"): loader = ItemLoader(YoutubeVideo(), selector=sel) loader.add_xpath('link', './/h3/a/@href') loader.add_xpath('title', './/h3/a/text()') loader.add_xpath('views', ".//ul/li[1]/text()") yield loader.load_item()
def parse_item(self, response): self.logger.info("parse_item url %s" % response.url) l = ItemLoader(item=ImgDownloadItem(), response=response) l.add_xpath('name', '//h1[@class="article-title"]/a/text()') # l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p") l.add_xpath('image_urls', "//article[@class='article-content']/p/img/@src", Identity()) l.add_value('url', response.url) return l.load_item()
def parse_detail(self, response):
    print("response.url===", response.url)
    # the concrete page URL
    url = response.url
    # use the ItemLoader class
    item = ItemLoader(item=Meizitu2Item(), response=response)
    # note: 'tilte' must match the field name declared on Meizitu2Item
    item.add_xpath("tilte", "//h2/a/text()")
    item.add_xpath("image_urls", '//div[@id="picture"]//img/@src')
    # adding a literal value
    item.add_value("url", url)
    return item.load_item()
def parse_item(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
    # soup = BeautifulSoup(response)
    # item = SecooItem()
    # item['topic'] = response.xpath("//a[@target='_blank']/text()").extract()
    # print(item['topic'])
    # item['topic_href'] = response.xpath("//a[@target='_blank']/@href").extract()
    # print(item['topic_href'])
    # item['topic'] = soup.find(attrs={'class': 'seo_cont'}).a.extract()
    # print(item['topic'])
    # item['topic_href'] = soup.find(attrs={'class': 'seo_cont'}).a['href'].extract()
    # print(item['topic_href'])
    item = ItemLoader(item=SecooItem(), response=response)
    # add_css stores the extracted value on the item; a bare get_css discards it
    item.add_css('topic', '.seo_cont > a::text')
    return item.load_item()
def parse_products(self, response):
    hxs = HtmlXPathSelector(response)
    product_containers = hxs.xpath('//*[@class="product-container floatL"]')
    for product in product_containers.xpath('..//div/a'):
        l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
        item = l.load_item()
        item['product_url'] = product.xpath('@href').extract()[0]
        item['product_name'] = product.xpath('.//*[@class=""]/text()').extract()[0]
        item['price'] = product.xpath('div[3]/div[2]/div[1]/text()').extract()[0]
        item['quantity'] = product.xpath('div[3]/div[1]/span[1]/text()').extract()[0]
        item['vendor'] = 'LocalBanya'
        item['city'] = 'Mumbai'
        item['state'] = 'Maharashtra'
        item['country'] = 'India'
        item['date'] = str(time.strftime("%d/%m/%Y"))
        print item
        # yield each product instead of returning after the first one
        yield item
def _set_loader(self, response, xs, item): if not xs: self.from_detail_page = True item = response.request.meta["item"] if self.scraper.detail_page_content_type == "J": json_resp = json.loads(response.body_as_unicode()) self.loader = JsonItemLoader(item=item, selector=json_resp) else: self.loader = ItemLoader(item=item, response=response) else: self.from_detail_page = False if self.scraper.content_type == "J": self.loader = JsonItemLoader(item=item, selector=xs) else: self.loader = ItemLoader(item=item, selector=xs) self.loader.default_output_processor = TakeFirst() self.loader.log = self.log
def _set_loader(self, response, from_page, xs, item):
    self.from_page = from_page
    rpt = self.scraper.get_rpt(from_page)
    if self.from_page != 'MP':
        item = response.request.meta['item']
        if rpt.content_type == 'J':
            json_resp = json.loads(response.body_as_unicode())
            self.loader = JsonItemLoader(item=item, selector=json_resp)
        else:
            self.loader = ItemLoader(item=item, response=response)
    else:
        if rpt.content_type == 'J':
            self.loader = JsonItemLoader(item=item, selector=xs)
        else:
            self.loader = ItemLoader(item=item, selector=xs)
    self.loader.default_output_processor = TakeFirst()
    self.loader.log = self.log
def parse(self, response): for sel in response.css("ul#channels-browse-content-grid > li") loader = ItemLoader(YoutubeVideo(), selector = sel) loader.add_xpath('link', './/h3/a/@href') loader.add_xpath('title', './/h3/a/text()') loader.add_xpath('views', ".//ul/li[1]/text()") yield loader.load item()
def parse(self, response):
    content = response.body
    page = response.url.split("/")[-1]
    """
    content = Selector(response=response).xpath("//div[@class='body textStyle']").extract()
    if len(content):
        content = content[0]
        # strip tags
        strip = StripTags()
        content = strip.filterTags(content)
        # write to file
        filename = 'quotes-%s' % page
        with open(filename, 'w') as f:
            f.write(str(content))
        self.log('Saved file %s' % filename)
    """
    loader = ItemLoader(item=TutorialItem(), response=response)
    loader.add_xpath('title', "//title/text()")
    loader.add_xpath('content', "//div[@class='body textStyle']")
    data = loader.load_item()
    downFile = DownFile(data['content'][0], 'http://www.admin10000.com')
    downFile.downImgFile()
    mongo = Mongo("articles")
    mongo.setTable("admin10000")
    content = data['content'][0]
    # strip tags
    strip = StripTags()
    content = strip.filterTags(content)
    article = {'title': data['title'][0], 'content': content}
    mongo.add(article)
def parse_item(self, response):
    # parse http://www.meizitu.com/a/5336.html to get the image URLs
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_items(self, response):
    item = ItemLoader(Articulos(), response)
    item.add_xpath('title', '//*[@id="MainContainer"]/article/section[1]/div[1]/div/h2/text()')
    item.add_xpath('description', '//*[@id="MainContainer"]/article/section[1]/div[2]/ul/li[3]/text()')
    yield item.load_item()

# scrapy runspider multiplepages.py -o ../../resources/computrabajo.csv -t csv
def parse(self, response):
    items = ItemLoader(item=XsContentItem(), response=response)
    # chapter title
    items.add_xpath('title', '//*[@class="bookname"]/h1/text()')
    # body text
    items.add_xpath('text', '//*[@id="content"]/text()')
    yield items.load_item()
def parse(self, response):
    l = ItemLoader(item=JianshuArticleItem(), response=response)
    l.add_xpath('content', '//div[@class="article"]/div[@class="show-content"]/p/text()')
    l.add_value('url', response.url)
    return l.load_item()
def parse(self, response):
    try:
        page = response.url.split("/")[-1].split(".")[0]
        self.log('ID: %s' % page)
        book_name = response.css('h2::text').extract_first()
        # self.log('book_name: %s' % book_name)
        book_author = response.css('h4::text').extract_first().replace('Tác giả: ', '')
        # self.log('book_author: %s' % book_author)
        book_category = response.css('h4 a::text').extract_first().replace('Thể loại: ', '')
        # self.log('book_category: %s' % book_category)
        book_cover = response.xpath('//img[@class="img-thumbnail"]//@src').extract_first()
        # BookDownload = namedtuple('BookDownload', ['source', 'epub', 'mobi', 'pdf', 'azw3', 'prc'])
        # book_downloads = []
        # for book_download in response.css('div.book-download'):
        #     print(book_download.css('a::text').extract_first())
        #     bd = BookDownload._fields_defaults
        #     source = book_download.css('a::text').extract_first()
        #     epub = book_download.css('a')[1].extract()
        #     book_downloads.append(bd)
        #     self.log('source: %s' % source)
        #     self.log('epub: %s' % epub)
        # self.log('book_downloads: %s' % book_downloads)
        loader = ItemLoader(response=response)
        book_description = loader.get_xpath('//div[@class="book-description"]/node()', Join())
        # self.log('book_description: %s' % book_description)
    except Exception:
        self.log('ERROR in: %s' % response.url)
    yield {
        'id': page,
        'name': book_name,
        'author': book_author,
        'category': book_category,
        'description': book_description,
        'cover': book_cover,
    }
def parse_item(self, response): for e in response.xpath('//table[@id="basic"]/tbody/tr'): l = ItemLoader(ProxyHunterItem(), selector=e) l.add_xpath('ip', 'td[2]/a/text()') l.add_xpath('port', 'td[3]/text()') l.add_xpath('prot', 'td[4]/a/text()') yield l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    # l.add_xpath('name', '//div[@class="postContent"]/div[@id="picture"]/p/a/text()')
    # l.add_xpath('tags', '//div[@class="postContent"]')
    l.add_xpath('img_url', '//div[@class="text"]/p/br/img/@src', Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse(self, response):
    # jpy = PyQuery(response.text)
    print(response.text)
    item_loader = ItemLoader(item=City58Item(), response=response)
    item_loader.add_xpath('price', '//div[contains(@class, "house-pay-way")]/span[1]/b/text()')
    item_loader.add_xpath('pay_way', '//div[contains(@class, "house-pay-way")]/span[2]/text()')
    item_loader.add_xpath('house_type', '//div[contains(@class,"house-desc-item")]/ul/li[2]/span[2]/text()')
    return item_loader.load_item()
def parse_start_url(self, response):
    sel = Selector(response)
    for row in sel.xpath('//table/tr'):
        if not row.xpath('td'):
            continue
        item = ItemLoader(item=song(), selector=row)
        item.add_value('request_page', response.url)
        item.add_xpath('title', 'td[3]/a[1]/text()')
        item.add_xpath('artist', 'td[3]/a[2]/text()')
        item.add_xpath('album', 'td[4]/text()')
        loaded_item = item.load_item()
        yield loaded_item
def parse_html_item(self, response, loop, fields):
    meta = response.meta
    hxs = Selector(response)
    self.macro.update({'URL': response.url, 'keyword': meta.get('keyword', '')})
    for e in hxs.xpath(loop or '(//*)[1]'):
        loader = ItemLoader(item=Item(), selector=e)
        for k, v in fields.iteritems():
            if 'value' in v:
                get_v_x = loader.get_value
                v_x = v.get('value')
            elif 'css' in v:
                get_v_x = loader.get_css
                v_x = v.get('css')
            elif 'xpath' in v:
                get_v_x = loader.get_xpath
                v_x = v.get('xpath')
            else:
                log.msg(u'field [{}] should contain "value", "xpath" or "css"'.format(k),
                        level=log.WARNING)
                continue
            val = get_v_x(
                self.macro.expand(v_x, meta),
                parser.make_parser(v.get('parse', {})),
                re=v.get('regex')
            )
            if not val and 'default' in v:
                val = arg_to_iter(self.macro.expand(v.get('default'), meta))
            if not (val or v.get('multi') or v.get('opt')):
                log.msg(u'field [{}] is empty:\n{}'.format(k, loader.load_item()),
                        level=log.WARNING)
                break
            loader.add_value(k, val)
        else:
            yield loader.load_item()
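# A fields mapping this generic loader would accept, inferred from the keys
# it reads above ('value'/'css'/'xpath' plus optional 'parse', 'regex',
# 'default', 'multi', 'opt'); the concrete values are illustrative only:
fields = {
    'title': {'xpath': '//h1/text()'},
    'price': {'css': '.price::text', 'regex': r'[\d.,]+', 'default': '0'},
    'source': {'value': '{URL}', 'opt': True},  # expanded via self.macro
}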
def parse(self, response):
    js = response.xpath('//div[@id="proxylist"]/following-sibling::script/text()').extract()[0]
    self.js_init(js)
    for e in response.xpath('//table[@class="proxytbl"]//tr[td]'):
        l = ItemLoader(ProxyHunterItem(), selector=e)
        l.add_xpath('ip', 'td[1]/text()')
        l.add_xpath('port', 'td[2]/script/text()', lambda xs: self.js_calc(xs[0]))
        l.add_xpath('prot', 'td[5]',
                    lambda xs: 'http' if remove_tags(xs[0]).strip() == '-' else 'https')
        yield l.load_item()