class OpenJurSpider(CrawlSpider):
    # Crawls court decisions from openjur.de (the fields below are German legal metadata).
    name = "openjur"
    allowed_domains = ["openjur.de"]
    start_urls = ('http://openjur.de/u.html', )
    rules = [
        Rule(LinkExtractor(allow=[r"/u/[0-9]+\.html"]), callback="parse_item"),
        Rule(LinkExtractor(allow=[r"/u(-[0-9]+)?\.html"])),
    ]

    def parse_item(self, response):
        item = PicItem()
        item['url'] = response.url
        item['gericht'] = response.css(
            "#info > ul:nth-child(1) > li:nth-child(1) > p:nth-child(2)").css(
                "a[href*=http]::text").extract()
        item['datum'] = response.css(
            "#info > ul:nth-child(1) > li:nth-child(2) > p:nth-child(2)::text"
        ).extract()
        item["AZ"] = response.css(
            "#info > ul:nth-child(1) > li:nth-child(3) > p:nth-child(2)::text"
        ).extract()
        item['typ'] = response.css(
            "#info > ul:nth-child(1) > li:nth-child(4) ::text").extract()
        item['text'] = response.css("#text").extract()
        item['verfahrensgang'] = [
            " ".join(
                response.css(
                    ".instanzen > p:nth-child(2) > a:nth-child(1)::text"
                ).extract() +
                response.css(
                    ".instanzen > p:nth-child(2) > i:nth-child(2)::text"
                ).extract())
        ]
        item['rechtsgebiete'] = response.css(".rechtsgebiete").extract()
        print(item['url'])
        yield item
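# The spider above assumes a PicItem defined in the project's items module; a
# minimal sketch with the fields the assignments above imply (field meanings
# inferred from the German names):
import scrapy


class PicItem(scrapy.Item):
    url = scrapy.Field()
    gericht = scrapy.Field()         # court
    datum = scrapy.Field()           # date
    AZ = scrapy.Field()              # case number (Aktenzeichen)
    typ = scrapy.Field()             # decision type
    text = scrapy.Field()            # full decision text
    verfahrensgang = scrapy.Field()  # procedural history
    rechtsgebiete = scrapy.Field()   # areas of law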
class ThreadsSpider(CrawlSpider):
    name = "threads"
    allowed_domains = ["reddit.com"]
    start_urls = (
        "https://www.reddit.com/r/",  # enter the Reddit thread to crawl here
    )
    rules = (
        Rule(LinkExtractor(restrict_xpaths=(".//div[@class='nav-buttons']")),
             follow=True),
        Rule(LinkExtractor(restrict_xpaths=(
            ".//div[@class='content']//p[@class='parent']/a[@class='title']")),
             callback="parse_item"),
    )

    def parse_item(self, response):
        # Thread-level fields are extracted once, then repeated on every comment item.
        thread = response.xpath(".//p[@class='title']/a/text()").extract()
        op = response.xpath(".//div[contains(@class, 'self')]//p[@class='tagline']/a[contains(@class, 'author')]/text()").extract()
        thread_date = response.xpath(".//div[contains(@class, 'self')]//p[@class='tagline']/time/@title").extract()
        textpost = response.xpath(".//div[contains(@class, 'self')]//div[@class='md']//text()").extract()
        comments = response.xpath(".//div[contains(@class, 'self')]//a[contains(@class, 'comments')]/text()").extract()
        vote_points = response.xpath(".//div[@class='linkinfo']/div[@class='score']/span[@class='number']/text()").extract()
        upvoted = response.xpath(".//div[@class='linkinfo']/div[@class='score']/text()").extract()
        rows = response.xpath(".//div[@class='commentarea']//div[contains(@class, 'comment')]/div[contains(@class, 'entry')]")
        for row in rows:
            l = CommentItemLoader(item=CommentItem(), response=response)
            l.add_value("url", response.url)
            l.add_value("thread", thread)
            l.add_value("op", op)
            l.add_value("thread_date", thread_date)
            l.add_value("textpost", textpost)
            l.add_value("comments", comments)
            l.add_value("vote_points", vote_points)
            l.add_value("upvoted", upvoted)
            l.add_value("comment", row.xpath(".//div[contains(@class, 'usertext-body')]//text()").extract())
            l.add_value("user", row.xpath(".//p[@class='tagline']/a[contains(@class, 'author')]/text()").extract())
            l.add_value("time", row.xpath(".//p[@class='tagline']/time/@title").extract())
            yield l.load_item()
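# CommentItem and CommentItemLoader above are assumed to live in the project's
# items module; a minimal sketch (field names come from the add_value calls,
# the processors are an assumption):
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import Join, TakeFirst


class CommentItem(scrapy.Item):
    url = scrapy.Field()
    thread = scrapy.Field()
    op = scrapy.Field()
    thread_date = scrapy.Field()
    textpost = scrapy.Field()
    comments = scrapy.Field()
    vote_points = scrapy.Field()
    upvoted = scrapy.Field()
    comment = scrapy.Field()
    user = scrapy.Field()
    time = scrapy.Field()


class CommentItemLoader(ItemLoader):
    default_item_class = CommentItem
    default_output_processor = TakeFirst()
    comment_out = Join()   # comment bodies span many text nodes; join them
    textpost_out = Join()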
class LeMondeSpider(CrawlSpider):
    name = "lemonde"
    allowed_domains = ["lemonde.fr"]
    start_urls = [
        "http://www.lemonde.fr/",
    ]
    article_item_fields = {
        'title': './/article/h1/text()',
        #'Author': './/article/p[@class="bloc_signature"]/span[@class="signature_article"]/span[@itemprop="author"]/a.text()',
        #'Publisher': './/article/p[@class="bloc_signature"]/span[@id="publisher"]/text()',
        'timestamp': './/article/p[@class="bloc_signature"]/time[@itemprop="datePublished"]/@datetime',
        'body': './/article/div[@id="articleBody"]/*',
    }
    rules = (
        # Extract links matching the article URL pattern, e.g. article/2015/01/31/...
        Rule(LinkExtractor(allow=(r"article/\d{4}/\d{2}/\d{2}/.+")),
             callback="parse_article", follow=True),
    )

    def parse_article(self, response):
        """Parse a single article page with an item loader.

        Spider contracts could document this callback; see
        http://doc.scrapy.org/en/latest/topics/contracts.html
        """
        loader = ItemLoader(item=LeMondeArt(), response=response)
        self.log('\n\nA response from %s just arrived!' % response.url)
        # define processors
        text_input_processor = MapCompose(str.strip)
        loader.default_output_processor = Join()
        # populate the LeMonde item with the item loader
        for field, xpath in self.article_item_fields.items():
            try:
                loader.add_xpath(field, xpath, text_input_processor)
            except ValueError:
                self.log("XPath %s not found at url %s" % (xpath, response.url))
        #loader.add_value("Url", response.url)
        yield loader.load_item()
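# LeMondeArt is assumed to come from the project's items module; a minimal
# sketch covering the fields used above (including the commented-out ones):
import scrapy


class LeMondeArt(scrapy.Item):
    title = scrapy.Field()
    timestamp = scrapy.Field()
    body = scrapy.Field()
    Author = scrapy.Field()
    Publisher = scrapy.Field()
    Url = scrapy.Field()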
import json


class BitcoinAddrSpider(CrawlSpider):
    # class-level dict accumulating {username: [bitcoin addresses]} across the crawl
    item = {}
    name = "bitcoinaddr"
    allowed_domains = ["bitcointalk.org"]
    start_urls = ["https://bitcointalk.org/index.php?board=1.0"]
    rules = (
        Rule(LinkExtractor(allow=('board'))),
        Rule(LinkExtractor(allow=('topic')), callback='parse_item'),
    )

    def parse_item(self, response):
        for line in response.css(".signature"):
            # match a legacy (P2PKH) Bitcoin address in the signature text
            matches = line.xpath('text()').re(r"(1[1-9A-HJ-NP-Za-km-z]{26,33})")
            if not matches:
                continue
            addr = matches[0]
            username = line.xpath('../../../tr[1]/td[1]/b/a/text()').extract()[0]
            if username == "" or addr == "":
                continue
            try:
                self.item[username].append(addr)
                self.item[username] = list(set(self.item[username]))
            except KeyError:
                self.item[username] = [addr]
            print(username, ':', addr)

    def closed(self, reason):
        # dump the accumulated mapping when the spider finishes
        with open('output.json', 'w') as file_handle:
            file_handle.write(json.dumps(self.item))
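# Alternative to hand-writing JSON in closed(): yield one item per (user,
# address) pair and let Scrapy's feed exports serialize them. A sketch assuming
# Scrapy >= 2.1 (the FEEDS setting); the subclass name and output path are
# hypothetical:
class BitcoinAddrExportSpider(BitcoinAddrSpider):
    name = "bitcoinaddr_export"
    custom_settings = {
        "FEEDS": {"addresses.json": {"format": "json"}},
    }

    def closed(self, reason):
        pass  # the feed exporter owns the output file now

    def parse_item(self, response):
        for line in response.css(".signature"):
            matches = line.xpath('text()').re(r"(1[1-9A-HJ-NP-Za-km-z]{26,33})")
            if not matches:
                continue
            username = line.xpath('../../../tr[1]/td[1]/b/a/text()').extract()
            if username:
                yield {"user": username[0], "address": matches[0]}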
class WallpapersSpider(CrawlSpider):
    name = 'wallpapers'
    allowed_domains = ['simpledesktops.com']
    start_urls = ['http://simpledesktops.com/browse/']
    rules = (
        Rule(LinkExtractor(
            allow=r'browse/desktops/20[0-9]{2}/(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/[0-9]{2}'),
            callback='parse_item'),
        # pagination: \d+ so multi-digit page numbers are followed as well
        Rule(LinkExtractor(allow=r'browse/\d+/$'), follow=True),
    )

    def parse_item(self, response):
        i = SimpledesktopsItem()
        i['image_urls'] = [
            'http://{0}/{1}'.format(
                self.allowed_domains[0],
                response.xpath('//div[@class="desktop"]/a/@href').extract()[0])
        ]
        return i
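# response.urljoin() resolves an href against response.url, which avoids the
# hand-assembled 'http://{domain}/{path}' string above and copes with relative
# or protocol-relative links. A sketch of the same callback using it:
def parse_item(self, response):
    href = response.xpath('//div[@class="desktop"]/a/@href').extract_first()
    item = SimpledesktopsItem()
    item['image_urls'] = [response.urljoin(href)]
    return item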
class ArticleSpider(CrawlSpider):
    name = 'articleItems'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(LinkExtractor(allow=r'(/wiki/)((?!:).)*$'),
             callback='parse_items', follow=True)
    ]

    def parse_items(self, response):
        article = Article()
        article['url'] = response.url
        article['title'] = response.css('h1::text').extract_first()
        article['text'] = response.xpath(
            '//div[@id="mw-content-text"]//text()').extract()
        lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
        article['lastUpdated'] = lastUpdated.replace(
            'This page was last edited on ', '')
        return article
class CookingLightSpider(CrawlSpider):
    name = 'myrecipes'
    allowed_domains = ['myrecipes.com']
    start_urls = ["http://myrecipes.com/"]
    rules = (
        Rule(LinkExtractor(allow=".*/recipe/.*"), callback="parse_item"),
        Rule(LinkExtractor(deny=[
            ".*/how-to/video/.*", ".*/r/.*", ".*/about-us/.*",
            ".*/contact-us/.*", ".*/frequently-asked-questions/.*",
            ".*/press/.*", ".*/rss/.*", ".*/sitemap/.*"
        ])),
    )

    def __init__(self):
        super(CookingLightSpider, self).__init__()
        self.seen_recipes = set()

    def parse_item(self, response):
        item = RecipeItem()
        item['url'] = response.url
        if item['url'] not in self.seen_recipes:
            self.seen_recipes.add(item['url'])
            return item
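# The recipe spiders in this collection (myrecipes, cooks, spoonful,
# steamykitchen) share a RecipeItem and an in-spider dedup set. A minimal sketch
# of the assumed item plus an equivalent pipeline-based dedup (the pipeline
# class name is hypothetical):
import scrapy
from scrapy.exceptions import DropItem


class RecipeItem(scrapy.Item):
    url = scrapy.Field()


class DuplicateUrlPipeline:
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        if item['url'] in self.seen:
            raise DropItem("duplicate recipe: %s" % item['url'])
        self.seen.add(item['url'])
        return item

# enable with: ITEM_PIPELINES = {"myproject.pipelines.DuplicateUrlPipeline": 300}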
import re


class CooksSpider(CrawlSpider):
    name = 'cooks'
    USER_AGENT = "Mozilla/5.0"
    allowed_domains = ['cooks.com']
    start_urls = ["http://www.cooks.com/rec/browse/"]
    rules = (
        Rule(LinkExtractor(allow=r".*/recipe/[\w\d]{8}/[\w\d-]+\.html"),
             callback='parse_item'),
        Rule(LinkExtractor(allow=r".*/rec/new_recipes_\w+\.html")),
    )

    def __init__(self):
        super(CooksSpider, self).__init__()
        self.seen_recipes = set()

    def parse_item(self, response):
        item = RecipeItem()
        item['url'] = re.sub(r"\?.*", "", response.url)  # strip the query string
        if item['url'] not in self.seen_recipes:
            self.seen_recipes.add(item['url'])
            return item
class AIweeklySpider(CrawlSpider):
    name = "aiweekly"
    allowed_domains = ["aiweekly.co"]
    start_urls = [
        "http://aiweekly.co/issues/1#start",
    ]
    rules = [
        Rule(LinkExtractor(allow=r"/issues/[0-9]+#start"), "parse_item"),
        Rule(LinkExtractor(allow=r"/issues/[0-9]+"), "parse_item"),
    ]

    def parse_item(self, response):
        item = AiweeklyItem()
        item['title'] = response.xpath(
            "//h3[@class='item__title']/a/text()").extract()
        item['link'] = response.xpath(
            "//h3[@class='item__title']/a/@href").extract()
        return item
def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        url = urljoin(response.url, link.url)
        yield scrapy.Request(
            url,
            self.parse_link,
            meta={'splash': {
                'args': {'har': 1, 'html': 0},
            }})
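# The same request can be written with scrapy-splash's SplashRequest helper,
# which builds the splash meta for you. A sketch, assuming the scrapy-splash
# middlewares are enabled in settings; note that LinkExtractor already returns
# absolute URLs, so the urljoin above is redundant:
from scrapy_splash import SplashRequest

def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        yield SplashRequest(link.url, self.parse_link,
                            args={'har': 1, 'html': 0})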
import codecs
import os


class GFGSpider(CrawlSpider):
    name = 'geeksforgeeks'
    allowed_domains = ['geeksforgeeks.org']
    rules = (
        Rule(LinkExtractor(restrict_xpaths=('//a[@class="nextpostslink"]', )),
             follow=True),
        Rule(LinkExtractor(restrict_xpaths=('//h2[@class="post-title"]/a', )),
             callback='parse_item'),
    )

    def __init__(self, category='tag', name='dynamic-programming', *args, **kwargs):
        super(GFGSpider, self).__init__(*args, **kwargs)
        self.dest = "../geeksforgeeks-books/" + name + "/"
        self.start_urls = [
            'http://www.geeksforgeeks.org/' + category + '/' + name
        ]
        self.doc_name = name

    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        if not os.path.exists(self.dest):
            os.makedirs(self.dest)
        with open(self.dest + 'metadata.xml', 'w') as metadata:
            metadata.write(
                '<dc:title>' + " ".join(self.doc_name.title().split('-')) +
                '</dc:title>\n<dc:language>en-US</dc:language>\n'
                '<dc:date opf:event="publication">2015-2-19</dc:date>\n'
                '<dc:rights>Creative Commons Attribution-NonCommercial-NoDerivs '
                '2.5 India (CC BY-NC-ND 2.5 IN)</dc:rights>')
        with codecs.open(self.dest + response.url.split('/')[-2] + ".html",
                         'w', 'utf-8') as file_handle:
            file_handle.write(response.text)  # body_as_unicode() is deprecated
class PicSpider(CrawlSpider):
    name = "pic"
    # allowed_domains takes bare domain names, not URLs
    allowed_domains = ["reddit.com"]
    # the trailing comma matters: without it start_urls is a plain string
    start_urls = (
        'http://www.reddit.com/r/pics',
    )
    rules = [
        Rule(LinkExtractor(allow=['.*']))  # e.g. '/r/pics/\?count=\d*&after=\w*'
    ]

    # Careful: CrawlSpider uses parse() internally, so overriding it disables
    # the rules above; rule callbacks should use other names (e.g. parse_item).
    def parse(self, response):
        pass
class TechSpider(CrawlSpider):
    name = "news"
    allowed_domains = ["tech.163.com"]
    start_urls = ["http://tech.163.com/"]
    rules = [Rule(LinkExtractor(allow=(r"/17/\d+/\d+/*")), 'parse_item')]

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        # Extractor is assumed to be a block-based main-text extractor
        ext = Extractor(rawPage=response.text, blockSize=5, image=False)
        item['content'] = ext.getContext()
        yield item
class SpoonfulSpider(CrawlSpider):
    name = 'spoonful'
    allowed_domains = ['spoonful.com', 'family.disney.com', 'disney.com']
    start_urls = ["http://family.disney.com/recipes"]
    rules = (
        Rule(LinkExtractor(allow=".*/recipes/page/.*")),
        Rule(LinkExtractor(allow=".*/recipes/.*", deny=[".*/recipes/page/.*"]),
             callback='parse_item'),
    )

    def __init__(self):
        super(SpoonfulSpider, self).__init__()
        self.seen_recipes = set()

    def parse_item(self, response):
        item = RecipeItem()
        item['url'] = response.url
        if item['url'] not in self.seen_recipes:
            self.seen_recipes.add(item['url'])
            return item
def __init__(self):
    rs = ReadSetting()  # read the configuration values
    self.start_urls = rs.readurl()
    self.linkmatrix = LinkMatrix(rs.projectname())
    self.linkmatrix.setroot(self.start_urls)
    self.allowed_domains = rs.readalloweddomain()
    self.xpath = rs.readxpath()
    # Crawl rule: follow every URL. Requests outside the allowed domains are
    # filtered out by the spider middlewares (every request passes through
    # them), and each resulting response is handed to parse_start_url.
    self.rules = [Rule(LinkExtractor(), follow=True, callback="parse_start_url")]
    super(XpathSpider, self).__init__()
class UrlsSpider(CrawlSpider):
    name = "urls"
    allowed_domains = domains
    start_urls = [
        start_link,
    ]
    rules = (
        Rule(LinkExtractor(allow=()), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        pass
class ImgurScrappingSpider(CrawlSpider):
    name = 'ImgurScrapping'
    allowed_domains = ['imgur.com']
    start_urls = ['http://www.imgur.com']
    rules = [Rule(LinkExtractor(allow=['/gallery/.*']), 'parse_imgur')]

    def parse_imgur(self, response):
        image = ImgurscrappingItem()
        image['title'] = response.xpath("//h1/text()").extract()
        relative_address = response.xpath("//img/@src").extract()
        # imgur serves protocol-relative src attributes like //i.imgur.com/...
        image['image_urls'] = ['http:' + relative_address[0]]
        return image
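# The image_urls field suggests this project uses Scrapy's built-in images
# pipeline, which downloads every URL in image_urls and records the results in
# an images field. A sketch of the assumed item and the settings that wire the
# pipeline up (the IMAGES_STORE path is an assumption):
import scrapy


class ImgurscrappingItem(scrapy.Item):
    title = scrapy.Field()
    image_urls = scrapy.Field()  # consumed by ImagesPipeline
    images = scrapy.Field()      # populated by ImagesPipeline

# in settings.py:
# ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
# IMAGES_STORE = "./downloaded_images"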
import re


class SteamyKitchenSpider(CrawlSpider):
    name = 'steamykitchen'
    USER_AGENT = "Mozilla/4.0"
    allowed_domains = ['steamykitchen.com']
    start_urls = ["http://steamykitchen.com/category/recipes"]
    rules = (
        Rule(LinkExtractor(allow=r".*\.com/\d+-.*\.html"), callback='parse_item'),
        Rule(LinkExtractor(allow=r".*/category/recipes/page/\d+")),
    )

    def __init__(self):
        super(SteamyKitchenSpider, self).__init__()
        self.seen_recipes = set()

    def parse_item(self, response):
        item = RecipeItem()
        item['url'] = re.sub(r"\?.*", "", response.url)  # strip the query string
        if item['url'] not in self.seen_recipes:
            self.seen_recipes.add(item['url'])
            return item
import re

from bs4 import BeautifulSoup


class DrinkSpider(CrawlSpider):
    """The DrinkSpider."""
    name = "drinks"
    allowed_domains = ["drinksmixer.com"]
    start_urls = [
        "http://www.drinksmixer.com/cat/1/%d" % p for p in range(1, 125)
    ]
    rules = (
        Rule(LinkExtractor(allow=r'http://www\.drinksmixer\.com/.*\.html'),
             callback='parse_drink'),
    )

    def parse_drink(self, response):
        """Parse a drink page.

        Note that this function has no notion of what the drink is
        (cocktail, shot, ...).
        """
        drink = DrinkItem()
        soup = BeautifulSoup(response.body, "lxml")
        title = soup.find("title").contents[0]
        # Strip the trailing "recipe" as a suffix; str.rstrip("recipe") would be
        # wrong here, since rstrip removes characters, not a suffix.
        if title.endswith("recipe"):
            title = title[:-len("recipe")]
        recipe = soup.find("div", {"class": "RecipeDirections instructions"})
        description = soup.find("div", {"class": "summary RecipeDirections"})
        ingredients = soup.find("div", {"class": "ingredients"})
        drink['ingredients'] = {}
        if ingredients:
            for ingredient in ingredients.find_all("span", {"class": "ingredient"}):
                if ingredient.contents:
                    value = ingredient.find("span", {"class": "amount"}).contents[0]
                    key = ingredient.find("span", {"class": "name"}).find("a")['href']
                    key = re.findall(r'\d+', key)[0]
                    drink['ingredients'][key] = value
        drink['name'] = title.rstrip()
        drink['recipe'] = recipe.contents[0].rstrip() if recipe else ""
        drink['description'] = description.contents[0].rstrip() if description else ""
        return drink
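# str.rstrip takes a character set, not a suffix, which is why parse_drink
# slices the title instead; a quick demonstration, plus the Python 3.9+
# str.removesuffix alternative:
assert "nice".rstrip("recipe") == "n"             # rstrip eats characters
assert "nice".removesuffix("recipe") == "nice"    # removesuffix matches once, or not at all
assert "margarita recipe".removesuffix("recipe") == "margarita "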
class ZZSpider(CrawlSpider):
    name = "zz_bagirov"
    allowed_domains = ["bagirov.livejournal.com"]
    start_urls = ["http://bagirov.livejournal.com/446032.html"]
    rules = (
        Rule(LinkExtractor(
            allow=(r'http://bagirov.livejournal.com/\d+\.html', ),
            deny=('tag', 'reply', 'thread', 'page'),
        ), callback='parse_page', follow=True),
    )

    def parse_start_url(self, response):
        # parse_start_url must return the results, not just consume the generator
        return self.parse_page(response)

    def parse_page(self, response):
        # use scrapy shell to find xpaths:
        # from scrapy.shell import inspect_response
        # inspect_response(response)
        item = ScraperItem()
        item['url'] = response.url
        try:
            item['title'] = response.xpath(
                "//div[@class='asset-header-content-inner']/h2/a/text()"
            ).extract()[0]
        except IndexError:
            item['title'] = ""
        try:
            item['text'] = " ".join(
                response.xpath(
                    "//div[@class='asset-content']/child::node()").extract())
        except IndexError:
            item['text'] = ''
        try:
            item['date'] = response.xpath(
                "//abbr[@class='datetime']/text()").extract()[0]
        except IndexError:
            item['date'] = ''
        try:
            item['comment_count'] = response.xpath(
                "//div[@class='comments-nav']/text()").extract()[0]
        except IndexError:
            item["comment_count"] = "0"
        yield item
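# ScraperItem, shared by the two LiveJournal spiders in this collection, is
# assumed to look roughly like this:
import scrapy


class ScraperItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    comment_count = scrapy.Field()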
import logging
import re

from bs4 import BeautifulSoup
from scrapy import Request


class ForumsSpider(CrawlSpider):
    name = "nosurrender"
    allowed_domains = ["nosurrenderbreastcancersupportforum.com"]
    start_urls = [
        "http://www.nosurrenderbreastcancersupportforum.com/",
    ]
    rules = (
        # Follow links into the individual forums and parse each topic listing.
        Rule(LinkExtractor(
            restrict_xpaths='//td[contains(@valign,"top")]/table[contains(@class,"tables")]//a[contains(@class,"forum")]',
        ), callback='internallist'),
    )

    def internallist(self, response):
        links = response.xpath(
            'id("main_container")/div[2]/form[1]/table/tbody/tr[1]/td/table/tbody/tr/td[2]/a/@href'
        ).extract()
        for link in links:
            # the callback name must match the method defined below
            yield Request(link, callback=self.parsePostsList)

    def parsePostsList(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        users = soup.findAll('a', {'class': re.compile(r'usergroup\d.*')})
        items = []
        topic = response.xpath(
            '//tbody/tr[2]/td[2]/table/tbody/tr[1]/td/div/b').extract()
        url = response.url
        for x in range(len(users)):
            item = PostItemsList()
            item['author'] = users[x].text
            item['author_link'] = users[x]['href']
            item['create_date'] = soup.findAll(
                'span', {'id': re.compile('posted_date_.*')})[x].text
            item['post'] = soup.findAll(
                'span', {'id': re.compile('post_message.*')})[x].text
            item['tag'] = 'cancer'
            item['topic'] = topic
            item['url'] = url
            logging.info(str(item))  # __str__ must be called, not just referenced
            items.append(item)
        return items
class ZZSpider(CrawlSpider):
    name = "zz_pesen-net"
    allowed_domains = ["pesen-net.livejournal.com"]
    start_urls = [
        "http://pesen-net.livejournal.com/82406.html"
        # adult: http://pesen-net.livejournal.com/71163.html
        # "http://pesen-net.livejournal.com/70709.html"
    ]
    rules = (
        Rule(LinkExtractor(
            deny=('tag', 'reply', 'thread', 'page'),
            restrict_xpaths=("//span[@class='entry-linkbar-inner']"),
        ), callback='parse_page', follow=True),
    )

    def parse_start_url(self, response):
        # parse_start_url must return the results, not just consume the generator
        return self.parse_page(response)

    def parse_page(self, response):
        # use scrapy shell to find xpaths:
        # from scrapy.shell import inspect_response
        # inspect_response(response)
        item = ScraperItem()
        item['url'] = response.url
        try:
            item['title'] = response.xpath(
                "//dt[@class='entry-title']/text()").extract()[0]
        except IndexError:
            item['title'] = ""
        item['text'] = " ".join(
            response.xpath(
                "//div[@class='entry-content']/child::node()").extract())
        try:
            item['date'] = response.xpath(
                "//abbr[@class='updated']/text()").extract()[0]
        except IndexError:
            item['date'] = ''
        try:
            item['comment_count'] = response.xpath(
                "//span[@class='comments-count']/text()").extract()[0]
        except IndexError:
            item["comment_count"] = "0"
        yield item
class ExampleSpider(CrawlSpider):
    name = "souhunews"
    allowed_domains = ["business.sohu.com"]
    start_urls = ['http://business.sohu.com/']
    rules = (
        Rule(LinkExtractor(allow=r"/20161212+/*"),
             callback="parse_news", follow=True),
    )

    def printcn(self, uni):
        for i in uni:
            print(i)

    def parse_news(self, response):
        item = FinancesouhuItem()
        item['news_thread'] = response.url.strip().split('/')[-1][:-6]
        self.get_title(response, item)
        self.get_time(response, item)
        self.get_url(response, item)
        self.get_news_from(response, item)
        self.get_text(response, item)
        # remember to return the item after parsing
        return item

    def get_title(self, response, item):
        title = response.xpath("/html/head/title/text()").extract()
        if title:
            item['news_title'] = title[0][:-5]

    def get_time(self, response, item):
        time = response.xpath("//div[@id='pubtime_baidu']/text()").extract()
        if time:
            item['news_time'] = time[0]

    def get_news_from(self, response, item):
        news_from = response.xpath(
            "//span[@id='media_span']/span/text()").extract()
        if news_from:
            item['news_from'] = news_from[0]

    def get_text(self, response, item):
        news_body = response.xpath(
            "//div[@id='contentText']/div[1]/p/text()").extract()
        if news_body:
            item['news_body'] = news_body

    def get_url(self, response, item):
        item['news_url'] = response.url
import calendar
import time


class TiwagSpider(CrawlSpider):
    name = "tiwag"
    allowed_domains = ["dietiwag.org"]
    start_urls = [
        "http://www.dietiwag.org/phorum_2/list.php?f=2",
        "http://www.dietiwag.org/phorum_2/read.php?f=2&i=120470&t=120470",
    ]
    rules = (
        Rule(LinkExtractor(allow=('list.php'))),
        Rule(LinkExtractor(allow=(r'read\.php', )), callback='parse_item'),
    )

    def clean_str(self, val):
        return val.replace(u'\xa0', u'').replace('\n', '').strip()

    def parse_item(self, response):
        comments = response.xpath('//font[@class="PhorumMessage"]')
        items = []
        for one in comments:
            item = Comment()
            item['url'] = response.url
            texts = one.xpath('text()').extract()
            item['author'] = self.clean_str(texts[0]).replace('Autor:', '')
            item['text'] = self.clean_str(' '.join(texts[2:len(texts)]))
            try:
                # parse e.g. "12-03-15 14:30" into a UTC timestamp
                item['date'] = calendar.timegm(
                    time.strptime(
                        self.clean_str(texts[1]).replace('Datum:', ''),
                        "%d-%m-%y %H:%M"))
            except ValueError:
                print("Error: " + ' '.join(texts))
                continue
            items.append(item)
        return items
import os


class zqbSpider(CrawlSpider):
    name = site_name
    allowed_domains = [
        "cankaoxiaoxi.com",
    ]
    start_urls = url_list
    rules = (
        Rule(
            LinkExtractor(allow=(r'/\d{4}/\d{4}/\S*\.shtml')),
            callback='parse_data',
            follow=True,
        ),
        Rule(LinkExtractor(
            allow=(r'/history/index/\d{4}-\d{2}/\d{2}-\d{2}\.shtml')),
            follow=True),
    )

    def parse_data(self, response):
        # derive the publish time from the URL and store the files by year/month
        year = response.url.split('/')[3]
        month = response.url.split('/')[4][0:2]
        path = data_dir + '/' + year + '/' + month
        if not os.path.exists(path):
            os.makedirs(path)
        # get the title
        if response.xpath('//h1/text()').extract():
            title = response.xpath('//h1/text()').extract()[0]
        else:
            title = response.xpath('//h2/text()').extract()[0]
        # get the content
        content_list = response.xpath(
            '//div[@id="ctrlfscont"]//p/text()').extract()
        content = "".join(content_list).strip().encode("utf-8")
        # if the title or the content is empty, we landed on the wrong page,
        # so do not create the file
        if title and content:
            filename = path + '/' + title + '.txt'
            with open(filename, 'wb') as f:
                f.write(content)
class AdidasSpider(CrawlSpider):
    name = "adidas"
    allowed_domains = ["adidas.com"]
    start_urls = [
        "http://www.adidas.com/us/shoes",
    ]
    rules = (
        # Rule to go to the single product pages and run the parsing function.
        # Excludes links that end in _W.html or _M.html, because they point to
        # configuration pages that aren't scrapeable (and are mostly redundant anyway).
        Rule(LinkExtractor(
            restrict_xpaths='//a[contains(@class,"product-link")]',
            deny=(r'_[WM]\.html', )),
            callback='singleProductParse'),
        # Rule to follow the arrow to the next product grid.
        Rule(LinkExtractor(
            restrict_xpaths='//li[@class="pagging-arrow right-arrow"]'),
            follow=True),
    )

    # Parse information from a single product page.
    def singleProductParse(self, response):
        item = ProductItem()
        item['brand'] = 'Adidas'
        item['name'] = response.css('.title-32').xpath('text()').extract()[0]
        desc = response.css('.title-16').xpath('text()').extract()[0].strip()
        try:
            item['division'], item['category'] = desc.split(" ", 1)
        except ValueError:
            item['category'] = desc
            item['division'] = 'None'
        item['division'] = item['division'].replace("'s", "")
        item['price'] = response.css('span.sale-price').xpath(
            'text()').extract()[0].strip()
        item['image_link'] = response.css(
            'img.productimagezoomable::attr(src)').extract()[0]
        return item
from urllib.parse import urljoin

import scrapy


class MemriseSpider(CrawlSpider):
    name = "memrise"
    allowed_domains = ["memrise.com"]
    start_urls = [
        "http://www.memrise.com/login/",
    ]
    rules = (
        Rule(LinkExtractor(allow='login'), callback='do_login'),
    )

    def do_login(self, response):
        # fill_login_form (from the loginform library) locates the login form
        # and returns the form data, action URL and method
        args, url, method = fill_login_form(response.url, response.body,
                                            USER, PASSWORD)
        return scrapy.FormRequest(url, method=method, formdata=args,
                                  callback=self.parse_dashboard)

    def parse_dashboard(self, response):
        # if the course has several levels, scrape them all
        pagination_selector = '//div[contains(@class, "title")]/a/@href'
        for url in response.xpath(pagination_selector).extract():
            yield scrapy.Request(urljoin('http://www.memrise.com', url),
                                 callback=self.parse_level)

    def parse_level(self, response):
        pagination_selector = '//div[contains(@class, "levels")]/a/@href'
        for url in response.xpath(pagination_selector).extract():
            yield scrapy.Request(urljoin('http://www.memrise.com', url),
                                 callback=self.parse_level)
        course = response.xpath(
            '//h1[contains(@class, "course-name")]/text()').extract()[0]
        for sel in response.xpath('//div[contains(@class, "thing text-text")]'):
            item = MemriseItem()
            item['course'] = course
            item['item_id'] = sel.xpath('@data-thing-id').extract()[0]
            status = sel.xpath(
                'div/div[contains(@class, "status")]/text()').extract()
            if not status:
                status = "not learnt"
            elif status == ["now"]:
                status = "now"
            elif status == ['in about a day']:
                status = [1, 'day']
            elif status == ['in about an hour']:
                status = [1, 'hour']
            elif status == ['in about a minute']:
                status = [1, 'minute']
            else:
                # e.g. "in 3 days" -> [3, 'days']
                status = status[0].split()[1:]
                status[0] = int(status[0])
            item['status'] = status
            yield item
from datetime import datetime


class IdealistaSpider(CrawlSpider):
    name = "idealista"
    allowed_domains = ["idealista.com"]
    # Add the URL to crawl in the start_urls variable, e.g.:
    #start_urls = ["https://www.idealista.com/venta-viviendas/leganes/el-carrascal/"]
    #start_urls = ['https://www.idealista.com/alquiler-viviendas/madrid/zona-norte/']
    start_urls = ['https://www.idealista.com/venta-viviendas/madrid/carabanchel/']
    rules = (
        # Follow the website's pagination links and parse every page of flats.
        Rule(LinkExtractor(
            restrict_xpaths=("//a[@class='icon-arrow-right-after']")),
            callback='parse_flats', follow=True),
    )

    def parse_flats(self, response):
        # needed to build absolute links towards the website
        default_url = 'http://idealista.com'
        info_flats_xpath = response.xpath("//*[@class='item-info-container']")
        prices_flats_xpath = response.xpath(
            "//*[@class='row price-row clearfix']/span[@class='item-price']/text()")
        discounts_xpath = response.xpath("//*[@class='row price-row clearfix']")
        links = [default_url + link.xpath('a/@href').extract().pop()
                 for link in info_flats_xpath]
        prices = [float(flat.extract().replace('.', '').strip())
                  for flat in prices_flats_xpath]
        discounts = [
            0 if len(discount.xpath(
                "./*[@class='item-price-down icon-pricedown']/text()").extract()) < 1
            else discount.xpath(
                "./*[@class='item-price-down icon-pricedown']/text()"
            ).extract().pop().replace('.', '').strip().split(' ').pop(0)
            for discount in discounts_xpath
        ]
        addresses = [address.xpath('a/@title').extract().pop().encode('iso-8859-1')
                     for address in info_flats_xpath]
        rooms = [
            int(flat.xpath('span[@class="item-detail"]/small[contains(text(),"hab.")]/../text()')
                .extract().pop().strip())
            if len(flat.xpath('span[@class="item-detail"]/small[contains(text(),"hab.")]')) == 1
            else None
            for flat in info_flats_xpath
        ]
        sqfts_m2 = [
            float(flat.xpath('span[@class="item-detail"]/small[starts-with(text(),"m")]/../text()')
                  .extract().pop().replace('.', '').strip())
            if len(flat.xpath('span[@class="item-detail"]/small[starts-with(text(),"m")]')) == 1
            else None
            for flat in info_flats_xpath
        ]
        for flat in zip(links, prices, addresses, discounts, sqfts_m2, rooms):
            item = IdealistaItem(date=datetime.now().strftime('%Y-%m-%d'),
                                 link=flat[0], price=flat[1], address=flat[2],
                                 discount=flat[3], sqft_m2=flat[4], rooms=flat[5])
            yield item

    # Override parse_start_url so the first page is parsed too.
    parse_start_url = parse_flats
class PttSpider(CrawlSpider):
    name = "playspider"
    allowed_domains = ["play.google.com"]
    start_urls = [
        "https://play.google.com/store/apps/details?id=mobi.infolife.ezweather.widget.batteryandweather",
        "https://play.google.com/store/apps/details?id=mobi.infolife.ezweather.widget.localweatherapp",
        # "https://play.google.com/store/apps/details?id=mobi.infolife.ezweather"
    ]
    # CrawlSpider crawls pages according to these rules and dispatches matching
    # pages to the callback for processing.
    rules = [
        Rule(LinkExtractor(allow=(
            r"id=mobi\.infolife\.ezweather\.widget\.(batteryandweather|localweatherapp)")),
            callback='parse_app', follow=False),
    ]
class GloboEsporteSpider(CrawlSpider):
    name = "globoesporte"
    allowed_domains = ["globoesporte.globo.com"]
    start_urls = [
        "http://globoesporte.globo.com/futebol/brasileirao-serie-a/noticia/plantao.html"
    ]
    rules = [
        Rule(LinkExtractor(allow=('./noticia/.', )),
             follow=True, callback='parse_ge')
    ]

    def parse_ge(self, response):
        for comentario in response.xpath(
                '//div[@class="glbComentarios-conteudo-interno"]'):
            item = GloboEsporteItem()
            # note: '//title' is an absolute XPath and ignores `comentario`,
            # so this extracts the page title for every comment
            item['titulo'] = comentario.xpath('//title/text()').extract()
            item['autor'] = comentario.xpath('div/strong/text()').extract()
            item['texto'] = comentario.xpath(
                'p[@class="glbComentarios-texto-comentario"]/text()').extract()
            item['link'] = response.url
            yield item