class Gmw(SpiderRedis):
    name = "gmw"
    website = "光明网"
    allowed_domain = "gmw.cn"
    start_urls = ['http://www.gmw.cn/']

    rules = [
        Rule(LinkExtractor(allow=("content_",),
                           deny=("sports", "shipin", "health", "shuhua", "run",
                                 "xueshu", "e.gmw.cn", "v.gmw.cn", "gongyi",
                                 "jd", "ny", "guoxue", "history", "sixiang",
                                 "topics", "photo", "cg", "media", "meiwen",
                                 "reader", "bbs", "blog", "travel")),
             callback="get_news", follow=True),
        Rule(LinkExtractor(allow=("node_",),
                           deny=("sports", "shipin", "health", "shuhua", "run",
                                 "xueshu", "e.gmw.cn", "v.gmw.cn", "gongyi",
                                 "jd", "ny", "guoxue", "history", "sixiang",
                                 "topics", "photo", "cg", "media", "meiwen",
                                 "reader", "bbs", "blog", "travel")),
             follow=True)
    ]

    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath('//h1[@id="articleTitle"]/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath('//div[@id="articleTitle"]/text()').extract_first())
            loader.add_value(
                "date",
                response.xpath('//span[@id="pubTime"]/text()').extract_first() + ":00")
            loader.add_value(
                "content",
                ''.join(response.xpath(
                    '//div[@id="contentMain"]/descendant-or-self::text()').extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
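# For reference, a minimal sketch of the SpiderItem that the Gmw and Chinanews
# spiders in this collection load into. Only the fields actually populated by
# their loaders are listed; the real project definition (and any input/output
# processors) is not shown here and may differ.
import scrapy

class SpiderItem(scrapy.Item):
    title = scrapy.Field()
    date = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()
    collection_name = scrapy.Field()
    website = scrapy.Field()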
def parse(self, response):
    link = LinkExtractor(restrict_xpaths="//ul[@class='cont_xiaoqu']//li")
    links = link.extract_links(response)
    for link_line in links:
        print(link_line.url, link_line.text)
        item = LinkdemoItem()
        item["url"] = link_line.url
        item["text"] = link_line.text
        yield item
def parse(self, response):
    link_regulation = LinkExtractor(restrict_css='section')
    url_list = link_regulation.extract_links(response)
    if url_list:
        for link in url_list:
            url = link.url
            if 'page-' in url:
                yield scrapy.Request(url, callback=self.parse)
            else:
                yield scrapy.Request(url, callback=self.parse_detail)
def parse(self, response):
    extractor = LinkExtractor(
        restrict_css='body > div.wrap > div.middleright > div > div.cartoon_online_border > ul > li')
    links = extractor.extract_links(response)
    # link1 = extractor.extract_links(response)[0]
    for link in links:
        yield Request(url=link.url, callback=self.parse2, dont_filter=True)
def parse(self, response):
    body = Selector(text=response.body)
    images = body.css('img').extract()
    for image in images:
        image = image.encode("utf-8")
        if PexelsScraper.src_extractor.findall(image):
            img_url = PexelsScraper.src_extractor.findall(image)[0]
            if img_url not in PexelsScraper.crawled_urls:
                # Turn relative image URLs into absolute ones using the start URL's domain.
                if 'http' not in img_url:
                    print(img_url)
                    print(self.start_urls[0])
                    print(PexelsScraper.domain_extractor.findall(self.start_urls[0]))
                    img_url = PexelsScraper.domain_extractor.findall(
                        self.start_urls[0])[0][0] + img_url
                    print(img_url)
                PexelsScraper.crawled_urls.add(img_url)
                tags = ""
                img_name = ""
                img_type = ""
                if PexelsScraper.tags_extractor.findall(image):
                    tags = PexelsScraper.tags_extractor.findall(
                        image)[0].replace(',', '').lower()
                print(img_url, tags)
                if '/' in img_url and len(
                        PexelsScraper.filename_extractor.findall(img_url)) > 0:
                    img_name = PexelsScraper.filename_extractor.findall(img_url)[0][0]
                    img_type = PexelsScraper.filename_extractor.findall(img_url)[0][1]
                    print(img_name)
                # Download the image bytes and read its dimensions.
                data = requests.get(img_url).content
                im = Image.open(BytesIO(data))
                width, height = im.size
                # PexelsScraper.image_width = im.size[0]
                # PexelsScraper.image_height = im.size[1]
                img_aspect_ratio = self.calculate_aspect(width, height)
                yield ImagecrawlerItem(source_url=response.url,
                                       img_url=img_url,
                                       alternate_text=tags,
                                       img_width=width,
                                       img_height=height,
                                       img_name=img_name,
                                       img_type=img_type,
                                       img_aspect_ratio=img_aspect_ratio)
    link_extractor = LinkExtractor()
    next_links = [
        link.url for link in link_extractor.extract_links(response)
        if not self.is_extracted(link.url)
    ]
    # Crawl the filtered links
    for link in next_links:
        yield scrapy.Request(link, self.parse)
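# Side note on the image download above: fetching each image with requests.get
# inside the spider blocks the crawl. Scrapy ships an ImagesPipeline that does
# the downloading asynchronously. A minimal sketch of enabling it in settings.py
# (the setting names are Scrapy's own; the storage path is an arbitrary example).
# The pipeline reads an 'image_urls' list field from each yielded item and
# stores the results under 'images'.
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/tmp/images'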
def parse(self, response):
    link_extractor = LinkExtractor(allow=RotaractSpider.url_matcher)
    links = [link.url for link in link_extractor.extract_links(response)]
    for link in links:
        flag = True
        article_links = []
        yield scrapy.Request(url=link,
                             callback=self.parse_articles,
                             meta={'article_links': article_links,
                                   'flag': flag})
class BarneysSpider(CrawlSpider):
    name = 'barneys-crawl-spider'
    allowed_domains = ['www.barneys.com']
    start_urls = [
        'https://www.barneys.com/',
        'https://www.barneys.com/global/ajaxGlobalNav.jsp'
    ]
    product_parser = ProductParser()
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 '
                      'Safari/537.36'
    }
    product_css = ['[id="ajaxGlobalNav"]', '.topnav-level-1']
    listing_css = ['[id="main-container"]']
    rules = [
        Rule(LinkExtractor(restrict_css=product_css),
             callback='parse', process_request='set_currency_cookie'),
        Rule(LinkExtractor(restrict_css=listing_css),
             callback='parse_item', process_request='set_currency_cookie')
    ]

    def parse(self, response):
        trail = response.meta.get('trail', [])
        title = self.extract_title(response)
        if title:
            trail = trail + [[title, response.url]]
        for request in super().parse(response):
            request.meta['trail'] = trail
            yield request

    def parse_item(self, response):
        return self.product_parser.parse(response)

    def set_currency_cookie(self, request):
        request.cookies['usr_currency'] = 'SE-SEK'
        return request

    def extract_title(self, response):
        title = response.css('title::text').extract_first()
        if title:
            title = title.split('|')[0].strip()
        return title
class MovieSubjectSpider(scrapy.Spider):
    name = 'movie_subject'
    allowed_domains = ['m.douban.com']
    start_urls = ['http://m.douban.com/']
    # Note: rules are only honoured by CrawlSpider subclasses; on a plain
    # scrapy.Spider they are ignored. The trailing comma is needed to make
    # this a one-element tuple.
    rules = (Rule(LinkExtractor(allow=(r'movie/subject/(\d).*rec$',)),
                  callback='parse_item',
                  follow=True,
                  process_request='cookie'),)

    def cookie(self, request):
        bid = ''.join(
            random.choice(string.ascii_letters + string.digits)
            for x in range(11))
        request.cookies['bid'] = bid
        return request

    def start_requests(self):
        for url in self.start_urls:
            bid = ''.join(
                random.choice(string.ascii_letters + string.digits)
                for x in range(11))
            yield Request(url, cookies={'bid': bid})

    def get_douban_id(self, subject, response):
        subject['douban_id'] = response.url[35:-10]
        return subject

    def parse_item(self, response):
        subject = Subject()
        self.get_douban_id(subject, response)
        subject['type'] = 'movie'
        return subject
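# For context, a minimal sketch of the Subject item that MovieSubjectSpider
# fills in above. Only the two fields it actually assigns are shown; the real
# project definition may declare more.
import scrapy

class Subject(scrapy.Item):
    douban_id = scrapy.Field()
    type = scrapy.Field()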
class DoubanSpider(CrawlSpider):
    name = 'douban'
    start_urls = {'https://movie.douban.com/top250/'}
    rules = (Rule(LinkExtractor(allow=r'https://movie.douban.com/top250.*'),
                  callback='parse_item'),)

    def parse_item(self, response):
        items = doubanItem()
        res = Selector(response)
        items['name'] = res.xpath(
            '//div[@class="hd"]/a/span[1]/text()').extract()  # movie title
        items['imgs'] = res.xpath(
            '//*[@id="content"]/div/div[1]/ol/li/div/div/a/img/@src').extract()
        # director and leading actors
        directors_info = res.xpath(
            '//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/p[1]/text()[1]'
        ).extract()
        # year, country, genre
        movies_info = res.xpath(
            '//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/p[1]/text()[2]'
        ).extract()
        # rating
        items['rate'] = res.xpath(
            '//span[@class="rating_num"]/text()').extract()
        # print(items)
        return items
class Spider(CrawlSpider):
    name = 'mzitu'
    allowed_domains = ['mzitu.com', 'meizitu.net']
    start_urls = ['http://www.mzitu.com/']
    img_urls = []
    rules = [
        Rule(LinkExtractor(allow=(r'http://www.mzitu.com/\d{1,6}',),
                           deny=(r'http://www.mzitu.com/\d{1,6}/\d{1,6}',)),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        item = MzituScrapyItem()
        max_num = response.xpath(
            '//div[@class="content"]/div[@class="pagenavi"]/a[last()-1]/span/text()'
        ).extract_first(default='N/A')
        item['name'] = response.xpath(
            "//div[@class='main']/div[@class='content']/h2[@class='main-title']/text()"
        ).extract_first(default='N/A')
        for num in range(1, int(max_num) + 1):
            page_url = response.url + '/' + str(num)
            yield Request(page_url, callback=self.img_url)
        item['image_urls'] = self.img_urls
        item['url'] = response.url
        yield item

    def img_url(self, response):
        img_urls = response.xpath(
            '//div[@class="main-image"]/p/a/img/@src').extract()
        for img_url in img_urls:
            self.img_urls.append(img_url)
class Spider(CrawlSpider):
    name = 'mzitu'
    allowed_domains = ['mzitu.com']
    start_urls = ['http://www.mzitu.com/']
    img_urls = []
    rules = (
        Rule(LinkExtractor(allow=(r'http://www.mzitu.com/\d{1,6}',),
                           deny=(r'http://www.mzitu.com/\d{1,6}/\d{1,6}',)),
             callback='parse_item',
             follow=True),
    )

    def parse_item(self, response):
        item = MzituScrapyItem()
        max_num = response.xpath(
            "descendant::div[@class='main']/div[@class='content']/div[@class='pagenavi']/a[last()-1]/span/text()"
        ).extract_first(default="N/A")
        item['name'] = response.xpath(
            "./*//div[@class='main']/div[1]/h2/text()").extract_first(default='N/A')
        for num in range(1, int(max_num)):
            page_url = response.url + '/' + str(num)
            yield Request(page_url, callback=self.img_url)
        item['image_urls'] = self.img_urls
        yield item

    def img_url(self, response):
        image_urls = response.xpath(
            "descendant::div[@class='main-image']/descendant::img/@src").extract()
        for img_url in image_urls:
            self.img_urls.append(img_url)
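# Both mzitu spiders above collect image URLs into a class-level img_urls list,
# so every yielded item ends up sharing (and re-emitting) the same growing list.
# A minimal sketch of one way around that: pass the per-gallery item through
# request.meta and yield it only from the last page. The item and field names
# are taken from the snippets above, but the xpaths, the hard-coded last_page
# value and the overall control flow are illustrative assumptions, not the
# original project's code. Both functions are meant as methods of the spider.
def parse_item(self, response):
    item = MzituScrapyItem()
    item['name'] = response.xpath('//h2[@class="main-title"]/text()').get()
    item['url'] = response.url
    item['image_urls'] = []  # each gallery gets its own list
    yield scrapy.Request(response.url + '/1',
                         callback=self.collect_images,
                         meta={'item': item, 'page': 1, 'last_page': 10})

def collect_images(self, response):
    item = response.meta['item']
    item['image_urls'].extend(
        response.xpath('//div[@class="main-image"]//img/@src').getall())
    page = response.meta['page']
    if page < response.meta['last_page']:
        next_url = response.url.rsplit('/', 1)[0] + '/' + str(page + 1)
        yield scrapy.Request(next_url,
                             callback=self.collect_images,
                             meta={'item': item, 'page': page + 1,
                                   'last_page': response.meta['last_page']},
                             dont_filter=True)
    else:
        yield item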
class DatabloggerSpider(CrawlSpider):
    name = "datablogger"
    allowed_domains = ["wiprodigital.com"]
    start_urls = ["https://wiprodigital.com/"]
    rules = [
        Rule(LinkExtractor(canonicalize=True, unique=True),
             follow=True,
             callback="parse_items")
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse_items(self, response):
        items = []
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        for link in links:
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            if is_allowed:
                item = DatabloggerScraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)
        return items
def parse_items(self, response):
    # The list of items that are found on the particular page
    items = []
    # Only extract canonicalized and unique links (with respect to the current page)
    links = LinkExtractor(allow_domains=self.allowed_domains,
                          canonicalize=True,
                          unique=True).extract_links(response)
    # Now go through all the found links
    for link in links:
        # Check whether the domain of the URL of the link is allowed;
        # so whether it is in one of the allowed domains
        is_allowed = False
        for allow_domain in self.allowed_domains:
            if allow_domain in link.url:
                is_allowed = True
        # If it is allowed, create a new item and add it to the list of found items
        if is_allowed:
            item = VnExpressScraperItem()
            item['url_from'] = response.url
            item['url_to'] = link.url
            items.append(item)
        # if is_allowed:
        #     title = response.xpath('//head/title/text()').extract()[0]
        #     texts = response.xpath('//*[not(self::script) and string-length(text()) > 0]/text()').extract()
        #     self.parse_text(title, texts)
        #     self.parse_html(title, response.body)
    return items
class LKSpider(CrawlSpider):
    name = "lk"
    allowed_domains = ['www.lkong.net']
    start_urls = ['http://www.lkong.net/forum-60-1.html']
    rules = (Rule(LinkExtractor(allow=(r'/forum-60-\d{1,4}\.html',)),
                  callback='parse_page',
                  follow=True),)

    def parse_page(self, response):
        # url = thread
        # for thread in response.xpath('//th[@class="new"]/a/@href').extract():
        #     yield scrapy.Request(url, callback=self.parse_thread)
        if response.url not in pages:
            pages.append(response.url)
            with open('page', 'a+') as f:
                f.write(response.url + '\n')

    def parse_thread(self, response):
        item = LkItem()
        item['home'] = response.url
        item['title'] = response.xpath(
            '//h1[@class="ts"]/a[1]/text()').extract()
        item['link'] = response.xpath('//h1[@class="ts"]/a[2]/@href').extract()
        item['content'] = response.xpath(
            '//div[@id="postlist"]/div[1]/descendant::td[@class="t_f"]'
        ).extract()[0].encode('utf8')
        return item
def parse(self, response):
    link = LinkExtractor(
        deny='/fang1/a2/',
        restrict_xpaths='//div[@class="f-filter f-w1190"]//dd[@class="info"]/div[@class="thr-list"]//li[@class="item"]/a')
    links = link.extract_links(response)
    for i in links:
        city_name = re.split(r'/', i.url)[-3]
        yield Request(i.url,
                      callback=self.get_index,
                      meta={'city_name': city_name,
                            'dont_redirect': True},
                      dont_filter=True)
class MySpider(CrawlSpider):
    # The name of the spider
    name = "datablogger"

    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = ["xkcd.com"]

    # The URLs to start with
    start_urls = ["https://xkcd.com/"]

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_items method
    rules = [
        Rule(LinkExtractor(allow=(), canonicalize=True, unique=True),
             follow=True,
             callback="parse_items")
    ]

    # Method which starts the requests by visiting all URLs specified in start_urls
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    # Method for parsing items
    def parse_items(self, response):
        print(response)
def parse(self, response):
    UNKNOWN_TYPE = 'unknown'
    SERVER_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'
    DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

    content_type = response.headers.get('Content-Type', UNKNOWN_TYPE)
    date_header = response.headers.get('Date')
    lastmodified_header = response.headers.get('Last-Modified')
    date = None
    lastmodified = None
    # Scrapy header values are bytes, so decode them before strptime.
    if date_header is not None:
        date = datetime.datetime.strptime(
            date_header.decode('ascii'),
            SERVER_DATE_FORMAT).strftime(DATETIME_FORMAT)
    if lastmodified_header is not None:
        lastmodified = datetime.datetime.strptime(
            lastmodified_header.decode('ascii'),
            SERVER_DATE_FORMAT).strftime(DATETIME_FORMAT)

    yield {
        # 'headers': response.headers,
        'url_to': response.url,
        'content_type': content_type,
        'date': date,
        'source_server': response.headers.get('Server', UNKNOWN_TYPE),
        'content_length': response.headers.get('Content-Length', 0),
        'last_modified': lastmodified
    }

    if self.allowedcontenttype(content_type):
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        for link in links:
            # Compare URL strings with !=, not object identity.
            if '..' not in link.url and link.url != response.url:
                yield scrapy.Request(link.url)
                yield scrapy.Request(link.url, method="HEAD")
class TecentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['http://hr.tencent.com/position.php?start=0#a']
    for i in range(274):
        strI = str(i * 10)
        start_urls.append("http://hr.tencent.com/position.php?start=" + strI + "#a")
    # start_urls = ['http://hr.tencent.com/position.php?start=0#a',
    #               'http://hr.tencent.com/position.php?start=10#a',
    #               'http://hr.tencent.com/position.php?start=20#a',
    #               'http://hr.tencent.com/position.php?start=20#a',]

    pageLink = LinkExtractor(allow=(r"start=\d+"))
    # Extract the links from the listing pages, request each in turn, and
    # process the responses through the callback.
    # (Note: rules only take effect on CrawlSpider subclasses.)
    rules = [
        Rule(pageLink, callback="parse", follow=True)
    ]

    def parse(self, response):
        for it in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentspiderItem()
            # print(type(item))
            item['positionName'] = it.xpath("./td[1]/a/text()").extract()[0]
            item['positionType'] = it.xpath("./td[2]/text()").extract()[0]
            item['pNum'] = it.xpath("./td[3]/text()").extract()[0]
            item['address'] = it.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = it.xpath("./td[5]/text()").extract()[0]
            yield item
class CrawlSpider(scrapy.spiders.CrawlSpider):
    name = 'test2'
    allowed_domains = ['', ]
    start_urls = ['', ]

    rules = (
        Rule(LinkExtractor(allow=r'category\.php', deny=(r'subsection\.php',))),
        Rule(LinkExtractor(allow=r'item\.php'), callback='parse_item'),
    )

    def parse_item(self, response):
        self.logger.info('aaa%s', response.url)
        # A bare scrapy.Item() declares no fields, so use a plain dict here.
        item = {}
        item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
        item['description'] = response.xpath('//td[@id="item_description"]/text()').extract()
        return item
def parse_items(self, response):
    # The list of items that are found on the particular page
    self.depth -= 1
    if self.depth <= 0:
        return []
    items = []
    # Only extract canonicalized and unique links (with respect to the current page)
    links = LinkExtractor(canonicalize=True,
                          unique=True).extract_links(response)
    # links = response.xpath('//a[@href]')
    # Now go through all the found links
    for link in links:
        # Check whether the domain of the URL of the link is allowed;
        # so whether it is in one of the allowed domains
        is_allowed = False
        for allowed_domain in self.allowed_domains:
            if allowed_domain in link.url and len(link.url) > len(response.url):
                is_allowed = True
        article_tag = response.xpath("//li[contains(@class, 'article__loader')]")
        if not article_tag:
            is_allowed = False
        # If it is allowed, create a new item and add it to the list of found items
        if is_allowed:
            item = NewsScraperItem()
            item['url_from'] = response.url
            item['url_to'] = link.url
            items.append(item)
    # Return all the found items
    return items
class DatabloggerSpider(scrapy.Spider):
    name = 'datablogger'
    allowed_domains = ['www.mctopherganesh.com']
    start_urls = ['http://www.mctopherganesh.com/']
    rules = [
        Rule(LinkExtractor(canonicalize=True, unique=True),
             follow=True,
             callback="parse")
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        items = []
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        for link in links:
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            if is_allowed:
                item = DatabloggerScraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)
        return items
class LkLoginSpider(CrawlSpider):
    name = "lkl"
    allowed_domains = ['www.lkong.net']
    # start_urls = ['http://www.lkong.net/member.php?mod=logging&action=login']
    # start_urls = ['http://www.lkong.net/forum-14-1.html']
    rules = (Rule(LinkExtractor(allow=(r'/thread.+\.html',)),
                  callback='parse_thread'),)

    def start_requests(self):
        return [
            scrapy.FormRequest(
                'http://www.lkong.net/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=L7On7&inajax=1',
                formdata={
                    'username': '******',
                    'password': '******',
                    'answer': 'email',
                    'formhash': 'forumhash',
                    'referer': '/forum.php',
                    'questionid': '0',
                    'loginsubmit': 'True',
                    'cookietime': '2592000'
                },
                callback=self.after_login)
        ]

    def after_login(self, response):
        # print(response.body.decode('utf8'))
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_thread(self, response):
        print(response.body.decode('utf8'))
        print(response.url)
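# A side note on the login above: when the login form is served as HTML,
# FormRequest.from_response can pick up hidden fields such as formhash from the
# page instead of hard-coding them. A minimal sketch, assuming the same field
# names as the spider above; the login URL here is illustrative and not
# verified against the real site. Both functions are meant as spider methods.
def start_requests(self):
    yield scrapy.Request(
        'http://www.lkong.net/member.php?mod=logging&action=login',
        callback=self.login)

def login(self, response):
    # Hidden form fields on the page are carried over automatically.
    yield scrapy.FormRequest.from_response(
        response,
        formdata={'username': '******', 'password': '******'},
        callback=self.after_login)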
class mzitu_spider(CrawlSpider):
    name = 'mzitu'
    start_urls = {'http://www.mzitu.com/all/'}
    rules = {
        Rule(LinkExtractor(allow=r'http://www.mzitu.com/\d{1,6}',
                           deny=r'http://www.mzitu.com/\d{1,6}/\d{1,6}'),
             callback='parse_item',
             follow=True)
    }
    img_urls = []

    def parse_item(self, response):
        item = MzituSpiderItem()
        total_pages = response.xpath(
            '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()').extract()[0]  # str
        item['name'] = response.xpath(
            '/html/body/div[2]/div[1]/h2/text()').extract()
        # used by the downloader middleware to set the Referer request header
        item['url'] = response.url
        for i in range(1, int(total_pages) - 1):
            page_url = response.url + '/' + str(i)  # URL of each page in the gallery
            yield scrapy.Request(page_url, callback=self.img_url)
        item['img_urls'] = self.img_urls
        yield item

    def img_url(self, response):
        img_urls = response.xpath(
            "/html/body/div[2]/div[1]/div[3]/p/a/img/@src").extract()
        for img_url in img_urls:
            self.img_urls.append(img_url)
def parse_items(self, response):
    # The list of items that are found on the particular page
    items = []
    # Only extract canonicalized and unique links (with respect to the current page)
    links = LinkExtractor(canonicalize=True,
                          unique=True).extract_links(response)
    # Now go through all the found links
    for link in links:
        # Check whether the domain of the URL of the link is allowed;
        # so whether it is in one of the allowed domains
        is_allowed = False
        for allowed_domain in self.allowed_domains:
            if allowed_domain in link.url:
                is_allowed = True
        # If it is allowed, create a new item and add it to the list of found items
        if is_allowed:
            item = MyscraperItem()
            item['link'] = link.url
            items.append(item)
        patterns = [
            "kalerkantho.com/online/national/",
            "kalerkantho.com/online/Politics/",
            "kalerkantho.com/online/Court/",
            "kalerkantho.com/online/world/",
            "kalerkantho.com/online/business/",
            "kalerkantho.com/online/sahitya/",
            "kalerkantho.com/online/sport/",
            "kalerkantho.com/online/entertainment/",
            "kalerkantho.com/online/info-tech/",
            "kalerkantho.com/online/prescription/"
        ]
        file = None
        if patterns[0] in link.url:
            file = open('../../data/national.csv', 'a')
        if patterns[1] in link.url:
            file = open('../../data/politics.csv', 'a')
        if patterns[2] in link.url:
            file = open('../../data/court.csv', 'a')
        if patterns[3] in link.url:
            file = open('../../data/world.csv', 'a')
        if patterns[4] in link.url:
            file = open('../../data/business.csv', 'a')
        if patterns[5] in link.url:
            file = open('../../data/literature.csv', 'a')
        if patterns[6] in link.url:
            file = open('../../data/sports.csv', 'a')
        if patterns[7] in link.url:
            file = open('../../data/entertainment.csv', 'a')
        if patterns[8] in link.url:
            file = open('../../data/tech.csv', 'a')
        if patterns[9] in link.url:
            file = open('../../data/medical.csv', 'a')
        if file is not None:
            file.write(urlShortener(link.url) + "\n")
            file.close()
    # Return all the found items
    return items
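# The chain of pattern checks above could be collapsed into a single mapping
# from URL fragment to output file. A minimal sketch of that idea; urlShortener
# is the helper referenced above, whose implementation is not shown here.
SECTION_FILES = {
    "kalerkantho.com/online/national/": "../../data/national.csv",
    "kalerkantho.com/online/Politics/": "../../data/politics.csv",
    "kalerkantho.com/online/Court/": "../../data/court.csv",
    "kalerkantho.com/online/world/": "../../data/world.csv",
    "kalerkantho.com/online/business/": "../../data/business.csv",
    "kalerkantho.com/online/sahitya/": "../../data/literature.csv",
    "kalerkantho.com/online/sport/": "../../data/sports.csv",
    "kalerkantho.com/online/entertainment/": "../../data/entertainment.csv",
    "kalerkantho.com/online/info-tech/": "../../data/tech.csv",
    "kalerkantho.com/online/prescription/": "../../data/medical.csv",
}

def write_link(link_url):
    # Append the shortened link to the CSV of the first matching section, if any.
    for fragment, path in SECTION_FILES.items():
        if fragment in link_url:
            with open(path, 'a') as f:
                f.write(urlShortener(link_url) + "\n")
            break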
def get_index(self, response):
    city_name = response.meta['city_name']
    link = LinkExtractor(
        allow='/fang1/.*htm',
        restrict_xpaths='//div[@class="f-main f-clear f-w1190"]//div[@class="f-main-list"]/div[@class="f-list js-tips-list"]/div')
    links = link.extract_links(response)
    for i in links:
        city = re.split(r'/|\.', i.url)[2]
        yield Request(i.url,
                      callback=self.get_message,
                      meta={'city': city,
                            'city_name': city_name,
                            'dont_redirect': True},
                      dont_filter=True)
class AbyznewslinksSpider(CrawlSpider):
    name = 'abz'
    depth = 400

    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = ['thestar.com']

    # The URLs to start with
    start_urls = ['https://www.thestar.com/']

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_items method
    rules = [
        Rule(LinkExtractor(canonicalize=True, unique=True),
             follow=True,
             callback="parse_items")
    ]

    # Method which starts the requests by visiting all URLs specified in start_urls
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=False)

    # Method for parsing items
    def parse_items(self, response):
        # The list of items that are found on the particular page
        self.depth -= 1
        if self.depth <= 0:
            return []
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        # links = response.xpath('//a[@href]')
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed;
            # so whether it is in one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url and len(link.url) > len(response.url):
                    is_allowed = True
            article_tag = response.xpath("//li[contains(@class, 'article__loader')]")
            if not article_tag:
                is_allowed = False
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = NewsScraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)
        # Return all the found items
        return items
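# The self.depth counter above caps the total number of pages parsed rather
# than the link depth from the start URL. If an actual depth cap is wanted,
# Scrapy's built-in DEPTH_LIMIT setting does that per request chain. A minimal
# sketch (the spider name and the value 3 are arbitrary examples):
class DepthLimitedNewsSpider(AbyznewslinksSpider):
    name = 'abz_depth_limited'
    custom_settings = {
        'DEPTH_LIMIT': 3,  # stop following links more than 3 hops from the start URL
    }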
class ComputrabajoSpider(CrawlSpider):
    name = "mi primer crowlspider"
    start_urls = ['https://www.ve.computrabajo.com/ofertas-de-trabajo/']
    allowed_domains = ['www.ve.computrabajo.com']

    rules = (
        Rule(LinkExtractor(allow=r'p=')),
        Rule(LinkExtractor(allow=r'/oferta-de-trabajo-de-'),
             callback='parse_items'),
    )

    def parse_items(self, response):
        item = ItemLoader(item=Articulos(), response=response)
        item.add_xpath('title',
                       '//*[@id="MainContainer"]/article/section[1]/div[1]/div/h2/text()')
        item.add_xpath('description',
                       '//*[@id="MainContainer"]/article/section[1]/div[2]/ul/li[3]/text()')
        yield item.load_item()

# scrapy runspider multiplepages.py -o ../../resources/computrabajo.csv -t csv
def callload(self, response):
    link = LinkExtractor(restrict_xpaths='//*[@cellspacing="1"]//a')
    link = link.extract_links(response)
    for urllist in link:
        url = urllist.url
        if url in self.loaded:
            pass
        else:
            self.loaded.append(url)
            request = scrapy.Request(url,
                                     callback=self.parse,
                                     headers={'User-Agent': 'Mozilla/5.0'},
                                     dont_filter=True)
            path = self.path + '/' + urllist.text
            request.meta['item'] = path
            yield request
            time.sleep(2)
class Chinanews(SpiderRedis):
    name = "chinanews"
    website = u"中国新闻网"
    allowed_domain = "chinanews.com"
    start_urls = ['http://www.chinanews.com/']

    rules = [
        Rule(LinkExtractor(allow=r"\d{4}/\d{2}-\d{2}/\d{7}.shtml$"),
             callback="get_news", follow=True),
        # The section names all belong in the allow pattern list; passing them
        # positionally would spread them over deny, allow_domains, etc.
        Rule(LinkExtractor(allow=("scroll-news", "china", "world", "society",
                                  "finance", "business", "fortune", "gangao",
                                  "taiwan", "huaren", "theory", "life")),
             follow=True)
    ]

    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath('//div[@id="cont_1_1_2"]/h1[1]/text()').extract_first())
            loader.add_value(
                "date",
                response.xpath('//span[@id="pubtime_baidu"]/text()').extract_first())
            loader.add_value(
                "content",
                ''.join(response.xpath(
                    '//div[@class="left_zw"]/descendant-or-self::text()').extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
def parse(self, response):
    USER = True
    next_links = []
    body = Selector(text=response.body)
    images = body.css('img.photo-item__img').extract()
    for image in images:
        img_url = PexelsScraper.src_extractor.findall(image)[0]
        tags = [
            tag.replace(',', '').lower()
            for tag in PexelsScraper.tags_extractor.findall(image)[0].split(' ')
        ]
        print("Tags_check: ")
        print(tags)

    link_extractor = LinkExtractor(allow=PexelsScraper.url_matcher)
    next_links = [
        link.url for link in link_extractor.extract_links(response)
        if not self.is_extracted(link.url)
    ]

    # Crawl the filtered links
    next_page_url = response.css(
        'div.pagination a[rel="next"]::attr(href)').extract_first()
    if next_page_url:
        next_page_url = URL + next_page_url
        next_links.append(next_page_url)
        print("next_page_url")
        print(next_page_url)

    if USER:
        links = response.css("a.pull-left::attr(href)").extract_first()
        print(links)
        if links:
            links = "https://www.pexels.com" + links
            for i in range(10):
                next_links.append(links + "?page=" + str(i))
            print("go into user parse")
            # request.meta['main_url'] = URL
            # yield request
            for each in next_links:
                yield scrapy.Request(each, self.parse_by_user)
            print("should have done user parse")
        print("Links_check: {}".format(links))

    for link in next_links:
        print("next_links")
        print(link)
        yield scrapy.Request(link, self.parse)