def get_items_and_pagination(self, response):
    request_again = self.error_handler(response)
    if request_again:
        yield request_again
        return
    items_extractor = LinkExtractor(
        deny=[r'\/image\/', r'\/map'],
        restrict_xpaths='.//div[@class="itemInfo"]/h2')
    items_links = items_extractor.extract_links(response)
    for link in items_links:
        yield Request(url=link.url, callback=self.parse_item)
    if response.xpath('.//a[@class="next"]').extract():
        total_quantity = response.xpath(
            '(.//div[@class="pageResults"]/span[@class="results"]'
            '/text()[normalize-space()])[2]').re(r'\d+')
        if total_quantity:
            total_quantity = int(total_quantity[0])
            pages = total_quantity / 25
            page_range = range(1, pages + 2)
            category = cond_set_value(response.xpath(
                './/input[@id="FrmWho"]/@value').extract())
            quoted_category = quote_plus(category)
            for page in page_range:
                next_url = self.pagination_pattern.format(prase=quoted_category, page=page)
                yield Request(url=next_url,
                              headers=self.pagination_headers,
                              dont_filter=True,
                              method='POST',
                              callback=self.parse_pagination)
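get_items_and_pagination above (and parse_items_links further below) call a cond_set_value helper that is not shown in these snippets; a minimal sketch of what such a helper is assumed to do, namely return the first extracted value or a default:

def cond_set_value(values, default=None):
    # Assumed helper (not part of the original snippets): take the list returned
    # by response.xpath(...).extract() and return its first element, or the
    # default when nothing was extracted.
    return values[0] if values else default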
def get_companies_links(self, response):
    companies_link_extractor = LinkExtractor(allow=r'\/company_\d{5,7}')
    companies_links = companies_link_extractor.extract_links(response)
    for link in companies_links:
        yield Request(url=link.url,
                      callback=self.parse_item,
                      # cookies=None,
                      meta={'category': response.meta.get('category')})
def _extract_links(self, response, params):
    """Parse links from the response.

    @return hrefs
    """
    params['allow_domains'] = tuple(self.allowed_domains)
    link_extractor = LinkExtractor(**params)
    return link_extractor.extract_links(response)
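A hedged usage sketch for the _extract_links helper above; the params dict contents and parse_item are illustrative, and allow_domains is filled in by the helper itself:

from scrapy import Request


def parse(self, response):
    # Hypothetical caller: extract product links from a listing page and
    # schedule them through the helper defined above.
    params = {
        'allow': [r'/product/\d+'],
        'restrict_xpaths': '//div[@class="listing"]',
    }
    for link in self._extract_links(response, params):
        yield Request(url=link.url, callback=self.parse_item)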
def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        url = urljoin(response.url, link.url)
        yield scrapy.Request(url, self.parse_link, meta={
            'splash': {
                'args': {'har': 1, 'html': 0},
            }
        })
def get_categories(self, response):
    # http://www.construction.co.uk/double-glazing-repairs/category_33.htm
    link_extractor = LinkExtractor(allow=r'\/category_\d+')
    links = link_extractor.extract_links(response)
    for link in links:
        category = link.text
        yield Request(url=link.url,
                      callback=self.get_companies_links_by_letter,
                      meta={'category': category})
def get_companies_links_by_letter(self, response):
    # http://www.construction.co.uk/heating-contractors-and-consultants/22_A.htm
    letter_link_extractor = LinkExtractor(allow=r'\/\d+_[A-Z].htm')
    links_by_letter = letter_link_extractor.extract_links(response)
    if links_by_letter:
        for link in links_by_letter:
            yield Request(url=link.url,
                          callback=self.get_companies_links,
                          meta={'category': response.meta.get('category')})
    else:
        # there is no letters pagination at the page
        for request in self.get_companies_links(response):
            yield request
def parse_items_links(self, response):
    categories_links_extractor = LinkExtractor(
        restrict_xpaths='.//div[@class="categoryListContainer"]')
    cat_links = categories_links_extractor.extract_links(response)
    for link in cat_links:
        yield Request(url=link.url, callback=self.parse_items_links)
    items_links_extractor = LinkExtractor(
        restrict_xpaths='.//div[@class="directory-listing"]/h3')
    items_links = items_links_extractor.extract_links(response)
    for link in items_links:
        yield Request(url=link.url, callback=self.parse_item)
    pagination_link = cond_set_value(
        response.xpath('.//a[@class="more"]/@href').extract())
    if pagination_link:
        full_pagination_link = urljoin(self.start_urls[0], pagination_link)
        yield Request(url=full_pagination_link, callback=self.parse_items_links)
def parse_categories(self, response):
    request_again = self.error_handler(response)
    if request_again:
        yield request_again
        return
    categories_extractor = LinkExtractor(
        restrict_xpaths='.//ul[@class="popTermsList"]')
    categories_links = categories_extractor.extract_links(response)
    for link in categories_links:
        yield Request(url=link.url, callback=self.get_items_and_pagination)
    letters_extractor = LinkExtractor(
        restrict_xpaths='.//div[@class="popTermsNavBar"]')
    letters_links = letters_extractor.extract_links(response)
    for link in letters_links:
        yield Request(url=link.url, callback=self.parse_categories)
class tripadvisorSpider(CrawlSpider): name = "istanbulrestorant" DOWNLOAD_DELAY = 1 allowed_domains = ["tripadvisor.com.tr"] start_urls = [] def my_range(start, end, step): while start <= end: yield start start += step for i in my_range(30, 10950, 30): start_urls.append( 'http://www.tripadvisor.com.tr/Restaurants-g293974-oa' + str(i) + '-Istanbul.html') rules = [Rule(LinkExtractor(allow=['/Restaurant_Review.*']), 'parse_page')] def parse_page(self, response): item = items.IstanbulrestorantscrapItem() item['isim'] = response.xpath( '//*[@id="BREADCRUMBS"]/li[last()]/text()')[0].extract() item['adres'] = response.xpath( '//*[@id="ABOVE_THE_FOLD"]/div[1]/div[2]/div/div[2]/div/div[1]/div/address/span/span/span[1]/text()' )[0].extract() item['tel'] = response.xpath( '//*[@id="ABOVE_THE_FOLD"]/div[1]/div[2]/div/div[2]/div/div[1]/div/div/div/div[1]/div/text()' )[0].extract() yield item
class NewyorkCrawlerSpider(CrawlSpider):
    name = 'newyorkcrawler'
    idx = 0
    allowed_domains = ['www.nytimes.com']
    start_urls = ['https://www.nytimes.com/section/world/europe']
    rules = (Rule(LinkExtractor(allow=[r'\d{4}/\d{2}/\d{2}/[^/]+']),
                  callback="parse_item", follow=True), )

    def parse_item(self, response):
        self.log("Scraping: " + response.url)
        item = NytItem()
        item['url'] = response.url
        a = Article(response.url)
        # According to the source, this doesn't download anything
        # (i.e. open a connection) if input_html is not None
        a.download(input_html=response.text)
        a.parse()
        item['title'] = a.title
        item['authors'] = a.authors
        item['body'] = a.text
        # TODO: add tags
        f = open('articles/%d-%s' % (self.idx, a.title), 'w+', encoding='utf8')
        f.writelines(a.authors)
        f.write("\n" + response.url + "\n")
        f.write(a.text)
        f.close()
        self.idx += 1
        return item
class CsdnSpider(CrawlSpider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['http://blog.csdn.net/qq_35037977/article/list/1']
    rules = [
        Rule(LinkExtractor(allow=r'/qq_35037977/article/list/\d+',
                           restrict_xpaths="//a[contains(., %s)]" % nextpage),
             follow=True),
        Rule(LinkExtractor(allow=r'/qq_35037977/article/details/\d+',
                           restrict_css='.link_title'),
             callback='parse_csdn')
    ]

    def parse_csdn(self, response):
        l = ItemLoader(item=CsdnItem(), response=response)
        l.add_css('title', '#article_details .link_title a::text')
        l.add_value('link', response.url)
        l.add_css('posttime', '.article_r .link_postdate::text')
        l.add_css('views', '.article_r .link_view::text')
        yield l.load_item()
class MovieSpider(CrawlSpider): name = "movie" allowed_domains = ["douban.com"] start_urls = ["http://movie.douban.com/tag/%s?type=S" % one_tag] index = 0 rules = ( # 提取匹配 'http://movie.douban.com/tag/爱情' 的翻页 # Rule(LinkExtractor(allow=(('tag/%s.start=' % quote(one_tag.encode("utf-8"))), ))), # 提取匹配 'subject/\d+' 的链接并使用spider的parse_item方法进行分析 Rule(LinkExtractor(allow=('subject/\d+', )), callback='parse_item'), ) def parse_item(self, response): item = MovieItem() item['title'] = response.xpath( "//div[@id='content']/h1/span[1]/text()").extract()[0] item['url'] = response.url try: item['desc'] = response.xpath( "//div[@id='link-report']/span/text()").extract()[0].strip() except: item['desc'] = '' try: item['score'] = response.xpath( "//strong[@class='ll rating_num']/text()").extract()[0] except: item['score'] = 0 item['image_urls'] = response.xpath( "//div[@id='mainpic']/a[@class='nbgnbg']/img/@src").extract() print item['title'], item['score'], item['url'], item['desc'] yield item
class ZZSpider(CrawlSpider):
    name = "zz_gunter-spb"
    allowed_domains = ["livejournal.com"]
    start_urls = [
        #"http://tanyant.livejournal.com/118267.html"
        #"http://gunter-spb.livejournal.com/14196.html"
        #"http://gunter-spb.livejournal.com/2387127.html"
        #"http://gunter-spb.livejournal.com/610032.html"
        "http://gunter-spb.livejournal.com/599654.html"
    ]
    rules = (
        Rule(
            LinkExtractor(
                #allow=('http://tanyant.livejournal.com/\d+\.html',),
                deny=('tag', 'reply', 'thread', 'page'),
                # xpath for snorapp, tanyant
                # restrict_xpaths=('//a[@title="Previous"]')
                restrict_xpaths=("//i[@class='b-controls-bg']/parent::a"),
            ),
            callback='parse_page',
            follow=True),
    )

    def parse_page(self, response):
        # use scrapy shell to find xpath
        #from scrapy.shell import inspect_response
        #inspect_response(response)
        item = ScraperItem()
        item['url'] = response.url
        try:
            item['title'] = response.xpath('//h1/text()').extract()[0]
        except IndexError:
            item['title'] = ""
        try:
            item['text'] = " ".join(
                response.xpath('//article[2]/child::node()').extract())
        except IndexError:
            item['text'] = ''
        try:
            time = response.xpath("//time[1]/text()[3]").extract()[0]
            date = response.xpath("//time[1]/a/text()").extract()
            date.append(time)
            item['date'] = date
        except IndexError:
            item['date'] = ''
        try:
            item['comment_count'] = response.xpath(
                '//span[@class="js-amount"]/text()').extract()[0]
        except IndexError:
            item['comment_count'] = '0'
        yield item
class Mess(CrawlSpider):
    My_Tree.objects.all().delete()
    name = 'mess'
    allowed_domains = ["localhost"]
    start_urls = [
        start_url(),
    ]
    rules = [
        Rule(
            LinkExtractor(restrict_xpaths=('//a[@class="next"]')),
            callback='parse_item',
            follow=True,
        )
    ]

    def parse_item(self, response):
        hxs = response
        item = MyScrapyItem()
        item['name'] = hxs.xpath(
            '//*[@id="content"]/a/button/text()').extract()
        item['url'] = response.url
        item['link'] = hxs.xpath('//a/@href').extract()
        s = My_Tree(url=item['url'], link=item['link'], name=item['name'])
        s.save()
        print item
        return item
class ExampleSpider(CrawlSpider):
    name = 'pic'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/topic/19643259/hot']
    #item = ZhihuItem()
    rules = [
        Rule(LinkExtractor(allow=['/question/.*', '/people/.*']),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        #print response.css('h1').extract()
        image = ZhihuItem()
        #head_url = re.sub(r'_l\.', '.', ''.join(response.css('.body .Avatar--l::attr(src)').extract()))
        image['title'] = response.xpath("//img/@alt").extract()
        #image['image_urls'] = arr
        rel = response.xpath("//img/@srcset").extract()
        for i in range(len(rel)):
            rel[i] = re.sub(' 2x', '', rel[i])
        image['image_urls'] = rel
        #rel[0] = re.sub(' 2x','',rel[0])
        #image['image_urls'] = [rel[0]]
        #print image['image_urls']
        return image
class tiebaSpider(CrawlSpider):
    name = "tieba"
    allowed_domains = ["baidu.com"]
    start_urls = ["http://tieba.baidu.com/p/5051125142"]
    rules = [
        Rule(LinkExtractor(allow=("/5051125142\?pn=(\d)")),
             follow=True,
             callback='parse_cont'),
    ]

    def parse11(self, response):
        pass

    def parse_cont(self, response):
        items = []
        select = Selector(response)
        text = select.css('div.l_post.j_l_post.l_post_bright')
        # authors = select.css('div.d_author')
        # contents = select.css('div.d_post_content_main')
        for list in text:
            item = BaiduttItem()
            item['name'] = list.css('.d_author ul li.d_name a').xpath(
                'text()').extract()
            item['content'] = list.css('.d_post_content_main cc div').xpath(
                'text()').extract()
            items.append(item)
        return items
def __init__(self, *args, **kwargs):
    # run using: scrapy crawl xss_spider -a url='http://example.com'
    super(XSSspider, self).__init__(*args, **kwargs)
    self.start_urls = [kwargs.get('url')]
    hostname = urlparse(self.start_urls[0]).hostname
    # With subdomains
    # adding [] around the value seems to allow it to crawl subdomain of value
    self.allowed_domains = [hostname]

    self.delim = '1zqj'
    # semi colon goes on end because sometimes it cuts stuff off like
    # gruyere or the second cookie delim
    self.test_str = '\'"(){}<x>:/'

    # Login details
    self.login_user = kwargs.get('user')
    if self.login_user == 'None':
        self.login_user = None
    else:
        # Don't hit links with 'logout' in them since self.login_user exists
        self.rules = (Rule(LinkExtractor(deny=('logout')),
                           callback='parse_resp',
                           follow=True), )
    if kwargs.get('pw') == 'None' and self.login_user is not None:
        self.login_pass = raw_input("Please enter the password: ")
    else:
        self.login_pass = kwargs.get('pw')

    # HTTP Basic Auth
    self.basic_auth = kwargs.get('basic')
    if self.basic_auth == 'true':
        self.http_user = self.login_user
        self.http_pass = self.login_pass
class MovieSpider(CrawlSpider):
    name = 'vDown'
    allowed_domains = ['91porn.it']
    searchText = '我'
    start_urls = ['http://www.91porn.it/search?search_query=' +
                  urllib.quote(searchText) + '&search_type=videos']
    rules = [
        #Rule(LinkExtractor(allow=(r'http://movie.douban.com/top250\?start=\d+.*'))),
        #Rule(LinkExtractor(allow=(r'class=\"well well-sm[\s\S]*?href=\"(\S+?)\"[\s\S]*?img[\s]src=\"(\S+?)\"')), callback='parse_item'),
        #Rule(LinkExtractor(allow=(r'http://www.91porn.it/video/\d+"')), callback='parse_item')
        Rule(LinkExtractor(allow=(r'/video/\d+/"')), callback='parse_item')
    ]
    count = 2

    def parse_item(self, response):
        item = MyItem()
        item['url'] = response.url
        print(response.url)
        videoPat = r'\<video[\s\S]*?poster=\"(\S+?)\"[\s\S]*\<source\s+src=\"(\S+?)\"'
        m = re.match(videoPat, response.body)
        print(m.groups())
        #item['file_urls'] = response.xpath("//img/@src").extract()
        return item
class Image1Spider(CrawlSpider):
    name = "image1"
    allowed_domains = [
        "developers.google.com",
        "developer.chrome.com",
        "developer.android.com",
        "cloud.google.com",
        "googledevelopers.blogspot.com",
        #"appurify.com",
        #"www.chromium.org",
        #"www.firebase.com",
        #"golang.org",
        #"www.html5rocks.com",
        #"www.stackdriver.com",
        "www.dartlang.org",
        "developer.nest.com",
        "www.polymer-project.org",
    ]
    start_urls = ('https://developers.google.com/', )
    rules = [
        Rule(LinkExtractor(allow=[r'.*']), callback='parse_item', follow=True)
    ]

    def parse_item(self, response):
        content = scrapy.Selector(response=response).xpath('//body')
        for node in content:
            item = GioItem()
            item['url'] = response.url
            item['image_urls'] = node.xpath('//img/@src').extract()
            yield item
class tripadvisorSpider(CrawlSpider): name = "karadenizOtel" DOWNLOAD_DELAY = 1 allowed_domains = ["tripadvisor.com.tr"] start_urls = [] def my_range(start, end, step): while start <= end: yield start start += step for i in my_range(0, 180, 30): start_urls.append('http://www.tripadvisor.com.tr/Hotels-g673665-oa' + str(i) + '-Turkish_Black_Sea_Coast-Hotels.html') rules = [Rule(LinkExtractor(allow=['/Hotel_Review.*'], ), 'parse_page')] def parse_page(self, response): item = items.KaradenizotelscrapItem() item['isim'] = response.xpath('//*[@id="HEADING"]/text()')[1].extract() item['adres'] = response.xpath( '//*[@id="HEADING_GROUP"]/div/div[3]/address/div[1]/span/span[1]/text()' )[0].extract() item['sehir'] = response.xpath( '//*[@id="BREADCRUMBS"]/li[4]/a/span/text()')[0].extract() item['ilce'] = response.xpath( '//*[@id="BREADCRUMBS"]/li[5]/a/span/text()')[0].extract() yield item
class PicSpider(CrawlSpider):
    name = "first"
    allowed_domains = ["www.reddit.com"]
    start_urls = ['http://www.reddit.com/r/paypal/']
    rules = [
        Rule(LinkExtractor(allow=['/r/paypal/\?count=\d*&after=\w*']),
             callback='parse_item',
             follow=True)
    ]
    # rules = [
    #     # Traverse the /r/pics subreddit. When you don't pass a callback,
    #     # follow=True by default.
    #     # It's also important NOT to override the parse method:
    #     # CrawlSpider uses parse to continuously extract links.
    #     Rule(LinkExtractor(
    #         allow=['/r/pics/\?count=\d*&after=\w*']),
    #         callback='parse_item',
    #         follow=True),
    # ]

    def parse_item(self, response):
        selector_list = response.css('div.thing')
        for selector in selector_list:
            item = PicItem()
            item['title'] = selector.xpath('div/p/a/text()').extract()
            item['url'] = selector.xpath('a/@href').extract()
            yield item
class DBMSpider(CrawlSpider):
    # Spider name, used when running the crawl
    name = "doubanMovie"
    # Allowed domains
    allowed_domains = ["movie.douban.com"]
    # Seed URL, i.e. the first address to crawl
    start_urls = ["https://movie.douban.com"]
    # The spider starts from the https://movie.douban.com homepage and parses
    # every subject link it finds with parse_subject.
    rules = (
        # Pages whose URL matches the regex r'/subject/\d+/' are parsed with parse_subject
        Rule(LinkExtractor(allow=(r'/subject/\d+/', )),
             callback='parse_subject',
             follow=True),
    )

    # XPath or CSS selectors can be used here; Chrome developer tools help to find them
    def parse_subject(self, response):
        item = DoubanmovieItem()
        # todo extract item content
        item['movie_name'] = response.xpath(
            '//*[@id="content"]/h1/span[1]').xpath(
                'normalize-space(string(.))').extract()[0]
        item['intro'] = response.xpath('//*[@id="link-report"]/span').xpath(
            'normalize-space(string(.))').extract()[0]
        item['actors'] = response.xpath(
            '//*[@id="info"]/span[3]/span[2]').xpath(
                'normalize-space(string(.))').extract()
        item['date'] = response.xpath('//*[@id="info"]/span[11]').xpath(
            'normalize-space(string(.))').extract()[0]
        item['director'] = response.xpath(
            '//*[@id="info"]/span[1]/span[2]/a').xpath(
                'normalize-space(string(.))').extract()[0]
        return item
class MySpider(CrawlSpider):
    name = "Huntsman"
    allowed_domains = ["essex.ac.uk"]
    start_urls = ["https://www.essex.ac.uk/"]
    rules = [
        Rule(LinkExtractor(allow=('/www.essex.ac.uk/((?!:).)*$'), ),
             callback="parse_item",
             follow=True)
    ]

    def parse_item(self, response):
        item = BeastItem()
        url = response.url
        item["url"] = url
        title = response.xpath("//title/text()")[0].extract()
        item["title"] = title
        description = response.xpath(
            "//meta[@name='description']/@content").extract()
        item["description"] = description
        body = response.xpath('//body//text()').re(r'(\w[ ,\'\-\w]+\w)')
        item["body"] = body
        return item
class StackCrawlerSpider(CrawlSpider):
    name = 'stack_crawler'
    allowed_domains = ['stackoverflow.com']
    # start_urls = ['http://www.stackoverflow.com/']
    start_urls = ['http://stackoverflow.com/questions?pagesize=50&sort=newest']
    # rules = (
    #     Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    # )
    rules = [
        Rule(LinkExtractor(allow=r'questions\?page=[0-9]&sort=newest'),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        # i = StackItem()
        # #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # #i['name'] = response.xpath('//div[@id="name"]').extract()
        # #i['description'] = response.xpath('//div[@id="description"]').extract()
        # return i
        questions = response.xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = StackItem()
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            yield item
class BaiduSpider(CrawlSpider):
    name = "baidu"
    allowed_domains = ["baidu.com"]
    start_urls = ('http://shouji.baidu.com/software/?from=as', )
    rules = [
        Rule(LinkExtractor(allow=("http://shouji.baidu.com/soft/item", )),
             callback='parse_app',
             follow=True),
    ]

    def parse_app(self, response):
        apk = AppItem()
        apk['url'] = response.url
        apk['name'] = response.css('.app-name>span').extract()[0]
        apk['rate'] = response.css(".star-percent").xpath(
            "@style").extract()[0]
        apk['size'] = response.css(".detail > span.size").xpath(
            "text()").extract()[0]
        apk['category'] = response.css(".nav").css("a")[1].xpath(
            "text()").extract()[0]
        apk['apk_url'] = response.css(".apk").xpath("@href").extract()[0]
        apk['screenshots'] = response.css(".imagefix").xpath("@src").extract()
        apk['download_num'] = response.css("span.download-num").xpath(
            "text()").extract()[0]
        yield apk
class LogoSpider(CrawlSpider):
    name = 'logo'
    allowed_domains = ['pcauto.com.cn']
    start_urls = ['http://www.pcauto.com.cn/zt/chebiao/guochan/']
    rules = (
        Rule(LinkExtractor(allow=(r'http://www.pcauto.com.cn/zt/chebiao/.*?/$')),
             callback='parse_page'),
    )

    def parse_page(self, response):
        # print(response.text)
        sel = Selector(response)
        # print(sel)
        country = "".join(sel.xpath(
            '//div[@class="th"]/span[@class="mark"]/a/text()').extract())
        # carname = sel.xpath('//div[@class="dPic"]/i[@class="iPic"]/a/img/@alt').extract()
        # imageurl = sel.xpath('//div[@class="dPic"]/i[@class="iPic"]/a/img/@src').extract()
        # item = LogoItem(country=country, carname=carname, imageurl=imageurl)
        # yield item
        carnames = sel.xpath(
            '//div[@class="dPic"]/i[@class="iPic"]/a/img/@alt').extract()
        for carname in carnames:
            imageurl = sel.xpath(
                '//div[@class="dPic"]/i[@class="iPic"]/a/img[@alt="' +
                carname + '"]/@src').extract()
            # print(country, carname, imageurl)
            item = LogoItem(country=country, carname=carname, imageurl=imageurl)
            yield item
class GlassdoorSpider(scrapy.Spider):
    name = 'glassdoor'
    allowed_domains = [
        'glassdoor.com', 'www.glassdoor.com', 'www.glassdoor.com.au'
    ]
    start_urls = ['https://www.glassdoor.com']
    # Note: rules are only honoured by CrawlSpider; on a plain scrapy.Spider this
    # attribute has no effect, so pagination is followed manually in parse() below.
    rules = [
        Rule(LinkExtractor(allow=r'\/Reviews\/.*'), callback='parse', follow=True)
    ]

    def start_requests(self):
        urls = [
            'https://www.glassdoor.com.au/Reviews/sydney-reviews-SRCH_IL.0,6_IM962.htm',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for company in response.css('div.eiHdrModule'):
            yield {
                'name': company.css("a.tightAll::text").extract_first().strip(),
                'score': company.css("span.bigRating::text").extract_first(),
                'reviews': company.css("span.num::text")[0].extract().strip(),
                'salaries': company.css("span.num::text")[1].extract().strip(),
                'interviews': company.css("span.num::text")[2].extract().strip(),
            }
        for href in response.css('li.page a::attr(href)'):
            yield response.follow(href, callback=self.parse)
class ImageSpider(CrawlSpider):
    name = "picSpider"
    start_urls = ["https://www.reddit.com/r/pics/"]
    allowed_domains = [
        # Domain allowed to scrape
        "www.reddit.com"
    ]
    rules = (
        # Extract links matching the regex below and follow links from them
        # (no callback means follow=True by default).
        Rule(LinkExtractor(allow=(".*\/r\/pics\/\?count=\d*&after=(\w*)", )),
             callback='parse_next',
             follow=True),
    )

    def parse_next(self, response):
        # Select the list of elements and parse them one by one
        selector_list = response.xpath("//div[contains(@class,'thing')]")
        for selector in selector_list:
            print(selector)
            item = ItemStack.PicItem()
            item['title'] = selector.xpath("div/p/a/text()").extract()
            # item['link_url'] = selector.xpath('p[contains(@class,\'title\')]/a/@href').extract()
            item['image_urls'] = selector.xpath(
                "a[contains(@class,'thumbnail')]/@href").extract()
            yield item
class MoocSpider(CrawlSpider):
    name = 'mooc'
    allowed_domains = ['mooc.cn']
    start_urls = []
    for i in range(1, 37):
        start_urls.append('http://www.mooc.cn/course/page/%d' % i)
    rules = [Rule(LinkExtractor(allow=['/.*\.html']), 'parse_mooc')]

    def parse_mooc(self, response):
        mooc = MoocItem()
        moocs = []
        mooc['url'] = response.url
        ch_name = response.xpath("//h1/text()").extract()
        en_name = response.xpath(
            "//div[@class='course_enname']/text()").extract()
        university = response.xpath("//h2[1]/text()").extract()
        time = response.xpath("//div[@class='coursetime']/text()").extract()
        desc = response.xpath(
            "//div[@class='content-entry clearfix']/p[1]/text()").extract()
        mooc['ch_name'] = [m.encode('utf-8') for m in ch_name]
        mooc['en_name'] = [m.encode('utf-8') for m in en_name]
        mooc['university'] = [m.encode('utf-8') for m in university]
        mooc['time'] = [m.encode('utf-8') for m in time]
        mooc['desc'] = [m.encode('utf-8') for m in desc]
        moocs.append(mooc)
        return moocs
class Jeeran(CrawlSpider):
    name = 'jeeran'
    allowed_domains = ['yellowpages.com.eg']
    start_urls = ['http://www.yellowpages.com.eg/en/category/pharmacies']
    #depth_limit = 0
    rules = (Rule(LinkExtractor(allow=('')), callback='parse_obj', follow=True), )
class PitchforkSpider(CrawlSpider):
    name = 'pitchfork'
    allowed_domains = ['pitchfork.com']
    start_urls = [
        'https://pitchfork.com/reviews/albums/',
        'https://pitchfork.com/reviews/albums/?page=2',
        'https://pitchfork.com/reviews/albums/?page=3',
        'https://pitchfork.com/reviews/albums/?page=4',
        'https://pitchfork.com/reviews/albums/?page=5'
    ]
    rules = [Rule(LinkExtractor(allow=''), callback='parse', follow=True)]

    def parse(self, response):
        artists = response.xpath('//ul/li[1]/text()').extract()
        album = response.xpath('//h2/text()').extract()
        urls = response.xpath('//div[@class = "review"]/a/@href').extract()
        url = [BASE_URL + link for link in urls]
        for link in url:
            request = scrapy.Request(link,
                                     callback=self.review_text,
                                     dont_filter=True)
            yield request

    def review_text(self, response):
        text = response.xpath('//p/text()').extract()
        title = response.xpath('//h2/ul/li/a/text()').extract()
        album = response.xpath('//h1/text()').extract()
        yield ReviewItem(artists=title, album=album, text=text)
class tripadvisorSpider(CrawlSpider): name = "ege" DOWNLOAD_DELAY = 1 allowed_domains = ["tripadvisor.com.tr"] start_urls = [ "http://www.tripadvisor.com.tr/Restaurants-g657096-Turkish_Aegean_Coast.html" ] def my_range(start, end, step): while start <= end: yield start start += step #for i in my_range(30, 6900, 30): #start_urls.append('http://www.tripadvisor.com.tr/Restaurants-g657096-oa'+str(i)+'-Turkish_Aegean_Coast.html') rules = [Rule(LinkExtractor(allow=['/Restaurant_Review.*'],deny = 'http://www.tripadvisor.com.tr/Restaurants-g657096-Turkish_Aegean_Coast.html'), 'parse_page')] def parse_page(self, response): item = items.EgerestorantscrapItem() item['isim'] = response.xpath('//*[@id="BREADCRUMBS"]/li[last()]/text()')[0].extract() item['adres'] = response.xpath('//*[@id="ABOVE_THE_FOLD"]/div[1]/div[2]/div/div[2]/div/div[1]/div/address/span/span/span[1]/text()')[0].extract() item['sehir'] = response.xpath('//*[@id="BREADCRUMBS"]/li[4]/a/span/text()')[0].extract() item['tel'] = response.xpath('//*[@id="ABOVE_THE_FOLD"]/div[1]/div[2]/div/div[2]/div/div[1]/div/div/div/div[1]/div/text()')[0].extract() item['ilce'] = response.xpath('//*[@id="BREADCRUMBS"]/li[5]/a/span/text()')[0].extract() yield item
class cqrbSpider(CrawlSpider):
    name = site_name
    allowed_domains = [
        "cqrbepaper.cqnews.net",
    ]
    start_urls = url_list
    rules = (
        Rule(LinkExtractor(allow=('/cqrb/html/\d{4}-\d{2}/\d{2}/content.+$')),
             callback='parse_data',
             follow=True),
    )

    def parse_data(self, response):
        # Get the publish time and store the files by year/month
        time = response.url.split('/')[5]
        year = time[0:4]
        month = time[5:7]
        path = data_dir + '/' + year + '/' + month
        if not os.path.exists(path):
            os.makedirs(path)
        # Get the title
        title = response.xpath(
            '//tr/td/strong/text()').extract()[1].strip().encode('utf-8')
        # Get the content
        content_list = response.xpath(
            '//*[@id="ozoom"]/founder-content//text()').extract()
        content = "".join(content_list).strip().encode("utf-8")
        # If the title or the content is empty, we got the wrong page;
        # do not create the file.
        if title and content:
            filename = path + '/' + title + '.txt'
            with open(filename, 'wb') as f:
                f.write(content)
class ProxySpider(CrawlSpider):
    name = "proxy"
    #allowed_domains = ['xici.net.co', 'youdaili.net']
    start_urls = [
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=10",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=20",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=30",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=40",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=50",
        r"http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip%20proxy&pn=60",
        r"http://www.gfsoso.net/?q=ip+proxy&t=1",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=10",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=20",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=30",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=40",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=50",
        r"http://www.gfsoso.net/?q=ip+proxy&pn=60",
    ]
    rules = (Rule(LinkExtractor(allow=(r'', )), callback='parse_item'), )

    def parse_item(self, response):
        soup = BeautifulSoup(response.body)
        str_list = [tag.string or '' for tag in soup.find_all(True)]
        body_str = ' '.join(str_list)
        # items = [GoproxyItem(ip=group[0], port=group[7], protocol='HTTP') for group in re.findall(REG_IP, body_str)]
        for group in re.findall(REG_IP, body_str):
            proxy_item, created = ProxyItem.objects.update_or_create(
                ip=group[0])
            proxy_item.port = group[7]
            proxy_item.save()
class QuotesSpider(scrapy.Spider):
    name = "express"  # name of the spider, used when running the crawl
    start_urls = ['http://indianexpress.com/section/india/']
    rules = (
        Rule(LinkExtractor(allow=(),
                           restrict_css=('.yt-uix-button-content a ::attr(href)')),
             callback="parse_page",
             follow=True),
    )

    def parse(self, response):
        # content is taken from the article listing (story class of the site)
        for quote in response.css('div.articles'):
            yield {
                # date is taken from the section teaser list of the site
                'Date': quote.css('div.date::text').extract_first(),
                'Headlines': quote.css('div.title a::text').extract_first(),
                'link': quote.css('div.title a::attr(href)').extract()
            }
        NEXT_PAGE_SELECTOR = 'div.pagination a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
        try:
            next_page = response.css(
                'div.pagination a::attr(href)').extract()[0]
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse)
        except IndexError:
            pass
class BbcSpider(CrawlSpider):
    name = "bbc"
    # The crawler will not go beyond these domains
    allowed_domains = ["bbc.com"]
    # Where to start crawling
    start_urls = ('http://www.bbc.com/', )
    rules = (
        # Change the rule as per the requirements, otherwise it will take hours
        #Rule(LinkExtractor(allow=r'/news/[A-Za-z0-9]'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'/news/[a-z]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = CrawlerItem()
        # Use an xpath or class selector
        #item['text'] = response.xpath('//*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]/div[3]/p[1]//text()').extract()[0]
        item['text'] = hxs.select(
            '//p[@class="story-body__introduction"]/text()').extract()
        item['author'] = "BBC News Media"
        #item['headline'] = response.xpath('//*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]/h1//text()').extract()[0]
        item['headline'] = hxs.select(
            '//h1[@class="story-body__h1"]/text()').extract()
        item['url'] = response.url
        yield item
class solidotSpider(CrawlSpider):
    name = site_name
    allowed_domains = [
        "solidot.org",
    ]
    start_urls = url_list
    rules = (
        Rule(LinkExtractor(allow=('story\?sid=\d+$')),
             callback='parse_data',
             follow=True),
    )

    def parse_data(self, response):
        # Get the publish time and store the files by year/month
        date_string = response.xpath('//div[@class="talk_time"]/text()')\
            .extract()[2].split(' ')[2]
        year = date_string[0:4]
        month = date_string[5:7]
        path = data_dir + '/' + year + '/' + month
        if not os.path.exists(path):
            os.makedirs(path)
        # Get the title
        title = response.xpath(
            '//div[@class="bg_htit"]/h2/text()').extract()[0]
        # Get the content
        content_list = response.xpath(
            '//div[@class="p_mainnew"]/text()').extract()
        content = "".join(content_list).strip().encode("utf-8")
        # If the title or the content is empty, we got the wrong page;
        # do not create the file.
        if title and content:
            filename = path + '/' + title + '.txt'
            with open(filename, 'wb') as f:
                f.write(content)
class CountrySpider(CrawlSpider):
    name = 'country'
    allowed_domains = ['example.webscraping.com']
    start_urls = ['http://example.webscraping.com/']
    rules = (
        Rule(LinkExtractor(allow='/index/', deny='/user/'), follow=True),
        Rule(LinkExtractor(allow='/view/', deny='/user/'), callback='parse_item')
    )

    def parse_item(self, response):
        item = ExampleItem()
        item['name'] = response.css(
            'tr#places_country__row td.w2p_fw::text').extract()
        item['population'] = response.css(
            'tr#places_population__row td.w2p_fw::text').extract()
        return item
class FollowAllSpider(Spider):
    name = 'followall'

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()

    def start_requests(self):
        return [Request(self.url, callback=self.parse, dont_filter=True)]

    def parse(self, response):
        """Parse a PageItem and all requests to follow

        @url http://www.scrapinghub.com/
        @returns items 1 1
        @returns requests 1
        @scrapes url title foo
        """
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        item = Page(url=response.url,
                    size=str(len(response.body)),
                    referer=response.request.headers.get('Referer'))
        self._set_title(item, response)
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_title(self, page, response):
        if isinstance(response, HtmlResponse):
            title = Selector(response).xpath("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(';', 1)[0]
                       for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies
def extract_links(self, response):
    # The parent can do most of it for us
    links = LinkExtractor.extract_links(self, response)
    try:
        good_links = [link for link in links if link.text.isdigit()]
    except TypeError:
        return None
    return good_links
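The extract_links override above presumably lives in a LinkExtractor subclass; a sketch, under that assumption, of how such a subclass could be declared and wired into a CrawlSpider rule (class names, the URL, and the XPath are illustrative):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DigitTextLinkExtractor(LinkExtractor):
    # Keep only links whose anchor text is purely numeric, which is
    # typical for pagination bars ("1 2 3 ...").
    def extract_links(self, response):
        links = LinkExtractor.extract_links(self, response)
        try:
            return [link for link in links if link.text.isdigit()]
        except TypeError:
            return None


class PaginationSpider(CrawlSpider):
    # Illustrative spider that follows only numbered pagination links.
    name = 'pagination_example'
    start_urls = ['http://example.com/list']
    rules = (
        Rule(DigitTextLinkExtractor(restrict_xpaths='//div[@id="paging-bottom"]'),
             callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        self.logger.info('Visited %s', response.url)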
def parse(self, response):
    # set the allowed domains in the link extractor
    ln_extractor = LinkExtractor(allow_domains=("news.sina.cn"),
                                 allow=(".*vt=1.*"))
    # get the links from the response
    links = ln_extractor.extract_links(response)
    urls = []
    items = []
    for i in links:
        urls.append(i.url)
        # all the not visited urls are put into container and queue.
        if i.url not in self.g_container_urls:
            self.g_queue_urls.put(i.url)
            self.g_container_urls.add(i.url)
    # make all the requests in the queue
    for j in range(self.g_queue_urls.qsize()):
        tp_url = self.g_queue_urls.get()
        items.append(self.make_requests_from_url(tp_url).
                     replace(callback=self.parse_page))
        items.append(self.make_requests_from_url(tp_url))
    return items
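parse above relies on g_container_urls and g_queue_urls being set up elsewhere in the spider; a sketch of that initialization under the assumption that they are a plain set and a standard-library Queue (the spider name is illustrative, and the Python 2 Queue module matches the Python 2 style of the snippet):

import scrapy
from Queue import Queue  # Python 2; on Python 3 use `from queue import Queue`


class SinaNewsSpider(scrapy.Spider):
    # Illustrative initialization of the containers used by parse() above.
    name = 'sina_news_example'
    start_urls = ['http://news.sina.cn/']

    def __init__(self, *args, **kwargs):
        super(SinaNewsSpider, self).__init__(*args, **kwargs)
        self.g_container_urls = set()  # every URL seen so far
        self.g_queue_urls = Queue()    # URLs waiting to be requested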
class startPageSpider(Spider):
    name = 'startPageSpider'

    def __init__(self, taskId, *a, **kw):
        """Constructor"""
        self.name += '_' + str(taskId)
        super(startPageSpider, self).__init__(*a, **kw)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.redis = redis.Redis(connection_pool=pool)
        self.dbUtils = db.dbUtils()
        self.taskId = int(taskId)
        self.project = None
        self.domain = None
        self.hasCrawlSet = set()
        self.hasInsertSet = set()
        project = self.dbUtils.queryRow(
            'SELECT * FROM project_setting WHERE iStatus=1 AND iPid=%d' % self.taskId)
        if project:
            self.project = project
            self.start_urls = str(project['szStartUrl']).split('~')
            self.domain = ".".join(
                urlparse(project['szDomain']).hostname.split(".")[-2:])

    def parse(self, response):
        print 'startPageSpider==========================>', response.url
        # log.msg(format='%(iPid)s, %(url)s, %(project)s ', iPid=self.taskId, url=response.url, project=self.project)
        listQuqueCount = self.redis.llen(
            'scrapy:startPageSpider:listQuque:%s' % self.taskId)
        if listQuqueCount == 1:
            self._crawler.signals.send_catch_log('writeListQuque')
        elif listQuqueCount == 0:
            self._crawler.signals.send_catch_log('emptyListQuque')
            print 'startPageSpider---------send_catch_log->emptyListQuque'
        if response.url not in self.hasCrawlSet:
            pattern = re.compile(r'%s' % self.project['szStartUrlReg'])
            self.hasCrawlSet.add(response.url)
            if pattern.match(response.url) and response.url not in self.hasInsertSet:
                title = "|".join(response.xpath('/html/head/title/text()').extract())
                insertSql = 'INSERT INTO project_start_page(iPid, szUrl, szTitle,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s")' % (
                    self.taskId, response.url, title,
                    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
                self.dbUtils.insert(insertSql)
                self.hasInsertSet.add(response.url)
                self.redis.lpush('scrapy:startPageSpider:listQuque:%s' % self.taskId, response.url)
                #self.redis.sadd('scrapy:startPageSpider:startPage:2', response.url)
                log.msg(format='spider=startPageSpider iPid=%(i)s, title=%(t)s url=%(u)s',
                        i=self.taskId, t=title, u=response.url)
            _allow = (_allow for _allow in self.project['szStartUrlReg'].split('~'))
            self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=_allow)
            links = [link for link in self.linkExtractor.extract_links(response)
                     if link.url not in self.hasCrawlSet]
            for link in links:
                yield self.make_requests_from_url(link.url)
def parse(self, response):
    xlink = LinkExtractor()
    itemre = re.compile(self.itemurl_re)
    for link in xlink.extract_links(response):
        if itemre.search(link.url):
            yield Request(url=link.url, callback=self.parse_item)
def __init__(self):
    LinkExtractor.__init__(self, restrict_xpaths='//div[@id="paging-bottom"]')
class listPageSpider(Spider):
    name = 'listPageSpider'

    def __init__(self, taskId, *a, **kw):
        """Constructor"""
        self.name += '_' + str(taskId)
        super(listPageSpider, self).__init__(*a, **kw)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.redis = redis.Redis(connection_pool=pool)
        self.dbUtils = db.dbUtils()
        self.taskId = int(taskId)
        self.domain = None
        self.project = None
        self.hasCrawlSet = set()
        self.hasInsertSet = set()
        self.isExit = 0
        project = self.dbUtils.queryRow(
            'SELECT * FROM project_setting WHERE iStatus=1 AND iPid=%d' % self.taskId)
        if project:
            self.project = project
            self.domain = ".".join(
                urlparse(project['szDomain']).hostname.split(".")[-2:])
        # self.start_urls = ['http://www.ty2016.com/cn/2.html', 'http://www.ty2016.com/cn/3.html', 'http://www.ty2016.com/cn/4.html']

    def stopSpider(self):
        self.isExit = 1

    def getStartUrl(self):
        url = self.redis.rpop('scrapy:startPageSpider:listQuque')
        if not url:
            self.getStartUrl()
        return url

    def start_requests(self):
        # url = self.getStartUrl()
        # print '=====================>', url
        # yield self.make_requests_from_url(url)
        while True:
            #if self._crawler.engine is not None:
            #if self._crawler.engine.paused: break
            #if not self._crawler.engine.running: break
            url = self.redis.rpop('scrapy:startPageSpider:listQuque:%s' % self.taskId)
            #print 'listPageSpider==========================>', url
            if url:
                #self.redis.sadd('scrapy:startPageSpider:startPage:1', url)
                yield self.make_requests_from_url(url)
            #else:
            #    self._crawler.signals.send_catch_log('emptyListQuque')
            #    print 'listPageSpider---------send_catch_log->emptyListQuque'

    def parse(self, response):
        #self.redis.sadd('scrapy:startPageSpider:startPage:3', response.url)
        if response.url not in self.hasCrawlSet:
            #self.redis.sadd('scrapy:startPageSpider:startPage:4', response.url)
            self.hasCrawlSet.add(response.url)
            _allow = (_allow for _allow in self.project['szUrlReg'].split('~'))
            self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=_allow)
            links = [link for link in self.linkExtractor.extract_links(response)
                     if link.url not in self.hasInsertSet]
            #self.redis.hset('scrapy:startPageSpider:listPage:count', response.url, len(links))
            for link in links:
                if link.url in self.hasInsertSet:
                    continue
                insertSql = 'INSERT INTO project_list_page(iPid, szUrl, szTitle, szSourceUrl,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s", "%s")' % (
                    self.taskId, link.url, link.text, response.url,
                    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
                self.dbUtils.insert(insertSql)
                self.hasInsertSet.add(link.url)
                log.msg(format='spider=listPageSpider iPid=%(i)s, title=%(t)s url=%(u)s',
                        i=self.taskId, t=link.text, u=link.url)
def parse_urls(self, response):
    extractor = LinkExtractor(
        restrict_xpaths=('//table[contains(@class,ibm-data-table)]/tbody',))
    links = extractor.extract_links(response)
    for link in links:
        url = link.url
        yield Request(url, callback=self.parse_items)
class TopicalFinder(SplashSpiderBase):
    name = 'topical_finder'

    save_html = None
    use_splash = None

    def __init__(self, seed_urls=None, save_html=1, use_splash=1,
                 screenshot_dir='/memex-pinterest/ui/static/images/screenshots',
                 op_time=10, **kwargs):
        '''
        Constructs a spider instance from the command line or the scrapyd daemon.

        :param seed_urls: comma-separated list of URLs; if empty, the crawler
            will follow not-yet-crawled URLs from storage
        :param save_html: boolean 0/1
        :param use_splash: boolean 0/1
        :param screenshot_dir: used only when use_splash=1
        :param op_time: operating time in minutes, negative - don't use that constraint
        :param kwargs:
        :return:
        '''
        super(TopicalFinder, self).__init__(screenshot_dir=screenshot_dir, **kwargs)
        self.screenshot_dir = screenshot_dir
        log.msg("SCREENSHOT DIR IS SET TO: %s" % str(screenshot_dir), _level=log.DEBUG)
        if seed_urls:
            self.start_urls = [add_scheme_if_missing(url) for url in seed_urls.split(',')]
        self.ranker = Ranker.load()
        self.linkextractor = LinkExtractor()
        self.save_html = bool(save_html)
        self.use_splash = bool(use_splash)
        self.operating_time = int(op_time) * 60
        self.start_time = datetime.utcnow()
        self.finishing = False

    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url, is_seed=True)

    def make_requests_from_url(self, url, is_seed=False):
        if self.use_splash:
            r = self._splash_request(url)
        else:
            r = super(TopicalFinder, self).make_requests_from_url(url)
        r.meta['score'] = 0.0
        r.meta['is_seed'] = False
        if is_seed:
            r.meta['is_seed'] = True
            r.meta['score'] = 1.0  # setting maximum score value for seeds
        log.msg("Making request to %s with meta: %s" % (r.url, str(r.meta)), _level=log.DEBUG)
        return r

    def set_crawler(self, crawler):
        super(TopicalFinder, self).set_crawler(crawler)
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def spider_idle(self):
        log.msg("Spider idle signal caught.", _level=log.DEBUG)
        raise DontCloseSpider

    def parse(self, response):
        ld = self._load_webpage_item(response, is_seed=response.meta['is_seed'])
        if self.use_splash:
            self._process_splash_response(response, ld)
        yield ld.load_item()

        if self.finishing:
            return

        now = datetime.utcnow()
        if self.operating_time > 0 and (now - self.start_time).total_seconds() > self.operating_time:
            log.msg("Reached operating time constraint. Waiting for Scrapy queue to exhaust.")
            self.finishing = True
            self.crawler.stop()
            return

        if not isinstance(response, TextResponse):
            return

        body = response.body_as_unicode().strip().encode('utf8') or '<html/>'
        score = self.ranker.score_html(body)
        log.msg("TC: %s has score=%f" % (response.url, score), _level=log.DEBUG)
        if score > 0.5:
            #!for some reason this is returning the raw splash response JSON
            #!and not the rendered HTML from splash
            #log.msg(u"\n\n\n****---Response body:\n %s----***\n\n\n" % response.body_as_unicode(), _level=log.DEBUG)
            #for link in self.linkextractor.extract_links(response):
            #can something like the line below fix it? Seems like a hack...
            for link in self.linkextractor.extract_links(response):
                log.msg("****---LINK EXTRACTED: %s----***" % str(link.url), _level=log.DEBUG)
                if self.use_splash:
                    r = self._splash_request(url=link.url)
                else:
                    r = Request(url=link.url)
                external = is_external_url(response.url, link.url)
                depth = response.meta.get('link_depth', 0)
                r.meta.update({
                    'link': {
                        'url': link.url,
                        'text': link.text,
                        'fragment': link.fragment,
                        'nofollow': link.nofollow},
                    'link_depth': 0 if external else depth + 1,
                    'referrer_depth': depth,
                    'referrer_url': response.url,
                })
                url_parts = urlparse_cached(r)
                path_parts = url_parts.path.split('/')
                r.meta['score'] = 1.0 / len(path_parts)
                r.meta['is_seed'] = False
                yield r

    def _load_webpage_item(self, response, is_seed):
        depth = response.meta.get('link_depth', 0)
        ld = WebpageItemLoader(response=response)
        ld.add_value('url', response.url)
        ld.add_value('host', get_domain(response.url))
        ld.add_xpath('title', '//title/text()')
        ld.add_value('depth', depth)
        ld.add_value('total_depth', response.meta.get('depth'))
        ld.add_value('crawled_at', datetime.utcnow())
        ld.add_value('is_seed', is_seed)
        ld.add_value('crawler_score', response.meta['score'])
        if self.save_html:
            ld.add_value('html', response.body_as_unicode())
        if 'link' in response.meta:
            link = response.meta['link']
            ld.add_value('link_text', link['text'])
            ld.add_value('link_url', link['url'])
            ld.add_value('referrer_url', response.meta['referrer_url'])
            ld.add_value('referrer_depth', response.meta['referrer_depth'])
        return ld
def parse_categories(self, response):
    l = LinkExtractor(restrict_xpaths='.//div[@class="categoryListContainer"]')
    links = l.extract_links(response)
    for link in links:
        yield Request(url=link.url, callback=self.parse_items_links)
def parse_level3_contents(self, response):
    baseurl = response.xpath('//base/@href').extract()[0]
    le = LinkExtractor()
    for link in le.extract_links(response):
        if self.allowed_domains[0] in link.url:
            yield Request(link.url, callback=self.final_contents)
def parse_urls(self, response):
    extractor = LinkExtractor(
        restrict_xpaths=('//div[contains(@class, "news_type2")]/h2',))
    links = extractor.extract_links(response)
    for link in links:
        url = link.url
        yield Request(url, callback=self.parse_items)