def parse(self, response):
    print 'next_page_url ', response.url
    blogs = response.xpath('//div[@itemtype="http://schema.org/BlogPosting"]')
    for sub_blog in blogs:
        item = BlogExtractItem()
        # Get Title, Link, Date
        title = sub_blog.xpath('meta[@itemprop="mainEntityOfPage"]/@content').extract_first()
        link = sub_blog.xpath('meta[@itemprop="mainEntityOfPage"]/@itemid').extract_first()
        date = sub_blog.xpath('meta[@itemprop="datePublished"]/@content').extract_first()
        item['title'] = replace_special_chars(title)
        item['link'] = link
        item['date'] = date
        # Make a request to the actual blog link to extract the remaining fields.
        request = scrapy.Request(item['link'], callback=self.parse_sub_blog)
        request.meta['item'] = item
        yield request
    # If a next page exists, request it and parse it the same way.
    next_page = response.xpath('//a[contains(text(),"Older Posts")]')
    if next_page:
        next_page_url = next_page.xpath('@href').extract_first()
        yield scrapy.Request(next_page_url, callback=self.parse)
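# All of the parse() methods in this section assume shared scaffolding defined
# elsewhere in the project: a BlogExtractItem item class, a
# replace_special_chars() text cleaner, and a parse_sub_blog() callback that
# extracts the post body. A minimal sketch under those assumptions (the field
# names are inferred from the methods here; the cleaning rule in
# replace_special_chars is hypothetical):
import re

import scrapy


class BlogExtractItem(scrapy.Item):
    # Fields populated by the parse() methods in this section.
    title = scrapy.Field()
    link = scrapy.Field()
    date = scrapy.Field()
    author = scrapy.Field()
    comments_count = scrapy.Field()


def replace_special_chars(text):
    # Hypothetical cleaner: collapse runs of whitespace and trim the result.
    if text is None:
        return None
    return re.sub(r'\s+', ' ', text).strip()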
def parse(self, response):
    if not self.exclude_posts:
        self.exclude_posts = response.xpath(
            '//input[@id="tt-hidden-exclude-posts"]/@value').extract_first()
    print 'exclude_posts ', self.exclude_posts
    blogs = response.xpath('//div[@class="scc left"]')
    if not blogs:
        return
    for blog in blogs:
        item = BlogExtractItem()
        title = blog.xpath('div[@class="scbt"]/h5/a/text()').extract_first()
        link = blog.xpath('div[@class="scbt"]/h5/a/@href').extract_first()
        author = blog.xpath('div[@class="scbt"]/span/a/text()').extract_first().strip()
        item['title'] = title
        item['link'] = link
        item['author'] = author
        request = scrapy.Request(link, callback=self.parse_sub_blog, dont_filter=True)
        request.meta['item'] = item
        yield request
    # Request the next page through the site's AJAX endpoint.
    yield scrapy.FormRequest(
        url=self.intermediate_url,
        formdata={
            "page": str(self.page_num),
            "excludePosts": self.exclude_posts
        },
        callback=self.parse,
        dont_filter=True)
    self.page_num += 1
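# The FormRequest pagination above assumes spider-level state initialised
# elsewhere, roughly along these lines (the URLs are placeholders, not the
# real site):
import scrapy


class ExampleAjaxSpider(scrapy.Spider):
    name = 'example_ajax'
    start_urls = ['https://example.com/blog/']           # hypothetical
    intermediate_url = 'https://example.com/ajax/posts'  # hypothetical AJAX endpoint
    page_num = 1
    exclude_posts = None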
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath('//div[@class="post"]') for sub_blog in blogs: item = BlogExtractItem() #Get Title, Link link = sub_blog.xpath('a/@href').extract_first() title = sub_blog.xpath( 'a/span/span[@class="title"]/strong/text()').extract_first() date = sub_blog.xpath( 'a/span/span[@class="date"]/em/text()').extract_first() item['title'] = replace_special_chars(title) item['link'] = link item['date'] = date # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item request.meta['dont_redirect'] = True yield request # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//span[contains(text(),"Older posts")]') if next_page: next_page_url = next_page.xpath('../@href').extract_first() yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response):
    print 'next_page_url ', response.url
    blogs = response.xpath('//li[contains(@class, "type-post status-publish")]')
    if not blogs:
        return
    for sub_blog in blogs:
        item = BlogExtractItem()
        # Get Title, Link
        title = sub_blog.xpath('a/h2/text()').extract_first()
        link = sub_blog.xpath('a/@href').extract_first()
        item['title'] = replace_special_chars(title)
        item['link'] = link
        # Make a request to the actual blog link to extract the remaining fields.
        request = scrapy.Request(item['link'], callback=self.parse_sub_blog)
        request.meta['item'] = item
        yield request
    # Pages are numbered sequentially, so keep requesting /page/<n> until a page
    # comes back empty (the early return above stops the crawl).
    next_page_url = self.start_urls[0] + 'page/' + str(self.page_number)
    yield scrapy.Request(next_page_url, callback=self.parse)
    self.page_number += 1
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath( '//div[@class="eight columns post-group-content"]/article') for sub_blog in blogs: item = BlogExtractItem() #Get Title, Link title = sub_blog.xpath('header/h1/a/text()').extract_first() link = sub_blog.xpath('header/h1/a/@href').extract_first() date = sub_blog.xpath('footer//li[@class="date"]/time/text()' ).extract_first().strip() author = sub_blog.xpath('footer//li[@class="author"]').re( r'.*By\s*(.*?)\s*<') disgus_identifier = sub_blog.xpath( 'footer//li[@class="comments"]/a/span/@data-dsqidentifier' ).extract_first().strip() item['title'] = replace_special_chars(title) item['link'] = link item['date'] = date item['author'] = author[0] if author else None # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item request.meta['disgus_identifier'] = disgus_identifier yield request # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//a[contains(text(),"Older Entries")]') if next_page: next_page_url = next_page.xpath('@href').extract_first() yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath('//div[@class="inside-article"]') for sub_blog in blogs: item = BlogExtractItem() #Get Title, Link title = sub_blog.xpath('header/h2/a/text()').extract_first() link = sub_blog.xpath('header/h2/a/@href').extract_first() comments_count = sub_blog.xpath( 'footer/span[@class="comments-link"]/a').re(r'(\d+)\s*Comment') item['title'] = replace_special_chars(title) item['link'] = link item['comments_count'] = comments_count[ 0] if comments_count else None # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item yield request # If next page is there, make a request and proceed similar as above. # next_page = response.xpath('//a[@class="next page-numbers"]') next_page = response.xpath('//a[@class="next page-numbers"]') # next_page = response.xpath('//a[contains(text(), "Next")]') if next_page: next_page_url = next_page.xpath('@href').extract_first() request = scrapy.Request(next_page_url, callback=self.parse) request.meta['dont_redirect'] = True request.meta['handle_httpstatus_all'] = True yield request
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath('//header[@class="entry-header"]') for sub_blog in blogs: item = BlogExtractItem() #Get Title, Link title = sub_blog.xpath('h2/a/text()').extract_first() link = sub_blog.xpath('h2/a/@href').extract_first() date = sub_blog.xpath( 'div/span/a/time[@class="entry-date published"]/text()' ).extract_first() item['title'] = replace_special_chars(title) item['link'] = link item['date'] = date # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item yield request # yield item # # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//a[@class="next page-numbers"]') if next_page: next_page_url = next_page.xpath('@href').extract_first() yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath('//h2[@class="blogtitle"]') for sub_blog in blogs: item = BlogExtractItem() #Get Title, Link title = sub_blog.xpath('a/text()').extract_first() link = sub_blog.xpath('a/@href').extract_first() item['title'] = replace_special_chars(title) item['link'] = link # yield item # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item yield request # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//a[contains(text(),"Older posts")]') if next_page: next_page_url = next_page.xpath('@href').extract_first() yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response): archive_list = response.xpath( '//div[@id="smart-archives-list"]/ul[@class="archive-list"]') print response.url # print response.text for sub_block in archive_list: for each_blog in sub_block.xpath('li'): # print each_blog item = BlogExtractItem() # #Get Title, Link, Date title = each_blog.xpath('a/text()').extract_first().strip() link = each_blog.xpath('a/@href').extract_first() date = each_blog.re(r'.*?(\d+\s*\w+\s*\d+)\s*-')[0] comments_count = each_blog.re(r'.*?\((\d+)\s*comment')[0] item['title'] = replace_special_chars(title) item['link'] = link item['date'] = date item['comments_count'] = comments_count # Make a request to actual link for the blog to extract other info request = scrapy.Request(link, callback=self.parse_sub_blog) request.meta['dont_redirect'] = True request.meta['item'] = item yield request
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath( '//div[@id="post-entry"]/div[@class="post-meta"]') for sub_blog in blogs: item = BlogExtractItem() #Get Title, Link title = sub_blog.xpath('h1/a/text()').extract_first() link = sub_blog.xpath('h1/a/@href').extract_first() posted_date = sub_blog.xpath('div[@class="post-date"]').re( r'.*on\s*(.*)\s*<') author = sub_blog.xpath( 'div[@class="authors-cat"]/a[@rel="author"]/text()' ).extract_first() comments_count = sub_blog.xpath( 'div[@class="post-commented"]/a').re(r'(\d+)\s*Comment') item['title'] = replace_special_chars(title) item['link'] = link item['date'] = posted_date[0] if posted_date else None item['author'] = author item['comments_count'] = comments_count[ 0] if comments_count else None # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item yield request # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//a[contains(text(),"Older Entries")]') if next_page: next_page_url = next_page.xpath('@href').extract_first() yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response): blog_posts = response.xpath('//div[@class="blog-posts"]') print response.url for sub_blog in blog_posts.xpath( 'div[@class="blog-post"]/div[@class="blog-post__description"]' ): item = BlogExtractItem() #Get Title, Link, Date title = sub_blog.xpath('h2/a/text()').extract_first().strip() link = sub_blog.xpath('h2/a/@href').extract_first() author = sub_blog.xpath( 'div[@class="blog-post__meta"]/strong/a/text()').extract_first( ) item['title'] = replace_special_chars(title) item['link'] = link item['author'] = author #Make a request to actual link for the blog to extract other info request = scrapy.Request(link, callback=self.parse_sub_blog) request.meta['dont_redirect'] = True request.meta['item'] = item yield request #If next page is there, make a request and proceed similar as above. next_page = response.xpath('//a[contains(@title, "Next Page")]') if next_page: next_page_url = next_page.xpath('@href').extract_first() request = scrapy.Request(next_page_url, callback=self.parse) request.meta['dont_redirect'] = True request.meta['handle_httpstatus_all'] = True yield request
def parse(self, response):
    print 'next_page_url ', response.url
    blogs = response.xpath('//div[contains(@id, "post-")]')
    for sub_blog in blogs:
        item = BlogExtractItem()
        # Get Title, Link, Date, Comments count
        title = sub_blog.xpath('h2/a/text()').extract_first()
        link = sub_blog.xpath('h2/a/@href').extract_first()
        date = sub_blog.xpath(
            'abbr[@class="teaser_date published"]/text()').extract_first()
        comments_count = sub_blog.xpath(
            'a[@class="teaser_comments"]/span/text()').extract_first()
        item['title'] = replace_special_chars(title)
        item['link'] = link
        item['date'] = date
        item['comments_count'] = comments_count
        # Make a request to the actual blog link to extract the remaining fields.
        request = scrapy.Request(item['link'], callback=self.parse_sub_blog)
        request.meta['item'] = item
        yield request
    # If a next page exists, request it and parse it the same way.
    next_page = response.xpath('//a[contains(text(),"Older Posts")]')
    if next_page:
        next_page_url = next_page.xpath('@href').extract_first()
        yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response):
    blog_posts = response.xpath('//article[contains(@class, "type-post status-publish")]')
    print 'page_url ', response.url
    for sub_blog in blog_posts:
        item = BlogExtractItem()
        # Get the post_id, needed for the follow-up request that fetches comments.
        post_id = sub_blog.xpath('@class')[0].re(r'post-(\w+)')[0]
        # Get Title, Link
        title = sub_blog.xpath('h2[@class="entry-title"]/a/text()').extract_first().strip()
        link = sub_blog.xpath('h2[@class="entry-title"]/a/@href').extract_first()
        item['title'] = replace_special_chars(title)
        item['link'] = link
        # Make a request to the actual blog link to extract the remaining fields.
        request = scrapy.Request(link, callback=self.parse_sub_blog)
        request.meta['dont_redirect'] = True
        request.meta['item'] = item
        request.meta['post_id'] = post_id
        yield request
    # If a next page exists, request it and parse it the same way.
    next_page = response.xpath('//a[contains(text(),"Older Entries")]')
    if next_page:
        next_page_url = next_page.xpath('@href').extract_first()
        yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath('//header[@class="entry-header"]') for sub_blog in blogs: item = BlogExtractItem() #Get Title, Link title = sub_blog.xpath('h2/a/text()').extract_first() link = sub_blog.xpath('h2/a/@href').extract_first() date = sub_blog.xpath('p[@class="entry-meta"]/time/text()').extract_first().strip() author = sub_blog.xpath('p[@class="entry-meta"]/span[@class="entry-author"]/a/span/text()').extract_first().strip() comments_count = sub_blog.xpath('p[@class="entry-meta"]/span[@class="entry-comments-link"]/a').re(r'(\d+)\s*Comment') item['title'] = replace_special_chars(title) item['link'] = link item['date'] = date item['author'] = author item['comments_count'] = comments_count[0] if comments_count else None # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item yield request # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//a[contains(text(),"Next Page")]') if next_page: next_page_url = next_page.xpath('@href').extract_first() yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath('//div[@class= "col-md-9 col-sm-12"]') for sub_blog in blogs: item = BlogExtractItem() #Get Title, Link title = sub_blog.xpath('h2/a/text()').extract_first() link = sub_blog.xpath('h2/a/@href').extract_first() comments_count = sub_blog.xpath('div[@class="entry-meta"]/a').re( r'(\d+)\s*Comment') item['title'] = replace_special_chars(title) item['link'] = link item['comments_count'] = comments_count[0] if comments_count else 0 # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item yield request # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//a[contains(text(),"Older posts")]') if next_page: next_page_url = next_page.xpath('@href').extract_first() yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath('//h1[@class="entry-title"]') for sub_blog in blogs: if sub_blog.xpath('span[text()="Articles"]'): continue if sub_blog.xpath('a[contains(text(),"The Weekly Roundup")]'): continue item = BlogExtractItem() #Get Title, Link title = sub_blog.xpath('a/text()').extract_first() link = sub_blog.xpath('a/@href').extract_first() item['title'] = replace_special_chars(title) item['link'] = link # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item yield request # yield item # # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//div[@class="basel-pagination"]/span[@class="current"]/following-sibling::a') if next_page: next_page_url = next_page.xpath('@href').extract_first() yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath('//a[@class="u-url"]') for sub_blog in blogs: item = BlogExtractItem() #last offset 1259846023988 #Get Title, Link title = ''.join( a.replace('\n', '').strip() for a in sub_blog.xpath('text()').extract()) link = sub_blog.xpath('@href').extract_first() date = sub_blog.xpath('time/@datetime').extract_first() item['title'] = replace_special_chars(title) item['link'] = self.start_urls[0] + link item['date'] = date # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item yield request # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//a[@class= "load-more"]') if next_page: next_page_url = self.start_urls[0] + next_page.xpath( '@href').extract_first() yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response):
    # After the first page, results arrive as JSON with the rendered HTML in
    # data['data'], so wrap it in an HtmlResponse before running the XPaths.
    if self.page_number != 1:
        data = json.loads(response.text)
        body = data['data']
        response = HtmlResponse(url=response.url, body=body, encoding='utf-8')
    blog_posts = response.xpath('//article[contains(@class, "post-")]')
    if not blog_posts:
        return
    for sub_blog in blog_posts:
        item = BlogExtractItem()
        post_id = sub_blog.xpath('@class')[0].re(r'post-(\d+)')[0]
        # Get Title, Link
        title = sub_blog.xpath(
            'div/header/h2[@class="entry-title"]/a/text()').extract_first()
        if not title:
            title = sub_blog.xpath(
                'div/header/h2[@class="entry-title"]/a/strong/text()').extract_first()
        title = title.strip()
        link = sub_blog.xpath(
            'div/header/h2[@class="entry-title"]/a/@href').extract_first()
        item['title'] = replace_special_chars(title)
        item['link'] = link
        # Make a request to the actual blog link to extract the remaining fields.
        request = scrapy.Request(item['link'], callback=self.parse_sub_blog)
        request.meta['item'] = item
        request.meta['post_id'] = post_id
        yield request
    # Request the next batch through the AJAX "load more" endpoint.
    self.page_number += 1
    yield scrapy.FormRequest(
        url=self.intermediate_url,
        formdata={
            "action": "be_ajax_load_more",
            "nonce": "5622db3955",
            "page": str(self.page_number),
            "query[pagename]": "articles"
        },
        callback=self.parse,
        dont_filter=True)
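# The AJAX branch above relies on two module-level imports and on page_number
# starting at 1, so the first (plain-HTML) page is parsed directly. A sketch
# of the assumed setup (the URLs are placeholders; the "be_ajax_load_more"
# action suggests a WordPress admin-ajax endpoint, but that is an assumption):
import json

import scrapy
from scrapy.http import HtmlResponse


class ExampleLoadMoreSpider(scrapy.Spider):
    name = 'example_load_more'
    start_urls = ['https://example.com/articles/']                    # hypothetical
    intermediate_url = 'https://example.com/wp-admin/admin-ajax.php'  # hypothetical
    page_number = 1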
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath('//div[@class="journal-entry"]') for sub_blog in blogs: item = BlogExtractItem() #Get Title, Link, Date title = sub_blog.xpath( 'div[@class="journal-entry-text"]/h2/a/text()').extract_first( ) link = sub_blog.xpath( 'div[@class="journal-entry-text"]/h2/a/@href').extract_first() try: author = sub_blog.xpath( 'div[@class="journal-entry-tag journal-entry-tag-post-body"]\ /div/span[@class="posted-by"]/a/text()').extract( )[1] except: author = sub_blog.xpath( 'div[@class="journal-entry-text"]/div[@class="journal-entry-tag journal-entry-tag-post-body"]\ /div/span[@class="posted-by"]/a/text()').extract( )[1] try: date = sub_blog.xpath( 'div[@class="journal-entry-text"]/div[@class="journal-entry-tag journal-entry-tag-post-body"]\ /div/span[@class="posted-on"]/text()').extract()[1] except: date = sub_blog.xpath( 'div[@class="journal-entry-tag journal-entry-tag-post-body"]\ /div/span[@class="posted-on"]/text()').extract()[1] item['title'] = replace_special_chars(title) item['link'] = self.start_urls[0] + link item['author'] = author item['date'] = date # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item yield request # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//a[contains(text(),"Next 20 Entries")]') if next_page: next_page_url = self.start_urls[0] + next_page.xpath( '@href').extract_first() yield scrapy.Request(next_page_url, callback=self.parse)
def parse(self, response):
    blog_posts = response.xpath('//div[contains(@class, "type-post status-publish")]')
    print 'page_url ', response.url
    for sub_blog in blog_posts:
        item = BlogExtractItem()
        # Get the post_id, needed for the follow-up request that fetches comments.
        post_id = sub_blog.xpath('@class')[0].re(r'\s*post-(\d+)\s*')[0]
        # Get Title, Link, Date
        title = sub_blog.xpath(
            'div[@class="blog-post-wrap"]//h2[@class="blog-post-title"]/a/text()'
        ).extract_first().strip()
        link = sub_blog.xpath(
            'div[@class="blog-post-wrap"]//h2[@class="blog-post-title"]/a/@href'
        ).extract_first().strip()
        date = sub_blog.xpath(
            'div[@class="blog-post-wrap"]//span[@class="vntd-meta-date"]/span/text()'
        ).extract_first().strip()
        author = sub_blog.xpath(
            'div[@class="blog-post-wrap"]//span[@class="vntd-meta-author"]/a/text()'
        ).extract_first().strip()
        disgus_identifier = sub_blog.xpath(
            'div[@class="blog-post-wrap"]//span[@class="dsq-postid"]/@data-dsqidentifier'
        ).extract_first().strip()
        item['title'] = replace_special_chars(title)
        item['link'] = link
        item['date'] = date
        item['author'] = author
        # Make a request to the actual blog link to extract the remaining fields.
        request = scrapy.Request(link, callback=self.parse_sub_blog)
        request.meta['dont_redirect'] = True
        request.meta['item'] = item
        request.meta['disgus_identifier'] = disgus_identifier
        yield request
    # If a next page exists, request it and parse it the same way.
    next_page = response.xpath('//a[contains(text(), "Next")]')
    if next_page:
        next_page_url = next_page.xpath('@href').extract_first()
        request = scrapy.Request(next_page_url, callback=self.parse)
        request.meta['dont_redirect'] = True
        request.meta['handle_httpstatus_all'] = True
        yield request
def parse(self, response):
    blog_posts = response.xpath('//td[@id]')
    for sub_blog in blog_posts:
        item = BlogExtractItem()
        # Get Title, Link
        title = sub_blog.xpath('a/text()').extract_first().strip()
        link = sub_blog.xpath('a/@href').extract_first()
        item['title'] = replace_special_chars(title)
        item['link'] = link
        # Make a request to the actual blog link to extract the remaining fields.
        request = scrapy.Request(link, callback=self.parse_sub_blog)
        request.meta['item'] = item
        yield request
def parse(self, response): blog_posts = response.xpath('//main[@class="content"]') print 'next_page_url ', response.url for sub_blog in blog_posts.xpath('article[contains(@class, "post-")]'): item = BlogExtractItem() #Get Title, Link, Date title = sub_blog.xpath( 'header/h2[@class="entry-title"]/a/text()').extract_first() if not title: title = sub_blog.xpath( 'header/h2[@class="entry-title"]/a/strong/text()' ).extract_first() title = title.strip() link = sub_blog.xpath( 'header/h2[@class="entry-title"]/a/@href').extract_first() date = sub_blog.xpath('header/p[@class="entry-meta"]/time/text()' ).extract_first().strip() author = sub_blog.xpath( 'header/p[@class="entry-meta"]/span[@class="entry-author"]/span/text()' ).extract_first().strip() comments_count = sub_blog.xpath( 'header/p[@class="entry-meta"]/span[@class="entry-comments-link"]/a' ).re(r'(\d+)\s*Comment') item['comments_count'] = comments_count[0] if comments_count else 0 item['title'] = replace_special_chars(title) item['link'] = link item['date'] = date item['author'] = author # Make a request to actual link for the blog to extract other info request = scrapy.Request(link, callback=self.parse_sub_blog) request.meta['dont_redirect'] = True request.meta['item'] = item yield request # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//a[contains(text(), "Next Page")]') if next_page: next_page_url = next_page.xpath('@href').extract_first() request = scrapy.Request(next_page_url, callback=self.parse) request.meta['dont_redirect'] = True request.meta['handle_httpstatus_all'] = True yield request
def parse(self, response):
    data = json.loads(response.body)
    print 'current_index ', self.current_index
    current_items = data.get('items', [])
    maximum_times = data.get('maxNumItems')
    for post in current_items:
        item = BlogExtractItem()
        item['title'] = post['title'][0]
        item['link'] = post['link']
        request = scrapy.Request(item['link'], self.parse_sub_blog)
        request.meta['item'] = item
        yield request
    self.current_index += 10
    if self.current_index < maximum_times:
        yield scrapy.Request(self.quotes_base_url % self.current_index)
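# The offset pagination above assumes a format-string API URL and a running
# offset that advances by the page size (10) until maxNumItems is reached.
# A sketch of the assumed spider state (the URL is a placeholder; only the
# %s offset slot matters):
import scrapy


class ExampleOffsetSpider(scrapy.Spider):
    name = 'example_offset'
    quotes_base_url = 'https://example.com/api/posts?offset=%s'  # hypothetical
    current_index = 0
    start_urls = [quotes_base_url % 0]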
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath('//ul[@class="archive-list"]/li') for sub_blog in blogs: item = BlogExtractItem() #Get Title, Link, Date, Comment title = sub_blog.xpath('a/text()').extract_first() link = sub_blog.xpath('a/@href').extract_first() item['title'] = replace_special_chars(title) item['link'] = link # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item yield request
def parse(self, response): blog_posts = response.xpath('//div[@class="content_column"]/article') print 'next_page_url ', response.url for sub_blog in blog_posts: item = BlogExtractItem() #Get Title, Link, Date title = sub_blog.xpath( 'header/h1[@class="entry-title"]/a/text()').extract_first() title = title.strip() link = sub_blog.xpath( 'header/h1[@class="entry-title"]/a/@href').extract_first() author = sub_blog.xpath( 'header//a[@rel="author external"]/text()').extract_first() if not author: author = sub_blog.xpath('header//div[@class="post_byline"]' ).re(r'.*By\s*(.*?)\s*<')[0] comments_count = sub_blog.xpath( 'header//a[@class="comment_number"]/text()').extract_first( ).strip() item['title'] = replace_special_chars(title) item['link'] = link item['author'] = author.strip() if author else None item['comments_count'] = comments_count # Make a request to actual link for the blog to extract other info request = scrapy.Request(link, callback=self.parse_sub_blog, dont_filter=True) request.meta['dont_redirect'] = True request.meta['item'] = item yield request # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//a[contains(text(), "Previous")]') if next_page: next_page_url = next_page.xpath('@href').extract_first() request = scrapy.Request(next_page_url, callback=self.parse, dont_filter=True) # request.meta['dont_redirect'] = True # request.meta['handle_httpstatus_all'] = True yield request
def parse(self, response): print 'next_page_url ', response.url blogs = response.xpath('//ul[@class="car-monthlisting"]/li') for sub_blog in blogs: item = BlogExtractItem() #Get Title, Link, Date, Comment title = sub_blog.xpath('a/text()').extract_first() link = sub_blog.xpath('a/@href').extract_first() comments_count = sub_blog.xpath('span').re(r'\((\d+)\)') item['title'] = replace_special_chars(title) item['link'] = link item['comments_count'] = comments_count[0] if comments_count else 0 # Make a request to actual link for the blog to extract other info request = scrapy.Request(item['link'], callback=self.parse_sub_blog) request.meta['item'] = item yield request
def parse(self, response):
    print 'next_page_url ', response.url
    blogs = response.xpath('//table//td/a')
    for sub_blog in blogs:
        # Skip subscriber-only entries.
        if sub_blog.xpath('../span[@class="archive-subscribers-only"]'):
            continue
        item = BlogExtractItem()
        # Get Title, Link
        title = sub_blog.xpath('text()').extract_first()
        link = sub_blog.xpath('@href').extract_first()
        item['title'] = replace_special_chars(title)
        item['link'] = link
        # Make a request to the actual blog link to extract the remaining fields.
        request = scrapy.Request(item['link'], callback=self.parse_sub_blog)
        request.meta['item'] = item
        yield request
def parse(self, response):
    # Strip the "])}while(1);</x>" prefix before parsing the JSON body.
    data = json.loads(response.body.replace("])}while(1);</x>", ''))
    posts = data.get('payload', {}).get('references', {}).get('Post', {})
    for post in posts.values():
        item = BlogExtractItem()
        item['title'] = post['title']
        item['link'] = "https://medium.com/@benjaminhardy/" + post['uniqueSlug']
        request = scrapy.Request(item['link'], self.parse_sub_blog)
        request.meta['item'] = item
        yield request
    # Follow the API's paging cursor if more posts are available.
    _next_page = data.get('payload', {}).get('paging', {}).get('next')
    if _next_page:
        _to = _next_page['to']
        _page_num = _next_page['page']
        yield scrapy.Request(self.quotes_base_url % (_to, _page_num))
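# Medium prefixes its JSON responses with "])}while(1);</x>" as an
# anti-JSON-hijacking guard, which is why it is stripped above before
# json.loads(). The paging request assumes a format string with "to" and
# "page" slots, roughly like this (the exact query string is an assumption):
quotes_base_url = 'https://medium.com/@benjaminhardy/latest?format=json&to=%s&page=%s'  # hypothetical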
def parse(self, response): blog_posts = response.xpath('//div[contains(@class, "blog-posts")]') print 'page_url ', response.url for sub_blog in blog_posts.xpath('article[@id]'): item = BlogExtractItem() # get post_id which is needed to call another request to get comments post_id = sub_blog.xpath('@id')[0].re(r'post-(\w+)')[0] #Get Title, Link, Date title = sub_blog.xpath('header/h1[@class="entry-title"]/a/text()' ).extract_first().strip() link = sub_blog.xpath( 'header/h1[@class="entry-title"]/a/@href').extract_first() date = sub_blog.xpath( 'footer/span[@class="post-date"]/a/time/text()').extract_first( ).strip() author = sub_blog.xpath( 'footer/span[@class="by-author"]/span/a/text()').extract_first( ).strip() item['title'] = replace_special_chars(title) item['link'] = link item['date'] = date item['author'] = author # Make a request to actual link for the blog to extract other info request = scrapy.Request(link, callback=self.parse_sub_blog) request.meta['dont_redirect'] = True request.meta['item'] = item request.meta['post_id'] = post_id yield request # If next page is there, make a request and proceed similar as above. next_page = response.xpath('//div[@class = "nav-previous"]/a') if next_page: next_page_url = next_page.xpath('@href').extract_first() request = scrapy.Request(next_page_url, callback=self.parse) request.meta['dont_redirect'] = True request.meta['handle_httpstatus_all'] = True yield request
def parse(self, response):
    print 'next_page_url ', response.url
    blogs = response.xpath('//ul/*')
    for sub_blog in blogs:
        item = BlogExtractItem()
        # Get Title, Link
        title = sub_blog.xpath('a/text()').extract_first()
        link = sub_blog.xpath('a/@href').extract_first()
        item['title'] = replace_special_chars(title)
        item['link'] = self.start_urls[0].split('/blog')[0] + link
        yield SplashRequest(
            item['link'],
            self.parse_sub_blog,
            endpoint='execute',
            args={
                'lua_source': script,
                'pad': 32,
                'css': 'a.title'
            },
            meta={'item': item})
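# SplashRequest comes from scrapy-splash, and the 'execute' endpoint runs a
# module-level Lua `script` that is not shown in this section. A minimal
# sketch of such a script, assuming it only needs to render the page and
# return its HTML (the real one presumably also uses the pad/css args, e.g.
# for screenshot cropping):
from scrapy_splash import SplashRequest

script = """
function main(splash, args)
    assert(splash:go(args.url))
    assert(splash:wait(1.0))
    return {html = splash:html()}
end
"""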