def parse(self, response):
    """Scrape a post page into one product item and its comment items.

    Yields nothing when ``response.url`` is the site's front page URL.
    Otherwise yields one product item (only when ``self.max_posts`` is
    positive), followed by one comment item for every direct child div of
    a ``modal-post--comment`` thread whose class is exactly ``comment`` or
    ``comment child``.
    """
    if response.url == 'http://www.producthunt.com/':
        return

    sel = Selector(response)
    postid = sel.xpath(
        '//div[@class="modal-post--header--upvote upvote m-big"]/@data-vote-id'
    ).extract()

    # The product that this comment page belongs to.
    if self.max_posts > 0:
        p = sel.xpath('//div[@class="modal-post"]')
        il = ProductItemLoader(response=response, selector=p)
        il.add_xpath('vote_count', '*//*[@class="vote-count"]/text()')
        il.add_value("postid", postid)
        il.add_xpath("name", 'header/div[2]/h1/a/text()')
        il.add_xpath("url", 'header/div[2]/h1/a/@href')
        il.add_xpath("description",
                     '*//p[@class="modal-post--header--tagline"]/text()')
        # The upvote element may be absent; indexing an empty list here
        # would raise IndexError and lose the whole page, so only build the
        # URL when an id was actually extracted.
        if postid:
            il.add_value("comment_url",
                         'http://www.producthunt.com/posts/' + postid[0])
        il.add_xpath("comment_count", 'section[3]/h2/text()')
        il.add_xpath("date", 'div/div[1]/h2/abbr/text()')
        il.add_xpath("userid", 'div/div[1]/span/a/@href')
        yield il.load_item()

    # Comments for this product, grouped by thread.
    for t in sel.xpath('//div[@class="modal-post--comment"]'):
        parentid = t.xpath('@data-parent-id').extract()
        for c in t.xpath('div'):
            classes = c.xpath('@class').extract()
            # A div without a class attribute cannot be a comment; skip it
            # instead of failing on classes[0].
            if not classes:
                continue
            cls = classes[0]
            if cls not in ('comment', 'comment child'):
                continue

            il = CommentItemLoader(response=response, selector=c)
            il.add_xpath('commentid', '@data-comment-id')
            il.add_value('parentid', parentid)
            il.add_value('postid', postid)
            il.add_xpath('userid',
                         '*//*[@class="comment-user-handle"]/text()')
            il.add_xpath('user_title',
                         '*//*[@class="comment-user-headline"]/text()')
            il.add_xpath('user_name',
                         '*//*[@class="comment-user-name"]/a/text()')
            il.add_xpath('user_icon',
                         '*//*[@class="user-image-link-post"]/img/@src')
            il.add_xpath('vote_count', '*//*[@class="vote-count"]/text()')
            il.add_xpath('comment_html', '*//*[@class="actual-comment"]')
            il.add_xpath('comment', '*//*[@class="actual-comment"]/text()')
            il.add_value('is_child', "1" if cls == 'comment child' else "0")
            yield il.load_item()
def parse(self, response):
    """Scrape the post listing and yield one product item per post.

    Walks every child div of ``div.posts``; each such div contributes a
    ``time/@datetime`` value that is attached as ``date`` to every ``li``
    post found one level below it.
    """
    # (field, xpath, extra keyword arguments) in the order they are loaded.
    field_specs = (
        # vote_count
        ("vote_count", '*//*[@class="vote-count"]/text()', {}),
        ("postid", '*//div[@class="upvote"]/@data-vote-id', {}),
        # user info
        ("user_name", '*//div[@class="user-hover-card"]/h3/text()',
         {"re": r'\s*(.*)\s*'}),
        ("userid", '*//div[@class="user-hover-card"]/a/@href', {}),
        ("user_title", '*//h4[@class="user-headline"]/text()', {}),
        ("user_icon", '*//div[@class="user-hover-card"]/a/img/@src', {}),
        # product info
        ("name", '*//a[@class="post-url title"]/text()', {}),
        ("url", '*//a[@class="post-url title"]/@href', {}),
        ("description",
         '*//*[@class="post-tagline description"]/text()', {}),
        ("comment_url", 'div/@data-href', {}),
        ("comment_count", '*//*[@class="comment-count"]/text()', {}),
    )

    page = Selector(response)
    for day_block in page.xpath('//div[@class="posts"]/div'):
        day_posts = day_block.xpath('*/li')
        day_date = day_block.xpath('time/@datetime').extract()
        for post in day_posts:
            loader = ProductItemLoader(response=response, selector=post)
            for field, path, extra in field_specs:
                loader.add_xpath(field, path, **extra)
            loader.add_value("date", day_date)
            yield loader.load_item()
def parse(self, response):
    """Scrape a post page into one product item and its comment items.

    Yields nothing when ``response.url`` is the site's front page URL.
    Otherwise yields one product item (only when ``self.max_posts`` is
    positive), followed by one comment item for every direct child div of
    a ``comment-thread`` div whose class is exactly ``comment`` or
    ``comment child``.
    """
    if response.url == 'http://www.producthunt.com/':
        return

    sel = Selector(response)
    postid = sel.xpath('//div[@class="post-show"]/@data-id').extract()

    # The product that this comment page belongs to.
    if self.max_posts > 0:
        p = sel.xpath('//div[@class="comments-header"]')
        il = ProductItemLoader(response=response, selector=p)
        il.add_xpath('vote_count', '*//*[@class="vote-count"]/text()')
        il.add_value("postid", postid)
        il.add_xpath("name", '*//a[@class="post-url"]/text()')
        il.add_xpath("url", '*//a[@class="post-url"]/@href')
        il.add_xpath("description", '*//span[@class="post-tagline"]/text()')
        il.add_xpath("comment_url", '*//a[@class="permalink"]/@href')
        il.add_xpath("comment_count", '//h2[@class="subhead"]/text()')
        il.add_xpath("date", '*//a[@class="permalink"]/text()')
        il.add_xpath("userid", '*//div[@class="post-user"]/a/@href')
        yield il.load_item()

    # Comments for this product, grouped by thread.
    for t in sel.xpath('//div[@class="comment-thread"]'):
        parentid = t.xpath('@data-parent-id').extract()
        for c in t.xpath('div'):
            classes = c.xpath('@class').extract()
            # A div without a class attribute cannot be a comment; skip it
            # instead of failing on classes[0].
            if not classes:
                continue
            cls = classes[0]
            if cls not in ('comment', 'comment child'):
                continue

            il = CommentItemLoader(response=response, selector=c)
            il.add_xpath('commentid', '@data-comment-id')
            il.add_value('parentid', parentid)
            il.add_value('postid', postid)
            il.add_xpath('userid',
                         '*//*[@class="comment-user-handle"]/text()')
            il.add_xpath('user_title',
                         '*//*[@class="comment-user-headline"]/text()')
            il.add_xpath('user_name',
                         '*//*[@class="comment-user-name"]/a/text()')
            il.add_xpath('user_icon',
                         '*//*[@class="user-image-link-post"]/img/@src')
            il.add_xpath('vote_count', '*//*[@class="vote-count"]/text()')
            il.add_xpath('comment_html', '*//*[@class="actual-comment"]')
            il.add_xpath('comment', '*//*[@class="actual-comment"]/text()')
            il.add_value('is_child', "1" if cls == 'comment child' else "0")
            yield il.load_item()
def parse(self, response):
    """Scrape the post listing and yield one product item per post.

    Walks every child div of ``div.posts``; each such div contributes a
    ``time/@datetime`` value that is attached as ``date`` to every ``li``
    post found one level below it.
    """
    page = Selector(response)

    for day_block in page.xpath('//div[@class="posts"]/div'):
        day_posts = day_block.xpath('*/li')
        day_date = day_block.xpath('time/@datetime').extract()

        for post in day_posts:
            loader = ProductItemLoader(response=response, selector=post)

            # Voting data.
            loader.add_xpath("vote_count",
                             '*//*[@class="vote-count"]/text()')
            loader.add_xpath("postid",
                             'div[@class="upvote"]/@data-vote-id')

            # Details of the user shown in the hover card.
            loader.add_xpath(
                "user_name",
                '*//div[@class="user-hover-card"]/h3/text()',
                re=r'\s*(.*)\s*',
            )
            loader.add_xpath("userid",
                             '*//div[@class="user-hover-card"]/a/@href')
            loader.add_xpath("user_title",
                             '*//h4[@class="user-headline"]/text()')
            loader.add_xpath("user_icon",
                             '*//div[@class="user-hover-card"]/a/img/@src')

            # Details of the product itself.
            loader.add_xpath("name",
                             '*/a[@class="post-url title"]/text()')
            loader.add_xpath("url",
                             '*/a[@class="post-url title"]/@href')
            loader.add_xpath(
                "description",
                '*/span[@class="post-tagline description"]/text()',
            )
            loader.add_xpath("comment_url", 'a/@href')
            loader.add_xpath("comment_count",
                             '*/p[@class="comment-count"]/text()')
            loader.add_value("date", day_date)

            yield loader.load_item()