def parse_page(self, response):
    # Parse a 4chan catalog page: the post data lives in a JSON blob
    # embedded in an inline <script>, not in the rendered HTML.
    hxs = scrapy.Selector(response)
    log.debug(hxs)
    scripts = hxs.xpath('//script[@type="text/javascript"]')
    for script in scripts:
        js = " ".join(script.xpath('./text()').extract())
        matches = js_re.match(js)
        if matches:
            json_data = json.loads(matches.group(2))
            for post_id, post_data in json_data.iteritems():
                # Unescape entities, strip HTML tags, then drop ">>12345"
                # quote links from the comment body.
                body = re.sub(
                    r">>\d+", "",
                    html_util.strip_tags(parser.unescape(post_data['com'])))
                #log.debug(post_data)
                # A nonzero 'resto' marks a reply; link into the parent
                # thread with a post anchor.
                if 'resto' in post_data and int(post_data['resto']) != 0:
                    url = u"http://boards.4chan.org/{}/thread/{}#p{}".format(
                        post_data['board'], post_data['resto'],
                        post_data['no'])
                else:
                    url = u"http://boards.4chan.org/{}/thread/{}".format(
                        post_data['board'], post_data['no'])
                yield items.PostItem(
                    site_id=sites.FOURCHAN,
                    points=int(post_data['replies'])
                    if post_data['replies'] else 0,
                    site_post_id=post_id,
                    body=body,
                    sn=post_data['name'],
                    url=url)
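# The 4chan parser above depends on a module-level `js_re` that is not shown
# in this section. Given the call to matches.group(2), it must capture the
# JSON blob the catalog page assigns to a JavaScript variable. A minimal
# sketch, assuming the page uses "var catalog = {...};" (the variable name
# and the grouping are guesses, not the original pattern):
js_re = re.compile(r"\s*(var\s+catalog\s*=\s*)(\{.*\})\s*;", re.DOTALL)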
def parse_page(self, response):
    # Parse Tumblr's discovery endpoint: the response is JSON whose
    # 'posts' entries are HTML fragments, each parsed individually below.
    data = json.loads(response.body)
    discovery_posts = data['response']['DiscoveryPosts']
    posts = discovery_posts['posts']
    for post in posts:
        try:
            hxs = scrapy.Selector(text=post)
            entry = hxs.xpath('//article[@data-id]')
            if not entry:
                continue
            post_id = entry.xpath('./@data-id').extract()[0].strip()
            #log.debug(post_id)
            author = entry.xpath(
                './@data-tumblelog-name').extract()[0].strip()
            #log.debug(author)
            header = entry.xpath('.//header')
            href = util.get_url_from_node(response,
                                          header.xpath('./div/a/@href'))
            #log.debug(href)
            tags = entry.xpath(
                './/section[@class="post_tags"]/div/a[@class="post_tag"]/@data-tag'
            ).extract()
            #log.debug(tags)
            body = entry.xpath(
                './/div[@class="post_body"]//text()').extract()
            #log.debug(body)
            body += tags
            # Join the text fragments first; unescape() expects a string,
            # not a list.
            body_text = parser.unescape(" ".join(body))
            #log.debug(body_text)
            notes_str = entry.xpath(
                './/div[@class="post_notes_inner"]//span[@class="note_link_current"]/@data-count'
            ).extract()[0].strip()
            votes = self.parse_notes(notes_str)
            #log.debug(votes)
            yield items.PostItem(site_id=sites.TUMBLR,
                                 points=votes,
                                 site_post_id=post_id,
                                 body=body_text,
                                 sn=author,
                                 url=href)
        except Exception:
            log.exception(u"Failed")
            log.debug(post)
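# `self.parse_notes` converts the note-count string into an int but is not
# shown in this section. A minimal sketch, assuming the @data-count value is
# a plain or comma-grouped number (the body is an assumption, not the
# original helper):
def parse_notes(self, notes_str):
    digits = notes_str.replace(",", "").strip()
    return int(digits) if digits.isdigit() else 0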
def parse_page(self, response):
    # Parse a YouTube channel/shelf listing page.
    hxs = scrapy.Selector(response)
    log.debug(hxs)
    entries = hxs.xpath(
        '//li[contains(@class,"expanded-shelf-content-item-wrapper")]')
    log.debug(entries)
    for thing in entries:
        log.debug(thing)
        item_node = thing.xpath(
            './div[contains(@class, "expanded-shelf-content-item")]')
        post_id = item_node.xpath(
            './div[contains(@class, "yt-lockup-video")]/@data-context-item-id'
        ).extract()[0].strip()
        title_link = item_node.xpath(
            './/div[contains(@class, "yt-lockup-content")]/h3[contains(@class, "yt-lockup-title")]/a'
        )
        title = title_link.xpath('./text()').extract()[0].strip()
        href = util.get_url_from_node(response, title_link.xpath('./@href'))
        body = " ".join(
            item_node.xpath(
                './/div[contains(@class, "yt-lockup-content")]/div[contains(@class, "yt-lockup-description")]//text()'
            ).extract()).strip()
        author = item_node.xpath(
            './/div[contains(@class, "yt-lockup-content")]/div[contains(@class, "yt-lockup-byline")]/a/text()'
        ).extract()[0].strip()
        views = 0
        try:
            # The view count is the second <li> in the meta-info list.
            views_str = item_node.xpath(
                './/div[contains(@class, "yt-lockup-content")]//ul[contains(@class, "yt-lockup-meta-info")]/li/text()'
            ).extract()[1].strip()
            matches = views_re.match(views_str)
            views = int(matches.group(1).replace(",", ""))
        except Exception:
            log.exception(u"Exception parsing {}".format(thing))
        yield items.PostItem(site_id=sites.YOUTUBE,
                             points=views,
                             site_post_id=post_id,
                             body=body,
                             sn=author,
                             url=href)
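# `views_re` is a module-level pattern not shown in this section. It has to
# expose the numeric view count as group(1) of strings like "1,234 views".
# A plausible sketch (the exact pattern is an assumption):
views_re = re.compile(r"([\d,]+)\s*views?")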
def parse_page(self, response): hxs = scrapy.Selector(response) entries = hxs.xpath('//div[contains(@class,"postItem")]') for thing in entries: post_id = thing.xpath('./@data-post-id').extract()[0].strip() entry = thing.xpath( './/div[contains(@class, "postArticle-content")]') title_node = entry.xpath('.//h3') title = " ".join(title_node.xpath('.//text()').extract()).strip() href = util.get_url_from_node( response, thing.xpath( './/article[contains(@class, "postArticle")]/a/@href')) body_node = entry.xpath( './/div[contains(@class, "section-inner")]/p') body = title if body_node: body = " ".join(body_node.xpath('.//text()').extract()).strip() header = thing.xpath( './/div[contains(@class, "postMeta-previewHeader")]') author = header.xpath('.//a[@data-action="show-user-card"]/text()' ).extract()[0].strip() votes = 0 try: votes_str = thing.xpath( './/button[@data-action="show-recommends"]/text()' ).extract()[0].strip().replace(",", "") votes = self.parse_votes(votes_str) except: log.exception(u"Exception parsing {}".format(thing)) yield items.PostItem(site_id=sites.MEDIUM, points=votes, site_post_id=post_id, body=body, sn=author, url=href)
def parse_page(self, response):
    # Parse a reddit listing page; unscored posts simply keep votes = 0.
    num = response.meta['num']
    hxs = selector.HtmlXPathSelector(response)
    entries = hxs.xpath('//div[contains(@class,"thing")]')
    for thing in entries:
        post_id = thing.xpath('./@data-fullname').extract()[0].strip()
        entry = thing.xpath('./div[@class="entry unvoted"]')
        href = util.get_url_from_node(
            response, entry.xpath('./p[@class="title"]/a/@href'))
        title = entry.xpath(
            './p[@class="title"]/a/text()').extract()[0].strip()
        author = entry.xpath(
            './p[@class="tagline"]/a/text()').extract()[0].strip()
        votes = 0
        try:
            votes_str = thing.xpath(
                './div[@class="midcol unvoted"]/div[@class="score unvoted"]/text()'
            ).extract()[0].strip()
            #log.debug(u"Votes string {}".format(votes_str))
            votes = int(votes_str)
        except Exception:
            pass  #log.exception(u"Exception parsing {}".format(thing))
        yield items.PostItem(site_id=sites.REDDIT,
                             points=votes,
                             site_post_id=post_id,
                             body=title,
                             sn=author,
                             url=href)
    # Follow the "next" link for up to ten pages.
    if num < 10:
        next_link = hxs.xpath('//a[@rel="nofollow next"]/@href')
        if next_link:
            yield http.Request(util.get_url_from_node(response, next_link),
                               meta={'type': 'page', 'num': num + 1})
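# Every spider here resolves hrefs through util.get_url_from_node, which is
# not shown in this section. A minimal sketch, assuming it takes a selector
# holding an @href and joins it against the response URL (the real helper
# may differ):
import urlparse

def get_url_from_node(response, node):
    hrefs = node.extract()
    if not hrefs:
        return None
    return urlparse.urljoin(response.url, hrefs[0].strip())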
def parse_page(self, response):
    # Parse a Hacker News listing page. Story titles and their metadata
    # live in parallel td.title / td.subtext cells.
    num = response.meta['num']
    hxs = selector.HtmlXPathSelector(response)
    links = hxs.xpath('//td[@class="title"]/a')
    subtext = hxs.xpath('//td[@class="subtext"]')
    # The final title link is the "More" pager, not a story, so trim the
    # trailing pair.
    for link, td in zip(links, subtext)[:-1]:
        sn = td.xpath('./a/text()')[0].extract()
        points_text = td.xpath('./span/text()')[0].extract()
        items_href = td.xpath('./a/@href')[1].extract()
        m = points_re.match(points_text)
        points = int(m.group(1))
        m = post_id_re.match(items_href)
        post_id = int(m.group(1))
        post = link.xpath('./text()')[0].extract()
        content_link = util.get_url_from_node(response, link.xpath('./@href'))
        yield items.PostItem(site_id=sites.HN,
                             points=points,
                             site_post_id=post_id,
                             body=post,
                             sn=sn,
                             url=content_link)
    # Follow the trailing "More" link for up to five pages.
    if num < 5:
        more_link = links[-1]
        more_url = util.get_url_from_node(response,
                                          more_link.xpath('./@href'))
        yield http.Request(more_url, meta={'type': 'page', 'num': num + 1})
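# `points_re` and `post_id_re` are module-level patterns not shown in this
# section. From the calls above, points_re pulls the number out of strings
# like "123 points" and post_id_re pulls the id out of hrefs like
# "item?id=8863". Plausible sketches (both patterns are assumptions):
points_re = re.compile(r"(\d+)\s*points?")
post_id_re = re.compile(r"item\?id=(\d+)")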