def parse(self, response):  # edit this only
    url = response.meta.get('url')
    title = response.css("h1.entry-title::text").get()
    time = response.css("time::text").get()
    content = response.css("div.td-post-content p ::text").getall()
    content = ' '.join(content)
    loader = ItemLoader(item=FashionLadyItem(), selector=response)
    loader.add_value("url", url)
    loader.add_value("title", title)
    loader.add_value("time", time)  # was loader._add_value(...): use the public API
    loader.add_value("content", content)
    os.remove(response.meta.get('temp_file'))  # clean up the temp file passed in via meta
    return loader.load_item()
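# Hypothetical sketch of the item the loader above expects. Only the field
# names are grounded in the add_value calls; the actual class lives in the
# project's items.py, which is not shown here.
import scrapy

class FashionLadyItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    time = scrapy.Field()
    content = scrapy.Field()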
def parse(self, response):  # edit this only
    url = response.meta.get('url')
    title = response.css("h1.bm-content-header__title::text").get()
    category = response.css("h2.bm-content-header__subtitle::text").get()
    # content = response.css("div.bm-article-body p.bm-article-body__copy::text").getall()
    # content2 = response.css("div.bm-article-body p::text").getall()
    # content.extend(content2)
    # string() concatenates all text inside the article body into one string
    content = response.xpath(
        "string(//div[@class='bm-article-body'])").getall()
    content = ' '.join(content)
    content = content.replace("SHARE", "", 1)  # drop the share-widget label
    loader = ItemLoader(item=LorealParisItem(), selector=response)
    loader.add_value("url", url)
    loader.add_value("title", title)
    loader.add_value("category", category)  # was loader._add_value(...): use the public API
    loader.add_value("content", content)
    os.remove(response.meta.get('temp_file'))  # clean up the temp file passed in via meta
    return loader.load_item()
def parse(self, response):
    l = ItemLoader(item=QuotesSpiderItem(), response=response)
    h1_tag = response.xpath('//h1/a/text()').extract_first()
    tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()
    l.add_value('h1_tag', h1_tag)  # was l._add_value(...): use the public API
    l.add_value('tags', tags)
    return l.load_item()
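# An equivalent sketch that lets the loader run the XPaths itself via
# add_xpath instead of extracting values first and re-adding them; it assumes
# the same QuotesSpiderItem fields (h1_tag, tags) used above.
from scrapy.loader import ItemLoader

def parse(self, response):
    l = ItemLoader(item=QuotesSpiderItem(), response=response)
    l.add_xpath('h1_tag', '//h1/a/text()')
    l.add_xpath('tags', '//*[@class="tag-item"]/a/text()')
    return l.load_item()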
def parse_detail(self, response): title = "".join(response.css(".ArticleHeader_headline::text").extract()) content = "".join(response.css('.StandardArticleBody_body > p::text').extract()) if title and content: loader = ItemLoader(item=NewsContext(), response=response) loader._add_value("url", response.url) loader._add_value("title", title) loader._add_value("content", content) loader.add_value("date", int(time.time())) loader.add_value("domain", self.task_domain) return loader.load_item()
def parse_detail(self, response):
    title = response.css(".story-body h1::text").extract()
    content = "".join(
        response.css('div[property=articleBody] p::text').extract())
    if title and content:
        loader = ItemLoader(item=NewsContext(), response=response)
        loader.add_value("url", response.url)  # was loader._add_value(...): use the public API
        loader.add_value("title", title)
        loader.add_value("content", content)
        loader.add_value("date", int(time.time()))  # crawl timestamp, not the article date
        loader.add_value("domain", self.task_domain)
        return loader.load_item()
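# Hypothetical sketch of the NewsContext item both parse_detail variants above
# load into. The field names come from the loader calls; everything else
# (module location, processors) is an assumption.
import scrapy

class NewsContext(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    date = scrapy.Field()    # crawl timestamp, int(time.time()) above
    domain = scrapy.Field()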
def parse_lot(self, response):
    l = ItemLoader(item=LarsenDelpetersonItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_xpath('LotNum', '//h1/text()')
    l.add_xpath(
        'LotDescription',
        '//h2[contains(text(), "Item Details:")]/following-sibling::p[1]/text()[1]'
    )
    address = response.xpath(
        '//b[contains(text(), "Item Location:")]/following-sibling::text()[1]'
    ).extract_first()
    # "City, State ZIP" -> city plus the combined "State ZIP" fragment; the same
    # fragment is loaded into both State and ZIP, so the item's processors are
    # expected to pick out the right part of it.
    city, region = address.split(',')
    l.add_value('City', city)    # was l._add_value(...): use the public API
    l.add_value('State', region)
    l.add_value('ZIP', region)
    l.add_xpath(
        'Contact',
        '//b[contains(text(), "Equipment Contact:")]/following-sibling::text()[1]'
    )
    l.add_xpath(
        'Phone',
        '//b[contains(text(), "Phone Number:")]/following-sibling::text()[1]'
    )
    l.add_xpath(
        'Category',
        '//strong[contains(text(), "Category:")]/following-sibling::text()[1]'
    )
    l.add_xpath(
        'ClosesOn',
        '//strong[contains(text(), "Closes On")]/following-sibling::text()[1]'
    )
    l.add_xpath('image_urls', '//div[@id="gallery"]//a/@href')
    l.add_value('folder_name', self.auction_id)
    yield l.load_item()
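# Hypothetical sketch of how LarsenDelpetersonItem could split the combined
# "State ZIP" fragment that parse_lot feeds into both 'State' and 'ZIP'. The
# real item definition is not shown in the source; the processor choices and
# the import path (itemloaders on recent Scrapy, scrapy.loader.processors on
# older releases) are assumptions.
import scrapy
from itemloaders.processors import MapCompose, TakeFirst

def first_token(value):
    # " IA 52601" -> "IA"
    parts = value.split()
    return parts[0] if parts else None

def last_token(value):
    # " IA 52601" -> "52601"
    parts = value.split()
    return parts[-1] if len(parts) > 1 else None

class LarsenDelpetersonItem(scrapy.Item):
    State = scrapy.Field(input_processor=MapCompose(first_token),
                         output_processor=TakeFirst())
    ZIP = scrapy.Field(input_processor=MapCompose(last_token),
                       output_processor=TakeFirst())
    # LotNum, LotDescription, City, Contact, Phone, Category, ClosesOn,
    # image_urls and folder_name are omitted from this sketch.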
def parse_reply(self, response):
    '''
    parse reply to comments, root comment is added if flag
    '''
    # from scrapy.utils.response import open_in_browser
    # open_in_browser(response)

    if response.meta['flag'] == 'init':
        # parse root comment
        for root in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=root)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            profile_img = ""
            # source_url = str(response.xpath(".//h3/a/@href").get())
            # index1 = source_url.find("/profile.php?id=")
            # if index1 != -1:
            #     index2 = source_url.find("&fref=nf&refid=18&__tn__=")
            #     if index2 == -1:
            #         index2 = source_url.find("&refid=18&__tn__=")
            #     source_url = source_url[index1+16:index2]
            #     profile_img = "https://graph.facebook.com/{}/picture?type=large".format(source_url)
            # else:
            #     index2 = source_url.find("?fref=nf&refid=18&__tn__=-R")
            #     source_url = source_url[1:index2]
            #     profile_img = "https://avatars.io/facebook/{}".format(source_url)
            # new.add_value('source_url', source_url)
            new.add_value('profile_img', profile_img)  # was new._add_value(...): use the public API
            new.add_value('reply_to', 'ROOT')
            new.add_xpath('text', './/div[1]//text()')
            # new.add_xpath('date', './/abbr/text()')
            date_string = root.xpath('.//abbr/text()').get()  # was response.xpath(...): read the date from this comment, not the whole page
            date = parse_date2([date_string], {'lang': self.lang})
            new.add_value('date', date)
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            # new.add_value('url', response.url)
            yield new.load_item()

        # parse all replies in the page
        for reply in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            profile_img = ""
            # (same disabled profile_img / source_url extraction as in the root loop above)
            new.add_value('profile_img', profile_img)
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            # new.add_xpath('date', './/abbr/text()')
            date_string = reply.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            new.add_value('date', date)
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            # new.add_value('url', response.url)
            yield new.load_item()

        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            self.logger.info('Back found, more nested comments')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': response.meta['reply_to'],
                                     'flag': 'back',
                                     'url': response.meta['url'],
                                     'index': response.meta['index'],
                                     'group': response.meta['group']
                                 })
        else:
            next_reply = response.meta['url']
            self.logger.info(
                'Nested comments crawl finished, heading to proper page: {}'.format(
                    response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_post,
                                 meta={
                                     'index': response.meta['index'] + 1,
                                     'group': response.meta['group']
                                 })

    elif response.meta['flag'] == 'back':
        # parse all comments
        for reply in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            profile_img = ""
            # (same disabled profile_img / source_url extraction as in the 'init' branch)
            new.add_value('profile_img', profile_img)
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            # new.add_xpath('date', './/abbr/text()')
            date_string = reply.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            new.add_value('date', date)
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            # new.add_value('url', response.url)
            yield new.load_item()

        # keep going backwards
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            self.logger.info('Back found, more nested comments')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': response.meta['reply_to'],
                                     'flag': 'back',
                                     'url': response.meta['url'],
                                     'index': response.meta['index'],
                                     'group': response.meta['group']
                                 })
        else:
            next_reply = response.meta['url']
            self.logger.info(
                'Nested comments crawl finished, heading to home page: {}'.format(
                    response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_post,
                                 meta={
                                     'index': response.meta['index'] + 1,
                                     'group': response.meta['group']
                                 })

# =============================================================================
# CRAWL REACTIONS
# =============================================================================
#    def parse_reactions(self, response):
#        new = ItemLoader(item=CommentsItem(), response=response, parent=response.meta['item'])
#        new.context['lang'] = self.lang
#        new.add_xpath('likes', "//a[contains(@href,'reaction_type=1')]/span/text()")
#        new.add_xpath('ahah', "//a[contains(@href,'reaction_type=4')]/span/text()")
#        new.add_xpath('love', "//a[contains(@href,'reaction_type=2')]/span/text()")
#        new.add_xpath('wow', "//a[contains(@href,'reaction_type=3')]/span/text()")
#        new.add_xpath('sigh', "//a[contains(@href,'reaction_type=7')]/span/text()")
#        new.add_xpath('grrr', "//a[contains(@href,'reaction_type=8')]/span/text()")
#        yield new.load_item()
#
#    # substitute the plain `yield new.load_item()` in the loops above with the
#    # following block (response --> reply/root):
#    reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
#    reactions = response.urljoin(reactions[0].extract())
#    if reactions:
#        yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item': new})
#    else:
#        yield new.load_item()
def parse_post(self, response):
    '''
    parse post does multiple things:
        1) loads replied-to-comments page one-by-one (for DFS)
        2) call parse_reply on the nested comments
        3) adds simple (not-replied-to) comments
        4) follows to new comment page
    '''
    # load replied-to comments pages
    # select nested comment one-by-one matching with the index: response.meta['index']
    path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(
        response.meta['index']) + ']'
    group_flag = response.meta['group'] if 'group' in response.meta else None

    for reply in response.xpath(path):
        source = reply.xpath('.//h3/a/text()').extract()
        answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
        ans = response.urljoin(answer[::-1][0])  # follow the last reply link
        self.logger.info('{} nested comment'.format(str(response.meta['index'])))
        yield scrapy.Request(ans,
                             callback=self.parse_reply,
                             priority=1000,
                             meta={
                                 'reply_to': source,
                                 'url': response.url,
                                 'index': response.meta['index'],
                                 'flag': 'init',
                                 'group': group_flag
                             })

    # load regular comments
    if not response.xpath(path):  # prevents from exec
        path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for i, reply in enumerate(response.xpath(path2)):
            self.logger.info('{} regular comment'.format(i + 1))
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            profile_img = ""
            # (same disabled profile_img / source_url extraction as in parse_reply)
            new.add_value('profile_img', profile_img)  # was new._add_value(...): use the public API
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('img', './/div[h3]/div[2]/img/@src')
            # new.add_xpath('date', './/abbr/text()')
            date_string = reply.xpath('.//abbr/text()').get()  # was response.xpath(...): read the date from this comment, not the whole page
            date = parse_date2([date_string], {'lang': self.lang})
            new.add_value('date', date)
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            # new.add_value('url', response.url)
            yield new.load_item()

    # new comment page
    if not response.xpath(path):
        # for groups
        next_xpath = './/div[contains(@id,"see_next")]'
        prev_xpath = './/div[contains(@id,"see_prev")]'
        if not response.xpath(next_xpath) or group_flag == 1:
            for next_page in response.xpath(prev_xpath):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info('New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_post,
                                     meta={
                                         'index': 1,
                                         'group': 1
                                     })
        else:
            for next_page in response.xpath(next_xpath):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info('New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_post,
                                     meta={
                                         'index': 1,
                                         'group': group_flag
                                     })
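# Hypothetical entry point showing how parse_post above is presumably first
# scheduled: it only requires meta['index'] to start at 1 ('group' is optional
# and defaults to None inside parse_post). The method name and URL handling
# are placeholders, not part of the original spider.
def start_comment_crawl(self, post_url):
    yield scrapy.Request(post_url,
                         callback=self.parse_post,
                         meta={'index': 1})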