def parse_details(self, response):
    # Extract the headline, summary, body paragraphs and tags of one article.
    item = ArticlesItem()
    item["title"] = response.css("h1.sna_content_heading::text").extract_first().strip()
    item["article_summary"] = response.css("span.article-summary::text").extract_first().strip()
    item["article_content"] = [i.strip() for i in response.css("div.article-body p::text").extract()]
    item["tags"] = [i.strip() for i in response.css("div.article-tags h2.tags::text").extract()]
    yield item
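# A minimal sketch of how ArticlesItem might be declared in the project's
# items.py so the fields used above exist; the exact field set is an
# assumption inferred from this snippet, not taken from the original project.
import scrapy

class ArticlesItem(scrapy.Item):
    title = scrapy.Field()
    article_summary = scrapy.Field()
    article_content = scrapy.Field()
    tags = scrapy.Field()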
def parse_abstract_page(self, response):
    # first try the abstract marked with @class="abstract"
    abstract = response.xpath('//p[@class="abstract"]/text()').extract_first()
    if abstract is None:
        # fall back to the raw text of the content div
        abstract = response.xpath('//div[@id="content"]/text()').extract()
        # remove surrounding whitespace
        abstract = list(map(lambda x: x.strip(), abstract))
        # keep only fragments longer than one character and join them
        abstract = '.'.join([item for item in abstract if len(item) > 1])
    else:
        abstract = abstract.strip()
    # create an ArticlesItem and initialize its attributes
    item = ArticlesItem()
    item['Title'] = response.meta['title']
    item['Authors'] = response.meta['authors']
    item['Year'] = response.meta['year']
    item['Volume'] = response.meta['volume']
    item['Pdf_url'] = response.meta['pdf_url']
    item['Abstract'] = abstract
    item['Journal_Conference'] = "JMLR"
    yield item
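# A hypothetical sketch of the request that feeds parse_abstract_page(): the
# listing-page callback is assumed to pass the bibliographic fields through
# Request.meta under the keys read above. extract_listing() is a made-up
# helper standing in for whatever selectors the real listing callback uses.
import scrapy

def parse_volume(self, response):
    for paper_url, title, authors, year, volume, pdf_url in self.extract_listing(response):
        yield scrapy.Request(
            paper_url,
            callback=self.parse_abstract_page,
            meta={
                'title': title,
                'authors': authors,
                'year': year,
                'volume': volume,
                'pdf_url': pdf_url,
            },
        )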
def parse_details(self, response):
    article = ArticlesItem()
    my_article = ""
    # concatenate the text of all paragraphs in the main content block
    for i in response.css("div#readspeaker_maincontent p::text").extract():
        my_article += i
    article["article_content"] = my_article.replace("\n", "")
    # every article from this source gets the fixed tag "صحة" ("health")
    article["tags"] = "صحة"
    yield article
def parse(self, response):
    # HtmlXPathSelector/.select() is the pre-1.0 Scrapy API; response.xpath()
    # is the current equivalent and avoids the deprecated import.
    article_titles = response.xpath("//div[@class='c-article-title']")
    items = []
    for title_div in article_titles:
        item = ArticlesItem()
        item["article_title"] = title_div.xpath("a/text()").extract()
        items.append(item)
    return items
def parse(self, response):
    # if response.status == 200:
    #     print("Download URL parsed successfully")
    # else:
    #     print("Failed to parse the download URL")
    item = ArticlesItem()
    url = response.url
    # file_urls is the field consumed by Scrapy's FilesPipeline
    item['file_urls'] = [url]
    yield item
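# For item['file_urls'] above to actually trigger a download, Scrapy's
# FilesPipeline has to be enabled in settings.py; a minimal sketch
# (the storage path is a placeholder):
ITEM_PIPELINES = {
    "scrapy.pipelines.files.FilesPipeline": 1,
}
FILES_STORE = "/path/to/downloaded/files"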
def parse_article_page(self, response):
    # create object ArticlesItem
    item = ArticlesItem()
    item['Title'] = response.xpath('//h2[@class="subtitle"]/text()').extract_first()
    item['Authors'] = ','.join(response.xpath('//li[@class="author"]/a/text()').extract())
    item['Year'] = response.meta['year']
    item['Volume'] = '-'
    item['Abstract'] = response.xpath('//p[@class="abstract"]/text()').extract_first().strip()
    item['Pdf_url'] = 'https://papers.nips.cc' + response.xpath('//div[@class="main wrapper clearfix"]/a/@href').extract_first()
    item['Journal_Conference'] = "NIPS"
    yield item
def parse_details(self, response):
    article = ArticlesItem()
    my_article = ""
    article["title"] = response.css("h1.dmi-title::text").extract_first()
    # keep only paragraphs longer than 20 characters
    for i in response.css("div.dmi-entry-content p::text").extract():
        if len(i) > 20:
            my_article += i
    article["article_content"] = my_article.replace("\xa0", "")
    # every article from this source gets the fixed tag "صحة" ("health")
    article["tags"] = "صحة"
    # skip near-empty articles
    if len(article["article_content"]) > 20:
        yield article
def parse_details(self, response):
    article = ArticlesItem()
    my_article = ""
    article["title"] = response.css("h1.ft-ptitle::text").extract_first()
    # keep only paragraphs longer than 20 characters
    for i in response.css("section.ft-entry p::text").extract():
        if len(i) > 20:
            my_article += i
    article["article_content"] = my_article.replace("\xa0", "")
    article["tags"] = [
        i.strip() for i in response.css("div.ft-ptags a::text").extract()
    ]
    # yield only articles that have both content and at least one tag
    if len(article["article_content"]) > 20 and len(article["tags"]) >= 1:
        yield article
def parse_details(self, response):
    article = ArticlesItem()
    list_content = []
    article["title"] = response.css("h1.post-title.entry-title::text").extract_first().strip()
    for i in response.css("div.entry-content p"):
        list_content.append("".join(i.xpath('descendant-or-self::text()').extract()))
    clear_line_list = [i.replace("\n", " ") for i in list_content]
    clear_space_list = [i.replace("\xa0", "") for i in clear_line_list]
    final_output = list(filter(None, clear_space_list))
    article["article_content"] = final_output
    article["tags"] = response.css("span.tagcloud a::text").extract()
    if len(article["tags"]) >= 2 and article["article_content"]:
        yield article
def parse_details(self, response):
    article = ArticlesItem()
    list_content = []
    article["title"] = response.css("div.article h1.heading::text").extract_first()
    # skip paragraphs that are the first or last child of the container
    for i in response.css("div.text.js-text.js-mediator-article p:not(:first-child):not(:last-child)"):
        list_content.append("".join(i.xpath('descendant-or-self::text()').extract()))
    clear_line_list = [i.replace("\n", " ") for i in list_content]
    clear_space_list = [i.replace("\xa0", "") for i in clear_line_list]
    final_output = list(filter(None, clear_space_list))
    if final_output:
        del final_output[-1]  # drop the trailing reference paragraph
    article["article_content"] = final_output
    article["tags"] = response.css("div.news-tags.news-tags_article a::text").extract()
    if article["article_content"] and len(article["tags"]) > 1:
        yield article
def parse(self, response):
    items = []
    sites = response.css('.manual-list')
    for dl in sites.css('dl.manual-item-standard'):
        book = dl.css('.recommend-book')
        item = ArticlesItem()
        item['name'] = dl.css('.name::text').extract_first().strip()
        item['label'] = book.css('::attr(href)').extract_first().split('/')[-1]
        item['cover_url'] = book.css('img::attr(src)').extract_first().strip()
        item['author'] = 'bookstack'
        item['description'] = ''
        item['tags'] = []
        items.append(item)
    return items
def parse_details(self, response):
    article = ArticlesItem()
    list_content = []
    article["title"] = response.css("div.articleHeader h1::text").extract_first().strip()
    for i in response.css("div#articleBody p"):
        list_content.append("".join(i.xpath('descendant-or-self::text()').extract()))
    # normalise the collected paragraphs, then drop empty entries
    clear_line_list = self.clean_articles(list_content)
    final_output = list(filter(None, clear_line_list))
    article["article_content"] = final_output
    article["tags"] = response.css("div.tags h3 a::text").extract()
    if article["article_content"] and len(article["tags"]) > 1:
        yield article
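# clean_articles() is not shown in this snippet. A hypothetical sketch of what
# it might do, assuming it mirrors the whitespace clean-up used by the other
# parse_details() methods in this collection (strip newlines and non-breaking
# spaces):
def clean_articles(self, paragraphs):
    cleaned = []
    for text in paragraphs:
        # normalise line breaks and remove non-breaking spaces
        cleaned.append(text.replace("\n", " ").replace("\xa0", "").strip())
    return cleaned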
def parse_details(self, response):
    var = ArticlesItem()
    list_content = []
    var["title"] = response.css("h1._2JPm2UuC56::text").extract_first().strip()
    # take paragraphs from the second child onward
    for i in response.css("div.clearfix.wysiwyg._2A-9LYJ7eK p:nth-child(n+2)"):
        list_content.append("".join(i.xpath('descendant-or-self::text()').extract()))
    final_output = [i.replace("\n", " ") for i in list_content]
    var["article_content"] = final_output
    var["tags"] = [
        i.strip() for i in response.css("ul.AsCeVPiOdE li a::text").extract()
    ]
    # do not save articles that are missing tags or content
    if len(var["tags"]) >= 1 and var["article_content"]:
        yield var
def parse(self, response):
    # requires: from bs4 import BeautifulSoup; import json; import re
    soup = BeautifulSoup(response.body)
    print('crawled', response.url)
    for domain, attrs in self.domain_router.items():
        if domain in response.url:
            attrs = json.loads(attrs)  # e.g. attrs={"href": "/users/.+"}
            for key, value in attrs.items():
                attrs[key] = re.compile(value)  # e.g. attrs={"class": re(object)}
            try:
                author = soup.find_all(attrs=attrs)[0].text.strip()
            except Exception:
                author = ''
            finally:
                break
    else:
        # no configured domain matched this URL
        author = ''
    '''
    print 'crawled', response.url
    if soup.select('.author_name'):
        author = soup.select('.author_name')[0].text.strip()  # class='author_name'
        rule = 1
    elif soup.select('.author'):
        author = soup.select('.author')[0].text.strip()
        rule = 2
    elif soup.select('.author-link'):
        author = soup.select('.author-link')[0].text.strip()
        rule = 3
    elif soup.select('.byline__author'):
        author = soup.select('.byline__author')[0].text.strip()
        rule = 4
    elif soup.find_all(href=re.compile("/users/.+")):
        author = soup.find_all(href=re.compile("/users/.+"))[0].text.strip()
        rule = 5
    elif soup.select('.cat_desc'):
        author = soup.select('.cat_desc')[0].text.strip()
        rule = 6
    elif soup.select('.stat-author'):
        author = soup.select('.stat-author')[0].text.strip()
        rule = 7
    elif soup.select('.js-authors-list'):
        author = soup.select('.js-authors-list')[0].text
        rule = 12
    elif soup.find_all(rel="author"):
        author = soup.find_all(rel="author")[0].text.strip()
        rule = 8
    elif soup.find_all(href=re.compile("/user/.+")):
        author = soup.find_all(href=re.compile("/user/.+"))[0].text.strip()
        rule = 9
    elif soup.select('.fn'):
        author = soup.select('.fn')[0].text.strip()
        rule = 10
    else:
        author = ''
        rule = 11
        # error_articles.appends('{}, '.format(response.url))
    '''
    item = ArticlesItem()
    item['url'] = response.url
    item['author'] = author
    return item
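# A hypothetical sketch of the domain_router mapping consumed above: each value
# is assumed to be a JSON string mapping an HTML attribute to a regex, as the
# inline comment (attrs={"href": "/users/.+"}) suggests. The domains and
# patterns here are illustrative only.
domain_router = {
    "example-forum.com": '{"href": "/users/.+"}',
    "example-news.com": '{"class": "author-link"}',
}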