예제 #1
 def parse_details(self, response):
     """Build one ``ArticlesItem`` from an article detail page.

     Reads the title, summary, body paragraphs and tags from ``response``
     with CSS selectors and yields the populated item.  A missing title or
     summary is stored as an empty string.
     """
     item = ArticlesItem()
     # extract_first() returns None when nothing matches; defaulting to ""
     # keeps the trailing strip() from raising AttributeError.
     item["title"] = response.css("h1.sna_content_heading::text").extract_first(default="").strip()
     item["article_summary"] = response.css("span.article-summary::text").extract_first(default="").strip()
     item["article_content"] = [
         text.strip() for text in response.css("div.article-body p::text").extract()
     ]
     item["tags"] = [
         tag.strip() for tag in response.css("div.article-tags h2.tags::text").extract()
     ]
     yield item
예제 #2
    def parse_abstract_page(self, response):
        """Yield an ``ArticlesItem`` for a paper's abstract page.

        Title, authors, year, volume and PDF URL are taken from
        ``response.meta``; only the abstract is scraped from the page.
        ``Journal_Conference`` is always set to ``"JMLR"``.
        """
        # First look for an element carrying class="abstract".
        # NOTE(review): the original selector was cut off in this file; the
        # XPath below is reconstructed from the comment above -- confirm it
        # against the real page markup.
        abstract = response.xpath('//*[@class="abstract"]/text()').extract_first()
        if abstract is None:
            # Fall back to the text nodes directly under the #content div.
            fragments = response.xpath('//div[@id="content"]/text()').extract()
            # Remove surrounding whitespace from every fragment.
            fragments = [fragment.strip() for fragment in fragments]
            # Keep only fragments longer than one character, joined by '.'.
            abstract = '.'.join(
                fragment for fragment in fragments if len(fragment) > 1
            ).strip()

        # Create the ArticlesItem and fill in its fields.
        item = ArticlesItem()
        item['Title'] = response.meta['title']
        item['Authors'] = response.meta['authors']
        item['Year'] = response.meta['year']
        item['Volume'] = response.meta['volume']
        item['Pdf_url'] = response.meta['pdf_url']
        item['Abstract'] = abstract
        item['Journal_Conference'] = "JMLR"

        yield item
 def parse_details(self, response):
     """Yield an ``ArticlesItem`` holding the page's concatenated paragraph text.

     All ``p`` text nodes under ``div#readspeaker_maincontent`` are joined
     into one string with newlines removed.  The tag is a fixed literal.
     """
     article = ArticlesItem()
     fragments = response.css("div#readspeaker_maincontent p::text").extract()
     # join() builds the text in one pass instead of repeated += copies.
     article["article_content"] = "".join(fragments).replace("\n", "")
     article["tags"] = "صحة"
     yield article
예제 #4
	def parse(self,response):
		"""Return one ``ArticlesItem`` per article-title block on the page.

		Each item's ``article_title`` is the list of link texts found inside
		a ``div`` with class ``c-article-title``.
		"""
		hxs = HtmlXPathSelector(response)
		title_nodes = hxs.select("//div[@class='c-article-title']")
		items = []
		# Use a distinct loop name so the node list is not shadowed.
		for title_node in title_nodes:
			item = ArticlesItem()
			item["article_title"] = title_node.select("a/text()").extract()
			# Without this append the method always returned an empty list.
			items.append(item)
		return items
예제 #5
    def parse(self, response):
        """Yield an ``ArticlesItem`` whose ``file_urls`` lists the response URL."""
        # Disabled debug output kept for reference:
        # if response.status == 200:
        #     print("download URL parsed successfully")
        # else:
        #     print("failed to parse download URL")

        download = ArticlesItem()
        download['file_urls'] = [response.url]
        yield download
예제 #6
	def parse_article_page(self,response):
		"""Yield an ``ArticlesItem`` describing one paper page.

		Title, authors, abstract and PDF link are scraped with XPath; the
		year comes from ``response.meta['year']``.  ``Volume`` is always
		``'-'`` and ``Journal_Conference`` is always ``"NIPS"``.  A missing
		abstract or PDF link is stored as an empty string.
		"""
		item = ArticlesItem()
		item['Title'] = response.xpath('//h2[@class="subtitle"]/text()').extract_first()
		item['Authors'] = ','.join(response.xpath('//li[@class="author"]/a/text()').extract())
		item['Year'] = response.meta['year']
		item['Volume'] = '-'
		# extract_first() returns None when the node is absent; default to ''
		# so strip() cannot raise AttributeError.
		item['Abstract'] = response.xpath('//p[@class="abstract"]/text()').extract_first(default='').strip()
		# Only prefix the site root when a link was found (str + None raises).
		pdf_href = response.xpath('//div[@class="main wrapper clearfix"]/a/@href').extract_first()
		item['Pdf_url'] = ('https://papers.nips.cc' + pdf_href) if pdf_href else ''
		item['Journal_Conference'] = "NIPS"
		yield item
 def parse_details(self, response):
     """Yield an ``ArticlesItem`` when the page has enough body text.

     Paragraph text nodes longer than 20 characters are concatenated and
     stripped of non-breaking spaces.  The item is yielded only if the
     resulting content is itself longer than 20 characters.  The tag is a
     fixed literal.
     """
     article = ArticlesItem()
     article["title"] = response.css("h1.dmi-title::text").extract_first()
     long_fragments = [
         text
         for text in response.css("div.dmi-entry-content p::text").extract()
         if len(text) > 20
     ]
     # join() builds the text in one pass instead of repeated += copies.
     article["article_content"] = "".join(long_fragments).replace("\xa0", "")
     article["tags"] = "صحة"
     if len(article["article_content"]) > 20:
         yield article
예제 #8
 def parse_details(self, response):
     """Yield an ``ArticlesItem`` when the page has body text and a tag.

     Paragraph text nodes longer than 20 characters are concatenated and
     stripped of non-breaking spaces; tags are the stripped link texts
     under ``div.ft-ptags``.  The item is yielded only if the content is
     longer than 20 characters and at least one tag was found.
     """
     article = ArticlesItem()
     article["title"] = response.css("h1.ft-ptitle::text").extract_first()
     long_fragments = [
         text
         for text in response.css("section.ft-entry p::text").extract()
         if len(text) > 20
     ]
     # join() builds the text in one pass instead of repeated += copies.
     article["article_content"] = "".join(long_fragments).replace("\xa0", "")
     # The closing bracket of this comprehension was missing in the file.
     article["tags"] = [
         tag.strip() for tag in response.css("div.ft-ptags a::text").extract()
     ]
     if len(article["article_content"]) > 20 and len(article["tags"]) >= 1:
         yield article
예제 #9
    def parse_details(self, response):
        """Yield an ``ArticlesItem`` when the page has content and two or more tags.

        ``article_content`` is a list with one cleaned string per non-empty
        paragraph under ``div.entry-content``: newlines become spaces and
        non-breaking spaces are removed.
        """
        article = ArticlesItem()
        # Default to "" so strip() is safe when the heading is absent.
        article["title"] = response.css("h1.post-title.entry-title::text").extract_first(default="").strip()

        # NOTE(review): the original loop never filled its source list, so
        # the content was always empty and nothing was ever yielded.
        # Collecting each paragraph's text nodes here is a reconstruction --
        # confirm the intended per-paragraph extraction.
        paragraphs = [
            "".join(paragraph.css("*::text").extract())
            for paragraph in response.css("div.entry-content p")
        ]
        cleaned = (
            text.replace("\n", " ").replace("\xa0", "") for text in paragraphs
        )
        # Drop paragraphs that are empty after cleaning.
        article["article_content"] = [text for text in cleaned if text]
        article["tags"] = response.css("span.tagcloud a::text").extract()
        if len(article["tags"]) >= 2 and article["article_content"]:
            yield article
예제 #10
 def parse_details(self, response):
     """Yield an ``ArticlesItem`` when the page has content and more than one tag.

     ``article_content`` is a list with one cleaned string per non-empty
     paragraph (first and last ``p`` children are excluded by the selector):
     newlines become spaces and non-breaking spaces are removed.  The final
     remaining entry is dropped as well, because it is the reference line.
     """
     article = ArticlesItem()
     article["title"] = response.css("div.article h1.heading::text").extract_first()

     # NOTE(review): the original loop never filled its source list, so
     # ``del final_output[-1]`` raised IndexError on the first paragraph.
     # Collecting each paragraph's text nodes here is a reconstruction --
     # confirm the intended per-paragraph extraction.
     paragraphs = [
         "".join(paragraph.css("*::text").extract())
         for paragraph in response.css(
             "div.text.js-text.js-mediator-article p:not(:first-child):not(:last-child)"
         )
     ]
     cleaned = (
         text.replace("\n", " ").replace("\xa0", "") for text in paragraphs
     )
     non_empty = [text for text in cleaned if text]
     # Drop the trailing reference entry; slicing is safe on an empty list.
     article["article_content"] = non_empty[:-1]
     article["tags"] = response.css("div.news-tags.news-tags_article a::text").extract()
     if article["article_content"] and len(article["tags"]) > 1:
         yield article
예제 #11
    def parse(self, response):
        """Build ``ArticlesItem`` objects from the entries of a manual list page.

        Iterates every ``dl.manual-item-standard`` under ``.manual-list`` and
        fills name, label, cover URL, author, description and tags, then
        returns ``items``.
        """

        items = []
        sites = response.css('.manual-list')
        for dl in sites.css('dl.manual-item-standard'):
            book = dl.css('.recommend-book')
            item = ArticlesItem()
            # extract_first() returns None when '.name' is absent, in which
            # case strip() raises AttributeError.
            item['name'] = dl.css('.name::text').extract_first().strip()
            # NOTE(review): the next two statements are cut off mid-call in
            # this file (unbalanced parentheses); their missing arguments must
            # be restored before this method can run.
            item['label'] = book.css('::attr(href)').extract_first().split(
            item['cover_url'] = book.css(
            # Fixed values applied to every item.
            item['author'] = 'bookstack'
            item['description'] = ''
            item['tags'] = []
            # NOTE(review): no visible line appends ``item`` to ``items``, so
            # as shown the method returns an empty list -- confirm against
            # the full source.

        return items
예제 #12
    def parse_details(self, response):
        """Yield an ``ArticlesItem`` when the page has content and more than one tag.

        Paragraph texts under ``div#articleBody`` are passed through
        ``self.clean_articles`` and falsy results are dropped.
        """
        article = ArticlesItem()
        # Default to "" so strip() is safe when the heading is absent.
        article["title"] = response.css(
            "div.articleHeader h1::text").extract_first(default="").strip()

        # NOTE(review): the original loop never filled its source list, so
        # clean_articles was never called and nothing was ever yielded.
        # Collecting each paragraph's text nodes here is a reconstruction --
        # confirm the intended per-paragraph extraction.
        paragraphs = [
            "".join(paragraph.css("*::text").extract())
            for paragraph in response.css("div#articleBody p")
        ]
        # As in the original, clean_articles is only called when there is
        # something to clean.  Presumably it takes and returns a list of
        # strings -- verify against its definition.
        cleaned = self.clean_articles(paragraphs) if paragraphs else []
        article["article_content"] = list(filter(None, cleaned))
        article["tags"] = response.css("div.tags h3  a::text").extract()

        if article["article_content"] and len(article["tags"]) > 1:
            yield article
예제 #13
    def parse_details(self, response):
        """Yield an ``ArticlesItem`` when it has at least one tag and some content.

        NOTE(review): several statements in this method are cut off in this
        file (see the inline notes), so it does not parse as shown.
        """
        var = ArticlesItem()
        list_content = []
        final_output = []
        # NOTE(review): the title selector is truncated here (the call's
        # arguments and closing parenthesis are missing).
        var["title"] = response.css(
        for i in response.css(
                "div.clearfix.wysiwyg._2A-9LYJ7eK p:nth-child(n+2)"):
            # NOTE(review): nothing visible ever appends to ``list_content``,
            # so this comprehension yields an empty list as shown; note that
            # its ``i`` also shadows the loop variable.
            final_output = [i.replace("\n", " ") for i in list_content]

        var["article_content"] = final_output
        # NOTE(review): this comprehension has lost its element expression
        # and closing bracket.
        var["tags"] = [
            for i in response.css("ul.AsCeVPiOdE li a::text").extract()
        if len(var["tags"]) >= 1 and var[
                "article_content"]:  # do not save any article that has neither tag nor content!
            yield var
예제 #14
	def parse(self, response):
		"""Extract the author of a crawled page and return it in an ``ArticlesItem``.

		A domain-specific rule from ``self.domain_router`` is tried first,
		then a fixed, ordered list of generic rules; the first generic rule
		that matches wins.  If no generic rule matches, the value found by
		the domain rule (or ``''``) is kept.

		Returns an ``ArticlesItem`` with ``url`` and ``author`` set.
		"""
		soup = BeautifulSoup(response.body)
		# Single-argument form prints the same text under Python 2 and 3.
		print('crawled %s' % response.url)

		# Start from a defined value so ``author`` is never unbound below.
		author = ''
		for domain, attrs in self.domain_router.items():
			if domain in response.url:
				# The stored rule is JSON, e.g. {"href": "/users/.+"}; each
				# value is compiled so find_all() matches by regex.
				attrs = json.loads(attrs)
				attrs = dict((key, re.compile(value)) for key, value in attrs.items())
				try:
					author = soup.find_all(attrs=attrs)[0].text.strip()
				except (IndexError, AttributeError):
					# No element matched the domain rule.
					author = ''
				# Only the first matching domain is consulted.
				break

		print('crawled %s' % response.url)

		# Ordered generic rules: (rule number, lookup, strip the text?).
		# Rule 12 keeps its text unstripped, as in the original chain.
		rules = (
			(1, lambda: soup.select('.author_name'), True),
			(2, lambda: soup.select('.author'), True),
			(3, lambda: soup.select('.author-link'), True),
			(4, lambda: soup.select('.byline__author'), True),
			(5, lambda: soup.find_all(href=re.compile("/users/.+")), True),
			(6, lambda: soup.select('.cat_desc'), True),
			(7, lambda: soup.select('.stat-author'), True),
			(12, lambda: soup.select('.js-authors-list'), False),
			(8, lambda: soup.find_all(rel="author"), True),
			(9, lambda: soup.find_all(href=re.compile("/user/.+")), True),
			(10, lambda: soup.select('.fn'), True),
		)
		for _rule, find, strip in rules:
			# Run each lookup once instead of once to test and once to read.
			matches = find()
			if matches:
				text = matches[0].text
				author = text.strip() if strip else text
				break
		# With no match (formerly "rule 11") ``author`` keeps the value set
		# above.  Previously a missing ``else:`` made the '.fn' rule
		# overwrite its own result with ''.

		item = ArticlesItem()
		item['url'] = response.url
		item['author'] = author
		return item