Ejemplo n.º 1
0
    def parse(self, response):
        """Yield a NewsItem for an article page.

        Pages whose ``og:type`` meta tag is missing or does not contain
        "article" yield nothing. The timestamp and ``og:title`` lookups index
        the first match directly, so a page lacking either raises IndexError.
        A missing ``og:description`` is stored as an empty string.
        """
        # Named og_type so the builtin ``type`` is not shadowed.
        og_type = response.xpath('//meta[@property="og:type"]//@content').extract_first()
        if og_type is None or "article" not in og_type:
            return

        item = NewsItem()
        item['url'] = response.url

        # ``parse`` below resolves to the module-level date parser (called
        # with fuzzy=True), not to this method.
        timestamp_markup = response.xpath(
            '//*[@id="article-feed"]/article[1]//span[@class="timestamp"]').extract()[0]
        item['date'] = parse(timestamp_markup, fuzzy=True).strftime("%Y-%m-%dT%H:%M:%S")

        # Joining an empty result list gives '', so no IndexError guard is
        # needed around this lookup.
        item['author'] = " ".join(
            response.xpath('//*[@id="article-feed"]/article[1]//div[@class="author"]//text()')
            .extract()).strip()

        item['title'] = response.xpath('//meta[@property="og:title"]//@content').extract()[0].strip()

        # extract_first() returns None when nothing matches; default to '' so
        # .rstrip() cannot fail with AttributeError.
        item['description'] = response.xpath(
            '//meta[@property="og:description"]//@content').extract_first(default='').rstrip()

        item['content'] = remove_unicode(' '.join(response.xpath(
            '//*[@id="article-feed"]/article[1]//*[@class="article-body"]//*[@itemprop="articleBody"]//text()').extract()).rstrip())

        yield item
Ejemplo n.º 2
0
 def parse_item_page(self, response):
     """Yield a NewsItem assembled from the page's meta tags and body.

     Title, publication time and description are read from the first
     matching <meta> tag; each of those lookups raises IndexError when the
     tag is absent.
     """
     title = remove_unicode(
         response.xpath('//meta[@property="og:title"]/@content').extract()[0].strip())

     # The first and last text nodes of the author span are dropped.
     author_nodes = response.xpath('//span[@class="author"]//text()').extract()
     author = " ".join(author_nodes[1:-1]).strip()

     published_raw = response.xpath(
         '//meta[@property="article:published_time"]/@content').extract()[0].strip()
     date = parse(published_raw, fuzzy=True).strftime("%Y-%m-%dT%H:%M:%S")

     description = remove_unicode(
         response.xpath('//meta[@property="og:description"]/@content').extract()[0].strip())

     content = self._get_content(response)

     yield NewsItem(
         title=title,
         author=author,
         date=date,
         description=description,
         content=content,
         url=response.url,
     )
Ejemplo n.º 3
0
	def parse_item(self, response):
		"""Parse an article page into a NewsItem.

		Returns the populated item when both a title and body paragraphs are
		found. Returns None when either is missing or when extraction raises;
		in the latter case the exception is logged instead of being silently
		discarded.
		"""
		super(NextBigWhatSpider, self).parse_item(response)
		htmlparser = etree.HTMLParser()
		tree = etree.parse(BytesIO(response.body), htmlparser)

		news_item = NewsItem()

		try:
			title = tree.xpath(".//header[contains(@class, 'entry-header')]/h1/text()")
			details = tree.xpath('.//div[contains(@class, "herald-entry-content")]/p/text()')

			if title and details:
				news_item['source'] = self.name
				news_item['crawled_date'] = datetime.now()
				# Drop the query string from the stored URL.
				news_item['source_url'] = response.url.split('?')[0]

				news_item['title'] = title[0].strip().encode('ascii','ignore')
				# Whitespace-only text nodes are skipped.
				news_item['details'] = "\t".join([item.strip().encode('ascii','ignore') for item in details if item.strip()])

				img_urls = tree.xpath('.//div[contains(@class, "herald-post-thumbnail herald-post-thumbnail-single")]/span/img/@src')
				if img_urls:
					news_item['img_urls'] = get_stripped_list(img_urls)

				meta_result = self.get_meta(tree)

				if 'description' in meta_result:
					news_item['blurb'] = meta_result['description']

				return news_item

		except Exception as e:
			# Previously a bare ``except: pass``; log so failures are visible,
			# and let KeyboardInterrupt/SystemExit propagate.
			self.log('==Exception=================>>>>>>>>! %r' % e)
		return None
Ejemplo n.º 4
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated item when both a headline and body text are
        found. Returns None when either is missing or when extraction raises
        (the exception is logged).
        """
        super(FinancialExpressSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        # Read the header once with .get(): a request without a Referer then
        # gives None instead of raising before the try block is entered.
        referer = response.request.headers.get('Referer')
        self.log('==RESPONSE=================>>>>>>>>! %s' % referer)

        news_item = NewsItem()
        try:
            title = tree.xpath(".//meta[@itemprop='headline']/@content")
            details = tree.xpath(".//div[@itemprop='articleBody']//p//text()")

            if title and details:
                news_item['source'] = self.name
                # Drop the query string from the stored URL.
                news_item['source_url'] = response.url.split('?')[0]
                news_item['crawled_date'] = datetime.now()
                news_item['title'] = title[0].strip().encode('ascii', 'ignore')
                news_item['details'] = "\t".join(
                    [ele.strip().encode('ascii', 'ignore') for ele in details])

                img_urls = tree.xpath(
                    ".//div[@itemprop='articleBody']//img[contains(@class,'size-full')]/@src"
                )

                if img_urls:
                    news_item['img_urls'] = get_stripped_list(img_urls)
                    news_item['cover_image'] = img_urls[0]

                meta_result = self.get_meta(tree)

                # og:image, when present, overrides the in-body cover image.
                if 'og:image' in meta_result:
                    news_item['cover_image'] = meta_result['og:image']

                if 'og:description' in meta_result:
                    news_item['blurb'] = meta_result['og:description'].strip().encode(
                        'ascii', 'ignore')

                if 'og:updated_time' in meta_result:
                    # Cut the "+HH:MM" offset so the fixed format matches.
                    news_item['published_date'] = datetime.strptime(
                        meta_result['og:updated_time'].split("+")[0],
                        '%Y-%m-%dT%H:%M:%S')

                authors = tree.xpath(".//meta[@itemprop='author']/@content")
                if authors:
                    news_item['author'] = get_stripped_list(authors)

                # Tag the item with the category/sub-category whose URL list
                # contains the referring page.
                for item in categories:
                    subcategories = item['subcategory']
                    if referer in sum(subcategories.values(), []):
                        news_item['category'] = item['category']
                        key = next(name
                                   for name, urls in subcategories.items()
                                   if referer in urls)
                        news_item['sub_categories'] = [key]
                return news_item

        except Exception as e:
            self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 5
0
	def parse_item(self, response):
		"""Save the raw page to disk once, then parse it into a NewsItem.

		The body is written to a file named after the MD5 of the URL inside
		the directory returned by ``pre_write_check``. If that file already
		exists the page is skipped and None is returned. None is also the
		result when title or body text is missing, or when extraction raises
		(the exception is logged).
		"""
		filedir = self.pre_write_check()
		filename = os.path.join(filedir, md5(response.url).hexdigest())
		if os.path.exists(filename):
			print("skipped file {0}".format(filename))
			return None
		with open(filename, "wb") as html:
			html.write(response.body)

		parser = etree.HTMLParser()
		tree = etree.parse(BytesIO(response.body), parser)

		news_item = NewsItem()
		try:
			title = tree.xpath(".//h1[contains(@class,\"post-tile entry-title\")]/text()")
			details = tree.xpath('//div[contains(@class,"entry-content")]/p//text()')
			if not (title and details):
				return None

			news_item['source'] = self.name
			news_item['crawled_date'] = datetime.now()
			# Drop the query string from the stored URL.
			news_item['source_url'] = response.url.split('?')[0]
			news_item['title'] = title[0].strip().encode('ascii','ignore')
			news_item['details'] = "\t".join(
				[chunk.strip().encode('ascii','ignore') for chunk in details])

			img_urls = tree.xpath('.//div[contains(@class,\'feature-img\')]/img/@src')
			if img_urls:
				news_item['img_urls'] = get_stripped_list(img_urls)
				news_item['cover_image'] = img_urls[0]

			blurb = tree.xpath('.//div[@class=\'entry-content\']/p/em/text()')
			if blurb:
				news_item['blurb'] = blurb[0].strip().encode('ascii','ignore')

			tags = tree.xpath('.//div[contains(@class,\'mom-post-meta single-post-meta\')]/span[3]/a//text()')
			if tags:
				news_item['tags'] = tags

			published_date = tree.xpath('.//span//time[contains(@class,\'updated\')]//text()')
			if published_date:
				date_text = " ".join(
					[part.strip().encode('ascii','ignore') for part in published_date])
				news_item['published_date'] = datetime.strptime(date_text, '%B %d, %Y')

			author = tree.xpath('.//span[contains(@class,\'fn\')]/a/text()')
			if author:
				news_item['author'] = author

			# Tag the item with the category/sub-category whose URL list
			# contains the referring page.
			referer = response.request.headers['Referer']
			for entry in categories:
				subcategories = entry['subcategory']
				if referer in sum(subcategories.values(), []):
					news_item['category'] = entry['category']
					matched = next(name for name, urls in subcategories.items() if referer in urls)
					news_item['sub_categories'] = [matched]
			return news_item

		except Exception as e:
			self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 6
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated item when both a title and body text are found;
        returns None when either is missing or when extraction raises (the
        exception is logged).
        """
        super(MoneycontrolSpider, self).parse_item(response)
        parser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), parser)

        news_item = NewsItem()
        try:
            title = tree.xpath('.//h1[contains(@class, "arti_title")]/text()')
            details = tree.xpath(
                './/div[contains(@class, "MT20")]//p//text()[not(ancestor::script)]')
            if not (title and details):
                return None

            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            # Drop the query string from the stored URL.
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            # Whitespace-only text nodes are skipped.
            news_item['details'] = '\t'.join([
                chunk.strip().encode('ascii', 'ignore').decode('unicode_escape')
                for chunk in details
                if chunk.strip()
            ])

            img_urls = tree.xpath('.//table[contains(@class,"MR15")]//div/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]

            tags = tree.xpath('.//div[contains(@class, "tag_wrap MT20")]/a//text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)

            published_date = tree.xpath(
                './/p[contains(@class, "arttidate MT15")]//text()')
            if published_date:
                # split('|')[0] yields the whole string when no '|' is
                # present, so a single expression covers both cases.
                stamp = published_date[0].split('|')[0].strip().encode(
                    'ascii', 'ignore')
                news_item['published_date'] = datetime.strptime(
                    stamp, '%b %d, %Y, %I.%M %p')

            # Tag the item with the category/sub-category whose URL list
            # contains the referring page.
            referer = response.request.headers['Referer']
            for entry in categories:
                subcategories = entry['subcategory']
                if referer in sum(subcategories.values(), []):
                    news_item['category'] = entry['category']
                    matched = next(name
                                   for name, urls in subcategories.items()
                                   if referer in urls)
                    news_item['sub_categories'] = [matched]
            return news_item

        except Exception as e:
            self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 7
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated item when both a headline and body text are
        found; returns None when either is missing or when extraction raises
        (the exception is logged).
        """
        super(EntrepreneurSpider, self).parse_item(response)
        parser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), parser)

        news_item = NewsItem()
        try:
            title = tree.xpath(".//h1[contains(@class,\'headline\')]//text()")
            details = tree.xpath(
                './/div[contains(@class,\'bodycopy\')]//p//text()')
            if not (title and details):
                return None

            news_item['source'] = self.name
            # Drop the query string from the stored URL.
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join(
                [chunk.strip().encode('ascii', 'ignore') for chunk in details])

            img_urls = tree.xpath(
                './/div[contains(@class,\'hero topimage\')]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
                news_item['cover_image'] = img_urls[0]

            # The blurb is the first direct paragraph text node of the body,
            # or an empty string when there is none.
            lead_texts = tree.xpath(
                './/div[contains(@class,\'bodycopy\')]/p/text()')
            news_item['blurb'] = " ".join(
                [text.strip().encode('ascii', 'ignore') for text in lead_texts[:1]])

            published_date = tree.xpath(
                './/time[contains(@itemprop,\'datePublished\')]//text()')
            if published_date:
                stamp = published_date[0].strip()
                news_item['published_date'] = datetime.strptime(stamp, '%B %d, %Y')

            tags = tree.xpath(
                './/div[contains(@class,\'article-tags\')]/a/text()')
            if tags:
                news_item['tags'] = get_stripped_list(tags)

            author = tree.xpath(
                './/div[contains(@itemprop,\'name\')]/text()')
            if author:
                news_item['author'] = get_stripped_list(author)

            # Tag the item with the category/sub-category whose URL list
            # contains the referring page.
            referer = response.request.headers['Referer']
            for entry in categories:
                subcategories = entry['subcategory']
                if referer in sum(subcategories.values(), []):
                    news_item['category'] = entry['category']
                    matched = next(name
                                   for name, urls in subcategories.items()
                                   if referer in urls)
                    news_item['sub_categories'] = [matched]
            return news_item
        except Exception as e:
            self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 8
0
    def parse_item(self, response):
        """Save the raw page to disk, then parse it into a NewsItem.

        The body is written to a file named after the MD5 of the URL inside
        the directory returned by ``pre_write_check``. Returns the populated
        item when both a title and body text are found. Returns None when
        either is missing or when extraction raises; in the latter case the
        exception is logged instead of being silently discarded.
        """
        filedir = self.pre_write_check()
        filename = os.path.join(filedir, md5(response.url).hexdigest())
        with open(filename, "wb") as html:
            html.write(response.body)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:
            title = tree.xpath(
                ".//div[contains(@class,\"large-12 columns article-title\")]//h1//text()"
            )
            details = tree.xpath('//html/body/div/div/article/div//p//text()')
            if title and details:
                news_item['title'] = title[0].strip().encode('ascii', 'ignore')
                details = [
                    ele.strip().encode('ascii', 'ignore') for ele in details
                ]
                news_item['details'] = "\t".join(details)

                img_urls = tree.xpath(
                    './/img[contains(@class,\"article-hero-img\")]/@src')

                if img_urls:
                    news_item['img_urls'] = get_stripped_list(img_urls)

                # NOTE(review): this query looks for an <img> nested inside
                # an <img>, and stores the whole result list rather than a
                # single URL - confirm that is intended.
                cover_image = tree.xpath(
                    './/img[contains(@class,\"article-hero-img\")]//img/@src')
                if cover_image:
                    news_item['cover_image'] = cover_image

                news_item['source'] = self.name
                news_item['crawled_date'] = datetime.now()
                # Drop the query string from the stored URL.
                news_item['source_url'] = response.url.split('?')[0]
                # Author, published_date and tags are stored as the raw
                # XPath result lists (possibly empty).
                author = tree.xpath(
                    ".//div[contains(@class,\"author\")]/h4/a/text()")
                news_item['author'] = author
                published_date = tree.xpath(
                    ".//div[contains(@class,\"datetime\")]/h2/span/text()")
                news_item['published_date'] = published_date
                news_item['tags'] = tree.xpath(
                    ".//div[contains(@class,\"tags\")]//a/text()")
                meta_result = self.get_meta(tree)

                # og:image, when present, overrides any cover image set above.
                if 'og:image' in meta_result:
                    news_item['cover_image'] = meta_result['og:image']

                if 'og:description' in meta_result:
                    news_item['blurb'] = meta_result['og:description'].strip().encode(
                        'ascii', 'ignore')

                return news_item
        except Exception as e:
            # Previously a bare ``except: pass``; log so failures are visible,
            # and let KeyboardInterrupt/SystemExit propagate.
            self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
Ejemplo n.º 9
0
	def parse_item(self, response):
		"""Save the raw page to disk, then parse it into a NewsItem.

		The body is written to a file named after the MD5 of the URL inside
		the directory returned by ``pre_write_check``. Returns the populated
		item when both a title and body text are found; returns None when
		either is missing or when extraction raises (the exception is logged).
		"""
		filedir = self.pre_write_check()
		filename = os.path.join(filedir, md5(response.url).hexdigest())
		with open(filename, "wb") as html:
			html.write(response.body)

		parser = etree.HTMLParser()
		tree = etree.parse(BytesIO(response.body), parser)

		news_item = NewsItem()
		try:
			title = tree.xpath('.//h1[contains(@class,"arti_heading")]/text()')
			details = tree.xpath('.//div[@id=\'arti_content_n\']//p/text()')
			if not (title and details):
				return None

			news_item['title'] = title[0].strip().encode('ascii','ignore')
			# Whitespace-only text nodes are skipped.
			paragraphs = [x.strip().encode('ascii','ignore') for x in details if x.strip()]
			news_item['details'] = "\t".join(paragraphs).strip()
			news_item['source'] = self.name
			news_item['crawled_date'] = datetime.now()
			news_item['source_url'] = response.url

			img_urls = tree.xpath('.//div[@id=\'arti_content_n\']/p/strong/img/@src')
			if img_urls:
				news_item['img_urls'] = get_stripped_list(img_urls)
				news_item['cover_image'] = img_urls[0]

			tags = tree.xpath('.//div[contains(@id, "tags_div")]//a/text()')
			if tags:
				news_item['tags'] = get_stripped_list(tags)

			# NOTE(review): both branches hand get_stripped_list a single
			# joined string rather than a list - confirm that is intended.
			# The plain-text byline, when present, overrides the linked one.
			author = tree.xpath('.//span[contains(@class, "grey1")]/a/text()')
			authorname = tree.xpath('.//span[contains(@class, "grey1")]/text()')
			if author:
				linked_byline = " ".join(
					[x.strip().encode('ascii','ignore') for x in author]).strip()
				news_item['author'] = get_stripped_list(linked_byline)

			if authorname:
				plain_byline = " ".join(
					[x.strip().encode('ascii','ignore') for x in authorname]).strip()
				news_item['author'] = get_stripped_list(plain_byline)

			published_date = tree.xpath('.//div[contains(@class, "sm1 grey1")]/text()')
			if published_date:
				pub_date = published_date[0]
				# Only text containing "IST" is trimmed and ASCII-encoded;
				# otherwise the raw text is parsed as-is.
				if 'IST' in pub_date:
					stamp = pub_date.split('IST')[0].strip().encode('ascii','ignore')
				else:
					stamp = pub_date
				news_item['published_date'] = datetime.strptime(stamp, '%B %d, %Y %H:%M')

			# Tag the item with the category/sub-category whose URL list
			# contains the referring page.
			referer = response.request.headers['Referer']
			for entry in categories:
				subcategories = entry['subcategory']
				if referer in sum(subcategories.values(), []):
					news_item['category'] = entry['category']
					matched = next(name for name, urls in subcategories.items() if referer in urls)
					news_item['sub_categories'] = [matched]
			return news_item
		except Exception as e:
			self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 10
0
    def parse(self, response):
        """Yield a NewsItem for an English-language article page.

        Nothing is yielded when the ``og:locale`` meta tag is missing or does
        not contain "en", or when ``og:type`` is missing or does not contain
        "article". The date falls back to '' when the timestamp attribute is
        absent or not numeric, the author falls back to 'BBC News', and a
        missing ``og:description`` is stored as ''.
        """
        lang = response.xpath(
            '//*[@id="responsive-news"]//meta[@property="og:locale"]//@content'
        ).extract_first()

        # Named og_type so the builtin ``type`` is not shadowed.
        og_type = response.xpath(
            '//*[@id="responsive-news"]//meta[@property="og:type"]//@content'
        ).extract_first()

        if lang is None or "en" not in lang:
            return
        # A missing og:type used to raise TypeError on the ``in`` test.
        if og_type is None or "article" not in og_type:
            return

        item = NewsItem()
        item['url'] = response.url

        try:
            seconds = response.xpath(
                '//div[@class="story-body"]//div[contains(@class,"date date--v2")]//@data-seconds'
            ).extract_first()
            item['date'] = datetime.utcfromtimestamp(float(seconds)) \
                .strftime("%Y-%m-%dT%H:%M:%S")
        except (TypeError, ValueError):
            # TypeError: attribute missing (None); ValueError: not a number.
            item['date'] = ''

        # The article:author meta tag is not used: it returns
        # https://www.facebook.com/bbcnews rather than a name.
        _author = response.xpath(
            '//*//span[@class="byline__name"]//text()').extract_first()
        if _author is None:
            item['author'] = 'BBC News'
        else:
            # str.split always returns at least one element, so indexing [0]
            # is safe without an IndexError guard.
            _author_split = _author.split(" ")
            if _author_split[0] == "By":
                _author = " ".join(_author_split[1:])
            item['author'] = _author + " | BBC News"

        item['title'] = response.xpath(
            '//*[@id="responsive-news"]//meta[@property="og:title"]//@content'
        ).extract_first().strip()

        # extract_first() returns None when nothing matches; default to '' so
        # .rstrip() cannot fail with AttributeError.
        item['description'] = response.xpath(
            '//*[@id="responsive-news"]//meta[@property="og:description"]//@content'
        ).extract_first(default='').rstrip()

        item['content'] = remove_unicode(' '.join(
            response.xpath(
                '//div[@class="story-body"]//div[@property="articleBody"]//p//text()'
            ).extract()).rstrip())

        yield item
Ejemplo n.º 11
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated item when both a title and body paragraphs are
        found. Returns None when either is missing or when extraction raises;
        in the latter case the exception is logged instead of being silently
        discarded.
        """
        super(SmallBizTrendsSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:
            title = tree.xpath(".//div[@class='post-inner']/h1/text()")
            details = tree.xpath('.//div[@class=\"entry\"]/p/text()')
            if title and details:
                news_item['source'] = self.name
                news_item['crawled_date'] = datetime.now()
                # Drop the query string from the stored URL.
                news_item['source_url'] = response.url.split('?')[0]
                # Strip non-ASCII characters *before* decoding escapes, as is
                # done for the details below. Decoding first made titles with
                # non-ASCII characters raise and drop the whole item. The
                # final encode keeps the stored value a byte string.
                news_item['title'] = title[0].strip().encode(
                    'ascii', 'ignore').decode('unicode_escape').encode(
                        'ascii', 'ignore')
                # Whitespace-only text nodes are skipped.
                news_item['details'] = '\t'.join([
                    item.strip().encode('ascii',
                                        'ignore').decode('unicode_escape')
                    for item in details if item.strip()
                ])

                # Use the first image query that matches anything; each
                # query is evaluated once.
                image_queries = (
                    './/span[@class=\'full-span-featured-image\']/span/img/@src',
                    './/img[contains(@class,\'size-full\')]/@src',
                    './/img[contains(@class,\'aligncenter\')]/@src',
                )
                for query in image_queries:
                    img_urls = tree.xpath(query)
                    if img_urls:
                        news_item['img_urls'] = img_urls
                        break

                meta_result = self.get_meta(tree)

                if 'description' in meta_result:
                    news_item['blurb'] = meta_result['description']

                published_date = tree.xpath(
                    './/span[contains(@class,\'article-date\')]/text()')
                if published_date:
                    news_item['published_date'] = datetime.strptime(
                        published_date[0], '%b %d, %Y')
                author = tree.xpath(
                    './/span[contains(@itemprop,\'name\')]/a/text()')
                if author:
                    news_item['author'] = author
                return news_item

        except Exception as e:
            # Previously a bare ``except: pass``; log so failures are visible,
            # and let KeyboardInterrupt/SystemExit propagate.
            self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
Ejemplo n.º 12
0
	def parse_item(self, response):
		"""Parse an article page into a NewsItem.

		Returns the populated item when both a title and body text are found;
		returns None when either is missing or when extraction raises (the
		exception is logged).
		"""
		super(HuffingtonPostSpider, self).parse_item(response)
		parser = etree.HTMLParser()
		tree = etree.parse(BytesIO(response.body), parser)

		news_item = NewsItem()
		try:
			title = tree.xpath(".//h1[contains(@class,\"title\")]//text()")
			details =  tree.xpath('.//div[contains(@class,\"content\")]//p//text()')
			if not (title and details):
				return None

			news_item['source'] = self.name
			# Drop the query string from the stored URL.
			news_item['source_url'] = response.url.split('?')[0]
			news_item['crawled_date'] = datetime.now()
			news_item['title'] = title[0].strip().encode('ascii','ignore')
			news_item['details'] = "\t".join(
				[chunk.strip().encode('ascii','ignore') for chunk in details])

			img_urls = tree.xpath('.//div[contains(@class,\"top-media--image image\")]/img/@src')
			if img_urls:
				news_item['img_urls'] = get_stripped_list(img_urls)

			cover_image = tree.xpath('.//span[contains(@class,\"img-caption\")]//img/@src')
			if cover_image:
				news_item['cover_image'] = get_stripped_list(cover_image)[0]

			meta_result = self.get_meta(tree)

			# og:image, when present, overrides the caption image.
			if 'og:image' in meta_result:
				news_item['cover_image'] = meta_result['og:image']

			if 'og:description' in meta_result:
				news_item['blurb'] = meta_result['og:description']
				news_item['blurb'] = news_item['blurb'].strip().encode('ascii','ignore')

			published_date = tree.xpath('.//div[contains(@class,\"timestamp\")]/span/text()')
			if published_date:
				pub_date = published_date[0].strip()
				# Cut everything from "IST" onwards when it is present.
				if 'IST' in pub_date:
					stamp = pub_date.split('IST')[0].strip()
				else:
					stamp = pub_date
				news_item['published_date'] = datetime.strptime(stamp, '%d/%m/%Y %I:%M %p')

			author = tree.xpath('.//a[contains(@class,\"author-card__details__name\")]/text()')
			if author:
				news_item['author'] = author[0].strip().encode('ascii','ignore')

			tags = tree.xpath('.//div[contains(@class,\"tag-cloud\")]/a/text()')
			if tags:
				news_item['tags'] = [tag.strip().encode('ascii','ignore') for tag in tags]

			# Tag the item with the category/sub-category whose URL list
			# contains the referring page.
			referer = response.request.headers['Referer']
			for entry in categories:
				subcategories = entry['subcategory']
				if referer in sum(subcategories.values(), []):
					news_item['category'] = entry['category']
					matched = next(name for name, urls in subcategories.items() if referer in urls)
					news_item['sub_categories'] = [matched]
			return news_item
		except Exception as e:
			self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 13
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated item when both a title and body text are found;
        returns None when either is missing or when extraction raises (the
        exception is logged).
        """
        super(VentureBeatSpider, self).parse_item(response)

        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:
            title = tree.xpath(
                "//h1[contains(@class,\'article-title\')]//text()")
            details = tree.xpath(
                '//div[contains(@class,"article-content")]/p//text()')

            if title and details:
                news_item['source'] = self.name
                news_item['crawled_date'] = datetime.now()
                # Drop the query string from the stored URL.
                news_item['source_url'] = response.url.split('?')[0]
                news_item['title'] = title[0].strip().encode('ascii', 'ignore')
                news_item['details'] = "\t".join(
                    [ele.strip().encode('ascii', 'ignore') for ele in details])

                img_urls = tree.xpath(
                    '//div[contains(@class,"article-content")]//img/@src')
                if img_urls:
                    news_item['img_urls'] = get_stripped_list(img_urls)
                    news_item['cover_image'] = img_urls[0]

                published_date = tree.xpath(
                    './/time[contains(@class,\"the-time\")]/text()')
                if published_date:
                    news_item['published_date'] = datetime.strptime(
                        published_date[0], '%B %d, %Y %I:%M %p')

                author = tree.xpath(
                    './/a[contains(@class,\"author url fn\")]/text()')
                if author:
                    news_item['author'] = get_stripped_list(author)

                tags = tree.xpath(
                    './/div[contains(@class,\"article-tags\")]/a/text()')
                if tags:
                    news_item['tags'] = get_stripped_list(tags)

                # Tag the item with the category/sub-category whose URL list
                # contains the referring page.
                referer = response.request.headers['Referer']
                for item in categories:
                    subcategories = item['subcategory']
                    if referer in sum(subcategories.values(), []):
                        news_item['category'] = item['category']
                        key = next(name
                                   for name, urls in subcategories.items()
                                   if referer in urls)
                        news_item['sub_categories'] = [key]
                return news_item
        except Exception as e:
            # The handler used to be a bare ``except:`` that referenced an
            # unbound ``e``, so logging itself raised NameError.
            self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
Ejemplo n.º 14
0
	def parse_item(self, response):
		"""Parse an article page into a NewsItem.

		Returns the populated item when both a headline and body text are
		found; returns None when either is missing or when extraction raises
		(the exception is logged). Images and the publication date are
		optional: the item is still returned when the page has neither.
		"""
		super(NdtvSpider, self).parse_item(response)
		htmlparser = etree.HTMLParser()
		tree = etree.parse(BytesIO(response.body), htmlparser)

		news_item = NewsItem()
		try:
			title = tree.xpath('.//h1[@itemprop="headline"]//text()')
			details = tree.xpath('.//div[contains(@class, "pdl200")]//text()[not(ancestor::script)]')
			if title and details:
				news_item['source'] = self.name
				news_item['crawled_date'] = datetime.now()
				# Drop the query string from the stored URL.
				news_item['source_url'] = response.url.split('?')[0]
				news_item['title'] = title[0].strip().encode('ascii','ignore')
				# Whitespace-only text nodes are skipped.
				news_item['details'] ='\t'.join([item.strip().encode('ascii','ignore').decode('unicode_escape') for item in details if item.strip()])

				img_urls = tree.xpath('.//div[contains(@class,"whosaid_top_mainimg_cont")]/img/@src')
				if img_urls:
					stripped_urls = get_stripped_list(img_urls)
					news_item['img_urls'] = stripped_urls
					# The cover image used to be indexed unconditionally,
					# which raised on pages without a matching image.
					if stripped_urls:
						news_item['cover_image'] = stripped_urls[0]

				published_date = tree.xpath('.//div[contains(@class, "dateline")]/text()')
				if published_date:
					# Index only after the emptiness check. The date is the
					# text after the first ':' with "(IST)" removed.
					date_str = published_date[0].replace("(IST)","").strip().split(":",1)[1]
					news_item['published_date'] = parse(date_str)

				tags=tree.xpath('.//p[contains(@class, "alltags")]/a/text()')
				if tags:
					news_item['tags'] = get_stripped_list(tags)

				author = tree.xpath('.//div[contains(@class, "dateline")]/a/text()')
				if author:
					news_item['author'] = get_stripped_list(author)

				# Tag the item with the category/sub-category whose URL list
				# contains the referring page.
				referer = response.request.headers['Referer']
				for item in categories:
					subcategories = item['subcategory']
					if referer in sum(subcategories.values(), []):
						news_item['category'] = item['category']
						key = next(name for name, urls in subcategories.items() if referer in urls)
						news_item['sub_categories'] = [key]

				return news_item

		except Exception as e:
			self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 15
0
	def parse_item(self, response):
		"""Parse an article page into a NewsItem.

		Returns the populated item when both a title and body text are found;
		returns None when either is missing or when extraction raises (the
		exception is logged). The publication date is optional: the item is
		still returned when the page has none.
		"""
		super(Thehindubusiness, self).parse_item(response)
		htmlparser = etree.HTMLParser()
		tree = etree.parse(BytesIO(response.body), htmlparser)

		news_item = NewsItem()
		try:
			title = tree.xpath('.//h1[@class=\'title\']/text()')
			details = tree.xpath('.//div[starts-with(@id,"content-body-14269002")]//p//text()')
			if title and details:
				news_item['source'] = self.name
				news_item['crawled_date'] = datetime.now()
				# Drop the query string from the stored URL.
				news_item['source_url'] = response.url.split('?')[0]
				news_item['title'] = title[0].strip().encode('ascii','ignore')
				news_item['details'] = "\t".join([x.strip().encode('ascii','ignore')for x in details]).strip()

				img_urls = tree.xpath('.//div[@class="img-container picture"]/img/@data-proxy-image')
				other_img_urls = tree.xpath('.//div[contains(@id,"hcenter")]/img/@src')

				# When both queries match, the "hcenter" images win.
				if img_urls:
					news_item['img_urls'] = get_stripped_list(img_urls)
				if other_img_urls:
					news_item['img_urls'] = get_stripped_list(other_img_urls)

				cover_image = tree.xpath('.//div[@class="img-container picture"]/img/@data-proxy-image')
				if cover_image:
					news_item['cover_image'] = cover_image[0].strip()

				tags = tree.xpath('.//div[contains(@id, "articleKeywords")]/p//a/text()')
				if tags:
					news_item['tags'] = get_stripped_list(tags)

				published_date = tree.xpath('.//div[@class="teaser-text update-time"]/span/none/text()')
				if published_date:
					# Index only after the emptiness check.
					date_str = published_date[0].replace("IST","").strip()
					news_item['published_date'] = parse(date_str)

				# Tag the item with the category/sub-category whose URL list
				# contains the referring page.
				referer = response.request.headers['Referer']
				for item in categories:
					subcategories = item['subcategory']
					if referer in sum(subcategories.values(), []):
						news_item['category'] = item['category']
						key = next(name for name, urls in subcategories.items() if referer in urls)
						news_item['sub_categories'] = [key]
				return news_item
		except Exception as e:
			# The handler used to be a bare ``except:`` that referenced an
			# unbound ``e``, so logging itself raised NameError.
			self.log('==Exception=================>>>>>>>>! %r' % e)
		return None
Ejemplo n.º 16
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated NewsItem when both a title and body text are
        found. Returns None when either is missing, or when extraction raises
        (the exception is logged).
        """
        super(PandoSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:
            title = tree.xpath(".//div[contains(@class,'shim')]/h1//text()")
            details = tree.xpath(
                ".//div[contains(@class,'contains-copy excerpt')]//p//text()")

            if title and details:
                news_item['source'] = self.name
                news_item['crawled_date'] = datetime.now()
                news_item['source_url'] = response.url.split('?')[0]
                news_item['title'] = title[0].strip().encode('ascii', 'ignore')
                news_item['details'] = "\t".join(
                    [det.strip().encode('ascii', 'ignore') for det in details])

                img_urls = tree.xpath(
                    ".//p[contains(@id,'featured-image')]/img/@src")
                if img_urls:
                    news_item['img_urls'] = img_urls

                blurb = tree.xpath(
                    ".//div[contains(@class,'contains-copy excerpt')]/p/text()")
                if blurb:
                    # Distinct loop name: the comprehension variable must not
                    # rebind the list it iterates over.
                    news_item['blurb'] = " ".join(
                        [line.strip().encode('ascii', 'ignore')
                         for line in blurb])

                cover_image = tree.xpath(
                    ".//p[contains(@id,'featured-image')]/img/@src")
                if cover_image:
                    news_item['cover_image'] = cover_image

                published_date = tree.xpath('//*[@id="byline"]/span/text()')
                # The date is read from the second text node, so require it.
                if len(published_date) > 1:
                    news_item['published_date'] = datetime.strptime(
                        published_date[1].split('\n')[1].strip(), '%B %d, %Y')

                author = tree.xpath(
                    ".//p[contains(@id,'byline')]/span//a/text()")
                # The author is read from the second link, after a "By" prefix.
                if len(author) > 1 and 'By' in author[1]:
                    news_item['author'] = author[1].split('By')[1].strip()
                return news_item

        except Exception as e:
            # Record the failure instead of discarding it silently.
            self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
Ejemplo n.º 17
0
    def parse(self, response):
        """Yield one NewsItem for a schema.org NewsArticle page.

        Nothing is yielded when the article element, url, headline,
        description, creation date, author or body paragraphs are missing.
        """
        article = response.xpath('//article[@itemtype="https://schema.org/NewsArticle"]')
        # xpath() returns a (possibly empty) list, never None.
        if not article:
            return

        url = article.xpath('//meta[@itemprop="url"]/@content').extract_first()
        if url is None:
            return

        title = article.xpath('//meta[@itemprop="headline"]/@content').extract_first()
        if title is None:
            return

        # str.find returns -1 when the suffix is absent; str.index would
        # raise ValueError and abort the whole callback.
        suffix_at = title.find(' - CNN')
        if suffix_at >= 0:
            title = title[:suffix_at]

        # Check the raw values before handing them to remove_unicode.
        description = article.xpath('//meta[@itemprop="description"]/@content').extract_first()
        if description is None:
            return

        date = article.xpath('//meta[@itemprop="dateCreated"]/@content').extract_first()
        if date is None:
            return

        author = article.xpath('//meta[@itemprop="author"]/@content').extract_first()
        if author is None:
            return

        # "speakable" paragraphs are collected first, then the plain ones.
        paragraphs = response.xpath('//div[@class="zn-body__paragraph speakable"]')
        paragraphs.extend(response.xpath('//div[@class="zn-body__paragraph"]'))
        if not paragraphs:
            return

        content = []
        for p in paragraphs:
            content.extend(p.xpath('string()').extract())

        item = NewsItem()
        item['url'] = url
        item['title'] = remove_unicode(title)
        item['description'] = remove_unicode(description)
        item['date'] = parse(date).strftime("%Y-%m-%dT%H:%M:%S")
        item['author'] = remove_unicode(author)
        item['content'] = remove_unicode(' '.join(content))

        yield item
Ejemplo n.º 18
0
	def parse_item(self, response):
		"""Parse an article page into a NewsItem.

		Returns the populated NewsItem when both a title and body text are
		found. Returns None when either is missing, or when extraction raises
		(the exception is logged).
		"""
		super(ETSpider, self).parse_item(response)

		htmlparser = etree.HTMLParser()
		tree = etree.parse(BytesIO(response.body), htmlparser)
		news_item = NewsItem()

		try:
			title = tree.xpath('.//h1[contains(@class, "title")]/text()[1]')
			details = tree.xpath(".//div[@class='Normal']//text()")
			if title and details:
				news_item['source'] = self.name
				news_item['crawled_date'] = datetime.now()
				news_item['source_url'] = response.url.split('?')[0]
				news_item['title'] = title[0].strip().decode('unicode_escape').encode('ascii', 'ignore')
				news_item['details'] = "\t".join([item.strip().encode('ascii', 'ignore') for item in details if item.strip()])
				# Defaults, overwritten below when the page provides values.
				news_item['cover_image'] = ''
				news_item['blurb'] = ''
				news_item['img_urls'] = []

				img_urls = tree.xpath('.//figure/img/@src')
				if img_urls:
					news_item['img_urls'] = get_stripped_list(img_urls)
				meta_result = self.get_meta(tree)

				if 'og:image' in meta_result:
					news_item['cover_image'] = meta_result['og:image']

				if 'og:description' in meta_result:
					news_item['blurb'] = meta_result['og:description']

				news_item['blurb'] = news_item['blurb'].decode('unicode_escape').encode('ascii', 'ignore')

				published_date = tree.xpath(".//div[contains(@class,'byline')]/text()")
				# The spider log already records the raw byline; no separate
				# stdout print is needed.
				self.log('==Pub date=================>>>>>>>>! %r' % published_date)
				if published_date:
					# The byline text holds the author before the first "|",
					# and the date in the segment after the first ":".
					news_item['author'] = published_date[0].split('|')[0].strip()
					date_str = published_date[0].split(":")[1].replace("IST", "").strip()
					news_item['published_date'] = datetime.strptime(date_str, '%b %d, %Y, %I.%M %p')

				referer = response.request.headers['Referer']
				for item in categories:
					if referer in sum(item['subcategory'].values(), []):
						news_item['category'] = item['category']
						key = next(key for key, value in item['subcategory'].items() if referer in value)
						news_item['sub_categories'] = [key]
				return news_item
		except Exception as e:
			self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 19
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated NewsItem when both a title and body text are
        found. Returns None when either is missing, or when extraction raises
        (the exception is logged). Tags, date, author, images and blurb are
        optional and are filled in only when present on the page.
        """
        super(DealCurrySpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()

        try:
            title = tree.xpath(".//h1/text()")
            details = tree.xpath(
                './/div[contains(@class, "articleSpacer")]/p//text()')
            if title and details:
                news_item['source_url'] = response.url.split('?')[0]
                news_item['source'] = self.name
                news_item['crawled_date'] = datetime.now()
                news_item['title'] = title[0].strip().encode('ascii', 'ignore')
                news_item['details'] = "\t".join([
                    x.strip().encode('ascii', 'ignore') for x in details
                ]).strip()

                # Each optional field is guarded, so that a page lacking one
                # of them still yields an item rather than being dropped.
                tags = tree.xpath(
                    './/div[contains(@style, "padding-bottom:10px")]/span[contains(@style, "color:#346f9a; float:left; text-align:left")]/a/text()'
                )
                if tags:
                    news_item['tags'] = tags[0].strip().encode(
                        'ascii', 'ignore')

                published_date = tree.xpath(
                    ".//span[contains(@style, 'color:#6b6b6b;float:left; text-align:left; margin-left:5px')]/text()"
                )
                if published_date:
                    news_item['published_date'] = datetime.strptime(
                        published_date[0].encode('ascii', 'ignore'),
                        '%d %B %Y')

                author = tree.xpath(
                    './/div[contains(@style, "")]/span[contains(@style, "color:#6b6b6b; float:left; text-align:left;")]/text()'
                )
                # The author name is whatever follows "by" in the byline.
                if author and 'by' in author[0]:
                    news_item['author'] = author[0].split('by')[1].strip(
                    ).encode('ascii', 'ignore')

                img_urls = tree.xpath(
                    './/div[contains(@style, "padding-bottom:10px")]/img/@src')
                if img_urls:
                    news_item['img_urls'] = get_stripped_list(img_urls)

                meta_result = self.get_meta(tree)

                if 'description' in meta_result:
                    news_item['blurb'] = meta_result['description']

                return news_item

        except Exception as e:
            # Record the failure instead of discarding it silently.
            self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
Ejemplo n.º 20
0
 def parse(self, response):
     """Yield a single NewsItem built from the page's dc.* meta tags and body.

     Title, date and description are taken from the first matching meta
     tag; a missing tag raises IndexError.
     """
     select = response.xpath

     raw_title = select('//meta[@name="dc.title"]/@content').extract()[0]
     title = remove_unicode(raw_title.strip())

     source_texts = select('//*[@class="article-source"]//text()').extract()
     author = " ".join(source_texts).strip()

     raw_date = select('//meta[@name="dc.date"]/@content').extract()[0]
     date = parse(raw_date, fuzzy=True).strftime("%Y-%m-%dT%H:%M:%S")

     raw_description = select('//meta[@name="dc.description"]/@content').extract()[0]
     description = remove_unicode(raw_description.strip())

     body_texts = select('//*[@class="article-body"]/p//text()').extract()
     content = remove_unicode(' '.join(body_texts).strip())

     yield NewsItem(
         title=title,
         author=author,
         date=date,
         description=description,
         content=content,
         url=response.url,
     )
Ejemplo n.º 21
0
	def parse_item(self, response):
		"""Parse an article page into a NewsItem.

		Returns the populated NewsItem when both a title and body text are
		found. Returns None when either is missing, or when extraction raises
		(the exception is logged).
		"""
		super(ReutersSpider, self).parse_item(response)
		htmlparser = etree.HTMLParser()
		tree = etree.parse(BytesIO(response.body), htmlparser)

		news_item = NewsItem()
		try:
			title = tree.xpath(".//h1[contains(@class,\"article-headline\")]/text()")
			details = tree.xpath('//*[@id="article-text"]//text()')

			if title and details:
				news_item['source'] = self.name
				news_item['crawled_date'] = datetime.now()
				news_item['source_url'] = response.url.split('?')[0]
				news_item['title'] = title[0].strip().encode('ascii', 'ignore')
				news_item['details'] = "\t".join([det.strip().encode('ascii', 'ignore') for det in details])

				img_urls = tree.xpath(".//div[contains(@class,'related-photo-container')]/img/@src")
				if img_urls:
					news_item['img_urls'] = get_stripped_list(img_urls)
					news_item['cover_image'] = img_urls[0]

				blurb = tree.xpath(".//div[contains(@class,'related-photo-caption')]/text()")
				if blurb:
					# Distinct loop name: the comprehension variable must not
					# rebind the list it iterates over.
					news_item['blurb'] = " ".join([line.strip().encode('ascii', 'ignore') for line in blurb])

				published_date = tree.xpath(".//span[contains(@class,'timestamp')]//text()")
				# Index only after the emptiness check, so a page without a
				# timestamp still produces an item instead of an IndexError.
				if published_date:
					date_str = published_date[0].replace("|", "").replace("IST", "").strip()
					news_item['published_date'] = parse(date_str)

				author = tree.xpath(".//span[contains(@class,'byline')]/text()")
				if author:
					news_item['author'] = author[0].split('By')[1].strip()

				referer = response.request.headers['Referer']
				for item in categories:
					if referer in sum(item['subcategory'].values(), []):
						news_item['category'] = item['category']
						key = next(key for key, value in item['subcategory'].items() if referer in value)
						news_item['sub_categories'] = [key]
				return news_item

		except Exception as e:
			self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 22
0
    def parse_one_news(self, response):
        """Load a NewsItem for one news page through NewsLoader.

        Rank, news time and category are read from ``response.meta``. The
        title falls back to the article heading when the page has no
        ``<title>`` text, the publisher falls back to "qq.com", and the cover
        falls back to DEFAULT_NEWS_COVER when the article body has no
        absolute image URL.
        """
        loader = NewsLoader(item=NewsItem(), response=response)

        head_title_xpath = '/html/head/title/text()'
        if response.xpath(head_title_xpath).extract():
            loader.add_xpath('title', head_title_xpath)
        else:
            loader.add_xpath(
                "title",
                "//div[@class='qq_article']/div[@class='hd']/h1/text()")
            logger.warning(
                "!!!! did't get title on head,parse <%s>'s body instead." %
                response.url)

        loader.add_value('rank', str(response.meta['rank']))
        loader.add_value('news_time', response.meta['news_time'])

        source_xpath = "string(//div[@class='a_Info']/span[@class='a_source'])"
        source = response.xpath(source_xpath).extract()
        if not source or not source[0]:
            loader.add_value("publisher", u"qq.com")
        else:
            loader.add_xpath('publisher', source_xpath)
        loader.add_value("news_url", response.url)

        body_xpath = "//div[@id='Cnt-Main-Article-QQ']/p[not(style)]"
        loader.add_xpath('content', body_xpath)

        loader.add_value('category', response.meta['category'])
        loader.add_value("site", u"qq.com")

        # Resolve the cover here; do not leave it for the pipeline to find.
        body_paragraphs = response.xpath(body_xpath)
        image_urls = body_paragraphs.xpath(
            ".//img/@src[starts-with(.,'http')]").extract()
        if image_urls:
            news_cover = image_urls[0]
        else:
            news_cover = DEFAULT_NEWS_COVER
        loader.add_value("cover", news_cover)

        return loader.load_item()
Ejemplo n.º 23
0
    def parse(self, response):
        """Yield one NewsItem for a page whose og:type meta tag is "article".

        Nothing is yielded when the og:type marker, url, title, description,
        publication date or body paragraphs are missing.
        """
        article = response.xpath('/html/head/meta[@property="og:type" and @content="article"]')
        # xpath() returns a (possibly empty) list, never None, so test for
        # emptiness to actually skip non-article pages.
        if not article:
            return

        url = response.xpath('//meta[@property="og:url"]/@content').extract_first()
        if url is None:
            return

        # Check the raw values before handing them to remove_unicode.
        title = response.xpath('//meta[@property="og:title"]/@content').extract_first()
        if title is None:
            return

        description = response.xpath('//meta[@property="og:description"]/@content').extract_first()
        if description is None:
            return

        date = response.xpath('//*[@itemprop="datePublished"]/@content').extract_first()
        if date is None:
            return

        authors = ' '.join(response.xpath('//*[@class="byline__author-name" and @itemprop="name"]/@content').extract())
        author = remove_unicode(authors)
        if author is None:
            return

        articleBody = response.xpath('//article[@itemprop="articleBody"]')
        if not articleBody:
            return

        paragraphs = articleBody.xpath('//div[@class="article-body-text component version-2"]//p')
        if not paragraphs:
            return

        content = []
        for p in paragraphs:
            content.extend(p.xpath('string()').extract())

        item = NewsItem()
        item['url'] = url
        item['title'] = remove_unicode(title)
        item['description'] = remove_unicode(description)
        item['date'] = parse(date).strftime("%Y-%m-%dT%H:%M:%S")
        item['author'] = author
        item['content'] = remove_unicode(' '.join(content))

        yield item
Ejemplo n.º 24
0
	def parse_item(self, response):
		"""Parse an article page into a NewsItem.

		Returns the populated NewsItem when both a title and body text are
		found. Returns None when either is missing, or when extraction raises
		(the exception is logged).
		"""
		super(VccircleSpider, self).parse_item(response)
		doc = etree.parse(BytesIO(response.body), etree.HTMLParser())

		news_item = NewsItem()
		try:
			headline = doc.xpath('//*[@id="block-system-main"]/div/div[2]/div[2]/h2/text()')
			body = doc.xpath(".//div[@class='vcc-snippet-body']/p[@class='selectionShareable']//text()")
			if not (headline and body):
				return None

			news_item['source'] = self.name
			news_item['crawled_date'] = datetime.now()
			news_item['source_url'] = response.url.split('?')[0]
			news_item['title'] = headline[0].strip().encode('ascii', 'ignore')
			body_parts = [chunk.strip().encode('ascii', 'ignore') for chunk in body]
			news_item['details'] = "\t".join(body_parts).strip()

			images = doc.xpath('.//div[contains(@class,"field-item even")]/img/@src')
			if images:
				news_item['img_urls'] = get_stripped_list(images)
				news_item['cover_image'] = images[0]

			# An image found in the "MR15" table takes precedence as cover.
			table_images = doc.xpath('.//table[contains(@class,"MR15")]//div/img/@src')
			if table_images:
				news_item['cover_image'] = table_images[0]

			tag_names = doc.xpath('.//div[contains(@class, "content-tags")]//a/text()')
			if tag_names:
				news_item['tags'] = get_stripped_list(tag_names)

			byline = doc.xpath('.//span[contains(@class, "byline_person")]/text()')
			if byline:
				if 'by' in byline[0]:
					news_item['author'] = byline[0].split('by')[1].strip()
				else:
					news_item['author'] = byline[0].strip()

			date_texts = doc.xpath('.//span[contains(@class, "date-display-single")]/text()')
			if date_texts:
				first_date = get_stripped_list(date_texts)[0]
				news_item['published_date'] = datetime.strptime("".join(first_date), '%A, %B %d, %Y -  %I:%M')

			referer = response.request.headers['Referer']
			for entry in categories:
				subcategories = entry['subcategory']
				if referer in sum(subcategories.values(), []):
					news_item['category'] = entry['category']
					matching = next(name for name, urls in subcategories.items() if referer in urls)
					news_item['sub_categories'] = [matching]
			return news_item

		except Exception as e:
			self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 25
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated NewsItem when both a title and body text are
        found. Returns None when either is missing, or when extraction raises
        (the exception is logged).
        """
        super(BusinessStandardSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:
            title = tree.xpath(".//h1[contains(@class,'headline')]//text()")
            details = tree.xpath(
                ".//span[contains(@class,'p-content')]/div//text()[not(ancestor::script)]"
            )
            if title and details:
                news_item['source'] = self.name
                news_item['source_url'] = response.url.split('?')[0]
                news_item['crawled_date'] = datetime.now()
                news_item['title'] = title[0].strip().encode('ascii', 'ignore')
                news_item['details'] = "\t".join([
                    item.strip().encode('ascii', 'ignore') for item in details
                ])

                img_urls = tree.xpath(
                    ".//img[contains(@class,'imgCont')]/@src")
                if img_urls:
                    news_item['img_urls'] = get_stripped_list(img_urls)

                published_date = tree.xpath(
                    ".//p[contains(@class,'fL')]//span//text()")
                # The date is read from the fourth text node, so require at
                # least four rather than merely a non-empty result.
                if len(published_date) > 3:
                    news_item['published_date'] = datetime.strptime(
                        published_date[3].split("\t")[0], '%B %d, %Y')

                related = tree.xpath(
                    ".//div[contains(@class,'readmore_tagBG')]//h2//a/text()"
                )
                if related:
                    news_item['tags'] = [
                        item.strip() for item in related if item.strip()
                    ]

                cover_image = tree.xpath(
                    ".//img[contains(@class,'imgCont')]/@src")
                if cover_image:
                    news_item['cover_image'] = cover_image
                return news_item

        except Exception as e:
            # Record the failure instead of discarding it silently.
            self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
Ejemplo n.º 26
0
	def parse_item(self, response):
		"""Parse an article page into a NewsItem.

		Returns the populated NewsItem when both a title and body text are
		found. Returns None when either is missing, or when extraction raises
		(the exception is logged).
		"""
		super(SMETimesSpider, self).parse_item(response)

		htmlparser = etree.HTMLParser()
		tree = etree.parse(BytesIO(response.body), htmlparser)

		news_item = NewsItem()
		try:
			title = tree.xpath(".//span[contains(@class,\"blue-heading\")]//text()")
			details = tree.xpath('//span[@class="text"]//text()')
			details = [ele.encode('ascii', 'ignore').replace("\n", "") for ele in details]
			if title and details:
				news_item['source'] = self.name
				news_item['crawled_date'] = datetime.now()
				news_item['source_url'] = response.url.split('?')[0]
				news_item['title'] = title[0].strip().encode('ascii', 'ignore')
				news_item['details'] = "\t".join(details)

				img_urls = tree.xpath('//span[contains(@class,"text")]//img/@src')

				# Keep the images only when the first URL mentions this
				# spider's name. The emptiness check keeps image-less
				# articles from failing with IndexError.
				if img_urls and self.name.lower() in img_urls[0].lower():
					news_item['img_urls'] = get_stripped_list(img_urls)
					news_item['cover_image'] = img_urls[0]

				# A single "author | date" text node feeds both fields.
				byline = tree.xpath(".//div[contains(@align,'justify')]/span/span//text()")
				if byline:
					pub_date = byline[0].split("|")[1]
					news_item['published_date'] = datetime.strptime(pub_date, ' %d %b, %Y')
					news_item['author'] = byline[0].split("|")[0].strip()

				referer = response.request.headers['Referer']
				for item in categories:
					if referer in sum(item['subcategory'].values(), []):
						news_item['category'] = item['category']
						key = next(key for key, value in item['subcategory'].items() if referer in value)
						news_item['sub_categories'] = [key]
				# This is the success path, so do not label it an exception.
				self.log('==Parsed item=================>>>>>>>>! %r' % news_item)
				return news_item

		except Exception as e:
			self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 27
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated NewsItem when both a title and body text are
        found. Returns None when either is missing, or when extraction raises
        (the exception is logged).
        """
        super(TechCrunchSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:
            title = tree.xpath(
                ".//h1[contains(@class,'alpha tweet-title')]//text()")
            details = tree.xpath(
                ".//div[contains(@class,'article-entry text')]//p//text()")
            if title and details:
                news_item['title'] = title[0].strip().encode('ascii', 'ignore')
                news_item['details'] = "\t".join(
                    [det.strip().encode('ascii', 'ignore') for det in details])

                news_item['source'] = self.name
                news_item['crawled_date'] = datetime.now()
                news_item['source_url'] = response.url.split('?')[0]

                img_urls = tree.xpath(
                    ".//div[contains(@class,'article-entry text')]/img/@src")
                if img_urls:
                    news_item['img_urls'] = img_urls
                    # The cover is the first of these same images, so one
                    # query serves both fields.
                    news_item['cover_image'] = img_urls[0]

                author = tree.xpath(
                    '/html/body/div[4]/article/div/div[1]/div/header/div[2]/div[1]/a/text()'
                )
                if author:
                    news_item['author'] = author

                return news_item

        except Exception as e:
            # Record the failure instead of discarding it silently.
            self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
Ejemplo n.º 28
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated NewsItem when both a title and body text are
        found. Returns None when either is missing, or when extraction raises
        (the exception is logged).
        """
        super(SmeWebSpider, self).parse_item(response)

        doc = etree.parse(BytesIO(response.body), etree.HTMLParser())

        news_item = NewsItem()
        try:
            headline = doc.xpath('.//h1[contains(@itemprop,"name")]/text()')
            paragraphs = doc.xpath('.//div[@class="article__body"]/p//text()')
            if not (headline and paragraphs):
                return None

            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = headline[0].strip().encode('ascii', 'ignore')
            encoded = [
                text.strip().encode('ascii', 'ignore') for text in paragraphs
            ]
            news_item['details'] = "\t".join(encoded).strip()
            news_item['source_url'] = response.url.split('?')[0]

            figure_urls = doc.xpath(
                './/a[contains(@class,"article__figure__link")]/img/@src')
            if figure_urls:
                news_item['img_urls'] = get_stripped_list(figure_urls)

            meta = self.get_meta(doc)
            if 'og:image' in meta:
                news_item['cover_image'] = meta['og:image']
            if 'og:description' in meta:
                news_item['blurb'] = meta['og:description'].strip().encode(
                    'ascii', 'ignore')

            time_texts = doc.xpath(
                './/span[contains(@class,"article__meta__info")]/time/text()')
            if time_texts:
                stamp = time_texts[0].strip().encode('ascii', 'ignore')
                news_item['published_date'] = datetime.strptime(
                    stamp, '%B %d %Y %I:%M %p')

            meta_values = doc.xpath(
                './/span[contains(@class,"article__meta__value")]/text()')
            if meta_values:
                byline = meta_values[0].strip()
                # A multi-line value carries the name on its second line.
                if '\n' in byline:
                    news_item['author'] = byline.split('\n')[1].strip()
                else:
                    news_item['author'] = byline

            tag_names = doc.xpath(
                './/div[contains(@class,"article__tags-container")]/a/span/text()'
            )
            if tag_names:
                news_item['tags'] = get_stripped_list(tag_names)

            referer = response.request.headers['Referer']
            for entry in categories:
                subcategories = entry['subcategory']
                if referer in sum(subcategories.values(), []):
                    news_item['category'] = entry['category']
                    matching = next(name
                                    for name, urls in subcategories.items()
                                    if referer in urls)
                    news_item['sub_categories'] = [matching]
            return news_item

        except Exception as e:
            self.log('==Exception=================>>>>>>>>! %r' % e)
Ejemplo n.º 29
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated NewsItem when both a title and body text are
        found. Returns None when either is missing, or when extraction raises
        (the exception is logged).
        """
        super(BusinessInsiderSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:
            title = tree.xpath(
                '//*[@id="Content"]/div[3]/div[3]/div[1]/div/div[1]/div/article/div[1]/h1//text()'
            )
            details = tree.xpath(
                './/div[contains(@class,"hide_show_handler main_content")]//p//text()'
            )

            if title and details:
                news_item['source'] = self.name
                news_item['source_url'] = response.url.split('?')[0]
                news_item['crawled_date'] = datetime.now()
                news_item['title'] = title[0].strip().encode('ascii', 'ignore')
                news_item['details'] = "\t".join([
                    item.strip().encode('ascii', 'ignore') for item in details
                ])

                img_urls = tree.xpath(
                    ".//div[contains(@class,'MeetingImg blk')]/img/@src")
                # The site's base URL is prepended to every image src.
                img_url_list = [
                    "http://www.businessinsider.in" + img_url
                    for img_url in img_urls
                ]
                if img_url_list:
                    news_item['img_urls'] = get_stripped_list(img_url_list)
                    # The cover is the first of these same images, so one
                    # query serves both fields.
                    news_item['cover_image'] = img_url_list[0]

                published_date = tree.xpath(
                    ".//div[contains(@class,'ByLine')]//span[contains(@class,'Date')]//text()"
                )
                if published_date:
                    news_item['published_date'] = datetime.strptime(
                        get_stripped_list(published_date)[0],
                        '%b %d, %Y, %I.%M %p')

                author = tree.xpath(".//a[contains(@class,'Name')]/text()")
                if author:
                    news_item['author'] = get_stripped_list(author)

                tags = tree.xpath(
                    ".//span[contains(@class,'anchorLink')]/text()")
                more_tags = tree.xpath(
                    ".//div[contains(@id,'commentHash')]//a/text()")
                if tags:
                    news_item['tags'] = get_stripped_list(tags)
                # When present, the commentHash links replace the tags above.
                if more_tags:
                    news_item['tags'] = get_stripped_list(more_tags)

                referer = response.request.headers['Referer']
                for item in categories:
                    if referer in sum(item['subcategory'].values(), []):
                        news_item['category'] = item['category']
                        key = next(key
                                   for key, value in item['subcategory'].items()
                                   if referer in value)
                        news_item['sub_categories'] = [key]
                return news_item

        except Exception as e:
            # Bind the exception explicitly: the handler formats it, and an
            # unbound name here would raise NameError out of the handler.
            self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
Ejemplo n.º 30
0
    def parse_item(self, response):
        """Parse an article page into a NewsItem.

        Returns the populated NewsItem when a title is found together with
        body text in either of the two supported layouts. Returns None when
        those are missing, or when extraction raises (the exception is
        logged).
        """
        super(MashableSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:
            title = tree.xpath("//h1[contains(@class,'title')]//text()")
            details = tree.xpath(
                '//div[contains(@class,"post-text")]/p//text()')
            detail = tree.xpath(
                '//section[contains(@class,"article-content blueprint")]//p//text()'
            )
            # Parenthesised so a title is always required: `and` binds
            # tighter than `or`, and title[0] is read below.
            if title and (details or detail):
                news_item['source'] = self.name
                news_item['crawled_date'] = datetime.now()
                news_item['source_url'] = response.url.split('?')[0]
                news_item['title'] = title[0].strip().encode('ascii', 'ignore')
                # The "article-content" layout wins when both are present.
                body = detail or details
                news_item['details'] = "\t".join([
                    ele.strip().encode('ascii', 'ignore') for ele in body
                ])

                img_urls = tree.xpath(
                    '//div[contains(@id,"post-content")]//img/@src')

                if img_urls:
                    news_item['img_urls'] = get_stripped_list(img_urls)

                cover_image = tree.xpath(
                    '//div[contains(@id,"post-content")]//img/@src')
                if cover_image:
                    news_item['cover_image'] = cover_image

                meta_result = self.get_meta(tree)

                # og:image, when available, replaces the in-page cover.
                if 'og:image' in meta_result:
                    news_item['cover_image'] = meta_result['og:image']

                if 'og:description' in meta_result:
                    news_item['blurb'] = meta_result['og:description']
                    news_item['blurb'] = news_item['blurb'].strip().encode(
                        'ascii', 'ignore')

                author = tree.xpath(
                    '//span[contains(@class,"author_name")]/a/text()')
                if author:
                    news_item['author'] = author

                tags = tree.xpath(
                    '//footer[contains(@class,"article-topics")]/a/text()')
                if tags:
                    news_item['tags'] = tags

                return news_item
        except Exception as e:
            # Record the failure instead of discarding it silently.
            self.log('==Exception=================>>>>>>>>! %r' % e)
        return None