def parse_news(self, response):
    """Build an ``AllNewsItem`` for a 'kaler_kantho' article page.

    Args:
        response: Page response exposing ``css()`` selectors and
            ``request.url`` (presumably a Scrapy response).

    Yields:
        The populated item, but only when body text was extracted and it
        does not contain the "details coming" placeholder phrase.
    """
    ad_snippet = "googletag.cmd.push(function() { googletag.display('div-gpt-ad-1567335777172-0'); });"
    placeholder = 'বিস্তারিত আসছে...'
    # Checked in order; a later match overrides an earlier one.
    url_categories = (
        ('sport', 'sports'),
        ('country-news', 'bangladesh'),
        ('national', 'bangladesh'),
        ('world', 'international'),
        ('business', 'economy'),
        ('entertainment', 'entertainment'),
        ('info-tech', 'technology'),
        ('lifestyle', 'lifestyle'),
        ('miscellaneous', 'pachmishali'),
    )

    item = AllNewsItem()
    title = response.css('h2::text').extract_first()
    if title:
        item['title'] = str(title).strip()

    # Fall back to <div> text nodes when the article has no <p> text.
    fragments = response.css('.some-class-name2 p ::text').extract()
    if not fragments:
        fragments = response.css('.some-class-name2 div ::text').extract()
    paragraphs = [fragment.strip() + '\n\n' for fragment in fragments]
    body_text = " ".join(paragraphs).replace(ad_snippet, "").lstrip()

    item['description'] = body_text
    item['image'] = response.css('.img-popup img::attr(src)').extract_first()
    url = response.request.url
    item['url'] = url
    item['source'] = 'kaler_kantho'

    # NOTE(review): self.category is not reset here, so a URL matching no
    # keyword keeps whatever value was assigned earlier - confirm intended.
    for keyword, category_name in url_categories:
        if keyword in url:
            self.category = category_name
    item['category'] = Category.objects.get(name=self.category)

    # The placeholder must be searched for in the joined text: a membership
    # test against the paragraph list only matches whole elements, and every
    # element ends with '\n\n', so it could never match.
    if paragraphs and placeholder not in body_text:
        yield item
def parse_news(self, response):
    """Build an ``AllNewsItem`` for a 'jugantor' article page.

    Args:
        response: Page response exposing ``css()`` selectors and
            ``request.url`` (presumably a Scrapy response).

    Yields:
        The populated item, but only when body text was extracted and it
        does not contain the "details coming" placeholder phrase.
    """
    placeholder = 'বিস্তারিত আসছে...'
    # Checked in order; a later match overrides an earlier one.
    url_categories = (
        ('sports', 'sports'),
        ('/national', 'bangladesh'),
        ('country-news', 'bangladesh'),
        ('politics', 'politics'),
        ('international', 'international'),
        ('economics', 'economy'),
        ('entertainment', 'entertainment'),
        ('tech', 'technology'),
        ('editorial', 'opinion'),
        ('lifestyle', 'lifestyle'),
    )
    self.category = ''

    item = AllNewsItem()
    item['title'] = response.css('.headline_section ::text').extract_first()

    fragments = response.css('#myText p ::text').extract()
    paragraphs = [fragment.strip() + '\n\n' for fragment in fragments]
    body_text = " ".join(paragraphs)
    item['description'] = body_text

    # The image path is site-relative; fall back to '' when the page has no
    # image instead of relying on a caught TypeError.
    image_path = response.css('.dtl_img_section img::attr(src)').extract_first()
    if image_path is None:
        item['image'] = ''
    else:
        item['image'] = 'https://www.jugantor.com' + image_path

    url = response.request.url
    item['url'] = url + '/'
    item['source'] = 'jugantor'

    for keyword, category_name in url_categories:
        if keyword in url:
            self.category = category_name
    item['category'] = Category.objects.get(name=self.category)

    # The placeholder must be searched for in the joined text: a membership
    # test against the paragraph list only matches whole elements, and every
    # element ends with '\n\n', so it could never match.
    if paragraphs and placeholder not in body_text:
        yield item
def parse_news(self, response):
    """Build an ``AllNewsItem`` for a 'jago_news24' article page.

    Args:
        response: Page response exposing ``css()``/``xpath()`` selectors
            and ``request.url`` (presumably a Scrapy response).

    Yields:
        The populated item, but only when body text was extracted and it
        does not contain the "details coming" placeholder phrase.
    """
    placeholder = 'বিস্তারিত আসছে...'
    # Checked in order; a later match overrides an earlier one.
    url_categories = (
        ('sports', 'sports'),
        ('/national', 'bangladesh'),
        ('country', 'bangladesh'),
        ('politics', 'politics'),
        ('international', 'international'),
        ('economy', 'economy'),
        ('entertainment', 'entertainment'),
        ('technology', 'technology'),
        ('jago-jobs', 'job'),
        ('lifestyle', 'lifestyle'),
        ('opinion', 'opinion'),
    )
    self.category = ''

    item = AllNewsItem()
    title = response.css('h1 ::text').extract_first()
    if title:
        item['title'] = str(title).strip()

    fragments = response.css('.content-details p ::text').extract()
    paragraphs = [fragment.strip() + '\n\n' for fragment in fragments]
    body_text = " ".join(paragraphs)
    item['description'] = body_text

    # extract_first() yields None when the meta tag is absent, where
    # indexing the selector list with [0] would raise IndexError.
    item['image'] = response.xpath(
        "//meta[@name='twitter:image']/@content").extract_first()

    url = response.request.url
    item['url'] = url
    item['source'] = 'jago_news24'

    for keyword, category_name in url_categories:
        if keyword in url:
            self.category = category_name
    item['category'] = Category.objects.get(name=self.category)

    # The placeholder must be searched for in the joined text: a membership
    # test against the paragraph list only matches whole elements, and every
    # element ends with '\n\n', so it could never match.
    if paragraphs and placeholder not in body_text:
        yield item
def parse_news(self, response):
    """Build an ``AllNewsItem`` for a 'samakal' article page.

    Args:
        response: Page response exposing ``css()`` selectors and
            ``request.url`` (presumably a Scrapy response).

    Yields:
        The populated item, but only when body text was extracted and it
        does not contain the "details coming" placeholder phrase.
    """
    placeholder = 'বিস্তারিত আসছে...'
    # Checked in order; a later match overrides an earlier one.
    url_categories = (
        ('sports', 'sports'),
        ('bangladesh', 'bangladesh'),
        ('politics', 'politics'),
        ('international', 'international'),
        ('economics', 'economy'),
        ('entertainment', 'entertainment'),
        ('technology', 'technology'),
        ('lifestyle', 'lifestyle'),
        ('chakri', 'job'),
    )
    self.category = ''

    item = AllNewsItem()
    item['title'] = response.css('.detail-headline ::text').extract_first()

    fragments = response.css('.description p ::text').extract()
    paragraphs = [fragment.strip() + '\n\n' for fragment in fragments]
    body_text = " ".join(paragraphs)
    item['description'] = body_text

    item['image'] = response.css(
        '.image-container img::attr(src)').extract_first()
    url = response.request.url
    item['url'] = url + '/'
    item['source'] = 'samakal'

    for keyword, category_name in url_categories:
        if keyword in url:
            self.category = category_name
    item['category'] = Category.objects.get(name=self.category)

    # The placeholder must be searched for in the joined text: a membership
    # test against the paragraph list only matches whole elements, and every
    # element ends with '\n\n', so it could never match.
    if paragraphs and placeholder not in body_text:
        yield item
def parse_news(self, response):
    """Build an ``AllNewsItem`` for a 'dhaka_tribune' article page.

    Args:
        response: Page response exposing ``css()`` selectors and
            ``request.url`` (presumably a Scrapy response).

    Yields:
        The populated item when the joined body text is non-empty and does
        not contain the "details coming" placeholder phrase.
    """
    # Checked in order; a later match overrides an earlier one.
    url_categories = (
        ('sports', 'sports'),
        ('bangladesh', 'bangladesh'),
        ('politics', 'politics'),
        ('international', 'international'),
        ('economy', 'economy'),
        ('entertainment', 'entertainment'),
        ('tech', 'technology'),
        ('opinion', 'opinion'),
        ('features', 'pachmishali'),
    )

    news = AllNewsItem()
    news['title'] = response.css('h1::text').extract_first()

    fragments = response.css('.report-content p ::text').extract()
    description = " ".join(
        [fragment.strip() + '\n\n' for fragment in fragments])

    # Append a sentence terminator after the highlighted lead-in wherever it
    # occurs in the body text.
    lead = response.css('.highlighted-content ::text').extract_first()
    if lead:
        description = description.replace(lead, lead + '। ')
    news['description'] = description

    news['image'] = response.css(
        '.reports-big-img img::attr(src)').extract_first()
    page_url = response.request.url
    news['url'] = page_url
    news['source'] = 'dhaka_tribune'

    # NOTE(review): self.category is not reset here, so a URL matching no
    # keyword keeps whatever value was assigned earlier - confirm intended.
    for keyword, category_name in url_categories:
        if keyword in page_url:
            self.category = category_name
    news['category'] = Category.objects.get(name=self.category)

    if description and 'বিস্তারিত আসছে...' not in description:
        yield news
def parse_news(self, response):
    """Build an ``AllNewsItem`` for a 'bd_protidin' article page.

    The description is the ``<article>`` markup with tags and one known ad
    script removed.

    Args:
        response: Page response exposing ``css()``/``xpath()`` selectors
            and ``request.url`` (presumably a Scrapy response).

    Yields:
        The populated item when the cleaned body text is non-empty and does
        not contain the "details coming" placeholder phrase.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    ad_snippet = "googletag.cmd.push(function() { googletag.display('div-gpt-ad-1551006634778-0'); });"
    # Checked in order; a later match overrides an earlier one.
    url_categories = (
        ('sports', 'sports'),
        ('/national/', 'bangladesh'),
        ('country', 'bangladesh'),
        ('city-news', 'bangladesh'),
        ('international-news', 'international'),
        ('mixter', 'pachmishali'),
        ('entertainment', 'entertainment'),
        ('tech-world', 'technology'),
        ('life', 'lifestyle'),
        ('job-market', 'job'),
    )

    item = AllNewsItem()
    item['title'] = response.css('.post-title ::text').extract_first()

    # A page without an <article> element yields None; treat it as empty
    # text so the item is simply not emitted rather than raising TypeError.
    article_html = response.xpath("//article").extract_first() or ''
    description = tag_pattern.sub('', article_html).replace(
        ad_snippet, "").strip()
    item['description'] = description

    # The image path is site-relative; leave the field as None when the
    # page has no main image instead of failing on str + None.
    image_path = response.css('.main-image img::attr(src)').extract_first()
    if image_path is None:
        item['image'] = None
    else:
        item['image'] = ('https://www.' + self.allowed_domains[0] + '/'
                         + image_path)

    url = response.request.url
    item['url'] = url
    item['source'] = 'bd_protidin'

    # NOTE(review): self.category is not reset here, so a URL matching no
    # keyword keeps whatever value was assigned earlier - confirm intended.
    for keyword, category_name in url_categories:
        if keyword in url:
            self.category = category_name
    item['category'] = Category.objects.get(name=self.category)

    if description and 'বিস্তারিত আসছে...' not in description:
        yield item
def parse_news(self, response):
    """Build an ``AllNewsItem`` for a 'jaijaidin' article page.

    Args:
        response: Page response exposing ``css()`` selectors and
            ``request.url`` (presumably a Scrapy response).

    Yields:
        The populated item when the joined body text is non-empty and does
        not contain the "details coming" placeholder phrase.
    """
    # Checked in order; a later match overrides an earlier one.
    url_categories = (
        ('sports', 'sports'),
        ('homeland', 'bangladesh'),
        ('politics', 'politics'),
        ('rong-berong', 'lifestyle'),
        ('abroad', 'international'),
        ('trade-commerce', 'economy'),
        ('entertainment', 'entertainment'),
        ('science-and-technology', 'technology'),
        ('editorial', 'opinion'),
    )

    news = AllNewsItem()
    news['title'] = response.css(
        '.headline_section h1::text').extract_first()

    fragments = response.css('#myText ::text').extract()
    description = " ".join(
        [fragment.strip() + '\n\n' for fragment in fragments])
    news['description'] = description

    # Prefix the site origin only when an image path was actually found.
    image_path = response.css(
        '.dtl_img_section img::attr(src)').extract_first()
    news['image'] = (
        'http://www.jaijaidinbd.com' + image_path
        if image_path else image_path
    )

    page_url = response.request.url
    news['url'] = page_url + '/'
    news['source'] = 'jaijaidin'

    # NOTE(review): self.category is not reset here, so a URL matching no
    # keyword keeps whatever value was assigned earlier - confirm intended.
    for keyword, category_name in url_categories:
        if keyword in page_url:
            self.category = category_name
    news['category'] = Category.objects.get(name=self.category)

    if description and 'বিস্তারিত আসছে...' not in description:
        yield news
def parse_news(self, response):
    """Build an ``AllNewsItem`` for an 'ntv_bd' article page.

    Args:
        response: Page response exposing ``css()`` selectors and
            ``request.url`` (presumably a Scrapy response).

    Yields:
        The populated item when the joined body text is non-empty and does
        not contain the "details coming" placeholder phrase.
    """
    # Checked in order; a later match overrides an earlier one.
    url_categories = (
        ('sports', 'sports'),
        ('bangladesh', 'bangladesh'),
        ('world', 'international'),
        ('economy', 'economy'),
        ('entertainment', 'entertainment'),
        ('tech', 'technology'),
        ('opinion', 'opinion'),
    )
    self.category = ''

    item = AllNewsItem()
    # Strip only a real title; str(None) would otherwise store the literal
    # text 'None' as the headline.
    title = response.css('.color-blue ::text').extract_first()
    item['title'] = title.strip() if title else title

    fragments = response.css('.section-content p ::text').extract()
    # A text node that is exactly "," is kept verbatim; every other node is
    # stripped and started on a new paragraph.
    paragraphs = [
        '\n\n' + fragment.strip() if fragment != "," else fragment
        for fragment in fragments
    ]
    description = " ".join(paragraphs).replace("'", "")
    item['description'] = description

    item['image'] = response.css(
        '.medium-6 img::attr(data-srcset)').extract_first()
    url = response.request.url
    item['url'] = url
    item['source'] = 'ntv_bd'

    for keyword, category_name in url_categories:
        if keyword in url:
            self.category = category_name
    item['category'] = Category.objects.get(name=self.category)

    if description and 'বিস্তারিত আসছে...' not in description:
        yield item
def parse_news(self, response):
    """Build an ``AllNewsItem`` for an 'ittefaq' article page.

    Body text is taken from ``<span>`` nodes; when those yield nothing it
    falls back to ``<strong>`` text followed by ``<p>`` text. Several
    "read more" marker phrases are removed from the result.

    Args:
        response: Page response exposing ``css()`` selectors and
            ``request.url`` (presumably a Scrapy response).

    Yields:
        The populated item when the body text is non-empty and does not
        contain the "details coming" placeholder phrase.
    """
    read_more_markers = (
        'আরও পড়ুন:',
        'আরও পড়ুন :',
        'আরো পড়ুন :',
        'আরো পড়ুন:',
    )
    # Checked in order; a later match overrides an earlier one.
    url_categories = (
        ('sports', 'sports'),
        ('national', 'bangladesh'),
        ('politics', 'politics'),
        ('wholecountry', 'bangladesh'),
        ('worldnews', 'international'),
        ('economy', 'economy'),
        ('entertainment', 'entertainment'),
        ('scienceandtechnology', 'technology'),
    )

    def join_fragments(fragments):
        # Strip each text node, end it with a blank line, join with spaces.
        return " ".join([fragment.strip() + '\n\n' for fragment in fragments])

    item = AllNewsItem()
    item['title'] = response.css('.dtl_hl_block h1::text').extract_first()

    description = join_fragments(
        response.css('.dtl_content_block span::text').extract())
    if not description:
        # Fallback layout: <strong> lead-in (possibly empty) plus <p> text.
        fallback = (
            response.css('.dtl_content_block strong::text').extract()
            + response.css('.dtl_content_block p::text').extract()
        )
        description = join_fragments(fallback)

    for marker in read_more_markers:
        description = description.replace(marker, '')
    item['description'] = description

    # The image path is site-relative; leave the field as None when the
    # page has no image instead of failing on str + None.
    image_path = response.css('.dtl_img_block img::attr(src)').extract_first()
    if image_path is None:
        item['image'] = None
    else:
        item['image'] = 'https://' + self.allowed_domains[0] + image_path

    url = response.request.url
    item['url'] = url + '/'
    item['source'] = 'ittefaq'

    # NOTE(review): self.category is not reset here, so a URL matching no
    # keyword keeps whatever value was assigned earlier - confirm intended.
    for keyword, category_name in url_categories:
        if keyword in url:
            self.category = category_name
    item['category'] = Category.objects.get(name=self.category)

    if description and 'বিস্তারিত আসছে...' not in description:
        yield item
def parse_news(self, response):
    """Build an ``AllNewsItem`` for a 'prothom_alo' article page.

    Args:
        response: Page response exposing ``css()`` selectors and
            ``request.url`` (presumably a Scrapy response).

    Yields:
        The populated item, but only when body text was extracted and it
        does not contain the "details coming" placeholder phrase.
    """
    placeholder = 'বিস্তারিত আসছে...'
    # Checked in order; a later match overrides an earlier one.
    url_categories = (
        ('sports', 'sports'),
        ('bangladesh', 'bangladesh'),
        ('international', 'international'),
        ('economy', 'economy'),
        ('entertainment', 'entertainment'),
        ('chakri-bakri', 'job'),
        ('technology', 'technology'),
        ('life-style', 'lifestyle'),
        ('pachmisheli', 'pachmishali'),
        ('opinion', 'opinion'),
    )

    item = AllNewsItem()
    item['title'] = response.css('.mb10 ::text').extract_first()

    fragments = response.css('div[itemprop=articleBody] p ::text').extract()
    # Text nodes mentioning the paper's own name only lose surrounding
    # newlines; all others are stripped and end with a blank line.
    paragraphs = [
        fragment.strip() + '\n\n' if 'প্রথম আলো' not in fragment
        else fragment.strip('\n')
        for fragment in fragments
    ]
    body_text = " ".join(paragraphs)
    item['description'] = body_text

    # Image preference: inline article image, then featured image (both get
    # an 'https:' prefix), then the src of an embedded iframe as-is.
    image = response.css(
        'div[itemprop=articleBody] img::attr(src)').extract_first()
    if not image:
        image = response.css('.featured_image img::attr(src)').extract_first()
    if image:
        image = 'https:' + image
    else:
        image = response.css(
            'div[itemprop=articleBody] iframe::attr(src)').extract_first()
    item['image'] = image

    url = response.request.url
    item['url'] = url
    item['source'] = 'prothom_alo'

    # NOTE(review): self.category is not reset here, so a URL matching no
    # keyword keeps whatever value was assigned earlier - confirm intended.
    for keyword, category_name in url_categories:
        if keyword in url:
            self.category = category_name
    item['category'] = Category.objects.get(name=self.category)

    # The placeholder must be searched for in the joined text: a membership
    # test against the paragraph list only matches whole elements, so it
    # could never match a phrase embedded in a longer paragraph.
    if paragraphs and placeholder not in body_text:
        yield item