def parse(self, response):
    # Entry point for teamtalk listing pages: featured (hero) articles plus
    # the normal article list. Each discovered URL is scheduled with
    # parse_article, carrying the partially-filled item in the request meta.
    # NOTE(review): indentation was lost in this chunk; the nesting below is
    # a reconstruction -- confirm against the original file.
    # getting features articles
    # Heuristic: a Response whose repr has fewer than 5 '/'-separated
    # segments is treated as a shallow (front) page, the only page with a
    # hero section -- TODO confirm this URL-depth heuristic.
    if len(str(response).replace('<', '').replace('>', '').split('/')) < 5:
        item = ArticleItem()
        item['type'] = "article"
        # Main featured (hero) article.
        url = response.xpath(
            ".//section[@class='hero']/div/figure/a/@href")[0].extract()
        item['url'] = url
        # The caption markup may contain nested tags; BeautifulSoup's
        # .string reduces it to the bare text node.
        item['title'] = BeautifulSoup(
            response.xpath(
                ".//section[@class='hero']/div/figure/a/figcaption/h2")
            [0].extract(), "lxml").string
        item['src'] = 'teamtalk'
        item['lang'] = 'en'
        yield scrapy.Request(url, callback=self.parse_article,
                             meta={'item': item})
        # Secondary featured articles listed under the hero.
        for sel in response.xpath(".//ul[@class='hero__list']/li"):
            item = ArticleItem()
            item['type'] = "article"
            url = sel.xpath(".//a/@href")[0].extract()
            item['url'] = url
            item['src'] = 'teamtalk'
            item['lang'] = 'en'
            item['title'] = BeautifulSoup(
                sel.xpath(".//h3")[0].extract(), "lxml").string
            yield scrapy.Request(url, callback=self.parse_article,
                                 meta={'item': item})
    # getting normal articles
    for sel in response.xpath(".//ul[@class='articleList__list']/li"):
        item = ArticleItem()
        item['type'] = "article"
        url = sel.xpath(".//a/@href")[0].extract()
        item['url'] = url
        item['title'] = BeautifulSoup(
            sel.xpath(".//h3")[0].extract(), "lxml").string
        item['summary'] = BeautifulSoup(
            sel.xpath(".//p")[0].extract(), "lxml").string
        item['src'] = 'teamtalk'
        item['lang'] = 'en'
        # Only the normal-article items carry a running index.
        item['itemIndex'] = self.itemCount
        self.itemCount += 1
        yield scrapy.Request(url, callback=self.parse_article,
                             meta={'item': item})
def parse(self,response):
    # Parse the goal.com Arabic news-archive listing; each entry is
    # scheduled with parse_article, carrying the item in the request meta.
    for sel in response.xpath("//div[contains(@id,'news-archive')]//ul/li"):
        item = ArticleItem()
        item['type'] = "article"
        article_info = sel.xpath(".//div[contains(@class,'articleInfo')]")[0]
        item['title'] = article_info.xpath(".//a/text()")[0].extract()
        # Summary block is optional on some entries.
        if sel.xpath(".//div[contains(@class,'articleSummary')]/text()"):
            item['summary'] = sel.xpath(".//div[contains(@class,'articleSummary')]/text()")[0].extract()
        # Date = the day label from the enclosing date group (two levels up
        # from the <li>) concatenated with the time taken from the entry's
        # second <span>. The .encode('utf8')/str() dance is a Python-2 ism.
        item['date'] = str(sel.xpath("../../div[contains(@class,'date')]/text()")[0].extract().encode('utf8')) + str(sel.xpath(".//span")[1].xpath(".//text()")[0].extract().encode('utf8'))
        relative_url = str(article_info.xpath(".//a/@href")[0].extract())
        # Archive links point into /en/news/archive; strip that segment to
        # reach the canonical article URL.
        url = response.urljoin(relative_url).replace("/en/news/archive","")
        item['url'] = url
        item['src'] = 'goal'
        item['lang'] = 'ar'
        tag = sel.xpath(".//strong/text()")[0].extract()
        item['tags'] = [tag]
        item['itemIndex'] = self.itemCount
        self.itemCount = self.itemCount+1
        yield scrapy.Request(url, callback=self.parse_article,meta={'item': item})
def parse(self, response):
    """Fetch the Bleacher Report world-football stream from its JSON API
    and schedule every article for parse_article.
    """
    feed_url = ("http://layser.bleacherreport.com/api/team_stream/"
                "world-football?tags=null&limit=100")
    payload = json.load(urllib2.urlopen(feed_url))
    for entry in payload['streams'][0]['items']:
        item = ArticleItem()
        item['postId'] = self.name + str(entry["id"])
        item['type'] = "article"
        item['title'] = entry['title']
        link = entry['permalink']
        item['url'] = entry['permalink']
        item['image'] = entry["primary_image_650x440"]
        item['date'] = entry["publishedAt"]
        item['src'] = 'bleacher_report'
        item['lang'] = 'en'
        item['tags'] = entry['tags']
        item['itemIndex'] = self.itemCount
        self.itemCount += 1
        yield scrapy.Request(link,
                             callback=self.parse_article,
                             meta={'item': item})
def parse(self,response):
    """Scrape the cairokora front page: featured slider articles plus
    thumbnail articles, scheduling each with parse_article.
    """
    # Featured slider articles.
    for sel in response.xpath(".//div[contains(@class,'slider_bottom clearfix')]/div[starts-with(@class,'section')]"):
        item = ArticleItem()
        item['type'] = "article"
        url = sel.xpath(".//a[contains(@class,'title')]/@href")[0].extract()
        item['url'] = url
        item['title'] = sel.xpath(".//a[contains(@class,'title')]/h3/text()")[0].extract()
        item['summary'] = sel.xpath(".//p[contains(@class,'desc')]/text()")[0].extract()
        item['src'] = 'cairokora'
        item['lang'] = 'ar'
        item['itemIndex'] = self.itemCount
        # BUG FIX: was `itemCount = self.itemCount+1`, which assigned a dead
        # local -- the shared counter never advanced, so every item got the
        # same index.
        self.itemCount = self.itemCount+1
        yield scrapy.Request(url, callback=self.parse_article,meta={'item': item})
    # Thumbnail articles.
    for sel in response.xpath(".//div[contains(@class,'thum clearfix')]"):
        # BUG FIX: the original reused the `item` left over from the first
        # loop (stale 'summary'/'itemIndex' fields leaked into these items,
        # and a NameError occurred when the first loop matched nothing).
        # Build a fresh item per article, mirroring the first loop.
        item = ArticleItem()
        item['type'] = "article"
        url = sel.xpath(".//a[contains(@class,'title')]/@href")[0].extract()
        item['url'] = url
        item['title'] = sel.xpath(".//a[contains(@class,'title')]/h3/text()")[0].extract()
        item['src'] = 'cairokora'
        item['lang'] = 'ar'
        item['itemIndex'] = self.itemCount
        self.itemCount = self.itemCount+1
        yield scrapy.Request(url, callback=self.parse_article,meta={'item': item})
def parse(self,response):
    """Scrape the SkySports news listing and schedule each article with
    parse_article, carrying the partially-filled item in the request meta.
    """
    # FIX: removed dead local `itemCount = 1`; indexing uses self.itemCount.
    for sel in response.xpath(".//div[contains(@class,'news-list__item news-list__item--show-thumb-bp30')]"):
        item = ArticleItem()
        item['type'] = "article"
        url = sel.xpath(".//h4/a/@href")[0].extract()
        item['url'] = url
        item['title'] = sel.xpath(".//h4/a/text()")[0].extract()
        # The snippet paragraph is optional on some cards.
        if sel.xpath(".//p[contains(@class,'news-list__snippet')]/text()"):
            item['summary'] = sel.xpath(".//p[contains(@class,'news-list__snippet')]/text()")[0].extract()
        item['src'] = 'SkySports'
        item['lang'] = 'en'
        item['date'] = sel.xpath(".//span[contains(@class,'label__timestamp')]/text()")[0].extract()
        item['itemIndex'] = self.itemCount
        self.itemCount = self.itemCount+1
        yield scrapy.Request(url, callback=self.parse_article,meta={'item': item})
def parse(self, response):
    """Walk the WhoScored editorial list and queue each linked article
    for parse_article.
    """
    editorial_links = response.xpath(".//ul[@class='ws-editorial-list-items']/a")
    for link in editorial_links:
        href = link.xpath(".//@href")[0].extract()
        full_url = response.urljoin(href)
        title_text = link.xpath(
            ".//div[@class='ws-editorial-title']/text()")[0].extract().strip()
        item = ArticleItem()
        item['url'] = full_url
        item['title'] = title_text
        item['src'] = 'whoscored'
        item['lang'] = 'en'
        item['type'] = 'article'
        item['summary'] = ''
        yield scrapy.Request(full_url,
                             callback=self.parse_article,
                             meta={'item': item})
def parse(self,response):
    """Scrape the filgoal 'AllNews' listing and schedule each article with
    parse_article.
    """
    for sel in response.xpath(".//div[contains(@class,'AllNews SeeAlso')]/ul/li"):
        item = ArticleItem()
        item['type'] = "article"
        relative_url = sel.xpath(".//a/@href")[0].extract()
        url = response.urljoin(relative_url)
        item['url'] = url
        item['title'] = sel.xpath(".//span[contains(@class,'ANT')]/text()")[0].extract()
        item['summary'] = sel.xpath(".//span[contains(@class,'ANB')]/text()")[0].extract()
        item['src'] = 'filgoal'
        item['lang'] = 'ar'
        item['date'] = sel.xpath(".//div[contains(@class,'ANTInfo')]/span/text()")[0].extract()
        item['itemIndex'] = self.itemCount
        # BUG FIX: was `itemCount = self.itemCount+1` -- assigned a dead
        # local, so the shared counter never advanced and every item got
        # the same index.
        self.itemCount = self.itemCount+1
        yield scrapy.Request(url, callback=self.parse_article,meta={'item': item})
def parse(self,response):
    """Iterate the korabia all-news list and schedule every article with
    parse_article.
    """
    for entry in response.xpath(".//ul[contains(@class,'all-news-list')]/li"):
        href = entry.xpath(".//div/h5/a/@href")[0].extract()
        absolute_url = response.urljoin(href)
        item = ArticleItem()
        item['type'] = "article"
        item['url'] = absolute_url
        item['title'] = entry.xpath(".//div/h5/a/text()")[0].extract()
        item['summary'] = entry.xpath(".//div/p/text()")[0].extract()
        item['src'] = 'korabia'
        item['lang'] = 'ar'
        item['date'] = entry.xpath(".//div/p[contains(@class,'info')]/span/text()")[0].extract()
        item['itemIndex'] = self.itemCount
        self.itemCount += 1
        yield scrapy.Request(absolute_url,
                             callback=self.parse_article,
                             meta={'item': item})
def parse(self, response):
    """Parse one page of a Facebook Graph API feed (JSON body).

    Photo posts get a Graph picture URL as their image; posts that look
    like external links are routed through parse_link to resolve their
    preview image; everything else is yielded directly.
    """
    posts = json.loads(response.body)
    itemCount = 1
    for post in posts["data"]:
        item = ArticleItem()
        item['type'] = "article"  # needs to be checked
        url = post['link']
        item['postId'] = self.name + post['id']
        item['url'] = url
        item['tags'] = ' '
        item['title'] = post['from']["name"]
        # 'message' is absent on some post types.
        if 'message' in post:
            item['summary'] = post['message']
        else:
            item['summary'] = ' '
        item['src'] = 'facebook'
        item['lang'] = 'en'
        # Page avatar: the page id is the part of the post id before '_'.
        item['account_image'] = 'https://graph.facebook.com/' + post[
            'id'].split(
                '_'
            )[0] + '/picture?type=normal&access_token=' + self.access_token
        item['date'] = post["created_time"]
        item['content'] = item['summary']
        item['itemIndex'] = itemCount
        itemCount = itemCount + 1
        isLink = False
        if post['type'] == 'photo':
            # Photo object id is the part of the post id after '_'.
            item['image'] = 'https://graph.facebook.com/' + post['id'].split(
                '_'
            )[-1] + '/picture?type=normal&access_token=' + self.access_token
        elif post['type'] == 'link' or ('facebook.com' not in url
                                        and 'fbcdn' not in url):
            item['image'] = ''
            isLink = True
            yield scrapy.Request(url,
                                 callback=self.parse_link,
                                 meta={'item': item},
                                 dont_filter=True)
        else:
            item['image'] = ''
        # IDIOM FIX: was `if isLink == False:` -- compare truthiness, not
        # equality with a boolean literal.
        if not isLink:
            yield item
def parse(self, response):
    """Scrape the beIN content-gallery listing and schedule each article
    with parse_article.
    """
    # IDIOM FIX: dropped the unused `enumerate` index -- only the selector
    # was ever used inside the loop.
    for sel in response.xpath(
            ".//li[contains(@class,'content-gallery__item w50')]"):
        item = ArticleItem()
        item['type'] = "article"
        relative_url = sel.xpath(".//figcaption/a/@href")[0].extract()
        url = response.urljoin(relative_url)
        item['url'] = url
        item['title'] = sel.xpath(".//figcaption/a/text()")[0].extract()
        item['src'] = 'bein'
        item['lang'] = 'en'
        item['itemIndex'] = self.itemCount
        self.itemCount = self.itemCount + 1
        yield scrapy.Request(url,
                             callback=self.parse_article,
                             meta={'item': item})
def parse(self, response):
    """Scrape yallakora clip/news list items and schedule each with
    parse_article.
    """
    for sel in response.xpath(".//li[contains(@class,'ClipItem')]"):
        item = ArticleItem()
        item['type'] = "article"
        relative_url = sel.xpath(
            ".//a[contains(@class,'NewsTitle')]/@href")[0].extract()
        url = response.urljoin(relative_url)
        # Title and summary both come from the NewsTitle anchor text; only
        # the title is stripped (original behavior preserved). Hoisted the
        # duplicated xpath extract.
        title_text = sel.xpath(
            ".//a[contains(@class,'NewsTitle')]/text()")[0].extract()
        item['summary'] = title_text
        item['url'] = url
        item['title'] = title_text.strip()
        item['src'] = 'yallakora'
        item['lang'] = 'ar'
        item['itemIndex'] = self.itemCount
        # BUG FIX: was `itemCount = self.itemCount + 1` -- assigned a dead
        # local, so the shared counter never advanced.
        self.itemCount = self.itemCount + 1
        yield scrapy.Request(url,
                             callback=self.parse_article,
                             meta={'item': item})
def parse(self, response):
    """Scrape the FIFA (Arabic) thumbnail listing and schedule each
    article with parse_article.
    """
    for sel in response.xpath(
            ".//li[contains(@class,'dcm-thumblist-item')]"):
        item = ArticleItem()
        item['type'] = "article"
        relative_url = sel.xpath(".//h4/a/@href")[0].extract()
        url = response.urljoin(relative_url)
        item['url'] = url
        item['title'] = sel.xpath(".//h4/a/text()")[0].extract()
        # The picture-comment attribute doubles as the summary text.
        item['summary'] = sel.xpath(
            ".//img/@ph-data-picture-comment")[0].extract()
        item['src'] = 'fifa'
        item['lang'] = 'ar'
        item['image'] = sel.xpath(
            ".//img/@ph-data-picture-url")[0].extract()
        item['itemIndex'] = self.itemCount
        # BUG FIX: was `itemCount = self.itemCount + 1` -- assigned a dead
        # local, so the shared counter never advanced.
        self.itemCount = self.itemCount + 1
        yield scrapy.Request(url,
                             callback=self.parse_article,
                             meta={'item': item})
def parse(self, response):
    """Collect talksport article teasers and queue each for parse_article."""
    teaser_nodes = response.xpath(
        ".//div[contains(@class,'node node-article node-teaser clearfix')]")
    for teaser in teaser_nodes:
        href = teaser.xpath(".//a/@href")[0].extract()
        full_url = response.urljoin(href)
        item = ArticleItem()
        item['type'] = "article"
        item['url'] = full_url
        item['title'] = teaser.xpath(".//h2/a/text()")[0].extract()
        item['summary'] = teaser.xpath(
            ".//div[contains(@class,'field field-name-field-intro field-type-text-long field-label-hidden')]/div/div/text()"
        )[0].extract()
        item['src'] = 'talksport'
        item['lang'] = 'en'
        item['itemIndex'] = self.itemCount
        self.itemCount += 1
        yield scrapy.Request(full_url,
                             callback=self.parse_article,
                             meta={'item': item})
def parse(self, response):
    """Pull the ESPN FC news feed JSON and yield fully populated items.

    Unlike the other spiders, the article body is extracted here from the
    feed payload, so no follow-up request to parse_article is needed.
    """
    data = json.load(urllib2.urlopen("http://www.espnfc.com/api/feed?xhr=1&t=54&device=pc&limit=100&content=story&offset=0&key=espnfc-en-www-index-news-600"))
    articles = data['data']['features']
    for article in articles:
        item = ArticleItem()
        item['type'] = "article"
        item['postId'] = self.name + str(article['id'])
        item['title'] = article['headline']
        item['url'] = article['linkUrl']
        # Prefer the full-size image, fall back to the thumbnail, and skip
        # articles that carry neither.
        if 'images' in article and article['images']:
            item['image'] = article["images"][0]["URL"]
        elif 'thumbnail' in article:
            item['image'] = article["thumbnail"]["URL"]
        else:
            continue
        item['date'] = article['date']
        item['src'] = 'espnfc'
        item['lang'] = 'en'
        # IDIOM FIX: membership test directly on the dict, not `.keys()`.
        # Note: 'summary' stays unset when absent (original behavior).
        if 'summary' in article:
            item['summary'] = article["summary"]
        # IDIOM FIX: comprehension instead of manual append loop.
        item['tags'] = [tag['name'] for tag in article["contentCategory"]]
        # Body HTML -> plain text: join the text of every <p> node.
        tree = etree.HTML(article['body'])
        item['content'] = ' '.join(tree.xpath(".//p/text()"))
        item['itemIndex'] = self.itemCount
        self.itemCount = self.itemCount+1
        item['account_image'] = ' '
        yield item
def parse(self, response):
    """Scrape the 101 Great Goals blog listing and schedule each entry
    with parse_article.
    """
    for sel in response.xpath(
            ".//li[contains(@class,'cat-blog-container')]"):
        item = ArticleItem()
        item['type'] = "article"
        url = sel.xpath(
            ".//div[contains(@class,'cat-blog-inner')]/h3/a/@href"
        )[0].extract()
        # Title and summary both come from the same heading anchor text;
        # hoisted the duplicated xpath extract.
        heading_text = sel.xpath(
            ".//div[contains(@class,'cat-blog-inner')]/h3/a/text()"
        )[0].extract()
        item['summary'] = heading_text
        item['url'] = url
        item['title'] = heading_text
        item['src'] = 'greatgoals'
        item['lang'] = 'en'
        item['itemIndex'] = self.itemCount
        # BUG FIX: was `itemCount = self.itemCount + 1` -- assigned a dead
        # local, so the shared counter never advanced.
        self.itemCount = self.itemCount + 1
        yield scrapy.Request(url,
                             callback=self.parse_article,
                             meta={'item': item})
def parse(self, response):
    """Scrape the hihi2 post listing from the content loop and schedule
    each article with parse_article.
    """
    for sel in response.xpath(
            ".//div[contains(@id,'content-loop')]/div[starts-with(@id,'post-')]"
    ):
        item = ArticleItem()
        item['type'] = "article"
        url = sel.xpath(".//h2/a/@href")[0].extract()
        item['url'] = url
        item['title'] = sel.xpath(".//h2/a/text()")[0].extract()
        item['summary'] = sel.xpath(
            ".//div[contains(@class,'entry-excerpt')]/text()")[0].extract()
        item['src'] = 'hihi2'
        item['lang'] = 'ar'
        item['itemIndex'] = self.itemCount
        # BUG FIX: was `itemCount = self.itemCount + 1` -- assigned a dead
        # local, so the shared counter never advanced.
        self.itemCount = self.itemCount + 1
        # Dates are rendered as "[...]"; strip the brackets.
        item['date'] = sel.xpath(
            ".//span[contains(@class,'entry-date')]/text()")[0].extract(
            ).replace('[', '').replace(']', '')
        yield scrapy.Request(url,
                             callback=self.parse_article,
                             meta={'item': item})