def parse(self, response):
    """Parse a Washington Post search-results page into NewsItems.

    Walks result slots 1-10 and keeps only entries whose link URL
    contains "/politics/". Returns a list of NewsItem objects.
    """
    hxs = HtmlXPathSelector(response)
    items = []
    for i in range(1, 11):
        base = '//*[@id="search-results"]/li[' + str(i) + ']'
        # NOTE(review): this extract()[0] is unguarded; a results page with
        # fewer than 10 entries would raise IndexError here — confirm intended.
        url = hxs.select(base + '/h3/a/@href').extract()[0]
        if "/politics/" not in url:
            continue
        item = NewsItem()
        try:
            # Strip the run of whitespace the site embeds inside headlines.
            item['headline'] = hxs.select(
                base + '/h3/a/text()').extract()[0].replace(
                    "\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "")
        except IndexError:
            pass  # headline missing; leave field unset
        try:
            # Swap the thumbnail size token for the larger rendition.
            item['imgsrc'] = hxs.select(
                base + '/a/img/@src').extract()[0].replace("145x100", "606w")
            item['imgSize'] = "606x406"
        except IndexError:
            pass  # no image for this result
        # Plain assignment cannot raise IndexError; the original try was dead.
        item['url'] = url
        try:
            item['date'] = hxs.select(base + '/cite/span/text()').extract()[0]
        except IndexError:
            pass  # date missing
        item['src'] = u'The Washington Post'
        items.append(item)
    return items
def parse(self, response):
    """Build one NewsItem per headline text found in the table column."""
    hxs = HtmlXPathSelector(response)
    headlines = hxs.select(
        "/html/body/table/tbody/tr/td[3]/div/a/text()").extract()
    items = []
    for text in headlines:
        entry = NewsItem()
        entry['headline'] = text
        items.append(entry)
    return items
def parse1(self, response):
    """Parse a Times of India article page into a single NewsItem.

    Newer pages carry a "mod-article-header" block; older pages use the
    legacy layout handled by the else branch.
    """
    hxs = HtmlXPathSelector(response)
    item = NewsItem()
    if hxs.select('//*[@id="mod-article-header"]/h1/text()').extract():
        try:
            item['headline'] = hxs.select(
                '//*[@id="mod-article-header"]/h1/text()').extract()[0]
        except IndexError:
            pass
        # Plain assignment cannot raise IndexError; dead try removed.
        item['url'] = response.url
        try:
            # BUG FIX: the original divided the XPath string by an undefined
            # text() call ("'...span[3]' / text()"); the "/text()" step
            # belongs inside the XPath string itself.
            curdate = hxs.select(
                '//*[@id="mod-article-byline"]/span[3]/text()').extract()[0]
            # Strip any residual HTML tags from the date string.
            item['date'] = re.sub('<[^<]+?>', '', curdate)
        except IndexError:
            pass
        item['src'] = u'Times of India'
    else:
        try:
            item['headline'] = hxs.select(
                '//*[@id="netspidersosh"]/div[1]/div/div[11]/div[1]/span[1]/h1/text()'
            ).extract()[0]
        except IndexError:
            pass
        item['url'] = response.url
        try:
            item['imgsrc'] = hxs.select(
                '//*[@id="bellyad"]/div/div[1]/img/@src').extract()[0]
            item['imgSize'] = "300x209"
        except IndexError:
            pass
        try:
            item['date'] = hxs.select(
                '/html/head/meta[1]/@content').extract()[0]
        except IndexError:
            pass
        item['src'] = u'Times of India'
    return item
def parse(self, response):
    """Parse an ABC wire-story listing page into a list of NewsItems."""
    hxs = HtmlXPathSelector(response)
    items = []
    for article in hxs.select("//div[@class='result WireStory']"):
        item = NewsItem()
        item['headline'] = article.select(
            "./a[@class='title']/text()").extract()[0]
        item['url'] = article.select("./a[@class='title']/@href").extract()[0]
        item['date'] = article.select(
            ".//span[@class='date']/text()").extract()[0]
        item['src'] = u'abc'
        try:
            # Drop the "_mw" suffix to request the full-size image.
            item['imgsrc'] = article.select(
                ".//img/@src").extract()[0].replace("_mw", "")
            item['imgSize'] = "512x329"
        except IndexError:
            pass  # story has no image
        items.append(item)
    return items
def parse1(self, response):
    """Parse a New Republic article page into a single NewsItem."""
    hxs = HtmlXPathSelector(response)
    item = NewsItem()
    item['headline'] = hxs.select(
        '//*[@id="fixed-header-progress-bar-container"]/span[1]/strong/text()'
    ).extract()[0]
    item['url'] = response.url
    item['date'] = hxs.select(
        '//*[@id="title-progress-bar-container"]/span[2]/text()').extract()[0]
    item['src'] = u"New Republic"
    try:
        # Newer articles embed the image path relative to the site root.
        item['imgsrc'] = u'www.newrepublic.com' + hxs.select(
            '//*[@class="legacy-image pull-right"]/img/@src').extract()[0]
        item['imgSize'] = u"242x242"
    except IndexError:
        try:
            # Older articles expose the image via a meta tag instead.
            item['imgsrc'] = hxs.select(
                '/html/head/meta[12]/@content').extract()[0]
            item['imgSize'] = u"1250x517"
        except IndexError:
            pass  # article has no image at all
    return item
def parse1(self, response):
    """Parse a Slate article page into a single NewsItem."""
    hxs = HtmlXPathSelector(response)
    item = NewsItem()
    item['headline'] = hxs.select('//*[@id="ogtitle"]/@content').extract()[0]
    item['url'] = response.url
    item['date'] = hxs.select(
        '//*[@id="article_header"]/div[1]/div[4]/text()').extract()[0]
    try:
        item['imgsrc'] = hxs.select(
            '/html/body/div[2]/article/section/div[2]/div[1]/figure/img/@src'
        ).extract()[0]
        item['imgSize'] = '590x421'
    except IndexError:
        pass  # article has no figure image
    item['src'] = "Slate"
    return item
def parse(self, response):
    """Parse a Wall Street Journal listing page into a list of NewsItems."""
    # NOTE(review): removed a dangling, unterminated '"""hxs = ...' fragment
    # of commented-out code that trailed the return statement.
    hxs = HtmlXPathSelector(response)
    items = []
    rows = hxs.select('/html/body/div[1]/div[2]/div/div[1]/div[3]/ul/li')
    for row in rows:
        item = NewsItem()
        item['url'] = row.select(".//a/@href").extract()[0]
        item['headline'] = row.select(".//h2/a/text()").extract()[0]
        item['date'] = row.select("./div/ul/li/text()").extract()[0]
        try:
            item['imgsrc'] = row.select(
                "./div/div[@class='newsImage']/a/img/@src").extract()[0]
            item['imgSize'] = "640x360"
        except IndexError:
            pass  # listing entry has no image
        item['src'] = "Wall Street Journal"
        items.append(item)
    return items
def parse(self, response):
    """Parse the Time front page into up to 15 NewsItems.

    The "&*" marker in *selector* is a placeholder replaced with the
    field-specific XPath suffix for each lookup.
    """
    hxs = HtmlXPathSelector(response)
    items = []
    # Loop-invariant: hoisted out of the loop (the original rebuilt it
    # identically on every iteration).
    selector = '//div[@class="tout"]&*'
    for i in range(15):
        item = NewsItem()
        try:
            item['headline'] = hxs.select(
                "//div[@class='tout']/h3/a/text()").extract()[i]
        except IndexError:
            pass
        try:
            # Drop the width query-string to fetch the full-size image.
            item['imgsrc'] = hxs.select(
                selector.replace('&*', '/div[@class="img"]/a/img/@src')
            ).extract()[i].replace("?w=360", "")
            item['imgSize'] = "360x240"
        except IndexError:
            pass
        try:
            item['url'] = hxs.select(
                selector.replace('&*', '//h3/a/@href')).extract()[i]
        except IndexError:
            pass
        try:
            item['date'] = hxs.select(
                selector.replace('&*', '//span[@class = "date"]/text()')
            ).extract()[i]
        except IndexError:
            pass
        item['src'] = u'Time'
        items.append(item)
    return items
def parse(self, response):
    """Parse the CBS listing into up to 30 NewsItems, one per li[@section]."""
    hxs = HtmlXPathSelector(response)
    items = []
    for i in range(30):
        item = NewsItem()
        # Per-entry selector; "&*" marks where the field-specific XPath
        # suffix is substituted.
        selector = '//li[@section=##]&*'.replace('##', str(i + 1))
        try:
            item['headline'] = hxs.select(
                selector.replace('&*', '//a[@class="title"]/text()')
            ).extract()[0]
        except IndexError:
            pass
        try:
            # Request the larger rendition of the story image.
            item['imgsrc'] = hxs.select(
                selector.replace('&*', '//img[contains(@class, "storyImg")]/@src')
            ).extract()[0].replace("_75x56", "_640x480").replace("s.jpg", ".jpg")
            item['imgSize'] = "640x480"
        except IndexError:
            pass
        try:
            item['url'] = hxs.select(
                selector.replace('&*', '//a[@class = "title"]/@href')
            ).extract()[0]
        except IndexError:
            pass
        try:
            item['date'] = hxs.select(
                selector.replace('&*', '//span[@class = "date"]/text()')
            ).extract()[0]
        except IndexError:
            pass
        item['src'] = u'CBS'
        items.append(item)
    return items
def parse(self, response):
    """Parse the NBC News story listing into a list of NewsItems."""
    hxs = HtmlXPathSelector(response)
    items = []
    for entry in hxs.select('//*[@id="vine-t"]/div[2]/div/div[2]/ul/li'):
        item = NewsItem()
        try:
            item['headline'] = entry.select(
                './article/header/h2/a/text()').extract()[0]
        except IndexError:
            pass
        try:
            item['imgsrc'] = entry.select('.//img/@src').extract()[0]
            item['imgSize'] = "600x400"
        except IndexError:
            pass
        try:
            item['url'] = entry.select(
                './article/header/h2/a/@href').extract()[0]
        except IndexError:
            pass
        try:
            item['date'] = entry.select(
                './article/header/time/@datetime').extract()[0]
        except IndexError:
            pass
        item['src'] = u'NBC News'
        items.append(item)
    return items
def parse(self, response):
    """Parse the Breitbart blog-roll into a list of NewsItems.

    Stories rendered with a "clearfix" wrapper carry an image and use a
    grid layout; the plain layout is handled by the else branch. The date,
    source, and append steps are shared by both layouts.
    """
    hxs = HtmlXPathSelector(response)
    items = []
    articles = hxs.select(
        "//section[@class='list list-blog-roll']/article[@class='story']")
    for article in articles:
        item = NewsItem()
        # 'scope' mirrors the original's rebinding of the loop variable to
        # the clearfix div when present, so the shared date lookup below
        # searches the same subtree the original did.
        scope = article
        if article.select(".//div[@class='clearfix']").extract():
            scope = article.select(".//div[@class='clearfix']")
            try:
                item['headline'] = scope.select(
                    ".//div[@class='grid_9 omega']/h1/a/text()").extract()[0]
            except IndexError:
                pass
            try:
                # Drop the width query-string to get the full-size image.
                item['imgsrc'] = scope.select(
                    './/img/@src').extract()[0].replace("?w=145", "")
                item['imgSize'] = "485x342"
            except IndexError:
                pass
            try:
                # NOTE(review): URL lacks a scheme ("http://") — confirm
                # downstream consumers expect the bare host prefix.
                item['url'] = "www.breitbart.com" + scope.select(
                    ".//div[@class='grid_3 alpha']/a/@href").extract()[0]
            except IndexError:
                pass
        else:
            try:
                item['headline'] = scope.select(
                    ".//h1/a/text()").extract()[0]
            except IndexError:
                pass
            try:
                item['url'] = "www.breitbart.com" + scope.select(
                    ".//h1/a/@href").extract()[0]
            except IndexError:
                pass
        try:
            item['date'] = scope.select(
                './/span[@class = "story-time"]/text()').extract()[0]
        except IndexError:
            pass
        item['src'] = u'Breitbart'
        items.append(item)
    return items