def parse_item(self, response):
    """Build a NewsScrapItem from the page <h1>; yield it only when a title exists."""
    item = NewsScrapItem()
    title = response.xpath(u'//h1/text()').extract_first()
    item[u'news_title'] = title
    item["news_link"] = response.url
    # Pages without an <h1> produce no item at all.
    if title:
        yield item
def parse(self, response):
    """Return an item with the page title, URL and article-detail body markup."""
    item = NewsScrapItem()
    item[u'news_title'] = response.xpath(
        u'//h1[@class="title"]/text()').extract_first()
    item["news_link"] = response.url
    body_selector = (u'//section[@class="article-detail"]'
                     u'//div[contains(@class, "content-all")]')
    item["news_article"] = response.xpath(body_selector).extract_first()
    return item
def parse_data(self, response):
    """Extract title, link and article body from a ViewNews.aspx detail page.

    Returns the populated item for ViewNews.aspx URLs and None otherwise,
    exactly as before. Fix: the URL guard is hoisted to the top so no
    XPath extraction work is wasted on pages that will be discarded.
    """
    # Only detail pages (ViewNews.aspx) carry an article; bail out early.
    if 'ViewNews.aspx' not in response.url:
        return None
    item = NewsScrapItem()
    item[u'news_title'] = response.xpath(u'//h1/text()').extract_first()
    item["news_link"] = response.url
    item["news_article"] = ''.join(
        response.xpath(
            '//*[@id = "innity-in-post"]//table//td[@valign = "baseline"]/text()'
        ).extract()).strip()
    return item
def parse(self, response):
    """Yield an item built from the og:title meta tag and post-block paragraphs."""
    item = NewsScrapItem()
    item[u'news_title'] = response.xpath(
        u'//head/meta[@property="og:title"]/@content').extract_first()
    item["news_link"] = response.url
    paragraphs = response.xpath(u'//div[@class="text_post_block"]//p').extract()
    item["news_article"] = ''.join(paragraphs).strip()
    # Skip pages that have no Open Graph title.
    if item[u'news_title']:
        yield item
def parse_item(self, response):
    """Return an item for the page, but only when an article body was found."""
    item = NewsScrapItem()
    item[u'news_title'] = response.xpath(u'//h1/text()').extract_first()
    item["news_link"] = response.url
    body_parts = response.xpath('//div[@class="mw-parser-output"]//p').extract()
    item["news_article"] = ''.join(body_parts).strip()
    # No parser-output paragraphs -> empty string -> nothing returned.
    if item["news_article"]:
        return item
def parse_item(self, response):
    """Build an item from the Open Graph title; yield only when one is present."""
    item = NewsScrapItem()
    og_title = response.xpath(
        u'//head/meta[@property="og:title"]/@content').extract_first()
    item[u'news_title'] = og_title
    item["news_link"] = response.url
    if og_title:
        yield item
def parse(self, response):
    """Follow every non-image link on the page, then yield an item for this page.

    NOTE(review): `urlparse.urljoin` is the Python 2 spelling; on Python 3
    this relies on a compat import elsewhere in the file -- confirm.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif')
    for href in response.xpath('//a/@href').extract():
        if href.lower().endswith(image_suffixes):
            continue
        absolute = urlparse.urljoin(response.url, href)
        yield scrapy.Request(absolute, callback=self.parse)
    item = NewsScrapItem()
    item[u'news_title'] = response.xpath(
        u'//head/meta[@property="og:title"]/@content').extract_first()
    item["news_link"] = response.url
    if item['news_title']:
        yield item
def parse(self, response):
    """Yield an item for news_<id> pages, then recursively follow non-image links.

    Fixes: removed the unused ``img`` list, and made the regex a raw string
    (an unescaped backslash-d in a plain string literal is an invalid escape
    sequence that raises a DeprecationWarning on modern Python).
    """
    # NOTE(review): the trailing * also matches zero digits, so any URL
    # containing "news_" passes; kept as-is to preserve behavior, but a +
    # (one or more digits) may have been intended -- confirm.
    if re.search(r"news_\d*", response.url):
        item = NewsScrapItem()
        item[u'news_title'] = response.xpath(u'//h1/text()').extract_first()
        item["news_link"] = response.url
        item["news_article"] = ''.join(
            response.xpath(
                u'//div[@class="td-post-content"]').extract()).strip()
        yield item
    # Crawl onward through every link that is not an image.
    for link in response.xpath('//a/@href').extract():
        if not link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            link = urlparse.urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse)
def parse(self, response):
    """Return an item holding the page title, URL and first <article> paragraph.

    NOTE(review): only the first //article//p match is captured; if the full
    body is wanted, the matches should be joined instead -- confirm against
    the target site.
    """
    item = NewsScrapItem()
    first_paragraph = response.xpath("//article//p").extract_first()
    item[u'news_title'] = response.xpath(u'//h1/text()').extract_first()
    item["news_link"] = response.url
    item["news_article"] = first_paragraph
    return item
def parse_data(self, response):
    """Yield an item built from the og:title meta tag and postmain paragraphs."""
    item = NewsScrapItem()
    meta_title = response.xpath(
        u'//head/meta[@property="og:title"]/@content').extract_first()
    post_paragraphs = response.xpath(u'//div[@class="postmain"]//p').extract()
    item[u'news_title'] = meta_title
    item["news_link"] = response.url
    item["news_article"] = ''.join(post_paragraphs).strip()
    yield item