def get_url(self, news_row):
    """Build the article URL from the link in the second ``td`` of ``news_row``.

    The first ``@href`` matched by ``(./td)[2]/a/@href`` is stripped of
    surrounding whitespace.  A plain href is passed to ``utils.quote_url``
    unchanged.  An href containing ``javascript`` (case-insensitive) is
    treated as a script call: the first single-quoted argument wrapped in
    parentheses is pulled out and appended to ``self.root_url`` before
    quoting.

    NOTE(review): presumably ``news_row`` is a scrapy/parsel selector; if no
    href matches, ``.strip()`` is applied to whatever ``extract_first()``
    returns, and a javascript href without a ``('...')`` argument fails on
    the ``[0]`` lookup -- confirm how callers expect these cases to behave.
    """
    href = news_row.xpath('(./td)[2]/a/@href').extract_first().strip()

    # Ordinary links need no rewriting.
    if 'javascript' not in href.lower():
        return utils.quote_url(href)

    import re

    # Capture the text between ('  and  ') in the script call.
    call_args = re.findall(r"\('(.*?)'\)", href)
    # todo: every url yielded should be validated & quoted
    return utils.quote_url(self.root_url + call_args[0])
def get_url(self, news_row):
    """Return the quoted URL for the link that is a direct child of ``news_row``.

    The first ``@href`` matched by ``./a/@href`` is stripped of surrounding
    whitespace, appended to ``self.root_url`` by plain string concatenation,
    and the result is passed through ``utils.quote_url``.

    NOTE(review): if no href matches, ``.strip()`` is applied to whatever
    ``extract_first()`` returns -- confirm callers expect that failure.
    """
    href_matches = news_row.xpath('./a/@href')
    relative_link = href_matches.extract_first().strip()
    return utils.quote_url(self.root_url + relative_link)
def get_url(self, news_row):
    """Return the quoted URL for the title-cell link inside ``news_row``.

    Looks for a ``td`` anywhere under the row whose class contains
    ``views-field-field-productnews-display-title-value``, takes the first
    ``@href`` of its child anchor, strips surrounding whitespace, appends it
    to ``self.root_url`` and passes the result through ``utils.quote_url``.

    NOTE(review): if no href matches, ``.strip()`` is applied to whatever
    ``extract_first()`` returns -- confirm callers expect that failure.
    """
    title_link_xpath = (
        './/td[contains(@class, "views-field-field-productnews-display-title-value")]/a/@href'
    )
    relative_link = news_row.xpath(title_link_xpath).extract_first().strip()
    return utils.quote_url(self.root_url + relative_link)
def get_url(self, news_row):
    """Return the quoted URL for the ``h4`` heading link inside ``news_row``.

    The first ``@href`` matched by ``.//h4/a/@href`` is stripped of
    surrounding whitespace and passed through ``utils.quote_url`` as is; no
    root URL is prepended here.

    NOTE(review): if no href matches, ``.strip()`` is applied to whatever
    ``extract_first()`` returns -- confirm callers expect that failure.
    """
    heading_hrefs = news_row.xpath('.//h4/a/@href')
    link = heading_hrefs.extract_first().strip()
    return utils.quote_url(link)
def get_url(self, news_row):
    """Return the quoted URL for the link in the row's "title" ``div``.

    The first ``@href`` matched by ``./div[contains(@class, "title")]/a/@href``
    is stripped of surrounding whitespace, appended to ``self.root_url`` by
    plain string concatenation, and passed through ``utils.quote_url``.

    NOTE(review): if no href matches, ``.strip()`` is applied to whatever
    ``extract_first()`` returns -- confirm callers expect that failure.
    """
    title_hrefs = news_row.xpath('./div[contains(@class, "title")]/a/@href')
    relative_link = title_hrefs.extract_first().strip()
    return utils.quote_url(self.root_url + relative_link)
def get_url(self, news_row):
    """Return the quoted URL for a cell link in the first table of ``news_row``.

    The first ``@href`` matched by ``(.//table)[1]//td/a/@href`` is stripped
    of surrounding whitespace and passed through ``utils.quote_url`` as is;
    no root URL is prepended here.

    NOTE(review): if no href matches, ``.strip()`` is applied to whatever
    ``extract_first()`` returns -- confirm callers expect that failure.
    """
    cell_hrefs = news_row.xpath('(.//table)[1]//td/a/@href')
    link = cell_hrefs.extract_first().strip()
    # todo: asx hkex quote url
    return utils.quote_url(link)