def testBasicTitle(self):
    """Title criterion: 'a[href]' extracts the attribute value as the
    title, while plain 'a' extracts the element text.
    """
    url = 'http://www.google.com/'
    content = self._loadTestData('basic.htm')
    parser = HtmlContentParser()
    selector = 'div'
    conditions = {
        'criterion': {
            'title': 'a[href]',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertIsNone(items[0].get('url'))
    self.assertEqual(items[0].get('title'), '/?q=1')
    conditions = {
        'criterion': {
            'title': 'a',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    self.assertEqual(len(items), 1)
    self.assertIsNone(items[0].get('url'))
    self.assertEqual(items[0].get('title'), 'link1')
def testBasicImageLink(self):
    """Imagelink criterion: 'img[src]' yields only imgurl; plain 'img'
    also picks up the enclosing link, title and image dimensions.
    """
    url = 'http://www.google.com/'
    content = self._loadTestData('basic.htm')
    parser = HtmlContentParser()
    selector = 'div'
    conditions = {
        'criterion': {
            'imagelink': 'img[src]',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertIsNone(items[0].get('url'))
    self.assertEqual(items[0].get('imgurl'), 'http://www.google.com/1.gif')
    self.assertIsNone(items[0].get('imgwidth'))
    self.assertIsNone(items[0].get('imgheight'))
    conditions = {
        'criterion': {
            'imagelink': 'img',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    self.assertEqual(len(items), 1)
    self.assertEqual(items[0].get('url'), 'http://www.google.com/?q=2')
    self.assertEqual(items[0].get('title'), 'a title q2')
    self.assertEqual(items[0].get('imgurl'), 'http://www.google.com/1.gif')
    self.assertEqual(items[0].get('imgwidth'), '10')
    self.assertEqual(items[0].get('imgheight'), '10')
def testBigPicture(self):
    """Parse the boston.com Big Picture page with link/image/content
    criteria against saved fixture data.
    """
    url = 'http://www.boston.com/bigpicture/'
    content = self._loadTestData('bigpicture.htm')
    parser = HtmlContentParser()
    selector = 'div.headDiv2:first'
    conditions = {
        'criterion': {
            'link': 'h2 a',
            'image': 'div.bpImageTop img',
            'content': 'div.bpCaption',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(
        items[0]['url'],
        u'http://www.boston.com/bigpicture/2012/09/mali.html')
    self.assertEqual(
        items[0]['imgurl'],
        'http://inapcache.boston.com/universal/site_graphics/blogs/bigpicture/mali_092112/bp1.jpg'
    )
    self.assertTrue(items[0]['content'].startswith('People walk'))
    self.assertTrue(items[0]['content'].endswith('(Joe Penney/Reuters)'))
def testTianya(self):
    """A bare 'h1 a' selector with no conditions extracts the link url.

    NOTE(review): a method with this exact name appears twice in this
    file; if both live in the same class, only the later one runs.
    """
    url = 'http://focus.tianya.cn/'
    content = self._loadTestData('tianya.htm')
    parser = HtmlContentParser()
    selector = 'h1 a'
    items = parser.parse(url, content, selector, None)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(
        items[0]['url'],
        u'http://www.tianya.cn/publicforum/content/develop/1/1079839.shtml')
def testBasicDefaultAutoLink(self):
    """With no conditions, the parser auto-detects the link inside the
    selected container.
    """
    url = 'http://www.google.com/'
    content = self._loadTestData('complex.htm')
    parser = HtmlContentParser()
    selector = 'div#div2'
    conditions = None
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(items[0].get('url'), 'http://www.google.com/?q=2')
    self.assertEqual(items[0].get('title'), 'div2 link a')
def testBasicAll(self):
    """'enough': {'all': True} makes parse return all 5 matched links."""
    url = 'http://www.google.com/'
    content = self._loadTestData('basic.htm')
    parser = HtmlContentParser()
    selector = 'div a'
    conditions = {
        'enough': {'all': True},
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 5)
def testBasicDefaultImg(self):
    """With no conditions, an 'img' selection yields url, title (from
    alt) and imgurl.
    """
    url = 'http://www.google.com/'
    content = self._loadTestData('complex.htm')
    parser = HtmlContentParser()
    selector = 'div#div1 img'
    conditions = None
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(items[0].get('url'), 'http://www.google.com/?q=1')
    self.assertEqual(items[0].get('title'), 'img-alt')
    self.assertEqual(items[0].get('imgurl'), 'http://www.google.com/1.jpg')
def testTianya(self):
    """A bare 'h1 a' selector with no conditions extracts the link url.

    NOTE(review): a method with this exact name appears twice in this
    file; if both live in the same class, only the later one runs.
    """
    url = 'http://focus.tianya.cn/'
    content = self._loadTestData('tianya.htm')
    parser = HtmlContentParser()
    selector = 'h1 a'
    items = parser.parse(url, content, selector, None)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(
        items[0]['url'],
        u'http://www.tianya.cn/publicforum/content/develop/1/1079839.shtml')
def testGovCn(self):
    """An empty criterion dict still extracts url and title from the
    selected anchor.
    """
    url = 'http://www.gov.cn/'
    content = self._loadTestData('govcn.htm')
    selector = 'a.hei14:first'
    parser = HtmlContentParser()
    conditions = {'criterion': {}}
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(
        items[0]['url'],
        u'http://www.gov.cn/ldhd/2012-10/01/content_2236899.htm')
    self.assertTrue(items[0]['title'].startswith(u'胡锦涛'))
def testBasicCriterionNoMatch(self):
    """A criterion selector matching nothing yields an empty item list."""
    url = 'http://www.google.com/'
    content = self._loadTestData('basic.htm')
    parser = HtmlContentParser()
    selector = 'div a'
    conditions = {
        'criterion': {
            'url': 'a2[href]',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 0)
def testBasicAll(self):
    """'enough': {'all': True} makes parse return all 5 matched links.

    NOTE(review): a method with this exact name appears twice in this
    file; if both live in the same class, only the later one runs.
    """
    url = 'http://www.google.com/'
    content = self._loadTestData('basic.htm')
    parser = HtmlContentParser()
    selector = 'div a'
    conditions = {
        'enough': {
            'all': True,
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 5)
def testBasicParent(self):
    """'parent[href]' reads the attribute from the matched element's
    parent node.
    """
    url = 'http://www.google.com/'
    content = self._loadTestData('basic.htm')
    parser = HtmlContentParser()
    selector = 'div a img'
    conditions = {
        'criterion': {
            'url': 'parent[href]',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(items[0].get('url'), 'http://www.google.com/?q=2')
def testGovCn(self):
    """An empty criterion dict still extracts url and title from the
    selected anchor.

    NOTE(review): a method with this exact name appears twice in this
    file; if both live in the same class, only the later one runs.
    """
    url = 'http://www.gov.cn/'
    content = self._loadTestData('govcn.htm')
    selector = 'a.hei14:first'
    parser = HtmlContentParser()
    conditions = {
        'criterion': {},
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(
        items[0]['url'],
        u'http://www.gov.cn/ldhd/2012-10/01/content_2236899.htm')
    self.assertTrue(items[0]['title'].startswith(u'胡锦涛'))
def testBasicSelf(self):
    """'self[src]' reads the attribute from the matched element itself."""
    url = 'http://www.google.com/'
    content = self._loadTestData('basic.htm')
    parser = HtmlContentParser()
    selector = 'div a img'
    conditions = {
        'criterion': {
            'image': 'self[src]',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(items[0].get('imgurl'), 'http://www.google.com/1.gif')
def testBasicExclude(self):
    """'exclude.length' filters out items whose title is too short; only
    the 14-char link survives.
    """
    url = 'http://www.google.com/'
    content = self._loadTestData('basic.htm')
    parser = HtmlContentParser()
    selector = 'div a'
    conditions = {
        'exclude': {
            'length': 10,
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(items[0].get('url'), 'http://www.google.com/?q=4')
    self.assertEqual(items[0].get('title'), 'link1234567890')
def testBasicInclude(self):
    """'include.selector' keeps only matches that contain an 'img'."""
    url = 'http://www.google.com/'
    content = self._loadTestData('basic.htm')
    parser = HtmlContentParser()
    selector = 'div a'
    conditions = {
        'include': {
            'selector': 'img',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(items[0].get('url'), 'http://www.google.com/?q=2')
    self.assertEqual(items[0].get('title'), 'a title q2')
def testXinhuanet(self):
    """'self' link criterion plus title pulled from the img alt text.

    NOTE(review): a method with this exact name appears twice in this
    file; if both live in the same class, only the later one runs.
    """
    url = 'http://www.xinhuanet.com/'
    content = self._loadTestData('xinhuanet.htm')
    selector = '#pictt a'
    parser = HtmlContentParser()
    conditions = {
        'criterion': {
            'link': 'self',
            'title': 'img[alt]',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(
        items[0]['url'],
        u'http://news.xinhuanet.com/politics/2012-09/22/c_113173016.htm')
    self.assertTrue(items[0]['title'].startswith(u'网络反腐'))
def testXinhuanet(self):
    """'self' link criterion plus title pulled from the img alt text."""
    url = 'http://www.xinhuanet.com/'
    content = self._loadTestData('xinhuanet.htm')
    selector = '#pictt a'
    parser = HtmlContentParser()
    conditions = {
        'criterion': {
            'link': 'self',
            'title': 'img[alt]',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(
        items[0]['url'],
        u'http://news.xinhuanet.com/politics/2012-09/22/c_113173016.htm')
    self.assertTrue(items[0]['title'].startswith(u'网络反腐'))
def testQq(self):
    """Separate link and image criteria inside one container selection.

    NOTE(review): a method with this exact name appears twice in this
    file; if both live in the same class, only the later one runs.
    """
    url = 'http://view.news.qq.com/'
    content = self._loadTestData('qq.htm')
    parser = HtmlContentParser()
    selector = '.left1'
    conditions = {
        'criterion': {
            'link': '.left1pic a',
            'image': '.left1img img',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(
        items[0]['url'],
        u'http://view.news.qq.com/zt2012/bjd/index.htm')
    self.assertEqual(
        items[0]['imgurl'],
        u'http://img1.gtimg.com/view/pics/hv1/112/69/1152/74926507.jpg')
    self.assertIsNotNone(items[0]['imgwidth'])
    self.assertIsNotNone(items[0]['imgheight'])
def testBigPicture(self):
    """Parse the boston.com Big Picture page with link/image/content
    criteria against saved fixture data.

    NOTE(review): a method with this exact name appears twice in this
    file; if both live in the same class, only the later one runs.
    """
    url = 'http://www.boston.com/bigpicture/'
    content = self._loadTestData('bigpicture.htm')
    parser = HtmlContentParser()
    selector = 'div.headDiv2:first'
    conditions = {
        'criterion': {
            'link': 'h2 a',
            'image': 'div.bpImageTop img',
            'content': 'div.bpCaption',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(
        items[0]['url'],
        u'http://www.boston.com/bigpicture/2012/09/mali.html')
    self.assertEqual(
        items[0]['imgurl'],
        'http://inapcache.boston.com/universal/site_graphics/blogs/bigpicture/mali_092112/bp1.jpg')
    self.assertTrue(items[0]['content'].startswith('People walk'))
    self.assertTrue(items[0]['content'].endswith('(Joe Penney/Reuters)'))
def testQq(self):
    """Separate link and image criteria inside one container selection."""
    url = 'http://view.news.qq.com/'
    content = self._loadTestData('qq.htm')
    parser = HtmlContentParser()
    selector = '.left1'
    conditions = {
        'criterion': {
            'link': '.left1pic a',
            'image': '.left1img img',
        },
    }
    items = parser.parse(url, content, selector, conditions)
    self.assertIsNotNone(items)
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(len(items), 1)
    self.assertEqual(
        items[0]['url'],
        u'http://view.news.qq.com/zt2012/bjd/index.htm')
    self.assertEqual(
        items[0]['imgurl'],
        u'http://img1.gtimg.com/view/pics/hv1/112/69/1152/74926507.jpg')
    self.assertIsNotNone(items[0]['imgwidth'])
    self.assertIsNotNone(items[0]['imgheight'])
def post(self):
    """Fetch one monitored page, parse its items and push the result to
    the caller's callbackurl.

    Request body (JSON): callbackurl, triedcount, request (the monitor
    request: slug, fetchurl, selector, conditions, formatter, fetchhash),
    origin. Failed fetches are re-queued via the task queue until
    _FETCH_TRYCOUNT attempts are exhausted.
    """
    self.response.headers['Content-Type'] = 'text/plain'
    data = json.loads(self.request.body)
    callbackurl = data['callbackurl']
    triedcount = data.get('triedcount', 0)
    monitorRequest = data['request']
    feedback = {}
    urlUsed, content = _fetchContent(monitorRequest, triedcount, feedback)
    slug = monitorRequest['slug']
    fetchurl = monitorRequest['fetchurl']
    if not content:
        triedcount += 1
        leftcount = _FETCH_TRYCOUNT - triedcount
        # Fixed typos in the log message: "form" -> "from",
        # "lefted" -> "left".
        message = 'Failed to fetch content from %s for %s, left: %s.' % (
            fetchurl, slug, leftcount, )
        logging.error(message)
        self.response.out.write(message)
        if leftcount > 0:
            # Re-queue for another attempt.  When no retries are left we
            # fall through so the failure is reported to the callback.
            data['triedcount'] = triedcount
            taskqueue.add(queue_name="default", payload=json.dumps(data),
                          url='/fetch/single/')
            return
    items = None
    responseData = None
    if content:
        content = lxmlutil.removeEncodingDeclaration(content)
        selector = monitorRequest['selector']
        conditions = monitorRequest.get('conditions', {})
        formatter = monitorRequest.get('formatter')
        parser = HtmlContentParser()
        items = parser.parse(urlUsed, content, selector, conditions,
                             formatter)
        if items and conditions and conditions.get('detectdetail'):
            detaildetector.populateDetailUrls(items)
    sourceSlug = data['origin']['common']['slug']
    if items:
        sourceDeprecated = models.isSourceDeprecated(sourceSlug)
        if sourceDeprecated:
            models.removeDeprecatedSource(sourceSlug)
        message = 'Items got for %s.' % (slug, )
        logging.info(message)
        self.response.out.write(message)
        oldhash = monitorRequest['fetchhash']
        fetchhash = _calculateHash(items)
        # Only push results when the content changed, or when the source
        # had been marked deprecated and needs re-activation.
        if oldhash != fetchhash or sourceDeprecated:
            responseData = {
                'origin': data['origin'],
                'result': {
                    'items': items,
                    'fetchhash': fetchhash,
                },
            }
    else:
        models.addDeprecatedSource(sourceSlug)
        responseData = {
            'origin': data['origin'],
            'result': None,
        }
        if content:
            message = 'Failed to parse items from %s for %s by %s.' % (
                fetchurl, slug, selector)
        elif feedback.get('overflow'):
            message = 'Quote overflow.'
            responseData['overflow'] = True
        else:
            message = 'Failed to fetch content from %s for %s.' % (
                fetchurl, slug)
        logging.error(message)
        self.response.out.write(message)
    if responseData:
        success = networkutil.postData(callbackurl, responseData, tag=slug,
                                       trycount=_CALLBACK_TRYCOUNT,
                                       timeout=_URL_TIMEOUT)
        if success:
            message = 'Push items back for %s to %s.' % (slug, callbackurl)
        else:
            message = 'Failed to push items back for %s to %s.' % (
                slug, callbackurl)
        logging.info(message)
        self.response.out.write(message)
def post(self):
    """Build a news-source config from the form (or raw JSON), optionally
    fetch and parse the target page, and render the editor page.

    action == 'JSON' loads the whole config from the 'jsonstr' field;
    otherwise the config is assembled from individual form fields.
    """
    action = self.request.get('action')
    keyword = ''
    pageinfo = None
    if action == 'JSON':
        jsonstr = self.request.get('jsonstr')
        if jsonstr:
            newssource = json.loads(jsonstr)
        else:
            newssource = _DEFAULT_NEWSSOURCE
        encodingUsed = ''
        urlUsed = ''
        content = ''
        # Bug fix: oldContent was never initialized in this branch, so
        # rendering raised NameError when no fetch ran (no fetchurl).
        oldContent = ''
        httpheader = ''
        formatter = ''
    else:
        keyword = self.request.get('keyword').strip()
        pageinfo = self.request.get('pageinfo').strip()
        if pageinfo:
            pageinfo = json.loads(pageinfo)
        newssource = {}
        newssource['active'] = bool(self.request.get('active'))
        newssource['slug'] = self.request.get('slug')
        newssource['name'] = self.request.get('name')
        newssource['order'] = self.request.get('order')
        newssource['charts'] = bool(self.request.get('charts'))
        newssource['fetchurl'] = self.request.get('fetchurl')
        if newssource['fetchurl'] and \
                not newssource['fetchurl'].startswith('http'):
            newssource['fetchurl'] = 'http://' + newssource['fetchurl']
        if not newssource['slug'] and newssource['fetchurl']:
            # Default the slug to the fetch url's host name.
            newssource['slug'] = urlparse.urlparse(
                newssource['fetchurl']).netloc
        httpheader = self.request.get('httpheader')
        if httpheader:
            newssource['header'] = json.loads(httpheader)
        newssource['encoding'] = self.request.get('encoding')
        newssource['tags'] = self.request.get('tags')
        # following fields only for showing parsed result.
        encodingUsed = self.request.get('encodingUsed')
        urlUsed = self.request.get('urlUsed')
        oldContent = self.request.get('oldContent')
        newssource['selector'] = self.request.get('selector').strip()
        conditions = {}
        conditions['returnall'] = bool(self.request.get('returnall'))
        conditions['emptytitle'] = bool(self.request.get('emptytitle'))
        conditions['detectdetail'] = bool(self.request.get('detectdetail'))
        conditions['scripttext'] = bool(self.request.get('scripttext'))
        excludeselector = self.request.get('excludeselector').strip()
        if excludeselector:
            if 'exclude' not in conditions:
                conditions['exclude'] = {}
            conditions['exclude']['selector'] = excludeselector
        includeselector = self.request.get('includeselector').strip()
        if includeselector:
            if 'include' not in conditions:
                conditions['include'] = {}
            conditions['include']['selector'] = includeselector
        urlselector = self.request.get('urlselector').strip()
        titleselector = self.request.get('titleselector').strip()
        imageselector = self.request.get('imageselector').strip()
        contentselector = self.request.get('contentselector').strip()
        linkselector = self.request.get('linkselector').strip()
        imagelinkselector = self.request.get('imagelinkselector').strip()
        if urlselector or titleselector or contentselector or \
                imageselector or linkselector or imagelinkselector:
            conditions['criterion'] = {}
            if urlselector:
                conditions['criterion']['url'] = urlselector
            if titleselector:
                conditions['criterion']['title'] = titleselector
            if contentselector:
                conditions['criterion']['content'] = contentselector
            if imageselector:
                conditions['criterion']['image'] = imageselector
            if linkselector:
                conditions['criterion']['link'] = linkselector
            if imagelinkselector:
                conditions['criterion']['imagelink'] = imagelinkselector
        newssource['conditions'] = conditions
        formatter = self.request.get('formatter')
        if formatter:
            newssource['formatter'] = json.loads(formatter)
        newssource['description'] = self.request.get('description').strip()
        content = self.request.get('content')
    jsonstr = jsonutil.getReadableString(newssource)
    if 'active' not in newssource:
        newssource['active'] = True
    items = []
    links = []
    selector = newssource.get('selector')
    fetchurl = newssource.get('fetchurl')
    tried = 2  # the max try count is 3
    if not content and fetchurl:
        fetcher = ContentFetcher(fetchurl, header=newssource.get('header'),
                                 encoding=newssource.get('encoding'),
                                 tried=tried)
        fetchResult = fetcher.fetch()
        content = fetchResult.get('content')
        oldContent = fetchResult.get('content.old')
        urlUsed = fetchResult.get('url')
        encodingUsed = '%s-%s' % (fetchResult.get('encoding'),
                                  fetchResult.get('encoding.src'))
    if content:
        content = lxmlutil.removeEncodingDeclaration(content)
        if selector:
            parser = HtmlContentParser()
            items = parser.parse(urlUsed, content, selector,
                                 newssource.get('conditions'),
                                 newssource.get('formatter'))
        else:
            # No selector: fall back to keyword-based link detection.
            links = linkdetector.detect(content, keyword)
    if items and newssource.get('conditions', {}).get('detectdetail'):
        detaildetector.populateDetailUrls(items)
    if newssource.get('header'):
        httpheader = jsonutil.getReadableString(newssource['header'])
    if newssource.get('formatter'):
        formatter = jsonutil.getReadableString(newssource['formatter'])
    if not pageinfo and fetchurl:
        pageinfo = pmapi.getPage(fetchurl)
    templateValues = {
        'newssource': newssource,
        'httpheader': httpheader,
        'formatter': formatter,
        'content': content,
        'oldContent': oldContent,
        'encodingUsed': encodingUsed,
        'urlUsed': urlUsed,
        'keyword': keyword,
        'links': links,
        'items': items,
        'jsonstr': jsonstr,
        'pageinfo': pageinfo,
        'strpageinfo': json.dumps(pageinfo),
    }
    self._render(templateValues)
def post(self):
    """Fetch one monitored page, parse its items and push the result to
    the caller's callbackurl.

    Request body (JSON): callbackurl, triedcount, request (the monitor
    request: slug, fetchurl, selector, conditions, formatter, fetchhash),
    origin. Failed fetches are re-queued via the task queue until
    _FETCH_TRYCOUNT attempts are exhausted.
    """
    self.response.headers['Content-Type'] = 'text/plain'
    data = json.loads(self.request.body)
    callbackurl = data['callbackurl']
    triedcount = data.get('triedcount', 0)
    monitorRequest = data['request']
    feedback = {}
    urlUsed, content = _fetchContent(monitorRequest, triedcount, feedback)
    slug = monitorRequest['slug']
    fetchurl = monitorRequest['fetchurl']
    if not content:
        triedcount += 1
        leftcount = _FETCH_TRYCOUNT - triedcount
        # Fixed typos in the log message: "form" -> "from",
        # "lefted" -> "left".
        message = 'Failed to fetch content from %s for %s, left: %s.' % (
            fetchurl, slug, leftcount, )
        logging.error(message)
        self.response.out.write(message)
        if leftcount > 0:
            # Re-queue for another attempt.  When no retries are left we
            # fall through so the failure is reported to the callback.
            data['triedcount'] = triedcount
            taskqueue.add(queue_name="default", payload=json.dumps(data),
                          url='/fetch/single/')
            return
    items = None
    responseData = None
    if content:
        content = lxmlutil.removeEncodingDeclaration(content)
        selector = monitorRequest['selector']
        conditions = monitorRequest.get('conditions', {})
        formatter = monitorRequest.get('formatter')
        parser = HtmlContentParser()
        items = parser.parse(urlUsed, content, selector, conditions,
                             formatter)
        if items and conditions and conditions.get('detectdetail'):
            detaildetector.populateDetailUrls(items)
    sourceSlug = data['origin']['common']['slug']
    if items:
        sourceDeprecated = models.isSourceDeprecated(sourceSlug)
        if sourceDeprecated:
            models.removeDeprecatedSource(sourceSlug)
        message = 'Items got for %s.' % (slug, )
        logging.info(message)
        self.response.out.write(message)
        oldhash = monitorRequest['fetchhash']
        fetchhash = _calculateHash(items)
        # Only push results when the content changed, or when the source
        # had been marked deprecated and needs re-activation.
        if oldhash != fetchhash or sourceDeprecated:
            responseData = {
                'origin': data['origin'],
                'result': {
                    'items': items,
                    'fetchhash': fetchhash,
                },
            }
    else:
        models.addDeprecatedSource(sourceSlug)
        responseData = {
            'origin': data['origin'],
            'result': None,
        }
        if content:
            message = 'Failed to parse items from %s for %s by %s.' % (
                fetchurl, slug, selector)
        elif feedback.get('overflow'):
            message = 'Quote overflow.'
            responseData['overflow'] = True
        else:
            message = 'Failed to fetch content from %s for %s.' % (
                fetchurl, slug)
        logging.error(message)
        self.response.out.write(message)
    if responseData:
        success = networkutil.postData(callbackurl, responseData, tag=slug,
                                       trycount=_CALLBACK_TRYCOUNT,
                                       timeout=_URL_TIMEOUT)
        if success:
            message = 'Push items back for %s to %s.' % (slug, callbackurl)
        else:
            message = 'Failed to push items back for %s to %s.' % (
                slug, callbackurl)
        logging.info(message)
        self.response.out.write(message)
def post(self):
    """Build a news-source config from the form (or raw JSON), optionally
    fetch and parse the target page, and render the editor page.

    action == 'JSON' loads the whole config from the 'jsonstr' field;
    otherwise the config is assembled from individual form fields.
    """
    action = self.request.get('action')
    keyword = ''
    pageinfo = None
    if action == 'JSON':
        jsonstr = self.request.get('jsonstr')
        if jsonstr:
            newssource = json.loads(jsonstr)
        else:
            newssource = _DEFAULT_NEWSSOURCE
        encodingUsed = ''
        urlUsed = ''
        content = ''
        # Bug fix: oldContent was never initialized in this branch, so
        # rendering raised NameError when no fetch ran (no fetchurl).
        oldContent = ''
        httpheader = ''
        formatter = ''
    else:
        keyword = self.request.get('keyword').strip()
        pageinfo = self.request.get('pageinfo').strip()
        if pageinfo:
            pageinfo = json.loads(pageinfo)
        newssource = {}
        newssource['active'] = bool(self.request.get('active'))
        newssource['slug'] = self.request.get('slug')
        newssource['name'] = self.request.get('name')
        newssource['order'] = self.request.get('order')
        newssource['charts'] = bool(self.request.get('charts'))
        newssource['fetchurl'] = self.request.get('fetchurl')
        if newssource['fetchurl'] and \
                not newssource['fetchurl'].startswith('http'):
            newssource['fetchurl'] = 'http://' + newssource['fetchurl']
        if not newssource['slug'] and newssource['fetchurl']:
            # Default the slug to the fetch url's host name.
            newssource['slug'] = urlparse.urlparse(
                newssource['fetchurl']).netloc
        httpheader = self.request.get('httpheader')
        if httpheader:
            newssource['header'] = json.loads(httpheader)
        newssource['encoding'] = self.request.get('encoding')
        newssource['tags'] = self.request.get('tags')
        # following fields only for showing parsed result.
        encodingUsed = self.request.get('encodingUsed')
        urlUsed = self.request.get('urlUsed')
        oldContent = self.request.get('oldContent')
        newssource['selector'] = self.request.get('selector').strip()
        conditions = {}
        conditions['returnall'] = bool(self.request.get('returnall'))
        conditions['emptytitle'] = bool(self.request.get('emptytitle'))
        conditions['detectdetail'] = bool(self.request.get('detectdetail'))
        conditions['scripttext'] = bool(self.request.get('scripttext'))
        excludeselector = self.request.get('excludeselector').strip()
        if excludeselector:
            if 'exclude' not in conditions:
                conditions['exclude'] = {}
            conditions['exclude']['selector'] = excludeselector
        includeselector = self.request.get('includeselector').strip()
        if includeselector:
            if 'include' not in conditions:
                conditions['include'] = {}
            conditions['include']['selector'] = includeselector
        urlselector = self.request.get('urlselector').strip()
        titleselector = self.request.get('titleselector').strip()
        imageselector = self.request.get('imageselector').strip()
        contentselector = self.request.get('contentselector').strip()
        linkselector = self.request.get('linkselector').strip()
        imagelinkselector = self.request.get('imagelinkselector').strip()
        if urlselector or titleselector or contentselector or \
                imageselector or linkselector or imagelinkselector:
            conditions['criterion'] = {}
            if urlselector:
                conditions['criterion']['url'] = urlselector
            if titleselector:
                conditions['criterion']['title'] = titleselector
            if contentselector:
                conditions['criterion']['content'] = contentselector
            if imageselector:
                conditions['criterion']['image'] = imageselector
            if linkselector:
                conditions['criterion']['link'] = linkselector
            if imagelinkselector:
                conditions['criterion']['imagelink'] = imagelinkselector
        newssource['conditions'] = conditions
        formatter = self.request.get('formatter')
        if formatter:
            newssource['formatter'] = json.loads(formatter)
        newssource['description'] = self.request.get('description').strip()
        content = self.request.get('content')
    jsonstr = jsonutil.getReadableString(newssource)
    if 'active' not in newssource:
        newssource['active'] = True
    items = []
    links = []
    selector = newssource.get('selector')
    fetchurl = newssource.get('fetchurl')
    tried = 2  # the max try count is 3
    if not content and fetchurl:
        fetcher = ContentFetcher(fetchurl, header=newssource.get('header'),
                                 encoding=newssource.get('encoding'),
                                 tried=tried)
        fetchResult = fetcher.fetch()
        content = fetchResult.get('content')
        oldContent = fetchResult.get('content.old')
        urlUsed = fetchResult.get('url')
        encodingUsed = '%s-%s' % (fetchResult.get('encoding'),
                                  fetchResult.get('encoding.src'))
    if content:
        content = lxmlutil.removeEncodingDeclaration(content)
        if selector:
            parser = HtmlContentParser()
            items = parser.parse(urlUsed, content, selector,
                                 newssource.get('conditions'),
                                 newssource.get('formatter'))
        else:
            # No selector: fall back to keyword-based link detection.
            links = linkdetector.detect(content, keyword)
    if items and newssource.get('conditions', {}).get('detectdetail'):
        detaildetector.populateDetailUrls(items)
    if newssource.get('header'):
        httpheader = jsonutil.getReadableString(newssource['header'])
    if newssource.get('formatter'):
        formatter = jsonutil.getReadableString(newssource['formatter'])
    if not pageinfo and fetchurl:
        pageinfo = pmapi.getPage(fetchurl)
    templateValues = {
        'newssource': newssource,
        'httpheader': httpheader,
        'formatter': formatter,
        'content': content,
        'oldContent': oldContent,
        'encodingUsed': encodingUsed,
        'urlUsed': urlUsed,
        'keyword': keyword,
        'links': links,
        'items': items,
        'jsonstr': jsonstr,
        'pageinfo': pageinfo,
        'strpageinfo': json.dumps(pageinfo),
    }
    self._render(templateValues)