def post(self):
    """Fetch one monitored source, parse its items and push results back.

    The request body is a JSON document with:
      callbackurl: URL to POST the parse result back to.
      triedcount:  fetch attempts made so far (defaults to 0).
      request:     monitor request dict (slug, fetchurl, selector,
                   fetchhash, optional conditions/formatter).
      origin:      opaque payload echoed back to the callback; its
                   ['common']['slug'] identifies the source.

    On fetch failure the task re-enqueues itself until _FETCH_TRYCOUNT is
    exhausted; on parse failure or final fetch failure a result of None is
    pushed back and the source is marked deprecated.
    """
    self.response.headers['Content-Type'] = 'text/plain'
    data = json.loads(self.request.body)
    callbackurl = data['callbackurl']
    triedcount = data.get('triedcount', 0)
    monitorRequest = data['request']
    # _fetchContent reports side-channel info (e.g. 'overflow') through here.
    feedback = {}
    urlUsed, content = _fetchContent(monitorRequest, triedcount, feedback)
    slug = monitorRequest['slug']
    fetchurl = monitorRequest['fetchurl']
    if not content:
        triedcount += 1
        leftcount = _FETCH_TRYCOUNT - triedcount
        # Fixed message typos: "form" -> "from", "lefted" -> "left"
        # (now consistent with the fetch-failure message further below).
        message = 'Failed to fetch content from %s for %s, left: %s.' % (
            fetchurl, slug, leftcount,
        )
        logging.error(message)
        self.response.out.write(message)
        if leftcount > 0:
            # Retries remain: re-enqueue this task with the bumped count.
            data['triedcount'] = triedcount
            taskqueue.add(queue_name="default", payload=json.dumps(data),
                          url='/fetch/single/')
            return
        # No retries left: fall through so the failure is reported back.
    items = None
    responseData = None
    if content:
        content = lxmlutil.removeEncodingDeclaration(content)
        selector = monitorRequest['selector']
        conditions = monitorRequest.get('conditions', {})
        formatter = monitorRequest.get('formatter')
        parser = HtmlContentParser()
        items = parser.parse(urlUsed, content, selector, conditions,
                             formatter)
        if items and conditions and conditions.get('detectdetail'):
            detaildetector.populateDetailUrls(items)
    sourceSlug = data['origin']['common']['slug']
    if items:
        # A previously deprecated source that yields items again is revived.
        sourceDeprecated = models.isSourceDeprecated(sourceSlug)
        if sourceDeprecated:
            models.removeDeprecatedSource(sourceSlug)
        message = 'Items got for %s.' % (slug, )
        logging.info(message)
        self.response.out.write(message)
        oldhash = monitorRequest['fetchhash']
        fetchhash = _calculateHash(items)
        # Only push back when the content actually changed, or when the
        # source was just revived and the caller needs fresh state.
        if oldhash != fetchhash or sourceDeprecated:
            responseData = {
                'origin': data['origin'],
                'result': {
                    'items': items,
                    'fetchhash': fetchhash,
                },
            }
    else:
        # No items: mark the source deprecated and report a None result.
        models.addDeprecatedSource(sourceSlug)
        responseData = {
            'origin': data['origin'],
            'result': None,
        }
        if content:
            message = 'Failed to parse items from %s for %s by %s.' % (
                fetchurl, slug, selector)
        elif feedback.get('overflow'):
            # NOTE(review): "Quote" looks like a typo for "Quota" — confirm
            # against the consumer of this message before changing it.
            message = 'Quote overflow.'
            responseData['overflow'] = True
        else:
            message = 'Failed to fetch content from %s for %s.' % (
                fetchurl, slug)
        logging.error(message)
        self.response.out.write(message)
    if responseData:
        success = networkutil.postData(callbackurl, responseData, tag=slug,
                                       trycount=_CALLBACK_TRYCOUNT,
                                       timeout=_URL_TIMEOUT)
        if success:
            message = 'Push items back for %s to %s.' % (slug, callbackurl)
        else:
            message = 'Failed to push items back for %s to %s.' % (
                slug, callbackurl)
        logging.info(message)
        self.response.out.write(message)
def post(self):
    """Fetch one monitored source, parse its items and push results back.

    The request body is a JSON document with:
      callbackurl: URL to POST the parse result back to.
      triedcount:  fetch attempts made so far (defaults to 0).
      request:     monitor request dict (slug, fetchurl, selector,
                   fetchhash, optional conditions/formatter).
      origin:      opaque payload echoed back to the callback; its
                   ['common']['slug'] identifies the source.

    On fetch failure the task re-enqueues itself until _FETCH_TRYCOUNT is
    exhausted; on parse failure or final fetch failure a result of None is
    pushed back and the source is marked deprecated.
    """
    self.response.headers['Content-Type'] = 'text/plain'
    data = json.loads(self.request.body)
    callbackurl = data['callbackurl']
    triedcount = data.get('triedcount', 0)
    monitorRequest = data['request']
    # _fetchContent reports side-channel info (e.g. 'overflow') through here.
    feedback = {}
    urlUsed, content = _fetchContent(monitorRequest, triedcount, feedback)
    slug = monitorRequest['slug']
    fetchurl = monitorRequest['fetchurl']
    if not content:
        triedcount += 1
        leftcount = _FETCH_TRYCOUNT - triedcount
        # Fixed message typos: "form" -> "from", "lefted" -> "left"
        # (now consistent with the fetch-failure message further below).
        message = 'Failed to fetch content from %s for %s, left: %s.' % (
            fetchurl, slug, leftcount,
        )
        logging.error(message)
        self.response.out.write(message)
        if leftcount > 0:
            # Retries remain: re-enqueue this task with the bumped count.
            data['triedcount'] = triedcount
            taskqueue.add(queue_name="default", payload=json.dumps(data),
                          url='/fetch/single/')
            return
        # No retries left: fall through so the failure is reported back.
    items = None
    responseData = None
    if content:
        content = lxmlutil.removeEncodingDeclaration(content)
        selector = monitorRequest['selector']
        conditions = monitorRequest.get('conditions', {})
        formatter = monitorRequest.get('formatter')
        parser = HtmlContentParser()
        items = parser.parse(urlUsed, content, selector, conditions,
                             formatter)
        if items and conditions and conditions.get('detectdetail'):
            detaildetector.populateDetailUrls(items)
    sourceSlug = data['origin']['common']['slug']
    if items:
        # A previously deprecated source that yields items again is revived.
        sourceDeprecated = models.isSourceDeprecated(sourceSlug)
        if sourceDeprecated:
            models.removeDeprecatedSource(sourceSlug)
        message = 'Items got for %s.' % (slug, )
        logging.info(message)
        self.response.out.write(message)
        oldhash = monitorRequest['fetchhash']
        fetchhash = _calculateHash(items)
        # Only push back when the content actually changed, or when the
        # source was just revived and the caller needs fresh state.
        if oldhash != fetchhash or sourceDeprecated:
            responseData = {
                'origin': data['origin'],
                'result': {
                    'items': items,
                    'fetchhash': fetchhash,
                },
            }
    else:
        # No items: mark the source deprecated and report a None result.
        models.addDeprecatedSource(sourceSlug)
        responseData = {
            'origin': data['origin'],
            'result': None,
        }
        if content:
            message = 'Failed to parse items from %s for %s by %s.' % (
                fetchurl, slug, selector)
        elif feedback.get('overflow'):
            # NOTE(review): "Quote" looks like a typo for "Quota" — confirm
            # against the consumer of this message before changing it.
            message = 'Quote overflow.'
            responseData['overflow'] = True
        else:
            message = 'Failed to fetch content from %s for %s.' % (
                fetchurl, slug)
        logging.error(message)
        self.response.out.write(message)
    if responseData:
        success = networkutil.postData(callbackurl, responseData, tag=slug,
                                       trycount=_CALLBACK_TRYCOUNT,
                                       timeout=_URL_TIMEOUT)
        if success:
            message = 'Push items back for %s to %s.' % (slug, callbackurl)
        else:
            message = 'Failed to push items back for %s to %s.' % (
                slug, callbackurl)
        logging.info(message)
        self.response.out.write(message)
def post(self):
    """Build/preview a news-source definition and render the edit page.

    Two entry modes, chosen by the 'action' form field:
      'JSON' — the whole source comes in as a JSON string ('jsonstr'),
               falling back to _DEFAULT_NEWSSOURCE when empty.
      else   — the source is assembled field-by-field from the form
               (slug, fetchurl, selectors, conditions, formatter, ...).

    If no content was supplied but a fetchurl exists, the page is fetched
    and optionally parsed (selector present) or scanned for candidate
    links (keyword mode), then everything is handed to the template.
    """
    action = self.request.get('action')
    keyword = ''
    pageinfo = None
    if action == 'JSON':
        jsonstr = self.request.get('jsonstr')
        if jsonstr:
            newssource = json.loads(jsonstr)
        else:
            newssource = _DEFAULT_NEWSSOURCE
        # Reset the preview-only fields for this mode.
        encodingUsed = ''
        urlUsed = ''
        content = ''
        httpheader = ''
        formatter = ''
        # BUGFIX: oldContent was never initialized in this branch; when no
        # fetch happened (e.g. no fetchurl) templateValues raised NameError.
        oldContent = ''
    else:
        keyword = self.request.get('keyword').strip()
        pageinfo = self.request.get('pageinfo').strip()
        if pageinfo:
            pageinfo = json.loads(pageinfo)
        newssource = {}
        newssource['active'] = bool(self.request.get('active'))
        newssource['slug'] = self.request.get('slug')
        newssource['name'] = self.request.get('name')
        newssource['order'] = self.request.get('order')
        newssource['charts'] = bool(self.request.get('charts'))
        newssource['fetchurl'] = self.request.get('fetchurl')
        # Normalize a scheme-less URL and derive a default slug from it.
        if newssource['fetchurl'] and not newssource[
                'fetchurl'].startswith('http'):
            newssource['fetchurl'] = 'http://' + newssource['fetchurl']
        if not newssource['slug'] and newssource['fetchurl']:
            newssource['slug'] = urlparse.urlparse(
                newssource['fetchurl']).netloc
        httpheader = self.request.get('httpheader')
        if httpheader:
            newssource['header'] = json.loads(httpheader)
        newssource['encoding'] = self.request.get('encoding')
        newssource['tags'] = self.request.get('tags')
        # following fields only for showing parsed result.
        encodingUsed = self.request.get('encodingUsed')
        urlUsed = self.request.get('urlUsed')
        oldContent = self.request.get('oldContent')
        newssource['selector'] = self.request.get('selector').strip()
        conditions = {}
        conditions['returnall'] = bool(self.request.get('returnall'))
        conditions['emptytitle'] = bool(self.request.get('emptytitle'))
        conditions['detectdetail'] = bool(self.request.get('detectdetail'))
        conditions['scripttext'] = bool(self.request.get('scripttext'))
        excludeselector = self.request.get('excludeselector').strip()
        if excludeselector:
            if 'exclude' not in conditions:
                conditions['exclude'] = {}
            conditions['exclude']['selector'] = excludeselector
        includeselector = self.request.get('includeselector').strip()
        if includeselector:
            if 'include' not in conditions:
                conditions['include'] = {}
            conditions['include']['selector'] = includeselector
        urlselector = self.request.get('urlselector').strip()
        titleselector = self.request.get('titleselector').strip()
        imageselector = self.request.get('imageselector').strip()
        contentselector = self.request.get('contentselector').strip()
        linkselector = self.request.get('linkselector').strip()
        imagelinkselector = self.request.get('imagelinkselector').strip()
        # Any per-field selector present means a 'criterion' block is built.
        if urlselector or titleselector or contentselector or \
                imageselector or linkselector or imagelinkselector:
            conditions['criterion'] = {}
            if urlselector:
                conditions['criterion']['url'] = urlselector
            if titleselector:
                conditions['criterion']['title'] = titleselector
            if contentselector:
                conditions['criterion']['content'] = contentselector
            if imageselector:
                conditions['criterion']['image'] = imageselector
            if linkselector:
                conditions['criterion']['link'] = linkselector
            if imagelinkselector:
                conditions['criterion']['imagelink'] = imagelinkselector
        newssource['conditions'] = conditions
        formatter = self.request.get('formatter')
        if formatter:
            newssource['formatter'] = json.loads(formatter)
        newssource['description'] = self.request.get('description').strip()
        content = self.request.get('content')
    jsonstr = jsonutil.getReadableString(newssource)
    if 'active' not in newssource:
        newssource['active'] = True
    items = []
    links = []
    selector = newssource.get('selector')
    fetchurl = newssource.get('fetchurl')
    tried = 2  # the max try count is 3
    if not content and fetchurl:
        fetcher = ContentFetcher(fetchurl, header=newssource.get('header'),
                                 encoding=newssource.get('encoding'),
                                 tried=tried)
        fetchResult = fetcher.fetch()
        content = fetchResult.get('content')
        oldContent = fetchResult.get('content.old')
        urlUsed = fetchResult.get('url')
        encodingUsed = '%s-%s' % (fetchResult.get('encoding'),
                                  fetchResult.get('encoding.src'))
    if content:
        content = lxmlutil.removeEncodingDeclaration(content)
        if selector:
            parser = HtmlContentParser()
            items = parser.parse(urlUsed, content, selector,
                                 newssource.get('conditions'),
                                 newssource.get('formatter'))
        else:
            # No selector yet: suggest candidate links matching the keyword.
            links = linkdetector.detect(content, keyword)
        if items and newssource.get('conditions', {}).get('detectdetail'):
            detaildetector.populateDetailUrls(items)
    # Re-render header/formatter as pretty JSON for the form fields.
    if newssource.get('header'):
        httpheader = jsonutil.getReadableString(newssource['header'])
    if newssource.get('formatter'):
        formatter = jsonutil.getReadableString(newssource['formatter'])
    if not pageinfo and fetchurl:
        pageinfo = pmapi.getPage(fetchurl)
    templateValues = {
        'newssource': newssource,
        'httpheader': httpheader,
        'formatter': formatter,
        'content': content,
        'oldContent': oldContent,
        'encodingUsed': encodingUsed,
        'urlUsed': urlUsed,
        'keyword': keyword,
        'links': links,
        'items': items,
        'jsonstr': jsonstr,
        'pageinfo': pageinfo,
        'strpageinfo': json.dumps(pageinfo),
    }
    self._render(templateValues)
def post(self):
    """Build/preview a news-source definition and render the edit page.

    Two entry modes, chosen by the 'action' form field:
      'JSON' — the whole source comes in as a JSON string ('jsonstr'),
               falling back to _DEFAULT_NEWSSOURCE when empty.
      else   — the source is assembled field-by-field from the form
               (slug, fetchurl, selectors, conditions, formatter, ...).

    If no content was supplied but a fetchurl exists, the page is fetched
    and optionally parsed (selector present) or scanned for candidate
    links (keyword mode), then everything is handed to the template.
    """
    action = self.request.get('action')
    keyword = ''
    pageinfo = None
    if action == 'JSON':
        jsonstr = self.request.get('jsonstr')
        if jsonstr:
            newssource = json.loads(jsonstr)
        else:
            newssource = _DEFAULT_NEWSSOURCE
        # Reset the preview-only fields for this mode.
        encodingUsed = ''
        urlUsed = ''
        content = ''
        httpheader = ''
        formatter = ''
        # BUGFIX: oldContent was never initialized in this branch; when no
        # fetch happened (e.g. no fetchurl) templateValues raised NameError.
        oldContent = ''
    else:
        keyword = self.request.get('keyword').strip()
        pageinfo = self.request.get('pageinfo').strip()
        if pageinfo:
            pageinfo = json.loads(pageinfo)
        newssource = {}
        newssource['active'] = bool(self.request.get('active'))
        newssource['slug'] = self.request.get('slug')
        newssource['name'] = self.request.get('name')
        newssource['order'] = self.request.get('order')
        newssource['charts'] = bool(self.request.get('charts'))
        newssource['fetchurl'] = self.request.get('fetchurl')
        # Normalize a scheme-less URL and derive a default slug from it.
        if newssource['fetchurl'] and not newssource[
                'fetchurl'].startswith('http'):
            newssource['fetchurl'] = 'http://' + newssource['fetchurl']
        if not newssource['slug'] and newssource['fetchurl']:
            newssource['slug'] = urlparse.urlparse(
                newssource['fetchurl']).netloc
        httpheader = self.request.get('httpheader')
        if httpheader:
            newssource['header'] = json.loads(httpheader)
        newssource['encoding'] = self.request.get('encoding')
        newssource['tags'] = self.request.get('tags')
        # following fields only for showing parsed result.
        encodingUsed = self.request.get('encodingUsed')
        urlUsed = self.request.get('urlUsed')
        oldContent = self.request.get('oldContent')
        newssource['selector'] = self.request.get('selector').strip()
        conditions = {}
        conditions['returnall'] = bool(self.request.get('returnall'))
        conditions['emptytitle'] = bool(self.request.get('emptytitle'))
        conditions['detectdetail'] = bool(self.request.get('detectdetail'))
        conditions['scripttext'] = bool(self.request.get('scripttext'))
        excludeselector = self.request.get('excludeselector').strip()
        if excludeselector:
            if 'exclude' not in conditions:
                conditions['exclude'] = {}
            conditions['exclude']['selector'] = excludeselector
        includeselector = self.request.get('includeselector').strip()
        if includeselector:
            if 'include' not in conditions:
                conditions['include'] = {}
            conditions['include']['selector'] = includeselector
        urlselector = self.request.get('urlselector').strip()
        titleselector = self.request.get('titleselector').strip()
        imageselector = self.request.get('imageselector').strip()
        contentselector = self.request.get('contentselector').strip()
        linkselector = self.request.get('linkselector').strip()
        imagelinkselector = self.request.get('imagelinkselector').strip()
        # Any per-field selector present means a 'criterion' block is built.
        if urlselector or titleselector or contentselector or \
                imageselector or linkselector or imagelinkselector:
            conditions['criterion'] = {}
            if urlselector:
                conditions['criterion']['url'] = urlselector
            if titleselector:
                conditions['criterion']['title'] = titleselector
            if contentselector:
                conditions['criterion']['content'] = contentselector
            if imageselector:
                conditions['criterion']['image'] = imageselector
            if linkselector:
                conditions['criterion']['link'] = linkselector
            if imagelinkselector:
                conditions['criterion']['imagelink'] = imagelinkselector
        newssource['conditions'] = conditions
        formatter = self.request.get('formatter')
        if formatter:
            newssource['formatter'] = json.loads(formatter)
        newssource['description'] = self.request.get('description').strip()
        content = self.request.get('content')
    jsonstr = jsonutil.getReadableString(newssource)
    if 'active' not in newssource:
        newssource['active'] = True
    items = []
    links = []
    selector = newssource.get('selector')
    fetchurl = newssource.get('fetchurl')
    tried = 2  # the max try count is 3
    if not content and fetchurl:
        fetcher = ContentFetcher(fetchurl, header=newssource.get('header'),
                                 encoding=newssource.get('encoding'),
                                 tried=tried)
        fetchResult = fetcher.fetch()
        content = fetchResult.get('content')
        oldContent = fetchResult.get('content.old')
        urlUsed = fetchResult.get('url')
        encodingUsed = '%s-%s' % (fetchResult.get('encoding'),
                                  fetchResult.get('encoding.src'))
    if content:
        content = lxmlutil.removeEncodingDeclaration(content)
        if selector:
            parser = HtmlContentParser()
            items = parser.parse(urlUsed, content, selector,
                                 newssource.get('conditions'),
                                 newssource.get('formatter'))
        else:
            # No selector yet: suggest candidate links matching the keyword.
            links = linkdetector.detect(content, keyword)
        if items and newssource.get('conditions', {}).get('detectdetail'):
            detaildetector.populateDetailUrls(items)
    # Re-render header/formatter as pretty JSON for the form fields.
    if newssource.get('header'):
        httpheader = jsonutil.getReadableString(newssource['header'])
    if newssource.get('formatter'):
        formatter = jsonutil.getReadableString(newssource['formatter'])
    if not pageinfo and fetchurl:
        pageinfo = pmapi.getPage(fetchurl)
    templateValues = {
        'newssource': newssource,
        'httpheader': httpheader,
        'formatter': formatter,
        'content': content,
        'oldContent': oldContent,
        'encodingUsed': encodingUsed,
        'urlUsed': urlUsed,
        'keyword': keyword,
        'links': links,
        'items': items,
        'jsonstr': jsonstr,
        'pageinfo': pageinfo,
        'strpageinfo': json.dumps(pageinfo),
    }
    self._render(templateValues)