def publish(self, datasource, items):
    data = {
        'datasource': datasource,
        'items': items,
    }
    success = networkutil.postData(self.url, data,
                trycount=_POST_TRY_COUNT, timeout=_URL_TIMEOUT)
    return success
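# networkutil.postData is used throughout this section but not defined in it.
# A minimal sketch of the retry-with-timeout semantics the callers rely on,
# assuming a JSON POST built on App Engine's urlfetch; the real helper may
# differ in payload encoding and error handling:
import json
import logging

from google.appengine.api import urlfetch

def postData(url, data, tag='', trycount=1, timeout=30):
    payload = json.dumps(data)
    for attempt in range(trycount):
        try:
            result = urlfetch.fetch(url, payload=payload,
                        method=urlfetch.POST,
                        headers={'Content-Type': 'application/json'},
                        deadline=timeout)
            # Treat any 2xx response as success.
            if 200 <= result.status_code < 300:
                return True
        except Exception:
            logging.exception('Post attempt %s for %s failed.'
                        % (attempt + 1, tag or url))
    return False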
def post(self):
    data = json.loads(self.request.body)
    items = data['items']
    origin = data['origin']
    header = data.get('header')
    for item in items:
        url = item.get('url')
        if not url:
            continue
        fetcher = ContentFetcher(url, header=header, tried=2)
        fetchResult = fetcher.fetch()
        usedUrl = fetchResult.get('url')
        content = fetchResult.get('content')
        if not content:
            logging.error('Failed to get content from %s.' % (url, ))
            continue
        item['url'] = usedUrl
        try:
            editorFormat = globalconfig.getEditorFormat()
            page = pageanalyst.analyse(usedUrl, content,
                        editorFormat=editorFormat,
                        monitorTitle=item.get('title'))
            if not item.get('title') and page.get('title'):
                item['title'] = page['title']
            if not item.get('published') and page.get('published') \
                    and not page['published'].endswith('0000'):
                # A published value ending in '0000' carries no minute or
                # second, so it is not precise enough to use.
                item['published'] = page['published']
            if item.get('published') and origin.get('timezone'):
                item['published'] = dateutil.adjustDate14(item['published'],
                                        origin['timezone'])
            if not item.get('content') and page.get('content'):
                item['content'] = page['content']
            if not item.get('img') and page.get('images'):
                item['img'] = page['images'][0]
        except Exception:
            logging.exception('Error while analysing %s.' % (usedUrl, ))
    responseData = {
        'origin': data['origin'],
        'items': items,
    }
    self.response.headers['Content-Type'] = 'text/plain'
    callbackurl = data['callbackurl']
    success = networkutil.postData(callbackurl, responseData,
                trycount=_CALLBACK_TRYCOUNT, timeout=_URL_TIMEOUT)
    if success:
        message = 'Pushed items back for %s to %s.' % (data['origin'],
                        callbackurl)
    else:
        message = 'Failed to push items back for %s to %s.' % (data['origin'],
                        callbackurl)
    logging.info(message)
    self.response.out.write(message)
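# dateutil.adjustDate14 is referenced above but not defined in this section.
# A minimal sketch, assuming a "date14" is a 14-character 'YYYYMMDDHHMMSS'
# string and that the origin's timezone is an integer hour offset from UTC;
# the real helper may use a different timezone representation:
import datetime

def adjustDate14(date14, timezone):
    parsed = datetime.datetime.strptime(date14, '%Y%m%d%H%M%S')
    adjusted = parsed + datetime.timedelta(hours=timezone)
    return adjusted.strftime('%Y%m%d%H%M%S')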
def _sendHotWordsRequest(serverUrl, masterUrls, masterKeyname, pages):
    if not pages:
        logging.warn('No pages are available for %s.' % (masterKeyname, ))
        return
    titles = [page['title'] for page in pages if page.get('title')]
    data = {
        'masters': masterUrls,
        'key': masterKeyname,
        'titles': titles,
    }
    # Sleep for a random duration so the trend server does not receive
    # a burst of requests at the same time.
    time.sleep(_REQUEST_INTERVAL + random.randint(0, _REQUEST_INTERVAL))
    success = networkutil.postData(serverUrl, data, tag=masterKeyname,
                trycount=_CALLBACK_TRYCOUNT, timeout=_URL_TIMEOUT)
    if success:
        message = 'Sent words request for %s successfully.' % (masterKeyname, )
    else:
        message = 'Failed to send words request for %s.' % (masterKeyname, )
    logging.info(message)
def post(self):
    data = json.loads(self.request.body)
    items = data['items']
    oldHash = data['hash']
    callbackurl = data['callbackurl']
    resultItems = []
    now14 = dateutil.getDateAs14(datetime.datetime.utcnow())
    for item in items:
        pages = bs.search(item['title'])
        if pages:
            resultPage = pages[0]
            resultPage['added'] = models.getUrlAdded(resultPage['url'], now14)
        else:
            resultPage = {}
            resultPage['added'] = item['added']
        resultPage['keyword'] = item['title']
        resultPage['rank'] = item['rank']
        resultPage['keywordadded'] = item['added']
        resultItems.append(resultPage)
    contentHash = _calculateHash(resultItems)
    if oldHash == contentHash:
        logging.info('No change fetched for %s.' % (data['origin'], ))
        return
    responseData = {
        'origin': data['origin'],
        'items': resultItems,
        'hash': contentHash,
    }
    success = networkutil.postData(callbackurl, responseData,
                trycount=_CALLBACK_TRYCOUNT, timeout=_URL_TIMEOUT)
    if success:
        message = 'Pushed %s back to %s.' % (data['origin'], callbackurl)
    else:
        message = 'Failed to push %s back to %s.' % (data['origin'],
                        callbackurl)
    logging.info(message)
    self.response.out.write(message)
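# _calculateHash is shared by this handler and the fetch handler below but
# is not defined in this section. A minimal sketch, assuming the hash is an
# MD5 digest of the JSON-serialized items with stable key ordering; the real
# implementation may differ:
import hashlib
import json

def _calculateHash(items):
    serialized = json.dumps(items, sort_keys=True)
    return hashlib.md5(serialized.encode('utf-8')).hexdigest()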
def _runTask(requestData):
    wordsConfig = globalconfig.getWordsConfig()
    stopWords = globalconfig.getStopWords()
    userDict = globalconfig.getWordsDict()
    wordsData = bs.calculateWords(wordsConfig, stopWords, userDict,
                    requestData['titles'])
    if not wordsData:
        logging.warn('No words are available for %s.' % (requestData['key'], ))
        return
    _TOP_WORD_COUNT = 20
    responseData = {
        'key': requestData['key'],
        'words': wordsData[:_TOP_WORD_COUNT],
    }
    masterUrls = requestData['masters']
    for callbackurl in masterUrls:
        success = networkutil.postData(callbackurl, responseData,
                    tag='words backend',
                    trycount=_CALLBACK_TRYCOUNT, timeout=_URL_TIMEOUT)
        if success:
            message = 'Posted words to %s for %s successfully.' % (callbackurl,
                            requestData['key'], )
        else:
            message = 'Failed to post words to %s for %s.' % (callbackurl,
                            requestData['key'], )
        logging.info(message)
def post(self):
    self.response.headers['Content-Type'] = 'text/plain'
    data = json.loads(self.request.body)
    callbackurl = data['callbackurl']
    triedcount = data.get('triedcount', 0)
    monitorRequest = data['request']
    feedback = {}
    urlUsed, content = _fetchContent(monitorRequest, triedcount, feedback)
    slug = monitorRequest['slug']
    fetchurl = monitorRequest['fetchurl']
    if not content:
        triedcount += 1
        leftcount = _FETCH_TRYCOUNT - triedcount
        message = 'Failed to fetch content from %s for %s, tries left: %s.' % (
                        fetchurl, slug, leftcount, )
        logging.error(message)
        self.response.out.write(message)
        if leftcount > 0:
            data['triedcount'] = triedcount
            taskqueue.add(queue_name='default', payload=json.dumps(data),
                    url='/fetch/single/')
            return
    items = None
    responseData = None
    if content:
        content = lxmlutil.removeEncodingDeclaration(content)
        selector = monitorRequest['selector']
        conditions = monitorRequest.get('conditions', {})
        formatter = monitorRequest.get('formatter')
        parser = HtmlContentParser()
        items = parser.parse(urlUsed, content, selector, conditions, formatter)
        if items and conditions and conditions.get('detectdetail'):
            detaildetector.populateDetailUrls(items)
    sourceSlug = data['origin']['common']['slug']
    if items:
        sourceDeprecated = models.isSourceDeprecated(sourceSlug)
        if sourceDeprecated:
            models.removeDeprecatedSource(sourceSlug)
        message = 'Got items for %s.' % (slug, )
        logging.info(message)
        self.response.out.write(message)
        oldhash = monitorRequest['fetchhash']
        fetchhash = _calculateHash(items)
        if oldhash != fetchhash or sourceDeprecated:
            responseData = {
                'origin': data['origin'],
                'result': {
                    'items': items,
                    'fetchhash': fetchhash,
                },
            }
    else:
        models.addDeprecatedSource(sourceSlug)
        responseData = {
            'origin': data['origin'],
            'result': None,
        }
        if content:
            message = 'Failed to parse items from %s for %s with selector %s.' % (
                            fetchurl, slug, selector)
        elif feedback.get('overflow'):
            message = 'Quota overflow.'
            responseData['overflow'] = True
        else:
            message = 'Failed to fetch content from %s for %s.' % (
                            fetchurl, slug)
        logging.error(message)
        self.response.out.write(message)
    if responseData:
        success = networkutil.postData(callbackurl, responseData, tag=slug,
                    trycount=_CALLBACK_TRYCOUNT, timeout=_URL_TIMEOUT)
        if success:
            message = 'Pushed items back for %s to %s.' % (slug, callbackurl)
        else:
            message = 'Failed to push items back for %s to %s.' % (
                            slug, callbackurl)
        logging.info(message)
        self.response.out.write(message)
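# _fetchContent is called above but not defined in this section. A minimal
# sketch built on the ContentFetcher used earlier in this file, assuming the
# fetch result carries the final URL and raw content, and that quota
# exhaustion is reported through the feedback dict; the real helper may
# differ:
def _fetchContent(monitorRequest, triedcount, feedback):
    fetcher = ContentFetcher(monitorRequest['fetchurl'],
                header=monitorRequest.get('header'), tried=triedcount)
    fetchResult = fetcher.fetch()
    if fetchResult.get('overflow'):
        # Surface fetch-quota exhaustion to the caller (assumed key).
        feedback['overflow'] = True
    return fetchResult.get('url'), fetchResult.get('content')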