Exemple #1
0
 def publish(self, datasource, items):
     """Post ``datasource`` together with ``items`` to ``self.url``.

     Returns the result of ``networkutil.postData`` unchanged.
     """
     payload = dict(datasource=datasource, items=items)
     return networkutil.postData(
         self.url,
         payload,
         trycount=_POST_TRY_COUNT,
         timeout=_URL_TIMEOUT,
     )
Exemple #2
0
 def publish(self, datasource, items):
     """Send the datasource and its items to the URL held in ``self.url``.

     The value returned by ``networkutil.postData`` is passed straight back
     to the caller.
     """
     body = {}
     body['datasource'] = datasource
     body['items'] = items
     posted = networkutil.postData(
         self.url, body, trycount=_POST_TRY_COUNT, timeout=_URL_TIMEOUT)
     return posted
    def post(self):
        """Fill in missing item details from each item's page and post them back.

        The JSON request body supplies ``items``, ``origin``, ``callbackurl``
        and an optional ``header``.  Every item that has a ``url`` is fetched
        and analysed; ``title``, ``published``, ``content`` and ``img`` are
        copied from the analysed page only when the item does not already
        carry a value.  The items are then posted to ``callbackurl`` and the
        outcome is logged and written to the plain-text response.
        """
        payload = json.loads(self.request.body)

        items = payload['items']
        origin = payload['origin']
        header = payload.get('header')
        for item in items:
            url = item.get('url')
            if not url:
                continue
            fetchResult = ContentFetcher(url, header=header, tried=2).fetch()
            usedUrl = fetchResult.get('url')
            content = fetchResult.get('content')
            if not content:
                logging.error('Failed to get content from %s.' % (url, ))
                continue
            # Keep the URL that was actually used for the fetch.
            item['url'] = usedUrl
            try:
                fmt = globalconfig.getEditorFormat()
                page = pageanalyst.analyse(
                    usedUrl,
                    content,
                    editorFormat=fmt,
                    monitorTitle=item.get('title'),
                )
                if not item.get('title'):
                    pageTitle = page.get('title')
                    if pageTitle:
                        item['title'] = pageTitle
                if not item.get('published'):
                    published = page.get('published')
                    # if no hour, minute, published is not precise enough
                    if published and not published.endswith('0000'):
                        item['published'] = published
                        timezone = origin.get('timezone')
                        if timezone:
                            item['published'] = dateutil.adjustDate14(
                                published, timezone)
                if not item.get('content'):
                    pageContent = page.get('content')
                    if pageContent:
                        item['content'] = pageContent
                if not item.get('img'):
                    images = page.get('images')
                    if images:
                        item['img'] = images[0]
            except Exception:
                # Analysis is best effort: the item is still sent back as is.
                logging.exception('Error happens when analyse %s.' % (usedUrl, ))

        responseData = {'origin': origin, 'items': items}

        self.response.headers['Content-Type'] = 'text/plain'
        callbackurl = payload['callbackurl']
        success = networkutil.postData(
            callbackurl,
            responseData,
            trycount=_CALLBACK_TRYCOUNT,
            timeout=_URL_TIMEOUT,
        )

        template = ('Push items back for %s to %s.' if success
                    else 'Failed to push items back for %s to %s.')
        message = template % (origin, callbackurl)
        logging.info(message)
        self.response.out.write(message)
Exemple #4
0
def _sendHotWordsRequest(serverUrl, masterUrls, masterKeyname, pages):
    if not pages:
        logging.warn('No pages is available for %s' % (masterKeyname, ))
        return
    titles = [ page['title'] for page in pages if page.get('title') ]
    data = {
        'masters': masterUrls,
        'key': masterKeyname,
        'titles': titles,
    }
    # sleep random duration,
    # so avoid the trend server receives a lot of requests at the same time.
    time.sleep(_REQUEST_INTERVAL + random.randint(0, _REQUEST_INTERVAL))
    success = networkutil.postData(serverUrl, data, tag=masterKeyname,
                trycount=_CALLBACK_TRYCOUNT, timeout=_URL_TIMEOUT)
    if success:
        message = 'Send words request for %s successfully.' % (masterKeyname, )
    else:
        message = 'Failed to send words request for %s.' % (masterKeyname, )
    logging.info(message)
Exemple #5
0
    def post(self):
        """Search a page for every posted keyword and push the results back.

        The JSON request body supplies ``items`` (each with ``title``,
        ``rank`` and ``added``), the previous ``hash``, ``origin`` and
        ``callbackurl``.  The first search hit for each title, or an empty
        record when there is none, is annotated with the keyword data.  When
        the hash of the results equals the posted hash nothing is sent back.
        """
        payload = json.loads(self.request.body)
        items = payload['items']
        oldHash = payload['hash']
        callbackurl = payload['callbackurl']
        resultItems = []
        nnow14 = dateutil.getDateAs14(datetime.datetime.utcnow())
        for item in items:
            pages = bs.search(item['title'])
            if pages:
                resultPage = pages[0]
                added = models.getUrlAdded(resultPage['url'], nnow14)
            else:
                # No hit: fall back to the time the keyword itself was added.
                resultPage = {}
                added = item['added']
            resultPage['added'] = added
            resultPage['keyword'] = item['title']
            resultPage['rank'] = item['rank']
            resultPage['keywordadded'] = item['added']
            resultItems.append(resultPage)

        contentHash = _calculateHash(resultItems)
        if contentHash == oldHash:
            logging.info('No change fetch for %s.' % (payload['origin'], ))
            return

        responseData = {
            'origin': payload['origin'],
            'items': resultItems,
            'hash': contentHash,
        }

        success = networkutil.postData(
            callbackurl,
            responseData,
            trycount=_CALLBACK_TRYCOUNT,
            timeout=_URL_TIMEOUT,
        )
        template = ('Push %s back to %s.' if success
                    else 'Failed to push %s back to %s.')
        message = template % (payload['origin'], callbackurl)
        logging.info(message)
        self.response.out.write(message)
def _runTask(requestData):
    """Calculate the top words for the requested titles and post them back.

    Args:
        requestData: dict with ``titles`` (input to ``bs.calculateWords``),
            ``key`` (echoed back and used in logs) and ``masters`` (the
            callback URLs that each receive the result).

    Returns:
        None.  When no words are produced a warning is logged and nothing
        is posted.
    """
    wordsConfig = globalconfig.getWordsConfig()
    stopWords = globalconfig.getStopWords()
    userDict = globalconfig.getWordsDict()
    wordsData = bs.calculateWords(wordsConfig, stopWords, userDict,
                                  requestData['titles'])
    key = requestData['key']

    if not wordsData:
        # logging.warn is a deprecated alias (removed in Python 3.13).
        logging.warning('No words is available for %s.', key)
        return

    # Only the first 20 entries of the calculated words are sent back.
    _TOP_WORD_COUNT = 20
    responseData = {
        'key': key,
        'words': wordsData[:_TOP_WORD_COUNT],
    }
    for callbackurl in requestData['masters']:
        success = networkutil.postData(callbackurl, responseData,
                                       tag='words backend',
                                       trycount=_CALLBACK_TRYCOUNT,
                                       timeout=_URL_TIMEOUT)
        if success:
            logging.info('Post words to %s for %s successfully.',
                         callbackurl, key)
        else:
            logging.info('Failed to post words to %s for %s.',
                         callbackurl, key)
Exemple #7
0
    def post(self):
        """Handle one single-page fetch task posted as JSON.

        Reads ``callbackurl``, ``request`` (the monitor request), ``origin``
        and the optional ``triedcount`` from the request body, fetches the
        page through ``_fetchContent``, parses items from it with
        ``HtmlContentParser`` and posts the outcome to ``callbackurl``.

        While the fetch fails and tries remain (``_FETCH_TRYCOUNT``), the same
        payload is re-enqueued to ``/fetch/single/`` with an incremented
        ``triedcount`` and nothing is posted back.  Every status message is
        logged and also written to the plain-text response.
        """
        self.response.headers['Content-Type'] = 'text/plain'
        data = json.loads(self.request.body)
        callbackurl = data['callbackurl']

        triedcount = data.get('triedcount', 0)
        monitorRequest = data['request']
        # `feedback` is handed to _fetchContent and read back below
        # (its 'overflow' key).
        feedback = {}
        urlUsed, content = _fetchContent(monitorRequest, triedcount, feedback)
        slug = monitorRequest['slug']
        fetchurl = monitorRequest['fetchurl']
        if not content:
            # Fetch failed: count this attempt and retry through the task
            # queue while tries remain.
            triedcount += 1
            leftcount = _FETCH_TRYCOUNT - triedcount
            message = 'Failed to fetch content form %s for %s, lefted: %s.' % (
                fetchurl,
                slug,
                leftcount,
            )
            logging.error(message)
            self.response.out.write(message)
            if leftcount > 0:
                data['triedcount'] = triedcount
                taskqueue.add(queue_name="default",
                              payload=json.dumps(data),
                              url='/fetch/single/')
                # The re-enqueued task reports back; nothing more to do here.
                return
        # From here on either content was fetched or no tries are left.
        items = None
        responseData = None
        if content:
            content = lxmlutil.removeEncodingDeclaration(content)
            selector = monitorRequest['selector']
            conditions = monitorRequest.get('conditions', {})
            formatter = monitorRequest.get('formatter')
            parser = HtmlContentParser()
            items = parser.parse(urlUsed, content, selector, conditions,
                                 formatter)

            if items and conditions and conditions.get('detectdetail'):
                detaildetector.populateDetailUrls(items)
        sourceSlug = data['origin']['common']['slug']
        if items:
            # A source that yields items is taken off the deprecated list.
            sourceDeprecated = models.isSourceDeprecated(sourceSlug)
            if sourceDeprecated:
                models.removeDeprecatedSource(sourceSlug)
            message = 'Items got for %s.' % (slug, )
            logging.info(message)
            self.response.out.write(message)

            oldhash = monitorRequest['fetchhash']
            fetchhash = _calculateHash(items)
            # Report back only when the items changed, or when the source was
            # deprecated before this fetch.
            if oldhash != fetchhash or sourceDeprecated:
                responseData = {
                    'origin': data['origin'],
                    'result': {
                        'items': items,
                        'fetchhash': fetchhash,
                    },
                }
        else:
            # No items (fetch or parse failure): mark the source deprecated
            # and report an empty result.
            models.addDeprecatedSource(sourceSlug)
            responseData = {
                'origin': data['origin'],
                'result': None,
            }

            if content:
                # `selector` is bound here: it is assigned in the same branch
                # that produced this truthy `content`.
                message = 'Failed to parse items from %s for %s by %s.' % (
                    fetchurl, slug, selector)
            elif feedback.get('overflow'):
                # NOTE(review): presumably set by _fetchContent when a fetch
                # quota is exceeded -- confirm against _fetchContent.
                message = 'Quote overflow.'
                responseData['overflow'] = True
            else:
                message = 'Failed to fetch content from %s for %s.' % (
                    fetchurl, slug)
            logging.error(message)
            self.response.out.write(message)

        # responseData stays None when items were parsed but did not change;
        # nothing is posted back in that case.
        if responseData:
            success = networkutil.postData(callbackurl,
                                           responseData,
                                           tag=slug,
                                           trycount=_CALLBACK_TRYCOUNT,
                                           timeout=_URL_TIMEOUT)

            if success:
                message = 'Push items back for %s to %s.' % (slug, callbackurl)
            else:
                message = 'Failed to push items back for %s to %s.' % (
                    slug, callbackurl)
            logging.info(message)
            self.response.out.write(message)
    def post(self):
        """Fetch one monitored page, parse its items and report to the caller.

        The JSON request body carries ``callbackurl``, ``request`` (slug,
        fetchurl, selector, fetchhash and optional conditions/formatter),
        ``origin`` and an optional ``triedcount``.  A failed fetch is retried
        by re-enqueueing the payload to ``/fetch/single/`` until
        ``_FETCH_TRYCOUNT`` is used up.  Results are posted to ``callbackurl``
        unless the parsed items are unchanged.  Each status message is both
        logged and written to the plain-text response.
        """
        self.response.headers['Content-Type'] = 'text/plain'
        data = json.loads(self.request.body)
        callbackurl = data['callbackurl']

        triedcount = data.get('triedcount', 0)
        monitorRequest = data['request']
        # Passed into _fetchContent; its 'overflow' key is checked further down.
        feedback = {}
        urlUsed, content = _fetchContent(monitorRequest, triedcount, feedback)
        slug = monitorRequest['slug']
        fetchurl = monitorRequest['fetchurl']
        if not content:
            # This attempt failed; schedule another one if any tries are left.
            triedcount += 1
            leftcount = _FETCH_TRYCOUNT - triedcount
            message = 'Failed to fetch content form %s for %s, lefted: %s.' % (
                        fetchurl, slug, leftcount, )
            logging.error(message)
            self.response.out.write(message)
            if leftcount > 0:
                data['triedcount'] = triedcount
                taskqueue.add(queue_name="default", payload=json.dumps(data),
                            url='/fetch/single/')
                # Stop here: the callback is left to the re-enqueued task.
                return
        # Reached with fetched content, or with no content and no tries left.
        items = None
        responseData = None
        if content:
            content = lxmlutil.removeEncodingDeclaration(content)
            selector = monitorRequest['selector']
            conditions = monitorRequest.get('conditions', {})
            formatter = monitorRequest.get('formatter')
            parser = HtmlContentParser()
            items = parser.parse(urlUsed, content, selector, conditions, formatter)

            if items and conditions and conditions.get('detectdetail'):
                detaildetector.populateDetailUrls(items)
        sourceSlug = data['origin']['common']['slug']
        if items:
            # Items were parsed, so clear any earlier deprecation of the source.
            sourceDeprecated = models.isSourceDeprecated(sourceSlug)
            if sourceDeprecated:
                models.removeDeprecatedSource(sourceSlug)
            message = 'Items got for %s.' % (slug, )
            logging.info(message)
            self.response.out.write(message)

            oldhash = monitorRequest['fetchhash']
            fetchhash = _calculateHash(items)
            # Build a response only for changed items or a previously
            # deprecated source.
            if oldhash != fetchhash or sourceDeprecated:
                responseData = {
                        'origin': data['origin'],
                        'result': {
                            'items': items,
                            'fetchhash': fetchhash,
                        },
                }
        else:
            # Nothing parsed: record the source as deprecated and send back a
            # None result.
            models.addDeprecatedSource(sourceSlug)
            responseData = {
                    'origin': data['origin'],
                    'result': None,
                }

            if content:
                # Content was fetched, so `selector` was assigned above.
                message = 'Failed to parse items from %s for %s by %s.' % (
                                      fetchurl, slug, selector)
            elif feedback.get('overflow'):
                # NOTE(review): 'overflow' looks like a quota signal from
                # _fetchContent -- confirm where it is set.
                message = 'Quote overflow.'
                responseData['overflow'] = True
            else:
                message = 'Failed to fetch content from %s for %s.' % (
                                        fetchurl, slug)
            logging.error(message)
            self.response.out.write(message)

        # No response was built when parsed items are unchanged; skip the
        # callback then.
        if responseData:
            success = networkutil.postData(callbackurl, responseData, tag=slug,
                        trycount=_CALLBACK_TRYCOUNT, timeout=_URL_TIMEOUT)

            if success:
                message = 'Push items back for %s to %s.' % (slug, callbackurl)
            else:
                message = 'Failed to push items back for %s to %s.' % (slug, callbackurl)
            logging.info(message)
            self.response.out.write(message)