Ejemplo n.º 1
0
def archiveData(timezone):
    """Archive datasource history one day at a time, then trim the history.

    The most recent local day end (23:59) that already lies in the past is
    determined, where ``timezone`` is the number of hours subtracted from
    that local time to obtain the matching UTC instant. Walking backwards a
    day at a time, entries whose 'added' stamp falls inside each one-day
    window are handed to ``models.archiveData`` under that day's
    ``YYYYMMDD`` label. Finally only the entries added after the newest
    window end are written back via ``models.saveDatasourceHistory``.

    Entries without an 'added' key are neither archived nor kept.
    """
    oneDay = datetime.timedelta(days=1)
    utcNow = datetime.datetime.utcnow()

    # The day label is taken from the local day end; the window boundary
    # is the same moment shifted by the timezone offset.
    labelDay = datetime.datetime(utcNow.year, utcNow.month, utcNow.day,
                                 23, 59, 0)
    windowEnd = labelDay - datetime.timedelta(hours=timezone)
    if windowEnd > utcNow:
        # Today's window has not closed yet: start from yesterday.
        labelDay -= oneDay
        windowEnd -= oneDay
    newestWindowEnd = windowEnd

    history = models.getDatasourceHistory()
    pending = history
    while True:
        upper14 = dateutil.getDateAs14(windowEnd)
        pending = [source for source in pending
                   if 'added' in source and source['added'] <= upper14]
        if not pending:
            break

        windowStart = windowEnd - oneDay
        lower14 = dateutil.getDateAs14(windowStart)
        # Everything in `pending` is already known to carry 'added'.
        inWindow = [source for source in pending
                    if source['added'] > lower14]
        if inWindow:
            models.archiveData(labelDay.strftime('%Y%m%d'), inWindow)

        labelDay -= oneDay
        windowEnd = windowStart

    newest14 = dateutil.getDateAs14(newestWindowEnd)
    remaining = [source for source in history
                 if 'added' in source and source['added'] > newest14]
    models.saveDatasourceHistory(remaining)
Ejemplo n.º 2
0
def getUrlAdded(url, added):
    """Record a sighting of ``url`` and return its first-seen 'added' value.

    The list of tracked urls is loaded from the 'RunStatus' model. A known
    url has its 'count' incremented; an unknown url is appended with count
    1 and the supplied ``added`` value. The entry's 'updated' stamp is
    refreshed, entries whose 'updated' stamp is not greater than
    ``dateutil.getHoursAs14(24)`` are dropped (presumably "older than 24
    hours" -- confirm against that helper), and the list is saved back.

    Returns the stored 'added' value, which for a known url is the one
    recorded earlier rather than the ``added`` argument.
    """
    records = cmapi.getItemValue(_getKeyname(), [], modelname='RunStatus')

    entry = _getItem(records, url)
    if not entry:
        entry = {'count': 1, 'url': url, 'added': added}
        records.append(entry)
    else:
        entry['count'] += 1
    entry['updated'] = dateutil.getDateAs14(datetime.datetime.utcnow())

    threshold14 = dateutil.getHoursAs14(24)
    fresh = []
    for record in records:
        if record['updated'] > threshold14:
            fresh.append(record)

    cmapi.saveItem(_getKeyname(), fresh, modelname='RunStatus')
    return entry['added']
Ejemplo n.º 3
0
def getPage(url):
    """Collect page, Alexa/DMOZ and pagerank information for ``url``.

    Returns a dict that may contain 'page', 'alexa', 'dmoz' and 'pagerank'
    depending on what the individual fetchers returned. When at least one
    of them produced data, an 'updated' stamp is added; otherwise the
    empty dict is returned unchanged.
    """
    info = {}

    page = pageinfo.fetch(url)
    if page:
        info['page'] = page

    alexa = alexainfo.fetch(url)
    if alexa:
        # Copy over only the fields the fetcher actually supplied.
        for field in ('alexa', 'dmoz'):
            if field in alexa:
                info[field] = alexa[field]

    rank = pagerankinfo.fetch(url)
    if rank >= 0:
        info['pagerank'] = rank

    if not info:
        return info
    info['updated'] = dateutil.getDateAs14(datetime.datetime.utcnow())
    return info
Ejemplo n.º 4
0
def _saveWords(keyname, words, pages):
    """Match each keyword set against ``pages`` and persist the matches.

    For every entry of ``words`` that ``globalutil.search`` finds in
    ``pages``, a record is built holding the keywords, the first matching
    page, the number of matches and a readable form of the keywords.
    The records are saved under ``keyname`` together with an 'updated'
    stamp, and the list of records is returned.
    """
    matchedWords = []
    for keywords in words:
        hits = globalutil.search(pages, keywords)
        if not hits:
            continue
        matchedWords.append({
            'keywords': keywords,
            'page': hits[0],
            'size': len(hits),
            'readablekeywords': _getNaturalKeywords(keywords, hits),
        })

    models.saveWords(keyname, {
        'updated': dateutil.getDateAs14(datetime.datetime.utcnow()),
        'words': matchedWords,
    })
    return matchedWords
Ejemplo n.º 5
0
def increaseIncomingBandwidth(bytes):
    """Add one fetch of ``bytes`` bytes to the incoming bandwidth counters.

    The 'inbandwidth' item of the 'RunStatus' model keeps:
      * 'all'      -- running totals since its 'start' stamp;
      * 'current'  -- totals for the current day, keyed by ``YYYYMMDD`` in
                      the configured timezone ('tz', default 'US/Pacific');
      * 'history'  -- previous 'current' records, newest first, truncated
                      to 'historycount' entries (default 7).
    Missing settings are filled in with their defaults and saved.
    """
    itemKey = 'inbandwidth'
    stats = cmapi.getItemValue(itemKey, {}, modelname='RunStatus')

    # Lifetime totals.
    total = stats.get('all')
    if not total:
        total = {'start': dateutil.getDateAs14(datetime.datetime.utcnow())}
        stats['all'] = total
    total['bytes'] = total.get('bytes', 0) + bytes
    total['fetch'] = total.get('fetch', 0) + 1

    tzName = stats.get('tz')
    if not tzName:
        tzName = 'US/Pacific'
        stats['tz'] = tzName

    # Day bucket is determined in the configured timezone, not in UTC.
    localNow = datetime.datetime.now(tz=pytz.utc).astimezone(
        pytz.timezone(tzName))
    dayKey = localNow.strftime('%Y%m%d')

    current = stats.get('current')
    if current and current.get('key') == dayKey:
        current['fetch'] += 1
        current['bytes'] += bytes
    else:
        keep = stats.get('historycount')
        if not keep:
            keep = 7
            stats['historycount'] = keep
        if current:
            # Roll the finished day into the history, newest first.
            history = stats.get('history') or []
            history.insert(0, current)
            stats['history'] = history[:keep]
        stats['current'] = {'key': dayKey, 'bytes': bytes, 'fetch': 1}

    cmapi.saveItem(itemKey, stats, modelname='RunStatus')
Ejemplo n.º 6
0
def isConstantTitle(titleConfig, url, title, sideEffect):
    """Tell whether ``title`` has been seen often enough on ``url``'s host.

    Per-host title records are stored under the url's network location in
    the ``PageConstantTitle`` model; each record holds a count 'c' and a
    last-update stamp 'u'.

    Args:
        titleConfig: mapping read for 'occurrence' (minimum prior count,
            default 1) and 'cache.day' (record age limit in days,
            default 7).
        url: page url; a falsy url yields False without any lookup.
        title: the title to check.
        sideEffect: when truthy, the title's count is incremented, its
            stamp refreshed and the records saved. If more than 20 titles
            are stored for the host, records at least 'cache.day' days
            old are pruned first.

    Returns:
        True if the count recorded *before* this call reaches the
        configured occurrence threshold, otherwise False.
    """
    if not url:
        return False
    key = urlparse.urlparse(url).netloc
    value = cmapi.getItemValue(key, {}, modelname=PageConstantTitle)
    record = value.get(title) or {}
    count = record.get('c', 0)
    isconstant = count >= titleConfig.get('occurrence', 1)
    if sideEffect:
        nnow = datetime.datetime.utcnow()
        record['c'] = count + 1
        record['u'] = dateutil.getDateAs14(nnow)
        if len(value) > 20:
            cacheDays = titleConfig.get('cache.day', 7)
            # Collect the expired keys first and delete afterwards, so the
            # dict is never resized while it is being iterated.
            expired = [ik for ik, iv in value.items()
                       if (nnow - dateutil.parseDate14(iv['u'])).days
                       >= cacheDays]
            for ik in expired:
                del value[ik]
        # Re-attach the record: it may be new, or just pruned above.
        value[title] = record
        cmapi.saveItem(key, value, modelname=PageConstantTitle)
    return isconstant
Ejemplo n.º 7
0
    def post(self):
        """Resolve each posted item to a search result and push them back.

        The JSON request body supplies 'items', 'hash', 'callbackurl' and
        'origin'. Every item's title is searched with ``bs.search``; the
        first hit (or an empty record when there is none) is annotated
        with the item's keyword, rank and added stamps. If the hash of the
        results equals the posted hash, the request ends after logging and
        nothing is written to the response. Otherwise the results are
        posted to the callback url and the outcome message is logged and
        written to the response.
        """
        payload = json.loads(self.request.body)
        items = payload['items']
        previousHash = payload['hash']
        callbackurl = payload['callbackurl']
        nnow14 = dateutil.getDateAs14(datetime.datetime.utcnow())

        resultItems = []
        for item in items:
            pages = bs.search(item['title'])
            if not pages:
                # No search hit: fall back to the keyword's own stamp.
                resultPage = {}
                resultPage['added'] = item['added']
            else:
                resultPage = pages[0]
                resultPage['added'] = models.getUrlAdded(
                    resultPage['url'], nnow14)
            resultPage['keyword'] = item['title']
            resultPage['rank'] = item['rank']
            resultPage['keywordadded'] = item['added']
            resultItems.append(resultPage)

        contentHash = _calculateHash(resultItems)
        origin = payload['origin']
        if previousHash == contentHash:
            logging.info('No change fetch for %s.' % (origin, ))
            return

        responseData = {
            'origin': origin,
            'items': resultItems,
            'hash': contentHash,
        }
        success = networkutil.postData(
            callbackurl, responseData,
            trycount=_CALLBACK_TRYCOUNT, timeout=_URL_TIMEOUT)

        template = ('Push %s back to %s.' if success
                    else 'Failed to push %s back to %s.')
        message = template % (origin, callbackurl)
        logging.info(message)
        self.response.out.write(message)
Ejemplo n.º 8
0
def summarizeEvents(eventCriterion, scope, words, pages, twitterAccount):
    """Fold the given words into the stored events of ``scope`` and save.

    Existing events are loaded (or an empty structure is created), passed
    through ``_archiveEvents``, and then every word -- visited in reverse
    order -- is offered to ``_summarizeEvent``. For each word that yields
    an event, its matching pages are looked up and stored through
    ``_saveEventItem``. The event items end up ordered by word size
    descending, with ties ordered by 'updated' descending, before the
    events are stamped and saved.
    """
    exposePages = eventCriterion['expose.pages']
    events = models.getEvents(scope) or {
        'counter': 0,
        'items': [],
    }

    _archiveEvents(scope, events)

    stamp = dateutil.getDateAs14(datetime.datetime.utcnow())

    for word in reversed(words):
        event = _summarizeEvent(exposePages, scope, events, word, stamp)
        if not event:
            continue
        matcheds = globalutil.search(pages, word['keywords'])
        _saveEventItem(scope, event['id'], word, stamp, matcheds,
                       twitterAccount)

    def byUpdated(item):
        return item['updated']

    def byWordSize(item):
        return item['word']['size']

    # Two stable passes: the second (primary) sort keeps the recency
    # order of the first among items of equal size.
    events['items'].sort(key=byUpdated, reverse=True)
    events['items'].sort(key=byWordSize, reverse=True)
    events['updated'] = stamp
    models.saveEvents(scope, events)