Ejemplo n.º 1
0
def format_time(s, locale='zh-cn'):
    '''
    Parse a time value and return the result of _normalize_time() applied to
    the parsed datetime converted to utc.

    s: the value to parse.
        - falsy (None, '', 0): default_time(locale) is returned.
        - int / float: returned as timestamp2datetime(s * 1000).
        - string: matched against every pattern in _formatters, in order,
          after the timezone placeholder (_timezone_format) has been removed
          from the pattern.
    locale: picks the timezone assumed for strings without timezone info
        (CST_CN for 'zh-cn', utc otherwise); also forwarded to
        default_time() and _get_timezone().

    Returns None if no formatter can parse the string.
    '''
    if not s:
        return default_time(locale)
    if isinstance(s, (int, float)):
        # NOTE(review): presumably scales seconds to milliseconds -- confirm
        # against timestamp2datetime.
        return timestamp2datetime(s * 1000)
    for f in _formatters:
        f = re.sub(_timezone_format, '', f).strip()
        try:
            dt = datetime.strptime(s, f)
            # No exception means there is no timezone info in the string, so
            # the default timezone decided by locale has to be attached.
            now = datetime.utcnow()
            if dt.year == 1900 and dt.month == 1 and dt.day == 1:
                # strptime fills missing date fields with 1900-01-01:
                # treat a time-only string as "today".
                dt = dt.replace(year=now.year, month=now.month, day=now.day)
            elif dt.year == 1900:
                # Only the year was missing: assume the current year.
                dt = dt.replace(year=now.year)
            tz = CST_CN if locale == 'zh-cn' else utc
            dt = dt.replace(tzinfo=tz)
            return _normalize_time(dt.astimezone(utc))
        except ValueError as err:
            # The error text is handed to _get_timezone, which reports the
            # timezone (and its textual form) found in it, if any.
            zone, zone_str = _get_timezone('%s' % err, locale)
        if not (zone and zone_str):
            continue
        # Retry without the timezone text. Work on a copy so that the next
        # formatter still sees the original string.
        stripped = s.replace(zone_str, '').strip()
        try:
            dt = datetime.strptime(stripped, f)
        except ValueError:
            # This formatter matched only a prefix of the string; a later
            # formatter may still match, so keep looking instead of failing.
            continue
        dt = dt.replace(tzinfo=zone)
        return _normalize_time(dt.astimezone(utc))
    return None
Ejemplo n.º 2
0
def format_time(s, locale='zh-cn'):
    '''
    Parse a time value and return the result of _normalize_time() applied to
    the parsed datetime converted to utc.

    s: the value to parse.
        - falsy (None, '', 0): default_time(locale) is returned.
        - int / float: returned as timestamp2datetime(s * 1000).
        - string: matched against every pattern in _formatters, in order,
          after the timezone placeholder (_timezone_format) has been removed
          from the pattern.
    locale: picks the timezone assumed for strings without timezone info
        (CST_CN for 'zh-cn', utc otherwise); also forwarded to
        default_time() and _get_timezone().

    Returns None if no formatter can parse the string.
    '''
    if not s:
        return default_time(locale)
    if isinstance(s, (int, float)):
        # NOTE(review): presumably scales seconds to milliseconds -- confirm
        # against timestamp2datetime.
        return timestamp2datetime(s * 1000)
    for f in _formatters:
        f = re.sub(_timezone_format, '', f).strip()
        try:
            dt = datetime.strptime(s, f)
            # No exception means there is no timezone info in the string, so
            # the default timezone decided by locale has to be attached.
            now = datetime.utcnow()
            if dt.year == 1900 and dt.month == 1 and dt.day == 1:
                # strptime fills missing date fields with 1900-01-01:
                # treat a time-only string as "today".
                dt = dt.replace(year=now.year, month=now.month, day=now.day)
            elif dt.year == 1900:
                # Only the year was missing: assume the current year.
                dt = dt.replace(year=now.year)
            tz = CST_CN if locale == 'zh-cn' else utc
            dt = dt.replace(tzinfo=tz)
            return _normalize_time(dt.astimezone(utc))
        except ValueError as err:
            # The error text is handed to _get_timezone, which reports the
            # timezone (and its textual form) found in it, if any.
            zone, zone_str = _get_timezone('%s' % err, locale)
        if not (zone and zone_str):
            continue
        # Retry without the timezone text. Work on a copy so that the next
        # formatter still sees the original string.
        stripped = s.replace(zone_str, '').strip()
        try:
            dt = datetime.strptime(stripped, f)
        except ValueError:
            # This formatter matched only a prefix of the string; a later
            # formatter may still match, so keep looking instead of failing.
            continue
        dt = dt.replace(tzinfo=zone)
        return _normalize_time(dt.astimezone(utc))
    return None
Ejemplo n.º 3
0
def _should_publish(info):
    '''
    Should publish the given info, return two value: should_publish and is_content_incomplete
    '''
    info_id = info['_id']
    content_type = info.get('type', None)
    content = None
    if 'news' in info and 'content' in info['news']:
        content = info['news']['content']
    pub_date = info.get('pubDate', None)
    if content is None and pub_date is None:
        # do not filter if content and pub date not provided
        return True, False
    is_content_incomplete = False
    if content_type == TYPE_MAP['news']:
        # check content length, just check for news type
        if not isinstance(content, unicode):
            content = content.encode('utf-8')
        image_count = len(_CLEAN_IMG_RE.findall(content))
        content = _CLEAN_IMG_RE.sub('', content)
        length = len(content)
        is_content_incomplete = image_count == 0 and length < _INCOMPLETE_CONTENT_LENGTH_THRESHOLD
        if length < _CONTENT_LENGTH_THRESHOLD:
            _LOGGER.warn('[PublishAgent] do not publish %s because content too short: %s' % (info_id, length))
            return False, is_content_incomplete
    # check pub date, check for all content types
    from_time = datetime.datetime.utcnow() - datetime.timedelta(days=_PUBDATE_INTERVAL)
    pub_date = timestamp2datetime(pub_date)
    if pub_date < from_time:
        _LOGGER.warn('[PublishAgent] do not publish %s because too old, pub date: %s' % (info_id, pub_date.strftime('%Y-%m-%d %H:%M:%S')))
        return False, is_content_incomplete
    return True, is_content_incomplete
Ejemplo n.º 4
0
def _should_publish(info):
    '''
    Should publish the given info, return two value: should_publish and is_content_incomplete
    '''
    info_id = info['_id']
    content_type = info.get('type', None)
    content = None
    if 'news' in info and 'content' in info['news']:
        content = info['news']['content']
    pub_date = info.get('pubDate', None)
    if content is None and pub_date is None:
        # do not filter if content and pub date not provided
        return True, False
    is_content_incomplete = False
    if content_type == TYPE_MAP['news']:
        # check content length, just check for news type
        if not isinstance(content, unicode):
            content = content.encode('utf-8')
        image_count = len(_CLEAN_IMG_RE.findall(content))
        content = _CLEAN_IMG_RE.sub('', content)
        length = len(content)
        is_content_incomplete = image_count == 0 and length < _INCOMPLETE_CONTENT_LENGTH_THRESHOLD
        if length < _CONTENT_LENGTH_THRESHOLD:
            _LOGGER.warn(
                '[PublishAgent] do not publish %s because content too short: %s'
                % (info_id, length))
            return False, is_content_incomplete
    # check pub date, check for all content types
    from_time = datetime.datetime.utcnow() - datetime.timedelta(
        days=_PUBDATE_INTERVAL)
    pub_date = timestamp2datetime(pub_date)
    if pub_date < from_time:
        _LOGGER.warn(
            '[PublishAgent] do not publish %s because too old, pub date: %s' %
            (info_id, pub_date.strftime('%Y-%m-%d %H:%M:%S')))
        return False, is_content_incomplete
    return True, is_content_incomplete