def format_time(s, locale='zh-cn'):
    '''
    Receive a time value and return a datetime object parsed from it.

    - A falsy value returns default_time(locale).
    - An int/float is multiplied by 1000 and handed to timestamp2datetime
      (presumably converting seconds to milliseconds -- confirm against
      timestamp2datetime).
    - A string is tried against every pattern in _formatters, with the
      part matching _timezone_format removed from the pattern. The first
      pattern that parses wins and the result is converted to utc and
      passed through _normalize_time.

    If no formatter can parse the string, return False.
    '''
    if not s:
        return default_time(locale)
    if isinstance(s, (float, int)):
        return timestamp2datetime(s * 1000)

    for f in _formatters:
        f = re.sub(_timezone_format, '', f).strip()
        try:
            dt = datetime.strptime(s, f)
            # No exception means the string carries no timezone info, so
            # the default timezone decided by locale is attached below.
            now = datetime.utcnow()
            if dt.year == 1900 and dt.month == 1 and dt.day == 1:
                # strptime fills 1900-01-01 when the pattern has no date
                # fields at all: treat it as a time on the current day.
                dt = dt.replace(year=now.year, month=now.month, day=now.day)
            elif dt.year == 1900:
                # Only the year was missing from the pattern.
                dt = dt.replace(year=now.year)
            tz = CST_CN if locale == 'zh-cn' else utc
            dt = dt.replace(tzinfo=tz)
            return _normalize_time(dt.astimezone(utc))
        except ValueError as err:
            # The error text is handed to _get_timezone, which presumably
            # picks a timezone token out of strptime's leftover-data message.
            zone, zone_str = _get_timezone('%s' % err, locale)
            if not (zone and zone_str):
                continue
            # Work on a copy so a failed retry leaves the input untouched
            # for the remaining formatters.
            stripped = s.replace(zone_str, '').strip()
            try:
                dt = datetime.strptime(stripped, f)
            except ValueError:
                # Still not parseable with this pattern: try the next one
                # instead of letting the error escape the loop.
                continue
            dt = dt.replace(tzinfo=zone)
            return _normalize_time(dt.astimezone(utc))

    # Documented contract: nothing matched.
    return False
def format_time(s, locale='zh-cn'):
    '''
    Receive a time value and return a datetime object parsed from it.

    - A falsy value returns default_time(locale).
    - An int/float is multiplied by 1000 and handed to timestamp2datetime
      (presumably converting seconds to milliseconds -- confirm against
      timestamp2datetime).
    - A string is tried against every pattern in _formatters, with the
      part matching _timezone_format removed from the pattern. The first
      pattern that parses wins and the result is converted to utc and
      passed through _normalize_time.

    If no formatter can parse the string, return False.
    '''
    if not s:
        return default_time(locale)
    if isinstance(s, (float, int)):
        return timestamp2datetime(s * 1000)

    for f in _formatters:
        f = re.sub(_timezone_format, '', f).strip()
        try:
            dt = datetime.strptime(s, f)
            # No exception means the string carries no timezone info, so
            # the default timezone decided by locale is attached below.
            now = datetime.utcnow()
            if dt.year == 1900 and dt.month == 1 and dt.day == 1:
                # strptime fills 1900-01-01 when the pattern has no date
                # fields at all: treat it as a time on the current day.
                dt = dt.replace(year=now.year, month=now.month, day=now.day)
            elif dt.year == 1900:
                # Only the year was missing from the pattern.
                dt = dt.replace(year=now.year)
            tz = CST_CN if locale == 'zh-cn' else utc
            dt = dt.replace(tzinfo=tz)
            return _normalize_time(dt.astimezone(utc))
        except ValueError as err:
            # The error text is handed to _get_timezone, which presumably
            # picks a timezone token out of strptime's leftover-data message.
            zone, zone_str = _get_timezone('%s' % err, locale)
            if not (zone and zone_str):
                continue
            # Work on a copy so a failed retry leaves the input untouched
            # for the remaining formatters.
            stripped = s.replace(zone_str, '').strip()
            try:
                dt = datetime.strptime(stripped, f)
            except ValueError:
                # Still not parseable with this pattern: try the next one
                # instead of letting the error escape the loop.
                continue
            dt = dt.replace(tzinfo=zone)
            return _normalize_time(dt.astimezone(utc))

    # Documented contract: nothing matched.
    return False
def _should_publish(info): ''' Should publish the given info, return two value: should_publish and is_content_incomplete ''' info_id = info['_id'] content_type = info.get('type', None) content = None if 'news' in info and 'content' in info['news']: content = info['news']['content'] pub_date = info.get('pubDate', None) if content is None and pub_date is None: # do not filter if content and pub date not provided return True, False is_content_incomplete = False if content_type == TYPE_MAP['news']: # check content length, just check for news type if not isinstance(content, unicode): content = content.encode('utf-8') image_count = len(_CLEAN_IMG_RE.findall(content)) content = _CLEAN_IMG_RE.sub('', content) length = len(content) is_content_incomplete = image_count == 0 and length < _INCOMPLETE_CONTENT_LENGTH_THRESHOLD if length < _CONTENT_LENGTH_THRESHOLD: _LOGGER.warn('[PublishAgent] do not publish %s because content too short: %s' % (info_id, length)) return False, is_content_incomplete # check pub date, check for all content types from_time = datetime.datetime.utcnow() - datetime.timedelta(days=_PUBDATE_INTERVAL) pub_date = timestamp2datetime(pub_date) if pub_date < from_time: _LOGGER.warn('[PublishAgent] do not publish %s because too old, pub date: %s' % (info_id, pub_date.strftime('%Y-%m-%d %H:%M:%S'))) return False, is_content_incomplete return True, is_content_incomplete
def _should_publish(info): ''' Should publish the given info, return two value: should_publish and is_content_incomplete ''' info_id = info['_id'] content_type = info.get('type', None) content = None if 'news' in info and 'content' in info['news']: content = info['news']['content'] pub_date = info.get('pubDate', None) if content is None and pub_date is None: # do not filter if content and pub date not provided return True, False is_content_incomplete = False if content_type == TYPE_MAP['news']: # check content length, just check for news type if not isinstance(content, unicode): content = content.encode('utf-8') image_count = len(_CLEAN_IMG_RE.findall(content)) content = _CLEAN_IMG_RE.sub('', content) length = len(content) is_content_incomplete = image_count == 0 and length < _INCOMPLETE_CONTENT_LENGTH_THRESHOLD if length < _CONTENT_LENGTH_THRESHOLD: _LOGGER.warn( '[PublishAgent] do not publish %s because content too short: %s' % (info_id, length)) return False, is_content_incomplete # check pub date, check for all content types from_time = datetime.datetime.utcnow() - datetime.timedelta( days=_PUBDATE_INTERVAL) pub_date = timestamp2datetime(pub_date) if pub_date < from_time: _LOGGER.warn( '[PublishAgent] do not publish %s because too old, pub date: %s' % (info_id, pub_date.strftime('%Y-%m-%d %H:%M:%S'))) return False, is_content_incomplete return True, is_content_incomplete