def find_biggest_image(images=None):
    """
    find the biggest image in resolution from a list of images
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None

    try:
        biggest = None
        resolution_max = MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1]
        for image in images:
            if 'width' in image and 'height' in image:
                resolution_image = int(image['width']) * int(image['height'])
                if resolution_image > MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1]:
                    if resolution_image > resolution_max:
                        biggest = image
                        resolution_max = resolution_image
                else:
                    logger.info('Image [%s] is not big enough!'
                                % str(image['url']))
            else:
                logger.info('Height and width not found! %s' % str(image))
        return biggest
    except Exception as k:
        logger.error('Problem:[%s]\nSource:[%s]' % (str(k), str(images)))
        return None
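# A minimal usage sketch for find_biggest_image (assumption: MIN_IMAGE_SIZE is
# smaller than 640x480; the URLs below are placeholders, not real assets):
#
#     candidates = [
#         {'url': 'http://example.com/small.jpg', 'width': 120, 'height': 90},
#         {'url': 'http://example.com/large.jpg', 'width': 640, 'height': 480},
#     ]
#     find_biggest_image(candidates)
#     # -> {'url': 'http://example.com/large.jpg', 'width': 640, 'height': 480}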
def _collect_images(data=None, referer=None):
    """
    find all possible images
    1. image_list
    2. images in the new content
    """
    if not data:
        return None

    try:
        images = []
        # first try to find images in image_list
        if 'image_list' in data and data.get('image_list'):
            for image in data.get('image_list'):
                if 'src' in image and image['src']:
                    image_normalized = illustrator.find_image(
                        image['src'].strip(), referer)
                    if image_normalized:
                        images.append(image_normalized)

        # then try to find images in the content
        images_from_content, data['content'] = illustrator.find_images(
            data['content'], referer)
        if images_from_content:
            images.extend(images_from_content)

        # remove duplicated ones
        images = illustrator.dedup_images(images) if images else None
        return images, data
    except Exception as k:
        logger.error(str(k))
        return None
def _extract(data=None, referer=None):
    """
    extract images and text content
    """
    if not data:
        logger.error('Received no data from UCK server.')
        return None, None, None

    successful = int(data['STRUCT_PAGE_TYPE'])
    if successful == 0:
        logger.info('Cannot interpret the page! status != 1')
        return None, None, None

    try:
        # content
        content = data['content'].replace("\\", "")
        content = _sanitize(content, referer)

        # images
        images, data = _collect_images(data, referer)
        images = images if images else None

        # title
        title = None
        if 'title' in data:
            title = data['title']

        return title, content, images
    except Exception as k:
        logger.error(str(k))
        return None, None, None
def _find_keywords(self):
    """
    compute word-frequency map
    """
    try:
        words = self._segment_text(self._article)

        # remove stop words
        stopwords_path = '%s%s_stopwords' % (DATA_PATH, self._language)
        # ar, en, id, ja, pt, th, zh
        f = open(stopwords_path, 'r')
        stopwords = f.readlines()
        f.close()
        #stopwords = [stopword.strip() for stopword in stopwords if
        #             stopword.strip()]
        stopwords = [str(re.compile(r'[^\w ]', flags=re.UNICODE).sub(
            "", unicode(stopword.strip())))
            for stopword in stopwords if stopword.strip()]
        words_filtered = [word for word in words if word not in stopwords]

        # distinct words
        kwords = list(set(words_filtered))
        # word-frequency
        keywords = [(kword, words_filtered.count(kword)) for kword in kwords]
        keywords = sorted(keywords, key=lambda x: -x[1])
        return (keywords, len(words))
    except Exception as k:
        logger.error(str(k))
        return None
def __init__(self, url, language):
    """
    download the page, normalize its markup and extract title,
    short title, content and images
    """
    try:
        self.candidates = {}
        self.url = url
        self.language = language

        self.data = transcoder.prepare_link(self.url)
        self.data = self.regexps['replace_brs'].sub(
            "</p><p>", str(self.data))
        self.data = self.regexps['replace_fonts'].sub(
            "<\g<1>span>", str(self.data))

        self.html = BeautifulSoup(self.data.decode('utf-8', 'ignore'))
        self.article_image = None
        self._get_specific_image()
        self._remove_script()
        self._remove_style()
        self._remove_link()

        self.title = self._get_title()
        self.short_title = self._get_short_title()
        self.content = self._get_article()
        self.images = self._get_images()
    except Exception as k:
        logger.error(str(k))
def _find_top_keywords(self, keywords=None, words_count=None):
    """
    compute top-scored keywords
    """
    if not keywords or not words_count:
        logger.error("Method malformed!")
        return None

    try:
        #col = Collection(db, KEYWORD_REGISTRAR)
        top_keywords = keywords[:TOP_KEYWORDS_LIMIT]
        topwords = []
        for top_keyword in top_keywords:
            word = top_keyword[0]
            count = top_keyword[1]
            article_score = float(count) * 1.0 / float(words_count)
            word_score = article_score * 1.5
            topwords.append((word, word_score))
        topwords = sorted(topwords, key=lambda x: -x[1])
        return topwords
    except Exception as k:
        logger.error(str(k))
        return None
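# A hedged sketch of how _find_keywords and _find_top_keywords fit together
# (assumption: `teaser` is an instance of the class that owns both methods):
#
#     keywords, words_count = teaser._find_keywords()
#     # e.g. keywords == [('election', 12), ('vote', 7), ...], words_count == 350
#     top = teaser._find_top_keywords(keywords, words_count)
#     # each keyword is scored as (count / words_count) * 1.5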
def _get_actual_link(prefix=None, link=None):
    """
    find the actual news link
    """
    if not prefix or not link:
        logger.error('Method malformed! Prefix:[%s], Link:[%s]'
                     % (prefix, link))
        return None

    try:
        actual_link = None
        raw_data = urllib2.urlopen(link)
        data = raw_data.readlines()
        # str() is critical
        soup = BeautifulStoneSoup(str(data))
        html_tag, html_attrs = HIDDEN_LINKS[prefix]
        html_wrapper = soup.find(name=html_tag, attrs=html_attrs)
        if html_wrapper:
            actual_suffix = html_wrapper.find('a')['href']
            actual_link = str('%s%s' % (prefix, actual_suffix))
            return actual_link
        else:
            return None
    except Exception as k:
        logger.error('Cannot open %s' % k)
        return None
def _check_image(self, image_url=None, image_html=None):
    """
    Replace original image_url with downloaded local copy, if original
    image_url could not be reached without HEADERS
    """
    if not image_url:
        logger.error('Image URL is found VOID!')
        raise Exception('Image URL is found VOID!')
    if not image_html:
        logger.error('Image content is found VOID!')
        raise Exception('Image content is found VOID!')

    try:
        response = requests.get(image_url, timeout=UCK_TIMEOUT)
        if response.status_code > 400 or 'posttoday.com/media/content' in \
                image_url:
            raise Exception('Without HEADERS [%s] cannot be reached!'
                            % str(image_url))
    except Exception as k:
        logger.info('Problem:[%s] Source:[%s]' % (str(k), str(image_url)))
        # replace original image_url with downloaded local copy
        image_url_new = self._download_copy(image_url, image_html)
        return image_url_new if image_url_new else image_url

    # Image is accessible with/without HEADERS
    return image_url
def _find_redirected_link(url=None):
    """
    find the real link from redirection
    """
    if not url:
        logger.error('Link [%s] is not valid!' % url)
        return None

    IS_REDIRECTED = True
    counter = 0
    nurl = url
    while IS_REDIRECTED and counter < 10:
        resp = requests.get(nurl)
        purl = nurl
        nurl = resp.url
        if nurl == purl:
            IS_REDIRECTED = False
        else:
            counter = counter + 1

    if not IS_REDIRECTED:
        return nurl
    else:
        return None
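# Usage sketch (the short link below is a placeholder; the call performs real
# HTTP requests and follows at most 10 redirections):
#
#     _find_redirected_link('http://bit.ly/some-short-link')
#     # -> the final URL the requests end up on, or None after 10 hops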
def clean():
    """
    remove expired items from database
    """
    logger.info('... cleaning database ...')
    try:
        document_names = _find_document_names()
        if document_names:
            for document_name in document_names:
                document = Collection(db, document_name)

                # compute a threshold
                current_utc_time_posix = calendar.timegm(time.gmtime())
                deadline_datetime = datetime.utcfromtimestamp(
                    current_utc_time_posix) - timedelta(
                    days=DATABASE_REMOVAL_DAYS)
                deadline_posix = calendar.timegm(deadline_datetime.timetuple())

                removal_candidates = document.find(
                    {'updated': {'$lt': deadline_posix}})
                for removal_candidate in removal_candidates:
                    # see if removal candidate has a footage in memory
                    clean_memory.clean_by_item(str(removal_candidate['_id']))
                    # remove corresponding files on disk
                    clean_disk.clean_by_item(removal_candidate)
                    # remove the candidate in database
                    document.remove({'_id': removal_candidate['_id']})
            return True
        else:
            logger.error('Cannot find documents')
            return False
    except Exception as k:
        logger.error(str(k))
        return False
def _is_valid(content, language):
    """
    check if the content meets the need
    need: chinese/japanese - more than 40 words
    """
    if not content or not language:
        logger.error('Method malformed!')
        return False

    try:
        if isinstance(content, str):
            content = content.decode(
                chardet.detect(content)['encoding'], 'ignore')

        if language.startswith('zh') or language == 'ja':
            words = content
            if len(words) < PARAGRAPH_CRITERIA_KANJI:
                return False
        elif language.startswith('th'):
            words = content.split()
            if len(words) < PARAGRAPH_CRITERIA_THAI:
                return False
        else:
            words = content.split()
            if len(words) < PARAGRAPH_CRITERIA_LATIN:
                return False
        return True
    except Exception as k:
        logger.error(str(k))
        return False
def extract(language, title, content, summary, link, feed, category):
    """
    get the summary from the source, first paragraph or summary
    """
    if not content or not title or not language:
        logger.error('No data is found!')
        return None

    try:
        result_summary = ""
        # set the number of sentences
        # limit the number of words
        if content:
            if language in ['en', 'ja', 'pt', 'th']:
                teaser = PyTeaser(
                    language, title, content, link, feed, category)
                result_summary = teaser.summarize()

        # if summary from rss provider is found use summary, but limit
        # the number of words
        if not result_summary and summary:
            result_summary = _get_summary(summary, language)

        # else find first paragraph from transcoded, also limit the
        # number of words
        if not result_summary and content:
            result_summary = _get_first_paragraph(content, language)

        return result_summary
    except Exception as k:
        logger.error(str(k))
        return None
def _get_shorter_text(content, language, limit):
    """
    limit the text to the given number of words/characters
    """
    if not content or not language:
        logger.error('Method malformed!')
        return None

    try:
        # data should be processed as unicode, so
        if isinstance(content, str):
            content = content.decode(
                chardet.detect(content)['encoding'], 'ignore')

        # break text by sentence
        if language == 'zh' or language == 'ja':
            jp_sent_tokenizer = nltk.RegexpTokenizer('[^!?.!?。.]*[!?.!?。]*')
            sentences = jp_sent_tokenizer.tokenize(content)
        elif language == 'th':
            sentences = content.split()
        else:
            # supports latin-based and arabic
            sentences = nltk.sent_tokenize(content)

        enough_sentences = u""
        for sentence in sentences:
            # sentence is in unicode, len() then applies to CJK
            sentence = sentence.strip()
            if sentence:
                if len(enough_sentences) + len(sentence) + 1 <= limit:
                    enough_sentences = "%s %s" % (enough_sentences, sentence)
        return str(enough_sentences.strip())
    except Exception as k:
        logger.error(str(k))
        return None
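# Usage sketch (assumption: nltk's punkt data is available for sent_tokenize;
# the limit counts characters of the accumulated unicode string):
#
#     _get_shorter_text(u'First sentence. Second sentence.', 'en', 20)
#     # -> 'First sentence.'  (the second sentence no longer fits the limit)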
def _get_summary(content, language):
    """
    find out the first readable summary
    """
    if not content or not language:
        logger.error('Method malformed!')
        return None

    try:
        # strip off html code
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        h.ignore_emphasis = True
        h.body_width = 0

        paragraphs = (h.handle(content)).strip().strip(
            '#').strip().split("\n\n")
        paragraphs = [
            paragraph for paragraph in paragraphs if paragraph.strip()]

        for paragraph in paragraphs:
            if paragraph and _is_valid(paragraph, language):
                summary = _get_shorter_text(
                    paragraph, language, SUMMARY_LENGTH_LIMIT)
                if summary:
                    return summary
    except Exception as k:
        logger.error(str(k))
        return None
def _get_tts(entry=None, rand=None):
    """
    get tts from the provider
    """
    if not entry:
        logger.error('Method malformed!')
        return entry
    if not rand:
        # get a new rand
        rand = random.randint(0, 100000000)

    try:
        tts_relative_path = '%s_%s_%s_%i.mp3' % (
            entry['language'], entry['feed_id'], entry['updated'], rand)
        read_content = '%s. %s' % (
            entry['title'],
            entry['summary'] if entry.has_key('summary') and
            entry['summary'] else "")
        entry['mp3'], entry['mp3_local'] = tts_provider.google(
            entry['language'], read_content, tts_relative_path)
    except Exception as k:
        logger.error(str(k))
        entry['error'].append(str(k) + '\n')
        entry['mp3'] = None
        entry['mp3_local'] = None
    return entry
def save(feed_info=None):
    """
    add a new record of feed
    """
    if not feed_info:
        logger.error("Method malformed!")
        return None

    try:
        # if the collection does not exist, it will be created
        col = Collection(db, FEED_REGISTRAR)
        # make a record in the feeds table
        item = col.find_one(
            {'feed_link': feed_info['feed_link'],
             'language': feed_info['language']})
        if not item:
            feed_info['updated_times'] = 0
            feed_info['latest_update'] = None
            # the final return
            return str(col.save(feed_info))
        else:
            # the final return
            return str(item['_id'])
    except Exception as k:
        logger.error(str(k))
        return None
def convert(link):
    """
    send link to uck api and reformat the content
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    # send link to uck server and get data back
    try:
        raw_data = _transcode(link)
        if raw_data:
            # check if raw_data is syntax-correct
            try:
                eval(raw_data)
            except Exception:
                logger.info('Invalid syntax found for UCK output')
                return None, None, None

            # text is sanitized, images are found from image_list
            title, transcoded, images = _extract(eval(raw_data), link)
            return title, transcoded, images
        else:
            logger.info('Cannot read anything from UCK server')
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
def convert(link):
    """
    use burify's readability implementation to transcode a web page
    and return the transcoded page and images found in it
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        data = transcoder.prepare_link(link)
        if data:
            article = Document(data)
            if article:
                images, content = _collect_images(
                    article.summary(html_partial=False), link)
                return article.short_title(), content, images
            else:
                logger.info('Burify cannot recognize the data')
                return None, None, None
        else:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
def dedup_images(images=None):
    """
    remove duplicate images
    image: {'url':xxx, 'width':yyy, 'height':zzz}
    images = [image, image, image]
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None

    image_urls = []

    def _exists(image):
        """
        return boolean if image exists in the image_urls list
        """
        if image['url'] not in image_urls:
            image_urls.append(image['url'])
            return False
        else:
            return True

    try:
        return filter(lambda x: not _exists(x), images)
    except Exception as k:
        logger.info('Problem:[%s]\nSource:[%s]' % (str(k), str(images)))
        return None
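# Usage sketch (placeholder URLs; duplicates are detected by 'url' only, so
# the first occurrence wins):
#
#     dedup_images([
#         {'url': 'http://example.com/a.jpg', 'width': 100, 'height': 100},
#         {'url': 'http://example.com/a.jpg', 'width': 100, 'height': 100},
#         {'url': 'http://example.com/b.jpg', 'width': 200, 'height': 150},
#     ])
#     # -> [{'url': 'http://example.com/a.jpg', ...},
#     #     {'url': 'http://example.com/b.jpg', ...}]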
def _clean_article(self):
    """
    remove html tags, images, links from the article,
    and encode it appropriately
    """
    try:
        # convert to normal encoding
        self._article = str(
            urllib2.unquote(hparser.unescape(self._article)))

        # remove unnecessary parts
        html_stripper = html2text.HTML2Text()
        html_stripper.ignore_links = True
        html_stripper.ignore_images = True
        html_stripper.ignore_emphasis = True
        # body_width = 0 disables text wrapping
        html_stripper.body_width = 0
        self._article = html_stripper.handle(
            self._article).strip().strip("#").strip()

        # convert to appropriate encoding
        if isinstance(self._article, str):
            self._article = self._article.decode(
                chardet.detect(self._article)['encoding'], 'ignore')
    except Exception as k:
        logger.error(str(k))
        return None
def _clean_style(self, e):
    """
    remove class, id and style attributes from all tags under e
    """
    try:
        for elem in e.findAll(True):
            del elem['class']
            del elem['id']
            del elem['style']
    except Exception as k:
        logger.error(str(k))
def __init__(self, image_url=None, referer=None):
    if not image_url:
        logger.error('Method malformed!')
        raise Exception('Method malformed!')

    self._image_url, self._image_html = self._analyze(image_url, referer)
    self._image_size = self._calculate_size(self._image_html)
    self._clean_data()
def get_image(self):
    try:
        textimage_public_path = "%s%s" % (
            IMAGES_PUBLIC_DIR, self._textimage_relative_path)
        textimage = {'url': textimage_public_path,
                     'width': CATEGORY_IMAGE_SIZE[0],
                     'height': CATEGORY_IMAGE_SIZE[1]}
        return textimage
    except Exception as k:
        logger.error(str(k))
        return None
def run(self):
    try:
        if self.transcoder == 'simplr':
            self.result = eval(self.transcoder).convert(
                self.url, self.language)
        else:
            self.result = eval(self.transcoder).convert(self.url)
    except Exception as k:
        logger.error(str(k))
        self.result = None, None, None
def __init__(self, language=None, title=None, article=None, link=None,
             blog=None, category=None):
    if not language or not title or not article:
        logger.error('Method malformed!')

    self._language = language
    self._title = title
    self._article = article
    self._link = link
    self._blog = blog
    self._category = category
def _clean_zombies():
    """
    kill zombie processes, usually run semi-daily, or quasi-daily
    """
    logger.info('-----------------killing zombies--------------------')
    try:
        clean_process.clean()
        return True
    except Exception as k:
        logger.error(str(k))
        return False
def _download(language='en', query='Service provided by Baidu',
              tmp_file='do_not_exist.mp3'):
    """
    download the tts audio for query, segment by segment, into tmp_file

    other ways to write _download
    1. https://github.com/hungtruong/Google-Translate-TTS/blob/master
    /GoogleTTS.py
    2. https://github.com/gavinmh/tts-api/blob/master/text_segmenter.py
    """
    try:
        # break a long sentence/paragraph into google-acceptable length
        segments = _query_segment(language, query)

        # download chunks and write them to the output file
        threads = []
        if segments:
            for segment in segments:
                if segment:
                    logger.info('... Transmitting "%s"' % segment)
                    gt_request = GoogleTranslateAPI(language, segment)
                    threads.append(gt_request)
                    gt_request.start()
                    gt_request.join(GOOGLE_TTS_TIMEOUT)

            out = open(tmp_file, 'a')
            download_completed = True
            for th in threads:
                sys.stdout.flush()
                if th.result:
                    out.write(th.result)
                else:
                    download_completed = False
                    break
            out.close()

            if download_completed:
                return tmp_file
            else:
                logger.info('Download not completed, now removing the file')
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)
                return None
        else:
            # nothing generated from the query
            logger.error('Nothing generated from the query')
            return None
    except Exception as k:
        logger.error(
            'Part of tts download went wrong, now removing the file: %s'
            % str(k))
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        return None
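# Usage sketch (assumption: GOOGLE_TTS_TIMEOUT, GoogleTranslateAPI and
# _query_segment are defined in this module and network access is available;
# the output path below is a placeholder):
#
#     mp3_path = _download('en', 'Hello world, this is a test.',
#                          '/tmp/hello_world.mp3')
#     # -> '/tmp/hello_world.mp3' on success, None otherwise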
def _get_title(self):
    try:
        title = ''
        try:
            title = self.html.find('title').text
        except:
            pass
        return title
    except Exception as k:
        logger.error(str(k))
        return None
def _collect_images(content, referer):
    """
    find all images from the content
    """
    if not content:
        logger.error('Content/HTML found VOID!')
        return None

    images, content_new = illustrator.find_images(content, referer)
    if content_new and content_new != content:
        content = content_new
    return images, content
def _set_background(self):
    """
    set the image background
    """
    try:
        #self._image = Image.new("RGB", CATEGORY_IMAGE_SIZE,
        #                        self._background_color)
        self._image = Image.open("%s/home_bg.png" % DATA_PATH)
        self._draw = ImageDraw.Draw(self._image)
    except Exception as k:
        logger.error(str(k))
        return None
def update(entry=None, expiration=None):
    """
    add news and its attributes to memory
    """
    if not entry:
        logger.error('Method malformed!')
        return False

    try:
        # check if redis is alive
        rclient.ping()

        # simplify fields in entry to ones in field_list
        entry_reduced = {}
        for field in entry:
            if field in field_list:
                entry_reduced[field] = entry[field]

        # add a piece of news into memory
        rclient.set(entry_reduced['_id'], entry_reduced)
        # expiration in redis is counted in seconds
        expiration = MEMORY_EXPIRATION_DAYS * 24 * \
            60 * 60 if not expiration else expiration
        rclient.expire(entry_reduced['_id'], expiration)

        # add entry ids to the RSS list
        rclient.zadd(
            "news::%s::%s" % (entry_reduced['language'],
                              entry_reduced['feed']),
            entry_reduced['updated'], entry_reduced['_id'])

        # add entry ids to the label list
        col = Collection(db, FEED_REGISTRAR)
        item = col.find_one({'feed_title': entry_reduced['feed']},
                            {'labels': 1})
        if item and 'labels' in item and item['labels']:
            for label in item['labels']:
                # a label is a combination of country, category and label
                rclient.zadd(
                    'news::%s::%s' % (entry_reduced['language'], label),
                    entry_reduced['updated'], entry_reduced['_id'])

        # final return
        return True
    except ConnectionError:
        logger.critical('Redis is down!')
        return False
    except Exception as k:
        logger.error(str(k))
        return False
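# Usage sketch (assumption: `entry` carries at least the fields listed in
# field_list; the values below are placeholders):
#
#     entry = {'_id': '5300a0f2...', 'language': 'en', 'feed': 'BBC World',
#              'updated': 1392727055, 'title': '...', 'summary': '...'}
#     update(entry)                    # keep for MEMORY_EXPIRATION_DAYS
#     update(entry, expiration=3600)   # or keep for one hour only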
def _get_short_title(self):
    try:
        title = ''
        try:
            orig = self.html.find('title').text
            segmenter = tinysegmenter.TinySegmenter()

            # remove unnecessary parts
            for delimiter in [' | ', ' - ', ' :: ', ' / ']:
                if delimiter in orig:
                    parts = orig.split(delimiter)
                    if self.language.startswith(
                            'zh') or self.language == 'ja':
                        words_head = segmenter.tokenize(unicode(parts[0]))
                        words_tail = segmenter.tokenize(unicode(parts[-1]))
                        if len(words_head) >= 4:
                            title = parts[0]
                            break
                        elif len(words_tail) >= 4:
                            title = parts[-1]
                            break
                    else:
                        if len(parts[0].split()) >= 4:
                            title = parts[0]
                            break
                        elif len(parts[-1].split()) >= 4:
                            title = parts[-1]
                            break

            # no delimiter produced a usable title: try splitting on ': '
            if not title and ': ' in orig:
                parts = orig.split(': ')
                if self.language.startswith('zh') or self.language == 'ja':
                    words_tail = segmenter.tokenize(unicode(parts[-1]))
                    if len(words_tail) >= 4:
                        title = parts[-1]
                    else:
                        title = orig.split(': ', 1)[1]
                else:
                    if len(parts[-1].split()) >= 4:
                        title = parts[-1]
                    else:
                        title = orig.split(': ', 1)[1]

            # fall back to the full original title
            if not title:
                return orig
        except:
            pass
        return title
    except Exception as k:
        logger.error(str(k))
        return None
def convert(link):
    """
    call UCK's new interface to get title, images and content
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        title, content, images = _extract(link)
        return title, content, images
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
def _clean_article(self, content):
    try:
        self._clean_comments(content)
        # remove unwanted parts, by dirty hand
        self._clean_extra_parts(content)
        self._clean(content, 'h1')
        self._clean(content, 'blockquote')
        self._clean(content, 'object')
        self._clean_conditionally(content, "form")
        if len(content.findAll('h2')) == 1:
            self._clean(content, 'h2')
        self._clean(content, 'iframe')
        self._clean_conditionally(content, "table")
        self._clean_conditionally(content, "ul")
        self._clean_conditionally(content, "div")

        self._clean_style(content)
        self._fix_images_path(content)
        self._fix_links_path(content)

        content_string = content.renderContents(encoding='utf-8')
        if not content.find('img') and self.article_image:
            article_image_string = self.article_image.renderContents(
                encoding='utf-8')
            # if <img> is rendered to None
            if not article_image_string:
                article_image_string = self.article_image.parent \
                    .renderContents(encoding='utf-8')
            content_string = article_image_string + content_string

        content_string = self.regexps[
            'kill_breaks'].sub("<br />", content_string)
        return content_string
    except Exception as k:
        logger.error(str(k))
        return None
def _get_images(self):
    try:
        if self.content:
            # find_images normalizes images afterwards
            images, content_new = illustrator.find_images(
                self.content, self.url)
            if content_new and content_new != self.content:
                self.content = content_new
            return images
        else:
            return None
    except Exception as k:
        logger.error(str(k))
        return None
def _clean_data():
    """
    clean memory, database and files, usually run daily
    """
    logger.info('----------------------cleaning-------------------------')
    try:
        any_mistake = False

        # clean database
        if not clean_database.clean():
            logger.error('Error found cleaning database')
            any_mistake = True

        # clean memory
        if not clean_memory.clean():
            logger.error('Error found cleaning memory')
            any_mistake = True

        # clean disk
        if not clean_disk.clean():
            logger.error('Error found cleaning disk')
            any_mistake = True

        if not any_mistake:
            logger.info('Memory, Database & Disk got cleaned!')
            return True
        else:
            return False
    except Exception as k:
        logger.error(str(k))
        return False
def _preprocess(url):
    """
    get the real address out
    """
    if not url:
        logger.error('Method malformed!')
        return None

    try:
        last_http_index = url.rfind('http')
        return url[last_http_index:].strip()
    except Exception as k:
        logger.error(str(k))
        return None