def dedup_images(images=None):
    """
    remove duplicate images

    image: {'url': xxx, 'width': yyy, 'height': zzz}
    images = [image, image, image]
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None
    image_urls = []

    def _exists(image):
        """
        return True if the image's url is already in image_urls
        """
        if image['url'] not in image_urls:
            image_urls.append(image['url'])
            return False
        else:
            return True

    try:
        return filter(lambda x: not _exists(x), images)
    except Exception as k:
        logger.info('Problem:[%s]\nSource:[%s]' % (str(k), str(images)))
        return None

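# A usage sketch for dedup_images, assuming the surrounding module context
# (its logger) is set up; the sample data below is illustrative only.
sample_images = [
    {'url': 'http://a.com/1.jpg', 'width': 640, 'height': 480},
    {'url': 'http://a.com/1.jpg', 'width': 640, 'height': 480},
    {'url': 'http://a.com/2.jpg', 'width': 320, 'height': 240},
]
# only the first occurrence of each URL survives:
# dedup_images(sample_images)
# -> [{'url': 'http://a.com/1.jpg', ...}, {'url': 'http://a.com/2.jpg', ...}]
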
def find_biggest_image(images=None):
    """
    find the image with the biggest resolution in a list of images
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None
    try:
        biggest = None
        resolution_max = MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1]
        for image in images:
            if 'width' in image and 'height' in image:
                resolution_image = int(image['width']) * int(image['height'])
                if resolution_image > MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1]:
                    if resolution_image > resolution_max:
                        biggest = image
                        resolution_max = resolution_image
                else:
                    logger.info('Image [%s] is not big enough!' %
                                str(image['url']))
            else:
                logger.info('Height and width not found! %s' % str(image))
        return biggest
    except Exception as k:
        logger.error('Problem:[%s]\nSource:[%s]' % (str(k), str(images)))
        return None

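# A usage sketch for find_biggest_image under the same assumptions;
# MIN_IMAGE_SIZE is taken to be a (width, height) tuple, as the
# comparisons above imply.
# find_biggest_image(sample_images) returns the 640x480 entry, provided
# 640 * 480 exceeds MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1].
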
def _check_image(self, image_url=None, image_html=None):
    """
    replace the original image_url with a downloaded local copy if the
    original image_url cannot be reached without HEADERS
    """
    if not image_url:
        logger.error('Image URL is found VOID!')
        raise Exception('Image URL is found VOID!')
    if not image_html:
        logger.error('Image content is found VOID!')
        raise Exception('Image content is found VOID!')
    try:
        response = requests.get(image_url, timeout=UCK_TIMEOUT)
        if response.status_code > 400 or \
                'posttoday.com/media/content' in image_url:
            raise Exception(
                'Without HEADERS [%s] cannot be reached!' % str(image_url))
    except Exception as k:
        logger.info('Problem:[%s] Source:[%s]' % (str(k), str(image_url)))
        # replace the original image_url with a downloaded local copy
        image_url_new = self._download_copy(image_url, image_html)
        return image_url_new if image_url_new else image_url
    # image is accessible with/without HEADERS
    return image_url

def clean(): """ remove expired items from database """ logger.info('... cleaning database ...') try: document_names = _find_document_names() if document_names: for document_name in document_names: document = Collection(db, document_name) # compute a threshold current_utc_time_posix = calendar.timegm(time.gmtime()) deadline_datetime = datetime.utcfromtimestamp( current_utc_time_posix) - timedelta( days=DATABASE_REMOVAL_DAYS) deadline_posix = calendar.timegm(deadline_datetime.timetuple()) removal_candidates = document.find( {'updated': {'$lt': deadline_posix}}) for removal_candidate in removal_candidates: # see if removal candidate has a footage in memory clean_memory.clean_by_item(str(removal_candidate['_id'])) # remove corresponding files on disk clean_disk.clean_by_item(removal_candidate) # remove the candidate in database document.remove({'_id': removal_candidate['_id']}) return True else: logger.error('Cannot find documents') return False except Exception as k: logger.error(str(k)) return False
def dedup(entries=None, language=None):
    """
    return entries not yet found in the database
    """
    if not entries:
        logger.error('Method malformed!')
        return None
    if not language or language not in LANGUAGES:
        logger.error('Language not found or not supported!')
        return None
    try:
        entries_new = []
        col = Collection(db, language)
        for entry in entries:
            # a duplicate has the same link or the same title within a feed
            dup_link = col.find_one(
                {'link': entry['link'], 'feed': entry['feed']})
            if dup_link:
                logger.info('Found a duplicate for %s' % str(entry['title']))
                continue
            dup_title = col.find_one(
                {'title': entry['title'], 'feed': entry['feed']})
            if dup_title:
                logger.info('Found a duplicate for %s' % str(entry['title']))
                continue
            entries_new.append(entry)
        return entries_new if entries_new else None
    except Exception as k:
        logger.error(str(k))
        return None

def convert(link): """ use burify's readability implementation to transcode a web page and return the transcoded page and images found in it """ if not link: logger.error('Cannot transcode nothing!') return None, None, None try: data = transcoder.prepare_link(link) if data: article = Document(data) if article: images, content = _collect_images( article.summary(html_partial=False), link) return article.short_title(), content, images else: logger.info('Burify cannot recognize the data') return None, None, None else: logger.info('Cannot parse %s correctly' % link) return None, None, None except Exception as k: logger.error('%s for %s' % (str(k), str(link))) return None, None, None
def convert(link): """ send link to uck api and reformat the content """ if not link: logger.error('Cannot transcode nothing!') return None, None, None # send link to uck server and get data back try: raw_data = _transcode(link) if raw_data: # check if raw_data is syntax-correct try: eval(raw_data) except Exception: logger.info('Invalid syntax found for UCK output') return None, None, None # text is sanitized, images are found from image_list title, transcoded, images = _extract(eval(raw_data), link) return title, transcoded, images else: logger.info('Cannot read anything from UCK server') return None, None, None except Exception as k: logger.error('%s for %s' % (str(k), str(link))) return None, None, None
def _extract(data=None, referer=None):
    """
    extract images and text content
    """
    if not data:
        logger.error('Received no data from the UCK server.')
        return None, None, None
    successful = int(data['STRUCT_PAGE_TYPE'])
    if successful == 0:
        logger.info('Cannot interpret the page! status != 1')
        return None, None, None
    try:
        # content
        content = data['content'].replace("\\", "")
        content = _sanitize(content, referer)
        # images
        images, data = _collect_images(data, referer)
        images = images if images else None
        # title
        title = data['title'] if 'title' in data else None
        return title, content, images
    except Exception as k:
        logger.error(str(k))
        return None, None, None

def _clean_data():
    """
    clean memory, database and files; usually run daily
    """
    logger.info('----------------------cleaning-------------------------')
    try:
        any_mistake = False
        # clean database
        if not clean_database.clean():
            logger.error('Error found cleaning database')
            any_mistake = True
        # clean memory
        if not clean_memory.clean():
            logger.error('Error found cleaning memory')
            any_mistake = True
        # clean disk
        if not clean_disk.clean():
            logger.error('Error found cleaning disk')
            any_mistake = True

        if not any_mistake:
            logger.info('Memory, Database & Disk got cleaned!')
            return True
        else:
            return False
    except Exception as k:
        logger.error(str(k))
        return False

def _analyze(self, image_url=None, referer=None):
    """
    remove any CDN prefix and read the image data
    """
    if not image_url:
        logger.error('Method malformed!')
        raise Exception('Method malformed!')

    image_url = image_url.replace("\/", "/").strip()
    image_url = urllib2.unquote(hparser.unescape(image_url))
    # the name could be http://xxx.com/yyy--http://zzz.jpg
    # or http://xxx.com/yyy--https://zzz.jpg
    last_http_index = image_url.rfind('http')
    image_url = image_url[last_http_index:]

    response = None
    if referer:
        HEADERS['Referer'] = referer
    try:
        response = requests.get(
            image_url, headers=HEADERS, timeout=UCK_TIMEOUT)
        # avoid a redirected URL
        image_url = response.url
        # either an exception or a wrong HTTP code
        if response.status_code >= 400:
            raise Exception('Response code %s' % response.status_code)
    except Exception as k:
        logger.info('%s for %s' % (str(k), str(image_url)))
        try:
            # a CDN URL could be formed as http:/xxxx.jpg
            path = re.split('https?://?', image_url)[-1]
            scheme = requests.utils.urlparse(image_url).scheme
            image_url = '%s://%s' % (scheme, path)
            response = requests.get(
                image_url, headers=HEADERS, timeout=UCK_TIMEOUT)
            # avoid a redirected URL
            image_url = response.url
            if response.status_code >= 400:
                raise Exception('Response code %s' % response.status_code)
        except Exception as k:
            logger.error('%s for %s' % (str(k), str(image_url)))
            raise Exception('%s for %s' % (str(k), str(image_url)))

    if response and response.status_code < 400 and response.content:
        # GIF is not supported yet
        # pr = requests.utils.urlparse(image_url)
        # image_url_address = pr.netloc + pr.path
        # if image_url_address.lower().endswith('.gif'):
        #     raise Exception('GIF is not supported! %s' % str(image_url))
        image_html = response.content
        image_url = self._check_image(image_url, image_html)
        return str(image_url), str(image_html)
    else:
        logger.error('Cannot parse %s' % str(image_url))
        raise Exception('Cannot parse %s' % str(image_url))

def _download(language='en', query='Service provided by Baidu',
              tmp_file='do_not_exist.mp3'):
    """
    download TTS audio for the query, chunk by chunk, into tmp_file

    other ways to write _download:
    1. https://github.com/hungtruong/Google-Translate-TTS/blob/master/GoogleTTS.py
    2. https://github.com/gavinmh/tts-api/blob/master/text_segmenter.py
    """
    try:
        # break a long sentence/paragraph into google-acceptable segments
        segments = _query_segment(language, query)

        # download chunks and write them to the output file
        threads = []
        if segments:
            for segment in segments:
                if segment:
                    logger.info('... Transmitting "%s"' % segment)
                    gt_request = GoogleTranslateAPI(language, segment)
                    threads.append(gt_request)
                    gt_request.start()
                    gt_request.join(GOOGLE_TTS_TIMEOUT)

            out = open(tmp_file, 'a')
            download_completed = True
            for th in threads:
                sys.stdout.flush()
                if th.result:
                    out.write(th.result)
                else:
                    download_completed = False
                    break
            out.close()

            if download_completed:
                return tmp_file
            else:
                logger.info('Download not completed, now removing the file')
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)
                return None
        else:
            # nothing generated from the query
            logger.error('Nothing generated from the query')
            return None
    except Exception as k:
        logger.error(
            'Part of the TTS download went wrong, now removing the file: %s'
            % str(k))
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        return None

def _clean_zombies():
    """
    kill zombie processes; usually run semi-daily
    """
    logger.info('-----------------killing zombies--------------------')
    try:
        clean_process.clean()
        return True
    except Exception as k:
        logger.error(str(k))
        return False

def _transcode(link): """ send link to uck and get the data """ try: html = urllib2.urlopen( '%s%s' % (UCK_TRANSCODING_NEW, link), timeout=UCK_TIMEOUT).read() data = urllib2.unquote(hparser.unescape(html)) return data except Exception as k: logger.info('Problem:[%s] Source:[%s]' % (str(k), link)) return None
def _transcode(link): """ send link to uck server """ try: uck_url = '%s%s' % (UCK_TRANSCODING, link) # timeout set to UCK_TIMEOUT, currently html = urllib2.urlopen(uck_url, timeout=UCK_TIMEOUT).read() # free data from html encoding data = urllib2.unquote(hparser.unescape(html)) return data except Exception as k: logger.info('Problem:[%s] Source:[%s]' % (str(k), link)) return None
def clean(): """ interface to clean temporary and unrecorded files """ logger.info('... cleaning files on the disk ...') any_mistake = False if not _clean_unrecorded_files(): logger.error('Error found cleaning unrecorded files') any_mistake = True if not _clean_tempory_files(): logger.error('Error found cleaning temporary files') any_mistake = True if not any_mistake: return True else: return False
def convert(url, language):
    """
    an interface to expose Simplr
    """
    if not url:
        logger.error('Cannot transcode nothing!')
        return None, None, None
    try:
        readable = Simplr(url, language)
        if readable:
            return readable.short_title, readable.content, readable.images
        else:
            logger.info('Simplr cannot parse the data')
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(url)))
        return None, None, None

def clean(): """ remove expired items from queues in memory walk through all redis content """ logger.info('... cleaning memory ...') try: news_lists = rclient.keys('news::*') for news_list in news_lists: # get the total number of a news list news_list_count = rclient.zcard(news_list) # get all the ids in a news list if news_list_count: news_ids = rclient.zrange(news_list, 0, news_list_count) for news_id in news_ids: # make sure every item is touched if not rclient.exists(news_id): rclient.zrem(news_list, news_id) else: news_item_string = rclient.get(news_id) if news_item_string: news_item = eval(news_item_string) news_updated = float(news_item['updated']) # WTF, remove it if cleaner.is_overdue(news_updated): rclient.zrem(news_list, news_id) rclient.delete(news_id) else: # check if this is zombie if _is_zombie(news_item): rclient.zrem(news_list, news_id) rclient.delete(news_id) else: rclient.zrem(news_list, news_id) rclient.delete(news_id) else: logger.error('Nothing in the list') continue return True except Exception as k: logger.error(str(k)) return False
def google(language='en', query='Service provided by Baidu',
           relative_path='do_not_exist.mp3'):
    """
    1. download mp3 from the google tts api
    2. convert it to wav
    3. speed up the wav file, if necessary
    4. convert back to mp3
    5. store it in some location
    6. return the path
    """
    if not language or not query or not relative_path:
        logger.error('Method malformed!')
        return None, None
    if language not in LANGUAGES:
        logger.error('%s not supported!' % language)
        return None, None
    try:
        # generate the temporary mp3
        tmp_file = _download(language, query, '%s%s-tmp.mp3' % (
            MEDIA_TEMP_LOCAL_DIR, relative_path[:-4]))
        if tmp_file:
            # form paths
            tts_local_path = '%s%s' % (MEDIA_LOCAL_DIR, relative_path)
            tts_web_path = '%s%s' % (MEDIA_PUBLIC_DIR, relative_path)

            command = 'lame -S --decode {0} - | sox -q -t wav - -t wav - ' \
                      'speed 1.06 | lame -S - {1}; rm {0}'.format(
                          tmp_file, tts_local_path)
            subprocess.Popen(command, stderr=subprocess.PIPE, shell=True)
            logger.info('... MP3 acceleration is successfully completed!')
            return tts_web_path, tts_local_path
        else:
            logger.info('%s is revoked due to errors found in downloading!'
                        % relative_path)
            return None, None
    except Exception as k:
        logger.error(str(k))
        return None, None

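# Note that subprocess.Popen above returns immediately, so the success
# log fires before lame/sox actually finish. If completion needed to be
# verified, a blocking variant could look like this sketch (assumption:
# blocking is acceptable at the call site):
import subprocess

def _run_pipeline(command):
    """run a shell pipeline and report whether it exited cleanly"""
    proc = subprocess.Popen(command, stderr=subprocess.PIPE, shell=True)
    _, err = proc.communicate()
    return proc.returncode == 0, err
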
def _combine(content, images):
    """
    combine results from transcoders
    """
    if not content or not images:
        return content, images
    try:
        # for now, if there is more than one image, take only the biggest
        biggest = illustrator.find_biggest_image(images)
        if biggest:
            image_tag = '<img src="%s" width="%s" height="%s">'
            image = image_tag % (biggest['url'], str(biggest['width']),
                                 str(biggest['height']))
            return '%s %s' % (image, content), images
        else:
            logger.info('Cannot find biggest image')
            return content, biggest
    except Exception as k:
        logger.error(str(k))
        return content, images

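# A self-contained sketch of what _combine prepends when a biggest image
# is found (values illustrative):
biggest_demo = {'url': 'http://a.com/b.jpg', 'width': 800, 'height': 600}
image_demo = '<img src="%s" width="%s" height="%s">' % (
    biggest_demo['url'], biggest_demo['width'], biggest_demo['height'])
combined_demo = '%s %s' % (image_demo, '<p>article text</p>')
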
def _extract(link):
    """
    extract title, content and images
    """
    data_string = _transcode(link)
    if data_string:
        # syntax check
        try:
            eval(data_string)
        except Exception:
            logger.info('Invalid syntax found for New UCK output')
            return None, None, None
        data = eval(data_string)

        if int(data['status']) == 1:
            title = data['title'] \
                if 'title' in data and data['title'] else None
            content = data['content'] \
                if 'content' in data and data['content'] else None
            images, content = _collect_images(content, link)
            return title, content, images
        else:
            logger.info('UCK cannot parse the link: status != 1')
            return None, None, None
    else:
        logger.info('Got nothing from the UCK server')
        return None, None, None

def run(self):
    url = 'http://translate.google.com/translate_tts' \
          '?ie=UTF-8&oe=UTF-8&tl=%s&q=%s' % (
              self.language, urllib2.quote(self.text))
    # -A sets the User-Agent; stderr is piped so communicate() can
    # report curl errors
    response = subprocess.Popen(
        'curl --silent --connect-timeout %s -A Mozilla "%s"' % (
            GOOGLE_TTS_TIMEOUT, url),
        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        shell=True, close_fds=True)
    content, error = response.communicate()
    if not error and content:
        # a valid response is mp3 data, not an HTML error page
        if 'error' not in content and 'permission' not in content:
            self.result = content
        else:
            logger.error('Errors or Permission found in HTTP response')
            self.result = None
    else:
        if error:
            logger.error('Error %s found for %s' % (str(error), self.text))
        else:
            logger.info('No content returned for %s' % self.text)
        self.result = None

def find_images(content=None, referer=None):
    """
    find all images in the content, along with their size info
    """
    if not content:
        logger.error('Content/HTML is found VOID!')
        return None, content
    try:
        if isinstance(content, (str, unicode)):
            soup = BeautifulSoup(content.decode('utf-8', 'ignore'))
            normalized_images = []
            element_replaced = False
            for image in soup.findAll('img'):
                if image.get('src'):
                    normalized_image = find_image(image.get('src'), referer)
                    if normalized_image:
                        # replace the original image link with a clean
                        # (local) copy
                        if 'original_url' in normalized_image and \
                                normalized_image['original_url']:
                            image['src'] = str(normalized_image['url'])
                            element_replaced = True
                        normalized_images.append(normalized_image)
            content_new = soup.prettify(encoding='utf-8')
            if element_replaced and content_new:
                content = str(html_slimmer(
                    urllib2.unquote(hparser.unescape(content_new))))
            return normalized_images, content
        else:
            logger.info('Wrong format %s' % content)
            return None, content
    except Exception as k:
        logger.error('Problem [%s] Source [%s]' % (str(k), content))
        return None, content

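# A self-contained sketch of the <img> scan that find_images performs
# (BeautifulSoup 3 style, matching the findAll/NavigableString usage in
# this codebase):
from BeautifulSoup import BeautifulSoup

demo_soup = BeautifulSoup('<div><img src="http://a.com/x.jpg">'
                          '<img alt="no src"></div>')
demo_srcs = [img.get('src') for img in demo_soup.findAll('img')
             if img.get('src')]
# demo_srcs == ['http://a.com/x.jpg']; each src then goes through
# find_image() for normalization and size info
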
def prepare_link(url):
    """
    download the page and decode it with the correct encoding
    """
    if not url:
        logger.error('Method malformed!')
        return None
    try:
        resp = requests.get(url, timeout=UCK_TIMEOUT)
        html = resp.content if resp.ok else None
        if html:
            detected = chardet.detect(html)
            encoding = detected['encoding'] if detected else 'utf-8'
            # folha.uol.com.br misreports its encoding
            encoding = 'windows-1252' if 'folha.uol.com.br' in url \
                else encoding
            data = html.decode(encoding, 'ignore')
            return hparser.unescape(
                urllib2.unquote(data)).replace(u'\xa0', ' ')
        else:
            logger.warning('Cannot read %s' % url)
            return None
    except Exception as k:
        logger.info('Problem:[%s] Source:[%s]' % (str(k), url))
        return None

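# A self-contained sketch of the chardet-based decoding step used above:
import chardet

raw_demo = 'caf\xc3\xa9'                  # utf-8 encoded bytes
guess = chardet.detect(raw_demo)          # e.g. {'encoding': 'utf-8', ...}
text_demo = raw_demo.decode(guess['encoding'] or 'utf-8', 'ignore')
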
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
reload(sys)
sys.setdefaultencoding('UTF-8')

from newsman.config.settings import logger

# exercise every log level once
logger.error('error')
logger.info('info')
logger.exception('exception')
logger.critical('critical')
logger.warning('warning')

def _read_entry(e=None, feed_id=None, feed_title=None, language=None,
                categories=None):
    """
    read a specific entry item from a feed

    Note. categories are ids of category items
    """
    if not e or not feed_title or not language or not categories:
        logger.error('Method malformed!')
        return None
    if language not in LANGUAGES:
        logger.error('Language not supported for %s!' % feed_title)
        return None

    try:
        entry = {}
        entry['feed_id'] = feed_id
        entry['feed'] = feed_title.strip()
        entry['language'] = language.strip()
        entry['categories'] = categories
        # the easy part: the must-have
        entry['error'] = []

        # article original link
        if e.link:
            original_link = e.link.strip()
            if not original_link.startswith(AD_LINKS):
                # find the real link behind any redirection;
                # the sequence of the following two steps is IMPORTANT!
                original_link = _find_redirected_link(original_link)
                # find the link hidden behind a known prefix
                matched_prefix = [link for link in HIDDEN_LINKS
                                  if original_link.startswith(link)]
                found_prefix = matched_prefix[0] if matched_prefix else None
                if found_prefix:
                    actual_link = _get_actual_link(
                        found_prefix, original_link)
                    if actual_link:
                        entry['link'] = actual_link
                    else:
                        logger.error(
                            'No actual link found for %s!' % original_link)
                        return None
                else:
                    entry['link'] = original_link
            else:
                logger.info('Advertising link %s' % original_link)
                return None
        else:
            logger.info('Feed malformed! No link found!')
            return None

        # article title
        if e.title_detail.type != 'text/plain':
            entry['title'] = urllib2.unquote(
                hparser.unescape(e.title.strip()))
        elif 'title' in e:
            entry['title'] = e.title.strip()
        else:
            entry['title'] = None
        # remove a possibly htmlized title
        entry['title'] = re.sub('<.*?>', ' ', entry['title']) \
            if 'title' in entry and entry['title'] else None

        # article published time
        # first try the parsed time info
        try:
            entry['updated'] = calendar.timegm(e.updated_parsed)
            entry['updated_human'] = e.updated
        except AttributeError as k:
            try:
                entry['updated'] = calendar.timegm(e.published_parsed)
                entry['updated_human'] = e.published
            except AttributeError as k:
                entry['error'] = ['%s\n%s' % (
                    entry['error'],
                    "no 'updated_parsed' or 'published_parsed'")]
                # then try the unparsed time info; this rarely happens
                try:
                    updated = e.updated if 'updated' in e else e.published
                    if updated:
                        # get the time zone
                        offset = int(updated[-5:])
                        delta = timedelta(hours=int(offset) / 100)
                        format = '%a, %d %b %Y %H:%M:%S'
                        if updated[-8:-5] != 'UTC':
                            updated = datetime.strptime(
                                updated[:-6], format)
                        else:
                            updated = datetime.strptime(
                                updated[:-9], format)
                        updated -= delta
                        entry['updated'] = time.mktime(updated.timetuple())
                    else:
                        logger.info(
                            'Attribute updated/published has no value')
                        return None
                except ValueError as k:
                    logger.info(str(k))
                    entry['error'].append('%s\n%s' % (entry['error'], k))
                    return None
                except AttributeError as k:
                    logger.info(str(k))
                    entry['error'].append('no update or published\n')
                    return None

        # article summary
        try:
            # the summary could be html-based
            summary = urllib2.unquote(hparser.unescape(e.summary))
            if isinstance(summary, str):
                summary_encoding = chardet.detect(summary)['encoding']
                summary = summary.decode(summary_encoding, 'ignore')
            # e.g. a <div and a </div
            is_html = len(re.findall(
                u'</?a|</?p|</?strong|</?img|</?html|</?div', summary)) > 1
            if is_html:
                h = html2text.HTML2Text()
                h.ignore_images = True
                h.ignore_links = True
                h.ignore_emphasis = True
                paragraphs = (h.handle(summary)).strip().strip(
                    '#').strip().split('\n\n')
                paragraphs_above_limit = []
                # drop paragraphs that contain too few words
                for paragraph in paragraphs:
                    if entry['language'].startswith('zh') or \
                            entry['language'] == 'ja':
                        if len(paragraph) > 18:
                            paragraphs_above_limit.append(paragraph)
                    else:
                        words = paragraph.split()
                        if len(words) > 12:
                            paragraphs_above_limit.append(paragraph)
                entry['summary'] = '\n\n'.join(paragraphs_above_limit)
            else:
                entry['summary'] = summary
        except AttributeError as k:
            entry['summary'] = None
        entry['summary'] = None if not entry['summary'] else entry['summary']

        # article images
        # e.g. [{'url': 'http://image.com/tests.jpg', 'width': u'130',
        #        'height': u'86'}]
        entry['images'] = []
        try:
            images, media_content_new = illustrator.find_images(
                e.media_content, entry['link'])
            if images:
                entry['images'].extend(images)
        except AttributeError as k:
            pass
        try:
            images, media_content_new = illustrator.find_images(
                e.media_thumbnail, entry['link'])
            if images:
                entry['images'].extend(images)
        except AttributeError as k:
            pass
        for attribute in e:
            if 'thumbnail' in attribute:
                # currently set thumbnail to None if it is a dictionary
                image = e[attribute] if isinstance(e[attribute], str) \
                    else None
                image = illustrator.find_image(image, entry['link'])
                if image:
                    entry['images'].append(image)
        try:
            links = e.links
            for link in links:
                if 'type' in link and 'image' in link.type:
                    if 'href' in link:
                        image = illustrator.find_image(
                            link.href, entry['link'])
                        if image:
                            entry['images'].append(image)
        except AttributeError as k:
            pass
        if 'summary' in entry and entry['summary']:
            images, entry['summary'] = illustrator.find_images(
                entry['summary'], entry['link'])
            if images:
                entry['images'].extend(images)
        # image dedup is processed at rss.py

        # article author, e.g. Yuan Jin
        try:
            # this could be a string or a list
            entry['author'] = e.author
        except AttributeError as k:
            entry['author'] = None

        # article source
        # e.g. {'href': u'http://www.reuters.com/', 'title': u'Reuters'}
        try:
            entry['source'] = e.source
        except AttributeError as k:
            entry['source'] = None

        # article tags
        # e.g. [{'term': u'Campus Party', 'scheme': None, 'label': None}]
        # term is usually combined with scheme to form a url; label is
        # the name of the term
        try:
            entry['tags'] = e.tag
        except AttributeError as k:
            entry['tags'] = None

        # the FINAL return
        return entry
    except Exception as k:
        logger.error(str(k))
        return None

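# A self-contained sketch of the manual timezone handling _read_entry
# falls back on for unparsed dates such as
# 'Tue, 06 Aug 2013 10:20:30 +0200':
import time
from datetime import datetime, timedelta

updated_demo = 'Tue, 06 Aug 2013 10:20:30 +0200'
offset_demo = int(updated_demo[-5:])             # '+0200' -> 200
delta_demo = timedelta(hours=offset_demo / 100)  # two hours
parsed_demo = datetime.strptime(
    updated_demo[:-6], '%a, %d %b %Y %H:%M:%S') - delta_demo
posix_demo = time.mktime(parsed_demo.timetuple())
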
def parse(feed_link=None, feed_id=None, feed_title=None, language=None,
          categories=None, etag=None, modified=None):
    """
    read rss/atom data from a given feed

    feed_id is the feed ObjectId in MongoDB
    etag and modified are used to save the rss http server's bandwidth

    Note: category should be added to the feed table/database
    """
    if not feed_link or not feed_id or not language or not categories:
        logger.error('Method malformed!')
        return None, None, feed_title, etag, modified, 'Method malformed!'
    if language not in LANGUAGES:
        logger.error('Language not supported for %s!' % feed_link)
        return None, None, feed_title, etag, modified, \
            'Language not supported for %s!' % feed_link

    def _validate_time(entry):
        """
        check that the entry's updated time is not older than needed
        """
        deadline = datetime.utcfromtimestamp(
            entry['updated']) + timedelta(days=DATABASE_REMOVAL_DAYS)
        return True if deadline > datetime.now() else False

    try:
        # variables d and e follow the feedparser tradition
        feedparser.USER_AGENT = 'newsman'
        d = feedparser.parse(feed_link, etag=etag, modified=modified)
        if d:
            # http://pythonhosted.org/feedparser/reference-status.html
            # http://pythonhosted.org/feedparser/http-etag.html#http-etag
            status = d.status if 'status' in d else None
            if status == 301:
                logger.critical('%s has been permanently moved to %s!' %
                                (feed_link, d.href))
                return None, status, feed_title, etag, modified, \
                    '%s has been permanently moved to %s!' % (
                        feed_link, d.href)
            elif status == 304:
                logger.warning(
                    '%s server has not updated its feeds' % feed_link)
                return None, status, feed_title, etag, modified, \
                    '%s server has not updated its feeds' % feed_link
            elif status == 410:
                logger.critical(
                    '%s is gone! Admin should check the feed availability!'
                    % feed_link)
                return None, status, feed_title, etag, modified, \
                    '%s is gone! Admin should check the feed availability!' \
                    % feed_link
            elif status == 200 or status == 302:
                # no need to worry
                if status == 302:
                    logger.info('%s url has been temporarily moved to a '
                                'new place' % feed_link)

                if not feed_title:
                    # if no title were found in the feed, an AttributeError
                    # would be raised
                    feed_title = urllib2.unquote(
                        hparser.unescape(d.feed.title)).strip()
                else:
                    feed_title = feed_title.strip()
                    if 'title' in d.feed:
                        feed_title_latest = urllib2.unquote(
                            hparser.unescape(d.feed.title)).strip()
                        if feed_title != feed_title_latest:
                            # the feed title changed
                            logger.info('%s title changed! Please update '
                                        'feed table/database' % feed_link)
                            logger.info('old title: %s' % feed_title)
                            logger.info('new title: %s' % feed_title_latest)
                            # feed_title = feed_title_latest
                    else:
                        logger.info('%s[%s] has no title in its latest RSS'
                                    % (feed_title, feed_link))

                # update etag/modified
                etag = None
                modified = None
                try:
                    etag = d.etag
                except AttributeError:
                    try:
                        modified = d.modified
                    except AttributeError:
                        pass

                if 'entries' in d:
                    language = language if 'language' not in d else d.language
                    # an Exception might be raised from _read_entry
                    entries = []
                    logger.error('%s begins processing' % feed_title)
                    for i, e in enumerate(d.entries):
                        if e:
                            entry = _read_entry(
                                e, feed_id, feed_title, language, categories)
                            if entry:
                                entries.append(entry)
                            else:
                                logger.info('Cannot parse %s' % e['link'])
                                continue
                        else:
                            logger.info(
                                'No information found for %s-th entry' % i)
                            continue
                    if entries:
                        # the FINAL return; the last value indicates that
                        # nothing wrong happened in parsing
                        return filter(_validate_time, entries), status, \
                            feed_title, etag, modified, 'OK'
                    else:
                        logger.info('Feed parsing goes wrong!')
                        return None, status, feed_title, etag, modified, \
                            'Feed parsing goes wrong!'
                else:
                    logger.info('Feed %s has no items!' % feed_id)
                    return None, status, feed_title, etag, modified, \
                        'Feed %s has no items!' % feed_id
            else:
                logger.info('HTTP Error Code [%s] for %s' %
                            (status, feed_link))
                return None, status, feed_title, etag, modified, \
                    'HTTP Error Code [%s] for %s' % (status, feed_link)
        else:
            logger.info('Cannot parse %s correctly!' % feed_id)
            return None, None, feed_title, etag, modified, \
                'Cannot parse %s correctly!' % feed_id
    except Exception as k:
        logger.exception('%s for %s' % (str(k), feed_id))
        return None, None, feed_title, etag, modified, '%s for %s' % (
            str(k), feed_id)

def scale_image(image=None, referer=None, size_expected=MIN_IMAGE_SIZE,
                resize_by_width=True, crop_by_center=True,
                relative_path=None):
    """
    resize an image as requested

    resize_by_width: resize the image according to its width (True)
                     or height (False)
    crop_by_center: crop the image from its center (True) or from
                    point (0, 0) (False)
    """
    if not image:
        logger.error('Image not found!')
        return None, None
    if not size_expected:
        logger.error('Expected image size not found!')
        return None, None
    if not relative_path:
        logger.error('Relative path for saving image not found!')
        return None, None

    image_url = image_size = None
    try:
        image_url = image['url']
        image_size = image['width'], image['height']
    except Exception:
        logger.error('Image [%s] is malformed!' % str(image))
        return None, None
    if not image_url:
        logger.error('Image URL not found!')
        return None, None
    if not image_size:
        logger.error('Expected image size not found!')
        return None, None

    try:
        width = int(image_size[0])
        height = int(image_size[1])
        width_expected = int(size_expected[0])
        height_expected = int(size_expected[1])

        if width >= width_expected and height >= height_expected:
            if resize_by_width:
                height_new = width_expected * height / width
                width_new = width_expected
            else:
                width_new = height_expected * width / height
                height_new = height_expected

            # larger-and-equal-than is important here
            if width_new >= width_expected and height_new >= height_expected:
                # resize
                size_new = width_new, height_new
                image_data = None
                try:
                    if referer:
                        HEADERS['Referer'] = referer
                    response = requests.get(
                        image_url, headers=HEADERS, timeout=UCK_TIMEOUT)
                    image_data = Image.open(StringIO(response.content))
                except Exception as k:
                    logger.info('Problem:[%s]\nSource:[%s]' % (
                        str(k), str(image_url)))
                    return None, None

                # resize the image according to the new size
                image_format = image_data.format.lower() \
                    if image_data and image_data.format else 'jpg'
                image_data.thumbnail(size_new, Image.ANTIALIAS)

                # crop out the unnecessary part
                if crop_by_center:
                    left = (width_new - width_expected) / 2
                    top = (height_new - height_expected) / 2
                    right = (width_new + width_expected) / 2
                    bottom = (height_new + height_expected) / 2
                else:
                    left = 0
                    top = 0
                    right = width_expected
                    bottom = height_expected
                image_cropped = image_data.crop((left, top, right, bottom))

                # save to disk
                if image_cropped:
                    image_web_path = '%s%s.%s' % (
                        IMAGES_PUBLIC_DIR, relative_path, image_format)
                    image_local_path = '%s%s.%s' % (
                        IMAGES_LOCAL_DIR, relative_path, image_format)
                    image_cropped = image_cropped.convert('RGB')
                    image_cropped.save(image_local_path, image_format)
                    # clean data
                    del image_cropped
                    del image_data
                    return {'url': image_web_path,
                            'width': width_expected,
                            'height': height_expected}, \
                           {'url': image_local_path,
                            'width': width_expected,
                            'height': height_expected}
                else:
                    # clean data
                    if image_data:
                        del image_data
                    return None, None
            else:
                # try resizing along the other dimension
                return scale_image(image=image, referer=referer,
                                   size_expected=size_expected,
                                   resize_by_width=not resize_by_width,
                                   crop_by_center=crop_by_center,
                                   relative_path=relative_path)
        else:
            return None, None
    except Exception as k:
        logger.info('Problem:[%s]\nSource:[%s]' % (str(k), str(image_url)))
        return None, None

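# A self-contained sketch of the resize-then-center-crop that scale_image
# performs (Pillow/PIL; the file names are illustrative):
from PIL import Image

demo_img = Image.open('input.jpg')               # say, 800x600
width_new, height_new = 200, 150                 # same aspect ratio
demo_img.thumbnail((width_new, height_new), Image.ANTIALIAS)
width_expected, height_expected = 180, 135
left = (width_new - width_expected) / 2
top = (height_new - height_expected) / 2
box = (left, top, left + width_expected, top + height_expected)
demo_img.crop(box).convert('RGB').save('output.jpg', 'JPEG')
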
def _sanitize(content=None, referer=None):
    """
    modify UCK content to suit news needs
    """
    if not content:
        return None
    try:
        soup = BeautifulSoup(content.decode('utf-8', 'ignore'))

        # remove all <span>
        for span in soup.findAll('span'):
            span.extract()

        # sanitize <a>
        for a in soup.findAll('a'):
            img = a.find('img')
            if img:
                a.replaceWith(img)
            else:
                # it might be a simple href
                a.replaceWith(a.text)

        # remove img prefix
        for img in soup.findAll('img'):
            img_source = img.get('src')
            if img_source:
                img_tuple = img_source.rpartition('src=')
                img['src'] = img_tuple[2]
                # call NormalizedImage
                width = height = None
                try:
                    ni = NormalizedImage(img['src'], referer)
                    width, height = ni.get_image_size()
                except Exception as k:
                    logger.info('Problem [%s] for Source [%s]' % (
                        str(k), str(img['src'])))
                    continue
                if 480 <= width:
                    img['width'] = '100%'
                    img['height'] = 'auto'

        # clear away useless styles
        for style in soup.findAll('div', style='border-top:none;'):
            img = style.find('img')
            if not img:
                if not style.find('p'):
                    style.extract()
            else:
                style.replaceWith(img)

        # remove navigable strings and <div>s
        for component in soup.contents:
            if isinstance(component, NavigableString):
                if len(component.string.split()) < 10:
                    component.extract()
            elif isinstance(component, Tag):
                if component.name == 'div':
                    if not component.find('p'):
                        component.extract()

        # drop content that consists of images only
        img_count = 0
        for item in soup.contents:
            if isinstance(item, Tag) and item.name == 'img':
                img_count += 1
        if img_count == len(soup.contents):
            return None
        else:
            return ''.join([str(item) for item in soup.contents])
    except Exception as k:
        logger.error(str(k))
        return None

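# A self-contained sketch of the tag surgery above: <span> removal and
# replacing <a> wrappers with their inner <img> (BeautifulSoup 3):
from BeautifulSoup import BeautifulSoup

sanitize_demo = BeautifulSoup(
    '<p><span>x</span><a href="#"><img src="i.jpg"></a></p>')
for span in sanitize_demo.findAll('span'):
    span.extract()
for a in sanitize_demo.findAll('a'):
    img = a.find('img')
    a.replaceWith(img if img else a.text)
# str(sanitize_demo) -> '<p><img src="i.jpg" /></p>'
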
def _value_added_process(entries=None, language=None,
                         transcoder_type='chengdujin'):
    """
    add more value to an entry:
    tts, transcode, images, redis_entry_expiration,
    database_entry_expiration
    """
    if not entries:
        logger.error('Method malformed!')
        return None
    if not language or language not in LANGUAGES:
        logger.error('Language not found or not supported!')
        return None

    updated_entries = []
    for i, entry in enumerate(entries):
        try:
            logger.info('... Working on %i of %d ...' % (i + 1, len(entries)))
            logger.info(entry['title'])
            logger.info(entry['link'])

            # [MUST-HAVE] transcoding
            # get a random int from 100 million possibilities
            rand = random.randint(0, 100000000)
            transcoded_relative_path = '%s_%s_%s_%i' % (
                entry['language'], entry['feed_id'], entry['updated'], rand)
            # high chances the transcoder cannot work properly
            entry['transcoded'], entry['transcoded_local'], \
                raw_transcoded_content, images_from_transcoded = \
                transcoder.convert(
                    entry['language'], entry['title'], entry['link'],
                    entry['updated'], entry['feed'], transcoder_type,
                    transcoded_relative_path)

            if entry['transcoded']:
                # [OPTIONAL] summary
                if entry['summary'] or raw_transcoded_content:
                    entry['summary'] = summarizer.extract(
                        entry['language'], entry['title'],
                        str(raw_transcoded_content), entry['summary'],
                        entry['link'], entry['feed'],
                        '*|*'.join(entry['categories']))
                # entry['summary'] = entry['summary'] if 'summary' in entry
                # and entry['summary'] else None

                # [OPTIONAL] images
                # process images found in the transcoded data
                if images_from_transcoded:
                    # images from transcoded are already normalized
                    entry['images'].extend(images_from_transcoded)

                # remove duplicated images
                images_deduped = illustrator.dedup_images(
                    entry['images']) if 'images' in entry and entry[
                    'images'] else None
                # be cautious: dedup_images might return None if the
                # network sucks
                if images_deduped:
                    entry['images'] = images_deduped
                entry['images'] = entry['images'] if 'images' in entry and \
                    entry['images'] else None

                # [OPTIONAL] generate 3 types of images: thumbnail,
                # category image and hot news image
                if 'images' in entry and entry['images']:
                    biggest = illustrator.find_biggest_image(entry['images'])
                    if biggest:
                        entry = _generate_images(biggest, entry, rand)
                        # for older version users
                        entry['image'] = entry['thumbnail_image'][
                            'url'] if 'thumbnail_image' in entry and entry[
                            'thumbnail_image'] else None

                # [OPTIONAL] text image
                # if no category_image is found, generate a text-image
                if 'category_image' not in entry or not entry[
                        'category_image']:
                    image_relative_path = '%s_%s_%s_%i' % (
                        entry['language'], entry['feed_id'],
                        entry['updated'], rand)
                    try:
                        text_img = text2img.Text2Image(
                            language, entry['title'],
                            '%s_textimage.png' % image_relative_path)
                        entry['text_image'] = text_img.get_image()
                    except Exception as k:
                        logger.error(
                            'Problem [%s] generating text2image for [%s]' % (
                                str(k), entry['link']))

                # [OPTIONAL] google tts, not for indonesian
                if entry['language'] != 'in':
                    # _get_tts never returns None;
                    # at worst the original entry is returned
                    entry = _get_tts(entry, rand)

                # [MUST-HAVE] add expiration data
                def _expired(updated, days_to_deadline):
                    """
                    compute expiration information
                    return a time string
                    """
                    deadline = datetime.utcfromtimestamp(
                        updated) + timedelta(days=days_to_deadline)
                    return time.asctime(
                        time.gmtime(calendar.timegm(deadline.timetuple())))

                entry['memory_expired'] = _expired(
                    entry['updated'], MEMORY_EXPIRATION_DAYS)
                entry['database_expired'] = _expired(
                    entry['updated'], DATABASE_REMOVAL_DAYS)

                # [OPTIONAL] if logger is used, this could be removed
                entry['error'] = entry['error'] if 'error' in entry and \
                    entry['error'] else None

                # [MUST-HAVE] update new entry to db_news
                # each entry is added with _id
                # keep the original entry around: db_news.update returns
                # None on failure and the cleanup below still needs it
                entry_updated = db_news.update(entry)
                if entry_updated:
                    entry = entry_updated
                    # [MUST-HAVE] store in memory
                    result = memory.update(entry)
                    if result:
                        updated_entries.append(entry)
                    else:
                        logger.error('Error found in updating memory')
                        # remove entry in database
                        if clean_database.clean_by_item(entry):
                            logger.info(
                                'Cleaned %s in database' % entry['title'])
                        else:
                            logger.error('Error cleaning %s in database' %
                                         entry['title'])
                        # remove entry-created files on disk
                        if clean_disk.clean_by_item(entry):
                            logger.info('Cleaned %s on disk' % entry['title'])
                        else:
                            logger.error(
                                'Error cleaning %s on disk' % entry['title'])
                        continue
                else:
                    logger.error('Error found in updating to news database')
                    # remove entry-created files on disk
                    if clean_disk.clean_by_item(entry):
                        logger.info('Cleaned %s on disk' % entry['title'])
                    else:
                        logger.error(
                            'Error cleaning %s on disk' % entry['title'])
                    continue
            else:
                logger.info('Error found in transcoding')
                continue
        except Exception as k:
            logger.error(str(k))
            continue

    # the FINAL return
    if updated_entries:
        return True
    else:
        logger.info('No entry got value added!')
        return False
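# A standalone sketch of the expiration arithmetic inside _expired above:
# a posix 'updated' stamp plus N days, rendered back as a UTC time string.
# The timestamp and day count below are made up for illustration.
import calendar
import time
from datetime import datetime, timedelta

updated = 1370000000  # hypothetical posix timestamp, interpreted as UTC
days_to_deadline = 7  # e.g. MEMORY_EXPIRATION_DAYS
deadline = datetime.utcfromtimestamp(updated) + timedelta(
    days=days_to_deadline)
print(time.asctime(time.gmtime(calendar.timegm(deadline.timetuple()))))
# roughly: 'Fri Jun  7 11:33:20 2013' -- 7 days after 'updated'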
def update(feed_link=None, feed_id=None, language=None, categories=None,
           transcoder_type='chengdujin', parser_type=None):
    """
    update could be called
    1. from the task procedure: feed_id
    2. after an rss feed is added: feed_id
    3. manually for testing purposes: feed_link, language
    Note: categories are kept for manual testing
    """
    if not feed_id and not (feed_link and language):
        logger.error('Method malformed!')
        return None

    try:
        # try to find the feed in database
        if feed_id:
            feed = db_feeds.get(feed_id=feed_id)
        else:
            feed = db_feeds.get(feed_link=feed_link, language=language)

        if feed:
            # read latest feed info from database
            feed_id = str(feed['_id'])
            feed_link = feed['feed_link']
            language = feed['language']
            categories = feed['categories'].keys()
            transcoder_type = feed['transcoder']
            parser_type = feed['parser']
            feed_title = feed_title_new = feed[
                'feed_title'] if 'feed_title' in feed else None
            etag = etag_new = feed['etag'] if 'etag' in feed else None
            modified = modified_new = feed[
                'modified'] if 'modified' in feed else None
            status_new = None
            reason_new = None

            entries = None
            if parser_type == 'rss':
                import rss_parser
                # parse rss reading from remote rss servers
                entries, status_new, feed_title_new, etag_new, \
                    modified_new, reason_new = rss_parser.parse(
                        feed_link, feed_id, feed_title, language,
                        categories, etag, modified)
            elif parser_type == 'twitter':
                import twitter_parser
                entries, status_new, feed_title_new, etag_new, reason_new = \
                    twitter_parser.parse(
                        feed_link, feed_id, feed_title, language,
                        categories, etag)
            else:
                pass

            if entries:
                # filter out existing entries in db_news
                # there are some possible exceptions -- yet let it be
                entries = db_news.dedup(entries, language)
                if entries:
                    logger.warning('%s entries of %s received!' % (
                        str(len(entries)), feed_link))
                    # do tts, big_images, image as well as transcode
                    result = _value_added_process(
                        entries, language, transcoder_type)
                    if result:
                        # feed_title, etag and modified to db_feeds
                        # only feed_id is necessary, others are optional
                        # **kwargs
                        result = db_feeds.update(
                            feed_id=feed_id, status=status_new,
                            feed_title=feed_title_new, etag=etag_new,
                            modified=modified_new, reason=reason_new)
                        logger.warning(
                            '%s entries of %s added to database!' % (
                                str(len(entries)), feed_link))
                        if result:
                            return result
                        else:
                            logger.info('Error found updating feeds database')
                            return None
                    else:
                        logger.info('Error found adding value to entries')
                        return None
                else:
                    logger.info('Nothing from RSS is found new!')
                    return None
            else:
                logger.info('Nothing from RSS is updated!')
                result = db_feeds.update(
                    feed_id=feed_id, status=status_new,
                    feed_title=feed_title_new, etag=etag_new,
                    modified=modified_new, reason=reason_new)
                if not result:
                    logger.error('Error found updating feeds database')
                return None
        else:
            logger.warning('Register feed in database before updating!')
            return None
    except Exception as k:
        logger.error(str(k))
        return None
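# How update() is meant to be invoked per its docstring; the ObjectId
# string and feed url below are hypothetical placeholders:

# 1./2. from the task procedure, or right after an rss feed is registered
update(feed_id='530c8f7e9b1c2d3e4f5a6b7c')

# 3. manually, for testing (categories is optional here)
update(feed_link='http://example.com/rss', language='en')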
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
reload(sys)
sys.setdefaultencoding('UTF-8')

from newsman.config.settings import logger

logger.error('error')
logger.info('info')
logger.exception('exception')
logger.critical('critical')
logger.warning('warning')
def convert(language="en", title=None, link=None, updated=None, feed=None, transcoder="chengdujin", relative_path=None, stdout=False): """ select a transcoder send the link gather the data combine them with the template generate paths return news and images * stdout is to print result directly, no saving to physical disk related * stdout default value False """ if not language or not link: logger.error('Method malformed! language: %s link: %s' % (language, link)) if not stdout: return None, None, None, None else: return None, None try: link_clean = _preprocess(link) if link_clean: # this wont suck transcoders = _organize_transcoders(transcoder) title_new, content, images = _transcode( link_clean, transcoders, language) # remove null content content = content.strip() if content else None # in case no title is found from feed information if not title: title = title_new if content and title: # slimmer the content content = html_slimmer(content) if not stdout: # embed content in template news = _compose( language, title, updated, feed, _sanitize(content), images) if news: # create web/local path web_path, local_path = _save(news, relative_path) if web_path: # the FINAL return return web_path, local_path, content, images else: if not stdout: return None, None, None, None else: return None, None else: logger.error( 'Cannot combine content with the template!') if not stdout: return None, None, None, None else: return None, None else: return title, content else: if not content: logger.info('Transcoder %s failed for %s' % (transcoder, link_clean)) else: logger.info('Cannot find title for %s' % link_clean) if not stdout: # original link is returned as transcoded path logger.info('Original link %s is used as transcoded path') return link_clean, None, None, None else: return None, None else: logger.error( 'Link [clean %s] [original %s] cannot be parsed' % ( link_clean, link)) if not stdout: return None, None, None, None else: return None, None except Exception as k: logger.error(str(k)) if not stdout: return None, None, None, None else: return None, None
def convert(language="en", title=None, link=None, updated=None, feed=None, transcoder="chengdujin", relative_path=None, stdout=False): """ select a transcoder send the link gather the data combine them with the template generate paths return news and images * stdout is to print result directly, no saving to physical disk related * stdout default value False """ if not language or not link: logger.error('Method malformed! language: %s link: %s' % (language, link)) if not stdout: return None, None, None, None else: return None, None try: link_clean = _preprocess(link) if link_clean: # this wont suck transcoders = _organize_transcoders(transcoder) title_new, content, images = _transcode(link_clean, transcoders, language) # remove null content content = content.strip() if content else None # in case no title is found from feed information if not title: title = title_new if content and title: # slimmer the content content = html_slimmer(content) if not stdout: # embed content in template news = _compose(language, title, updated, feed, _sanitize(content), images) if news: # create web/local path web_path, local_path = _save(news, relative_path) if web_path: # the FINAL return return web_path, local_path, content, images else: if not stdout: return None, None, None, None else: return None, None else: logger.error( 'Cannot combine content with the template!') if not stdout: return None, None, None, None else: return None, None else: return title, content else: if not content: logger.info('Transcoder %s failed for %s' % (transcoder, link_clean)) else: logger.info('Cannot find title for %s' % link_clean) if not stdout: # original link is returned as transcoded path logger.info('Original link %s is used as transcoded path') return link_clean, None, None, None else: return None, None else: logger.error('Link [clean %s] [original %s] cannot be parsed' % (link_clean, link)) if not stdout: return None, None, None, None else: return None, None except Exception as k: logger.error(str(k)) if not stdout: return None, None, None, None else: return None, None
def parse(feed_link=None, feed_id=None, feed_title=None, language=None,
          categories=None, etag=None, modified=None):
    """
    read rss/atom data from a given feed
    feed_id is the feed ObjectId in MongoDB
    Etag and Modified are used to save the rss http server's bandwidth
    Note: category should be added to feed table/database
    """
    if not feed_link or not feed_id or not language or not categories:
        logger.error('Method malformed!')
        return None, None, feed_title, etag, modified, 'Method malformed!'
    if language not in LANGUAGES:
        logger.error('Language not supported for %s!' % feed_link)
        return None, None, feed_title, etag, modified, \
            'Language not supported for %s!' % feed_link

    def _validate_time(entry):
        """
        see if the entry's updated time is earlier than needed
        """
        deadline = datetime.utcfromtimestamp(
            entry['updated']) + timedelta(days=DATABASE_REMOVAL_DAYS)
        # compare in UTC; datetime.now() would mix local time into a
        # UTC-based deadline
        return deadline > datetime.utcnow()

    try:
        # variables d and e follow feedparser tradition
        feedparser.USER_AGENT = 'newsman'
        d = feedparser.parse(feed_link, etag=etag, modified=modified)
        if d:
            # http://pythonhosted.org/feedparser/reference-status.html
            # http://pythonhosted.org/feedparser/http-etag.html#http-etag
            status = d.status if 'status' in d else None
            if status == 301:
                logger.critical('%s has been permanently moved to %s!' % (
                    feed_link, d.href))
                return None, status, feed_title, etag, modified, \
                    '%s has been permanently moved to %s!' % (
                        feed_link, d.href)
            elif status == 304:
                logger.warning(
                    '%s server has not updated its feeds' % feed_link)
                return None, status, feed_title, etag, modified, \
                    '%s server has not updated its feeds' % feed_link
            elif status == 410:
                logger.critical(
                    '%s is gone! Admin should check the feed availability!'
                    % feed_link)
                return None, status, feed_title, etag, modified, \
                    '%s is gone! Admin should check the feed ' \
                    'availability!' % feed_link
            elif status == 200 or status == 302:
                # no need to worry
                if status == 302:
                    logger.info('%s url has been temporarily moved to a '
                                'new place' % feed_link)

                if not feed_title:
                    # if no title were found in the feed, an AttributeError
                    # would be raised
                    feed_title = urllib2.unquote(
                        hparser.unescape(d.feed.title)).strip()
                else:
                    feed_title = feed_title.strip()
                    if 'title' in d.feed:
                        feed_title_latest = urllib2.unquote(
                            hparser.unescape(d.feed.title)).strip()
                        if feed_title != feed_title_latest:
                            # feed title changed
                            logger.info('%s title changed! Please update '
                                        'feed table/database' % feed_link)
                            logger.info('old title: %s' % feed_title)
                            logger.info('new title: %s' % feed_title_latest)
                            # feed_title = feed_title_latest
                    else:
                        logger.info('%s[%s] has no title in its latest RSS'
                                    % (feed_title, feed_link))

                # update etag/modified
                etag = None
                modified = None
                try:
                    etag = d.etag
                except AttributeError:
                    try:
                        modified = d.modified
                    except AttributeError:
                        pass

                if 'entries' in d:
                    language = language if 'language' not in d else \
                        d.language
                    # an Exception might be raised from _read_entry
                    entries = []
                    logger.error('%s begins processing' % feed_title)
                    for i, e in enumerate(d.entries):
                        if e:
                            entry = _read_entry(
                                e, feed_id, feed_title, language, categories)
                            if entry:
                                entries.append(entry)
                            else:
                                logger.info('Cannot parse %s' % e['link'])
                                continue
                        else:
                            logger.info(
                                'No information found for %s-th entry' % i)
                            continue

                    if entries:
                        # the FINAL return
                        # the last item indicates nothing wrong happened
                        # in parsing
                        return filter(_validate_time, entries), status, \
                            feed_title, etag, modified, 'OK'
                    else:
                        logger.info('Feed parsing goes wrong!')
                        return None, status, feed_title, etag, modified, \
                            'Feed parsing goes wrong!'
                else:
                    logger.info('Feed %s has no items!' % feed_id)
                    return None, status, feed_title, etag, modified, \
                        'Feed %s has no items!' % feed_id
            else:
                logger.info('HTTP Error Code [%s] for %s' % (
                    status, feed_link))
                return None, status, feed_title, etag, modified, \
                    'HTTP Error Code [%s] for %s' % (status, feed_link)
        else:
            logger.info('Cannot parse %s correctly!' % feed_id)
            return None, None, feed_title, etag, modified, \
                'Cannot parse %s correctly!' % feed_id
    except Exception as k:
        logger.exception('%s for %s' % (str(k), feed_id))
        return None, None, feed_title, etag, modified, '%s for %s' % (
            str(k), feed_id)
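# The etag/modified handshake parse() depends on, shown standalone with
# feedparser's documented API; the feed url is hypothetical:
import feedparser

d = feedparser.parse('http://example.com/rss')
etag = getattr(d, 'etag', None)
modified = getattr(d, 'modified', None)

# replaying etag/modified lets the server answer 304 Not Modified,
# so the feed body is not transferred again
d2 = feedparser.parse('http://example.com/rss', etag=etag,
                      modified=modified)
if getattr(d2, 'status', None) == 304:
    print('feed unchanged since last poll; bandwidth saved')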