Example No. 1
def dedup_images(images=None):
    """
    remove duplicate images
    image: {'url':xxx, 'width':yyy, 'height':zzz}
    images = [image, image, image]
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None

    image_urls = []

    def _exists(image):
        """
        return True if the image's URL is already in image_urls; otherwise record it and return False
        """
        if image['url'] not in image_urls:
            image_urls.append(image['url'])
            return False
        else:
            return True

    try:
        return filter(lambda x: not _exists(x), images)
    except Exception as k:
        logger.info('Problem:[%s]\nSource:[%s]' % (str(k), str(images)))
        return None
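A minimal usage sketch (the sample image dicts below are made up): _exists records each URL the first time it is seen, so only the first occurrence of a duplicate URL survives the filter call. Under Python 2, filter() returns a list; under Python 3 it would return a lazy iterator.

images = [
    {'url': 'http://example.com/a.jpg', 'width': 100, 'height': 80},
    {'url': 'http://example.com/a.jpg', 'width': 100, 'height': 80},
    {'url': 'http://example.com/b.jpg', 'width': 200, 'height': 150},
]
unique = dedup_images(images)
# -> keeps one a.jpg entry and the b.jpg entry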
Example No. 2
def find_biggest_image(images=None):
    """
    find the biggest image in resolution from a list of images
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None

    try:
        biggest = None
        resolution_max = MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1]
        for image in images:
            if 'width' in image and 'height' in image:
                resolution_image = int(image['width']) * int(image['height'])
                if resolution_image > MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1]:
                    if resolution_image > resolution_max:
                        biggest = image
                        resolution_max = resolution_image
                else:
                    logger.info('Image [%s] is not big enough!' %
                                str(image['url']))
            else:
                logger.info('Height and width not found! %s' % str(image))
        return biggest
    except Exception as k:
        logger.error('Problem:[%s]\nSource:[%s]' % (str(k), str(images)))
        return None
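A hedged usage sketch; the candidate dicts are made up and MIN_IMAGE_SIZE is assumed to be a (width, height) tuple defined in the project settings:

candidates = [
    {'url': 'http://example.com/icon.png', 'width': 32, 'height': 32},
    {'url': 'http://example.com/banner.jpg', 'width': 800, 'height': 100},
    {'url': 'http://example.com/photo.jpg', 'width': 640, 'height': 480},
]
best = find_biggest_image(candidates)
# photo.jpg wins: 640 * 480 = 307200 pixels beats the banner's 80000,
# assuming both exceed MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1]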
Example No. 3
    def _check_image(self, image_url=None, image_html=None):
        """
        Replace the original image_url with a downloaded local copy, if the
        original image_url could not be reached without HEADERS
        """
        if not image_url:
            logger.error('Image URL is found VOID!')
            raise Exception('Image URL is found VOID!')
        if not image_html:
            logger.error('Image content is found VOID!')
            raise Exception('Image content is found VOID!')

        try:
            response = requests.get(image_url, timeout=UCK_TIMEOUT)
            if response.status_code > 400 or 'posttoday.com/media/content' in \
                    image_url:
                raise Exception('Without HEADERS [%s] cannot be reached!' %
                                str(image_url))
        except Exception as k:
            logger.info('Problem:[%s] Source:[%s]' % (str(k), str(image_url)))

            # replace original image_url with downloaded local copy
            image_url_new = self._download_copy(image_url, image_html)
            return image_url_new if image_url_new else image_url

        # Image is accessible with/without HEADERS
        return image_url
Example No. 4
def dedup_images(images=None):
    """
    remove duplicate images
    image: {'url':xxx, 'width':yyy, 'height':zzz}
    images = [image, image, image]
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None

    image_urls = []

    def _exists(image):
        """
        return True if the image's URL is already in image_urls; otherwise record it and return False
        """
        if image['url'] not in image_urls:
            image_urls.append(image['url'])
            return False
        else:
            return True

    try:
        return filter(lambda x: not _exists(x), images)
    except Exception as k:
        logger.info('Problem:[%s]\nSource:[%s]' % (str(k), str(images)))
        return None
Example No. 5
def clean():
    """
    remove expired items from database
    """
    logger.info('... cleaning database ...')
    try:
        document_names = _find_document_names()
        if document_names:
            for document_name in document_names:
                document = Collection(db, document_name)

                # compute a threshold
                current_utc_time_posix = calendar.timegm(time.gmtime())
                deadline_datetime = datetime.utcfromtimestamp(
                    current_utc_time_posix) - timedelta(
                    days=DATABASE_REMOVAL_DAYS)
                deadline_posix = calendar.timegm(deadline_datetime.timetuple())

                removal_candidates = document.find(
                    {'updated': {'$lt': deadline_posix}})
                for removal_candidate in removal_candidates:
                    # see if the removal candidate has a footprint in memory
                    clean_memory.clean_by_item(str(removal_candidate['_id']))
                    # remove corresponding files on disk
                    clean_disk.clean_by_item(removal_candidate)
                    # remove the candidate in database
                    document.remove({'_id': removal_candidate['_id']})
            return True
        else:
            logger.error('Cannot find documents')
            return False
    except Exception as k:
        logger.error(str(k))
        return False
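The threshold arithmetic used above can be read in isolation; this sketch assumes DATABASE_REMOVAL_DAYS is an integer retention window in days:

import calendar
import time
from datetime import datetime, timedelta

DATABASE_REMOVAL_DAYS = 7  # assumed value for illustration

# current UTC time as a POSIX timestamp
current_utc_time_posix = calendar.timegm(time.gmtime())
# shift it back by the retention window
deadline_datetime = datetime.utcfromtimestamp(
    current_utc_time_posix) - timedelta(days=DATABASE_REMOVAL_DAYS)
# back to POSIX, comparable with the 'updated' field stored in MongoDB
deadline_posix = calendar.timegm(deadline_datetime.timetuple())
# documents matching {'updated': {'$lt': deadline_posix}} become removal candidates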
Example No. 6
def dedup(entries=None, language=None):
    """
    return entries not found in database
    """
    if not entries:
        logger.error('Method malformed!')
        return None
    if not language or language not in LANGUAGES:
        logger.error("Language not found or not supported!")
        return None

    try:
        entries_new = []
        col = Collection(db, language)
        for entry in entries:
            # find duplicate in the form of the same link or title
            dup_link = col.find_one(
                {'link': entry['link'], 'feed': entry['feed']})
            if dup_link:
                logger.info('Find a duplicate for %s' % str(entry['title']))
                continue
            else:
                dup_title = col.find_one(
                    {'title': entry['title'], 'feed': entry['feed']})
                if dup_title:
                    logger.info(
                        'Find a duplicate for %s' % str(entry['title']))
                    continue
                else:
                    entries_new.append(entry)
        return entries_new if entries_new else None
    except Exception as k:
        logger.error(str(k))
        return None
Example No. 7
def convert(link):
    """
    use burify's readability implementation to transcode a web page
    and return the transcoded page and images found in it
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        data = transcoder.prepare_link(link)
        if data:
            article = Document(data)
            if article:
                images, content = _collect_images(
                    article.summary(html_partial=False), link)
                return article.short_title(), content, images
            else:
                logger.info('Burify cannot recognize the data')
                return None, None, None
        else:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
Example No. 8
def convert(link):
    """
    send link to uck api and reformat the content
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    # send link to uck server and get data back
    try:
        raw_data = _transcode(link)
        if raw_data:
            # check if raw_data is syntax-correct
            try:
                eval(raw_data)
            except Exception:
                logger.info('Invalid syntax found for UCK output')
                return None, None, None

            # text is sanitized, images are found from image_list
            title, transcoded, images = _extract(eval(raw_data), link)
            return title, transcoded, images
        else:
            logger.info('Cannot read anything from UCK server')
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
Example No. 9
def convert(link):
    """
    send link to uck api and reformat the content
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    # send link to uck server and get data back
    try:
        raw_data = _transcode(link)
        if raw_data:
            # check if raw_data is syntax-correct
            try:
                eval(raw_data)
            except Exception:
                logger.info('Invalid syntax found for UCK output')
                return None, None, None

            # text is sanitized, images are found from image_list
            title, transcoded, images = _extract(eval(raw_data), link)
            return title, transcoded, images
        else:
            logger.info('Cannot read anything from UCK server')
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
Example No. 10
def _extract(data=None, referer=None):
    """
    extract images and text content
    """
    if not data:
        logger.error('Received no data from UCK server.')
        return None, None, None

    successful = int(data['STRUCT_PAGE_TYPE'])
    if successful == 0:
        logger.info('Cannot interpret the page! status != 1')
        return None, None, None

    try:
        # content
        content = data['content'].replace("\\", "")
        content = _sanitize(content, referer)

        # images
        images, data = _collect_images(data, referer)
        images = images if images else None

        # title
        title = None
        if 'title' in data:
            title = data['title']

        return title, content, images
    except Exception as k:
        logger.error(str(k))
        return None, None, None
Example No. 11
def _clean_data():
    """
    clean memory, database and files, usually run daily
    """
    logger.info('----------------------cleaning-------------------------')
    try:
        any_mistake = False
        # clean database
        if not clean_database.clean():
            logger.error('Error found cleaning database')
            any_mistake = True
        # clean memory
        if not clean_memory.clean():
            logger.error('Error found cleaning memory')
            any_mistake = True
        # clean disk
        if not clean_disk.clean():
            logger.error('Error found cleaning disk')
            any_mistake = True

        if not any_mistake:
            logger.info('Memory, Database & Disk got cleaned!')
            return True
        else:
            return False
    except Exception as k:
        logger.error(str(k))
        return False
Example No. 12
def _extract(data=None, referer=None):
    """
    extract images and text content
    """
    if not data:
        logger.error('Received no data from UCK server.')
        return None, None, None

    successful = int(data['STRUCT_PAGE_TYPE'])
    if successful == 0:
        logger.info('Cannot interpret the page! status != 1')
        return None, None, None

    try:
        # content
        content = data['content'].replace("\\", "")
        content = _sanitize(content, referer)

        # images
        images, data = _collect_images(data, referer)
        images = images if images else None

        # title
        title = None
        if 'title' in data:
            title = data['title']

        return title, content, images
    except Exception as k:
        logger.error(str(k))
        return None, None, None
Example No. 13
def _clean_data():
    """
    clean memory, database and files, usually run daily
    """
    logger.info('----------------------cleaning-------------------------')
    try:
        any_mistake = False
        # clean database
        if not clean_database.clean():
            logger.error('Error found cleaning database')
            any_mistake = True
        # clean memory
        if not clean_memory.clean():
            logger.error('Error found cleaning memory')
            any_mistake = True
        # clean disk
        if not clean_disk.clean():
            logger.error('Error found cleaning disk')
            any_mistake = True

        if not any_mistake:
            logger.info('Memory, Database & Disk got cleaned!')
            return True
        else:
            return False
    except Exception as k:
        logger.error(str(k))
        return False
Example No. 14
    def _check_image(self, image_url=None, image_html=None):
        """
        Replace the original image_url with a downloaded local copy, if the
        original image_url could not be reached without HEADERS
        """
        if not image_url:
            logger.error('Image URL is found VOID!')
            raise Exception('Image URL is found VOID!')
        if not image_html:
            logger.error('Image content is found VOID!')
            raise Exception('Image content is found VOID!')

        try:
            response = requests.get(image_url, timeout=UCK_TIMEOUT)
            if response.status_code > 400 or 'posttoday.com/media/content' in \
                    image_url:
                raise Exception(
                    'Without HEADERS [%s] cannot be reached!' % str(image_url))
        except Exception as k:
            logger.info('Problem:[%s] Source:[%s]' % (str(k), str(image_url)))

            # replace original image_url with downloaded local copy
            image_url_new = self._download_copy(image_url, image_html)
            return image_url_new if image_url_new else image_url

        # Image is accessible with/without HEADERS
        return image_url
Example No. 15
    def _analyze(self, image_url=None, referer=None):
        """
        remove CDN prefix, if any; and read image data
        """
        if not image_url:
            logger.error('Method malformed!')
            raise Exception('Method malformed!')

        image_url = image_url.replace("\/", "/").strip()
        image_url = urllib2.unquote(hparser.unescape(image_url))

        # as the name could be http://xxx.com/yyy--http://zzz.jpg
        # or http://xxx.com/yyy--https://zzz.jpg
        last_http_index = image_url.rfind('http')
        image_url = image_url[last_http_index:]

        response = None
        if referer:
            HEADERS['Referer'] = referer
        try:
            response = requests.get(image_url,
                                    headers=HEADERS,
                                    timeout=UCK_TIMEOUT)
            # avoid redirected URL
            image_url = response.url
            # either exception or wrong HTTP code
            if response.status_code >= 400:
                raise Exception('Response code %s' % response.status_code)
        except Exception as k:
            logger.info('%s for %s' % (str(k), str(image_url)))
            try:
                # CDN URL could be formed as http:/xxxx.jpg
                path = re.split('https?://?', image_url)[-1]
                scheme = requests.utils.urlparse(image_url).scheme
                image_url = '%s://%s' % (scheme, path)

                response = requests.get(image_url,
                                        headers=HEADERS,
                                        timeout=UCK_TIMEOUT)
                # avoid redirected URL
                image_url = response.url
                if response.status_code >= 400:
                    raise Exception('Response code %s' % response.status_code)
            except Exception as k:
                logger.error('%s for %s' % (str(k), str(image_url)))
                raise Exception('%s for %s' % (str(k), str(image_url)))

        if response and response.status_code < 400 and response.content:
            # GIF is not supported yet
            #pr = requests.utils.urlparse(image_url)
            #image_url_address = pr.netloc + pr.path
            # if image_url_address.lower().endswith('.gif'):
            #    raise Exception('GIF is not supported! %s' % str(image_url))
            # else:
            image_html = response.content
            image_url = self._check_image(image_url, image_html)
            return str(image_url), str(image_html)
        else:
            logger.error('Cannot parse %s' % str(image_url))
            raise Exception('Cannot parse %s' % str(image_url))
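The two URL rewrites in _analyze are the subtle part; here they are replayed on made-up URLs:

import re
import requests

# 1. a CDN may glue its own host in front of the real image URL;
#    rfind('http') keeps only the last http(s)://... part
image_url = 'http://cdn.example.com/yyy--http://origin.example.com/z.jpg'
image_url = image_url[image_url.rfind('http'):]
# -> 'http://origin.example.com/z.jpg'

# 2. a malformed scheme such as 'http:/host/z.jpg' is rebuilt from its parts
broken = 'http:/origin.example.com/z.jpg'
path = re.split('https?://?', broken)[-1]
scheme = requests.utils.urlparse(broken).scheme
fixed = '%s://%s' % (scheme, path)
# -> 'http://origin.example.com/z.jpg'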
Example No. 16
def convert(link):
    """
    use burify's readability implementation to transcode a web page
    and return the transcoded page and images found in it
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        data = transcoder.prepare_link(link)
        if data:
            article = Document(data)
            if article:
                images, content = _collect_images(
                    article.summary(html_partial=False), link)
                return article.short_title(), content, images
            else:
                logger.info('Burify cannot recognize the data')
                return None, None, None
        else:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
Example No. 17
def find_biggest_image(images=None):
    """
    find the biggest image in resolution from a list of images
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None

    try:
        biggest = None
        resolution_max = MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1]
        for image in images:
            if 'width' in image and 'height' in image:
                resolution_image = int(image['width']) * int(image['height'])
                if resolution_image > MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1]:
                    if resolution_image > resolution_max:
                        biggest = image
                        resolution_max = resolution_image
                else:
                    logger.info('Image [%s] is not big enough!' %
                                str(image['url']))
            else:
                logger.info('Height and width not found! %s' % str(image))
        return biggest
    except Exception as k:
        logger.error('Problem:[%s]\nSource:[%s]' % (str(k), str(images)))
        return None
Example No. 18
    def _analyze(self, image_url=None, referer=None):
        """
        remove CDN prefix, if any; and read image data
        """
        if not image_url:
            logger.error('Method malformed!')
            raise Exception('Method malformed!')

        image_url = image_url.replace("\/", "/").strip()
        image_url = urllib2.unquote(hparser.unescape(image_url))

        # as the name could be http://xxx.com/yyy--http://zzz.jpg
        # or http://xxx.com/yyy--https://zzz.jpg
        last_http_index = image_url.rfind('http')
        image_url = image_url[last_http_index:]

        response = None
        if referer:
            HEADERS['Referer'] = referer
        try:
            response = requests.get(
                image_url, headers=HEADERS, timeout=UCK_TIMEOUT)
            # avoid redirected URL
            image_url = response.url
            # either exception or wrong HTTP code
            if response.status_code >= 400:
                raise Exception('Response code %s' % response.status_code)
        except Exception as k:
            logger.info('%s for %s' % (str(k), str(image_url)))
            try:
                # CDN URL could be formed as http:/xxxx.jpg
                path = re.split('https?://?', image_url)[-1]
                scheme = requests.utils.urlparse(image_url).scheme
                image_url = '%s://%s' % (scheme, path)

                response = requests.get(
                    image_url, headers=HEADERS, timeout=UCK_TIMEOUT)
                # avoid redirected URL
                image_url = response.url
                if response.status_code >= 400:
                    raise Exception('Response code %s' % response.status_code)
            except Exception as k:
                logger.error('%s for %s' % (str(k), str(image_url)))
                raise Exception('%s for %s' % (str(k), str(image_url)))

        if response and response.status_code < 400 and response.content:
            # GIF is not supported yet
            #pr = requests.utils.urlparse(image_url)
            #image_url_address = pr.netloc + pr.path
            # if image_url_address.lower().endswith('.gif'):
            #    raise Exception('GIF is not supported! %s' % str(image_url))
            # else:
            image_html = response.content
            image_url = self._check_image(image_url, image_html)
            return str(image_url), str(image_html)
        else:
            logger.error('Cannot parse %s' % str(image_url))
            raise Exception('Cannot parse %s' % str(image_url))
Example No. 19
def _download(language='en',
              query='Service provided by Baidu',
              tmp_file='do_not_exist.mp3'):
    """
    docs needed!
    other ways to write _download
    1. https://github.com/hungtruong/Google-Translate-TTS/blob/master
    /GoogleTTS.py
    2. https://github.com/gavinmh/tts-api/blob/master/text_segmenter.py
    """

    try:
        # break a long sentence/paragraph into google-acceptable length
        segments = _query_segment(language, query)

        # download chunks and write them to the output file
        threads = []
        if segments:
            for segment in segments:
                if segment:
                    logger.info('... Transmitting "%s"' % segment)
                    gt_request = GoogleTranslateAPI(language, segment)
                    threads.append(gt_request)
                    gt_request.start()
                    gt_request.join(GOOGLE_TTS_TIMEOUT)

            out = open(tmp_file, 'a')
            download_completed = True
            for th in threads:
                sys.stdout.flush()
                if th.result:
                    out.write(th.result)
                else:
                    download_completed = False
                    break
            out.close()

            if download_completed:
                return tmp_file
            else:
                logger.info('Download not completed, now removing the file')
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)
                return None
        else:  # nothing generated from the query
            logger.error('Nothing generated from the query')
            return None
    except Exception as k:
        logger.error(
            'Part of tts download went wrong, now removing the file: %s' %
            str(k))
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        return None
Example No. 20
def _clean_zombies():
    """
    kill zombie processes, usually run semi-daily, or quasi-daily
    """
    logger.info('-----------------killing zombies--------------------')
    try:
        clean_process.clean()
        return True
    except Exception as k:
        logger.error(str(k))
        return False
Example No. 21
def _clean_zombies():
    """
    kill zombie processes, usually run semi-daily, or quasi-daily
    """
    logger.info('-----------------killing zombies--------------------')
    try:
        clean_process.clean()
        return True
    except Exception as k:
        logger.error(str(k))
        return False
Example No. 22
def _transcode(link):
    """
    send link to uck and get the data
    """
    try:
        html = urllib2.urlopen(
            '%s%s' % (UCK_TRANSCODING_NEW, link), timeout=UCK_TIMEOUT).read()
        data = urllib2.unquote(hparser.unescape(html))
        return data
    except Exception as k:
        logger.info('Problem:[%s] Source:[%s]' % (str(k), link))
        return None
Example No. 23
def _download(language='en', query='Service provided by Baidu',
              tmp_file='do_not_exist.mp3'):
    """
    docs needed!
    other ways to write _download
    1. https://github.com/hungtruong/Google-Translate-TTS/blob/master
    /GoogleTTS.py
    2. https://github.com/gavinmh/tts-api/blob/master/text_segmenter.py
    """

    try:
        # break a long sentence/paragraph into google-acceptable length
        segments = _query_segment(language, query)

        # download chunks and write them to the output file
        threads = []
        if segments:
            for segment in segments:
                if segment:
                    logger.info('... Transmitting "%s"' % segment)
                    gt_request = GoogleTranslateAPI(language, segment)
                    threads.append(gt_request)
                    gt_request.start()
                    gt_request.join(GOOGLE_TTS_TIMEOUT)

            out = open(tmp_file, 'a')
            download_completed = True
            for th in threads:
                sys.stdout.flush()
                if th.result:
                    out.write(th.result)
                else:
                    download_completed = False
                    break
            out.close()

            if download_completed:
                return tmp_file
            else:
                logger.info('Download not completed, now removing the file')
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)
                return None
        else:  # nothing generated from the query
            logger.error('Nothing generated from the query')
            return None
    except Exception as k:
        logger.error(
            'Part of tts download went wrong, now removing the file: %s' % str(
                k))
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        return None
Example No. 24
def _transcode(link):
    """
    send link to uck and get the data
    """
    try:
        html = urllib2.urlopen('%s%s' % (UCK_TRANSCODING_NEW, link),
                               timeout=UCK_TIMEOUT).read()
        data = urllib2.unquote(hparser.unescape(html))
        return data
    except Exception as k:
        logger.info('Problem:[%s] Source:[%s]' % (str(k), link))
        return None
Example No. 25
def _transcode(link):
    """
    send link to uck server
    """
    try:
        uck_url = '%s%s' % (UCK_TRANSCODING, link)
        # timeout set to UCK_TIMEOUT, currently
        html = urllib2.urlopen(uck_url, timeout=UCK_TIMEOUT).read()
        # free data from html encoding
        data = urllib2.unquote(hparser.unescape(html))
        return data
    except Exception as k:
        logger.info('Problem:[%s] Source:[%s]' % (str(k), link))
        return None
Example No. 26
def _transcode(link):
    """
    send link to uck server
    """
    try:
        uck_url = '%s%s' % (UCK_TRANSCODING, link)
        # timeout set to UCK_TIMEOUT, currently
        html = urllib2.urlopen(uck_url, timeout=UCK_TIMEOUT).read()
        # free data from html encoding
        data = urllib2.unquote(hparser.unescape(html))
        return data
    except Exception as k:
        logger.info('Problem:[%s] Source:[%s]' % (str(k), link))
        return None
Example No. 27
def clean():
    """
    interface to clean temporary and unrecorded files
    """
    logger.info('... cleaning files on the disk ...')
    any_mistake = False
    if not _clean_unrecorded_files():
        logger.error('Error found cleaning unrecorded files')
        any_mistake = True
    if not _clean_tempory_files():
        logger.error('Error found cleaning temporary files')
        any_mistake = True

    if not any_mistake:
        return True
    else:
        return False
Example No. 28
def convert(url, language):
    """
    an interface to expose Simplr
    """
    if not url:
        logger.error("Cannot transcode nothing!")
        return None, None, None

    # pdb.set_trace()
    try:
        readable = Simplr(url, language)
        if readable:
            return readable.short_title, readable.content, readable.images
        else:
            logger.info('Simplr cannot parse the data')
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(url)))
        return None, None, None
Example No. 29
def clean():
    """
    remove expired items from queues in memory
    walk through all redis content
    """
    logger.info('... cleaning memory ...')
    try:
        news_lists = rclient.keys('news::*')
        for news_list in news_lists:
            # get the total number of a news list
            news_list_count = rclient.zcard(news_list)
            # get all the ids in a news list
            if news_list_count:
                news_ids = rclient.zrange(news_list, 0, news_list_count)
                for news_id in news_ids:
                    # make sure every item is touched
                    if not rclient.exists(news_id):
                        rclient.zrem(news_list, news_id)
                    else:
                        news_item_string = rclient.get(news_id)
                        if news_item_string:
                            news_item = eval(news_item_string)
                            news_updated = float(news_item['updated'])

                            # overdue, remove it
                            if cleaner.is_overdue(news_updated):
                                rclient.zrem(news_list, news_id)
                                rclient.delete(news_id)
                            else:  # check if this is zombie
                                if _is_zombie(news_item):
                                    rclient.zrem(news_list, news_id)
                                    rclient.delete(news_id)
                        else:
                            rclient.zrem(news_list, news_id)
                            rclient.delete(news_id)
            else:
                logger.error('Nothing in the list')
                continue
        return True
    except Exception as k:
        logger.error(str(k))
        return False
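The redis bookkeeping above boils down to the following sketch; rclient is assumed to be a redis client (e.g. redis.StrictRedis) and keys follow the project's 'news::*' convention:

# every sorted set 'news::<list>' holds news ids that should also exist
# as plain keys carrying the serialized news item
for news_list in rclient.keys('news::*'):
    for news_id in rclient.zrange(news_list, 0, -1):  # -1 means "up to the end"
        if not rclient.exists(news_id):
            # drop dangling references whose item has already expired
            rclient.zrem(news_list, news_id)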
Example No. 30
def google(language='en', query='Service provided by Baidu',
           relative_path='do_not_exist.mp3'):
    """
    1. download mp3 from google tts api
    2. convert it to wav
    3. speed up the wav file, if necessary
    4. convert to mp3
    5. store in some location
    6. return the path
    """
    if not language or not query or not relative_path:
        logger.error('Method malformed!')
        return None, None
    if language not in LANGUAGES:
        logger.error('%s not supported!' % language)
        return None, None

    try:
        # generate out.mp3
        tmp_file = _download(language, query, '%s%s-tmp.mp3' %
                                              (MEDIA_TEMP_LOCAL_DIR,
                                               relative_path[:-4]))
        if tmp_file:
            # form paths
            tts_local_path = '%s%s' % (MEDIA_LOCAL_DIR, relative_path)
            tts_web_path = '%s%s' % (MEDIA_PUBLIC_DIR, relative_path)

            command = 'lame -S --decode {0} - | sox -q -t wav - -t wav - ' \
                      'speed 1.06 | lame -S - {1}; rm {0}'.format(
                tmp_file, tts_local_path)
            subprocess.Popen(command, stderr=subprocess.PIPE, shell=True)
            logger.info('... MP3 acceleration is successfully completed!')
            return tts_web_path, tts_local_path
        else:
            logger.info(
                '%s is revoked due to errors found in downloading!' %
                relative_path)
            return None, None
    except Exception as k:
        logger.error(str(k))
        return None, None
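A small sketch of the path bookkeeping in google(); the directory constants below are assumed values, not the project's real settings:

MEDIA_TEMP_LOCAL_DIR = '/tmp/newsman/'            # assumed
MEDIA_LOCAL_DIR = '/var/newsman/media/'           # assumed
MEDIA_PUBLIC_DIR = 'http://example.com/media/'    # assumed

relative_path = '20130305-abcdef.mp3'
# relative_path[:-4] strips the '.mp3' suffix before adding '-tmp.mp3'
tmp_file = '%s%s-tmp.mp3' % (MEDIA_TEMP_LOCAL_DIR, relative_path[:-4])
# -> '/tmp/newsman/20130305-abcdef-tmp.mp3'
tts_local_path = '%s%s' % (MEDIA_LOCAL_DIR, relative_path)
tts_web_path = '%s%s' % (MEDIA_PUBLIC_DIR, relative_path)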
Example No. 31
def google(language='en',
           query='Service provided by Baidu',
           relative_path='do_not_exist.mp3'):
    """
    1. download mp3 from google tts api
    2. convert it to wav
    3. speed up the wav file, if necessary
    4. convert to mp3
    5. store in some location
    6. return the path
    """
    if not language or not query or not relative_path:
        logger.error('Method malformed!')
        return None, None
    if language not in LANGUAGES:
        logger.error('%s not supported!' % language)
        return None, None

    try:
        # generate out.mp3
        tmp_file = _download(
            language, query,
            '%s%s-tmp.mp3' % (MEDIA_TEMP_LOCAL_DIR, relative_path[:-4]))
        if tmp_file:
            # form paths
            tts_local_path = '%s%s' % (MEDIA_LOCAL_DIR, relative_path)
            tts_web_path = '%s%s' % (MEDIA_PUBLIC_DIR, relative_path)

            command = 'lame -S --decode {0} - | sox -q -t wav - -t wav - ' \
                      'speed 1.06 | lame -S - {1}; rm {0}'.format(
                tmp_file, tts_local_path)
            subprocess.Popen(command, stderr=subprocess.PIPE, shell=True)
            logger.info('... MP3 acceleration is successfully completed!')
            return tts_web_path, tts_local_path
        else:
            logger.info('%s is revoked due to errors found in downloading!' %
                        relative_path)
            return None, None
    except Exception as k:
        logger.error(str(k))
        return None, None
Example No. 32
def _combine(content, images):
    """
    combine results from transcoders
    """
    if not content or not images:
        return content, images

    try:
        # for now, if there are more than one image, take only one of them
        biggest = illustrator.find_biggest_image(images)
        if biggest:
            image_tag = '<img src="%s" width="%s" height="%s">'
            image = image_tag % (biggest['url'], str(
                biggest['width']), str(biggest['height']))
            return "%s %s" % (image, content), images
        else:
            logger.info('Cannot find biggest image')
            return content, biggest
    except Exception as k:
        logger.error(str(k))
        return content, images
Example No. 33
def _combine(content, images):
    """
    combine results from transcoders
    """
    if not content or not images:
        return content, images

    try:
        # for now, if there are more than one image, take only one of them
        biggest = illustrator.find_biggest_image(images)
        if biggest:
            image_tag = '<img src="%s" width="%s" height="%s">'
            image = image_tag % (
                biggest['url'], str(biggest['width']), str(biggest['height']))
            return "%s %s" % (image, content), images
        else:
            logger.info('Cannot find biggest image')
            return content, biggest
    except Exception as k:
        logger.error(str(k))
        return content, images
Example No. 34
def _extract(link):
    """
    extract title, content and images
    """
    data_string = _transcode(link)
    if data_string:
        # syntax checker
        try:
            eval(data_string)
        except Exception:
            logger.info('Invalid syntax found for New UCK output')
            return None, None, None

        data = eval(data_string)

        if int(data['status']) == 1:
            title = None if 'title' not in data or not data['title'] else data[
                'title']
            content = None if 'content' not in data or not data[
                'content'] else data['content']
            images, content = _collect_images(content, link)
            return title, content, images
        else:
            logger.info('UCK cannot parse the link: status != 1')
            return None, None, None
    else:
        logger.info('Get nothing from UCK server')
        return None, None, None
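The syntax check above relies on eval(); a hedged alternative, assuming the UCK output is a Python-literal dict, is ast.literal_eval, which accepts only literals and never executes code. A sketch (the helper name is hypothetical):

import ast

def _parse_uck_output(data_string):
    """Return the parsed dict if data_string is a Python-literal dict, else None."""
    try:
        data = ast.literal_eval(data_string)
    except (ValueError, SyntaxError):
        return None
    return data if isinstance(data, dict) else None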
Example No. 35
def _extract(link):
    """
    extract title, content and images
    """
    data_string = _transcode(link)
    if data_string:
        # syntax checker
        try:
            eval(data_string)
        except Exception:
            logger.info('Invalid syntax found for New UCK output')
            return None, None, None

        data = eval(data_string)

        if int(data['status']) == 1:
            title = None if 'title' not in data or not data[
                'title'] else data['title']
            content = None if 'content' not in data or not data[
                'content'] else data['content']
            images, content = _collect_images(content, link)
            return title, content, images
        else:
            logger.info('UCK cannot parse the link: status != 1')
            return None, None, None
    else:
        logger.info('Get nothing from UCK server')
        return None, None, None
Example No. 36
    def run(self):
        # NOTE: the URL must stay on one logical line; a newline inside the
        # quoted URL would be passed verbatim to the shell and break curl
        response = subprocess.Popen(
            'curl --silent --connect-timeout %s -A Mozilla '
            '"http://translate.google.com/translate_tts'
            '?ie=UTF-8&oe=UTF-8&tl=%s&q=%s"' %
            (GOOGLE_TTS_TIMEOUT, self.language, urllib2.quote(self.text)),
            stdout=subprocess.PIPE, shell=True, close_fds=True)

        content, error = response.communicate()
        if not error and content:
            if 'error' not in content and 'permission' not in content:
                self.result = content
            else:
                logger.error('Errors or Permission found in HTTP response')
                self.result = None
        else:
            if error:
                logger.error('Error %s found for %s' % (str(error), self.text))
                self.result = None
            else:
                logger.info('No content returned for %s' % self.text)
                self.result = None
Example No. 37
def find_images(content=None, referer=None):
    """
    find all images in the content, along with their size info
    """
    if not content:
        logger.error('Content/HTML is found VOID!')
        return None, content

    try:
        if isinstance(content, str) or isinstance(content, unicode):
            soup = BeautifulSoup(content.decode('utf-8', 'ignore'))
            normalized_images = []

            element_replaced = False
            for image in soup.findAll('img'):
                if image.get('src'):
                    normalized_image = find_image(image.get('src'), referer)
                    if normalized_image:
                        # replace original image link with clean and (local)
                        # copy
                        if 'original_url' in normalized_image and \
                                normalized_image['original_url']:
                            image['src'] = str(normalized_image['url'])
                            element_replaced = True
                        normalized_images.append(normalized_image)

            content_new = soup.prettify(encoding='utf-8')
            if element_replaced and content_new:
                content = str(
                    html_slimmer(
                        urllib2.unquote(hparser.unescape(content_new))))
            return normalized_images, content
        else:
            logger.info("Wrong format %s" % content)
            return None, content
    except Exception as k:
        logger.error("Problem [%s] Source [%s]" % (str(k), content))
        return None, content
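A usage sketch for find_images; the HTML fragment and referer are made up, and find_image is assumed to return a dict with at least a 'url' key (plus 'original_url' when it substituted a local copy):

html = '<div><img src="http://example.com/a.jpg"><p>some text</p></div>'
images, content = find_images(html, referer='http://example.com/article')
# images  -> list of normalized image dicts, or None if nothing usable was found
# content -> the original HTML, or a slimmed copy whose <img src> attributes
#            were rewritten when find_image() replaced any of them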
Example No. 38
def find_images(content=None, referer=None):
    """
    find all images in the content, along with their size info
    """
    if not content:
        logger.error('Content/HTML is found VOID!')
        return None, content

    try:
        if isinstance(content, str) or isinstance(content, unicode):
            soup = BeautifulSoup(content.decode('utf-8', 'ignore'))
            normalized_images = []

            element_replaced = False
            for image in soup.findAll('img'):
                if image.get('src'):
                    normalized_image = find_image(image.get('src'), referer)
                    if normalized_image:
                        # replace original image link with clean and (local)
                        # copy
                        if 'original_url' in normalized_image and \
                                normalized_image['original_url']:
                            image['src'] = str(normalized_image['url'])
                            element_replaced = True
                        normalized_images.append(normalized_image)

            content_new = soup.prettify(encoding='utf-8')
            if element_replaced and content_new:
                content = str(
                    html_slimmer(urllib2.unquote(
                        hparser.unescape(content_new))))
            return normalized_images, content
        else:
            logger.info("Wrong format %s" % content)
            return None, content
    except Exception as k:
        logger.error("Problem [%s] Source [%s]" % (str(k), content))
        return None, content
Ejemplo n.º 39
0
def dedup(entries=None, language=None):
    """
    return entries not found in database
    """
    if not entries:
        logger.error('Method malformed!')
        return None
    if not language or language not in LANGUAGES:
        logger.error("Language not found or not supported!")
        return None

    try:
        entries_new = []
        col = Collection(db, language)
        for entry in entries:
            # find duplicate in the form of the same link or title
            dup_link = col.find_one({
                'link': entry['link'],
                'feed': entry['feed']
            })
            if dup_link:
                logger.info('Find a duplicate for %s' % str(entry['title']))
                continue
            else:
                dup_title = col.find_one({
                    'title': entry['title'],
                    'feed': entry['feed']
                })
                if dup_title:
                    logger.info('Find a duplicate for %s' %
                                str(entry['title']))
                    continue
                else:
                    entries_new.append(entry)
        return entries_new if entries_new else None
    except Exception as k:
        logger.error(str(k))
        return None
Example No. 40
def prepare_link(url):
    """
    decode with the correct encoding
    """
    if not url:
        logger.error('Method malformed!')
        return None

    try:
        resp = requests.get(url, timeout=UCK_TIMEOUT)
        html = resp.content if resp.ok else None
        if html:
            detected = chardet.detect(html)
            encoding = detected['encoding'] if detected else 'utf-8'
            encoding = 'windows-1252' if 'folha.uol.com.br' in url else encoding
            data = html.decode(encoding, 'ignore')
            return hparser.unescape(urllib2.unquote(data)).replace(u'\xa0', ' ')
        else:
            logger.warning("Cannot read %s" % url)
            return None
    except Exception as k:
        logger.info('Problem:[%s] Source:[%s]' % (str(k), url))
        return None
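A short sketch of the encoding-detection step; the byte string is hypothetical, and chardet.detect returns a dict whose 'encoding' key may be None for very short inputs:

import chardet

html = '<html><body>Ol\xe1 mundo</body></html>'   # raw bytes, encoding unknown
detected = chardet.detect(html)
encoding = detected['encoding'] if detected and detected['encoding'] else 'utf-8'
text = html.decode(encoding, 'ignore')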
Example No. 41
def prepare_link(url):
    """
    decode with the correct encoding
    """
    if not url:
        logger.error('Method malformed!')
        return None

    try:
        resp = requests.get(url, timeout=UCK_TIMEOUT)
        html = resp.content if resp.ok else None
        if html:
            detected = chardet.detect(html)
            encoding = detected['encoding'] if detected else 'utf-8'
            encoding = 'windows-1252' if 'folha.uol.com.br' in url else encoding
            data = html.decode(encoding, 'ignore')
            return hparser.unescape(urllib2.unquote(data)).replace(
                u'\xa0', ' ')
        else:
            logger.warning("Cannot read %s" % url)
            return None
    except Exception as k:
        logger.info('Problem:[%s] Source:[%s]' % (str(k), url))
        return None
Example No. 42
    def run(self):
        # NOTE: the URL must stay on one logical line; a newline inside the
        # quoted URL would be passed verbatim to the shell and break curl
        response = subprocess.Popen(
            'curl --silent --connect-timeout %s -A Mozilla '
            '"http://translate.google.com/translate_tts'
            '?ie=UTF-8&oe=UTF-8&tl=%s&q=%s"' %
            (GOOGLE_TTS_TIMEOUT, self.language, urllib2.quote(self.text)),
            stdout=subprocess.PIPE,
            shell=True,
            close_fds=True)

        content, error = response.communicate()
        if not error and content:
            if 'error' not in content and 'permission' not in content:
                self.result = content
            else:
                logger.error('Errors or Permission found in HTTP response')
                self.result = None
        else:
            if error:
                logger.error('Error %s found for %s' % (str(error), self.text))
                self.result = None
            else:
                logger.info('No content returned for %s' % self.text)
                self.result = None
Example No. 43
def clean():
    """
    remove expired items from database
    """
    logger.info('... cleaning database ...')
    try:
        document_names = _find_document_names()
        if document_names:
            for document_name in document_names:
                document = Collection(db, document_name)

                # compute a threshold
                current_utc_time_posix = calendar.timegm(time.gmtime())
                deadline_datetime = datetime.utcfromtimestamp(
                    current_utc_time_posix) - timedelta(
                        days=DATABASE_REMOVAL_DAYS)
                deadline_posix = calendar.timegm(deadline_datetime.timetuple())

                removal_candidates = document.find(
                    {'updated': {
                        '$lt': deadline_posix
                    }})
                for removal_candidate in removal_candidates:
                    # see if the removal candidate has a footprint in memory
                    clean_memory.clean_by_item(str(removal_candidate['_id']))
                    # remove corresponding files on disk
                    clean_disk.clean_by_item(removal_candidate)
                    # remove the candidate in database
                    document.remove({'_id': removal_candidate['_id']})
            return True
        else:
            logger.error('Cannot find documents')
            return False
    except Exception as k:
        logger.error(str(k))
        return False
Example No. 44
#!/usr/bin/env python
#-*- coding: utf-8 -*-

import sys

reload(sys)
sys.setdefaultencoding('UTF-8')

from newsman.config.settings import logger

logger.error('error')
logger.info('info')
logger.exception('exception')
logger.critical('critical')
logger.warning('warning')
Example No. 45
def _read_entry(e=None,
                feed_id=None,
                feed_title=None,
                language=None,
                categories=None):
    """
    read a specific entry item from a feed
    Note: categories are ids of category items
    """
    if not e or not feed_title or not language or not categories:
        logger.error('Method malformed!')
        return None
    if language not in LANGUAGES:
        logger.error("Language not supported for %s!" % feed_title)
        return None

    try:
        entry = {}
        entry['feed_id'] = feed_id
        entry['feed'] = feed_title.strip()
        entry['language'] = language.strip()
        entry['categories'] = categories

        # the easy part: the must-have
        entry['error'] = []

        # article original link
        if e.link:
            original_link = e.link.strip()
            if not original_link.startswith(AD_LINKS):
                # print 'original', original_link
                # print 'unescaped', hparser.unescape(original_link)
                # print 'unquoted', urllib2.unquote(original_link)
                # print 'unescaped-unquoted', urllib2.unquote(hparser
                # .unescape(original_link))
                # print 'unquoted-unescaped', hparser.unescape(urllib2
                # .unquote(original_link))
                # find the real link from redirection
                # the sequence of the following two steps are IMPORTANT!
                original_link = _find_redirected_link(original_link)
                # print 'anti-redirected', original_link
                # clean the URL
                # original_link = urllib2.unquote(hparser.unescape(
                # original_link))
                # print 'unescaped-unquoted', original_link
                # print '------------------------------------------------'

                # find the redirected link
                matched_prefix = [
                    link for link in HIDDEN_LINKS
                    if original_link.startswith(link)
                ]
                found_prefix = matched_prefix[0] if matched_prefix else None
                if found_prefix:
                    actual_link = _get_actual_link(found_prefix, original_link)
                    if actual_link:
                        entry['link'] = actual_link
                    else:
                        logger.error('No actual link found for %s!' %
                                     original_link)
                        return None
                else:
                    entry['link'] = original_link
            else:
                logger.info('Advertising link %s' % original_link)
                return None
        else:
            logger.info('Feed malformed! No link found!')
            return None

        # article title
        if e.title_detail.type != 'text/plain':
            entry['title'] = urllib2.unquote(hparser.unescape(e.title.strip()))
        elif 'title' in e:
            entry['title'] = e.title.strip()
        else:
            entry['title'] = None
        # remove possible htmlized title
        entry['title'] = re.sub(
            "<.*?>", " ",
            entry['title']) if 'title' in entry and entry['title'] else None

        # article published time
        # first try parsed time info
        try:
            entry['updated'] = calendar.timegm(e.updated_parsed)
            entry['updated_human'] = e.updated
        except AttributeError as k:
            try:
                entry['updated'] = calendar.timegm(e.published_parsed)
                entry['updated_human'] = e.published
            except AttributeError as k:
                entry['error'] = [
                    '%s\n%s' % (entry['error'],
                                "no 'updated_parsed' or 'published_parsed'")
                ]
                # then try unparsed time info
                # this is rarely possible.
                try:
                    updated = e.updated if 'updated' in e else e.published
                    if updated:
                        # get time zone
                        offset = int(updated[-5:])
                        delta = timedelta(hours=int(offset) / 100)
                        format = "%a, %d %b %Y %H:%M:%S"
                        if updated[-8:-5] != 'UTC':
                            updated = datetime.strptime(updated[:-6], format)
                        else:
                            updated = datetime.strptime(updated[:-9], format)
                        updated -= delta
                        entry['updated'] = time.mktime(updated.timetuple())
                    else:
                        logger.info("Attribute updated/published has no value")
                        return None
                except ValueError as k:
                    logger.info(str(k))
                    entry['error'].append('%s\n%s' % (entry['error'], k))
                    return None
                except AttributeError as k:
                    logger.info(str(k))
                    entry['error'].append('no update or published\n')
                    return None

        # article's summary
        try:
            # it's possible the summary is HTML-based
            summary = urllib2.unquote(hparser.unescape(e.summary))
            if isinstance(summary, str):
                summary_encoding = chardet.detect(summary)['encoding']
                summary = summary.decode(summary_encoding, 'ignore')
            # a <div, for example, and a </div
            is_html = True if len(
                re.findall(u'</?a|</?p|</?strong|</?img|</?html|</?div',
                           summary)) > 1 else False
            if is_html:
                h = html2text.HTML2Text()
                h.ignore_images = True
                h.ignore_links = True
                h.ignore_emphasis = True
                paragraphs = (
                    h.handle(summary)).strip().strip('#').strip().split('\n\n')
                paragraphs_above_limit = []
                # remove paragraphs that contain less than x number of words
                for paragraph in paragraphs:
                    if entry['language'].startswith(
                            'zh') or entry['language'] == 'ja':
                        if len(paragraph) > 18:
                            paragraphs_above_limit.append(paragraph)
                    else:
                        words = paragraph.split()
                        if len(words) > 12:
                            paragraphs_above_limit.append(paragraph)
                entry['summary'] = '\n\n'.join(paragraphs_above_limit)
            else:
                entry['summary'] = summary
        except AttributeError as k:
            entry['summary'] = None
        entry['summary'] = None if not entry['summary'] else entry['summary']

        # article's images
        # e.g. [{'url':'http://image.com/tests.jpg, 'width': u'130', 'height':
        # u'86'}]
        entry['images'] = []
        try:
            images, media_content_new = illustrator.find_images(
                e.media_content, entry['link'])
            if images:
                entry['images'].extend(images)
        except AttributeError as k:
            pass
        try:
            images, media_content_new = illustrator.find_images(
                e.media_thumbnail, entry['link'])
            if images:
                entry['images'].extend(images)
        except AttributeError as k:
            pass
        for attribute in e:
            if 'thumbnail' in attribute:
                # currently set thumbnail to None if it's a dictionary
                image = e[attribute] if isinstance(e[attribute], str) else None
                image = illustrator.find_image(image, entry['link'])
                if image:
                    entry['images'].append(image)
        try:
            links = e.links
            for link in links:
                if 'type' in link and 'image' in link.type:
                    if 'href' in link:
                        image = illustrator.find_image(link.href,
                                                       entry['link'])
                        if image:
                            entry['images'].append(image)
        except AttributeError as k:
            pass

        if 'summary' in entry and entry['summary']:
            images, entry['summary'] = illustrator.find_images(
                entry['summary'], entry['link'])
            if images:
                entry['images'].extend(images)
        # dedup images is processed at rss.py

        # article's author
        # e.g. Yuan Jin
        try:
            # i guess this could be a string or a list
            entry['author'] = e.author
        except AttributeError as k:
            entry['author'] = None

        # article's source
        # e.g. {'href': u'http://www.reuters.com/', 'title': u'Reuters'}
        try:
            entry['source'] = e.source
        except AttributeError as k:
            entry['source'] = None

        # article's tags
        # e.g. [{'term': u'Campus Party', 'scheme': None, 'label': None}]
        # term is usually combined with scheme to form a url; label is
        # the name of term
        try:
            entry['tags'] = e.tag
        except AttributeError as k:
            entry['tags'] = None

        # the FINAL return
        return entry
    except Exception as k:
        logger.error(str(k))
        return None
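The fallback timestamp parsing inside _read_entry is dense; the same arithmetic replayed on a made-up RFC-822 style date string (integer division of the offset assumes Python 2, as in the original):

import time
from datetime import datetime, timedelta

updated = 'Tue, 05 Mar 2013 09:30:00 +0530'
offset = int(updated[-5:])                    # '+0530' -> 530
delta = timedelta(hours=int(offset) / 100)    # 5 hours under Python 2
format = "%a, %d %b %Y %H:%M:%S"
parsed = datetime.strptime(updated[:-6], format)  # drop ' +0530'
parsed -= delta                                   # shift to UTC
updated_posix = time.mktime(parsed.timetuple())   # POSIX timestamp, as stored in entry['updated']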
Example No. 46
def parse(feed_link=None,
          feed_id=None,
          feed_title=None,
          language=None,
          categories=None,
          etag=None,
          modified=None):
    """
    read rss/atom data from a given feed
    feed_id is the feed ObjectId in MongoDB
    Etag and Modified are used to save rss http server's bandwidth
    Note: category should be added to feed table/database
    """
    if not feed_link or not feed_id or not language or not categories:
        logger.error("Method malformed!")
        return None, None, feed_title, etag, modified, "Method malformed!"
    if language not in LANGUAGES:
        logger.error("Language not supported for %s!" % feed_link)
        return (None, None, feed_title, etag, modified,
                "Language not supported for %s!" % feed_link)

    def _validate_time(entry):
        """
        see if the entry's updated time is earlier than needed
        """
        deadline = datetime.utcfromtimestamp(
            entry['updated']) + timedelta(days=DATABASE_REMOVAL_DAYS)
        return True if deadline > datetime.now() else False

    try:
        # variables d and e follow feedparser tradition
        feedparser.USER_AGENT = "newsman"
        d = feedparser.parse(feed_link, etag=etag, modified=modified)
        if d:
            # http://pythonhosted.org/feedparser/reference-status.html
            # http://pythonhosted.org/feedparser/http-etag.html#http-etag
            status = d.status if 'status' in d else None

            if status == 301:
                logger.critical('%s has been permanently moved to %s!' %
                                (feed_link, d.href))
                return (None, status, feed_title, etag, modified,
                        '%s has been permanently moved to %s!' %
                        (feed_link, d.href))
            elif status == 304:
                logger.warning('%s server has not updated its feeds' %
                               feed_link)
                return (None, status, feed_title, etag, modified,
                        '%s server has not updated its feeds' % feed_link)
            elif status == 410:
                logger.critical(
                    '%s is gone! Admin should check the feed availability!' %
                    feed_link)
                return (None, status, feed_title, etag, modified,
                        '%s is gone! Admin should check the feed '
                        'availability!' % feed_link)
            elif status == 200 or status == 302:
                # no need to worry.
                if status == 302:
                    logger.info('%s url has been temp moved to a new place' %
                                feed_link)

                if not feed_title:
                    # if title were not found in feed, an AttributeError would
                    # be raised.
                    feed_title = urllib2.unquote(hparser.unescape(
                        d.feed.title)).strip()
                else:
                    feed_title = feed_title.strip()
                    if 'title' in d.feed:
                        feed_title_latest = urllib2.unquote(
                            hparser.unescape(d.feed.title)).strip()
                        if feed_title != feed_title_latest:
                            # change feed title
                            logger.info('%s title changed! Please update feed '
                                        'table/database' % feed_link)
                            logger.info('old title: %s' % feed_title)
                            logger.info('new title: %s' % feed_title_latest)
                            #feed_title = feed_title_latest
                    else:
                        logger.info('%s[%s] has no title in its latest RSS' %
                                    (feed_title, feed_link))

                # update etag/modified
                etag = None
                modified = None
                try:
                    etag = d.etag
                except AttributeError:
                    try:
                        modified = d.modified
                    except AttributeError:
                        pass

                if 'entries' in d:
                    language = language if 'language' not in d else d.language
                    # an Exception might be raised from _read_entry
                    entries = []
                    logger.info('%s begins processing' % feed_title)
                    for i, e in enumerate(d.entries):
                        if e:
                            entry = _read_entry(e, feed_id, feed_title,
                                                language, categories)
                            if entry:
                                entries.append(entry)
                            else:
                                logger.info('Cannot parse %s' % e['link'])
                                continue
                        else:
                            logger.info('No information found for entry %s' % i)
                            continue

                    if entries:
                        # the FINAL return
                        # the last one indicates nothing wrong happened in
                        # parsing
                        return filter(_validate_time,
                                      entries), status, feed_title, etag, \
                               modified, 'OK'
                    else:
                        logger.info('Feed parsing goes wrong!')
                        return None, status, feed_title, etag, modified, \
                               'Feed parsing goes wrong!'
                else:
                    logger.info("Feed %s has no items!" % feed_id)
                    return (None, status, feed_title, etag, modified,
                            'Feed %s has no items!' % feed_id)
            else:
                logger.info('HTTP Error Code [%s] for %s' %
                            (status, feed_link))
                return (None, status, feed_title, etag, modified,
                        'HTTP Error Code [%s] for %s' % (status, feed_link))
        else:
            logger.info("Cannot parse %s correctly!" % feed_id)
            return (None, None, feed_title, etag, modified,
                    "Cannot parse %s correctly!" % feed_id)
    except Exception as k:
        logger.exception('%s for %s' % (str(k), feed_id))
        return None, None, feed_title, etag, modified, '%s for %s' % (str(k),
                                                                      feed_id)
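
A minimal standalone sketch of the etag/modified handshake the docstring above relies on, using feedparser directly; the feed URL is a placeholder, and a 304 status only appears when the server supports conditional GET.

import feedparser

# first poll: no cache validators yet
d = feedparser.parse('http://example.com/feed.xml')  # placeholder URL
etag = getattr(d, 'etag', None)
modified = getattr(d, 'modified', None)

# later poll: send the validators back; 304 means nothing changed on the server
d2 = feedparser.parse('http://example.com/feed.xml',
                      etag=etag, modified=modified)
if getattr(d2, 'status', None) == 304:
    print('server has not updated its feeds')
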
Example No. 47
0
def scale_image(image=None, referer=None, size_expected=MIN_IMAGE_SIZE,
                resize_by_width=True, crop_by_center=True, relative_path=None):
    """
    resize an image as requested
    resize_by_width: resize image according to its width(True)/height(False)
    crop_by_center: crop image from its center(True) or by point(0, 0)(False)
    """
    if not image:
        logger.error('Image not found!')
        return None, None
    if not size_expected:
        logger.error('Expected image size not found!')
        return None, None
    if not relative_path:
        logger.error('Relative path for saving image not found!')
        return None, None

    image_url = image_size = None
    try:
        image_url = image['url']
        image_size = image['width'], image['height']
    except Exception:
        logger.error('Image [%s] is malformed!' % str(image))
        return None, None

    if not image_url:
        logger.error('Image URL not found!')
        return None, None
    if not image_size:
        logger.error('Image size not found!')
        return None, None

    try:
        width = int(image_size[0])
        height = int(image_size[1])
        width_expected = int(size_expected[0])
        height_expected = int(size_expected[1])

        if width >= width_expected and height >= height_expected:
            if resize_by_width:
                height_new = width_expected * height / width
                width_new = width_expected
            else:
                width_new = height_expected * width / height
                height_new = height_expected

            # greater than or equal is important here
            if width_new >= width_expected and height_new >= height_expected:
                # resize
                size_new = width_new, height_new

                image_data = None
                try:
                    if referer:
                        HEADERS['Referer'] = referer
                    response = requests.get(
                        image_url, headers=HEADERS, timeout=UCK_TIMEOUT)
                    image_data = Image.open(StringIO(response.content))
                except Exception as k:
                    logger.info(
                        'Problem:[%s]\nSource:[%s]' % (str(k), str(image_url)))
                    return None, None

                # resize image according to new size
                # PIL save() needs a registered format name; default to 'jpeg'
                image_format = image_data.format.lower() \
                    if image_data and image_data.format else 'jpeg'
                image_data.thumbnail(size_new, Image.ANTIALIAS)
                image_cropped = None

                # crop out unnecessary part
                if crop_by_center:
                    left = (width_new - width_expected) / 2
                    top = (height_new - height_expected) / 2
                    right = (width_new + width_expected) / 2
                    bottom = (height_new + height_expected) / 2
                    image_cropped = image_data.crop((left, top, right, bottom))
                else:
                    left = 0
                    top = 0
                    right = width_expected
                    bottom = height_expected
                    image_cropped = image_data.crop((left, top, right, bottom))

                # save to disk
                if image_cropped:
                    image_web_path = '%s%s.%s' % (
                        IMAGES_PUBLIC_DIR, relative_path, image_format.lower())
                    image_local_path = '%s%s.%s' % (
                        IMAGES_LOCAL_DIR, relative_path, image_format.lower())
                    image_cropped = image_cropped.convert('RGB')
                    image_cropped.save(image_local_path, image_format)

                    # clean data
                    if image_cropped:
                        del image_cropped
                    if image_data:
                        del image_data
                    return {'url': image_web_path, 'width': width_expected,
                            'height': height_expected}, {
                               'url': image_local_path, 'width': width_expected,
                               'height': height_expected}
                else:
                    # clean data
                    if image_cropped:
                        del image_cropped
                    if image_data:
                        del image_data
                    return None, None
            else:
                return scale_image(image=image, referer=referer,
                                   size_expected=size_expected,
                                   resize_by_width=not resize_by_width,
                                   crop_by_center=crop_by_center,
                                   relative_path=relative_path)
        else:
            return None, None
    except Exception as k:
        logger.info('Problem:[%s]\nSource:[%s]' % (str(k), str(image_url)))
        return None, None
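
A local sketch of the same thumbnail-then-center-crop flow scale_image applies, using PIL on a hypothetical file and assuming the source image is at least as large as the 320x240 target.

from PIL import Image

target_w, target_h = 320, 240
img = Image.open('sample.jpg')  # hypothetical local file
w, h = img.size
# resize by width, keeping the aspect ratio, as resize_by_width=True does
img.thumbnail((target_w, target_w * h // w), Image.ANTIALIAS)
w_new, h_new = img.size
# crop from the center, as crop_by_center=True does
box = ((w_new - target_w) // 2, (h_new - target_h) // 2,
       (w_new + target_w) // 2, (h_new + target_h) // 2)
img.crop(box).convert('RGB').save('sample_320x240.jpg', 'JPEG')
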
Example No. 48
0
def _read_entry(e=None, feed_id=None, feed_title=None, language=None,
                categories=None):
    """
    read a specific entry item from a feed
    Note: categories are ids of category items
    """
    if not e or not feed_title or not language or not categories:
        logger.error('Method malformed!')
        return None
    if language not in LANGUAGES:
        logger.error("Language not supported for %s!" % feed_title)
        return None

    try:
        entry = {}
        entry['feed_id'] = feed_id
        entry['feed'] = feed_title.strip()
        entry['language'] = language.strip()
        entry['categories'] = categories

        # the easy part: the must-have
        entry['error'] = []

        # article original link
        if e.link:
            original_link = e.link.strip()
            if not original_link.startswith(AD_LINKS):
                # print 'original', original_link
                # print 'unescaped', hparser.unescape(original_link)
                # print 'unquoted', urllib2.unquote(original_link)
                # print 'unescaped-unquoted', urllib2.unquote(hparser
                # .unescape(original_link))
                # print 'unquoted-unescaped', hparser.unescape(urllib2
                # .unquote(original_link))
                # find the real link from redirection
                # the sequence of the following two steps are IMPORTANT!
                original_link = _find_redirected_link(original_link)
                # print 'anti-redirected', original_link
                # clean the URL
                # original_link = urllib2.unquote(hparser.unescape(
                # original_link))
                # print 'unescaped-unquoted', original_link
                # print '------------------------------------------------'


                # find the redirected link
                matched_prefix = [
                    link for link in HIDDEN_LINKS if
                    original_link.startswith(link)]
                found_prefix = matched_prefix[0] if matched_prefix else None
                if found_prefix:
                    actual_link = _get_actual_link(found_prefix, original_link)
                    if actual_link:
                        entry['link'] = actual_link
                    else:
                        logger.error(
                            'No actual link found for %s!' % original_link)
                        return None
                else:
                    entry['link'] = original_link
            else:
                logger.info('Advertising link %s' % original_link)
                return None
        else:
            logger.info('Feed malformed! No link found!')
            return None

        # article title
        if e.title_detail.type != 'text/plain':
            entry['title'] = urllib2.unquote(hparser.unescape(e.title.strip()))
        elif 'title' in e:
            entry['title'] = e.title.strip()
        else:
            entry['title'] = None
        # remove possible htmlized title
        entry['title'] = re.sub("<.*?>", " ", entry[
            'title']) if 'title' in entry and entry['title'] else None

        # article published time
        # first try parsed time info
        try:
            entry['updated'] = calendar.timegm(e.updated_parsed)
            entry['updated_human'] = e.updated
        except AttributeError as k:
            try:
                entry['updated'] = calendar.timegm(e.published_parsed)
                entry['updated_human'] = e.published
            except AttributeError as k:
                entry['error'] = ['%s\n%s' % (
                    entry['error'],
                    "no 'updated_parsed' or 'published_parsed'")]
                # then try unparsed time info
                # this is rarely possible.
                try:
                    updated = e.updated if 'updated' in e else e.published
                    if updated:
                        # get time zone
                        offset = int(updated[-5:])
                        delta = timedelta(hours=int(offset) / 100)
                        format = "%a, %d %b %Y %H:%M:%S"
                        if updated[-8:-5] != 'UTC':
                            updated = datetime.strptime(updated[:-6], format)
                        else:
                            updated = datetime.strptime(updated[:-9], format)
                        updated -= delta
                        entry['updated'] = time.mktime(updated.timetuple())
                    else:
                        logger.info(
                            "Attribute updated/published has no value")
                        return None
                except ValueError as k:
                    logger.info(str(k))
                    entry['error'].append('%s\n%s' % (entry['error'], k))
                    return None
                except AttributeError as k:
                    logger.info(str(k))
                    entry['error'].append('no update or published\n')
                    return None

        # article's summary
        try:
            # the summary may be HTML-based
            summary = urllib2.unquote(hparser.unescape(e.summary))
            if isinstance(summary, str):
                summary_encoding = chardet.detect(summary)['encoding']
                summary = summary.decode(summary_encoding, 'ignore')
            # e.g. an opening <div and a closing </div
            is_html = True if len(
                re.findall(u'</?a|</?p|</?strong|</?img|</?html|</?div',
                           summary)) > 1 else False
            if is_html:
                h = html2text.HTML2Text()
                h.ignore_images = True
                h.ignore_links = True
                h.ignore_emphasis = True
                paragraphs = (h.handle(summary)).strip().strip(
                    '#').strip().split('\n\n')
                paragraphs_above_limit = []
                # drop paragraphs with fewer than a threshold number of words
                for paragraph in paragraphs:
                    if entry['language'].startswith('zh') or \
                            entry['language'] == 'ja':
                        if len(paragraph) > 18:
                            paragraphs_above_limit.append(paragraph)
                    else:
                        words = paragraph.split()
                        if len(words) > 12:
                            paragraphs_above_limit.append(paragraph)
                entry['summary'] = '\n\n'.join(paragraphs_above_limit)
            else:
                entry['summary'] = summary
        except AttributeError as k:
            entry['summary'] = None
        entry['summary'] = None if not entry['summary'] else entry['summary']

        # article's images
        # e.g. [{'url': 'http://image.com/tests.jpg', 'width': u'130',
        # 'height': u'86'}]
        entry['images'] = []
        try:
            images, media_content_new = illustrator.find_images(
                e.media_content, entry['link'])
            if images:
                entry['images'].extend(images)
        except AttributeError as k:
            pass
        try:
            images, media_content_new = illustrator.find_images(
                e.media_thumbnail, entry['link'])
            if images:
                entry['images'].extend(images)
        except AttributeError as k:
            pass
        for attribute in e:
            if 'thumbnail' in attribute:
                # currently set thumbnail to None if it's a dictionary
                image = e[attribute] if isinstance(e[attribute], str) else None
                image = illustrator.find_image(image, entry['link'])
                if image:
                    entry['images'].append(image)
        try:
            links = e.links
            for link in links:
                if 'type' in link and 'image' in link.type:
                    if 'href' in link:
                        image = illustrator.find_image(
                            link.href, entry['link'])
                        if image:
                            entry['images'].append(image)
        except AttributeError as k:
            pass

        if 'summary' in entry and entry['summary']:
            images, entry['summary'] = illustrator.find_images(
                entry['summary'], entry['link'])
            if images:
                entry['images'].extend(images)
        # dedup images is processed at rss.py

        # article's author
        # e.g. Yuan Jin
        try:
            # this may be a string or a list
            entry['author'] = e.author
        except AttributeError as k:
            entry['author'] = None

        # article's source
        # e.g. {'href': u'http://www.reuters.com/', 'title': u'Reuters'}
        try:
            entry['source'] = e.source
        except AttributeError as k:
            entry['source'] = None

        # article's tags
        # e.g. [{'term': u'Campus Party', 'scheme': None, 'label': None}]
        # term is usually combined with scheme to form a url; label is
        # the name of term
        try:
            entry['tags'] = e.tags
        except AttributeError as k:
            entry['tags'] = None

        # the FINAL return
        return entry
    except Exception as k:
        logger.error(str(k))
        return None
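
A small sketch of the published-time fallback used above: prefer feedparser's parsed time structs and convert them to a unix timestamp; the feed URL is a placeholder.

import calendar
import feedparser

d = feedparser.parse('http://example.com/feed.xml')  # placeholder URL
for e in d.entries:
    # prefer updated_parsed, then published_parsed, as _read_entry does
    stamp = getattr(e, 'updated_parsed', None) or \
        getattr(e, 'published_parsed', None)
    if stamp:
        print('%s -> %d' % (e.get('title'), calendar.timegm(stamp)))
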
Example No. 49
0
def _sanitize(content=None, referer=None):
    """
    modify uck content to suit news needs
    """
    if not content:
        return None

    try:
        soup = BeautifulSoup(content.decode('utf-8', 'ignore'))
        # remove all <span>
        for span in soup.findAll('span'):
            span.extract()

        # sanitize <a>
        for a in soup.findAll('a'):
            img = a.find('img')
            if img:
                a.replaceWith(img)
            else:  # it might be a simple href
                a.replaceWith(a.text)

        # strip any proxy prefix from the img src (keep what follows 'src=')
        for img in soup.findAll('img'):
            img_source = img.get('src')
            if img_source:
                img_tuple = img_source.rpartition('src=')
                img['src'] = img_tuple[2]
                # call NormalizedImage
                width = height = None
                try:
                    ni = NormalizedImage(img['src'], referer)
                    width, height = ni.get_image_size()
                except Exception as k:
                    logger.info(
                        'Problem [%s] for Source [%s]' % (
                            str(k), str(img['src'])))
                    continue
                if 480 <= width:
                    img['width'] = '100%'
                    img['height'] = 'auto'

        # clear away useless style
        for style in soup.findAll('div', style='border-top:none;'):
            img = style.find('img')
            if not img:
                if not style.find('p'):
                    style.extract()
            else:
                style.replaceWith(img)

        # remove short navigable strings and empty <div>s
        for component in soup.contents:
            if isinstance(component, NavigableString):
                if len(component.string.split()) < 10:
                    component.extract()
            elif isinstance(component, Tag):
                if component.name == 'div':
                    if not component.find('p'):
                        component.extract()

        # drop the result if it contains nothing but images
        img_count = 0
        for item in soup.contents:
            if isinstance(item, Tag) and item.name == 'img':
                img_count += 1
        if img_count == len(soup.contents):
            return None
        else:
            return ''.join([str(item) for item in soup.contents])
    except Exception as k:
        logger.error(str(k))
        return None
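
A self-contained sketch of the first two sanitizing passes above (dropping <span> wrappers and unwrapping <a> tags), assuming the same BeautifulSoup 3 API the function uses.

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, as above

html = ('<div><span>tracking junk</span>'
        '<a href="http://example.com/full"><img src="photo.jpg"/></a>'
        '<a href="http://example.com/more">read more</a></div>')
soup = BeautifulSoup(html)
for span in soup.findAll('span'):
    span.extract()  # drop <span> wrappers entirely
for a in soup.findAll('a'):
    img = a.find('img')
    # keep the image; otherwise keep only the link text
    a.replaceWith(img if img else a.text)
print(soup)
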
Example No. 50
0
def _value_added_process(entries=None, language=None,
                         transcoder_type='chengdujin'):
    """
    add more value to an entry
    tts, transcode, images, redis_entry_expiration, database_entry_expiration
    """
    if not entries:
        logger.error('Method malformed!')
        return None
    if not language or language not in LANGUAGES:
        logger.error("Language not found or not supported!")
        return None

    updated_entries = []
    for i, entry in enumerate(entries):
        try:
            logger.info('... Working on %i of %d ...' % (i + 1, len(entries)))
            logger.info(entry['title'])
            logger.info(entry['link'])

            # [MUST-HAVE] transcoding
            # get a random int from 100 million possibilities
            rand = random.randint(0, 100000000)
            transcoded_relative_path = '%s_%s_%s_%i' % (
                entry['language'], entry['feed_id'], entry['updated'], rand)

            # there is a high chance the transcoder cannot work properly
            entry['transcoded'], entry[
                'transcoded_local'], raw_transcoded_content, \
            images_from_transcoded = transcoder.convert(
                entry['language'], entry['title'], entry['link'],
                entry['updated'], entry['feed'], transcoder_type,
                transcoded_relative_path)

            if entry['transcoded']:
                # [OPTIONAL] summary
                if entry['summary'] or raw_transcoded_content:
                    summary_found = summarizer.extract(
                        entry['language'], entry['title'],
                        str(raw_transcoded_content), entry['summary'],
                        entry['link'], entry['feed'],
                        '*|*'.join(entry['categories']))
                    entry['summary'] = summary_found
                #entry['summary'] = entry['summary'] if 'summary' in entry
                # and entry['summary'] else None

                # [OPTIONAL] images
                # process images found in the transcoded data
                if images_from_transcoded:
                    # images from transcoded are already normalized
                    entry['images'].extend(images_from_transcoded)
                    # remove duplicated images
                    images_deduped = illustrator.dedup_images(
                        entry['images']
                    ) if 'images' in entry and entry['images'] else None
                    # be cautious: dedup_images might return None if the
                    # network fails
                    if images_deduped:
                        entry['images'] = images_deduped
                entry['images'] = entry[
                    'images'] if 'images' in entry and entry['images'] else None

                # [OPTIONAL] generate 3 types of images: thumbnail,
                # category image and hot news image
                if 'images' in entry and entry['images']:
                    biggest = illustrator.find_biggest_image(entry['images'])
                    if biggest:
                        entry = _generate_images(biggest, entry, rand)
                # for older version users
                entry['image'] = entry['thumbnail_image'][
                    'url'] if 'thumbnail_image' in entry and entry[
                    'thumbnail_image'] else None

                # [OPTIONAL] text image
                # if no category_image is found, generate a text-image
                if 'category_image' not in entry or (
                        'category_image' in entry
                        and not entry['category_image']):
                    image_relative_path = '%s_%s_%s_%i' % (
                        entry['language'], entry['feed_id'], entry['updated'],
                        rand)
                    try:
                        text_img = text2img.Text2Image(
                            language, entry['title'],
                            '%s_textimage.png' % image_relative_path)
                        entry['text_image'] = text_img.get_image()
                    except Exception as k:
                        logger.error(
                            'Problem [%s] generating text2image for [%s]' % (
                                str(k), entry['link']))

                # [OPTIONAL] google tts not for indonesian
                if entry['language'] != 'in':
                    # _get_tts never returns None; at worst the original
                    # entry is returned
                    entry = _get_tts(entry, rand)

                # [MUST-HAVE] add expiration data
                def _expired(updated, days_to_deadline):
                    """
                    compute expiration information
                    return a UTC time string
                    """
                    deadline = datetime.utcfromtimestamp(
                        updated) + timedelta(days=days_to_deadline)
                    return time.asctime(
                        time.gmtime(calendar.timegm(deadline.timetuple())))

                entry['memory_expired'] = _expired(
                    entry['updated'], MEMORY_EXPIRATION_DAYS)
                entry['database_expired'] = _expired(
                    entry['updated'], DATABASE_REMOVAL_DAYS)

                # [OPTIONAL] if logger is used, this could be removed
                entry['error'] = entry[
                    'error'] if 'error' in entry and entry['error'] else None

                # [MUST-HAVE] update new entry to db_news
                # each entry is added with _id
                entry = db_news.update(entry)
                if entry:
                    # [MUST-HAVE] store in memory
                    result = memory.update(entry)
                    if result:
                        updated_entries.append(entry)
                    else:
                        logger.error('Error found in updating memory')
                        # remove entry in database
                        if clean_database.clean_by_item(entry):
                            logger.info(
                                'Cleaned %s in database' % entry['title'])
                        else:
                            logger.error(
                                'Error cleaning %s in database' % entry[
                                    'title'])
                        # remove entry-created files on disk
                        if clean_disk.clean_by_item(entry):
                            logger.info('Cleaned %s on disk' % entry['title'])
                        else:
                            logger.error(
                                'Error cleaning %s on disk' % entry['title'])
                        continue
                else:
                    logger.error('Error found in updating to news database')
                    # remove entry-created files on disk
                    if clean_disk.clean_by_item(entry):
                        logger.info('Cleaned %s on disk' % entry['title'])
                    else:
                        logger.error(
                            'Error cleaning %s on disk' % entry['title'])
                    continue
            else:
                logger.info('Error found in transcoding')
                continue
        except Exception as k:
            logger.error(str(k))
            continue
    # the FINAL return
    if updated_entries:
        return True
    else:
        logger.info('No entry got value added!')
        return False
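
A standalone sketch of the nested _expired() helper above, turning an entry's unix timestamp into a UTC expiry string.

import calendar
import time
from datetime import datetime, timedelta


def expired_at(updated, days_to_deadline):
    # same computation as the nested _expired() helper
    deadline = datetime.utcfromtimestamp(updated) + timedelta(
        days=days_to_deadline)
    return time.asctime(time.gmtime(calendar.timegm(deadline.timetuple())))

# expiry 30 days after now, printed as a UTC asctime string
print(expired_at(time.time(), 30))
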
Example No. 51
0
def update(feed_link=None, feed_id=None, language=None, categories=None,
           transcoder_type='chengdujin', parser_type=None):
    """
    update could be called
    1. from task procedure: feed_id
    2. after an rss is added: feed_id
    3. manually for testing purpose: feed_link, language
    Note: categories are kept for manual testing
    """
    if not feed_id and not (feed_link and language):
        logger.error('Method malformed!')
        return None

    try:
        # try to find the feed in database
        if feed_id:
            feed = db_feeds.get(feed_id=feed_id)
        else:
            feed = db_feeds.get(feed_link=feed_link, language=language)

        if feed:
            # read latest feed info from database
            feed_id = str(feed['_id'])
            feed_link = feed['feed_link']
            language = feed['language']
            categories = feed['categories'].keys()
            transcoder_type = feed['transcoder']
            parser_type = feed['parser']
            feed_title = feed_title_new = feed[
                'feed_title'] if 'feed_title' in feed else None
            etag = etag_new = feed['etag'] if 'etag' in feed else None
            modified = modified_new = feed[
                'modified'] if 'modified' in feed else None
            status_new = None
            reason_new = None
            entries = None

            if parser_type == 'rss':
                import rss_parser
                # parse rss reading from remote rss servers
                entries, status_new, feed_title_new, etag_new, modified_new, \
                reason_new = rss_parser.parse(
                    feed_link, feed_id, feed_title, language, categories, etag,
                    modified)
            elif parser_type == 'twitter':
                import twitter_parser

                entries, status_new, feed_title_new, etag_new, reason_new = \
                    twitter_parser.parse(
                        feed_link, feed_id, feed_title, language, categories,
                        etag)
            else:
                pass

            if entries:
                # filter out existing entries in db_news
                # there are some possible exceptions -- yet let it be
                entries = db_news.dedup(entries, language)

                if entries:
                    logger.warning('%s entries of %s received!' %
                                   (str(len(entries)), feed_link))
                    # and do tts, big_images, image as well as transcode.
                    result = _value_added_process(
                        entries, language, transcoder_type)
                    if result:
                        # feed_title, etag and modified to db_feeds
                        # only feed_id is necessary, others are optional
                        # **kwargs
                        result = db_feeds.update(
                            feed_id=feed_id, status=status_new,
                            feed_title=feed_title_new, etag=etag_new,
                            modified=modified_new, reason=reason_new)
                        logger.warning('%s entries of %s added to database!' %
                                       (str(len(entries)), feed_link))
                        if result:
                            return result
                        else:
                            logger.info('Error found updating feeds database')
                            return None
                    else:
                        logger.info('Error found adding value to entries')
                        return None

                else:
                    logger.info('Nothing from RSS is found new!')
                    return None
            else:
                logger.info('Nothing from RSS is updated!')
                result = db_feeds.update(
                    feed_id=feed_id, status=status_new,
                    feed_title=feed_title_new, etag=etag_new,
                    modified=modified_new, reason=reason_new)
                if not result:
                    logger.error('Error found updating feeds database')
                return None
        else:
            logger.warning('Register feed in database before updating!')
            return None
    except Exception as k:
        logger.error(str(k))
        return None
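
A hedged sketch of the manual-testing call path the docstring mentions; the import name is hypothetical, and the feed must already be registered in db_feeds for the lookup to succeed.

import rss  # hypothetical module name for the file defining update()

result = rss.update(feed_link='http://example.com/feed.xml', language='en')
if result:
    print('feed refreshed and new entries stored')
else:
    print('nothing new, or the feed is not registered yet')
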
Example No. 52
0
def _sanitize(content=None, referer=None):
    """
    modify uck content to suit news needs
    """
    if not content:
        return None

    try:
        soup = BeautifulSoup(content.decode('utf-8', 'ignore'))
        # remove all <span>
        for span in soup.findAll('span'):
            span.extract()

        # sanitize <a>
        for a in soup.findAll('a'):
            img = a.find('img')
            if img:
                a.replaceWith(img)
            else:  # it might be a simple href
                a.replaceWith(a.text)

        # strip any proxy prefix from the img src (keep what follows 'src=')
        for img in soup.findAll('img'):
            img_source = img.get('src')
            if img_source:
                img_tuple = img_source.rpartition('src=')
                img['src'] = img_tuple[2]
                # call NormalizedImage
                width = height = None
                try:
                    ni = NormalizedImage(img['src'], referer)
                    width, height = ni.get_image_size()
                except Exception as k:
                    logger.info('Problem [%s] for Source [%s]' %
                                (str(k), str(img['src'])))
                    continue
                if 480 <= width:
                    img['width'] = '100%'
                    img['height'] = 'auto'

        # clear away useless style
        for style in soup.findAll('div', style='border-top:none;'):
            img = style.find('img')
            if not img:
                if not style.find('p'):
                    style.extract()
            else:
                style.replaceWith(img)

        # remove short navigable strings and empty <div>s
        for component in soup.contents:
            if isinstance(component, NavigableString):
                if len(component.string.split()) < 10:
                    component.extract()
            elif isinstance(component, Tag):
                if component.name == 'div':
                    if not component.find('p'):
                        component.extract()

        # drop the result if it contains nothing but images
        img_count = 0
        for item in soup.contents:
            if isinstance(item, Tag) and item.name == 'img':
                img_count += 1
        if img_count == len(soup.contents):
            return None
        else:
            return ''.join([str(item) for item in soup.contents])
    except Exception as k:
        logger.error(str(k))
        return None
Example No. 53
0
#!/usr/bin/env python 
#-*- coding: utf-8 -*- 

import sys

reload(sys)
sys.setdefaultencoding('UTF-8')

from newsman.config.settings import logger

logger.error('error')
logger.info('info')
logger.exception('exception')
logger.critical('critical')
logger.warning('warning')
Example No. 54
0
def convert(language="en", title=None, link=None, updated=None, feed=None,
            transcoder="chengdujin", relative_path=None, stdout=False):
    """
    select a transcoder
    send the link
    gather the data
    combine them with the template
    generate paths
    return news and images
    * stdout prints the result directly; nothing is saved to disk
    * stdout defaults to False
    """
    if not language or not link:
        logger.error('Method malformed! language: %s link: %s' %
                     (language, link))
        if not stdout:
            return None, None, None, None
        else:
            return None, None

    try:
        link_clean = _preprocess(link)
        if link_clean:
            # build the list of transcoder candidates
            transcoders = _organize_transcoders(transcoder)
            title_new, content, images = _transcode(
                link_clean, transcoders, language)
            # remove null content
            content = content.strip() if content else None

            # in case no title is found from feed information
            if not title:
                title = title_new

            if content and title:
                # slim down the content
                content = html_slimmer(content)
                if not stdout:
                    # embed content in template
                    news = _compose(
                        language, title, updated, feed, _sanitize(content),
                        images)
                    if news:
                        # create web/local path
                        web_path, local_path = _save(news, relative_path)
                        if web_path:
                            # the FINAL return
                            return web_path, local_path, content, images
                        else:
                            if not stdout:
                                return None, None, None, None
                            else:
                                return None, None
                    else:
                        logger.error(
                            'Cannot combine content with the template!')
                        if not stdout:
                            return None, None, None, None
                        else:
                            return None, None
                else:
                    return title, content
            else:
                if not content:
                    logger.info('Transcoder %s failed for %s' %
                                (transcoder, link_clean))
                else:
                    logger.info('Cannot find title for %s' % link_clean)

                if not stdout:
                    # original link is returned as the transcoded path
                    logger.info('Original link %s is used as transcoded path'
                                % link_clean)
                    return link_clean, None, None, None
                else:
                    return None, None
        else:
            logger.error(
                'Link [clean %s] [original %s] cannot be parsed' % (
                    link_clean, link))
            if not stdout:
                return None, None, None, None
            else:
                return None, None
    except Exception as k:
        logger.error(str(k))
        if not stdout:
            return None, None, None, None
        else:
            return None, None
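
A hedged usage sketch of convert() with stdout=True, which per the docstring returns (title, content) and writes nothing to disk; the import assumes the same transcoder module name used by _value_added_process above, and the article URL is a placeholder.

import transcoder  # same module name _value_added_process imports from

title, content = transcoder.convert(language='en',
                                    link='http://example.com/story.html',
                                    stdout=True)
if content:
    print(title)
else:
    print('transcoding failed')
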
Example No. 55
0
def update(feed_link=None,
           feed_id=None,
           language=None,
           categories=None,
           transcoder_type='chengdujin',
           parser_type=None):
    """
    update could be called
    1. from task procedure: feed_id
    2. after an rss is added: feed_id
    3. manually for testing purpose: feed_link, language
    Note: categories are kept for manual testing
    """
    if not feed_id and not (feed_link and language):
        logger.error('Method malformed!')
        return None

    try:
        # try to find the feed in database
        if feed_id:
            feed = db_feeds.get(feed_id=feed_id)
        else:
            feed = db_feeds.get(feed_link=feed_link, language=language)

        if feed:
            # read latest feed info from database
            feed_id = str(feed['_id'])
            feed_link = feed['feed_link']
            language = feed['language']
            categories = feed['categories'].keys()
            transcoder_type = feed['transcoder']
            parser_type = feed['parser']
            feed_title = feed_title_new = feed[
                'feed_title'] if 'feed_title' in feed else None
            etag = etag_new = feed['etag'] if 'etag' in feed else None
            modified = modified_new = feed[
                'modified'] if 'modified' in feed else None
            status_new = None
            reason_new = None
            entries = None

            if parser_type == 'rss':
                import rss_parser
                # parse rss reading from remote rss servers
                entries, status_new, feed_title_new, etag_new, modified_new, \
                reason_new = rss_parser.parse(
                    feed_link, feed_id, feed_title, language, categories, etag,
                    modified)
            elif parser_type == 'twitter':
                import twitter_parser

                entries, status_new, feed_title_new, etag_new, reason_new = \
                    twitter_parser.parse(
                        feed_link, feed_id, feed_title, language, categories,
                        etag)
            else:
                pass

            if entries:
                # filter out existing entries in db_news
                # there are some possible exceptions -- yet let it be
                entries = db_news.dedup(entries, language)

                if entries:
                    logger.warning('%s entries of %s received!' %
                                   (str(len(entries)), feed_link))
                    # and do tts, big_images, image as well as transcode.
                    result = _value_added_process(entries, language,
                                                  transcoder_type)
                    if result:
                        # feed_title, etag and modified to db_feeds
                        # only feed_id is necessary, others are optional
                        # **kwargs
                        result = db_feeds.update(feed_id=feed_id,
                                                 status=status_new,
                                                 feed_title=feed_title_new,
                                                 etag=etag_new,
                                                 modified=modified_new,
                                                 reason=reason_new)
                        logger.warning('%s entries of %s added to database!' %
                                       (str(len(entries)), feed_link))
                        if result:
                            return result
                        else:
                            logger.info('Error found updating feeds database')
                            return None
                    else:
                        logger.info('Error found adding value to entries')
                        return None

                else:
                    logger.info('Nothing from RSS is found new!')
                    return None
            else:
                logger.info('Nothing from RSS is updated!')
                result = db_feeds.update(feed_id=feed_id,
                                         status=status_new,
                                         feed_title=feed_title_new,
                                         etag=etag_new,
                                         modified=modified_new,
                                         reason=reason_new)
                if not result:
                    logger.error('Error found updating feeds database')
                return None
        else:
            logger.warning('Register feed in database before updating!')
            return None
    except Exception as k:
        logger.error(str(k))
        return None
Example No. 56
0
def convert(language="en",
            title=None,
            link=None,
            updated=None,
            feed=None,
            transcoder="chengdujin",
            relative_path=None,
            stdout=False):
    """
    select a transcoder
    send the link
    gather the data
    combine them with the template
    generate paths
    return news and images
    * stdout prints the result directly; nothing is saved to disk
    * stdout defaults to False
    """
    if not language or not link:
        logger.error('Method malformed! language: %s link: %s' %
                     (language, link))
        if not stdout:
            return None, None, None, None
        else:
            return None, None

    try:
        link_clean = _preprocess(link)
        if link_clean:
            # build the list of transcoder candidates
            transcoders = _organize_transcoders(transcoder)
            title_new, content, images = _transcode(link_clean, transcoders,
                                                    language)
            # remove null content
            content = content.strip() if content else None

            # in case no title is found from feed information
            if not title:
                title = title_new

            if content and title:
                # slim down the content
                content = html_slimmer(content)
                if not stdout:
                    # embed content in template
                    news = _compose(language, title, updated, feed,
                                    _sanitize(content), images)
                    if news:
                        # create web/local path
                        web_path, local_path = _save(news, relative_path)
                        if web_path:
                            # the FINAL return
                            return web_path, local_path, content, images
                        else:
                            if not stdout:
                                return None, None, None, None
                            else:
                                return None, None
                    else:
                        logger.error(
                            'Cannot combine content with the template!')
                        if not stdout:
                            return None, None, None, None
                        else:
                            return None, None
                else:
                    return title, content
            else:
                if not content:
                    logger.info('Transcoder %s failed for %s' %
                                (transcoder, link_clean))
                else:
                    logger.info('Cannot find title for %s' % link_clean)

                if not stdout:
                    # original link is returned as the transcoded path
                    logger.info('Original link %s is used as transcoded path'
                                % link_clean)
                    return link_clean, None, None, None
                else:
                    return None, None
        else:
            logger.error('Link [clean %s] [original %s] cannot be parsed' %
                         (link_clean, link))
            if not stdout:
                return None, None, None, None
            else:
                return None, None
    except Exception as k:
        logger.error(str(k))
        if not stdout:
            return None, None, None, None
        else:
            return None, None
Example No. 57
0
def _value_added_process(entries=None,
                         language=None,
                         transcoder_type='chengdujin'):
    """
    add more value to an entry
    tts, transcode, images, redis_entry_expiration, database_entry_expiration
    """
    if not entries:
        logger.error('Method malformed!')
        return None
    if not language or language not in LANGUAGES:
        logger.error("Language not found or not supported!")
        return None

    updated_entries = []
    for i, entry in enumerate(entries):
        try:
            logger.info('... Working on %i of %d ...' % (i + 1, len(entries)))
            logger.info(entry['title'])
            logger.info(entry['link'])

            # [MUST-HAVE] transcoding
            # get a random int from 100 million possibilities
            rand = random.randint(0, 100000000)
            transcoded_relative_path = '%s_%s_%s_%i' % (
                entry['language'], entry['feed_id'], entry['updated'], rand)

            # there is a high chance the transcoder cannot work properly
            entry['transcoded'], entry[
                'transcoded_local'], raw_transcoded_content, \
            images_from_transcoded = transcoder.convert(
                entry['language'], entry['title'], entry['link'],
                entry['updated'], entry['feed'], transcoder_type,
                transcoded_relative_path)

            if entry['transcoded']:
                # [OPTIONAL] summary
                if entry['summary'] or raw_transcoded_content:
                    summary_found = summarizer.extract(
                        entry['language'], entry['title'],
                        str(raw_transcoded_content), entry['summary'],
                        entry['link'], entry['feed'],
                        '*|*'.join(entry['categories']))
                    entry['summary'] = summary_found
                #entry['summary'] = entry['summary'] if 'summary' in entry
                # and entry['summary'] else None

                # [OPTIONAL] images
                # process images found in the transcoded data
                if images_from_transcoded:
                    # images from transcoded are already normalized
                    entry['images'].extend(images_from_transcoded)
                    # remove duplicated images
                    images_deduped = illustrator.dedup_images(
                        entry['images']
                    ) if 'images' in entry and entry['images'] else None
                    # be cautious: dedup_images might return None if the
                    # network fails
                    if images_deduped:
                        entry['images'] = images_deduped
                entry['images'] = entry[
                    'images'] if 'images' in entry and entry['images'] else None

                # [OPTIONAL] generate 3 types of images: thumbnail,
                # category image and hot news image
                if 'images' in entry and entry['images']:
                    biggest = illustrator.find_biggest_image(entry['images'])
                    if biggest:
                        entry = _generate_images(biggest, entry, rand)
                # for older version users
                entry['image'] = entry['thumbnail_image'][
                    'url'] if 'thumbnail_image' in entry and entry[
                        'thumbnail_image'] else None

                # [OPTIONAL] text image
                # if no category_image is found, generate a text-image
                if 'category_image' not in entry or (
                        'category_image' in entry
                        and not entry['category_image']):
                    image_relative_path = '%s_%s_%s_%i' % (
                        entry['language'], entry['feed_id'], entry['updated'],
                        rand)
                    try:
                        text_img = text2img.Text2Image(
                            language, entry['title'],
                            '%s_textimage.png' % image_relative_path)
                        entry['text_image'] = text_img.get_image()
                    except Exception as k:
                        logger.error(
                            'Problem [%s] generating text2image for [%s]' %
                            (str(k), entry['link']))

                # [OPTIONAL] google tts not for indonesian
                if entry['language'] != 'in':
                    # _get_tts never returns None; at worst the original
                    # entry is returned
                    entry = _get_tts(entry, rand)

                # [MUST-HAVE] add expiration data
                def _expired(updated, days_to_deadline):
                    """
                    compute expiration information
                    return a UTC time string
                    """
                    deadline = datetime.utcfromtimestamp(updated) + timedelta(
                        days=days_to_deadline)
                    return time.asctime(
                        time.gmtime(calendar.timegm(deadline.timetuple())))

                entry['memory_expired'] = _expired(entry['updated'],
                                                   MEMORY_EXPIRATION_DAYS)
                entry['database_expired'] = _expired(entry['updated'],
                                                     DATABASE_REMOVAL_DAYS)

                # [OPTIONAL] if logger is used, this could be removed
                entry['error'] = entry[
                    'error'] if 'error' in entry and entry['error'] else None

                # [MUST-HAVE] update new entry to db_news
                # each entry is added with _id
                entry = db_news.update(entry)
                if entry:
                    # [MUST-HAVE] store in memory
                    result = memory.update(entry)
                    if result:
                        updated_entries.append(entry)
                    else:
                        logger.error('Error found in updating memory')
                        # remove entry in database
                        if clean_database.clean_by_item(entry):
                            logger.info('Cleaned %s in database' %
                                        entry['title'])
                        else:
                            logger.error('Error cleaning %s in database' %
                                         entry['title'])
                        # remove entry-created files on disk
                        if clean_disk.clean_by_item(entry):
                            logger.info('Cleaned %s on disk' % entry['title'])
                        else:
                            logger.error('Error cleaning %s on disk' %
                                         entry['title'])
                        continue
                else:
                    logger.error('Error found in updating to news database')
                    # remove entry-created files on disk
                    if clean_disk.clean_by_item(entry):
                        logger.info('Cleaned %s on disk' % entry['title'])
                    else:
                        logger.error('Error cleaning %s on disk' %
                                     entry['title'])
                    continue
            else:
                logger.error('Error found in transcoding')
                continue
        except Exception as k:
            logger.error(str(k))
            continue
    # the FINAL return
    if updated_entries:
        return True
    else:
        logger.info('No entry got value added!')
        return False
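
The memory/database expiration strings above are plain asctime-style UTC timestamps derived from the entry's 'updated' epoch time. Below is a minimal standalone sketch of that computation; the constant values and the helper name are illustrative assumptions, not the project's actual configuration:

# Hypothetical sketch of the expiration computation used above;
# MEMORY_EXPIRATION_DAYS / DATABASE_REMOVAL_DAYS values are assumed.
import calendar
import time
from datetime import datetime, timedelta

MEMORY_EXPIRATION_DAYS = 2    # assumed value
DATABASE_REMOVAL_DAYS = 30    # assumed value


def expired_at(updated, days_to_deadline):
    """Return an asctime-style UTC string for updated + days_to_deadline."""
    deadline = datetime.utcfromtimestamp(updated) + timedelta(
        days=days_to_deadline)
    return time.asctime(time.gmtime(calendar.timegm(deadline.timetuple())))


if __name__ == '__main__':
    now = int(time.time())
    print(expired_at(now, MEMORY_EXPIRATION_DAYS))   # memory expiry
    print(expired_at(now, DATABASE_REMOVAL_DAYS))    # database expiry
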
Example no. 58
0
def parse(feed_link=None, feed_id=None, feed_title=None, language=None,
          categories=None, etag=None, modified=None):
    """
    read rss/atom data from a given feed
    feed_id is the feed ObjectId in MongoDB
    Etag and Modified are used to save rss http server's bandwidth
    Note: category should be added to feed table/database
    """
    if not feed_link or not feed_id or not language or not categories:
        logger.error("Method malformed!")
        return None, None, feed_title, etag, modified, "Method malformed!"
    if language not in LANGUAGES:
        logger.error("Language not supported for %s!" % feed_link)
        return (None, None, feed_title, etag, modified,
                "Language not supported for %s!" % feed_link)

    def _validate_time(entry):
        """
        check whether the entry's updated time is recent enough to keep
        """
        deadline = datetime.utcfromtimestamp(
            entry['updated']) + timedelta(days=DATABASE_REMOVAL_DAYS)
        # compare in UTC, since 'updated' is a UTC timestamp
        return deadline > datetime.utcnow()

    try:
        # variables d and e follow feedparser tradition
        feedparser.USER_AGENT = "newsman"
        d = feedparser.parse(feed_link, etag=etag, modified=modified)
        if d:
            # http://pythonhosted.org/feedparser/reference-status.html
            # http://pythonhosted.org/feedparser/http-etag.html#http-etag
            status = d.status if 'status' in d else None

            if status == 301:
                logger.critical(
                    '%s has been permanently moved to %s!' % (
                        feed_link, d.href))
                return (None, status, feed_title, etag, modified,
                        '%s has been permanently moved to %s!' % (
                            feed_link, d.href))
            elif status == 304:
                logger.warning(
                    '%s server has not updated its feeds' % feed_link)
                return (None, status, feed_title, etag, modified,
                        '%s server has not updated its feeds' % feed_link)
            elif status == 410:
                logger.critical(
                    '%s is gone! Admin should check the feed availability!' %
                    feed_link)
                return (None, status, feed_title, etag, modified,
                        '%s is gone! Admin should check the feed '
                        'availability!' % feed_link)
            elif status == 200 or status == 302:
                # no need to worry.
                if status == 302:
                    logger.info(
                        '%s has been temporarily moved to a new URL' %
                        feed_link)

                if not feed_title:
                    # if the title is not found in the feed, an AttributeError
                    # will be raised.
                    feed_title = urllib2.unquote(
                        hparser.unescape(d.feed.title)).strip()
                else:
                    feed_title = feed_title.strip()
                    if 'title' in d.feed:
                        feed_title_latest = urllib2.unquote(
                            hparser.unescape(d.feed.title)).strip()
                        if feed_title != feed_title_latest:
                            # change feed title
                            logger.info(
                                '%s title changed! Please update feed '
                                'table/database' % feed_link)
                            logger.info('old title: %s' % feed_title)
                            logger.info('new title: %s' % feed_title_latest)
                            #feed_title = feed_title_latest
                    else:
                        logger.info(
                            '%s[%s] has no title in its latest RSS' % (
                                feed_title, feed_link))

                # update etag/modified
                etag = None
                modified = None
                try:
                    etag = d.etag
                except AttributeError:
                    try:
                        modified = d.modified
                    except AttributeError:
                        pass

                if 'entries' in d:
                    language = language if 'language' not in d else d.language
                    # an Exception might be raised from _read_entry
                    entries = []
                    logger.info('%s begins processing' % feed_title)
                    for i, e in enumerate(d.entries):
                        if e:
                            entry = _read_entry(
                                e, feed_id, feed_title, language, categories)
                            if entry:
                                entries.append(entry)
                            else:
                                logger.info('Cannot parse %s' % e['link'])
                                continue
                        else:
                            logger.info(
                                'No information found for entry %s' % i)
                            continue

                    if entries:
                        # the FINAL return
                        # the last element indicates nothing wrong happened in
                        # parsing
                        return filter(_validate_time,
                                      entries), status, feed_title, etag, \
                               modified, 'OK'
                    else:
                        logger.info('No valid entries parsed from the feed!')
                        return None, status, feed_title, etag, modified, \
                               'No valid entries parsed from the feed!'
                else:
                    logger.info("Feed %s has no items!" % feed_id)
                    return (None, status, feed_title, etag, modified,
                            'Feed %s has no items!' % feed_id)
            else:
                logger.info(
                    'HTTP Error Code [%s] for %s' % (status, feed_link))
                return (None, status, feed_title, etag, modified,
                        'HTTP Error Code [%s] for %s' % (status, feed_link))
        else:
            logger.info("Cannot parse %s correctly!" % feed_id)
            return (None, None, feed_title, etag, modified,
                    "Cannot parse %s correctly!" % feed_id)
    except Exception as k:
        logger.exception('%s for %s' % (str(k), feed_id))
        return None, None, feed_title, etag, modified, '%s for %s' % (
            str(k), feed_id)
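
For context, here is a hedged usage sketch of parse(): the caller keeps etag/modified from the previous poll and passes them back so an unchanged feed can answer with 304 and no body. The feed URL, ObjectId string, and category below are placeholders, not values from the project:

# Hypothetical caller; all literal values below are placeholders.
etag, modified = None, None   # as returned by the previous poll, if any

entries, status, title, etag, modified, note = parse(
    feed_link='http://example.com/rss.xml',
    feed_id='0123456789abcdef01234567',
    feed_title=None,
    language='en',
    categories=['World'],
    etag=etag,
    modified=modified)

if entries:
    for entry in entries:
        print(entry.get('title'))
elif status == 304:
    pass   # server reports no change since the last poll
else:
    print('parse() reported: %s' % note)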