Beispiel #1
0
def find_biggest_image(images=None):
    """
    Pick the image with the largest pixel area out of ``images``.

    Only images whose ``width * height`` is strictly greater than the
    area given by ``MIN_IMAGE_SIZE`` can win. Images without both
    'width' and 'height' keys are logged and skipped.

    Returns the winning image, or None when the list is empty, nothing
    is large enough, or any error occurs while scanning.
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None

    try:
        floor = MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1]
        winner, winner_area = None, floor
        for candidate in images:
            if 'width' not in candidate or 'height' not in candidate:
                logger.info('Height and width not found! %s' % str(candidate))
                continue

            area = int(candidate['width']) * int(candidate['height'])
            if area <= floor:
                logger.info('Image [%s] is not big enough!' %
                            str(candidate['url']))
            elif area > winner_area:
                # strictly larger than everything seen so far
                winner, winner_area = candidate, area
        return winner
    except Exception as err:
        logger.error('Problem:[%s]\nSource:[%s]' % (str(err), str(images)))
        return None
Beispiel #2
0
def _collect_images(data=None, referer=None):
    """
    Gather every image that can be found for a transcoded page.

    Sources, in order:
    1. the entries of data['image_list'] (their 'src' is normalized with
       illustrator.find_image)
    2. the images illustrator.find_images discovers in data['content'];
       the content it hands back replaces data['content']

    Returns (images, data) where images is the de-duplicated list or None
    when nothing was found. Note that empty input and any failure return
    a bare None rather than a tuple.
    """
    if not data:
        return None

    try:
        collected = []

        # 1. images announced in image_list
        listed = data.get('image_list') if 'image_list' in data else None
        for entry in listed or []:
            if 'src' not in entry or not entry['src']:
                continue
            normalized = illustrator.find_image(entry['src'].strip(), referer)
            if normalized:
                collected.append(normalized)

        # 2. images embedded in the content
        embedded, rewritten = illustrator.find_images(data['content'], referer)
        data['content'] = rewritten
        if embedded:
            collected.extend(embedded)

        # drop duplicates; an empty result is reported as None
        if collected:
            collected = illustrator.dedup_images(collected)
        else:
            collected = None
        return collected, data
    except Exception as err:
        logger.error(str(err))
        return None
Beispiel #3
0
def _extract(data=None, referer=None):
    """
    Pull the title, sanitized text content and images out of ``data``.

    ``data`` must carry 'STRUCT_PAGE_TYPE' and 'content'; 'title' is
    optional. A STRUCT_PAGE_TYPE of 0 means the page could not be
    interpreted.

    Returns (title, content, images); all three are None when there is
    no data, the page type is 0, or processing fails. The page-type
    lookup happens outside the try block, so a missing or non-numeric
    value raises to the caller.
    """
    failure = (None, None, None)

    if not data:
        logger.error('Received no data from UCK server.')
        return failure

    if int(data['STRUCT_PAGE_TYPE']) == 0:
        logger.info('Cannot interpret the page! status != 1')
        return failure

    try:
        # text: drop backslashes, then sanitize
        body = _sanitize(data['content'].replace("\\", ""), referer)

        # images: data comes back with its content rewritten
        found, data = _collect_images(data, referer)

        headline = data['title'] if 'title' in data else None
        return headline, body, found or None
    except Exception as err:
        logger.error(str(err))
        return failure
Beispiel #4
0
    def _find_keywords(self):
        """
        Compute the word-frequency map of the article.

        The article (self._article) is segmented with self._segment_text,
        stop words read from '<DATA_PATH><language>_stopwords' are removed,
        and the remaining words are counted.

        Returns a tuple (keywords, total_words) where keywords is a list of
        (word, count) pairs sorted by descending count and total_words is
        the number of words before stop-word removal. Returns None if
        anything fails (the error is logged).
        """
        from collections import Counter

        try:
            words = self._segment_text(self._article)

            # remove stop words
            stopwords_path = '%s%s_stopwords' % (DATA_PATH, self._language)
            # ar, en, id, ja, pt, th, zh
            # 'with' guarantees the file is closed even if reading fails
            with open(stopwords_path, 'r') as f:
                lines = f.readlines()

            # compile once instead of once per stop word
            non_word = re.compile(r'[^\w ]', flags=re.UNICODE)
            # a set gives O(1) membership tests while filtering
            stopwords = set(
                str(non_word.sub("", unicode(line.strip())))
                for line in lines if line.strip())
            words_filtered = [word for word in words if word not in stopwords]

            # word-frequency in a single pass (was list.count per word)
            counts = Counter(words_filtered)
            keywords = sorted(counts.items(), key=lambda x: -x[1])

            return (keywords, len(words))
        except Exception as k:
            logger.error(str(k))
            return None
Beispiel #5
0
    def __init__(self, url, language):
        """
        Fetch the page behind ``url`` and extract its article parts.

        The raw page is obtained from transcoder.prepare_link, rewritten
        with the 'replace_brs' and 'replace_fonts' patterns of
        self.regexps, parsed with BeautifulSoup, and then handed to the
        instance's helper methods, which fill in title, short_title,
        content and images.

        Any exception is logged and swallowed, so after a failure the
        instance may only be partially initialized.
        """
        try:
            self.candidates = {}
            self.url = url
            self.language = language

            # raw page data, then two regex rewrites on its string form
            self.data = transcoder.prepare_link(self.url)
            self.data = self.regexps['replace_brs'].sub(
                "</p><p>", str(self.data))
            self.data = self.regexps['replace_fonts'].sub(
                "<\g<1>span>", str(self.data))

            # undecodable bytes are dropped ('ignore') before parsing
            self.html = BeautifulSoup(self.data.decode('utf-8', 'ignore'))
            self.article_image = None
            # these helpers run right after self.html is set and before the
            # extraction below; presumably they work on self.html -- TODO confirm
            self._get_specific_image()
            self._remove_script()
            self._remove_style()
            self._remove_link()

            self.title = self._get_title()
            self.short_title = self._get_short_title()
            self.content = self._get_article()
            self.images = self._get_images()
        except Exception as k:
            # best effort: log and leave the object as far as it got
            logger.error(str(k))
Beispiel #6
0
    def _find_top_keywords(self, keywords=None, words_count=None):
        """
        Score the leading keywords by their share of the article.

        Only the first TOP_KEYWORDS_LIMIT entries of ``keywords`` are
        used; each entry is indexed as (word, count). The score is
        count / words_count multiplied by 1.5.

        Returns a list of (word, score) pairs sorted by descending score,
        or None on missing arguments or on error.
        """
        if not (keywords and words_count):
            logger.error("Method malformed!")
            return None

        try:
            scored = []
            for entry in keywords[:TOP_KEYWORDS_LIMIT]:
                share = float(entry[1]) / float(words_count)
                scored.append((entry[0], share * 1.5))

            scored.sort(key=lambda pair: -pair[1])
            return scored
        except Exception as err:
            logger.error(str(err))
            return None
Beispiel #7
0
def _get_actual_link(prefix=None, link=None):
    """
    Find the actual news link hidden inside the page at ``link``.

    ``prefix`` selects the (tag, attrs) pair in HIDDEN_LINKS used to
    locate the wrapper element; the href of the first <a> inside it is
    appended to ``prefix``.

    Returns the combined link as a str, or None when an argument is
    missing, the wrapper is not found, or anything fails (logged).
    """
    if not prefix or not link:
        logger.error('Method malformed! Prefix:[%s], Link:[%s]' %
                     (prefix, link))
        # bail out here instead of falling through to a pointless request
        return None

    try:
        raw_data = urllib2.urlopen(link)
        try:
            data = raw_data.readlines()
        finally:
            # always release the connection
            raw_data.close()

        # str() is critical (original author's note: the list of lines is
        # parsed through its string form)
        soup = BeautifulStoneSoup(str(data))
        html_tag, html_attrs = HIDDEN_LINKS[prefix]
        html_wrapper = soup.find(name=html_tag, attrs=html_attrs)
        if not html_wrapper:
            return None

        actual_suffix = html_wrapper.find('a')['href']
        return str('%s%s' % (prefix, actual_suffix))
    except Exception as k:
        logger.error('Cannot open %s' % k)
        return None
Beispiel #8
0
def _extract(data=None, referer=None):
    """
    Pull the title, sanitized text content and images out of ``data``.

    ``data`` must carry 'STRUCT_PAGE_TYPE' and 'content'; 'title' is
    optional. A STRUCT_PAGE_TYPE of 0 means the page could not be
    interpreted.

    Returns (title, content, images); all three are None when there is
    no data, the page type is 0, or processing fails. The page-type
    lookup happens outside the try block, so a missing or non-numeric
    value raises to the caller.
    """
    failure = (None, None, None)

    if not data:
        logger.error('Received no data from UCK server.')
        return failure

    if int(data['STRUCT_PAGE_TYPE']) == 0:
        logger.info('Cannot interpret the page! status != 1')
        return failure

    try:
        # text: drop backslashes, then sanitize
        body = _sanitize(data['content'].replace("\\", ""), referer)

        # images: data comes back with its content rewritten
        found, data = _collect_images(data, referer)

        headline = data['title'] if 'title' in data else None
        return headline, body, found or None
    except Exception as err:
        logger.error(str(err))
        return failure
Beispiel #9
0
    def _check_image(self, image_url=None, image_html=None):
        """
        Return a usable URL for the image, falling back to a local copy.

        The image is requested without custom HEADERS. If the request
        fails, answers with an HTTP error status (>= 400), or the URL
        contains 'posttoday.com/media/content', the image is stored via
        self._download_copy and that copy's URL is returned when
        available; otherwise the original URL is returned.

        Raises Exception when image_url or image_html is empty.
        """
        if not image_url:
            logger.error('Image URL is found VOID!')
            raise Exception('Image URL is found VOID!')
        if not image_html:
            logger.error('Image content is found VOID!')
            raise Exception('Image content is found VOID!')

        try:
            response = requests.get(image_url, timeout=UCK_TIMEOUT)
            # >= 400: status 400 itself is an error too (was "> 400")
            if response.status_code >= 400 or \
                    'posttoday.com/media/content' in image_url:
                raise Exception('Without HEADERS [%s] cannot be reached!' %
                                str(image_url))
        except Exception as k:
            logger.info('Problem:[%s] Source:[%s]' % (str(k), str(image_url)))

            # replace original image_url with downloaded local copy
            image_url_new = self._download_copy(image_url, image_html)
            return image_url_new if image_url_new else image_url

        # Image is accessible with/without HEADERS
        return image_url
Beispiel #10
0
def _find_redirected_link(url=None):
    """
    Resolve ``url`` to the address it finally settles on.

    The URL is requested repeatedly (at most 10 times); as soon as a
    response reports the same URL that was requested, that URL is
    returned. Returns None for an empty url or when the address still
    changes after 10 requests.
    """
    if not url:
        logger.error('Link [%s] is not valid!' % url)
        return None

    current = url
    for _ in range(10):
        previous = current
        current = requests.get(previous).url
        if current == previous:
            # no further hop: this is the real link
            return current

    return None
Beispiel #11
0
def clean():
    """
    Remove expired items from every document collection.

    An item is expired when its 'updated' value is older than
    DATABASE_REMOVAL_DAYS days (compared as POSIX seconds, UTC). For each
    expired item the in-memory trace and the files on disk are cleaned
    before the database record itself is removed.

    Returns True when the pass completes, False when no collections are
    found or an error occurs.
    """
    logger.info('... cleaning database ...')
    try:
        names = _find_document_names()
        if not names:
            logger.error('Cannot find documents')
            return False

        for name in names:
            collection = Collection(db, name)

            # threshold: now minus the retention period, as POSIX seconds
            now_posix = calendar.timegm(time.gmtime())
            cutoff = datetime.utcfromtimestamp(now_posix) - timedelta(
                days=DATABASE_REMOVAL_DAYS)
            cutoff_posix = calendar.timegm(cutoff.timetuple())

            expired = collection.find({'updated': {'$lt': cutoff_posix}})
            for stale in expired:
                # memory first, then disk, then the record itself
                clean_memory.clean_by_item(str(stale['_id']))
                clean_disk.clean_by_item(stale)
                collection.remove({'_id': stale['_id']})
        return True
    except Exception as err:
        logger.error(str(err))
        return False
Beispiel #12
0
def _is_valid(content, language):
    """
    Tell whether ``content`` is long enough to count as a paragraph.

    For 'zh*' and 'ja' the length is measured in characters against
    PARAGRAPH_CRITERIA_KANJI; for 'th*' and everything else it is
    measured in whitespace-separated words against
    PARAGRAPH_CRITERIA_THAI or PARAGRAPH_CRITERIA_LATIN respectively.

    Returns False on missing arguments or on any error.
    """
    if not (content and language):
        logger.error('Method malformed!')
        return False

    try:
        # byte strings are decoded with the encoding chardet detects
        if isinstance(content, str):
            detected = chardet.detect(content)['encoding']
            content = content.decode(detected, 'ignore')

        if language.startswith('zh') or language == 'ja':
            return len(content) >= PARAGRAPH_CRITERIA_KANJI

        if language.startswith('th'):
            minimum = PARAGRAPH_CRITERIA_THAI
        else:
            minimum = PARAGRAPH_CRITERIA_LATIN
        return len(content.split()) >= minimum
    except Exception as err:
        logger.error(str(err))
        return False
Beispiel #13
0
def _collect_images(data=None, referer=None):
    """
    Gather every image that can be found for a transcoded page.

    Sources, in order:
    1. the entries of data['image_list'] (their 'src' is normalized with
       illustrator.find_image)
    2. the images illustrator.find_images discovers in data['content'];
       the content it hands back replaces data['content']

    Returns (images, data) where images is the de-duplicated list or None
    when nothing was found. Note that empty input and any failure return
    a bare None rather than a tuple.
    """
    if not data:
        return None

    try:
        collected = []

        # 1. images announced in image_list
        listed = data.get('image_list') if 'image_list' in data else None
        for entry in listed or []:
            if 'src' not in entry or not entry['src']:
                continue
            normalized = illustrator.find_image(entry['src'].strip(), referer)
            if normalized:
                collected.append(normalized)

        # 2. images embedded in the content
        embedded, rewritten = illustrator.find_images(data['content'], referer)
        data['content'] = rewritten
        if embedded:
            collected.extend(embedded)

        # drop duplicates; an empty result is reported as None
        if collected:
            collected = illustrator.dedup_images(collected)
        else:
            collected = None
        return collected, data
    except Exception as err:
        logger.error(str(err))
        return None
Beispiel #14
0
def extract(language, title, content, summary, link, feed, category):
    """
    Produce a summary for an article, trying three sources in turn.

    1. PyTeaser, for the languages 'en', 'ja', 'pt' and 'th'
    2. the provider's ``summary``, shortened by _get_summary
    3. the first paragraph of ``content`` via _get_first_paragraph

    Returns the first non-empty result, or None when content, title or
    language is missing or an error occurs.
    """
    if not (content and title and language):
        logger.error('No data is found!')
        return None

    try:
        result = ""

        if language in ('en', 'ja', 'pt', 'th'):
            teaser = PyTeaser(language, title, content, link, feed, category)
            result = teaser.summarize()

        # fall back to the summary shipped by the rss provider
        if not result and summary:
            result = _get_summary(summary, language)

        # last resort: first paragraph of the transcoded content
        # (content is known to be non-empty at this point)
        if not result:
            result = _get_first_paragraph(content, language)

        return result
    except Exception as err:
        logger.error(str(err))
        return None
Beispiel #15
0
def _get_shorter_text(content, language, limit):
    """
    Shorten ``content`` to sentences whose combined length fits ``limit``.

    The text is split into sentences (a regexp tokenizer for 'zh'/'ja',
    whitespace splitting for 'th', nltk.sent_tokenize otherwise). Each
    non-empty sentence is appended, separated by a space, as long as the
    accumulated length stays within ``limit`` characters; a sentence that
    does not fit is skipped and later, shorter ones may still be added.

    Returns the shortened text as str, or None on missing arguments or on
    error.
    """
    if not content or not language:
        logger.error('Method malformed!')
        return None

    try:
        # data should be processed as unicode, so
        if isinstance(content, str):
            content = content.decode(
                chardet.detect(content)['encoding'], 'ignore')

        # break text by sentence
        if language == 'zh' or language == 'ja':
            jp_sent_tokenizer = nltk.RegexpTokenizer('[^!?.!?。.]*[!?.!?。]*')
            sentences = jp_sent_tokenizer.tokenize(content)
        elif language == 'th':
            # was a second plain "if": its else-branch overwrote the zh/ja
            # result above with nltk.sent_tokenize
            sentences = content.split()
        else:
            # every other language goes through nltk's sentence tokenizer
            sentences = nltk.sent_tokenize(content)

        enough_sentences = u""
        for sentence in sentences:
            # sentence is in unicode, len() then applies to CJK
            sentence = sentence.strip()
            if sentence:
                # +1 accounts for the separating space
                if len(enough_sentences) + len(sentence) + 1 <= limit:
                    enough_sentences = "%s %s" % (enough_sentences, sentence)

        return str(enough_sentences.strip())
    except Exception as k:
        logger.error(str(k))
        return None
Beispiel #16
0
def _get_summary(content, language):
    """
    Return the first readable summary found in ``content``.

    The HTML is converted to plain text (links, images and emphasis
    ignored, no line wrapping) and split into paragraphs on blank lines.
    The first paragraph that passes _is_valid and yields a non-empty
    result from _get_shorter_text (limited to SUMMARY_LENGTH_LIMIT) is
    returned. Returns None when nothing qualifies, on missing arguments,
    or on error.
    """
    if not (content and language):
        logger.error('Method malformed!')
        return None

    try:
        # strip off html code
        stripper = html2text.HTML2Text()
        stripper.ignore_links = True
        stripper.ignore_images = True
        stripper.ignore_emphasis = True
        stripper.body_width = 0

        plain = stripper.handle(content).strip().strip('#').strip()
        for paragraph in plain.split("\n\n"):
            if not paragraph.strip():
                continue
            if _is_valid(paragraph, language):
                shortened = _get_shorter_text(
                    paragraph, language, SUMMARY_LENGTH_LIMIT)
                if shortened:
                    return shortened
        return None
    except Exception as err:
        logger.error(str(err))
        return None
Beispiel #17
0
def _get_tts(entry=None, rand=None):
    """
    Attach a text-to-speech recording to ``entry``.

    The spoken text is "<title>. <summary>" (summary omitted when absent
    or empty). The file name is built from the entry's language, feed_id
    and updated fields plus ``rand``; a random number is drawn when
    ``rand`` is not given.

    On success entry['mp3'] and entry['mp3_local'] hold the two values
    returned by tts_provider.google. On failure both are set to None and
    the error text is appended to entry['error'] (created if missing).

    Returns the (mutated) entry; an empty entry is returned unchanged.
    """
    if not entry:
        logger.error('Method malformed!')
        return entry
    if not rand:
        # get a new rand
        rand = random.randint(0, 100000000)

    try:
        tts_relative_path = '%s_%s_%s_%i.mp3' % (
            entry['language'], entry['feed_id'], entry['updated'], rand)
        summary = entry.get('summary') or ""
        read_content = '%s. %s' % (entry['title'], summary)
        entry['mp3'], entry['mp3_local'] = tts_provider.google(
            entry['language'], read_content, tts_relative_path)
    except Exception as k:
        logger.error(str(k))
        # setdefault: the handler must not fail itself when the entry has
        # no 'error' list yet
        entry.setdefault('error', []).append(str(k) + '\n')
        entry['mp3'] = None
        entry['mp3_local'] = None
    return entry
Beispiel #18
0
def save(feed_info=None):
    """
    Register a feed and return its id as a string.

    A feed is identified by its 'feed_link' and 'language'. When such a
    record already exists its id is returned; otherwise ``feed_info`` is
    stored with 'updated_times' set to 0 and 'latest_update' set to None.

    Returns None on missing input or on error.
    """
    if not feed_info:
        logger.error("Method malformed!")
        return None

    try:
        # if the collection does not exist, it will be created
        feeds = Collection(db, FEED_REGISTRAR)
        lookup = {'feed_link': feed_info['feed_link'],
                  'language': feed_info['language']}
        existing = feeds.find_one(lookup)
        if existing:
            return str(existing['_id'])

        # first time this feed is seen
        feed_info['updated_times'] = 0
        feed_info['latest_update'] = None
        return str(feeds.save(feed_info))
    except Exception as err:
        logger.error(str(err))
        return None
Beispiel #19
0
    def _check_image(self, image_url=None, image_html=None):
        """
        Return a usable URL for the image, falling back to a local copy.

        The image is requested without custom HEADERS. If the request
        fails, answers with an HTTP error status (>= 400), or the URL
        contains 'posttoday.com/media/content', the image is stored via
        self._download_copy and that copy's URL is returned when
        available; otherwise the original URL is returned.

        Raises Exception when image_url or image_html is empty.
        """
        if not image_url:
            logger.error('Image URL is found VOID!')
            raise Exception('Image URL is found VOID!')
        if not image_html:
            logger.error('Image content is found VOID!')
            raise Exception('Image content is found VOID!')

        try:
            response = requests.get(image_url, timeout=UCK_TIMEOUT)
            # >= 400: status 400 itself is an error too (was "> 400")
            if response.status_code >= 400 or \
                    'posttoday.com/media/content' in image_url:
                raise Exception(
                    'Without HEADERS [%s] cannot be reached!' % str(image_url))
        except Exception as k:
            logger.info('Problem:[%s] Source:[%s]' % (str(k), str(image_url)))

            # replace original image_url with downloaded local copy
            image_url_new = self._download_copy(image_url, image_html)
            return image_url_new if image_url_new else image_url

        # Image is accessible with/without HEADERS
        return image_url
Beispiel #20
0
def convert(link):
    """
    Send ``link`` to the UCK api and reformat the content it returns.

    The raw answer of _transcode is evaluated into a Python object and
    passed to _extract.

    Returns (title, transcoded_content, images); all three are None when
    the link is empty, nothing comes back, the answer cannot be
    evaluated, or any other error occurs.
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    # send link to uck server and get data back
    try:
        raw_data = _transcode(link)
        if not raw_data:
            logger.info('Cannot read anything from UCK server')
            return None, None, None

        # check if raw_data is syntax-correct
        try:
            # SECURITY NOTE(review): eval() executes whatever the remote
            # server sent. Kept for compatibility with the payload format,
            # but it is now evaluated only once; consider a safe parser
            # (e.g. ast.literal_eval or json) if the format allows it.
            parsed = eval(raw_data)
        except Exception:
            logger.info('Invalid syntax found for UCK output')
            return None, None, None

        # text is sanitized, images are found from image_list
        title, transcoded, images = _extract(parsed, link)
        return title, transcoded, images
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
Beispiel #21
0
def convert(link):
    """
    Transcode a web page with burify's readability implementation.

    The page is fetched with transcoder.prepare_link, wrapped in a
    Document, and its full-html summary is searched for images.

    Returns (short_title, content, images); all three are None when the
    link is empty, the page cannot be fetched or recognized, or an error
    occurs.
    """
    failure = (None, None, None)

    if not link:
        logger.error('Cannot transcode nothing!')
        return failure

    try:
        data = transcoder.prepare_link(link)
        if not data:
            logger.info('Cannot parse %s correctly' % link)
            return failure

        article = Document(data)
        if not article:
            logger.info('Burify cannot recognize the data')
            return failure

        summary_html = article.summary(html_partial=False)
        images, content = _collect_images(summary_html, link)
        return article.short_title(), content, images
    except Exception as err:
        logger.error('%s for %s' % (str(err), str(link)))
        return failure
Beispiel #22
0
def _find_redirected_link(url=None):
    """
    Resolve ``url`` to the address it finally settles on.

    The URL is requested repeatedly (at most 10 times); as soon as a
    response reports the same URL that was requested, that URL is
    returned. Returns None for an empty url or when the address still
    changes after 10 requests.
    """
    if not url:
        logger.error('Link [%s] is not valid!' % url)
        return None

    current = url
    for _ in range(10):
        previous = current
        current = requests.get(previous).url
        if current == previous:
            # no further hop: this is the real link
            return current

    return None
Beispiel #23
0
def convert(link):
    """
    Send ``link`` to the UCK api and reformat the content it returns.

    The raw answer of _transcode is evaluated into a Python object and
    passed to _extract.

    Returns (title, transcoded_content, images); all three are None when
    the link is empty, nothing comes back, the answer cannot be
    evaluated, or any other error occurs.
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    # send link to uck server and get data back
    try:
        raw_data = _transcode(link)
        if not raw_data:
            logger.info('Cannot read anything from UCK server')
            return None, None, None

        # check if raw_data is syntax-correct
        try:
            # SECURITY NOTE(review): eval() executes whatever the remote
            # server sent. Kept for compatibility with the payload format,
            # but it is now evaluated only once; consider a safe parser
            # (e.g. ast.literal_eval or json) if the format allows it.
            parsed = eval(raw_data)
        except Exception:
            logger.info('Invalid syntax found for UCK output')
            return None, None, None

        # text is sanitized, images are found from image_list
        title, transcoded, images = _extract(parsed, link)
        return title, transcoded, images
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
Beispiel #24
0
def dedup_images(images=None):
    """
    Remove images that share a URL, keeping the first occurrence.

    image: {'url':xxx, 'width':yyy, 'height':zzz}
    images = [image, image, image]

    Returns a new list in the original order, or None when ``images`` is
    empty or an image cannot be processed (e.g. it has no 'url').
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None

    try:
        seen_urls = set()  # O(1) membership instead of scanning a list
        unique = []
        for image in images:
            url = image['url']
            if url not in seen_urls:
                seen_urls.add(url)
                unique.append(image)
        return unique
    except Exception as k:
        logger.info('Problem:[%s]\nSource:[%s]' % (str(k), str(images)))
        return None
Beispiel #25
0
def convert(link):
    """
    Transcode a web page with burify's readability implementation.

    The page is fetched with transcoder.prepare_link, wrapped in a
    Document, and its full-html summary is searched for images.

    Returns (short_title, content, images); all three are None when the
    link is empty, the page cannot be fetched or recognized, or an error
    occurs.
    """
    failure = (None, None, None)

    if not link:
        logger.error('Cannot transcode nothing!')
        return failure

    try:
        data = transcoder.prepare_link(link)
        if not data:
            logger.info('Cannot parse %s correctly' % link)
            return failure

        article = Document(data)
        if not article:
            logger.info('Burify cannot recognize the data')
            return failure

        summary_html = article.summary(html_partial=False)
        images, content = _collect_images(summary_html, link)
        return article.short_title(), content, images
    except Exception as err:
        logger.error('%s for %s' % (str(err), str(link)))
        return failure
Beispiel #26
0
def _get_actual_link(prefix=None, link=None):
    """
    Find the actual news link hidden inside the page at ``link``.

    ``prefix`` selects the (tag, attrs) pair in HIDDEN_LINKS used to
    locate the wrapper element; the href of the first <a> inside it is
    appended to ``prefix``.

    Returns the combined link as a str, or None when an argument is
    missing, the wrapper is not found, or anything fails (logged).
    """
    if not prefix or not link:
        logger.error(
            'Method malformed! Prefix:[%s], Link:[%s]' % (prefix, link))
        # bail out here instead of falling through to a pointless request
        return None

    try:
        raw_data = urllib2.urlopen(link)
        try:
            data = raw_data.readlines()
        finally:
            # always release the connection
            raw_data.close()

        # str() is critical (original author's note: the list of lines is
        # parsed through its string form)
        soup = BeautifulStoneSoup(str(data))
        html_tag, html_attrs = HIDDEN_LINKS[prefix]
        html_wrapper = soup.find(name=html_tag, attrs=html_attrs)
        if not html_wrapper:
            return None

        actual_suffix = html_wrapper.find('a')['href']
        return str('%s%s' % (prefix, actual_suffix))
    except Exception as k:
        logger.error('Cannot open %s' % k)
        return None
Beispiel #27
0
def dedup_images(images=None):
    """
    Remove images that share a URL, keeping the first occurrence.

    image: {'url':xxx, 'width':yyy, 'height':zzz}
    images = [image, image, image]

    Returns a new list in the original order, or None when ``images`` is
    empty or an image cannot be processed (e.g. it has no 'url').
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None

    try:
        seen_urls = set()  # O(1) membership instead of scanning a list
        unique = []
        for image in images:
            url = image['url']
            if url not in seen_urls:
                seen_urls.add(url)
                unique.append(image)
        return unique
    except Exception as k:
        logger.info('Problem:[%s]\nSource:[%s]' % (str(k), str(images)))
        return None
Beispiel #28
0
    def _clean_article(self):
        """
        Turn self._article into plain text, in place.

        The article is HTML-unescaped and URL-unquoted, converted to text
        with html2text (links, images and emphasis ignored, no wrapping),
        stripped of surrounding whitespace and '#', and finally decoded
        with the encoding chardet detects if it is still a byte string.

        Always returns None; errors are logged.
        """
        try:
            # convert to normal encoding
            unescaped = hparser.unescape(self._article)
            self._article = str(urllib2.unquote(unescaped))

            # remove unnecessary parts
            stripper = html2text.HTML2Text()
            stripper.ignore_links = True
            stripper.ignore_images = True
            stripper.ignore_emphasis = True
            # body_width = 0 disables text wrapping
            stripper.body_width = 0
            plain = stripper.handle(self._article)
            self._article = plain.strip().strip("#").strip()

            # convert to appropriate encoding
            if isinstance(self._article, str):
                detected = chardet.detect(self._article)['encoding']
                self._article = self._article.decode(detected, 'ignore')
        except Exception as err:
            logger.error(str(err))
            return None
Beispiel #29
0
def find_biggest_image(images=None):
    """
    Pick the image with the largest pixel area out of ``images``.

    Only images whose ``width * height`` is strictly greater than the
    area given by ``MIN_IMAGE_SIZE`` can win. Images without both
    'width' and 'height' keys are logged and skipped.

    Returns the winning image, or None when the list is empty, nothing
    is large enough, or any error occurs while scanning.
    """
    if not images:
        logger.error('Image list is found VOID!')
        return None

    try:
        floor = MIN_IMAGE_SIZE[0] * MIN_IMAGE_SIZE[1]
        winner, winner_area = None, floor
        for candidate in images:
            if 'width' not in candidate or 'height' not in candidate:
                logger.info('Height and width not found! %s' % str(candidate))
                continue

            area = int(candidate['width']) * int(candidate['height'])
            if area <= floor:
                logger.info('Image [%s] is not big enough!' %
                            str(candidate['url']))
            elif area > winner_area:
                # strictly larger than everything seen so far
                winner, winner_area = candidate, area
        return winner
    except Exception as err:
        logger.error('Problem:[%s]\nSource:[%s]' % (str(err), str(images)))
        return None
Beispiel #30
0
 def _clean_style(self, e):
     """
     Delete the 'class', 'id' and 'style' attributes from every element
     returned by e.findAll(True). Errors are logged and swallowed.
     """
     try:
         for node in e.findAll(True):
             for attribute in ('class', 'id', 'style'):
                 del node[attribute]
     except Exception as err:
         logger.error(str(err))
Beispiel #31
0
    def __init__(self, image_url=None, referer=None):
        """
        Analyze the image behind ``image_url`` and record its size.

        Raises Exception when ``image_url`` is empty.
        """
        if not image_url:
            logger.error('Method malformed!')
            raise Exception('Method malformed!')

        # _analyze yields the (url, html) pair kept on the instance
        analyzed = self._analyze(image_url, referer)
        self._image_url, self._image_html = analyzed
        self._image_size = self._calculate_size(self._image_html)

        self._clean_data()
Beispiel #32
0
    def __init__(self, image_url=None, referer=None):
        """
        Analyze the image behind ``image_url`` and record its size.

        Raises Exception when ``image_url`` is empty.
        """
        if not image_url:
            logger.error('Method malformed!')
            raise Exception('Method malformed!')

        # _analyze yields the (url, html) pair kept on the instance
        analyzed = self._analyze(image_url, referer)
        self._image_url, self._image_html = analyzed
        self._image_size = self._calculate_size(self._image_html)

        self._clean_data()
Beispiel #33
0
 def get_image(self):
     """
     Describe the text image as a dict with 'url', 'width' and 'height'.

     The url is IMAGES_PUBLIC_DIR followed by the instance's relative
     path; the size comes from CATEGORY_IMAGE_SIZE. Returns None on error.
     """
     try:
         return {
             'url': "%s%s" % (IMAGES_PUBLIC_DIR,
                              self._textimage_relative_path),
             'width': CATEGORY_IMAGE_SIZE[0],
             'height': CATEGORY_IMAGE_SIZE[1],
         }
     except Exception as err:
         logger.error(str(err))
         return None
Beispiel #34
0
 def run(self):
     """
     Run the transcoder named by self.transcoder and store the outcome
     in self.result.

     The name is resolved with eval(). 'simplr' is called with the url
     and the language and is not guarded; every other transcoder is
     called with the url only, and a failure is logged and stored as
     (None, None, None).
     """
     if self.transcoder != 'simplr':
         try:
             self.result = eval(self.transcoder).convert(self.url)
         except Exception as err:
             logger.error(str(err))
             self.result = None, None, None
         return

     # simplr additionally needs the language; errors propagate
     backend = eval(self.transcoder)
     self.result = backend.convert(self.url, self.language)
Beispiel #35
0
 def run(self):
     """
     Run the transcoder named by self.transcoder and store the outcome
     in self.result.

     The name is resolved with eval(). 'simplr' is called with the url
     and the language and is not guarded; every other transcoder is
     called with the url only, and a failure is logged and stored as
     (None, None, None).
     """
     if self.transcoder != 'simplr':
         try:
             self.result = eval(self.transcoder).convert(self.url)
         except Exception as err:
             logger.error(str(err))
             self.result = None, None, None
         return

     # simplr additionally needs the language; errors propagate
     backend = eval(self.transcoder)
     self.result = backend.convert(self.url, self.language)
Beispiel #36
0
    def __init__(self, language=None, title=None, article=None, link=None,
                 blog=None, category=None):
        """
        Store the article and its metadata on the instance.

        A missing language, title or article is only logged; the instance
        is still created with whatever values were passed.
        """
        if not (language and title and article):
            logger.error('Method malformed!')

        self._language, self._title, self._article = language, title, article
        self._link, self._blog, self._category = link, blog, category
Beispiel #37
0
def _clean_zombies():
    """
    Kill zombie processes by delegating to clean_process.clean().

    Usually run semi-daily or quasi-daily. Returns True when the cleanup
    call finishes, False when it raises (the error is logged).
    """
    logger.info('-----------------killing zombies--------------------')
    try:
        clean_process.clean()
    except Exception as err:
        logger.error(str(err))
        return False
    else:
        return True
Beispiel #38
0
def _clean_zombies():
    """
    Kill zombie processes by delegating to clean_process.clean().

    Usually run semi-daily or quasi-daily. Returns True when the cleanup
    call finishes, False when it raises (the error is logged).
    """
    logger.info('-----------------killing zombies--------------------')
    try:
        clean_process.clean()
    except Exception as err:
        logger.error(str(err))
        return False
    else:
        return True
Beispiel #39
0
def _download(language='en',
              query='Service provided by Baidu',
              tmp_file='do_not_exist.mp3'):
    """
    fetch the speech audio for `query` and append it to `tmp_file`

    The query is split by _query_segment(); every non-empty segment is
    requested through its own GoogleTranslateAPI thread and the results
    are written to the file in segment order.

    Returns tmp_file when every segment produced a result. Returns None
    when the query yields no segments, when a segment has no result, or
    when an error occurs; in the last two cases tmp_file is removed.

    other ways to write _download
    1. https://github.com/hungtruong/Google-Translate-TTS/blob/master
    /GoogleTTS.py
    2. https://github.com/gavinmh/tts-api/blob/master/text_segmenter.py
    """

    try:
        # break a long sentence/paragraph into google-acceptable length
        segments = _query_segment(language, query)
        if not segments:  # nothing generated from the query
            logger.error('Nothing generated from the query')
            return None

        # each request is started and then waited for (at most
        # GOOGLE_TTS_TIMEOUT) before the next one is issued
        threads = []
        for segment in segments:
            if segment:
                logger.info('... Transmitting "%s"' % segment)
                gt_request = GoogleTranslateAPI(language, segment)
                threads.append(gt_request)
                gt_request.start()
                gt_request.join(GOOGLE_TTS_TIMEOUT)

        # write the chunks to the output file; binary append because the
        # chunks are presumably raw mp3 data, and `with` guarantees the
        # handle is closed even when a write fails
        download_completed = True
        with open(tmp_file, 'ab') as out:
            for th in threads:
                sys.stdout.flush()
                if th.result:
                    out.write(th.result)
                else:
                    download_completed = False
                    break

        if download_completed:
            return tmp_file

        logger.info('Download not completed, now removing the file')
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        return None
    except Exception as k:
        logger.error(
            'Part of tts dowload went wrong, now removing the file: %s' %
            str(k))
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        return None
Beispiel #40
0
 def _get_title(self):
     try:
         title = ''
         try:
             title = self.html.find('title').text
         except:
             pass
         return title
     except Exception as k:
         logger.error(str(k))
         return None
Beispiel #41
0
def _collect_images(content, referer):
    """
    find all images from the content

    Returns None (after logging) when content is empty. Otherwise returns
    the pair (images, content) from illustrator.find_images(); the content
    it hands back is used when it is non-empty and differs from the input,
    else the input content is returned unchanged.
    """
    if not content:
        logger.error('Content/HTML found VOID!')
        return None

    images, rewritten = illustrator.find_images(content, referer)
    use_rewritten = rewritten and rewritten != content
    return images, (rewritten if use_rewritten else content)
Beispiel #42
0
def _download(language='en', query='Service provided by Baidu',
              tmp_file='do_not_exist.mp3'):
    """
    download the spoken version of `query` and append it to `tmp_file`

    _query_segment() cuts the query into pieces; each non-empty piece is
    fetched by a GoogleTranslateAPI thread and the fetched results are
    written to tmp_file in the order of the pieces.

    Returns tmp_file if all pieces were fetched; None if there were no
    pieces, if a piece came back without a result, or if anything raised
    (tmp_file is deleted in the latter two cases).

    other ways to write _download
    1. https://github.com/hungtruong/Google-Translate-TTS/blob/master
    /GoogleTTS.py
    2. https://github.com/gavinmh/tts-api/blob/master/text_segmenter.py
    """

    try:
        # break a long sentence/paragraph into google-acceptable length
        segments = _query_segment(language, query)
        if not segments:  # nothing generated from the query
            logger.error('Nothing generated from the query')
            return None

        # requests run one after another: start, then wait up to
        # GOOGLE_TTS_TIMEOUT before moving on to the next segment
        threads = []
        for segment in segments:
            if not segment:
                continue
            logger.info('... Transmitting "%s"' % segment)
            gt_request = GoogleTranslateAPI(language, segment)
            threads.append(gt_request)
            gt_request.start()
            gt_request.join(GOOGLE_TTS_TIMEOUT)

        # 'ab': the chunks are presumably raw audio, so no text-mode
        # translation must happen; the with-block closes the file before
        # it is possibly removed below, also when a write raises
        download_completed = True
        with open(tmp_file, 'ab') as out:
            for th in threads:
                sys.stdout.flush()
                if not th.result:
                    download_completed = False
                    break
                out.write(th.result)

        if download_completed:
            return tmp_file

        logger.info('Download not completed, now removing the file')
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        return None
    except Exception as k:
        logger.error(
            'Part of tts dowload went wrong, now removing the file: %s' % str(
                k))
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        return None
Beispiel #43
0
def _collect_images(content, referer):
    """
    find all images from the content

    An empty content is logged and answered with None. For anything else
    illustrator.find_images() is consulted and (images, content) is
    returned, where content is the one given back by find_images if that
    is non-empty and not equal to the input, and the input otherwise.
    """
    if not content:
        logger.error('Content/HTML found VOID!')
        return None

    found, updated = illustrator.find_images(content, referer)
    if not updated or updated == content:
        return found, content
    return found, updated
Beispiel #44
0
 def _set_background(self):
     """
     set the image background

     Opens home_bg.png from DATA_PATH as self._image and creates the
     drawing object self._draw on it. Always returns None; a failure is
     only logged.
     """
     try:
         # an earlier variant (disabled) created a blank RGB canvas of
         # CATEGORY_IMAGE_SIZE filled with self._background_color instead
         background_path = "%s/home_bg.png" % DATA_PATH
         self._image = Image.open(background_path)
         self._draw = ImageDraw.Draw(self._image)
     except Exception as k:
         logger.error(str(k))
         return None
Beispiel #45
0
def update(entry=None, expiration=None):
    """
    add news and its attributes to memory

    The entry is cut down to the fields named in field_list, stored in
    redis under its '_id' with an expiration, and its id is added to the
    sorted sets of its feed and of every label registered for that feed
    (scored by the entry's 'updated' value).

    expiration is given in seconds; when empty, MEMORY_EXPIRATION_DAYS
    (converted to seconds) is used.

    Returns True on success, False when entry is empty, redis is not
    reachable or any other error occurs.
    """
    if not entry:
        logger.error('Method malformed!')
        return False

    try:
        # check if redis in alive
        rclient.ping()

        # simplify fields in entry to ones in field_list
        entry_reduced = dict(
            (field, entry[field]) for field in entry if field in field_list)

        # add a piece of news into memory
        news_id = entry_reduced['_id']
        rclient.set(news_id, entry_reduced)

        # expired in redis is counted in seconds
        if not expiration:
            expiration = MEMORY_EXPIRATION_DAYS * 24 * 60 * 60
        rclient.expire(news_id, expiration)

        # add entry ids to the RSS list
        language = entry_reduced['language']
        feed = entry_reduced['feed']
        updated = entry_reduced['updated']
        rclient.zadd("news::%s::%s" % (language, feed), updated, news_id)

        # add entry ids to the label list
        col = Collection(db, FEED_REGISTRAR)
        item = col.find_one({'feed_title': feed}, {'labels': 1})
        labels = item['labels'] if item and 'labels' in item else None
        if labels:
            for label in labels:
                # a label is a combination of country, category and label
                rclient.zadd(
                    'news::%s::%s' % (language, label), updated, news_id)
        # final return
        return True
    except ConnectionError:
        logger.critical('Redis is down!')
        return False
    except Exception as k:
        logger.error(str(k))
        return False
Beispiel #46
0
 def _get_short_title(self):
     try:
         title = ''
         try:
             orig = self.html.find('title').text
             segmenter = tinysegmenter.TinySegmenter()
             # remove unnecessary parts
             for delimiter in [' | ', ' - ', ' :: ', ' / ']:
                 if delimiter in orig:
                     parts = orig.split(delimiter)
                     if self.language.startswith(
                             'zh') or self.language == 'ja':
                         words_head = segmenter.tokenize(unicode(parts[0]))
                         words_tail = segmenter.tokenize(unicode(parts[-1]))
                         if len(words_head) >= 4:
                             title = parts[0]
                             break
                         elif len(words_tail) >= 4:
                             title = parts[-1]
                             break
                     else:
                         if len(parts[0].split()) >= 4:
                             title = parts[0]
                             break
                         elif len(parts[-1].split()) >= 4:
                             title = parts[-1]
                             break
             if not title:
                 orig = title
             if ': ' in orig:
                 parts = orig.split(': ')
                 if self.language.startswith('zh') or self.language == 'ja':
                     words_tail = segmenter.tokenize(unicode(parts[-1]))
                     if len(words_tail) >= 4:
                         title = parts[-1]
                     else:
                         title = orig.split(': ', 1)[1]
                 else:
                     if len(parts[-1].split()) >= 4:
                         title = parts[-1]
                     else:
                         title = orig.split(': ', 1)[1]
             if not title:
                 return orig
         except:
             pass
         return title
     except Exception as k:
         logger.error(str(k))
         return None
Beispiel #47
0
def convert(link):
    """
    transcode `link` through _extract() into title, content and images

    Returns the triple (title, content, images). An empty link, or any
    error raised while extracting, is logged and answered with
    (None, None, None).
    """
    failure = (None, None, None)
    if not link:
        logger.error('Cannot transcode nothing!')
        return failure

    try:
        title, content, images = _extract(link)
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return failure
    return title, content, images
Beispiel #48
0
    def _clean_article(self, content):
        """
        strip unwanted markup from `content` and render it to a string

        The cleaning helpers are applied to content in a fixed order,
        image and link paths are fixed, and the result is rendered as
        utf-8. When the cleaned content holds no <img> but
        self.article_image is set, the rendered article image is put in
        front. Finally every match of the 'kill_breaks' pattern is
        replaced by "<br />".

        Returns the rendered string, or None (after logging) on error.
        """
        try:
            self._clean_comments(content)
            # remove unwanted parts, by dirty hand
            self._clean_extra_parts(content)
            for tag in ('h1', 'blockquote', 'object'):
                self._clean(content, tag)
            self._clean_conditionally(content, "form")

            # an <h2> is only removed when it is the single one
            if len(content.findAll('h2')) == 1:
                self._clean(content, 'h2')

            self._clean(content, 'iframe')

            for tag in ("table", "ul", "div"):
                self._clean_conditionally(content, tag)
            self._clean_style(content)

            self._fix_images_path(content)
            self._fix_links_path(content)

            rendered = content.renderContents(encoding='utf-8')
            if not content.find('img') and self.article_image:
                image_markup = self.article_image.renderContents(
                    encoding='utf-8')
                # if <img> is rendered to None
                if not image_markup:
                    image_parent = self.article_image.parent
                    image_markup = image_parent.renderContents(
                        encoding='utf-8')
                rendered = image_markup + rendered

            return self.regexps['kill_breaks'].sub("<br />", rendered)
        except Exception as k:
            logger.error(str(k))
            return None
Beispiel #49
0
 def _get_images(self):
     try:
         if self.content:
             # find_images normalizes images afterwards
             images, content_new = illustrator.find_images(
                 self.content, self.url)
             if content_new and content_new != self.content:
                 self.content = content_new
             return images
         else:
             return None
     except Exception as k:
         logger.error(str(k))
         return None
Beispiel #50
0
def _clean_data():
    """
    clean memory, database and files, usually run daily

    Runs clean_database.clean(), clean_memory.clean() and
    clean_disk.clean() in that order; all three are attempted even if an
    earlier one reports failure. Returns True only when every one of
    them returned a true value; False otherwise or when one raised.
    """
    logger.info('----------------------cleaning-------------------------')
    try:
        failures = 0
        # clean database
        if not clean_database.clean():
            logger.error('Error found cleaning database')
            failures += 1
        # clean memory
        if not clean_memory.clean():
            logger.error('Error found cleaning memory')
            failures += 1
        # clean disk
        if not clean_disk.clean():
            logger.error('Error found cleaning disk')
            failures += 1

        if failures:
            return False
        logger.info('Memory, Database & Disk got cleaned!')
        return True
    except Exception as k:
        logger.error(str(k))
        return False
Beispiel #51
0
def _preprocess(url):
    """
    get the real address out
    """
    if not url:
        logger.error('Method malformed!')
        return None

    try:
        last_http_index = url.rfind('http')
        return url[last_http_index:].strip()
    except Exception as k:
        logger.error(str(k))
        return None
Beispiel #52
0
def update(entry=None, expiration=None):
    """
    add news and its attributes to memory

    Steps: keep only the fields of entry that appear in field_list, store
    the reduced entry in redis under its '_id', set its expiration
    (seconds; MEMORY_EXPIRATION_DAYS converted to seconds when none is
    given), then register the id, scored by 'updated', in the sorted set
    of the entry's feed and of each label found for that feed in the
    FEED_REGISTRAR collection.

    Returns True when all steps succeeded; False for an empty entry, a
    redis connection error or any other exception.
    """
    if not entry:
        logger.error('Method malformed!')
        return False

    try:
        # check if redis in alive
        rclient.ping()

        # simplify fields in entry to ones in field_list
        entry_reduced = dict(
            (name, entry[name]) for name in entry if name in field_list)

        # add a piece of news into memory
        entry_id = entry_reduced['_id']
        rclient.set(entry_id, entry_reduced)

        # expired in redis is counted in seconds
        if not expiration:
            expiration = MEMORY_EXPIRATION_DAYS * 24 * 60 * 60
        rclient.expire(entry_id, expiration)

        # add entry ids to the RSS list
        language = entry_reduced['language']
        feed = entry_reduced['feed']
        score = entry_reduced['updated']
        feed_key = "news::%s::%s" % (language, feed)
        rclient.zadd(feed_key, score, entry_id)

        # add entry ids to the label list
        col = Collection(db, FEED_REGISTRAR)
        item = col.find_one({'feed_title': feed}, {'labels': 1})
        if not item or 'labels' not in item or not item['labels']:
            return True
        for label in item['labels']:
            # a label is a combination of country, category and label
            label_key = 'news::%s::%s' % (language, label)
            rclient.zadd(label_key, score, entry_id)
        # final return
        return True
    except ConnectionError:
        logger.critical('Redis is down!')
        return False
    except Exception as k:
        logger.error(str(k))
        return False