Example #1
0
def publish_toutiao():
    """Drain the ``fetched_article`` Redis list, publishing unseen articles.

    Each popped entry is decoded when it arrives as bytes, parsed as JSON and
    rebuilt into an ``Article``. ``repr(article.title)`` is the member looked
    up in, and added to, the ``published_articles`` Redis set that marks an
    article as already published. An exception raised while handling one
    entry is logged and the loop carries on with the next entry.
    """
    redis_conn = db.get_redis_client(config.get('app.redis'))
    seen_set = 'published_articles'
    toutiao = Toutiao()
    logger.info('Start toutiao publish-processing...')

    payload = redis_conn.lpop('fetched_article')
    while payload:
        logger.info('Fetched article str from redis')
        try:
            text = payload.decode() if isinstance(payload, bytes) else payload
            parsed = json.loads(text)
            article = Article()
            article.rebuild(parsed)
            title = repr(article.title)
            logger.info('Pre-publish article [%s]' % title)
            if not article or redis_conn.sismember(seen_set, title):
                logger.error(
                    'Pre-publish article [%s] error, due to published before' %
                    title)
            else:
                toutiao.publish(article)
                # Mark as published only after publish() returned normally.
                redis_conn.sadd(seen_set, title)
        except Exception as err:
            logger.error('Pickle loads article error')
            logger.error(err)
        finally:
            # Always advance to the next queued entry, even after a failure.
            payload = redis_conn.lpop('fetched_article')
Example #2
0
def publish_toutiao(event, context):
    """Publish every queued article to Toutiao, skipping ones already published.

    Entries are popped from the ``fetched_article`` Redis list until it is
    empty. Each entry is decoded when it arrives as bytes, parsed as JSON and
    rebuilt into an ``Article``. The upper-case MD5 hex digest of
    ``repr(article.title)`` is the member checked against, and added to, the
    ``published_articles`` Redis set. An exception raised while handling one
    entry is logged and the loop carries on with the next entry.

    :param event: invocation payload; not read by this function.
    :param context: invocation context; not read by this function.
    """
    import json
    from publisher.toutiao.publisher import ToutiaoPublisher
    from core import logger, config, db
    from entities import Article
    import hashlib

    client = db.get_redis_client(config.get('app.redis'))
    published_key = 'published_articles'
    publisher = ToutiaoPublisher()
    logger.info('Start toutiao publish-processing...')
    while True:
        # Pop at the top of the loop rather than in a ``finally`` clause: a
        # ``finally`` pop also runs while KeyboardInterrupt/SystemExit is
        # propagating, which would remove and discard the next queued entry.
        article_str = client.lpop('fetched_article')
        if not article_str:
            break
        logger.info('Fetched article str from redis')
        try:
            if isinstance(article_str, bytes):
                article_str = article_str.decode()
            article_json = json.loads(article_str)
            article = Article()
            article.rebuild(article_json)
            title = repr(article.title)
            hashed_title = hashlib.md5(title.encode('utf8')).hexdigest().upper()
            logger.info('Pre-publish article [%s] hash value [%s]' % (title, hashed_title))
            # Look up the same value that sadd() stores below. The builtin
            # hash(title) is an int and can never equal the stored MD5 hex
            # string, so checking it would let every duplicate through.
            if article and (not client.sismember(published_key, hashed_title)):
                publisher.publish(article)
                client.sadd(published_key, hashed_title)
            else:
                logger.error('Pre-publish article [%s] error, due to published before' % title)
        except Exception as e:
            logger.error('Pickle loads article error')
            logger.error(e)
Example #3
0
 def estimate(self, article: Article):
     """Fill in the summary, abstract text and score of ``article`` in place.

     ``summarize(article)`` is stored as ``article.summary``, the string form
     of every summary item is appended to ``article.abstract_str``, and
     ``baidu_repetition_rate(article)`` is stored as ``article.score``.
     Failures are logged instead of raised, so attributes assigned before
     the failing step keep their new values.

     :param article: the article to update.
     """
     try:
         article.summary = summarize(article)
         pieces = [str(sen) for sen in article.summary]
         # Leave abstract_str untouched when there is nothing to append.
         if pieces:
             article.abstract_str += ''.join(pieces)
         article.score = baidu_repetition_rate(article)
     except Exception as e:
         logger.error('Estimate article error: [%s]' % article.title)
         # Log the cause too; the title alone does not say what went wrong.
         logger.error(e)
Example #4
0
    def transformer(self, article: Article):
        '''
        Prepare ``article`` for publishing; ``article.title`` and
        ``article.html`` are updated in place.

        Steps, in order: the ``src`` of every ``<img>`` is replaced by the
        result of ``uploader.upload``; ``href`` is removed from every ``<a>``;
        the title is cut once its display length reaches 28 units, or
        prefixed when that length is below 5; and an abstract block is
        prepended to the HTML when ``article.abstract_str`` is non-empty.

        :param article: the article to transform.
        :return: None
        '''
        # NOTE(review): no parser is named here, so Beautiful Soup picks one
        # from whatever is installed; left as-is because naming a parser can
        # change the serialized markup.
        soup = BeautifulSoup(article.html)

        # Re-host images; a failed upload is logged and that tag is left as is.
        for img in soup.find_all('img'):
            try:
                if 'src' in img.attrs:
                    img['src'] = uploader.upload(img['src'])
            except Exception as e:
                logger.error(e)

        # Strip link targets while keeping the anchor text.
        for a in soup.find_all('a'):
            try:
                if 'href' in a.attrs:
                    del a['href']
            except Exception as e:
                logger.error(e)

        # Keep the title display length within 5-30 units: a CJK ideograph or
        # a character found in ``punctuation`` counts as one unit, two other
        # characters count as one unit.
        # NOTE(review): alpha_num starts at 1 rather than 0; kept unchanged.
        alpha_num = 1
        word_num = 0
        for x, ch in enumerate(article.title):
            if alpha_num / 2 + word_num >= 28:
                article.title = article.title[:x]
                # Stop here: indexing the shortened title at x would raise
                # IndexError.
                break
            # The range bounds must be the escapes \u4e00 and \u9fa5; written
            # with a forward slash they are six-character strings that no
            # single character ever falls between.
            if ch in punctuation or '\u4e00' <= ch <= '\u9fa5':
                word_num += 1
            else:
                alpha_num += 1
        if alpha_num / 2 + word_num < 5:
            article.title = '技术专栏-' + article.title

        # Prepend the abstract block when an abstract is available.
        summ = ''
        if article.abstract_str:
            summ = '<h1>内容导读</h1><blockquote><p>%s</p></blockquote>' % str(
                article.abstract_str)
        article.html = summ + str(soup)
Example #5
0
 def fetch_article_from_url(self, url):
     """Download ``url`` and build an ``Article`` from its article container.

     The title is the string of the ``<h1>`` inside ``div.article`` and the
     body is ``div.show-content``. Every image in the body that carries a
     ``data-original-src`` attribute gets ``src`` set to that value prefixed
     with ``http:``.

     :param url: address of the page to fetch.
     :return: ``Article(title, body text, body HTML)``.
     """
     response = requests.get(url, headers=headers)
     page = BeautifulSoup(response.text)
     container = page.find('div', class_='article')
     heading = container.h1.string
     body = container.find('div', class_='show-content')
     lazy_images = (
         tag for tag in body.find_all('img')
         if 'data-original-src' in tag.attrs
     )
     for tag in lazy_images:
         tag['src'] = 'http:' + tag['data-original-src']
     plain_text = body.get_text()
     return Article(heading, plain_text, str(body))