def publish_toutiao():
    client = db.get_redis_client(config.get('app.redis'))
    published_key = 'published_articles'
    publisher = Toutiao()
    logger.info('Start toutiao publish-processing...')
    article_str = client.lpop('fetched_article')
    while article_str and len(article_str) > 0:
        logger.info('Fetched article str from redis')
        try:
            if isinstance(article_str, bytes):
                article_str = article_str.decode('utf-8')
            article_json = json.loads(article_str)
            article = Article()
            article.rebuild(article_json)
            title = repr(article.title)
            logger.info('Pre-publish article [%s]' % title)
            # skip anything whose title is already in the published set
            if article and (not client.sismember(published_key, title)):
                publisher.publish(article)
                client.sadd(published_key, title)
            else:
                logger.error('Pre-publish article [%s] skipped: already published' % title)
        except Exception as e:
            logger.error('JSON loads article error')
            logger.error(e)
        finally:
            article_str = client.lpop('fetched_article')
def publish_toutiao(event, context):
    # serverless entry point: imports live inside the handler
    import json
    import hashlib
    from publisher.toutiao.publisher import ToutiaoPublisher
    from core import logger, config, db
    from entities import Article

    client = db.get_redis_client(config.get('app.redis'))
    published_key = 'published_articles'
    publisher = ToutiaoPublisher()
    logger.info('Start toutiao publish-processing...')
    article_str = client.lpop('fetched_article')
    while article_str and len(article_str) > 0:
        logger.info('Fetched article str from redis')
        try:
            if isinstance(article_str, bytes):
                article_str = article_str.decode('utf-8')
            article_json = json.loads(article_str)
            article = Article()
            article.rebuild(article_json)
            title = repr(article.title)
            hashed_title = hashlib.md5(title.encode('utf8')).hexdigest().upper()
            logger.info('Pre-publish article [%s] hash value [%s]' % (title, hashed_title))
            # check the same md5 value that sadd stores below
            if article and (not client.sismember(published_key, hashed_title)):
                publisher.publish(article)
                client.sadd(published_key, hashed_title)
            else:
                logger.error('Pre-publish article [%s] skipped: already published' % title)
        except Exception as e:
            logger.error('JSON loads article error')
            logger.error(e)
        finally:
            article_str = client.lpop('fetched_article')
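# The dedup above only works if sismember checks the exact value that sadd
# stored. A minimal, self-contained sketch of that pattern (assuming a local
# Redis on the default port; seen_before is my name, not from the source):
import hashlib

import redis


def seen_before(client: redis.Redis, key: str, title: str) -> bool:
    """Return True if this title's md5 is already in the dedup set,
    otherwise record it and return False."""
    digest = hashlib.md5(title.encode('utf8')).hexdigest().upper()
    if client.sismember(key, digest):
        return True
    client.sadd(key, digest)
    return False


client = redis.Redis()  # assumes Redis on localhost:6379
print(seen_before(client, 'published_articles', 'Hello'))  # False on first call
print(seen_before(client, 'published_articles', 'Hello'))  # True afterwards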
def estimate(self, article: Article):
    try:
        # summarize() returns the key sentences; join them into abstract_str
        article.summary = summarize(article)
        for sen in article.summary:
            article.abstract_str += str(sen)
        # score the article by its repetition rate on Baidu search results
        article.score = baidu_repetition_rate(article)
    except Exception as e:
        logger.error('Estimate article error: [%s]' % article.title)
        logger.error(e)
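# estimate() assumes abstract_str starts out as a string it can append to and
# that summarize() yields an iterable of sentences. A minimal stand-in with
# the fields these snippets touch (field names come from the snippets; the
# constructor signature and defaults are assumptions of mine):
class Article:
    def __init__(self, title='', text='', html=''):
        self.title = title        # plain-text title
        self.text = text          # extracted plain text
        self.html = html          # raw HTML body
        self.summary = []         # filled by summarize()
        self.abstract_str = ''    # concatenated summary sentences
        self.score = 0.0          # repetition/quality score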
def transformer(self, article: Article):
    '''
    Purify the article HTML before publishing: re-host images, strip
    outbound links, normalize the title length, and prepend the summary.
    :param article: the article to transform in place
    '''
    soup = BeautifulSoup(article.html, 'html.parser')
    # purify images: re-host each image and rewrite its src
    for img in soup.find_all('img'):
        try:
            if 'src' in img.attrs:
                img['src'] = uploader.upload(img['src'])
        except Exception as e:
            logger.error(e)
            continue
    # remove links: drop href so anchors degrade to plain text
    for a in soup.find_all('a'):
        try:
            if 'href' in a.attrs:
                del a['href']
        except Exception as e:
            logger.error(e)
            continue
    # clamp the title length to 5-30 units:
    # a CJK character counts as one unit, two latin letters count as one unit
    alpha_num = 1
    word_num = 0
    for x in range(len(article.title)):
        if alpha_num / 2 + word_num >= 28:
            article.title = article.title[:x]
            break
        if article.title[x] in punctuation or (u'\u4e00' <= article.title[x] <= u'\u9fa5'):
            word_num += 1
        else:
            alpha_num += 1
    if alpha_num / 2 + word_num < 5:
        article.title = '技术专栏-' + article.title
    # prepend the generated summary as a lead-in block
    summ = ''
    if article.abstract_str:
        summ = '<h1>内容导读</h1><blockquote><p>%s</p></blockquote>' % str(article.abstract_str)
    article.html = summ + str(soup)
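# The title-length rule is easier to verify in isolation. A sketch of the same
# counting scheme as a pure helper (title_units is my name; it assumes the
# unshown punctuation import is string.punctuation, and it ignores the
# alpha_num = 1 initialization bias of the loop above):
from string import punctuation


def title_units(title: str) -> float:
    """One unit per CJK char or punctuation mark, half a unit per
    other character."""
    units = 0.0
    for ch in title:
        if ch in punctuation or u'\u4e00' <= ch <= u'\u9fa5':
            units += 1
        else:
            units += 0.5
    return units


assert title_units('abcd') == 2.0      # four latin letters: two units
assert title_units('深度学习') == 4.0  # four CJK chars: four units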
def fetch_article_from_url(self, url):
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, 'html.parser')
    article = soup.find('div', class_='article')
    title = article.h1.string
    content = article.find('div', class_='show-content')
    # the source site lazy-loads images; restore the real URL
    # from the data-original-src attribute
    for img in content.find_all('img'):
        if 'data-original-src' in img.attrs:
            img['src'] = 'http:' + img['data-original-src']
    return Article(title, content.get_text(), str(content))
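# On the producer side, the fetched Article has to land in the
# 'fetched_article' Redis list that publish_toutiao() drains with lpop. A
# sketch of that hand-off; the payload keys are assumptions of mine (whatever
# Article.rebuild() actually expects), and rpush keeps FIFO order against lpop:
import json

import redis

client = redis.Redis()  # assumes the same Redis instance the publisher uses


def enqueue_article(article) -> None:
    """Serialize an Article to JSON and push it onto the publish queue."""
    payload = json.dumps({
        'title': article.title,  # attribute/key names are assumptions
        'text': article.text,
        'html': article.html,
    }, ensure_ascii=False)
    client.rpush('fetched_article', payload)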