def process_articles(articles, news_provider, campaign_instance):
    """Analyze, store and publish a batch of News API articles.

    Articles without a description are skipped. For every other article:

    1. If its URL is not yet in the database, sentiment analysis is run on
       the content (falling back to the description) and a record is
       inserted.
    2. Entities are processed when ``settings.PROCESS_ENTITIES`` is set.
    3. The title is translated when the campaign enables translation.
    4. The article is added to the email report and/or queued as a tweet,
       depending on the campaign flags.

    The report and the queued tweets are sent once, after the loop.

    :param articles: Mapping whose values are article objects (the keys are
        not used).
    :param news_provider: Provider name, used in the report subject.
    :param campaign_instance: Campaign object providing the flags read here
        (``send_report``, ``twitter``, ``translation_enable``, ...).
    :return: None
    """
    # Initialised up front so the hashtag step never hits an unbound name
    # when entity processing is disabled.
    entities = None
    translated_text = None
    tweets = []
    num_of_articles = len(articles)
    campaign_instance.set_articles(num_of_articles)
    report = Report.get_report(subject='News ML | %s' % news_provider)
    log.info('Analyzing %d articles...', num_of_articles)
    if num_of_articles < 1:
        if campaign_instance.send_report:
            log.warning('Skipping report via email...')
        log.error('No titles found')
        return

    # Create Report instance and attach recipients.
    log.info('Translation enabled: %s', campaign_instance.translation_enable)
    log.info('Email reporting enabled: %s', campaign_instance.send_report)
    log.info('Twitter enabled: %s', campaign_instance.twitter)
    log.info('Twitter image extraction: %s', settings.EXTRACT_TWITTER_IMAGE)
    log.info('Twitter add hash tags: %s', settings.TWITTER_ADD_HASHTAGS)
    if campaign_instance.send_report:
        report.email_recipients = campaign_instance.email_recipients

    for article in articles.values():
        new_article = False
        # Reset for every article so an already-stored article never
        # inherits the database id of a previously inserted one.
        news_id = None
        if not article.description:
            log.error('Not description found in article: %s', article.url)
            continue

        if settings.EXTRACT_TWITTER_IMAGE:
            # meta:twitter:image
            article.twitter_image = twitter_utils.get_twitter_element(
                article.url, 'twitter:image')
        log.info('Article: %s, [%s], [%s]', article.title, article.url,
                 article.twitter_image)

        if not DbHelper.record_exists(article.url):
            log.info('New Article retrieved: %r, %r', article.title,
                     article.url)
            try:
                log.info('Processing sentiment analysis')
                score, magnitude = nlp_utils.get_sentiment_scores(
                    article.content or article.description)
                log.info('Insert article into Database')
                news_id = DbHelper.insert_news(
                    title=article.title,
                    author=article.author,
                    description=article.description,
                    content=article.content,
                    url=article.url,
                    url_to_image=article.url_to_image,
                    source_id=article.source_id,
                    source=article.source,
                    campaign=campaign_instance.reference,
                    published_at=article.published_at,
                    score=score,
                    magnitude=magnitude,
                    sentiment=nlp_utils.get_sentiment(score))
                if not news_id:
                    log.error('Unable to insert record %s', article.url)
                    continue
            except (ValueError, UnicodeDecodeError) as exception:
                # NOTE(review): the article is still handled as new with
                # news_id=None after this error - confirm that best-effort
                # behaviour is intended.
                log.exception(exception)
            new_article = True
            if settings.PROCESS_ENTITIES:
                entities = common_utils.process_entities(
                    article, news_id, True)
        else:
            log.warning('Article %r already exists ', article.url)
            if settings.PROCESS_ENTITIES:
                entities = common_utils.process_entities(
                    article, news_id, False)

        if campaign_instance.translation_enable:
            translated_text = translate_utils.translate_article(
                campaign_instance, article, new_article, news_id)
            # Guard against a None result before measuring its length.
            if translated_text and len(translated_text) > 1:
                log.info('Adding translated content to report.')
                article.title = translated_text
            else:
                log.error('Translated text is empty.')

        if campaign_instance.send_report:
            # Only send today articles in Report.
            today = datetime.now().date()
            published_at = datetime.strptime(article.published_at[:10],
                                             '%Y-%m-%d').date()
            if settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Publishing all dates articles')
            log.info('Today: %s Report date: %s. ', today, published_at)
            if today == published_at or settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Adding article information to Report: %s %s',
                         article.title, article.url)
                report.add_content(article.url, article.title,
                                   article.twitter_image)
            else:
                log.warning(
                    'Article published date is not today (%s), '
                    'skipping article from Report', published_at)

        # Handle Twitter
        if campaign_instance.twitter:
            # article.title already holds the translation whenever a usable
            # one was produced, so a failed translation falls back to the
            # original title instead of tweeting "None <url>".
            tweet_text = article.title
            if settings.TWITTER_ADD_HASHTAGS:
                # TODO (gogasca) Find Twitter handlers
                tweet_text = twitter_utils.add_hash_tags(tweet_text,
                                                         entities)
            tweets.append('{} {}'.format(tweet_text, article.url))

    if campaign_instance.send_report:
        log.info('Sending report via email...')
        report.send()

    if campaign_instance.twitter:
        log.info('Sending Tweets')
        twitter_utils.send_tweets(tweets, campaign_instance.twitter_delay)
    log.info('Extraction completed')
def launch(campaign_instance=None):
    """Run the Techmeme campaign: extract, analyze, store and publish.

    The logic is as follows:
    1. Extract articles with ``extract_articles(settings.TECHMEME_URL)``,
       optionally truncated to ``campaign_instance.limit`` entries.
    2. Skip articles without a title.
    3. For articles whose URL is not in the database, compute sentiment
       scores from the content and insert a record.
    4. Process entities when ``settings.PROCESS_ENTITIES`` is set.
    5. Translate the article when the campaign enables translation.
    6. Add the article to the email report and/or queue a tweet, then send
       the report and the tweets once all articles were handled.

    :param campaign_instance: Campaign object providing the flags read here
        (``send_report``, ``twitter``, ``translation_enable``, ``limit``,
        ...). Required in practice: the ``None`` default is dereferenced
        immediately and raises ``AttributeError``.
    :return: None
    """
    entities = None
    translated_text = None
    tweets = []
    report = Report.get_report(subject=settings.TECHMEME_REPORT)

    # Create Report instance and attach recipients.
    log.info('Translation enabled: %s', campaign_instance.translation_enable)
    log.info('Email reporting enabled: %s', campaign_instance.send_report)
    log.info('Twitter enabled: %s', campaign_instance.twitter)
    log.info('Twitter image extraction: %s', settings.EXTRACT_TWITTER_IMAGE)
    log.info('Twitter add hash tags: %s', settings.TWITTER_ADD_HASHTAGS)

    articles = extract_articles(settings.TECHMEME_URL)
    if not articles:
        log.error('No articles found.')
        # Normalise a None/empty result so the limit slicing below cannot
        # fail on ``None.items()`` before the early return is reached.
        articles = {}
    num_of_articles = len(articles)
    log.info('Retrieving %d articles...', num_of_articles)

    if campaign_instance.limit > 0:
        log.warning('Limit is defined. Skipping other news')
        articles = dict(
            itertools.islice(articles.items(), campaign_instance.limit))
        num_of_articles = len(articles)

    if num_of_articles < 1:
        log.error('No articles found')
        if campaign_instance.send_report:
            log.warning('Skipping report via email...')
        return

    log.info('Processing %d articles...', num_of_articles)
    campaign_instance.set_articles(num_of_articles)
    if campaign_instance.send_report:
        report.email_recipients = campaign_instance.email_recipients

    for article in articles.values():
        new_article = False
        # Reset for every article so an already-stored article never
        # inherits the database id of a previously inserted one.
        news_id = None
        if not article.title:
            log.warning('No title found. Article won\'t be inserted')
            continue

        if settings.EXTRACT_TWITTER_IMAGE:
            # meta:twitter:image
            article.twitter_image = twitter_utils.get_twitter_element(
                article.url, 'twitter:image')
        log.info('Article: %s, [%s], [%s]', article.title, article.url,
                 article.twitter_image)

        if not DbHelper.record_exists(article.url):
            log.info('New Article retrieved: %r, %r', article.title,
                     article.url)
            try:
                log.info('Processing sentiment analysis')
                score, magnitude = nlp_utils.get_sentiment_scores(
                    article.content)
                source = url_extract.get_domain(article.url) or ''
                log.info('Insert article into Database')
                news_id = DbHelper.insert_news(
                    title=article.title,
                    content=article.content,
                    url=article.url,
                    provider=settings.TECHMEME,
                    source=source.upper(),
                    source_id=source,
                    campaign=campaign_instance.reference,
                    score=score,
                    magnitude=magnitude,
                    sentiment=nlp_utils.get_sentiment(score))
                if not news_id:
                    log.error('Unable to insert record %s', article.url)
                    continue
            except (ValueError, UnicodeDecodeError) as e:
                # NOTE(review): the article is still handled as new with
                # news_id=None after this error - confirm that best-effort
                # behaviour is intended.
                log.exception(e)
            new_article = True
            if settings.PROCESS_ENTITIES:
                entities = common_utils.process_entities(article, news_id,
                                                         True)
        else:
            log.warning('Article already exists.')
            if settings.PROCESS_ENTITIES:
                entities = common_utils.process_entities(article, news_id,
                                                         False)

        if campaign_instance.translation_enable:
            translated_text = translate_utils.translate_article(
                campaign_instance, article, new_article, news_id)
            if translated_text:
                log.info('Adding translated content to report.')
                article.title = translated_text

        if campaign_instance.send_report:
            # Only send articles created 'today' in Report.
            today = datetime.now().date()
            # NOTE(review): the format implies published_at is a yymmdd
            # string - confirm against extract_articles.
            published_at = datetime.strptime(article.published_at,
                                             '%y%m%d').date()
            if settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Publishing all dates articles')
            log.info('Today: %s Report date: %s. ', today, published_at)
            if today == published_at or settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Adding article information to Report: %s %s',
                         article.title, article.url)
                report.add_content(article.url, article.title,
                                   article.twitter_image)
            else:
                log.warning(
                    'Article published date is not today (%s), '
                    'skipping article from Report', published_at)

        if campaign_instance.twitter:
            # article.title already holds the translation whenever one was
            # produced, so a failed translation falls back to the original
            # title instead of tweeting "None <url>".
            tweet_text = article.title
            if settings.TWITTER_ADD_HASHTAGS:
                # TODO (gogasca) Find Twitter handlers
                tweet_text = twitter_utils.add_hash_tags(tweet_text,
                                                         entities)
            tweets.append('{} {}'.format(tweet_text, article.url))

    if campaign_instance.send_report:
        log.info('Sending email notification...')
        report.send()

    if campaign_instance.twitter:
        log.info('Sending Tweets...')
        twitter_utils.send_tweets(tweets, campaign_instance.twitter_delay)
    log.info('Extraction completed')