Ejemplo n.º 1
0
def process_articles(articles, news_provider, campaign_instance):
    """Store, analyze and publish the articles retrieved from News API.

    Articles without a description are skipped. For every other article:

    1. Optionally fetch its ``twitter:image`` meta element.
    2. If its URL is not in the DB yet, run sentiment analysis on the
       content (falling back to the description) and insert it.
    3. Optionally process entities and translate the article.
    4. Optionally add it to the email report and queue a tweet.

    Once every article is handled, the report and the queued tweets are
    sent according to the campaign settings.

    :param articles: Mapping of articles; only the values are used. Each
        value must expose ``title``, ``url``, ``description``, ``content``,
        ``author``, ``url_to_image``, ``source_id``, ``source``,
        ``published_at`` and ``twitter_image``.
    :param news_provider: Provider name, used in the report subject.
    :param campaign_instance: Campaign settings (``send_report``,
        ``twitter``, ``translation_enable``, ``email_recipients``,
        ``reference``, ``twitter_delay``) and ``set_articles()``.
    :return: None.
    """
    tweets = []
    num_of_articles = len(articles)
    campaign_instance.set_articles(num_of_articles)
    report = Report.get_report(subject='News ML | %s' % news_provider)

    log.info('Analyzing %d articles...', num_of_articles)
    if num_of_articles < 1:
        if campaign_instance.send_report:
            log.warning('Skipping report via email...')
        log.error('No titles found')
        return

    log.info('Translation enabled: %s', campaign_instance.translation_enable)
    log.info('Email reporting enabled: %s', campaign_instance.send_report)
    log.info('Twitter enabled: %s', campaign_instance.twitter)
    log.info('Twitter image extraction: %s', settings.EXTRACT_TWITTER_IMAGE)
    log.info('Twitter add hash tags: %s', settings.TWITTER_ADD_HASHTAGS)

    # Attach recipients to the report only when it will be emailed.
    if campaign_instance.send_report:
        report.email_recipients = campaign_instance.email_recipients

    for _, article in articles.items():
        # Per-article state is reset on every iteration so that nothing
        # (in particular the DB id) leaks over from the previous article.
        news_id = None
        entities = None
        new_article = False

        if not article.description:
            log.error('Not description found in article: %s', article.url)
            continue
        if settings.EXTRACT_TWITTER_IMAGE:  # meta:twitter:image
            article.twitter_image = twitter_utils.get_twitter_element(
                article.url, 'twitter:image')
        log.info('Article: %s, [%s], [%s]', article.title, article.url,
                 article.twitter_image)

        if not DbHelper.record_exists(article.url):
            log.info('New Article retrieved: %r, %r',
                     article.title, article.url)
            try:
                log.info('Processing sentiment analysis')
                score, magnitude = nlp_utils.get_sentiment_scores(
                    article.content or article.description)
                log.info('Insert article into Database')
                news_id = DbHelper.insert_news(
                    title=article.title,
                    author=article.author,
                    description=article.description,
                    content=article.content,
                    url=article.url,
                    url_to_image=article.url_to_image,
                    source_id=article.source_id,
                    source=article.source,
                    campaign=campaign_instance.reference,
                    published_at=article.published_at,
                    score=score,
                    magnitude=magnitude,
                    sentiment=nlp_utils.get_sentiment(score))
            except (ValueError, UnicodeDecodeError) as exception:
                # Best effort: the article is still translated, reported
                # and tweeted below, but it was not stored, so it must
                # not be flagged as new.
                log.exception(exception)
            else:
                if not news_id:
                    log.error('Unable to insert record %s', article.url)
                    continue
                new_article = True
        else:
            log.warning('Article %r already exists ', article.url)

        if settings.PROCESS_ENTITIES:
            entities = common_utils.process_entities(
                article, news_id, new_article)

        if campaign_instance.translation_enable:
            translated_text = translate_utils.translate_article(
                campaign_instance, article, new_article, news_id)
            # Guard against None as well as empty/one-character results.
            if translated_text and len(translated_text) > 1:
                log.info('Adding translated content to report.')
                article.title = translated_text
            else:
                log.error('Translated text is empty.')

        if campaign_instance.send_report:
            # Only send today articles in Report.
            today = datetime.now().date()
            try:
                published_at = datetime.strptime(
                    article.published_at[:10], '%Y-%m-%d').date()
            except (TypeError, ValueError):
                # A single malformed date must not abort the whole run.
                log.warning('Unable to parse published date %r for %s',
                            article.published_at, article.url)
                published_at = None
            if settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Publishing all dates articles')
            log.info('Today: %s Report date: %s. ', today, published_at)
            if today == published_at or settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Adding article information to Report: %s %s',
                         article.title, article.url)
                report.add_content(article.url, article.title,
                                   article.twitter_image)
            else:
                log.warning(
                    'Article published date is not today (%s), '
                    'skipping article from Report', published_at)

        # Handle Twitter
        if campaign_instance.twitter:
            # article.title already carries the translation when one was
            # produced, and the original title otherwise, so a failed
            # translation never yields a "None <url>" tweet.
            tweet_text = article.title
            if settings.TWITTER_ADD_HASHTAGS:
                # TODO (gogasca) Find Twitter handlers
                tweet_text = twitter_utils.add_hash_tags(tweet_text, entities)
            tweets.append('{} {}'.format(tweet_text, article.url))

    if campaign_instance.send_report:
        log.info('Sending report via email...')
        report.send()

    if campaign_instance.twitter:
        log.info('Sending Tweets')
        twitter_utils.send_tweets(tweets, campaign_instance.twitter_delay)

    log.info('Extraction completed')
Ejemplo n.º 2
0
def launch(campaign_instance=None):
    """Extract Techmeme articles, store them and publish the results.

    The logic is as follows:
        1. Extract articles from ``settings.TECHMEME_URL`` (optionally
           capped at ``campaign_instance.limit`` articles).
        2. Skip articles without a title.
        3. For articles not yet in the database, run sentiment analysis
           on the content and insert the record.
        4. Optionally process entities and translate the article.
        5. Optionally add the article to the email report and queue a
           tweet.
        6. Send the report and the tweets.

    :param campaign_instance: Campaign settings (``send_report``,
        ``twitter``, ``translation_enable``, ``limit``,
        ``email_recipients``, ``reference``, ``twitter_delay``) and
        ``set_articles()``. Required despite the ``None`` default.
    :return: None.
    :raises ValueError: if ``campaign_instance`` is None.
    """
    if campaign_instance is None:
        # Fail fast with a clear message instead of an AttributeError
        # raised by the first attribute access below.
        raise ValueError('launch() requires a campaign instance')

    tweets = []
    # Create Report instance; recipients are attached further below.
    report = Report.get_report(subject=settings.TECHMEME_REPORT)

    log.info('Translation enabled: %s', campaign_instance.translation_enable)
    log.info('Email reporting enabled: %s', campaign_instance.send_report)
    log.info('Twitter enabled: %s', campaign_instance.twitter)
    log.info('Twitter image extraction: %s', settings.EXTRACT_TWITTER_IMAGE)
    log.info('Twitter add hash tags: %s', settings.TWITTER_ADD_HASHTAGS)

    # Normalise an empty/None extraction result to an empty dict so the
    # limit handling below cannot fail on a missing ``items`` attribute.
    articles = extract_articles(settings.TECHMEME_URL) or {}
    num_of_articles = len(articles)

    log.info('Retrieving %d articles...', num_of_articles)
    limit = campaign_instance.limit
    if limit and limit > 0:
        log.warning('Limit is defined. Skipping other news')
        articles = dict(itertools.islice(articles.items(), limit))
        num_of_articles = len(articles)
    if num_of_articles < 1:
        log.error('No articles found')
        if campaign_instance.send_report:
            log.warning('Skipping report via email...')
        return
    log.info('Processing %d articles...', num_of_articles)
    campaign_instance.set_articles(num_of_articles)

    if campaign_instance.send_report:
        report.email_recipients = campaign_instance.email_recipients

    for _, article in articles.items():
        # Per-article state is reset on every iteration so that nothing
        # (in particular the DB id) leaks over from the previous article.
        news_id = None
        entities = None
        new_article = False

        if not article.title:
            log.warning('No title found. Article won\'t be inserted')
            continue
        if settings.EXTRACT_TWITTER_IMAGE:  # meta:twitter:image
            article.twitter_image = twitter_utils.get_twitter_element(
                article.url, 'twitter:image')
        log.info('Article: %s, [%s], [%s]', article.title, article.url,
                 article.twitter_image)

        if not DbHelper.record_exists(article.url):
            log.info('New Article retrieved: %r, %r',
                     article.title, article.url)
            try:
                log.info('Processing sentiment analysis')
                score, magnitude = nlp_utils.get_sentiment_scores(
                    article.content)
                source = url_extract.get_domain(article.url) or ''
                log.info('Insert article into Database')
                news_id = DbHelper.insert_news(
                    title=article.title,
                    content=article.content,
                    url=article.url,
                    provider=settings.TECHMEME,
                    source=source.upper(),
                    source_id=source,
                    campaign=campaign_instance.reference,
                    score=score,
                    magnitude=magnitude,
                    sentiment=nlp_utils.get_sentiment(score))
            except (ValueError, UnicodeDecodeError) as e:
                # Best effort: the article is still translated, reported
                # and tweeted below, but it was not stored, so it must
                # not be flagged as new.
                log.exception(e)
            else:
                if not news_id:
                    log.error('Unable to insert record %s', article.url)
                    continue
                new_article = True
        else:
            log.warning('Article already exists.')

        if settings.PROCESS_ENTITIES:
            entities = common_utils.process_entities(
                article, news_id, new_article)

        if campaign_instance.translation_enable:
            translated_text = translate_utils.translate_article(
                campaign_instance, article, new_article, news_id)
            if translated_text:
                log.info('Adding translated content to report.')
                article.title = translated_text

        if campaign_instance.send_report:
            # Only send articles created 'today' in Report.
            today = datetime.now().date()
            try:
                published_at = datetime.strptime(article.published_at,
                                                 '%y%m%d').date()
            except (TypeError, ValueError):
                # A single malformed date must not abort the whole run.
                log.warning('Unable to parse published date %r for %s',
                            article.published_at, article.url)
                published_at = None
            if settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Publishing all dates articles')
            log.info('Today: %s Report date: %s. ', today, published_at)
            if today == published_at or settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Adding article information to Report: %s %s',
                         article.title, article.url)
                report.add_content(article.url, article.title,
                                   article.twitter_image)
            else:
                log.warning(
                    'Article published date is not today (%s), '
                    'skipping article from Report', published_at)

        if campaign_instance.twitter:
            # article.title already carries the translation when one was
            # produced, and the original title otherwise, so a failed
            # translation never yields a "None <url>" tweet.
            tweet_text = article.title
            if settings.TWITTER_ADD_HASHTAGS:
                # TODO (gogasca) Find Twitter handlers
                tweet_text = twitter_utils.add_hash_tags(tweet_text, entities)
            tweets.append('{} {}'.format(tweet_text, article.url))

    if campaign_instance.send_report:
        log.info('Sending email notification...')
        report.send()

    if campaign_instance.twitter:
        log.info('Sending Tweets...')
        twitter_utils.send_tweets(tweets, campaign_instance.twitter_delay)

    log.info('Extraction completed')