Example #1
0
    def test_get_classifier_returns_naive_bayes_classifier_with_corpora_when_corpora_exist(
            self):
        # A single positive corpus must be shuffled and then handed to the
        # classifier as a (title, classification) tuple.

        # mock the shuffle list method
        classify.shuffle = mock.MagicMock()
        # regenerate the command class being tested as we mocked an extra module above
        self.command = classify.Command()

        # create a dummy news item and corpus
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.save()

        corpus = Corpus()
        corpus.news_item = news_item
        corpus.positive = True
        corpus.save()

        # call the method being tested
        classifier = self.command.get_classifier()

        # shuffled the corpora
        classify.shuffle.assert_called_once_with([('foo', 'pos')])
        # a classifier with corpora was created
        # (relies on classify.NaiveBayesClassifier having been replaced by a mock)
        classify.NaiveBayesClassifier.assert_called_once_with([('foo', 'pos')])
        # the method returned a result; an identity check does not depend on
        # how the returned object implements "!="
        self.assertIsNotNone(classifier)
Example #2
0
class RssFeedTestCase(TestCase):
    """Tests for the item accessors exposed by ``RssFeed``."""

    # replaced with fresh instances before every test, see setUp()
    rss_feed = None
    news_item = None

    def setUp(self):
        self.news_item = NewsItem()
        self.rss_feed = RssFeed()

    def test_items_returns_empty_list_when_newsitem_does_not_exist(self):
        items = self.rss_feed.items()
        self.assertEqual([], list(items))

    def test_items_returns_newsitems_when_positive_newsitem_exists(self):
        self.news_item.score = 1
        self.news_item.published = True
        self.news_item.save()

        items = self.rss_feed.items()
        self.assertEqual(1, len(list(items)))

    def test_items_returns_empty_list_when_no_positive_newsitem_exists(self):
        self.news_item.score = 0
        self.news_item.published = False
        self.news_item.save()

        items = self.rss_feed.items()
        self.assertEqual([], list(items))

    def test_item_title_returns_newsitem_title_when_set(self):
        self.news_item.title = 'foo_title'
        self.assertEqual('foo_title',
                         self.rss_feed.item_title(self.news_item))

    def test_item_title_returns_empty_string_when_newsitem_title_not_set(self):
        self.assertEqual('', self.rss_feed.item_title(self.news_item))

    def test_item_description_returns_newsitem_description_when_set(self):
        self.news_item.description = 'foo_description'
        self.assertEqual('foo_description',
                         self.rss_feed.item_description(self.news_item))

    def test_item_description_returns_empty_string_when_newsitem_description_not_set(
            self):
        self.assertEqual('', self.rss_feed.item_description(self.news_item))

    def test_item_link_returns_newsitem_url_when_set(self):
        self.news_item.url = 'https://www.google.com'
        self.assertEqual('https://www.google.com',
                         self.rss_feed.item_link(self.news_item))

    def test_item_link_returns_empty_string_when_newsitem_url_not_set(self):
        self.assertEqual('', self.rss_feed.item_link(self.news_item))

    def test_item_pubdate_returns_none_when_newsitem_added_at_not_set(self):
        # NOTE(review): the test name says "returns none" while the assertion
        # requires a non-None value - presumably NewsItem.added_at has a
        # default; confirm against the model and rename the test if so.
        self.assertIsNotNone(self.rss_feed.item_pubdate(self.news_item))

    def test_item_pubdate_returns_newsitem_added_at_datetime_when_set(self):
        # set the timestamp field this test is named after
        # (presumably the one item_pubdate reads - verify against RssFeed)
        self.news_item.added_at = timezone.now()
        self.assertIsNotNone(self.rss_feed.item_pubdate(self.news_item))
Example #3
0
    def test_get_classifier_uses_unique_corpora(self):
        # Two corpora with the same title and classification must reach the
        # shuffle step as a single entry.

        # mock the shuffle list method
        classify.shuffle = mock.MagicMock()
        # regenerate the command class being tested as we mocked an extra module above
        self.command = classify.Command()

        # create dummy news items and corpora with same title and classification
        for _ in range(2):
            news_item = NewsItem()
            news_item.title = 'foo'
            news_item.save()

            corpus = Corpus()
            corpus.news_item = news_item
            corpus.positive = True
            corpus.save()

        # test that two corpora exist
        corpora = Corpus.objects.all()
        self.assertEqual(2, len(corpora))

        # call the method being tested (its return value is not needed here)
        self.command.get_classifier()

        # shuffled the corpora with only one of the identical corpora
        classify.shuffle.assert_called_once_with([('foo', 'pos')])
Example #4
0
    def test_changelist_view_returns_metrics_when_corpora_exist_and_query_is_on_the_added_at_date(
            self):
        # A positive corpus added in December 2018 must be counted when the
        # changelist is filtered on that month and year.
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.save()

        corpus = Corpus()
        corpus.news_item = news_item
        corpus.positive = True
        corpus.added_at = '2018-12-02 21:00:00+00:00'
        corpus.save()

        # request the changelist as a superuser, filtered on the corpus' month
        superuser = self.create_superuser('superuser')
        request = self.mocked_authenticated_request(
            '/admin/rss/corpusmetric/?added_at__month=12&added_at__year=2018',
            superuser)
        response = self.admin.changelist_view(request)

        expected_metrics = [{'positive': True, 'total': 1}]
        expected_metrics_total = {'total': 1}

        self.assertEqual(expected_metrics,
                         response.context_data['corpus_metrics'])
        self.assertEqual(expected_metrics_total,
                         response.context_data['corpus_metrics_total'])
Example #5
0
    def test_get_newsitem_title_returns_title_when_news_item_is_not_none(self):
        # The admin helper must return the title of the news item attached to
        # the corpus; nothing is saved, the objects are only linked in memory.
        news_item = NewsItem()
        news_item.title = 'foo'
        corpus = Corpus()
        corpus.news_item = news_item

        admin = CorpusAdmin(Corpus, AdminSite())
        self.assertEqual('foo', admin.get_news_item_title(corpus))
Example #6
0
    def test_news_item_unpublish_unpublishes_newsitems_in_query_set(self):
        # The unpublish action must store the given news item as unpublished.

        # start from an empty table
        news_items = NewsItem.objects.all()
        self.assertEqual([], list(news_items))

        # a published item that is deliberately not saved by the test itself
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.published = True
        query_set = [news_item]

        self.newsitem.news_item_unpublish(None, None, query_set)

        # the action persisted the item, and it is no longer published
        news_items = NewsItem.objects.all()
        self.assertEqual(1, len(news_items))
        self.assertEqual(False, news_items[0].published)
Example #7
0
    def get_classifier(self):
        """Train a classifier on the stored corpora, dump it to disk and return it.

        Titles of the active corpora (labelled with
        ``Corpus.get_classification()``) and of the neutral news items
        (labelled ``'neu'``) are stripped of stopwords, de-duplicated and
        shuffled before being passed to ``NaiveBayesClassifier``. The trained
        classifier is pickled to ``settings.CLASSIFIER_DUMP_FILEPATH``.

        Returns:
            The trained ``NaiveBayesClassifier`` instance.
        """
        self.logger.info('Training classifier...')

        stopwords_blacklisted = self.get_stopwords()
        if stopwords_blacklisted:
            # escape every stopword so that it is always matched literally
            stopwords_pattern = re.compile(
                r'\b(' +
                r'|'.join(re.escape(word) for word in stopwords_blacklisted) +
                r')\b\s*')

            def strip_stopwords(title):
                return stopwords_pattern.sub('', title)
        else:
            # an empty alternation matches at every word boundary and would
            # swallow the whitespace between words, so keep titles untouched
            def strip_stopwords(title):
                return title

        corpora_classified = list()
        for corpus in Corpus.objects.filter(active=True):
            title = strip_stopwords(corpus.news_item.title)
            corpora_classified.append((title, corpus.get_classification()))

        for news_item in NewsItem.find_neutral():
            title = strip_stopwords(news_item.title)
            corpora_classified.append((title, 'neu'))

        # drop identical (title, classification) pairs before training
        corpora_classified = list(set(corpora_classified))
        shuffle(corpora_classified)

        classifier = NaiveBayesClassifier(corpora_classified)

        self.logger.info('Dumping classifier.')
        # the context manager guarantees the dump file is flushed and closed
        with open(settings.CLASSIFIER_DUMP_FILEPATH, 'wb') as dump_file:
            pickle.dump(classifier, dump_file)
        self.logger.info('Classifier dumped!')

        return classifier
Example #8
0
    def handle(self, *args, **options):
        """Re-queue every negative news item for classification.

        Looks up the news items returned by ``NewsItem.find_negative()`` for
        ``settings.SENTIMENT_POLARITY_THRESHOLD`` and publishes each of them,
        serialized as JSON, to the ``settings.QUEUE_NAME_CLASSIFY`` queue with
        the ``x-is-self-train`` header set. Returns early, without opening a
        connection, when there is nothing to publish.
        """
        news_items = NewsItem.find_negative(
            settings.SENTIMENT_POLARITY_THRESHOLD)

        if not news_items:
            self.logger.info('No news items found.')
            return

        connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=settings.QUEUE_HOSTNAME))
        try:
            channel = connection.channel()
            channel.queue_declare(queue=settings.QUEUE_NAME_CLASSIFY,
                                  durable=True)
            self.logger.info(
                'Found %s negative news items that are going to be re-classified.',
                len(news_items))

            for news_item in news_items:
                body = serializers.serialize('json', [news_item])
                channel.basic_publish(exchange='',
                                      routing_key=settings.QUEUE_NAME_CLASSIFY,
                                      body=body,
                                      properties=pika.BasicProperties(
                                          delivery_mode=2,
                                          headers={'x-is-self-train': True}))

                self.logger.info('Successfully re-queued #%s "%s"!',
                                 news_item.id, news_item.title)
        finally:
            # always release the broker connection, even if publishing failed
            connection.close()
Example #9
0
    def test_changelist_view_returns_metrics_when_positive_newsitem_and_negative_corpus_exist(self):
        # A news item scored as positive but corrected by a negative corpus
        # must show up as a misclassification (0 accuracy) in the metrics.
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.score = 1
        news_item.added_at = '2018-12-03 21:00:00+00:00'
        news_item.save()

        corpus = Corpus()
        corpus.positive = False
        corpus.news_item = news_item
        corpus.save()

        superuser = self.create_superuser('superuser')
        request = self.mocked_authenticated_request('/admin/rss/newsitemmetric/', superuser)
        response = self.admin.changelist_view(request)

        self.assertEqual(1, response.context_data['news_items_count'])
        self.assertEqual(0, response.context_data['news_items_unclassified'])
        self.assertEqual(1, response.context_data['classification_initial']['positive'])
        self.assertEqual(0, response.context_data['classification_initial']['negative'])
        self.assertEqual(0, response.context_data['classification_supervised']['positive'])
        self.assertEqual(1, response.context_data['classification_supervised']['negative'])
        self.assertEqual(0, response.context_data['corpus_count']['positive'])
        self.assertEqual(1, response.context_data['corpus_count']['negative'])
        self.assertEqual([{'accuracy': 0, 'added_at': '2018-12-03'}], response.context_data['accuracy'])
Example #10
0
File: web.py Project: mylk/ojah
def news(request):
    """Render the ``web/news.html`` template with the positive news items.

    The items come from ``NewsItem.find_positive()``, called with the
    configured sentiment polarity threshold and news item count.
    """
    page = loader.get_template('web/news.html')
    positive_items = NewsItem.find_positive(
        settings.SENTIMENT_POLARITY_THRESHOLD,
        settings.WEB_NEWS_ITEMS_COUNT,
    )
    return HttpResponse(page.render({'news_items': positive_items}))
Example #11
0
    def test_get_classifier_uses_corpora_clean_of_stopwords(self):
        # Stopwords must be removed from the corpus title before training.

        # mock the shuffle list method
        classify.shuffle = mock.MagicMock()
        # regenerate the command class being tested as we mocked an extra module above
        self.command = classify.Command()

        # create dummy news item (having title that contains stopwords) and corpora
        news_item = NewsItem()
        news_item.title = 'when foo then bar'
        news_item.save()

        corpus = Corpus()
        corpus.news_item = news_item
        corpus.positive = True
        corpus.save()

        # call the method being tested (its return value is not needed here)
        self.command.get_classifier()

        # shuffled the corpora with the stopwords stripped from the title
        classify.shuffle.assert_called_once_with([('foo bar', 'pos')])
Example #12
0
    def test_get_accuracy_total_returns_100_percent_when_no_corpora(self):
        # A classified news item that no corpus corrects counts as accurate.
        # self.date_range is prepared outside this method - presumably it
        # covers 2018-11-24; verify against the fixture.
        news_item = NewsItem()
        news_item.score = 1
        news_item.added_at = '2018-11-24 01:00:00+00:00'
        news_item.save()

        accuracy = self.newsitem_metric.get_accuracy_total(self.date_range)
        self.assertEqual(100, accuracy)
Example #13
0
    def test_changelist_view_returns_positive_and_negative_metrics_when_positive_and_negative_corpora_exist(
            self):
        # One positive and one negative corpus must be reported as two
        # separate metric rows plus a grand total of two.

        # positive corpus
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.save()

        corpus = Corpus()
        corpus.news_item = news_item
        corpus.positive = True
        corpus.save()

        # negative corpus
        news_item = NewsItem()
        news_item.title = 'bar'
        news_item.save()

        corpus = Corpus()
        corpus.news_item = news_item
        corpus.positive = False
        corpus.save()

        superuser = self.create_superuser('superuser')
        request = self.mocked_authenticated_request('/admin/rss/corpusmetric/',
                                                    superuser)
        response = self.admin.changelist_view(request)

        expected_metrics = [{
            'positive': True,
            'total': 1
        }, {
            'positive': False,
            'total': 1
        }]
        expected_metrics_total = {'total': 2}

        self.assertEqual(expected_metrics,
                         response.context_data['corpus_metrics'])
        self.assertEqual(expected_metrics_total,
                         response.context_data['corpus_metrics_total'])
Example #14
0
    def test_get_accuracy_returns_100_percent_when_no_corpora(self):
        # A classified news item that no corpus corrects must yield a single
        # metric row, for its day, with 100% accuracy.
        # self.date_range is prepared outside this method - presumably it
        # covers 2018-11-24; verify against the fixture.
        news_item = NewsItem()
        news_item.score = 1
        news_item.added_at = '2018-11-24 01:00:00+00:00'
        news_item.save()

        metrics = self.newsitem_metric.get_accuracy(self.date_range)
        self.assertEqual(1, len(metrics))
        self.assertEqual(100, metrics[0]['accuracy'])
        self.assertEqual('2018-11-24', metrics[0]['added_at'])
Example #15
0
    def test_corpus_create_positive_creates_positive_corpora_and_enqueues_job_to_retrain_classifier_when_newsitems_in_query_set(
            self):
        # The admin action must create one positive corpus for the selected
        # news item and enqueue the corpus creation exactly once.
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.published = True
        news_item.save()
        query_set = [news_item]

        self.newsitem.corpus_create_positive(None, None, query_set)

        # relies on enqueue_corpus_creation having been replaced by a mock
        self.newsitem.enqueue_corpus_creation.assert_called_once()
        corpora = Corpus.objects.all()
        self.assertEqual(1, len(corpora))
        self.assertEqual(True, corpora[0].positive)
Example #16
0
    def test_get_accuracy_total_returns_0_percent_when_only_not_accurate_newsitems_exist(
            self):
        # A positively scored news item that a negative corpus corrects is a
        # misclassification, so the total accuracy must be zero.
        # self.date_range is prepared outside this method - presumably it
        # covers 2018-11-24; verify against the fixture.
        news_item = NewsItem()
        news_item.score = 1
        news_item.added_at = '2018-11-24 01:00:00+00:00'
        news_item.save()

        corpus = Corpus()
        corpus.news_item = news_item
        corpus.positive = False
        corpus.save()

        accuracy = self.newsitem_metric.get_accuracy_total(self.date_range)
        self.assertEqual(0.0, accuracy)
Example #17
0
    def setUp(self):
        # Replace the external collaborators of the classify command (threads,
        # pika, the DB connection, the classifier and the logger) with mocks
        # and build the command under test. Module attributes are swapped with
        # mock.patch.object so the originals are restored after each test
        # instead of staying overwritten for the rest of the process.
        def patch_attribute(target, attribute, *args, **kwargs):
            # start the patch now and undo it automatically on test cleanup
            patcher = mock.patch.object(target, attribute, *args, **kwargs)
            replacement = patcher.start()
            self.addCleanup(patcher.stop)
            return replacement

        # retain the original imported packages
        classify.serializers_real = classify.serializers

        # mock the thread target methods; the first Thread() call yields the
        # classify thread and the second one the train thread
        self.thread_classify = mock.MagicMock()
        self.thread_train = mock.MagicMock()
        patch_attribute(classify.threading, 'Thread',
                        side_effect=[self.thread_classify, self.thread_train])

        # mock pika.BlockingConnection
        self.connection = mock.MagicMock()
        patch_attribute(classify.pika, 'BlockingConnection',
                        return_value=self.connection)
        # mock pika.BlockingConnection.channel
        self.channel = mock.MagicMock()
        self.connection.channel = mock.MagicMock(return_value=self.channel)
        # mock ConnectionParameters
        patch_attribute(classify.pika, 'ConnectionParameters')
        # mock the ORM connection
        self.db_connection = mock.MagicMock()
        patch_attribute(classify, 'connection', self.db_connection)

        # mock the classifier
        patch_attribute(classify, 'NaiveBayesClassifier')

        classify.settings.AUTO_PUBLISH = False

        # a fake news item to be used as classification input
        news_item = NewsItem()
        news_item.title = 'foo'
        self.serialized_news_item = serializers.serialize('json', [news_item])

        # a second item whose "published" field holds a non-boolean value
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.published = 'foo'
        self.serialized_news_item_crap = serializers.serialize(
            'json', [news_item])

        # mock the logger
        self.logger = mock.MagicMock()
        patch_attribute(classify.logging, 'getLogger',
                        return_value=self.logger)

        self.command = classify.Command()
Example #18
0
    def test_news_item_publish_and_corpus_create_negative_publishes_newsitems_and_creates_negative_corpora_when_newsitems_in_query_set(
            self):
        # The "unpublish and create negative corpus" action must unpublish the
        # selected news item, attach a negative corpus to it and enqueue the
        # corpus creation exactly once.
        # NOTE(review): the test name says "publish" while the action under
        # test is news_item_unpublish_and_corpus_create_negative.
        news_item = NewsItem()
        news_item.title = 'foo'
        # start from a published item so that the unpublish step is observable
        news_item.published = True
        news_item.score = 1.00
        news_item.save()

        query_set = [news_item]
        self.newsitem.news_item_unpublish_and_corpus_create_negative(
            None, None, query_set)

        self.newsitem.enqueue_corpus_creation.assert_called_once()
        news_items = NewsItem.objects.all()
        self.assertEqual(1, len(news_items))
        self.assertFalse(news_items[0].published)
        corpus = Corpus.objects.filter(news_item=news_items[0])
        # a filter() result is never None, so check that a row actually exists
        self.assertTrue(corpus.exists())
        self.assertFalse(corpus[0].positive)
Example #19
0
    def test_get_accuracy_returns_0_percent_when_only_not_accurate_newsitems_exist(
            self):
        # A positively scored news item that a negative corpus corrects must
        # yield a single metric row, for its day, with 0% accuracy.
        # self.date_range is prepared outside this method - presumably it
        # covers 2018-11-24; verify against the fixture.
        news_item = NewsItem()
        news_item.score = 1
        news_item.added_at = '2018-11-24 01:00:00+00:00'
        news_item.save()

        corpus = Corpus()
        corpus.news_item = news_item
        corpus.positive = False
        corpus.save()

        metrics = self.newsitem_metric.get_accuracy(self.date_range)
        self.assertEqual(1, len(metrics))
        self.assertEqual(0, metrics[0]['accuracy'])
        self.assertEqual('2018-11-24', metrics[0]['added_at'])
Example #20
0
    def test_news_returns_http_response_with_template_and_positive_newsitems_when_positive_newsitems_exist(self):
        # A published, positively scored news item must be rendered by the
        # news view.

        # create a positive news item
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.score = 1
        news_item.published = True
        news_item.save()

        # make request to news view
        response = self.get_news()

        # returns exactly an HttpResponse (not a subclass)
        self.assertIs(type(response), HttpResponse)

        # request didn't fail
        self.assertEqual(200, response.status_code)

        # response contains news items
        content = response.getvalue()
        self.assertIn('news-item', str(content))
Example #21
0
    def test_changelist_view_returns_metrics_when_newsitems_exist_but_no_corpora_and_date_query_includes_newsitem(self):
        # A positively scored news item without any corpus, queried for its
        # own month, must be reported as fully accurate.
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.score = 1
        news_item.added_at = '2018-12-03 21:00:00+00:00'
        news_item.save()

        superuser = self.create_superuser('superuser')
        request = self.mocked_authenticated_request('/admin/rss/newsitemmetric/?added_at__month=12&added_at__year=2018', superuser)
        response = self.admin.changelist_view(request)

        self.assertEqual(1, response.context_data['news_items_count'])
        self.assertEqual(0, response.context_data['news_items_unclassified'])
        self.assertEqual(1, response.context_data['classification_initial']['positive'])
        self.assertEqual(0, response.context_data['classification_initial']['negative'])
        self.assertEqual(1, response.context_data['classification_supervised']['positive'])
        self.assertEqual(0, response.context_data['classification_supervised']['negative'])
        self.assertEqual(0, response.context_data['corpus_count']['positive'])
        self.assertEqual(0, response.context_data['corpus_count']['negative'])
        self.assertEqual([{'accuracy': 100.0, 'added_at': '2018-12-03'}], response.context_data['accuracy'])
Example #22
0
    def test_news_item_publish_and_corpus_create_positive_publishes_newsitems_and_creates_positive_corpora_when_newsitems_in_query_set(
            self):
        # The "publish and create positive corpus" action must publish the
        # selected news item, create one positive corpus and enqueue the
        # corpus creation exactly once.
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.published = False
        news_item.save()

        query_set = [news_item]
        self.newsitem.news_item_publish_and_corpus_create_positive(
            None, None, query_set)

        self.newsitem.enqueue_corpus_creation.assert_called_once()
        corpora = Corpus.objects.all()
        self.assertEqual(1, len(corpora))
        self.assertEqual(True, corpora[0].positive)
        news_items = NewsItem.objects.all()
        self.assertEqual(1, len(news_items))
        self.assertEqual(True, news_items[0].published)
Example #23
0
    def test_about_returns_http_response_with_stats(self):
        # The about view must render the statistics of the stored data: only
        # the active source is counted, together with the single news item.

        # create a news item
        news_item = NewsItem()
        news_item.title = 'foo'
        news_item.score = 1
        news_item.published = False
        news_item.save()

        # create an active source
        source = Source()
        source.name = 'foo'
        source.url = 'http://www.foo.com'
        source.homepage = 'http://www.foo.com'
        source.save()

        # create an inactive source
        source = Source()
        source.name = 'bar'
        source.url = 'http://www.bar.com'
        source.homepage = 'http://www.bar.com'
        source.active = False
        source.save()

        # make request to about view
        response = self.get_about()

        # returns exactly an HttpResponse (not a subclass)
        self.assertIs(type(response), HttpResponse)

        # request didn't fail
        self.assertEqual(200, response.status_code)

        # response contains the expected statistics
        content = str(response.getvalue())
        self.assertIn('Sources crawled</strong>: 1', content)
        self.assertIn('News classified</strong>: 1', content)
        self.assertIn('Corpora created</strong>: 0', content)
        self.assertIn('Classification accuracy</strong>: 100%', content)
Example #24
0
    def test_handle_publishes_when_newsitems_exist(self):
        # With one stored news item, handle() must open a connection, declare
        # the durable classify queue, publish exactly once and log its progress.
        stored_item = NewsItem(id=1, title='foo')
        stored_item.save()

        # the method being tested
        self.command.handle()

        pika_module = classify_requeue.pika
        pika_module.BlockingConnection.assert_called_once()
        pika_module.ConnectionParameters.assert_called_once()
        self.connection.channel.assert_called_once()
        self.channel.queue_declare.assert_called_once_with(
            durable=True, queue=settings.QUEUE_NAME_CLASSIFY)
        self.channel.basic_publish.assert_called_once()

        log_info = self.logger.info
        log_info.assert_any_call(
            'Found %s news items that need to be classified.', 1)
        log_info.assert_any_call('Successfully re-queued #%s "%s"!', 1, 'foo')
Example #25
0
    def test_to_dict_returns_dictionaries_when_cursor_more_than_one_results(
            self):
        # Every row fetched by the cursor must be converted to its own dict.
        news_item = NewsItem()
        news_item.score = 1
        news_item.added_at = '2018-11-23 01:00:00+00:00'
        news_item.save()

        news_item = NewsItem()
        news_item.score = 1
        news_item.added_at = '2018-11-23 02:00:00+00:00'
        news_item.save()

        with connection.cursor() as cursor:
            cursor.execute('SELECT * FROM news_item')
            results = self.newsitem_metric.to_dict(cursor)

        self.assertEqual(2, len(results))
        self.assertEqual(dict, type(results[0]))
        self.assertEqual(dict, type(results[1]))
Example #26
0
 def setUp(self):
     # give every test a fresh corpus linked to a fresh, unsaved news item
     self.corpus = Corpus()
     self.corpus.news_item = NewsItem()
Example #27
0
    def test_to_dict_returns_one_dictionary_when_cursor_has_one_result(self):
        # A single fetched row must become one dict keyed by column name.
        source = Source()
        source.id = 1
        source.name = 'foo_source'
        source.save()

        news_item = NewsItem()
        news_item.id = 1
        news_item.title = 'foo'
        news_item.description = 'bar'
        news_item.url = 'https://www.google.com'
        news_item.added_at = '2018-11-23 01:00:00+00:00'
        news_item.source = source
        news_item.published = False
        news_item.score = 1
        news_item.save()

        with connection.cursor() as cursor:
            cursor.execute('SELECT * FROM news_item')
            result = self.newsitem_metric.to_dict(cursor)

        self.assertEqual(1, len(result))
        # the raw cursor is expected to hand back added_at as a naive datetime
        self.assertEqual([{
            'id': 1,
            'title': 'foo',
            'description': 'bar',
            'url': 'https://www.google.com',
            'added_at': datetime(2018, 11, 23, 1, 0),
            'source_id': 1,
            'published': False,
            'score': 1
        }], result)
Example #28
0
    def test_get_accuracy_does_not_include_unclassified_newsitems(self):
        # News items without a score must not produce any accuracy metric,
        # whether they are published or not.
        news_item = NewsItem()
        news_item.score = None
        news_item.added_at = '2018-11-24 01:00:00+00:00'
        news_item.save()

        news_item = NewsItem()
        news_item.score = None
        news_item.published = True
        news_item.added_at = '2018-11-24 02:00:00+00:00'
        news_item.save()

        metrics = self.newsitem_metric.get_accuracy(self.date_range)
        self.assertEqual([], metrics)
Example #29
0
    def test_get_accuracy_total_uses_two_day_statistics_when_exist(self):
        # One misclassified item on the first day and one uncorrected item on
        # the second day must give a total accuracy of 50%.
        # self.date_range is prepared outside this method - presumably it
        # covers both days; verify against the fixture.

        # day one: positively scored, corrected by a negative corpus
        news_item = NewsItem()
        news_item.score = 1
        news_item.added_at = '2018-11-24 01:00:00+00:00'
        news_item.save()

        corpus = Corpus()
        corpus.news_item = news_item
        corpus.positive = False
        corpus.save()

        # day two: positively scored, no corpus
        news_item = NewsItem()
        news_item.score = 1
        news_item.added_at = '2018-11-25 01:00:00+00:00'
        news_item.save()

        accuracy = self.newsitem_metric.get_accuracy_total(self.date_range)
        self.assertEqual(50.0, accuracy)
Example #30
0
 def setUp(self):
     # give every test a fresh, unsaved news item and a fresh feed
     self.news_item, self.rss_feed = NewsItem(), RssFeed()