def test_changelist_view_returns_metrics_when_accurate_and_inaccurate_newsitems_exist_and_finally_one_of_each_class(self):
    """Check changelist metrics for one corrected and one uncorrected item.

    Two news items with score 1 are saved for 2018-12-03 and the first one
    receives a negative corpus. The view is expected to report both items
    as initially positive, one positive and one negative after supervision,
    a single negative corpus and 50% accuracy for that day.
    """
    corrected_item = NewsItem()
    corrected_item.title = 'foo'
    corrected_item.score = 1
    corrected_item.added_at = '2018-12-03 21:00:00+00:00'
    corrected_item.save()

    # The negative corpus marks the first item's classification as wrong.
    corpus = Corpus()
    corpus.positive = False
    corpus.news_item = corrected_item
    corpus.save()

    untouched_item = NewsItem()
    untouched_item.title = 'bar'
    untouched_item.score = 1
    untouched_item.added_at = '2018-12-03 22:00:00+00:00'
    untouched_item.save()

    superuser = self.create_superuser('superuser')
    request = self.mocked_authenticated_request(
        '/admin/rss/newsitemmetric/', superuser)
    response = self.admin.changelist_view(request)
    context = response.context_data

    self.assertEqual(2, context['news_items_count'])
    self.assertEqual(0, context['news_items_unclassified'])
    self.assertEqual(2, context['classification_initial']['positive'])
    self.assertEqual(0, context['classification_initial']['negative'])
    self.assertEqual(1, context['classification_supervised']['positive'])
    self.assertEqual(1, context['classification_supervised']['negative'])
    self.assertEqual(0, context['corpus_count']['positive'])
    self.assertEqual(1, context['corpus_count']['negative'])
    self.assertEqual([{'accuracy': 50, 'added_at': '2018-12-03'}],
                     context['accuracy'])
def test_get_accuracy_does_not_include_unclassified_newsitems(self):
    """Expect no accuracy rows when every saved item has ``score = None``.

    Two items without a score are saved for 2018-11-24 (one of them flagged
    as published); ``get_accuracy`` must return an empty list for
    ``self.date_range`` (defined outside this block).
    """
    unpublished_item = NewsItem()
    unpublished_item.score = None
    unpublished_item.added_at = '2018-11-24 01:00:00+00:00'
    unpublished_item.save()

    published_item = NewsItem()
    published_item.score = None
    published_item.published = True
    published_item.added_at = '2018-11-24 02:00:00+00:00'
    published_item.save()

    metrics = self.newsitem_metric.get_accuracy(self.date_range)

    self.assertEqual([], metrics)
def test_get_accuracy_returns_empty_list_when_no_newsitems_between_range(
        self):
    """Expect an empty list when the saved items are dated 11-23 and 11-26.

    NOTE(review): ``self.date_range`` is defined outside this block;
    presumably it covers only the days between those two dates.
    """
    earlier_item = NewsItem()
    earlier_item.score = 1
    earlier_item.added_at = '2018-11-23 01:00:00+00:00'
    earlier_item.save()

    later_item = NewsItem()
    later_item.score = 1
    later_item.added_at = '2018-11-26 01:00:00+00:00'
    later_item.save()

    metrics = self.newsitem_metric.get_accuracy(self.date_range)

    self.assertEqual([], metrics)
def test_get_accuracy_total_returns_none_when_no_newsitems_between_range(
        self):
    """Expect ``None`` when the saved items are dated 11-23 and 11-26.

    NOTE(review): ``self.date_range`` is defined outside this block;
    presumably it covers only the days between those two dates.
    """
    earlier_item = NewsItem()
    earlier_item.score = 1
    earlier_item.added_at = '2018-11-23 01:00:00+00:00'
    earlier_item.save()

    later_item = NewsItem()
    later_item.score = 1
    later_item.added_at = '2018-11-26 01:00:00+00:00'
    later_item.save()

    accuracy = self.newsitem_metric.get_accuracy_total(self.date_range)

    self.assertIsNone(accuracy)
def test_to_dict_returns_one_dictionary_when_cursor_has_one_result(self):
    """Expect ``to_dict`` to turn a one-row cursor into a one-dict list.

    A single source and a single news item are saved with explicit values
    so every column of the ``news_item`` row can be compared exactly.
    """
    source = Source()
    source.id = 1
    source.name = 'foo_source'
    source.save()

    news_item = NewsItem()
    news_item.id = 1
    news_item.title = 'foo'
    news_item.description = 'bar'
    news_item.url = 'https://www.google.com'
    news_item.added_at = '2018-11-23 01:00:00+00:00'
    news_item.source = source
    news_item.published = False
    news_item.score = 1
    news_item.save()

    with connection.cursor() as cursor:
        cursor.execute('SELECT * FROM news_item')
        result = self.newsitem_metric.to_dict(cursor)

        self.assertEqual(1, len(result))
        # The expected timestamp is a naive datetime, as compared by the
        # original test.
        self.assertEqual([{
            'id': 1,
            'title': 'foo',
            'description': 'bar',
            'url': 'https://www.google.com',
            'added_at': datetime(2018, 11, 23, 1, 0),
            'source_id': 1,
            'published': False,
            'score': 1
        }], result)
def test_get_accuracy_total_returns_100_percent_when_no_corpora(self):
    """Expect a total accuracy of 100 for one scored item and no corpus."""
    news_item = NewsItem()
    news_item.score = 1
    news_item.added_at = '2018-11-24 01:00:00+00:00'
    news_item.save()

    accuracy = self.newsitem_metric.get_accuracy_total(self.date_range)

    self.assertEqual(100, accuracy)
def test_get_accuracy_total_uses_two_day_statistics_when_exist(self):
    """Expect a total accuracy of 50.0 across two days of news items.

    The item from 2018-11-24 gets a negative corpus, the item from
    2018-11-25 has none.
    """
    corrected_item = NewsItem()
    corrected_item.score = 1
    corrected_item.added_at = '2018-11-24 01:00:00+00:00'
    corrected_item.save()

    corpus = Corpus()
    corpus.news_item = corrected_item
    corpus.positive = False
    corpus.save()

    untouched_item = NewsItem()
    untouched_item.score = 1
    untouched_item.added_at = '2018-11-25 01:00:00+00:00'
    untouched_item.save()

    accuracy = self.newsitem_metric.get_accuracy_total(self.date_range)

    self.assertEqual(50.0, accuracy)
def test_to_dict_returns_dictionaries_when_cursor_more_than_one_results(
        self):
    """Expect ``to_dict`` to return one ``dict`` per row of the cursor."""
    first_item = NewsItem()
    first_item.score = 1
    first_item.added_at = '2018-11-23 01:00:00+00:00'
    first_item.save()

    second_item = NewsItem()
    second_item.score = 1
    second_item.added_at = '2018-11-23 02:00:00+00:00'
    second_item.save()

    with connection.cursor() as cursor:
        cursor.execute('SELECT * FROM news_item')
        results = self.newsitem_metric.to_dict(cursor)

        self.assertEqual(2, len(results))
        # Exact type comparison, as in the original test.
        self.assertEqual(dict, type(results[0]))
        self.assertEqual(dict, type(results[1]))
def test_get_accuracy_total_returns_50_percent_when_one_accurate_newsitem_and_one_not_accurate(
        self):
    """Expect a total accuracy of 50.0 for two same-day items.

    Both items are dated 2018-11-24; only the first one gets a negative
    corpus.
    """
    corrected_item = NewsItem()
    corrected_item.score = 1
    corrected_item.added_at = '2018-11-24 01:00:00+00:00'
    corrected_item.save()

    corpus = Corpus()
    corpus.news_item = corrected_item
    corpus.positive = False
    corpus.save()

    untouched_item = NewsItem()
    untouched_item.score = 1
    untouched_item.added_at = '2018-11-24 02:00:00+00:00'
    untouched_item.save()

    accuracy = self.newsitem_metric.get_accuracy_total(self.date_range)

    self.assertEqual(50.0, accuracy)
def test_get_accuracy_returns_100_percent_when_no_corpora(self):
    """Expect one daily row with accuracy 100 when no corpus exists."""
    news_item = NewsItem()
    news_item.score = 1
    news_item.added_at = '2018-11-24 01:00:00+00:00'
    news_item.save()

    metrics = self.newsitem_metric.get_accuracy(self.date_range)

    self.assertEqual(1, len(metrics))
    self.assertEqual(100, metrics[0]['accuracy'])
    self.assertEqual('2018-11-24', metrics[0]['added_at'])
def test_get_accuracy_returns_50_percent_when_one_accurate_newsitem_and_one_not_accurate(
        self):
    """Expect one daily row with accuracy 50 for two same-day items.

    Both items are dated 2018-11-24; only the first one gets a negative
    corpus.
    """
    corrected_item = NewsItem()
    corrected_item.score = 1
    corrected_item.added_at = '2018-11-24 01:00:00+00:00'
    corrected_item.save()

    corpus = Corpus()
    corpus.news_item = corrected_item
    corpus.positive = False
    corpus.save()

    untouched_item = NewsItem()
    untouched_item.score = 1
    untouched_item.added_at = '2018-11-24 02:00:00+00:00'
    untouched_item.save()

    metrics = self.newsitem_metric.get_accuracy(self.date_range)

    self.assertEqual(1, len(metrics))
    self.assertEqual(50, metrics[0]['accuracy'])
    self.assertEqual('2018-11-24', metrics[0]['added_at'])
def test_get_accuracy_returns_two_day_statistics_when_newsitems_for_two_days_exist(
        self):
    """Expect one accuracy row per day, in date order.

    The item from 2018-11-24 gets a negative corpus (expected accuracy 0);
    the item from 2018-11-25 has no corpus (expected accuracy 100).
    """
    corrected_item = NewsItem()
    corrected_item.score = 1
    corrected_item.added_at = '2018-11-24 01:00:00+00:00'
    corrected_item.save()

    corpus = Corpus()
    corpus.news_item = corrected_item
    corpus.positive = False
    corpus.save()

    untouched_item = NewsItem()
    untouched_item.score = 1
    untouched_item.added_at = '2018-11-25 01:00:00+00:00'
    untouched_item.save()

    metrics = self.newsitem_metric.get_accuracy(self.date_range)

    self.assertEqual(2, len(metrics))
    self.assertEqual('2018-11-24', metrics[0]['added_at'])
    self.assertEqual(0, metrics[0]['accuracy'])
    self.assertEqual('2018-11-25', metrics[1]['added_at'])
    self.assertEqual(100, metrics[1]['accuracy'])
def crawl(self, source, channel):
    """Fetch a source's feed, store new entries and queue them.

    Every feed entry that ``NewsItem.exists`` does not already report is
    saved as an unscored ``NewsItem`` and published, serialized as JSON,
    to the ``settings.QUEUE_NAME_CLASSIFY`` queue.

    Args:
        source: Object providing ``name``, ``url``, ``crawling()`` and
            ``crawled()``; it is also assigned to each new news item.
        channel: Object providing ``basic_publish`` (used here with
            ``pika.BasicProperties``).

    Returns:
        None. Returns early, after logging an error, when parsing the
        feed raises ``RuntimeError``.
    """
    source.crawling()
    self.logger.info('Crawling \'%s\'...', source.name)

    try:
        feedparser.USER_AGENT = settings.RSS_CRAWL_USER_AGENT
        feed = feedparser.parse(source.url)
    except RuntimeError:
        # NOTE(review): source.crawled() is not called on this path, so the
        # source keeps whatever state crawling() set - confirm intended.
        self.logger.error('Could not crawl \'%s\'.', source.name)
        return

    for entry in feed['entries']:
        # Make sure every entry carries a 'published' value: prefer the
        # feed's own, then 'updated', and finally the current time.
        if 'published' not in entry:
            if 'updated' in entry:
                entry['published'] = entry['updated']
            else:
                entry['published'] = timezone.now().isoformat()

        # Parse the timestamp once; it is needed both for the duplicate
        # check and for the stored news item.
        published_at = parse(entry['published'])

        if NewsItem.exists(entry['title'], published_at, source):
            continue

        # Entries without a summary fall back to their title.
        description = entry['summary'] if 'summary' in entry else entry[
            'title']

        news_item = NewsItem()
        news_item.title = entry['title']
        news_item.description = description
        news_item.url = entry['link']
        news_item.source = source
        news_item.score = None
        news_item.added_at = published_at
        news_item.save()

        body = serializers.serialize('json', [news_item])
        channel.basic_publish(
            exchange='',
            routing_key=settings.QUEUE_NAME_CLASSIFY,
            body=body,
            properties=pika.BasicProperties(
                delivery_mode=2,
                headers={'x-is-self-train': False}))

    source.crawled()
    self.logger.info('Successfully crawled \'%s\'!', source.name)
def test_news_item_publish_and_corpus_create_negative_publishes_newsitems_and_creates_negative_corpora_when_newsitems_in_query_set(
        self):
    """Check that the unpublish action unpublishes and adds a negative corpus.

    NOTE(review): despite "publish" in the test name, the action under
    test is ``news_item_unpublish_and_corpus_create_negative``.
    """
    news_item = NewsItem()
    news_item.title = 'foo'
    # Start from a published item so the unpublish step is really tested.
    news_item.published = True
    news_item.score = 1.00
    news_item.save()
    query_set = [news_item]

    self.newsitem.news_item_unpublish_and_corpus_create_negative(
        None, None, query_set)

    self.newsitem.enqueue_corpus_creation.assert_called_once()

    news_items = NewsItem.objects.all()
    self.assertEqual(1, len(news_items))
    self.assertFalse(news_items[0].published)

    corpus = Corpus.objects.filter(news_item=news_items[0])
    # filter() never returns None, so check that a corpus row exists.
    self.assertTrue(corpus.exists())
    self.assertFalse(corpus[0].positive)
def test_news_returns_http_response_with_template_and_positive_newsitems_when_positive_newsitems_exist(self):
    """Expect the news view to render a saved, published news item."""
    # create a positive news item
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.score = 1
    news_item.published = True
    news_item.save()

    # make request to news view
    response = self.get_news()

    # returns an instance of HttpResponse (exact type, not a subclass)
    self.assertIs(type(response), HttpResponse)

    # request didn't fail
    self.assertEqual(200, response.status_code)

    # response contains news items
    content = response.getvalue()
    self.assertIn('news-item', str(content))
def test_changelist_view_returns_metrics_when_unclassified_newsitems_exist_and_no_corpora(self):
    """Check changelist metrics for a single item with ``score = None``.

    The view is expected to count the item as unclassified, report zero
    for every classification and corpus counter, and return no accuracy
    rows.
    """
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.score = None
    news_item.added_at = '2018-12-03 21:00:00+00:00'
    news_item.save()

    superuser = self.create_superuser('superuser')
    request = self.mocked_authenticated_request(
        '/admin/rss/newsitemmetric/', superuser)
    response = self.admin.changelist_view(request)
    context = response.context_data

    self.assertEqual(1, context['news_items_count'])
    self.assertEqual(1, context['news_items_unclassified'])
    self.assertEqual(0, context['classification_initial']['positive'])
    self.assertEqual(0, context['classification_initial']['negative'])
    self.assertEqual(0, context['classification_supervised']['positive'])
    self.assertEqual(0, context['classification_supervised']['negative'])
    self.assertEqual(0, context['corpus_count']['positive'])
    self.assertEqual(0, context['corpus_count']['negative'])
    self.assertEqual([], context['accuracy'])
def test_changelist_view_returns_metrics_when_newsitems_exist_but_no_corpora_and_date_query_includes_newsitem(self):
    """Check changelist metrics when the date query matches the item.

    One item with score 1 is saved for 2018-12-03 and the request filters
    on month 12 of 2018. The view is expected to report the item as
    positive, no corpora and 100.0 accuracy for that day.
    """
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.score = 1
    news_item.added_at = '2018-12-03 21:00:00+00:00'
    news_item.save()

    superuser = self.create_superuser('superuser')
    request = self.mocked_authenticated_request(
        '/admin/rss/newsitemmetric/?added_at__month=12&added_at__year=2018',
        superuser)
    response = self.admin.changelist_view(request)
    context = response.context_data

    self.assertEqual(1, context['news_items_count'])
    self.assertEqual(0, context['news_items_unclassified'])
    self.assertEqual(1, context['classification_initial']['positive'])
    self.assertEqual(0, context['classification_initial']['negative'])
    self.assertEqual(1, context['classification_supervised']['positive'])
    self.assertEqual(0, context['classification_supervised']['negative'])
    self.assertEqual(0, context['corpus_count']['positive'])
    self.assertEqual(0, context['corpus_count']['negative'])
    self.assertEqual([{'accuracy': 100.0, 'added_at': '2018-12-03'}],
                     context['accuracy'])
def test_about_returns_http_response_with_stats(self):
    """Expect the about view to render source, news and corpus statistics."""
    # create a news item
    news_item = NewsItem()
    news_item.title = 'foo'
    news_item.score = 1
    news_item.published = False
    news_item.save()

    # create a source that keeps the model's default `active` value
    # (presumably active, given the expected "Sources crawled: 1")
    source = Source()
    source.name = 'foo'
    source.url = 'http://www.foo.com'
    source.homepage = 'http://www.foo.com'
    source.save()

    # create an inactive source
    source = Source()
    source.name = 'bar'
    source.url = 'http://www.bar.com'
    source.homepage = 'http://www.bar.com'
    source.active = False
    source.save()

    # make request to about view
    response = self.get_about()

    # returns an instance of HttpResponse (exact type, not a subclass)
    self.assertIs(type(response), HttpResponse)

    # request didn't fail
    self.assertEqual(200, response.status_code)

    # response contains the expected statistics
    content = str(response.getvalue())
    self.assertIn('Sources crawled</strong>: 1', content)
    self.assertIn('News classified</strong>: 1', content)
    self.assertIn('Corpora created</strong>: 0', content)
    self.assertIn('Classification accuracy</strong>: 100%', content)