def test_to_dict_returns_one_dictionary_when_cursor_has_one_result(self):
    source = Source()
    source.id = 1
    source.name = 'foo_source'
    source.save()

    news_item = NewsItem()
    news_item.id = 1
    news_item.title = 'foo'
    news_item.description = 'bar'
    news_item.url = 'https://www.google.com'
    news_item.added_at = '2018-11-23 01:00:00+00:00'
    news_item.source = source
    news_item.published = False
    news_item.score = 1
    news_item.save()

    with connection.cursor() as cursor:
        cursor.execute('SELECT * FROM news_item')
        result = self.newsitem_metric.to_dict(cursor)

        self.assertEqual(1, len(result))
        self.assertEqual([{
            'id': 1,
            'title': 'foo',
            'description': 'bar',
            'url': 'https://www.google.com',
            'added_at': datetime(2018, 11, 23, 1, 0),
            'source_id': 1,
            'published': False,
            'score': 1
        }], result)
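# A minimal companion sketch, not part of the original suite: it assumes
# to_dict() returns an empty list when the cursor yields no rows. The table
# name 'news_item' and self.newsitem_metric are taken from the test above;
# the empty-list behaviour itself is an assumption.
def test_to_dict_returns_empty_list_when_cursor_has_no_results(self):
    with connection.cursor() as cursor:
        cursor.execute('SELECT * FROM news_item')
        result = self.newsitem_metric.to_dict(cursor)

        self.assertEqual([], result)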
def crawl(self, source, channel):
    source.crawling()
    self.logger.info('Crawling \'%s\'...', source.name)

    try:
        feedparser.USER_AGENT = settings.RSS_CRAWL_USER_AGENT
        feed = feedparser.parse(source.url)
    except RuntimeError:
        self.logger.error('Could not crawl \'%s\'.', source.name)
        return

    for entry in feed['entries']:
        # Fall back to the 'updated' timestamp, or the crawl time, when the
        # entry carries no explicit publication date.
        if 'published' not in entry:
            if 'updated' in entry:
                entry['published'] = entry['updated']
            else:
                entry['published'] = timezone.now().isoformat()

        # Skip entries that were already stored by a previous crawl.
        if NewsItem.exists(entry['title'], parse(entry['published']), source):
            continue

        description = entry['summary'] if 'summary' in entry else entry['title']

        news_item = NewsItem()
        news_item.title = entry['title']
        news_item.description = description
        news_item.url = entry['link']
        news_item.source = source
        news_item.score = None
        news_item.added_at = parse(entry['published'])
        news_item.save()

        # Publish the new item to the classification queue as a persistent
        # message (delivery_mode=2).
        body = serializers.serialize('json', [news_item])
        channel.basic_publish(
            exchange='',
            routing_key=settings.QUEUE_NAME_CLASSIFY,
            body=body,
            properties=pika.BasicProperties(
                delivery_mode=2,
                headers={'x-is-self-train': False}))

    source.crawled()
    self.logger.info('Successfully crawled \'%s\'!', source.name)
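# A minimal usage sketch, not taken from the project: it assumes the class
# defining crawl() above is named Crawler and can be constructed without
# arguments, that sources are ordinary Django model instances reachable via
# Source.objects.all(), and that RabbitMQ runs on localhost. Only
# settings.QUEUE_NAME_CLASSIFY is known from the code above; the rest is
# illustrative.
def crawl_all_sources():
    connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
    channel = connection.channel()
    # Declare the queue as durable so persistent messages (delivery_mode=2)
    # survive a broker restart.
    channel.queue_declare(queue=settings.QUEUE_NAME_CLASSIFY, durable=True)

    crawler = Crawler()
    try:
        for source in Source.objects.all():
            crawler.crawl(source, channel)
    finally:
        connection.close()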