Example #1
    def test_to_dict_returns_one_dictionary_when_cursor_has_one_result(self):
        source = Source()
        source.id = 1
        source.name = 'foo_source'
        source.save()

        news_item = NewsItem()
        news_item.id = 1
        news_item.title = 'foo'
        news_item.description = 'bar'
        news_item.url = 'https://www.google.com'
        news_item.added_at = '2018-11-23 01:00:00+00:00'
        news_item.source = source
        news_item.published = False
        news_item.score = 1
        news_item.save()

        with connection.cursor() as cursor:
            cursor.execute('SELECT * FROM news_item')
            result = self.newsitem_metric.to_dict(cursor)

        self.assertEqual(1, len(result))
        self.assertEqual([{
            'id': 1,
            'title': 'foo',
            'description': 'bar',
            'url': 'https://www.google.com',
            'added_at': datetime(2018, 11, 23, 1, 0),
            'source_id': 1,
            'published': False,
            'score': 1
        }], result)
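
The method under test, `newsitem_metric.to_dict(cursor)`, is not shown in the example. A minimal sketch of what it could look like, assuming it follows the common Django pattern of zipping each fetched row with the column names from `cursor.description`, is:

    def to_dict(self, cursor):
        # Sketch only: pair every fetched row with the column names reported
        # by the cursor so each result row becomes a plain dictionary.
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]

Any implementation along these lines returns one dictionary per row, which is exactly what the assertions above check for.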
Example #2
    def crawl(self, source, channel):
        source.crawling()

        self.logger.info("Crawling '%s'...", source.name)

        try:
            feedparser.USER_AGENT = settings.RSS_CRAWL_USER_AGENT
            feed = feedparser.parse(source.url)
        except RuntimeError:
            self.logger.error("Could not crawl '%s'.", source.name)
            return

        for entry in feed['entries']:
            # Fall back to 'updated', or to the current time, when the feed
            # does not provide a 'published' date.
            if 'published' not in entry:
                entry['published'] = entry.get('updated',
                                               timezone.now().isoformat())

            if NewsItem.exists(entry['title'], parse(entry['published']),
                               source):
                continue

            # Some feeds omit 'summary'; use the title as the description then.
            description = entry.get('summary', entry['title'])

            news_item = NewsItem()
            news_item.title = entry['title']
            news_item.description = description
            news_item.url = entry['link']
            news_item.source = source
            news_item.score = None
            news_item.added_at = parse(entry['published'])
            news_item.save()

            body = serializers.serialize('json', [news_item])
            channel.basic_publish(exchange='',
                                  routing_key=settings.QUEUE_NAME_CLASSIFY,
                                  body=body,
                                  properties=pika.BasicProperties(
                                      delivery_mode=2,
                                      headers={'x-is-self-train': False}))

        source.crawled()

        self.logger.info("Successfully crawled '%s'!", source.name)
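
Two details of this example are worth noting. The duplicate check `NewsItem.exists(...)` is used but not defined in the snippet; a plausible sketch, assuming it is a simple ORM classmethod keyed on title, publication time, and source, is:

    @classmethod
    def exists(cls, title, added_at, source):
        # Sketch only: treat (title, added_at, source) as the natural key of a
        # feed entry and ask the ORM whether it has already been stored.
        return cls.objects.filter(title=title,
                                  added_at=added_at,
                                  source=source).exists()

Second, `delivery_mode=2` in the `pika.BasicProperties` marks the published message as persistent, so queued classification requests survive a broker restart; the `x-is-self-train` header presumably lets the consumer tell freshly crawled items apart from self-training traffic.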