    @inlineCallbacks
    def test_parse_rss_feed(self):
        self.channel_rss.rss_url = os.path.join(TESTS_DATA_DIR, 'test_rss.xml')
        self.channel_rss._url_cache = SimpleCache(os.path.join(self.session_base_dir, 'cache.txt'))

        dl = self.channel_rss.parse_feed()
        self.assertIsInstance(dl, DeferredList)
        # yielding the DeferredList requires an inlineCallbacks-style test method
        yield dl
    def test_parse(self):
        parser = RSSFeedParser()
        cache = SimpleCache(os.path.join(self.session_base_dir, 'cache.txt'))
        for rss_item in parser.parse(os.path.join(TESTS_DATA_DIR, 'test_rss.xml'), cache):
            self.assertEqual(len(rss_item['thumbnail_list']), 1)
            self.assertEqual(rss_item['title'], "ubuntu-15.04-desktop-amd64.iso")
            self.assertEqual(rss_item['description'], '')
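# For reference, the shape of a parsed rss_item dict as the tests and the parser
# below use it; whether the real parser emits keys beyond these four is an
# assumption, and the example values are illustrative only.
#
# rss_item = {
#     u'title': u'ubuntu-15.04-desktop-amd64.iso',
#     u'description': u'',
#     u'torrent_url': u'http://localhost/ubuntu.torrent',
#     u'thumbnail_list': [u'http://localhost/ubuntu-thumb.png'],
# }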
    def test_parse_rss_feed(self):
        prepare_xml_rss(self.session_base_dir, 'test_rss.xml')

        self.channel_rss.rss_url = os.path.join(self.session_base_dir, 'test_rss.xml')
        self.channel_rss._url_cache = SimpleCache(os.path.join(self.session_base_dir, 'cache.txt'))

        dl = self.channel_rss.parse_feed()
        self.assertIsInstance(dl, DeferredList)
        return dl
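# prepare_xml_rss is not shown in these snippets. A minimal sketch of what it
# might look like, assuming the fixture is a plain RSS 2.0 file whose items
# (title, enclosure, media:thumbnail) match the assertions in the tests above;
# the content of the real test_rss.xml is an assumption.
def prepare_xml_rss(target_dir, filename):
    """Write a minimal two-item RSS file into target_dir (hypothetical helper)."""
    import os
    rss_body = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
  <channel>
    <title>Test feed</title>
    <item>
      <title>ubuntu-15.04-desktop-amd64.iso</title>
      <enclosure url="http://localhost/ubuntu.torrent" type="application/x-bittorrent"/>
      <media:thumbnail url="http://localhost/ubuntu-thumb.png"/>
    </item>
    <item>
      <title>another-item</title>
      <enclosure url="http://localhost/another.torrent" type="application/x-bittorrent"/>
      <media:thumbnail url="http://localhost/another-thumb.png"/>
    </item>
  </channel>
</rss>
"""
    with open(os.path.join(target_dir, filename), 'wb') as xml_file:
        xml_file.write(rss_body)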
    def test_parse(self):
        test_rss_file = os.path.join(TESTS_DATA_DIR, 'test_rss.xml')
        files_path = os.path.join(self.session_base_dir, 'files')
        os.mkdir(files_path)
        shutil.copyfile(test_rss_file, os.path.join(files_path, 'test_rss.xml'))

        file_server_port = get_random_port()
        self.setUpFileServer(file_server_port, files_path)

        parser = RSSFeedParser()
        cache = SimpleCache(os.path.join(self.session_base_dir, 'cache.txt'))
        # mark this torrent URL as already seen (interpolate the actual port,
        # rather than the literal RANDOMPORT placeholder)
        cache.add('http://localhost:%d/ubuntu.torrent' % file_server_port)

        def on_items(rss_items):
            self.assertEqual(len(rss_items), 2)
            self.assertEqual(len(rss_items[0]['thumbnail_list']), 1)

        return parser.parse('http://localhost:%d/test_rss.xml' % file_server_port,
                            cache).addCallback(on_items)
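# setUpFileServer comes from the test base class and is not shown here. A minimal
# sketch of what such a helper could look like with twisted.web, serving a
# directory of static files on the given port; the method name and the
# `file_server` attribute are assumptions.
def setUpFileServer(self, port, path):
    """Serve `path` as static files on localhost:port (hypothetical sketch)."""
    from twisted.internet import reactor
    from twisted.web.server import Site
    from twisted.web.static import File
    resource = File(path)  # static file resource rooted at `path`
    self.file_server = reactor.listenTCP(port, Site(resource))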
    def test_parse_rss_feed(self):
        """
        Test parsing an RSS feed
        """
        self.channel_rss.rss_url = 'http://localhost:%d/test_rss.xml' % self.file_server_port
        self.channel_rss._url_cache = SimpleCache(os.path.join(self.session_base_dir, 'cache.txt'))

        def verify_rss(items):
            self.assertEqual(len(items), 2)

        return self.channel_rss.parse_feed().addCallback(verify_rss)
    def test_parse_feed_stopped(self):
        """
        Test that items are no longer parsed once the feed parser has been stopped
        """
        self.channel_rss.rss_url = 'http://localhost:%d/test_rss.xml' % self.file_server_port
        self.channel_rss._url_cache = SimpleCache(os.path.join(self.session_base_dir, 'cache.txt'))
        self.channel_rss._to_stop = True

        def verify_rss(items):
            self.assertEqual(len(items), 0)

        return self.channel_rss.parse_feed().addCallback(verify_rss)
import hashlib
import json
import logging
import os
import time
from binascii import hexlify

from twisted.internet import reactor
from twisted.internet.defer import DeferredList, succeed
from twisted.web.client import getPage

# Tribler-internal names used below (TaskManager, SimpleCache, RSSFeedParser,
# TorrentDef, blocking_call_on_reactor_thread, the SIGNAL_* constants, and
# DEFAULT_CHECK_INTERVAL) come from project modules that are not part of this snippet.


class ChannelRssParser(TaskManager):

    def __init__(self, session, channel_community, rss_url, check_interval=DEFAULT_CHECK_INTERVAL):
        super(ChannelRssParser, self).__init__()
        self._logger = logging.getLogger(self.__class__.__name__)

        self.session = session
        self.channel_community = channel_community
        self.rss_url = rss_url
        self.check_interval = check_interval

        self._url_cache = None
        self._pending_metadata_requests = {}

        self._to_stop = False
        self.running = False

    @blocking_call_on_reactor_thread
    def initialize(self):
        # initialize the URL cache,
        # using the SHA1 of channel cid + rss_url as the cache key
        cache_key = hashlib.sha1(self.channel_community.cid)
        cache_key.update(self.rss_url)
        cache_key_str = hexlify(cache_key.digest())
        self._logger.debug(u"using key %s for channel %s, rss %s",
                           cache_key_str, hexlify(self.channel_community.cid), self.rss_url)

        url_cache_name = u"rss_cache_%s.txt" % cache_key_str
        url_cache_path = os.path.join(self.session.config.get_state_dir(), url_cache_name)
        self._url_cache = SimpleCache(url_cache_path)
        self._url_cache.load()

        # schedule the scraping task
        self.register_task(u"rss_scrape", reactor.callLater(2, self._task_scrape))

        # subscribe to channel torrent creation
        self.session.notifier.add_observer(self.on_channel_torrent_created, SIGNAL_CHANNEL_COMMUNITY,
                                           [SIGNAL_ON_TORRENT_UPDATED],
                                           self.channel_community.get_channel_id())

        # notify that an RSS feed has been created
        rss_feed_data = {u'channel': self.channel_community,
                         u'rss_feed_url': self.rss_url}
        self.session.notifier.notify(SIGNAL_RSS_FEED, SIGNAL_ON_UPDATED, None, rss_feed_data)

        self.running = True

    @blocking_call_on_reactor_thread
    def shutdown(self):
        self._to_stop = True
        self.cancel_all_pending_tasks()

        self._url_cache.save()
        self._url_cache = None

        self.channel_community = None
        self.session = None

        self.running = False

    def parse_feed(self):
        rss_parser = RSSFeedParser()

        def on_rss_items(rss_items):
            if not rss_items:
                self._logger.warning(u"No RSS items found.")
                return succeed(None)

            def_list = []
            for rss_item in rss_items:
                if self._to_stop:
                    continue

                torrent_url = rss_item[u'torrent_url'].encode('utf-8')
                if torrent_url.startswith('magnet:'):
                    self._logger.warning(u"Tribler does not support adding magnet links"
                                         u" to a channel from an RSS feed.")
                    continue

                torrent_deferred = getPage(torrent_url)
                torrent_deferred.addCallbacks(lambda t, r=rss_item: self.on_got_torrent(t, rss_item=r),
                                              self.on_got_torrent_error)
                def_list.append(torrent_deferred)

            return DeferredList(def_list, consumeErrors=True)

        return rss_parser.parse(self.rss_url, self._url_cache).addCallback(on_rss_items)

    def _task_scrape(self):
        deferred = self.parse_feed()

        if not self._to_stop:
            # schedule the next scraping task
            self._logger.info(u"Finished scraping %s, scheduling next scrape after %s",
                              self.rss_url, self.check_interval)
            self.register_task(u'rss_scrape',
                               reactor.callLater(self.check_interval, self._task_scrape))

        return deferred

    def on_got_torrent(self, torrent_data, rss_item=None):
        if self._to_stop:
            return

        # save the torrent
        tdef = TorrentDef.load_from_memory(torrent_data)
        self.session.lm.rtorrent_handler.save_torrent(tdef)

        # add a pending metadata request
        info_hash = tdef.get_infohash()
        if u'thumbnail_list' in rss_item and rss_item[u'thumbnail_list']:
            # only use the first thumbnail
            rss_item[u'thumbnail_url'] = rss_item[u'thumbnail_list'][0]
            if info_hash not in self._pending_metadata_requests:
                self._pending_metadata_requests[info_hash] = rss_item

        # create the channel torrent
        self.channel_community._disp_create_torrent_from_torrentdef(tdef, long(time.time()))

        # update the URL cache
        self._url_cache.add(rss_item[u'torrent_url'])
        self._url_cache.save()

        self._logger.info(u"Channel torrent %s created", tdef.get_name_as_unicode())

    def on_got_torrent_error(self, failure):
        """
        This callback is invoked when fetching a torrent from the feed fails.
        """
        self._logger.warning(u"Failed to fetch torrent info from RSS feed: %s", failure)

    def on_channel_torrent_created(self, subject, events, object_id, data_list):
        if self._to_stop:
            return

        for data in data_list:
            if data[u'info_hash'] in self._pending_metadata_requests:
                rss_item = self._pending_metadata_requests.pop(data[u'info_hash'])
                rss_item[u'info_hash'] = data[u'info_hash']
                rss_item[u'channel_torrent_id'] = data[u'channel_torrent_id']

                metadata_deferred = getPage(rss_item[u'thumbnail_url'].encode('utf-8'))
                metadata_deferred.addCallback(lambda md, r=rss_item: self.on_got_metadata(md, rss_item=r))

    def on_got_metadata(self, metadata_data, rss_item=None):
        # save the metadata
        thumb_hash = hashlib.sha1(metadata_data).digest()
        self.session.lm.rtorrent_handler.save_metadata(thumb_hash, metadata_data)

        # create a modification message for the channel
        modifications = {u'metadata-json': json.dumps({u'title': rss_item['title'][:64],
                                                       u'description': rss_item['description'][:768],
                                                       u'thumb_hash': thumb_hash.encode('hex')})}
        self.channel_community.modifyTorrent(rss_item[u'channel_torrent_id'], modifications)
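# A minimal usage sketch, assuming a fully initialized Tribler `session` and a
# loaded `channel_community`; both objects and the feed URL are stand-ins, not
# part of this snippet.
def start_rss_feed(session, channel_community):
    rss_parser = ChannelRssParser(session, channel_community,
                                  u'http://example.org/feed.xml',  # hypothetical feed URL
                                  check_interval=DEFAULT_CHECK_INTERVAL)
    rss_parser.initialize()  # loads the URL cache and schedules the first scrape
    return rss_parser

# later, e.g. on session shutdown:
#   rss_parser.shutdown()   # stops scraping and persists the URL cache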
    def test_parse_feed_stopped(self):
        self.channel_rss.rss_url = os.path.join(TESTS_DATA_DIR, 'test_rss.xml')
        self.channel_rss._url_cache = SimpleCache(os.path.join(self.session_base_dir, 'cache.txt'))
        self.channel_rss._to_stop = True

        self.assertIsNone(self.channel_rss.parse_feed())
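# SimpleCache itself is not shown in these snippets. A minimal sketch of the
# interface the code above relies on (load/save/add plus a membership check);
# the method name `has` and the one-URL-per-line on-disk format are assumptions.
import os


class SimpleCache(object):
    """A tiny persistent set of URL strings (hypothetical sketch)."""

    def __init__(self, file_path):
        self._file_path = file_path
        self._items = set()

    def load(self):
        # a missing cache file simply means an empty cache
        if os.path.exists(self._file_path):
            with open(self._file_path, 'r') as cache_file:
                self._items = set(line.strip() for line in cache_file if line.strip())

    def save(self):
        with open(self._file_path, 'w') as cache_file:
            cache_file.write('\n'.join(sorted(self._items)))

    def add(self, url):
        self._items.add(url)

    def has(self, url):
        return url in self._items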