import requests

from collections import defaultdict
from threading import Thread

import gi

gi.require_version("Gdk", "3.0")
from gi.repository import Gdk

# Feed, Item, ScrapeJob, Gatherer, and utilityFunctions are project-local
# names assumed to be defined elsewhere in this module or imported from
# sibling modules.


class GathererWorkerThread(Thread):
    """Daemon thread that services Feed and Item requests from a queue."""

    def __init__(
        self,
        parent,
        cache,
        scrape_resolver,
        fake_feed_resolver,
        request_queue,
        fulfilled_feed_queue,
        fulfilled_scrape_queue,
    ):
        super().__init__(target=self.serve_requests)
        self.parent = parent
        self.cache = cache
        self.scrape_resolver = scrape_resolver
        self.fake_feed_resolver = fake_feed_resolver
        self.request_queue = request_queue
        self.fulfilled_feed_queue = fulfilled_feed_queue
        self.fulfilled_scrape_queue = fulfilled_scrape_queue
        self.session = requests.Session()
        self.scrape_job = ScrapeJob(self.session, self.scrape_resolver)
        self.daemon = True

    def notify_main_thread(self, signal):
        """Emit a signal on the parent while holding the GDK lock so GTK-side
        handlers run safely. (Gdk.threads_enter/leave are deprecated in GTK 3
        but still function.)"""
        Gdk.threads_enter()
        self.parent.emit(signal)
        Gdk.threads_leave()

    def _fresh_scrape_job(self):
        """Reset and reuse the shared ScrapeJob rather than allocating a new one."""
        self.scrape_job.clear()
        return self.scrape_job

    def serve_requests(self):
        """Main loop: pull Feed/Item requests off the queue and fulfill them."""
        while True:
            request = self.request_queue.get(block=True)
            if request.lock.acquire(blocking=False):
                # Can't use the cleaner "with lock:" syntax due to non-blocking
                try:
                    request_type = type(request)
                    if request_type == Feed:
                        feed = request
                        if feed.is_fake():
                            self.gather_fake_feed(feed)
                        else:
                            self.gather_feed(feed)
                        if feed.items:
                            # Check the cache for each item's link before handing the feed over
                            for item in feed.items:
                                hit = self.cache.query(item.link)
                                if hit:
                                    item.article = hit
                            self.fulfilled_feed_queue.put(feed)
                            self.notify_main_thread("feed_gathered_event")
                    elif request_type == Item:
                        item = request
                        if not item.article and item.link:
                            self.gather_item(item)
                        if item.article:
                            self.fulfilled_scrape_queue.put(item)
                            self.notify_main_thread("item_scraped_event")
                    else:
                        raise RuntimeError(
                            "Invalid request of type "
                            + str(request_type)
                            + " given to GathererWorkerThread; the request was "
                            + str(request)
                        )
                finally:
                    request.lock.release()

    def gather_item(self, item):
        """Fill in item.article, preferring the cache over a fresh scrape."""
        hit = self.cache.query(item.link)
        if hit:
            item.article = hit
        else:
            item.article = self.scrape_resolver.select_rule(
                item.link, self._fresh_scrape_job()
            )
            self.cache.put(item.link, item.article)
        return item

    def gather_fake_feed(self, feed):
        """Populate a fake feed (one without a real feed URI) via its resolver rule."""
        feed.items = self.fake_feed_resolver.select_rule(
            feed.name, self._fresh_scrape_job()
        )

    def gather_feed(self, feed):
        """Given a feed, retrieve its items."""
        content = utilityFunctions.feedparser_parse(feed.uri)
        items = []
        if content:
            for entry in content["entries"]:
                # defaultdict(str, ...) makes missing keys read as empty strings
                d = defaultdict(str, entry)
                description = d["description"] or d["summary"]
                title = d["title"]
                link = d["link"]
                if not title and not description and not link:
                    print(
                        "WARNING: An entry from the feed with label "
                        + feed.name
                        + " has no title, description, or link. Skipped: "
                        + str(entry)
                    )
                else:
                    items.append(
                        Item(
                            feed.name,
                            title,
                            Gatherer.description_cleanup(description),
                            link,
                        )
                    )
        feed.items = items
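

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal example of wiring
# up a GathererWorkerThread. StubCache, StubResolver, and StubParent are
# hypothetical stand-ins for the project's real cache, resolvers, and GObject
# parent. It assumes the project's Item and ScrapeJob classes and GTK's Gdk
# are available; only Item's constructor signature is taken from gather_feed
# above, and .article/.lock are set defensively in case the real class does
# not initialize them.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import threading
    import time
    from queue import Queue

    class StubCache:
        """Dict-backed stand-in for the article cache."""

        def __init__(self):
            self._store = {}

        def query(self, key):
            return self._store.get(key)

        def put(self, key, value):
            self._store[key] = value

    class StubResolver:
        """Stand-in resolver: select_rule(key, job) returns a fake article."""

        def select_rule(self, key, job):
            return "article scraped from " + str(key)

    class StubParent:
        """Stand-in for the GObject parent; the real one emits GTK signals."""

        def emit(self, signal):
            print("signal emitted:", signal)

    request_queue = Queue()
    worker = GathererWorkerThread(
        parent=StubParent(),
        cache=StubCache(),
        scrape_resolver=StubResolver(),
        fake_feed_resolver=StubResolver(),
        request_queue=request_queue,
        fulfilled_feed_queue=Queue(),
        fulfilled_scrape_queue=Queue(),
    )
    worker.start()

    item = Item("example feed", "a title", "a description", "http://example.com/a")
    item.article = None           # ensure the scrape path is taken
    item.lock = threading.Lock()  # serve_requests acquires this non-blockingly
    request_queue.put(item)
    time.sleep(1)  # give the daemon worker a moment before the process exits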