Exemple #1
0
 def __init__(
     self,
     parent,
     cache,
     scrape_resolver,
     fake_feed_resolver,
     request_queue,
     fulfilled_feed_queue,
     fulfilled_scrape_queue,
 ):
     """Store the collaborators and queues and prepare the scraping state.

     The base initializer is called with ``self.serve_requests`` as its
     ``target`` (presumably a ``threading.Thread`` base -- the class header
     is not visible here). Every argument is kept on the instance under
     the same name. A ``requests.Session`` and a ``ScrapeJob`` built from
     that session and ``scrape_resolver`` are created, and ``daemon`` is
     set to True.
     """
     super().__init__(target=self.serve_requests)

     # Queues: one for incoming requests, two for fulfilled results.
     self.request_queue = request_queue
     self.fulfilled_feed_queue = fulfilled_feed_queue
     self.fulfilled_scrape_queue = fulfilled_scrape_queue

     # Collaborators handed in by the caller.
     self.parent = parent
     self.cache = cache
     self.scrape_resolver = scrape_resolver
     self.fake_feed_resolver = fake_feed_resolver

     # One HTTP session, shared with the scrape job built on top of it.
     session = requests.Session()
     self.session = session
     self.scrape_job = ScrapeJob(session, scrape_resolver)

     self.daemon = True
Exemple #2
0
class GathererWorkerThread(Thread):
    """Daemon worker thread that fulfils Feed and Item requests from a queue.

    Requests are taken from ``request_queue``. A request is processed only
    if its ``lock`` can be acquired without blocking; otherwise it is
    skipped. Gathered feeds are put on ``fulfilled_feed_queue`` and scraped
    items on ``fulfilled_scrape_queue``; after each put, ``parent`` is asked
    to emit a signal.
    """

    def __init__(
        self,
        parent,
        cache,
        scrape_resolver,
        fake_feed_resolver,
        request_queue,
        fulfilled_feed_queue,
        fulfilled_scrape_queue,
    ):
        """Store the collaborators and create the HTTP session and scrape job.

        Args:
            parent: Object whose ``emit(signal)`` is called to notify about
                fulfilled requests.
            cache: Object providing ``query(link)`` and ``put(link, article)``.
            scrape_resolver: Object providing ``select_rule(link, scrape_job)``,
                used to produce an item's article.
            fake_feed_resolver: Object providing
                ``select_rule(feed_name, scrape_job)``, used to produce the
                items of a fake feed.
            request_queue: Queue the worker reads requests from.
            fulfilled_feed_queue: Queue receiving feeds whose items were
                gathered.
            fulfilled_scrape_queue: Queue receiving items whose article was
                gathered.
        """
        super().__init__(target=self.serve_requests)
        self.parent = parent
        self.cache = cache
        self.scrape_resolver = scrape_resolver
        self.fake_feed_resolver = fake_feed_resolver
        self.request_queue = request_queue
        self.fulfilled_feed_queue = fulfilled_feed_queue
        self.fulfilled_scrape_queue = fulfilled_scrape_queue
        self.session = requests.Session()
        self.scrape_job = ScrapeJob(self.session, self.scrape_resolver)
        self.daemon = True

    def notify_main_thread(self, signal):
        """Emit ``signal`` on ``self.parent`` between GDK enter/leave calls.

        ``Gdk.threads_leave()`` runs in a ``finally`` clause so that an
        exception raised by ``emit`` cannot leave the GDK lock held.
        """
        Gdk.threads_enter()
        try:
            self.parent.emit(signal)
        finally:
            Gdk.threads_leave()

    def _fresh_scrape_job(self):
        """Clear the shared scrape job and return it for reuse."""
        self.scrape_job.clear()
        return self.scrape_job

    def serve_requests(self):
        """Thread target: loop forever, serving one queued request at a time.

        Raises:
            RuntimeError: If a dequeued request is neither a ``Feed`` nor an
                ``Item`` (the request's lock is released first).
        """
        while True:
            request = self.request_queue.get(block=True)

            # Non-blocking acquire, so "with lock:" cannot be used here.
            # A request whose lock is already held is skipped, not retried.
            if not request.lock.acquire(blocking=False):
                continue
            try:
                self._dispatch(request)
            finally:
                request.lock.release()

    def _dispatch(self, request):
        """Route one locked request to the handler matching its exact type."""
        request_type = type(request)

        # Exact type match on purpose: subclasses of Feed/Item are rejected,
        # exactly as before.
        if request_type is Feed:
            self._serve_feed(request)
        elif request_type is Item:
            self._serve_item(request)
        else:
            raise RuntimeError(
                f"Invalid request of type {request_type!s}"
                f" given to GathererWorkerThread the item was {request!s}"
            )

    def _serve_feed(self, feed):
        """Gather a feed's items and hand the feed over if it has any."""
        if feed.is_fake():
            self.gather_fake_feed(feed)
        else:
            self.gather_feed(feed)

        if not feed.items:
            return

        # Checks the cache for any item link before handing the feed over
        for item in feed.items:
            hit = self.cache.query(item.link)
            if hit:
                item.article = hit

        self.fulfilled_feed_queue.put(feed)
        self.notify_main_thread("feed_gathered_event")

    def _serve_item(self, item):
        """Gather the article of an item that has a link but no article yet."""
        if item.article or not item.link:
            return

        self.gather_item(item)
        if item.article:
            self.fulfilled_scrape_queue.put(item)
            self.notify_main_thread("item_scraped_event")

    def gather_item(self, item):
        """Fill ``item.article`` from the cache or, on a miss, by scraping.

        On a cache miss the article comes from ``scrape_resolver.select_rule``
        and is then written to the cache under ``item.link``.

        Returns:
            The same ``item`` object.
        """
        hit = self.cache.query(item.link)
        if hit:
            item.article = hit
            return item

        item.article = self.scrape_resolver.select_rule(item.link, self._fresh_scrape_job())
        # NOTE(review): the result is cached even when it is falsy -- confirm
        # that caching a failed scrape is intended.
        self.cache.put(item.link, item.article)
        return item

    def gather_fake_feed(self, feed):
        """Set ``feed.items`` to what the fake-feed resolver returns for ``feed.name``."""
        feed.items = self.fake_feed_resolver.select_rule(feed.name, self._fresh_scrape_job())

    def gather_feed(self, feed):
        """Given a feed, retrieve the items of the feed.

        ``feed.uri`` is parsed and one ``Item`` is built per entry.
        ``feed.items`` is always reassigned: it becomes an empty list when
        parsing yields nothing truthy. Entries with no title, description
        or link are skipped with a printed warning.
        """
        content = utilityFunctions.feedparser_parse(feed.uri)
        items = []
        if content:
            for entry in content["entries"]:
                # Missing keys read as "" instead of raising KeyError.
                fields = defaultdict(str, entry)

                # Fall back to the summary when there is no description.
                description = fields["description"] or fields["summary"]
                title = fields["title"]
                link = fields["link"]

                if not (title or description or link):
                    # f-string formatting also copes with a non-str feed.name.
                    print(
                        f"WARNING: An entry from the feed with label {feed.name!s}"
                        f" has no title, description, or link. Skipped.{entry!s}"
                    )
                    continue

                items.append(Item(feed.name, title, Gatherer.description_cleanup(description), link))

        feed.items = items