Ejemplo n.º 1
0
    async def put(self, proxy, prepopulation=False):
        """
        Register a proxy with the Confirmed Queue and enqueue it.

        The membership checks, the bookkeeping writes and the delegated
        ``put()`` of the parent class all run while ``self.lock`` is held.

        [x] TODO:
        --------
        Depending on how the Confirmed Queue should treat proxies that are
        already queued, this might be able to move to the base class.

        [x] Note:
        ---------
        Proxies from the Confirmed Queue are in use by several consumers at
        once, and a confirmed proxy tends to keep producing successful
        responses, so it is likely that a proxy is already queued when it is
        put again.  That is why membership is checked before inserting.

        Raises:
            ProxyPoolError: if ``self.contains(proxy)`` reports the proxy as
                already queued, or if its id already has an entry in
                ``self.times_used``.
        """
        async with self.lock:
            # Confirmed proxies are not taken out of the queue when they are
            # handed out, so hitting this may be common.
            already_queued = await self.contains(proxy)
            if already_queued:
                raise ProxyPoolError(f'Cannot Add Proxy to {self.__NAME__}')

            proxy_id = proxy.id
            if proxy_id in self.times_used:
                raise ProxyPoolError('Did not expect proxy to be in count.')

            # Seed the bookkeeping first so that _get_proxy() can find the
            # proxy once it is queued.
            self.times_used[proxy_id] = 0
            self.mapped[proxy_id] = proxy

            parent = super(ConfirmedQueue, self)
            await parent.put(proxy, prepopulation=prepopulation)
Ejemplo n.º 2
0
    async def start(self, limit=None, confirmed=False):
        """
        Start the manager, then either collect proxies or flag the start.

        The parent class' ``start()`` is awaited first with the same
        ``limit`` and ``confirmed`` arguments.  Afterwards:

            - If ``settings.proxies.pool.collect`` is truthy, collection is
              started through ``self.pool.collect()``.
            - Otherwise ``self.start_event`` is set here and a debug message
              is logged.

        Raises:
            ProxyPoolError: if collection is disabled and the start event is
                already set.

        [x] TODO:
        ---------
        The relationship between prepopulation and collection will eventually
        need to be more dynamic, with collection triggering when the pool is
        running low on proxies.
        """
        parent = super(BrokeredProxyManager, self)
        await parent.start(limit=limit, confirmed=confirmed)

        if settings.proxies.pool.collect:
            # The pool sets the start event itself once it begins collecting.
            await self.pool.collect()
            return

        start_event = self.start_event
        if start_event.is_set():
            raise ProxyPoolError('Start Event Already Set')

        start_event.set()
        self.log.debug(
            'Setting Start Event',
            extra={'other': 'Proxy Pool Prepopulated'},
        )
Ejemplo n.º 3
0
 async def raise_if_missing(self, proxy):
     """
     Sanity check: the proxy is expected to be in this queue.

     Most likely a temporary utility.

     Raises:
         ProxyPoolError: if ``self.contains(proxy)`` reports the proxy as
             absent.
     """
     is_present = await self.contains(proxy)
     if is_present:
         return
     raise ProxyPoolError(f'Expected Proxy to be in {self.__NAME__}')
Ejemplo n.º 4
0
 async def raise_if_present(self, proxy):
     """
     Sanity check: the proxy is expected NOT to be in this queue.

     Most likely a temporary utility.

     Raises:
         ProxyPoolError: if ``self.contains(proxy)`` reports the proxy as
             present.
     """
     is_present = await self.contains(proxy)
     if not is_present:
         return
     raise ProxyPoolError(
         f'Did Not Expect Proxy to be in {self.__NAME__}')
Ejemplo n.º 5
0
    def raise_for_queue(self, proxy):
        """
        Check that the proxy is allowed to be in this queue.

        For the Confirmed Queue this check is needed on both get() and put(),
        because get() does not completely remove proxies from the queue.

        Raises:
            ProxyPoolError: if ``proxy.confirmed()`` is falsy.
        """
        if proxy.confirmed():
            return
        raise ProxyPoolError(f"Found Unconfirmed Proxy in {self.__NAME__}")
Ejemplo n.º 6
0
    async def _get_proxy(self):
        """
        Return the least-used tracked proxy and record the hand-out.

        The id with the lowest count in ``self.times_used`` is looked up in
        ``self.mapped``.  When this hand-out is the last one allowed (the
        count is ``MAX_CONFIRMED_PROXIES - 1``) the proxy is additionally
        removed through ``self.remove()``.  Everything runs while holding
        ``self.lock``.

        Returns:
            The proxy whose use count is currently the lowest.

        Raises:
            QueueEmpty: if ``self.times_used`` has no entries.
            ProxyPoolError: if the lowest count has already reached
                ``MAX_CONFIRMED_PROXIES``.

        [x] TODO:
        --------
        There may be a smarter way to do this that staggers retrieval of the
        same proxy with asyncio.sleep(), based on how many times it has
        already been pulled out.
        """
        async with self.lock:
            by_usage = self.times_used.most_common()
            if not by_usage:
                raise QueueEmpty(self)

            # most_common() lists entries from highest to lowest count, so
            # the last pair is the least-used proxy (presumably times_used is
            # a collections.Counter -- confirm).
            proxy_id, count = by_usage[-1]
            proxy = self.mapped[proxy_id]

            # As proxies are confirmed and put back in, this might cause
            # issues, since there may wind up being more proxies than the
            # hand-out limit.  The count will have to be decremented when a
            # proxy is put back in.
            if count >= MAX_CONFIRMED_PROXIES:
                raise ProxyPoolError('Count %s exceeds %s.' %
                                     (count, MAX_CONFIRMED_PROXIES))

            is_last_use = count == MAX_CONFIRMED_PROXIES - 1
            if is_last_use:
                # NOTE(review): remove() is awaited while self.lock is held.
                # If remove() acquires the same lock and that lock is not
                # re-entrant, this call never returns -- confirm.
                await self.remove(proxy)

            if settings.logging.log_proxy_queue:
                # Report the count read before any removal; looking it up
                # again after remove() may yield 0 instead of the real value.
                if is_last_use:
                    message = f'Returning & Removing Proxy from {self.__NAME__}'
                    times_used = f"{count} (Last Allowed)"
                else:
                    message = f'Returning Proxy from {self.__NAME__}'
                    times_used = count

                self.log.debug(message, extra={
                    'data': {
                        'Times Used': times_used,
                        f'{self.__NAME__} Size': self.qsize(),
                    },
                    'proxy': proxy,
                })

            # Only record the use while the proxy is still tracked.  If
            # remove() dropped its counter entry, incrementing here would
            # re-create the entry, leaving a stale id that a later put() or
            # _get_proxy() would trip over.
            if proxy.id in self.times_used:
                self.times_used[proxy.id] += 1
            return proxy
Ejemplo n.º 7
0
    def raise_for_queue(self, proxy):
        """
        Check that the proxy is allowed to be in this queue.

        For the Hold Queue this check is not needed on get(), because the
        proxy is removed in put().  The Confirmed Queue differs: the proxy
        stays in the queue there, so it has to be validated on get() as well.

        Only the most recent active request is inspected.  A proxy can count
        as confirmed over the horizon even when its latest error is a timeout,
        so ``proxy.confirmed_over_threshold_in_horizon()`` cannot be used as
        the rejection criterion here.

        Raises:
            ProxyPoolError: if the most recent request was confirmed, or if it
                was not a timeout error.
        """
        latest = proxy.last_request(active=True)

        # Most recent request confirmed -> proxy belongs in the Confirmed
        # Queue, never here.
        if latest.confirmed:
            raise ProxyPoolError(f"Found Confirmed Proxy in {self.__NAME__}")

        if latest.was_timeout_error:
            return

        raise ProxyPoolError(
            f"Found Non Holdable Proxy in {self.__NAME__}")
Ejemplo n.º 8
0
    async def put_from_pool(self, proxy):
        """
        Route a proxy that came out of the General Pool, based on the newest
        request in its history.

        The proxy goes to exactly one place:
            (1) The Confirmed Queue, if the last request was confirmed.
            (2) The Hold Queue, if the last request was a timeout error.
            (3) Back through the parent class' put() otherwise.

        Raises:
            ProxyPoolError: if the last request was not confirmed but
                ``proxy.confirmed()`` is truthy.  Whatever
                ``self.confirmed.raise_if_present()`` raises also propagates.

        [x] TODO:
        --------
        Since proxies are not returned from the Confirmed Queue or Hold Queue
        to the General Pool, should proxies that error after being taken out
        of the General Pool be discarded?

        The evaluation decides whether the proxy should stay in the General
        Pool, which is a little counter-intuitive, because the same evaluation
        is not applied when a proxy fails in the Confirmed Queue or Hold
        Queue.

        [x] TODO: Race Condition
        --------
        The Hold Queue check below only warns.  A hit means that a proxy
        coming from the pool is already in the Hold Queue - did another
        consumer already put it there, or was it never fully removed?
        """
        await self.hold.warn_if_present(proxy)
        await self.confirmed.raise_if_present(proxy)

        latest = proxy.requests(-1, active=True)

        if latest.confirmed:
            await self.confirmed.put(proxy)
            return

        # [x] NOTE: There really should not be any confirmed proxies in the
        # general pool unless the immediate last request was confirmed.  Once
        # confirmed proxies leave the general pool, they stay out.
        if proxy.confirmed():
            raise ProxyPoolError(
                f"Should Not be Confirmed Proxy in {self.pool.__NAME__}")

        if latest.was_timeout_error:
            # safe_put() is used because of the typical race conditions with
            # the Hold Queue.
            await self.hold.safe_put(proxy)
            return

        await super(SmartProxyManager, self).put(proxy)
Ejemplo n.º 9
0
 async def remove(self, proxy):
     """
     Remove the proxy from the underlying queue while holding ``self.lock``.

     [x] Note:
     ---------
     Proxies in the ConfirmedQueue are used by several consumers at the same
     time (not the case for HeldQueue), so a proxy may already have been
     removed from the ConfirmedQueue by the time another consumer decides it
     should be removed.

     Raises:
         ProxyPoolError: if ``self.contains(proxy)`` reports the proxy as
             absent.
     """
     async with self.lock:
         if await self.contains(proxy):
             self._queue.remove(proxy)
             return
         raise ProxyPoolError(
             f'Cannot Remove Proxy from {self.__NAME__}')
Ejemplo n.º 10
0
    async def put(self, proxy):
        """
        Enqueue a proxy in the Hold Queue, refusing duplicates.

        The membership check and the delegated ``put()`` of the parent class
        both run while ``self.lock`` is held.

        [x] TODO:
        --------
        Depending on how the Confirmed Queue should treat proxies that are
        already queued, this might be able to move to the base class.

        [x] Note:
        ---------
        Proxies in the Hold Queue are not used by several consumers at once,
        so when one consumer decides a proxy belongs in the Hold Queue it
        should not already be there.

        Raises:
            ProxyPoolError: if ``self.contains(proxy)`` reports the proxy as
                already queued.
        """
        async with self.lock:
            is_duplicate = await self.contains(proxy)
            if is_duplicate:
                raise ProxyPoolError(f'Cannot Add Proxy to {self.__NAME__}')

            parent = super(HoldQueue, self)
            await parent.put(proxy)
Ejemplo n.º 11
0
    async def remove(self, proxy):
        """
        Remove the proxy from the queue and drop its bookkeeping entries.

        While holding ``self.lock``, the proxy is located in ``self._queue``,
        deleted by position through ``self._delete_nth()``, and its id is then
        deleted from both ``self.times_used`` and ``self.mapped``.

        [x] Note:
        ---------
        Proxies in the ConfirmedQueue are used by several consumers at the
        same time (not the case for HeldQueue), so a proxy may already have
        been removed from the ConfirmedQueue by the time another consumer
        decides it should be removed.

        Raises:
            ProxyPoolError: if the proxy is not in ``self._queue``.
        """
        async with self.lock:
            queue = self._queue
            if proxy not in queue:
                raise ProxyPoolError(
                    f'Cannot Remove Proxy from {self.__NAME__}',
                    extra={'proxy': proxy})

            position = queue.index(proxy)
            self._delete_nth(position)

            proxy_id = proxy.id
            del self.times_used[proxy_id]
            del self.mapped[proxy_id]
Ejemplo n.º 12
0
    async def put_from_hold(self, proxy):
        """
        Route a proxy that was taken from the Hold Queue, based on the newest
        request in its history.

        Outcomes:
            - Last request confirmed: the proxy is put in the Confirmed Queue.
            - Last request timed out with the same error as the request
              before it: the proxy's timeout for that error is incremented.
              If that raises ProxyMaxTimeoutError, the error is logged, the
              timeout is reset and the proxy is discarded; otherwise the
              proxy goes back to the Hold Queue.
            - Last request timed out with a different error: the proxy goes
              back to the Hold Queue.
            - Anything else: the proxy is discarded.

        [x] NOTE:
        ---------
        Since the proxy was in the Hold Queue, the second to last request
        should be a timeout error, otherwise it would not have been sent to
        the Hold Queue to begin with.  A violation is only logged as a
        warning for now (possibly a race condition; cause not yet found).

        [x] NOTE:
        ---------
        Proxies in the manager only move up, not down (with the exception of
        a proxy moving from confirmed to held).
            - If a proxy is in the Hold Queue and times out, but then returns
              a confirmed request, it moves back up to the Confirmed Queue.
            - If a proxy is in the Hold Queue and returns an error, or times
              out, it is discarded rather than moved back to the General Pool.

        [x] TODO:
        --------
        Reinstate the `raise_if_present` sanity checks against the Hold and
        Confirmed queues once it is understood why they fail (for the Hold
        Queue, another consumer may simply get there first), and remove all
        such checks once the manager's operation is trusted.
        """
        previous = proxy.requests(-2, active=True)
        if not previous.was_timeout_error:
            # Logged rather than raised: see the first NOTE in the docstring.
            self.log.warning(ProxyPoolError(
                "Second to Last Request Should be Timeout Error, "
                f"Not {previous.error}"
            ))

        latest = proxy.requests(-1, active=True)

        # Request confirmed -> move up to the Confirmed Queue.  The proxy was
        # already taken out of the Hold Queue, so a plain put is enough.
        if latest.confirmed:
            await self.confirmed.put(proxy)
            return

        # No longer holdable -> discard.
        if not latest.was_timeout_error:
            return

        # Same timeout error twice in a row -> increment the timeout and
        # check whether the proxy has maxed out.
        if latest.error == previous.error:
            try:
                proxy.increment_timeout(latest.error)
            except ProxyMaxTimeoutError as exc:
                # Maxed out -> discard.  Should this maybe be limited to
                # proxies without any recent confirmations?
                self.log.info(exc)
                proxy.reset_timeout(latest.error)
                return

        # safe_put() is used because of the typical race conditions with the
        # Hold Queue.
        await self.hold.safe_put(proxy)