Esempio n. 1
0
    def initPool(self, criteria=None):
        """Load the users matching ``criteria`` into the in-memory pool.

        Args:
            criteria: Filter passed to ``self.db.users.find``. ``None``
                (the default) is replaced by an empty filter ``{}``.

        Returns:
            The number of users loaded; the same value is stored in
            ``self.total_pool_size``.
        """
        # A literal ``{}`` default would be a single dict shared by every
        # call, so build a fresh empty filter per call instead.
        if criteria is None:
            criteria = {}

        self.user_pool = list(self.db.users.find(criteria))
        self.total_pool_size = len(self.user_pool)
        logger.debug("Initialized user pool size: %d" % self.total_pool_size)

        return self.total_pool_size
Esempio n. 2
0
  def initPool(self, criteria=None):
    """Load the proxies matching ``criteria`` into the in-memory pool.

    Args:
      criteria: Filter passed to ``self.db.web_proxys.find``. ``None``
        (the default) means no extra conditions. If it contains
        ``"anonymity_level"``, that value must be an ``int`` or a ``dict``.
        Any ``"proxy_pool_connectable"`` entry is overridden.

    Returns:
      The number of proxies loaded; the same value is stored in
      ``self.total_pool_size``.

    Raises:
      Exception: If ``criteria["anonymity_level"]`` is neither ``int``
        nor ``dict``.
    """
    # Work on a private copy: the filter is extended below, and writing into
    # the argument would leak that key into the caller's dict (or into a
    # shared default dict).
    criteria = {} if criteria is None else dict(criteria)

    if "anonymity_level" in criteria and not isinstance(
        criteria["anonymity_level"], (int, dict)):
      raise Exception("anonymity_level must be int or dict")

    # Restrict the query to proxies whose connectable flag is True or None.
    criteria["proxy_pool_connectable"] = {'$in': [True, None]}

    self.proxy_pool = list(self.db.web_proxys.find(criteria))
    self.total_pool_size = len(self.proxy_pool)
    logger.debug("Initialized proxy pool size: %d" % self.total_pool_size)

    return self.total_pool_size
Esempio n. 3
0
    def getUser(self, reuse=False):
        """Pick a random record from the user pool and wrap it in a ``User``.

        Args:
            reuse: When true the chosen record stays in the pool; otherwise
                it is popped from the pool.

        Returns:
            A ``User`` built from the chosen record, after
            ``setUsersCollection(self.db.users)`` has been called on it.

        Raises:
            Exception: If ``self.user_pool`` is ``None`` or
                ``self.isEmptyPool()`` reports an empty pool.
        """
        if self.user_pool is None:
            raise Exception("User pool is not initialized")

        if self.isEmptyPool():
            raise Exception("User pool is empty")

        index = randrange(0, len(self.user_pool))
        record = self.user_pool[index] if reuse else self.user_pool.pop(index)

        user = User(record)
        user.setUsersCollection(self.db.users)

        # NOTE(review): the two log statements below were fused into one
        # invalid line and the expression after "Use user: " was lost.
        # Logging the User object itself is a placeholder -- confirm which
        # field was originally printed.
        logger.debug("Use user: %s" % (user,))
        logger.debug("Remaining user pool size: %d / %d" % (len(self.user_pool), self.total_pool_size))

        return user
Esempio n. 4
0
  def validateProxy(self, proxy):
    """Probe ``proxy`` with a test request and record whether it works.

    A GET to example.com is sent through the proxy with a 5 second timeout.
    On success the proxy's document in ``self.db.web_proxys`` is marked
    connectable. On a timeout, connection error or ``TypeError`` the request
    is retried after a 1 second pause, up to 3 attempts in total, but only
    while the record's ``update_at`` is less than a day old; otherwise the
    proxy is marked not connectable.

    Args:
      proxy: Mapping with the keys ``hostname``, ``country``,
        ``anonymity_level``, ``proxy_type`` and ``update_at``.

    Returns:
      ``True`` if the test request succeeded, ``False`` once the proxy has
      been invalidated.
    """
    # These values do not change between attempts, so read them once.
    hostname = proxy["hostname"]
    country = proxy["country"]
    anonymity_level = proxy["anonymity_level"]
    proxy_type = proxy["proxy_type"]

    def record_check(connectable):
      # Store the outcome and the check time on the proxy's document.
      self.db.web_proxys.update(
        {"hostname": hostname},
        {"$set": {
          "proxy_pool_connectable": connectable,
          "proxy_pool_check_date": datetime.datetime.utcnow()
        }}
      )

    attemptCount = 1
    while True:
      try:
        if proxy_type == "https":
          url = "https://example.com/"
        else:
          url = "http://example.com/"

        proxyDict = {
          "http": "http://" + hostname,
          "https": "https://" + hostname,
          "ftp": "ftp://" + hostname
        }

        requests.get(url, proxies=proxyDict, timeout=5)

        # Reaching this point means the request raised none of the handled
        # errors.
        record_check(True)

        logger.debug("Use proxy:\n    hostname: %s\n    from: %s\n    anonymity_level: %s\n    proxy_type: %s\n" % (hostname,country,anonymity_level,proxy_type))
        logger.debug("Remaining proxy pool size: %d / %d" % (len(self.proxy_pool),self.total_pool_size))

        return True

      except (requests.exceptions.Timeout, requests.exceptions.ConnectionError, TypeError) as e:
        # Only freshly scraped records are worth retrying.
        # NOTE(review): this uses local time while the check date above is
        # written with utcnow(); confirm which clock update_at is stored in.
        scraped_date = proxy["update_at"]
        age = datetime.datetime.now() - scraped_date

        if age.days >= 1 or attemptCount >= 3:
          record_check(False)

          logger.debug("Invalidated proxy: %s (%s)" % (hostname,country))
          # Format the exception itself: the ``message`` attribute does not
          # exist on Python 3 exceptions.
          logger.debug("Invalidated reason:\n%s " % (e,))
          logger.debug("Remaining proxy pool size: %d / %d" % (len(self.proxy_pool),self.total_pool_size))

          return False

        logger.debug("retry proxy: %s (%s)" % (hostname,country))
        attemptCount += 1
        time.sleep(1)