def initPool(self, criteria=None):
    """Load the user pool from the ``users`` collection.

    Runs ``self.db.users.find(criteria)``, stores the resulting records as a
    list in ``self.user_pool`` and their count in ``self.total_pool_size``.

    Args:
        criteria: Query filter handed to ``find``. ``None`` (the default) is
            replaced by an empty filter ``{}``.

    Returns:
        The number of records loaded into the pool.
    """
    # A ``{}`` default would be a single dict object shared by every call;
    # use None as the sentinel and build a fresh filter per call instead.
    if criteria is None:
        criteria = {}

    self.user_pool = list(self.db.users.find(criteria))
    self.total_pool_size = len(self.user_pool)
    logger.debug("Initialized user pool size: %d", self.total_pool_size)
    return self.total_pool_size
def initPool(self, criteria=None):
    """Load the proxy pool from the ``web_proxys`` collection.

    Builds a query from *criteria*, adds a ``proxy_pool_connectable`` filter,
    runs ``self.db.web_proxys.find`` with it, and stores the resulting
    records as a list in ``self.proxy_pool`` and their count in
    ``self.total_pool_size``.

    Args:
        criteria: Optional query filter. If it contains ``anonymity_level``
            the value must be an ``int`` or a ``dict``. The mapping passed in
            is not modified. ``None`` (the default) means no extra filter.

    Returns:
        The number of records loaded into the pool.

    Raises:
        Exception: If ``criteria["anonymity_level"]`` is neither an ``int``
            nor a ``dict``.
    """
    # Work on a copy: the connectable filter added below must not leak into
    # the caller's dict (or into a shared default argument).
    query = dict(criteria) if criteria is not None else {}

    if "anonymity_level" in query and not isinstance(
        query["anonymity_level"], (int, dict)
    ):
        raise Exception("anonymity_level must be int or dict")

    # Keep only records whose flag is True or None, i.e. leave out the ones
    # that have been flagged False.
    query["proxy_pool_connectable"] = {"$in": [True, None]}

    self.proxy_pool = list(self.db.web_proxys.find(query))
    self.total_pool_size = len(self.proxy_pool)
    logger.debug("Initialized proxy pool size: %d", self.total_pool_size)
    return self.total_pool_size
def getUser(self, reuse=False):
    """Pick a random record from the user pool and wrap it in a ``User``.

    Args:
        reuse: When true the chosen record stays in ``self.user_pool`` and
            can be picked again; otherwise it is removed from the pool.

    Returns:
        A ``User`` built from the chosen record, after
        ``setUsersCollection(self.db.users)`` has been called on it.

    Raises:
        Exception: If ``self.user_pool`` is ``None`` (not initialized) or
            ``self.isEmptyPool()`` reports an empty pool.
    """
    if self.user_pool is None:
        raise Exception("User pool is not initialized")
    if self.isEmptyPool():
        raise Exception("User pool is empty")

    index = randrange(len(self.user_pool))
    record = self.user_pool[index] if reuse else self.user_pool.pop(index)

    user = User(record)
    user.setUsersCollection(self.db.users)

    # NOTE(review): the argument of this log call was unreadable in the
    # reviewed source; logging the User object itself is a reconstruction.
    # Confirm which field was meant to be shown.
    logger.debug("Use user: %s", user)
    logger.debug(
        "Remaining user pool size: %d / %d",
        len(self.user_pool),
        self.total_pool_size,
    )
    return user
def validateProxy(self, proxy):
    """Probe *proxy* with a test request and record the outcome.

    Sends a GET to ``example.com`` through the proxy (5 second timeout). On
    success the proxy's ``web_proxys`` record is updated with
    ``proxy_pool_connectable = True``. On a timeout, connection error or
    ``TypeError`` the probe is retried after a one second pause, unless the
    record's ``update_at`` is at least a day old or this was the third
    attempt; in that case the record is updated with
    ``proxy_pool_connectable = False``. Both updates also set
    ``proxy_pool_check_date`` to the current UTC time.

    Args:
        proxy: Mapping with the keys ``hostname``, ``country``,
            ``anonymity_level`` and ``proxy_type``; ``update_at`` is read
            only when a probe fails.

    Returns:
        ``True`` if a probe succeeded, ``False`` if the proxy was marked as
        not connectable.
    """
    hostname = proxy["hostname"]
    country = proxy["country"]
    anonymity_level = proxy["anonymity_level"]
    proxy_type = proxy["proxy_type"]

    if proxy_type == "https":
        url = "https://example.com/"
    else:
        url = "http://example.com/"

    def _record_check(connectable):
        # Store the probe result and the time of the check on the record.
        self.db.web_proxys.update(
            {"hostname": hostname},
            {"$set": {
                "proxy_pool_connectable": connectable,
                "proxy_pool_check_date": datetime.datetime.utcnow(),
            }},
        )

    attempt = 1
    while True:
        try:
            # Built inside the try on purpose: a non-string hostname makes
            # the concatenation raise TypeError, which is handled below
            # like a failed connection.
            proxies = {
                "http": "http://" + hostname,
                "https": "https://" + hostname,
                "ftp": "ftp://" + hostname,
            }
            requests.get(url, proxies=proxies, timeout=5)
        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
                TypeError) as e:
            # Only freshly scraped records get another chance.
            # NOTE(review): this compares local time (now()) with
            # ``update_at`` while the check date above is written with
            # utcnow() -- confirm which clock ``update_at`` is stored in.
            age = datetime.datetime.now() - proxy["update_at"]
            if age.days >= 1 or attempt >= 3:
                _record_check(False)
                logger.debug("Invalidated proxy: %s (%s)", hostname, country)
                # Log the exception itself: ``e.message`` does not exist on
                # Python 3 exceptions.
                logger.debug("Invalidated reason:\n%s ", e)
                logger.debug(
                    "Remaining proxy pool size: %d / %d",
                    len(self.proxy_pool),
                    self.total_pool_size,
                )
                return False

            logger.debug("retry proxy: %s (%s)", hostname, country)
            attempt += 1
            time.sleep(1)
        else:
            # Reached only when the request raised nothing. Kept out of the
            # try so a failure while saving or logging is not treated as a
            # proxy failure.
            _record_check(True)
            logger.debug(
                "Use proxy:\n hostname: %s\n from: %s\n anonymity_level: %s\n proxy_type: %s\n",
                hostname,
                country,
                anonymity_level,
                proxy_type,
            )
            logger.debug(
                "Remaining proxy pool size: %d / %d",
                len(self.proxy_pool),
                self.total_pool_size,
            )
            return True