class ProxyManager(object):
    """
    ProxyManager

    Wraps a ``DbClient`` and switches it between two tables: the raw
    proxies written by ``refresh`` and the useful proxies read by
    ``get`` / ``getAll`` / ``delete``.
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        # Table names handed to ``self.db.changeTable``.
        self.raw_proxy_queue = 'raw_proxy'
        self.useful_proxy_queue = 'useful_proxy_queue'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter

        Every getter named in ``config.proxy_getter_functions`` is looked
        up on ``GetFreeProxy`` and called; the de-duplicated, stripped
        results are written to the raw proxy table.
        :return:
        """
        for proxyGetter in self.config.proxy_getter_functions:
            proxy_set = set()
            # fetch raw proxy
            for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                # Skip None / '' before stripping: calling .strip() on a
                # None result would raise and abort the whole refresh.
                if not proxy:
                    continue
                cleaned = proxy.strip()
                if cleaned:
                    proxy_set.add(cleaned)
            # store raw proxy
            self.db.changeTable(self.raw_proxy_queue)
            for proxy in proxy_set:
                self.db.put(proxy)

    def get(self):
        """
        return a useful proxy
        :return: whatever ``self.db.get()`` returns for the useful table
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.get()
        # return self.db.pop()

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy: key passed to ``self.db.delete``
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool
        :return: whatever ``self.db.getAll()`` returns for the useful table
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.getAll()

    def get_status(self):
        """
        report ``self.db.get_status()`` for both tables
        :return: dict keyed by 'raw_proxy' and 'useful_proxy_queue'
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_status = self.db.get_status()
        self.db.changeTable(self.useful_proxy_queue)
        useful_status = self.db.get_status()
        return {'raw_proxy': raw_status, 'useful_proxy_queue': useful_status}
class ProxyManager(object):
    """
    ProxyManager

    Wraps a ``DbClient`` and switches it between two tables: the raw
    proxies written by ``refresh`` and the useful proxies read by
    ``get`` / ``getAll`` / ``delete``.
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        # Table names handed to ``self.db.changeTable``.
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter

        Every getter named in ``config.proxy_getter_functions`` is looked
        up on ``GetFreeProxy`` and called; each non-empty result is logged
        and the de-duplicated, stripped results are written to the raw
        proxy table.
        :return:
        """
        for proxyGetter in self.config.proxy_getter_functions:
            proxy_set = set()
            # fetch raw proxy
            for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                # Strip before testing: a whitespace-only string is truthy
                # but would otherwise be stored as the empty string ''.
                cleaned = proxy.strip() if proxy else ''
                if not cleaned:
                    continue
                self.log.info('{func}: fetch proxy {proxy}'.format(
                    func=proxyGetter, proxy=cleaned))
                proxy_set.add(cleaned)
            # store raw proxy
            self.db.changeTable(self.raw_proxy_queue)
            for proxy in proxy_set:
                self.db.put(proxy)

    def get(self):
        """
        return a useful proxy
        :return: whatever ``self.db.get()`` returns for the useful table
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.get()
        # return self.db.pop()

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy: key passed to ``self.db.delete``
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool
        :return: whatever ``self.db.getAll()`` returns for the useful table
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.getAll()

    def get_status(self):
        """
        report ``self.db.get_status()`` for both tables
        :return: dict keyed by 'raw_proxy' and 'useful_proxy'
        """
        # TODO rename get_count..
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.get_status()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.get_status()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
class MongoSsdbUrlManager(object):
    """
    URL bookkeeping on top of a MongoDB ``spider`` database and a
    ``DbClient`` key/value store.

    URL records live in the ``lazada`` collection (failures in
    ``lazada_error``); the ``DbClient`` store keeps one counter per URL
    hash so already-seen URLs are not re-inserted.
    """

    def __init__(self, host="localhost", client=None):
        """
        :param host: MongoDB host name, used only when ``client`` is None
        :param client: an already constructed Mongo client to reuse
        """
        self.config = GetConfig()
        if client is None:
            # Honour ``host``; the default yields the URI that used to be
            # hard-coded ('mongodb://localhost:27017/').
            client = MongoClient('mongodb://{0}:27017/'.format(host))
        self.client = client
        self.ssdb_client = DbClient()
        self.db = self.client.spider
        # Compare by value: ``is 0`` only works through CPython's small-int
        # cache and is flagged as a SyntaxWarning on newer interpreters.
        if self.db.lazada.count() == 0:
            self.db.lazada.create_index([("status", ASCENDING), ("pr", DESCENDING)])

    @staticmethod
    def _url_md5(url):
        """Return the hex MD5 of ``url``; text input is encoded as UTF-8 first."""
        if not isinstance(url, bytes):
            url = url.encode('utf-8')
        return hashlib.md5(url).hexdigest()

    def enqueue_url(self, url, status, depth):
        """
        Record ``url`` unless its hash is already in the key/value store.

        A known hash only gets its counter incremented. A new hash is
        stored and a document is saved to the ``lazada`` collection. The
        key/value step is attempted at most twice; if both attempts
        raise, the document is still saved.
        :param url: URL to enqueue
        :param status: value stored in the document's 'status' field
        :param depth: value stored in the document's 'depth' field
        :return:
        """
        md5 = self._url_md5(url)
        for _ in range(2):
            try:
                num = self.ssdb_client.get(md5)
                if num is not None:
                    # Seen before: bump the counter and skip the Mongo insert.
                    self.ssdb_client.update(key=md5, value=int(num) + 1)
                    return
                self.ssdb_client.put(md5)
                break
            except Exception as error:
                # Best effort: report and retry once.
                print(error)
        self.db.lazada.save({
            '_id': md5,
            'url': url,
            'status': status,
            'queue_time': datetime.utcnow(),
            'depth': depth,
            'pr': 0
        })

    def dequeue_url(self, depth):
        """
        Look up one ``lazada`` document with status 'downloading' at ``depth``.

        NOTE(review): this is a plain read; the document's status is not
        changed. The disabled find_one_and_update call is kept below for
        reference.
        :param depth: value matched against the 'depth' field
        :return: the matching document, or None
        """
        # record = self.db.lazada.find_one_and_update(
        #     {'status': 'downloading', 'depth': depth},
        #     {'$set': {'status': 'downloading'}},
        #     upsert=False,
        #     sort=[('pr', DESCENDING)],  # sort by pr in descending
        #     returnNewDocument=False
        # )
        record = self.db.lazada.find_one(
            {'status': 'downloading', 'depth': depth}
        )
        return record if record else None

    def finish_url(self, url):
        """
        Mark the document for ``url`` as done and stamp ``done_time``.
        :param url: URL whose hash identifies the document
        :return:
        """
        record = {'status': 'done', 'done_time': datetime.utcnow()}
        self.db.lazada.update({'_id': self._url_md5(url)}, {'$set': record}, upsert=False)

    def clear(self):
        """Call ``clear()`` on the key/value store."""
        self.ssdb_client.clear()

    def save_error(self, url, status, depth):
        """
        Save a document for ``url`` into the ``lazada_error`` collection.
        :param url: URL that failed
        :param status: value stored in the document's 'status' field
        :param depth: value stored in the document's 'depth' field
        :return:
        """
        md5 = self._url_md5(url)
        self.db.lazada_error.save({
            '_id': md5,
            'url': url,
            'status': status,
            'queue_time': datetime.utcnow(),
            'depth': depth,
            'pr': 0
        })

    def update(self):
        """Reset every 'downloading' document in ``lazada`` back to 'new'."""
        self.db.lazada.update({'status': 'downloading'}, {'$set': {'status': 'new'}}, multi=True)

    def save(self, record):
        """
        Save ``record`` into the ``record`` collection.
        :param record: document to store
        :return:
        """
        self.db.record.save(record)
class ProxyManager(object):
    """
    ProxyManager

    Wraps a ``DbClient`` and switches it between two tables: the raw
    proxies written by ``refresh`` and the useful proxies read by
    ``get`` / ``getAll`` / ``delete``.
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        # Table names handed to ``self.db.changeTable``.
        self.raw_proxy_queue = 'raw_proxy'
        self.useful_proxy_queue = 'useful_proxy_queue'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter

        Every getter named in ``config.proxy_getter_functions`` is looked
        up on ``GetFreeProxy`` and called; the de-duplicated, stripped
        results are written to the raw proxy table.
        :return:
        """
        for proxyGetter in self.config.proxy_getter_functions:
            proxy_set = set()
            # fetch raw proxy
            for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                # Skip None / '' before stripping: calling .strip() on a
                # None result would raise and abort the whole refresh.
                if not proxy:
                    continue
                cleaned = proxy.strip()
                if cleaned:
                    proxy_set.add(cleaned)
            # store raw proxy
            self.db.changeTable(self.raw_proxy_queue)
            for proxy in proxy_set:
                self.db.put(proxy)

    def get(self):
        """
        return a useful proxy
        :return: whatever ``self.db.get()`` returns for the useful table
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.get()
        # return self.db.pop()

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy: key passed to ``self.db.delete``
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool
        :return: whatever ``self.db.getAll()`` returns for the useful table
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.getAll()

    def get_status(self):
        """
        report ``self.db.get_status()`` for both tables
        :return: dict keyed by 'raw_proxy' and 'useful_proxy_queue'
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_status = self.db.get_status()
        self.db.changeTable(self.useful_proxy_queue)
        useful_status = self.db.get_status()
        return {
            'raw_proxy': raw_status,
            'useful_proxy_queue': useful_status
        }