class TaskQueue(object):
    '''
    Task queue for the scheduler.

    Holds three PriorityTaskQueue instances:

    * ``priority_queue`` - tasks that can be handed out by get()
    * ``time_queue``     - delayed tasks waiting for their ``exetime``
    * ``processing``     - tasks handed out by get() and not yet done()

    A Bucket (presumably a token bucket; see its own definition) gates
    how often get() may hand out a task.
    '''
    # Seconds added to "now" when a task is handed out; once that moment
    # passes, _check_processing() re-queues the task unless it was done().
    processing_timeout = 10 * 60

    def __init__(self, rate=0, burst=0):
        '''
        Create an empty queue.

        rate, burst: forwarded unchanged to Bucket(rate=..., burst=...).
        '''
        self.mutex = threading.Lock()
        self.priority_queue = PriorityTaskQueue()
        self.time_queue = PriorityTaskQueue()
        self.processing = PriorityTaskQueue()
        self.bucket = Bucket(rate=rate, burst=burst)

    @property
    def rate(self):
        '''Rate of the underlying bucket.'''
        return self.bucket.rate

    @rate.setter
    def rate(self, value):
        self.bucket.rate = value

    @property
    def burst(self):
        '''Burst of the underlying bucket.'''
        # Must read from the bucket: reading self.burst here would call
        # this property again and recurse until RecursionError.
        return self.bucket.burst

    @burst.setter
    def burst(self, value):
        self.bucket.burst = value

    def check_update(self):
        '''Move due delayed tasks and timed-out processing tasks back
        into the priority queue.'''
        self._check_time_queue()
        self._check_processing()

    def _check_time_queue(self):
        '''Move every delayed task whose exetime has passed into the
        priority queue, resetting its exetime to 0.'''
        now = time.time()
        # "with" guarantees the lock is released even if a queue call raises.
        with self.mutex:
            while self.time_queue.qsize() and self.time_queue.top.exetime < now:
                task = self.time_queue.get()
                task.exetime = 0
                self.priority_queue.put(task)

    def _check_processing(self):
        '''Re-queue every processing task whose timeout has passed.

        Entries whose taskid was cleared by done() are dropped.
        '''
        now = time.time()
        with self.mutex:
            while self.processing.qsize() and self.processing.top.exetime < now:
                task = self.processing.get()
                if task.taskid is None:
                    # Finished task: done() cleared its id, just discard it.
                    continue
                task.exetime = 0
                self.priority_queue.put(task)
                logging.info("[processing: retry] %s", task.taskid)

    def put(self, taskid, priority=0, exetime=0):
        '''
        Add a task, or update it when it is already queued.

        taskid:   identifier of the task.
        priority: an already queued task only ever has its priority raised.
        exetime:  timestamp; when set and in the future a new task goes to
                  the time queue. An already delayed task only ever has its
                  exetime lowered.
        '''
        now = time.time()
        with self.mutex:
            if taskid in self.priority_queue:
                task = self.priority_queue[taskid]
                if priority > task.priority:
                    task.priority = priority
            elif taskid in self.time_queue:
                task = self.time_queue[taskid]
                if priority > task.priority:
                    task.priority = priority
                if exetime < task.exetime:
                    task.exetime = exetime
            else:
                task = InQueueTask(taskid, priority)
                if exetime and exetime > now:
                    task.exetime = exetime
                    self.time_queue.put(task)
                else:
                    self.priority_queue.put(task)

    def get(self):
        '''
        Hand out the next task id.

        Returns None when bucket.get() is below 1 or when the priority
        queue is empty. Otherwise the task is moved to the processing
        queue with exetime = now + processing_timeout and its taskid is
        returned.
        '''
        if self.bucket.get() < 1:
            return None
        now = time.time()
        with self.mutex:
            try:
                task = self.priority_queue.get_nowait()
                self.bucket.desc()
            except Queue.Empty:
                return None
            task.exetime = now + self.processing_timeout
            self.processing.put(task)
        return task.taskid

    def done(self, taskid):
        '''Mark a processing task as finished by clearing its taskid, so
        _check_processing() discards it instead of retrying it.'''
        if taskid in self.processing:
            self.processing[taskid].taskid = None

    def __len__(self):
        '''Number of tasks in the priority queue plus the time queue.'''
        return self.priority_queue.qsize() + self.time_queue.qsize()

    def __contains__(self, taskid):
        '''True when taskid is in the priority queue or the time queue.'''
        return taskid in self.priority_queue or taskid in self.time_queue
def __init__(self, rate=0, burst=0):
    '''
    Set up the lock, the three task queues and the rate bucket.

    rate, burst: forwarded unchanged to Bucket(rate=..., burst=...).
    '''
    # Lock stored for the queue operations defined elsewhere.
    self.mutex = threading.Lock()

    # One PriorityTaskQueue per stage.
    self.priority_queue = PriorityTaskQueue()
    self.time_queue = PriorityTaskQueue()
    self.processing = PriorityTaskQueue()

    self.bucket = Bucket(
        rate=rate,
        burst=burst,
    )
class TaskQueue(object):
    '''
    Task queue for the scheduler.

    Holds three PriorityTaskQueue instances:

    * ``priority_queue`` - tasks that can be handed out by get()
    * ``time_queue``     - delayed tasks waiting for their ``exetime``
    * ``processing``     - tasks handed out by get() and not yet done()

    A Bucket (presumably a token bucket; see its own definition) gates
    how often get() may hand out a task.
    '''
    # Seconds added to "now" when a task is handed out; once that moment
    # passes, _check_processing() re-queues the task unless it was done().
    processing_timeout = 10 * 60

    def __init__(self, rate=0, burst=0):
        '''
        Create an empty queue.

        rate, burst: forwarded unchanged to Bucket(rate=..., burst=...).
        '''
        self.mutex = threading.Lock()
        self.priority_queue = PriorityTaskQueue()
        self.time_queue = PriorityTaskQueue()
        self.processing = PriorityTaskQueue()
        self.bucket = Bucket(rate=rate, burst=burst)

    @property
    def rate(self):
        '''Rate of the underlying bucket.'''
        return self.bucket.rate

    @rate.setter
    def rate(self, value):
        self.bucket.rate = value

    @property
    def burst(self):
        '''Burst of the underlying bucket.'''
        # Must read from the bucket: reading self.burst here would call
        # this property again and recurse until RecursionError.
        return self.bucket.burst

    @burst.setter
    def burst(self, value):
        self.bucket.burst = value

    def check_update(self):
        '''Move due delayed tasks and timed-out processing tasks back
        into the priority queue.'''
        self._check_time_queue()
        self._check_processing()

    def _check_time_queue(self):
        '''Move every delayed task whose exetime has passed into the
        priority queue, resetting its exetime to 0.'''
        now = time.time()
        # "with" guarantees the lock is released even if a queue call raises.
        with self.mutex:
            while self.time_queue.qsize() and self.time_queue.top.exetime < now:
                task = self.time_queue.get()
                task.exetime = 0
                self.priority_queue.put(task)

    def _check_processing(self):
        '''Re-queue every processing task whose timeout has passed.

        Entries whose taskid was cleared are dropped.
        '''
        now = time.time()
        with self.mutex:
            while self.processing.qsize() and self.processing.top.exetime < now:
                task = self.processing.get()
                if task.taskid is None:
                    continue
                task.exetime = 0
                self.priority_queue.put(task)
                logging.info("[processing: retry] %s", task.taskid)

    def put(self, taskid, priority=0, exetime=0):
        '''
        Add a task, or update it when it is already queued.

        taskid:   identifier of the task.
        priority: an already queued task only ever has its priority raised.
        exetime:  timestamp; when set and in the future a new task goes to
                  the time queue. An already delayed task only ever has its
                  exetime lowered.

        A task that is currently being processed is left untouched.
        '''
        now = time.time()
        # The lock is a plain (non-reentrant) Lock, so every exit path has
        # to release it; the early return below relies on "with" for that.
        with self.mutex:
            if taskid in self.priority_queue:
                task = self.priority_queue[taskid]
                if priority > task.priority:
                    task.priority = priority
                    self.priority_queue.resort()
            elif taskid in self.time_queue:
                task = self.time_queue[taskid]
                if priority > task.priority:
                    task.priority = priority
                if exetime < task.exetime:
                    task.exetime = exetime
                self.time_queue.resort()
            elif taskid in self.processing and self.processing[taskid].taskid:
                # Force-updating a task that is being processed is not
                # allowed: too many things can go wrong.
                return
            else:
                task = InQueueTask(taskid, priority)
                if exetime and exetime > now:
                    task.exetime = exetime
                    self.time_queue.put(task)
                else:
                    self.priority_queue.put(task)

    def get(self):
        '''
        Hand out the next task id.

        Returns None when bucket.get() is below 1 or when the priority
        queue is empty. Otherwise the task is moved to the processing
        queue with exetime = now + processing_timeout and its taskid is
        returned.
        '''
        if self.bucket.get() < 1:
            return None
        now = time.time()
        with self.mutex:
            try:
                task = self.priority_queue.get_nowait()
                self.bucket.desc()
            except Queue.Empty:
                return None
            task.exetime = now + self.processing_timeout
            self.processing.put(task)
        return task.taskid

    def done(self, taskid):
        '''
        Mark a processing task as finished.

        Removes the entry from processing.queue_dict and clears its taskid
        so _check_processing() discards it. Returns True when the task was
        in the processing queue, False otherwise.
        '''
        if taskid in self.processing:
            self.processing.queue_dict.pop(taskid).taskid = None
            return True
        return False

    def __len__(self):
        '''Number of tasks in the priority queue plus the time queue.'''
        return self.priority_queue.qsize() + self.time_queue.qsize()

    def __contains__(self, taskid):
        '''True when taskid is queued, delayed, or still being processed.'''
        if taskid in self.priority_queue or taskid in self.time_queue:
            return True
        if taskid in self.processing and self.processing[taskid].taskid:
            return True
        return False
class TaskQueue(object):
    '''
    Task queue for the scheduler.

    Holds three PriorityTaskQueue instances:

    * ``priority_queue`` - tasks that can be handed out by get()
    * ``time_queue``     - delayed tasks waiting for their ``exetime``
    * ``processing``     - tasks handed out by get() and not yet done()

    A Bucket (presumably a token bucket; see its own definition) gates
    how often get() may hand out a task.
    '''
    # Seconds added to "now" when a task is handed out; once that moment
    # passes, _check_processing() re-queues the task unless it was done().
    processing_timeout = 10 * 60

    def __init__(self, rate=0, burst=0):
        '''
        Create an empty queue.

        rate, burst: forwarded unchanged to Bucket(rate=..., burst=...).
        '''
        self.mutex = threading.RLock()
        self.priority_queue = PriorityTaskQueue()
        self.time_queue = PriorityTaskQueue()
        self.processing = PriorityTaskQueue()
        self.bucket = Bucket(rate=rate, burst=burst)

    @property
    def rate(self):
        '''Rate of the underlying bucket.'''
        return self.bucket.rate

    @rate.setter
    def rate(self, value):
        self.bucket.rate = value

    @property
    def burst(self):
        '''Burst of the underlying bucket.'''
        # Must read from the bucket: reading self.burst here would call
        # this property again and recurse until RecursionError.
        return self.bucket.burst

    @burst.setter
    def burst(self, value):
        self.bucket.burst = value

    def check_update(self):
        '''
        Check time queue and processing queue

        put tasks to priority queue when execute time arrived or process timeout
        '''
        self._check_time_queue()
        self._check_processing()

    def _check_time_queue(self):
        '''Move every delayed task whose exetime has passed into the
        priority queue, resetting its exetime to 0.'''
        now = time.time()
        # "with" guarantees the lock is released even if a queue call raises.
        with self.mutex:
            while (self.time_queue.qsize() and self.time_queue.top
                   and self.time_queue.top.exetime < now):
                task = self.time_queue.get_nowait()
                task.exetime = 0
                self.priority_queue.put(task)

    def _check_processing(self):
        '''Re-queue every processing task whose timeout has passed.

        Entries whose taskid is None are dropped.
        '''
        now = time.time()
        with self.mutex:
            while (self.processing.qsize() and self.processing.top
                   and self.processing.top.exetime < now):
                task = self.processing.get_nowait()
                if task.taskid is None:
                    continue
                task.exetime = 0
                self.priority_queue.put(task)
                logger.info("processing: retry %s", task.taskid)

    def put(self, taskid, priority=0, exetime=0):
        '''
        Put a task into task queue

        A task already present in the priority or time queue is put into
        that same queue again; how the two entries are merged is up to
        PriorityTaskQueue.put(). A task that is currently being processed
        is left untouched. A new task goes to the time queue when exetime
        is set and in the future, otherwise to the priority queue.
        '''
        now = time.time()
        task = InQueueTask(taskid, priority, exetime)
        with self.mutex:
            if taskid in self.priority_queue:
                self.priority_queue.put(task)
            elif taskid in self.time_queue:
                self.time_queue.put(task)
            elif taskid in self.processing and self.processing[taskid].taskid:
                # Force-updating a task that is being processed is not
                # allowed: too many things can go wrong.
                pass
            else:
                if exetime and exetime > now:
                    self.time_queue.put(task)
                else:
                    self.priority_queue.put(task)

    def get(self):
        '''
        Get a task from queue when bucket available

        Returns None when bucket.get() is below 1 or when the priority
        queue is empty. Otherwise the task is moved to the processing
        queue with exetime = now + processing_timeout and its taskid is
        returned.
        '''
        if self.bucket.get() < 1:
            return None
        now = time.time()
        with self.mutex:
            try:
                task = self.priority_queue.get_nowait()
                self.bucket.desc()
            except Queue.Empty:
                return None
            task.exetime = now + self.processing_timeout
            self.processing.put(task)
        return task.taskid

    def done(self, taskid):
        '''
        Mark task done

        Deletes the task from the processing queue. Returns True when it
        was there, False otherwise.
        '''
        if taskid in self.processing:
            del self.processing[taskid]
            return True
        return False

    def size(self):
        '''Total number of tasks across all three queues.'''
        return (self.priority_queue.qsize() + self.time_queue.qsize()
                + self.processing.qsize())

    def __len__(self):
        return self.size()

    def __contains__(self, taskid):
        '''True when taskid is queued, delayed, or still being processed.'''
        if taskid in self.priority_queue or taskid in self.time_queue:
            return True
        if taskid in self.processing and self.processing[taskid].taskid:
            return True
        return False
import requests
from bs4 import BeautifulSoup
import proxy_switcher
from api_helper import CRAWLER_HEADERS
from token_bucket import Bucket
from utils.jsdati import JsdatiApi
from config.config import V2EX_USERNAME, V2EX_PASSWORD, JSDATI_USERNAME, JSDATI_PASSWORD
from utils.notification import wechat_notify

V2EX_INDEX_URL = 'https://www.v2ex.com'
V2EX_SIGNIN_URL = 'https://www.v2ex.com/signin'
V2EX_TOPIC_WEB_URL = 'https://www.v2ex.com/t/{topic_id}'

# Module-wide bucket shared by every function decorated with consume_token.
bucket = Bucket(rate=0.3, burst=1)
dmapi = JsdatiApi(JSDATI_USERNAME, JSDATI_PASSWORD)


def consume_token(func):
    """Decorator: wait for the module-level ``bucket`` before calling *func*.

    The wrapper polls ``bucket.get()`` every 0.5 seconds until it is no
    longer below 1, calls ``bucket.desc()``, then invokes *func* with the
    original arguments and returns its result.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        while True:
            available = bucket.get()
            if not available < 1:
                break
            time.sleep(0.5)
        bucket.desc()
        result = func(*args, **kwargs)
        return result

    return wrapper
} # V2EX API V2EX_SITE_URL = 'https://www.v2ex.com' STATS_API_PATH = '/api/site/stats.json' ALL_NODES_PATH = '/api/nodes/all.json' NODE_INFO_PATH = '/api/nodes/show.json' # param: `id` or `name` LATEST_TOPICS_PATH = '/api/topics/latest.json' TOPIC_INFO_PATH = '/api/topics/show.json' # param: `id` REPLIES_OF_TOPIC_PATH = '/api/replies/show.json' # param: `topic_id` MEMBER_INFO_PATH = '/api/members/show.json' # param: `id` or `username` API_RATE_LIMIT_ONE_HOUR = 120 bucket = Bucket(rate=0.5, burst=1) def consume_token(func): @functools.wraps(func) def wrapper(*args, **kwargs): while bucket.get() < 1: time.sleep(0.5) bucket.desc() return func(*args, **kwargs) return wrapper class APIHelper(object): """API service with traffic flow controller"""