Esempio n. 1
0
class TaskQueue(object):
    '''
    Task queue for the scheduler.

    Keeps three PriorityTaskQueue instances:

    * priority_queue -- tasks ready to be handed out by get()
    * time_queue -- delayed tasks whose exetime is still in the future
    * processing -- tasks handed out by get() and not yet marked done()

    get() is throttled by a Bucket built from ``rate`` and ``burst``.
    '''
    # Seconds (added to time.time() in get()) a task may stay in
    # ``processing`` before _check_processing() re-queues it.
    processing_timeout = 10 * 60

    def __init__(self, rate=0, burst=0):
        '''Create the empty queues and a Bucket(rate=rate, burst=burst).'''
        self.mutex = threading.Lock()
        self.priority_queue = PriorityTaskQueue()
        self.time_queue = PriorityTaskQueue()
        self.processing = PriorityTaskQueue()
        self.bucket = Bucket(rate=rate, burst=burst)

    @property
    def rate(self):
        '''The ``rate`` of the underlying bucket.'''
        return self.bucket.rate

    @rate.setter
    def rate(self, value):
        self.bucket.rate = value

    @property
    def burst(self):
        '''The ``burst`` of the underlying bucket.'''
        # Must go through self.bucket: reading self.burst here would re-enter
        # this property and recurse until RecursionError.
        return self.bucket.burst

    @burst.setter
    def burst(self, value):
        self.bucket.burst = value

    def check_update(self):
        '''Promote due delayed tasks and re-queue timed-out processing tasks.'''
        self._check_time_queue()
        self._check_processing()

    def _check_time_queue(self):
        '''Move delayed tasks whose exetime has passed into the priority queue.'''
        now = time.time()
        # ``with`` guarantees the lock is released even if a queue call raises.
        with self.mutex:
            while self.time_queue.qsize() and self.time_queue.top.exetime < now:
                task = self.time_queue.get()
                task.exetime = 0
                self.priority_queue.put(task)

    def _check_processing(self):
        '''Re-queue tasks whose processing deadline has passed.

        Entries whose taskid was cleared by done() are simply dropped.
        '''
        now = time.time()
        with self.mutex:
            while self.processing.qsize() and self.processing.top.exetime < now:
                task = self.processing.get()
                if task.taskid is None:
                    # Already finished: done() blanked the id.
                    continue
                task.exetime = 0
                self.priority_queue.put(task)
                logging.info("[processing: retry] %s", task.taskid)

    def put(self, taskid, priority=0, exetime=0):
        '''Add a task, or update it when it is already queued.

        For a task already queued, priority is only ever raised and (in the
        time queue) exetime is only ever lowered. A new task goes to the time
        queue when ``exetime`` is in the future, otherwise to the priority
        queue.
        '''
        now = time.time()
        with self.mutex:
            if taskid in self.priority_queue:
                task = self.priority_queue[taskid]
                if priority > task.priority:
                    task.priority = priority
            elif taskid in self.time_queue:
                task = self.time_queue[taskid]
                if priority > task.priority:
                    task.priority = priority
                if exetime < task.exetime:
                    task.exetime = exetime
            else:
                task = InQueueTask(taskid, priority)
                if exetime and exetime > now:
                    task.exetime = exetime
                    self.time_queue.put(task)
                else:
                    self.priority_queue.put(task)

    def get(self):
        '''Pop the next ready task and return its taskid.

        Returns None when the bucket reports less than one token or the
        priority queue is empty. The popped task is parked in ``processing``
        with a deadline of now + processing_timeout.
        '''
        if self.bucket.get() < 1:
            return None
        now = time.time()
        with self.mutex:
            try:
                task = self.priority_queue.get_nowait()
            except Queue.Empty:
                return None
            # Only spend a token once a task was actually obtained.
            self.bucket.desc()
            task.exetime = now + self.processing_timeout
            self.processing.put(task)
        return task.taskid

    def done(self, taskid):
        '''Mark a processing task as finished by clearing its taskid.'''
        if taskid in self.processing:
            self.processing[taskid].taskid = None

    def __len__(self):
        '''Number of waiting tasks (priority queue plus time queue).'''
        return self.priority_queue.qsize() + self.time_queue.qsize()

    def __contains__(self, taskid):
        '''True when ``taskid`` is waiting in the priority or time queue.'''
        return taskid in self.priority_queue or taskid in self.time_queue
Esempio n. 2
0
 def __init__(self, rate=0, burst=0):
     '''Set up the lock, three PriorityTaskQueue instances and the bucket.

     ``rate`` and ``burst`` are passed straight through to Bucket.
     '''
     # Lock object stored for use by the other methods of the class.
     self.mutex = threading.Lock()
     # Three independent queues, created in the same order as before.
     (self.priority_queue,
      self.time_queue,
      self.processing) = PriorityTaskQueue(), PriorityTaskQueue(), PriorityTaskQueue()
     self.bucket = Bucket(burst=burst, rate=rate)
Esempio n. 3
0
 def __init__(self, rate=0, burst=0):
     '''Initialise the queue state.

     Creates a threading lock, three separate PriorityTaskQueue objects
     (priority_queue, time_queue, processing) and a Bucket configured with
     the given ``rate`` and ``burst``.
     '''
     self.mutex = threading.Lock()
     ready_queue = PriorityTaskQueue()
     delayed_queue = PriorityTaskQueue()
     in_flight_queue = PriorityTaskQueue()
     self.priority_queue = ready_queue
     self.time_queue = delayed_queue
     self.processing = in_flight_queue
     self.bucket = Bucket(burst=burst, rate=rate)
Esempio n. 4
0
class TaskQueue(object):

    '''
    Task queue for the scheduler.

    Keeps three PriorityTaskQueue instances:

    * priority_queue -- tasks ready to be handed out by get()
    * time_queue -- delayed tasks whose exetime is still in the future
    * processing -- tasks handed out by get() and not yet marked done()

    get() is throttled by a Bucket built from ``rate`` and ``burst``.
    '''
    # Seconds (added to time.time() in get()) a task may stay in
    # ``processing`` before _check_processing() re-queues it.
    processing_timeout = 10 * 60

    def __init__(self, rate=0, burst=0):
        '''Create the empty queues and a Bucket(rate=rate, burst=burst).'''
        self.mutex = threading.Lock()
        self.priority_queue = PriorityTaskQueue()
        self.time_queue = PriorityTaskQueue()
        self.processing = PriorityTaskQueue()
        self.bucket = Bucket(rate=rate, burst=burst)

    @property
    def rate(self):
        '''The ``rate`` of the underlying bucket.'''
        return self.bucket.rate

    @rate.setter
    def rate(self, value):
        self.bucket.rate = value

    @property
    def burst(self):
        '''The ``burst`` of the underlying bucket.'''
        # Must go through self.bucket: reading self.burst here would re-enter
        # this property and recurse until RecursionError.
        return self.bucket.burst

    @burst.setter
    def burst(self, value):
        self.bucket.burst = value

    def check_update(self):
        '''Promote due delayed tasks and re-queue timed-out processing tasks.'''
        self._check_time_queue()
        self._check_processing()

    def _check_time_queue(self):
        '''Move delayed tasks whose exetime has passed into the priority queue.'''
        now = time.time()
        # ``with`` guarantees the lock is released even if a queue call raises.
        with self.mutex:
            while self.time_queue.qsize() and self.time_queue.top.exetime < now:
                task = self.time_queue.get()
                task.exetime = 0
                self.priority_queue.put(task)

    def _check_processing(self):
        '''Re-queue tasks whose processing deadline has passed.

        Entries whose taskid was cleared by done() are simply dropped.
        '''
        now = time.time()
        with self.mutex:
            while self.processing.qsize() and self.processing.top.exetime < now:
                task = self.processing.get()
                if task.taskid is None:
                    # Already finished: done() blanked the id.
                    continue
                task.exetime = 0
                self.priority_queue.put(task)
                logging.info("[processing: retry] %s", task.taskid)

    def put(self, taskid, priority=0, exetime=0):
        '''Add a task, or update it when it is already queued.

        For a task already queued, priority is only ever raised and (in the
        time queue) exetime is only ever lowered. A task that is currently
        being processed is left untouched. A new task goes to the time queue
        when ``exetime`` is in the future, otherwise to the priority queue.
        '''
        now = time.time()
        # The early return below used to leave the (non-reentrant) lock held,
        # deadlocking every later call; ``with`` releases it on all paths.
        with self.mutex:
            if taskid in self.priority_queue:
                task = self.priority_queue[taskid]
                if priority > task.priority:
                    task.priority = priority
                    self.priority_queue.resort()
            elif taskid in self.time_queue:
                task = self.time_queue[taskid]
                if priority > task.priority:
                    task.priority = priority
                if exetime < task.exetime:
                    task.exetime = exetime
                    self.time_queue.resort()
            elif taskid in self.processing and self.processing[taskid].taskid:
                # force update a processing task is not allowed as there are so many
                # problems may happen
                return
            else:
                task = InQueueTask(taskid, priority)
                if exetime and exetime > now:
                    task.exetime = exetime
                    self.time_queue.put(task)
                else:
                    self.priority_queue.put(task)

    def get(self):
        '''Pop the next ready task and return its taskid.

        Returns None when the bucket reports less than one token or the
        priority queue is empty. The popped task is parked in ``processing``
        with a deadline of now + processing_timeout.
        '''
        if self.bucket.get() < 1:
            return None
        now = time.time()
        with self.mutex:
            try:
                task = self.priority_queue.get_nowait()
            except Queue.Empty:
                return None
            # Only spend a token once a task was actually obtained.
            self.bucket.desc()
            task.exetime = now + self.processing_timeout
            self.processing.put(task)
        return task.taskid

    def done(self, taskid):
        '''Mark a processing task as finished.

        Removes the entry from ``processing.queue_dict`` and clears its
        taskid. Returns True when the task was being processed, else False.
        '''
        if taskid in self.processing:
            self.processing.queue_dict.pop(taskid).taskid = None
            return True
        return False

    def __len__(self):
        '''Number of waiting tasks (priority queue plus time queue).'''
        return self.priority_queue.qsize() + self.time_queue.qsize()

    def __contains__(self, taskid):
        '''True when ``taskid`` is waiting or is still being processed.'''
        if taskid in self.priority_queue or taskid in self.time_queue:
            return True
        if taskid in self.processing and self.processing[taskid].taskid:
            return True
        return False
Esempio n. 5
0
class TaskQueue(object):

    '''
    Task queue for the scheduler.

    Keeps three PriorityTaskQueue instances:

    * priority_queue -- tasks ready to be handed out by get()
    * time_queue -- delayed tasks whose exetime is still in the future
    * processing -- tasks handed out by get() and not yet marked done()

    get() is throttled by a Bucket built from ``rate`` and ``burst``.
    '''
    # Seconds (added to time.time() in get()) a task may stay in
    # ``processing`` before _check_processing() re-queues it.
    processing_timeout = 10 * 60

    def __init__(self, rate=0, burst=0):
        '''Create the empty queues and a Bucket(rate=rate, burst=burst).'''
        self.mutex = threading.RLock()
        self.priority_queue = PriorityTaskQueue()
        self.time_queue = PriorityTaskQueue()
        self.processing = PriorityTaskQueue()
        self.bucket = Bucket(rate=rate, burst=burst)

    @property
    def rate(self):
        '''The ``rate`` of the underlying bucket.'''
        return self.bucket.rate

    @rate.setter
    def rate(self, value):
        self.bucket.rate = value

    @property
    def burst(self):
        '''The ``burst`` of the underlying bucket.'''
        # Must go through self.bucket: reading self.burst here would re-enter
        # this property and recurse until RecursionError.
        return self.bucket.burst

    @burst.setter
    def burst(self, value):
        self.bucket.burst = value

    def check_update(self):
        '''
        Check time queue and processing queue

        put tasks to priority queue when execute time arrived or process timeout
        '''
        self._check_time_queue()
        self._check_processing()

    def _check_time_queue(self):
        '''Move delayed tasks whose exetime has passed into the priority queue.'''
        now = time.time()
        # ``with`` guarantees the lock is released even if a queue call raises.
        with self.mutex:
            while (self.time_queue.qsize() and self.time_queue.top
                   and self.time_queue.top.exetime < now):
                task = self.time_queue.get_nowait()
                task.exetime = 0
                self.priority_queue.put(task)

    def _check_processing(self):
        '''Re-queue tasks whose processing deadline has passed.

        Entries whose taskid is None are simply dropped.
        '''
        now = time.time()
        with self.mutex:
            while (self.processing.qsize() and self.processing.top
                   and self.processing.top.exetime < now):
                task = self.processing.get_nowait()
                if task.taskid is None:
                    continue
                task.exetime = 0
                self.priority_queue.put(task)
                logger.info("processing: retry %s", task.taskid)

    def put(self, taskid, priority=0, exetime=0):
        '''Put a task into task queue

        A task already present in the priority or time queue is put again
        into that same queue. A task that is currently being processed is
        left untouched. A new task goes to the time queue when ``exetime``
        is in the future, otherwise to the priority queue.
        '''
        now = time.time()
        task = InQueueTask(taskid, priority, exetime)
        with self.mutex:
            if taskid in self.priority_queue:
                self.priority_queue.put(task)
            elif taskid in self.time_queue:
                self.time_queue.put(task)
            elif taskid in self.processing and self.processing[taskid].taskid:
                # force update a processing task is not allowed as there are so many
                # problems may happen
                pass
            else:
                if exetime and exetime > now:
                    self.time_queue.put(task)
                else:
                    self.priority_queue.put(task)

    def get(self):
        '''Get a task from queue when bucket available

        Returns the taskid, or None when the bucket reports less than one
        token or the priority queue is empty. The popped task is parked in
        ``processing`` with a deadline of now + processing_timeout.
        '''
        if self.bucket.get() < 1:
            return None
        now = time.time()
        with self.mutex:
            try:
                task = self.priority_queue.get_nowait()
            except Queue.Empty:
                return None
            # Only spend a token once a task was actually obtained.
            self.bucket.desc()
            task.exetime = now + self.processing_timeout
            self.processing.put(task)
        return task.taskid

    def done(self, taskid):
        '''Mark task done

        Returns True when the task was in ``processing`` (and has now been
        removed from it), otherwise False.
        '''
        if taskid in self.processing:
            del self.processing[taskid]
            return True
        return False

    def size(self):
        '''Total number of tasks across all three queues.'''
        return self.priority_queue.qsize() + self.time_queue.qsize() + self.processing.qsize()

    def __len__(self):
        '''Same as size().'''
        return self.size()

    def __contains__(self, taskid):
        '''True when ``taskid`` is waiting or is still being processed.'''
        if taskid in self.priority_queue or taskid in self.time_queue:
            return True
        if taskid in self.processing and self.processing[taskid].taskid:
            return True
        return False
Esempio n. 6
0
import requests
from bs4 import BeautifulSoup

import proxy_switcher
from api_helper import CRAWLER_HEADERS
from token_bucket import Bucket
from utils.jsdati import JsdatiApi
from config.config import V2EX_USERNAME, V2EX_PASSWORD, JSDATI_USERNAME, JSDATI_PASSWORD
from utils.notification import wechat_notify

# V2EX web page URLs; V2EX_TOPIC_WEB_URL has a `topic_id` format field.
V2EX_INDEX_URL = 'https://www.v2ex.com'
V2EX_SIGNIN_URL = 'https://www.v2ex.com/signin'
V2EX_TOPIC_WEB_URL = 'https://www.v2ex.com/t/{topic_id}'

# Module-level token bucket polled and decremented by `consume_token`.
# NOTE(review): units of `rate`/`burst` are defined by token_bucket.Bucket --
# presumably tokens per second and maximum stored tokens; confirm there.
bucket = Bucket(rate=0.3, burst=1)

# JsdatiApi client built from the configured Jsdati credentials.
dmapi = JsdatiApi(JSDATI_USERNAME, JSDATI_PASSWORD)


def consume_token(func):
    """Decorate ``func`` so each call first takes a token from ``bucket``.

    The wrapper polls the module-level ``bucket`` every 0.5 seconds until
    ``bucket.get()`` is no longer below 1, calls ``bucket.desc()``, and then
    returns the result of ``func(*args, **kwargs)``.
    """
    @functools.wraps(func)
    def throttled(*args, **kwargs):
        while True:
            available = bucket.get()
            if not available < 1:
                break
            # Not enough tokens yet: wait before polling again.
            time.sleep(0.5)
        bucket.desc()
        result = func(*args, **kwargs)
        return result

    return throttled

Esempio n. 7
0
}

# V2EX API
V2EX_SITE_URL = 'https://www.v2ex.com'

# API endpoint paths (presumably appended to V2EX_SITE_URL by callers -- confirm).
STATS_API_PATH = '/api/site/stats.json'
ALL_NODES_PATH = '/api/nodes/all.json'
NODE_INFO_PATH = '/api/nodes/show.json'  # param: `id` or `name`
LATEST_TOPICS_PATH = '/api/topics/latest.json'
TOPIC_INFO_PATH = '/api/topics/show.json'  # param: `id`
REPLIES_OF_TOPIC_PATH = '/api/replies/show.json'  # param: `topic_id`
MEMBER_INFO_PATH = '/api/members/show.json'  # param: `id` or `username`

# NOTE(review): presumably the API quota in requests per hour; it is not
# referenced by the bucket below in the visible code -- confirm how it is used.
API_RATE_LIMIT_ONE_HOUR = 120

# Module-level token bucket polled and decremented by `consume_token`.
# NOTE(review): units of `rate`/`burst` are defined by Bucket -- confirm there.
bucket = Bucket(rate=0.5, burst=1)


def consume_token(func):
    """Return a wrapper that waits for a token before invoking ``func``.

    While the module-level ``bucket`` reports fewer than one token, the
    wrapper sleeps 0.5 seconds and checks again. Once a token is available it
    calls ``bucket.desc()`` and forwards all arguments to ``func``.
    """
    def _wait_for_token():
        # Poll until bucket.get() is no longer below 1.
        while bucket.get() < 1:
            time.sleep(0.5)

    @functools.wraps(func)
    def rate_limited(*args, **kwargs):
        _wait_for_token()
        bucket.desc()
        return func(*args, **kwargs)

    return rate_limited


class APIHelper(object):
    """API service with traffic flow controller"""