Exemple #1
0
 def update_proxies(self):
     """Reload the proxy list from the API and rebuild the proxy manager.

     Stores the fetched list on ``self.proxies``, wraps it in a fresh
     ``ProxyManager`` on ``self.proxy_manager`` and records the refresh
     time (``time.time()``) on ``self.last_update_proxies``.
     """
     fetched = self.api.get_proxies()
     self.proxies = fetched
     # The manager is rebuilt from scratch on every refresh rather than
     # updated in place.
     self.proxy_manager = ProxyManager(fetched)
     self.last_update_proxies = time.time()
Exemple #2
0
class UDBHandler(BaseHandler):
    """Handler that caches API-provided settings, keywords and proxies.

    On construction the handler creates an ``Api`` client and loads
    settings, keywords and proxies from it.  ``check_update()`` reloads
    each of them once it is older than its ``UPDATE_*_INTERVAL``.
    ``crawl()`` optionally injects a proxy and JS fetching, and
    ``on_result()`` normalises results before putting them on the result
    queue.

    NOTE(review): ``self.task``, ``self.is_debugger()``, ``self.__env__``
    and ``self.get_taskid()`` are not defined here; presumably they come
    from ``BaseHandler`` -- confirm.
    """

    # Default request options: only a desktop Chrome User-Agent is set.
    # NOTE(review): presumably consumed by BaseHandler.crawl -- confirm.
    crawl_config = {
        'headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36',
        }
    }

    # Known result field names.  Nothing in this class enforces them.
    RESULT_FIELDS = ['url', 'type', 'title', 'text', 'authors', 'publish_time', 'keywords', 'extra']

    # Maximum age, in seconds, before check_update() reloads each resource.
    UPDATE_SETTINGS_INTERVAL = 3600
    UPDATE_KEYWORDS_INTERVAL = 3600
    UPDATE_PROXIES_INTERVAL = 3600*12

    # Feature switches read by crawl(); subclasses may override them.
    PROXY_ON = False
    JS_ON = False

    # Type names passed to Api.get_settings() / Api.get_keywords().
    SETTING_TYPE = 'common'
    KEYWORD_TYPE = 'common'

    def __init__(self):
        """Create the API client and load settings, keywords and proxies."""
        self.api = Api()
        self.update_settings()
        self.update_keywords()
        self.update_proxies()

    def check_update(self):
        """Reload every cached resource that is older than its interval.

        Nothing in this class calls this method; callers are expected to
        invoke it themselves.
        """
        if time.time() - self.last_update_settings > self.UPDATE_SETTINGS_INTERVAL:
            self.update_settings()
        if time.time() - self.last_update_keywords > self.UPDATE_KEYWORDS_INTERVAL:
            self.update_keywords()
        if time.time() - self.last_update_proxies > self.UPDATE_PROXIES_INTERVAL:
            self.update_proxies()

    def get_settings(self):
        """Return the settings the API holds for ``SETTING_TYPE``."""
        return self.api.get_settings(self.SETTING_TYPE)

    def get_keywords(self):
        """Return the keywords the API holds for ``KEYWORD_TYPE``."""
        return self.api.get_keywords(self.KEYWORD_TYPE)

    def get_proxies(self):
        """Return the proxies the API currently provides."""
        return self.api.get_proxies()

    def update_settings(self):
        """Reload ``self.settings`` and record the refresh time."""
        self.settings = self.get_settings()
        self.last_update_settings = time.time()

    def update_keywords(self):
        """Reload ``self.keywords`` and record the refresh time."""
        self.keywords = self.get_keywords()
        self.last_update_keywords = time.time()

    def update_proxies(self):
        """Reload ``self.proxies``, rebuild the manager, record the time."""
        # Go through the get_proxies() hook, like update_settings() and
        # update_keywords() do, so a subclass override is honoured.
        self.proxies = self.get_proxies()
        self.proxy_manager = ProxyManager(self.proxies)
        self.last_update_proxies = time.time()

    def pick_proxy(self):
        """Return one proxy chosen by the current proxy manager."""
        return self.proxy_manager.pick_one()

    def crawl(self, url, **kwargs):
        """Forward to the parent ``crawl`` with proxy / JS options applied.

        When ``PROXY_ON`` is true, ``kwargs['proxy']`` is set to a freshly
        picked proxy, replacing any value supplied by the caller.  When
        ``JS_ON`` is true, ``kwargs['fetch_type']`` is set to ``'js'``.

        Returns whatever the parent class's ``crawl`` returns.
        """
        if self.PROXY_ON:
            # Use the pick_proxy() hook instead of touching the manager
            # directly, so an overridden pick_proxy() takes effect.
            kwargs['proxy'] = self.pick_proxy()

        if self.JS_ON:
            kwargs['fetch_type'] = 'js'

        return super(UDBHandler, self).crawl(url, **kwargs)

    @staticmethod
    def _to_timestamp(value):
        """Convert a non-empty ``publish_time`` value to integer epoch seconds.

        ``value`` is first tried as a number via ``int()``.  If that fails
        with ``TypeError`` or ``ValueError`` it is parsed with
        ``dateutil.parser.parse``; any error raised by the parser
        propagates to the caller.
        """
        import calendar

        try:
            return int(value)
        except (TypeError, ValueError):
            # Not numeric: fall through to date parsing below.
            pass

        parsed = dateutil.parser.parse(value)
        if parsed.tzinfo is not None and parsed.utcoffset() is not None:
            # Timezone-aware value: convert through UTC.  time.mktime()
            # would treat the wall-clock fields as local time and ignore
            # the parsed offset.
            return calendar.timegm(parsed.utctimetuple())
        # Naive value: interpreted in the local timezone of this process.
        return int(time.mktime(parsed.timetuple()))

    def clean_result(self, result):
        """Normalise the time fields of ``result`` in place and return it.

        * ``publish_time``, if present: a truthy value is converted to
          integer epoch seconds (see ``_to_timestamp``); a falsy value
          becomes ``0``.  A missing key is left missing.
        * ``update_time`` is always set to the current epoch second.

        The same object that was passed in is returned.
        """
        if 'publish_time' in result:
            if result['publish_time']:
                result['publish_time'] = self._to_timestamp(result['publish_time'])
            else:
                result['publish_time'] = 0

        result['update_time'] = int(time.time())

        return result

    def on_result(self, result):
        """Clean ``result`` and put it on the result queue.

        Falsy results are ignored.  If the environment provides no
        ``result_queue`` nothing is queued (and the result is not
        cleaned).  When the cleaned result carries a ``url`` that differs
        from the current task's url, it is queued under a copy of the task
        with that url and a recomputed ``taskid``; otherwise it is queued
        under the current task.  If ``ENABLE_JSON_RESULT`` is set and the
        queue exposes a ``redis`` attribute, the cleaned result is also
        pushed, serialised with ``ujson``, onto ``UDB_RESULT_QUEUE_NAME``.
        """
        if not result:
            return
        assert self.task, "on_result can't outside a callback."
        if self.is_debugger():
            pprint(result)

        result_queue = self.__env__.get('result_queue')
        if not result_queue:
            return

        cleaned_result = self.clean_result(result)
        if cleaned_result.get('url', self.task['url']) != self.task['url']:
            # The result belongs to a different url than the one crawled:
            # queue it under a task copy keyed by the result's own url.
            new_task = self.task.copy()
            new_task['url'] = cleaned_result['url']
            new_task['taskid'] = self.get_taskid(new_task)
            result_queue.put((new_task, cleaned_result))
        else:
            result_queue.put((self.task, cleaned_result))

        # Additionally publish the result as JSON on a Redis list.
        if ENABLE_JSON_RESULT and hasattr(result_queue, 'redis'):
            result_queue.redis.rpush(UDB_RESULT_QUEUE_NAME, ujson.dumps(cleaned_result))