Example #1
    def _crawl(self, url, **kwargs):
        # Python 2 era variant: relies on basestring, dict.iteritems()
        # and the im_self attribute of bound methods, none of which
        # exist on Python 3. Later examples switch to six for
        # compatibility.
        task = {}

        if kwargs.get('callback'):
            callback = kwargs['callback']
            if isinstance(callback, basestring) and hasattr(self, callback):
                func = getattr(self, callback)
            elif hasattr(callback, 'im_self') and callback.im_self is self:
                func = callback
                kwargs['callback'] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" % callback)
            if hasattr(func, '_config'):
                for k, v in func._config.iteritems():
                    kwargs.setdefault(k, v)

        if hasattr(self, 'crawl_config'):
            for k, v in self.crawl_config.iteritems():
                kwargs.setdefault(k, v)

        url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))
        if kwargs.get('files'):
            assert isinstance(kwargs.get('data', {}), dict), "data must be a dict when using with files!"
            content_type, data = _encode_multipart_formdata(kwargs.get('data', {}),
                                                            kwargs.get('files', {}))
            kwargs.setdefault('headers', {})
            kwargs['headers']['Content-Type'] = content_type
            kwargs['data'] = data
        if kwargs.get('data'):
            kwargs['data'] = _encode_params(kwargs['data'])
        if kwargs.get('data'):
            kwargs.setdefault('method', 'POST')

        schedule = {}
        for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update'):
            if key in kwargs and kwargs[key] is not None:
                schedule[key] = kwargs[key]
        if schedule:
            task['schedule'] = schedule

        fetch = {}
        for key in ('method', 'headers', 'data', 'timeout', 'allow_redirects',
                    'cookies', 'proxy', 'etag', 'last_modifed', 'save',
                    'js_run_at', 'js_script', 'load_images', 'fetch_type'):
            if key in kwargs and kwargs[key] is not None:
                fetch[key] = kwargs[key]
        if fetch:
            task['fetch'] = fetch

        process = {}
        for key in ('callback', ):
            if key in kwargs and kwargs[key] is not None:
                process[key] = kwargs[key]
        if process:
            task['process'] = process

        task['project'] = self.project_name
        task['url'] = url
        task['taskid'] = task.get('taskid') or md5string(url)

        self._follows.append(task)
        return task
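
Example #1 already shows the overall shape shared by every variant below: the callback is resolved and validated, per-callback (func._config) and per-class (crawl_config) defaults are merged in, and the remaining kwargs are repacked into the schedule, fetch and process sub-dicts of the task. A hedged usage sketch (the handler name, URL and header are illustrative; self.crawl is pyspider's public wrapper around _crawl):

from pyspider.libs.base_handler import BaseHandler

class DemoHandler(BaseHandler):  # hypothetical handler for illustration
    def on_start(self):
        task = self.crawl('http://example.com/',
                          callback=self.index_page,  # -> task['process']['callback']
                          priority=2,                # -> task['schedule']['priority']
                          headers={'X-Demo': '1'})   # -> task['fetch']['headers']
        # in this variant, task['taskid'] defaults to md5string(url)

    def index_page(self, response):
        pass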
Example #3
    def _crawl(self, url, **kwargs):
        """
        real crawl API

        checking kwargs, and repack them to each sub-dict
        """
        task = {}

        assert len(url) < 1024, "Maximum (1024) URL length error."

        if kwargs.get('callback'):
            callback = kwargs['callback']
            if isinstance(callback, six.string_types) and hasattr(self, callback):
                func = getattr(self, callback)
            elif six.callable(callback) and six.get_method_self(callback) is self:
                func = callback
                kwargs['callback'] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" % callback)
            if hasattr(func, '_config'):
                for k, v in iteritems(func._config):
                    if isinstance(v, dict) and isinstance(kwargs.get(k), dict):
                        kwargs[k].update(v)
                    else:
                        kwargs.setdefault(k, v)

        url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None)))
        if kwargs.get('files'):
            assert isinstance(
                kwargs.get('data', {}), dict), "data must be a dict when using with files!"
            content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}),
                                                            kwargs.pop('files', {}))
            kwargs.setdefault('headers', {})
            kwargs['headers']['Content-Type'] = content_type
            kwargs['data'] = data
        if kwargs.get('data'):
            kwargs['data'] = _encode_params(kwargs['data'])
        if kwargs.get('data'):
            kwargs.setdefault('method', 'POST')

        schedule = {}
        for key in self.schedule_fields:
            if key in kwargs:
                schedule[key] = kwargs.pop(key)
            elif key in self.crawl_config:
                schedule[key] = self.crawl_config[key]

        task['schedule'] = schedule

        fetch = {}
        for key in self.fetch_fields:
            if key in kwargs:
                fetch[key] = kwargs.pop(key)
        task['fetch'] = fetch

        process = {}
        for key in self.process_fields:
            if key in kwargs:
                process[key] = kwargs.pop(key)
        task['process'] = process

        task['project'] = self.project_name
        task['url'] = url
        if 'taskid' in kwargs:
            task['taskid'] = kwargs.pop('taskid')
        else:
            task['taskid'] = self.get_taskid(task)

        if kwargs:
            raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys())

        if self.is_debugger():
            task = self.task_join_crawl_config(task, self.crawl_config)
            if task['fetch'].get('proxy', False) and task['fetch'].get('fetch_type', None) in ('js', 'phantomjs') \
                    and not hasattr(self, '_proxy_warning'):
                self.logger.warning('phantomjs does not support specify proxy from script, use phantomjs args instead')
                self._proxy_warning = True

        cache_key = "%(project)s:%(taskid)s" % task
        if cache_key not in self._follows_keys:
            self._follows_keys.add(cache_key)
            self._follows.append(task)
        return task
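
Unlike Example #1, this variant derives the taskid via self.get_taskid(task) and raises TypeError on leftover kwargs. A minimal sketch of a default get_taskid, assuming it mirrors the md5string(url) fallback of Example #1 (a handler could override it to de-duplicate on something other than the URL):

    def get_taskid(self, task):
        # default: hash the fully built URL, matching Example #1's
        # md5string(url) fallback
        return md5string(task['url'])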
Example #4
    def _crawl(self, url, **kwargs):
        """
        real crawl API

        checking kwargs, and repack them to each sub-dict
        """
        task = {}

        if kwargs.get('callback'):
            callback = kwargs['callback']
            if isinstance(callback, six.string_types) and hasattr(self, callback):
                func = getattr(self, callback)
            elif six.callable(callback) and six.get_method_self(callback) is self:
                func = callback
                kwargs['callback'] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" % callback)
            if hasattr(func, '_config'):
                for k, v in iteritems(func._config):
                    kwargs.setdefault(k, v)

        for k, v in iteritems(self.crawl_config):
            kwargs.setdefault(k, v)

        url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))
        if kwargs.get('files'):
            assert isinstance(
                kwargs.get('data', {}), dict), "data must be a dict when using with files!"
            content_type, data = _encode_multipart_formdata(kwargs.get('data', {}),
                                                            kwargs.get('files', {}))
            kwargs.setdefault('headers', {})
            kwargs['headers']['Content-Type'] = content_type
            kwargs['data'] = data
        if kwargs.get('data'):
            kwargs['data'] = _encode_params(kwargs['data'])
        if kwargs.get('data'):
            kwargs.setdefault('method', 'POST')

        schedule = {}
        for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update'):
            if key in kwargs and kwargs[key] is not None:
                schedule[key] = kwargs[key]
        task['schedule'] = schedule

        fetch = {}
        for key in (
                'method',
                'headers',
                'data',
                'timeout',
                'allow_redirects',
                'cookies',
                'proxy',
                'etag',
                'last_modifed',
                'save',
                'js_run_at',
                'js_script',
                'load_images',
                'fetch_type'
        ):
            if key in kwargs and kwargs[key] is not None:
                fetch[key] = kwargs[key]
        task['fetch'] = fetch

        process = {}
        for key in ('callback', ):
            if key in kwargs and kwargs[key] is not None:
                process[key] = kwargs[key]
        task['process'] = process

        task['project'] = self.project_name
        task['url'] = url
        task['taskid'] = task.get('taskid') or self.get_taskid(task)

        cache_key = "%(project)s:%(taskid)s" % task
        if cache_key not in self._follows_keys:
            self._follows_keys.add(cache_key)
            self._follows.append(task)
        return task
Example #5
    def _crawl(self, url, **kwargs):
        """
        real crawl API

        checking kwargs, and repack them to each sub-dict
        """
        task = {}

        assert len(url) < 1024, "Maximum (1024) URL length error."

        if kwargs.get('callback'):
            callback = kwargs['callback']
            if isinstance(callback, six.string_types) and hasattr(
                    self, callback):
                func = getattr(self, callback)
            elif six.callable(
                    callback) and six.get_method_self(callback) is self:
                func = callback
                kwargs['callback'] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" %
                                          callback)
            if hasattr(func, '_config'):
                for k, v in iteritems(func._config):
                    if isinstance(v, dict) and isinstance(kwargs.get(k), dict):
                        kwargs[k].update(v)
                    else:
                        kwargs.setdefault(k, v)

        for k, v in iteritems(self.crawl_config):
            if isinstance(v, dict) and isinstance(kwargs.get(k), dict):
                kwargs[k].update(v)
            else:
                kwargs.setdefault(k, v)

        url = quote_chinese(_build_url(url.strip(), kwargs.pop('params',
                                                               None)))
        if kwargs.get('files'):
            assert isinstance(
                kwargs.get('data', {}),
                dict), "data must be a dict when using with files!"
            content_type, data = _encode_multipart_formdata(
                kwargs.pop('data', {}), kwargs.pop('files', {}))
            kwargs.setdefault('headers', {})
            kwargs['headers']['Content-Type'] = content_type
            kwargs['data'] = data
        if kwargs.get('data'):
            kwargs['data'] = _encode_params(kwargs['data'])
        if kwargs.get('data'):
            kwargs.setdefault('method', 'POST')

        schedule = {}
        for key in ('priority', 'retries', 'exetime', 'age', 'itag',
                    'force_update', 'auto_recrawl'):
            if key in kwargs:
                schedule[key] = kwargs.pop(key)
        task['schedule'] = schedule

        fetch = {}
        for key in ('method', 'headers', 'data', 'timeout', 'allow_redirects',
                    'cookies', 'proxy', 'etag',
                    # 'last_modifed' is the historical spelling used by the
                    # other examples; the corrected 'last_modified' is
                    # accepted here alongside it
                    'last_modifed', 'last_modified',
                    'save', 'js_run_at', 'js_script',
                    'js_viewport_width', 'js_viewport_height', 'load_images',
                    'fetch_type', 'use_gzip', 'validate_cert', 'max_redirects',
                    'robots_txt'):
            if key in kwargs:
                fetch[key] = kwargs.pop(key)
        task['fetch'] = fetch

        process = {}
        for key in ('callback', ):
            if key in kwargs:
                process[key] = kwargs.pop(key)
        task['process'] = process

        task['project'] = self.project_name
        task['url'] = url
        if 'taskid' in kwargs:
            task['taskid'] = kwargs.pop('taskid')
        else:
            task['taskid'] = self.get_taskid(task)

        if kwargs:
            raise TypeError('crawl() got unexpected keyword argument: %s' %
                            kwargs.keys())

        cache_key = "%(project)s:%(taskid)s" % task
        if cache_key not in self._follows_keys:
            self._follows_keys.add(cache_key)
            self._follows.append(task)
        return task
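
Every variant merges func._config into kwargs when the callback carries one. A minimal sketch of a decorator that attaches such a dict, assuming the _config attribute is all that _crawl relies on:

def config(_config=None, **kwargs):
    # attach per-callback crawl defaults as func._config; _crawl()
    # later merges them into kwargs via setdefault() or, in the
    # dict-aware variants, a dict.update()
    if _config is None:
        _config = {}
    _config.update(kwargs)

    def wrapper(func):
        func._config = _config
        return func
    return wrapper

A handler method decorated with, say, @config(age=10 * 24 * 60 * 60) would then contribute age as a schedule default whenever it is named as the callback.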
Example #6
    def _crawl(self, url, **kwargs):
        """
        real crawl API

        checking kwargs, and repack them to each sub-dict
        """
        task = {}

        assert len(url) < 1024, "Maximum (1024) URL length error."

        if kwargs.get('callback'):
            callback = kwargs['callback']
            if isinstance(callback, six.string_types) and hasattr(self, callback):
                func = getattr(self, callback)
            elif six.callable(callback) and six.get_method_self(callback) is self:
                func = callback
                kwargs['callback'] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" % callback)
            if hasattr(func, '_config'):
                for k, v in iteritems(func._config):
                    if isinstance(v, dict) and isinstance(kwargs.get(k), dict):
                        kwargs[k].update(v)
                    else:
                        kwargs.setdefault(k, v)

        for k, v in iteritems(self.crawl_config):
            if isinstance(v, dict) and isinstance(kwargs.get(k), dict):
                kwargs[k].update(v)
            else:
                kwargs.setdefault(k, v)

        url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None)))
        if kwargs.get('files'):
            assert isinstance(
                kwargs.get('data', {}), dict), "data must be a dict when using with files!"
            content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}),
                                                            kwargs.pop('files', {}))
            kwargs.setdefault('headers', {})
            kwargs['headers']['Content-Type'] = content_type
            kwargs['data'] = data
        if kwargs.get('data'):
            kwargs['data'] = _encode_params(kwargs['data'])
        if kwargs.get('data'):
            kwargs.setdefault('method', 'POST')

        schedule = {}
        for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update',
                    'auto_recrawl'):
            if key in kwargs:
                schedule[key] = kwargs.pop(key)
        task['schedule'] = schedule

        fetch = {}
        for key in (
                'method',
                'headers',
                'data',
                'timeout',
                'allow_redirects',
                'cookies',
                'proxy',
                'etag',
                'last_modifed',
                'save',
                'js_run_at',
                'js_script',
                'js_viewport_width',
                'js_viewport_height',
                'load_images',
                'fetch_type',
                'use_gzip',
                'validate_cert',
                'max_redirects'
        ):
            if key in kwargs:
                fetch[key] = kwargs.pop(key)
        task['fetch'] = fetch

        process = {}
        for key in ('callback', ):
            if key in kwargs:
                process[key] = kwargs.pop(key)
        task['process'] = process

        task['project'] = self.project_name
        task['url'] = url
        if 'taskid' in kwargs:
            task['taskid'] = kwargs.pop('taskid')
        else:
            task['taskid'] = self.get_taskid(task)

        if kwargs:
            raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys())

        cache_key = "%(project)s:%(taskid)s" % task
        if cache_key not in self._follows_keys:
            self._follows_keys.add(cache_key)
            self._follows.append(task)
        return task
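
Examples #5 and #6 deep-merge dict values (such as headers) from crawl_config and func._config instead of letting setdefault() discard them wholesale. A short illustration with hypothetical values:

crawl_config = {'headers': {'User-Agent': 'pyspider'}}
kwargs = {'headers': {'Accept': 'text/html'}}

# plain setdefault() keeps the per-call dict and silently drops the
# configured User-Agent
plain = dict(kwargs)
plain.setdefault('headers', crawl_config['headers'])
assert plain['headers'] == {'Accept': 'text/html'}

# the dict-aware branch updates the per-call dict in place, so both
# headers survive (configured values win on key collisions)
kwargs['headers'].update(crawl_config['headers'])
assert kwargs['headers'] == {'Accept': 'text/html',
                             'User-Agent': 'pyspider'}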
Example #7
def build_baidu_url(word, page):
    baidu_base_url = 'http://www.baidu.com/s?'
    qs_dict = {'wd': word, 'pn': str(page) + '0', 'tn': 'baidurt',
               'ie': 'utf-8', 'bsst': '1'}
    return _build_url(baidu_base_url, qs_dict)
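
Assuming _build_url urlencodes the dict into the query string (parameter order may vary), a call would look like:

url = build_baidu_url('pyspider', 1)
# e.g. 'http://www.baidu.com/s?wd=pyspider&pn=10&tn=baidurt&ie=utf-8&bsst=1'
# pn is str(page) + '0', i.e. a result offset of page * 10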
Example #8
    def _crawl(self, url, **kwargs):
        """
        real crawl API

        checking kwargs, and repack them to each sub-dict
        """
        task = {}

        if kwargs.get('callback'):
            callback = kwargs['callback']
            if isinstance(callback, six.string_types) and hasattr(
                    self, callback):
                func = getattr(self, callback)
            elif six.callable(
                    callback) and six.get_method_self(callback) is self:
                func = callback
                kwargs['callback'] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" %
                                          callback)
            if hasattr(func, '_config'):
                for k, v in iteritems(func._config):
                    kwargs.setdefault(k, v)

        for k, v in iteritems(self.crawl_config):
            kwargs.setdefault(k, v)

        url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))
        if kwargs.get('files'):
            assert isinstance(
                kwargs.get('data', {}),
                dict), "data must be a dict when using with files!"
            content_type, data = _encode_multipart_formdata(
                kwargs.get('data', {}), kwargs.get('files', {}))
            kwargs.setdefault('headers', {})
            kwargs['headers']['Content-Type'] = content_type
            kwargs['data'] = data
        if kwargs.get('data'):
            kwargs['data'] = _encode_params(kwargs['data'])
        if kwargs.get('data'):
            kwargs.setdefault('method', 'POST')

        schedule = {}
        for key in ('priority', 'retries', 'exetime', 'age', 'itag',
                    'force_update'):
            if key in kwargs and kwargs[key] is not None:
                schedule[key] = kwargs[key]
        task['schedule'] = schedule

        fetch = {}
        for key in ('method', 'headers', 'data', 'timeout', 'allow_redirects',
                    'cookies', 'proxy', 'etag', 'last_modifed', 'save',
                    'js_run_at', 'js_script', 'load_images', 'fetch_type'):
            if key in kwargs and kwargs[key] is not None:
                fetch[key] = kwargs[key]
        task['fetch'] = fetch

        process = {}
        for key in ('callback', ):
            if key in kwargs and kwargs[key] is not None:
                process[key] = kwargs[key]
        task['process'] = process

        task['project'] = self.project_name
        task['url'] = url
        task['taskid'] = task.get('taskid') or self.get_taskid(task)

        cache_key = "%(project)s:%(taskid)s" % task
        if cache_key not in self._follows_keys:
            self._follows_keys.add(cache_key)
            self._follows.append(task)
        return task
Example #9
    def _crawl(self, url, **kwargs):
        """
        real crawl API

        checking kwargs, and repack them to each sub-dict
        """
        task = {}

        assert len(url) < 1024, "Maximum (1024) URL length error."

        if kwargs.get("callback"):
            callback = kwargs["callback"]
            if isinstance(callback, six.string_types) and hasattr(self, callback):
                func = getattr(self, callback)
            elif six.callable(callback) and six.get_method_self(callback) is self:
                func = callback
                kwargs["callback"] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" % callback)
            if hasattr(func, "_config"):
                for k, v in iteritems(func._config):
                    kwargs.setdefault(k, v)

        for k, v in iteritems(self.crawl_config):
            kwargs.setdefault(k, v)

        url = quote_chinese(_build_url(url.strip(), kwargs.pop("params", None)))
        if kwargs.get("files"):
            assert isinstance(kwargs.get("data", {}), dict), "data must be a dict when using with files!"
            content_type, data = _encode_multipart_formdata(kwargs.pop("data", {}), kwargs.pop("files", {}))
            kwargs.setdefault("headers", {})
            kwargs["headers"]["Content-Type"] = content_type
            kwargs["data"] = data
        if kwargs.get("data"):
            kwargs["data"] = _encode_params(kwargs["data"])
        if kwargs.get("data"):
            kwargs.setdefault("method", "POST")

        schedule = {}
        for key in ("priority", "retries", "exetime", "age", "itag", "force_update", "auto_recrawl"):
            if key in kwargs:
                schedule[key] = kwargs.pop(key)
        task["schedule"] = schedule

        fetch = {}
        for key in (
            "method",
            "headers",
            "data",
            "timeout",
            "allow_redirects",
            "cookies",
            "proxy",
            "etag",
            "last_modifed",
            "save",
            "js_run_at",
            "js_script",
            "js_viewport_width",
            "js_viewport_height",
            "load_images",
            "fetch_type",
            "use_gzip",
            "validate_cert",
        ):
            if key in kwargs:
                fetch[key] = kwargs.pop(key)
        task["fetch"] = fetch

        process = {}
        for key in ("callback",):
            if key in kwargs:
                process[key] = kwargs.pop(key)
        task["process"] = process

        task["project"] = self.project_name
        task["url"] = url
        if "taskid" in kwargs:
            task["taskid"] = kwargs.pop("taskid")
        else:
            task["taskid"] = self.get_taskid(task)

        if kwargs:
            raise TypeError("crawl() got unexpected keyword argument: %s" % kwargs.keys())

        cache_key = "%(project)s:%(taskid)s" % task
        if cache_key not in self._follows_keys:
            self._follows_keys.add(cache_key)
            self._follows.append(task)
        return task
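
The later _crawl variants de-duplicate follows through the "project:taskid" cache key, whereas Example #1 appends unconditionally. A hedged sketch of the effect (handler name and URL are illustrative):

from pyspider.libs.base_handler import BaseHandler

class DedupDemo(BaseHandler):  # hypothetical handler for illustration
    def on_start(self):
        self._crawl('http://example.com/page', callback=self.index_page)
        self._crawl('http://example.com/page', callback=self.index_page)
        # same URL -> same taskid -> same "project:taskid" cache key,
        # so only the first call appended a task to self._follows
        assert len(self._follows) == 1

    def index_page(self, response):
        pass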