def _crawl(self, url, **kwargs):
    """Build one follow-up task from *url* plus crawl options.

    Keyword options are validated, merged with the callback's ``_config``
    attribute, then split into the task's ``schedule`` / ``fetch`` /
    ``process`` sub-dicts.  The finished task is appended to
    ``self._follows`` unless the same project:taskid was already queued.
    """
    task = {}
    assert len(url) < 1024, "Maximum (1024) URL length error."

    # Resolve the callback to a bound method of self; a callable is
    # converted back to its name so the task stays serializable.
    if kwargs.get('callback'):
        callback = kwargs['callback']
        if isinstance(callback, six.string_types) and hasattr(self, callback):
            func = getattr(self, callback)
        elif six.callable(callback) and six.get_method_self(callback) is self:
            func = callback
            kwargs['callback'] = func.__name__
        else:
            raise NotImplementedError("self.%s() not implemented!" % callback)
        # Fold the callback's per-method defaults into kwargs: dict-valued
        # entries are merged key-by-key, everything else only fills gaps.
        if hasattr(func, '_config'):
            for cfg_key, cfg_val in iteritems(func._config):
                if isinstance(cfg_val, dict) and isinstance(kwargs.get(cfg_key), dict):
                    kwargs[cfg_key].update(cfg_val)
                else:
                    kwargs.setdefault(cfg_key, cfg_val)

    url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None)))

    # Multipart upload: encode data+files and force the matching header.
    if kwargs.get('files'):
        assert isinstance(
            kwargs.get('data', {}), dict), "data must be a dict when using with files!"
        content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}),
                                                        kwargs.pop('files', {}))
        kwargs.setdefault('headers', {})
        kwargs['headers']['Content-Type'] = content_type
        kwargs['data'] = data
    if kwargs.get('data'):
        kwargs['data'] = _encode_params(kwargs['data'])
    if kwargs.get('data'):
        kwargs.setdefault('method', 'POST')

    # Schedule options fall back to the project-wide crawl_config.
    schedule = {}
    for field in self.schedule_fields:
        if field in kwargs:
            schedule[field] = kwargs.pop(field)
        elif field in self.crawl_config:
            schedule[field] = self.crawl_config[field]
    task['schedule'] = schedule

    task['fetch'] = {field: kwargs.pop(field)
                     for field in self.fetch_fields if field in kwargs}
    task['process'] = {field: kwargs.pop(field)
                       for field in self.process_fields if field in kwargs}

    task['project'] = self.project_name
    task['url'] = url
    task['taskid'] = kwargs.pop('taskid') if 'taskid' in kwargs else self.get_taskid(task)

    # Every recognised option has been pop()ed by now.
    if kwargs:
        raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys())

    if self.is_debugger():
        task = self.task_join_crawl_config(task, self.crawl_config)

    # Warn once per handler: the phantomjs fetcher ignores per-task proxies.
    if task['fetch'].get('proxy', False) \
            and task['fetch'].get('fetch_type', None) in ('js', 'phantomjs') \
            and not hasattr(self, '_proxy_warning'):
        self.logger.warning('phantomjs does not support specify proxy from script, use phantomjs args instead')
        self._proxy_warning = True

    cache_key = "%(project)s:%(taskid)s" % task
    if cache_key not in self._follows_keys:
        self._follows_keys.add(cache_key)
        self._follows.append(task)
    return task
def _crawl(self, url, **kwargs):
    """Assemble a follow-up task dict for *url* and queue it.

    Keyword options are topped up from the callback's ``_config``
    attribute and from ``self.crawl_config``, then sorted into the
    ``schedule`` / ``fetch`` / ``process`` sections of the task.
    A given project:taskid pair is queued only once.
    """
    task = {}

    # Resolve the callback: a name must exist on self; a bound method of
    # self is replaced by its name so the task stays serializable.
    if kwargs.get('callback'):
        callback = kwargs['callback']
        if isinstance(callback, six.string_types) and hasattr(self, callback):
            func = getattr(self, callback)
        elif six.callable(callback) and six.get_method_self(callback) is self:
            func = callback
            kwargs['callback'] = func.__name__
        else:
            raise NotImplementedError("self.%s() not implemented!" % callback)
        if hasattr(func, '_config'):
            for opt, default in iteritems(func._config):
                kwargs.setdefault(opt, default)

    # Project-wide defaults fill any option not given explicitly.
    for opt, default in iteritems(self.crawl_config):
        kwargs.setdefault(opt, default)

    url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))

    # Multipart upload: encode data+files and force the matching header.
    if kwargs.get('files'):
        assert isinstance(
            kwargs.get('data', {}), dict), "data must be a dict when using with files!"
        content_type, data = _encode_multipart_formdata(kwargs.get('data', {}),
                                                        kwargs.get('files', {}))
        kwargs.setdefault('headers', {})
        kwargs['headers']['Content-Type'] = content_type
        kwargs['data'] = data
    if kwargs.get('data'):
        kwargs['data'] = _encode_params(kwargs['data'])
    if kwargs.get('data'):
        kwargs.setdefault('method', 'POST')

    # Copy the recognised options into the task's sub-dicts, skipping
    # anything left unset (None).
    task['schedule'] = {
        opt: kwargs[opt]
        for opt in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update')
        if opt in kwargs and kwargs[opt] is not None
    }
    task['fetch'] = {
        opt: kwargs[opt]
        for opt in ('method', 'headers', 'data', 'timeout', 'allow_redirects',
                    'cookies', 'proxy', 'etag', 'last_modifed', 'save',
                    'js_run_at', 'js_script', 'load_images', 'fetch_type')
        if opt in kwargs and kwargs[opt] is not None
    }
    task['process'] = {
        opt: kwargs[opt]
        for opt in ('callback', )
        if opt in kwargs and kwargs[opt] is not None
    }

    task['project'] = self.project_name
    task['url'] = url
    task['taskid'] = task.get('taskid') or self.get_taskid(task)

    cache_key = "%(project)s:%(taskid)s" % task
    if cache_key not in self._follows_keys:
        self._follows_keys.add(cache_key)
        self._follows.append(task)
    return task
def _crawl(self, url, **kwargs):
    """Turn *url* plus crawl options into a task and queue it.

    Options come from the explicit kwargs, the callback's ``_config``
    attribute and ``self.crawl_config`` (in that priority order) and are
    distributed into ``schedule`` / ``fetch`` / ``process`` sub-dicts.
    Empty sub-dicts are omitted from the task.  A project:taskid pair is
    appended to ``self._follows`` at most once.
    """
    task = {}

    # Normalize the callback to a method name on self.
    if kwargs.get('callback'):
        callback = kwargs['callback']
        if isinstance(callback, six.string_types) and hasattr(self, callback):
            func = getattr(self, callback)
        elif six.callable(callback) and six.get_method_self(callback) is self:
            func = callback
            kwargs['callback'] = func.__name__
        else:
            raise NotImplementedError("self.%s() not implemented!" % callback)
        if hasattr(func, '_config'):
            for name, value in iteritems(func._config):
                kwargs.setdefault(name, value)

    # Project-level defaults only fill options not already supplied.
    for name, value in iteritems(self.crawl_config):
        kwargs.setdefault(name, value)

    url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))

    # files implies multipart/form-data encoding of the request body.
    if kwargs.get('files'):
        assert isinstance(
            kwargs.get('data', {}), dict), "data must be a dict when using with files!"
        content_type, data = _encode_multipart_formdata(
            kwargs.get('data', {}), kwargs.get('files', {}))
        kwargs.setdefault('headers', {})
        kwargs['headers']['Content-Type'] = content_type
        kwargs['data'] = data
    if kwargs.get('data'):
        kwargs['data'] = _encode_params(kwargs['data'])
    if kwargs.get('data'):
        kwargs.setdefault('method', 'POST')

    # Each section is attached only when it ends up non-empty.
    schedule = {
        name: kwargs[name]
        for name in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update')
        if name in kwargs and kwargs[name] is not None
    }
    if schedule:
        task['schedule'] = schedule

    fetch = {
        name: kwargs[name]
        for name in ('method', 'headers', 'data', 'timeout', 'allow_redirects',
                     'cookies', 'proxy', 'etag', 'last_modifed', 'save',
                     'js_run_at', 'js_script', 'load_images', 'fetch_type')
        if name in kwargs and kwargs[name] is not None
    }
    if fetch:
        task['fetch'] = fetch

    process = {
        name: kwargs[name]
        for name in ('callback', )
        if name in kwargs and kwargs[name] is not None
    }
    if process:
        task['process'] = process

    task['project'] = self.project_name
    task['url'] = url
    task['taskid'] = task.get('taskid') or self.get_taskid(task)

    cache_key = "%(project)s:%(taskid)s" % task
    if cache_key not in self._follows_keys:
        self._follows_keys.add(cache_key)
        self._follows.append(task)
    return task
def _crawl(self, url, **kwargs):
    """Validate crawl options for *url*, repack them into a task, queue it.

    Defaults flow in from the callback's ``_config`` attribute and from
    ``self.crawl_config``; recognised options are then pop()ed into the
    ``schedule`` / ``fetch`` / ``process`` sub-dicts, and any leftover
    kwarg raises ``TypeError``.  Duplicate project:taskid pairs are
    queued only once.
    """
    task = {}
    assert len(url) < 1024, "Maximum (1024) URL length error."

    def _merge_defaults(defaults):
        # dict-valued defaults are merged key-by-key into an existing
        # dict option; all other defaults only fill missing options.
        for d_key, d_val in iteritems(defaults):
            if isinstance(d_val, dict) and isinstance(kwargs.get(d_key), dict):
                kwargs[d_key].update(d_val)
            else:
                kwargs.setdefault(d_key, d_val)

    # Resolve the callback to a bound method of self; a callable is
    # replaced by its name so the task stays serializable.
    if kwargs.get('callback'):
        callback = kwargs['callback']
        if isinstance(callback, six.string_types) and hasattr(self, callback):
            func = getattr(self, callback)
        elif six.callable(callback) and six.get_method_self(callback) is self:
            func = callback
            kwargs['callback'] = func.__name__
        else:
            raise NotImplementedError("self.%s() not implemented!" % callback)
        if hasattr(func, '_config'):
            _merge_defaults(func._config)

    _merge_defaults(self.crawl_config)

    url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None)))

    # files implies multipart/form-data encoding of the request body.
    if kwargs.get('files'):
        assert isinstance(
            kwargs.get('data', {}), dict), "data must be a dict when using with files!"
        content_type, data = _encode_multipart_formdata(
            kwargs.pop('data', {}), kwargs.pop('files', {}))
        kwargs.setdefault('headers', {})
        kwargs['headers']['Content-Type'] = content_type
        kwargs['data'] = data
    if kwargs.get('data'):
        kwargs['data'] = _encode_params(kwargs['data'])
    if kwargs.get('data'):
        kwargs.setdefault('method', 'POST')

    task['schedule'] = {
        key: kwargs.pop(key)
        for key in ('priority', 'retries', 'exetime', 'age', 'itag',
                    'force_update', 'auto_recrawl')
        if key in kwargs
    }
    # NOTE: both 'last_modifed' (historical typo) and 'last_modified'
    # are accepted here, exactly as in the option tuple below.
    task['fetch'] = {
        key: kwargs.pop(key)
        for key in ('method', 'headers', 'data', 'timeout', 'allow_redirects',
                    'cookies', 'proxy', 'etag', 'last_modifed', 'last_modified',
                    'save', 'js_run_at', 'js_script', 'js_viewport_width',
                    'js_viewport_height', 'load_images', 'fetch_type',
                    'use_gzip', 'validate_cert', 'max_redirects', 'robots_txt')
        if key in kwargs
    }
    task['process'] = {
        key: kwargs.pop(key)
        for key in ('callback', )
        if key in kwargs
    }

    task['project'] = self.project_name
    task['url'] = url
    task['taskid'] = kwargs.pop('taskid') if 'taskid' in kwargs else self.get_taskid(task)

    # Anything still left in kwargs was never a recognised option.
    if kwargs:
        raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys())

    cache_key = "%(project)s:%(taskid)s" % task
    if cache_key not in self._follows_keys:
        self._follows_keys.add(cache_key)
        self._follows.append(task)
    return task
def _crawl(self, url, **kwargs):
    """Build and enqueue a crawl task for *url* from the given options.

    Option precedence: explicit kwargs, then the callback's ``_config``
    attribute, then ``self.crawl_config`` (dict-valued defaults merge
    key-by-key).  Recognised options are pop()ed into ``schedule`` /
    ``fetch`` / ``process``; a leftover kwarg is a ``TypeError``.  The
    task is appended to ``self._follows`` once per project:taskid.
    """
    task = {}
    assert len(url) < 1024, "Maximum (1024) URL length error."

    # Normalize the callback to a method name on self.
    if kwargs.get('callback'):
        callback = kwargs['callback']
        if isinstance(callback, six.string_types) and hasattr(self, callback):
            func = getattr(self, callback)
        elif six.callable(callback) and six.get_method_self(callback) is self:
            func = callback
            kwargs['callback'] = func.__name__
        else:
            raise NotImplementedError("self.%s() not implemented!" % callback)
        # Per-callback defaults: dicts merge into an existing dict
        # option, everything else only fills gaps.
        if hasattr(func, '_config'):
            for opt, default in iteritems(func._config):
                if isinstance(default, dict) and isinstance(kwargs.get(opt), dict):
                    kwargs[opt].update(default)
                else:
                    kwargs.setdefault(opt, default)

    # Project-wide defaults, same merge rule as above.
    for opt, default in iteritems(self.crawl_config):
        if isinstance(default, dict) and isinstance(kwargs.get(opt), dict):
            kwargs[opt].update(default)
        else:
            kwargs.setdefault(opt, default)

    url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None)))

    # files implies multipart/form-data encoding of the request body.
    if kwargs.get('files'):
        assert isinstance(
            kwargs.get('data', {}), dict), "data must be a dict when using with files!"
        content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}),
                                                        kwargs.pop('files', {}))
        kwargs.setdefault('headers', {})
        kwargs['headers']['Content-Type'] = content_type
        kwargs['data'] = data
    if kwargs.get('data'):
        kwargs['data'] = _encode_params(kwargs['data'])
    if kwargs.get('data'):
        kwargs.setdefault('method', 'POST')

    task['schedule'] = {
        opt: kwargs.pop(opt)
        for opt in ('priority', 'retries', 'exetime', 'age', 'itag',
                    'force_update', 'auto_recrawl')
        if opt in kwargs
    }
    # 'last_modifed' is a historical typo kept for compatibility.
    task['fetch'] = {
        opt: kwargs.pop(opt)
        for opt in ('method', 'headers', 'data', 'timeout', 'allow_redirects',
                    'cookies', 'proxy', 'etag', 'last_modifed', 'save',
                    'js_run_at', 'js_script', 'js_viewport_width',
                    'js_viewport_height', 'load_images', 'fetch_type',
                    'use_gzip', 'validate_cert', 'max_redirects')
        if opt in kwargs
    }
    task['process'] = {
        opt: kwargs.pop(opt)
        for opt in ('callback', )
        if opt in kwargs
    }

    task['project'] = self.project_name
    task['url'] = url
    task['taskid'] = kwargs.pop('taskid') if 'taskid' in kwargs else self.get_taskid(task)

    # Anything still left in kwargs was never a recognised option.
    if kwargs:
        raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys())

    cache_key = "%(project)s:%(taskid)s" % task
    if cache_key not in self._follows_keys:
        self._follows_keys.add(cache_key)
        self._follows.append(task)
    return task
def _crawl(self, url, **kwargs):
    """Pack *url* and crawl options into a task and queue it for follow-up.

    Missing options are filled from the callback's ``_config`` attribute
    and from ``self.crawl_config``; recognised options are pop()ed into
    the ``schedule`` / ``fetch`` / ``process`` sub-dicts and anything
    left over raises ``TypeError``.  Each project:taskid is appended to
    ``self._follows`` at most once.
    """
    task = {}
    assert len(url) < 1024, "Maximum (1024) URL length error."

    # Resolve the callback: names must exist on self, bound methods of
    # self are replaced by their name so the task stays serializable.
    if kwargs.get('callback'):
        callback = kwargs['callback']
        if isinstance(callback, six.string_types) and hasattr(self, callback):
            func = getattr(self, callback)
        elif six.callable(callback) and six.get_method_self(callback) is self:
            func = callback
            kwargs['callback'] = func.__name__
        else:
            raise NotImplementedError("self.%s() not implemented!" % callback)
        if hasattr(func, '_config'):
            for name, value in iteritems(func._config):
                kwargs.setdefault(name, value)

    # Project-wide defaults only fill options not already supplied.
    for name, value in iteritems(self.crawl_config):
        kwargs.setdefault(name, value)

    url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None)))

    # files implies multipart/form-data encoding of the request body.
    if kwargs.get('files'):
        assert isinstance(
            kwargs.get('data', {}), dict), "data must be a dict when using with files!"
        content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}),
                                                        kwargs.pop('files', {}))
        kwargs.setdefault('headers', {})
        kwargs['headers']['Content-Type'] = content_type
        kwargs['data'] = data
    if kwargs.get('data'):
        kwargs['data'] = _encode_params(kwargs['data'])
    if kwargs.get('data'):
        kwargs.setdefault('method', 'POST')

    # Move each recognised option out of kwargs into its section.
    section = {}
    for name in ('priority', 'retries', 'exetime', 'age', 'itag',
                 'force_update', 'auto_recrawl'):
        if name in kwargs:
            section[name] = kwargs.pop(name)
    task['schedule'] = section

    # 'last_modifed' is a historical typo kept for compatibility.
    section = {}
    for name in ('method', 'headers', 'data', 'timeout', 'allow_redirects',
                 'cookies', 'proxy', 'etag', 'last_modifed', 'save',
                 'js_run_at', 'js_script', 'js_viewport_width',
                 'js_viewport_height', 'load_images', 'fetch_type',
                 'use_gzip', 'validate_cert'):
        if name in kwargs:
            section[name] = kwargs.pop(name)
    task['fetch'] = section

    section = {}
    for name in ('callback', ):
        if name in kwargs:
            section[name] = kwargs.pop(name)
    task['process'] = section

    task['project'] = self.project_name
    task['url'] = url
    if 'taskid' in kwargs:
        task['taskid'] = kwargs.pop('taskid')
    else:
        task['taskid'] = self.get_taskid(task)

    # Anything still left in kwargs was never a recognised option.
    if kwargs:
        raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys())

    cache_key = "%(project)s:%(taskid)s" % task
    if cache_key not in self._follows_keys:
        self._follows_keys.add(cache_key)
        self._follows.append(task)
    return task