def text(self, regex=None, strip=True, separator=""):
    """Return the text of the selected node, optionally validated by a regex.

    Args:
        regex: Pattern the text must satisfy. It is applied with
            ``re.match``, so it is anchored at the start of the text only.
            ``None`` disables validation.
        strip: Forwarded as the second positional argument of the root
            node's ``get_text``.
        separator: Forwarded as the first positional argument of the root
            node's ``get_text``.

    Returns:
        The node text, or the configured default when no node was found.

    Raises:
        errors.TaskError: If no node was found and no default is configured,
            or if the text does not satisfy ``regex``.
    """
    node = self._root
    if node is None:
        if self._default is not ArgDefault:
            # A default value is returned as-is; it is not regex-validated.
            return self._default
        raise errors.TaskError(f"未找到:{self!r}")

    content = node.get_text(separator, strip)
    if regex is not None and not re.match(regex, content):
        raise errors.TaskError(f"未通过正则校验:{regex}")
    return content
def text(self, regex=None, strip=True):
    """Return the ``text`` attribute of the selected node, optionally validated.

    Args:
        regex: Pattern the text must satisfy. It is applied with
            ``re.match``, so it is anchored at the start of the text only.
            ``None`` disables validation.
        strip: When true, surrounding whitespace is removed from the text.

    Returns:
        The node text (``''`` when the node's ``text`` is ``None``), or the
        configured default when no node was found.

    Raises:
        errors.TaskError: If no node was found and no default is configured,
            or if the text does not satisfy ``regex``.
    """
    node = self._root
    if node is None:
        if self._default is not ArgDefault:
            # A default value is returned as-is; it is not regex-validated.
            return self._default
        raise errors.TaskError(f"未找到{self!r}")

    content = node.text
    if content is None:
        # Normalise a missing text to an empty string before strip/validation.
        content = ''
    if strip:
        content = content.strip()

    if regex is not None and not re.match(regex, content):
        raise errors.TaskError(f"未通过正则校验:{regex}")
    return content
def html(self):
    """Return the selected node rendered with ``str()``.

    Returns:
        ``str(self._root)`` when a node was found, otherwise the configured
        default value.

    Raises:
        errors.TaskError: If no node was found and no default is configured.
    """
    node = self._root
    if node is not None:
        return str(node)
    if self._default is not ArgDefault:
        # A default value is returned as-is, without any format check.
        return self._default
    raise errors.TaskError(f"未找到:{self!r}")
def __init__(self, root, pattern="/*"):
    """Build the selector from an element, an HTML string, or ``None``.

    Args:
        root: An ``etree._Element`` or ``None`` (stored unchanged), or a
            ``str`` that is parsed with ``etree.HTML``.
        pattern: Expression stored on the instance as ``_pattern``.

    Raises:
        errors.TaskError: If ``root`` is of any other type.
    """
    if isinstance(root, str):
        parsed = etree.HTML(root)
    elif root is None or isinstance(root, etree._Element):
        parsed = root
    else:
        raise errors.TaskError(f"不支持从'{type(root)}'类型构造XPath")

    self._root = parsed
    self._pattern = pattern
    # ArgDefault is the sentinel meaning "no default value configured".
    self._default = ArgDefault
def __init__(self, root, pattern=":root"):
    """Build the selector from a tag, an HTML string, or ``None``.

    Args:
        root: An ``element.Tag`` or ``None`` (stored unchanged), or a ``str``
            that is parsed with ``BeautifulSoup(root, "lxml")``.
        pattern: Expression stored on the instance as ``_pattern``.

    Raises:
        errors.TaskError: If ``root`` is of any other type.
    """
    if isinstance(root, str):
        parsed = BeautifulSoup(root, "lxml")
    elif root is None or isinstance(root, element.Tag):
        parsed = root
    else:
        raise errors.TaskError(f"不支持从'{type(root)}'类型构造CSS")

    self._root = parsed
    self._pattern = pattern
    # ArgDefault is the sentinel meaning "no default value configured".
    self._default = ArgDefault
def execute(task: Task):
    """Run a task instance through its hooks and handle the exceptions it raises.

    The hooks are called in this order: ``on_download``, ``on_parse``,
    ``on_link``, ``on_save``, ``on_finish``. The download and parse results
    are stored on ``task.response`` and ``task.result``.

    Returns:
        dict: ``{priority: urls}``. A list returned by ``on_link`` is filed
        under priority ``3`` and ``None`` becomes ``{}``. Every exception
        path handled here returns ``{}``.
    """
    try:
        task.tracking.incr('on_download')
        task.response = task.on_download()
        task.tracking.incr('on_download_ok')
        task.result = task.on_parse()

        # Normalise whatever on_link produced into a {priority: urls} dict.
        found = task.on_link()
        if found is None:
            links = {}
        elif isinstance(found, list):
            links = {3: found}
        elif isinstance(found, dict):
            links = found
        else:
            raise errors.TaskError(f"on_link返回值应是list或dict型,而非{type(found)}")

        task.on_save()
        task.on_finish()
    except errors.TaskFinish:
        # The task asked to stop early: still run the finish hook.
        logger.debug("TaskFinish", task.url)
        task.on_finish()
        return {}
    except errors.TaskBreak as exc:
        # The task asked to be deferred: put its URL back with the given priority.
        logger.debug("TaskBack", exc.priority, task.url)
        task._queue.insert(task.url, exc.priority)
        return {}
    except errors.TaskError as exc:
        task._queue.report_error(exc.__class__.__name__, task.url)
        logger.warning("Task报告的异常", str(exc), task.url)
        return {}
    except Exception as exc:
        # Give the task a chance to handle it; a truthy result means "handled".
        handled = task.on_error(exc)
        if not handled:
            task._queue.report_error("unknown", task.url)
            logger.error("Task未处理的异常", "unknown", task.url)
            traceback.print_exc()
        return {}
    else:
        return links
def interval(self, value):
    """Store ``value`` as the spider's ``"interval"`` field.

    Args:
        value: The interval; must be an ``int`` or ``float``.

    Raises:
        errors.TaskError: If ``value`` is neither ``int`` nor ``float``.
    """
    if isinstance(value, (int, float)):
        self._spider.set_field("interval", value)
        return
    raise errors.TaskError("interval应为int或float型")
def timeout(self, value):
    """Store ``value`` as the spider's ``"timeout"`` field.

    Args:
        value: The timeout; must be an ``int`` or ``float``.

    Raises:
        errors.TaskError: If ``value`` is neither ``int`` nor ``float``.
    """
    if isinstance(value, (int, float)):
        self._spider.set_field("timeout", value)
        return
    raise errors.TaskError("timeout应为int或float型")