def extract_content(self, param, clean=None, before=None, after=None):
    """Build the content list from the ``div#eData > dl`` entries of the page.

    For every ``dl`` entry the text of its first ``dd`` is treated as an
    image URL and the text of its fifth ``dd`` as the accompanying
    paragraph. An entry contributes nothing unless that URL starts with
    ``http`` and ends with ``.jpg``; the paragraph is added only when the
    fifth ``dd`` exists and yields non-empty text.

    :param param: accepted for signature compatibility; not used here
    :param clean: accepted for signature compatibility; not used here
    :param before: accepted for signature compatibility; not used here
    :param after: accepted for signature compatibility; not used here
    :return: result of ``self.clean_content`` applied to the collected
        ``{"tag": "img", ...}`` / ``{"tag": "p", ...}`` dicts
    """

    def select_query(css):
        # Every lookup in this method uses the same "select" query shape.
        return {"params": {"selector": css}, "method": "select"}

    image_query = select_query("dd:nth-of-type(1)")
    text_query = select_query("dd:nth-of-type(5)")

    blocks = []
    for entry in find_tags(self._soup, select_query("div#eData > dl")):
        image_cell = find_tag(entry, image_query)
        if not image_cell:
            continue
        url = extract_tag_attribute(image_cell, "text")
        if not url or not url.startswith("http") or not url.endswith(".jpg"):
            continue
        blocks.append({"tag": "img", "src": url})

        # The image is kept even when no usable caption follows it.
        text_cell = find_tag(entry, text_query)
        if not text_cell:
            continue
        caption = extract_tag_attribute(text_cell, "text")
        if not caption:
            continue
        blocks.append({"tag": "p", "text": caption.replace("<br />", " ").strip()})

    return self.clean_content(blocks)
def extract_content(self, param, clean=None, before=None, after=None):
    """Parse the content found under one or more root tags.

    :param param: ``None`` to use ``self._find_content_tag()``, a single
        ``find_tag`` query, or a list of such queries (roots that are not
        found or that are already collected are skipped)
    :param clean: optional argument forwarded to ``self._clean_content``
    :param before: optional argument forwarded to
        ``self._clean_content_before``
    :param after: optional argument forwarded to
        ``self._clean_content_after``
    :return: result of ``self.clean_content`` applied to everything parsed
        from the roots, in root order
    """
    if param is None:
        candidates = [self._find_content_tag()]
    elif isinstance(param, list):
        candidates = []
        for query in param:
            found = find_tag(self._soup, query)
            if found is not None and found not in candidates:
                candidates.append(found)
    else:
        candidates = [find_tag(self._soup, param)]

    pieces = []
    for node in candidates:
        if node is None:
            continue
        # Trimming order is fixed: before, then after, then generic clean.
        if before is not None:
            self._clean_content_before(node, before)
        if after is not None:
            self._clean_content_after(node, after)
        if clean is not None:
            self._clean_content(node, clean)
        # <textarea> roots go through their own parser.
        parser = self.parse_text_area if node.name == "textarea" else self.parse_content
        pieces.extend(parser(node))

    return self.clean_content(pieces)
def judge_missing(self, param):
    """Report whether any of the given queries matches a tag in ``self.soup``.

    :param param: ``None``, a single ``find_tag`` query, or a list of
        queries
    :return: ``False`` when ``param`` is ``None`` or nothing matches,
        ``True`` as soon as one query yields a truthy tag
    """
    if param is None:
        return False
    if isinstance(param, list):
        # Lazy generator: stops at the first match, as the explicit loop did.
        return any(find_tag(self.soup, query) for query in param)
    return bool(find_tag(self.soup, param))
def _clean_content_after(self, root, param):
    """Detach the tag matched by ``param`` and every sibling that follows it.

    Does nothing when ``find_tag`` finds no match under ``root``.

    :param root: node searched with ``find_tag``
    :param param: query describing the marker tag
    """
    marker = find_tag(root, param)
    if marker is None:
        return
    # Materialise the siblings first so that no node is detached while the
    # sibling generator is still being consumed.
    for node in list(marker.next_siblings):
        node.extract()
    marker.extract()
def _clean_content_before(self, root, param):
    """Detach the tag matched by ``param`` and every sibling that precedes it.

    Does nothing when ``find_tag`` finds no match under ``root``.

    :param root: node searched with ``find_tag``
    :param param: query describing the marker tag
    """
    marker = find_tag(root, param)
    if marker is None:
        return
    # Materialise the siblings first so that no node is detached while the
    # sibling generator is still being consumed.
    for node in list(marker.previous_siblings):
        node.extract()
    marker.extract()
def clean_author(self, author):
    """Remove the page's author/time ``span`` text from ``author``.

    The text of the first tag matching
    ``div#news_template_03_AuthorAndTime > span`` in ``self.soup`` is
    deleted from ``author`` wherever it occurs.

    :param author: author string to clean
    :return: ``author`` with the span text removed; ``author`` unchanged
        when the span is not found or yields no text
    """
    param = {
        "method": "select",
        "params": {
            "selector": "div#news_template_03_AuthorAndTime > span"
        },
    }
    tag = find_tag(root=self.soup, param=param)
    if tag is None:
        # No such span on this page: there is nothing to strip.
        return author
    text = extract_tag_attribute(root=tag)
    if not text:
        # str.replace() rejects None, and replacing "" is a no-op anyway.
        return author
    return author.replace(text, "")
def _extract_tag(root, param):
    """Locate a tag under ``root`` and return one attribute of it.

    ``param`` is passed unchanged to ``find_tag``. Its optional
    ``"attribute"`` entry names the attribute to read; ``"text"`` is used
    when the entry is absent or ``None``. Example shape (carried over from
    the original docstring)::

        param = {
            "method": "find_all",
            "params": {},
            "nth": 0,
            "attribute": "text",
        }

    :param root: node handed to ``find_tag`` as the search root
    :param param: lookup description, see above
    :return: whatever ``extract_tag_attribute`` yields for the located tag
    """
    located = find_tag(root, param)
    wanted = param.get("attribute")
    return extract_tag_attribute(located, "text" if wanted is None else wanted)
def find_tag_extract_attribute(root, params):
    """Find a tag under ``root`` and read one attribute from it.

    :param root: node handed to ``find_tag`` as the search root
    :param params: ``find_tag`` query; its ``"attribute"`` entry selects
        the attribute to read and defaults to ``"text"`` when missing
    :return: the value produced by ``extract_tag_attribute``, or ``None``
        when ``find_tag`` yields nothing truthy
    """
    found = find_tag(root, params)
    if found:
        attribute_name = params.get("attribute", "text")
        return extract_tag_attribute(found, attribute_name)
    return None