def addLink(self, url, anchor_text, tag, description=""):
    """Register a link under its URL in ``self.links``.

    ``self.links`` maps a URL to the list of ``LinkURL`` objects recorded
    for it; every call appends one more entry, so the same URL may
    accumulate several entries.

    Args:
        url: Target of the link. If it does not start with ``"http"`` it is
            passed through ``makePerfectURL(url, self.base_url)`` first
            (presumably resolving it against the base URL -- confirm
            against that helper).
        anchor_text: Anchor text, forwarded to ``LinkURL``.
        tag: Tag value, forwarded to ``LinkURL``.
        description: Optional text stored on the new link's
            ``description`` attribute.
    """
    if not url.startswith("http"):
        url = makePerfectURL(url, self.base_url)

    links_by_url = self.links
    # Create the bucket before building the link so the key exists even if
    # the LinkURL constructor fails.
    if url not in links_by_url:
        links_by_url[url] = list()

    # NOTE(review): the meaning of the "IN" marker and of the empty trailing
    # argument is defined by LinkURL, which is not visible here.
    entry = LinkURL(url, tag, "IN", anchor_text, "")
    entry.description = description
    links_by_url[url].append(entry)
sub_dict[c_rule.field] = t_text except Exception, msg: getLogger().error(msg) ret_list.append(sub_dict) parsing_result[rule.field] = ret_list else: r_node = ret_tree.getNode(rule.type, rule.value, rule.offset) if r_node: if rule.field == "imageArea": parsing_result["imgs"] = list() res = r_node.getTextHtmlWithPosition() core, rest, links, imgs, core_len, t_text_list = res for img in imgs: if img[0] == "#" or img.lower().find("mailto") >= 0 or img.lower().find("javascript:") >= 0: continue parsing_result["imgs"].append(makePerfectURL(img, url)) parsing_result["imageCount"] = len(parsing_result["imgs"]) elif rule.field == "body": # body에 offset이 직접 붙으면 시작점도 변경 sid = 0 eid = 100000000 for c_idx, c_rule in rule.children_rules.items(): if c_rule.type == "r_offset" and c_rule.field == "body_start": sid = r_node.id + int(c_rule.value) else: t_node = ret_tree.getNode(c_rule.type, c_rule.value, c_rule.offset) if t_node: if c_rule.field == "body_end": eid = t_node.id elif c_rule.field == "body_start":