Example #1
0
 def addLink(self, url, anchor_text, tag, description=""):
     """Register one link in ``self.links``, grouped by its URL.

     A ``url`` that does not start with "http" is first rewritten by
     ``makePerfectURL(url, self.base_url)``; the resulting string is the
     key under which the link is stored. Every call appends a new
     ``LinkURL`` object to the list kept for that key, so repeated URLs
     accumulate rather than overwrite.

     url         -- link target as found in the page
     anchor_text -- passed to ``LinkURL`` as its fourth argument
     tag         -- passed to ``LinkURL`` as its second argument
     description -- stored on the new link's ``description`` attribute
     """
     target = url if url.startswith("http") else makePerfectURL(url, self.base_url)
     # Make sure the per-URL list exists before the link object is built.
     bucket = self.links.setdefault(target, [])
     entry = LinkURL(target, tag, "IN", anchor_text, "")
     entry.description = description
     bucket.append(entry)
Example #2
0
										sub_dict[c_rule.field] = t_text
							except Exception, msg:
								getLogger().error(msg)
						ret_list.append(sub_dict)
				parsing_result[rule.field] = ret_list
			else:	
				r_node = ret_tree.getNode(rule.type, rule.value, rule.offset)
				if r_node:
					if rule.field == "imageArea":
						parsing_result["imgs"] = list()
						res = r_node.getTextHtmlWithPosition()
						core, rest, links, imgs, core_len, t_text_list = res
						for img in imgs:
							if img[0] == "#" or img.lower().find("mailto") >= 0  or img.lower().find("javascript:") >= 0:
								continue
							parsing_result["imgs"].append(makePerfectURL(img, url))
						parsing_result["imageCount"] = len(parsing_result["imgs"])
					elif rule.field == "body":    # body에 offset이 직접 붙으면 시작점도 변경 
						sid = 0
						eid = 100000000

						for c_idx, c_rule in rule.children_rules.items():

							if c_rule.type == "r_offset" and c_rule.field == "body_start":
								sid = r_node.id + int(c_rule.value)
							else:
								t_node = ret_tree.getNode(c_rule.type, c_rule.value, c_rule.offset)
								if t_node:
									if c_rule.field == "body_end":
										eid = t_node.id
									elif c_rule.field == "body_start":