def build_dom(self, curr_url, root, required_link=False):
    """Wrap every element of an lxml tree in a Node, in document order.

    Args:
        curr_url: URL the tree was rendered from. Not referenced in this
            body — kept for interface compatibility with callers.
        root: lxml element whose subtree (root.iter()) is walked.
        required_link: when truthy, additionally wire up Node.parent and
            Node.children by matching each wrapped element against its
            lxml parent via Node.is_same.

    Returns:
        list of Node wrappers, one per element, in iteration order.
    """
    node_list = []
    for element in root.iter():
        node = Node()
        node.el = element
        # Normalize leading/trailing whitespace in place.
        # NOTE(review): this mutates the underlying lxml tree — confirm
        # callers do not rely on the original text.
        if node.el.text is not None:
            node.el.text = node.el.text.strip()
        # Depth of the element = number of ancestors above it.
        node.parent_count += sum(1 for _ in node.el.iterancestors())
        node_list.append(node)
    if required_link:
        # O(n^2) matching is deliberate: Node.is_same is an equivalence
        # test on elements, so an identity-keyed dict lookup could change
        # which parent wins — kept as-is.
        for p_node in node_list:
            for c_node in node_list:
                if Node.is_same(p_node.el, c_node.el.getparent()):
                    c_node.parent = p_node
                    p_node.children.append(c_node)
    return node_list
def test_get_encoded_str(self):
    """Encoding 'ABB' against a two-leaf tree yields the bit string '100'."""
    data = 'ABB'
    leaf_a = Node('A', 1)
    leaf_b = Node('B', 2)
    huffman_root = Node('', 3, leaf_b, leaf_a)
    assert utility.get_encoded_str(huffman_root, data) == '100'
def test_get_decoded_str(self):
    """Decoding the encoded form of 'aab' round-trips to the original text."""
    data = 'aab'
    leaf_a = Node('a', 2)
    leaf_b = Node('b', 1)
    huffman_root = Node('', 3, leaf_b, leaf_a)
    encoded = utility.get_encoded_str(huffman_root, data)
    assert utility.get_decoded_str(huffman_root, encoded) == data
def test_get_codes(self):
    """Codes from a 3-leaf tree: deeper leaves receive longer bit strings."""
    leaf_a = Node('a', 1)
    leaf_b = Node('b', 2)
    inner = Node('', 3, leaf_a, leaf_b)
    leaf_c = Node('c', 4)
    huffman_root = Node('', 7, inner, leaf_c)
    codes = utility.get_codes(huffman_root)
    assert type(codes) is dict
    expected = {'a': '00', 'b': '01', 'c': '1'}
    for char, code in expected.items():
        assert char in codes
        assert codes[char] == code
def create_queue_from_frequencies(frequencies):
    """Load a PriorityQueue with one Node per (char, freq) entry, keyed by frequency.

    Entries are (freq, Node) tuples so the queue pops lowest frequency first.
    NOTE(review): when two entries share a frequency, tuple comparison falls
    through to comparing the Node objects — confirm Node defines ordering.
    """
    heap = PriorityQueue()
    for char, freq in frequencies.items():
        node = Node(char, freq)
        heap.put((node.freq, node))
    return heap
def create_nodes_from_frequencies(frequencies):
    """Return Nodes built from a frequency table, sorted by (freq, char) descending."""
    nodes = [Node(char, freq) for char, freq in frequencies.items()]
    # Descending by frequency, ties broken by character (also descending).
    nodes.sort(key=attrgetter('freq', 'char'), reverse=True)
    return nodes
def build_data_record_tree(self, dr):
    """Convert a list of generalized nodes into a queue of data-record Trees.

    The first generalized node's data_record_indicator decides how ALL
    records are built (di is only assigned when i == 0):
      - SELF: each generalized node is a record of its own.
      - CHILD_CONT: each child of a node is a separate record.
      - CHILD_NON_CONT: records are assembled column-wise from children at
        the same index across the group's nodes.
    Multi-node groups (g.size() > 1) are wrapped under a synthetic "p" root
    and processing stops after the first such group (note the break calls).
    """
    # data_record_queue = queue.PriorityQueue()
    data_record_queue = []
    di = None
    for i in range(0, len(dr), 1):
        g = dr[i]
        if i == 0:
            # Indicator is taken from the first group only.
            di = g.data_record_indicator
        if g.size() == 1:
            if di == GeneralizedNode.SELF:
                # The single node itself is one data record.
                data_record_queue.append(Tree(g.get(0)))
            elif di == GeneralizedNode.CHILD_CONT:
                # Each contiguous child is its own record.
                for child in g.get(0).children:
                    data_record_queue.append(Tree(child))
            elif di == GeneralizedNode.CHILD_NON_CONT:
                # Non-contiguous children of a single node: nothing to emit.
                pass
        else:
            # Multi-node group: records get a synthetic "p" root.
            tag_tree = Tree(Node("p"))
            if di == GeneralizedNode.SELF:
                # All nodes of the group form a single record.
                for node in g.get_nodes():
                    tag_tree.get_root().children.append(node)
                data_record_queue.append(tag_tree)
                break
            elif di == GeneralizedNode.CHILD_CONT:
                break
            elif di == GeneralizedNode.CHILD_NON_CONT:
                # Column-wise: the j-th child of every node in the group
                # together forms one record.
                for j in range(0, len(g.get(0).children), 1):
                    tag_tree = Tree(Node("p"))
                    for tag_node in g.get_nodes():
                        tag_tree.get_root().children.append(
                            tag_node.children[j])
                    data_record_queue.append(tag_tree)
                break
    return data_record_queue
def build_tree(nodes):
    """Recursively fold a node list into a binary tree.

    A single node is returned unchanged; otherwise the list is partitioned
    at split() and the halves become left/right subtrees under a fresh
    internal node whose frequency is the sum of its children's.
    """
    if len(nodes) == 1:
        return nodes[0]
    cut = split(nodes)
    left_subtree = build_tree(nodes[:cut])
    right_subtree = build_tree(nodes[cut:])
    return Node('', left_subtree.freq + right_subtree.freq,
                left_subtree, right_subtree)
def build_tree(frequencies):
    """Build a Huffman tree from a char->frequency mapping and return its root.

    NOTE(review): queue entries are (freq, Node); ties fall through to
    comparing Nodes — confirm Node defines ordering.
    """
    pending = create_queue_from_frequencies(frequencies)
    # Repeatedly merge the two lowest-frequency subtrees until one remains.
    while pending.qsize() > 1:
        _, left = pending.get()
        _, right = pending.get()
        merged = Node('', left.freq + right.freq, left, right)
        pending.put((merged.freq, merged))
    return pending.get()[1]
def pre_order(self, node, traverse_list):
    """Pre-order traversal: record node (deduplicated via Node.is_same),
    then recurse into each of its children."""
    if not any(Node.is_same(seen.el, node.el) for seen in traverse_list):
        traverse_list.append(node)
    for child in node.children:
        self.pre_order(child, traverse_list)
def find_recordN(self, generalized_node):
    """Classify a generalized node by comparing its tag nodes' children.

    Walks the generalized node two tag-nodes at a time; when consecutive
    children compare equal (case-insensitive pre-order string), the node is
    marked CHILD_NON_CONT, otherwise SELF.
    """
    gi = iter(generalized_node)
    similar_children = False
    # Dummy non-None value so the first loop iteration runs.
    next_node = Node()
    while next_node != None:
        next_node = next(gi, None)
        if next_node == None:
            break
        tag_node = next_node
        next_node = next(gi, None)
        if next_node != None:
            # NOTE(review): uses tag_node.next_sibling rather than the
            # just-fetched next_node — confirm these are the same node.
            next_tag_node = tag_node.next_sibling
            if len(tag_node.children) != len(next_tag_node.children):
                similar_children = False
                break
            child_iter = iter(tag_node.children)
            next_child_node = next(child_iter, None)
            # NOTE(review): this `== None` guard looks inverted — as
            # written the pairwise comparison below only runs when the
            # child list is empty (child is bound to None); suspect the
            # intent was `!= None`. Confirm against the original layout.
            if next_child_node == None:
                child = next_child_node
                next_child_node = next(child_iter, None)
                if next_child_node == None:
                    similar_children = True
                else:
                    next_child = next_child_node
                    if "".join(child.to_preorder_string()).lower() == "".join(
                            next_child.to_preorder_string()).lower():
                        similar_children = True
                    # Slide the (child, next_child) window over the rest of
                    # the children; any mismatch ends the scan.
                    while next_child_node != None:
                        next_child_node = next(child_iter, None)
                        if next_child_node == None:
                            break
                        child = next_child
                        next_child = next_child_node
                        if "".join(child.to_preorder_string()).lower(
                        ) == "".join(
                                next_child.to_preorder_string()).lower():
                            similar_children = True
                        else:
                            similar_children = False
                            break
    if similar_children == True:
        generalized_node.data_record_indicator = GeneralizedNode.CHILD_NON_CONT
    else:
        generalized_node.data_record_indicator = GeneralizedNode.SELF
def start_diff_pages(self, curr_url):
    """Find content nodes unique to curr_url by diffing it against similar pages.

    Pipeline: render curr_url, collect candidate reference links (same query
    keys, falling back to edit distance), build DOMs, count nodes duplicated
    between current and reference pages, and return the nodes that appear
    only on the current page.

    NOTE(review): if no reference link qualifies, the loop body never runs
    and the final `return result_node_list` raises NameError — confirm
    callers guarantee at least one candidate.
    """
    curr_time = time.time()
    print("Rendering page for current url...")
    print(curr_url)
    session = HTMLSession()
    r = session.get(curr_url)
    # Executes JavaScript so the DOM matches what a browser would show.
    r.html.render()
    root = r.html.lxml
    print("Done in {}s.".format(str(round(time.time() - curr_time, 2))))
    curr_time = time.time()
    print("Finding similar pages by queries and edit distance...")
    link_list = r.html.absolute_links
    link_dict_list = []
    curr_url_qs = urlparse.parse_qs(urlparse.urlparse(curr_url).query)
    # Pass 1: keep links whose query string contains every key of the
    # current URL's query (but differs in at least one value).
    for link in link_list:
        link_qs = urlparse.parse_qs(urlparse.urlparse(link).query)
        has_same_query = True
        # if len(curr_url_qs) <= len(link_qs) and curr_url != link:
        # if curr_url != link and len(curr_url) != len(link):
        if curr_url != link:
            for key, value in curr_url_qs.items():
                if key not in link_qs:
                    has_same_query = False
                    break
        else:
            has_same_query = False
        if has_same_query:
            is_valid = True
            # Same-length URL with identical query values is effectively
            # the same page — reject it.
            if len(curr_url) == len(link):
                is_all_same = True
                for key, value in curr_url_qs.items():
                    if link_qs[key] != curr_url_qs[key]:
                        is_all_same = False
                        break
                if is_all_same:
                    is_valid = False
            if is_valid:
                link_dict_list.append({"url": link, "query": link_qs})

    # Shortest URLs first.
    def sort_link(a):
        return len(a["url"])

    link_dict_list.sort(key=sort_link)
    # Pass 2 (fallback): no query-compatible links — rank all links by
    # edit distance from the current URL instead.
    if len(link_dict_list) == 0:
        link_dict_list = []
        mdr_util = MDRUtil()
        for link in link_list:
            # NOTE(review): method name looks garbled — presumably a
            # Levenshtein distance; verify against MDRUtil.
            edit_distance = decimal.Decimal(
                mdr_util.xlevenshte_in_distance(curr_url, link))
            if edit_distance != 0:
                link_dict_list.append({
                    "url": link,
                    "edit_distance": edit_distance
                })

        # Closest URLs first (redefines the sort key for this branch).
        def sort_link(a):
            return a["edit_distance"]

        link_dict_list.sort(key=sort_link)
    # self.write_query(curr_url, link_dict_list)
    # self.write_info(curr_url, curr_node_list)
    # # print("Reference url: ", link_dict_list[0]["url"])
    print("Done in {}s.".format(str(round(time.time() - curr_time, 2))))
    curr_time = time.time()
    print("Building DOM for current page...")
    # True: also link parent/child Node relationships.
    curr_node_list = self.build_dom(curr_url, root, True)
    root_node = curr_node_list[0]
    print("Done in {}s.".format(str(round(time.time() - curr_time, 2))))
    have_duplicated_found = False
    # Try reference pages in ranked order; stop at the first one that
    # yields unique (non-duplicated) content nodes.
    for i in range(0, len(link_dict_list), 1):
        curr_time = time.time()
        print("Rendering page for reference url...")
        print(link_dict_list[i]["url"])
        session = HTMLSession()
        r = session.get(link_dict_list[i]["url"])
        r.html.render()
        refer_root = r.html.lxml
        print("Done in {}s.".format(str(round(time.time() - curr_time, 2))))
        curr_time = time.time()
        print("Building DOM for reference page...")
        refer_node_list = self.build_dom(link_dict_list[i]["url"],
                                         refer_root)
        # self.write_info(link_dict_list[1]["url"], refer_node_list)
        print("Done in {}s.".format(str(round(time.time() - curr_time, 2))))
        curr_time = time.time()
        print("Calculating duplicated count for each node...")
        # A node present (by Node.is_same) on the reference page is
        # boilerplate, not page-specific content.
        for curr in curr_node_list:
            for refer in refer_node_list:
                # if curr.el.text == refer.el.text and curr.el.tag == refer.el.tag:
                if Node.is_same(curr.el, refer.el):
                    curr.duplicate_count = curr.duplicate_count + 1
                    break
        print("Done in {}s.".format(str(round(time.time() - curr_time, 2))))
        curr_time = time.time()
        print("Searching for unique nodes...")
        result_node_list = []
        for curr in curr_node_list:
            # Unique = never matched on the reference page, has visible
            # text, and is not an unwanted tag (script/style etc.).
            if curr.duplicate_count == 0 and curr.el.text != None and curr.el.text != "" and curr.el.tag not in const.UNWANTED_TAGS:
                # Commented, invalid html tags are not allowed to set attributes
                print(curr.el.tag, " ", curr.el.text)
                try:
                    have_duplicated_found = True
                    # curr.el.set("fyp-web-miner", "content")
                    curr.is_content = True
                    curr.iter_parent = curr.parent
                    result_node_list.append(curr)
                    # iter_node = curr
                    # while iter_node.parent != None:
                    # iter_node = iter_node.parent
                    # if iter_node.data_regions != None and len(iter_node.data_regions) > 0:
                    # iter_node.is_content_group = True
                    # elif iter_node.is_content != True:
                    # iter_node.is_content_holder = True
                except TypeError as e:
                    print("Skipped, can't set attributes for tag: ",
                          curr.el.tag, " text: ", curr.el.text)
        print("Done in {}s.".format(str(round(time.time() - curr_time, 2))))
        if have_duplicated_found:
            break
    return result_node_list
# print(unique_node_list[i].el.tag ," ", unique_node_list[i].el.text) curr_time = time.time() print("Grouping nodes based on the highest level parent...") same_level_node_list = [] highest_nodes = [] for node in unique_node_list: is_found = False for node_list in same_level_node_list: if node_list[ 0].parent_count == node.parent_count and Node.is_same_without_text( node_list[0].el, node.el): is_found = True node_list.append(node) if not is_found: same_level_node_list.append([node]) for node_list in same_level_node_list: next_parent = node_list[0].parent while True: is_all_same_parent = True if next_parent == None: break for node in node_list: