Example No. 1
    def build_dom(self, curr_url, root, required_link=False):
        ''' Flatten the lxml tree into a list of Node wrappers and,
        optionally, link each wrapper to its parent and children. '''
        node_list = []

        for element in root.iter():
            node = Node()
            node.el = element

            if node.el.text is not None:
                node.el.text = node.el.text.strip()

            # Depth of the element, counted as the number of ancestors.
            for parent in node.el.iterancestors():
                node.parent_count += 1

            node_list.append(node)

        if required_link:
            # Re-create the parent/child relations between the wrappers.
            for p_node in node_list:
                for c_node in node_list:
                    if Node.is_same(p_node.el, c_node.el.getparent()):
                        c_node.parent = p_node
                        p_node.children.append(c_node)

        return node_list
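
The method only assumes an lxml element tree plus a Node wrapper exposing el, parent, parent_count, and children. A minimal sketch of how it might be driven; the MDR class name and the example URL are illustrative assumptions, not names from the source:

from lxml import html

# Hypothetical driver: `MDR` stands in for whichever class owns build_dom
# in the original project.
page = html.fromstring("<div><p>First</p><p>Second</p></div>")
miner = MDR()
nodes = miner.build_dom("http://example.com", page, required_link=True)

for n in nodes:
    print(n.el.tag, "depth:", n.parent_count, "children:", len(n.children))
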
    def test_get_encoded_str(self):
        data = 'ABB'
        n1 = Node('A', 1)
        n2 = Node('B', 2)
        root = Node('', 3, n2, n1)
        encoded_str = utility.get_encoded_str(root, data)
        assert encoded_str == '100'

    def test_get_decoded_str(self):
        data = 'aab'
        n1 = Node('a', 2)
        n2 = Node('b', 1)
        root = Node('', 3, n2, n1)
        encoded_str = utility.get_encoded_str(root, data)
        decoded_str = utility.get_decoded_str(root, encoded_str)
        assert decoded_str == data
    def test_get_codes(self):
        child1 = Node('a', 1)
        child2 = Node('b', 2)
        child3 = Node('', 3, child1, child2)
        child4 = Node('c', 4)
        root = Node('', 7, child3, child4)

        codes = utility.get_codes(root)
        assert type(codes) is dict
        assert 'a' in codes.keys()
        assert 'b' in codes.keys()
        assert 'c' in codes.keys()
        assert codes['a'] == '00'
        assert codes['b'] == '01'
        assert codes['c'] == '1'
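
The tests above build nodes as Node(char, freq) and Node('', freq, left, right), and Example No. 6 sorts them by .freq and .char. Below is a minimal Node sketch consistent with that usage; the __lt__ tie-breaker is an assumption (not shown in the source) so that (freq, node) tuples stay orderable inside a PriorityQueue when frequencies collide:

class Node:
    ''' Minimal sketch of the Huffman tree node the snippets appear to use. '''

    def __init__(self, char, freq, left=None, right=None):
        self.char = char    # character, or '' for internal nodes
        self.freq = freq    # frequency / subtree weight
        self.left = left
        self.right = right

    def __lt__(self, other):
        # Assumed tie-breaker for equal frequencies in a PriorityQueue.
        return self.char < other.char
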
from queue import PriorityQueue


def create_queue_from_frequencies(frequencies):
    ''' Create priority queue from frequency table '''
    q = PriorityQueue()
    for k, v in frequencies.items():
        n = Node(k, v)
        # Queue entries are (frequency, node) so the lowest-frequency
        # node is always returned first.
        q.put((n.freq, n))
    return q
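
A quick check of the queue builder with a toy frequency table, assuming a Node like the sketch above (one exposing .freq and .char):

q = create_queue_from_frequencies({'a': 3, 'b': 1})
freq, node = q.get()
print(freq, node.char)   # 1 b -- the lowest frequency comes out first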
Example No. 6
from operator import attrgetter


def create_nodes_from_frequencies(frequencies):
    ''' Create node list from frequency table and sort it
    by frequency, then by alphabet in descending order '''
    nodes = []
    for k, v in frequencies.items():
        nodes.append(Node(k, v))
    # Sort by frequency, then by character, both descending.
    nodes.sort(key=attrgetter('freq', 'char'), reverse=True)
    return nodes
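
For example, again assuming a Node exposing .char and .freq:

nodes = create_nodes_from_frequencies({'a': 1, 'b': 1, 'c': 2})
print([(n.char, n.freq) for n in nodes])   # [('c', 2), ('b', 1), ('a', 1)]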
Example No. 7
    def build_data_record_tree(self, dr):
        ''' Turn a list of generalized nodes into a queue of tag trees,
        one tree per data record. '''
        data_record_queue = []

        di = None

        for i, g in enumerate(dr):
            # The first generalized node decides how records are extracted.
            if i == 0:
                di = g.data_record_indicator

            if g.size() == 1:
                if di == GeneralizedNode.SELF:
                    data_record_queue.append(Tree(g.get(0)))
                elif di == GeneralizedNode.CHILD_CONT:
                    for child in g.get(0).children:
                        data_record_queue.append(Tree(child))
                elif di == GeneralizedNode.CHILD_NON_CONT:
                    pass
            else:
                tag_tree = Tree(Node("p"))

                if di == GeneralizedNode.SELF:
                    # All tag nodes of the generalized node form one record.
                    for node in g.get_nodes():
                        tag_tree.get_root().children.append(node)

                    data_record_queue.append(tag_tree)
                    break
                elif di == GeneralizedNode.CHILD_CONT:
                    break
                elif di == GeneralizedNode.CHILD_NON_CONT:
                    # The j-th child of every tag node forms one record.
                    for j in range(len(g.get(0).children)):
                        tag_tree = Tree(Node("p"))

                        for tag_node in g.get_nodes():
                            tag_tree.get_root().children.append(
                                tag_node.children[j])

                        data_record_queue.append(tag_tree)

                    break

        return data_record_queue
Example No. 8
def build_tree(nodes):
    ''' Build binary tree from node list '''
    length = len(nodes)
    if length == 1:
        return nodes[0]

    # `split` (defined elsewhere) returns the index at which to divide the
    # node list into the left and right subtrees.
    index = split(nodes)
    left = build_tree(nodes[:index])
    right = build_tree(nodes[index:])
    return Node('', left.freq + right.freq, left, right)
def build_tree(frequencies):
    ''' Build Huffman tree and return its root '''
    q = create_queue_from_frequencies(frequencies)
    while q.qsize() > 1:
        # Repeatedly merge the two lowest-frequency nodes into a new
        # internal node until only the root remains.
        left = q.get()[1]
        right = q.get()[1]
        new_node = Node('', left.freq + right.freq, left, right)
        q.put((new_node.freq, new_node))

    root = q.get()[1]
    return root
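
Putting the pieces together, a hedged end-to-end sketch that assumes the priority-queue build_tree above plus the utility helpers (get_codes, get_encoded_str, get_decoded_str) exercised by the tests earlier in this listing:

from collections import Counter

data = 'abracadabra'
frequencies = Counter(data)       # {'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1}
root = build_tree(frequencies)    # priority-queue variant above
codes = utility.get_codes(root)   # per-character bit strings
encoded = utility.get_encoded_str(root, data)
assert utility.get_decoded_str(root, encoded) == data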
Example No. 10
    def pre_order(self, node, traverse_list):
        ''' Pre-order traversal that appends each distinct node only once. '''
        is_found = False

        for visited in traverse_list:
            if Node.is_same(visited.el, node.el):
                is_found = True
                break

        if not is_found:
            traverse_list.append(node)

        for child in node.children:
            self.pre_order(child, traverse_list)
Example No. 11
    def find_recordN(self, generalized_node):
        ''' Mark the generalized node's records as its tag nodes' children
        (CHILD_NON_CONT) when those children look similar to each other,
        otherwise as the tag nodes themselves (SELF). '''
        gi = iter(generalized_node)

        similar_children = False

        next_node = Node()

        while next_node is not None:
            next_node = next(gi, None)

            if next_node is None:
                break

            tag_node = next_node

            next_node = next(gi, None)

            if next_node is not None:
                next_tag_node = tag_node.next_sibling

                # Sibling tag nodes must have the same number of children.
                if len(tag_node.children) != len(next_tag_node.children):
                    similar_children = False
                    break

            child_iter = iter(tag_node.children)

            next_child_node = next(child_iter, None)

            if next_child_node is not None:
                child = next_child_node

                next_child_node = next(child_iter, None)

                if next_child_node is None:
                    # A single child is trivially similar.
                    similar_children = True
                else:
                    next_child = next_child_node

                    # Compare consecutive children by their pre-order strings.
                    if ("".join(child.to_preorder_string()).lower() ==
                            "".join(next_child.to_preorder_string()).lower()):
                        similar_children = True

                        while next_child_node is not None:
                            next_child_node = next(child_iter, None)

                            if next_child_node is None:
                                break

                            child = next_child
                            next_child = next_child_node

                            if ("".join(child.to_preorder_string()).lower() ==
                                    "".join(next_child.to_preorder_string()
                                            ).lower()):
                                similar_children = True
                            else:
                                similar_children = False
                                break

        if similar_children:
            generalized_node.data_record_indicator = GeneralizedNode.CHILD_NON_CONT
        else:
            generalized_node.data_record_indicator = GeneralizedNode.SELF
Example No. 12
    def start_diff_pages(self, curr_url):
        curr_time = time.time()

        print("Rendering page for current url...")
        print(curr_url)

        session = HTMLSession()
        r = session.get(curr_url)

        r.html.render()
        root = r.html.lxml

        print("Done in {}s.".format(str(round(time.time() - curr_time, 2))))
        curr_time = time.time()

        print("Finding similar pages by queries and edit distance...")

        link_list = r.html.absolute_links

        link_dict_list = []

        curr_url_qs = urlparse.parse_qs(urlparse.urlparse(curr_url).query)

        for link in link_list:
            link_qs = urlparse.parse_qs(urlparse.urlparse(link).query)

            has_same_query = True

            # if len(curr_url_qs) <= len(link_qs) and curr_url != link:
            # if curr_url != link and len(curr_url) != len(link):
            if curr_url != link:
                for key, value in curr_url_qs.items():
                    if key not in link_qs:
                        has_same_query = False
                        break
            else:
                has_same_query = False

            if has_same_query:
                is_valid = True

                if len(curr_url) == len(link):
                    is_all_same = True

                    for key, value in curr_url_qs.items():
                        if link_qs[key] != curr_url_qs[key]:
                            is_all_same = False
                            break

                    if is_all_same:
                        is_valid = False

                if is_valid:
                    link_dict_list.append({"url": link, "query": link_qs})

        def sort_link(a):
            return len(a["url"])

        link_dict_list.sort(key=sort_link)

        if len(link_dict_list) == 0:
            # Fall back to edit distance when no link shares the current
            # url's query parameters.
            mdr_util = MDRUtil()

            for link in link_list:
                edit_distance = decimal.Decimal(
                    mdr_util.levenshtein_distance(curr_url, link))

                if edit_distance != 0:
                    link_dict_list.append({
                        "url": link,
                        "edit_distance": edit_distance
                    })

            def sort_link(a):
                return a["edit_distance"]

            link_dict_list.sort(key=sort_link)

        # self.write_query(curr_url, link_dict_list)

        # self.write_info(curr_url, curr_node_list)
        #
        # print("Reference url: ", link_dict_list[0]["url"])

        print("Done in {}s.".format(str(round(time.time() - curr_time, 2))))
        curr_time = time.time()

        print("Building DOM for current page...")

        curr_node_list = self.build_dom(curr_url, root, True)

        root_node = curr_node_list[0]

        print("Done in {}s.".format(str(round(time.time() - curr_time, 2))))

        have_duplicated_found = False
        result_node_list = []

        for i in range(len(link_dict_list)):
            curr_time = time.time()

            print("Rendering page for reference url...")

            print(link_dict_list[i]["url"])

            session = HTMLSession()
            r = session.get(link_dict_list[i]["url"])

            r.html.render()
            refer_root = r.html.lxml

            print("Done in {}s.".format(str(round(time.time() - curr_time,
                                                  2))))
            curr_time = time.time()

            print("Building DOM for reference page...")

            refer_node_list = self.build_dom(link_dict_list[i]["url"],
                                             refer_root)

            # self.write_info(link_dict_list[1]["url"], refer_node_list)

            print("Done in {}s.".format(str(round(time.time() - curr_time,
                                                  2))))
            curr_time = time.time()

            print("Calculating duplicated count for each node...")

            for curr in curr_node_list:
                for refer in refer_node_list:
                    # if curr.el.text == refer.el.text and curr.el.tag == refer.el.tag:
                    if Node.is_same(curr.el, refer.el):
                        curr.duplicate_count += 1
                        break

            print("Done in {}s.".format(str(round(time.time() - curr_time,
                                                  2))))
            curr_time = time.time()

            print("Searching for unique nodes...")

            result_node_list = []

            for curr in curr_node_list:
                if (curr.duplicate_count == 0 and curr.el.text is not None
                        and curr.el.text != ""
                        and curr.el.tag not in const.UNWANTED_TAGS):
                    # el.set() raises TypeError on invalid HTML tags, so the
                    # attribute write below stays commented out.
                    print(curr.el.tag, " ", curr.el.text)
                    try:
                        have_duplicated_found = True
                        # curr.el.set("fyp-web-miner", "content")
                        curr.is_content = True
                        curr.iter_parent = curr.parent

                        result_node_list.append(curr)

                        # iter_node = curr

                        # while iter_node.parent != None:
                        #     iter_node = iter_node.parent

                        #     if iter_node.data_regions != None and len(iter_node.data_regions) > 0:
                        #         iter_node.is_content_group = True
                        #     elif iter_node.is_content != True:
                        #         iter_node.is_content_holder = True

                    except TypeError as e:
                        print("Skipped, can't set attributes for tag: ",
                              curr.el.tag, " text: ", curr.el.text)

            print("Done in {}s.".format(str(round(time.time() - curr_time,
                                                  2))))

            if have_duplicated_found:
                break

        return result_node_list
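
For reference, the imports this method appears to rely on, read off the calls it makes (the enclosing class also needs build_dom from Example No. 1 and an MDRUtil helper):

import decimal
import time
import urllib.parse as urlparse

from requests_html import HTMLSession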
Example No. 13
    #                 print(unique_node_list[i].el.tag ," ", unique_node_list[i].el.text)

    curr_time = time.time()

    print("Grouping nodes based on the highest level parent...")

    same_level_node_list = []

    highest_nodes = []

    for node in unique_node_list:
        is_found = False

        for node_list in same_level_node_list:
            if (node_list[0].parent_count == node.parent_count
                    and Node.is_same_without_text(node_list[0].el, node.el)):
                is_found = True
                node_list.append(node)

        if not is_found:
            same_level_node_list.append([node])

    for node_list in same_level_node_list:
        next_parent = node_list[0].parent

        while True:
            is_all_same_parent = True

            if next_parent is None:
                break

            for node in node_list: