Beispiel #1
0
    def get_clustered_records(cls, doctree):
        """Flag similar nodes of ``doctree`` as records and return the merged groups.

        The tree is scanned level by level using the mapping returned by
        ``cls.bfs_tree``.  Levels run from the body's ``px`` value + 1 up to
        and including the value of ``ElementHelper.get_element_depth(root)``.
        Within a level every node is compared with a window of up to five
        nodes to its right, plus whatever iterating the nearest of them yields
        (presumably its children -- confirm).  The first similar pair found is
        flagged with ``kg_record_mark = '1'``.

        :param doctree: parsed document tree
        :return: whatever ``cls.merger_sibling_record_node(doctree)`` returns
        """
        nodes_by_level = cls.bfs_tree(doctree)

        root = ElementHelper.get_root(doctree)
        body = ElementHelper.get_body(doctree)

        # Bounds of the level scan (upper bound is exclusive).
        last_level = int(ElementHelper.get_element_depth(root)) + 1
        first_level = int(body.get(px)) + 1

        for level in range(first_level, last_level):
            # If a parent is a record node, its children are not considered.
            candidates = [node for node in nodes_by_level[level]
                          if not cls.is_node_or_ancestor_record(node)]

            for j in range(1, len(candidates) - 1):
                left_node = candidates[j - 1]
                # Horizontal comparison: at most five nodes to the right.
                neighbours = candidates[j:j + 5]
                # Vertical comparison: append what the nearest neighbour yields.
                neighbours = neighbours + list(neighbours[0])

                # Only the first similar node is paired with ``left_node``.
                match = next((node for node in neighbours
                              if cls.similar_check(left_node, node)), None)
                if match is not None:
                    left_node.set(kg_record_mark, '1')
                    match.set(kg_record_mark, '1')

        return cls.merger_sibling_record_node(doctree)
Beispiel #2
0
def get_aricle_cetd(doctree):
    """Return the text content of the page body after the CETD clean-up.

    ``cetd_parse`` is run on the whole tree; the body is then passed through
    ``CleanTreeByMark`` and ``RemoveAttribute`` before its text is extracted.

    :param doctree: parsed document tree (modified in place by the steps above)
    :return: result of ``ElementHelper.element_text_content`` for the body
    """
    cetd_parse(doctree)

    page_body = ElementHelper.get_body(doctree)
    CleanTreeByMark(page_body)
    RemoveAttribute(page_body)

    article_text = ElementHelper.element_text_content(page_body)
    return article_text
Beispiel #3
0
def get_title_util(body, title_text):
    """Find the element under ``body`` whose text best matches ``title_text``.

    Candidates are the elements whose tag is listed in ``TITLE_TAG`` and that
    pass ``is_possible_title_tag``.  Each candidate with non-empty text is
    scored with ``longest_common_length`` against ``title_text``.  The highest
    score wins; ties are resolved in favour of the smallest pre-order number
    (``ElementHelper.get_element_preorder_num``).

    :param body: element to search in
    :param title_text: text of the page title; may be ``None``
    :return: the best matching element, or ``None`` when ``title_text`` is
        missing or shorter than two characters, or when no candidate shares
        any text with it
    """
    # An empty <title> yields ``None`` text; treat it like a too-short title
    # instead of failing in ``len``.
    if title_text is None or len(title_text) < 2:
        return None

    # 1. Collect the candidate nodes for every title-like tag.
    candidate_nodes = []
    for tag in TITLE_TAG:
        nodes = ElementHelper.get_element_by_tag(body, tag)
        if nodes is None or len(nodes) < 1:
            continue
        candidate_nodes.extend(
            node for node in nodes if is_possible_title_tag(node))

    # 2. Score each candidate by its longest common length with the title.
    scores = {}
    for node in candidate_nodes:
        node_text = ElementHelper.element_text_content(node)
        if len(node_text) == 0:
            continue
        common_length = longest_common_length(node_text, title_text)
        if common_length >= 1:
            scores[node] = common_length

    if not scores:
        return None

    # 3. Among the best scoring nodes pick the one that comes first in
    #    pre-order.  ``min`` with a key replaces the Python-2-only
    #    ``sort(cmp=...)`` and also covers the single-candidate case.
    best_score = max(scores.values())
    finalists = [node for node, score in scores.items() if score == best_score]
    return min(finalists, key=ElementHelper.get_element_preorder_num)
Beispiel #4
0
def is_cluster_all_links(cluster):
    """Decide whether a cluster is link noise, i.e. dominated by link text.

    A text-bearing node counts as a link node when it is an ``<a>`` or its
    parent is one.  For example:
        <a> link </a>
        OR
        <li> <a> link </a> </li>

    Nodes tagged ``em``/``strong``/``span``/``i``/``b`` are ignored.

    :param cluster: iterable of nodes
    :return: ``True`` when there is no non-link text at all, or when the link
        text amount (per ``cluster_text_number``) is more than twice the
        non-link amount; ``False`` otherwise
    """
    inline_tags = ('em', 'strong', 'span', 'i', 'b')

    link_nodes, other_nodes = [], []
    for member in cluster:
        for child in ElementHelper.get_children(member):
            # Skip nodes without text and purely inline formatting tags.
            if ElementHelper.is_element_text_none(child) or child.tag in inline_tags:
                continue
            if child.tag == 'a' or child.getparent().tag == 'a':
                link_nodes.append(child)
            else:
                other_nodes.append(child)

    link_amount = cluster_text_number(link_nodes)
    other_amount = cluster_text_number(other_nodes)

    if not other_nodes or other_amount == 0:
        return True
    return 1.0 * link_amount / other_amount > 2.0
Beispiel #5
0
    def similar_check(cls, nodeA, nodeB):
        """Heuristically decide whether two nodes have a similar sub-structure.

        Nodes with different tags are never similar.  Otherwise the nodes
        returned by ``ElementHelper.get_children`` (``<a>`` excluded) are
        summarised twice: tag -> ``px`` values and ``px`` value -> tags.
        The two nodes are similar when

        * the number of distinct tags and of distinct ``px`` values differ by
          at most one,
        * every tag of ``nodeA`` other than em/b/br/i/font also occurs under
          ``nodeB``, and
        * the per-``px`` counts of unique tags, summed up, differ by at most
          one.

        :return: ``True`` when all checks pass, ``False`` otherwise
        """
        if nodeA.tag != nodeB.tag:
            return False

        def summarise(nodes):
            # Build (tag -> px values, px value -> tags) for the given nodes.
            levels_by_tag, tags_by_level = {}, {}
            for node in nodes:
                # <a> is not treated as a distinct tag.
                if node.tag == 'a':
                    continue
                level = int(node.get(px))
                levels_by_tag.setdefault(node.tag, []).append(level)
                tags_by_level.setdefault(level, []).append(node.tag)
            return levels_by_tag, tags_by_level

        children_a = ElementHelper.get_children(nodeA)
        children_b = ElementHelper.get_children(nodeB)
        tags_a, levels_a = summarise(children_a)
        tags_b, levels_b = summarise(children_b)

        if abs(len(tags_a) - len(tags_b)) > 1 or abs(len(levels_a) - len(levels_b)) > 1:
            return False

        # Every tag of A, minor formatting tags aside, must be present in B.
        ignorable = ('em', 'b', 'br', 'i', 'font')
        if any(tag not in ignorable and tag not in tags_b for tag in tags_a):
            return False

        distinct_a = sum([len(StringHelper.unique(tags)) for tags in levels_a.values()])
        distinct_b = sum([len(StringHelper.unique(tags)) for tags in levels_b.values()])
        return abs(distinct_a - distinct_b) <= 1
Beispiel #6
0
def get_title_util(body, title_text):
    """Return the title-like element of ``body`` closest to ``title_text``.

    Only elements with a tag from ``TITLE_TAG`` that satisfy
    ``is_possible_title_tag`` are considered.  Their text is compared with
    ``title_text`` through ``longest_common_length``; the node with the
    largest value is returned, and when several nodes share that value the
    one with the lowest ``ElementHelper.get_element_preorder_num`` wins.

    :param body: element whose descendants are searched
    :param title_text: page title text, possibly ``None``
    :return: matching element, or ``None`` if the title is ``None`` or shorter
        than two characters or no node has a common length of at least 1
    """
    # ``title_text`` comes from a node's ``.text`` and can therefore be None;
    # the original ``len(None)`` would raise a TypeError here.
    if title_text is None or len(title_text) < 2:
        return None

    # Step 1: gather candidate nodes tag by tag.
    candidate_nodes = []
    for tag in TITLE_TAG:
        tagged = ElementHelper.get_element_by_tag(body, tag)
        if tagged is None or len(tagged) < 1:
            continue
        candidate_nodes.extend([n for n in tagged if is_possible_title_tag(n)])

    # Step 2: record the common length of every candidate that has text.
    mapping = {}
    for node in candidate_nodes:
        node_text = ElementHelper.element_text_content(node)
        if len(node_text) == 0:
            continue
        llength = longest_common_length(node_text, title_text)
        if llength >= 1:
            mapping[node] = llength

    if len(mapping) == 0:
        return None

    # Step 3: keep the nodes with the maximal length and return the earliest
    # one.  The former code branched on ``len(candidate_nodes)`` (the wrong
    # list) and relied on ``list.sort(cmp=...)``, which exists only in
    # Python 2; ``min`` with a key gives the same answer in both cases.
    max_len = max(mapping.values())
    candidates = [node for node in mapping if mapping[node] == max_len]
    return min(candidates, key=ElementHelper.get_element_preorder_num)
Beispiel #7
0
    def similar_check(cls, nodeA, nodeB):
        """Return ``True`` when ``nodeA`` and ``nodeB`` look structurally alike.

        Both nodes must share the same tag.  The nodes obtained from
        ``ElementHelper.get_children`` are then profiled (ignoring ``<a>``):
        which tags occur, and which tags occur at which ``px`` value.  The
        profiles may differ by at most one distinct tag, one distinct ``px``
        value and one unit in the summed per-``px`` unique-tag counts; any tag
        of ``nodeA`` outside em/b/br/i/font must also appear for ``nodeB``.
        """
        if nodeA.tag != nodeB.tag:
            return False

        descendants = (ElementHelper.get_children(nodeA),
                       ElementHelper.get_children(nodeB))

        # For each node: (tag -> px values, px value -> tags).
        profiles = []
        for nodes in descendants:
            by_tag, by_level = {}, {}
            for node in nodes:
                if node.tag != 'a':  # <a> is not a distinguishing tag
                    depth = int(node.get(px))
                    by_tag.setdefault(node.tag, []).append(depth)
                    by_level.setdefault(depth, []).append(node.tag)
            profiles.append((by_tag, by_level))
        (dA, rA), (dB, rB) = profiles

        too_many_tags = abs(len(dA) - len(dB)) > 1
        if too_many_tags or abs(len(rA) - len(rB)) > 1:
            return False

        # Are the distinguishing tags the same?
        minor_tags = ('em', 'b', 'br', 'i', 'font')
        for tag in dA:
            if tag in minor_tags:
                continue
            if tag not in dB:
                return False

        sumA = sum([len(StringHelper.unique(rA[level])) for level in rA])
        sumB = sum([len(StringHelper.unique(rB[level])) for level in rB])
        return not abs(sumA - sumB) > 1
Beispiel #8
0
def is_cluster_all_links(cluster):
    """Return True if the text of the cluster sits (almost) only in links.

    For example:
        <a> link </a>
        OR
        <li> <a> link </a> </li>

    Text-bearing nodes are split into link nodes (an ``<a>`` or a node whose
    parent is an ``<a>``) and all others; em/strong/span/i/b are left out.
    The cluster is link-only when the "other" group is empty, has a text
    number of 0, or is outweighed by the link group by more than 2.0
    (text numbers come from ``cluster_text_number``).
    """
    def is_link(node):
        return node.tag == 'a' or node.getparent().tag == 'a'

    text_nodes = []
    for member in cluster:
        text_nodes.extend(
            child for child in ElementHelper.get_children(member)
            if not ElementHelper.is_element_text_none(child)
            and child.tag not in ('em', 'strong', 'span', 'i', 'b'))

    link_nodes = [node for node in text_nodes if is_link(node)]
    other_nodes = [node for node in text_nodes if not is_link(node)]

    link_total = cluster_text_number(link_nodes)
    other_total = cluster_text_number(other_nodes)

    nothing_else = len(other_nodes) == 0 or other_total == 0
    if nothing_else:
        return True
    return True if 1.0 * link_total / other_total > 2.0 else False
Beispiel #9
0
def get_aricle_cetd(doctree):
    """Extract the article text of a page with the CETD pipeline.

    Order of operations: ``cetd_parse(doctree)``, then ``CleanTreeByMark``
    and ``RemoveAttribute`` on the body element, then text extraction.

    :param doctree: parsed document tree; it is modified by the pipeline
    :return: text content of the cleaned body
    """
    cetd_parse(doctree)
    cleaned = ElementHelper.get_body(doctree)
    CleanTreeByMark(cleaned)
    RemoveAttribute(cleaned)
    return ElementHelper.element_text_content(cleaned)
Beispiel #10
0
 def print_cluster_record(cls, clusters, doctree):
     """Debug helper: print the tree, then the xpath of every clustered node.

     ``clusters`` is iterated and indexed by its own keys; only keys whose
     ``len`` is greater than one are printed, each group preceded by a
     separator line.
     """
     ElementHelper.print_element(doctree)
     separator = '====' * 10
     for key in clusters:
         if len(key) <= 1:
             continue
         print(separator)
         for member in clusters[key]:
             print(ElementHelper.get_xpath_by_element(member, doctree))
Beispiel #11
0
 def print_cluster_record(cls, clusters, doctree):
     ElementHelper.print_element(doctree)
     for cluster in clusters:
         if len(cluster)>1:
             print '===='*10
             nodes = clusters[cluster]
             for node in nodes:
                 print ElementHelper.get_xpath_by_element(node, doctree), node.get(py)
Beispiel #12
0
    def get_headline_content_in_cleaned_body(body):
        """Collect the text of the ``h1``-``h4`` elements found under ``body``.

        Elements for which ``ElementHelper.is_element_content_none`` is true
        are skipped.

        :param body: (cleaned) body element
        :return: list with one text entry per remaining headline element
        """
        headline_tags = ['h1', 'h2', 'h3', 'h4']

        texts = []
        for heading in ElementHelper.get_elements_by_tagnames(body, headline_tags):
            if ElementHelper.is_element_content_none(heading):
                continue
            texts.append(ElementHelper.element_text_content(heading))
        return texts
Beispiel #13
0
    def get_headline_content_in_cleaned_body(body):
        """Return the headline texts of ``body`` as one newline-separated string.

        Headlines are the ``h1``-``h4`` elements reported by
        ``ElementHelper.get_elements_by_tagnames``; those for which
        ``ElementHelper.is_element_content_none`` is true are left out.

        :param body: (cleaned) body element
        :return: headline texts joined with ``'\\n'``
        """
        headings = ElementHelper.get_elements_by_tagnames(body, ['h1', 'h2', 'h3', 'h4'])
        kept = [h for h in headings if not ElementHelper.is_element_content_none(h)]
        return '\n'.join([ElementHelper.element_text_content(h) for h in kept])
Beispiel #14
0
def CleanTreeByMark(element):
    """Prune the subtree rooted at ``element`` according to its ``kg_mark``.

    * mark ``0``: the element is handed to ``ElementHelper.remove_element``;
    * mark ``1``: the element is kept and its subtree is not visited;
    * any other mark: every child is processed recursively.

    Assumes every visited element carries a numeric ``kg_mark`` attribute.

    :param element: root of the subtree to clean (modified in place)
    """
    mark = long(element.get(kg_mark))
    if mark == 0:
        ElementHelper.remove_element(element)
    elif mark != 1:
        # Iterate over a snapshot of the children: the recursive call may
        # remove ``child`` from ``element``, and changing a container while
        # iterating over it makes the loop skip siblings.
        for child in list(element):
            CleanTreeByMark(child)
Beispiel #15
0
def CleanTreeByMark(element):
    """Remove the parts of a tree that are marked for deletion.

    The ``kg_mark`` attribute of ``element`` decides what happens:
    ``0`` removes the element through ``ElementHelper.remove_element``,
    ``1`` keeps the whole subtree untouched, and every other value descends
    into the children.

    Assumes ``kg_mark`` is present and numeric on each visited element.

    :param element: subtree root; the tree is modified in place
    """
    mark = long(element.get(kg_mark))
    if 0 == mark:
        ElementHelper.remove_element(element)
        return
    if 1 == mark:
        return
    # Copy the child list first.  A child with mark 0 is removed during the
    # recursion, and removing from ``element`` while looping over it directly
    # would cause following siblings to be skipped.
    children = list(element)
    for child in children:
        CleanTreeByMark(child)
Beispiel #16
0
    def get_clustered_records(cls, doctree):
        """Cluster similar nodes of ``doctree`` and return the record groups.

        The tree is scanned level by level (levels from ``cls.bfs_tree``).
        Within each level the nodes are split into segments by
        ``cls.segement`` and compared pairwise inside a segment using
        ``cls.similar_check``.  Similar nodes are flagged with
        ``kg_record_mark = '1'`` and collected in a dict that maps the first
        matching left node of a segment to the set of all nodes matched in
        that segment.

        :param doctree: parsed document tree (marks are written to its nodes)
        :return: the result of ``cls.merger_sibling_record_node`` restricted
            to the keys that carry ``kg_record_mark == '1'``
        """
        #get level_nodes_mapping
        all_level_nodes = cls.bfs_tree(doctree)

        root = ElementHelper.get_root(doctree)
        body = ElementHelper.get_body(doctree)

        #get max level and min level
        upper_bound = int(ElementHelper.get_element_depth(root))+1
        low_bound = int(body.get(px))+1

        # records the similar nodes: first matching node -> set of similar nodes
        cluster={}

        for level in range(low_bound, upper_bound):

            level_nodes = all_level_nodes[level]

            #if parent is record node, then do not consider its children
            level_nodes = [node for node in level_nodes if not cls.is_node_or_ancestor_record(node)]
            # compare only nodes that share the same parent
            # (earlier inline grouping, superseded by cls.segement):
            # tag_names = set([node.getparent() for node in level_nodes])
            # tmp = {}
            # for tag in tag_names:
            #     for node in level_nodes:
            #         tmp.setdefault(tag, []).append(node)
            # NOTE(review): segement() presumably groups the nodes by parent,
            # as the disabled code above did -- confirm.
            tmp = cls.segement(level_nodes)

            for k, nodes in tmp.items():
                # if len(nodes)==1:break
                # ``first`` is fixed by the first match in this segment and
                # becomes the key of the segment's cluster.
                first = None
                node_set = set()
                for i in range(1,len(nodes)):
                    # a node already marked as record is not used as the
                    # start of the right-hand side again
                    if nodes[i].get(kg_record_mark)=='1':
                        continue
                    left_node = nodes[i-1]
                    # compare with all remaining elements of the segment and
                    # check whether any of them is similar
                    right_nodes=nodes[i:]
                    for node in right_nodes:
                        if cls.similar_check(left_node, node):
                            if first is None:
                                first = left_node
                                node_set.add(nodes[i-1])
                            left_node.set(kg_record_mark, '1')
                            node.set(kg_record_mark, '1')
                            node_set.add(node)
                if first is not None:
                    cluster[first]=node_set

        record_groups = cls.merger_sibling_record_node(doctree, cluster)
        # record_groups = cluster
        # keep only the groups whose key is still marked as a record
        record_groups = {k:v for k,v in record_groups.items() if k.get(kg_record_mark)=='1'}

        return record_groups
Beispiel #17
0
 def get_meta_content(doctree, metaAttrName, value):
     """Return the normalised ``content`` of the first matching ``<meta>`` tag.

     Example ``(metaAttrName, value)`` pairs:
     (name, description)
     (name, keyword)
     (property, og:type)

     :param doctree: parsed document tree
     :param metaAttrName: attribute name the ``<meta>`` tag is selected by
     :param value: required value of that attribute
     :return: ``normalize_word(content)``, or ``''`` when no tag matches or
         the ``content`` lookup yields ``None``
     """
     matches = ElementHelper.get_element_by_tag_attr(doctree, 'meta', metaAttrName, value)
     if matches is None or len(matches) == 0:
         return ''

     raw_content = ElementHelper.get_attribute(matches[0], 'content')
     if raw_content is None:
         return ''
     return normalize_word(raw_content)
Beispiel #18
0
 def get_meta_content(doctree, metaAttrName, value):
     """Extract the content of a given ``<meta>`` tag from the document.

     Example (metaAttrName, value) pairs:
     (name, description)
     (name, keyword)
     (property, og:type)

     Only the first element returned by
     ``ElementHelper.get_element_by_tag_attr`` is used.  The result is passed
     through ``normalize_word``; an empty string is returned when there is no
     such element or its ``content`` lookup gives ``None``.
     """
     found = ElementHelper.get_element_by_tag_attr(doctree, 'meta', metaAttrName, value)
     has_match = found is not None and len(found) > 0
     content = ElementHelper.get_attribute(found[0], 'content') if has_match else None
     return '' if content is None else normalize_word(content)
Beispiel #19
0
    def get_clustered_records(cls, doctree):
        """Flag similar same-tag nodes of each level as records and merge them.

        The tree is scanned level by level using the mapping returned by
        ``cls.bfs_tree``, from the body's ``px`` value + 1 up to and including
        ``ElementHelper.get_element_depth(root)``.  Each node is compared with
        the nodes to its right that have the same tag; the first pair accepted
        by ``cls.similar_check`` is flagged with ``kg_record_mark = '1'``.

        :param doctree: parsed document tree
        :return: whatever ``cls.merger_sibling_record_node(doctree)`` returns
        """
        nodes_by_level = cls.bfs_tree(doctree)

        root = ElementHelper.get_root(doctree)
        body = ElementHelper.get_body(doctree)

        # Bounds of the level scan (upper bound is exclusive).
        last_level = int(ElementHelper.get_element_depth(root)) + 1
        first_level = int(body.get(px)) + 1

        for level in range(first_level, last_level):
            current = nodes_by_level[level]
            # Lookup kept from the disabled vertical search; the value is
            # currently unused.
            try:
                below = nodes_by_level[level + 1]
            except KeyError:
                below = None

            # If a parent is a record node, its children are not considered.
            candidates = [node for node in current
                          if not cls.is_node_or_ancestor_record(node)]

            for j in range(1, len(candidates) - 1):
                left_node = candidates[j - 1]
                # Horizontal comparison: only nodes to the right that share
                # the tag of ``left_node``; stop at the first similar one.
                match = next(
                    (node for node in candidates[j:]
                     if node.tag == left_node.tag
                     and cls.similar_check(left_node, node)),
                    None)
                if match is not None:
                    left_node.set(kg_record_mark, '1')
                    match.set(kg_record_mark, '1')

        return cls.merger_sibling_record_node(doctree)
Beispiel #20
0
    def find_first_sibling_record_node(cls, element, doctree):
        '''Find the first node of the record region that ``element`` belongs to.

        1. If the parent has fewer than two children, or the index in the
           last xpath step of ``element`` is below 2, ``element`` is treated
           as having no left sibling and is returned itself.
        2. Otherwise start at ``parent[index - 2]`` and move left while the
           nodes carry ``kg_record_mark == '1'``; the node right after the
           first unmarked one (or ``parent[0]``) is returned.

        NOTE(review): the xpath index is used as a position among all
        children of the parent -- confirm that ``get_xpath_by_element``
        numbers siblings that way.
        '''
        parent = element.getparent()
        if len(parent) < 2:
            return element

        last_step = ElementHelper.get_xpath_by_element(element, doctree).split('/')[-1]
        position = StringHelper.get_digits(last_step)
        if position < 2:
            return element

        cursor = position - 2
        while cursor >= 0 and parent[cursor].get(kg_record_mark) == '1':
            cursor -= 1
        return parent[cursor + 1]
Beispiel #21
0
def collect_urls(html, base_url, encoding=None):
    """Yield the urls of the ``<a>`` links found in a page.

    Skipped are links whose stripped ``href`` is ``None`` or shorter than two
    characters, links starting with ``#`` (pointing at the page itself),
    links for which ``is_url_visited(link, unvisited_url_set)`` is true and
    links rejected by ``should_collect_url``.

    :param html: page string
    :param base_url: passed to ``normalize_url`` and ``should_collect_url``
    :param encoding: passed to ``HtmlHelper.create_doc``
    :return: generator of normalised urls
    """
    doctree = HtmlHelper().create_doc(html, encoding)
    for anchor in ElementHelper.get_elements_by_tagnames(doctree, 'a'):
        href = m_strip(anchor.get('href', None))
        if href is None or len(href) < 2 or href[0] == '#':
            continue

        url = normalize_url(href, base_url)
        if is_url_visited(url, unvisited_url_set) or not should_collect_url(url, base_url):
            continue

        yield url
0
    def merger_sibling_record_node(cls, doctree, cluster):
        '''Correct the record marks level by level and return ``cluster``.

        Starting with the children of ``<body>``, every level is handed to
        ``cls.correct_record_mark`` together with ``cluster``.  The next level
        is made of the children of those nodes for which
        ``cls.is_node_or_ancestor_record`` is false, so the walk does not
        descend into record nodes.

        :param doctree: DOM tree already marked by the preliminary similarity
            comparison
        :param cluster: preliminary collection of similar data records
        :return: the ``cluster`` object that was passed in
        '''
        body = ElementHelper.get_body(doctree)
        thislevel = list(body)

        while thislevel:
            cls.correct_record_mark(thislevel, cluster)

            nextlevel = []
            for node in thislevel:
                # The record test depends on ``node`` only, so it is evaluated
                # once per node rather than once per child.
                # NOTE(review): the test is applied to the parent, not to the
                # individual children -- confirm this is intended.
                if len(node) > 0 and not cls.is_node_or_ancestor_record(node):
                    nextlevel.extend(node)
            thislevel = nextlevel

        return cluster
Beispiel #23
0
    def find_first_sibling_record_node(cls, element, doctree):
        '''Return the starting node of the region that contains ``element``.

        The index found in the last step of the element's xpath decides how
        far to look: below 2 (or with a parent of fewer than two children)
        ``element`` itself is returned.  Otherwise the siblings are scanned
        leftwards from ``parent[index - 2]``; the scan stops at the first
        sibling whose ``kg_record_mark`` is not ``'1'`` and the node to its
        right is returned.  If all scanned siblings are marked, ``parent[0]``
        is returned.
        '''
        parent = element.getparent()
        if len(parent) < 2:
            return element

        xpath_steps = ElementHelper.get_xpath_by_element(element, doctree).split('/')
        xpath_index = StringHelper.get_digits(xpath_steps[-1])
        if xpath_index < 2:
            return element

        for idx in range(xpath_index - 2, -1, -1):
            if parent[idx].get(kg_record_mark) != '1':
                return parent[idx + 1]
        return parent[0]
Beispiel #24
0
    def is_node_or_children_record(cls, element):
        """Tell whether exactly two distinct ``kg_record_mark`` values occur below ``element``.

        The marks are read from the nodes returned by
        ``ElementHelper.get_children(element)`` and de-duplicated with
        ``StringHelper.unique``.
        """
        marks = [node.get(kg_record_mark)
                 for node in ElementHelper.get_children(element)]
        return len(StringHelper.unique(marks)) == 2
Beispiel #25
0
def is_possible_title_tag(element):
    """Return True if ``element`` may hold the page title.

    The element must have a tag listed in ``TITLE_TAG``, at most one child,
    and content that ``ElementHelper.is_element_content_none`` does not
    report as empty.
    """
    return (element.tag in TITLE_TAG
            and len(element) <= 1
            and not ElementHelper.is_element_content_none(element))
Beispiel #26
0
    def is_node_or_children_record(cls, element):
        """Return True when the nodes below ``element`` carry exactly two different marks.

        ``kg_record_mark`` is collected from every node that
        ``ElementHelper.get_children`` returns for ``element``; the distinct
        values are determined with ``StringHelper.unique``.
        """
        collected = []
        for node in ElementHelper.get_children(element):
            collected.append(node.get(kg_record_mark))
        distinct = StringHelper.unique(collected)
        return True if len(distinct) == 2 else False
Beispiel #27
0
def is_possible_title_tag(element):
    """Check whether ``element`` qualifies as a title candidate.

    Rejected are elements whose tag is not in ``TITLE_TAG``, elements with
    more than one child, and elements whose content is empty according to
    ``ElementHelper.is_element_content_none``.
    """
    if element.tag not in TITLE_TAG or len(element) > 1:
        return False
    return not ElementHelper.is_element_content_none(element)
Beispiel #28
0
 def pre_process_domtree(doctree):
     """Return ``doctree`` if ``is_set_attribute_valid`` accepts its root, else ``None``.

     A ``None`` tree is passed through as ``None``.
     """
     if doctree is None:
         return None
     if is_set_attribute_valid(ElementHelper.get_root(doctree)):
         return doctree
     return None
Beispiel #29
0
 def pre_process_domtree(doctree):
     """Validate a document tree before further processing.

     :param doctree: parsed document tree or ``None``
     :return: the same tree when ``is_set_attribute_valid`` is true for its
         root; ``None`` for a ``None`` tree or a rejected root
     """
     if doctree is None:
         return None
     root = ElementHelper.get_root(doctree)
     return doctree if is_set_attribute_valid(root) else None
Beispiel #30
0
def is_cluster_contain_user_comments(cluster):
    """Guess whether the nodes of ``cluster`` are user comments.

    Requirements checked here:
    1. the cluster has at least two nodes;
    2. every node has at least three text links and two other text nodes
       (per ``is_link_node_with_text`` / ``is_none_link_node_with_text``);
    3. at least two different link texts -- empty and purely numeric ones
       excluded -- occur exactly ``len(cluster)`` times in total.

    :return: ``True`` if all requirements hold (a debug line is printed),
        ``False`` otherwise
    """
    # A single node cannot be identified.
    if len(cluster) < 2:
        return False

    link_text_counts = collections.Counter()
    for member in cluster:
        children = ElementHelper.get_children(member)
        link_nodes = [n for n in children if is_link_node_with_text(n)]
        plain_nodes = [n for n in children if is_none_link_node_with_text(n)]

        if len(link_nodes) < 3 or len(plain_nodes) < 2:
            return False

        for n in link_nodes:
            link_text_counts[ElementHelper.element_text_content(n)] += 1

    # Drop empty texts and texts that consist of digits only.
    occurrences = [count for text, count in link_text_counts.items()
                   if not (len(text) == 0 or StringHelper.is_digits(text))]

    # (occurrence count, number of texts with that count) pairs.
    histogram = collections.Counter(occurrences).most_common()

    for count, text_total in histogram:
        # ToDo 2016/03/08  old: text_total>=2 --> new: text_total>=1
        if count > 1 and count == len(cluster) and text_total >= 2:
            print('find comment!')
            return True
    return False
Beispiel #31
0
def is_cluster_contain_user_comments(cluster):
    """Identify whether a cluster holds comment content, looking at link texts only.

    1. each node in the cluster needs at least 3 text links and 2 other
       text nodes;
    2. there must be at least two link texts (empty and digit-only texts
       are ignored) that occur exactly ``len(cluster)`` times overall.

    Clusters with fewer than two nodes are never identified as comments.
    ``'find comment!'`` is printed when the function returns ``True``.
    """
    cluster_size = len(cluster)
    if cluster_size < 2:
        return False

    counts = {}
    for member in cluster:
        children = ElementHelper.get_children(member)
        links_with_text = [c for c in children if is_link_node_with_text(c)]
        others_with_text = [c for c in children if is_none_link_node_with_text(c)]

        if len(links_with_text) < 3:
            return False
        if len(others_with_text) < 2:
            return False

        for link in links_with_text:
            text = ElementHelper.element_text_content(link)
            counts[text] = counts.get(text, 0) + 1

    # Remove empty and digit-only texts; iterate over a key snapshot because
    # the dict is modified in the loop.
    for text in list(counts):
        if len(text) == 0 or StringHelper.is_digits(text):
            del counts[text]

    # number -> how many texts occur that many times
    frequency = collections.Counter(counts.values()).most_common()

    for number, counter in frequency:
        # ToDo 2016/03/08  old:counter>=2 --> new:counter>=1
        if number > 1 and number == cluster_size and counter >= 2:
            print('find comment!')
            return True
    return False
Beispiel #32
0
def html2words(docstring, base_url, encoding=None, supervisior=None):
    """Extract the main content of a web page from its source.

    :param docstring: page source; it is lower-cased before parsing
    :param base_url: url of the page; stored in the result and passed on to
        ``get_link_word_by_pair``
    :param encoding: passed to ``HtmlHelper.create_doc``
    :param supervisior: optional object passed on to ``get_link_word_by_pair``
    :return: a ``Document`` with the keys ``base_url``, ``title``, ``meta``,
        ``headlines``, ``para`` and ``url_items``; ``None`` when
        ``sys.getsizeof(docstring)`` is below 1024, when no tree can be
        created, or when ``HtmlHelper.pre_process_domtree`` rejects the tree
    """
    # Pages smaller than 1 KiB (object size) are ignored.
    if sys.getsizeof(docstring) < 1024:
        return None

    docstring = docstring.lower()
    doctree = HtmlHelper.create_doc(docstring, encoding)
    if doctree is None:
        return None

    # Copy taken before the working tree is pre-processed; the meta
    # description is read from this copy.
    pristine = copy.deepcopy(doctree)

    doctree = HtmlHelper.pre_process_domtree(doctree)
    if doctree is None:
        return None

    # page title and paragraph content
    para, title = HtmlHelper.get_article(doctree, debug=False)
    # page meta description
    meta_description = HtmlHelper.get_meta_description(pristine)
    # headlines in the page
    headlines = HtmlHelper.get_headline_content_in_cleaned_body(
        ElementHelper.get_body(doctree))
    # all urls
    url_items = list(get_link_word_by_pair(docstring, base_url, supervisior))

    document = Document()
    fields = (
        ('base_url', base_url),
        ('title', title),
        ('meta', meta_description),
        ('headlines', headlines),
        ('para', para),
        ('url_items', url_items),
    )
    for key, value in fields:
        document[key] = value
    return document
Beispiel #33
0
def get_link_word_by_pair(docstring, base_url, supervisior=None, encoding='utf-8'):
    """Yield a ``UrlItem`` for every usable ``<a>`` link of a page.

    Links are skipped when the stripped ``href`` is ``None`` or shorter than
    two characters, when it starts with ``#`` (link to the page itself), or
    when ``is_url_visited(link, unvisited_url_set)`` is true.

    :param docstring: page source
    :param base_url: url of the page; a ``unicode`` value is encoded as UTF-8
    :param supervisior: optional object whose ``predict(anchor_text)`` returns
        a ``(label, interestness)`` pair; without it ``('1', 0.0)`` is used
    :param encoding: passed to ``HtmlHelper.create_doc``
    :return: generator of ``UrlItem`` objects with the keys ``parent_url``,
        ``url``, ``anchor_text``, ``neigb_text``, ``label`` and
        ``interestness``
    """
    doctree = HtmlHelper().create_doc(docstring, encoding)
    if isinstance(base_url, unicode):
        base_url = base_url.encode('utf-8')

    for a in ElementHelper.get_elements_by_tagnames(doctree, 'a'):
        # The href is looked up once (the lookup used to be duplicated).
        link = m_strip(a.get('href', None))

        if link is None or len(link) < 2:
            continue
        if link[0] == '#':  # link to itself
            continue
        link = normalize_url(link, base_url)

        if is_url_visited(link, unvisited_url_set):
            continue

        link_item = UrlItem()
        link_item['parent_url'] = base_url
        link_item['url'] = link
        link_item['anchor_text'] = ElementHelper.element_text_content(a).encode('utf-8')
        link_item['neigb_text'] = ''
        if supervisior is not None:
            link_item['label'], link_item['interestness'] = supervisior.predict(link_item['anchor_text'])
        else:
            # '1' marks a negative sample
            link_item['label'], link_item['interestness'] = '1', 0.0

        yield link_item
Beispiel #34
0
def ComputeDensitySum(element, ratio):
    """Compute the density sum of ``element`` and store it in ``kg_density_sum``.

    For an element with children the sum is built from the ``kg_text_density``
    of every child (children are processed recursively first) plus a
    length-based term for each stretch of the element's own text that lies
    before, between or after the children's text.  For an element without
    children the sum is simply its own ``kg_text_density``.

    :param element: element to process; the ``kg_density_sum`` attribute is
        set on it and on every descendant
    :param ratio: factor used inside the length-based term
    """
    density_sum, char_num_sum = 0.0, 0
    # ``_from`` is the offset in ``content`` up to which text has been consumed
    _from, index, length = 0, 0, 0

    content = ElementHelper.element_text_content(element)
    if ElementHelper.is_element_has_child(element):
        # children first, so their attributes are available below
        for child in element:
            ComputeDensitySum(child, ratio)
        for child in element:
            density_sum += float(child.attrib.get(kg_text_density))
            # NOTE(review): char_num_sum is accumulated but never read in
            # this function
            char_num_sum += long(child.attrib.get(kg_char_num))

            #text before tag
            child_content = ElementHelper.element_text_content(child)
            index = -1
            if child_content != '':
                index = StringHelper.index_of(content, child_content, _from)

            if index > -1:
                # length of the element's own text in front of this child
                length = index - _from
                if length > 0:
                    try:
                        tmp = length * qLn(1.0 * length) / qLn(
                            qLn(ratio * length + qExp(1.0)))  # original author's note: the result of this computation is always 0
                        density_sum += tmp
                    except ZeroDivisionError:
                        pass
                # continue searching behind this child's text
                _from = index + len(child_content)

        #text after tag
        length = len(ElementHelper.element_text_content(element)) - _from
        if length > 0:
            try:
                density_sum += length * qLn(1.0 * length) / qLn(
                    qLn(ratio * length + qExp(1.0)))
            except ZeroDivisionError:
                pass
    else:
        density_sum = float(element.attrib.get(kg_text_density))

    # attributes are stored as strings
    d2s_density_sum = str(density_sum)
    element.set(kg_density_sum, d2s_density_sum)
Beispiel #35
0
def set_text_mark(element, s, t):
    """Mark the elements whose pre-order number lies in ``[s, t]``.

    The subtree is processed children first.  An element inside the range
    gets ``kg_text_mark = '1'`` and every ancestor up to, but excluding, the
    ``html`` element gets ``kg_text_mark = '2'``.

    :param element: root of the subtree to process (modified in place)
    :param s: lower bound of the pre-order range (inclusive)
    :param t: upper bound of the pre-order range (inclusive)
    """
    for child in element:
        set_text_mark(child, s, t)

    pos = ElementHelper.get_element_preorder_num(element)
    if s <= pos <= t:
        element.set(kg_text_mark, '1')
        ancestor = element.getparent()
        # Stop at <html>, and also at the top of the tree: without an
        # ``html`` ancestor ``getparent()`` ends in ``None`` and the former
        # loop failed on ``None.tag``.
        while ancestor is not None and ancestor.tag != 'html':
            ancestor.set(kg_text_mark, '2')
            ancestor = ancestor.getparent()
Beispiel #36
0
def set_text_mark(element, s, t):
    """Flag text elements and their ancestors by pre-order position.

    Every element of the subtree whose
    ``ElementHelper.get_element_preorder_num`` value is between ``s`` and
    ``t`` (both inclusive) receives ``kg_text_mark = '1'``; its ancestors
    below the ``html`` element receive ``kg_text_mark = '2'``.  Children are
    handled before their parent.

    :param element: subtree root; attributes are written in place
    :param s: first pre-order number of the range
    :param t: last pre-order number of the range
    """
    for child in element:
        set_text_mark(child, s, t)

    pos = ElementHelper.get_element_preorder_num(element)
    if pos < s or pos > t:
        return

    element.set(kg_text_mark, '1')
    parent = element.getparent()
    # ``parent`` becomes None above the topmost element; guard against it so
    # trees without an ``html`` ancestor do not raise an AttributeError.
    while parent is not None and parent.tag != 'html':
        parent.set(kg_text_mark, '2')
        parent = parent.getparent()
Beispiel #37
0
    def get_article_title_element(doctree):
        """Locate the body element that best matches the page title text.

        :param doctree: parsed document tree
        :return: the element chosen by ``get_title_util`` for the text of the
            node returned by ``HtmlHelper.get_title``; ``None`` when there is
            no title node or no element matches
        """
        body = ElementHelper.get_body(doctree)
        title_node = HtmlHelper.get_title(doctree)
        if title_node is None:
            return None
        return get_title_util(body, title_node.text)
Beispiel #38
0
def CountTag(element):
    """Store the tag count of ``element`` in its ``kg_tag_num`` attribute.

    For an element with children the count is the sum over the children of
    (the child's ``kg_tag_num`` + 1); children are processed recursively
    first.  An element without children gets ``'0'``.
    """
    total = 0
    if ElementHelper.is_element_has_child(element):
        for child in element:
            CountTag(child)
        total = sum(long(child.attrib.get(kg_tag_num)) + 1 for child in element)
    element.set(kg_tag_num, str(total))
Beispiel #39
0
def CountTag(element):
    """Annotate ``element`` and its descendants with ``kg_tag_num``.

    Leaves (no children per ``ElementHelper.is_element_has_child``) get
    ``'0'``.  Any other element gets, as a string, the sum of
    ``kg_tag_num + 1`` over its children, which are annotated first.
    """
    if not ElementHelper.is_element_has_child(element):
        element.set(kg_tag_num, str(0))
        return

    for child in element:
        CountTag(child)

    counted = 0
    for child in element:
        counted = counted + long(child.attrib.get(kg_tag_num)) + 1
    element.set(kg_tag_num, str(counted))
Beispiel #40
0
def get_page_link_ratio(doctree):
    """Return the link-character to character ratio of the document body.

    Runs the four counting passes on the body, then divides the body's
    ``kg_linkchar_num`` by its ``kg_char_num``.  A character count of zero
    is replaced by one to avoid dividing by zero.
    """
    body = ElementHelper.get_body(doctree)

    # counting passes; they annotate the tree in place
    CountChar(body)
    CountTag(body)
    CountLinkChar(body)
    CountLinkTag(body)

    total_chars = float(body.attrib.get(kg_char_num))
    if total_chars == 0:
        total_chars = 1
    link_chars = float(body.attrib.get(kg_linkchar_num))
    return link_chars / total_chars
Beispiel #41
0
def get_page_link_ratio(doctree):
    """Compute link characters / all characters for the page body.

    The body is first annotated by CountChar, CountTag, CountLinkChar and
    CountLinkTag; the ratio is then read from the body's ``kg_char_num``
    and ``kg_linkchar_num`` attributes.  A zero denominator becomes 1.
    """
    body = ElementHelper.get_body(doctree)
    CountChar(body)
    CountTag(body)
    CountLinkChar(body)
    CountLinkTag(body)

    attributes = body.attrib
    # 0.0 is falsy, so an empty body falls back to a denominator of 1
    denominator = float(attributes.get(kg_char_num)) or 1
    numerator = float(attributes.get(kg_linkchar_num))
    return numerator / denominator
Beispiel #42
0
    def get_article_title_element(doctree):
        """Return the body element that corresponds to the page title.

        Looks up the title node with ``HtmlHelper.get_title`` and passes
        its text, with the document body, to ``get_title_util``.

        Returns None when there is no title node, when the title node has
        no text, or when ``get_title_util`` itself returns None.
        """
        body = ElementHelper.get_body(doctree)
        title_node = HtmlHelper.get_title(doctree)

        # ``text`` is None for a title element without content, and
        # get_title_util takes len() of it, so bail out before the call.
        title_text = None if title_node is None else title_node.text
        if title_text is None:
            return None

        return get_title_util(body, title_text)
Beispiel #43
0
def ComputeDensitySum(element, ratio):
    """Compute and store the density sum for every node of the subtree.

    For a node without children (per ``ElementHelper.is_element_has_child``)
    the density sum is its own ``kg_text_density``.  Otherwise it is the
    sum of the children's ``kg_text_density`` values plus one term for each
    run of the element's text located before a child's text, and one term
    for the run remaining after the last located child.  The result is
    written, as a string, to the ``kg_density_sum`` attribute.

    element -- root of the subtree; children are processed recursively
    ratio   -- factor applied to the run length inside the run formula
    """
    def _run_density(length):
        # Contribution of a run of ``length`` characters.  The inner
        # logarithm can evaluate to zero; such a run contributes nothing.
        try:
            return length * qLn(1.0 * length) / qLn(qLn(ratio * length + qExp(1.0)))
        except ZeroDivisionError:
            return 0.0

    content = ElementHelper.element_text_content(element)
    if ElementHelper.is_element_has_child(element):
        density_sum = 0.0
        _from = 0  # offset in ``content`` up to which text is accounted for

        for child in element:
            ComputeDensitySum(child, ratio)
        for child in element:
            density_sum += float(child.attrib.get(kg_text_density))

            # text before tag
            child_content = ElementHelper.element_text_content(child)
            index = -1
            if child_content != '':
                index = StringHelper.index_of(content, child_content, _from)

            if index > -1:
                length = index - _from
                if length > 0:
                    # Original author's note: the values computed here
                    # all came out as 0.
                    density_sum += _run_density(length)
                _from = index + len(child_content)

        # text after tag
        length = len(content) - _from
        if length > 0:
            density_sum += _run_density(length)
    else:
        density_sum = float(element.attrib.get(kg_text_density))

    element.set(kg_density_sum, str(density_sum))
Beispiel #44
0
def set_element_attribute(element, x, y):
    """ Annotate every node of a subtree with coordinates and default marks.

    x is level of element, y is a one-item list holding the next PreOrder
    number; it is shared by the whole recursion and incremented per node.
    Elements with no children and empty text and tail are removed through
    ElementHelper.remove_element.
    """
    #set_element_coordinate
    element.set(px, str(x))
    element.set(py, str(y[0]))
    y[0] += 1

    #set element record, it mark whether an element is ancestor node of a record
    element.set(kg_record_mark, '0')

    #kg_text_mark mark whether an element belong to main text part
    element.set(kg_text_mark, '0')

    #set element .text and .tail
    # NOTE(review): the len() calls below need normalize_word to return a
    # sized value even for a None input -- confirm.
    element.tail = normalize_word(element.tail)
    element.text = normalize_word(element.text)

    if len(element) == 0 and len(element.tail) == 0 and len(element.text) == 0:
        ElementHelper.remove_element(element)

    # Walk a snapshot of the children: a child may remove itself from this
    # element during the recursive call, and mutating a container while
    # iterating over it is not safe to rely on.
    for child in list(element):
        set_element_attribute(child, x + 1, y)
Beispiel #45
0
    def bfs_tree(cls, doctree):
        """Group the nodes at and below the body by tree level.

        Returns a dict mapping level number to the list of nodes on that
        level, in breadth-first order.  The body itself is the first
        entry, keyed by the integer value of its ``px`` attribute.
        """
        all_level_nodes = {}
        body = ElementHelper.get_body(doctree)

        level = int(body.get(px))
        thislevel = [body]
        while thislevel:
            all_level_nodes[level] = thislevel
            nextlevel = []
            for node in thislevel:
                # Extending with the node adds its children; a childless
                # node adds nothing, so no per-child length test is needed.
                nextlevel.extend(node)
            thislevel = nextlevel
            level += 1
        return all_level_nodes
Beispiel #46
0
    def bfs_tree(cls, doctree):
        """Return a level -> nodes mapping for the tree rooted at the body.

        Levels are numbered from ``int(body.get(px))`` upward; each value
        is the list of nodes found on that level, left to right.
        """
        body = ElementHelper.get_body(doctree)
        level = int(body.get(px))

        all_level_nodes = {}
        thislevel = [body]
        while thislevel:
            all_level_nodes[level] = thislevel
            # Iterating a node yields its children and nothing for a leaf,
            # which makes the former ``len(node) > 0`` filter redundant.
            thislevel = [child for node in thislevel for child in node]
            level += 1
        return all_level_nodes
Beispiel #47
0
def set_element_attribute(element, x, y):
    """ Set coordinates and default marks on element and its descendants.

    x is level of element, y is PreOrder number (kept in a one-item list
    so that the counter is shared and advanced across recursive calls).
    """
    #set_element_coordinate
    element.set(px, str(x))
    element.set(py, str(y[0]))
    y[0] += 1

    #set element record, it mark whether an element is ancestor node of a record
    element.set(kg_record_mark, '0')

    #kg_text_mark mark whether an element belong to main text part
    element.set(kg_text_mark, '0')

    #set element .text and .tail
    # NOTE(review): assumes normalize_word never returns None, since the
    # emptiness test below takes len() of both results -- confirm.
    element.tail = normalize_word(element.tail)
    element.text = normalize_word(element.text)

    is_empty = len(element) == 0 and len(element.tail) == 0 and len(element.text) == 0
    if is_empty:
        ElementHelper.remove_element(element)

    # Children can be removed from this element by the recursive call, so
    # iterate over a copy of the child list instead of the live element.
    for child in list(element):
        set_element_attribute(child, x + 1, y)
Beispiel #48
0
    def get_article(doctree, debug = False):
        """Extract the article text and its title from a parsed document.

        Clusters the document's records with WISH, passes the clusters and
        the title element to get_article_wish, and falls back to the text
        of the document title node when no title text comes back.

        Returns a tuple (articles, title_text).
        """
        wish = WISH()
        title_element = HtmlHelper.get_article_title_element(doctree)
        record_clusters = wish.get_clustered_records(doctree)

        if debug:
            wish.print_cluster_record(record_clusters, doctree)

        articles, title_text = get_article_wish(record_clusters, doctree, title_element, debug)
        if not title_text:
            # nothing usable from the article itself: use the page title
            page_title = HtmlHelper.get_title(doctree)
            title_text = ElementHelper.element_text_content(page_title)

        return articles, title_text
Beispiel #49
0
    def get_article(doctree, debug = False):
        """Return (articles, title_text) for the given document tree.

        The records are clustered by a WISH instance and handed to
        get_article_wish along with the article title element.  When the
        returned title text is None or empty, the text content of the
        node from HtmlHelper.get_title is used instead.  With ``debug``
        set, the clusters are printed via print_cluster_record.
        """
        extractor = WISH()

        title = HtmlHelper.get_article_title_element(doctree)
        clusters = extractor.get_clustered_records(doctree)
        if debug:
            extractor.print_cluster_record(clusters, doctree)

        result = get_article_wish(clusters, doctree, title, debug)
        articles, title_text = result
        title_missing = title_text is None or len(title_text) == 0
        if title_missing:
            title_text = ElementHelper.element_text_content(HtmlHelper.get_title(doctree))

        return articles, title_text
Beispiel #50
0
    def get_clustered_records(cls, doctree):
        """Mark similar same-level nodes as records and group them.

        For every level below the body, each node that is not already a
        record (or below one, per ``is_node_or_ancestor_record``) is
        compared with the later nodes of that level carrying the same tag.
        On the first ``similar_check`` match both nodes get
        ``kg_record_mark`` set to '1'.

        Returns the mapping produced by ``merger_sibling_record_node``.
        """
        #get level_nodes_mapping
        all_level_nodes = cls.bfs_tree(doctree)

        root = ElementHelper.get_root(doctree)
        body = ElementHelper.get_body(doctree)

        #get max level and min level
        upper_bound = int(ElementHelper.get_element_depth(root)) + 1
        low_bound = int(body.get(px)) + 1

        for level in range(low_bound, upper_bound):
            # The depth is measured from the root while the level map only
            # covers the body, so tolerate levels the map does not contain.
            #if parent is record node, then do not consider its children
            level_nodes = [node for node in all_level_nodes.get(level, ())
                           if not cls.is_node_or_ancestor_record(node)]

            # Every node except the last one acts as the left node.  The
            # previous ``range(1, len(level_nodes) - 1)`` stopped one short:
            # the second-to-last node was never a left node, so the last
            # node was never compared with its predecessor and a level
            # with exactly two nodes was never compared at all.
            for idx, left_node in enumerate(level_nodes[:-1]):
                # compare only against later same-level nodes with the same tag
                right_nodes = [node for node in level_nodes[idx + 1:]
                               if node.tag == left_node.tag]

                for right_node in right_nodes:
                    if cls.similar_check(left_node, right_node):
                        left_node.set(kg_record_mark, '1')
                        right_node.set(kg_record_mark, '1')
                        break

        record_groups = cls.merger_sibling_record_node(doctree)
        return record_groups
Beispiel #51
0
def cetd_parse(doctree):
    """Run the density-based marking passes over the document body.

    Steps, in order: the four counting passes, the link-character ratio
    (a zero character count is treated as one), text density and density
    sum computation, then marking of content against the threshold derived
    from the maximum density sum.  All results are stored on the tree;
    nothing is returned.
    """
    body = ElementHelper.get_body(doctree)

    # counting passes
    CountChar(body)
    CountTag(body)
    CountLinkChar(body)
    CountLinkTag(body)

    # share of link characters among all characters of the body
    total_chars = float(body.attrib.get(kg_char_num))
    if total_chars == 0:
        total_chars = 1
    link_chars = float(body.attrib.get(kg_linkchar_num))
    link_ratio = link_chars / total_chars

    # density passes
    ComputeTextDensity(body, link_ratio)
    ComputeDensitySum(body, link_ratio)
    peak_density_sum = FindMaxDensitySum(body)

    # marking
    SetMark(body, 0)
    threshold = GetThreshold(body, peak_density_sum)
    MarkContent(body, threshold)
Beispiel #52
0
def cetd_parse(doctree):
    """Annotate the body with counts and densities, then mark its content.

    Works in place on the tree obtained from ``doctree``: counting passes
    first, then ComputeTextDensity / ComputeDensitySum with the body's
    link-character ratio, then SetMark, GetThreshold and MarkContent.
    """
    body = ElementHelper.get_body(doctree)
    CountChar(body)
    CountTag(body)
    CountLinkChar(body)
    CountLinkTag(body)

    attributes = body.attrib
    # a body without characters would divide by zero; use 1 instead
    denominator = float(attributes.get(kg_char_num)) or 1
    ratio = float(attributes.get(kg_linkchar_num)) / denominator

    ComputeTextDensity(body, ratio)
    ComputeDensitySum(body, ratio)
    max_density_sum = FindMaxDensitySum(body)
    SetMark(body, 0)
    MarkContent(body, GetThreshold(body, max_density_sum))
Beispiel #53
0
def neigb_text_of_link(atag, is_in_main_cluster=False, level=2):
    """Return text found next to a link by looking at its ancestors.

    Starting with the parent of ``atag``, at most ``level`` ancestors are
    inspected; the first non-empty result of
    ``ElementHelper.element_text_tail`` is returned.

    atag               -- the link element
    is_in_main_cluster -- when false, no lookup is done and '' is returned
    level              -- maximum number of ancestors to inspect

    Returns '' when nothing is found or the top of the tree is reached.
    """
    if not is_in_main_cluster:
        return ''

    cnt = 0
    parent = atag.getparent()
    # The loop also ends at the top of the tree, where there is no parent.
    while cnt < level and parent is not None:
        parent_content = ElementHelper.element_text_tail(parent)
        if len(parent_content) > 0:
            # Previously the text was stored under a misspelled name and
            # the function always returned ''.
            return parent_content
        cnt += 1
        # Climb from the current ancestor; re-reading atag.getparent()
        # kept inspecting the same node on every round.
        parent = parent.getparent()
    return ''
Beispiel #54
0
def clean_body(clusters, doctree, title_node=None, debug = False):
    """Reduce the document body to the nodes of the main text.

    After ``filter_cluster``, the cluster chosen by ``get_biggest_cluster``
    defines a preorder range [s, t] from the children of its nodes.  Nodes
    in that range are marked with ``set_text_mark`` and the remaining ones
    are dropped with ``remove_nontext_element``.

    clusters   -- record clusters, as accepted by filter_cluster
    doctree    -- parsed document
    title_node -- optional title element; when it precedes the range, the
                  range is extended to start at it
    debug      -- accepted for interface compatibility; not used here

    Returns (body, title_text).  When no cluster survives filtering, the
    text content of ``title_node`` is returned in both positions instead.
    """
    #filter user comments and all link records
    clusters = filter_cluster(clusters)
    if len(clusters) == 0:
        title = ElementHelper.element_text_content(title_node)
        return title, title

    #choose cluster which has most texts
    maxCluster = get_biggest_cluster(clusters)
    nodes = clusters[maxCluster]

    #get all children of max cluster record
    allnodes = []
    for node in nodes:
        children = ElementHelper.get_children(node)
        allnodes.extend(children)

    #sort max cluster nodes by its preorder index
    # A key function gives the same ordering as the former comparison
    # function; passing a comparison function positionally to list.sort()
    # only works on Python 2.
    allnodes.sort(key=ElementHelper.get_element_preorder_num)
    s = ElementHelper.get_element_preorder_num(allnodes[0])
    t = ElementHelper.get_element_preorder_num(allnodes[-1])

    #correct start position by title node
    title_text = ''
    #====================================================================
    # In experiments, using the title to correct the start position of the
    # main text improves recall, but it is not needed in actual use.
    if title_node is not None:
        index = ElementHelper.get_element_preorder_num(title_node)
        if index < s:
            s = index #ToDo:add 2016/03/09
            title_text = ElementHelper.element_text_content(title_node)
    #====================================================================

    body = ElementHelper.get_body(doctree)
    # remove nodes which not belong to main text
    set_text_mark(body, s, t)
    remove_nontext_element(body)

    return body, title_text
Beispiel #55
0
    def merger_sibling_record_node(cls, doctree):
        """Group record nodes under their first record sibling.

        Walks the tree below the body level by level.  On each level every
        node first goes through ``correct_record_mark``; nodes for which
        ``is_node_or_ancestor_record`` holds are appended to the list keyed
        by the result of ``find_first_sibling_record_node``.  The walk does
        not descend into such nodes.

        Returns a dict mapping that first sibling to its list of nodes.
        """
        node_record_mapping = {}

        body = ElementHelper.get_body(doctree)
        thislevel = list(body)
        while thislevel:
            for node in thislevel:
                # adjust the node's record mark before it is evaluated
                cls.correct_record_mark(node)

                if cls.is_node_or_ancestor_record(node):
                    first_record_sibling = cls.find_first_sibling_record_node(node, doctree)
                    node_record_mapping.setdefault(first_record_sibling, []).append(node)

            # Second pass, run only after every mark on this level is final.
            nextlevel = []
            for node in thislevel:
                # The record test concerns the node, not its children, so
                # it is evaluated once per node rather than once per child.
                if len(node) > 0 and not cls.is_node_or_ancestor_record(node):
                    nextlevel.extend(node)
            thislevel = nextlevel

        return node_record_mapping
Beispiel #56
0
def is_link_node_with_text(element):
    """ Return True for an 'a' element whose content is not none according
    to ElementHelper.is_element_content_none, otherwise False
    """
    has_content = not ElementHelper.is_element_content_none(element)
    return has_content and element.tag == 'a'
Beispiel #57
0
def is_none_link_node_with_text(element):
    """ Return True for an element other than 'a' whose text is not none
    according to ElementHelper.is_element_text_none, otherwise False
    """
    has_text = not ElementHelper.is_element_text_none(element)
    return has_text and element.tag != 'a'
Beispiel #58
0
def get_article_wish(clusters, doctree, title_node=None, debug = False):
    """Return (text content list of the cleaned body, title text).

    The body and the title text both come from clean_body; the body is
    turned into a list through ElementHelper.element_text_content_list.
    """
    cleaned = clean_body(clusters, doctree, title_node, debug)
    main_body, title_text = cleaned
    text_list = ElementHelper.element_text_content_list(main_body)
    return text_list, title_text
Beispiel #59
0
def CountChar(element):
    """Store on every node of the subtree the length of its text content.

    The length of ElementHelper.element_text_content(node) is written, as
    a string, to the node's ``kg_char_num`` attribute; the node itself is
    annotated before its children.
    """
    text = ElementHelper.element_text_content(element)
    element.set(kg_char_num, str(len(text)))
    for descendant in element:
        CountChar(descendant)
Beispiel #60
0
def remove_nontext_element(element):
    """Remove every node of the subtree whose text mark is '0'.

    A node whose ``kg_text_mark`` attribute is '0' (also the value assumed
    when the attribute is absent) is passed to
    ElementHelper.remove_element.  The children of the node are visited
    afterwards in either case.
    """
    mark = element.get(kg_text_mark, '0')
    is_nontext = mark == '0'
    if is_nontext:
        ElementHelper.remove_element(element)

    for descendant in element:
        remove_nontext_element(descendant)