Example #1
0
def clean_document(node):
    """Clean up the final document we return as the readable article.

    Walks every node under ``node``, blanks inline ``style`` attributes and
    queues unwanted nodes, which are dropped once the walk is finished so
    the tree is never modified while it is being iterated.

    :param node: root element of the article; may be ``None``.
    :returns: ``node`` after cleaning, or ``None`` when ``node`` is ``None``
        or has no child elements.

    """
    if node is None or len(node) == 0:
        return None

    LNODE.log(node, 2, "Processing doc")
    clean_list = ['object', 'h1']
    to_drop = []

    # If there is only one h2, they are probably using it as a header and
    # not a subheader, so remove it since we already have a header.
    if len(node.findall('.//h2')) == 1:
        LOG.debug('Adding H2 to list of nodes to clean.')
        clean_list.append('h2')

    for n in node.iter():
        LNODE.log(n, 2, "Cleaning iter node")
        # clean out any in-line style properties
        if 'style' in n.attrib:
            n.set('style', '')

        # Remove every tag named in clean_list, unless it is an embedded
        # video that ok_embedded_video() accepts. People love movies.
        if n.tag in clean_list:
            is_embed = n.tag in ('object', 'embed')
            if not (is_embed and ok_embedded_video(n)):
                LNODE.log(n, 2, "Dropping Node")
                to_drop.append(n)

        # Headings with a negative class weight or a high link density are
        # considered insignificant.
        if n.tag in ('h1', 'h2', 'h3', 'h4'):
            if get_class_weight(n) < 0 or get_link_density(n) > .33:
                LNODE.log(n, 2, "Dropping <hX>, it's insignificant")
                to_drop.append(n)

        # A <p> with no children and fewer than 5 characters of text is
        # filler.
        if n.tag == 'p':
            if len(n) == 0 and len(n.text_content()) < 5:
                LNODE.log(n, 2, 'Dropping extra <p>')
                to_drop.append(n)

        # finally try out the conditional cleaning of the target node
        if clean_conditionally(n):
            to_drop.append(n)

    # A node can be queued more than once and the root has no parent; the
    # parent check skips both cases.
    for doomed in to_drop:
        if doomed.getparent() is not None:
            doomed.drop_tree()
    return node
Example #2
0
def clean_document(node):
    """Clean up the final document we return as the readable article.

    Every node below ``node`` is visited once. Inline ``style`` attributes
    are emptied, and nodes judged unwanted are collected and only removed
    after the traversal, so iteration never runs over a changing tree.

    :param node: root element of the article; may be ``None``.
    :returns: the cleaned ``node``, or ``None`` if ``node`` is ``None`` or
        has no child elements.

    """
    if node is None or len(node) == 0:
        return None

    LNODE.log(node, 2, "Processing doc")
    clean_list = ['object', 'h1']
    to_drop = []

    # If there is only one h2, they are probably using it as a header and
    # not a subheader, so remove it since we already have a header.
    if len(node.findall('.//h2')) == 1:
        LOG.debug('Adding H2 to list of nodes to clean.')
        clean_list.append('h2')

    for n in node.iter():
        LNODE.log(n, 2, "Cleaning iter node")
        # clean out any in-line style properties
        if 'style' in n.attrib:
            n.set('style', '')

        # Tags in clean_list go away, except embedded videos approved by
        # ok_embedded_video(), which people usually want to see.
        if n.tag in clean_list:
            is_embed = n.tag in ('object', 'embed')
            if not (is_embed and ok_embedded_video(n)):
                LNODE.log(n, 2, "Dropping Node")
                to_drop.append(n)

        # Drop headings whose class weight is negative or whose link
        # density is above a third.
        if n.tag in ('h1', 'h2', 'h3', 'h4'):
            if get_class_weight(n) < 0 or get_link_density(n) > .33:
                LNODE.log(n, 2, "Dropping <hX>, it's insignificant")
                to_drop.append(n)

        # Childless paragraphs with under 5 characters of text are noise.
        if n.tag == 'p':
            if len(n) == 0 and len(n.text_content()) < 5:
                LNODE.log(n, 2, 'Dropping extra <p>')
                to_drop.append(n)

        # finally try out the conditional cleaning of the target node
        if clean_conditionally(n):
            to_drop.append(n)

    # The parent check skips the root and any node that an earlier
    # iteration of this loop already detached (nodes may be queued twice).
    for doomed in to_drop:
        if doomed.getparent() is not None:
            doomed.drop_tree()
    return node
Example #3
0
def check_siblings(candidate_node, candidate_list):
    """Pull related sibling nodes into the candidate node.

    Siblings can hold preambles or content that was split off by ads we
    removed. Each child of the candidate's parent is examined and, when it
    qualifies, appended to the candidate's element.

    :param candidate_node: object with a ``node`` element and a
        ``content_score``.
    :param candidate_list: container looked up by element; its values
        expose ``content_score``.
    :returns: ``candidate_node``, with qualifying siblings moved under its
        element.

    """
    target = candidate_node.node
    target_class = target.get('class')
    # A scored sibling needs a fifth of the candidate's score, but never
    # less than 10.
    threshold = max(10, candidate_node.content_score * 0.2)

    parent = target.getparent()
    if parent is None:
        # No parent means no siblings to look at.
        return candidate_node

    for sib in parent.getchildren():
        keep = sib is target
        if keep:
            LNODE.log(sib, 1, 'Sibling is the node so append')

        # Give a bonus if the sibling and the top candidate have the exact
        # same class name.
        if target_class and sib.get('class') == target_class:
            bonus = candidate_node.content_score * 0.2
        else:
            bonus = 0

        if sib in candidate_list:
            if candidate_list[sib].content_score + bonus >= threshold:
                keep = True

        if sib.tag == 'p':
            density = get_link_density(sib)
            text = sib.text_content()
            size = len(text)

            if size > 80 and density < 0.25:
                # Long paragraph with few links.
                keep = True
            elif size < 80 and density == 0 and ". " in text:
                # Short, link-free paragraph that contains a sentence break.
                keep = True

        if not keep:
            continue

        LNODE.log(sib, 1, 'Sibling being appended')
        if sib.tag not in ['div', 'p']:
            # We have a node that isn't a common block level element, like
            # a form or td tag. Turn it into a div so it doesn't get
            # filtered out later by accident.
            sib.tag = 'div'

        # The candidate itself is one of the siblings; never append it to
        # itself.
        if target != sib:
            target.append(sib)

    return candidate_node
Example #4
0
def check_siblings(candidate_node, candidate_list):
    """Look through siblings for content that might also be related.

    Things like preambles, content split by ads that we removed, etc.
    Qualifying siblings are appended to the candidate's element.

    :param candidate_node: object with a ``node`` element and a
        ``content_score``.
    :param candidate_list: container looked up by element; its values
        expose ``content_score``.
    :returns: ``candidate_node``.

    """
    candidate_css = candidate_node.node.get('class')
    potential_target = candidate_node.content_score * 0.2
    # Scored siblings need a fifth of the candidate's score, minimum 10.
    sibling_target_score = potential_target if potential_target > 10 else 10
    parent = candidate_node.node.getparent()
    siblings = parent.getchildren() if parent is not None else []

    for sibling in siblings:
        append = False
        content_bonus = 0

        if sibling is candidate_node.node:
            LNODE.log(sibling, 1, 'Sibling is the node so append')
            append = True

        # Give a bonus if sibling nodes and top candidates have the exact
        # same class name
        if candidate_css and sibling.get('class') == candidate_css:
            content_bonus += candidate_node.content_score * 0.2

        if sibling in candidate_list:
            adjusted_score = candidate_list[sibling].content_score + \
                content_bonus

            if adjusted_score >= sibling_target_score:
                append = True

        if sibling.tag == 'p':
            link_density = get_link_density(sibling)
            content = sibling.text_content()
            content_length = len(content)

            if content_length > 80 and link_density < 0.25:
                append = True
            elif content_length < 80 and link_density == 0:
                if ". " in content:
                    append = True

        if append:
            LNODE.log(sibling, 1, 'Sibling being appended')
            if sibling.tag not in ['div', 'p']:
                # We have a node that isn't a common block level element, like
                # a form or td tag. Turn it into a div so it doesn't get
                # filtered out later by accident.
                sibling.tag = 'div'

            # The candidate is one of its parent's children, so it shows up
            # in this loop too. An element must not be appended to itself.
            if sibling is not candidate_node.node:
                candidate_node.node.append(sibling)

    return candidate_node
Example #5
0
def clean_conditionally(node):
    """Tell whether ``node`` looks like bad content that should be removed.

    Only form, table, ul, div and p nodes are judged.

    :param node: element to inspect.
    :returns: ``True`` when the node should be dropped, ``False`` when it
        should stay, and ``None`` when its tag is not one we judge.

    """
    LNODE.log(node, 2, 'Cleaning conditionally node.')

    if node.tag not in ('form', 'table', 'ul', 'div', 'p'):
        # this is not the tag you're looking for
        LNODE.log(node, 2, 'Node cleared.')
        return

    weight = get_class_weight(node)
    # content_score = LOOK up the content score for this node we found
    # before else default to 0
    content_score = 0

    if weight + content_score < 0:
        LNODE.log(node, 2, 'Dropping conditional node')
        LNODE.log(node, 2, 'Weight + score < 0')
        return True

    if node.text_content().count(',') >= 10:
        # Plenty of commas: treat it as prose and keep it.
        LNODE.log(node, 2, 'Node Cleared final.')
        return False

    LOG.debug("There aren't 10 ,s so we're processing more")

    # With few commas, look for ominous signs: more non-paragraph elements
    # than paragraphs, dense links, stray embeds and so on.
    def tally(tag):
        return len(node.findall('.//' + tag))

    p = tally('p')
    img = tally('img')
    # Offset by 100, so the list rule below needs more than p + 100 items.
    li = tally('li') - 100
    inputs = tally('input')
    embed = sum(1 for e in node.findall('.//embed') if ok_embedded_video(e))
    link_density = get_link_density(node)
    content_length = len(node.text_content())

    # The first matching rule supplies the drop reason.
    reason = None
    if li > p and node.tag != 'ul' and node.tag != 'ol':
        reason = 'Conditional drop: li > p and not ul/ol'
    elif inputs > p / 3.0:
        reason = 'Conditional drop: inputs > p/3.0'
    elif content_length < 25 and (img == 0 or img > 2):
        reason = 'Conditional drop: len < 25 and 0/>2 images'
    elif weight < 25 and link_density > 0.2:
        reason = 'Conditional drop: weight small and link is dense'
    elif weight >= 25 and link_density > 0.5:
        reason = 'Conditional drop: weight big but link heavy'
    elif (embed == 1 and content_length < 75) or embed > 1:
        reason = 'Conditional drop: embed w/o much content or many embed'

    if reason is None:
        LNODE.log(node, 2, 'Node cleared')
        return False

    LNODE.log(node, 2, reason)
    LNODE.log(node, 2, 'Node will be removed')
    return True
Example #6
0
def clean_conditionally(node):
    """Judge whether ``node`` is bad content according to a set of rules.

    Nodes other than form, table, ul, div and p are not judged.

    :param node: element to inspect.
    :returns: ``True`` to drop the node, ``False`` to keep it, ``None``
        when the tag is outside the judged set.

    """
    judged_tags = ['form', 'table', 'ul', 'div', 'p']

    LNODE.log(node, 2, 'Cleaning conditionally node.')

    if node.tag not in judged_tags:
        # this is not the tag you're looking for
        LNODE.log(node, 2, 'Node cleared.')
        return

    weight = get_class_weight(node)
    # content_score = LOOK up the content score for this node we found
    # before else default to 0
    content_score = 0

    if weight + content_score < 0:
        LNODE.log(node, 2, 'Dropping conditional node')
        LNODE.log(node, 2, 'Weight + score < 0')
        return True

    if node.text_content().count(',') < 10:
        LOG.debug("There aren't 10 ,s so we're processing more")

        # If there are not very many commas, and the number of
        # non-paragraph elements is more than paragraphs or other ominous
        # signs, remove the element.
        p = len(node.findall('.//p'))
        img = len(node.findall('.//img'))
        # The li count is offset by 100 before being compared with p.
        li = len(node.findall('.//li')) - 100
        inputs = len(node.findall('.//input'))

        embed = 0
        for candidate in node.findall('.//embed'):
            if ok_embedded_video(candidate):
                embed += 1
        link_density = get_link_density(node)
        content_length = len(node.text_content())

        # Ordered rules; only the first one that holds is reported.
        rules = (
            (li > p and node.tag != 'ul' and node.tag != 'ol',
             'Conditional drop: li > p and not ul/ol'),
            (inputs > p / 3.0,
             'Conditional drop: inputs > p/3.0'),
            (content_length < 25 and (img == 0 or img > 2),
             'Conditional drop: len < 25 and 0/>2 images'),
            (weight < 25 and link_density > 0.2,
             'Conditional drop: weight small and link is dense'),
            (weight >= 25 and link_density > 0.5,
             'Conditional drop: weight big but link heavy'),
            ((embed == 1 and content_length < 75) or embed > 1,
             'Conditional drop: embed w/o much content or many embed'),
        )
        verdict = next((msg for hit, msg in rules if hit), None)
        remove_node = verdict is not None

        if remove_node:
            LNODE.log(node, 2, verdict)
            LNODE.log(node, 2, 'Node will be removed')
        else:
            LNODE.log(node, 2, 'Node cleared')
        return remove_node

    # nope, don't remove anything
    LNODE.log(node, 2, 'Node Cleared final.')
    return False
Example #7
0
    def clean_conditionally(node):
        """Report whether ``node`` looks like bad content worth removing.

        Only form, table, ul, div and p nodes are examined.

        :param node: element to inspect.
        :returns: ``True`` if the node should be dropped, ``False`` if it
            should be kept, ``None`` for tags that are not examined.

        """
        if node.tag not in ("form", "table", "ul", "div", "p"):
            # this is not the tag you're looking for
            return

        weight = get_class_weight(node)
        # content_score = LOOK up the content score for this node we found
        # before else default to 0
        content_score = 0

        if weight + content_score < 0:
            LNODE.log(node, 2, "Dropping conditional node")
            return True

        if node.text_content().count(",") >= 10:
            # Enough commas to look like prose; don't remove anything.
            return False

        LOG.debug("There aren't 10 ,s so we're processing more")

        # With few commas, remove the element when non-paragraph elements
        # outnumber paragraphs or other ominous signs show up.
        p = len(node.findall(".//p"))
        img = len(node.findall(".//img"))
        # The li count is offset by 100 before being compared with p.
        li = len(node.findall(".//li")) - 100
        inputs = len(node.findall(".//input"))
        embed = len([e for e in node.findall(".//embed") if ok_embedded_video(e)])
        link_density = get_link_density(node)
        content_length = len(node.text_content())

        # Rules are checked in order; the first hit logs and returns.
        if img > p:
            # this one has shown to do some extra image removals.
            # we could get around this by checking for caption info in the
            # images to try to do some scoring of good v. bad images.
            # failing example:
            # arstechnica.com/science/news/2012/05/1859s-great-auroral-stormthe-week-the-sun-touched-the-earth.ars
            LNODE.log(node, 2, "Conditional drop: img > p")
            return True

        if li > p and node.tag != "ul" and node.tag != "ol":
            LNODE.log(node, 2, "Conditional drop: li > p and not ul/ol")
            return True

        if inputs > p / 3.0:
            LNODE.log(node, 2, "Conditional drop: inputs > p/3.0")
            return True

        if content_length < 25 and (img == 0 or img > 2):
            LNODE.log(node, 2, "Conditional drop: len < 25 and 0/>2 images")
            return True

        if weight < 25 and link_density > 0.2:
            LNODE.log(node, 2, "Conditional drop: weight small and link is dense")
            return True

        if weight >= 25 and link_density > 0.5:
            LNODE.log(node, 2, "Conditional drop: weight big but link heavy")
            return True

        if (embed == 1 and content_length < 75) or embed > 1:
            LNODE.log(node, 2, "Conditional drop: embed without much content or many embed")
            return True

        return False
Example #8
0
    def clean_document(node):
        """Clean up the final document we return as the readable article.

        Visits every node under ``node``, blanks inline ``style`` attributes
        and queues unwanted nodes. The queued nodes are dropped only after
        the walk completes: dropping a node while ``node.iter()`` is still
        running modifies the tree under the iterator and can hand it
        detached nodes.

        :param node: root element of the article.
        :returns: ``node`` after cleaning.

        """
        LOG.debug("Cleaning document")
        clean_list = ["object", "h1"]
        to_drop = []

        # If there is only one h2, they are probably using it as a header and
        # not a subheader, so remove it since we already have a header.
        if len(node.findall(".//h2")) == 1:
            LOG.debug("Adding H2 to list of nodes to clean.")
            clean_list.append("h2")

        for n in node.iter():
            # clean out any inline style properties
            if "style" in n.attrib:
                n.set("style", "")

            # Remove every tag named in clean_list, unless it is an embedded
            # video that ok_embedded_video() accepts. People love movies.
            if n.tag in clean_list:
                is_embed = n.tag in ("object", "embed")
                if not (is_embed and ok_embedded_video(n)):
                    LNODE.log(n, 2, "Dropping Node")
                    to_drop.append(n)
                    # already queued, no need for further checks
                    continue

            # Headings with a negative class weight or a high link density
            # are considered insignificant.
            if n.tag in ("h1", "h2", "h3", "h4"):
                if get_class_weight(n) < 0 or get_link_density(n) > 0.33:
                    LNODE.log(n, 2, "Dropping <hX>, it's insignificant")
                    to_drop.append(n)
                    continue

            # A <p> with no children and fewer than 5 characters of text is
            # filler.
            if n.tag == "p":
                if len(n) == 0 and len(n.text_content()) < 5:
                    LNODE.log(n, 2, "Dropping extra <p>")
                    to_drop.append(n)
                    continue

            # finally try out the conditional cleaning of the target node
            if clean_conditionally(n):
                to_drop.append(n)

        # The root has no parent and cannot be dropped from a tree; the
        # parent check skips it.
        for doomed in to_drop:
            if doomed.getparent() is not None:
                doomed.drop_tree()

        return node