Ejemplo n.º 1
0
    def _extract_common_features(self, node):
        """Build the feature counters for a single node.

        Depending on the node's tag this may also hide the node, move an
        internal style block, or drop a script subtree as a side effect.

        Args:
            node: Element exposing ``tag``, ``text`` and ``tail``.

        Returns:
            ``(True, features)`` when the node contributes features, where
            ``features`` is a dict of integer counters, or ``(False, None)``
            when the node was hidden, moved, dropped or skipped.
        """
        features = {
            "link_length": 0,
            "link_length_bak": 0,
            "link_count": 0,
            "image_link_count": 0,
            "short_link_count": 0,
            "text_length": 0,
            "large_text_count": 0,
            "image_count": 0,
        }

        if node.tag == "a":
            if not self._classifiers["valid_link_classifier"].classify(node):
                self._hide_node(node)
                return False, None
            self._extract_link_features(node, features)
        elif node.tag == "img":
            features["image_count"] = 1
            return True, features
        elif node.tag == "style":
            # Move internal styles in <body> to <head>.
            if self._config["operation_switches"]["move_internal_styles"]:
                self._move_internal_styles(node)
            return False, None
        elif node.tag == "script":
            # When scripts are not dropped, the node falls through and is
            # measured like any other node.
            if self._config["operation_switches"]["drop_scripts"]:
                node.drop_tree()
                return False, None
        elif node.tag in self._config["skipped_tags"]:
            return False, None

        # Count the node's own text and its tail separately and add them up.
        # The previous one-line form chained two conditional expressions with
        # "+", which bound as `a if c1 else ((0 + b) if c2 else 0)` and so
        # ignored the tail whenever node.text was present.
        text_length = 0
        for fragment in (node.text, node.tail):
            if fragment is not None:
                text_length += label_count(remove_space(fragment.strip()))
        features["text_length"] = text_length

        if text_length >= self._config["large_text_threshold"]:
            features["large_text_count"] = 1

        return True, features
Ejemplo n.º 2
0
 def is_empty_node(cls, node, default_empty_tags, invisible_tags):
     """Check whether a node has no text and no visible child elements.

     Args:
         node: Element exposing ``tag``, ``text_content()`` and
             ``getchildren()``.
         default_empty_tags: Tags that are never reported as empty; a node
             with one of these tags always yields ``False``.
         invisible_tags: Tags whose elements do not count as children when
             deciding emptiness.

     Returns:
         ``True`` when the node's text content is empty after
         ``remove_space`` and every child has a tag in ``invisible_tags``;
         ``False`` otherwise.
     """
     if node.tag in default_empty_tags:
         return False

     text_length = len(remove_space(node.text_content()))
     # Compare each child's *tag* with invisible_tags: testing the element
     # object itself never matched a tag name. any() also avoids
     # len(filter(...)), which raises TypeError on Python 3 where filter()
     # returns an iterator.
     has_visible_child = any(
         child.tag not in invisible_tags for child in node.getchildren()
     )
     return not has_visible_child and text_length == 0