def preprocess(cls, dom):
    """Recursively prune unwanted child nodes from ``dom`` in place.

    A direct child is removed with ``drop_tree()`` when any of these holds:

    * it is not an ``HtmlElement``;
    * its tag is in ``DomTreeHelper.filtered_preprocess_tags``;
    * ``misc.find_list`` reports a match for its ``style`` attribute against
      ``DomTreeHelper.filtered_styles``;
    * ``misc.find_list`` reports a match for its ``class`` or ``id``
      attribute against ``DomTreeHelper.filtered_classids``.

    Children that survive are processed recursively. Nothing is returned;
    the tree is modified in place.
    """
    # (attribute name, filter list) pairs checked on every child. The style
    # check previously inspected the parent node (``dom``) instead of the
    # child, so a child's own style was never examined.
    attribute_filters = (
        ("style", DomTreeHelper.filtered_styles),
        ("class", DomTreeHelper.filtered_classids),
        ("id", DomTreeHelper.filtered_classids),
    )

    # cut useless/spam dom nodes
    # NOTE(review): assumes getchildren() returns a list snapshot (as lxml
    # does), so dropping nodes while looping is safe - confirm for other
    # DOM implementations.
    for child in dom.getchildren():
        unwanted = (
            not isinstance(child, HtmlElement)
            or child.tag in DomTreeHelper.filtered_preprocess_tags
        )
        if not unwanted:
            for attr_name, patterns in attribute_filters:
                value = child.get(attr_name)
                if value is not None and misc.find_list(value, patterns):
                    unwanted = True
                    break

        if unwanted:
            child.drop_tree()
        else:
            DomTreeHelper.preprocess(child)
def is_domain_url(self, url):
    """Tell whether ``url`` looks like a site's root ("domain") URL.

    Returns True when the URL has an empty query string and its path is
    empty, is exactly "/", or is matched by ``misc.find_list`` using a
    "path starts with this filename" predicate over
    ``domain_url_filenames``. Returns False otherwise.
    """
    parts = urlparse.urlparse(url)
    path = parts.path

    # The path is tested before the query, and the cheap checks before the
    # find_list lookup, mirroring the original evaluation order.
    root_like = (
        not path
        or path == "/"
        or misc.find_list(lambda filename: path.startswith(filename),
                          domain_url_filenames)
    )
    if not root_like:
        return False
    if parts.query:
        return False
    return True
def validate(self, url, html, headers, extras=None):
    """Decide whether a fetched document should be accepted.

    Parameters:
        url: URL of the document (not used by the checks below).
        html: document body; a missing (None) or empty body is rejected.
        headers: mapping of response header names to string values.
        extras: optional extra data (not used by the checks below).

    Returns:
        A ``(ok, reason)`` tuple: ``(True, None)`` when the document passes,
        otherwise ``(False, <message>)`` describing why it was rejected.
    """
    # content type filtering
    content_type = headers.get('Content-Type', None)
    if content_type is not None:
        supported = self._settings["general_crawl_policies"]["supported_content_types"]
        if not misc.find_list(content_type.lower(), supported):
            # The original built this tuple but never returned it, so the
            # content-type filter had no effect.
            return False, "filtered by content_type %s" % content_type

    # doc check: reject when the server declares a zero-length body or the
    # body we actually received is empty.
    declared_empty = (
        'Content-Length' in headers
        and headers['Content-Length'].strip() == "0"
    )
    if declared_empty or not html:
        return False, "doc is empty"

    return True, None