def run(options, args): stdin = sys.stdin.read() if options.format == 'auto': if not re.compile("\<").match(stdin): options.format = 'txt' elif not re.compile("\<html").match(stdin): options.format = 'xml' else: options.format = 'html' if options.format == 'html': output = seolinter.lint_html(stdin) if options.format == 'xml': output = seolinter.lint_sitemap(stdin) if options.format == 'txt': output = seolinter.lint_robots_txt(stdin) exit = 0 for rule in seolinter.rules: for key, value in output.iteritems(): if key == rule[0]: print rule[0] + ':', rule[1], '(' + seolinter.levels[rule[2]] + ')' if value != True: print "\tfound:", value if rule[2] == seolinter.ERROR or rule[2] == seolinter.CRITICAL: exit = 1 # if exit: # print html_string sys.exit(exit)
def process_html(html, url):
    """Run every extraction pass over an HTML document.

    :param html: raw HTML markup to analyze.
    :param url: the URL the markup was fetched from (used to resolve links).
    :returns: a 4-tuple ``(lint_errors, page_details, links, sources)``.
    """
    return (
        seolinter.lint_html(html),
        extract_page_details(html, url),
        extract_links(html, url),
        extract_sources(html, url),
    )
def from_response(cls, response):
    """Build a page item from a crawler response.

    Populates SEO-relevant fields (title, meta description, h1s, robots,
    rel next/prev pagination links, phx content-item metadata) for any
    non-HEAD response that can be xpath-queried, and attaches seolinter
    results for HTML responses.

    :param response: crawler response; presumably a Scrapy Response —
                     TODO confirm against the caller.
    :returns: the populated page item.
    """
    def _first(values, default):
        # xpath/extract results are lists; take the first hit or a default.
        return values[0] if len(values) > 0 else default

    page = cls(response)

    if response.request.method != 'HEAD' and hasattr(response, 'xpath'):
        # page['content_hash'] = hashlib.sha256(
        #     response.body.encode('ascii', 'ignore')).hexdigest()
        page['body'] = response.body

        title = _first(response.xpath('//title/text()').extract(), '')
        page['page_title'] = title
        page['page_title_length'] = len(title)

        description = _first(response.xpath(
            '//meta[@name="description"]/@content').extract(), '')
        page['meta_description'] = description
        page['meta_description_length'] = len(description)

        h1s = response.xpath('//h1/text()').extract()
        page['h1_count'] = len(h1s)
        if len(h1s) > 0:
            page['h1_1'] = h1s[0]
            page['h1_length_1'] = len(h1s[0])
        if len(h1s) > 1:
            page['h1_2'] = h1s[1]
            page['h1_length_2'] = len(h1s[1])

        page['meta_robots'] = _first(response.xpath(
            '//meta[@name="robots"]/@content').extract(), None)

        # Bug fix: the original never called .extract() on these two
        # queries, so a selector list (not the href string) was stored,
        # unlike every other field in this method.
        page['rel_next'] = _first(
            response.xpath('//link[@rel="next"]/@href').extract(), None)
        page['rel_prev'] = _first(
            response.xpath('//link[@rel="prev"]/@href').extract(), None)

        page['content_item_id'] = _first(response.xpath(
            '//meta[@name="phx:content-item-id"]/@content').extract(), None)
        page['content_node_id'] = _first(response.xpath(
            '//meta[@name="phx:content-node-id"]/@content').extract(), None)
        page['object_type'] = _first(response.xpath(
            '//meta[@name="phx:content-object-type"]/@content').extract(), None)

    # The original tested isinstance(response, HtmlResponse) twice in a
    # row; one check suffices.  The HEAD guard is kept because this branch
    # is independent of the hasattr() block above.
    if isinstance(response, HtmlResponse) and \
            response.request.method != 'HEAD':
        res = lint_html(response.body)
        lint_keys = res.keys()
        # seolinter rule ids are prefixed with their severity letter.
        page['lint_critical'] = len([k for k in lint_keys if k[0] == 'C'])
        page['lint_error'] = len([k for k in lint_keys if k[0] == 'E'])
        page['lint_warn'] = len([k for k in lint_keys if k[0] == 'W'])
        page['lint_info'] = len([k for k in lint_keys if k[0] == 'I'])
        page['lint_results'] = json.dumps(res)

    return page
def from_response(cls, response):
    """Build a page item from a crawler response.

    For non-HEAD responses that support xpath queries, fills in the SEO
    fields (title/description lengths, h1 counts, robots directive,
    pagination rel links, phx content metadata).  HTML responses also get
    seolinter severity counts and the serialized lint results.

    :param response: crawler response; presumably a Scrapy Response —
                     TODO confirm against the caller.
    :returns: the populated page item.
    """
    page = cls(response)

    if response.request.method != 'HEAD' and hasattr(response, 'xpath'):
        # page['content_hash'] = hashlib.sha256(
        #     response.body.encode('ascii', 'ignore')).hexdigest()
        page['body'] = response.body

        titles = response.xpath('//title/text()').extract()
        title = titles[0] if titles else ''
        page['page_title'] = title
        page['page_title_length'] = len(title)

        descriptions = response.xpath(
            '//meta[@name="description"]/@content').extract()
        description = descriptions[0] if descriptions else ''
        page['meta_description'] = description
        page['meta_description_length'] = len(description)

        h1s = response.xpath('//h1/text()').extract()
        page['h1_count'] = len(h1s)
        if len(h1s) > 0:
            page['h1_1'] = h1s[0]
            page['h1_length_1'] = len(h1s[0])
        if len(h1s) > 1:
            page['h1_2'] = h1s[1]
            page['h1_length_2'] = len(h1s[1])

        robots = response.xpath(
            '//meta[@name="robots"]/@content').extract()
        page['meta_robots'] = robots[0] if robots else None

        # Bug fix: .extract() was missing on these two xpath queries, so
        # the raw selector list (not the href string) was stored, unlike
        # every other field here.
        rel_next = response.xpath('//link[@rel="next"]/@href').extract()
        page['rel_next'] = rel_next[0] if rel_next else None
        rel_prev = response.xpath('//link[@rel="prev"]/@href').extract()
        page['rel_prev'] = rel_prev[0] if rel_prev else None

        content_item_id = response.xpath(
            '//meta[@name="phx:content-item-id"]/@content').extract()
        page['content_item_id'] = (
            content_item_id[0] if content_item_id else None)

        content_node_id = response.xpath(
            '//meta[@name="phx:content-node-id"]/@content').extract()
        page['content_node_id'] = (
            content_node_id[0] if content_node_id else None)

        object_type = response.xpath(
            '//meta[@name="phx:content-object-type"]/@content').extract()
        page['object_type'] = object_type[0] if object_type else None

    # Collapsed the original's doubled isinstance(response, HtmlResponse)
    # test into a single check; the HEAD guard is preserved.
    if isinstance(response, HtmlResponse) and \
            response.request.method != 'HEAD':
        res = lint_html(response.body)
        lint_keys = res.keys()
        # seolinter rule ids are prefixed with their severity letter.
        page['lint_critical'] = len([k for k in lint_keys if k[0] == 'C'])
        page['lint_error'] = len([k for k in lint_keys if k[0] == 'E'])
        page['lint_warn'] = len([k for k in lint_keys if k[0] == 'W'])
        page['lint_info'] = len([k for k in lint_keys if k[0] == 'I'])
        page['lint_results'] = json.dumps(res)

    return page