Esempio n. 1
0
def run(options, args):
    """Lint the document read from stdin, print the findings and exit.

    ``options.format`` selects the linter: ``'html'``, ``'xml'`` (sitemap),
    ``'txt'`` (robots.txt), or ``'auto'`` to sniff the format from the input.
    When it is ``'auto'`` the attribute is overwritten with the detected
    format.  ``args`` is accepted but not used.

    Every rule in ``seolinter.rules`` whose id appears in the lint output is
    printed together with its level name; the found value is printed as well
    unless it is exactly ``True``.  The process then exits through
    ``sys.exit`` with status 1 if any reported rule is an ERROR or CRITICAL,
    otherwise 0.

    Raises:
        ValueError: if ``options.format`` is not one of the known formats.
    """
    stdin = sys.stdin.read()

    if options.format == 'auto':
        # Leading whitespace must not hide the first tag from the sniffing.
        head = stdin.lstrip()
        if not head.startswith('<'):
            options.format = 'txt'
        elif re.search(r'<(?:!doctype\s+html|html)[\s>]', head, re.IGNORECASE):
            # Covers "<html ...>", "<HTML>" and documents that open with a
            # doctype declaration instead of the <html> tag itself.
            options.format = 'html'
        else:
            options.format = 'xml'

    if options.format == 'html':
        output = seolinter.lint_html(stdin)
    elif options.format == 'xml':
        output = seolinter.lint_sitemap(stdin)
    elif options.format == 'txt':
        output = seolinter.lint_robots_txt(stdin)
    else:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError('unknown format: %r' % (options.format,))

    exit_code = 0

    # Walk the rules (not the output) so findings are reported in rule order.
    for rule in seolinter.rules:
        rule_id, message, level = rule[0], rule[1], rule[2]
        if rule_id not in output:
            continue

        # Single-argument print() behaves the same on Python 2 and Python 3.
        print('%s: %s (%s)' % (rule_id, message, seolinter.levels[level]))

        value = output[rule_id]
        # Identity check: a found value of 1 equals True but must be shown.
        if value is not True:
            print('\tfound: %s' % (value,))

        if level in (seolinter.ERROR, seolinter.CRITICAL):
            exit_code = 1

    sys.exit(exit_code)
Esempio n. 2
0
def run(options, args):
    """Lint the document read from stdin, print the findings and exit.

    ``options.format`` selects the linter: ``'html'``, ``'xml'`` (sitemap),
    ``'txt'`` (robots.txt), or ``'auto'`` to sniff the format from the input.
    When it is ``'auto'`` the attribute is overwritten with the detected
    format.  ``args`` is accepted but not used.

    Every rule in ``seolinter.rules`` whose id appears in the lint output is
    printed together with its level name; the found value is printed as well
    unless it is exactly ``True``.  The process then exits through
    ``sys.exit`` with status 1 if any reported rule is an ERROR or CRITICAL,
    otherwise 0.

    Raises:
        ValueError: if ``options.format`` is not one of the known formats.
    """
    stdin = sys.stdin.read()

    if options.format == 'auto':
        # Leading whitespace must not hide the first tag from the sniffing.
        head = stdin.lstrip()
        if not head.startswith('<'):
            options.format = 'txt'
        elif re.search(r'<(?:!doctype\s+html|html)[\s>]', head, re.IGNORECASE):
            # Covers "<html ...>", "<HTML>" and documents that open with a
            # doctype declaration instead of the <html> tag itself.
            options.format = 'html'
        else:
            options.format = 'xml'

    if options.format == 'html':
        output = seolinter.lint_html(stdin)
    elif options.format == 'xml':
        output = seolinter.lint_sitemap(stdin)
    elif options.format == 'txt':
        output = seolinter.lint_robots_txt(stdin)
    else:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError('unknown format: %r' % (options.format,))

    exit_code = 0

    # Walk the rules (not the output) so findings are reported in rule order.
    for rule in seolinter.rules:
        rule_id, message, level = rule[0], rule[1], rule[2]
        if rule_id not in output:
            continue

        # Single-argument print() behaves the same on Python 2 and Python 3.
        print('%s: %s (%s)' % (rule_id, message, seolinter.levels[level]))

        value = output[rule_id]
        # Identity check: a found value of 1 equals True but must be shown.
        if value is not True:
            print('\tfound: %s' % (value,))

        if level in (seolinter.ERROR, seolinter.CRITICAL):
            exit_code = 1

    sys.exit(exit_code)
Esempio n. 3
0
def process_html(html, url):
    """Lint an HTML document and extract its details, links and sources.

    Returns a 4-tuple ``(lint_errors, page_details, links, sources)``; the
    helpers are invoked in exactly that order.
    """
    return (
        seolinter.lint_html(html),
        extract_page_details(html, url),
        extract_links(html, url),
        extract_sources(html, url),
    )
Esempio n. 4
0
def process_html(html, url):
    """Lint an HTML document and extract its details, links and sources.

    Returns a 4-tuple ``(lint_errors, page_details, links, sources)``; the
    helpers are invoked in exactly that order.
    """
    return (
        seolinter.lint_html(html),
        extract_page_details(html, url),
        extract_links(html, url),
        extract_sources(html, url),
    )
Esempio n. 5
0
    def from_response(cls, response):
        """Build a page object from a crawler response.

        Creates ``cls(response)`` and, unless the request was a ``HEAD`` or
        the response has no ``xpath`` method, fills in the body, the title,
        the meta description, the first two ``<h1>`` texts, the robots and
        ``rel="next"``/``rel="prev"`` hints and the ``phx:*`` content meta
        tags.  ``HtmlResponse`` instances are additionally run through
        ``lint_html``; the number of findings per leading key letter
        (C/E/W/I) and the JSON-encoded results are stored as well.

        Returns:
            The populated page object.
        """
        page = cls(response)

        # Nothing to extract from HEAD requests or non-selectable responses.
        if response.request.method == 'HEAD' or not hasattr(response, 'xpath'):
            return page

        def first(query, default=None):
            # First extracted match of an XPath query, or ``default``.
            matches = response.xpath(query).extract()
            return matches[0] if matches else default

        page['body'] = response.body

        title = first('//title/text()', '')
        page['page_title'] = title
        page['page_title_length'] = len(title)

        description = first('//meta[@name="description"]/@content', '')
        page['meta_description'] = description
        page['meta_description_length'] = len(description)

        h1s = response.xpath('//h1/text()').extract()
        page['h1_count'] = len(h1s)
        # Only the first two headings are recorded; absent ones stay unset.
        if len(h1s) > 0:
            page['h1_1'] = h1s[0]
            page['h1_length_1'] = len(h1s[0])
        if len(h1s) > 1:
            page['h1_2'] = h1s[1]
            page['h1_length_2'] = len(h1s[1])

        page['meta_robots'] = first('//meta[@name="robots"]/@content')

        # Extracted like every other field, so the stored value is the href
        # text rather than the raw object returned by ``xpath()``.
        page['rel_next'] = first('//link[@rel="next"]/@href')
        page['rel_prev'] = first('//link[@rel="prev"]/@href')

        page['content_item_id'] = first(
            '//meta[@name="phx:content-item-id"]/@content')
        page['content_node_id'] = first(
            '//meta[@name="phx:content-node-id"]/@content')
        page['object_type'] = first(
            '//meta[@name="phx:content-object-type"]/@content')

        # The HEAD case was already ruled out by the guard above.
        if isinstance(response, HtmlResponse):
            res = lint_html(response.body)
            # Result keys start with a one-letter severity prefix.
            for field, prefix in (('lint_critical', 'C'),
                                  ('lint_error', 'E'),
                                  ('lint_warn', 'W'),
                                  ('lint_info', 'I')):
                page[field] = sum(1 for key in res if key.startswith(prefix))
            page['lint_results'] = json.dumps(res)

        return page
Esempio n. 6
0
    def from_response(cls, response):
        """Build a page object from a crawler response.

        Creates ``cls(response)`` and, unless the request was a ``HEAD`` or
        the response has no ``xpath`` method, fills in the body, the title,
        the meta description, the first two ``<h1>`` texts, the robots and
        ``rel="next"``/``rel="prev"`` hints and the ``phx:*`` content meta
        tags.  ``HtmlResponse`` instances are additionally run through
        ``lint_html``; the number of findings per leading key letter
        (C/E/W/I) and the JSON-encoded results are stored as well.

        Returns:
            The populated page object.
        """
        page = cls(response)

        # Nothing to extract from HEAD requests or non-selectable responses.
        if response.request.method == 'HEAD' or not hasattr(response, 'xpath'):
            return page

        def first(query, default=None):
            # First extracted match of an XPath query, or ``default``.
            matches = response.xpath(query).extract()
            return matches[0] if matches else default

        page['body'] = response.body

        title = first('//title/text()', '')
        page['page_title'] = title
        page['page_title_length'] = len(title)

        description = first('//meta[@name="description"]/@content', '')
        page['meta_description'] = description
        page['meta_description_length'] = len(description)

        h1s = response.xpath('//h1/text()').extract()
        page['h1_count'] = len(h1s)
        # Only the first two headings are recorded; absent ones stay unset.
        if len(h1s) > 0:
            page['h1_1'] = h1s[0]
            page['h1_length_1'] = len(h1s[0])
        if len(h1s) > 1:
            page['h1_2'] = h1s[1]
            page['h1_length_2'] = len(h1s[1])

        page['meta_robots'] = first('//meta[@name="robots"]/@content')

        # Extracted like every other field, so the stored value is the href
        # text rather than the raw object returned by ``xpath()``.
        page['rel_next'] = first('//link[@rel="next"]/@href')
        page['rel_prev'] = first('//link[@rel="prev"]/@href')

        page['content_item_id'] = first(
            '//meta[@name="phx:content-item-id"]/@content')
        page['content_node_id'] = first(
            '//meta[@name="phx:content-node-id"]/@content')
        page['object_type'] = first(
            '//meta[@name="phx:content-object-type"]/@content')

        # The HEAD case was already ruled out by the guard above.
        if isinstance(response, HtmlResponse):
            res = lint_html(response.body)
            # Result keys start with a one-letter severity prefix.
            for field, prefix in (('lint_critical', 'C'),
                                  ('lint_error', 'E'),
                                  ('lint_warn', 'W'),
                                  ('lint_info', 'I')):
                page[field] = sum(1 for key in res if key.startswith(prefix))
            page['lint_results'] = json.dumps(res)

        return page