def generate(site): """Output the list of pages without a title to the given file descriptor.""" # get all internal pages without a title links = [ x for x in site.linkMap.values() if x.ispage and x.isinternal and (x.title is None or x.title == '') ] links.sort(lambda a, b: cmp(a.url, b.url)) # present results fp = plugins.open_html(plugins.notitles, site) if not links: fp.write( ' <p class="description">\n' ' All pages had a title specified.\n' ' </p>\n' ) plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' This is the list of all (internal) pages without a proper title\n' ' specified.\n' ' </p>\n' ' <ol>\n') for link in links: fp.write( ' <li>%(link)s</li>\n' % { 'link': plugins.make_link(link,link.url) }) link.add_pageproblem('missing title') fp.write( ' </ol>\n' ) plugins.close_html(fp)
def generate(site): """Generate the list of external links to the given file descriptor.""" # get all external links links = [ x for x in site.linkMap.values() if not x.isinternal ] # sort list links.sort(lambda a, b: cmp(a.url, b.url)) # present results fp = plugins.open_html(plugins.external, site) if not links: fp.write( ' <p class="description">' ' No external links were found on the website.' ' </p>\n' ) plugins.close_html(fp) return fp.write( ' <p class="description">' ' This is the list of all external urls encountered during the' ' examination of the website.' ' </p>\n' ' <ol>\n' ) for link in links: fp.write( ' <li>\n' ' %(link)s\n' % { 'link': plugins.make_link(link) }) # present a list of parents plugins.print_parents(fp, link, ' ') fp.write( ' </li>\n') fp.write( ' </ol>\n' ) plugins.close_html(fp)
def generate(site): """Generate the list of external links to the given file descriptor.""" # get all external links links = [x for x in site.linkMap.values() if not x.isinternal] # sort list links.sort(lambda a, b: cmp(a.url, b.url)) # present results fp = plugins.open_html(plugins.external, site) if not links: fp.write(' <p class="description">' ' No external links were found on the website.' ' </p>\n') plugins.close_html(fp) return fp.write(' <p class="description">' ' This is the list of all external urls encountered during the' ' examination of the website.' ' </p>\n' ' <ol>\n') for link in links: fp.write(' <li>\n' ' %(link)s\n' % {'link': plugins.make_link(link)}) # present a list of parents plugins.print_parents(fp, link, ' ') fp.write(' </li>\n') fp.write(' </ol>\n') plugins.close_html(fp)
def _explore(fp, link, explored=None, depth=0, indent='    '):
    """Recursively walk the graph of links on the site, printing the html
    results as a nested list to the file descriptor. Each page is visited
    at most once and children are only descended into at their
    breadth-first depth."""
    # set up explored
    if explored is None:
        explored = [link]
    # output this link
    fp.write(indent + '<li>\n')
    fp.write(indent + ' ' + plugins.make_link(link) + '\n')
    # only check children if we are not too deep yet
    if depth <= config.REPORT_SITEMAP_LEVEL:
        # figure out the links to follow and ensure that they are only
        # explored from here
        children = []
        for child in link.pagechildren:
            # skip pages that have the wrong depth, are not internal or
            # have already been visited
            if child.depth != depth + 1 or not child.isinternal \
               or child in explored:
                continue
            # mark the child as explored and add it to the list to present
            explored.append(child)
            children.append(child)
        # go over the children and present them as a list
        if children:
            fp.write(indent + ' <ul>\n')
            children.sort(lambda a, b: cmp(a.url, b.url))
            for child in children:
                _explore(fp, child, explored, depth + 1, indent + '  ')
            fp.write(indent + ' </ul>\n')
    fp.write(indent + '</li>\n')
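# The generate() entry point that drives _explore() is not part of this
# excerpt. The sketch below is a hypothetical reconstruction for context:
# the names plugins.sitemap and site.base are assumptions, not confirmed by
# the code above.
def generate(site):
    """Output a sitemap of the examined site (sketch under the assumptions
    noted above)."""
    fp = plugins.open_html(plugins.sitemap, site)
    fp.write(
      '   <p class="description">\n'
      '    This is a map of the site\'s internal pages.\n'
      '   </p>\n'
      '   <ul>\n')
    # start exploring from the site's root link (assumed to be site.base)
    _explore(fp, site.linkMap[site.base])
    fp.write('   </ul>\n')
    plugins.close_html(fp)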
def generate(site): """Output a list of images to the given file descriptor.""" # this finds all links with a reasonable image-like content-type matcher = re.compile('^image/.*$') # get non-page images that have an image/* mimetype links = [ x for x in site.linkMap.values() if not x.ispage and x.mimetype is not None and matcher.search(x.mimetype) ] # sort list links.sort(lambda a, b: cmp(a.url, b.url)) # present results fp = plugins.open_html(plugins.images, site) if not links: fp.write( ' <p class="description">\n' ' No images were linked on the website.\n' ' </p>\n' ' <ol>\n' ) plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' This is the list of all images found linked on the website.\n' ' </p>\n' ' <ol>\n' ) for link in links: fp.write(' <li>%s</li>\n' % plugins.make_link(link, link.url)) fp.write( ' </ol>\n' ) plugins.close_html(fp)
def generate(site): """Output the list of not checked pages to the given file descriptor.""" # get all yanked urls links = [ x for x in site.linkMap.values() if x.isyanked ] links.sort(lambda a, b: cmp(a.url, b.url)) # present results fp = plugins.open_html(plugins.notchkd, site) if not links: fp.write( ' <p class="description">\n' ' All links have been checked.\n' ' </p>\n' ) plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' This is the list of all urls that were encountered but not checked\n' ' at all during the examination of the website.\n' ' </p>\n' ' <ol>\n') for link in links: fp.write( ' <li>\n' ' %(link)s\n' % { 'link': plugins.make_link(link, link.url) }) # present a list of parents plugins.print_parents(fp, link, ' ') fp.write( ' </li>\n') fp.write( ' </ol>\n' ) plugins.close_html(fp)
def generate(site): """Output the list of pages without a title to the given file descriptor.""" # get all internal pages without a title links = [ x for x in site.linkMap.values() if x.ispage and x.isinternal and (x.title is None or x.title == '') ] links.sort(lambda a, b: cmp(a.url, b.url)) # present results fp = plugins.open_html(plugins.notitles, site) if not links: fp.write(' <p class="description">\n' ' All pages had a title specified.\n' ' </p>\n') plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' This is the list of all (internal) pages without a proper title\n' ' specified.\n' ' </p>\n' ' <ol>\n') for link in links: fp.write(' <li>%(link)s</li>\n' % {'link': plugins.make_link(link, link.url)}) link.add_pageproblem('missing title') fp.write(' </ol>\n') plugins.close_html(fp)
def generate(site): """Output the list of not checked pages to the given file descriptor.""" # get all yanked urls links = [x for x in site.linkMap.values() if x.isyanked] links.sort(lambda a, b: cmp(a.url, b.url)) # present results fp = plugins.open_html(plugins.notchkd, site) if not links: fp.write(' <p class="description">\n' ' All links have been checked.\n' ' </p>\n') plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' This is the list of all urls that were encountered but not checked\n' ' at all during the examination of the website.\n' ' </p>\n' ' <ol>\n') for link in links: fp.write(' <li>\n' ' %(link)s\n' % {'link': plugins.make_link(link, link.url)}) # present a list of parents plugins.print_parents(fp, link, ' ') fp.write(' </li>\n') fp.write(' </ol>\n') plugins.close_html(fp)
def generate(site): """Output a list of images to the given file descriptor.""" # this finds all links with a reasonable image-like content-type matcher = re.compile('^image/.*$') # get non-page images that have an image/* mimetype links = [ x for x in site.linkMap.values() if not x.ispage and x.mimetype is not None and matcher.search(x.mimetype) ] # sort list links.sort(lambda a, b: cmp(a.url, b.url)) # present results fp = plugins.open_html(plugins.images, site) if not links: fp.write(' <p class="description">\n' ' No images were linked on the website.\n' ' </p>\n' ' <ol>\n') plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' This is the list of all images found linked on the website.\n' ' </p>\n' ' <ol>\n') for link in links: fp.write(' <li>%s</li>\n' % plugins.make_link(link, link.url)) fp.write(' </ol>\n') plugins.close_html(fp)
def generate(site): """Present the list of bad links to the given file descriptor.""" # find all links with link problems links = [ x for x in site.linkMap.values() if len(x.linkproblems)>0 ] # sort list links.sort(lambda a, b: cmp(a.url, b.url)) # present results fp = plugins.open_html(plugins.badlinks, site) if not links: fp.write( ' <p class="description">\n' ' There were no problems retrieving links from the website.\n' ' </p>\n' ' <ol>\n' ) plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' These links could not be retrieved during the crawling of the website.\n' ' </p>\n' ' <ol>\n' ) for link in links: # list the link fp.write( ' <li>\n' ' %(badurl)s\n' ' <ul class="problems">\n' % { 'badurl': plugins.make_link(link,link.url) }) # list the problems for problem in link.linkproblems: fp.write( ' <li>%(problem)s</li>\n' % { 'problem': plugins.htmlescape(problem) }) fp.write( ' </ul>\n') # present a list of parents link.parents.sort() plugins.print_parents(fp, link, ' ') # add a reference to the problem map for problem in link.linkproblems: for parent in link.parents: parent.add_pageproblem('bad link: ' + link.url + ': ' + problem) fp.write( ' </li>\n') fp.write( ' </ol>\n' ) plugins.close_html(fp)
def generate(site): """Output a sorted list of urls to the specified file descriptor.""" fp = plugins.open_html(plugins.urllist, site) fp.write( ' <p class="description">\n' ' This is the list of all urls encountered during the examination of\n' ' the website. It lists internal as well as external and\n' ' non-examined urls.\n' ' </p>\n' ' <ol>\n') urls = site.linkMap.keys() urls.sort() for url in urls: fp.write(' <li>' + plugins.make_link(site.linkMap[url], url) + '</li>\n') fp.write(' </ol>\n') plugins.close_html(fp)
def generate(site): """Output a sorted list of urls to the specified file descriptor.""" fp = plugins.open_html(plugins.urllist, site) fp.write( ' <p class="description">\n' ' This is the list of all urls encountered during the examination of\n' ' the website. It lists internal as well as external and\n' ' non-examined urls.\n' ' </p>\n' ' <ol>\n' ) urls = site.linkMap.keys() urls.sort() for url in urls: fp.write(' <li>'+plugins.make_link(site.linkMap[url], url)+'</li>\n') fp.write( ' </ol>\n' ) plugins.close_html(fp)
def generate(site): """Output the list of large pages to the given file descriptor.""" # get all internal pages and get big links links = [ x for x in site.linkMap.values() if x.ispage and x.isinternal and _getsize(x) >= config.REPORT_SLOW_URL_SIZE*1024 ] # sort links by size (biggest first) links.sort(lambda a, b: cmp(b.totalSize, a.totalSize)) # present results fp = plugins.open_html(plugins.size, site) if not links: fp.write( ' <p class="description">\n' ' No pages over %(size)sK were found.\n' ' </p>\n' % { 'size': config.REPORT_SLOW_URL_SIZE }) plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' These pages are probably too big (over %(size)sK) which could be\n' ' slow to download.\n' ' </p>\n' ' <ul>\n' % { 'size': config.REPORT_SLOW_URL_SIZE }) for link in links: size = plugins.get_size(link.totalSize) fp.write( ' <li>\n' ' %(link)s\n' ' <ul class="problem">\n' ' <li>size: %(size)s</li>\n' ' </ul>\n' ' </li>\n' % { 'link': plugins.make_link(link), 'size': size }) link.add_pageproblem( 'this page and its components is %(size)s' % { 'size': size }) fp.write( ' </ul>\n' ) plugins.close_html(fp)
def generate(site): """Present the list of bad links to the given file descriptor.""" # find all links with link problems links = [x for x in site.linkMap.values() if len(x.linkproblems) > 0] # sort list links.sort(lambda a, b: cmp(a.url, b.url)) # present results fp = plugins.open_html(plugins.badlinks, site) if not links: fp.write( ' <p class="description">\n' ' There were no problems retrieving links from the website.\n' ' </p>\n' ' <ol>\n') plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' These links could not be retrieved during the crawling of the website.\n' ' </p>\n' ' <ol>\n') for link in links: # list the link fp.write(' <li>\n' ' %(badurl)s\n' ' <ul class="problems">\n' % {'badurl': plugins.make_link(link, link.url)}) # list the problems for problem in link.linkproblems: fp.write(' <li>%(problem)s</li>\n' % {'problem': plugins.htmlescape(problem)}) fp.write(' </ul>\n') # present a list of parents link.parents.sort() plugins.print_parents(fp, link, ' ') # add a reference to the problem map for problem in link.linkproblems: for parent in link.parents: parent.add_pageproblem('bad link: ' + link.url + ': ' + problem) fp.write(' </li>\n') fp.write(' </ol>\n') plugins.close_html(fp)
def generate(site): """Output the list of recently modified pages to the specified file descriptor.""" # the time for which links are considered new newtime = time.time()-SECS_PER_DAY*config.REPORT_WHATSNEW_URL_AGE # get all internal pages that are new links = [ x for x in site.linkMap.values() if x.ispage and x.isinternal and x.mtime is not None and x.mtime > newtime ] # sort links links.sort(lambda a, b: cmp(b.mtime, a.mtime)) # present results fp = plugins.open_html(plugins.new, site) if not links: fp.write( ' <p class="description">\n' ' No pages were found that were modified within the last %(new)d days.\n' ' </p>\n' % { 'new': config.REPORT_WHATSNEW_URL_AGE }) plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' These pages have been recently modified (within %(new)d days).\n' ' </p>\n' ' <ul>\n' % { 'new': config.REPORT_WHATSNEW_URL_AGE }) for link in links: age = (time.time()-link.mtime)/SECS_PER_DAY fp.write( ' <li>\n' ' %(link)s\n' ' <ul class="problems">\n' ' <li>age: %(age)d days</li>\n' ' </ul>\n' ' </li>\n' % { 'link': plugins.make_link(link), 'age': age }) fp.write(' </ul>\n') plugins.close_html(fp)
def generate(site): """Output the list of outdated pages to the specified file descriptor.""" # the time for which links are considered old oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE # get all internal pages that are old links = [ x for x in site.linkMap.values() if x.ispage and x.isinternal and x.mtime is not None and x.mtime < oldtime ] # sort links links.sort(lambda a, b: cmp(a.mtime, b.mtime)) # present results fp = plugins.open_html(plugins.old, site) if not links: fp.write( ' <p class="description">\n' ' No pages were found that were older than %(old)d days old.\n' ' </p>\n' % {'old': config.REPORT_WHATSOLD_URL_AGE}) plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' These pages have been modified a long time ago (older than %(old)d\n' ' days) and may be outdated.\n' ' </p>\n' ' <ul>\n' % {'old': config.REPORT_WHATSOLD_URL_AGE}) for link in links: age = (time.time() - link.mtime) / SECS_PER_DAY fp.write(' <li>\n' ' %(link)s\n' ' <ul class="problems">\n' ' <li>age: %(age)d days</li>\n' ' </ul>\n' ' </li>\n' % { 'link': plugins.make_link(link), 'age': age }) # add link to problem database link.add_pageproblem('this page is %d days old' % age) fp.write(' </ul>\n') plugins.close_html(fp)
def generate(site): """Output the list of outdated pages to the specified file descriptor.""" # the time for which links are considered old oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE # get all internal pages that are old links = [ x for x in site.linkMap.values() if x.ispage and x.isinternal and x.mtime is not None and x.mtime < oldtime ] # sort links links.sort(lambda a, b: cmp(a.mtime, b.mtime)) # present results fp = plugins.open_html(plugins.old, site) if not links: fp.write( ' <p class="description">\n' " No pages were found that were older than %(old)d days old.\n" " </p>\n" % {"old": config.REPORT_WHATSOLD_URL_AGE} ) plugins.close_html(fp) return fp.write( ' <p class="description">\n' " These pages have been modified a long time ago (older than %(old)d\n" " days) and may be outdated.\n" " </p>\n" " <ul>\n" % {"old": config.REPORT_WHATSOLD_URL_AGE} ) for link in links: age = (time.time() - link.mtime) / SECS_PER_DAY fp.write( " <li>\n" " %(link)s\n" ' <ul class="problems">\n' " <li>age: %(age)d days</li>\n" " </ul>\n" " </li>\n" % {"link": plugins.make_link(link), "age": age} ) # add link to problem database link.add_pageproblem("this page is %d days old" % age) fp.write(" </ul>\n") plugins.close_html(fp)
def generate(site): """Output the list of large pages to the given file descriptor.""" # get all internal pages and get big links links = [ x for x in site.linkMap.values() if x.ispage and x.isinternal and _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024 ] # sort links by size (biggest first) links.sort(lambda a, b: cmp(b.totalSize, a.totalSize)) # present results fp = plugins.open_html(plugins.size, site) if not links: fp.write(' <p class="description">\n' ' No pages over %(size)sK were found.\n' ' </p>\n' % {'size': config.REPORT_SLOW_URL_SIZE}) plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' These pages are probably too big (over %(size)sK) which could be\n' ' slow to download.\n' ' </p>\n' ' <ul>\n' % {'size': config.REPORT_SLOW_URL_SIZE}) for link in links: size = plugins.get_size(link.totalSize) fp.write(' <li>\n' ' %(link)s\n' ' <ul class="problem">\n' ' <li>size: %(size)s</li>\n' ' </ul>\n' ' </li>\n' % { 'link': plugins.make_link(link), 'size': size }) link.add_pageproblem('this page and its components is %(size)s' % {'size': size}) fp.write(' </ul>\n') plugins.close_html(fp)
def generate(site): """Output the list of recently modified pages to the specified file descriptor.""" # the time for which links are considered new newtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSNEW_URL_AGE # get all internal pages that are new links = [ x for x in site.linkMap.values() if x.ispage and x.isinternal and x.mtime is not None and x.mtime > newtime ] # sort links links.sort(lambda a, b: cmp(b.mtime, a.mtime)) # present results fp = plugins.open_html(plugins.new, site) if not links: fp.write( ' <p class="description">\n' ' No pages were found that were modified within the last %(new)d days.\n' ' </p>\n' % {'new': config.REPORT_WHATSNEW_URL_AGE}) plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' These pages have been recently modified (within %(new)d days).\n' ' </p>\n' ' <ul>\n' % {'new': config.REPORT_WHATSNEW_URL_AGE}) for link in links: age = (time.time() - link.mtime) / SECS_PER_DAY fp.write(' <li>\n' ' %(link)s\n' ' <ul class="problems">\n' ' <li>age: %(age)d days</li>\n' ' </ul>\n' ' </li>\n' % { 'link': plugins.make_link(link), 'age': age }) fp.write(' </ul>\n') plugins.close_html(fp)
def generate(site): """Output the overview of problems to the given file descriptor.""" # make a list of problems per author problem_db = {} for link in site.linkMap.values(): # skip external pages if not link.isinternal or len(link.pageproblems) == 0: continue # make a normal name for the author if link.author: author = link.author.strip() else: author = unicode('Unknown') # store the problem if problem_db.has_key(author): problem_db[author].append(link) else: problem_db[author] = [link] fp = plugins.open_html(plugins.problems, site) if not problem_db: fp.write( ' <p class="description">\n' ' No problems were found on this site, hurray.\n' ' </p>\n' ) plugins.close_html(fp) return # print description fp.write( ' <p class="description">\n' ' This is an overview of all the problems on the site, grouped by\n' ' author.\n' ' </p>\n' ) # get a list of authors authors = problem_db.keys() authors.sort() # generate short list of authors if len(authors) > 1: fp.write(' <ul class="authorlist">\n') for author in authors: fp.write( ' <li><a href="#%(authorref)s">Author: %(author)s</a></li>\n' % { 'authorref': urllib.quote(author,''), 'author': plugins.htmlescape(author) }) fp.write(' </ul>\n') # generate problem report fp.write(' <ul>\n') for author in authors: fp.write( ' <li>\n' ' <a name="%(authorref)s">Author: %(author)s</a>\n' ' <ul>\n' % { 'authorref': urllib.quote(author,''), 'author': plugins.htmlescape(author) }) # sort pages by url problem_db[author].sort(lambda a, b: cmp(a.url, b.url)) # list problems for this author for link in problem_db[author]: # present the links fp.write( ' <li>\n' ' %(link)s\n' ' <ul class="problems">\n' % { 'link': plugins.make_link(link) }) # sort problems by name link.pageproblems.sort() # list the problems for problem in link.pageproblems: fp.write( ' <li>%(problem)s</li>\n' % { 'problem': plugins.htmlescape(problem) }) # end the list item fp.write( ' </ul>\n' ' </li>\n' ) fp.write( ' </ul>\n' ' </li>\n' ) fp.write( ' </ul>\n' ) plugins.close_html(fp)