def output_module_html(webpage_path):
    '''Output an HTML page for each module

    webpage_path - directory that receives one "<module name>.html" file
                   per module. It is created if it does not exist.

    Modules whose source file lives in the plugin directory are skipped.
    Modules whose get_help() returns None are listed in the index but no
    page is written for them.

    returns an HTML fragment: a heading followed by a list of categories,
            each holding links to the pages of its modules.

    raises ValueError if the output directory cannot be created.
    '''
    icons_relpath = relpath(cellprofiler.icons.__path__[0])
    all_png_icons = glob(os.path.join(icons_relpath, "*.png"))
    # Icon base names (".png" stripped); only used for membership tests.
    icon_names = set(os.path.basename(f)[:-4] for f in all_png_icons)

    help_text = "\n<h2>Help for CellProfiler Modules</h2>\n<ul>\n"
    d = {}
    module_path = webpage_path
    if not (os.path.exists(module_path) and os.path.isdir(module_path)):
        try:
            os.mkdir(module_path)
        except (IOError, OSError):
            # os.mkdir reports failure with OSError; IOError is kept for
            # backward compatibility with the previous handler.
            raise ValueError("Could not create directory %s" % module_path)

    for module_name in sorted(get_module_names()):
        module = instantiate_module(module_name)
        location = os.path.split(
            module.create_settings.im_func.func_code.co_filename)[0]
        if location == cpprefs.get_plugin_directory():
            continue
        if isinstance(module.category, (str, unicode)):
            module.category = [module.category]
        for category in module.category:
            d.setdefault(category, {})[module_name] = module
        result = module.get_help()
        if result is None:
            continue
        result = result.replace('<body><h1>', '<body><h1>Module: ')
        # Replace refs to icons in memory with the relative path to the
        # image dir. A literal replace needs no regex escaping of the
        # path separator.
        result = result.replace("memory:", os.path.join("images", ""))

        # Check if a corresponding image exists for the module
        if module_name in icon_names:
            # Strip out end html tags so more content can be appended
            result = result.replace('</body>', '').replace('</html>', '')

            # Include images specific to the module, relative to the html
            # files ('images' dir)
            LOCATION_MODULE_IMAGES = os.path.join(
                'images', '%s.png' % (module_name))
            result += (
                '\n\n<div><p><img src="%s" width="50%%"></p></div>\n'
                % LOCATION_MODULE_IMAGES)

            # Now end the help text
            result += '</body></html>'
        with open(os.path.join(module_path,
                               "%s.html" % module_name), "w") as fd:
            fd.write(result)

    for category in sorted(d.keys()):
        sub_d = d[category]
        help_text += "<li><b>%s</b><br><ul>\n" % category
        for module_name in sorted(sub_d.keys()):
            help_text += "<li><a href='%s.html'>%s</a></li>\n" % (
                module_name, module_name)
        help_text += "</ul></li>\n"
    help_text += "</ul>\n"
    return help_text
def plugin_list():
    '''Return the module names of the .py files in the plugin directory

    The names are the file base names with the ".py" extension removed.
    Any path ending in "__init__.py" is left out. An empty list is
    returned when no plugin directory is configured or it is not a
    directory.
    '''
    plugin_dir = cpprefs.get_plugin_directory()
    if plugin_dir is None or not os.path.isdir(plugin_dir):
        return []
    names = []
    for path in glob.glob(os.path.join(plugin_dir, '*.py')):
        if path.endswith('__init__.py'):
            continue
        # Drop the trailing ".py"
        names.append(os.path.basename(path)[:-3])
    return names
def find_module(self, fullname, path=None):
    '''Import-hook finder for modules under cellprofiler.modules.plugins

    fullname - fully qualified name of the module being imported
    path     - unused

    returns self (to act as the loader) when fullname is a direct child
            of "cellprofiler.modules.plugins" and a matching .py file
            exists in the plugin directory, otherwise None.
    '''
    if not fullname.startswith('cellprofiler.modules.plugins'):
        return None
    prefix, modname = fullname.rsplit('.', 1)
    if prefix != 'cellprofiler.modules.plugins':
        return None
    plugin_dir = cpprefs.get_plugin_directory()
    if plugin_dir is None:
        # No plugin directory configured: nothing for this finder to
        # offer, and os.path.join would fail on None.
        return None
    if os.path.exists(os.path.join(plugin_dir, modname + '.py')):
        return self
    return None
def load_module(self, fullname):
    '''Import-hook loader: execute a plugin source file as a module

    fullname - fully qualified module name; its parent package must be
               "cellprofiler.modules.plugins"

    returns the module already registered in sys.modules under fullname,
            or a new module built by executing "<plugin dir>/<name>.py".

    If reading or executing the file fails, the partially initialised
    module is removed from sys.modules and the original exception is
    re-raised.
    '''
    if fullname in sys.modules:
        return sys.modules[fullname]

    prefix, modname = fullname.rsplit('.', 1)
    assert prefix == 'cellprofiler.modules.plugins'

    try:
        mod = imp.new_module(fullname)
        # Register before executing so that imports of this module made
        # while it runs find it.
        sys.modules[fullname] = mod
        mod.__loader__ = self
        mod.__file__ = os.path.join(cpprefs.get_plugin_directory(),
                                    modname + '.py')
        with open(mod.__file__, "r") as fd:
            contents = fd.read()
        # The tuple form of exec is accepted by both Python 2 and 3.
        exec(compile(contents, mod.__file__, "exec"), mod.__dict__)
        return mod
    except:
        # Do not leave a half-initialised module behind.
        if fullname in sys.modules:
            del sys.modules[fullname]
        raise
def search_module_help(text):
    '''Search the help for a string

    text - find text in the module help using case-insensitive matching

    The entries of MAIN_HELP are searched first, then the help of every
    module that is not located in the plugin directory. Modules without
    help text are skipped.

    returns an html document of all the module help pages that matched,
            with previous / next links around each match, or None if no
            match was found.
    '''
    matching_help = []
    for item in MAIN_HELP:
        matching_help += __search_menu_helper(
            item, lambda x: __search_fn(x, text))
    count = sum([len(x[2]) for x in matching_help])

    for module_name in get_module_names():
        module = instantiate_module(module_name)
        location = os.path.split(
            module.create_settings.im_func.func_code.co_filename)[0]
        if location == cpprefs.get_plugin_directory():
            continue
        help_text = module.get_help()
        if help_text is None:
            # Nothing to search in a module that has no help.
            continue
        matches = __search_fn(help_text, text)
        if len(matches) > 0:
            matching_help.append((module_name, help_text, matches))
            count += len(matches)
    if len(matching_help) == 0:
        return None

    header = ('<html style="font-family:arial">\n'
              '<head><title>%s found</title></head>\n'
              '<body><h1>Matches found</h1><br><ul>\n')
    # NOTE(review): the title counts matching pages, not individual
    # matches (count) - confirm that this is intended.
    top = [header % ("1 match" if len(matching_help) == 1
                     else "%d matches" % len(matching_help))]
    body = ["<br>"]
    match_num = 1
    prev_link = (
        '<a href="#match%d" title="Previous match">'
        '<img src="memory:previous.png" alt="previous match"></a>')
    anchor = '<a name="match%d"><u>%s</u></a>'
    next_link = ('<a href="#match%d" title="Next match">'
                 '<img src="memory:next.png" alt="next match"></a>')
    for title, help_text, pairs in matching_help:
        # The table of contents links to the first match of each page.
        top.append("""<li><a href="#match%d">%s</a></li>\n""" % (
            match_num, title))
        if help_text.find("<h1>") == -1:
            body.append("<h1>%s</h1>" % title)
        # Only the text between <body ...> and </body> is copied.
        start_match = re.search(r"<\s*body[^>]*?>", help_text, re.IGNORECASE)
        if start_match is None:
            start = 0
        else:
            start = start_match.end()
        end_match = re.search(r"<\s*/\s*body", help_text, re.IGNORECASE)
        if end_match is None:
            end = len(help_text)
        else:
            end = end_match.start()

        for begin_pos, end_pos in pairs:
            body.append(help_text[start:begin_pos])
            if match_num > 1:
                body.append(prev_link % (match_num - 1))
            body.append(anchor % (match_num, help_text[begin_pos:end_pos]))
            if match_num != count:
                body.append(next_link % (match_num + 1))
            start = end_pos
            match_num += 1
        body.append(help_text[start:end] + "<br>")
    result = "%s</ul><br>\n%s</body></html>" % ("".join(top), "".join(body))
    return result
def fill_modules():
    '''Import the builtin and plugin modules and rebuild the registries

    Empties pymodules, badmodules, datatools, all_modules and
    svn_revisions, then imports every entry of builtin_modules (as
    "cellprofiler.modules.<name>") followed by every plugin returned by
    plugin_list(). Modules that cannot be imported or checked are
    recorded in badmodules and logged instead of raising.
    '''
    del pymodules[:]
    del badmodules[:]
    del datatools[:]
    all_modules.clear()
    svn_revisions.clear()
    # NOTE(review): pure_datatools is filled below but not emptied here -
    # confirm whether stale entries can survive a second call.

    def add_module(mod, check_svn):
        '''Import one module by name and register its module class

        mod       - importable module name
        check_svn - if true, record the revision number found in the
                    module's __version__ string
        '''
        try:
            m = __import__(mod, globals(), locals(), ['__all__'], 0)
            cp_module = find_cpmodule(m)
            name = cp_module.module_name
        except Exception as e:
            logger.warning("Could not load %s", mod, exc_info=True)
            badmodules.append((mod, e))
            return

        try:
            pymodules.append(m)
            if name in all_modules:
                logger.warning(
                    "Multiple definitions of module %s\n\told in %s\n\tnew in %s",
                    name,
                    sys.modules[all_modules[name].__module__].__file__,
                    m.__file__)
            all_modules[name] = cp_module
            check_module(cp_module, name)
            # attempt to instantiate
            if not hasattr(cp_module, 'do_not_check'):
                cp_module()
            if hasattr(cp_module, "run_as_data_tool"):
                datatools.append(name)
            if check_svn and hasattr(m, '__version__'):
                match = re.match(r'^\$Revision: ([0-9]+) \$$', m.__version__)
                if match is not None:
                    svn_revisions[name] = match.groups()[0]
            if not hasattr(all_modules[name], "settings"):
                # No settings = pure data tool
                pure_datatools[name] = all_modules[name]
                del all_modules[name]
        except Exception as e:
            logger.warning("Failed to load %s", name, exc_info=True)
            badmodules.append((mod, e))
            # Undo the registrations made above for this module.
            if name in all_modules:
                del all_modules[name]
            del pymodules[-1]

    for mod in builtin_modules:
        add_module('cellprofiler.modules.' + mod, True)

    plugin_directory = get_plugin_directory()
    if plugin_directory is not None:
        # Take a copy: binding sys.path itself would alias the list, so
        # the insert below would also change the "saved" path and the
        # restore in the finally clause would do nothing.
        old_path = list(sys.path)
        sys.path.insert(0, plugin_directory)
        try:
            for mod in plugin_list():
                add_module(mod, False)
        finally:
            sys.path[:] = old_path

    datatools.sort()
    if len(badmodules) > 0:
        logger.warning("could not load these modules: %s",
                       ",".join([x[0] for x in badmodules]))
svn_revisions[name] = match.groups()[0] if not hasattr(all_modules[name], "settings"): # No settings = pure data tool pure_datatools[name] = all_modules[name] del all_modules[name] except Exception, e: logger.warning("Failed to load %s", name, exc_info=True) badmodules.append((mod, e)) if name in all_modules: del all_modules[name] del pymodules[-1] for mod in builtin_modules: add_module('cellprofiler.modules.' + mod, True) plugin_directory = get_plugin_directory() if plugin_directory is not None: old_path = sys.path sys.path.insert(0, plugin_directory) try: for mod in plugin_list(): add_module(mod, False) finally: sys.path = old_path datatools.sort() if len(badmodules) > 0: logger.warning("could not load these modules: %s", ",".join([x[0] for x in badmodules]))
def start_workers(cls, num=None):
    '''Start the analysis workers

    num - number of worker subprocesses to start. Defaults to the CPU
          count, or 4 if the CPU count cannot be determined.

    Does nothing if cls.workers is already populated. If the environment
    variable CP_DEBUG_WORKER is set, no subprocess is started: the value
    "NOT_INPROC" starts nothing at all, any other value runs a single
    AnalysisWorker in a daemon thread of this process.

    Each subprocess is appended to cls.workers and its stdin to
    cls.deadman_switches. The output of each subprocess is forwarded to
    the logger by a daemon thread.
    '''
    if cls.workers:
        return

    try:
        num = multiprocessing.cpu_count() if num is None else num
    except NotImplementedError:
        num = 4

    cls.work_announce_address = get_announcer_address()
    logger.info("Starting workers on address %s" % cls.work_announce_address)
    if 'CP_DEBUG_WORKER' in os.environ:
        if os.environ['CP_DEBUG_WORKER'] == 'NOT_INPROC':
            return
        from cellprofiler.analysis_worker import \
            AnalysisWorker, NOTIFY_ADDR, NOTIFY_STOP, CancelledException

        class WorkerRunner(threading.Thread):
            '''Runs an AnalysisWorker in a thread of this process'''

            def __init__(self, work_announce_address):
                threading.Thread.__init__(self)
                self.work_announce_address = work_announce_address
                self.notify_socket = zmq.Context.instance().socket(zmq.PUB)
                self.notify_socket.bind(NOTIFY_ADDR)

            def run(self):
                with AnalysisWorker(self.work_announce_address) as aw:
                    try:
                        aw.run()
                    except CancelledException:
                        logger.info("Exiting debug worker thread")

            def wait(self):
                '''Send the stop notification and join the thread'''
                self.notify_socket.send(NOTIFY_STOP)
                self.join()

        thread = WorkerRunner(cls.work_announce_address)
        thread.setDaemon(True)
        thread.start()
        cls.workers.append(thread)
        return

    def run_logger(workR, widx):
        '''Forward the output of worker widx to the logger until EOF'''
        while True:
            try:
                line = workR.stdout.readline()
                if not line:
                    break
                logger.info("Worker %d: %s", widx, line.rstrip())
            except Exception:
                break

    close_fds = False
    # start workers
    for idx in range(num):
        if sys.platform == 'darwin':
            close_all_on_exec()

        worker_args = ["--work-announce", cls.work_announce_address,
                       "--plugins-directory",
                       cpprefs.get_plugin_directory(),
                       "--ij-plugins-directory",
                       cpprefs.get_ij_plugin_directory()]

        if hasattr(sys, 'frozen'):
            if sys.platform == 'darwin':
                # sys.argv[0] points at
                # CellProfiler.app/Contents/Resources/CellProfiler.py
                # We want
                # CellProfiler.app/Contents/MacOS/CellProfiler
                #
                contents_resources_dir = os.path.split(sys.argv[0])[0]
                contents_dir = os.path.split(contents_resources_dir)[0]
                cp_executable = os.path.join(
                    contents_dir, "MacOS", "CellProfiler")
                assert os.path.isfile(cp_executable), \
                    "Did not find CellProfiler in its expected place: %s" % cp_executable
                # os.X_OK is the access() mode for "executable";
                # os.EX_OK is a process exit status.
                assert os.access(cp_executable, os.X_OK), \
                    "%s is not executable" % cp_executable
                args = ["arch", "-x86_64", "-i386",
                        cp_executable] + worker_args
            else:
                aw_path = os.path.join(
                    os.path.split(os.path.abspath(sys.argv[0]))[0],
                    "analysis_worker")
                args = [aw_path] + worker_args
        else:
            args = [find_python(),
                    '-u',  # unbuffered
                    find_analysis_worker_source()] + worker_args

        # stdin for the subprocesses serves as a deadman's switch. When
        # closed, the subprocess exits.
        worker = subprocess.Popen(args,
                                  env=find_worker_env(),
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  close_fds=close_fds)

        start_daemon_thread(target=run_logger,
                            args=(worker, idx,),
                            name='worker stdout logger')

        cls.workers += [worker]
        cls.deadman_switches += [worker.stdin]  # closing stdin will kill subprocess
def start_workers(cls, num=None):
    '''Start the analysis workers

    num - number of worker subprocesses to start. Defaults to the CPU
          count, or 4 if the CPU count cannot be determined.

    Does nothing if cls.workers is already populated. If the environment
    variable CP_DEBUG_WORKER is set, no subprocess is started: the value
    "NOT_INPROC" starts nothing at all, any other value runs a single
    AnalysisWorker in a daemon thread of this process.

    Each subprocess is appended to cls.workers and its stdin to
    cls.deadman_switches. The output of each subprocess is forwarded to
    the logger by a daemon thread.
    '''
    if cls.workers:
        return

    try:
        num = multiprocessing.cpu_count() if num is None else num
    except NotImplementedError:
        num = 4

    cls.work_announce_address = get_announcer_address()
    logger.info("Starting workers on address %s" % cls.work_announce_address)
    if 'CP_DEBUG_WORKER' in os.environ:
        if os.environ['CP_DEBUG_WORKER'] == 'NOT_INPROC':
            return
        from cellprofiler.analysis_worker import \
            AnalysisWorker, NOTIFY_ADDR, NOTIFY_STOP, CancelledException

        class WorkerRunner(threading.Thread):
            '''Runs an AnalysisWorker in a thread of this process'''

            def __init__(self, work_announce_address):
                threading.Thread.__init__(self)
                self.work_announce_address = work_announce_address
                self.notify_socket = zmq.Context.instance().socket(zmq.PUB)
                self.notify_socket.bind(NOTIFY_ADDR)

            def run(self):
                with AnalysisWorker(self.work_announce_address) as aw:
                    try:
                        aw.run()
                    except CancelledException:
                        logger.info("Exiting debug worker thread")

            def wait(self):
                '''Send the stop notification and join the thread'''
                self.notify_socket.send(NOTIFY_STOP)
                self.join()

        thread = WorkerRunner(cls.work_announce_address)
        thread.setDaemon(True)
        thread.start()
        cls.workers.append(thread)
        return

    def run_logger(workR, widx):
        '''Forward the output of worker widx to the logger until EOF'''
        while True:
            try:
                line = workR.stdout.readline()
                if not line:
                    break
                logger.info("Worker %d: %s", widx, line.rstrip())
            except Exception:
                break

    close_fds = False
    # start workers
    for idx in range(num):
        if sys.platform == 'darwin':
            close_all_on_exec()

        worker_args = ["--work-announce", cls.work_announce_address,
                       "--plugins-directory",
                       cpprefs.get_plugin_directory(),
                       "--ij-plugins-directory",
                       cpprefs.get_ij_plugin_directory()]

        if hasattr(sys, 'frozen'):
            if sys.platform == 'darwin':
                # sys.argv[0] points at
                # CellProfiler.app/Contents/Resources/CellProfiler.py
                # We want
                # CellProfiler.app/Contents/MacOS/CellProfiler
                #
                contents_resources_dir = os.path.split(sys.argv[0])[0]
                contents_dir = os.path.split(contents_resources_dir)[0]
                cp_executable = os.path.join(
                    contents_dir, "MacOS", "CellProfiler")
                assert os.path.isfile(cp_executable), \
                    "Did not find CellProfiler in its expected place: %s" % cp_executable
                # os.X_OK is the access() mode for "executable";
                # os.EX_OK is a process exit status.
                assert os.access(cp_executable, os.X_OK), \
                    "%s is not executable" % cp_executable
                args = ["arch", "-x86_64", cp_executable] + worker_args
            else:
                aw_path = os.path.join(
                    os.path.split(os.path.abspath(sys.argv[0]))[0],
                    "analysis_worker")
                args = [aw_path] + worker_args
        else:
            args = [find_python(),
                    '-u',  # unbuffered
                    find_analysis_worker_source()] + worker_args

        # stdin for the subprocesses serves as a deadman's switch. When
        # closed, the subprocess exits.
        worker = subprocess.Popen(args,
                                  env=find_worker_env(),
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  close_fds=close_fds)

        start_daemon_thread(target=run_logger,
                            args=(worker, idx,),
                            name='worker stdout logger')

        cls.workers += [worker]
        cls.deadman_switches += [worker.stdin]  # closing stdin will kill subprocess
def test_01_01_output_module_html(self):
    '''Check the tag nesting of every page written by output_module_html

    Writes the module help pages into self.temp_dir, then for each
    non-plugin module checks that table / list / bold / italic tags
    nest according to a small state machine and that the number of
    opening and closing tags agree for the tags of interest.
    '''
    from cellprofiler.modules import get_module_names, instantiate_module

    #
    # Everything below up to the loop is the same for every module.
    #
    tags_we_care_about = ("i", "b", "ul", "ol", "li", "table", "tr", "td",
                          "th", "h1", "h2", "h3", "html", "head", "body")
    pattern = re.compile(r"<\s*([a-zA-Z0-9]+).[^>]*>")
    anti_pattern = re.compile(r"</\s*([a-zA-Z0-9]+)[^>]*>")
    COUNT = 0
    LIST = 1
    T_TABLE = 0
    T_ANTI_TABLE = 1
    T_TR = 2
    T_ANTI_TR = 3
    T_TH = 4
    T_ANTI_TH = 5
    T_TD = 6
    T_ANTI_TD = 7
    T_UL = 8
    T_ANTI_UL = 9
    T_OL = 10
    T_ANTI_OL = 11
    T_LI = 12
    T_ANTI_LI = 13
    T_I = 14
    T_ANTI_I = 15
    T_B = 16
    T_ANTI_B = 17
    nesting_tags = (('table', T_TABLE, T_ANTI_TABLE),
                    ('tr', T_TR, T_ANTI_TR),
                    ('td', T_TD, T_ANTI_TD),
                    ('th', T_TH, T_ANTI_TH),
                    ('ul', T_UL, T_ANTI_UL),
                    ('ol', T_OL, T_ANTI_OL),
                    ('li', T_LI, T_ANTI_LI),
                    ('i', T_I, T_ANTI_I),
                    ('b', T_B, T_ANTI_B))
    S_INIT = 0
    S_AFTER_TABLE = 1
    S_AFTER_TR = 2
    S_AFTER_TD = 3
    S_AFTER_TH = 4
    S_AFTER_OL = 5
    S_AFTER_UL = 6
    S_AFTER_LI = 7
    S_AFTER_I = 8
    S_AFTER_B = 9
    # state -> {token: next state}. A next state of S_INIT means "pop":
    # the token closes the innermost open element.
    state_transitions = {
        S_INIT: {
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL,
            T_I: S_AFTER_I,
            T_B: S_AFTER_B},
        S_AFTER_TABLE: {
            T_ANTI_TABLE: S_INIT,
            T_TR: S_AFTER_TR},
        S_AFTER_TR: {
            T_ANTI_TR: S_INIT,
            T_TD: S_AFTER_TD,
            T_TH: S_AFTER_TH},
        S_AFTER_TD: {
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL,
            T_B: S_AFTER_B,
            T_I: S_AFTER_I,
            T_ANTI_TD: S_INIT},
        S_AFTER_TH: {
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL,
            T_B: S_AFTER_B,
            T_I: S_AFTER_I,
            T_ANTI_TH: S_INIT},
        S_AFTER_OL: {
            T_LI: S_AFTER_LI,
            T_ANTI_OL: S_INIT},
        S_AFTER_UL: {
            T_LI: S_AFTER_LI,
            T_ANTI_UL: S_INIT},
        S_AFTER_LI: {
            T_ANTI_LI: S_INIT,
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL,
            T_B: S_AFTER_B,
            T_I: S_AFTER_I},
        S_AFTER_I: {
            T_ANTI_I: S_INIT,
            T_I: S_AFTER_I,  # Stupid but legal <i><i>Foo</i></i>
            T_B: S_AFTER_B,
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL},
        S_AFTER_B: {
            T_ANTI_B: S_INIT,
            T_B: S_AFTER_B,
            T_I: S_AFTER_I,
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL}}

    M.output_module_html(self.temp_dir)
    for module_name in sorted(get_module_names()):
        fd = None
        try:
            fd = open(os.path.join(self.temp_dir, module_name + ".html"))
        except (IOError, OSError):
            module = instantiate_module(module_name)
            location = os.path.split(
                module.create_settings.im_func.func_code.co_filename)[0]
            if location == cpprefs.get_plugin_directory():
                # No page is written for plugin modules.
                continue
            traceback.print_exc()
            # fail() raises; asserting a non-empty message string would
            # always pass.
            self.fail("Failed to open %s.html" % module_name)
        try:
            data = fd.read()
        finally:
            fd.close()

        #
        # Make sure that some nesting rules are obeyed.
        #
        d = {}
        anti_d = {}
        for tag in tags_we_care_about:
            for dd in (d, anti_d):
                dd[tag] = [0, []]

        for p, dd in ((pattern, d), (anti_pattern, anti_d)):
            pos = 0
            while True:
                m = p.search(data, pos)
                if m is None:
                    break
                tag = m.groups()[0].lower()
                # Resume just past the first character of the tag name.
                pos = m.start(1) + 1
                if tag in dd:
                    dd[tag][COUNT] += 1
                    dd[tag][LIST].append(pos)
        #
        # Check table nesting rules
        #
        tokens = []
        for tag, token, anti_token in nesting_tags:
            tokens += [(pos, token) for pos in d[tag][LIST]]
            tokens += [(pos, anti_token) for pos in anti_d[tag][LIST]]
        tokens = sorted(tokens)

        # Stack of (state, position of the tag that entered it)
        state = []
        for pos, token in tokens:
            top_state, start_pos = \
                (S_INIT, 0) if len(state) == 0 else state[-1]

            self.assertTrue(
                token in state_transitions[top_state],
                "Nesting error in %s near position %d (%s)" %
                (module_name, pos,
                 data[max(0, pos - 50):pos] + "^^^" +
                 data[pos:min(pos + 50, len(data))]))
            next_state = state_transitions[top_state][token]
            if next_state == S_INIT:
                state.pop()
            else:
                state.append((next_state, pos))
        if len(state) > 0:
            last_pos = state[-1][1]
            self.assertEqual(
                len(state), 0,
                "Couldn't find last closing tag in %s. Last tag position = %d (%s)" %
                (module_name, last_pos,
                 data[max(0, last_pos - 30):(last_pos + 30)]))
        #
        # Check begin/end tag counts
        #
        for tag in tags_we_care_about:
            if tag in d:
                self.assertTrue(
                    tag in anti_d,
                    "Missing closing </%s> tag in %s" % (tag, module_name))
                self.assertEqual(
                    d[tag][COUNT], anti_d[tag][COUNT],
                    "Found %d <%s>, != %d </%s> in %s" %
                    (d[tag][COUNT], tag,
                     anti_d[tag][COUNT], tag, module_name))
            else:
                self.assertFalse(
                    tag in anti_d,
                    "Missing opening <%s> tag in %s" % (tag, module_name))
svn_revisions[name] = match.groups()[0] if not hasattr(all_modules[name], "settings"): # No settings = pure data tool pure_datatools[name] = all_modules[name] del all_modules[name] except Exception, e: logger.warning("Failed to load %s", name, exc_info=True) badmodules.append((mod, e)) if name in all_modules: del all_modules[name] del pymodules[-1] for mod in builtin_modules: add_module("cellprofiler.modules." + mod, True) plugin_directory = get_plugin_directory() if plugin_directory is not None: old_path = sys.path sys.path.insert(0, plugin_directory) try: for mod in plugin_list(): add_module(mod, False) finally: sys.path = old_path datatools.sort() if len(badmodules) > 0: logger.warning("could not load these modules: %s", ",".join([x[0] for x in badmodules])) def add_module_for_tst(module_class):
def start_workers(cls, num=None):
    '''Launch the analysis workers

    num - how many worker subprocesses to launch. When omitted, the CPU
          count is used, falling back to 4 if it is not available.

    Returns immediately if cls.workers already has entries. With the
    CP_DEBUG_WORKER environment variable set, either nothing is started
    (value "NOT_INPROC") or one AnalysisWorker is run in a daemon thread
    of this process. Otherwise one subprocess per worker is started; the
    subprocess is added to cls.workers, its stdin to cls.deadman_switches
    and its output is relayed to the logger from a daemon thread.
    '''
    if cls.workers:
        return

    if num is None:
        try:
            num = multiprocessing.cpu_count()
        except NotImplementedError:
            num = 4

    cls.work_announce_address = get_announcer_address()
    logger.info("Starting workers on address %s" % cls.work_announce_address)

    debug_setting = os.environ.get('CP_DEBUG_WORKER')
    if debug_setting is not None:
        if debug_setting == 'NOT_INPROC':
            return
        from cellprofiler.worker import \
            AnalysisWorker, NOTIFY_ADDR, NOTIFY_STOP
        from cellprofiler.pipeline import CancelledException

        class WorkerRunner(threading.Thread):
            '''Thread that hosts a single in-process AnalysisWorker'''

            def __init__(self, work_announce_address):
                threading.Thread.__init__(self)
                self.work_announce_address = work_announce_address
                publisher = zmq.Context.instance().socket(zmq.PUB)
                self.notify_socket = publisher
                publisher.bind(NOTIFY_ADDR)

            def run(self):
                with AnalysisWorker(self.work_announce_address) as aw:
                    try:
                        aw.run()
                    except CancelledException:
                        logger.info("Exiting debug worker thread")

            def wait(self):
                '''Publish the stop notification, then join the thread'''
                self.notify_socket.send(NOTIFY_STOP)
                self.join()

        debug_thread = WorkerRunner(cls.work_announce_address)
        debug_thread.setDaemon(True)
        debug_thread.start()
        cls.workers.append(debug_thread)
        return

    close_fds = False
    on_mac = sys.platform == 'darwin'
    # start workers
    for idx in range(num):
        if on_mac:
            close_all_on_exec()

        aw_args = ["--work-announce", cls.work_announce_address,
                   "--plugins-directory", cpprefs.get_plugin_directory(),
                   "--ij-plugins-directory",
                   cpprefs.get_ij_plugin_directory()]
        heap_size = "%dm" % cpprefs.get_jvm_heap_mb()
        aw_args.append("--jvm-heap-size=%s" % heap_size)

        # Work out the command line; the launch itself is the same for
        # every flavor.
        if not hasattr(sys, 'frozen'):
            command = [find_python(),
                       '-u',  # unbuffered
                       find_analysis_worker_source()] + aw_args
        elif on_mac:
            executable = os.path.join(
                os.path.dirname(sys.executable), "CellProfiler")
            command = [executable] + aw_args
        elif sys.platform.startswith('linux'):
            aw_path = os.path.join(os.path.dirname(__file__), "worker.py")
            command = [sys.executable, aw_path] + aw_args
        else:
            command = [sys.executable] + aw_args

        # stdin for the subprocesses serves as a deadman's switch. When
        # closed, the subprocess exits.
        worker = subprocess.Popen(command,
                                  env=find_worker_env(idx),
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  close_fds=close_fds)

        def run_logger(workR, widx):
            '''Relay lines from the worker's stdout until it is exhausted'''
            while True:
                try:
                    line = workR.stdout.readline()
                    if not line:
                        break
                    logger.info("Worker %d: %s", widx, line.rstrip())
                except:
                    break

        start_daemon_thread(target=run_logger,
                            args=(worker, idx,),
                            name='worker stdout logger')

        cls.workers += [worker]
        cls.deadman_switches += [worker.stdin]  # closing stdin will kill subprocess
def test_01_01_output_module_html(self):
    '''Check the tag nesting of every page written by output_module_html

    Writes the module help pages into self.temp_dir, then for each
    non-plugin module checks that table / list / bold / italic tags
    nest according to a small state machine and that the number of
    opening and closing tags agree for the tags of interest.
    '''
    from cellprofiler.modules import get_module_names, instantiate_module

    #
    # Everything below up to the loop is the same for every module.
    #
    tags_we_care_about = ("i", "b", "ul", "ol", "li", "table", "tr", "td",
                          "th", "h1", "h2", "h3", "html", "head", "body")
    pattern = re.compile(r"<\s*([a-zA-Z0-9]+).[^>]*>")
    anti_pattern = re.compile(r"</\s*([a-zA-Z0-9]+)[^>]*>")
    COUNT = 0
    LIST = 1
    T_TABLE = 0
    T_ANTI_TABLE = 1
    T_TR = 2
    T_ANTI_TR = 3
    T_TH = 4
    T_ANTI_TH = 5
    T_TD = 6
    T_ANTI_TD = 7
    T_UL = 8
    T_ANTI_UL = 9
    T_OL = 10
    T_ANTI_OL = 11
    T_LI = 12
    T_ANTI_LI = 13
    T_I = 14
    T_ANTI_I = 15
    T_B = 16
    T_ANTI_B = 17
    nesting_tags = (('table', T_TABLE, T_ANTI_TABLE),
                    ('tr', T_TR, T_ANTI_TR),
                    ('td', T_TD, T_ANTI_TD),
                    ('th', T_TH, T_ANTI_TH),
                    ('ul', T_UL, T_ANTI_UL),
                    ('ol', T_OL, T_ANTI_OL),
                    ('li', T_LI, T_ANTI_LI),
                    ('i', T_I, T_ANTI_I),
                    ('b', T_B, T_ANTI_B))
    S_INIT = 0
    S_AFTER_TABLE = 1
    S_AFTER_TR = 2
    S_AFTER_TD = 3
    S_AFTER_TH = 4
    S_AFTER_OL = 5
    S_AFTER_UL = 6
    S_AFTER_LI = 7
    S_AFTER_I = 8
    S_AFTER_B = 9
    # state -> {token: next state}. A next state of S_INIT means "pop":
    # the token closes the innermost open element.
    state_transitions = {
        S_INIT: {
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL,
            T_I: S_AFTER_I,
            T_B: S_AFTER_B},
        S_AFTER_TABLE: {
            T_ANTI_TABLE: S_INIT,
            T_TR: S_AFTER_TR},
        S_AFTER_TR: {
            T_ANTI_TR: S_INIT,
            T_TD: S_AFTER_TD,
            T_TH: S_AFTER_TH},
        S_AFTER_TD: {
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL,
            T_B: S_AFTER_B,
            T_I: S_AFTER_I,
            T_ANTI_TD: S_INIT},
        S_AFTER_TH: {
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL,
            T_B: S_AFTER_B,
            T_I: S_AFTER_I,
            T_ANTI_TH: S_INIT},
        S_AFTER_OL: {
            T_LI: S_AFTER_LI,
            T_ANTI_OL: S_INIT},
        S_AFTER_UL: {
            T_LI: S_AFTER_LI,
            T_ANTI_UL: S_INIT},
        S_AFTER_LI: {
            T_ANTI_LI: S_INIT,
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL,
            T_B: S_AFTER_B,
            T_I: S_AFTER_I},
        S_AFTER_I: {
            T_ANTI_I: S_INIT,
            T_I: S_AFTER_I,  # Stupid but legal <i><i>Foo</i></i>
            T_B: S_AFTER_B,
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL},
        S_AFTER_B: {
            T_ANTI_B: S_INIT,
            T_B: S_AFTER_B,
            T_I: S_AFTER_I,
            T_TABLE: S_AFTER_TABLE,
            T_OL: S_AFTER_OL,
            T_UL: S_AFTER_UL}}

    M.output_module_html(self.temp_dir)
    for module_name in sorted(get_module_names()):
        fd = None
        try:
            fd = open(os.path.join(self.temp_dir, module_name + ".html"))
        except (IOError, OSError):
            module = instantiate_module(module_name)
            location = os.path.split(
                module.create_settings.im_func.func_code.co_filename)[0]
            if location == cpprefs.get_plugin_directory():
                # No page is written for plugin modules.
                continue
            traceback.print_exc()
            # fail() raises; asserting a non-empty message string would
            # always pass.
            self.fail("Failed to open %s.html" % module_name)
        try:
            data = fd.read()
        finally:
            fd.close()

        #
        # Make sure that some nesting rules are obeyed.
        #
        d = {}
        anti_d = {}
        for tag in tags_we_care_about:
            for dd in (d, anti_d):
                dd[tag] = [0, []]

        for p, dd in ((pattern, d), (anti_pattern, anti_d)):
            pos = 0
            while True:
                m = p.search(data, pos)
                if m is None:
                    break
                tag = m.groups()[0].lower()
                # Resume just past the first character of the tag name.
                pos = m.start(1) + 1
                if tag in dd:
                    dd[tag][COUNT] += 1
                    dd[tag][LIST].append(pos)
        #
        # Check table nesting rules
        #
        tokens = []
        for tag, token, anti_token in nesting_tags:
            tokens += [(pos, token) for pos in d[tag][LIST]]
            tokens += [(pos, anti_token) for pos in anti_d[tag][LIST]]
        tokens = sorted(tokens)

        # Stack of (state, position of the tag that entered it)
        state = []
        for pos, token in tokens:
            top_state, start_pos = \
                (S_INIT, 0) if len(state) == 0 else state[-1]

            self.assertTrue(
                token in state_transitions[top_state],
                "Nesting error in %s near position %d (%s)" %
                (module_name, pos,
                 data[max(0, pos - 50):pos] + "^^^" +
                 data[pos:min(pos + 50, len(data))]))
            next_state = state_transitions[top_state][token]
            if next_state == S_INIT:
                state.pop()
            else:
                state.append((next_state, pos))
        if len(state) > 0:
            last_pos = state[-1][1]
            self.assertEqual(
                len(state), 0,
                "Couldn't find last closing tag in %s. Last tag position = %d (%s)" %
                (module_name, last_pos,
                 data[max(0, last_pos - 30):(last_pos + 30)]))
        #
        # Check begin/end tag counts
        #
        for tag in tags_we_care_about:
            if tag in d:
                self.assertTrue(
                    tag in anti_d,
                    "Missing closing </%s> tag in %s" % (tag, module_name))
                self.assertEqual(
                    d[tag][COUNT], anti_d[tag][COUNT],
                    "Found %d <%s>, != %d </%s> in %s" %
                    (d[tag][COUNT], tag,
                     anti_d[tag][COUNT], tag, module_name))
            else:
                self.assertFalse(
                    tag in anti_d,
                    "Missing opening <%s> tag in %s" % (tag, module_name))