def _grammar(self, method_name, postprocessor_name):
    """Build a parser for a postprocessor and return it, or one attribute of it.

    method_name -- name of the parser attribute to return; when falsy, the
        parser object itself is returned
    postprocessor_name -- 'html' builds the parser with html.make_parser,
        'text' with text.make_parser; any other value falls back to
        raw.make_parser
    """
    if postprocessor_name == 'html':
        # Positional arguments, in order: allowed tags, allowed
        # self-closing tags, allowed parameters, interwiki, namespaces.
        grammar = html.make_parser(
            ['p', 'span', 'b', 'i'],
            ['br', 'hr'],
            ['class', 'style', 'name', 'id', 'scope'],
            {
                'en': 'http://en.wikipedia.org/wiki/',
                'fr': 'http://fr.wikipedia.org/wiki/',
            },
            {
                'Template': 10,
                u'Catégorie': 14,
                'Category': 14,
                'File': 6,
                'Image': 6,
            },
        )
    elif postprocessor_name == 'text':
        grammar = text.make_parser()
    else:
        grammar = raw.make_parser()

    if not method_name:
        return grammar
    return getattr(grammar, method_name)
def testit(content):
    """Run *content* through the preprocessor and then the HTML parser.

    Nothing is returned. The value of ``.leaves()`` on the HTML parse result
    is stored in the module-level name ``foo``.
    """
    global foo

    # The preprocessor is built with an empty templates mapping.
    expander = preprocessor.make_parser({})
    # HTML parser arguments, in order: allowed tags (only "PRE"), then empty
    # self-closing tags, attributes, interwiki and namespaces.
    html_parser = html.make_parser(["PRE"], [], [], {}, {})

    # Debugging hooks that used to sit here as commented-out code:
    # pdb.set_trace(), Pattern.TRACE = True, and swapping in raw.make_parser().
    expanded = expander.parseTest(content)
    foo = html_parser.parseTest(expanded).leaves()
def _grammar(self, method_name, postprocessor_name):
    """Return a complete grammar, or a single attribute of it.

    method_name -- if truthy, the name of the attribute to look up on the
        grammar and return in its place
    postprocessor_name -- selects the parser factory: 'html' uses
        html.make_parser, 'text' uses text.make_parser, and everything else
        uses raw.make_parser
    """
    if postprocessor_name == 'html':
        # Arguments for html.make_parser, kept in its positional order.
        html_args = (
            ['p', 'span', 'b', 'i'],                    # allowed tags
            ['br', 'hr'],                               # allowed self-closing tags
            ['class', 'style', 'name', 'id', 'scope'],  # allowed parameters
            {'en': 'http://en.wikipedia.org/wiki/',
             'fr': 'http://fr.wikipedia.org/wiki/'},    # interwiki
            {'Template': 10,
             u'Catégorie': 14,
             'Category': 14,
             'File': 6,
             'Image': 6},                               # namespaces
        )
        built = html.make_parser(*html_args)
    elif postprocessor_name == 'text':
        built = text.make_parser()
    else:
        built = raw.make_parser()

    if method_name:
        return getattr(built, method_name)
    return built
# Module-level configuration passed positionally to html.make_parser below.
allowed_tags = ['p', 'span', 'b', 'i']
allowed_autoclose_tags = ['br', 'hr']
allowed_parameters = ['class', 'style', 'name', 'id', 'scope']
# Maps a short prefix to a base URL.
interwiki = {
    'en': 'http://en.wikipedia.org/wiki/',
    'fr': 'http://fr.wikipedia.org/wiki/'
}
# Maps a namespace name to an integer; several names share the same value.
namespaces = {
    'Template': 10,
    u'Catégorie': 14,
    'Category': 14,
    'File': 6,
    'Image': 6
}
parser = html.make_parser(allowed_tags, allowed_autoclose_tags, allowed_parameters, interwiki, namespaces)
# Preprocessor parser built with an empty dict as its only argument.
preprocessor_parser = preprocessor.make_parser({})

# Element parsed once at import time from a fixed HTML fragment.
siteSubElem = lxml.html.fromstring(
    '<div class="siteSub">From Fakipedia, the fake Wikipedia</div><div class="contentSub"/>'
)


def preprocess(source):
    # NOTE(review): only the start of this function is visible in this chunk.
    # What is done with `source_split`, and what the function returns, cannot
    # be determined from here.

    # Text clean-up, applied in this order:
    #   - drop a space that directly follows or precedes a newline
    #   - drop a space that directly follows or precedes "="
    #   - replace "@ " and " @" with a single space
    #   - strip leading and trailing whitespace from the whole string
    source = source.replace("\n ", "\n") \
                   .replace(" \n", "\n") \
                   .replace("= ", "=") \
                   .replace(" =", "=") \
                   .replace("@ ", " ") \
                   .replace(" @", " ") \
                   .strip()
    # Split the cleaned text into lines.
    source_split = source.split("\n")