def second_pass_render(request, content):
    """
    Split on the secret delimiter and generate the token list by passing
    through text outside of phased blocks as single text tokens and tokenizing
    text inside the phased blocks. This ensures that nothing outside of the
    phased blocks is tokenized, thus eliminating the possibility of a template
    code injection vulnerability.
    """
    result = []
    for index, bit in enumerate(content.split(settings.PHASED_SECRET_DELIMITER)):
        if index % 2:
            # Odd chunks sit between delimiters, i.e. inside a phased block:
            # these are the only pieces that get tokenized.
            tokens = Lexer(bit, None).tokenize()
        else:
            # Even chunks are outside phased blocks; pass them through as a
            # single literal text token so they can never be parsed as tags.
            tokens = [Token(TOKEN_TEXT, bit)]
        context = RequestContext(
            request, restore_csrf_token(request, unpickle_context(bit)))
        rendered = Parser(tokens).parse().render(context)
        if settings.PHASED_SECRET_DELIMITER in rendered:
            # Phased blocks may nest; recurse until no delimiters remain.
            rendered = second_pass_render(request, rendered)
        result.append(rendered)
    return "".join(result)
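# Usage sketch (hedged): second_pass_render() is designed to run over an
# already-rendered response body, e.g. from a response middleware.
# "PhasedRenderMiddleware" is a hypothetical name used here for illustration.
class PhasedRenderMiddleware(object):
    def process_response(self, request, response):
        # Only re-render when a phased-block delimiter survived the first
        # rendering pass; all other responses pass through untouched.
        if settings.PHASED_SECRET_DELIMITER in response.content:
            response.content = second_pass_render(request, response.content)
        return response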
def parse_source(self):
    source_lines = set()
    lexer = Lexer(self.text, "<string>")
    tokens = lexer.tokenize()
    comment = False
    for token in tokens:
        assert isinstance(token, Token)
        if token.token_type == TOKEN_BLOCK:
            if token.contents == 'comment':
                comment = True
                continue
            elif token.contents == 'endcomment':
                comment = False
                continue
        if comment:
            continue
        if token.token_type == TOKEN_BLOCK or token.token_type == TOKEN_VAR:
            if token.token_type == TOKEN_BLOCK and token.contents.startswith('end'):
                continue
            source_lines.add(token.lineno)
    return tuple(sorted(source_lines)), ()
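# Usage sketch (hedged): parse_source() expects to be a method of an object
# holding the template source in ``self.text``. "TemplateInfo" below is a
# hypothetical host class, and token.lineno is only populated on Django
# versions whose lexer records line numbers -- both are assumptions here.
class TemplateInfo(object):
    def __init__(self, text):
        self.text = text

    parse_source = parse_source  # reuse the function above as a method

lines, _ = TemplateInfo("{% if x %}\n{{ x }}\n{% endif %}").parse_source()
# ``lines`` holds the line numbers of executable tokens ({% %} and {{ }}),
# with comment blocks and end-tags filtered out.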
def second_pass_render(request, content):
    """
    Split on the secret delimiter and generate the token list by passing
    through text outside of phased blocks as single text tokens and tokenizing
    text inside the phased blocks. This ensures that nothing outside of the
    phased blocks is tokenized, thus eliminating the possibility of a template
    code injection vulnerability.
    """
    result = []
    for index, bit in enumerate(content.split(settings.SECRET_DELIMITER)):
        if index % 2:
            tokens = Lexer(bit, None).tokenize()
        else:
            tokens = [Token(TOKEN_TEXT, bit)]
        # restore the previous context including the CSRF token
        context = RequestContext(
            request, restore_csrf_token(request, unpickle_context(bit)))
        # restore the loaded components (tags and filters)
        parser = Parser(tokens)
        unpickled_components = unpickle_components(bit) or []
        for component in unpickled_components:
            lib = import_library(component)
            parser.add_library(lib)
        # render the piece with the restored context
        rendered = parser.parse().render(context)
        if settings.SECRET_DELIMITER in rendered:
            rendered = second_pass_render(request, rendered)
        result.append(rendered)
    return "".join(result)
def validate_template(self, template_string):
    # We want to tokenize like normal, then use a custom parser.
    lexer = Lexer(template_string, None)
    tokens = lexer.tokenize()
    parser = TemplateValidationParser(tokens, self.allow, self.disallow, self.secure)
    for node in parser.parse():
        template = getattr(node, LOADED_TEMPLATE_ATTR, None)
def tokenize():
    """
    Returns a stream of Django Token() entities
    """
    for template in get_templates():
        with open(template) as fp:
            template_content = fp.read()
        lexer = Lexer(template_content, None)
        for token in lexer.tokenize():
            yield token
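# Usage sketch: consuming the stream above, e.g. to count variable tokens
# across all templates. TOKEN_VAR is assumed importable from
# django.template.base (true on Django < 1.9, matching the two-argument
# Lexer call above); get_templates() is defined elsewhere in this module.
from django.template.base import TOKEN_VAR

var_tokens = sum(1 for t in tokenize() if t.token_type == TOKEN_VAR)
print("variable tokens found: %d" % var_tokens)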
def load_blocks(self):
    """Loads the asset blocks defined in the template

    handles:
    * extends - to track template hierarchy
    * css,javascript - start of asset
    * endcss, endjavascript - end of asset
    * {{ .. }} - expansion of variables to settings variables according to VAR_EXPANSIONS
    """
    try:
        template_string, _filepath = filesystem.load_template_source(self.templatepath)
    except TemplateDoesNotExist:
        template_string, _filepath = app_directories.load_template_source(self.templatepath)
    self.content_hash = hash(template_string)
    try:
        result = TemplateAssetBucket()
        lexer = Lexer(template_string, self.templatepath)
        within = None
        texts = []
        for m in lexer.tokenize():
            if m.token_type == TOKEN_BLOCK:
                split = m.split_contents()
                typ = split[0]
                if typ == "extends":
                    if split[1].endswith('"') or split[1].endswith("'"):
                        self.extends = split[1].strip('"').strip("'")
                    else:
                        pass  # TODO figure out support for variable expansion
                elif typ in TemplateAssetBlock.BLOCKTAGS:
                    within = typ
                    prop = _parse_asset_parameters(m.split_contents())
                elif typ.startswith('end'):
                    if typ[3:] == within:
                        within = None
                        result.append(TemplateAssetBlock(''.join(texts), template=self, **prop))
                    elif typ[3:] in TemplateAssetBlock.BLOCKTAGS:
                        assert False, "encountered dangling %s tag in '%s'" % (typ, self.templatepath)
            elif within:
                if m.token_type == TOKEN_TEXT:
                    texts.append(m.contents)
                elif m.token_type == TOKEN_VAR:
                    v = VAR_EXPANSIONS.get(m.contents, '')
                    if v:
                        texts.append(v)
                    #? else:
                    #    assert False, "Variable replacement in client side magic not yet supported"
        return result
    except UnicodeDecodeError:
        return "/* could not load %s as a template */\n" % self.templatepath
def _render_html(self, template_string, context={}):  # :(
    if DJANGO_VERSION > (1, 2):
        from django.template import import_library
        tag_lib = import_library('beproud.django.commons.tests.test_tags')
    else:
        from django.template import get_library
        tag_lib = get_library('beproud.django.commons.tests.test_tags')
    lexer = Lexer(template_string, self._make_origin())
    parser = Parser(lexer.tokenize())
    parser.add_library(tag_lib)
    nodelist = parser.parse()
    return nodelist.render(Context(context))
def _load_all_templates(directory):
    """
    Loads all templates in a directory (recursively) and yields tuples of
    template tokens and template paths.
    """
    if os.path.exists(directory):
        for name in os.listdir(directory):
            path = os.path.join(directory, name)
            if os.path.isdir(path):
                for template in _load_all_templates(path):
                    yield template
            elif path.endswith('.html'):
                with open(path, 'rb') as fobj:
                    source = fobj.read().decode(settings.FILE_CHARSET)
                    lexer = Lexer(source, path)
                    yield lexer.tokenize(), path
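# Usage sketch: walking a (hypothetical) "templates/" directory and flagging
# files that contain at least one block tag. TOKEN_BLOCK is assumed
# importable from django.template.base, as on Django < 1.9.
from django.template.base import TOKEN_BLOCK

for tokens, path in _load_all_templates('templates'):
    if any(t.token_type == TOKEN_BLOCK for t in tokens):
        print('%s contains block tags' % path)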
def render_custom_content(body, context_data={}):
    """Renders custom content for the payload using Django templating.

    This will take the custom payload content template provided by the user
    and render it using a stripped down version of Django's templating
    system.

    In order to keep the payload safe, we use a limited Context along with a
    custom Parser that blocks certain template tags. This gives us tags like
    {% for %} and {% if %}, but blacklists tags like {% load %} and
    {% include %}.
    """
    lexer = Lexer(body, origin=None)
    parser = CustomPayloadParser(lexer.tokenize())
    nodes = parser.parse()
    return nodes.render(Context(context_data))
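# Usage sketch: allowed constructs render normally, while tags blacklisted by
# CustomPayloadParser (defined elsewhere in this module) fail at parse time.
rendered = render_custom_content(
    '{% if user %}Hello, {{ user }}!{% endif %}',
    {'user': 'alice'})
# rendered == 'Hello, alice!'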
def _render_html(self, template_string, context={}):  # :(
    if DJANGO_VERSION > (1, 9):
        from django.template.library import import_library
        tag_lib = import_library('testapp.tags')
    else:  # DJANGO_VERSION > (1,7):
        from django.template.base import import_library
        tag_lib = import_library('testapp.tags')
    if DJANGO_VERSION > (1, 9):
        lexer = Lexer(template_string)
    else:
        lexer = Lexer(template_string, self._make_origin())
    parser = Parser(lexer.tokenize())
    parser.add_library(tag_lib)
    nodelist = parser.parse()
    return nodelist.render(Context(context))
def _fix_html_type(request, html, filetype):
    for group, files in requested_assets[request][filetype].items():
        # parse the content for the individual file tokens
        indices = []

        def sub_func(matchobj):
            indices.append(int(matchobj.group(2)))
            return ""

        regex = token_regexes[filetype][group]
        html = regex.sub(sub_func, html)
        # replace the 'replace me' tag with actual list of
        # 'tags' (ie <link href="foo.css">)
        file_html = u""
        uncompressible_html = u""
        for index in indices:
            fileObj = files[index]
            if fileObj.isCompressible():
                file_html += fileObj.render()
            else:
                uncompressible_html += fileObj.render()
        # try to use the provided 'compress' app to compress the output
        if hasattr(settings, 'COMPRESS') and settings.COMPRESS:
            # Currently this only supports the django-css app we use
            from django.template import Lexer, Parser, Token, TOKEN_TEXT
            file_html += "{% endcompress %}"
            lexer = Lexer(file_html, None)
            from compressor.templatetags.compress import compress
            file_html = compress(
                Parser(lexer.tokenize()),
                Token(TOKEN_TEXT, "compress " + filetype)
            ).render({})
        file_html = uncompressible_html + file_html
        tag = ASSET_DEFS[filetype]['destination_tag'].get(group, None)
        if tag:
            html = smart_unicode(html)
            html = html.replace(tag, file_html + tag)
    return html
def _get_completion_ppp(self, text):
    """
    Return tuple containing the prefix, pivot, and partial of the current
    line of input.

    >>> completer._get_completion_ppp('{{')
    ('{', '{', '')
    >>> completer._get_completion_ppp('{{ var }}{% get_')
    ('{{ var }}{', '%', ' get_')

    How it works:
    1. Tokenize text, add first n-1 tokens to "prefix".
    2. Split on final "|%{:". Call it "pivot".
    3. Any text after pivot is called the "partial".
    4. Text prior to the pivot but after the first n-1 tokens is appended
       to the prefix.
    """
    if len(text) == 0:
        return ('', '', '')
    prefix = ''
    partial = ''
    pivot = ''
    tokens = Lexer(text, None).tokenize()
    if tokens[-1].token_type != TOKEN_TEXT:
        return (text, '', '')
    prefix_tokens = tokens[:-1]
    working_area = tokens[-1].contents
    prefix = text[:-len(working_area)]
    # Iterate backwards through string, finding the first
    # occurrence of any of the chars "|%{:". Call it the pivot.
    for index, char in list(enumerate(working_area))[::-1]:
        if char == ' ':
            if ' ' in working_area[:index]:
                pivot = char
                break
        if char in '|%{:':
            pivot = char
            break
    # No pivot was found
    if len(pivot) == 0:
        return (text, '', '')
    pieces = working_area.split(pivot)
    prefix += pivot.join(pieces[:-1])
    partial = pieces[-1]
    return (prefix, pivot, partial)
def runsource(self, source, filename="<input>", symbol="single"):
    """
    readline calls this method with the current source buffer. This method
    can return True to instruct readline to capture another line of input
    using the "..." prompt or return False to tell readline to clear the
    source buffer and capture a new phrase.

    How it works:
    1. Tokenize input.
    2. Load parser with tokens.
    3. Attempt to parse, loading a list with nodes.
    4. If unclosed tag exception is raised, get more user input.
    5. If everything went smoothly, print output, otherwise print exception.
    """
    if source == 'exit':
        raise ExitREPL()
    if not source:
        return False
    tokens = Lexer(source, None).tokenize()
    self.parser.tokens = tokens
    nodes = []
    try:
        try:
            for node in self.parser.parse():
                nodes.append(node)
        except TemplateSyntaxError as e:
            if e.args[0].startswith('Unclosed tags'):
                # inside block, so ask for more input
                return True
            else:
                raise
        for node in nodes:
            self.output.write('%s' % (node.render(self.context),))
        self.output.write('\n')
        return False
    except:
        self.showtraceback()
        return False
def extract_django(fileobj, keywords, comment_tags, options):
    """Extract messages from Django template files.

    :param fileobj: the file-like object the messages should be extracted
                    from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :return: an iterator over ``(lineno, funcname, message, comments)``
             tuples
    :rtype: ``iterator``
    """
    intrans = False
    inplural = False
    message_context = None
    singular = []
    plural = []
    lineno = 1
    encoding = options.get('encoding', 'utf8')
    text = fileobj.read().decode(encoding)
    for t in Lexer(text, None).tokenize():
        lineno += t.contents.count('\n')
        if intrans:
            if t.token_type == TOKEN_BLOCK:
                endbmatch = endblock_re.match(t.contents)
                pluralmatch = plural_re.match(t.contents)
                if endbmatch:
                    if inplural:
                        if message_context:
                            yield (
                                lineno,
                                'npgettext',
                                [smart_text(message_context),
                                 smart_text(u''.join(singular)),
                                 smart_text(u''.join(plural))],
                                [],
                            )
                        else:
                            yield (
                                lineno,
                                'ngettext',
                                (smart_text(u''.join(singular)),
                                 smart_text(u''.join(plural))),
                                [])
                    else:
                        if message_context:
                            yield (
                                lineno,
                                'pgettext',
                                [smart_text(message_context),
                                 smart_text(u''.join(singular))],
                                [],
                            )
                        else:
                            yield (lineno, None,
                                   smart_text(u''.join(singular)), [])
                    intrans = False
                    inplural = False
                    message_context = None
                    singular = []
                    plural = []
                elif pluralmatch:
                    inplural = True
                else:
                    raise SyntaxError('Translation blocks must not include '
                                      'other block tags: %s' % t.contents)
            elif t.token_type == TOKEN_VAR:
                if inplural:
                    plural.append('%%(%s)s' % t.contents)
                else:
                    singular.append('%%(%s)s' % t.contents)
            elif t.token_type == TOKEN_TEXT:
                if inplural:
                    plural.append(t.contents)
                else:
                    singular.append(t.contents)
        else:
            if t.token_type == TOKEN_BLOCK:
                imatch = inline_re.match(t.contents)
                bmatch = block_re.match(t.contents)
                cmatches = constant_re.findall(t.contents)
                if imatch:
                    g = imatch.group(1)
                    if g[0] == '"':
                        g = g.strip('"')
                    elif g[0] == "'":
                        g = g.strip("'")
                    message_context = imatch.group(3)
                    if message_context:
                        # strip quotes
                        message_context = message_context[1:-1]
                        yield (
                            lineno,
                            'pgettext',
                            [smart_text(message_context), smart_text(g)],
                            [],
                        )
                        message_context = None
                    else:
                        yield lineno, None, smart_text(g), []
                elif bmatch:
                    if bmatch.group(2):
                        message_context = bmatch.group(2)[1:-1]
                    for fmatch in constant_re.findall(t.contents):
                        yield lineno, None, smart_text(fmatch), []
                    intrans = True
                    inplural = False
                    singular = []
                    plural = []
                elif cmatches:
                    for cmatch in cmatches:
                        yield lineno, None, smart_text(cmatch), []
            elif t.token_type == TOKEN_VAR:
                parts = t.contents.split('|')
                cmatch = constant_re.match(parts[0])
                if cmatch:
                    yield lineno, None, smart_text(cmatch.group(1)), []
                for p in parts[1:]:
                    if p.find(':_(') >= 0:
                        p1 = p.split(':', 1)[1]
                        if p1[0] == '_':
                            p1 = p1[1:]
                        if p1[0] == '(':
                            p1 = p1.strip('()')
                        if p1[0] == "'":
                            p1 = p1.strip("'")
                        elif p1[0] == '"':
                            p1 = p1.strip('"')
                        yield lineno, None, smart_text(p1), []
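# Usage sketch: extract_django() follows Babel's extraction-method interface,
# so it can be exercised directly with an in-memory byte stream (the regexes
# endblock_re, block_re, inline_re, constant_re are module globals here).
from io import BytesIO

source = BytesIO(
    b"{% trans 'Hello' %}\n"
    b"{% blocktrans %}Hi {{ name }}{% endblocktrans %}\n")
for lineno, funcname, message, comments in extract_django(source, [], [], {}):
    print(lineno, funcname, message)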
def templatize(src, origin=None):
    """
    Turns a Django template into something that is understood by xgettext. It
    does so by translating the Django translation tags into standard gettext
    function invocations.
    """
    from django.template import (Lexer, TOKEN_TEXT, TOKEN_VAR, TOKEN_BLOCK,
            TOKEN_COMMENT, TRANSLATOR_COMMENT_MARK)
    out = StringIO()
    message_context = None
    intrans = False
    inplural = False
    singular = []
    plural = []
    incomment = False
    comment = []
    for t in Lexer(src, origin).tokenize():
        if incomment:
            if t.token_type == TOKEN_BLOCK and t.contents == 'endcomment':
                content = ''.join(comment)
                translators_comment_start = None
                for lineno, line in enumerate(content.splitlines(True)):
                    if line.lstrip().startswith(TRANSLATOR_COMMENT_MARK):
                        translators_comment_start = lineno
                for lineno, line in enumerate(content.splitlines(True)):
                    if translators_comment_start is not None and lineno >= translators_comment_start:
                        out.write(' # %s' % line)
                    else:
                        out.write(' #\n')
                incomment = False
                comment = []
            else:
                comment.append(t.contents)
        elif intrans:
            if t.token_type == TOKEN_BLOCK:
                endbmatch = endblock_re.match(t.contents)
                pluralmatch = plural_re.match(t.contents)
                if endbmatch:
                    if inplural:
                        if message_context:
                            out.write(' npgettext(%r, %r, %r,count) ' % (message_context, ''.join(singular), ''.join(plural)))
                        else:
                            out.write(' ngettext(%r, %r, count) ' % (''.join(singular), ''.join(plural)))
                        for part in singular:
                            out.write(blankout(part, 'S'))
                        for part in plural:
                            out.write(blankout(part, 'P'))
                    else:
                        if message_context:
                            out.write(' pgettext(%r, %r) ' % (message_context, ''.join(singular)))
                        else:
                            out.write(' gettext(%r) ' % ''.join(singular))
                        for part in singular:
                            out.write(blankout(part, 'S'))
                    message_context = None
                    intrans = False
                    inplural = False
                    singular = []
                    plural = []
                elif pluralmatch:
                    inplural = True
                else:
                    filemsg = ''
                    if origin:
                        filemsg = 'file %s, ' % origin
                    raise SyntaxError(
                        "Translation blocks must not include other block tags: %s (%sline %d)"
                        % (t.contents, filemsg, t.lineno))
            elif t.token_type == TOKEN_VAR:
                if inplural:
                    plural.append('%%(%s)s' % t.contents)
                else:
                    singular.append('%%(%s)s' % t.contents)
            elif t.token_type == TOKEN_TEXT:
                contents = one_percent_re.sub('%%', t.contents)
                if inplural:
                    plural.append(contents)
                else:
                    singular.append(contents)
        else:
            if t.token_type == TOKEN_BLOCK:
                imatch = inline_re.match(t.contents)
                bmatch = block_re.match(t.contents)
                cmatches = constant_re.findall(t.contents)
                if imatch:
                    g = imatch.group(1)
                    if g[0] == '"':
                        g = g.strip('"')
                    elif g[0] == "'":
                        g = g.strip("'")
                    g = one_percent_re.sub('%%', g)
                    if imatch.group(2):
                        # A context is provided
                        context_match = context_re.match(imatch.group(2))
                        message_context = context_match.group(1)
                        if message_context[0] == '"':
                            message_context = message_context.strip('"')
                        elif message_context[0] == "'":
                            message_context = message_context.strip("'")
                        out.write(' pgettext(%r, %r) ' % (message_context, g))
                        message_context = None
                    else:
                        out.write(' gettext(%r) ' % g)
                elif bmatch:
                    for fmatch in constant_re.findall(t.contents):
                        out.write(' _(%s) ' % fmatch)
                    if bmatch.group(1):
                        # A context is provided
                        context_match = context_re.match(bmatch.group(1))
                        message_context = context_match.group(1)
                        if message_context[0] == '"':
                            message_context = message_context.strip('"')
                        elif message_context[0] == "'":
                            message_context = message_context.strip("'")
                    intrans = True
                    inplural = False
                    singular = []
                    plural = []
                elif cmatches:
                    for cmatch in cmatches:
                        out.write(' _(%s) ' % cmatch)
                elif t.contents == 'comment':
                    incomment = True
                else:
                    out.write(blankout(t.contents, 'B'))
            elif t.token_type == TOKEN_VAR:
                parts = t.contents.split('|')
                cmatch = constant_re.match(parts[0])
                if cmatch:
                    out.write(' _(%s) ' % cmatch.group(1))
                for p in parts[1:]:
                    if p.find(':_(') >= 0:
                        out.write(' %s ' % p.split(':', 1)[1])
                    else:
                        out.write(blankout(p, 'F'))
            elif t.token_type == TOKEN_COMMENT:
                out.write(' # %s' % t.contents)
            else:
                out.write(blankout(t.contents, 'X'))
    return out.getvalue()
def templatize(src):
    """
    Turns a Django template into something that is understood by xgettext. It
    does so by translating the Django translation tags into standard gettext
    function invocations.
    """
    from django.template import Lexer, TOKEN_TEXT, TOKEN_VAR, TOKEN_BLOCK
    out = StringIO()
    intrans = False
    inplural = False
    singular = []
    plural = []
    for t in Lexer(src, None).tokenize():
        if intrans:
            if t.token_type == TOKEN_BLOCK:
                endbmatch = endblock_re.match(t.contents)
                pluralmatch = plural_re.match(t.contents)
                if endbmatch:
                    if inplural:
                        out.write(' ngettext(%r,%r,count) ' % (''.join(singular), ''.join(plural)))
                        for part in singular:
                            out.write(blankout(part, 'S'))
                        for part in plural:
                            out.write(blankout(part, 'P'))
                    else:
                        out.write(' gettext(%r) ' % ''.join(singular))
                        for part in singular:
                            out.write(blankout(part, 'S'))
                    intrans = False
                    inplural = False
                    singular = []
                    plural = []
                elif pluralmatch:
                    inplural = True
                else:
                    raise SyntaxError("Translation blocks must not include other block tags: %s" % t.contents)
            elif t.token_type == TOKEN_VAR:
                if inplural:
                    plural.append('%%(%s)s' % t.contents)
                else:
                    singular.append('%%(%s)s' % t.contents)
            elif t.token_type == TOKEN_TEXT:
                if inplural:
                    plural.append(t.contents)
                else:
                    singular.append(t.contents)
        else:
            if t.token_type == TOKEN_BLOCK:
                imatch = inline_re.match(t.contents)
                bmatch = block_re.match(t.contents)
                cmatches = constant_re.findall(t.contents)
                if imatch:
                    g = imatch.group(1)
                    if g[0] == '"':
                        g = g.strip('"')
                    elif g[0] == "'":
                        g = g.strip("'")
                    out.write(' gettext(%r) ' % g)
                elif bmatch:
                    for fmatch in constant_re.findall(t.contents):
                        out.write(' _(%s) ' % fmatch)
                    intrans = True
                    inplural = False
                    singular = []
                    plural = []
                elif cmatches:
                    for cmatch in cmatches:
                        out.write(' _(%s) ' % cmatch)
                else:
                    out.write(blankout(t.contents, 'B'))
            elif t.token_type == TOKEN_VAR:
                parts = t.contents.split('|')
                cmatch = constant_re.match(parts[0])
                if cmatch:
                    out.write(' _(%s) ' % cmatch.group(1))
                for p in parts[1:]:
                    if p.find(':_(') >= 0:
                        out.write(' %s ' % p.split(':', 1)[1])
                    else:
                        out.write(blankout(p, 'F'))
            else:
                out.write(blankout(t.contents, 'X'))
    return out.getvalue()
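# Usage sketch: translation tags become pseudo-Python calls that xgettext can
# pick up, while surrounding markup is blanked out by blankout().
print(templatize("<h1>{% trans 'Hello' %}</h1>"))
# Output is roughly: "XXXX gettext(u'Hello') XXXXX" (the exact padding
# depends on blankout()).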
def my_templatize(src, origin=None):
    """
    Turns a Django template into something that is understood by xgettext. It
    does so by translating the Django translation tags into standard gettext
    function invocations.
    """
    from django.template import (Lexer, TOKEN_TEXT, TOKEN_VAR, TOKEN_BLOCK,
            TOKEN_COMMENT, TRANSLATOR_COMMENT_MARK)
    # Jinja2 spaceless
    src = strip_whitespaces(src)
    out = StringIO()
    intrans = False
    inplural = False
    singular = []
    plural = []
    incomment = False
    comment = []
    for t in Lexer(src, origin).tokenize():
        if incomment:
            if t.token_type == TOKEN_BLOCK and t.contents == 'endcomment':
                content = ''.join(comment)
                translators_comment_start = None
                for lineno, line in enumerate(content.splitlines(True)):
                    if line.lstrip().startswith(TRANSLATOR_COMMENT_MARK):
                        translators_comment_start = lineno
                for lineno, line in enumerate(content.splitlines(True)):
                    if translators_comment_start is not None and lineno >= translators_comment_start:
                        out.write(' # %s' % line)
                    else:
                        out.write(' #\n')
                incomment = False
                comment = []
            else:
                comment.append(t.contents)
        elif intrans:
            if t.token_type == TOKEN_BLOCK:
                endbmatch = trans_real.endblock_re.match(t.contents)
                pluralmatch = trans_real.plural_re.match(t.contents)
                if endbmatch:
                    if inplural:
                        out.write(' ngettext(%r,%r,count) ' % (''.join(singular), ''.join(plural)))
                        for part in singular:
                            out.write(trans_real.blankout(part, 'S'))
                        for part in plural:
                            out.write(trans_real.blankout(part, 'P'))
                    else:
                        out.write(' gettext(%r) ' % ''.join(singular))
                        for part in singular:
                            out.write(trans_real.blankout(part, 'S'))
                    intrans = False
                    inplural = False
                    singular = []
                    plural = []
                elif pluralmatch:
                    inplural = True
                else:
                    filemsg = ''
                    if origin:
                        filemsg = 'file %s, ' % origin
                    raise SyntaxError(
                        "Translation blocks must not include other block tags: %s (%sline %d)"
                        % (t.contents, filemsg, t.lineno))
            elif t.token_type == TOKEN_VAR:
                if inplural:
                    plural.append('%%(%s)s' % t.contents)
                else:
                    singular.append('%%(%s)s' % t.contents)
            elif t.token_type == TOKEN_TEXT:
                contents = t.contents.replace('%', '%%')
                if inplural:
                    plural.append(contents)
                else:
                    singular.append(contents)
        else:
            if t.token_type == TOKEN_BLOCK:
                imatch = trans_real.inline_re.match(t.contents)
                bmatch = trans_real.block_re.match(t.contents)
                cmatches = trans_real.constant_re.findall(t.contents)
                if imatch:
                    g = imatch.group(1)
                    if g[0] == '"':
                        g = g.strip('"')
                    elif g[0] == "'":
                        g = g.strip("'")
                    out.write(' gettext(%r) ' % g)
                elif bmatch:
                    for fmatch in trans_real.constant_re.findall(t.contents):
                        out.write(' _(%s) ' % fmatch)
                    intrans = True
                    inplural = False
                    singular = []
                    plural = []
                elif cmatches:
                    for cmatch in cmatches:
                        out.write(' _(%s) ' % cmatch)
                elif t.contents == 'comment':
                    incomment = True
                else:
                    out.write(trans_real.blankout(t.contents, 'B'))
            elif t.token_type == TOKEN_VAR:
                cmatches = trans_real.constant_re.findall(t.contents)
                if cmatches:
                    for cmatch in cmatches:
                        out.write(' _(%s) ' % cmatch)
                # findall is necessary for macros having translation constants
                # as parameters; original django code:
                #
                # parts = t.contents.split('|')
                # cmatch = constant_re.match(parts[0])
                # if cmatch:
                #     out.write(' _(%s) ' % cmatch.group(1))
                # for p in parts[1:]:
                #     if p.find(':_(') >= 0:
                #         out.write(' %s ' % p.split(':',1)[1])
                #     else:
                #         out.write(trans_real.blankout(p, 'F'))
            elif t.token_type == TOKEN_COMMENT:
                out.write(' # %s' % t.contents)
            else:
                out.write(trans_real.blankout(t.contents, 'X'))
    return out.getvalue()
def compile_string(template_string):
    lexer = Lexer(template_string, None)
    parser = ProcessingParser(lexer.tokenize())
    return parser.parse()
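# Usage sketch: the returned NodeList renders like any compiled template
# (ProcessingParser is this module's custom Parser subclass).
from django.template import Context

nodelist = compile_string('Hello {{ name }}')
print(nodelist.render(Context({'name': 'world'})))  # -> Hello world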
def extract_django(fileobj, keywords, comment_tags, options):
    """Extract messages from Django template files.

    :param fileobj: the file-like object the messages should be extracted
                    from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :return: an iterator over ``(lineno, funcname, message, comments)``
             tuples
    :rtype: ``iterator``
    """
    intrans = False
    inplural = False
    message_context = None
    singular = []
    plural = []
    lineno = 1
    encoding = options.get('encoding', 'utf8')
    text = fileobj.read().decode(encoding)
    try:
        text_lexer = Lexer(text)
    except TypeError:
        # Django 1.9 changed the way we invoke Lexer; older versions
        # require two parameters.
        text_lexer = Lexer(text, None)
    for t in text_lexer.tokenize():
        lineno += t.contents.count('\n')
        if intrans:
            if t.token_type == TOKEN_BLOCK:
                endbmatch = endblock_re.match(t.contents)
                pluralmatch = plural_re.match(t.contents)
                if endbmatch:
                    if inplural:
                        if message_context:
                            yield (
                                lineno,
                                'npgettext',
                                [smart_text(message_context),
                                 smart_text(u''.join(singular)),
                                 smart_text(u''.join(plural))],
                                [],
                            )
                        else:
                            yield (
                                lineno,
                                'ngettext',
                                (smart_text(u''.join(singular)),
                                 smart_text(u''.join(plural))),
                                [])
                    else:
                        if message_context:
                            yield (
                                lineno,
                                'pgettext',
                                [smart_text(message_context),
                                 smart_text(u''.join(singular))],
                                [],
                            )
                        else:
                            yield (lineno, None,
                                   smart_text(u''.join(singular)), [])
                    intrans = False
                    inplural = False
                    message_context = None
                    singular = []
                    plural = []
                elif pluralmatch:
                    inplural = True
                else:
                    raise SyntaxError('Translation blocks must not include '
                                      'other block tags: %s' % t.contents)
            elif t.token_type == TOKEN_VAR:
                if inplural:
                    plural.append('%%(%s)s' % t.contents)
                else:
                    singular.append('%%(%s)s' % t.contents)
            elif t.token_type == TOKEN_TEXT:
                if inplural:
                    plural.append(t.contents)
                else:
                    singular.append(t.contents)
        else:
            if t.token_type == TOKEN_BLOCK:
                imatch = inline_re.match(t.contents)
                bmatch = block_re.match(t.contents)
                cmatches = constant_re.findall(t.contents)
                if imatch:
                    g = imatch.group(1)
                    if g[0] == '"':
                        g = g.strip('"')
                    elif g[0] == "'":
                        g = g.strip("'")
                    message_context = imatch.group(3)
                    if message_context:
                        # strip quotes
                        message_context = message_context[1:-1]
                        yield (
                            lineno,
                            'pgettext',
                            [smart_text(message_context), smart_text(g)],
                            [],
                        )
                        message_context = None
                    else:
                        yield lineno, None, smart_text(g), []
                elif bmatch:
                    if bmatch.group(2):
                        message_context = bmatch.group(2)[1:-1]
                    for fmatch in constant_re.findall(t.contents):
                        yield lineno, None, smart_text(fmatch), []
                    intrans = True
                    inplural = False
                    singular = []
                    plural = []
                elif cmatches:
                    for cmatch in cmatches:
                        yield lineno, None, smart_text(cmatch), []
            elif t.token_type == TOKEN_VAR:
                parts = t.contents.split('|')
                cmatch = constant_re.match(parts[0])
                if cmatch:
                    yield lineno, None, smart_text(cmatch.group(1)), []
                for p in parts[1:]:
                    if p.find(':_(') >= 0:
                        p1 = p.split(':', 1)[1]
                        if p1[0] == '_':
                            p1 = p1[1:]
                        if p1[0] == '(':
                            p1 = p1.strip('()')
                        if p1[0] == "'":
                            p1 = p1.strip("'")
                        elif p1[0] == '"':
                            p1 = p1.strip('"')
                        yield lineno, None, smart_text(p1), []
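# The try/except above can be factored into a small compatibility helper; a
# hedged sketch ("build_lexer" is a name invented here, not a Django or
# Babel API):
def build_lexer(text, origin=None):
    try:
        return Lexer(text)          # Django >= 1.9: single-argument Lexer
    except TypeError:
        return Lexer(text, origin)  # older Django: Lexer(text, origin)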
def templatize(src, origin=None):
    """
    Turns a Django template into something that is understood by xgettext. It
    does so by translating the Django translation tags into standard gettext
    function invocations.
    """
    from django.conf import settings
    from django.template import (Lexer, TOKEN_TEXT, TOKEN_VAR, TOKEN_BLOCK,
            TOKEN_COMMENT, TRANSLATOR_COMMENT_MARK)
    src = force_text(src, settings.FILE_CHARSET)
    out = StringIO()
    message_context = None
    intrans = False
    inplural = False
    singular = []
    plural = []
    incomment = False
    comment = []
    lineno_comment_map = {}
    comment_lineno_cache = None
    for t in Lexer(src, origin).tokenize():
        if incomment:
            if t.token_type == TOKEN_BLOCK and t.contents == 'endcomment':
                content = ''.join(comment)
                translators_comment_start = None
                for lineno, line in enumerate(content.splitlines(True)):
                    if line.lstrip().startswith(TRANSLATOR_COMMENT_MARK):
                        translators_comment_start = lineno
                for lineno, line in enumerate(content.splitlines(True)):
                    if translators_comment_start is not None and lineno >= translators_comment_start:
                        out.write(' # %s' % line)
                    else:
                        out.write(' #\n')
                incomment = False
                comment = []
            else:
                comment.append(t.contents)
        elif intrans:
            if t.token_type == TOKEN_BLOCK:
                endbmatch = endblock_re.match(t.contents)
                pluralmatch = plural_re.match(t.contents)
                if endbmatch:
                    if inplural:
                        if message_context:
                            out.write(' npgettext(%r, %r, %r,count) ' % (message_context, ''.join(singular), ''.join(plural)))
                        else:
                            out.write(' ngettext(%r, %r, count) ' % (''.join(singular), ''.join(plural)))
                        for part in singular:
                            out.write(blankout(part, 'S'))
                        for part in plural:
                            out.write(blankout(part, 'P'))
                    else:
                        if message_context:
                            out.write(' pgettext(%r, %r) ' % (message_context, ''.join(singular)))
                        else:
                            out.write(' gettext(%r) ' % ''.join(singular))
                        for part in singular:
                            out.write(blankout(part, 'S'))
                    message_context = None
                    intrans = False
                    inplural = False
                    singular = []
                    plural = []
                elif pluralmatch:
                    inplural = True
                else:
                    filemsg = ''
                    if origin:
                        filemsg = 'file %s, ' % origin
                    raise SyntaxError(
                        "Translation blocks must not include other block tags: %s (%sline %d)"
                        % (t.contents, filemsg, t.lineno))
            elif t.token_type == TOKEN_VAR:
                if inplural:
                    plural.append('%%(%s)s' % t.contents)
                else:
                    singular.append('%%(%s)s' % t.contents)
            elif t.token_type == TOKEN_TEXT:
                contents = one_percent_re.sub('%%', t.contents)
                if inplural:
                    plural.append(contents)
                else:
                    singular.append(contents)
        else:
            # Handle comment tokens (`{# ... #}`) plus other constructs on
            # the same line:
            if comment_lineno_cache is not None:
                cur_lineno = t.lineno + t.contents.count('\n')
                if comment_lineno_cache == cur_lineno:
                    if t.token_type != TOKEN_COMMENT:
                        for c in lineno_comment_map[comment_lineno_cache]:
                            filemsg = ''
                            if origin:
                                filemsg = 'file %s, ' % origin
                            warn_msg = ("The translator-targeted comment '%s' "
                                "(%sline %d) was ignored, because it wasn't the last item "
                                "on the line.") % (c, filemsg, comment_lineno_cache)
                            warnings.warn(warn_msg, TranslatorCommentWarning)
                        lineno_comment_map[comment_lineno_cache] = []
                else:
                    out.write('# %s' % ' | '.join(lineno_comment_map[comment_lineno_cache]))
                comment_lineno_cache = None
            if t.token_type == TOKEN_BLOCK:
                imatch = inline_re.match(t.contents)
                bmatch = block_re.match(t.contents)
                cmatches = constant_re.findall(t.contents)
                if imatch:
                    g = imatch.group(1)
                    if g[0] == '"':
                        g = g.strip('"')
                    elif g[0] == "'":
                        g = g.strip("'")
                    g = one_percent_re.sub('%%', g)
                    if imatch.group(2):
                        # A context is provided
                        context_match = context_re.match(imatch.group(2))
                        message_context = context_match.group(1)
                        if message_context[0] == '"':
                            message_context = message_context.strip('"')
                        elif message_context[0] == "'":
                            message_context = message_context.strip("'")
                        out.write(' pgettext(%r, %r) ' % (message_context, g))
                        message_context = None
                    else:
                        out.write(' gettext(%r) ' % g)
                elif bmatch:
                    for fmatch in constant_re.findall(t.contents):
                        out.write(' _(%s) ' % fmatch)
                    if bmatch.group(1):
                        # A context is provided
                        context_match = context_re.match(bmatch.group(1))
                        message_context = context_match.group(1)
                        if message_context[0] == '"':
                            message_context = message_context.strip('"')
                        elif message_context[0] == "'":
                            message_context = message_context.strip("'")
                    intrans = True
                    inplural = False
                    singular = []
                    plural = []
                elif cmatches:
                    for cmatch in cmatches:
                        out.write(' _(%s) ' % cmatch)
                elif t.contents == 'comment':
                    incomment = True
                else:
                    out.write(blankout(t.contents, 'B'))
            elif t.token_type == TOKEN_VAR:
                parts = t.contents.split('|')
                cmatch = constant_re.match(parts[0])
                if cmatch:
                    out.write(' _(%s) ' % cmatch.group(1))
                for p in parts[1:]:
                    if p.find(':_(') >= 0:
                        out.write(' %s ' % p.split(':', 1)[1])
                    else:
                        out.write(blankout(p, 'F'))
            elif t.token_type == TOKEN_COMMENT:
                if t.contents.lstrip().startswith(TRANSLATOR_COMMENT_MARK):
                    lineno_comment_map.setdefault(t.lineno, []).append(t.contents)
                    comment_lineno_cache = t.lineno
            else:
                out.write(blankout(t.contents, 'X'))
    return force_str(out.getvalue())
def extract(fileobj, keywords, comment_tags, options):
    """Extracts translation messages from underscore template files.

    This method also extracts Django templates. If a template does not
    contain any Django translation tags, we always fall back to underscore
    extraction.

    This is a plugin to Babel, written according to
    http://babel.pocoo.org/docs/messages/#writing-extraction-methods

    :param fileobj: the file-like object the messages should be extracted
                    from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :return: an iterator over ``(lineno, funcname, message, comments)``
             tuples
    :rtype: ``iterator``
    """
    encoding = options.get('encoding', 'utf-8')
    original_position = fileobj.tell()
    text = fileobj.read().decode(encoding)
    # TODO: There must be another way. Find a way to fix the ordering
    # in babel directly!
    vars = [token.token_type != TOKEN_TEXT
            for token in Lexer(text, None).tokenize()]
    could_be_django = any(vars)
    if could_be_django:
        fileobj.seek(original_position)
        iterator = extract_django(fileobj, keywords, comment_tags, options)
        for lineno, funcname, message, comments in iterator:
            yield lineno, funcname, message, comments
    else:
        # Underscore template extraction
        comments = []
        fileobj.seek(original_position)
        for lineno, line in enumerate(fileobj, 1):
            funcname = None
            stream = TokenStream.from_tuple_iter(tokenize(line, underscore.rules))
            while not stream.eof:
                if stream.current.type == 'gettext_begin':
                    stream.expect('gettext_begin')
                    funcname = stream.expect('func_name').value
                    args, kwargs = parse_arguments(stream, 'gettext_end')
                    strings = []
                    for arg in args:
                        try:
                            arg = int(arg)
                        except ValueError:
                            pass
                        if isinstance(arg, six.string_types):
                            strings.append(force_text(arg))
                        else:
                            strings.append(None)
                    for arg in kwargs:
                        strings.append(None)
                    if len(strings) == 1:
                        strings = strings[0]
                    else:
                        strings = tuple(strings)
                    yield lineno, funcname, strings, []
                stream.next()