def add_result(self, token): value = unquote_string(token.value) result = dict(line_number=token.lineno, content=value) for key, value in self.token_params.items(): if key == "alt_token": result["alt_content"] = unquote_string(value.value) result["alt_line_number"] = value.lineno else: result[key] = unquote_string(value) self.results.append(result) self.token_to_add = None self.token_params = {}
def add_result(self, token):
    value = unquote_string(token.value)
    result = dict(line_number=token.lineno, content=value)
    for key, value in self.token_params.items():
        if key == 'alt_token':
            result['alt_content'] = unquote_string(value.value)
            result['alt_line_number'] = value.lineno
        else:
            result[key] = unquote_string(value)
    self.results.append(result)
    self.token_to_add = None
    self.token_params = {}

def add_result(self, token):
    if self.is_keepable():
        result = dict(
            line_number=token.lineno,
            function=u'gettext',
            value=[unquote_string(token.value)]
        )
        self.results.append(result)

def tokenized_results(self):
    """
    Returns a list of dict items: {line_number, function, value} for the
    valid strings found in the file.

    The basic logic is to handle: Key:Value pairs, lists of string values
    (1 to n) and lists of lists of string values (1 to n).

    Note 1: The function is always gettext. The value will be a tuple of
            the matching string.
    Note 2: The same string value could appear on multiple lines within a
            JSON file. The returned list will contain each occurrence.
    """
    encoding = 'utf-8'
    for token in tokenize(self.data.decode(encoding)):
        if token.type == u'operator':
            if token.value == ']':
                self.add_first_token_in_list()
                self.processing_list = False
        elif token.type == u'string':
            if (self.prev_token.type == u'operator'
                    and self.prev_token.value == ':'):
                self.add_result(token)
            elif (self.prev_token.type == u'operator'
                    and self.prev_token.value == '['):
                if self.processing_list:
                    self.current_key = unquote_string(
                        self.first_token_in_list.value)
                else:
                    self.processing_list = True
                self.first_token_in_list = token
            elif self.processing_list:
                self.add_first_token_in_list()
                self.add_result(token)
            else:
                self.current_key = unquote_string(token.value)
        self.prev_token = token
    return self.results

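# A hypothetical illustration of the shape of tokenized_results() output,
# assuming add_result() builds {line_number, function, value} dicts and
# is_keepable() accepts the string (see the companion add_result snippets).
# Given:
#
#   data = b'{"title": "Hello"}'
#
# the method would be expected to produce something like:
#
#   [{'line_number': 1, 'function': 'gettext', 'value': ['Hello']}]
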
def get_lines_data(self):
    """
    Returns string:line_numbers list
    Since all strings are unique it is OK to get line numbers this way.
    Since the same string can occur several times inside a single .json
    file, the values should be popped (FIFO) from the list.
    :rtype: list
    """
    encoding = "utf-8"
    for token in tokenize(self.data.decode(encoding)):
        if token.type == "operator":
            if token.value == "{":
                self.start_object()
            elif token.value == ":":
                self.with_separator(token)
            elif token.value == "}":
                self.end_object()
            elif token.value == ",":
                self.end_pair()
        elif token.type == "string":
            if self.state == "key":
                self.current_key = unquote_string(token.value)
                if self.current_key == JSON_GETTEXT_KEYWORD:
                    self.gettext_mode = True
            # The value state is not actually checked here; if only a key was
            # met (as in a list) the string will still be used. The important
            # part is that a key won't be parsed as a value, not the reverse.
            if self.gettext_mode:
                if self.current_key == JSON_GETTEXT_KEY_CONTENT:
                    self.token_to_add = token
                elif self.current_key == JSON_GETTEXT_KEY_ALT_CONTENT:
                    self.token_params["alt_token"] = token
                elif self.current_key == JSON_GETTEXT_KEY_FUNCNAME:
                    self.token_params["funcname"] = token.value
            else:
                self.token_to_add = token
    return self.results

def get_lines_data(self):
    """
    Returns string:line_numbers list
    Since all strings are unique it is OK to get line numbers this way.
    :rtype: list
    """
    trigger_call_prime = False
    for token in tokenize(self.data, jsx=False):
        call_primed = trigger_call_prime
        trigger_call_prime = False
        if token.type == 'operator':
            if token.value == '(':
                if call_primed:
                    self.start_call()
                else:
                    self.parenthesis_level += 1
            elif token.value == ')':
                if self.parenthesis_level == 0:
                    self.end_call()
                else:
                    self.parenthesis_level -= 1
        elif token.type == 'name':
            trigger_call_prime = True
            self.current_name = token.value
        elif token.type == 'string' and len(self.active_calls) > 0:
            string_value = unquote_string(token.value)
            call = self.active_calls[-1]
            if call.current_value is None:
                call.current_value = string_value
                call.value_start_line = token.lineno
            else:
                call.current_value += string_value
    return self.results

def get_lines_data(self):
    """
    Returns string:line_numbers list
    Since all strings are unique it is OK to get line numbers this way.
    Since the same string can occur several times inside a single .json
    file, the values should be popped (FIFO) from the list.
    :rtype: list
    """
    for token in tokenize(self.data):
        if token.type == 'operator':
            if token.value == '{':
                self.start_object()
            elif token.value == '[':
                self.start_array()
            elif token.value == ':':
                self.with_separator(token)
            elif token.value == '}':
                self.end_object()
            elif token.value == ']':
                self.end_array()
            elif token.value == ',':
                self.end_pair()
        elif token.type == 'string':
            if self.state == 'key':
                self.current_key = unquote_string(token.value)
                if self.current_key == JSON_GETTEXT_KEYWORD:
                    self.gettext_mode = True
            else:
                # TODO: auto-detecting items to extract through the keywords
                # passed to extract_json would be very nice
                if self.current_key.lower() in ("groupname", "displayname",
                                                "name", "message", "messages"):
                    self.token_to_add = token
    return self.results

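# A hypothetical input illustrating which keys this variant extracts. Any
# string value whose key lower-cases to one of the names hard-coded above
# ("groupname", "displayname", "name", "message", "messages") is queued via
# token_to_add; other values are ignored (assuming end_pair()/add_result()
# record token_to_add as in the companion snippets). For example:
#
#   {"displayName": "My Plugin", "message": "Saved.", "id": 7}
#
# would collect "My Plugin" and "Saved.", but not the id value.
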
def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
                    Supported options are:
                    * `jsx` -- set to false to disable JSX/E4X support.
                    * `template_string` -- set to false to disable ES6
                      template string support.
    """
    from babel.messages.jslexer import Token, tokenize, unquote_string
    funcname = message_lineno = None
    messages = []
    last_argument = None
    translator_comments = []
    concatenate_next = False
    encoding = options.get('encoding', 'utf-8')
    last_token = None
    call_stack = -1
    dotted = any('.' in kw for kw in keywords)

    for token in tokenize(
        fileobj.read().decode(encoding),
        jsx=options.get("jsx", True),
        template_string=options.get("template_string", True),
        dotted=dotted
    ):
        if (  # Turn keyword`foo` expressions into keyword("foo") calls:
            funcname and  # have a keyword...
            (last_token and last_token.type == 'name') and  # we've seen nothing after the keyword...
            token.type == 'template_string'  # this is a template string
        ):
            message_lineno = token.lineno
            messages = [unquote_string(token.value)]
            call_stack = 0
            token = Token('operator', ')', token.lineno)

        if token.type == 'operator' and token.value == '(':
            if funcname:
                message_lineno = token.lineno
                call_stack += 1

        elif call_stack == -1 and token.type == 'linecomment':
            value = token.value[2:].strip()
            if translator_comments and \
               translator_comments[-1][0] == token.lineno - 1:
                translator_comments.append((token.lineno, value))
                continue

            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comments.append((token.lineno, value.strip()))
                    break

        elif token.type == 'multilinecomment':
            # only one multi-line comment may precede a translation
            translator_comments = []
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value.splitlines()
                    if lines:
                        lines[0] = lines[0].strip()
                        lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                        for offset, line in enumerate(lines):
                            translator_comments.append(
                                (token.lineno + offset, line))
                    break

        elif funcname and call_stack == 0:
            if token.type == 'operator' and token.value == ')':
                if last_argument is not None:
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                   translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                if messages is not None:
                    yield (message_lineno, funcname, messages,
                           [comment[1] for comment in translator_comments])

                funcname = message_lineno = last_argument = None
                concatenate_next = False
                translator_comments = []
                messages = []
                call_stack = -1

            elif token.type in ('string', 'template_string'):
                new_value = unquote_string(token.value)
                if concatenate_next:
                    last_argument = (last_argument or '') + new_value
                    concatenate_next = False
                else:
                    last_argument = new_value

            elif token.type == 'operator':
                if token.value == ',':
                    if last_argument is not None:
                        messages.append(last_argument)
                        last_argument = None
                    else:
                        messages.append(None)
                    concatenate_next = False
                elif token.value == '+':
                    concatenate_next = True

        elif call_stack > 0 and token.type == 'operator' \
                and token.value == ')':
            call_stack -= 1

        elif funcname and call_stack == -1:
            funcname = None

        elif call_stack == -1 and token.type == 'name' and \
                token.value in keywords and \
                (last_token is None or last_token.type != 'name' or
                 last_token.value != 'function'):
            funcname = token.value

        last_token = token

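# A minimal usage sketch for the variant above (hypothetical input; assumes
# Babel is installed so babel.messages.jslexer provides the lexer, and that
# `dedent` from textwrap is available at module level as in Babel itself).
import io

source = b"// NOTE: greeting shown on the landing page\ngettext('Hello, world!');"
for lineno, funcname, message, comments in extract_javascript(
        io.BytesIO(source), ['gettext'], ['NOTE:'], {}):
    print(lineno, funcname, message, comments)
# Expected, roughly:
#   2 gettext Hello, world! ['NOTE: greeting shown on the landing page']
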
def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
    :rtype: ``iterator``
    """
    from babel.messages.jslexer import tokenize, unquote_string
    funcname = message_lineno = None
    messages = []
    last_argument = None
    translator_comments = []
    concatenate_next = False
    encoding = options.get('encoding', 'utf-8')
    last_token = None
    call_stack = -1

    for token in tokenize(fileobj.read().decode(encoding)):
        if token.type == 'operator' and token.value == '(':
            if funcname:
                message_lineno = token.lineno
                call_stack += 1

        elif call_stack == -1 and token.type == 'linecomment':
            value = token.value[2:].strip()
            if translator_comments and \
               translator_comments[-1][0] == token.lineno - 1:
                translator_comments.append((token.lineno, value))
                continue

            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comments.append((token.lineno, value.strip()))
                    break

        elif token.type == 'multilinecomment':
            # only one multi-line comment may precede a translation
            translator_comments = []
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value.splitlines()
                    if lines:
                        lines[0] = lines[0].strip()
                        lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                        for offset, line in enumerate(lines):
                            translator_comments.append(
                                (token.lineno + offset, line))
                    break

        elif funcname and call_stack == 0:
            if token.type == 'operator' and token.value == ')':
                if last_argument is not None:
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                   translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                if messages is not None:
                    yield (message_lineno, funcname, messages,
                           [comment[1] for comment in translator_comments])

                funcname = message_lineno = last_argument = None
                concatenate_next = False
                translator_comments = []
                messages = []
                call_stack = -1

            elif token.type == 'string':
                new_value = unquote_string(token.value)
                if concatenate_next:
                    last_argument = (last_argument or '') + new_value
                    concatenate_next = False
                else:
                    last_argument = new_value

            elif token.type == 'operator':
                if token.value == ',':
                    if last_argument is not None:
                        messages.append(last_argument)
                        last_argument = None
                    else:
                        messages.append(None)
                    concatenate_next = False
                elif token.value == '+':
                    concatenate_next = True

        elif call_stack > 0 and token.type == 'operator' \
                and token.value == ')':
            call_stack -= 1

        elif funcname and call_stack == -1:
            funcname = None

        elif call_stack == -1 and token.type == 'name' and \
                token.value in keywords and \
                (last_token is None or last_token.type != 'name' or
                 last_token.value != 'function'):
            funcname = token.value

        last_token = token

def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
    :rtype: ``iterator``
    """
    from babel.messages.jslexer import tokenize, unquote_string
    funcname = message_lineno = None
    messages = []
    last_argument = None
    translator_comments = []
    concatenate_next = False
    encoding = options.get('encoding', 'utf-8')
    last_token = None
    call_stack = -1

    data = fileobj.read()
    if isinstance(data, bytes):
        data = data.decode(encoding)

    for token in tokenize(data):
        if token.type == 'operator' and token.value == '(':
            if funcname:
                message_lineno = token.lineno
                call_stack += 1

        elif call_stack == -1 and token.type == 'linecomment':
            value = token.value[2:].strip()
            if translator_comments and \
               translator_comments[-1][0] == token.lineno - 1:
                translator_comments.append((token.lineno, value))
                continue

            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comments.append((token.lineno, value.strip()))
                    break

        elif token.type == 'multilinecomment':
            # only one multi-line comment may precede a translation
            translator_comments = []
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value.splitlines()
                    if lines:
                        lines[0] = lines[0].strip()
                        lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                        for offset, line in enumerate(lines):
                            translator_comments.append(
                                (token.lineno + offset, line))
                    break

        elif funcname and call_stack == 0:
            if token.type == 'operator' and token.value == ')':
                if last_argument is not None:
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                   translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                if messages is not None:
                    yield (message_lineno, funcname, messages,
                           [comment[1] for comment in translator_comments])

                funcname = message_lineno = last_argument = None
                concatenate_next = False
                translator_comments = []
                messages = []
                call_stack = -1

            elif token.type == 'string':
                new_value = unquote_string(token.value)
                if concatenate_next:
                    last_argument = (last_argument or '') + new_value
                    concatenate_next = False
                else:
                    last_argument = new_value

            elif token.type == 'operator':
                if token.value == ',':
                    if last_argument is not None:
                        messages.append(last_argument)
                        last_argument = None
                    else:
                        messages.append(None)
                    concatenate_next = False
                elif token.value == '+':
                    concatenate_next = True

        elif call_stack > 0 and token.type == 'operator' \
                and token.value == ')':
            call_stack -= 1

        elif funcname and call_stack == -1:
            funcname = None

        elif call_stack == -1 and token.type == 'name' and \
                token.value in keywords and \
                (last_token is None or last_token.type != 'name' or
                 last_token.value != 'function'):
            funcname = token.value

        last_token = token

def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
                    Supported options are:
                    * `jsx` -- set to false to disable JSX/E4X support.
                    * `template_string` -- set to false to disable ES6
                      template string support.
    """
    from babel.messages.jslexer import Token, tokenize, unquote_string
    funcname = message_lineno = None
    messages = []
    last_argument = None
    translator_comments = []
    concatenate_next = False
    encoding = options.get('encoding', 'utf-8')
    last_token = None
    call_stack = -1
    dotted = any('.' in kw for kw in keywords)

    for token in tokenize(
        fileobj.read().decode(encoding),
        jsx=options.get("jsx", True),
        template_string=options.get("template_string", True),
        dotted=dotted
    ):
        if (  # Turn keyword`foo` expressions into keyword("foo") calls:
            funcname and  # have a keyword...
            (last_token and last_token.type == 'name') and  # we've seen nothing after the keyword...
            token.type == 'template_string'  # this is a template string
        ):
            message_lineno = token.lineno
            messages = [unquote_string(token.value)]
            call_stack = 0
            token = Token('operator', ')', token.lineno)

        if token.type == 'operator' and token.value == '(':
            if funcname:
                message_lineno = token.lineno
                call_stack += 1

        elif call_stack == -1 and token.type == 'linecomment':
            value = token.value[2:].strip()
            if translator_comments and \
               translator_comments[-1][0] == token.lineno - 1:
                translator_comments.append((token.lineno, value))
                continue

            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comments.append((token.lineno, value.strip()))
                    break

        elif token.type == 'multilinecomment':
            # only one multi-line comment may precede a translation
            translator_comments = []
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value.splitlines()
                    if lines:
                        lines[0] = lines[0].strip()
                        lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                        for offset, line in enumerate(lines):
                            translator_comments.append(
                                (token.lineno + offset, line))
                    break

        elif funcname and call_stack == 0:
            if token.type == 'operator' and token.value == ')':
                if last_argument is not None:
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                   translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                if messages is not None:
                    yield (message_lineno, funcname, messages,
                           [comment[1] for comment in translator_comments])

                funcname = message_lineno = last_argument = None
                concatenate_next = False
                translator_comments = []
                messages = []
                call_stack = -1

            elif token.type in ('string', 'template_string'):
                new_value = unquote_string(token.value)
                if concatenate_next:
                    last_argument = (last_argument or '') + new_value
                    concatenate_next = False
                else:
                    last_argument = new_value

            elif token.type == 'operator':
                if token.value == ',':
                    if last_argument is not None:
                        messages.append(last_argument)
                        last_argument = None
                    else:
                        messages.append(None)
                    concatenate_next = False
                elif token.value == '+':
                    concatenate_next = True

        elif call_stack > 0 and token.type == 'operator' \
                and token.value == ')':
            call_stack -= 1

        elif funcname and call_stack == -1:
            funcname = None

        elif call_stack == -1 and token.type == 'name' and \
                token.value in keywords and \
                (last_token is None or last_token.type != 'name' or
                 last_token.value != 'function'):
            funcname = token.value

        last_token = token

def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)

    Set the option 'messages_only' to True and this method will yield a
    tuple of messages and their file position.

    If 'messages_only' is set to True then the return type is a tuple
    holding ``(messages, start, end)``. This represents a list of message
    strings and two number positions of those strings in the file. The
    positions in the file are absolute positions of the text in the file.

    For example:

        $._("Hello.")
            ^......^ - Message position

    Will yield: ``(["Hello."], 4, 11)``.

    And for pluralization:

        $.ngettext("Singular %s.", "Plural %s.", num)
                   ^..........................^ - Message position

    Will yield: ``(["Singular %s.", "Plural %s."], 12, 39)``.

    Note that the start and end positions even transcend the comma and
    whitespace in between the two strings. This even works for more
    complicated examples:

             v.............
        $._( "Hello %s " +
             "How are you doing?" , name);
        .......................^ - Message position.

    This will yield: ``(["Hello %s How are you doing?"], 6, 43)``.

    All of this makes it easy to replace those string messages with new
    messages since you can quickly replace all of the text between the
    start and end positions with your modified text.
    """
    from babel.messages.jslexer import tokenize, unquote_string
    funcname = message_lineno = None
    messages = []
    last_argument = None
    translator_comments = []
    concatenate_next = False
    encoding = options.get('encoding', 'utf-8')
    last_token = None
    call_stack = -1

    # NOTE(jeresig): Custom functionality added.
    # Setting the option 'messages_only' to True makes this method yield a
    # tuple of messages and their file position.
    messages_only = options.get('messages_only', False)
    if messages_only:
        messages_start = messages_end = None

    for token in tokenize(fileobj.read().decode(encoding)):
        # NOTE(jeresig): Made it so that ( or [ or { all increase
        # the call stack (to avoid capturing things contained within
        # these particular constructs).
        if token.type == 'operator' and token.value in ('{', '[', '('):
            if funcname:
                message_lineno = token.lineno
                call_stack += 1

        elif call_stack == -1 and token.type == 'linecomment':
            value = token.value[2:].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comments.append((token.lineno, value.strip()))
                    break

            if translator_comments and \
               translator_comments[-1][0] == token.lineno - 1:
                # Add this comment to the one on the previous row
                translator_comments[-1] = (token.lineno, "%s %s" % (
                    translator_comments[-1][1], value))
                continue

        elif token.type == 'multilinecomment':
            # only one multi-line comment may precede a translation
            translator_comments = []
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value.splitlines()
                    if lines:
                        lines[0] = lines[0].strip()
                        lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                        translator_comments.append(
                            (token.lineno + len(lines), ' '.join(lines)))
                    break

        elif funcname and call_stack == 0:
            if token.type == 'operator' and token.value == ')':
                if last_argument is not None:
                    # NOTE(jeresig): Custom functionality added.
                    if messages_only:
                        # End position is continually updated after every
                        # message (making it so that the end of the last
                        # message is the last reported end position)
                        messages_end = token.match.start()
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                   translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                if messages is not None:
                    # NOTE(jeresig): Custom functionality added.
                    if messages_only:
                        yield (messages, messages_start, messages_end)
                        messages_start = messages_end = None
                    else:
                        yield (message_lineno, funcname, messages,
                               [comment[1] for comment in translator_comments])

                funcname = message_lineno = last_argument = None
                concatenate_next = False
                translator_comments = []
                messages = []
                call_stack = -1

            elif token.type == 'string':
                # NOTE(jeresig): Custom functionality added.
                # We've encountered a string; strings hold messages to be
                # translated. We use this opportunity to update the
                # messages_start and messages_end variables which keep track
                # of the positions of the messages in the file.
                if messages_only and messages_start is None:
                    # Only update the messages_start position when we're at
                    # the first message.
                    messages_start = token.match.start()

                new_value = unquote_string(token.value)
                if concatenate_next:
                    last_argument = (last_argument or '') + new_value
                    concatenate_next = False
                else:
                    last_argument = new_value

            elif token.type == 'operator':
                if token.value == ',':
                    if last_argument is not None:
                        # NOTE(jeresig): Custom functionality added.
                        if messages_only:
                            # End position is continually updated after every
                            # message (making it so that the end of the last
                            # message is the last reported end position)
                            messages_end = token.match.start()
                        messages.append(last_argument)
                        last_argument = None
                    else:
                        messages.append(None)
                    concatenate_next = False
                elif token.value == '+':
                    concatenate_next = True

        # NOTE(jeresig): Made it so that ) or ] or } all decrease
        # the call stack (to avoid capturing things contained within
        # these particular constructs).
        elif call_stack > 0 and token.type == 'operator' \
                and token.value in ('}', ']', ')'):
            call_stack -= 1

        elif funcname and call_stack == -1:
            funcname = None

        elif call_stack == -1 and token.type == 'name' and \
                token.value in keywords and \
                (last_token is None or last_token.type != 'name' or
                 last_token.value != 'function'):
            funcname = token.value

        last_token = token

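# A minimal usage sketch for the messages_only mode described above
# (hypothetical input; this relies on the customised jslexer of that fork,
# whose tokens expose a `match` object for absolute file positions).
import io

src = b'$._("Hello.")'
for messages, start, end in extract_javascript(
        io.BytesIO(src), ['_'], [], {'messages_only': True}):
    # Per the docstring, this reports the extracted message text together
    # with its absolute start/end offsets in the source file.
    print(messages, start, end)
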
def test_unquote():
    assert jslexer.unquote_string('""') == ''
    assert jslexer.unquote_string(r'"h\u00ebllo"') == u"hëllo"

def add_result(self, token):
    value = unquote_string(token.value)
    if value not in self.results:
        self.results[value] = deque()
    self.results[value].append(token.lineno)

def add_result(self, token):
    if self.is_keepable():
        result = dict(line_number=token.lineno, function=u'gettext',
                      value=[unquote_string(token.value)])
        self.results.append(result)