Example no. 1
  def __init__(self, max_output_token_length=MAX_OUTPUT_TOKEN_LENGTH,
               reserved=()):
    self.types_to_skip = ()
    self.reserved = reserved
    self.mappings: Dict[str, str]
    self.update_mappings({
        # By default, replace \n and \r. This is meant primarily for literals.
        '\n':
            unified_tokenizer.quote_special('NLCHAR'),
        '\r':
            unified_tokenizer.quote_special('CR'),
        unified_tokenizer.SENTINEL:
            unified_tokenizer.quote_special(unified_tokenizer.SENTINEL_ESCAPE),
    })
    self.max_output_token_length = max_output_token_length
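
# Illustration of the default mappings above (a sketch, not part of the
# tokenizer): applying the replacements directly shows how raw newlines and
# carriage returns inside a token spelling are rewritten to quoted special
# tokens, so a single token never spans multiple output lines. The real
# pipeline applies these mappings during sanitization via update_mappings.
from cubert import unified_tokenizer

_literal_spelling = '"a\nb"'  # a string-literal spelling with a raw newline
_sanitized = _literal_spelling.replace(
    '\n', unified_tokenizer.quote_special('NLCHAR')).replace(
        '\r', unified_tokenizer.quote_special('CR'))
# _sanitized is now a single-line spelling containing the quoted NLCHAR token.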
Example no. 2
  def untokenize_agnostic(self, token_list):
    """Turns CuBERT subtokens into whole tokens."""
    # Untokenize agnostic.
    if (not token_list or token_list[-1] != unified_tokenizer.quote_special(
        unified_tokenizer.TokenKind.EOS.name)):
      raise ValueError('Token list %r should end with the EOS token %r.' %
                       (token_list,
                        unified_tokenizer.quote_special(
                            unified_tokenizer.TokenKind.EOS.name)))

    whole_tokens = unified_tokenizer.reconstitute_full_unsanitary_tokens(
        token_list,
        sanitization_mapping=self.mappings,
        sentinel=unified_tokenizer.SENTINEL)
    return whole_tokens

  def untokenize_abstract(self, whole_tokens):
    tokens: List[str] = []

    for token in whole_tokens[:-1]:  # Skip EOS. The caller checked it's there.
      if token == unified_tokenizer.quote_special(
          unified_tokenizer.TokenKind.NEWLINE.name):
        tokens.append('\n')
      else:
        tokens.append(token)
    return ''.join(tokens)
def wordpiece_ids_to_code(wordpiece_ids, initial_tokenizer, subword_tokenizer):
    """Reverses the Wordpiece-to-CuBERT Subtoken-to-whole token conversion."""
    cubert_subtokens: List[str] = (
        subword_tokenizer._subtoken_ids_to_tokens(  # pylint: disable=protected-access
            wordpiece_ids))

    # We strip EOS in `code_to_cubert_sentences`, so we have to add it back here.
    cubert_subtokens.append(
        unified_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name))

    code = initial_tokenizer.untokenize(cubert_subtokens)
    return code
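
# A hypothetical usage sketch for `wordpiece_ids_to_code` (the instances below
# are assumptions for illustration and are not constructed in this excerpt):
#
#   python_tokenizer = ...       # a CuBERT tokenizer, e.g. for Python
#   subword_text_encoder = ...   # the trained WordPiece/SubwordTextEncoder
#   code = wordpiece_ids_to_code(
#       wordpiece_ids, python_tokenizer, subword_text_encoder)
#   # `code` is the source text reconstructed from the WordPiece ids.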
"""Tests for java_tokenizer."""
from typing import Sequence, Tuple

from absl.testing import absltest
from absl.testing import parameterized
from cubert import java_tokenizer
from cubert import unified_tokenizer

_NEWLINE_NAME = unified_tokenizer.quote_special(
    unified_tokenizer.TokenKind.NEWLINE.name)
_EOS_NAME = unified_tokenizer.quote_special(
    unified_tokenizer.TokenKind.EOS.name)


class JavaTokenizerTest(parameterized.TestCase):
    @parameterized.named_parameters(
        (
            'nothing',
            '',
            (),
        ),
        (
            'same_line',
            """TokenA TokenB""",
            #  0     67
Example no. 7

import keyword
import re
import tokenize
import typing
from typing import Any
from typing import List
from typing import Tuple

from absl import logging

from cubert import cubert_tokenizer
from cubert import unified_tokenizer


class PythonTokenizer(cubert_tokenizer.CuBertTokenizer):
  """Tokenizer that extracts Python's lexical elements preserving strings."""
  _TOKEN_TYPE_MAP = {
      tokenize.COMMENT: unified_tokenizer.TokenKind.COMMENT,
      tokenize.DEDENT: unified_tokenizer.TokenKind.KEYWORD,
      tokenize.ENDMARKER: unified_tokenizer.TokenKind.EOS,
      tokenize.ERRORTOKEN: unified_tokenizer.TokenKind.ERROR,
      tokenize.INDENT: unified_tokenizer.TokenKind.KEYWORD,
      tokenize.NEWLINE: unified_tokenizer.TokenKind.NEWLINE,
      tokenize.NL: unified_tokenizer.TokenKind.PUNCTUATION,
      tokenize.NUMBER: unified_tokenizer.TokenKind.NUMBER,
      tokenize.OP: unified_tokenizer.TokenKind.PUNCTUATION,
      tokenize.STRING: unified_tokenizer.TokenKind.STRING,
  }
  _REVERSE_TOKEN_MAP = {
      cubert_tokenizer.token_from_token_type(tokenize.INDENT):
          tokenize.INDENT,
      cubert_tokenizer.token_from_token_type(tokenize.DEDENT):
          tokenize.DEDENT,
      unified_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name):
          tokenize.ENDMARKER,
      unified_tokenizer.quote_special(unified_tokenizer.TokenKind.ERROR.name):
          tokenize.ERRORTOKEN,
      unified_tokenizer.quote_special(unified_tokenizer.TokenKind.NEWLINE.name):
          tokenize.NEWLINE,
      cubert_tokenizer.token_from_token_type(tokenize.NL):
          tokenize.NL,
  }
  # Adding the end-of-string anchor \Z below, since re.fullmatch wasn't
  # available in Python2.
  _NUMBERS = re.compile('(' + tokenize.Number + r')\Z')  # pytype: disable=module-attr
  _SINGLE_STRINGS = re.compile('(' + tokenize.String + r')\Z')  # pytype: disable=module-attr
  _TRIPLE_STRING_BEGINNINGS = re.compile(tokenize.Triple)  # pytype: disable=module-attr
  _COMMENTS = re.compile('(' + tokenize.Comment + r')\Z')  # pytype: disable=module-attr

  _EXACT_TOKEN_TYPES = tokenize.EXACT_TOKEN_TYPES.keys()  # pytype: disable=module-attr

  # Token types that CubertTokenizer will tokenize by their type and not
  # content.
  _TOKEN_TYPES_TO_TOKENIZE_BY_TYPE = [
      tokenize.NEWLINE, tokenize.DEDENT, tokenize.NL
  ]

  def tokenize_and_abstract(
      self,
      source_code):
    """Produces a language-agnostic tokenization of the input code."""
    agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

    try:
      token_tuples = unified_tokenizer.code_to_tokens(source_code)
    except (tokenize.TokenError, IndentationError) as e:
      logging.warning('The tokenizer raised exception `%s` while parsing %s', e,
                      source_code)

      # We don't try to do recovery from errors quite yet. Emit just an
      # error and end-of-sequence and return.
      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              unified_tokenizer.quote_special(
                  unified_tokenizer.TokenKind.ERROR.name),
              unified_tokenizer.TokenKind.ERROR,
              unified_tokenizer.TokenMetadata(
                  start=unified_tokenizer.Position(
                      line=0, column=0),
                  end=unified_tokenizer.Position(
                      line=0, column=0))))
      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              unified_tokenizer.quote_special(
                  unified_tokenizer.TokenKind.EOS.name),
              unified_tokenizer.TokenKind.EOS,
              unified_tokenizer.TokenMetadata(
                  start=unified_tokenizer.Position(
                      line=0, column=0),
                  end=unified_tokenizer.Position(
                      line=0, column=0))))
      return agnostic_tokens

    for token_tuple in token_tuples:
      spelling = token_tuple.string
      kind = token_tuple.type

      # We'll adjust the spelling of some tokens, e.g., those that we
      # tokenize by their type rather than their original spelling. Indentation
      # and dedentation tokens are like that.
      adjusted_spelling = spelling
      token_kind = unified_tokenizer.TokenKind.NONE
      if kind == tokenize.NAME:
        # Disambiguate identifiers from keywords.
        if keyword.iskeyword(spelling):
          token_kind = unified_tokenizer.TokenKind.KEYWORD
        else:
          token_kind = unified_tokenizer.TokenKind.IDENTIFIER
      else:
        if kind in PythonTokenizer._TOKEN_TYPES_TO_TOKENIZE_BY_TYPE:
          # Replace spelling with type.
          adjusted_spelling = cubert_tokenizer.token_from_token_type(kind)
        elif kind is tokenize.INDENT:
          # For INDENT, in particular, we also record the actual spelling.
          adjusted_spelling = '{indent}{spelling}'.format(
              indent=cubert_tokenizer.token_from_token_type(kind),
              spelling=spelling)
        elif kind == tokenize.ENDMARKER:
          adjusted_spelling = unified_tokenizer.quote_special(
              unified_tokenizer.TokenKind.EOS.name)

        # Map everything according to table.
        try:
          token_kind = PythonTokenizer._TOKEN_TYPE_MAP[kind]
        except KeyError as ke:
          # It's possible we're here because of async/await. Those kept being
          # turned into keywords and then removed from keywords, so we can't
          # rely on knowing which they are. We'll check by spelling.
          # See: https://bugs.python.org/issue30406
          # and https://bugs.python.org/issue33260
          # and https://bugs.python.org/issue35975
          if spelling in ('async', 'await'):
            token_kind = unified_tokenizer.TokenKind.KEYWORD
          else:
            raise ValueError('While trying to turn Python token %r into an '
                             'agnostic one, raised %r.' %
                             ((spelling, kind), ke))

      start_line, start_column = token_tuple.start
      end_line, end_column = token_tuple.end
      # Unlike other languages, NEWLINE tokens are reported as ending on the
      # same line as where they started. We adjust that here, to stick to the
      # same convention as other tokenizers.
      if ((token_kind == unified_tokenizer.TokenKind.NEWLINE) or
          (kind == tokenize.NL)):
        end_line = start_line + 1
        end_column = 0

      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              spelling=adjusted_spelling, kind=token_kind,
              metadata=unified_tokenizer.TokenMetadata(
                  # Python's tokenizer counts lines starting from 1, so we
                  # have to offset what we read from the `TokenInfo` tuple.
                  start=unified_tokenizer.Position(
                      line=start_line - 1, column=start_column),
                  end=unified_tokenizer.Position(
                      line=end_line - 1, column=end_column))))

    return agnostic_tokens

  def untokenize_abstract(self, whole_tokens):
    # Reconstruct Python tokenizer tuples, so that Python's untokenize can be
    # invoked.
    token_tuples: List[Tuple[int, str]] = []

    for whole_token in whole_tokens:
      if whole_token in PythonTokenizer._EXACT_TOKEN_TYPES:
        token_tuples.append((tokenize.OP, whole_token))
      elif cubert_tokenizer.token_from_token_type(
          tokenize.INDENT) in whole_token:
        # We baked the type and spelling into one token. Break them up.
        spelling = whole_token.replace(
            cubert_tokenizer.token_from_token_type(tokenize.INDENT), '')
        token_tuples.append((tokenize.INDENT, spelling))
      elif whole_token in PythonTokenizer._REVERSE_TOKEN_MAP:
        python_kind = PythonTokenizer._REVERSE_TOKEN_MAP[whole_token]
        if python_kind in (tokenize.DEDENT, tokenize.ENDMARKER,
                           tokenize.ERRORTOKEN):
          spelling = ''
        else:  # python_kind in (tokenize.NEWLINE, tokenize.NL)
          spelling = '\n'
        token_tuples.append((python_kind, spelling))
      elif keyword.iskeyword(whole_token):
        token_tuples.append((tokenize.NAME, whole_token))
      elif PythonTokenizer._NUMBERS.match(whole_token):
        token_tuples.append((tokenize.NUMBER, whole_token))
      elif PythonTokenizer._SINGLE_STRINGS.match(whole_token):
        token_tuples.append((tokenize.STRING, whole_token))
      elif PythonTokenizer._TRIPLE_STRING_BEGINNINGS.match(whole_token):
        token_tuples.append((tokenize.STRING, whole_token))
      elif PythonTokenizer._COMMENTS.match(whole_token):
        token_tuples.append((tokenize.COMMENT, whole_token))
      else:
        # Everything else we map back to NAME.
        token_tuples.append((tokenize.NAME, whole_token))

    reconstructed = tokenize.untokenize(typing.cast(Any, token_tuples))
    return reconstructed
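
# A hypothetical round-trip sketch (assumes the CuBertTokenizer base class
# exposes `tokenize`/`untokenize` built on the abstract methods above; the
# instance and source strings are illustrative):
#
#   tokenizer = PythonTokenizer()
#   subtokens = tokenizer.tokenize('x = 1\n')
#   recovered = tokenizer.untokenize(subtokens)
#   # `recovered` is equivalent Python source; exact whitespace may differ,
#   # since untokenize_abstract goes through tokenize.untokenize in its
#   # two-tuple ("compat") mode.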
Example no. 8
import tokenize

from cubert import unified_tokenizer


def token_from_token_type(token_type):
  """Turns a token type into a reserved token string."""
  # We use the tok_name dict from tokenize, not token. The former has
  # NL and COMMENT and such, whereas the latter doesn't.
  return unified_tokenizer.quote_special(tokenize.tok_name[token_type])
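
# For illustration (a sketch; the exact spelling is whatever
# unified_tokenizer.quote_special produces for the given token-type name):
print(token_from_token_type(tokenize.NL))       # the quoted 'NL' name
print(token_from_token_type(tokenize.COMMENT))  # the quoted 'COMMENT' name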
  def tokenize_and_abstract(
      self,
      source_code):
    """As per the superclass."""
    agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

    try:
      java_tokens = list(
          extended_javalang_tokenizer.tokenize_extended(source_code))
    except (javalang.LexerError, TypeError) as e:
      # Sometimes, javalang returns a TypeError when reading a number.
      # See
      # https://github.com/c2nes/javalang/blob/0664afb7f4d40254312693f2e833c1ed4ac551c7/javalang/tokenizer.py#L370
      logging.warning('The tokenizer raised exception `%r` while parsing %s', e,
                      source_code)

      # We don't try to recover from errors yet. Mark the error as occurring
      # at whatever position we are in and terminate.
      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              unified_tokenizer.quote_special(
                  unified_tokenizer.TokenKind.ERROR.name),
              unified_tokenizer.TokenKind.ERROR,
              unified_tokenizer.TokenMetadata(
                  start=unified_tokenizer.Position(
                      line=0, column=0),
                  end=unified_tokenizer.Position(
                      line=0, column=0))))
      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              '',
              unified_tokenizer.TokenKind.EOS,
              unified_tokenizer.TokenMetadata(
                  start=unified_tokenizer.Position(
                      line=0, column=0),
                  end=unified_tokenizer.Position(
                      line=0, column=0))))
    else:
      start_line = 0
      start_column = 0
      for token in java_tokens:
        # The token kind is the subclass type of the token.
        token_type = type(token)
        if token_type not in JavaTokenizer._TOKEN_TYPE_MAP:
          raise ValueError(
              'Received Java token type %s, but it was unexpected, '
              'while tokenizing \n%s\n' % (token_type, source_code))

        # JavaTokenizer counts lines and columns from 1.
        start_line = token.position.line - 1
        start_column = token.position.column - 1

        # The tokenizer seems to take some liberties with Unicode, returning
        # invalid characters. This cleans spellings up.
        spelling = token.value.encode('utf-8', errors='replace').decode('utf-8')
        agnostic_tokens.append(
            unified_tokenizer.AbstractToken(
                spelling, JavaTokenizer._TOKEN_TYPE_MAP[token_type],
                unified_tokenizer.TokenMetadata(
                    start=unified_tokenizer.Position(
                        line=start_line, column=start_column))))

    # At this point, we have all the tokens, either as produced and abstracted,
    # or a placeholder error and eos in case of an exception. However, the
    # tokens only have start positions. Since the extended tokenizer guarantees
    # that tokens abut, we take a second pass, backwards, setting the end
    # position of a token from the start position of the token following it.
    # The final token, `EOS`, already has an end position, so we don't modify
    # it.
    eos = agnostic_tokens[-1]
    if not eos.metadata.start:
      # This should be there. Raise an exception
      raise AssertionError('The end of input token is missing positioning '
                           'information: %s' % eos)
    # EOS contains an empty spelling. We replace it here with EOS.name
    eos = dataclasses.replace(
        eos,
        spelling=unified_tokenizer.quote_special(
            unified_tokenizer.TokenKind.EOS.name))

    later_token_start: unified_tokenizer.Position = eos.metadata.start

    # The EOS token has an empty extent, so the end and the start are set to be
    # the same.
    filled_agnostic_tokens = [
        dataclasses.replace(
            eos,
            metadata=dataclasses.replace(eos.metadata, end=eos.metadata.start))
    ]
    # Go backwards, from the element before `eos` to the beginning.
    for token in (
        agnostic_tokens[i] for i in range(len(agnostic_tokens) - 2, -1, -1)):
      filled_token = dataclasses.replace(
          token,
          metadata=dataclasses.replace(token.metadata, end=later_token_start))
      filled_agnostic_tokens.append(filled_token)
      later_token_start = token.metadata.start

    # Now we have the tokens, including end position, but they're reversed.
    # The final step is to break down whitespace tokens into primitive
    # WHITESPACE tokens and NEWLINE tokens.
    with_broken_whitespace = []
    for token in filled_agnostic_tokens[::-1]:
      if token.kind is not unified_tokenizer.TokenKind.WHITESPACE:
        with_broken_whitespace.append(token)
      else:
        # This is whitespace. Replace it with primitive tokens.
        with_broken_whitespace.extend(
            unified_tokenizer.fill_range_with_whitespace(
                token.metadata.start, token.metadata.end))

    return with_broken_whitespace
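
# Illustration of the backward pass above (a sketch with made-up positions):
# for abutting tokens starting at (0, 0), (0, 3), and (0, 4), the pass assigns
#   tokens[0].end = tokens[1].start == (0, 3)
#   tokens[1].end = tokens[2].start == (0, 4)
# while the final EOS token keeps an empty extent (end == start).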
def next_whole_token(wordpiece_subtokens, initial_tokenizer,
                     subword_tokenizer):
    """Greedily reconstitutes a whole token from a WordPiece list.

    This function assumes that the wordpiece subtokens were constructed correctly
    from a correctly subtokenized CuBERT tokenizer, but the sequence may be
    truncated and thus incomplete.

    The implementation is done in two stages: recognizing the first whole token
    and then finding the correspondence of that first whole token to a prefix of
    the subtoken sequence.

    The implementation assumes that untokenization can do the best job on the full
    context. So, it first untokenizes the whole sequence, and chooses the first
    whole token.

    To figure out the subtoken prefix that corresponds to that whole token, the
    implementation greedily untokenizes longer and longer subtoken prefixes, until
    the whole token is recognized in the output.

    The reason for this somewhat expensive implementation is that the logic for
    merging subtokens (for WordPiece and then for CuBERT) is intricate, and does
    not export how many initial subtokens were consumed for each output token of
    the next higher abstraction. What's more, a subtoken may align itself with
    the previous or the next whole token, when the subtoken sequence is
    incomplete.

    Args:
      wordpiece_subtokens: The subtokens to scan through.
      initial_tokenizer: A CuBERT tokenizer.
      subword_tokenizer: A SubwordTextEncoder.

    Returns:
      The first whole token matched, and the end index, that is, the index of
      the first subtoken after the matched whole token.
      wordpiece_subtokens[0:end_index] should be the subtokens corresponding to
      the whole token returned.

    Raises:
      ValueError: if no whole token can be parsed.
    """

    wordpiece_ids = wordpiece_ids_from_wordpiece_tokens(
        wordpiece_subtokens, subword_tokenizer)
    full_cubert_subtokens: List[str] = (
        subword_tokenizer._subtoken_ids_to_tokens(  # pylint: disable=protected-access
            wordpiece_ids))

    full_cubert_subtokens.append(
        unified_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name))

    full_whole_tokens = initial_tokenizer.untokenize_agnostic(
        full_cubert_subtokens)

    if len(full_whole_tokens) < 2:
        # It all came out a jumble. Reject it.
        raise ValueError(f'Whole tokens {full_whole_tokens} ended up '
                         f'undifferentiable in {wordpiece_subtokens}.')

    whole_token = full_whole_tokens[0]

    for end_index in range(1, len(wordpiece_ids) + 1):
        prefix_list = wordpiece_ids[:end_index]
        partial_cubert_subtokens: List[str] = (
            subword_tokenizer._subtoken_ids_to_tokens(  # pylint: disable=protected-access
                prefix_list))

        # We strip EOS in `code_to_cubert_sentences`, so we have to add it back
        # here.
        partial_cubert_subtokens.append(
            unified_tokenizer.quote_special(
                unified_tokenizer.TokenKind.EOS.name))

        partial_whole_tokens = initial_tokenizer.untokenize_agnostic(
            partial_cubert_subtokens)
        if len(partial_whole_tokens) > 1:
            if partial_whole_tokens[0] == whole_token:
                return whole_token, end_index

    # We got here because we couldn't match the whole token we found from the
    # full sequence
    raise ValueError('Could not find a whole token in %r' %
                     (wordpiece_subtokens, ))
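
# A hypothetical usage sketch for `next_whole_token` (the instances below are
# assumptions for illustration, not constructed in this excerpt):
#
#   whole_token, end_index = next_whole_token(
#       wordpiece_subtokens, python_tokenizer, subword_text_encoder)
#   # `whole_token` is the first reconstructed whole token, and
#   # wordpiece_subtokens[:end_index] are the WordPiece subtokens it spans.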
"""Java tokenizer, built on the extended javalang tokenizer held in this
repository. The extension enables the javalang tokenizer to return end positions
as well as end-of-sequence tokens and comments.
"""
import dataclasses
from typing import List
from typing import Sequence

from absl import logging
from javalang import tokenizer as javalang

from cubert import cubert_tokenizer
from cubert import extended_javalang_tokenizer
from cubert import unified_tokenizer


QUOTED_EOS_NAME = unified_tokenizer.quote_special(
    unified_tokenizer.TokenKind.EOS.name)


class JavaTokenizer(cubert_tokenizer.CuBertTokenizer):
  """Tokenizer that extracts Python's lexical elements preserving strings."""
  _TOKEN_TYPE_MAP = {
      javalang.EndOfInput:
          unified_tokenizer.TokenKind.EOS,
      javalang.Keyword:
          unified_tokenizer.TokenKind.KEYWORD,
      javalang.Modifier:
          unified_tokenizer.TokenKind.KEYWORD,
      javalang.Separator:
          unified_tokenizer.TokenKind.PUNCTUATION,
      javalang.Operator:
          unified_tokenizer.TokenKind.PUNCTUATION,
      # ... (the remaining javalang token-type mappings are truncated here)
  }


import keyword
import re
import tokenize
import typing
from typing import Any
from typing import Iterable
from typing import List
from typing import Tuple


class PythonTokenizer(cubert_tokenizer.CuBertTokenizer):
    """Tokenizer that extracts Python's lexical elements preserving strings."""
    _TOKEN_TYPE_MAP = {
        tokenize.COMMENT: unified_tokenizer.TokenKind.COMMENT,
        tokenize.DEDENT: unified_tokenizer.TokenKind.KEYWORD,
        tokenize.ENDMARKER: unified_tokenizer.TokenKind.EOS,
        tokenize.ERRORTOKEN: unified_tokenizer.TokenKind.ERROR,
        tokenize.INDENT: unified_tokenizer.TokenKind.KEYWORD,
        tokenize.NEWLINE: unified_tokenizer.TokenKind.NEWLINE,
        tokenize.NL: unified_tokenizer.TokenKind.PUNCTUATION,
        tokenize.NUMBER: unified_tokenizer.TokenKind.NUMBER,
        tokenize.OP: unified_tokenizer.TokenKind.PUNCTUATION,
        tokenize.STRING: unified_tokenizer.TokenKind.STRING,
    }
    _REVERSE_TOKEN_MAP = {
        cubert_tokenizer.token_from_token_type(tokenize.INDENT):
        tokenize.INDENT,
        cubert_tokenizer.token_from_token_type(tokenize.DEDENT):
        tokenize.DEDENT,
        unified_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name):
        tokenize.ENDMARKER,
        unified_tokenizer.quote_special(unified_tokenizer.TokenKind.ERROR.name):
        tokenize.ERRORTOKEN,
        unified_tokenizer.quote_special(unified_tokenizer.TokenKind.NEWLINE.name):
        tokenize.NEWLINE,
        cubert_tokenizer.token_from_token_type(tokenize.NL):
        tokenize.NL,
    }
    # Adding the end-of-string anchor \Z below, since re.fullmatch wasn't
    # available in Python2.
    _NUMBERS = re.compile('(' + tokenize.Number + r')\Z')  # pytype: disable=module-attr
    _SINGLE_STRINGS = re.compile('(' + tokenize.String + r')\Z')  # pytype: disable=module-attr
    _TRIPLE_STRING_BEGINNINGS = re.compile(tokenize.Triple)  # pytype: disable=module-attr
    _COMMENTS = re.compile('(' + tokenize.Comment + r')\Z')  # pytype: disable=module-attr

    _EXACT_TOKEN_TYPES = tokenize.EXACT_TOKEN_TYPES.keys()  # pytype: disable=module-attr

    # Token types that CubertTokenizer will tokenize by their type and not
    # content.
    _TOKEN_TYPES_TO_TOKENIZE_BY_TYPE = [
        tokenize.NEWLINE, tokenize.DEDENT, tokenize.NL
    ]

    def __init__(self, *args, **kwargs):
        super(PythonTokenizer, self).__init__(*args, **kwargs)

        # By default, we drop COMMENT tokens.
        self.update_types_to_skip([unified_tokenizer.TokenKind.COMMENT])

    def tokenize_and_abstract(self, source_code):
        """Produces a language-agnostic tokenization of the input code."""
        token_pairs: Iterable[Tuple[str, int]]
        try:
            token_tuples = unified_tokenizer.code_to_tokens(source_code)
            token_pairs = ((token_name, token_type)
                           for token_type, token_name, _, _, _ in token_tuples)
        except (tokenize.TokenError, IndentationError) as e:
            logging.warning(
                'The tokenizer raised exception `%s` while parsing %s', e,
                source_code)
            token_pairs = (
                (unified_tokenizer.quote_special(
                    unified_tokenizer.TokenKind.ERROR.name),
                 tokenize.ERRORTOKEN),
                ('', tokenize.ENDMARKER),
            )

        agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

        for spelling, kind in token_pairs:
            adjusted_spelling = spelling
            token_kind = unified_tokenizer.TokenKind.NONE
            if kind == tokenize.NAME:
                # Disambiguate identifiers from keywords.
                if keyword.iskeyword(spelling):
                    token_kind = unified_tokenizer.TokenKind.KEYWORD
                else:
                    token_kind = unified_tokenizer.TokenKind.IDENTIFIER
            else:
                if kind in PythonTokenizer._TOKEN_TYPES_TO_TOKENIZE_BY_TYPE:
                    # Replace spelling with type.
                    adjusted_spelling = cubert_tokenizer.token_from_token_type(
                        kind)
                elif kind is tokenize.INDENT:
                    # For INDENT, in particular, we also record the actual spelling.
                    adjusted_spelling = '{indent}{spelling}'.format(
                        indent=cubert_tokenizer.token_from_token_type(kind),
                        spelling=spelling)
                elif kind == tokenize.ENDMARKER:
                    adjusted_spelling = unified_tokenizer.quote_special(
                        unified_tokenizer.TokenKind.EOS.name)

                # Map everything according to table.
                try:
                    token_kind = PythonTokenizer._TOKEN_TYPE_MAP[kind]
                except KeyError as ke:
                    # It's possible we're here because of async/await. Those kept being
                    # turned into keywords and then removed from keywords, so we can't
                    # rely on knowing which they are. We'll check by spelling.
                    # See: https://bugs.python.org/issue30406
                    # and https://bugs.python.org/issue33260
                    # and https://bugs.python.org/issue35975
                    if spelling in ('async', 'await'):
                        token_kind = unified_tokenizer.TokenKind.KEYWORD
                    else:
                        raise ValueError(
                            'While trying to turn Python token %r into an '
                            'agnostic one, raised %r.' %
                            ((spelling, kind), ke))

            agnostic_tokens.append(
                unified_tokenizer.AbstractToken(
                    spelling=adjusted_spelling,
                    kind=token_kind,
                    # TODO(maniatis): Eventually, we'll store token positioning info
                    # in metadata.
                    metadata=unified_tokenizer.TokenMetadata()))

        return agnostic_tokens

    def untokenize_abstract(self, whole_tokens):
        # Reconstruct Python tokenizer tuples, so that Python's untokenize can be
        # invoked.
        token_tuples: List[Tuple[int, str]] = []

        for whole_token in whole_tokens:
            if whole_token in PythonTokenizer._EXACT_TOKEN_TYPES:
                token_tuples.append((tokenize.OP, whole_token))
            elif cubert_tokenizer.token_from_token_type(
                    tokenize.INDENT) in whole_token:
                # We baked the type and spelling into one token. Break them up.
                spelling = whole_token.replace(
                    cubert_tokenizer.token_from_token_type(tokenize.INDENT),
                    '')
                token_tuples.append((tokenize.INDENT, spelling))
            elif whole_token in PythonTokenizer._REVERSE_TOKEN_MAP:
                python_kind = PythonTokenizer._REVERSE_TOKEN_MAP[whole_token]
                if python_kind in (tokenize.DEDENT, tokenize.ENDMARKER,
                                   tokenize.ERRORTOKEN):
                    spelling = ''
                else:  # python_kind in (tokenize.NEWLINE, tokenize.NL)
                    spelling = '\n'
                token_tuples.append((python_kind, spelling))
            elif keyword.iskeyword(whole_token):
                token_tuples.append((tokenize.NAME, whole_token))
            elif PythonTokenizer._NUMBERS.match(whole_token):
                token_tuples.append((tokenize.NUMBER, whole_token))
            elif PythonTokenizer._SINGLE_STRINGS.match(whole_token):
                token_tuples.append((tokenize.STRING, whole_token))
            elif PythonTokenizer._TRIPLE_STRING_BEGINNINGS.match(whole_token):
                token_tuples.append((tokenize.STRING, whole_token))
            elif PythonTokenizer._COMMENTS.match(whole_token):
                token_tuples.append((tokenize.COMMENT, whole_token))
            else:
                # Everything else we map back to NAME.
                token_tuples.append((tokenize.NAME, whole_token))

        reconstructed = tokenize.untokenize(typing.cast(Any, token_tuples))
        return reconstructed
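
# Behavioral note as a sketch (based on __init__ above, and assuming the
# CuBertTokenizer base class exposes `tokenize`/`untokenize`): COMMENT tokens
# are skipped by default via update_types_to_skip, so a round trip such as
#
#   tokenizer = PythonTokenizer()
#   recovered = tokenizer.untokenize(tokenizer.tokenize('x = 1  # note\n'))
#
# yields source without the trailing comment.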