def tokenize(
    code: str, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = (1, 0)
) -> Generator[PythonToken, None, None]:
    """Generate :class:`PythonToken` objects from the given source string.

    The source is first split into lines (newlines preserved) and then handed
    to ``tokenize_lines``; ``start_pos`` gives the (line, column) at which
    tokenization begins, defaulting to the start of the file.
    """
    return tokenize_lines(
        split_lines(code, keepends=True), version_info, start_pos=start_pos
    )
def tokenize(code: str, version_info: PythonVersionInfo) -> Iterator[Token]:
    """Tokenize ``code``, preferring the native tokenizer when it is installed.

    Falls back to the pure-Python tokenizer if the ``libcst_native``
    extension cannot be imported.
    """
    try:
        # Both the import and the call stay inside the try-block so any
        # ImportError raised while loading the native module triggers the
        # pure-Python fallback.
        from libcst_native import tokenize as native_tokenize

        return native_tokenize.tokenize(code)
    except ImportError:
        source_lines = split_lines(code, keepends=True)
        return tokenize_lines(code, source_lines, version_info)
def end_pos(self):
    """Return the (line, column) position one past the end of this token.

    For a single-line token the column is advanced by the token's length.
    For a multi-line token the line number is advanced by the number of
    extra lines and the column is reported as 0.
    """
    row, col = self.start_pos
    pieces = split_lines(self.string)
    if len(pieces) == 1:
        return row, col + len(self.string)
    # NOTE(review): multi-line tokens deliberately report column 0 rather
    # than the length of the last line — callers elsewhere compensate for
    # this; preserve the behavior as-is.
    return row + len(pieces) - 1, 0
def check(code):
    """Assert that the last token's end position matches the source extent."""
    produced = _get_token_list(code)
    source_lines = split_lines(code)
    expected_end = (len(source_lines), len(source_lines[-1]))
    assert produced[-1].end_pos == expected_end
def test_split_lines(self, string, expected_result, keepends):
    """Parametrized check that ``split_lines`` honours the *keepends* flag."""
    actual = split_lines(string, keepends=keepends)
    assert actual == expected_result
def detect_config(
    source: Union[str, bytes],
    *,
    partial: PartialParserConfig,
    detect_trailing_newline: bool,
    detect_default_newline: bool,
) -> ConfigDetectionResult:
    """
    Computes a ParserConfig given the current source code to be parsed and a partial
    config.

    Encoding, default newline, default indent, and ``__future__`` imports are
    each taken from ``partial`` when explicitly provided, and otherwise
    auto-detected from the source (signalled by an ``AutoConfig`` sentinel).
    """
    python_version = partial.parsed_python_version
    partial_encoding = partial.encoding
    # Use the caller-supplied encoding unless it is the AutoConfig sentinel.
    encoding = (
        _detect_encoding(source)
        if isinstance(partial_encoding, AutoConfig)
        else partial_encoding
    )
    # Normalize bytes input to str using the (detected or supplied) encoding.
    source_str = source if isinstance(source, str) else source.decode(encoding)
    partial_default_newline = partial.default_newline
    default_newline = (
        (
            _detect_default_newline(source_str)
            if detect_default_newline
            else _FALLBACK_DEFAULT_NEWLINE
        )
        if isinstance(partial_default_newline, AutoConfig)
        else partial_default_newline
    )
    # HACK: The grammar requires a trailing newline, but python doesn't actually require
    # a trailing newline. Add one onto the end to make the parser happy. We'll strip it
    # out again during cst.Module's codegen.
    #
    # I think parso relies on error recovery support to handle this, which we don't
    # have. lib2to3 doesn't handle this case at all AFAICT.
    has_trailing_newline = detect_trailing_newline and _detect_trailing_newline(
        source_str
    )
    if detect_trailing_newline and not has_trailing_newline:
        source_str += default_newline
    lines = split_lines(source_str, keepends=True)
    tokens = tokenize_lines(lines, python_version)
    partial_default_indent = partial.default_indent
    if isinstance(partial_default_indent, AutoConfig):
        # We need to clone `tokens` before passing it to `_detect_indent`, because
        # `_detect_indent` consumes some tokens, mutating `tokens`.
        #
        # Implementation detail: CPython's `itertools.tee` uses weakrefs to reduce the
        # size of its FIFO, so this doesn't retain items (leak memory) for `tokens_dup`
        # once `token_dup` is freed at the end of this method (subject to
        # GC/refcounting).
        tokens, tokens_dup = itertools.tee(tokens)
        default_indent = _detect_indent(tokens_dup)
    else:
        default_indent = partial_default_indent
    partial_future_imports = partial.future_imports
    if isinstance(partial_future_imports, AutoConfig):
        # Same note as above re itertools.tee, we will consume tokens.
        tokens, tokens_dup = itertools.tee(tokens)
        future_imports = _detect_future_imports(tokens_dup)
    else:
        future_imports = partial_future_imports
    return ConfigDetectionResult(
        config=ParserConfig(
            lines=lines,
            encoding=encoding,
            default_indent=default_indent,
            default_newline=default_newline,
            has_trailing_newline=has_trailing_newline,
            version=python_version,
            future_imports=future_imports,
        ),
        tokens=tokens,
    )
def _convert_token(  # noqa: C901: too complex
    state: _TokenizeState, curr_token: OrigToken, next_token: Optional[OrigToken]
) -> Token:
    """Convert one parso token into our richer :class:`Token`.

    Attaches whitespace state (before/after), computes a corrected end
    position, tracks indent/dedent bookkeeping in ``state``, and raises
    ``ParserSyntaxError`` for error tokens, inconsistent dedents, and
    unmatched closing braces. Mutates ``state`` as a side effect.
    """
    ct_type = curr_token.type
    ct_string = curr_token.string
    ct_start_pos = curr_token.start_pos
    if ct_type is _ERRORTOKEN:
        raise ParserSyntaxError(
            f"{ct_string!r} is not a valid token.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )
    if ct_type is _ERROR_DEDENT:
        raise ParserSyntaxError(
            "Inconsistent indentation. Expected a dedent.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )

    # Compute relative indent changes for indent/dedent nodes
    relative_indent: Optional[str] = None
    if ct_type is _INDENT:
        # The new indent is the portion added beyond the previous level.
        old_indent = "" if len(state.indents) < 2 else state.indents[-2]
        new_indent = state.indents[-1]
        relative_indent = new_indent[len(old_indent):]

    if next_token is not None:
        # Peek ahead so the indent stack is already updated when the
        # INDENT/DEDENT token itself is converted.
        nt_type = next_token.type
        if nt_type is _INDENT:
            nt_line, nt_column = next_token.start_pos
            state.indents.append(state.lines[nt_line - 1][:nt_column])
        elif nt_type is _DEDENT:
            state.indents.pop()

    whitespace_before = state.previous_whitespace_state

    if ct_type is _INDENT or ct_type is _DEDENT or ct_type is _ENDMARKER:
        # Don't update whitespace state for these dummy tokens. This makes it possible
        # to partially parse whitespace for IndentedBlock footers, and then parse the
        # rest of the whitespace in the following statement's leading_lines.
        # Unfortunately, that means that the indentation is either wrong for the footer
        # comments, or for the next line. We've chosen to allow it to be wrong for the
        # IndentedBlock footer and manually override the state when parsing whitespace
        # in that particular node.
        whitespace_after = whitespace_before
        ct_end_pos = ct_start_pos
    else:
        # Not a dummy token, so update the whitespace state.

        # Compute our own end_pos, since parso's end_pos is wrong for triple-strings.
        lines = split_lines(ct_string)
        if len(lines) > 1:
            ct_end_pos = ct_start_pos[0] + len(lines) - 1, len(lines[-1])
        else:
            ct_end_pos = (ct_start_pos[0], ct_start_pos[1] + len(ct_string))

        # Figure out what mode the whitespace parser should use. If we're inside
        # parentheses, certain whitespace (e.g. newlines) are allowed where they would
        # otherwise not be. f-strings override and disable this behavior, however.
        #
        # Parso's tokenizer tracks this internally, but doesn't expose it, so we have to
        # duplicate that logic here.
        pof_stack = state.parenthesis_or_fstring_stack
        try:
            if ct_type is _FSTRING_START:
                pof_stack.append(_FSTRING_STACK_ENTRY)
            elif ct_type is _FSTRING_END:
                pof_stack.pop()
            elif ct_type is _OP:
                if ct_string in "([{":
                    pof_stack.append(_PARENTHESIS_STACK_ENTRY)
                elif ct_string in ")]}":
                    pof_stack.pop()
        except IndexError:
            # pof_stack may be empty by the time we need to read from it due to
            # mismatched braces.
            raise ParserSyntaxError(
                "Encountered a closing brace without a matching opening brace.",
                lines=state.lines,
                raw_line=ct_start_pos[0],
                raw_column=ct_start_pos[1],
            )

        # Parenthesized mode only applies when the innermost context is a
        # parenthesis; an enclosing f-string entry disables it.
        is_parenthesized = (
            len(pof_stack) > 0 and pof_stack[-1] == _PARENTHESIS_STACK_ENTRY
        )
        whitespace_after = WhitespaceState(
            ct_end_pos[0], ct_end_pos[1], state.indents[-1], is_parenthesized
        )

    # Hold onto whitespace_after, so we can use it as whitespace_before in the next
    # node.
    state.previous_whitespace_state = whitespace_after

    return Token(
        ct_type,
        ct_string,
        ct_start_pos,
        ct_end_pos,
        whitespace_before,
        whitespace_after,
        relative_indent,
    )
def tokenize(
    code: str, version_info: PythonVersionInfo
) -> Generator[Token, None, None]:
    """Produce a Token stream for ``code`` targeting ``version_info``.

    Thin wrapper that splits the source into lines (keeping newline
    characters) and delegates to ``tokenize_lines``.
    """
    return tokenize_lines(split_lines(code, keepends=True), version_info)