def __init__(self, options_dict):
    """Populate the options from ``options_dict``, falling back to defaults.

    Every name listed in ``self._defaults`` is taken from ``options_dict``
    when present, otherwise its default is used. The resulting mapping is
    stored as ``options`` in the instance ``__dict__``.

    Args:
        options_dict: mapping of option name to value. It is copied first,
            so the caller's mapping is not modified.

    Raises:
        ConfigurationError: if ``parser`` is not one of the accepted values,
            if a transformer is combined with the Earley parser, or if
            ``options_dict`` contains names that are not in ``_defaults``.
    """
    # Work on a copy: recognised names are popped so leftovers can be reported.
    o = dict(options_dict)

    options = {}
    for name, default in self._defaults.items():
        if name in o:
            value = o.pop(name)
            # Options with a boolean default are coerced to bool, except the
            # listed ones. ('cache' may be a path string and 'use_bytes' may
            # be 'force' elsewhere in this file; presumably
            # 'propagate_positions' also accepts non-bool values -- TODO confirm.)
            if isinstance(default, bool) and name not in ('cache', 'use_bytes', 'propagate_positions'):
                value = bool(value)
        else:
            value = default

        options[name] = value

    # A single start symbol is normalised to a one-element list.
    if isinstance(options['start'], STRING_TYPE):
        options['start'] = [options['start']]

    # Written straight into __dict__ rather than via normal attribute
    # assignment (presumably to bypass a custom __setattr__ -- TODO confirm).
    self.__dict__['options'] = options

    assert_config(self.parser, ('earley', 'lalr', 'cyk', None))

    if self.parser == 'earley' and self.transformer:
        # Note the trailing space: the two literals are concatenated.
        raise ConfigurationError(
            'Cannot specify an embedded transformer when using the Earley algorithm. '
            'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)'
        )

    if o:
        # list(o) renders as "['name', ...]" on both Python 2 and 3, whereas
        # o.keys() renders as "dict_keys([...])" on Python 3.
        raise ConfigurationError("Unknown options: %s" % list(o))
def _load(self, f, **kwargs):
    """Restore this instance from previously serialized parser data.

    Args:
        f: either an already-loaded dict with ``'memo'`` and ``'data'`` keys,
            or a file object that is read with ``pickle.load``.
            NOTE: ``pickle.load`` must only be used on trusted input.
        **kwargs: option overrides applied on top of the stored options.

    Returns:
        ``self``, after its options, rules, parser, lexer configuration and
        terminals have been set from the loaded data.

    Raises:
        ConfigurationError: if ``kwargs`` contains a known option that is
            not in ``_LOAD_ALLOWED_OPTIONS``.
    """
    payload = f if isinstance(f, dict) else pickle.load(f)
    raw_memo = payload['memo']
    data = payload['data']

    assert raw_memo
    memo = SerializeMemoizer.deserialize(
        raw_memo,
        {'Rule': Rule, 'TerminalDef': TerminalDef},
        {},
    )

    # Stored options first, then the caller's overrides (if they are permitted).
    options = dict(data['options'])
    not_allowed = set(kwargs) - _LOAD_ALLOWED_OPTIONS
    if not_allowed & set(LarkOptions._defaults):
        raise ConfigurationError("Some options are not allowed when loading a Parser: {}"
                                 .format(not_allowed))
    options.update(kwargs)

    self.options = LarkOptions.deserialize(options, memo)
    self.rules = [Rule.deserialize(rule_data, memo) for rule_data in data['rules']]
    self.source_path = '<deserialized>'
    self._prepare_callbacks()

    self.parser = self.parser_class.deserialize(
        data['parser'],
        memo,
        self._callbacks,
        self.options,  # Not all, but multiple attributes are used
    )

    # The lexer configuration and terminals are taken from the restored parser.
    lexer_conf = self.parser.lexer_conf
    self.lexer_conf = lexer_conf
    self.terminals = lexer_conf.terminals
    self._terminals_dict = {term.name: term for term in self.terminals}
    return self
def __init__(self, grammar, **options):
    """Build the parser (or just the lexer) for ``grammar``.

    Args:
        grammar: the grammar text, an object with a ``read()`` method that
            returns it, or a ``Grammar`` instance.
        **options: keyword options, validated by ``LarkOptions``.

    Raises:
        ImportError: if the ``regex`` option is set but the ``regex`` module
            is not available.
        ConfigurationError: if an option value is invalid or options are
            incompatible with each other.
    """
    self.options = LarkOptions(options)

    # Set regex or re module
    use_regex = self.options.regex
    if use_regex:
        # `regex` is a module-level name; presumably falsy when the optional
        # import failed -- TODO confirm.
        if regex:
            re_module = regex
        else:
            raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
    else:
        re_module = re

    # Some, but not all file-like objects have a 'name' attribute
    if self.options.source_path is None:
        try:
            self.source_path = grammar.name
        except AttributeError:
            self.source_path = '<string>'
    else:
        self.source_path = self.options.source_path

    # Drain file-like objects to get their contents
    try:
        read = grammar.read
    except AttributeError:
        pass
    else:
        grammar = read()

    # Both stay None unless caching is enabled for a textual grammar.
    cache_fn = None
    cache_md5 = None
    if isinstance(grammar, STRING_TYPE):
        self.source_grammar = grammar
        if self.options.use_bytes:
            if not isascii(grammar):
                raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
            if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
                # Trailing space added: the two literals are concatenated.
                raise ConfigurationError("`use_bytes=True` may have issues on python2. "
                                         "Use `use_bytes='force'` to use it at your own risk.")

        if self.options.cache:
            if self.options.parser != 'lalr':
                raise ConfigurationError("cache only works with parser='lalr' for now")

            # The cache key is built from the grammar text, the passed options
            # (minus the ones listed here), the package version and the
            # Python major/minor version.
            unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
            options_str = ''.join(k + str(v) for k, v in options.items() if k not in unhashable)
            from . import __version__
            s = grammar + options_str + __version__ + str(sys.version_info[:2])
            cache_md5 = hashlib.md5(s.encode('utf8')).hexdigest()

            if isinstance(self.options.cache, STRING_TYPE):
                # A string value is used directly as the cache file path.
                cache_fn = self.options.cache
            else:
                if self.options.cache is not True:
                    raise ConfigurationError("cache argument must be bool or str")
                # Python2.7 doesn't support * syntax in tuples
                cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % ((cache_md5,) + sys.version_info[:2])

            if FS.exists(cache_fn):
                logger.debug('Loading grammar from cache: %s', cache_fn)
                # Remove options that aren't relevant for loading from cache
                for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
                    del options[name]
                with FS.open(cache_fn, 'rb') as f:
                    old_options = self.options
                    try:
                        # File layout: md5 line, pickled used-files, pickled parser data.
                        # NOTE: pickle.load must only be used on trusted files.
                        file_md5 = f.readline().rstrip(b'\n')
                        cached_used_files = pickle.load(f)
                        if file_md5 == cache_md5.encode('utf8') and verify_used_files(cached_used_files):
                            cached_parser_data = pickle.load(f)
                            self._load(cached_parser_data, **options)
                            return
                    except Exception:  # We should probably narrow down which errors we catch here.
                        # Lazy %-args: formatting is left to the logging module.
                        logger.exception("Failed to load Lark from cache: %r. We will try to carry on.", cache_fn)

                        # In theory, the Lark instance might have been messed up by the call to `_load`.
                        # In practice the only relevant thing that might have been overridden should be `options`
                        self.options = old_options

        # Parse the grammar file and compose the grammars
        self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
    else:
        assert isinstance(grammar, Grammar)
        self.grammar = grammar

    # Resolve lexer='auto' according to the chosen parser.
    if self.options.lexer == 'auto':
        if self.options.parser == 'lalr':
            self.options.lexer = 'contextual'
        elif self.options.parser == 'earley':
            if self.options.postlex is not None:
                logger.info("postlex can't be used with the dynamic lexer, so we use standard instead. "
                            "Consider using lalr with contextual instead of earley")
                self.options.lexer = 'standard'
            else:
                self.options.lexer = 'dynamic'
        elif self.options.parser == 'cyk':
            self.options.lexer = 'standard'
        else:
            assert False, self.options.parser
    lexer = self.options.lexer
    if isinstance(lexer, type):
        assert issubclass(lexer, Lexer)     # XXX Is this really important? Maybe just ensure interface compliance
    else:
        assert_config(lexer, ('standard', 'contextual', 'dynamic', 'dynamic_complete'))
        if self.options.postlex is not None and 'dynamic' in lexer:
            raise ConfigurationError("Can't use postlex with a dynamic lexer. Use standard or contextual instead")

    # ambiguity='auto' becomes 'resolve' for earley; an explicit ambiguity
    # setting is only accepted with the earley or cyk parsers.
    if self.options.ambiguity == 'auto':
        if self.options.parser == 'earley':
            self.options.ambiguity = 'resolve'
    else:
        assert_config(self.options.parser, ('earley', 'cyk'), "%r doesn't support disambiguation. Use one of these parsers instead: %s")

    if self.options.priority == 'auto':
        self.options.priority = 'normal'

    if self.options.priority not in _VALID_PRIORITY_OPTIONS:
        raise ConfigurationError("invalid priority option: %r. Must be one of %r" % (self.options.priority, _VALID_PRIORITY_OPTIONS))
    assert self.options.ambiguity not in ('resolve__antiscore_sum', ), 'resolve__antiscore_sum has been replaced with the option priority="invert"'
    if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
        raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))

    if self.options.postlex is not None:
        terminals_to_keep = set(self.options.postlex.always_accept)
    else:
        terminals_to_keep = set()

    # Compile the EBNF grammar into BNF
    self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start, terminals_to_keep)

    # Give the user-supplied callback a chance to see each terminal.
    if self.options.edit_terminals:
        for t in self.terminals:
            self.options.edit_terminals(t)

    self._terminals_dict = {t.name: t for t in self.terminals}

    # If the user asked to invert the priorities, negate them all here.
    # This replaces the old 'resolve__antiscore_sum' option.
    if self.options.priority == 'invert':
        for rule in self.rules:
            if rule.options.priority is not None:
                rule.options.priority = -rule.options.priority
    # Else, if the user asked to disable priorities, strip them from the
    # rules. This allows the Earley parsers to skip an extra forest walk
    # for improved performance, if you don't need them (or didn't specify any).
    elif self.options.priority is None:
        for rule in self.rules:
            if rule.options.priority is not None:
                rule.options.priority = None

    # TODO Deprecate lexer_callbacks?
    self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes)

    # With a parser configured, build the parser; otherwise build only the lexer.
    if self.options.parser:
        self.parser = self._build_parser()
    elif lexer:
        self.lexer = self._build_lexer()

    # cache_fn is only set on the textual-grammar path, where used_files is
    # also defined.
    if cache_fn:
        logger.debug('Saving grammar to cache: %s', cache_fn)
        with FS.open(cache_fn, 'wb') as f:
            f.write(cache_md5.encode('utf8') + b'\n')
            pickle.dump(used_files, f)
            self.save(f)
def assert_config(value, options, msg='Got %r, expected one of %s'):
    """Check that ``value`` is one of the permitted ``options``.

    Args:
        value: the configuration value to check.
        options: container of accepted values (tested with ``in``).
        msg: %-style template that receives ``(value, options)``.

    Raises:
        ConfigurationError: if ``value`` is not in ``options``.
    """
    if value in options:
        return
    raise ConfigurationError(msg % (value, options))