Example #1
0
    def __init__(self, master_file_path):
        """
        Instantiates the parser for the master file at `master_file_path`.
        @raises: `ValueError` if `master_file_path` is neither `None` nor a
                 string (since v1.4.0 the parser expects a file *path*, not a
                 file object).
        """
        path_is_acceptable = (
            master_file_path is None
            or isinstance(master_file_path, string_types)
        )
        if not path_is_acceptable:
            raise ValueError(
                "Since v1.4.0, the parser takes as an argument "
                "the path of the master file directly, rather "
                "than the file itself as before.")
        self._master_filepath = master_file_path

        self.input_file_manager = \
            InputFileManager.get_or_create(master_file_path)
        self.lexer = Lexer()
        self.ast = AST.get_or_create()

        # Parsing state, reset whenever a new file or declaration begins.
        self._last_indentation = None
        self._declaration_line_allowed = True
        self._current_variation_name = None
        self._current_unit_declaration = None
Example #2
0
class Parser(object):
    """Reads template files and translates their contents into an AST."""
    def __init__(self, master_file_path):
        """
        Sets up the parser so it can process the master file located at
        `master_file_path` (a path string, or `None`).
        @raises: `ValueError` when something other than a path (e.g. a file
                 object, as in versions before v1.4.0) is provided.
        """
        if master_file_path is not None:
            if not isinstance(master_file_path, string_types):
                raise ValueError(
                    "Since v1.4.0, the parser takes as an argument "
                    "the path of the master file directly, rather "
                    "than the file itself as before.")
        self._master_filepath = master_file_path

        self.input_file_manager = \
            InputFileManager.get_or_create(master_file_path)
        self.lexer = Lexer()
        self.ast = AST.get_or_create()

        # State used while walking the template line by line.
        self._declaration_line_allowed = True
        self._last_indentation = None
        self._current_unit_declaration = None
        self._current_variation_name = None

    def open_new_file(self, filepath):
        """
        Asks the file manager to open `filepath`, making the parser ready to
        parse it. Re-raises opening failures as an `IOError` with a clearer
        message; if the file is already opened, only warns and keeps parsing
        the current file.
        """
        try:
            self.input_file_manager.open_file(filepath)
        except IOError as e:
            message = (
                "There was an error while opening file '"
                + str(cast_to_unicode(filepath)) + "': " + str(e) + "."
            )
            raise IOError(message)
        except FileAlreadyOpened as e:
            warning = str(e)
            current_file_name = self.input_file_manager.get_current_file_name()
            if current_file_name is not None:
                warning += (
                    "\nContinuing the parsing of '" + str(current_file_name)
                    + "'."
                )
            print_warn(warning)

    def parse(self):
        """
        Parses the template file(s) and translates them into an AST.
        Reads the input line by line, lexes each line and dispatches on the
        type of its first token (file inclusion, rule or unit declaration).
        """
        print_DBG(
            "Parsing master file: "
            + self.input_file_manager.get_current_file_name()
        )

        while True:
            line = self.input_file_manager.read_line()
            if line is None:  # No more lines to read: end of file
                break
            parsing_a_slot = (
                self._current_unit_declaration is not None
                and self._current_unit_declaration.unit_type == UnitType.slot)
            tokens = remove_comment_tokens(
                self.lexer.lex(line, parsing_a_slot))

            if not tokens:
                continue

            first_token_type = tokens[0].type
            if first_token_type == TerminalType.file_inclusion_marker:
                self._parse_file_inclusion(tokens)
                # A newly included file starts with a clean parsing state.
                self._declaration_line_allowed = True
                self._last_indentation = None
                self._current_unit_declaration = None
                self._current_variation_name = None
            elif first_token_type == TerminalType.indentation:
                self._parse_rule_line(tokens)
                self._declaration_line_allowed = True
                self._last_indentation = tokens[0].text
            elif first_token_type in (TerminalType.alias_decl_start,
                                      TerminalType.slot_decl_start,
                                      TerminalType.intent_decl_start):
                self._parse_unit_declaration_line(tokens)
                self._declaration_line_allowed = False
                self._last_indentation = None
            else:
                self.input_file_manager.syntax_error(
                    "Couldn't parse this line: a line can be either "
                    "an empty line, a comment line, a file inclusion line, "
                    "a unit declaration or a rule."
                )

    def _parse_file_inclusion(self, lexical_tokens):
        """
        Opens the file referenced by the tokenized inclusion line
        `lexical_tokens` and logs which file is now being parsed.
        @pre: `lexical_tokens` contain a tokenized file inclusion line.
        """
        included_filepath = lexical_tokens[1].text
        self.open_new_file(included_filepath)
        print_DBG(
            "Parsing file: "
            + self.input_file_manager.get_current_file_name()
        )

    def _parse_unit_declaration_line(self, line_tokens):
        """
        Parses `line_tokens`, a whole tokenized line declaring a unit,
        and registers the declared unit (or its new variation) in the AST.
        Raises a syntax error when a declaration is not allowed here or
        when the unit/variation was already declared.
        """
        if not self._declaration_line_allowed:
            self.input_file_manager.syntax_error(
                "Didn't expect a unit declaration to start here.")

        (unit, variation) = self._parse_unit_declaration(line_tokens)

        try:
            self.ast.add_unit(unit)
        except ValueError as e:
            # The unit already exists: this is only fine when a *new*
            # variation of it is being declared.
            if variation is None:
                self.input_file_manager.syntax_error(str(e))
            elif variation in self.ast[unit.unit_type][unit.identifier]:
                self.input_file_manager.syntax_error(
                    "Variation '" + str(variation) + "' was already "
                    "declared for " + unit.full_name + "."
                )
            # otherwise: a brand new variation was declared, nothing to do
        self._current_variation_name = variation
        self._current_unit_declaration = unit

    def _parse_unit_declaration(self, lexical_tokens):
        """
        Parses the tokens `lexical_tokens` that contain a unit declaration.
        Returns a 2-tuple: the corresponding concrete unit and the name of
        the variation being declared (`None` if there is none).
        @raises: `ValueError` if the tokens don't start with a unit
                 declaration start token (should never happen).
        """
        if lexical_tokens[0].type == TerminalType.alias_decl_start:
            builder = AliasDefBuilder()
        elif lexical_tokens[0].type == TerminalType.slot_decl_start:
            builder = SlotDefBuilder()
        elif lexical_tokens[0].type == TerminalType.intent_decl_start:
            builder = IntentDefBuilder()
        else:  # Should never happen
            raise ValueError(
                "Tried to parse a line as if it was a unit declaration " + \
                "while it wasn't."
            )

        i = 1
        while i < len(lexical_tokens):
            token = lexical_tokens[i]
            if token.type == TerminalType.unit_identifier:
                builder.identifier = token.text
            elif token.type == TerminalType.casegen_marker:
                builder.casegen = True
            elif token.type == TerminalType.randgen_marker:
                builder.randgen = True
            elif token.type == TerminalType.randgen_name:
                builder.randgen_name = token.text
            elif token.type == TerminalType.variation_marker:
                pass  # The variation name itself comes in the next token.
            elif token.type == TerminalType.variation_name:
                builder.variation = token.text
            elif token.type == TerminalType.arg_marker:
                pass  # The argument name itself comes in the next token.
            elif token.type == TerminalType.arg_name:
                builder.arg_name = token.text
            elif (
                token.type in \
                (TerminalType.alias_decl_end,
                 TerminalType.slot_decl_end,
                 TerminalType.intent_decl_end)
            ):
                # End of the declaration proper; an annotation may follow.
                i += 1
                break
            else:
                raise ValueError(  # Should never happen
                    "Detected invalid token type in unit definition: " + \
                    token.type.name
                )
            i += 1

        if (i < len(lexical_tokens)
                and lexical_tokens[i].type == TerminalType.annotation_start):
            if not isinstance(builder, IntentDefBuilder):
                if isinstance(builder, AliasDefBuilder):
                    unit_type = "alias"
                else:
                    unit_type = "slot"
                # FIX: this warning previously referenced the undefined name
                # `identifier`, raising a `NameError` whenever an annotation
                # followed an alias or slot declaration.
                print_warn(
                    "Found an annotation when parsing " + unit_type + " '" + \
                    builder.identifier + "'\n" + \
                    "Annotations are currently only supported for intent " + \
                    "definitions. Any other annotation is ignored."
                )
            else:
                annotation_tokens = lexical_tokens[i:]
                annotation = self._annotation_tokens_to_dict(annotation_tokens)
                (nb_training_ex, nb_testing_ex) = \
                    self._parse_intent_annotation(annotation)
                builder.nb_training_ex = nb_training_ex
                builder.nb_testing_ex = nb_testing_ex

        return (builder.create_concrete(), builder.variation)

    def _annotation_tokens_to_dict(self, tokens):
        """
        Transforms the tokens `tokens` that contain an annotation into a
        dictionary holding the same key/value information.
        @pre: `tokens` really contains an annotation (starting at the
            beginning of the list).
        @raises: - `ValueError` if the precondition is not met.
                - `SyntaxError` if the annotation contains the same key twice.
        """
        if not tokens or tokens[0].type != TerminalType.annotation_start:
            raise ValueError(
                "Tried to parse tokens as if they were an annotation while " + \
                "they weren't"
            )

        key_value_pairs = dict()
        pending_key = None
        for token in tokens:
            token_type = token.type
            if token_type == TerminalType.annotation_end:
                break
            if token_type == TerminalType.key:
                pending_key = token.text
            elif token_type == TerminalType.value:
                # A value is attached to the most recently seen key
                # (`None` when the annotation has a single anonymous value).
                if pending_key in key_value_pairs:
                    self.input_file_manager.syntax_error(
                        "Annotation contained the key '" + pending_key + \
                        "' twice."
                    )
                key_value_pairs[pending_key] = token.text

        return key_value_pairs

    def _parse_intent_annotation(self, annotation):
        """
        Given a dict representing the annotation corresponding to an intent
        declaration, returns the number of examples asked in the training
        and testing sets (as a 2-tuple).
        Returns `None` instead of a number if a number was not provided.
        @raises - `SyntaxError` if the number of examples provided are
                  actually not integral numbers.
                - `SyntaxError` if the annotation contains the same information
                  at least twice.
        Prints a warning if the annotation contains unrecognized keys.
        """
        nb_training_ex = None
        nb_testing_ex = None
        for key in annotation:
            if key is None or key.lower() in ("training", "train"):
                if nb_training_ex is not None:
                    self.input_file_manager.syntax_error(
                        "Detected a number of examples for training set " + \
                        "several times."
                    )
                nb_training_ex = \
                    self._str_to_int(
                        annotation[key],
                        "Couldn't parse the annotation of the intent."
                    )
            elif key.lower() in ("testing", "test"):
                if nb_testing_ex is not None:
                    self.input_file_manager.syntax_error(
                        "Detected a number of examples for testing set " + \
                        "several times."
                    )
                nb_testing_ex = \
                    self._str_to_int(
                        annotation[key],
                        "Couldn't parse the annotation of the intent."
                    )
            else:
                print_warn("Unsupported key in the annotation: '" + key + "'.")
        return (nb_training_ex, nb_testing_ex)

    def _str_to_int(self, text, err_msg):
        """
        Transforms the str `text` into an int.
        @raises: `SyntaxError` with the message `err_msg` and a small message
                 explaining `text` is not a valid int
                 if the cast couldn't be performed.
        """
        try:
            return int(text)
        except ValueError:
            self.input_file_manager.syntax_error(
                err_msg + " '" + text + "' is not a valid integral number.")

    def _parse_rule_line(self, lexical_tokens):
        """
        Handles a line that is a rule within a unit definition: checks the
        indentation is consistent with the previous rule line, parses the
        rule and attaches it to the unit currently being declared.
        """
        indentation_text = lexical_tokens[0].text
        if (self._last_indentation is not None
                and indentation_text != self._last_indentation):
            self.input_file_manager.syntax_error("Inconsistent indentation.")
        if self._current_unit_declaration is None:
            self.input_file_manager.syntax_error(
                "Detected a rule outside a unit declaration.")

        # The first token is the indentation itself; skip it.
        parsed_rule = self._parse_rule(lexical_tokens[1:])
        self._current_unit_declaration.add_rule(
            parsed_rule, self._current_variation_name)

        Stats.get_or_create().new_rule_parsed()

    def _parse_rule(self, tokens):
        """
        Handles the tokens `tokens` that contain a rule (inside a unit
        definition).
        Returns the rule (`Rule`) that `tokens` represent.
        @raises: `ValueError` if a token that cannot appear inside a rule
                 is encountered (should never happen).
        """
        # TODO replace this with a (stateful) iterator to make it more readable
        rule_contents = []
        current_builder = None
        leading_space = False
        slot_value = None
        i = 0
        while i < len(tokens):
            token = tokens[i]
            if token.type == TerminalType.whitespace:
                leading_space = True
                # Whitespace terminates the reference being built, if any.
                if current_builder is not None:
                    rule_contents.append(current_builder.create_concrete())
                    current_builder = None
            # Units and rule contents
            elif token.type == TerminalType.word:
                rule_contents.append(Word(token.text, leading_space))
                leading_space = False
            elif (
                token.type in \
                (TerminalType.alias_ref_start,
                    TerminalType.slot_ref_start,
                    # FIX: this was `TerminalType.intent_ref_end`, which made
                    # intent reference *start* tokens fall into the invalid
                    # token branch below (raising `ValueError`; the
                    # `intent_ref_start` check a few lines down was
                    # unreachable) and wrongly treated an intent reference
                    # *end* token as the start of a new reference.
                    TerminalType.intent_ref_start)
            ):
                if current_builder is not None:
                    rule_contents.append(current_builder.create_concrete())
                current_builder = UnitRefBuilder()
                current_builder.leading_space = leading_space
                if token.type == TerminalType.alias_ref_start:
                    current_builder.type = UnitType.alias
                elif token.type == TerminalType.slot_ref_start:
                    current_builder.type = UnitType.slot
                elif token.type == TerminalType.intent_ref_start:
                    current_builder.type = UnitType.intent
            elif (
                token.type in \
                (TerminalType.alias_ref_end,
                 TerminalType.slot_ref_end,
                 TerminalType.intent_ref_end)
            ):
                rule_contents.append(current_builder.create_concrete())
                current_builder = None
                leading_space = False
            elif token.type == TerminalType.unit_identifier:
                current_builder.identifier = token.text
            elif token.type == TerminalType.choice_start:
                if current_builder is not None:
                    rule_contents.append(current_builder.create_concrete())
                current_builder = ChoiceBuilder()
                current_builder.leading_space = leading_space
                last_internal_choice_token = \
                    find_index_last_choice_content(tokens, i)
                if last_internal_choice_token is not None:
                    i += 1
                    if tokens[i].type == TerminalType.casegen_marker:
                        current_builder.casegen = True
                        i += 1
                    # Parse the inside of the choice recursively.
                    internal_rules = \
                        self._parse_choice(
                            tokens[i:last_internal_choice_token + 1]
                        )
                    current_builder.rules = internal_rules
                    # Resume right before the choice end token.
                    i = last_internal_choice_token
                else:
                    self.input_file_manager.syntax_error(
                        "Inconsistent choice start and ending.")
            elif token.type == TerminalType.choice_end:
                rule_contents.append(current_builder.create_concrete())
                current_builder = None
                leading_space = False
            # Modifiers
            elif token.type == TerminalType.casegen_marker:
                current_builder.casegen = True
            elif token.type == TerminalType.randgen_marker:
                current_builder.randgen = True
            elif token.type == TerminalType.opposite_randgen_marker:
                current_builder.randgen_opposite = True
            elif token.type == TerminalType.randgen_name:
                current_builder.randgen_name = token.text
            elif token.type == TerminalType.percentgen_marker:
                pass  # The percentage itself comes in the next token.
            elif token.type == TerminalType.percentgen:
                current_builder.randgen_percent = \
                    self._str_to_int(
                        token.text,
                        "Couldn't parse the percentage " + \
                        "for the random generation modifier."
                    )
            elif token.type == TerminalType.variation_marker:
                pass  # The variation name itself comes in the next token.
            elif token.type == TerminalType.variation_name:
                current_builder.variation = token.text
            elif token.type == TerminalType.arg_marker:
                pass  # The argument value itself comes in the next token.
            elif token.type == TerminalType.arg_value:
                current_builder.arg_value = token.text
            elif token.type == TerminalType.slot_val_marker:
                pass  # The slot value itself comes in the next token.
            elif token.type == TerminalType.slot_val:
                slot_value = token.text
            else:
                raise ValueError(  # Should never happen
                    "Detected invalid token type in rule: " + \
                    token.type.name + " for text '" + token.text + "'."
                )
            i += 1
        # Flush a builder that was still open at the end of the line.
        if current_builder is not None:
            rule_contents.append(current_builder.create_concrete())

        if self._current_unit_declaration is not None:
            return Rule(self._current_unit_declaration.full_name,
                        rule_contents, slot_value)
        # NOTE can only come from an interactive command (the 'rule' command)
        return Rule(None, rule_contents, slot_value)

    def _parse_choice(self, tokens):
        """
        Splits `tokens` (the contents of a choice) on choice separators and
        parses each part as a rule, skipping over nested choices so their
        separators are not mistaken for top-level ones.
        Returns the list of parsed rules.
        """
        rules = []

        rule_start = 0
        i = 0
        while i < len(tokens):
            current_type = tokens[i].type
            if current_type == TerminalType.choice_sep:
                rules.append(self._parse_rule(tokens[rule_start:i]))
                rule_start = i + 1
            if current_type == TerminalType.choice_start:
                # Jump over the whole nested choice.
                matching_end = find_matching_choice_end(tokens, i)
                if matching_end is None:
                    self.input_file_manager.syntax_error(
                        "Inconsistent choice starts and endings.")
                i = matching_end
            i += 1

        # Whatever remains after the last separator is the final rule.
        rules.append(self._parse_rule(tokens[rule_start:i]))

        return rules