Example #1
0
 def initialize_queue(self, strategy):
     '''
     Create the edge queue that belongs to the requested parsing
     strategy and store it in self.queue.

     strategy: one of 'fifo', 'bestfirst' or 'altsearch'

     Raises QueueException for any other strategy value.
     '''
     if strategy == 'fifo':
         self.queue = Queue()
         return

     if strategy == 'bestfirst':
         self.queue = BestFirstQueue()
         return

     if strategy == 'altsearch':
         # The alt-search queue is the only one that is constructed
         # with an argument (sentence length plus one).
         self.queue = AltSearchQueue(self.sentence_length+1)
         return

     # Nothing matched: report the offending value to the caller
     message = 'Invalid strategy (%s). Please try again and choose a strategy from the following set: {fifo, bestfirst, altsearch}' % strategy
     raise QueueException(message)
Example #2
0
class BottomUpChartParser:
    '''
    Bottom-up chart parser that is driven by a queue of edges.

    Edges are taken from the queue one at a time, stored in the chart
    and combined with the edges that are already there (predict rule
    and fundamental rule) until the queue runs empty or the requested
    number of parses has been found.
    '''

    grammar = None  # Grammar object that includes lexicon and
                    # production rules
    queue = None    # Queue object on which new edges are stacked
    chart = None    # Chart object in which edges are stored for the
                    # final parse generation
    sentence_length = 0     # Number of tokens of the sentence that is
                            # currently being parsed
    will_print_chart = True # Set to false if you want to deactivate printing of the found parses

    def __init__(self, grammar):
        '''
        Wrap the given grammar description in a Grammar object.
        '''
        self.grammar = Grammar(grammar)

    def parse(self, sentence, number_of_parses, strategy):
        '''
        Parse the input sentence and print the parses that were found.

        This is the central method to be called from outside.

        sentence: string that is split into tokens at whitespace
        number_of_parses: stop as soon as the chart holds this many
            s-edges; -1 disables the limit
        strategy: 'fifo', 'bestfirst' or 'altsearch'

        Raises ParseException if the sentence contains words that are
        not in the lexicon, or (via display_parses) if
        will_print_chart is set and no parse was found.
        Raises QueueException (via initialize_queue) for an unknown
        strategy.
        '''
        ### Preprocessing ###
        # Tokenize input sentence
        tokens = self.tokenize(sentence)
        self.sentence_length = len(tokens)

        # Check for unknown tokens
        unknown_words = self.get_unknown_words(tokens)
        if unknown_words:
            # TODO: Run fallback solutions to fix unknown words, else
            # raise exception
            raise ParseException("Sentence contains unknown words (%s). Please try again!" % ', '.join(unknown_words))

        ### Main steps ###
        # (1) Initialize empty chart and queue
        self.initialize_chart()
        self.initialize_queue(strategy)

        # (2) For every token, create a complete edge and push it to
        #     the queue
        self.init_rule(tokens)

        # Iteration counter for evaluation purposes
        iters = 0

        # (3) Repeat until no more edges are added
        #     or sufficient number of parses has been found:
        while not self.queue.is_empty() and not self.enough_parses_found(number_of_parses):
            iters += 1
            # (3.1) Add next element on queue to the chart
            edge = self.queue.get_next_edge()
            self.chart.add_edge(edge)

            # (3.2) If input edge is complete,
            #       apply predict rule and fundamental rule.
            #       If input edge is incomplete,
            #       apply fundamental rule only
            if edge.is_complete():
                self.predict_rule(edge)

            self.fundamental_rule(edge)

            # (3.3) For alt search strategy, run search rule
            #       if input edge is a complete parse
            #       or last element of priority queue
            if strategy == 'altsearch':
                if (((not self.queue.is_priority_active())   # Case 1: Complete parse was added to chart
                     and edge.get_prod_rule().get_lhs() == 'S'
                     and edge.is_complete()
                     and edge.get_start() == 0
                     and edge.get_end() == self.sentence_length)
                        or (self.queue.is_priority_active()  # Case 2: Priority queue emptied
                            and self.queue.is_priority_empty())):
                    self.search_rule(edge)

        # (4) Display generated parses
        # The parenthesised single-argument form prints the same text
        # under the Python 2 print statement and the print function.
        s_edges = self.chart.get_s_edges()
        print('%s parses found after %s iterations:' % (len(s_edges), iters))
        if self.will_print_chart:
            self.display_parses()
        else:
            for s_edge in s_edges:
                print('Found s-edge: %s' % s_edge)

    def tokenize(self, sentence):
        '''
        Separate a sentence into a list of tokens and return the list.
        Currently this simply splits at each whitespace character with no
        special preprocessing
        '''
        return sentence.split()

    def get_unknown_words(self, tokens):
        '''
        Check list of tokens for unknown words by consulting the
        lexicon and return them (in input order, duplicates included)
        '''
        lexicon = self.grammar.get_lexicon()
        return [token for token in tokens if token not in lexicon]

    def initialize_chart(self):
        '''
        Initialize chart
        Size of chart will be sentence_length+1 in both dimensions
        '''
        self.chart = Chart(self.sentence_length)

    def initialize_queue(self, strategy):
        '''
        Initialize queue according to the parsing strategy chosen by
        the user

        Raises QueueException if the strategy is not one of
        'fifo', 'bestfirst' or 'altsearch'.
        '''
        if strategy == 'fifo':
            self.queue = Queue()
        elif strategy == 'bestfirst':
            self.queue = BestFirstQueue()
        elif strategy == 'altsearch':
            self.queue = AltSearchQueue(self.sentence_length+1)
        else:
            raise QueueException('Invalid strategy (%s). Please try again and choose a strategy from the following set: {fifo, bestfirst, altsearch}' % strategy)

    def init_rule(self, tokens):
        '''
        Generate initial edges for all given tokens and add them to
        the queue

        Formal definition:
            For every word w_i add the edge [w_i -> . , (i, i+1)]
        '''
        # node is the position between tokens of the sentence
        # (0 is start of sentence)
        for node, token in enumerate(tokens):
            rule = ProductionRule(token, [], 1.0)
            edge = Edge(node, node+1, rule, 0, [])
            self.queue.add_edge(edge)

    def enough_parses_found(self, number_of_parses):
        '''
        Check if enough parses have been found for the input sentence

        Return True if the number of complete S edges that the chart
        contains is >= the number of parses that the user wants, else
        False. A value of -1 means "no limit" and always yields False.
        '''
        return not (number_of_parses == -1
                    or len(self.chart.get_s_edges()) < number_of_parses)

    def _push_if_new(self, new_edge):
        '''
        Push an edge to the queue unless the queue or the chart
        already contains it.
        '''
        if not self.queue.has_edge(new_edge) and not self.chart.has_edge(new_edge):
            self.queue.add_edge(new_edge)

    def predict_rule(self, complete_edge):
        '''
        If the LHS of a complete edge can be the first RHS element of
        a production rule, create a self-loop edge with that rule and
        push it to the queue

        Input: Complete edge
        Push to queue: Incomplete self-loop edges

        Formal definition:
            For each complete edge [A -> alpha . , (i, j)]
            and each production rule  B -> A beta,
            add the self-loop edge [B -> . A beta , (i, i)]
        '''
        start = complete_edge.get_start()
        lhs = complete_edge.get_prod_rule().get_lhs()

        for parent_rule in self.grammar.get_possible_parent_rules(lhs):
            self._push_if_new(Edge(start, start, parent_rule, 0, []))

    def fundamental_rule(self, input_edge):
        '''
        If an incomplete edge can be advanced by a complete edge,
        create a new edge with the advanced dot.

        Create new edges (which can be complete or incomplete) by
        "advancing the dot", i.e. by matching incomplete edges with
        appropriate complete ones:

        (1) If the input edge is incomplete, find all complete edges
            - whose start node equals the end node of the input edge
            - whose LHS matches the RHS element
              that the input edge is currently looking for.
            If the input edge is complete, find all incomplete edges
            - whose end node equals the start node of the input edge
            - whose dot can be advanced by pairing them with the input
              edge.
        (2) From every pairing, create a new edge with the dot
            advanced over the RHS element that has just been found.
        (3) Push that edge to the queue IFF it does not exist already,
            i.e. if it has not been added to the chart or the queue
            before. This constraint keeps the parser from entering an
            infinite loop when using left-recursive grammar rules.

        Input: Single edge
        Push to queue: Complete and incomplete edges

        Formal definition:
            If the chart contains the edges [A -> alpha . B beta, (i, j)]
            and [B -> gamma . , (j, k)]
            then add a new edge [A -> alpha B . beta, (i, k)].
        '''
        if input_edge.is_complete():
            j = input_edge.get_start()
            incomplete_edges = [edge for edge
                                in self.chart.get_edges_ending_at(j)
                                if not edge.is_complete()]
            complete_edges = [input_edge]
        else:
            j = input_edge.get_end()
            incomplete_edges = [input_edge]
            complete_edges = [edge for edge
                              in self.chart.get_edges_starting_at(j)
                              if edge.is_complete()]

        ### New Edges ###
        for incomp_edge in incomplete_edges:

            # Prepare info from incomplete edge that is necessary to ...
            prod_rule = incomp_edge.get_prod_rule()
            dot = incomp_edge.get_dot()
            next_missing_dtr = prod_rule.get_rhs_element(dot)
            for comp_edge in complete_edges:

                # ... check for compatibility with complete edges:
                if next_missing_dtr != comp_edge.get_prod_rule().get_lhs():
                    continue

                # Prepare additional info from incomplete edge
                i = incomp_edge.get_start()
                known_dtrs = incomp_edge.get_known_dtrs()

                # Prepare info from complete edge
                k = comp_edge.get_end()

                # Combine info from both edges,
                # and use it to create new edge
                new_dtrs = known_dtrs + [comp_edge]
                new_edge = Edge(i, k, prod_rule, dot+1, new_dtrs)

                # Add new edge to queue
                self._push_if_new(new_edge)

    def search_rule(self, s_edge):
        '''
        Scans queue for priority edges. See project report for
        detailed explanation.

        After activating the priority queue, a further complete
        s-edge is looked up on the queue and, if there is one, handed
        back to the queue via add_edge. Otherwise the parse trees of
        all s-edges in the chart are walked level by level, and for
        every daughter all queued edges with the same LHS and span
        are handed back to the queue via add_edge.

        s_edge: the edge that triggered the rule; it is not used by
            the current implementation.
        '''
        self.queue.activate_priority_queue()

        # Check for further complete s-edges on queue
        new_s_edge = self.queue.get_next_particular_edge('S', 0, self.sentence_length)
        if new_s_edge is not None:  # Further s-edge was found
            self.queue.add_edge(new_s_edge)
            return

        # No further s-edge remained on queue.
        # Sort s-edges by likelihood (ascending, so that pop() yields
        # the most probable parse first). Sorting into a new list also
        # keeps the list returned by the chart untouched.
        sorted_edges = sorted(self.chart.get_s_edges(),
                              key=lambda edge: edge.get_prob())

        # NOTE(review): the original loops were additionally guarded
        # by `self.queue.is_empty` WITHOUT call parentheses, which is
        # always truthy. That effective behaviour (visit every s-edge
        # and every tree level) is kept here; confirm against
        # AltSearchQueue whether an early stop on is_empty() or
        # is_priority_empty() was intended before adding one.
        while sorted_edges:
            dtrs = sorted_edges.pop().get_known_dtrs()
            # Iterate through depths until tree is exhausted
            while dtrs:
                for dtr in dtrs:
                    lhs = dtr.get_prod_rule().get_lhs()
                    start = dtr.get_start()
                    end = dtr.get_end()
                    alt_dtr = self.queue.get_next_particular_edge(lhs, start, end)
                    while alt_dtr is not None:
                        self.queue.add_edge(alt_dtr)
                        alt_dtr = self.queue.get_next_particular_edge(lhs, start, end)

                # Move down exactly one level: the daughters of the
                # edges that have just been checked come next.
                next_level = []
                for mthr in dtrs:
                    next_level.extend(mthr.get_known_dtrs())
                dtrs = next_level

    def display_parses(self):
        '''
        Display parse trees for all successful parses

        Raises ParseException if the chart contains no s-edge.
        '''
        s_edges = self.chart.get_s_edges()

        if not s_edges:
            raise ParseException("No parse could be found.")

        for s_edge in s_edges:
            parse_string = self.build_parse_string_from_edge(s_edge, 'S')
            print(self.add_indentation_to_parse_string(parse_string) + '\t' + str(s_edge.get_prob()))

    def build_parse_string_from_edge(self, edge, root):
        '''
        Recursively work your way down through the known daughters of
        the input edge; return a bracketed structure representing the
        parse tree.

        In order to obtain a complete structure, this
        method needs to be called with a string representing
        the appropriate tree root (as the second argument)
        '''
        # An edge without daughters contributes nothing, so root is
        # returned unchanged in that case.
        subtrees = [' [ ' + dtr.get_prod_rule().get_lhs()
                    + self.build_parse_string_from_edge(dtr, '') + ' ]'
                    for dtr in edge.get_known_dtrs()]
        return root + ''.join(subtrees)

    def add_indentation_to_parse_string(self, parse_string):
        '''
        Convert flat string representation of parse to appropriately
        indented structure

        Every opening bracket starts a new line that is indented with
        one tab per nesting level; all other characters are copied
        unchanged.
        '''
        parse_string = '[ ' + parse_string + ' ]'
        pieces = []
        level = -1
        for char in parse_string:
            if char == '[':
                level += 1
                pieces.append('\n' + '\t'*level + char)
            else:
                if char == ']':
                    level -= 1
                pieces.append(char)
        return ''.join(pieces)