コード例 #1
0
class PCFGParser(Parser):
    def __init__(self):
        self.lexicon = Lexicon()
        self.grammar = Grammar()
        self.parsing_triangle = defaultdict(set)
        self.cell_capacity = 3  #represents the maximum capacity of a cell inside the Parsing Tree

    def write_par(self, train_trees, grammar_file, lexicon_file):
        self.train(train_trees)
        #write Grammar
        lst = []
        lst.append(self.grammar.unary_rules_by_child)
        lst.append(self.grammar.binary_rules_by_left_child)
        lst.append(self.grammar.binary_rules_by_right_child)
        with file(grammar_file, "w") as fout:
            pickle.dump(lst, fout)

        #write Lexicon
        lst = []
        lst.append(self.lexicon.total_tokens)
        lst.append(self.lexicon.total_word_types)
        lst.append(self.lexicon.word_to_tag_counters)
        lst.append(self.lexicon.tag_counter)
        lst.append(self.lexicon.word_counter)
        lst.append(self.lexicon.type_to_tag_counter)
        with file(lexicon_file, "w") as fout:
            pickle.dump(lst, fout)

    def read(self, grammar_file, lexicon_file):
        self.lexicon.read_lexicon(lexicon_file)
        self.grammar.read_grammar(grammar_file)

    def train(self, train_trees):
        self.lexicon.train(train_trees)
        binarized_train_trees = [
            TreeAnnotations.binarize_tree(t) for t in train_trees
        ]
        self.grammar.train(binarized_train_trees)

    def print_parsing_traingle(self):
        """
        This function is used to pring the Parsing Traingle. It's very important for debugging, except that
        you can delete it.
        """
        for t, rules in sorted(self.parsing_triangle.iteritems()):
            for rule in rules:
                print t, str(rule),

    def get_cell_tags(self, row, col):
        #returns the 'tag'  and 'score' of the cell at 'row' and 'col'
        return [(rule.parent, rule.score) for rule in \
            sorted(self.parsing_triangle[row, col], key=lambda rule:rule.score, reverse = True)]\
            [:self.cell_capacity]

    def get_cell_pairs(self, row, level):
        result = []
        col = row + level
        for i in range(level):
            _tuple = ((row, row + i), (row + i + 1, col))
            result.append(_tuple)
        assert len(result) == level
        return result

    def get_best_parse(self, sentence):
        """
        Should return a Tree.
        'sentence' is a list of strings (words) that form a sentence.
        """
        print " ".join(sentence)
        n = len(sentence)

        # ------------------------- (Level 0) -------------------------
        # --------------- Initializing Parsing Traingle ---------------

        possible_tags = self.lexicon.tag_counter.keys()
        for i, word in enumerate(sentence):
            for tag in possible_tags:
                if self.lexicon.word_to_tag_counters[word][tag] != 0:
                    unary_rule = UnaryRule(tag, word)
                    unary_rule.score = self.lexicon.score_tagging(word, tag)
                    self.parsing_triangle[(i, i)].add(unary_rule)

        self.print_parsing_traingle()
        print "\n\n"
        # ------------- End Initializing Parsing Traingle -------------
        # ------------------ Expanding First Cells --------------------
        key = None
        while (key != 0):
            key = 0
            for row, col in sorted(self.parsing_triangle.keys()):
                tags_scores = self.get_cell_tags(row, col)
                for tag, score in tags_scores:
                    rules = self.grammar.get_unary_rules_by_child(tag)
                    rules = sorted(rules,
                                   key=lambda rule: rule.score,
                                   reverse=True)[:self.cell_capacity]
                    print tag, len(rules)
                    for rule in rules:
                        if rule not in self.parsing_triangle[(row, col)]:
                            key += 1
                            tmp_rule = copy.copy(rule)
                            tmp_rule.score = score * rule.score
                            tmp_rule.origin = (row, col)
                            tmp_rule.des = (row, col)
                            self.parsing_triangle[(row, col)].add(tmp_rule)

                # self.print_parsing_traingle()
                # print "\n"
            # break
        print "Finished Level 0"
        # ---------------- End Expanding First Cells ------------------

        # self.print_parsing_traingle()

        # ----------------------- (Level 1...n) -----------------------
        # ------------------- Filling Other Cells ---------------------
        level = 1
        while (level != n):
            for row in range(n - level):
                col = row + level
                cell_pairs = self.get_cell_pairs(row, level)
                for cell_pair in cell_pairs:
                    first_tags_scores = self.get_cell_tags(
                        cell_pair[0][0], cell_pair[0][1])
                    second_tags_scores = self.get_cell_tags(
                        cell_pair[1][0], cell_pair[1][1])
                    for tag1, score1 in first_tags_scores:
                        rules1 = self.grammar.get_binary_rules_by_left_child(
                            tag1)
                        for tag2, score2 in second_tags_scores:
                            rules2 = self.grammar.get_binary_rules_by_right_child(
                                tag2)
                            common_rules = set(rules1).intersection(
                                set(rules2))
                            for rule in common_rules:
                                tmp_rule = copy.copy(rule)
                                tmp_rule.score = score1 * score2 * rule.score
                                tmp_rule.origin = cell_pair
                                tmp_rule.des = (row, col)
                                self.parsing_triangle[row, col].add(tmp_rule)
                # ------------- Findin Better Unary Rules -------------
                #I think this part needs more debugging
                # key = None
                # while(key != 0):
                #     key = 0
                #     tags_scores = self.get_cell_tags(row, col)
                #     tags = [t[0] for t in tags_scores]
                #     for tag, score in tags_scores:
                #         rules = self.grammar.get_unary_rules_by_child(tag)
                #         for rule in rules:
                #             if rule.parent in tags:
                #                 old_rules = [ r for r in self.parsing_triangle[row, col] if r.parent == rule.parent ]
                #                 for old_rule in old_rules:
                #                     if rule.score*score > old_rule.score:
                #                         key += 1
                #                         self.parsing_triangle[row, col].discard(old_rule)
                #                         new_rule = copy.copy(rule)
                #                         new_rule.score = score * rule.score
                #                         self.parsing_triangle[row, col].add(new_rule)
                # ----------- End Finding Better Unary Rules ----------
                # ----------------- Expanding  Cells ------------------
                key = None
                while (key != 0):
                    key = 0
                    tags_scores = self.get_cell_tags(row, col)
                    for tag, score in tags_scores:
                        rules = self.grammar.get_unary_rules_by_child(tag)
                        for rule in sorted(rules,
                                           key=lambda rule: rule.score,
                                           reverse=True):
                            if rule not in self.parsing_triangle[row, col]:
                                key += 1
                                tmp_rule = copy.copy(rule)
                                tmp_rule.score = score * rule.score
                                tmp_rule.origin = (row, col)
                                tmp_rule.des = (row, col)
                                self.parsing_triangle[row, col].add(tmp_rule)
                # ---------------- End Expanding  Cells ---------------
                # print ""
                # self.print_parsing_traingle()
            level += 1
            # break
        print "Finished Parsing traingle"

        self.print_parsing_traingle()
        print "\n\n"
        # ----------------------- Building Tree -----------------------
        """
        The following four methods are defined to build the tree, and they are:
          -> search_rule.
          -> build_branch.
          -> build.
          -> build_tree.
        """
        print len(self.parsing_triangle[0, 3])
        for rule in self.parsing_triangle[0, 3]:
            print str(rule)
        return self.build_tree(n, render=1)

    def search_rule(self, cell, tag):
        row, col = cell
        for rule in self.parsing_triangle[row, col]:
            if rule.parent == tag:
                return rule

    def build_branch(self, rule):
        if isinstance(rule, UnaryRule):
            return Tree(rule.parent, [Tree(rule.child)])
        elif isinstance(rule, BinaryRule):
            return Tree(rule.parent,
                        [Tree(rule.left_child),
                         Tree(rule.right_child)])

    def build(self, rule):
        origin = rule.origin
        if origin == (None, None):
            return self.build_branch(rule)
        else:
            if isinstance(rule, UnaryRule):
                # print str(rule)
                tag = rule.child
                next_rule = self.search_rule(origin, tag)
                return Tree(rule.parent, [self.build(next_rule)])

            elif isinstance(rule, BinaryRule):
                # print str(rule)
                left_origin, right_origin = origin[0], origin[1]
                left_tag, right_tag = rule.left_child, rule.right_child

                next_left_rule = self.search_rule(left_origin, left_tag)
                next_right_rule = self.search_rule(right_origin, right_tag)

                return Tree(
                    rule.parent,
                    [self.build(next_left_rule),
                     self.build(next_right_rule)])

    def build_tree(self, sen_length, render=0):
        # print "It's Here"
        no_levels = sen_length - 1
        cell = (0, no_levels)
        starting_cell = (0, no_levels)
        rule = self.search_rule(starting_cell, 'ROOT')
        starting_rule = self.search_rule(rule.origin, rule.child)
        tree = self.build(starting_rule)
        if render:
            print Trees.PennTreeRenderer.render(tree)
        tree = TreeAnnotations.unannotate_tree(tree)

        return tree
コード例 #2
0
class PCFGParser(Parser):

    def train(self, train_trees):
        self.lexicon = Lexicon(train_trees)
        binarized_train_trees = [TreeAnnotations.binarize_tree(t) for t in train_trees]
        self.grammar = Grammar(binarized_train_trees)
        self.parsing_triangle= defaultdict(set)


    def print_parsing_traingle(self):
        """
        This function is used to pring the Parsing Traingle. It's very important for debugging, except that
        you can delete it.
        """
        for t, rules in sorted(self.parsing_triangle.iteritems()):
            for rule in rules:
                print t, str(rule),


    def get_cell_tags(self, row, col):
        #returns the 'tag'  and 'score' of the cell at 'row' and 'col'
        return [(rule.parent, rule.score) for rule in self.parsing_triangle[row, col]]


    def get_cell_pairs(self, row, level):
        """
        It takes the cell row and level number, and it returns the possible cell tags that could the given cell
        could come from. For example, for first level (level=1), the cell at (0,1) come from the either (0,0)
        or (1,1). Another example, the cell (0,2) at level=2 could come from either ((0,0), (1,2)) or ((0,1), (2,2))
        """
        result = []
        col = row+level
        for i in range(level):
            _tuple = ((row, row+i), (row+i+1, col))
            result.append( _tuple )
        assert len(result) == level
        return result


    def get_best_parse(self, sentence):
        """
        Should return a Tree.
        'sentence' is a list of strings (words) that form a sentence.
        """
        n = len(sentence)

        # ------------------------- (Level 0) -------------------------
        # --------------- Initializing Parsing Traingle ---------------

        possible_tags = self.lexicon.tag_counter.keys()
        for i, word in enumerate(sentence):
            for tag in possible_tags:
                if self.lexicon.word_to_tag_counters[word][tag] != 0:
                    unary_rule = UnaryRule(tag, word)
                    unary_rule.score = self.lexicon.score_tagging(word, tag)
                    # unary_rule.origin = (i,i)
                    # unary_rule.des = (i,i)
                    self.parsing_triangle[(i, i)].add(unary_rule)
        # self.print_parsing_traingle()
        # print '\n\n'

        # ------------- End Initializing Parsing Traingle -------------
        # ------------------ Expanding First Cells --------------------
        key = None
        while(key != 0):
            key = 0
            for row, col in sorted(self.parsing_triangle.keys()):
                tags_scores = self.get_cell_tags(row, col)
                for tag, score in tags_scores:
                    rules = self.grammar.get_unary_rules_by_child(tag)
                    for rule in rules:
                        if rule not in self.parsing_triangle[(row, col)]:
                            key += 1
                            tmp_rule = copy.copy(rule)
                            tmp_rule.score = score * rule.score
                            # print row, col
                            tmp_rule.origin = (row, col)
                            tmp_rule.des = (row, col)
                            self.parsing_triangle[(row, col)].add(tmp_rule)
                          
        # ---------------- End Expanding First Cells ------------------

        # self.print_parsing_traingle()
        
        # ----------------------- (Level 1...n) -----------------------
        # ------------------- Filling Other Cells ---------------------
        level = 1
        while(level != n):
            for row in range(n-level):
                col = row+level
                cell_pairs = self.get_cell_pairs(row, level)
                for cell_pair in cell_pairs:
                    first_tags_scores = self.get_cell_tags(cell_pair[0][0], cell_pair[0][1])
                    second_tags_scores = self.get_cell_tags(cell_pair[1][0], cell_pair[1][1])
                    for tag1, score1 in first_tags_scores:
                        rules1 = self.grammar.get_binary_rules_by_left_child(tag1)
                        for tag2, score2 in second_tags_scores:
                            rules2 = self.grammar.get_binary_rules_by_right_child(tag2)
                            common_rules = set(rules1).intersection(set(rules2))
                            for rule in common_rules:
                                tmp_rule = copy.copy(rule)
                                tmp_rule.score = score1 * score2 * rule.score
                                tmp_rule.origin = cell_pair
                                tmp_rule.des = (row, col)
                                self.parsing_triangle[row, col].add(tmp_rule)
                # ------------- Findin Better Unary Rules -------------
                #I think this part needs more debugging
                key = None
                while(key != 0):
                    key = 0
                    tags_scores = self.get_cell_tags(row, col)
                    tags = [t[0] for t in tags_scores]
                    for tag, score in tags_scores: 
                        rules = self.grammar.get_unary_rules_by_child(tag)
                        for rule in rules:
                            if rule.parent in tags:
                                old_rules = [ r for r in self.parsing_triangle[row, col] if r.parent == rule.parent ]
                                for old_rule in old_rules:
                                    if rule.score*score > old_rule.score:
                                        key += 1
                                        self.parsing_triangle[row, col].discard(old_rule)
                                        new_rule = copy.copy(rule)
                                        new_rule.score = score * rule.score
                                        self.parsing_triangle[row, col].add(new_rule)
                # ----------- End Findin Better Unary Rules -----------
                # ----------------- Expanding  Cells ------------------
                key = None
                t = row, col
                while(key != 0):
                    key = 0
                    tags_scores = self.get_cell_tags(row, col)
                    for tag, score in tags_scores:
                        rules = self.grammar.get_unary_rules_by_child(tag)
                        for rule in rules:
                            if rule not in self.parsing_triangle[t]:
                                key += 1
                                tmp_rule = copy.copy(rule)
                                tmp_rule.score = score * rule.score
                                tmp_rule.origin = (row, col)
                                tmp_rule.des = (row, col)
                                self.parsing_triangle[t].add(tmp_rule)
                # ---------------- End Expanding  Cells ---------------
            level += 1

        # ----------------------- Building Tree -----------------------
        """
        The following four methods are defined to build the tree, and they are:
          -> search_rule.
          -> build_branch.
          -> build.
          -> build_tree.
        """
        return self.build_tree(n)
    


    def search_rule(self, cell, tag):
        row, col = cell
        for rule in self.parsing_triangle[row, col]:
            if rule.parent == tag:
                return rule
            

    def build_branch(self, rule):
        if isinstance(rule, UnaryRule):
            return Tree(rule.parent, [Tree(rule.child)])
        elif isinstance(rule, BinaryRule):
            return Tree(rule.parent, [Tree(rule.left_child), Tree(rule.right_child)])


    def build(self, rule):
        origin = rule.origin
        if origin == (None, None):
            print str(rule)
            return self.build_branch(rule)
        else:
            if isinstance(rule, UnaryRule):
                # print str(rule)
                tag = rule.child
                next_rule = self.search_rule(origin, tag)
                return Tree(rule.parent, [self.build(next_rule)])
            
            elif isinstance(rule, BinaryRule):
                # print str(rule)
                left_origin, right_origin = origin[0], origin[1]
                left_tag, right_tag = rule.left_child, rule.right_child
                
                next_left_rule = self.search_rule(left_origin, left_tag)
                next_right_rule = self.search_rule(right_origin, right_tag)
                
                return Tree(rule.parent, [self.build(next_left_rule), self.build(next_right_rule)])


    def build_tree(self, sen_length, render=0):
        no_levels = sen_length-1
        cell = (0, no_levels)
        starting_cell = (0, no_levels)
        starting_rule = self.search_rule(starting_cell, 'S')
        tree = self.build(starting_rule)
        if render:
            print Trees.PennTreeRenderer.render(tree)
        tree = TreeAnnotations.unannotate_tree(tree)
        
        return tree