Beispiel #1
0
    def __getGrammar(self):
        '''
        getGrammar()

        Reads the corpus file at self.corpusfilepath line by line and counts
        how often every grammar rule occurs.

        Results are stored on the instance:

        self.frequency    :   nested dictionary, frequency[A][B] is the
                              number of times the rule A -> B was seen. B is
                              always a tuple.
        self.nonterminals :   set of all left-hand sides A that were seen.

        If self.replace_numeric is set, every line is passed through
        Helper.replaceDigits before it is parsed. If self.lowercase is set,
        right-hand sides that are not lists are lowercased.
        '''
        if self.verbose:
            log('Collecting grammar rules from txt file.\n')

        nonterminals = set()
        frequency = defaultdict(lambda : defaultdict(lambda : 0))

        # The context manager closes the corpus file again, also when
        # parsing a line raises an exception.
        with open(self.corpusfilepath, 'r') as f:
            for line in f:
                if self.replace_numeric:
                    line = Helper.replaceDigits(line)

                # Only the rules are needed here; the first element returned
                # by __parse is ignored.
                _, rules = self.__parse(line)

                for A, B in rules:
                    nonterminals.add(A)
                    if isinstance(B, list):
                        # A list right-hand side is stored as a (hashable)
                        # tuple so it can be used as a dictionary key.
                        B = tuple(B)
                    else:
                        # Any other right-hand side is wrapped in a
                        # one-element tuple (presumably a terminal word).
                        B = (B.lower(), ) if self.lowercase else (B, )
                    frequency[A][B] += 1

        self.nonterminals = nonterminals
        self.frequency = frequency
Beispiel #2
0
    def CYK(self, sentence, max_length):
        '''
        CYK(sentence, max_length)

        CYK algorithm. Constructs a table called chart, which contains for
        every span of the sentence the nonterminals that were found to cover
        it. Together with the viterbi and pi tables, this chart can be used
        to find all parse trees describing the given sentence, or the most
        probable parse (viterbi parse).

        sentence        :   string of whitespace-separated words.
        max_length      :   maximum number of words; longer sentences are
                            not parsed.

        Returns False, without building a chart, if the sentence contains
        more than max_length words, and True otherwise.
        self.sentence_length is set in both cases. On success the results
        are stored in self.chart, self.viterbi, self.pi and
        self.replaced_words.
        '''
        # Queue holding the original words, one entry per token. It is split
        # in exactly the same way as the token list below, so that repeated
        # spaces or a trailing newline cannot shift the two out of step.
        replaced_words = deque(sentence.split())

        # Lowercase the sentence if this is indicated beforehand
        if self.lowercase:
            sentence = sentence.lower()
        # Replace numeric values if this is indicated beforehand
        if self.replace_numeric:
            sentence = Helper.replaceDigits(sentence)

        sentence = sentence.split()
        self.sentence_length = n = len(sentence)

        if n > max_length:
            return False

        # Chart structure allows for tracking all possible parses, saves
        # nonterminals in form chart[begin,end] = all_nonterminals
        chart = defaultdict(set)

        # Used to keep track of the most probable parse-route in form
        # viterbi[begin, end, nonterminal] = [node1, node2, split]
        viterbi = dict()

        # Used to keep track of the probability of nonterminals occurring in a
        # chartposition in form pi[begin,end,nonterminal] = p. Entries that
        # were never set read as 0.0.
        pi = defaultdict(float)

        # Local variables for efficiency
        rules_forward = self.Grammar.rules_forward
        rules_reverse = self.Grammar.rules_reverse
        rules_reverse_terminal = self.Grammar.rules_reverse_terminal
        terminals = self.Grammar.terminals

        def addToChart(parentnode, node1, node2, begin, end, split):
            '''
            addToChart(parentnode, node1, node2, begin, end, split)

            Registers parentnode for the span [begin, end), updates its best
            probability and backpointer, and then applies the unary rules
            that have parentnode as their right-hand side.

            parentnode  :   A string containing the left-hand side.
            node1       :   The first (or only) right-hand side symbol.
            node2       :   The second right-hand side symbol, or None for
                            a unary rule.
            begin       :   An integer indicating the begin of the span
            end         :   An integer indicating the end of the span
            split       :   An integer indicating where the span is split
            '''
            # Adding to a set is a no-op when the entry is already present
            chart[begin,end].add( parentnode )

            ## Handle binary rules
            if node2:
                # Calculate the probability of this production at this position
                p = pi[begin,split,node1] * \
                    pi[split,end,node2] * \
                    rules_forward[parentnode][(node1,node2)]

                # Update the best_so_far, if needed
                if p > pi[begin,end,parentnode]:
                    pi[begin,end,parentnode] = p
                    viterbi[begin,end,parentnode] = [node1, node2, split]

                infer_unaries = True
            ## Handle unary rules
            else:
                # Calculate the probability of this production at this position
                p = pi[begin, end, node1] * rules_forward[parentnode][(node1,)]

                # Update production probability in this chart. Besides a
                # real improvement, the entry is also overwritten whenever
                # node1 is a terminal that parentnode can produce directly.
                # (Presumably needed for tags that are spelled like their
                # word, whose pi entry was initialised to 1 - TODO confirm.)
                if p > pi[begin,end,parentnode] or \
                    (node1 in terminals and parentnode in rules_reverse_terminal[node1]):

                    pi[begin, end, parentnode] = p
                    viterbi[begin, end, parentnode] = [node1, None, split]

                # A rule that rewrites a symbol to itself would recurse
                # forever, so it is not followed any further.
                # NOTE: only such direct self-loops are skipped; a longer
                # unary cycle in the grammar (A -> B, B -> A) is not detected.
                infer_unaries = parentnode != node1

            # Infer possible next unaries
            if infer_unaries:
                for grandparentnode in rules_reverse[(parentnode, )]:
                    addToChart(grandparentnode, parentnode, None, begin, end, split)

        ## Initialization
        if self.verbose:
            log('Initializing chart.')

        # for every entry in the string
        for i, word in enumerate(sentence):
            # If a word does not occur in the set of terminals, replace it
            if word not in terminals:
                # Either by classifying it and adding production rules to the
                # grammar, based on suffix (the word itself is kept; the
                # return value of classify is not used here)
                if self.UnknownWordHandler:
                    self.UnknownWordHandler.classify(word)
                else:
                # Or by the UNKNOWN tag
                    word = Helper.UNKNOWN

            # The word itself covers its own position with probability 1
            pi[i,i+1,word] = 1
            for nonterminal in rules_reverse_terminal[word]:
                addToChart(nonterminal, word, None, i, i+1, 0)

        ## Main Loop
        if self.verbose:
            log('Entering main loop.')

        # range (instead of xrange) behaves the same here and is available in
        # both Python 2 and 3; the ranges are never longer than the sentence.
        for span in range(2, n+1):
            for begin in range(0, n-span+1):
                end = begin + span
                for split in range(begin+1, end):
                    for node1 in chart[begin,split]:
                        for node2 in chart[split,end]:
                            for A in rules_reverse[(node1,node2)]:
                                # Begin and end are derived from chart pos
                                addToChart(A, node1, node2, begin, end, split)

        self.chart = chart
        self.viterbi = viterbi
        self.pi = pi
        self.replaced_words = replaced_words
        if self.verbose:
            log('Chart complete.')
        return True