예제 #1
파일: Grammar.py 프로젝트: aukejw/ELPSA
    def __getGrammar(self):
        """
        Count how often each grammar rule occurs in the corpus file.

        Reads self.corpusfilepath line by line, parses every line with
        self.__parse and stores the counts in the public dictionary
        self.frequency, indexed as frequency[lhs][rhs]. The right-hand side
        key is always a tuple. The set self.nonterminals is published as
        well.
        """
        if self.verbose:
            log('Collecting grammar rules from txt file.\n')

        # NOTE(review): nothing in the visible code ever adds to this set,
        # so self.nonterminals ends up empty. The value NT returned by
        # self.__parse is presumably meant to feed it -- confirm against the
        # full file before relying on self.nonterminals.
        nonterminals = set()
        frequency = defaultdict(lambda: defaultdict(int))

        # The context manager closes the corpus file even if parsing raises.
        with open(self.corpusfilepath, 'r') as f:
            for line in f:
                if self.replace_numeric:
                    line = Helper.replaceDigits(line)
                NT, rules = self.__parse(line)
                for A, B in rules:
                    if isinstance(B, list):
                        # A list-valued right-hand side is stored as a tuple
                        # so that it can be used as a dictionary key.
                        B = tuple(B)
                    else:
                        # Any other right-hand side (presumably a single
                        # word) becomes a 1-tuple, lowercased if requested.
                        B = (B.lower(), ) if self.lowercase else (B, )
                    frequency[A][B] += 1

        self.nonterminals = nonterminals
        self.frequency = frequency
예제 #2
파일: TreeParser.py 프로젝트: aukejw/ELPSA
    def CYK(self, sentence, max_length):
        """
        CYK algorithm. Constructs a table called chart, which contains the
        nonterminals found for every span of the sentence. Alongside the
        chart, the best probability per (span, nonterminal) and the
        production that achieved it are recorded, so the most probable
        parse (viterbi parse) can be read back afterwards.

        Arguments:
        sentence    :   string of whitespace-separated words.
        max_length  :   maximum number of words; longer sentences are not
                        parsed.

        Returns False if the sentence has more than max_length words, True
        otherwise. On success the results are stored on the instance:
        self.chart       :   chart[begin, end] = set of nonterminals
        self.viterbi     :   viterbi[begin, end, nonterminal] =
                             [node1, node2, split]
        self.pi          :   pi[begin, end, nonterminal] = probability
        self.replaced_words : deque of the words of the sentence as given
        self.sentence_length is set before the length check.
        """
        # Queue of the words as given (before lowercasing or digit
        # replacement), used to keep track of words replaced by UNKNOWN or
        # NUMERIC
        replaced_words = deque(sentence.split(' '))
        # Lowercase the sentence if this is indicated beforehand
        if self.lowercase:
            sentence = sentence.lower()
        # Replace numeric values if this is indicated beforehand
        if self.replace_numeric:
            sentence = Helper.replaceDigits(sentence)

        sentence = sentence.split()
        self.sentence_length = n = len(sentence)
        if n > max_length:
            return False

        # Chart structure allows for tracking all possible parses, saves
        # nonterminals in form chart[begin,end] = all_nonterminals
        chart = defaultdict(set)

        # Used to keep track of the most probable parse-route in form
        # viterbi[begin, end, nonterminal] = [node1, node2, split]
        viterbi = dict()

        # Used to keep track of the probability of nonterminals occurring in
        # a chartposition in form pi[begin,end,nonterminal] = p
        pi = defaultdict(float)

        # Local variables for efficiency
        rules_forward = self.Grammar.rules_forward
        rules_reverse = self.Grammar.rules_reverse
        rules_reverse_terminal = self.Grammar.rules_reverse_terminal
        terminals = self.Grammar.terminals

        def addToChart(parentnode, node1, node2, begin, end, split):
            """
            Record the production parentnode -> node1 [node2] for the span
            [begin, end) and recursively apply the unary rules that have
            parentnode as their right-hand side.

            parentnode  :   A string containing a (non-)terminal.
            node1       :   The first (or only) child (non-)terminal.
            node2       :   The second child, or None for a unary rule.
            begin       :   An integer indicating the begin of the span
            end         :   An integer indicating the end of the span
            split       :   An integer indicating where the span is split
            """
            chart[begin, end].add(parentnode)

            ## Handle binary rules
            if node2:
                # Calculate the probability of this production at this
                # position: both children times the rule probability
                p = pi[begin, split, node1] * \
                    pi[split, end, node2] * \
                    rules_forward[parentnode][(node1, node2)]

                # Update the best_so_far, if needed
                if p > pi[begin, end, parentnode]:
                    pi[begin, end, parentnode] = p
                    viterbi[begin, end, parentnode] = [node1, node2, split]

                # Infer possible next unaries
                for grandparentnode in rules_reverse[(parentnode, )]:
                    addToChart(grandparentnode, parentnode, None,
                               begin, end, split)
            ## Handle unary rules
            else:
                # Calculate the probability of this production at this
                # position
                p = pi[begin, end, node1] * rules_forward[parentnode][(node1, )]

                # Update production probability in this chart. A production
                # straight from a terminal always overwrites the entry.
                if p > pi[begin, end, parentnode] or \
                        (node1 in terminals and
                         parentnode in rules_reverse_terminal[node1]):
                    pi[begin, end, parentnode] = p
                    viterbi[begin, end, parentnode] = [node1, None, split]

                # If no infinite recursion is caused:
                if parentnode != node1:
                    # Infer possible next unaries
                    for grandparentnode in rules_reverse[(parentnode, )]:
                        addToChart(grandparentnode, parentnode, None,
                                   begin, end, split)

        ## Initialization
        if self.verbose:
            log('Initializing chart.')
        # for every entry in the string
        for i in range(n):
            word = sentence[i]
            # If a word does not occur in the set of terminals, replace it
            if word not in terminals:
                # NOTE(review): the surrounding comments describe two
                # options -- classifying the word through the
                # UnknownWordHandler, or falling back to the UNKNOWN tag --
                # but only the statement below is visible here. The handler
                # call and an `else:` appear to be missing; confirm against
                # the full file. The visible control flow is kept as is.
                if self.UnknownWordHandler:
                    word = Helper.UNKNOWN
            # Infer word,begin,end,split
            pi[i, i + 1, word] = 1
            for nonterminal in rules_reverse_terminal[word]:
                addToChart(nonterminal, word, None, i, i + 1, 0)

        ## Main Loop
        if self.verbose:
            log('Entering main loop.')

        for span in range(2, n + 1):
            for begin in range(0, n - span + 1):
                end = begin + span
                for split in range(begin + 1, end):
                    for node1 in chart[begin, split]:
                        for node2 in chart[split, end]:
                            for A in rules_reverse[(node1, node2)]:
                                # Begin and end are derived from chart pos
                                addToChart(A, node1, node2, begin, end, split)

        self.chart = chart
        self.viterbi = viterbi
        self.pi = pi
        self.replaced_words = replaced_words
        if self.verbose:
            log('Chart complete.')
        return True