def __getGrammar(self):
    """Count rule frequencies in the corpus and collect its nonterminals.

    Reads the file at ``self.corpusfilepath`` line by line, parses each
    line into ``(lhs, rhs)`` rules via ``self.__parse`` and stores:

    * ``self.frequency`` -- nested mapping ``frequency[lhs][rhs] -> count``
      where ``rhs`` is always a tuple.
    * ``self.nonterminals`` -- set of every left-hand side encountered.

    Digits are replaced first when ``self.replace_numeric`` is set; a
    right-hand side that is not a list is lowercased when
    ``self.lowercase`` is set.
    """
    if self.verbose:
        log('Collecting grammar rules from txt file.\n')

    nonterminals = set()
    frequency = defaultdict(lambda: defaultdict(int))

    # The context manager guarantees the corpus file is closed, also when
    # parsing a line raises.
    with open(self.corpusfilepath, 'r') as corpus:
        for line in corpus:
            if self.replace_numeric:
                line = Helper.replaceDigits(line)

            # The first element returned by the parser is not needed here.
            _, rules = self.__parse(line)

            for lhs, rhs in rules:
                nonterminals.add(lhs)
                if isinstance(rhs, list):
                    # A list right-hand side is stored as-is, as a tuple.
                    rhs = tuple(rhs)
                elif self.lowercase:
                    # A single symbol is wrapped in a 1-tuple, lowercased
                    # on request.
                    rhs = (rhs.lower(),)
                else:
                    rhs = (rhs,)
                frequency[lhs][rhs] += 1

    self.nonterminals = nonterminals
    self.frequency = frequency
def CYK(self, sentence, max_length):
    """Fill the CYK chart for ``sentence`` and record Viterbi back-pointers.

    The chart can afterwards be used to enumerate the parses of the
    sentence or to read off the most probable one (Viterbi parse).

    Parameters
    ----------
    sentence : str
        Whitespace-separated words ``a1 ... an``.
    max_length : int
        Sentences with more tokens than this are not parsed.

    Returns
    -------
    bool
        ``False`` when the sentence is longer than ``max_length`` (only
        ``self.sentence_length`` is set in that case), ``True`` once the
        chart has been built.

    Side effects
    ------------
    On success stores on ``self``:

    * ``chart[begin, end]`` -- set of labels spanning ``begin..end``
    * ``viterbi[begin, end, label]`` -- ``[node1, node2, split]`` of the
      best production found (``node2`` is ``None`` for unary rules)
    * ``pi[begin, end, label]`` -- probability of that best production
    * ``replaced_words`` -- deque of the original tokens
    """
    # Original tokens, kept so words that get replaced by UNKNOWN/NUMERIC
    # can be recovered later. Tokenised with split() -- exactly like the
    # sentence that is parsed below -- so both sequences stay aligned even
    # with repeated spaces, tabs or a trailing newline.
    replaced_words = deque(sentence.split())

    # Normalise case if the grammar was built lowercased
    if self.lowercase:
        sentence = sentence.lower()

    # Replace numeric values if this is indicated beforehand
    if self.replace_numeric:
        sentence = Helper.replaceDigits(sentence)

    sentence = sentence.split()
    self.sentence_length = n = len(sentence)
    if n > max_length:
        return False

    # chart[begin, end] = all labels that can span begin..end
    chart = defaultdict(set)

    # viterbi[begin, end, label] = [node1, node2, split]
    viterbi = dict()

    # pi[begin, end, label] = best probability; missing entries read as 0.0
    pi = defaultdict(float)

    # Local aliases for the grammar tables used in the inner loops
    rules_forward = self.Grammar.rules_forward
    rules_reverse = self.Grammar.rules_reverse
    rules_reverse_terminal = self.Grammar.rules_reverse_terminal
    terminals = self.Grammar.terminals

    def addToChart(parentnode, node1, node2, begin, end, split):
        """Register ``parentnode -> node1 [node2]`` over ``begin..end``.

        parentnode : label on the left-hand side of the production
        node1      : first (or only) child label
        node2      : second child label, or None for a unary production
        begin, end : boundaries of the span covered by parentnode
        split      : position where the span is divided between children
        """
        chart[begin, end].add(parentnode)

        # NOTE(review): the indentation of the source was lost; unary
        # propagation is placed inside the "probability improved" branches
        # because that is what bounds the recursion on unary cycles
        # (A -> B, B -> A). Confirm against the original layout.
        if node2 is not None:
            ## Binary rule
            p = (pi[begin, split, node1] *
                 pi[split, end, node2] *
                 rules_forward[parentnode][(node1, node2)])

            # Keep only the best production for this label and span
            if p > pi[begin, end, parentnode]:
                pi[begin, end, parentnode] = p
                viterbi[begin, end, parentnode] = [node1, node2, split]

                # Infer possible next unaries
                for grandparentnode in rules_reverse[(parentnode, )]:
                    addToChart(grandparentnode, parentnode, None,
                               begin, end, split)
        else:
            ## Unary rule
            p = pi[begin, end, node1] * rules_forward[parentnode][(node1,)]

            # A production that rewrites directly to the terminal is always
            # recorded, even when p does not exceed the stored value.
            if p > pi[begin, end, parentnode] or \
                    (node1 in terminals and
                     parentnode in rules_reverse_terminal[node1]):
                pi[begin, end, parentnode] = p
                viterbi[begin, end, parentnode] = [node1, None, split]

                # A label rewriting to itself must not be propagated, or
                # the forced update above would recurse forever.
                if parentnode != node1:
                    for grandparentnode in rules_reverse[(parentnode, )]:
                        addToChart(grandparentnode, parentnode, None,
                                   begin, end, split)

    ## Initialization
    if self.verbose:
        log('Initializing chart.')

    for i in range(n):
        word = sentence[i]

        # If a word does not occur in the set of terminals, replace it
        if word not in terminals:
            if self.UnknownWordHandler:
                # Per the original comment: classify the word and add
                # suffix-based production rules for it to the grammar.
                self.UnknownWordHandler.classify(word)
            else:
                # Or fall back to the UNKNOWN tag
                word = Helper.UNKNOWN

        # The word itself spans i..i+1 with probability 1
        pi[i, i + 1, word] = 1
        for nonterminal in rules_reverse_terminal[word]:
            addToChart(nonterminal, word, None, i, i + 1, 0)

    ## Main loop: combine adjacent spans bottom-up, shortest spans first
    if self.verbose:
        log('Entering main loop.')

    for span in range(2, n + 1):
        for begin in range(0, n - span + 1):
            end = begin + span
            for split in range(begin + 1, end):
                for node1 in chart[begin, split]:
                    for node2 in chart[split, end]:
                        for A in rules_reverse[(node1, node2)]:
                            addToChart(A, node1, node2, begin, end, split)

    self.chart = chart
    self.viterbi = viterbi
    self.pi = pi
    self.replaced_words = replaced_words

    if self.verbose:
        log('Chart complete.')
    return True