def __extract_cfg_line(self, s, lower, numerate):
    """Extract parent-annotated productions from one bracketed tree string.

    The string is scanned once while tracking the positions of the currently
    open parentheses.  Every time a ')' closes a node, one production for
    that node is handed to ``self.__add_production(lhs, rhs)``:

    * for a pre-terminal, ``rhs`` is the terminal string;
    * otherwise ``rhs`` is the list of (annotated) labels of its children.

    Every node that has an enclosing node gets its label annotated with the
    label of that parent as ``"LABEL^PARENT"``; the outermost node keeps its
    bare label.

    Args:
        s: Bracketed tree text, e.g. ``"(S (NP (NN dog)))"``.
        lower: If true, terminals are lowercased.
        numerate: If true, terminals for which ``util.is_numeral`` holds are
            replaced by ``util.NUMERAL``.

    Raises:
        IndexError: If a ')' is met while no '(' is open.
    """
    # Indices of the '(' characters that are still waiting for their ')'.
    open_p = []
    # Index of a node's '(' -> labels of the children closed so far.
    rhs_list = defaultdict(list)

    for i, char in enumerate(s):
        if char == '(':
            open_p.append(i)
        elif char == ')':
            start = open_p.pop() + 1
            end = s.find(' ', start)
            lhs = s[start:end]

            # Tolerate the improper "... )" format: ``ch`` is the last
            # character of the node's content, skipping one space if present.
            ch = i - 2 if s[i - 1] == ' ' else i - 1

            if open_p:
                # Still inside another node: annotate with the parent label
                # and register this node as one of the parent's children.
                parent_open = open_p[-1]
                parent = s[parent_open + 1:s.find(' ', parent_open)]
                lhs = "%s^%s" % (lhs, parent)
                rhs_list[parent_open].append(lhs)

            if s[ch] != ')':
                # Content does not end with a closed child -> pre-terminal.
                # Slice up to ``ch`` (not ``i``) so the tolerated space
                # before the paren does not end up inside the terminal.
                rhs = s[end + 1:ch + 1]
                if lower:
                    # make terminals lowercase
                    rhs = rhs.lower()
                # tag numerals as all the same (digits can have . and ,)
                if numerate and util.is_numeral(rhs):
                    rhs = util.NUMERAL
                self.__add_production(lhs, rhs)
            else:
                # Internal node: its children were collected under the index
                # of its own '(' (which is ``start - 1``).
                self.__add_production(lhs, rhs_list[start - 1])
def __extract_cfg_line(self, s, lower, numerate):
    """Extract parent-annotated productions from one bracketed tree string.

    Parentheses are counted in a single left-to-right pass.  Whenever a ")"
    closes a node, ``self.__add_production(lhs, rhs)`` is called once for it:
    ``rhs`` is the terminal string for a pre-terminal, and the list of the
    node's (annotated) child labels for any other node.

    A node nested inside another one is labelled ``"LABEL^PARENT"``, where
    PARENT is the label of the directly enclosing node; the outermost node
    keeps its plain label.

    Args:
        s: Bracketed tree text, e.g. ``"(S (NP (NN dog)))"``.
        lower: If true, terminals are lowercased.
        numerate: If true, terminals for which ``util.is_numeral`` holds are
            replaced by ``util.NUMERAL``.

    Raises:
        IndexError: If a ")" is met while no "(" is open.
    """
    # Stack with the positions of the "(" that have not been closed yet.
    open_p = []
    # Position of a node's "(" -> labels of its already-closed children.
    rhs_list = defaultdict(list)

    for i, char in enumerate(s):
        if char == "(":
            open_p.append(i)
            continue
        if char != ")":
            continue

        start = open_p.pop() + 1
        end = s.find(" ", start)
        lhs = s[start:end]

        # handle case of improper format with space before paren: ``ch``
        # points at the last real character of the node's content
        if s[i - 1] == " ":
            ch = i - 2
        else:
            ch = i - 1

        if open_p:
            # Nested node: add the parent annotation and record this node
            # as a child of the enclosing one.
            parent_open = open_p[-1]
            parent = s[parent_open + 1 : s.find(" ", parent_open)]
            lhs = "%s^%s" % (lhs, parent)
            rhs_list[parent_open].append(lhs)

        if s[ch] == ")":
            # Content ends with a closed child -> internal node.  Its
            # children were stored under the position of its own "(".
            self.__add_production(lhs, rhs_list[start - 1])
            continue

        # Pre-terminal.  The slice stops at ``ch`` rather than ``i`` so the
        # tolerated space before the paren is not kept in the terminal.
        rhs = s[end + 1 : ch + 1]
        if lower:
            # make terminals lowercase
            rhs = rhs.lower()
        # tag numerals as all the same (digits can have . and ,)
        if numerate and util.is_numeral(rhs):
            rhs = util.NUMERAL
        self.__add_production(lhs, rhs)
def __create_chart(self):
    """Build the parse chart for ``self.sentence`` with the CYK algorithm.

    Reads ``self.G.cfg_r2l`` (indexed by a right-hand side -- a word or a
    ``(b, c)`` pair -- to obtain candidate left-hand sides), ``self.G.pcfg``
    (production probabilities), ``self.sentence``, ``self.sentence_len``,
    ``self.start_symbol``, ``self.numerate`` and ``self.verbose``.

    Stores four tables on the instance when done:

    * ``self.chart[begin, end]``: set of symbols added for that span.
    * ``self.covering[begin, end, start_symbol]``: set of ``(b, c)``
      right-hand sides with which the start symbol was added.
    * ``self.pi[begin, end, a]``: highest probability found for ``a``.
    * ``self.viterbi_back[begin, end, a]``: ``[b, c, split]`` for that best
      entry.  ``split == -1`` marks a terminal (``b`` is the sentence word,
      ``c`` is 0); ``c == 0`` with ``split == 0`` marks a unary rule.

    NOTE(review): the unary start-symbol pass lives inside the ``span >= 2``
    loops, so it never runs for a one-word sentence -- confirm intended.
    """
    # Bind attributes to locals so the nested loops avoid repeated
    # attribute lookups.
    cfg_r2l = self.G.cfg_r2l
    pcfg = self.G.pcfg
    n = self.sentence_len
    s = self.sentence
    verbose = self.verbose
    numerate = self.numerate
    start_symbol = self.start_symbol
    unknown = util.UNKNOWN

    chart = defaultdict(set)
    covering = defaultdict(set)
    viterbi_back = {}
    pi = defaultdict(float)

    def check_add_prob(prob, a, b, c, begin, end, split):
        """Record ``a -> b c`` over [begin, end]; True if ``pi`` improved."""
        if prob > 0:
            # add production to this chart location
            if verbose > 1:
                util.log_p(
                    "add C %s => (%s %s) to [%d, %d] split: %d."
                    % (a, b, c, begin, end, split)
                )
            chart[begin, end].add(a)
            # store our covering productions
            if a == start_symbol:
                covering[begin, end, a].add((b, c))
            # keep the max; strict '>' breaks ties by not changing
            if prob > pi[begin, end, a]:
                if verbose > 1:
                    util.log_p(
                        "add pi %s => (%s %s) to [%d, %d] split: %d."
                        % (a, b, c, begin, end, split)
                    )
                pi[begin, end, a] = prob
                viterbi_back[begin, end, a] = [b, c, split]
                return True
        return False

    # Width-1 spans: lexical productions A -> word.
    for i in range(n):
        if numerate and util.is_numeral(s[i]):
            # replace numerals with code
            word = util.NUMERAL
        elif len(cfg_r2l[s[i]]) == 0:
            # replace never seen words with code
            word = unknown
        else:
            word = s[i]
        for a in cfg_r2l[word]:
            prob = pcfg[a, word]
            # The back-pointer keeps the original word s[i], not the code;
            # split as -1 codes a terminal.
            check_add_prob(prob, a, s[i], 0, i, i + 1, -1)

    # Wider spans: binary productions A -> B C over every split point.
    for span in range(2, n + 1):
        for begin in range(n + 1 - span):
            end = begin + span
            for split in range(begin + 1, end):
                for b in chart[begin, split]:
                    for c in chart[split, end]:
                        for a in cfg_r2l[b, c]:
                            # prob for all productions A -> B C
                            prob = pcfg[a, (b, c)]
                            prob = pi[begin, split, b] * pi[split, end, c] * prob
                            check_add_prob(prob, a, b, c, begin, end, split)

            # Unary productions start_symbol -> B, only for spans that reach
            # the end of the sentence.  Repeat while the last rule tried in
            # a pass improved ``pi``.
            added = True
            while end == n and added:
                added = False
                # Iterate over a snapshot: check_add_prob may add the start
                # symbol to chart[begin, end] while we loop.
                nts = copy.copy(chart[begin, end])
                for b in nts:
                    prob = pcfg.get((start_symbol, (b,)))
                    if prob:
                        prob = pi[begin, end, b] * prob
                        # c as 0, split as 0 codes a unary rule
                        added = check_add_prob(
                            prob, start_symbol, b, 0, begin, end, 0
                        )

    self.chart = chart
    self.covering = covering
    self.viterbi_back = viterbi_back
    self.pi = pi