Beispiel #1
0
 def __extract_cfg_line(self, s, lower, numerate):
     """Extract productions from one bracketed parse tree by matching parentheses.

     Scans ``s`` left to right, remembering the index of every ``(`` that
     has not been closed yet.  Each ``)`` closes one constituent, whose
     label is the text between its ``(`` and the following space.  Every
     constituent that has a parent is relabelled ``label^parent_label`` and
     recorded as a child of that parent.  One production is passed to
     ``self.__add_production`` per constituent: ``(lhs, word)`` when the
     constituent directly wraps a word, ``(lhs, [child labels])`` otherwise.

     Args:
         s: One parse tree in bracketed form, e.g. ``"(S (NP (DT the)))"``.
         lower: If true, lowercase terminal words.
         numerate: If true, replace terminals for which
             ``util.is_numeral`` is true with ``util.NUMERAL``.

     Raises:
         IndexError: If a ``)`` has no matching ``(``.
     """
     open_parens = []  # indices of every '(' still waiting for its ')'
     children = defaultdict(list)  # index of a '(' -> labels of its children
     for i, char in enumerate(s):
         if char == '(':
             open_parens.append(i)
         elif char == ')':
             start = open_parens.pop() + 1
             end = s.find(' ', start)
             lhs = s[start:end]
             # tolerate improper format with one space before the paren:
             # ``last`` is the last real character of this constituent
             last = i - 2 if s[i - 1] == ' ' else i - 1
             if open_parens:
                 parent = open_parens[-1]
                 # annotate the label with its parent's label
                 lhs = "%s^%s" % (lhs, s[parent + 1:s.find(' ', parent)])
                 children[parent].append(lhs)
             if s[last] != ')':
                 # terminal: the word runs from after the label up to and
                 # including ``last``, so a stray trailing space is dropped
                 rhs = s[end + 1:last + 1]
                 if lower:
                     # make terminals lowercase
                     rhs = rhs.lower()
                 # tag numerals as all the same
                 if numerate and util.is_numeral(rhs):
                     # digits can have . and ,
                     rhs = util.NUMERAL
                 self.__add_production(lhs, rhs)
             else:
                 # non-terminal: right-hand side is the list of child labels
                 # collected under this constituent's own '('
                 self.__add_production(lhs, children[start - 1])
Beispiel #2
0
 def __extract_cfg_line(self, s, lower, numerate):
     """Extract productions from one bracketed parse tree by matching parentheses.

     Scans ``s`` left to right, remembering the index of every ``(`` that
     has not been closed yet.  Each ``)`` closes one constituent, whose
     label is the text between its ``(`` and the following space.  Every
     constituent that has a parent is relabelled ``label^parent_label`` and
     recorded as a child of that parent.  One production is passed to
     ``self.__add_production`` per constituent: ``(lhs, word)`` when the
     constituent directly wraps a word, ``(lhs, [child labels])`` otherwise.

     Args:
         s: One parse tree in bracketed form, e.g. ``"(S (NP (DT the)))"``.
         lower: If true, lowercase terminal words.
         numerate: If true, replace terminals for which
             ``util.is_numeral`` is true with ``util.NUMERAL``.

     Raises:
         IndexError: If a ``)`` has no matching ``(``.
     """
     open_parens = []  # indices of every "(" still waiting for its ")"
     children = defaultdict(list)  # index of a "(" -> labels of its children
     for i, char in enumerate(s):
         if char == "(":
             open_parens.append(i)
         elif char == ")":
             start = open_parens.pop() + 1
             end = s.find(" ", start)
             lhs = s[start:end]
             # tolerate improper format with one space before the paren:
             # ``last`` is the last real character of this constituent
             last = i - 2 if s[i - 1] == " " else i - 1
             if open_parens:
                 parent = open_parens[-1]
                 # annotate the label with its parent's label
                 lhs = "%s^%s" % (lhs, s[parent + 1 : s.find(" ", parent)])
                 children[parent].append(lhs)
             if s[last] != ")":
                 # terminal: the word runs from after the label up to and
                 # including ``last``, so a stray trailing space is dropped
                 rhs = s[end + 1 : last + 1]
                 if lower:
                     # make terminals lowercase
                     rhs = rhs.lower()
                 # tag numerals as all the same
                 if numerate and util.is_numeral(rhs):
                     # digits can have . and ,
                     rhs = util.NUMERAL
                 self.__add_production(lhs, rhs)
             else:
                 # non-terminal: right-hand side is the list of child labels
                 # collected under this constituent's own "("
                 self.__add_production(lhs, children[start - 1])
	def __create_chart(self):
		"""Build the parse chart bottom-up using the CYK algorithm.

		Reads the grammar tables ``self.G.cfg_r2l`` and ``self.G.pcfg``, the
		sentence (``self.sentence``, ``self.sentence_len``), ``self.numerate``,
		``self.verbose`` and ``self.start_symbol``, and stores four tables on
		``self`` when done:

		* ``chart[begin, end]``: set of symbols found over that span with
		  probability > 0.
		* ``covering[begin, end, start_symbol]``: set of ``(b, c)`` right-hand
		  sides that produced the start symbol over that span.
		* ``pi[begin, end, a]``: highest probability found for ``a`` over the
		  span.
		* ``viterbi_back[begin, end, a]``: ``[b, c, split]`` of that best
		  derivation; ``split == -1`` marks a terminal, and ``c == 0`` with
		  ``split == 0`` marks a unary rule.
		"""
		# create local vars for memory efficiency
		cfg_r2l = self.G.cfg_r2l
		pcfg = self.G.pcfg
		n = self.sentence_len
		s = self.sentence
		verbose = self.verbose
		start_symbol = self.start_symbol
		unknown = util.UNKNOWN

		chart = defaultdict(set)
		covering = defaultdict(set)
		viterbi_back = dict()
		pi = defaultdict(float)

		# local function for efficiency
		def check_add_prob(prob, a, b, c, begin, end, split):
			"""Record ``a -> b c`` over [begin, end]; return True if it is the new best."""
			# add production to this chart location
			if prob > 0:
				if verbose > 1:
					util.log_p("add C %s => (%s %s) to [%d, %d] split: %d." % (a, b, c, begin, end, split))
				chart[begin, end].add(a)
				# store our covering productions
				if a == start_symbol:
					covering[begin, end, a].add((b, c))
			# if max, break ties by not changing
			if prob > pi[begin, end, a]:
				if verbose > 1:
					util.log_p("add pi %s => (%s %s) to [%d, %d] split: %d." % (a, b, c, begin, end, split))
				pi[begin, end, a] = prob
				viterbi_back[begin, end, a] = [b, c, split]
				return True
			return False

		# spans of length 1: terminal rules A -> word
		for i in range(n):
			token = s[i]
			if self.numerate and util.is_numeral(token):
				# replace numerals with code
				word = util.NUMERAL
			elif not cfg_r2l[token]:
				# replace never seen words with code
				word = unknown
			else:
				word = token
			for a in cfg_r2l[word]:
				prob = pcfg[a, word]
				# split as -1 codes a terminal; the back-pointer keeps the
				# original token, not the numeral/unknown code
				check_add_prob(prob, a, token, 0, i, i + 1, -1)

		# longer spans, shortest first
		for span in range(2, n + 1):
			for begin in range(n + 1 - span):
				end = begin + span
				for split in range(begin + 1, end):
					for b in chart[begin, split]:
						for c in chart[split, end]:
							for a in cfg_r2l[b, c]:
								# prob for all productions A -> B C
								prob = pcfg[a, (b, c)]
								prob = pi[begin, split, b] * pi[split, end, c] * prob
								check_add_prob(prob, a, b, c, begin, end, split)
					# for unary productions TOP -> B, applied only to spans
					# that reach the end of the sentence.
					# NOTE(review): this pass runs once per split point, so
					# spans of length 1 (which have no split) never get it --
					# confirm single-word sentences are handled elsewhere.
					if end != n:
						continue
					improved = True
					while improved:
						improved = False
						# iterate over a snapshot: check_add_prob may add the
						# start symbol to this very cell
						for b in list(chart[begin, end]):
							prob = pcfg.get((start_symbol, (b,)))
							if prob:
								prob = pi[begin, end, b] * prob
								# c as 0, split as 0 codes a unary rule.
								# Accumulate rather than overwrite, so an
								# improvement from any b triggers another
								# pass regardless of set iteration order.
								if check_add_prob(prob, start_symbol, b, 0, begin, end, 0):
									improved = True
		self.chart = chart
		self.covering = covering
		self.viterbi_back = viterbi_back
		self.pi = pi