Ejemplo n.º 1
0
    def _extractRules_(self, wordRulesFlag, extensiveRulesFlag,
                       phraseRulesFlag, s2t):
        """
        Collect the rules that can be extracted from this bead.

        Rules are gathered in up to four passes:

        1. rules built by ``self._extract_`` from every combination of the
           subtree alignments stored under each key of
           ``self.subtreeAlignmentDic`` (rules with non-terminal Xs);
        2. phrase-pair rules, one per alignment square that passes
           ``self.legalRule``;
        3. one-to-one word-pair rules;
        4. variants of the collected rules with a leading "a"/"the" removed
           from the target side.

        :type wordRulesFlag: bool
        :param wordRulesFlag: word-pair rules (pass 3) are added only when
            both this flag and ``phraseRulesFlag`` are false.
            NOTE(review): the flag name suggests the opposite polarity
            (True = add word rules) -- confirm against the callers.
        :type extensiveRulesFlag: bool
        :param extensiveRulesFlag: when true, run pass 4.
        :type phraseRulesFlag: bool
        :param phraseRulesFlag: when true, pass 2 also uses the squares
            returned by ``extractMinPhrasePairs(self.wordAlignment)`` whose
            first four fields are not already among the subtree-alignment
            squares.
        :param s2t: when true, glue rules are collected in a separate list
            and the target left-hand side of a phrase-pair rule is read from
            field 5 of its square; otherwise that left-hand side is 'X' and
            glue rules are dropped.
        :return: a pair ``(rules, glue_rules)``; ``glue_rules`` is a list
            when ``s2t`` is true and ``None`` otherwise.
        """
        rules = []
        glue_rules = [] if s2t else None

        # Pass 1: rules with non-terminal Xs.
        for key in self.subtreeAlignmentDic:
            alignments = self.subtreeAlignmentDic[key]
            if len(alignments) > 2:
                # prune all rules with more than 2 Xs
                continue
            for combo in util.allCombinations(alignments):
                kind, extracted = self._extract_(key, combo, s2t)
                if s2t and kind == 'glueRule':
                    glue_rules.append(extracted)
                elif kind == 'regularRule':
                    rules.append(extracted)

        # Pass 2: phrase pairs as rules (no non-terminal Xs).  Start from the
        # squares that come with the subtree alignments.
        squares = []
        for group in self.subtreeAlignment:
            squares.extend(group)
        if phraseRulesFlag:
            # Compare on the first four fields only, so that a square that is
            # already present is not added a second time.
            known_spans = [known[:4] for known in squares]
            for candidate in extractMinPhrasePairs(self.wordAlignment):
                if candidate[:4] not in known_spans:
                    squares.append(candidate)

        for square in squares:
            tgt_label = square[5] if s2t else 'X'
            src_span = range(square[0], square[2])
            tgt_span = range(square[1], square[3])
            if not self.legalRule(src_span, tgt_span):
                continue
            # The X-alignment list is empty: these rules contain no Xs.
            rules.append(
                Rule('X', tgt_label, src_span, tgt_span, [],
                     self.wordAlignment, self.srcSnt, self.tgtSnt, square))

        # Pass 3: one-to-one word pairs.
        if not (wordRulesFlag or phraseRulesFlag):
            # Alignment of Xs, not word alignment, so it stays empty.  One
            # list instance is handed to every word-pair rule.
            x_align = []
            matrix = self.wordAlignment
            for i, row in enumerate(matrix):
                for j in xrange(len(matrix[0])):
                    if not row[j]:
                        continue
                    # First aligned column of this row: keep the pair only if
                    # both its row and its column sum to exactly 1.
                    row_total = sum(row)
                    if row_total == 1 and sum([r[j] for r in matrix]) == 1:
                        src_pos, tgt_pos = [i], [j]
                        if self.legalRule(src_pos, tgt_pos):
                            rules.append(
                                Rule('X', 'X', src_pos, tgt_pos, x_align,
                                     matrix, self.srcSnt, self.tgtSnt,
                                     (i, j, i + 1, j + 1)))
                    # Columns after the first aligned one are never examined.
                    break

        # Pass 4: for each rule whose target side has more than one element
        # and starts with "a" or "the", add a copy without that first element.
        if extensiveRulesFlag:
            variants = []
            for base in rules:
                tgt_side = base.rhsTgt
                if len(tgt_side) <= 1 or tgt_side[0] not in ("a", "the"):
                    continue
                variant = Rule(None, None, None, None, None, None, None,
                               None, None)
                variant.lhsSrc = base.lhsSrc
                variant.lhsTgt = base.lhsTgt
                variant.rhsSrc = base.rhsSrc
                variant.rhsTgt = tgt_side[1:]
                # Drop links to the removed position 0 and shift the
                # remaining target indices one to the left.
                variant.alignment = [(link[0], link[1] - 1)
                                     for link in base.alignment
                                     if link[1] > 0]
                box = base.square
                variant.square = (box[0], box[1] + 1, box[2], box[3])
                variants.append(variant)
            # Extend only after the scan so the new variants are not visited.
            rules.extend(variants)

        return rules, glue_rules
Ejemplo n.º 2
0
	def _extractRules_(self, wordRulesFlag, extensiveRulesFlag, phraseRulesFlag, s2t):
		"""
		Collect the rules that can be extracted from this bead.

		Rules are gathered in up to four passes:

		1. rules built by ``self._extract_`` from every combination of the
		   subtree alignments stored under each key of
		   ``self.subtreeAlignmentDic`` (rules with non-terminal Xs);
		2. phrase-pair rules, one per alignment square that passes
		   ``self.legalRule``;
		3. one-to-one word-pair rules;
		4. variants of the collected rules with a leading "a"/"the" removed
		   from the target side.

		:type wordRulesFlag: bool
		:param wordRulesFlag: word-pair rules (pass 3) are added only when
			both this flag and ``phraseRulesFlag`` are false.
			NOTE(review): the flag name suggests the opposite polarity
			(True = add word rules) -- confirm against the callers.
		:type extensiveRulesFlag: bool
		:param extensiveRulesFlag: when true, run pass 4.
		:type phraseRulesFlag: bool
		:param phraseRulesFlag: when true, pass 2 also uses the squares
			returned by ``extractMinPhrasePairs(self.wordAlignment)`` whose
			first four fields are not already among the subtree-alignment
			squares.
		:param s2t: when true, glue rules are collected in a separate list
			and the target left-hand side of a phrase-pair rule is read from
			field 5 of its square; otherwise that left-hand side is 'X' and
			glue rules are dropped.
		:return: a pair ``(rules, glue_rules)``; ``glue_rules`` is a list
			when ``s2t`` is true and ``None`` otherwise.
		"""
		rules = []
		glue_rules = [] if s2t else None

		# Pass 1: rules with non-terminal Xs.
		for key in self.subtreeAlignmentDic:
			alignments = self.subtreeAlignmentDic[key]
			if len(alignments) > 2:
				# prune all rules with more than 2 Xs
				continue
			for combo in util.allCombinations(alignments):
				kind, extracted = self._extract_(key, combo, s2t)
				if s2t and kind == 'glueRule':
					glue_rules.append(extracted)
				elif kind == 'regularRule':
					rules.append(extracted)

		# Pass 2: phrase pairs as rules (no non-terminal Xs).  Start from the
		# squares that come with the subtree alignments.
		squares = []
		for group in self.subtreeAlignment:
			squares.extend(group)
		if phraseRulesFlag:
			# Compare on the first four fields only, so that a square that is
			# already present is not added a second time.
			known_spans = [known[:4] for known in squares]
			for candidate in extractMinPhrasePairs(self.wordAlignment):
				if candidate[:4] not in known_spans:
					squares.append(candidate)

		for square in squares:
			tgt_label = square[5] if s2t else 'X'
			src_span = range(square[0], square[2])
			tgt_span = range(square[1], square[3])
			if not self.legalRule(src_span, tgt_span):
				continue
			# The X-alignment list is empty: these rules contain no Xs.
			rules.append(Rule('X', tgt_label, src_span, tgt_span, [], self.wordAlignment, self.srcSnt, self.tgtSnt, square))

		# Pass 3: one-to-one word pairs.
		if not (wordRulesFlag or phraseRulesFlag):
			# Alignment of Xs, not word alignment, so it stays empty.  One
			# list instance is handed to every word-pair rule.
			x_align = []
			matrix = self.wordAlignment
			for i, row in enumerate(matrix):
				for j in xrange(len(matrix[0])):
					if not row[j]:
						continue
					# First aligned column of this row: keep the pair only if
					# both its row and its column sum to exactly 1.
					row_total = sum(row)
					if row_total == 1 and sum([r[j] for r in matrix]) == 1:
						src_pos, tgt_pos = [i], [j]
						if self.legalRule(src_pos, tgt_pos):
							rules.append(Rule('X', 'X', src_pos, tgt_pos, x_align, matrix, self.srcSnt, self.tgtSnt, (i, j, i + 1, j + 1)))
					# Columns after the first aligned one are never examined.
					break

		# Pass 4: for each rule whose target side has more than one element
		# and starts with "a" or "the", add a copy without that first element.
		if extensiveRulesFlag:
			variants = []
			for base in rules:
				tgt_side = base.rhsTgt
				if len(tgt_side) <= 1 or tgt_side[0] not in ("a", "the"):
					continue
				variant = Rule(None, None, None, None, None, None, None, None, None)
				variant.lhsSrc = base.lhsSrc
				variant.lhsTgt = base.lhsTgt
				variant.rhsSrc = base.rhsSrc
				variant.rhsTgt = tgt_side[1:]
				# Drop links to the removed position 0 and shift the
				# remaining target indices one to the left.
				variant.alignment = [(link[0], link[1] - 1) for link in base.alignment if link[1] > 0]
				box = base.square
				variant.square = (box[0], box[1] + 1, box[2], box[3])
				variants.append(variant)
			# Extend only after the scan so the new variants are not visited.
			rules.extend(variants)

		return rules, glue_rules