def _extractRules_(self, wordRulesFlag, extensiveRulesFlag, phraseRulesFlag, s2t):
    """
    Extract the translation rules of this bead.

    Rules are gathered in up to four stages:

    1. rules containing non-terminal Xs, built by ``self._extract_`` from
       ``self.subtreeAlignmentDic``;
    2. phrase-pair rules (no Xs), one per square in ``self.subtreeAlignment``
       and, optionally, per minimal phrase pair of the word alignment;
    3. one-to-one word-pair rules read off ``self.wordAlignment``;
    4. "extensive" variants of the rules above with a leading target-side
       determiner removed.

    :type wordRulesFlag: bool
    :param wordRulesFlag: stage 3 runs only when both this flag and
        ``phraseRulesFlag`` are false.
        NOTE(review): earlier documentation of this flag said that True
        *adds* the word-pair rules, which is the opposite of what the code
        does -- confirm the intended polarity against the callers.
    :type extensiveRulesFlag: bool
    :param extensiveRulesFlag: if true, run stage 4.
    :type phraseRulesFlag: bool
    :param phraseRulesFlag: if true, stage 2 also uses the squares returned by
        ``extractMinPhrasePairs(self.wordAlignment)`` whose first four fields
        are not already among the subtree-alignment squares.
    :type s2t: bool
    :param s2t: if true, glue rules are collected separately and the target
        left-hand side of a phrase-pair rule is taken from ``square[5]``
        (presumably the target-side syntactic tag -- TODO confirm) instead
        of ``'X'``.

    :rtype: tuple
    :return: ``(ruleList, glueRuleList)``; ``glueRuleList`` is ``None`` when
        ``s2t`` is false.
    """
    ruleList = []
    glueRuleList = [] if s2t else None

    # Stage 1: rules with non-terminal Xs.
    for key in self.subtreeAlignmentDic:
        subtreeAlignments = self.subtreeAlignmentDic[key]
        if len(subtreeAlignments) > 2:
            # prune all rules with more than 2 Xs
            continue
        for subaList in util.allCombinations(subtreeAlignments):
            ruleType, tmpRule = self._extract_(key, subaList, s2t)
            if s2t and ruleType == 'glueRule':
                glueRuleList.append(tmpRule)
            elif ruleType == 'regularRule':
                ruleList.append(tmpRule)

    # Stage 2: phrase pairs as rules (no Xs). By default only the phrase
    # pairs that satisfy the tree structures are used.
    squareList = [suba for lis in self.subtreeAlignment for suba in lis]
    if phraseRulesFlag:
        # Also add the phrase pairs that do not satisfy the tree structures,
        # comparing on the first four fields so no square is added twice.
        squareListWOtags = [suba[:4] for suba in squareList]
        squareList += [suba
                       for suba in extractMinPhrasePairs(self.wordAlignment)
                       if suba[:4] not in squareListWOtags]

    for square in squareList:
        lhsSrc = 'X'
        lhsTgt = square[5] if s2t else 'X'
        # list(...) keeps these real lists on Python 3, where range() is lazy.
        rhsSrc = list(range(square[0], square[2]))
        rhsTgt = list(range(square[1], square[3]))
        align = []
        if self.legalRule(rhsSrc, rhsTgt):
            ruleList.append(Rule(lhsSrc, lhsTgt, rhsSrc, rhsTgt, align,
                                 self.wordAlignment, self.srcSnt, self.tgtSnt,
                                 square))

    # Stage 3: word pairs that are linked one-to-one in the word alignment.
    if not wordRulesFlag and not phraseRulesFlag:
        wordAlignment = self.wordAlignment
        lhsSrc, lhsTgt = 'X', 'X'
        # align holds the alignment of Xs, not the word alignment, so it
        # stays empty; as before, one list object is shared by these rules.
        align = []
        numTgt = len(wordAlignment[0]) if len(wordAlignment) > 0 else 0
        # Column sums are loop-invariant: compute them once instead of once
        # per aligned cell.
        colSums = [sum(row[j] for row in wordAlignment)
                   for j in range(numTgt)]
        for i, row in enumerate(wordAlignment):
            if sum(row) != 1:
                continue
            for j in range(numTgt):
                if not row[j]:
                    continue
                if colSums[j] == 1:
                    rhsSrc, rhsTgt = [i], [j]
                    if self.legalRule(rhsSrc, rhsTgt):
                        ruleList.append(Rule(lhsSrc, lhsTgt, rhsSrc, rhsTgt,
                                             align, self.wordAlignment,
                                             self.srcSnt, self.tgtSnt,
                                             (i, j, i + 1, j + 1)))
                # Only the first aligned cell of a row is examined.
                break

    # Stage 4: extensive rules. For an existing rule such as
    # "... ||| a peace agreement [X] ||| ...", add
    # "... ||| peace agreement [X] ||| ..." with the leading determiner
    # ("a" or "the") removed.
    # NOTE(review): the original check that no X alignment points at target
    # position 0 is disabled; such links are dropped below instead.
    if extensiveRulesFlag:
        tmpRuleList = []
        for rule in ruleList:
            if len(rule.rhsTgt) > 1 and rule.rhsTgt[0] in ["a", "the"]:
                tmpRule = Rule(None, None, None, None, None,
                               None, None, None, None)
                tmpRule.lhsSrc = rule.lhsSrc
                tmpRule.lhsTgt = rule.lhsTgt
                tmpRule.rhsSrc = rule.rhsSrc
                tmpRule.rhsTgt = rule.rhsTgt[1:]
                # Shift the target positions left by one to match rhsTgt[1:].
                tmpRule.alignment = [(align[0], align[1] - 1)
                                     for align in rule.alignment
                                     if align[1] > 0]
                # The square starts one target position later.
                tmpRule.square = (rule.square[0], rule.square[1] + 1,
                                  rule.square[2], rule.square[3])
                tmpRuleList.append(tmpRule)
        ruleList.extend(tmpRuleList)

    return ruleList, glueRuleList
def _extractRules_(self, wordRulesFlag, extensiveRulesFlag, phraseRulesFlag, s2t):
    """
    Extract the translation rules of this bead.

    Rules are gathered in up to four stages:

    1. rules containing non-terminal Xs, built by ``self._extract_`` from
       ``self.subtreeAlignmentDic``;
    2. phrase-pair rules (no Xs), one per square in ``self.subtreeAlignment``
       and, optionally, per minimal phrase pair of the word alignment;
    3. one-to-one word-pair rules read off ``self.wordAlignment``;
    4. "extensive" variants of the rules above with a leading target-side
       determiner removed.

    :type wordRulesFlag: bool
    :param wordRulesFlag: stage 3 runs only when both this flag and
        ``phraseRulesFlag`` are false.
        NOTE(review): earlier documentation of this flag said that True
        *adds* the word-pair rules, which is the opposite of what the code
        does -- confirm the intended polarity against the callers.
    :type extensiveRulesFlag: bool
    :param extensiveRulesFlag: if true, run stage 4.
    :type phraseRulesFlag: bool
    :param phraseRulesFlag: if true, stage 2 also uses the squares returned by
        ``extractMinPhrasePairs(self.wordAlignment)`` whose first four fields
        are not already among the subtree-alignment squares.
    :type s2t: bool
    :param s2t: if true, glue rules are collected separately and the target
        left-hand side of a phrase-pair rule is taken from ``square[5]``
        (presumably the target-side syntactic tag -- TODO confirm) instead
        of ``'X'``.

    :rtype: tuple
    :return: ``(ruleList, glueRuleList)``; ``glueRuleList`` is ``None`` when
        ``s2t`` is false.
    """
    ruleList = []
    glueRuleList = [] if s2t else None

    # Stage 1: rules with non-terminal Xs.
    for key in self.subtreeAlignmentDic:
        subtreeAlignments = self.subtreeAlignmentDic[key]
        if len(subtreeAlignments) > 2:
            # prune all rules with more than 2 Xs
            continue
        for subaList in util.allCombinations(subtreeAlignments):
            ruleType, tmpRule = self._extract_(key, subaList, s2t)
            if s2t and ruleType == 'glueRule':
                glueRuleList.append(tmpRule)
            elif ruleType == 'regularRule':
                ruleList.append(tmpRule)

    # Stage 2: phrase pairs as rules (no Xs). By default only the phrase
    # pairs that satisfy the tree structures are used.
    squareList = [suba for lis in self.subtreeAlignment for suba in lis]
    if phraseRulesFlag:
        # Also add the phrase pairs that do not satisfy the tree structures,
        # comparing on the first four fields so no square is added twice.
        squareListWOtags = [suba[:4] for suba in squareList]
        squareList += [suba
                       for suba in extractMinPhrasePairs(self.wordAlignment)
                       if suba[:4] not in squareListWOtags]

    for square in squareList:
        lhsSrc = 'X'
        lhsTgt = square[5] if s2t else 'X'
        # list(...) keeps these real lists on Python 3, where range() is lazy.
        rhsSrc = list(range(square[0], square[2]))
        rhsTgt = list(range(square[1], square[3]))
        align = []
        if self.legalRule(rhsSrc, rhsTgt):
            ruleList.append(Rule(lhsSrc, lhsTgt, rhsSrc, rhsTgt, align,
                                 self.wordAlignment, self.srcSnt, self.tgtSnt,
                                 square))

    # Stage 3: word pairs that are linked one-to-one in the word alignment.
    if not wordRulesFlag and not phraseRulesFlag:
        wordAlignment = self.wordAlignment
        lhsSrc, lhsTgt = 'X', 'X'
        # align holds the alignment of Xs, not the word alignment, so it
        # stays empty; as before, one list object is shared by these rules.
        align = []
        numTgt = len(wordAlignment[0]) if len(wordAlignment) > 0 else 0
        # Column sums are loop-invariant: compute them once instead of once
        # per aligned cell.
        colSums = [sum(row[j] for row in wordAlignment)
                   for j in range(numTgt)]
        for i, row in enumerate(wordAlignment):
            if sum(row) != 1:
                continue
            for j in range(numTgt):
                if not row[j]:
                    continue
                if colSums[j] == 1:
                    rhsSrc, rhsTgt = [i], [j]
                    if self.legalRule(rhsSrc, rhsTgt):
                        ruleList.append(Rule(lhsSrc, lhsTgt, rhsSrc, rhsTgt,
                                             align, self.wordAlignment,
                                             self.srcSnt, self.tgtSnt,
                                             (i, j, i + 1, j + 1)))
                # Only the first aligned cell of a row is examined.
                break

    # Stage 4: extensive rules. For an existing rule such as
    # "... ||| a peace agreement [X] ||| ...", add
    # "... ||| peace agreement [X] ||| ..." with the leading determiner
    # ("a" or "the") removed.
    # NOTE(review): the original check that no X alignment points at target
    # position 0 is disabled; such links are dropped below instead.
    if extensiveRulesFlag:
        tmpRuleList = []
        for rule in ruleList:
            if len(rule.rhsTgt) > 1 and rule.rhsTgt[0] in ["a", "the"]:
                tmpRule = Rule(None, None, None, None, None,
                               None, None, None, None)
                tmpRule.lhsSrc = rule.lhsSrc
                tmpRule.lhsTgt = rule.lhsTgt
                tmpRule.rhsSrc = rule.rhsSrc
                tmpRule.rhsTgt = rule.rhsTgt[1:]
                # Shift the target positions left by one to match rhsTgt[1:].
                tmpRule.alignment = [(align[0], align[1] - 1)
                                     for align in rule.alignment
                                     if align[1] > 0]
                # The square starts one target position later.
                tmpRule.square = (rule.square[0], rule.square[1] + 1,
                                  rule.square[2], rule.square[3])
                tmpRuleList.append(tmpRule)
        ruleList.extend(tmpRuleList)

    return ruleList, glueRuleList