def loadGlueRules(self):
    '''Loads the glue rules along with their feature values.

    Reads settings.opts.glueFile line by line. A rule line has the form
    "src ||| tgt ||| glue_val". Everything from the first '#' on a line is
    treated as a comment; comment-only lines and blank lines are skipped.

    Each rule is built with RuleItem.initGlue(), scored, and stored in
    PhraseTable.ruleDict[src] as a one-element list (replacing whatever was
    stored under that source side before). When settings.opts.no_glue_penalty
    is set, the glue feature of the 'S__1 X__2' rule is turned off before
    scoring.

    Raises ValueError if a rule line does not consist of exactly three
    ' ||| ' separated fields or if glue_val is not a valid float.
    '''

    # 'with' guarantees the file is closed even if a rule line is malformed
    with open(settings.opts.glueFile, 'r') as gF:
        sys.stderr.write("Loading Glue rules from file : %s\n" % (settings.opts.glueFile))
        for line in gF:
            # Strip full-line and trailing comments in a single step
            line = line.split('#', 1)[0].strip()
            if not line:
                continue                    # Blank or comment-only line: nothing to parse

            (src, tgt, glue_val) = line.split(' ||| ')
            rule_obj = RuleItem.initGlue(src, tgt, float(glue_val))
            if settings.opts.no_glue_penalty and src == 'S__1 X__2':
                rule_obj.turnOffGlue()
            rule_obj.scoreRule()
            PhraseTable.ruleDict[src] = [rule_obj]
def loadGlueRules(self):
    '''Loads the glue rules along with their feature values.

    Reads settings.opts.glueFile line by line. A rule line has the form
    "src ||| tgt ||| glue_val". Everything from the first '#' on a line is
    treated as a comment; comment-only lines and blank lines are skipped.

    Each rule is built with RuleItem.initGlue(), scored, and stored in
    PhraseTable.ruleDict[src] as a one-element list (replacing whatever was
    stored under that source side before). When settings.opts.no_glue_penalty
    is set, the glue feature of the 'S__1 X__2' rule is turned off before
    scoring.

    Raises ValueError if a rule line does not consist of exactly three
    ' ||| ' separated fields or if glue_val is not a valid float.
    '''

    # 'with' guarantees the file is closed even if a rule line is malformed
    with open(settings.opts.glueFile, 'r') as gF:
        sys.stderr.write("Loading Glue rules from file : %s\n" % (settings.opts.glueFile))
        for line in gF:
            # Strip full-line and trailing comments in a single step
            line = line.split('#', 1)[0].strip()
            if not line:
                continue                    # Blank or comment-only line: nothing to parse

            (src, tgt, glue_val) = line.split(' ||| ')
            rule_obj = RuleItem.initGlue(src, tgt, float(glue_val))
            if settings.opts.no_glue_penalty and src == 'S__1 X__2':
                rule_obj.turnOffGlue()
            rule_obj.scoreRule()
            PhraseTable.ruleDict[src] = [rule_obj]
def parse(self):
    '''Parse the sentence held in self.wordsLst, filling Parse.chartDict.

    Phase-1 creates a Cell for every single-word span (i, i) and flushes the
    phrase-table (or newly added UNK) rules for that word into it.
    Phase-2 creates Cells for longer spans; spans longer than
    settings.opts.max_phr_len are only built when they start at position 0.
    Spans starting at position 0 additionally get the glue rules applied and
    are reduced as 'S' cells.

    Returns:
        0  -- force-decode mode is on and forceDecodePrune() reported no
              matching candidate for the final cell
        99 -- the final cell (0, sent_len - 1) has no S tree
        1  -- otherwise, after printing the N-best list (and the translation
              trace when settings.opts.trace_rules > 0)

    NOTE(review): the block nesting below was reconstructed from
    whitespace-collapsed source; confirm the placement of the
    'if consObjsLst' and 'if settings.opts.force_decode' blocks against the
    upstream file.
    '''

    # Module-level list; only read in this method.
    # NOTE(review): presumably filled by the __get*Spans helpers -- verify.
    global consObjsLst
    final_cell = False
    glueSrcLst = ['X__1', 'S__1 X__2']

    # Phase-1: Initialization
    # Fill the initial axioms in the chartDict (Dict of dict) in corresponding word positions
    p_i = 0
    for p_word in self.wordsLst:
        # print "Span:", p_i, p_i, "\tSpan length: 1"
        # A one-word sentence makes cell (0, 0) the final cell
        if ( p_i == 0 and self.sent_len == 1 ):
            final_cell = True
        Parse.chartDict[(p_i, p_i)] = Cell()

        # if the word is UNK; add it to ruleDict as: X -> <w_i, w_i> with default prob
        if not PhraseTable.hasRule(p_word):
            (unk_score, unk_lm_heu, unk_featVec) = FeatureManager.unkRuleTup
            PhraseTable.addUNKRule( p_word, RuleItem.initUNKRule(p_word, unk_featVec, unk_score, unk_lm_heu) )

        # Known (X -> <w_i, w_t>) or unknown (X -> <w_i, w_i>) rules are now flushed to the chart
        self.__flush2Cell( (p_i, p_i), ('X', p_word), 0, self.__getRulesFromPT(p_word, (p_i, p_i)) )     # Flush the entries to the cell
        #Parse.chartDict[(p_i, p_i)].printCell('X', self.sent_indx)

        # Add the glue rule S --> <X__1, X__1> in cell (0, 0)
        if p_i == 0:
            p_src = glueSrcLst[0]
            self.__getGlueRuleSpans((p_i, p_i), p_src)
            if consObjsLst:
                Parse.chartDict[(p_i, p_i)].has_S_tree = True
                self.__reduceCell((p_i, p_i), 'S', 'S', final_cell)     # Compute the n-best list from the parse forest
                # NOTE(review): nesting of the force-decode check under 'if consObjsLst' is reconstructed -- confirm
                if settings.opts.force_decode:
                    force_dec_status = Parse.chartDict[(0, p_i)].forceDecodePrune(self.refsLst, final_cell)
                    # Only a failure in the final cell aborts the parse
                    if final_cell and not force_dec_status:
                        sys.stderr.write(" INFO :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n" % (p_i))
                        return 0
            #Parse.chartDict[(0, p_i)].printCell('S', self.sent_indx)

        p_i += 1

    # Phase-2: Filling the CKY table
    # Iterate through all possible spans of length 2 thro' M (maximum phrase length)
    # p_l is the span length minus one; p_i and p_j are the inclusive span boundaries
    for p_l in range(1, self.sent_len):
        for p_j in range(p_l, self.sent_len):
            p_i = p_j - p_l
            # print "\nSpan:", p_i, p_j, "\tSpan length:", p_l + 1
            # If the span length is greater than the 'maximum phrase length' skip to next iteration of p_l
            # (the span starting at 0 is still processed because it is needed for the glue rules)
            if p_l >= settings.opts.max_phr_len and p_i != 0:
                break

            Parse.chartDict[(p_i, p_j)] = Cell()
            p_cell_type = 'X'
            p_left_nt = 'X'
            if ( p_i == 0 and p_j == self.sent_len - 1 ):
                final_cell = True
            if p_l < settings.opts.max_phr_len:
                self.__getRuleSpans( p_i, p_j, ' '.join(self.wordsLst[p_i:p_j+1]) )

            # NOTE(review): this check is placed as a sibling of the length test above (reconstructed) -- confirm
            if consObjsLst:
                self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt, final_cell)
            #Parse.chartDict[(p_i, p_j)].printCell('X', self.sent_indx)

            # For span beginning at '0' (top row in the parse triangle), add items of the form [S, i, j]:w to chart
            # Glue rules are: S --> (X__1, X__1) and S --> (S__1 X__2, S__1 X__2)
            # Sentence boundary markers <s> and </s> are added in Cube-Pruning step (lazyMerge_CP.py)
            if p_i == 0:
                p_cell_type = 'S'
                p_left_nt = 'S'
                for p_src in glueSrcLst:
                    self.__getGlueRuleSpans((p_i, p_j), p_src)

                if consObjsLst:
                    Parse.chartDict[(p_i, p_j)].has_S_tree = True
                    self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt, final_cell)
                    # NOTE(review): nesting of the force-decode check under 'if consObjsLst' is reconstructed -- confirm
                    if settings.opts.force_decode:
                        force_dec_status = Parse.chartDict[(p_i, p_j)].forceDecodePrune(self.refsLst, final_cell)
                        # Only a failure in the final cell aborts the parse
                        if final_cell and not force_dec_status:
                            sys.stderr.write(" INFO :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n" % (p_j))
                            return 0
                #Parse.chartDict[(p_i, p_j)].printCell('S', self.sent_indx)

    # The final cell spans the whole sentence: (0, sent_len - 1)
    p_j = self.sent_len - 1
    if not Parse.chartDict[(0, p_j)].has_S_tree:
        return 99
    Parse.chartDict[(0, p_j)].printNBest('S', self.sent_indx)       # Print the N-best derivations in the last cell
    if settings.opts.trace_rules > 0:
        #Parse.chartDict[(0, p_j)].trackRulesUsed('S')              # Track the rules used in the top-k translations
        Parse.chartDict[(0, p_j)].printTrace('S', self.sent)        # Prints the translation trace for the top-3 entries
    return 1
def loadRules(self):
    '''Loads the filtered rules and filters them further by using the Suffix Tree of test data.

    Reads settings.opts.ruleFile, one "src ||| tgt ||| probs" rule per line
    (Kriya phrase-table format); blank lines are skipped. A rule is dropped
    when
      * settings.opts.force_decode is set and PhraseTable.tgtMatchesRef(tgt)
        is false, or
      * settings.opts.one_nt_decode is set and the source side contains 'X__2'.

    Consecutive rules sharing a source side form one group. Each new source
    side is added to PhraseTable.src_trie. When a group ends, its entries are
    sorted by descending 'prob_e_f', the best settings.opts.ttl of them (all
    of them when ttl <= 0) are scored and stored in PhraseTable.ruleDict[src].
    Rules of one source side must therefore be adjacent in the file: a source
    side that reappears later overwrites its earlier ruleDict entry.

    Updates PhraseTable.tot_rule_pairs and writes loading statistics to
    stderr. Nothing is added to PhraseTable.ruleDict when no rule survives
    the filters.

    Raises ValueError if a non-blank line does not consist of exactly three
    ' ||| ' separated fields.
    '''

    PhraseTable.tot_rule_pairs = 0
    prev_src = ''
    uniq_src_rules = 0
    entriesLst = []

    def flushEntries(src_key):
        '''Stores the best entries collected for src_key in ruleDict and empties entriesLst'''
        entriesLst.sort(key=operator.attrgetter("prob_e_f"), reverse=True)
        kept_rules = []
        for tgt_options, trans_option in enumerate(entriesLst, 1):
            rule_obj = trans_option.rule
            rule_obj.scoreRule()            # Only the rules that are kept get scored
            kept_rules.append(rule_obj)
            if settings.opts.ttl > 0 and tgt_options >= settings.opts.ttl:
                break
        PhraseTable.ruleDict[src_key] = kept_rules
        del entriesLst[:]

    t_beg = time.time()
    # 'with' guarantees the file is closed even if a rule line is malformed
    with open(settings.opts.ruleFile, 'r') as rF:
        sys.stderr.write("Loading SCFG rules from file : %s\n" % (settings.opts.ruleFile))
        for line in rF:
            line = line.strip()
            if not line:
                continue                    # Blank line: nothing to parse

            (src, tgt, probs) = line.split(' ||| ')                       # For Kriya phrase table
            # (src, tgt, f_align, r_align, probs) = line.split(' ||| ')   # For Moses phrase table
            if settings.opts.force_decode and not PhraseTable.tgtMatchesRef(tgt):
                continue
            if settings.opts.one_nt_decode and 'X__2' in src:
                continue

            PhraseTable.tot_rule_pairs += 1
            if prev_src != src:
                # A new source side starts: register it and close the previous group
                uniq_src_rules += 1
                if PhraseTable.src_trie is None:
                    PhraseTable.src_trie = SimpleSuffixTree(src, settings.opts.fr_rule_terms)
                else:
                    PhraseTable.src_trie.addText(src)

                if prev_src:
                    flushEntries(prev_src)

            rule = RuleItem.initRule(src, tgt, probs)
            entriesLst.append(TransOption(rule.getScore4TTL(), rule))
            prev_src = src

        # Handle the last group; prev_src is still '' when no rule was loaded,
        # in which case no (empty-keyed) entry must be created
        if prev_src:
            flushEntries(prev_src)

    t_end = time.time()
    sys.stderr.write("Unique source rules found : %d\n" % (uniq_src_rules))
    sys.stderr.write("Total pairs of SCFG rules loaded : %d\n" % (PhraseTable.tot_rule_pairs))
    sys.stderr.write("Time taken for loading rules in dict and Trie : %1.3f sec\n\n" % (t_end - t_beg))

    return None
def parse(self):
    '''Parse the sentence held in self.wordsLst, filling Parse.chartDict.

    Phase-1 creates a Cell for every single-word span (i, i) and flushes the
    phrase-table (or newly added UNK) rules for that word into it.
    Phase-2 creates Cells for longer spans; spans longer than
    settings.opts.max_phr_len are only built when they start at position 0.
    Spans starting at position 0 additionally get the glue rules applied and
    are reduced as 'S' cells.

    Returns:
        0  -- force-decode mode is on and forceDecodePrune() reported no
              matching candidate for the final cell
        99 -- the final cell (0, sent_len - 1) has no S tree
        1  -- otherwise, after printing the N-best list (and the translation
              trace when settings.opts.trace_rules > 0)

    NOTE(review): the block nesting below was reconstructed from
    whitespace-collapsed source; confirm the placement of the
    'if consObjsLst' and 'if settings.opts.force_decode' blocks against the
    upstream file.
    '''

    # Module-level list; only read in this method.
    # NOTE(review): presumably filled by the __get*Spans helpers -- verify.
    global consObjsLst
    final_cell = False
    glueSrcLst = ['X__1', 'S__1 X__2']

    # Phase-1: Initialization
    # Fill the initial axioms in the chartDict (Dict of dict) in corresponding word positions
    p_i = 0
    for p_word in self.wordsLst:
        # print "Span:", p_i, p_i, "\tSpan length: 1"
        # A one-word sentence makes cell (0, 0) the final cell
        if (p_i == 0 and self.sent_len == 1):
            final_cell = True
        Parse.chartDict[(p_i, p_i)] = Cell()

        # if the word is UNK; add it to ruleDict as: X -> <w_i, w_i> with default prob
        if not PhraseTable.hasRule(p_word):
            (unk_score, unk_lm_heu, unk_featVec) = FeatureManager.unkRuleTup
            PhraseTable.addUNKRule(
                p_word,
                RuleItem.initUNKRule(p_word, unk_featVec, unk_score, unk_lm_heu))

        # Known (X -> <w_i, w_t>) or unknown (X -> <w_i, w_i>) rules are now flushed to the chart
        self.__flush2Cell(
            (p_i, p_i), ('X', p_word), 0,
            self.__getRulesFromPT(
                p_word, (p_i, p_i)))  # Flush the entries to the cell
        #Parse.chartDict[(p_i, p_i)].printCell('X', self.sent_indx)

        # Add the glue rule S --> <X__1, X__1> in cell (0, 0)
        if p_i == 0:
            p_src = glueSrcLst[0]
            self.__getGlueRuleSpans((p_i, p_i), p_src)
            if consObjsLst:
                Parse.chartDict[(p_i, p_i)].has_S_tree = True
                self.__reduceCell(
                    (p_i, p_i), 'S', 'S', final_cell
                )  # Compute the n-best list from the parse forest
                # NOTE(review): nesting of the force-decode check under 'if consObjsLst' is reconstructed -- confirm
                if settings.opts.force_decode:
                    force_dec_status = Parse.chartDict[(
                        0, p_i)].forceDecodePrune(self.refsLst, final_cell)
                    # Only a failure in the final cell aborts the parse
                    if final_cell and not force_dec_status:
                        sys.stderr.write(
                            " INFO :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n"
                            % (p_i))
                        return 0
            #Parse.chartDict[(0, p_i)].printCell('S', self.sent_indx)

        p_i += 1

    # Phase-2: Filling the CKY table
    # Iterate through all possible spans of length 2 thro' M (maximum phrase length)
    # p_l is the span length minus one; p_i and p_j are the inclusive span boundaries
    for p_l in range(1, self.sent_len):
        for p_j in range(p_l, self.sent_len):
            p_i = p_j - p_l
            # print "\nSpan:", p_i, p_j, "\tSpan length:", p_l + 1
            # If the span length is greater than the 'maximum phrase length' skip to next iteration of p_l
            # (the span starting at 0 is still processed because it is needed for the glue rules)
            if p_l >= settings.opts.max_phr_len and p_i != 0:
                break

            Parse.chartDict[(p_i, p_j)] = Cell()
            p_cell_type = 'X'
            p_left_nt = 'X'
            if (p_i == 0 and p_j == self.sent_len - 1):
                final_cell = True
            if p_l < settings.opts.max_phr_len:
                self.__getRuleSpans(p_i, p_j,
                                    ' '.join(self.wordsLst[p_i:p_j + 1]))

            # NOTE(review): this check is placed as a sibling of the length test above (reconstructed) -- confirm
            if consObjsLst:
                self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt,
                                  final_cell)
            #Parse.chartDict[(p_i, p_j)].printCell('X', self.sent_indx)

            # For span beginning at '0' (top row in the parse triangle), add items of the form [S, i, j]:w to chart
            # Glue rules are: S --> (X__1, X__1) and S --> (S__1 X__2, S__1 X__2)
            # Sentence boundary markers <s> and </s> are added in Cube-Pruning step (lazyMerge_CP.py)
            if p_i == 0:
                p_cell_type = 'S'
                p_left_nt = 'S'
                for p_src in glueSrcLst:
                    self.__getGlueRuleSpans((p_i, p_j), p_src)

                if consObjsLst:
                    Parse.chartDict[(p_i, p_j)].has_S_tree = True
                    self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt,
                                      final_cell)
                    # NOTE(review): nesting of the force-decode check under 'if consObjsLst' is reconstructed -- confirm
                    if settings.opts.force_decode:
                        force_dec_status = Parse.chartDict[(
                            p_i, p_j)].forceDecodePrune(self.refsLst, final_cell)
                        # Only a failure in the final cell aborts the parse
                        if final_cell and not force_dec_status:
                            sys.stderr.write(
                                " INFO :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n"
                                % (p_j))
                            return 0
                #Parse.chartDict[(p_i, p_j)].printCell('S', self.sent_indx)

    # The final cell spans the whole sentence: (0, sent_len - 1)
    p_j = self.sent_len - 1
    if not Parse.chartDict[(0, p_j)].has_S_tree:
        return 99
    Parse.chartDict[(0, p_j)].printNBest(
        'S', self.sent_indx)  # Print the N-best derivations in the last cell
    if settings.opts.trace_rules > 0:
        #Parse.chartDict[(0, p_j)].trackRulesUsed('S') # Track the rules used in the top-k translations
        Parse.chartDict[(0, p_j)].printTrace(
            'S', self.sent
        )  # Prints the translation trace for the top-3 entries
    return 1
def loadRules(self):
    '''Loads the filtered rules and filters them further by using the Suffix Tree of test data.

    Reads settings.opts.ruleFile, one "src ||| tgt ||| probs" rule per line
    (Kriya phrase-table format); blank lines are skipped. A rule is dropped
    when
      * settings.opts.force_decode is set and PhraseTable.tgtMatchesRef(tgt)
        is false, or
      * settings.opts.one_nt_decode is set and the source side contains 'X__2'.

    Consecutive rules sharing a source side form one group. Each new source
    side is added to PhraseTable.src_trie. When a group ends, its entries are
    sorted by descending 'prob_e_f', the best settings.opts.ttl of them (all
    of them when ttl <= 0) are scored and stored in PhraseTable.ruleDict[src].
    Rules of one source side must therefore be adjacent in the file: a source
    side that reappears later overwrites its earlier ruleDict entry.

    Updates PhraseTable.tot_rule_pairs and writes loading statistics to
    stderr. Nothing is added to PhraseTable.ruleDict when no rule survives
    the filters.

    Raises ValueError if a non-blank line does not consist of exactly three
    ' ||| ' separated fields.
    '''

    PhraseTable.tot_rule_pairs = 0
    prev_src = ''
    uniq_src_rules = 0
    entriesLst = []

    def flushEntries(src_key):
        '''Stores the best entries collected for src_key in ruleDict and empties entriesLst'''
        entriesLst.sort(key=operator.attrgetter("prob_e_f"), reverse=True)
        kept_rules = []
        for tgt_options, trans_option in enumerate(entriesLst, 1):
            rule_obj = trans_option.rule
            rule_obj.scoreRule()            # Only the rules that are kept get scored
            kept_rules.append(rule_obj)
            if settings.opts.ttl > 0 and tgt_options >= settings.opts.ttl:
                break
        PhraseTable.ruleDict[src_key] = kept_rules
        del entriesLst[:]

    t_beg = time.time()
    # 'with' guarantees the file is closed even if a rule line is malformed
    with open(settings.opts.ruleFile, 'r') as rF:
        sys.stderr.write("Loading SCFG rules from file : %s\n" % (settings.opts.ruleFile))
        for line in rF:
            line = line.strip()
            if not line:
                continue                    # Blank line: nothing to parse

            (src, tgt, probs) = line.split(' ||| ')                       # For Kriya phrase table
            # (src, tgt, f_align, r_align, probs) = line.split(' ||| ')   # For Moses phrase table
            if settings.opts.force_decode and not PhraseTable.tgtMatchesRef(tgt):
                continue
            if settings.opts.one_nt_decode and 'X__2' in src:
                continue

            PhraseTable.tot_rule_pairs += 1
            if prev_src != src:
                # A new source side starts: register it and close the previous group
                uniq_src_rules += 1
                if PhraseTable.src_trie is None:
                    PhraseTable.src_trie = SimpleSuffixTree(src, settings.opts.fr_rule_terms)
                else:
                    PhraseTable.src_trie.addText(src)

                if prev_src:
                    flushEntries(prev_src)

            rule = RuleItem.initRule(src, tgt, probs)
            entriesLst.append(TransOption(rule.getScore4TTL(), rule))
            prev_src = src

        # Handle the last group; prev_src is still '' when no rule was loaded,
        # in which case no (empty-keyed) entry must be created
        if prev_src:
            flushEntries(prev_src)

    t_end = time.time()
    sys.stderr.write("Unique source rules found : %d\n" % (uniq_src_rules))
    sys.stderr.write("Total pairs of SCFG rules loaded : %d\n" % (PhraseTable.tot_rule_pairs))
    sys.stderr.write("Time taken for loading rules in dict and Trie : %1.3f sec\n\n" % (t_end - t_beg))

    return None