Example #1
0
    def loadGlueRules(self):
        '''Load the glue rules and their feature values into PhraseTable.ruleDict.

        Reads settings.opts.glueFile, expecting one rule per line in the form
        "src ||| tgt ||| glue_val".  Anything following a '#' on a line is
        treated as a comment; comment-only lines and blank lines are skipped.

        Every rule is scored and stored as the sole entry of
        PhraseTable.ruleDict[src], replacing whatever was stored under that
        source side before.  When settings.opts.no_glue_penalty is set,
        turnOffGlue() is called on the 'S__1 X__2' rule before it is scored.

        Raises:
            ValueError: if a rule line does not consist of exactly three
                ' ||| '-separated fields, or glue_val is not a valid float.
        '''

        # 'with' closes the file on every exit path, including exceptions.
        with open(settings.opts.glueFile, 'r') as gF:
            sys.stderr.write("Loading Glue rules from file     : %s\n" %
                             (settings.opts.glueFile))
            for line in gF:
                # Drop any trailing '#' comment.  Whatever is left is empty for
                # comment-only lines and for blank lines, neither of which
                # carries a rule.
                line = line.partition('#')[0].strip()
                if not line:
                    continue

                (src, tgt, glue_val) = line.split(' ||| ')
                rule_obj = RuleItem.initGlue(src, tgt, float(glue_val))
                if settings.opts.no_glue_penalty and src == 'S__1 X__2':
                    rule_obj.turnOffGlue()

                rule_obj.scoreRule()
                # A glue source side maps to exactly one rule.
                PhraseTable.ruleDict[src] = [rule_obj]
Example #2
0
    def loadGlueRules(self):
        '''Load the glue rules and their feature values into PhraseTable.ruleDict.

        Reads settings.opts.glueFile, expecting one rule per line in the form
        "src ||| tgt ||| glue_val".  Anything following a '#' on a line is
        treated as a comment; comment-only lines and blank lines are skipped.

        Every rule is scored and stored as the sole entry of
        PhraseTable.ruleDict[src], replacing whatever was stored under that
        source side before.  When settings.opts.no_glue_penalty is set,
        turnOffGlue() is called on the 'S__1 X__2' rule before it is scored.

        Raises:
            ValueError: if a rule line does not consist of exactly three
                ' ||| '-separated fields, or glue_val is not a valid float.
        '''

        # 'with' closes the file on every exit path, including exceptions.
        with open(settings.opts.glueFile, 'r') as gF:
            sys.stderr.write( "Loading Glue rules from file     : %s\n" % (settings.opts.glueFile) )
            for line in gF:
                # Drop any trailing '#' comment.  Whatever is left is empty for
                # comment-only lines and for blank lines, neither of which
                # carries a rule.
                line = line.partition('#')[0].strip()
                if not line: continue

                (src, tgt, glue_val) = line.split(' ||| ')
                rule_obj = RuleItem.initGlue(src, tgt, float(glue_val))
                if settings.opts.no_glue_penalty and src == 'S__1 X__2':
                    rule_obj.turnOffGlue()

                rule_obj.scoreRule()
                # A glue source side maps to exactly one rule.
                PhraseTable.ruleDict[src] = [rule_obj]
Example #3
0
    def parse(self):
        '''Fill the parse chart for the sentence in self.wordsLst and print its N-best list.

        Phase 1 creates one Cell per word at Parse.chartDict[(i, i)], adds an
        UNK rule to the phrase table for any word it does not know, flushes the
        word's rules into the cell and, for the first word only, applies the
        glue rule 'X__1' to build an 'S' entry.

        Phase 2 visits spans of increasing length.  Each span gets a fresh Cell
        that is filled with 'X' entries and, when the span starts at position 0,
        with 'S' entries produced by the glue rules.

        The module-level list consObjsLst is read after each helper call to
        decide whether the cell needs reducing (presumably the
        __getRuleSpans / __getGlueRuleSpans helpers populate it -- confirm).

        Returns:
            0  if force decoding is on and forceDecodePrune() reports failure
               for the final cell (0, sent_len - 1);
            99 if the final cell has no S tree;
            1  otherwise, after the N-best list (and, if
               settings.opts.trace_rules > 0, the trace) has been printed.
        '''

        global consObjsLst
        final_cell = False
        glueSrcLst = ['X__1', 'S__1 X__2']

        # Phase-1: Initialization
        # Fill the initial axioms in chartDict (keyed by (start, end) span) at the corresponding word positions
        p_i = 0
        for p_word in self.wordsLst:
#            print "Span:", p_i, p_i, "\tSpan length: 1"
            # A one-word sentence makes cell (0, 0) the final cell
            if ( p_i == 0 and self.sent_len == 1 ):
                final_cell = True
            Parse.chartDict[(p_i, p_i)] = Cell()

            # if the word is UNK; add it to ruleDict as: X -> <w_i, w_i> with default prob
            if not PhraseTable.hasRule(p_word):
                (unk_score, unk_lm_heu, unk_featVec) = FeatureManager.unkRuleTup
                PhraseTable.addUNKRule( p_word, RuleItem.initUNKRule(p_word, unk_featVec, unk_score, unk_lm_heu) )

            # Known (X -> <w_i, w_t>) or unknown (X -> <w_i, w_i>) rules are now flushed to the chart
            self.__flush2Cell( (p_i, p_i), ('X', p_word), 0, self.__getRulesFromPT(p_word, (p_i, p_i)) )     # Flush the entries to the cell
            #Parse.chartDict[(p_i, p_i)].printCell('X', self.sent_indx)

            # Add the glue rule S --> <X__1, X__1> in cell (0, 0)
            if p_i == 0:
                p_src = glueSrcLst[0]
                self.__getGlueRuleSpans((p_i, p_i), p_src)
                if consObjsLst:
                    Parse.chartDict[(p_i, p_i)].has_S_tree = True
                    self.__reduceCell((p_i, p_i), 'S', 'S', final_cell)   # Compute the n-best list from the parse forest
                    # Unlike Phase-2, the force-decode check here only runs when consObjsLst is non-empty
                    if settings.opts.force_decode:
                        force_dec_status = Parse.chartDict[(0, p_i)].forceDecodePrune(self.refsLst, final_cell)
                        if final_cell and not force_dec_status:
                            sys.stderr.write("           INFO  :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n" % (p_i))
                            return 0
                    #Parse.chartDict[(0, p_i)].printCell('S', self.sent_indx)

            p_i += 1

        # Phase-2: Filling the CKY table
        # p_l is (span length - 1); spans of every length up to the sentence length are visited, but
        # spans at or beyond the maximum phrase length are only built for the top row (p_i == 0)
        for p_l in range(1, self.sent_len):
            for p_j in range(p_l, self.sent_len):
                p_i = p_j - p_l
#                print "\nSpan:", p_i, p_j, "\tSpan length:", p_l + 1
                # Once p_l reaches the 'maximum phrase length' only the span starting at 0 (the first
                # iteration of this loop) is processed; the break then moves on to the next p_l
                if p_l >= settings.opts.max_phr_len and p_i != 0: break

                Parse.chartDict[(p_i, p_j)] = Cell()
                p_cell_type = 'X'
                p_left_nt = 'X'
                # The span covering the whole sentence is the final cell
                if ( p_i == 0 and p_j == self.sent_len - 1 ):
                    final_cell = True
                # Phrase-table rules are only looked up for spans shorter than the maximum phrase length
                if p_l < settings.opts.max_phr_len:
                    self.__getRuleSpans( p_i, p_j, ' '.join(self.wordsLst[p_i:p_j+1]) )

                if consObjsLst:
                    self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt, final_cell)
                    #Parse.chartDict[(p_i, p_j)].printCell('X', self.sent_indx)

                # For span beginning at '0' (top row in the parse triangle), add items of the form [S, i, j]:w to chart
                # Glue rules are: S --> (X__1, X__1) and S --> (S__1 X__2, S__1 X__2)
                # Sentence boundary markers <s> and </s> are added in Cube-Pruning step (lazyMerge_CP.py)
                if p_i == 0:
                    p_cell_type = 'S'
                    p_left_nt = 'S'
                    for p_src in glueSrcLst: self.__getGlueRuleSpans((p_i, p_j), p_src)

                    if consObjsLst:
                        Parse.chartDict[(p_i, p_j)].has_S_tree = True
                        self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt, final_cell)
                    if settings.opts.force_decode:
                        force_dec_status = Parse.chartDict[(p_i, p_j)].forceDecodePrune(self.refsLst, final_cell)
                        # A failed prune only aborts the parse when it happens in the final cell
                        if final_cell and not force_dec_status:
                            sys.stderr.write("           INFO  :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n" % (p_j))
                            return 0
                    #Parse.chartDict[(p_i, p_j)].printCell('S', self.sent_indx)

        # The cell spanning the whole sentence must hold an S tree, otherwise report failure (99)
        p_j = self.sent_len - 1
        if not Parse.chartDict[(0, p_j)].has_S_tree:
            return 99
        Parse.chartDict[(0, p_j)].printNBest('S', self.sent_indx)       # Print the N-best derivations in the last cell
        if settings.opts.trace_rules > 0:
            #Parse.chartDict[(0, p_j)].trackRulesUsed('S')               # Track the rules used in the top-k translations
            Parse.chartDict[(0, p_j)].printTrace('S', self.sent)        # Prints the translation trace for the top-3 entries

        return 1
Example #4
0
    def loadRules(self):
        '''Load the SCFG rules into PhraseTable.ruleDict and PhraseTable.src_trie.

        Reads settings.opts.ruleFile, expecting one rule per line in the Kriya
        phrase-table form "src ||| tgt ||| probs".  Blank lines are skipped.
        A rule is also skipped when
          * settings.opts.force_decode is set and PhraseTable.tgtMatchesRef(tgt)
            is false, or
          * settings.opts.one_nt_decode is set and the source contains 'X__2'.

        Consecutive lines with the same source side form one group.  Each new
        source side is added to PhraseTable.src_trie (created on first use).
        When a group ends, its options are sorted by descending prob_e_f,
        scored, and the best settings.opts.ttl of them (all of them when
        ttl <= 0) are stored in PhraseTable.ruleDict[src].  Because groups are
        detected by comparing with the previous line only, a source side that
        reappears later in the file replaces its earlier ruleDict entry.

        Loading statistics are written to stderr even if loading fails.

        Raises:
            ValueError: if a non-blank line does not consist of exactly three
                ' ||| '-separated fields.
        '''

        PhraseTable.tot_rule_pairs = 0
        prev_src = ''
        uniq_src_rules = 0
        entriesLst = []

        def storeEntries(src_key):
            '''Score the best buffered options, store them under src_key and empty the buffer.'''
            entriesLst.sort(key=operator.attrgetter("prob_e_f"), reverse=True)
            rulesLst = []
            PhraseTable.ruleDict[src_key] = rulesLst
            for tgt_options, trans_option in enumerate(entriesLst, 1):
                rule_obj = trans_option.rule
                rule_obj.scoreRule()
                rulesLst.append(rule_obj)
                # ttl <= 0 means "no limit on the number of target options"
                if 0 < settings.opts.ttl <= tgt_options:
                    break
            del entriesLst[:]

        t_beg = time.time()
        # 'with' closes the file on every exit path, including exceptions.
        with open(settings.opts.ruleFile, 'r') as rF:
            sys.stderr.write("Loading SCFG rules from file     : %s\n" %
                             (settings.opts.ruleFile))
            try:
                for line in rF:
                    line = line.strip()
                    if not line:
                        continue  # A blank line carries no rule
                    # Kriya phrase table format.  A Moses phrase table would be:
                    #   (src, tgt, f_align, r_align, probs) = line.split(' ||| ')
                    (src, tgt, probs) = line.split(' ||| ')

                    if (settings.opts.force_decode
                            and not PhraseTable.tgtMatchesRef(tgt)):
                        continue
                    if settings.opts.one_nt_decode and 'X__2' in src:
                        continue
                    PhraseTable.tot_rule_pairs += 1

                    if prev_src != src:
                        # A new source side starts: register it in the trie and
                        # store the group collected for the previous one.
                        uniq_src_rules += 1
                        if PhraseTable.src_trie is None:
                            PhraseTable.src_trie = SimpleSuffixTree(
                                src, settings.opts.fr_rule_terms)
                        else:
                            PhraseTable.src_trie.addText(src)

                        if prev_src:
                            storeEntries(prev_src)

                    rule = RuleItem.initRule(src, tgt, probs)
                    entriesLst.append(TransOption(rule.getScore4TTL(), rule))
                    prev_src = src

                # Store the last group.  prev_src is still '' when no rule was
                # accepted (empty file or everything filtered out); nothing must
                # be stored under the empty source in that case.
                if prev_src:
                    storeEntries(prev_src)

            finally:
                t_end = time.time()
                sys.stderr.write(
                    "Unique source rules found                     : %d\n" %
                    (uniq_src_rules))
                sys.stderr.write(
                    "Total pairs of SCFG rules loaded              : %d\n" %
                    (PhraseTable.tot_rule_pairs))
                sys.stderr.write(
                    "Time taken for loading rules in dict and Trie : %1.3f sec\n\n"
                    % (t_end - t_beg))

        return None
Example #5
0
    def parse(self):
        '''Fill the parse chart for the sentence in self.wordsLst and print its N-best list.

        Phase 1 creates one Cell per word at Parse.chartDict[(i, i)], adds an
        UNK rule to the phrase table for any word it does not know, flushes the
        word's rules into the cell and, for the first word only, applies the
        glue rule 'X__1' to build an 'S' entry.

        Phase 2 visits spans of increasing length.  Each span gets a fresh Cell
        that is filled with 'X' entries and, when the span starts at position 0,
        with 'S' entries produced by the glue rules.

        The module-level list consObjsLst is read after each helper call to
        decide whether the cell needs reducing (presumably the
        __getRuleSpans / __getGlueRuleSpans helpers populate it -- confirm).

        Returns:
            0  if force decoding is on and forceDecodePrune() reports failure
               for the final cell (0, sent_len - 1);
            99 if the final cell has no S tree;
            1  otherwise, after the N-best list (and, if
               settings.opts.trace_rules > 0, the trace) has been printed.
        '''

        global consObjsLst
        final_cell = False
        glueSrcLst = ['X__1', 'S__1 X__2']

        # Phase-1: Initialization
        # Fill the initial axioms in chartDict (keyed by (start, end) span) at the corresponding word positions
        p_i = 0
        for p_word in self.wordsLst:
            #            print "Span:", p_i, p_i, "\tSpan length: 1"
            # A one-word sentence makes cell (0, 0) the final cell
            if (p_i == 0 and self.sent_len == 1):
                final_cell = True
            Parse.chartDict[(p_i, p_i)] = Cell()

            # if the word is UNK; add it to ruleDict as: X -> <w_i, w_i> with default prob
            if not PhraseTable.hasRule(p_word):
                (unk_score, unk_lm_heu,
                 unk_featVec) = FeatureManager.unkRuleTup
                PhraseTable.addUNKRule(
                    p_word,
                    RuleItem.initUNKRule(p_word, unk_featVec, unk_score,
                                         unk_lm_heu))

            # Known (X -> <w_i, w_t>) or unknown (X -> <w_i, w_i>) rules are now flushed to the chart
            self.__flush2Cell(
                (p_i, p_i), ('X', p_word), 0,
                self.__getRulesFromPT(
                    p_word, (p_i, p_i)))  # Flush the entries to the cell
            #Parse.chartDict[(p_i, p_i)].printCell('X', self.sent_indx)

            # Add the glue rule S --> <X__1, X__1> in cell (0, 0)
            if p_i == 0:
                p_src = glueSrcLst[0]
                self.__getGlueRuleSpans((p_i, p_i), p_src)
                if consObjsLst:
                    Parse.chartDict[(p_i, p_i)].has_S_tree = True
                    self.__reduceCell(
                        (p_i, p_i), 'S', 'S', final_cell
                    )  # Compute the n-best list from the parse forest
                    # Unlike Phase-2, the force-decode check here only runs when consObjsLst is non-empty
                    if settings.opts.force_decode:
                        force_dec_status = Parse.chartDict[(
                            0, p_i)].forceDecodePrune(self.refsLst, final_cell)
                        if final_cell and not force_dec_status:
                            sys.stderr.write(
                                "           INFO  :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n"
                                % (p_i))
                            return 0
                    #Parse.chartDict[(0, p_i)].printCell('S', self.sent_indx)

            p_i += 1

        # Phase-2: Filling the CKY table
        # p_l is (span length - 1); spans of every length up to the sentence length are visited, but
        # spans at or beyond the maximum phrase length are only built for the top row (p_i == 0)
        for p_l in range(1, self.sent_len):
            for p_j in range(p_l, self.sent_len):
                p_i = p_j - p_l
                #                print "\nSpan:", p_i, p_j, "\tSpan length:", p_l + 1
                # Once p_l reaches the 'maximum phrase length' only the span starting at 0 (the first
                # iteration of this loop) is processed; the break then moves on to the next p_l
                if p_l >= settings.opts.max_phr_len and p_i != 0: break

                Parse.chartDict[(p_i, p_j)] = Cell()
                p_cell_type = 'X'
                p_left_nt = 'X'
                # The span covering the whole sentence is the final cell
                if (p_i == 0 and p_j == self.sent_len - 1):
                    final_cell = True
                # Phrase-table rules are only looked up for spans shorter than the maximum phrase length
                if p_l < settings.opts.max_phr_len:
                    self.__getRuleSpans(p_i, p_j,
                                        ' '.join(self.wordsLst[p_i:p_j + 1]))

                if consObjsLst:
                    self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt,
                                      final_cell)
                    #Parse.chartDict[(p_i, p_j)].printCell('X', self.sent_indx)

                # For span beginning at '0' (top row in the parse triangle), add items of the form [S, i, j]:w to chart
                # Glue rules are: S --> (X__1, X__1) and S --> (S__1 X__2, S__1 X__2)
                # Sentence boundary markers <s> and </s> are added in Cube-Pruning step (lazyMerge_CP.py)
                if p_i == 0:
                    p_cell_type = 'S'
                    p_left_nt = 'S'
                    for p_src in glueSrcLst:
                        self.__getGlueRuleSpans((p_i, p_j), p_src)

                    if consObjsLst:
                        Parse.chartDict[(p_i, p_j)].has_S_tree = True
                        self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt,
                                          final_cell)
                    if settings.opts.force_decode:
                        force_dec_status = Parse.chartDict[(
                            p_i,
                            p_j)].forceDecodePrune(self.refsLst, final_cell)
                        # A failed prune only aborts the parse when it happens in the final cell
                        if final_cell and not force_dec_status:
                            sys.stderr.write(
                                "           INFO  :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n"
                                % (p_j))
                            return 0
                    #Parse.chartDict[(p_i, p_j)].printCell('S', self.sent_indx)

        # The cell spanning the whole sentence must hold an S tree, otherwise report failure (99)
        p_j = self.sent_len - 1
        if not Parse.chartDict[(0, p_j)].has_S_tree:
            return 99
        Parse.chartDict[(0, p_j)].printNBest(
            'S',
            self.sent_indx)  # Print the N-best derivations in the last cell
        if settings.opts.trace_rules > 0:
            #Parse.chartDict[(0, p_j)].trackRulesUsed('S')               # Track the rules used in the top-k translations
            Parse.chartDict[(0, p_j)].printTrace(
                'S', self.sent
            )  # Prints the translation trace for the top-3 entries

        return 1
Example #6
0
    def loadRules(self):
        '''Load the SCFG rules into PhraseTable.ruleDict and PhraseTable.src_trie.

        Reads settings.opts.ruleFile, expecting one rule per line in the Kriya
        phrase-table form "src ||| tgt ||| probs".  Blank lines are skipped.
        A rule is also skipped when
          * settings.opts.force_decode is set and PhraseTable.tgtMatchesRef(tgt)
            is false, or
          * settings.opts.one_nt_decode is set and the source contains 'X__2'.

        Consecutive lines with the same source side form one group.  Each new
        source side is added to PhraseTable.src_trie (created on first use).
        When a group ends, its options are sorted by descending prob_e_f,
        scored, and the best settings.opts.ttl of them (all of them when
        ttl <= 0) are stored in PhraseTable.ruleDict[src].  Because groups are
        detected by comparing with the previous line only, a source side that
        reappears later in the file replaces its earlier ruleDict entry.

        Loading statistics are written to stderr even if loading fails.

        Raises:
            ValueError: if a non-blank line does not consist of exactly three
                ' ||| '-separated fields.
        '''

        PhraseTable.tot_rule_pairs = 0
        prev_src = ''
        uniq_src_rules = 0
        entriesLst = []

        def storeEntries(src_key):
            '''Score the best buffered options, store them under src_key and empty the buffer.'''
            entriesLst.sort(key=operator.attrgetter("prob_e_f"), reverse=True)
            rulesLst = []
            PhraseTable.ruleDict[src_key] = rulesLst
            for tgt_options, trans_option in enumerate(entriesLst, 1):
                rule_obj = trans_option.rule
                rule_obj.scoreRule()
                rulesLst.append( rule_obj )
                # ttl <= 0 means "no limit on the number of target options"
                if 0 < settings.opts.ttl <= tgt_options: break
            del entriesLst[:]

        t_beg = time.time()
        # 'with' closes the file on every exit path, including exceptions.
        with open(settings.opts.ruleFile, 'r') as rF:
            sys.stderr.write( "Loading SCFG rules from file     : %s\n" % (settings.opts.ruleFile) )
            try:
                for line in rF:
                    line = line.strip()
                    if not line: continue                                         # A blank line carries no rule
                    # Kriya phrase table format.  A Moses phrase table would be:
                    #   (src, tgt, f_align, r_align, probs) = line.split(' ||| ')
                    (src, tgt, probs) = line.split(' ||| ')

                    if settings.opts.force_decode and not PhraseTable.tgtMatchesRef(tgt): continue
                    if settings.opts.one_nt_decode and 'X__2' in src: continue
                    PhraseTable.tot_rule_pairs += 1

                    if prev_src != src:
                        # A new source side starts: register it in the trie and
                        # store the group collected for the previous one.
                        uniq_src_rules += 1
                        if PhraseTable.src_trie is None:
                            PhraseTable.src_trie = SimpleSuffixTree(src, settings.opts.fr_rule_terms)
                        else:
                            PhraseTable.src_trie.addText(src)

                        if prev_src:
                            storeEntries(prev_src)

                    rule = RuleItem.initRule(src, tgt, probs)
                    entriesLst.append( TransOption(rule.getScore4TTL(), rule) )
                    prev_src = src

                # Store the last group.  prev_src is still '' when no rule was
                # accepted (empty file or everything filtered out); nothing must
                # be stored under the empty source in that case.
                if prev_src:
                    storeEntries(prev_src)

            finally:
                t_end = time.time()
                sys.stderr.write( "Unique source rules found                     : %d\n" % (uniq_src_rules) )
                sys.stderr.write( "Total pairs of SCFG rules loaded              : %d\n" % (PhraseTable.tot_rule_pairs) )
                sys.stderr.write( "Time taken for loading rules in dict and Trie : %1.3f sec\n\n" % (t_end - t_beg) )

        return None