Beispiel #1
0
def consolidateRules(cntsFile):
    '''Sum the counts of identical rules in cntsFile and rewrite the file in place.

    Every input line must have the form "src ||| tgt ||| count". The counts of
    lines sharing the same "src ||| tgt" pair are added up, usage statistics
    are written to stderr, and cntsFile is then overwritten with one
    "src ||| tgt ||| count" line per unique rule, sorted by ascending count.

    Raises ValueError if a line does not split into exactly three fields or
    if its count field is not an integer.
    '''

    sys.stderr.write("Consolidating rules from file : %s ...\n" % (cntsFile))
    rulesUsedDict = {}
    with open(cntsFile, 'r') as rC:
        for line in rC:
            (src, tgt, cnts) = line.split(" ||| ")
            rule = src + " ||| " + tgt
            # dict.get() replaces the Python 2-only has_key() test
            rulesUsedDict[rule] = rulesUsedDict.get(rule, 0) + int(cnts)

    tot_used_rules = len(rulesUsedDict)
    tot_PT_rules = PhraseTable.getTotalRules()
    # Report 0% instead of raising ZeroDivisionError when the phrase table
    # reports no rules; the consolidated file is still written below.
    if tot_PT_rules:
        pct_used = (float(tot_used_rules) * 100.0) / float(tot_PT_rules)
    else:
        pct_used = 0.0
    sys.stderr.write("Total SCFG rules found for the set           : %g\n" %
                     (tot_PT_rules))
    sys.stderr.write("# of unique rules used in N-best derivations : %g\n" %
                     (tot_used_rules))
    sys.stderr.write("%% of rules used in the N-best list           : %g\n" %
                     (pct_used))

    # The context manager closes the output file even if a write fails
    with open(cntsFile, 'w') as wC:
        for rule, r_cnt in sorted(rulesUsedDict.items(),
                                  key=operator.itemgetter(1)):
            wC.write("%s ||| %d\n" % (rule, r_cnt))
Beispiel #2
0
def consolidateRules(cntsFile):
    """Collapse duplicate rules in cntsFile, summing their counts, and rewrite it.

    Each input line is expected to look like "src ||| tgt ||| count". Lines
    with the same "src ||| tgt" pair are merged by adding their counts. Usage
    statistics go to stderr and the file is overwritten with the merged rules
    in ascending order of count.

    Raises ValueError when a line does not have exactly three " ||| "-separated
    fields or when the count field is not an integer.
    """

    sys.stderr.write("Consolidating rules from file : %s ...\n" % (cntsFile))
    rulesUsedDict = {}
    with open(cntsFile, "r") as rC:
        for line in rC:
            (src, tgt, cnts) = line.split(" ||| ")
            rule = src + " ||| " + tgt
            # Membership test with `in` works on Python 2 and 3 (has_key() does not)
            if rule in rulesUsedDict:
                rulesUsedDict[rule] += int(cnts)
            else:
                rulesUsedDict[rule] = int(cnts)

    tot_used_rules = len(rulesUsedDict)
    tot_PT_rules = PhraseTable.getTotalRules()
    # Avoid ZeroDivisionError when the phrase table reports zero rules
    pct_used = (float(tot_used_rules) * 100.0) / float(tot_PT_rules) if tot_PT_rules else 0.0
    sys.stderr.write("Total SCFG rules found for the set           : %g\n" % (tot_PT_rules))
    sys.stderr.write("# of unique rules used in N-best derivations : %g\n" % (tot_used_rules))
    sys.stderr.write("%% of rules used in the N-best list           : %g\n" % (pct_used))

    # `with` guarantees the output handle is closed even when a write raises
    with open(cntsFile, "w") as wC:
        for rule, r_cnt in sorted(rulesUsedDict.items(), key=operator.itemgetter(1)):
            wC.write("%s ||| %d\n" % (rule, r_cnt))
Beispiel #3
0
    def __getRulesFromPT(self, s_rule, span):
        '''Return a list with one Hypothesis-created entry per phrase-table entry of s_rule.

        The entries are fetched with PhraseTable.getRuleEntries(s_rule, self.sent_indx) and
        each one is passed, together with span, to Hypothesis.createFromRule(). The order of
        the returned list follows the order of the fetched entries.
        '''

        return [Hypothesis.createFromRule(pt_entry, span)
                for pt_entry in PhraseTable.getRuleEntries(s_rule, self.sent_indx)]
Beispiel #4
0
    def __getRulesFromPT(self, s_rule, span):
        '''Wrap every phrase-table entry of the source rule into a new entry object.

        s_rule is looked up (for sentence self.sent_indx) through PhraseTable.getRuleEntries();
        Hypothesis.createFromRule() is then called once per returned item with the given span.
        The created objects are returned as a list, in the order the items were returned.
        '''

        rule_items = PhraseTable.getRuleEntries(s_rule, self.sent_indx)
        return [Hypothesis.createFromRule(rule_item, span) for rule_item in rule_items]
Beispiel #5
0
def main():
    """Initialise the reference phrases (force-decode only) and the phrase table, then parse.

    The value settings.opts.sentindex * settings.opts.sent_per_file is passed
    both to RefPhrases (when force decoding is enabled) and to readNParse.
    """

    global refFiles
    opts = settings.opts
    sent_cnt = opts.sentindex * opts.sent_per_file

    # Reference files are only collected and loaded in force-decode mode
    if opts.force_decode:
        getRefFiles()
        RefPhrases(sent_cnt, refFiles)

    PhraseTable()
    readNParse(sent_cnt)  # Parse the sentences
Beispiel #6
0
    def __getRuleSpans(self, i, j, span_phrase):
        '''Append to consObjsLst a ConsequentRule for each rule consistent with span_phrase.

        PhraseTable.findConsistentRules(span_phrase) yields (rule, offsets) matches, where
        offsets holds 0, 2 or 4 positions relative to i (no, one or two non-terminal sub-spans).
        Matches whose sub-span cells lack an X tree are skipped. In shallow-hiero mode without
        relaxed decoding one ConsequentRule is added per permitted X level; otherwise a single
        level-0 ConsequentRule is added. The argument j is not used here.
        '''

        global consObjsLst

        for match in PhraseTable.findConsistentRules(span_phrase):
            rule, nt_offsets = match[0], match[1]
            n_offsets = len(nt_offsets)

            ## Terminal rule
            if n_offsets == 0:
                consObjsLst.append( ConsequentRule(rule) )

            ## Hierarchical rule with 1 NT
            elif n_offsets == 2:
                span1 = (nt_offsets[0]+i, nt_offsets[1]+i)
                if not Parse.chartDict[span1].has_X_tree:
                    continue

                if settings.opts.shallow_hiero and not self.relaxed_decoding:
                    # Shallow-n hiero: one consequent per X level flagged in the sub-span cell
                    level_flags = Parse.chartDict[span1].getXLevelStats(self.sh_order)
                    for x_level, level_ok in enumerate(level_flags):
                        if level_ok:
                            consObjsLst.append( ConsequentRule(rule, x_level, span1, (), x_level) )
                else:
                    # Full-hiero / relaxed decoding
                    consObjsLst.append( ConsequentRule(rule, 0, span1) )

            ## Hierarchical rule with 2 NTs
            elif n_offsets == 4:
                span1 = (nt_offsets[0]+i, nt_offsets[1]+i)
                span2 = (nt_offsets[2]+i, nt_offsets[3]+i)
                if not (Parse.chartDict[span1].has_X_tree and Parse.chartDict[span2].has_X_tree):
                    continue

                if settings.opts.shallow_hiero and not self.relaxed_decoding:
                    # Shallow-n hiero
                    X1Levels = Parse.chartDict[span1].getXLevelStats(self.sh_order)
                    X2Levels = Parse.chartDict[span2].getXLevelStats(self.sh_order)

                    # Pair the top level of X2 with every flagged X1 level not above it
                    top_x2_level = len(X2Levels) - 1
                    if X2Levels[top_x2_level]:
                        for x1_level, x1_ok in enumerate(X1Levels):
                            if x1_ok and x1_level <= top_x2_level:
                                consObjsLst.append( ConsequentRule(rule, top_x2_level, span1, span2, x1_level, top_x2_level) )

                    # Pair the top level of X1 with every flagged X2 level not above it
                    # NOTE(review): when both top levels are flagged and equal, both loops append a
                    # ConsequentRule with identical arguments -- confirm the duplicate is intended.
                    top_x1_level = len(X1Levels) - 1
                    if X1Levels[top_x1_level]:
                        for x2_level, x2_ok in enumerate(X2Levels):
                            if x2_ok and x2_level <= top_x1_level:
                                consObjsLst.append( ConsequentRule(rule, top_x1_level, span1, span2, top_x1_level, x2_level) )
                else:
                    # Full-hiero / relaxed decoding
                    consObjsLst.append( ConsequentRule(rule, 0, span1, span2) )
Beispiel #7
0
    def __reduceCell(self, span, cell_type, rule_nt, final_cell):
        '''Build one cube per usable consequent rule, merge them and flush the result to the cell.

        Iterates over the global consObjsLst in order. Rules for which the phrase table returns
        no entries are skipped and do not consume a cube index. The merged products are handed
        to __flush2Cell() and consObjsLst is emptied in place before returning.
        '''

        global consObjsLst  # Consequent Rules are to be processed in order: check 'X' rules and then 'S' rules

        # Source side of the span
        src_side = ' '.join(self.wordsLst[span[0]:span[1] + 1])
        merge_obj = Lazy(self.sent_indx, span, cell_type, final_cell)
        cube_indx = 0
        cell_max_X_depth = 0
        for conseq_obj in consObjsLst:
            rule = conseq_obj.rule
            if conseq_obj.spanTup and cell_type != 'S':
                cube_depth_hier = conseq_obj.top_X_level + 1
            else:
                cube_depth_hier = 0

            ruleRHSLst = PhraseTable.getRuleEntries(rule, self.sent_indx)
            if not ruleRHSLst:
                continue

            # Track the maximum depth seen for an 'X' cell
            if cell_type == 'X':
                cell_max_X_depth = max(cell_max_X_depth, cube_depth_hier)

            # Source-side rule and spans for this cube, then the consequent
            # entries as the cube's first dimension
            merge_obj.setSourceInfo(cube_indx, rule, conseq_obj.spanTup,
                                    cube_depth_hier, self.refsLst)
            merge_obj.add2Cube(cube_indx, ruleRHSLst)

            # Rules containing a non-terminal contribute one further dimension
            # per non-terminal, taken from the cell of the matching sub-span
            if 'X__1' in rule or rule.startswith('S__1'):
                nt_sides = ['X' if rterm.startswith('X__') else 'S'
                            for rterm in rule.split()
                            if rterm.startswith(('X__', 'S__'))]
                for s_indx, left_side in enumerate(nt_sides):
                    s_span = conseq_obj.spanTup[s_indx]
                    if s_indx == 0:
                        s_depth = conseq_obj.depth1
                    else:
                        s_depth = conseq_obj.depth2
                    antecedents = Parse.chartDict[s_span].getTupLst4NT(
                        left_side, s_depth)
                    merge_obj.add2Cube(cube_indx, antecedents)
            cube_indx += 1

        tgtLst = merge_obj.mergeProducts()
        # Flush the entries to the cell
        self.__flush2Cell(span, (rule_nt, src_side), cell_max_X_depth, tgtLst)
        merge_obj = ''  # Important: rebinding drops this method's reference to the Lazy object
        del consObjsLst[:]
Beispiel #8
0
    def __reduceCell(self, span, cell_type, rule_nt, final_cell):
        '''Merge the products of all pending consequent rules and store them in the span's cell.

        For every entry of the global consObjsLst that has phrase-table entries, a cube is set up
        in a Lazy merge object: the rule entries form the first dimension and each non-terminal
        of the rule adds the tuples of its sub-span cell. The merged result is passed to
        __flush2Cell(); consObjsLst is cleared in place at the end.
        '''

        global consObjsLst          # Consequent Rules are to be processed in order: check 'X' rules and then 'S' rules

        first, last = span[0], span[1]
        src_side = ' '.join( self.wordsLst[first:last+1] )     # Source side of the span
        merge_obj = Lazy(self.sent_indx, span, cell_type, final_cell)
        cube_indx = 0
        cell_max_X_depth = 0
        for conseq_obj in consObjsLst:
            rule = conseq_obj.rule
            cube_depth_hier = 0
            if conseq_obj.spanTup and cell_type != 'S':
                cube_depth_hier = conseq_obj.top_X_level + 1

            ruleRHSLst = PhraseTable.getRuleEntries(rule, self.sent_indx)
            if not ruleRHSLst:
                continue        # no cube is created, so cube_indx is not advanced

            # Remember the deepest cube of an 'X' cell
            if cell_type == 'X' and cell_max_X_depth < cube_depth_hier:
                cell_max_X_depth = cube_depth_hier

            # Register the source rule and spans of this cube; the consequent entries are its first dimension
            merge_obj.setSourceInfo( cube_indx, rule, conseq_obj.spanTup, cube_depth_hier, self.refsLst )
            merge_obj.add2Cube( cube_indx, ruleRHSLst )

            # Rules with a non-terminal: add the antecedent items of each sub-span
            has_nt = 'X__1' in rule or rule.startswith('S__1')
            if has_nt:
                s_indx = 0
                for rterm in rule.split():
                    if rterm.startswith('X__'):
                        left_side = 'X'
                    elif rterm.startswith('S__'):
                        left_side = 'S'
                    else:
                        continue

                    # The first non-terminal uses depth1, any later one depth2
                    s_depth = conseq_obj.depth2 if s_indx != 0 else conseq_obj.depth1
                    sub_cell = Parse.chartDict[conseq_obj.spanTup[s_indx]]
                    merge_obj.add2Cube( cube_indx, sub_cell.getTupLst4NT(left_side, s_depth) )
                    s_indx += 1
            cube_indx += 1

        tgtLst = merge_obj.mergeProducts()
        self.__flush2Cell( span, (rule_nt, src_side), cell_max_X_depth, tgtLst)   # Flush the entries to the cell
        merge_obj = ''  # Important: rebinding releases this method's reference to the Lazy object
        del consObjsLst[:]
Beispiel #9
0
    def parse(self):
        '''Parse the sentence in self.wordsLst and print the N-best derivations of the final cell.

        Phase 1 creates a Cell for every single-word span (adding an UNK rule to the phrase
        table for words it has no rule for) and applies the first glue rule in cell (0, 0).
        Phase 2 fills the cells of longer spans: 'X' rules are reduced first and, for spans
        starting at position 0, the glue ('S') rules afterwards.

        Returns:
            0  -- force-decode mode found no matching candidate in the final cell
            99 -- the final cell (0, sent_len - 1) has no S tree
            1  -- otherwise, after the N-best list (and, if enabled, the trace) was printed
        '''

        global consObjsLst
        final_cell = False
        # Source sides of the two glue rules; see the glue-rule comments below
        glueSrcLst = ['X__1', 'S__1 X__2']

        # Phase-1: Initialization
        # Fill the initial axioms in the chartDict (Dict of dict) in corresponding word positions
        p_i = 0
        for p_word in self.wordsLst:
#            print "Span:", p_i, p_i, "\tSpan length: 1"
            # A one-word sentence makes cell (0, 0) the final cell
            if ( p_i == 0 and self.sent_len == 1 ):
                final_cell = True
            Parse.chartDict[(p_i, p_i)] = Cell()

            # if the word is UNK; add it to ruleDict as: X -> <w_i, w_i> with default prob
            if not PhraseTable.hasRule(p_word):
                (unk_score, unk_lm_heu, unk_featVec) = FeatureManager.unkRuleTup
                PhraseTable.addUNKRule( p_word, RuleItem.initUNKRule(p_word, unk_featVec, unk_score, unk_lm_heu) )

            # Known (X -> <w_i, w_t>) or unknown (X -> <w_i, w_i>) rules are now flushed to the chart
            self.__flush2Cell( (p_i, p_i), ('X', p_word), 0, self.__getRulesFromPT(p_word, (p_i, p_i)) )     # Flush the entries to the cell
            #Parse.chartDict[(p_i, p_i)].printCell('X', self.sent_indx)

            # Add the glue rule S --> <X__1, X__1> in cell (0, 0)
            if p_i == 0:
                p_src = glueSrcLst[0]
                self.__getGlueRuleSpans((p_i, p_i), p_src)
                if consObjsLst:
                    Parse.chartDict[(p_i, p_i)].has_S_tree = True
                    self.__reduceCell((p_i, p_i), 'S', 'S', final_cell)   # Compute the n-best list from the parse forest
                    if settings.opts.force_decode:
                        force_dec_status = Parse.chartDict[(0, p_i)].forceDecodePrune(self.refsLst, final_cell)
                        # Only a failure in the final cell aborts the parse
                        if final_cell and not force_dec_status:
                            sys.stderr.write("           INFO  :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n" % (p_i))
                            return 0
                    #Parse.chartDict[(0, p_i)].printCell('S', self.sent_indx)

            p_i += 1

        # Phase-2: Filling the CKY table
        # Iterate through all possible spans of length 2 thro' M (maximum phrase length)
        # p_l is the span length minus one; p_j is the last position of the span
        for p_l in range(1, self.sent_len):
            for p_j in range(p_l, self.sent_len):
                p_i = p_j - p_l
#                print "\nSpan:", p_i, p_j, "\tSpan length:", p_l + 1
                # If the span length is greater than the 'maximum phrase length' skip to next iteration of p_l
                # (spans starting at position 0 are still visited so that the glue rules below can apply)
                if p_l >= settings.opts.max_phr_len and p_i != 0: break

                Parse.chartDict[(p_i, p_j)] = Cell()
                p_cell_type = 'X'
                p_left_nt = 'X'
                if ( p_i == 0 and p_j == self.sent_len - 1 ):
                    final_cell = True
                # Phrase-table rules are only looked up for spans within the maximum phrase length
                if p_l < settings.opts.max_phr_len:
                    self.__getRuleSpans( p_i, p_j, ' '.join(self.wordsLst[p_i:p_j+1]) )

                if consObjsLst:
                    self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt, final_cell)
                    #Parse.chartDict[(p_i, p_j)].printCell('X', self.sent_indx)

                # For span beginning at '0' (top row in the parse triangle), add items of the form [S, i, j]:w to chart
                # Glue rules are: S --> (X__1, X__1) and S --> (S__1 X__2, S__1 X__2)
                # Sentence boundary markers <s> and </s> are added in Cube-Pruning step (lazyMerge_CP.py)
                if p_i == 0:
                    p_cell_type = 'S'
                    p_left_nt = 'S'
                    for p_src in glueSrcLst: self.__getGlueRuleSpans((p_i, p_j), p_src)

                    if consObjsLst:
                        Parse.chartDict[(p_i, p_j)].has_S_tree = True
                        self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt, final_cell)
                    # Unlike in Phase-1, pruning here runs even when no glue rule matched
                    if settings.opts.force_decode:
                        force_dec_status = Parse.chartDict[(p_i, p_j)].forceDecodePrune(self.refsLst, final_cell)
                        if final_cell and not force_dec_status:
                            sys.stderr.write("           INFO  :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n" % (p_j))
                            return 0
                    #Parse.chartDict[(p_i, p_j)].printCell('S', self.sent_indx)

        # The final cell spans the whole sentence
        p_j = self.sent_len - 1
        if not Parse.chartDict[(0, p_j)].has_S_tree:
            return 99
        Parse.chartDict[(0, p_j)].printNBest('S', self.sent_indx)       # Print the N-best derivations in the last cell
        if settings.opts.trace_rules > 0:
            #Parse.chartDict[(0, p_j)].trackRulesUsed('S')               # Track the rules used in the top-k translations
            Parse.chartDict[(0, p_j)].printTrace('S', self.sent)        # Prints the translation trace for the top-3 entries

        return 1
Beispiel #10
0
    def parse(self):
        '''Parse the sentence in self.wordsLst and print the N-best derivations of the final cell.

        Phase 1 creates a Cell for every single-word span (adding an UNK rule to the phrase
        table for words it has no rule for) and applies the first glue rule in cell (0, 0).
        Phase 2 fills the cells of longer spans: 'X' rules are reduced first and, for spans
        starting at position 0, the glue ('S') rules afterwards.

        Returns:
            0  -- force-decode mode found no matching candidate in the final cell
            99 -- the final cell (0, sent_len - 1) has no S tree
            1  -- otherwise, after the N-best list (and, if enabled, the trace) was printed
        '''

        global consObjsLst
        final_cell = False
        # Source sides of the two glue rules; see the glue-rule comments below
        glueSrcLst = ['X__1', 'S__1 X__2']

        # Phase-1: Initialization
        # Fill the initial axioms in the chartDict (Dict of dict) in corresponding word positions
        p_i = 0
        for p_word in self.wordsLst:
            #            print "Span:", p_i, p_i, "\tSpan length: 1"
            # A one-word sentence makes cell (0, 0) the final cell
            if (p_i == 0 and self.sent_len == 1):
                final_cell = True
            Parse.chartDict[(p_i, p_i)] = Cell()

            # if the word is UNK; add it to ruleDict as: X -> <w_i, w_i> with default prob
            if not PhraseTable.hasRule(p_word):
                (unk_score, unk_lm_heu,
                 unk_featVec) = FeatureManager.unkRuleTup
                PhraseTable.addUNKRule(
                    p_word,
                    RuleItem.initUNKRule(p_word, unk_featVec, unk_score,
                                         unk_lm_heu))

            # Known (X -> <w_i, w_t>) or unknown (X -> <w_i, w_i>) rules are now flushed to the chart
            self.__flush2Cell(
                (p_i, p_i), ('X', p_word), 0,
                self.__getRulesFromPT(
                    p_word, (p_i, p_i)))  # Flush the entries to the cell
            #Parse.chartDict[(p_i, p_i)].printCell('X', self.sent_indx)

            # Add the glue rule S --> <X__1, X__1> in cell (0, 0)
            if p_i == 0:
                p_src = glueSrcLst[0]
                self.__getGlueRuleSpans((p_i, p_i), p_src)
                if consObjsLst:
                    Parse.chartDict[(p_i, p_i)].has_S_tree = True
                    self.__reduceCell(
                        (p_i, p_i), 'S', 'S', final_cell
                    )  # Compute the n-best list from the parse forest
                    if settings.opts.force_decode:
                        force_dec_status = Parse.chartDict[(
                            0, p_i)].forceDecodePrune(self.refsLst, final_cell)
                        # Only a failure in the final cell aborts the parse
                        if final_cell and not force_dec_status:
                            sys.stderr.write(
                                "           INFO  :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n"
                                % (p_i))
                            return 0
                    #Parse.chartDict[(0, p_i)].printCell('S', self.sent_indx)

            p_i += 1

        # Phase-2: Filling the CKY table
        # Iterate through all possible spans of length 2 thro' M (maximum phrase length)
        # p_l is the span length minus one; p_j is the last position of the span
        for p_l in range(1, self.sent_len):
            for p_j in range(p_l, self.sent_len):
                p_i = p_j - p_l
                #                print "\nSpan:", p_i, p_j, "\tSpan length:", p_l + 1
                # If the span length is greater than the 'maximum phrase length' skip to next iteration of p_l
                # (spans starting at position 0 are still visited so that the glue rules below can apply)
                if p_l >= settings.opts.max_phr_len and p_i != 0: break

                Parse.chartDict[(p_i, p_j)] = Cell()
                p_cell_type = 'X'
                p_left_nt = 'X'
                if (p_i == 0 and p_j == self.sent_len - 1):
                    final_cell = True
                # Phrase-table rules are only looked up for spans within the maximum phrase length
                if p_l < settings.opts.max_phr_len:
                    self.__getRuleSpans(p_i, p_j,
                                        ' '.join(self.wordsLst[p_i:p_j + 1]))

                if consObjsLst:
                    self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt,
                                      final_cell)
                    #Parse.chartDict[(p_i, p_j)].printCell('X', self.sent_indx)

                # For span beginning at '0' (top row in the parse triangle), add items of the form [S, i, j]:w to chart
                # Glue rules are: S --> (X__1, X__1) and S --> (S__1 X__2, S__1 X__2)
                # Sentence boundary markers <s> and </s> are added in Cube-Pruning step (lazyMerge_CP.py)
                if p_i == 0:
                    p_cell_type = 'S'
                    p_left_nt = 'S'
                    for p_src in glueSrcLst:
                        self.__getGlueRuleSpans((p_i, p_j), p_src)

                    if consObjsLst:
                        Parse.chartDict[(p_i, p_j)].has_S_tree = True
                        self.__reduceCell((p_i, p_j), p_cell_type, p_left_nt,
                                          final_cell)
                    # Unlike in Phase-1, pruning here runs even when no glue rule matched
                    if settings.opts.force_decode:
                        force_dec_status = Parse.chartDict[(
                            p_i,
                            p_j)].forceDecodePrune(self.refsLst, final_cell)
                        if final_cell and not force_dec_status:
                            sys.stderr.write(
                                "           INFO  :: Force decode mode: No matching candidate found for cell (0, %d). Aborting!!\n"
                                % (p_j))
                            return 0
                    #Parse.chartDict[(p_i, p_j)].printCell('S', self.sent_indx)

        # The final cell spans the whole sentence
        p_j = self.sent_len - 1
        if not Parse.chartDict[(0, p_j)].has_S_tree:
            return 99
        Parse.chartDict[(0, p_j)].printNBest(
            'S',
            self.sent_indx)  # Print the N-best derivations in the last cell
        if settings.opts.trace_rules > 0:
            #Parse.chartDict[(0, p_j)].trackRulesUsed('S')               # Track the rules used in the top-k translations
            Parse.chartDict[(0, p_j)].printTrace(
                'S', self.sent
            )  # Prints the translation trace for the top-3 entries

        return 1
Beispiel #11
0
    def __getRuleSpans(self, i, j, span_phrase):
        '''Queue a ConsequentRule in consObjsLst for every rule that matches span_phrase.

        Each match from PhraseTable.findConsistentRules() is a (rule, offsets) pair; offsets
        has 0, 2 or 4 entries, which are shifted by i to form zero, one or two sub-spans.
        A match is dropped when one of its sub-span cells has no X tree. With shallow-hiero
        enabled and relaxed decoding off, one rule is queued per allowed X level; in all other
        cases a single rule at level 0 is queued. The parameter j is not read.
        '''

        global consObjsLst
        chart = Parse.chartDict

        for match in PhraseTable.findConsistentRules(span_phrase):
            rule = match[0]
            offsets = match[1]

            ## Terminal rule
            if len(offsets) == 0:
                consObjsLst.append(ConsequentRule(rule))
                continue

            ## Hierarchical rule with 1 NT
            if len(offsets) == 2:
                span1 = (offsets[0] + i, offsets[1] + i)
                if not chart[span1].has_X_tree:
                    continue

                if not settings.opts.shallow_hiero or self.relaxed_decoding:
                    # Full-hiero / relaxed decoding
                    consObjsLst.append(ConsequentRule(rule, 0, span1))
                else:
                    # Shallow-n hiero: one consequent per flagged X level
                    flags = chart[span1].getXLevelStats(self.sh_order)
                    for x_level, flagged in enumerate(flags):
                        if flagged:
                            consObjsLst.append(
                                ConsequentRule(rule, x_level, span1, (),
                                               x_level))
                continue

            ## Hierarchical rule with 2 NTs
            if len(offsets) == 4:
                span1 = (offsets[0] + i, offsets[1] + i)
                span2 = (offsets[2] + i, offsets[3] + i)
                if not (chart[span1].has_X_tree and chart[span2].has_X_tree):
                    continue

                if not settings.opts.shallow_hiero or self.relaxed_decoding:
                    # Full-hiero / relaxed decoding
                    consObjsLst.append(ConsequentRule(rule, 0, span1, span2))
                    continue

                # Shallow-n hiero
                X1Levels = chart[span1].getXLevelStats(self.sh_order)
                X2Levels = chart[span2].getXLevelStats(self.sh_order)

                # Top level of X2 combined with each flagged X1 level up to it
                top_x2_level = len(X2Levels) - 1
                if X2Levels[top_x2_level]:
                    for x1_level, x1_flagged in enumerate(X1Levels):
                        if x1_flagged and x1_level <= top_x2_level:
                            consObjsLst.append(
                                ConsequentRule(rule, top_x2_level, span1,
                                               span2, x1_level, top_x2_level))

                # Top level of X1 combined with each flagged X2 level up to it
                top_x1_level = len(X1Levels) - 1
                if X1Levels[top_x1_level]:
                    for x2_level, x2_flagged in enumerate(X2Levels):
                        if x2_flagged and x2_level <= top_x1_level:
                            consObjsLst.append(
                                ConsequentRule(rule, top_x1_level, span1,
                                               span2, top_x1_level, x2_level))