Esempio n. 1
0
    def getFeatVec(self):
        '''Return the formatted feature values of this entry's hypothesis.

        The hypothesis comes from getHypothesis() and the two aggregated
        feature collections from computeFeatures(); the formatting itself is
        delegated to FeatureManager.formatFeatureVals().
        '''

        hyp = self.getHypothesis()
        sl_feats, sf_feats = self.computeFeatures()
        return FeatureManager.formatFeatureVals(hyp, sl_feats, sf_feats)
Esempio n. 2
0
    def printEntry(self):
        '''Return the hypothesis together with its formatted feature values.

        Despite the name nothing is printed here: the (hypothesis, features)
        pair is handed back to the caller.
        '''

        hyp = self.getHypothesis()
        sl_feats, sf_feats = self.computeFeatures()
        formatted = FeatureManager.formatFeatureVals(hyp, sl_feats, sf_feats)
        return hyp, formatted
Esempio n. 3
0
    def printEntry(self):
        '''Build the (hypothesis, formatted features) pair for this entry.

        Nothing is written out by this method; the caller receives the
        hypothesis from getHypothesis() and the output of
        FeatureManager.formatFeatureVals() for it.
        '''

        hypothesis = self.getHypothesis()
        sl_vals, sf_vals = self.computeFeatures()
        return hypothesis, FeatureManager.formatFeatureVals(
            hypothesis, sl_vals, sf_vals)
Esempio n. 4
0
    def getFeatVec(self):
        '''Format and return the feature values for the current hypothesis.

        Combines the hypothesis with the two aggregated feature collections
        returned by computeFeatures() through
        FeatureManager.formatFeatureVals().
        '''

        hypothesis = self.getHypothesis()
        sl_vals, sf_vals = self.computeFeatures()
        formatted = FeatureManager.formatFeatureVals(hypothesis, sl_vals,
                                                     sf_vals)
        return formatted
Esempio n. 5
0
def args():
    '''Parse the command-line options and initialise the global decoder settings.

    Side effects:
      * sets the module-level globals `opts` (parsed options plus the derived
        defaults assigned below) and `feat` (default feature weights);
      * loads the configuration through loadConfig();
      * initialises the language models and pushes the feature weights into
        FeatureManager;
      * copies the models locally when --use-local is given;
      * reports the effective settings on stderr.

    Exits with status 1 when no config file is specified, or when force
    decoding is requested without a reference file.
    '''
    import optparse
    optparser = optparse.OptionParser(usage="usage: cat input | %prog [options]")

    optparser.add_option("", "--debug", dest="debug", default=False, action="store_true", help="Debug flag")
    optparser.add_option("", "--config", dest="configFile", type="string", help="Configuration file")
    optparser.add_option("", "--one-nt-decode", dest="one_nt_decode", default=False, action="store_true", help="Run decoder in 1NT mode (ignore 2NT rules)")
    optparser.add_option("", "--shallow-hiero", dest="shallow_hiero", default=False, action="store_true", help="Flag for shallow decoding")
    optparser.add_option("", "--shallow-order", dest="sh_order", default=1, type="int", help="Shallow decoding order")
    optparser.add_option("", "--free-glue", dest="free_glue", default=True, action="store_true", help="Glue rules can freely combine any X")
    optparser.add_option("", "--index", dest="sentindex", default=0, type="int", help="Sentence index")
    optparser.add_option("", "--skip-sents", dest="skip_sents", default=None, type="int", help="Skip sentences (usefel to resume decoding mid-way)")
    optparser.add_option("", "--sentperfile", dest="sent_per_file", default=500, type="int", help="Sentences per file")
    optparser.add_option("", "--fr-rule-terms", dest="fr_rule_terms", default=5, type="int", help="Terms in French side of Hiero rules")
    optparser.add_option("", "--inputfile", dest="inFile", type="string", help="Input data file")
    optparser.add_option("", "--outputfile", dest="outFile", type="string", help="Output file")
    optparser.add_option("", "--glue-file", dest="glueFile", type="string", help="Glue rules file")
    optparser.add_option("", "--ttable-file", dest="ruleFile", type="string", help="SCFG rules file")
    optparser.add_option("", "--lmodel-file", dest="lmFile", type="string", help="LM file")
    optparser.add_option("", "--use-srilm", dest="use_srilm", default=False, action="store_true", help="Flag for using SRILM")
    optparser.add_option("", "--no-lm-state", dest="no_lm_state", default=False, action="store_true", help="Don't use LM state for KENLM")
    optparser.add_option("", "--no-dscnt-UNKlm", dest="no_dscnt_UNKlm", default=False, action="store_true", help="Don't discount LM penalty for UNK")
    optparser.add_option("", "--no-glue-penalty", dest="no_glue_penalty", default=False, action="store_true", help="Don't penalise glue rules")
    optparser.add_option("", "--tm-wgt-cnt", dest="tm_weight_cnt", default=5, type="int", help="# of TM weights")

    optparser.add_option("", "--trace-rules", dest="trace_rules", default=0, type="int", help="Trace the rules used in the k-best candidates as specified")
    optparser.add_option("", "--force-decode", dest="force_decode", default=False, action="store_true", help="Run the decoder in force decode mode")
    optparser.add_option("", "--reffile", dest="refFile", type="string", help="Reference file or prefix for multiple refs (for force decoding)")
    optparser.add_option("", "--use-local", dest="local_path", default="None", type="string", help="Local path to copy the models")
    optparser.add_option("", "--nbest-extremum", dest="nbest_extremum", default=0, type="int", help="Produce nbest_extremum entries if provided; default full nbest list")

    optparser.add_option("", "--lm", dest="weight_lm", default=1.0, type="float", help="Language model weight")
    optparser.add_option("", "--tm", dest="weight_tm", type="string", help="Translation model weights as a string")
    optparser.add_option("", "--tmf", dest="weight_tmf", default=1.0, type="float", help="Forward trans model weight")
    optparser.add_option("", "--tmr", dest="weight_tmr", default=1.0, type="float", help="Reverse trans model weight")
    optparser.add_option("", "--lwf", dest="weight_lwf", default=0.5, type="float", help="Forward lexical trans weight")
    optparser.add_option("", "--lwr", dest="weight_lwr", default=0.5, type="float", help="Reverse lexical trans weight")
    optparser.add_option("", "--pp", dest="weight_pp", default=-1.0, type="float", help="Phrase penalty weight")
    optparser.add_option("", "--wp", dest="weight_wp", default=-2.0, type="float", help="Word penalty weight")
    optparser.add_option("", "--wg", dest="weight_glue", default=0.0, type="float", help="Glue rule weight")

    optparser.add_option("", "--cbp", dest="cbp", default=250, type="int", help="Cube pruning pop limit")
    optparser.add_option("", "--cbp-diversity", dest="cbp_diversity", default=0, type="int", help="Stack diversity in Cube pruning")
    optparser.add_option("", "--ttl", dest="ttl", default=20, type="int", help="# of translations for each source span")
    # The beam thresholds are fractional (default 0.001), so they must be parsed as
    # floats; with type "int" any fractional value on the command line was rejected.
    optparser.add_option("", "--btx", dest="beta_x", default=0.001, type="float", help="Beam threshold for X cells")
    optparser.add_option("", "--bts", dest="beta_s", default=0.001, type="float", help="Beam threshold for S cells")
    optparser.add_option("", "--eps", dest="eps", default=0.1, type="float", help="Beam search margin")

    optparser.add_option("", "--1b", dest="one_best", default=False, action="store_true", help="Just do the best derivation")
    optparser.add_option("", "--zmert-nbest", dest="zmert_nbest", default=False, action="store_true", help="N-best list should be in zmert format")
    optparser.add_option("", "--ng", dest="n_gram_size", default=3, type="int", help="n-gram size")

    global opts, feat
    (opts, _) = optparser.parse_args()      # positional arguments are not used

    # Default flags & thresholds
    # NOTE: fr_rule_terms is deliberately not reset here. Its option default is
    # already 5, and resetting it discarded any --fr-rule-terms value given by the user.
    opts.max_phr_len = 10
    opts.nbest_limit = 100
    opts.use_unique_nbest = True
    opts.nbest_format = True
    opts.score_diff_threshold = 0.01
    opts.elider = '*__*'
    opts.lmTupLst = []
    opts.weightLM = []
    opts.weightTM = []

    if opts.configFile is None:
        sys.stderr.write('ERROR: Please specify a Config file. Exiting!!\n')
        sys.exit(1)
    loadConfig()

    if opts.force_decode and not opts.refFile:
        sys.stderr.write("ERROR: Forced decoding requires at least one reference file.\n")
        sys.stderr.write("       But, no reference file has been specified. Exiting!!\n\n")
        sys.exit(1)

    if (not opts.no_lm_state) and opts.use_srilm:
        sys.stderr.write("INFO: lm_state and srilm are mutually exclusive; no_lm_state can only be used with KENLM.\n")
        sys.stderr.write("      Setting no_lm_state to True and using SRILM\n")
        opts.no_lm_state = True

    # SRILM is forced off here, so the KenLM wrapper is always the one used below.
    if opts.use_srilm:
        sys.stderr.write("WARNING: SRILM wrapper is not included with Kriya and needs to be build separately.\n")
        sys.stderr.write("         Falling back to use KenLM wrapper.\n")
        sys.stderr.write("** If you would like to use SRILM, comment out/remove the lines: 94-98 in Kriya-Decoder/settings.py **\n")
        opts.use_srilm = False

    sys.stderr.write( "INFO: Using the N-gram size      : %d\n" % (opts.n_gram_size) )
    sys.stderr.write( "INFO: Run decoder in 1NT mode    : %s\n" % (opts.one_nt_decode) )
    sys.stderr.write( "INFO: Use X freely in Glue rules : %s\n" % (opts.free_glue) )
    sys.stderr.write( "INFO: # of rule terms in Fr side : %d\n" % (opts.fr_rule_terms) )
    sys.stderr.write( "INFO: Generating unique N-best   : %s\n" % (opts.use_unique_nbest) )
    sys.stderr.write( "INFO: Use state info for KENLM   : %s\n" % (not opts.no_lm_state) )
    sys.stderr.write( "INFO: Discount LM penalty 4 UNK  : %s\n" % (not opts.no_dscnt_UNKlm) )
    sys.stderr.write( "INFO: Glue rules penalty applied : %s\n" % (not opts.no_glue_penalty) )
    sys.stderr.write( "INFO: Cube pruning diversity     : %d\n" % (opts.cbp_diversity) )

    sys.stderr.write( "INFO: Force decoding status      : %s\n" % (opts.force_decode) )
    sys.stderr.write( "INFO: Reference file             : %s\n" % (opts.refFile) )

    # The extremum must stay below half of the n-best size; otherwise fall back to 20.
    if opts.nbest_extremum > 0:
        if opts.nbest_extremum * 2 >= opts.nbest_limit:
            opts.nbest_extremum = 20
            sys.stderr.write( "INFO: Nbest extremum must be less than half the nbest size. Using default nbest extremum of 20.\n" )
        else:
            sys.stderr.write( "INFO: Nbest extremum set: will produce top-%d and bottom-%d entries as nbest-list\n" % (opts.nbest_extremum, opts.nbest_extremum) )

    # Default weights for different features
    feat = Features()
    if opts.weight_tm:
        # Build a real list (map() is lazy on Python 3) and tolerate repeated blanks.
        feat.tm = [float(wgt) for wgt in opts.weight_tm.split()]
    else:
        feat.tm = [opts.weight_tmf, opts.weight_tmr, opts.weight_lwf,
                   opts.weight_lwr, opts.weight_pp]
    feat.wp = opts.weight_wp

    # Set the nbest_format to 'False' & nbest_limit to '1', if one_best option is set
    if opts.one_best:
        opts.nbest_format = False
        opts.nbest_limit = 1
        sys.stderr.write("INFO: one-best option specified. Option nbest-format will be turned off and nbest_limit set to 1.\n")
    sys.stderr.write( "INFO: cbp/ Nbest limit : %d/ %d\n" % (opts.cbp, opts.nbest_limit) )

    if opts.shallow_hiero: sys.stderr.write( "INFO: Shallow decoding hiero with order : %d...\n" % (opts.sh_order) )
    else: sys.stderr.write( "INFO: Shallow decoding hiero turned off; decoding as full hiero ...\n" )

    if opts.use_srilm: sys.stderr.write( "INFO: Using SRILM language model wrapper ...\n" )
    else: sys.stderr.write( "INFO: Using KenLM language model wrapper ...\n" )

    # Initialize the language models
    LanguageModelManager.initLMs(len(opts.weightLM), opts.lmTupLst, opts.use_srilm)

    # Set weights for the features
    FeatureManager.glue_wgt = opts.weight_glue
    FeatureManager.wp_wgt = opts.weight_wp
    FeatureManager.lmWgt = opts.weightLM[:]
    FeatureManager.tmWgt = opts.weightTM[:]
    FeatureManager.setFeatureWeights(len(opts.weightLM), len(opts.weightTM), opts.tm_weight_cnt)

    # "None" (the string) is the option's sentinel default. Compare by value:
    # an identity test against a string literal is not reliable.
    if opts.local_path != 'None':
        sys.stderr.write( "About to copy language model locally ...\n" )
        copyModels()
Esempio n. 6
0
 def scoreRule(self):
     '''Score this rule and cache the results on the instance.

     self.lm_heu receives the value of LanguageModelManager.scoreLMFeat()
     for the target side; self.score receives that value plus the result of
     FeatureManager.scorePTEntry() for the rule's features.
     '''
     pt_part = FeatureManager.scorePTEntry(self.sl_feat)
     lm_part = LanguageModelManager.scoreLMFeat(self.tgt)
     self.lm_heu, self.score = lm_part, pt_part + lm_part
Esempio n. 7
0
 def turnOffGlue(self):
     '''Hand this rule's features to FeatureManager.turnOffGlue(); returns nothing.'''
     rule_feats = self.sl_feat
     FeatureManager.turnOffGlue(rule_feats)
Esempio n. 8
0
 def getScore4TTL(self):
     '''Return the value FeatureManager.getScore4TTL() computes for this rule's features.'''
     ttl_score = FeatureManager.getScore4TTL(self.sl_feat)
     return ttl_score
Esempio n. 9
0
 def initGlue(cls, src, tgt, glue_val):
     '''Create a RuleItem for a glue rule.

     The features passed to RuleItem are built from glue_val by
     FeatureManager.buildGlueFeats(); src and tgt are passed through as-is.
     '''
     glue_feats = FeatureManager.buildGlueFeats(glue_val)
     return RuleItem(src, tgt, glue_feats)
Esempio n. 10
0
 def initRule(cls, src, tgt, probs):
     '''Create a RuleItem for a regular rule.

     The whitespace-separated target terms other than the placeholders
     'X__1' and 'X__2' are counted, and that count is given to
     FeatureManager.buildRuleFeats() together with probs.
     '''
     placeholders = ('X__1', 'X__2')
     n_terms = sum(1 for term in tgt.split() if term not in placeholders)
     rule_feats = FeatureManager.buildRuleFeats(probs, n_terms)
     return RuleItem(src, tgt, rule_feats)
Esempio n. 11
0
def args():
    '''Parse the command-line options and initialise the global decoder settings.

    Side effects:
      * sets the module-level globals `opts` (parsed options plus the derived
        defaults assigned below) and `feat` (default feature weights);
      * loads the configuration through loadConfig();
      * initialises the language models and pushes the feature weights into
        FeatureManager;
      * copies the models locally when --use-local is given;
      * reports the effective settings on stderr.

    Exits with status 1 when no config file is specified, or when force
    decoding is requested without a reference file.
    '''
    import optparse
    optparser = optparse.OptionParser(
        usage="usage: cat input | %prog [options]")
    add = optparser.add_option  # short alias: every option is long-form only

    add("", "--debug", dest="debug", default=False, action="store_true",
        help="Debug flag")
    add("", "--config", dest="configFile", type="string",
        help="Configuration file")
    add("", "--one-nt-decode", dest="one_nt_decode", default=False,
        action="store_true",
        help="Run decoder in 1NT mode (ignore 2NT rules)")
    add("", "--shallow-hiero", dest="shallow_hiero", default=False,
        action="store_true", help="Flag for shallow decoding")
    add("", "--shallow-order", dest="sh_order", default=1, type="int",
        help="Shallow decoding order")
    add("", "--free-glue", dest="free_glue", default=True,
        action="store_true", help="Glue rules can freely combine any X")
    add("", "--index", dest="sentindex", default=0, type="int",
        help="Sentence index")
    add("", "--skip-sents", dest="skip_sents", default=None, type="int",
        help="Skip sentences (usefel to resume decoding mid-way)")
    add("", "--sentperfile", dest="sent_per_file", default=500, type="int",
        help="Sentences per file")
    add("", "--fr-rule-terms", dest="fr_rule_terms", default=5, type="int",
        help="Terms in French side of Hiero rules")
    add("", "--inputfile", dest="inFile", type="string",
        help="Input data file")
    add("", "--outputfile", dest="outFile", type="string",
        help="Output file")
    add("", "--glue-file", dest="glueFile", type="string",
        help="Glue rules file")
    add("", "--ttable-file", dest="ruleFile", type="string",
        help="SCFG rules file")
    add("", "--lmodel-file", dest="lmFile", type="string", help="LM file")
    add("", "--use-srilm", dest="use_srilm", default=False,
        action="store_true", help="Flag for using SRILM")
    add("", "--no-lm-state", dest="no_lm_state", default=False,
        action="store_true", help="Don't use LM state for KENLM")
    add("", "--no-dscnt-UNKlm", dest="no_dscnt_UNKlm", default=False,
        action="store_true", help="Don't discount LM penalty for UNK")
    add("", "--no-glue-penalty", dest="no_glue_penalty", default=False,
        action="store_true", help="Don't penalise glue rules")
    add("", "--tm-wgt-cnt", dest="tm_weight_cnt", default=5, type="int",
        help="# of TM weights")

    add("", "--trace-rules", dest="trace_rules", default=0, type="int",
        help="Trace the rules used in the k-best candidates as specified")
    add("", "--force-decode", dest="force_decode", default=False,
        action="store_true", help="Run the decoder in force decode mode")
    add("", "--reffile", dest="refFile", type="string",
        help="Reference file or prefix for multiple refs (for force decoding)")
    add("", "--use-local", dest="local_path", default="None", type="string",
        help="Local path to copy the models")
    add("", "--nbest-extremum", dest="nbest_extremum", default=0, type="int",
        help="Produce nbest_extremum entries if provided; default full nbest list")

    add("", "--lm", dest="weight_lm", default=1.0, type="float",
        help="Language model weight")
    add("", "--tm", dest="weight_tm", type="string",
        help="Translation model weights as a string")
    add("", "--tmf", dest="weight_tmf", default=1.0, type="float",
        help="Forward trans model weight")
    add("", "--tmr", dest="weight_tmr", default=1.0, type="float",
        help="Reverse trans model weight")
    add("", "--lwf", dest="weight_lwf", default=0.5, type="float",
        help="Forward lexical trans weight")
    add("", "--lwr", dest="weight_lwr", default=0.5, type="float",
        help="Reverse lexical trans weight")
    add("", "--pp", dest="weight_pp", default=-1.0, type="float",
        help="Phrase penalty weight")
    add("", "--wp", dest="weight_wp", default=-2.0, type="float",
        help="Word penalty weight")
    add("", "--wg", dest="weight_glue", default=0.0, type="float",
        help="Glue rule weight")

    add("", "--cbp", dest="cbp", default=250, type="int",
        help="Cube pruning pop limit")
    add("", "--cbp-diversity", dest="cbp_diversity", default=0, type="int",
        help="Stack diversity in Cube pruning")
    add("", "--ttl", dest="ttl", default=20, type="int",
        help="# of translations for each source span")
    # The beam thresholds are fractional (default 0.001), so they must be
    # parsed as floats; with type "int" any fractional value given on the
    # command line was rejected.
    add("", "--btx", dest="beta_x", default=0.001, type="float",
        help="Beam threshold for X cells")
    add("", "--bts", dest="beta_s", default=0.001, type="float",
        help="Beam threshold for S cells")
    add("", "--eps", dest="eps", default=0.1, type="float",
        help="Beam search margin")

    add("", "--1b", dest="one_best", default=False, action="store_true",
        help="Just do the best derivation")
    add("", "--zmert-nbest", dest="zmert_nbest", default=False,
        action="store_true", help="N-best list should be in zmert format")
    add("", "--ng", dest="n_gram_size", default=3, type="int",
        help="n-gram size")

    global opts, feat
    (opts, _) = optparser.parse_args()  # positional arguments are not used

    # Default flags & thresholds
    # NOTE: fr_rule_terms is deliberately not reset here. Its option default
    # is already 5, and resetting it discarded any --fr-rule-terms value
    # given by the user.
    opts.max_phr_len = 10
    opts.nbest_limit = 100
    opts.use_unique_nbest = True
    opts.nbest_format = True
    opts.score_diff_threshold = 0.01
    opts.elider = '*__*'
    opts.lmTupLst = []
    opts.weightLM = []
    opts.weightTM = []

    if opts.configFile is None:
        sys.stderr.write('ERROR: Please specify a Config file. Exiting!!\n')
        sys.exit(1)
    loadConfig()

    if opts.force_decode and not opts.refFile:
        sys.stderr.write(
            "ERROR: Forced decoding requires at least one reference file.\n")
        sys.stderr.write(
            "       But, no reference file has been specified. Exiting!!\n\n")
        sys.exit(1)

    if (not opts.no_lm_state) and opts.use_srilm:
        sys.stderr.write(
            "INFO: lm_state and srilm are mutually exclusive; no_lm_state can only be used with KENLM.\n"
        )
        sys.stderr.write("      Setting no_lm_state to True and using SRILM\n")
        opts.no_lm_state = True

    # SRILM is forced off here, so the KenLM wrapper is always the one used
    # below.
    if opts.use_srilm:
        sys.stderr.write(
            "WARNING: SRILM wrapper is not included with Kriya and needs to be build separately.\n"
        )
        sys.stderr.write("         Falling back to use KenLM wrapper.\n")
        sys.stderr.write(
            "** If you would like to use SRILM, comment out/remove the lines: 94-98 in Kriya-Decoder/settings.py **\n"
        )
        opts.use_srilm = False

    sys.stderr.write("INFO: Using the N-gram size      : %d\n" %
                     (opts.n_gram_size))
    sys.stderr.write("INFO: Run decoder in 1NT mode    : %s\n" %
                     (opts.one_nt_decode))
    sys.stderr.write("INFO: Use X freely in Glue rules : %s\n" %
                     (opts.free_glue))
    sys.stderr.write("INFO: # of rule terms in Fr side : %d\n" %
                     (opts.fr_rule_terms))
    sys.stderr.write("INFO: Generating unique N-best   : %s\n" %
                     (opts.use_unique_nbest))
    sys.stderr.write("INFO: Use state info for KENLM   : %s\n" %
                     (not opts.no_lm_state))
    sys.stderr.write("INFO: Discount LM penalty 4 UNK  : %s\n" %
                     (not opts.no_dscnt_UNKlm))
    sys.stderr.write("INFO: Glue rules penalty applied : %s\n" %
                     (not opts.no_glue_penalty))
    sys.stderr.write("INFO: Cube pruning diversity     : %d\n" %
                     (opts.cbp_diversity))

    sys.stderr.write("INFO: Force decoding status      : %s\n" %
                     (opts.force_decode))
    sys.stderr.write("INFO: Reference file             : %s\n" %
                     (opts.refFile))

    # The extremum must stay below half of the n-best size; otherwise fall
    # back to 20.
    if opts.nbest_extremum > 0:
        if opts.nbest_extremum * 2 >= opts.nbest_limit:
            opts.nbest_extremum = 20
            sys.stderr.write(
                "INFO: Nbest extremum must be less than half the nbest size. Using default nbest extremum of 20.\n"
            )
        else:
            sys.stderr.write(
                "INFO: Nbest extremum set: will produce top-%d and bottom-%d entries as nbest-list\n"
                % (opts.nbest_extremum, opts.nbest_extremum))

    # Default weights for different features
    feat = Features()
    if opts.weight_tm:
        # Build a real list (map() is lazy on Python 3) and tolerate
        # repeated blanks between the weights.
        feat.tm = [float(wgt) for wgt in opts.weight_tm.split()]
    else:
        feat.tm = [opts.weight_tmf, opts.weight_tmr, opts.weight_lwf,
                   opts.weight_lwr, opts.weight_pp]
    feat.wp = opts.weight_wp

    # Set the nbest_format to 'False' & nbest_limit to '1', if one_best option is set
    if opts.one_best:
        opts.nbest_format = False
        opts.nbest_limit = 1
        sys.stderr.write(
            "INFO: one-best option specified. Option nbest-format will be turned off and nbest_limit set to 1.\n"
        )
    sys.stderr.write("INFO: cbp/ Nbest limit : %d/ %d\n" %
                     (opts.cbp, opts.nbest_limit))

    if opts.shallow_hiero:
        sys.stderr.write("INFO: Shallow decoding hiero with order : %d...\n" %
                         (opts.sh_order))
    else:
        sys.stderr.write(
            "INFO: Shallow decoding hiero turned off; decoding as full hiero ...\n"
        )

    if opts.use_srilm:
        sys.stderr.write("INFO: Using SRILM language model wrapper ...\n")
    else:
        sys.stderr.write("INFO: Using KenLM language model wrapper ...\n")

    # Initialize the language models
    LanguageModelManager.initLMs(len(opts.weightLM), opts.lmTupLst,
                                 opts.use_srilm)

    # Set weights for the features
    FeatureManager.glue_wgt = opts.weight_glue
    FeatureManager.wp_wgt = opts.weight_wp
    FeatureManager.lmWgt = opts.weightLM[:]
    FeatureManager.tmWgt = opts.weightTM[:]
    FeatureManager.setFeatureWeights(len(opts.weightLM), len(opts.weightTM),
                                     opts.tm_weight_cnt)

    # "None" (the string) is the option's sentinel default. Compare by value:
    # an identity test against a string literal is not reliable.
    if opts.local_path != 'None':
        sys.stderr.write("About to copy language model locally ...\n")
        copyModels()