コード例 #1
0
ファイル: thot_train_detok_model.py プロジェクト: buubuu/thot
def main(argv):
    """Train the translation and language models and write them to disk.

    Parses the command line, trains an ``smtpr.TransModel`` and an
    ``smtpr.LangModel`` from the raw text file (plus the tokenized file
    when one is given) and writes them to ``<opref>.tm`` and ``<opref>.lm``.

    Options are read from ``sys.argv``; the ``argv`` parameter is accepted
    but not consulted.

        -h, --help      print help and exit
        -r, --rawfn     raw text file (required)
        -t, --tokfn     tokenized text file (optional)
        -n, --nval      integer forwarded to language model training
                        (default 3)
        -o, --opref     prefix of the output files (required)
        -v, --verbose   verbose flag forwarded to the training calls

    Exits with status 2 on an unknown option, on a missing -r or -o, or
    when the -n value is not an integer.
    """
    # take parameters
    rfilename = ""
    tfilename = ""
    nval = 3
    opref = ""
    r_given = t_given = n_given = o_given = False
    verbose = False
    try:
        # "verbose" must be registered here, otherwise --verbose is rejected
        # by getopt and the matching branch below can never run.
        opts, args = getopt.getopt(
            sys.argv[1:], "hr:t:n:o:v",
            ["help", "rawfn=", "tokfn=", "nval=", "opref=", "verbose"])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)
    if not opts:
        print_help()
        sys.exit()

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-r", "--rawfn"):
            rfilename = arg
            r_given = True
        elif opt in ("-t", "--tokfn"):
            tfilename = arg
            t_given = True
        elif opt in ("-n", "--nval"):
            try:
                nval = int(arg)
            except ValueError:
                sys.stderr.write("Error! -n parameter must be an integer\n")
                sys.exit(2)
            n_given = True
        elif opt in ("-o", "--opref"):
            opref = arg
            o_given = True
        elif opt in ("-v", "--verbose"):
            verbose = True

    # check parameters
    if not r_given:
        sys.stderr.write("Error! -r parameter not given\n")
        sys.exit(2)

    if not o_given:
        sys.stderr.write("Error! -o parameter not given\n")
        sys.exit(2)

    # print parameters (-r and -o are guaranteed to be set at this point)
    sys.stderr.write("r is %s\n" % (rfilename))
    if t_given:
        sys.stderr.write("t is %s\n" % (tfilename))
    if n_given:
        sys.stderr.write("n is %d\n" % (nval))
    sys.stderr.write("o is %s\n" % (opref))

    # train translation model (first pass over the input files)
    rfile = io.open(rfilename, 'r', encoding="utf-8")
    tfile = None
    try:
        if t_given:
            tfile = io.open(tfilename, 'r', encoding="utf-8")
        sys.stderr.write("Training translation model...\n")
        tmodel = smtpr.TransModel()
        if t_given:
            tmodel.train_tok_tm_par_files(rfile, tfile, verbose)
        else:
            tmodel.train_tok_tm(rfile, verbose)
    finally:
        rfile.close()
        if tfile is not None:
            tfile.close()

    # print translation model; the with-block closes (and flushes) the file
    with io.open(opref + ".tm", 'w', encoding='utf-8') as tmfile:
        tmodel.print_model_to_file(tmfile)

    # train language model (second pass: the inputs are opened again so
    # reading restarts from the beginning)
    rfile = io.open(rfilename, 'r', encoding="utf-8")
    tfile = None
    try:
        if t_given:
            tfile = io.open(tfilename, 'r', encoding="utf-8")
        sys.stderr.write("Training language model...\n")
        lmodel = smtpr.LangModel()
        if t_given:
            lmodel.train_tok_lm_par_files(rfile, tfile, nval, verbose)
        else:
            lmodel.train_tok_lm(rfile, nval, verbose)
    finally:
        rfile.close()
        if tfile is not None:
            tfile.close()

    # print language model
    with io.open(opref + ".lm", 'w', encoding='utf-8') as lmfile:
        lmodel.print_model_to_file(lmfile)
コード例 #2
0
ファイル: thot_rec_translator.py プロジェクト: buubuu/thot
def main(argv):
    """Load a translation and a language model and recase the input text.

    Parses the command line, loads ``<mpref>.tm`` into an
    ``smtpr.TransModel`` and ``<mpref>.lm`` into an ``smtpr.LangModel``,
    builds an ``smtpr.Decoder`` from them and calls its ``recase`` method
    on the input file, or on standard input when no file is given.

    Options are read from ``sys.argv``; the ``argv`` parameter is accepted
    but not consulted.

        -h, --help      print help and exit
        -f, --filename  input file (optional, defaults to standard input)
        -m, --mpref     prefix of the model files (required)
        -i, --interp    float passed to ``LangModel.set_interp_prob``
                        (default ``smtpr._global_lm_interp_prob``)
        -w, --weights   whitespace-separated floats passed to the decoder
                        (default ``[1, 1, 1, 1]``)
        -v, --verbose   verbose flag forwarded to ``recase``

    Exits with status 2 on an unknown option, on a missing -m, or when
    the -i or -w values are not numeric.
    """
    # take parameters
    filename = ""
    mpref = ""
    ival = smtpr._global_lm_interp_prob
    weights = [1, 1, 1, 1]
    weight_str = ""
    f_given = m_given = i_given = w_given = False
    verbose = False
    try:
        # "verbose" must be registered here, otherwise --verbose is rejected
        # by getopt and the matching branch below can never run.
        opts, args = getopt.getopt(
            sys.argv[1:], "hf:m:i:w:v",
            ["help", "filename=", "mpref=", "interp=", "weights=", "verbose"])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)
    if not opts:
        print_help()
        sys.exit()

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-f", "--filename"):
            filename = arg
            f_given = True
        elif opt in ("-m", "--mpref"):
            mpref = arg
            m_given = True
        elif opt in ("-i", "--interp"):
            try:
                ival = float(arg)
            except ValueError:
                sys.stderr.write("Error! -i parameter must be a number\n")
                sys.exit(2)
            i_given = True
        elif opt in ("-w", "--weights"):
            weight_str = arg
            try:
                weights = [float(token) for token in weight_str.split()]
            except ValueError:
                sys.stderr.write(
                    "Error! -w parameter must be a list of numbers\n")
                sys.exit(2)
            w_given = True
        elif opt in ("-v", "--verbose"):
            verbose = True

    # check parameters
    if not m_given:
        sys.stderr.write("Error! -m parameter not given\n")
        sys.exit(2)

    # print parameters (-m is guaranteed to be set at this point)
    if f_given:
        sys.stderr.write("f is %s\n" % (filename))
    sys.stderr.write("m is %s\n" % (mpref))
    if i_given:
        sys.stderr.write("i is %f\n" % (ival))
    if w_given:
        sys.stderr.write("w is \"%s\"\n" % (weight_str))

    # open input
    if f_given:
        infile = io.open(filename, 'r', encoding="utf-8")
    else:
        # fallback to stdin; closefd=False so that closing the wrapper
        # leaves the process' standard input descriptor open
        infile = io.open(sys.stdin.fileno(), 'r', encoding='utf8',
                         closefd=False)

    try:
        # load translation model
        tmodel = smtpr.TransModel()
        tmfilename = mpref + ".tm"
        with io.open(tmfilename, 'r', encoding="utf-8") as tmfile:
            sys.stderr.write(
                "Loading translation model from file %s ...\n" % tmfilename)
            tmodel.load(tmfile)

        # load language model
        lmodel = smtpr.LangModel()
        lmfilename = mpref + ".lm"
        with io.open(lmfilename, 'r', encoding="utf-8") as lmfile:
            sys.stderr.write(
                "Loading language model from file %s ...\n" % lmfilename)
            lmodel.load(lmfile)
        lmodel.set_interp_prob(ival)

        # recase the input
        decoder = smtpr.Decoder(tmodel, lmodel, weights)
        sys.stderr.write("Recasing...\n")
        decoder.recase(infile, verbose)
    finally:
        infile.close()