Example #1
0
def filter(pcconfig, lang):
    """Normalize, tokenize, segment and detokenize the corpus file in place.

    The corpus file is looked up with
    ``config.getCorpusFile(config.src, pcconfig.target, lang)`` and is
    overwritten after every stage:

    1. ``normalize`` rewrites the lines.
    2. ``tokenizer.perl`` is run on the file.
    3. If ``config.stanford_execpath`` is not None, ``segment.sh`` from that
       directory is run on the file.
    4. ``tokenizer.perl`` is run again and ``detoken`` post-processes its
       output.

    Intermediate files (``.tmp``, ``.cntok``, ``.tok``, ``.tok.detok``) are
    written next to the corpus file.  The external commands are run through
    ``os.system`` with stderr sent to /dev/null; their exit status is not
    checked.

    Args:
        pcconfig: object providing ``config`` and ``target``.
        lang: language identifier; passed to ``langName`` and used in the
            log messages.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    segmenter_execpath = config.stanford_execpath
    segmenter_standard = config.stanford_standard

    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # NOTE: the filename is only wrapped in double quotes for the shell, so a
    # path containing '"', '$' or a backquote would still break the commands.
    quoted_filename = '"' + filename + '"'

    # Normalize the lines.  The context managers close both files even when
    # normalize() raises.
    with open(filename, "r") as infile:
        with open(filename + ".tmp", "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(filename + ".tmp", filename)

    # If the corpus is not tokenized first, the Stanford Chinese Segmenter
    # will drop the string "}。".
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang)
                    + " < " + quoted_filename
                    + " > " + '"' + filename + ".tmp" + '"'
                    + " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)
    shutil.move(filename + ".tmp", filename)

    if segmenter_execpath is not None:
        segmenter_dir = os.path.expanduser(segmenter_execpath)
        scriptname = segmenter_dir + "/segment.sh"
        print("segmenter path : " + scriptname)

        scriptparams = (" " + segmenter_standard + " " + quoted_filename
                        + " UTF-8 0" + " 2> /dev/null"
                        + " > " + '"' + filename + ".cntok" + '"')
        # Wrap the script name in double quotes so the script can still be
        # executed when its path contains whitespace (#2830571).
        scriptcmd = '"' + scriptname + '"' + scriptparams
        print(scriptcmd)
        os.system(scriptcmd)
        shutil.copy(filename + ".cntok", filename)

    # The Stanford Chinese Segmenter combines "{ 1 }" back into "{1}", but not
    # perfectly, so filter the corpus with the tokenizer and detoken again.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang)
                    + " < " + quoted_filename
                    + " > " + '"' + filename + ext + '"'
                    + " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)

    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)
Example #2
0
def filter(pcconfig, lang):
    """Normalize, tokenize and detokenize the corpus file in place.

    The corpus file is looked up with
    ``config.getCorpusFile(config.src, pcconfig.target, lang)``.  It is first
    rewritten by ``normalize``, then ``tokenizer.perl`` writes a ``.tok`` file
    from it, and finally ``detoken`` turns that into ``.tok.detok``, which is
    copied back over the corpus file.

    The tokenizer is run through ``os.system`` with stderr sent to /dev/null;
    its exit status is not checked.

    Args:
        pcconfig: object providing ``config`` and ``target``.
        lang: language identifier; passed to ``langName`` and used in the
            log messages.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # Normalize the lines.  The context managers close both files even when
    # normalize() raises.
    with open(filename, "r") as infile:
        with open(filename + ".tmp", "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(filename + ".tmp", filename)

    # NOTE: the filename is only wrapped in double quotes for the shell, so a
    # path containing '"', '$' or a backquote would still break the command.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang)
                    + " < " + '"' + filename + '"'
                    + " > " + '"' + filename + ext + '"'
                    + " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)

    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)
Example #3
0
def filter(pcconfig, lang):
    """Run the normalize / tokenize / detoken pipeline on one corpus file.

    The file path comes from
    ``config.getCorpusFile(config.src, pcconfig.target, lang)``.  Each stage
    writes a sibling file (``.tmp``, ``.tok``, ``.tok.detok``); the normalized
    and the detokenized results are copied back over the corpus file.

    ``tokenizer.perl`` is started through ``os.system`` with stderr redirected
    to /dev/null, and its exit status is ignored.

    Args:
        pcconfig: object providing ``config`` and ``target``.
        lang: language identifier; passed to ``langName`` and used in the
            log messages.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    tokenized = filename + ext

    # Normalize the lines; `with` guarantees the handles are released even if
    # normalize() fails part-way through.
    with open(filename, "r") as infile:
        with open(filename + ".tmp", "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(filename + ".tmp", filename)

    # NOTE: paths are only double-quoted for the shell; a path containing
    # '"', '$' or a backquote would still break the command.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang)
                    + " < " + '"' + filename + '"'
                    + " > " + '"' + tokenized + '"'
                    + " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)

    with open(tokenized, "r") as infile:
        with open(tokenized + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(tokenized + ".detok", filename)
    log_done("tokenize " + lang)
Example #4
0
 def getNamedCorpusFile(self, src, target, name, lang):
     """Return the path of the corpus file ``name`` for ``lang``.

     The file is placed in the directory given by
     ``self.getCorpusDir(src, target)`` and is named
     ``name + "." + langName(lang)``.
     """
     corpus_dir = self.getCorpusDir(src, target)
     basename = name + "." + langName(lang)
     return os.path.join(corpus_dir, basename)
Example #5
0
def filter(pcconfig, lang):
    """Normalize, tokenize, segment and detokenize the corpus file in place.

    The corpus file is looked up with
    ``config.getCorpusFile(config.src, pcconfig.target, lang)`` and is
    overwritten after every stage:

    1. ``normalize`` rewrites the lines.
    2. ``tokenizer.perl`` is run on the file.
    3. If ``config.stanford_execpath`` is not None, ``segment.sh`` from that
       directory is run on the file.
    4. ``tokenizer.perl`` is run again and ``detoken`` post-processes its
       output.

    Intermediate files (``.tmp``, ``.cntok``, ``.tok``, ``.tok.detok``) are
    written next to the corpus file.  The external commands are run through
    ``os.system`` with stderr sent to /dev/null; their exit status is not
    checked.

    Args:
        pcconfig: object providing ``config`` and ``target``.
        lang: language identifier; passed to ``langName`` and used in the
            log messages.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    segmenter_execpath = config.stanford_execpath
    segmenter_standard = config.stanford_standard

    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    tmpname = filename + ".tmp"
    tokname = filename + ext

    # Normalize the lines.  Using `with` closes both files even when
    # normalize() raises.
    with open(filename, "r") as infile:
        with open(tmpname, "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(tmpname, filename)

    # If the corpus is not tokenized first, the Stanford Chinese Segmenter
    # will drop the string "}。".
    # NOTE: paths are only double-quoted for the shell; a path containing
    # '"', '$' or a backquote would still break the commands below.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang)
                    + " < " + '"' + filename + '"'
                    + " > " + '"' + tmpname + '"'
                    + " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)
    shutil.move(tmpname, filename)

    if segmenter_execpath is not None:
        scriptname = os.path.expanduser(segmenter_execpath) + "/segment.sh"
        print("segmenter path : " + scriptname)

        scriptparams = (" " + segmenter_standard
                        + " " + '"' + filename + '"'
                        + " UTF-8 0" + " 2> /dev/null"
                        + " > " + '"' + filename + ".cntok" + '"')
        # Wrap the script name in double quotes so the script can still be
        # executed when its path contains whitespace (#2830571).
        scriptcmd = '"' + scriptname + '"' + scriptparams
        print(scriptcmd)
        os.system(scriptcmd)
        shutil.copy(filename + ".cntok", filename)

    # The Stanford Chinese Segmenter combines "{ 1 }" back into "{1}", but not
    # perfectly, so filter the corpus with the tokenizer and detoken again.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang)
                    + " < " + '"' + filename + '"'
                    + " > " + '"' + tokname + '"'
                    + " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)

    with open(tokname, "r") as infile:
        with open(tokname + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(tokname + ".detok", filename)
    log_done("tokenize " + lang)