import os
import shutil


def filter(pcconfig, lang):
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    segmenter_execpath = config.stanford_execpath
    segmenter_standard = config.stanford_standard
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # Normalize the lines.
    infile = open(filename, "r")
    outfile = open(filename + ".tmp", "w")
    normalize(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ".tmp", filename)

    # If the corpus is not tokenized first, the Stanford Chinese Segmenter
    # drops the string "}。".
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang) +
                    " < " + '"' + filename + '"' +
                    " > " + '"' + filename + ".tmp" + '"' +
                    " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    shutil.move(filename + ".tmp", filename)

    scriptpath = segmenter_execpath
    if scriptpath is not None:
        scriptpath = os.path.expanduser(scriptpath)
    scriptname = scriptpath + "/segment.sh"
    print "segmenter path : " + scriptname
    scriptparams = (" " + segmenter_standard +
                    " " + '"' + filename + '"' +
                    " UTF-8 0" + " 2> /dev/null" +
                    " > " + '"' + filename + ".cntok" + '"')
    # Wrap the script name in double quotes so the script can still be
    # executed when the path contains embedded whitespace. Bug #2830571.
    scriptcmd = '"' + scriptname + '"' + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    shutil.copy(filename + ".cntok", filename)

    # The Stanford Chinese Segmenter combines "{ 1 }" back into "{1}", but
    # not perfectly, so filter the corpus with the English tokenizer and
    # detokenize again.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang) +
                    " < " + '"' + filename + '"' +
                    " > " + '"' + filename + ext + '"' +
                    " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)

    infile = open(filename + ext, "r")
    outfile = open(filename + ext + ".detok", "w")
    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
    # shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
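# Note: the commands above are assembled by string concatenation and run
# through os.system, which is why the segmenter path has to be hand-quoted
# (bug #2830571). A minimal sketch of the same segment.sh invocation using
# subprocess, which passes arguments as a list and sidesteps shell quoting
# entirely; the helper name run_segmenter is illustrative, not part of
# this module.
import subprocess


def run_segmenter(segmenter_execpath, standard, filename):
    # No shell is involved, so embedded whitespace in the path needs no
    # manual quoting; stdout/stderr redirection replaces "> ..." and
    # "2> /dev/null" from the shell command.
    script = os.path.join(os.path.expanduser(segmenter_execpath), "segment.sh")
    with open(filename + ".cntok", "w") as out:
        with open(os.devnull, "w") as err:
            subprocess.call([script, standard, filename, "UTF-8", "0"],
                            stdout=out, stderr=err)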
# Variant of filter() without the Chinese segmentation pass.
def filter(pcconfig, lang):
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # Normalize the lines.
    infile = open(filename, "r")
    outfile = open(filename + ".tmp", "w")
    normalize(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ".tmp", filename)

    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang) +
                    " < " + '"' + filename + '"' +
                    " > " + '"' + filename + ext + '"' +
                    " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)

    infile = open(filename + ext, "r")
    outfile = open(filename + ext + ".detok", "w")
    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
    # shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
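# Both variants of filter() lean on normalize() and detoken() helpers whose
# definitions are not shown here. The stand-ins below are purely illustrative
# guesses at their shape (line-level whitespace normalization, and re-joining
# the tokenizer's spaced-out placeholder braces); the real implementations
# live elsewhere in corpustool.
def normalize(infile, outfile):
    # Illustrative stand-in: collapse runs of whitespace so every line is
    # written back with single spaces and a trailing newline.
    for line in infile:
        outfile.write(" ".join(line.split()) + "\n")


def detoken(infile, outfile):
    # Illustrative stand-in: undo the tokenizer's extra spacing around
    # placeholder braces, e.g. turn "{ 1 }" back into "{1}".
    for line in infile:
        outfile.write(line.replace("{ ", "{").replace(" }", "}"))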
def getNamedCorpusFile(self, src, target, name, lang):
    """Get the path of the corpus file for lang in the (src, target) corpus directory."""
    return os.path.join(self.getCorpusDir(src, target),
                        name + "." + langName(lang))
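# For illustration only: assuming getCorpusDir(src, target) returns a
# per-pair directory such as "./corpus/en-US__fr-FR" and langName("fr-FR")
# yields "fr" (both values invented here), a call would compose:
#
#   config.getNamedCorpusFile("en-US", "fr-FR", "train", "fr-FR")
#   -> os.path.join("./corpus/en-US__fr-FR", "train" + "." + "fr")
#   -> "./corpus/en-US__fr-FR/train.fr"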