def filter(pcconfig, lang):
    """Tokenize the corpus file for *lang* in place.

    Pipeline: normalize the raw lines, run the Moses tokenizer.perl over
    the file, then "detoken" the result back and overwrite the original
    corpus file.  Intermediate files (.tmp, .tok, .tok.detok) are left on
    disk next to the corpus file.

    pcconfig -- per-corpus config wrapper; pcconfig.config exposes the
                project Config object (source lang, target lang, paths).
    lang     -- language code of the corpus side being tokenized.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # normalize the lines.
    infile = open(filename, "r")
    outfile = open(filename + ".tmp", "w")
    normalize(infile, outfile)
    infile.close()
    outfile.close()
    # Overwrite the corpus with the normalized copy; the .tmp file remains.
    shutil.copyfile(filename + ".tmp", filename)
    # Build the external tokenizer command.  NOTE(review): filename is
    # interpolated into a shell string — assumes paths contain no double
    # quotes; confirm against how getCorpusFile builds paths.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName( lang ) + " < " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    # Exit status of the tokenizer is not checked.
    os.system(scriptcmd)
    # Re-join tokenized output and write it over the original corpus file.
    infile = open(filename + ext, "r")
    outfile = open(filename + ext + ".detok", "w")
    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
    # shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
def __init__(self, path): """Construct the config object by parsing and validating the configuration file.""" # the config data member self.project = None # project name :string self.exportdir = None # export directory :path self.username = None # user name :string self.userpath = None # user directory :path self.rawfiles = None # raw file list :list of path self.src = None # source lang :string self.targets = None # target langs :list of string self.stanford_execpath = None # Stanford Chinese Word Segmenter path :path self.stanford_standard = None # Stanford Chinese Word Segmenter Standard : string try: log_start("Config") log_stderr("Config file: '{0}'".format(path)) self._readConfig(path) self._validateConfig() log_done("Config") except ConfigException as e: log_error(e.message) log_fail("Config") raise
def generateCorpus(self):
    """Split every raw file into per-language corpus files, then merge.

    Each raw file is split independently; a failure on one file is logged
    as a warning and the remaining files are still processed.  Only when
    no file at all succeeds is the whole step marked failed.
    """
    log_start("Split")
    self._prepare()
    if ( len(self.config.targets) == 0 ):
        raise SplitException("Prepare the directory failed.")
    filelist = []    # raw files that were split successfully
    for afile in self.config.rawfiles:
        try:
            log_start("Split {0}".format(afile))
            self.fillPool(afile)
            self.splitter.split(afile)
            self.filepool.closeFiles()
            self.filepool.clean()
            filelist.append(afile)
            log_done("Split {0}".format(afile))
        except SplitException as e:
            # Best-effort: log and continue with the next raw file.
            log_warning(e.message)
            # TODO: del the files when failed.
            log_fail("Split {0}".format(afile))
    if filelist == [] :
        log_error("No corpus file generated.")
        log_fail("Split")
    else:
        self.mergeCorpus(filelist)
        log_done("Split")
def filter(pcconfig, lang):
    """Tokenize a Japanese (ja-JP) corpus file in place using ChaSen.

    Runs the bundled ./chasen binary from its own directory, then
    "detokens" the output back over the original corpus file.

    pcconfig -- per-corpus config wrapper; pcconfig.config is the Config.
    lang     -- language code of the corpus side being tokenized.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # chasen is invoked from its own directory (it likely needs local
    # resource files).  NOTE(review): assumes `filename` is an absolute
    # path, otherwise the redirections below point into scriptpath —
    # confirm against getCorpusFile.  The chdir is also not restored if
    # anything in between raises.
    oldcwd = os.getcwd()
    scriptpath = "./corpustool/engine/ja-JP/"
    os.chdir(scriptpath)
    scriptname = "./chasen"
    scriptparams = " -i w " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
    scriptcmd = scriptname + scriptparams
    print scriptcmd
    # Exit status is not checked.
    os.system(scriptcmd)
    os.chdir(oldcwd)
    # Re-join tokenized output and overwrite the original corpus file.
    infile = open(filename + ext, "r")
    outfile = open(filename + ext + ".detok", "w")
    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
    # shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
def filter(pcconfig, lang):
    """Tokenize the corpus file for *lang* in place.

    Pipeline: normalize the raw lines, run the Moses tokenizer.perl over
    the file, then "detoken" the result back and overwrite the original
    corpus file.  Intermediate files (.tmp, .tok, .tok.detok) are left on
    disk next to the corpus file.

    pcconfig -- per-corpus config wrapper; pcconfig.config is the Config.
    lang     -- language code of the corpus side being tokenized.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # normalize the lines.
    infile = open(filename, "r")
    outfile = open(filename + ".tmp", "w")
    normalize(infile, outfile)
    infile.close()
    outfile.close()
    # Overwrite the corpus with the normalized copy; the .tmp file remains.
    shutil.copyfile(filename + ".tmp", filename)
    # NOTE(review): filename is interpolated into a shell string — assumes
    # paths contain no double quotes.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"'+ filename +'"' + " > " + '"'+ filename+ext +'"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    # Exit status of the tokenizer is not checked.
    os.system(scriptcmd)
    # Re-join tokenized output and write it over the original corpus file.
    infile = open(filename + ext , "r")
    outfile = open(filename + ext + ".detok", "w")
    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
    # shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
def filter(pcconfig, lang):
    """Tokenize a Japanese (ja-JP) corpus file in place using ChaSen.

    Runs the bundled ./chasen binary from its own directory, then
    "detokens" the output back over the original corpus file.

    pcconfig -- per-corpus config wrapper; pcconfig.config is the Config.
    lang     -- language code of the corpus side being tokenized.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # chasen is invoked from its own directory.  NOTE(review): assumes
    # `filename` is absolute, otherwise the shell redirections below point
    # into scriptpath — confirm against getCorpusFile.  The chdir is not
    # restored if anything in between raises.
    oldcwd = os.getcwd()
    scriptpath = "./corpustool/engine/ja-JP/"
    os.chdir(scriptpath)
    scriptname = "./chasen"
    scriptparams = " -i w " + '"'+ filename +'"' + " > " + '"'+ filename+ext +'"' + " 2> /dev/null"
    scriptcmd = scriptname + scriptparams
    print scriptcmd
    # Exit status is not checked.
    os.system(scriptcmd)
    os.chdir(oldcwd)
    # Re-join tokenized output and overwrite the original corpus file.
    infile = open(filename + ext , "r")
    outfile = open(filename + ext + ".detok", "w")
    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
    # shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
def filter(pcconfig, lang):
    """Tokenize a Chinese corpus file in place with the Stanford segmenter.

    Pipeline: normalize lines -> Moses tokenizer.perl (pre-pass) ->
    Stanford Chinese Word Segmenter -> tokenizer.perl again -> detoken,
    overwriting the original corpus file at the end.  Intermediate files
    (.cntok, .tok, .tok.detok) are left on disk.

    pcconfig -- per-corpus config wrapper; pcconfig.config is the Config,
                which carries the segmenter path/standard settings.
    lang     -- language code of the corpus side being tokenized.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    segmenter_execpath = config.stanford_execpath
    segmenter_standard = config.stanford_standard
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # normalize the lines.
    infile = open(filename, "r")
    outfile = open(filename + ".tmp", "w")
    normalize(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ".tmp", filename)
    # if not tokenize the corpus first, Stanford Chinese Segmenter will drop the string "}。".
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"'+ filename +'"' + " > " + '"'+ filename+".tmp" +'"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    shutil.move(filename + ".tmp", filename)
    scriptpath = segmenter_execpath
    if scriptpath != None:
        scriptpath = os.path.expanduser(scriptpath)
    # NOTE(review): if segmenter_execpath is None the concatenation below
    # raises TypeError — presumably validation upstream guarantees it is
    # set for zh targets; confirm.
    scriptname = scriptpath + "/segment.sh"
    print "segmenter path : " + scriptname
    scriptparams = " " + segmenter_standard + " " + '"' + filename + '"' + " UTF-8 0" + " 2> /dev/null" + " > " + '"'+ filename+ ".cntok" +'"'
    # warp the scriptname with double quote to avoid the problem of can't executing the script because of whitespace embedded in the path.
    # #2830571
    scriptcmd = '"' + scriptname + '"' + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    shutil.copy(filename + ".cntok", filename)
    # Standford Chinese Segmenter will combine the { 1 } back to {1}, but not prefect.
    # So filter the corpus with English tokenizor and detoken again.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"'+ filename +'"' + " > " + '"'+ filename+ext +'"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    infile = open(filename + ext , "r")
    outfile = open(filename + ext + ".detok", "w")
    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
    # shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
def filter(pcconfig, lang):
    """Run the number-cleaning pass over the corpus file for *lang*.

    Writes the cleaned text to a sibling ".numclean" file via cleanNum,
    then copies it back over the original corpus file (the .numclean
    file is left on disk).
    """
    log_start("num_clean " + lang)
    ext = ".numclean"
    cfg = pcconfig.config
    corpus = cfg.getCorpusFile(cfg.src, pcconfig.target, lang)
    cleaned = corpus + ext
    src = open(corpus, "r")
    dst = open(cleaned, "w")
    cleanNum(src, dst)
    src.close()
    dst.close()
    # Replace the corpus with the cleaned version.
    shutil.copyfile(cleaned, corpus)
    log_done("num_clean " + lang)
def filter(pcconfig):
    """Drop sentence pairs whose source/target length difference is weird.

    Reads the integer threshold from the <diff> element of this filter's
    XML fragment and hands both corpus files to clean_weird_diff_align
    (which presumably rewrites them in place — confirm against its
    definition).

    pcconfig -- per-corpus config wrapper; pcconfig.config is the Config
                and pcconfig.xml_frag the filter's XML configuration.
    """
    log_start("diff_align")
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    # Threshold comes from the first <diff> element of the fragment.
    doc = parseString(pcconfig.xml_frag)
    elems = doc.getElementsByTagName("diff")
    diff_threshold = int(elems[0].firstChild.data)
    clean_weird_diff_align(src_filename, target_filename, diff_threshold)
    log_done("diff_align")
def filter(pcconfig):
    """Remove duplicate sentence pairs from the corpus files.

    The <restricted> element of this filter's XML fragment ("yes"/other)
    selects the restricted mode of cleanDup, which processes both corpus
    files together.

    pcconfig -- per-corpus config wrapper; pcconfig.config is the Config
                and pcconfig.xml_frag the filter's XML configuration.
    """
    log_start("dup_clean")
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    doc = parseString(pcconfig.xml_frag)
    elems = doc.getElementsByTagName("restricted")
    # Direct comparison replaces the redundant "True if ... else False".
    isRestricted = (elems[0].firstChild.data == "yes")
    cleanDup(src_filename, target_filename, isRestricted)
    log_done("dup_clean")
def filter(pcconfig, lang): log_start("lowercase " + lang) ext = ".low" config = pcconfig.config filename = config.getCorpusFile(config.src, pcconfig.target, lang) scriptpath = "./corpustool/third-party/scripts/" scriptname = "lowercase.perl" scriptparams = " < " + '"'+ filename +'"' + " > " + '"'+ filename+ext +'"' + " 2> /dev/null" scriptcmd = scriptpath + scriptname + scriptparams print scriptcmd os.system(scriptcmd) shutil.copyfile(filename + ext , filename) # shutil.move(filename + ext, filename) log_done("lowercase " + lang)
def filter(pcconfig): log_start("extra_long") ext = ".extra_long" config = pcconfig.config src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src) target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target) xml = pcconfig.xml_frag doc = parseString(xml) elems = doc.getElementsByTagName("source") source_threshold = int(elems[0].firstChild.data) elems = doc.getElementsByTagName("target") target_threshold = int(elems[0].firstChild.data) print source_threshold, target_threshold cleaner = ExtraLongCleaner(src_filename, target_filename, source_threshold, target_threshold) cleaner.clean() log_done("extra_long")
def filter(pcconfig, lang):
    """Unwrap placeholder tags in the corpus: <ph ...>{N}</ph> -> {N}.

    Rewrites each line, keeping only the {N} placeholder captured inside
    the <ph> element, then copies the result back over the original
    corpus file (the .tagclean file is left on disk).

    pcconfig -- per-corpus config wrapper; pcconfig.config is the Config.
    lang     -- language code of the corpus side being cleaned.
    """
    log_start("phtag_clean " + lang)
    ext = ".tagclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    infile = open(filename, "r")
    outfile = open(filename + ext, "w")
    # Raw string literal: the original non-raw pattern relied on Python
    # passing unrecognized escapes (\s, \d, \{) through unchanged, which
    # is a SyntaxError in Python >= 3.12.  The compiled regex is identical
    # (\/ and / are equivalent in a regex).
    pattern = re.compile(r'<ph(?:\s+[\w="]*)*>(\{\d+\})</ph>')
    for line in infile:
        outfile.write(pattern.sub(r'\1', line))
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext, filename)
    log_done("phtag_clean " + lang)
def filter(pcconfig, lang):
    """Strip URLs from the corpus file for *lang* in place.

    Each matched URL is deleted (replaced with the empty string); the
    lookahead keeps the following delimiter ($, < or {) intact.  The
    result is written to a sibling ".urlclean" file and copied back over
    the corpus file.
    """
    log_start("url_clean " + lang)
    ext = ".urlclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    infile = open(filename, "r")
    outfile = open(filename + ext, "w")
    # [Protocol] [Username:Password] Subdomains TopLevelDomains [Port] [Directory] [Query] [Anchor]
    # please to read the re pattern carefully to understand it.
    # reference: http://flanders.co.nz/2009/11/08/a-good-url-regular-expression-repost/
    # A blog posted by Ivan Porto Carrero.
    # The last group ($|<|{) will be used as \1 again. Cannot use the [$<{] , since the $ is not special in [].
    #urlPattern = r'((?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?)($|<|{)'
    # \1 <==> $|<|{
    # line = re.sub( urlPattern, r'\1', line)
    # Match the url when is followed by $, < , {. Mostly url should be ended with $, but is followed by < before
    # phtag_clean and by { after phtag_clean.
    urlPattern = r'((?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?)(?=($|<|{))'
    line_count = 0
    for line in infile:
        line_count += 1
        # NOTE(review): list_matched is computed but only used by the
        # commented-out debug print below.
        list_matched = re.findall(urlPattern, line)
        # TODO: log, not print
        # for x, y in list_matched:
        #     print str(line_count) + " : " + x
        line = re.sub( urlPattern, r'', line)
        outfile.write(line)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext, filename)
    log_done("url_clean " + lang)
def filter(pcconfig, lang):
    """Strip URLs from the corpus file for *lang* in place.

    Each matched URL is deleted (replaced with the empty string); the
    lookahead keeps the following delimiter ($, < or {) intact.  The
    result is written to a sibling ".urlclean" file and copied back over
    the corpus file.
    """
    log_start("url_clean " + lang)
    ext = ".urlclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    infile = open(filename, "r")
    outfile = open(filename + ext, "w")
    # [Protocol] [Username:Password] Subdomains TopLevelDomains [Port] [Directory] [Query] [Anchor]
    # please to read the re pattern carefully to understand it.
    # reference: http://flanders.co.nz/2009/11/08/a-good-url-regular-expression-repost/
    # A blog posted by Ivan Porto Carrero.
    # The last group ($|<|{) will be used as \1 again. Cannot use the [$<{] , since the $ is not special in [].
    #urlPattern = r'((?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?)($|<|{)'
    # \1 <==> $|<|{
    # line = re.sub( urlPattern, r'\1', line)
    # Match the url when is followed by $, < , {. Mostly url should be ended with $, but is followed by < before
    # phtag_clean and by { after phtag_clean.
    urlPattern = r'((?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?)(?=($|<|{))'
    line_count = 0
    for line in infile:
        line_count += 1
        # NOTE(review): list_matched is computed but only used by the
        # commented-out debug print below.
        list_matched = re.findall(urlPattern, line)
        # TODO: log, not print
        # for x, y in list_matched:
        #     print str(line_count) + " : " + x
        line = re.sub(urlPattern, r'', line)
        outfile.write(line)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext, filename)
    log_done("url_clean " + lang)
def filter(pcconfig, lang):
    """Tokenize a Chinese corpus file in place with the Stanford segmenter.

    Pipeline: normalize lines -> Moses tokenizer.perl (pre-pass) ->
    Stanford Chinese Word Segmenter -> tokenizer.perl again -> detoken,
    overwriting the original corpus file at the end.  Intermediate files
    (.cntok, .tok, .tok.detok) are left on disk.

    pcconfig -- per-corpus config wrapper; pcconfig.config is the Config,
                which carries the segmenter path/standard settings.
    lang     -- language code of the corpus side being tokenized.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    segmenter_execpath = config.stanford_execpath
    segmenter_standard = config.stanford_standard
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # normalize the lines.
    infile = open(filename, "r")
    outfile = open(filename + ".tmp", "w")
    normalize(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ".tmp", filename)
    # if not tokenize the corpus first, Stanford Chinese Segmenter will drop the string "}。".
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName( lang ) + " < " + '"' + filename + '"' + " > " + '"' + filename + ".tmp" + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    shutil.move(filename + ".tmp", filename)
    scriptpath = segmenter_execpath
    if scriptpath != None:
        scriptpath = os.path.expanduser(scriptpath)
    # NOTE(review): if segmenter_execpath is None the concatenation below
    # raises TypeError — presumably validation upstream guarantees it is
    # set for zh targets; confirm.
    scriptname = scriptpath + "/segment.sh"
    print "segmenter path : " + scriptname
    scriptparams = " " + segmenter_standard + " " + '"' + filename + '"' + " UTF-8 0" + " 2> /dev/null" + " > " + '"' + filename + ".cntok" + '"'
    # warp the scriptname with double quote to avoid the problem of can't executing the script because of whitespace embedded in the path.
    # #2830571
    scriptcmd = '"' + scriptname + '"' + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    shutil.copy(filename + ".cntok", filename)
    # Standford Chinese Segmenter will combine the { 1 } back to {1}, but not prefect.
    # So filter the corpus with English tokenizor and detoken again.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName( lang ) + " < " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    infile = open(filename + ext, "r")
    outfile = open(filename + ext + ".detok", "w")
    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
    # shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)