def __init__(self, path): """Construct the config object by parsing and validating the configuration file.""" # the config data member self.project = None # project name :string self.exportdir = None # export directory :path self.username = None # user name :string self.userpath = None # user directory :path self.rawfiles = None # raw file list :list of path self.src = None # source lang :string self.targets = None # target langs :list of string self.stanford_execpath = None # Stanford Chinese Word Segmenter path :path self.stanford_standard = None # Stanford Chinese Word Segmenter Standard : string try: log_start("Config") log_stderr("Config file: '{0}'".format(path)) self._readConfig(path) self._validateConfig() log_done("Config") except ConfigException as e: log_error(e.message) log_fail("Config") raise
def generateCorpus(self):
    """Split every raw file into per-language corpus files, then merge the results.

    Files that fail to split are logged and skipped; the merge step runs
    only when at least one file was split successfully.
    """
    log_start("Split")
    self._prepare()
    if len(self.config.targets) == 0:
        raise SplitException("Prepare the directory failed.")
    done_files = []
    for rawfile in self.config.rawfiles:
        tag = "Split {0}".format(rawfile)
        try:
            log_start(tag)
            self.fillPool(rawfile)
            self.splitter.split(rawfile)
            self.filepool.closeFiles()
            self.filepool.clean()
            done_files.append(rawfile)
            log_done(tag)
        except SplitException as e:
            # TODO: del the files when failed.
            log_warning(e.message)
            log_fail(tag)
    if not done_files:
        log_error("No corpus file generated.")
        log_fail("Split")
    else:
        self.mergeCorpus(done_files)
        log_done("Split")
def filter(pcconfig, lang):
    """Tokenize the corpus file for *lang* with the third-party tokenizer.perl.

    Pipeline: normalize lines -> shell out to tokenizer.perl -> detoken the
    tokenized output back over the original file.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # Normalize the lines; `with` guarantees the handles close even if
    # normalize() raises (the originals leaked them on error).
    with open(filename, "r") as infile:
        with open(filename + ".tmp", "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(filename + ".tmp", filename)
    # NOTE(review): command is built by string concatenation; a filename
    # containing '"' would break the shell quoting — confirm inputs are trusted.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)
    # Detoken the tokenizer output, then replace the corpus file with it.
    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)
def filter(pcconfig, lang):
    """Tokenize the Japanese corpus file with the bundled ChaSen segmenter.

    ChaSen must run from its own directory, so the cwd is switched and —
    unlike the original — always restored via try/finally.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    oldcwd = os.getcwd()
    scriptpath = "./corpustool/engine/ja-JP/"
    os.chdir(scriptpath)
    try:
        scriptname = "./chasen"
        scriptparams = " -i w " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
        scriptcmd = scriptname + scriptparams
        print(scriptcmd)
        os.system(scriptcmd)
    finally:
        # Restore the working directory even if the command build fails.
        os.chdir(oldcwd)
    # Detoken the ChaSen output, then replace the corpus file with it.
    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)
def filter(pcconfig, lang):
    """Tokenize the corpus file for *lang* with the third-party tokenizer.perl.

    Pipeline: normalize lines -> shell out to tokenizer.perl -> detoken the
    tokenized output back over the original file.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # Normalize the lines; `with` guarantees the handles close even if
    # normalize() raises (the originals leaked them on error).
    with open(filename, "r") as infile:
        with open(filename + ".tmp", "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(filename + ".tmp", filename)
    # NOTE(review): command is built by string concatenation; a filename
    # containing '"' would break the shell quoting — confirm inputs are trusted.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)
    # Detoken the tokenizer output, then replace the corpus file with it.
    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)
def filter(pcconfig, lang):
    """Tokenize the Japanese corpus file with the bundled ChaSen segmenter.

    ChaSen must run from its own directory, so the cwd is switched and —
    unlike the original — always restored via try/finally.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    oldcwd = os.getcwd()
    scriptpath = "./corpustool/engine/ja-JP/"
    os.chdir(scriptpath)
    try:
        scriptname = "./chasen"
        scriptparams = " -i w " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
        scriptcmd = scriptname + scriptparams
        print(scriptcmd)
        os.system(scriptcmd)
    finally:
        # Restore the working directory even if the command build fails.
        os.chdir(oldcwd)
    # Detoken the ChaSen output, then replace the corpus file with it.
    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)
def filter(pcconfig, lang):
    """Tokenize the Chinese corpus file with the Stanford Chinese Word Segmenter.

    Pipeline: normalize -> pre-tokenize with tokenizer.perl -> Stanford
    segmenter (only when an exec path is configured) -> tokenize/detoken
    pass to clean up placeholder tokens.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    segmenter_execpath = config.stanford_execpath
    segmenter_standard = config.stanford_standard
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # Normalize the lines; `with` closes the handles even on error.
    with open(filename, "r") as infile:
        with open(filename + ".tmp", "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(filename + ".tmp", filename)
    # If not tokenized first, the Stanford Chinese Segmenter drops strings like "}。".
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"' + filename + '"' + " > " + '"' + filename + ".tmp" + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)
    shutil.move(filename + ".tmp", filename)
    scriptpath = segmenter_execpath
    # Run the segmenter only when a path is configured; unguarded use of a
    # None path would raise TypeError on `scriptpath + "/segment.sh"`.
    if scriptpath is not None:
        scriptpath = os.path.expanduser(scriptpath)
        scriptname = scriptpath + "/segment.sh"
        print("segmenter path : " + scriptname)
        scriptparams = " " + segmenter_standard + " " + '"' + filename + '"' + " UTF-8 0" + " 2> /dev/null" + " > " + '"' + filename + ".cntok" + '"'
        # Wrap the script name in double quotes so whitespace embedded in the
        # path can't break execution. (#2830571)
        scriptcmd = '"' + scriptname + '"' + scriptparams
        print(scriptcmd)
        os.system(scriptcmd)
        shutil.copy(filename + ".cntok", filename)
    # The Stanford segmenter rejoins "{ 1 }" into "{1}" imperfectly, so run
    # the English tokenizer and detoken pass again.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)
    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)
def filter(pcconfig, lang):
    """Clean number tokens from the corpus file for *lang* via cleanNum().

    The cleaned output is written next to the corpus file with a
    ".numclean" suffix and then copied back over it.
    """
    log_start("num_clean " + lang)
    ext = ".numclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # `with` guarantees both handles close even if cleanNum() raises.
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            cleanNum(infile, outfile)
    shutil.copyfile(filename + ext, filename)
    log_done("num_clean " + lang)
def filter(pcconfig):
    """Drop sentence pairs whose source/target length ratio differs too much.

    Reads the <diff> threshold from this filter's XML fragment and delegates
    to clean_weird_diff_align() over the aligned source/target corpus files.
    """
    log_start("diff_align")
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    # Threshold comes from the <diff> element of the filter's config fragment.
    doc = parseString(pcconfig.xml_frag)
    elems = doc.getElementsByTagName("diff")
    diff_threshold = int(elems[0].firstChild.data)
    clean_weird_diff_align(src_filename, target_filename, diff_threshold)
    log_done("diff_align")
def filter(pcconfig):
    """Remove duplicate sentence pairs from the aligned corpus files.

    The <restricted> flag in this filter's XML fragment ("yes"/other)
    selects the restricted dedup mode of cleanDup().
    """
    log_start("dup_clean")
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    doc = parseString(pcconfig.xml_frag)
    elems = doc.getElementsByTagName("restricted")
    # Direct comparison instead of `True if ... else False`.
    isRestricted = elems[0].firstChild.data == "yes"
    cleanDup(src_filename, target_filename, isRestricted)
    log_done("dup_clean")
def filter(pcconfig, lang):
    """Lowercase the corpus file for *lang* via the third-party lowercase.perl."""
    log_start("lowercase " + lang)
    ext = ".low"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # Shell form: <scripts>/lowercase.perl < "corpus" > "corpus.low" 2> /dev/null
    script = "./corpustool/third-party/scripts/" + "lowercase.perl"
    redirection = " < " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
    command = script + redirection
    print(command)
    os.system(command)
    # Overwrite the corpus file with its lowercased version.
    shutil.copyfile(filename + ext, filename)
    log_done("lowercase " + lang)
def main():
    """The main function of convert module.

    Parse the cmdline, and create the config from the xml file which
    describes the configuration for conversion. Then run the conversion to
    create and filter the corpus files according to config.
    """
    progname = sys.argv[0]
    usage = """%prog -f command.xml"""
    parser = OptionParser(usage, version="%prog v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")
    parser.add_option("-f", "--file", dest="filename", metavar="FILE", type="string",
                      help="read the command from file.")
    (options, args) = parser.parse_args()
    log_stderr("convert.py v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")
    # `is None` rather than `== None` for the missing-option check.
    if options.filename is None:
        log_stderr("Usage: {0} -f command.xml".format(progname))
        log_stderr(os.strerror(errno.EINVAL) + " : config file not specified.")
        sys.exit(errno.EINVAL)
    path = os.path.abspath(options.filename)
    if not os.path.isfile(path):
        log_error(os.strerror(errno.EINVAL) + " : file '{0}' not existed.".format(path))
        log_fail("Convert")
        sys.exit(errno.EINVAL)
    try:
        config = ConversionConfig(path)
        conversion = Conversion(config)
        conversion.run()
    except ConfigException:
        # Details were already logged where the exception was raised.
        log_fail("Convert: ConfigException")
        sys.exit(-1)
    except Exception as e:
        print("failed.")
        log_fail(e.message)
        log_fail("Convert: unknown exception.")
        sys.exit(-1)
    log_done("Convert")
    sys.exit(0)
def filter(pcconfig):
    """Drop sentence pairs whose source or target side is extra long.

    Reads per-side length thresholds from the <source> and <target>
    elements of this filter's XML fragment and runs ExtraLongCleaner over
    the aligned corpus files.
    """
    log_start("extra_long")
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    doc = parseString(pcconfig.xml_frag)
    elems = doc.getElementsByTagName("source")
    source_threshold = int(elems[0].firstChild.data)
    elems = doc.getElementsByTagName("target")
    target_threshold = int(elems[0].firstChild.data)
    # TODO: route this debug output through the log_* helpers.
    print("{0} {1}".format(source_threshold, target_threshold))
    cleaner = ExtraLongCleaner(src_filename, target_filename, source_threshold, target_threshold)
    cleaner.clean()
    log_done("extra_long")
def filter(pcconfig):
    """Remove duplicate sentence pairs from the aligned corpus files.

    The <restricted> flag in this filter's XML fragment ("yes"/other)
    selects the restricted dedup mode of cleanDup().
    """
    log_start("dup_clean")
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    doc = parseString(pcconfig.xml_frag)
    elems = doc.getElementsByTagName("restricted")
    # Direct comparison instead of `True if ... else False`.
    isRestricted = elems[0].firstChild.data == "yes"
    cleanDup(src_filename, target_filename, isRestricted)
    log_done("dup_clean")
def filter(pcconfig):
    """Drop sentence pairs whose source or target side is extra long.

    Reads per-side length thresholds from the <source> and <target>
    elements of this filter's XML fragment and runs ExtraLongCleaner over
    the aligned corpus files.
    """
    log_start("extra_long")
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    doc = parseString(pcconfig.xml_frag)
    elems = doc.getElementsByTagName("source")
    source_threshold = int(elems[0].firstChild.data)
    elems = doc.getElementsByTagName("target")
    target_threshold = int(elems[0].firstChild.data)
    # TODO: route this debug output through the log_* helpers.
    print("{0} {1}".format(source_threshold, target_threshold))
    cleaner = ExtraLongCleaner(src_filename, target_filename, source_threshold, target_threshold)
    cleaner.clean()
    log_done("extra_long")
def filter(pcconfig, lang):
    """Strip <ph ...>{N}</ph> wrappers from the corpus file, keeping only {N}.

    Each line is rewritten with the placeholder tag removed; the cleaned
    file then replaces the original.
    """
    log_start("phtag_clean " + lang)
    ext = ".tagclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # Raw string so \s, \w, \d reach the regex engine verbatim (the
    # original relied on Python passing unknown escapes through).
    pattern = re.compile(r"<ph(?:\s+[\w=\"]*)*>(\{\d+\})</ph>")
    # `with` guarantees both handles close even if a write raises.
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            for line in infile:
                outfile.write(pattern.sub(r'\1', line))
    shutil.copyfile(filename + ext, filename)
    log_done("phtag_clean " + lang)
def filter(pcconfig, lang):
    """Strip <ph ...>{N}</ph> wrappers from the corpus file, keeping only {N}.

    Each line is rewritten with the placeholder tag removed; the cleaned
    file then replaces the original.
    """
    log_start("phtag_clean " + lang)
    ext = ".tagclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # Raw string so \s, \w, \d reach the regex engine verbatim (the
    # original relied on Python passing unknown escapes through).
    pattern = re.compile(r"<ph(?:\s+[\w=\"]*)*>(\{\d+\})</ph>")
    # `with` guarantees both handles close even if a write raises.
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            for line in infile:
                outfile.write(pattern.sub(r'\1', line))
    shutil.copyfile(filename + ext, filename)
    log_done("phtag_clean " + lang)
def main():
    """The main function of convert module.

    Parse the cmdline, and create the config from the xml file which
    describes the configuration for conversion. Then run the conversion to
    create and filter the corpus files according to config.
    """
    progname = sys.argv[0]
    usage = """%prog -f command.xml"""
    parser = OptionParser(usage, version="%prog v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")
    parser.add_option("-f", "--file", dest="filename", metavar="FILE", type="string",
                      help="read the command from file.")
    (options, args) = parser.parse_args()
    log_stderr("convert.py v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")
    # `is None` rather than `== None` for the missing-option check.
    if options.filename is None:
        log_stderr("Usage: {0} -f command.xml".format(progname))
        log_stderr(os.strerror(errno.EINVAL) + " : config file not specified.")
        sys.exit(errno.EINVAL)
    path = os.path.abspath(options.filename)
    if not os.path.isfile(path):
        log_error(os.strerror(errno.EINVAL) + " : file '{0}' not existed.".format(path))
        log_fail("Convert")
        sys.exit(errno.EINVAL)
    try:
        config = ConversionConfig(path)
        conversion = Conversion(config)
        conversion.run()
    except ConfigException:
        # Details were already logged where the exception was raised.
        log_fail("Convert: ConfigException")
        sys.exit(-1)
    except Exception as e:
        print("failed.")
        log_fail(e.message)
        log_fail("Convert: unknown exception.")
        sys.exit(-1)
    log_done("Convert")
    sys.exit(0)
def filter(pcconfig, lang):
    """Remove URLs from the corpus file for *lang*.

    A URL is matched only when followed by end-of-line, '<' or '{':
    mostly URLs end a line, but are followed by '<' before phtag_clean and
    by '{' after it. The lookahead keeps that trailing character in place.
    """
    log_start("url_clean " + lang)
    ext = ".urlclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # [Protocol] [Username:Password] Subdomains TopLevelDomains [Port] [Directory] [Query] [Anchor]
    # Please read the re pattern carefully to understand it.
    # Reference: http://flanders.co.nz/2009/11/08/a-good-url-regular-expression-repost/
    # (a blog posted by Ivan Porto Carrero).
    # Compiled once and hoisted out of the loop; the original also ran a
    # re.findall per line whose result was discarded (dead work, kept only
    # for a commented-out debug print) — removed.
    urlPattern = re.compile(r'((?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?)(?=($|<|{))')
    # `with` guarantees the handles close even if a write raises.
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            for line in infile:
                outfile.write(urlPattern.sub(r'', line))
    shutil.copyfile(filename + ext, filename)
    log_done("url_clean " + lang)
def filter(pcconfig, lang):
    """Remove URLs from the corpus file for *lang*.

    A URL is matched only when followed by end-of-line, '<' or '{':
    mostly URLs end a line, but are followed by '<' before phtag_clean and
    by '{' after it. The lookahead keeps that trailing character in place.
    """
    log_start("url_clean " + lang)
    ext = ".urlclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # [Protocol] [Username:Password] Subdomains TopLevelDomains [Port] [Directory] [Query] [Anchor]
    # Please read the re pattern carefully to understand it.
    # Reference: http://flanders.co.nz/2009/11/08/a-good-url-regular-expression-repost/
    # (a blog posted by Ivan Porto Carrero).
    # Compiled once and hoisted out of the loop; the original also ran a
    # re.findall per line whose result was discarded (dead work, kept only
    # for a commented-out debug print) — removed.
    urlPattern = re.compile(r'((?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?)(?=($|<|{))')
    # `with` guarantees the handles close even if a write raises.
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            for line in infile:
                outfile.write(urlPattern.sub(r'', line))
    shutil.copyfile(filename + ext, filename)
    log_done("url_clean " + lang)
def filter(pcconfig, lang):
    """Tokenize the Chinese corpus file with the Stanford Chinese Word Segmenter.

    Pipeline: normalize -> pre-tokenize with tokenizer.perl -> Stanford
    segmenter (only when an exec path is configured) -> tokenize/detoken
    pass to clean up placeholder tokens.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    segmenter_execpath = config.stanford_execpath
    segmenter_standard = config.stanford_standard
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # Normalize the lines; `with` closes the handles even on error.
    with open(filename, "r") as infile:
        with open(filename + ".tmp", "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(filename + ".tmp", filename)
    # If not tokenized first, the Stanford Chinese Segmenter drops strings like "}。".
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"' + filename + '"' + " > " + '"' + filename + ".tmp" + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)
    shutil.move(filename + ".tmp", filename)
    scriptpath = segmenter_execpath
    # Run the segmenter only when a path is configured; unguarded use of a
    # None path would raise TypeError on `scriptpath + "/segment.sh"`.
    if scriptpath is not None:
        scriptpath = os.path.expanduser(scriptpath)
        scriptname = scriptpath + "/segment.sh"
        print("segmenter path : " + scriptname)
        scriptparams = " " + segmenter_standard + " " + '"' + filename + '"' + " UTF-8 0" + " 2> /dev/null" + " > " + '"' + filename + ".cntok" + '"'
        # Wrap the script name in double quotes so whitespace embedded in the
        # path can't break execution. (#2830571)
        scriptcmd = '"' + scriptname + '"' + scriptparams
        print(scriptcmd)
        os.system(scriptcmd)
        shutil.copy(filename + ".cntok", filename)
    # The Stanford segmenter rejoins "{ 1 }" into "{1}" imperfectly, so run
    # the English tokenizer and detoken pass again.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)
    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)