Exemple #1
0
def filter(pcconfig, lang):
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # normalize the lines.
    infile = open(filename, "r")
    outfile = open(filename + ".tmp", "w")

    normalize(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ".tmp", filename)

    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(
        lang
    ) + " < " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)

    infile = open(filename + ext, "r")
    outfile = open(filename + ext + ".detok", "w")

    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
    #    shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
Exemple #2
0
    def __init__(self, path):
        """Construct the config object by parsing and validating the configuration file."""
        # Configuration data members; all stay None until _readConfig() fills them in.
        self.project = None             # project name (string)
        self.exportdir = None           # export directory (path)
        self.username = None            # user name (string)
        self.userpath = None            # user directory (path)
        self.rawfiles = None            # raw file list (list of paths)
        self.src = None                 # source language (string)
        self.targets = None             # target languages (list of strings)
        self.stanford_execpath = None   # Stanford Chinese Word Segmenter path
        self.stanford_standard = None   # Stanford Chinese Word Segmenter standard

        try:
            log_start("Config")
            log_stderr("Config file: '{0}'".format(path))
            self._readConfig(path)
            self._validateConfig()
            log_done("Config")
        except ConfigException as e:
            # Record the failure, then re-raise so the caller can abort.
            log_error(e.message)
            log_fail("Config")
            raise
Exemple #3
0
    def generateCorpus(self):
        """Split every raw file into corpus files, then merge the successful ones."""
        log_start("Split")
        self._prepare()
        if not self.config.targets:
            raise SplitException("Prepare the directory failed.")

        processed = []
        for raw in self.config.rawfiles:
            try:
                log_start("Split {0}".format(raw))
                self.fillPool(raw)
                self.splitter.split(raw)
                self.filepool.closeFiles()
                self.filepool.clean()
                processed.append(raw)
                log_done("Split {0}".format(raw))
            except SplitException as e:
                log_warning(e.message)
                # TODO: del the files when failed.
                log_fail("Split {0}".format(raw))

        if not processed:
            log_error("No corpus file generated.")
            log_fail("Split")
        else:
            self.mergeCorpus(processed)
            log_done("Split")
Exemple #4
0
def filter(pcconfig, lang):
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    oldcwd = os.getcwd()
    scriptpath = "./corpustool/engine/ja-JP/"
    os.chdir(scriptpath)
    scriptname = "./chasen"
    scriptparams = " -i w " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
    scriptcmd = scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    os.chdir(oldcwd)

    infile = open(filename + ext, "r")
    outfile = open(filename + ext + ".detok", "w")

    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
    #    shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
Exemple #5
0
def filter(pcconfig, lang):
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # normalize the lines.
    infile = open(filename, "r")
    outfile = open(filename + ".tmp", "w")

    normalize(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ".tmp", filename)

    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"'+ filename +'"' + " > " + '"'+ filename+ext +'"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)

    infile = open(filename + ext , "r")
    outfile = open(filename + ext + ".detok", "w")

    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
#    shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
Exemple #6
0
def filter(pcconfig, lang):
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    oldcwd = os.getcwd()
    scriptpath = "./corpustool/engine/ja-JP/"
    os.chdir(scriptpath)
    scriptname = "./chasen"
    scriptparams = " -i w " + '"'+ filename +'"'  + " > " + '"'+ filename+ext +'"' + " 2> /dev/null"
    scriptcmd =  scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    os.chdir(oldcwd)

    infile = open(filename + ext , "r")
    outfile = open(filename + ext + ".detok", "w")

    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
#    shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
Exemple #7
0
def filter(pcconfig, lang):
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    segmenter_execpath = config.stanford_execpath
    segmenter_standard = config.stanford_standard

    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # normalize the lines.
    infile = open(filename, "r")
    outfile = open(filename + ".tmp", "w")

    normalize(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ".tmp", filename)
    
    # if not tokenize the corpus first, Stanford Chinese Segmenter will drop the string "}。".
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"'+ filename +'"' + " > " + '"'+ filename+".tmp" +'"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    shutil.move(filename + ".tmp", filename)

    scriptpath = segmenter_execpath
    if scriptpath != None:
        scriptpath = os.path.expanduser(scriptpath)
        scriptname = scriptpath + "/segment.sh"
        print "segmenter path : "  + scriptname

        scriptparams = " " + segmenter_standard + " " + '"' + filename + '"' + " UTF-8 0" + " 2> /dev/null" + " > " + '"'+ filename+ ".cntok" +'"' 
        # warp the scriptname with double quote to avoid the problem of can't executing the script because of whitespace embedded in the path.
        # #2830571
        scriptcmd =  '"' + scriptname + '"' + scriptparams
        print scriptcmd
        os.system(scriptcmd)
        shutil.copy(filename + ".cntok", filename)

    # Standford Chinese Segmenter will combine the { 1 } back to {1}, but not prefect.
    # So filter the corpus with English tokenizor and detoken again.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(lang) + " < " + '"'+ filename +'"' + " > " + '"'+ filename+ext +'"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)

    infile = open(filename + ext , "r")
    outfile = open(filename + ext + ".detok", "w")

    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
#    shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)
Exemple #8
0
def filter(pcconfig, lang):
    """Clean numbers in the corpus file for *lang*, rewriting it in place."""
    log_start("num_clean " + lang)
    ext = ".numclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    # 'with' guarantees the handles are closed even if cleanNum() raises
    # (the original leaked them on error).
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            cleanNum(infile, outfile)
    shutil.copyfile(filename + ext, filename)
    log_done("num_clean " + lang)
Exemple #9
0
def filter(pcconfig):
    """Drop sentence pairs whose source/target length difference exceeds
    the <diff> threshold taken from the filter's XML fragment."""
    log_start("diff_align")
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    # Read the threshold from the <diff> element of this filter's config.
    doc = parseString(pcconfig.xml_frag)
    elems = doc.getElementsByTagName("diff")
    diff_threshold = int(elems[0].firstChild.data)
    clean_weird_diff_align(src_filename, target_filename, diff_threshold)
    log_done("diff_align")
Exemple #10
0
def filter(pcconfig):
    """Remove duplicate sentence pairs from the corpus.

    The <restricted> element of the filter's XML fragment ("yes"/other)
    selects the restricted or relaxed mode of cleanDup().
    """
    log_start("dup_clean")
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    doc = parseString(pcconfig.xml_frag)
    elems = doc.getElementsByTagName("restricted")
    # A plain comparison is clearer than 'True if ... else False'.
    isRestricted = (elems[0].firstChild.data == "yes")
    cleanDup(src_filename, target_filename, isRestricted)
    log_done("dup_clean")
Exemple #11
0
def filter(pcconfig, lang):
    log_start("lowercase " + lang)
    ext = ".low"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "lowercase.perl"
    scriptparams = " < " + '"'+ filename +'"' + " > " + '"'+ filename+ext +'"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)

    shutil.copyfile(filename + ext , filename)
#    shutil.move(filename + ext, filename)
    log_done("lowercase " + lang)
def filter(pcconfig):
    log_start("extra_long")
    ext = ".extra_long"
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    xml = pcconfig.xml_frag
    doc = parseString(xml)
    elems = doc.getElementsByTagName("source")
    source_threshold = int(elems[0].firstChild.data)
    elems = doc.getElementsByTagName("target")
    target_threshold = int(elems[0].firstChild.data)
    print source_threshold, target_threshold

    cleaner = ExtraLongCleaner(src_filename, target_filename, source_threshold, target_threshold)
    cleaner.clean()
    log_done("extra_long")
Exemple #13
0
def filter(pcconfig):
    """Remove duplicate sentence pairs from the corpus.

    The <restricted> element of the filter's XML fragment ("yes"/other)
    selects the restricted or relaxed mode of cleanDup().
    """
    log_start("dup_clean")
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    doc = parseString(pcconfig.xml_frag)
    elems = doc.getElementsByTagName("restricted")
    # A plain comparison is clearer than 'True if ... else False'.
    isRestricted = (elems[0].firstChild.data == "yes")
    cleanDup(src_filename, target_filename, isRestricted)
    log_done("dup_clean")
Exemple #14
0
def filter(pcconfig):
    log_start("extra_long")
    ext = ".extra_long"
    config = pcconfig.config
    src_filename = config.getCorpusFile(config.src, pcconfig.target, config.src)
    target_filename = config.getCorpusFile(config.src, pcconfig.target, pcconfig.target)
    xml = pcconfig.xml_frag
    doc = parseString(xml)
    elems = doc.getElementsByTagName("source")
    source_threshold = int(elems[0].firstChild.data)
    elems = doc.getElementsByTagName("target")
    target_threshold = int(elems[0].firstChild.data)
    print source_threshold, target_threshold

    cleaner = ExtraLongCleaner(src_filename, target_filename, source_threshold, target_threshold)
    cleaner.clean()
    log_done("extra_long")
Exemple #15
0
def filter(pcconfig, lang):
    """Replace <ph ...>{N}</ph> placeholder tags with the bare {N} token,
    rewriting the corpus file for *lang* in place."""
    log_start("phtag_clean " + lang)
    ext = ".tagclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # Raw string: the original non-raw pattern relied on Python passing
    # unknown escapes (\s, \d, \{, ...) through, which is deprecated.
    pattern = re.compile(r"<ph(?:\s+[\w=\"]*)*>(\{\d+\})<\/ph>")

    # 'with' guarantees the handles are closed even if a write raises
    # (the original leaked them on error).
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            for line in infile:
                outfile.write(pattern.sub(r'\1', line))

    shutil.copyfile(filename + ext, filename)
    log_done("phtag_clean " + lang)
Exemple #16
0
def filter(pcconfig, lang):
    """Replace <ph ...>{N}</ph> placeholder tags with the bare {N} token,
    rewriting the corpus file for *lang* in place."""
    log_start("phtag_clean " + lang)
    ext = ".tagclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # Raw string: the original non-raw pattern relied on Python passing
    # unknown escapes (\s, \d, \{, ...) through, which is deprecated.
    pattern = re.compile(r"<ph(?:\s+[\w=\"]*)*>(\{\d+\})<\/ph>")

    # 'with' guarantees the handles are closed even if a write raises
    # (the original leaked them on error).
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            for line in infile:
                outfile.write(pattern.sub(r'\1', line))

    shutil.copyfile(filename + ext, filename)
    log_done("phtag_clean " + lang)
Exemple #17
0
def filter(pcconfig, lang):
    """Strip URLs from the corpus file for *lang*, rewriting it in place."""
    log_start("url_clean " + lang)
    ext = ".urlclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # URL shape: [Protocol] [Username:Password] Subdomains TopLevelDomains
    # [Port] [Directory] [Query] [Anchor].
    # reference: http://flanders.co.nz/2009/11/08/a-good-url-regular-expression-repost/
    # (a blog post by Ivan Porto Carrero).
    #
    # The trailing lookahead (?=($|<|{)) matches a URL only when followed by
    # end-of-line, '<' (before phtag_clean) or '{' (after phtag_clean); a
    # class [$<{] cannot be used because '$' is not special inside [].
    # Compiled once here instead of re-parsed for every line.
    urlPattern = re.compile(r'((?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?)(?=($|<|{))')

    # 'with' guarantees the handles are closed even if a write raises
    # (the original leaked them on error).  The original also ran an
    # re.findall per line whose result was never used; that dead work is
    # removed -- restore it if/when the matched URLs are logged.
    # TODO: log removed URLs instead of dropping them silently.
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            for line in infile:
                outfile.write(urlPattern.sub('', line))

    shutil.copyfile(filename + ext, filename)
    log_done("url_clean " + lang)
Exemple #18
0
def filter(pcconfig, lang):
    """Strip URLs from the corpus file for *lang*, rewriting it in place."""
    log_start("url_clean " + lang)
    ext = ".urlclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # URL shape: [Protocol] [Username:Password] Subdomains TopLevelDomains
    # [Port] [Directory] [Query] [Anchor].
    # reference: http://flanders.co.nz/2009/11/08/a-good-url-regular-expression-repost/
    # (a blog post by Ivan Porto Carrero).
    #
    # The trailing lookahead (?=($|<|{)) matches a URL only when followed by
    # end-of-line, '<' (before phtag_clean) or '{' (after phtag_clean); a
    # class [$<{] cannot be used because '$' is not special inside [].
    # Compiled once here instead of re-parsed for every line.
    urlPattern = re.compile(r'((?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?)(?=($|<|{))')

    # 'with' guarantees the handles are closed even if a write raises
    # (the original leaked them on error).  The original also ran an
    # re.findall per line whose result was never used; that dead work is
    # removed -- restore it if/when the matched URLs are logged.
    # TODO: log removed URLs instead of dropping them silently.
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            for line in infile:
                outfile.write(urlPattern.sub('', line))

    shutil.copyfile(filename + ext, filename)
    log_done("url_clean " + lang)
Exemple #19
0
def filter(pcconfig, lang):
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    segmenter_execpath = config.stanford_execpath
    segmenter_standard = config.stanford_standard

    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # normalize the lines.
    infile = open(filename, "r")
    outfile = open(filename + ".tmp", "w")

    normalize(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ".tmp", filename)

    # if not tokenize the corpus first, Stanford Chinese Segmenter will drop the string "}。".
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(
        lang
    ) + " < " + '"' + filename + '"' + " > " + '"' + filename + ".tmp" + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)
    shutil.move(filename + ".tmp", filename)

    scriptpath = segmenter_execpath
    if scriptpath != None:
        scriptpath = os.path.expanduser(scriptpath)
        scriptname = scriptpath + "/segment.sh"
        print "segmenter path : " + scriptname

        scriptparams = " " + segmenter_standard + " " + '"' + filename + '"' + " UTF-8 0" + " 2> /dev/null" + " > " + '"' + filename + ".cntok" + '"'
        # warp the scriptname with double quote to avoid the problem of can't executing the script because of whitespace embedded in the path.
        # #2830571
        scriptcmd = '"' + scriptname + '"' + scriptparams
        print scriptcmd
        os.system(scriptcmd)
        shutil.copy(filename + ".cntok", filename)

    # Standford Chinese Segmenter will combine the { 1 } back to {1}, but not prefect.
    # So filter the corpus with English tokenizor and detoken again.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = " -l " + langName(
        lang
    ) + " < " + '"' + filename + '"' + " > " + '"' + filename + ext + '"' + " 2> /dev/null"
    scriptcmd = scriptpath + scriptname + scriptparams
    print scriptcmd
    os.system(scriptcmd)

    infile = open(filename + ext, "r")
    outfile = open(filename + ext + ".detok", "w")

    detoken(infile, outfile)
    infile.close()
    outfile.close()
    shutil.copyfile(filename + ext + ".detok", filename)
    #    shutil.move(filename + ext, filename)
    log_done("tokenize " + lang)