def __init__(self):
    """Create a file pool and a splitter, and hand the pool to the splitter."""
    pool = CorpusFilePool()
    splitter = Splitter()
    splitter.setFilepool(pool)

    self.filepool = pool
    self.splitter = splitter
    # Configuration starts out unset.
    self.config = None
class SplitTool:
    """Drive the splitting of raw files into corpus files and merge the results.

    The tool owns a ``CorpusFilePool`` and a ``Splitter`` that is handed that
    pool. A configuration object must be supplied through ``setConfig`` before
    ``generateCorpus`` is called; the methods below read ``src``, ``targets``
    and ``rawfiles`` from it and ask it for directory and file paths.
    """

    def __init__(self):
        """Create the file pool and the splitter and connect the two."""
        self.filepool = CorpusFilePool()
        self.splitter = Splitter()
        self.splitter.setFilepool(self.filepool)
        self.config = None

    def setConfig(self, config):
        """Store the configuration object used by all later operations."""
        self.config = config

    def _prepare(self):
        """Prepare the corpus directory hierarchy.

        Creates the project directory and one corpus directory per target
        language when they do not exist yet, then truncates the source and
        target corpus files. A target language whose corpus files cannot be
        opened is removed from ``self.config.targets`` so that no further
        processing is attempted for it.
        """
        log_stderr("Preparing corpus directory hierarchy ...")

        # Prepare the project directory.
        projPath = self.config.getProjectDir()
        if not os.path.exists(projPath):
            os.mkdir(projPath)
            log_stderr("Creating project directory.")

        srclang = self.config.src
        # Iterate over a snapshot: failing targets are removed from the
        # live list inside the loop.
        for targetlang in list(self.config.targets):
            log_stderr("")
            log_stderr(localePairForm(srclang, targetlang))

            corpusDirPath = self.config.getCorpusDir(srclang, targetlang)
            if not os.path.exists(corpusDirPath):
                os.mkdir(corpusDirPath)
                log_stderr("Creating corpus directory '{0}'.".format(corpusDirPath))

            log_stderr("Cleaning the corpus files ...")
            srcCorpusFile = self.config.getCorpusFile(srclang, targetlang, srclang)
            targetCorpusFile = self.config.getCorpusFile(srclang, targetlang, targetlang)
            try:
                # Opening in 'w' mode truncates (or creates) the file; the
                # context manager closes it again immediately.
                for path in (srcCorpusFile, targetCorpusFile):
                    with open(path, 'w'):
                        pass
                log_stderr("Cleaned: {0}".format(srcCorpusFile))
                log_stderr("Cleaned: {0}".format(targetCorpusFile))
            except IOError as e:
                self.config.targets.remove(targetlang)
                log_stderr(str(e))

    def fillPool(self, filename):
        """Open the per-file corpus outputs and register them in the file pool.

        For every configured target language, the source-side and target-side
        named corpus files for ``filename`` are opened for writing and passed
        to the pool with ``setMapping``.

        Raises:
            IOError: if one of the corpus files cannot be opened. A source
                file that was opened for the failing pair is closed before
                the error propagates.
        """
        fname = self._pureName(filename)
        srclang = self.config.src
        for targetlang in self.config.targets:
            srcCorpusFile = self.config.getNamedCorpusFile(srclang, targetlang, fname, srclang)
            targetCorpusFile = self.config.getNamedCorpusFile(srclang, targetlang, fname, targetlang)
            srcfile = open(srcCorpusFile, 'w')
            try:
                targetfile = open(targetCorpusFile, 'w')
            except (IOError, OSError):
                # Do not leak the handle that is not yet owned by the pool.
                srcfile.close()
                raise
            self.filepool.setMapping(srclang, targetlang, srcfile, targetfile)

    def generateCorpus(self):
        """Split every raw file and merge the successfully split ones.

        Raises:
            SplitException: if no target language is left after ``_prepare``.
        """
        log_start("Split")
        self._prepare()
        if not self.config.targets:
            raise SplitException("Prepare the directory failed.")

        filelist = []
        for afile in self.config.rawfiles:
            try:
                log_start("Split {0}".format(afile))
                self.fillPool(afile)
                try:
                    self.splitter.split(afile)
                finally:
                    # Release the pooled handles whether or not the split
                    # succeeded, so a failed file does not leak them.
                    self.filepool.closeFiles()
                    self.filepool.clean()
                filelist.append(afile)
                log_done("Split {0}".format(afile))
            except SplitException as e:
                # Exceptions only carry ``message`` if the class defines it
                # (always on Python 2); fall back to the string form.
                log_warning(getattr(e, "message", str(e)))
                # TODO: del the files when failed.
                log_fail("Split {0}".format(afile))

        if not filelist:
            log_error("No corpus file generated.")
            log_fail("Split")
        else:
            self.mergeCorpus(filelist)
            log_done("Split")

    def _pureName(self, filename):
        """Return the base name of ``filename`` without its last extension.

        A name that has no extension is returned unchanged.
        """
        basename = os.path.basename(filename)
        # splitext keeps extension-less names intact, whereas
        # rpartition('.') would yield an empty name for them.
        return os.path.splitext(basename)[0]

    def _mergeFiles(self, filelist, src, target, lang):
        """Concatenate ``filelist`` into the corpus file and keep a backup copy.

        The destination is the corpus file the configuration reports for
        ``(src, target, lang)``. After the merge the result is also copied to
        the same path with ``.orig`` appended.
        """
        corpus = self.config.getCorpusFile(src, target, lang)
        with open(corpus, "w") as merged:
            for afile in filelist:
                with open(afile, "r") as part:
                    for line in part:
                        merged.write(line)
        # The copy is taken only after the merged file has been closed.
        shutil.copyfile(corpus, corpus + ".orig")

    def mergeCorpus(self, filelist):
        """Merge the per-file corpus outputs of each language pair, then delete them."""
        srclang = self.config.src
        names = [self._pureName(filename) for filename in filelist]
        for targetlang in self.config.targets:
            slist = [self.config.getNamedCorpusFile(srclang, targetlang, name, srclang)
                     for name in names]
            tlist = [self.config.getNamedCorpusFile(srclang, targetlang, name, targetlang)
                     for name in names]
            self._mergeFiles(slist, srclang, targetlang, srclang)
            self._mergeFiles(tlist, srclang, targetlang, targetlang)
            # The intermediate per-file outputs are no longer needed.
            for path in slist + tlist:
                os.remove(path)