def train(self, sourcefile, modelfile, **parameters):
    """Train an indexed pattern model (language model) from a plain-text corpus.

    Builds (or reuses) a colibri-core class file and encoded corpus stored
    next to ``sourcefile``, then trains an IndexedPatternModel covering
    n-grams up to ``self.maxcontext`` and writes it to ``modelfile``.
    A ``modelfile + '.cls'`` symlink is created so the model can later be
    loaded together with its class file under the model's own name.
    """
    self.log("Preparing to generate Language Model")
    classfile = stripsourceextensions(sourcefile) + ".cls"
    corpusfile = stripsourceextensions(sourcefile) + ".dat"
    if not os.path.exists(classfile):
        self.log("Building class file")
        classencoder = colibricore.ClassEncoder()
        classencoder.build(sourcefile)
        classencoder.save(classfile)
    else:
        classencoder = colibricore.ClassEncoder(classfile)
    # Make symlink to class file, using model name instead of source name.
    # (The original repeated this identical check/symlink a second time after
    # corpus encoding; the duplicate was dead code and has been removed.)
    if not os.path.exists(modelfile + '.cls'):
        os.symlink(classfile, modelfile + '.cls')
    if not os.path.exists(corpusfile):
        self.log("Encoding corpus")
        classencoder.encodefile(sourcefile, corpusfile)
    self.log("Generating pattern model")
    options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=self.maxcontext)
    model = colibricore.IndexedPatternModel()
    model.train(corpusfile, options)
    self.log("Saving model " + modelfile)
    model.write(modelfile)
def train(self, sourcefile, modelfile, **parameters):
    """Train a unigram lexicon (frequency list) from a plain-text corpus.

    Builds (or reuses) a colibri-core class file — constrained to tokens of
    settings['minlength']..settings['maxlength'] characters — and an encoded
    corpus, trains an unindexed unigram pattern model, then delegates
    persistence to self.savemodel() so subclasses can override it.
    """
    self.log("Preparing to generate lexicon")
    classfile = stripsourceextensions(sourcefile) + ".cls"
    corpusfile = stripsourceextensions(sourcefile) + ".dat"
    if not os.path.exists(classfile):
        self.log("Building class file")
        # "" = build fresh (no existing class file); min/max are character
        # length constraints on the tokens admitted to the lexicon
        classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength'])
        classencoder.build(sourcefile)
        classencoder.save(classfile)
    else:
        classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])
    # Make symlink to class file, using model name instead of source name.
    # (The original repeated this identical check a second time after corpus
    # encoding; that duplicate was dead code and has been removed.)
    if not os.path.exists(modelfile + '.cls'):
        os.symlink(classfile, modelfile + '.cls')
    if not os.path.exists(corpusfile):
        self.log("Encoding corpus")
        classencoder.encodefile(sourcefile, corpusfile)
    self.log("Generating frequency list")
    options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)  #unigrams only
    model = colibricore.UnindexedPatternModel()
    model.train(corpusfile, options)
    self.savemodel(model, modelfile, classfile)  #in separate function so it can be overloaded
def train(self, sourcefile, modelfile, **parameters):
    """Build an unindexed frequency model of unigrams and bigrams.

    Reuses a previously built class file / encoded corpus beside the source
    file when present, links the class file under the model's name, and
    writes the trained model to ``modelfile``.
    """
    self.log("Preparing to generate bigram model")
    base = stripsourceextensions(sourcefile)
    classfile = base + ".cls"
    corpusfile = base + ".dat"
    if os.path.exists(classfile):
        classencoder = colibricore.ClassEncoder(classfile)
    else:
        self.log("Building class file")
        classencoder = colibricore.ClassEncoder()
        classencoder.build(sourcefile)
        classencoder.save(classfile)
    if not os.path.exists(modelfile + '.cls'):
        # expose the class file under the model's own name
        os.symlink(classfile, modelfile + '.cls')
    if not os.path.exists(corpusfile):
        self.log("Encoding corpus")
        classencoder.encodefile(sourcefile, corpusfile)
    self.log("Generating bigram frequency list")
    options = colibricore.PatternModelOptions(
        mintokens=self.settings['freqthreshold'],
        minlength=1,
        maxlength=2)  # unigrams and bigrams
    model = colibricore.UnindexedPatternModel()
    model.train(corpusfile, options)
    self.log("Saving model")
    model.write(modelfile)
def train(self):
    """Train a unigram lexicon model, but only when a source corpus is
    configured and the model file does not exist yet (idempotent)."""
    if self.sourcefile and not os.path.exists(self.modelfile):
        classfile = stripsourceextensions(self.sourcefile) + ".cls"
        corpusfile = stripsourceextensions(self.sourcefile) + ".dat"
        if not os.path.exists(classfile):
            # BUGFIX: ClassEncoder's first argument is the class-file name;
            # the original passed minlength in that position. Sibling
            # implementations in this file use ClassEncoder("", min, max).
            classencoder = colibricore.ClassEncoder("", self.minlength, self.maxlength)
            classencoder.build(self.sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile, self.minlength, self.maxlength)
        if not os.path.exists(corpusfile):
            classencoder.encodefile(self.sourcefile, corpusfile)
        options = colibricore.PatternModelOptions(mintokens=self.threshold, minlength=1, maxlength=1)
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)
        # BUGFIX: serialize with write(), consistent with every other
        # pattern-model train() in this file (original called save()).
        model.write(self.modelfile)
def train(self, sourcefile, modelfile, **parameters):
    """Train a unigram lexicon (frequency list) for this module.

    Encodes the corpus with character-length constraints from the settings,
    trains an unindexed unigram pattern model, and saves it via
    self.savemodel() so the persistence step can be overloaded.
    """
    self.log("Preparing to generate lexicon")
    classfile = stripsourceextensions(sourcefile) + ".cls"
    corpusfile = stripsourceextensions(sourcefile) + ".dat"
    if not os.path.exists(classfile):
        self.log("Building class file")
        # "" = build a fresh class file; min/max constrain token character length
        classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength'])
        classencoder.build(sourcefile)
        classencoder.save(classfile)
    else:
        classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])
    # Make symlink to class file, using model name instead of source name.
    # The original performed this identical check/symlink twice (before and
    # after corpus encoding); the second occurrence could never fire.
    if not os.path.exists(modelfile + '.cls'):
        os.symlink(classfile, modelfile + '.cls')
    if not os.path.exists(corpusfile):
        self.log("Encoding corpus")
        classencoder.encodefile(sourcefile, corpusfile)
    self.log("Generating frequency list")
    options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)  #unigrams only
    model = colibricore.UnindexedPatternModel()
    model.train(corpusfile, options)
    self.savemodel(model, modelfile, classfile)  #in separate function so it can be overloaded
def train(self):
    """Train the unigram lexicon (stored on self) if a source corpus is
    configured and the model file is not already present."""
    if self.sourcefile and not os.path.exists(self.modelfile):
        classfile = stripsourceextensions(self.sourcefile) + ".cls"
        corpusfile = stripsourceextensions(self.sourcefile) + ".dat"
        if not os.path.exists(classfile):
            # BUGFIX: ClassEncoder's first argument is the class-file name;
            # the original passed minlength in that position. Sibling
            # implementations in this file use ClassEncoder("", min, max).
            self.classencoder = colibricore.ClassEncoder("", self.minlength, self.maxlength)
            self.classencoder.build(self.sourcefile)
            self.classencoder.save(classfile)
        else:
            self.classencoder = colibricore.ClassEncoder(classfile, self.minlength, self.maxlength)
        if not os.path.exists(self.modelfile + '.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, self.modelfile + '.cls')
        if not os.path.exists(corpusfile):
            self.classencoder.encodefile(self.sourcefile, corpusfile)
        options = colibricore.PatternModelOptions(mintokens=self.threshold, minlength=1, maxlength=1)
        self.lexicon = colibricore.UnindexedPatternModel()
        self.lexicon.train(corpusfile, options)
        self.lexicon.write(self.modelfile)
def train(self, sourcefile, modelfile, **parameters):
    """Train the suffix-confusible module. Called once per model file.

    Two cases, selected by which model file is requested:

    * ``modelfile == self.confusiblefile``: build a unigram frequency model
      of the corpus and derive groups of confusible words — words that only
      differ in one of ``self.suffixes`` and all occur in the model — then
      write them to ``modelfile`` as plain text, one word per line.
    * ``modelfile == self.modelfile``: train a Timbl classifier that
      predicts the correct suffix from the left/right word context of each
      confusible occurrence in the corpus.
    """
    if modelfile == self.confusiblefile:
        #Build frequency list
        self.log(
            "Preparing to generate lexicon for suffix confusible module")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"
        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder(
                "", self.settings['minlength'],
                self.settings['maxlength'])  #character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(
                classfile, self.settings['minlength'],
                self.settings['maxlength'])
        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)
        self.log("Generating frequency list")
        # unigrams only: the confusibles are single words
        options = colibricore.PatternModelOptions(
            mintokens=self.settings['freqthreshold'],
            minlength=1,
            maxlength=1)  #unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)
        self.log("Finding confusible pairs")
        classdecoder = colibricore.ClassDecoder(classfile)
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        for pattern in model:
            try:
                pattern_s = pattern.tostring(classdecoder)
            except UnicodeDecodeError:
                # NOTE(review): on decode failure pattern_s keeps the value
                # from the previous iteration and processing continues —
                # looks like a latent bug (a continue seems intended); confirm.
                self.log(
                    "WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!"
                )
            for suffix in self.suffixes:
                if pattern_s.endswith(
                        suffix) and not pattern_s in self.confusibles:
                    # Collect every sibling word obtained by swapping the
                    # suffix; the group only counts if ALL siblings pass.
                    found = []
                    for othersuffix in self.suffixes:
                        if othersuffix != suffix:
                            otherpattern_s = pattern_s[:-len(
                                suffix)] + othersuffix
                            try:
                                otherpattern = classencoder.buildpattern(
                                    otherpattern_s, False, False)
                            except KeyError:
                                # sibling not encodable -> reject whole group
                                if found: found = []
                                break
                            if not otherpattern in model:
                                # sibling below frequency threshold -> reject
                                if found: found = []
                                break
                            if self.settings['maxratio'] != 0:
                                freqs = (
                                    model.occurrencecount(pattern),
                                    model.occurrencecount(otherpattern))
                                ratio = max(freqs) / min(freqs)
                                # NOTE(review): rejects when ratio is BELOW
                                # maxratio, i.e. the setting acts as a lower
                                # bound despite its name — confirm intent.
                                if ratio < self.settings['maxratio']:
                                    if found: found = []
                                    break
                            found.append(otherpattern_s)
                    if found:
                        self.confusibles.append(pattern_s)
                        for s in found:
                            self.confusibles.append(s)
        self.log("Writing confusible list")
        with open(modelfile, 'w', encoding='utf-8') as f:
            for confusible in self.confusibles:
                f.write(confusible + "\n")
    elif modelfile == self.modelfile:
        # Lazily load the confusible list if a previous call in this
        # process did not already build it (EAFP attribute probe).
        try:
            self.confusibles
        except AttributeError:
            self.confusibles = []
            self.log("Loading confusiblefile")
            with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        self.confusibles.append(line)
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r  # window size: left context + focus word + right context
        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        # pick the reader matching the corpus compression
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,
                           mode='rt',
                           encoding='utf-8',
                           errors='ignore') as f:
            for i, line in enumerate(f):
                for ngram in Windower(line, n):
                    # progress report every 100k lines
                    if i % 100000 == 0:
                        print(datetime.datetime.now().strftime(
                            "%Y-%m-%d %H:%M:%S") + " - " + str(i),
                              file=sys.stderr)
                    confusible = ngram[l]  # focus word in window centre
                    if confusible in self.confusibles:
                        if self.hapaxer:
                            # replace rare context words with the placeholder
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        suffix, normalized = self.getsuffix(confusible)
                        if suffix is not None:
                            # features: context + suffix-stripped focus; class: suffix
                            classifier.append(
                                leftcontext + (normalized, ) + rightcontext,
                                suffix)
        self.log("Training classifier...")
        classifier.train()
        self.log("Saving model " + modelfile)
        classifier.save()
def train(self, sourcefile, modelfile, **parameters):
    """Train one of three colibri-core frequency models for this module.

    The model filename suffix selects the variant: ``.1`` builds a unigram
    model (recasing), ``.3`` builds a punctuation-filtered trigram model
    (deletions), and anything else builds a bigram model (insertions and
    recasing of word pairs).
    """
    classfile = stripsourceextensions(sourcefile) + ".cls"
    corpusfile = stripsourceextensions(sourcefile) + ".nonewlines.dat"
    if os.path.exists(classfile):
        encoder = colibricore.ClassEncoder(classfile)
    else:
        self.log("Building class file")
        encoder = colibricore.ClassEncoder()
        encoder.build(sourcefile)
        encoder.save(classfile)
    if not os.path.exists(modelfile + '.cls'):
        # expose the class file under the model's own name
        os.symlink(classfile, modelfile + '.cls')
    if not os.path.exists(corpusfile):
        self.log("Encoding corpus")
        # newlines are ignored so n-grams may span line breaks
        encoder.encodefile(sourcefile, corpusfile, ignorenewlines=True)
    if modelfile.endswith('.1'):
        # unigram model (for recasing)
        self.log("Generating unigram frequency list")
        opts = colibricore.PatternModelOptions(
            mintokens=self.settings['recasethreshold'],
            minlength=1,
            maxlength=1)
        freqmodel = colibricore.UnindexedPatternModel()
        freqmodel.train(corpusfile, opts)
        self.log("Saving model")
        freqmodel.write(modelfile)
    elif modelfile.endswith('.3'):
        # trigram model restricted to word-punctuation-word skipgrams
        self.log("Generating filtered trigram frequency list")
        filterpatterns = colibricore.PatternSet()
        for punc in ColibriPuncRecaseModule.PUNCTUATION:
            candidate = encoder.buildpattern('{*1*} ' + punc + ' {*1*}')
            if not candidate.unknown():
                filterpatterns.add(candidate)
        self.log("(" + str(len(filterpatterns)) + " filters)")
        opts = colibricore.PatternModelOptions(
            mintokens=self.settings['deletioncutoff'],
            minlength=3,
            maxlength=3)
        freqmodel = colibricore.UnindexedPatternModel()
        freqmodel.train_filtered(corpusfile, opts, filterpatterns)
        self.log("Saving model")
        freqmodel.write(modelfile)
    else:
        # bigram model; threshold is the stricter of the two cutoffs
        self.log("Generating bigram frequency list")
        opts = colibricore.PatternModelOptions(
            mintokens=min(self.settings['insertioncutoff'],
                          self.settings['recasethreshold2']),
            minlength=2,
            maxlength=2)
        freqmodel = colibricore.UnindexedPatternModel()
        freqmodel.train(corpusfile, opts)
        self.log("Saving model")
        freqmodel.write(modelfile)
        del freqmodel
def train(self, sourcefile, modelfile, **parameters):
    """Train the suffix-confusible module. Called once per model file.

    Two cases, selected by which model file is requested:

    * ``modelfile == self.confusiblefile``: build a unigram frequency model
      and derive groups of confusible words (words differing only in one of
      ``self.suffixes`` and all present in the model), written to
      ``modelfile`` one word per line.
    * ``modelfile == self.modelfile``: train a Timbl classifier predicting
      the suffix from the left/right word context of each confusible
      occurrence. NOTE(review): unlike the sibling implementation, this
      branch assumes ``self.confusibles`` is already populated — confirm.
    """
    if modelfile == self.confusiblefile:
        #Build frequency list
        self.log("Preparing to generate lexicon for suffix confusible module")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"
        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength'])  #character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])
        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)
        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)  #unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)
        self.log("Finding confusible pairs")
        classdecoder = colibricore.ClassDecoder(classfile)
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        for pattern in model:
            try:
                pattern_s = pattern.tostring(classdecoder)
            except UnicodeDecodeError:
                # NOTE(review): on decode failure pattern_s keeps the value
                # from the previous iteration and processing continues —
                # looks like a latent bug (a continue seems intended); confirm.
                self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
            for suffix in self.suffixes:
                if pattern_s.endswith(suffix) and not pattern_s in self.confusibles:
                    # Collect every sibling word obtained by swapping the
                    # suffix; the group only counts if ALL siblings pass.
                    found = []
                    for othersuffix in self.suffixes:
                        if othersuffix != suffix:
                            otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                            try:
                                otherpattern = classencoder.buildpattern(otherpattern_s, False, False)
                            except KeyError:
                                # sibling not encodable -> reject whole group
                                if found: found = []
                                break
                            if not otherpattern in model:
                                # sibling below frequency threshold -> reject
                                if found: found = []
                                break
                            if self.settings['maxratio'] != 0:
                                freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                ratio = max(freqs) / min(freqs)
                                # NOTE(review): rejects when ratio is BELOW
                                # maxratio, i.e. the setting acts as a lower
                                # bound despite its name — confirm intent.
                                if ratio < self.settings['maxratio']:
                                    if found: found = []
                                    break
                            found.append(otherpattern_s)
                    if found:
                        self.confusibles.append(pattern_s)
                        for s in found:
                            self.confusibles.append(s)
        self.log("Writing confusible list")
        with open(modelfile, 'w', encoding='utf-8') as f:
            for confusible in self.confusibles:
                f.write(confusible + "\n")
    elif modelfile == self.modelfile:
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r  # window size: left context + focus word + right context
        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        # pick the reader matching the corpus compression
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                for ngram in Windower(line, n):
                    # progress report every 100k lines
                    if i % 100000 == 0:
                        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                    confusible = ngram[l]  # focus word in window centre
                    if confusible in self.confusibles:
                        if self.hapaxer:
                            # replace rare context words with the placeholder
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        suffix, normalized = self.getsuffix(confusible)
                        if suffix is not None:
                            # features: context + suffix-stripped focus; class: suffix
                            classifier.append(leftcontext + (normalized,) + rightcontext, suffix)
        self.log("Training classifier...")
        classifier.train()
        self.log("Saving model " + modelfile)
        classifier.save()
def train(self, sourcefile, modelfile, **parameters):
    """Train the language-model module's components.

    The model filename suffix selects what is trained:

    * ``.ibase``: a Timbl classifier predicting the focus word from its
      left/right context words (optionally hapax-filtered).
    * ``.patternmodel``: an unindexed unigram pattern model (lexicon) built
      via a colibri-core class file and encoded corpus.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()
    if modelfile.endswith('.ibase'):
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r  # window size: left context + focus word + right context
        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        # pick the reader matching the corpus compression
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                # progress report every 100k lines
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    if self.hapaxer:
                        # replace rare words with the hapax placeholder
                        ngram = self.hapaxer(ngram)
                    focus = ngram[l]
                    if self.hapaxer and focus == self.hapaxer.placeholder:
                        continue  # skip instances whose focus word was hapaxed away
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l + 1:])
                    classifier.append(leftcontext + rightcontext, focus)
        self.log("Training classifier...")
        classifier.train()
        self.log("Saving model " + modelfile)
        classifier.save()
    elif modelfile.endswith('.patternmodel'):
        self.log("Preparing to generate lexicon for Language Model")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"
        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile)
        # Make symlink to class file, using model name instead of source name.
        # (The original repeated this identical check/symlink a second time
        # after corpus encoding; the duplicate was dead code and removed.)
        if not os.path.exists(modelfile + '.cls'):
            os.symlink(classfile, modelfile + '.cls')
        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)
        self.log("Generating pattern model")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)
        self.log("Saving model " + modelfile)
        model.write(modelfile)