def load(self):
    """Load the hapaxer (if any), the confusible list and the Timbl model.

    The confusible file (``self.confusiblefile``) is read line by line into
    ``self.confusibles``; lines that are empty after stripping are skipped.
    The instance base referenced by ``self.modelfile`` is then loaded into
    ``self.classifier``.

    Raises:
        Exception: if ``self.models`` is empty.
        IOError: if the confusible file or the model file does not exist.
    """
    if self.hapaxer:
        self.log("Loading hapaxer...")
        self.hapaxer.load()

    if not self.models:
        raise Exception("Specify one or more models to load!")

    self.confusibles = []  #pylint: disable=attribute-defined-outside-init
    self.log("Loading models...")

    # Step 1: the plain-text list of confusibles
    if not os.path.exists(self.confusiblefile):
        complaint = "Missing expected confusible file: " + self.confusiblefile + ". Did you forget to train the system?"
        raise IOError(complaint)
    with open(self.confusiblefile, 'r', encoding='utf-8') as f:
        self.confusibles.extend(entry for entry in map(str.strip, f) if entry)

    # Step 2: the Timbl instance base
    if not os.path.exists(self.modelfile):
        complaint = "Missing expected model file: " + self.modelfile + ". Did you forget to train the system?"
        raise IOError(complaint)
    self.log("Loading Timbl model file " + self.modelfile + "...")
    # TimblClassifier is given the path without the .ibase extension
    # (the extension itself has been verified earlier)
    fileprefix = self.modelfile.replace(".ibase", "")
    self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False)  #pylint: disable=attribute-defined-outside-init
    self.classifier.load()
def load(self):
    """Load the Timbl model and, when a second model is given, the lexicon.

    ``self.models`` holds either one entry (the ``.ibase`` model) or two
    (the model followed by a colibri pattern model used as lexicon).
    ``self.errorlist`` is reset to an empty dict before anything else.

    Raises:
        Exception: if ``self.models`` is empty.
        IOError: if the model file, or the lexicon file when given, does not exist.
    """
    self.errorlist = {}

    if not self.models:
        raise Exception("Specify one or more models to load!")

    if self.hapaxer:
        self.log("Loading hapaxer...")
        self.hapaxer.load()

    self.log("Loading models...")
    # Exactly two entries means model + lexicon; anything else: first entry only
    if len(self.models) == 2:
        modelfile, lexiconfile = self.models
    else:
        modelfile, lexiconfile = self.models[0], None

    if not os.path.exists(modelfile):
        raise IOError("Missing expected timbl model file: " + modelfile + ". Did you forget to train the system?")
    if lexiconfile and not os.path.exists(lexiconfile):
        raise IOError("Missing expected lexicon model file: " + lexiconfile + ". Did you forget to train the system?")

    self.log("Loading model file " + modelfile + "...")
    # TimblClassifier takes the path without extension (verified earlier)
    fileprefix = modelfile.replace(".ibase", "")
    self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), threading=True, debug=self.debug)
    self.classifier.load()

    if not lexiconfile:
        self.lexicon = None
        return

    self.log("Loading colibri model file for lexicon " + lexiconfile)
    self.classencoder = colibricore.ClassEncoder(lexiconfile + '.cls')
    self.lexicon = colibricore.UnindexedPatternModel(lexiconfile)
def fit(self, X, y):
    """Train a Timbl classifier on feature matrix ``X`` and labels ``y``.

    ``X`` may be a dense array or a CSR sparse matrix (``check_X_y`` is
    called with ``accept_sparse='csr'`` and ``dtype=np.int64``). Sparse
    input is fed to Timbl in its ``Sparse`` format as ``(index,value)``
    pairs with 1-based feature indices; dense input is fed row by row.

    Sets ``self.classes_`` and ``self.classifier``.

    Returns:
        self, after training the classifier.
    """
    X, y = check_X_y(X, y, dtype=np.int64, accept_sparse='csr')

    n_rows = X.shape[0]
    self.classes_ = np.unique(y)
    timbloptions = "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1])

    if sp.sparse.issparse(X):
        if self.debug:
            print('Features are sparse, choosing faster learning')

        self.classifier = TimblClassifier(self.prefix, timbloptions, format='Sparse', debug=True, sklearn=True, flushdir=self.flushdir, flushthreshold=20000, normalize=self.normalize)

        for i in range(n_rows):
            row = X[i]
            # Only the stored entries of the row are emitted; indices are shifted to 1-based
            sparse = ['({},{})'.format(col + 1, value) for col, value in zip(row.indices, row.data)]
            self.classifier.append(sparse, str(y[i]))
    else:
        self.classifier = TimblClassifier(self.prefix, timbloptions, debug=True, sklearn=True, flushdir=self.flushdir, flushthreshold=20000, normalize=self.normalize)

        if y.dtype != 'O':
            y = y.astype(str)

        for i in range(n_rows):
            # X is a plain ndarray in this branch: its rows have no
            # .toarray() (that is a scipy.sparse method), so convert directly.
            self.classifier.append(list(X[i]), y[i])

    self.classifier.train()
    return self
def load(self):
    """Bring the module into a usable state: hapaxer, confusibles, classifier.

    ``self.confusibles`` is rebuilt from ``self.confusiblefile`` (one entry
    per line, surrounding whitespace removed, blank lines ignored) and the
    Timbl instance base ``self.modelfile`` is loaded into ``self.classifier``.

    Raises:
        Exception: when no models were specified.
        IOError: when the confusible file or the model file is missing.
    """
    if self.hapaxer:
        self.log("Loading hapaxer...")
        self.hapaxer.load()

    if not self.models:
        raise Exception("Specify one or more models to load!")

    self.confusibles = []  #pylint: disable=attribute-defined-outside-init
    self.log("Loading models...")

    if not os.path.exists(self.confusiblefile):
        raise IOError("Missing expected confusible file: " + self.confusiblefile + ". Did you forget to train the system?")

    with open(self.confusiblefile, 'r', encoding='utf-8') as f:
        for rawline in f:
            entry = rawline.strip()
            if not entry:
                continue  # skip blank lines
            self.confusibles.append(entry)

    if not os.path.exists(self.modelfile):
        raise IOError("Missing expected model file: " + self.modelfile + ". Did you forget to train the system?")

    self.log("Loading Timbl model file " + self.modelfile + "...")
    # strip the extension, which has been verified earlier
    fileprefix = self.modelfile.replace(".ibase", "")
    self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False)  #pylint: disable=attribute-defined-outside-init
    self.classifier.load()
def train(self, sourcefile, modelfile, **parameters):
    """Generate training instances from a tokenised corpus and train the classifier.

    The corpus (plain, ``.gz`` or ``.bz2``) is read line by line and split
    on spaces. Every token containing at least one alphabetic character is
    pushed onto a sliding window as ``(word, is_capitalised, punc)``, where
    ``punc`` is the immediately preceding token if that token is in
    ``PUNCTUATION`` and the empty string otherwise. Each time the window
    holds ``leftcontext + rightcontext + 1`` words it is handed to
    ``self.addtraininstance``, whose return value becomes the new window.
    No ``<begin>``/``<end>`` padding is added. The trained classifier is
    saved at the end.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    leftsize = self.settings['leftcontext']
    rightsize = self.settings['rightcontext']
    windowsize = leftsize + rightsize + 1

    self.log("Generating training instances...")
    # TimblClassifier takes the path without extension (verified earlier)
    fileprefix = modelfile.replace(".ibase", "")
    classifier = TimblClassifier(fileprefix, self.gettimbloptions())

    # Pick the module whose open() matches the compression of the corpus
    iomodule = bz2 if sourcefile.endswith(".bz2") else gzip if sourcefile.endswith(".gz") else io

    prevword = ""
    window = []
    with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
        for linenum, line in enumerate(f):
            if linenum % 100000 == 0:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(timestamp + " - " + str(linenum), file=sys.stderr)

            for token in line.split(' '):
                word = token.strip()
                if not word:
                    continue

                punc = prevword if prevword in PUNCTUATION else ""

                if any(c.isalpha() for c in word):
                    capitalised = word == word[0].upper() + word[1:].lower()
                    window.append((word, capitalised, punc))

                if len(window) == windowsize:
                    window = self.addtraininstance(classifier, window, leftsize, rightsize)

                # remembered for every token, punctuation included
                prevword = word

    self.log("Training classifier...")
    classifier.train()

    self.log("Saving model " + modelfile)
    classifier.save()
def train(self, sourcefile, modelfile, **parameters):
    """Train the language-model classifier on a tokenised corpus.

    For every n-gram of size ``leftcontext + 1 + rightcontext`` produced by
    ``Windower`` over each line, the word at position ``leftcontext`` is the
    class and the remaining words (left context followed by right context)
    are the features. The corpus may be plain text, ``.gz`` or ``.bz2``.
    The trained classifier is saved at the end.
    """
    leftsize = self.settings['leftcontext']
    rightsize = self.settings['rightcontext']
    windowsize = leftsize + 1 + rightsize

    self.log("Generating training instances...")
    # TimblClassifier takes the path without extension (verified earlier)
    fileprefix = modelfile.replace(".ibase", "")
    classifier = TimblClassifier(fileprefix, self.gettimbloptions())

    # Pick the module whose open() matches the compression of the corpus
    iomodule = bz2 if sourcefile.endswith(".bz2") else gzip if sourcefile.endswith(".gz") else io

    with iomodule.open(sourcefile, mode='rt', encoding='utf-8') as f:
        for linenum, line in enumerate(f):
            if linenum % 100000 == 0:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(timestamp + " - " + str(linenum), file=sys.stderr)

            for ngram in Windower(line, windowsize):
                focus = ngram[leftsize]
                context = tuple(ngram[:leftsize]) + tuple(ngram[leftsize + 1:])
                classifier.append(context, focus)

    self.log("Training classifier...")
    classifier.train()

    self.log("Saving model " + modelfile)
    classifier.save()
def load(self):
    """Load the hapaxer (if configured) and the Timbl model in ``self.models[0]``.

    Raises:
        Exception: if ``self.models`` is empty.
        IOError: if the model file does not exist.
    """
    if self.hapaxer:
        self.log("Loading hapaxer...")
        self.hapaxer.load()

    if not self.models:
        raise Exception("Specify one or more models to load!")

    self.log("Loading models...")

    modelfile = self.models[0]
    modelpresent = os.path.exists(modelfile)
    if not modelpresent:
        raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")

    self.log("Loading model file " + modelfile + "...")
    # the classifier wants the path without the (already verified) extension
    fileprefix = modelfile.replace(".ibase", "")
    self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
    self.classifier.load()
def load(self):
    """Reset ``self.errorlist`` and load the Timbl model in ``self.models[0]``.

    Raises:
        Exception: if ``self.models`` is empty.
        IOError: if the model file does not exist.
    """
    self.errorlist = {}

    if not self.models:
        raise Exception("Specify one or more models to load!")

    self.log("Loading models...")

    modelfile = self.models[0]
    if os.path.exists(modelfile):
        self.log("Loading model file " + modelfile + "...")
        # the classifier wants the path without the (already verified) extension
        fileprefix = modelfile.replace(".ibase", "")
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()
    else:
        raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
def train(self, sourcefile, modelfile, **parameters):
    """Build punctuation/recasing training instances and train the classifier.

    Tokens are obtained by splitting each corpus line on spaces. A token
    with at least one alphabetic character is appended to the sliding
    window as ``(word, is_capitalised, punc)``; ``punc`` is the preceding
    token when it occurs in ``TIMBLPuncRecaseModule.PUNCTUATION``, else
    ``""``. Whenever the window contains ``leftcontext + rightcontext + 1``
    entries, ``self.addtraininstance`` is called and its return value
    replaces the window. Sentence-boundary padding is not applied. The
    corpus may be plain text, ``.gz`` or ``.bz2``.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    leftsize = self.settings['leftcontext']
    rightsize = self.settings['rightcontext']
    fullwindow = leftsize + rightsize + 1

    self.log("Generating training instances...")
    # TimblClassifier takes the path without extension (verified earlier)
    fileprefix = modelfile.replace(".ibase", "")
    classifier = TimblClassifier(fileprefix, self.gettimbloptions())

    if sourcefile.endswith(".bz2"):
        opener = bz2
    elif sourcefile.endswith(".gz"):
        opener = gzip
    else:
        opener = io

    previous = ""
    window = []
    with opener.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
        for linenum, line in enumerate(f):
            if linenum % 100000 == 0:
                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(now + " - " + str(linenum), file=sys.stderr)

            stripped = (piece.strip() for piece in line.split(' '))
            for word in stripped:
                if not word:
                    continue

                if previous in TIMBLPuncRecaseModule.PUNCTUATION:
                    punc = previous
                else:
                    punc = ""

                if any(c.isalpha() for c in word):
                    titlecased = word[0].upper() + word[1:].lower()
                    window.append((word, word == titlecased, punc))

                if len(window) == fullwindow:
                    window = self.addtraininstance(classifier, window, leftsize, rightsize)

                # tracked for every token so punctuation can be attached to the next word
                previous = word

    self.log("Training classifier...")
    classifier.train()

    self.log("Saving model " + modelfile)
    classifier.save()
def train(self, sourcefile, modelfile, **parameters):
    """Train the confusible classifier on a tokenised corpus.

    ``Windower`` produces n-grams of size ``leftcontext + 1 + rightcontext``
    for every line. When the focus word (position ``leftcontext``) is one
    of ``self.settings['confusibles']``, the surrounding words become the
    feature vector (passed through the hapaxer first, if one is configured)
    and the untouched focus word becomes the class. The corpus may be
    plain text, ``.gz`` or ``.bz2``.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    leftsize = self.settings['leftcontext']
    rightsize = self.settings['rightcontext']
    windowsize = leftsize + 1 + rightsize

    self.log("Generating training instances...")
    # TimblClassifier takes the path without extension (verified earlier)
    fileprefix = modelfile.replace(".ibase", "")
    classifier = TimblClassifier(fileprefix, self.gettimbloptions())

    # Pick the module whose open() matches the compression of the corpus
    iomodule = bz2 if sourcefile.endswith(".bz2") else gzip if sourcefile.endswith(".gz") else io

    with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
        for linenum, line in enumerate(f):
            if linenum % 100000 == 0:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(timestamp + " - " + str(linenum), file=sys.stderr)

            for ngram in Windower(line, windowsize):
                focus = ngram[leftsize]
                if focus not in self.settings['confusibles']:
                    continue
                if self.hapaxer:
                    ngram = self.hapaxer(ngram)
                features = tuple(ngram[:leftsize]) + tuple(ngram[leftsize + 1:])
                # the class is the original focus word, not its hapaxed form
                classifier.append(features, focus)

    self.log("Training classifier...")
    classifier.train()

    self.log("Saving model " + modelfile)
    classifier.save()
def load(self):
    """Load the hapaxer (if configured) and the non-normalising Timbl model.

    The classifier is created with ``normalize=False`` from the file in
    ``self.models[0]``.

    Raises:
        Exception: if ``self.models`` is empty.
        IOError: if the model file does not exist.
    """
    if self.hapaxer:
        self.log("Loading hapaxer...")
        self.hapaxer.load()

    if not self.models:
        raise Exception("Specify one or more models to load!")

    self.log("Loading models...")

    modelfile = self.models[0]
    if not os.path.exists(modelfile):
        problem = "Missing expected model file: " + modelfile + ". Did you forget to train the system?"
        raise IOError(problem)

    self.log("Loading model file " + modelfile + "...")
    # the classifier wants the path without the (already verified) extension
    fileprefix = modelfile.replace(".ibase", "")
    options = self.gettimbloptions()
    self.classifier = TimblClassifier(fileprefix, options, normalize=False)  #pylint: disable=attribute-defined-outside-init
    self.classifier.load()
def create_classifier_and_word_freq_list(train_instances, timbl_models_folder, train_users, test_user, tweet_index):
    """Train a Timbl classifier on all instances except the held-out tweet.

    Instances written by ``test_user`` whose ``original_tweet_index`` equals
    ``tweet_index`` are left out. The model name is
    ``<test_user>.<train_users joined by '_'>.<tweet_index>`` and is appended
    directly to ``timbl_models_folder`` (no path separator is inserted).

    Returns:
        A tuple ``(classifier, word_frequencies)`` where ``word_frequencies``
        is a ``Counter`` of the labels of the instances that were used.
    """
    name_parts = [test_user, '_'.join(train_users), str(tweet_index)]
    timbl_model_name = '.'.join(name_parts)
    classifier = TimblClassifier(timbl_models_folder + timbl_model_name, '-a 0 -k 1 +vs')

    word_frequencies = Counter()
    for instance in train_instances:
        held_out = instance.author == test_user and instance.original_tweet_index == tweet_index
        if not held_out:
            classifier.append(instance.features, instance.label)
            word_frequencies[instance.label] += 1

    classifier.train()
    return classifier, word_frequencies
def train(self, sourcefile, modelfile, **parameters):
    """Collect context instances for every confusible in the corpus and train.

    Each corpus line is windowed into n-grams of ``leftcontext + 1 +
    rightcontext`` words. N-grams whose middle (focus) word is listed in
    ``self.settings["confusibles"]`` yield one training instance: the
    context words as features (hapaxed when a hapaxer is configured) and
    the focus word as class. Plain, ``.gz`` and ``.bz2`` corpora are
    supported. The classifier is saved once training has finished.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    l = self.settings["leftcontext"]
    r = self.settings["rightcontext"]
    n = l + 1 + r

    self.log("Generating training instances...")
    # TimblClassifier takes the path without extension (verified earlier)
    classifier = TimblClassifier(modelfile.replace(".ibase", ""), self.gettimbloptions())

    if sourcefile.endswith(".bz2"):
        opener = bz2.open
    elif sourcefile.endswith(".gz"):
        opener = gzip.open
    else:
        opener = io.open

    with opener(sourcefile, mode="rt", encoding="utf-8", errors="ignore") as f:
        for lineindex, line in enumerate(f):
            if lineindex % 100000 == 0:
                stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(stamp + " - " + str(lineindex), file=sys.stderr)

            for ngram in Windower(line, n):
                confusible = ngram[l]
                if confusible in self.settings["confusibles"]:
                    window = self.hapaxer(ngram) if self.hapaxer else ngram
                    before = tuple(window[:l])
                    after = tuple(window[l + 1:])
                    classifier.append(before + after, confusible)

    self.log("Training classifier...")
    classifier.train()

    self.log("Saving model " + modelfile)
    classifier.save()
class TIMBLSuffixConfusibleModule(Module):
    """The Suffix Confusible module is capable of disambiguating suffixes on words.

    The suffixes are passed to the ``suffixes`` setting (a list of strings).
    All words using these suffixes above a certain threshold
    (``freqthreshold``) will be found at training time and disambiguated
    using context. The module is implemented using Timbl.

    Settings:

    * ``suffixes``     - List of suffixes (strings) that form a single set of confusibles. (changing this requires retraining)
    * ``freqthreshold``- Only consider words with a suffix that occur at least this many times (default: 20) (changing this requires retraining)
    * ``maxratio``     - Maximum ratio expressing the maximally allowed frequency difference between the confusibles (value > 1, 0 = no limit, default: 0) (changing this requires retraining)
    * ``minlength``    - Only consider words with a suffix that are at least this long (in characters) (default: 3) (changing this requires retraining)
    * ``maxlength``    - Only consider words with a suffix that are at most this long (in characters) (default: 25) (changing this requires retraining)
    * ``leftcontext``  - Left context size (in words) for the feature vector (default: 3) (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (default: 3) (changing this requires retraining)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree) (changing this requires retraining)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusion)
    * ``threshold``    - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc``       - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:

    * a plain-text corpus (tokenized) [``.txt``] -> a list of confusibles [``.lst``]
    * a plain-text corpus (tokenized) [``.txt``] -> a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """

    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        """Fill in default settings and locate the ``.ibase`` and ``.lst`` models.

        Raises:
            Exception: if no suffixes are configured, or if ``self.models``
                lacks a file ending in ``.ibase`` or one ending in ``.lst``.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.settings['threshold'] = 0.8

        if 'minocc' not in self.settings:
            self.settings['minocc'] = 5

        self.hapaxer = gethapaxer(self, self.settings)

        if 'suffixes' not in self.settings:
            raise Exception("No suffixes specified for " + self.id + "!")
        self.suffixes = sorted(self.settings['suffixes'], key=lambda x: -1 * len(x))  #sort from long to short

        #settings for computation of confusible list
        if 'freqthreshold' not in self.settings:
            self.settings['freqthreshold'] = 20
        if 'maxlength' not in self.settings:
            self.settings['maxlength'] = 25  #longer words will be ignored
        if 'minlength' not in self.settings:
            self.settings['minlength'] = 3  #shorter word will be ignored
        if 'maxratio' not in self.settings:
            self.settings['maxratio'] = 0  #no limit

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        ibasefound = lstfound = False
        for filename in self.models:
            if filename.endswith('.ibase'):
                ibasefound = True
                self.modelfile = filename
            elif filename.endswith('.lst'):
                lstfound = True
                self.confusiblefile = filename

        if not ibasefound:
            raise Exception("TIMBL models must have the extension ibase, not model file was supplies with that extension")
        if not lstfound:
            raise Exception("Specify a model file with extension lst that will store all confusibles found")

    def gettimbloptions(self):
        """Return the Timbl command-line options for the configured algorithm."""
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    @staticmethod
    def _stem(word, suffix):
        """Return *word* without its trailing *suffix*.

        Written with an explicit end index because ``word[:-len(suffix)]``
        would yield the empty string for an empty suffix.
        """
        return word[:len(word) - len(suffix)]

    def _loadconfusibles(self):
        """(Re)build ``self.confusibles`` from ``self.confusiblefile``, skipping blank lines.

        Raises:
            IOError: if the confusible file does not exist.
        """
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: " + self.confusiblefile + ". Did you forget to train the system?")
        with open(self.confusiblefile, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)

    def load(self):
        """Load the requested modules from self.models

        Loads the hapaxer (if any), the confusible list and the Timbl
        instance base.

        Raises:
            Exception: if ``self.models`` is empty.
            IOError: if the confusible file or the model file is missing.
        """
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        self._loadconfusibles()

        if not os.path.exists(self.modelfile):
            raise IOError("Missing expected model file: " + self.modelfile + ". Did you forget to train the system?")
        self.log("Loading Timbl model file " + self.modelfile + "...")
        fileprefix = self.modelfile.replace(".ibase", "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False)  #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def clientload(self):
        """Load only the confusible list; the classifier is not loaded here.

        Raises:
            IOError: if the confusible file is missing.
        """
        self.log("Loading models (for client)...")
        self._loadconfusibles()

    def train(self, sourcefile, modelfile, **parameters):
        """Train one of the two models of this module, selected by ``modelfile``.

        * ``modelfile == self.confusiblefile``: build a unigram frequency
          list with colibri-core and write every word that ends in one of
          the configured suffixes, and whose variants with all other
          suffixes are frequent enough as well, to the ``.lst`` file.
        * ``modelfile == self.modelfile``: generate context instances for
          all confusibles in the corpus and train the Timbl classifier.
        """
        if modelfile == self.confusiblefile:
            self._trainconfusiblelist(sourcefile, modelfile)
        elif modelfile == self.modelfile:
            self._trainclassifier(sourcefile, modelfile)

    def _trainconfusiblelist(self, sourcefile, modelfile):
        """Find all suffix confusibles in the corpus and write them to *modelfile*."""
        #Build frequency list
        self.log("Preparing to generate lexicon for suffix confusible module")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength'])  #character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)  #unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Finding confusible pairs")
        classdecoder = colibricore.ClassDecoder(classfile)
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        seen = set()  # mirrors self.confusibles for O(1) membership tests
        maxratio = self.settings['maxratio']

        for pattern in model:
            try:
                pattern_s = pattern.tostring(classdecoder)
            except UnicodeDecodeError:
                self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                # Skip it: pattern_s would otherwise be undefined or left
                # over from the previous pattern.
                continue

            for suffix in self.suffixes:
                if not pattern_s.endswith(suffix) or pattern_s in seen:
                    continue

                # The word only counts as a confusible when its variants
                # with *all* other suffixes qualify; one failure discards it.
                found = []
                for othersuffix in self.suffixes:
                    if othersuffix == suffix:
                        continue
                    otherpattern_s = self._stem(pattern_s, suffix) + othersuffix
                    try:
                        otherpattern = classencoder.buildpattern(otherpattern_s, False, False)
                    except KeyError:
                        found = []
                        break
                    if otherpattern not in model:
                        found = []
                        break
                    if maxratio != 0:
                        freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                        ratio = max(freqs) / min(freqs)
                        # maxratio is documented as the maximally allowed
                        # frequency difference, so reject pairs exceeding it.
                        if ratio > maxratio:
                            found = []
                            break
                    found.append(otherpattern_s)

                if found:
                    for s in [pattern_s] + found:
                        self.confusibles.append(s)
                        seen.add(s)

        self.log("Writing confusible list")
        with open(modelfile, 'w', encoding='utf-8') as f:
            for confusible in self.confusibles:
                f.write(confusible + "\n")

    def _trainclassifier(self, sourcefile, modelfile):
        """Generate training instances for all confusibles and train the Timbl classifier."""
        try:
            self.confusibles
        except AttributeError:
            self.log("Loading confusiblefile")
            self._loadconfusibles()

        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        confusibleset = set(self.confusibles)  # O(1) lookups in the hot loop
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                # progress is reported once per line, not once per ngram
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible not in confusibleset:
                        continue
                    try:
                        suffix, normalized = self.getsuffix(confusible)
                    except ValueError:
                        # listed confusible that matches none of the current
                        # suffixes (e.g. a stale .lst file): no instance
                        continue
                    if self.hapaxer:
                        ngram = self.hapaxer(ngram)
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l + 1:])
                    classifier.append(leftcontext + (normalized,) + rightcontext, suffix)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def getsuffix(self, confusible):
        """Return ``(suffix, normalized)`` for *confusible*.

        ``suffix`` is the longest configured suffix the word ends in and
        ``normalized`` is the word with that suffix replaced by the first
        (longest) configured suffix.

        Raises:
            ValueError: if the word ends in none of the configured suffixes.
        """
        assert isinstance(confusible, str)
        for suffix in self.suffixes:  #suffixes are sorted from long to short
            if confusible.endswith(suffix):
                return suffix, self._stem(confusible, suffix) + self.suffixes[0]  #suffix, normalized
        raise ValueError("No suffix found!")

    def classify(self, features):
        """Classify a feature vector.

        Returns:
            ``(best, distribution)`` where ``distribution`` maps each class
            whose normalised weight reaches ``threshold`` to that weight. It
            is an empty dict when the summed weights are below ``minocc``.
        """
        if self.hapaxer:
            features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        # Always hand back a dict (callers use .items()); the <= 0 guard
        # avoids a division by zero when minocc is configured as 0.
        if sumweights <= 0 or sumweights < self.settings['minocc']:
            return best, {}
        distribution = {
            sug: weight / sumweights
            for sug, weight in distribution.items()
            if weight / sumweights >= self.settings['threshold']
        }
        if self.debug:
            self.log("(Returning " + str(len(distribution)) + " suggestions after filtering)")
        return (best, distribution)

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([str(w) for w in word.leftcontext(self.settings['leftcontext'], "<begin>")])
        _, normalized = self.getsuffix(word.text())
        rightcontext = tuple([str(w) for w in word.rightcontext(self.settings['rightcontext'], "<end>")])
        return leftcontext + (normalized,) + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()

        Returns ``(wordstr, features)`` for known confusibles and ``None``
        (implicitly) for every other word.
        """
        wordstr = str(word)
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution = self.classify(features)
        return (best, distribution)

    def processoutput(self, output, inputdata, unit_id, **parameters):
        """Turn a classification into suggestions when the predicted suffix differs from the written one."""
        wordstr, _ = inputdata
        best, distribution = output
        suffix, _ = self.getsuffix(wordstr)
        stem = self._stem(wordstr, suffix)
        if wordstr != stem + best:
            return self.addsuggestions(unit_id, [(stem + suggestion, p) for suggestion, p in distribution.items() if suggestion != suffix])
class TIMBLPuncRecaseModule(Module):
    """This is a memory-based classification module, implemented using Timbl, that predicts where punctuation needs to be inserted, deleted, and whether a word needs to be written with an initial capital.

    NOTE: This module performs badly!!

    Settings:

    * ``leftcontext``  - Left context size (in words) for the feature vector (default: 3)
    * ``rightcontext`` - Right context size (in words) for the feature vector (default: 2)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``deletionthreshold``  - If no punctuation insertion is predicted and this confidence threshold is reached, then a deletion will be predicted (should be a high number), default: 0.95
    * ``insertionthreshold`` - Necessary confidence threshold to predict an insertion of punctuation (default: 0.5)
    * ``capitalizationthreshold`` - Necessary confidence threshold to suggest an initial capital (default: 0.5)

    Sources and models:

    * a plain-text corpus (tokenized) [``.txt``] -> a classifier instance base model [``.ibase``]
    """

    UNIT = folia.Word
    UNITFILTER = nonumbers

    EOSMARKERS = ('.', '?', '!')
    PUNCTUATION = EOSMARKERS + (',', ';', ':')

    def verifysettings(self):
        """Fill in default settings and check that the first model is an ``.ibase`` file.

        Raises:
            Exception: if no model is configured or the first model does not
                have the ``.ibase`` extension.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'missingpunctuation'  #will be overriden later again

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 2

        if 'deletionthreshold' not in self.settings:
            self.settings['deletionthreshold'] = 0.95

        if 'insertionthreshold' not in self.settings:
            self.settings['insertionthreshold'] = 0.5

        if 'capitalizationthreshold' not in self.settings:
            self.settings['capitalizationthreshold'] = 0.5

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        self.hapaxer = gethapaxer(self, self.settings)

        # Only the lookup is guarded: a catch-all around the extension check
        # used to replace its specific message with the generic one below.
        try:
            modelfile = self.models[0]
        except (IndexError, TypeError):
            raise Exception("Expected one model, got 0 or more")
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        """Return the Timbl command-line options for the configured algorithm."""
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models

        Raises:
            Exception: if ``self.models`` is empty.
            IOError: if the model file does not exist.
        """
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()

    def addtraininstance(self, classifier, buffer, l, r):
        """Helper function

        Appends one training instance for the focus word ``buffer[l]`` and
        returns the buffer shifted by one word. The class is the punctuation
        preceding the focus word, followed by ``C`` when the word is
        capitalised, or ``-`` when neither applies. All features are
        lowercased.
        """
        focusword, cased, punc = buffer[l]
        cls = punc
        if cased:
            cls += 'C'
        if not cls:
            cls = '-'
        if self.hapaxer:
            features = [w for w, _, _ in buffer]
            # Hapax the context only and keep the focus word (index l)
            # as is, so the vector has l + 1 + r features, the same as
            # the non-hapaxer branch and getfeatures().
            features = [
                w.lower() for w in
                tuple(self.hapaxer(features[:l])) + (features[l],) + tuple(self.hapaxer(features[l + 1:]))
            ]
        else:
            features = [w.lower() for w, _, _ in buffer]
        classifier.append(tuple(features), cls)
        return buffer[1:]

    def train(self, sourcefile, modelfile, **parameters):
        """Generate training instances from a tokenised corpus and train the classifier.

        Tokens with at least one alphabetic character enter a sliding
        window together with their capitalisation flag and the punctuation
        token directly preceding them (if any). Every full window of
        ``leftcontext + rightcontext + 1`` words yields one instance.
        The corpus may be plain text, ``.gz`` or ``.bz2``.
        """
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        prevword = ""
        buffer = []  # no <begin>/<end> padding is used
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                words = [w.strip() for w in line.split(' ') if w.strip()]
                for word in words:
                    if prevword in TIMBLPuncRecaseModule.PUNCTUATION:
                        punc = prevword
                    else:
                        punc = ""
                    if any(c.isalpha() for c in word):
                        buffer.append((word, word == word[0].upper() + word[1:].lower(), punc))
                    if len(buffer) == l + r + 1:
                        buffer = self.addtraininstance(classifier, buffer, l, r)
                    prevword = word

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def classify(self, word):
        """Extract features for *word* and return ``(best, distribution)`` from the classifier."""
        features = self.getfeatures(word)
        if self.hapaxer:
            features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        return best, distribution

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        leftcontext = []
        currentword = word
        while len(leftcontext) < l:
            prevword = currentword.previous(folia.Word, None)
            if prevword:
                w = prevword.text().lower()
                if w.isalnum():
                    leftcontext.insert(0, w)
                # keep walking even when the token was skipped
                currentword = prevword
            else:
                leftcontext.insert(0, "<begin>")

        rightcontext = []
        currentword = word
        while len(rightcontext) < r:
            nextword = currentword.next(folia.Word, None)
            if nextword:
                w = nextword.text().lower()
                if w.isalnum():
                    rightcontext.append(w)
                # keep walking even when the token was skipped
                currentword = nextword
            else:
                rightcontext.append("<end>")

        return leftcontext + [word.text().lower()] + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()

        Returns ``None`` for tokens without any alphanumeric character,
        otherwise ``(wordstr, prevwordstr, prevword_id, features)``; the
        previous-word fields are empty strings when there is no previous word.
        """
        wordstr = str(word)  #will be reused in processoutput
        if not any(c.isalnum() for c in wordstr):
            #this is punctuation, skip
            return None

        prevword = word.previous(folia.Word, None)
        if prevword:
            prevwordstr = str(prevword)
            prevword_id = prevword.id
        else:
            prevwordstr = ""
            prevword_id = ""

        features = self.getfeatures(word)
        return wordstr, prevwordstr, prevword_id, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        wordstr, prevword, prevword_id, features = inputdata
        if self.debug:
            self.log(" (Processing word " + wordstr + ", features: " + repr(features) + ")")
        if self.hapaxer:
            features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        if self.debug:
            self.log(" (Best: " + best + ")")
        return [best, distribution]

    def processoutput(self, outputdata, inputdata, unit_id, **parameters):
        """Translate a predicted class into deletion, insertion, confusion and recasing queries.

        Returns:
            A list with the results of the ``suggestdeletion``,
            ``suggestinsertion`` and ``addsuggestions`` calls that were made
            (possibly empty).
        """
        queries = []
        wordstr, prevword, prevword_id, _ = inputdata
        cls, distribution = outputdata

        recase = False

        if cls[-1] == 'C':
            if wordstr[0] == wordstr[0].lower():
                if distribution[cls] >= self.settings['capitalizationthreshold']:
                    recase = True
                elif self.debug:
                    self.log(" (Capitalization threshold not reached: " + str(distribution[cls]) + ")")
            cls = cls[:-1]

        if cls == '-':
            if prevword and distribution[cls] >= self.settings['deletionthreshold'] and all(not c.isalpha() for c in prevword):
                if self.debug:
                    self.log(" (Redundant punctuation " + cls + " with threshold " + str(distribution[cls]) + ")")
                queries.append(self.suggestdeletion(prevword_id, (prevword in TIMBLPuncRecaseModule.EOSMARKERS), cls='redundantpunctuation'))
        elif cls and cls in distribution:
            #insertion of punctuation
            if distribution[cls] >= self.settings['insertionthreshold']:
                # An empty prevword means there is no previous token at all;
                # all() over an empty string is True, so without the explicit
                # check that case was treated as "previous token is
                # punctuation" and a suggestion was added for an empty id.
                if prevword and all(not c.isalnum() for c in prevword):
                    #previous word is punctuation already
                    if prevword != cls:
                        self.log(" (Found punctuation confusion)")
                        queries.append(self.addsuggestions(prevword_id, cls, cls='confusion'))
                    else:
                        recase = False  #no punctuation insertion? then no recasing either
                        if self.debug:
                            self.log(" (Predicted punctuation already there, good, ignoring)")
                else:
                    if self.debug:
                        self.log(" (Insertion " + cls + " with threshold " + str(distribution[cls]) + ")")
                    queries.append(self.suggestinsertion(unit_id, cls, (cls in TIMBLPuncRecaseModule.EOSMARKERS)))
            else:
                recase = False  #no punctuation insertion? then no recasing either
                if self.debug:
                    self.log(" (Insertion threshold not reached: " + str(distribution[cls]) + ")")

        if recase and wordstr[0].isalpha():
            #recase word
            t = wordstr[0].upper() + wordstr[1:]
            if self.debug:
                self.log(" (Correcting capitalization for " + wordstr + ")")
            queries.append(self.addsuggestions(unit_id, [t], cls='capitalizationerror'))

        return queries
class TIMBLLMModule(Module):
    """The Language Model predicts words given their context (including right context). It uses a classifier-based approach.

    Settings:

    * ``threshold``     - Prediction confidence threshold, only when a prediction exceeds this threshold will it be recommended (default: 0.9, value must be higher than 0.5 by definition)
    * ``freqthreshold`` - If the previous word occurs below this threshold, then no classification will take place. Only has an effect when a lexicon is enabled (default: 2)
    * ``leftcontext``   - Left context size (in words) for the feature vector (default: 3)
    * ``rightcontext``  - Right context size (in words) for the feature vector (default: 3)
    * ``maxdistance``   - Maximum Levenshtein distance between a word and its correction (larger distances are pruned from suggestions) (default: 2)
    * ``minlength``     - Minimum length (in characters) for a word to be considered by the LM module (default: 5)
    * ``probfactor``    - If the predicted word is in the target distribution, any suggestions must be more probable by this factor (default: 10)
    * ``algorithm``     - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``class``         - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusion)

    Sources and models:

    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    * optional: a plain-text corpus (tokenized)  [``.txt``]     ->    a lexicon model [``.colibri.patternmodel``]

    Hapaxer: This module supports hapaxing
    Caching: This module supports caching
    """

    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        """Fill in default settings and validate the configured model files.

        Raises:
            Exception: when no model is configured, when the first model does
                not end in ``.ibase``, or when a second model is given that
                does not end in ``colibri.patternmodel``.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        # Settings that are read back from self.settings elsewhere in the class.
        for key, default in (('algorithm', 1), ('leftcontext', 3), ('rightcontext', 3), ('maxdistance', 2)):
            if key not in self.settings:
                self.settings[key] = default

        # Settings that are mirrored as attributes (self.settings is left untouched).
        self.threshold = self.settings['threshold'] if 'threshold' in self.settings else 0.9
        self.freqthreshold = self.settings['freqthreshold'] if 'freqthreshold' in self.settings else 2
        self.minlength = self.settings['minlength'] if 'minlength' in self.settings else 5
        self.probfactor = self.settings['probfactor'] if 'probfactor' in self.settings else 10
        self.debug = bool(self.settings['debug']) if 'debug' in self.settings else False

        self.hapaxer = gethapaxer(self, self.settings)
        self.cache = getcache(self.settings, 1000)

        # Validate explicitly instead of wrapping everything in a bare except,
        # so that the specific extension errors actually reach the user.
        if not self.models:
            raise Exception("Expected one or two models, the first a TIMBL instance base, and the optional second a colibri patternmodel, got 0")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("First model must be a TIMBL instance base model, which must have the extension '.ibase', got " + modelfile + " instead")
        if len(self.models) > 1:
            lexiconfile = self.models[1]
            if not lexiconfile.endswith("colibri.patternmodel"):
                raise Exception("Second model must be a Colibri pattern model, which must have the extensions '.colibri.patternmodel', got " + lexiconfile + " instead")

    def gettimbloptions(self):
        """Return the Timbl command line options used for training and loading."""
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models

        Raises:
            Exception: when no models are configured.
            IOError: when a configured model file does not exist.
        """
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        self.log("Loading models...")
        if len(self.models) == 2:
            modelfile, lexiconfile = self.models
        else:
            modelfile = self.models[0]
            lexiconfile = None

        if not os.path.exists(modelfile):
            raise IOError("Missing expected timbl model file: " + modelfile + ". Did you forget to train the system?")
        if lexiconfile and not os.path.exists(lexiconfile):
            raise IOError("Missing expected lexicon model file: " + lexiconfile + ". Did you forget to train the system?")

        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase", "")  # extension has been verified in verifysettings
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), threading=True, debug=self.debug)
        self.classifier.load()

        if lexiconfile:
            self.log("Loading colibri model file for lexicon " + lexiconfile)
            self.classencoder = colibricore.ClassEncoder(lexiconfile + '.cls')
            self.lexicon = colibricore.UnindexedPatternModel(lexiconfile)
        else:
            self.lexicon = None

    def train(self, sourcefile, modelfile, **parameters):
        """Train the model identified by ``modelfile`` from ``sourcefile``.

        ``.ibase`` targets train the Timbl classifier, ``.patternmodel``
        targets build the unigram lexicon. Any other extension is ignored.
        """
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        if modelfile.endswith('.ibase'):
            self._trainclassifier(sourcefile, modelfile)
        elif modelfile.endswith('.patternmodel'):
            self._trainlexicon(sourcefile, modelfile)

    def _trainclassifier(self, sourcefile, modelfile):
        """Generate one instance per n-gram of the corpus and train/save the Timbl classifier."""
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  # extension has been verified by the caller
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())

        # Pick the opener matching the (optional) compression of the corpus.
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        with iomodule.open(sourcefile, mode='rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    if self.hapaxer:
                        ngram = self.hapaxer(ngram)
                    focus = ngram[l]
                    if self.hapaxer and focus == self.hapaxer.placeholder:
                        # the hapaxer placeholder is never used as a class label
                        continue
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l + 1:])
                    classifier.append(leftcontext + rightcontext, focus)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def _trainlexicon(self, sourcefile, modelfile):
        """Build the colibri class file, encoded corpus and unigram pattern model."""
        self.log("Preparing to generate lexicon for Language Model")
        sourcebase = stripsourceextensions(sourcefile)
        classfile = sourcebase + ".cls"
        corpusfile = sourcebase + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile)

        if not os.path.exists(modelfile + '.cls'):
            # make symlink to class file, using model name instead of source name
            os.symlink(classfile, modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating pattern model")
        # verifysettings() stores the (defaulted) value on the instance; reading
        # self.settings['freqthreshold'] raised KeyError when it was not configured.
        options = colibricore.PatternModelOptions(mintokens=self.freqthreshold, minlength=1, maxlength=1)
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Saving model " + modelfile)
        model.write(modelfile)

    def getfeatures(self, word):
        """Get features at testing time"""
        leftcontext = tuple([str(w) for w in word.leftcontext(self.settings['leftcontext'], "<begin>")])
        rightcontext = tuple([str(w) for w in word.rightcontext(self.settings['rightcontext'], "<end>")])
        return leftcontext + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()

        Returns ``(wordstr, features)`` for words strictly longer than
        ``minlength`` characters, and ``None`` for shorter words.
        """
        wordstr = str(word)  # will be reused in processoutput
        if len(wordstr) > self.minlength:
            features = self.getfeatures(word)
            return wordstr, features
        return None

    def processoutput(self, outputdata, inputdata, unit_id, **parameters):
        """Turn the result of run() into suggestions when the best class differs from the word."""
        wordstr, _ = inputdata
        if wordstr is not None:
            best, distribution = outputdata
            if best != wordstr and distribution:
                return self.addsuggestions(unit_id, distribution)
        return None

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client

        Returns ``(best, distribution)``, or ``(None, None)`` when the word is
        skipped (previous word unknown to hapaxer/lexicon) or needs no correction.
        """
        if self.debug:
            begintime = time.time()

        wordstr = inputdata[0]
        features = tuple(inputdata[1])
        if self.debug:
            self.log(" (Processing word " + wordstr + ", features: " + repr(features) + ")")

        if self.hapaxer:
            features = self.hapaxer(features)  #pylint: disable=not-callable
            previousword = features[self.settings['leftcontext'] - 1]
            if previousword == self.hapaxer.placeholder:
                if self.debug:
                    duration = round(time.time() - begintime, 4)
                    self.log(" (Previous word not in hapaxer, returned in   " + str(duration) + "s)")
                return None, None

        if self.cache is not None:
            try:
                cached = self.cache[features]
            except KeyError:
                pass
            else:
                if self.debug:
                    duration = round(time.time() - begintime, 4)
                    self.log(" (Return from cache in   " + str(duration) + "s)")
                return cached

        if self.lexicon:
            # ensure the previous word exists
            previousword = features[self.settings['leftcontext'] - 1]
            pattern = self.classencoder.buildpattern(previousword)
            if pattern.unknown() or pattern not in self.lexicon:
                if self.debug:
                    duration = round(time.time() - begintime, 4)
                    self.log(" (Previous word not in lexicon, returned in   " + str(duration) + "s)")
                return None, None

        best, distribution, _ = self.classifier.classify(features, allowtopdistribution=False)
        if self.debug:
            duration = round(time.time() - begintime, 4)
            self.log(" (Classification took  " + str(duration) + "s, unfiltered distribution size=" + str(len(distribution)) + ")")

        if self.settings['maxdistance']:
            # filter suggestions that are too distant
            if self.debug:
                begintime = time.time()
            maxdistance = self.settings['maxdistance']
            wordlength = len(wordstr)
            dist = {}
            for key, freq in distribution.items():
                # the cheap length comparison runs before the Levenshtein computation
                if freq >= self.threshold and abs(wordlength - len(key)) <= maxdistance and Levenshtein.distance(wordstr, key) <= maxdistance:
                    dist[key] = freq

            if wordstr in dist:
                # typed word is part of distribution, are any of the candidates far more likely?
                basefreq = dist[wordstr]
                dist = {key: freq for key, freq in dist.items() if key == wordstr or freq > basefreq * self.probfactor}
                if len(dist) == 1:
                    # only the typed word itself is left: no correction necessary
                    return None, None

            if self.debug:
                duration = round(time.time() - begintime, 4)
                self.log(" (Levenshtein filtering took  " + str(duration) + "s, final distribution size=" + str(len(dist)) + ")")
        else:
            dist = [x for x in distribution.items() if x[1] >= self.threshold]

        # The lookup above tolerates a disabled cache, so the store must too.
        if self.cache is not None:
            self.cache.append(features, (best, dist))
        return best, dist
def train(self, sourcefile, modelfile, **parameters):
    """Train either the confusible list or the Timbl classifier.

    When ``modelfile`` equals ``self.confusiblefile`` the corpus is scanned
    for words that occur with every configured suffix and the resulting list
    is written to ``modelfile``. When it equals ``self.modelfile`` the Timbl
    classifier is trained on all n-grams whose focus word is a confusible.
    Any other ``modelfile`` is ignored.
    """
    if modelfile == self.confusiblefile:
        # Build frequency list
        self.log("Preparing to generate lexicon for suffix confusible module")
        sourcebase = stripsourceextensions(sourcefile)
        classfile = sourcebase + ".cls"
        corpusfile = sourcebase + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength'])  # character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)  # unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Finding confusible pairs")
        classdecoder = colibricore.ClassDecoder(classfile)
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        known = set()  # mirrors self.confusibles for O(1) membership tests

        for pattern in model:
            try:
                pattern_s = pattern.tostring(classdecoder)
            except UnicodeDecodeError:
                self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                # without this, the loop went on with a stale (or undefined) pattern_s
                continue
            for suffix in self.suffixes:
                if pattern_s.endswith(suffix) and pattern_s not in known:
                    found = []
                    for othersuffix in self.suffixes:
                        if othersuffix != suffix:
                            otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                            try:
                                otherpattern = classencoder.buildpattern(otherpattern_s, False, False)
                            except KeyError:
                                found = []
                                break
                            if otherpattern not in model:
                                found = []
                                break
                            if self.settings['maxratio'] != 0:
                                freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                ratio = max(freqs) / min(freqs)
                                # NOTE(review): this discards pairs whose ratio is *below*
                                # maxratio; if maxratio is meant as an upper bound on the
                                # frequency difference the comparison looks inverted - confirm.
                                if ratio < self.settings['maxratio']:
                                    found = []
                                    break
                            found.append(otherpattern_s)
                    if found:
                        # every alternative suffix form exists: keep the whole set
                        self.confusibles.append(pattern_s)
                        known.add(pattern_s)
                        for s in found:
                            self.confusibles.append(s)
                            known.add(s)

        self.log("Writing confusible list")
        with open(modelfile, 'w', encoding='utf-8') as f:
            for confusible in self.confusibles:
                f.write(confusible + "\n")

    elif modelfile == self.modelfile:
        try:
            self.confusibles
        except AttributeError:
            # the confusible list was built in an earlier run: read it back
            self.confusibles = []
            self.log("Loading confusiblefile")
            with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        self.confusibles.append(line)

        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())

        # Pick the opener matching the (optional) compression of the corpus.
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        confusibles = set(self.confusibles)  # membership is tested once per n-gram
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                # progress is reported once per 100000 lines, not once per n-gram of such a line
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in confusibles:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        suffix, normalized = self.getsuffix(confusible)
                        if suffix is not None:
                            classifier.append(leftcontext + (normalized,) + rightcontext, suffix)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
class TIMBLLMModule(Module):
    """The Language Model predicts words given their context (including right context). It uses a classifier-based approach.

    Settings:

    * ``threshold``     - Prediction confidence threshold, only when a prediction exceeds this threshold will it be recommended (default: 0.9, value must be higher than 0.5 by definition)
    * ``leftcontext``   - Left context size (in words) for the feature vector (default: 3)
    * ``rightcontext``  - Right context size (in words) for the feature vector (default: 3)
    * ``maxdistance``   - Maximum Levenshtein distance between a word and its correction (larger distances are pruned from suggestions) (default: 2)
    * ``algorithm``     - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``class``         - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: contexterror)

    Sources and models:

    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    """

    UNIT = folia.Word

    def verifysettings(self):
        """Fill in default settings and validate the configured model file.

        Raises:
            Exception: when no model is configured or the first model does not
                end in ``.ibase``.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'contexterror'

        super().verifysettings()

        for key, default in (('algorithm', 1), ('leftcontext', 3), ('rightcontext', 3), ('maxdistance', 2)):
            if key not in self.settings:
                self.settings[key] = default

        # The branches used to be swapped: a missing setting raised KeyError and
        # a configured one was silently replaced by the default.
        if 'threshold' in self.settings:
            self.threshold = self.settings['threshold']
        else:
            self.threshold = 0.9

        self.debug = bool(self.settings['debug']) if 'debug' in self.settings else False

        # NOTE(review): other modules call gethapaxer(self, self.settings); confirm
        # which signature the imported helper expects.
        self.hapaxer = gethapaxer(self.settings)
        self.cache = getcache(self.settings, 1000)

        # Validate explicitly instead of through a bare except, which used to
        # replace the specific extension error with the generic message.
        if not self.models:
            raise Exception("Expected one model, got 0 or more")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        """Return the Timbl command line options used for training and loading."""
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models

        Raises:
            Exception: when no models are configured.
            IOError: when the model file does not exist.
        """
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")

        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase", "")  # extension has been verified in verifysettings
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        """Generate one instance per n-gram of ``sourcefile`` and train/save the classifier."""
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())

        # Pick the opener matching the (optional) compression of the corpus.
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        with iomodule.open(sourcefile, mode='rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    focus = ngram[l]
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l + 1:])
                    classifier.append(leftcontext + rightcontext, focus)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def getfeatures(self, word):
        """Get features at testing time"""
        leftcontext = tuple([str(w) for w in word.leftcontext(self.settings['leftcontext'], "<begin>")])
        rightcontext = tuple([str(w) for w in word.rightcontext(self.settings['rightcontext'], "<end>")])
        return leftcontext + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)  # will be reused in processoutput
        features = self.getfeatures(word)
        if self.hapaxer:
            features = self.hapaxer(features)  #pylint: disable=not-callable
        return wordstr, features

    def processoutput(self, outputdata, inputdata, unit_id, **parameters):
        """Turn the result of run() into suggestions when the best class differs from the word."""
        wordstr, _ = inputdata
        best, distribution = outputdata
        if best != wordstr and distribution:
            return self.addsuggestions(unit_id, distribution)
        return None

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        wordstr = inputdata[0]
        features = tuple(inputdata[1])
        if self.debug:
            begintime = time.time()

        if self.cache is not None:
            try:
                cached = self.cache[features]
            except KeyError:
                pass
            else:
                if self.debug:
                    duration = round(time.time() - begintime, 4)
                    self.log(" (Return from cache in   " + str(duration) + "s)")
                return cached

        best, distribution, _ = self.classifier.classify(features, True)  # True=thread-safe
        if self.debug:
            duration = round(time.time() - begintime, 4)
            self.log(" (Classification took  " + str(duration) + "s, unfiltered distribution size=" + str(len(distribution)) + ")")

        if self.settings['maxdistance']:
            # filter suggestions that are too distant
            if self.debug:
                begintime = time.time()
            maxdistance = self.settings['maxdistance']
            wordlength = len(wordstr)
            dist = {}
            for key, freq in distribution.items():
                # the cheap length comparison runs before the Levenshtein computation
                if freq >= self.threshold and abs(wordlength - len(key)) <= maxdistance and Levenshtein.distance(wordstr, key) <= maxdistance:
                    dist[key] = freq
            if self.debug:
                duration = round(time.time() - begintime, 4)
                self.log(" (Levenshtein filtering took  " + str(duration) + "s, final distribution size=" + str(len(dist)) + ")")
        else:
            dist = [x for x in distribution.items() if x[1] >= self.threshold]

        # The lookup above tolerates a disabled cache, so the store must too.
        if self.cache is not None:
            self.cache.append(features, (best, dist))
        return best, dist
class skTiMBL(BaseEstimator, ClassifierMixin):
    """scikit-learn style estimator that delegates to a ``TimblClassifier``.

    The constructor only stores its arguments; the underlying classifier is
    created and trained in :meth:`fit` and stored as ``self.classifier``.
    """

    def __init__(self, prefix='timbl', algorithm=4, dist_metric=None, k=1, normalize=False, debug=0, flushdir=None):
        self.prefix = prefix
        self.algorithm = algorithm
        self.dist_metric = dist_metric
        self.k = k
        self.normalize = normalize
        self.debug = debug
        self.flushdir = flushdir

    def _make_timbl_options(self, *options):
        """
        -a algorithm
        -m metric
        -w weighting
        -k amount of neighbours
        -d class voting weights
        -L frequency threshold
        -T which feature index is label
        -N max number of features
        -H turn hashing on/off

        This function still has to be made, for now the appropriate arguments
        can be passed in fit()
        """
        pass

    def fit(self, X, y):
        """Train a Timbl classifier on ``X`` (dense or CSR sparse) with labels ``y``.

        Returns ``self``.
        """
        X, y = check_X_y(X, y, dtype=np.int64, accept_sparse='csr')

        n_rows = X.shape[0]
        self.classes_ = np.unique(y)
        timbloptions = "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1])

        if sp.sparse.issparse(X):
            if self.debug:
                print('Features are sparse, choosing faster learning')

            self.classifier = TimblClassifier(self.prefix, timbloptions,
                                              format='Sparse', debug=True, sklearn=True, flushdir=self.flushdir,
                                              flushthreshold=20000, normalize=self.normalize)

            for i in range(n_rows):
                row = X[i]
                # Timbl's sparse format uses 1-based feature indices
                sparse = ['({},{})'.format(j + 1, c) for j, c in zip(row.indices, row.data)]
                self.classifier.append(sparse, str(y[i]))
        else:
            self.classifier = TimblClassifier(self.prefix, timbloptions,
                                              debug=True, sklearn=True, flushdir=self.flushdir,
                                              flushthreshold=20000, normalize=self.normalize)

            if y.dtype != 'O':
                y = y.astype(str)

            for i in range(n_rows):
                # X is a plain ndarray in this branch; ndarray rows have no
                # .toarray(), so the row is converted directly.
                self.classifier.append(list(X[i]), y[i])

        self.classifier.train()
        return self

    def _timbl_predictions(self, X, part_index, y=None):
        """Classify every row of ``X`` and collect one part of the Timbl result.

        ``part_index`` 0 collects the predicted label as ``np.int64``,
        ``part_index`` 1 collects ``[distance]`` as a one-element list.

        Raises:
            KeyError: for any other ``part_index``.
        """
        if part_index not in (0, 1):
            raise KeyError(part_index)

        X = check_array(X, dtype=np.float64, accept_sparse='csr')
        n_samples = X.shape[0]
        is_sparse = sp.sparse.issparse(X)
        if is_sparse and self.debug:
            print('Features are sparse, choosing faster predictions')

        pred = []
        for i in range(n_samples):
            if is_sparse:
                row = X[i]
                instance = ['({},{})'.format(j + 1, c) for j, c in zip(row.indices, row.data)]
            else:
                # dense ndarray row, see fit()
                instance = list(X[i])
            label, proba, distance = self.classifier.classify(instance)
            if part_index == 0:
                pred.append(np.int64(label))
            else:
                # builtin float: the np.float alias no longer exists in current NumPy
                pred.append([float(distance)])

        return np.array(pred)

    def predict(self, X, y=None):
        """Return the predicted label for every row of ``X``."""
        return self._timbl_predictions(X, part_index=0)

    def predict_proba(self, X, y=None):
        """
        TIMBL is a discrete classifier. It cannot give probability estimations.
        To ensure that scikit-learn functions with TIMBL (and especially metrics
        such as ROC_AUC), this method is implemented.

        For ROC_AUC, the classifier corresponds to a single point in ROC space,
        instead of a probabilistic continuum such as classifiers that can give
        a probability estimation (e.g. Linear classifiers). For an explanation,
        see Fawcett (2005).
        """
        # was a call to the undefined global name `predict`
        return self.predict(X)

    def decision_function(self, X, y=None):
        """
        The decision function is interpreted here as being the distance between
        the instance that is being classified and the nearest point in k space.
        """
        return self._timbl_predictions(X, part_index=1)
class TIMBLSuffixConfusibleModule(Module):
    """The Suffix Confusible module is capable of disambiguating suffixes on words. The suffixes are passed to the ``suffixes`` setting (a list of strings). All words using these suffixes above a certain threshold (``freqthreshold``) will be found at training time and disambiguated using context. The module is implemented using Timbl.

    Settings:

    * ``suffixes``     - List of suffixes (strings) that form a single set of confusibles. (changing this requires retraining)
    * ``freqthreshold``- Only consider words with a suffix that occur at least this many times (default: 20) (changing this requires retraining)
    * ``maxratio``     - Maximum ratio expressing the maximally allowed frequency difference between the confusibles (value > 1, 0 = no limit) (changing this requires retraining)
    * ``minlength``    - Only consider words with a suffix that are at least this long (in characters) (default: 3) (changing this requires retraining)
    * ``maxlength``    - Only consider words with a suffix that are at most this long (in characters) (default: 25) (changing this requires retraining)
    * ``leftcontext``  - Left context size (in words) for the feature vector (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (changing this requires retraining)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree) (changing this requires retraining)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusion)
    * ``threshold``    - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc``       - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:

    * a plain-text corpus (tokenized)  [``.txt``]     ->    a list of confusibles [``.lst``]
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """

    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        """Fill in default settings and locate the ``.ibase`` and ``.lst`` model files.

        Raises:
            Exception: when no suffixes are configured or when either of the
                two expected model files is missing from ``self.models``.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        for key, default in (('algorithm', 1), ('leftcontext', 3), ('rightcontext', 3), ('threshold', 0.8), ('minocc', 5)):
            if key not in self.settings:
                self.settings[key] = default

        self.hapaxer = gethapaxer(self, self.settings)

        if 'suffixes' not in self.settings:
            raise Exception("No suffixes specified for " + self.id + "!")
        # sort from long to short (stable, so equally long suffixes keep their configured order)
        self.suffixes = sorted(self.settings['suffixes'], key=len, reverse=True)

        # settings for computation of confusible list
        for key, default in (
                ('freqthreshold', 20),
                ('maxlength', 25),  # longer words will be ignored
                ('minlength', 3),   # shorter words will be ignored
                ('maxratio', 0),    # no limit
        ):
            if key not in self.settings:
                self.settings[key] = default

        self.debug = bool(self.settings['debug']) if 'debug' in self.settings else False

        ibasefound = lstfound = False
        for filename in self.models:
            if filename.endswith('.ibase'):
                ibasefound = True
                self.modelfile = filename
            elif filename.endswith('.lst'):
                lstfound = True
                self.confusiblefile = filename

        if not ibasefound:
            raise Exception("TIMBL models must have the extension ibase, not model file was supplies with that extension")
        if not lstfound:
            raise Exception("Specify a model file with extension lst that will store all confusibles found")

    def gettimbloptions(self):
        """Return the Timbl command line options used for training and loading."""
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def _loadconfusibles(self):
        """Read the confusible list (one word per line, blank lines skipped) into self.confusibles."""
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: " + self.confusiblefile + ". Did you forget to train the system?")
        with open(self.confusiblefile, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)

    def load(self):
        """Load the requested modules from self.models

        Raises:
            Exception: when no models are configured.
            IOError: when the confusible list or the Timbl model is missing.
        """
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        self._loadconfusibles()

        if not os.path.exists(self.modelfile):
            raise IOError("Missing expected model file: " + self.modelfile + ". Did you forget to train the system?")
        self.log("Loading Timbl model file " + self.modelfile + "...")
        fileprefix = self.modelfile.replace(".ibase", "")  # extension has been verified in verifysettings
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False)  #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def clientload(self):
        """Load only the confusible list, which is all prepareinput() needs."""
        self.log("Loading models (for client)...")
        self._loadconfusibles()

    def train(self, sourcefile, modelfile, **parameters):
        """Train either the confusible list or the Timbl classifier.

        When ``modelfile`` equals ``self.confusiblefile`` the corpus is scanned
        for words that occur with every configured suffix and the resulting
        list is written to ``modelfile``. When it equals ``self.modelfile`` the
        Timbl classifier is trained on all n-grams whose focus word is a
        confusible. Any other ``modelfile`` is ignored.
        """
        if modelfile == self.confusiblefile:
            self._trainconfusibles(sourcefile, modelfile)
        elif modelfile == self.modelfile:
            self._trainclassifier(sourcefile, modelfile)

    def _trainconfusibles(self, sourcefile, modelfile):
        """Find all words that exist with every suffix and write them to ``modelfile``."""
        # Build frequency list
        self.log("Preparing to generate lexicon for suffix confusible module")
        sourcebase = stripsourceextensions(sourcefile)
        classfile = sourcebase + ".cls"
        corpusfile = sourcebase + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength'])  # character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)  # unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Finding confusible pairs")
        classdecoder = colibricore.ClassDecoder(classfile)
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        known = set()  # mirrors self.confusibles for O(1) membership tests

        for pattern in model:
            try:
                pattern_s = pattern.tostring(classdecoder)
            except UnicodeDecodeError:
                self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                # without this, the loop went on with a stale (or undefined) pattern_s
                continue
            for suffix in self.suffixes:
                if pattern_s.endswith(suffix) and pattern_s not in known:
                    found = []
                    for othersuffix in self.suffixes:
                        if othersuffix != suffix:
                            otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                            try:
                                otherpattern = classencoder.buildpattern(otherpattern_s, False, False)
                            except KeyError:
                                found = []
                                break
                            if otherpattern not in model:
                                found = []
                                break
                            if self.settings['maxratio'] != 0:
                                freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                ratio = max(freqs) / min(freqs)
                                # NOTE(review): this discards pairs whose ratio is *below*
                                # maxratio, while the class documentation describes maxratio
                                # as a maximum - the comparison looks inverted, confirm.
                                if ratio < self.settings['maxratio']:
                                    found = []
                                    break
                            found.append(otherpattern_s)
                    if found:
                        # every alternative suffix form exists: keep the whole set
                        self.confusibles.append(pattern_s)
                        known.add(pattern_s)
                        for s in found:
                            self.confusibles.append(s)
                            known.add(s)

        self.log("Writing confusible list")
        with open(modelfile, 'w', encoding='utf-8') as f:
            for confusible in self.confusibles:
                f.write(confusible + "\n")

    def _trainclassifier(self, sourcefile, modelfile):
        """Train and save the Timbl classifier on all n-grams centred on a confusible."""
        if not hasattr(self, 'confusibles'):
            # the confusible list was built in an earlier run: read it back
            # instead of failing with AttributeError
            self.log("Loading confusiblefile")
            self._loadconfusibles()

        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())

        # Pick the opener matching the (optional) compression of the corpus.
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        confusibles = set(self.confusibles)  # membership is tested once per n-gram
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                # progress is reported once per 100000 lines, not once per n-gram of such a line
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in confusibles:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        suffix, normalized = self.getsuffix(confusible)
                        if suffix is not None:
                            classifier.append(leftcontext + (normalized,) + rightcontext, suffix)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def getsuffix(self, confusible):
        """Return ``(suffix, normalized)`` for a word.

        ``suffix`` is the first (longest) configured suffix the word ends in and
        ``normalized`` is the word with that suffix replaced by ``self.suffixes[0]``.

        Raises:
            ValueError: when the word ends in none of the configured suffixes.
        """
        assert isinstance(confusible, str)
        for suffix in self.suffixes:  # suffixes are sorted from long to short
            if confusible.endswith(suffix):
                break
        else:
            # The loop variable used to be left at the last suffix when nothing
            # matched, so this error was unreachable and a wrong suffix was cut off.
            raise ValueError("No suffix found!")
        return suffix, confusible[:-len(suffix)] + self.suffixes[0]  # suffix, normalized

    def classify(self, features):
        """Classify a feature vector.

        Returns ``(best, distribution)`` where ``distribution`` maps each
        suffix whose normalised weight reaches ``threshold`` to that weight.
        It is empty when the summed class weights are below ``minocc``.
        """
        if self.hapaxer:
            features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        if sumweights < self.settings['minocc']:
            # an empty dict rather than a list: processoutput() calls .items() on it
            return best, {}
        distribution = {sug: weight / sumweights for sug, weight in distribution.items() if weight / sumweights >= self.settings['threshold']}
        if self.debug:
            self.log("(Returning " + str(len(distribution)) + " suggestions after filtering)")
        return (best, distribution)

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([str(w) for w in word.leftcontext(self.settings['leftcontext'], "<begin>")])
        _, normalized = self.getsuffix(word.text())
        rightcontext = tuple([str(w) for w in word.rightcontext(self.settings['rightcontext'], "<end>")])
        return leftcontext + (normalized,) + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()

        Returns ``(wordstr, features)`` for known confusibles and ``None`` otherwise.
        """
        wordstr = str(word)
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features
        return None

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution = self.classify(features)
        return (best, distribution)

    def processoutput(self, output, inputdata, unit_id, **parameters):
        """Suggest the alternative suffix forms when the predicted suffix differs from the written one."""
        wordstr, _ = inputdata
        best, distribution = output
        suffix, _ = self.getsuffix(wordstr)
        stem = wordstr[:-len(suffix)]
        if wordstr != stem + best:
            return self.addsuggestions(unit_id, [(stem + suggestion, p) for suggestion, p in distribution.items() if suggestion != suffix])
        return None
class TIMBLPuncRecaseModule(Module):
    """This is a memory-based classification module, implemented using Timbl, that predicts where punctuation needs to be inserted, deleted, and whether a word needs to be written with an initial capital.

    Settings:

    * ``leftcontext`` - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``algorithm`` - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``deletionthreshold`` - If no punctuation insertion is predicted and this confidence threshold is reached, then a deletion will be predicted (should be a high number), default: 0.95
    * ``insertionthreshold`` - Necessary confidence threshold to predict an insertion of punctuation (default: 0.5)
    * ``capitalizationthreshold`` - Necessary confidence threshold to suggest an initial capital (default: 0.5)

    Sources and models:

    * a plain-text corpus (tokenized) [``.txt``] -> a classifier instance base model [``.ibase``]
    """

    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        """Fill in default settings and validate the configured model file.

        Raises an Exception when no model is configured or when the first
        model does not carry the ``.ibase`` extension.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'missingpunctuation' #will be overriden later again

        super().verifysettings()

        defaults = (
            ('algorithm', 1),
            ('leftcontext', 3),
            ('rightcontext', 2),
            ('deletionthreshold', 0.95),
            ('insertionthreshold', 0.5),
            ('capitalizationthreshold', 0.5),
        )
        for key, value in defaults:
            if key not in self.settings:
                self.settings[key] = value

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        self.hapaxer = gethapaxer(self, self.settings)

        #checked explicitly rather than through a catch-all except, which
        #used to replace the extension error below by the "0 or more" one
        if not self.models:
            raise Exception("Expected one model, got 0 or more")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        """Return the Timbl option string, including the configured algorithm"""
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()

    def addtraininstance(self,classifier, buffer,l,r):
        """Helper function: append one training instance built from a full buffer.

        ``buffer`` holds ``(word, cased, punc)`` tuples and the focus word is
        at index ``l``. The class label is the punctuation stored with the
        focus word, followed by 'C' when the focus word is cased, or '-' when
        neither applies. Returns the buffer without its first element.
        """
        _, cased, punc = buffer[l]
        cls = punc
        if cased:
            cls += 'C'
        if not cls:
            cls = '-'
        if self.hapaxer:
            features = [w for w,_,_ in buffer]
            #only the context is hapaxed; the focus word at index l is kept, so
            #the vector has l + 1 + r features just like the non-hapaxed one
            features = [w.lower() for w in self.hapaxer(features[:l]) + (features[l],) + self.hapaxer(features[l+1:])]
        else:
            features = [w.lower() for w,_,_ in buffer]
        classifier.append( tuple(features) , cls )
        return buffer[1:]

    def train(self, sourcefile, modelfile, **parameters):
        """Train the classifier on a tokenized plain-text corpus and save it.

        ``sourcefile`` may be plain text or compressed (``.bz2``, ``.gz``).
        Only tokens containing an alphabetic character enter the buffer; a
        preceding token found in PUNCTUATION is stored with the word.
        """
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        prevword = ""
        buffer = [] #no <begin>/<end> padding is added at training time
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8',errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                words = [ w.strip() for w in line.split(' ') if w.strip() ]
                for word in words:
                    if prevword in PUNCTUATION:
                        punc = prevword
                    else:
                        punc = ""
                    if any( c.isalpha() for c in word ):
                        buffer.append( (word, word == word[0].upper() + word[1:].lower(), punc ) )
                    if len(buffer) == l + r + 1:
                        buffer = self.addtraininstance(classifier, buffer,l,r)
                    prevword = word

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def classify(self, word):
        """Extract features for the word and classify them; returns (best, distribution)"""
        features = self.getfeatures(word)
        if self.hapaxer:
            features = self.hapaxer(features)
        best, distribution,_ = self.classifier.classify(features)
        return best, distribution

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        #walk backwards, keeping only alphanumeric tokens, padding with <begin>
        leftcontext = []
        currentword = word
        while len(leftcontext) < l:
            prevword = currentword.previous(folia.Word,None)
            if prevword:
                w = prevword.text().lower()
                if w.isalnum():
                    leftcontext.insert(0, w )
                currentword = prevword
            else:
                leftcontext.insert(0, "<begin>")

        #walk forwards, keeping only alphanumeric tokens, padding with <end>
        rightcontext = []
        currentword = word
        while len(rightcontext) < r:
            nextword = currentword.next(folia.Word,None)
            if nextword:
                w = nextword.text().lower()
                if w.isalnum():
                    rightcontext.append(w )
                currentword = nextword
            else:
                rightcontext.append("<end>")

        return leftcontext + [word.text().lower()] + rightcontext

    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        if not any( c.isalnum() for c in wordstr):
            #this is punctuation, skip
            return None

        prevword = word.previous(folia.Word,None)
        if prevword:
            prevwordstr = str(prevword)
            prevword_id = prevword.id
        else:
            prevwordstr = ""
            prevword_id = ""
        features = self.getfeatures(word)
        return wordstr, prevwordstr, prevword_id,features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        wordstr,prevword,prevword_id, features = inputdata
        if self.debug:
            self.log(" (Processing word " + wordstr + ", features: " + repr(features) + ")")
        if self.hapaxer:
            features = self.hapaxer(features)
        best,distribution,_ = self.classifier.classify(features)
        if self.debug:
            self.log(" (Best: " + best + ")")
        return [best,distribution]

    def processoutput(self, outputdata, inputdata, unit_id,**parameters):
        """Translate a predicted class into deletion, insertion, confusion and recasing queries.

        Returns the list of queries produced for this word (possibly empty).
        """
        queries = []
        wordstr,prevword,prevword_id, _ = inputdata
        cls, distribution = outputdata

        recase = False

        if cls[-1] == 'C':
            if wordstr[0] == wordstr[0].lower():
                if distribution[cls] >= self.settings['capitalizationthreshold']:
                    recase = True
                elif self.debug:
                    self.log(" (Capitalization threshold not reached: " + str(distribution[cls]) + ")")
            cls = cls[:-1] #what remains is the punctuation part (possibly empty)

        if cls == '-':
            if prevword and distribution[cls] >= self.settings['deletionthreshold'] and all( not c.isalpha() for c in prevword ):
                if self.debug:
                    self.log(" (Redundant punctuation " + cls + " with threshold " + str(distribution[cls]) + ")")
                queries.append( self.suggestdeletion(prevword_id,(prevword in EOSMARKERS), cls='redundantpunctuation') )
        elif cls and cls in distribution:
            #insertion of punctuation
            if distribution[cls] >= self.settings['insertionthreshold']:
                #NOTE(review): an empty prevword (no previous word) also passes
                #this test and ends up in the confusion branch with an empty
                #prevword_id - confirm whether that is intended
                if all(not c.isalnum() for c in prevword):
                    #previous word is punctuation already
                    if prevword != cls:
                        self.log(" (Found punctuation confusion)")
                        queries.append( self.addsuggestions(prevword_id,cls, cls='confusion') )
                    else:
                        recase = False #no punctuation insertion? then no recasing either
                        if self.debug:
                            self.log(" (Predicted punctuation already there, good, ignoring)")
                else:
                    if self.debug:
                        self.log(" (Insertion " + cls + " with threshold " + str(distribution[cls]) + ")")
                    queries.append( self.suggestinsertion(unit_id, cls, (cls in EOSMARKERS) ) )
            else:
                recase = False #no punctuation insertion? then no recasing either
                if self.debug:
                    self.log(" (Insertion threshold not reached: " + str(distribution[cls]) + ")")

        if recase and wordstr[0].isalpha():
            #recase word
            t = wordstr[0].upper() + wordstr[1:]
            if self.debug:
                self.log(" (Correcting capitalization for " + wordstr + ")")
            queries.append( self.addsuggestions( unit_id, [t], cls='capitalizationerror') )

        return queries
class TIMBLWordConfusibleModule(Module):
    """The Word Confusible module is capable of disambiguating two or more words that are often confused, by looking at their context. The module is implemented using memory-based classifiers in Timbl.

    Settings:

    * ``confusibles`` - List of words (strings) that form a single set of confusibles.
    * ``leftcontext`` - Left context size (in words) for the feature vector (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (changing this requires retraining)
    * ``algorithm`` - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree, changing this requires retraining)
    * ``class`` - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusion)
    * ``threshold`` - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc`` - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:

    * a plain-text corpus (tokenized) [``.txt``] -> a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """

    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        """Fill in default settings and validate the confusible set and model file.

        Raises an Exception when no confusibles are configured, when no model
        is configured, or when the first model lacks the ``.ibase`` extension.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        defaults = (
            ('algorithm', 1),
            ('leftcontext', 3),
            ('rightcontext', 3),
            ('threshold', 0.8),
            ('minocc', 5),
        )
        for key, value in defaults:
            if key not in self.settings:
                self.settings[key] = value

        self.hapaxer = gethapaxer(self, self.settings)

        if 'confusibles' not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings['confusibles']

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        #explicit checks instead of a catch-all except, which used to hide
        #the extension error behind the "0 or more" message
        if not self.models:
            raise Exception("Expected one model, got 0 or more")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception(
                "TIMBL models must have the extension ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        """Return the Timbl option string, including the configured algorithm"""
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase", "") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False) #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        """Train the classifier on a tokenized plain-text corpus and save it.

        Every window of leftcontext + 1 + rightcontext tokens whose focus
        word is one of the configured confusibles becomes an instance: the
        context is the feature vector, the focus word the class.
        """
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings['confusibles']:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        classifier.append(leftcontext + rightcontext, confusible)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([str(w) for w in word.leftcontext(self.settings['leftcontext'], "<begin>")])
        rightcontext = tuple([str(w) for w in word.rightcontext(self.settings['rightcontext'], "<end>")])
        return leftcontext + rightcontext

    def classify(self, features):
        """Classify a feature vector and filter the resulting distribution.

        Returns ``(best, distribution)``. The distribution is an empty list
        when the summed class weights stay below ``minocc``; otherwise it is
        a dict of relative weights for the suggestions reaching ``threshold``.
        """
        if self.hapaxer:
            features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        if self.debug:
            self.log("(Classified " + repr(features) + ", best=" + best + ", sumweights=" + str(sumweights) + ", distribution=" + repr(distribution) + ")")
        if sumweights < self.settings['minocc']:
            if self.debug:
                self.log("(Not passing minocc threshold)")
            return best, []
        distribution = {
            sug: weight / sumweights
            for sug, weight in distribution.items()
            if weight / sumweights >= self.settings['threshold']
        }
        if self.debug:
            self.log("(Returning " + str(len(distribution)) + " suggestions after filtering)")
        return best, distribution

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features
        return None

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution = self.classify(features)
        return (best, distribution)

    def processoutput(self, output, inputdata, unit_id, **parameters):
        """Add the filtered distribution as suggestions when the best class differs from the word"""
        wordstr, _ = inputdata
        best, distribution = output
        if best and best != wordstr and distribution:
            return self.addsuggestions(unit_id, list(distribution.items()))
        return None
def train(self, sourcefile, modelfile, **parameters):
    """Train one of the module's models from a plain-text corpus.

    The kind of model is selected by the extension of ``modelfile``:

    * ``.ibase`` - a Timbl classifier predicting the focus word of each
      window from its left and right context
    * ``.patternmodel`` - a colibri-core unigram pattern model serving as
      lexicon, together with a ``.cls`` class file next to the model

    Any other extension results in no model being trained.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    if modelfile.endswith('.ibase'):
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                for ngram in Windower(line, n):
                    if self.hapaxer:
                        ngram = self.hapaxer(ngram)
                    focus = ngram[l]
                    if self.hapaxer and focus == self.hapaxer.placeholder:
                        #a hapaxed focus word is not a useful class to predict
                        continue
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l+1:])
                    classifier.append( leftcontext + rightcontext , focus )

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
    elif modelfile.endswith('.patternmodel'):
        self.log("Preparing to generate lexicon for Language Model")
        sourceprefix = stripsourceextensions(sourcefile)
        classfile = sourceprefix + ".cls"
        corpusfile = sourceprefix + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile)

        if not os.path.exists(modelfile+'.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile( sourcefile, corpusfile)

        self.log("Generating pattern model")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1)
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Saving model " + modelfile)
        model.write(modelfile)
class TIMBLWordConfusibleModule(Module):
    """The Word Confusible module is capable of disambiguating two or more words that are often confused, by looking at their context. The module is implemented using memory-based classifiers in Timbl.

    Settings:

    * ``confusibles`` - List of words (strings) that form a single set of confusibles.
    * ``leftcontext`` - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``algorithm`` - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``class`` - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusible)

    Sources and models:

    * a plain-text corpus (tokenized) [``.txt``] -> a classifier instance base model [``.ibase``]
    """

    UNIT = folia.Word

    def verifysettings(self):
        """Fill in default settings and validate the confusible set and model file.

        Raises an Exception when no confusibles are configured, when no model
        is configured, or when the first model lacks the ``.ibase`` extension.
        """
        if "class" not in self.settings:
            self.settings["class"] = "confusible"

        super().verifysettings()

        for key, value in (("algorithm", 1), ("leftcontext", 3), ("rightcontext", 3)):
            if key not in self.settings:
                self.settings[key] = value

        # NOTE(review): other modules call gethapaxer(self, self.settings);
        # confirm which signature applies here
        self.hapaxer = gethapaxer(self.settings)

        if "confusibles" not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings["confusibles"]

        # explicit checks instead of a catch-all except, which used to hide
        # the extension error behind the "0 or more" message
        if not self.models:
            raise Exception("Expected one model, got 0 or more")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        """Return the Timbl option string, including the configured algorithm"""
        return "-F Tabbed " + "-a " + str(self.settings["algorithm"]) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
        self.classifier = TimblClassifier(
            fileprefix, self.gettimbloptions()
        )  # pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        """Train the classifier on a tokenized plain-text corpus and save it.

        Every window of leftcontext + 1 + rightcontext tokens whose focus
        word is one of the configured confusibles becomes an instance: the
        context is the feature vector, the focus word the class.
        """
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings["leftcontext"]
        r = self.settings["rightcontext"]
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode="rt", encoding="utf-8", errors="ignore") as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings["confusibles"]:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1 :])
                        classifier.append(leftcontext + rightcontext, confusible)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([str(w) for w in word.leftcontext(self.settings["leftcontext"], "<begin>")])
        rightcontext = tuple([str(w) for w in word.rightcontext(self.settings["rightcontext"], "<end>")])
        return leftcontext + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)  # will be reused in processoutput
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            if self.hapaxer:
                features = self.hapaxer(features)
            return wordstr, features
        return None

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution, _ = self.classifier.classify(features)
        return (best, distribution)

    def processoutput(self, output, inputdata, unit_id, **parameters):
        """Add the distribution as suggestions when the best class differs from the word"""
        wordstr, _ = inputdata
        best, distribution = output
        if best != wordstr:
            return self.addsuggestions(unit_id, list(distribution.items()))
        return None
class TIMBLWordConfusibleModule(Module):
    """The Word Confusible module is capable of disambiguating two or more words that are often confused, by looking at their context. The module is implemented using memory-based classifiers in Timbl.

    Settings:

    * ``confusibles`` - List of words (strings) that form a single set of confusibles.
    * ``leftcontext`` - Left context size (in words) for the feature vector (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (changing this requires retraining)
    * ``algorithm`` - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree, changing this requires retraining)
    * ``class`` - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusion)
    * ``threshold`` - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc`` - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:

    * a plain-text corpus (tokenized) [``.txt``] -> a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """

    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        """Apply the default settings, then check the confusibles and the model file.

        An Exception is raised if the ``confusibles`` setting is missing, if
        there is no model, or if the first model does not end in ``.ibase``.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        for key, value in (('algorithm', 1), ('leftcontext', 3), ('rightcontext', 3), ('threshold', 0.8), ('minocc', 5)):
            if key not in self.settings:
                self.settings[key] = value

        self.hapaxer = gethapaxer(self, self.settings)

        if 'confusibles' not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings['confusibles']

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        #no catch-all except here: it would swallow the extension error
        #raised below and report "0 or more" models instead
        if not self.models:
            raise Exception("Expected one model, got 0 or more")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        """Build the option string passed to Timbl from the ``algorithm`` setting"""
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False) #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        """Generate training instances from the corpus, train the classifier and save it.

        An instance is created for each window of leftcontext + 1 +
        rightcontext tokens that has a configured confusible as focus word;
        the surrounding context forms the features, the focus word the class.
        """
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8',errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings['confusibles']:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , confusible )

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([ str(w) for w in word.leftcontext(self.settings['leftcontext'],"<begin>") ])
        rightcontext = tuple([ str(w) for w in word.rightcontext(self.settings['rightcontext'],"<end>") ])
        return leftcontext + rightcontext

    def classify(self, features):
        """Run the classifier on the features and keep only sufficiently probable classes.

        Returns ``(best, distribution)``; the distribution is an empty list
        if the summed weights are below ``minocc``, and otherwise a dict with
        the relative weight of every class that reaches ``threshold``.
        """
        if self.hapaxer:
            features = self.hapaxer(features)
        best,distribution,_ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        if self.debug:
            self.log("(Classified " + repr(features) + ", best=" + best + ", sumweights=" + str(sumweights) + ", distribution=" + repr(distribution) + ")")
        if sumweights < self.settings['minocc']:
            if self.debug:
                self.log("(Not passing minocc threshold)")
            return best, []
        distribution = { sug: weight/sumweights for sug,weight in distribution.items() if weight/sumweights >= self.settings['threshold'] }
        if self.debug:
            self.log("(Returning " + str(len(distribution)) + " suggestions after filtering)")
        return best,distribution

    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features
        return None

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution = self.classify(features)
        return (best,distribution)

    def processoutput(self, output, inputdata, unit_id,**parameters):
        """Pass the filtered distribution on as suggestions if the best class is not the word itself"""
        wordstr, _ = inputdata
        best,distribution = output
        if best and best != wordstr and distribution:
            return self.addsuggestions(unit_id, list(distribution.items()))
        return None
def train(self, sourcefile, modelfile, **parameters):
    """Train one of the two models of the suffix confusible module.

    * When ``modelfile`` is ``self.confusiblefile``: build a unigram
      frequency list of the corpus, collect the words that occur with every
      suffix in ``self.suffixes``, and write them to the confusible list
      (one word per line).
    * When ``modelfile`` is ``self.modelfile``: train and save a Timbl
      classifier that predicts the suffix of each confusible in
      ``self.confusibles`` from its context and its normalized form.
    """
    if modelfile == self.confusiblefile:
        #Build frequency list
        self.log("Preparing to generate lexicon for suffix confusible module")
        sourceprefix = stripsourceextensions(sourcefile)
        classfile = sourceprefix + ".cls"
        corpusfile = sourceprefix + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength']) #character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile( sourcefile, corpusfile)

        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1) #unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Finding confusible pairs")
        classdecoder = colibricore.ClassDecoder(classfile)
        self.confusibles = [] #pylint: disable=attribute-defined-outside-init
        seen = set() #mirrors self.confusibles for constant-time membership tests
        for pattern in model:
            try:
                pattern_s = pattern.tostring(classdecoder)
            except UnicodeDecodeError:
                self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                #skip it: pattern_s would be stale (or not yet assigned) below
                continue
            for suffix in self.suffixes:
                if pattern_s.endswith(suffix) and pattern_s not in seen:
                    found = []
                    for othersuffix in self.suffixes:
                        if othersuffix != suffix:
                            otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                            #a variant that is unknown, unseen or fails the ratio
                            #test disqualifies the entire set
                            try:
                                otherpattern = classencoder.buildpattern(otherpattern_s,False,False)
                            except KeyError:
                                found = []
                                break
                            if otherpattern not in model:
                                found = []
                                break
                            if self.settings['maxratio'] != 0:
                                freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                ratio = max(freqs) / min(freqs)
                                if ratio < self.settings['maxratio']:
                                    found = []
                                    break
                            found.append(otherpattern_s )
                    if found:
                        self.confusibles.append(pattern_s)
                        seen.add(pattern_s)
                        for s in found:
                            self.confusibles.append(s)
                            seen.add(s)

        self.log("Writing confusible list")
        with open(modelfile,'w',encoding='utf-8') as f:
            for confusible in self.confusibles:
                f.write(confusible + "\n")

    elif modelfile == self.modelfile:
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        confusibles = set(self.confusibles) #looked up once for every token
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                #progress is reported once per line, not once per ngram
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in confusibles:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        suffix, normalized = self.getsuffix(confusible)
                        if suffix is not None:
                            classifier.append( leftcontext + (normalized,) + rightcontext , suffix )

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
from timbl import TimblClassifier

# Minimal Timbl example: two training instances, then one classification.
classifier = TimblClassifier('test','-a 0 -k 1 +vk')

# (features, class) pairs fed to the classifier before training
for features, label in (
    (('dit','is','een'), 'idee'),
    (('dat','was','geen'), 'doen'),
):
    classifier.append(features, label)

classifier.train()

# classify an instance that was not part of the training data
r = classifier.classify(('dit','was','geen'))
print(r)