Example #1
    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.confusibles = []  #pylint: disable=attribute-defined-outside-init

        self.log("Loading models...")
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: " +
                          self.confusiblefile +
                          ". Did you forget to train the system?")
        with open(self.confusiblefile, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)
        if not os.path.exists(self.modelfile):
            raise IOError("Missing expected model file: " + self.modelfile +
                          ". Did you forget to train the system?")
        self.log("Loading Timbl model file " + self.modelfile + "...")
        fileprefix = self.modelfile.replace(".ibase",
                                            "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix,
                                          self.gettimbloptions(),
                                          normalize=False)  #pylint: disable=attribute-defined-outside-init
        self.classifier.load()
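For reference, the confusible file read by load() above is a plain-text list with one confusible per line; blank lines are skipped. A minimal standalone sketch of that format (the filename and contents are made up):

# Minimal sketch of the .lst format consumed by load() above; path and words are hypothetical.
with open("suffixconfusibles.lst", 'w', encoding='utf-8') as f:
    f.write("loyaliteit\nloyaliteid\n\n")  #blank lines are tolerated

confusibles = []
with open("suffixconfusibles.lst", 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            confusibles.append(line)
print(confusibles)  # ['loyaliteit', 'loyaliteid']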
Example #2
    def load(self):
        """Load the requested modules from self.models"""
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        self.log("Loading models...")
        if len(self.models) == 2:
            modelfile, lexiconfile = self.models
        else:
            modelfile = self.models[0]
            lexiconfile = None
        if not os.path.exists(modelfile):
            raise IOError("Missing expected timbl model file: " + modelfile + ". Did you forget to train the system?")
        if lexiconfile and not os.path.exists(lexiconfile):
            raise IOError("Missing expected lexicon model file: " + lexiconfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(),threading=True, debug=self.debug)
        self.classifier.load()

        if lexiconfile:
            self.log("Loading colibri model file for lexicon " + lexiconfile)
            self.classencoder = colibricore.ClassEncoder(lexiconfile + '.cls')
            self.lexicon = colibricore.UnindexedPatternModel(lexiconfile)
        else:
            self.lexicon = None
Example #3
    def fit(self, X, y):
        X, y = check_X_y(X, y, dtype=np.int64, accept_sparse='csr')

        n_rows = X.shape[0]
        self.classes_ = np.unique(y)

        if sp.sparse.issparse(X):
            if self.debug: print('Features are sparse, choosing faster learning')

            self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm,self.k, X.shape[1]),
                                              format='Sparse', debug=True, sklearn=True, flushdir=self.flushdir,
                                              flushthreshold=20000, normalize=self.normalize)

            for i in range(n_rows):
                #encode each nonzero column as '(index,value)'; the column index must not shadow the row index i
                sparse = ['({},{})'.format(j + 1, v) for j, v in zip(X[i].indices, X[i].data)]
                self.classifier.append(sparse, str(y[i]))

        else:

            self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1]),
                                              debug=True, sklearn=True, flushdir=self.flushdir, flushthreshold=20000,
                                              normalize=self.normalize)

            if y.dtype != 'O':
                y = y.astype(str)

            for i in range(n_rows):
                self.classifier.append(list(X[i].toarray()[0]), y[i])

        self.classifier.train()
        return self
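The sparse branch above turns each CSR row into Timbl feature strings. A standalone sketch of that encoding, assuming (as the code implies) that Timbl's Sparse format takes 1-based (column,value) pairs:

# Standalone sketch of the sparse instance encoding used in fit() above.
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[0, 3, 0, 1]], dtype=np.int64))
row = X[0]
features = ['({},{})'.format(j + 1, v) for j, v in zip(row.indices, row.data)]
print(features)  # ['(2,3)', '(4,1)']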
Example #4
    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")


        self.confusibles = []#pylint: disable=attribute-defined-outside-init

        self.log("Loading models...")
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: "  + self.confusiblefile + ". Did you forget to train the system?")
        with open(self.confusiblefile,'r',encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)
        if not os.path.exists(self.modelfile):
            raise IOError("Missing expected model file: " + self.modelfile + ". Did you forget to train the system?")
        self.log("Loading Timbl model file " + self.modelfile + "...")
        fileprefix = self.modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False) #pylint: disable=attribute-defined-outside-init
        self.classifier.load()
Example #5
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        prevword = ""
        #buffer = [("<begin>",False,'')] * l
        buffer = []
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8',errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                words = [ w.strip() for w in line.split(' ') if w.strip() ]
                for word in words:
                    if prevword in TIMBLPuncRecaseModule.PUNCTUATION:
                        punc = prevword
                    else:
                        punc = ""
                    if any( c.isalpha() for c in word ):
                        buffer.append( (word, word == word[0].upper() + word[1:].lower(), punc ) )
                    if len(buffer) == l + r + 1:
                        buffer = self.addtraininstance(classifier, buffer,l,r)
                    prevword = word
        #for i in range(0,r):
        #    buffer.append( ("<end>",False,'') )
        #    if len(buffer) == l + r + 1:
        #        buffer = self.addtraininstance(classifier, buffer,l,r)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
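The training loop above maintains a sliding buffer of (word, was_capitalized, preceding_punctuation) triples and emits one instance per full window. A hedged standalone sketch of that mechanism (l and r stand in for the module's leftcontext/rightcontext settings):

# Hypothetical standalone sketch of the sliding-window buffering used in train() above.
PUNCTUATION = ('.', '?', '!', ',', ';', ':')
l, r = 2, 1
buffer, prevword = [], ""
for word in "so , he Said hello world .".split():
    punc = prevword if prevword in PUNCTUATION else ""
    if any(c.isalpha() for c in word):
        cased = word == word[0].upper() + word[1:].lower()
        buffer.append((word, cased, punc))
    if len(buffer) == l + r + 1:
        focus = buffer[l]  #the class encodes the focus word's punctuation and casing
        cls = focus[2] + ('C' if focus[1] else '') or '-'
        print([w.lower() for w, _, _ in buffer], '->', cls)
        buffer = buffer[1:]  #slide the window by one word
    prevword = word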
Example #6
File: lm.py Project: wollmers/gecco
    def train(self, sourcefile, modelfile, **parameters):
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                for ngram in Windower(line, n):
                    focus = ngram[l]
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l+1:])
                    classifier.append( leftcontext + rightcontext , focus )

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
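Each instance above consists of l left-context words plus r right-context words as features, with the middle word of the window as the class. A hedged sketch of that layout, assuming pynlpl's Windower (which the imports in this project suggest, and which pads lines with <begin>/<end> markers):

# Hedged sketch of the feature layout built in train() above.
from pynlpl.textprocessors import Windower

l = r = 1
n = l + 1 + r
for ngram in Windower("the cat sat", n):
    focus = ngram[l]
    features = tuple(ngram[:l]) + tuple(ngram[l + 1:])
    print(features, '->', focus)
# prints context/focus pairs such as ('the', 'sat') -> cat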
Example #7
    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile +
                          ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()
Example #8
File: lm.py Project: wollmers/gecco
    def load(self):
        """Load the requested modules from self.models"""
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()
Example #9
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        prevword = ""
        #buffer = [("<begin>",False,'')] * l
        buffer = []
        with iomodule.open(sourcefile,
                           mode='rt',
                           encoding='utf-8',
                           errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
                        " - " + str(i),
                        file=sys.stderr)
                words = [w.strip() for w in line.split(' ') if w.strip()]
                for word in words:
                    if prevword in TIMBLPuncRecaseModule.PUNCTUATION:
                        punc = prevword
                    else:
                        punc = ""
                    if any(c.isalpha() for c in word):
                        buffer.append(
                            (word, word == word[0].upper() + word[1:].lower(),
                             punc))
                    if len(buffer) == l + r + 1:
                        buffer = self.addtraininstance(classifier, buffer, l,
                                                       r)
                    prevword = word
        #for i in range(0,r):
        #    buffer.append( ("<end>",False,'') )
        #    if len(buffer) == l + r + 1:
        #        buffer = self.addtraininstance(classifier, buffer,l,r)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
Example #10
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,
                           mode='rt',
                           encoding='utf-8',
                           errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
                        " - " + str(i),
                        file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings['confusibles']:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        classifier.append(leftcontext + rightcontext,
                                          confusible)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
Example #11
    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False) #pylint: disable=attribute-defined-outside-init
        self.classifier.load()
Example #12
def create_classifier_and_word_freq_list(train_instances, timbl_models_folder, train_users, test_user, tweet_index):

	timbl_model_name = test_user + '.' + '_'.join(train_users) + '.' + str(tweet_index)
	classifier = TimblClassifier(timbl_models_folder + timbl_model_name, '-a 0 -k 1 +vs')
	word_frequencies = Counter()

	for instance in train_instances:
		if instance.author == test_user and instance.original_tweet_index == tweet_index:
			continue

		classifier.append(instance.features, instance.label)
		word_frequencies[instance.label] += 1

	classifier.train()

	return classifier, word_frequencies
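A hypothetical usage sketch; the Instance namedtuple below only mirrors the attributes the function reads and is not part of the original project, and it assumes the function above is importable with python-timbl installed:

# Hypothetical usage sketch for create_classifier_and_word_freq_list() above.
from collections import namedtuple

Instance = namedtuple('Instance', 'author original_tweet_index features label')
train_instances = [
    Instance('alice', 0, ('f1', 'f2'), 'word_a'),
    Instance('bob', 3, ('f3', 'f4'), 'word_b'),
]
#instances belonging to the held-out user/tweet pair are skipped inside the function
classifier, freqs = create_classifier_and_word_freq_list(
    train_instances, 'timbl_models/', ['alice', 'bob'], 'carol', 0)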
Example #13
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings["leftcontext"]
        r = self.settings["rightcontext"]
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode="rt", encoding="utf-8", errors="ignore") as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings["confusibles"]:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1 :])
                        classifier.append(leftcontext + rightcontext, confusible)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
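Several examples above pass n-grams through a hapaxer before appending instances. A hedged toy stand-in for that behaviour (the real hapaxer belongs to the gecco project; only the call shape and the placeholder attribute used above are mirrored here):

# Hedged toy stand-in for the hapaxer: words below a frequency threshold are
# replaced by a placeholder token so Timbl can generalize over rare words.
class ToyHapaxer:
    placeholder = '<hapax>'
    def __init__(self, lexicon, threshold):
        self.lexicon, self.threshold = lexicon, threshold
    def __call__(self, words):
        return tuple(w if self.lexicon.get(w, 0) >= self.threshold else self.placeholder
                     for w in words)

hapaxer = ToyHapaxer({'the': 100, 'cat': 5, 'zyzzyva': 1}, threshold=2)
print(hapaxer(('the', 'zyzzyva', 'cat')))  # ('the', '<hapax>', 'cat')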
Example #14
class TIMBLSuffixConfusibleModule(Module):
    """The Suffix Confusible module is capable of disambiguating suffixes on words. The suffixes are passes to the ``suffixes`` settings (a list of string). All words using these suffixes above a certain threshold (``freqthtreshold``) will be found at training time and disambiguated using context. The module is implemented using Timbl.

    Settings:
    * ``suffixes``     - List of suffixes (strings) that form a single set of confusibles. (changing this requires retraining)
    * ``freqthreshold`` - Only consider words with a suffix that occur at least this many times (changing this requires retraining)
    * ``maxratio``     - Maximum ratio expressing the maximally allowed frequency difference between the confusibles (value > 1, 0 = no limit) (changing this requires retraining)
    * ``minlength``    - Only consider words with a suffix that are at least this long (in characters) (changing this requires retraining)
    * ``maxlength``    - Only consider words with a suffix that are at most this long (in characters) (changing this requires retraining)
    * ``leftcontext``  - Left context size (in words) for the feature vector (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (changing this requires retraining)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree) (changing this requires retraining)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusion)
    * ``threshold``    - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc``       - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a list of confusibles [``.lst``]
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.settings['threshold'] = 0.8
        if 'minocc' not in self.settings:
            self.settings['minocc'] = 5

        self.hapaxer = gethapaxer(self, self.settings)

        if 'suffixes' not in self.settings:
            raise Exception("No suffixes specified for " + self.id + "!")
        self.suffixes = sorted(
            self.settings['suffixes'],
            key=lambda x: -1 * len(x))  #sort from long to short

        #settings for computation of confusible list
        if 'freqthreshold' not in self.settings:
            self.settings['freqthreshold'] = 20
        if 'maxlength' not in self.settings:
            self.settings['maxlength'] = 25  #longer words will be ignored
        if 'minlength' not in self.settings:
            self.settings['minlength'] = 3  #shorter words will be ignored
        if 'maxratio' not in self.settings:
            self.settings['maxratio'] = 0  #no limit

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        ibasefound = lstfound = False
        for filename in self.models:
            if filename.endswith('.ibase'):
                ibasefound = True
                self.modelfile = filename
            elif filename.endswith('.lst'):
                lstfound = True
                self.confusiblefile = filename

        if not ibasefound:
            raise Exception(
                "TIMBL models must have the extension .ibase; no model file was supplied with that extension"
            )
        if not lstfound:
            raise Exception(
                "Specify a model file with extension lst that will store all confusibles found"
            )

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(
            self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.confusibles = []  #pylint: disable=attribute-defined-outside-init

        self.log("Loading models...")
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: " +
                          self.confusiblefile +
                          ". Did you forget to train the system?")
        with open(self.confusiblefile, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)
        if not os.path.exists(self.modelfile):
            raise IOError("Missing expected model file: " + self.modelfile +
                          ". Did you forget to train the system?")
        self.log("Loading Timbl model file " + self.modelfile + "...")
        fileprefix = self.modelfile.replace(".ibase",
                                            "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix,
                                          self.gettimbloptions(),
                                          normalize=False)  #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def clientload(self):
        self.log("Loading models (for client)...")
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: " +
                          self.confusiblefile +
                          ". Did you forget to train the system?")
        with open(self.confusiblefile, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)

    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log(
                "Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) + ".cls"
            corpusfile = stripsourceextensions(sourcefile) + ".dat"

            if not os.path.exists(classfile):

                self.log("Building class file")
                classencoder = colibricore.ClassEncoder(
                    "", self.settings['minlength'],
                    self.settings['maxlength'])  #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(
                    classfile, self.settings['minlength'],
                    self.settings['maxlength'])

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=self.settings['freqthreshold'],
                minlength=1,
                maxlength=1)  #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = []  #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log(
                        "WARNING: Unable to decode a pattern in the model! Invalid utf-8!"
                    )
                    continue  #skip patterns that cannot be decoded
                for suffix in self.suffixes:
                    if pattern_s.endswith(
                            suffix) and not pattern_s in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(
                                    suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(
                                        otherpattern_s, False, False)
                                except KeyError:
                                    if found: found = []
                                    break
                                if not otherpattern in model:
                                    if found: found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (
                                        model.occurrencecount(pattern),
                                        model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio > self.settings['maxratio']:
                                        #frequency difference exceeds the allowed maximum
                                        if found: found = []
                                        break
                                found.append(otherpattern_s)
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile, 'w', encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")

        elif modelfile == self.modelfile:
            try:
                self.confusibles
            except AttributeError:
                self.confusibles = []
                self.log("Loading confusiblefile")
                with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            self.confusibles.append(line)

            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase",
                                           "")  #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,
                               mode='rt',
                               encoding='utf-8',
                               errors='ignore') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0:
                        print(datetime.datetime.now().strftime(
                            "%Y-%m-%d %H:%M:%S") + " - " + str(i),
                              file=sys.stderr)
                    for ngram in Windower(line, n):
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l + 1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append(
                                    leftcontext + (normalized, ) +
                                    rightcontext, suffix)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()

    def getsuffix(self, confusible):
        assert isinstance(confusible, str)
        for suffix in self.suffixes:  #suffixes are sorted from long to short
            if confusible.endswith(suffix):
                break
        else:
            #no break: none of the suffixes matched
            raise ValueError("No suffix found!")
        return suffix, confusible[:-len(suffix)] + self.suffixes[
            0]  #suffix, normalized form

    def classify(self, features):
        if self.hapaxer: features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        if sumweights < self.settings['minocc']:
            return best, {}
        distribution = {
            sug: weight / sumweights
            for sug, weight in distribution.items()
            if weight / sumweights >= self.settings['threshold']
        }
        if self.debug:
            self.log("(Returning " + str(len(distribution)) +
                     " suggestions after filtering)")
        return (best, distribution)

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([
            str(w)
            for w in word.leftcontext(self.settings['leftcontext'], "<begin>")
        ])
        _, normalized = self.getsuffix(word.text())
        rightcontext = tuple([
            str(w)
            for w in word.rightcontext(self.settings['rightcontext'], "<end>")
        ])
        return leftcontext + (normalized, ) + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution = self.classify(features)
        return (best, distribution)

    def processoutput(self, output, inputdata, unit_id, **parameters):
        wordstr, _ = inputdata
        best, distribution = output
        suffix, _ = self.getsuffix(wordstr)
        if wordstr != wordstr[:-len(suffix)] + best:
            return self.addsuggestions(
                unit_id, [(wordstr[:-len(suffix)] + suggestion, p)
                          for suggestion, p in distribution.items()
                          if suggestion != suffix])
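To make the filtering in classify() above concrete, here is a worked sketch with made-up class weights:

# Worked sketch of the normalization and threshold filtering in classify() above.
distribution = {'-heid': 8.0, '-teit': 1.0, '-iteit': 1.0}
threshold, minocc = 0.8, 5
sumweights = sum(distribution.values())  #10.0, which is >= minocc, so filtering proceeds
filtered = {sug: weight / sumweights
            for sug, weight in distribution.items()
            if weight / sumweights >= threshold}
print(filtered)  # {'-heid': 0.8}: only suggestions at or above the 0.8 threshold survive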
Example #15
class TIMBLPuncRecaseModule(Module):
    """This is a memory-based classification module, implemented using Timbl, that predicts where punctuation needs to be inserted, deleted, and whether a word needs to be written with an initial capital.
    NOTE: This module performs badly!!

    Settings:
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``deletionthreshold`` - If no punctuation insertion is predicted and this confidence threshold is reached, then a deletion will be predicted (should be a high number), default: 0.95
    * ``insertionthreshold`` - Necessary confidence threshold to predict an insertion of punctuation (default: 0.5)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    """

    UNIT = folia.Word
    UNITFILTER = nonumbers

    EOSMARKERS = ('.', '?', '!')
    PUNCTUATION = EOSMARKERS + (',', ';', ':')

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings[
                'class'] = 'missingpunctuation'  #will be overridden later again

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 2

        if 'deletionthreshold' not in self.settings:
            self.settings['deletionthreshold'] = 0.95

        if 'insertionthreshold' not in self.settings:
            self.settings['insertionthreshold'] = 0.5

        if 'capitalizationthreshold' not in self.settings:
            self.settings['capitalizationthreshold'] = 0.5

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        self.hapaxer = gethapaxer(self, self.settings)

        if not self.models:
            raise Exception("Expected one model, got none")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception(
                "TIMBL models must have the extension ibase, got " +
                modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(
            self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile +
                          ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()

    def addtraininstance(self, classifier, buffer, l, r):
        """Helper function"""
        focusword, cased, punc = buffer[l]
        cls = punc
        if cased:
            cls += 'C'
        if not cls:
            cls = '-'
        if self.hapaxer:
            features = [w for w, _, _ in buffer]
            #hapax only the context; keep the focus word (index l) itself
            features = [
                w.lower() for w in self.hapaxer(features[:l]) +
                (features[l], ) + self.hapaxer(features[l + 1:])
            ]
        else:
            features = [w.lower() for w, _, _ in buffer]
        classifier.append(tuple(features), cls)
        return buffer[1:]

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        prevword = ""
        #buffer = [("<begin>",False,'')] * l
        buffer = []
        with iomodule.open(sourcefile,
                           mode='rt',
                           encoding='utf-8',
                           errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
                        " - " + str(i),
                        file=sys.stderr)
                words = [w.strip() for w in line.split(' ') if w.strip()]
                for i, word in enumerate(words):
                    if prevword in TIMBLPuncRecaseModule.PUNCTUATION:
                        punc = prevword
                    else:
                        punc = ""
                    if any(c.isalpha() for c in word):
                        buffer.append(
                            (word, word == word[0].upper() + word[1:].lower(),
                             punc))
                    if len(buffer) == l + r + 1:
                        buffer = self.addtraininstance(classifier, buffer, l,
                                                       r)
                    prevword = word
        #for i in range(0,r):
        #    buffer.append( ("<end>",False,'') )
        #    if len(buffer) == l + r + 1:
        #        buffer = self.addtraininstance(classifier, buffer,l,r)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def classify(self, word):
        features = self.getfeatures(word)
        if self.hapaxer: features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        return best, distribution

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        leftcontext = []
        currentword = word
        while len(leftcontext) < l:
            prevword = currentword.previous(folia.Word, None)
            if prevword:
                w = prevword.text().lower()
                if w.isalnum():
                    leftcontext.insert(0, w)
                currentword = prevword
            else:
                leftcontext.insert(0, "<begin>")

        rightcontext = []
        currentword = word
        while len(rightcontext) < r:
            nextword = currentword.next(folia.Word, None)
            if nextword:
                w = nextword.text().lower()
                if w.isalnum():
                    rightcontext.append(w)
                currentword = nextword
            else:
                rightcontext.append("<end>")

        return leftcontext + [word.text().lower()] + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)  #will be reused in processoutput
        if not any(c.isalnum() for c in wordstr):
            #this is punctuation, skip
            return None
        prevword = word.previous(folia.Word, None)
        if prevword:
            prevwordstr = str(prevword)
            prevword_id = prevword.id
        else:
            prevwordstr = ""
            prevword_id = ""
        features = self.getfeatures(word)
        return wordstr, prevwordstr, prevword_id, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        wordstr, prevword, prevword_id, features = inputdata
        if self.debug:
            self.log(" (Processing word " + wordstr + ", features: " +
                     repr(features) + ")")
        if self.hapaxer: features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        if self.debug:
            self.log(" (Best: " + best + ")")
        return [best, distribution]

    def processoutput(self, outputdata, inputdata, unit_id, **parameters):
        queries = []
        wordstr, prevword, prevword_id, _ = inputdata
        cls, distribution = outputdata

        recase = False

        if cls[-1] == 'C':
            if wordstr[0] == wordstr[0].lower():
                if distribution[cls] >= self.settings[
                        'capitalizationthreshold']:
                    recase = True
                elif self.debug:
                    self.log(" (Capitalization threshold not reached: " +
                             str(distribution[cls]) + ")")
            cls = cls[:-1]

        if cls == '-':
            if prevword and distribution[cls] >= self.settings[
                    'deletionthreshold'] and all(not c.isalpha()
                                                 for c in prevword):
                if self.debug:
                    self.log(" (Redundant punctuation " + cls +
                             " with threshold " + str(distribution[cls]) + ")")
                queries.append(
                    self.suggestdeletion(
                        prevword_id,
                        (prevword in TIMBLPuncRecaseModule.EOSMARKERS),
                        cls='redundantpunctuation'))
        elif cls and cls in distribution:
            #insertion of punctuation
            if distribution[cls] >= self.settings['insertionthreshold']:
                if all(not c.isalnum() for c in prevword):
                    #previous word is punctuation already
                    if prevword != cls:
                        self.log(" (Found punctuation confusion)")
                        queries.append(
                            self.addsuggestions(prevword_id, [cls],
                                                cls='confusion'))
                    else:
                        recase = False  #no punctuation insertion? then no recasing either
                        if self.debug:
                            self.log(
                                " (Predicted punctuation already there, good, ignoring)"
                            )
                else:
                    if self.debug:
                        self.log(" (Insertion " + cls + " with threshold " +
                                 str(distribution[cls]) + ")")
                    queries.append(
                        self.suggestinsertion(
                            unit_id, cls,
                            (cls in TIMBLPuncRecaseModule.EOSMARKERS)))
            else:
                recase = False  #no punctuation insertion? then no recasing either
                if self.debug:
                    self.log(" (Insertion threshold not reached: " +
                             str(distribution[cls]) + ")")

        if recase and wordstr[0].isalpha():
            #recase word
            t = wordstr[0].upper() + wordstr[1:]
            if self.debug:
                self.log(" (Correcting capitalization for " + wordstr + ")")
            queries.append(
                self.addsuggestions(unit_id, [t], cls='capitalizationerror'))

        return queries
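The class labels this module predicts pack two signals: the punctuation to insert before the focus word and a trailing 'C' when the focus word should be capitalized, with '-' encoding neither. A small sketch of that scheme as implied by addtraininstance() and processoutput() above:

# Sketch of the class-label scheme used by TIMBLPuncRecaseModule above.
def decode_label(cls):
    recase = cls.endswith('C')
    if recase:
        cls = cls[:-1]
    punc = '' if cls in ('', '-') else cls  #'-' means no punctuation before the word
    return punc, recase

for label in ('-', 'C', ',', '.C'):
    print(label, '->', decode_label(label))
# - -> ('', False)   C -> ('', True)   , -> (',', False)   .C -> ('.', True)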
Example #16
class TIMBLLMModule(Module):
    """The Language Model predicts words given their context (including right context). It uses a classifier-based approach.

    Settings:
    * ``threshold``    - Prediction confidence threshold, only when a prediction exceeds this threshold will it be recommended (default: 0.9, value must be higher than 0.5 by definition)
    * ``freqthreshold`` - If the previous word occurs below this threshold, then no classification will take place. Only has an effect when a lexicon is enabled (default: 2)
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``maxdistance``  - Maximum Levenshtein distance between a word and its correction (larger distances are pruned from suggestions)
    * ``minlength``    - Minimum length (in characters) for a word to be considered by the LM module
    * ``probfactor``   - If the predicted word is in the target distribution, any suggestions must be more probable by this factor (default: 10)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusion)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    * optional: a plain-text corpus (tokenized)  [``.txt``]     ->    a lexicon model [``.colibri.patternmodel``]

    Hapaxer: This module supports hapaxing
    Caching: This module supports caching
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.threshold = 0.9
        else:
            self.threshold = self.settings['threshold']

        if 'freqthreshold' not in self.settings:
            self.freqthreshold = 2
        else:
            self.freqthreshold = self.settings['freqthreshold']

        if 'minlength' not in self.settings:
            self.minlength = 5
        else:
            self.minlength = self.settings['minlength']

        if 'probfactor' not in self.settings:
            self.probfactor = 10
        else:
            self.probfactor = self.settings['probfactor']


        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2


        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False


        self.hapaxer = gethapaxer(self, self.settings)

        self.cache = getcache(self.settings, 1000)

        if not self.models or len(self.models) > 2:
            raise Exception("Expected one or two models, the first a TIMBL instance base and the optional second a colibri patternmodel, got " + str(len(self.models)))
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("First model must be a TIMBL instance base model, which must have the extension '.ibase', got " + modelfile + " instead")
        if len(self.models) > 1:
            lexiconfile = self.models[1]
            if not lexiconfile.endswith("colibri.patternmodel"):
                raise Exception("Second model must be a Colibri pattern model, which must have the extension '.colibri.patternmodel', got " + lexiconfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        self.log("Loading models...")
        if len(self.models) == 2:
            modelfile, lexiconfile = self.models
        else:
            modelfile = self.models[0]
            lexiconfile = None
        if not os.path.exists(modelfile):
            raise IOError("Missing expected timbl model file: " + modelfile + ". Did you forget to train the system?")
        if lexiconfile and not os.path.exists(lexiconfile):
            raise IOError("Missing expected lexicon model file: " + lexiconfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(),threading=True, debug=self.debug)
        self.classifier.load()

        if lexiconfile:
            self.log("Loading colibri model file for lexicon " + lexiconfile)
            self.classencoder = colibricore.ClassEncoder(lexiconfile + '.cls')
            self.lexicon = colibricore.UnindexedPatternModel(lexiconfile)
        else:
            self.lexicon = None

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[l]
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , focus )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile+'.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating pattern model")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)


    def getfeatures(self, word):
        """Get features at testing time"""
        leftcontext = tuple([ str(w) for w in word.leftcontext(self.settings['leftcontext'],"<begin>") ])
        rightcontext = tuple([ str(w) for w in word.rightcontext(self.settings['rightcontext'],"<end>") ])
        return leftcontext + rightcontext


    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        if len(wordstr) >= self.minlength:
            features = self.getfeatures(word)
            return wordstr, features

    def processoutput(self, outputdata, inputdata, unit_id,**parameters):
        wordstr,_ = inputdata
        if wordstr is not None:
            best,distribution = outputdata
            if best != wordstr and distribution:
                return self.addsuggestions(unit_id, distribution)

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        if self.debug:
            begintime = time.time()

        wordstr = inputdata[0]
        features = tuple(inputdata[1])
        if self.debug:
            self.log(" (Processing word " + wordstr + ", features: " + repr(features) + ")")

        if self.hapaxer:
            features = self.hapaxer(features) #pylint: disable=not-callable
            previousword = features[self.settings['leftcontext'] - 1]
            if previousword == self.hapaxer.placeholder:
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Previous word not in hapaxer, returned in   " + str(duration) + "s)")
                return None,None

        if self.cache is not None:
            try:
                cached = self.cache[features]
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Return from cache in   " + str(duration) + "s)")
                return cached
            except KeyError:
                pass

        if self.lexicon:
            #ensure the previous word exists
            previousword = features[self.settings['leftcontext'] - 1]
            pattern = self.classencoder.buildpattern(previousword)
            if pattern.unknown() or pattern not in self.lexicon:
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Previous word not in lexicon, returned in   " + str(duration) + "s)")
                return None,None
                #if self.settings['rightcontext']:
                #    nextword = features[self.settings['leftcontext']]
                #    pattern = self.classencoder.buildpattern(nextword)
                #    if pattern.unknown() or pattern not in self.lexicon:
                #        return None,None
                #else:
                #    return None,None



        best,distribution,_ = self.classifier.classify(features,allowtopdistribution=False)
        if self.debug:
            duration = round(time.time() - begintime,4)
            self.log(" (Classification took  " + str(duration) + "s, unfiltered distribution size=" + str(len(distribution)) + ")")

        l = len(wordstr)
        if self.settings['maxdistance']:
            #filter suggestions that are too distant
            if self.debug:
                begintime = time.time()
            dist = {}
            for key, freq in distribution.items():
                if freq >= self.threshold and abs(l - len(key)) <= self.settings['maxdistance'] and Levenshtein.distance(wordstr,key) <= self.settings['maxdistance']:
                    dist[key] = freq
            if wordstr in dist:
                #typed word is part of distribution, are any of the candidates far more likely?
                basefreq = dist[wordstr]
                dist = { key: freq for key, freq in dist.items() if key == wordstr or freq > basefreq * self.probfactor }
                if len(dist) == 1:
                    #no correction necessary
                    return None, None
            if self.debug:
                duration = round(time.time() - begintime,4)
                self.log(" (Levenshtein filtering took  " + str(duration) + "s, final distribution size=" + str(len(dist)) + ")")
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
        else:
            dist = [ x for x in distribution.items() if x[1] >= self.threshold ]
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
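
The filtering at the end of run() above combines a confidence threshold, a cheap length pre-check, a Levenshtein distance cutoff, and (when the typed word itself is a candidate) the probfactor rule. A minimal standalone sketch of that pruning step, assuming the python-Levenshtein package; prune_distribution and the sample distribution are illustrative, not part of gecco:

import Levenshtein

def prune_distribution(wordstr, distribution, threshold=0.9, maxdistance=2, probfactor=10):
    """Keep only candidates that are confident enough and close enough to the typed word."""
    l = len(wordstr)
    dist = { key: freq for key, freq in distribution.items()
             if freq >= threshold
             and abs(l - len(key)) <= maxdistance            #cheap length pre-check
             and Levenshtein.distance(wordstr, key) <= maxdistance }
    if wordstr in dist:
        #the typed word is itself a candidate: keep only far more likely alternatives
        basefreq = dist[wordstr]
        dist = { key: freq for key, freq in dist.items()
                 if key == wordstr or freq > basefreq * probfactor }
    return dist

print(prune_distribution("ther", {"there": 0.95, "the": 0.92, "tha": 0.4}))
#-> {'there': 0.95, 'the': 0.92}
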
Example #18
    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log(
                "Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) + ".cls"
            corpusfile = stripsourceextensions(sourcefile) + ".dat"

            if not os.path.exists(classfile):

                self.log("Building class file")
                classencoder = colibricore.ClassEncoder(
                    "", self.settings['minlength'],
                    self.settings['maxlength'])  #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(
                    classfile, self.settings['minlength'],
                    self.settings['maxlength'])

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=self.settings['freqthreshold'],
                minlength=1,
                maxlength=1)  #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = []  #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log(
                        "WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!"
                    )
                    continue  #skip this pattern rather than silently reusing the previous one
                for suffix in self.suffixes:
                    if pattern_s.endswith(
                            suffix) and not pattern_s in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(
                                    suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(
                                        otherpattern_s, False, False)
                                except KeyError:
                                    if found: found = []
                                    break
                                if not otherpattern in model:
                                    if found: found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (
                                        model.occurrencecount(pattern),
                                        model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio > self.settings['maxratio']:  #frequency difference larger than allowed
                                        if found: found = []
                                        break
                                found.append(otherpattern_s)
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile, 'w', encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")

        elif modelfile == self.modelfile:
            try:
                self.confusibles
            except AttributeError:
                self.confusibles = []
                self.log("Loading confusiblefile")
                with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            self.confusibles.append(line)

            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase",
                                           "")  #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,
                               mode='rt',
                               encoding='utf-8',
                               errors='ignore') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0:
                        print(datetime.datetime.now().strftime(
                            "%Y-%m-%d %H:%M:%S") + " - " + str(i),
                              file=sys.stderr)
                    for ngram in Windower(line, n):
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l + 1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append(
                                    leftcontext + (normalized, ) +
                                    rightcontext, suffix)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
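
The training loop above slides a window of leftcontext + 1 + rightcontext tokens over every line and turns the centre token's suffix into the class label. A hedged sketch of just the windowing, assuming Windower behaves like pynlpl.textprocessors.Windower (padding with <begin> and <end> markers and accepting a tokenized line, as in the code above):

from pynlpl.textprocessors import Windower  #the Windower used by the modules above

l, r = 2, 2           #left and right context sizes
n = l + 1 + r         #total window size
line = "then he tried to sneak passed the guard"
for ngram in Windower(line, n):
    focus = ngram[l]                                  #candidate confusible in the centre
    features = tuple(ngram[:l]) + tuple(ngram[l+1:])  #context words only
    print(focus, features)
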
Example #19
File: lm.py Project: wollmers/gecco
class TIMBLLMModule(Module):
    """The Language Model predicts words given their context (including right context). It uses a classifier-based approach.

    Settings:
    * ``threshold``    - Prediction confidence threshold, only when a prediction exceeds this threshold will it be recommended (default: 0.9, value must be higher than 0.5 by definition)
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``maxdistance``  - Maximum Levenshtein distance between a word and its correction (larger distances are pruned from suggestions)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: contexterror) 
    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]

    """
    UNIT = folia.Word

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'contexterror'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' in self.settings:
            self.threshold = self.settings['threshold']
        else:
            self.threshold = 0.9

        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2


        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False


        self.hapaxer = gethapaxer(self.settings)

        self.cache = getcache(self.settings, 1000)

        if len(self.models) != 1:
            raise Exception("Expected exactly one model, got " + str(len(self.models)))
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension .ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                for ngram in Windower(line, n):
                    focus = ngram[l]
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l+1:])
                    classifier.append( leftcontext + rightcontext , focus )

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()



    def getfeatures(self, word):
        """Get features at testing time"""
        leftcontext = tuple([ str(w) for w in word.leftcontext(self.settings['leftcontext'],"<begin>") ])
        rightcontext = tuple([ str(w) for w in word.rightcontext(self.settings['rightcontext'],"<end>") ])
        return leftcontext + rightcontext


    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        features = self.getfeatures(word)
        if self.hapaxer: features = self.hapaxer(features) #pylint: disable=not-callable
        return wordstr, features

    def processoutput(self, outputdata, inputdata, unit_id,**parameters):
        wordstr,_ = inputdata
        best,distribution = outputdata
        if best != wordstr and distribution:
            return self.addsuggestions(unit_id, distribution)

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        wordstr = inputdata[0]
        features = tuple(inputdata[1])
        if self.debug:
            begintime = time.time()
        if self.cache is not None:
            try:
                cached = self.cache[features]
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Return from cache in   " + str(duration) + "s)")
                return cached
            except KeyError:
                pass
        best,distribution,_ = self.classifier.classify(features, True) #True=thread-safe
        if self.debug:
            duration = round(time.time() - begintime,4)
            self.log(" (Classification took  " + str(duration) + "s, unfiltered distribution size=" + str(len(distribution)) + ")")

        l = len(wordstr)
        if self.settings['maxdistance']:
            #filter suggestions that are too distant
            if self.debug:
                begintime = time.time()
            dist = {}
            for key, freq in distribution.items():
                if freq >= self.threshold and abs(l - len(key)) <= self.settings['maxdistance'] and Levenshtein.distance(wordstr,key) <= self.settings['maxdistance']:
                    dist[key] = freq
            if self.debug:
                duration = round(time.time() - begintime,4)
                self.log(" (Levenshtein filtering took  " + str(duration) + "s, final distribution size=" + str(len(dist)) + ")")
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
        else:
            dist = [ x for x in distribution.items() if x[1] >= self.threshold ]
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
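
All of these modules drive python-timbl through the same small API: append() training instances, train(), save() to an .ibase file, then load() and classify() at prediction time. A hedged round-trip sketch of that API; the file prefix and instances are illustrative, and the import path is assumed from python-timbl:

import timbl  #python-timbl, assumed to expose TimblClassifier as used above

classifier = timbl.TimblClassifier("demo", "-F Tabbed -a 1 +D +vdb -G0")  #the options string gettimbloptions() builds
classifier.append(("to", "sneak", "the", "guard"), "passed")    #context features -> class
classifier.append(("walked", "right", "him", "without"), "past")
classifier.train()
classifier.save()   #writes demo.ibase

classifier.load()
best, distribution, distance = classifier.classify(("to", "sneak", "the", "guard"))
print(best, distribution)
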
Example #20
class skTiMBL(BaseEstimator, ClassifierMixin):
    def __init__(self, prefix='timbl', algorithm=4, dist_metric=None,
                 k=1,  normalize=False, debug=0, flushdir=None):
        self.prefix = prefix
        self.algorithm = algorithm
        self.dist_metric = dist_metric
        self.k = k
        self.normalize = normalize
        self.debug = debug
        self.flushdir = flushdir


    def _make_timbl_options(self, *options):
        """
        -a algorithm
        -m metric
        -w weighting
        -k amount of neighbours
        -d class voting weights
        -L frequency threshold
        -T which feature index is label
        -N max number of features
        -H turn hashing on/off

        This function has not been implemented yet; for now the appropriate
        arguments can be passed directly in fit().
        """
        pass


    def fit(self, X, y):
        X, y = check_X_y(X, y, dtype=np.int64, accept_sparse='csr')

        n_rows = X.shape[0]
        self.classes_ = np.unique(y)

        if sp.sparse.issparse(X):
            if self.debug: print('Features are sparse, choosing faster learning')

            self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm,self.k, X.shape[1]),
                                              format='Sparse', debug=True, sklearn=True, flushdir=self.flushdir,
                                              flushthreshold=20000, normalize=self.normalize)

            for i in range(n_rows):
                #Timbl's sparse format: 1-based (featureindex,value) pairs
                sparse = ['({},{})'.format(col+1, val) for col, val in zip(X[i].indices, X[i].data)]
                self.classifier.append(sparse,str(y[i]))

        else:

            self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1]),
                                              debug=True, sklearn=True, flushdir=self.flushdir, flushthreshold=20000,
                                              normalize=self.normalize)

            if y.dtype != 'O':
                y = y.astype(str)

            for i in range(n_rows):
                self.classifier.append(list(X[i]), y[i])  #X is a dense ndarray here, so no .toarray()

        self.classifier.train()
        return self


    def _timbl_predictions(self, X, part_index, y=None):
        #the lambdas close over `label` and `distance`, which are rebound on every
        #iteration of the loops below before func(pred) is called
        choices = {0 : lambda x : x.append(np.int64(label)),
                   1 : lambda x : x.append([np.float64(distance)]),
                  }
        X = check_array(X, dtype=np.float64, accept_sparse='csr')

        n_samples = X.shape[0]

        pred = []
        func = choices[part_index]
        if sp.sparse.issparse(X):
            if self.debug: print('Features are sparse, choosing faster predictions')

            for i in range(n_samples):
                sparse = ['({},{})'.format(col+1, val) for col, val in zip(X[i].indices, X[i].data)]
                label,proba, distance = self.classifier.classify(sparse)
                func(pred)

        else:
            for i in range(n_samples):
                label,proba, distance = self.classifier.classify(list(X[i]))  #dense ndarray row
                func(pred)

        return np.array(pred)



    def predict(self, X, y=None):
        return self._timbl_predictions(X, part_index=0)


    def predict_proba(self, X, y=None):
        """
        TIMBL is a discrete classifier. It cannot give probability estimations.
        To ensure that scikit-learn functions with TIMBL (and especially metrics
        such as ROC_AUC), this method is implemented.

        For ROC_AUC, the classifier corresponds to a single point in ROC space,
        instead of a probabilistic continuum such as classifiers that can give
        a probability estimation (e.g. Linear classifiers). For an explanation,
        see Fawcett (2005).
        """
        return self.predict(X)


    def decision_function(self, X, y=None):
        """
        The decision function is interpreted here as being the distance between
        the instance that is being classified and the nearest point in k space.
        """
        return self._timbl_predictions(X, part_index=1)
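
A hedged usage sketch for the wrapper above on a dense toy matrix, assuming Timbl and python-timbl are installed; the data and prefix are illustrative:

import numpy as np

#toy dense data; real feature values would be class-encoded tokens
X = np.array([[1, 0, 2],
              [0, 1, 3],
              [1, 1, 0],
              [0, 0, 3]], dtype=np.int64)
y = np.array([0, 1, 0, 1])

clf = skTiMBL(prefix='demo', algorithm=4, k=1)
clf.fit(X, y)
print(clf.predict(X))           #class labels, via part_index=0
print(clf.decision_function(X)) #nearest-neighbour distances, via part_index=1
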
Example #21
class TIMBLSuffixConfusibleModule(Module):
    """The Suffix Confusible module is capable of disambiguating suffixes on words. The suffixes are passes to the ``suffixes`` settings (a list of string). All words using these suffixes above a certain threshold (``freqthtreshold``) will be found at training time and disambiguated using context. The module is implemented using Timbl.

    Settings:
    * ``suffixes``     - List of suffixes (strings) that form a single set of confusibles. (changing this requires retraining)
    * ``freqthreshold``- Only consider words with a suffix that occur at least this many times (changing this requires retraining)
    * ``maxratio``     - Maximum ratio expressing the maximally allowed frequency difference between the confusibles (value > 1, 0 = no limit) (changing this requires retraining)
    * ``minlength``    - Only consider words with a suffix that are at least this long (in characters) (changing this requires retraining)
    * ``maxlength``    - Only consider words with a suffix that are at most this long (in characters) (changing this requires retraining)
    * ``leftcontext``  - Left context size (in words) for the feature vector (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (changing this requires retraining)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree) (changing this requires retraining)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusible)
    * ``threshold``    - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc``       - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a list of confusibles [``.lst``]
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.settings['threshold'] = 0.8
        if 'minocc' not in self.settings:
            self.settings['minocc'] = 5

        self.hapaxer = gethapaxer(self, self.settings)


        if 'suffixes' not in self.settings:
            raise Exception("No suffixes specified for " + self.id + "!")
        self.suffixes = sorted(self.settings['suffixes'], key= lambda x: -1* len(x))  #sort from long to short

        #settings for computation of confusible list
        if 'freqthreshold' not in self.settings:
            self.settings['freqthreshold'] = 20
        if 'maxlength' not in self.settings:
            self.settings['maxlength'] = 25 #longer words will be ignored
        if 'minlength' not in self.settings:
            self.settings['minlength'] = 3 #shorter words will be ignored
        if 'maxratio' not in self.settings:
            self.settings['maxratio'] = 0 #no limit

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False


        ibasefound = lstfound = False
        for filename in self.models:
            if filename.endswith('.ibase'):
                ibasefound = True
                self.modelfile = filename
            elif filename.endswith('.lst'):
                lstfound = True
                self.confusiblefile = filename

        if not ibasefound:
            raise Exception("TIMBL models must have the extension .ibase; no model file was supplied with that extension")
        if not lstfound:
            raise Exception("Specify a model file with extension lst that will store all confusibles found")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")


        self.confusibles = []#pylint: disable=attribute-defined-outside-init

        self.log("Loading models...")
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: "  + self.confusiblefile + ". Did you forget to train the system?")
        with open(self.confusiblefile,'r',encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)
        if not os.path.exists(self.modelfile):
            raise IOError("Missing expected model file: " + self.modelfile + ". Did you forget to train the system?")
        self.log("Loading Timbl model file " + self.modelfile + "...")
        fileprefix = self.modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False) #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def clientload(self):
        self.log("Loading models (for client)...")
        self.confusibles = []#pylint: disable=attribute-defined-outside-init
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: "  + self.confusiblefile + ". Did you forget to train the system?")
        with open(self.confusiblefile,'r',encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)

    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log("Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):

                self.log("Building class file")
                classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength']) #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])


            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1) #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)


            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = [] #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                    continue #skip this pattern rather than silently reusing the previous one
                for suffix in self.suffixes:
                    if pattern_s.endswith(suffix) and not pattern_s in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(otherpattern_s,False,False)
                                except KeyError:
                                    if found: found = []
                                    break
                                if not otherpattern in model:
                                    if found: found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio > self.settings['maxratio']: #frequency difference larger than allowed
                                        if found: found = []
                                        break
                                found.append(otherpattern_s )
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile,'w',encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")

        elif modelfile == self.modelfile:
            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8', errors='ignore') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l+1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append( leftcontext + (normalized,) + rightcontext , suffix )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()


    def getsuffix(self, confusible):
        assert isinstance(confusible, str)
        suffix = None
        for candidate in self.suffixes: #suffixes are sorted from long to short
            if confusible.endswith(candidate):
                suffix = candidate
                break
        if suffix is None:
            raise ValueError("No suffix found!")
        return suffix, confusible[:-len(suffix)] + self.suffixes[0]  #suffix, normalized



    def classify(self, features):
        if self.hapaxer: features = self.hapaxer(features)
        best,distribution,_ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        if sumweights < self.settings['minocc']:
            return best, []
        distribution = { sug: weight/sumweights for sug,weight in distribution.items() if weight/sumweights >= self.settings['threshold'] }
        if self.debug: self.log("(Returning " + str(len(distribution)) + " suggestions after filtering)")
        return (best,distribution)

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([ str(w) for w in word.leftcontext(self.settings['leftcontext'],"<begin>") ])
        _, normalized = self.getsuffix(word.text())
        rightcontext = tuple([ str(w) for w in word.rightcontext(self.settings['rightcontext'],"<end>") ])
        return leftcontext + (normalized,) + rightcontext


    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _,features = inputdata
        best,distribution = self.classify(features)
        return (best,distribution)

    def processoutput(self, output, inputdata, unit_id,**parameters):
        wordstr,_ = inputdata
        best,distribution = output
        suffix,_ = self.getsuffix(wordstr)
        if wordstr != wordstr[:-len(suffix)] + best:
            return self.addsuggestions(unit_id, [ (wordstr[:-len(suffix)] + suggestion,p) for suggestion,p in distribution.items() if suggestion != suffix] )
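
getsuffix() maps every confusible onto one canonical form by swapping its suffix for the first (longest) suffix in the list, so all members of a confusible set share a single feature value. A standalone sketch with an illustrative Dutch d/t/dt suffix set (not taken from the source):

suffixes = sorted(["dt", "d", "t"], key=lambda x: -len(x))  #long to short, as in verifysettings()

def getsuffix(word):
    suffix = next((s for s in suffixes if word.endswith(s)), None)
    if suffix is None:
        raise ValueError("No suffix found!")
    return suffix, word[:-len(suffix)] + suffixes[0]  #(suffix, normalized form)

print(getsuffix("gebeurt"))   #-> ('t', 'gebeurdt')
print(getsuffix("gebeurd"))   #-> ('d', 'gebeurdt'), same normalized form
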
Example #22
class TIMBLPuncRecaseModule(Module):
    """This is a memory-based classification module, implemented using Timbl, that predicts where punctuation needs to be inserted, deleted, and whether a word needs to be written with an initial capital. 

    Settings:
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``deletionthreshold`` - If no punctuation insertion is predicted and this confidence threshold is reached, then a deletion will be predicted (should be a high number), default: 0.95
    * ``insertionthreshold`` - Necessary confidence threshold to predict an insertion of punctuation (default: 0.5)

    Sources and models: 
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    """

    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'missingpunctuation' #will be overridden later

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 2

        if 'deletionthreshold' not in self.settings:
            self.settings['deletionthreshold'] = 0.95

        if 'insertionthreshold' not in self.settings:
            self.settings['insertionthreshold'] = 0.5

        if 'capitalizationthreshold' not in self.settings:
            self.settings['capitalizationthreshold'] = 0.5

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        self.hapaxer = gethapaxer(self, self.settings)


        if len(self.models) != 1:
            raise Exception("Expected exactly one model, got " + str(len(self.models)))
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension .ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()


    def addtraininstance(self,classifier, buffer,l,r):
        """Helper function"""
        focusword, cased, punc = buffer[l]
        cls = punc
        if cased:
            cls += 'C'
        if not cls:
            cls = '-'
        if self.hapaxer:
            features = [w for w,_,_ in buffer]
            features = [w.lower() for w in self.hapaxer(features[:l]) + (features[l],) + self.hapaxer(features[l+1:])] #hapax the context only, keep the focus word itself
        else:
            features = [w.lower() for w,_,_ in buffer]
        classifier.append( tuple(features) , cls )
        return buffer[1:]

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        prevword = ""
        #buffer = [("<begin>",False,'')] * l
        buffer = []
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8',errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                words = [ w.strip() for w in line.split(' ') if w.strip() ]
                for word in words:
                    if prevword in PUNCTUATION:
                        punc = prevword
                    else:
                        punc = ""
                    if any(  c.isalpha() for c in word  ):
                        buffer.append( (word, word == word[0].upper() + word[1:].lower(), punc ) )
                    if len(buffer) == l + r + 1:
                        buffer = self.addtraininstance(classifier, buffer,l,r)
                    prevword = word
        #for i in range(0,r):
        #    buffer.append( ("<end>",False,'') )
        #    if len(buffer) == l + r + 1:
        #        buffer = self.addtraininstance(classifier, buffer,l,r)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()


    def classify(self, word):
        features = self.getfeatures(word)
        if self.hapaxer: features = self.hapaxer(features)
        best, distribution,_ = self.classifier.classify(features)
        return best, distribution


    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        leftcontext = []
        currentword = word
        while len(leftcontext) < l:
            prevword = currentword.previous(folia.Word,None)
            if prevword:
                w = prevword.text().lower()
                if w.isalnum():
                    leftcontext.insert(0, w )
                currentword = prevword
            else:
                leftcontext.insert(0, "<begin>")

        rightcontext = []
        currentword = word
        while len(rightcontext) < r:
            nextword = currentword.next(folia.Word,None)
            if nextword:
                w = nextword.text().lower()
                if w.isalnum():
                    rightcontext.append(w )
                currentword = nextword
            else:
                rightcontext.append("<end>")

        return leftcontext + [word.text().lower()] + rightcontext





    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        if not any( c.isalnum() for c in wordstr):
            #this is punctuation, skip
            return None
        prevword = word.previous(folia.Word,None)
        if prevword:
            prevwordstr = str(prevword)
            prevword_id = prevword.id
        else:
            prevwordstr = ""
            prevword_id = ""
        features = self.getfeatures(word)
        return wordstr, prevwordstr, prevword_id,features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        wordstr,prevword,prevword_id, features = inputdata
        if self.debug:
            self.log(" (Processing word " + wordstr + ", features: " + repr(features) + ")")
        if self.hapaxer: features = self.hapaxer(features)
        best,distribution,_ = self.classifier.classify(features)
        if self.debug:
            self.log(" (Best: "  + best + ")")
        return [best,distribution]

    def processoutput(self, outputdata, inputdata, unit_id,**parameters):
        queries = []
        wordstr,prevword,prevword_id, _ = inputdata
        cls, distribution = outputdata

        recase = False

        if cls[-1] == 'C':
            if wordstr[0] == wordstr[0].lower():
                if distribution[cls] >= self.settings['capitalizationthreshold']:
                    recase = True
                elif self.debug:
                    self.log(" (Capitalization threshold not reached: " + str(distribution[cls]) + ")")
            cls = cls[:-1]


        if cls == '-':
            if prevword and distribution[cls] >= self.settings['deletionthreshold'] and all( not c.isalpha() for c in  prevword ):
                if self.debug:
                    self.log(" (Redundant punctuation " + cls + " with threshold " + str(distribution[cls]) + ")")
                queries.append( self.suggestdeletion(prevword_id,(prevword in EOSMARKERS), cls='redundantpunctuation') )
        elif cls and cls in distribution:
            #insertion of punctuation
            if distribution[cls] >= self.settings['insertionthreshold']:
                if all(not c.isalnum() for c in prevword):
                    #previous word is punctuation already
                    if prevword != cls:
                        self.log(" (Found punctuation confusion)")
                        queries.append( self.addsuggestions(prevword_id,cls, cls='confusion') )
                    else:
                        recase = False #no punctuation insertion? then no recasing either
                        if self.debug: self.log(" (Predicted punctuation already there, good, ignoring)")
                else:
                    if self.debug: self.log(" (Insertion " + cls + " with threshold " + str(distribution[cls]) + ")")
                    queries.append( self.suggestinsertion(unit_id, cls, (cls in EOSMARKERS) ) )
            else:
                recase = False #no punctuation insertion? then no recasing either
                if self.debug: self.log(" (Insertion threshold not reached: " + str(distribution[cls]) + ")")

        if recase and wordstr[0].isalpha():
            #recase word
            t = wordstr[0].upper() + wordstr[1:]
            if self.debug:
                self.log(" (Correcting capitalization for " + wordstr + ")")
            queries.append( self.addsuggestions( unit_id, [t], cls='capitalizationerror') )

        return queries
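
The class labels that addtraininstance() emits and processoutput() decodes pack two decisions into one string: the punctuation (if any) that should precede the focus word, plus a trailing 'C' if the focus word should be capitalized; '-' means neither. A small sketch of that encoding (encode_class is an illustrative name):

def encode_class(punc, cased):
    #mirrors the label construction in addtraininstance() above
    cls = punc
    if cased:
        cls += 'C'
    return cls if cls else '-'

print(encode_class('', False))  #-> '-'   nothing to do
print(encode_class('', True))   #-> 'C'   capitalize only
print(encode_class('.', False)) #-> '.'   insert a period before the word
print(encode_class('.', True))  #-> '.C'  insert a period and capitalize
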
Example #23
class TIMBLLMModule(Module):
    """The Language Model predicts words given their context (including right context). It uses a classifier-based approach.

    Settings:
    * ``threshold``    - Prediction confidence threshold, only when a prediction exceeds this threshold will it be recommended (default: 0.9, value must be higher than 0.5 by definition)
    * ``freqthreshold`` - If the previous word occurs below this threshold, then no classification will take place. Only has an effect when a lexicon is enabled (default: 2)
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``maxdistance``  - Maximum Levenshtein distance between a word and its correction (larger distances are pruned from suggestions)
    * ``minlength``    - Minimum length (in characters) for a word to be considered by the LM module
    * ``probfactor``   - If the predicted word is in the target distribution, any suggestions must be more probable by this factor (default: 10)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusion) 

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    * optional: a plain-text corpus (tokenized)  [``.txt``]     ->    a lexicon model [``.colibri.patternmodel``]

    Hapaxer: This module supports hapaxing
    Caching: This module supports caching
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.threshold = 0.9
        else:
            self.threshold = self.settings['threshold']

        if 'freqthreshold' not in self.settings:
            self.freqthreshold = 2
        else:
            self.freqthreshold = self.settings['freqthreshold']

        if 'minlength' not in self.settings:
            self.minlength = 5
        else:
            self.minlength = self.settings['minlength']

        if 'probfactor' not in self.settings:
            self.probfactor = 10
        else:
            self.probfactor = self.settings['probfactor']


        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2


        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False


        self.hapaxer = gethapaxer(self, self.settings)

        self.cache = getcache(self.settings, 1000)

        if not 1 <= len(self.models) <= 2:
            raise Exception("Expected one or two models, the first a TIMBL instance base, and the optional second a colibri patternmodel, got " + str(len(self.models)))
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("First model must be a TIMBL instance base model, which must have the extension '.ibase', got " + modelfile + " instead")
        if len(self.models) > 1:
            lexiconfile = self.models[1]
            if not lexiconfile.endswith("colibri.patternmodel"):
                raise Exception("Second model must be a Colibri pattern model, which must have the extension '.colibri.patternmodel', got " + lexiconfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        self.log("Loading models...")
        if len(self.models) == 2:
            modelfile, lexiconfile = self.models
        else:
            modelfile = self.models[0]
            lexiconfile = None
        if not os.path.exists(modelfile):
            raise IOError("Missing expected timbl model file: " + modelfile + ". Did you forget to train the system?")
        if lexiconfile and not os.path.exists(lexiconfile):
            raise IOError("Missing expected lexicon model file: " + lexiconfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(),threading=True, debug=self.debug) 
        self.classifier.load()

        if lexiconfile:
            self.log("Loading colibri model file for lexicon " + lexiconfile)
            self.classencoder = colibricore.ClassEncoder(lexiconfile + '.cls')
            self.lexicon = colibricore.UnindexedPatternModel(lexiconfile)
        else:
            self.lexicon = None

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[l]
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , focus )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile+'.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating pattern model")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)


    def getfeatures(self, word):
        """Get features at testing time"""
        leftcontext = tuple([ str(w) for w in word.leftcontext(self.settings['leftcontext'],"<begin>") ])
        rightcontext = tuple([ str(w) for w in word.rightcontext(self.settings['rightcontext'],"<end>") ])
        return leftcontext + rightcontext


    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        if len(wordstr) > self.minlength:
            features = self.getfeatures(word)
            return wordstr, features

    def processoutput(self, outputdata, inputdata, unit_id,**parameters):
        wordstr,_ = inputdata
        if wordstr is not None:
            best,distribution = outputdata
            if best != wordstr and distribution:
                return self.addsuggestions(unit_id, distribution)

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        if self.debug:
            begintime = time.time()

        wordstr = inputdata[0]
        features = tuple(inputdata[1])
        if self.debug: 
            self.log(" (Processing word " + wordstr + ", features: " + repr(features) + ")")

        if self.hapaxer: 
            features = self.hapaxer(features) #pylint: disable=not-callable
            previousword = features[self.settings['leftcontext'] - 1]
            if previousword == self.hapaxer.placeholder:
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Previous word not in hapaxer, returned in   " + str(duration) + "s)")
                return None,None

        if self.cache is not None:
            try:
                cached = self.cache[features]
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Return from cache in   " + str(duration) + "s)")
                return cached
            except KeyError:
                pass

        if self.lexicon:
            #ensure the previous word exists
            previousword = features[self.settings['leftcontext'] - 1]
            pattern = self.classencoder.buildpattern(previousword)
            if pattern.unknown() or pattern not in self.lexicon:
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Previous word not in lexicon, returned in   " + str(duration) + "s)")
                return None,None
                #if self.settings['rightcontext']:
                #    nextword = features[self.settings['leftcontext']]
                #    pattern = self.classencoder.buildpattern(nextword)
                #    if pattern.unknown() or pattern not in self.lexicon:
                #        return None,None
                #else:
                #    return None,None



        best,distribution,_ = self.classifier.classify(features,allowtopdistribution=False) 
        if self.debug:
            duration = round(time.time() - begintime,4)
            self.log(" (Classification took  " + str(duration) + "s, unfiltered distribution size=" + str(len(distribution)) + ")")

        l = len(wordstr)
        if self.settings['maxdistance']:
            #filter suggestions that are too distant
            if self.debug:
                begintime = time.time()
            dist = {}
            for key, freq in distribution.items():
                if freq >= self.threshold and abs(l - len(key)) <= self.settings['maxdistance'] and Levenshtein.distance(wordstr,key) <= self.settings['maxdistance']:
                    dist[key] = freq
            if wordstr in dist:
                #typed word is part of distribution, are any of the candidates far more likely?
                basefreq = dist[wordstr]
                dist = { key: freq for key, freq in dist.items() if key == wordstr or freq > basefreq * self.probfactor }
                if len(dist) == 1:
                    #no correction necessary
                    return None, None
            if self.debug:
                duration = round(time.time() - begintime,4)
                self.log(" (Levenshtein filtering took  " + str(duration) + "s, final distribution size=" + str(len(dist)) + ")")
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
        else:
            dist = [ x for x in distribution.items() if x[1] >= self.threshold ]
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
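    # Worked example of the maxdistance/probfactor filter above (all numbers
    # illustrative): suppose wordstr='teh', self.threshold=2, maxdistance=2,
    # self.probfactor=2 and the classifier returns {'the': 90, 'teh': 10,
    # 'ten': 15}. All three pass the frequency and Levenshtein checks; since
    # 'teh' itself occurs (basefreq=10), only candidates with freq > 20 survive
    # alongside it, leaving {'the': 90, 'teh': 10}. More than one entry remains,
    # so the distribution is cached and returned as a correction suggestion.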
Example #24
class TIMBLWordConfusibleModule(Module):
    """The Word Confusible module is capable of disambiguating two or more words that are often confused, by looking at their context.
    The module is implemented using memory-based classifiers in Timbl.

    Settings:
    * ``confusibles``  - List of words (strings) that form a single set of confusibles.
    * ``leftcontext``  - Left context size (in words) for the feature vector (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (changing this requires retraining)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree, changing this requires retraining)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusible)
    * ``threshold``    - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc``       - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.settings['threshold'] = 0.8

        if 'minocc' not in self.settings:
            self.settings['minocc'] = 5

        self.hapaxer = gethapaxer(self, self.settings)

        if 'confusibles' not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings['confusibles']

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        try:
            modelfile = self.models[0]
        except IndexError:
            raise Exception("Expected exactly one model, got none")
        if not modelfile.endswith(".ibase"):
            raise Exception(
                "TIMBL models must have the extension ibase, got " +
                modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(
            self.settings['algorithm']) + " +D +vdb -G0"
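        # Option string breakdown (summarized; consult the Timbl manual for exact
        # semantics): -F Tabbed = tab-separated instances, -a = algorithm number,
        # +D/+vdb = store and report class distributions, -G0 = distribution
        # normalization.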

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile +
                          ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix,
                                          self.gettimbloptions(),
                                          normalize=False)  #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,
                           mode='rt',
                           encoding='utf-8',
                           errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
                        " - " + str(i),
                        file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings['confusibles']:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        classifier.append(leftcontext + rightcontext,
                                          confusible)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
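    # Instance-generation sketch (illustrative): with leftcontext=2, rightcontext=2
    # and confusibles ['then', 'than'], the line "I knew then that it" yields a
    # 5-gram window whose focus is 'then', producing the training instance
    # ('I', 'knew', 'that', 'it') -> class 'then'.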

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([
            str(w)
            for w in word.leftcontext(self.settings['leftcontext'], "<begin>")
        ])
        rightcontext = tuple([
            str(w)
            for w in word.rightcontext(self.settings['rightcontext'], "<end>")
        ])
        return leftcontext + rightcontext

    def classify(self, features):
        if self.hapaxer: features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        if self.debug:
            self.log("(Classified " + repr(features) + ", best=" + best +
                     ", sumweights=" + str(sumweights) + ", distribution=" +
                     repr(distribution) + ")")
        if sumweights < self.settings['minocc']:
            if self.debug: self.log("(Not passing minocc threshold)")
            return best, []
        distribution = {
            sug: weight / sumweights
            for sug, weight in distribution.items()
            if weight / sumweights >= self.settings['threshold']
        }
        if self.debug:
            self.log("(Returning " + str(len(distribution)) +
                     " suggestions after filtering)")
        return best, distribution
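    # Worked example (illustrative): a raw distribution {'then': 8.0, 'than': 2.0}
    # gives sumweights=10.0, which passes minocc=5; with threshold=0.8 only 'then'
    # (8.0/10.0 = 0.8) survives the filter, so ('then', {'then': 0.8}) is returned.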

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)  #will be reused in processoutput
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution = self.classify(features)
        return (best, distribution)

    def processoutput(self, output, inputdata, unit_id, **parameters):
        wordstr, _ = inputdata
        best, distribution = output
        if best and best != wordstr and distribution:
            return self.addsuggestions(unit_id, list(distribution.items()))
Example #25
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[l]
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , focus )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile+'.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating pattern model")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)
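            # Lookup sketch (illustrative, not part of the original example): the
            # model written above is typically consulted the way the lexicon check
            # earlier in this document does it:
            #   classencoder = colibricore.ClassEncoder(modelfile + '.cls')
            #   lexicon = colibricore.UnindexedPatternModel(modelfile)
            #   pattern = classencoder.buildpattern('someword')
            #   known = not pattern.unknown() and pattern in lexicon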
Example #26
class TIMBLWordConfusibleModule(Module):
    """The Word Confusible module is capable of disambiguating two or more words that are often confused, by looking at their context.
    The module is implemented using memory-based classifiers in Timbl.

    Settings:
    * ``confusibles``  - List of words (strings) that form a single set of confusibles.
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusible)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    """

    UNIT = folia.Word

    def verifysettings(self):
        if "class" not in self.settings:
            self.settings["class"] = "confusible"

        super().verifysettings()

        if "algorithm" not in self.settings:
            self.settings["algorithm"] = 1

        if "leftcontext" not in self.settings:
            self.settings["leftcontext"] = 3

        if "rightcontext" not in self.settings:
            self.settings["rightcontext"] = 3

        self.hapaxer = gethapaxer(self.settings)

        if "confusibles" not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings["confusibles"]

        try:
            modelfile = self.models[0]
        except IndexError:
            raise Exception("Expected exactly one model, got none")
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings["algorithm"]) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
        self.classifier = TimblClassifier(
            fileprefix, self.gettimbloptions()
        )  # pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings["leftcontext"]
        r = self.settings["rightcontext"]
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode="rt", encoding="utf-8", errors="ignore") as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings["confusibles"]:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1 :])
                        classifier.append(leftcontext + rightcontext, confusible)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([str(w) for w in word.leftcontext(self.settings["leftcontext"], "<begin>")])
        rightcontext = tuple([str(w) for w in word.rightcontext(self.settings["rightcontext"], "<end>")])
        return leftcontext + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)  # will be reused in processoutput
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            if self.hapaxer:
                features = self.hapaxer(features)
            return wordstr, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution, _ = self.classifier.classify(features)
        return (best, distribution)

    def processoutput(self, output, inputdata, unit_id, **parameters):
        wordstr, _ = inputdata
        best, distribution = output
        if best and best != wordstr and distribution:
            return self.addsuggestions(unit_id, list(distribution.items()))
Example #27
class TIMBLWordConfusibleModule(Module):
    """The Word Confusible module is capable of disambiguating two or more words that are often confused, by looking at their context.
    The module is implemented using memory-based classifiers in Timbl.

    Settings:
    * ``confusibles``  - List of words (strings) that form a single set of confusibles.
    * ``leftcontext``  - Left context size (in words) for the feature vector (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (changing this requires retraining)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree, changing this requires retraining)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusible)
    * ``threshold``    - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc``       - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.settings['threshold'] = 0.8

        if 'minocc' not in self.settings:
            self.settings['minocc'] = 5

        self.hapaxer = gethapaxer(self, self.settings)

        if 'confusibles' not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings['confusibles']

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        try:
            modelfile = self.models[0]
        except IndexError:
            raise Exception("Expected exactly one model, got none")
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False) #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8',errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings['confusibles']:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , confusible )

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()


    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([ str(w) for w in word.leftcontext(self.settings['leftcontext'],"<begin>") ])
        rightcontext = tuple([ str(w) for w in word.rightcontext(self.settings['rightcontext'],"<end>") ])
        return leftcontext + rightcontext


    def classify(self, features):
        if self.hapaxer: features = self.hapaxer(features)
        best,distribution,_ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        if self.debug: self.log("(Classified " + repr(features) + ", best=" + best + ", sumweights=" + str(sumweights) + ", distribution=" + repr(distribution) + ")")
        if sumweights < self.settings['minocc']:
            if self.debug: self.log("(Not passing minocc threshold)")
            return best, []
        distribution = { sug: weight/sumweights for sug,weight in distribution.items() if weight/sumweights >= self.settings['threshold'] }
        if self.debug: self.log("(Returning " + str(len(distribution)) + " suggestions after filtering)")
        return best,distribution

    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution = self.classify(features)
        return (best,distribution)

    def processoutput(self, output, inputdata, unit_id,**parameters):
        wordstr, _  = inputdata
        best,distribution = output
        if best and best != wordstr and distribution:
            return self.addsuggestions(unit_id, list(distribution.items()))
Example #28
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[l]
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , focus )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile+'.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating pattern model")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)
Example #29
    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log("Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength']) #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])


            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1) #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)


            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = [] #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                    continue
                for suffix in self.suffixes:
                    if pattern_s.endswith(suffix) and pattern_s not in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(otherpattern_s,False,False)
                                except KeyError:
                                    if found: found = []
                                    break
                                if otherpattern not in model:
                                    if found: found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio < self.settings['maxratio']:
                                        if found: found = []
                                        break
                                found.append(otherpattern_s)
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile,'w',encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")
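            # Discovery sketch (illustrative): with suffixes ('d', 't') and a corpus
            # in which both 'word' and 'wort' clear the frequency threshold and the
            # ratio test, both spellings are appended to self.confusibles; if any
            # expected counterpart is absent from the model, 'found' is reset and
            # the candidate is skipped.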

        elif modelfile == self.modelfile:
            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8', errors='ignore') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l+1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append( leftcontext + (normalized,) + rightcontext , suffix )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
Example #30
from timbl import TimblClassifier

classifier = TimblClassifier('test','-a 0 -k 1 +vk')

classifier.append( ('dit','is','een'), 'idee')
classifier.append( ('dat','was','geen'), 'doen')

classifier.train()

r = classifier.classify(('dit','was','geen'))
print(r)
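# Notes on the snippet above: '-a 0' selects Timbl's IB1 algorithm and '-k 1' a
# single nearest neighbour; '+vk' adds neighbour-level verbosity (see the Timbl
# manual). classify() returns a (best class, class distribution, distance) tuple,
# which is how the module code above unpacks it.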