Example #1
    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.confusibles = []  #pylint: disable=attribute-defined-outside-init

        self.log("Loading models...")
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: " +
                          self.confusiblefile +
                          ". Did you forget to train the system?")
        with open(self.confusiblefile, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)
        if not os.path.exists(self.modelfile):
            raise IOError("Missing expected model file: " + self.modelfile +
                          ". Did you forget to train the system?")
        self.log("Loading Timbl model file " + self.modelfile + "...")
        fileprefix = self.modelfile.replace(".ibase",
                                            "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix,
                                          self.gettimbloptions(),
                                          normalize=False)  #pylint: disable=attribute-defined-outside-init
        self.classifier.load()
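For reference, the confusible file read by load() above is a plain-text list with one confusible per line; blank lines are skipped. A minimal standalone sketch of that format (the filename and contents are made up):

# Minimal sketch of the .lst format consumed by load() above; path and words are hypothetical.
with open("suffixconfusibles.lst", 'w', encoding='utf-8') as f:
    f.write("loyaliteit\nloyaliteid\n\n")  #blank lines are tolerated

confusibles = []
with open("suffixconfusibles.lst", 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            confusibles.append(line)
print(confusibles)  # ['loyaliteit', 'loyaliteid']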
Example #2
    def load(self):
        """Load the requested modules from self.models"""
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        self.log("Loading models...")
        if len(self.models) == 2:
            modelfile, lexiconfile = self.models
        else:
            modelfile = self.models[0]
            lexiconfile = None
        if not os.path.exists(modelfile):
            raise IOError("Missing expected timbl model file: " + modelfile + ". Did you forget to train the system?")
        if lexiconfile and not os.path.exists(lexiconfile):
            raise IOError("Missing expected lexicon model file: " + lexiconfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(),threading=True, debug=self.debug)
        self.classifier.load()

        if lexiconfile:
            self.log("Loading colibri model file for lexicon " + lexiconfile)
            self.classencoder = colibricore.ClassEncoder(lexiconfile + '.cls')
            self.lexicon = colibricore.UnindexedPatternModel(lexiconfile)
        else:
            self.lexicon = None
Example #3
    def fit(self, X, y):
        X, y = check_X_y(X, y, dtype=np.int64, accept_sparse='csr')

        n_rows = X.shape[0]
        self.classes_ = np.unique(y)

        if sp.sparse.issparse(X):
            if self.debug: print('Features are sparse, choosing faster learning')

            self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm,self.k, X.shape[1]),
                                              format='Sparse', debug=True, sklearn=True, flushdir=self.flushdir,
                                              flushthreshold=20000, normalize=self.normalize)

            for i in range(n_rows):
                #encode each nonzero column as '(index,value)'; the column index must not shadow the row index i
                sparse = ['({},{})'.format(j + 1, v) for j, v in zip(X[i].indices, X[i].data)]
                self.classifier.append(sparse, str(y[i]))

        else:

            self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1]),
                                              debug=True, sklearn=True, flushdir=self.flushdir, flushthreshold=20000,
                                              normalize=self.normalize)

            if y.dtype != 'O':
                y = y.astype(str)

            for i in range(n_rows):
                self.classifier.append(list(X[i].toarray()[0]), y[i])

        self.classifier.train()
        return self
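The sparse branch above turns each CSR row into Timbl feature strings. A standalone sketch of that encoding, assuming (as the code implies) that Timbl's Sparse format takes 1-based (column,value) pairs:

# Standalone sketch of the sparse instance encoding used in fit() above.
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[0, 3, 0, 1]], dtype=np.int64))
row = X[0]
features = ['({},{})'.format(j + 1, v) for j, v in zip(row.indices, row.data)]
print(features)  # ['(2,3)', '(4,1)']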
Example #4
    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")


        self.confusibles = []#pylint: disable=attribute-defined-outside-init

        self.log("Loading models...")
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: "  + self.confusiblefile + ". Did you forget to train the system?")
        with open(self.confusiblefile,'r',encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)
        if not os.path.exists(self.modelfile):
            raise IOError("Missing expected model file: " + self.modelfile + ". Did you forget to train the system?")
        self.log("Loading Timbl model file " + self.modelfile + "...")
        fileprefix = self.modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False) #pylint: disable=attribute-defined-outside-init
        self.classifier.load()
Example #5
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        prevword = ""
        #buffer = [("<begin>",False,'')] * l
        buffer = []
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8',errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                words = [ w.strip() for w in line.split(' ') if w.strip() ]
                for word in words:
                    if prevword in TIMBLPuncRecaseModule.PUNCTUATION:
                        punc = prevword
                    else:
                        punc = ""
                    if any( c.isalpha() for c in word ):
                        buffer.append( (word, word == word[0].upper() + word[1:].lower(), punc ) )
                    if len(buffer) == l + r + 1:
                        buffer = self.addtraininstance(classifier, buffer,l,r)
                    prevword = word
        #for i in range(0,r):
        #    buffer.append( ("<end>",False,'') )
        #    if len(buffer) == l + r + 1:
        #        buffer = self.addtraininstance(classifier, buffer,l,r)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
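The training loop above maintains a sliding buffer of (word, was_capitalized, preceding_punctuation) triples and emits one instance per full window. A hedged standalone sketch of that mechanism (l and r stand in for the module's leftcontext/rightcontext settings):

# Hypothetical standalone sketch of the sliding-window buffering used in train() above.
PUNCTUATION = ('.', '?', '!', ',', ';', ':')
l, r = 2, 1
buffer, prevword = [], ""
for word in "so , he Said hello world .".split():
    punc = prevword if prevword in PUNCTUATION else ""
    if any(c.isalpha() for c in word):
        cased = word == word[0].upper() + word[1:].lower()
        buffer.append((word, cased, punc))
    if len(buffer) == l + r + 1:
        focus = buffer[l]  #the class encodes the focus word's punctuation and casing
        cls = focus[2] + ('C' if focus[1] else '') or '-'
        print([w.lower() for w, _, _ in buffer], '->', cls)
        buffer = buffer[1:]  #slide the window by one word
    prevword = word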
Example #6
File: lm.py Project: wollmers/gecco
    def train(self, sourcefile, modelfile, **parameters):
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                for ngram in Windower(line, n):
                    focus = ngram[l]
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l+1:])
                    classifier.append( leftcontext + rightcontext , focus )

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
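Each instance above consists of l left-context words plus r right-context words as features, with the middle word of the window as the class. A hedged sketch of that layout, assuming pynlpl's Windower (which the imports in this project suggest, and which pads lines with <begin>/<end> markers):

# Hedged sketch of the feature layout built in train() above.
from pynlpl.textprocessors import Windower

l = r = 1
n = l + 1 + r
for ngram in Windower("the cat sat", n):
    focus = ngram[l]
    features = tuple(ngram[:l]) + tuple(ngram[l + 1:])
    print(features, '->', focus)
# prints context/focus pairs such as ('the', 'sat') -> cat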
Example #7
    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile +
                          ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()
Example #8
File: lm.py Project: wollmers/gecco
    def load(self):
        """Load the requested modules from self.models"""
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()
Example #9
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        prevword = ""
        #buffer = [("<begin>",False,'')] * l
        buffer = []
        with iomodule.open(sourcefile,
                           mode='rt',
                           encoding='utf-8',
                           errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
                        " - " + str(i),
                        file=sys.stderr)
                words = [w.strip() for w in line.split(' ') if w.strip()]
                for word in words:
                    if prevword in TIMBLPuncRecaseModule.PUNCTUATION:
                        punc = prevword
                    else:
                        punc = ""
                    if any(c.isalpha() for c in word):
                        buffer.append(
                            (word, word == word[0].upper() + word[1:].lower(),
                             punc))
                    if len(buffer) == l + r + 1:
                        buffer = self.addtraininstance(classifier, buffer, l,
                                                       r)
                    prevword = word
        #for i in range(0,r):
        #    buffer.append( ("<end>",False,'') )
        #    if len(buffer) == l + r + 1:
        #        buffer = self.addtraininstance(classifier, buffer,l,r)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
Example #10
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,
                           mode='rt',
                           encoding='utf-8',
                           errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
                        " - " + str(i),
                        file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings['confusibles']:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        classifier.append(leftcontext + rightcontext,
                                          confusible)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
Example #11
    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False) #pylint: disable=attribute-defined-outside-init
        self.classifier.load()
Example #12
def create_classifier_and_word_freq_list(train_instances, timbl_models_folder, train_users, test_user, tweet_index):

	timbl_model_name = test_user + '.' + '_'.join(train_users) + '.' + str(tweet_index)
	classifier = TimblClassifier(timbl_models_folder + timbl_model_name, '-a 0 -k 1 +vs')
	word_frequencies = Counter()

	for instance in train_instances:
		if instance.author == test_user and instance.original_tweet_index == tweet_index:
			continue

		classifier.append(instance.features, instance.label)
		word_frequencies[instance.label] += 1

	classifier.train()

	return classifier, word_frequencies
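A hypothetical usage sketch; the Instance namedtuple below only mirrors the attributes the function reads and is not part of the original project, and it assumes the function above is importable with python-timbl installed:

# Hypothetical usage sketch for create_classifier_and_word_freq_list() above.
from collections import namedtuple

Instance = namedtuple('Instance', 'author original_tweet_index features label')
train_instances = [
    Instance('alice', 0, ('f1', 'f2'), 'word_a'),
    Instance('bob', 3, ('f3', 'f4'), 'word_b'),
]
#instances belonging to the held-out user/tweet pair are skipped inside the function
classifier, freqs = create_classifier_and_word_freq_list(
    train_instances, 'timbl_models/', ['alice', 'bob'], 'carol', 0)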
Example #13
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings["leftcontext"]
        r = self.settings["rightcontext"]
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode="rt", encoding="utf-8", errors="ignore") as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings["confusibles"]:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1 :])
                        classifier.append(leftcontext + rightcontext, confusible)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
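Several examples above pass n-grams through a hapaxer before appending instances. A hedged toy stand-in for that behaviour (the real hapaxer belongs to the gecco project; only the call shape and the placeholder attribute used above are mirrored here):

# Hedged toy stand-in for the hapaxer: words below a frequency threshold are
# replaced by a placeholder token so Timbl can generalize over rare words.
class ToyHapaxer:
    placeholder = '<hapax>'
    def __init__(self, lexicon, threshold):
        self.lexicon, self.threshold = lexicon, threshold
    def __call__(self, words):
        return tuple(w if self.lexicon.get(w, 0) >= self.threshold else self.placeholder
                     for w in words)

hapaxer = ToyHapaxer({'the': 100, 'cat': 5, 'zyzzyva': 1}, threshold=2)
print(hapaxer(('the', 'zyzzyva', 'cat')))  # ('the', '<hapax>', 'cat')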
Example #14
class TIMBLSuffixConfusibleModule(Module):
    """The Suffix Confusible module is capable of disambiguating suffixes on words. The suffixes are passes to the ``suffixes`` settings (a list of string). All words using these suffixes above a certain threshold (``freqthtreshold``) will be found at training time and disambiguated using context. The module is implemented using Timbl.

    Settings:
    * ``suffixes``     - List of suffixes (strings) that form a single set of confusibles. (changing this requires retraining)
    * ``freqthreshold`` - Only consider words with a suffix that occur at least this many times (changing this requires retraining)
    * ``maxratio``     - Maximum ratio expressing the maximally allowed frequency difference between the confusibles (value > 1, 0 = no limit) (changing this requires retraining)
    * ``minlength``    - Only consider words with a suffix that are at least this long (in characters) (changing this requires retraining)
    * ``maxlength``    - Only consider words with a suffix that are at most this long (in characters) (changing this requires retraining)
    * ``leftcontext``  - Left context size (in words) for the feature vector (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (changing this requires retraining)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree) (changing this requires retraining)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusion)
    * ``threshold``    - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc``       - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a list of confusibles [``.lst``]
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.settings['threshold'] = 0.8
        if 'minocc' not in self.settings:
            self.settings['minocc'] = 5

        self.hapaxer = gethapaxer(self, self.settings)

        if 'suffixes' not in self.settings:
            raise Exception("No suffixes specified for " + self.id + "!")
        self.suffixes = sorted(
            self.settings['suffixes'],
            key=lambda x: -1 * len(x))  #sort from long to short

        #settings for computation of confusible list
        if 'freqthreshold' not in self.settings:
            self.settings['freqthreshold'] = 20
        if 'maxlength' not in self.settings:
            self.settings['maxlength'] = 25  #longer words will be ignored
        if 'minlength' not in self.settings:
            self.settings['minlength'] = 3  #shorter words will be ignored
        if 'maxratio' not in self.settings:
            self.settings['maxratio'] = 0  #no limit

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        ibasefound = lstfound = False
        for filename in self.models:
            if filename.endswith('.ibase'):
                ibasefound = True
                self.modelfile = filename
            elif filename.endswith('.lst'):
                lstfound = True
                self.confusiblefile = filename

        if not ibasefound:
            raise Exception(
                "TIMBL models must have the extension .ibase; no model file was supplied with that extension"
            )
        if not lstfound:
            raise Exception(
                "Specify a model file with extension lst that will store all confusibles found"
            )

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(
            self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.confusibles = []  #pylint: disable=attribute-defined-outside-init

        self.log("Loading models...")
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: " +
                          self.confusiblefile +
                          ". Did you forget to train the system?")
        with open(self.confusiblefile, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)
        if not os.path.exists(self.modelfile):
            raise IOError("Missing expected model file: " + self.modelfile +
                          ". Did you forget to train the system?")
        self.log("Loading Timbl model file " + self.modelfile + "...")
        fileprefix = self.modelfile.replace(".ibase",
                                            "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix,
                                          self.gettimbloptions(),
                                          normalize=False)  #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def clientload(self):
        self.log("Loading models (for client)...")
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: " +
                          self.confusiblefile +
                          ". Did you forget to train the system?")
        with open(self.confusiblefile, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)

    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log(
                "Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) + ".cls"
            corpusfile = stripsourceextensions(sourcefile) + ".dat"

            if not os.path.exists(classfile):

                self.log("Building class file")
                classencoder = colibricore.ClassEncoder(
                    "", self.settings['minlength'],
                    self.settings['maxlength'])  #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(
                    classfile, self.settings['minlength'],
                    self.settings['maxlength'])

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=self.settings['freqthreshold'],
                minlength=1,
                maxlength=1)  #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = []  #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log(
                        "WARNING: Unable to decode a pattern in the model! Invalid utf-8!"
                    )
                    continue  #skip patterns that cannot be decoded
                for suffix in self.suffixes:
                    if pattern_s.endswith(
                            suffix) and not pattern_s in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(
                                    suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(
                                        otherpattern_s, False, False)
                                except KeyError:
                                    if found: found = []
                                    break
                                if not otherpattern in model:
                                    if found: found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (
                                        model.occurrencecount(pattern),
                                        model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio > self.settings['maxratio']:
                                        #frequency difference exceeds the allowed maximum
                                        if found: found = []
                                        break
                                found.append(otherpattern_s)
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile, 'w', encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")

        elif modelfile == self.modelfile:
            try:
                self.confusibles
            except AttributeError:
                self.confusibles = []
                self.log("Loading confusiblefile")
                with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            self.confusibles.append(line)

            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase",
                                           "")  #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,
                               mode='rt',
                               encoding='utf-8',
                               errors='ignore') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0:
                        print(datetime.datetime.now().strftime(
                            "%Y-%m-%d %H:%M:%S") + " - " + str(i),
                              file=sys.stderr)
                    for ngram in Windower(line, n):
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l + 1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append(
                                    leftcontext + (normalized, ) +
                                    rightcontext, suffix)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()

    def getsuffix(self, confusible):
        assert isinstance(confusible, str)
        for suffix in self.suffixes:  #suffixes are sorted from long to short
            if confusible.endswith(suffix):
                break
        else:
            #no break: none of the suffixes matched
            raise ValueError("No suffix found!")
        return suffix, confusible[:-len(suffix)] + self.suffixes[
            0]  #suffix, normalized form

    def classify(self, features):
        if self.hapaxer: features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        if sumweights < self.settings['minocc']:
            return best, {}
        distribution = {
            sug: weight / sumweights
            for sug, weight in distribution.items()
            if weight / sumweights >= self.settings['threshold']
        }
        if self.debug:
            self.log("(Returning " + str(len(distribution)) +
                     " suggestions after filtering)")
        return (best, distribution)

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([
            str(w)
            for w in word.leftcontext(self.settings['leftcontext'], "<begin>")
        ])
        _, normalized = self.getsuffix(word.text())
        rightcontext = tuple([
            str(w)
            for w in word.rightcontext(self.settings['rightcontext'], "<end>")
        ])
        return leftcontext + (normalized, ) + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution = self.classify(features)
        return (best, distribution)

    def processoutput(self, output, inputdata, unit_id, **parameters):
        wordstr, _ = inputdata
        best, distribution = output
        suffix, _ = self.getsuffix(wordstr)
        if wordstr != wordstr[:-len(suffix)] + best:
            return self.addsuggestions(
                unit_id, [(wordstr[:-len(suffix)] + suggestion, p)
                          for suggestion, p in distribution.items()
                          if suggestion != suffix])
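To make the filtering in classify() above concrete, here is a worked sketch with made-up class weights:

# Worked sketch of the normalization and threshold filtering in classify() above.
distribution = {'-heid': 8.0, '-teit': 1.0, '-iteit': 1.0}
threshold, minocc = 0.8, 5
sumweights = sum(distribution.values())  #10.0, which is >= minocc, so filtering proceeds
filtered = {sug: weight / sumweights
            for sug, weight in distribution.items()
            if weight / sumweights >= threshold}
print(filtered)  # {'-heid': 0.8}: only suggestions at or above the 0.8 threshold survive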
Example #15
class TIMBLPuncRecaseModule(Module):
    """This is a memory-based classification module, implemented using Timbl, that predicts where punctuation needs to be inserted, deleted, and whether a word needs to be written with an initial capital.
    NOTE: This module performs badly!!

    Settings:
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``deletionthreshold`` - If no punctuation insertion is predicted and this confidence threshold is reached, then a deletion will be predicted (should be a high number), default: 0.95
    * ``insertionthreshold`` - Necessary confidence threshold to predict an insertion of punctuation (default: 0.5)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    """

    UNIT = folia.Word
    UNITFILTER = nonumbers

    EOSMARKERS = ('.', '?', '!')
    PUNCTUATION = EOSMARKERS + (',', ';', ':')

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings[
                'class'] = 'missingpunctuation'  #will be overridden later again

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 2

        if 'deletionthreshold' not in self.settings:
            self.settings['deletionthreshold'] = 0.95

        if 'insertionthreshold' not in self.settings:
            self.settings['insertionthreshold'] = 0.5

        if 'capitalizationthreshold' not in self.settings:
            self.settings['capitalizationthreshold'] = 0.5

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        self.hapaxer = gethapaxer(self, self.settings)

        if not self.models:
            raise Exception("Expected one model, got none")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception(
                "TIMBL models must have the extension ibase, got " +
                modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(
            self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile +
                          ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()

    def addtraininstance(self, classifier, buffer, l, r):
        """Helper function"""
        focusword, cased, punc = buffer[l]
        cls = punc
        if cased:
            cls += 'C'
        if not cls:
            cls = '-'
        if self.hapaxer:
            features = [w for w, _, _ in buffer]
            #hapax only the context; keep the focus word (index l) itself
            features = [
                w.lower() for w in self.hapaxer(features[:l]) +
                (features[l], ) + self.hapaxer(features[l + 1:])
            ]
        else:
            features = [w.lower() for w, _, _ in buffer]
        classifier.append(tuple(features), cls)
        return buffer[1:]

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        prevword = ""
        #buffer = [("<begin>",False,'')] * l
        buffer = []
        with iomodule.open(sourcefile,
                           mode='rt',
                           encoding='utf-8',
                           errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
                        " - " + str(i),
                        file=sys.stderr)
                words = [w.strip() for w in line.split(' ') if w.strip()]
                for i, word in enumerate(words):
                    if prevword in TIMBLPuncRecaseModule.PUNCTUATION:
                        punc = prevword
                    else:
                        punc = ""
                    if any(c.isalpha() for c in word):
                        buffer.append(
                            (word, word == word[0].upper() + word[1:].lower(),
                             punc))
                    if len(buffer) == l + r + 1:
                        buffer = self.addtraininstance(classifier, buffer, l,
                                                       r)
                    prevword = word
        #for i in range(0,r):
        #    buffer.append( ("<end>",False,'') )
        #    if len(buffer) == l + r + 1:
        #        buffer = self.addtraininstance(classifier, buffer,l,r)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def classify(self, word):
        features = self.getfeatures(word)
        if self.hapaxer: features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        return best, distribution

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        leftcontext = []
        currentword = word
        while len(leftcontext) < l:
            prevword = currentword.previous(folia.Word, None)
            if prevword:
                w = prevword.text().lower()
                if w.isalnum():
                    leftcontext.insert(0, w)
                currentword = prevword
            else:
                leftcontext.insert(0, "<begin>")

        rightcontext = []
        currentword = word
        while len(rightcontext) < r:
            nextword = currentword.next(folia.Word, None)
            if nextword:
                w = nextword.text().lower()
                if w.isalnum():
                    rightcontext.append(w)
                currentword = nextword
            else:
                rightcontext.append("<end>")

        return leftcontext + [word.text().lower()] + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)  #will be reused in processoutput
        if not any(c.isalnum() for c in wordstr):
            #this is punctuation, skip
            return None
        prevword = word.previous(folia.Word, None)
        if prevword:
            prevwordstr = str(prevword)
            prevword_id = prevword.id
        else:
            prevwordstr = ""
            prevword_id = ""
        features = self.getfeatures(word)
        return wordstr, prevwordstr, prevword_id, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        wordstr, prevword, prevword_id, features = inputdata
        if self.debug:
            self.log(" (Processing word " + wordstr + ", features: " +
                     repr(features) + ")")
        if self.hapaxer: features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        if self.debug:
            self.log(" (Best: " + best + ")")
        return [best, distribution]

    def processoutput(self, outputdata, inputdata, unit_id, **parameters):
        queries = []
        wordstr, prevword, prevword_id, _ = inputdata
        cls, distribution = outputdata

        recase = False

        if cls[-1] == 'C':
            if wordstr[0] == wordstr[0].lower():
                if distribution[cls] >= self.settings[
                        'capitalizationthreshold']:
                    recase = True
                elif self.debug:
                    self.log(" (Capitalization threshold not reached: " +
                             str(distribution[cls]) + ")")
            cls = cls[:-1]

        if cls == '-':
            if prevword and distribution[cls] >= self.settings[
                    'deletionthreshold'] and all(not c.isalpha()
                                                 for c in prevword):
                if self.debug:
                    self.log(" (Redundant punctuation " + cls +
                             " with threshold " + str(distribution[cls]) + ")")
                queries.append(
                    self.suggestdeletion(
                        prevword_id,
                        (prevword in TIMBLPuncRecaseModule.EOSMARKERS),
                        cls='redundantpunctuation'))
        elif cls and cls in distribution:
            #insertion of punctuation
            if distribution[cls] >= self.settings['insertionthreshold']:
                if all(not c.isalnum() for c in prevword):
                    #previous word is punctuation already
                    if prevword != cls:
                        self.log(" (Found punctuation confusion)")
                        queries.append(
                            self.addsuggestions(prevword_id, [cls],
                                                cls='confusion'))
                    else:
                        recase = False  #no punctuation insertion? then no recasing either
                        if self.debug:
                            self.log(
                                " (Predicted punctuation already there, good, ignoring)"
                            )
                else:
                    if self.debug:
                        self.log(" (Insertion " + cls + " with threshold " +
                                 str(distribution[cls]) + ")")
                    queries.append(
                        self.suggestinsertion(
                            unit_id, cls,
                            (cls in TIMBLPuncRecaseModule.EOSMARKERS)))
            else:
                recase = False  #no punctuation insertion? then no recasing either
                if self.debug:
                    self.log(" (Insertion threshold not reached: " +
                             str(distribution[cls]) + ")")

        if recase and wordstr[0].isalpha():
            #recase word
            t = wordstr[0].upper() + wordstr[1:]
            if self.debug:
                self.log(" (Correcting capitalization for " + wordstr + ")")
            queries.append(
                self.addsuggestions(unit_id, [t], cls='capitalizationerror'))

        return queries
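The class labels this module predicts pack two signals: the punctuation to insert before the focus word and a trailing 'C' when the focus word should be capitalized, with '-' encoding neither. A small sketch of that scheme as implied by addtraininstance() and processoutput() above:

# Sketch of the class-label scheme used by TIMBLPuncRecaseModule above.
def decode_label(cls):
    recase = cls.endswith('C')
    if recase:
        cls = cls[:-1]
    punc = '' if cls in ('', '-') else cls  #'-' means no punctuation before the word
    return punc, recase

for label in ('-', 'C', ',', '.C'):
    print(label, '->', decode_label(label))
# - -> ('', False)   C -> ('', True)   , -> (',', False)   .C -> ('.', True)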
Example #16
class TIMBLLMModule(Module):
    """The Language Model predicts words given their context (including right context). It uses a classifier-based approach.

    Settings:
    * ``threshold``    - Prediction confidence threshold, only when a prediction exceeds this threshold will it be recommended (default: 0.9, value must be higher than 0.5 by definition)
    * ``freqthreshold`` - If the previous word occurs below this threshold, then no classification will take place. Only has an effect when a lexicon is enabled (default: 2)
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``maxdistance``  - Maximum Levenshtein distance between a word and its correction (larger distances are pruned from suggestions)
    * ``minlength``    - Minimum length (in characters) for a word to be considered by the LM module
    * ``probfactor``   - If the predicted word is in the target distribution, any suggestions must be more probable by this factor (default: 10)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusion)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    * optional: a plain-text corpus (tokenized)  [``.txt``]     ->    a lexicon model [``.colibri.patternmodel``]

    Hapaxer: This module supports hapaxing
    Caching: This module supports caching
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.threshold = 0.9
        else:
            self.threshold = self.settings['threshold']

        if 'freqthreshold' not in self.settings:
            self.freqthreshold = 2
        else:
            self.freqthreshold = self.settings['freqthreshold']

        if 'minlength' not in self.settings:
            self.minlength = 5
        else:
            self.minlength = self.settings['minlength']

        if 'probfactor' not in self.settings:
            self.probfactor = 10
        else:
            self.probfactor = self.settings['probfactor']


        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2


        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False


        self.hapaxer = gethapaxer(self, self.settings)

        self.cache = getcache(self.settings, 1000)

        if not self.models or len(self.models) > 2:
            raise Exception("Expected one or two models, the first a TIMBL instance base and the optional second a colibri patternmodel, got " + str(len(self.models)))
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("First model must be a TIMBL instance base model, which must have the extension '.ibase', got " + modelfile + " instead")
        if len(self.models) > 1:
            lexiconfile = self.models[1]
            if not lexiconfile.endswith("colibri.patternmodel"):
                raise Exception("Second model must be a Colibri pattern model, which must have the extension '.colibri.patternmodel', got " + lexiconfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        self.log("Loading models...")
        if len(self.models) == 2:
            modelfile, lexiconfile = self.models
        else:
            modelfile = self.models[0]
            lexiconfile = None
        if not os.path.exists(modelfile):
            raise IOError("Missing expected timbl model file: " + modelfile + ". Did you forget to train the system?")
        if lexiconfile and not os.path.exists(lexiconfile):
            raise IOError("Missing expected lexicon model file: " + lexiconfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(),threading=True, debug=self.debug)
        self.classifier.load()

        if lexiconfile:
            self.log("Loading colibri model file for lexicon " + lexiconfile)
            self.classencoder = colibricore.ClassEncoder(lexiconfile + '.cls')
            self.lexicon = colibricore.UnindexedPatternModel(lexiconfile)
        else:
            self.lexicon = None

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[l]
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , focus )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile+'.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating pattern model")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)


    def getfeatures(self, word):
        """Get features at testing time"""
        leftcontext = tuple([ str(w) for w in word.leftcontext(self.settings['leftcontext'],"<begin>") ])
        rightcontext = tuple([ str(w) for w in word.rightcontext(self.settings['rightcontext'],"<end>") ])
        return leftcontext + rightcontext


    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        if len(wordstr) >= self.minlength:
            features = self.getfeatures(word)
            return wordstr, features

    def processoutput(self, outputdata, inputdata, unit_id,**parameters):
        wordstr,_ = inputdata
        if wordstr is not None:
            best,distribution = outputdata
            if best != wordstr and distribution:
                return self.addsuggestions(unit_id, distribution)

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        if self.debug:
            begintime = time.time()

        wordstr = inputdata[0]
        features = tuple(inputdata[1])
        if self.debug:
            self.log(" (Processing word " + wordstr + ", features: " + repr(features) + ")")

        if self.hapaxer:
            features = self.hapaxer(features) #pylint: disable=not-callable
            previousword = features[self.settings['leftcontext'] - 1]
            if previousword == self.hapaxer.placeholder:
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Previous word not in hapaxer, returned in   " + str(duration) + "s)")
                return None,None

        if self.cache is not None:
            try:
                cached = self.cache[features]
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Return from cache in   " + str(duration) + "s)")
                return cached
            except KeyError:
                pass

        if self.lexicon:
            #ensure the previous word exists
            previousword = features[self.settings['leftcontext'] - 1]
            pattern = self.classencoder.buildpattern(previousword)
            if pattern.unknown() or pattern not in self.lexicon:
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Previous word not in lexicon, returned in   " + str(duration) + "s)")
                return None,None
                #if self.settings['rightcontext']:
                #    nextword = features[self.settings['leftcontext']]
                #    pattern = self.classencoder.buildpattern(nextword)
                #    if pattern.unknown() or pattern not in self.lexicon:
                #        return None,None
                #else:
                #    return None,None



        best,distribution,_ = self.classifier.classify(features,allowtopdistribution=False)
        if self.debug:
            duration = round(time.time() - begintime,4)
            self.log(" (Classification took  " + str(duration) + "s, unfiltered distribution size=" + str(len(distribution)) + ")")

        l = len(wordstr)
        if self.settings['maxdistance']:
            #filter suggestions that are too distant
            if self.debug:
                begintime = time.time()
            dist = {}
            for key, freq in distribution.items():
                if freq >= self.threshold and abs(l - len(key)) <= self.settings['maxdistance'] and Levenshtein.distance(wordstr,key) <= self.settings['maxdistance']:
                    dist[key] = freq
            if wordstr in dist:
                #typed word is part of distribution, are any of the candidates far more likely?
                basefreq = dist[wordstr]
                dist = { key: freq for key, freq in dist.items() if key == wordstr or freq > basefreq * self.probfactor }
                if len(dist) == 1:
                    #no correction necessary
                    return None, None
            if self.debug:
                duration = round(time.time() - begintime,4)
                self.log(" (Levenshtein filtering took  " + str(duration) + "s, final distribution size=" + str(len(dist)) + ")")
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
        else:
            dist = [ x for x in distribution.items() if x[1] >= self.threshold ]
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
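
The filtering at the end of run() above combines a confidence threshold, a cheap length pre-check, a Levenshtein distance cutoff, and (when the typed word itself is a candidate) the probfactor rule. A minimal standalone sketch of that pruning step, assuming the python-Levenshtein package; prune_distribution and the sample distribution are illustrative, not part of gecco:

import Levenshtein

def prune_distribution(wordstr, distribution, threshold=0.9, maxdistance=2, probfactor=10):
    """Keep only candidates that are confident enough and close enough to the typed word."""
    l = len(wordstr)
    dist = { key: freq for key, freq in distribution.items()
             if freq >= threshold
             and abs(l - len(key)) <= maxdistance            #cheap length pre-check
             and Levenshtein.distance(wordstr, key) <= maxdistance }
    if wordstr in dist:
        #the typed word is itself a candidate: keep only far more likely alternatives
        basefreq = dist[wordstr]
        dist = { key: freq for key, freq in dist.items()
                 if key == wordstr or freq > basefreq * probfactor }
    return dist

print(prune_distribution("ther", {"there": 0.95, "the": 0.92, "tha": 0.4}))
#-> {'there': 0.95, 'the': 0.92}
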
Example #18
    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log(
                "Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) + ".cls"
            corpusfile = stripsourceextensions(sourcefile) + ".dat"

            if not os.path.exists(classfile):

                self.log("Building class file")
                classencoder = colibricore.ClassEncoder(
                    "", self.settings['minlength'],
                    self.settings['maxlength'])  #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(
                    classfile, self.settings['minlength'],
                    self.settings['maxlength'])

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=self.settings['freqthreshold'],
                minlength=1,
                maxlength=1)  #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = []  #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log(
                        "WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!"
                    )
                    continue  #skip this pattern rather than silently reusing the previous one
                for suffix in self.suffixes:
                    if pattern_s.endswith(
                            suffix) and not pattern_s in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(
                                    suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(
                                        otherpattern_s, False, False)
                                except KeyError:
                                    if found: found = []
                                    break
                                if not otherpattern in model:
                                    if found: found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (
                                        model.occurrencecount(pattern),
                                        model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio > self.settings['maxratio']:  #frequency difference larger than allowed
                                        if found: found = []
                                        break
                                found.append(otherpattern_s)
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile, 'w', encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")

        elif modelfile == self.modelfile:
            try:
                self.confusibles
            except AttributeError:
                self.confusibles = []
                self.log("Loading confusiblefile")
                with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            self.confusibles.append(line)

            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase",
                                           "")  #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,
                               mode='rt',
                               encoding='utf-8',
                               errors='ignore') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0:
                        print(datetime.datetime.now().strftime(
                            "%Y-%m-%d %H:%M:%S") + " - " + str(i),
                              file=sys.stderr)
                    for ngram in Windower(line, n):
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l + 1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append(
                                    leftcontext + (normalized, ) +
                                    rightcontext, suffix)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
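
The training loop above slides a window of leftcontext + 1 + rightcontext tokens over every line and turns the centre token's suffix into the class label. A hedged sketch of just the windowing, assuming Windower behaves like pynlpl.textprocessors.Windower (padding with <begin> and <end> markers and accepting a tokenized line, as in the code above):

from pynlpl.textprocessors import Windower  #the Windower used by the modules above

l, r = 2, 2           #left and right context sizes
n = l + 1 + r         #total window size
line = "then he tried to sneak passed the guard"
for ngram in Windower(line, n):
    focus = ngram[l]                                  #candidate confusible in the centre
    features = tuple(ngram[:l]) + tuple(ngram[l+1:])  #context words only
    print(focus, features)
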
Example #19
File: lm.py Project: wollmers/gecco
class TIMBLLMModule(Module):
    """The Language Model predicts words given their context (including right context). It uses a classifier-based approach.

    Settings:
    * ``threshold``    - Prediction confidence threshold, only when a prediction exceeds this threshold will it be recommended (default: 0.9, value must be higher than 0.5 by definition)
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``maxdistance``  - Maximum Levenshtein distance between a word and its correction (larger distances are pruned from suggestions)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: contexterror) 
    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]

    """
    UNIT = folia.Word

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'contexterror'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' in self.settings:
            self.threshold = self.settings['threshold']
        else:
            self.threshold = 0.9

        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2


        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False


        self.hapaxer = gethapaxer(self.settings)

        self.cache = getcache(self.settings, 1000)

        if len(self.models) != 1:
            raise Exception("Expected exactly one model, got " + str(len(self.models)))
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension .ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                for ngram in Windower(line, n):
                    focus = ngram[l]
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l+1:])
                    classifier.append( leftcontext + rightcontext , focus )

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()



    def getfeatures(self, word):
        """Get features at testing time"""
        leftcontext = tuple([ str(w) for w in word.leftcontext(self.settings['leftcontext'],"<begin>") ])
        rightcontext = tuple([ str(w) for w in word.rightcontext(self.settings['rightcontext'],"<end>") ])
        return leftcontext + rightcontext


    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        features = self.getfeatures(word)
        if self.hapaxer: features = self.hapaxer(features) #pylint: disable=not-callable
        return wordstr, features

    def processoutput(self, outputdata, inputdata, unit_id,**parameters):
        wordstr,_ = inputdata
        best,distribution = outputdata
        if best != wordstr and distribution:
            return self.addsuggestions(unit_id, distribution)

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        wordstr = inputdata[0]
        features = tuple(inputdata[1])
        if self.debug:
            begintime = time.time()
        if self.cache is not None:
            try:
                cached = self.cache[features]
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Return from cache in   " + str(duration) + "s)")
                return cached
            except KeyError:
                pass
        best,distribution,_ = self.classifier.classify(features, True) #True=thread-safe
        if self.debug:
            duration = round(time.time() - begintime,4)
            self.log(" (Classification took  " + str(duration) + "s, unfiltered distribution size=" + str(len(distribution)) + ")")

        l = len(wordstr)
        if self.settings['maxdistance']:
            #filter suggestions that are too distant
            if self.debug:
                begintime = time.time()
            dist = {}
            for key, freq in distribution.items():
                if freq >= self.threshold and abs(l - len(key)) <= self.settings['maxdistance'] and Levenshtein.distance(wordstr,key) <= self.settings['maxdistance']:
                    dist[key] = freq
            if self.debug:
                duration = round(time.time() - begintime,4)
                self.log(" (Levenshtein filtering took  " + str(duration) + "s, final distribution size=" + str(len(dist)) + ")")
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
        else:
            dist = [ x for x in distribution.items() if x[1] >= self.threshold ]
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
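
All of these modules drive python-timbl through the same small API: append() training instances, train(), save() to an .ibase file, then load() and classify() at prediction time. A hedged round-trip sketch of that API; the file prefix and instances are illustrative, and the import path is assumed from python-timbl:

import timbl  #python-timbl, assumed to expose TimblClassifier as used above

classifier = timbl.TimblClassifier("demo", "-F Tabbed -a 1 +D +vdb -G0")  #the options string gettimbloptions() builds
classifier.append(("to", "sneak", "the", "guard"), "passed")    #context features -> class
classifier.append(("walked", "right", "him", "without"), "past")
classifier.train()
classifier.save()   #writes demo.ibase

classifier.load()
best, distribution, distance = classifier.classify(("to", "sneak", "the", "guard"))
print(best, distribution)
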
Example #20
class skTiMBL(BaseEstimator, ClassifierMixin):
    def __init__(self, prefix='timbl', algorithm=4, dist_metric=None,
                 k=1,  normalize=False, debug=0, flushdir=None):
        self.prefix = prefix
        self.algorithm = algorithm
        self.dist_metric = dist_metric
        self.k = k
        self.normalize = normalize
        self.debug = debug
        self.flushdir = flushdir


    def _make_timbl_options(self, *options):
        """
        -a algorithm
        -m metric
        -w weighting
        -k amount of neighbours
        -d class voting weights
        -L frequency threshold
        -T which feature index is label
        -N max number of features
        -H turn hashing on/off

        This function has not been implemented yet; for now the appropriate
        arguments can be passed directly in fit().
        """
        pass


    def fit(self, X, y):
        X, y = check_X_y(X, y, dtype=np.int64, accept_sparse='csr')

        n_rows = X.shape[0]
        self.classes_ = np.unique(y)

        if sp.sparse.issparse(X):
            if self.debug: print('Features are sparse, choosing faster learning')

            self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm,self.k, X.shape[1]),
                                              format='Sparse', debug=True, sklearn=True, flushdir=self.flushdir,
                                              flushthreshold=20000, normalize=self.normalize)

            for i in range(n_rows):
                #Timbl's sparse format: 1-based (featureindex,value) pairs
                sparse = ['({},{})'.format(col+1, val) for col, val in zip(X[i].indices, X[i].data)]
                self.classifier.append(sparse,str(y[i]))

        else:

            self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1]),
                                              debug=True, sklearn=True, flushdir=self.flushdir, flushthreshold=20000,
                                              normalize=self.normalize)

            if y.dtype != 'O':
                y = y.astype(str)

            for i in range(n_rows):
                self.classifier.append(list(X[i]), y[i])  #X is a dense ndarray here, so no .toarray()

        self.classifier.train()
        return self


    def _timbl_predictions(self, X, part_index, y=None):
        #the lambdas close over `label` and `distance`, which are rebound on every
        #iteration of the loops below before func(pred) is called
        choices = {0 : lambda x : x.append(np.int64(label)),
                   1 : lambda x : x.append([np.float64(distance)]),
                  }
        X = check_array(X, dtype=np.float64, accept_sparse='csr')

        n_samples = X.shape[0]

        pred = []
        func = choices[part_index]
        if sp.sparse.issparse(X):
            if self.debug: print('Features are sparse, choosing faster predictions')

            for i in range(n_samples):
                sparse = ['({},{})'.format(col+1, val) for col, val in zip(X[i].indices, X[i].data)]
                label,proba, distance = self.classifier.classify(sparse)
                func(pred)

        else:
            for i in range(n_samples):
                label,proba, distance = self.classifier.classify(list(X[i]))  #dense ndarray row
                func(pred)

        return np.array(pred)



    def predict(self, X, y=None):
        return self._timbl_predictions(X, part_index=0)


    def predict_proba(self, X, y=None):
        """
        TIMBL is a discrete classifier. It cannot give probability estimations.
        To ensure that scikit-learn functions with TIMBL (and especially metrics
        such as ROC_AUC), this method is implemented.

        For ROC_AUC, the classifier corresponds to a single point in ROC space,
        instead of a probabilistic continuum such as classifiers that can give
        a probability estimation (e.g. Linear classifiers). For an explanation,
        see Fawcett (2005).
        """
        return self.predict(X)


    def decision_function(self, X, y=None):
        """
        The decision function is interpreted here as being the distance between
        the instance that is being classified and the nearest point in k space.
        """
        return self._timbl_predictions(X, part_index=1)
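
A hedged usage sketch for the wrapper above on a dense toy matrix, assuming Timbl and python-timbl are installed; the data and prefix are illustrative:

import numpy as np

#toy dense data; real feature values would be class-encoded tokens
X = np.array([[1, 0, 2],
              [0, 1, 3],
              [1, 1, 0],
              [0, 0, 3]], dtype=np.int64)
y = np.array([0, 1, 0, 1])

clf = skTiMBL(prefix='demo', algorithm=4, k=1)
clf.fit(X, y)
print(clf.predict(X))           #class labels, via part_index=0
print(clf.decision_function(X)) #nearest-neighbour distances, via part_index=1
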
Example #21
class TIMBLSuffixConfusibleModule(Module):
    """The Suffix Confusible module is capable of disambiguating suffixes on words. The suffixes are passes to the ``suffixes`` settings (a list of string). All words using these suffixes above a certain threshold (``freqthtreshold``) will be found at training time and disambiguated using context. The module is implemented using Timbl.

    Settings:
    * ``suffixes``     - List of suffixes (strings) that form a single set of confusibles. (changing this requires retraining)
    * ``freqthreshold``- Only consider words with a suffix that occur at least this many times (changing this requires retraining)
    * ``maxratio``     - Maximum ratio expressing the maximally allowed frequency difference between the confusibles (value > 1, 0 = no limit) (changing this requires retraining)
    * ``minlength``    - Only consider words with a suffix that are at least this long (in characters) (changing this requires retraining)
    * ``maxlength``    - Only consider words with a suffix that are at most this long (in characters) (changing this requires retraining)
    * ``leftcontext``  - Left context size (in words) for the feature vector (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (changing this requires retraining)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree) (changing this requires retraining)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusible)
    * ``threshold``    - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc``       - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a list of confusibles [``.lst``]
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.settings['threshold'] = 0.8
        if 'minocc' not in self.settings:
            self.settings['minocc'] = 5

        self.hapaxer = gethapaxer(self, self.settings)


        if 'suffixes' not in self.settings:
            raise Exception("No suffixes specified for " + self.id + "!")
        self.suffixes = sorted(self.settings['suffixes'], key= lambda x: -1* len(x))  #sort from long to short

        #settings for computation of confusible list
        if 'freqthreshold' not in self.settings:
            self.settings['freqthreshold'] = 20
        if 'maxlength' not in self.settings:
            self.settings['maxlength'] = 25 #longer words will be ignored
        if 'minlength' not in self.settings:
            self.settings['minlength'] = 3 #shorter words will be ignored
        if 'maxratio' not in self.settings:
            self.settings['maxratio'] = 0 #no limit

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False


        ibasefound = lstfound = False
        for filename in self.models:
            if filename.endswith('.ibase'):
                ibasefound = True
                self.modelfile = filename
            elif filename.endswith('.lst'):
                lstfound = True
                self.confusiblefile = filename

        if not ibasefound:
            raise Exception("TIMBL models must have the extension .ibase; no model file was supplied with that extension")
        if not lstfound:
            raise Exception("Specify a model file with extension lst that will store all confusibles found")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")


        self.confusibles = []#pylint: disable=attribute-defined-outside-init

        self.log("Loading models...")
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: "  + self.confusiblefile + ". Did you forget to train the system?")
        with open(self.confusiblefile,'r',encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)
        if not os.path.exists(self.modelfile):
            raise IOError("Missing expected model file: " + self.modelfile + ". Did you forget to train the system?")
        self.log("Loading Timbl model file " + self.modelfile + "...")
        fileprefix = self.modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False) #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def clientload(self):
        self.log("Loading models (for client)...")
        self.confusibles = []#pylint: disable=attribute-defined-outside-init
        if not os.path.exists(self.confusiblefile):
            raise IOError("Missing expected confusible file: "  + self.confusiblefile + ". Did you forget to train the system?")
        with open(self.confusiblefile,'r',encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.confusibles.append(line)

    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log("Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):

                self.log("Building class file")
                classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength']) #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])


            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1) #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)


            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = [] #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                    continue #skip this pattern rather than silently reusing the previous one
                for suffix in self.suffixes:
                    if pattern_s.endswith(suffix) and not pattern_s in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(otherpattern_s,False,False)
                                except KeyError:
                                    if found: found = []
                                    break
                                if not otherpattern in model:
                                    if found: found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio > self.settings['maxratio']: #frequency difference larger than allowed
                                        if found: found = []
                                        break
                                found.append(otherpattern_s )
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile,'w',encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")

        elif modelfile == self.modelfile:
            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8', errors='ignore') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l+1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append( leftcontext + (normalized,) + rightcontext , suffix )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()


    def getsuffix(self, confusible):
        assert isinstance(confusible, str)
        suffix = None
        for candidate in self.suffixes: #suffixes are sorted from long to short
            if confusible.endswith(candidate):
                suffix = candidate
                break
        if suffix is None:
            raise ValueError("No suffix found!")
        return suffix, confusible[:-len(suffix)] + self.suffixes[0]  #suffix, normalized



    def classify(self, features):
        if self.hapaxer: features = self.hapaxer(features)
        best,distribution,_ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        if sumweights < self.settings['minocc']:
            return best, []
        distribution = { sug: weight/sumweights for sug,weight in distribution.items() if weight/sumweights >= self.settings['threshold'] }
        if self.debug: self.log("(Returning " + str(len(distribution)) + " suggestions after filtering)")
        return (best,distribution)

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([ str(w) for w in word.leftcontext(self.settings['leftcontext'],"<begin>") ])
        _, normalized = self.getsuffix(word.text())
        rightcontext = tuple([ str(w) for w in word.rightcontext(self.settings['rightcontext'],"<end>") ])
        return leftcontext + (normalized,) + rightcontext


    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _,features = inputdata
        best,distribution = self.classify(features)
        return (best,distribution)

    def processoutput(self, output, inputdata, unit_id,**parameters):
        wordstr,_ = inputdata
        best,distribution = output
        suffix,_ = self.getsuffix(wordstr)
        if wordstr != wordstr[:-len(suffix)] + best:
            return self.addsuggestions(unit_id, [ (wordstr[:-len(suffix)] + suggestion,p) for suggestion,p in distribution.items() if suggestion != suffix] )
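
getsuffix() maps every confusible onto one canonical form by swapping its suffix for the first (longest) suffix in the list, so all members of a confusible set share a single feature value. A standalone sketch with an illustrative Dutch d/t/dt suffix set (not taken from the source):

suffixes = sorted(["dt", "d", "t"], key=lambda x: -len(x))  #long to short, as in verifysettings()

def getsuffix(word):
    suffix = next((s for s in suffixes if word.endswith(s)), None)
    if suffix is None:
        raise ValueError("No suffix found!")
    return suffix, word[:-len(suffix)] + suffixes[0]  #(suffix, normalized form)

print(getsuffix("gebeurt"))   #-> ('t', 'gebeurdt')
print(getsuffix("gebeurd"))   #-> ('d', 'gebeurdt'), same normalized form
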
Example #22
class TIMBLPuncRecaseModule(Module):
    """This is a memory-based classification module, implemented using Timbl, that predicts where punctuation needs to be inserted, deleted, and whether a word needs to be written with an initial capital. 

    Settings:
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``deletionthreshold`` - If no punctuation insertion is predicted and this confidence threshold is reached, then a deletion will be predicted (should be a high number), default: 0.95
    * ``insertionthreshold`` - Necessary confidence threshold to predict an insertion of punctuation (default: 0.5)

    Sources and models: 
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    """

    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'missingpunctuation' #will be overridden later

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 2

        if 'deletionthreshold' not in self.settings:
            self.settings['deletionthreshold'] = 0.95

        if 'insertionthreshold' not in self.settings:
            self.settings['insertionthreshold'] = 0.5

        if 'capitalizationthreshold' not in self.settings:
            self.settings['capitalizationthreshold'] = 0.5

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        self.hapaxer = gethapaxer(self, self.settings)


        if len(self.models) != 1:
            raise Exception("Expected exactly one model, got " + str(len(self.models)))
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension .ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        self.classifier.load()


    def addtraininstance(self,classifier, buffer,l,r):
        """Helper function"""
        focusword, cased, punc = buffer[l]
        cls = punc
        if cased:
            cls += 'C'
        if not cls:
            cls = '-'
        if self.hapaxer:
            features = [w for w,_,_ in buffer]
            features = [w.lower() for w in self.hapaxer(features[:l]) + (features[l],) + self.hapaxer(features[l+1:])] #hapax the context only, keep the focus word itself
        else:
            features = [w.lower() for w,_,_ in buffer]
        classifier.append( tuple(features) , cls )
        return buffer[1:]

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        prevword = ""
        #buffer = [("<begin>",False,'')] * l
        buffer = []
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8',errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                words = [ w.strip() for w in line.split(' ') if w.strip() ]
                for word in words:
                    if prevword in PUNCTUATION:
                        punc = prevword
                    else:
                        punc = ""
                    if any(  c.isalpha() for c in word  ):
                        buffer.append( (word, word == word[0].upper() + word[1:].lower(), punc ) )
                    if len(buffer) == l + r + 1:
                        buffer = self.addtraininstance(classifier, buffer,l,r)
                    prevword = word
        #for i in range(0,r):
        #    buffer.append( ("<end>",False,'') )
        #    if len(buffer) == l + r + 1:
        #        buffer = self.addtraininstance(classifier, buffer,l,r)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()


    def classify(self, word):
        features = self.getfeatures(word)
        if self.hapaxer: features = self.hapaxer(features)
        best, distribution,_ = self.classifier.classify(features)
        return best, distribution


    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']

        leftcontext = []
        currentword = word
        while len(leftcontext) < l:
            prevword = currentword.previous(folia.Word,None)
            if prevword:
                w = prevword.text().lower()
                if w.isalnum():
                    leftcontext.insert(0, w )
                currentword = prevword
            else:
                leftcontext.insert(0, "<begin>")

        rightcontext = []
        currentword = word
        while len(rightcontext) < r:
            nextword = currentword.next(folia.Word,None)
            if nextword:
                w = nextword.text().lower()
                if w.isalnum():
                    rightcontext.append(w )
                currentword = nextword
            else:
                rightcontext.append("<end>")

        return leftcontext + [word.text().lower()] + rightcontext





    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        if not any( c.isalnum() for c in wordstr):
            #this is punctuation, skip
            return None
        prevword = word.previous(folia.Word,None)
        if prevword:
            prevwordstr = str(prevword)
            prevword_id = prevword.id
        else:
            prevwordstr = ""
            prevword_id = ""
        features = self.getfeatures(word)
        return wordstr, prevwordstr, prevword_id,features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        wordstr,prevword,prevword_id, features = inputdata
        if self.debug:
            self.log(" (Processing word " + wordstr + ", features: " + repr(features) + ")")
        if self.hapaxer: features = self.hapaxer(features)
        best,distribution,_ = self.classifier.classify(features)
        if self.debug:
            self.log(" (Best: "  + best + ")")
        return [best,distribution]

    def processoutput(self, outputdata, inputdata, unit_id,**parameters):
        queries = []
        wordstr,prevword,prevword_id, _ = inputdata
        cls, distribution = outputdata

        recase = False

        if cls[-1] == 'C':
            if wordstr[0] == wordstr[0].lower():
                if distribution[cls] >= self.settings['capitalizationthreshold']:
                    recase = True
                elif self.debug:
                    self.log(" (Capitalization threshold not reached: " + str(distribution[cls]) + ")")
            cls = cls[:-1]


        if cls == '-':
            if prevword and distribution[cls] >= self.settings['deletionthreshold'] and all( not c.isalpha() for c in  prevword ):
                if self.debug:
                    self.log(" (Redundant punctuation " + cls + " with threshold " + str(distribution[cls]) + ")")
                queries.append( self.suggestdeletion(prevword_id,(prevword in EOSMARKERS), cls='redundantpunctuation') )
        elif cls and cls in distribution:
            #insertion of punctuation
            if distribution[cls] >= self.settings['insertionthreshold']:
                if all(not c.isalnum() for c in prevword):
                    #previous word is punctuation already
                    if prevword != cls:
                        self.log(" (Found punctuation confusion)")
                        queries.append( self.addsuggestions(prevword_id,cls, cls='confusion') )
                    else:
                        recase = False #no punctuation insertion? then no recasing either
                        if self.debug: self.log(" (Predicted punctuation already there, good, ignoring)")
                else:
                    if self.debug: self.log(" (Insertion " + cls + " with threshold " + str(distribution[cls]) + ")")
                    queries.append( self.suggestinsertion(unit_id, cls, (cls in EOSMARKERS) ) )
            else:
                recase = False #no punctuation insertion? then no recasing either
                if self.debug: self.log(" (Insertion threshold not reached: " + str(distribution[cls]) + ")")

        if recase and wordstr[0].isalpha():
            #recase word
            t = wordstr[0].upper() + wordstr[1:]
            if self.debug:
                self.log(" (Correcting capitalization for " + wordstr + ")")
            queries.append( self.addsuggestions( unit_id, [t], cls='capitalizationerror') )

        return queries
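
The class labels that addtraininstance() emits and processoutput() decodes pack two decisions into one string: the punctuation (if any) that should precede the focus word, plus a trailing 'C' if the focus word should be capitalized; '-' means neither. A small sketch of that encoding (encode_class is an illustrative name):

def encode_class(punc, cased):
    #mirrors the label construction in addtraininstance() above
    cls = punc
    if cased:
        cls += 'C'
    return cls if cls else '-'

print(encode_class('', False))  #-> '-'   nothing to do
print(encode_class('', True))   #-> 'C'   capitalize only
print(encode_class('.', False)) #-> '.'   insert a period before the word
print(encode_class('.', True))  #-> '.C'  insert a period and capitalize
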
Example #23
class TIMBLLMModule(Module):
    """The Language Model predicts words given their context (including right context). It uses a classifier-based approach.

    Settings:
    * ``threshold``    - Prediction confidence threshold, only when a prediction exceeds this threshold will it be recommended (default: 0.9, value must be higher than 0.5 by definition)
    * ``freqthreshold`` - If the previous word occurs below this threshold, then no classification will take place. Only has an effect when a lexicon is enabled (default: 2)
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``maxdistance``  - Maximum Levenshtein distance between a word and its correction (larger distances are pruned from suggestions)
    * ``minlength``    - Minimum length (in characters) for a word to be considered by the LM module
    * ``probfactor``   - If the predicted word is in the target distribution, any suggestions must be more probable by this factor (default: 10)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusion) 

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    * optional: a plain-text corpus (tokenized)  [``.txt``]     ->    a lexicon model [``.colibri.patternmodel``]

    Hapaxer: This module supports hapaxing
    Caching: This module supports caching
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.threshold = 0.9
        else:
            self.threshold = self.settings['threshold']

        if 'freqthreshold' not in self.settings:
            self.freqthreshold = 2
        else:
            self.freqthreshold = self.settings['freqthreshold']

        if 'minlength' not in self.settings:
            self.minlength = 5
        else:
            self.minlength = self.settings['minlength']

        if 'probfactor' not in self.settings:
            self.probfactor = 10
        else:
            self.probfactor = self.settings['probfactor']


        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2


        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False


        self.hapaxer = gethapaxer(self, self.settings)

        self.cache = getcache(self.settings, 1000)

        if not 1 <= len(self.models) <= 2:
            raise Exception("Expected one or two models, the first a TIMBL instance base, and the optional second a colibri patternmodel, got " + str(len(self.models)))
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("First model must be a TIMBL instance base model, which must have the extension '.ibase', got " + modelfile + " instead")
        if len(self.models) > 1:
            lexiconfile = self.models[1]
            if not lexiconfile.endswith("colibri.patternmodel"):
                raise Exception("Second model must be a Colibri pattern model, which must have the extension '.colibri.patternmodel', got " + lexiconfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        self.log("Loading models...")
        if len(self.models) == 2:
            modelfile, lexiconfile = self.models
        else:
            modelfile = self.models[0]
            lexiconfile = None
        if not os.path.exists(modelfile):
            raise IOError("Missing expected timbl model file: " + modelfile + ". Did you forget to train the system?")
        if lexiconfile and not os.path.exists(lexiconfile):
            raise IOError("Missing expected lexicon model file: " + lexiconfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(),threading=True, debug=self.debug) 
        self.classifier.load()

        if lexiconfile:
            self.log("Loading colibri model file for lexicon " + lexiconfile)
            self.classencoder = colibricore.ClassEncoder(lexiconfile + '.cls')
            self.lexicon = colibricore.UnindexedPatternModel(lexiconfile)
        else:
            self.lexicon = None

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[l]
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , focus )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile+'.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating pattern model")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)


    def getfeatures(self, word):
        """Get features at testing time"""
        leftcontext = tuple([ str(w) for w in word.leftcontext(self.settings['leftcontext'],"<begin>") ])
        rightcontext = tuple([ str(w) for w in word.rightcontext(self.settings['rightcontext'],"<end>") ])
        return leftcontext + rightcontext


    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        if len(wordstr) > self.minlength:
            features = self.getfeatures(word)
            return wordstr, features

    def processoutput(self, outputdata, inputdata, unit_id,**parameters):
        wordstr,_ = inputdata
        if wordstr is not None:
            best,distribution = outputdata
            if best != wordstr and distribution:
                return self.addsuggestions(unit_id, distribution)

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        if self.debug:
            begintime = time.time()

        wordstr = inputdata[0]
        features = tuple(inputdata[1])
        if self.debug: 
            self.log(" (Processing word " + wordstr + ", features: " + repr(features) + ")")

        if self.hapaxer: 
            features = self.hapaxer(features) #pylint: disable=not-callable
            previousword = features[self.settings['leftcontext'] - 1]
            if previousword == self.hapaxer.placeholder:
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Previous word not in hapaxer, returned in   " + str(duration) + "s)")
                return None,None

        if self.cache is not None:
            try:
                cached = self.cache[features]
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Return from cache in   " + str(duration) + "s)")
                return cached
            except KeyError:
                pass

        if self.lexicon:
            #ensure the previous word exists
            previousword = features[self.settings['leftcontext'] - 1]
            pattern = self.classencoder.buildpattern(previousword)
            if pattern.unknown() or pattern not in self.lexicon:
                if self.debug:
                    duration = round(time.time() - begintime,4)
                    self.log(" (Previous word not in lexicon, returned in   " + str(duration) + "s)")
                return None,None
                #if self.settings['rightcontext']:
                #    nextword = features[self.settings['leftcontext']]
                #    pattern = self.classencoder.buildpattern(nextword)
                #    if pattern.unknown() or pattern not in self.lexicon:
                #        return None,None
                #else:
                #    return None,None



        best,distribution,_ = self.classifier.classify(features,allowtopdistribution=False) 
        if self.debug:
            duration = round(time.time() - begintime,4)
            self.log(" (Classification took  " + str(duration) + "s, unfiltered distribution size=" + str(len(distribution)) + ")")

        l = len(wordstr)
        if self.settings['maxdistance']:
            #filter suggestions that are too distant
            if self.debug:
                begintime = time.time()
            dist = {}
            for key, freq in distribution.items():
                if freq >= self.threshold and abs(l - len(key)) <= self.settings['maxdistance'] and Levenshtein.distance(wordstr,key) <= self.settings['maxdistance']:
                    dist[key] = freq
            if wordstr in dist:
                #typed word is part of distribution, are any of the candidates far more likely?
                basefreq = dist[wordstr]
                dist = { key: freq for key, freq in dist.items() if key == wordstr or freq > basefreq * self.probfactor }
                if len(dist) == 1:
                    #no correction necessary
                    return None, None
            if self.debug:
                duration = round(time.time() - begintime,4)
                self.log(" (Levenshtein filtering took  " + str(duration) + "s, final distribution size=" + str(len(dist)) + ")")
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
        else:
            dist = [ x for x in distribution.items() if x[1] >= self.threshold ]
            if self.cache is not None:
                self.cache.append(features, (best,dist))
            return best, dist
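    # Worked example of the maxdistance/probfactor filter above (all numbers
    # illustrative): suppose wordstr='teh', self.threshold=2, maxdistance=2,
    # self.probfactor=2 and the classifier returns {'the': 90, 'teh': 10,
    # 'ten': 15}. All three pass the frequency and Levenshtein checks; since
    # 'teh' itself occurs (basefreq=10), only candidates with freq > 20 survive
    # alongside it, leaving {'the': 90, 'teh': 10}. More than one entry remains,
    # so the distribution is cached and returned as a correction suggestion.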
Example #24
class TIMBLWordConfusibleModule(Module):
    """The Word Confusible module is capable of disambiguating two or more words that are often confused, by looking at their context.
    The module is implemented using memory-based classifiers in Timbl.

    Settings:
    * ``confusibles``  - List of words (strings) that form a single set of confusibles.
    * ``leftcontext``  - Left context size (in words) for the feature vector (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (changing this requires retraining)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree, changing this requires retraining)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusible)
    * ``threshold``    - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc``       - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.settings['threshold'] = 0.8

        if 'minocc' not in self.settings:
            self.settings['minocc'] = 5

        self.hapaxer = gethapaxer(self, self.settings)

        if 'confusibles' not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings['confusibles']

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        try:
            modelfile = self.models[0]
        except IndexError:
            raise Exception("Expected exactly one model, got none")
        if not modelfile.endswith(".ibase"):
            raise Exception(
                "TIMBL models must have the extension ibase, got " +
                modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(
            self.settings['algorithm']) + " +D +vdb -G0"
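        # Option string breakdown (summarized; consult the Timbl manual for exact
        # semantics): -F Tabbed = tab-separated instances, -a = algorithm number,
        # +D/+vdb = store and report class distributions, -G0 = distribution
        # normalization.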

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile +
                          ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        self.classifier = TimblClassifier(fileprefix,
                                          self.gettimbloptions(),
                                          normalize=False)  #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase",
                                       "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,
                           mode='rt',
                           encoding='utf-8',
                           errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
                        " - " + str(i),
                        file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings['confusibles']:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        classifier.append(leftcontext + rightcontext,
                                          confusible)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
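    # Instance-generation sketch (illustrative): with leftcontext=2, rightcontext=2
    # and confusibles ['then', 'than'], the line "I knew then that it" yields a
    # 5-gram window whose focus is 'then', producing the training instance
    # ('I', 'knew', 'that', 'it') -> class 'then'.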

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([
            str(w)
            for w in word.leftcontext(self.settings['leftcontext'], "<begin>")
        ])
        rightcontext = tuple([
            str(w)
            for w in word.rightcontext(self.settings['rightcontext'], "<end>")
        ])
        return leftcontext + rightcontext

    def classify(self, features):
        if self.hapaxer: features = self.hapaxer(features)
        best, distribution, _ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        if self.debug:
            self.log("(Classified " + repr(features) + ", best=" + best +
                     ", sumweights=" + str(sumweights) + ", distribution=" +
                     repr(distribution) + ")")
        if sumweights < self.settings['minocc']:
            if self.debug: self.log("(Not passing minocc threshold)")
            return best, []
        distribution = {
            sug: weight / sumweights
            for sug, weight in distribution.items()
            if weight / sumweights >= self.settings['threshold']
        }
        if self.debug:
            self.log("(Returning " + str(len(distribution)) +
                     " suggestions after filtering)")
        return best, distribution
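    # Worked example (illustrative): a raw distribution {'then': 8.0, 'than': 2.0}
    # gives sumweights=10.0, which passes minocc=5; with threshold=0.8 only 'then'
    # (8.0/10.0 = 0.8) survives the filter, so ('then', {'then': 0.8}) is returned.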

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)  #will be reused in processoutput
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution = self.classify(features)
        return (best, distribution)

    def processoutput(self, output, inputdata, unit_id, **parameters):
        wordstr, _ = inputdata
        best, distribution = output
        if best and best != wordstr and distribution:
            return self.addsuggestions(unit_id, list(distribution.items()))
Example #25
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[l]
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , focus )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile+'.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating pattern model")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)
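            # Lookup sketch (illustrative, not part of the original example): the
            # model written above is typically consulted the way the lexicon check
            # earlier in this document does it:
            #   classencoder = colibricore.ClassEncoder(modelfile + '.cls')
            #   lexicon = colibricore.UnindexedPatternModel(modelfile)
            #   pattern = classencoder.buildpattern('someword')
            #   known = not pattern.unknown() and pattern in lexicon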
Example #26
class TIMBLWordConfusibleModule(Module):
    """The Word Confusible module is capable of disambiguating two or more words that are often confused, by looking at their context.
    The module is implemented using memory-based classifiers in Timbl.

    Settings:
    * ``confusibles``  - List of words (strings) that form a single set of confusibles.
    * ``leftcontext``  - Left context size (in words) for the feature vector
    * ``rightcontext`` - Right context size (in words) for the feature vector
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusible)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]
    """

    UNIT = folia.Word

    def verifysettings(self):
        if "class" not in self.settings:
            self.settings["class"] = "confusible"

        super().verifysettings()

        if "algorithm" not in self.settings:
            self.settings["algorithm"] = 1

        if "leftcontext" not in self.settings:
            self.settings["leftcontext"] = 3

        if "rightcontext" not in self.settings:
            self.settings["rightcontext"] = 3

        self.hapaxer = gethapaxer(self.settings)

        if "confusibles" not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings["confusibles"]

        try:
            modelfile = self.models[0]
        except IndexError:
            raise Exception("Expected exactly one model, got none")
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings["algorithm"]) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
        self.classifier = TimblClassifier(
            fileprefix, self.gettimbloptions()
        )  # pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings["leftcontext"]
        r = self.settings["rightcontext"]
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode="rt", encoding="utf-8", errors="ignore") as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings["confusibles"]:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1 :])
                        classifier.append(leftcontext + rightcontext, confusible)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([str(w) for w in word.leftcontext(self.settings["leftcontext"], "<begin>")])
        rightcontext = tuple([str(w) for w in word.rightcontext(self.settings["rightcontext"], "<end>")])
        return leftcontext + rightcontext

    def prepareinput(self, word, **parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word)  # will be reused in processoutput
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            if self.hapaxer:
                features = self.hapaxer(features)
            return wordstr, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution, _ = self.classifier.classify(features)
        return (best, distribution)

    def processoutput(self, output, inputdata, unit_id, **parameters):
        wordstr, _ = inputdata
        best, distribution = output
        if best and best != wordstr and distribution:
            return self.addsuggestions(unit_id, list(distribution.items()))
Example #27
class TIMBLWordConfusibleModule(Module):
    """The Word Confusible module is capable of disambiguating two or more words that are often confused, by looking at their context.
    The module is implemented using memory-based classifiers in Timbl.

    Settings:
    * ``confusibles``  - List of words (strings) that form a single set of confusibles.
    * ``leftcontext``  - Left context size (in words) for the feature vector (changing this requires retraining)
    * ``rightcontext`` - Right context size (in words) for the feature vector (changing this requires retraining)
    * ``algorithm``    - The Timbl algorithm to use (see -a parameter in timbl) (default: IGTree, changing this requires retraining)
    * ``class``        - Errors found by this module will be assigned the specified class in the resulting FoLiA output (default: confusible)
    * ``threshold``    - The probability threshold that classifier options must attain to be passed on as suggestions. (default: 0.8)
    * ``minocc``       - The minimum number of occurrences (sum of all class weights) (default: 5)

    Sources and models:
    * a plain-text corpus (tokenized)  [``.txt``]     ->    a classifier instance base model [``.ibase``]

    Hapaxer: This module supports hapaxing
    """
    UNIT = folia.Word
    UNITFILTER = nonumbers

    def verifysettings(self):
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        if 'threshold' not in self.settings:
            self.settings['threshold'] = 0.8

        if 'minocc' not in self.settings:
            self.settings['minocc'] = 5

        self.hapaxer = gethapaxer(self, self.settings)

        if 'confusibles' not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings['confusibles']

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        try:
            modelfile = self.models[0]
        except IndexError:
            raise Exception("Expected exactly one model, got none")
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")

    def gettimbloptions(self):
        return "-F Tabbed " + "-a " + str(self.settings['algorithm']) + " +D +vdb -G0"

    def load(self):
        """Load the requested modules from self.models"""
        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), normalize=False) #pylint: disable=attribute-defined-outside-init
        self.classifier.load()

    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile,mode='rt',encoding='utf-8',errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings['confusibles']:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , confusible )

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()


    def getfeatures(self, word):
        """Get features at testing time, crosses sentence boundaries"""
        leftcontext = tuple([ str(w) for w in word.leftcontext(self.settings['leftcontext'],"<begin>") ])
        rightcontext = tuple([ str(w) for w in word.rightcontext(self.settings['rightcontext'],"<end>") ])
        return leftcontext + rightcontext


    def classify(self, features):
        if self.hapaxer: features = self.hapaxer(features)
        best,distribution,_ = self.classifier.classify(features)
        sumweights = sum(distribution.values())
        if self.debug: self.log("(Classified " + repr(features) + ", best=" + best + ", sumweights=" + str(sumweights) + ", distribution=" + repr(distribution) + ")")
        if sumweights < self.settings['minocc']:
            if self.debug: self.log("(Not passing minocc threshold)")
            return best, []
        distribution = { sug: weight/sumweights for sug,weight in distribution.items() if weight/sumweights >= self.settings['threshold'] }
        if self.debug: self.log("(Returning " + str(len(distribution)) + " suggestions after filtering)")
        return best,distribution

    def prepareinput(self,word,**parameters):
        """Takes the specified FoLiA unit for the module, and returns a string that can be passed to process()"""
        wordstr = str(word) #will be reused in processoutput
        if wordstr in self.confusibles:
            features = self.getfeatures(word)
            return wordstr, features

    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        _, features = inputdata
        best, distribution = self.classify(features)
        return (best,distribution)

    def processoutput(self, output, inputdata, unit_id,**parameters):
        wordstr, _  = inputdata
        best,distribution = output
        if best and best != wordstr and distribution:
            return self.addsuggestions(unit_id, list(distribution.items()))
Example #28
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[l]
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , focus )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile+'.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating pattern model")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)
Example #29
    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log("Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength']) #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])


            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1) #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)


            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = [] #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                    continue
                for suffix in self.suffixes:
                    if pattern_s.endswith(suffix) and pattern_s not in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(otherpattern_s,False,False)
                                except KeyError:
                                    if found: found = []
                                    break
                                if otherpattern not in model:
                                    if found: found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio < self.settings['maxratio']:
                                        if found: found = []
                                        break
                                found.append(otherpattern_s)
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile,'w',encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")
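            # Discovery sketch (illustrative): with suffixes ('d', 't') and a corpus
            # in which both 'word' and 'wort' clear the frequency threshold and the
            # ratio test, both spellings are appended to self.confusibles; if any
            # expected counterpart is absent from the model, 'found' is reset and
            # the candidate is skipped.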

        elif modelfile == self.modelfile:
            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8', errors='ignore') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l+1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append( leftcontext + (normalized,) + rightcontext , suffix )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
Example #30
from timbl import TimblClassifier

classifier = TimblClassifier('test','-a 0 -k 1 +vk')

classifier.append( ('dit','is','een'), 'idee')
classifier.append( ('dat','was','geen'), 'doen')

classifier.train()

r = classifier.classify(('dit','was','geen'))
print(r)
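# Notes on the snippet above: '-a 0' selects Timbl's IB1 algorithm and '-k 1' a
# single nearest neighbour; '+vk' adds neighbour-level verbosity (see the Timbl
# manual). classify() returns a (best class, class distribution, distance) tuple,
# which is how the module code above unpacks it.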