Example #1
0
    def verifysettings(self):
        """Apply default settings and pick out the model files.

        Settings that are absent from ``self.settings`` are filled in with
        their defaults; ``suffixes`` is mandatory. The ``.ibase`` and ``.lst``
        entries of ``self.models`` are stored as ``self.modelfile`` and
        ``self.confusiblefile`` respectively.

        Raises:
            Exception: when ``suffixes`` is not configured, or when
                ``self.models`` holds no ``.ibase`` file or no ``.lst`` file.
        """
        # The class default has to be in place before the parent runs.
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        # Defaults that must be present before the hapaxer is built.
        for option, fallback in (
                ('algorithm', 1),
                ('leftcontext', 3),
                ('rightcontext', 3),
                ('threshold', 0.8),
                ('minocc', 5),
        ):
            if option not in self.settings:
                self.settings[option] = fallback

        self.hapaxer = gethapaxer(self, self.settings)

        if 'suffixes' not in self.settings:
            raise Exception("No suffixes specified for " + self.id + "!")
        # Longest suffix first; the sort is stable, so ties keep their order.
        self.suffixes = sorted(self.settings['suffixes'], key=len, reverse=True)

        # Settings for computation of the confusible list.
        for option, fallback in (
                ('freqthreshold', 20),
                ('maxlength', 25),  # longer words will be ignored
                ('minlength', 3),  # shorter words will be ignored
                ('maxratio', 0),  # no limit
        ):
            if option not in self.settings:
                self.settings[option] = fallback

        self.debug = bool(self.settings['debug']) if 'debug' in self.settings else False

        ibasefound = False
        lstfound = False
        for path in self.models:
            if path.endswith('.ibase'):
                self.modelfile = path
                ibasefound = True
            elif path.endswith('.lst'):
                self.confusiblefile = path
                lstfound = True

        if not ibasefound:
            raise Exception(
                "TIMBL models must have the extension ibase, not model file was supplies with that extension"
            )
        if not lstfound:
            raise Exception(
                "Specify a model file with extension lst that will store all confusibles found"
            )
Example #2
0
    def verifysettings(self):
        """Apply default settings and validate the configured models.

        Missing entries in ``self.settings`` are filled in with defaults.
        ``threshold``, ``freqthreshold``, ``minlength`` and ``probfactor`` are
        exposed as instance attributes (configured value or default) without
        being written back to ``self.settings``. The hapaxer and cache are
        created through ``gethapaxer`` and ``getcache``.

        Raises:
            Exception: when no model is configured, when the first model does
                not end in ``.ibase``, or when a second model is given that
                does not end in ``colibri.patternmodel``.
        """
        # The class default has to be in place before the parent runs.
        self.settings.setdefault('class', 'confusion')

        super().verifysettings()

        self.settings.setdefault('algorithm', 1)
        self.settings.setdefault('leftcontext', 3)
        self.settings.setdefault('rightcontext', 3)

        self.threshold = self.settings.get('threshold', 0.9)
        self.freqthreshold = self.settings.get('freqthreshold', 2)
        self.minlength = self.settings.get('minlength', 5)
        self.probfactor = self.settings.get('probfactor', 10)

        self.settings.setdefault('maxdistance', 2)

        self.debug = bool(self.settings.get('debug', False))

        self.hapaxer = gethapaxer(self, self.settings)

        self.cache = getcache(self.settings, 1000)

        # No blanket try/except here: each failure must reach the caller with
        # its own, specific message.
        if not self.models:
            raise Exception("Expected one or two models, the first a TIMBL instance base, and the optional second a colibri patternmodel, got 0")

        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("First model must be a TIMBL instance base model, which must have the extension '.ibase', got " + modelfile + " instead")

        if len(self.models) > 1:
            lexiconfile = self.models[1]
            if not lexiconfile.endswith("colibri.patternmodel"):
                # Report the offending second file, not the first one.
                raise Exception("Second model must be a Colibri pattern model, which must have the extensions '.colibri.patternmodel', got " + lexiconfile + " instead")
Example #3
0
    def verifysettings(self):
        """Apply default settings and validate the configured model.

        Missing entries in ``self.settings`` are filled in with defaults, the
        hapaxer is created through ``gethapaxer``, and the mandatory
        ``confusibles`` setting is copied to ``self.confusibles``.

        Raises:
            Exception: when ``confusibles`` is not configured, when no model
                is configured, or when the first model does not end in
                ``.ibase``.
        """
        # The class default has to be in place before the parent runs.
        self.settings.setdefault("class", "confusible")

        super().verifysettings()

        self.settings.setdefault("algorithm", 1)
        self.settings.setdefault("leftcontext", 3)
        self.settings.setdefault("rightcontext", 3)

        self.hapaxer = gethapaxer(self.settings)

        if "confusibles" not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings["confusibles"]

        # Checked separately so that a wrong extension is reported as such
        # rather than as a missing model.
        if not self.models:
            raise Exception("Expected one model, got 0 or more")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")
Example #4
0
    def verifysettings(self):
        """Fill in setting defaults and check the model file names.

        Absent keys in ``self.settings`` receive their defaults. The values of
        ``threshold``, ``freqthreshold``, ``minlength`` and ``probfactor``
        (configured or default) are stored as instance attributes only.
        ``gethapaxer`` and ``getcache`` supply ``self.hapaxer`` and
        ``self.cache``.

        Raises:
            Exception: if ``self.models`` is empty, if its first entry lacks
                the ``.ibase`` extension, or if a second entry is present and
                does not end in ``colibri.patternmodel``.
        """
        # Must be set before the parent's verification runs.
        self.settings.setdefault('class', 'confusion')

        super().verifysettings()

        for key, default in (('algorithm', 1), ('leftcontext', 3), ('rightcontext', 3)):
            self.settings.setdefault(key, default)

        self.threshold = self.settings.get('threshold', 0.9)
        self.freqthreshold = self.settings.get('freqthreshold', 2)
        self.minlength = self.settings.get('minlength', 5)
        self.probfactor = self.settings.get('probfactor', 10)

        self.settings.setdefault('maxdistance', 2)

        self.debug = bool(self.settings.get('debug', False))

        self.hapaxer = gethapaxer(self, self.settings)

        self.cache = getcache(self.settings, 1000)

        # Each check raises directly; wrapping them in a catch-all handler
        # would replace the specific message with a generic one.
        if not self.models:
            raise Exception("Expected one or two models, the first a TIMBL instance base, and the optional second a colibri patternmodel, got 0")

        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("First model must be a TIMBL instance base model, which must have the extension '.ibase', got " + modelfile + " instead")

        if len(self.models) > 1:
            lexiconfile = self.models[1]
            if not lexiconfile.endswith("colibri.patternmodel"):
                # The message names the second file, which is the one at fault.
                raise Exception("Second model must be a Colibri pattern model, which must have the extensions '.colibri.patternmodel', got " + lexiconfile + " instead")
Example #5
0
    def verifysettings(self):
        """Apply default settings and validate the configured model.

        Missing entries in ``self.settings`` are filled in with defaults and
        the hapaxer is created through ``gethapaxer``.

        Raises:
            Exception: when no model is configured, or when the first model
                does not end in ``.ibase``.
        """
        self.settings.setdefault('class', 'missingpunctuation')  #will be overridden later again

        super().verifysettings()

        self.settings.setdefault('algorithm', 1)
        self.settings.setdefault('leftcontext', 3)
        self.settings.setdefault('rightcontext', 2)
        self.settings.setdefault('deletionthreshold', 0.95)
        self.settings.setdefault('insertionthreshold', 0.5)

        self.hapaxer = gethapaxer(self.settings)

        # Checked separately so that a wrong extension is reported as such
        # rather than as a missing model.
        if not self.models:
            raise Exception("Expected one model, got 0 or more")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")
Example #6
0
    def verifysettings(self):
        """Complete ``self.settings`` with defaults and find the model files.

        ``suffixes`` must be configured; every other setting handled here gets
        a default when absent. ``self.modelfile`` and ``self.confusiblefile``
        are taken from the ``.ibase`` and ``.lst`` entries of ``self.models``.

        Raises:
            Exception: if ``suffixes`` is missing, or if ``self.models`` has no
                entry ending in ``.ibase`` or none ending in ``.lst``.
        """
        # Needed by the parent's verification, so set it first.
        self.settings.setdefault('class', 'confusion')

        super().verifysettings()

        self.settings.setdefault('algorithm', 1)
        self.settings.setdefault('leftcontext', 3)
        self.settings.setdefault('rightcontext', 3)
        self.settings.setdefault('threshold', 0.8)
        self.settings.setdefault('minocc', 5)

        self.hapaxer = gethapaxer(self, self.settings)

        if 'suffixes' not in self.settings:
            raise Exception("No suffixes specified for " + self.id + "!")
        #sort from long to short
        self.suffixes = sorted(self.settings['suffixes'], key=lambda suffix: -len(suffix))

        #settings for computation of confusible list
        self.settings.setdefault('freqthreshold', 20)
        self.settings.setdefault('maxlength', 25)  #longer words will be ignored
        self.settings.setdefault('minlength', 3)  #shorter words will be ignored
        self.settings.setdefault('maxratio', 0)  #no limit

        self.debug = 'debug' in self.settings and bool(self.settings['debug'])

        ibasefound = lstfound = False
        for candidate in self.models:
            if candidate.endswith('.ibase'):
                self.modelfile, ibasefound = candidate, True
                continue
            if candidate.endswith('.lst'):
                self.confusiblefile, lstfound = candidate, True

        if not ibasefound:
            raise Exception("TIMBL models must have the extension ibase, not model file was supplies with that extension")
        if not lstfound:
            raise Exception("Specify a model file with extension lst that will store all confusibles found")
Example #7
0
    def verifysettings(self):
        """Set defaults for absent settings and identify the model files.

        ``suffixes`` is required and is stored, longest first, as
        ``self.suffixes``. The entries of ``self.models`` ending in ``.ibase``
        and ``.lst`` become ``self.modelfile`` and ``self.confusiblefile``.

        Raises:
            Exception: if ``suffixes`` is not configured, or if no ``.ibase``
                or no ``.lst`` entry is found in ``self.models``.
        """
        # Set ahead of the parent's verification.
        if "class" not in self.settings:
            self.settings["class"] = "confusible"

        super().verifysettings()

        context_defaults = {"algorithm": 1, "leftcontext": 3, "rightcontext": 3}
        for name, value in context_defaults.items():
            if name not in self.settings:
                self.settings[name] = value

        self.hapaxer = gethapaxer(self.settings)

        if "suffixes" not in self.settings:
            raise Exception("No suffixes specified for " + self.id + "!")
        ordered = list(self.settings["suffixes"])
        ordered.sort(key=lambda s: -1 * len(s))  # sort from long to short
        self.suffixes = ordered

        # settings for computation of confusible list
        confusible_defaults = {
            "freqthreshold": 20,
            "maxlength": 25,  # longer words will be ignored
            "minlength": 3,  # shorter words will be ignored
            "maxratio": 0,  # no limit
        }
        for name, value in confusible_defaults.items():
            if name not in self.settings:
                self.settings[name] = value

        ibasefound = False
        lstfound = False
        for modelpath in self.models:
            if modelpath.endswith(".ibase"):
                self.modelfile = modelpath
                ibasefound = True
            elif modelpath.endswith(".lst"):
                self.confusiblefile = modelpath
                lstfound = True

        if not ibasefound:
            raise Exception(
                "TIMBL models must have the extension ibase, not model file was supplies with that extension"
            )
        if not lstfound:
            raise Exception("Specify a model file with extension lst that will store all confusibles found")
Example #8
0
File: lm.py Project: wollmers/gecco
    def verifysettings(self):
        """Apply default settings and validate the configured model.

        Missing entries in ``self.settings`` are filled in with defaults.
        ``self.threshold`` takes the configured ``threshold`` or 0.9 when none
        is configured. The hapaxer and cache are created through
        ``gethapaxer`` and ``getcache``.

        Raises:
            Exception: when no model is configured, or when the first model
                does not end in ``.ibase``.
        """
        # The class default has to be in place before the parent runs.
        self.settings.setdefault('class', 'contexterror')

        super().verifysettings()

        self.settings.setdefault('algorithm', 1)
        self.settings.setdefault('leftcontext', 3)
        self.settings.setdefault('rightcontext', 3)

        # Use the configured value when there is one, else fall back to 0.9.
        self.threshold = self.settings.get('threshold', 0.9)

        self.settings.setdefault('maxdistance', 2)

        self.debug = bool(self.settings.get('debug', False))

        self.hapaxer = gethapaxer(self.settings)

        self.cache = getcache(self.settings, 1000)

        # Checked separately so that a wrong extension is reported as such
        # rather than as a missing model.
        if not self.models:
            raise Exception("Expected one model, got 0 or more")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")
Example #9
0
File: lm.py Project: wollmers/gecco
    def verifysettings(self):
        """Apply default settings and check that a model is configured.

        Missing entries in ``self.settings`` are filled in with defaults.
        ``self.maxcontext`` is the larger of the left and right context sizes.
        ``self.freqthreshold`` and ``self.threshold`` take the configured
        value, or 25 and 0.9 respectively when none is configured. The hapaxer
        and cache are created through ``gethapaxer`` and ``getcache``.

        Raises:
            Exception: when no model is configured.
        """
        # The class default has to be in place before the parent runs.
        self.settings.setdefault('class', 'contexterror')

        super().verifysettings()

        self.settings.setdefault('leftcontext', 3)
        self.settings.setdefault('rightcontext', 3)

        self.maxcontext = max(self.settings['leftcontext'], self.settings['rightcontext'])

        # The frequency threshold gets its own attribute so that it cannot
        # be confused with (or clobbered by) the probability threshold below.
        self.freqthreshold = self.settings.get('freqthreshold', 25)

        # Use the configured value when there is one, else fall back to 0.9.
        self.threshold = self.settings.get('threshold', 0.9)

        self.settings.setdefault('maxdistance', 2)

        self.debug = bool(self.settings.get('debug', False))

        self.hapaxer = gethapaxer(self.settings)

        self.cache = getcache(self.settings, 1000)

        if not self.models:
            raise Exception("Expected one model, got 0 or more")
Example #10
0
    def verifysettings(self):
        """Apply default settings and validate the configured model.

        Missing entries in ``self.settings`` are filled in with defaults, the
        hapaxer is created through ``gethapaxer``, and the mandatory
        ``confusibles`` setting is copied to ``self.confusibles``.

        Raises:
            Exception: when ``confusibles`` is not configured, when no model
                is configured, or when the first model does not end in
                ``.ibase``.
        """
        # The class default has to be in place before the parent runs.
        self.settings.setdefault('class', 'confusion')

        super().verifysettings()

        self.settings.setdefault('algorithm', 1)
        self.settings.setdefault('leftcontext', 3)
        self.settings.setdefault('rightcontext', 3)
        self.settings.setdefault('threshold', 0.8)
        self.settings.setdefault('minocc', 5)

        self.hapaxer = gethapaxer(self, self.settings)

        if 'confusibles' not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings['confusibles']

        self.debug = bool(self.settings.get('debug', False))

        # Checked separately so that a wrong extension is reported as such
        # rather than as a missing model.
        if not self.models:
            raise Exception("Expected one model, got 0 or more")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception(
                "TIMBL models must have the extension ibase, got " +
                modelfile + " instead")
Example #11
0
    def verifysettings(self):
        """Apply default settings and validate the configured model.

        Missing entries in ``self.settings`` are filled in with defaults and
        the hapaxer is created through ``gethapaxer``.

        Raises:
            Exception: when no model is configured, or when the first model
                does not end in ``.ibase``.
        """
        self.settings.setdefault('class', 'missingpunctuation')  #will be overridden later again

        super().verifysettings()

        self.settings.setdefault('algorithm', 1)
        self.settings.setdefault('leftcontext', 3)
        self.settings.setdefault('rightcontext', 2)
        self.settings.setdefault('deletionthreshold', 0.95)
        self.settings.setdefault('insertionthreshold', 0.5)
        self.settings.setdefault('capitalizationthreshold', 0.5)

        self.debug = bool(self.settings.get('debug', False))

        self.hapaxer = gethapaxer(self, self.settings)

        # Checked separately so that a wrong extension is reported as such
        # rather than as a missing model.
        if not self.models:
            raise Exception("Expected one model, got 0 or more")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception(
                "TIMBL models must have the extension ibase, got " +
                modelfile + " instead")
Example #12
0
    def verifysettings(self):
        """Apply default settings and check that a model is configured.

        Missing entries in ``self.settings`` are filled in with defaults.
        ``self.maxcontext`` is the larger of the left and right context sizes.
        ``self.freqthreshold`` and ``self.threshold`` take the configured
        value, or 25 and 0.9 respectively when none is configured. The hapaxer
        and cache are created through ``gethapaxer`` and ``getcache``.

        Raises:
            Exception: when no model is configured.
        """
        # The class default has to be in place before the parent runs.
        self.settings.setdefault('class', 'confusion')

        super().verifysettings()

        self.settings.setdefault('leftcontext', 3)
        self.settings.setdefault('rightcontext', 3)

        self.maxcontext = max(self.settings['leftcontext'], self.settings['rightcontext'])

        # Always assign the attribute, so that it also exists when the
        # setting was supplied explicitly.
        self.freqthreshold = self.settings.get('freqthreshold', 25)

        # Use the configured value when there is one, else fall back to 0.9.
        self.threshold = self.settings.get('threshold', 0.9)

        self.settings.setdefault('maxdistance', 2)

        self.debug = bool(self.settings.get('debug', False))

        self.hapaxer = gethapaxer(self, self.settings)

        self.cache = getcache(self.settings, 1000)

        if not self.models:
            raise Exception("Expected one model, got 0 or more")
Example #13
0
    def verifysettings(self):
        """Fill in setting defaults and check the configured model.

        Absent keys in ``self.settings`` receive their defaults,
        ``gethapaxer`` supplies ``self.hapaxer``, and the required
        ``confusibles`` setting is stored as ``self.confusibles``.

        Raises:
            Exception: if ``confusibles`` is missing, if ``self.models`` is
                empty, or if its first entry lacks the ``.ibase`` extension.
        """
        # Must be set before the parent's verification runs.
        self.settings.setdefault('class', 'confusion')

        super().verifysettings()

        for key, default in (
                ('algorithm', 1),
                ('leftcontext', 3),
                ('rightcontext', 3),
                ('threshold', 0.8),
                ('minocc', 5),
        ):
            self.settings.setdefault(key, default)

        self.hapaxer = gethapaxer(self, self.settings)

        if 'confusibles' not in self.settings:
            raise Exception("No confusibles specified for " + self.id + "!")
        self.confusibles = self.settings['confusibles']

        self.debug = bool(self.settings.get('debug', False))

        # A missing model and a wrongly named model are distinct failures and
        # are reported with distinct messages.
        if not self.models:
            raise Exception("Expected one model, got 0 or more")
        modelfile = self.models[0]
        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")