Ejemplo n.º 1
0
    def train(self, sourcefile, modelfile, **parameters):
        """Train a unigram+bigram frequency model and write it to modelfile.

        Intermediate files are derived from ``sourcefile`` with its source
        extensions stripped: a ``.cls`` class file and a ``.dat`` encoded
        corpus. Each is only generated when it does not exist yet. A symlink
        ``modelfile + '.cls'`` pointing at the class file is created when
        nothing exists under that name.

        Args:
            sourcefile: Path of the plain-text training corpus.
            modelfile: Path the trained pattern model is written to.
            **parameters: Accepted for interface compatibility; not used here.
        """
        self.log("Preparing to generate bigram model")
        stem = stripsourceextensions(sourcefile)
        classfile = stem + ".cls"
        corpusfile = stem + ".dat"

        if os.path.exists(classfile):
            # Reuse the class file from an earlier run.
            classencoder = colibricore.ClassEncoder(classfile)
        else:
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)

        # Expose the class file under the model's name as well.
        classlink = modelfile + '.cls'
        if not os.path.exists(classlink):
            os.symlink(classfile, classlink)

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating bigram frequency list")
        # minlength=1, maxlength=2: unigrams and bigrams
        options = colibricore.PatternModelOptions(
            mintokens=self.settings['freqthreshold'],
            minlength=1,
            maxlength=2,
        )
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Saving model")
        model.write(modelfile)
Ejemplo n.º 2
0
    def load(self):
        """Load the Timbl classifier and, if given, the colibri lexicon.

        ``self.models`` holds either one entry (the Timbl model file) or two
        entries (Timbl model file and lexicon model file). The hapaxer, when
        set, is loaded first.

        Raises:
            Exception: If ``self.models`` is empty.
            IOError: If the model file or the lexicon file does not exist.
        """
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        self.log("Loading models...")
        if len(self.models) == 2:
            modelfile, lexiconfile = self.models
        else:
            modelfile, lexiconfile = self.models[0], None

        # Verify the files are present before loading anything.
        if not os.path.exists(modelfile):
            raise IOError(
                "Missing expected timbl model file: " + modelfile + ". Did you forget to train the system?"
            )
        if lexiconfile and not os.path.exists(lexiconfile):
            raise IOError(
                "Missing expected lexicon model file: " + lexiconfile + ". Did you forget to train the system?"
            )

        self.log("Loading model file " + modelfile + "...")
        # The classifier takes the file prefix, i.e. the name without ".ibase".
        self.classifier = TimblClassifier(
            modelfile.replace(".ibase", ""),
            self.gettimbloptions(),
            threading=True,
            debug=self.debug,
        )
        self.classifier.load()

        if not lexiconfile:
            self.lexicon = None
            return

        self.log("Loading colibri model file for lexicon " + lexiconfile)
        self.classencoder = colibricore.ClassEncoder(lexiconfile + '.cls')
        self.lexicon = colibricore.UnindexedPatternModel(lexiconfile)
Ejemplo n.º 3
0
    def test001_alignmodel(self):
        """Checking alignment model"""
        options = colibricore.PatternModelOptions(mintokens=1,
                                                  doreverseindex=False)

        source_classes = "test-en-nl/test-en-train.colibri.cls"
        target_classes = "test-en-nl/test-nl-train.colibri.cls"
        s = colibricore.ClassEncoder(source_classes)
        t = colibricore.ClassEncoder(target_classes)
        sdec = colibricore.ClassDecoder(source_classes)
        tdec = colibricore.ClassDecoder(target_classes)

        print("Loading alignment model", file=sys.stderr)
        model = AlignmentModel()
        model.load("test-en-nl/test-en-nl.colibri.alignmodel", options)
        print("Loaded", file=sys.stderr)
        model.output(sdec, tdec)
        print("Testing contents", file=sys.stderr)

        # (source phrase, target phrase) pairs that must be in the model,
        # checked in this order.
        expected_pairs = (
            ('a', 'een'),
            ('just', 'maar'),
            ('only', 'maar'),
            ('bank', 'oever'),
            ('bank', 'bank'),
            ('bank', 'sturen'),
            ('couch', 'bank'),
            ('the bank', 'de oever'),
            ('the bank', 'de bank'),
            ('the couch', 'de bank'),
            ('I see', 'Ik zie'),
            ('He', 'Hij'),
            ('sits', 'zit'),
            ('on', 'on'),
            ('today', 'vandaag'),
        )
        for source_text, target_text in expected_pairs:
            self.assertTrue((s.buildpattern(source_text),
                             t.buildpattern(target_text)) in model)

        self.assertEqual(len(list(model.triples())), 15)
Ejemplo n.º 4
0
    def load(self):
        """Load the single colibri pattern model named in self.models.

        Sets ``self.classencoder`` and ``self.classdecoder`` from
        ``<modelfile>.cls`` and ``self.patternmodel`` from the model file.

        Raises:
            Exception: If ``self.models`` does not hold exactly one entry.
            IOError: If the model file does not exist.
        """
        if len(self.models) != 1:
            raise Exception("Specify one and only one model to load!")

        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file:" + modelfile)

        self.log("Loading colibri model file " + modelfile)
        # Encoder and decoder are both read from the same class file.
        classfile = modelfile + '.cls'
        self.classencoder = colibricore.ClassEncoder(classfile)
        self.classdecoder = colibricore.ClassDecoder(classfile)
        self.patternmodel = colibricore.UnindexedPatternModel(modelfile)
Ejemplo n.º 5
0
    def train(self, sourcefile, modelfile, **parameters):
        """Train a unigram frequency list (lexicon) from sourcefile.

        Intermediate files are derived from ``sourcefile`` with its source
        extensions stripped: a ``.cls`` class file and a ``.dat`` encoded
        corpus. Each is only generated when it does not exist yet. A symlink
        ``modelfile + '.cls'`` pointing at the class file is created when
        nothing exists under that name. The trained model is handed to
        ``self.savemodel``.

        Args:
            sourcefile: Path of the plain-text training corpus.
            modelfile: Path of the model to produce.
            **parameters: Accepted for interface compatibility; not used here.
        """
        self.log("Preparing to generate lexicon")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder(
                "", self.settings['minlength'],
                self.settings['maxlength'])  #character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile,
                                                    self.settings['minlength'],
                                                    self.settings['maxlength'])

        # Make a symlink to the class file, using the model name instead of
        # the source name. (This check used to be repeated a second time
        # further down; once is enough.)
        # NOTE(review): classfile is passed to os.symlink as-is; if it is a
        # relative path and modelfile lives in another directory the link may
        # dangle -- confirm with callers.
        if not os.path.exists(modelfile + '.cls'):
            os.symlink(classfile, modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(
            mintokens=self.settings['freqthreshold'], minlength=1,
            maxlength=1)  #unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        # Saving lives in a separate method so subclasses can override it.
        self.savemodel(model, modelfile, classfile)
Ejemplo n.º 6
0
    def train(self):
        """Train the unigram lexicon from self.sourcefile, if still needed.

        Does nothing when ``self.sourcefile`` is not set or when
        ``self.modelfile`` already exists. Otherwise the class file (``.cls``)
        and encoded corpus (``.dat``) next to the source file are created when
        missing, a ``<modelfile>.cls`` symlink to the class file is made when
        nothing exists under that name, and a unigram model with
        ``mintokens=self.threshold`` is trained and written to
        ``self.modelfile``.
        """
        if not self.sourcefile or os.path.exists(self.modelfile):
            return

        classfile = stripsourceextensions(self.sourcefile) + ".cls"
        corpusfile = stripsourceextensions(self.sourcefile) + ".dat"

        if not os.path.exists(classfile):
            # The first positional argument of ClassEncoder is the class file
            # name; pass an empty one so that minlength/maxlength end up in
            # the right slots (previously minlength was passed as filename).
            self.classencoder = colibricore.ClassEncoder("", self.minlength, self.maxlength)
            self.classencoder.build(self.sourcefile)
            self.classencoder.save(classfile)
        else:
            self.classencoder = colibricore.ClassEncoder(classfile, self.minlength, self.maxlength)

        if not os.path.exists(self.modelfile + '.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, self.modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.classencoder.encodefile(self.sourcefile, corpusfile)

        # minlength=1, maxlength=1: unigrams only
        options = colibricore.PatternModelOptions(mintokens=self.threshold, minlength=1, maxlength=1)
        self.lexicon = colibricore.UnindexedPatternModel()
        self.lexicon.train(corpusfile, options)
        self.lexicon.write(self.modelfile)
Ejemplo n.º 7
0
def buildpatternmodel(testfiles):
    """Build an unindexed pattern model over the text of all test files.

    The text of every file (obtained through ``loadtext``) is written, one
    entry per line, to ``inputmodel.txt`` in the current directory. A class
    encoder is built from that file and saved as ``inputmodel.colibri.cls``,
    the corpus is encoded to ``inputmodel.colibri.dat`` and a pattern model
    (mintokens=1, maxlength=3) is trained on it.

    Args:
        testfiles: Iterable of test files passed to ``loadtext``.

    Returns:
        A ``(patternmodel, classencoder)`` tuple.
    """
    textpath = 'inputmodel.txt'
    classpath = 'inputmodel.colibri.cls'
    datapath = 'inputmodel.colibri.dat'

    print("Loading test data...", file=sys.stderr)
    with open(textpath, 'w', encoding='utf-8') as out:
        out.writelines(loadtext(testfile) + "\n" for testfile in testfiles)

    print("Building pattern model...", file=sys.stderr)
    classencoder = colibricore.ClassEncoder()
    classencoder.build(textpath)
    classencoder.save(classpath)
    classencoder.encodefile(textpath, datapath)

    options = colibricore.PatternModelOptions(mintokens=1, maxlength=3)
    patternmodel = colibricore.UnindexedPatternModel()
    patternmodel.train(datapath, options)

    return patternmodel, classencoder
Ejemplo n.º 8
0
    def load(self):
        """Load the class encoder/decoder and the uni-, bi- and trigram models.

        Only the first entry of ``self.models`` is used. The bigram model is
        read from that file itself, the unigram model from ``<modelfile>.1``
        and the trigram model from ``<modelfile>.3``; the class file is
        ``<modelfile>.cls``.

        Raises:
            Exception: If ``self.models`` is empty.
            IOError: If the model file does not exist.
        """
        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile +
                          ". Did you forget to train the system?")

        self.log("Loading class encoder/decoder for " + modelfile + " ...")
        classfile = modelfile + '.cls'
        self.classencoder = colibricore.ClassEncoder(classfile)
        self.classdecoder = colibricore.ClassDecoder(classfile)

        self.log("Loading model files " + modelfile + ", " + modelfile +
                 ".1  and " + modelfile + ".3 ...")
        unigramfile = modelfile + '.1'
        trigramfile = modelfile + '.3'
        self.unigram_model = colibricore.UnindexedPatternModel(unigramfile)
        self.bigram_model = colibricore.UnindexedPatternModel(modelfile)
        self.trigram_model = colibricore.UnindexedPatternModel(trigramfile)
Ejemplo n.º 9
0
 def load(self):
     """Load the hapaxer's class encoder and lexicon from self.modelfile.

     Raises:
         IOError: If ``self.modelfile`` does not exist.
     """
     modelfile = self.modelfile
     if not os.path.exists(modelfile):
         raise IOError("Missing expected model file for hapaxer:" + modelfile)

     # Only the encoder is loaded; the matching decoder call is disabled:
     #self.classdecoder = colibricore.ClassDecoder(self.modelfile + '.cls')
     self.classencoder = colibricore.ClassEncoder(modelfile + '.cls')
     self.lexicon = colibricore.UnindexedPatternModel(modelfile)
Ejemplo n.º 10
0
    def train(self, sourcefile, modelfile, **parameters):
        """Train one of the two models of the suffix confusible module.

        Which model is trained is selected by ``modelfile``:

        * ``modelfile == self.confusiblefile``: build a unigram frequency list
          from ``sourcefile``, find the words that end in one of
          ``self.suffixes`` and whose variants with every other suffix also
          occur in the frequency list, store them in ``self.confusibles`` and
          write them, one per line, to ``modelfile``.
        * ``modelfile == self.modelfile``: generate training instances for all
          occurrences of the confusibles in ``sourcefile`` (plain, ``.gz`` or
          ``.bz2``), then train and save the Timbl classifier.

        Any other ``modelfile`` is ignored.

        Args:
            sourcefile: Path of the plain-text training corpus.
            modelfile: Path of the model to produce (see above).
            **parameters: Accepted for interface compatibility; not used here.
        """
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log(
                "Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) + ".cls"
            corpusfile = stripsourceextensions(sourcefile) + ".dat"

            if not os.path.exists(classfile):

                self.log("Building class file")
                classencoder = colibricore.ClassEncoder(
                    "", self.settings['minlength'],
                    self.settings['maxlength'])  #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(
                    classfile, self.settings['minlength'],
                    self.settings['maxlength'])

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=self.settings['freqthreshold'],
                minlength=1,
                maxlength=1)  #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = []  #pylint: disable=attribute-defined-outside-init
            # Mirrors the contents of self.confusibles for O(1) membership
            # tests; the list itself keeps its order (and duplicates).
            seen = set()
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log(
                        "WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!"
                    )
                    # Skip this pattern: without this, pattern_s would be
                    # undefined or still hold the previous pattern's text.
                    continue
                for suffix in self.suffixes:
                    if pattern_s.endswith(suffix) and pattern_s not in seen:
                        # The variants with all other suffixes must qualify;
                        # a single failure discards the whole group.
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(
                                    suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(
                                        otherpattern_s, False, False)
                                except KeyError:
                                    found = []
                                    break
                                if otherpattern not in model:
                                    found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (
                                        model.occurrencecount(pattern),
                                        model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    # NOTE(review): groups are discarded when
                                    # the ratio is *below* 'maxratio' --
                                    # confirm this direction is intended.
                                    if ratio < self.settings['maxratio']:
                                        found = []
                                        break
                                found.append(otherpattern_s)
                        if found:
                            self.confusibles.append(pattern_s)
                            seen.add(pattern_s)
                            for s in found:
                                self.confusibles.append(s)
                                seen.add(s)

            self.log("Writing confusible list")
            with open(modelfile, 'w', encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")

        elif modelfile == self.modelfile:
            if not hasattr(self, 'confusibles'):
                # Not computed in this process: read the list from file.
                self.confusibles = []
                self.log("Loading confusiblefile")
                with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            self.confusibles.append(line)

            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            # Set copy for the per-n-gram membership test below.
            confusibleset = set(self.confusibles)

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase",
                                           "")  #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,
                               mode='rt',
                               encoding='utf-8',
                               errors='ignore') as f:
                for i, line in enumerate(f):
                    # Progress report, once per 100000 lines (it used to be
                    # repeated for every n-gram of such a line).
                    if i % 100000 == 0:
                        print(datetime.datetime.now().strftime(
                            "%Y-%m-%d %H:%M:%S") + " - " + str(i),
                              file=sys.stderr)
                    for ngram in Windower(line, n):
                        # The focus word sits right after the left context.
                        confusible = ngram[l]
                        if confusible in confusibleset:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l + 1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append(
                                    leftcontext + (normalized, ) +
                                    rightcontext, suffix)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
Ejemplo n.º 11
0
def main():
    """Command-line entry point: extract skipgrams from an alignment model.

    Parses the command-line options, optionally loads source/target pattern
    models used as constraints, loads the alignment model (a colibri
    alignment model when ``<inputfile>.colibri.alignmodel-keys`` exists,
    otherwise a Moses phrase table), runs ``extractskipgrams`` on it and saves
    the result.
    """
    parser = argparse.ArgumentParser(
        description="Extract skipgrams from a Moses phrasetable",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument
    add('-t', '--minskiptypes', type=int, action='store', default=2,
        required=False, help="Minimal skip types")
    add('-i', '--inputfile', type=str, action='store', required=True,
        help="Input alignment model (file prefix without .colibri.alignmodel-* extension) or moses phrasetable ")
    add('-o', '--outputfile', type=str, action='store', default="",
        required=False,
        help="Output alignment model (file prefix without .colibri.alignmodel-* extension). Same as input if not specified!")
    add('-l', '--maxlength', type=int, action='store', default=8,
        required=False, help="Maximum length")
    add('-W', '--tmpdir', type=str, action='store', default="./",
        required=False, help="Temporary work directory")
    add('-S', '--sourceclassfile', type=str, action='store', required=True,
        help="Source class file")
    add('-T', '--targetclassfile', type=str, action='store', required=True,
        help="Target class file")
    add('-s', '--constrainskipgrams', action='store_true', required=False,
        help="Strictly constrain skipgrams: only skipgrams present in the constrain models (-m and -M) will be considered")
    add('-m', '--constrainsourcemodel', type=str, action='store',
        required=False,
        help="Source patternmodel, used to constrain possible patterns")
    add('-M', '--constraintargetmodel', type=str, action='store',
        required=False,
        help="Target patternmodel, used to constrain possible patterns")
    add('-p', '--pts', type=float, action='store', default=0.75,
        required=False,
        help="Minimum probability p(t|s) for skipgram consideration (set to a high number)")
    add('-P', '--pst', type=float, action='store', default=0.75,
        required=False,
        help="Minimum probability p(s|t) for skipgram consideration (set to a high number)")
    add('-D', '--debug', action='store_true', required=False,
        help="Enable debug mode")
    args = parser.parse_args()

    # Constraint models: indexed when skipgrams are strictly constrained,
    # unindexed otherwise.
    if not args.constrainsourcemodel:
        constrainsourcemodel = None
    else:
        print("Loading source model for constraints", file=sys.stderr)
        modelclass = (colibricore.IndexedPatternModel
                      if args.constrainskipgrams else
                      colibricore.UnindexedPatternModel)
        constrainsourcemodel = modelclass(args.constrainsourcemodel)

    if not args.constraintargetmodel:
        constraintargetmodel = None
    else:
        print("Loading target model for constraints", file=sys.stderr)
        modelclass = (colibricore.IndexedPatternModel
                      if args.constrainskipgrams else
                      colibricore.UnindexedPatternModel)
        constraintargetmodel = modelclass(args.constraintargetmodel)

    alignmodel = FeaturedAlignmentModel()
    if not os.path.exists(args.inputfile + '.colibri.alignmodel-keys'):
        print("Loading class encoders", file=sys.stderr)
        sourceencoder = colibricore.ClassEncoder(args.sourceclassfile)
        targetencoder = colibricore.ClassEncoder(args.targetclassfile)
        print("Loading moses phrase table", file=sys.stderr)
        alignmodel.loadmosesphrasetable(args.inputfile, sourceencoder,
                                        targetencoder)
    else:
        print("Loading colibri alignment model", file=sys.stderr)
        alignmodel.load(args.inputfile)

    # In debug mode the decoders are handed over in place of a plain flag.
    debug = False
    if args.debug:
        debug = (colibricore.ClassDecoder(args.sourceclassfile),
                 colibricore.ClassDecoder(args.targetclassfile))

    def scorefilter(features):
        # features[0] is compared against --pst, features[2] against --pts
        return features[0] >= args.pst and features[2] >= args.pts

    extractskipgrams(alignmodel, args.maxlength, args.minskiptypes,
                     args.tmpdir, constrainsourcemodel, constraintargetmodel,
                     args.constrainskipgrams, scorefilter, False, debug)

    outfile = args.outputfile
    if not outfile:
        # Derive the output name from the input file name by stripping these
        # extensions, in this order.
        outfile = os.path.basename(args.inputfile)
        for extension in ('.gz', '.bz2', '.phrasetable', '.phrase-table'):
            if outfile.endswith(extension):
                outfile = outfile[:-len(extension)]
    print("Saving alignment model to " + outfile, file=sys.stderr)
    alignmodel.save(outfile)  #extensions will be added automatically
Ejemplo n.º 12
0
            sys.exit(2)


try:
    import colibricore
except ImportError:
    print("Run setup.py install first!", file=sys.stderr)
    raise

# Write a small tab-separated fixture (one "<number>\t<token>" entry per
# line); it is loaded below as class file by both the decoder and the encoder.
with open("/tmp/colibritest", 'w') as f:
    f.write("5\tbe\n6\tTo\n7\tto\n8\tor\n9\tnot\n73477272\tblah\n")

print("Loading class decoder...")
decoder = colibricore.ClassDecoder("/tmp/colibritest")
print("Loading class encoder...")
encoder = colibricore.ClassEncoder("/tmp/colibritest")

# Encode a six-token sentence using only tokens present in the fixture.
print("Building pattern...")
ngram = encoder.buildpattern("To be or not to be")

# test() is defined elsewhere in this file; presumably it checks the first
# argument (against the second one when given) -- confirm against its
# definition.
print("Ngram: ", test(ngram.tostring(decoder), "To be or not to be"))
print("Size: ", test(len(ngram), 6))
print("Bytesize: ", test(ngram.bytesize(), 6))
print("Category==NGRAM", test(ngram.category() == colibricore.Category.NGRAM))
print("Hash: ", test(hash(ngram)))
print("Raw bytes: ", repr(bytes(ngram)))

# Indexing a pattern yields a pattern again (tostring() is called on it);
# negative indices are exercised as well.
print("Third token ", test(ngram[2].tostring(decoder), "or"))
print("Last token ", test(ngram[-1].tostring(decoder), "be"))

# Slicing yields a sub-pattern.
print("Slicing ngram[2:4]", test(ngram[2:4].tostring(decoder), "or not"))
Ejemplo n.º 13
0
def main():
    """Benchmark driver: run one n-gram counting test, or a range of them.

    Command-line arguments (read from ``sys.argv``):

    1. The test number, or a range ``begin-end``. A leading ``x`` skips the
       pretests (building/saving the class encoder and encoding the corpus).
    2. The text file to use as a basis.
    3. Optional working directory for the generated files (default
       ``/tmp/``).

    For a range, every test is run in a separate ``python3`` process (with
    the pretests skipped); otherwise the single requested test is run in this
    process. Exits with status 2 on missing or invalid arguments or when the
    text file does not exist.
    """
    dopretests = True
    try:
        tests = sys.argv[1]
        if tests[0] == 'x':
            dopretests = False
            tests = tests[1:]
        if '-' in tests:
            begintest = int(tests.split('-')[0])
            endtest = int(tests.split('-')[1])
        else:
            begintest = endtest = int(tests)
    except (IndexError, ValueError):
        # IndexError: argument missing or empty; ValueError: not a number.
        print(
            "Specify the test number to run (or a range begin-end; prefix with 'x' to skip the pretests) as first argument",
            file=sys.stderr)
        sys.exit(2)
    try:
        textfile = sys.argv[2]
    except IndexError:
        print(
            "Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis",
            file=sys.stderr)
        sys.exit(2)

    try:
        tmpdir = sys.argv[3]
    except IndexError:
        tmpdir = "/tmp/"

    classfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.cls'
    datafile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.dat'
    modelfile = tmpdir + "/" + os.path.basename(
        textfile) + '.colibri.patternmodel'

    if not os.path.exists(textfile):
        print("File does not exist", file=sys.stderr)
        sys.exit(2)

    if dopretests:
        linecount = 0
        print("PRETEST #1 - Reading text file (Python)")
        b = begin()
        with open(textfile, 'r', encoding='utf-8') as f:
            for line in f:
                linecount += 1
        end(b)
        print("\t(Read " + str(linecount) + " lines)")

        print("PRETEST #2 - Building class encoder")
        encoder = colibricore.ClassEncoder()
        b = begin()
        encoder.build(textfile)
        end(b)

        print("PRETEST #3 - Saving class encoder")
        b = begin()
        encoder.save(classfile)
        end(b)

        print("PRETEST #4 - Class encoding corpus")
        b = begin()
        encoder.encodefile(textfile, datafile)
        end(b)

        print("PRETEST #5 - Unloading encoder")
        b = begin()
        del encoder
        gc.collect()
        end(b)

    if begintest < endtest:
        print("Running tests ", begintest, " to ", endtest)
        # Each test runs in a fresh interpreter; the 'x' prefix skips the
        # pretests that were already done above.
        # NOTE(review): the upper bound is capped so that tests 10 and 11 are
        # never part of a range -- confirm this is intended.
        for testnum in range(begintest, min(endtest + 1, 10)):
            os.system("python3 " + sys.argv[0] + " x" + str(testnum) + " " +
                      textfile + " " + tmpdir)

    else:
        testnum = begintest
        print("-------------------- " + colorf('bold', 'TEST') + " #" +
              str(testnum) + " ----------------------")
        # One single if/elif chain, so that "No such test" is reported only
        # for unknown test numbers.
        if testnum == 1:

            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively (Python defaultdict + Pynlpl MultiWindower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")

        elif testnum == 2:
            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively with NLTK (nltk.FreqDist + nltk.util.ngrams)"
            )

            # Imported here so that NLTK is only needed for this test.
            from nltk.probability import FreqDist
            from nltk.util import ngrams

            fd = FreqDist()
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split(' ')
                    for n in range(1, 9):
                        for ngram in ngrams(tokens, n):
                            fd[ngram] += 1
            end(b)
            print("\t(Done)")

        elif testnum == 3:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model

        elif testnum == 4:
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, with look-back)  (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            for n in range(1, 9):
                with open(textfile, 'r', encoding='utf-8') as f:
                    for line in f:
                        for ngram in Windower(line, n):
                            # Look-back: only count an n-gram when all its
                            # (n-1)-grams have been counted already.
                            docount = True
                            if n > 1:
                                for subngram in Windower(ngram, n - 1):
                                    if subngram not in ngrams:
                                        docount = False
                                        break
                            if docount:
                                ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")

        elif testnum == 5:
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, without look-back)  (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            # Apply the threshold afterwards; iterate over a copy of the keys
            # because entries are deleted while looping.
            for ngram in list(ngrams.keys()):
                if ngrams[ngram] < 2: del ngrams[ngram]
            gc.collect()
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")

        elif testnum == 6:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=2) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 7:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with UnindexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.UnindexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 8:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

            del model

        elif testnum == 9:
            print(
                "Extracting and counting n-grams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 10:

            print(
                "Extracting and counting n-grams and skipgrams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2,
                                                      maxlength=8,
                                                      doskipgrams=True)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 11:
            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with OrderedUnindexedPatternModel"
            )
            model = colibricore.OrderedUnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model

        else:
            print("No such test", file=sys.stderr)
        print()
Ejemplo n.º 14
0
#!/usr/bin/env python3

from __future__ import print_function, unicode_literals, division, absolute_import
import colibricore

from colibrimt.alignmentmodel import FeaturedAlignmentModel

# Start from two empty class encoders, one per language side.
sourceencoder = colibricore.ClassEncoder()
targetencoder = colibricore.ClassEncoder()

# Build the source (Dutch) and target (English) patterns used below.
# NOTE(review): the positional False, True presumably mean
# allowunknown=False, autoaddunknown=True, so that unseen words are added
# to the encoder on the fly -- confirm against the colibricore API.
s1 = sourceencoder.buildpattern("het grote huis", False, True)
s2 = sourceencoder.buildpattern("het paleis", False, True)
t1 = targetencoder.buildpattern("the big house", False, True)
t2 = targetencoder.buildpattern("the grand house", False, True)
t3 = targetencoder.buildpattern("the palace", False, True)

# Write both encoders to disk and load decoders from those same class files,
# so the patterns can be turned back into text later on.
sourceencoder.save('/tmp/s.cls')
targetencoder.save('/tmp/t.cls')
sd = colibricore.ClassDecoder('/tmp/s.cls')
td = colibricore.ClassDecoder('/tmp/t.cls')

# Register four source/target pairs, all with the same score vector, then
# normalise the model.
# NOTE(review): the meaning of the 's-t-' argument is not visible here --
# confirm against FeaturedAlignmentModel.normalize.
model = FeaturedAlignmentModel()
model.add(s1, t1, [1, 0, 1, 0])
model.add(s1, t2, [1, 0, 1, 0])
model.add(s2, t2, [1, 0, 1, 0])
model.add(s2, t3, [1, 0, 1, 0])
model.normalize('s-t-')

for source, target, scores in model:
    print(
        source.tostring(sd) + "\t" + target.tostring(td) + "\t" +
Ejemplo n.º 15
0
    # Normalise the "text" field of every JSON line in every input file and
    # write it, one text per line, to g (opened outside the visible code).
    for i, infile in enumerate(infiles):
        with open(infile, encoding="utf-8") as f:
            for l in f.readlines():
                # Each input line is parsed as a JSON document of its own.
                js = json.loads(l)
                text = js["text"].lower()
                #text = ''.join(ch for ch in text if ch not in exclude)
                # Put a space before commas, periods and colons.
                text = text.replace(',', ' ,')
                text = text.replace('.', ' .')
                text = text.replace(':', ' :')
                # Drop parentheses and double quotes altogether.
                text = text.replace('(', '')
                text = text.replace(')', '')
                text = text.replace('"', '')
                g.write(text.strip() + "\n")

# Build a class encoder from the text file and save it to classfile.
print("Building class encoder", file=sys.stderr)
classencoder = colibricore.ClassEncoder()
classencoder.build(textfile)
classencoder.save(classfile)

# Encode the text file into corpusfile using the freshly built classes.
print("Encoding corpus data", file=sys.stderr)
classencoder.encodefile(textfile, corpusfile)

# The decoder is loaded from the class file that was just saved.
print("Loading class decoder", file=sys.stderr)
classdecoder = colibricore.ClassDecoder(classfile)

# Empty unindexed pattern model; it is filled by the code that follows.
anchormodel = colibricore.UnindexedPatternModel()
print("Counting anchors", file=sys.stderr)

for i, infile in enumerate(infiles):
    with open(infile, encoding="utf-8") as f:
        for l in f.readlines():
Ejemplo n.º 16
0
    def train(self, sourcefile, modelfile, **parameters):
        """Train the model stored in ``modelfile`` from the corpus ``sourcefile``.

        The kind of model is selected by the extension of ``modelfile``:

        * ``.ibase``: a Timbl classifier is trained on windowed instances; the
          features are the left and right context words and the class is the
          focus word.
        * ``.patternmodel``: a colibri-core unigram pattern model is generated
          (the lexicon for the language model).

        Any other extension is silently ignored. If a hapaxer is configured it
        is trained first, regardless of the model type.

        Args:
            sourcefile: Path to the training corpus. ``.bz2`` and ``.gz`` files
                are opened with the matching module when reading instances.
            modelfile: Path of the model to produce.
            **parameters: Accepted for interface compatibility; not used here.
        """
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            leftsize = self.settings['leftcontext']
            rightsize = self.settings['rightcontext']
            # Window size: left context + focus word + right context.
            n = leftsize + 1 + rightsize

            self.log("Generating training instances...")
            # Strip only the trailing extension (its presence was just checked).
            # str.replace() would also remove ".ibase" anywhere else in the
            # path, e.g. in a directory name.
            fileprefix = modelfile[:-len('.ibase')]
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile, mode='rt', encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0:
                        # Periodic progress report on stderr.
                        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[leftsize]
                        # Never train on instances whose focus word was
                        # replaced by the hapax placeholder.
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:leftsize])
                        rightcontext = tuple(ngram[leftsize + 1:])
                        classifier.append(leftcontext + rightcontext, focus)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) + ".cls"
            corpusfile = stripsourceextensions(sourcefile) + ".dat"

            # Reuse an existing class file when there is one.
            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile + '.cls'):
                # Make symlink to class file, using model name instead of
                # source name.
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating pattern model")
            # minlength == maxlength == 1: unigrams only.
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)
Ejemplo n.º 17
0
    def train(self, sourcefile, modelfile, **parameters):
        """Generate the n-gram frequency list ``modelfile`` from ``sourcefile``.

        The corpus is class-encoded with newlines ignored; an existing class
        file or encoded corpus is reused. The suffix of ``modelfile`` selects
        the model:

        * ``.1``: unigrams, thresholded on ``recasethreshold``.
        * ``.3``: trigrams with a known punctuation symbol in the middle,
          thresholded on ``deletioncutoff``.
        * anything else: bigrams, thresholded on the lower of
          ``insertioncutoff`` and ``recasethreshold2``.
        """
        basename = stripsourceextensions(sourcefile)
        classfile = basename + ".cls"
        corpusfile = basename + ".nonewlines.dat"

        # Load the class encoder, building and saving it first if needed.
        if os.path.exists(classfile):
            classencoder = colibricore.ClassEncoder(classfile)
        else:
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)

        # The model gets its own class file name, symlinked to the real one.
        modelclassfile = modelfile + '.cls'
        if not os.path.exists(modelclassfile):
            os.symlink(classfile, modelclassfile)

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile, ignorenewlines=True)

        # Work out the n-gram order, the frequency threshold and (for
        # trigrams only) the filter set; training itself is shared below.
        filters = None
        if modelfile.endswith('.1'):
            # Unigram model (for recasing).
            self.log("Generating unigram frequency list")
            threshold = self.settings['recasethreshold']
            order = 1
        elif modelfile.endswith('.3'):
            # Trigram model, restricted to "<any> <punctuation> <any>".
            self.log("Generating filtered trigram frequency list")
            filters = colibricore.PatternSet()
            for symbol in ColibriPuncRecaseModule.PUNCTUATION:
                candidate = classencoder.buildpattern('{*1*} ' + symbol + ' {*1*}')
                if candidate.unknown():
                    continue
                filters.add(candidate)
            self.log("(" + str(len(filters)) + " filters)")
            threshold = self.settings['deletioncutoff']
            order = 3
        else:
            # Bigram model.
            self.log("Generating bigram frequency list")
            threshold = min(self.settings['insertioncutoff'],
                            self.settings['recasethreshold2'])
            order = 2

        options = colibricore.PatternModelOptions(mintokens=threshold,
                                                  minlength=order,
                                                  maxlength=order)
        model = colibricore.UnindexedPatternModel()
        if filters is None:
            model.train(corpusfile, options)
        else:
            model.train_filtered(corpusfile, options, filters)

        self.log("Saving model")
        model.write(modelfile)
Ejemplo n.º 18
0
 def __init__(self):
     """Create an empty class encoder and an empty ``PatternDict_float``."""
     self.classencoder = colibricore.ClassEncoder()
     # presumably maps patterns to float values (going by the type name) -- confirm
     self.dmodel = colibricore.PatternDict_float()
Ejemplo n.º 19
0
    def handle(self, *args, **options):
        """Build all models for a parallel corpus and load the translation pairs into MongoDB.

        Steps, each of which reuses files already present in
        ``options['tmpdir']`` unless ``options['force']`` is set:

        1. Class-encode the source and the target corpus.
        2. Compute an unindexed pattern model for each side.
        3. Run ``colibri-mosesphrasetable2alignmodel`` to turn the Moses phrase
           table into an alignment model.
        4. Load everything, create a ``Collection`` and store a ``Collocation``
           per pattern plus a ``Translation`` in both directions per aligned
           pair.

        Options read: ``tmpdir``, ``sourcecorpus``, ``targetcorpus``, ``force``,
        ``freqthreshold``, ``maxlength``, ``phrasetable``, ``pts``, ``pst``,
        ``joinedthreshold``, ``divergencethreshold``, ``title``, ``sourcelang``
        and ``targetlang``.

        Pairs whose source or target text is ``ignorable`` are not stored.
        """
        sourceclassfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.cls')
        sourcecorpusfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.dat')
        sourcemodelfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.patternmodel')

        if not os.path.exists(sourceclassfile) or not os.path.exists(sourcecorpusfile) or options['force']:
            self.stdout.write("Encoding source corpus ...")
            sourceclassencoder = colibricore.ClassEncoder()
            sourceclassencoder.build(options['sourcecorpus'])
            sourceclassencoder.save(sourceclassfile)
            sourceclassencoder.encodefile(options['sourcecorpus'], sourcecorpusfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write("Reusing previously encoded source corpus ...")

        targetclassfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.cls')
        targetcorpusfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.dat')
        targetmodelfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.patternmodel')

        if not os.path.exists(targetclassfile) or not os.path.exists(targetcorpusfile) or options['force']:
            self.stdout.write("Encoding target corpus ...")
            targetclassencoder = colibricore.ClassEncoder()
            targetclassencoder.build(options['targetcorpus'])
            targetclassencoder.save(targetclassfile)
            targetclassencoder.encodefile(options['targetcorpus'], targetcorpusfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write("Reusing previously encoded target corpus ...")

        modeloptions = colibricore.PatternModelOptions(mintokens=options['freqthreshold'],maxlength=options['maxlength'])

        if not os.path.exists(sourcemodelfile) or options['force']:
            self.stdout.write('Computing pattern model of source corpus ...')
            sourcemodel = colibricore.UnindexedPatternModel()
            sourcemodel.train(sourcecorpusfile, modeloptions)
            sourcemodel.write(sourcemodelfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            sourcemodel = None
            self.stdout.write("Reusing previously computed source model ...")

        if not os.path.exists(targetmodelfile) or options['force']:
            self.stdout.write('Computing pattern model of target corpus ...')
            targetmodel = colibricore.UnindexedPatternModel()
            targetmodel.train(targetcorpusfile, modeloptions)
            targetmodel.write(targetmodelfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            targetmodel = None
            self.stdout.write("Reusing previously computed target model ...")

        alignmodelfile = os.path.join(options['tmpdir'], "alignmodel.colibri")

        # Delete models to conserve memory during the next step; they are
        # reloaded from disk afterwards.
        if sourcemodel is not None:
            del sourcemodel
            self.stdout.write(self.style.SUCCESS('Unloaded source patternmodel'))
        if targetmodel is not None:
            del targetmodel
            self.stdout.write(self.style.SUCCESS('Unloaded target patternmodel'))

        if not os.path.exists(alignmodelfile) or options['force']:
            # NOTE(review): the command is a shell string built from option
            # values, so paths containing spaces or shell metacharacters will
            # break it, and the exit status of os.system() is ignored.
            cmd = "colibri-mosesphrasetable2alignmodel -i " + options['phrasetable'] + " -o " + alignmodelfile + " -S " + sourceclassfile + " -T " + targetclassfile + " -m " + sourcemodelfile + " -M " + targetmodelfile + " -t " + str(options['freqthreshold']) + " -l " + str(options['maxlength']) + " -p " + str(options['pts']) + " -P " + str(options['pst']) + " -j " + str(options['joinedthreshold']) + " -d " + str(options['divergencethreshold'])
            self.stdout.write("Computing alignment model: " + cmd)
            os.system(cmd)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write(self.style.SUCCESS('Reusing previously computed alignment model'))


        self.stdout.write("Loading models")
        sourceclassdecoder = colibricore.ClassDecoder(sourceclassfile)
        targetclassdecoder = colibricore.ClassDecoder(targetclassfile)
        sourcemodel = colibricore.UnindexedPatternModel(sourcemodelfile, modeloptions)
        targetmodel = colibricore.UnindexedPatternModel(targetmodelfile, modeloptions)
        alignmodel = colibricore.PatternAlignmentModel_float(alignmodelfile, modeloptions)
        self.stdout.write(self.style.SUCCESS('DONE'))

        l = len(alignmodel)


        self.stdout.write("Connecting to MongoDB server at " + settings.MONGODB_HOST + ":" + str(settings.MONGODB_PORT) )
        mongoengine.connect("colloquery", host=settings.MONGODB_HOST, port=settings.MONGODB_PORT)

        self.stdout.write("Generating translation pairs (this may take a while)..." )

        # Maps target pattern -> id of the Collocation already stored for it.
        targetcollocations = {}
        prevsourcepattern = None
        # Collocation of the current source pattern; None while the current
        # source pattern is ignorable, so none of its pairs get stored.
        sourcecollocation = None
        collection = Collection(name=options['title'], sourcelanguage=options['sourcelang'], targetlanguage=options['targetlang'])
        collection.save()
        sourcecount = 0

        for i, (sourcepattern, targetpattern, scores) in enumerate(alignmodel.triples()):
            if i % 100 == 0:
                self.stdout.write(str(round(((sourcecount + 1) / l) * 100,1)) + "% -- @" + str(sourcecount + 1) + " of " + str(l) + ": inserted " + str(i+1) + " pairs")

            # A new source collocation is created whenever the source pattern
            # differs from that of the previous triple.
            if prevsourcepattern is None or sourcepattern != prevsourcepattern:
                prevsourcepattern = sourcepattern
                sourcecount += 1

                sourcefreq = sourcemodel[sourcepattern]
                text = sourcepattern.tostring(sourceclassdecoder)
                if ignorable(text):
                    # Remember that this source is skipped. Otherwise the
                    # following triples with the same source would be linked
                    # to the collocation of the previous source pattern.
                    sourcecollocation = None
                    continue
                sourcecollocation = Collocation(collection=collection, language=options['sourcelang'], text=text, freq=sourcefreq)
                sourcecollocation.save()
            elif sourcecollocation is None:
                # Same ignorable source pattern as the previous triple.
                continue

            targetfreq = targetmodel[targetpattern]
            text = targetpattern.tostring(targetclassdecoder)
            if ignorable(text):
                continue
            if targetpattern in targetcollocations: #quicker in-memory lookup
                targetcollocation = targetcollocations[targetpattern]
            else:
                targetcollocation = Collocation(collection=collection, language=options['targetlang'], text=text, freq=targetfreq)
                targetcollocation.save()
                # Only the id is cached, not the whole document.
                targetcollocations[targetpattern] = targetcollocation.id

            # Store the pair in both directions, with the two probabilities
            # swapped for the reverse direction.
            Translation(source=sourcecollocation, target=targetcollocation, prob=scores[0], revprob=scores[2]).save()
            Translation(source=targetcollocation, target=sourcecollocation, prob=scores[2], revprob=scores[0]).save()