Example #1
    def load(self, filename, options=None):
        if options is None:
            options = colibricore.PatternModelOptions()
        if os.path.exists(filename):
            super().load(filename, options)
        else:
            raise IOError("File not found: " + filename)
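A minimal usage sketch of this load() override; AlignmentModel (which appears in the examples below) and the file name are placeholders:

import colibricore

model = AlignmentModel()  #hypothetical: any model class inheriting the load() above
options = colibricore.PatternModelOptions(mintokens=2)
model.load("example.colibri.alignmodel", options)  #raises IOError if the file is missing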
Example #2
    def train(self, sourcefile, modelfile, **parameters):
        self.log("Preparing to generate bigram model")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile)

        if not os.path.exists(modelfile + '.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating bigram frequency list")
        options = colibricore.PatternModelOptions(
            mintokens=self.settings['freqthreshold'], minlength=1,
            maxlength=2)  #unigrams and bigrams
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Saving model")
        model.write(modelfile)
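A hedged sketch of reading the bigram model written above back in for lookups; the paths and the query bigram are placeholders, and the .cls symlink created by train() is reused for encoding:

import colibricore

options = colibricore.PatternModelOptions(mintokens=1, minlength=1, maxlength=2)
model = colibricore.UnindexedPatternModel("bigrams.colibri.patternmodel", options)  #placeholder path
encoder = colibricore.ClassEncoder("bigrams.colibri.patternmodel.cls")  #the symlinked class file
pattern = encoder.buildpattern("the end")  #placeholder bigram
if pattern in model:
    print(model.occurrencecount(pattern))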
Example #3
def main_alignmodel():
    parser = argparse.ArgumentParser(description="Load and view the specified alignment model", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i','--inputfile',type=str,help="Input alignment model (file prefix without .colibri.alignmodel-* extension)", action='store',required=True)
    parser.add_argument('-S','--sourceclassfile',type=str,help="Source class file", action='store',required=True)
    parser.add_argument('-T','--targetclassfile',type=str,help="Target class file", action='store',required=True)
    parser.add_argument('-p','--pts',type=float,help="Constrain by minimum probability p(t|s), assumes a moses-style score vector",default=0.0, action='store',required=False)
    parser.add_argument('-P','--pst',type=float,help="Constrain by minimum probability p(s|t), assumes a moses-style score vector", default=0.0,action='store',required=False)
    parser.add_argument('--debug',help="Enable debug", action='store_true',required=False)
    args = parser.parse_args()
    #args.storeconst, args.dataset, args.num, args.bar


    print("Loading source decoder " + args.sourceclassfile,file=sys.stderr)
    sourcedecoder = colibricore.ClassDecoder(args.sourceclassfile)
    print("Loading target decoder " + args.targetclassfile,file=sys.stderr)
    targetdecoder = colibricore.ClassDecoder(args.targetclassfile)
    print("Loading alignment model",file=sys.stderr)
    model = AlignmentModel()
    options = colibricore.PatternModelOptions(debug=args.debug)
    if options.DEBUG: print("Debug enabled",file=sys.stderr)
    sys.stderr.flush()
    model.load(args.inputfile, options)
    print("Outputting",file=sys.stderr)
    if args.pts or args.pst:
        scorefilter = lambda scores: scores[2] > args.pts and scores[0] > args.pst
    else:
        scorefilter = None
    model.output(sourcedecoder,targetdecoder,scorefilter)
Example #4
    def test001_alignmodel(self):
        """Checking alignment model"""
        options = colibricore.PatternModelOptions(mintokens=1,
                                                  doreverseindex=False)

        s = colibricore.ClassEncoder("test-en-nl/test-en-train.colibri.cls")
        t = colibricore.ClassEncoder("test-en-nl/test-nl-train.colibri.cls")
        sdec = colibricore.ClassDecoder("test-en-nl/test-en-train.colibri.cls")
        tdec = colibricore.ClassDecoder("test-en-nl/test-nl-train.colibri.cls")

        print("Loading alignment model", file=sys.stderr)
        model = AlignmentModel()
        model.load("test-en-nl/test-en-nl.colibri.alignmodel", options)
        print("Loaded", file=sys.stderr)
        model.output(sdec, tdec)
        print("Testing contents", file=sys.stderr)
        self.assertTrue((s.buildpattern('a'), t.buildpattern('een')) in model)
        self.assertTrue((s.buildpattern('just'),
                         t.buildpattern('maar')) in model)
        self.assertTrue((s.buildpattern('only'),
                         t.buildpattern('maar')) in model)
        self.assertTrue((s.buildpattern('bank'),
                         t.buildpattern('oever')) in model)
        self.assertTrue((s.buildpattern('bank'),
                         t.buildpattern('bank')) in model)
        self.assertTrue((s.buildpattern('bank'),
                         t.buildpattern('sturen')) in model)
        self.assertTrue((s.buildpattern('couch'),
                         t.buildpattern('bank')) in model)
        self.assertTrue((s.buildpattern('the bank'),
                         t.buildpattern('de oever')) in model)
        self.assertTrue((s.buildpattern('the bank'),
                         t.buildpattern('de bank')) in model)
        self.assertTrue((s.buildpattern('the couch'),
                         t.buildpattern('de bank')) in model)
        self.assertTrue((s.buildpattern('I see'),
                         t.buildpattern('Ik zie')) in model)
        self.assertTrue((s.buildpattern('He'), t.buildpattern('Hij')) in model)
        self.assertTrue((s.buildpattern('sits'),
                         t.buildpattern('zit')) in model)
        self.assertTrue((s.buildpattern('on'), t.buildpattern('on')) in model)
        self.assertTrue((s.buildpattern('today'),
                         t.buildpattern('vandaag')) in model)
        self.assertEqual(len(list(model.triples())), 15)
Example #5
def buildpatternmodel(testfiles):
    print("Loading test data...", file=sys.stderr)

    with open('inputmodel.txt', 'w', encoding='utf-8') as f:
        for testfile in testfiles:
            f.write(loadtext(testfile) + "\n")

    print("Building pattern model...", file=sys.stderr)

    classencoder = colibricore.ClassEncoder()
    classencoder.build('inputmodel.txt')
    classencoder.save('inputmodel.colibri.cls')
    classencoder.encodefile('inputmodel.txt', 'inputmodel.colibri.dat')

    options = colibricore.PatternModelOptions(mintokens=1, maxlength=3)
    patternmodel = colibricore.UnindexedPatternModel()
    patternmodel.train('inputmodel.colibri.dat', options)

    return patternmodel, classencoder
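A short sketch of consuming the returned model and encoder; the test file names are hypothetical, and the decoder path matches the class file buildpatternmodel() writes:

patternmodel, classencoder = buildpatternmodel(["test1.txt", "test2.txt"])  #hypothetical inputs
classdecoder = colibricore.ClassDecoder('inputmodel.colibri.cls')
for pattern, count in patternmodel.items():
    print(pattern.tostring(classdecoder), count)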
Example #6
    def train(self, sourcefile, modelfile, **parameters):
        self.log("Preparing to generate lexicon")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder(
                "", self.settings['minlength'],
                self.settings['maxlength'])  #character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile,
                                                    self.settings['minlength'],
                                                    self.settings['maxlength'])

        if not os.path.exists(modelfile + '.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(
            mintokens=self.settings['freqthreshold'], minlength=1,
            maxlength=1)  #unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.savemodel(
            model, modelfile,
            classfile)  #in separate function so it can be overloaded
Example #7
    def train(self):
        if self.sourcefile and not os.path.exists(self.modelfile):
            classfile = stripsourceextensions(self.sourcefile) + ".cls"
            corpusfile = stripsourceextensions(self.sourcefile) + ".dat"

            if not os.path.exists(classfile):
                #first argument is the class file to load ("" = build from scratch); the character
                #length constraints alone would otherwise be misread as a filename
                self.classencoder = colibricore.ClassEncoder("", self.minlength, self.maxlength)
                self.classencoder.build(self.sourcefile)
                self.classencoder.save(classfile)
            else:
                self.classencoder = colibricore.ClassEncoder(classfile, self.minlength, self.maxlength)

            if not os.path.exists(self.modelfile + '.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, self.modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.classencoder.encodefile(self.sourcefile, corpusfile)

            options = colibricore.PatternModelOptions(mintokens=self.threshold, minlength=1, maxlength=1)
            self.lexicon = colibricore.UnindexedPatternModel()
            self.lexicon.train(corpusfile, options)
            self.lexicon.write(self.modelfile)
Example #8
corpus = colibricore.IndexedCorpus("/tmp/hamlet.colibri.dat")
print("Total number of tokens: ", len(corpus))
firstword = corpus[(1, 0)]
print("First word: ", test(firstword.tostring(decoder), "To"))
needle = encoder.buildpattern("fair Ophelia")
for match in corpus.findpattern(needle):
    print("'fair Ophelia' found at ", match)
print("Token iteration:")
i = 0
for ref in corpus:
    i += 1
print("Total number of tokens: ", test(len(corpus), i))

print()

options = colibricore.PatternModelOptions(doskipgrams_exhaustive=True)

print("\n===== Building unindexed model ======\n")
unindexedmodel = colibricore.UnindexedPatternModel()
unindexedmodel.train("/tmp/hamlet.colibri.dat", options)
print("Pattern count", test(len(unindexedmodel), 385))
print("Type count", test(unindexedmodel.types(), 186))
print("Token count", test(unindexedmodel.tokens(), 354))

unindexedmodel.printmodel(decoder)
print("REPORT:")
unindexedmodel.report()
print("HISTOGRAM:")
unindexedmodel.histogram()

outputfilename = "/tmp/data.colibri.patternmodel"
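The snippet above presupposes an encoder, a decoder and a test() helper built earlier from the same corpus; a plausible reconstruction (the class file path and the helper's behaviour are assumptions):

encoder = colibricore.ClassEncoder("/tmp/hamlet.colibri.cls")  #assumed class file
decoder = colibricore.ClassDecoder("/tmp/hamlet.colibri.cls")

def test(value, expected):
    #hypothetical helper: render the value and flag any mismatch with the expectation
    return str(value) + ("" if value == expected else " (expected: " + str(expected) + ")")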
Example #9
    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log(
                "Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) + ".cls"
            corpusfile = stripsourceextensions(sourcefile) + ".dat"

            if not os.path.exists(classfile):

                self.log("Building class file")
                classencoder = colibricore.ClassEncoder(
                    "", self.settings['minlength'],
                    self.settings['maxlength'])  #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(
                    classfile, self.settings['minlength'],
                    self.settings['maxlength'])

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=self.settings['freqthreshold'],
                minlength=1,
                maxlength=1)  #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = []  #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log("WARNING: Unable to decode a pattern in the model! Invalid utf-8!")
                    continue  #pattern_s would be undefined for this pattern; skip it
                for suffix in self.suffixes:
                    if pattern_s.endswith(suffix) and pattern_s not in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(
                                    suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(
                                        otherpattern_s, False, False)
                                except KeyError:
                                    if found: found = []
                                    break
                                if otherpattern not in model:
                                    if found: found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (
                                        model.occurrencecount(pattern),
                                        model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio < self.settings['maxratio']:
                                        if found: found = []
                                        break
                                found.append(otherpattern_s)
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile, 'w', encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")

        elif modelfile == self.modelfile:
            try:
                self.confusibles
            except AttributeError:
                self.confusibles = []
                self.log("Loading confusiblefile")
                with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            self.confusibles.append(line)

            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase",
                                           "")  #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,
                               mode='rt',
                               encoding='utf-8',
                               errors='ignore') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0:  #progress report once per line rather than once per ngram
                        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                    for ngram in Windower(line, n):
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l + 1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append(
                                    leftcontext + (normalized, ) +
                                    rightcontext, suffix)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
Example #10
def extractskipgrams(alignmodel,
                     maxlength=8,
                     minskiptypes=2,
                     tmpdir="./",
                     constrainsourcemodel=None,
                     constraintargetmodel=None,
                     constrainskipgrams=False,
                     scorefilter=None,
                     quiet=False,
                     debug=False):
    if constrainskipgrams:  #strict constraints
        sourcemodel = constrainsourcemodel
        targetmodel = constraintargetmodel
    else:
        if not quiet:
            print("Writing all source patterns to temporary file",
                  file=sys.stderr)
        sourcepatternfile = tmpdir + "/sourcepatterns.colibri.dat"
        with open(sourcepatternfile, 'wb') as f:
            for sourcepattern in alignmodel.sourcepatterns():
                if not constrainsourcemodel or sourcepattern in constrainsourcemodel:
                    f.write(bytes(sourcepattern) + b'\0')

        if not quiet:
            print("Writing all target patterns to temporary file",
                  file=sys.stderr)
        targetpatternfile = tmpdir + "/targetpatterns.colibri.dat"
        with open(targetpatternfile, 'wb') as f:
            for targetpattern in alignmodel.targetpatterns():
                if not constraintargetmodel or targetpattern in constraintargetmodel:
                    f.write(bytes(targetpattern) + b'\0')

        options = colibricore.PatternModelOptions()
        options.MINTOKENS = 1
        options.MINSKIPTYPES = minskiptypes
        options.MAXLENGTH = maxlength
        options.DOSKIPGRAMS = True

        #we first build skipgrams from the patterns found in the phrase-table, for both sides independently,
        #using indexed pattern models

        if not quiet: print("Building source pattern model", file=sys.stderr)
        sourcemodel = colibricore.IndexedPatternModel()
        sourcemodel.train(sourcepatternfile, options, constrainsourcemodel)

        if not quiet: print("Building target pattern model", file=sys.stderr)
        targetmodel = colibricore.IndexedPatternModel()
        targetmodel.train(targetpatternfile, options, constraintargetmodel)

    #then for each pair in the phrasetable, we see if we can find abstracted pairs
    found = 0

    skipped = 0

    if not quiet: print("Computing total count", file=sys.stderr)
    total = alignmodel.itemcount()

    addlist = []
    num = 0

    if not quiet: print("Finding abstracted pairs", file=sys.stderr)
    for sourcepattern, targetpattern, features in alignmodel.items():
        if not isinstance(features, list) and not isinstance(features, tuple):
            print("WARNING: Expected feature vector, got " +
                  str(type(features)),
                  file=sys.stderr)
            continue
        if not isinstance(features[-1], list) and not isinstance(
                features[-1], tuple):
            print("WARNING: Word alignments missing for a pair, skipping....",
                  file=sys.stderr)
            continue
        if sourcepattern.isskipgram() or targetpattern.isskipgram():
            continue

        num += 1
        if not quiet and num % 100 == 0:
            print("@" + str(num) + "/" + str(total) + " = " +
                  str(round((num / total) * 100, 2)) + '%' + ",  found " +
                  str(found) + " skipgram pairs thus-far, skipped " +
                  str(skipped),
                  file=sys.stderr)

        #is this pair strong enough to use? Assuming moses-style score-vector
        if scorefilter and not scorefilter(features):
            skipped += 1
            continue

        if sourcepattern in sourcemodel and targetpattern in targetmodel:
            #find abstractions
            if debug:
                print("\tFinding abstractions for sourcepattern ",
                      sourcepattern.tostring(debug[0]) +
                      " with targetpattern " +
                      targetpattern.tostring(debug[1]),
                      file=sys.stderr)
            sourcetemplates = []
            targettemplates = []

            for template, count in sourcemodel.gettemplates(sourcepattern):
                if template.isskipgram() and template in sourcemodel:
                    if constrainskipgrams and template not in sourcemodel:
                        continue
                    sourcetemplates.append(template)
                    if debug:
                        print("\t\tAdded source template ",
                              template.tostring(debug[0]),
                              file=sys.stderr)

            for template, count in targetmodel.gettemplates(targetpattern):
                if template.isskipgram() and template in targetmodel:
                    if constrainskipgrams and template not in targetmodel:
                        continue
                    targettemplates.append(template)
                    if debug:
                        print("\t\tAdded source template ",
                              template.tostring(debug[1]),
                              file=sys.stderr)

            #these will act as a memory buffer, saving time
            sourceinstances = {}
            targetinstances = {}

            for sourcetemplate in sourcetemplates:
                for targettemplate in targettemplates:
                    if not alignmodel.haspair(
                            sourcetemplate, targettemplate
                    ):  #each pair needs to be processed only once
                        #we now have two skipgrams, to be proper alignments their gaps must only align with gaps:

                        if debug:
                            print("\t\tProcessing skipgram pair ",
                                  sourcetemplate.tostring(debug[0]) + " -- " +
                                  targettemplate.tostring(debug[1]),
                                  file=sys.stderr)

                        validalignment = False
                        for sourceindex, targetindex in features[-1]:
                            validalignment = (
                                sourcetemplate.isgap(sourceindex) ==
                                targettemplate.isgap(targetindex))
                            if not validalignment: break
                        if not validalignment: continue

                        if debug:
                            print("\t\tAlignment valid! Adding!",
                                  file=sys.stderr)

                        #if we made it here we have a proper pair!

                        alignmodel.add(sourcetemplate, targettemplate, [
                            1.0, 0.0, 1.0, 0.0, features[-2],
                            copy(features[-1])
                        ])  #lexical probability disabled (0),
                        found += 1

                        #Now we have to compute a new score vector based on the score vectors of the possible instantiations
                        #find all instantiations
                        #if not sourcetemplate in sourceinstances: #only once per sourcetemplate
                        #    sourceinstances[sourcetemplate] = sourcemodel.getinstantiations(sourcetemplate)
                        #if not targettemplate in targetinstances: #only once per sourcetemplate
                        #    targetinstances[targettemplate] = targetmodel.getinstantiations(targettemplate)

                        #usedsources = colibricore.PatternSet()
                        #usedtargets = colibricore.PatternSet()
                        #scorepart_t = numpy.zeros(2)
                        #scorepart_s = numpy.zeros(2)
                        #total_s = 0
                        #total_t = 0
                        #for sourceinst in sourceinstances[sourcetemplate]:
                        #    for targetinst in targetinstances[sourcetemplate]:
                        #        if alignmodel.haspair(sourceinst, targetinst):
                        #            usedsources.add(sourceinst)
                        #            instfeatures = alignmodel[(sourceinst,targetinst)]

                        #            #we will assume a standard moses configuration of features
                        #            assert(len(instfeatures) == 6)
                        #            #1,2 : p(s|t)   3,4 : p(t|s)    4: word penalty , 5: word alignments (not used here)

                        #            total_t[0] += instfeatures[0]
                        #            scorepart_t[1] += instfeatures[1]
                        #            scorepart_s[0] += instfeatures[3]
                        #            scorepart_s[1] += instfeatures[4]
        else:
            skipped += 1

    if not constrainskipgrams:
        print("Unloading models", file=sys.stderr)
        del sourcemodel
        del targetmodel

    #now we are going to renormalise the scores (leave lexical weights intact as is)
    print("Renormalising alignment model", file=sys.stderr)
    alignmodel.normalize('s-t-')

    print("Cleanup", file=sys.stderr)
    if not constrainskipgrams:
        os.unlink(sourcepatternfile)
        os.unlink(targetpatternfile)

    return alignmodel
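A hedged invocation sketch; alignmodel is a placeholder for a previously loaded alignment model, and debug, as the tostring() calls above imply, takes a (sourcedecoder, targetdecoder) tuple when enabled:

alignmodel = extractskipgrams(
    alignmodel,  #placeholder: a loaded alignment model
    maxlength=8,
    minskiptypes=2,
    tmpdir="/tmp",
    scorefilter=lambda scores: scores[2] > 0.05,  #assumes a moses-style score vector
    debug=False)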
Example #11
    def train(self, sourcefile, modelfile, **parameters):
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".nonewlines.dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile)

        if not os.path.exists(modelfile + '.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile,
                                    corpusfile,
                                    ignorenewlines=True)

        if modelfile.endswith('.1'):
            #unigram model (for recasing)
            self.log("Generating unigram frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=self.settings['recasethreshold'],
                minlength=1,
                maxlength=1)  #unigrams
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model")
            model.write(modelfile)
        elif modelfile.endswith('.3'):
            #trigram model
            self.log("Generating filtered trigram frequency list")
            filterpatterns = colibricore.PatternSet()
            for punc in ColibriPuncRecaseModule.PUNCTUATION:
                filterpattern = classencoder.buildpattern('{*1*} ' + punc +
                                                          ' {*1*}')
                if not filterpattern.unknown():
                    filterpatterns.add(filterpattern)
            self.log("(" + str(len(filterpatterns)) + " filters)")

            options = colibricore.PatternModelOptions(
                mintokens=self.settings['deletioncutoff'],
                minlength=3,
                maxlength=3)  #trigrams
            model = colibricore.UnindexedPatternModel()
            model.train_filtered(corpusfile, options, filterpatterns)

            self.log("Saving model")
            model.write(modelfile)
        else:
            #bigram model
            self.log("Generating bigram frequency list")
            options = colibricore.PatternModelOptions(mintokens=min(
                self.settings['insertioncutoff'],
                self.settings['recasethreshold2']),
                                                      minlength=2,
                                                      maxlength=2)  #bigrams
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model")
            model.write(modelfile)
            del model
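The '{*1*}' notation passed to buildpattern() above marks a gap of exactly one token, turning the pattern into a skipgram used here for filtered training; a small sketch (the class file path is a placeholder):

encoder = colibricore.ClassEncoder("corpus.colibri.cls")  #placeholder
skipgram = encoder.buildpattern("{*1*} , {*1*}")  #matches any trigram with a comma in the middle
print(skipgram.isskipgram())  #True: the pattern contains gaps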
Example #12
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[l]
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append(leftcontext + rightcontext, focus)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile+'.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating pattern model")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)
Example #13
def main():
    dopretests = True
    try:
        tests = sys.argv[1]
        if tests[0] == 'x':
            dopretests = False
            tests = tests[1:]
        if '-' in tests:
            begintest = int(tests.split('-')[0])
            endtest = int(tests.split('-')[1])
        else:
            begintest = endtest = int(tests)
    except (IndexError, ValueError):
        print(
            "Specify a test number or range (e.g. 2-5; prefix with x to skip the pretests) as the first argument",
            file=sys.stderr)
        sys.exit(2)
    try:
        textfile = sys.argv[2]
    except IndexError:
        print(
            "Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis",
            file=sys.stderr)
        sys.exit(2)

    try:
        tmpdir = sys.argv[3]
    except IndexError:
        tmpdir = "/tmp/"

    classfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.cls'
    datafile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.dat'
    modelfile = tmpdir + "/" + os.path.basename(
        textfile) + '.colibri.patternmodel'

    if not os.path.exists(textfile):
        print("File does not exist", file=sys.stderr)
        sys.exit(2)

    if dopretests:
        linecount = 0
        print("PRETEST #1 - Reading text file (Python)")
        b = begin()
        with open(textfile, 'r', encoding='utf-8') as f:
            for line in f:
                linecount += 1
        end(b)
        print("\t(Read " + str(linecount) + " lines)")

        print("PRETEST #2 - Building class encoder")
        encoder = colibricore.ClassEncoder()
        b = begin()
        encoder.build(textfile)
        end(b)

        print("PRETEST #3 - Saving class encoder")
        b = begin()
        encoder.save(classfile)
        end(b)

        print("PRETEST #4 - Class encoding corpus")
        b = begin()
        encoder.encodefile(textfile, datafile)
        end(b)

        print("PRETEST #5 - Unloading encoder")
        b = begin()
        del encoder
        gc.collect()
        end(b)

    if begintest < endtest:
        print("Running tests ", begintest, " to ", endtest)
        for testnum in range(begintest, min(endtest + 1, 12)):
            os.system("python3 " + sys.argv[0] + " x" + str(testnum) + " " +
                      textfile + " " + tmpdir)

    else:
        testnum = begintest
        print("-------------------- " + colorf('bold', 'TEST') + " #" +
              str(testnum) + " ----------------------")
        if testnum == 1:

            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively (Python defaultdict + Pynlpl MultiWindower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")

        elif testnum == 2:
            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively with NLTK (nltk.FreqDist + nltk.util.ngrams)"
            )

            from nltk.probability import FreqDist
            from nltk.util import ngrams

            fd = FreqDist()
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split(' ')
                    for n in range(1, 9):
                        for ngram in ngrams(tokens, n):
                            fd[ngram] += 1
            end(b)
            print("\t(Done)")
        elif testnum == 3:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model

        elif testnum == 4:
            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, with look-back)  (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            for n in range(1, 9):
                with open(textfile, 'r', encoding='utf-8') as f:
                    for line in f:
                        for ngram in Windower(line, n):
                            docount = True
                            if n > 1:
                                for subngram in Windower(ngram, n - 1):
                                    if not subngram in ngrams:
                                        docount = False
                                        break
                            if docount:
                                ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 5:
            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, without look-back)  (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            for ngram in list(ngrams.keys()):
                if ngrams[ngram] < 2: del ngrams[ngram]
            gc.collect()
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 6:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=2) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 7:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with UnindexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.UnindexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 8:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

            del model

        elif testnum == 9:
            print(
                "Extracting and counting n-grams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 10:

            print(
                "Extracting and counting n-grams and skipgrams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2,
                                                      maxlength=8,
                                                      doskipgrams=True)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 11:
            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with OrderedUnindexedPatternModel"
            )
            model = colibricore.OrderedUnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model

        else:
            print("No such test", file=sys.stderr)
        print()
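The benchmark relies on small helpers (begin, end, savemodel, colorf) that are not part of this fragment; a minimal sketch of plausible implementations, assuming begin()/end() form a wall-clock timer:

import time

def begin():
    return time.time()

def end(b):
    print("\t(Took " + str(round(time.time() - b, 2)) + "s)")

def savemodel(model, modelfile):
    model.write(modelfile)

def colorf(style, s):
    #minimal ANSI styling; only 'bold' is used above
    return "\033[1m" + s + "\033[0m" if style == 'bold' else s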
Example #14
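This fragment sits inside a loop over candidate anchor n-grams; a hedged reconstruction of the setup it presupposes (all paths, and the enclosing loop itself, are assumptions):

classencoder = colibricore.ClassEncoder("corpus.colibri.cls")  #assumed
classdecoder = colibricore.ClassDecoder("corpus.colibri.cls")  #assumed
corpusfile = "corpus.colibri.dat"  #assumed encoded corpus
outdir = "./"  #assumed output directory
anchormodel = colibricore.UnindexedPatternModel()  #collects anchor counts via add(), as used below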
                if ngram:
                    pattern = classencoder.buildpattern(ngram)
                    if pattern.unknown():
                        print("WARNING: Anchor has unknown part " + ngram +
                              ", skipping... (" +
                              pattern.tostring(classdecoder) + ")",
                              file=sys.stderr)
                    else:
                        if len(pattern) <= 5:
                            anchormodel.add(
                                pattern)  #(will count +1  if already exists)

print("Anchors found: ", len(anchormodel), file=sys.stderr)

print("Counting n-grams, constrained by anchors", file=sys.stderr)
options = colibricore.PatternModelOptions(mintokens=1, maxlength=5)
patternmodel = colibricore.UnindexedPatternModel()
patternmodel.train(
    corpusfile, options, anchormodel
)  #(the last argument constrains training to patterns occurring in that model, i.e. the intersection of these models, saving heaps of space)

outfiles = []
for i in range(1, 6):
    outfiles.append(open(outdir + str(i) + "_grams.txt", 'w',
                         encoding='utf-8'))

for ngram, count in patternmodel.items():
    i = len(ngram)
    anchorcount = anchormodel[ngram]
    outfiles[i - 1].write(
        ngram.tostring(classdecoder) + "\t" + str(anchorcount) + "\t" +
        str(count) + "\n")  #final term assumed: the pattern's own count (source truncated here)
Example #15
def main_extractfeatures():
    parser = argparse.ArgumentParser(description="Extract context features and build classifier data (-C) or add to alignment model", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i','--inputfile',type=str,help="Input alignment model", action='store',required=True)
    parser.add_argument('-o','--outputdir',type=str,help="Output directory, when used with -C", action='store',required=True)
    parser.add_argument('-s','--sourcemodel',type=str,help="Source model (indexed pattern model)", action='store',required=True)
    parser.add_argument('-t','--targetmodel',type=str,help="Target model (indexed pattern model)", action='store',required=True)
    parser.add_argument('-S','--sourceclassfile',type=str,help="Source class file", action='store',required=True)
    parser.add_argument('-T','--targetclassfile',type=str,help="Target class file", action='store',required=True)
    parser.add_argument('-f','--corpusfile',type=str,help="Corpus input file for feature extraction, may be specified multiple times, but all data files must cover the exact same data, i.e. have exactly the same indices (describing different factors)", action='append',required=True)
    parser.add_argument('-c','--classfile',type=str,help="Class file for the specified data file (may be specified multiple times, once per -f)", action='append',required=True)
    parser.add_argument('-l','--leftsize',type=int,help="Left context size (may be specified multiple times, once per -f)", action='append',required=True)
    parser.add_argument('-r','--rightsize',type=int,help="Right context size (may be specified multiple times, once per -f)", action='append',required=True)
    parser.add_argument('-C','--buildclassifiers',help="Build classifier training data, one classifier expert per pattern, specify a working directory in -o", action='store_true',default=False)
    parser.add_argument('-w','--weighbyoccurrence',help="When building classifier data (-C), use exemplar weighting to reflect occurrence count, rather than duplicating instances", action='store_true',default=False)
    parser.add_argument('-W','--weighbyscore',help="When building classifier data (-C), use exemplar weighting to weigh in p(t|s) from score vector", action='store_true',default=False)
    parser.add_argument('-I','--instancethreshold',type=int,help="Classifiers (-C) having fewer than the specified number of instances will not be generated", action='store',default=2)
    parser.add_argument('-X','--experts', help="Classifier experts, one per source pattern", action="store_true", default=False)
    parser.add_argument('-M','--monolithic', help="Monolithic classifier (won't work with keywords enabled!)", action="store_true", default=False)
    parser.add_argument('-k','--keywords',help="Add global keywords in context", action='store_true',default=False)
    parser.add_argument('--km',dest='keywordmodel',type=str,help="Source-side unigram model (target-side if crosslingual is set!) for keyword extraction. Needs to be an indexed model with only unigrams.", action='store',required=False,default="")
    parser.add_argument("--kt",dest="bow_absolute_threshold", help="Keyword needs to occur at least this many times in the context (absolute number)", type=int, action='store',default=3)
    parser.add_argument("--kp",dest="bow_prob_threshold", help="minimal P(translation|keyword)", type=float, action='store',default=0.001)
    parser.add_argument("--kg",dest="bow_filter_threshold", help="Keyword needs to occur at least this many times globally in the entire corpus (absolute number)", type=int, action='store',default=20)
    #parser.add_argument("--ka",dest="compute_bow_params", help="Attempt to automatically compute --kt,--kp and --kg parameters", action='store_false',default=True)
    parser.add_argument('--crosslingual', help="Extract target-language context features instead of source-language features (for use with Colibrita). In this case, the corpus in -f and in any additional factor must be the *target* corpus", action="store_true", default=False)
    args = parser.parse_args()

    if not (len(args.corpusfile) == len(args.classfile) == len(args.leftsize) == len(args.rightsize)):
        print("Number of mentions of -f, -c, -l and -r has to match",file=sys.stderr)
        sys.exit(2)


    options = colibricore.PatternModelOptions(mintokens=1,doreverseindex=False)

    print("Loading alignment model",file=sys.stderr)
    model = AlignmentModel()
    model.load(args.inputfile,options)


    print("Loading source decoder " + args.sourceclassfile,file=sys.stderr)
    sourcedecoder = colibricore.ClassDecoder(args.sourceclassfile)
    print("Loading target decoder " + args.targetclassfile,file=sys.stderr)
    targetdecoder = colibricore.ClassDecoder(args.targetclassfile)

    print("Loading source model " , args.sourcemodel, file=sys.stderr)
    sourcemodel = colibricore.IndexedPatternModel(args.sourcemodel, options)

    print("Loading target model ", args.targetmodel, file=sys.stderr)
    targetmodel = colibricore.IndexedPatternModel(args.targetmodel, options)



    model.conf = []
    for corpusfile, classfile,left, right in zip(args.corpusfile, args.classfile, args.leftsize, args.rightsize):
        print("Loading corpus file ", corpusfile, file=sys.stderr)
        if classfile == args.sourceclassfile:
            d = sourcedecoder
        elif classfile == args.targetclassfile:
            d = targetdecoder
        else:
            d = colibricore.ClassDecoder(classfile)
        model.conf.append( Configuration(colibricore.IndexedCorpus(corpusfile), d ,left,True, right) )

    if args.keywords:
        if not args.keywordmodel:
            print("Supply an indexed pattern model containing unigrams to extract keywords from!",file=sys.stderr)
            sys.exit(2)
        print("Loading keyword model ", args.keywordmodel, file=sys.stderr)
        kmoptions = colibricore.PatternModelOptions(mintokens=max(args.bow_absolute_threshold,args.bow_filter_threshold),minlength=1,maxlength=1,doreverseindex=True)
        reverseindex = colibricore.IndexedCorpus(args.corpusfile[0])
        model.conf[0].keywordmodel = colibricore.IndexedPatternModel(args.keywordmodel, kmoptions, None, reverseindex)
        model.conf[0].kw_absolute_threshold = args.bow_absolute_threshold
        model.conf[0].kw_prob_threshold = args.bow_prob_threshold


    if args.buildclassifiers:
        print("Building classifiers",file=sys.stderr)
        if not args.monolithic and not args.experts:
            args.experts = True

        if not os.path.isdir(args.outputdir):
            try:
                os.mkdir(args.outputdir)
            except:
                print("Unable to build directory " + args.outputdir,file=sys.stderr)
                sys.exit(2)


        f = None
        trainfile = ""
        if args.monolithic:
            f = open(args.outputdir + "/train.train",'w',encoding='utf-8')
            f2 = open(args.outputdir + "/sourcepatterns.list",'w',encoding='utf-8')

        fconf = open(args.outputdir + "/classifier.conf",'wb')

        confser = []
        for conf in model.conf:
            confser.append({'corpus': conf.corpus.filename(), 'classdecoder': conf.classdecoder.filename(), 'leftcontext': conf.leftcontext, 'focus': conf.focus,'rightcontext': conf.rightcontext})
        classifierconf = { 'weighbyoccurrence': args.weighbyoccurrence, 'weighbyscore': args.weighbyscore, 'experts': args.experts, 'monolithic': args.monolithic, 'featureconf': confser}
        pickle.dump(classifierconf, fconf)
        fconf.close()


        prevsourcepattern = None
        firsttargetpattern = None
        prevtargetpattern = None
        for sourcepattern, targetpattern, featurevectors, scorevector in model.extractcontextfeatures(sourcemodel, targetmodel, model.conf, sourcedecoder, targetdecoder, args.crosslingual, args.outputdir ):
            if prevsourcepattern is None or sourcepattern != prevsourcepattern:
                #write previous buffer to file:
                if prevsourcepattern and firsttargetpattern:
                    sourcepattern_s = prevsourcepattern.tostring(sourcedecoder)
                    if prevtargetpattern and firsttargetpattern != prevtargetpattern:
                        #only bother if there are at least two distinct target options
                        trainfile = args.outputdir + "/" + quote_plus(sourcepattern_s) + ".train"
                        if len(buffer) < min(args.instancethreshold, 2):
                            print("Omitting " + trainfile + ", only " + str(len(buffer)) + " instances",file=sys.stderr)
                        else:
                            if len(quote_plus(sourcepattern_s) + ".train") > 100:
                                print("ERROR: Filename too long, skipping: " + trainfile,file=sys.stderr)
                            else:
                                print("Writing " + trainfile + " (" + str(len(buffer)) + " instances)",file=sys.stderr)
                                if args.experts:
                                    f = open(trainfile,'w',encoding='utf-8')
                                elif args.monolithic:
                                    f2.write(sourcepattern_s+"\n")
                                for line, occurrences,pts in buffer:
                                    if args.weighbyscore:
                                        f.write(line + "\t" + str(occurrences*pts) +  "\n")
                                    elif args.weighbyoccurrence:
                                        f.write(line + "\t" + str(occurrences) +  "\n")
                                    else:
                                        for i in range(0,occurrences):
                                            f.write(line + "\n")
                                if args.experts:
                                    f.close()
                    else:
                        print("Only one target option for " + sourcepattern_s + " (" + str(len(buffer)) + " instances), no classifier needed",file=sys.stderr)

                buffer = []
                prevsourcepattern = sourcepattern
                firsttargetpattern = targetpattern

            for featurevector, count in featurevectors:
                buffer.append( (featurestostring(featurevector, model.conf, args.crosslingual, sourcedecoder) + "\t" + targetpattern.tostring(targetdecoder), count, scorevector[2]) ) #buffer holds (line, occurrences, pts)
                #(model.itemtostring(sourcepattern, targetpattern, featurevector,sourcedecoder, targetdecoder,False,True,False), count,scorevector[2] )  )  #buffer holds (line, occurrences, pts)

            prevsourcepattern = sourcepattern
            prevtargetpattern = targetpattern


        #write last one to file:
        if prevsourcepattern and firsttargetpattern and prevtargetpattern and firsttargetpattern != prevtargetpattern:
            #only bother if there are at least two distinct target options
            sourcepattern_s = prevsourcepattern.tostring(sourcedecoder)
            trainfile = args.outputdir + "/" + quote_plus(sourcepattern_s) + ".train"
            if len(buffer) < args.instancethreshold:
                print("Omitting " + trainfile + ", only " + str(len(buffer)) + " instances",file=sys.stderr)
            else:
                print("Writing " + trainfile + " (" + str(len(buffer)) + " instances)",file=sys.stderr)
                if args.experts:
                    f = open(trainfile,'w',encoding='utf-8')
                for line, occurrences,pts in buffer:
                    if args.weighbyscore:
                        f.write(line + "\t" + str(occurrences*pts) +  "\n")
                    elif args.weighbyoccurrence:
                        f.write(line + "\t" + str(occurrences) +  "\n")
                    else:
                        for i in range(0,occurrences):
                            f.write(line + "\n")
                if args.experts:
                    f.close()

        if args.monolithic:
            f.close()
            f2.close()
Example #16
    def handle(self, *args, **options):
        sourceclassfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.cls')
        sourcecorpusfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.dat')
        sourcemodelfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.patternmodel')

        if not os.path.exists(sourceclassfile) or not os.path.exists(sourcecorpusfile) or options['force']:
            self.stdout.write("Encoding source corpus ...")
            sourceclassencoder = colibricore.ClassEncoder()
            sourceclassencoder.build(options['sourcecorpus'])
            sourceclassencoder.save(sourceclassfile)
            sourceclassencoder.encodefile(options['sourcecorpus'], sourcecorpusfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write("Reusing previously encoded source corpus ...")

        targetclassfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.cls')
        targetcorpusfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.dat')
        targetmodelfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.patternmodel')

        if not os.path.exists(targetclassfile) or not os.path.exists(targetcorpusfile) or options['force']:
            self.stdout.write("Encoding target corpus ...")
            targetclassencoder = colibricore.ClassEncoder()
            targetclassencoder.build(options['targetcorpus'])
            targetclassencoder.save(targetclassfile)
            targetclassencoder.encodefile(options['targetcorpus'], targetcorpusfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write("Reusing previously encoded target corpus ...")

        modeloptions = colibricore.PatternModelOptions(mintokens=options['freqthreshold'],maxlength=options['maxlength'])

        if not os.path.exists(sourcemodelfile) or options['force']:
            self.stdout.write('Computing pattern model of source corpus ...')
            sourcemodel = colibricore.UnindexedPatternModel()
            sourcemodel.train(sourcecorpusfile, modeloptions)
            sourcemodel.write(sourcemodelfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            sourcemodel = None
            self.stdout.write("Reusing previously computed source model ...")

        if not os.path.exists(targetmodelfile) or options['force']:
            self.stdout.write('Computing pattern model of target corpus ...')
            targetmodel = colibricore.UnindexedPatternModel()
            targetmodel.train(targetcorpusfile, modeloptions)
            targetmodel.write(targetmodelfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            targetmodel = None
            self.stdout.write("Reusing previously computed target model ...")

        alignmodelfile = os.path.join(options['tmpdir'], "alignmodel.colibri")

        #delete models to conserve memory during next step
        if sourcemodel is not None:
            del sourcemodel
            self.stdout.write(self.style.SUCCESS('Unloaded source patternmodel'))
        if targetmodel is not None:
            del targetmodel
            self.stdout.write(self.style.SUCCESS('Unloaded target patternmodel'))

        if not os.path.exists(alignmodelfile) or options['force']:
            cmd = "colibri-mosesphrasetable2alignmodel -i " + options['phrasetable'] + " -o " + alignmodelfile + " -S " + sourceclassfile + " -T " + targetclassfile + " -m " + sourcemodelfile + " -M " + targetmodelfile + " -t " + str(options['freqthreshold']) + " -l " + str(options['maxlength']) + " -p " + str(options['pts']) + " -P " + str(options['pst']) + " -j " + str(options['joinedthreshold']) + " -d " + str(options['divergencethreshold'])
            self.stdout.write("Computing alignment model: " + cmd)
            os.system(cmd)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write(self.style.SUCCESS('Reusing previously computed alignment model'))


        self.stdout.write("Loading models")
        sourceclassdecoder = colibricore.ClassDecoder(sourceclassfile)
        targetclassdecoder = colibricore.ClassDecoder(targetclassfile)
        sourcemodel = colibricore.UnindexedPatternModel(sourcemodelfile, modeloptions)
        targetmodel = colibricore.UnindexedPatternModel(targetmodelfile, modeloptions)
        alignmodel = colibricore.PatternAlignmentModel_float(alignmodelfile, modeloptions)
        self.stdout.write(self.style.SUCCESS('DONE'))

        #collection,_ = Collection.objects.get_or_create(name=options['title'], sourcelanguage=options['sourcelang'], targetlanguage=options['targetlang'])
        #collection_id = 1

        l = len(alignmodel)


        self.stdout.write("Connecting to MongoDB server at " + settings.MONGODB_HOST + ":" + str(settings.MONGODB_PORT) )
        mongoengine.connect("colloquery", host=settings.MONGODB_HOST, port=settings.MONGODB_PORT)

        self.stdout.write("Generating translation pairs (this may take a while)..." )

        targetcollocations = {}
        prevsourcepattern = None
        collection = Collection(name=options['title'], sourcelanguage=options['sourcelang'], targetlanguage=options['targetlang'])
        collection.save()
        sourcecount = 0

        for i, (sourcepattern, targetpattern, scores) in enumerate(alignmodel.triples()):
            if i % 100 == 0:
                self.stdout.write(str(round(((sourcecount + 1) / l) * 100,1)) + "% -- @" + str(sourcecount + 1) + " of " + str(l) + ": inserted " + str(i+1) + " pairs") #(source=" + str(n_source) + ", target=" + str(n_target) + ", source-keywords=" + str(n_source_keywords) + ", target-keywords=" + str(n_target_keywords) + ")")

            if prevsourcepattern is None or sourcepattern != prevsourcepattern:
                prevsourcepattern = sourcepattern
                sourcecount += 1

                sourcefreq = sourcemodel[sourcepattern]
                text = sourcepattern.tostring(sourceclassdecoder)
                if ignorable(text):
                    continue
                sourcecollocation = Collocation(collection=collection, language=options['sourcelang'], text=text, freq=sourcefreq)
                sourcecollocation.save()



            targetfreq = targetmodel[targetpattern]
            text = targetpattern.tostring(targetclassdecoder)
            if ignorable(text):
                continue
            if targetpattern in targetcollocations: #quicker in-memory lookup
                # targetcollocation = Collocation.objects(text=text, language=options['targetlang'], collection=collection)[0] #get from db
                targetcollocation = targetcollocations[targetpattern]
            else:
                targetcollocation = Collocation(collection=collection, language=options['targetlang'], text=text, freq=targetfreq)
                targetcollocation.save()
                #self.stdout.write(repr(targetcollocation.id))
                targetcollocations[targetpattern] = targetcollocation.id

            Translation(source=sourcecollocation, target=targetcollocation, prob=scores[0], revprob=scores[2]).save()
            Translation(source=targetcollocation, target=sourcecollocation, prob=scores[2], revprob=scores[0]).save()