def train(self, sourcefile, modelfile, **parameters): self.log("Preparing to generate bigram model") classfile = stripsourceextensions(sourcefile) + ".cls" corpusfile = stripsourceextensions(sourcefile) + ".dat" if not os.path.exists(classfile): self.log("Building class file") classencoder = colibricore.ClassEncoder( ) #character length constraints classencoder.build(sourcefile) classencoder.save(classfile) else: classencoder = colibricore.ClassEncoder(classfile) if not os.path.exists(modelfile + '.cls'): #make symlink to class file, using model name instead of source name os.symlink(classfile, modelfile + '.cls') if not os.path.exists(corpusfile): self.log("Encoding corpus") classencoder.encodefile(sourcefile, corpusfile) self.log("Generating bigram frequency list") options = colibricore.PatternModelOptions( mintokens=self.settings['freqthreshold'], minlength=1, maxlength=2) #unigrams and bigrams model = colibricore.UnindexedPatternModel() model.train(corpusfile, options) self.log("Saving model") model.write(modelfile)
def load(self): """Load the requested modules from self.models""" self.errorlist = {} if not self.models: raise Exception("Specify one or more models to load!") if self.hapaxer: self.log("Loading hapaxer...") self.hapaxer.load() self.log("Loading models...") if len(self.models) == 2: modelfile, lexiconfile = self.models else: modelfile = self.models[0] lexiconfile = None if not os.path.exists(modelfile): raise IOError("Missing expected timbl model file: " + modelfile + ". Did you forget to train the system?") if lexiconfile and not os.path.exists(lexiconfile): raise IOError("Missing expected lexicon model file: " + lexiconfile + ". Did you forget to train the system?") self.log("Loading model file " + modelfile + "...") fileprefix = modelfile.replace(".ibase","") #has been verified earlier self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(),threading=True, debug=self.debug) self.classifier.load() if lexiconfile: self.log("Loading colibri model file for lexicon " + lexiconfile) self.classencoder = colibricore.ClassEncoder(lexiconfile + '.cls') self.lexicon = colibricore.UnindexedPatternModel(lexiconfile) else: self.lexicon = None
def load(self): """Load the requested modules from self.models""" if len(self.models) != 1: raise Exception("Specify one and only one model to load!") modelfile = self.models[0] if not os.path.exists(modelfile): raise IOError("Missing expected model file:" + modelfile) self.log("Loading colibri model file " + modelfile) self.classencoder = colibricore.ClassEncoder(modelfile + '.cls') self.classdecoder = colibricore.ClassDecoder(modelfile + '.cls') self.patternmodel = colibricore.UnindexedPatternModel(modelfile)
def load(self): """Load the requested modules from self.models""" if not self.models: raise Exception("Specify one or more models to load!") self.log("Loading models...") modelfile = self.models[0] if not os.path.exists(modelfile): raise IOError("Missing expected model file: " + modelfile + ". Did you forget to train the system?") self.log("Loading class encoder/decoder for " + modelfile + " ...") self.classencoder = colibricore.ClassEncoder(modelfile + '.cls') self.classdecoder = colibricore.ClassDecoder(modelfile + '.cls') self.log("Loading model files " + modelfile + ", " + modelfile + ".1 and " + modelfile + ".3 ...") self.unigram_model = colibricore.UnindexedPatternModel(modelfile + '.1') self.bigram_model = colibricore.UnindexedPatternModel(modelfile) self.trigram_model = colibricore.UnindexedPatternModel(modelfile + '.3')
def buildpatternmodel(testfiles):
    print("Loading test data...", file=sys.stderr)
    with open('inputmodel.txt', 'w', encoding='utf-8') as f:
        for testfile in testfiles:
            f.write(loadtext(testfile) + "\n")

    print("Building pattern model...", file=sys.stderr)
    classencoder = colibricore.ClassEncoder()
    classencoder.build('inputmodel.txt')
    classencoder.save('inputmodel.colibri.cls')
    classencoder.encodefile('inputmodel.txt', 'inputmodel.colibri.dat')

    options = colibricore.PatternModelOptions(mintokens=1, maxlength=3)
    patternmodel = colibricore.UnindexedPatternModel()
    patternmodel.train('inputmodel.colibri.dat', options)
    return patternmodel, classencoder
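#Usage sketch (file names and phrase are illustrative): build the model over the
#test data and check whether a phrase occurs in it, using colibricore calls that
#appear elsewhere in this codebase (buildpattern, membership test, occurrencecount).
patternmodel, classencoder = buildpatternmodel(['test1.txt', 'test2.txt'])
phrase = classencoder.buildpattern("brown fox")
if phrase in patternmodel:
    print("Phrase occurs", patternmodel.occurrencecount(phrase), "times", file=sys.stderr)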
def train(self, sourcefile, modelfile, **parameters): self.log("Preparing to generate lexicon") classfile = stripsourceextensions(sourcefile) + ".cls" corpusfile = stripsourceextensions(sourcefile) + ".dat" if not os.path.exists(classfile): self.log("Building class file") classencoder = colibricore.ClassEncoder( "", self.settings['minlength'], self.settings['maxlength']) #character length constraints classencoder.build(sourcefile) classencoder.save(classfile) else: classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength']) if not os.path.exists(modelfile + '.cls'): #make symlink to class file, using model name instead of source name os.symlink(classfile, modelfile + '.cls') if not os.path.exists(corpusfile): self.log("Encoding corpus") classencoder.encodefile(sourcefile, corpusfile) if not os.path.exists(modelfile + '.cls'): #make symlink to class file, using model name instead of source name os.symlink(classfile, modelfile + '.cls') self.log("Generating frequency list") options = colibricore.PatternModelOptions( mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1) #unigrams only model = colibricore.UnindexedPatternModel() model.train(corpusfile, options) self.savemodel( model, modelfile, classfile) #in separate function so it can be overloaded
def train(self):
    if self.sourcefile and not os.path.exists(self.modelfile):
        classfile = stripsourceextensions(self.sourcefile) + ".cls"
        corpusfile = stripsourceextensions(self.sourcefile) + ".dat"

        if not os.path.exists(classfile):
            classencoder = colibricore.ClassEncoder("", self.minlength, self.maxlength) #character length constraints
            classencoder.build(self.sourcefile)
            classencoder.save(classfile)
            self.classencoder = classencoder
        else:
            self.classencoder = colibricore.ClassEncoder(classfile, self.minlength, self.maxlength)

        if not os.path.exists(self.modelfile + '.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, self.modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.classencoder.encodefile(self.sourcefile, corpusfile)

        options = colibricore.PatternModelOptions(mintokens=self.threshold, minlength=1, maxlength=1)
        self.lexicon = colibricore.UnindexedPatternModel()
        self.lexicon.train(corpusfile, options)
        self.lexicon.write(self.modelfile)
def load(self):
    if not os.path.exists(self.modelfile):
        raise IOError("Missing expected model file for hapaxer: " + self.modelfile)
    self.classencoder = colibricore.ClassEncoder(self.modelfile + '.cls')
    #self.classdecoder = colibricore.ClassDecoder(self.modelfile + '.cls')
    self.lexicon = colibricore.UnindexedPatternModel(self.modelfile)
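#Sketch (not part of the original module): one way the loaded lexicon could back
#a hapax check, assuming a frequency threshold attribute self.threshold as used
#during training; the method name ishapax is illustrative.
def ishapax(self, word):
    pattern = self.classencoder.buildpattern(word)
    if pattern.unknown():
        #word is not in the class encoding at all, treat it as a hapax
        return True
    return self.lexicon.occurrencecount(pattern) < self.threshold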
def train(self, sourcefile, modelfile, **parameters):
    if modelfile == self.confusiblefile:
        #Build frequency list
        self.log("Preparing to generate lexicon for suffix confusible module")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength']) #character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1) #unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Finding confusible pairs")
        classdecoder = colibricore.ClassDecoder(classfile)
        self.confusibles = [] #pylint: disable=attribute-defined-outside-init
        for pattern in model:
            try:
                pattern_s = pattern.tostring(classdecoder)
            except UnicodeDecodeError:
                self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                continue
            for suffix in self.suffixes:
                if pattern_s.endswith(suffix) and not pattern_s in self.confusibles:
                    #gather the alternatives with the other suffixes; all must occur in the model
                    found = []
                    for othersuffix in self.suffixes:
                        if othersuffix != suffix:
                            otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                            try:
                                otherpattern = classencoder.buildpattern(otherpattern_s, False, False)
                            except KeyError:
                                if found: found = []
                                break
                            if not otherpattern in model:
                                if found: found = []
                                break
                            if self.settings['maxratio'] != 0:
                                freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                ratio = max(freqs) / min(freqs)
                                if ratio < self.settings['maxratio']:
                                    if found: found = []
                                    break
                            found.append(otherpattern_s)
                    if found:
                        self.confusibles.append(pattern_s)
                        for s in found:
                            self.confusibles.append(s)

        self.log("Writing confusible list")
        with open(modelfile, 'w', encoding='utf-8') as f:
            for confusible in self.confusibles:
                f.write(confusible + "\n")

    elif modelfile == self.modelfile:
        try:
            self.confusibles
        except AttributeError:
            #confusibles were not built in this run, load them from the confusible file
            self.confusibles = []
            self.log("Loading confusiblefile")
            with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        self.confusibles.append(line)

        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.confusibles:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        suffix, normalized = self.getsuffix(confusible)
                        if suffix is not None:
                            classifier.append(leftcontext + (normalized,) + rightcontext, suffix)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
def main():
    parser = argparse.ArgumentParser(description="Extract skipgrams from a Moses phrasetable", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t', '--minskiptypes', type=int, help="Minimal skip types", action='store', default=2, required=False)
    parser.add_argument('-i', '--inputfile', type=str, help="Input alignment model (file prefix without .colibri.alignmodel-* extension) or moses phrasetable", action='store', required=True)
    parser.add_argument('-o', '--outputfile', type=str, help="Output alignment model (file prefix without .colibri.alignmodel-* extension). Same as input if not specified!", default="", action='store', required=False)
    parser.add_argument('-l', '--maxlength', type=int, help="Maximum length", action='store', default=8, required=False)
    parser.add_argument('-W', '--tmpdir', type=str, help="Temporary work directory", action='store', default="./", required=False)
    parser.add_argument('-S', '--sourceclassfile', type=str, help="Source class file", action='store', required=True)
    parser.add_argument('-T', '--targetclassfile', type=str, help="Target class file", action='store', required=True)
    parser.add_argument('-s', '--constrainskipgrams', help="Strictly constrain skipgrams: only skipgrams present in the constraint models (-m and -M) will be considered", action='store_true', required=False)
    parser.add_argument('-m', '--constrainsourcemodel', type=str, help="Source patternmodel, used to constrain possible patterns", action='store', required=False)
    parser.add_argument('-M', '--constraintargetmodel', type=str, help="Target patternmodel, used to constrain possible patterns", action='store', required=False)
    parser.add_argument('-p', '--pts', type=float, help="Minimum probability p(t|s) for skipgram consideration (set to a high number)", default=0.75, action='store', required=False)
    parser.add_argument('-P', '--pst', type=float, help="Minimum probability p(s|t) for skipgram consideration (set to a high number)", default=0.75, action='store', required=False)
    parser.add_argument('-D', '--debug', help="Enable debug mode", action='store_true', required=False)
    args = parser.parse_args()

    if args.constrainsourcemodel:
        print("Loading source model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constrainsourcemodel = colibricore.IndexedPatternModel(args.constrainsourcemodel)
        else:
            constrainsourcemodel = colibricore.UnindexedPatternModel(args.constrainsourcemodel)
    else:
        constrainsourcemodel = None

    if args.constraintargetmodel:
        print("Loading target model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constraintargetmodel = colibricore.IndexedPatternModel(args.constraintargetmodel)
        else:
            constraintargetmodel = colibricore.UnindexedPatternModel(args.constraintargetmodel)
    else:
        constraintargetmodel = None

    alignmodel = FeaturedAlignmentModel()
    if os.path.exists(args.inputfile + '.colibri.alignmodel-keys'):
        print("Loading colibri alignment model", file=sys.stderr)
        alignmodel.load(args.inputfile)
    else:
        print("Loading class encoders", file=sys.stderr)
        sourceencoder = colibricore.ClassEncoder(args.sourceclassfile)
        targetencoder = colibricore.ClassEncoder(args.targetclassfile)
        print("Loading moses phrase table", file=sys.stderr)
        alignmodel.loadmosesphrasetable(args.inputfile, sourceencoder, targetencoder)

    if args.debug:
        debug = (colibricore.ClassDecoder(args.sourceclassfile), colibricore.ClassDecoder(args.targetclassfile))
    else:
        debug = False

    scorefilter = lambda features: features[0] >= args.pst and features[2] >= args.pts
    extractskipgrams(alignmodel, args.maxlength, args.minskiptypes, args.tmpdir, constrainsourcemodel, constraintargetmodel, args.constrainskipgrams, scorefilter, False, debug)

    if args.outputfile:
        outfile = args.outputfile
    else:
        outfile = os.path.basename(args.inputfile)
        if outfile.endswith('.gz'): outfile = outfile[:-3]
        if outfile.endswith('.bz2'): outfile = outfile[:-4]
        if outfile.endswith('.phrasetable'): outfile = outfile[:-len('.phrasetable')]
        if outfile.endswith('.phrase-table'): outfile = outfile[:-len('.phrase-table')]

    print("Saving alignment model to " + outfile, file=sys.stderr)
    alignmodel.save(outfile) #extensions will be added automatically
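#Invocation sketch (the script filename is hypothetical; the options mirror the
#argparse definitions above): extract skipgrams from a gzipped Moses phrase table,
#constraining them to patterns found in pre-built source/target pattern models.
#
#   python3 extractskipgrams.py -i phrase-table.gz -o alignmodel \
#       -S source.colibri.cls -T target.colibri.cls \
#       -m source.colibri.patternmodel -M target.colibri.patternmodel -s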
def sourcemodel(self):
    model = colibricore.UnindexedPatternModel()
    for sourcepattern in self.sourcepatterns():
        model[sourcepattern] = model[sourcepattern] + 1
    return model
def train(self, sourcefile, modelfile, **parameters):
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    if modelfile.endswith('.ibase'):
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "") #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    if self.hapaxer:
                        ngram = self.hapaxer(ngram)
                    focus = ngram[l]
                    if self.hapaxer and focus == self.hapaxer.placeholder:
                        continue
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l+1:])
                    classifier.append(leftcontext + rightcontext, focus)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()

    elif modelfile.endswith('.patternmodel'):
        self.log("Preparing to generate lexicon for Language Model")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile)

        if not os.path.exists(modelfile + '.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating pattern model")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Saving model " + modelfile)
        model.write(modelfile)
def main():
    dopretests = True
    try:
        tests = sys.argv[1]
        if tests[0] == 'x':
            dopretests = False
            tests = tests[1:]
        if '-' in tests:
            begintest = int(tests.split('-')[0])
            endtest = int(tests.split('-')[1])
        else:
            begintest = endtest = int(tests)
    except:
        print("Specify a test number or range (e.g. 6 or 1-5) as the first argument; prefix with 'x' to skip the pretests", file=sys.stderr)
        sys.exit(2)
    try:
        textfile = sys.argv[2]
    except:
        print("Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis", file=sys.stderr)
        sys.exit(2)
    try:
        tmpdir = sys.argv[3]
    except:
        tmpdir = "/tmp/"

    classfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.cls'
    datafile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.dat'
    modelfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.patternmodel'

    if not os.path.exists(textfile):
        print("File does not exist", file=sys.stderr)
        sys.exit(2)

    if dopretests:
        linecount = 0
        print("PRETEST #1 - Reading text file (Python)")
        b = begin()
        with open(textfile, 'r', encoding='utf-8') as f:
            for line in f:
                linecount += 1
        end(b)
        print("\t(Read " + str(linecount) + " lines)")

        print("PRETEST #2 - Building class encoder")
        encoder = colibricore.ClassEncoder()
        b = begin()
        encoder.build(textfile)
        end(b)

        print("PRETEST #3 - Saving class encoder")
        b = begin()
        encoder.save(classfile)
        end(b)

        print("PRETEST #4 - Class encoding corpus")
        b = begin()
        encoder.encodefile(textfile, datafile)
        end(b)

        print("PRETEST #5 - Unloading encoder")
        b = begin()
        del encoder
        gc.collect()
        end(b)

    if begintest < endtest:
        print("Running tests ", begintest, " to ", endtest)
        for testnum in range(begintest, min(endtest + 1, 10)):
            os.system("python3 " + sys.argv[0] + " x" + str(testnum) + " " + textfile + " " + tmpdir)
    else:
        testnum = begintest
        print("-------------------- " + colorf('bold', 'TEST') + " #" + str(testnum) + " ----------------------")
        if testnum == 1:
            linecount = 0
            print("Extracting and counting n-grams (up to 8-grams, threshold=1) naively (Python defaultdict + Pynlpl MultiWindower)")
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 2:
            print("Extracting and counting n-grams (up to 8-grams, threshold=1) naively with NLTK (nltk.FreqDist + nltk.util.ngrams)")
            from nltk.probability import FreqDist
            from nltk.util import ngrams
            fd = FreqDist()
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split(' ')
                    for n in range(1, 9):
                        for ngram in ngrams(tokens, n):
                            fd[ngram] += 1
            end(b)
            print("\t(Done)")
        elif testnum == 3:
            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with UnindexedPatternModel")
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8, doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model
        elif testnum == 4:
            linecount = 0
            print("Extracting and counting n-grams (up to 8-grams, threshold=2, with look-back) (Python defaultdict + Pynlpl Windower)")
            ngrams = defaultdict(int)
            b = begin()
            for n in range(1, 9):
                with open(textfile, 'r', encoding='utf-8') as f:
                    for line in f:
                        for ngram in Windower(line, n):
                            docount = True
                            if n > 1:
                                for subngram in Windower(ngram, n - 1):
                                    if not subngram in ngrams:
                                        docount = False
                                        break
                            if docount:
                                ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 5:
            linecount = 0
            print("Extracting and counting n-grams (up to 8-grams, threshold=2, without look-back) (Python defaultdict + Pynlpl MultiWindower)")
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            for ngram in list(ngrams.keys()):
                if ngrams[ngram] < 2:
                    del ngrams[ngram]
            gc.collect()
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 6:
            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=2) with UnindexedPatternModel")
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 7:
            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with UnindexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.UnindexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 8:
            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with IndexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model
        elif testnum == 9:
            print("Extracting and counting n-grams with threshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 10:
            print("Extracting and counting n-grams and skipgrams with threshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8, doskipgrams=True)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 11:
            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with OrderedUnindexedPatternModel")
            model = colibricore.OrderedUnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8, doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model
        else:
            print("No such test", file=sys.stderr)
    print()
            text = text.replace(')', '')
            text = text.replace('"', '')
            g.write(text.strip() + "\n")

print("Building class encoder", file=sys.stderr)
classencoder = colibricore.ClassEncoder()
classencoder.build(textfile)
classencoder.save(classfile)

print("Encoding corpus data", file=sys.stderr)
classencoder.encodefile(textfile, corpusfile)

print("Loading class decoder", file=sys.stderr)
classdecoder = colibricore.ClassDecoder(classfile)

anchormodel = colibricore.UnindexedPatternModel()
print("Counting anchors", file=sys.stderr)
for i, infile in enumerate(infiles):
    with open(infile, encoding="utf-8") as f:
        for l in f.readlines():
            js = json.loads(l)
            text = js["text"].lower()
            text = text.replace(',', ' ,')
            text = text.replace('.', ' .')
            text = text.replace(':', ' :')
            text = text.replace('(', '')
            text = text.replace(')', '')
            text = text.replace('"', '')
            anchors = js["annotations"]
            surface = [
def targetmodel(self):
    model = colibricore.UnindexedPatternModel()
    for targetpattern in self.targetpatterns():
        model[targetpattern] = model[targetpattern] + 1
    return model
def train(self, sourcefile, modelfile, **parameters):
    classfile = stripsourceextensions(sourcefile) + ".cls"
    corpusfile = stripsourceextensions(sourcefile) + ".nonewlines.dat"

    if not os.path.exists(classfile):
        self.log("Building class file")
        classencoder = colibricore.ClassEncoder()
        classencoder.build(sourcefile)
        classencoder.save(classfile)
    else:
        classencoder = colibricore.ClassEncoder(classfile)

    if not os.path.exists(modelfile + '.cls'):
        #make symlink to class file, using model name instead of source name
        os.symlink(classfile, modelfile + '.cls')

    if not os.path.exists(corpusfile):
        self.log("Encoding corpus")
        classencoder.encodefile(sourcefile, corpusfile, ignorenewlines=True)

    if modelfile.endswith('.1'):
        #unigram model (for recasing)
        self.log("Generating unigram frequency list")
        options = colibricore.PatternModelOptions(mintokens=self.settings['recasethreshold'], minlength=1, maxlength=1) #unigrams
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)
        self.log("Saving model")
        model.write(modelfile)
    elif modelfile.endswith('.3'):
        #trigram model
        self.log("Generating filtered trigram frequency list")
        filterpatterns = colibricore.PatternSet()
        for punc in ColibriPuncRecaseModule.PUNCTUATION:
            filterpattern = classencoder.buildpattern('{*1*} ' + punc + ' {*1*}')
            if not filterpattern.unknown():
                filterpatterns.add(filterpattern)
        self.log("(" + str(len(filterpatterns)) + " filters)")

        options = colibricore.PatternModelOptions(mintokens=self.settings['deletioncutoff'], minlength=3, maxlength=3) #trigrams
        model = colibricore.UnindexedPatternModel()
        model.train_filtered(corpusfile, options, filterpatterns)
        self.log("Saving model")
        model.write(modelfile)
    else:
        #bigram model
        self.log("Generating bigram frequency list")
        options = colibricore.PatternModelOptions(mintokens=min(self.settings['insertioncutoff'], self.settings['recasethreshold2']), minlength=2, maxlength=2) #bigrams
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)
        self.log("Saving model")
        model.write(modelfile)

    del model
print("First word: ", test(firstword.tostring(decoder), "To")) needle = encoder.buildpattern("fair Ophelia") for match in corpus.findpattern(needle): print("'fair Ophelia' found at ", match) print("Token iteration:") i = 0 for ref in corpus: i += 1 print("Total number of tokens: ", test(len(corpus), i)) print() options = colibricore.PatternModelOptions(doskipgrams_exhaustive=True) print("\n===== Building unindexed model ======\n") unindexedmodel = colibricore.UnindexedPatternModel() unindexedmodel.train("/tmp/hamlet.colibri.dat", options) print("Pattern count", test(len(unindexedmodel), 385)) print("Type count", test(unindexedmodel.types(), 186)) print("Token count", test(unindexedmodel.tokens(), 354)) unindexedmodel.printmodel(decoder) print("REPORT:") unindexedmodel.report() print("HISTOGRAM:") unindexedmodel.histogram() outputfilename = "/tmp/data.colibri.patternmodel" print("Writing to file") unindexedmodel.write(outputfilename)
def handle(self, *args, **options):
    sourceclassfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.cls')
    sourcecorpusfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.dat')
    sourcemodelfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.patternmodel')

    if not os.path.exists(sourceclassfile) or not os.path.exists(sourcecorpusfile) or options['force']:
        self.stdout.write("Encoding source corpus ...")
        sourceclassencoder = colibricore.ClassEncoder()
        sourceclassencoder.build(options['sourcecorpus'])
        sourceclassencoder.save(sourceclassfile)
        sourceclassencoder.encodefile(options['sourcecorpus'], sourcecorpusfile)
        self.stdout.write(self.style.SUCCESS('DONE'))
    else:
        self.stdout.write("Reusing previously encoded source corpus ...")

    targetclassfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.cls')
    targetcorpusfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.dat')
    targetmodelfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.patternmodel')

    if not os.path.exists(targetclassfile) or not os.path.exists(targetcorpusfile) or options['force']:
        self.stdout.write("Encoding target corpus ...")
        targetclassencoder = colibricore.ClassEncoder()
        targetclassencoder.build(options['targetcorpus'])
        targetclassencoder.save(targetclassfile)
        targetclassencoder.encodefile(options['targetcorpus'], targetcorpusfile)
        self.stdout.write(self.style.SUCCESS('DONE'))
    else:
        self.stdout.write("Reusing previously encoded target corpus ...")

    modeloptions = colibricore.PatternModelOptions(mintokens=options['freqthreshold'], maxlength=options['maxlength'])

    if not os.path.exists(sourcemodelfile) or options['force']:
        self.stdout.write('Computing pattern model of source corpus ...')
        sourcemodel = colibricore.UnindexedPatternModel()
        sourcemodel.train(sourcecorpusfile, modeloptions)
        sourcemodel.write(sourcemodelfile)
        self.stdout.write(self.style.SUCCESS('DONE'))
    else:
        sourcemodel = None
        self.stdout.write("Reusing previously computed source model ...")

    if not os.path.exists(targetmodelfile) or options['force']:
        self.stdout.write('Computing pattern model of target corpus ...')
        targetmodel = colibricore.UnindexedPatternModel()
        targetmodel.train(targetcorpusfile, modeloptions)
        targetmodel.write(targetmodelfile)
        self.stdout.write(self.style.SUCCESS('DONE'))
    else:
        targetmodel = None
        self.stdout.write("Reusing previously computed target model ...")

    alignmodelfile = os.path.join(options['tmpdir'], "alignmodel.colibri")

    #delete models to conserve memory during next step
    if sourcemodel is not None:
        del sourcemodel
        self.stdout.write(self.style.SUCCESS('Unloaded source patternmodel'))
    if targetmodel is not None:
        del targetmodel
        self.stdout.write(self.style.SUCCESS('Unloaded target patternmodel'))

    if not os.path.exists(alignmodelfile) or options['force']:
        cmd = "colibri-mosesphrasetable2alignmodel -i " + options['phrasetable'] + " -o " + alignmodelfile + " -S " + sourceclassfile + " -T " + targetclassfile + " -m " + sourcemodelfile + " -M " + targetmodelfile + " -t " + str(options['freqthreshold']) + " -l " + str(options['maxlength']) + " -p " + str(options['pts']) + " -P " + str(options['pst']) + " -j " + str(options['joinedthreshold']) + " -d " + str(options['divergencethreshold'])
        self.stdout.write("Computing alignment model: " + cmd)
        os.system(cmd)
        self.stdout.write(self.style.SUCCESS('DONE'))
    else:
        self.stdout.write(self.style.SUCCESS('Reusing previously computed alignment model'))

    self.stdout.write("Loading models")
    sourceclassdecoder = colibricore.ClassDecoder(sourceclassfile)
    targetclassdecoder = colibricore.ClassDecoder(targetclassfile)
    sourcemodel = colibricore.UnindexedPatternModel(sourcemodelfile, modeloptions)
    targetmodel = colibricore.UnindexedPatternModel(targetmodelfile, modeloptions)
    alignmodel = colibricore.PatternAlignmentModel_float(alignmodelfile, modeloptions)
    self.stdout.write(self.style.SUCCESS('DONE'))

    #collection,_ = Collection.objects.get_or_create(name=options['title'], sourcelanguage=options['sourcelang'], targetlanguage=options['targetlang'])
    #collection_id = 1

    l = len(alignmodel)

    self.stdout.write("Connecting to MongoDB server at " + settings.MONGODB_HOST + ":" + str(settings.MONGODB_PORT))
    mongoengine.connect("colloquery", host=settings.MONGODB_HOST, port=settings.MONGODB_PORT)

    self.stdout.write("Generating translation pairs (this may take a while)...")

    targetcollocations = {}
    prevsourcepattern = None
    collection = Collection(name=options['title'], sourcelanguage=options['sourcelang'], targetlanguage=options['targetlang'])
    collection.save()
    sourcecount = 0

    for i, (sourcepattern, targetpattern, scores) in enumerate(alignmodel.triples()):
        if i % 100 == 0:
            self.stdout.write(str(round(((sourcecount + 1) / l) * 100, 1)) + "% -- @" + str(sourcecount + 1) + " of " + str(l) + ": inserted " + str(i + 1) + " pairs") #(source=" + str(n_source) + ", target=" + str(n_target) + ", source-keywords=" + str(n_source_keywords) + ", target-keywords=" + str(n_target_keywords) + ")")

        if prevsourcepattern is None or sourcepattern != prevsourcepattern:
            prevsourcepattern = sourcepattern
            sourcecount += 1

            sourcefreq = sourcemodel[sourcepattern]
            text = sourcepattern.tostring(sourceclassdecoder)
            if ignorable(text):
                continue
            sourcecollocation = Collocation(collection=collection, language=options['sourcelang'], text=text, freq=sourcefreq)
            sourcecollocation.save()

        targetfreq = targetmodel[targetpattern]
        text = targetpattern.tostring(targetclassdecoder)
        if ignorable(text):
            continue
        if targetpattern in targetcollocations:
            #quicker in-memory lookup
            #targetcollocation = Collocation.objects(text=text, language=options['targetlang'], collection=collection)[0] #get from db
            targetcollocation = targetcollocations[targetpattern]
        else:
            targetcollocation = Collocation(collection=collection, language=options['targetlang'], text=text, freq=targetfreq)
            targetcollocation.save()
            #self.stdout.write(repr(targetcollocation.id))
            targetcollocations[targetpattern] = targetcollocation.id

        Translation(source=sourcecollocation, target=targetcollocation, prob=scores[0], revprob=scores[2]).save()
        Translation(source=targetcollocation, target=sourcecollocation, prob=scores[2], revprob=scores[0]).save()