Example #1
    def train(self, sourcefile, modelfile, **parameters):
        self.log("Preparing to generate bigram model")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile)

        if not os.path.exists(modelfile + '.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating bigram frequency list")
        options = colibricore.PatternModelOptions(
            mintokens=self.settings['freqthreshold'], minlength=1,
            maxlength=2)  #unigrams and bigrams
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Saving model")
        model.write(modelfile)
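
For reference, a minimal sketch of reading the saved model back, using only colibricore calls that appear in these examples; the path is hypothetical:

import colibricore

modelfile = "corpus.colibri.patternmodel"  #hypothetical path; train() wrote this file and the .cls symlink next to it
model = colibricore.UnindexedPatternModel(modelfile)
classdecoder = colibricore.ClassDecoder(modelfile + '.cls')
for pattern in model:  #iterate over the stored unigrams and bigrams
    print(pattern.tostring(classdecoder), model.occurrencecount(pattern))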
Example #2
    def load(self):
        """Load the requested modules from self.models"""
        self.errorlist = {}

        if not self.models:
            raise Exception("Specify one or more models to load!")

        if self.hapaxer:
            self.log("Loading hapaxer...")
            self.hapaxer.load()

        self.log("Loading models...")
        if len(self.models) == 2:
            modelfile, lexiconfile = self.models
        else:
            modelfile = self.models[0]
            lexiconfile = None
        if not os.path.exists(modelfile):
            raise IOError("Missing expected timbl model file: " + modelfile + ". Did you forget to train the system?")
        if lexiconfile and not os.path.exists(lexiconfile):
            raise IOError("Missing expected lexicon model file: " + lexiconfile + ". Did you forget to train the system?")
        self.log("Loading model file " + modelfile + "...")
        fileprefix = modelfile.replace(".ibase","") #has been verified earlier
        self.classifier = TimblClassifier(fileprefix, self.gettimbloptions(), threading=True, debug=self.debug)
        self.classifier.load()

        if lexiconfile:
            self.log("Loading colibri model file for lexicon " + lexiconfile)
            self.classencoder = colibricore.ClassEncoder(lexiconfile + '.cls')
            self.lexicon = colibricore.UnindexedPatternModel(lexiconfile)
        else:
            self.lexicon = None
Example #3
    def load(self):
        """Load the requested modules from self.models"""
        if len(self.models) != 1:
            raise Exception("Specify one and only one model to load!")

        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file:" + modelfile)
        self.log("Loading colibri model file " + modelfile)
        self.classencoder = colibricore.ClassEncoder(modelfile + '.cls')
        self.classdecoder = colibricore.ClassDecoder(modelfile + '.cls')
        self.patternmodel = colibricore.UnindexedPatternModel(modelfile)
Example #4
    def load(self):
        """Load the requested modules from self.models"""
        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile +
                          ". Did you forget to train the system?")

        self.log("Loading class encoder/decoder for " + modelfile + " ...")
        self.classencoder = colibricore.ClassEncoder(modelfile + '.cls')
        self.classdecoder = colibricore.ClassDecoder(modelfile + '.cls')

        self.log("Loading model files " + modelfile + ", " + modelfile +
                 ".1  and " + modelfile + ".3 ...")
        self.unigram_model = colibricore.UnindexedPatternModel(modelfile +
                                                               '.1')
        self.bigram_model = colibricore.UnindexedPatternModel(modelfile)
        self.trigram_model = colibricore.UnindexedPatternModel(modelfile +
                                                               '.3')
Example #5
def buildpatternmodel(testfiles):
    print("Loading test data...", file=sys.stderr)

    with open('inputmodel.txt', 'w', encoding='utf-8') as f:
        for testfile in testfiles:
            f.write(loadtext(testfile) + "\n")

    print("Building pattern model...", file=sys.stderr)

    classencoder = colibricore.ClassEncoder()
    classencoder.build('inputmodel.txt')
    classencoder.save('inputmodel.colibri.cls')
    classencoder.encodefile('inputmodel.txt', 'inputmodel.colibri.dat')

    options = colibricore.PatternModelOptions(mintokens=1, maxlength=3)
    patternmodel = colibricore.UnindexedPatternModel()
    patternmodel.train('inputmodel.colibri.dat', options)

    return patternmodel, classencoder
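
A minimal usage sketch for the returned model and encoder, assuming a hypothetical query phrase; buildpattern with the strict False, False flags raises KeyError for unseen words, as in the suffix-confusible example below:

patternmodel, classencoder = buildpatternmodel(testfiles)
try:
    querypattern = classencoder.buildpattern("the quick fox", False, False)  #hypothetical phrase
except KeyError:
    querypattern = None  #phrase contains words unseen by the encoder
if querypattern is not None and querypattern in patternmodel:
    print("occurrences:", patternmodel.occurrencecount(querypattern))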
Example #6
    def train(self, sourcefile, modelfile, **parameters):
        self.log("Preparing to generate lexicon")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder(
                "", self.settings['minlength'],
                self.settings['maxlength'])  #character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile,
                                                    self.settings['minlength'],
                                                    self.settings['maxlength'])

        if not os.path.exists(modelfile + '.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(
            mintokens=self.settings['freqthreshold'], minlength=1,
            maxlength=1)  #unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.savemodel(model, modelfile, classfile)  #in a separate function so it can be overloaded
Example #7
    def train(self):
        if self.sourcefile and not os.path.exists(self.modelfile):
            classfile = stripsourceextensions(self.sourcefile) + ".cls"
            corpusfile = stripsourceextensions(self.sourcefile) + ".dat"

            if not os.path.exists(classfile):
                self.classencoder = colibricore.ClassEncoder("", self.minlength, self.maxlength)
                self.classencoder.build(self.sourcefile)
                self.classencoder.save(classfile)
            else:
                self.classencoder = colibricore.ClassEncoder(classfile, self.minlength, self.maxlength)

            if not os.path.exists(self.modelfile + '.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, self.modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.classencoder.encodefile(self.sourcefile, corpusfile)

            options = colibricore.PatternModelOptions(mintokens=self.threshold, minlength=1, maxlength=1)
            self.lexicon = colibricore.UnindexedPatternModel()
            self.lexicon.train(corpusfile, options)
            self.lexicon.write(self.modelfile)
Example #8
    def load(self):
        if not os.path.exists(self.modelfile):
            raise IOError("Missing expected model file for hapaxer: " + self.modelfile)
        self.classencoder = colibricore.ClassEncoder(self.modelfile + '.cls')
        #self.classdecoder = colibricore.ClassDecoder(self.modelfile + '.cls')
        self.lexicon = colibricore.UnindexedPatternModel(self.modelfile)
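
On top of this, a hapax check could look roughly as follows; is_hapax is a hypothetical helper sketched from how the lexicon and encoder are used in these examples, not part of the original class:

    def is_hapax(self, word):  #hypothetical helper, not in the original class
        """Sketch: treat a word as a hapax if it cannot be encoded or is absent from the lexicon."""
        try:
            pattern = self.classencoder.buildpattern(word, False, False)
        except KeyError:
            return True  #word contains classes unseen at training time
        return pattern not in self.lexicon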
Example #9
    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log(
                "Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) + ".cls"
            corpusfile = stripsourceextensions(sourcefile) + ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder(
                    "", self.settings['minlength'],
                    self.settings['maxlength'])  #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(
                    classfile, self.settings['minlength'],
                    self.settings['maxlength'])

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=self.settings['freqthreshold'],
                minlength=1,
                maxlength=1)  #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = []  #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log("WARNING: Unable to decode a pattern in the model! Invalid UTF-8!")
                    continue  #skip undecodable patterns instead of reusing a stale pattern_s
                for suffix in self.suffixes:
                    if pattern_s.endswith(suffix) and pattern_s not in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(
                                        otherpattern_s, False, False)
                                except KeyError:
                                    if found: found = []
                                    break
                                if otherpattern not in model:
                                    if found: found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (
                                        model.occurrencecount(pattern),
                                        model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio < self.settings['maxratio']:
                                        if found: found = []
                                        break
                                found.append(otherpattern_s)
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile, 'w', encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")

        elif modelfile == self.modelfile:
            try:
                self.confusibles
            except AttributeError:
                self.confusibles = []
                self.log("Loading confusiblefile")
                with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            self.confusibles.append(line)

            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,
                               mode='rt',
                               encoding='utf-8',
                               errors='ignore') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0:
                        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                    for ngram in Windower(line, n):
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l + 1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append(leftcontext + (normalized,) + rightcontext, suffix)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
Example #10
def main():
    parser = argparse.ArgumentParser(
        description="Extract skipgrams from a Moses phrasetable",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t',
                        '--minskiptypes',
                        type=int,
                        help="Minimal skip types",
                        action='store',
                        default=2,
                        required=False)
    parser.add_argument(
        '-i',
        '--inputfile',
        type=str,
        help=
        "Input alignment model (file prefix without .colibri.alignmodel-* extension) or moses phrasetable ",
        action='store',
        required=True)
    parser.add_argument(
        '-o',
        '--outputfile',
        type=str,
        help=
        "Output alignment model (file prefix without .colibri.alignmodel-* extension). Same as input if not specified!",
        default="",
        action='store',
        required=False)
    parser.add_argument('-l',
                        '--maxlength',
                        type=int,
                        help="Maximum length",
                        action='store',
                        default=8,
                        required=False)
    parser.add_argument('-W',
                        '--tmpdir',
                        type=str,
                        help="Temporary work directory",
                        action='store',
                        default="./",
                        required=False)
    parser.add_argument('-S',
                        '--sourceclassfile',
                        type=str,
                        help="Source class file",
                        action='store',
                        required=True)
    parser.add_argument('-T',
                        '--targetclassfile',
                        type=str,
                        help="Target class file",
                        action='store',
                        required=True)
    parser.add_argument(
        '-s',
        '--constrainskipgrams',
        help=
        "Strictly constrain skipgrams: only skipgrams present in the constrain models (-m and -M) will be considered",
        action='store_true',
        required=False)
    parser.add_argument(
        '-m',
        '--constrainsourcemodel',
        type=str,
        help="Source patternmodel, used to constrain possible patterns",
        action='store',
        required=False)
    parser.add_argument(
        '-M',
        '--constraintargetmodel',
        type=str,
        help="Target patternmodel, used to constrain possible patterns",
        action='store',
        required=False)
    parser.add_argument(
        '-p',
        '--pts',
        type=float,
        help=
        "Minimum probability p(t|s) for skipgram consideration (set to a high number)",
        default=0.75,
        action='store',
        required=False)
    parser.add_argument(
        '-P',
        '--pst',
        type=float,
        help=
        "Minimum probability p(s|t) for skipgram consideration (set to a high number)",
        default=0.75,
        action='store',
        required=False)
    parser.add_argument('-D',
                        '--debug',
                        help="Enable debug mode",
                        action='store_true',
                        required=False)
    args = parser.parse_args()

    if args.constrainsourcemodel:
        print("Loading source model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constrainsourcemodel = colibricore.IndexedPatternModel(
                args.constrainsourcemodel)
        else:
            constrainsourcemodel = colibricore.UnindexedPatternModel(
                args.constrainsourcemodel)
    else:
        constrainsourcemodel = None

    if args.constraintargetmodel:
        print("Loading target model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constraintargetmodel = colibricore.IndexedPatternModel(
                args.constraintargetmodel)
        else:
            constraintargetmodel = colibricore.UnindexedPatternModel(
                args.constraintargetmodel)
    else:
        constraintargetmodel = None

    alignmodel = FeaturedAlignmentModel()
    if os.path.exists(args.inputfile + '.colibri.alignmodel-keys'):
        print("Loading colibri alignment model", file=sys.stderr)
        alignmodel.load(args.inputfile)
    else:
        print("Loading class encoders", file=sys.stderr)
        sourceencoder = colibricore.ClassEncoder(args.sourceclassfile)
        targetencoder = colibricore.ClassEncoder(args.targetclassfile)
        print("Loading moses phrase table", file=sys.stderr)
        alignmodel.loadmosesphrasetable(args.inputfile, sourceencoder,
                                        targetencoder)

    if args.debug:
        debug = (colibricore.ClassDecoder(args.sourceclassfile),
                 colibricore.ClassDecoder(args.targetclassfile))
    else:
        debug = False

    scorefilter = lambda features: features[0] >= args.pst and features[2] >= args.pts
    extractskipgrams(alignmodel, args.maxlength, args.minskiptypes,
                     args.tmpdir, constrainsourcemodel, constraintargetmodel,
                     args.constrainskipgrams, scorefilter, False, debug)

    if args.outputfile:
        outfile = args.outputfile
    else:
        outfile = os.path.basename(args.inputfile)
        if outfile[-3:] == '.gz': outfile = outfile[:-3]
        if outfile[-4:] == '.bz2': outfile = outfile[:-4]
        if outfile[-11:] == '.phrasetable': outfile = outfile[:-11]
        if outfile[-12:] == '.phrase-table': outfile = outfile[:-12]
    print("Saving alignment model to " + outfile, file=sys.stderr)
    alignmodel.save(outfile)  #extensions will be added automatically
Example #11
    def sourcemodel(self):
        model = colibricore.UnindexedPatternModel()
        for sourcepattern in self.sourcepatterns():
            model[sourcepattern] = model[sourcepattern] + 1
        return model
Example #12
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile, mode='rt', encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0:
                        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[l]
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        classifier.append(leftcontext + rightcontext, focus)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile+'.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating pattern model")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)
Example #13
def main():
    dopretests = True
    try:
        tests = sys.argv[1]
        if tests[0] == 'x':
            dopretests = False
            tests = tests[1:]
        if '-' in tests:
            begintest = int(tests.split('-')[0])
            endtest = int(tests.split('-')[1])
        else:
            begintest = endtest = int(tests)
    except (IndexError, ValueError):
        print(
            "Specify which test(s) to run: a single number or a range like 1-10, optionally prefixed with x to skip the pretests",
            file=sys.stderr)
        sys.exit(2)
    try:
        textfile = sys.argv[2]
    except IndexError:
        print(
            "Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis",
            file=sys.stderr)
        sys.exit(2)

    try:
        tmpdir = sys.argv[3]
    except IndexError:
        tmpdir = "/tmp/"

    classfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.cls'
    datafile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.dat'
    modelfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.patternmodel'

    if not os.path.exists(textfile):
        print("File does not exist", file=sys.stderr)
        sys.exit(2)

    if dopretests:
        linecount = 0
        print("PRETEST #1 - Reading text file (Python)")
        b = begin()
        with open(textfile, 'r', encoding='utf-8') as f:
            for line in f:
                linecount += 1
        end(b)
        print("\t(Read " + str(linecount) + " lines)")

        print("PRETEST #2 - Building class encoder")
        encoder = colibricore.ClassEncoder()
        b = begin()
        encoder.build(textfile)
        end(b)

        print("PRETEST #3 - Saving class encoder")
        b = begin()
        encoder.save(classfile)
        end(b)

        print("PRETEST #4 - Class encoding corpus")
        b = begin()
        encoder.encodefile(textfile, datafile)
        end(b)

        print("PRETEST #5 - Unloading encoder")
        b = begin()
        del encoder
        gc.collect()
        end(b)

    if begintest < endtest:
        print("Running tests ", begintest, " to ", endtest)
        for testnum in range(begintest, min(endtest + 1, 12)):  #tests are numbered 1 through 11
            os.system("python3 " + sys.argv[0] + " x" + str(testnum) + " " +
                      textfile + " " + tmpdir)

    else:
        testnum = begintest
        print("-------------------- " + colorf('bold', 'TEST') + " #" +
              str(testnum) + " ----------------------")
        if testnum == 1:

            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively (Python defaultdict + Pynlpl MultiWindower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")

        elif testnum == 2:
            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively with NLTK (nltk.FreqDist + nltk.util.ngrams)"
            )

            from nltk.probability import FreqDist
            from nltk.util import ngrams

            fd = FreqDist()
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split(' ')
                    for n in range(1, 9):
                        for ngram in ngrams(tokens, n):
                            fd[ngram] += 1
            end(b)
            print("\t(Done)")
        elif testnum == 3:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model

        elif testnum == 4:
            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, with look-back)  (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            for n in range(1, 9):
                with open(textfile, 'r', encoding='utf-8') as f:
                    for line in f:
                        for ngram in Windower(line, n):
                            docount = True
                            if n > 1:
                                for subngram in Windower(ngram, n - 1):
                                    if not subngram in ngrams:
                                        docount = False
                                        break
                            if docount:
                                ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 5:
            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, without look-back)  (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            for ngram in list(ngrams.keys()):
                if ngrams[ngram] < 2: del ngrams[ngram]
            gc.collect()
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 6:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=2) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 7:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with UnindexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.UnindexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 8:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

            del model

        elif testnum == 9:
            print(
                "Extracting and counting n-grams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 10:

            print(
                "Extracting and counting n-grams and skipgrams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2,
                                                      maxlength=8,
                                                      doskipgrams=True)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 11:
            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with OrderedUnindexedPatternModel"
            )
            model = colibricore.OrderedUnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model

        else:
            print("No such test", file=sys.stderr)
        print()
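
The script also calls begin(), end(), savemodel() and colorf() helpers defined elsewhere; plausible minimal implementations of the timing and saving helpers (assumptions, not the original code) would be:

import time

def begin():
    #start a wall-clock timer
    return time.time()

def end(b):
    #report elapsed time since begin()
    print("\t(Took " + str(round(time.time() - b, 2)) + "s)")

def savemodel(model, modelfile):
    #write the trained model to disk, timing the write as well
    print("Saving model")
    b = begin()
    model.write(modelfile)
    end(b)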
Example #14
                text = text.replace(')', '')
                text = text.replace('"', '')
                g.write(text.strip() + "\n")

print("Building class encoder", file=sys.stderr)
classencoder = colibricore.ClassEncoder()
classencoder.build(textfile)
classencoder.save(classfile)

print("Encoding corpus data", file=sys.stderr)
classencoder.encodefile(textfile, corpusfile)

print("Loading class decoder", file=sys.stderr)
classdecoder = colibricore.ClassDecoder(classfile)

anchormodel = colibricore.UnindexedPatternModel()
print("Counting anchors", file=sys.stderr)

for i, infile in enumerate(infiles):
    with open(infile, encoding="utf-8") as f:
        for l in f.readlines():
            js = json.loads(l)
            text = js["text"].lower()
            text = text.replace(',', ' ,')
            text = text.replace('.', ' .')
            text = text.replace(':', ' :')
            text = text.replace('(', '')
            text = text.replace(')', '')
            text = text.replace('"', '')
            anchors = js["annotations"]
            surface = [
Example #15
    def targetmodel(self):
        model = colibricore.UnindexedPatternModel()
        for targetpattern in self.targetpatterns():
            model[targetpattern] = model[targetpattern] + 1
        return model
Example #16
    def train(self, sourcefile, modelfile, **parameters):
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".nonewlines.dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile)

        if not os.path.exists(modelfile + '.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile,
                                    corpusfile,
                                    ignorenewlines=True)

        if modelfile.endswith('.1'):
            #unigram model (for recasing)
            self.log("Generating unigram frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=self.settings['recasethreshold'],
                minlength=1,
                maxlength=1)  #unigrams
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model")
            model.write(modelfile)
        elif modelfile.endswith('.3'):
            #trigram model
            self.log("Generating filtered trigram frequency list")
            filterpatterns = colibricore.PatternSet()
            for punc in ColibriPuncRecaseModule.PUNCTUATION:
                filterpattern = classencoder.buildpattern('{*1*} ' + punc +
                                                          ' {*1*}')
                if not filterpattern.unknown():
                    filterpatterns.add(filterpattern)
            self.log("(" + str(len(filterpatterns)) + " filters)")

            options = colibricore.PatternModelOptions(
                mintokens=self.settings['deletioncutoff'],
                minlength=3,
                maxlength=3)  #trigrams
            model = colibricore.UnindexedPatternModel()
            model.train_filtered(corpusfile, options, filterpatterns)

            self.log("Saving model")
            model.write(modelfile)
        else:
            #bigram model
            self.log("Generating bigram frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=min(self.settings['insertioncutoff'],
                              self.settings['recasethreshold2']),
                minlength=2,
                maxlength=2)  #bigrams
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model")
            model.write(modelfile)
            del model
Example #17
print("First word: ", test(firstword.tostring(decoder), "To"))
needle = encoder.buildpattern("fair Ophelia")
for match in corpus.findpattern(needle):
    print("'fair Ophelia' found at ", match)
print("Token iteration:")
i = 0
for ref in corpus:
    i += 1
print("Total number of tokens: ", test(len(corpus), i))

print()

options = colibricore.PatternModelOptions(doskipgrams_exhaustive=True)

print("\n===== Building unindexed model ======\n")
unindexedmodel = colibricore.UnindexedPatternModel()
unindexedmodel.train("/tmp/hamlet.colibri.dat", options)
print("Pattern count", test(len(unindexedmodel), 385))
print("Type count", test(unindexedmodel.types(), 186))
print("Token count", test(unindexedmodel.tokens(), 354))

unindexedmodel.printmodel(decoder)
print("REPORT:")
unindexedmodel.report()
print("HISTOGRAM:")
unindexedmodel.histogram()

outputfilename = "/tmp/data.colibri.patternmodel"
print("Writing to file")
unindexedmodel.write(outputfilename)
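
As a sanity check, a small sketch that reloads the file just written and verifies the pattern count survives the round trip, reusing the test() helper from this example:

reloadedmodel = colibricore.UnindexedPatternModel(outputfilename)
print("Pattern count after reload", test(len(reloadedmodel), 385))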
Example #18
    def handle(self, *args, **options):
        sourceclassfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.cls')
        sourcecorpusfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.dat')
        sourcemodelfile = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt','') + '.colibri.patternmodel')

        if not os.path.exists(sourceclassfile) or not os.path.exists(sourcecorpusfile) or options['force']:
            self.stdout.write("Encoding source corpus ...")
            sourceclassencoder = colibricore.ClassEncoder()
            sourceclassencoder.build(options['sourcecorpus'])
            sourceclassencoder.save(sourceclassfile)
            sourceclassencoder.encodefile(options['sourcecorpus'], sourcecorpusfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write("Reusing previously encoded source corpus ...")

        targetclassfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.cls')
        targetcorpusfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.dat')
        targetmodelfile = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt','') + '.colibri.patternmodel')

        if not os.path.exists(targetclassfile) or not os.path.exists(targetcorpusfile) or options['force']:
            self.stdout.write("Encoding target corpus ...")
            targetclassencoder = colibricore.ClassEncoder()
            targetclassencoder.build(options['targetcorpus'])
            targetclassencoder.save(targetclassfile)
            targetclassencoder.encodefile(options['targetcorpus'], targetcorpusfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write("Reusing previously encoded target corpus ...")

        modeloptions = colibricore.PatternModelOptions(mintokens=options['freqthreshold'],maxlength=options['maxlength'])

        if not os.path.exists(sourcemodelfile) or options['force']:
            self.stdout.write('Computing pattern model of source corpus ...')
            sourcemodel = colibricore.UnindexedPatternModel()
            sourcemodel.train(sourcecorpusfile, modeloptions)
            sourcemodel.write(sourcemodelfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            sourcemodel = None
            self.stdout.write("Reusing previously computed source model ...")

        if not os.path.exists(targetmodelfile) or options['force']:
            self.stdout.write('Computing pattern model of target corpus ...')
            targetmodel = colibricore.UnindexedPatternModel()
            targetmodel.train(targetcorpusfile, modeloptions)
            targetmodel.write(targetmodelfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            targetmodel = None
            self.stdout.write("Reusing previously computed target model ...")

        alignmodelfile = os.path.join(options['tmpdir'], "alignmodel.colibri")

        #delete models to conserve memory during next step
        if sourcemodel is not None:
            del sourcemodel
            self.stdout.write(self.style.SUCCESS('Unloaded source patternmodel'))
        if targetmodel is not None:
            del targetmodel
            self.stdout.write(self.style.SUCCESS('Unloaded target patternmodel'))

        if not os.path.exists(alignmodelfile) or options['force']:
            cmd = "colibri-mosesphrasetable2alignmodel -i " + options['phrasetable'] + " -o " + alignmodelfile + " -S " + sourceclassfile + " -T " + targetclassfile + " -m " + sourcemodelfile + " -M " + targetmodelfile + " -t " + str(options['freqthreshold']) + " -l " + str(options['maxlength']) + " -p " + str(options['pts']) + " -P " + str(options['pst']) + " -j " + str(options['joinedthreshold']) + " -d " + str(options['divergencethreshold'])
            self.stdout.write("Computing alignment model: " + cmd)
            os.system(cmd)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write(self.style.SUCCESS('Reusing previously computed alignment model'))


        self.stdout.write("Loading models")
        sourceclassdecoder = colibricore.ClassDecoder(sourceclassfile)
        targetclassdecoder = colibricore.ClassDecoder(targetclassfile)
        sourcemodel = colibricore.UnindexedPatternModel(sourcemodelfile, modeloptions)
        targetmodel = colibricore.UnindexedPatternModel(targetmodelfile, modeloptions)
        alignmodel = colibricore.PatternAlignmentModel_float(alignmodelfile, modeloptions)
        self.stdout.write(self.style.SUCCESS('DONE'))

        #collection,_ = Collection.objects.get_or_create(name=options['title'], sourcelanguage=options['sourcelang'], targetlanguage=options['targetlang'])
        #collection_id = 1

        l = len(alignmodel)


        self.stdout.write("Connecting to MongoDB server at " + settings.MONGODB_HOST + ":" + str(settings.MONGODB_PORT) )
        mongoengine.connect("colloquery", host=settings.MONGODB_HOST, port=settings.MONGODB_PORT)

        self.stdout.write("Generating translation pairs (this may take a while)..." )

        targetcollocations = {}
        prevsourcepattern = None
        collection = Collection(name=options['title'], sourcelanguage=options['sourcelang'], targetlanguage=options['targetlang'])
        collection.save()
        sourcecount = 0

        for i, (sourcepattern, targetpattern, scores) in enumerate(alignmodel.triples()):
            if i % 100 == 0:
                self.stdout.write(str(round(((sourcecount + 1) / l) * 100,1)) + "% -- @" + str(sourcecount + 1) + " of " + str(l) + ": inserted " + str(i+1) + " pairs") #(source=" + str(n_source) + ", target=" + str(n_target) + ", source-keywords=" + str(n_source_keywords) + ", target-keywords=" + str(n_target_keywords) + ")")

            if prevsourcepattern is None or sourcepattern != prevsourcepattern:
                prevsourcepattern = sourcepattern
                sourcecount += 1

                sourcefreq = sourcemodel[sourcepattern]
                text = sourcepattern.tostring(sourceclassdecoder)
                if ignorable(text):
                    continue
                sourcecollocation = Collocation(collection=collection, language=options['sourcelang'], text=text, freq=sourcefreq)
                sourcecollocation.save()

            targetfreq = targetmodel[targetpattern]
            text = targetpattern.tostring(targetclassdecoder)
            if ignorable(text):
                continue
            if targetpattern in targetcollocations: #quicker in-memory lookup
                # targetcollocation = Collocation.objects(text=text, language=options['targetlang'], collection=collection)[0] #get from db
                targetcollocation = targetcollocations[targetpattern]
            else:
                targetcollocation = Collocation(collection=collection, language=options['targetlang'], text=text, freq=targetfreq)
                targetcollocation.save()
                #self.stdout.write(repr(targetcollocation.id))
                targetcollocations[targetpattern] = targetcollocation.id

            Translation(source=sourcecollocation, target=targetcollocation, prob=scores[0], revprob=scores[2]).save()
            Translation(source=targetcollocation, target=sourcecollocation, prob=scores[2], revprob=scores[0]).save()