Example #1
def main_alignmodel():
    parser = argparse.ArgumentParser(description="Load and view the specified alignment model", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i','--inputfile',type=str,help="Input alignment model (file prefix without .colibri.alignmodel-* extension)", action='store',required=True)
    parser.add_argument('-S','--sourceclassfile',type=str,help="Source class file", action='store',required=True)
    parser.add_argument('-T','--targetclassfile',type=str,help="Target class file", action='store',required=True)
    parser.add_argument('-p','--pts',type=float,help="Constrain by minimum probability p(t|s), assumes a moses-style score vector",default=0.0, action='store',required=False)
    parser.add_argument('-P','--pst',type=float,help="Constrain by minimum probability p(s|t), assumes a moses-style score vector", default=0.0,action='store',required=False)
    parser.add_argument('--debug',help="Enable debug", action='store_true',required=False)
    args = parser.parse_args()
    #args.storeconst, args.dataset, args.num, args.bar


    print("Loading source decoder " + args.sourceclassfile,file=sys.stderr)
    sourcedecoder = colibricore.ClassDecoder(args.sourceclassfile)
    print("Loading target decoder " + args.targetclassfile,file=sys.stderr)
    targetdecoder = colibricore.ClassDecoder(args.targetclassfile)
    print("Loading alignment model",file=sys.stderr)
    model = AlignmentModel()
    options = colibricore.PatternModelOptions(debug=args.debug)
    if options.DEBUG: print("Debug enabled",file=sys.stderr)
    sys.stderr.flush()
    model.load(args.inputfile, options)
    print("Outputting",file=sys.stderr)
    if args.pts or args.pst:
        scorefilter = lambda scores: scores[2] > args.pts and scores[0] > args.pst
    else:
        scorefilter = None
    model.output(sourcedecoder,targetdecoder,scorefilter)
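The scorefilter above relies on the Moses phrase-table convention that feature 0 of the score vector is p(s|t) and feature 2 is p(t|s). A minimal sketch of its behaviour with hypothetical values:

# Hypothetical Moses-style score vector: [p(s|t), lex(s|t), p(t|s), lex(t|s)]
scores = [0.4, 0.2, 0.9, 0.5]
pts, pst = 0.25, 0.25  # thresholds as passed via -p / -P
scorefilter = lambda scores: scores[2] > pts and scores[0] > pst
print(scorefilter(scores))  # True: both probabilities exceed their thresholds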
Example #2
    def test001_alignmodel(self):
        """Checking alignment model"""
        options = colibricore.PatternModelOptions(mintokens=1,
                                                  doreverseindex=False)

        s = colibricore.ClassEncoder("test-en-nl/test-en-train.colibri.cls")
        t = colibricore.ClassEncoder("test-en-nl/test-nl-train.colibri.cls")
        sdec = colibricore.ClassDecoder("test-en-nl/test-en-train.colibri.cls")
        tdec = colibricore.ClassDecoder("test-en-nl/test-nl-train.colibri.cls")

        print("Loading alignment model", file=sys.stderr)
        model = AlignmentModel()
        model.load("test-en-nl/test-en-nl.colibri.alignmodel", options)
        print("Loaded", file=sys.stderr)
        model.output(sdec, tdec)
        print("Testing contents", file=sys.stderr)
        self.assertTrue((s.buildpattern('a'), t.buildpattern('een')) in model)
        self.assertTrue((s.buildpattern('just'),
                         t.buildpattern('maar')) in model)
        self.assertTrue((s.buildpattern('only'),
                         t.buildpattern('maar')) in model)
        self.assertTrue((s.buildpattern('bank'),
                         t.buildpattern('oever')) in model)
        self.assertTrue((s.buildpattern('bank'),
                         t.buildpattern('bank')) in model)
        self.assertTrue((s.buildpattern('bank'),
                         t.buildpattern('sturen')) in model)
        self.assertTrue((s.buildpattern('couch'),
                         t.buildpattern('bank')) in model)
        self.assertTrue((s.buildpattern('the bank'),
                         t.buildpattern('de oever')) in model)
        self.assertTrue((s.buildpattern('the bank'),
                         t.buildpattern('de bank')) in model)
        self.assertTrue((s.buildpattern('the couch'),
                         t.buildpattern('de bank')) in model)
        self.assertTrue((s.buildpattern('I see'),
                         t.buildpattern('Ik zie')) in model)
        self.assertTrue((s.buildpattern('He'), t.buildpattern('Hij')) in model)
        self.assertTrue((s.buildpattern('sits'),
                         t.buildpattern('zit')) in model)
        self.assertTrue((s.buildpattern('on'), t.buildpattern('on')) in model)
        self.assertTrue((s.buildpattern('today'),
                         t.buildpattern('vandaag')) in model)
        self.assertEqual(len(list(model.triples())), 15)
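The triples() call in the final assertion is what makes the count possible: it enumerates every alignment in the model. A hedged sketch of listing them with the decoders from this test, assuming the same (source, target, scores) tuple layout as the alignmodel.triples() call in Example #12 below:

for source, target, scores in model.triples():
    print(source.tostring(sdec), "->", target.tostring(tdec), scores)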
Example #3
    def load(self):
        """Load the requested modules from self.models"""
        if len(self.models) != 1:
            raise Exception("Specify one and only one model to load!")

        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file:" + modelfile)
        self.log("Loading colibri model file " + modelfile)
        self.classencoder = colibricore.ClassEncoder(modelfile + '.cls')
        self.classdecoder = colibricore.ClassDecoder(modelfile + '.cls')
        self.patternmodel = colibricore.UnindexedPatternModel(modelfile)
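Once loaded, the pattern model can be queried directly with patterns built by the matching encoder. A minimal sketch, as it might appear in another method of the same class (the query word is illustrative):

# Hypothetical follow-up to load(): look up a pattern's occurrence count
pattern = self.classencoder.buildpattern("house")
if pattern in self.patternmodel:
    print(self.patternmodel[pattern])  # occurrence count of "house"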
Example #4
    def savemodel(self, model, modelfile, classfile):
        self.log("Saving model")
        classdecoder = colibricore.ClassDecoder(classfile)
        with open(modelfile, 'w', encoding='utf-8') as f:
            if self.settings['ordered']:
                items = sorted(model.items(), key=lambda x: x[1], reverse=True)
            else:
                items = model.items()
            for pattern, occurrencecount in items:
                if self.settings['reversedformat']:
                    f.write(str(occurrencecount) + self.settings['delimiter'] +
                            pattern.tostring(classdecoder) + "\n")
                else:
                    f.write(pattern.tostring(classdecoder) +
                            self.settings['delimiter'] + str(occurrencecount) + "\n")
Example #5
    def load(self):
        """Load the requested modules from self.models"""
        if not self.models:
            raise Exception("Specify one or more models to load!")

        self.log("Loading models...")
        modelfile = self.models[0]
        if not os.path.exists(modelfile):
            raise IOError("Missing expected model file: " + modelfile +
                          ". Did you forget to train the system?")

        self.log("Loading class encoder/decoder for " + modelfile + " ...")
        self.classencoder = colibricore.ClassEncoder(modelfile + '.cls')
        self.classdecoder = colibricore.ClassDecoder(modelfile + '.cls')

        self.log("Loading model files " + modelfile + ", " + modelfile +
                 ".1  and " + modelfile + ".3 ...")
        self.unigram_model = colibricore.UnindexedPatternModel(modelfile +
                                                               '.1')
        self.bigram_model = colibricore.UnindexedPatternModel(modelfile)
        self.trigram_model = colibricore.UnindexedPatternModel(modelfile +
                                                               '.3')
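After load() the three models share one class encoder, so a single pattern can be checked against any of them. A hedged sketch (the query word is illustrative):

# Hypothetical usage elsewhere in the class:
p = self.classencoder.buildpattern("the")
if p in self.unigram_model:
    print("unigram count:", self.unigram_model[p])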
Example #6
            print(red("FAILED!") + " Got " + str(a) + ", expected " + str(b),
                  file=sys.stderr)
            sys.exit(2)


try:
    import colibricore
except ImportError:
    print("Run setup.py install first!", file=sys.stderr)
    raise

with open("/tmp/colibritest", 'w') as f:
    f.write("5\tbe\n6\tTo\n7\tto\n8\tor\n9\tnot\n73477272\tblah\n")

print("Loading class decoder...")
decoder = colibricore.ClassDecoder("/tmp/colibritest")
print("Loading class encoder...")
encoder = colibricore.ClassEncoder("/tmp/colibritest")

print("Building pattern...")
ngram = encoder.buildpattern("To be or not to be")

print("Ngram: ", test(ngram.tostring(decoder), "To be or not to be"))
print("Size: ", test(len(ngram), 6))
print("Bytesize: ", test(ngram.bytesize(), 6))
print("Category==NGRAM", test(ngram.category() == colibricore.Category.NGRAM))
print("Hash: ", test(hash(ngram)))
print("Raw bytes: ", repr(bytes(ngram)))

print("Third token ", test(ngram[2].tostring(decoder), "or"))
print("Last token ", test(ngram[-1].tostring(decoder), "be"))
Example #7
    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log(
                "Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) + ".cls"
            corpusfile = stripsourceextensions(sourcefile) + ".dat"

            if not os.path.exists(classfile):

                self.log("Building class file")
                classencoder = colibricore.ClassEncoder(
                    "", self.settings['minlength'],
                    self.settings['maxlength'])  #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(
                    classfile, self.settings['minlength'],
                    self.settings['maxlength'])

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=self.settings['freqthreshold'],
                minlength=1,
                maxlength=1)  #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = []  #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log(
                        "WARNING: Unable to decode a pattern in the model! Invalid utf-8!"
                    )
                    continue  #skip the undecodable pattern instead of reusing the previous one
                for suffix in self.suffixes:
                    if pattern_s.endswith(suffix) and pattern_s not in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(
                                        otherpattern_s, False, False)
                                except KeyError:
                                    if found: found = []
                                    break
                                if not otherpattern in model:
                                    if found: found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (
                                        model.occurrencecount(pattern),
                                        model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio < self.settings['maxratio']:
                                        if found: found = []
                                        break
                                found.append(otherpattern_s)
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile, 'w', encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")

        elif modelfile == self.modelfile:
            try:
                self.confusibles
            except AttributeError:
                self.confusibles = []
                self.log("Loading confusiblefile")
                with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            self.confusibles.append(line)

            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase",
                                           "")  #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,
                               mode='rt',
                               encoding='utf-8',
                               errors='ignore') as f:
                for i, line in enumerate(f):
                    for ngram in Windower(line, n):
                        if i % 100000 == 0:
                            print(datetime.datetime.now().strftime(
                                "%Y-%m-%d %H:%M:%S") + " - " + str(i),
                                  file=sys.stderr)
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l + 1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append(
                                    leftcontext + (normalized, ) +
                                    rightcontext, suffix)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
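The confusible search pairs a unigram with every variant obtained by swapping one suffix for another. A standalone sketch of just that string manipulation, with hypothetical values for self.suffixes and the pattern (a Dutch d/t pair):

suffixes = ('d', 't')  # hypothetical self.suffixes
pattern_s = 'gebeurd'  # hypothetical unigram ending in 'd'
alternates = [pattern_s[:-len('d')] + s for s in suffixes if s != 'd']
print(alternates)  # ['gebeurt'], kept only if it also occurs in the pattern model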
Example #8
def main():
    parser = argparse.ArgumentParser(
        description="Extract skipgrams from a Moses phrasetable",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t',
                        '--minskiptypes',
                        type=int,
                        help="Minimal skip types",
                        action='store',
                        default=2,
                        required=False)
    parser.add_argument(
        '-i',
        '--inputfile',
        type=str,
        help=
        "Input alignment model (file prefix without .colibri.alignmodel-* extension) or moses phrasetable ",
        action='store',
        required=True)
    parser.add_argument(
        '-o',
        '--outputfile',
        type=str,
        help=
        "Output alignment model (file prefix without .colibri.alignmodel-* extension). Same as input if not specified!",
        default="",
        action='store',
        required=False)
    parser.add_argument('-l',
                        '--maxlength',
                        type=int,
                        help="Maximum length",
                        action='store',
                        default=8,
                        required=False)
    parser.add_argument('-W',
                        '--tmpdir',
                        type=str,
                        help="Temporary work directory",
                        action='store',
                        default="./",
                        required=False)
    parser.add_argument('-S',
                        '--sourceclassfile',
                        type=str,
                        help="Source class file",
                        action='store',
                        required=True)
    parser.add_argument('-T',
                        '--targetclassfile',
                        type=str,
                        help="Target class file",
                        action='store',
                        required=True)
    parser.add_argument(
        '-s',
        '--constrainskipgrams',
        help=
        "Strictly constrain skipgrams: only skipgrams present in the constrain models (-m and -M) will be considered",
        action='store_true',
        required=False)
    parser.add_argument(
        '-m',
        '--constrainsourcemodel',
        type=str,
        help="Source patternmodel, used to constrain possible patterns",
        action='store',
        required=False)
    parser.add_argument(
        '-M',
        '--constraintargetmodel',
        type=str,
        help="Target patternmodel, used to constrain possible patterns",
        action='store',
        required=False)
    parser.add_argument(
        '-p',
        '--pts',
        type=float,
        help=
        "Minimum probability p(t|s) for skipgram consideration (set to a high number)",
        default=0.75,
        action='store',
        required=False)
    parser.add_argument(
        '-P',
        '--pst',
        type=float,
        help=
        "Minimum probability p(s|t) for skipgram consideration (set to a high number)",
        default=0.75,
        action='store',
        required=False)
    parser.add_argument('-D',
                        '--debug',
                        help="Enable debug mode",
                        action='store_true',
                        required=False)
    args = parser.parse_args()
    #args.storeconst, args.dataset, args.num, args.bar

    if args.constrainsourcemodel:
        print("Loading source model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constrainsourcemodel = colibricore.IndexedPatternModel(
                args.constrainsourcemodel)
        else:
            constrainsourcemodel = colibricore.UnindexedPatternModel(
                args.constrainsourcemodel)
    else:
        constrainsourcemodel = None

    if args.constraintargetmodel:
        print("Loading target model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constraintargetmodel = colibricore.IndexedPatternModel(
                args.constraintargetmodel)
        else:
            constraintargetmodel = colibricore.UnindexedPatternModel(
                args.constraintargetmodel)
    else:
        constraintargetmodel = None

    alignmodel = FeaturedAlignmentModel()
    if os.path.exists(args.inputfile + '.colibri.alignmodel-keys'):
        print("Loading colibri alignment model", file=sys.stderr)
        alignmodel.load(args.inputfile)
    else:
        print("Loading class encoders", file=sys.stderr)
        sourceencoder = colibricore.ClassEncoder(args.sourceclassfile)
        targetencoder = colibricore.ClassEncoder(args.targetclassfile)
        print("Loading moses phrase table", file=sys.stderr)
        alignmodel.loadmosesphrasetable(args.inputfile, sourceencoder,
                                        targetencoder)

    if args.debug:
        debug = (colibricore.ClassDecoder(args.sourceclassfile),
                 colibricore.ClassDecoder(args.targetclassfile))
    else:
        debug = False

    scorefilter = lambda features: features[0] >= args.pst and features[2] >= args.pts
    extractskipgrams(alignmodel, args.maxlength, args.minskiptypes,
                     args.tmpdir, constrainsourcemodel, constraintargetmodel,
                     args.constrainskipgrams, scorefilter, False, debug)

    if args.outputfile:
        outfile = args.outputfile
    else:
        outfile = os.path.basename(args.inputfile)
        for ext in ('.gz', '.bz2', '.phrasetable', '.phrase-table'):
            if outfile.endswith(ext):
                outfile = outfile[:-len(ext)]
    print("Saving alignment model to " + outfile, file=sys.stderr)
    alignmodel.save(outfile)  #extensions will be added automatically
Example #9
from __future__ import print_function, unicode_literals, division, absolute_import
import colibricore

from colibrimt.alignmentmodel import FeaturedAlignmentModel

sourceencoder = colibricore.ClassEncoder()
targetencoder = colibricore.ClassEncoder()

s1 = sourceencoder.buildpattern("het grote huis", False, True)
s2 = sourceencoder.buildpattern("het paleis", False, True)
t1 = targetencoder.buildpattern("the big house", False, True)
t2 = targetencoder.buildpattern("the grand house", False, True)
t3 = targetencoder.buildpattern("the palace", False, True)

sourceencoder.save('/tmp/s.cls')
targetencoder.save('/tmp/t.cls')
sd = colibricore.ClassDecoder('/tmp/s.cls')
td = colibricore.ClassDecoder('/tmp/t.cls')

model = FeaturedAlignmentModel()
model.add(s1, t1, [1, 0, 1, 0])
model.add(s1, t2, [1, 0, 1, 0])
model.add(s2, t2, [1, 0, 1, 0])
model.add(s2, t3, [1, 0, 1, 0])
model.normalize('s-t-')

for source, target, scores in model:
    print(
        source.tostring(sd) + "\t" + target.tostring(td) + "\t" +
        " ".join([str(x) for x in scores]))
Example #10
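# (This snippet begins mid-loop: the lines below finish the same
# punctuation normalisation that is repeated further down, writing
# the cleaned text to the output file g.)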
                text = text.replace(':', ' :')
                text = text.replace('(', '')
                text = text.replace(')', '')
                text = text.replace('"', '')
                g.write(text.strip() + "\n")

print("Building class encoder", file=sys.stderr)
classencoder = colibricore.ClassEncoder()
classencoder.build(textfile)
classencoder.save(classfile)

print("Encoding corpus data", file=sys.stderr)
classencoder.encodefile(textfile, corpusfile)

print("Loading class decoder", file=sys.stderr)
classdecoder = colibricore.ClassDecoder(classfile)

anchormodel = colibricore.UnindexedPatternModel()
print("Counting anchors", file=sys.stderr)

for i, infile in enumerate(infiles):
    with open(infile, encoding="utf-8") as f:
        for line in f:
            js = json.loads(line)
            text = js["text"].lower()
            text = text.replace(',', ' ,')
            text = text.replace('.', ' .')
            text = text.replace(':', ' :')
            text = text.replace('(', '')
            text = text.replace(')', '')
            text = text.replace('"', '')
Example #11
def main_extractfeatures():
    parser = argparse.ArgumentParser(description="Extract context features and build classifier data (-C) or add to alignment model", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i','--inputfile',type=str,help="Input alignment model", action='store',required=True)
    parser.add_argument('-o','--outputdir',type=str,help="Output directory, when used with -C", action='store',required=True)
    parser.add_argument('-s','--sourcemodel',type=str,help="Source model (indexed pattern model)", action='store',required=True)
    parser.add_argument('-t','--targetmodel',type=str,help="Target model (indexed pattern model)", action='store',required=True)
    parser.add_argument('-S','--sourceclassfile',type=str,help="Source class file", action='store',required=True)
    parser.add_argument('-T','--targetclassfile',type=str,help="Target class file", action='store',required=True)
    parser.add_argument('-f','--corpusfile',type=str,help="Corpus input file for feature extraction, may be specified multiple times, but all data files must cover the exact same data, i.e. have exactly the same indices (describing different factors)", action='append',required=True)
    parser.add_argument('-c','--classfile',type=str,help="Class file for the specified data file (may be specified multiple times, once per -f)", action='append',required=True)
    parser.add_argument('-l','--leftsize',type=int,help="Left context size (may be specified multiple times, once per -f)", action='append',required=True)
    parser.add_argument('-r','--rightsize',type=int,help="Right context size (may be specified multiple times, once per -f)", action='append',required=True)
    parser.add_argument('-C','--buildclassifiers',help="Build classifier training data, one classifier expert per pattern, specify a working directory in -o", action='store_true',default=False)
    parser.add_argument('-w','--weighbyoccurrence',help="When building classifier data (-C), use exemplar weighting to reflect occurrence count, rather than duplicating instances", action='store_true',default=False)
    parser.add_argument('-W','--weighbyscore',help="When building classifier data (-C), use exemplar weighting to weigh in p(t|s) from score vector", action='store_true',default=False)
    parser.add_argument('-I','--instancethreshold',type=int,help="Classifiers (-C) having fewer than the specified number of instances will not be generated", action='store',default=2)
    parser.add_argument('-X','--experts', help="Classifier experts, one per source pattern", action="store_true", default=False)
    parser.add_argument('-M','--monolithic', help="Monolithic classifier (won't work with keywords enabled!)", action="store_true", default=False)
    parser.add_argument('-k','--keywords',help="Add global keywords in context", action='store_true',default=False)
    parser.add_argument('--km',dest='keywordmodel',type=str,help="Source-side unigram model (target-side if crosslingual is set!) for keyword extraction. Needs to be an indexed model with only unigrams.", action='store',required=False,default="")
    parser.add_argument("--kt",dest="bow_absolute_threshold", help="Keyword needs to occur at least this many times in the context (absolute number)", type=int, action='store',default=3)
    parser.add_argument("--kp",dest="bow_prob_threshold", help="minimal P(translation|keyword)", type=float, action='store',default=0.001)
    parser.add_argument("--kg",dest="bow_filter_threshold", help="Keyword needs to occur at least this many times globally in the entire corpus (absolute number)", type=int, action='store',default=20)
    #parser.add_argument("--ka",dest="compute_bow_params", help="Attempt to automatically compute --kt,--kp and --kg parameters", action='store_false',default=True)
    parser.add_argument('--crosslingual', help="Extract target-language context features instead of source-language features (for use with Colibrita). In this case, the corpus in -f and in any additional factor must be the *target* corpus", action="store_true", default=False)
    args = parser.parse_args()

    if not (len(args.corpusfile) == len(args.classfile) == len(args.leftsize) == len(args.rightsize)):
        print("Number of mentions of -f, -c, -l and -r has to match",file=sys.stderr)
        sys.exit(2)


    options = colibricore.PatternModelOptions(mintokens=1,doreverseindex=False)

    print("Loading alignment model",file=sys.stderr)
    model = AlignmentModel()
    model.load(args.inputfile,options)


    print("Loading source decoder " + args.sourceclassfile,file=sys.stderr)
    sourcedecoder = colibricore.ClassDecoder(args.sourceclassfile)
    print("Loading target decoder " + args.targetclassfile,file=sys.stderr)
    targetdecoder = colibricore.ClassDecoder(args.targetclassfile)

    print("Loading source model " , args.sourcemodel, file=sys.stderr)
    sourcemodel = colibricore.IndexedPatternModel(args.sourcemodel, options)

    print("Loading target model ", args.targetmodel, file=sys.stderr)
    targetmodel = colibricore.IndexedPatternModel(args.targetmodel, options)



    model.conf = []
    for corpusfile, classfile,left, right in zip(args.corpusfile, args.classfile, args.leftsize, args.rightsize):
        print("Loading corpus file ", corpusfile, file=sys.stderr)
        if classfile == args.sourceclassfile:
            d = sourcedecoder
        elif classfile == args.targetclassfile:
            d = targetdecoder
        else:
            d = colibricore.ClassDecoder(classfile)
        model.conf.append(Configuration(colibricore.IndexedCorpus(corpusfile), d, left, True, right))

    if args.keywords:
        if not args.keywordmodel:
            print("Supply an indexed pattern model containing unigrams to extract keywords from!",file=sys.stderr)
            sys.exit(2)
        print("Loading keyword model ", args.keywordmodel, file=sys.stderr)
        kmoptions = colibricore.PatternModelOptions(mintokens=max(args.bow_absolute_threshold,args.bow_filter_threshold),minlength=1,maxlength=1,doreverseindex=True)
        reverseindex = colibricore.IndexedCorpus(args.corpusfile[0])
        model.conf[0].keywordmodel = colibricore.IndexedPatternModel(args.keywordmodel, kmoptions, None, reverseindex)
        model.conf[0].kw_absolute_threshold = args.bow_absolute_threshold
        model.conf[0].kw_prob_threshold = args.bow_prob_threshold


    if args.buildclassifiers:
        print("Building classifiers",file=sys.stderr)
        if not args.monolithic and not args.experts:
            args.experts = True

        if not os.path.isdir(args.outputdir):
            try:
                os.mkdir(args.outputdir)
            except OSError:
                print("Unable to create directory " + args.outputdir,file=sys.stderr)
                sys.exit(2)


        f = None
        trainfile = ""
        if args.monolithic:
            f = open(args.outputdir + "/train.train",'w',encoding='utf-8')
            f2 = open(args.outputdir + "/sourcepatterns.list",'w',encoding='utf-8')

        fconf = open(args.outputdir + "/classifier.conf",'wb')

        confser = []
        for conf in model.conf:
            confser.append({'corpus': conf.corpus.filename(), 'classdecoder': conf.classdecoder.filename(), 'leftcontext': conf.leftcontext, 'focus': conf.focus,'rightcontext': conf.rightcontext})
        classifierconf = { 'weighbyoccurrence': args.weighbyoccurrence, 'weighbyscore': args.weighbyscore, 'experts': args.experts, 'monolithic': args.monolithic, 'featureconf': confser}
        pickle.dump(classifierconf, fconf)
        fconf.close()


        prevsourcepattern = None
        firsttargetpattern = None
        prevtargetpattern = None
        for sourcepattern, targetpattern, featurevectors, scorevector in model.extractcontextfeatures(sourcemodel, targetmodel, model.conf, sourcedecoder, targetdecoder, args.crosslingual, args.outputdir ):
            if prevsourcepattern is None or sourcepattern != prevsourcepattern:
                #write previous buffer to file:
                if prevsourcepattern and firsttargetpattern:
                    sourcepattern_s = prevsourcepattern.tostring(sourcedecoder)
                    if prevtargetpattern and firsttargetpattern != prevtargetpattern:
                        #only bother if there are at least two distinct target options
                        trainfile = args.outputdir + "/" + quote_plus(sourcepattern_s) + ".train"  #set before it is used in the log messages below
                        if len(buffer) < min(args.instancethreshold,2):
                            print("Omitting " + trainfile + ", only " + str(len(buffer)) + " instances",file=sys.stderr)
                        else:
                            if len(quote_plus(sourcepattern_s) + ".train") > 100:
                                print("ERROR: Filename too long, skipping: " + trainfile,file=sys.stderr)
                            else:
                                print("Writing " + trainfile + " (" + str(len(buffer)) + " instances)",file=sys.stderr)
                                if args.experts:
                                    f = open(trainfile,'w',encoding='utf-8')
                                elif args.monolithic:
                                    f2.write(sourcepattern_s+"\n")
                                for line, occurrences,pts in buffer:
                                    if args.weighbyscore:
                                        f.write(line + "\t" + str(occurrences*pts) +  "\n")
                                    elif args.weighbyoccurrence:
                                        f.write(line + "\t" + str(occurrences) +  "\n")
                                    else:
                                        for i in range(0,occurrences):
                                            f.write(line + "\n")
                                if args.experts:
                                    f.close()
                    else:
                        print("Only one target option for " + sourcepattern_s + " (" + str(len(buffer)) + " instances), no classifier needed",file=sys.stderr)

                buffer = []
                prevsourcepattern = sourcepattern
                firsttargetpattern = targetpattern

            for featurevector, count in featurevectors:
                buffer.append( (featurestostring(featurevector, model.conf, args.crosslingual, sourcedecoder) + "\t" + targetpattern.tostring(targetdecoder) , count, scorevector[2] ) ) #buffer holds (line, occurrences, pts)
                #(model.itemtostring(sourcepattern, targetpattern, featurevector,sourcedecoder, targetdecoder,False,True,False), count,scorevector[2] )  )  #buffer holds (line, occurrences, pts)

            prevsourcepattern = sourcepattern
            prevtargetpattern = targetpattern


        #write last one to file:
        if prevsourcepattern and firsttargetpattern and prevtargetpattern and firsttargetpattern != prevtargetpattern:
            #only bother if there are at least two distinct target options
            if len(buffer) < args.instancethreshold:
                print("Omitting " + trainfile + ", only " + str(len(buffer)) + " instances",file=sys.stderr)
            else:
                sourcepattern_s = prevsourcepattern.tostring(sourcedecoder)
                trainfile = args.outputdir + "/" + quote_plus(sourcepattern_s) + ".train"
                print("Writing " + trainfile + " (" + str(len(buffer)) + " instances)",file=sys.stderr)
                if args.experts:
                    f = open(trainfile,'w',encoding='utf-8')
                for line, occurrences,pts in buffer:
                    if args.weighbyscore:
                        f.write(line + "\t" + str(occurrences*pts) +  "\n")
                    elif args.weighbyoccurrence:
                        f.write(line + "\t" + str(occurrences) +  "\n")
                    else:
                        for i in range(0,occurrences):
                            f.write(line + "\n")
                if args.experts:
                    f.close()

        if args.monolithic:
            f.close()
            f2.close()
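The classifier.conf written above is a pickled dictionary, so a consumer can restore the configuration as follows (a sketch; the directory name is illustrative):

import pickle
with open("classifiers/classifier.conf", 'rb') as fconf:
    classifierconf = pickle.load(fconf)
print(classifierconf['experts'], classifierconf['monolithic'])
print(classifierconf['featureconf'])  # per-factor corpus/decoder/context settings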
Example #12
    def handle(self, *args, **options):
        sourcebase = os.path.join(options['tmpdir'], os.path.basename(options['sourcecorpus']).replace('.txt',''))
        sourceclassfile = sourcebase + '.colibri.cls'
        sourcecorpusfile = sourcebase + '.colibri.dat'
        sourcemodelfile = sourcebase + '.colibri.patternmodel'

        if not os.path.exists(sourceclassfile) or not os.path.exists(sourcecorpusfile) or options['force']:
            self.stdout.write("Encoding source corpus ...")
            sourceclassencoder = colibricore.ClassEncoder()
            sourceclassencoder.build(options['sourcecorpus'])
            sourceclassencoder.save(sourceclassfile)
            sourceclassencoder.encodefile(options['sourcecorpus'], sourcecorpusfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write("Reusing previously encoded source corpus ...")

        targetbase = os.path.join(options['tmpdir'], os.path.basename(options['targetcorpus']).replace('.txt',''))
        targetclassfile = targetbase + '.colibri.cls'
        targetcorpusfile = targetbase + '.colibri.dat'
        targetmodelfile = targetbase + '.colibri.patternmodel'

        if not os.path.exists(targetclassfile) or not os.path.exists(targetcorpusfile) or options['force']:
            self.stdout.write("Encoding target corpus ...")
            targetclassencoder = colibricore.ClassEncoder()
            targetclassencoder.build(options['targetcorpus'])
            targetclassencoder.save(targetclassfile)
            targetclassencoder.encodefile(options['targetcorpus'], targetcorpusfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write("Reusing previously encoded target corpus ...")

        modeloptions = colibricore.PatternModelOptions(mintokens=options['freqthreshold'],maxlength=options['maxlength'])

        if not os.path.exists(sourcemodelfile) or options['force']:
            self.stdout.write('Computing pattern model of source corpus ...')
            sourcemodel = colibricore.UnindexedPatternModel()
            sourcemodel.train(sourcecorpusfile, modeloptions)
            sourcemodel.write(sourcemodelfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            sourcemodel = None
            self.stdout.write("Reusing previously computed source model ...")

        if not os.path.exists(targetmodelfile) or options['force']:
            self.stdout.write('Computing pattern model of target corpus ...')
            targetmodel = colibricore.UnindexedPatternModel()
            targetmodel.train(targetcorpusfile, modeloptions)
            targetmodel.write(targetmodelfile)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            targetmodel = None
            self.stdout.write("Reusing previously computed target model ...")

        alignmodelfile = os.path.join(options['tmpdir'], "alignmodel.colibri")

        #delete models to conserve memory during next step
        if sourcemodel is not None:
            del sourcemodel
            self.stdout.write(self.style.SUCCESS('Unloaded source patternmodel'))
        if targetmodel is not None:
            del targetmodel
            self.stdout.write(self.style.SUCCESS('Unloaded target patternmodel'))

        if not os.path.exists(alignmodelfile) or options['force']:
            cmd = "colibri-mosesphrasetable2alignmodel -i " + options['phrasetable'] + " -o " + alignmodelfile + " -S " + sourceclassfile + " -T " + targetclassfile + " -m " + sourcemodelfile + " -M " + targetmodelfile + " -t " + str(options['freqthreshold']) + " -l " + str(options['maxlength']) + " -p " + str(options['pts']) + " -P " + str(options['pst']) + " -j " + str(options['joinedthreshold']) + " -d " + str(options['divergencethreshold'])
            self.stdout.write("Computing alignment model: " + cmd)
            os.system(cmd)
            self.stdout.write(self.style.SUCCESS('DONE'))
        else:
            self.stdout.write(self.style.SUCCESS('Reusing previously computed alignment model'))


        self.stdout.write("Loading models")
        sourceclassdecoder = colibricore.ClassDecoder(sourceclassfile)
        targetclassdecoder = colibricore.ClassDecoder(targetclassfile)
        sourcemodel = colibricore.UnindexedPatternModel(sourcemodelfile, modeloptions)
        targetmodel = colibricore.UnindexedPatternModel(targetmodelfile, modeloptions)
        alignmodel = colibricore.PatternAlignmentModel_float(alignmodelfile, modeloptions)
        self.stdout.write(self.style.SUCCESS('DONE'))

        #collection,_ = Collection.objects.get_or_create(name=options['title'], sourcelanguage=options['sourcelang'], targetlanguage=options['targetlang'])
        #collection_id = 1

        l = len(alignmodel)


        self.stdout.write("Connecting to MongoDB server at " + settings.MONGODB_HOST + ":" + str(settings.MONGODB_PORT) )
        mongoengine.connect("colloquery", host=settings.MONGODB_HOST, port=settings.MONGODB_PORT)

        self.stdout.write("Generating translation pairs (this may take a while)..." )

        targetcollocations = {}
        prevsourcepattern = None
        collection = Collection(name=options['title'], sourcelanguage=options['sourcelang'], targetlanguage=options['targetlang'])
        collection.save()
        sourcecount = 0

        for i, (sourcepattern, targetpattern, scores) in enumerate(alignmodel.triples()):
            if i % 100 == 0:
                self.stdout.write(str(round(((sourcecount + 1) / l) * 100,1)) + "% -- @" + str(sourcecount + 1) + " of " + str(l) + ": inserted " + str(i+1) + " pairs") #(source=" + str(n_source) + ", target=" + str(n_target) + ", source-keywords=" + str(n_source_keywords) + ", target-keywords=" + str(n_target_keywords) + ")")

            if prevsourcepattern is None or sourcepattern != prevsourcepattern:
                prevsourcepattern = sourcepattern
                sourcecount += 1

                text = sourcepattern.tostring(sourceclassdecoder)
                skipsource = ignorable(text)  #remember, so later pairs with this source are skipped too
                if skipsource:
                    continue
                sourcefreq = sourcemodel[sourcepattern]
                sourcecollocation = Collocation(collection=collection, language=options['sourcelang'], text=text, freq=sourcefreq)
                sourcecollocation.save()
            elif skipsource:
                continue



            targetfreq = targetmodel[targetpattern]
            text = targetpattern.tostring(targetclassdecoder)
            if ignorable(text):
                continue
            if targetpattern in targetcollocations: #quicker in-memory lookup
                # targetcollocation = Collocation.objects(text=text, language=options['targetlang'], collection=collection)[0] #get from db
                targetcollocation = targetcollocations[targetpattern]
            else:
                targetcollocation = Collocation(collection=collection, language=options['targetlang'], text=text, freq=targetfreq)
                targetcollocation.save()
                #self.stdout.write(repr(targetcollocation.id))
                targetcollocations[targetpattern] = targetcollocation.id

            Translation(source=sourcecollocation, target=targetcollocation, prob=scores[0], revprob=scores[2]).save()
            Translation(source=targetcollocation, target=sourcecollocation, prob=scores[2], revprob=scores[0]).save()