Beispiel #1
0
from __future__ import print_function, unicode_literals, division, absolute_import
import colibricore

from colibrimt.alignmentmodel import FeaturedAlignmentModel

# Build one class encoder per side and encode the example phrases with them.
sourceencoder = colibricore.ClassEncoder()
targetencoder = colibricore.ClassEncoder()

# NOTE(review): the (False, True) flags are presumably allowunknown=False,
# autoaddunknown=True, so unseen words get added to the encoder -- confirm
# against the colibricore ClassEncoder.buildpattern documentation.
s1, s2 = (
    sourceencoder.buildpattern(phrase, False, True)
    for phrase in ("het grote huis", "het paleis")
)
t1, t2, t3 = (
    targetencoder.buildpattern(phrase, False, True)
    for phrase in ("the big house", "the grand house", "the palace")
)

# Write both encoders to disk, then load matching decoders from those files
# so the patterns can be turned back into text further down.
sourceencoder.save('/tmp/s.cls')
targetencoder.save('/tmp/t.cls')
sd = colibricore.ClassDecoder('/tmp/s.cls')
td = colibricore.ClassDecoder('/tmp/t.cls')

# Fill the model: every (source, target) pair gets its own fresh score list.
model = FeaturedAlignmentModel()
for src, tgt in ((s1, t1), (s1, t2), (s2, t2), (s2, t3)):
    model.add(src, tgt, [1, 0, 1, 0])
# NOTE(review): 's-t-' presumably selects how each score column is
# normalised -- confirm against FeaturedAlignmentModel.normalize.
model.normalize('s-t-')

# Emit one line per alignment: source, target and space-separated scores,
# with the three fields separated by tabs.
for source, target, scores in model:
    fields = (
        source.tostring(sd),
        target.tostring(td),
        " ".join(str(x) for x in scores),
    )
    print("\t".join(fields))

Beispiel #2
0
from __future__ import print_function, unicode_literals, division, absolute_import
import colibricore

from colibrimt.alignmentmodel import FeaturedAlignmentModel

# Build one class encoder per side and encode the example phrases with them.
sourceencoder = colibricore.ClassEncoder()
targetencoder = colibricore.ClassEncoder()

# NOTE(review): the (False, True) flags are presumably allowunknown=False,
# autoaddunknown=True, so unseen words get added to the encoder -- confirm
# against the colibricore ClassEncoder.buildpattern documentation.
s1, s2 = (
    sourceencoder.buildpattern(phrase, False, True)
    for phrase in ("het grote huis", "het paleis")
)
t1, t2, t3 = (
    targetencoder.buildpattern(phrase, False, True)
    for phrase in ("the big house", "the grand house", "the palace")
)

# Write both encoders to disk, then load matching decoders from those files
# so the patterns can be turned back into text further down.
sourceencoder.save('/tmp/s.cls')
targetencoder.save('/tmp/t.cls')
sd = colibricore.ClassDecoder('/tmp/s.cls')
td = colibricore.ClassDecoder('/tmp/t.cls')

# Fill the model: every (source, target) pair gets its own fresh score list.
model = FeaturedAlignmentModel()
for src, tgt in ((s1, t1), (s1, t2), (s2, t2), (s2, t3)):
    model.add(src, tgt, [1, 0, 1, 0])
# NOTE(review): 's-t-' presumably selects how each score column is
# normalised -- confirm against FeaturedAlignmentModel.normalize.
model.normalize('s-t-')

# Emit one line per alignment: source, target and space-separated scores,
# with the three fields separated by tabs.
for source, target, scores in model:
    fields = (
        source.tostring(sd),
        target.tostring(td),
        " ".join(str(x) for x in scores),
    )
    print("\t".join(fields))
Beispiel #3
0
def _default_output_prefix(inputfile):
    """Derive the output model prefix from the input file name.

    Takes the base name of ``inputfile`` and strips, in this order, a trailing
    ``.gz``, ``.bz2``, ``.phrasetable`` and ``.phrase-table`` suffix, so that
    e.g. ``/data/model.phrase-table.gz`` yields ``model``.
    """
    prefix = os.path.basename(inputfile)
    # Measure each suffix with len() instead of a hand-counted slice width:
    # the earlier hard-coded widths (11 and 12) were one character short of
    # '.phrasetable' and '.phrase-table', so those suffixes were never removed.
    for suffix in ('.gz', '.bz2', '.phrasetable', '.phrase-table'):
        if prefix.endswith(suffix):
            prefix = prefix[:-len(suffix)]
    return prefix


def main():
    """Command-line entry point for skipgram extraction.

    Parses the command line, optionally loads source/target pattern models
    used as constraints, loads the alignment model (a colibri alignment model
    when ``<inputfile>.colibri.alignmodel-keys`` exists, otherwise a Moses
    phrase table), runs ``extractskipgrams`` on it and saves the result.

    When no output file is given, the output prefix is the base name of the
    input file with compression and phrase-table suffixes removed.
    """
    parser = argparse.ArgumentParser(
        description="Extract skipgrams from a Moses phrasetable",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-t', '--minskiptypes', type=int,
        help="Minimal skip types",
        action='store', default=2, required=False)
    parser.add_argument(
        '-i', '--inputfile', type=str,
        help="Input alignment model (file prefix without .colibri.alignmodel-* extension) or moses phrasetable ",
        action='store', required=True)
    parser.add_argument(
        '-o', '--outputfile', type=str,
        help="Output alignment model (file prefix without .colibri.alignmodel-* extension). Same as input if not specified!",
        default="", action='store', required=False)
    parser.add_argument(
        '-l', '--maxlength', type=int,
        help="Maximum length",
        action='store', default=8, required=False)
    parser.add_argument(
        '-W', '--tmpdir', type=str,
        help="Temporary work directory",
        action='store', default="./", required=False)
    parser.add_argument(
        '-S', '--sourceclassfile', type=str,
        help="Source class file",
        action='store', required=True)
    parser.add_argument(
        '-T', '--targetclassfile', type=str,
        help="Target class file",
        action='store', required=True)
    parser.add_argument(
        '-s', '--constrainskipgrams',
        help="Strictly constrain skipgrams: only skipgrams present in the constrain models (-m and -M) will be considered",
        action='store_true', required=False)
    parser.add_argument(
        '-m', '--constrainsourcemodel', type=str,
        help="Source patternmodel, used to constrain possible patterns",
        action='store', required=False)
    parser.add_argument(
        '-M', '--constraintargetmodel', type=str,
        help="Target patternmodel, used to constrain possible patterns",
        action='store', required=False)
    parser.add_argument(
        '-p', '--pts', type=float,
        help="Minimum probability p(t|s) for skipgram consideration (set to a high number)",
        default=0.75, action='store', required=False)
    parser.add_argument(
        '-P', '--pst', type=float,
        help="Minimum probability p(s|t) for skipgram consideration (set to a high number)",
        default=0.75, action='store', required=False)
    parser.add_argument(
        '-D', '--debug',
        help="Enable debug mode",
        action='store_true', required=False)
    args = parser.parse_args()

    # Constraint models are optional; with -s an indexed pattern model is
    # loaded, otherwise an unindexed one.
    if args.constrainsourcemodel:
        print("Loading source model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constrainsourcemodel = colibricore.IndexedPatternModel(
                args.constrainsourcemodel)
        else:
            constrainsourcemodel = colibricore.UnindexedPatternModel(
                args.constrainsourcemodel)
    else:
        constrainsourcemodel = None

    if args.constraintargetmodel:
        print("Loading target model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constraintargetmodel = colibricore.IndexedPatternModel(
                args.constraintargetmodel)
        else:
            constraintargetmodel = colibricore.UnindexedPatternModel(
                args.constraintargetmodel)
    else:
        constraintargetmodel = None

    # The presence of the "-keys" file decides whether the input is an
    # existing colibri alignment model or a Moses phrase table.
    alignmodel = FeaturedAlignmentModel()
    if os.path.exists(args.inputfile + '.colibri.alignmodel-keys'):
        print("Loading colibri alignment model", file=sys.stderr)
        alignmodel.load(args.inputfile)
    else:
        print("Loading class encoders", file=sys.stderr)
        sourceencoder = colibricore.ClassEncoder(args.sourceclassfile)
        targetencoder = colibricore.ClassEncoder(args.targetclassfile)
        print("Loading moses phrase table", file=sys.stderr)
        alignmodel.loadmosesphrasetable(args.inputfile, sourceencoder,
                                        targetencoder)

    # In debug mode a (source decoder, target decoder) pair is handed to
    # extractskipgrams; otherwise False.
    if args.debug:
        debug = (colibricore.ClassDecoder(args.sourceclassfile),
                 colibricore.ClassDecoder(args.targetclassfile))
    else:
        debug = False

    def scorefilter(features):
        # features[0] is checked against --pst and features[2] against --pts.
        return features[0] >= args.pst and features[2] >= args.pts

    extractskipgrams(alignmodel, args.maxlength, args.minskiptypes,
                     args.tmpdir, constrainsourcemodel, constraintargetmodel,
                     args.constrainskipgrams, scorefilter, False, debug)

    outfile = args.outputfile or _default_output_prefix(args.inputfile)
    print("Saving alignment model to " + outfile, file=sys.stderr)
    alignmodel.save(outfile)  # extensions will be added automatically
def _default_output_prefix(inputfile):
    """Derive the output model prefix from the input file name.

    Takes the base name of ``inputfile`` and strips, in this order, a trailing
    ``.gz``, ``.bz2``, ``.phrasetable`` and ``.phrase-table`` suffix, so that
    e.g. ``/data/model.phrase-table.gz`` yields ``model``.
    """
    prefix = os.path.basename(inputfile)
    # Measure each suffix with len() instead of a hand-counted slice width:
    # the earlier hard-coded widths (11 and 12) were one character short of
    # '.phrasetable' and '.phrase-table', so those suffixes were never removed.
    for suffix in ('.gz', '.bz2', '.phrasetable', '.phrase-table'):
        if prefix.endswith(suffix):
            prefix = prefix[:-len(suffix)]
    return prefix


def main():
    """Command-line entry point for skipgram extraction.

    Parses the command line, optionally loads source/target pattern models
    used as constraints, loads the alignment model (a colibri alignment model
    when ``<inputfile>.colibri.alignmodel-keys`` exists, otherwise a Moses
    phrase table), runs ``extractskipgrams`` on it and saves the result.

    When no output file is given, the output prefix is the base name of the
    input file with compression and phrase-table suffixes removed.
    """
    parser = argparse.ArgumentParser(description="Extract skipgrams from a Moses phrasetable", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t', '--minskiptypes', type=int, help="Minimal skip types", action='store', default=2, required=False)
    parser.add_argument('-i', '--inputfile', type=str, help="Input alignment model (file prefix without .colibri.alignmodel-* extension) or moses phrasetable ", action='store', required=True)
    parser.add_argument('-o', '--outputfile', type=str, help="Output alignment model (file prefix without .colibri.alignmodel-* extension). Same as input if not specified!", default="", action='store', required=False)
    parser.add_argument('-l', '--maxlength', type=int, help="Maximum length", action='store', default=8, required=False)
    parser.add_argument('-W', '--tmpdir', type=str, help="Temporary work directory", action='store', default="./", required=False)
    parser.add_argument('-S', '--sourceclassfile', type=str, help="Source class file", action='store', required=True)
    parser.add_argument('-T', '--targetclassfile', type=str, help="Target class file", action='store', required=True)
    parser.add_argument('-s', '--constrainskipgrams', help="Strictly constrain skipgrams: only skipgrams present in the constrain models (-m and -M) will be considered", action='store_true', required=False)
    parser.add_argument('-m', '--constrainsourcemodel', type=str, help="Source patternmodel, used to constrain possible patterns", action='store', required=False)
    parser.add_argument('-M', '--constraintargetmodel', type=str, help="Target patternmodel, used to constrain possible patterns", action='store', required=False)
    parser.add_argument('-p', '--pts', type=float, help="Minimum probability p(t|s) for skipgram consideration (set to a high number)", default=0.75, action='store', required=False)
    parser.add_argument('-P', '--pst', type=float, help="Minimum probability p(s|t) for skipgram consideration (set to a high number)", default=0.75, action='store', required=False)
    parser.add_argument('-D', '--debug', help="Enable debug mode", action='store_true', required=False)
    args = parser.parse_args()

    # Constraint models are optional; with -s an indexed pattern model is
    # loaded, otherwise an unindexed one.
    if args.constrainsourcemodel:
        print("Loading source model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constrainsourcemodel = colibricore.IndexedPatternModel(args.constrainsourcemodel)
        else:
            constrainsourcemodel = colibricore.UnindexedPatternModel(args.constrainsourcemodel)
    else:
        constrainsourcemodel = None

    if args.constraintargetmodel:
        print("Loading target model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constraintargetmodel = colibricore.IndexedPatternModel(args.constraintargetmodel)
        else:
            constraintargetmodel = colibricore.UnindexedPatternModel(args.constraintargetmodel)
    else:
        constraintargetmodel = None

    # The presence of the "-keys" file decides whether the input is an
    # existing colibri alignment model or a Moses phrase table.
    alignmodel = FeaturedAlignmentModel()
    if os.path.exists(args.inputfile + '.colibri.alignmodel-keys'):
        print("Loading colibri alignment model", file=sys.stderr)
        alignmodel.load(args.inputfile)
    else:
        print("Loading class encoders", file=sys.stderr)
        sourceencoder = colibricore.ClassEncoder(args.sourceclassfile)
        targetencoder = colibricore.ClassEncoder(args.targetclassfile)
        print("Loading moses phrase table", file=sys.stderr)
        alignmodel.loadmosesphrasetable(args.inputfile, sourceencoder, targetencoder)

    # In debug mode a (source decoder, target decoder) pair is handed to
    # extractskipgrams; otherwise False.
    if args.debug:
        debug = (colibricore.ClassDecoder(args.sourceclassfile), colibricore.ClassDecoder(args.targetclassfile))
    else:
        debug = False

    def scorefilter(features):
        # features[0] is checked against --pst and features[2] against --pts.
        return features[0] >= args.pst and features[2] >= args.pts

    extractskipgrams(alignmodel, args.maxlength, args.minskiptypes, args.tmpdir, constrainsourcemodel, constraintargetmodel, args.constrainskipgrams, scorefilter, False, debug)

    outfile = args.outputfile or _default_output_prefix(args.inputfile)
    print("Saving alignment model to " + outfile, file=sys.stderr)
    alignmodel.save(outfile)  # extensions will be added automatically