from __future__ import print_function, unicode_literals, division, absolute_import

import colibricore
from colibrimt.alignmentmodel import FeaturedAlignmentModel

# Smoke script: fills a FeaturedAlignmentModel with four hand-written
# phrase pairs, normalizes it and prints every entry as a tab-separated line.

sourceencoder = colibricore.ClassEncoder()
targetencoder = colibricore.ClassEncoder()

# Source-side patterns are built first, then the target-side ones.
s1 = sourceencoder.buildpattern("het grote huis", False, True)
s2 = sourceencoder.buildpattern("het paleis", False, True)

t1 = targetencoder.buildpattern("the big house", False, True)
t2 = targetencoder.buildpattern("the grand house", False, True)
t3 = targetencoder.buildpattern("the palace", False, True)

# The class files are written out and read back through ClassDecoder so the
# patterns can be turned into text again when printing.
sourceencoder.save('/tmp/s.cls')
targetencoder.save('/tmp/t.cls')

sd = colibricore.ClassDecoder('/tmp/s.cls')
td = colibricore.ClassDecoder('/tmp/t.cls')

model = FeaturedAlignmentModel()
# Every pair gets its own fresh feature list.
for source, target in ((s1, t1), (s1, t2), (s2, t2), (s2, t3)):
    model.add(source, target, [1, 0, 1, 0])

model.normalize('s-t-')

for source, target, scores in model:
    fields = (
        source.tostring(sd),
        target.tostring(td),
        " ".join(str(x) for x in scores),
    )
    print("\t".join(fields))
from __future__ import print_function, unicode_literals, division, absolute_import

import colibricore
from colibrimt.alignmentmodel import FeaturedAlignmentModel

# Builds a tiny alignment model from literal phrases, normalizes it with the
# 's-t-' specification and dumps the result to standard output.

# --- encoders and patterns -------------------------------------------------
sourceencoder = colibricore.ClassEncoder()
targetencoder = colibricore.ClassEncoder()

s1 = sourceencoder.buildpattern("het grote huis", False, True)
s2 = sourceencoder.buildpattern("het paleis", False, True)
t1 = targetencoder.buildpattern("the big house", False, True)
t2 = targetencoder.buildpattern("the grand house", False, True)
t3 = targetencoder.buildpattern("the palace", False, True)

# --- persist the classes and reload them as decoders -----------------------
sourceencoder.save('/tmp/s.cls')
targetencoder.save('/tmp/t.cls')
sd = colibricore.ClassDecoder('/tmp/s.cls')
td = colibricore.ClassDecoder('/tmp/t.cls')

# --- model -----------------------------------------------------------------
model = FeaturedAlignmentModel()
for pattern_pair in [(s1, t1), (s1, t2), (s2, t2), (s2, t3)]:
    # The list literal is evaluated per iteration, so no list is shared.
    model.add(pattern_pair[0], pattern_pair[1], [1, 0, 1, 0])
model.normalize('s-t-')

# --- output: source <TAB> target <TAB> space-separated scores --------------
for source, target, scores in model:
    print(source.tostring(sd),
          target.tostring(td),
          " ".join(map(str, scores)),
          sep="\t")
def main():
    """Command-line entry point: extract skipgrams from an alignment model.

    Steps, in order:

    1. Parse the command-line options.
    2. Load the optional source/target pattern models used as constraints
       (indexed models when ``--constrainskipgrams`` is set, unindexed
       otherwise).
    3. Load the alignment model: a colibri alignment model when
       ``<inputfile>.colibri.alignmodel-keys`` exists, otherwise the input
       is read as a Moses phrase table using the given class files.
    4. Run ``extractskipgrams`` on the model.
    5. Save the model under ``--outputfile``; when that is empty, under the
       base name of the input file with trailing ``.gz``, ``.bz2``,
       ``.phrasetable`` and ``.phrase-table`` suffixes removed.

    Exits through ``argparse`` (``SystemExit``) on invalid or missing
    options.
    """
    parser = argparse.ArgumentParser(
        description="Extract skipgrams from a Moses phrasetable",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-t', '--minskiptypes', type=int,
        help="Minimal skip types",
        action='store', default=2, required=False)
    parser.add_argument(
        '-i', '--inputfile', type=str,
        help="Input alignment model (file prefix without .colibri.alignmodel-* extension) or moses phrasetable ",
        action='store', required=True)
    parser.add_argument(
        '-o', '--outputfile', type=str,
        help="Output alignment model (file prefix without .colibri.alignmodel-* extension). Same as input if not specified!",
        default="", action='store', required=False)
    parser.add_argument(
        '-l', '--maxlength', type=int,
        help="Maximum length",
        action='store', default=8, required=False)
    parser.add_argument(
        '-W', '--tmpdir', type=str,
        help="Temporary work directory",
        action='store', default="./", required=False)
    parser.add_argument(
        '-S', '--sourceclassfile', type=str,
        help="Source class file",
        action='store', required=True)
    parser.add_argument(
        '-T', '--targetclassfile', type=str,
        help="Target class file",
        action='store', required=True)
    parser.add_argument(
        '-s', '--constrainskipgrams',
        help="Strictly constrain skipgrams: only skipgrams present in the constrain models (-m and -M) will be considered",
        action='store_true', required=False)
    parser.add_argument(
        '-m', '--constrainsourcemodel', type=str,
        help="Source patternmodel, used to constrain possible patterns",
        action='store', required=False)
    parser.add_argument(
        '-M', '--constraintargetmodel', type=str,
        help="Target patternmodel, used to constrain possible patterns",
        action='store', required=False)
    parser.add_argument(
        '-p', '--pts', type=float,
        help="Minimum probability p(t|s) for skipgram consideration (set to a high number)",
        default=0.75, action='store', required=False)
    parser.add_argument(
        '-P', '--pst', type=float,
        help="Minimum probability p(s|t) for skipgram consideration (set to a high number)",
        default=0.75, action='store', required=False)
    parser.add_argument(
        '-D', '--debug',
        help="Enable debug mode",
        action='store_true', required=False)
    args = parser.parse_args()

    # Optional constraint models; strict constraining uses indexed models.
    if args.constrainsourcemodel:
        print("Loading source model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constrainsourcemodel = colibricore.IndexedPatternModel(
                args.constrainsourcemodel)
        else:
            constrainsourcemodel = colibricore.UnindexedPatternModel(
                args.constrainsourcemodel)
    else:
        constrainsourcemodel = None

    if args.constraintargetmodel:
        print("Loading target model for constraints", file=sys.stderr)
        if args.constrainskipgrams:
            constraintargetmodel = colibricore.IndexedPatternModel(
                args.constraintargetmodel)
        else:
            constraintargetmodel = colibricore.UnindexedPatternModel(
                args.constraintargetmodel)
    else:
        constraintargetmodel = None

    alignmodel = FeaturedAlignmentModel()
    if os.path.exists(args.inputfile + '.colibri.alignmodel-keys'):
        print("Loading colibri alignment model", file=sys.stderr)
        alignmodel.load(args.inputfile)
    else:
        print("Loading class encoders", file=sys.stderr)
        sourceencoder = colibricore.ClassEncoder(args.sourceclassfile)
        targetencoder = colibricore.ClassEncoder(args.targetclassfile)
        print("Loading moses phrase table", file=sys.stderr)
        alignmodel.loadmosesphrasetable(args.inputfile, sourceencoder,
                                        targetencoder)

    # In debug mode a (source decoder, target decoder) pair is handed on;
    # otherwise the debug argument is simply False.
    if args.debug:
        debug = (colibricore.ClassDecoder(args.sourceclassfile),
                 colibricore.ClassDecoder(args.targetclassfile))
    else:
        debug = False

    def scorefilter(features):
        # Feature 0 is held against --pst and feature 2 against --pts.
        # NOTE(review): presumably these are p(s|t) and p(t|s) in Moses
        # score order -- confirm against the model loader.
        return features[0] >= args.pst and features[2] >= args.pts

    # NOTE(review): the meaning of the literal False (9th argument) is not
    # visible here -- check the extractskipgrams signature.
    extractskipgrams(alignmodel, args.maxlength, args.minskiptypes,
                     args.tmpdir, constrainsourcemodel, constraintargetmodel,
                     args.constrainskipgrams, scorefilter, False, debug)

    if args.outputfile:
        outfile = args.outputfile
    else:
        outfile = os.path.basename(args.inputfile)
        # Each suffix is checked once, in this order. The previous slice
        # comparisons for '.phrasetable' and '.phrase-table' were one
        # character short and could never match; endswith/len cannot drift.
        for suffix in ('.gz', '.bz2', '.phrasetable', '.phrase-table'):
            if outfile.endswith(suffix):
                outfile = outfile[:-len(suffix)]

    print("Saving alignment model to " + outfile, file=sys.stderr)
    alignmodel.save(outfile)  # extensions will be added automatically
def main():
    """Command-line entry point: extract skipgrams from an alignment model.

    Parses the options, loads the optional constraint pattern models and the
    alignment model (a colibri alignment model when
    ``<inputfile>.colibri.alignmodel-keys`` exists, otherwise a Moses phrase
    table read with the given class files), runs ``extractskipgrams`` and
    saves the model. Without ``--outputfile`` the output prefix is derived
    from the input file name (see ``_default_output_prefix``).

    Exits through ``argparse`` (``SystemExit``) on invalid or missing
    options.
    """
    args = _build_argument_parser().parse_args()

    constrainsourcemodel = _load_constraint_model(
        args.constrainsourcemodel, args.constrainskipgrams, "source")
    constraintargetmodel = _load_constraint_model(
        args.constraintargetmodel, args.constrainskipgrams, "target")

    alignmodel = FeaturedAlignmentModel()
    if os.path.exists(args.inputfile + '.colibri.alignmodel-keys'):
        print("Loading colibri alignment model", file=sys.stderr)
        alignmodel.load(args.inputfile)
    else:
        print("Loading class encoders", file=sys.stderr)
        sourceencoder = colibricore.ClassEncoder(args.sourceclassfile)
        targetencoder = colibricore.ClassEncoder(args.targetclassfile)
        print("Loading moses phrase table", file=sys.stderr)
        alignmodel.loadmosesphrasetable(args.inputfile, sourceencoder,
                                        targetencoder)

    # Debug mode passes a (source decoder, target decoder) pair instead of
    # the plain False used otherwise.
    if args.debug:
        debug = (colibricore.ClassDecoder(args.sourceclassfile),
                 colibricore.ClassDecoder(args.targetclassfile))
    else:
        debug = False

    def scorefilter(features):
        # Feature 0 must reach --pst and feature 2 must reach --pts.
        # NOTE(review): presumably p(s|t) and p(t|s) in Moses score order
        # -- confirm against the model loader.
        return features[0] >= args.pst and features[2] >= args.pts

    # NOTE(review): the purpose of the literal False (9th argument) is not
    # visible from here -- check the extractskipgrams signature.
    extractskipgrams(alignmodel, args.maxlength, args.minskiptypes,
                     args.tmpdir, constrainsourcemodel, constraintargetmodel,
                     args.constrainskipgrams, scorefilter, False, debug)

    outfile = args.outputfile or _default_output_prefix(args.inputfile)
    print("Saving alignment model to " + outfile, file=sys.stderr)
    alignmodel.save(outfile)  # extensions will be added automatically


def _build_argument_parser():
    """Return the argparse parser describing all command-line options."""
    parser = argparse.ArgumentParser(
        description="Extract skipgrams from a Moses phrasetable",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument
    add('-t', '--minskiptypes', type=int,
        help="Minimal skip types",
        action='store', default=2, required=False)
    add('-i', '--inputfile', type=str,
        help="Input alignment model (file prefix without .colibri.alignmodel-* extension) or moses phrasetable ",
        action='store', required=True)
    add('-o', '--outputfile', type=str,
        help="Output alignment model (file prefix without .colibri.alignmodel-* extension). Same as input if not specified!",
        default="", action='store', required=False)
    add('-l', '--maxlength', type=int,
        help="Maximum length",
        action='store', default=8, required=False)
    add('-W', '--tmpdir', type=str,
        help="Temporary work directory",
        action='store', default="./", required=False)
    add('-S', '--sourceclassfile', type=str,
        help="Source class file",
        action='store', required=True)
    add('-T', '--targetclassfile', type=str,
        help="Target class file",
        action='store', required=True)
    add('-s', '--constrainskipgrams',
        help="Strictly constrain skipgrams: only skipgrams present in the constrain models (-m and -M) will be considered",
        action='store_true', required=False)
    add('-m', '--constrainsourcemodel', type=str,
        help="Source patternmodel, used to constrain possible patterns",
        action='store', required=False)
    add('-M', '--constraintargetmodel', type=str,
        help="Target patternmodel, used to constrain possible patterns",
        action='store', required=False)
    add('-p', '--pts', type=float,
        help="Minimum probability p(t|s) for skipgram consideration (set to a high number)",
        default=0.75, action='store', required=False)
    add('-P', '--pst', type=float,
        help="Minimum probability p(s|t) for skipgram consideration (set to a high number)",
        default=0.75, action='store', required=False)
    add('-D', '--debug',
        help="Enable debug mode",
        action='store_true', required=False)
    return parser


def _load_constraint_model(path, indexed, side):
    """Load the pattern model at *path*, or return None when *path* is empty.

    *indexed* selects ``IndexedPatternModel`` over ``UnindexedPatternModel``;
    *side* ("source" or "target") only appears in the progress message.
    """
    if not path:
        return None
    print("Loading " + side + " model for constraints", file=sys.stderr)
    if indexed:
        return colibricore.IndexedPatternModel(path)
    return colibricore.UnindexedPatternModel(path)


def _default_output_prefix(inputfile):
    """Derive the output prefix used when no output file was requested.

    Takes the base name of *inputfile* and removes the suffixes ``.gz``,
    ``.bz2``, ``.phrasetable`` and ``.phrase-table``; each is checked once,
    in that order.
    """
    name = os.path.basename(inputfile)
    # The earlier hand-counted slices for '.phrasetable' and '.phrase-table'
    # were one character short, so those suffixes were never stripped.
    for suffix in ('.gz', '.bz2', '.phrasetable', '.phrase-table'):
        if name.endswith(suffix):
            name = name[:-len(suffix)]
    return name