def main():
    """Command-line entry point: extract skipgrams from a phrase table.

    Steps, in order:

    1. Parse the command-line options.
    2. Load the optional source/target pattern models used as constraints
       (indexed models when ``--constrainskipgrams`` is set, unindexed
       otherwise).
    3. Load the alignment model: a colibri alignment model when
       ``<inputfile>.colibri.alignmodel-keys`` exists, otherwise a Moses
       phrase table encoded with the given source/target class files.
    4. Run ``extractskipgrams`` on the model.
    5. Save the model under ``--outputfile``, or under a name derived from
       the input file when no output file was given.

    Progress messages are written to stderr.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract skipgrams from a Moses phrasetable")
    parser.add_argument(
        '-t', '--minskiptypes',
        type=int, default=2, action='store', required=False,
        help="Minimal skip types")
    parser.add_argument(
        '-i', '--inputfile',
        type=str, action='store', required=True,
        help="Input alignment model (file prefix without .colibri.alignmodel-* extension) or moses phrasetable ")
    parser.add_argument(
        '-o', '--outputfile',
        type=str, default="", action='store', required=False,
        help="Output alignment model (file prefix without .colibri.alignmodel-* extension). Same as input if not specified!")
    parser.add_argument(
        '-l', '--maxlength',
        type=int, default=8, action='store', required=False,
        help="Maximum length")
    parser.add_argument(
        '-W', '--tmpdir',
        type=str, default="./", action='store', required=False,
        help="Temporary work directory")
    parser.add_argument(
        '-S', '--sourceclassfile',
        type=str, action='store', required=True,
        help="Source class file")
    parser.add_argument(
        '-T', '--targetclassfile',
        type=str, action='store', required=True,
        help="Target class file")
    parser.add_argument(
        '-s', '--constrainskipgrams',
        action='store_true', required=False,
        help="Strictly constrain skipgrams: only skipgrams present in the constrain models (-m and -M) will be considered")
    parser.add_argument(
        '-m', '--constrainsourcemodel',
        type=str, action='store', required=False,
        help="Source patternmodel, used to constrain possible patterns")
    parser.add_argument(
        '-M', '--constraintargetmodel',
        type=str, action='store', required=False,
        help="Target patternmodel, used to constrain possible patterns")
    parser.add_argument(
        '-p', '--pts',
        type=float, default=0.75, action='store', required=False,
        help="Minimum probability p(t|s) for skipgram consideration (set to a high number)")
    parser.add_argument(
        '-P', '--pst',
        type=float, default=0.75, action='store', required=False,
        help="Minimum probability p(s|t) for skipgram consideration (set to a high number)")
    parser.add_argument(
        '-D', '--debug',
        action='store_true', required=False,
        help="Enable debug mode")
    opts = parser.parse_args()

    def load_constraints(modelfile, message):
        """Load one constraint pattern model, or return None when unset."""
        if not modelfile:
            return None
        print(message, file=sys.stderr)
        # Strict constraining uses the indexed model flavour.
        if opts.constrainskipgrams:
            return colibricore.IndexedPatternModel(modelfile)
        return colibricore.UnindexedPatternModel(modelfile)

    constrainsourcemodel = load_constraints(
        opts.constrainsourcemodel, "Loading source model for constraints")
    constraintargetmodel = load_constraints(
        opts.constraintargetmodel, "Loading target model for constraints")

    alignmodel = FeaturedAlignmentModel()
    if not os.path.exists(opts.inputfile + '.colibri.alignmodel-keys'):
        # No colibri model with this prefix: treat the input as a Moses
        # phrase table and encode it with the supplied class files.
        print("Loading class encoders", file=sys.stderr)
        src_encoder = colibricore.ClassEncoder(opts.sourceclassfile)
        tgt_encoder = colibricore.ClassEncoder(opts.targetclassfile)
        print("Loading moses phrase table", file=sys.stderr)
        alignmodel.loadmosesphrasetable(opts.inputfile, src_encoder,
                                        tgt_encoder)
    else:
        print("Loading colibri alignment model", file=sys.stderr)
        alignmodel.load(opts.inputfile)

    # In debug mode the extractor receives a (source, target) decoder pair;
    # otherwise it receives False.
    debug = False
    if opts.debug:
        debug = (colibricore.ClassDecoder(opts.sourceclassfile),
                 colibricore.ClassDecoder(opts.targetclassfile))

    def scorefilter(features):
        """Accept a score vector when it meets both thresholds.

        Feature 0 is compared against --pst, feature 2 against --pts.
        """
        return features[0] >= opts.pst and features[2] >= opts.pts

    extractskipgrams(alignmodel, opts.maxlength, opts.minskiptypes,
                     opts.tmpdir, constrainsourcemodel, constraintargetmodel,
                     opts.constrainskipgrams, scorefilter, False, debug)

    outfile = opts.outputfile
    if not outfile:
        # Derive the output name from the input file name: drop the
        # directory part, then peel off known suffixes in this fixed order
        # (compression first, then the phrase-table extension).
        outfile = os.path.basename(opts.inputfile)
        for suffix in ('.gz', '.bz2', '.phrasetable', '.phrase-table'):
            if outfile.endswith(suffix):
                outfile = outfile[:-len(suffix)]

    print("Saving alignment model to " + outfile, file=sys.stderr)
    alignmodel.save(outfile)  # extensions will be added automatically
from __future__ import print_function, unicode_literals, division, absolute_import

import os
import shutil
import tempfile

import colibricore
from colibrimt.alignmentmodel import FeaturedAlignmentModel

# Demo / smoke script: build a tiny alignment model from a handful of phrase
# pairs, normalize it, and print every entry as
# "<source>\t<target>\t<space-separated scores>".

sourceencoder = colibricore.ClassEncoder()
targetencoder = colibricore.ClassEncoder()

# NOTE(review): the trailing (False, True) flags presumably let the empty
# encoders add previously unseen words on the fly -- confirm against the
# colibricore ClassEncoder.buildpattern documentation.
s1 = sourceencoder.buildpattern("het grote huis", False, True)
s2 = sourceencoder.buildpattern("het paleis", False, True)

t1 = targetencoder.buildpattern("the big house", False, True)
t2 = targetencoder.buildpattern("the grand house", False, True)
t3 = targetencoder.buildpattern("the palace", False, True)

# The class files are written to a private scratch directory rather than to
# fixed /tmp/s.cls and /tmp/t.cls paths: fixed names in a shared directory
# collide between concurrent runs/users and are not portable.
workdir = tempfile.mkdtemp(prefix="colibri-")
try:
    sourceclassfile = os.path.join(workdir, 's.cls')
    targetclassfile = os.path.join(workdir, 't.cls')
    sourceencoder.save(sourceclassfile)
    targetencoder.save(targetclassfile)

    # Decoders are built from the saved class files and used below to turn
    # the patterns back into text.
    sd = colibricore.ClassDecoder(sourceclassfile)
    td = colibricore.ClassDecoder(targetclassfile)

    model = FeaturedAlignmentModel()
    model.add(s1, t1, [1, 0, 1, 0])
    model.add(s1, t2, [1, 0, 1, 0])
    model.add(s2, t2, [1, 0, 1, 0])
    model.add(s2, t3, [1, 0, 1, 0])

    # See FeaturedAlignmentModel.normalize for the meaning of the 's-t-'
    # specification string.
    model.normalize('s-t-')

    for source, target, scores in model:
        print(source.tostring(sd) + "\t" + target.tostring(td) + "\t" +
              " ".join(str(x) for x in scores))
finally:
    # Clean up only after all decoding is done, so the decoders never
    # outlive the files they were loaded from.
    shutil.rmtree(workdir, ignore_errors=True)