def main():
    """Split a rescoring/force-decoding job over the cluster.

    Splits --datafile into --splitsize pieces, submits one rescore_single job
    per piece via qsubrun, then submits a join job (held on the rescore jobs'
    completion) that concatenates the per-piece score files into --outfile.
    """
    parser = argparse.ArgumentParser(description="run rescoring/force decoding over the cluster", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--splitsize", "-z", type=int, default=1, help="number of batches")
    parser.add_argument("--datafile", "-d", type=str, required=True, help="input source tab trg file")
    parser.add_argument("--model", "-m", required=True, help="model file")
    parser.add_argument("--modelnum", "-n", required=True, help="model number")
    parser.add_argument("--logfile", "-l", type=str, default='/dev/null', help="where to log data")
    parser.add_argument("--outfile", "-o", type=str, required=True, help="output scores file")
    parser.add_argument("--workdir", "-w", default=None, help="work directory (defaults to 'work' subdir of outfile")
    parser.add_argument("--extra_rnn_args", help="extra arguments to rnn binary")
    parser.add_argument("--extra_qsub_opts", "-q", help="extra options to qsubrun (scorers only)")
    parser.add_argument("--rescore_single", default=os.path.join(scriptdir, 'rescore_single.py'), help="rescore single script")
    parser.add_argument("--cat", default=os.path.join(scriptdir, 'cat.py'), help="cat with named output")
    parser.add_argument("--rnn_location", default=os.path.join(scriptdir, 'helper_programs', 'ZOPH_RNN'), help="rnn binary")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    # scratch space: sibling 'work' dir of the output file unless overridden
    workdir = os.path.join(os.path.dirname(args.outfile), 'work') if args.workdir is None else args.workdir
    mkdir_p(workdir)
    qsubopts = args.extra_qsub_opts if args.extra_qsub_opts is not None else ""
    # split into desired number of pieces
    # fill = number of digits needed for zero-padded piece suffixes;
    # matches split's -a (suffix length) so filenames line up below
    fill = len(str(args.splitsize-1))
    # GNU split: -n l/K = K line-balanced chunks, -d = numeric suffixes,
    # output files are {workdir}/data.00, data.01, ...
    splitcmd = run(shlex.split("split -a {} -n l/{} -d {} {}/data.".format(fill, args.splitsize, args.datafile, workdir)), check=True)
    jobids = []
    outfiles = []
    # join job is built up incrementally: dependency list is appended after
    # all scorer jobs have been submitted
    joincmd = "qsubrun -q isi -l walltime=24:00:00 -j oe -o {logfile} -N {outfile}.join -W depend=afterok:".format(logfile=args.logfile, outfile=args.outfile)
    for piece in range(args.splitsize):
        # split back into source and target
        piece = str(piece).zfill(fill)
        df = "{}/data.{}".format(workdir, piece)
        of = "{}/scores.{}".format(workdir, piece)
        outfiles.append(of)
        # launch individual rescore jobs; collect job ids
        cmd = "qsubrun -o {workdir}/split.log.{piece} -N split.{piece} {qsub} -- {rescore_single} -m {model} -n {modelnum} -d {df} -o {of} -l {workdir}/inner.log.{piece}".format(qsub=qsubopts, workdir=workdir, piece=piece, rescore_single=args.rescore_single, model=args.model, modelnum=args.modelnum, df=df, of=of)
        sys.stderr.write("Launching {}".format(cmd)+"\n")
        # qsubrun prints the submitted job id on stdout
        jobid = run(shlex.split(cmd), stdout=PIPE).stdout.decode('utf-8').strip()
        sys.stderr.write("Got {}".format(jobid)+"\n")
        jobids.append(jobid)
    # complete the join command: depend on all scorer jobs, cat all piece
    # score files into the final outfile
    joincmd += "{} -- {} -i {} -o {} ".format(':'.join(jobids), args.cat, ' '.join(outfiles), args.outfile)
    sys.stderr.write(joincmd+"\n")
    res = run(shlex.split(joincmd), stdout=PIPE).stdout.decode('utf-8').strip()
    print(res)
def runaligner(args):
    """Stage data for the berkeley aligner and submit it via qsubrun.

    Copies train/dev source and target files into the aligner's expected
    data layout, copies the aligner binaries/config alongside, and returns
    the qsub job id of the submitted alignment job.
    """
    aligner_root = os.path.join(args.trained_model, 'berk_aligner')
    data_root = os.path.join(aligner_root, 'data')
    # (source, target, setname): train pair goes to 'train', dev pair to 'test'
    pairs = zip((args.train_source, args.dev_source),
                (args.train_target, args.dev_target),
                ('train', 'test'))
    for src_path, trg_path, setname in pairs:
        set_dir = os.path.join(data_root, setname)
        mkdir_p(set_dir)
        # aligner convention: .f = foreign (source), .e = english (target)
        shutil.copy(src_path, os.path.join(set_dir, "{}.f".format(setname)))
        shutil.copy(trg_path, os.path.join(set_dir, "{}.e".format(setname)))
    # aligner driver, jar, and config all live next to the data
    for support_file in (args.aligncmd, args.alignjar, args.alignconf):
        shutil.copy(support_file, aligner_root)
    cmd = "qsubrun -N {name}.align -j oe -o {alroot}/align.monitor {qsubopts} -- {berkalignsh} {alroot}".format(alroot=aligner_root, qsubopts=args.qsubopts, berkalignsh=args.berkalignsh, name=args.name)
    sys.stderr.write(cmd+"\n")
    # qsubrun reports the job id on stdout; fail loudly if submission fails
    submission = run(shlex.split(cmd), check=True, stdout=PIPE)
    return submission.stdout.decode('utf-8').strip()
def main():
    """Train seq2seq models: standalone, parent, or child of a parent model.

    Builds eight model-variant configurations, optionally runs the berkeley
    aligner, builds a vocabulary mapping (except for child mode), and submits
    one training job per requested model number; writes the colon-joined job
    ids to --outfile.
    """
    MODES = ['parent', 'child', 'standalone']
    parser = argparse.ArgumentParser(description="train seq2seq standalone, parent, and child models", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--name", required=True, help="a name for this training (combined with mode)")
    parser.add_argument("--mode", required=True, choices=MODES, help="what kind of training are we doing?")
    parser.add_argument("--trained_model", "-m", required=True, help="model location (this model will be created)")
    parser.add_argument("--parent_model", "-p", default=None, help="parent model location (if this model exists, it's a child model)")
    parser.add_argument("--model_nums", "-n", type=int, nargs='+', default=[x for x in range(1,9)], choices=range(1,9), help="model variants to train")
    parser.add_argument("--train_source", "-ts", required=True, help="source side of training data")
    parser.add_argument("--train_target", "-tt", required=True, help="target side of training data")
    parser.add_argument("--mapping_source", "-ms", help="source side of child data when building parent (for mapping)")
    parser.add_argument("--mapping_target", "-mt", help="target side of child data when building parent (for mapping)")
    parser.add_argument("--dev_source", "-ds", required=True, help="source side of dev data")
    parser.add_argument("--dev_target", "-dt", required=True, help="target side of dev data")
    parser.add_argument("--epochs", "-e", type=int, default=40, help="number of epochs to run")
    parser.add_argument("--qsubopts", default="", help="additional options to pass to qsub")
    parser.add_argument("--extra_rnn_args", default="", help="additional options to pass to rnn binary")
    parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file")
    parser.add_argument("--previous_alignment", default=None, help="path to berk_align directory that will be simlinked in if align is false and mode is not parent")
    addonoffarg(parser, 'align', help="run aligner (never run when training parent)", default=True)
    parser.add_argument("--aligncmd", default=os.path.join(scriptdir, 'helper_programs', 'align'), help="aligner data")
    parser.add_argument("--alignjar", default=os.path.join(scriptdir, 'helper_programs', 'berkeleyaligner.jar'), help="aligner data")
    parser.add_argument("--alignconf", default=os.path.join(scriptdir, 'helper_programs', 'unk_replace.conf'), help="aligner data")
    parser.add_argument("--mappingstandalone", default=os.path.join(scriptdir, 'helper_programs', 'create_mapping_pureNMT.py'), help="mapping program")
    parser.add_argument("--mappingparent", default=os.path.join(scriptdir, 'helper_programs', 'create_mapping_parent.py'), help="mapping program")
    parser.add_argument("--berkalignsh", default=os.path.join(scriptdir, 'helper_programs', 'berk_align.sh'), help="aligner cmd")
    parser.add_argument("--rnnwrap", default=os.path.join(scriptdir, 'helper_programs', 'rnn_wrap.sh'), help="rnn wrapper")
    parser.add_argument("--pretrain", default=os.path.join(scriptdir, 'helper_programs', 'pretrain.py'), help="pretrain child model trainer")
    parser.add_argument("--rnn_binary", default=os.path.join(scriptdir, 'helper_programs', 'ZOPH_RNN'), help="rnn binary")
    # temp scratch dir; removed at exit unless --debug is set
    workdir = tempfile.mkdtemp(prefix=os.path.basename(__file__), dir=os.getenv('TMPDIR', '/tmp'))
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    def cleanwork():
        shutil.rmtree(workdir, ignore_errors=True)
    if args.debug:
        print(workdir)
    else:
        atexit.register(cleanwork)
    jobids = []
    # model configuration
    dropouts = {}
    if args.mode == 'parent':
        # parents never run the aligner and use heavier/no dropout
        args.align = False
        dropouts[1] = "-d 0.8"
        dropouts[2] = ""
    else:
        dropouts[1] = dropouts[2] = "-d 0.5"
    # build the 8 model-variant option strings; hidden size, layer count,
    # and gpu layout are derived from the variant number
    modelopts = {}
    for i in range(1, 9):
        H = 750 if int((i+1)/2) % 2 == 1 else 1000  # 1, 2, 5, 6 = 750
        N = 2 if (i % 2) == 1 else 3  # 1 3 5 7 = 2
        G = "0 1 1" if N == 2 else "0 0 1 1"  # 1 3 5 7 = 0 0 1
        dr = dropouts[1] if i <= 4 else dropouts[2]
        modelopts[i] = "-m 128 -l 0.5 -P -0.08 0.08 -w 5 --attention-model 1 --feed-input 1 --screen-print-rate 30 -n {epochs} -L 100 {dr} -H {H} -N {N} -M {G}".format(epochs=args.epochs, H=H, N=N, G=G, dr=dr)
        #sys.stderr.write("Model {}: {}\n".format(i, modelopts[i]))
    # parent model must be given iff we are training a child (xor check)
    if not ((args.mode == 'child') ^ (args.parent_model is None)):
        sys.stderr.write("Should only (and always) specify parent model when building child\n")
        sys.exit(1)
    mkdir_p(args.trained_model)
    # run aligner
    if args.align:
        jobids.append(runaligner(args))
    elif args.mode != 'parent' and args.previous_alignment is not None:
        # reuse an existing alignment via symlink instead of re-running
        dst = os.path.join(args.trained_model, 'berk_aligner')
        if not os.path.exists(dst):
            os.symlink(os.path.abspath(args.previous_alignment), dst)
    # if not child model, obtain vocabulary based on token frequency
    if args.mode == 'child':
        # child models inherit vocabulary from the parent; nothing to do here
        pass
    else:
        if args.mode == 'standalone':
            cmd = "{mapping} {trainsource} {traintarget} 6 {modelroot}/count6.nn".format(mapping=args.mappingstandalone, trainsource=args.train_source, traintarget=args.train_target, modelroot=args.trained_model)
        elif args.mode == 'parent':
            # parent mapping also sees the child data so child vocab is covered
            cmd = "{mapping} {mapsource} {maptarget} 6 {modelroot}/count6.nn {trainsource}".format(mapping=args.mappingparent, mapsource=args.mapping_source, maptarget=args.mapping_target, trainsource=args.train_source, modelroot=args.trained_model)
        sys.stderr.write(cmd+"\n")
        run(shlex.split(cmd), check=True)
    # launch trainings
    for modelnum in args.model_nums:
        modelpath = os.path.join(args.trained_model, "model{}".format(modelnum))
        mkdir_p(modelpath)
        if args.mode == 'child':
            jobids.append(runchildmodel(args, modelnum))
        else:
            countfile = "{}/count6.nn".format(args.trained_model)
            opts = modelopts[modelnum]+" -B {}/best.nn".format(modelpath)
            jobids.append(runmodel(args, modelnum, countfile, opts))
    # emit colon-joined job ids for downstream dependency chaining
    outfile = prepfile(args.outfile, 'w')
    outfile.write(':'.join(jobids)+"\n")
def main():
    """Apply patterns to lexicon/toklc data, split results into changed vs.
    unchanged, and draw samples from the changed set.

    Writes 'changes', 'sames', 'changesamples', 'samesamples' into --outdir.
    NOTE(review): samesamplefile is opened but never written in this
    function's visible body — confirm whether same-side sampling was
    intentionally dropped or is missing.
    """
    parser = argparse.ArgumentParser(description="apply patterns to data, get in and out results, sample them.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--lexiconfile", "-l", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="input lexicon file")
    parser.add_argument("--toklcfile", "-t", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="toklc english file")
    parser.add_argument("--patternfile", "-p", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="pattern file")
    parser.add_argument("--sample", "-s", type=int, default=20, help="number of samples to catch")
    parser.add_argument("--threshhold", "-d", type=float, default=5.0, help="minimum score for patterns")
    parser.add_argument("--applyprog", default=os.path.join(scriptdir, 'applymatches.py'), help='apply matches program')
    parser.add_argument("--sampleprog", default=os.path.join(scriptdir, 'sample.py'), help='sample program')
    parser.add_argument("--maskngram", default=os.path.join(scriptdir, 'maskngram.py'), help='maskngram file')
    parser.add_argument("--outdir", "-o", default=".", help="output directory")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    # temp scratch dir; removed at exit unless --debug is set
    workdir = tempfile.mkdtemp(prefix=os.path.basename(__file__), dir=os.getenv('TMPDIR', '/tmp'))
    def cleanwork():
        shutil.rmtree(workdir, ignore_errors=True)
    if args.debug:
        print(workdir)
    else:
        atexit.register(cleanwork)
    lexiconfile = prepfile(args.lexiconfile, 'r')
    toklcfile = prepfile(args.toklcfile, 'r')
    patternfile = prepfile(args.patternfile, 'r')
    mkdir_p(args.outdir)
    changefile = prepfile(os.path.join(args.outdir, "changes"), 'w')
    samefile = prepfile(os.path.join(args.outdir, "sames"), 'w')
    changesamplefile = prepfile(os.path.join(args.outdir, "changesamples"), 'w')
    samesamplefile = prepfile(os.path.join(args.outdir, "samesamples"), 'w')
    # paste lexicon and toklc line-by-line into a tab-separated temp file
    _, tmpfile = tempfile.mkstemp(dir=workdir, text=True)
    tmpfile = prepfile(tmpfile, 'w')
    # NOTE(review): izip is the Python 2 itertools name; confirm it is
    # imported/aliased at module level (rest of this code is Python 3 style)
    for l, t in izip(lexiconfile, toklcfile):
        # l is stripped; t keeps its trailing newline, terminating the row
        tmpfile.write("%s\t%s" % (l.strip(), t))
    tmpfile.close()
    # run the apply program twice over the patterns: once collecting
    # modified lines ('changes'), once collecting unmodified ones ('sames')
    shchain([
        "%s -i %s -t %f --no-passthrough --scoremode" % (args.applyprog, tmpfile.name, args.threshhold),
    ], input=patternfile, output=changefile)
    shchain([
        "%s -i %s -t %f --no-mods" % (args.applyprog, tmpfile.name, args.threshhold),
    ], input=patternfile, output=samefile)
    changefile.close()
    samefile.close()
    # reopen changes for reading and sample from it into a fresh temp file
    changefile = prepfile(changefile.name, 'r')
    _, tmpfile = tempfile.mkstemp(dir=workdir, text=True)
    tmpfile = prepfile(tmpfile, 'w')
    shchain([
        "%s -s %d" % (args.sampleprog, args.sample),
    ], input=changefile, output=tmpfile)
    tmpfile.close()
    tmpfile = prepfile(tmpfile.name, 'r')
    # reformat samples: leading tab-columns on one line, then each
    # '//'-separated item of the last column indented on its own line
    for line in tmpfile:
        toks = line.strip().split('\t')
        changesamplefile.write('\t'.join(toks[:-1]) + "\n")
        for tok in toks[-1].split('//'):
            changesamplefile.write("\t%s\n" % tok.strip())
def main():
    """Create an elisa pack out of an il (incident language) pack.

    Unpacks the LRLP tarball, subselects parallel data into the requested
    sets, packages the result, and flattens/samples each produced xml for
    eyeballing. All subprocess activity is logged to --outfile.
    """
    parser = argparse.ArgumentParser(description="create elisa pack out of il pack", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file")
    parser.add_argument("--tarball", "-t", required=True, help="input tarball")
    parser.add_argument("--language", "-l", required=True, help="iso 639 code/il code")
    parser.add_argument("--dst", "-d", required=True, help="directory of unpacking")
    parser.add_argument("--year", "-y", type=int, default=1, help="year of the eval")
    parser.add_argument("--version", "-v", type=int, default=1, help="version of the eval")
    parser.add_argument("--release", "-r", type=int, default=1, help="release of the eval")
    parser.add_argument("--ruby", default="/Users/jonmay/.rvm/rubies/ruby-2.3.0/bin/ruby", help="path to good ruby")
    parser.add_argument("--lex", default="il3", help="lex variant; probably have to make a new one each year")
    parser.add_argument("--key", "-k", default=None, type=str, help="set 0 key")
    parser.add_argument("--sets", "-s", nargs='+', default=['syscomb', 'test', 'dev'], type=str, help="list of sets to make")
    parser.add_argument("--sizes", "-z", nargs='+', default=['10000', '10000', '20000'], type=str, help="list of set sizes")
    parser.add_argument("--devset", default=None, type=str, help="set of mandatory documents in the devset")
    addonoffarg(parser, 'swap', help="swap src/translation in found files", default=True)
    addonoffarg(parser, 'allperseg', help="divide persegment instead of perdoc", default=False)
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    # temp scratch dir; removed at exit unless --debug is set
    workdir = tempfile.mkdtemp(prefix=os.path.basename(__file__), dir=os.getenv('TMPDIR', '/tmp'))
    def cleanwork():
        shutil.rmtree(workdir, ignore_errors=True)
    if args.debug:
        print(workdir)
    else:
        atexit.register(cleanwork)
    if len(args.sets) != len(args.sizes):
        # BUGFIX: previously only wrote the message (without a newline) and
        # kept running with mismatched sets/sizes; now fail fast
        sys.stderr.write("sets and sizes must match!\n")
        sys.exit(1)
    outfile = prepfile(args.outfile, 'w')
    outdir = os.path.join(args.dst, args.language)
    mkdir_p(outdir)
    # 1. unpack/convert the LRLP tarball
    lrlpcmd = "{script}/one_button_lrlp.py --lexversion {lex} --evalil -t {tarball} -l {lang} -r {dst} --ruby {ruby}".format(script=scriptdir, lex=args.lex, tarball=args.tarball, lang=args.language, dst=args.dst, ruby=args.ruby)
    if args.key is not None:
        lrlpcmd += " -k {} -S set0".format(args.key)
    if args.swap:
        lrlpcmd += " --swap"
    dorun(lrlpcmd, log=outfile, cmdlog=os.path.join(outdir, 'one_button_lrlp.err'))
    # 2. carve the parallel data into the requested sets/sizes
    subcmd = "{script}/subselect_data.py -i {outdir}/parallel -e filtered -l {lang} -s {sizes} -c {sets} -t {script}/incidentvocab".format(script=scriptdir, outdir=outdir, lang=args.language, sets=' '.join(args.sets), sizes=' '.join(args.sizes))
    if args.devset is not None:
        subcmd += " -d {}".format(args.devset)
    if args.allperseg:
        subcmd += " --allperseg"
    dorun(subcmd, log=outfile, cmdlog=os.path.join(outdir, 'subselect_data.err'))
    # 3. package the sets
    # NOTE(review): '-r' is passed twice here ({release} then {outdir});
    # the second occurrence will clobber the first if one_button_package.py
    # uses argparse — confirm the intended flag for the output directory
    pkgcmd = "{script}/one_button_package.py --sets {sets} -l {lang} -y {year} -r {release} -v {version} -r {outdir}".format(script=scriptdir, year=args.year, version=args.version, release=args.release, lang=args.language, outdir=outdir, sets=' '.join(args.sets))
    dorun(pkgcmd, log=outfile, cmdlog=os.path.join(outdir, 'one_button_package.err'))
    # 4. flatten each produced parallel xml and sample 10 lines for review
    subsets = ["train", "rejected"] + args.sets
    for subset in subsets:
        catcmd = "{script}/elisa2flat.py -f FULL_ID_SOURCE SOURCE.id ORIG_SOURCE ORIG_TARGET -i {outdir}/elisa.{lang}-eng.{subset}.y{year}r{release}.v{ver}.xml.gz -o {outdir}/{subset}.tab".format(script=scriptdir, year=args.year, subset=subset, ver=args.version, release=args.release, lang=args.language, outdir=outdir)
        dorun(catcmd, log=outfile)
        sampcmd = "{script}/sample.py -i {outdir}/{subset}.tab -s 10 -o {outdir}/{subset}.samples".format(script=scriptdir, subset=subset, outdir=outdir)
        dorun(sampcmd, log=outfile)
    # 5. same treatment for the monolingual xml
    catcmd = "{script}/elisa2flat.py -f FULL_ID_SOURCE SOURCE.id ORIG_SOURCE -i {outdir}/elisa.{lang}.y{year}r{release}.v{ver}.xml.gz -o {outdir}/{lang}.tab".format(script=scriptdir, year=args.year, ver=args.version, release=args.release, lang=args.language, outdir=outdir)
    dorun(catcmd, log=outfile)
    sampcmd = "{script}/sample.py -i {outdir}/{lang}.tab -s 10 -o {outdir}/{lang}.samples".format(script=scriptdir, lang=args.language, outdir=outdir)
    dorun(sampcmd, log=outfile)
    dorun("ls -l {}".format(outdir))
def main():
    """Fan a rescoring/force-decoding run out over the cluster.

    The data file is split into --splitsize pieces, each piece is rescored
    by its own qsub job, and a final join job (dependent on all scorers)
    concatenates the per-piece scores into --outfile.
    """
    parser = argparse.ArgumentParser(
        description="run rescoring/force decoding over the cluster",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--splitsize", "-z", type=int, default=1, help="number of batches")
    parser.add_argument("--datafile", "-d", type=str, required=True, help="input source tab trg file")
    parser.add_argument("--model", "-m", required=True, help="model file")
    parser.add_argument("--modelnum", "-n", required=True, help="model number")
    parser.add_argument("--logfile", "-l", type=str, default='/dev/null', help="where to log data")
    parser.add_argument("--outfile", "-o", type=str, required=True, help="output scores file")
    parser.add_argument("--workdir", "-w", default=None, help="work directory (defaults to 'work' subdir of outfile")
    parser.add_argument("--extra_rnn_args", help="extra arguments to rnn binary")
    parser.add_argument("--extra_qsub_opts", "-q", help="extra options to qsubrun (scorers only)")
    parser.add_argument("--rescore_single", default=os.path.join(scriptdir, 'rescore_single.py'), help="rescore single script")
    parser.add_argument("--cat", default=os.path.join(scriptdir, 'cat.py'), help="cat with named output")
    parser.add_argument("--rnn_location", default=os.path.join(scriptdir, 'helper_programs', 'ZOPH_RNN'), help="rnn binary")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    # default scratch space lives next to the output file
    if args.workdir is None:
        workdir = os.path.join(os.path.dirname(args.outfile), 'work')
    else:
        workdir = args.workdir
    mkdir_p(workdir)

    qsubopts = "" if args.extra_qsub_opts is None else args.extra_qsub_opts

    # digits needed for zero-padded suffixes; also split's -a suffix length
    fill = len(str(args.splitsize - 1))
    # GNU split: line-balanced chunks with numeric suffixes -> workdir/data.NN
    run(shlex.split("split -a {} -n l/{} -d {} {}/data.".format(fill, args.splitsize, args.datafile, workdir)), check=True)

    jobids = []
    outfiles = []
    for index in range(args.splitsize):
        piece = str(index).zfill(fill)
        df = "{}/data.{}".format(workdir, piece)
        of = "{}/scores.{}".format(workdir, piece)
        outfiles.append(of)
        # one scorer job per piece; qsubrun echoes the job id on stdout
        cmd = "qsubrun -o {workdir}/split.log.{piece} -N split.{piece} {qsub} -- {rescore_single} -m {model} -n {modelnum} -d {df} -o {of} -l {workdir}/inner.log.{piece}".format(qsub=qsubopts, workdir=workdir, piece=piece, rescore_single=args.rescore_single, model=args.model, modelnum=args.modelnum, df=df, of=of)
        sys.stderr.write("Launching {}".format(cmd) + "\n")
        jobid = run(shlex.split(cmd), stdout=PIPE).stdout.decode('utf-8').strip()
        sys.stderr.write("Got {}".format(jobid) + "\n")
        jobids.append(jobid)

    # join job: held until every scorer finishes, then cats piece scores
    joincmd = "qsubrun -q isi -l walltime=24:00:00 -j oe -o {logfile} -N {outfile}.join -W depend=afterok:".format(logfile=args.logfile, outfile=args.outfile)
    joincmd += "{} -- {} -i {} -o {} ".format(':'.join(jobids), args.cat, ' '.join(outfiles), args.outfile)
    sys.stderr.write(joincmd + "\n")
    res = run(shlex.split(joincmd), stdout=PIPE).stdout.decode('utf-8').strip()
    print(res)
def main():
    """HPC launcher: rescore n-best lists with trained models, learn and
    apply rerank weights, then package the decoded outputs.

    For each dataset, submits one rescore job per model number, a convert
    job that adjoins the scores to the original nbest list, then a rerank
    model job on the dev set and apply/package jobs for every set. Commands
    are logged to --outfile; job ids are tracked in the module-level JOBS.
    """
    parser = argparse.ArgumentParser(description="hpc launch to rescore n-best lists with a given model", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--input", "-i", help="input directory containing *.src.hyp, *.trg.ref, weights.final for each set for a language")
    parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file")
    parser.add_argument("--model", "-m", help="path to zoph trained model")
    parser.add_argument("--model_nums", "-n", nargs='+', type=int, default=[1, 2, 3, 4, 5, 6, 7, 8], help="which models to use")
    parser.add_argument("--dev", "-d", type=str, default="dev", help="set to optimize on")
    parser.add_argument("--lang", "-L", required=True, help="language of the training")
    parser.add_argument("--label", "-l", type=str, default="x", help="label for job names")
    parser.add_argument("--eval", "-e", nargs='+', type=str, default=["dev", "test", "syscomb"], help="sets to evaluate on")
    parser.add_argument("--root", "-r", help="path to put outputs")
    parser.add_argument("--qsubopts", default=None, help="additional options to pass to qsub")
    parser.add_argument("--width", "-w", type=int, default=5, help="how many pieces to split each rescore job")
    parser.add_argument("--suffix", "-S", help="goes on the end of final onebest", default="onebest.rerank")
    parser.add_argument("--rescore_single", default=os.path.join(scriptdir, "rescore_split.py"), help="rescore script")
    parser.add_argument("--convert", default=os.path.join(scriptdir, "nmtrescore2sbmtnbest.py"), help="adjoin scores")
    parser.add_argument("--pipeline", default='/home/nlg-02/pust/pipeline-2.22', help="sbmt pipeline")
    parser.add_argument("--runrerank", default='runrerank.sh', help="runrerank script")
    parser.add_argument("--rerankmodel", default=os.path.join('scripts', 'runrerank.py'), help="inner runrerank model script")
    parser.add_argument("--rerankapply", default=os.path.join('scripts', 'applyrerank.py'), help="inner runrerank apply script")
    parser.add_argument("--packagecmd", default=os.path.join(scriptdir, 'packagenmt.sh'), help="package script")
    addonoffarg(parser, 'skipnmt', help="assume nmt results already exist and skip them", default=False)
    # temp scratch dir; removed at exit unless --debug is set
    workdir = tempfile.mkdtemp(prefix=os.path.basename(__file__), dir=os.getenv('TMPDIR', '/tmp'))
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    def cleanwork():
        shutil.rmtree(workdir, ignore_errors=True)
    if args.debug:
        print(workdir)
    else:
        atexit.register(cleanwork)
    outfile = prepfile(args.outfile, 'w')
    mkdir_p(args.root)
    # dev set always participates, even if not listed in --eval
    datasets = set(args.eval)
    datasets.add(args.dev)
    combineids = {}  # dataset -> convert-job id
    adjoins = {}  # dataset -> adjoined nbest path
    qsub = ""
    if args.qsubopts is not None:
        qsub = "--extra_qsub_opts=\"{}\"".format(args.qsubopts)
    global JOBS
    for dataset in datasets:
        jobids = []
        allscores = []
        # rescore submissions; catch jobids
        modelroot = os.path.realpath(args.model)
        for model in args.model_nums:
            data = os.path.realpath(os.path.join(args.input, "{}.src.hyp".format(dataset)))
            scores = os.path.realpath(os.path.join(args.root, "{}.m{}.scores".format(dataset, model)))
            allscores.append(scores)
            # NOTE(review): branch nesting reconstructed from collapsed
            # source — when --skipnmt is set, only the scores file is
            # checked and no rescore job is submitted; confirm intent
            if args.skipnmt:
                if not os.path.exists(scores):
                    sys.stderr.write("ERROR: Skipping nmt but {} does not exist\n".format(scores))
                    sys.exit(1)
            elif not os.path.exists(data):
                sys.stderr.write("ERROR: {} does not exist\n".format(data))
                sys.exit(1)
            elif not os.path.exists(modelroot):  #TODO: also check model number!
                sys.stderr.write("ERROR: {} does not exist\n".format(modelroot))
                sys.exit(1)
            else:
                log = os.path.realpath(os.path.join(args.root, "{}.m{}.log".format(dataset, model)))
                cmd = "{rescore} {qsub} --workdir {root}/{dataset} --splitsize {width} --model {modelroot} --modelnum {model} --datafile {data} --outfile {scores} --logfile {log}".format(qsub=qsub, model=model, rescore=args.rescore_single, modelroot=modelroot, data=data, root=args.root, dataset=dataset, scores=scores, width=args.width, log=log)
                outfile.write(cmd + "\n")
                # rescore script prints the id of its join job on stdout
                job = check_output(shlex.split(cmd)).decode('utf-8').strip()
                JOBS.add(job)
                jobids.append(job)
        # combine rescores and paste in previous nbests;
        # dependency clause only if any rescore jobs were actually submitted
        jobidstr = "-W depend=afterok:" + ':'.join(jobids) if len(jobids) >= 1 else ""
        scorestr = ' '.join(allscores)
        nbest = os.path.join(args.input, "{}.nbest".format(dataset))
        adjoin = os.path.join(args.root, "{}.adjoin.{}".format(dataset, args.suffix))
        adjoins[dataset] = adjoin
        cmd = "qsubrun -j oe -o {root}/{dataset}.convert.monitor -N {label}.{dataset}.convert {jobidstr} -- {convert} -i {scorestr} -a {nbest} -o {adjoin}".format(root=args.root, dataset=dataset, jobidstr=jobidstr, convert=args.convert, scorestr=scorestr, nbest=nbest, adjoin=adjoin, label=args.label)
        outfile.write(cmd + "\n")
        job = check_output(shlex.split(cmd)).decode('utf-8').strip()
        JOBS.add(job)
        combineids[dataset] = job
    # the rerank model
    rerankweights = "{}/rerank.weights".format(args.root)
    # train rerank weights on the dev set (depends on its convert job)
    modeljob = _rerankmodel(adjoins[args.dev], combineids[args.dev], rerankweights, outfile, args)
    # apply and package
    for dataset in datasets:
        orig = os.path.join(args.input, "{}.src.orig".format(dataset))
        tstmaster = os.path.join(args.input, "*.{}.*.xml.gz".format(dataset))
        decodefile = "{}/{}.decode".format(args.root, dataset)
        applyid = _applyrerank(dataset, adjoins[dataset], ':'.join([modeljob, combineids[dataset]]), rerankweights, decodefile, outfile, args)
        _package(dataset, applyid, orig, tstmaster, decodefile, outfile, args)
    # (TODO: run bleu)
    # no more atexit job deletion
    # NOTE(review): JOBS was used as a set above (.add); reassigning to a
    # list here presumably just signals the atexit handler to skip qdel —
    # confirm against the module-level JOBS definition
    JOBS = []