def train(args):
    option = default_option()

    # predefined model names
    pathname, basename = os.path.split(args.model)
    modelname = get_filename(basename)
    autoname_format = os.path.join(pathname, modelname + ".iter{epoch}-{batch}.pkl")
    bestname = os.path.join(pathname, modelname + ".best.pkl")

    # load models
    if os.path.exists(args.model):
        opt, params = load_model(args.model)
        override(option, opt)
        init = False
    else:
        init = True

    if args.initialize:
        print "initialize:", args.initialize
        pretrain_params = load_model(args.initialize)
        pretrain_params = pretrain_params[1]
        pretrain = True
    else:
        pretrain = False

    override(option, args_to_dict(args))

    # check external validation script
    ext_val_script = option['ext_val_script']
    if not os.path.exists(ext_val_script):
        raise ValueError("File doesn't exist: %s" % ext_val_script)
    elif not os.access(ext_val_script, os.X_OK):
        raise ValueError("File is not executable: %s" % ext_val_script)

    # check references format
    ref_stem = option['references']
    if option['validation'] and option['references']:
        ref_stem = misc.infer_ref_stem([option['validation']], option['references'])
        ref_stem = ref_stem[0]

    # .yaml for ultimate options
    yaml_name = "%s.settings.yaml" % modelname
    if init or not os.path.exists(yaml_name):
        with open(yaml_name, "w") as w:
            _opt = args.__dict__.copy()
            for k, v in _opt.iteritems():
                if k in option:
                    _opt[k] = option[k]
            yaml.dump(_opt, w, default_flow_style=False)
            del _opt

    print_option(option)

    # reader: first three corpus files feed the training stream,
    # the rest feed the domain-classification stream
    batch = option["batch"]
    sortk = option["sort"]
    shuffle = option["shuffle"]
    reader = textreader(option["corpus"][:3], shuffle)
    processor = [data_length, data_length, data_length]
    stream = textiterator(reader, [batch, batch * sortk], processor,
                          option["limit"], option["sort"])

    reader = textreader(option["corpus"][3:], shuffle)
    processor = [data_length, data_length, data_length]
    dstream = textiterator(reader, [batch, batch * sortk], processor,
                           None, option["sort"])

    # progress
    # initialize before building model
    progress = Progress(option["delay_val"], stream, option["seed"])

    # create model
    regularizer = []

    if option["l1_scale"]:
        regularizer.append(ops.l1_regularizer(option["l1_scale"]))

    if option["l2_scale"]:
        regularizer.append(ops.l2_regularizer(option["l2_scale"]))

    scale = option["scale"]
    initializer = ops.random_uniform_initializer(-scale, scale)
    regularizer = ops.sum_regularizer(regularizer)

    option["scope"] = "rnnsearch"

    model = build_model(initializer=initializer, regularizer=regularizer,
                        **option)

    variables = None

    if pretrain:
        matched, not_matched = match_variables(ops.trainable_variables(),
                                               pretrain_params)
        if args.finetune:
            variables = not_matched
            if not variables:
                raise RuntimeError("no variables to finetune")

    if pretrain:
        restore_variables(matched, not_matched)

    if not init:
        set_variables(ops.trainable_variables(), params)

    print "parameters: %d\n" % count_parameters(ops.trainable_variables())

    # tuning option
    tune_opt = {}
    tune_opt["algorithm"] = option["optimizer"]
    tune_opt["constraint"] = ("norm", option["norm"])
    tune_opt["norm"] = True
    tune_opt["variables"] = variables

    # create optimizers: the main trainer covers everything outside the
    # Shared scope, the classifier trainer covers only the Shared scope
    scopes = ["((?!Shared).)*$"]
    trainer = optimizer(model.inputs, model.outputs, model.cost, scopes,
                        **tune_opt)
    clascopes = [".*(Shared).*"]
    clatrainer = optimizer(model.inputs_cla, model.outputs_cla,
                           model.cost_cla, clascopes, **tune_opt)

    # scopes = [".*(DSAenc).*"]
    # domain_trainer = optimizer(model.inputs, model.toutputs,
    #                            model.domaincost, scopes, **tune_opt)

    # vocabulary and special symbols
    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs

    unk_sym = option["unk"]
    eos_sym = option["eos"]

    alpha = option["alpha"]
    maxepoch = option["maxepoch"]

    # restore right before training to avoid randomness changing when
    # trying to resume progress
    if not args.reset:
        if "#progress" in option:
            print 'Restore progress >>'
            progress = option["#progress"]
            stream = progress.iterator
            stream.set_processor(processor)
        else:
            print 'New progress >>'
    else:
        print 'Discard progress >>'

    if args.drop_tasks:
        print 'drop tasks'
        progress.drop_tasks()

    # setup progress
    progress.oldname = args.model
    progress.serializer = serialize

    stream = progress.iterator
    overwrite = not args.no_overwrite

    if progress.task_manager:
        print progress.task_manager

    register_killer()

    tagvocab = {}
    for idx, d in enumerate(option["dvocab"]):
        tagvocab[d] = idx

    if len(tagvocab) != option["dnum"]:
        raise ValueError('length of domain vocab %d not equal to domain num %d!'
                         % (len(tagvocab), option["dnum"]))

    try:
        while progress.epoch < maxepoch:
            epc = progress.epoch
            for data in stream:
                progress.tic()
                if progress.failed():
                    raise RuntimeError("progress failure")
                # data = _stream.next()
                xdata, xmask = convert_data(data[0], svocab, unk_sym, eos_sym)
                ydata, ymask = convert_data(data[1], tvocab, unk_sym, eos_sym)
                tag = convert_tag(data[2], tagvocab)

                t1 = time.time()
                cost, dcost, scost, tdcost, norm = trainer.optimize(
                    xdata, xmask, ydata, ymask, tag)
                clacost, _ = clatrainer.optimize(xdata, xmask, tag)
                trainer.update(alpha=alpha)
                clatrainer.update(alpha=alpha)
                t2 = time.time()

                # per-word cost: optimize() returns the batch-mean sentence
                # cost, so rescale by (batch size / total target tokens)
                w_cost = cost * ymask.shape[1] / ymask.sum()

                progress.batch_count += 1
                progress.batch_total += 1
                progress.loss_hist.append(w_cost)

                count = progress.batch_count

                if not args.pfreq or count % args.pfreq == 0:
                    print epc + 1, progress.batch_count, w_cost, dcost, \
                        tdcost, scost, clacost, norm, t2 - t1

                if count % option["sfreq"] == 0:
                    # measure tagging accuracy on the domain stream
                    dright = 0.0
                    sright = 0.0
                    tdright = 0.0
                    total = 0.0
                    for ddata in dstream:
                        txdata, txmask = convert_data(ddata[0], svocab,
                                                      unk_sym, eos_sym)
                        tydata, tymask = convert_data(ddata[1], tvocab,
                                                      unk_sym, eos_sym)
                        txtag = convert_tag(ddata[2], tagvocab)
                        dtag_pred, stag_pred = model.tag_predict(txdata, txmask)
                        txtag = txtag[0]
                        dpretag = [int(i) for i in dtag_pred]
                        spretag = [int(i) for i in stag_pred]
                        tdtag_pred = model.tgt_tag_predict(txdata, txmask,
                                                           tydata, tymask)
                        tdpretag = [int(i) for i in tdtag_pred[0]]
                        dright += sum([m == n for m, n in zip(txtag, dpretag)])
                        sright += sum([m == n for m, n in zip(txtag, spretag)])
                        tdright += sum([m == n for m, n in zip(txtag, tdpretag)])
                        total += len(dpretag)
                    dstream.reset()
                    dacc = dright * 1.0 / total
                    sacc = sright * 1.0 / total
                    tdacc = tdright * 1.0 / total
                    print "dacc:", dright, dacc
                    print "sacc:", sright, sacc
                    print "tdacc:", tdright, tdacc

                if count % option["vfreq"] == 0 and not should_skip_val(
                        args.skip_val, option["vfreq"], epc,
                        progress.batch_total):
                    if option["validation"] and option["references"]:
                        progress.add_valid(option['scope'],
                                           option['validation'], ref_stem,
                                           ext_val_script, __file__, option,
                                           modelname, bestname, serialize)

                # save after validation
                progress.toc()

                if count % option["freq"] == 0:
                    progress.save(option, autoname_format, overwrite)

                progress.tic()

                if count % option["sfreq"] == 0:
                    # decode one random sample from the current batch
                    n = len(data[0])
                    ind = numpy.random.randint(0, n)
                    sdata = data[0][ind]
                    tdata = data[1][ind]
                    xdata = xdata[:, ind:ind + 1]
                    xmask = xmask[:, ind:ind + 1]
                    hls = beamsearch(model, xdata, xmask)
                    best, score = hls[0]
                    print "--", sdata
                    print "--", tdata
                    print "--", " ".join(best[:-1])
                progress.toc()

            print "--------------------------------------------------"
            progress.tic()
            if option["validation"] and option["references"]:
                progress.add_valid(option['scope'], option['validation'],
                                   ref_stem, ext_val_script, __file__, option,
                                   modelname, bestname, serialize)
            print "--------------------------------------------------"
            progress.toc()

            print "epoch cost {}".format(numpy.mean(progress.loss_hist))
            progress.loss_hist = []

            # early stopping
            if epc + 1 >= option["stop"]:
                alpha = alpha * option["decay"]

            stream.reset()

            progress.epoch += 1
            progress.batch_count = 0
            # update autosave
            option["alpha"] = alpha
            progress.save(option, autoname_format, overwrite)

        stream.close()

        progress.tic()
        print "syncing ..."
        progress.barrier()  # hang up and wait
        progress.toc()

        best_valid = max(progress.valid_hist, key=lambda item: item[1])
        (epc, count), score = best_valid

        print "best bleu {}-{}: {:.4f}".format(epc + 1, count, score)

        if progress.delay_val:
            task_elapse = sum([task.elapse
                               for task in progress.task_manager.tasks])
            print "training finished in {}({})".format(
                datetime.timedelta(seconds=int(progress.elapse)),
                datetime.timedelta(seconds=int(progress.elapse + task_elapse)))
        else:
            print "training finished in {}".format(
                datetime.timedelta(seconds=int(progress.elapse)))

        progress.save(option, autoname_format, overwrite)

    except KeyboardInterrupt:
        traceback.print_exc()
        progress.terminate()
        sys.exit(1)
    except Exception:
        traceback.print_exc()
        progress.terminate()
        sys.exit(1)
def train(args):
    option = default_option()

    # predefined model names
    pathname, basename = os.path.split(args.model)
    modelname = get_filename(basename)
    autoname_format = os.path.join(pathname, modelname + ".iter{epoch}-{batch}.pkl")
    bestname = os.path.join(pathname, modelname + ".best.pkl")

    # load models
    if os.path.exists(args.model):
        opt, params = load_model(args.model)
        override(option, opt)
        init = False
    else:
        init = True

    if args.initialize:
        pretrain_params = load_model(args.initialize)
        pretrain_params = pretrain_params[1]
        pretrain = True
    else:
        pretrain = False

    override(option, args_to_dict(args))

    # check external validation script
    ext_val_script = option['ext_val_script']
    if not os.path.exists(ext_val_script):
        raise ValueError("File doesn't exist: %s" % ext_val_script)
    elif not os.access(ext_val_script, os.X_OK):
        raise ValueError("File is not executable: %s" % ext_val_script)

    # check references format
    ref_stem = None
    if option['validation'] and option['references']:
        ref_stem = misc.infer_ref_stem([option['validation']], option['references'])
        ref_stem = ref_stem[0]

    # .yaml for ultimate options
    yaml_name = "%s.settings.yaml" % modelname
    if init or not os.path.exists(yaml_name):
        with open(yaml_name, "w") as w:
            _opt = args.__dict__.copy()
            for k, v in _opt.iteritems():
                if k in option:
                    _opt[k] = option[k]
            yaml.dump(_opt, w, default_flow_style=False)
            del _opt

    print_option(option)

    # reader
    batch = option["batch"]
    sortk = option["sort"]
    shuffle = option["shuffle"]
    reader = textreader(option["corpus"], shuffle)
    processor = [data_length, data_length]
    stream = textiterator(reader, [batch, batch * sortk], processor,
                          option["limit"], option["sort"])

    # progress
    # initialize before building model
    progress = Progress(option["delay_val"], stream, option["seed"])

    # create model
    regularizer = []

    if option["l1_scale"]:
        regularizer.append(ops.l1_regularizer(option["l1_scale"]))

    if option["l2_scale"]:
        regularizer.append(ops.l2_regularizer(option["l2_scale"]))

    scale = option["scale"]
    initializer = ops.random_uniform_initializer(-scale, scale)
    regularizer = ops.sum_regularizer(regularizer)

    option["scope"] = "rnnsearch"

    model = build_model(initializer=initializer, regularizer=regularizer,
                        **option)

    variables = None

    if pretrain:
        print "using pretrain"
        # re-key the pretrained parameters: embeddings keep their full
        # scoped name, everything else drops the top-level scope
        _pp1 = {}
        for name, val in pretrain_params:
            names = name.split('/')[1:]
            if "embedding" in names[0]:
                _pp1['/'.join(names)] = val
            else:
                _pp1['/'.join(names[1:])] = val

        matched = []
        not_matched = []
        for var in ops.trainable_variables():
            names = var.name.split('/')[1:]
            if "decoder2" in var.name:
                not_matched.append((var.name, var.get_value().size))
                continue
            if "embedding" in names[0]:
                match_name = '/'.join(names)
                var.set_value(_pp1[match_name])
            else:
                match_name = '/'.join(names[1:])
                var.set_value(_pp1[match_name])
            matched.append((var.name, var.get_value().size))

        print "------------------- matched -------------------"
        for name, size in matched:
            print name, size
        print "------------------- not matched -------------------"
        for name, size in not_matched:
            print name, size
        print "------------------- end -------------------\n"

    if not init:
        set_variables(ops.trainable_variables(), params)

    print "parameters: %d\n" % count_parameters(ops.trainable_variables())

    # tuning option
    tune_opt = {}
    tune_opt["algorithm"] = option["optimizer"]
    tune_opt["constraint"] = ("norm", option["norm"])
    tune_opt["norm"] = True
    tune_opt["variables"] = variables

    # create optimizer
    scopes = [".*"]
    trainer = optimizer(model.inputs, model.outputs, model.cost, scopes,
                        **tune_opt)

    # vocabulary and special symbols
    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs

    unk_sym = option["unk"]
    eos_sym = option["eos"]

    alpha = option["alpha"]
    maxepoch = option["maxepoch"]

    # restore right before training to avoid randomness changing when
    # trying to resume progress
    if not args.reset:
        if "#progress" in option:
            print 'Restore progress >>'
            progress = option["#progress"]
            stream = progress.iterator
            stream.set_processor(processor)
            for ttt in progress.task_manager.tasks:
                ttt.status = 4
                ttt.result = 0.0
        else:
            print 'New progress >>'
    else:
        print 'Discard progress >>'

    # setup progress
    progress.oldname = args.model
    progress.serializer = serialize

    stream = progress.iterator
    overwrite = not args.no_overwrite

    if progress.task_manager:
        print progress.task_manager

    try:
        while progress.epoch < maxepoch:
            epc = progress.epoch
            for data in stream:
                progress.tic()
                if progress.failed():
                    raise RuntimeError("progress failure")

                xdata, xmask = convert_data(data[0], svocab, unk_sym, eos_sym)
                ydata, ymask = convert_data(data[1], tvocab, unk_sym, eos_sym)
                bydata, _ = convert_data(data[1], tvocab, unk_sym, eos_sym, True)

                t1 = time.time()
                tot_cost, soft_cost, true_cost, norm = trainer.optimize(
                    xdata, xmask, ydata, ymask, bydata)
                trainer.update(alpha=alpha)
                t2 = time.time()

                # per-word cost: optimize() returns the batch-mean sentence
                # cost, so rescale by (batch size / total target tokens)
                w_cost = true_cost * ymask.shape[1] / ymask.sum()

                progress.batch_count += 1
                progress.batch_total += 1
                progress.loss_hist.append(w_cost)

                count = progress.batch_count

                if not args.pfreq or count % args.pfreq == 0:
                    print epc + 1, progress.batch_count, w_cost, tot_cost, \
                        soft_cost, true_cost, norm, t2 - t1

                if count % option["vfreq"] == 0 and not should_skip_val(
                        args.skip_val, option["vfreq"], epc,
                        progress.batch_total):
                    if option["validation"] and option["references"]:
                        progress.add_valid(option['scope'],
                                           option['validation'], ref_stem,
                                           ext_val_script, __file__, option,
                                           modelname, bestname, serialize)

                # save after validation
                progress.toc()

                if count % option["freq"] == 0:
                    progress.save(option, autoname_format, overwrite)

                progress.tic()

                if count % option["sfreq"] == 0:
                    # decode one random sample from the current batch
                    n = len(data[0])
                    ind = numpy.random.randint(0, n)
                    sdata = data[0][ind]
                    tdata = data[1][ind]
                    xdata = xdata[:, ind:ind + 1]
                    xmask = xmask[:, ind:ind + 1]
                    hls = beamsearch(model, xdata, xmask)
                    best, score = hls[0]
                    print "--", sdata
                    print "--", tdata
                    print "--", " ".join(best[:-1])
                progress.toc()

            print "--------------------------------------------------"
            progress.tic()
            if option["validation"] and option["references"]:
                progress.add_valid(option['scope'], option['validation'],
                                   ref_stem, ext_val_script, __file__, option,
                                   modelname, bestname, serialize)
            print "--------------------------------------------------"
            progress.toc()

            # early stopping
            if epc + 1 >= option["stop"]:
                alpha = alpha * option["decay"]

            stream.reset()

            progress.epoch += 1
            progress.batch_count = 0
            # update autosave
            option["alpha"] = alpha
            progress.save(option, autoname_format, overwrite)

        stream.close()

        progress.tic()
        print "syncing ..."
        progress.barrier()  # hang up and wait
        progress.toc()

        best_valid = max(progress.valid_hist, key=lambda item: item[1])
        (epc, count), score = best_valid

        print "best bleu {}-{}: {:.4f}".format(epc + 1, count, score)

        if progress.delay_val:
            task_elapse = sum([task.elapse
                               for task in progress.task_manager.tasks])
            print "training finished in {}({})".format(
                datetime.timedelta(seconds=int(progress.elapse)),
                datetime.timedelta(seconds=int(progress.elapse + task_elapse)))
        else:
            print "training finished in {}".format(
                datetime.timedelta(seconds=int(progress.elapse)))

        progress.save(option, autoname_format, overwrite)

    except KeyboardInterrupt:
        traceback.print_exc()
        progress.terminate()
        sys.exit(1)
    except Exception:
        traceback.print_exc()
        progress.terminate()
        sys.exit(1)
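
# NOTE: a minimal sketch of the batch layout `convert_data` is assumed to
# produce, inferred from how its outputs are used above (time-major
# [maxlen, batch] id/mask arrays, per-sentence column slicing, and
# ymask.sum() as the total token count); the real helper is defined
# elsewhere in this repo and also supports the extra flag used for `bydata`.
def _convert_data_sketch(sentences, vocab, unk_sym, eos_sym):
    import numpy
    # map words to ids, falling back to the unk symbol, and append eos
    seqs = [[vocab.get(w, vocab[unk_sym]) for w in sent.split()] + [vocab[eos_sym]]
            for sent in sentences]
    maxlen = max(len(seq) for seq in seqs)
    data = numpy.zeros((maxlen, len(seqs)), "int32")
    mask = numpy.zeros((maxlen, len(seqs)), "float32")
    for j, seq in enumerate(seqs):
        data[:len(seq), j] = seq   # zero-pad past each sentence's end
        mask[:len(seq), j] = 1.0   # mask marks real tokens (incl. eos)
    return data, mask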
def train(args):
    option = default_option()

    # predefined model names
    pathname, basename = os.path.split(args.model)
    modelname = get_filename(basename)
    autoname = os.path.join(pathname, modelname + ".autosave.pkl")
    bestname = os.path.join(pathname, modelname + ".best.pkl")

    # load models
    if os.path.exists(args.model):
        opt, params = load_model(args.model)
        option = opt
        init = False
    else:
        init = True

    if args.initialize:
        init_params = load_model(args.initialize)
        init_params = init_params[1]
        restore = True
    else:
        restore = False

    override(option, args_to_dict(args))
    print_option(option)

    # load references
    if option["references"]:
        references = load_references(option["references"])
    else:
        references = None

    if args.skip_val:
        references = None

    criterion = option["criterion"]

    if criterion == "mrt":
        sys.stderr.write("warning: In MRT mode, batch is set to 1\n")

    # input corpus; MRT forces batch and sort window to 1
    batch = option["batch"] if criterion == "mle" else 1
    sortk = (option["sort"] or 1) if criterion == "mle" else 1
    shuffle = option["seed"] if option["shuffle"] else None
    reader = textreader(option["corpus"], shuffle)
    processor = [data_length, data_length]
    stream = textiterator(reader, [batch, batch * sortk], processor,
                          option["limit"], option["sort"])

    if shuffle and option["indices"] is not None:
        reader.set_indices(option["indices"])

    if args.reset:
        option["count"] = [0, 0]
        option["epoch"] = 0
        option["cost"] = 0.0

    skip_stream(reader, option["count"][1])
    epoch = option["epoch"]
    maxepoch = option["maxepoch"]

    # create model
    regularizer = []

    if option["l1_scale"]:
        regularizer.append(ops.l1_regularizer(option["l1_scale"]))

    if option["l2_scale"]:
        regularizer.append(ops.l2_regularizer(option["l2_scale"]))

    scale = option["scale"]
    initializer = ops.random_uniform_initializer(-scale, scale)
    regularizer = ops.sum_regularizer(regularizer)

    # set seed
    numpy.random.seed(option["seed"])

    model = rnnsearch(initializer=initializer, regularizer=regularizer,
                      **option)

    variables = None

    if restore:
        matched, not_matched = match_variables(ops.trainable_variables(),
                                               init_params)
        if args.finetune:
            variables = not_matched
            if not variables:
                raise RuntimeError("no variables to finetune")

    if not init:
        set_variables(ops.trainable_variables(), params)

    if restore:
        restore_variables(matched, not_matched)

    print "parameters:", count_parameters(ops.trainable_variables())

    # tuning option
    tune_opt = {}
    tune_opt["algorithm"] = option["optimizer"]
    tune_opt["constraint"] = ("norm", option["norm"])
    tune_opt["norm"] = True
    tune_opt["variables"] = variables

    # create optimizer
    trainer = optimizer(model, **tune_opt)

    # beamsearch option
    search_opt = {}
    search_opt["beamsize"] = option["beamsize"]
    search_opt["normalize"] = option["normalize"]
    search_opt["maxlen"] = option["maxlen"]
    search_opt["minlen"] = option["minlen"]

    # vocabulary and special symbols
    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs

    unk_sym = option["unk"]
    eos_sym = option["eos"]

    # summary
    count = option["count"][0]
    totcost = option["cost"]
    best_score = option["bleu"]
    alpha = option["alpha"]
    sharp = option["sharp"]

    for i in range(epoch, maxepoch):
        for data in stream:
            xdata, xmask = convert_data(data[0], svocab, unk_sym, eos_sym)
            ydata, ymask = convert_data(data[1], tvocab, unk_sym, eos_sym)

            if criterion == "mrt":
                # replace OOV words in the reference with the unk symbol
                refs = []
                for item in data[1]:
                    item = item.split()
                    item = [unk_sym if word not in tvocab else word
                            for word in item]
                    refs.append(" ".join(item))

                t1 = time.time()

                # sample from model
                nsample = option["sample"] - len(refs)
                xdata = numpy.repeat(xdata, nsample, 1)
                xmask = numpy.repeat(xmask, nsample, 1)
                maxlen = int(1.5 * len(ydata))
                examples = batchsample(model, xdata, xmask, maxlen)
                space = build_sample_space(refs, examples)
                score = numpy.zeros((len(space),), "float32")

                refs = [ref.split() for ref in refs]

                # risk of each candidate is 1 - smoothed sentence BLEU
                for j in range(len(space)):
                    example = space[j].split()
                    score[j] = 1.0 - bleu([example], [refs], smoothing=True)

                ydata, ymask = convert_data(space, tvocab, unk_sym, eos_sym)
                cost, norm = trainer.optimize(xdata[:, 0:1], xmask[:, 0:1],
                                              ydata, ymask, score, sharp)
                trainer.update(alpha=alpha)
                t2 = time.time()

                totcost += cost
                count += 1
                t = t2 - t1
                ac = totcost / count
                print i + 1, count, len(space), cost, norm, ac, t
            else:
                t1 = time.time()
                cost, norm = trainer.optimize(xdata, xmask, ydata, ymask)
                trainer.update(alpha=alpha)
                t2 = time.time()

                count += 1
                # per-word cost, accumulated in bits (divide by log 2)
                cost = cost * ymask.shape[1] / ymask.sum()
                totcost += cost / math.log(2)
                print i + 1, count, cost, norm, t2 - t1

            # autosave
            if count % option["freq"] == 0:
                option["indices"] = reader.get_indices()
                option["bleu"] = best_score
                option["cost"] = totcost
                option["count"] = [count, reader.count]
                serialize(autoname, option)

            if count % option["vfreq"] == 0:
                if option["validation"] and references:
                    trans = translate(model, option["validation"],
                                      **search_opt)
                    bleu_score = bleu(trans, references)
                    print "bleu: %2.4f" % bleu_score
                    if bleu_score > best_score:
                        best_score = bleu_score
                        option["indices"] = reader.get_indices()
                        option["bleu"] = best_score
                        option["cost"] = totcost
                        option["count"] = [count, reader.count]
                        serialize(bestname, option)

            if count % option["sfreq"] == 0:
                # decode one random sample from the current batch
                n = len(data[0])
                ind = numpy.random.randint(0, n)
                sdata = data[0][ind]
                tdata = data[1][ind]
                xdata = xdata[:, ind:ind + 1]
                xmask = xmask[:, ind:ind + 1]
                hls = beamsearch(model, xdata, xmask)
                best, score = hls[0]
                print sdata
                print tdata
                print " ".join(best[:-1])
                print "--------------------------------------------------"

        if option["validation"] and references:
            trans = translate(model, option["validation"], **search_opt)
            bleu_score = bleu(trans, references)
            print "iter: %d, bleu: %2.4f" % (i + 1, bleu_score)
            if bleu_score > best_score:
                best_score = bleu_score
                option["indices"] = reader.get_indices()
                option["bleu"] = best_score
                option["cost"] = totcost
                option["count"] = [count, reader.count]
                serialize(bestname, option)

        print "averaged cost: ", totcost / count
        print "--------------------------------------------------"

        # early stopping
        if i + 1 >= option["stop"]:
            alpha = alpha * option["decay"]

        count = 0
        totcost = 0.0
        stream.reset()

        # update autosave
        option["epoch"] = i + 1
        option["alpha"] = alpha
        option["indices"] = reader.get_indices()
        option["bleu"] = best_score
        option["cost"] = totcost
        option["count"] = [0, 0]
        serialize(autoname, option)

    print "best(bleu): %2.4f" % best_score

    stream.close()
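
# NOTE: a minimal sketch of how `build_sample_space` is assumed to behave in
# the MRT branch above: merge the reference(s) and the sampled candidates
# into one deduplicated candidate list, which is why `nsample` subtracts
# len(refs) from the configured sample size. It assumes `examples` are
# plain strings like `refs`; the real helper is defined elsewhere in this
# repo and may differ.
def _build_sample_space_sketch(refs, examples):
    space = list(refs)
    seen = set(refs)
    for sent in examples:
        if sent not in seen:
            seen.add(sent)
            space.append(sent)
    return space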