def load_dictionaries(self, dictionary_files, n_words_src=None, n_words_trg=None):
    """
    Load the input dictionaries and output dictionary for a model.

    Note the `n_words_src` and `n_words_trg` kwargs are here to maintain
    compatibility with the dictionary loading logic in Nematus.

    Args:
        dictionary_files: list of strings which are paths to *.json Nematus
            dictionary files

    Returns:
        dict with keys 'input_dicts', 'input_idicts', 'output_dict',
        'output_idict', 'src_size', 'trg_size'
    """

    def load_utf8_dict(filename):
        with codecs.open(filename, 'rb', encoding='utf8') as f:
            return {k.encode('utf8'): v for k, v in json.load(f).items()}

    input_dict_files = dictionary_files[:-1]
    output_dict_file = dictionary_files[-1]

    # load source dictionaries and invert them
    input_dicts = []
    input_idicts = []
    for dictionary in input_dict_files:
        input_dict = load_utf8_dict(dictionary) if self.is_utf8(
            dictionary) else load_dict(dictionary)
        if n_words_src is not None:
            # truncate the vocabulary to the n_words_src most frequent entries
            for key, idx in input_dict.items():
                if idx >= n_words_src:
                    del input_dict[key]
        input_idict = dict()
        for kk, vv in input_dict.iteritems():
            input_idict[vv] = kk
        input_idict[0] = '<eos>'
        input_idict[1] = 'UNK'
        input_dicts.append(input_dict)
        input_idicts.append(input_idict)

    # load target dictionary and invert it
    output_dict = load_utf8_dict(output_dict_file) if self.is_utf8(
        output_dict_file) else load_dict(output_dict_file)
    if n_words_trg is not None:
        for key, idx in output_dict.items():
            if idx >= n_words_trg:
                del output_dict[key]
    output_idict = dict()
    for kk, vv in output_dict.iteritems():
        output_idict[vv] = kk
    output_idict[0] = '<eos>'
    output_idict[1] = 'UNK'

    return {
        'input_dicts': input_dicts,
        'input_idicts': input_idicts,
        'output_dict': output_dict,
        'output_idict': output_idict,
        'src_size': n_words_src,
        'trg_size': n_words_trg
    }
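
# Hedged usage sketch (not part of the original code): shows how
# load_dictionaries might be called on an object that exposes it. The `model`
# argument, the file paths, and the token 'the' are hypothetical placeholders.
def _example_load_dictionaries(model):
    dicts = model.load_dictionaries(
        ['data/train.bpe.en.json', 'data/train.bpe.de.json'],
        n_words_src=40000, n_words_trg=40000)
    # map a source token to its index and back again
    src_dict = dicts['input_dicts'][0]
    src_idict = dicts['input_idicts'][0]
    idx = src_dict.get('the', 1)  # index 1 is reserved for UNK
    print idx, src_idict.get(idx, 'UNK')
    return dicts
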
def __init__(self, source, target, hter, source_dict, target_dict,
             batch_size=128, maxlen=100,
             n_words_source=-1, n_words_target=-1,
             shuffle_each_epoch=False, sort_by_length=False,
             maxibatch_size=20):
    # shuffle the parallel files (source, target, HTER scores) at every epoch
    if shuffle_each_epoch:
        shuffle.main([source, target, hter])
        self.source = fopen(source + '.shuf')
        self.target = fopen(target + '.shuf')
        self.hter = fopen(hter + '.shuf')
    else:
        self.source = fopen(source)
        self.target = fopen(target)
        self.hter = fopen(hter)
    self.source_dict = load_dict(source_dict)
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # truncate the vocabularies to the requested sizes
    if self.n_words_source > 0:
        for key, idx in self.source_dict.items():
            if idx >= self.n_words_source:
                del self.source_dict[key]
    if self.n_words_target > 0:
        for key, idx in self.target_dict.items():
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.hter_buffer = []
    self.k = batch_size * maxibatch_size

    self.end_of_data = False
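
# Hedged usage sketch (not part of the original code): builds the iterator
# over parallel source/target/HTER files and walks one minibatch. The file
# paths are placeholders taken from the defaults used elsewhere in this code.
def _example_text_iterator():
    train_iter = TextIterator('tuning/train.bpe.en', 'tuning/train.bpe.de',
                              'tuning/train.hter',
                              'data/train.bpe.en.json',
                              'data/train.bpe.de.json',
                              batch_size=80, maxlen=100,
                              n_words_source=40000, n_words_target=40000,
                              shuffle_each_epoch=False)
    for x, y, hter in train_iter:
        # x and y are lists of token-index sequences, hter a list of scores
        print len(x), len(y), len(hter)
        break
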
def initialize(model_options, pyro_port, pyro_name, pyro_key):
    if model_options['dim_per_factor'] is None:
        if model_options['factors'] == 1:
            model_options['dim_per_factor'] = [model_options['dim_word']]
        else:
            sys.stderr.write(
                'Error: if using factored input, you must specify \'dim_per_factor\'\n'
            )
            sys.exit(1)

    # one dictionary per source factor + 1 for the target
    assert (len(model_options['dictionaries']) == model_options['factors'] + 1)
    # each factor embedding has its own dimensionality
    assert (len(model_options['dim_per_factor']) == model_options['factors'])
    # the factor embedding dimensionalities sum to the total dimensionality of
    # the input embedding vector
    assert (sum(model_options['dim_per_factor']) == model_options['dim_word'])

    # load dictionaries and invert them
    worddicts = [None] * len(model_options['dictionaries'])
    for ii, dd in enumerate(model_options['dictionaries']):
        worddicts[ii] = load_dict(dd)

    if model_options['n_words_src'] is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if model_options['n_words'] is None:
        n_words = len(worddicts[1])
        model_options['n_words'] = n_words

    print 'Initializing remote theano server'
    # In order to transfer numpy objects across the network, pickle must be
    # used as the Pyro serializer. This also requires various environment
    # flags (PYRO_SERIALIZERS_ACCEPTED, PYRO_SERIALIZER) to be set for both
    # the name server and the server.
    Pyro4.config.SERIALIZER = 'pickle'
    Pyro4.config.NS_PORT = pyro_port
    remote = Pyro4.Proxy("PYRONAME:{0}".format(pyro_name))
    remote._pyroHmacKey = pyro_key
    remote.init(model_options)
    return remote
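
# Hedged usage sketch (not part of the original code): one way initialize()
# might be driven. The port, Pyro object name, and HMAC key are placeholders;
# the environment flags mirror the ones mentioned in the comment above and
# must also be exported for the Pyro name server process.
def _example_initialize(model_options):
    import os
    os.environ['PYRO_SERIALIZERS_ACCEPTED'] = 'pickle'
    os.environ['PYRO_SERIALIZER'] = 'pickle'
    remote = initialize(model_options,
                        pyro_port=9090,
                        pyro_name='nmt.server',
                        pyro_key='secret-hmac-key')
    return remote
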
def train(
        batch_size=80,
        valid_batch_size=80,
        dim=100,
        dim_word=500,
        dispFreq=100,
        saveFreq=3000,
        validFreq=1000,
        saveto='RNN_model/wmt17.en-de.npz',
        datasets=['tuning/train.bpe.en', 'tuning/train.bpe.de'],
        valid_datasets=['tuning/dev.bpe.en', 'tuning/dev.bpe.de'],
        dictionaries=['data/train.bpe.en.json', 'data/train.bpe.de.json'],
        hter=['tuning/train.hter', 'tuning/dev.hter'],
        n_words_src=40000,
        n_words_tgt=40000,
        nmt_model='model/model.npz.best_bleu',
        lrate=0.0001,  # learning rate
        use_dropout=True,
        patience=10,
        max_epochs=5000,
        finish_after=1000000,
        maxibatch_size=20,
        optimizer='rmsprop',
        shuffle_each_epoch=True,
        reload_=True,
        overwrite=False,
        sort_by_length=False,
        maxlen=1000,
        decay_c=0.,  # L2 regularization penalty
        map_decay_c=0.,  # L2 regularization penalty towards original weights
        clip_c=1.0,
        dropout_embedding=0.2,  # dropout for input embeddings (0: no dropout)
        dropout_hidden=0.2,  # dropout for hidden layers (0: no dropout)
        dropout_source=0.1,  # dropout source words (0: no dropout)
        dropout_target=0.1,  # dropout target words (0: no dropout)
        model_version=0.1):

    # capture the local arguments as the model options
    model_options = locals().copy()
    print 'Model options:', model_options

    # load the dictionaries and build their inverses
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # if the vocabulary sizes are not set, default to the full dictionary sizes
    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words_tgt is None:
        n_words_tgt = len(worddicts[1])
        model_options['n_words_tgt'] = n_words_tgt

    # load the training and validation data
    print 'Loading data ...'
    train = TextIterator(datasets[0], datasets[1], hter[0],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=batch_size,
                         maxlen=maxlen,
                         shuffle_each_epoch=shuffle_each_epoch,
                         sort_by_length=sort_by_length,
                         maxibatch_size=maxibatch_size)
    valid = TextIterator(valid_datasets[0], valid_datasets[1], hter[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    # numpy.random.seed(1234)

    # initialize the model parameters
    print 'Init parameters ...'
    params = init_params(model_options)

    # reload parameters
    if reload_ and os.path.exists(saveto):
        print 'Reloading model parameters'
        params = load_params(saveto, params)

    # turn the network weights and biases into Theano shared variables
    tparams = init_theano_params(params)

    # build the model
    print 'Building model ...',
    trng, use_noise, x, x_mask, y, y_mask, hter, \
        y_pred, cost = build_model(tparams, model_options)
    print 'Done'

    """
    @function: debugging
    print Wt.get_value().shape
    print tparams['W'].get_value().shape
    f_tt = theano.function([x, x_mask, y, y_mask], tt)
    f_emb = theano.function([x, x_mask, y, y_mask], emb)
    f_pred = theano.function([x, x_mask, y, y_mask], y_pred)
    f_cost = theano.function([x, x_mask, y, y_mask, hter], cost)
    for x, y, hter in train:
        # prepare the data for training
        x, x_mask, y, y_mask = nmt.prepare_data(x, y,
                                                maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words_tgt)
        hter = numpy.array(hter).astype('float32')
        hter = hter.reshape([hter.shape[0], 1])
        print f_pred(x, x_mask, y, y_mask).shape
        print f_cost(x, x_mask, y, y_mask, hter)
        #print f_cost(x, x_mask, y, y_mask, hter)
        sys.exit(0)
    """

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # apply L2 regularisation to loaded model (MAP training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c),
                                    name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value)**2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    print 'Building f_pred...',
    inps = [x, x_mask, y, y_mask]
    f_pred = theano.function(inps, y_pred, profile=False)
    print 'Done'

    print 'Building f_cost...',
    inps = [x, x_mask, y, y_mask, hter]
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2),
                              g / tensor.sqrt(g2) * clip_c, g))
        grads = new_grads

    # compile the optimizer; the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost,
                                              profile=False)
    print 'Done'

    print 'Start Optimization'

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []

    # reload the training history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    valid_err = None

    fp = open('RNN_model/valid.error', 'w')
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y, hter in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # prepare the data for training
            x, x_mask, y, y_mask = nmt.prepare_data(x, y,
                                                    maxlen=maxlen,
                                                    n_words_src=n_words_src,
                                                    n_words=n_words_tgt)
            hter = numpy.array(hter).astype('float32')
            hter = hter.reshape([hter.shape[0], 1])

            # prepare_data returns None when no sentence shorter than maxlen remains
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask, hter)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers; usually we would remove non-finite
            # elements and continue training, but that is not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far; in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving the best model...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)
                numpy.savez(saveto, history_errs=history_errs, uidx=uidx,
                            **params)
                json.dump(model_options, open('%s.json' % saveto, 'wb'),
                          indent=2)
                print 'Done'

                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx, history_errs=history_errs,
                                uidx=uidx, **unzip_from_theano(tparams))
                    print 'Done'

            # validate the model on the validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_error(f_cost, nmt.prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err
                fp.writelines('valid error: ' + str(valid_err) + '\n')

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs = pred_error(f_cost, nmt.prepare_data, model_options,
                                valid)
        valid_err = valid_errs.mean()
        print 'Valid ', valid_err
        fp.writelines('Final error: ' + str(valid_err) + '\n')
    fp.close()

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)

    numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs,
                uidx=uidx, **params)

    return valid_err
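
# Hedged usage sketch (not part of the original code): a minimal smoke-run
# call to train() overriding a few options. The data and dictionary paths fall
# back to the defaults in the signature above and the output path is a
# placeholder; adjust both to the local layout before running.
def _example_train():
    valid_err = train(batch_size=32,
                      valid_batch_size=32,
                      max_epochs=1,
                      dispFreq=10,
                      validFreq=100,
                      saveFreq=100,
                      saveto='RNN_model/debug.npz')
    print 'final validation error:', valid_err
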
def sample_par(lines, model_options, f_init, f_next, beam_size=3,
               suppress_unk=True):
    dictionaries = model_options['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load the source dictionaries and invert them
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if model_options['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= model_options['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load the target dictionary and invert it
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww).replace('@@ ', '')

    # convert the input lines into factored index sequences
    seqs = []
    for idx, line in enumerate(lines):
        words = line.strip().split()
        x = []
        for w in words:
            w = [
                word_dicts[i][f] if f in word_dicts[i] else 1
                for (i, f) in enumerate(w.split('|'))
            ]
            if len(w) != model_options['factors']:
                raise Exception(
                    'Error: expected {0} factors, but input word has {1}\n'.
                    format(model_options['factors'], len(w)))
            x.append(w)
        x += [[0] * model_options['factors']]
        seqs.append(x)

    seqs_y_dummy = [[x[0] for x in element] for element in seqs]
    sequences, xmask, dummy_y, ymask = prepare_data(seqs, seqs_y_dummy)

    # decode all inputs at once with the parallel beam search
    print 'Calling gen_par_sample'
    t0 = time.time()
    parsample, parscore, parword_probs = gen_par_sample(
        [f_init], [f_next], sequences, xmask,
        k=beam_size, maxlen=200, suppress_unk=suppress_unk)
    print 'gen_par_sample returned, took %.1f seconds' % (time.time() - t0)

    # decode each input one at a time with gen_sample for comparison
    t0 = time.time()
    compare_samples = []
    for i in range(len(seqs)):
        mask_size = int(round(np.sum(xmask[:, i])))
        seq = sequences[:, :mask_size, i:i + 1]
        print 'calling gen_sample'
        sample, score, word_probs, _, _ = gen_sample(
            [f_init], [f_next], seq,
            k=beam_size, maxlen=200, stochastic=False,
            suppress_unk=suppress_unk)
        compare_samples += sample
    print 'iterative gen_sample took %.1f seconds' % (time.time() - t0)

    sample_words = []
    for sents in parsample:
        # sample_words.append([_seqs2words(cand) for cand in sents])
        sample_words += sents

    return sample_words, compare_samples
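
# Hedged usage sketch (not part of the original code): decodes a couple of
# source lines with sample_par. The f_init/f_next functions are assumed to
# come from the usual sampler-building step of the NMT model; the input lines
# and the beam size here are placeholders.
def _example_sample_par(model_options, f_init, f_next):
    lines = ['this is a test .', 'another sentence .']
    sample_words, compare_samples = sample_par(lines, model_options,
                                               f_init, f_next,
                                               beam_size=5,
                                               suppress_unk=True)
    print len(sample_words), 'parallel hypotheses'
    print len(compare_samples), 'iterative hypotheses'
    return sample_words, compare_samples
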