def __init__(self, ctx_size, vocab_size, embed_dim, embed_init=tx.random_uniform(minval=-0.01, maxval=0.01), x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01), logit_init=tx.random_uniform(minval=-0.01, maxval=0.01), embed_share=True, use_gate=True, use_hidden=False, h_dim=100, h_activation=tx.elu, h_init=tx.he_normal_init(), h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01), use_dropout=True, embed_dropout=False, keep_prob=0.95, l2_loss=False, l2_loss_coef=1e-5, use_nce=False, nce_samples=100): # GRAPH INPUTS run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input") loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target") eval_inputs = loss_inputs # RUN GRAPH # if I create a scope here the Tensorboard graph will be a mess to read # because it groups everything by nested scope names # instead if I choose to create different scopes for train and eval only # the graph stays readable because it allows us to use the same names # under different scopes while still sharing variables var_reg = [] with tf.name_scope("run"): feature_lookup = tx.Lookup(run_inputs, ctx_size, [vocab_size, embed_dim], embed_init, name="lookup") var_reg.append(feature_lookup.weights) feature_lookup = feature_lookup.as_concat() if use_gate or use_hidden: hl = tx.Linear(feature_lookup, h_dim, h_init, bias=True, name="h_linear") ha = tx.Activation(hl, h_activation, name="h_activation") h = tx.Compose(hl, ha, name="hidden") var_reg.append(hl.weights) features = feature_lookup if use_gate: gate_w = tx.Linear(h, ctx_size, bias=True) gate = tx.Gate(features, gate_input=gate_w) # gate = tx.Module([h, features], gate) features = gate var_reg.append(gate_w.weights) x_to_f = tx.Linear(features, embed_dim, x_to_f_init, bias=True, name="x_to_f") var_reg.append(x_to_f.weights) f_prediction = x_to_f if use_hidden: h_to_f = tx.Linear(h, embed_dim, h_to_f_init, bias=True, name="h_to_f") var_reg.append(h_to_f.weights) f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted") # RI DECODING =============================================== shared_weights = tf.transpose( feature_lookup.weights) if embed_share else None logit_init = logit_init if not embed_share else None run_logits = tx.Linear(f_prediction, vocab_size, logit_init, shared_weights, bias=True, name="logits") if not embed_share: var_reg.append(run_logits.weights) y_prob = tx.Activation(run_logits, tx.softmax) # TRAIN GRAPH =============================================== with tf.name_scope("train"): if use_dropout and embed_dropout: feature_lookup = feature_lookup.reuse_with(run_inputs) features = tx.Dropout(feature_lookup, probability=keep_prob) else: features = feature_lookup if use_gate or use_hidden: if use_dropout: h = h.reuse_with(features) h = tx.Dropout(h, probability=keep_prob) if use_gate: gate_w = gate_w.reuse_with(h) features = gate.reuse_with(layer=features, gate_input=gate_w) f_prediction = x_to_f.reuse_with(features) if use_hidden: h_to_f = h_to_f.reuse_with(h) if use_dropout: h_to_f = tx.Dropout(h_to_f, probability=keep_prob) f_prediction = tx.Add(f_prediction, h_to_f) else: f_prediction = f_prediction.reuse_with(features) train_logits = run_logits.reuse_with(f_prediction) if use_nce: # uniform gets good enough results if enough samples are used # but we can load the empirical unigram distribution # or learn the unigram distribution during training sampled_values = uniform_sampler(loss_inputs.tensor, 1, nce_samples, True, vocab_size) train_loss = tf.nn.nce_loss(weights=tf.transpose( train_logits.weights), biases=train_logits.bias, 
inputs=f_prediction.tensor, labels=loss_inputs.tensor, num_sampled=nce_samples, num_classes=vocab_size, num_true=1, sampled_values=sampled_values) else: one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor, num_cols=vocab_size) train_loss = tx.categorical_cross_entropy( one_hot, train_logits.tensor) train_loss = tf.reduce_mean(train_loss) if l2_loss: losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = train_loss + l2_loss_coef * tf.add_n(losses) # EVAL GRAPH =============================================== with tf.name_scope("eval"): one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor, num_cols=vocab_size) eval_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor) eval_loss = tf.reduce_mean(eval_loss) # SETUP MODEL CONTAINER ==================================== super().__init__(run_inputs=run_inputs, run_outputs=y_prob, train_inputs=run_inputs, train_outputs=y_prob, eval_inputs=run_inputs, eval_outputs=y_prob, train_out_loss=train_loss, train_in_loss=loss_inputs, eval_out_score=eval_loss, eval_in_score=eval_inputs)
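# NOTE: `uniform_sampler` is not defined in this excerpt. Below is a minimal sketch of
# what it presumably wraps, assuming it simply forwards to TensorFlow's built-in uniform
# candidate sampler (the argument order at the call sites above matches that API):
import tensorflow as tf


def uniform_sampler(true_classes, num_true, num_sampled, unique, range_max):
    # candidate samplers expect int64 labels of shape [batch_size, num_true]
    true_classes = tf.cast(true_classes, tf.int64)
    return tf.nn.uniform_candidate_sampler(true_classes=true_classes,
                                           num_true=num_true,
                                           num_sampled=num_sampled,
                                           unique=unique,
                                           range_max=range_max)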
# Activation functions
if args.h_act == "relu":
    h_act = tx.relu
    h_init = tx.he_normal_init()
elif args.h_act == "tanh":
    h_act = tx.tanh
    h_init = tx.glorot_uniform()
elif args.h_act == "elu":
    h_act = tx.elu
    h_init = tx.he_normal_init()

# Parameter Init
if args.embed_init == "normal":
    embed_init = tx.random_normal(mean=0., stddev=args.embed_init_val)
elif args.embed_init == "uniform":
    embed_init = tx.random_uniform(minval=-args.embed_init_val, maxval=args.embed_init_val)

if args.logit_init == "normal":
    logit_init = tx.random_normal(mean=0., stddev=args.logit_init_val)
elif args.logit_init == "uniform":
    logit_init = tx.random_uniform(minval=-args.logit_init_val, maxval=args.logit_init_val)

if args.h_to_f_init == "normal":
    h_to_f_init = tx.random_normal(mean=0., stddev=args.h_to_f_init_val)
elif args.h_to_f_init == "uniform":
    h_to_f_init = tx.random_uniform(minval=-args.h_to_f_init_val, maxval=args.h_to_f_init_val)

if args.x_to_f_init == "normal":
    x_to_f_init = tx.random_normal(mean=0., stddev=args.x_to_f_init_val)
elif args.x_to_f_init == "uniform":
    x_to_f_init = tx.random_uniform(minval=-args.x_to_f_init_val, maxval=args.x_to_f_init_val)
# Activation functions
if args.h_act == "relu":
    h_act = tx.relu
    h_init = tx.he_normal_init()
elif args.h_act == "tanh":
    h_act = tx.tanh
    h_init = tx.glorot_uniform()
elif args.h_act == "elu":
    h_act = tx.elu
    h_init = tx.he_normal_init()

# Parameter Init
if args.embed_init == "normal":
    embed_init = tx.random_normal(mean=0., stddev=args.embed_init_val)
elif args.embed_init == "uniform":
    embed_init = tx.random_uniform(minval=-args.embed_init_val, maxval=args.embed_init_val)

if args.logit_init == "normal":
    logit_init = tx.random_normal(mean=0., stddev=args.logit_init_val)
elif args.logit_init == "uniform":
    logit_init = tx.random_uniform(minval=-args.logit_init_val, maxval=args.logit_init_val)

if args.f_init == "normal":
    f_init = tx.random_normal(mean=0., stddev=args.f_init_val)
elif args.f_init == "uniform":
    f_init = tx.random_uniform(minval=-args.f_init_val, maxval=args.f_init_val)

model = NNLMNRP_NCE(ctx_size=args.ngram_size - 1,
                    vocab_size=len(vocab),
                    k_dim=args.k_dim,
def run(**kwargs): arg_dict.from_dict(kwargs) args = arg_dict.to_namespace() # ====================================================================================== # Load Params, Prepare results assets # ====================================================================================== # os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu) # print(args.corpus) # Experiment parameter summary res_param_filename = os.path.join(args.out_dir, "params_{id}.csv".format(id=args.run_id)) with open(res_param_filename, "w") as param_file: writer = csv.DictWriter(f=param_file, fieldnames=arg_dict.keys()) writer.writeheader() writer.writerow(arg_dict) param_file.flush() # make dir for model checkpoints if args.save_model: model_ckpt_dir = os.path.join(args.out_dir, "model_{id}".format(id=args.run_id)) os.makedirs(model_ckpt_dir, exist_ok=True) model_path = os.path.join(model_ckpt_dir, "nnlm_{id}.ckpt".format(id=args.run_id)) # start perplexity file ppl_header = ["id", "run", "epoch", "step", "lr", "dataset", "perplexity"] ppl_fname = os.path.join(args.out_dir, "perplexity_{id}.csv".format(id=args.run_id)) ppl_file = open(ppl_fname, "w") ppl_writer = csv.DictWriter(f=ppl_file, fieldnames=ppl_header) ppl_writer.writeheader() # ====================================================================================== # CORPUS, Vocab and RIs # ====================================================================================== corpus = h5py.File(os.path.join(args.corpus, "ptb_{}.hdf5".format(args.ngram_size)), mode='r') vocab = marisa_trie.Trie(corpus["vocabulary"]) # generates k-dimensional random indexes with s_active units all_positive = args.ri_all_positive ri_generator = Generator(dim=args.k_dim, num_active=args.s_active, symmetric=not all_positive) # pre-gen indices for vocab # it doesn't matter which ri gets assign to which word since we are pre-generating the indexes ris = [ri_generator.generate() for i in range(len(vocab))] ri_tensor = ris_to_sp_tensor_value(ris, dim=args.k_dim) # ri_tensor = RandomIndexTensor.from_ri_list(ris, args.k_dim, args.s_active) # ====================================================================================== def data_pipeline(data, epochs=1, batch_size=args.batch_size, shuffle=False): def chunk_fn(x): return chunk_it(x, chunk_size=batch_size * 1000) if epochs > 1: data = repeat_apply(chunk_fn, data, epochs) else: data = chunk_fn(data) if shuffle: data = shuffle_it(data, args.shuffle_buffer_size) data = batch_it(data, size=batch_size, padding=False) return data # ====================================================================================== # MODEL # ====================================================================================== # Activation functions if args.h_act == "relu": h_act = tx.relu h_init = tx.he_normal_init() elif args.h_act == "tanh": h_act = tx.tanh h_init = tx.glorot_uniform() elif args.h_act == "elu": h_act = tx.elu h_init = tx.he_normal_init() # Parameter Init if args.embed_init == "normal": embed_init = tx.random_normal(mean=0., stddev=args.embed_init_val) elif args.embed_init == "uniform": embed_init = tx.random_uniform(minval=-args.embed_init_val, maxval=args.embed_init_val) if args.logit_init == "normal": logit_init = tx.random_normal(mean=0., stddev=args.logit_init_val) elif args.logit_init == "uniform": logit_init = tx.random_uniform(minval=-args.logit_init_val, maxval=args.logit_init_val) if args.f_init == "normal": f_init = tx.random_normal(mean=0., stddev=args.f_init_val) elif args.f_init == "uniform": f_init = 
tx.random_uniform(minval=-args.f_init_val, maxval=args.f_init_val) # sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, # log_device_placement=True)) # with tf.device('/gpu:{}'.format(args.gpu)): model = NNLM_NRP(ctx_size=args.ngram_size - 1, vocab_size=len(vocab), k_dim=args.k_dim, s_active=args.s_active, ri_tensor=ri_tensor, embed_dim=args.embed_dim, embed_init=embed_init, embed_share=args.embed_share, logit_init=logit_init, logit_bias=args.logit_bias, h_dim=args.h_dim, num_h=args.num_h, h_activation=h_act, h_init=h_init, use_dropout=args.dropout, keep_prob=args.keep_prob, embed_dropout=args.embed_dropout, l2_loss=args.l2_loss, l2_loss_coef=args.l2_loss_coef, f_init=f_init, use_nce=True, nce_samples=100) model_runner = tx.ModelRunner(model) # sess = tf.Session(config=tf.ConfigProto( # allow_soft_placement=True, log_device_placement=True)) # model_runner.set_session(sess) # sess = tf.Session(config=tf.ConfigProto( # allow_soft_placement=True, log_device_placement=True)) # model_runner.set_session(sess) # we use an InputParam because we might want to change it during training lr_param = tx.InputParam(value=args.lr) if args.optimizer == "sgd": optimizer = tf.train.GradientDescentOptimizer( learning_rate=lr_param.tensor) elif args.optimizer == "adam": optimizer = tf.train.AdamOptimizer(learning_rate=lr_param.tensor, beta1=args.optimizer_beta1, beta2=args.optimizer_beta2, epsilon=args.optimizer_epsilon) elif args.optimizer == "ams": optimizer = tx.AMSGrad(learning_rate=lr_param.tensor, beta1=args.optimizer_beta1, beta2=args.optimizer_beta2, epsilon=args.optimizer_epsilon) def clip_grad_global(grads): grads, _ = tf.clip_by_global_norm(grads, 12) return grads def clip_grad_local(grad): return tf.clip_by_norm(grad, args.clip_value) if args.clip_grads: if args.clip_local: clip_fn = clip_grad_local else: clip_fn = clip_grad_global if args.clip_grads: model_runner.config_optimizer(optimizer, optimizer_params=lr_param, gradient_op=clip_fn, global_gradient_op=not args.clip_local) else: model_runner.config_optimizer(optimizer, optimizer_params=lr_param) # assert(model_runner.session == sess) # ====================================================================================== # EVALUATION # ====================================================================================== def eval_model(runner, dataset_it, len_dataset=None, display_progress=False): if display_progress: pb = tqdm(total=len_dataset, ncols=60) batches_processed = 0 sum_loss = 0 for batch in dataset_it: batch = np.array(batch, dtype=np.int64) ctx = batch[:, :-1] target = batch[:, -1:] mean_loss = runner.eval(ctx, target) sum_loss += mean_loss if display_progress: pb.update(args.batch_size) batches_processed += 1 if display_progress: pb.close() return np.exp(sum_loss / batches_processed) def evaluation(runner: tx.ModelRunner, pb, cur_epoch, step, display_progress=False): pb.write("[Eval Validation]") val_data = corpus["validation"] ppl_validation = eval_model( runner, data_pipeline(val_data, epochs=1, shuffle=False), len(val_data), display_progress) res_row = { "id": args.id, "run": args.run, "epoch": cur_epoch, "step": step, "lr": lr_param.value, "dataset": "validation", "perplexity": ppl_validation } ppl_writer.writerow(res_row) pb.write("Eval Test") test_data = corpus["test"] ppl_test = eval_model( runner, data_pipeline(test_data, epochs=1, shuffle=False), len(test_data), display_progress) res_row = { "id": args.id, "run": args.run, "epoch": cur_epoch, "step": step, "lr": lr_param.value, "dataset": "test", 
"perplexity": ppl_test } ppl_writer.writerow(res_row) ppl_file.flush() pb.write("valid. ppl = {} \n test ppl {}".format( ppl_validation, ppl_test)) return ppl_validation # ====================================================================================== # TRAINING LOOP # ====================================================================================== # preparing evaluation steps # I use ceil because I make sure we have padded batches at the end epoch_step = 0 global_step = 0 current_epoch = 0 patience = 0 cfg = tf.ConfigProto() cfg.gpu_options.allow_growth = True sess = tf.Session(config=cfg) model_runner.set_session(sess) model_runner.init_vars() training_dset = corpus["training"] progress = tqdm(total=len(training_dset) * args.epochs) training_data = data_pipeline(training_dset, epochs=args.epochs, shuffle=True) evals = [] try: for ngram_batch in training_data: epoch = progress.n // len(training_dset) + 1 # Start New Epoch if epoch != current_epoch: current_epoch = epoch epoch_step = 0 progress.write("epoch: {}".format(current_epoch)) # Eval Time if epoch_step == 0: current_eval = evaluation(model_runner, progress, epoch, global_step) evals.append(current_eval) if global_step > 0: if args.early_stop: if evals[-2] - evals[-1] < args.eval_threshold: if patience >= 3: progress.write("early stop") break patience += 1 else: patience = 0 # lr decay only at the start of each epoch if args.lr_decay and len(evals) > 0: if evals[-2] - evals[-1] < args.eval_threshold: lr_param.value = max( lr_param.value * args.lr_decay_rate, args.lr_decay_threshold) progress.write("lr changed to {}".format( lr_param.value)) # ================================================ # TRAIN MODEL # ================================================ ngram_batch = np.array(ngram_batch, dtype=np.int64) ctx_ids = ngram_batch[:, :-1] word_ids = ngram_batch[:, -1:] model_runner.train(ctx_ids, word_ids) progress.update(args.batch_size) epoch_step += 1 global_step += 1 # if not early stop, evaluate last state of the model if not args.early_stop or patience < 3: evaluation(model_runner, progress, epoch, epoch_step) ppl_file.close() if args.save_model: model_runner.save_model(model_name=model_path, step=global_step, write_state=False) model_runner.close_session() progress.close() tf.reset_default_graph() except Exception as e: traceback.print_exc() os.remove(ppl_file.name) os.remove(param_file.name) raise e
def run(**kwargs):
    arg_dict.from_dict(kwargs)
    args = arg_dict.to_namespace()

    # ==================================================================================
    # Load Corpus & Vocab
    # ==================================================================================
    corpus = PTBReader(path=args.corpus, mark_eos=args.mark_eos)
    corpus_stats = h5py.File(os.path.join(args.corpus, "ptb_stats.hdf5"), mode='r')
    vocab = marisa_trie.Trie(corpus_stats["vocabulary"])

    to_ngrams_batch = partial(to_ngrams,
                              vocab=vocab,
                              ngram_size=args.ngram_size,
                              batch_size=args.batch_size,
                              epochs=1,
                              shuffle=False,
                              shuffle_buffer_size=args.shuffle_buffer_size,
                              enum_epoch=False)

    training_len = sum(1 for _ in to_ngrams_batch(corpus.training_set, batch_size=1))
    validation_len = None
    test_len = None
    if args.eval_progress:
        validation_len = sum(1 for _ in to_ngrams_batch(corpus.validation_set, batch_size=1))
        test_len = sum(1 for _ in to_ngrams_batch(corpus.test_set, batch_size=1))

    # ==================================================================================
    # Load Params, Prepare results assets
    # ==================================================================================
    # Experiment parameter summary
    res_param_filename = os.path.join(args.out_dir, "params_{id}_{run}.csv".format(id=args.id, run=args.run))
    with open(res_param_filename, "w") as param_file:
        writer = csv.DictWriter(f=param_file, fieldnames=arg_dict.keys())
        writer.writeheader()
        writer.writerow(arg_dict)
        param_file.flush()

    # make dir for model checkpoints
    if args.save_model:
        model_ckpt_dir = os.path.join(args.out_dir, "model_{id}_{run}".format(id=args.id, run=args.run))
        os.makedirs(model_ckpt_dir, exist_ok=True)
        model_path = os.path.join(model_ckpt_dir, "nnlm_{id}_{run}.ckpt".format(id=args.id, run=args.run))

    # start perplexity file
    ppl_header = ["id", "run", "epoch", "step", "lr", "dataset", "perplexity"]
    ppl_filename = os.path.join(args.out_dir, "perplexity_{id}_{run}.csv".format(id=args.id, run=args.run))
    ppl_file = open(ppl_filename, "w")
    ppl_writer = csv.DictWriter(f=ppl_file, fieldnames=ppl_header)
    ppl_writer.writeheader()

    # ==================================================================================
    # MODEL
    # ==================================================================================
    # Configure weight initializers based on activation functions
    if args.h_act == "relu":
        h_act = tx.relu
        h_init = tx.he_normal_init()
    elif args.h_act == "tanh":
        h_act = tx.tanh
        h_init = tx.glorot_uniform()
    elif args.h_act == "elu":
        h_act = tx.elu
        h_init = tx.he_normal_init()
    elif args.h_act == "selu":
        h_act = tf.nn.selu
        h_init = tx.glorot_uniform()

    # Configure embedding and logit weight initializers
    if args.embed_init == "normal":
        embed_init = tx.random_normal(mean=0., stddev=args.embed_init_val)
    elif args.embed_init == "uniform":
        embed_init = tx.random_uniform(minval=-args.embed_init_val, maxval=args.embed_init_val)

    if args.logit_init == "normal":
        logit_init = tx.random_normal(mean=0., stddev=args.logit_init_val)
    elif args.logit_init == "uniform":
        logit_init = tx.random_uniform(minval=-args.logit_init_val, maxval=args.logit_init_val)

    f_init = None
    if args.use_f_predict:
        if args.f_init == "normal":
            f_init = tx.random_normal(mean=0., stddev=args.f_init_val)
        elif args.f_init == "uniform":
            f_init = tx.random_uniform(minval=-args.f_init_val, maxval=args.f_init_val)

    inputs = tx.Input(args.ngram_size - 1, dtype=tf.int64, name="ctx_inputs")
    labels = tx.Input(1, dtype=tf.int64, name="labels")

    model = NNLM(inputs=inputs,
                 label_inputs=labels,
                 vocab_size=len(vocab),
                 embed_dim=args.embed_dim,
                 embed_init=embed_init,
                 embed_share=args.embed_share,
                 logit_init=logit_init,
                 h_dim=args.h_dim,
                 num_h=args.num_h,
                 h_activation=h_act,
                 h_init=h_init,
                 use_dropout=args.dropout,
                 drop_probability=args.drop_probability,
                 embed_dropout=args.embed_dropout,
                 l2_loss=args.l2_loss,
                 l2_weight=args.l2_loss_coef,
                 use_f_predict=args.use_f_predict,
                 f_init=f_init,
                 logit_bias=args.logit_bias,
                 use_nce=False)

    # Input params can be changed during training by setting their value
    # lr_param = tx.InputParam(init_value=args.lr)
    lr_param = tensorx.train.EvalStepDecayParam(value=args.lr,
                                                improvement_threshold=args.eval_threshold,
                                                less_is_better=True,
                                                decay_rate=args.lr_decay_rate,
                                                decay_threshold=args.lr_decay_threshold)

    if args.optimizer == "sgd":
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr_param.tensor)
    elif args.optimizer == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=lr_param.tensor,
                                           beta1=args.optimizer_beta1,
                                           beta2=args.optimizer_beta2,
                                           epsilon=args.optimizer_epsilon)
    elif args.optimizer == "ams":
        optimizer = tx.AMSGrad(learning_rate=lr_param.tensor,
                               beta1=args.optimizer_beta1,
                               beta2=args.optimizer_beta2,
                               epsilon=args.optimizer_epsilon)

    def clip_grad_global(grads):
        grads, _ = tf.clip_by_global_norm(grads, 12)
        return grads

    def clip_grad_local(grad):
        return tf.clip_by_norm(grad, args.clip_value)

    if args.clip_grads:
        if args.clip_local:
            clip_fn = clip_grad_local
        else:
            clip_fn = clip_grad_global

    if args.clip_grads:
        model.config_optimizer(optimizer, optimizer_params=lr_param,
                               gradient_op=clip_fn,
                               global_gradient_op=not args.clip_local)
    else:
        model.config_optimizer(optimizer, optimizer_params=lr_param)

    # ==================================================================================
    # EVALUATION
    # ==================================================================================
    def eval_model(model, dataset_it, len_dataset=None, display_progress=False):
        if display_progress:
            pb = tqdm(total=len_dataset, ncols=60, position=1)

        batches_processed = 0
        sum_loss = 0
        for batch in dataset_it:
            batch = np.array(batch, dtype=np.int64)
            ctx = batch[:, :-1]
            target = batch[:, -1:]

            mean_loss = model.eval({inputs: ctx, labels: target})
            sum_loss += mean_loss

            if display_progress:
                pb.update(args.batch_size)
            batches_processed += 1

        if display_progress:
            pb.close()

        return np.exp(sum_loss / batches_processed)

    def evaluation(model: tx.Model, progress_bar, cur_epoch, step, display_progress=False):
        ppl_validation = eval_model(model, to_ngrams_batch(corpus.validation_set),
                                    validation_len, display_progress)
        res_row = {"id": args.id, "run": args.run, "epoch": cur_epoch, "step": step,
                   "lr": lr_param.value, "dataset": "validation", "perplexity": ppl_validation}
        ppl_writer.writerow(res_row)

        if args.eval_test:
            # pb.write("[Eval Test Set]")
            ppl_test = eval_model(model, to_ngrams_batch(corpus.test_set),
                                  test_len, display_progress)
            res_row = {"id": args.id, "run": args.run, "epoch": cur_epoch, "step": step,
                       "lr": lr_param.value, "dataset": "test", "perplexity": ppl_test}
            ppl_writer.writerow(res_row)

        ppl_file.flush()

        if args.eval_test:
            progress_bar.set_postfix({"test PPL ": ppl_test})
        # pb.write("valid. ppl = {}".format(ppl_validation))

        return ppl_validation

    # ==================================================================================
    # TRAINING LOOP
    # ==================================================================================
    # print("Starting TensorFlow Session")
    # preparing evaluation steps
    epoch_step = 0
    global_step = 0
    current_epoch = 0
    patience = 0

    cfg = tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    sess = tf.Session(config=cfg)
    model.set_session(sess)
    model.init_vars()

    progress = tqdm(total=training_len * args.epochs,
                    position=args.pid + 1,
                    disable=not args.display_progress)
    training_data = to_ngrams_batch(corpus.training_set,
                                    epochs=args.epochs,
                                    shuffle=args.shuffle,
                                    enum_epoch=True)

    evaluations = []

    try:
        for i, ngram_batch in training_data:
            epoch = i + 1
            # Start New Epoch
            if epoch != current_epoch:
                current_epoch = epoch
                epoch_step = 0
                if args.display_progress:
                    progress.set_postfix({"epoch": current_epoch})

            # ================================================
            # EVALUATION
            # ================================================
            if epoch_step == 0:
                current_eval = evaluation(model, progress, epoch, global_step,
                                          display_progress=args.eval_progress)
                evaluations.append(current_eval)
                lr_param.update(current_eval)
                # print(lr_param.eval_history)
                # print("improvement ", lr_param.eval_improvement())

                if global_step > 0:
                    if args.early_stop and epoch > 1:
                        if lr_param.eval_improvement() < lr_param.improvement_threshold:
                            if patience >= 3:
                                break
                            patience += 1
                        else:
                            patience = 0

            # ================================================
            # TRAIN MODEL
            # ================================================
            ngram_batch = np.array(ngram_batch, dtype=np.int64)
            ctx_ids = ngram_batch[:, :-1]
            word_ids = ngram_batch[:, -1:]

            model.train({inputs: ctx_ids, labels: word_ids})
            progress.update(args.batch_size)

            epoch_step += 1
            global_step += 1

        # if not early stop, evaluate last state of the model
        if not args.early_stop or patience < 3:
            current_eval = evaluation(model, progress, epoch, epoch_step)
            evaluations.append(current_eval)

        ppl_file.close()

        if args.save_model:
            model.save_model(model_name=model_path, step=global_step, write_state=False)

        model.close_session()
        progress.close()
        tf.reset_default_graph()

        # return the best validation evaluation
        return min(evaluations)

    except Exception as e:
        traceback.print_exc()
        os.remove(ppl_file.name)
        os.remove(param_file.name)
        raise e
def __init__(self, inputs, labels, vocab_size, embed_dim, h_dim, embed_init=tx.random_uniform(minval=-0.01, maxval=0.01), logit_init=tx.random_uniform(minval=-0.01, maxval=0.01), num_h=1, h_activation=tx.tanh, h_init=tx.he_normal_init(), reset_state=True, embed_dropout=False, w_dropout=False, u_dropconnect=False, other_dropout=False, w_keep_prob=0.9, u_keep_prob=0.9, embed_keep_prob=0.9, other_keep_prob=0.9, l2_loss=False, l2_weight=1e-5, use_f_predict=False, f_init=tx.random_uniform(minval=-0.01, maxval=0.01), embed_share=False, logit_bias=False, use_nce=False, nce_samples=10, ): if not isinstance(inputs, tx.Input): raise TypeError("inputs must be an Input layer") self.inputs = inputs self.labels = labels if not isinstance(labels, tx.Input): raise TypeError("labels must be an Input layer") if inputs.dtype != tf.int32 and inputs.dtype != tf.int64: raise TypeError("Invalid dtype for input: expected int32 or int64, got {}".format(inputs.dtype)) if num_h < 0: raise ValueError("num hidden should be >= 0") ctx_size = inputs.n_units # =============================================== # RUN GRAPH # =============================================== var_reg = [] with tf.name_scope("run"): # feature lookup embeddings = tx.Lookup(inputs, ctx_size, [vocab_size, embed_dim], weight_init=embed_init) var_reg.append(embeddings.weights) feature_lookup = embeddings.permute_batch_time() last_layer = feature_lookup last_feature_layer = feature_lookup for i in range(num_h): h_i = tx.QRNN(feature_lookup, n_units=h_dim, activation=h_activation, filter_size= ) last_layer = h_i # save last state, this will be used by state of first cell var_reg += [wi.weights for wi in last_layer.w] var_reg += [ui.weights for ui in last_layer.u] if not reset_state: last_layer = zero_state.reuse_with(last_layer, name="cache_last_state") # feature prediction for Energy-Based Model if use_f_predict: last_layer = tx.Linear(last_layer, embed_dim, f_init, add_bias=True, name="f_predict") var_reg.append(last_layer.weights) f_predict = last_layer shared_weights = feature_lookup.weights if embed_share else None transpose_weights = embed_share logit_init = logit_init if not embed_share else None run_logits = tx.Linear(last_layer, n_units=vocab_size, weight_init=logit_init, shared_weights=shared_weights, transpose_weights=transpose_weights, add_bias=logit_bias, name="logits") if not embed_share: var_reg.append(run_logits.weights) run_output = tx.Activation(run_logits, tx.softmax, name="run_output") # =============================================== # TRAIN GRAPH # =============================================== with tf.name_scope("train"): embeddings = embeddings.reuse_with(inputs) feature_lookup = embeddings.as_seq() if other_dropout and embed_dropout: feature_lookup = tx.Dropout(feature_lookup, probability=embed_keep_prob, name="drop_features") # last_layer = last_layer.as_seq() # add dropout between each layer # for i, layer in enumerate(h_layers): cell = lstm_cells[0] for i in range(ctx_size): if i == 0: h = cell.reuse_with(input_layer=feature_lookup[i], previous_state=None, # copy from first cell previous_memory=None, # copy from first cell regularized=w_dropout or u_dropconnect, name="lstm_cell_{}".format(i)) else: h = cell.reuse_with(input_layer=feature_lookup[i], previous_state=last_layer, name="lstm_cell_{}".format(i)) cell = h # if use_dropout: # h = tx.ZoneOut(h, # previous_layer=h.previous_state, # keep_prob=keep_prob, # name="zoneout_{}".format(i)) last_layer = h if not reset_state: last_layer = zero_state.reuse_with(last_layer, 
name="cache_last_cell") # feature prediction for Energy-Based Model if use_f_predict: last_layer = f_predict.reuse_with(last_layer) train_logits = run_logits.reuse_with(last_layer, name="train_logits") train_output = tx.Activation(train_logits, tx.softmax, name="train_output") def categorical_loss(labels, logits): labels = tx.dense_one_hot(column_indices=labels, num_cols=vocab_size) loss = tx.categorical_cross_entropy(labels=labels, logits=logits) # loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels,logits=logits) return tf.reduce_mean(loss) def nce_loss(labels, weights, bias, predict): noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size) loss = tf.nn.nce_loss(weights=weights, biases=bias, inputs=predict, labels=labels, num_sampled=nce_samples, num_classes=vocab_size, num_true=1, sampled_values=noise) return tf.reduce_mean(loss) if use_nce: bias = tx.VariableLayer(var_shape=[vocab_size], name="nce_bias") nce_weights = tx.WrapLayer(embeddings, n_units=embeddings.n_units, wrap_fn=lambda x: x.weights, layer_fn=True) train_loss = tx.LambdaLayer(labels, nce_weights, bias, last_layer, apply_fn=nce_loss, name="nce_loss") else: train_loss = tx.LambdaLayer(labels, train_logits, apply_fn=categorical_loss, name="train_loss") if l2_loss: l2_losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = tx.LambdaLayer(train_loss, apply_fn=lambda x: x + l2_weight * tf.add_n(l2_losses), name="train_loss_l2") # =============================================== # EVAL GRAPH # =============================================== with tf.name_scope("eval"): eval_loss = tx.LambdaLayer(labels, run_logits, apply_fn=categorical_loss, name="eval_loss") # BUILD MODEL super().__init__(run_outputs=run_output, run_inputs=inputs, train_inputs=[inputs, labels], train_outputs=train_output, train_loss=train_loss, eval_inputs=[inputs, labels], eval_outputs=run_output, eval_score=eval_loss)
def __init__(self,
             inputs,
             label_inputs,
             vocab_size,
             embed_dim,
             h_dim,
             embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             num_h=1,
             h_activation=tx.elu,
             h_init=tx.he_normal_init(),
             use_dropout=False,
             embed_dropout=False,
             drop_probability=0.05,
             l2_loss=False,
             l2_weight=1e-5,
             use_f_predict=False,
             f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             embed_share=False,
             logit_bias=False,
             use_nce=False,
             nce_samples=10,
             ):
    if not isinstance(inputs, tx.Input):
        raise TypeError("inputs must be an Input layer")
    self.inputs = inputs
    self.labels = label_inputs
    if not isinstance(label_inputs, tx.Input):
        raise TypeError("labels must be an Input layer")

    if inputs.dtype != tf.int32 and inputs.dtype != tf.int64:
        raise TypeError("Invalid dtype for input: expected int32 or int64, got {}".format(inputs.dtype))

    if num_h < 0:
        raise ValueError("num hidden should be >= 0")

    ctx_size = inputs.n_units

    # ===============================================
    # RUN GRAPH
    # ===============================================
    var_reg = []
    with tf.name_scope("run"):
        # feature lookup
        embeddings = tx.Lookup(inputs, ctx_size, [vocab_size, embed_dim], weight_init=embed_init)
        var_reg.append(embeddings.weights)
        feature_lookup = embeddings.as_concat()

        last_layer = feature_lookup
        h_layers = []
        for i in range(num_h):
            h_i = tx.FC(layer=last_layer,
                        n_units=h_dim,
                        activation=h_activation,
                        weight_init=h_init,
                        add_bias=True,
                        name="h_{}".format(i + 1))
            h_layers.append(h_i)
            last_layer = h_i
            var_reg.append(h_i.linear.weights)

        # feature prediction for Energy-Based Model
        if use_f_predict:
            last_layer = tx.Linear(last_layer, embed_dim, f_init, add_bias=True, name="f_predict")
            var_reg.append(last_layer.weights)
            f_predict = last_layer

        shared_weights = feature_lookup.weights if embed_share else None
        transpose_weights = embed_share
        logit_init = logit_init if not embed_share else None
        run_logits = tx.Linear(last_layer,
                               n_units=vocab_size,
                               weight_init=logit_init,
                               shared_weights=shared_weights,
                               transpose_weights=transpose_weights,
                               add_bias=logit_bias,
                               name="logits")
        if not embed_share:
            var_reg.append(run_logits.weights)
        run_output = tx.Activation(run_logits, tx.softmax, name="run_output")

    # ===============================================
    # TRAIN GRAPH
    # ===============================================
    with tf.name_scope("train"):
        if use_dropout and embed_dropout:
            last_layer = tx.Dropout(feature_lookup, probability=drop_probability, name="dropout_features")
        else:
            last_layer = feature_lookup

        # add dropout between each layer
        for i, layer in enumerate(h_layers):
            h = layer.reuse_with(last_layer)
            if use_dropout:
                h = tx.Dropout(h, probability=drop_probability, name="dropout_{}".format(i + 1))
            last_layer = h

        # feature prediction for Energy-Based Model
        if use_f_predict:
            last_layer = f_predict.reuse_with(last_layer)

        train_logits = run_logits.reuse_with(last_layer, name="train_logits")
        train_output = tx.Activation(train_logits, tx.softmax, name="train_output")

        def categorical_loss(labels, logits):
            labels = tx.dense_one_hot(column_indices=labels, num_cols=vocab_size)
            loss = tx.categorical_cross_entropy(labels=labels, logits=logits)
            return tf.reduce_mean(loss)

        def nce_loss(labels, weights, bias, predict):
            noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size)
            loss = tf.nn.nce_loss(weights=weights,
                                  biases=bias,
                                  inputs=predict,
                                  labels=labels,
                                  num_sampled=nce_samples,
                                  num_classes=vocab_size,
                                  num_true=1,
                                  sampled_values=noise)
            return tf.reduce_mean(loss)

        if use_nce:
            bias = tx.VariableLayer(var_shape=[vocab_size], name="nce_bias")
            nce_weights = tx.WrapLayer(embeddings,
                                       n_units=embeddings.n_units,
                                       wrap_fn=lambda x: x.weights,
                                       layer_fn=True)
            train_loss = tx.LambdaLayer(label_inputs, nce_weights, bias, last_layer,
                                        apply_fn=nce_loss,
                                        name="nce_loss")
        else:
            train_loss = tx.LambdaLayer(label_inputs, train_logits,
                                        apply_fn=categorical_loss,
                                        name="train_loss")

        if l2_loss:
            l2_losses = [tf.nn.l2_loss(var) for var in var_reg]
            train_loss = tx.WrapLayer(train_loss,
                                      wrap_fn=lambda x: x + l2_weight * tf.add_n(l2_losses),
                                      name="train_loss_l2")

    # ===============================================
    # EVAL GRAPH
    # ===============================================
    with tf.name_scope("eval"):
        eval_loss = tx.LambdaLayer(label_inputs, run_logits,
                                   apply_fn=categorical_loss,
                                   name="eval_loss")

    # BUILD MODEL
    super().__init__(run_outputs=run_output,
                     run_inputs=inputs,
                     train_inputs=[inputs, label_inputs],
                     train_outputs=train_output,
                     train_loss=train_loss,
                     eval_inputs=[inputs, label_inputs],
                     eval_outputs=run_output,
                     eval_score=eval_loss)
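# Minimal construction sketch for the feed-forward NNLM above, mirroring how the
# training script wires it (hyper-parameter values are illustrative placeholders):
ctx_size = 4  # ngram_size - 1
inputs = tx.Input(ctx_size, dtype=tf.int64, name="ctx_inputs")
labels = tx.Input(1, dtype=tf.int64, name="labels")

model = NNLM(inputs=inputs,
             label_inputs=labels,
             vocab_size=10000,
             embed_dim=128,
             h_dim=256,
             num_h=2,
             h_activation=tx.elu,
             use_dropout=True,
             drop_probability=0.1)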
def __init__(self, inputs, labels, vocab_size, embed_dim, h_dim, embed_init=tx.zeros_init(), logit_init=tx.glorot_uniform(), num_h=1, h_activation=tx.tanh, h_init=tx.glorot_uniform(), w_dropconnect=None, u_dropconnect=None, r_dropout=0.4, y_dropout=0.4, embed_dropout=0.3, other_dropout=0.3, l2_loss=False, l2_weight=1e-5, use_f_predict=False, f_init=tx.random_uniform(minval=-0.01, maxval=0.01), embed_share=False, logit_bias=False, use_nce=False, nce_samples=10, skip_connections=False): if not isinstance(inputs, tx.Input): raise TypeError("inputs must be an Input layer") self.inputs = inputs self.labels = labels if not isinstance(labels, tx.Input): raise TypeError("labels must be an Input layer") if inputs.dtype != tf.int32 and inputs.dtype != tf.int64: raise TypeError( "Invalid dtype for input: expected int32 or int64, got {}". format(inputs.dtype)) if num_h < 0: raise ValueError("num hidden should be >= 0") # =============================================== # RUN GRAPH # =============================================== var_reg = [] with tf.name_scope("run"): # feature lookup embeddings = tx.Lookup(inputs, seq_size=None, lookup_shape=[vocab_size, embed_dim], weight_init=embed_init) var_reg.append(embeddings.weights) feature_lookup = embeddings.permute_batch_time() last_layer = feature_lookup cell_proto = tx.LSTMCell.proto( n_units=h_dim, activation=h_activation, gate_activation=tx.hard_sigmoid, w_init=h_init, u_init=h_init, w_dropconnect=w_dropconnect, u_dropconnect=u_dropconnect, r_dropout=r_dropout, x_dropout=None, y_dropout=y_dropout, regularized=False, name="cell", ) lstm_layers = [] for i in range(num_h): lstm_layer = tx.RNN(last_layer, cell_proto=cell_proto, regularized=False, stateful=True, name="LSTM_{}".format(i + 1)) lstm_layers.append(lstm_layer) var_reg += [wi.weights for wi in lstm_layer.cell.w] var_reg += [ui.weights for ui in lstm_layer.cell.u] last_layer = lstm_layer # last time step is the state used to make the prediction # last_layer = tx.Reshape(last_layer, [-1, h_dim]) # TODO this is not consistent with locked dropout for the last layer # where the same mask should be applied across time steps # to do this I need either y_dropout to be available or some sort of map # operation I can use with layers outputting 3D tensors # something equivalent to https://keras.io/layers/wrappers/ which applies # a layer to every temporal slice of an input. 
They implement this the same way # they implement an RNN # feature prediction for Energy-Based Model if use_f_predict: last_layer = tx.Linear(last_layer, embed_dim, f_init, add_bias=True, name="f_predict") var_reg += last_layer.variables f_predict = last_layer shared_weights = feature_lookup.weights if embed_share else None transpose_weights = embed_share logit_init = logit_init if not embed_share else None run_logits = tx.Linear(last_layer, n_units=vocab_size, weight_init=logit_init, shared_weights=shared_weights, transpose_weights=transpose_weights, add_bias=logit_bias, name="logits") if not embed_share: var_reg.append(run_logits.weights) run_output = tx.Activation(run_logits, tx.softmax, name="run_output") # =============================================== # TRAIN GRAPH # =============================================== with tf.name_scope("train"): embeddings = embeddings.reuse_with(inputs) feature_lookup = embeddings.permute_batch_time() if embed_dropout: feature_lookup = tx.Dropout(feature_lookup, probability=embed_dropout, name="drop_features") last_layer = feature_lookup for i in range(num_h): lstm_layer = lstm_layers[i].reuse_with(last_layer, regularized=True) last_layer = lstm_layer # last_layer = tx.Reshape(last_layer, [-1, h_dim]) # feature prediction for Energy-Based Model if use_f_predict: # last_layer = f_predict.reuse_with(last_layer) last_layer = f_predict.reuse_with(last_layer) last_layer = tx.Dropout(last_layer, probability=other_dropout, locked=False) train_logits = run_logits.reuse_with(last_layer, name="train_logits") train_output = tx.Activation(train_logits, tx.softmax, name="run_output") def categorical_loss(labels, logits): # labels come as a batch of classes [[1,2],[3,4]] -> [1,3,2,4] time steps are ordered to match logits labels = tx.Transpose(labels) labels = tx.Reshape(labels, [-1]) labels = tx.dense_one_hot(labels, num_cols=vocab_size) loss = tx.categorical_cross_entropy(labels=labels, logits=logits) return tf.reduce_mean(loss) def nce_loss(labels, weights, bias, predict): noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size) loss = tf.nn.nce_loss(weights=weights, biases=bias, inputs=predict, labels=labels, num_sampled=nce_samples, num_classes=vocab_size, num_true=1, sampled_values=noise) return tf.reduce_mean(loss) if use_nce: bias = tx.VariableLayer(var_shape=[vocab_size], name="nce_bias") # wraps a layer to expose the weights as a layer but with the layer as its input nce_weights = tx.WrapLayer(embeddings, n_units=embeddings.n_units, wrap_fn=lambda x: x.weights, layer_fn=True) train_loss = tx.LambdaLayer(labels, nce_weights, bias, last_layer, apply_fn=nce_loss, name="nce_loss") else: train_loss = tx.LambdaLayer(labels, train_logits, apply_fn=categorical_loss, name="train_loss") if l2_loss: l2_losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = tx.LambdaLayer( train_loss, apply_fn=lambda x: x + l2_weight * tf.add_n(l2_losses), name="train_loss_l2") # =============================================== # EVAL GRAPH # =============================================== with tf.name_scope("eval"): eval_loss = tx.LambdaLayer(labels, run_logits, apply_fn=categorical_loss, name="eval_loss") self.stateful_layers = lstm_layers # BUILD MODEL super().__init__(run_outputs=run_output, run_inputs=inputs, train_inputs=[inputs, labels], train_outputs=train_output, train_loss=train_loss, eval_inputs=[inputs, labels], eval_outputs=run_output, eval_score=eval_loss)
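# Aside on the "locked dropout" TODO above: with plain TensorFlow ops, one way to apply
# the same dropout mask to every time step of a time-major [time, batch, units] tensor
# is to broadcast the noise over the time dimension (a sketch, not the tx API):
import tensorflow as tf


def locked_dropout(x, keep_prob):
    # noise_shape [1, batch, units] -> one mask per sequence, reused across time steps
    noise_shape = tf.stack([1, tf.shape(x)[1], tf.shape(x)[2]])
    return tf.nn.dropout(x, keep_prob=keep_prob, noise_shape=noise_shape)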
def __init__(self, ctx_size, vocab_size, k_dim, ri_tensor: RandomIndexTensor, embed_dim, embed_init=tx.random_uniform(minval=-0.01, maxval=0.01), x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01), logit_init=tx.random_uniform(minval=-0.01, maxval=0.01), embed_share=True, logit_bias=False, use_gate=True, use_hidden=False, h_dim=100, h_activation=tx.elu, h_init=tx.he_normal_init(), h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01), use_dropout=True, embed_dropout=False, keep_prob=0.95, l2_loss=False, l2_loss_coef=1e-5): # GRAPH INPUTS run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input") loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target") eval_inputs = loss_inputs # RUN GRAPH ===================================================== var_reg = [] with tf.name_scope("run"): # RI ENCODING =============================================== # convert ids to ris gather a set of random indexes based on the ids in a sequence # ri_layer = tx.TensorLayer(ri_tensor, n_units=k_dim) # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor) with tf.name_scope("ri_encode"): # used to compute logits if isinstance(ri_tensor, RandomIndexTensor): ri_layer = tx.TensorLayer(ri_tensor.to_sparse_tensor(), k_dim) ri_inputs = ri_tensor.gather(run_inputs.tensor) ri_inputs = ri_inputs.to_sparse_tensor() ri_inputs = tx.TensorLayer(ri_inputs, k_dim) else: ri_layer = tx.TensorLayer(ri_tensor, k_dim) ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor) ri_inputs = tx.TensorLayer(ri_inputs, k_dim) # use those sparse indexes to lookup a set of features based on the ri values feature_lookup = tx.Lookup(ri_inputs, ctx_size, [k_dim, embed_dim], embed_init, name="lookup") var_reg.append(feature_lookup.weights) feature_lookup = feature_lookup.as_concat() # =========================================================== if use_gate or use_hidden: hl = tx.Linear(feature_lookup, h_dim, h_init, bias=True, name="h_linear") ha = tx.Activation(hl, h_activation, name="h_activation") h = tx.Compose(hl, ha, name="hidden") var_reg.append(hl.weights) features = feature_lookup if use_gate: features = tx.Gate(features, ctx_size, gate_input=h) gate = features var_reg.append(features.gate_weights) x_to_f = tx.Linear(features, embed_dim, x_to_f_init, bias=True, name="x_to_f") var_reg.append(x_to_f.weights) f_prediction = x_to_f if use_hidden: h_to_f = tx.Linear(h, embed_dim, h_to_f_init, bias=True, name="h_to_f") var_reg.append(h_to_f.weights) f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted") # RI DECODING =============================================== shared_weights = feature_lookup.weights if embed_share else None logit_init = logit_init if not embed_share else None # embedding feature vectors for all words: shape [vocab_size, embed_dim] # later, for NCE we don't need to get all the features all_embeddings = tx.Linear(ri_layer, embed_dim, logit_init, shared_weights, name="logits", bias=False) # dot product of f_predicted . 
all_embeddings with bias for each target word run_logits = tx.Linear(f_prediction, n_units=vocab_size, shared_weights=all_embeddings.tensor, transpose_weights=True, bias=logit_bias) if not embed_share: var_reg.append(all_embeddings.weights) # =========================================================== run_embed_prob = tx.Activation(run_logits, tx.softmax) # TRAIN GRAPH =================================================== with tf.name_scope("train"): if use_dropout and embed_dropout: feature_lookup = feature_lookup.reuse_with(ri_inputs) features = tx.Dropout(feature_lookup, probability=keep_prob) else: features = feature_lookup if use_gate or use_hidden: if use_dropout: h = h.reuse_with(features) h = tx.Dropout(h, probability=keep_prob) if use_gate: features = gate.reuse_with(features, gate_input=h) f_prediction = x_to_f.reuse_with(features) if use_hidden: h_to_f = h_to_f.reuse_with(h) if use_dropout: h_to_f = tx.Dropout(h_to_f, probability=keep_prob) f_prediction = tx.Add(f_prediction, h_to_f) else: f_prediction = f_prediction.reuse_with(features) # we already define all_embeddings from which these logits are computed before so this should be ok train_logits = run_logits.reuse_with(f_prediction) train_embed_prob = tx.Activation(train_logits, tx.softmax, name="train_output") one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor, num_cols=vocab_size) train_loss = tx.categorical_cross_entropy(one_hot, train_logits.tensor) train_loss = tf.reduce_mean(train_loss) if l2_loss: losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = train_loss + l2_loss_coef * tf.add_n(losses) # EVAL GRAPH =============================================== with tf.name_scope("eval"): one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor, num_cols=vocab_size) eval_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor) eval_loss = tf.reduce_mean(eval_loss) # SETUP MODEL CONTAINER ==================================== super().__init__(run_inputs=run_inputs, run_outputs=run_embed_prob, train_inputs=run_inputs, train_outputs=train_embed_prob, eval_inputs=run_inputs, eval_outputs=run_embed_prob, train_out_loss=train_loss, train_in_loss=loss_inputs, eval_out_score=eval_loss, eval_in_score=eval_inputs)
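# The RI decoding step above scores the predicted feature vector against every word's
# reconstructed embedding ("dot product of f_predicted . all_embeddings"). In plain
# TensorFlow this reduces to a transposed matmul (sketch with placeholder shapes):
import tensorflow as tf

embed_dim, vocab_size = 128, 10000
f_predicted = tf.placeholder(tf.float32, [None, embed_dim])          # [batch, embed_dim]
all_embeddings = tf.placeholder(tf.float32, [vocab_size, embed_dim])  # one row per word

logits = tf.matmul(f_predicted, all_embeddings, transpose_b=True)     # [batch, vocab_size]
probabilities = tf.nn.softmax(logits)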
def __init__(self, ctx_size, vocab_size, k_dim, s_active, ri_tensor, embed_dim, h_dim, embed_init=tx.random_uniform(minval=-0.01, maxval=0.01), logit_init=tx.random_uniform(minval=-0.01, maxval=0.01), num_h=1, h_activation=tx.relu, h_init=tx.he_normal_init, use_dropout=False, embed_dropout=False, keep_prob=0.95, l2_loss=False, l2_loss_coef=1e-5, f_init=tx.random_uniform(minval=-0.01, maxval=0.01), embed_share=True, logit_bias=False, use_nce=False, nce_samples=100, noise_level=0.1): run_inputs = tx.Input(ctx_size, dtype=tf.int32) loss_inputs = tx.Input(n_units=1, dtype=tf.int64) eval_inputs = loss_inputs if run_inputs.dtype != tf.int32 and run_inputs.dtype != tf.int64: raise TypeError( "Invalid dtype for input: expected int32 or int64, got {}". format(run_inputs.dtype)) if num_h < 0: raise ValueError("num hidden should be >= 0") # =============================================== # RUN GRAPH # =============================================== var_reg = [] with tf.name_scope("run"): # RI ENCODING =============================================== # convert ids to ris gather a set of random indexes based on the ids in a sequence # ri_layer = tx.TensorLayer(ri_tensor, n_units=k_dim) # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor) # ri_inputs = tx.TensorLayer(ri_inputs, n_units=k_dim) with tf.name_scope("ri_encode"): if isinstance(ri_tensor, RandomIndexTensor): ri_tensor = ri_tensor ri_layer = tx.TensorLayer(ri_tensor.to_sparse_tensor(), k_dim, shape=[vocab_size, k_dim]) ri_inputs = ri_tensor.gather(run_inputs.tensor) ri_inputs = ri_inputs.to_sparse_tensor() ri_inputs = tx.TensorLayer( ri_inputs, k_dim, shape=[ri_inputs.get_shape()[0], k_dim]) # ri_tensor is a sparse tensor else: raise TypeError( "please supply RandomIndexTensor instead of sparse Tensor" ) # ri_layer = tx.TensorLayer(ri_tensor, k_dim) # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor) # ri_inputs = tx.TensorLayer(ri_inputs, k_dim) feature_lookup = tx.Lookup(ri_inputs, ctx_size, [k_dim, embed_dim], embed_init, name="lookup") self.embeddings = feature_lookup var_reg.append(feature_lookup.weights) feature_lookup = feature_lookup.as_concat() # =========================================================== last_layer = feature_lookup h_layers = [] for i in range(num_h): h_i = tx.Linear(last_layer, h_dim, h_init, bias=True, name="h_{i}_linear".format(i=i)) h_a = tx.Activation(h_i, h_activation) h = tx.Compose(h_i, h_a, name="h_{i}".format(i=i)) h_layers.append(h) last_layer = h var_reg.append(h_i.weights) self.h_layers = h_layers # feature prediction for Energy-Based Model f_prediction = tx.Linear(last_layer, embed_dim, f_init, bias=True, name="f_predict") var_reg.append(f_prediction.weights) # RI DECODING =============================================== # Shared Embeddings if embed_share: shared_weights = feature_lookup.weights if embed_share else None logit_init = logit_init if not embed_share else None # ri_dense = tx.ToDense(ri_layer) all_embeddings = tx.Linear(ri_layer, embed_dim, logit_init, shared_weights, name="all_features", bias=False) # dot product of f_predicted . 
all_embeddings with bias for each target word run_logits = tx.Linear(f_prediction, vocab_size, shared_weights=all_embeddings.tensor, transpose_weights=True, bias=logit_bias, name="logits") else: run_logits = tx.Linear(f_prediction, vocab_size, bias=logit_bias, name="logits") if not embed_share: var_reg.append(run_logits.weights) # =========================================================== embed_prob = tx.Activation(run_logits, tx.softmax, name="run_output") # =============================================== # TRAIN GRAPH # =============================================== with tf.name_scope("train"): if use_dropout and embed_dropout: feature_lookup = feature_lookup.reuse_with(ri_inputs) last_layer = tx.Dropout(feature_lookup, probability=keep_prob) else: last_layer = feature_lookup # add dropout between each layer for layer in h_layers: h = layer.reuse_with(last_layer) if use_dropout: h = tx.Dropout(h, probability=keep_prob) last_layer = h f_prediction = f_prediction.reuse_with(last_layer) train_logits = run_logits.reuse_with(f_prediction, name="train_logits") train_embed_prob = tx.Activation(train_logits, tx.softmax, name="train_output") if use_nce: # labels labels = loss_inputs.tensor # convert labels to random indices def labels_to_ri(x): random_index_tensor = ri_tensor.gather(x) sp_features = random_index_tensor.to_sparse_tensor() return sp_features model_prediction = f_prediction.tensor train_loss = tx.sparse_cnce_loss( label_features=labels, model_prediction=model_prediction, weights=feature_lookup.weights, noise_ratio=noise_level, num_samples=nce_samples, labels_to_sparse_features=labels_to_ri) else: one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor, num_cols=vocab_size) train_loss = tx.categorical_cross_entropy( one_hot, train_logits.tensor) train_loss = tf.reduce_mean(train_loss) if l2_loss: losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = train_loss + l2_loss_coef * tf.add_n(losses) # =============================================== # EVAL GRAPH # =============================================== with tf.name_scope("eval"): one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor, num_cols=vocab_size) eval_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor) eval_loss = tf.reduce_mean(eval_loss) # BUILD MODEL super().__init__(run_inputs=run_inputs, run_outputs=embed_prob, train_inputs=run_inputs, train_outputs=train_embed_prob, eval_inputs=run_inputs, eval_outputs=embed_prob, train_out_loss=train_loss, train_in_loss=loss_inputs, eval_out_score=eval_loss, eval_in_score=eval_inputs)
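# How the ri_tensor argument to the model above is produced in the training script (see
# run() earlier in this section): k-dimensional random indexes with s_active non-zero
# entries are pre-generated for the whole vocabulary and packed into a sparse tensor
# value (sizes below are placeholders; `vocab` is the marisa_trie vocabulary):
k_dim, s_active = 1000, 8
ri_generator = Generator(dim=k_dim, num_active=s_active, symmetric=True)
ris = [ri_generator.generate() for _ in range(len(vocab))]
ri_tensor = ris_to_sp_tensor_value(ris, dim=k_dim)
# alternatively, as a RandomIndexTensor:
# ri_tensor = RandomIndexTensor.from_ri_list(ris, k_dim, s_active)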