Example #1
        data = chunk_fn(data)

    if shuffle:
        data = shuffle_it(data, args.shuffle_buffer_size)

    data = batch_it(data, size=batch_size, padding=False)
    return data


# ======================================================================================
# MODEL
# ======================================================================================
# Activation functions
if args.h_act == "relu":
    h_act = tx.relu
    h_init = tx.he_normal_init()
elif args.h_act == "tanh":
    h_act = tx.tanh
    h_init = tx.glorot_uniform()
elif args.h_act == "elu":
    h_act = tx.elu
    h_init = tx.he_normal_init()

# Parameter Init
if args.embed_init == "normal":
    embed_init = tx.random_normal(mean=0., stddev=args.embed_init_val)
elif args.embed_init == "uniform":
    embed_init = tx.random_uniform(minval=-args.embed_init_val,
                                   maxval=args.embed_init_val)

if args.logit_init == "normal":
Example #2
def run(**kwargs):
    arg_dict.from_dict(kwargs)
    args = arg_dict.to_namespace()

    # ======================================================================================
    # Load Corpus & Vocab
    # ======================================================================================
    corpus = PTBReader(path=args.corpus, mark_eos=args.mark_eos)
    corpus_stats = h5py.File(os.path.join(args.corpus, "ptb_stats.hdf5"),
                             mode='r')
    ri_generator = Generator(dim=args.k_dim,
                             num_active=args.s_active,
                             symmetric=True)
    # vocab = marisa_trie.Trie(corpus_stats["vocabulary"])

    index = TrieSignIndex(generator=ri_generator,
                          vocabulary=corpus_stats["vocabulary"],
                          pregen_indexes=True)

    # for i in range(1000):
    #    w = index.get_sign(i)
    #    ri: RandomIndex = index.get_ri(w)
    #    print(w)
    #    print(ri)
    #    print(ri)
    #    print(index.get_id(w))

    # pre-gen indices for vocab, we could do this iteratively ... same thing
    # ris = [ri_generator.generate() for _ in range(len(vocab))]
    # print(vocab.keys())

    # index = TrieSignIndex(generator=ri_generator,vocabulary=vocab)

    # TODO we could create the NRP model with NCE only; inputs with random indices could be passed to the model
    # dynamically, also for inference and evaluation. We could either work with a dynamic encoding process
    # or give it the current ri tensor with all the known ris, if we know there are no OOV words (words that
    # might not have been seen during training).

    # table with random indices for all known symbols
    # ri_tensor = RandomIndexTensor.from_ri_list(ris, k=args.k_dim, s=args.s_active)

    def corpus_pipeline(corpus_stream,
                        n_gram_size=args.ngram_size,
                        epochs=1,
                        batch_size=args.batch_size,
                        shuffle=args.shuffle,
                        flatten=False):
        """ Corpus Processing Pipeline.

        Transforms the corpus reader -a stream of sentences or words- into a stream of n-gram batches.

        Args:
            n_gram_size: the size of the n-gram window
            corpus_stream: the stream of sentences of words
            epochs: number of epochs we want to iterate over this corpus
            batch_size: batch size for the n-gram batch
            shuffle: if true, shuffles the n-grams according to a buffer size
            flatten: if true sliding windows are applied over a stream of words rather than within each sentence
            (n-grams can cross sentence boundaries)
        """

        if flatten:
            word_it = flatten_it(corpus_stream)
            n_grams = window_it(word_it, n_gram_size)
        else:
            sentence_n_grams = (window_it(sentence, n_gram_size)
                                for sentence in corpus_stream)
            n_grams = flatten_it(sentence_n_grams)

        # at this point this is an n_gram iterator
        # n_grams = ([vocab[w] for w in ngram] for ngram in n_grams)
        n_grams = ([index.get_id(w) for w in ngram] for ngram in n_grams)

        if epochs > 1:
            n_grams = repeat_it(n_grams, epochs)

        if shuffle:
            n_grams = shuffle_it(n_grams, args.shuffle_buffer_size)

        n_grams = batch_it(n_grams, size=batch_size, padding=False)
        return n_grams
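    # For illustration (assuming window_it yields sliding windows and index.get_id maps a word to its id),
    # with n_gram_size=3 a sentence ["the", "cat", "sat", "still"] becomes the id n-grams
    #   [id(the), id(cat), id(sat)] and [id(cat), id(sat), id(still)],
    # which batch_it then groups into lists of batch_size n-grams.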

    # print("counting dataset samples...")
    training_len = sum(1 for _ in corpus_pipeline(
        corpus.training_set(), batch_size=1, epochs=1, shuffle=False))
    validation_len = None
    test_len = None
    if args.eval_progress:
        validation_len = sum(1 for _ in corpus_pipeline(
            corpus.validation_set(), batch_size=1, epochs=1, shuffle=False))
        test_len = sum(1 for _ in corpus_pipeline(
            corpus.test_set(), batch_size=1, epochs=1, shuffle=False))
    # print("done")
    # print("dset len ", training_len)

    # ======================================================================================
    # Load Params, Prepare results assets
    # ======================================================================================
    # Experiment parameter summary
    res_param_filename = os.path.join(
        args.out_dir, "params_{id}_{run}.csv".format(id=args.id, run=args.run))
    with open(res_param_filename, "w") as param_file:
        writer = csv.DictWriter(f=param_file, fieldnames=arg_dict.keys())
        writer.writeheader()
        writer.writerow(arg_dict)
        param_file.flush()

    # make dir for model checkpoints
    if args.save_model:
        model_ckpt_dir = os.path.join(
            args.out_dir, "model_{id}_{run}".format(id=args.id, run=args.run))
        os.makedirs(model_ckpt_dir, exist_ok=True)
        model_path = os.path.join(
            model_ckpt_dir, "nnlm_{id}_{run}.ckpt".format(id=args.id,
                                                          run=args.run))

    # start perplexity file
    ppl_header = ["id", "run", "epoch", "step", "lr", "dataset", "perplexity"]
    ppl_fname = os.path.join(
        args.out_dir, "perplexity_{id}_{run}.csv".format(id=args.id,
                                                         run=args.run))

    ppl_file = open(ppl_fname, "w")
    ppl_writer = csv.DictWriter(f=ppl_file, fieldnames=ppl_header)
    ppl_writer.writeheader()

    # ======================================================================================
    # MODEL
    # ======================================================================================
    # Configure weight initializers based on activation functions
    if args.h_act == "relu":
        h_act = tx.relu
        h_init = tx.he_normal_init()
    elif args.h_act == "tanh":
        h_act = tx.tanh
        h_init = tx.glorot_uniform()
    elif args.h_act == "elu":
        h_act = tx.elu
        h_init = tx.he_normal_init()
    elif args.h_act == "selu":
        h_act = tf.nn.selu
        h_init = tx.glorot_uniform()

    # Configure embedding and logit weight initializers
    if args.embed_init == "normal":
        embed_init = tx.random_normal(mean=0., stddev=args.embed_init_val)
    elif args.embed_init == "uniform":
        embed_init = tx.random_uniform(minval=-args.embed_init_val,
                                       maxval=args.embed_init_val)

    if args.logit_init == "normal":
        logit_init = tx.random_normal(mean=0., stddev=args.logit_init_val)
    elif args.logit_init == "uniform":
        logit_init = tx.random_uniform(minval=-args.logit_init_val,
                                       maxval=args.logit_init_val)

    f_init = None
    if args.use_f_predict:
        if args.f_init == "normal":
            f_init = tx.random_normal(mean=0., stddev=args.f_init_val)
        elif args.f_init == "uniform":
            f_init = tx.random_uniform(minval=-args.f_init_val,
                                       maxval=args.f_init_val)

    model = NRP(ctx_size=args.ngram_size - 1,
                sign_index=index,
                k_dim=args.k_dim,
                s_active=args.s_active,
                embed_dim=args.embed_dim,
                h_dim=args.h_dim,
                embed_init=embed_init,
                logit_init=logit_init,
                num_h=args.num_h,
                h_activation=h_act,
                h_init=h_init,
                use_dropout=args.dropout,
                embed_dropout=args.embed_dropout,
                keep_prob=args.keep_prob,
                l2_loss=args.l2_loss,
                l2_loss_coef=args.l2_loss_coef,
                f_init=f_init,
                embed_share=args.embed_share,
                logit_bias=args.logit_bias,
                use_nce=args.nce,
                nce_samples=args.nce_samples,
                nce_noise_amount=0.04)

    model_runner = tx.ModelRunner(model)

    # Input params can be changed during training by setting their value
    # lr_param = tx.InputParam(init_value=args.lr)
    lr_param = tensorx.train.EvalStepDecayParam(
        value=args.lr,
        improvement_threshold=args.eval_threshold,
        less_is_better=True,
        decay_rate=args.lr_decay_rate,
        decay_threshold=args.lr_decay_threshold)
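    # lr_param is updated from the evaluation loop below (lr_param.update(current_eval)); with
    # less_is_better=True it should decay the learning rate by decay_rate whenever validation
    # perplexity fails to improve by more than improvement_threshold.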
    if args.optimizer == "sgd":
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=lr_param.tensor)
    elif args.optimizer == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=lr_param.tensor,
                                           beta1=args.optimizer_beta1,
                                           beta2=args.optimizer_beta2,
                                           epsilon=args.optimizer_epsilon)
    elif args.optimizer == "ams":
        optimizer = tx.AMSGrad(learning_rate=lr_param.tensor,
                               beta1=args.optimizer_beta1,
                               beta2=args.optimizer_beta2,
                               epsilon=args.optimizer_epsilon)

    def clip_grad_global(grads):
        grads, _ = tf.clip_by_global_norm(grads, 12)
        return grads

    def clip_grad_local(grad):
        return tf.clip_by_norm(grad, args.clip_value)
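    # clip_grad_global rescales all gradients jointly so that their global norm is at most 12,
    # while clip_grad_local clips each gradient tensor independently to args.clip_value.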

    if args.clip_grads:
        if args.clip_local:
            clip_fn = clip_grad_local
        else:
            clip_fn = clip_grad_global

    if args.clip_grads:
        model_runner.config_optimizer(optimizer,
                                      optimizer_params=lr_param,
                                      gradient_op=clip_fn,
                                      global_gradient_op=not args.clip_local)
    else:
        model_runner.config_optimizer(optimizer, optimizer_params=lr_param)

    # ======================================================================================
    # EVALUATION
    # ======================================================================================

    def eval_model(runner,
                   dataset_it,
                   len_dataset=None,
                   display_progress=False):
        if display_progress:
            pb = tqdm(total=len_dataset, ncols=60, position=1)
        batches_processed = 0
        sum_loss = 0
        for batch in dataset_it:
            batch = np.array(batch, dtype=np.int64)
            ctx = batch[:, :-1]
            target = batch[:, -1:]

            mean_loss = runner.eval(ctx, target)
            sum_loss += mean_loss

            if display_progress:
                pb.update(args.batch_size)
            batches_processed += 1

        if display_progress:
            pb.close()

        return np.exp(sum_loss / batches_processed)
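    # perplexity is the exponential of the average cross-entropy: PPL = exp(sum_loss / num_batches);
    # note that averaging per-batch mean losses is exact only when all batches have the same size.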

    def evaluation(runner: tx.ModelRunner,
                   progress_bar,
                   cur_epoch,
                   step,
                   display_progress=False):

        ppl_validation = eval_model(
            runner,
            corpus_pipeline(corpus.validation_set(), epochs=1, shuffle=False),
            validation_len, display_progress)
        res_row = {
            "id": args.id,
            "run": args.run,
            "epoch": cur_epoch,
            "step": step,
            "lr": lr_param.value,
            "dataset": "validation",
            "perplexity": ppl_validation
        }
        ppl_writer.writerow(res_row)

        if args.eval_test:
            # pb.write("[Eval Test Set]")

            ppl_test = eval_model(
                runner,
                corpus_pipeline(corpus.test_set(), epochs=1, shuffle=False),
                test_len, display_progress)

            res_row = {
                "id": args.id,
                "run": args.run,
                "epoch": cur_epoch,
                "step": step,
                "lr": lr_param.value,
                "dataset": "test",
                "perplexity": ppl_test
            }
            ppl_writer.writerow(res_row)

        ppl_file.flush()

        if args.eval_test:
            progress_bar.set_postfix({"test PPL ": ppl_test})

        # pb.write("valid. ppl = {}".format(ppl_validation))
        return ppl_validation

    # ======================================================================================
    # TRAINING LOOP
    # ======================================================================================
    # print("Starting TensorFlow Session")

    # preparing evaluation steps
    # I use ceil because I make sure we have padded batches at the end

    epoch_step = 0
    global_step = 0
    current_epoch = 0
    patience = 0

    cfg = tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    sess = tf.Session(config=cfg)
    model_runner.set_session(sess)
    model_runner.init_vars()

    progress = tqdm(total=training_len * args.epochs,
                    position=args.pid + 1,
                    disable=not args.display_progress)
    training_data = corpus_pipeline(corpus.training_set(),
                                    batch_size=args.batch_size,
                                    epochs=args.epochs,
                                    shuffle=args.shuffle)

    evaluations = []

    try:

        for ngram_batch in training_data:
            epoch = progress.n // training_len + 1
            # Start New Epoch
            if epoch != current_epoch:
                current_epoch = epoch
                epoch_step = 0
                if args.display_progress:
                    progress.set_postfix({"epoch": current_epoch})

            # ================================================
            # EVALUATION
            # ================================================
            if epoch_step == 0:
                current_eval = evaluation(model_runner,
                                          progress,
                                          epoch,
                                          global_step,
                                          display_progress=args.eval_progress)

                evaluations.append(current_eval)
                lr_param.update(current_eval)
                # print(lr_param.eval_history)
                # print("improvement ", lr_param.eval_improvement())

                if global_step > 0:
                    if args.early_stop and epoch > 1:
                        if lr_param.eval_improvement(
                        ) < lr_param.improvement_threshold:
                            if patience >= 3:
                                break
                            patience += 1
                        else:
                            patience = 0

            # ================================================
            # TRAIN MODEL
            # ================================================
            ngram_batch = np.array(ngram_batch, dtype=np.int64)
            ctx_ids = ngram_batch[:, :-1]
            word_ids = ngram_batch[:, -1:]

            model_runner.train(ctx_ids, word_ids)
            progress.update(args.batch_size)

            epoch_step += 1
            global_step += 1

        # if not early stop, evaluate last state of the model
        if not args.early_stop or patience < 3:
            current_eval = evaluation(model_runner, progress, epoch,
                                      epoch_step)
            evaluations.append(current_eval)
        ppl_file.close()

        if args.save_model:
            model_runner.save_model(model_name=model_path,
                                    step=global_step,
                                    write_state=False)

        model_runner.close_session()
        progress.close()
        tf.reset_default_graph()

        # return the best validation evaluation
        return min(evaluations)

    except Exception as e:
        traceback.print_exc()
        os.remove(ppl_file.name)
        os.remove(param_file.name)
        raise e
Example #3
def run(**kwargs):
    arg_dict.from_dict(kwargs)
    args = arg_dict.to_namespace()

    # ======================================================================================
    # Load Params, Prepare results assets
    # ======================================================================================
    # os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
    # print(args.corpus)

    # Experiment parameter summary
    res_param_filename = os.path.join(args.out_dir,
                                      "params_{id}.csv".format(id=args.run_id))
    with open(res_param_filename, "w") as param_file:
        writer = csv.DictWriter(f=param_file, fieldnames=arg_dict.keys())
        writer.writeheader()
        writer.writerow(arg_dict)
        param_file.flush()

    # make dir for model checkpoints
    if args.save_model:
        model_ckpt_dir = os.path.join(args.out_dir,
                                      "model_{id}".format(id=args.run_id))
        os.makedirs(model_ckpt_dir, exist_ok=True)
        model_path = os.path.join(model_ckpt_dir,
                                  "nnlm_{id}.ckpt".format(id=args.run_id))

    # start perplexity file
    ppl_header = ["id", "run", "epoch", "step", "lr", "dataset", "perplexity"]
    ppl_fname = os.path.join(args.out_dir,
                             "perplexity_{id}.csv".format(id=args.run_id))

    ppl_file = open(ppl_fname, "w")
    ppl_writer = csv.DictWriter(f=ppl_file, fieldnames=ppl_header)
    ppl_writer.writeheader()

    # ======================================================================================
    # CORPUS, Vocab and RIs
    # ======================================================================================
    corpus = h5py.File(os.path.join(args.corpus,
                                    "ptb_{}.hdf5".format(args.ngram_size)),
                       mode='r')
    vocab = marisa_trie.Trie(corpus["vocabulary"])

    # generates k-dimensional random indexes with s_active units
    all_positive = args.ri_all_positive
    ri_generator = Generator(dim=args.k_dim,
                             num_active=args.s_active,
                             symmetric=not all_positive)

    # pre-gen indices for vocab
    # it doesn't matter which ri gets assigned to which word since we are pre-generating the indexes
    ris = [ri_generator.generate() for i in range(len(vocab))]
    ri_tensor = ris_to_sp_tensor_value(ris, dim=args.k_dim)
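    # each random index is (presumably) a sparse k_dim-dimensional vector with s_active non-zero
    # entries: symmetric generators split them between +1 and -1, while ri_all_positive makes them
    # all positive; ris_to_sp_tensor_value packs them into a single sparse tensor value with one
    # row per vocabulary word.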

    # ri_tensor = RandomIndexTensor.from_ri_list(ris, args.k_dim, args.s_active)

    # ======================================================================================

    def data_pipeline(data,
                      epochs=1,
                      batch_size=args.batch_size,
                      shuffle=False):
        def chunk_fn(x):
            return chunk_it(x, chunk_size=batch_size * 1000)

        if epochs > 1:
            data = repeat_apply(chunk_fn, data, epochs)
        else:
            data = chunk_fn(data)

        if shuffle:
            data = shuffle_it(data, args.shuffle_buffer_size)

        data = batch_it(data, size=batch_size, padding=False)
        return data
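    # chunk_it (presumably) reads the HDF5 dataset in chunks of batch_size * 1000 rows and
    # shuffle_it then shuffles within a buffer of args.shuffle_buffer_size rows, so the corpus
    # never has to be fully loaded into memory.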

    # ======================================================================================
    # MODEL
    # ======================================================================================
    # Activation functions
    if args.h_act == "relu":
        h_act = tx.relu
        h_init = tx.he_normal_init()
    elif args.h_act == "tanh":
        h_act = tx.tanh
        h_init = tx.glorot_uniform()
    elif args.h_act == "elu":
        h_act = tx.elu
        h_init = tx.he_normal_init()

    # Parameter Init
    if args.embed_init == "normal":
        embed_init = tx.random_normal(mean=0., stddev=args.embed_init_val)
    elif args.embed_init == "uniform":
        embed_init = tx.random_uniform(minval=-args.embed_init_val,
                                       maxval=args.embed_init_val)

    if args.logit_init == "normal":
        logit_init = tx.random_normal(mean=0., stddev=args.logit_init_val)
    elif args.logit_init == "uniform":
        logit_init = tx.random_uniform(minval=-args.logit_init_val,
                                       maxval=args.logit_init_val)

    if args.f_init == "normal":
        f_init = tx.random_normal(mean=0., stddev=args.f_init_val)
    elif args.f_init == "uniform":
        f_init = tx.random_uniform(minval=-args.f_init_val,
                                   maxval=args.f_init_val)

    # sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
    #                                        log_device_placement=True))
    # with tf.device('/gpu:{}'.format(args.gpu)):

    model = NNLM_NRP(ctx_size=args.ngram_size - 1,
                     vocab_size=len(vocab),
                     k_dim=args.k_dim,
                     s_active=args.s_active,
                     ri_tensor=ri_tensor,
                     embed_dim=args.embed_dim,
                     embed_init=embed_init,
                     embed_share=args.embed_share,
                     logit_init=logit_init,
                     logit_bias=args.logit_bias,
                     h_dim=args.h_dim,
                     num_h=args.num_h,
                     h_activation=h_act,
                     h_init=h_init,
                     use_dropout=args.dropout,
                     keep_prob=args.keep_prob,
                     embed_dropout=args.embed_dropout,
                     l2_loss=args.l2_loss,
                     l2_loss_coef=args.l2_loss_coef,
                     f_init=f_init)

    model_runner = tx.ModelRunner(model)

    # sess = tf.Session(config=tf.ConfigProto(
    #      allow_soft_placement=True, log_device_placement=True))
    # model_runner.set_session(sess)

    # sess = tf.Session(config=tf.ConfigProto(
    #    allow_soft_placement=True, log_device_placement=True))
    # model_runner.set_session(sess)

    # we use an InputParam because we might want to change it during training
    lr_param = tx.InputParam(value=args.lr)
    if args.optimizer == "sgd":
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=lr_param.tensor)
    elif args.optimizer == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=lr_param.tensor,
                                           beta1=args.optimizer_beta1,
                                           beta2=args.optimizer_beta2,
                                           epsilon=args.optimizer_epsilon)
    elif args.optimizer == "ams":
        optimizer = tx.AMSGrad(learning_rate=lr_param.tensor,
                               beta1=args.optimizer_beta1,
                               beta2=args.optimizer_beta2,
                               epsilon=args.optimizer_epsilon)

    def clip_grad_global(grads):
        grads, _ = tf.clip_by_global_norm(grads, 12)
        return grads

    def clip_grad_local(grad):
        return tf.clip_by_norm(grad, args.clip_value)

    if args.clip_grads:
        if args.clip_local:
            clip_fn = clip_grad_local
        else:
            clip_fn = clip_grad_global

    if args.clip_grads:
        model_runner.config_optimizer(optimizer,
                                      optimizer_params=lr_param,
                                      gradient_op=clip_fn,
                                      global_gradient_op=not args.clip_local)
    else:
        model_runner.config_optimizer(optimizer, optimizer_params=lr_param)

    # assert(model_runner.session == sess)
    # ======================================================================================
    # EVALUATION
    # ======================================================================================

    def eval_model(runner,
                   dataset_it,
                   len_dataset=None,
                   display_progress=False):
        if display_progress:
            pb = tqdm(total=len_dataset, ncols=60)
        batches_processed = 0
        sum_loss = 0
        for batch in dataset_it:
            batch = np.array(batch, dtype=np.int64)
            ctx = batch[:, :-1]
            target = batch[:, -1:]

            mean_loss = runner.eval(ctx, target)
            sum_loss += mean_loss

            if display_progress:
                pb.update(args.batch_size)
            batches_processed += 1

        if display_progress:
            pb.close()

        return np.exp(sum_loss / batches_processed)

    def evaluation(runner: tx.ModelRunner,
                   pb,
                   cur_epoch,
                   step,
                   display_progress=False):
        pb.write("[Eval Validation]")

        val_data = corpus["validation"]
        ppl_validation = eval_model(
            runner, data_pipeline(val_data, epochs=1, shuffle=False),
            len(val_data), display_progress)
        res_row = {
            "id": args.id,
            "run": args.run,
            "epoch": cur_epoch,
            "step": step,
            "lr": lr_param.value,
            "dataset": "validation",
            "perplexity": ppl_validation
        }
        ppl_writer.writerow(res_row)

        pb.write("Eval Test")
        test_data = corpus["test"]
        ppl_test = eval_model(
            runner, data_pipeline(test_data, epochs=1, shuffle=False),
            len(test_data), display_progress)

        res_row = {
            "id": args.id,
            "run": args.run,
            "epoch": cur_epoch,
            "step": step,
            "lr": lr_param.value,
            "dataset": "test",
            "perplexity": ppl_test
        }
        ppl_writer.writerow(res_row)

        ppl_file.flush()

        pb.write("valid. ppl = {} \n test ppl {}".format(
            ppl_validation, ppl_test))
        return ppl_validation

    # ======================================================================================
    # TRAINING LOOP
    # ======================================================================================
    # preparing evaluation steps
    # I use ceil because I make sure we have padded batches at the end

    epoch_step = 0
    global_step = 0
    current_epoch = 0
    patience = 0

    cfg = tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    sess = tf.Session(config=cfg)
    model_runner.set_session(sess)
    model_runner.init_vars()

    training_dset = corpus["training"]
    progress = tqdm(total=len(training_dset) * args.epochs)
    training_data = data_pipeline(training_dset,
                                  epochs=args.epochs,
                                  shuffle=True)

    evals = []
    try:
        for ngram_batch in training_data:
            epoch = progress.n // len(training_dset) + 1
            # Start New Epoch
            if epoch != current_epoch:
                current_epoch = epoch
                epoch_step = 0
                progress.write("epoch: {}".format(current_epoch))

            # Eval Time
            if epoch_step == 0:
                current_eval = evaluation(model_runner, progress, epoch,
                                          global_step)
                evals.append(current_eval)

                if global_step > 0:
                    if args.early_stop:
                        if evals[-2] - evals[-1] < args.eval_threshold:
                            if patience >= 3:
                                progress.write("early stop")
                                break
                            patience += 1
                        else:
                            patience = 0

                    # lr decay only at the start of each epoch
                    if args.lr_decay and len(evals) > 0:
                        if evals[-2] - evals[-1] < args.eval_threshold:
                            lr_param.value = max(
                                lr_param.value * args.lr_decay_rate,
                                args.lr_decay_threshold)
                            progress.write("lr changed to {}".format(
                                lr_param.value))

            # ================================================
            # TRAIN MODEL
            # ================================================
            ngram_batch = np.array(ngram_batch, dtype=np.int64)
            ctx_ids = ngram_batch[:, :-1]
            word_ids = ngram_batch[:, -1:]

            model_runner.train(ctx_ids, word_ids)
            progress.update(args.batch_size)

            epoch_step += 1
            global_step += 1

        # if not early stop, evaluate last state of the model
        if not args.early_stop or patience < 3:
            evaluation(model_runner, progress, epoch, epoch_step)
        ppl_file.close()

        if args.save_model:
            model_runner.save_model(model_name=model_path,
                                    step=global_step,
                                    write_state=False)

        model_runner.close_session()
        progress.close()
        tf.reset_default_graph()

    except Exception as e:
        traceback.print_exc()
        os.remove(ppl_file.name)
        os.remove(param_file.name)
        raise e
Example #4
    def __init__(self,
                 ctx_size,
                 vocab_size,
                 embed_dim,
                 embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=True,
                 use_gate=True,
                 use_hidden=False,
                 h_dim=100,
                 h_activation=tx.elu,
                 h_init=tx.he_normal_init(),
                 h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 use_dropout=True,
                 embed_dropout=False,
                 keep_prob=0.95,
                 l2_loss=False,
                 l2_loss_coef=1e-5,
                 use_nce=False,
                 nce_samples=100):

        # GRAPH INPUTS
        run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input")
        loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target")
        eval_inputs = loss_inputs

        # RUN GRAPH
        # if I create a scope here, the TensorBoard graph becomes a mess to read
        # because it groups everything by nested scope names;
        # instead, creating separate scopes only for train and eval keeps the
        # graph readable, because it lets us use the same names under
        # different scopes while still sharing variables
        var_reg = []
        with tf.name_scope("run"):
            feature_lookup = tx.Lookup(run_inputs,
                                       ctx_size, [vocab_size, embed_dim],
                                       embed_init,
                                       name="lookup")
            var_reg.append(feature_lookup.weights)
            feature_lookup = feature_lookup.as_concat()

            if use_gate or use_hidden:
                hl = tx.Linear(feature_lookup,
                               h_dim,
                               h_init,
                               bias=True,
                               name="h_linear")
                ha = tx.Activation(hl, h_activation, name="h_activation")
                h = tx.Compose(hl, ha, name="hidden")
                var_reg.append(hl.weights)

            features = feature_lookup
            if use_gate:
                gate_w = tx.Linear(h, ctx_size, bias=True)
                gate = tx.Gate(features, gate_input=gate_w)

                # gate = tx.Module([h, features], gate)

                features = gate
                var_reg.append(gate_w.weights)

            x_to_f = tx.Linear(features,
                               embed_dim,
                               x_to_f_init,
                               bias=True,
                               name="x_to_f")
            var_reg.append(x_to_f.weights)
            f_prediction = x_to_f

            if use_hidden:
                h_to_f = tx.Linear(h,
                                   embed_dim,
                                   h_to_f_init,
                                   bias=True,
                                   name="h_to_f")
                var_reg.append(h_to_f.weights)
                f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted")

            # RI DECODING ===============================================
            shared_weights = tf.transpose(
                feature_lookup.weights) if embed_share else None
            logit_init = logit_init if not embed_share else None
            run_logits = tx.Linear(f_prediction,
                                   vocab_size,
                                   logit_init,
                                   shared_weights,
                                   bias=True,
                                   name="logits")
            if not embed_share:
                var_reg.append(run_logits.weights)
            y_prob = tx.Activation(run_logits, tx.softmax)

        # TRAIN GRAPH ===============================================
        with tf.name_scope("train"):
            if use_dropout and embed_dropout:
                feature_lookup = feature_lookup.reuse_with(run_inputs)
                features = tx.Dropout(feature_lookup, probability=keep_prob)
            else:
                features = feature_lookup

            if use_gate or use_hidden:
                if use_dropout:
                    h = h.reuse_with(features)
                    h = tx.Dropout(h, probability=keep_prob)

                if use_gate:
                    gate_w = gate_w.reuse_with(h)
                    features = gate.reuse_with(layer=features,
                                               gate_input=gate_w)

                f_prediction = x_to_f.reuse_with(features)

                if use_hidden:
                    h_to_f = h_to_f.reuse_with(h)
                    if use_dropout:
                        h_to_f = tx.Dropout(h_to_f, probability=keep_prob)
                    f_prediction = tx.Add(f_prediction, h_to_f)
            else:
                f_prediction = f_prediction.reuse_with(features)

            train_logits = run_logits.reuse_with(f_prediction)

            if use_nce:
                # uniform gets good enough results if enough samples are used
                # but we can load the empirical unigram distribution
                # or learn the unigram distribution during training
                sampled_values = uniform_sampler(loss_inputs.tensor, 1,
                                                 nce_samples, True, vocab_size)
                train_loss = tf.nn.nce_loss(weights=tf.transpose(
                    train_logits.weights),
                                            biases=train_logits.bias,
                                            inputs=f_prediction.tensor,
                                            labels=loss_inputs.tensor,
                                            num_sampled=nce_samples,
                                            num_classes=vocab_size,
                                            num_true=1,
                                            sampled_values=sampled_values)
            else:
                one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor,
                                           num_cols=vocab_size)
                train_loss = tx.categorical_cross_entropy(
                    one_hot, train_logits.tensor)

            train_loss = tf.reduce_mean(train_loss)

            if l2_loss:
                losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

        # EVAL GRAPH ===============================================
        with tf.name_scope("eval"):
            one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor,
                                       num_cols=vocab_size)
            eval_loss = tx.categorical_cross_entropy(one_hot,
                                                     run_logits.tensor)
            eval_loss = tf.reduce_mean(eval_loss)

        # SETUP MODEL CONTAINER ====================================
        super().__init__(run_inputs=run_inputs,
                         run_outputs=y_prob,
                         train_inputs=run_inputs,
                         train_outputs=y_prob,
                         eval_inputs=run_inputs,
                         eval_outputs=y_prob,
                         train_out_loss=train_loss,
                         train_in_loss=loss_inputs,
                         eval_out_score=eval_loss,
                         eval_in_score=eval_inputs)
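A minimal usage sketch for this model class (the class name NNLMGate below is hypothetical, since the example only shows __init__; the ModelRunner calls mirror the run() functions in the other examples):

import numpy as np
import tensorflow as tf
import tensorx as tx

# hypothetical constructor call for the class defined above
model = NNLMGate(ctx_size=3, vocab_size=10000, embed_dim=128, h_dim=256)
runner = tx.ModelRunner(model)

# learning rate as an input param so it can be changed during training
lr_param = tx.InputParam(value=0.05)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr_param.tensor)
runner.config_optimizer(optimizer, optimizer_params=lr_param)

sess = tf.Session()
runner.set_session(sess)
runner.init_vars()

# a batch of 4-grams: first 3 columns are context ids, last column is the target id
ngrams = np.array([[1, 2, 3, 4], [2, 3, 4, 5]], dtype=np.int64)
ctx_ids, word_ids = ngrams[:, :-1], ngrams[:, -1:]

runner.train(ctx_ids, word_ids)             # one training step
mean_loss = runner.eval(ctx_ids, word_ids)  # mean cross-entropy for evaluation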
Example #5
def run(**kwargs):
    arg_dict.from_dict(kwargs)
    args = arg_dict.to_namespace()

    # ======================================================================================
    # Load Corpus & Vocab
    # ======================================================================================
    corpus = PTBReader(path=args.corpus, mark_eos=args.mark_eos)
    corpus_stats = h5py.File(os.path.join(args.corpus, "ptb_stats.hdf5"), mode='r')
    vocab = marisa_trie.Trie(corpus_stats["vocabulary"])

    to_ngrams_batch = partial(to_ngrams,
                              vocab=vocab,
                              ngram_size=args.ngram_size,
                              batch_size=args.batch_size,
                              epochs=1,
                              shuffle=False,
                              shuffle_buffer_size=args.shuffle_buffer_size,
                              enum_epoch=False)

    training_len = sum(1 for _ in to_ngrams_batch(corpus.training_set, batch_size=1))

    validation_len = None
    test_len = None
    if args.eval_progress:
        validation_len = sum(1 for _ in to_ngrams_batch(corpus.validation_set, batch_size=1))
        test_len = sum(1 for _ in to_ngrams_batch(corpus.test_set, batch_size=1))

    # ======================================================================================
    # Load Params, Prepare results assets
    # ======================================================================================
    # Experiment parameter summary
    res_param_filename = os.path.join(args.out_dir, "params_{id}_{run}.csv".format(id=args.id, run=args.run))
    with open(res_param_filename, "w") as param_file:
        writer = csv.DictWriter(f=param_file, fieldnames=arg_dict.keys())
        writer.writeheader()
        writer.writerow(arg_dict)
        param_file.flush()

    # make dir for model checkpoints
    if args.save_model:
        model_ckpt_dir = os.path.join(args.out_dir, "model_{id}_{run}".format(id=args.id, run=args.run))
        os.makedirs(model_ckpt_dir, exist_ok=True)
        model_path = os.path.join(model_ckpt_dir, "nnlm_{id}_{run}.ckpt".format(id=args.id, run=args.run))

    # start perplexity file
    ppl_header = ["id", "run", "epoch", "step", "lr", "dataset", "perplexity"]
    ppl_filename = os.path.join(args.out_dir, "perplexity_{id}_{run}.csv".format(id=args.id, run=args.run))

    ppl_file = open(ppl_filename, "w")
    ppl_writer = csv.DictWriter(f=ppl_file, fieldnames=ppl_header)
    ppl_writer.writeheader()

    # ======================================================================================
    # MODEL
    # ======================================================================================
    # Configure weight initializers based on activation functions
    if args.h_act == "relu":
        h_act = tx.relu
        h_init = tx.he_normal_init()
    elif args.h_act == "tanh":
        h_act = tx.tanh
        h_init = tx.glorot_uniform()
    elif args.h_act == "elu":
        h_act = tx.elu
        h_init = tx.he_normal_init()
    elif args.h_act == "selu":
        h_act = tf.nn.selu
        h_init = tx.glorot_uniform()

    # Configure embedding and logit weight initializers
    if args.embed_init == "normal":
        embed_init = tx.random_normal(mean=0.,
                                      stddev=args.embed_init_val)
    elif args.embed_init == "uniform":
        embed_init = tx.random_uniform(minval=-args.embed_init_val,
                                       maxval=args.embed_init_val)

    if args.logit_init == "normal":
        logit_init = tx.random_normal(mean=0.,
                                      stddev=args.logit_init_val)
    elif args.logit_init == "uniform":
        logit_init = tx.random_uniform(minval=-args.logit_init_val,
                                       maxval=args.logit_init_val)

    f_init = None
    if args.use_f_predict:
        if args.f_init == "normal":
            f_init = tx.random_normal(mean=0., stddev=args.f_init_val)
        elif args.f_init == "uniform":
            f_init = tx.random_uniform(minval=-args.f_init_val, maxval=args.f_init_val)

    inputs = tx.Input(args.ngram_size - 1, dtype=tf.int64, name="ctx_inputs")
    labels = tx.Input(1, dtype=tf.int64, name="labels")
    model = NNLM(inputs=inputs,
                 label_inputs=labels,
                 vocab_size=len(vocab),
                 embed_dim=args.embed_dim,
                 embed_init=embed_init,
                 embed_share=args.embed_share,
                 logit_init=logit_init,
                 h_dim=args.h_dim,
                 num_h=args.num_h,
                 h_activation=h_act,
                 h_init=h_init,
                 use_dropout=args.dropout,
                 drop_probability=args.drop_probability,
                 embed_dropout=args.embed_dropout,
                 l2_loss=args.l2_loss,
                 l2_weight=args.l2_loss_coef,
                 use_f_predict=args.use_f_predict,
                 f_init=f_init,
                 logit_bias=args.logit_bias,
                 use_nce=False)

    # Input params can be changed during training by setting their value
    # lr_param = tx.InputParam(init_value=args.lr)
    lr_param = tensorx.train.EvalStepDecayParam(value=args.lr,
                                                improvement_threshold=args.eval_threshold,
                                                less_is_better=True,
                                                decay_rate=args.lr_decay_rate,
                                                decay_threshold=args.lr_decay_threshold)
    if args.optimizer == "sgd":
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr_param.tensor)
    elif args.optimizer == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=lr_param.tensor,
                                           beta1=args.optimizer_beta1,
                                           beta2=args.optimizer_beta2,
                                           epsilon=args.optimizer_epsilon)
    elif args.optimizer == "ams":
        optimizer = tx.AMSGrad(learning_rate=lr_param.tensor,
                               beta1=args.optimizer_beta1,
                               beta2=args.optimizer_beta2,
                               epsilon=args.optimizer_epsilon)

    def clip_grad_global(grads):
        grads, _ = tf.clip_by_global_norm(grads, 12)
        return grads

    def clip_grad_local(grad):
        return tf.clip_by_norm(grad, args.clip_value)

    if args.clip_grads:
        if args.clip_local:
            clip_fn = clip_grad_local
        else:
            clip_fn = clip_grad_global

    if args.clip_grads:
        model.config_optimizer(optimizer, optimizer_params=lr_param,
                               gradient_op=clip_fn,
                               global_gradient_op=not args.clip_local)
    else:
        model.config_optimizer(optimizer, optimizer_params=lr_param)

    # ======================================================================================
    # EVALUATION
    # ======================================================================================

    def eval_model(model, dataset_it, len_dataset=None, display_progress=False):
        if display_progress:
            pb = tqdm(total=len_dataset, ncols=60, position=1)
        batches_processed = 0
        sum_loss = 0
        for batch in dataset_it:
            batch = np.array(batch, dtype=np.int64)
            ctx = batch[:, :-1]
            target = batch[:, -1:]

            mean_loss = model.eval({inputs: ctx, labels: target})
            sum_loss += mean_loss

            if display_progress:
                pb.update(args.batch_size)
            batches_processed += 1

        if display_progress:
            pb.close()

        return np.exp(sum_loss / batches_processed)

    def evaluation(model: tx.Model, progress_bar, cur_epoch, step, display_progress=False):

        ppl_validation = eval_model(model,
                                    to_ngrams_batch(corpus.validation_set),
                                    validation_len,
                                    display_progress)
        res_row = {"id": args.id, "run": args.run, "epoch": cur_epoch, "step": step, "lr": lr_param.value,
                   "dataset": "validation",
                   "perplexity": ppl_validation}
        ppl_writer.writerow(res_row)

        if args.eval_test:
            # pb.write("[Eval Test Set]")
            ppl_test = eval_model(model, to_ngrams_batch(corpus.test_set), test_len, display_progress)

            res_row = {"id": args.id, "run": args.run, "epoch": cur_epoch, "step": step, "lr": lr_param.value,
                       "dataset": "test",
                       "perplexity": ppl_test}
            ppl_writer.writerow(res_row)

        ppl_file.flush()

        if args.eval_test:
            progress_bar.set_postfix({"test PPL ": ppl_test})

        # pb.write("valid. ppl = {}".format(ppl_validation))
        return ppl_validation

    # ======================================================================================
    # TRAINING LOOP
    # ======================================================================================
    # print("Starting TensorFlow Session")

    # preparing evaluation steps
    # I use ceil because I make sure we have padded batches at the end

    epoch_step = 0
    global_step = 0
    current_epoch = 0
    patience = 0

    cfg = tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    sess = tf.Session(config=cfg)
    model.set_session(sess)
    model.init_vars()

    progress = tqdm(total=training_len * args.epochs, position=args.pid + 1, disable=not args.display_progress)

    training_data = to_ngrams_batch(corpus.training_set,
                                    epochs=args.epochs,
                                    shuffle=args.shuffle,
                                    enum_epoch=True)

    evaluations = []

    try:

        for i, ngram_batch in training_data:
            epoch = i + 1
            # Start New Epoch
            if epoch != current_epoch:
                current_epoch = epoch
                epoch_step = 0
                if args.display_progress:
                    progress.set_postfix({"epoch": current_epoch})

            # ================================================
            # EVALUATION
            # ================================================
            if epoch_step == 0:
                current_eval = evaluation(model, progress, epoch, global_step,
                                          display_progress=args.eval_progress)

                evaluations.append(current_eval)
                lr_param.update(current_eval)
                # print(lr_param.eval_history)
                # print("improvement ", lr_param.eval_improvement())

                if global_step > 0:
                    if args.early_stop and epoch > 1:
                        if lr_param.eval_improvement() < lr_param.improvement_threshold:
                            if patience >= 3:
                                break
                            patience += 1
                        else:
                            patience = 0

            # ================================================
            # TRAIN MODEL
            # ================================================
            ngram_batch = np.array(ngram_batch, dtype=np.int64)
            ctx_ids = ngram_batch[:, :-1]
            word_ids = ngram_batch[:, -1:]

            model.train({inputs: ctx_ids, labels: word_ids})
            progress.update(args.batch_size)

            epoch_step += 1
            global_step += 1

        # if not early stop, evaluate last state of the model
        if not args.early_stop or patience < 3:
            current_eval = evaluation(model, progress, epoch, epoch_step)
            evaluations.append(current_eval)
        ppl_file.close()

        if args.save_model:
            model.save_model(model_name=model_path, step=global_step, write_state=False)

        model.close_session()
        progress.close()
        tf.reset_default_graph()

        # return the best validation evaluation
        return min(evaluations)

    except Exception as e:
        traceback.print_exc()
        os.remove(ppl_file.name)
        os.remove(param_file.name)
        raise e
Example #6
    def __init__(self,
                 inputs,
                 labels,
                 vocab_size,
                 embed_dim,
                 h_dim,
                 embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 num_h=1,
                 h_activation=tx.tanh,
                 h_init=tx.he_normal_init(),
                 reset_state=True,
                 embed_dropout=False,
                 w_dropout=False,
                 u_dropconnect=False,
                 other_dropout=False,
                 w_keep_prob=0.9,
                 u_keep_prob=0.9,
                 embed_keep_prob=0.9,
                 other_keep_prob=0.9,
                 l2_loss=False,
                 l2_weight=1e-5,
                 use_f_predict=False,
                 f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=False,
                 logit_bias=False,
                 use_nce=False,
                 nce_samples=10,
                 ):
        if not isinstance(inputs, tx.Input):
            raise TypeError("inputs must be an Input layer")
        self.inputs = inputs
        self.labels = labels
        if not isinstance(labels, tx.Input):
            raise TypeError("labels must be an Input layer")

        if inputs.dtype != tf.int32 and inputs.dtype != tf.int64:
            raise TypeError("Invalid dtype for input: expected int32 or int64, got {}".format(inputs.dtype))

        if num_h < 0:
            raise ValueError("num hidden should be >= 0")

        ctx_size = inputs.n_units
        # ===============================================
        # RUN GRAPH
        # ===============================================
        var_reg = []

        with tf.name_scope("run"):
            # feature lookup
            embeddings = tx.Lookup(inputs, ctx_size, [vocab_size, embed_dim], weight_init=embed_init)
            var_reg.append(embeddings.weights)
            feature_lookup = embeddings.permute_batch_time()

            last_layer = feature_lookup
            last_feature_layer = feature_lookup


            for i in range(num_h):
                h_i = tx.QRNN(feature_lookup,
                              n_units=h_dim,
                              activation=h_activation,
                              filter_size=2  # value missing in the original snippet; 2 is an assumed placeholder
                              )


                last_layer = h_i
                # save the last state; it will be used as the state of the first cell

                var_reg += [wi.weights for wi in last_layer.w]
                var_reg += [ui.weights for ui in last_layer.u]

            if not reset_state:
                last_layer = zero_state.reuse_with(last_layer, name="cache_last_state")

            # feature prediction for Energy-Based Model
            if use_f_predict:
                last_layer = tx.Linear(last_layer, embed_dim, f_init, add_bias=True, name="f_predict")
                var_reg.append(last_layer.weights)
                f_predict = last_layer

            shared_weights = feature_lookup.weights if embed_share else None
            transpose_weights = embed_share
            logit_init = logit_init if not embed_share else None
            run_logits = tx.Linear(last_layer,
                                   n_units=vocab_size,
                                   weight_init=logit_init,
                                   shared_weights=shared_weights,
                                   transpose_weights=transpose_weights,
                                   add_bias=logit_bias,
                                   name="logits")

            if not embed_share:
                var_reg.append(run_logits.weights)

            run_output = tx.Activation(run_logits, tx.softmax, name="run_output")

        # ===============================================
        # TRAIN GRAPH
        # ===============================================
        with tf.name_scope("train"):
            embeddings = embeddings.reuse_with(inputs)
            feature_lookup = embeddings.as_seq()

            if other_dropout and embed_dropout:
                feature_lookup = tx.Dropout(feature_lookup, probability=embed_keep_prob, name="drop_features")

            # last_layer = last_layer.as_seq()

            # add dropout between each layer
            # for i, layer in enumerate(h_layers):
            cell = lstm_cells[0]

            for i in range(ctx_size):
                if i == 0:
                    h = cell.reuse_with(input_layer=feature_lookup[i],
                                        previous_state=None,  # copy from first cell
                                        previous_memory=None,  # copy from first cell
                                        regularized=w_dropout or u_dropconnect,
                                        name="lstm_cell_{}".format(i))

                else:
                    h = cell.reuse_with(input_layer=feature_lookup[i],
                                        previous_state=last_layer,
                                        name="lstm_cell_{}".format(i))

                cell = h
                # if use_dropout:
                #    h = tx.ZoneOut(h,
                #                   previous_layer=h.previous_state,
                #                   keep_prob=keep_prob,
                #                   name="zoneout_{}".format(i))
                last_layer = h
            if not reset_state:
                last_layer = zero_state.reuse_with(last_layer, name="cache_last_cell")

            # feature prediction for Energy-Based Model
            if use_f_predict:
                last_layer = f_predict.reuse_with(last_layer)

            train_logits = run_logits.reuse_with(last_layer, name="train_logits")
            train_output = tx.Activation(train_logits, tx.softmax, name="train_output")

            def categorical_loss(labels, logits):
                labels = tx.dense_one_hot(column_indices=labels, num_cols=vocab_size)
                loss = tx.categorical_cross_entropy(labels=labels, logits=logits)
                # loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels,logits=logits)
                return tf.reduce_mean(loss)
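            # categorical_loss is the full-softmax objective: the label ids are expanded into a
            # dense one-hot matrix over the whole vocabulary and the cross-entropy is averaged;
            # the NCE branch below approximates this with sampled noise words instead.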

            def nce_loss(labels, weights, bias, predict):
                noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size)
                loss = tf.nn.nce_loss(weights=weights,
                                      biases=bias,
                                      inputs=predict,
                                      labels=labels,
                                      num_sampled=nce_samples,
                                      num_classes=vocab_size,
                                      num_true=1,
                                      sampled_values=noise)
                return tf.reduce_mean(loss)
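            # nce_loss avoids the full softmax: uniform_sampler draws nce_samples noise words and
            # tf.nn.nce_loss trains the model to discriminate the true next word from that noise,
            # which is much cheaper than computing logits over the entire vocabulary.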

            if use_nce:
                bias = tx.VariableLayer(var_shape=[vocab_size], name="nce_bias")

                nce_weights = tx.WrapLayer(embeddings,
                                           n_units=embeddings.n_units,
                                           wrap_fn=lambda x: x.weights,
                                           layer_fn=True)
                train_loss = tx.LambdaLayer(labels, nce_weights, bias, last_layer, apply_fn=nce_loss, name="nce_loss")
            else:
                train_loss = tx.LambdaLayer(labels, train_logits, apply_fn=categorical_loss, name="train_loss")

            if l2_loss:
                l2_losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = tx.LambdaLayer(train_loss,
                                            apply_fn=lambda x: x + l2_weight * tf.add_n(l2_losses),
                                            name="train_loss_l2")

        # ===============================================
        # EVAL GRAPH
        # ===============================================
        with tf.name_scope("eval"):
            eval_loss = tx.LambdaLayer(labels, run_logits, apply_fn=categorical_loss, name="eval_loss")

        # BUILD MODEL
        super().__init__(run_outputs=run_output,
                         run_inputs=inputs,
                         train_inputs=[inputs, labels],
                         train_outputs=train_output,
                         train_loss=train_loss,
                         eval_inputs=[inputs, labels],
                         eval_outputs=run_output,
                         eval_score=eval_loss)
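
Both this example and the ones that follow build their NCE loss through a uniform_sampler helper that is not defined in the snippet; it presumably wraps tf.nn.uniform_candidate_sampler. Below is a minimal, self-contained sketch of that pattern in plain TensorFlow 1.x, assuming toy dimensions and hypothetical names (nce_w, nce_b, predicted) rather than the layers defined above.

import tensorflow as tf


def uniform_sampler(labels, num_true, num_sampled, unique, range_max):
    # assumed stand-in for the undefined uniform_sampler used above:
    # draws num_sampled noise classes uniformly from [0, range_max)
    return tf.nn.uniform_candidate_sampler(true_classes=tf.cast(labels, tf.int64),
                                           num_true=num_true,
                                           num_sampled=num_sampled,
                                           unique=unique,
                                           range_max=range_max)


# toy dimensions, not the ones used by the model above
vocab_size, embed_dim, batch_size, nce_samples = 1000, 128, 32, 10

nce_w = tf.get_variable("nce_w", shape=[vocab_size, embed_dim])
nce_b = tf.get_variable("nce_b", shape=[vocab_size], initializer=tf.zeros_initializer())
labels = tf.placeholder(tf.int64, shape=[batch_size, 1], name="labels")
predicted = tf.placeholder(tf.float32, shape=[batch_size, embed_dim], name="predicted")

noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size)
nce = tf.reduce_mean(tf.nn.nce_loss(weights=nce_w,
                                    biases=nce_b,
                                    labels=labels,
                                    inputs=predicted,
                                    num_sampled=nce_samples,
                                    num_classes=vocab_size,
                                    num_true=1,
                                    sampled_values=noise))

Note that the evaluation graph above still scores with the full softmax (categorical_loss over run_logits); NCE is only used to approximate the normalised objective during training.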
Example #7
0
    def __init__(
        self,
        inputs,
        label_inputs,
        vocab_size,
        embed_dim,
        h_dim,
        embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
        logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
        num_h=1,
        h_activation=tx.elu,
        h_init=tx.he_normal_init(),
        use_dropout=False,
        embed_dropout=False,
        drop_probability=0.05,
        l2_loss=False,
        l2_weight=1e-5,
        use_f_predict=False,
        f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
        embed_share=False,
        logit_bias=False,
        use_nce=False,
        nce_samples=10,
    ):
        if not isinstance(inputs, tx.Input):
            raise TypeError("inputs must be an Input layer")
        if not isinstance(label_inputs, tx.Input):
            raise TypeError("label_inputs must be an Input layer")
        self.inputs = inputs
        self.labels = label_inputs

        if inputs.dtype != tf.int32 and inputs.dtype != tf.int64:
            raise TypeError(
                "Invalid dtype for input: expected int32 or int64, got {}".
                format(inputs.dtype))

        if num_h < 0:
            raise ValueError("num_h must be >= 0")

        ctx_size = inputs.n_units
        # ===============================================
        # RUN GRAPH
        # ===============================================
        var_reg = []

        with tf.name_scope("run"):
            # feature lookup
            embeddings = tx.Lookup(inputs,
                                   ctx_size, [vocab_size, embed_dim],
                                   weight_init=embed_init)
            var_reg.append(embeddings.weights)
            feature_lookup = embeddings.as_concat()

            last_layer = feature_lookup
            h_layers = []
            for i in range(num_h):
                h_i = tx.FC(layer=last_layer,
                            n_units=h_dim,
                            activation=h_activation,
                            weight_init=h_init,
                            add_bias=True,
                            name="h_{}".format(i + 1))
                h_layers.append(h_i)
                last_layer = h_i
                var_reg.append(h_i.linear.weights)

            # feature prediction for Energy-Based Model
            if use_f_predict:
                last_layer = tx.Linear(last_layer,
                                       embed_dim,
                                       f_init,
                                       add_bias=True,
                                       name="f_predict")
                var_reg.append(last_layer.weights)
                f_predict = last_layer

            shared_weights = feature_lookup.weights if embed_share else None
            transpose_weights = embed_share
            logit_init = logit_init if not embed_share else None
            run_logits = tx.Linear(last_layer,
                                   n_units=vocab_size,
                                   weight_init=logit_init,
                                   shared_weights=shared_weights,
                                   transpose_weights=transpose_weights,
                                   add_bias=logit_bias,
                                   name="logits")

            if not embed_share:
                var_reg.append(run_logits.weights)

            run_output = tx.Activation(run_logits,
                                       tx.softmax,
                                       name="run_output")

        # ===============================================
        # TRAIN GRAPH
        # ===============================================
        with tf.name_scope("train"):
            if use_dropout and embed_dropout:
                last_layer = tx.Dropout(feature_lookup,
                                        probability=drop_probability,
                                        name="dropout_features")
            else:
                last_layer = feature_lookup

            # add dropout between each layer
            for i, layer in enumerate(h_layers):
                h = layer.reuse_with(last_layer)
                if use_dropout:
                    h = tx.Dropout(h,
                                   probability=drop_probability,
                                   name="dropout_{}".format(i + 1))
                last_layer = h

            # feature prediction for Energy-Based Model
            if use_f_predict:
                last_layer = f_predict.reuse_with(last_layer)

            train_logits = run_logits.reuse_with(last_layer,
                                                 name="train_logits")
            train_output = tx.Activation(train_logits,
                                         tx.softmax,
                                         name="train_output")

            def categorical_loss(labels, logits):
                labels = tx.dense_one_hot(column_indices=labels,
                                          num_cols=vocab_size)
                loss = tx.categorical_cross_entropy(labels=labels,
                                                    logits=logits)
                return tf.reduce_mean(loss)

            def nce_loss(labels, weights, bias, predict):
                noise = uniform_sampler(labels, 1, nce_samples, True,
                                        vocab_size)
                loss = tf.nn.nce_loss(weights=weights,
                                      biases=bias,
                                      inputs=predict,
                                      labels=labels,
                                      num_sampled=nce_samples,
                                      num_classes=vocab_size,
                                      num_true=1,
                                      sampled_values=noise)
                return tf.reduce_mean(loss)

            if use_nce:
                bias = tx.VariableLayer(var_shape=[vocab_size],
                                        name="nce_bias")

                nce_weights = tx.WrapLayer(embeddings,
                                           n_units=embeddings.n_units,
                                           wrap_fn=lambda x: x.weights,
                                           layer_fn=True)
                train_loss = tx.LambdaLayer(label_inputs,
                                            nce_weights,
                                            bias,
                                            last_layer,
                                            apply_fn=nce_loss,
                                            name="nce_loss")
            else:
                train_loss = tx.LambdaLayer(label_inputs,
                                            train_logits,
                                            apply_fn=categorical_loss,
                                            name="train_loss")

            if l2_loss:
                l2_losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = tx.WrapLayer(
                    train_loss,
                    wrap_fn=lambda x: x + l2_weight * tf.add_n(l2_losses),
                    name="train_loss_l2")

        # ===============================================
        # EVAL GRAPH
        # ===============================================
        with tf.name_scope("eval"):
            eval_loss = tx.LambdaLayer(label_inputs,
                                       run_logits,
                                       apply_fn=categorical_loss,
                                       name="eval_loss")

        # BUILD MODEL
        super().__init__(run_outputs=run_output,
                         run_inputs=inputs,
                         train_inputs=[inputs, label_inputs],
                         train_outputs=train_output,
                         train_loss=train_loss,
                         eval_inputs=[inputs, label_inputs],
                         eval_outputs=run_output,
                         eval_score=eval_loss)
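
When embed_share=True, the examples pass the embedding matrix as shared_weights with transpose_weights=True, so the output layer reuses the [vocab_size, embed_dim] lookup table instead of learning a separate logit matrix. A minimal sketch of what that weight tying amounts to in plain TensorFlow, with toy dimensions and hypothetical variable names:

import tensorflow as tf

vocab_size, embed_dim, batch_size = 1000, 128, 32

embed_w = tf.get_variable("embed_w", shape=[vocab_size, embed_dim])
last_hidden = tf.placeholder(tf.float32, shape=[batch_size, embed_dim])

# untied output layer: its own [embed_dim, vocab_size] matrix
untied_w = tf.get_variable("logit_w", shape=[embed_dim, vocab_size])
untied_logits = tf.matmul(last_hidden, untied_w)                 # [batch, vocab_size]

# tied output layer: reuse the embedding matrix, transposed
tied_logits = tf.matmul(last_hidden, embed_w, transpose_b=True)  # [batch, vocab_size]

Tying only works when the layer feeding the logits has exactly embed_dim units, which is why the f_predict projection maps the last hidden layer back to embed_dim before the logits are computed.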
Example #8
0
    def __init__(self,
                 ctx_size,
                 vocab_size,
                 k_dim,
                 ri_tensor: RandomIndexTensor,
                 embed_dim,
                 embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=True,
                 logit_bias=False,
                 use_gate=True,
                 use_hidden=False,
                 h_dim=100,
                 h_activation=tx.elu,
                 h_init=tx.he_normal_init(),
                 h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 use_dropout=True,
                 embed_dropout=False,
                 keep_prob=0.95,
                 l2_loss=False,
                 l2_loss_coef=1e-5):

        # GRAPH INPUTS
        run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input")
        loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target")
        eval_inputs = loss_inputs

        # RUN GRAPH =====================================================
        var_reg = []
        with tf.name_scope("run"):
            # RI ENCODING ===============================================
            # convert ids to random indexes (RIs): gather the random index for each id in the sequence

            # ri_layer = tx.TensorLayer(ri_tensor, n_units=k_dim)
            # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor)
            with tf.name_scope("ri_encode"):
                # used to compute logits
                if isinstance(ri_tensor, RandomIndexTensor):
                    ri_layer = tx.TensorLayer(ri_tensor.to_sparse_tensor(),
                                              k_dim)

                    ri_inputs = ri_tensor.gather(run_inputs.tensor)
                    ri_inputs = ri_inputs.to_sparse_tensor()
                    ri_inputs = tx.TensorLayer(ri_inputs, k_dim)
                else:
                    ri_layer = tx.TensorLayer(ri_tensor, k_dim)
                    ri_inputs = tx.gather_sparse(ri_layer.tensor,
                                                 run_inputs.tensor)
                    ri_inputs = tx.TensorLayer(ri_inputs, k_dim)

            # use those sparse indexes to look up a set of features based on the RI values
            feature_lookup = tx.Lookup(ri_inputs,
                                       ctx_size, [k_dim, embed_dim],
                                       embed_init,
                                       name="lookup")
            var_reg.append(feature_lookup.weights)
            feature_lookup = feature_lookup.as_concat()
            # ===========================================================

            if use_gate or use_hidden:
                hl = tx.Linear(feature_lookup,
                               h_dim,
                               h_init,
                               bias=True,
                               name="h_linear")
                ha = tx.Activation(hl, h_activation, name="h_activation")
                h = tx.Compose(hl, ha, name="hidden")
                var_reg.append(hl.weights)

            features = feature_lookup
            if use_gate:
                features = tx.Gate(features, ctx_size, gate_input=h)
                gate = features
                var_reg.append(features.gate_weights)

            x_to_f = tx.Linear(features,
                               embed_dim,
                               x_to_f_init,
                               bias=True,
                               name="x_to_f")
            var_reg.append(x_to_f.weights)
            f_prediction = x_to_f

            if use_hidden:
                h_to_f = tx.Linear(h,
                                   embed_dim,
                                   h_to_f_init,
                                   bias=True,
                                   name="h_to_f")
                var_reg.append(h_to_f.weights)
                f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted")

            # RI DECODING ===============================================
            shared_weights = feature_lookup.weights if embed_share else None
            logit_init = logit_init if not embed_share else None
            # embedding feature vectors for all words: shape [vocab_size, embed_dim]
            # (with NCE we would only need the features for the sampled words, not the full vocabulary)

            all_embeddings = tx.Linear(ri_layer,
                                       embed_dim,
                                       logit_init,
                                       shared_weights,
                                       name="logits",
                                       bias=False)

            # dot product between f_prediction and all_embeddings (plus an optional bias) yields one logit per target word

            run_logits = tx.Linear(f_prediction,
                                   n_units=vocab_size,
                                   shared_weights=all_embeddings.tensor,
                                   transpose_weights=True,
                                   bias=logit_bias)

            if not embed_share:
                var_reg.append(all_embeddings.weights)

            # ===========================================================
            run_embed_prob = tx.Activation(run_logits, tx.softmax)

        # TRAIN GRAPH ===================================================
        with tf.name_scope("train"):
            if use_dropout and embed_dropout:
                feature_lookup = feature_lookup.reuse_with(ri_inputs)
                features = tx.Dropout(feature_lookup, probability=keep_prob)
            else:
                features = feature_lookup

            if use_gate or use_hidden:
                if use_dropout:
                    h = h.reuse_with(features)
                    h = tx.Dropout(h, probability=keep_prob)

                if use_gate:
                    features = gate.reuse_with(features, gate_input=h)

                f_prediction = x_to_f.reuse_with(features)

                if use_hidden:
                    h_to_f = h_to_f.reuse_with(h)
                    if use_dropout:
                        h_to_f = tx.Dropout(h_to_f, probability=keep_prob)
                    f_prediction = tx.Add(f_prediction, h_to_f)
            else:
                f_prediction = f_prediction.reuse_with(features)

            # all_embeddings (from which these logits are computed) was already defined above, so reusing run_logits here is safe
            train_logits = run_logits.reuse_with(f_prediction)

            train_embed_prob = tx.Activation(train_logits,
                                             tx.softmax,
                                             name="train_output")

            one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor,
                                       num_cols=vocab_size)
            train_loss = tx.categorical_cross_entropy(one_hot,
                                                      train_logits.tensor)

            train_loss = tf.reduce_mean(train_loss)

            if l2_loss:
                losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

        # EVAL GRAPH ===============================================
        with tf.name_scope("eval"):
            one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor,
                                       num_cols=vocab_size)
            eval_loss = tx.categorical_cross_entropy(one_hot,
                                                     run_logits.tensor)
            eval_loss = tf.reduce_mean(eval_loss)

        # SETUP MODEL CONTAINER ====================================
        super().__init__(run_inputs=run_inputs,
                         run_outputs=run_embed_prob,
                         train_inputs=run_inputs,
                         train_outputs=train_embed_prob,
                         eval_inputs=run_inputs,
                         eval_outputs=run_embed_prob,
                         train_out_loss=train_loss,
                         train_in_loss=loss_inputs,
                         eval_out_score=eval_loss,
                         eval_in_score=eval_inputs)
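
This last example encodes word ids as k-dimensional random index (RI) vectors before the lookup, so the embedding table has shape [k_dim, embed_dim] instead of [vocab_size, embed_dim]. The sketch below illustrates the idea with NumPy only; it is a conceptual stand-in for the library's RandomIndexTensor / gather_sparse machinery, with made-up dimensions, a dense rather than sparse RI table, and a hypothetical random_index helper (assumed here to produce a ternary vector with s_active nonzeros, half +1 and half -1).

import numpy as np


def random_index(k_dim, s_active, rng):
    # hypothetical generator: s_active nonzero entries in a k_dim vector,
    # half set to +1 and half to -1
    ri = np.zeros(k_dim, dtype=np.float32)
    pos = rng.choice(k_dim, size=s_active, replace=False)
    ri[pos[:s_active // 2]] = 1.0
    ri[pos[s_active // 2:]] = -1.0
    return ri


rng = np.random.default_rng(42)
vocab_size, k_dim, s_active, embed_dim, ctx_size = 1000, 200, 4, 64, 5

# one k-dim random index per word id: shape [vocab_size, k_dim]
ri_table = np.stack([random_index(k_dim, s_active, rng) for _ in range(vocab_size)])

# encode a context of word ids and project it through a [k_dim, embed_dim] table,
# mirroring tx.Lookup(ri_inputs, ctx_size, [k_dim, embed_dim], ...) above
context_ids = rng.integers(0, vocab_size, size=ctx_size)
ri_inputs = ri_table[context_ids]                                  # [ctx_size, k_dim]
lookup_w = rng.normal(scale=0.01, size=(k_dim, embed_dim)).astype(np.float32)
features = ri_inputs @ lookup_w                                    # [ctx_size, embed_dim]
print(features.shape)                                              # (5, 64)

Because each RI vector has only s_active nonzero entries, a word's effective embedding is a signed sum of s_active rows of the [k_dim, embed_dim] table, which is what keeps the number of lookup parameters independent of the vocabulary size.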