Example #1
import os

import faiss
import numpy as np
import sklearn.metrics
import tensorflow as tf

# Dataset and build_mask are provided by the surrounding project.


def evaluate(model, config, checkpoint_manager, checkpoint, ckpt_path,
             model_name_or_path, tokenizer_class, tokenizer_cache_dir):
    if ckpt_path is None:
        ckpt_path = checkpoint_manager.latest_checkpoint
    tf.get_logger().info("Evaluating model %s", ckpt_path)
    checkpoint.restore(ckpt_path)
    validation_dataset = Dataset(config.get("validation_file_path", None),
                                 os.path.join(config.get("model_dir"), "data"),
                                 config.get("seq_size"),
                                 config.get("max_sents"),
                                 config.get("do_shuffle"),
                                 config.get("do_skip_empty"),
                                 procedure="dev",
                                 model_name_or_path=model_name_or_path,
                                 tokenizer_class=tokenizer_class,
                                 tokenizer_cache_dir=tokenizer_cache_dir)
    iterator = iter(
        validation_dataset.create_one_epoch(do_shuffle=False, mode="p"))

    @tf.function
    def encode_next():
        src, tgt = next(iterator)
        padding_mask = build_mask(src["input_ids"], src["lengths"])
        src_sentence_embedding = model.encode(src, padding_mask)
        padding_mask = build_mask(tgt["input_ids"], tgt["lengths"])
        tgt_sentence_embedding = model.encode(tgt, padding_mask)
        return src_sentence_embedding, tgt_sentence_embedding

    # Iterate over the dataset and collect source and target sentence embeddings.
    src_sentence_embedding_list = []
    tgt_sentence_embedding_list = []
    while True:
        try:
            src_sentence_embedding_, tgt_sentence_embedding_ = encode_next()
            src_sentence_embedding_list.append(src_sentence_embedding_.numpy())
            tgt_sentence_embedding_list.append(tgt_sentence_embedding_.numpy())
        except tf.errors.OutOfRangeError:
            break
    src_sentences = np.concatenate(src_sentence_embedding_list, axis=0)
    tgt_sentences = np.concatenate(tgt_sentence_embedding_list, axis=0)
    print("src_sentences", src_sentences.shape)
    print("tgt_sentences", tgt_sentences.shape)
    d = src_sentences.shape[-1]
    index = faiss.IndexFlatIP(d)  # build the index
    print("faiss state: ", index.is_trained)
    index.add(src_sentences)  # add vectors to the index
    print("number of sentences: %d" % index.ntotal)
    k = 1
    D, I = index.search(tgt_sentences, k)  # tgt -> src nearest-neighbour search
    # Each target sentence is expected to retrieve its own source sentence,
    # so the reference labels are simply the row indices.
    print(sklearn.metrics.accuracy_score(np.arange(index.ntotal), I[:, 0]))
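The evaluation above is a parallel-sentence retrieval check: the source embeddings are indexed with faiss and every target embedding queries its nearest neighbour by inner product, so a perfect model sends row i of the target side back to row i of the source side. The following self-contained sketch reproduces just that metric on synthetic random vectors (shapes and noise level are illustrative only) to make the faiss / scikit-learn interplay concrete.

import numpy as np
import faiss
import sklearn.metrics

rng = np.random.default_rng(0)
src = rng.standard_normal((100, 16)).astype("float32")                  # stand-in source embeddings
tgt = (src + 0.01 * rng.standard_normal((100, 16))).astype("float32")   # noisy "translations"

index = faiss.IndexFlatIP(src.shape[-1])   # exact inner-product index
index.add(src)
_, neighbours = index.search(tgt, 1)       # nearest source for each target, shape (100, 1)
print(sklearn.metrics.accuracy_score(np.arange(index.ntotal), neighbours[:, 0]))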
Example #2
import numpy as np
import tensorflow as tf

# Dataset and build_mask come from the surrounding project.


def encode(lang,
           checkpoint_path,
           dataset_path,
           config,
           config_class,
           model_class,
           tokenizer_class,
           output="output"):
    #####
    print("encoding %s in lang %d using ckpt %s" %
          (dataset_path, lang, checkpoint_path))
    #####
    model_name_or_path = config.get("model_name_or_path", "xlm-mlm-enfr-1024")
    config_cache_dir = config.get("pretrained_config_cache_dir")
    model_cache_dir = config.get("pretrained_model_cache_dir")
    tokenizer_cache_dir = config.get("pretrained_tokenizer_cache_dir")
    model_name_or_path_ = config.get("model_name_or_path_",
                                     "xlm-mlm-enfr-1024")
    #####
    dataset = Dataset(dataset_path,
                      config.get("training_data_save_path"),
                      config.get("seq_size"),
                      config.get("max_sents"),
                      config.get("do_shuffle"),
                      config.get("do_skip_empty"),
                      procedure="encode",
                      model_name_or_path=model_name_or_path,
                      tokenizer_class=tokenizer_class,
                      tokenizer_cache_dir=tokenizer_cache_dir)
    pretrained_config = config_class.from_pretrained(
        model_name_or_path,
        cache_dir=config_cache_dir if config_cache_dir else None)
    model = model_class.from_pretrained(
        model_name_or_path_,
        config=pretrained_config,
        cache_dir=model_cache_dir if model_cache_dir else None)
    checkpoint = tf.train.Checkpoint(model=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    config["model_dir"],
                                                    max_to_keep=5)
    if checkpoint_manager.latest_checkpoint is not None:
        if checkpoint_path is None:
            checkpoint_path = checkpoint_manager.latest_checkpoint
        tf.get_logger().info("Restoring parameters from %s", checkpoint_path)
        checkpoint.restore(checkpoint_path)
    iterator = iter(dataset.create_one_epoch(mode="e", lang=lang))

    @tf.function
    def encode_next():
        src = next(iterator)
        padding_mask = build_mask(src["input_ids"], src["lengths"])
        src_sentence_embedding = model.encode(src, padding_mask)
        return src_sentence_embedding

    # Flush the embeddings to disk in shards of roughly `maxcount` sentences.
    src_sentence_embedding_list = []
    maxcount = 1000000
    count = 0
    index = 0
    while True:
        try:
            src_sentence_embedding_ = encode_next()
            src_sentence_embedding__ = src_sentence_embedding_.numpy()
            src_sentence_embedding_list.append(src_sentence_embedding__)
            count += src_sentence_embedding__.shape[0]
            print(count)
            if count > maxcount:
                src_sentences = np.concatenate(src_sentence_embedding_list,
                                               axis=0)
                np.savez(output + str(index),
                         sentence_embeddings=src_sentences)
                count = 0
                src_sentence_embedding_list = []
                index += 1
        except tf.errors.OutOfRangeError:
            break
    if len(src_sentence_embedding_list) > 0:
        src_sentences = np.concatenate(src_sentence_embedding_list, axis=0)
        np.savez(output + str(index), sentence_embeddings=src_sentences)
    return True
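encode() writes its output in shards named output0.npz, output1.npz, ... (np.savez appends the .npz suffix to output + str(index)), each containing a single sentence_embeddings array. A small loading sketch, assuming the default "output" prefix; the variable names are illustrative.

import glob

import numpy as np

prefix = "output"
paths = sorted(glob.glob(prefix + "*.npz"),
               key=lambda p: int(p[len(prefix):-len(".npz")]))  # numeric shard order
parts = []
for path in paths:
    with np.load(path) as data:
        parts.append(data["sentence_embeddings"])
embeddings = np.concatenate(parts, axis=0)
print(embeddings.shape)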
Example #3
import time

import numpy as np
import tensorflow as tf

# Dataset, GradientAccumulator, build_mask and function_on_next come from the
# surrounding project; evaluate is the function shown in Example #1.


def train(strategy,
          optimizer,
          learning_rate,
          config,
          config_class,
          model_class,
          tokenizer_class,
          on_top=False):
    #####
    model_name_or_path = config.get("model_name_or_path", "xlm-mlm-enfr-1024")
    config_cache_dir = config.get("pretrained_config_cache_dir")
    model_cache_dir = config.get("pretrained_model_cache_dir")
    tokenizer_cache_dir = config.get("pretrained_tokenizer_cache_dir")
    model_name_or_path_ = config.get("model_name_or_path_",
                                     "xlm-mlm-enfr-1024")
    #####
    train_dataset = Dataset(config.get("filepath", None),
                            config.get("training_data_save_path"),
                            config.get("seq_size"),
                            config.get("max_sents"),
                            config.get("do_shuffle"),
                            config.get("do_skip_empty"),
                            model_name_or_path=model_name_or_path,
                            tokenizer_class=tokenizer_class,
                            tokenizer_cache_dir=tokenizer_cache_dir)
    pretrained_config = config_class.from_pretrained(
        model_name_or_path,
        cache_dir=config_cache_dir if config_cache_dir else None)
    with strategy.scope():
        model = model_class.from_pretrained(
            model_name_or_path_,
            config=pretrained_config,
            cache_dir=model_cache_dir if model_cache_dir else None)

        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                        config["model_dir"],
                                                        max_to_keep=5)
        if checkpoint_manager.latest_checkpoint is not None:
            tf.get_logger().info("Restoring parameters from %s",
                                 checkpoint_manager.latest_checkpoint)
            checkpoint_path = checkpoint_manager.latest_checkpoint
            checkpoint.restore(checkpoint_path)
    #####
    ##### Training functions
    with strategy.scope():
        gradient_accumulator = GradientAccumulator()

    def _accumulate_gradients(src, tgt, sign):
        src_padding_mask = build_mask(src["input_ids"], src["lengths"])
        tgt_padding_mask = build_mask(tgt["input_ids"], tgt["lengths"])
        align, aggregation_src, aggregation_tgt, loss, similarity_loss = model(
            (src, tgt),
            sign_src=sign,
            sign_tgt=sign,
            src_padding_mask=src_padding_mask,
            tgt_padding_mask=tgt_padding_mask,
            training=True)
        #tf.print("aggregation_src", aggregation_src, "aggregation_tgt", aggregation_tgt, "sign", sign, summarize=1000)
        loss = loss + similarity_loss * 0.1
        if on_top:
            variables = [
                var for var in model.trainable_variables
                if "bidirectional" in var.name
            ]
        else:
            variables = model.trainable_variables
        print("var numb: ", len(variables))
        for var in variables:
            print(var.name)
            print(var)
        gradients = optimizer.get_gradients(loss, variables)
        #gradients = [(tf.clip_by_norm(grad, 0.1)) for grad in gradients]
        gradient_accumulator(gradients)
        num_examples = tf.shape(src["input_ids"])[0]
        return loss, num_examples

    def _apply_gradients():
        #variables = model.trainable_variables
        if on_top:
            variables = [
                var for var in model.trainable_variables
                if "bidirectional" in var.name
            ]
        else:
            variables = model.trainable_variables
        grads_and_vars = []
        for gradient, variable in zip(gradient_accumulator.gradients,
                                      variables):
            # Two forward passes (one "u" batch and one "p" batch) are
            # accumulated before each apply, hence the 1/2 scaling.
            scaled_gradient = gradient / 2.0
            grads_and_vars.append((scaled_gradient, variable))
        optimizer.apply_gradients(grads_and_vars)
        gradient_accumulator.reset()

    u_epoch_dataset = train_dataset.create_one_epoch(mode="u")
    p_epoch_dataset = train_dataset.create_one_epoch(mode="p")

    @function_on_next(u_epoch_dataset)
    def _u_train_forward(next_fn):
        with strategy.scope():
            per_replica_source, per_replica_target = next_fn()
            per_replica_loss, per_replica_num_examples = strategy.experimental_run_v2(
                _accumulate_gradients,
                args=(per_replica_source, per_replica_target, 1.0))
            loss = strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                   per_replica_loss, None)
            num_examples = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                           per_replica_num_examples, None)
        return loss, num_examples

    @function_on_next(p_epoch_dataset)
    def _p_train_forward(next_fn):
        with strategy.scope():
            per_replica_source, per_replica_target = next_fn()
            per_replica_loss, per_replica_num_examples = strategy.experimental_run_v2(
                _accumulate_gradients,
                args=(per_replica_source, per_replica_target, -1.0))
            loss = strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                   per_replica_loss, None)
            num_examples = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                           per_replica_num_examples, None)
        return loss, num_examples

    @tf.function
    def _step():
        with strategy.scope():
            strategy.experimental_run_v2(_apply_gradients)

    #### Training

    _summary_writer = tf.summary.create_file_writer(config["model_dir"])
    report_every = config.get("report_every", 100)
    save_every = config.get("save_every", 1000)
    eval_every = config.get("eval_every", 1000)
    train_steps = config.get("train_steps", 100000)

    u_training_flow = iter(_u_train_forward())
    p_training_flow = iter(_p_train_forward())

    p_losses = []
    u_losses = []
    _number_examples = []
    start = time.time()
    with _summary_writer.as_default():
        while True:
            try:
                u_loss, u_examples_num = next(u_training_flow)
                p_loss, p_examples_num = next(p_training_flow)
                _step()
                p_losses.append(p_loss)
                u_losses.append(u_loss)

                _number_examples.extend([u_examples_num, p_examples_num])
                step = optimizer.iterations.numpy()
                if step % report_every == 0:
                    elapsed = time.time() - start
                    tf.get_logger().info(
                        "Step = %d ; Learning rate = %f ; u_loss = %f ; p_loss = %f ; number_examples = %d ; after %f seconds",
                        step, learning_rate(step), np.mean(u_losses),
                        np.mean(p_losses), np.sum(_number_examples), elapsed)
                    start = time.time()
                    u_losses = []
                    p_losses = []
                    _number_examples = []
                if step % save_every == 0:
                    tf.get_logger().info("Saving checkpoint for step %d", step)
                    checkpoint_manager.save(checkpoint_number=step)
                if step % eval_every == 0:
                    ckpt_path = None
                    evaluate(model, config, checkpoint_manager, checkpoint,
                             ckpt_path, model_name_or_path, tokenizer_class,
                             tokenizer_cache_dir)
                tf.summary.flush()
                if step > train_steps:
                    break
            except StopIteration:  # the epoch iterators are exhausted
                # Rebuild the datasets and the distributed forward passes for
                # the next epoch.
                print("next epoch")
                u_epoch_dataset = train_dataset.create_one_epoch(mode="u")
                p_epoch_dataset = train_dataset.create_one_epoch(mode="p")

                @function_on_next(u_epoch_dataset)
                def _u_train_forward(next_fn):
                    with strategy.scope():
                        per_replica_source, per_replica_target = next_fn()
                        per_replica_loss, per_replica_num_examples = strategy.experimental_run_v2(
                            _accumulate_gradients,
                            args=(per_replica_source, per_replica_target, 1.0))
                        loss = strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                               per_replica_loss, None)
                        num_examples = strategy.reduce(
                            tf.distribute.ReduceOp.SUM,
                            per_replica_num_examples, None)
                    return loss, num_examples

                @function_on_next(p_epoch_dataset)
                def _p_train_forward(next_fn):
                    with strategy.scope():
                        per_replica_source, per_replica_target = next_fn()
                        per_replica_loss, per_replica_num_examples = strategy.experimental_run_v2(
                            _accumulate_gradients,
                            args=(per_replica_source, per_replica_target,
                                  -1.0))
                        loss = strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                               per_replica_loss, None)
                        num_examples = strategy.reduce(
                            tf.distribute.ReduceOp.SUM,
                            per_replica_num_examples, None)
                    return loss, num_examples

                u_training_flow = iter(_u_train_forward())
                p_training_flow = iter(_p_train_forward())
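train() above expects a tf.distribute strategy, a Keras optimizer created inside that strategy's scope, a callable learning-rate schedule (it is called as learning_rate(step) for logging), and a config dict whose keys match the config.get(...) lookups in the function. Below is a minimal driver sketch: the paths and hyperparameter values are placeholders, and the config/model/tokenizer classes belong to the surrounding project, so the final call is shown as a comment only.

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=2e-5, decay_steps=10000, decay_rate=0.9)
with strategy.scope():
    optimizer = tf.keras.optimizers.Adam(learning_rate)

config = {
    "filepath": "train.txt",              # placeholder paths and values
    "validation_file_path": "valid.txt",
    "training_data_save_path": "data",
    "model_dir": "runs/xlm_align",
    "seq_size": 128,
    "max_sents": 64,
    "do_shuffle": True,
    "do_skip_empty": True,
    "report_every": 100,
    "save_every": 1000,
    "eval_every": 1000,
    "train_steps": 100000,
}

# config_class, model_class and tokenizer_class are the project's own classes
# (a transformers-style config and tokenizer plus a custom model exposing
# .encode() and the (src, tgt) call signature used above):
# train(strategy, optimizer, learning_rate, config,
#       config_class=..., model_class=..., tokenizer_class=..., on_top=False)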
Example #4
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

# Dataset and build_mask come from the surrounding project.


def align(lang,
          checkpoint_path,
          dataset_path,
          config,
          config_class,
          model_class,
          tokenizer_class,
          output="output"):
    #####
    print("encoding %s in lang %d using ckpt %s" %
          (dataset_path, lang, checkpoint_path))
    #####
    model_name_or_path = config.get("model_name_or_path", "xlm-mlm-enfr-1024")
    config_cache_dir = config.get("pretrained_config_cache_dir")
    model_cache_dir = config.get("pretrained_model_cache_dir")
    tokenizer_cache_dir = config.get("pretrained_tokenizer_cache_dir")
    model_name_or_path_ = config.get("model_name_or_path_",
                                     "xlm-mlm-enfr-1024")
    #####
    dataset = Dataset(dataset_path,
                      config.get("training_data_save_path"),
                      config.get("seq_size"),
                      config.get("max_sents"),
                      config.get("do_shuffle"),
                      config.get("do_skip_empty"),
                      procedure="align",
                      model_name_or_path=model_name_or_path,
                      tokenizer_class=tokenizer_class,
                      tokenizer_cache_dir=tokenizer_cache_dir)
    pretrained_config = config_class.from_pretrained(
        model_name_or_path,
        cache_dir=config_cache_dir if config_cache_dir else None)
    model = model_class.from_pretrained(
        model_name_or_path_,
        config=pretrained_config,
        cache_dir=model_cache_dir if model_cache_dir else None)
    checkpoint = tf.train.Checkpoint(model=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    config["model_dir"],
                                                    max_to_keep=5)
    if checkpoint_manager.latest_checkpoint is not None:
        if checkpoint_path is None:
            checkpoint_path = checkpoint_manager.latest_checkpoint
        tf.get_logger().info("Restoring parameters from %s", checkpoint_path)
        checkpoint.restore(checkpoint_path)
    iterator = iter(dataset.create_one_epoch(mode="p", lang=lang))

    @tf.function
    def encode_next():
        src, tgt = next(iterator)
        src_padding_mask = build_mask(src["input_ids"], src["lengths"])
        tgt_padding_mask = build_mask(tgt["input_ids"], tgt["lengths"])
        sign = -1.0
        align, _, _, _, _ = model((src, tgt),
                                  sign_src=sign,
                                  sign_tgt=sign,
                                  src_padding_mask=src_padding_mask,
                                  tgt_padding_mask=tgt_padding_mask,
                                  training=False)
        tf.print(align, summarize=1000)
        return align

    # Only the alignment matrix of the last batch is kept for plotting.
    align_ = None
    while True:
        try:
            align = encode_next()
            align_ = tf.squeeze(align).numpy()
        except tf.errors.OutOfRangeError:
            break

    fig, ax = plt.subplots(figsize=(6, 6))
    ax = sns.heatmap(align_, linewidths=.5, ax=ax, cbar=False)
    fig.savefig('heatmap_align.pgf')

    return True
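The plotting step at the end of align() reduces to rendering a 2-D alignment matrix as a seaborn heatmap. The sketch below reproduces it on random data, saving to PNG with a headless backend rather than the PGF file used above, so it runs stand-alone; the matrix shape is illustrative.

import numpy as np
import matplotlib
matplotlib.use("Agg")                      # headless backend, no display needed
import matplotlib.pyplot as plt
import seaborn as sns

align_matrix = np.random.rand(10, 12)      # stand-in for a soft alignment matrix
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(align_matrix, linewidths=.5, ax=ax, cbar=False)
fig.savefig("heatmap_align.png")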