def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
    spm.Load(str(path))
    return spm
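A minimal usage sketch for the helper above; the model path and the constructor kwargs below are illustrative assumptions, not part of the original example:

# Hypothetical model path and kwargs, shown only to illustrate the call.
sp = load_spm("spiece.model", {"enable_sampling": False})
print(sp.EncodeAsPieces("This is a test"))
print(sp.EncodeAsIds("This is a test"))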
Example #2
def _create_data(idx, input_paths):
    # Load sentence-piece model
    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.sp_path)

    input_shards = []
    total_line_cnt = 0
    for input_path in input_paths:
        input_data, sent_ids = [], []
        sent_id, line_cnt = True, 0
        tf.logging.info("Processing %s", input_path)
        for line in tf.gfile.Open(input_path):
            if line_cnt % 100000 == 0:
                tf.logging.info("Loading line %d", line_cnt)
            line_cnt += 1

            if not line.strip():
                if FLAGS.use_eod:
                    sent_id = not sent_id
                    cur_sent = [EOD_ID]
                else:
                    continue
            else:
                if FLAGS.from_raw_text:
                    cur_sent = preprocess_text(line.strip(),
                                               lower=FLAGS.uncased)
                    cur_sent = encode_ids(sp, cur_sent)
                else:
                    cur_sent = list(map(int, line.strip().split()))

            input_data.extend(cur_sent)
            sent_ids.extend([sent_id] * len(cur_sent))
            sent_id = not sent_id

        tf.logging.info("Finish with line %d", line_cnt)
        if line_cnt == 0:
            continue

        input_data = np.array(input_data, dtype=np.int64)
        sent_ids = np.array(sent_ids, dtype=bool)  # np.bool alias is removed in recent NumPy

        total_line_cnt += line_cnt
        input_shards.append((input_data, sent_ids))

    tf.logging.info("[Task %d] Total number line: %d", idx, total_line_cnt)

    tfrecord_dir = os.path.join(FLAGS.save_dir, "tfrecords")

    filenames, num_batch = [], 0

    # Randomly shuffle input shards (with a fixed but distinct random seed)
    np.random.seed(100 * FLAGS.task + FLAGS.pass_id)

    perm_indices = np.random.permutation(len(input_shards))
    tf.logging.info("Using perm indices %s for pass %d", perm_indices.tolist(),
                    FLAGS.pass_id)

    input_data_list, sent_ids_list = [], []
    prev_sent_id = None
    for perm_idx in perm_indices:
        input_data, sent_ids = input_shards[perm_idx]
        # make sure that sent_ids[0] == (not prev_sent_id)
        if prev_sent_id is not None and sent_ids[0] == prev_sent_id:
            sent_ids = np.logical_not(sent_ids)

        # append to temporary list
        input_data_list.append(input_data)
        sent_ids_list.append(sent_ids)

        # update `prev_sent_id`
        prev_sent_id = sent_ids[-1]

    input_data = np.concatenate(input_data_list)
    sent_ids = np.concatenate(sent_ids_list)

    file_name, cur_num_batch = create_tfrecords(
        save_dir=tfrecord_dir,
        basename="{}-{}-{}".format(FLAGS.split, idx, FLAGS.pass_id),
        data=[input_data, sent_ids],
        bsz_per_host=FLAGS.bsz_per_host,
        seq_len=FLAGS.seq_len,
        bi_data=FLAGS.bi_data,
        sp=sp,
    )

    filenames.append(file_name)
    num_batch += cur_num_batch

    record_info = {"filenames": filenames, "num_batch": num_batch}

    return record_info
Example #3
def main(unused_argv):
    del unused_argv
    if FLAGS.strategy_type == "mirror":
        strategy = tf.distribute.MirroredStrategy()
    elif FLAGS.strategy_type == "tpu":
        cluster_resolver = tpu_lib.tpu_initialize(FLAGS.tpu)
        strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
    else:
        raise ValueError(
            "The distribution strategy type is not supported: %s" %
            FLAGS.strategy_type)
    if strategy:
        logging.info("***** Number of cores used : %d",
                     strategy.num_replicas_in_sync)
    train_input_fn = functools.partial(data_utils.get_squad_input_data,
                                       FLAGS.train_batch_size, FLAGS.seq_len,
                                       FLAGS.query_len, strategy, True,
                                       FLAGS.train_tfrecord_path)

    test_input_fn = functools.partial(data_utils.get_squad_input_data,
                                      FLAGS.test_batch_size, FLAGS.seq_len,
                                      FLAGS.query_len, strategy, False,
                                      FLAGS.test_tfrecord_path)

    total_training_steps = FLAGS.train_steps
    steps_per_loop = FLAGS.iterations
    eval_steps = int(FLAGS.test_data_size / FLAGS.test_batch_size)

    optimizer, learning_rate_fn = optimization.create_optimizer(
        FLAGS.learning_rate,
        total_training_steps,
        FLAGS.warmup_steps,
        adam_epsilon=FLAGS.adam_epsilon)
    model_config = xlnet_config.XLNetConfig(FLAGS)
    run_config = xlnet_config.create_run_config(True, False, FLAGS)
    input_meta_data = {}
    input_meta_data["start_n_top"] = FLAGS.start_n_top
    input_meta_data["end_n_top"] = FLAGS.end_n_top
    input_meta_data["lr_layer_decay_rate"] = FLAGS.lr_layer_decay_rate
    input_meta_data["predict_dir"] = FLAGS.predict_dir
    input_meta_data["n_best_size"] = FLAGS.n_best_size
    input_meta_data["max_answer_length"] = FLAGS.max_answer_length
    input_meta_data["test_batch_size"] = FLAGS.test_batch_size
    input_meta_data["batch_size_per_core"] = int(FLAGS.train_batch_size /
                                                 strategy.num_replicas_in_sync)
    input_meta_data["mem_len"] = FLAGS.mem_len
    model_fn = functools.partial(get_qaxlnet_model, model_config, run_config,
                                 FLAGS.start_n_top, FLAGS.end_n_top)
    eval_examples = squad_utils.read_squad_examples(FLAGS.predict_file,
                                                    is_training=False)
    if FLAGS.test_feature_path:
        logging.info("start reading pickle file...")
        with tf.io.gfile.GFile(FLAGS.test_feature_path, "rb") as f:
            eval_features = pickle.load(f)
        logging.info("finishing reading pickle file...")
    else:
        sp_model = spm.SentencePieceProcessor()
        sp_model.LoadFromSerializedProto(
            tf.io.gfile.GFile(FLAGS.spiece_model_file, "rb").read())
        spm_basename = os.path.basename(FLAGS.spiece_model_file)
        eval_features = squad_utils.create_eval_data(
            spm_basename, sp_model, eval_examples, FLAGS.max_seq_length,
            FLAGS.max_query_length, FLAGS.doc_stride, FLAGS.uncased)

    with tf.io.gfile.GFile(FLAGS.predict_file) as f:
        original_data = json.load(f)["data"]
    eval_fn = functools.partial(run_evaluation, strategy, test_input_fn,
                                eval_examples, eval_features, original_data,
                                eval_steps, input_meta_data)

    training_utils.train(strategy=strategy,
                         model_fn=model_fn,
                         input_meta_data=input_meta_data,
                         eval_fn=eval_fn,
                         metric_fn=None,
                         train_input_fn=train_input_fn,
                         init_checkpoint=FLAGS.init_checkpoint,
                         init_from_transformerxl=FLAGS.init_from_transformerxl,
                         total_training_steps=total_training_steps,
                         steps_per_loop=steps_per_loop,
                         optimizer=optimizer,
                         learning_rate_fn=learning_rate_fn,
                         model_dir=FLAGS.model_dir,
                         save_steps=FLAGS.save_steps)
Example #4
def train_and_evaluate(config, workdir, vocab_filepath):
    """Runs a training and evaluation loop.

  Args:
    config: Model and training configuration.
    workdir: Working directory for checkpoints and Tensorboard summaries. If
      this contains a checkpoint, training will be resumed from the latest
      checkpoint.
    vocab_filepath: Absolute path to SentencePiece vocab model.

  Raises:
    ValueError: If training or eval batch sizes won't fit number of processes
      and devices, or config is underspecified.
  """
    n_processes = jax.process_count()  # Number of processes
    n_devices = jax.local_device_count()  # Number of local devices per process

    if config.train_batch_size % (n_processes * n_devices) > 0:
        raise ValueError(
            "Training batch size must be divisible by the total number of devices, "
            "but training batch size = %d, while total number of devices = %d "
            "(%d processes, each with %d devices)" %
            (config.train_batch_size, n_processes * n_devices, n_processes,
             n_devices))

    if config.eval_batch_size % (n_processes * n_devices) > 0:
        raise ValueError(
            "Eval batch size must be divisible by the total number of devices, "
            "but eval batch size = %d, while total number of devices = %d "
            "(%d processes, each with %d devices)" %
            (config.eval_batch_size, n_processes * n_devices, n_processes,
             n_devices))

    per_process_train_batch_size = config.train_batch_size // n_processes
    per_process_eval_batch_size = config.eval_batch_size // n_processes

    if jax.process_index() == 0:
        train_summary_writer = tensorboard.SummaryWriter(
            os.path.join(workdir, "train"))
        eval_summary_writer = tensorboard.SummaryWriter(
            os.path.join(workdir, "eval"))
    else:
        train_summary_writer = None
        eval_summary_writer = None

    rng = random.PRNGKey(config.seed)
    rng, init_rng = random.split(rng)

    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(vocab_filepath)
    tokenizer.SetEncodeExtraOptions("")
    # Note: [CLS] and [SEP] will be added by the data pipeline, not the tokenizer.

    with config.unlocked():
        config.vocab_size = tokenizer.GetPieceSize()
    frozen_config = ml_collections.FrozenConfigDict(config)
    model = models.PreTrainingModel(config=frozen_config,
                                    random_seed=config.seed)

    params = _init_params(model, init_rng, frozen_config)

    optimizer = _create_adam_optimizer(config.learning_rate, params)
    # We access model state only from optimizer via optimizer.target.
    del params

    # In case current job restarts, ensure that we continue from where we left
    # off.
    optimizer = checkpoints.restore_checkpoint(workdir, optimizer)
    start_step = int(optimizer.state.step)

    # Otherwise, try to restore optimizer and model state from config checkpoint.
    if start_step == 0 and "init_checkpoint_dir" in config and config.init_checkpoint_dir:
        optimizer = checkpoints.restore_checkpoint(config.init_checkpoint_dir,
                                                   optimizer)

    optimizer = jax_utils.replicate(optimizer)

    learning_rate_fn = train_utils.create_learning_rate_scheduler(
        factors="constant * linear_warmup * linear_decay",
        base_learning_rate=config.learning_rate,
        warmup_steps=config.num_warmup_steps,
        decay_steps=config.num_train_steps - config.num_warmup_steps,
    )

    c4_masked_lm_inputs = functools.partial(
        input_pipeline.c4_masked_lm_inputs,
        tokenizer=tokenizer,
        max_seq_length=config.max_seq_length,
        max_predictions_per_seq=config.max_predictions_per_seq,
        masking_rate=config.masking_rate,
        mask_token_proportion=config.mask_token_proportion,
        random_token_proportion=config.random_token_proportion)
    train_ds = c4_masked_lm_inputs(batch_size=per_process_train_batch_size)
    train_iter = iter(train_ds)
    eval_ds = c4_masked_lm_inputs(batch_size=per_process_eval_batch_size)

    # We init the first set of dropout PRNG keys, but update it afterwards inside
    # the main pmap'd training update for performance.
    rngs = random.split(rng, n_devices)

    loss_and_metrics_fn = functools.partial(_compute_loss_and_metrics,
                                            model=model,
                                            pad_id=tokenizer.pad_id())
    p_train_step = jax.pmap(functools.partial(
        train_utils.train_step,
        loss_and_metrics_fn=loss_and_metrics_fn,
        learning_rate_fn=learning_rate_fn,
        clipped_grad_norm=config.clipped_grad_norm),
                            axis_name="batch")

    metric_fn = functools.partial(_compute_eval_stats,
                                  model=model,
                                  pad_id=tokenizer.pad_id())
    p_eval_step = jax.pmap(functools.partial(train_utils.eval_step,
                                             metric_fn=metric_fn),
                           axis_name="batch")

    train_metrics = []
    logging.info("Starting training loop.")
    logging.info("====================")

    for step in range(start_step, config.num_train_steps):
        with jax.profiler.StepTraceAnnotation("train", step_num=step):
            train_batch = next(train_iter)
            train_batch = common_utils.shard(train_batch)

            optimizer, train_step_metrics, rngs = p_train_step(optimizer,
                                                               train_batch,
                                                               rng=rngs)
            train_metrics.append(train_step_metrics)

        if (step > 0 and config.save_checkpoints_steps
                and step % config.save_checkpoints_steps == 0
                and jax.process_index() == 0):
            # Save un-replicated optimizer + model state.
            checkpoints.save_checkpoint(workdir,
                                        jax_utils.unreplicate(optimizer),
                                        step,
                                        keep=2)

        # Periodic metric handling.
        if step % config.eval_frequency != 0 and step > 0:
            continue

        logging.info("Gathering training metrics at step: %d", step)
        train_metrics = common_utils.get_metrics(train_metrics)
        train_summary = _compute_loss_and_accuracy_metrics(train_metrics)
        # Add training specific metrics.
        train_summary["unclipped_grad_l2_norm"] = jnp.sqrt(
            jnp.sum(train_metrics["unclipped_grad_l2_sum"]))
        train_summary["clipped_grad_l2_norm"] = jnp.sqrt(
            jnp.sum(train_metrics["clipped_grad_l2_sum"]))
        train_summary["learning_rate"] = learning_rate_fn(step)

        if jax.process_index() == 0:
            assert train_summary_writer
            for key, val in train_summary.items():
                train_summary_writer.scalar(key, val, step)
            train_summary_writer.flush()
        # Reset metric accumulation for next training evaluation cycle.
        train_metrics = []

        logging.info("Gathering evaluation metrics at step: %d", step)

        all_stats = []
        for _, eval_batch in zip(range(config.max_num_eval_steps), eval_ds):
            eval_batch = common_utils.shard(eval_batch)
            all_stats.append(p_eval_step(optimizer.target, eval_batch))
        flat_stats = {}
        for k in all_stats[0]:
            flat_stats[k] = np.concatenate([stats[k] for stats in all_stats],
                                           axis=0)
        eval_summary = _compute_loss_and_accuracy_metrics(flat_stats)

        if jax.process_index() == 0:
            assert eval_summary_writer
            for key, val in eval_summary.items():
                eval_summary_writer.scalar(key, val, step)
            eval_summary_writer.flush()
 def __setstate__(self, d: Dict) -> None:
     self.__dict__ = d
     self.sp_model = spm.SentencePieceProcessor()
     self.sp_model.Load(self.vocab_file)
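SentencePieceProcessor objects are not picklable, which is why __setstate__ above rebuilds sp_model from vocab_file. A matching __getstate__, sketched here under the usual tokenizer-serialization pattern (an assumption, not part of the original snippet), drops the handle before pickling:

 def __getstate__(self) -> Dict:
     # Copy the instance dict and drop the unpicklable SentencePiece handle;
     # __setstate__ above reloads it from self.vocab_file.
     state = self.__dict__.copy()
     state["sp_model"] = None
     return state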
Example #6
train_data = pd.read_csv(train_txt, header=0, delimiter=',')
print(f'Total raw training examples: {len(train_data)}')
train_data = train_data.dropna()
print(f'Training examples after dropping NA: {len(train_data)}')
train_data = train_data.sample(1000)  # use only 1,000 examples for a quick check
print(f'Sampled training examples: {len(train_data)}')
label_counts = train_data['label'].value_counts()
print(f'Training label counts: {label_counts}')

#
# vocabulary
#

# vocab load
vocab_file = os.path.join(data_dir, 'ko_32000.model')
vocab = spm.SentencePieceProcessor()
vocab.load(vocab_file)

#
# tokenize
#
questions, answers = [], []
for i, row in train_data.iterrows():
    question = vocab.encode_as_pieces(row['Q'])
    questions.append(question)
    answer = vocab.encode_as_pieces(row['A'])
    answers.append(answer)

assert len(questions) == len(answers)

print(questions[:100])
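The loop above keeps subword piece strings. A hedged follow-up sketch that produces the integer ids a model would actually consume, using SentencePiece's encode_as_ids on the same rows:

question_ids = [vocab.encode_as_ids(row['Q']) for _, row in train_data.iterrows()]
answer_ids = [vocab.encode_as_ids(row['A']) for _, row in train_data.iterrows()]
print(question_ids[:3])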
Example #7
    def __init__(
            self,
            vocab_file,
            pad_token="<pad>",
            eos_token="</s>",
            unk_token="<unk>",
            mask_token="<mask_2>",
            mask_token_sent="<mask_1>",
            additional_special_tokens=None,
            offset=103,  # entries 2 - 104 are only used for pretraining
            sp_model_kwargs: Optional[Dict[str, Any]] = None,
            **kwargs) -> None:
        self.offset = offset
        if additional_special_tokens is not None:
            if not isinstance(additional_special_tokens, list):
                raise TypeError(
                    "additional_special_tokens should be of type list, but is"
                    f" {type(additional_special_tokens)}")

            additional_special_tokens_extended = (
                ([mask_token_sent] + additional_special_tokens)
                if mask_token_sent not in additional_special_tokens
                and mask_token_sent is not None else additional_special_tokens)
            # fill additional tokens with ..., <unk_token_102> in case not all additional tokens are already taken
            additional_special_tokens_extended += [
                f"<unk_{i}>" for i in range(
                    len(additional_special_tokens_extended), self.offset - 1)
            ]

            if len(set(additional_special_tokens_extended)) != len(
                    additional_special_tokens_extended):
                raise ValueError(
                    "Please make sure that the provided additional_special_tokens do not contain an incorrectly"
                    f" shifted list of <unk_x> tokens. Found {additional_special_tokens_extended}."
                )
            additional_special_tokens = additional_special_tokens_extended
        else:
            additional_special_tokens = [
                mask_token_sent
            ] if mask_token_sent is not None else []
            additional_special_tokens += [
                f"<unk_{i}>" for i in range(2, self.offset)
            ]

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            mask_token=mask_token,
            pad_token=pad_token,
            mask_token_sent=mask_token_sent,
            offset=offset,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )
        self.mask_token_sent = mask_token_sent
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        # add special tokens to encoder dict
        self.encoder: Dict[int, str] = {
            0: self.pad_token,
            1: self.eos_token,
        }

        if self.mask_token_sent is not None:
            self.encoder.update({
                2: self.mask_token_sent,
                3: self.mask_token,
            })

        if self.offset > 0:
            # entries 2-104 are only used for pretraining and called <mask_1>, <mask_2>, unk_2, ...unk_102
            # mask_token_sent is already added to list -> so start at 1
            self.encoder.update({
                i + 3: additional_special_tokens[i]
                for i in range(1, self.offset - 1)
            })

        self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}
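A sketch of how the offset built above is typically applied when mapping tokens to ids: reserved/special tokens come from the hand-built decoder dict, and every other piece id is shifted past the reserved range. The method name and placement follow the usual transformers tokenizer pattern and are assumptions here, not part of the original snippet:

    def _convert_token_to_id(self, token: str) -> int:
        # Reserved/special tokens live in the hand-built decoder dict.
        if token in self.decoder:
            return self.decoder[token]
        # Everything else: SentencePiece piece id shifted past the reserved entries.
        return self.sp_model.piece_to_id(token) + self.offset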
Example #8
def main(_):
    logging.set_verbosity(logging.INFO)
    processors = {
        "mnli_matched": MnliMatchedProcessor,
        "mnli_mismatched": MnliMismatchedProcessor,
        "sts-b": StsbProcessor,
        "imdb": ImdbProcessor,
        "yelp5": Yelp5Processor
    }

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels() if not FLAGS.is_regression else None

    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)

    def tokenize_fn(text):
        text = preprocess_utils.preprocess_text(text, lower=FLAGS.uncased)
        return preprocess_utils.encode_ids(sp, text)

    spm_basename = os.path.basename(FLAGS.spiece_model_file)

    train_file_base = "{}.len-{}.train.tf_record".format(
        spm_basename, FLAGS.max_seq_length)
    train_file = os.path.join(FLAGS.output_dir, train_file_base)
    logging.info("Use tfrecord file %s", train_file)

    train_examples = processor.get_train_examples(FLAGS.data_dir)
    np.random.shuffle(train_examples)
    logging.info("Num of train samples: %d", len(train_examples))

    file_based_convert_examples_to_features(train_examples, label_list,
                                            FLAGS.max_seq_length, tokenize_fn,
                                            train_file, FLAGS.num_passes)
    if FLAGS.eval_split == "dev":
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    else:
        eval_examples = processor.get_test_examples(FLAGS.data_dir)

    logging.info("Num of eval samples: %d", len(eval_examples))

    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on. These do NOT count towards the metric (all tf.metrics
    # support a per-instance weight, and these get a weight of 0.0).
    #
    # Modified in XL: We also adopt the same mechanism for GPUs.
    while len(eval_examples) % FLAGS.eval_batch_size != 0:
        eval_examples.append(classifier_utils.PaddingInputExample())

    eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
        spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
    eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

    file_based_convert_examples_to_features(eval_examples, label_list,
                                            FLAGS.max_seq_length, tokenize_fn,
                                            eval_file)
Example #9
def load_spm(path: str) -> sentencepiece.SentencePieceProcessor:
    spm = sentencepiece.SentencePieceProcessor()
    spm.Load(path)
    return spm
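When the model file lives on a remote filesystem (e.g. GCS), loading the serialized proto through tf.io.gfile, as Example #3 does, avoids the local-path requirement; a sketch under that assumption:

import sentencepiece
import tensorflow as tf

def load_spm_from_gfile(path: str) -> sentencepiece.SentencePieceProcessor:
    sp = sentencepiece.SentencePieceProcessor()
    with tf.io.gfile.GFile(path, "rb") as f:
        sp.LoadFromSerializedProto(f.read())
    return sp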
Example #10
def prepro(hp):
    """Load raw data -> Preprocessing -> Segmenting with sentencepice
    hp: hyperparams. argparse.
    """
    logging.info("# Check if raw files exist")
    # train1 = "iwslt2016/de-en/train.tags.de-en.de"
    # train2 = "iwslt2016/de-en/train.tags.de-en.en"
    # eval1 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.de.xml"
    # eval2 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.en.xml"
    # test1 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.de.xml"
    # test2 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.en.xml"
    train1 = "iwslt2016/ch-ch/train.ch"
    train2 = "iwslt2016/ch-ch/train.ch"
    eval1 = "iwslt2016/ch-ch/tst2018.ch"
    eval2 = "iwslt2016/ch-ch/tst2018.ch"
    test1 = "iwslt2016/ch-ch/tst2019.ch"
    test2 = "iwslt2016/ch-ch/tst2019.ch"
    for f in (train1, train2, eval1, eval2, test1, test2):
        if not os.path.isfile(f):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), f)

    logging.info("# Preprocessing")
    # train
    _prepro = lambda x:  [line.strip() for line in open(x, 'r').read().split("\n") \
                      if not line.startswith("<")]
    prepro_train1, prepro_train2 = _prepro(train1), _prepro(train2)
    assert len(prepro_train1) == len(
        prepro_train2), "Check if train source and target files match."

    # eval
    #_prepro = lambda x: [re.sub("<[^>]+>", "", line).strip() \
    #                 for line in open(x, 'r').read().split("\n") \
    #                 if line.startswith("<seg id")]
    prepro_eval1, prepro_eval2 = _prepro(eval1), _prepro(eval2)
    assert len(prepro_eval1) == len(
        prepro_eval2), "Check if eval source and target files match."

    # test
    prepro_test1, prepro_test2 = _prepro(test1), _prepro(test2)
    assert len(prepro_test1) == len(
        prepro_test2), "Check if test source and target files match."

    logging.info("Let's see how preprocessed data look like")
    logging.info("prepro_train1:", prepro_train1[0])
    logging.info("prepro_train2:", prepro_train2[0])
    logging.info("prepro_eval1:", prepro_eval1[0])
    logging.info("prepro_eval2:", prepro_eval2[0])
    logging.info("prepro_test1:", prepro_test1[0])
    logging.info("prepro_test2:", prepro_test2[0])

    logging.info("# write preprocessed files to disk")
    os.makedirs("iwslt2016/prepro", exist_ok=True)

    def _write(sents, fname):
        with open(fname, 'w') as fout:
            fout.write("\n".join(sents))

    _write(prepro_train1, "iwslt2016/prepro/train.de")
    _write(prepro_train2, "iwslt2016/prepro/train.en")
    _write(prepro_train1 + prepro_train2, "iwslt2016/prepro/train")
    _write(prepro_eval1, "iwslt2016/prepro/eval.de")
    _write(prepro_eval2, "iwslt2016/prepro/eval.en")
    _write(prepro_test1, "iwslt2016/prepro/test.de")
    _write(prepro_test2, "iwslt2016/prepro/test.en")

    logging.info("# Train a joint BPE model with sentencepiece")
    os.makedirs("iwslt2016/segmented", exist_ok=True)
    train = '--input=iwslt2016/prepro/train --pad_id=0 --unk_id=1 \
             --bos_id=2 --eos_id=3\
             --model_prefix=iwslt2016/segmented/bpe --vocab_size={} \
             --model_type=bpe'.format(hp.vocab_size)
    spm.SentencePieceTrainer.Train(train)

    logging.info("# Load trained bpe model")
    sp = spm.SentencePieceProcessor()
    sp.Load("iwslt2016/segmented/bpe.model")

    logging.info("# Segment")

    def _segment_and_write(sents, fname):
        with open(fname, "w") as fout:
            for sent in sents:
                pieces = sp.EncodeAsPieces(sent)
                fout.write(" ".join(pieces) + "\n")

    _segment_and_write(prepro_train1, "iwslt2016/segmented/train.de.bpe")
    _segment_and_write(prepro_train2, "iwslt2016/segmented/train.en.bpe")
    _segment_and_write(prepro_eval1, "iwslt2016/segmented/eval.de.bpe")
    _segment_and_write(prepro_eval2, "iwslt2016/segmented/eval.en.bpe")
    _segment_and_write(prepro_test1, "iwslt2016/segmented/test.de.bpe")

    logging.info("Let's see how segmented data look like")
    print("train1:", open("iwslt2016/segmented/train.de.bpe", 'r').readline())
    print("train2:", open("iwslt2016/segmented/train.en.bpe", 'r').readline())
    print("eval1:", open("iwslt2016/segmented/eval.de.bpe", 'r').readline())
    print("eval2:", open("iwslt2016/segmented/eval.en.bpe", 'r').readline())
    print("test1:", open("iwslt2016/segmented/test.de.bpe", 'r').readline())
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  #### Validate flags
  if FLAGS.save_steps is not None:
    FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)

  processors = {
      "detect": DetectProcessor,
  }

  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval`, `do_predict` or "
        "`do_submit` must be True.")

  if not tf.gfile.Exists(FLAGS.output_dir):
    tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name]()
  label_list = processor.get_labels() if not FLAGS.is_regression else None

  sp = spm.SentencePieceProcessor()
  sp.Load(FLAGS.spiece_model_file)
  def tokenize_fn(text):
    text = preprocess_text(text, lower=FLAGS.uncased)
    return encode_ids(sp, text)

  run_config = model_utils.configure_tpu(FLAGS)

  model_fn = get_model_fn(len(label_list) if label_list is not None else None)

  spm_basename = os.path.basename(FLAGS.spiece_model_file)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  if FLAGS.use_tpu:
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)
  else:
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config)

  if FLAGS.do_train:
    train_file_base = "{}.len-{}.train.tf_record".format(
        spm_basename, FLAGS.max_seq_length)
    train_file = os.path.join(FLAGS.output_dir, train_file_base)
    tf.logging.info("Use tfrecord file {}".format(train_file))

    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    np.random.shuffle(train_examples)
    tf.logging.info("Num of train samples: {}".format(len(train_examples)))
    tf.logging.info("Num of train steps: {}".format(num_train_steps))

    file_based_convert_examples_to_features(
        train_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
        train_file, FLAGS.num_passes)

    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    estimator.train(input_fn=train_input_fn, steps=num_train_steps)

  # TODO
  if FLAGS.do_train_test:
    train_test_file_base = "{}.len-{}.train_test.tf_record".format(
        spm_basename, FLAGS.max_seq_length)
    train_test_file = os.path.join(FLAGS.output_dir, train_test_file_base)
    tf.logging.info("Use tfrecord file {}".format(train_test_file))

    train_test_examples = processor.get_train_test_examples(FLAGS.data_dir)
    num_train_test_steps = int(
        len(train_test_examples) / FLAGS.train_batch_size * 1)
    np.random.shuffle(train_test_examples)
    tf.logging.info("Num of test samples: {}".format(len(train_test_examples)))
    tf.logging.info("Num of test steps: {}".format(num_train_test_steps))

    file_based_convert_examples_to_features(
        train_test_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
        train_test_file, FLAGS.num_passes)

    train_test_input_fn = file_based_input_fn_builder(
        input_file=train_test_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    estimator.train(input_fn=train_test_input_fn, steps=num_train_test_steps)

  if FLAGS.do_eval:
    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on. These do NOT count towards the metric (all tf.metrics
    # support a per-instance weight, and these get a weight of 0.0).
    #
    # Modified in XL: We also adopt the same mechanism for GPUs.

    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    tf.logging.info("Num of eval samples: {}".format(len(eval_examples)))

    while len(eval_examples) % FLAGS.eval_batch_size != 0:
      eval_examples.append(PaddingInputExample())

    eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
        spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
    eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

    file_based_convert_examples_to_features(
        eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
        eval_file)

    assert len(eval_examples) % FLAGS.eval_batch_size == 0
    eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=True)

    # Filter out all checkpoints in the directory
    steps_and_files = []
    filenames = tf.gfile.ListDirectory(FLAGS.model_dir)

    for filename in filenames:
      if filename.endswith(".index"):
        ckpt_name = filename[:-6]
        cur_filename = join(FLAGS.model_dir, ckpt_name)
        global_step = int(cur_filename.split("-")[-1])
        tf.logging.info("Add {} to eval list.".format(cur_filename))
        steps_and_files.append([global_step, cur_filename])
    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

    # Decide whether to evaluate all ckpts
    if not FLAGS.eval_all_ckpt:
      steps_and_files = steps_and_files[-1:]

    eval_results = []
    for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
      ret = estimator.evaluate(
          input_fn=eval_input_fn,
          steps=eval_steps,
          checkpoint_path=filename)

      ret["step"] = global_step
      ret["path"] = filename

      eval_results.append(ret)

      tf.logging.info("=" * 80)
      log_str = "Eval result | "
      for key, val in sorted(ret.items(), key=lambda x: x[0]):
        log_str += "{} {} | ".format(key, val)
      tf.logging.info(log_str)

    key_name = "eval_pearsonr" if FLAGS.is_regression else "eval_accuracy"
    eval_results.sort(key=lambda x: x[key_name], reverse=True)

    tf.logging.info("=" * 80)
    log_str = "Best result | "
    for key, val in sorted(eval_results[0].items(), key=lambda x: x[0]):
      log_str += "{} {} | ".format(key, val)
    tf.logging.info(log_str)

  if FLAGS.do_predict:

    predict_dir = FLAGS.predict_dir
    if not tf.gfile.Exists(predict_dir):
      tf.gfile.MakeDirs(predict_dir)

    predict_file_base = "{}.len-{}.{}.predict.tf_record".format(spm_basename, FLAGS.max_seq_length, FLAGS.predict_split)
    predict_file = os.path.join(FLAGS.output_dir, predict_file_base)

    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    num_actual_predict_examples = len(predict_examples)
    tf.logging.info("Num of predict samples: {}".format(len(predict_examples)))
    file_based_convert_examples_to_features(
        predict_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
        predict_file)

    pred_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    if FLAGS.predict_batch_size != 1:
        result = estimator.predict(input_fn=pred_input_fn)
    else:
        result = estimator.predict(input_fn=pred_input_fn, yield_single_examples=False)

    if FLAGS.use_stack:
        logits = [ prediction["logits"] for prediction in result ]
        save_pickle(FLAGS.stack_dir, logits)

    # TODO
    output_predict_file = FLAGS.test_save
    original_file = os.path.join(FLAGS.data_dir, FLAGS.test_set)

    df = pd.read_csv(original_file)

    lines = [row['id'] for index, row in df.iterrows()]

    with open(output_predict_file, "w") as f:
      writer = csv.writer(f, delimiter=',')
      writer.writerow(['id','label'])
      num_written_lines = 0
      tf.logging.info("***** Predict results *****")
      for (i, prediction) in enumerate(zip(lines, result)):
        ID = prediction[0]
        label = prediction[1]["labels"]
        if i >= num_actual_predict_examples:
          break
        writer.writerow([ID, label])
        num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples
def main(
        init_run_path,
        dataset_path,
        sp_model_path,
        dist_run_path,
        epochs=10,
        lr=2.5e-4,
        batch_size=2,  # per GPU
        g_accum_gradients=None,  # accumulate gradients N times (globally)
        gradient_checkpointing=False, # saves GPU memory
        n_ctx=1024,
        n_embed=768,
        n_head=12,
        n_layer=12,
        n_hidden=None,  # equal to n_embed by default (better leave at None)
        clean=False,  # clean run folder
        log_every=20,
        save_every=10000,
        validate_every=None,  # same as save_every by default
        only_validate=False,
        max_tokens=None,
        opt_level=None,  # apex.amp opt level (e.g. "O1")
        # train on contexts starting from sentence start
        sample_sentences=False,
        verbose=False,  # print all training contexts
        # Multi-GPU related settings
        master_port='40390',
        master_addr='127.0.0.1',
        # These two are set automatically when multiple GPUs are available
        device_id=None,
        n_devices=None,
        ):
  
    #check multi gpu training 
    if n_devices is None:
        n_devices = torch.cuda.device_count()
        if n_devices > 1:
            locals_ = locals()
            kwargs = {a: locals_[a] for a in get_defined_args(main)}
            mp.spawn(_main_mp, (kwargs,), n_devices)
            return

    # the gradient is accumulated from different devices
    is_main = device_id in {0, None}
    world_size = max(1, n_devices)
    if g_accum_gradients is None:
        g_accum_gradients = world_size
    assert g_accum_gradients % world_size == 0
    accum_gradients = g_accum_gradients // world_size

    if validate_every is None:
        validate_every = save_every

    #model loading stuff 
    init_run_path = Path(init_run_path)
    dist_run_path = Path(dist_run_path)
    continue_training = False
    if not dist_run_path.exists():
      print(f"Creating {dist_run_path}")
      dist_run_path.mkdir(exist_ok=True, parents=True)
    else:
      continue_training = True 

    model_path = init_run_path / 'model.pt'
    optimizer_path = init_run_path / 'optim.pt'
    if is_main:
        run_path_mark = init_run_path / '.lm'
        if clean and init_run_path.exists():
            assert run_path_mark.exists()  # to avoid removing unrelated folder
            shutil.rmtree(init_run_path)
        init_run_path.mkdir(exist_ok=True, parents=True)
        run_path_mark.touch()
        shutil.copy(sp_model_path, dist_run_path / 'sp.model')

    #load sentence piece model 
    sp_model = spm.SentencePieceProcessor()
    sp_model.load(sp_model_path)

    #model parameters 
    hparams = HParams(
        n_vocab=len(sp_model),
        n_ctx=n_ctx,
        n_embed=n_embed,
        n_hidden=n_hidden or n_embed,
        n_head=n_head,
        n_layer=n_layer,
        gradient_checkpointing=gradient_checkpointing,
    )
    params = dict(
        hparams=attr.asdict(hparams),
        argv=' '.join(sys.argv),
        epochs=epochs,
        lr=lr,
        batch_size=batch_size,
        g_accum_gradients=g_accum_gradients,
    )
    params_s = json.dumps(params, indent=4, sort_keys=True, ensure_ascii=False)
    if is_main:
        print(params_s)
        (dist_run_path / 'params.json').write_text(params_s, encoding='utf8')

    #load encoded dataset 
    dataset_path = Path(dataset_path)
    print(f'Loading dataset from {dataset_path}')
    valid_dataset = np.load(dataset_path / 'valid.npy')
    train_dataset = np.load(dataset_path / 'train.npy')
    step_tokens = n_ctx * batch_size * g_accum_gradients  # all GPUs
    print(f'Train dataset has {len(train_dataset):,} tokens')
    print(f'Validation dataset has {len(valid_dataset):,} tokens')

    if sample_sentences:
        train_sample_index, valid_sample_index = [
            _sentense_sample_index(dataset, n_ctx, sp_model)
            for dataset in [train_dataset, valid_dataset]]
    else:
        train_sample_index = valid_sample_index = None

    #check gpu 
    if torch.cuda.is_available():
        device = torch.device('cuda', index=device_id)
    else:
        device = torch.device('cpu')

    #initialize model, loss and optimizer 
    model = Model(hparams).to(device)
    cross_entropy = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_meter = AverageMeter()
    cudnn.benchmark = True
    
    if opt_level:
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=opt_level)

    seen_tokens = 0

    def load_model(model_path , optimizer_path):
        """ Load model, update seen_tokens value
        """
        nonlocal seen_tokens
        state = torch.load(model_path, map_location=device)
        if 'seen_tokens' in state:
            seen_tokens = state['seen_tokens']
        else:  # legacy format
            seen_tokens = state['step'] * step_tokens
        # seen_tokens = 0 

        state_dict = fixed_state_dict(state['state_dict'])
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(torch.load(optimizer_path, map_location=device))
        print(f'Resuming from seen_tokens {seen_tokens:,}')

    if continue_training:
      print("Continue Training ...")
      load_model(dist_run_path / 'model.pt', dist_run_path / 'optim.pt')

    elif model_path.exists():
      load_model(model_path , optimizer_path)

    if device_id is not None:
        print(f'device {device} initializing process group')
        os.environ['MASTER_PORT'] = master_port
        os.environ['MASTER_ADDR'] = master_addr
        torch.distributed.init_process_group(
            backend='nccl', rank=device_id, world_size=world_size)
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device_id], output_device=device_id)
        print(f'process group for {device} initialized')

    def loss_fn(logits, ctx):
        return cross_entropy(
            input=logits[:, :-1].reshape([-1, logits.shape[-1]]),
            target=ctx[:, 1:].reshape(-1))

    def train_step():
        """ Train step on one GPU.
        """
        context = _gen_training_batch(
            train_dataset,
            n_ctx=n_ctx,
            batch_size=batch_size * accum_gradients,
            sample_index=train_sample_index)
        if verbose:
            print()
            for ctx in context:
                print(repr(sp_model.decode_ids(list(map(int, ctx)))))
            print()
        context = torch.LongTensor(context)
        optimizer.zero_grad()
        loss_scale = n_ctx * batch_size * accum_gradients / (512 * 4 * 32)
        for ctx in torch.split(context, batch_size):
            ctx = ctx.to(device=device)
            logits = model(ctx)['logits']
            loss = loss_fn(logits, ctx)
            loss_b = loss * loss_scale
            if opt_level:
                with amp.scale_loss(loss_b, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss_b.backward()
            loss_meter.update(float(loss.item()))
        optimizer.step()

    def train():
        nonlocal seen_tokens
        epoch_size = len(train_dataset) // step_tokens * step_tokens
        pbar = tqdm.trange(epochs, desc='epochs', dynamic_ncols=True, disable=not is_main)

        # pbar used for epochs 
        # init_epoch_pbar = lambda: tqdm.trange(
        #     epoch_size, dynamic_ncols=True, disable=not is_main)
        # init_epoch_pbar = lambda: tqdm.trange(epoch_size, disable=not is_main)
        # epoch_pbar = init_epoch_pbar()

        # # pbar.update(seen_tokens // epoch_size)
        # # pbar.refresh()
        # epoch_pbar.update(seen_tokens % epoch_size)
        step = 1
        loss_per_epoch = [] 
        j = 0 
        start_time = time.time()

        while seen_tokens < epochs * epoch_size:
            if max_tokens and seen_tokens >= max_tokens:
                print(f'max_tokens {max_tokens} reached, '
                      f'saving and exiting')
                save()
                validate()
                return

            train_step()
            seen_tokens += step_tokens
            step += 1
            # epoch_pbar.update(step_tokens)
            # epoch_pbar.set_description(f'epoch {1 + seen_tokens // epoch_size}')
            # epoch_pbar.set_postfix(loss=f'{loss_meter.mean():.2f}')
            # epoch_pbar.refresh()
            loss_per_epoch.append(loss_meter.mean())
            if step % save_every == 0:
                save()
            if is_main and step % log_every == 0:
                json_log_plots.write_event(dist_run_path, step=seen_tokens,
                                           loss=loss_meter.mean())
                loss_meter.reset()
                
            if step % validate_every == 0:
                validate()
            
            # create a new progress bar for the next epoch
            if seen_tokens % epoch_size == 0:
                # pbar.update()
                # epoch_pbar.close()
                # epoch_pbar = init_epoch_pbar()
                valid_loss = get_valid_loss()
                print(f'epoch: {j} \t train_loss: {np.mean(loss_per_epoch):.3f} \t valid_loss = {valid_loss:.3f} \t time: {(time.time()-start_time):.2f}')
                j += 1
                loss_per_epoch = []
                start_time = time.time()

        # end of training
        save()
        validate()

    def validate():
        if not is_main or world_size != 1:
            return
        json_log_plots.write_event(dist_run_path, step=seen_tokens,
                                   valid_loss=get_valid_loss())

    def get_valid_loss():
        """ Run validation, return mean loss. This is a pessimistic score,
        as validation contexts are non-overlapping.
        """
        model.eval()
        losses = AverageMeter()
        with torch.no_grad():
            for ctx in _valid_batch_iter(
                    valid_dataset, batch_size=batch_size, n_ctx=n_ctx,
                    sample_index=valid_sample_index):
                if not ctx:
                    continue
                ctx = torch.LongTensor(ctx).to(device)
                logits = model(ctx)['logits']
                loss = loss_fn(logits, ctx)
                losses.update(float(loss.item()))
        model.train()
        return losses.mean()

    def save():
        if not is_main:
            return
        # for path in [model_path, optimizer_path]:
        #     if path.exists():
        #         shutil.copy(path, run_path / f'{path.stem}-prev{path.suffix}')
        torch.save({
            'state_dict': _unwrapped_model(model).state_dict(),
            'seen_tokens': seen_tokens,
        }, dist_run_path / "model.pt")
        torch.save(optimizer.state_dict(), dist_run_path/ "optim.pt")

    if only_validate:
        if world_size != 1:
            print('multi-GPU validation is not supported yet')
            sys.exit(1)
        if is_main:
            print(f'Validation loss: {get_valid_loss():.4f}')
    else:
        try:
            train()
        except KeyboardInterrupt:
            if is_main:
                print('Interrupted, saving')
                save()
                sys.exit(1)
Example #13
def LoadMultlingualDataset(args):
    """
  Function to load individual datasets and Preprocess them
   individuall. A language token in also added at
  the start of each dataset.
  Takes in Preprocessed data and trains a sentencepiece model on the
  target sentences if enables, else uses default tensorflow tokenizer.

  :param args: The args obj which contains paths to the preprocessed files
  :type args: ArgParse object
  :return: The mulitlingual dataset, source and target vocab
  :rtype: The multilingual dataset is returned as dict,
          source and tgt vocabs.
  """

    dataset = {}
    CUR_DIR = os.getcwd()
    levels_up = 0
    if args.use_colab is not None:
        DATA_PATH = 'GSoC-19/data/processed_data/'
    else:
        DATA_PATH = (os.path.normpath(
            os.path.join(*([CUR_DIR] +
                           [".."] * levels_up)))) + '/data/processed_data/'

    # create vocabs for the source
    src_vocab = tf.keras.preprocessing.text.Tokenizer(filters='')
    target_str = ''
    spl_sym = DATA_PATH + 'special_symbols'

    for lang in languages:

        (dataset[lang + '_train_nodes'], dataset[lang + '_train_labels'],
         dataset[lang + '_train_node1'],
         dataset[lang + '_train_node2']) = PreProcess(
             DATA_PATH + lang + '/train_src', lang)
        (dataset[lang + '_eval_nodes'], dataset[lang + '_eval_labels'],
         dataset[lang + '_eval_node1'],
         dataset[lang + '_eval_node2']) = PreProcess(
             DATA_PATH + lang + '/eval_src', lang)
        (dataset[lang + '_test_nodes'], dataset[lang + '_test_labels'],
         dataset[lang + '_test_node1'],
         dataset[lang + '_test_node2']) = PreProcess(
             DATA_PATH + lang + '/test_src', lang)
        train_tgt = io.open(DATA_PATH + lang + '/train_tgt',
                            encoding='UTF-8').read().strip().split('\n')
        dataset[lang + '_train_tgt'] = [
            (PreProcessSentence(w, args.sentencepiece, lang))
            for w in train_tgt
        ]
        eval_tgt = io.open(DATA_PATH + lang + '/eval_tgt',
                           encoding='UTF-8').read().strip().split('\n')
        dataset[lang + '_eval_tgt'] = [
            (PreProcessSentence(w, args.sentencepiece, lang)) for w in eval_tgt
        ]
        target_str += (DATA_PATH + lang + '/train_tgt') + ','
        target_str += (DATA_PATH + lang + '/eval_tgt') + ','

        # fit the vocab
        src_vocab.fit_on_texts(dataset[lang + '_train_nodes'])
        src_vocab.fit_on_texts(dataset[lang + '_train_labels'])
        src_vocab.fit_on_texts(dataset[lang + '_train_node1'])
        src_vocab.fit_on_texts(dataset[lang + '_train_node2'])
        src_vocab.fit_on_texts(dataset[lang + '_eval_nodes'])
        src_vocab.fit_on_texts(dataset[lang + '_eval_labels'])
        src_vocab.fit_on_texts(dataset[lang + '_eval_node1'])
        src_vocab.fit_on_texts(dataset[lang + '_eval_node2'])

        if args.sentencepiece == 'False':
            src_vocab.fit_on_texts(dataset[lang + '_train_tgt'])
            src_vocab.fit_on_texts(dataset[lang + '_eval_tgt'])

    if args.sentencepiece == 'True':
        print('Targets: ' + target_str)
        os.makedirs(('vocabs/gat/' + args.lang), exist_ok=True)
        spm.SentencePieceTrainer.Train('--input=' + target_str + spl_sym + '  \
                                                --model_prefix=vocabs/' +
                                       args.model + '/' + args.lang +
                                       '/train_tgt \
                                                --vocab_size=' +
                                       str(args.vocab_size) +
                                       ' --character_coverage=1.0 '
                                       '--model_type=' +
                                       args.sentencepiece_model +
                                       ' --hard_vocab_limit=false')
        sp = spm.SentencePieceProcessor()
        sp.load('vocabs/' + args.model + '/' + args.lang + '/train_tgt.model')

    if args.sentencepiece == 'True':
        return dataset, src_vocab, sp
    else:
        return dataset, src_vocab, src_vocab
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model",
                        required=True,
                        help="sentencepiece model to use for encoding")
    parser.add_argument("--inputs",
                        nargs="+",
                        default=['-'],
                        help="input files to filter/encode")
    parser.add_argument("--outputs",
                        nargs="+",
                        default=['-'],
                        help="path to save encoded outputs")
    parser.add_argument("--output_format",
                        choices=["piece", "id"],
                        default="piece")
    parser.add_argument("--min-len",
                        type=int,
                        metavar="N",
                        help="filter sentence pairs with fewer than N tokens")
    parser.add_argument("--max-len",
                        type=int,
                        metavar="N",
                        help="filter sentence pairs with more than N tokens")
    parser.add_argument("--nbest_size",
                        type=int,
                        metavar="N",
                        help="sampling size")
    parser.add_argument("--alpha",
                        type=float,
                        metavar="N",
                        help="smoothing parameter")
    args = parser.parse_args()

    assert len(args.inputs) == len(args.outputs), \
            "number of input and output paths should match"

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.output_format == "piece":

        def encode(l):
            return sp.SampleEncodeAsPieces(l, args.nbest_size, args.alpha)
    elif args.output_format == "id":

        def encode(l):
            return list(
                map(str, sp.SampleEncodeAsIds(l, args.nbest_size, args.alpha)))
    else:
        raise NotImplementedError

    if args.min_len is not None or args.max_len is not None:

        def valid(line):
            return ((args.min_len is None or len(line) >= args.min_len)
                    and (args.max_len is None or len(line) <= args.max_len))
    else:

        def valid(line):
            return True

    with contextlib.ExitStack() as stack:
        inputs = [
            stack.enter_context(
                open(input,
                     "r",
                     encoding="utf-8",
                     newline="\n",
                     errors="ignore")) if input != "-" else sys.stdin
            for input in args.inputs
        ]
        outputs = [
            stack.enter_context(
                open(output, "w", encoding="utf-8", newline="\n"))
            if output != "-" else sys.stdout for output in args.outputs
        ]

        stats = {
            "num_empty": 0,
            "num_filtered": 0,
        }

        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None

        for i, lines in enumerate(zip(*inputs), start=1):
            enc_lines = list(map(encode_line, lines))
            if not any(enc_line is None for enc_line in enc_lines):
                for enc_line, output_h in zip(enc_lines, outputs):
                    print(" ".join(enc_line), file=output_h)
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

        print("skipped {} empty lines".format(stats["num_empty"]),
              file=sys.stderr)
        print("filtered {} lines".format(stats["num_filtered"]),
              file=sys.stderr)
Example #15
 def loadTrainedBPE(self):
     self.sp = spm.SentencePieceProcessor()
     self.sp.Load(self.basePath + "/segmented/bpe.model")
     return
Example #16
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, required=True)
    # parser.add_argument('--output', type=str, required=True)
    parser.add_argument('--spm_model_en_de', type=str, required=True)
    parser.add_argument('--spm_model_de_en', type=str, required=True)
    parser.add_argument('--model_en_de', type=str, required=True)
    parser.add_argument('--model_de_en', type=str, required=True)

    # parser.add_argument('--vocab', type=str, default=None)
    # parser.add_argument('--vocab_thresh', type=int, default=None)

    args = parser.parse_args()

    sp_en_de = spm.SentencePieceProcessor()
    sp_en_de.Load(args.spm_model_en_de)

    # vocabulary = read_vocabulary(codecs.open(args.vocab, 'r', 'utf-8'), args.vocab_thresh)

    sp_de_en = spm.SentencePieceProcessor()
    sp_de_en.Load(args.spm_model_de_en)

    #************************************************************************
    # English -> German
    #************************************************************************
    translate_en_de_parser = argparse.ArgumentParser()
    translate_opts(translate_en_de_parser)
    translator_en_de_args = translate_en_de_parser.parse_known_args([])[0]

    translator_en_de_args.src = 'na'
Example #17
# 3rd party libraries
import sentencepiece as sp # Tokenizer
# Local modules
from params import * # set of all parameters


# Let's load the trained model
model = load_model(trained_model_filename)

# Load our ready-to-use numpy arrays for testing
testX = np.load(testX_array_filename)
testY = np.load(testY_array_filename)

# Load trained tokenizer for English and French
# Creating a tokenizer object for English
en_sp = sp.SentencePieceProcessor()
# Loading the English model
en_sp.Load("en.model")
# Creating a tokenizer object for French
fr_sp = sp.SentencePieceProcessor()
# Loading the French model
fr_sp.Load("fr.model")

# Predict
predictions = model.predict_classes(testX)

# Check the translation on a few sentences
for index in range(10):
    print("Original:")
    print(fr_sp.DecodeIds(testX[index, :].tolist()))
    print("Expected:")
Exemple #18
0
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  #### Validate flags
  if FLAGS.save_steps is not None:
    FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)

  if FLAGS.do_predict:
    predict_dir = FLAGS.predict_dir
    if not tf.gfile.Exists(predict_dir):
      tf.gfile.MakeDirs(predict_dir)

  processors = {
      "mnli_matched": MnliMatchedProcessor,
      "mnli_mismatched": MnliMismatchedProcessor,
      "sts-b": StsbProcessor,
      "imdb": ImdbProcessor,
      "yelp5": Yelp5Processor,
      "stackoverflowbody": StackoverflowBodyProcessor,
      "stackoverflowtitle": StackoverflowTitleProcessor,
  }

  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval, `do_predict` or "
        "`do_submit` must be True.")

  if not tf.gfile.Exists(FLAGS.output_dir):
    tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name]()
  label_list = processor.get_labels() if not FLAGS.is_regression else None

  sp = spm.SentencePieceProcessor()
  sp.Load(FLAGS.spiece_model_file)
  def tokenize_fn(text):
    text = preprocess_text(text, lower=FLAGS.uncased)
    return encode_ids(sp, text)

  run_config = model_utils.configure_tpu(FLAGS)    
      
  model_fn = get_model_fn(len(label_list) if label_list is not None else None)

  spm_basename = os.path.basename(FLAGS.spiece_model_file)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  if FLAGS.use_tpu:
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)
  else:
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config)

  if FLAGS.do_train:
    train_file_base = "{}.len-{}.train.tf_record".format(
        spm_basename, FLAGS.max_seq_length)
    train_file = os.path.join(FLAGS.output_dir, train_file_base)
    tf.logging.info("Use tfrecord file {}".format(train_file))
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    np.random.shuffle(train_examples)
    tf.logging.info("Num of train samples: {}".format(len(train_examples)))

    file_based_convert_examples_to_features(
        train_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
        train_file, FLAGS.num_passes)

    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

  if FLAGS.do_eval or FLAGS.do_predict:
    if FLAGS.eval_split == "dev":
      eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    else:
      eval_examples = processor.get_test_examples(FLAGS.data_dir)

    tf.logging.info("Num of eval samples: {}".format(len(eval_examples)))

  if FLAGS.do_eval:
    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on. These do NOT count towards the metric (all tf.metrics
    # support a per-instance weight, and these get a weight of 0.0).
    #
    # Modified in XL: We also adopt the same mechanism for GPUs.
    while len(eval_examples) % FLAGS.eval_batch_size != 0:
      eval_examples.append(PaddingInputExample())

    eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
        spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
    eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

    file_based_convert_examples_to_features(
        eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
        eval_file)

    assert len(eval_examples) % FLAGS.eval_batch_size == 0
    eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=True)

    # Filter out all checkpoints in the directory
    steps_and_files = []
    filenames = tf.gfile.ListDirectory(FLAGS.model_dir)

    for filename in filenames:
      if filename.endswith(".index"):
        ckpt_name = filename[:-6]
        cur_filename = join(FLAGS.model_dir, ckpt_name)
        global_step = int(cur_filename.split("-")[-1])
        tf.logging.info("Add {} to eval list.".format(cur_filename))
        steps_and_files.append([global_step, cur_filename])
        
    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

    # Decide whether to evaluate all ckpts
    if not FLAGS.eval_all_ckpt:
      steps_and_files = steps_and_files[-1:]

    eval_results = []
    for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
      ret = estimator.evaluate(
          input_fn=eval_input_fn,
          steps=eval_steps,
          checkpoint_path=filename)

      ret["step"] = global_step
      ret["path"] = filename

      eval_results.append(ret)

      tf.logging.info("=" * 80)
      log_str = "Eval result | "
      for key, val in sorted(ret.items(), key=lambda x: x[0]):
        log_str += "{} {} | ".format(key, val)
      tf.logging.info(log_str)

    key_name = "eval_pearsonr" if FLAGS.is_regression else "eval_accuracy"
    eval_results.sort(key=lambda x: x[key_name], reverse=True)

    tf.logging.info("=" * 80)
    log_str = "Best result | "
    for key, val in sorted(eval_results[0].items(), key=lambda x: x[0]):
      log_str += "{} {} | ".format(key, val)
    tf.logging.info(log_str)

  if FLAGS.do_predict:
    eval_file_base = "{}.len-{}.{}.predict.tf_record".format(
        spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
    eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

    file_based_convert_examples_to_features(
        eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
        eval_file)

    pred_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    predict_results = []
    with tf.gfile.Open(os.path.join(predict_dir, "{}.tsv".format(
        task_name)), "w") as fout:
      fout.write("index\tprediction\n")

      for pred_cnt, result in enumerate(estimator.predict(
          input_fn=pred_input_fn,
          yield_single_examples=True,
          checkpoint_path=FLAGS.predict_ckpt)):
        if pred_cnt % 1000 == 0:
          tf.logging.info("Predicting submission for example: {}".format(
              pred_cnt))

        logits = [float(x) for x in result["logits"].flat]
        predict_results.append(logits)

        if len(logits) == 1:
          label_out = logits[0]
        elif len(logits) == 2:
          if logits[1] - logits[0] > FLAGS.predict_threshold:
            label_out = label_list[1]
          else:
            label_out = label_list[0]
        elif len(logits) > 2:
          max_index = np.argmax(np.array(logits, dtype=np.float32))
          #label_out = label_list[max_index]
          label_out = logits
        else:
          raise NotImplementedError

        fout.write("{}\t{}\n".format(pred_cnt, label_out))

    predict_json_path = os.path.join(predict_dir, "{}.logits.json".format(
        task_name))

    with tf.gfile.Open(predict_json_path, "w") as fp:
      json.dump(predict_results, fp, indent=4)
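
# A minimal sketch of the padding mechanism described in the do_eval branch
# above: top up the example list to a multiple of the batch size so every batch
# is full; PaddingInputExample here is a stand-in placeholder class.
class PaddingInputExample(object):
    """Fake example so that the number of examples is a multiple of the batch size."""


def pad_to_batch_multiple(examples, batch_size):
    padded = list(examples)
    while len(padded) % batch_size != 0:
        padded.append(PaddingInputExample())
    return padded


padded_eval = pad_to_batch_multiple(["ex1", "ex2", "ex3"], batch_size=8)
assert len(padded_eval) % 8 == 0  # 3 real examples + 5 padding placeholders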
Exemple #19
0
    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        cls_token_box=[0, 0, 0, 0],
        sep_token_box=[1000, 1000, 1000, 1000],
        pad_token_box=[0, 0, 0, 0],
        pad_token_label=-100,
        only_label_first_subword=True,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> None:
        # Mask token behave like a normal word, i.e. include the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            cls_token_box=cls_token_box,
            sep_token_box=sep_token_box,
            pad_token_box=pad_token_box,
            pad_token_label=pad_token_label,
            only_label_first_subword=only_label_first_subword,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # Original fairseq vocab and spm vocab must be "aligned":
        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'

        # Mimic fairseq token-to-id alignment for the first 4 token
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 1

        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        # additional properties
        self.cls_token_box = cls_token_box
        self.sep_token_box = sep_token_box
        self.pad_token_box = pad_token_box
        self.pad_token_label = pad_token_label
        self.only_label_first_subword = only_label_first_subword
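
# A hedged sketch of token<->id conversion under the fairseq/spm alignment
# documented above (fairseq_offset = 1); the function names and explicit
# arguments are illustrative, not necessarily the methods this class defines.
def convert_token_to_id(token, sp_model, fairseq_tokens_to_ids, fairseq_offset=1, unk_id=3):
    if token in fairseq_tokens_to_ids:  # <s>, <pad>, </s>, <unk>, <mask>
        return fairseq_tokens_to_ids[token]
    spm_id = sp_model.PieceToId(token)
    # spm returns 0 for unknown pieces; that must map to <unk>, not to <s>
    return spm_id + fairseq_offset if spm_id else unk_id


def convert_id_to_token(index, sp_model, fairseq_ids_to_tokens, fairseq_offset=1):
    if index in fairseq_ids_to_tokens:
        return fairseq_ids_to_tokens[index]
    return sp_model.IdToPiece(index - fairseq_offset)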
Exemple #20
0
    def __init__(self,
                 vocab_file,
                 bos_token="[SEP]",
                 eos_token="[SEP]",
                 sep_token="[SEP]",
                 unk_token="[UNK]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 sp_model_kwargs: Optional[Dict[str, Any]] = None,
                 **kwargs) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            unk_token=unk_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use XLMRobertaTokenizer: "
                "https://github.com/google/sentencepiece "
                "pip install sentencepiece")
            raise

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # Original fairseq vocab and spm vocab must be "aligned":
        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'

        # put special tokens and [unused] tokens into the vocab
        self.fairseq_tokens_to_ids = {
            "[PAD]": 0,
            "[CLS]": 1,
            "[SEP]": 2,
            "[UNK]": 3,
            "[MASK]": 4
        }

        for i in range(10):
            tok = f"[unused{i}]"
            self.fairseq_tokens_to_ids[tok] = 5 + i

        # The first "real" token "," has position 15 in the embedding vocab and position 3 in the spm vocab
        self.fairseq_offset = 12
        self.fairseq_ids_to_tokens = {
            v: k
            for k, v in self.fairseq_tokens_to_ids.items()
        }
        for k in self.fairseq_tokens_to_ids.keys():
            self.unique_no_split_tokens.append(k)
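
# A quick check of the offset used above: 5 special tokens plus 10 [unused]
# slots occupy embedding ids 0-14, so the first real spm piece (id 3, ',')
# lands at embedding id 15, which is exactly an offset of 12.
num_special = 5          # [PAD], [CLS], [SEP], [UNK], [MASK]
num_unused = 10          # [unused0] .. [unused9]
first_real_spm_id = 3    # ',' in the spm vocab
fairseq_offset = (num_special + num_unused) - first_real_spm_id
assert fairseq_offset == 12
assert first_real_spm_id + fairseq_offset == 15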
Exemple #21
0
    def __init__(self, params, model, num_workers, worker_id):
        """Speech-to-text data layer constructor.
    See parent class for arguments description.
    Config parameters:
    * **num_audio_features** (int) --- number of audio features to extract.
    * **input_type** (str) --- could be either "spectrogram" or "mfcc".
    * **vocab_file** (str) --- path to vocabulary file or sentencepiece model.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
    * **augmentation** (dict) --- optional dictionary with data augmentation
      parameters. Can contain "time_stretch_ratio", "noise_level_min" and
      "noise_level_max" parameters, e.g.::
        {
          'time_stretch_ratio': 0.05,
          'noise_level_min': -90,
          'noise_level_max': -60,
        }
      For additional details on these parameters see
      :func:`data.speech2text.speech_utils.augment_audio_signal` function.
    * **autoregressive** (bool) --- boolean indicating whether the model is
      autoregressive.
    * **syn_enable** (bool) --- boolean indicating whether the model is
      using synthetic data.
    * **syn_subdirs** (list) --- must be defined if using synthetic mode.
      Contains a list of subdirectories that hold the synthetic wav files.
    """
        super(Speech2TextDataLayer, self).__init__(params, model, num_workers,
                                                   worker_id)

        self.params['autoregressive'] = self.params.get(
            'autoregressive', False)
        self.autoregressive = self.params['autoregressive']
        self.params['bpe'] = self.params.get('bpe', False)
        if self.params['bpe']:
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(self.params['vocab_file'])
            self.params['tgt_vocab_size'] = len(self.sp) + 1
        else:
            self.params['char2idx'] = load_pre_existing_vocabulary(
                self.params['vocab_file'],
                read_chars=True,
            )
            if not self.autoregressive:
                # add one for implied blank token
                self.params['tgt_vocab_size'] = len(
                    self.params['char2idx']) + 1
            else:
                num_chars_orig = len(self.params['char2idx'])
                self.params['tgt_vocab_size'] = num_chars_orig + 2
                self.start_index = num_chars_orig
                self.end_index = num_chars_orig + 1
                self.params['char2idx']['<S>'] = self.start_index
                self.params['char2idx']['</S>'] = self.end_index
                self.target_pad_value = self.end_index
            self.params['idx2char'] = {
                i: w
                for w, i in self.params['char2idx'].items()
            }
        self.target_pad_value = 0

        self._files = None
        if self.params["interactive"]:
            return
        for csv in params['dataset_files']:
            files = pd.read_csv(csv, encoding='utf-8')
            if self._files is None:
                self._files = files
            else:
                self._files = self._files.append(files)

        if self.params['mode'] != 'infer':
            cols = ['wav_filename', 'transcript']
        else:
            cols = 'wav_filename'

        self.all_files = self._files.loc[:, cols].values
        self._files = self.split_data(self.all_files)

        self._size = self.get_size_in_samples()
        self._dataset = None
        self._iterator = None
        self._input_tensors = None

        self.params['max_duration'] = params.get('max_duration', -1.0)
        self.params['window_size'] = params.get('window_size', 20e-3)
        self.params['window_stride'] = params.get('window_stride', 10e-3)
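
# An illustrative params dict for the data layer above, following its
# docstring; the paths and numeric values are placeholders, not defaults.
params = {
    "num_audio_features": 64,
    "input_type": "spectrogram",
    "vocab_file": "data/bpe_1k.model",   # a sentencepiece model, hence bpe=True
    "dataset_files": ["data/train.csv"],
    "bpe": True,
    "autoregressive": False,
    "augmentation": {
        "time_stretch_ratio": 0.05,
        "noise_level_min": -90,
        "noise_level_max": -60,
    },
    "max_duration": 16.7,
    "window_size": 20e-3,
    "window_stride": 10e-3,
}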
Exemple #22
0
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import sentencepiece as spm

import data_loader
#from vocab import Vocab
from log_timer import LogTimer
from datetime import datetime
from utilities import *
from torch.optim.lr_scheduler import StepLR
logging.basicConfig(filename='LSTM_2lAdam_optim_BPE_20000_3_2048.log',
                    level=logging.DEBUG)

sp_bpe = spm.SentencePieceProcessor()
sp_bpe.load(
    '../../rnn/RNN-Sherlock-Language-Model/korr_ukrlib_bpe_model_20000.model')


class RnnLm(nn.Module):
    """ A language model RNN with GRU layer(s). """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, gru_layers, tied,
                 dropout):
        super(RnnLm, self).__init__()
        self.tied = tied
        if not tied:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.LSTM(embedding_dim,
                           hidden_dim,
                           gru_layers,
    def __init__(self,
                 vocab_file,
                 src_lang=None,
                 tgt_lang=None,
                 eos_token="</s>",
                 sep_token="</s>",
                 cls_token="<s>",
                 unk_token="<unk>",
                 pad_token="<pad>",
                 mask_token="<mask>",
                 **kwargs):
        # Mask token behave like a normal word, i.e. include the space before it
        mask_token = AddedToken(mask_token,
                                lstrip=True, rstrip=False) if isinstance(
                                    mask_token, str) else mask_token

        super().__init__(
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # Original fairseq vocab and spm vocab must be "aligned":
        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'

        # Mimic fairseq token-to-id alignment for the first 4 token
        self.fairseq_tokens_to_ids = {
            "<s>": 0,
            "<pad>": 1,
            "</s>": 2,
            "<unk>": 3
        }

        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 1

        self.sp_model_size = len(self.sp_model)
        self.lang_code_to_id = {
            code: self.sp_model_size + i + self.fairseq_offset
            for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
        }
        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(
            self.lang_code_to_id) + self.fairseq_offset

        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_ids_to_tokens = {
            v: k
            for k, v in self.fairseq_tokens_to_ids.items()
        }
        self._additional_special_tokens = list(self.lang_code_to_id.keys())

        self._src_lang = src_lang if src_lang is not None else "en_XX"
        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)
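
# A hedged sketch of the lang-code bookkeeping above with a toy code list;
# FAIRSEQ_LANGUAGE_CODES in the real tokenizer is much longer, and the spm
# vocab size below is illustrative.
FAIRSEQ_LANGUAGE_CODES = ["en_XX", "de_DE", "fr_XX"]
sp_model_size = 250001
fairseq_offset = 1

lang_code_to_id = {
    code: sp_model_size + i + fairseq_offset
    for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
}
mask_id = sp_model_size + len(lang_code_to_id) + fairseq_offset

# Language codes sit right after the subword vocab, and <mask> comes last.
assert lang_code_to_id["en_XX"] == sp_model_size + fairseq_offset
assert mask_id == max(lang_code_to_id.values()) + 1
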
def input_fn_builder(data_dir,
                     vocab_model_file,
                     masked_lm_prob,
                     max_encoder_length,
                     max_predictions_per_seq,
                     preprocessed_data,
                     substitute_newline,
                     is_training,
                     tmp_dir=None):
    """Creates an `input_fn` closure to be passed to TPUEstimator."""

    sp_model = spm.SentencePieceProcessor()
    sp_proto = tf.io.gfile.GFile(vocab_model_file, "rb").read()
    sp_model.LoadFromSerializedProto(sp_proto)
    vocab_size = sp_model.GetPieceSize()
    word_start_subtoken = np.array(
        [sp_model.IdToPiece(i)[0] == "▁" for i in range(vocab_size)])
    word_to_token = np.array(  # the part that fetches the vocabulary pieces
        [sp_model.IdToPiece(i) for i in range(vocab_size)])

    feature_shapes = {
        "input_ids": [max_encoder_length],
        "segment_ids": [max_encoder_length],
        "masked_lm_positions": [max_predictions_per_seq],
        "masked_lm_ids": [max_predictions_per_seq],
        "masked_lm_weights": [max_predictions_per_seq],
        "next_sentence_labels": [1]
    }

    def _decode_record(record):
        """Decodes a record to a TensorFlow example."""
        name_to_features = {
            "input_ids":
            tf.io.FixedLenFeature([max_encoder_length], tf.int64),
            "segment_ids":
            tf.io.FixedLenFeature([max_encoder_length], tf.int64),
            "masked_lm_positions":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_ids":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_weights":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32),
            "next_sentence_labels":
            tf.io.FixedLenFeature([1], tf.int64),
        }
        example = tf.io.parse_single_example(record, name_to_features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.cast(t, tf.int32)
            example[name] = t

        return example

    def do_masking(example):
        if "tfds://" == data_dir[:7]:
            text = example["text"]
        else:
            text = example

        print(text)

        tokenizer = tft.SentencepieceTokenizer(
            model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
        if substitute_newline:
            text = tf.strings.regex_replace(text, "\n", substitute_newline)
        subtokens = tokenizer.tokenize(text)
        (subtokens, masked_lm_positions, masked_lm_ids,
         masked_lm_weights) = tf.compat.v1.py_func(
             numpy_masking, [subtokens],
             [tf.int32, tf.int32, tf.int32, tf.float32],
             stateful=False)
        features = {
            "input_ids": subtokens,
            "segment_ids": tf.zeros_like(subtokens),
            "masked_lm_positions": masked_lm_positions,
            "masked_lm_ids": masked_lm_ids,
            "masked_lm_weights": masked_lm_weights,
            "next_sentence_labels": tf.zeros([1], dtype=tf.int64),
        }
        return features

    def numpy_masking(subtokens):
        # Find a random span in text
        end_pos = max_encoder_length - 2 + np.random.randint(
            max(1,
                len(subtokens) - max_encoder_length - 2))
        start_pos = max(0, end_pos - max_encoder_length + 2)
        subtokens = subtokens[start_pos:end_pos]

        # The start might be inside a word so fix it
        # such that span always starts at a word
        word_begin_mark = word_start_subtoken[subtokens]
        word_begins_pos = np.flatnonzero(word_begin_mark).astype(np.int32)
        if word_begins_pos.size == 0:
            # if no word boundary present, we do not do whole word masking
            # and we fall back to random masking.
            word_begins_pos = np.arange(len(subtokens), dtype=np.int32)
            word_begin_mark = np.logical_not(word_begin_mark)
            print(subtokens, start_pos, end_pos, word_begin_mark)
        correct_start_pos = word_begins_pos[0]
        subtokens = subtokens[correct_start_pos:]
        word_begin_mark = word_begin_mark[correct_start_pos:]
        word_begins_pos = word_begins_pos - correct_start_pos
        num_tokens = len(subtokens)

        # We want to do whole word masking so split by word boundary
        words = np.split(np.arange(num_tokens, dtype=np.int32),
                         word_begins_pos)[1:]
        assert len(words) == len(word_begins_pos)

        # Decide elements to mask
        num_to_predict = min(
            max_predictions_per_seq,
            max(1, int(round(len(word_begins_pos) * masked_lm_prob))))
        masked_lm_positions = np.concatenate(
            np.random.choice(np.array([[]] + words, dtype=np.object)[1:],
                             num_to_predict,
                             replace=False), 0)
        # but this might have more subtokens than max_predictions_per_seq
        if len(masked_lm_positions) > max_predictions_per_seq:
            masked_lm_positions = masked_lm_positions[:
                                                      max_predictions_per_seq +
                                                      1]
            # however the last selection can end mid-word, so drop the word that crosses the limit
            truncate_masking_at = np.flatnonzero(
                word_begin_mark[masked_lm_positions])[-1]
            masked_lm_positions = masked_lm_positions[:truncate_masking_at]

        # sort masking positions
        masked_lm_positions = np.sort(masked_lm_positions)
        masked_lm_ids = subtokens[masked_lm_positions]

        # replace input token with [MASK] 80% of the time, a random token 10%, or leave it as is.
        randomness = np.random.rand(len(masked_lm_positions))
        mask_index = masked_lm_positions[randomness < 0.8]
        random_index = masked_lm_positions[randomness > 0.9]

        subtokens[mask_index] = 67  # id of masked token
        subtokens[random_index] = np.random.randint(  # ignore special tokens
            101,
            vocab_size,
            len(random_index),
            dtype=np.int32)

        # add [CLS] (65) and [SEP] (66) tokens
        subtokens = np.concatenate([
            np.array([65], dtype=np.int32), subtokens,
            np.array([66], dtype=np.int32)
        ])

        # pad everything to correct shape
        pad_inp = max_encoder_length - num_tokens - 2
        subtokens = np.pad(subtokens, [0, pad_inp], "constant")

        pad_out = max_predictions_per_seq - len(masked_lm_positions)
        masked_lm_weights = np.pad(
            np.ones_like(masked_lm_positions, dtype=np.float32), [0, pad_out],
            "constant")
        masked_lm_positions = np.pad(masked_lm_positions + 1, [0, pad_out],
                                     "constant")
        masked_lm_ids = np.pad(masked_lm_ids, [0, pad_out], "constant")

        return subtokens, masked_lm_positions, masked_lm_ids, masked_lm_weights

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        # Load dataset and handle tfds separately
        split = "train" if is_training else "test"
        if "tfds://" == data_dir[:7]:
            d = tfds.load(data_dir[7:],
                          split=split,
                          shuffle_files=is_training,
                          data_dir=tmp_dir)
        else:
            input_files = tf.io.gfile.glob(
                os.path.join(data_dir, "*{}.tfrecord*".format(split)))

            # For training, we want a lot of parallel reading and shuffling.
            # For eval, we want no shuffling and parallel reading doesn't matter.
            if is_training:
                d = tf.data.Dataset.from_tensor_slices(
                    tf.constant(input_files))
                d = d.shuffle(buffer_size=len(input_files))

                # Non deterministic mode means that the interleaving is not exact.
                # This adds even more randomness to the training pipeline.
                d = d.interleave(
                    tf.data.TFRecordDataset,
                    deterministic=False,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
            else:
                d = tf.data.TFRecordDataset(input_files)

        if preprocessed_data:
            d = d.map(_decode_record,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
        else:
            d = d.map(do_masking,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if is_training:
            d = d.shuffle(
                buffer_size=10000, reshuffle_each_iteration=True
            )  # reshuffle_each_iteration: when shuffling, the buffer slides one element at a time and each iteration uses a different order
            d = d.repeat()

        d = d.padded_batch(batch_size, feature_shapes,
                           drop_remainder=True)  # For static shape
        return d

    return input_fn
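
# A standalone sketch of the 80/10/10 noising rule from numpy_masking above;
# the ids 67 ([MASK]) and 101 (first non-special id) mirror the constants used
# there, and the function name is illustrative.
import numpy as np


def apply_mlm_noise(subtokens, masked_positions, vocab_size, mask_id=67, first_regular_id=101):
    noised = subtokens.copy()
    randomness = np.random.rand(len(masked_positions))
    mask_index = masked_positions[randomness < 0.8]    # ~80%: replace with [MASK]
    random_index = masked_positions[randomness > 0.9]  # ~10%: replace with a random token
    # the remaining ~10% of positions are left unchanged
    noised[mask_index] = mask_id
    noised[random_index] = np.random.randint(
        first_regular_id, vocab_size, len(random_index), dtype=np.int32)
    return noised


tokens = np.arange(200, 220, dtype=np.int32)
noised = apply_mlm_noise(tokens, masked_positions=np.array([2, 5, 9]), vocab_size=32000)
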
    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--word_embedding_path", default=None, type=str,
                        help="Path of the output word embedding.")

    args = parser.parse_args()

    if args.spm_model_path:
        try:
            import sentencepiece as spm
        except ImportError:
            raise ImportError("You need to install SentencePiece to use XLNetTokenizer: "
                              "https://github.com/google/sentencepiece "
                              "pip install sentencepiece")
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(args.spm_model_path)
        vocab = Vocab()
        vocab.i2w = {i: sp_model.IdToPiece(i) for i in range(sp_model.GetPieceSize())}
    else:
        vocab = Vocab()
        vocab.load(args.vocab_path)

    pretrained_model = torch.load(args.load_model_path)
    embedding = pretrained_model["embedding.word_embedding.weight"]

    with open(args.word_embedding_path, mode="w", encoding="utf-8") as f:
        head = str(list(embedding.size())[0]) + " " + str(list(embedding.size())[1]) + "\n"
        f.write(head)

        for i in range(len(vocab.i2w)):
Exemple #26
0
#@title Create a tokenizer and its model

#@markdown NOTE: Fewer tokenizer words seem to work better

# %cd /content/

full_path_to_INT_dataset = "/content/Music-Reformer_INT_Dataset.txt" #@param {type:"string"}
tokenizer_vocabulary_size_in_words = 321 #@param {type:"integer"}

# Train a BPE model on the dataset
spm.SentencePieceTrainer.train(input=full_path_to_INT_dataset,
                              model_prefix='Music-Reformer-Tokenizer',
                              vocab_size=tokenizer_vocabulary_size_in_words,
                              model_type='bpe')
# Load BPE vocabulary
TOKENIZER = spm.SentencePieceProcessor() 
TOKENIZER.load('Music-Reformer-Tokenizer.model')

# Load the dataset
with open(full_path_to_INT_dataset, 'r') as f:
    text = f.read(512 * 3072)


IDS = TOKENIZER.EncodeAsIds(text)
IDS = np.asarray(IDS, dtype=np.int32)
PAD_AMOUNT = 512 * 1024 - len(IDS)
print("Number of tokens:", IDS.shape[0])

#@title Split the dataset
train_validation_split_ratio = 0.9 #@param {type:"slider", min:0.05, max:0.95, step:0.05}
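
# The cell stops here; a hedged sketch of how the split ratio above would
# typically be applied, padding to the fixed Reformer input length first
# (PAD_AMOUNT was computed above).
padded_ids = np.pad(IDS, (0, PAD_AMOUNT), mode="constant")
split_point = int(len(padded_ids) * train_validation_split_ratio)
train_ids = padded_ids[:split_point]
validation_ids = padded_ids[split_point:]
print("train tokens:", train_ids.shape[0], "validation tokens:", validation_ids.shape[0])
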
# -*- coding: utf-8 -*-
import sentencepiece as spm
import os
import codecs
import pythaipiece
templates_dir = os.path.dirname(pythaipiece.__file__)
template_file = os.path.join(templates_dir, 'thai3.model')
sp = spm.SentencePieceProcessor()
sp.Load(template_file)
def segment(text):
	# Encode the text into SentencePiece pieces, dropping pieces that are only the boundary marker.
	listdata = [i for i in sp.EncodeAsPieces(text) if i != '▁']
	listword = []
	for i in listdata:
		if '▁' in i:
			# A leading '▁' marks a word boundary: emit a space, then the piece without the marker.
			listword.append(' ')
			listword.append(i.replace('▁', ''))
		else:
			listword.append(i)
	return listword
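
# A brief usage sketch for segment(); the Thai input string is only an example.
# Each returned element is either a subword piece from the model or a ' '
# inserted where the model saw a word start.
words = segment("ทดสอบ การตัดคำ")
print("|".join(words))
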
    @classmethod
    def load_from_file(cls, filepath):
        import sentencepiece as splib
        spm = splib.SentencePieceProcessor()
        spm.load(filepath)
        spm.set_encode_extra_options(":eos")
        return cls(spm)
    def __init__(self):
        self.sp = spm.SentencePieceProcessor()
        #self.sp.load('mtier1_20w.model')
        #self.sp.load('mtier1_10w.model')
        self.sp.load(r'E:\embedding\title\top24_10w.model')
    def __setstate__(self, d):
        self.__dict__ = d
        self.spm = sp.SentencePieceProcessor()
        self.spm.Load(self.vocab_file)
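
# The __setstate__ above exists because the raw SentencePieceProcessor is not
# picklable; a hedged sketch of the usual __getstate__ counterpart, which drops
# the processor before pickling so __setstate__ can reload it from vocab_file.
def __getstate__(self):
    state = self.__dict__.copy()
    state["spm"] = None
    return state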