def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
    spm.Load(str(path))
    return spm
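# A minimal usage sketch of load_spm (not part of the original file): the
# sp_model_kwargs dict forwards constructor options straight to the processor.
# "spiece.model" is a hypothetical path; enable_sampling/nbest_size/alpha are
# standard SentencePieceProcessor constructor options.
import sentencepiece

sp = load_spm("spiece.model",
              {"enable_sampling": True, "nbest_size": -1, "alpha": 0.1})
print(sp.encode("Hello world", out_type=str))  # sampled pieces, e.g. ['▁He', 'llo', ...]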
def _create_data(idx, input_paths):
    # Load sentence-piece model
    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.sp_path)

    input_shards = []
    total_line_cnt = 0
    for input_path in input_paths:
        input_data, sent_ids = [], []
        sent_id, line_cnt = True, 0
        tf.logging.info("Processing %s", input_path)
        for line in tf.gfile.Open(input_path):
            if line_cnt % 100000 == 0:
                tf.logging.info("Loading line %d", line_cnt)
            line_cnt += 1

            if not line.strip():
                if FLAGS.use_eod:
                    sent_id = not sent_id
                    cur_sent = [EOD_ID]
                else:
                    continue
            else:
                if FLAGS.from_raw_text:
                    cur_sent = preprocess_text(line.strip(), lower=FLAGS.uncased)
                    cur_sent = encode_ids(sp, cur_sent)
                else:
                    cur_sent = list(map(int, line.strip().split()))

            input_data.extend(cur_sent)
            sent_ids.extend([sent_id] * len(cur_sent))
            sent_id = not sent_id

        tf.logging.info("Finish with line %d", line_cnt)
        if line_cnt == 0:
            continue

        input_data = np.array(input_data, dtype=np.int64)
        sent_ids = np.array(sent_ids, dtype=bool)

        total_line_cnt += line_cnt
        input_shards.append((input_data, sent_ids))

    tf.logging.info("[Task %d] Total number of lines: %d", idx, total_line_cnt)

    tfrecord_dir = os.path.join(FLAGS.save_dir, "tfrecords")

    filenames, num_batch = [], 0

    # Randomly shuffle input shards (with a fixed but distinct random seed)
    np.random.seed(100 * FLAGS.task + FLAGS.pass_id)

    perm_indices = np.random.permutation(len(input_shards))
    tf.logging.info("Using perm indices %s for pass %d",
                    perm_indices.tolist(), FLAGS.pass_id)

    input_data_list, sent_ids_list = [], []
    prev_sent_id = None
    for perm_idx in perm_indices:
        input_data, sent_ids = input_shards[perm_idx]

        # make sure `sent_ids[0] == not prev_sent_id`
        if prev_sent_id is not None and sent_ids[0] == prev_sent_id:
            sent_ids = np.logical_not(sent_ids)

        # append to temporary list
        input_data_list.append(input_data)
        sent_ids_list.append(sent_ids)

        # update `prev_sent_id`
        prev_sent_id = sent_ids[-1]

    input_data = np.concatenate(input_data_list)
    sent_ids = np.concatenate(sent_ids_list)

    file_name, cur_num_batch = create_tfrecords(
        save_dir=tfrecord_dir,
        basename="{}-{}-{}".format(FLAGS.split, idx, FLAGS.pass_id),
        data=[input_data, sent_ids],
        bsz_per_host=FLAGS.bsz_per_host,
        seq_len=FLAGS.seq_len,
        bi_data=FLAGS.bi_data,
        sp=sp,
    )

    filenames.append(file_name)
    num_batch += cur_num_batch

    record_info = {"filenames": filenames, "num_batch": num_batch}

    return record_info
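# A self-contained sketch of the sent_ids stitching rule used above (toy data,
# assumed): sentence boundaries are wherever the boolean sequence flips, so
# when shards are concatenated the first id of a shard must differ from the
# last id of the previous one; otherwise the whole shard is flipped.
import numpy as np

shards = [
    (np.array([11, 12, 13]), np.array([True, True, False])),
    (np.array([14, 15]), np.array([False, True])),
]

data_list, ids_list, prev_sent_id = [], [], None
for data, ids in shards:
    if prev_sent_id is not None and ids[0] == prev_sent_id:
        ids = np.logical_not(ids)  # flip so the boundary still marks a new sentence
    data_list.append(data)
    ids_list.append(ids)
    prev_sent_id = ids[-1]

print(np.concatenate(ids_list))  # [ True  True False  True False]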
def main(unused_argv):
    del unused_argv
    if FLAGS.strategy_type == "mirror":
        strategy = tf.distribute.MirroredStrategy()
    elif FLAGS.strategy_type == "tpu":
        cluster_resolver = tpu_lib.tpu_initialize(FLAGS.tpu)
        strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
    else:
        raise ValueError("The distribution strategy type is not supported: %s" %
                         FLAGS.strategy_type)
    if strategy:
        logging.info("***** Number of cores used : %d",
                     strategy.num_replicas_in_sync)

    train_input_fn = functools.partial(data_utils.get_squad_input_data,
                                       FLAGS.train_batch_size, FLAGS.seq_len,
                                       FLAGS.query_len, strategy, True,
                                       FLAGS.train_tfrecord_path)

    test_input_fn = functools.partial(data_utils.get_squad_input_data,
                                      FLAGS.test_batch_size, FLAGS.seq_len,
                                      FLAGS.query_len, strategy, False,
                                      FLAGS.test_tfrecord_path)

    total_training_steps = FLAGS.train_steps
    steps_per_loop = FLAGS.iterations
    eval_steps = int(FLAGS.test_data_size / FLAGS.test_batch_size)

    optimizer, learning_rate_fn = optimization.create_optimizer(
        FLAGS.learning_rate,
        total_training_steps,
        FLAGS.warmup_steps,
        adam_epsilon=FLAGS.adam_epsilon)
    model_config = xlnet_config.XLNetConfig(FLAGS)
    run_config = xlnet_config.create_run_config(True, False, FLAGS)

    input_meta_data = {}
    input_meta_data["start_n_top"] = FLAGS.start_n_top
    input_meta_data["end_n_top"] = FLAGS.end_n_top
    input_meta_data["lr_layer_decay_rate"] = FLAGS.lr_layer_decay_rate
    input_meta_data["predict_dir"] = FLAGS.predict_dir
    input_meta_data["n_best_size"] = FLAGS.n_best_size
    input_meta_data["max_answer_length"] = FLAGS.max_answer_length
    input_meta_data["test_batch_size"] = FLAGS.test_batch_size
    input_meta_data["batch_size_per_core"] = int(FLAGS.train_batch_size /
                                                 strategy.num_replicas_in_sync)
    input_meta_data["mem_len"] = FLAGS.mem_len

    model_fn = functools.partial(get_qaxlnet_model, model_config, run_config,
                                 FLAGS.start_n_top, FLAGS.end_n_top)

    eval_examples = squad_utils.read_squad_examples(
        FLAGS.predict_file, is_training=False)

    if FLAGS.test_feature_path:
        logging.info("start reading pickle file...")
        with tf.io.gfile.GFile(FLAGS.test_feature_path, "rb") as f:
            eval_features = pickle.load(f)
        logging.info("finishing reading pickle file...")
    else:
        sp_model = spm.SentencePieceProcessor()
        sp_model.LoadFromSerializedProto(
            tf.io.gfile.GFile(FLAGS.spiece_model_file, "rb").read())
        spm_basename = os.path.basename(FLAGS.spiece_model_file)
        eval_features = squad_utils.create_eval_data(
            spm_basename, sp_model, eval_examples, FLAGS.max_seq_length,
            FLAGS.max_query_length, FLAGS.doc_stride, FLAGS.uncased)

    with tf.io.gfile.GFile(FLAGS.predict_file) as f:
        original_data = json.load(f)["data"]
    eval_fn = functools.partial(run_evaluation, strategy, test_input_fn,
                                eval_examples, eval_features, original_data,
                                eval_steps, input_meta_data)

    training_utils.train(
        strategy=strategy,
        model_fn=model_fn,
        input_meta_data=input_meta_data,
        eval_fn=eval_fn,
        metric_fn=None,
        train_input_fn=train_input_fn,
        init_checkpoint=FLAGS.init_checkpoint,
        init_from_transformerxl=FLAGS.init_from_transformerxl,
        total_training_steps=total_training_steps,
        steps_per_loop=steps_per_loop,
        optimizer=optimizer,
        learning_rate_fn=learning_rate_fn,
        model_dir=FLAGS.model_dir,
        save_steps=FLAGS.save_steps)
def train_and_evaluate(config, workdir, vocab_filepath):
    """Runs a training and evaluation loop.

    Args:
        config: Model and training configuration.
        workdir: Working directory for checkpoints and Tensorboard summaries. If
            this contains a checkpoint, training will be resumed from the latest
            checkpoint.
        vocab_filepath: Absolute path to SentencePiece vocab model.

    Raises:
        ValueError: If training or eval batch sizes won't fit number of processes
            and devices, or config is underspecified.
    """
    n_processes = jax.process_count()  # Number of processes
    n_devices = jax.local_device_count()  # Number of local devices per process

    if config.train_batch_size % (n_processes * n_devices) > 0:
        raise ValueError(
            "Training batch size must be divisible by the total number of devices, "
            "but training batch size = %d, while total number of devices = %d "
            "(%d processes, each with %d devices)" %
            (config.train_batch_size, n_processes * n_devices, n_processes,
             n_devices))

    if config.eval_batch_size % (n_processes * n_devices) > 0:
        raise ValueError(
            "Eval batch size must be divisible by the total number of devices, "
            "but eval batch size = %d, while total number of devices = %d "
            "(%d processes, each with %d devices)" %
            (config.eval_batch_size, n_processes * n_devices, n_processes,
             n_devices))

    per_process_train_batch_size = config.train_batch_size // n_processes
    per_process_eval_batch_size = config.eval_batch_size // n_processes

    if jax.process_index() == 0:
        train_summary_writer = tensorboard.SummaryWriter(
            os.path.join(workdir, "train"))
        eval_summary_writer = tensorboard.SummaryWriter(
            os.path.join(workdir, "eval"))
    else:
        train_summary_writer = None
        eval_summary_writer = None

    rng = random.PRNGKey(config.seed)
    rng, init_rng = random.split(rng)

    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(vocab_filepath)
    tokenizer.SetEncodeExtraOptions("")
    # Note: [CLS] and [SEP] will be added by the data pipeline, not the
    # tokenizer.
    with config.unlocked():
        config.vocab_size = tokenizer.GetPieceSize()
    frozen_config = ml_collections.FrozenConfigDict(config)
    model = models.PreTrainingModel(config=frozen_config,
                                    random_seed=config.seed)
    params = _init_params(model, init_rng, frozen_config)

    optimizer = _create_adam_optimizer(config.learning_rate, params)
    # We access model state only from optimizer via optimizer.target.
    del params

    # In case current job restarts, ensure that we continue from where we left
    # off.
    optimizer = checkpoints.restore_checkpoint(workdir, optimizer)
    start_step = int(optimizer.state.step)

    # Otherwise, try to restore optimizer and model state from config
    # checkpoint.
    if (start_step == 0 and "init_checkpoint_dir" in config and
            config.init_checkpoint_dir):
        optimizer = checkpoints.restore_checkpoint(config.init_checkpoint_dir,
                                                   optimizer)

    optimizer = jax_utils.replicate(optimizer)

    learning_rate_fn = train_utils.create_learning_rate_scheduler(
        factors="constant * linear_warmup * linear_decay",
        base_learning_rate=config.learning_rate,
        warmup_steps=config.num_warmup_steps,
        decay_steps=config.num_train_steps - config.num_warmup_steps,
    )

    c4_masked_lm_inputs = functools.partial(
        input_pipeline.c4_masked_lm_inputs,
        tokenizer=tokenizer,
        max_seq_length=config.max_seq_length,
        max_predictions_per_seq=config.max_predictions_per_seq,
        masking_rate=config.masking_rate,
        mask_token_proportion=config.mask_token_proportion,
        random_token_proportion=config.random_token_proportion)
    train_ds = c4_masked_lm_inputs(batch_size=per_process_train_batch_size)
    train_iter = iter(train_ds)
    eval_ds = c4_masked_lm_inputs(batch_size=per_process_eval_batch_size)

    # We init the first set of dropout PRNG keys, but update it afterwards
    # inside the main pmap'd training update for performance.
    rngs = random.split(rng, n_devices)

    loss_and_metrics_fn = functools.partial(_compute_loss_and_metrics,
                                            model=model,
                                            pad_id=tokenizer.pad_id())
    p_train_step = jax.pmap(
        functools.partial(
            train_utils.train_step,
            loss_and_metrics_fn=loss_and_metrics_fn,
            learning_rate_fn=learning_rate_fn,
            clipped_grad_norm=config.clipped_grad_norm),
        axis_name="batch")
    metric_fn = functools.partial(_compute_eval_stats,
                                  model=model,
                                  pad_id=tokenizer.pad_id())
    p_eval_step = jax.pmap(
        functools.partial(train_utils.eval_step, metric_fn=metric_fn),
        axis_name="batch")

    train_metrics = []

    logging.info("Starting training loop.")
    logging.info("====================")

    for step in range(start_step, config.num_train_steps):
        with jax.profiler.StepTraceAnnotation("train", step_num=step):
            train_batch = next(train_iter)
            train_batch = common_utils.shard(train_batch)

            optimizer, train_step_metrics, rngs = p_train_step(
                optimizer, train_batch, rng=rngs)
            train_metrics.append(train_step_metrics)

        if (step > 0 and config.save_checkpoints_steps and
                step % config.save_checkpoints_steps == 0 and
                jax.process_index() == 0):
            # Save un-replicated optimizer + model state.
            checkpoints.save_checkpoint(workdir,
                                        jax_utils.unreplicate(optimizer),
                                        step,
                                        keep=2)

        # Periodic metric handling.
        if step % config.eval_frequency != 0 and step > 0:
            continue

        logging.info("Gathering training metrics at step: %d", step)
        train_metrics = common_utils.get_metrics(train_metrics)
        train_summary = _compute_loss_and_accuracy_metrics(train_metrics)
        # Add training specific metrics.
        train_summary["unclipped_grad_l2_norm"] = jnp.sqrt(
            jnp.sum(train_metrics["unclipped_grad_l2_sum"]))
        train_summary["clipped_grad_l2_norm"] = jnp.sqrt(
            jnp.sum(train_metrics["clipped_grad_l2_sum"]))
        train_summary["learning_rate"] = learning_rate_fn(step)

        if jax.process_index() == 0:
            assert train_summary_writer
            for key, val in train_summary.items():
                train_summary_writer.scalar(key, val, step)
            train_summary_writer.flush()
        # Reset metric accumulation for next training evaluation cycle.
        train_metrics = []

        logging.info("Gathering evaluation metrics at step: %d", step)

        all_stats = []
        for _, eval_batch in zip(range(config.max_num_eval_steps), eval_ds):
            eval_batch = common_utils.shard(eval_batch)
            all_stats.append(p_eval_step(optimizer.target, eval_batch))
        flat_stats = {}
        for k in all_stats[0]:
            flat_stats[k] = np.concatenate([stats[k] for stats in all_stats],
                                           axis=0)
        eval_summary = _compute_loss_and_accuracy_metrics(flat_stats)

        if jax.process_index() == 0:
            assert eval_summary_writer
            for key, val in eval_summary.items():
                eval_summary_writer.scalar(key, val, step)
            eval_summary_writer.flush()
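# common_utils.shard above reshapes a per-process batch so that jax.pmap sees
# one leading slice per local device. A minimal sketch of that behavior
# (assumed, for array leaves whose leading dimension is the per-process batch):
import jax
import numpy as np

def shard_like_common_utils(batch):
    n = jax.local_device_count()
    return jax.tree_util.tree_map(
        lambda x: x.reshape((n, -1) + x.shape[1:]), batch)

batch = {"input_ids": np.zeros((8, 128), dtype=np.int32)}
sharded = shard_like_common_utils(batch)
print(sharded["input_ids"].shape)  # e.g. (1, 8, 128) on a single-device host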
def __setstate__(self, d: Dict) -> None:
    self.__dict__ = d
    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(self.vocab_file)
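# __setstate__ above exists because the SWIG-backed SentencePieceProcessor is
# not picklable, so it must be rebuilt from vocab_file on unpickling. A
# minimal sketch of the matching __getstate__ (assumed, following the usual
# pattern for such tokenizers):
def __getstate__(self) -> Dict:
    state = self.__dict__.copy()
    state["sp_model"] = None  # drop the unpicklable SWIG object
    return state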
train_data = pd.read_csv(train_txt, header=0, delimiter=',')
print(f'Total raw training rows: {len(train_data)}')
train_data = train_data.dropna()
print(f'Total valid training rows: {len(train_data)}')
train_data = train_data.sample(1000)  # use only 1,000 rows for a quick check
print(f'Total sampled training rows: {len(train_data)}')
label_counts = train_data['label'].value_counts()
print(f'Training label counts: {label_counts}')

#
# vocabulary
#

# load the vocab
vocab_file = os.path.join(data_dir, 'ko_32000.model')
vocab = spm.SentencePieceProcessor()
vocab.load(vocab_file)

#
# tokenize
#

questions, answers = [], []
for i, row in train_data.iterrows():
    question = vocab.encode_as_pieces(row['Q'])
    questions.append(question)
    answer = vocab.encode_as_pieces(row['A'])
    answers.append(answer)

assert len(questions) == len(answers)
print(questions[:100])
def __init__(
        self,
        vocab_file,
        pad_token="<pad>",
        eos_token="</s>",
        unk_token="<unk>",
        mask_token="<mask_2>",
        mask_token_sent="<mask_1>",
        additional_special_tokens=None,
        offset=103,  # entries 2 - 104 are only used for pretraining
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs) -> None:
    self.offset = offset
    if additional_special_tokens is not None:
        if not isinstance(additional_special_tokens, list):
            raise TypeError(
                f"additional_special_tokens should be of type {type(list)}, but is"
                f" {type(additional_special_tokens)}")

        additional_special_tokens_extended = (
            ([mask_token_sent] + additional_special_tokens)
            if mask_token_sent not in additional_special_tokens
            and mask_token_sent is not None else additional_special_tokens)
        # fill additional tokens with ..., <unk_token_102> in case not all
        # additional tokens are already taken
        additional_special_tokens_extended += [
            f"<unk_{i}>" for i in range(
                len(additional_special_tokens_extended), self.offset - 1)
        ]

        if len(set(additional_special_tokens_extended)) != len(
                additional_special_tokens_extended):
            raise ValueError(
                "Please make sure that the provided additional_special_tokens do not contain an incorrectly"
                f" shifted list of <unk_x> tokens. Found {additional_special_tokens_extended}."
            )
        additional_special_tokens = additional_special_tokens_extended
    else:
        additional_special_tokens = [
            mask_token_sent
        ] if mask_token_sent is not None else []
        additional_special_tokens += [
            f"<unk_{i}>" for i in range(2, self.offset)
        ]

    self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
    super().__init__(
        eos_token=eos_token,
        unk_token=unk_token,
        mask_token=mask_token,
        pad_token=pad_token,
        mask_token_sent=mask_token_sent,
        offset=offset,
        additional_special_tokens=additional_special_tokens,
        sp_model_kwargs=self.sp_model_kwargs,
        **kwargs,
    )
    self.mask_token_sent = mask_token_sent
    self.vocab_file = vocab_file

    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(vocab_file)

    # add special tokens to encoder dict
    self.encoder: Dict[int, str] = {
        0: self.pad_token,
        1: self.eos_token,
    }

    if self.mask_token_sent is not None:
        self.encoder.update({
            2: self.mask_token_sent,
            3: self.mask_token,
        })

    if self.offset > 0:
        # entries 2-104 are only used for pretraining and called
        # <mask_1>, <mask_2>, unk_2, ..., unk_102;
        # mask_token_sent is already added to the list -> so start at 1
        self.encoder.update({
            i + 3: additional_special_tokens[i]
            for i in range(1, self.offset - 1)
        })

    self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}
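# The offset bookkeeping above implies one conversion rule: special tokens are
# looked up in the hand-built encoder/decoder dicts, and every ordinary piece
# id is shifted by `offset`. A minimal sketch of the conversion helpers
# (assumed, following the usual pattern for this tokenizer family):
def _convert_token_to_id(self, token: str) -> int:
    if token in self.decoder:  # decoder maps special token -> id
        return self.decoder[token]
    sp_id = self.sp_model.piece_to_id(token)
    return sp_id + self.offset

def _convert_id_to_token(self, index: int) -> str:
    if index in self.encoder:  # encoder maps id -> special token
        return self.encoder[index]
    return self.sp_model.IdToPiece(index - self.offset)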
def main(_):
    logging.set_verbosity(logging.INFO)

    processors = {
        "mnli_matched": MnliMatchedProcessor,
        "mnli_mismatched": MnliMismatchedProcessor,
        "sts-b": StsbProcessor,
        "imdb": ImdbProcessor,
        "yelp5": Yelp5Processor
    }

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels() if not FLAGS.is_regression else None

    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)

    def tokenize_fn(text):
        text = preprocess_utils.preprocess_text(text, lower=FLAGS.uncased)
        return preprocess_utils.encode_ids(sp, text)

    spm_basename = os.path.basename(FLAGS.spiece_model_file)

    train_file_base = "{}.len-{}.train.tf_record".format(
        spm_basename, FLAGS.max_seq_length)
    train_file = os.path.join(FLAGS.output_dir, train_file_base)
    logging.info("Use tfrecord file %s", train_file)

    train_examples = processor.get_train_examples(FLAGS.data_dir)
    np.random.shuffle(train_examples)
    logging.info("Num of train samples: %d", len(train_examples))

    file_based_convert_examples_to_features(train_examples, label_list,
                                            FLAGS.max_seq_length, tokenize_fn,
                                            train_file, FLAGS.num_passes)

    if FLAGS.eval_split == "dev":
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    else:
        eval_examples = processor.get_test_examples(FLAGS.data_dir)

    logging.info("Num of eval samples: %d", len(eval_examples))

    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on. These do NOT count towards the metric (all tf.metrics
    # support a per-instance weight, and these get a weight of 0.0).
    #
    # Modified in XL: We also adopt the same mechanism for GPUs.
    while len(eval_examples) % FLAGS.eval_batch_size != 0:
        eval_examples.append(classifier_utils.PaddingInputExample())

    eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
        spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
    eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

    file_based_convert_examples_to_features(eval_examples, label_list,
                                            FLAGS.max_seq_length, tokenize_fn,
                                            eval_file)
def load_spm(path: str) -> sentencepiece.SentencePieceProcessor:
    spm = sentencepiece.SentencePieceProcessor()
    spm.Load(path)
    return spm
def prepro(hp):
    """Load raw data -> Preprocessing -> Segmenting with sentencepiece
    hp: hyperparams. argparse.
    """
    logging.info("# Check if raw files exist")
    # train1 = "iwslt2016/de-en/train.tags.de-en.de"
    # train2 = "iwslt2016/de-en/train.tags.de-en.en"
    # eval1 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.de.xml"
    # eval2 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.en.xml"
    # test1 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.de.xml"
    # test2 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.en.xml"
    train1 = "iwslt2016/ch-ch/train.ch"
    train2 = "iwslt2016/ch-ch/train.ch"
    eval1 = "iwslt2016/ch-ch/tst2018.ch"
    eval2 = "iwslt2016/ch-ch/tst2018.ch"
    test1 = "iwslt2016/ch-ch/tst2019.ch"
    test2 = "iwslt2016/ch-ch/tst2019.ch"
    for f in (train1, train2, eval1, eval2, test1, test2):
        if not os.path.isfile(f):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), f)

    logging.info("# Preprocessing")
    # train
    _prepro = lambda x: [line.strip()
                         for line in open(x, 'r').read().split("\n")
                         if not line.startswith("<")]
    prepro_train1, prepro_train2 = _prepro(train1), _prepro(train2)
    assert len(prepro_train1) == len(prepro_train2), \
        "Check if train source and target files match."

    # eval
    # _prepro = lambda x: [re.sub("<[^>]+>", "", line).strip()
    #                      for line in open(x, 'r').read().split("\n")
    #                      if line.startswith("<seg id")]
    prepro_eval1, prepro_eval2 = _prepro(eval1), _prepro(eval2)
    assert len(prepro_eval1) == len(prepro_eval2), \
        "Check if eval source and target files match."

    # test
    prepro_test1, prepro_test2 = _prepro(test1), _prepro(test2)
    assert len(prepro_test1) == len(prepro_test2), \
        "Check if test source and target files match."

    logging.info("Let's see how the preprocessed data look")
    logging.info("prepro_train1: %s", prepro_train1[0])
    logging.info("prepro_train2: %s", prepro_train2[0])
    logging.info("prepro_eval1: %s", prepro_eval1[0])
    logging.info("prepro_eval2: %s", prepro_eval2[0])
    logging.info("prepro_test1: %s", prepro_test1[0])
    logging.info("prepro_test2: %s", prepro_test2[0])

    logging.info("# write preprocessed files to disk")
    os.makedirs("iwslt2016/prepro", exist_ok=True)

    def _write(sents, fname):
        with open(fname, 'w') as fout:
            fout.write("\n".join(sents))

    _write(prepro_train1, "iwslt2016/prepro/train.de")
    _write(prepro_train2, "iwslt2016/prepro/train.en")
    _write(prepro_train1 + prepro_train2, "iwslt2016/prepro/train")
    _write(prepro_eval1, "iwslt2016/prepro/eval.de")
    _write(prepro_eval2, "iwslt2016/prepro/eval.en")
    _write(prepro_test1, "iwslt2016/prepro/test.de")
    _write(prepro_test2, "iwslt2016/prepro/test.en")

    logging.info("# Train a joint BPE model with sentencepiece")
    os.makedirs("iwslt2016/segmented", exist_ok=True)
    train = ('--input=iwslt2016/prepro/train --pad_id=0 --unk_id=1 '
             '--bos_id=2 --eos_id=3 '
             '--model_prefix=iwslt2016/segmented/bpe --vocab_size={} '
             '--model_type=bpe').format(hp.vocab_size)
    spm.SentencePieceTrainer.Train(train)

    logging.info("# Load the trained bpe model")
    sp = spm.SentencePieceProcessor()
    sp.Load("iwslt2016/segmented/bpe.model")

    logging.info("# Segment")

    def _segment_and_write(sents, fname):
        with open(fname, "w") as fout:
            for sent in sents:
                pieces = sp.EncodeAsPieces(sent)
                fout.write(" ".join(pieces) + "\n")

    _segment_and_write(prepro_train1, "iwslt2016/segmented/train.de.bpe")
    _segment_and_write(prepro_train2, "iwslt2016/segmented/train.en.bpe")
    _segment_and_write(prepro_eval1, "iwslt2016/segmented/eval.de.bpe")
    _segment_and_write(prepro_eval2, "iwslt2016/segmented/eval.en.bpe")
    _segment_and_write(prepro_test1, "iwslt2016/segmented/test.de.bpe")

    logging.info("Let's see how the segmented data look")
    print("train1:", open("iwslt2016/segmented/train.de.bpe", 'r').readline())
    print("train2:", open("iwslt2016/segmented/train.en.bpe", 'r').readline())
    print("eval1:", open("iwslt2016/segmented/eval.de.bpe", 'r').readline())
    print("eval2:", open("iwslt2016/segmented/eval.en.bpe", 'r').readline())
    print("test1:", open("iwslt2016/segmented/test.de.bpe", 'r').readline())
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    #### Validate flags
    if FLAGS.save_steps is not None:
        FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)

    processors = {
        "detect": DetectProcessor,
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval`, `do_predict` or "
            "`do_submit` must be True.")

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels() if not FLAGS.is_regression else None

    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)

    def tokenize_fn(text):
        text = preprocess_text(text, lower=FLAGS.uncased)
        return encode_ids(sp, text)

    run_config = model_utils.configure_tpu(FLAGS)

    model_fn = get_model_fn(len(label_list) if label_list is not None else None)

    spm_basename = os.path.basename(FLAGS.spiece_model_file)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if FLAGS.use_tpu:
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.predict_batch_size,
            eval_batch_size=FLAGS.eval_batch_size)
    else:
        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            config=run_config)

    if FLAGS.do_train:
        train_file_base = "{}.len-{}.train.tf_record".format(
            spm_basename, FLAGS.max_seq_length)
        train_file = os.path.join(FLAGS.output_dir, train_file_base)
        tf.logging.info("Use tfrecord file {}".format(train_file))

        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        np.random.shuffle(train_examples)
        tf.logging.info("Num of train samples: {}".format(len(train_examples)))
        tf.logging.info("Num of train steps: {}".format(num_train_steps))

        file_based_convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
            train_file, FLAGS.num_passes)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        estimator.train(input_fn=train_input_fn, steps=num_train_steps)

    # TODO
    if FLAGS.do_train_test:
        train_test_file_base = "{}.len-{}.train_test.tf_record".format(
            spm_basename, FLAGS.max_seq_length)
        train_test_file = os.path.join(FLAGS.output_dir, train_test_file_base)
        tf.logging.info("Use tfrecord file {}".format(train_test_file))

        train_test_examples = processor.get_train_test_examples(FLAGS.data_dir)
        num_train_test_steps = int(
            len(train_test_examples) / FLAGS.train_batch_size * 1)
        np.random.shuffle(train_examples)
        tf.logging.info("Num of test samples: {}".format(len(train_test_examples)))
        tf.logging.info("Num of test steps: {}".format(num_train_test_steps))

        file_based_convert_examples_to_features(
            train_test_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
            train_test_file, FLAGS.num_passes)

        train_test_input_fn = file_based_input_fn_builder(
            input_file=train_test_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        estimator.train(input_fn=train_test_input_fn, steps=num_train_test_steps)

    if FLAGS.do_eval:
        # TPU requires a fixed batch size for all batches, therefore the number
        # of examples must be a multiple of the batch size, or else examples
        # will get dropped. So we pad with fake examples which are ignored
        # later on. These do NOT count towards the metric (all tf.metrics
        # support a per-instance weight, and these get a weight of 0.0).
        #
        # Modified in XL: We also adopt the same mechanism for GPUs.
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        tf.logging.info("Num of eval samples: {}".format(len(eval_examples)))

        while len(eval_examples) % FLAGS.eval_batch_size != 0:
            eval_examples.append(PaddingInputExample())

        eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

        file_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
            eval_file)

        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True)

        # Filter out all checkpoints in the directory
        steps_and_files = []
        filenames = tf.gfile.ListDirectory(FLAGS.model_dir)

        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = join(FLAGS.model_dir, ckpt_name)
                global_step = int(cur_filename.split("-")[-1])
                tf.logging.info("Add {} to eval list.".format(cur_filename))
                steps_and_files.append([global_step, cur_filename])
        steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

        # Decide whether to evaluate all ckpts
        if not FLAGS.eval_all_ckpt:
            steps_and_files = steps_and_files[-1:]

        eval_results = []
        for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
            ret = estimator.evaluate(
                input_fn=eval_input_fn,
                steps=eval_steps,
                checkpoint_path=filename)

            ret["step"] = global_step
            ret["path"] = filename

            eval_results.append(ret)

            tf.logging.info("=" * 80)
            log_str = "Eval result | "
            for key, val in sorted(ret.items(), key=lambda x: x[0]):
                log_str += "{} {} | ".format(key, val)
            tf.logging.info(log_str)

        key_name = "eval_pearsonr" if FLAGS.is_regression else "eval_accuracy"
        eval_results.sort(key=lambda x: x[key_name], reverse=True)

        tf.logging.info("=" * 80)
        log_str = "Best result | "
        for key, val in sorted(eval_results[0].items(), key=lambda x: x[0]):
            log_str += "{} {} | ".format(key, val)
        tf.logging.info(log_str)

    if FLAGS.do_predict:
        predict_dir = FLAGS.predict_dir
        if not tf.gfile.Exists(predict_dir):
            tf.gfile.MakeDirs(predict_dir)

        predict_file_base = "{}.len-{}.{}.predict.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.predict_split)
        predict_file = os.path.join(FLAGS.output_dir, predict_file_base)

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        tf.logging.info("Num of predict samples: {}".format(len(predict_examples)))

        file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
            predict_file)

        pred_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        if FLAGS.predict_batch_size != 1:
            result = estimator.predict(input_fn=pred_input_fn)
        else:
            result = estimator.predict(input_fn=pred_input_fn,
                                       yield_single_examples=False)

        if FLAGS.use_stack:
            logits = [prediction["logits"] for prediction in result]
            save_pickle(FLAGS.stack_dir, logits)

        # TODO
        output_predict_file = FLAGS.test_save
        original_file = os.path.join(FLAGS.data_dir, FLAGS.test_set)
        df = pd.read_csv(original_file)
        lines = [row['id'] for index, row in df.iterrows()]

        with open(output_predict_file, "w") as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(['id', 'label'])
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(zip(lines, result)):
                ID = prediction[0]
                label = prediction[1]["labels"]
                if i >= num_actual_predict_examples:
                    break
                writer.writerow([ID, label])
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
def main(
        init_run_path,
        dataset_path,
        sp_model_path,
        dist_run_path,
        epochs=10,
        lr=2.5e-4,
        batch_size=2,  # per GPU
        g_accum_gradients=None,  # accumulate gradients N times (globally)
        gradient_checkpointing=False,  # saves GPU memory
        n_ctx=1024,
        n_embed=768,
        n_head=12,
        n_layer=12,
        n_hidden=None,  # equal to n_embed by default (better leave at None)
        clean=False,  # clean run folder
        log_every=20,
        save_every=10000,
        validate_every=None,  # same as save_every by default
        only_validate=False,
        max_tokens=None,
        opt_level=None,  # apex.amp opt level (e.g. "O1")
        # train on contexts starting from sentence start
        sample_sentences=False,
        verbose=False,  # print all training contexts
        # Multi-GPU related settings
        master_port='40390',
        master_addr='127.0.0.1',
        # These two are set automatically when multiple GPUs are available
        device_id=None,
        n_devices=None,
):
    # check multi-GPU training
    if n_devices is None:
        n_devices = torch.cuda.device_count()
        if n_devices > 1:
            locals_ = locals()
            kwargs = {a: locals_[a] for a in get_defined_args(main)}
            mp.spawn(_main_mp, (kwargs,), n_devices)
            return

    # the gradient is accumulated from the different devices
    is_main = device_id in {0, None}
    world_size = max(1, n_devices)
    if g_accum_gradients is None:
        g_accum_gradients = world_size
    assert g_accum_gradients % world_size == 0
    accum_gradients = g_accum_gradients // world_size
    if validate_every is None:
        validate_every = save_every

    # model loading stuff
    init_run_path = Path(init_run_path)
    dist_run_path = Path(dist_run_path)
    continue_training = False
    if not dist_run_path.exists():
        print(f"Creating {dist_run_path}")
        dist_run_path.mkdir(exist_ok=True, parents=True)
    else:
        continue_training = True
    model_path = init_run_path / 'model.pt'
    optimizer_path = init_run_path / 'optim.pt'

    if is_main:
        run_path_mark = init_run_path / '.lm'
        if clean and init_run_path.exists():
            assert run_path_mark.exists()  # to avoid removing an unrelated folder
            shutil.rmtree(init_run_path)
        init_run_path.mkdir(exist_ok=True, parents=True)
        run_path_mark.touch()
        shutil.copy(sp_model_path, dist_run_path / 'sp.model')

    # load the sentencepiece model
    sp_model = spm.SentencePieceProcessor()
    sp_model.load(sp_model_path)

    # model parameters
    hparams = HParams(
        n_vocab=len(sp_model),
        n_ctx=n_ctx,
        n_embed=n_embed,
        n_hidden=n_hidden or n_embed,
        n_head=n_head,
        n_layer=n_layer,
        gradient_checkpointing=gradient_checkpointing,
    )
    params = dict(
        hparams=attr.asdict(hparams),
        argv=' '.join(sys.argv),
        epochs=epochs,
        lr=lr,
        batch_size=batch_size,
        g_accum_gradients=g_accum_gradients,
    )
    params_s = json.dumps(params, indent=4, sort_keys=True, ensure_ascii=False)
    if is_main:
        print(params_s)
        (dist_run_path / 'params.json').write_text(params_s, encoding='utf8')

    # load the encoded dataset
    dataset_path = Path(dataset_path)
    print(f'Loading dataset from {dataset_path}')
    valid_dataset = np.load(dataset_path / 'valid.npy')
    train_dataset = np.load(dataset_path / 'train.npy')
    step_tokens = n_ctx * batch_size * g_accum_gradients  # all GPUs
    print(f'Train dataset has {len(train_dataset):,} tokens')
    print(f'Validation dataset has {len(valid_dataset):,} tokens')

    if sample_sentences:
        train_sample_index, valid_sample_index = [
            _sentense_sample_index(dataset, n_ctx, sp_model)
            for dataset in [train_dataset, valid_dataset]]
    else:
        train_sample_index = valid_sample_index = None

    # check for a GPU
    if torch.cuda.is_available():
        device = torch.device('cuda', index=device_id)
    else:
        device = torch.device('cpu')

    # initialize model, loss and optimizer
    model = Model(hparams).to(device)
    cross_entropy = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_meter = AverageMeter()
    cudnn.benchmark = True

    if opt_level:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)

    seen_tokens = 0

    def load_model(model_path, optimizer_path):
        """Load model, update seen_tokens value."""
        nonlocal seen_tokens
        state = torch.load(model_path, map_location=device)
        if 'seen_tokens' in state:
            seen_tokens = state['seen_tokens']
        else:  # legacy format
            seen_tokens = state['step'] * step_tokens
            # seen_tokens = 0
        state_dict = fixed_state_dict(state['state_dict'])
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(torch.load(optimizer_path, map_location=device))
        print(f'Resuming from seen_tokens {seen_tokens:,}')

    if continue_training:
        print("Continue Training ...")
        load_model(dist_run_path / 'model.pt', dist_run_path / 'optim.pt')
    elif model_path.exists():
        load_model(model_path, optimizer_path)

    if device_id is not None:
        print(f'device {device} initializing process group')
        os.environ['MASTER_PORT'] = master_port
        os.environ['MASTER_ADDR'] = master_addr
        torch.distributed.init_process_group(
            backend='nccl', rank=device_id, world_size=world_size)
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device_id], output_device=device_id)
        print(f'process group for {device} initialized')

    def loss_fn(logits, ctx):
        return cross_entropy(
            input=logits[:, :-1].reshape([-1, logits.shape[-1]]),
            target=ctx[:, 1:].reshape(-1))

    def train_step():
        """Train step on one GPU."""
        context = _gen_training_batch(
            train_dataset, n_ctx=n_ctx,
            batch_size=batch_size * accum_gradients,
            sample_index=train_sample_index)
        if verbose:
            print()
            for ctx in context:
                print(repr(sp_model.decode_ids(list(map(int, ctx)))))
            print()
        context = torch.LongTensor(context)
        optimizer.zero_grad()
        loss_scale = n_ctx * batch_size * accum_gradients / (512 * 4 * 32)
        for ctx in torch.split(context, batch_size):
            ctx = ctx.to(device=device)
            logits = model(ctx)['logits']
            loss = loss_fn(logits, ctx)
            loss_b = loss * loss_scale
            if opt_level:
                with amp.scale_loss(loss_b, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss_b.backward()
            loss_meter.update(float(loss.item()))
        optimizer.step()

    def train():
        nonlocal seen_tokens
        epoch_size = len(train_dataset) // step_tokens * step_tokens
        pbar = tqdm.trange(epochs, desc='epochs', dynamic_ncols=True,
                           disable=not is_main)  # pbar used for epochs
        # init_epoch_pbar = lambda: tqdm.trange(
        #     epoch_size, dynamic_ncols=True, disable=not is_main)
        # init_epoch_pbar = lambda: tqdm.trange(epoch_size, disable=not is_main)
        # epoch_pbar = init_epoch_pbar()
        # # pbar.update(seen_tokens // epoch_size)
        # # pbar.refresh()
        # epoch_pbar.update(seen_tokens % epoch_size)
        step = 1
        loss_per_epoch = []
        j = 0
        start_time = time.time()
        while seen_tokens < epochs * epoch_size:
            if max_tokens and seen_tokens >= max_tokens:
                print(f'max_tokens {max_tokens} reached, '
                      f'saving and exiting')
                save()
                validate()
                return
            train_step()
            seen_tokens += step_tokens
            step += 1
            # epoch_pbar.update(step_tokens)
            # epoch_pbar.set_description(f'epoch {1 + seen_tokens // epoch_size}')
            # epoch_pbar.set_postfix(loss=f'{loss_meter.mean():.2f}')
            # epoch_pbar.refresh()
            loss_per_epoch.append(loss_meter.mean())
            if step % save_every == 0:
                save()
            if is_main and step % log_every == 0:
                json_log_plots.write_event(dist_run_path, step=seen_tokens,
                                           loss=loss_meter.mean())
                loss_meter.reset()
            if step % validate_every == 0:
                validate()
            # create a new progress bar for the next epoch
            if seen_tokens % epoch_size == 0:
                # pbar.update()
                # epoch_pbar.close()
                # epoch_pbar = init_epoch_pbar()
                valid_loss = get_valid_loss()
                print(f'epoch: {j} \t train_loss: {np.mean(loss_per_epoch):.3f} \t '
                      f'valid_loss = {valid_loss:.3f} \t '
                      f'time: {(time.time() - start_time):.2f}')
                j += 1
                loss_per_epoch = []
                start_time = time.time()
        # end of training
        save()
        validate()

    def validate():
        if not is_main or world_size != 1:
            return
        json_log_plots.write_event(dist_run_path, step=seen_tokens,
                                   valid_loss=get_valid_loss())

    def get_valid_loss():
        """Run validation, return mean loss. This is a pessimistic score,
        as validation contexts are non-overlapping.
        """
        model.eval()
        losses = AverageMeter()
        with torch.no_grad():
            for ctx in _valid_batch_iter(
                    valid_dataset, batch_size=batch_size, n_ctx=n_ctx,
                    sample_index=valid_sample_index):
                if not ctx:
                    continue
                ctx = torch.LongTensor(ctx).to(device)
                logits = model(ctx)['logits']
                loss = loss_fn(logits, ctx)
                losses.update(float(loss.item()))
        model.train()
        return losses.mean()

    def save():
        if not is_main:
            return
        # for path in [model_path, optimizer_path]:
        #     if path.exists():
        #         shutil.copy(path, run_path / f'{path.stem}-prev{path.suffix}')
        torch.save({
            'state_dict': _unwrapped_model(model).state_dict(),
            'seen_tokens': seen_tokens,
        }, dist_run_path / "model.pt")
        torch.save(optimizer.state_dict(), dist_run_path / "optim.pt")

    if only_validate:
        if world_size != 1:
            print('multi-GPU validation is not supported yet')
            sys.exit(1)
        if is_main:
            print(f'Validation loss: {get_valid_loss():.4f}')
    else:
        try:
            train()
        except KeyboardInterrupt:
            if is_main:
                print('Interrupted, saving')
                save()
            sys.exit(1)
def LoadMultlingualDataset(args):
    """Loads the individual datasets and preprocesses them individually.
    A language token is also added at the start of each dataset. Takes the
    preprocessed data and trains a sentencepiece model on the target
    sentences if enabled, else uses the default tensorflow tokenizer.

    :param args: The args obj which contains paths to the preprocessed files
    :type args: ArgParse object
    :return: The multilingual dataset, source and target vocab
    :rtype: The multilingual dataset is returned as dict, source and tgt vocabs.
    """
    dataset = {}
    CUR_DIR = os.getcwd()
    levels_up = 0
    if args.use_colab is not None:
        DATA_PATH = 'GSoC-19/data/processed_data/'
    else:
        DATA_PATH = (os.path.normpath(
            os.path.join(*([CUR_DIR] + [".."] * levels_up)))) + '/data/processed_data/'
    # create vocabs for the source
    src_vocab = tf.keras.preprocessing.text.Tokenizer(filters='')
    target_str = ''
    spl_sym = DATA_PATH + 'special_symbols'

    for lang in languages:
        (dataset[lang + '_train_nodes'], dataset[lang + '_train_labels'],
         dataset[lang + '_train_node1'], dataset[lang + '_train_node2']) = PreProcess(
             DATA_PATH + lang + '/train_src', lang)
        (dataset[lang + '_eval_nodes'], dataset[lang + '_eval_labels'],
         dataset[lang + '_eval_node1'], dataset[lang + '_eval_node2']) = PreProcess(
             DATA_PATH + lang + '/eval_src', lang)
        (dataset[lang + '_test_nodes'], dataset[lang + '_test_labels'],
         dataset[lang + '_test_node1'], dataset[lang + '_test_node2']) = PreProcess(
             DATA_PATH + lang + '/test_src', lang)

        train_tgt = io.open(DATA_PATH + lang + '/train_tgt',
                            encoding='UTF-8').read().strip().split('\n')
        dataset[lang + '_train_tgt'] = [
            (PreProcessSentence(w, args.sentencepiece, lang)) for w in train_tgt
        ]
        eval_tgt = io.open(DATA_PATH + lang + '/eval_tgt',
                           encoding='UTF-8').read().strip().split('\n')
        dataset[lang + '_eval_tgt'] = [
            (PreProcessSentence(w, args.sentencepiece, lang)) for w in eval_tgt
        ]
        target_str += (DATA_PATH + lang + '/train_tgt') + ','
        target_str += (DATA_PATH + lang + '/eval_tgt') + ','

        # fit the vocab
        src_vocab.fit_on_texts(dataset[lang + '_train_nodes'])
        src_vocab.fit_on_texts(dataset[lang + '_train_labels'])
        src_vocab.fit_on_texts(dataset[lang + '_train_node1'])
        src_vocab.fit_on_texts(dataset[lang + '_train_node2'])
        src_vocab.fit_on_texts(dataset[lang + '_eval_nodes'])
        src_vocab.fit_on_texts(dataset[lang + '_eval_labels'])
        src_vocab.fit_on_texts(dataset[lang + '_eval_node1'])
        src_vocab.fit_on_texts(dataset[lang + '_eval_node2'])
        if args.sentencepiece == 'False':
            src_vocab.fit_on_texts(dataset[lang + '_train_tgt'])
            src_vocab.fit_on_texts(dataset[lang + '_eval_tgt'])

    if args.sentencepiece == 'True':
        print('Targets : ' + target_str)
        os.makedirs(('vocabs/gat/' + args.lang), exist_ok=True)
        spm.SentencePieceTrainer.Train(
            '--input=' + target_str + spl_sym +
            ' --model_prefix=vocabs/' + args.model + '/' + args.lang + '/train_tgt'
            ' --vocab_size=' + str(args.vocab_size) +
            ' --character_coverage=1.0 '
            '--model_type=' + args.sentencepiece_model +
            ' --hard_vocab_limit=false')
        sp = spm.SentencePieceProcessor()
        sp.load('vocabs/' + args.model + '/' + args.lang + '/train_tgt.model')

    if args.sentencepiece == 'True':
        return dataset, src_vocab, sp
    else:
        return dataset, src_vocab, src_vocab
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True,
                        help="sentencepiece model to use for encoding")
    parser.add_argument("--inputs", nargs="+", default=['-'],
                        help="input files to filter/encode")
    parser.add_argument("--outputs", nargs="+", default=['-'],
                        help="path to save encoded outputs")
    parser.add_argument("--output_format", choices=["piece", "id"],
                        default="piece")
    parser.add_argument("--min-len", type=int, metavar="N",
                        help="filter sentence pairs with fewer than N tokens")
    parser.add_argument("--max-len", type=int, metavar="N",
                        help="filter sentence pairs with more than N tokens")
    parser.add_argument("--nbest_size", type=int, metavar="N",
                        help="sampling size")
    parser.add_argument("--alpha", type=float, metavar="N",
                        help="smoothing parameter")
    args = parser.parse_args()

    assert len(args.inputs) == len(args.outputs), \
        "number of input and output paths should match"

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.output_format == "piece":
        def encode(l):
            return sp.SampleEncodeAsPieces(l, args.nbest_size, args.alpha)
    elif args.output_format == "id":
        def encode(l):
            return list(
                map(str, sp.SampleEncodeAsIds(l, args.nbest_size, args.alpha)))
    else:
        raise NotImplementedError

    if args.min_len is not None or args.max_len is not None:
        def valid(line):
            return ((args.min_len is None or len(line) >= args.min_len) and
                    (args.max_len is None or len(line) <= args.max_len))
    else:
        def valid(line):
            return True

    with contextlib.ExitStack() as stack:
        inputs = [
            stack.enter_context(
                open(input, "r", encoding="utf-8", newline="\n", errors="ignore"))
            if input != "-" else sys.stdin for input in args.inputs
        ]
        outputs = [
            stack.enter_context(
                open(output, "w", encoding="utf-8", newline="\n"))
            if output != "-" else sys.stdout for output in args.outputs
        ]

        stats = {
            "num_empty": 0,
            "num_filtered": 0,
        }

        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None

        for i, lines in enumerate(zip(*inputs), start=1):
            enc_lines = list(map(encode_line, lines))
            if not any(enc_line is None for enc_line in enc_lines):
                for enc_line, output_h in zip(enc_lines, outputs):
                    print(" ".join(enc_line), file=output_h)
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

        print("skipped {} empty lines".format(stats["num_empty"]),
              file=sys.stderr)
        print("filtered {} lines".format(stats["num_filtered"]),
              file=sys.stderr)
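# The --nbest_size/--alpha flags above drive SentencePiece's subword
# regularization: with sampling, the same sentence may segment differently on
# every call. A quick sketch, assuming a trained unigram model at "spm.model"
# (hypothetical path; sampling is only supported for unigram models):
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("spm.model")

for _ in range(3):
    # nbest_size=-1 samples over all hypotheses; smaller alpha -> more diverse
    print(sp.SampleEncodeAsPieces("subword regularization", -1, 0.1))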
def loadTrainedBPE(self):
    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(self.basePath + "/segmented/bpe.model")
    return
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, required=True)
    # parser.add_argument('--output', type=str, required=True)
    parser.add_argument('--spm_model_en_de', type=str, required=True)
    parser.add_argument('--spm_model_de_en', type=str, required=True)
    parser.add_argument('--model_en_de', type=str, required=True)
    parser.add_argument('--model_de_en', type=str, required=True)
    # parser.add_argument('--vocab', type=str, default=None)
    # parser.add_argument('--vocab_thresh', type=int, default=None)
    args = parser.parse_args()

    sp_en_de = spm.SentencePieceProcessor()
    sp_en_de.Load(args.spm_model_en_de)
    # vocabulary = read_vocabulary(codecs.open(args.vocab, 'r', 'utf-8'), args.vocab_thresh)
    sp_de_en = spm.SentencePieceProcessor()
    sp_de_en.Load(args.spm_model_de_en)

    # ************************************************************************
    # English -> German
    # ************************************************************************
    translate_en_de_parser = argparse.ArgumentParser()
    translate_opts(translate_en_de_parser)
    translator_en_de_args = translate_en_de_parser.parse_known_args([])[0]
    translator_en_de_args.src = 'na'
# 3rd party libraries
import sentencepiece as sp  # Tokenizer

# Local modules
from params import *  # set of all parameters

# Let's load the trained model
model = load_model(trained_model_filename)

# Load our ready-to-use numpy arrays for testing
testX = np.load(testX_array_filename)
testY = np.load(testY_array_filename)

# Load the trained tokenizers for English and French
# Creating a tokenizer object for English
en_sp = sp.SentencePieceProcessor()
# Loading the English model
en_sp.Load("en.model")
# Creating a tokenizer object for French
fr_sp = sp.SentencePieceProcessor()
# Loading the French model
fr_sp.Load("fr.model")

# Predict
predictions = model.predict_classes(testX)

# Check the translation on a few sentences
for index in range(10):
    print("Original:")
    print(fr_sp.DecodeIds(testX[index, :].tolist()))
    print("Expected:")
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    #### Validate flags
    if FLAGS.save_steps is not None:
        FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)

    if FLAGS.do_predict:
        predict_dir = FLAGS.predict_dir
        if not tf.gfile.Exists(predict_dir):
            tf.gfile.MakeDirs(predict_dir)

    processors = {
        "mnli_matched": MnliMatchedProcessor,
        "mnli_mismatched": MnliMismatchedProcessor,
        'sts-b': StsbProcessor,
        'imdb': ImdbProcessor,
        "yelp5": Yelp5Processor,
        "stackoverflowbody": StackoverflowBodyProcessor,
        "stackoverflowtitle": StackoverflowTitleProcessor
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval`, `do_predict` or "
            "`do_submit` must be True.")

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels() if not FLAGS.is_regression else None

    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)

    def tokenize_fn(text):
        text = preprocess_text(text, lower=FLAGS.uncased)
        return encode_ids(sp, text)

    run_config = model_utils.configure_tpu(FLAGS)

    model_fn = get_model_fn(len(label_list) if label_list is not None else None)

    spm_basename = os.path.basename(FLAGS.spiece_model_file)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if FLAGS.use_tpu:
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.predict_batch_size,
            eval_batch_size=FLAGS.eval_batch_size)
    else:
        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            config=run_config)

    if FLAGS.do_train:
        train_file_base = "{}.len-{}.train.tf_record".format(
            spm_basename, FLAGS.max_seq_length)
        train_file = os.path.join(FLAGS.output_dir, train_file_base)
        tf.logging.info("Use tfrecord file {}".format(train_file))

        train_examples = processor.get_train_examples(FLAGS.data_dir)
        np.random.shuffle(train_examples)
        tf.logging.info("Num of train samples: {}".format(len(train_examples)))

        file_based_convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
            train_file, FLAGS.num_passes)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

    if FLAGS.do_eval or FLAGS.do_predict:
        if FLAGS.eval_split == "dev":
            eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        else:
            eval_examples = processor.get_test_examples(FLAGS.data_dir)

        tf.logging.info("Num of eval samples: {}".format(len(eval_examples)))

    if FLAGS.do_eval:
        # TPU requires a fixed batch size for all batches, therefore the number
        # of examples must be a multiple of the batch size, or else examples
        # will get dropped. So we pad with fake examples which are ignored
        # later on. These do NOT count towards the metric (all tf.metrics
        # support a per-instance weight, and these get a weight of 0.0).
        #
        # Modified in XL: We also adopt the same mechanism for GPUs.
        while len(eval_examples) % FLAGS.eval_batch_size != 0:
            eval_examples.append(PaddingInputExample())

        eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

        file_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
            eval_file)

        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True)

        # Filter out all checkpoints in the directory
        steps_and_files = []
        filenames = tf.gfile.ListDirectory(FLAGS.model_dir)

        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = join(FLAGS.model_dir, ckpt_name)
                global_step = int(cur_filename.split("-")[-1])
                tf.logging.info("Add {} to eval list.".format(cur_filename))
                steps_and_files.append([global_step, cur_filename])
        steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

        # Decide whether to evaluate all ckpts
        if not FLAGS.eval_all_ckpt:
            steps_and_files = steps_and_files[-1:]

        eval_results = []
        for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
            ret = estimator.evaluate(
                input_fn=eval_input_fn,
                steps=eval_steps,
                checkpoint_path=filename)

            ret["step"] = global_step
            ret["path"] = filename

            eval_results.append(ret)

            tf.logging.info("=" * 80)
            log_str = "Eval result | "
            for key, val in sorted(ret.items(), key=lambda x: x[0]):
                log_str += "{} {} | ".format(key, val)
            tf.logging.info(log_str)

        key_name = "eval_pearsonr" if FLAGS.is_regression else "eval_accuracy"
        eval_results.sort(key=lambda x: x[key_name], reverse=True)

        tf.logging.info("=" * 80)
        log_str = "Best result | "
        for key, val in sorted(eval_results[0].items(), key=lambda x: x[0]):
            log_str += "{} {} | ".format(key, val)
        tf.logging.info(log_str)

    if FLAGS.do_predict:
        eval_file_base = "{}.len-{}.{}.predict.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

        file_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
            eval_file)

        pred_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        predict_results = []
        with tf.gfile.Open(os.path.join(predict_dir, "{}.tsv".format(
                task_name)), "w") as fout:
            fout.write("index\tprediction\n")

            for pred_cnt, result in enumerate(estimator.predict(
                    input_fn=pred_input_fn,
                    yield_single_examples=True,
                    checkpoint_path=FLAGS.predict_ckpt)):
                if pred_cnt % 1000 == 0:
                    tf.logging.info("Predicting submission for example: {}".format(
                        pred_cnt))

                logits = [float(x) for x in result["logits"].flat]
                predict_results.append(logits)

                if len(logits) == 1:
                    label_out = logits[0]
                elif len(logits) == 2:
                    if logits[1] - logits[0] > FLAGS.predict_threshold:
                        label_out = label_list[1]
                    else:
                        label_out = label_list[0]
                elif len(logits) > 2:
                    max_index = np.argmax(np.array(logits, dtype=np.float32))
                    # label_out = label_list[max_index]
                    label_out = logits
                else:
                    raise NotImplementedError

                fout.write("{}\t{}\n".format(pred_cnt, label_out))

        predict_json_path = os.path.join(predict_dir, "{}.logits.json".format(
            task_name))

        with tf.gfile.Open(predict_json_path, "w") as fp:
            json.dump(predict_results, fp, indent=4)
def __init__(
    self,
    vocab_file,
    bos_token="<s>",
    eos_token="</s>",
    sep_token="</s>",
    cls_token="<s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
    cls_token_box=[0, 0, 0, 0],
    sep_token_box=[1000, 1000, 1000, 1000],
    pad_token_box=[0, 0, 0, 0],
    pad_token_label=-100,
    only_label_first_subword=True,
    sp_model_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs
) -> None:
    # Mask token behaves like a normal word, i.e. include the space before it
    mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

    self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

    super().__init__(
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        sep_token=sep_token,
        cls_token=cls_token,
        pad_token=pad_token,
        mask_token=mask_token,
        cls_token_box=cls_token_box,
        sep_token_box=sep_token_box,
        pad_token_box=pad_token_box,
        pad_token_label=pad_token_label,
        only_label_first_subword=only_label_first_subword,
        sp_model_kwargs=self.sp_model_kwargs,
        **kwargs,
    )

    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(str(vocab_file))
    self.vocab_file = vocab_file

    # Original fairseq vocab and spm vocab must be "aligned":
    # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
    # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
    # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
    # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'

    # Mimic fairseq token-to-id alignment for the first 4 tokens
    self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

    # The first "real" token "," has position 4 in the original fairseq vocab
    # and position 3 in the spm vocab
    self.fairseq_offset = 1

    self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
    self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

    # additional properties
    self.cls_token_box = cls_token_box
    self.sep_token_box = sep_token_box
    self.pad_token_box = pad_token_box
    self.pad_token_label = pad_token_label
    self.only_label_first_subword = only_label_first_subword
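# The alignment table above implies one conversion rule: pinned specials are
# looked up first, and every regular piece id is shifted up by fairseq_offset.
# A minimal sketch of the conversion helpers (assumed, following the usual
# pattern for fairseq-aligned tokenizers):
def _convert_token_to_id(self, token: str) -> int:
    if token in self.fairseq_tokens_to_ids:
        return self.fairseq_tokens_to_ids[token]
    spm_id = self.sp_model.PieceToId(token)
    # spm returns 0 for out-of-vocabulary pieces; map those to <unk>
    return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

def _convert_id_to_token(self, index: int) -> str:
    if index in self.fairseq_ids_to_tokens:
        return self.fairseq_ids_to_tokens[index]
    return self.sp_model.IdToPiece(index - self.fairseq_offset)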
def __init__(self,
             vocab_file,
             bos_token="[SEP]",
             eos_token="[SEP]",
             sep_token="[SEP]",
             unk_token="[UNK]",
             pad_token="[PAD]",
             cls_token="[CLS]",
             mask_token="[MASK]",
             sp_model_kwargs: Optional[Dict[str, Any]] = None,
             **kwargs) -> None:
    self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

    super().__init__(
        bos_token=bos_token,
        eos_token=eos_token,
        sep_token=sep_token,
        unk_token=unk_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        sp_model_kwargs=self.sp_model_kwargs,
        **kwargs,
    )

    try:
        import sentencepiece as spm
    except ImportError:
        logger.warning(
            "You need to install SentencePiece to use XLMRobertaTokenizer: "
            "https://github.com/google/sentencepiece "
            "pip install sentencepiece")
        raise

    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(str(vocab_file))
    self.vocab_file = vocab_file

    # Original fairseq vocab and spm vocab must be "aligned":
    # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
    # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
    # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
    # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'

    # put special tokens and [unused] tokens into the vocab
    self.fairseq_tokens_to_ids = {
        "[PAD]": 0,
        "[CLS]": 1,
        "[SEP]": 2,
        "[UNK]": 3,
        "[MASK]": 4
    }

    for i in range(10):
        tok = f"[unused{i}]"
        self.fairseq_tokens_to_ids[tok] = 5 + i

    # The first "real" token "," has position 15 in the embedding vocab and
    # position 3 in the spm vocab
    self.fairseq_offset = 12

    self.fairseq_ids_to_tokens = {
        v: k for k, v in self.fairseq_tokens_to_ids.items()
    }

    for k in self.fairseq_tokens_to_ids.keys():
        self.unique_no_split_tokens.append(k)
def __init__(self, params, model, num_workers, worker_id):
    """Speech-to-text data layer constructor.
    See parent class for arguments description.

    Config parameters:

    * **num_audio_features** (int) --- number of audio features to extract.
    * **input_type** (str) --- could be either "spectrogram" or "mfcc".
    * **vocab_file** (str) --- path to vocabulary file or sentencepiece model.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
    * **augmentation** (dict) --- optional dictionary with data augmentation
      parameters. Can contain "time_stretch_ratio", "noise_level_min" and
      "noise_level_max" parameters, e.g.::
        {
            'time_stretch_ratio': 0.05,
            'noise_level_min': -90,
            'noise_level_max': -60,
        }
      For additional details on these parameters see
      :func:`data.speech2text.speech_utils.augment_audio_signal` function.
    * **autoregressive** (bool) --- boolean indicating whether the model is
      autoregressive.
    * **syn_enable** (bool) --- boolean indicating whether the model is using
      synthetic data.
    * **syn_subdirs** (list) --- must be defined if using synthetic mode.
      Contains a list of subdirectories that hold the synthetic wav files.
    """
    super(Speech2TextDataLayer, self).__init__(params, model,
                                               num_workers, worker_id)

    self.params['autoregressive'] = self.params.get('autoregressive', False)
    self.autoregressive = self.params['autoregressive']
    self.params['bpe'] = self.params.get('bpe', False)
    if self.params['bpe']:
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(self.params['vocab_file'])
        self.params['tgt_vocab_size'] = len(self.sp) + 1
    else:
        self.params['char2idx'] = load_pre_existing_vocabulary(
            self.params['vocab_file'],
            read_chars=True,
        )
        if not self.autoregressive:
            # add one for implied blank token
            self.params['tgt_vocab_size'] = len(self.params['char2idx']) + 1
        else:
            num_chars_orig = len(self.params['char2idx'])
            self.params['tgt_vocab_size'] = num_chars_orig + 2
            self.start_index = num_chars_orig
            self.end_index = num_chars_orig + 1
            self.params['char2idx']['<S>'] = self.start_index
            self.params['char2idx']['</S>'] = self.end_index
            self.target_pad_value = self.end_index
        self.params['idx2char'] = {
            i: w for w, i in self.params['char2idx'].items()
        }
    self.target_pad_value = 0

    self._files = None
    if self.params["interactive"]:
        return
    for csv in params['dataset_files']:
        files = pd.read_csv(csv, encoding='utf-8')
        if self._files is None:
            self._files = files
        else:
            self._files = self._files.append(files)

    if self.params['mode'] != 'infer':
        cols = ['wav_filename', 'transcript']
    else:
        cols = 'wav_filename'

    self.all_files = self._files.loc[:, cols].values
    self._files = self.split_data(self.all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None

    self.params['max_duration'] = params.get('max_duration', -1.0)
    self.params['window_size'] = params.get('window_size', 20e-3)
    self.params['window_stride'] = params.get('window_stride', 10e-3)
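# In the BPE branch above, tgt_vocab_size is len(self.sp) + 1, which mirrors
# the char branch's "add one for implied blank token": CTC needs one extra id
# past the vocabulary for the blank label. A minimal sketch, assuming a
# trained model at "bpe.model" (hypothetical path):
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("bpe.model")

transcript = "hello world"
target_ids = sp.EncodeAsIds(transcript)  # BPE ids for the transcript
blank_id = len(sp)                       # the id just past the BPE vocabulary
print(target_ids, blank_id)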
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import sentencepiece as spm

import data_loader
#from vocab import Vocab
from log_timer import LogTimer
from datetime import datetime
from utilities import *
from torch.optim.lr_scheduler import StepLR

logging.basicConfig(filename='LSTM_2lAdam_optim_BPE_20000_3_2048.log',
                    level=logging.DEBUG)

sp_bpe = spm.SentencePieceProcessor()
sp_bpe.load(
    '../../rnn/RNN-Sherlock-Language-Model/korr_ukrlib_bpe_model_20000.model')


class RnnLm(nn.Module):
    """ A language model RNN with LSTM layer(s). """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, gru_layers,
                 tied, dropout):
        super(RnnLm, self).__init__()
        self.tied = tied
        if not tied:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # NOTE: the attribute keeps its original "gru" name although it is an
        # LSTM; the source was truncated here and the dropout keyword below is
        # an assumed completion of the call.
        self.gru = nn.LSTM(embedding_dim, hidden_dim, gru_layers,
                           dropout=dropout)
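# Sketch (assumption, not from the original source): the `tied` flag above is
# presumably the usual weight-tying trick - the output projection reuses the
# embedding matrix, which requires hidden_dim == embedding_dim. A minimal
# standalone version:
class TiedLm(nn.Module):
    def __init__(self, vocab_size, dim, layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim)
        self.lstm = nn.LSTM(dim, dim, layers)
        self.decoder = nn.Linear(dim, vocab_size, bias=False)
        self.decoder.weight = self.embedding.weight  # one shared parameter block

    def forward(self, ids, state=None):
        out, state = self.lstm(self.embedding(ids), state)
        return self.decoder(out), state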
def __init__(self,
             vocab_file,
             src_lang=None,
             tgt_lang=None,
             eos_token="</s>",
             sep_token="</s>",
             cls_token="<s>",
             unk_token="<unk>",
             pad_token="<pad>",
             mask_token="<mask>",
             **kwargs):
    # Mask token behaves like a normal word, i.e. includes the space before it
    mask_token = AddedToken(mask_token, lstrip=True,
                            rstrip=False) if isinstance(mask_token,
                                                        str) else mask_token

    super().__init__(
        src_lang=src_lang,
        tgt_lang=tgt_lang,
        eos_token=eos_token,
        unk_token=unk_token,
        sep_token=sep_token,
        cls_token=cls_token,
        pad_token=pad_token,
        mask_token=mask_token,
        **kwargs,
    )

    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(str(vocab_file))
    self.vocab_file = vocab_file

    # Original fairseq vocab and spm vocab must be "aligned":
    # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
    # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
    # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
    # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'

    # Mimic fairseq token-to-id alignment for the first 4 tokens
    self.fairseq_tokens_to_ids = {
        "<s>": 0,
        "<pad>": 1,
        "</s>": 2,
        "<unk>": 3
    }

    # The first "real" token "," has position 4 in the original fairseq vocab
    # and position 3 in the spm vocab
    self.fairseq_offset = 1

    self.sp_model_size = len(self.sp_model)
    self.lang_code_to_id = {
        code: self.sp_model_size + i + self.fairseq_offset
        for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
    }
    self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
    self.fairseq_tokens_to_ids["<mask>"] = (
        len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset)

    self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
    self.fairseq_ids_to_tokens = {
        v: k for k, v in self.fairseq_tokens_to_ids.items()
    }
    self._additional_special_tokens = list(self.lang_code_to_id.keys())

    self._src_lang = src_lang if src_lang is not None else "en_XX"
    self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
    self.tgt_lang = tgt_lang
    self.set_src_lang_special_tokens(self._src_lang)
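# Id-space layout implied by the code above (illustration only; `tok` is a
# hypothetical instance of this tokenizer):
#   0..3                     -> <s>, <pad>, </s>, <unk>   (fairseq specials)
#   spm_id + 1               -> every ordinary spm piece (fairseq_offset = 1)
#   sp_model_size + 1 .. + N -> the N FAIRSEQ_LANGUAGE_CODES, in order
#   sp_model_size + N + 1    -> <mask>
# so, for example:
#   tok.lang_code_to_id["en_XX"] == (
#       tok.sp_model_size + FAIRSEQ_LANGUAGE_CODES.index("en_XX") + 1)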
def input_fn_builder(data_dir, vocab_model_file, masked_lm_prob,
                     max_encoder_length, max_predictions_per_seq,
                     preprocessed_data, substitute_newline, is_training,
                     tmp_dir=None):
    """Creates an `input_fn` closure to be passed to TPUEstimator."""

    sp_model = spm.SentencePieceProcessor()
    sp_proto = tf.io.gfile.GFile(vocab_model_file, "rb").read()
    sp_model.LoadFromSerializedProto(sp_proto)
    vocab_size = sp_model.GetPieceSize()
    word_start_subtoken = np.array(
        [sp_model.IdToPiece(i)[0] == "▁" for i in range(vocab_size)])
    # build the piece-string table for the vocabulary
    word_to_token = np.array(
        [sp_model.IdToPiece(i) for i in range(vocab_size)])

    feature_shapes = {
        "input_ids": [max_encoder_length],
        "segment_ids": [max_encoder_length],
        "masked_lm_positions": [max_predictions_per_seq],
        "masked_lm_ids": [max_predictions_per_seq],
        "masked_lm_weights": [max_predictions_per_seq],
        "next_sentence_labels": [1]
    }

    def _decode_record(record):
        """Decodes a record to a TensorFlow example."""
        name_to_features = {
            "input_ids":
                tf.io.FixedLenFeature([max_encoder_length], tf.int64),
            "segment_ids":
                tf.io.FixedLenFeature([max_encoder_length], tf.int64),
            "masked_lm_positions":
                tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_ids":
                tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_weights":
                tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32),
            "next_sentence_labels":
                tf.io.FixedLenFeature([1], tf.int64),
        }
        example = tf.io.parse_single_example(record, name_to_features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.cast(t, tf.int32)
            example[name] = t
        return example

    def do_masking(example):
        if "tfds://" == data_dir[:7]:
            text = example["text"]
        else:
            text = example
        tokenizer = tft.SentencepieceTokenizer(
            model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
        if substitute_newline:
            text = tf.strings.regex_replace(text, "\n", substitute_newline)
        subtokens = tokenizer.tokenize(text)
        (subtokens, masked_lm_positions, masked_lm_ids,
         masked_lm_weights) = tf.compat.v1.py_func(
             numpy_masking, [subtokens],
             [tf.int32, tf.int32, tf.int32, tf.float32],
             stateful=False)

        features = {
            "input_ids": subtokens,
            "segment_ids": tf.zeros_like(subtokens),
            "masked_lm_positions": masked_lm_positions,
            "masked_lm_ids": masked_lm_ids,
            "masked_lm_weights": masked_lm_weights,
            "next_sentence_labels": tf.zeros([1], dtype=tf.int64),
        }
        return features

    def numpy_masking(subtokens):
        # Find a random span in text
        end_pos = max_encoder_length - 2 + np.random.randint(
            max(1, len(subtokens) - max_encoder_length - 2))
        start_pos = max(0, end_pos - max_encoder_length + 2)
        subtokens = subtokens[start_pos:end_pos]

        # The start might be inside a word so fix it
        # such that the span always starts at a word
        word_begin_mark = word_start_subtoken[subtokens]
        word_begins_pos = np.flatnonzero(word_begin_mark).astype(np.int32)
        if word_begins_pos.size == 0:
            # if no word boundary is present, we do not do whole word masking
            # and we fall back to random masking.
            word_begins_pos = np.arange(len(subtokens), dtype=np.int32)
            word_begin_mark = np.logical_not(word_begin_mark)
        correct_start_pos = word_begins_pos[0]
        subtokens = subtokens[correct_start_pos:]
        word_begin_mark = word_begin_mark[correct_start_pos:]
        word_begins_pos = word_begins_pos - correct_start_pos
        num_tokens = len(subtokens)

        # We want to do whole word masking, so split by word boundary
        words = np.split(np.arange(num_tokens, dtype=np.int32),
                         word_begins_pos)[1:]
        assert len(words) == len(word_begins_pos)

        # Decide elements to mask
        num_to_predict = min(
            max_predictions_per_seq,
            max(1, int(round(len(word_begins_pos) * masked_lm_prob))))
        masked_lm_positions = np.concatenate(
            np.random.choice(np.array([[]] + words, dtype=object)[1:],
                             num_to_predict, replace=False), 0)

        # this might contain more subtokens than max_predictions_per_seq
        if len(masked_lm_positions) > max_predictions_per_seq:
            masked_lm_positions = masked_lm_positions[:max_predictions_per_seq + 1]
            # however, the last word can cross the word boundary;
            # remove the crossing subtokens
            truncate_masking_at = np.flatnonzero(
                word_begin_mark[masked_lm_positions])[-1]
            masked_lm_positions = masked_lm_positions[:truncate_masking_at]

        # sort masking positions
        masked_lm_positions = np.sort(masked_lm_positions)
        masked_lm_ids = subtokens[masked_lm_positions]

        # replace the input token with [MASK] 80% of the time, a random token
        # 10% of the time, or leave it as is
        randomness = np.random.rand(len(masked_lm_positions))
        mask_index = masked_lm_positions[randomness < 0.8]
        random_index = masked_lm_positions[randomness > 0.9]

        subtokens[mask_index] = 67  # id of masked token
        subtokens[random_index] = np.random.randint(  # ignore special tokens
            101, vocab_size, len(random_index), dtype=np.int32)

        # add [CLS] (65) and [SEP] (66) tokens
        subtokens = np.concatenate([
            np.array([65], dtype=np.int32), subtokens,
            np.array([66], dtype=np.int32)
        ])

        # pad everything to correct shape
        pad_inp = max_encoder_length - num_tokens - 2
        subtokens = np.pad(subtokens, [0, pad_inp], "constant")

        pad_out = max_predictions_per_seq - len(masked_lm_positions)
        masked_lm_weights = np.pad(
            np.ones_like(masked_lm_positions, dtype=np.float32),
            [0, pad_out], "constant")
        masked_lm_positions = np.pad(masked_lm_positions + 1,
                                     [0, pad_out], "constant")
        masked_lm_ids = np.pad(masked_lm_ids, [0, pad_out], "constant")

        return subtokens, masked_lm_positions, masked_lm_ids, masked_lm_weights

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        # Load dataset and handle tfds separately
        split = "train" if is_training else "test"
        if "tfds://" == data_dir[:7]:
            d = tfds.load(data_dir[7:], split=split,
                          shuffle_files=is_training, data_dir=tmp_dir)
        else:
            input_files = tf.io.gfile.glob(
                os.path.join(data_dir, "*{}.tfrecord*".format(split)))

            # For training, we want a lot of parallel reading and shuffling.
            # For eval, we want no shuffling and parallel reading doesn't matter.
            if is_training:
                d = tf.data.Dataset.from_tensor_slices(
                    tf.constant(input_files))
                d = d.shuffle(buffer_size=len(input_files))

                # Non deterministic mode means that the interleaving is not exact.
                # This adds even more randomness to the training pipeline.
                d = d.interleave(
                    tf.data.TFRecordDataset,
                    deterministic=False,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
            else:
                d = tf.data.TFRecordDataset(input_files)

        if preprocessed_data:
            d = d.map(_decode_record,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
        else:
            d = d.map(do_masking,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if is_training:
            # reshuffle_each_iteration: re-shuffle the buffer in a different
            # order on every pass over the data
            d = d.shuffle(buffer_size=10000, reshuffle_each_iteration=True)
            d = d.repeat()
        d = d.padded_batch(batch_size, feature_shapes,
                           drop_remainder=True)  # For static shape
        return d

    return input_fn
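# Toy illustration (assumption: standalone, not from the original source) of
# the 80/10/10 rule used inside numpy_masking above. The constants mirror the
# ones in the code: 67 = [MASK], ids >= 101 are "non-special".
import numpy as np

toy_tokens = np.arange(10, dtype=np.int32) + 200   # fake subtoken ids
positions = np.array([1, 4, 7])                    # positions selected for LM
randomness = np.random.rand(len(positions))

toy_tokens[positions[randomness < 0.8]] = 67       # ~80%: replace with [MASK]
rand_pos = positions[randomness > 0.9]             # ~10%: replace with random id
toy_tokens[rand_pos] = np.random.randint(101, 1000, len(rand_pos),
                                         dtype=np.int32)
# the remaining ~10% keep their original ids
print(toy_tokens)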
parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--word_embedding_path", default=None, type=str, help="Path of the output word embedding.") args = parser.parse_args() if args.spm_model_path: try: import sentencepiece as spm except ImportError: raise ImportError("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" "pip install sentencepiece") sp_model = spm.SentencePieceProcessor() sp_model.Load(args.spm_model_path) vocab = Vocab() vocab.i2w = {i: sp_model.IdToPiece(i) for i in range(sp_model.GetPieceSize())} else: vocab = Vocab() vocab.load(args.vocab_path) pretrained_model = torch.load(args.load_model_path) embedding = pretrained_model["embedding.word_embedding.weight"] with open(args.word_embedding_path, mode="w", encoding="utf-8") as f: head = str(list(embedding.size())[0]) + " " + str(list(embedding.size())[1]) + "\n" f.write(head) for i in range(len(vocab.i2w)):
#@title Create a tokenizer and its model
#@markdown NOTE: Fewer tokenizer words seem to work better

# %cd /content/

full_path_to_INT_dataset = "/content/Music-Reformer_INT_Dataset.txt" #@param {type:"string"}
tokenizer_vocabulary_size_in_words = 321 #@param {type:"integer"}

# Train a BPE model on the dataset
spm.SentencePieceTrainer.train(input=full_path_to_INT_dataset,
                               model_prefix='Music-Reformer-Tokenizer',
                               vocab_size=tokenizer_vocabulary_size_in_words,
                               model_type='bpe')

# Load BPE vocabulary
TOKENIZER = spm.SentencePieceProcessor()
TOKENIZER.load('Music-Reformer-Tokenizer.model')

# Load the dataset
with open(full_path_to_INT_dataset, 'r') as f:
    text = f.read(512 * 3072)

IDS = TOKENIZER.EncodeAsIds(text)
IDS = np.asarray(IDS, dtype=np.int32)
PAD_AMOUNT = 512 * 1024 - len(IDS)
print("Number of tokens:", IDS.shape[0])

#@title Split the dataset
train_validation_split_ratio = 0.9 #@param {type:"slider", min:0.05, max:0.95, step:0.05}
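# Quick round-trip check (assumption: not part of the original notebook cell):
# decode a short prefix of the ids and re-encode it to sanity-check the
# freshly trained tokenizer.
sample_text = TOKENIZER.DecodeIds(IDS[:32].tolist())
print(sample_text)
print(TOKENIZER.EncodeAsIds(sample_text)[:10])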
# -*- coding: utf-8 -*-
import os

import sentencepiece as spm
import pythaipiece

templates_dir = os.path.dirname(pythaipiece.__file__)
template_file = os.path.join(templates_dir, 'thai3.model')

sp = spm.SentencePieceProcessor()
sp.Load(template_file)


def segment(text):
    """Segment Thai text into a word list; '▁' marks a preceding space."""
    listdata = [i for i in sp.EncodeAsPieces(text) if i != '▁']
    listword = []
    for i in listdata:
        if '▁' in i:
            listword.append(' ')
            listword.append(i.replace('▁', ''))
        else:
            listword.append(i)
    return listword
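# Usage sketch (illustrative only; the exact segmentation depends on the
# bundled thai3.model):
#   words = segment('ประโยคภาษาไทย')   # -> list of word strings, with ' '
#                                      #    entries where the input had spaces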
def load_from_file(cls, filepath): import sentencepiece as splib spm = splib.SentencePieceProcessor() spm.load(filepath) spm.set_encode_extra_options(":eos") return cls(spm)
def __init__(self): self.sp = spm.SentencePieceProcessor() #self.sp.load('mtier1_20w.model') #self.sp.load('mtier1_10w.model') self.sp.load(r'E:\embedding\title\top24_10w.model')
def __setstate__(self, d): self.__dict__ = d self.spm = sp.SentencePieceProcessor() self.spm.Load(self.vocab_file)
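# Companion sketch (assumption; the original class is not shown in full):
# __setstate__ above only works if pickling first drops the C++ processor,
# which cannot be pickled. The usual counterpart looks like:
def __getstate__(self):
    state = self.__dict__.copy()
    state["spm"] = None        # drop the unpicklable SentencePieceProcessor
    return state               # __setstate__ re-loads it from self.vocab_file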