def make_subword_learner(subword_config, subword_dir, tokenizer=None):
    params = subword_config.get("params")
    if params is None:
        raise ValueError(
            "'params' field should be specified for subword model learning.")
    subword_type = subword_config.get("type")
    if subword_type is None:
        raise ValueError(
            "'type' field should be specified for subword model learning.")
    vocab_size = params.get("vocab_size")
    if vocab_size is None:
        raise ValueError(
            "'vocab_size' parameter should be specified for subword model learning.")
    if subword_type == "bpe":
        learner = pyonmttok.BPELearner(
            tokenizer=tokenizer,
            symbols=vocab_size,
            min_frequency=params.get("min-frequency", 0),
            total_symbols=params.get("total_symbols", False),
        )
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(tokenizer=tokenizer, **params)
    else:
        raise ValueError("Invalid subword type : '%s'." % subword_type)
    return {"learner": learner, "subword_type": subword_type, "size": vocab_size}
def make_subword_learner(subword_config, subword_dir):
    if 'params' not in subword_config:
        raise RuntimeError(
            'Parameter field \'params\' should be specified for subword model learning.')
    params = subword_config['params']
    if 'type' not in subword_config:
        raise RuntimeError(
            '\'type\' field should be specified for subword model learning.')
    subword_type = subword_config['type']
    if 'vocab_size' not in params:
        raise RuntimeError(
            '\'vocab_size\' should be specified for subword model learning.')
    size = params['vocab_size']
    learner = None
    if subword_type == "bpe":
        min_frequency = params['min-frequency'] if 'min-frequency' in params else 0
        total_symbols = params['total_symbols'] if 'total_symbols' in params else False
        # If no tokenizer is specified, the default tokenizer is space mode.
        learner = pyonmttok.BPELearner(symbols=size,
                                       min_frequency=min_frequency,
                                       total_symbols=total_symbols)
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(**params)
    else:
        raise RuntimeError('Invalid subword type : \'%s\'.' % subword_type)
    return {"learner": learner, "subword_type": subword_type, "size": size}
def make_subword_learner(subword_config, subword_dir):
    params = subword_config.get('params')
    if params is None:
        raise ValueError('\'params\' field should be specified for subword model learning.')
    subword_type = subword_config.get('type')
    if subword_type is None:
        raise ValueError('\'type\' field should be specified for subword model learning.')
    vocab_size = params.get('vocab_size')
    if vocab_size is None:
        raise ValueError('\'vocab_size\' parameter should be specified for subword model learning.')
    if subword_type == "bpe":
        learner = pyonmttok.BPELearner(
            symbols=vocab_size,
            min_frequency=params.get('min-frequency', 0),
            total_symbols=params.get('total_symbols', False))
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(**params)
    else:
        raise ValueError('Invalid subword type : \'%s\'.' % subword_type)
    return {
        "learner": learner,
        "subword_type": subword_type,
        "size": vocab_size
    }
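A minimal usage sketch for the make_subword_learner variants above, assuming pyonmttok is installed; the configuration values, the "subword" directory, and the corpus/model file names are illustrative and not taken from the snippets themselves.

import pyonmttok

# Hypothetical config and file names, for illustration only.
config = {"type": "sp", "params": {"vocab_size": 8000, "character_coverage": 0.98}}
info = make_subword_learner(config, "subword")
info["learner"].ingest_file("train.txt")
tokenizer = info["learner"].learn("sp_8000.model")  # learn() returns a configured pyonmttok.Tokenizer
print(tokenizer.tokenize("Hello world!"))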
def tgt(vocabulary_size):
    learner = pyonmttok.SentencePieceLearner(vocab_size=vocabulary_size)
    learner.ingest_file("tgt-train.txt")
    tokenizer = learner.learn("ca_m.model", verbose=True)
    tokenizer.tokenize_file("tgt-train.txt", "tgt-train.txt.token")
    tokenizer.tokenize_file("tgt-test.txt", "tgt-test.txt.token")
    tokenizer.tokenize_file("tgt-val.txt", "tgt-val.txt.token")
def test_sp_learner(tmpdir):
    learner = pyonmttok.SentencePieceLearner(vocab_size=17, character_coverage=0.98)
    learner.ingest("hello word! how are you?")
    model_path = str(tmpdir.join("sp.model"))
    tokenizer = learner.learn(model_path)
    tokens, _ = tokenizer.tokenize("hello")
    assert tokens == ["▁h", "e", "l", "l", "o"]
def train_joint_tok_model(file_src, file_tgt):
    learner = pyonmttok.SentencePieceLearner(vocab_size=50000, character_coverage=1.0)
    learner.ingest_file(file_src)
    learner.ingest_file(file_tgt)
    temp_model_file = tempfile.NamedTemporaryFile(delete=False)
    tokenizer = learner.learn(temp_model_file.name)
    return tokenizer, temp_model_file.name
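A possible way to call train_joint_tok_model above; "train.src" and "train.tgt" are hypothetical parallel files, and the temporary SentencePiece model left behind by delete=False is removed once the tokenizer has been used.

import os

# Hypothetical input files; the returned tokenizer already holds the trained model in memory.
tokenizer, sp_model_path = train_joint_tok_model("train.src", "train.tgt")
tokenizer.tokenize_file("train.src", "train.src.tok")
tokenizer.tokenize_file("train.tgt", "train.tgt.tok")
os.remove(sp_model_path)  # clean up the NamedTemporaryFile created with delete=False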
def test_sp_learner(tmpdir, keep_vocab):
    learner = pyonmttok.SentencePieceLearner(
        keep_vocab=keep_vocab, vocab_size=17, character_coverage=0.98)
    learner.ingest("hello word! how are you?")
    model_path = str(tmpdir.join("sp"))
    tokenizer = learner.learn(model_path)
    if keep_vocab:
        assert os.path.exists(model_path + ".model")
        assert os.path.exists(model_path + ".vocab")
    else:
        assert os.path.exists(model_path)
    tokens, _ = tokenizer.tokenize("hello")
    assert tokens == ["▁h", "e", "l", "l", "o"]
def learn_sp(sp_model, vocab_size=32000, character_coverage=0.98, files=[]):
    learner = pyonmttok.SentencePieceLearner(
        vocab_size=vocab_size, character_coverage=character_coverage)
    if len(files):
        for f in files:
            sys.stderr.write('Ingest file={}\n'.format(f))
            sys.stderr.flush()
            learner.ingest_file(f)
    else:
        sys.stderr.write('Ingest stdin\n')
        sys.stderr.flush()
        for l in sys.stdin:
            learner.ingest(l)
    sys.stderr.write('Learning {}\n'.format(sp_model))
    sys.stderr.flush()
    learner.learn(sp_model)
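A hedged example of invoking the learn_sp helper above; the model path and corpus names are placeholders. With an empty files list, the helper reads sentences from stdin instead.

# Illustrative call only; "joint.sp.model", "train.en" and "train.de" are placeholders.
learn_sp("joint.sp.model", vocab_size=16000, character_coverage=1.0,
         files=["train.en", "train.de"])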
def main():
    tf.get_logger().setLevel("INFO")
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("data", nargs="*", help="List of data files.")
    parser.add_argument(
        "--from_vocab",
        default=None,
        help="Build from a saved vocabulary (see also --from_format).",
    )
    parser.add_argument(
        "--from_format",
        default="default",
        choices=["default", "sentencepiece"],
        help="The format of the saved vocabulary (see also --from_vocab).",
    )
    parser.add_argument("--save_vocab", required=True, help="Output vocabulary file.")
    parser.add_argument("--min_frequency", type=int, default=1, help="Minimum word frequency.")
    parser.add_argument(
        "--size",
        type=int,
        default=0,
        help="Maximum vocabulary size. If = 0, do not limit vocabulary.",
    )
    parser.add_argument(
        "--size_multiple",
        type=int,
        default=1,
        help=(
            "Ensure that the vocabulary size + 1 is a multiple of this value "
            "(+ 1 represents the <unk> token that will be added during the training."
        ),
    )
    parser.add_argument(
        "--without_sequence_tokens",
        default=False,
        action="store_true",
        help="If set, do not add special sequence tokens (start, end) in the vocabulary.",
    )
    parser.add_argument(
        "--tokenizer_config",
        default=None,
        help=(
            "Tokenization configuration as a JSON string or a path to a YAML configuration file. "
            "When building a SentencePiece model and vocabulary, this is used as a "
            "pre-tokenization. SentencePiece will receive tokens instead of sentences as "
            "inputs."
        ),
    )
    parser.add_argument(
        "--sentencepiece",
        nargs="*",
        default=None,
        help=(
            "Build a SentencePiece model and vocabulary. This option accepts additional "
            "training parameters (e.g. --sentencepiece character_coverage=0.98)."
        ),
    )
    args = parser.parse_args()

    special_tokens = [constants.PADDING_TOKEN]
    if not args.without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = data.Vocab(special_tokens=special_tokens)
    num_oov_buckets = 1

    if args.sentencepiece is not None:
        if args.min_frequency > 1:
            raise ValueError(
                "--min_frequency option is not supported when training a SentencePiece "
                "model and vocabulary")
        import pyonmttok

        if args.size_multiple == 1:
            vocab_size = args.size
        else:
            # Round vocabulary size to the next multiple of args.size_multiple
            vocab_size = (
                args.size
                - (args.size + num_oov_buckets) % args.size_multiple
                + args.size_multiple
            )
        if args.tokenizer_config:
            tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
            if not isinstance(tokenizer, tokenizers.OpenNMTTokenizer):
                tokenizer_type = tokenizer.__class__.__name__
                raise ValueError(
                    "Only tokenizer type 'OpenNMTTokenizer' can be used as a SentencePiece "
                    "pre-tokenization, got tokenizer type '%s' instead."
                    % tokenizer_type)
        else:
            tokenizer = None
        sp_params = dict(map(lambda arg: tuple(arg.split("=")), args.sentencepiece))
        sp_trainer = pyonmttok.SentencePieceLearner(
            tokenizer=tokenizer.opennmt_tokenizer if tokenizer is not None else None,
            keep_vocab=True,
            vocab_size=vocab_size,
            **sp_params,
        )
        for data_file in args.data:
            sp_trainer.ingest_file(data_file)
        sp_trainer.learn(args.save_vocab, verbose=True)
        model_path = args.save_vocab + ".model"
        vocab_path = args.save_vocab + ".vocab"
        if tokenizer is None:
            tf.get_logger().info(
                "Converting SentencePiece vocabulary to OpenNMT-tf format...")
            vocab.load(vocab_path, file_format="sentencepiece")
        else:
            tf.get_logger().info(
                "Applying SentencePiece model on data and extracting the %d most "
                "frequent tokens...",
                vocab_size,
            )
            tokenizer = tokenizers.OpenNMTTokenizer(
                sp_model_path=model_path, **tokenizer.config)
            for data_file in args.data:
                vocab.add_from_text(data_file, tokenizer=tokenizer)
            vocab = vocab.prune(max_size=vocab_size)
        vocab.serialize(vocab_path)
    else:
        if args.from_vocab is not None:
            vocab.load(args.from_vocab, file_format=args.from_format)
        tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
        for data_file in args.data:
            vocab.add_from_text(data_file, tokenizer=tokenizer)
        vocab = vocab.prune(max_size=args.size, min_frequency=args.min_frequency)
        vocab.pad_to_multiple(args.size_multiple, num_oov_buckets=num_oov_buckets)
        vocab.serialize(args.save_vocab)
assert isinstance(learner, pyonmttok.SubwordLearner)
learner.ingest("hello word! how are you?")
model_path = str(tmpdir.join("sp"))
tokenizer = learner.learn(model_path)
if keep_vocab:
    assert os.path.exists(model_path + ".model")
    assert os.path.exists(model_path + ".vocab")
else:
    assert os.path.exists(model_path)
tokens, _ = tokenizer.tokenize("hello")
assert tokens == ["▁h", "e", "l", "l", "o"]


@pytest.mark.parametrize("learner", [
    pyonmttok.BPELearner(symbols=2, min_frequency=1),
    pyonmttok.SentencePieceLearner(vocab_size=17, character_coverage=0.98)
])
def test_learner_with_invalid_files(tmpdir, learner):
    with pytest.raises(ValueError):
        learner.ingest_file("notfound.txt")
    learner.ingest("hello word ! how are you ?")
    directory = tmpdir.join("directory")
    directory.ensure(dir=True)
    with pytest.raises(Exception):
        learner.learn(str(directory))


def test_token_api():
    tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True, case_markup=True)
# only for fast_align, eflomal aligns always the two directions at the same time
DELETE_EXISTING_VALID = config["DELETE_EXISTING"]
DELETE_TEMP_VALID = config["DELETE_TEMP"]
SPLIT_LIMIT = int(config["SPLIT_LIMIT"])

sys.path.append(MTUOC)
tokenizerASL = importlib.import_module(SL_TOKENIZER)
tokenizerATL = importlib.import_module(TL_TOKENIZER)

from MTUOC_split_corpus import split_corpus

learner = pyonmttok.SentencePieceLearner(
    vocab_size=VOCAB_SIZE,
    character_coverage=CHARACTER_COVERAGE,
    model_type=MODEL_TYPE,
    input_sentence_size=INPUT_SENTENCE_SIZE,
    shuffle_input_sentence=True,
    hard_vocab_limit=False)
learnerSL = pyonmttok.SentencePieceLearner(
    vocab_size=VOCAB_SIZE,
    character_coverage=CHARACTER_COVERAGE,
    model_type=MODEL_TYPE,
    input_sentence_size=INPUT_SENTENCE_SIZE,
    shuffle_input_sentence=True,
    hard_vocab_limit=False)
learnerTL = pyonmttok.SentencePieceLearner(
    vocab_size=VOCAB_SIZE,
    character_coverage=CHARACTER_COVERAGE,
    model_type=MODEL_TYPE,
    input_sentence_size=INPUT_SENTENCE_SIZE,
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("data", nargs="*", help="Source text file.")
    parser.add_argument(
        "--from_vocab",
        default=None,
        help="Build from a saved vocabulary (see also --from_format).")
    parser.add_argument(
        "--from_format",
        default="default",
        choices=["default", "sentencepiece"],
        help="The format of the saved vocabulary (see also --from_vocab).")
    parser.add_argument("--save_vocab", required=True, help="Output vocabulary file.")
    parser.add_argument("--min_frequency", type=int, default=1, help="Minimum word frequency.")
    parser.add_argument(
        "--size",
        type=int,
        default=0,
        help="Maximum vocabulary size. If = 0, do not limit vocabulary.")
    parser.add_argument(
        "--size_multiple",
        type=int,
        default=1,
        help=("Ensure that the vocabulary size + 1 is a multiple of this value "
              "(+ 1 represents the <unk> token that will be added during the training."))
    parser.add_argument(
        "--without_sequence_tokens",
        default=False,
        action="store_true",
        help="If set, do not add special sequence tokens (start, end) in the vocabulary.")
    parser.add_argument("--tokenizer_config", default=None, help="Tokenization configuration.")
    parser.add_argument(
        "--sentencepiece",
        nargs="*",
        default=None,
        help=("Build a SentencePiece model and vocabulary. This option accepts additional "
              "training parameters (e.g. --sentencepiece character_coverage=0.98)."))
    args = parser.parse_args()

    special_tokens = [constants.PADDING_TOKEN]
    if not args.without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = data.Vocab(special_tokens=special_tokens)
    num_oov_buckets = 1

    if args.sentencepiece is not None:
        import pyonmttok  # pylint: disable=import-outside-toplevel
        if args.size_multiple == 1:
            vocab_size = args.size
        else:
            # Round vocabulary size to the next multiple of args.size_multiple
            vocab_size = (
                args.size
                - (args.size + num_oov_buckets) % args.size_multiple
                + args.size_multiple)
        sp_params = dict(map(lambda arg: tuple(arg.split("=")), args.sentencepiece))
        sp_trainer = pyonmttok.SentencePieceLearner(
            keep_vocab=True, vocab_size=vocab_size, **sp_params)
        for data_file in args.data:
            sp_trainer.ingest_file(data_file)
        sp_trainer.learn(args.save_vocab, verbose=True)
        args.save_vocab = args.save_vocab + ".vocab"
        vocab.load(args.save_vocab, file_format="sentencepiece")
    else:
        if args.from_vocab is not None:
            vocab.load(args.from_vocab, file_format=args.from_format)
        tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
        for data_file in args.data:
            vocab.add_from_text(data_file, tokenizer=tokenizer)
        vocab = vocab.prune(max_size=args.size, min_frequency=args.min_frequency)
        vocab.pad_to_multiple(args.size_multiple, num_oov_buckets=num_oov_buckets)

    vocab.serialize(args.save_vocab)