Code example #1
def train_sentencepiece(
    file_path: str,
    model_path: str,
    vocab_size: int,
    character_coverage: float,
    model_type: str):
  """Train SentencePiece tokenizer from subset of tf dataset.

  Args:
    file_path: path of data to train sentencepiece.
    model_path: path of model file to save vocab model to.
    vocab_size: size of vocab tokens to train.
    character_coverage: amount of characters covered by the model, good defaults
      are 0.9995 for languages with rich character set like Japanese or Chinese
      and 1.0 for other languages with small character set.
    model_type: type of sentencepiece vocab to train.

  The trained model and vocab are written to `<model_path>.model` and
  `<model_path>.vocab`; nothing is returned.
  """
  argstr = " ".join([
      f"--input={file_path}", f"--vocab_size={vocab_size}",
      f"--character_coverage={character_coverage}",
      f"--model_prefix={model_path}", f"--model_type={model_type}",
      "--bos_id=-1", "--pad_id=0", "--eos_id=1", "--unk_id=2"
  ])
  SentencePieceTrainer.Train(argstr)
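For context, a minimal usage sketch of the helper above: `--model_prefix=model_path` makes SentencePiece write `<model_path>.model` and `<model_path>.vocab`, which can then be loaded with `SentencePieceProcessor`. The corpus path, prefix, and sizes below are placeholders, not values from the original project.

from sentencepiece import SentencePieceProcessor

# Hypothetical arguments for illustration only.
train_sentencepiece(
    file_path="corpus.txt",
    model_path="spm_demo",
    vocab_size=8000,
    character_coverage=1.0,
    model_type="unigram")

sp = SentencePieceProcessor()
sp.load("spm_demo.model")  # written because --model_prefix=spm_demo
ids = sp.encode_as_ids("a sample sentence")  # pad=0, eos=1, unk=2 as configured above
print(sp.decode_ids(ids))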
Code example #2
File: sentencepiece.py Project: MJ-Jang/octopus
    def train(self,
              sent_path: str,
              model_prefix: str,
              character_coverage=0.9995,
              vocab_size=None,
              model_type: str = "bpe",
              control_symbols: list = ['<pad>']):

        if character_coverage is None and vocab_size is None:
            print("at least character_coverage or vocab_size should be given!")
            assert character_coverage or vocab_size

        coverage_conditions = ""
        if character_coverage is not None:
            coverage_condition = f" --character_coverage={str(character_coverage)} "
        else:
            coverage_condition = f" --vocab_size={str(vocab_size)} "

        symbol_list = ""
        for i in control_symbols:
            symbol_list += i + ","

        args = ("--input={} "
                "--model_prefix={} "
                "--model_type={} "
                "--control_symbols={} ".format(sent_path, model_prefix,
                                               model_type, symbol_list))

        args += coverage_condition

        SentencePieceTrainer.Train(args)
Code example #3
 def train(model_type: str,
           vocab_size: int,
           model_path: str,
           files: Iterator[str],
           no_split_toks: Optional[List[str]] = None):
     """
     Train Sentence Piece Model
     :param model_type: sentence piece model type: {unigram, BPE, word, char}
     :param vocab_size: target vocabulary size
     :param model_path: where to store model
     :param files: input files
     :param no_split_toks: Don't split these tokens
     :return:
     """
     model_prefix = model_path.replace('.model', '')
     files = set(files)  # remove duplicates
     arg = f"--input={','.join(files)} --vocab_size={vocab_size} --model_prefix={model_prefix}" \
           f" --model_type={model_type} --pad_id={PAD_TOK[1]} --bos_id={BOS_TOK[1]}" \
           f" --eos_id={EOS_TOK[1]} --unk_id={UNK_TOK[1]} --hard_vocab_limit=false"
     if no_split_toks:
         arg += f" --user_defined_symbols={','.join(no_split_toks)}"
     log.info(f"SPM: {arg}")
     SentencePieceTrainer.Train(arg)
     log.info("Training complete")
     if not model_path.endswith('.model'):
         model_path += '.model'
     return Field(model_path)
Code example #4
def makeSentencepieceModel(df, fname):
    content = df['plylst_title']
    with open('{}.txt'.format(fname), 'w', encoding='utf8') as f:
        f.write('\n'.join(content))
    SentencePieceTrainer.Train(
        '--input={}.txt --model_prefix={} --vocab_size=3000'.format(
            fname, fname))
Code example #5
def train_sentencepiece(texts: Collection[str],
                        path: PathOrStr,
                        pre_rules: ListRules = None,
                        post_rules: ListRules = None,
                        vocab_sz: int = None,
                        max_vocab_sz: int = 30000,
                        model_type: str = 'unigram',
                        max_sentence_len: int = 20480,
                        lang='en',
                        char_coverage=None,
                        tmp_dir='tmp'):
    "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
    from sentencepiece import SentencePieceTrainer
    cache_dir = Path(path) / tmp_dir
    os.makedirs(cache_dir, exist_ok=True)
    if vocab_sz is None: vocab_sz = get_default_size(texts, max_vocab_sz)
    raw_text_path = cache_dir / 'all_text.out'
    with open(raw_text_path, 'w') as f:
        f.write("\n".join(texts))
    spec_tokens = ['\u2581' + s for s in defaults.text_spec_tok]
    SentencePieceTrainer.Train(" ".join([
        f"--input={raw_text_path} --max_sentence_length={max_sentence_len}",
        f"--character_coverage={ifnone(char_coverage, 0.99999 if lang in full_char_coverage_langs else 0.9998)}",
        f"--unk_id={len(defaults.text_spec_tok)} --pad_id=-1 --bos_id=-1 --eos_id=-1",
        f"--user_defined_symbols={','.join(spec_tokens)}",
        f"--model_prefix={cache_dir/'spm'} --vocab_size={vocab_sz} --model_type={model_type}"
    ]))
    raw_text_path.unlink()
    return cache_dir
Code example #6
def _train_sentencepiece(input_path, vocab_size, model_path, eos_id=1):
    argstr = " ".join([
        f"--input={input_path}", f"--vocab_size={vocab_size}",
        "--character_coverage=0.995", f"--model_prefix={model_path}",
        "--model_type=bpe", "--bos_id=-1", "--pad_id=0", f"--eos_id={eos_id}",
        "--unk_id=2"
    ])
    SentencePieceTrainer.Train(argstr)
Code example #7
def _train_sentencepiece(input_path, vocab_size, model_path, eos_id=1):
  argstr = ' '.join([
      f'--input={input_path}', f'--vocab_size={vocab_size}',
      '--character_coverage=0.995',
      f'--model_prefix={model_path}', '--model_type=bpe',
      '--bos_id=-1', '--pad_id=0', f'--eos_id={eos_id}', '--unk_id=2'
  ])
  SentencePieceTrainer.Train(argstr)
Code example #8
File: lm_bert_dataset.py Project: vladk17/NeMo
    def create_vocab_mlm(
        self,
        data_dir,
        vocab_size,
        sample_size,
        special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
        train_file='',
    ):
        vocab = special_tokens[:]
        bert_dir = f'{data_dir}/bert'
        if if_exist(bert_dir, ['tokenizer.model']):
            logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir))
            return data_dir, f'{bert_dir}/tokenizer.model'
        logging.info(f'Processing WikiText dataset and store at {bert_dir}')
        os.makedirs(bert_dir, exist_ok=True)

        if not train_file:
            files = glob.glob(f'{data_dir}/*.txt')
            train_file = f'{bert_dir}/merged.txt'
            logging.info(f"Merging {len(files)} txt files into {train_file}")

            with open(train_file, "w") as merged:
                for file in tqdm(files):
                    with open(file, 'r') as inf:
                        content = inf.read().strip()
                    merged.write(content + '\n\n\n')
        else:
            train_file = f'{data_dir}/{train_file}'

        cmd = (f"--input={train_file} --model_prefix={bert_dir}/tokenizer "
               f"--vocab_size={vocab_size - len(vocab)} "
               f"--input_sentence_size={sample_size} "
               f"--shuffle_input_sentence=true --hard_vocab_limit=false "
               f"--bos_id=-1 --eos_id=-1")

        SPT.Train(cmd)

        # Add BERT control symbols
        tokens = []

        with open(f"{bert_dir}/tokenizer.vocab", "r") as f:
            f.readline()  # skip first <unk> token

            # Read tokens from each line and parse for vocab
            for line in f:
                piece = line.split("\t")[0]
                token = piece[1:] if piece.startswith("▁") else f"##{piece}"
                tokens.append(token)

        vocab.extend(tokens)

        # Save vocabulary to output file
        with open(f'{bert_dir}/vocab.txt', "w") as f:
            for token in vocab:
                f.write(f"{token}\n".format())
        return data_dir, f'{bert_dir}/tokenizer.model'
Code example #9
def train_sentencepiece(dataset,
                        vocab_size,
                        maxchars=1e9,
                        character_coverage=1.0,
                        model_path="model",
                        model_type="unigram",
                        data_keys=("inputs", "targets")):
    """Train SentencePiece tokenizer from subset of tf dataset.

  Args:
    dataset: tf.dataset
    vocab_size: int: size of vocab tokens to train.
    maxchars: int: number of characters to use for sentencepiece training.
    character_coverage: amount of characters covered by the model, good defaults
      are 0.9995 for languages with rich character set like Japanese or Chinese
      and 1.0 for other languages with small character set.
    model_path: str: path of model file to save vocab model to.
    model_type: str: type of sentencepiece vocab to train.
    data_keys: Tuple[str]: keys of dataset to use for training.

  Returns:
    path to the trained sentencepiece vocabulary model.
  """
    fname, _ = dump_chars_to_textfile(dataset,
                                      maxchars=maxchars,
                                      data_keys=data_keys)
    with tempfile.NamedTemporaryFile(delete=False,
                                     prefix="/tmp/sp_tmp") as model_fp:
        pass  # we just want a prefix'd tmp-filename
    argstr = " ".join([
        f"--input={fname}", f"--vocab_size={vocab_size}",
        f"--character_coverage={character_coverage}",
        f"--model_prefix={model_fp.name}", f"--model_type={model_type}"
    ])
    SentencePieceTrainer.Train(argstr)
    if jax.process_index() == 0:
        # Use an intermediate filename that is renamed to the target name to address
        # create and fill delays.
        copy_rename_path = model_path + ".rntmp"
        tf.io.gfile.copy(model_fp.name + ".model",
                         copy_rename_path,
                         overwrite=True)
        tf.io.gfile.rename(copy_rename_path, model_path, overwrite=True)
        tf.io.gfile.copy(model_fp.name + ".vocab",
                         copy_rename_path + ".vocab",
                         overwrite=True)
        tf.io.gfile.rename(copy_rename_path + ".vocab",
                           model_path + ".vocab",
                           overwrite=True)
        logging.info("copied %s to %s", model_fp.name + ".model", model_path)
    else:
        while not tf.io.gfile.exists(model_path):
            time.sleep(1)
        time.sleep(1)
    return model_path
Code example #10
def _train_sentencepiece(dataset: tf.data.Dataset,
                         *,
                         vocab_size: int,
                         maxchars: int = int(1e7),
                         model_path: str,
                         model_type: str = 'unigram',
                         character_coverage: float = 1.0,
                         data_keys=('inputs', 'targets')):
    """Train SentencePiece tokenizer from subset of tf dataset.

  Args:
    dataset: tf.dataset
    vocab_size: int: size of vocab tokens to train.
    maxchars: int: number of characters to use for sentencepiece training.
    model_path: str: path of model file to save vocab model to.
    model_type: str: type of sentencepiece vocab to train.
    character_coverage: amount of characters covered by the model, good defaults
      are 0.9995 for languages with rich character set like Japanese or Chinese
      and 1.0 for other languages with small character set.
    data_keys: Tuple[str]: keys of dataset to use for training.

  Returns:
    path to the trained sentencepiece vocabulary model.
  """
    if model_path.startswith('gs://'):
        abs_model_path = model_path
    else:
        abs_model_path = os.path.abspath(os.path.expanduser(model_path))
    fname, _ = _dump_chars_to_textfile(dataset,
                                       maxchars=maxchars,
                                       data_keys=data_keys)
    with tempfile.NamedTemporaryFile(delete=False,
                                     prefix='/tmp/sp_tmp') as model_fp:
        pass  # we just want a prefix'd tmp-filename
    argstr = ' '.join([
        f'--input={fname}', f'--vocab_size={vocab_size}',
        f'--character_coverage={character_coverage}',
        f'--model_prefix={model_fp.name}', f'--model_type={model_type}'
    ])
    SentencePieceTrainer.Train(argstr)
    if jax.process_index() == 0:
        # Use an intermediate filename that is renamed to the target name to address
        # create and fill delays.
        copy_rename_path = abs_model_path + '.rntmp'
        tf.io.gfile.copy(model_fp.name + '.model',
                         copy_rename_path,
                         overwrite=True)
        tf.io.gfile.rename(copy_rename_path, abs_model_path, overwrite=True)
        logging.info('copied %s to %s', model_fp.name + '.model',
                     abs_model_path)
    else:
        while not tf.io.gfile.exists(abs_model_path):
            time.sleep(1)
        time.sleep(1)
    return abs_model_path
Code example #11
def train(args, inputs, lang, tgt=False):

    spm_dir = args.spm_dir
    if not os.path.exists(spm_dir):
        os.makedirs(spm_dir)

    train_config = {
        k: getattr(args, ("tgt" if tgt else "src") + "_" + k)
        for k in [
            "vocab_size",
            "character_coverage",
            "byte_fallback",
        ]
    }

    SentencePieceTrainer.train(
        input=inputs,
        model_prefix=os.path.join(args.spm_dir, lang),
        **train_config,
    )
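Note that this example uses the keyword-argument form of `SentencePieceTrainer.train` (available in newer sentencepiece releases) rather than the single flag string used by most other examples. A small sketch of the two equivalent call styles, with placeholder file names and sizes:

from sentencepiece import SentencePieceTrainer

# Flag-string style, as in most examples on this page.
SentencePieceTrainer.Train(
    "--input=corpus.txt --model_prefix=demo --vocab_size=4000 --model_type=bpe")

# Keyword-argument style, as used above; both calls write demo.model / demo.vocab.
SentencePieceTrainer.train(
    input="corpus.txt", model_prefix="demo", vocab_size=4000, model_type="bpe")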
Code example #12
def train_sentencepiece(dataset,
                        vocab_size,
                        maxchars=1e7,
                        character_coverage=1.0,
                        model_path='wmt_model.model',
                        model_type='unigram',
                        data_keys=('inputs', 'targets')):
    """Train SentencePiece tokenizer from subset of tf dataset.

  Args:
    dataset: tf.dataset
    vocab_size: int: size of vocab tokens to train.
    maxchars: int: number of characters to use for sentencepiece training.
    character_coverage: amount of characters covered by the model, good
      defaults are 0.9995 for languages with rich character set like Japanese
      or Chinese and 1.0 for other languages with small character set.
    model_path: str: path of model file to save vocab model to.
    model_type: str: type of sentencepiece vocab to train.
    data_keys: Tuple[str]: keys of dataset to use for training.

  Returns:
    path to the trained sentencepiece vocabulary model.
  """
    abs_model_path = os.path.abspath(os.path.expanduser(model_path))
    fname, _ = dump_chars_to_textfile(dataset,
                                      maxchars=maxchars,
                                      data_keys=data_keys)
    with tempfile.NamedTemporaryFile(delete=False,
                                     prefix='/tmp/sp_tmp') as model_fp:
        pass  # we just want a prefix'd tmp-filename
    argstr = ' '.join([
        f'--input={fname}', f'--vocab_size={vocab_size}',
        f'--character_coverage={character_coverage}',
        f'--model_prefix={model_fp.name}', f'--model_type={model_type}'
    ])
    SentencePieceTrainer.Train(argstr)
    # Only write to CNS if host id is 0 to prevent race conditions during
    # multihost training, otherwise wait until host 0 has written the file.
    if jax.host_id() == 0:
        # Use an intermediate filename that is renamed to the target name to address
        # create and fill delays.  Using finalization (CNS) as a indicator is not
        # portable.
        copy_rename_path = abs_model_path + '.rntmp'
        tf.io.gfile.copy(model_fp.name + '.model',
                         copy_rename_path,
                         overwrite=True)
        tf.io.gfile.rename(copy_rename_path, abs_model_path, overwrite=True)
        logging.info('copied %s to %s', model_fp.name + '.model',
                     abs_model_path)
    else:
        while not tf.io.gfile.exists(abs_model_path):
            time.sleep(1)
        time.sleep(1)
    return abs_model_path
Code example #13
 def __create_vocab(self):
     """
     generate pretraining vocab and tokenizer model.
     CORPUS/xxx.txt -> DATA/xxx.vocab, DATA/xxx.model
     """
     for text_file in tqdm(FileUtil.file_list(self.config.corpus_dir),
                           desc='create vocab and tokenizer model'):
         if text_file.endswith('.txt'):
             # """ see: https://github.com/google/sentencepiece#usage-instructions """
             params = f'--input={text_file} --model_prefix={os.path.join(ACE_ROOT, self.config.model_prefix)} --vocab_size={self.config.vocab_size} ' \
                      f'--model_type={self.config.model_type} --character_coverage={self.config.character_coverage}'
             SentencePieceTrainer.Train(params)
Code example #14
    def train(
        self,
        input_path: list,
        model_prefix: str,
        character_coverage=0.9995,
        vocab_size=None,
        model_type: str = "bpe",
        control_symbols: list = [
            "[PAD]", "[SEP]", "[MASK]", "[CLS]", "<s>", "</s>"
        ],
    ):
        """
        Function for train tokenizer

        Args:
            input_path (str):
            model_prefix (str):
            character_coverage (float):
            vocab_size (float):
            model_type (str):
            control_symbols (list):
        """

        if character_coverage is None and vocab_size is None:
            print("at least character_coverage or vocab_size should be given!")
            assert character_coverage or vocab_size

        coverage_conditions = ""
        if character_coverage is not None:
            coverage_condition = f" --character_coverage={str(character_coverage)} "
        else:
            coverage_condition = f" --vocab_size={str(vocab_size)} "

        symbol_list = ""
        for i in control_symbols:
            symbol_list += i + ","

        input_list = ""
        for i in input_path:
            input_list += i + ","

        args = ("--input={} "
                "--model_prefix={} "
                "--model_type={} "
                "--control_symbols={} "
                "--bos_id=5 --eos_id=6 --unk_id=1".format(
                    input_list, model_prefix, model_type, symbol_list))

        args += coverage_condition

        print(args)

        SentencePieceTrainer.Train(args)
Code example #15
File: core.py Project: MSSandroid/fastai_dev
 def train(self, raw_text_path):
     "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
     from sentencepiece import SentencePieceTrainer
     vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz
     spec_tokens = ['\u2581'+s for s in self.special_toks]
     SentencePieceTrainer.Train(" ".join([
         f"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}",
         f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
         f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1",
         f"--user_defined_symbols={','.join(spec_tokens)}"]))
     raw_text_path.unlink()
     return self.cache_dir/'spm.model'
Code example #16
def train_sentencepiece(sentence_file, sentencepiece_model_name, vocab_size):
  SentencePieceTrainer.Train(
      " ".join(
          [
              f"--input={sentence_file}",
              f"--character_coverage=1.0",
              f"--unk_id=0 --pad_id=-1 --bos_id=-1 --eos_id=-1",
              f"--input_sentence_size=2000000 --shuffle_input_sentence=true",
              f"--model_prefix={sentencepiece_model_name} --vocab_size={vocab_size} --model_type=unigram",
          ]
      )
  )
Code example #17
def spm(name, path, size=8192, bos=2, eos=1, unk=0, coverage=0.9995):
    """-> SentencePieceProcessor

    trains a sentence piece model of `size` from text file on `path`
    and saves with `name`.

    """
    SentencePieceTrainer.train("--model_prefix={name} \
        --input={path} \
        --vocab_size={size} \
        --bos_id={bos} \
        --eos_id={eos} \
        --unk_id={unk} \
        --unk_surface=☹ \
        --character_coverage={coverage}".format(coverage=coverage,
                                                unk=unk,
                                                eos=eos,
                                                bos=bos,
                                                size=size,
                                                path=path,
                                                name=name))
Code example #18
def train_subwords(train_path, model_path, model_type, vocab_size):
    temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
    for text, title in read(train_path):
        temp.write(text + "\n")
        temp.write(title + "\n")
    temp.close()
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
        temp.name, os.path.join(model_path, model_type), vocab_size,
        model_type)
    sp_trainer.Train(cmd)
    os.unlink(temp.name)
Code example #19
def train_sentencepiece_tokenizer(sentences: list,
                                  vocab_size: int,
                                  folder_name: str = "sentencepiece",
                                  model_name: str = "tokenizer_de") -> None:
    '''Trains a sentencepiece tokenizer on a given corpus.

    Args:
        sentences: contains all sentences of a corpus.
        vocab_size: maximum number of (sub-)words in the vocabulary of the tokenizer.
        folder_name: name of the folder where the trained tokenizer will be placed in.
        model_name: filename of the trained sentencepiece tokenizer.
    '''
    temp_file = "sentences.txt"  # this file will be deleted after training of the tokenizer is done.

    if folder_name != "":
        output_file = folder_name + "/" + model_name
    else:
        output_file = model_name

    # write all sentences to a temporary file
    with open(temp_file, 'w', encoding='utf-8') as f:
        for sentence in sentences:
            f.write(sentence + "\n")

    parameters = f"--input={temp_file} \
                --model_prefix={output_file} \
                --vocab_size={vocab_size} \
                --bos_id=2 \
                --eos_id=3 \
                --unk_id=1 \
                --pad_id=0 \
                --bos_piece=<s> \
                --eos_piece=</s> \
                --hard_vocab_limit=false"

    # train tokenizer on our corpus
    SentencePieceTrainer.train(parameters)
    # delete temp_file
    os.remove(temp_file)
Code example #20
File: dataset.py Project: roholazandie/t-vae
    def preprocess(self, directory: str, prefix: str, part: str, spm_model: SentencePieceProcessor = None,
                   pretrain_emb=True, vocab_size=3000, embedding_size=600,
                   max_sentence_length=16384, workers=3, skip_gramm=False):

        # Check data files existing
        workdir = os.path.join(directory, prefix)
        os.makedirs(workdir, exist_ok=True)

        data_part_file = os.path.join(directory, part + ".tsv")
        if not os.path.exists(data_part_file):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), data_part_file)

        if part not in ['train', 'develop']:
            assert spm_model is not None, "For non train part, `spm_model` must be specified."
        else:
            # Train sentencepiece:
            logging.info("Start training sentencepiece")
            spm_directory = os.path.join(workdir, "spm")
            spm_params = (
                "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                "--input={} --model_prefix={} --vocab_size={} --max_sentence_length={}".format(
                    data_part_file, spm_directory, vocab_size, max_sentence_length
                )
            )
            SentencePieceTrainer.Train(spm_params)
            spm_model = SentencePieceProcessor()
            spm_model.load(spm_directory + ".model")

            if pretrain_emb:
                # Train word2vec
                logging.info("Start training word2vec")
                train_senteces = SentenceIterator(data_part_file, spm_model)
                logging.info("Loaded train sentences")
                w2v_model = Word2Vec(train_senteces, min_count=0, workers=workers,
                                     size=embedding_size, sg=int(skip_gramm))
                w2v_model_filename = os.path.join(workdir, "word2vec.model")
                w2v_model.save(w2v_model_filename)

                # Export embeddings
                logging.info("Export embeddings")
                embeddings_filename = os.path.join(workdir, "embedding.npy")
                export_embeddings(embeddings_filename, spm_model, w2v_model)
                logging.info("Embeddings have been saved into {}".format(embeddings_filename))

        logging.info("Start exporting data file")
        source_file_name = os.path.join(directory, part + ".tsv")
        exported_file_name = os.path.join(workdir, part + ".npy")
        sentence_iterator = SentenceIterator(source_file_name, spm_model)
        sentence_iterator.export(exported_file_name)
        logging.info("{} exported".format(exported_file_name))
        logging.info("Data preprocessing completed")
Code example #21
    def __spm_create(self):
        if os.path.isfile("data/love.model"):
            return 0

        params = '--input=' + c.data_text_path + \
                 ' --model_prefix=data/love ' \
                 ' --vocab_size=2500' \
                 ' --model_type=' + c.model_type[0] + \
                 ' --max_sentence_length=999999' \
                 ' --pad_id=0 --pad_piece=[PAD]' \
                 ' --unk_id=1 --unk_piece=[UNK]' \
                 ' --character_coverage=1.0'
        # character_coverage: 0.9995 is the usual default for rich character sets (e.g. Korean), 1.0 for small ones like English
        SentencePieceTrainer.Train(params)
Code example #22
    def _make_sp_model_file(self, vocab, prefix="spm", add_mask_token=False):
        """Creates Sentencepiece word model with given words plus special tokens.

    The tokens of the resulting model are, in this order:
        <pad>, <unk>, [CLS], [SEP], [MASK]*, ...vocab..., <s>, </s>
    *=if requested by args.

    The words in the input vocab are plain text, without the whitespace marker.
    That makes this function interchangeable with _make_vocab_file().

    Args:
      vocab: a list of strings with the words to put into the model's
        vocabulary. Do not include special tokens here.
      prefix: an optional string, to change the filename prefix for the model
        (relative to the temporary directory created by this function).
      add_mask_token: an optional bool, whether to include a [MASK] token.

    Returns:
      The absolute filename of the created Sentencepiece model file.
    """
        model_prefix = os.path.join(
            tempfile.mkdtemp(dir=self.get_temp_dir()),  # New subdir each time.
            prefix)
        input_file = model_prefix + "_train_input.txt"
        # Create input text for training the sp model from the tokens provided.
        # Repeat tokens, the earlier the more, because they are sorted by frequency.
        input_text = []
        for i, token in enumerate(vocab):
            input_text.append(" ".join([token] * (len(vocab) - i)))
        with tf.io.gfile.GFile(input_file, "w") as f:
            f.write("\n".join(input_text + [""]))
        control_symbols = "[CLS],[SEP]"
        full_vocab_size = len(vocab) + 6  # <pad>, <unk>, [CLS], [SEP], <s>, </s>.
        if add_mask_token:
            control_symbols += ",[MASK]"
            full_vocab_size += 1
        flags = dict(model_prefix=model_prefix,
                     model_type="word",
                     input=input_file,
                     pad_id=0,
                     unk_id=1,
                     control_symbols=control_symbols,
                     vocab_size=full_vocab_size,
                     bos_id=full_vocab_size - 2,
                     eos_id=full_vocab_size - 1)
        SentencePieceTrainer.Train(" ".join(
            ["--{}={}".format(k, v) for k, v in flags.items()]))
        return model_prefix + ".model"
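Assuming the token order documented in the docstring above (<pad>, <unk>, [CLS], [SEP], optional [MASK], then the vocab, then <s>/</s>), a quick sanity check on the generated file might look like the following sketch; `model_file` stands for the path returned by `_make_sp_model_file`.

from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.load(model_file)  # model_file: path returned by _make_sp_model_file(...)

assert sp.piece_to_id("<pad>") == 0
assert sp.piece_to_id("<unk>") == 1
assert sp.piece_to_id("[CLS]") == 2
assert sp.piece_to_id("[SEP]") == 3
assert sp.bos_id() == sp.get_piece_size() - 2  # <s>
assert sp.eos_id() == sp.get_piece_size() - 1  # </s>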
Code example #23
File: input_pipeline.py Project: yanndupis/flax
def train_sentencepiece(dataset,
                        vocab_size,
                        maxchars=1e7,
                        character_coverage=1.0,
                        model_path='wmt_model.model',
                        model_type='unigram',
                        data_keys=('inputs', 'targets')):
    """Train SentencePiece tokenizer from subset of tf dataset.

  Args:
    dataset: tf.dataset
    vocab_size: int: size of vocab tokens to train.
    maxchars: int: number of characters to use for sentencepiece training.
    character_coverage: amount of characters covered by the model, good
      defaults are 0.9995 for languages with rich character set like Japanese
      or Chinese and 1.0 for other languages with small character set.
    model_path: str: path of model file to save vocab model to.
    model_type: str: type of sentencepiece vocab to train.
    data_keys: Tuple[str]: keys of dataset to use for training.

  Returns:
    path to the trained sentencepiece vocabulary model.
  """
    abs_model_path = os.path.abspath(os.path.expanduser(model_path))
    fname, _ = dump_chars_to_textfile(dataset,
                                      maxchars=maxchars,
                                      data_keys=data_keys)
    with tempfile.NamedTemporaryFile(delete=False,
                                     prefix='/tmp/sp_tmp') as model_fp:
        pass  # we just want a prefix'd tmp-filename
    argstr = ' '.join([
        f'--input={fname}', f'--vocab_size={vocab_size}',
        f'--character_coverage={character_coverage}',
        f'--model_prefix={model_fp.name}', f'--model_type={model_type}'
    ])
    SentencePieceTrainer.Train(argstr)
    if jax.host_id() == 0:
        tf.io.gfile.copy(model_fp.name + '.model',
                         abs_model_path,
                         overwrite=True)
        logging.info('copied %s to %s', model_fp.name + '.model',
                     abs_model_path)
    else:
        while not tf.io.gfile.exists(abs_model_path):
            time.sleep(1)
        time.sleep(1)
    return abs_model_path
Code example #24
def train_subwords(train_path, model_path, model_type, vocab_size,
                   config_path):
    temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    for text, summary in reader.parse_set(train_path):
        temp.write(text + "\n")
        temp.write(summary + "\n")
    temp.close()
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
        temp.name, os.path.join(model_path, model_type), vocab_size,
        model_type)
    sp_trainer.Train(cmd)
    os.unlink(temp.name)
Code example #25
def train(input_file, opencorpora_file):
    records = []
    with open(input_file, "r") as r:
        next(r)
        reader = csv.reader(r)
        for row in reader:
            _, _, text, _ = row
            text = text.replace("\n", " ").lower()
            nn_count = text.count("нн")
            if nn_count == 1:
                records.append((text, 0))
    with open(opencorpora_file, "r") as r:
        for line in r:
            text = line.strip().lower()
            if "нн" in text:
                records.append((text, 1))
    random.shuffle(records)
    border = int(0.8 * len(records))
    train = records[:border]
    val = records[border:]

    model_path = "subword_model"
    if False:
        temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
        for text, _ in train:
            temp.write(text + "\n")
        temp.close()
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            temp.name, model_path, 30000, "bpe")
        sp_trainer.Train(cmd)
        os.unlink(temp.name)

    processor = sp_processor()
    processor.load(model_path + ".model")
    fixed_train = []
    for text, label in train:
        text = " ".join(tokenize(processor, text))
        fixed_train.append((text, label))
    fixed_val = []
    for text, label in val:
        text = " ".join(tokenize(processor, text))
        fixed_val.append((text, label))

    to_ft_format(fixed_train, "nn_train.txt")
    to_ft_format(fixed_val, "nn_val.txt")
Code example #26
File: codec.py Project: isi-nlp/rtg
 def train(cls,
           model_type: str,
           vocab_size: int,
           model_path: str,
           files: Iterator[str],
           no_split_toks: Optional[List[str]] = None,
           char_coverage: float = 0):
     """
     Train Sentence Piece Model
     :param model_type: sentence piece model type: {unigram, BPE, word, char}
     :param vocab_size: target vocabulary size
     :param model_path: where to store model
     :param files: input files
     :param no_split_toks: Don't split these tokens
     :param char_coverage: character coverage in (0, 1]; a value <= 0 means the default coverage of 0.9995
     :return:
     """
     model_prefix = model_path.replace('.model', '')
     files = ','.join(files)  # comma-separated list of input files
     arg = f"--input={files} --vocab_size={vocab_size} --model_prefix={model_prefix}" \
           f" --model_type={model_type} --pad_id={cls.pad_idx} --bos_id={cls.bos_idx}" \
           f" --eos_id={cls.eos_idx} --unk_id={cls.unk_idx} --hard_vocab_limit=false"
     if char_coverage > 0:
         assert 0 < char_coverage <= 1
         arg += f" --character_coverage={char_coverage}"
     # CLS token goes at the beginning because we need it to get index 4
     extra = [cls.cls_tok] + (no_split_toks or [])
     no_split_toks_str = ','.join(extra)
     arg += f" --user_defined_symbols={no_split_toks_str}"
     if model_type == 'bpe':  # BPE can have longer sentences, default is 2048
         arg += " --max_sentence_length=8192"
     if model_type == 'word':
         arg += ' --use_all_vocab'
     log.info(f"SPM: {arg}")
     SentencePieceTrainer.Train(arg)
     log.info("Training complete")
     if not model_path.endswith('.model'):
         model_path += '.model'
     model = SPField(model_path)
     for piece, idx in cls.reserved():
         assert model.piece_to_id(piece) == idx
     return model
Code example #27
File: grammar_tsya.py Project: hequs/nghack
def train(input_file):
    records = []
    with open(input_file, "r") as r:
        next(r)
        reader = csv.reader(r)
        for row in reader:
            _, text, label = row
            text = text.replace("\n", " ").lower()
            tjsya_count = text.count("ться")
            tsya_count = text.count("тся")
            if (tjsya_count != 0 and tsya_count == 0) or (tjsya_count == 0
                                                          and tsya_count != 0):
                records.append((text, label))
    random.shuffle(records)
    border = int(0.8 * len(records))
    train = records[:border]
    val = records[border:]

    model_path = "subword_model"
    if True:
        temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
        for text, _ in train:
            temp.write(text + "\n")
        temp.close()
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            temp.name, model_path, 30000, "bpe")
        sp_trainer.Train(cmd)
        os.unlink(temp.name)

    processor = sp_processor()
    processor.load(model_path + ".model")
    fixed_train = []
    for text, label in train:
        text = " ".join(tokenize(processor, text))
        fixed_train.append((text, label))
    fixed_val = []
    for text, label in val:
        text = " ".join(tokenize(processor, text))
        fixed_val.append((text, label))

    to_ft_format(fixed_train, "grammar_endings_train.txt")
    to_ft_format(fixed_val, "grammar_endings_val.txt")
Code example #28
def _create_fake_sentencepiece_model(output_dir):
    vocab = ['a', 'b', 'c', 'd', 'e', 'abc', 'def', 'ABC', 'DEF']
    model_prefix = os.path.join(output_dir, 'spm_model')
    input_text_file_path = os.path.join(output_dir, 'train_input.txt')
    with tf.io.gfile.GFile(input_text_file_path, 'w') as f:
        f.write(' '.join(vocab + ['\n']))
    # Add 7 more tokens: <pad>, <unk>, [CLS], [SEP], [MASK], <s>, </s>.
    full_vocab_size = len(vocab) + 7
    flags = dict(model_prefix=model_prefix,
                 model_type='word',
                 input=input_text_file_path,
                 pad_id=0,
                 unk_id=1,
                 control_symbols='[CLS],[SEP],[MASK]',
                 vocab_size=full_vocab_size,
                 bos_id=full_vocab_size - 2,
                 eos_id=full_vocab_size - 1)
    SentencePieceTrainer.Train(' '.join(
        ['--{}={}'.format(k, v) for k, v in flags.items()]))
    return model_prefix + '.model'
Code example #29
 def train(model_type: str,
           vocab_size: int,
           model_path: str,
           files: Iterator[str],
           no_split_toks: Optional[List[str]] = None,
           cover_all_chars: bool = False):
     """
     Train Sentence Piece Model
     :param model_type: sentence piece model type: {unigram, BPE, word, char}
     :param vocab_size: target vocabulary size
     :param model_path: where to store model
     :param files: input files
     :param no_split_toks: Don't split these tokens
     :param cover_all_chars: if True, pass --character_coverage=1.0 to the trainer
     :return:
     """
     model_prefix = model_path.replace('.model', '')
     files = ','.join(files)  # comma-separated list of input files
     arg = f"--input={files} --vocab_size={vocab_size} --model_prefix={model_prefix}" \
         f" --model_type={model_type} --pad_id={PAD_TOK[1]} --bos_id={BOS_TOK[1]}" \
         f" --eos_id={EOS_TOK[1]} --unk_id={UNK_TOK[1]} --hard_vocab_limit=false"
     if cover_all_chars:
         arg += f" --character_coverage=1.0"
     # CLS token goes at the beginning because we need it to get index 4
     cls_tok_str = CLS_TOK[0]
     if no_split_toks:
         no_split_toks_str = ','.join([cls_tok_str] + no_split_toks)
     else:
         no_split_toks_str = cls_tok_str
     arg += f" --user_defined_symbols={no_split_toks_str}"
     if model_type == 'bpe':  # BPE can have longer sentences, default is 2048
         arg += " --max_sentence_length=8192"
     log.info(f"SPM: {arg}")
     SentencePieceTrainer.Train(arg)
     log.info("Training complete")
     if not model_path.endswith('.model'):
         model_path += '.model'
     model = Field(model_path)
     for piece, idx in RESERVED_TOKS:
         assert model.piece_to_id(piece) == idx
     return model
Code example #30
File: text_layers_test.py Project: ykate1998/models
 def setUp(self):
   super().setUp()
   # Make a sentencepiece model.
   tmp_dir = self.get_temp_dir()
   tempfile.mkdtemp(dir=tmp_dir)
   vocab = ["a", "b", "c", "d", "e", "abc", "def", "ABC", "DEF"]
   model_prefix = os.path.join(tmp_dir, "spm_model")
   input_text_file_path = os.path.join(tmp_dir, "train_input.txt")
   with tf.io.gfile.GFile(input_text_file_path, "w") as f:
     f.write(" ".join(vocab + ["\n"]))
   # Add 7 more tokens: <pad>, <unk>, [CLS], [SEP], [MASK], <s>, </s>.
   full_vocab_size = len(vocab) + 7
   flags = dict(
       model_prefix=model_prefix,
       model_type="word",
       input=input_text_file_path,
       pad_id=0, unk_id=1, control_symbols="[CLS],[SEP],[MASK]",
       vocab_size=full_vocab_size,
       bos_id=full_vocab_size-2, eos_id=full_vocab_size-1)
   SentencePieceTrainer.Train(
       " ".join(["--{}={}".format(k, v) for k, v in flags.items()]))
   self._spm_path = model_prefix + ".model"