Example #1
 def train(self, raw_text_path):
     "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
     from sentencepiece import SentencePieceTrainer
     vocab_sz = self._get_vocab_sz(
         raw_text_path) if self.vocab_sz is None else self.vocab_sz
     spec_tokens = ['\u2581' + s for s in self.special_toks]
     SentencePieceTrainer.Train(" ".join([
         f"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}",
         f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
         f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1",
         f"--user_defined_symbols={','.join(spec_tokens)}"
     ]))
     raw_text_path.unlink()
     return self.cache_dir / 'spm.model'
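The method above returns the path to the trained `spm.model`. A minimal usage sketch (not part of the original snippet) for loading that file with SentencePieceProcessor and tokenizing text; the 'tmp/spm.model' path is an assumption standing in for self.cache_dir/'spm.model':

import sentencepiece as spm

# Load the model file returned by train(); the path below is illustrative.
sp = spm.SentencePieceProcessor()
sp.Load("tmp/spm.model")

pieces = sp.EncodeAsPieces("hello sentencepiece")
ids = sp.EncodeAsIds("hello sentencepiece")
print(pieces, ids)
print(sp.DecodeIds(ids))  # round-trips back to the original text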
Example #2
    def _make_sp_model_file(self, vocab, prefix="spm", add_mask_token=False):
        """Creates a Sentencepiece word model with the given words plus special tokens.

        The tokens of the resulting model are, in this order:
            <pad>, <unk>, [CLS], [SEP], [MASK]*, ...vocab..., <s>, </s>
        (* = only if requested by args.)

        The words in the input vocab are plain text, without the whitespace marker.
        That makes this function interchangeable with _make_vocab_file().

        Args:
          vocab: a list of strings with the words to put into the model's
            vocabulary. Do not include special tokens here.
          prefix: an optional string, to change the filename prefix for the model
            (relative to the temporary directory created by this function).
          add_mask_token: an optional bool, whether to include a [MASK] token.

        Returns:
          The absolute filename of the created Sentencepiece model file.
        """
        model_prefix = os.path.join(
            tempfile.mkdtemp(dir=self.get_temp_dir()),  # New subdir each time.
            prefix)
        input_file = model_prefix + "_train_input.txt"
        # Create input text for training the sp model from the tokens provided.
        # Repeat earlier tokens more often, so the frequency-sorted model keeps them in the given order.
        input_text = []
        for i, token in enumerate(vocab):
            input_text.append(" ".join([token] * (len(vocab) - i)))
        with tf.io.gfile.GFile(input_file, "w") as f:
            f.write("\n".join(input_text + [""]))
        control_symbols = "[CLS],[SEP]"
        full_vocab_size = len(
            vocab) + 6  # <pad>, <unk>, [CLS], [SEP], <s>, </s>.
        if add_mask_token:
            control_symbols += ",[MASK]"
            full_vocab_size += 1
        flags = dict(model_prefix=model_prefix,
                     model_type="word",
                     input=input_file,
                     pad_id=0,
                     unk_id=1,
                     control_symbols=control_symbols,
                     vocab_size=full_vocab_size,
                     bos_id=full_vocab_size - 2,
                     eos_id=full_vocab_size - 1)
        SentencePieceTrainer.Train(" ".join(
            ["--{}={}".format(k, v) for k, v in flags.items()]))
        return model_prefix + ".model"
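A hedged sketch (not from the original test code) of checking the token layout the docstring above promises, once a model file has been created:

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load(model_file)  # `model_file` is the path returned by _make_sp_model_file()

# <pad>=0 and <unk>=1 come from the pad_id/unk_id flags; [CLS] and [SEP] are
# the control symbols; <s> and </s> sit at the last two ids.
assert sp.piece_to_id("<pad>") == 0
assert sp.piece_to_id("<unk>") == 1
assert sp.piece_to_id("[CLS]") == 2
assert sp.piece_to_id("[SEP]") == 3
assert sp.id_to_piece(sp.bos_id()) == "<s>"
assert sp.id_to_piece(sp.eos_id()) == "</s>"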
Example #3
def train_sentencepiece(dataset,
                        vocab_size,
                        maxchars=1e7,
                        character_coverage=1.0,
                        model_path='wmt_model.model',
                        model_type='unigram',
                        data_keys=('inputs', 'targets')):
  """Train SentencePiece tokenizer from subset of tf dataset.

  Args:
    dataset: tf.dataset
    vocab_size: int: size of vocab tokens to train.
    maxchars: int: number of characters to use for sentencepiece training.
    character_coverage: amount of characters covered by the model, good
      defaults are 0.9995 for languages with rich character set like Japanese
      or Chinese and 1.0 for other languages with small character set.
    model_path: str: path of model file to save vocab model to.
    model_type: str: type of sentencepiece vocab to train.
    data_keys: Tuple[str]: keys of dataset to use for training.

  Returns:
    path to the trained sentencepiece vocabulary model.
  """
  abs_model_path = os.path.abspath(os.path.expanduser(model_path))
  fname, _ = dump_chars_to_textfile(dataset,
                                    maxchars=maxchars,
                                    data_keys=data_keys)
  with tempfile.NamedTemporaryFile(delete=False,
                                   prefix='/tmp/sp_tmp') as model_fp:
    pass  # we just want a prefix'd tmp-filename
  argstr = ' '.join(
      [f'--input={fname}',
       f'--vocab_size={vocab_size}',
       f'--character_coverage={character_coverage}',
       f'--model_prefix={model_fp.name}',
       f'--model_type={model_type}'])
  SentencePieceTrainer.Train(argstr)
  if jax.host_id() == 0:
    # Use an intermediate filename that is renamed to the target name to address
    # create and fill delays.
    copy_rename_path = abs_model_path + '.rntmp'
    tf.io.gfile.copy(model_fp.name + '.model', copy_rename_path, overwrite=True)
    tf.io.gfile.rename(copy_rename_path, abs_model_path, overwrite=True)
    logging.info('copied %s to %s', model_fp.name+'.model', abs_model_path)
  else:
    while not tf.io.gfile.exists(abs_model_path):
      time.sleep(1)
    time.sleep(1)
  return abs_model_path
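dump_chars_to_textfile is used above but not shown in this excerpt. A rough, hypothetical stand-in (the project's real helper may differ) that streams raw example bytes from the tf.data pipeline into a temp file and stops after roughly maxchars characters:

import tempfile

def dump_chars_to_textfile(dataset, maxchars=1e7, data_keys=('inputs', 'targets')):
  """Hypothetical sketch: write raw example bytes to a temp file, up to ~maxchars."""
  char_count = 0
  with tempfile.NamedTemporaryFile(delete=False, prefix='/tmp/ds_chars') as outfp:
    for example in dataset.as_numpy_iterator():
      for k in data_keys:
        line = example[k] + b'\n'  # features are assumed to be raw byte strings
        outfp.write(line)
        char_count += len(line)
      if char_count > maxchars:
        break
  return outfp.name, char_count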
Example #4
def train_subwords(train_path, model_path, model_type, vocab_size):
    temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
    for text, title in parse_ria_json(train_path):
        temp.write(text + "\n")
        temp.write(title + "\n")
    temp.close()
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
        temp.name,
        os.path.join(model_path, model_type),
        vocab_size,
        model_type)
    sp_trainer.Train(cmd)
    os.unlink(temp.name)
Example #5
def train_subwords(train_path, model_path, model_type, vocab_size,
                   config_path):
    temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    for text, summary in reader.parse_set(train_path):
        temp.write(text + "\n")
        temp.write(summary + "\n")
    temp.close()
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
        temp.name, os.path.join(model_path, model_type), vocab_size,
        model_type)
    sp_trainer.Train(cmd)
    os.unlink(temp.name)
Example #6
def train(input_file, opencorpora_file):
    records = []
    with open(input_file, "r") as r:
        next(r)
        reader = csv.reader(r)
        for row in reader:
            _, _, text, _ = row
            text = text.replace("\n", " ").lower()
            nn_count = text.count("нн")
            if nn_count == 1:
                records.append((text, 0))
    with open(opencorpora_file, "r") as r:
        for line in r:
            text = line.strip().lower()
            if "нн" in text:
                records.append((text, 1))
    random.shuffle(records)
    border = int(0.8 * len(records))
    train = records[:border]
    val = records[border:]

    model_path = "subword_model"
    if False:
        temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
        for text, _ in train:
            temp.write(text + "\n")
        temp.close()
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            temp.name, model_path, 30000, "bpe")
        sp_trainer.Train(cmd)
        os.unlink(temp.name)

    processor = sp_processor()
    processor.load(model_path + ".model")
    fixed_train = []
    for text, label in train:
        text = " ".join(tokenize(processor, text))
        fixed_train.append((text, label))
    fixed_val = []
    for text, label in val:
        text = " ".join(tokenize(processor, text))
        fixed_val.append((text, label))

    to_ft_format(fixed_train, "nn_train.txt")
    to_ft_format(fixed_val, "nn_val.txt")
Example #7
    def __spm_create(self):
        if os.path.isfile("data/love.model"):
            return 0

        params = '--input=' + c.data_text_path + \
                 ' --model_type=' + c.model_type[0] + \
                 ' --model_prefix=data/love ' \
                 ' --vocab_size=2507' \
                 ' --max_sentence_length=999999' \
                 ' --character_coverage=1.0' \
                 ' --pad_id=0 --pad_piece=[PAD]' \
                 ' --unk_id=1 --unk_piece=[UNK]' \
                 ' --bos_id=2 --bos_piece=[BOS]' \
                 ' --eos_id=3 --eos_piece=[EOS]' \
                 ' --user_defined_symbols=[SEP],[CLS],[MASK]'

        # character_coverage guidance: 0.9995 for English, 1.0 for Korean
        SentencePieceTrainer.Train(params)
Example #8
def train_sentencepiece(texts:Collection[str], path:PathOrStr, pre_rules: ListRules=None, post_rules:ListRules=None, 
    vocab_sz:int=None, max_vocab_sz:int=30000, model_type:str='unigram', max_sentence_len:int=20480, lang='en',
    char_coverage=None, tmp_dir='tmp'):
    "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
    from sentencepiece import SentencePieceTrainer
    cache_dir = Path(path)/tmp_dir
    os.makedirs(cache_dir, exist_ok=True)
    if vocab_sz is None: vocab_sz=get_default_size(texts, max_vocab_sz)
    raw_text_path = cache_dir / 'all_text.txt'
    with open(raw_text_path, 'w') as f: f.write("\n".join(texts))
    spec_tokens = ['\u2581'+s for s in defaults.text_spec_tok]
    SentencePieceTrainer.Train(" ".join([
        f"--input={raw_text_path} --max_sentence_length={max_sentence_len}",
        f"--character_coverage={ifnone(char_coverage, 1 if lang in full_char_coverage_langs else 0.99)}",
        f"--unk_id={len(defaults.text_spec_tok)} --pad_id=-1 --bos_id=-1 --eos_id=-1",
        f"--user_defined_symbols={','.join(spec_tokens)}",
        f"--model_prefix={cache_dir/'spm'} --vocab_size={vocab_sz} --model_type={model_type}"]))
    return cache_dir
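Newer sentencepiece releases also accept keyword arguments instead of a single flag string; the same training call could be expressed roughly as below (a sketch mirroring the flags above and reusing the variables from this function):

from sentencepiece import SentencePieceTrainer

# Keyword-argument form; requires a reasonably recent sentencepiece release.
SentencePieceTrainer.train(
    input=str(raw_text_path),
    max_sentence_length=max_sentence_len,
    character_coverage=ifnone(char_coverage, 1 if lang in full_char_coverage_langs else 0.99),
    unk_id=len(defaults.text_spec_tok), pad_id=-1, bos_id=-1, eos_id=-1,
    user_defined_symbols=','.join(spec_tokens),
    model_prefix=str(cache_dir/'spm'), vocab_size=vocab_sz, model_type=model_type)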
Example #9
def train(input_file):
    records = []
    with open(input_file, "r") as r:
        next(r)
        reader = csv.reader(r)
        for row in reader:
            _, text, label = row
            text = text.replace("\n", " ").lower()
            tjsya_count = text.count("ться")
            tsya_count = text.count("тся")
            if (tjsya_count != 0 and tsya_count == 0) or (tjsya_count == 0
                                                          and tsya_count != 0):
                records.append((text, label))
    random.shuffle(records)
    border = int(0.8 * len(records))
    train = records[:border]
    val = records[border:]

    model_path = "subword_model"
    if True:
        temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
        for text, _ in train:
            temp.write(text + "\n")
        temp.close()
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            temp.name, model_path, 30000, "bpe")
        sp_trainer.Train(cmd)
        os.unlink(temp.name)

    processor = sp_processor()
    processor.load(model_path + ".model")
    fixed_train = []
    for text, label in train:
        text = " ".join(tokenize(processor, text))
        fixed_train.append((text, label))
    fixed_val = []
    for text, label in val:
        text = " ".join(tokenize(processor, text))
        fixed_val.append((text, label))

    to_ft_format(fixed_train, "grammar_endings_train.txt")
    to_ft_format(fixed_val, "grammar_endings_val.txt")
Example #10
File: codec.py Project: isi-nlp/rtg
 def train(cls,
           model_type: str,
           vocab_size: int,
           model_path: str,
           files: Iterator[str],
           no_split_toks: Optional[List[str]] = None,
           char_coverage: float = 0):
     """
     Train Sentence Piece Model
     :param model_type: sentence piece model type: {unigram, BPE, word, char}
     :param vocab_size: target vocabulary size
     :param model_path: where to store model
     :param files: input files
     :param no_split_toks: Don't split these tokens
      :param char_coverage: character coverage (0, 1]; a value <= 0 falls back to the default coverage of 0.9995
     :return:
     """
     model_prefix = model_path.replace('.model', '')
      files = ','.join(files)  # comma-separated list of input files
     arg = f"--input={files} --vocab_size={vocab_size} --model_prefix={model_prefix}" \
           f" --model_type={model_type} --pad_id={cls.pad_idx} --bos_id={cls.bos_idx}" \
           f" --eos_id={cls.eos_idx} --unk_id={cls.unk_idx} --hard_vocab_limit=false"
     if char_coverage > 0:
         assert 0 < char_coverage <= 1
         arg += f" --character_coverage={char_coverage}"
      # The CLS token goes first among the user-defined symbols because we need it to get index 4.
     extra = [cls.cls_tok] + (no_split_toks or [])
     no_split_toks_str = ','.join(extra)
     arg += f" --user_defined_symbols={no_split_toks_str}"
     if model_type == 'bpe':  # BPE can have longer sentences, default is 2048
         arg += " --max_sentence_length=8192"
     if model_type == 'word':
         arg += ' --use_all_vocab'
     log.info(f"SPM: {arg}")
     SentencePieceTrainer.Train(arg)
     log.info("Training complete")
     if not model_path.endswith('.model'):
         model_path += '.model'
     model = SPField(model_path)
     for piece, idx in cls.reserved():
         assert model.piece_to_id(piece) == idx
     return model
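The class attributes and reserved() helper used above are not shown in this excerpt. A purely hypothetical sketch of what they would need to look like for the assertion loop and the "[CLS] at index 4" comment to hold; the real rtg codec defines its own piece strings and indices elsewhere:

class SPCodecDefaults:
    # Hypothetical piece strings and ids; only the layout (four reserved pieces
    # at ids 0-3, CLS at id 4) is implied by the method above.
    pad_tok, pad_idx = '<pad>', 0
    bos_tok, bos_idx = '<s>', 1
    eos_tok, eos_idx = '</s>', 2
    unk_tok, unk_idx = '<unk>', 3
    cls_tok = '<cls>'

    @classmethod
    def reserved(cls):
        # (piece, expected_id) pairs checked against the trained model
        return [(cls.pad_tok, cls.pad_idx), (cls.bos_tok, cls.bos_idx),
                (cls.eos_tok, cls.eos_idx), (cls.unk_tok, cls.unk_idx),
                (cls.cls_tok, 4)]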
Example #11
def _create_fake_sentencepiece_model(output_dir):
    vocab = ['a', 'b', 'c', 'd', 'e', 'abc', 'def', 'ABC', 'DEF']
    model_prefix = os.path.join(output_dir, 'spm_model')
    input_text_file_path = os.path.join(output_dir, 'train_input.txt')
    with tf.io.gfile.GFile(input_text_file_path, 'w') as f:
        f.write(' '.join(vocab + ['\n']))
    # Add 7 more tokens: <pad>, <unk>, [CLS], [SEP], [MASK], <s>, </s>.
    full_vocab_size = len(vocab) + 7
    flags = dict(model_prefix=model_prefix,
                 model_type='word',
                 input=input_text_file_path,
                 pad_id=0,
                 unk_id=1,
                 control_symbols='[CLS],[SEP],[MASK]',
                 vocab_size=full_vocab_size,
                 bos_id=full_vocab_size - 2,
                 eos_id=full_vocab_size - 1)
    SentencePieceTrainer.Train(' '.join(
        ['--{}={}'.format(k, v) for k, v in flags.items()]))
    return model_prefix + '.model'
Example #12
 def train(model_type: str,
           vocab_size: int,
           model_path: str,
           files: Iterator[str],
           no_split_toks: Optional[List[str]] = None,
           cover_all_chars: bool = False):
     """
     Train Sentence Piece Model
     :param model_type: sentence piece model type: {unigram, BPE, word, char}
     :param vocab_size: target vocabulary size
     :param model_path: where to store model
     :param files: input files
     :param no_split_toks: Don't split these tokens
     :return:
     """
     model_prefix = model_path.replace('.model', '')
      files = ','.join(files)  # comma-separated list of input files
     arg = f"--input={files} --vocab_size={vocab_size} --model_prefix={model_prefix}" \
         f" --model_type={model_type} --pad_id={PAD_TOK[1]} --bos_id={BOS_TOK[1]}" \
         f" --eos_id={EOS_TOK[1]} --unk_id={UNK_TOK[1]} --hard_vocab_limit=false"
     if cover_all_chars:
         arg += f" --character_coverage=1.0"
      # The CLS token goes first among the user-defined symbols because we need it to get index 4.
     cls_tok_str = CLS_TOK[0]
     if no_split_toks:
         no_split_toks_str = ','.join([cls_tok_str] + no_split_toks)
     else:
         no_split_toks_str = cls_tok_str
     arg += f" --user_defined_symbols={no_split_toks_str}"
     if model_type == 'bpe':  # BPE can have longer sentences, default is 2048
         arg += " --max_sentence_length=8192"
     log.info(f"SPM: {arg}")
     SentencePieceTrainer.Train(arg)
     log.info("Training complete")
     if not model_path.endswith('.model'):
         model_path += '.model'
     model = Field(model_path)
     for piece, idx in RESERVED_TOKS:
         assert model.piece_to_id(piece) == idx
     return model
Example #13
 def setUp(self):
   super().setUp()
   # Make a sentencepiece model.
   tmp_dir = self.get_temp_dir()
   tempfile.mkdtemp(dir=tmp_dir)
   vocab = ["a", "b", "c", "d", "e", "abc", "def", "ABC", "DEF"]
   model_prefix = os.path.join(tmp_dir, "spm_model")
   input_text_file_path = os.path.join(tmp_dir, "train_input.txt")
   with tf.io.gfile.GFile(input_text_file_path, "w") as f:
     f.write(" ".join(vocab + ["\n"]))
   # Add 7 more tokens: <pad>, <unk>, [CLS], [SEP], [MASK], <s>, </s>.
   full_vocab_size = len(vocab) + 7
   flags = dict(
       model_prefix=model_prefix,
       model_type="word",
       input=input_text_file_path,
       pad_id=0, unk_id=1, control_symbols="[CLS],[SEP],[MASK]",
       vocab_size=full_vocab_size,
       bos_id=full_vocab_size-2, eos_id=full_vocab_size-1)
   SentencePieceTrainer.Train(
       " ".join(["--{}={}".format(k, v) for k, v in flags.items()]))
   self._spm_path = model_prefix + ".model"
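The test methods that consume self._spm_path are not included in this excerpt; one illustrative way a test could use the model is via tensorflow_text's SentencepieceTokenizer (a sketch, not the original test body):

  def test_tokenize_with_spm_model(self):
    # Illustrative sketch only; assumes tensorflow_text is available.
    import tensorflow_text as tf_text
    with tf.io.gfile.GFile(self._spm_path, "rb") as f:
      model_bytes = f.read()
    tokenizer = tf_text.SentencepieceTokenizer(model=model_bytes)
    self.assertEqual(tokenizer.vocab_size().numpy(), 9 + 7)  # vocab + specials
    print(tokenizer.tokenize(["abc def"]))  # ragged tensor of piece ids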
Example #14
def train_sentencepiece(file_path: str, model_path: str, vocab_size: int,
                        character_coverage: float, model_type: str):
    """Train SentencePiece tokenizer from subset of tf dataset.

  Args:
    file_path: path of data to train sentencepiece.
    model_path: path of model file to save vocab model to.
    vocab_size: size of vocab tokens to train.
    character_coverage: amount of characters covered by the model, good defaults
      are 0.9995 for languages with rich character set like Japanese or Chinese
      and 1.0 for other languages with small character set.
    model_type: type of sentencepiece vocab to train.

  Returns:
    path to the trained sentencepiece vocabulary model.
  """
    argstr = " ".join([
        f"--input={file_path}", f"--vocab_size={vocab_size}",
        f"--character_coverage={character_coverage}",
        f"--model_prefix={model_path}", f"--model_type={model_type}",
        "--bos_id=-1", "--pad_id=0", "--eos_id=1", "--unk_id=2"
    ])
    SentencePieceTrainer.Train(argstr)
Example #15
    def train_model(self, train_config=None):
        '''
        See this notebook for an example of SentencePiece training:
        https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb
        '''
        config = train_config if train_config else self.config
        param = ""
        param += "--input={} ".format(config["corpus"])
        param += "--model_prefix={} ".format(config["model_prefix"])
        param += "--vocab_size={} ".format(config["vocab_size"])
        param += "--model_type={} ".format(config.get("model_type", "unigram"))
        param += "--character_coverage={} ".format(
            config.get("character_coverage", 0.995))
        param += "--mining_sentence_size={} ".format(
            config.get("mining_sentence_size", 5000000))
        param += "--input_sentence_size={} ".format(
            config.get("input_sentence_size", 5000000))
        param += "--max_sentencepiece_length={} ".format(
            config.get("max_sentencepiece_length", 5))
        try:
            SentencePieceTrainer.Train(param)
            self.sp.Load(config["model_prefix"] + ".model")
        except Exception as exc:
            raise ValueError("training sentencepiece model failed") from exc
Example #16
def train_sentencepiece(dataset,
                        vocab_size,
                        maxchars=1e7,
                        character_coverage=1.0,
                        model_path='wmt_model.model',
                        model_type='unigram',
                        data_keys=('inputs', 'targets')):
    """Train SentencePiece tokenizer from subset of tf dataset.

  Args:
    dataset: tf.dataset
    vocab_size: int: size of vocab tokens to train.
    maxchars: int: number of characters to use for sentencepiece training.
    model_path: str: path of model file to save vocab model to.
    model_type: str: type of sentencepiece vocab to train.
    data_keys: Tuple[str]: keys of dataset to use for training.

  Returns:
    path to the trained sentencepiece vocabulary model.
  """
    abs_model_path = os.path.abspath(os.path.expanduser(model_path))
    fname, _ = dump_chars_to_textfile(dataset,
                                      maxchars=maxchars,
                                      data_keys=data_keys)
    with tempfile.NamedTemporaryFile(delete=False,
                                     prefix='/tmp/sp_tmp') as model_fp:
        pass  # we just want a prefix'd tmp-filename
    argstr = ' '.join([
        f'--input={fname}', f'--vocab_size={vocab_size}',
        f'--character_coverage={character_coverage}',
        f'--model_prefix={model_fp.name}', f'--model_type={model_type}'
    ])
    SentencePieceTrainer.Train(argstr)
    tf.io.gfile.copy(model_fp.name + '.model', abs_model_path, overwrite=True)
    logging.info('copied %s to %s', model_fp.name + '.model', abs_model_path)
    return abs_model_path
Example #17
#encoding: utf-8

# portal from fairseq: https://github.com/pytorch/fairseq/blob/master/scripts/spm_train.py

import sys
from sentencepiece import SentencePieceTrainer

if __name__ == "__main__":
	SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
Example #18
    def preprocess(self,
                   directory: str,
                   prefix: str,
                   part: str,
                   spm_model: SentencePieceProcessor = None,
                   pretrain_emb=True,
                   vocab_size=3000,
                   embedding_size=600,
                   max_sentence_length=16384,
                   workers=3,
                   skip_gramm=False):

        # Check data files existing
        workdir = os.path.join(directory, prefix)
        os.makedirs(workdir, exist_ok=True)

        data_part_file = os.path.join(directory, part + ".tsv")
        if not os.path.exists(data_part_file):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    data_part_file)

        if part not in ['train', 'develop']:
            assert spm_model is not None, "For non train part, `spm_model` must be specified."
        else:
            # Train sentencepiece:
            logging.info("Start training sentencepiece")
            spm_directory = os.path.join(workdir, "spm")
            spm_params = (
                "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                "--input={} --model_prefix={} --vocab_size={} --max_sentence_length={}"
                .format(data_part_file, spm_directory, vocab_size,
                        max_sentence_length))
            SentencePieceTrainer.Train(spm_params)
            spm_model = SentencePieceProcessor()
            spm_model.load(spm_directory + ".model")

            if pretrain_emb:
                # Train word2vec
                logging.info("Start training word2vec")
                train_senteces = SentenceIterator(data_part_file, spm_model)
                logging.info("Loaded train sentences")
                w2v_model = Word2Vec(train_senteces,
                                     min_count=0,
                                     workers=workers,
                                     vector_size=embedding_size,
                                     sg=int(skip_gramm))
                w2v_model_filename = os.path.join(workdir, "word2vec.model")
                w2v_model.save(w2v_model_filename)

                # Export embeddings
                logging.info("Export embeddings")
                embeddings_filename = os.path.join(workdir, "embedding.npy")
                export_embeddings(embeddings_filename, spm_model, w2v_model)
                logging.info("Embeddings have been saved into {}".format(
                    embeddings_filename))

        logging.info("Start exporting data file")
        source_file_name = os.path.join(directory, part + ".tsv")
        exported_file_name = os.path.join(workdir, part + ".npy")
        sentence_iterator = SentenceIterator(source_file_name, spm_model)
        sentence_iterator.export(exported_file_name)
        logging.info("{} exported".format(exported_file_name))
        logging.info("Data preprocessing completed")
Example #19
    def preprocess(directory: str,
                   prefix: str,
                   part: str,
                   spm: SentencePieceProcessor = None,
                   pretrain_emb=True,
                   vocab_size=30000,
                   embedding_size=300,
                   max_sentence_length=16384,
                   workers=3,
                   skip_gramm=False):
        """Preprocess dataset.

        Args:
            directory (str): Dataset directory.
            prefix (str): Dataset preprocessing prefix.
            part (str): Dataset part. :attr:`directory` must contain :attr:`part`.tsv file with data.
            spm (SentencePieceProcessor, optional): Defaults to None. Sentecepiece model.
            pretrain_emb (bool, optional): Defaults to True. Whether to pretrain embeddings.
            vocab_size (int, optional): Defaults to 30000. Vocabulary size.
            embedding_size (int, optional): Defaults to 300. Pretrained embedding size.
            max_sentence_length (int, optional): Defaults to 16384. Maximum sentence length for sentencepiece.
            workers (int, optional): Defaults to 3. Number of workers.
            skip_gramm (bool, optional): Defaults to False. Whether to use skip-gram type of Word2Vec training.

        Raises:
            FileNotFoundError: Raises if source data file doesn't exist.
        """

        data_workdir = os.path.join(directory, prefix)
        part_source_filename = os.path.join(directory, part + ".tsv")
        part_exported_filename = os.path.join(data_workdir, part + ".npy")
        spm_filename = os.path.join(data_workdir, "spm.model")
        spm_directory = os.path.join(data_workdir, "spm")
        w2v_model_filename = os.path.join(data_workdir, "word2vec.model")
        embeddings_filename = os.path.join(data_workdir, "embedding.npy")

        logger.info("Preprocess {}/{} dataset.".format(data_workdir, part))
        os.makedirs(data_workdir, exist_ok=True)

        if not os.path.exists(part_source_filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), part_source_filename)

        if part not in ["train", "dev"]:
            assert spm is not None, "For non train part, `spm` must be specified."
        else:
            logger.info("Start training sentencepiece")
            spm_params = (
                "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                "--input={} --model_prefix={} --vocab_size={} --max_sentence_length={}".format(
                    part_source_filename,
                    spm_directory,
                    vocab_size,
                    max_sentence_length
                )
            )
            SentencePieceTrainer.Train(spm_params)
            spm = SentencePieceProcessor()
            spm.load(spm_filename)

            if pretrain_emb:
                logger.info("Start training Word2Vec embeddings")

                train_senteces = SentenceIterator(part_source_filename, spm)
                logger.info("Loaded train sentences")
                w2v_model = Word2Vec(train_senteces, min_count=0, workers=workers,
                                     size=embedding_size, sg=int(skip_gramm))
                w2v_model.save(w2v_model_filename)

                # Export embeddings
                logger.info("Export embeddings")
                export_embeddings(embeddings_filename, spm, w2v_model)
                logger.info("Embeddings have been saved into {}".format(embeddings_filename))

        logger.info("Start exporting data file")
        sentence_iterator = SentenceIterator(part_source_filename, spm)
        sentence_iterator.export(part_exported_filename)
        logger.info("{} exported".format(part_exported_filename))
Example #20
                text = re.sub('─', '─', text)
                text = re.sub('•', '•', text)
                text = re.sub('☆', '☆', text)
                text = re.sub('’', '’', text)
                text = re.sub('‎', '', text)
                text = re.sub('Ñ€', 'p', text)
                out.write(item[1:-1] + '\n')
            tq.update()


def fetch_text(text_file, loop=None):
    if loop is None:
        loop = asyncio.get_event_loop()
    torrent = database.Torrent('207.148.124.42', loop=loop)
    queue = asyncio.Queue(10000)
    loop.run_until_complete(
        asyncio.gather(torrent.fetch_text(queue), write(queue, text_file)))


if __name__ == '__main__':
    # fetch_text(sys.argv[1])
    Trainer.Train(
        f'--input={sys.argv[1]} --model_prefix={sys.argv[2]} --vocab_size={sys.argv[3]}'
    )

    tokenizer = Tokenizer()
    tokenizer.Load('spm.model')
    with open(sys.argv[1]) as input:
        for line in input:
            print(line)
            print(tokenizer.encode_as_pieces(line))
Example #21
def main(argv):
    SentencePieceTrainer.Train(' '.join(argv[1:]))
Example #22
    def _get_vocab_sz(self, raw_text_path):
        cnt = Counter()
        with open(raw_text_path, 'r') as f:
            for line in f.readlines():
                cnt.update(line.split())
                if len(cnt)//4 > self.max_vocab_sz: return self.max_vocab_sz
        res = len(cnt)//4
        while res%8 != 0: res+=1
        return res

    def train(self, raw_text_path):
        "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
        from sentencepiece import SentencePieceTrainer
        vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz
        spec_tokens = ['\u2581'+s for s in self.special_toks]
        SentencePieceTrainer.Train(" ".join([
            f"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}",
            f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
            f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1",
            f"--user_defined_symbols={','.join(spec_tokens)}"]))
        raw_text_path.unlink()
        return self.cache_dir/'spm.model'

    def setup(self, items, rules):
        if self.tok is not None: return {'sp_model': self.sp_model}
        raw_text_path = self.cache_dir/'texts.out'
        with open(raw_text_path, 'w') as f:
            for t in progress_bar(apply_rules(items, rules), total=len(items), leave=False):
                f.write(f'{t}\n')
        return {'sp_model': self.train(raw_text_path)}

    def pipe(self, items):
        for t in items: yield self.tok.EncodeAsPieces(t)
Example #23
from glob import glob

from sentencepiece import SentencePieceTrainer


NUM_THREADS = 24
VOCABSIZE = 30_001
NUM_SENTS = 100_000_000


SOURCE_PATH = '/home/s2971992/Bertje/clean-data-v2/*/*.txt'
# SOURCE_PATH = '/Volumes/Data/Corpora/DutchWebNews/clean/*.txt'

input_paths = list(glob(SOURCE_PATH))
input_path = ','.join(input_paths)

print('Total number of files: {}'.format(len(input_paths)))

cmd = '--input={} --vocab_size={} --num_threads={} --input_sentence_size={} --shuffle_input_sentence=true --model_type=unigram --split_by_number=false --split_by_unicode_script=false --model_prefix=dutch --bos_piece=[CLS] --eos_piece=[SEP] --unk_piece=[UNK] --control_symbols=[PAD],[MASK]'.format(
    input_path, VOCABSIZE, NUM_THREADS, NUM_SENTS
)
trainer = SentencePieceTrainer.Train(cmd)
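Since the model above is a unigram model, the trained tokenizer can also produce sampled segmentations (subword regularization). A small usage sketch, assuming the dutch.model file written by the call above:

import sentencepiece as spm

# SentencePieceProcessor(model_file=...) requires a reasonably recent sentencepiece.
sp = spm.SentencePieceProcessor(model_file='dutch.model')

text = 'Dit is een voorbeeldzin.'
print(sp.encode(text, out_type=str))  # deterministic (Viterbi) segmentation
for _ in range(3):
    # Sampled segmentations; alpha and nbest_size control the sampling.
    print(sp.encode(text, out_type=str, enable_sampling=True,
                    alpha=0.1, nbest_size=-1))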
Example #24
def main():
    parser = argparse.ArgumentParser(description="Create vocabulary")
    parser.add_argument("--dataset_dir", type=str)
    parser.add_argument("--model_prefix", default="tokenizer", type=str)
    parser.add_argument("--num_placeholders", default=100, type=int)
    parser.add_argument("--sample_size", default=1e7, type=int)
    parser.add_argument("--train_path", type=str)
    parser.add_argument("--vocab_filename", default="vocab.txt", type=str)
    parser.add_argument("--vocab_size", default=32000, type=int)
    args = parser.parse_args()

    if args.dataset_dir is not None and args.train_path is not None:
        print("Only one of 'dataset_dir' and 'train_path' can be specified")
        return
    elif args.dataset_dir is not None:
        # If the dataset is distributed across multiple files, merge into one
        # file before proceeding
        # filepaths = glob.glob(os.path.join(args.dataset_dir, "**", "*.txt"))
        filepaths = glob.glob(os.path.join(args.dataset_dir, "*.txt"))
        print(
            "Found {} files, concatenenating dataset into one file...".format(
                len(filepaths)))

        with open(MERGED_FILE, "w") as f:
            for filepath in tqdm(filepaths):
                f.write(open(filepath, "r", errors="ignore").read())

        train_path = MERGED_FILE
    elif args.train_path is not None:
        train_path = args.train_path
    else:
        print("One of 'dataset_dir' and 'train_path' must be specified")
        return

    SPT.Train("--input={} ".format(train_path) +
              "--model_prefix={} ".format(args.model_prefix) +
              "--vocab_size={} ".format(args.vocab_size -
                                        args.num_placeholders) +
              "--input_sentence_size={} ".format(args.sample_size) +
              "--shuffle_input_sentence=true " + "--hard_vocab_limit=false " +
              "--bos_id=-1 " + "--eos_id=-1")

    # Add BERT control symbols
    vocab = ["[PAD]"]
    tokens = []

    with open("{}.vocab".format(args.model_prefix), "r") as f:
        # Skip first <unk> token
        f.seek(8)

        # Read tokens from each line and parse for vocab
        for line in f:
            piece = line.split("\t")[0]

            if piece.startswith("▁"):
                token = piece[1:]
            else:
                token = "##{}".format(piece)

            tokens.append(token)

    vocab.extend(
        ["[unused{}]".format(i) for i in range(args.vocab_size - len(tokens))])
    vocab.extend(["[UNK]", "[CLS]", "[SEP]", "[MASK]"])
    vocab.extend(tokens)

    # Save vocabulary to output file
    with open(args.vocab_filename, "w") as f:
        for token in vocab:
            f.write("{}\n".format(token))
Example #25
def main():
    options = parse_args()
    torch.manual_seed(options.seed)
    basename = os.path.splitext(os.path.basename(options.input))[0]
    out_dir = options.out_dir or "data/{}/".format(basename)
    spinner = Halo(spinner="dots", placement="right")

    with open(options.input, "r", encoding="utf8") as fd:
        reader = csv.reader(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
        lines = [[line[0]] for line in reader]

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    output_full = os.path.join(out_dir, "{}.tsv".format(basename))
    with open(output_full, "w", encoding="utf8") as fd:
        writer = csv.writer(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
        writer.writerows(lines)

    vocab_size = 32000
    spiece_out = os.path.join(out_dir, "spiece")
    spiece_args = (
        "--input={} "
        "--model_prefix={} "
        "--vocab_size={} "
        "--character_coverage=1.0"
    ).format(output_full, spiece_out, vocab_size)
    SentencePieceTrainer.Train(spiece_args)
    # Load the generated vocabulary
    with open("{}.vocab".format(spiece_out), "r", encoding="utf8") as fd:
        reader = csv.reader(
            fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
        )
        vocab = [line[0] for line in reader]
    # Remove the special tokens <unk>, <s>, </s>
    vocab = vocab[3:]

    # Convert to BERT style
    bert_vocab = [
        v[1:] if v.startswith("▁") else "##{}".format(v) for v in vocab if v != "▁"
    ]
    # Add BERT's special tokens to the beginning
    bert_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + bert_vocab
    # Fill up with unused tokens
    pad_size = vocab_size - len(bert_vocab)
    bert_vocab += ["unused{}".format(i) for i in range(pad_size)]
    with open(os.path.join(out_dir, "vocab.txt"), "w", encoding="utf8") as fd:
        writer = csv.writer(
            fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
        )
        writer.writerows([[b] for b in bert_vocab])

    # Convert to GPT-2 style
    # Unfortunately it's slow and tedious.
    spinner.start(text="Generating BPE vocabulary")
    gpt2_vocab = ["Ġ{}".format(v[1:]) if v.startswith("▁") else v for v in vocab]
    # Add the GPT-2 special token to the end
    gpt2_vocab.append("<|endoftext|>")
    with open(os.path.join(out_dir, "vocab.json"), "w", encoding="utf8") as fd:
        json.dump({v: i for i, v in enumerate(gpt2_vocab)}, fd, ensure_ascii=False)
    spiece_processor = SentencePieceProcessor()
    spiece_processor.Load("{}.model".format(spiece_out))
    # Encode the whole text
    encoded = [
        [" ".join(spiece_processor.EncodeAsPieces(line[0])).replace("▁", "Ġ")]
        for line in lines
    ]
    tmp_encoded_fd, tmp_encoded_path = tempfile.mkstemp()
    tmp_bpe_fd, tmp_bpe_path = tempfile.mkstemp()
    try:
        # Write the encoded text to a temporary file.
        with os.fdopen(tmp_encoded_fd, "w", encoding="utf8") as fd:
            writer = csv.writer(
                fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
            )
            writer.writerows(encoded)
        learn_bpe(
            open(tmp_encoded_path, "r", encoding="utf8"),
            open(tmp_bpe_path, "w", encoding="utf8"),
            num_symbols=vocab_size,
        )
        with open(tmp_bpe_path, "r", encoding="utf8") as fd:
            reader = csv.reader(
                fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
            )
            seen = set()
            merges = []
            for line in reader:
                # Get rid of the </w> tokens
                line = line[0].replace("</w>", "")
                # Remove duplicates (due to </w> tokens)
                if line not in seen:
                    seen.add(line)
                    merges.append([line])
        with open(os.path.join(out_dir, "merges.txt"), "w", encoding="utf8") as fd:
            writer = csv.writer(
                fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
            )
            writer.writerows(merges)
    finally:
        os.remove(tmp_encoded_path)
        os.remove(tmp_bpe_path)
    spinner.stop()
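The vocab.json / merges.txt pair written above mirrors the GPT-2 tokenizer file format. A brief verification sketch (not part of the original script, reusing out_dir from main()) that loads the files with Hugging Face's GPT2Tokenizer to confirm they parse; because the merges come from learn_bpe rather than from the SentencePiece model itself, segmentations will not match the spiece model exactly:

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer(vocab_file=os.path.join(out_dir, "vocab.json"),
                    merges_file=os.path.join(out_dir, "merges.txt"))
print(len(tok))  # roughly vocab_size + 1 (for <|endoftext|>)
print(tok.tokenize("a quick sanity check"))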