Example 1
def make_subword_learner(subword_config, subword_dir, tokenizer=None):
    params = subword_config.get("params")
    if params is None:
        raise ValueError(
            "'params' field should be specified for subword model learning.")
    subword_type = subword_config.get("type")
    if subword_type is None:
        raise ValueError(
            "'type' field should be specified for subword model learning.")
    vocab_size = params.get("vocab_size")
    if vocab_size is None:
        raise ValueError(
            "'vocab_size' parameter should be specified for subword model learning."
        )

    if subword_type == "bpe":
        learner = pyonmttok.BPELearner(
            tokenizer=tokenizer,
            symbols=vocab_size,
            min_frequency=params.get("min-frequency", 0),
            total_symbols=params.get("total_symbols", False),
        )
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(tokenizer=tokenizer, **params)
    else:
        raise ValueError("Invalid subword type : '%s'." % subword_type)

    return {
        "learner": learner,
        "subword_type": subword_type,
        "size": vocab_size
    }
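As a usage sketch for the function above (the config keys mirror the lookups in the code; the file names are hypothetical, and subword_dir is unused in this variant):

import pyonmttok

config = {"type": "bpe", "params": {"vocab_size": 32000, "min-frequency": 2}}
info = make_subword_learner(config, "subword")  # second argument is ignored here
info["learner"].ingest_file("train.txt")        # hypothetical training corpus
info["learner"].learn("bpe.model")              # writes the learned merge table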
Example 2
def make_subword_learner(subword_config, subword_dir):

    if 'params' not in subword_config:
        raise RuntimeError(
            'Parameter field \'params\' should be specified for subword model learning.'
        )
    params = subword_config['params']

    if 'type' not in subword_config:
        raise RuntimeError(
            '\'type\' field should be specified for subword model learning.')
    subword_type = subword_config['type']

    if 'vocab_size' not in params:
        raise RuntimeError(
            '\'vocab_size\' should be specified for subword model learning.')
    size = params['vocab_size']

    learner = None
    if subword_type == "bpe":
        min_frequency = params.get('min-frequency', 0)
        total_symbols = params.get('total_symbols', False)
        # If no tokenizer is specified, the default tokenizer is space mode.
        learner = pyonmttok.BPELearner(symbols=size,
                                       min_frequency=min_frequency,
                                       total_symbols=total_symbols)
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(**params)
    else:
        raise RuntimeError("Invalid subword type: '%s'." % subword_type)

    return {"learner": learner, "subword_type": subword_type, "size": size}
Example 3
def make_subword_learner(subword_config, subword_dir):
    params = subword_config.get('params')
    if params is None:
        raise ValueError('\'params\' field should be specified for subword model learning.')
    subword_type = subword_config.get('type')
    if subword_type is None:
        raise ValueError('\'type\' field should be specified for subword model learning.')
    vocab_size = params.get('vocab_size')
    if vocab_size is None:
        raise ValueError('\'vocab_size\' parameter should be specified for subword model learning.')

    if subword_type == "bpe":
        learner = pyonmttok.BPELearner(
            symbols=vocab_size,
            min_frequency=params.get('min-frequency', 0),
            total_symbols=params.get('total_symbols', False))
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(**params)
    else:
        raise ValueError('Invalid subword type: \'%s\'.' % subword_type)

    return {
        "learner": learner,
        "subword_type": subword_type,
        "size": vocab_size
    }
Example 4
def test_bpe_learner_tokens(tmpdir):
    learner = pyonmttok.BPELearner(symbols=2, min_frequency=1)
    learner.ingest_token("hello")
    learner.ingest_token("world")
    model_path = str(tmpdir.join("bpe.model"))
    learner.learn(model_path)
    with open(model_path) as model:
        assert model.read() == "#version: 0.2\ne l\nel l\n"
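The asserted file content is a plain BPE merge table: a "#version: 0.2" header followed by one merge per line. As a sketch of reuse, the saved model can be loaded back with the bpe_model_path argument of pyonmttok.Tokenizer (the "conservative" mode here is an assumption, not part of the test):

tokenizer = pyonmttok.Tokenizer("conservative", bpe_model_path=model_path)
tokens, _ = tokenizer.tokenize("hello")  # applies the learned merges e+l, then el+l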
Example 5
def test_bpe_learner_tokens(tmpdir):
    tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True)
    learner = pyonmttok.BPELearner(tokenizer=tokenizer, symbols=2, min_frequency=1)
    learner.ingest_token("ab■")
    learner.ingest_token("cd")
    model_path = str(tmpdir.join("bpe.model"))
    learner.learn(model_path)
    with open(model_path) as model:
        assert model.read() == "#version: 0.2\na b</w>\nc d</w>\n"
Example 6
def test_bpe_learner(tmpdir):
    tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True)
    learner = pyonmttok.BPELearner(tokenizer=tokenizer, symbols=2, min_frequency=1)
    learner.ingest("hello world")
    model_path = str(tmpdir.join("bpe.model"))
    tokenizer = learner.learn(model_path)
    with open(model_path) as model:
        assert model.read() == "#version: 0.2\ne l\nel l\n"
    tokens, _ = tokenizer.tokenize("hello")
    assert tokens == ["h■", "ell■", "o"]
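As a brief follow-up, the "■" joiners in the output can be reversed with detokenize, so the round trip recovers the original text:

assert tokenizer.detokenize(["h■", "ell■", "o"]) == "hello"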
Example 7
    def tokenize(self):
        print('Tokenizing and training BPE model')
        
        tokenizer_default = pyonmttok.Tokenizer(**defaults.tokenizer["args"])
        learner = pyonmttok.BPELearner(tokenizer=tokenizer_default, symbols=defaults.tokenizer["symbols"])

        # load training corpus
        learner.ingest_file(path.join('data', 'toy-ende', 'src-train.txt'))

        # learn and store bpe model
        tokenizer = learner.learn(path.join('data', 'toy-ende', 'run', 'subwords.bpe'))

        # tokenize corpus and save results
        for data_file in ['src-train', 'src-test', 'src-val']:
            data_file = path.join('data', 'toy-ende', data_file) 
            tokenizer.tokenize_file(f'{data_file}.txt', f'{data_file}.bpe')
        
        return
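The method above depends on an external defaults module that is not shown. A minimal stand-in might look like the following; every value is hypothetical, since the original project's tokenizer options are unknown:

# Hypothetical stand-in for the 'defaults' module referenced above.
tokenizer = {
    "args": {"mode": "aggressive", "joiner_annotate": True},
    "symbols": 32000,
}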
Example 8
def learn_bpe(tok_config, bpe_model, symbols=32000, files=[]):
    tokenizer = onmttok(tok_config)
    learner = pyonmttok.BPELearner(tokenizer=tokenizer.tokenizer,
                                   symbols=symbols)

    if len(files):
        for f in files:
            sys.stderr.write('Ingest file={}\n'.format(f))
            sys.stderr.flush()
            learner.ingest_file(f)
    else:
        sys.stderr.write('Ingest stdin\n')
        sys.stderr.flush()
        for l in sys.stdin:
            learner.ingest(l)
    sys.stderr.write('Learning {}\n'.format(bpe_model))
    sys.stderr.flush()
    learner.learn(bpe_model)
Example 9
def learn_model_for_file(file_path):
    self_learner = pyonmttok.BPELearner(tokenizer=tokenizer,
                                        symbols=args.symbols)
    self_learner.ingest_file(file_path)
    name = "".join(file_path.split("/")[-1].split(".")[:-1])
    self_learner.learn("{}_{}".format(args.output, name))
Example 10
        required=True)
    parser.add_argument(
        '--share_src',
        action='store_true',
        help='use all files in src directory to train the same model',
        required=False)
    parser.add_argument('--symbols',
                        action='store',
                        type=int,
                        help='amount of symbols to use',
                        required=False,
                        default=32000)

    args = parser.parse_args()
    tokenizer = pyonmttok.Tokenizer('conservative', joiner_annotate=True)
    learner = pyonmttok.BPELearner(tokenizer=tokenizer, symbols=args.symbols)

    print("Learning BPE model(s)...")
    if os.path.isdir(args.source):
        if args.share_src:
            iterate_files_in_dir(args.source, learner.ingest_file)
            learner.learn(args.output)
        else:

            def learn_model_for_file(file_path):
                self_learner = pyonmttok.BPELearner(tokenizer=tokenizer,
                                                    symbols=args.symbols)
                self_learner.ingest_file(file_path)
                name = "".join(file_path.split("/")[-1].split(".")[:-1])
                self_learner.learn("{}_{}".format(args.output, name))
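The iterate_files_in_dir helper used in the shared-source branch is not shown; a plausible implementation (name and behavior are assumptions based on the call site) would be:

import os

def iterate_files_in_dir(directory, callback):
    # Apply callback to every regular file directly inside directory.
    for name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, name)
        if os.path.isfile(file_path):
            callback(file_path)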
Example 11
                                             character_coverage=0.98)
    assert isinstance(learner, pyonmttok.SubwordLearner)
    learner.ingest("hello word! how are you?")
    model_path = str(tmpdir.join("sp"))
    tokenizer = learner.learn(model_path)
    if keep_vocab:
        assert os.path.exists(model_path + ".model")
        assert os.path.exists(model_path + ".vocab")
    else:
        assert os.path.exists(model_path)
    tokens, _ = tokenizer.tokenize("hello")
    assert tokens == ["▁h", "e", "l", "l", "o"]


@pytest.mark.parametrize("learner", [
    pyonmttok.BPELearner(symbols=2, min_frequency=1),
    pyonmttok.SentencePieceLearner(vocab_size=17, character_coverage=0.98)
])
def test_learner_with_invalid_files(tmpdir, learner):
    with pytest.raises(ValueError):
        learner.ingest_file("notfound.txt")
    learner.ingest("hello word ! how are you ?")
    directory = tmpdir.join("directory")
    directory.ensure(dir=True)
    with pytest.raises(Exception):
        learner.learn(str(directory))


def test_token_api():
    tokenizer = pyonmttok.Tokenizer("aggressive",
                                    joiner_annotate=True,
Example 12
        elif tok == "-min_frequency" and len(sys.argv):
            min_frequency = int(sys.argv.pop(0))
        else:
            sys.stderr.write('error: unparsed {} option\n'.format(tok))
            sys.stderr.write("{}\n".format(usage))
            sys.exit()

    if fout is None:
        sys.stderr.write('option -o must be used\n')
        sys.stderr.write("{}\n".format(usage))
        sys.exit()

    create_logger()

    l = pyonmttok.BPELearner(tokenizer=t.get_tokenizer(),
                             symbols=symbols,
                             min_frequency=min_frequency)

    if len(fin) == 0:
        logging.info('Read stdin')
        for line in sys.stdin:
            l.ingest(str(line.strip('\n')))
    else:
        for f in fin:
            logging.info('Read {}'.format(f))
            l.ingest_file(f)

    logging.info('learning... symbols={} min_frequency={}'.format(
        symbols, min_frequency))
    l.learn(fout)
    logging.info('Done')