コード例 #1
0
def test_token_copy():
    """Constructing a Token from another Token makes an independent copy:
    mutating the copy's surface must not affect the original."""
    original = pyonmttok.Token("a")
    copy = pyonmttok.Token(original)
    assert copy.surface == "a"
    copy.surface = "b"
    assert copy.surface == "b"
    assert original.surface == "a"
コード例 #2
0
ファイル: test.py プロジェクト: innerNULL/Tokenizer
def test_token_dict():
    """Tokens can be dict keys; hashing follows token equivalence, so a
    freshly constructed equal Token finds the same entry."""
    keys = [
        pyonmttok.Token("a"),
        pyonmttok.Token("b", join_left=True),
        pyonmttok.Token("c", features=["X"]),
    ]
    collection = {token: index for index, token in enumerate(keys)}

    assert pyonmttok.Token("d") not in collection
    for index, token in enumerate(keys):
        assert collection[token] == index
        # A copy of the key hashes identically (equivalence-based hashing).
        assert collection[pyonmttok.Token(token)] == index
コード例 #3
0
ファイル: tu.py プロジェクト: celestialized/nmt-wizard-docker
 def append(self, other):
     """Append the content of *other* (another translation-unit-like object)
     onto this one, merging either detokenized text or token lists.

     Returns True when something was appended, False when *other* carries
     nothing usable or when a multi-part token list makes merging ambiguous.
     """
     other_token_objects = other.tok.token_objects
     if other_token_objects is None:
         # No token objects: fall back to joining detokenized strings.
         if not other.detok:
             return False
         # Insert the optional output delimiter (space-separated) between
         # the two detokenized segments.
         self.detok = (self.detok +
                       ((' ' + other.output_delimiter)
                        if other.output_delimiter is not None else '') +
                       ' ' + other.detok)
     else:
         # Token-level merge path.
         if not other_token_objects[0]:
             return False
         tok = self.tok
         tokenizer = tok.tokenizer
         token_objects = tok.token_objects
         if token_objects is None:
             # Start a fresh single-part token list.
             token_objects = [[]]
         elif len(token_objects) > 1:
             # Multiple parts: merging target is ambiguous, refuse.
             return False
         elif token_objects[0] and other.output_delimiter is not None:
             # Separate existing tokens from the appended ones with the
             # delimiter as its own Token.
             token_objects[0].append(pyonmttok.Token(
                 other.output_delimiter))
         token_objects[0].extend(other_token_objects[0])
         # NOTE(review): self.tok appears to be a property accepting a
         # (tokenizer, token_objects) pair — confirm against the class.
         self.tok = (tokenizer, token_objects)
     return True
コード例 #4
0
ファイル: test.py プロジェクト: innerNULL/Tokenizer
def test_bpe_learner_tokens(tmpdir):
    """The BPE learner accepts both raw strings and Token objects through
    ingest_token, and both contribute to the learned model."""
    tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True)
    learner = pyonmttok.BPELearner(tokenizer=tokenizer, symbols=2, min_frequency=1)
    learner.ingest_token("ab■")
    learner.ingest_token(pyonmttok.Token("cd"))
    model_path = str(tmpdir.join("bpe.model"))
    learner.learn(model_path)
    with open(model_path) as model_file:
        content = model_file.read()
    assert content == "#version: 0.2\na b</w>\nc d</w>\n"
コード例 #5
0
def test_token_repr():
    """repr() of a Token lists only the attributes that were set."""
    assert repr(pyonmttok.Token()) == "Token()"
    assert repr(pyonmttok.Token("Hello")) == "Token('Hello')"

    full_token = pyonmttok.Token(
        "Hello",
        type=pyonmttok.TokenType.LEADING_SUBWORD,
        casing=pyonmttok.Casing.MIXED,
        join_right=True,
        join_left=True,
        preserve=True,
        features=["X", "Y"],
    )
    expected = (
        "Token('Hello', "
        "type=TokenType.LEADING_SUBWORD, "
        "join_left=True, "
        "join_right=True, "
        "preserve=True, "
        "features=['X', 'Y'], "
        "casing=Casing.MIXED)"
    )
    assert repr(full_token) == expected
コード例 #6
0
ファイル: test.py プロジェクト: keichi/Tokenizer
def test_token_pickle():
    """A fully populated Token survives a pickle round trip and compares
    equal to the original."""
    original = pyonmttok.Token(
        "Hello",
        type=pyonmttok.TokenType.LEADING_SUBWORD,
        casing=pyonmttok.Casing.MIXED,
        join_right=True,
        join_left=True,
        preserve=True,
        features=["X", "Y"],
    )
    restored = pickle.loads(pickle.dumps(original))
    assert original == restored
コード例 #7
0
ファイル: test.py プロジェクト: panosk/Tokenizer
def test_token_length():
    """len() of a Token counts the characters of its surface string."""
    token = pyonmttok.Token("測試")
    assert len(token) == 2