def test_token_copy():
    """Constructing a Token from another Token yields an independent copy."""
    original = pyonmttok.Token("a")
    duplicate = pyonmttok.Token(original)
    assert duplicate.surface == "a"

    # Changing the copy's surface must not leak back into the source token.
    duplicate.surface = "b"
    assert duplicate.surface == "b"
    assert original.surface == "a"
def test_token_dict():
    """Tokens are usable as dictionary keys, and copies hit the same entry."""
    plain = pyonmttok.Token("a")
    joined = pyonmttok.Token("b", join_left=True)
    featured = pyonmttok.Token("c", features=["X"])
    absent = pyonmttok.Token("d")

    keys = (plain, joined, featured)
    collection = {key: index for index, key in enumerate(keys)}

    assert absent not in collection
    for index, key in enumerate(keys):
        assert collection[key] == index
        # Hashing is based on token equivalence, so a fresh copy of the key
        # must resolve to the same value.
        assert collection[pyonmttok.Token(key)] == index
def append(self, other):
    """Append the content of ``other`` to this object.

    Two paths exist, selected by ``other.tok.token_objects``:

    * ``None``: the detokenized strings are concatenated. ``self.detok``
      becomes ``self.detok``, then ``" " + other.output_delimiter`` when a
      delimiter is set, then ``" " + other.detok``.
    * otherwise: the first inner list of ``other``'s token objects is added
      to the first inner list of this object's token objects. A
      ``pyonmttok.Token`` built from ``other.output_delimiter`` is inserted
      in between when this object already holds tokens and a delimiter is
      set. ``self.tok`` is then assigned a ``(tokenizer, token_objects)``
      tuple.

    Returns:
        ``False`` when nothing was appended (``other`` is empty, or this
        object holds more than one inner token list), ``True`` otherwise.
    """
    incoming = other.tok.token_objects

    if incoming is None:
        # Detokenized path: plain string concatenation.
        if not other.detok:
            return False
        delimiter = other.output_delimiter
        separator = '' if delimiter is None else ' ' + delimiter
        self.detok = self.detok + separator + ' ' + other.detok
        return True

    # Tokenized path: only the first inner list of each side is merged.
    if not incoming[0]:
        return False

    current = self.tok
    tokenizer = current.tokenizer
    merged = current.token_objects
    if merged is None:
        merged = [[]]
    elif len(merged) > 1:
        return False
    elif merged[0] and other.output_delimiter is not None:
        # The delimiter is only inserted between two non-empty sides.
        merged[0].append(pyonmttok.Token(other.output_delimiter))

    merged[0].extend(incoming[0])
    # NOTE(review): assigning a tuple here presumably goes through a property
    # setter on the enclosing class; confirm against the class definition.
    self.tok = (tokenizer, merged)
    return True
def test_bpe_learner_tokens(tmpdir):
    """A BPELearner accepts single tokens given as strings or Token objects."""
    learner = pyonmttok.BPELearner(
        tokenizer=pyonmttok.Tokenizer("aggressive", joiner_annotate=True),
        symbols=2,
        min_frequency=1,
    )

    # One token is ingested as a raw string, the other as a Token instance.
    learner.ingest_token("ab■")
    learner.ingest_token(pyonmttok.Token("cd"))

    model_path = str(tmpdir.join("bpe.model"))
    learner.learn(model_path)

    expected = "#version: 0.2\na b</w>\nc d</w>\n"
    with open(model_path) as model_file:
        contents = model_file.read()
    assert contents == expected
def test_token_repr():
    """repr() of a Token shows the surface and the attributes that were set."""
    assert repr(pyonmttok.Token()) == "Token()"
    assert repr(pyonmttok.Token("Hello")) == "Token('Hello')"

    attributes = dict(
        type=pyonmttok.TokenType.LEADING_SUBWORD,
        casing=pyonmttok.Casing.MIXED,
        join_right=True,
        join_left=True,
        preserve=True,
        features=["X", "Y"],
    )
    token = pyonmttok.Token("Hello", **attributes)

    # The expected fields are listed in the order the repr is asserted to
    # emit them, which differs from the constructor keyword order above.
    expected_fields = [
        "'Hello'",
        "type=TokenType.LEADING_SUBWORD",
        "join_left=True",
        "join_right=True",
        "preserve=True",
        "features=['X', 'Y']",
        "casing=Casing.MIXED",
    ]
    assert repr(token) == "Token(" + ", ".join(expected_fields) + ")"
def test_token_pickle():
    """A Token with every attribute set survives a pickle round trip."""
    original = pyonmttok.Token(
        "Hello",
        type=pyonmttok.TokenType.LEADING_SUBWORD,
        casing=pyonmttok.Casing.MIXED,
        join_right=True,
        join_left=True,
        preserve=True,
        features=["X", "Y"],
    )

    restored = pickle.loads(pickle.dumps(original))

    assert original == restored
def test_token_length():
    """A Token built from a two-character surface reports a length of 2."""
    # Both characters are multi-byte in UTF-8, yet each counts once.
    token = pyonmttok.Token("測試")
    assert len(token) == 2