def test_label_name_containing_whitespaces():
    commands = ["foo bar:", "execute run", "    say baz"]
    try:
        tokenize([(i + 1, command) for i, command in enumerate(commands)])
    except ValueError as error:
        assert error.args[0].startswith("Unknown or incomplete command")
    else:
        assert False

def test_unknown_label():
    commands = ["execute run", "    redo foo"]
    try:
        tokenize([(i + 1, command) for i, command in enumerate(commands)])
    except NameError as error:
        assert error.args[0].startswith("Unknown label")
    else:
        assert False

def test_label_name_beginning_with_number():
    commands = ["1foo:", "execute run", "    say bar"]
    try:
        tokenize([(i + 1, command) for i, command in enumerate(commands)])
    except SyntaxError as error:
        assert error.args[0].startswith("Label names must satisfy")
    else:
        assert False

def test_invalid_indentation():
    commands = ["say foo", "    say bar"]
    try:
        tokenize([(i + 1, command) for i, command in enumerate(commands)])
    except IndentationError:
        assert True
    else:
        assert False

def test_invalid_command():
    commands = ["foo bar"]
    try:
        tokenize([(i + 1, command) for i, command in enumerate(commands)])
    except ValueError as error:
        assert error.args[0].startswith("Unknown or incomplete command")
    else:
        assert False

def test_excess_indentation():
    commands = ["execute as @a run", "        say foo"]
    try:
        tokenize([(i + 1, command) for i, command in enumerate(commands)])
    except IndentationError:
        assert True
    else:
        assert False

def test_invalid_indentation_space():
    commands = ["execute as @a run", "   say abc"]
    try:
        tokenize([(i + 1, command) for i, command in enumerate(commands)])
    except IndentationError:
        assert True
    else:
        assert False

def test_non_indented_redo():
    commands = ["say abc", "redo"]
    try:
        tokenize([(i + 1, command) for i, command in enumerate(commands)])
    except ValueError as error:
        assert error.args[0].startswith(
            "Command 'redo' must be indented in an execute command")
    else:
        assert False

def test_redo_having_more_arguments():
    commands = ["foo:", "execute run", "    redo foo bar"]
    try:
        tokenize([(i + 1, command) for i, command in enumerate(commands)])
    except SyntaxError as error:
        assert error.args[0].startswith(
            "Command 'redo' has more than 1 argument at line ")
    else:
        assert False

def test_tokenizer():
    pred = tokenize("これはテストのテキストです。ファミリーマート")
    assert ["これ", "は", "テスト", "の", "テキスト", "です", "。", "ファミリーマート"] == pred
    pred = tokenize("「これは、テストのテキストです。ファミリーマート」")
    assert ["これ", "は", "テスト", "の", "テキスト", "です", "。", "ファミリーマート"] == pred
    pred = tokenize("これは、テストのテキストです。ファミリーマート🍎")
    assert ["これ", "は", "テスト", "の", "テキスト", "です", "。", "ファミリーマート"] == pred

def char_tokenization_2(input):
    tokens = []
    i = 0
    # A while loop is needed here: incrementing `i` inside a `for i in range(...)`
    # loop has no effect, so the original version never skipped the merged character.
    while i < len(input):
        if input[i] == ' ' and i + 1 < len(input):
            # Use the token for the next character with a prepended space,
            # then skip the character consumed together with the space.
            token = tokenize([input[i] + input[i + 1]])[0][0]
            i += 1
        else:
            token = tokenize([input[i]])[0][0]
        tokens.append(token)
        i += 1
    return tokens

def get_response(self, user_input):
    """Given `user_input`, tokenize it and return the response that is
    most likely to be an appropriate reply to the user.
    """
    tokens = tokenize(user_input)
    latest_response = LatestResponse(None, 0)
    if not self.data:
        raise ValueError("Data must have at least one item")
    response = self._run_thread(tokens)
    if response:
        return response
    # Normal handling
    for response in self.data:
        response_weight = self._get_weight(response, tokens)
        # Check whether this response should become the most weighted one;
        # ties are broken by a coin flip.
        if response_weight == latest_response.weight:
            if random.random() > 0.5:
                latest_response = LatestResponse(response, response_weight)
        elif response_weight > latest_response.weight:
            latest_response = LatestResponse(response, response_weight)
    self.last_response = latest_response
    # Fall back to a generic response (e.g. "I don't understand") if
    # nothing matched.
    if latest_response.weight == 0:
        return random.choice(self.generics)
    return random.choice(latest_response.response["value"])

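# A minimal, hypothetical sketch of the tie-breaking selection used above.
# `LatestResponseSketch` and `pick_weighted` are illustrative stand-ins, not
# part of the class; the real code computes weights via self._get_weight.
from collections import namedtuple
import random

LatestResponseSketch = namedtuple("LatestResponseSketch", ["response", "weight"])

def pick_weighted(responses, weights):
    # Keep the heaviest response seen so far; on a tie, flip a coin.
    best = LatestResponseSketch(None, 0)
    for response, weight in zip(responses, weights):
        if weight == best.weight:
            if random.random() > 0.5:
                best = LatestResponseSketch(response, weight)
        elif weight > best.weight:
            best = LatestResponseSketch(response, weight)
    return best

# pick_weighted(["hi", "hey"], [2, 2]) returns either response with weight 2.
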
def _process_file(fin, fout):
    print(f"Expanding {fin} to {fout}...")
    with open(fin, "r") as src:
        code = src.read()
    with open(fout, "w") as dest:
        dest.write(code)
    clear(fout)
    if CLEAN:
        return
    _env = copy.deepcopy(ENV)
    _env["XGLSL"] = LANG_CURRENT == "glsl"
    _env["XHLSL"] = LANG_CURRENT in ["hlsl9", "hlsl11"]
    _env["XHLSL9"] = LANG_CURRENT == "hlsl9"
    _env["XHLSL11"] = LANG_CURRENT == "hlsl11"
    tokens = tokenize(fout)
    tree = make_tree(tokens)
    processed = process_tree(tree, _env, XPATH, XPATH_DEFAULT, CLEAR_PRAGMA_INCLUDES)
    with open(fout, "w") as f:
        processed = handle_compatibility(processed, LANG_CURRENT)
        processed = re.sub(r"\n{3,}", "\n\n", processed)
        if MINIFY:
            processed = minify(processed)
        f.write(processed)
    print("-" * 80)

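# For example, with LANG_CURRENT == "hlsl11" the flags set above become:
#   _env["XGLSL"]   = False
#   _env["XHLSL"]   = True
#   _env["XHLSL9"]  = False
#   _env["XHLSL11"] = True
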
def test_indentation_children():
    commands = [
        "execute as @a run",
        "    say foo",
        "    execute as @p run",
        "        say bar"
    ]
    tokens = tokenize([(i + 1, command) for i, command in enumerate(commands)])
    children = tokens[0].get_children()
    assert isinstance(children[0], LiteralToken)
    assert isinstance(children[1], IndentationToken)

def char_tokenization(input):
    tokens = []
    for char in input:
        if char == ' ':
            # Spaces are replaced with the underscore token id.
            token = 62
        else:
            token = tokenize([char])[0][0]
        tokens.append(token)
    return tokens

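# Hypothetical usage sketch contrasting the two helpers above. `tokenize` is
# assumed to map a list of strings to rows of token ids, as both helpers
# imply; `_stub_tokenize` and its vocabulary are made up for illustration.
def _stub_tokenize(chunks):
    vocab = {"a": 1, "b": 2, " b": 101}
    return [[vocab.get(chunk, 0)] for chunk in chunks]

# With tokenize = _stub_tokenize:
#   char_tokenization("a b")   -> [1, 62, 2]   (space mapped to id 62)
#   char_tokenization_2("a b") -> [1, 101]     (space fused with next char)
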
def _pre_process(doc_dict):
    """
    Pre-process a document: 1) generate an id, 2) tokenize the document.

    :param doc_dict: A document stored in a dictionary with the keys
        'title', 'desc' and 'tag'.
    :return: A list [job_id, title, desc, tag, title_seg, desc_seg, False].
    """
    from src import tokenizer
    import tltk
    import hashlib
    import time

    def tltk_tokenize(text):
        ret = tltk.segment(text).replace('<u/>', '').replace('<s/>', '').split('|')
        return ret

    cleaner = tokenizer.cleaner_generator('../Resource/charset')
    title = doc_dict['title']
    desc = doc_dict['desc']
    title_seg = tokenizer.tokenize(title, cleaner, tltk_tokenize, 5)
    desc_seg = tokenizer.tokenize(desc, cleaner, tltk_tokenize, 5)
    tag = doc_dict['tag']
    in_str = str(time.time()) + title + desc
    job_id = hashlib.md5(bytes(in_str, 'utf-8')).hexdigest()
    return [job_id, title, desc, tag, title_seg, desc_seg, False]

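# Hypothetical example of the shape _pre_process works with. The document
# values are invented, and the segmented fields depend on tltk and the
# charset resource, so they are not shown literally:
#
#   doc = {'title': 'Data engineer', 'desc': 'Build ETL pipelines', 'tag': 'IT'}
#   job_id, title, desc, tag, title_seg, desc_seg, indexed = _pre_process(doc)
#   # job_id  -> md5 hex digest of str(time.time()) + title + desc
#   # indexed -> False (the document has not been processed downstream yet)
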
def wrapper_tokenize(text):
    return tt.tokenize(text, tltk_tokenize, ngram, './Dict/charset', cleaner)

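# Note: wrapper_tokenize relies on module-level state; `tt`, `tltk_tokenize`,
# `ngram`, and `cleaner` are assumed to be defined elsewhere in the module,
# for example (hypothetical setup):
#
#   from src import tokenizer as tt
#   cleaner = tt.cleaner_generator('./Dict/charset')
#   ngram = 5
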
def test_label_assignment():
    commands = ["foo:", "execute run", "    say bar"]
    tokenize([(i + 1, command) for i, command in enumerate(commands)])

def test_new_lines(self):
    text = "\n\n"
    tokens = tokenize(text)
    assert len(tokens) == 1
    assert isinstance(tokens[0], Spacing)

def test_just_text(self):
    text = 'how are you'
    tokens = tokenize(text)
    for x in tokens:
        print(x)

def test_display_math(self):
    text = r'$$\sigma$$'
    tokens = tokenize(text)
    print(tokens)
    self._print_tokens(tokens)

def test_non_inline_redo():
    commands = ["execute as @a run", "    say abc"]
    tokens = tokenize([(i + 1, command) for i, command in enumerate(commands)])
    assert tokens[0].get_redo_condition() == ""

def parse(program):
    "Read a Scheme expression from a string."
    return create_ast(tokenize(program))

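# `tokenize` and `create_ast` are not shown here; a minimal sketch of what a
# Scheme reader like this might look like (an assumption, not the actual
# implementation used by parse):
def tokenize_sketch(program):
    # Pad parentheses with spaces so "(+ 1 2)" splits into '(', '+', '1', '2', ')'.
    return program.replace('(', ' ( ').replace(')', ' ) ').split()

def create_ast_sketch(tokens):
    # Build a nested list from the token stream, consuming tokens in place.
    token = tokens.pop(0)
    if token == '(':
        ast = []
        while tokens[0] != ')':
            ast.append(create_ast_sketch(tokens))
        tokens.pop(0)  # drop the closing ')'
        return ast
    return token

# create_ast_sketch(tokenize_sketch("(+ 1 (* 2 3))"))
#   -> ['+', '1', ['*', '2', '3']]
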
def test_indentation_command_token():
    commands = ["execute as @a run", "    say abc"]
    tokens = tokenize([(i + 1, command) for i, command in enumerate(commands)])
    assert isinstance(tokens[0], IndentationToken)

def test_indentation_exit():
    commands = ["execute as @a run", "    say foo", "say bar"]
    tokens = tokenize([(i + 1, command) for i, command in enumerate(commands)])
    assert isinstance(tokens[0], IndentationToken)
    assert isinstance(tokens[1], LiteralToken)

def test_token_value():
    commands = ["say foo", "execute as @a run", "    say bar"]
    tokens = tokenize([(i + 1, command) for i, command in enumerate(commands)])
    assert tokens[0].get_command() == "say foo"
    assert tokens[1].get_command() == "execute as @a run"

def test_indentation_children_length():
    commands = ["execute as @a run", "    say foo", "    say bar"]
    tokens = tokenize([(i + 1, command) for i, command in enumerate(commands)])
    assert len(tokens[0].get_children()) == 2

def test_should_not_redo():
    commands = ["execute as @a run", "    say abc"]
    tokens = tokenize([(i + 1, command) for i, command in enumerate(commands)])
    assert not tokens[0].should_redo()

def test_ignore_empty_lines():
    commands = ["say foo", "", "say bar", ""]
    tokens = tokenize([(i + 1, command) for i, command in enumerate(commands)])
    assert len(tokens) == 2
    assert tokens[0].get_command() == "say foo"
    assert tokens[1].get_command() == "say bar"

def test_some_latex(self):
    text = r'$\epsilon$ is awesome!'
    tokens = tokenize(text)
    assert tokens[0].content == r'$\epsilon$'

def test_simple_command_token():
    commands = ["say abc"]
    tokens = tokenize([(i + 1, command) for i, command in enumerate(commands)])
    assert isinstance(tokens[0], LiteralToken)

def test_escapes(self):
    text = r'$\$$'
    tokens = tokenize(text)
    print(tokens)
    assert tokens[0].content == r'$\$$'

def test_output_length():
    commands = ["say foo", "say bar", "say far", "say boo"]
    tokens = tokenize([(i + 1, command) for i, command in enumerate(commands)])
    assert len(tokens) == 4

def test_indentation():
    commands = ["execute as @a run", "    say abc"]
    tokens = tokenize([(i + 1, command) for i, command in enumerate(commands)])
    assert tokens[0].get_children()[0].get_command() == "say abc"