import numpy as np

# `util` and `FLAGS` are module-level imports defined elsewhere in this repo.


def tokenize_single(sent, vocab):
    # Convert one sentence to token ids; return a column vector of ids and an all-ones mask.
    token_ids = util.sentenc_to_token_ids(sent, vocab, flag_ascii=FLAGS.flag_ascii)
    ones = [1] * len(token_ids)
    source = np.array(token_ids).reshape([-1, 1])
    mask = np.array(ones).reshape([-1, 1])
    return source, mask
def tokenize_multi(sents, vocab):
    # Convert a batch of sentences to token ids, pad them to equal length,
    # and return a (time, batch) array plus a 0/1 mask over non-pad positions.
    token_ids = []
    for sent in sents:
        token_ids.append(util.sentenc_to_token_ids(sent, vocab))
    token_ids = padded(token_ids)
    source = np.array(token_ids).T
    source_mask = (source != 0).astype(np.int32)
    return source, source_mask
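# `padded` is defined elsewhere in this repo; the sketch below is only an assumed
# re-implementation, inferred from the `source != 0` mask in tokenize_multi:
# each id list is right-padded with 0 (the pad id) to the batch's max length.
# The name `padded_sketch` and the `pad_id` parameter are illustrative, not the
# original API.
def padded_sketch(token_id_lists, pad_id=0):
    max_len = max(len(ids) for ids in token_id_lists)
    return [ids + [pad_id] * (max_len - len(ids)) for ids in token_id_lists]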
def data_to_token_ids(data_path, target_path, vocab, flag_ascii=False):
    # Tokenize every line of data_path and write the space-separated token ids to target_path.
    print("Tokenizing data in %s" % data_path)
    with open(data_path, mode="r", encoding='utf-8') as data_file:
        with open(target_path, mode="w") as tokens_file:
            for line in data_file:
                line = line.strip('\n')
                token_ids = util.sentenc_to_token_ids(line, vocab, flag_ascii)
                tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
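# Hedged usage sketch: the paths and the toy vocab below are hypothetical, and the
# vocab is assumed to be a token -> integer-id mapping as expected by
# util.sentenc_to_token_ids. The call writes one line of space-separated ids per
# input line, as implemented above.
def _example_data_to_token_ids():
    example_vocab = {"<pad>": 0, "the": 2, "cat": 3}  # toy vocab, for illustration only
    data_to_token_ids("train.txt", "train.ids.txt", example_vocab, flag_ascii=False)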