Code example #1
import numpy as np

def tokenize_single(sent, vocab):
    # util and FLAGS are assumed to be defined elsewhere in the project.
    token_ids = util.sentenc_to_token_ids(sent,
                                          vocab,
                                          flag_ascii=FLAGS.flag_ascii)
    # Shape both arrays as (sequence_length, 1): a single-sentence batch.
    ones = [1] * len(token_ids)
    source = np.array(token_ids).reshape([-1, 1])
    mask = np.array(ones).reshape([-1, 1])
    return source, mask
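
All four examples delegate to util.sentenc_to_token_ids, which is not reproduced here. The assumed contract is: a sentence string and a token-to-id mapping go in, a list of integer ids comes out, with an optional ASCII-normalization flag. The stand-in below is purely hypothetical; the whitespace splitting, the unknown-token id, and the ASCII handling are all assumptions, not the project's actual implementation.

UNK_ID = 3  # assumption: id reserved for out-of-vocabulary tokens

def sentenc_to_token_ids(sentence, vocab, flag_ascii=False):
    # Hypothetical stand-in for the project helper util.sentenc_to_token_ids.
    if flag_ascii:
        # Assumption: flag_ascii drops non-ASCII characters before lookup.
        sentence = sentence.encode("ascii", "ignore").decode("ascii")
    return [vocab.get(tok, UNK_ID) for tok in sentence.split()]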
Code example #2
import numpy as np

def tokenize_multi(sents, vocab):
    # Convert each sentence to token ids, then pad to a common length.
    token_ids = []
    for sent in sents:
        token_ids.append(util.sentenc_to_token_ids(sent, vocab))
    token_ids = padded(token_ids)
    # Transpose to (sequence_length, batch_size); padding id 0 gives mask 0.
    source = np.array(token_ids).T
    source_mask = (source != 0).astype(np.int32)
    return source, source_mask
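
The padded helper is not shown above. Judging from the mask computation (source != 0), it most likely right-pads every id list with zeros up to the length of the longest one. A minimal sketch under that assumption:

def padded(token_id_lists, pad_id=0):
    # Right-pad each list with pad_id so all rows have the same length.
    # pad_id=0 matches the (source != 0) mask in tokenize_multi above.
    max_len = max(len(ids) for ids in token_id_lists)
    return [ids + [pad_id] * (max_len - len(ids)) for ids in token_id_lists]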
Code example #3
def data_to_token_ids(data_path, target_path, vocab):
    # Tokenize every line of data_path and write the ids to target_path.
    print("Tokenizing data in %s" % data_path)
    with open(data_path, encoding='utf-8') as data_file:
        with open(target_path, mode="w") as tokens_file:
            for line in data_file:
                line = line.strip('\n')
                token_ids = util.sentenc_to_token_ids(line, vocab)
                # One line of space-separated ids per input sentence.
                tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
Code example #4
def data_to_token_ids(data_path, target_path, vocab, flag_ascii=False):
    # A variant of code example #3 with an explicit flag_ascii argument that
    # is passed through to the tokenizer.
    print("Tokenizing data in %s" % data_path)
    with open(data_path, mode="r") as data_file:
        with open(target_path, mode="w") as tokens_file:
            for line in data_file:
                line = line.strip('\n')
                token_ids = util.sentenc_to_token_ids(line, vocab, flag_ascii)
                # One line of space-separated ids per input sentence.
                tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")