# Example no. 1
# 0
from Vocabulary import Vocabulary

# Load the caption corpus: the file contains captions only, one caption per
# line. The context manager closes the file handle as soon as the lines have
# been read instead of leaving it open for the rest of the script.
# NOTE(review): each line keeps its trailing newline when it is handed to
# add_sentence below -- confirm Vocabulary treats that as intended.
with open("od-captionsonly.txt", "r") as lf:
    captions = list(lf)

# build vocabulary
vocab = Vocabulary("captions")
for caption in captions:
    vocab.add_sentence(caption)


def tokenizer(text):
    """Split ``text`` on single space characters and return the pieces.

    Every individual space acts as a delimiter, so consecutive spaces yield
    empty-string tokens, and other whitespace (tabs, newlines) stays inside
    the tokens.
    """
    separator = ' '
    pieces = text.split(separator)
    return pieces


import sys  # local import: nothing else visible in this script brings `sys` into scope

# The text to encode is taken from the first command-line argument; stop with
# a readable message instead of a bare IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: pass the caption text to encode as the first argument")
inp_text = sys.argv[1]
print(inp_text)

# Map every space-separated token to its index in the caption vocabulary.
tokens = tokenizer(inp_text)
codes = [vocab.to_index(t) for t in tokens]

print(codes)
c_tokens = [0] * 256  # fill to match text_seq_len