import sys

from Vocabulary import Vocabulary


def tokenizer(text):
    """Split ``text`` into tokens on single space characters.

    Uses ``str.split(' ')``, so consecutive spaces yield empty-string
    tokens, and other whitespace (tabs, newlines) is not a separator.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the substrings of ``text`` between space characters.
    """
    return text.split(' ')


# The file contains captions only, one caption per line. Reading it inside
# a ``with`` block guarantees the handle is closed once the lines are loaded.
with open("od-captionsonly.txt", "r") as lf:
    captions = list(lf)

# Build the vocabulary from every caption.
# NOTE(review): lines read from the file keep their trailing newline and are
# passed to add_sentence() unchanged, while the command-line text below has
# no newline. Whether that affects lookups depends on how Vocabulary splits
# sentences -- confirm before stripping, since it may change the indices.
vocab = Vocabulary("captions")
for caption in captions:
    vocab.add_sentence(caption)

# The text to encode is the first command-line argument; exit with a usage
# message instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: {} TEXT".format(sys.argv[0]))

inp_text = sys.argv[1]
print(inp_text)

# Map each token of the input text to its vocabulary index.
tokens = tokenizer(inp_text)
codes = [vocab.to_index(t) for t in tokens]
print(codes)

c_tokens = [0] * 256  # fill to match text_seq_len