# Build parallel input/response lists from df, grow the Lang vocabulary over
# them, collapse rare tokens via unk_data, then re-tokenize both sides against
# the reduced vocabulary.
#
# NOTE(review): SOURCE arrived collapsed onto one line and truncated mid-way
# through the final append; the last statement below is reconstructed by
# symmetry with the tokenized_inputs line — confirm against the original file.
# NOTE(review): `df` is iterated row-wise with positional indexing (row[1],
# row[2]) — presumably a list of rows / csv reader rather than a pandas
# DataFrame (which would iterate column labels); verify upstream.

# Keep only rows where both the input (col 1) and response (col 2) are
# non-empty strings. Explicit != '' comparison is preserved (truthiness would
# also drop None/0-valued cells, which may not be intended).
kept = [(row[1], row[2]) for row in df if row[1] != '' and row[2] != '']
inputs = [pair[0] for pair in kept]
responses = [pair[1] for pair in kept]
print(len(inputs))
print(len(responses))

# First pass: tokenize every input/response so Lang can accumulate its full
# vocabulary (tokenize is called for its side effect on lang here).
lang = Lang()
for inp, resp in zip(inputs, responses):
    lang.tokenize(inp)
    lang.tokenize(resp)

# Collapse tokens rarer than unk_threshold into UNK and report how much the
# vocabulary shrank.
prev_vocab_size = lang.vocab_size
lang.unk_data(unk_threshold=unk_threshold)
print(f"prev: {prev_vocab_size}")
print(f"curr: {lang.vocab_size}")

# Second pass: tokenize against the UNK-reduced vocabulary without minting
# new token ids, flattening each result with combine_lists.
tokenized_inputs = []
tokenized_responses = []
for inp, resp in zip(inputs, responses):
    tokenized_inputs.append(
        combine_lists(lang.tokenize(inp, create_ids=False)))
    tokenized_responses.append(
        combine_lists(lang.tokenize(resp, create_ids=False)))