Esempio n. 1
0
# Collect input/response pairs, keeping a row only when neither field is the
# empty string.
# NOTE(review): assumes iterating `df` yields indexable rows with the input at
# index 1 and the response at index 2 -- confirm where `df` is built (iterating
# a pandas DataFrame directly would yield column labels, not rows).
inputs, responses = [], []

for row in df:
    # Guard clause: drop the row as soon as either field is blank.
    if row[1] == '' or row[2] == '':
        continue
    inputs.append(row[1])
    responses.append(row[2])

# The two lists grow in lockstep, so both printed counts are always equal.
print(len(inputs), len(responses), sep="\n")

# Feed every input and its matching response through the Lang tokenizer.
# NOTE(review): presumably this pass is what populates the vocabulary (the
# later calls pass create_ids=False) -- confirm in the Lang class.
lang = Lang()

for source_text, target_text in zip(inputs, responses):
    lang.tokenize(source_text)
    lang.tokenize(target_text)

# Remember the vocabulary size before unk_data() runs so that the effect of
# the threshold can be read off the two printed numbers.
prev_vocab_size = lang.vocab_size
lang.unk_data(unk_threshold=unk_threshold)

print(f"prev: {prev_vocab_size}")
print(f"curr: {lang.vocab_size}")

# Accumulators for the tokenized inputs and responses; they start empty and
# are two distinct list objects that the loop below appends to.
tokenized_inputs, tokenized_responses = [], []

for i in range(len(inputs)):
    tokenized_inputs.append(
        combine_lists(lang.tokenize(inputs[i], create_ids=False)))
    tokenized_responses.append(