        tgt_line = output.strip()
    except KeyboardInterrupt:
        sys.exit(0)
    #except:
    #    skipped_error += 1
    #    continue

    cnt += 1
    if cnt % 100 == 0:
        print("{} / {} ...".format(cnt, len(cpy_list)))

    try:
        # "bpe" lookups add bos/eos to the source; "gpt2" lookups do not
        if sys.argv[1] == "bpe":
            src_ids = src_lookup.encode(src_line, add_bos_eos_tokens=True)
        if sys.argv[1] == "gpt2":
            src_ids = src_lookup.encode(src_line, add_bos_eos_tokens=False)
        tgt_ids = tgt_lookup.encode(tgt_line, add_bos_eos_tokens=True)

        # every 100 pairs, print an encode/decode round-trip for inspection
        if cnt % 100 == 0:
            print("\n+++++++SRC:")
            print(src_line)
            print(src_ids)
            print(src_lookup.decode(src_ids))
            print(src_lookup.decode(src_ids, skip_bos_eos_tokens=True))
            print("+++++++TGT")
            print(tgt_line)
            print(tgt_ids)
            print(tgt_lookup.decode(tgt_ids))
# CREATE LOOKUPS
src_lookup = Lookup(type="gpt2")
src_lookup.save_special_tokens(
    file_prefix=os.path.join(output_lookup_folder, "src"))
tgt_lookup = Lookup(type="gpt2")
tgt_lookup.save_special_tokens(
    file_prefix=os.path.join(output_lookup_folder, "tgt"))

print("Done.")

# check everything is ok
lookup = Lookup(type="gpt2")
lookup.load(file_prefix=os.path.join(output_lookup_folder, "tgt"))
text = "This is a test."
token_ids = lookup.encode(text)
print("Encode: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)
print("Decode: {}".format(recreated_string))
print("Map w2i:")
tokens = lookup.tokenize(text)
for i in range(len(tokens)):
    print("\t[{}] = [{}]".format(tokens[i], lookup.convert_tokens_to_ids(tokens[i])))
print("Map i2w:")
for i in range(len(token_ids)):
    print("\t[{}] = [{}]".format(token_ids[i], lookup.convert_ids_to_tokens(token_ids[i])))
token_ids = lookup.encode(text, add_bos_eos_tokens=True)
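# A minimal sketch (an assumption, not part of the original script) of how the
# bos/eos round-trip could be inspected, reusing only decode() and the
# skip_bos_eos_tokens flag already used above:
print("Encode with bos/eos: {}".format(token_ids))
print("Decode with bos/eos: {}".format(lookup.decode(token_ids)))
print("Decode without bos/eos: {}".format(lookup.decode(token_ids, skip_bos_eos_tokens=True)))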