    # word/label-vocab branch: reuse the mappings stored on the project Tokenizer
    tokenizer.i2tw = id2lb_dict
    tg2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=tokenizer.tw2i,
                               unk_words=False, sos=False, eos=False)
    pad_id = tokenizer.sw2i.get(PAD, 0)
    sw_size = len(tokenizer.sw2i)
    tw_size = len(tokenizer.tw2i)
    collate_fn = Tokenizer.collate_fn(pad_id, True)
else:
    # BPE branch: load a pretrained byte-pair vocabulary and share it for source and target
    vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
    tokenizer = BPE.load(vocab_file)
    tokenizer.add_tokens(sys_tokens)
    nl2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
    tg2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
    pad_id = tokenizer.token_to_id(BPAD) if tokenizer.token_to_id(BPAD) is not None else tokenizer.token_to_id(PAD)
    sw_size = tokenizer.get_vocab_size()
    tw_size = tokenizer.get_vocab_size()
    collate_fn = BPE.collate_fn(pad_id, True)

# wrap the indexed stream in an iterable dataset and a padded DataLoader
train_data, num_lines = Tokenizer.prepare_iter(filename, firstline=False, task=1)
train_iterdataset = IterDataset(train_data, source2idx=nl2ids, target2idx=tg2ids, num_lines=num_lines)
train_dataloader = DataLoader(train_iterdataset, pin_memory=True, batch_size=8, collate_fn=collate_fn)

for i, batch in enumerate(train_dataloader):
    # inputs, outputs = batch[0], batch[1]
    nl_tensor, lb_tensor = batch
    # unpadded length of every source sequence in the batch
    nl_len_tensor = (nl_tensor != pad_id).sum(dim=1)
    break

use_selfatt = True
if use_selfatt:
    # word-level vocab branch: separate source/target mappings with SOS/EOS handling
    nl2ids = Tokenizer.lst2idx(tokenizer=vocab.process_nl, vocab_words=vocab.sw2i,
                               unk_words=True, eos=True)
    tg2ids = Tokenizer.lst2idx(tokenizer=vocab.process_target, vocab_words=vocab.tw2i,
                               unk_words=False, sos=True, eos=True)
    pad_id = vocab.sw2i.get(PAD, 0)
    sw_size = len(vocab.sw2i)
    tw_size = len(vocab.tw2i)
    collate_fn = Tokenizer.collate_fn(pad_id, True)
else:
    # BPE branch: load a pretrained byte-pair vocabulary and add the special tokens
    vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
    vocab = BPE.load(vocab_file)
    vocab.add_tokens([SOT, EOT, NULL])
    nl2ids = BPE.tokens2ids(vocab)
    tg2ids = BPE.tokens2ids(vocab)
    pad_id = vocab.token_to_id(BPAD) if vocab.token_to_id(BPAD) is not None else 0
    sw_size = vocab.get_vocab_size()
    tw_size = vocab.get_vocab_size()
    collate_fn = BPE.collate_fn(pad_id, True)

# load datasets to map into indexes
train_data = JSON.get_iterator(filename)
num_lines = JSON._len(filename)
# train_data = CSV.get_iterator(filename, firstline=True)
# num_lines = CSV._len(filename)

train_iterdataset = IterDataset(train_data, source2idx=nl2ids, target2idx=tg2ids, num_lines=num_lines)
train_dataloader = DataLoader(train_iterdataset, pin_memory=True, batch_size=8, collate_fn=collate_fn)

for i, batch in enumerate(train_dataloader):
    # inputs, outputs = batch[0], batch[1]
    nl_tensor, lb_tensor = batch
    # nl_len_tensor = (nl_tensor > 0).sum(dim=1)
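
# --- optional sanity check (illustrative sketch, not part of the original pipeline) ---
# Assumes the BPE branch above was taken, so `vocab` wraps a HuggingFace `tokenizers`
# Tokenizer and exposes decode(); strips padding from the last batch seen in the loop
# and prints the first two source sequences back as text.
for row in nl_tensor[:2]:
    ids = [int(t) for t in row if int(t) != pad_id]
    print(vocab.decode(ids, skip_special_tokens=True))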
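
# --- illustrative sketch of what a pad-and-stack collate_fn produces ---
# Not the project's actual Tokenizer.collate_fn / BPE.collate_fn implementation; it only
# shows the assumed contract: right-pad each (source, target) pair in a batch to the
# longest sequence with pad_id and stack the results into LongTensors for the DataLoader.
def _pad_collate_sketch(batch, pad_id=0):
    import torch
    sources, targets = zip(*batch)

    def pad(seqs):
        max_len = max(len(s) for s in seqs)
        return torch.tensor([list(s) + [pad_id] * (max_len - len(s)) for s in seqs], dtype=torch.long)

    return pad(sources), pad(targets)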