Example #1
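This example wires up a streaming DataLoader, switching between a word-level Tokenizer vocabulary and a byte-level BPE tokenizer. The helper classes (Tokenizer, BPE, IterDataset) and constants (PAD, BPAD, sys_tokens) come from the surrounding project and are assumed to be imported, along with torch.utils.data.DataLoader.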
    # The snippet opens mid-branch; the condition name (use_bpe) and the nl2ids
    # mapping are assumptions, reconstructed from the symmetric else-branch below.
    if not use_bpe:
        tokenizer.i2tw = id2lb_dict
        nl2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_nl, vocab_words=tokenizer.sw2i,
                                   unk_words=True, sos=False, eos=False)
        tg2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=tokenizer.tw2i,
                                   unk_words=False, sos=False, eos=False)
        pad_id = tokenizer.sw2i.get(PAD, 0)
        sw_size = len(tokenizer.sw2i)
        tw_size = len(tokenizer.tw2i)
        collate_fn = Tokenizer.collate_fn(pad_id, True)
    else:
        vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
        tokenizer = BPE.load(vocab_file)
        tokenizer.add_tokens(sys_tokens)
        nl2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
        tg2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)

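        # Prefer the BPE-specific pad token (BPAD); fall back to the generic PAD id
        # when the byte-level vocabulary does not define it.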
        pad_id = tokenizer.token_to_id(BPAD) if tokenizer.token_to_id(BPAD) is not None else tokenizer.token_to_id(PAD)
        sw_size = tokenizer.get_vocab_size()
        tw_size = tokenizer.get_vocab_size()
        collate_fn = BPE.collate_fn(pad_id, True)

    train_data, num_lines = Tokenizer.prepare_iter(filename, firstline=False, task=1)
    train_iterdataset = IterDataset(train_data, source2idx=nl2ids, target2idx=tg2ids, num_lines=num_lines)
    train_dataloader = DataLoader(train_iterdataset, pin_memory=True, batch_size=8, collate_fn=collate_fn)

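    # Smoke test: pull one batch and recover true sequence lengths from the pad mask.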
    for i, batch in enumerate(train_dataloader):
        # inputs, outputs = batch[0], batch[1]
        nl_tensor, lb_tensor = batch
        nl_len_tensor = (nl_tensor != pad_id).sum(dim=1)
        break

    use_selfatt = True
    if use_selfatt:
        pass  # the original snippet is truncated here; model construction would follow
Example #2
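This variant reads training pairs from a JSON file and adds sequence-boundary tokens (SOT/EOT) for a seq2seq-style target. As above, vocab, BPE, JSON, and IterDataset are project-specific helpers assumed to be in scope.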
    # As in Example #1, the snippet opens mid-branch; the use_bpe flag is assumed.
    if not use_bpe:
        nl2ids = Tokenizer.lst2idx(tokenizer=vocab.process_nl, vocab_words=vocab.sw2i,
                                   unk_words=True, eos=True)
        tg2ids = Tokenizer.lst2idx(tokenizer=vocab.process_target, vocab_words=vocab.tw2i,
                                   unk_words=False, sos=True, eos=True)
        pad_id = vocab.sw2i.get(PAD, 0)
        sw_size = len(vocab.sw2i)
        tw_size = len(vocab.tw2i)
    else:
        vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
        vocab = BPE.load(vocab_file)
        vocab.add_tokens([SOT, EOT, NULL])
        nl2ids = BPE.tokens2ids(vocab)
        tg2ids = BPE.tokens2ids(vocab)

        pad_id = vocab.token_to_id(BPAD) if vocab.token_to_id(BPAD) is not None else 0
        sw_size = vocab.get_vocab_size()
        tw_size = vocab.get_vocab_size()

    collate_fn = BPE.collate_fn(pad_id, True)
    # load datasets to map into indexes
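    # JSON.get_iterator streams records lazily, while JSON._len pre-counts lines so
    # the IterDataset can report its length up front (assumed from the helper names).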
    train_data = JSON.get_iterator(filename)
    num_lines = JSON._len(filename)
    # train_data = CSV.get_iterator(filename, firstline=True)
    # num_lines = CSV._len(filename)
    train_iterdataset = IterDataset(train_data, source2idx=nl2ids, target2idx=tg2ids, num_lines=num_lines)
    train_dataloader = DataLoader(train_iterdataset, pin_memory=True, batch_size=8, collate_fn=collate_fn)

    for i, batch in enumerate(train_dataloader):
        # inputs, outputs = batch[0], batch[1]
        nl_tensor, lb_tensor = batch
        # Use the pad mask rather than "> 0", which only works when pad_id == 0.
        nl_len_tensor = (nl_tensor != pad_id).sum(dim=1)