Example #1
0
from transformers import PreTrainedTokenizer
from transformers.models.bart.modeling_bart import (
    shift_tokens_right as _shift_tokens_right, )
from datasets import Features, Sequence, Value, load_dataset
from datasets import Dataset as HFDataset
from transformers import (
    DPRContextEncoder,
    DPRContextEncoderTokenizerFast,
)

logger = logging.getLogger(__name__)

def _release_tuple(version_string):
    """Return the leading numeric fields of a dotted version string as ints.

    Only the leading digits of each of the first three dot-separated fields
    are used, so suffixes such as ``rc1`` or ``.dev0`` are ignored. A field
    with no leading digits counts as 0.
    """
    fields = []
    for field in version_string.split(".")[:3]:
        digits = ""
        for char in field:
            if not char.isdigit():
                break
            digits += char
        fields.append(int(digits) if digits else 0)
    return tuple(fields)


# Compare versions numerically: a plain string comparison orders "4.10.0"
# before "4.2.0" and would wrongly pick the two-argument shim on such releases.
if _release_tuple(transformers.__version__) < (4, 2, 0):

    def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
        """Call the imported helper with two arguments.

        ``decoder_start_token_id`` is accepted so callers can use one
        signature on every version, but it is not forwarded here.
        """
        return _shift_tokens_right(input_ids, pad_token_id)

else:
    # From 4.2.0 on, the imported helper is used directly with all three
    # arguments.
    shift_tokens_right = _shift_tokens_right


def preprocess_batch_for_hf_dataset(dataset, encoder_tokenizer,
                                    decoder_tokenizer, args):
    """Tokenize the ``input_text`` column of a batch when the model is BART.

    NOTE(review): this excerpt appears to end right after the encoder call,
    so what is done with ``input_ids`` (and with ``decoder_tokenizer``) is
    not visible here -- confirm against the full source.

    Args:
        dataset: Batch indexed by column name; only ``dataset["input_text"]``
            is read in the visible code.
        encoder_tokenizer: Object providing ``batch_encode_plus``.
        decoder_tokenizer: Not used in the visible portion.
        args: Object providing ``model_type`` and ``max_seq_length``.
    """
    if args.model_type == "bart":
        # Encode with padding="max_length" and truncation enabled, both bound
        # to ``args.max_seq_length``; "np" requests NumPy output.
        input_ids = encoder_tokenizer.batch_encode_plus(
            dataset["input_text"],
            max_length=args.max_seq_length,
            padding="max_length",
            return_tensors="np",
            truncation=True,
        )
Example #2
0
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizer
from transformers.models.bart.modeling_bart import shift_tokens_right as _shift_tokens_right
from datasets import Features, Sequence, Value, load_dataset
from datasets import Dataset as HFDataset
from transformers import (
    DPRContextEncoder,
    DPRContextEncoderTokenizerFast,
)


logger = logging.getLogger(__name__)

def _release_tuple(version_string):
    """Return the leading numeric fields of a dotted version string as ints.

    Only the leading digits of each of the first three dot-separated fields
    are used, so suffixes such as ``rc1`` or ``.dev0`` are ignored. A field
    with no leading digits counts as 0.
    """
    fields = []
    for field in version_string.split(".")[:3]:
        digits = ""
        for char in field:
            if not char.isdigit():
                break
            digits += char
        fields.append(int(digits) if digits else 0)
    return tuple(fields)


# Compare versions numerically: a plain string comparison orders "4.10.0"
# before "4.2.0" and would wrongly pick the two-argument shim on such releases.
if _release_tuple(transformers.__version__) < (4, 2, 0):

    def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
        """Call the imported helper with two arguments.

        ``decoder_start_token_id`` is accepted so callers can use one
        signature on every version, but it is not forwarded here.
        """
        return _shift_tokens_right(input_ids, pad_token_id)

else:
    # From 4.2.0 on, the imported helper is used directly with all three
    # arguments.
    shift_tokens_right = _shift_tokens_right


def preprocess_batch_for_hf_dataset(dataset, encoder_tokenizer, decoder_tokenizer, args):
    """Tokenize the ``input_text`` column of a batch when the model is BART.

    NOTE(review): this excerpt appears to end right after the encoder call,
    so what is done with ``input_ids`` (and with ``decoder_tokenizer``) is
    not visible here -- confirm against the full source.

    Args:
        dataset: Batch indexed by column name; only ``dataset["input_text"]``
            is read in the visible code.
        encoder_tokenizer: Object providing ``batch_encode_plus``.
        decoder_tokenizer: Not used in the visible portion.
        args: Object providing ``model_type`` and ``max_seq_length``.
    """
    if args.model_type == "bart":
        # Encode with padding="max_length" and truncation enabled, both bound
        # to ``args.max_seq_length``; "np" requests NumPy output.
        input_ids = encoder_tokenizer.batch_encode_plus(
            dataset["input_text"],
            max_length=args.max_seq_length,
            padding="max_length",
            return_tensors="np",
            truncation=True,
        )
 def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
     """Delegate to the imported ``_shift_tokens_right`` helper.

     The helper is called with ``decoder_start_token_id`` when its signature
     accepts a third parameter, and with only ``input_ids`` and
     ``pad_token_id`` otherwise. Always dropping the third argument fails
     with ``TypeError`` when the helper requires it.

     Args:
         input_ids: Forwarded unchanged to the helper.
         pad_token_id: Forwarded unchanged to the helper.
         decoder_start_token_id: Forwarded only when the helper accepts it.

     Returns:
         Whatever the imported helper returns.
     """
     import inspect

     try:
         upstream_arity = len(inspect.signature(_shift_tokens_right).parameters)
     except (TypeError, ValueError):
         # Signature cannot be introspected: keep the two-argument call.
         upstream_arity = 2
     if upstream_arity >= 3:
         return _shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id)
     return _shift_tokens_right(input_ids, pad_token_id)