import numpy as np
import transformers


def regular_encode(texts: list,
                   tokenizer: transformers.AutoTokenizer,
                   maxlen: int = 512,
                   multi_class: bool = True):
    """
    Encode sentences for input to Transformer models.

    :param texts: list of strings to be encoded
    :param tokenizer: tokenizer for encoding
    :param maxlen: maximum length (in tokens) of each encoded sequence;
        also used as the character budget for the custom truncation below
    :param multi_class: if True, the default truncation is applied. If False,
        implies auxiliary input and custom truncation is applied.
    :return: numpy array of encoded strings
    """
    # TODO: Intersphinx link to transformers.AutoTokenizer is failing.
    #  What's wrong with my docs/source/conf.py?
    if not multi_class:
        # If len > maxlen, truncate the text to maxlen - 8 characters and
        # append the 8-character auxiliary input from the end of the string.
        texts = [
            text[:maxlen - 8] + text[-8:] if len(text) > maxlen else text
            for text in texts
        ]
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_attention_mask=False,
        return_token_type_ids=False,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        # sep_token='[SEP]',
        max_length=maxlen,
        truncation=True)  # Is this what we want?
    return np.array(enc_di['input_ids'])
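# For illustration, a hypothetical sketch of the multi_class=False
# pre-truncation (maxlen=16 is chosen small for readability; the string
# below is invented):
#
#     text = '0123456789abcdefghij'    # 20 characters > maxlen
#     text[:16 - 8] + text[-8:]        # '01234567' + 'cdefghij'
#     # -> '01234567cdefghij' (16 characters; the 8-character tail survives)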
def regular_encode(texts: list,
                   tokenizer: transformers.AutoTokenizer,
                   maxlen: int = 512):
    """
    Encode sentences for input to Transformer models.

    :param texts: list of strings to be encoded
    :param tokenizer: tokenizer for encoding
    :param maxlen: maximum length (in tokens) of each encoded sequence
    :return: numpy array of encoded strings
    """
    # TODO: Intersphinx link to transformers.AutoTokenizer is failing.
    #  What's wrong with my docs/source/conf.py?
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_attention_mask=False,
        return_token_type_ids=False,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        # sep_token='[SEP]',
        max_length=maxlen,
        truncation=True)  # Is this what we want?
    return np.array(enc_di['input_ids'])
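# A minimal usage sketch, assuming the Hugging Face `transformers` and
# `numpy` packages are installed; the checkpoint name is an arbitrary
# example, not one this module prescribes.
if __name__ == '__main__':
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
        'distilbert-base-uncased')
    # Each row is padded/truncated to maxlen token ids.
    encoded = regular_encode(['first sentence', 'second sentence'],
                             hf_tokenizer, maxlen=32)
    print(encoded.shape)  # expected: (2, 32)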