def load_data(self):
    """Load and tokenize the GLUE splits for this task, storing the
    batched tf.data pipelines on the instance."""
    # GLUE tasks are addressed as 'glue/<task>' in TFDS.
    task = 'glue/' + self.dataset_name

    shuffled_train = tfds.load(task, split='train', shuffle_files=True)
    ordered_train = tfds.load(task, split='train', shuffle_files=False)
    raw_validation = tfds.load(task, split='validation', shuffle_files=True)

    # Encode every split into features for Huggingface's transformers.
    tokenizer = DistilBertTokenizer.from_pretrained(self.model_name)

    def encode(dataset):
        # One-line helper: run the shared GLUE feature conversion.
        return glue_convert_examples_to_features(
            dataset, tokenizer, max_length=self.max_length,
            task=self.dataset_name)

    encoded_train = encode(shuffled_train)
    self.train_unshuffled = encode(ordered_train)
    encoded_validation = encode(raw_validation)

    # NOTE(review): self.max_length doubles as the validation batch size
    # and half of it as the training batch size — confirm this is intended.
    self.validation = encoded_validation.batch(self.max_length).prefetch(1)
    self.train = encoded_train.shuffle(1000).repeat().batch(
        int(self.max_length / 2)).prefetch(1)
def _load_remote_model(self, model_name, tokenizer_kwargs, model_kwargs):
    """Instantiate the tokenizer/model pair for a supported checkpoint."""
    # Reject anything without an explicit family mapping.
    if model_name not in ModelsByFamily.Supported:
        raise ValueError(f'Model {model_name} not supported.')

    # "uncased" checkpoints need a lower-casing tokenizer.
    tokenizer_kwargs.update(
        {'do_lower_case': 'uncased' in model_name.lower()})

    self._tokenizer = None
    self._model = None

    # family membership -> (tokenizer class, TF classification head)
    dispatch = (
        (ModelsByFamily.Bert, BertTokenizer,
         TFBertForSequenceClassification),
        (ModelsByFamily.Roberta, RobertaTokenizer,
         TFRobertaForSequenceClassification),
        (ModelsByFamily.XLNet, XLNetTokenizer,
         TFXLNetForSequenceClassification),
        (ModelsByFamily.DistilBert, DistilBertTokenizer,
         TFDistilBertForSequenceClassification),
    )
    for family, tokenizer_cls, model_cls in dispatch:
        if model_name in family:
            self._tokenizer = tokenizer_cls.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = model_cls.from_pretrained(
                model_name, **model_kwargs)
            break

    assert self._tokenizer and self._model
def simple_inference():
    '''
    Simpler sentiment inference for the general case, built on the
    TextClassificationPipeline from the transformers lib (preferable),
    though it does not show the distribution over all sentiments.
    :return:
    '''
    tokenizer = DistilBertTokenizer.from_pretrained("./model_out/")
    model = DistilBertForSequenceClassification.from_pretrained("./model_out/")
    model.to('cpu')
    sentiment_classifier = TextClassificationPipeline(
        model=model, tokenizer=tokenizer, device=-1)

    # Classify a few samples, printing the per-call latency.
    previous = time.time()
    for text in ("this is so cute!",
                 "That's so disgusting!",
                 "this is a simple test."):
        result = sentiment_classifier(text)
        now = time.time()
        print(now - previous, result)
        previous = now
def training_data(
    tickets_data_path: str,
    text_column: str,
    label_column: str,
    test_size: float = 0.25,
    subset_size: int = -1,
    max_length: int = 100,
    pad_to_max_length: bool = True,
) -> Tuple[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
           DistilBertTokenizer]:
    """Read the tickets CSV, encode texts/labels, and return a
    train/test split together with the tokenizer used."""
    frame = pd.read_csv(tickets_data_path)
    texts = frame[text_column].tolist()
    labels = frame[label_column].tolist()

    # Map string labels onto a stable integer encoding.
    labels = encode_labels(labels, sorted(set(labels)))

    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")
    tokenizer.max_length = max_length
    tokenizer.pad_to_max_length = pad_to_max_length

    print("tokenizing all texts...")
    texts = encode_texts(tokenizer, texts)

    # A negative subset_size means "use everything".
    if subset_size < 0:
        subset_size = len(texts)
    x_train, x_test, y_train, y_test = train_test_split(
        texts[:subset_size],
        labels[:subset_size],
        test_size=test_size,
        random_state=42)
    return (x_train, x_test, y_train, y_test), tokenizer
def load_model(manifest):
    """Loads the model object from the file at model_filepath key in config dict"""
    checkpoints_path = manifest["model_filepath"]
    # Use the path as-is when run as a script; otherwise resolve it
    # through the remote file client first.
    checkpoints = (checkpoints_path if __name__ == "__main__"
                   else client.file(checkpoints_path).getFile().name)
    assert_model_md5(checkpoints)

    # label id -> "<domain>_<polarity>"
    class_mapping = {
        0: "Movies_Negative",
        1: "Movies_Positive",
        2: "Food_Negative",
        3: "Food_Positive",
        4: "Clothing_Negative",
        5: "Clothing_Positive",
    }

    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=len(class_mapping),
        output_attentions=False,
        output_hidden_states=False,
    )
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    # Restore the fine-tuned weights onto the CPU.
    model.load_state_dict(
        torch.load(checkpoints, map_location=torch.device("cpu")))
    return model, tokenizer, class_mapping
def test_distilbert(self):
    """Compare the Python and Rust DistilBERT tokenizers over every
    known pretrained vocabulary."""
    vocab_names = DistilBertTokenizer.pretrained_vocab_files_map[
        "vocab_file"].keys()
    for tokenizer_name in vocab_names:
        py_tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name)
        rust_tokenizer = DistilBertTokenizerFast.from_pretrained(
            tokenizer_name)

        # Same number of added tokens for pair and non-pair inputs.
        self.assertEqual(rust_tokenizer.num_added_tokens(False),
                         py_tokenizer.num_added_tokens(False))
        self.assertEqual(rust_tokenizer.num_added_tokens(True),
                         py_tokenizer.num_added_tokens(True))

        # Same max_length for pair and non-pair inputs.
        self.assertEqual(rust_tokenizer.max_len_single_sentence,
                         py_tokenizer.max_len_single_sentence)
        self.assertEqual(rust_tokenizer.max_len_sentences_pair,
                         py_tokenizer.max_len_sentences_pair)

        # DistilBert should match 100%: identical special-token sets.
        self.assertSequenceEqual(
            py_tokenizer.special_tokens_map.items(),
            rust_tokenizer.special_tokens_map.items(),
            "DistilBert tokenizers doesn't have the same set of special_tokens",
        )

        # Tokenization overlap between the Python and Rust backends.
        self.assert_tokenization_python_rust_almost_equals(
            py_tokenizer, rust_tokenizer, 0.0)

        # add_tokens / add_special_tokens report the correct vocab size.
        self.assert_add_tokens(rust_tokenizer)

        # Offsets mapping support.
        self.assert_offsets_mapping(rust_tokenizer)

        # Dynamic overflow handling in batch_encode_plus.
        self.assert_batch_encode_dynamic_overflowing(rust_tokenizer)

        # Alignment for build_inputs_with_special_tokens.
        self.assert_build_inputs_with_special_tokens(
            rust_tokenizer, py_tokenizer)
def load_transformer(model_type):
    """Return (model, tokenizer) for a named architecture, each with a
    single-label regression head (num_labels=1)."""
    # model_type -> (tokenizer class, TF model class, checkpoint name)
    registry = {
        "distilbert": (DistilBertTokenizer,
                       TFDistilBertForSequenceClassification,
                       'distilbert-base-uncased'),
        "bert_x12": (BertTokenizer,
                     TFBertForSequenceClassification,
                     'bert-base-uncased'),
        "bert_x24": (BertTokenizer,
                     TFBertForSequenceClassification,
                     'bert-large-uncased'),
        "albert_v2_x12": (AlbertTokenizer,
                          TFAlbertForSequenceClassification,
                          'albert-base-v2'),
        "longformer_x12": (LongformerTokenizer,
                           TFLongformerForSequenceClassification,
                           'allenai/longformer-base-4096'),
        "longformer_x24": (LongformerTokenizer,
                           TFLongformerForSequenceClassification,
                           'allenai/longformer-large-4096'),
    }
    if model_type not in registry:
        raise ValueError(model_type + " was invalid")

    tokenizer_cls, model_cls, checkpoint = registry[model_type]
    tokenizer = tokenizer_cls.from_pretrained(checkpoint)
    model = model_cls.from_pretrained(checkpoint, num_labels=1)
    return model, tokenizer
def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    ailia_model = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)
    tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased-finetuned-sst-2-english')

    # Encode the input sentence and detach everything into numpy arrays.
    encoded = tokenizer.encode_plus(args.input, return_tensors="pt")
    inputs_onnx = {name: tensor.cpu().detach().numpy()
                   for name, tensor in encoded.items()}

    print("Input : ", args.input)

    # inference
    if args.benchmark:
        print('BENCHMARK mode')
        for _ in range(5):
            start = int(round(time.time() * 1000))
            score = ailia_model.predict(inputs_onnx)
            end = int(round(time.time() * 1000))
            print("\tailia processing time {} ms".format(end - start))
    else:
        score = ailia_model.predict(inputs_onnx)

    # Softmax over the raw logits.
    score = numpy.exp(score) / numpy.exp(score).sum(-1, keepdims=True)

    label_name = ["negative", "positive"]
    label_id = numpy.argmax(numpy.array(score))
    print("Label : ", label_name[label_id])
    print("Score : ", score[0][0][label_id])

    print('Script finished successfully.')
def __init__(
    self,
    model=None,
    tokenizer=None,
    model_name="bert-large-uncased",
    mask_token="***mask***",
    disable_gpu=False,
):
    """Wrap a masked-LM model/tokenizer pair, loading them by name
    when instances are not supplied."""
    self.mask_token = mask_token
    self.delemmatizer = Delemmatizer()
    use_cuda = torch.cuda.is_available() and not disable_gpu
    self.device = torch.device("cuda" if use_cuda else "cpu")
    print("using model:", model_name)
    print("device:", self.device)

    # Prefer the caller-supplied model; otherwise load by name,
    # picking the DistilBERT head when the name asks for it.
    if model:
        self.bert = model
    elif "distilbert" in model_name:
        self.bert = DistilBertForMaskedLM.from_pretrained(model_name)
    else:
        self.bert = BertForMaskedLM.from_pretrained(model_name)
    self.bert.to(self.device)

    if tokenizer:
        self.tokenizer = tokenizer
    elif "distilbert" in model_name:
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    else:
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    self.bert.eval()
def get_tokenizer(self):
    """Return the pretrained tokenizer matching ``self.hparams.model_type``.

    Returns:
        A transformers tokenizer instance for the configured model type.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    model_type = self.hparams.model_type
    if model_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif model_type == 'bert-cased':
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    elif model_type == 'bert-large':
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    elif model_type == 'distilbert':
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased')
    elif model_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    elif model_type == 'roberta-large':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    elif model_type == 'albert':
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    elif model_type == 'albert-xxlarge':
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
    elif model_type == 'electra':
        tokenizer = ElectraTokenizer.from_pretrained(
            'google/electra-base-discriminator')
    elif model_type == 'electra-large':
        tokenizer = ElectraTokenizer.from_pretrained(
            'google/electra-large-discriminator')
    else:
        # Previously a bare `raise ValueError`; include the offending
        # value so misconfiguration is diagnosable from the traceback.
        raise ValueError(f'unsupported model_type: {model_type!r}')
    return tokenizer
def answergen(context, question):
    """Extract an answer span for `question` from `context` with a
    distilled-SQuAD DistilBERT QA model."""
    tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased', return_token_type_ids=True)
    model = DistilBertForQuestionAnswering.from_pretrained(
        'distilbert-base-uncased-distilled-squad')

    encoding = tokenizer.encode_plus(question, context)
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    # Per-position scores for the answer start and end.
    start_scores, end_scores = model(
        torch.tensor([input_ids]),
        attention_mask=torch.tensor([attention_mask]))

    # Best-scoring start..end span (end index inclusive).
    span_start = torch.argmax(start_scores)
    span_end = torch.argmax(end_scores) + 1
    answer_tokens = tokenizer.convert_ids_to_tokens(
        input_ids[span_start:span_end], skip_special_tokens=True)

    print("\nQuestion ", question)
    return tokenizer.convert_tokens_to_string(answer_tokens)
def find_matches(model, image_embeddings, query, image_filenames, n=9):
    """Embed `query`, rank images by cosine similarity, and show a
    3x3 grid of the selected matches."""
    tokenizer = DistilBertTokenizer.from_pretrained(CFG.text_tokenizer)
    encoded_query = tokenizer([query])
    batch = {key: torch.tensor(values).to(CFG.device)
             for key, values in encoded_query.items()}

    with torch.no_grad():
        text_features = model.text_encoder(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"])
        text_embeddings = model.text_projection(text_features)

    # Cosine similarity via L2-normalised dot products.
    image_norm = F.normalize(image_embeddings, p=2, dim=-1)
    text_norm = F.normalize(text_embeddings, p=2, dim=-1)
    dot_similarity = text_norm @ image_norm.T

    # Take n*5 candidates, keeping every 5th — presumably to spread
    # the displayed matches across the ranking.
    _, indices = torch.topk(dot_similarity.squeeze(0), n * 5)
    matches = [image_filenames[idx] for idx in indices[::5]]

    _, axes = plt.subplots(3, 3, figsize=(10, 10))
    for match, ax in zip(matches, axes.flatten()):
        image = cv2.imread(f"{CFG.image_path}/{match}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        ax.imshow(image)
        ax.axis("off")
    plt.show()
def build_model_pretrained(config):
    """Assemble a translation model from pretrained encoder/decoder
    checkpoints and return it with its tokenizers."""
    # Separate tokenizers for the source and target languages.
    src_tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'

    encoder = DistilBertModel.from_pretrained(
        'distilbert-base-multilingual-cased')

    # Decoder: either a pretrained BERT LM head, or a fresh one built
    # from a decoder-mode config sized to the target vocabulary.
    if config.decoder.pretrained:
        decoder = BertForMaskedLM.from_pretrained('bert-base-uncased')
    else:
        decoder = BertForMaskedLM(
            BertConfig(vocab_size=tgt_tokenizer.vocab_size,
                       is_decoder=True))

    model = TranslationModel(encoder, decoder)
    model.cuda()

    tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
    return model, tokenizers
def get_tokenizer(lm='bert'):
    """Return the tokenizer. Initialize it if not initialized.

    Args:
        lm (string): the name of the language model (bert, albert,
            distilbert, roberta, xlnet, or longformer)

    Returns:
        BertTokenizer or DistilBertTokenizer or AlbertTokenizer

    Raises:
        ValueError: if ``lm`` is not a supported language-model name.
    """
    global tokenizer
    if tokenizer is None:
        if lm == 'bert':
            from transformers import BertTokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif lm == 'distilbert':
            from transformers import DistilBertTokenizer
            tokenizer = DistilBertTokenizer.from_pretrained(
                'distilbert-base-uncased')
        elif lm == 'albert':
            from transformers import AlbertTokenizer
            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        elif lm == 'roberta':
            from transformers import RobertaTokenizer
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        elif lm == 'xlnet':
            from transformers import XLNetTokenizer
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        elif lm == 'longformer':
            from transformers import LongformerTokenizer
            tokenizer = LongformerTokenizer.from_pretrained(
                'allenai/longformer-base-4096')
        else:
            # Previously fell through and returned None, which surfaced
            # later as an opaque AttributeError at the call site;
            # fail fast with the offending name instead.
            raise ValueError(f'unsupported language model: {lm!r}')
    return tokenizer
def load_tokenizer(self):
    """Load the tokenizer matching ``self.model_name`` into
    ``self.tokenizer``."""
    # NOTE(review): `== True` (rather than truthiness) kept on purpose
    # to preserve behavior for non-bool verbose values.
    if self.verbose == True:
        print(f'Loading {self.model_name} tokenizer...')

    # Mutually exclusive family checks; GPT-2 is the only family that
    # takes no do_lower_case argument.
    if self.model_name == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'distilbert':
        self.tokenizer = DistilBertTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'albert':
        self.tokenizer = AlbertTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'bart':
        self.tokenizer = BartTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'xlnet':
        self.tokenizer = XLNetTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'roberta':
        self.tokenizer = RobertaTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'camenbert':
        self.tokenizer = CamembertTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'flaubert':
        self.tokenizer = FlaubertTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'gpt2':
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_type)
def main():
    """End-to-end script: read the newswire data, export BERT-format
    TSVs, run a DistilBERT forward pass on the training texts, and save
    the model."""
    # 1 get data into dataframe
    df = read_into_pandas()
    (mlb_category, df) = replace_column_with_label_representation(df, 'category', 'category_int')
    df_train, df_test = train_test_split(df, test_size=0.2)

    # 2 transform into BERT format
    # Truncate each text to 512 chars and flatten newlines to spaces.
    df_bert = pd.DataFrame({
        'id': df_train['id'],
        'label': df_train['category_int'],
        'alpha': ['a'] * df_train.shape[0],
        'text': df_train['text'].str[:512].replace(r'\n', ' ', regex=True)
    })
    df_bert_train, df_bert_dev = train_test_split(df_bert, test_size=0.01)
    df_bert_test = pd.DataFrame({
        'id': df_test['id'],
        'text': df_test['text'].str[:512].replace(r'\n', ' ', regex=True)
    })
    # Saving dataframes to .tsv format as required by BERT
    df_bert_train.to_csv('../datasets/Newswire_BERT/train.tsv', sep='\t', index=False, header=False)
    df_bert_dev.to_csv('../datasets/Newswire_BERT/dev.tsv', sep='\t', index=False, header=False)
    df_bert_test.to_csv('../datasets/Newswire_BERT/test.tsv', sep='\t', index=False, header=False)

    # 3 load pretrained model
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', return_dict=True)

    # 4 transform
    tokenized = df_bert_train['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
    print('Padding')
    # Zero-pad every encoded sequence to the longest one in the split.
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)
    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    print('Shape after padding ' + str(np.array(padded).shape))
    # Attention mask: 1 for real tokens, 0 for padding positions.
    attention_mask = np.where(padded != 0, 1, 0)
    # No-op expression — presumably a leftover from a notebook cell.
    attention_mask.shape
    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask).to('cuda:0')
    print('Embedding model start')
    model.train()
    # NOTE(review): torch.no_grad() around a forward pass after
    # model.train() means no gradients flow, so no weights change before
    # save_pretrained below — confirm this is the intended behavior.
    with torch.no_grad():
        input_ids = input_ids.clone().detach().to(torch.int64).to('cuda:0')
        model = model.to('cuda:0')
        labels = torch.tensor(df_bert_train['label'].values).to(torch.int64).to('cuda:0')
        print(labels)
        last_hidden_states = model(input_ids, attention_mask=attention_mask,
                                   labels=labels)
    print(model)
    model.save_pretrained('models/BERT1')
def __init__(self, max_len):
    """Set up a DistilBERT classifier with its tokenizer, optimizer,
    and loss, sharing one checkpoint name."""
    checkpoint = 'distilbert-base-uncased'
    self.model_name = checkpoint
    self.max_len = max_len
    self.tkzr = DistilBertTokenizer.from_pretrained(checkpoint)
    self.model = TFDistilBertForSequenceClassification.from_pretrained(
        checkpoint)
    # Fine-tuning setup: small learning rate, loss on raw logits.
    self.optimizer = optimizers.Adam(learning_rate=3e-5)
    self.loss = losses.SparseCategoricalCrossentropy(from_logits=True)
def model_load(self, path: str):
    """Load the QA model and tokenizer saved under `path`."""
    # The config is read explicitly from the saved config.json.
    config = DistilBertConfig.from_pretrained(path + "/config.json")
    model = DistilBertForQuestionAnswering.from_pretrained(
        path, from_tf=False, config=config)
    tokenizer = DistilBertTokenizer.from_pretrained(
        path, do_lower_case=self.do_lower_case)
    return model, tokenizer
def __init__(self, filename, maxlen):
    """Read a tab-separated file and set up the DistilBERT tokenizer."""
    # Store the contents of the file in a pandas dataframe.
    self.df = pd.read_csv(filename, delimiter="\t")
    self.maxlen = maxlen
    # Wordpiece tokenizer for the uncased DistilBERT checkpoint.
    self.tokenizer = DistilBertTokenizer.from_pretrained(
        "distilbert-base-uncased")
def answer(context: str, question: str):
    """Return the answer string extracted from `context` for `question`."""
    tokenizer: DistilBertTokenizer = DistilBertTokenizer.from_pretrained(
        MODEL_PATH, return_token_type_ids=True)
    ids, mask = encode(context, question, tokenizer)
    span_tokens = get_answer_tokens(ids, mask, tokenizer)
    return tokenizer.convert_tokens_to_string(span_tokens)
def __init__(self, model_name="distilbert-base-uncased-distilled-squad",
             device="cuda"):
    """Load the distilled-SQuAD QA model/tokenizer pair onto `device`."""
    super().__init__()
    self.device = device
    self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    self.model = DistilBertForQuestionAnswering.from_pretrained(
        model_name).to(self.device)
def __init__(self, *args, **kwargs):
    """Initialize the handler and load the QA model from model_dir."""
    super().__init__(*args, **kwargs)
    # Prefer the GPU when one is available.
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_dir)
    self.model = DistilBertForQuestionAnswering.from_pretrained(
        self.model_dir)
    self.model.to(self.device)
def __init__(self, *args, **kwargs):
    # initialize super class with request & response schema, configs
    super().__init__(*args, **kwargs)
    # Tokenizer comes from the base checkpoint; the classifier weights
    # come from the SST-2 fine-tuned checkpoint.
    self.tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased')
    self.model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased-finetuned-sst-2-english')
def __init__(
        self,
        semantic_analysis_config: BrainSentimentAnalysisConfiguration):
    """Build the sentiment classifier from the configured model dir."""
    super().__init__()
    self._semantic_analysis_config = semantic_analysis_config
    model_dir = semantic_analysis_config.model_dir
    # Tokenizer and classifier are loaded from the same directory.
    loaded_tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
    loaded_model = DistilBertForSequenceClassification.from_pretrained(
        model_dir)
    self.sentiment_classifier = SentimentClassifer(
        loaded_model, loaded_tokenizer)
def __init__(self, model_path='distilbert-base-uncased', temperature=1.0,
             top_k=None, top_p=None, device='cuda'):
    """Set up a DistilBERT masked-LM on `device` in inference mode."""
    super().__init__(device, temperature=temperature, top_k=top_k,
                     top_p=top_p)
    self.model_path = model_path
    self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    self.model = DistilBertForMaskedLM.from_pretrained(model_path)
    # Move to the target device and switch to eval (inference) mode.
    self.model.to(self.device)
    self.model.eval()
def __init__(self, qas: list, qids: list, aids: list, goldids: dict):
    """Hold the QA corpus and its id mappings, plus a cased tokenizer."""
    # Cased DistilBERT tokenizer (a BertTokenizer alternative was
    # considered in the original code).
    self.tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-cased')
    self.qas = qas
    self.qids = qids
    self.aids = aids
    self.goldids = goldids
    # Hard cap on the encoded sequence length.
    self.max_len = 512
def check_sentiment(text):
    """Classify `text` with the locally fine-tuned SST model and return
    the highest-scoring sentiment label."""
    tokenizer = DistilBertTokenizer.from_pretrained(
        './pretrain_distillbert_full_sst')
    model = DistilBertForSequenceClassification.from_pretrained(
        './pretrain_distillbert_full_sst')
    classifier = SentimentClassifer(model, tokenizer)

    scores = classifier(text)
    # Highest-scoring label wins.
    sentiment = max(scores, key=scores.get)
    # Kept for parity with the original; not used below.
    sentiment_distribution = list(scores.values())
    print("sentiment of {}: {}".format(text, sentiment))
    return sentiment
def build_model(config):
    """Build an untrained BERT encoder/decoder translation model with
    fresh embedding tables, returning it with both tokenizers."""
    src_tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'

    # hidden_size and intermediate_size are both wrt all the attention
    # heads; they should be divisible by num_attention_heads.
    shared_config = dict(
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        hidden_act=config.hidden_act,
        hidden_dropout_prob=config.dropout_prob,
        attention_probs_dropout_prob=config.dropout_prob,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
    )
    encoder_config = BertConfig(vocab_size=src_tokenizer.vocab_size,
                                **shared_config)
    decoder_config = BertConfig(vocab_size=tgt_tokenizer.vocab_size,
                                is_decoder=True,
                                **shared_config)

    # Fresh embedding tables sized to each vocabulary.
    encoder_embeddings = nn.Embedding(src_tokenizer.vocab_size,
                                      config.hidden_size,
                                      padding_idx=src_tokenizer.pad_token_id)
    decoder_embeddings = nn.Embedding(tgt_tokenizer.vocab_size,
                                      config.hidden_size,
                                      padding_idx=tgt_tokenizer.pad_token_id)

    encoder = BertModel(encoder_config)
    encoder.set_input_embeddings(encoder_embeddings)
    decoder = BertForMaskedLM(decoder_config)
    decoder.set_input_embeddings(decoder_embeddings)

    model = TranslationModel(encoder, decoder)
    return model, src_tokenizer, tgt_tokenizer
def __init__(self, model_name="distilbert-base-uncased", device="cuda"):
    """Wrap a DistilBERT encoder with a linear head and dropout, all on
    the requested device."""
    super().__init__()
    self.device = device
    self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    self.model = DistilBertModel.from_pretrained(model_name).to(
        self.device)
    cfg = self.model.config
    # Classification head sized from the loaded model's config.
    self.linear = nn.Linear(cfg.dim, cfg.num_labels).to(self.device)
    self.dropout = nn.Dropout(cfg.qa_dropout).to(self.device)
def get_tokenizer(name, size):
    """Return a pretrained tokenizer for the given family and size.

    Args:
        name: model family — 'bert', 'albert', or 'distilbert'.
        size: checkpoint size qualifier interpolated into the
            checkpoint name (e.g. 'base', 'large').

    Raises:
        AssertionError: for an unrecognised family name.
    """
    if name == 'bert':
        return BertTokenizer.from_pretrained(f"bert-{size}-uncased")
    elif name == "albert":
        return AlbertTokenizer.from_pretrained(f"albert-{size}-v2")
    elif name == "distilbert":
        return DistilBertTokenizer.from_pretrained(
            f"distilbert-{size}-uncased")
    else:
        # Keep the AssertionError type (callers may catch it) but add
        # the offending name for diagnosability — previously raised bare.
        raise AssertionError(f"unknown tokenizer name: {name!r}")