def __init__(self, model: str = None, service: str = "classification"):
    """
    Constructor for the class that performs classification in the back end.

    :param model: Transformer model that will be used for the classification task
    :param service: string representing the service; defaults to classification
    """
    if model is None:
        model = "distilbert"
    # Path to all the files that will be used for inference
    self.path = f"./{service}/{model}/"
    # JSON file mapping the network output to the correct category
    self.mapping = self.path + "mapping.json"
    self.model_path = self.path + "model.bin"
    # Select the model based on the passed `model` argument. DistilBERT is
    # currently the only supported backbone, so it is also the default.
    self.model = DistillBERTClass()
    self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.path)
    # Load the fine-tuned weights, then switch to eval mode for inference.
    # `device` is assumed to be defined at module level.
    self.model.load_state_dict(
        torch.load(self.model_path, map_location=device))
    self.model.eval()
    with open(self.mapping) as f:
        self.config = json.load(f)
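# Usage sketch for the constructor above. The enclosing class name is not
# shown in this snippet, so `Classifier` below is a hypothetical placeholder;
# it assumes ./classification/distilbert/ contains model.bin, mapping.json,
# and the saved tokenizer files.
clf = Classifier(model="distilbert", service="classification")
print(clf.config)  # category mapping loaded from mapping.json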
def get_train_test_embeddings():
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    de_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-german-cased')
    # Alternative German tokenizer: 'bert-base-german-dbmdz-cased'
    train_texts, train_qualities = read_en_de_split('./data/en-de')
    test_texts, test_qualities = read_en_de_split('./data/en-de', train=False)
    # Report the longest source and target sequence lengths.
    print(max(len(i) for i in train_texts[0]), max(len(i) for i in train_texts[1]))
    train_encodings = ((tokenizer(train_texts[0], max_length=201, truncation=True,
                                  padding='max_length'),
                        de_tokenizer(train_texts[1], max_length=201, truncation=True,
                                     padding='max_length')),
                       train_qualities)
    test_encodings = ((tokenizer(test_texts[0], max_length=201, truncation=True,
                                 padding='max_length'),
                      de_tokenizer(test_texts[1], max_length=201, truncation=True,
                                   padding='max_length')),
                      test_qualities)
    return train_encodings, test_encodings
def __init__(self, text_mod: MimicText, clf):
    super().__init__()
    self.text_mod = text_mod
    self.clf = clf
    tokenizer_path = Path(__file__).parent.parent / 'classifiers/tokenizer'
    if not tokenizer_path.exists():
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')
        tokenizer.save_pretrained(tokenizer_path)
    else:
        tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_path)
    self.tokenizer = tokenizer
def makeUnilabelModel(self, modelName, num_labels=10, root='', **kwargs):
    if modelName == 'distilbert-base-uncased':
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')
        model = DistilBertForSequenceClassification.from_pretrained(
            root + "distilbert-base-uncased", num_labels=num_labels, **kwargs)
    elif modelName == 'gpt2':
        tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model = GPT2ForSequenceClassification.from_pretrained(
            root + "gpt2", num_labels=num_labels, **kwargs)
        model.resize_token_embeddings(len(tokenizer))  # account for the added padding token
        model.config.pad_token_id = tokenizer('[PAD]').input_ids[0]
    elif modelName == 'bertweet':
        tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
        model = AutoModelForSequenceClassification.from_pretrained(
            root + "vinai/bertweet-base", num_labels=num_labels, **kwargs)
    elif modelName == 'distilroberta-base':
        tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
        model = AutoModelForSequenceClassification.from_pretrained(
            root + "distilroberta-base", num_labels=num_labels, **kwargs)
    elif modelName == 'lstm':
        tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
        model = LSTMCclassifier(128, 64, 2, tokenizer.vocab_size, num_labels)
    else:
        raise ValueError(f'Model name not recognized: {modelName}')
    return tokenizer, model
def get_bert_tokenizer(model):
    if model == "bert-base-uncased":
        return BertTokenizerFast.from_pretrained(model)
    elif model == "distilbert-base-uncased":
        return DistilBertTokenizerFast.from_pretrained(model)
    else:
        raise ValueError(f"Model: {model} not recognized.")
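# Usage sketch for get_bert_tokenizer above: encode one sentence and inspect
# the fields a fast tokenizer returns.
tokenizer = get_bert_tokenizer("distilbert-base-uncased")
enc = tokenizer("The movie was great.", truncation=True, padding=True)
print(enc["input_ids"])       # token ids, wrapped in [CLS] ... [SEP]
print(enc["attention_mask"])  # 1 for real tokens, 0 for padding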
def getTweets():
    print("Getting tweets now ...")
    # Default keyword if you hit search with an empty box
    keyword = request.args.get(
        'keyword', default='coronavirus covid vaccine vaccination COVID-19')
    # Fetch the 20 most recent tweets matching the query. Change the argument
    # to `items()` to decrease or increase the number of retrieved tweets.
    # The larger the number, the longer the retrieval time.
    query = keyword  # text from the search box
    tweets_ = tweepy.Cursor(api.search, query, result_type='recent').items(20)
    tweets = [tweet.text for tweet in tweets_]
    print("Done ... retrieving tweets from API based on the keyword=" + keyword)
    df = pd.DataFrame(data=tweets, columns=['Tweet'])
    print("Done ... creating dataframe")
    # Iterate over the tweet texts in `tweets`, pass each item to the model
    # to obtain a prediction, then write those predictions to the dataframe.
    model = pipeline(
        'sentiment-analysis',
        model=DistilBertForSequenceClassification.from_pretrained("model"),
        tokenizer=DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased'))
    results = [model(tweet) for tweet in tweets]
    df['Sentiment'] = [LABELS[s[0].get('label')] for s in results]
    df['Score'] = [s_[0].get('score') for s_ in results]
    print("Done ... sentiment-analysis")
    print(df)
    return render_template("covid.html", data=df.values.tolist())
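# For reference, each pipeline call above returns a one-element list, which
# is why the code indexes s[0]. A minimal sketch using a stock checkpoint
# (the route itself loads fine-tuned weights from the local "model" dir):
from transformers import pipeline

demo = pipeline('sentiment-analysis',
                model='distilbert-base-uncased-finetuned-sst-2-english')
print(demo('I love this!'))  # e.g. [{'label': 'POSITIVE', 'score': 0.9998}]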
@classmethod
def load_from_pretrained(cls, path: Path):
    model = TFDistilBertForSequenceClassification.from_pretrained(path)
    tokenizer = DistilBertTokenizerFast.from_pretrained(path)
    processing_pipeline = TransformersProcessingPipeline(
        TextPipeline.encode_dataset, tokenizer)
    validate_variables(model, tokenizer, processing_pipeline)
    return cls(model=model, processing_pipeline=processing_pipeline)
def __init__(self, model_path, tag_path):
    # One tag name per line; line index == label id.
    with open(tag_path, "r") as tag_file:
        file_content = tag_file.read().strip()
    self.id_to_tag = file_content.splitlines()
    self.model = DistilBertForTokenClassification.from_pretrained(
        'distilbert-base-cased', num_labels=len(self.id_to_tag))
    self.model.load_state_dict(torch.load(model_path))
    self.model.eval()
    self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
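# A minimal inference sketch for the tagger above; `tagger` stands for an
# instance of the enclosing class (its name is not shown in the snippet).
import torch

def predict_tags(tagger, text):
    enc = tagger.tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = tagger.model(**enc).logits  # (1, seq_len, num_tags)
    pred_ids = logits.argmax(dim=-1)[0].tolist()
    tokens = tagger.tokenizer.convert_ids_to_tokens(enc["input_ids"][0])
    return [(tok, tagger.id_to_tag[i]) for tok, i in zip(tokens, pred_ids)]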
def load_tokenizer(self):
    # Note: padding/max_length/truncation/is_split_into_words are per-call
    # tokenization options; passing them to from_pretrained does not apply
    # them automatically, so they should also be supplied when the tokenizer
    # is actually called.
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased",
        padding="max_length",
        max_length=self.params.max_len,
        truncation=True,
        is_split_into_words=True)
    return tokenizer
def load_model(self, model_name: str = "bert_ner_test"):
    # TODO: load the model from mlflow
    # Load model and tokenizer.
    config = DistilBertConfig.from_pretrained(model_name)
    # from_pretrained is a classmethod; call it on the class and pass the
    # config explicitly instead of instantiating the model first.
    model = DistilBertForTokenClassification.from_pretrained(
        model_name, config=config)
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
    return model, config, tokenizer
def get_bert_tokenizer(bert_model_type):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        if '-cased' in bert_model_type:
            do_lower_case = False
        else:
            do_lower_case = True  # default
        return BertTokenizerFast(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                                 do_lower_case=do_lower_case)
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        return RobertaTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
    elif bert_model_type in ['xlnet-base-cased']:
        if '-uncased' in bert_model_type:
            do_lower_case = True
        else:
            do_lower_case = False  # default
        return XLNetTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                              do_lower_case=do_lower_case)
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        return AlbertTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        tokenizer = GPT2TokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
        # https://github.com/huggingface/transformers/issues/3859
        tokenizer.pad_token = tokenizer.eos_token
        return tokenizer
    elif bert_model_type in ['transfo-xl']:
        return TransfoXLTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased', 'distilbert-base-uncased-distilled-squad'
    ]:
        if '-cased' in bert_model_type:
            do_lower_case = False
        else:
            do_lower_case = True  # default
        return DistilBertTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            do_lower_case=do_lower_case)
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')
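# Usage sketch: the dispatcher keys on the model-type string and builds each
# tokenizer from local vocab/merge files; BERT_VOCAB_FILE and BERT_MERGE_FILE
# are path lookup tables assumed to be defined elsewhere in the module.
tokenizer = get_bert_tokenizer('distilbert-base-uncased')
print(tokenizer('hello world')['input_ids'])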
def getBertTokenizer(model):
    if model == 'bert-base-uncased':
        tokenizer = BertTokenizerFast.from_pretrained(model)
    elif model == 'distilbert-base-uncased':
        tokenizer = DistilBertTokenizerFast.from_pretrained(model)
    else:
        raise ValueError(f'Model: {model} not recognized.')
    return tokenizer
def generate_tokenizer_and_model(model_name):
    if model_name == "bert-base-uncased":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
    elif model_name == 'distilbert-base-uncased':
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    else:
        # Fail loudly instead of returning unbound variables.
        raise ValueError(f"Model: {model_name} not recognized.")
    return tokenizer, model
def __init__(self, data_path, split):
    self.tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')
    self.original_sentences, self.original_spans, self.fixed_spans = \
        self.get_sentences_from_data_split(data_path, split)
    self.token_ids, self.offsets, self.att_masks, self.special_masks, self.labels_ids = \
        self.preprocess_and_tokenize(self.original_sentences, self.fixed_spans)
def create_dataset(data_dir: Text) -> Tuple[CabbyDataset, CabbyDataset, CabbyDataset]:
    '''Loads data and creates datasets and train, validate and test sets.
    Arguments:
        data_dir: The directory of the data.
    Returns:
        The train, validate and test sets.
    '''
    LABELS = data.Field(
        sequential=False,
        preprocessing=lambda xs: 1 if xs == "manhattan" else 0,
        use_vocab=False,
        batch_first=True,
    )
    TEXT = data.Field(
        use_vocab=False,
        batch_first=True,
        sequential=False,
    )
    train_ds, valid_ds, test_ds = data.TabularDataset.splits(
        path=data_dir,
        format='tsv',
        skip_header=False,
        train='train.tsv',
        validation='dev.tsv',
        test='test.tsv',
        fields=[('label', LABELS), ('instructions', TEXT)])
    logging.info('Data sample: %s', vars(train_ds[0]))

    # Get lists of instructions.
    train_texts = [train_ds.examples[idx].instructions for idx in range(len(train_ds))]
    val_texts = [valid_ds.examples[idx].instructions for idx in range(len(valid_ds))]
    test_texts = [test_ds.examples[idx].instructions for idx in range(len(test_ds))]

    # Get lists of labels.
    train_labels = [train_ds.examples[idx].label for idx in range(len(train_ds))]
    val_labels = [valid_ds.examples[idx].label for idx in range(len(valid_ds))]
    test_labels = [test_ds.examples[idx].label for idx in range(len(test_ds))]

    # Tokenize instructions (add_special_tokens=True is the tokenizer default,
    # stated explicitly here for all three splits).
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, add_special_tokens=True)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, add_special_tokens=True)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, add_special_tokens=True)

    # Create Cabby datasets.
    train_dataset = CabbyDataset(train_encodings, train_labels)
    val_dataset = CabbyDataset(val_encodings, val_labels)
    test_dataset = CabbyDataset(test_encodings, test_labels)

    return train_dataset, val_dataset, test_dataset
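# Usage sketch for the datasets returned above, assuming CabbyDataset follows
# the standard encodings+labels Dataset pattern; './data' is a placeholder path.
from torch.utils.data import DataLoader

train_dataset, val_dataset, test_dataset = create_dataset('./data')
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
batch = next(iter(train_loader))
print(batch['input_ids'].shape, batch['labels'].shape)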
def __init__(self, model='bert-base-uncased'):
    super().__init__(model, "BERT")
    self.mlm = None  # Masked Language Model
    self.nsp = None  # Next Sentence Prediction
    self.qa = None   # Question Answering
    # Note: the tokenizer is DistilBERT's regardless of the `model` argument.
    self.tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')
    self.masked_token = self.tokenizer.mask_token
    self.sep_token = self.tokenizer.sep_token
    self.cls_token = self.tokenizer.cls_token
def __init__(self, max_vocab_size=30522, embedding_dim=768, from_pt=False):
    super().__init__(max_vocab_size, embedding_dim)
    config = transformers.DistilBertConfig()
    config.vocab_size = max_vocab_size
    config.dim = embedding_dim
    # TFDistilBertModel is exported at the top level of transformers; the old
    # transformers.modeling_tf_distilbert module path is deprecated.
    self.model = transformers.TFDistilBertModel.from_pretrained(
        self.model_path, config=config, from_pt=from_pt)
    from transformers import DistilBertTokenizerFast
    self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.tokenizer_path)
    self.pipeline = transformers.pipeline(
        task='feature-extraction', model=self.model, tokenizer=self.tokenizer)
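# Usage sketch for the feature-extraction pipeline built above: it returns
# one embedding per token, nested as [batch][seq_len][embedding_dim].
# `encoder` stands for an instance of the enclosing class.
features = encoder.pipeline("a short example sentence")
print(len(features[0]), len(features[0][0]))  # seq_len, embedding_dim (768)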
def __init__(self, path=None, model_name=None):
    if path:
        self.model = DistilBertForSequenceClassification.from_pretrained(path)
        tokenizer_path = os.path.join(path, "model/")
        if os.path.exists(tokenizer_path):
            self.tokenizer = DistilBertTokenizerFast.from_pretrained(
                tokenizer_path)
        else:
            self.tokenizer = DistilBertTokenizerFast.from_pretrained(
                "distilbert-base-uncased")
    elif model_name:
        config = DistilBertConfig.from_pretrained(
            model_name, return_dict=True, num_labels=2)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            model_name, config=config)
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
def __init__(self, model_path=None, use_cuda=False):
    if not model_path:
        model_path = get_model_path()
    if not os.path.exists(model_path):
        raise FileNotFoundError("Cannot find model under " + model_path)
    self.device = "cuda" if use_cuda and torch.cuda.is_available() else "cpu"
    self.model = DistilBertForTokenClassification.from_pretrained(model_path)
    self.model.to(self.device)
    self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
    self.label_map = self.get_label_map(model_path)
def main():
    # Define parser and arguments.
    args = get_train_test_args()
    util.set_seed(args.seed)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    DistilBert = DistilBertModel.from_pretrained('distilbert-base-uncased')
    Experts = [DistilBertQA(DistilBertModel.from_pretrained('distilbert-base-uncased')).to(device)
               for _ in range(args.num_experts)]
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    gate_model = GateNetwork(384, 3, 3, DistilBert.config).to(device)
    print(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = device
        trainer = train.Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=1,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(Experts, gate_model, train_loader, val_loader,
                                    val_dict, args.num_experts)
    if args.do_eval:
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = train.Trainer(args, log)
        # Load model.
        restore_model("", args.num_experts, Experts, gate_model)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir,
                                              tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=1,
                                 sampler=SequentialSampler(eval_dataset))
        args.device = device
        eval_preds, eval_scores = trainer.evaluate(Experts, gate_model, eval_loader,
                                                   eval_dict, return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file.
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
def get_tokenizer(model_type='BERT'):
    if model_type == 'distilBERT':
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')
    elif model_type == 'BERT':
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    elif model_type == 'alBERT':
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    else:
        # Fail loudly instead of returning an unbound variable.
        raise ValueError(f'model_type not allowed: {model_type}')
    return tokenizer
def __criar_base_treinamento_validacao():
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'neuralmind/bert-base-portuguese-cased',
        model_max_length=512,
        do_lower_case=False)
    # Load the labeled dataset from a JSONL file generated by the Doccano
    # annotation tool.
    textos, tags = __get_textos_tags()
    # Split texts whose token count exceeds the supported maximum into
    # smaller texts.
    textos, tags = __pre_processar_base(textos, tags, tokenizer)
    train_texts, val_texts, train_tags, val_tags = train_test_split(
        textos, tags, test_size=.2, random_state=42)
    return tags, tokenizer, train_tags, train_texts, val_tags, val_texts
def get_tokenizer() -> DistilBertTokenizerFast:
    """
    Returns the tokenizer for this model.

    Parameters:
        None
    Returns:
        tokenizer (DistilBertTokenizerFast): loaded and configured tokenizer.
    """
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')
    return tokenizer
def tokenize(self):
    """Tokenize DLND data."""
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased',
        cache_dir=TRANSFORMERS_CACHE_DIR,
        local_files_only=True)
    # Join each document set into one string and encode source/target as a
    # sentence pair.
    return tokenizer(
        [('. '.join(src_docs), '. '.join(tgt_docs))
         for src_docs, tgt_docs in zip(self.data[-3], self.data[-2])],
        padding=True,
        truncation=True)
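# Note on the pair input above: a list of (text_a, text_b) tuples makes the
# tokenizer encode each pair as one sequence, [CLS] a [SEP] b [SEP].
# DistilBERT produces no token_type_ids, so the two segments are
# distinguished only by the [SEP] separator. Standalone sketch:
from transformers import DistilBertTokenizerFast

tok = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
enc = tok([("first text.", "second text.")], padding=True, truncation=True)
print(tok.convert_ids_to_tokens(enc["input_ids"][0]))
# ['[CLS]', 'first', 'text', '.', '[SEP]', 'second', 'text', '.', '[SEP]']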
def test_load_from_pretrained(self, tmp_path):
    pickle_dir = tmp_path / "pickle_dir"
    pickle_dir.mkdir()
    model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    model.save_pretrained(pickle_dir)
    tokenizer.save_pretrained(pickle_dir)
    modeling_pipeline = TransformersModelingPipeline.load_from_pretrained(str(pickle_dir))
    assert isinstance(modeling_pipeline, TransformersModelingPipeline)
def __init__(self, hparams):
    self.train = {}
    self.dev = {}
    self.test = {}
    self.data_turns = {}
    self.data_path = hparams.data_path
    self.tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased")
    self.batch_size = hparams.batch_size
    self.max_len = hparams.max_len
    self.max_value_len = hparams.max_value_len
    self.max_context_len = hparams.max_context_len
    self.no_history = hparams.no_history
def __init__(self, model):
    """
    Initializes with a model passed as a parameter. Uses the
    Portuguese-language tokenizer neuralmind/bert-base-portuguese-cased.
    """
    super().__init__()
    self.tokenizer = DistilBertTokenizerFast.from_pretrained(
        'neuralmind/bert-base-portuguese-cased',
        model_max_length=512,
        do_lower_case=False)
    self.nlp = pipeline('ner',
                        model=model,
                        tokenizer=self.tokenizer,
                        grouped_entities=True)
def fine_tune_model(dir):
    train_texts, train_labels = read_imdb_split(dir + '/train/')
    test_texts, test_labels = read_imdb_split(dir + '/test/')
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_texts, train_labels, test_size=.2)

    # Use the DistilBert tokenizer. To match a pretrained model we need the
    # same tokenization and numericalization it was trained with; fortunately,
    # the tokenizer classes from transformers provide the correct
    # pre-processing tools for each pretrained model.
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    # Pass truncation=True and padding=True, which ensure that all sequences
    # are padded to the same length and truncated to be no longer than the
    # model's maximum input length.
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)

    # Now, let's turn our labels and encodings into a Dataset object. In
    # PyTorch, this is done by subclassing torch.utils.data.Dataset and
    # implementing __len__ and __getitem__.
    train_dataset = IMDbDataset(train_encodings, train_labels)
    val_dataset = IMDbDataset(val_encodings, val_labels)
    test_dataset = IMDbDataset(test_encodings, test_labels)

    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
    )

    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

    trainer = Trainer(
        model=model,                  # the instantiated 🤗 Transformers model to be trained
        args=training_args,           # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=val_dataset      # evaluation dataset
    )
    trainer.train()
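# IMDbDataset is referenced but not defined in this snippet; a minimal sketch
# of the subclass the comment describes, matching the standard transformers
# fine-tuning pattern:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Re-wrap each encoding field and the label as tensors per example.
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)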
def prepare_dataset(self):
    saved_path = os.path.join(self.spot_and_diff_dir, self.mode + '.pt')
    if os.path.exists(saved_path):
        print(self.mode, 'data exists, read from', saved_path)
    else:
        raw_dataset = []
        neg_dataset = []
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-cased')
        with open(self.data_dir) as f:
            data = json.load(f)
        for i in range(len(data)):
            idx = data[i]['img_id']
            sentences = data[i]['sentences']
            self.all_sentences += sentences
            img_0, img_1 = img2tensor(self.img_dir, idx)
            sample = {
                'img_0': img_0,
                'img_1': img_1,
                'sentences': sentences,
                'label': 1
            }
            raw_dataset.append(sample)
            if i % 100 == 99:
                print(i, '/', len(data))
        for i in range(len(data)):
            # Shallow-copy to avoid dict-level in-place modification.
            current_sample = {**raw_dataset[i]}
            current_sample['sentences'] = [] + current_sample['sentences']
            # Note: this condition always holds as written (>= 0.0), so every
            # sample currently gets a negative counterpart.
            if self.torch_bernoulli() >= .0:
                negative_sample = self.augment_sample_sentences(
                    current_sample, n_replacement=1)
                current_sample['label'] = 0
                neg_dataset.append(current_sample)
            if i % 100 == 99:
                print(i, '/', len(data))
        new_dataset = raw_dataset + neg_dataset
        for r in new_dataset:
            r['sentences'] = text2tensor(r['sentences'], tokenizer)
        os.makedirs(self.spot_and_diff_dir, exist_ok=True)
        torch.save(new_dataset, saved_path)
        print('Saved to', saved_path)
        print(len(raw_dataset), 'positive samples')
        print(len(neg_dataset), 'negative samples')
def load_model(self):
    if self.model_name == 'distilbert-base-uncased':
        from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.model_name)
        self.model = DistilBertForSequenceClassification.from_pretrained(self.model_name)
    elif self.model_name == 'distilbert-base-multilingual-cased':
        from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.model_name)
        self.model = DistilBertForSequenceClassification.from_pretrained(self.model_name)
    elif self.model_name == 'bert-base-uncased':
        from transformers import BertTokenizerFast, BertForSequenceClassification
        self.tokenizer = BertTokenizerFast.from_pretrained(self.model_name)
        self.model = BertForSequenceClassification.from_pretrained(self.model_name)
    elif self.model_name == 'bert-base-cased-finetuned-mrpc':
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
    elif self.model_name == 'bert-base-multilingual-cased':
        from transformers import BertTokenizerFast, BertForSequenceClassification
        self.tokenizer = BertTokenizerFast.from_pretrained(self.model_name)
        self.model = BertForSequenceClassification.from_pretrained(self.model_name)
    else:
        raise ValueError(f'Unrecognized model name: {self.model_name}')
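# Usage sketch, assuming load_model is a method of a wrapper class
# (`ModelWrapper` is a hypothetical name) that stores model_name on self:
wrapper = ModelWrapper('distilbert-base-uncased')
wrapper.load_model()
inputs = wrapper.tokenizer("some text", return_tensors="pt")
outputs = wrapper.model(**inputs)
print(outputs.logits.shape)  # (1, num_labels)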