def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
    input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)

    input_lengths = None
    if self.use_input_lengths:
        input_lengths = (
            ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
        )  # small variation of seq_length

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)

    sequence_labels = None
    token_labels = None
    is_impossible_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)

    config = XLMConfig(
        vocab_size=self.vocab_size,
        n_special=self.n_special,
        emb_dim=self.hidden_size,
        n_layers=self.num_hidden_layers,
        n_heads=self.num_attention_heads,
        dropout=self.hidden_dropout_prob,
        attention_dropout=self.attention_probs_dropout_prob,
        gelu_activation=self.gelu_activation,
        sinusoidal_embeddings=self.sinusoidal_embeddings,
        asm=self.asm,
        causal=self.causal,
        n_langs=self.n_langs,
        max_position_embeddings=self.max_position_embeddings,
        initializer_range=self.initializer_range,
        summary_type=self.summary_type,
        use_proj=self.use_proj,
        bos_token_id=self.bos_token_id,
    )

    return (
        config,
        input_ids,
        token_type_ids,
        input_lengths,
        sequence_labels,
        token_labels,
        is_impossible_labels,
        input_mask,
    )

def xlm_model():
    config = XLMConfig(
        vocab_size=93000,
        emb_dim=32,
        n_layers=5,
        n_heads=4,
        dropout=0.1,
        max_position_embeddings=512,
        lang2id={
            "ar": 0,
            "bg": 1,
            "de": 2,
            "el": 3,
            "en": 4,
            "es": 5,
            "fr": 6,
            "hi": 7,
            "ru": 8,
            "sw": 9,
            "th": 10,
            "tr": 11,
            "ur": 12,
            "vi": 13,
            "zh": 14,
        },
    )
    return XLMModel(config=config)

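# Hedged usage sketch (not part of the original fixture): a minimal forward pass through
# the randomly initialised XLMModel returned by xlm_model() above. The torch import,
# batch shape, and token ids are illustrative assumptions.
import torch

model = xlm_model()
input_ids = torch.randint(0, model.config.vocab_size, (1, 12))  # (batch_size, seq_len)
outputs = model(input_ids=input_ids)
last_hidden_state = outputs[0]  # shape: (1, 12, emb_dim=32)
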
def test_TFXLMForQuestionAnsweringSimple(self):
    from transformers import XLMConfig, TFXLMForQuestionAnsweringSimple
    keras.backend.clear_session()
    # pretrained_weights = 'xlm-mlm-enfr-1024'
    tokenizer_file = 'xlm_xlm-mlm-enfr-1024.pickle'
    tokenizer = self._get_tokenzier(tokenizer_file)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    config = XLMConfig()
    model = TFXLMForQuestionAnsweringSimple(config)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                                     predictions, self.model_files))

def __init__(self, vocabs: Dict[str, Vocabulary], config: Config, pre_load_model: bool = True):
    super().__init__(config=config)

    if pre_load_model:
        self.xlm = XLMModel.from_pretrained(
            self.config.model_name, output_hidden_states=True)
    else:
        xlm_config = XLMConfig.from_pretrained(
            self.config.model_name, output_hidden_states=True)
        self.xlm = XLMModel(xlm_config)

    self.source_lang_id = self.xlm.config.lang2id.get(
        self.config.source_language)
    self.target_lang_id = self.xlm.config.lang2id.get(
        self.config.target_language)
    if None in (self.source_lang_id, self.target_lang_id):
        raise ValueError(
            f'Invalid lang_id for XLM model.'
            f' Valid ids are: {self.xlm.config.lang2id.keys()}')

    self.mlp = None
    if self.config.use_mlp:
        self.mlp = nn.Sequential(
            nn.Linear(self.xlm.config.hidden_size, self.config.hidden_size),
            nn.Tanh(),
        )
        output_size = self.config.hidden_size
    else:
        output_size = self.xlm.config.hidden_size

    self._sizes = {
        const.TARGET: output_size,
        const.TARGET_LOGITS: output_size,
        const.TARGET_SENTENCE: 2 * output_size,
        const.SOURCE: output_size,
        const.SOURCE_LOGITS: output_size,
    }

    self.vocabs = {
        const.TARGET: vocabs[const.TARGET],
        const.SOURCE: vocabs[const.SOURCE],
    }

    self.output_embeddings = self.xlm.embeddings

    if self.config.freeze:
        for param in self.xlm.parameters():
            param.requires_grad = False

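# Hedged sketch (not part of the original class): one way the looked-up language ids can
# be expanded into the `langs` tensor that XLMModel.forward expects (one id per token).
# The helper name and the commented call are illustrative assumptions.
import torch

def make_langs_tensor(input_ids: torch.Tensor, lang_id: int) -> torch.Tensor:
    """Return a tensor shaped like input_ids, filled with a single language id."""
    return torch.full_like(input_ids, lang_id)

# e.g. encoding the target side of a batch:
#   target_langs = make_langs_tensor(target_ids, self.target_lang_id)
#   hidden_states = self.xlm(input_ids=target_ids, langs=target_langs)[0]
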
def __init__(self, config: Munch = None, **kwargs):
    """ Initialize a new XLM synapse module.

    Args:
        config (:obj:`munch.Munch`, `required`): munched config class.
    """
    super(XLMSynapse, self).__init__(config=config, **kwargs)
    if config is None:
        config = XLMSynapse.default_config()
    bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
    XLMSynapse.check_config(config)
    self.config = config

    # Build config.
    xlm_config = XLMConfig(
        vocab_size=bittensor.__vocab_size__,
        emb_dim=bittensor.__network_dim__,
        n_layers=config.synapse.n_layers,
        n_heads=config.synapse.n_heads,
        # More needed
    )

    # model layer: encodes tokenized sequences to network dim.
    self.xlm = XLMModel(xlm_config)

    # pooler layer: pools the hidden units for use by the pkm dendrite rpc query.
    self.pooler = XLMPooler(xlm_config)

    # router: (PKM layer) queries network using embeddings as context
    self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

    # hidden layer: transforms context and encoding to network dimension hidden units.
    self.hidden_layer = nn.Linear(bittensor.__network_dim__, bittensor.__network_dim__)

    # target layer: maps from hidden layer to vocab dimension for each token.
    self.target_layer = nn.Linear(bittensor.__network_dim__, bittensor.__vocab_size__, bias=False)

    # Loss function
    self.loss_fct = nn.CrossEntropyLoss()

    self.to(self.device)

def xlm_convert_to_huggingface(args):
    """
    Given a Facebook XLM model checkpoint and a BPE merges file, create and save
    a HuggingFace XLMTokenizer and XLMModel.
    """
    xlm_pth = torch.load(args.checkpoint, map_location=torch.device('cpu'))

    with NamedTemporaryFile() as tfile:
        tfile.write(b'{}')
        tfile.flush()
        tokenizer = XLMTokenizer(
            tfile.name, args.merges, do_lowercase_and_remove_accent=False)
    tokenizer.encoder = convert_vocab(xlm_pth['dico_word2id'])
    vocab_size = len(tokenizer)

    params = xlm_pth['params']
    xlm_config = XLMConfig(
        emb_dim=params['emb_dim'],
        vocab_size=params['n_words'],
        n_layers=params['n_layers'],
        n_heads=params['n_heads'],
        n_langs=params['n_langs'],
        sinusoidal_embeddings=params['sinusoidal_embeddings'],
        use_lang_emb=params['use_lang_emb'],
        is_encoder=params['encoder_only'],
        output_hidden_states=True,
        n_words=params['n_words'],
    )

    # Provide both config and state dict to model init
    model = XLMModel.from_pretrained(
        None, config=xlm_config, state_dict=xlm_pth['model'])

    # Save
    save_directory = Path(args.output_dir)
    if not save_directory.exists():
        save_directory.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(str(save_directory))
    tokenizer.save_pretrained(str(save_directory))
    tokenizer.save_vocabulary(str(save_directory))

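# Hedged usage sketch (assumed, not from the original script): a minimal CLI wrapper.
# The argument names --checkpoint, --merges, and --output_dir mirror the attributes
# accessed by xlm_convert_to_huggingface() above.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Convert a Facebook XLM checkpoint to HuggingFace format.')
    parser.add_argument('--checkpoint', required=True, help='Path to the XLM .pth checkpoint')
    parser.add_argument('--merges', required=True, help='Path to the BPE merges file')
    parser.add_argument('--output_dir', required=True, help='Where to save the converted model')
    xlm_convert_to_huggingface(parser.parse_args())
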
def get_config(self):
    return XLMConfig(
        vocab_size=self.vocab_size,
        n_special=self.n_special,
        emb_dim=self.hidden_size,
        n_layers=self.num_hidden_layers,
        n_heads=self.num_attention_heads,
        dropout=self.hidden_dropout_prob,
        attention_dropout=self.attention_probs_dropout_prob,
        gelu_activation=self.gelu_activation,
        sinusoidal_embeddings=self.sinusoidal_embeddings,
        asm=self.asm,
        causal=self.causal,
        n_langs=self.n_langs,
        max_position_embeddings=self.max_position_embeddings,
        initializer_range=self.initializer_range,
        summary_type=self.summary_type,
        use_proj=self.use_proj,
        num_labels=self.num_labels,
        bos_token_id=self.bos_token_id,
    )

    config = RobertaConfig(vocab_size=50265,
                           max_position_embeddings=514,
                           num_attention_heads=12,
                           num_hidden_layers=12,
                           type_vocab_size=1,
                           )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture
elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel
    config = XLMConfig(vocab_size=64139,
                       emb_dim=1024,
                       max_position_embeddings=512,
                       n_heads=8,
                       n_layers=6,
                       )
    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16', config=config)
    # 6-layer, 1024-hidden, 8-heads
    # XLM English-French model trained on the concatenation of English and French wikipedia
else:
    print('need to define LM from Bert,RoBerta,XLM')

print(model)

def freeze_layer_fun(freeze_layer):
    for name, param in model.named_parameters():

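# Hedged sketch (not in the original script): how the XLM branch above can be used for
# masked-token prediction once `tokenizer` and `model` are loaded. The sample sentence is
# an illustrative assumption; tokenizer.mask_token is used rather than hard-coding XLM's
# mask symbol.
import torch

text = f"Paris is the {tokenizer.mask_token} of France."
encoded = tokenizer(text, return_tensors='pt')
with torch.no_grad():
    logits = model(**encoded)[0]  # (1, seq_len, vocab_size)
mask_positions = (encoded['input_ids'][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
predicted_ids = logits[0, mask_positions].argmax(dim=-1)
print(tokenizer.decode(predicted_ids))
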
def load_model(args):
    if 'bert-base-multilingual' in args['model_checkpoint']:
        # bert-base-multilingual-uncased or bert-base-multilingual-cased
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-mlm' in args['model_checkpoint']:
        # xlm-mlm-100-1280
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMTokenizer.from_pretrained(args['model_checkpoint'])
        config = XLMConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-roberta' in args['model_checkpoint']:
        # xlm-roberta-base or xlm-roberta-large
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMRobertaTokenizer.from_pretrained(
            args['model_checkpoint'])
        config = XLMRobertaConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMRobertaForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMRobertaForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMRobertaForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'fasttext' in args['model_checkpoint']:
        # Prepare config & tokenizer
        vocab_path = args['vocab_path']
        config_path = None
        word_tokenizer = args['word_tokenizer_class']()
        emb_path = args['embedding_path'][args['model_checkpoint']]
        _, vocab_map = load_vocab(vocab_path)
        tokenizer = SimpleTokenizer(vocab_map, word_tokenizer, lower=args["lower"])
        vocab_list = list(tokenizer.vocab.keys())

        config = BertConfig.from_pretrained('bert-base-uncased')
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]

        embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300)
        config.hidden_size = 300
        config.num_attention_heads = 10
        config.vocab_size = len(embeddings)

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
    elif 'scratch' in args['model_checkpoint']:
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        config = BertConfig.from_pretrained("bert-base-uncased")
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]
        config.hidden_size = 300
        config.num_attention_heads = 10

        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config=config)
    elif 'indobenchmark' in args['model_checkpoint']:
        # indobenchmark models
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        model_class = None
        if 'sequence_classification' == args['task']:
            model_class = AlbertForSequenceClassification if 'lite' in args[
                'model_checkpoint'] else BertForSequenceClassification
        elif 'token_classification' == args['task']:
            model_class = AlbertForWordClassification if 'lite' in args[
                'model_checkpoint'] else BertForWordClassification
        elif 'multi_label_classification' == args['task']:
            model_class = AlbertForMultiLabelClassification if 'lite' in args[
                'model_checkpoint'] else BertForMultiLabelClassification
        model = model_class.from_pretrained(args['model_checkpoint'],
                                            config=config)
    return model, tokenizer, vocab_path, config_path

def load_eval_model(args):
    vocab_path = f'./{args["model_dir"]}/{args["dataset"]}/{args["experiment_name"]}/vocab.txt'
    config_path = f'./{args["model_dir"]}/{args["dataset"]}/{args["experiment_name"]}/config.json'
    model_path = f'./{args["model_dir"]}/{args["dataset"]}/{args["experiment_name"]}/best_model_0.th'

    # Load for word2vec and fasttext
    if 'word2vec' in args['model_type'] or 'fasttext' in args['model_type']:
        emb_path = args['embedding_path'][args['model_type']]
        model, tokenizer = load_word_embedding_model(
            args['model_type'], args['task'], vocab_path,
            args['word_tokenizer_class'], emb_path, args['num_labels'],
            lower=args['lower'])
        return model, tokenizer

    # Load config & tokenizer
    if 'albert' in args['model_type']:
        config = AlbertConfig.from_json_file(config_path)
        tokenizer = BertTokenizer(vocab_path)
    elif 'babert' in args['model_type']:
        config = BertConfig.from_json_file(config_path)
        tokenizer = BertTokenizer(vocab_path)
    elif 'scratch' in args['model_type']:
        config = BertConfig.from_pretrained('bert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    elif 'bert-base-multilingual' in args['model_type']:
        config = BertConfig.from_pretrained(args['model_type'])
        tokenizer = BertTokenizer.from_pretrained(args['model_type'])
    elif 'xlm-mlm-100-1280' in args['model_type']:
        config = XLMConfig.from_pretrained(args['model_type'])
        tokenizer = XLMTokenizer.from_pretrained(args['model_type'])
    elif 'xlm-roberta' in args['model_type']:
        config = XLMRobertaConfig.from_pretrained(args['model_type'])
        tokenizer = XLMRobertaTokenizer.from_pretrained(args['model_type'])
    else:
        raise ValueError('Invalid `model_type` argument values')

    # Get model class
    base_cls, pred_cls = get_model_class(args['model_type'], args['task'])

    # Adjust config
    if type(args['num_labels']) == list:
        config.num_labels = max(args['num_labels'])
        config.num_labels_list = args['num_labels']
    else:
        config.num_labels = args['num_labels']

    # Instantiate model
    model = pred_cls(config=config)
    base_model = base_cls.from_pretrained(model_path, from_tf=False, config=config)

    # Plug the pretrained base model into the classification model
    if 'bert' in model.__dir__():
        model.bert = base_model
    elif 'albert' in model.__dir__():
        model.albert = base_model
    elif 'roberta' in model.__dir__():
        model.roberta = base_model
    elif 'transformer' in model.__dir__():
        model.transformer = base_model
    else:
        raise ValueError(
            'Model attribute not found, is there any change in the `transformers` library?'
        )
    return model, tokenizer

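# Hedged usage sketch (assumed, not from the original code): the keys below mirror the
# args lookups inside load_eval_model(); the directory names and label count are
# illustrative placeholders.
eval_args = {
    'model_dir': 'save',
    'dataset': 'my_dataset',
    'experiment_name': 'exp0',
    'model_type': 'xlm-mlm-100-1280',
    'task': 'sequence_classification',
    'num_labels': 3,
}
model, tokenizer = load_eval_model(eval_args)
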
def load_model(args):
    if 'albert-large-wwmlm-512' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_vocab_uncased_30000.txt"
        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(
            "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_albert_large_config.json"
        )
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_pytorch_albert_large_512_629k.bin",
            from_tf=False, config=config)
        model.albert = albert_model
    elif 'albert-base-wwmlm-512' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_vocab_uncased_30000.txt"
        config_path = "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_albert_base_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_pytorch_model_albert_base_162k.bin",
            from_tf=False, config=config)
        model.albert = albert_model
    elif 'albert-large-wwmlm-128' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_vocab_uncased_30000.txt"
        config_path = "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_albert_large_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_pytorch_albert_large_128_500k.bin",
            from_tf=False, config=config)
        model.albert = albert_model
    elif 'babert-bpe-mlm-large-512' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_bert_large_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_pytorch_babert_uncased_large_512_dup10-5_1120k.bin",
            config=config)
        model.bert = bert_model.bert
    elif 'albert-base-uncased-112500' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-uncased-112500/vocab.txt"
        config_path = "../embeddings/albert-base-uncased-112500/bert_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-uncased-112500/albert_base_uncased_112500.bin",
            from_tf=False, config=config)
        model.albert = albert_model
    elif 'albert-base-uncased-96000' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-uncased-96000/vocab.txt"
        config_path = "../embeddings/albert-base-uncased-96000/bert_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-uncased-96000/albert_base_uncased_96000.bin",
            from_tf=False, config=config)
        model.albert = albert_model
    elif 'albert-base-uncased-191k' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_vocab_uncased_30000.txt"
        config_path = "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_albert_base_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_pytorch_model_albert_base_191k.bin",
            from_tf=False, config=config)
        model.albert = albert_model
    elif 'babert-opensubtitle' == args['model_checkpoint']:
        # babert-opensubtitle
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-opensubtitle/vocab.txt"
        config_path = "../embeddings/babert-opensubtitle/bert_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-opensubtitle/model.ckpt-1000000.index",
            from_tf=True, config=config)
        model.bert = bert_model.bert
    elif 'babert-bpe-mlm-large-uncased-1100k' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_bert_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_pytorch_model_babert_large_1100k.bin",
            config=config)
        model.bert = bert_model.bert
    elif 'babert-bpe-mlm-large-uncased-1m' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_bert_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_pytorch_model_babert_large_1mil.bin",
            config=config)
        model.bert = bert_model.bert
    elif 'babert-base-512' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-base-512/pytorch_models_babert_base_512_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-base-512/pytorch_models_babert_base_512_bert_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-base-512/pytorch_models_babert_base_512_pytorch_model_babert_base_uncased_512.bin",
            config=config)
        model.bert = bert_model.bert
    elif 'babert-bpe-mlm-large-uncased' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_bert_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_pytorch_model_babert_large_778500.bin",
            config=config)
        model.bert = bert_model.bert
    elif 'babert-bpe-mlm-uncased-128-dup10-5' == args['model_checkpoint']:
        # babert_bpe_wwmlm
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/vocab.txt"
        config_path = "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/bert_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/pytorch_model.bin",
            config=config)
        model.bert = bert_model.bert
    elif 'bert-base-multilingual' in args['model_checkpoint']:
        # bert-base-multilingual-uncased or bert-base-multilingual-cased
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-mlm' in args['model_checkpoint']:
        # xlm-mlm-100-1280
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMTokenizer.from_pretrained(args['model_checkpoint'])
        config = XLMConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-roberta' in args['model_checkpoint']:
        # xlm-roberta-base or xlm-roberta-large
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMRobertaTokenizer.from_pretrained(
            args['model_checkpoint'])
        config = XLMRobertaConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMRobertaForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMRobertaForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMRobertaForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'word2vec' in args['model_checkpoint'] or 'fasttext' in args['model_checkpoint']:
        # Prepare config & tokenizer
        vocab_path = args['vocab_path']
        config_path = None
        word_tokenizer = args['word_tokenizer_class']()
        emb_path = args['embedding_path'][args['model_checkpoint']]
        _, vocab_map = load_vocab(vocab_path)
        tokenizer = SimpleTokenizer(vocab_map, word_tokenizer, lower=args["lower"])
        vocab_list = list(tokenizer.vocab.keys())

        config = BertConfig.from_pretrained('bert-base-uncased')
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]

        if args['model_checkpoint'] == 'word2vec-twitter':
            embeddings = gen_embeddings(vocab_list, emb_path)
            config.hidden_size = 400
            config.num_attention_heads = 8

        if args['model_checkpoint'] == 'fasttext-cc-id' \
                or args['model_checkpoint'] == 'fasttext-cc-id-300-no-oov-uncased' \
                or args['model_checkpoint'] == 'fasttext-4B-id-300-no-oov-uncased':
            embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300)
            config.hidden_size = 300
            config.num_attention_heads = 10

        config.vocab_size = len(embeddings)

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
    elif 'scratch' in args['model_checkpoint']:
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        config = BertConfig.from_pretrained("bert-base-uncased")
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]
        config.hidden_size = 300
        config.num_attention_heads = 10

        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config=config)
    elif 'indobenchmark' in args['model_checkpoint']:
        # indobenchmark models
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    return model, tokenizer, vocab_path, config_path

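# Hedged usage sketch (assumed, not from the original code): calling load_model() so that
# the 'xlm-mlm' branch above is taken. The keys come from the args lookups in the function;
# the values are illustrative.
example_args = {
    'model_checkpoint': 'xlm-mlm-100-1280',
    'task': 'sequence_classification',
    'num_labels': 3,
}
model, tokenizer, vocab_path, config_path = load_model(example_args)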