def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
    """
    Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are
    added to it with indices starting from the length of the current vocabulary.

    Args:
        new_tokens (:obj:`List[str]` or :obj:`List[tokenizers.AddedToken]`):
            Token(s) to add to the vocabulary. A token is only added if it is not already in the vocabulary
            (tested by checking whether the tokenizer assigns it the index of the ``unk_token``).
        special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the tokens should be added as special tokens.

    Returns:
        :obj:`int`: The number of tokens actually added to the vocabulary.

    Examples::

        # Let's see how to increase the vocabulary of the Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')

        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
        print('We have added', num_added_toks, 'tokens')
        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary,
        # i.e. the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
    """
    new_tokens = [str(tok) for tok in new_tokens]

    tokens_to_add = []
    for token in new_tokens:
        assert isinstance(token, str)
        if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
            token = token.lower()
        if (token != self.unk_token
                and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
                and token not in tokens_to_add):
            tokens_to_add.append(token)
            if self.verbose:
                logger.info("Adding %s to the vocabulary", token)

    added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
    added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
    self.added_tokens_encoder.update(added_tok_encoder)
    self.added_tokens_decoder.update(added_tok_decoder)

    # Make sure we don't split on any special tokens (even if they were already in the vocab before, e.g. for Albert)
    if special_tokens:
        self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
    else:
        # Or on the newly added tokens
        self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))

    return len(tokens_to_add)
def is_valid_mbart(self) -> bool: """Is the configuration aligned with the MBART paper.""" if self.normalize_before and self.add_final_layer_norm and self.scale_embedding: return True if self.normalize_before or self.add_final_layer_norm or self.scale_embedding: logger.info( "This configuration is a mixture of MBART and BART settings") return False
def initialize_detector(self):
    t1 = time.time()
    if self.enable_rnnlm:
        self.lm = LM(self.rnnlm_model_dir, self.rnnlm_vocab_path)
        logger.debug('Loaded language model: %s, spend: %s s' % (self.rnnlm_model_dir, str(time.time() - t1)))
    else:
        try:
            import kenlm
        except ImportError:
            raise ImportError(
                'pycorrector dependencies are not fully installed; '
                'kenlm is required for the statistical language model. '
                'Please install it with "pip install kenlm" (not supported on Windows); '
                'on Windows, install tensorflow and set enable_rnnlm=True instead.')
        self.lm = kenlm.Model(self.language_model_path)
        logger.debug('Loaded language model: %s, spend: %s s' % (self.language_model_path, str(time.time() - t1)))

    # word frequency dict
    t2 = time.time()
    self.word_freq = self.load_word_freq_dict(self.word_freq_path)
    t3 = time.time()
    logger.debug('Loaded word freq file: %s, size: %d, spend: %s s' %
                 (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
    # custom confusion set
    self.custom_confusion = self._get_custom_confusion_dict(self.custom_confusion_path)
    t4 = time.time()
    logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                 (self.custom_confusion_path, len(self.custom_confusion), str(t4 - t3)))
    # custom word segmentation dict
    self.custom_word_freq = self.load_word_freq_dict(self.custom_word_freq_path)
    self.person_names = self.load_word_freq_dict(self.person_name_path)
    self.place_names = self.load_word_freq_dict(self.place_name_path)
    self.stopwords = self.load_word_freq_dict(self.stopwords_path)
    # merge the segmentation dict and the custom dicts
    self.custom_word_freq.update(self.person_names)
    self.custom_word_freq.update(self.place_names)
    self.custom_word_freq.update(self.stopwords)
    self.word_freq.update(self.custom_word_freq)
    t5 = time.time()
    logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                 (self.custom_word_freq_path, len(self.custom_word_freq), str(t5 - t4)))
    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_freq,
                               custom_confusion_dict=self.custom_confusion)
    t6 = time.time()
    logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
    self.initialized_detector = True
def infer(self, input_text):
    # one-hot encode the input characters
    encoder_input_data = np.zeros((1, self.max_input_texts_len, len(self.input_token_index)), dtype='float32')
    for i, char in enumerate(input_text):
        if char in self.input_token_index:
            encoder_input_data[0, i, self.input_token_index[char]] = 1.0
    # decode one sequence
    decoded_sentence = self._decode_sequence(encoder_input_data)
    logger.info('Input sentence: %s' % input_text)
    logger.info('Decoded sentence: %s' % decoded_sentence)
def save_pretrained(self, save_directory_or_file): """Save a model card object to the directory or file `save_directory_or_file`.""" if os.path.isdir(save_directory_or_file): # If we save using the predefined names, we can load using `from_pretrained` output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME) else: output_model_card_file = save_directory_or_file self.to_json_file(output_model_card_file) logger.info("Model card saved in {}".format(output_model_card_file))
def set_custom_word(self, path):
    self.check_detector_initialized()
    word_freqs = self.load_word_freq_dict(path)
    # merge the loaded dict into the custom word frequency dict
    self.custom_word_freq.update(word_freqs)
    # merge the segmentation dict and the custom dict
    self.word_freq.update(self.custom_word_freq)
    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_freq,
                               custom_confusion_dict=self.custom_confusion)
    for k, v in word_freqs.items():
        self.set_word_frequency(k, v)
    logger.info('Loaded custom word path: %s, size: %d' % (path, len(word_freqs)))
def args_parse(config_file=''): parser = argparse.ArgumentParser(description="csc") parser.add_argument("--config_file", default="train_macbert4csc.yml", help="path to config file", type=str) parser.add_argument( "--opts", help="Modify config options using the command-line key value", default=[], nargs=argparse.REMAINDER) args = parser.parse_args() config_file = args.config_file or config_file cfg.merge_from_file(config_file) cfg.merge_from_list(args.opts) cfg.freeze() logger.info(args) if config_file != '': logger.info("Loaded configuration file {}".format(config_file)) with open(config_file, 'r') as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) return cfg
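# Hedged usage sketch for args_parse (not part of the original file). The YAML file name is the
# parser's default above; the "MODEL.NAME" override key is purely illustrative and may not exist
# in the real config schema.
#
#   python train.py --config_file train_macbert4csc.yml --opts MODEL.NAME macbert4csc
#
# cfg = args_parse('train_macbert4csc.yml')
# print(cfg.MODEL.NAME)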
def callback(save_model_path, logger): # Print the batch number at the beginning of every batch. batch_print_callback = LambdaCallback( on_batch_begin=lambda batch, logs: logger.info(' batch: %d' % batch)) # define the checkpoint, save model checkpoint = ModelCheckpoint(save_model_path) early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=2) return [batch_print_callback, checkpoint, early_stop]
def save_preds(preds, test_ids, X_test, ids_word_dict, label_ids_dict, ids_label_dict, out_path, test_words, test_labels): with open(out_path, 'w', encoding='utf-8') as f: for i in range(len(X_test)): sent_ids = X_test[i] sid = test_ids[i] sentence = test_words[i] gold_error = test_labels[i] label = [] for j in range(len(sent_ids)): if sent_ids[j] != 0: label.append(preds[i][j]) print(label) continue_error = False has_error = False current_error = 0 start_pos = 0 for k in range(len(label)): error_label_id = is_error_label_id(label[k], label_ids_dict) if error_label_id and not continue_error: continue_error = True start_pos = k + 1 current_error = label[k] has_error = True if continue_error and label[ k] != current_error and not error_label_id: end_pos = k f.write( '%s\t%d\t%d\t%s\t%s\t%s\n' % (sid, start_pos, end_pos, ids_label_dict[current_error], sentence, gold_error)) continue_error = False current_error = 0 if continue_error and label[ k] != current_error and error_label_id: end_pos = k f.write( '%s\t%d\t%d\t%s\t%s\t%s\n' % (sid, start_pos, end_pos, ids_label_dict[current_error], sentence, gold_error)) start_pos = k + 1 current_error = label[k] if not has_error: f.write('%s\tcorrect\t%s\t%s\n' % (sid, sentence, gold_error)) logger.info('save to %s done, data size: %d' % (out_path, len(X_test)))
def __init__(self, vocab_path='', model_path='', src_seq_lens=128, trg_seq_lens=128,
             beam_size=5, batch_size=1, gpu_id=0):
    use_gpu = False
    if gpu_id > -1:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
        if torch.cuda.is_available():
            device = torch.device('cuda')
            use_gpu = True
        else:
            device = torch.device('cpu')
    else:
        device = torch.device('cpu')
    print('device:', device)
    # load vocab
    self.vocab2id = load_word_dict(vocab_path)
    self.id2vocab = {v: k for k, v in self.vocab2id.items()}
    logger.debug('Loaded vocabulary file: %s, size: %s' % (vocab_path, len(self.vocab2id)))
    # load model
    start_time = time.time()
    self.model = self._create_model(self.vocab2id, device)
    if use_gpu:
        self.model.load_state_dict(torch.load(model_path))
    else:
        # load all tensors onto the CPU
        self.model.load_state_dict(
            torch.load(model_path, map_location=lambda storage, loc: storage))
    logger.info("Loaded model: %s, spend: %s s" % (model_path, time.time() - start_time))
    self.model.eval()
    self.src_seq_lens = src_seq_lens
    self.trg_seq_lens = trg_seq_lens
    self.beam_size = beam_size
    self.batch_size = batch_size
    self.device = device
def from_encoder_decoder_configs(cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs) -> PretrainedConfig: r""" Instantiate a :class:`~transformers.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model configuration and decoder model configuration. Returns: :class:`EncoderDecoderConfig`: An instance of a configuration object """ logger.info( "Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config" ) decoder_config.is_decoder = True decoder_config.add_cross_attention = True return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
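# Hedged usage sketch for from_encoder_decoder_configs (assumes a transformers-style BertConfig
# is importable; the layer counts are illustrative, not part of the original file).
from transformers import BertConfig

enc_cfg = BertConfig(num_hidden_layers=4)
dec_cfg = BertConfig(num_hidden_layers=4)
enc_dec_cfg = EncoderDecoderConfig.from_encoder_decoder_configs(enc_cfg, dec_cfg)
# dec_cfg.is_decoder and dec_cfg.add_cross_attention are now both True.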
def __init__(self, tokenizer, args, data, mode): self.tokenizer = tokenizer cached_features_file = os.path.join( args.cache_dir, args.model_name + "_cached_" + str(args.max_seq_length) + str(len(data))) if os.path.exists(cached_features_file) and ( (not args.reprocess_input_data and not args.no_cache) or (mode == "dev" and args.use_cached_eval_features and not args.no_cache)): logger.info(" Loading features from cached file %s", cached_features_file) with open(cached_features_file, "rb") as handle: self.examples = pickle.load(handle) else: logger.info(" Creating features from dataset file at %s", args.cache_dir) data = [(input_text, target_text, tokenizer, args) for input_text, target_text in zip(data["input_text"], data["target_text"])] preprocess_fn = preprocess_data_mbart if args.model_type == "mbart" else preprocess_data_bart if (mode == "train" and args.use_multiprocessing) or ( mode == "dev" and args.use_multiprocessing_for_evaluation): if args.multiprocessing_chunksize == -1: chunksize = max(len(data) // (args.process_count * 2), 500) else: chunksize = args.multiprocessing_chunksize with Pool(args.process_count) as p: self.examples = list( tqdm( p.imap(preprocess_fn, data, chunksize=chunksize), total=len(data), disable=args.silent, )) else: self.examples = [ preprocess_fn(d) for d in tqdm(data, disable=args.silent) ]
def save_pretrained(self, save_directory: Union[str, os.PathLike]): """ Save a configuration object to the directory ``save_directory``, so that it can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method. Args: save_directory (:obj:`str` or :obj:`os.PathLike`): Directory where the configuration JSON file will be saved (will be created if it does not exist). """ if os.path.isfile(save_directory): raise AssertionError( "Provided path ({}) should be a directory, not a file".format( save_directory)) os.makedirs(save_directory, exist_ok=True) # If we save using the predefined names, we can load using `from_pretrained` output_config_file = os.path.join(save_directory, CONFIG_NAME) self.to_json_file(output_config_file, use_diff=True) logger.info("Configuration saved in {}".format(output_config_file))
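# Hedged usage sketch for save_pretrained (assumes the standard transformers BertConfig; the
# directory name is illustrative).
from transformers import BertConfig

config = BertConfig.from_pretrained("bert-base-uncased")
config.save_pretrained("./my_config_dir")          # writes ./my_config_dir/config.json
reloaded = BertConfig.from_pretrained("./my_config_dir")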
def initialize_detector(self):
    t1 = time.time()
    self.lm = kenlm.Model(self.language_model_path)
    t2 = time.time()
    logger.debug('Loaded language model: %s, spend: %s s' % (self.language_model_path, str(t2 - t1)))
    # word frequency dict
    self.word_freq = self.load_word_freq_dict(self.word_freq_path)
    t3 = time.time()
    logger.debug('Loaded word freq file: %s, size: %d, spend: %s s' %
                 (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
    # custom confusion set
    self.custom_confusion = self._get_custom_confusion_dict(self.custom_confusion_path)
    t4 = time.time()
    logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                 (self.custom_confusion_path, len(self.custom_confusion), str(t4 - t3)))
    # custom word segmentation dict
    self.custom_word_freq = self.load_word_freq_dict(self.custom_word_freq_path)
    self.person_names = self.load_word_freq_dict(self.person_name_path)
    self.place_names = self.load_word_freq_dict(self.place_name_path)
    self.stopwords = self.load_word_freq_dict(self.stopwords_path)
    # merge the segmentation dict and the custom dicts
    self.custom_word_freq.update(self.person_names)
    self.custom_word_freq.update(self.place_names)
    self.custom_word_freq.update(self.stopwords)
    self.word_freq.update(self.custom_word_freq)
    t5 = time.time()
    logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                 (self.custom_word_freq_path, len(self.custom_word_freq), str(t5 - t4)))
    logger.debug('Loaded all word freq file done, size: %d' % len(self.word_freq))
    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_freq,
                               custom_confusion_dict=self.custom_confusion)
    t6 = time.time()
    logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
    self.initialized_detector = True
def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig": """ Instantiates a :class:`~transformers.PretrainedConfig` from a Python dictionary of parameters. Args: config_dict (:obj:`Dict[str, Any]`): Dictionary that will be used to instantiate the configuration object. Such a dictionary can be retrieved from a pretrained checkpoint by leveraging the :func:`~transformers.PretrainedConfig.get_config_dict` method. kwargs (:obj:`Dict[str, Any]`): Additional parameters from which to initialize the configuration object. Returns: :class:`PretrainedConfig`: The configuration object instantiated from those parameters. """ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) config = cls(**config_dict) if hasattr(config, "pruned_heads"): config.pruned_heads = dict( (int(key), value) for key, value in config.pruned_heads.items()) # Update config with kwargs if needed to_remove = [] for key, value in kwargs.items(): if hasattr(config, key): setattr(config, key, value) to_remove.append(key) for key in to_remove: kwargs.pop(key, None) logger.info("Model config %s", str(config)) if return_unused_kwargs: return config, kwargs else: return config
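# Hedged usage sketch for from_dict (assumes the standard transformers BertConfig; the
# `unknown_flag` key is illustrative). Recognised keys in kwargs update the config, unknown ones
# are handed back when return_unused_kwargs=True.
from transformers import BertConfig

config, unused = BertConfig.from_dict(
    {"hidden_size": 256, "num_attention_heads": 4},
    num_hidden_layers=2,   # recognised attribute: applied to the config
    unknown_flag=True,     # not a config attribute: returned in `unused`
    return_unused_kwargs=True,
)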
def __init__(self, config=None): train_path = config.train_path encoder_model_path = config.encoder_model_path decoder_model_path = config.decoder_model_path save_input_token_path = config.input_vocab_path save_target_token_path = config.target_vocab_path # load dict self.input_token_index = load_word_dict(save_input_token_path) self.target_token_index = load_word_dict(save_target_token_path) data_reader = CGEDReader(train_path) input_texts, target_texts = data_reader.build_dataset(train_path) self.max_input_texts_len = max([len(text) for text in input_texts]) self.max_target_texts_len = max([len(text) for text in target_texts]) logger.info("Data loaded.") # load model self.encoder_model = load_model(encoder_model_path) self.decoder_model = load_model(decoder_model_path) logger.info("Loaded seq2seq model.") self.graph = tf.get_default_graph()
def train(train_word_path=None, train_label_path=None, word_dict_path=None, label_dict_path=None, save_model_path=None, batch_size=64, dropout=0.5, epoch=10, embedding_dim=100, rnn_hidden_dim=200, maxlen=300, cutoff_frequency=0): """ Train the bilstm_crf model for grammar correction. """ # build the word dictionary build_dict(train_word_path, word_dict_path, cutoff_frequency, insert_extra_words=[UNK_TOKEN, PAD_TOKEN]) # build the label dictionary build_dict(train_label_path, label_dict_path) # load dict word_ids_dict = load_dict(word_dict_path) label_ids_dict = load_dict(label_dict_path) # read data to index word_ids = vectorize_data(train_word_path, word_ids_dict) label_ids = vectorize_data(train_label_path, label_ids_dict) max_len = np.max([len(i) for i in word_ids]) print('max_len:', max_len) # pad sequence word_seq = pad_sequence(word_ids, maxlen=maxlen) label_seq = pad_sequence(label_ids, maxlen=maxlen) # reshape label for crf model use label_seq = np.reshape(label_seq, (label_seq.shape[0], label_seq.shape[1], 1)) print(word_seq.shape) print(label_seq.shape) logger.info("Data loaded.") # model logger.info("Training BILSTM_CRF model...") model = create_model(word_ids_dict, label_ids_dict, embedding_dim, rnn_hidden_dim, dropout) # callback callbacks_list = callback(save_model_path, logger) # fit model.fit(word_seq, label_seq, batch_size=batch_size, epochs=epoch, validation_split=0.2, callbacks=callbacks_list) logger.info("Training has finished.")
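# Hedged usage sketch for train (file paths are placeholders, not files shipped with the project):
if __name__ == "__main__":
    train(train_word_path="output/train_words.txt",
          train_label_path="output/train_labels.txt",
          word_dict_path="output/word_dict.txt",
          label_dict_path="output/label_dict.txt",
          save_model_path="output/bilstm_crf_model.h5",
          batch_size=64,
          epoch=10,
          maxlen=300)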
def validation_epoch_end(self, outputs) -> None: det_acc_labels = [] cor_acc_labels = [] results = [] for out in outputs: det_acc_labels += out[1] cor_acc_labels += out[2] results += out[3] loss = np.mean([out[0] for out in outputs]) self.log('val_loss', loss) logger.info(f'loss: {loss}') logger.info(f'Detection:\n' f'acc: {np.mean(det_acc_labels):.4f}') logger.info(f'Correction:\n' f'acc: {np.mean(cor_acc_labels):.4f}') compute_corrector_prf(results, logger) compute_sentence_level_prf(results, logger)
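# Hedged note on the shape of `outputs` (inferred from the indexing above, not taken from the
# original validation_step): each element is assumed to be a 4-tuple
# (loss, det_acc_labels, cor_acc_labels, results), e.g.
#
#   outputs = [(0.42, [1, 0, 1], [1, 1, 0], [("src", "tgt", "pred")])]
#
# where `results` holds (source, target, prediction) triples consumed by compute_corrector_prf.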
def __init__(self, encoder_tokenizer, decoder_tokenizer, args, data, mode): cached_features_file = os.path.join( args.cache_dir, args.model_name.replace("/", "_") + "_cached_" + str(args.max_seq_length) + str(len(data))) if os.path.exists(cached_features_file) and ( (not args.reprocess_input_data and not args.no_cache) or (mode == "dev" and args.use_cached_eval_features and not args.no_cache)): logger.info(" Loading features from cached file %s", cached_features_file) with open(cached_features_file, "rb") as handle: self.examples = pickle.load(handle) else: logger.info(" Creating features from dataset file at %s", args.cache_dir) data = [(input_text, target_text, encoder_tokenizer, decoder_tokenizer, args) for input_text, target_text in zip(data["input_text"], data["target_text"])] if (mode == "train" and args.use_multiprocessing) or ( mode == "dev" and args.use_multiprocessing_for_evaluation): if args.multiprocessing_chunksize == -1: chunksize = max(len(data) // (args.process_count * 2), 500) else: chunksize = args.multiprocessing_chunksize with Pool(args.process_count) as p: self.examples = list( tqdm( p.imap(preprocess_data, data, chunksize=chunksize), total=len(data), disable=args.silent, )) else: self.examples = [ preprocess_data(d) for d in tqdm(data, disable=args.silent) ] if not args.no_cache: logger.info(" Saving features into cached file %s", cached_features_file) with open(cached_features_file, "wb") as handle: pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
def get_config_dict(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a :class:`~transformers.PretrainedConfig` using ``from_dict``. Parameters: pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. Returns: :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object. """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) use_auth_token = kwargs.pop("use_auth_token", None) local_files_only = kwargs.pop("local_files_only", False) revision = kwargs.pop("revision", None) pretrained_model_name_or_path = str(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url( pretrained_model_name_or_path): config_file = pretrained_model_name_or_path else: config_file = hf_bucket_url(pretrained_model_name_or_path, filename=CONFIG_NAME, revision=revision, mirror=None) try: # Load from URL or cache if already cached resolved_config_file = cached_path( config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, use_auth_token=use_auth_token, ) # Load config dict config_dict = cls._dict_from_json_file(resolved_config_file) except EnvironmentError as err: logger.error(err) msg = ( f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n" f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n" ) raise EnvironmentError(msg) except json.JSONDecodeError: msg = ( "Couldn't reach server at '{}' to download configuration file or " "configuration file is not a valid JSON file. " "Please check network or file content here: {}.".format( config_file, resolved_config_file)) raise EnvironmentError(msg) if resolved_config_file == config_file: logger.info("loading configuration file {}".format(config_file)) else: logger.info( "loading configuration file {} from cache at {}".format( config_file, resolved_config_file)) return config_dict, kwargs
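# Hedged usage sketch for get_config_dict (assumes the standard transformers package and network
# access; the model id is a public checkpoint and the cache directory is illustrative).
from transformers import BertConfig

config_dict, remaining_kwargs = BertConfig.get_config_dict("bert-base-uncased", cache_dir="./hf_cache")
print(config_dict["hidden_size"], remaining_kwargs)   # e.g. 768 {}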
def set_custom_confusion_dict(self, path): self.check_detector_initialized() custom_confusion = self._get_custom_confusion_dict(path) self.custom_confusion.update(custom_confusion) logger.info('Loaded confusion path: %s, size: %d' % (path, len(custom_confusion)))
def set_language_model_path(self, path): self.check_detector_initialized() import kenlm self.lm = kenlm.Model(path) logger.info('Loaded language model: %s' % path)
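# Hedged usage sketch for the runtime setters above (set_custom_word, set_custom_confusion_dict,
# set_language_model_path). The import path follows pycorrector's layout; the file names are
# illustrative user-supplied overrides.
from pycorrector.detector import Detector

d = Detector()
d.set_language_model_path("./data/people_chars_lm.klm")
d.set_custom_confusion_dict("./data/my_confusion.txt")
d.set_custom_word("./data/my_custom_words.txt")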
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--bert_model_dir", default=None, type=str, required=True, help="Bert pre-trained model config dir") parser.add_argument("--bert_model_vocab", default=None, type=str, required=True, help="Bert pre-trained model vocab path") parser.add_argument( "--output_dir", default="./output", type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) # Other parameters parser.add_argument("--predict_file", default=None, type=str, help="for predictions.") parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() device = torch.device("cpu") random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer(args.bert_model_vocab) # Prepare model model = BertForMaskedLM.from_pretrained(args.bert_model_dir) # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if not os.path.exists(output_model_file): torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) model.to(device) # Tokenized input text = "吸 烟 的 人 容 易 得 癌 症" print(text) tokenized_text = tokenizer.tokenize(text) # Mask a token that we will try to predict back with `BertForMaskedLM` masked_index = 8 tokenized_text[masked_index] = '[MASK]' print(tokenized_text) # Convert token to vocabulary indices indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) # Define sentence A and B indices associated to 1st and 2nd sentences (see paper) segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 0] # Convert inputs to PyTorch tensors tokens_tensor = torch.tensor([indexed_tokens]) segments_tensors = torch.tensor([segments_ids]) # Load pre-trained model (weights) model.eval() # Predict all tokens predictions = model(tokens_tensor, segments_tensors) # confirm we were able to predict 'henson' predicted_index = torch.argmax(predictions[0, masked_index]).item() print(predicted_index) predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] print(predicted_token) # infer one line end if args.predict_file: eval_examples = read_lm_examples(input_file=args.predict_file) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info("Start predict ...") for f in 
eval_features: input_ids = torch.tensor([f.input_ids]) segment_ids = torch.tensor([f.segment_ids]) predictions = model(input_ids, segment_ids) # confirm we were able to predict 'henson' masked_ids = f.mask_ids if masked_ids: print(masked_ids) for idx, i in enumerate(masked_ids): predicted_index = torch.argmax(predictions[0, i]).item() predicted_token = tokenizer.convert_ids_to_tokens( [predicted_index])[0] print('original text is:', f.input_tokens) print('Mask predict is:', predicted_token)
def convert_examples_to_features(examples, tokenizer, max_seq_length):
    """Loads a data file into a list of `InputBatch`s."""
    features = []
    for (example_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # The -3 accounts for [CLS], [SEP] and [SEP]
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # The -2 accounts for [CLS] and [SEP]
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens_a = [i.replace('*', MASK_TOKEN) for i in tokens_a]
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens_b = [i.replace('*', '[MASK]') for i in tokens_b]
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        mask_ids = [i for i, v in enumerate(input_ids) if v == MASK_ID]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if example_index < 5:
            logger.info("*** Example ***")
            logger.info("example_index: %s" % example_index)
            logger.info("guid: %s" % example.guid)
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          mask_ids=mask_ids,
                          segment_ids=segment_ids,
                          input_tokens=tokens))
    return features
def train(train_path=config.train_path, output_dir=config.output_dir, save_model_dir=config.save_model_dir, vocab_path=config.vocab_path, val_path=config.val_path, vocab_max_size=config.vocab_max_size, vocab_min_count=config.vocab_min_count, batch_size=config.batch_size, epochs=config.epochs, learning_rate=0.0001, src_emb_dim=128, trg_emb_dim=128, src_hidden_dim=256, trg_hidden_dim=256, src_num_layers=1, batch_first=True, src_bidirection=True, dropout=0.0, attn_method='luong_concat', repetition='vanilla', network='lstm', pointer_net=True, attn_decoder=True, shared_embedding=True, share_emb_weight=True, src_seq_lens=128, trg_seq_lens=128, grad_clip=2.0, save_model_batch_num=config.save_model_batch_num, gpu_id=config.gpu_id): print('Training model...') if gpu_id > -1: os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_id) if torch.cuda.is_available(): device = torch.device('cuda') else: device = torch.device('cpu') else: device = torch.device('cpu') print('device:', device) source_texts, target_texts = build_dataset(train_path) vocab2id = read_vocab(source_texts + target_texts, max_size=vocab_max_size, min_count=vocab_min_count) num_encoder_tokens = len(vocab2id) max_input_texts_len = max([len(text) for text in source_texts]) print('source_texts:', source_texts[0]) print('target_texts:', target_texts[0]) print('num of samples:', len(source_texts)) print('num of unique input tokens:', num_encoder_tokens) print('max sequence length for inputs:', max_input_texts_len) id2vocab = {v: k for k, v in vocab2id.items()} # save word dict save_word_dict(vocab2id, vocab_path) print('The vocabulary file:%s, size: %s' % (vocab_path, len(vocab2id))) model = Seq2Seq( src_emb_dim=src_emb_dim, trg_emb_dim=trg_emb_dim, src_hidden_dim=src_hidden_dim, trg_hidden_dim=trg_hidden_dim, src_vocab_size=len(vocab2id), trg_vocab_size=len(vocab2id), src_nlayer=src_num_layers, batch_first=batch_first, src_bidirect=src_bidirection, dropout=dropout, attn_method=attn_method, repetition=repetition, network=network, pointer_net=pointer_net, shared_emb=shared_embedding, attn_decoder=attn_decoder, share_emb_weight=share_emb_weight, device=device ).to(device) print(model) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # read the last check point and continue training uf_model = [0, -1] if not os.path.exists(save_model_dir): os.mkdir(save_model_dir) model_para_files = glob.glob(os.path.join(save_model_dir, '*.model')) if len(model_para_files) > 0: uf_model = [] for fl_ in model_para_files: arr = re.split('\/', fl_)[-1] arr = re.split('\_|\.', arr) uf_model.append([int(arr[1]), int(arr[2])]) uf_model = sorted(uf_model)[-1] fl_ = os.path.join(save_model_dir, 'seq2seq_' + str(uf_model[0]) + '_' + str(uf_model[1]) + '.model') model.load_state_dict(torch.load(fl_)) # train models losses = [] start_time = time.time() last_model_path = '' model.train() for epoch in range(uf_model[0], epochs): n_batch = create_batch_file(output_dir, file_type='train', file_path=train_path, batch_size=batch_size) print('The number of batches: {}'.format(n_batch)) for batch_id in range(n_batch): ext_id2oov, src_arr, trg_input_arr, src_arr_ex, trg_output_arr_ex = process_minibatch_explicit( batch_id=batch_id, output_dir=output_dir, file_type='train', batch_size=batch_size, vocab2id=vocab2id, max_lens=[src_seq_lens, trg_seq_lens]) src_var = Variable(torch.LongTensor(src_arr)) trg_input_var = Variable(torch.LongTensor(trg_input_arr)) # extend oov src_var_ex = Variable(torch.LongTensor(src_arr_ex)) trg_output_var_ex = 
Variable(torch.LongTensor(trg_output_arr_ex)) src_var = src_var.to(device) trg_input_var = trg_input_var.to(device) src_var_ex = src_var_ex.to(device) trg_output_var_ex = trg_output_var_ex.to(device) weight_mask = torch.ones(len(vocab2id) + len(ext_id2oov)).to(device) weight_mask[vocab2id[PAD_TOKEN]] = 0 loss_criterion = torch.nn.NLLLoss(weight=weight_mask).to(device) logits, attn_, p_gen, loss_cv = model(src_var, trg_input_var) logits = torch.softmax(logits, dim=2) # use the pointer generator loss if len(ext_id2oov) > 0: logits = model.cal_dist_explicit(src_var_ex, logits, attn_, p_gen, vocab2id, ext_id2oov) logits = logits + 1e-20 else: logits = model.cal_dist(src_var, logits, attn_, p_gen, vocab2id) if batch_id % 1 == 0: word_prob = logits.topk(1, dim=2)[1].squeeze(2).data.cpu().numpy() logits = torch.log(logits) loss = loss_criterion( logits.contiguous().view(-1, len(vocab2id) + len(ext_id2oov)), trg_output_var_ex.view(-1)) optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) optimizer.step() end_time = time.time() losses.append([ epoch, batch_id, loss.data.cpu().numpy(), loss_cv.data.cpu().numpy()[0], (end_time - start_time) / 3600.0]) if batch_id % save_model_batch_num == 0: model_path = os.path.join(save_model_dir, 'seq2seq_' + str(epoch) + '_' + str(batch_id) + '.model') with open(model_path, 'wb') as f: torch.save(model.state_dict(), f) logger.info("Model save to " + model_path) if batch_id % 1 == 0: end_time = time.time() sen_pred = [id2vocab[x] if x in id2vocab else ext_id2oov[x] for x in word_prob[0]] print('epoch={}, batch={}, loss={}, loss_cv={}, time_escape={}s={}h'.format( epoch, batch_id, loss.data.cpu().numpy(), loss_cv.data.cpu().numpy()[0], end_time - start_time, (end_time - start_time) / 3600.0 )) print(' '.join(sen_pred)) del logits, attn_, p_gen, loss_cv, loss with open(os.path.join(save_model_dir, 'loss.txt'), 'a', encoding='utf-8') as f: for i in losses: f.write(str(i) + '\n') model_path = os.path.join(save_model_dir, 'seq2seq_' + str(epoch) + '_' + str(batch_id) + '.model') with open(model_path, 'wb') as f: torch.save(model.state_dict(), f) logger.info("Model save to " + model_path) last_model_path = model_path logger.info("Training has finished.") # Eval model eval(model, last_model_path, val_path, output_dir, batch_size, vocab2id, src_seq_lens, trg_seq_lens, device) logger.info("Eval has finished.")
def test_epoch_end(self, outputs) -> None: logger.info('Test.') self.validation_epoch_end(outputs)
import os
from urllib.parse import urlparse
from zipfile import ZipFile, is_zipfile

import numpy as np
import requests
from filelock import FileLock
from tqdm.auto import tqdm

from pycorrector.utils.logger import logger

ENV_VARS_TRUE_VALUES = {"1", "ON", "YES"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})

import torch

_torch_available = True  # pylint: disable=invalid-name
logger.info("PyTorch version {} available.".format(torch.__version__))

_tf_available = False

try:
    USE_JAX = os.environ.get("USE_FLAX", "AUTO").upper()
    if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
        import flax
        import jax

        logger.info("JAX version {}, Flax: available".format(jax.__version__))
        logger.info("Flax available: {}".format(flax))
        _flax_available = True
    else:
        _flax_available = False
except ImportError:
    _flax_available = False
def get_from_cache( url: str, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent: Union[Dict, str, None] = None, use_auth_token: Union[bool, str, None] = None, local_files_only=False, ) -> Optional[str]: """ Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the path to the cached file. Return: Local path (string) of file or if networking is off, last version of file cached on disk. Raises: In case of non-recoverable file (non-existent or inaccessible url + no cache on disk). """ if cache_dir is None: cache_dir = TRANSFORMERS_CACHE if isinstance(cache_dir, Path): cache_dir = str(cache_dir) os.makedirs(cache_dir, exist_ok=True) headers = {"user-agent": http_user_agent(user_agent)} if isinstance(use_auth_token, str): headers["authorization"] = "Bearer {}".format(use_auth_token) url_to_download = url etag = None if not local_files_only: try: r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout) r.raise_for_status() etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag") # We favor a custom header indicating the etag of the linked resource, and # we fallback to the regular etag header. # If we don't have any of those, raise an error. if etag is None: raise OSError( "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." ) # In case of a redirect, # save an extra redirect on the request.get call, # and ensure we download the exact atomic version even if it changed # between the HEAD and the GET (unlikely, but hey). if 300 <= r.status_code <= 399: url_to_download = r.headers["Location"] except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): # etag is already None pass filename = url_to_filename(url, etag) # get cache path to put the file cache_path = os.path.join(cache_dir, filename) # etag is None == we don't have a connection or we passed local_files_only. # try to get the last downloaded one if etag is None: if os.path.exists(cache_path): return cache_path else: matching_files = [ file for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*") if not file.endswith(".json") and not file.endswith(".lock") ] if len(matching_files) > 0: return os.path.join(cache_dir, matching_files[-1]) else: # If files cannot be found and local_files_only=True, # the models might've been found if local_files_only=False # Notify the user about that if local_files_only: raise ValueError( "Cannot find the requested files in the cached path and outgoing traffic has been" " disabled. To enable model look-ups and downloads online, set 'local_files_only'" " to False." ) else: raise ValueError( "Connection error, and we cannot find the requested files in the cached path." " Please try again or make sure your Internet connection is on." ) # From now on, etag is not None. if os.path.exists(cache_path) and not force_download: return cache_path # Prevent parallel downloads of the same file with a lock. lock_path = cache_path + ".lock" with FileLock(lock_path): # If the download just completed while the lock was activated. if os.path.exists(cache_path) and not force_download: # Even if returning early like here, the lock will be released. 
return cache_path if resume_download: incomplete_path = cache_path + ".incomplete" @contextmanager def _resumable_file_manager() -> "io.BufferedWriter": with open(incomplete_path, "ab") as f: yield f temp_file_manager = _resumable_file_manager if os.path.exists(incomplete_path): resume_size = os.stat(incomplete_path).st_size else: resume_size = 0 else: temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False) resume_size = 0 # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. with temp_file_manager() as temp_file: logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, headers=headers) logger.info("storing %s in cache at %s", url, cache_path) os.replace(temp_file.name, cache_path) logger.info("creating metadata file for %s", cache_path) meta = {"url": url, "etag": etag} meta_path = cache_path + ".json" with open(meta_path, "w") as meta_file: json.dump(meta, meta_file) return cache_path
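# Hedged usage sketch for get_from_cache (requires network access; the URL points at a public
# Hugging Face hosted file and the cache directory is illustrative).
path = get_from_cache(
    "https://huggingface.co/bert-base-uncased/resolve/main/config.json",
    cache_dir="./hf_cache",
)
print(path)   # local path of the cached (ETag-versioned) config.json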
def evaluate(encoder_model, decoder_model, num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim, target_token_index, max_decoder_seq_length, encoder_input_data, input_texts): # Define an input sequence and process it. encoder_inputs = Input(shape=(None, num_encoder_tokens)) encoder = LSTM(rnn_hidden_dim, return_state=True) encoder_outputs, state_h, state_c = encoder(encoder_inputs) # We discard `encoder_outputs` and only keep the states. encoder_states = [state_h, state_c] # Set up the decoder, using `encoder_states` as initial state. decoder_inputs = Input(shape=(None, num_decoder_tokens)) # We set up our decoder to return full output sequences, # and to return internal states as well. We don't use the # return states in the training model, but we will use them in inference. decoder_lstm = LSTM(rnn_hidden_dim, return_sequences=True, return_state=True) decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) # Reverse-lookup token index to decode sequences back to # something readable. reverse_target_char_index = dict( (i, char) for char, i in target_token_index.items()) def decode_seq(input_seq): decoded_sentence = '' # Encode the input as state vectors. states_value = encoder_model.predict(input_seq) # Generate empty target sequence of length 1. target_seq = np.zeros((1, 1, num_decoder_tokens)) # Populate the first character of target sequence with the start character. first_char = GO_TOKEN target_seq[0, 0, target_token_index[first_char]] = 1. # Sampling loop for a batch of sequences # (to simplify, here we assume a batch of size 1). stop_condition = False while not stop_condition: output_tokens, h, c = decoder_model.predict([target_seq] + states_value) # Sample a token sampled_token_index = np.argmax(output_tokens[0, -1, :]) sampled_char = reverse_target_char_index[sampled_token_index] if sampled_char != EOS_TOKEN: decoded_sentence += sampled_char # Exit condition: either hit max length # or find stop character. if (sampled_char == EOS_TOKEN or len(decoded_sentence) > max_decoder_seq_length): stop_condition = True # Update the target sequence (of length 1). target_seq = np.zeros((1, 1, num_decoder_tokens)) target_seq[0, 0, sampled_token_index] = 1. # Update states states_value = [h, c] return decoded_sentence for seq_index in range(10): # Take one sequence (part of the training set) # for trying out decoding. input_seq = encoder_input_data[seq_index:seq_index + 1] output_text = decode_seq(input_seq) logger.info('Input sentence:%s' % input_texts[seq_index]) logger.info('Decoded sentence:%s' % output_text)
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], dtype: jnp.dtype = jnp.float32, *model_args, **kwargs): r""" Instantiate a pretrained flax model from a pre-trained model configuration. The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning task. The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those weights are discarded. Parameters: pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): Can be either: - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. - A path to a `directory` containing model weights saved using :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. - A path or url to a `pt index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In this case, ``from_pt`` should be set to :obj:`True`. model_args (sequence of positional arguments, `optional`): All remaning positional arguments will be passed to the underlying model's ``__init__`` method. config (:obj:`Union[PretrainedConfig, str, os.PathLike]`, `optional`): Can be either: - an instance of a class derived from :class:`~transformers.PretrainedConfig`, - a string or path valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`. Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - The model is a model provided by the library (loaded with the `model id` string of a pretrained model). - The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory. - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. cache_dir (:obj:`Union[str, os.PathLike]`, `optional`): Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used. from_pt (:obj:`bool`, `optional`, defaults to :obj:`False`): Load the model weights from a PyTorch checkpoint save file (see docstring of ``pretrained_model_name_or_path`` argument). force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to delete incompletely received files. Will attempt to resume the download if such a file exists. proxies (:obj:`Dict[str, str], `optional`): A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to only look at local files (i.e., do not try to download the model). revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): The specific model version to use. 
It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git. kwargs (remaining dictionary of keyword arguments, `optional`): Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: >>> from transformers import BertConfig, FlaxBertModel >>> # Download model and configuration from huggingface.co and cache. >>> model = FlaxBertModel.from_pretrained('bert-base-cased') >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable). >>> model = FlaxBertModel.from_pretrained('./test/saved_model/') >>> # Loading from a PyTorch checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable). >>> config = BertConfig.from_json_file('./pt_model/config.json') >>> model = FlaxBertModel.from_pretrained('./pt_model/pytorch_model.bin', from_pt=True, config=config) """ config = kwargs.pop("config", None) cache_dir = kwargs.pop("cache_dir", None) from_pt = kwargs.pop("from_pt", False) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) use_auth_token = kwargs.pop("use_auth_token", None) revision = kwargs.pop("revision", None) # Load config if we don't provide a configuration if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( config_path, *model_args, cache_dir=cache_dir, return_unused_kwargs=True, force_download=force_download, resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, use_auth_token=use_auth_token, revision=revision, **kwargs, ) else: model_kwargs = kwargs # Add the dtype to model_kwargs model_kwargs["dtype"] = dtype # Load model if pretrained_model_name_or_path is not None: if os.path.isdir(pretrained_model_name_or_path): if from_pt and os.path.isfile( os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): # Load from a PyTorch checkpoint archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) elif os.path.isfile( os.path.join(pretrained_model_name_or_path, FLAX_WEIGHTS_NAME)): # Load from a Flax checkpoint archive_file = os.path.join(pretrained_model_name_or_path, FLAX_WEIGHTS_NAME) else: raise EnvironmentError( "Error no file named {} found in directory {} or `from_pt` set to False" .format( [FLAX_WEIGHTS_NAME, WEIGHTS_NAME], pretrained_model_name_or_path, )) elif os.path.isfile( pretrained_model_name_or_path) or is_remote_url( 
pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path else: archive_file = hf_bucket_url( pretrained_model_name_or_path, filename=WEIGHTS_NAME if from_pt else FLAX_WEIGHTS_NAME, revision=revision, ) # redirect to the cache, if necessary try: resolved_archive_file = cached_path( archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, use_auth_token=use_auth_token, ) except EnvironmentError as err: logger.error(err) msg = ( f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n" f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named {WEIGHTS_NAME}.\n\n" ) raise EnvironmentError(msg) if resolved_archive_file == archive_file: logger.info(f"loading weights file {archive_file}") else: logger.info( f"loading weights file {archive_file} from cache at {resolved_archive_file}" ) else: resolved_archive_file = None # Instantiate model. with open(resolved_archive_file, "rb") as state_f: try: if from_pt: import torch state = torch.load(state_f) state = convert_state_dict_from_pt(cls, state, config) else: state = from_bytes(cls, state_f.read()) except UnpicklingError: raise EnvironmentError( f"Unable to convert pytorch model {archive_file} to Flax deserializable object. " ) # init random models model = cls(config, *model_args, **model_kwargs) # if model is base model only use model_prefix key if cls.base_model_prefix not in dict( model.params) and cls.base_model_prefix in state: state = state[cls.base_model_prefix] # flatten dicts state = flatten_dict(state) random_state = flatten_dict(unfreeze(model.params)) missing_keys = model.required_params - set(state.keys()) unexpected_keys = set(state.keys()) - model.required_params # add missing keys as random parameters for missing_key in missing_keys: state[missing_key] = random_state[missing_key] if len(unexpected_keys) > 0: logger.warning( f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when " f"initializing {model.__class__.__name__}: {unexpected_keys}\n" f"- This IS expected if you are initializing {model.__class__.__name__} from the checkpoint of a model trained on another task " f"or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n" f"- This IS NOT expected if you are initializing {model.__class__.__name__} from the checkpoint of a model that you expect " f"to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." ) else: logger.info( f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n" ) if len(missing_keys) > 0: logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} " f"and are newly initialized: {missing_keys}\n" f"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference." ) else: logger.info( f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at {pretrained_model_name_or_path}.\n" f"If your task is similar to the task the model of the checkpoint was trained on, " f"you can already use {model.__class__.__name__} for predictions without further training." 
) # set correct parameters model.params = unflatten_dict(state) return model