Example no. 1
    def _add_tokens(self,
                    new_tokens: Union[List[str], List[AddedToken]],
                    special_tokens: bool = False) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from the length of the current vocabulary.

        Args:
            new_tokens (:obj:`List[str]` or :obj:`List[tokenizers.AddedToken]`):
                Token(s) to add to the vocabulary. A token is only added if it is not already in the vocabulary (tested
                by checking whether the tokenizer assigns the index of the ``unk_token`` to it).
            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the tokens should be added as special tokens.

        Returns:
            :obj:`int`: The number of tokens actually added to the vocabulary.

        Examples::

            # Let's see how to increase the vocabulary of Bert model and tokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            model = BertModel.from_pretrained('bert-base-uncased')

            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
            print('We have added', num_added_toks, 'tokens')
            # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
            model.resize_token_embeddings(len(tokenizer))
        """
        new_tokens = [str(tok) for tok in new_tokens]

        tokens_to_add = []
        for token in new_tokens:
            assert isinstance(token, str)
            if not special_tokens and hasattr(
                    self, "do_lower_case") and self.do_lower_case:
                token = token.lower()
            if (token != self.unk_token and self.convert_tokens_to_ids(token)
                    == self.convert_tokens_to_ids(self.unk_token)
                    and token not in tokens_to_add):
                tokens_to_add.append(token)
                if self.verbose:
                    logger.info("Adding %s to the vocabulary", token)

        added_tok_encoder = dict(
            (tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
        self.added_tokens_encoder.update(added_tok_encoder)
        self.added_tokens_decoder.update(added_tok_decoder)

        # Make sure we don't split on any special tokens (even if they were already in the vocab before, e.g. for Albert)
        if special_tokens:
            self.unique_no_split_tokens = sorted(
                set(self.unique_no_split_tokens).union(set(new_tokens)))
        else:
            # Or on the newly added tokens
            self.unique_no_split_tokens = sorted(
                set(self.unique_no_split_tokens).union(set(tokens_to_add)))

        return len(tokens_to_add)
Example no. 2
 def is_valid_mbart(self) -> bool:
     """Is the configuration aligned with the MBART paper."""
     if self.normalize_before and self.add_final_layer_norm and self.scale_embedding:
         return True
     if self.normalize_before or self.add_final_layer_norm or self.scale_embedding:
         logger.info(
             "This configuration is a mixture of MBART and BART settings")
     return False
Example no. 3
    def initialize_detector(self):
        t1 = time.time()
        if self.enable_rnnlm:
            self.lm = LM(self.rnnlm_model_dir, self.rnnlm_vocab_path)
            logger.debug('Loaded language model: %s, spend: %s s' %
                         (self.rnnlm_model_dir, str(time.time() - t1)))
        else:
            try:
                import kenlm
            except ImportError:
                raise ImportError(
                    'pycorrector dependencies are not fully installed; '
                    'kenlm is required for the statistical language model. '
                    'Please run "pip install kenlm" to install it (not supported on Windows). '
                    'On Windows, please install tensorflow and set enable_rnnlm=True.'
                )

            self.lm = kenlm.Model(self.language_model_path)
            logger.debug('Loaded language model: %s, spend: %s s' %
                         (self.language_model_path, str(time.time() - t1)))

        # word and frequency dict
        t2 = time.time()
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        t3 = time.time()
        logger.debug('Loaded word freq file: %s, size: %d, spend: %s s' %
                     (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # custom word-segmentation dict
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dict with the custom dicts
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(
                         self.custom_word_freq), str(t5 - t4)))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        t6 = time.time()
        logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
        self.initialized_detector = True
Example no. 4
 def infer(self, input_text):
     encoder_input_data = np.zeros((1, self.max_input_texts_len, len(self.input_token_index)),
                                   dtype='float32')
     # one hot representation
     for i, char in enumerate(input_text):
         if char in self.input_token_index:
             encoder_input_data[0, i, self.input_token_index[char]] = 1.0
     # Take one sequence decoding.
     decoded_sentence = self._decode_sequence(encoder_input_data)
     logger.info('Input sentence:%s' % input_text)
     logger.info('Decoded sentence:%s' % decoded_sentence)
Example no. 5
    def save_pretrained(self, save_directory_or_file):
        """Save a model card object to the directory or file `save_directory_or_file`."""
        if os.path.isdir(save_directory_or_file):
            # If we save using the predefined names, we can load using `from_pretrained`
            output_model_card_file = os.path.join(save_directory_or_file,
                                                  MODEL_CARD_NAME)
        else:
            output_model_card_file = save_directory_or_file

        self.to_json_file(output_model_card_file)
        logger.info("Model card saved in {}".format(output_model_card_file))
Example no. 6
 def set_custom_word(self, path):
     self.check_detector_initialized()
     word_freqs = self.load_word_freq_dict(path)
     # merge the dicts
     self.custom_word_freq.update(word_freqs)
     # merge the segmentation dict with the custom dict
     self.word_freq.update(self.custom_word_freq)
     self.tokenizer = Tokenizer(dict_path=self.word_freq_path, custom_word_freq_dict=self.custom_word_freq,
                                custom_confusion_dict=self.custom_confusion)
     for k, v in word_freqs.items():
         self.set_word_frequency(k, v)
     logger.info('Loaded custom word path: %s, size: %d' % (path, len(word_freqs)))
Example no. 7
def args_parse(config_file=''):
    parser = argparse.ArgumentParser(description="csc")
    parser.add_argument("--config_file",
                        default="train_macbert4csc.yml",
                        help="path to config file",
                        type=str)
    parser.add_argument(
        "--opts",
        help="Modify config options using the command-line key value",
        default=[],
        nargs=argparse.REMAINDER)

    args = parser.parse_args()

    config_file = args.config_file or config_file
    cfg.merge_from_file(config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    logger.info(args)

    if config_file != '':
        logger.info("Loaded configuration file {}".format(config_file))
        with open(config_file, 'r') as cf:
            config_str = "\n" + cf.read()
            logger.info(config_str)

    logger.info("Running with config:\n{}".format(cfg))
    return cfg
Example no. 8
def callback(save_model_path, logger):
    # Print the batch number at the beginning of every batch.
    batch_print_callback = LambdaCallback(
        on_batch_begin=lambda batch, logs: logger.info(' batch: %d' % batch))
    # define the checkpoint, save model
    checkpoint = ModelCheckpoint(save_model_path)
    early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=2)
    return [batch_print_callback, checkpoint, early_stop]
Example no. 9
def save_preds(preds, test_ids, X_test, ids_word_dict, label_ids_dict,
               ids_label_dict, out_path, test_words, test_labels):
    with open(out_path, 'w', encoding='utf-8') as f:
        for i in range(len(X_test)):
            sent_ids = X_test[i]
            sid = test_ids[i]
            sentence = test_words[i]
            gold_error = test_labels[i]
            label = []
            for j in range(len(sent_ids)):
                if sent_ids[j] != 0:
                    label.append(preds[i][j])
            print(label)
            continue_error = False
            has_error = False
            current_error = 0
            start_pos = 0
            for k in range(len(label)):
                error_label_id = is_error_label_id(label[k], label_ids_dict)
                if error_label_id and not continue_error:
                    continue_error = True
                    start_pos = k + 1
                    current_error = label[k]
                    has_error = True
                if continue_error and label[
                        k] != current_error and not error_label_id:
                    end_pos = k
                    f.write(
                        '%s\t%d\t%d\t%s\t%s\t%s\n' %
                        (sid, start_pos, end_pos,
                         ids_label_dict[current_error], sentence, gold_error))
                    continue_error = False
                    current_error = 0
                if continue_error and label[
                        k] != current_error and error_label_id:
                    end_pos = k
                    f.write(
                        '%s\t%d\t%d\t%s\t%s\t%s\n' %
                        (sid, start_pos, end_pos,
                         ids_label_dict[current_error], sentence, gold_error))
                    start_pos = k + 1
                    current_error = label[k]
            if not has_error:
                f.write('%s\tcorrect\t%s\t%s\n' % (sid, sentence, gold_error))
        logger.info('save to %s done, data size: %d' % (out_path, len(X_test)))
Example no. 10
    def __init__(self,
                 vocab_path='',
                 model_path='',
                 src_seq_lens=128,
                 trg_seq_lens=128,
                 beam_size=5,
                 batch_size=1,
                 gpu_id=0):
        use_gpu = False
        if gpu_id > -1:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
            if torch.cuda.is_available():
                device = torch.device('cuda')
                use_gpu = True
            else:
                device = torch.device('cpu')
        else:
            device = torch.device('cpu')
        print('device:', device)
        # load vocab
        self.vocab2id = load_word_dict(vocab_path)
        self.id2vocab = {v: k for k, v in self.vocab2id.items()}
        logger.debug('Loaded vocabulary file:%s, size: %s' %
                     (vocab_path, len(self.vocab2id)))

        # load model
        start_time = time.time()
        self.model = self._create_model(self.vocab2id, device)
        if use_gpu:
            self.model.load_state_dict(torch.load(model_path))
        else:
            # load all tensors onto the CPU
            self.model.load_state_dict(
                torch.load(model_path,
                           map_location=lambda storage, loc: storage))
        logger.info("Loaded model:%s, spend:%s s" %
                    (model_path, time.time() - start_time))

        self.model.eval()
        self.src_seq_lens = src_seq_lens
        self.trg_seq_lens = trg_seq_lens
        self.beam_size = beam_size
        self.batch_size = batch_size
        self.device = device
Example no. 11
    def from_encoder_decoder_configs(cls, encoder_config: PretrainedConfig,
                                     decoder_config: PretrainedConfig,
                                     **kwargs) -> PretrainedConfig:
        r"""
        Instantiate a :class:`~transformers.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model
        configuration and decoder model configuration.

        Returns:
            :class:`EncoderDecoderConfig`: An instance of a configuration object
        """
        logger.info(
            "Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config"
        )
        decoder_config.is_decoder = True
        decoder_config.add_cross_attention = True

        return cls(encoder=encoder_config.to_dict(),
                   decoder=decoder_config.to_dict(),
                   **kwargs)
Example no. 12
    def __init__(self, tokenizer, args, data, mode):
        self.tokenizer = tokenizer

        cached_features_file = os.path.join(
            args.cache_dir, args.model_name + "_cached_" +
            str(args.max_seq_length) + str(len(data)))

        if os.path.exists(cached_features_file) and (
            (not args.reprocess_input_data and not args.no_cache) or
            (mode == "dev" and args.use_cached_eval_features
             and not args.no_cache)):
            logger.info(" Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info(" Creating features from dataset file at %s",
                        args.cache_dir)

            data = [(input_text, target_text, tokenizer, args)
                    for input_text, target_text in zip(data["input_text"],
                                                       data["target_text"])]

            preprocess_fn = preprocess_data_mbart if args.model_type == "mbart" else preprocess_data_bart

            if (mode == "train" and args.use_multiprocessing) or (
                    mode == "dev" and args.use_multiprocessing_for_evaluation):
                if args.multiprocessing_chunksize == -1:
                    chunksize = max(len(data) // (args.process_count * 2), 500)
                else:
                    chunksize = args.multiprocessing_chunksize

                with Pool(args.process_count) as p:
                    self.examples = list(
                        tqdm(
                            p.imap(preprocess_fn, data, chunksize=chunksize),
                            total=len(data),
                            disable=args.silent,
                        ))
            else:
                self.examples = [
                    preprocess_fn(d) for d in tqdm(data, disable=args.silent)
                ]
Example no. 13
    def save_pretrained(self, save_directory: Union[str, os.PathLike]):
        """
        Save a configuration object to the directory ``save_directory``, so that it can be re-loaded using the
        :func:`~transformers.PretrainedConfig.from_pretrained` class method.

        Args:
            save_directory (:obj:`str` or :obj:`os.PathLike`):
                Directory where the configuration JSON file will be saved (will be created if it does not exist).
        """
        if os.path.isfile(save_directory):
            raise AssertionError(
                "Provided path ({}) should be a directory, not a file".format(
                    save_directory))
        os.makedirs(save_directory, exist_ok=True)
        # If we save using the predefined names, we can load using `from_pretrained`
        output_config_file = os.path.join(save_directory, CONFIG_NAME)

        self.to_json_file(output_config_file, use_diff=True)
        logger.info("Configuration saved in {}".format(output_config_file))
Example no. 14
    def initialize_detector(self):
        t1 = time.time()
        self.lm = kenlm.Model(self.language_model_path)
        t2 = time.time()
        logger.debug('Loaded language model: %s, spend: %s s' %
                     (self.language_model_path, str(t2 - t1)))
        # word and frequency dict
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        t3 = time.time()
        logger.debug('Loaded word freq file: %s, size: %d, spend: %s s' %
                     (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # custom word-segmentation dict
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dict with the custom dicts
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(
                         self.custom_word_freq), str(t5 - t4)))
        logger.debug('Loaded all word freq file done, size: %d' %
                     len(self.word_freq))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        t6 = time.time()
        logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
        self.initialized_detector = True
Example no. 15
    def from_dict(cls, config_dict: Dict[str, Any],
                  **kwargs) -> "PretrainedConfig":
        """
        Instantiates a :class:`~transformers.PretrainedConfig` from a Python dictionary of parameters.

        Args:
            config_dict (:obj:`Dict[str, Any]`):
                Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
                retrieved from a pretrained checkpoint by leveraging the
                :func:`~transformers.PretrainedConfig.get_config_dict` method.
            kwargs (:obj:`Dict[str, Any]`):
                Additional parameters from which to initialize the configuration object.

        Returns:
            :class:`PretrainedConfig`: The configuration object instantiated from those parameters.
        """
        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

        config = cls(**config_dict)

        if hasattr(config, "pruned_heads"):
            config.pruned_heads = dict(
                (int(key), value)
                for key, value in config.pruned_heads.items())

        # Update config with kwargs if needed
        to_remove = []
        for key, value in kwargs.items():
            if hasattr(config, key):
                setattr(config, key, value)
                to_remove.append(key)
        for key in to_remove:
            kwargs.pop(key, None)

        logger.info("Model config %s", str(config))
        if return_unused_kwargs:
            return config, kwargs
        else:
            return config
Example no. 16
    def __init__(self, config=None):
        train_path = config.train_path
        encoder_model_path = config.encoder_model_path
        decoder_model_path = config.decoder_model_path
        save_input_token_path = config.input_vocab_path
        save_target_token_path = config.target_vocab_path

        # load dict
        self.input_token_index = load_word_dict(save_input_token_path)
        self.target_token_index = load_word_dict(save_target_token_path)

        data_reader = CGEDReader(train_path)
        input_texts, target_texts = data_reader.build_dataset(train_path)
        self.max_input_texts_len = max([len(text) for text in input_texts])
        self.max_target_texts_len = max([len(text) for text in target_texts])
        logger.info("Data loaded.")

        # load model
        self.encoder_model = load_model(encoder_model_path)
        self.decoder_model = load_model(decoder_model_path)
        logger.info("Loaded seq2seq model.")
        self.graph = tf.get_default_graph()
Example no. 17
def train(train_word_path=None,
          train_label_path=None,
          word_dict_path=None,
          label_dict_path=None,
          save_model_path=None,
          batch_size=64,
          dropout=0.5,
          epoch=10,
          embedding_dim=100,
          rnn_hidden_dim=200,
          maxlen=300,
          cutoff_frequency=0):
    """
    Train the bilstm_crf model for grammar correction.
    """
    # build the word dictionary
    build_dict(train_word_path,
               word_dict_path,
               cutoff_frequency,
               insert_extra_words=[UNK_TOKEN, PAD_TOKEN])
    # build the label dictionary
    build_dict(train_label_path, label_dict_path)
    # load dict
    word_ids_dict = load_dict(word_dict_path)
    label_ids_dict = load_dict(label_dict_path)
    # read data to index
    word_ids = vectorize_data(train_word_path, word_ids_dict)
    label_ids = vectorize_data(train_label_path, label_ids_dict)
    max_len = np.max([len(i) for i in word_ids])
    print('max_len:', max_len)
    # pad sequence
    word_seq = pad_sequence(word_ids, maxlen=maxlen)
    label_seq = pad_sequence(label_ids, maxlen=maxlen)
    # reshape label for crf model use
    label_seq = np.reshape(label_seq,
                           (label_seq.shape[0], label_seq.shape[1], 1))
    print(word_seq.shape)
    print(label_seq.shape)
    logger.info("Data loaded.")
    # model
    logger.info("Training BILSTM_CRF model...")
    model = create_model(word_ids_dict, label_ids_dict, embedding_dim,
                         rnn_hidden_dim, dropout)
    # callback
    callbacks_list = callback(save_model_path, logger)
    # fit
    model.fit(word_seq,
              label_seq,
              batch_size=batch_size,
              epochs=epoch,
              validation_split=0.2,
              callbacks=callbacks_list)
    logger.info("Training has finished.")
Example no. 18
 def validation_epoch_end(self, outputs) -> None:
     det_acc_labels = []
     cor_acc_labels = []
     results = []
     for out in outputs:
         det_acc_labels += out[1]
         cor_acc_labels += out[2]
         results += out[3]
     loss = np.mean([out[0] for out in outputs])
     self.log('val_loss', loss)
     logger.info(f'loss: {loss}')
     logger.info(f'Detection:\n' f'acc: {np.mean(det_acc_labels):.4f}')
     logger.info(f'Correction:\n' f'acc: {np.mean(cor_acc_labels):.4f}')
     compute_corrector_prf(results, logger)
     compute_sentence_level_prf(results, logger)
Example no. 19
    def __init__(self, encoder_tokenizer, decoder_tokenizer, args, data, mode):
        cached_features_file = os.path.join(
            args.cache_dir,
            args.model_name.replace("/", "_") + "_cached_" +
            str(args.max_seq_length) + str(len(data)))

        if os.path.exists(cached_features_file) and (
            (not args.reprocess_input_data and not args.no_cache) or
            (mode == "dev" and args.use_cached_eval_features
             and not args.no_cache)):
            logger.info(" Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info(" Creating features from dataset file at %s",
                        args.cache_dir)

            data = [(input_text, target_text, encoder_tokenizer,
                     decoder_tokenizer, args) for input_text, target_text in
                    zip(data["input_text"], data["target_text"])]

            if (mode == "train" and args.use_multiprocessing) or (
                    mode == "dev" and args.use_multiprocessing_for_evaluation):
                if args.multiprocessing_chunksize == -1:
                    chunksize = max(len(data) // (args.process_count * 2), 500)
                else:
                    chunksize = args.multiprocessing_chunksize

                with Pool(args.process_count) as p:
                    self.examples = list(
                        tqdm(
                            p.imap(preprocess_data, data, chunksize=chunksize),
                            total=len(data),
                            disable=args.silent,
                        ))
            else:
                self.examples = [
                    preprocess_data(d) for d in tqdm(data, disable=args.silent)
                ]

            if not args.no_cache:
                logger.info(" Saving features into cached file %s",
                            cached_features_file)
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
Example no. 20
    def get_config_dict(cls, pretrained_model_name_or_path: Union[str,
                                                                  os.PathLike],
                        **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """
        From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a
        :class:`~transformers.PretrainedConfig` using ``from_dict``.

        Parameters:
            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.

        Returns:
            :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.

        """
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        use_auth_token = kwargs.pop("use_auth_token", None)
        local_files_only = kwargs.pop("local_files_only", False)
        revision = kwargs.pop("revision", None)

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        if os.path.isdir(pretrained_model_name_or_path):
            config_file = os.path.join(pretrained_model_name_or_path,
                                       CONFIG_NAME)
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(
                pretrained_model_name_or_path):
            config_file = pretrained_model_name_or_path
        else:
            config_file = hf_bucket_url(pretrained_model_name_or_path,
                                        filename=CONFIG_NAME,
                                        revision=revision,
                                        mirror=None)

        try:
            # Load from URL or cache if already cached
            resolved_config_file = cached_path(
                config_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
                use_auth_token=use_auth_token,
            )
            # Load config dict
            config_dict = cls._dict_from_json_file(resolved_config_file)

        except EnvironmentError as err:
            logger.error(err)
            msg = (
                f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n"
            )
            raise EnvironmentError(msg)

        except json.JSONDecodeError:
            msg = (
                "Couldn't reach server at '{}' to download configuration file or "
                "configuration file is not a valid JSON file. "
                "Please check network or file content here: {}.".format(
                    config_file, resolved_config_file))
            raise EnvironmentError(msg)

        if resolved_config_file == config_file:
            logger.info("loading configuration file {}".format(config_file))
        else:
            logger.info(
                "loading configuration file {} from cache at {}".format(
                    config_file, resolved_config_file))

        return config_dict, kwargs
Example no. 21
 def set_custom_confusion_dict(self, path):
     self.check_detector_initialized()
     custom_confusion = self._get_custom_confusion_dict(path)
     self.custom_confusion.update(custom_confusion)
     logger.info('Loaded confusion path: %s, size: %d' % (path, len(custom_confusion)))
Example no. 22
 def set_language_model_path(self, path):
     self.check_detector_initialized()
     import kenlm
     self.lm = kenlm.Model(path)
     logger.info('Loaded language model: %s' % path)
Example no. 23
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--bert_model_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="Bert pre-trained model config dir")
    parser.add_argument("--bert_model_vocab",
                        default=None,
                        type=str,
                        required=True,
                        help="Bert pre-trained model vocab path")
    parser.add_argument(
        "--output_dir",
        default="./output",
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    # Other parameters
    parser.add_argument("--predict_file",
                        default=None,
                        type=str,
                        help="for predictions.")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    args = parser.parse_args()

    device = torch.device("cpu")
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer(args.bert_model_vocab)

    # Prepare model
    model = BertForMaskedLM.from_pretrained(args.bert_model_dir)

    # Save a trained model
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if not os.path.exists(output_model_file):
        torch.save(model_to_save.state_dict(), output_model_file)

    # Load the trained model weights back into the model
    model_state_dict = torch.load(output_model_file)
    model.load_state_dict(model_state_dict)
    model.to(device)

    # Tokenized input
    text = "吸 烟 的 人 容 易 得 癌 症"
    print(text)
    tokenized_text = tokenizer.tokenize(text)

    # Mask a token that we will try to predict back with `BertForMaskedLM`
    masked_index = 8
    tokenized_text[masked_index] = '[MASK]'
    print(tokenized_text)

    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Segment ids: all tokens belong to the first (and only) sentence
    segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 0]

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    # Set the model to evaluation mode
    model.eval()

    # Predict all tokens
    predictions = model(tokens_tensor, segments_tensors)

    # check the predicted token for the masked position
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    print(predicted_index)
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print(predicted_token)
    # infer one line end

    if args.predict_file:
        eval_examples = read_lm_examples(input_file=args.predict_file)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("Start predict ...")
        for f in eval_features:
            input_ids = torch.tensor([f.input_ids])
            segment_ids = torch.tensor([f.segment_ids])
            predictions = model(input_ids, segment_ids)
            # check the predicted tokens for the masked positions
            masked_ids = f.mask_ids
            if masked_ids:
                print(masked_ids)
                for idx, i in enumerate(masked_ids):
                    predicted_index = torch.argmax(predictions[0, i]).item()
                    predicted_token = tokenizer.convert_ids_to_tokens(
                        [predicted_index])[0]
                    print('original text is:', f.input_tokens)
                    print('Mask predict is:', predicted_token)
Example no. 24
def convert_examples_to_features(examples, tokenizer, max_seq_length):
    """Loads a data file into a list of `InputBatch`s."""

    features = []
    for (example_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # The -3 accounts for [CLS], [SEP] and [SEP]
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # "-2" is [CLS] and [SEP]
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0     0     0      0   0    1  1  1  1  1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0      0   0   0  0    0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens_a = [i.replace('*', MASK_TOKEN) for i in tokens_a]
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens_b = [i.replace('*', MASK_TOKEN) for i in tokens_b]
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        mask_ids = [i for i, v in enumerate(input_ids) if v == MASK_ID]
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if example_index < 5:
            logger.info("*** Example ***")
            logger.info("example_index: %s" % (example_index))
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x)
                                                    for x in input_ids]))
            logger.info("input_mask: %s" %
                        " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" %
                        " ".join([str(x) for x in segment_ids]))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          mask_ids=mask_ids,
                          segment_ids=segment_ids,
                          input_tokens=tokens))
    return features
Example no. 25
def train(train_path=config.train_path,
          output_dir=config.output_dir,
          save_model_dir=config.save_model_dir,
          vocab_path=config.vocab_path,
          val_path=config.val_path,
          vocab_max_size=config.vocab_max_size,
          vocab_min_count=config.vocab_min_count,
          batch_size=config.batch_size,
          epochs=config.epochs,
          learning_rate=0.0001,
          src_emb_dim=128,
          trg_emb_dim=128,
          src_hidden_dim=256,
          trg_hidden_dim=256,
          src_num_layers=1,
          batch_first=True,
          src_bidirection=True,
          dropout=0.0,
          attn_method='luong_concat',
          repetition='vanilla',
          network='lstm',
          pointer_net=True,
          attn_decoder=True,
          shared_embedding=True,
          share_emb_weight=True,
          src_seq_lens=128,
          trg_seq_lens=128,
          grad_clip=2.0,
          save_model_batch_num=config.save_model_batch_num,
          gpu_id=config.gpu_id):
    print('Training model...')

    if gpu_id > -1:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
        if torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
    else:
        device = torch.device('cpu')
    print('device:', device)
    source_texts, target_texts = build_dataset(train_path)
    vocab2id = read_vocab(source_texts + target_texts, max_size=vocab_max_size, min_count=vocab_min_count)
    num_encoder_tokens = len(vocab2id)
    max_input_texts_len = max([len(text) for text in source_texts])

    print('source_texts:', source_texts[0])
    print('target_texts:', target_texts[0])
    print('num of samples:', len(source_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('max sequence length for inputs:', max_input_texts_len)

    id2vocab = {v: k for k, v in vocab2id.items()}
    # save word dict
    save_word_dict(vocab2id, vocab_path)
    print('The vocabulary file:%s, size: %s' % (vocab_path, len(vocab2id)))

    model = Seq2Seq(
        src_emb_dim=src_emb_dim,
        trg_emb_dim=trg_emb_dim,
        src_hidden_dim=src_hidden_dim,
        trg_hidden_dim=trg_hidden_dim,
        src_vocab_size=len(vocab2id),
        trg_vocab_size=len(vocab2id),
        src_nlayer=src_num_layers,
        batch_first=batch_first,
        src_bidirect=src_bidirection,
        dropout=dropout,
        attn_method=attn_method,
        repetition=repetition,
        network=network,
        pointer_net=pointer_net,
        shared_emb=shared_embedding,
        attn_decoder=attn_decoder,
        share_emb_weight=share_emb_weight,
        device=device
    ).to(device)
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # read the last check point and continue training
    uf_model = [0, -1]
    if not os.path.exists(save_model_dir):
        os.mkdir(save_model_dir)
    model_para_files = glob.glob(os.path.join(save_model_dir, '*.model'))
    if len(model_para_files) > 0:
        uf_model = []
        for fl_ in model_para_files:
            arr = re.split(r'/', fl_)[-1]
            arr = re.split(r'[_.]', arr)
            uf_model.append([int(arr[1]), int(arr[2])])
        uf_model = sorted(uf_model)[-1]
        fl_ = os.path.join(save_model_dir, 'seq2seq_' + str(uf_model[0]) + '_' + str(uf_model[1]) + '.model')
        model.load_state_dict(torch.load(fl_))

    # train models
    losses = []
    start_time = time.time()
    last_model_path = ''
    model.train()
    for epoch in range(uf_model[0], epochs):
        n_batch = create_batch_file(output_dir, file_type='train', file_path=train_path, batch_size=batch_size)
        print('The number of batches: {}'.format(n_batch))
        for batch_id in range(n_batch):
            ext_id2oov, src_arr, trg_input_arr, src_arr_ex, trg_output_arr_ex = process_minibatch_explicit(
                batch_id=batch_id,
                output_dir=output_dir,
                file_type='train',
                batch_size=batch_size,
                vocab2id=vocab2id,
                max_lens=[src_seq_lens, trg_seq_lens])

            src_var = Variable(torch.LongTensor(src_arr))
            trg_input_var = Variable(torch.LongTensor(trg_input_arr))
            # extend oov
            src_var_ex = Variable(torch.LongTensor(src_arr_ex))
            trg_output_var_ex = Variable(torch.LongTensor(trg_output_arr_ex))

            src_var = src_var.to(device)
            trg_input_var = trg_input_var.to(device)
            src_var_ex = src_var_ex.to(device)
            trg_output_var_ex = trg_output_var_ex.to(device)

            weight_mask = torch.ones(len(vocab2id) + len(ext_id2oov)).to(device)
            weight_mask[vocab2id[PAD_TOKEN]] = 0
            loss_criterion = torch.nn.NLLLoss(weight=weight_mask).to(device)

            logits, attn_, p_gen, loss_cv = model(src_var, trg_input_var)
            logits = torch.softmax(logits, dim=2)
            # use the pointer generator loss
            if len(ext_id2oov) > 0:
                logits = model.cal_dist_explicit(src_var_ex, logits, attn_, p_gen, vocab2id, ext_id2oov)
                logits = logits + 1e-20
            else:
                logits = model.cal_dist(src_var, logits, attn_, p_gen, vocab2id)

            if batch_id % 1 == 0:
                word_prob = logits.topk(1, dim=2)[1].squeeze(2).data.cpu().numpy()

            logits = torch.log(logits)
            loss = loss_criterion(
                logits.contiguous().view(-1, len(vocab2id) + len(ext_id2oov)),
                trg_output_var_ex.view(-1))

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()

            end_time = time.time()
            losses.append([
                epoch, batch_id,
                loss.data.cpu().numpy(),
                loss_cv.data.cpu().numpy()[0],
                (end_time - start_time) / 3600.0])

            if batch_id % save_model_batch_num == 0:
                model_path = os.path.join(save_model_dir, 'seq2seq_' + str(epoch) + '_' + str(batch_id) + '.model')
                with open(model_path, 'wb') as f:
                    torch.save(model.state_dict(), f)
                    logger.info("Model save to " + model_path)

            if batch_id % 1 == 0:
                end_time = time.time()
                sen_pred = [id2vocab[x] if x in id2vocab else ext_id2oov[x] for x in word_prob[0]]
                print('epoch={}, batch={}, loss={}, loss_cv={}, time_escape={}s={}h'.format(
                    epoch,
                    batch_id,
                    loss.data.cpu().numpy(),
                    loss_cv.data.cpu().numpy()[0],
                    end_time - start_time, (end_time - start_time) / 3600.0
                ))
                print(' '.join(sen_pred))
            del logits, attn_, p_gen, loss_cv, loss

        with open(os.path.join(save_model_dir, 'loss.txt'), 'a', encoding='utf-8') as f:
            for i in losses:
                f.write(str(i) + '\n')
        model_path = os.path.join(save_model_dir, 'seq2seq_' + str(epoch) + '_' + str(batch_id) + '.model')
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
            logger.info("Model save to " + model_path)
            last_model_path = model_path
    logger.info("Training has finished.")

    # Eval model
    eval(model, last_model_path, val_path, output_dir, batch_size, vocab2id, src_seq_lens, trg_seq_lens, device)
    logger.info("Eval has finished.")
Example no. 26
 def test_epoch_end(self, outputs) -> None:
     logger.info('Test.')
     self.validation_epoch_end(outputs)
Example no. 27
import os
from urllib.parse import urlparse
from zipfile import ZipFile, is_zipfile

import numpy as np
import requests
from filelock import FileLock
from tqdm.auto import tqdm
from pycorrector.utils.logger import logger


ENV_VARS_TRUE_VALUES = {"1", "ON", "YES"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
import torch

_torch_available = True  # pylint: disable=invalid-name
logger.info("PyTorch version {} available.".format(torch.__version__))
_tf_available = False

try:
    USE_JAX = os.environ.get("USE_FLAX", "AUTO").upper()

    if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
        import flax
        import jax

        logger.info("JAX version {}, Flax: available".format(jax.__version__))
        logger.info("Flax available: {}".format(flax))
        _flax_available = True
    else:
        _flax_available = False
except ImportError:
    _flax_available = False
Example no. 28
def get_from_cache(
        url: str,
        cache_dir=None,
        force_download=False,
        proxies=None,
        etag_timeout=10,
        resume_download=False,
        user_agent: Union[Dict, str, None] = None,
        use_auth_token: Union[bool, str, None] = None,
        local_files_only=False,
) -> Optional[str]:
    """
    Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the
    path to the cached file.

    Return:
        Local path (string) of file or if networking is off, last version of file cached on disk.

    Raises:
        In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
    """
    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    os.makedirs(cache_dir, exist_ok=True)

    headers = {"user-agent": http_user_agent(user_agent)}
    if isinstance(use_auth_token, str):
        headers["authorization"] = "Bearer {}".format(use_auth_token)

    url_to_download = url
    etag = None
    if not local_files_only:
        try:
            r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout)
            r.raise_for_status()
            etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
            # We favor a custom header indicating the etag of the linked resource, and
            # we fallback to the regular etag header.
            # If we don't have any of those, raise an error.
            if etag is None:
                raise OSError(
                    "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
                )
            # In case of a redirect,
            # save an extra redirect on the request.get call,
            # and ensure we download the exact atomic version even if it changed
            # between the HEAD and the GET (unlikely, but hey).
            if 300 <= r.status_code <= 399:
                url_to_download = r.headers["Location"]
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # etag is already None
            pass

    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    # etag is None == we don't have a connection or we passed local_files_only.
    # try to get the last downloaded one
    if etag is None:
        if os.path.exists(cache_path):
            return cache_path
        else:
            matching_files = [
                file
                for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*")
                if not file.endswith(".json") and not file.endswith(".lock")
            ]
            if len(matching_files) > 0:
                return os.path.join(cache_dir, matching_files[-1])
            else:
                # If files cannot be found and local_files_only=True,
                # the models might've been found if local_files_only=False
                # Notify the user about that
                if local_files_only:
                    raise ValueError(
                        "Cannot find the requested files in the cached path and outgoing traffic has been"
                        " disabled. To enable model look-ups and downloads online, set 'local_files_only'"
                        " to False."
                    )
                else:
                    raise ValueError(
                        "Connection error, and we cannot find the requested files in the cached path."
                        " Please try again or make sure your Internet connection is on."
                    )

    # From now on, etag is not None.
    if os.path.exists(cache_path) and not force_download:
        return cache_path

    # Prevent parallel downloads of the same file with a lock.
    lock_path = cache_path + ".lock"
    with FileLock(lock_path):

        # If the download just completed while the lock was activated.
        if os.path.exists(cache_path) and not force_download:
            # Even if returning early like here, the lock will be released.
            return cache_path

        if resume_download:
            incomplete_path = cache_path + ".incomplete"

            @contextmanager
            def _resumable_file_manager() -> "io.BufferedWriter":
                with open(incomplete_path, "ab") as f:
                    yield f

            temp_file_manager = _resumable_file_manager
            if os.path.exists(incomplete_path):
                resume_size = os.stat(incomplete_path).st_size
            else:
                resume_size = 0
        else:
            temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False)
            resume_size = 0

        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with temp_file_manager() as temp_file:
            logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)

            http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, headers=headers)

        logger.info("storing %s in cache at %s", url, cache_path)
        os.replace(temp_file.name, cache_path)

        logger.info("creating metadata file for %s", cache_path)
        meta = {"url": url, "etag": etag}
        meta_path = cache_path + ".json"
        with open(meta_path, "w") as meta_file:
            json.dump(meta, meta_file)

    return cache_path
Example no. 29
def evaluate(encoder_model, decoder_model, num_encoder_tokens,
             num_decoder_tokens, rnn_hidden_dim, target_token_index,
             max_decoder_seq_length, encoder_input_data, input_texts):
    # Define an input sequence and process it.
    encoder_inputs = Input(shape=(None, num_encoder_tokens))
    encoder = LSTM(rnn_hidden_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    # We discard `encoder_outputs` and only keep the states.
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(None, num_decoder_tokens))
    # We set up our decoder to return full output sequences,
    # and to return internal states as well. We don't use the
    # return states in the training model, but we will use them in inference.
    decoder_lstm = LSTM(rnn_hidden_dim,
                        return_sequences=True,
                        return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                         initial_state=encoder_states)
    # Reverse-lookup token index to decode sequences back to
    # something readable.
    reverse_target_char_index = dict(
        (i, char) for char, i in target_token_index.items())

    def decode_seq(input_seq):
        decoded_sentence = ''
        # Encode the input as state vectors.
        states_value = encoder_model.predict(input_seq)

        # Generate empty target sequence of length 1.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        # Populate the first character of target sequence with the start character.
        first_char = GO_TOKEN
        target_seq[0, 0, target_token_index[first_char]] = 1.

        # Sampling loop for a batch of sequences
        # (to simplify, here we assume a batch of size 1).
        stop_condition = False
        while not stop_condition:
            output_tokens, h, c = decoder_model.predict([target_seq] +
                                                        states_value)

            # Sample a token
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_char = reverse_target_char_index[sampled_token_index]
            if sampled_char != EOS_TOKEN:
                decoded_sentence += sampled_char

            # Exit condition: either hit max length
            # or find stop character.
            if (sampled_char == EOS_TOKEN
                    or len(decoded_sentence) > max_decoder_seq_length):
                stop_condition = True

            # Update the target sequence (of length 1).
            target_seq = np.zeros((1, 1, num_decoder_tokens))
            target_seq[0, 0, sampled_token_index] = 1.

            # Update states
            states_value = [h, c]

        return decoded_sentence

    for seq_index in range(10):
        # Take one sequence (part of the training set)
        # for trying out decoding.
        input_seq = encoder_input_data[seq_index:seq_index + 1]
        output_text = decode_seq(input_seq)

        logger.info('Input sentence: %s', input_texts[seq_index])
        logger.info('Decoded sentence: %s', output_text)
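
``evaluate`` expects prebuilt inference models. A hedged sketch, following the standard Keras character-level seq2seq recipe, of how ``encoder_model`` and ``decoder_model`` are typically assembled; names and wiring are assumptions, and in practice the layers must reuse the trained weights rather than being created fresh:

from keras.layers import Input, LSTM, Dense
from keras.models import Model


def build_inference_models(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim):
    # Encoder: map an input sequence to its final LSTM states.
    encoder_inputs = Input(shape=(None, num_encoder_tokens))
    _, state_h, state_c = LSTM(rnn_hidden_dim, return_state=True)(encoder_inputs)
    encoder_model = Model(encoder_inputs, [state_h, state_c])

    # Decoder: one step at a time, driven by the previous states.
    decoder_inputs = Input(shape=(None, num_decoder_tokens))
    decoder_state_h = Input(shape=(rnn_hidden_dim,))
    decoder_state_c = Input(shape=(rnn_hidden_dim,))
    decoder_lstm = LSTM(rnn_hidden_dim, return_sequences=True, return_state=True)
    decoder_outputs, h, c = decoder_lstm(
        decoder_inputs, initial_state=[decoder_state_h, decoder_state_c])
    decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(decoder_outputs)
    decoder_model = Model(
        [decoder_inputs, decoder_state_h, decoder_state_c],
        [decoder_outputs, h, c])
    return encoder_model, decoder_model

The returned models match the calling convention used in ``decode_seq``: ``encoder_model.predict`` yields the two state arrays, and ``decoder_model.predict`` takes ``[target_seq, state_h, state_c]`` and returns the token probabilities plus updated states.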
Example n. 30
    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path: Union[str, os.PathLike],
                        dtype: jnp.dtype = jnp.float32,
                        *model_args,
                        **kwargs):
        r"""
        Instantiate a pretrained flax model from a pre-trained model configuration.

        The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
        task.

        The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those
        weights are discarded.

        Parameters:
            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                Can be either:

                    - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
                    - A path to a `directory` containing model weights saved using
                      :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
                    - A path or url to a PyTorch checkpoint file (e.g., ``./pt_model/pytorch_model.bin``). In this
                      case, ``from_pt`` should be set to :obj:`True`.
            model_args (sequence of positional arguments, `optional`):
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
            config (:obj:`Union[PretrainedConfig, str, os.PathLike]`, `optional`):
                Can be either:

                    - an instance of a class derived from :class:`~transformers.PretrainedConfig`,
                    - a string or path valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`.

                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                be automatically loaded when:

                    - The model is a model provided by the library (loaded with the `model id` string of a pretrained
                      model).
                    - The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
                      by supplying the save directory.
                    - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
                      configuration JSON file named `config.json` is found in the directory.
            cache_dir (:obj:`Union[str, os.PathLike]`, `optional`):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            from_pt (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Load the model weights from a PyTorch checkpoint save file (see docstring of
                ``pretrained_model_name_or_path`` argument).
            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                file exists.
            proxies (:obj:`Dict[str, str]`, `optional`):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to only look at local files (i.e., do not try to download the model).
            revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
                identifier allowed by git.
            kwargs (remaining dictionary of keyword arguments, `optional`):
                Can be used to update the configuration object (after it is loaded) and initialize the model (e.g.,
                :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
                automatically loaded:

                    - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
                      underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
                      already been done)
                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
                      initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
                      ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
                      with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
                      attribute will be passed to the underlying model's ``__init__`` function.

        Examples::

            >>> from transformers import BertConfig, FlaxBertModel
            >>> # Download model and configuration from huggingface.co and cache.
            >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
            >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
            >>> model = FlaxBertModel.from_pretrained('./test/saved_model/')
            >>> # Loading from a PyTorch checkpoint file instead of a Flax model (slower, for example purposes, not runnable).
            >>> config = BertConfig.from_json_file('./pt_model/config.json')
            >>> model = FlaxBertModel.from_pretrained('./pt_model/pytorch_model.bin', from_pt=True, config=config)
        """
        config = kwargs.pop("config", None)
        cache_dir = kwargs.pop("cache_dir", None)
        from_pt = kwargs.pop("from_pt", False)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", False)
        use_auth_token = kwargs.pop("use_auth_token", None)
        revision = kwargs.pop("revision", None)

        # Load config if we don't provide a configuration
        if not isinstance(config, PretrainedConfig):
            config_path = config if config is not None else pretrained_model_name_or_path
            config, model_kwargs = cls.config_class.from_pretrained(
                config_path,
                *model_args,
                cache_dir=cache_dir,
                return_unused_kwargs=True,
                force_download=force_download,
                resume_download=resume_download,
                proxies=proxies,
                local_files_only=local_files_only,
                use_auth_token=use_auth_token,
                revision=revision,
                **kwargs,
            )
        else:
            model_kwargs = kwargs

        # Add the dtype to model_kwargs
        model_kwargs["dtype"] = dtype

        # Load model
        if pretrained_model_name_or_path is not None:
            if os.path.isdir(pretrained_model_name_or_path):
                if from_pt and os.path.isfile(
                        os.path.join(pretrained_model_name_or_path,
                                     WEIGHTS_NAME)):
                    # Load from a PyTorch checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path,
                                                WEIGHTS_NAME)
                elif os.path.isfile(
                        os.path.join(pretrained_model_name_or_path,
                                     FLAX_WEIGHTS_NAME)):
                    # Load from a Flax checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path,
                                                FLAX_WEIGHTS_NAME)
                else:
                    raise EnvironmentError(
                        f"Error: no file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME} found in "
                        f"directory {pretrained_model_name_or_path}, or `from_pt` is set to False."
                    )
            elif os.path.isfile(
                    pretrained_model_name_or_path) or is_remote_url(
                        pretrained_model_name_or_path):
                archive_file = pretrained_model_name_or_path
            else:
                archive_file = hf_bucket_url(
                    pretrained_model_name_or_path,
                    filename=WEIGHTS_NAME if from_pt else FLAX_WEIGHTS_NAME,
                    revision=revision,
                )

            # redirect to the cache, if necessary
            try:
                resolved_archive_file = cached_path(
                    archive_file,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                    use_auth_token=use_auth_token,
                )
            except EnvironmentError as err:
                logger.error(err)
                msg = (
                    f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                    f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                    f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named "
                    f"{WEIGHTS_NAME if from_pt else FLAX_WEIGHTS_NAME}.\n\n"
                )
                raise EnvironmentError(msg)

            if resolved_archive_file == archive_file:
                logger.info(f"loading weights file {archive_file}")
            else:
                logger.info(
                    f"loading weights file {archive_file} from cache at {resolved_archive_file}"
                )
        else:
            resolved_archive_file = None

        # Instantiate model.
        with open(resolved_archive_file, "rb") as state_f:
            try:
                if from_pt:
                    import torch

                    state = torch.load(state_f)

                    state = convert_state_dict_from_pt(cls, state, config)
                else:
                    state = from_bytes(cls, state_f.read())
            except UnpicklingError:
                raise EnvironmentError(
                    f"Unable to convert pytorch model {archive_file} to Flax deserializable object. "
                )

        # init random models
        model = cls(config, *model_args, **model_kwargs)

        # if model is base model only use model_prefix key
        if cls.base_model_prefix not in dict(
                model.params) and cls.base_model_prefix in state:
            state = state[cls.base_model_prefix]

        # flatten dicts
        state = flatten_dict(state)
        random_state = flatten_dict(unfreeze(model.params))

        missing_keys = model.required_params - set(state.keys())
        unexpected_keys = set(state.keys()) - model.required_params

        # add missing keys as random parameters
        for missing_key in missing_keys:
            state[missing_key] = random_state[missing_key]

        if len(unexpected_keys) > 0:
            logger.warning(
                f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when "
                f"initializing {model.__class__.__name__}: {unexpected_keys}\n"
                f"- This IS expected if you are initializing {model.__class__.__name__} from the checkpoint of a model trained on another task "
                f"or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n"
                f"- This IS NOT expected if you are initializing {model.__class__.__name__} from the checkpoint of a model that you expect "
                f"to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)."
            )
        else:
            logger.info(
                f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n"
            )

        if len(missing_keys) > 0:
            logger.warning(
                f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} "
                f"and are newly initialized: {missing_keys}\n"
                f"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference."
            )
        else:
            logger.info(
                f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at {pretrained_model_name_or_path}.\n"
                f"If your task is similar to the task the model of the checkpoint was trained on, "
                f"you can already use {model.__class__.__name__} for predictions without further training."
            )

        # set correct parameters
        model.params = unflatten_dict(state)
        return model
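
Beyond the docstring examples, the ``dtype`` argument handled above is forwarded to the model constructor via ``model_kwargs``. A minimal usage sketch, assuming the Flax API shown in this snippet:

import jax.numpy as jnp
from transformers import FlaxBertModel

# Load the same checkpoint but request bfloat16 as the model dtype
# (exact dtype semantics depend on the transformers version; treat this as a sketch).
model = FlaxBertModel.from_pretrained('bert-base-cased', dtype=jnp.bfloat16)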