Example #1
    def from_pretrained(cls, pretrained_model_name_or_path:str):
        """
            Setting up this method will enable to load directly from huggingface hub just like other HF models are loaded
        """
        model_id = pretrained_model_name_or_path

        if len(model_id.split("/")) == 1:
            name = model_id
        else:
            username, name = model_id.split("/")

        if name in os.listdir():
            print("LOADING config & model weights from local directory")
            config_file = os.path.join(name, "config.json")
            model_file = os.path.join(name, "pytorch_model.bin")
        else:
            config_url = hf_bucket_url(model_id, filename="config.json")
            config_file = cached_path(config_url)
            # download only the adapter weights from the Hugging Face Hub;
            # the corresponding BERT weights are loaded when the class is instantiated
            model_url = hf_bucket_url(model_id, filename="pytorch_model.bin")
            model_file = cached_path(model_url)

        with open(config_file, "r", encoding="utf-8") as f:
            config = json.load(f)
        config = Dict.from_nested_dict(config)

        state_dict = torch.load(model_file, map_location="cpu")
        # instantiate the model from the given config (the BERT backbone weights are restored during init)
        model = cls(config)
        # now restoring adapter weights
        model.load_state_dict(state_dict, strict=False)
        model.eval()

        return model
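A minimal usage sketch of this loader; the class name and repo id are hypothetical placeholders, assuming the Hub repo contains a config.json and a pytorch_model.bin with the adapter weights:

# "AdapterModel" and the repo id are illustrative placeholders
model = AdapterModel.from_pretrained("username/bert-adapter-model")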
Example #2
def get_affect_words_and_int(affect_class):
  emotions = "https://raw.githubusercontent.com/ishikasingh/Affective-text-gen/master/NRC-Emotion-Intensity-Lexicon-v1.txt"
  filepath = cached_path(emotions)
  with open(filepath, "r") as f:
      words = f.read().strip().split("\n")[1:]
  words = [w.split("\t") for w in words]
  return [w[0] for w in words if w[1] == affect_class], [float(w[-1]) for w in words if w[1] == affect_class]
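A usage sketch, assuming the NRC lexicon's affect classes include a label such as "joy":

joy_words, joy_scores = get_affect_words_and_int("joy")
print(joy_words[:5], joy_scores[:5])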
Example #3
def download(url, file, dir="."):
    import shutil
    from transformers.file_utils import cached_path
    t = os.path.join(dir, "filesize.txt")
    shutil.copy(cached_path(url + "filesize.txt"), t)
    with open(t, "r") as f:
        r = f.read()
    ft = 0
    for t in r.split("\n"):
        s = t.split()
        if len(s) == 2:
            if s[0] == file:
                ft = int(s[1])
    if ft == 0:
        return
    shutil.copy(cached_path(url + file), os.path.join(dir, file))
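A usage sketch, assuming a base URL ending in a slash that serves both filesize.txt and the requested file, and that the target directory already exists (URL and file name below are placeholders):

download("https://example.com/checkpoints/", "model.bin", dir=".")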
Example #4
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> \
        List[List[List[int]]]:
    bow_indices = []
    # several entries may be passed in; currently e.g. ['military']
    for id_or_path in bag_of_words_ids_or_paths:
        # if it is one of the predefined bag-of-words sets
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            # resolve the file path with transformers' cached_path, i.e. here effectively
            # cached_path("https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt")
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            # otherwise treat it as a plain file path
            filepath = id_or_path

        # the file holds the list of words associated with e.g. "military"; collect them into words
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")

        # tokenize each word and append the resulting list
        bow_indices.append(
            [tokenizer.encode(word.strip(),
                              add_prefix_space=True,
                              add_special_tokens=False)
             for word in words])
    return bow_indices
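A usage sketch with a GPT-2 tokenizer ("military" being one of the predefined keys mentioned in the comments above):

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
bow_indices = get_bag_of_words_indices(["military"], tokenizer)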
Example #5
    def detect(self, texts):
        """
        Detects the language for each element in texts.

        Args:
            texts: list of text

        Returns:
            list of languages
        """

        if not self.detector:
            # Suppress unnecessary warning
            fasttext.FastText.eprint = lambda x: None

            # Load language detection model
            path = cached_path(self.langdetect)
            self.detector = fasttext.load_model(path)

        # Transform texts to format expected by language detection model
        texts = [
            x.lower().replace("\n", " ").replace("\r\n", " ") for x in texts
        ]

        return [x[0].split("__")[-1] for x in self.detector.predict(texts)[0]]
Example #6
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer, omit_file=None) -> \
        List[List[List[int]]]:
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
        if omit_file:
            with open(omit_file, "r") as f:
                omit_words = f.read().strip().split("\n")

            new_words = []
            for word in words:
                if word not in omit_words:
                    new_words.append(word)
            bow_indices.append([
                tokenizer.encode(word.strip(),
                                 add_prefix_space=True,
                                 add_special_tokens=False)
                for word in new_words
            ])
        else:
            bow_indices.append([
                tokenizer.encode(word.strip(),
                                 add_prefix_space=True,
                                 add_special_tokens=False) for word in words
            ])
    return bow_indices
Example #7
    def setUp(self):
        super().setUp()

        data_cached = cached_path(
            "https://cdn-datasets.huggingface.co/translation/wmt_en_ro-tr40k-va0.5k-te0.5k.tar.gz",
            extract_compressed_file=True,
        )
        self.data_dir = f"{data_cached}/wmt_en_ro-tr40k-va0.5k-te0.5k"
Example #8
def get_bag_of_words_indices(bag_of_words_ids_or_paths, tokenizer):
    bow_indices = []

    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
        bow_indices.append([tokenizer.encode(word.strip(), add_prefix_space=True, add_special_tokens=False) for word in words])
    return bow_indices
Example #9
def download_file_from_hf(pretrained_model_name_or_path: str,
                          file_name: str) -> str:
    # Load model
    if pretrained_model_name_or_path is not None:
        if os.path.isdir(pretrained_model_name_or_path):
            if os.path.isfile(
                    os.path.join(pretrained_model_name_or_path, file_name)):
                # Load from a PyTorch checkpoint
                archive_file = os.path.join(pretrained_model_name_or_path,
                                            file_name)
            else:
                raise EnvironmentError(
                    "Error no file named {} found in directory {}".format(
                        file_name,
                        pretrained_model_name_or_path,
                    ))
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(
                pretrained_model_name_or_path):
            archive_file = pretrained_model_name_or_path
        else:
            archive_file = hf_bucket_url(
                pretrained_model_name_or_path,
                filename=file_name,
                revision=None,
                mirror=None,
            )

        try:
            # Load from URL or cache if already cached
            resolved_archive_file = cached_path(
                archive_file,
                cache_dir=None,
                force_download=False,
                proxies=None,
                resume_download=False,
                local_files_only=False,
            )
        except EnvironmentError as err:
            logger.error(err)
            msg = (
                f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on "
                f"'https://huggingface.co/models'\n\n"
                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a "
                f"file named {file_name}.\n\n")
            raise EnvironmentError(msg)

        if resolved_archive_file == archive_file:
            logger.info("loading weights file {}".format(archive_file))
        else:
            logger.info("loading weights file {} from cache at {}".format(
                archive_file, resolved_archive_file))
    else:
        resolved_archive_file = None

    return resolved_archive_file
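A usage sketch (the model id is a public checkpoint; this assumes the repository actually hosts a file with the given name):

weights_path = download_file_from_hf("bert-base-uncased", "pytorch_model.bin")
state_dict = torch.load(weights_path, map_location="cpu")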
Example #10
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> List[List[List[int]]]:
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
        bow_indices.append([tokenizer.encode(word.strip(), add_prefix_space=True) for word in words])
    return bow_indices
Example #11
def load_model_from_cache(model_name_or_path, model_arch, cache_dir, filename,
                          config):
    url = hf_bucket_url(model_name_or_path, filename=filename)
    path = cached_path(url, cache_dir=cache_dir) + "." + model_arch
    xml_path = path + ".xml"
    bin_path = path + ".bin"
    model = None
    if os.path.exists(xml_path) and os.path.exists(bin_path):
        logger.info(f"Load OpenVINO model from cache: {xml_path}")
        model = load_ov_model_from_ir(xml_path, bin_path, config)
    return model, path
Example #12
def load_cached_hf_parameters(model_name_or_path, cache_dir):
    archive_file = hf_bucket_url(
        model_name_or_path,
        filename='pytorch_model.bin'
    )
    resolved_archive_file = cached_path(
        archive_file,
        cache_dir=cache_dir
    )
    state_dict = torch.load(resolved_archive_file, map_location="cpu")
    return state_dict
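A usage sketch that copies the cached parameters into an existing module; my_model is a placeholder for any nn.Module whose parameter names (partially) match the checkpoint:

state_dict = load_cached_hf_parameters("bert-base-uncased", cache_dir=None)
my_model.load_state_dict(state_dict, strict=False)  # strict=False tolerates naming differences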
Example #13
 def _get_config_dict(cls, path, **kw):
     local_files_only = kw.pop("local_files_only", False)
     from_pipeline = kw.pop("_from_pipeline", None)
     user_agent = {
         "file_type": "config",
         "from_auto_class": kw.pop("_from_auto", False)
     }
     if from_pipeline is not None:
         user_agent["using_pipeline"] = from_pipeline
     if is_offline_mode() and not local_files_only:
         log.info("Offline mode: forcing local_files_only=True")
         local_files_only = True
     path = str(path)
     if os.path.isfile(path) or is_remote_url(path):
         x = path
     else:
         f = kw.pop("_configuration_file", CONFIG_NAME)
         if os.path.isdir(path):
             x = os.path.join(path, f)
         else:
             x = hf_bucket_url(path,
                               filename=f,
                               revision=kw.pop("revision", None),
                               mirror=None)
     try:
         x2 = cached_path(
             x,
             cache_dir=kw.pop("cache_dir", None),
             force_download=kw.pop("force_download", False),
             proxies=kw.pop("proxies", None),
             resume_download=kw.pop("resume_download", False),
             local_files_only=local_files_only,
             use_auth_token=kw.pop("use_auth_token", None),
             user_agent=user_agent,
         )
     except RepositoryNotFoundError as e:
         raise OSError() from e
     except RevisionNotFoundError as e:
         raise OSError() from e
     except EntryNotFoundError as e:
         raise OSError() from e
     except HTTPError as e:
         raise OSError() from e
     except OSError as e:
         raise e
     try:
         y = cls._dict_from_json_file(x2)
     except (json.JSONDecodeError, UnicodeDecodeError) as e:
         raise OSError() from e
     if x2 == x:
         log.info(f"loading {x}")
     else:
         log.info(f"loading {x} from cache at {x2}")
     return y, kw
Example #14
def get_classifier(name):

    params = DISCRIMINATOR_MODELS_PARAMS[name]
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")

    return
Example #15
def get_classifier(
    name: Optional[str],
    class_label: Union[str, int],
    device: str,
    verbosity_level: int = REGULAR
) -> Tuple[Optional[ClassificationHead], Optional[int]]:
    """
    Load a previously saved small PyTorch discriminator classifier by name.
    """
    if name is None:
        return None, None

    params = DISCRIMINATOR_MODELS_PARAMS[name]
    classifier = ClassificationHead(class_size=params['class_size'],
                                    embed_size=params['embed_size']).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()

    if isinstance(class_label, str):
        if class_label in params["class_vocab"]:
            label_id = params["class_vocab"][class_label]
        else:
            label_id = params["default_class"]
            if verbosity_level >= REGULAR:
                print("class_label {} not in class_vocab".format(class_label))
                print("available values are: {}".format(params["class_vocab"]))
                print("using default class {}".format(label_id))

    elif isinstance(class_label, int):
        if class_label in set(params["class_vocab"].values()):
            label_id = class_label
        else:
            label_id = params["default_class"]
            if verbosity_level >= REGULAR:
                print("class_label {} not in class_vocab".format(class_label))
                print("available values are: {}".format(params["class_vocab"]))
                print("using default class {}".format(label_id))

    else:
        label_id = params["default_class"]

    return classifier, label_id
Example #16
def get_classifier(name: str, device: str) -> ClassificationHead:
    params = DISCRIMINATOR_MODELS_PARAMS[name]
    classifier = ClassificationHead(class_size=params["class_size"],
                                    embed_size=params["embed_size"]).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()
    return classifier
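A usage sketch, assuming DISCRIMINATOR_MODELS_PARAMS contains an entry such as "sentiment" (one of the discriminators used in the PPLM example this helper comes from):

classifier = get_classifier("sentiment", device="cpu")
# classifier is a ClassificationHead in eval mode, ready to score hidden states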
Example #17
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> List[List[List[int]]]:
    """
    BOW 词集进行tokenizer
    :param bag_of_words_ids_or_paths:
    :param tokenizer:
    :return:
    """
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
Example #18
def get_classifier(
        model, name: Optional[str], class_label: Union[str, int],
        device: str) -> Tuple[Optional[ClassificationHead], Optional[int]]:

    if name is None:
        return None, None

    params = DISCRIMINATOR_MODELS_PARAMS[name]
    classifier = ClassificationHead(class_size=params["class_size"],
                                    embed_size=params["embed_size"]).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError(
            "Either url or path have to be specified in the discriminator model parameters"
        )

    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()

    if isinstance(class_label, str):
        if class_label in params["class_vocab"]:
            label_id = params["class_vocab"][class_label]
        else:
            label_id = params["default_class"]
            print("class_label {} not in class_vocab".format(class_label))
            print("available values are: {}".format(params["class_vocab"]))
            print("using default class {}".format(label_id))

    elif isinstance(class_label, int):
        if class_label in set(params["class_vocab"].values()):
            label_id = class_label
        else:
            label_id = params["default_class"]
            print("class_label {} not in class_vocab".format(class_label))
            print("available values are: {}".format(params["class_vocab"]))
            print("using default class {}".format(label_id))

    else:
        label_id = params["default_class"]

    return classifier, label_id
Example #19
def load_from_cache(pretrained_model_name_or_path, s3_dict, **kwargs):
    # Adjusted from HF Transformers to fit loading WordEmbeddings from deepsets s3
    # Load from URL or cache if already cached
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)

    s3_file = s3_dict[pretrained_model_name_or_path]
    try:
        resolved_file = cached_path(
            s3_file,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
        )

        if resolved_file is None:
            raise EnvironmentError

    except EnvironmentError:
        if pretrained_model_name_or_path in s3_dict:
            msg = "Couldn't reach server at '{}' to download data.".format(
                s3_file)
        else:
            msg = (
                "Model name '{}' was not found in model name list. "
                "We assumed '{}' was a path, a model identifier, or url to a configuration file or "
                "a directory containing such a file but couldn't find any such file at this path or url."
                .format(
                    pretrained_model_name_or_path,
                    s3_file,
                ))
        raise EnvironmentError(msg)

    if resolved_file == s3_file:
        logger.info("loading file {}".format(s3_file))
    else:
        logger.info("loading file {} from cache at {}".format(
            s3_file, resolved_file))

    return resolved_file
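A usage sketch; the dictionary key and URL below are placeholders for whatever embedding files the s3_dict actually maps:

s3_dict = {"glove-english-uncased-6B": "https://example.com/glove.english.uncased.txt"}  # placeholder URL
resolved = load_from_cache("glove-english-uncased-6B", s3_dict, cache_dir=None)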
Example #20
def get_classifier(discrim_meta: Optional[dict],
                   device: str) -> Optional[ClassificationHead]:
    if discrim_meta is None:
        return None

    params = discrim_meta
    classifier = ClassificationHead(class_size=params['class_size'],
                                    embed_size=params['embed_size']).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()

    return classifier
Example #21
def get_bag_of_words_indices_rhyming(bag_of_words_ids_or_paths: List[str], tokenizer, rhyming_words: List[str]) -> \
  List[List[List[int]]]:
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            if len(rhyming_words) > 0:
                words = rhyming_words
            else:
                words = f.read().strip().split("\n")
            # words.extend(rhyming_words)
        bow_indices.append([
            tokenizer.encode(word.strip(),
                             add_prefix_space=True,
                             add_special_tokens=False) for word in words
        ])
    return bow_indices
Example #22
    def from_pretrained(cls,
                        pretrained_model_name,
                        cache_dir=None,
                        *inputs,
                        **kwargs):
        """
		Instantiate a PreTrainedBertModel from a pre-trained model file.
		Download and cache the pre-trained model file if needed.
		"""
        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
        else:
            vocab_file = pretrained_model_name
        if os.path.isdir(vocab_file):
            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        except FileNotFoundError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    vocab_file))
            return None
        if resolved_vocab_file == vocab_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
        return tokenizer
Example #23
    def load_from_bert_pretrained(cls, config_file, pretrained_model_name='bert-base-uncased', **kwargs):
        model = cls(config_file, **kwargs)
        model(model.dummy_inputs, training=False)

        ckpt_layer_mapping = {}
        for vind, ckpt_ind in enumerate(model.config.ckpt_layer_mapping.split(',')):
            ckpt_layer_mapping['layer_._{}'.format(vind)] = 'layer_._{}'.format(ckpt_ind)

        archive_file = hf_bucket_url(pretrained_model_name, filename=TF2_WEIGHTS_NAME, use_cdn=True)
        resolved_archive_file = cached_path(archive_file, cache_dir=None, force_download=False, resume_download=False,
                                            proxies=None)
        f = h5py.File(resolved_archive_file, mode='r')

        layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
        g = f[layer_names[0]]
        weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
        weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
        weights_map = {'/'.join(name.split('/')[2:]): i for i, name in enumerate(weight_names)}
        weight_value_tuples = []
        w_names = []
        for w in model.layers[0].weights:
            w_name = '/'.join(w.name.split('/')[3:])
            for k in ckpt_layer_mapping:
                if k in w_name:
                    w_name = w_name.replace(k, ckpt_layer_mapping[k])
                    break

            if w_name in weights_map and w.shape == weight_values[weights_map[w_name]].shape:
                w_names.append(w_name)
                weight_value_tuples.append((w, weight_values[weights_map[w_name]]))

        logger.info("Loaded %d weights" % (len(w_names)))
        logger.info("Loaded weights names are: %s" % (", ".join(w_names)))

        K.batch_set_value(weight_value_tuples)

        print("Loaded %d weights" % (len(w_names)))
        print("Loaded weights names are: %s" % (", ".join(w_names)))

        model(model.dummy_inputs, training=False)
        return model
Example #24
    def __init__(self,
                 vocab_size=30522,
                 bert_model_name=None,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 max_position_embeddings=512,
                 intermediate_size=3072,
                 hidden_size=768,
                 dropout=0.1):
        config = BertConfig(vocab_size_or_config_json_file=vocab_size,
                            num_hidden_layers=num_hidden_layers,
                            num_attention_heads=num_attention_heads,
                            max_position_embeddings=max_position_embeddings,
                            intermediate_size=intermediate_size,
                            hidden_size=hidden_size,
                            attention_probs_dropout_prob=dropout,
                            hidden_act="gelu",
                            hidden_dropout_prob=dropout,
                            initializer_range=0.02,
                            layer_norm_eps=1e-12,
                            type_vocab_size=2)
        super(TransformerEncoder, self).__init__(config=config)
        if bert_model_name:
            # extract file path
            if bert_model_name in BERT_PRETRAINED_MODEL_ARCHIVE_MAP:
                archive_file = BERT_PRETRAINED_MODEL_ARCHIVE_MAP[
                    bert_model_name]
            else:
                archive_file = bert_model_name
            archive_file = cached_path(archive_file)

            bert_state_dict = torch.load(archive_file)

            bert_state_keys = bert_state_dict.keys()

            for k, p in self.named_parameters():
                bert_key = "bert." + k
                if bert_key in bert_state_keys and bert_state_dict[
                        bert_key].size() == p.size():
                    p.data.copy_(bert_state_dict[bert_key].data)
Example #25
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> \
        List[List[List[int]]]:
    """
    Converts the list of words from a file into a list of lists of token indices.
    Each line gets its own list of tokens.
    """
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")
        bow_indices.append([
            tokenizer.encode(
                word.strip(),
                # apparently splits long words into several shorter sub-word tokens
                add_prefix_space=True,
                add_special_tokens=False) for word in words
        ])
    return bow_indices
Example #26
def get_regressor(
    name: Optional[str],
    device: str,
    verbosity_level: int = REGULAR
) -> Optional[RegressionHead1]:
    if name is None:
        return None

    params = DISCRIMINATOR_MODELS_PARAMS[name]
    classifier = RegressionHead1(embed_size=params['embed_size']).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()

    return classifier
Example #27
def get_classifier(
        discrim_meta: Optional[dict], class_label: Union[str, int],
        device: str) -> Tuple[Optional[ClassificationHead], Optional[int]]:
    if discrim_meta is None:
        return None, None

    params = discrim_meta
    classifier = ClassificationHead(class_size=params['class_size'],
                                    embed_size=params['embed_size']).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()

    if isinstance(class_label, str):
        if class_label in params["class_vocab"]:
            label_id = params["class_vocab"][class_label]
        else:
            label_id = params["default_class"]

    elif isinstance(class_label, int):
        if class_label in set(params["class_vocab"].values()):
            label_id = class_label
        else:
            label_id = params["default_class"]

    else:
        label_id = params["default_class"]

    return classifier, label_id
Example #28
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
                        **kwargs):
        """Instantiate a pretrained pytorch model from a pre-trained model configuration.

        The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with ``model.train()``

        The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
        It is up to you to train those weights with a downstream fine-tuning task.

        The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.

        Parameters:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from a saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Examples::

            model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        config = kwargs.pop('config', None)
        state_dict = kwargs.pop('state_dict', None)
        cache_dir = kwargs.pop('cache_dir', None)
        from_tf = kwargs.pop('from_tf', False)
        force_download = kwargs.pop('force_download', False)
        proxies = kwargs.pop('proxies', None)
        output_loading_info = kwargs.pop('output_loading_info', False)
        random_init = kwargs.pop("random_init", False)
        use_cdn = kwargs.pop("use_cdn", True)
        local_files_only = kwargs.pop("local_files_only", False)
        resume_download = kwargs.pop("resume_download", False)
        kwargs_config = kwargs.copy()

        mapping_keys_state_dic = kwargs.pop("mapping_keys_state_dic", None)
        kwargs_config.pop("mapping_keys_state_dic", None)

        if config is None:

            config, model_kwargs = cls.config_class.from_pretrained(
                pretrained_model_name_or_path,
                *model_args,
                cache_dir=cache_dir,
                return_unused_kwargs=True,
                force_download=force_download,
                **kwargs_config)
        else:
            model_kwargs = kwargs

        # Load model
        if pretrained_model_name_or_path is not None:
            if os.path.isdir(pretrained_model_name_or_path):
                if from_tf and os.path.isfile(
                        os.path.join(pretrained_model_name_or_path,
                                     TF_WEIGHTS_NAME + ".index")):
                    # Load from a TF 1.0 checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path,
                                                TF_WEIGHTS_NAME + ".index")
                elif from_tf and os.path.isfile(
                        os.path.join(pretrained_model_name_or_path,
                                     TF2_WEIGHTS_NAME)):
                    # Load from a TF 2.0 checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path,
                                                TF2_WEIGHTS_NAME)
                elif os.path.isfile(
                        os.path.join(pretrained_model_name_or_path,
                                     WEIGHTS_NAME)):
                    # Load from a PyTorch checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path,
                                                WEIGHTS_NAME)
                else:
                    raise EnvironmentError(
                        "Error no file named {} found in directory {} or `from_tf` set to False"
                        .format(
                            [
                                WEIGHTS_NAME, TF2_WEIGHTS_NAME,
                                TF_WEIGHTS_NAME + ".index"
                            ],
                            pretrained_model_name_or_path,
                        ))
            elif os.path.isfile(
                    pretrained_model_name_or_path) or is_remote_url(
                        pretrained_model_name_or_path):
                archive_file = pretrained_model_name_or_path
            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
                assert (
                    from_tf
                ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
                    pretrained_model_name_or_path + ".index")
                archive_file = pretrained_model_name_or_path + ".index"
            else:
                archive_file = hf_bucket_url(
                    pretrained_model_name_or_path,
                    filename=(TF2_WEIGHTS_NAME if from_tf else WEIGHTS_NAME),
                    use_cdn=use_cdn,
                )

            try:
                # Load from URL or cache if already cached
                resolved_archive_file = cached_path(
                    archive_file,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                )
                if resolved_archive_file is None:
                    raise EnvironmentError
            except EnvironmentError:
                msg = (
                    f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                    f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                    f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME}.\n\n"
                )
                raise EnvironmentError(msg)

            if resolved_archive_file == archive_file:
                logger.info("loading weights file {}".format(archive_file))
            else:
                logger.info("loading weights file {} from cache at {}".format(
                    archive_file, resolved_archive_file))
        else:
            resolved_archive_file = None

        # Instantiate model.

        model = cls(config, *model_args, **model_kwargs)

        if state_dict is None and not from_tf:
            state_dict = torch.load(resolved_archive_file, map_location='cpu')

        missing_keys = []
        unexpected_keys = []
        error_msgs = []

        if from_tf:
            if resolved_archive_file.endswith('.index'):
                # Load from a TensorFlow 1.X checkpoint - provided by original authors
                model = cls.load_tf_weights(
                    model, config,
                    resolved_archive_file[:-6])  # Remove the '.index'
            else:
                # Load from our TensorFlow 2.0 checkpoints
                try:
                    from transformers import load_tf2_checkpoint_in_pytorch_model
                    model = load_tf2_checkpoint_in_pytorch_model(
                        model, resolved_archive_file, allow_missing_keys=True)
                except ImportError as e:
                    logger.error(
                        "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
                        "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
                    )
                    raise e
        else:
            # Convert old format to new format if needed from a PyTorch state_dict
            old_keys = []
            new_keys = []

            for key in state_dict.keys():
                new_key = None
                if 'gamma' in key:
                    new_key = key.replace('gamma', 'weight')
                if 'beta' in key:
                    new_key = key.replace('beta', 'bias')
                if new_key:
                    old_keys.append(key)
                    new_keys.append(new_key)
            for old_key, new_key in zip(old_keys, new_keys):
                state_dict[new_key] = state_dict.pop(old_key)

            # copy state_dict so _load_from_state_dict can modify it
            metadata = getattr(state_dict, '_metadata', None)
            state_dict = state_dict.copy()
            if metadata is not None:
                state_dict._metadata = metadata
            # assert mapping_keys_state_dic is not None, "ERROR did not found mapping dicts for {} ".format(pretrained_model_name_or_path)
            # mapping_keys_state_dic = {"roberta": "encoder", "lm_head": "head.mlm"}
            if mapping_keys_state_dic is not None:
                assert isinstance(mapping_keys_state_dic, dict), "ERROR "
                print(
                    "INFO: from_pretrained is renaming some state_dict keys "
                    "(assuming an original Google checkpoint): {}".format(
                        mapping_keys_state_dic))
                state_dict = cls.adapt_state_dic_to_multitask(
                    state_dict,
                    keys_mapping=mapping_keys_state_dic,
                    add_prefix=pretrained_model_name_or_path ==
                    "asafaya/bert-base-arabic")
                #pdb.set_trace()

            def load(module, prefix=''):

                local_metadata = {"version": 1}

                if not prefix.startswith("head") or prefix.startswith(
                        "head.mlm"):
                    assert len(
                        missing_keys
                    ) == 0, "ERROR {} missing keys in state_dict {}".format(
                        prefix, missing_keys)
                else:
                    if len(missing_keys) > 0:
                        print(
                            "Warning {} missing keys in state_dict {} (warning expected for task-specific fine-tuning)"
                            .format(prefix, missing_keys))

                module._load_from_state_dict(state_dict, prefix,
                                             local_metadata, True,
                                             missing_keys, unexpected_keys,
                                             error_msgs)
                for name, child in module._modules.items():

                    # load_params_only_ls = kwargs.get("load_params_only_ls ")
                    not_load_params_ls = kwargs.get(
                        "not_load_params_ls") if kwargs.get(
                            "not_load_params_ls") is not None else []
                    assert isinstance(
                        not_load_params_ls, list
                    ), f"Argument error not_load_params_ls should be a list but is {not_load_params_ls}"
                    matching_not_load = []
                    # RANDOM-INIT
                    for pattern in not_load_params_ls:
                        matching = re.match(pattern, prefix + name)
                        if matching is not None:
                            matching_not_load.append(matching)
                    if len(matching_not_load) > 0:
                        # means at least one "not load" pattern matched --> this parameter should NOT be loaded
                        print("MATCH not loading : {} parameters {} ".format(
                            prefix + name, not_load_params_ls))
                    if child is not None and len(matching_not_load) == 0:
                        #print("MODEL loading : child {} full {} ".format(name, prefix + name + '.'))
                        load(child, prefix + name + '.')
                    else:
                        print(
                            "MODEL not loading : child {} matching_not_load {} "
                            .format(child, matching_not_load))

            # Make sure we are able to load base models as well as derived models (with heads)
            start_prefix = ''
            model_to_load = model
            if not hasattr(model, cls.base_model_prefix) and any(
                    s.startswith(cls.base_model_prefix)
                    for s in state_dict.keys()):
                start_prefix = cls.base_model_prefix + '.'
            if hasattr(model, cls.base_model_prefix) and not any(
                    s.startswith(cls.base_model_prefix)
                    for s in state_dict.keys()):
                model_to_load = getattr(model, cls.base_model_prefix)
            if not random_init:
                load(model_to_load, prefix=start_prefix)
            else:
                print("WARNING : RANDOM INTIALIZATION OF BERTMULTITASK")

            if len(missing_keys) > 0:
                logger.info(
                    "Weights of {} not initialized from pretrained model: {}".
                    format(model.__class__.__name__, missing_keys))
            if len(unexpected_keys) > 0:
                logger.info(
                    "Weights from pretrained model not used in {}: {}".format(
                        model.__class__.__name__, unexpected_keys))
            if len(error_msgs) > 0:
                raise RuntimeError(
                    'Error(s) in loading state_dict for {}:\n\t{}'.format(
                        model.__class__.__name__, "\n\t".join(error_msgs)))

        if hasattr(model, 'tie_weights'):
            model.tie_weights(
            )  # make sure word embedding weights are still tied

        # Set model in evaluation mode to deactivate Dropout modules by default
        model.eval()

        if output_loading_info:
            loading_info = {
                "missing_keys": missing_keys,
                "unexpected_keys": unexpected_keys,
                "error_msgs": error_msgs
            }
            return model, loading_info

        return model
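A usage sketch of the extra keyword arguments this variant understands; the subclass name is a placeholder, and the key mapping mirrors the commented-out example above:

model = BertMultiTask.from_pretrained(            # hypothetical subclass using this from_pretrained
    "roberta-base",
    mapping_keys_state_dic={"roberta": "encoder", "lm_head": "head.mlm"},  # rename checkpoint prefixes
    not_load_params_ls=[r"head\."],               # regex patterns for parameters to keep randomly initialized
)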
Example #29
    def from_pretrained(cls, model_name_or_path, *model_args, **kwargs):
        cache_dir = kwargs.get("cache_dir", None)
        from_pt = kwargs.pop("from_pt", False)
        from_tf = kwargs.pop("from_tf", False)
        from_ov = kwargs.get("from_ov", not (from_pt | from_tf))
        force_download = kwargs.get("force_download", False)
        resume_download = kwargs.get("resume_download", False)
        proxies = kwargs.get("proxies", None)
        local_files_only = kwargs.get("local_files_only", False)
        use_auth_token = kwargs.get("use_auth_token", None)
        revision = kwargs.get("revision", None)
        from_pipeline = kwargs.get("_from_pipeline", None)
        from_auto_class = kwargs.get("_from_auto", False)

        config = kwargs.get(
            "config") if "config" in kwargs else AutoConfig.from_pretrained(
                model_name_or_path)

        if from_pt:
            model = cls._pt_auto_model.from_pretrained(model_name_or_path,
                                                       *model_args, **kwargs)
            net = load_ov_model_from_pytorch(model)
            return OVPreTrainedModel(net, model.config)
        elif from_tf:
            model, cache_path = load_model_from_cache(model_name_or_path,
                                                      cls.__name__, cache_dir,
                                                      TF2_WEIGHTS_NAME, config)
            if model is not None:
                return model
            model = cls._tf_auto_model.from_pretrained(model_name_or_path,
                                                       *model_args, **kwargs)
            return load_ov_model_from_tf(model, cache_path)

        user_agent = {
            "file_type": "model",
            "framework": "openvino",
            "from_auto_class": from_auto_class
        }
        if from_pipeline is not None:
            user_agent["using_pipeline"] = from_pipeline

        # Load model
        OV_BIN_NAME = OV_WEIGHTS_NAME.replace(".xml", ".bin")
        if model_name_or_path is not None:
            if os.path.isdir(model_name_or_path):
                if (from_ov and os.path.isfile(
                        os.path.join(model_name_or_path, OV_WEIGHTS_NAME))
                        and os.path.isfile(
                            os.path.join(model_name_or_path, OV_BIN_NAME))):
                    # Load from an OpenVINO IR
                    archive_files = [
                        os.path.join(model_name_or_path, name)
                        for name in [OV_WEIGHTS_NAME, OV_BIN_NAME]
                    ]
                else:
                    raise EnvironmentError(
                        f"Error no files named {[OV_WEIGHTS_NAME, OV_BIN_NAME]} found in directory "
                        f"{model_name_or_path} or `from_ov` set to False")
            # elif os.path.isfile(model_name_or_path) or is_remote_url(model_name_or_path):
            #     archive_file = model_name_or_path
            else:
                names = [OV_WEIGHTS_NAME, OV_BIN_NAME]
                archive_files = [
                    hf_bucket_url(
                        model_name_or_path,
                        filename=name,
                        revision=revision,
                    ) for name in names
                ]

            # redirect to the cache, if necessary
            try:
                resolved_archive_files = [
                    cached_path(
                        archive_file,
                        cache_dir=cache_dir,
                        force_download=force_download,
                        proxies=proxies,
                        resume_download=resume_download,
                        local_files_only=local_files_only,
                        use_auth_token=use_auth_token,
                        user_agent=user_agent,
                    ) for archive_file in archive_files
                ]
            except EnvironmentError as err:
                logger.error(err)
                name = model_name_or_path
                msg = (
                    f"Can't load weights for '{name}'. Make sure that:\n\n"
                    f"- '{name}' is a correct model identifier listed on 'https://huggingface.co/models'\n"
                    f"  (make sure '{name}' is not a path to a local directory with something else, in that case)\n\n"
                    f"- or '{name}' is the correct path to a directory containing a file named {OV_WEIGHTS_NAME}.\n\n"
                )
                raise EnvironmentError(msg)

            if resolved_archive_files == archive_files:
                logger.info(f"loading weights file {archive_files}")
            else:
                logger.info(
                    f"loading weights file {archive_files} from cache at {resolved_archive_files}"
                )
        else:
            resolved_archive_files = None

        return load_ov_model_from_ir(*resolved_archive_files, config=config)
Example #30
def get_pretrained_state_dict(pretrained_model_name_or_path, *model_args,
                              **kwargs):
    """Get PyTorch state dict via HuggingFace transformers library."""
    config = kwargs.pop("config", None)
    state_dict = kwargs.pop("state_dict", None)
    cache_dir = kwargs.pop("cache_dir", None)
    # from_tf = kwargs.pop("from_tf", False)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)
    output_loading_info = kwargs.pop("output_loading_info", False)
    local_files_only = kwargs.pop("local_files_only", False)
    use_cdn = kwargs.pop("use_cdn", True)
    mirror = kwargs.pop("mirror", None)

    if pretrained_model_name_or_path is not None:
        if os.path.isdir(pretrained_model_name_or_path):
            if os.path.isfile(
                    os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
                # Load from a PyTorch checkpoint
                archive_file = os.path.join(pretrained_model_name_or_path,
                                            WEIGHTS_NAME)
            else:
                raise EnvironmentError(
                    "Error no file named {} found in directory {}".format(
                        WEIGHTS_NAME,
                        pretrained_model_name_or_path,
                    ))
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(
                pretrained_model_name_or_path):
            archive_file = pretrained_model_name_or_path
        elif os.path.isfile(pretrained_model_name_or_path + ".index"):
            assert False, "Loading TensorFlow checkpoints is not supported"
        else:
            archive_file = hf_bucket_url(
                pretrained_model_name_or_path,
                filename=WEIGHTS_NAME,
                use_cdn=use_cdn,
                mirror=mirror,
            )

        try:
            # Load from URL or cache if already cached
            resolved_archive_file = cached_path(
                archive_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
            )
            if resolved_archive_file is None:
                raise EnvironmentError
        except EnvironmentError:
            msg = (
                f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named {WEIGHTS_NAME}.\n\n"
            )
            raise EnvironmentError(msg)

        if resolved_archive_file == archive_file:
            print("loading weights file {}".format(archive_file))
        else:
            print("loading weights file {} from cache at {}".format(
                archive_file, resolved_archive_file))
    else:
        resolved_archive_file = None

    if state_dict is None:
        try:
            state_dict = torch.load(resolved_archive_file, map_location="cpu")
        except Exception:
            raise OSError(
                "Unable to load weights from pytorch checkpoint file.")
    return state_dict
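A usage sketch (public model id; WEIGHTS_NAME is assumed to be "pytorch_model.bin" as in transformers):

state_dict = get_pretrained_state_dict("bert-base-uncased")
print(len(state_dict), "tensors loaded")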