Esempio n. 1
0
    def get_infersent_model(self):
        """Build the pretrained InferSent encoder.

        Fetches the version-2 InferSent weights and the fastText vector file
        through ``utils.download_if_needed``, loads the weights into a new
        ``InferSentModel`` and calls ``build_vocab_k_words`` with
        ``K=100000``.

        Returns:
            The pretrained InferSent model.
        """
        version = 2
        encoder_config = dict(
            bsize=64,
            word_emb_dim=300,
            enc_lstm_dim=2048,
            pool_type="max",
            dpout_model=0.0,
            version=version,
        )

        # Locate (downloading when necessary) the serialized weights.
        weights_dir = utils.download_if_needed(InferSent.MODEL_PATH)
        weights_file = os.path.join(weights_dir, f"infersent{version}.pkl")

        model = InferSentModel(encoder_config)
        model.load_state_dict(torch.load(weights_file))

        # The word-vector path has to be registered before the vocabulary
        # is built.
        vectors_dir = utils.download_if_needed(InferSent.WORD_EMBEDDING_PATH)
        model.set_w2v_path(
            os.path.join(vectors_dir, "fastText", "crawl-300d-2M.vec"))
        model.build_vocab_k_words(K=100000)
        return model
Esempio n. 2
0
    def _load_classification_text_file(self,
                                       text_file_name,
                                       offset=0,
                                       shuffle=False):
        """Loads tuples from lines of a classification text file.

        Format must look like:

            1 this is a great little ...
            0 "i love hot n juicy .  ...
            0 "\""this world needs a ...

        The processed examples are stored in ``self.examples`` and
        ``self._i`` is reset to 0.

        Arguments:
            text_file_name (str): name of the text file to load from.
            offset (int): line to start reading from
            shuffle (bool): If True, randomly shuffle loaded data
        """
        text_file_path = utils.download_if_needed(text_file_name)
        # The context manager closes the handle even when reading raises,
        # so a failure can no longer leak the open file.
        with open(text_file_path, "r") as text_file:
            raw_lines = text_file.readlines()[offset:]
        # Clean every line first, then process, as two separate passes.
        raw_lines = [self._clean_example(ex) for ex in raw_lines]
        self.examples = [
            self._process_example_from_file(ex) for ex in raw_lines
        ]
        self._i = 0
        if shuffle:
            random.shuffle(self.examples)
Esempio n. 3
0
 def __init__(self, max_candidates=15, embedding_type='paragramcf', **kwargs):
     """Load the word embeddings, word list and nearest-neighbour matrix.

     Args:
         max_candidates (int): stored on the instance as
             ``max_candidates``.
         embedding_type (str): name of the embedding to load; only
             ``'paragramcf'`` is recognised.
         **kwargs: forwarded unchanged to the parent constructor.

     Raises:
         ValueError: if ``embedding_type`` is not a known embedding.
     """
     super().__init__(**kwargs)
     self.max_candidates = max_candidates
     self.embedding_type = embedding_type
     if embedding_type != 'paragramcf':
         raise ValueError(f'Could not find word embedding {embedding_type}')
     embeddings_name, vocab_name, neighbours_name = (
         'paragram.npy', 'wordlist.pickle', 'nn.npy')

     # Fetch the embedding folder unless it is already cached.
     cache_dir = utils.download_if_needed('{}/{}'.format(
         WordSwapEmbedding.PATH, embedding_type))

     # Read the three artefacts from the cache folder.
     self.word_embeddings = np.load(os.path.join(cache_dir, embeddings_name))
     self.word_embedding_word2index = np.load(
         os.path.join(cache_dir, vocab_name), allow_pickle=True)
     self.nn = np.load(os.path.join(cache_dir, neighbours_name))

     # Invert the word -> index mapping to get index -> word.
     self.word_embedding_index2word = {
         index: word
         for word, index in self.word_embedding_word2index.items()
     }
Esempio n. 4
0
    def from_pretrained(cls, name_or_path):
        """Load trained LSTM model by name or from path.

        If ``config.json`` exists in the model folder it supplies the
        constructor arguments; otherwise a default configuration is used.

        Args:
            name_or_path (str): Name of the model (e.g. "lstm-imdb") or model saved via `save_pretrained`.

        Returns:
            An instance of ``cls`` with the cached state dict loaded.
        """
        if name_or_path in TEXTATTACK_MODELS:
            path = utils.download_if_needed(TEXTATTACK_MODELS[name_or_path])
        else:
            path = name_or_path

        config_path = os.path.join(path, "config.json")
        if os.path.exists(config_path):
            with open(config_path, "r") as f:
                config = json.load(f)
        else:
            # Default config
            config = {
                "hidden_size": 150,
                "depth": 1,
                "dropout": 0.3,
                "num_labels": 2,
                "max_seq_length": 128,
                "model_path": None,
                "emb_layer_trainable": True,
            }
        # "architectures" is not passed to the constructor. Dropping it with
        # a default tolerates config files that never stored the key, which
        # a plain `del` would reject with KeyError.
        config.pop("architectures", None)
        model = cls(**config)
        state_dict = load_cached_state_dict(path)
        model.load_state_dict(state_dict)
        return model
Esempio n. 5
0
    def __init__(self, model_path, num_labels=2):
        """Load a BERT sequence classifier and its tokenizer.

        Args:
            model_path: location handed to ``utils.download_if_needed``.
            num_labels (int): forwarded to
                ``BertForSequenceClassification.from_pretrained``.
        """
        local_model_dir = utils.download_if_needed(model_path)

        # Build the classifier, move it to the configured device and put it
        # in evaluation mode.
        self.model = BertForSequenceClassification.from_pretrained(
            local_model_dir,
            num_labels=num_labels,
        )
        classifier = self.model
        classifier.to(utils.device)
        classifier.eval()

        self.tokenizer = BERTTokenizer(local_model_dir)
 def load_glove200(self):
     """Load the GloVe word list and the ``glove.6B.200d`` matrix.

     Returns:
         tuple: ``(embedding_matrix, word_list)``, both as returned by
         ``np.load``.
     """
     glove_dir = utils.download_if_needed(GloveEmbeddingLayer.EMBEDDING_PATH)
     # The word list is read before the matrix.
     words = np.load(os.path.join(glove_dir, "glove.wordlist.npy"))
     matrix = np.load(os.path.join(glove_dir, "glove.6B.200d.mat.npy"))
     return matrix, words
Esempio n. 7
0
    def counterfitted_GLOVE_embedding():
        """Returns a prebuilt counter-fitted GLOVE word embedding proposed by
        "Counter-fitting Word Vectors to Linguistic Constraints" (Mrkšić et
        al., 2016).

        The embedding is stored in ``utils.GLOBAL_OBJECTS`` after the first
        call, so later calls return the same object instead of reloading it.
        """
        cache_key = "textattack_counterfitted_GLOVE_embedding"

        # Reuse an already-built embedding so components share one copy.
        if cache_key in utils.GLOBAL_OBJECTS:
            cached = utils.GLOBAL_OBJECTS[cache_key]
            if isinstance(cached, WordEmbedding):
                return cached

        # Download the embedding folder if it is not cached. Backslashes
        # produced by os.path.join are turned into forward slashes first.
        remote_folder = os.path.join(WordEmbedding.PATH,
                                     "paragramcf").replace("\\", "/")
        local_folder = utils.download_if_needed(remote_folder)

        def _in_folder(file_name):
            # Full path of a file inside the downloaded folder.
            return os.path.join(local_folder, file_name)

        # Load the matrix, vocabulary and nearest-neighbour table.
        embedding_matrix = np.load(_in_folder("paragram.npy"))
        word2index = np.load(_in_folder("wordlist.pickle"), allow_pickle=True)
        index2word = {index: word for word, index in word2index.items()}
        nn_matrix = np.load(_in_folder("nn.npy"))

        embedding = WordEmbedding(embedding_matrix, word2index, index2word,
                                  nn_matrix)

        # Attach the pickled distance/similarity matrices.
        with open(_in_folder("mse_dist.p"), "rb") as mse_handle:
            mse_dist_mat = pickle.load(mse_handle)
        with open(_in_folder("cos_sim.p"), "rb") as cos_handle:
            cos_sim_mat = pickle.load(cos_handle)
        embedding._mse_dist_mat = mse_dist_mat
        embedding._cos_sim_mat = cos_sim_mat

        utils.GLOBAL_OBJECTS[cache_key] = embedding
        return embedding
 def __init__(self):
     """Initialise the layer from the cached GloVe word list and matrix.

     Both arrays are read with ``np.load`` from the folder returned by
     ``utils.download_if_needed`` and passed to the parent constructor.
     """
     glove_dir = utils.download_if_needed(GloveEmbeddingLayer.EMBEDDING_PATH)
     # The word list is read before the matrix.
     words = np.load(os.path.join(glove_dir, "glove.wordlist.npy"))
     matrix = np.load(os.path.join(glove_dir, "glove.6B.200d.mat.npy"))
     super().__init__(word_list=words, embedding_matrix=matrix)
 def __init__(self, model_path, num_labels=2, entailment=False):
     """Load a BERT sequence classifier and pick a matching tokenizer.

     Args:
         model_path: location handed to ``utils.download_if_needed``.
         num_labels (int): forwarded to
             ``BertForSequenceClassification.from_pretrained``.
         entailment (bool): when true a ``BERTEntailmentTokenizer`` is
             used; otherwise a ``BERTTokenizer`` built from the model
             folder.
     """
     local_model_dir = utils.download_if_needed(model_path)

     # Build the classifier, move it to the device and switch to eval mode.
     self.model = BertForSequenceClassification.from_pretrained(
         local_model_dir,
         num_labels=num_labels,
     )
     classifier = self.model
     classifier.to(utils.get_device())
     classifier.eval()

     self.tokenizer = (BERTEntailmentTokenizer()
                       if entailment else BERTTokenizer(local_model_dir))
 def __init__(self, emb_layer_trainable=True):
     """Initialise the layer from the cached GloVe word list and matrix.

     Args:
         emb_layer_trainable (bool): value assigned to
             ``self.embedding.weight.requires_grad`` after the parent
             constructor has run.
     """
     glove_dir = utils.download_if_needed(GloveEmbeddingLayer.EMBEDDING_PATH)
     # The word list is read before the matrix.
     words = np.load(os.path.join(glove_dir, "glove.wordlist.npy"))
     matrix = np.load(os.path.join(glove_dir, "glove.6B.200d.mat.npy"))
     super().__init__(word_list=words, embedding_matrix=matrix)
     self.embedding.weight.requires_grad = emb_layer_trainable
Esempio n. 11
0
    def __init__(
        self,
        embedding_type="paragramcf",
        include_unknown_words=True,
        min_cos_sim=None,
        max_mse_dist=None,
        cased=False,
        compare_against_original=True,
    ):
        """Load word embeddings and, when needed, precomputed distances.

        Args:
            embedding_type (str): name of the embedding to load; only
                ``"paragramcf"`` is recognised.
            include_unknown_words (bool): stored on the instance.
            min_cos_sim: stored on the instance; when not None, the pickled
                cosine-similarity matrix is loaded if its file exists.
            max_mse_dist: stored on the instance; when not None, the pickled
                MSE-distance matrix is loaded if its file exists.
            cased (bool): stored on the instance.
            compare_against_original: forwarded to the parent constructor.

        Raises:
            ValueError: if ``embedding_type`` is not a known embedding.
        """
        super().__init__(compare_against_original)
        self.include_unknown_words = include_unknown_words
        self.cased = cased
        self.min_cos_sim = min_cos_sim
        self.max_mse_dist = max_mse_dist

        self.embedding_type = embedding_type
        if embedding_type == "paragramcf":
            word_embeddings_folder = "paragramcf"
            word_embeddings_file = "paragram.npy"
            word_list_file = "wordlist.pickle"
            mse_dist_file = "mse_dist.p"
            cos_sim_file = "cos_sim.p"
        else:
            raise ValueError(f"Could not find word embedding {embedding_type}")

        # Download embeddings if they're not cached. os.path.join emits
        # backslashes on Windows; normalise them to "/" so the location
        # requested from download_if_needed is identical on every platform.
        word_embeddings_folder = os.path.join(
            WordEmbeddingDistance.PATH,
            word_embeddings_folder).replace("\\", "/")

        word_embeddings_folder = utils.download_if_needed(
            word_embeddings_folder)

        # Concatenate folder names to create full path to files.
        word_embeddings_file = os.path.join(word_embeddings_folder,
                                            word_embeddings_file)
        word_list_file = os.path.join(word_embeddings_folder, word_list_file)
        mse_dist_file = os.path.join(word_embeddings_folder, mse_dist_file)
        cos_sim_file = os.path.join(word_embeddings_folder, cos_sim_file)

        # Actually load the files from disk.
        self.word_embeddings = np.load(word_embeddings_file)
        self.word_embedding_word2index = np.load(word_list_file,
                                                 allow_pickle=True)
        # Precomputed distance matrices store distances at mat[x][y], where
        # x and y are word IDs and x < y.
        # Each matrix is only read when its threshold is set and the file
        # exists; otherwise an empty dict is used.
        if self.max_mse_dist is not None and os.path.exists(mse_dist_file):
            with open(mse_dist_file, "rb") as f:
                self.mse_dist_mat = pickle.load(f)
        else:
            self.mse_dist_mat = {}
        if self.min_cos_sim is not None and os.path.exists(cos_sim_file):
            with open(cos_sim_file, "rb") as f:
                self.cos_sim_mat = pickle.load(f)
        else:
            self.cos_sim_mat = {}
Esempio n. 12
0
def load_cached_state_dict(model_folder_path):
    """Load the state dict stored in a model folder.

    If ``model_folder_path`` does not exist locally it is resolved through
    ``utils.download_if_needed``. The first file matching ``*model.bin`` in
    the folder is loaded with ``torch.load`` onto ``utils.device``.

    Raises:
        FileNotFoundError: if no file in the folder matches ``*model.bin``.
    """
    folder = model_folder_path
    if not os.path.exists(folder):
        folder = utils.download_if_needed(folder)

    # Use the first match returned by glob.
    candidates = glob.glob(os.path.join(folder, "*model.bin"))
    if candidates:
        return torch.load(candidates[0], map_location=utils.device)
    raise FileNotFoundError(
        f"model.bin not found in model folder {folder}.")
Esempio n. 13
0
    def get_infersent_model(self):
        """Build the pretrained InferSent encoder.

        The version-2 weights and the fastText vector file are resolved
        through ``utils.download_if_needed``; the weights are loaded into a
        new ``InferSentModel`` and ``build_vocab_k_words`` is called with
        ``K=100000``.

        Returns:
            The pretrained InferSent model.
        """
        version = 2
        encoder_config = dict(
            bsize=64,
            word_emb_dim=300,
            enc_lstm_dim=2048,
            pool_type='max',
            dpout_model=0.0,
            version=version,
        )

        # Resolve the weight file inside the (possibly downloaded) folder.
        weights_dir = utils.download_if_needed(InferSent.MODEL_PATH)
        weights_file = os.path.join(weights_dir, f'infersent{version}.pkl')

        model = InferSentModel(encoder_config)
        model.load_state_dict(torch.load(weights_file))

        # Register the word vectors before building the vocabulary.
        vectors_dir = utils.download_if_needed(InferSent.WORD_EMBEDDING_PATH)
        vectors_file = os.path.join(vectors_dir, 'fastText',
                                    'crawl-300d-2M.vec')
        model.set_w2v_path(vectors_file)
        model.build_vocab_k_words(K=100000)
        return model
Esempio n. 14
0
    def __init__(self, max_candidates=-1, **kwargs):
        """Load the pickled candidate bank used for word swaps.

        Args:
            max_candidates (int): stored on the instance as
                ``max_candidates``.
            **kwargs: forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        self.max_candidates = max_candidates

        # Fetch the candidate bank unless it is already cached.
        bank_location = f"{WordSwapHowNet.PATH}/word_candidates_sense.pkl"
        bank_file = utils.download_if_needed(bank_location)

        # Unpickle the bank from the cached file.
        with open(bank_file, "rb") as bank_handle:
            self.candidates_bank = pickle.load(bank_handle)

        # Mapping from two-letter tag prefixes to part-of-speech names.
        self.pos_dict = dict(JJ="adj", NN="noun", RB="adv", VB="verb")
Esempio n. 15
0
    def __init__(self):
        """Set up the language-model helper: paths, vocabulary, graph, cache.

        Resolves the model folder through ``utils.download_if_needed``,
        records the graph/checkpoint/vocabulary paths and fixed sizes on the
        instance, builds the character vocabulary, creates a TensorFlow graph
        and session, loads the model into them, and creates an LRU cache.
        """
        lm_folder = utils.download_if_needed(GoogLMHelper.CACHE_PATH)
        # Paths of the individual model artefacts inside the cached folder.
        self.PBTXT_PATH = os.path.join(lm_folder, "graph-2016-09-10-gpu.pbtxt")
        # Glob-style pattern rather than a single file name.
        self.CKPT_PATH = os.path.join(lm_folder, "ckpt-*")
        self.VOCAB_PATH = os.path.join(lm_folder, "vocab-2016-09-10.txt")

        self.BATCH_SIZE = 1
        self.NUM_TIMESTEPS = 1
        self.MAX_WORD_LEN = 50

        self.vocab = lm_data_utils.CharsVocabulary(self.VOCAB_PATH, self.MAX_WORD_LEN)
        # NOTE(review): the device string "/gpu:1" is hard-coded; presumably
        # this targets the second GPU — confirm it suits the deployment.
        with tf.device("/gpu:1"):
            self.graph = tf.Graph()
            self.sess = tf.compat.v1.Session(graph=self.graph)
        # Load the model with self.graph installed as the default graph.
        with self.graph.as_default():
            self.t = lm_utils.LoadModel(
                self.sess, self.graph, self.PBTXT_PATH, self.CKPT_PATH
            )

        # LRU cache constructed with 2 ** 18 as its size argument.
        self.lm_cache = lru.LRU(2 ** 18)
Esempio n. 16
0
 def _load_classification_text_file(self, text_file_name, offset=0):
     """ Loads tuples from lines of a classification text file. 
     
         Format must look like:
         
             1 this is a great little ...
             0 "i love hot n juicy .  ...
             0 "\""this world needs a ...
         
         The processed examples are stored in ``self.examples`` and
         ``self.i`` is reset to 0.
         
         Arguments:
             text_file_name (str): name of the text file to load from
             offset (int): line to start reading from
     """
     text_file_path = utils.download_if_needed(text_file_name)
     # The context manager closes the handle even when reading raises,
     # so a failure can no longer leak the open file.
     with open(text_file_path, 'r') as text_file:
         raw_lines = text_file.readlines()[offset:]
     # Clean every line first, then process, as two separate passes.
     raw_lines = [self._clean_example(ex) for ex in raw_lines]
     self.examples = [self._process_example_from_file(ex) for ex in raw_lines]
     self.i = 0
Esempio n. 17
0
    def __init__(self, embedding_type='paragramcf', include_unknown_words=True,
        min_cos_sim=None, max_mse_dist=None, cased=False):
        """Load word embeddings and, when needed, precomputed distances.

        Args:
            embedding_type (str): name of the embedding to load; only
                ``'paragramcf'`` is recognised.
            include_unknown_words (bool): stored on the instance.
            min_cos_sim: stored on the instance; when not None, the pickled
                cosine-similarity matrix is loaded if its file exists.
            max_mse_dist: stored on the instance; when not None, the pickled
                MSE-distance matrix is loaded if its file exists.
            cased (bool): stored on the instance.

        Raises:
            ValueError: if ``embedding_type`` is not a known embedding.
        """
        self.include_unknown_words = include_unknown_words
        self.cased = cased
        self.min_cos_sim = min_cos_sim
        self.max_mse_dist = max_mse_dist
        
        self.embedding_type = embedding_type
        if embedding_type == 'paragramcf':
            word_embeddings_folder = 'paragramcf'
            word_embeddings_file = 'paragram.npy'
            word_list_file = 'wordlist.pickle'
            mse_dist_file = 'mse_dist.p'
            cos_sim_file  = 'cos_sim.p'
        else:
            # Interpolate the argument itself; an undefined name here would
            # surface as NameError instead of the intended ValueError.
            raise ValueError(f'Could not find word embedding {embedding_type}')

        # Download embeddings if they're not cached.
        word_embeddings_path = utils.download_if_needed(WordEmbeddingDistance.PATH)
        word_embeddings_folder = os.path.join(word_embeddings_path, word_embeddings_folder)
        
        # Concatenate folder names to create full path to files.
        word_embeddings_file = os.path.join(word_embeddings_folder, word_embeddings_file)
        word_list_file = os.path.join(word_embeddings_folder, word_list_file)
        mse_dist_file = os.path.join(word_embeddings_folder, mse_dist_file)
        cos_sim_file = os.path.join(word_embeddings_folder, cos_sim_file)
        
        # Actually load the files from disk.
        self.word_embeddings = np.load(word_embeddings_file)
        self.word_embedding_word2index = np.load(word_list_file, allow_pickle=True)
        # Precomputed distance matrices store distances at mat[x][y], where
        # x and y are word IDs and x < y.
        # Files are opened in `with` blocks so the handles are always closed.
        if self.max_mse_dist is not None and os.path.exists(mse_dist_file):
            with open(mse_dist_file, 'rb') as f:
                self.mse_dist_mat = pickle.load(f)
        else:
            self.mse_dist_mat = {}
        if self.min_cos_sim is not None and os.path.exists(cos_sim_file):
            with open(cos_sim_file, 'rb') as f:
                self.cos_sim_mat = pickle.load(f)
        else:
            self.cos_sim_mat = {}
Esempio n. 18
0
 def __init__(self):
     """Initialise the model from ``BERTForSNLI.MODEL_PATH``.

     The parent constructor is called with entailment enabled and three
     labels.
     """
     snli_model_path = BERTForSNLI.MODEL_PATH
     # Trigger the download/cache step before the parent constructor runs.
     utils.download_if_needed(snli_model_path)
     super().__init__(snli_model_path, num_labels=3, entailment=True)
Esempio n. 19
0
 def _load_pickle_file(self, file_name, offset=0):
     """Load examples from a pickle file into ``self.examples``.

     Resets ``self._i`` to 0, unpickles the file resolved through
     ``utils.download_if_needed`` and keeps the examples from index
     ``offset`` onwards.

     Arguments:
         file_name: name of the pickle file to load from.
         offset (int): index of the first example to keep.
     """
     self._i = 0
     pickle_path = utils.download_if_needed(file_name)
     with open(pickle_path, "rb") as pickle_handle:
         self.examples = pickle.load(pickle_handle)
     # Drop everything before `offset`.
     self.examples = self.examples[offset:]
Esempio n. 20
0
 def __init__(self):
     """Initialise the layer from the cached ``glove.6B.200d.txt`` file.

     The file is read with ``load_embedding`` and the result is passed to
     the parent constructor as ``embs``.
     """
     glove_dir = utils.download_if_needed(GloveEmbeddingLayer.EMBEDDING_PATH)
     glove_txt = os.path.join(glove_dir, 'glove.6B.200d.txt')
     loaded_embs = load_embedding(glove_txt)
     super().__init__(embs=loaded_embs)
Esempio n. 21
0
def load_cached_state_dict(model_folder_path):
    """Load ``model.bin`` from a model folder onto ``utils.device``.

    The folder is resolved through ``utils.download_if_needed`` and the file
    is read with ``torch.load``.

    Returns:
        The object returned by ``torch.load`` for ``model.bin``.
    """
    local_folder = utils.download_if_needed(model_folder_path)
    weights_file = os.path.join(local_folder, "model.bin")
    return torch.load(weights_file, map_location=utils.device)