Example #1
def no_stopwords():
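    # Embed a few batches of abstracts with InferSent after stripping spaCy
    # stop words, timing the run and writing the embeddings to CSV.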
    infersent2 = InferSent(params_model)
    infersent2.load_state_dict(torch.load(MODEL_PATH))
    infersent2.set_w2v_path(W2V_PATH)
    use_cuda = True
    infersent2 = infersent2.cuda() if use_cuda else infersent2
    pdss = pd.DataFrame(columns=['embds', 'set', 'catg'])
    start = time.time()
    global current_idx
    for x in range(3):
        crix = current_idx
        abss, catg, sets, crix = get_batch_from_dataframe(crix)
        for index in range(len(abss)):
            doc = nlp(abss[index])
            strs_after_stop_arr = []
            for token in doc:
                if not token.is_stop:
                    strs_after_stop_arr.append(token.text)

            abss[index] = ' '.join(strs_after_stop_arr)

        if x == 0:
            infersent2.build_vocab(abss, tokenize=True)
        else:
            infersent2.update_vocab(abss, tokenize=True)

        embed = infersent2.encode(abss, tokenize=True)
        df2 = pd.DataFrame({
            'embds': embed.tolist(),
            'set': sets,
            'catg': catg
        })
        pdss = pd.concat([pdss, df2], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

        current_idx = crix
    elapsed = time.time() - start
    print("Time without stopwords:", elapsed)
    pdss.to_csv("/home/psrivastava/Intern_Summer/data/embeds_no_stopwords.csv")
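The function leans on several module-level names that the excerpt never defines. Here is a minimal setup sketch, assuming the checkpoint layout from the InferSent repository README and spaCy for stop-word detection; every path below, and the get_batch_from_dataframe helper, are assumptions rather than part of the original:

import time

import pandas as pd
import spacy
import torch
from models import InferSent  # models.py from the InferSent repository

MODEL_PATH = 'encoder/infersent1.pkl'        # hypothetical checkpoint path
W2V_PATH = 'GloVe/glove.840B.300d.txt'       # hypothetical GloVe path
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
nlp = spacy.load('en_core_web_sm')           # provides token.is_stop
current_idx = 0                              # batch cursor advanced by the loop

# get_batch_from_dataframe(idx) -> (abstracts, categories, sets, next_idx)
# is defined elsewhere in the original project and is assumed here.

no_stopwords()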
Example #2
class LCPR_I:
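    """Word-complexity regressor: sentence InferSent embeddings, word GloVe
    embeddings, and MRC psycholinguistic features feed a random forest whose
    [0, 1] predictions are mapped onto a 1-5 Likert scale."""
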
    def __init__(self):
        self.filename = "LCP/lcpr_i.sav"
        self.cmudict = cmudict.dict()
        self.wnlp = WonderlicNLP()
        self.embeddings_index = {}
        self.wiki_top10 = [
            word[0].split()[0]
            for word in pd.read_csv("LCP/wiki_top10.csv").values
        ][:10001]
        self.infersent_model_path = 'LCP/infersent%s.pkl' % 1
        self.infersent_model_params = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(self.infersent_model_params)
        self.model = RandomForestRegressor(n_estimators=100)

    # InferSent setup (boilerplate code from InferSent's repository):
    def initialize_infersent(self, sentences):
        print("INITIALIZING INFERSENT...", datetime.now().strftime("%H:%M:%S"))
        self.infersent.load_state_dict(torch.load(self.infersent_model_path))
        w2v_path = 'LCP/glove.42B.300d.txt'
        self.infersent.set_w2v_path(w2v_path)
        self.infersent.build_vocab(sentences, tokenize=True)
        print("INFERSENT READY!", datetime.now().strftime("%H:%M:%S"))

    def infersent_embedding(self, sentence):
        return self.infersent.encode(sentence, tokenize=True)

    # GloVe setup:
    def initialize_glove(self):
        print("INITIALIZING GLOVE...", datetime.now().strftime("%H:%M:%S"))
        f = open('LCP/glove.42B.300d.txt', encoding="utf8")
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()
        print("GLOVE READY!", datetime.now().strftime("%H:%M:%S"))

    def glove_embedding(self, word):
        # Return the word's GloVe vector, or a 300-dim sentinel of -1s
        # for out-of-vocabulary words.
        key = str(word).lower()
        if key in self.embeddings_index:
            return list(self.embeddings_index[key])
        return [-1] * 300

    # Find the index of the target word in the sentence tokens,
    # falling back to a lemmatized comparison.
    def find_word_pos(self, word, tokens):
        lemmatizer = WordNetLemmatizer()
        lemmas = [lemmatizer.lemmatize(token) for token in tokens]
        if word in tokens:
            return tokens.index(word)
        elif word in lemmas:
            return lemmas.index(word)
        else:
            return None

    def extract_features(self, data):
        features = defaultdict(list)
        for idx in tqdm(data.index, desc="PROCESSING DATA"):
            # Missing tokens arrive as NaN; map them to the literal "null".
            raw_token = str(data.loc[idx]["token"])
            if raw_token == "nan":
                raw_token = "null"
            token = raw_token.lower()
            sent = data.loc[idx]["sentence"]
            mrc_features = self.wnlp.get_mrc_features(token)
            glove = self.glove_embedding(token)
            infersent = self.infersent_embedding([sent])[0]

            # Sentence InferSent embedding:
            for i in range(1, 4097):
                features[f"infersent{i}"].append(infersent[i - 1])

            # Word GloVe embedding:
            for i in range(1, 301):
                features[f"glove{i}"].append(glove[i - 1])

            # MRC features:
            features["word_length"].append(mrc_features["Nlet"])
            features["syl_count"].append(mrc_features["Nsyl"])
            features["brown_freq"].append(mrc_features["Brown-freq"])
            features["familiarity"].append(mrc_features["Fam"])
            features["concreteness"].append(mrc_features["Conc"])
            features["imagability"].append(mrc_features["Imag"])
            features["meaningfulness_c"].append(mrc_features["Meanc"])
            features["meaningfulness_p"].append(mrc_features["Meanp"])
            features["age_of_aquisition"].append(mrc_features["AOA"])

            features["wiki_freq"].append(int(token in self.wiki_top10))

        return features

    def fit(self, train_data, train_labels):
        print("TRAINING...", datetime.now().strftime("%H:%M:%S"))
        self.initialize_glove()
        self.initialize_infersent(train_data["sentence"])
        features = self.extract_features(train_data)
        self.model.fit(pd.DataFrame(features), train_labels)
        print("TRAINING DONE!", datetime.now().strftime("%H:%M:%S"))

    def to_likert(self, prediction):
        # Map a [0, 1] regression output onto the 1-5 Likert scale.
        if prediction < 0.2:
            return 1
        elif prediction < 0.4:
            return 2
        elif prediction < 0.6:
            return 3
        elif prediction < 0.8:
            return 4
        else:
            return 5

    def predict(self, test_data, development=False):
        print("LOOKING INTO THE ORB...", datetime.now().strftime("%H:%M:%S"))
        self.infersent.update_vocab(test_data["sentence"], tokenize=True)
        tokens = test_data["token"]
        predictions = self.model.predict(
            pd.DataFrame(self.extract_features(test_data)))
        if not development:
            for i in range(len(predictions)):
                print(
                    f"{tokens[i]} is a {self.to_likert(predictions[i])} on the Likert scale."
                )
        return predictions

    def score(self, train_data, train_labels):
        print("SCORING MODEL...", datetime.now().strftime("%H:%M:%S"))
        return self.model.score(
            pd.DataFrame(self.extract_features(train_data)), train_labels)

    def metrics(self, test_data, test_labels):
        labels_pred = self.predict(test_data, True)
        mae = mean_absolute_error(test_labels, labels_pred)
        rmse = math.sqrt(mean_squared_error(test_labels, labels_pred))
        print("MAE:", mae)
        print("RMSE:", rmse)

    def save(self):
        pickle.dump([self.model, self.embeddings_index, self.infersent],
                    open(self.filename, "wb"))

    def load(self):
        data = pickle.load(open(self.filename, "rb"))
        self.model = data[0]
        self.embeddings_index = data[1]
        self.infersent = data[2]
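A short end-to-end sketch of how the class above would be driven. The file names, separator, and complexity column are assumptions for illustration, not part of the original:

import pandas as pd

# Hypothetical data files with token, sentence, and complexity columns.
train = pd.read_csv("LCP/lcp_train.tsv", sep="\t")
test = pd.read_csv("LCP/lcp_test.tsv", sep="\t")

model = LCPR_I()
model.fit(train, train["complexity"])
model.metrics(test, test["complexity"])  # prints MAE and RMSE
model.save()                             # pickles model, GloVe index, and InferSent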
Example #3
class InferSentEmbeddings(EmbeddingBaseClass, FlairDocumentEmbeddings):
    """
    Class to infer the InferSent embeddings to flair sentences. cf.
    `here <https://github.com/facebookresearch/InferSent>`_
    """
    def __init__(self, version=1):
        super().__init__()

        self.version = version
        if version == 1:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                            'word_embeddings',
                                            'glove.840B.300d',
                                            'glove.840B.300d.txt')
        elif version == 2:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                            'word_embeddings', 'crawl-300d-2M',
                                            'crawl-300d-2M.vec')
        else:
            raise ValueError("Unsupported InferSent version: %s" % version)

        self.MODEL_PATH = os.path.join(NLP_MODELS_PATH, 'pretrained',
                                       'word_embeddings',
                                       'infersent%s' % version,
                                       'infersent%s.pkl' % version)

        # Set up logger
        logging.basicConfig(format='%(asctime)s : %(message)s',
                            level=logging.DEBUG)

        # Load InferSent model
        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version
        }

        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(self.MODEL_PATH))
        self.model.set_w2v_path(self.PATH_TO_W2V)

        # The encoder is a bidirectional LSTM, so the sentence embedding is
        # twice the LSTM hidden size (4096 for the default 2048).
        self._embedding_length: int = 2 * params_model['enc_lstm_dim']

        self.name = f"{self.__class__.__name__}_v{self.version}"
        self.static_embeddings = True

    @property
    def embedding_length(self) -> int:
        return self._embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]):
        # Skip re-encoding when every sentence already carries this embedding.
        everything_embedded: bool = all(
            self.name in sentence._embeddings for sentence in sentences)

        if not everything_embedded:
            infersent_sentences = [
                sentence.to_tokenized_string() for sentence in sentences
            ]

            # build_vocab resets the vocabulary from these sentences, so a
            # follow-up update_vocab on the same sentences would be a no-op.
            self.model.build_vocab(infersent_sentences, tokenize=False)
            embeddings = self.model.encode(infersent_sentences, tokenize=False)

            for sentence, sentence_embedding in zip(sentences, embeddings):
                sentence.set_embedding(self.name,
                                       torch.tensor(sentence_embedding))
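A brief usage sketch, assuming NLP_MODELS_PATH points at the directory layout expected in __init__ and using flair's standard embed entry point; nothing below comes from the original excerpt:

from flair.data import Sentence

embedder = InferSentEmbeddings(version=1)
sentence = Sentence("InferSent produces one vector per sentence .")
embedder.embed([sentence])           # embed() is inherited from flair's base class
print(sentence.embedding.shape)      # expected: torch.Size([4096])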