def no_stopwords():
    # Encode abstracts with InferSent after stripping spaCy stop words,
    # then persist the embeddings batch by batch.
    infersent2 = InferSent(params_model)
    infersent2.load_state_dict(torch.load(MODEL_PATH))
    infersent2.set_w2v_path(W2V_PATH)
    use_cuda = True
    infersent2 = infersent2.cuda() if use_cuda else infersent2  # move encoder to GPU
    pdss = pd.DataFrame(columns=['embds', 'set', 'catg'])
    start = time.time()
    global current_idx
    for x in range(3):
        crix = current_idx
        abss, catg, sets, crix = get_batch_from_dataframe(crix)
        # Drop stop words from each abstract.
        for index in range(len(abss)):
            doc = nlp(abss[index])
            strs_after_stop_arr = [token.text for token in doc if not token.is_stop]
            abss[index] = ' '.join(strs_after_stop_arr)
        # Build the vocabulary on the first batch, extend it afterwards.
        if x == 0:
            infersent2.build_vocab(abss, tokenize=True)
        else:
            infersent2.update_vocab(abss, tokenize=True)
        embed = infersent2.encode(abss, tokenize=True)
        df2 = pd.DataFrame({'embds': embed.tolist(), 'set': sets, 'catg': catg})
        pdss = pd.concat([pdss, df2], ignore_index=True)  # append this batch's rows
        current_idx = crix
    end = time.time() - start
    print("Time without stopwords", end)
    pdss.to_csv("/home/psrivastava/Intern_Summer/data/embeds_no_stopwords.csv")
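# --- Hypothetical module-level setup assumed by no_stopwords() ---
# A minimal sketch of the globals the function relies on: the two paths and
# the spaCy model name are placeholders, params_model mirrors the InferSent
# repository's documented encoder settings, and get_batch_from_dataframe is
# defined elsewhere in the original script.
import time

import pandas as pd
import spacy
import torch
from models import InferSent  # models.py from the InferSent repository

MODEL_PATH = "encoder/infersent1.pkl"   # placeholder path to the pretrained encoder
W2V_PATH = "GloVe/glove.840B.300d.txt"  # placeholder path to the word vectors
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
nlp = spacy.load("en_core_web_sm")  # spaCy pipeline used for stop-word tagging
current_idx = 0                     # batch cursor advanced between calls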
import math
import pickle
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from nltk.corpus import cmudict
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm

from models import InferSent            # models.py from the InferSent repository
from wonderlic_nlp import WonderlicNLP  # assumed import path for WonderlicNLP


class LCPR_I:
    def __init__(self):
        self.filename = "LCP/lcpr_i.sav"
        self.cmudict = cmudict.dict()
        self.wnlp = WonderlicNLP()
        self.embeddings_index = {}
        self.wiki_top10 = [word[0].split()[0]
                           for word in pd.read_csv("LCP/wiki_top10.csv").values][:10001]
        self.infersent_model_path = 'LCP/infersent%s.pkl' % 1
        self.infersent_model_params = {'bsize': 64, 'word_emb_dim': 300,
                                       'enc_lstm_dim': 2048, 'pool_type': 'max',
                                       'dpout_model': 0.0, 'version': 1}
        self.infersent = InferSent(self.infersent_model_params)
        self.model = RandomForestRegressor(n_estimators=100)

    # InferSent setup (boilerplate code from InferSent's repository):
    def initialize_infersent(self, sentences):
        print("INITIALIZING INFERSENT...", datetime.now().strftime("%H:%M:%S"))
        self.infersent.load_state_dict(torch.load(self.infersent_model_path))
        w2v_path = 'LCP/glove.42B.300d.txt'
        self.infersent.set_w2v_path(w2v_path)
        self.infersent.build_vocab(sentences, tokenize=True)
        print("INFERSENT READY!", datetime.now().strftime("%H:%M:%S"))

    def infersent_embedding(self, sentence):
        return self.infersent.encode(sentence, tokenize=True)

    # GloVe setup:
    def initialize_glove(self):
        print("INITIALIZING GLOVE...", datetime.now().strftime("%H:%M:%S"))
        with open('LCP/glove.42B.300d.txt', encoding="utf8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = coefs
        print("GLOVE READY!", datetime.now().strftime("%H:%M:%S"))

    def glove_embedding(self, word):
        # Fall back to a constant vector for out-of-vocabulary words.
        if str(word).lower() in self.embeddings_index:
            return list(self.embeddings_index[str(word).lower()])
        return [-1 for _ in range(300)]

    # Used to find the index of the word in the sentence:
    def find_word_pos(self, word, tokens):
        lemmatizer = WordNetLemmatizer()
        search_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        if word in tokens:
            return tokens.index(word)
        elif word in search_tokens:
            return search_tokens.index(word)
        else:
            return None

    def extract_features(self, data):
        features = defaultdict(list)
        for id in tqdm(data.index, desc="PROCESSING DATA"):
            raw_token = "null" if str(data.loc[id]["token"]) == "nan" else str(data.loc[id]["token"])
            token = raw_token.lower()
            sent = data.loc[id]["sentence"]
            mrc_features = self.wnlp.get_mrc_features(token)
            glove = self.glove_embedding(token)
            infersent = self.infersent_embedding([sent])[0]
            # Sentence InferSent embedding (4096 dimensions):
            for i in range(1, 4097):
                features[f"infersent{i}"].append(infersent[i - 1])
            # Word GloVe embedding (300 dimensions):
            for i in range(1, 301):
                features[f"glove{i}"].append(glove[i - 1])
            # MRC psycholinguistic features:
            features["word_length"].append(mrc_features["Nlet"])
            features["syl_count"].append(mrc_features["Nsyl"])
            features["brown_freq"].append(mrc_features["Brown-freq"])
            features["familiarity"].append(mrc_features["Fam"])
            features["concreteness"].append(mrc_features["Conc"])
            features["imageability"].append(mrc_features["Imag"])
            features["meaningfulness_c"].append(mrc_features["Meanc"])
            features["meaningfulness_p"].append(mrc_features["Meanp"])
            features["age_of_acquisition"].append(mrc_features["AOA"])
            features["wiki_freq"].append(int(token in self.wiki_top10))
        return features

    def fit(self, train_data, train_labels):
        print("TRAINING...", datetime.now().strftime("%H:%M:%S"))
        self.initialize_glove()
        self.initialize_infersent(train_data["sentence"])
        features = self.extract_features(train_data)
        self.model.fit(pd.DataFrame(features), train_labels)
        print("TRAINING DONE!", datetime.now().strftime("%H:%M:%S"))
    # Map a [0, 1] complexity prediction onto the 1-5 Likert scale:
    def to_likert(self, prediction):
        if 0 <= prediction < 0.2:
            return 1
        elif 0.2 <= prediction < 0.4:
            return 2
        elif 0.4 <= prediction < 0.6:
            return 3
        elif 0.6 <= prediction < 0.8:
            return 4
        else:
            return 5

    def predict(self, test_data, development=False):
        print("LOOKING INTO THE ORB...", datetime.now().strftime("%H:%M:%S"))
        # Extend the InferSent vocabulary with words from the test sentences.
        self.infersent.update_vocab(test_data["sentence"], tokenize=True)
        tokens = test_data["token"].tolist()
        predictions = self.model.predict(pd.DataFrame(self.extract_features(test_data)))
        if not development:
            for i in range(len(predictions)):
                print(f"{tokens[i]} is a {self.to_likert(predictions[i])} on the Likert scale.")
        return predictions

    def score(self, train_data, train_labels):
        print("SCORING MODEL...", datetime.now().strftime("%H:%M:%S"))
        return self.model.score(pd.DataFrame(self.extract_features(train_data)), train_labels)

    def metrics(self, test_data, test_labels):
        labels_pred = self.predict(test_data, True)
        mae = mean_absolute_error(test_labels, labels_pred)
        rmse = math.sqrt(mean_squared_error(test_labels, labels_pred))
        print("MAE:", mae)
        print("RMSE:", rmse)

    def save(self):
        pickle.dump([self.model, self.embeddings_index, self.infersent],
                    open(self.filename, "wb"))

    def load(self):
        data = pickle.load(open(self.filename, "rb"))
        self.model = data[0]
        self.embeddings_index = data[1]
        self.infersent = data[2]
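# --- Hypothetical usage sketch for LCPR_I ---
# Illustrative only: the CSV file names and the "complexity" label column
# are assumptions; any DataFrame with "token" and "sentence" columns plus
# labels in [0, 1] fits the interface above.
train = pd.read_csv("LCP/lcp_train.csv")  # placeholder file
test = pd.read_csv("LCP/lcp_test.csv")    # placeholder file

lcpr = LCPR_I()
lcpr.fit(train, train["complexity"])
lcpr.metrics(test, test["complexity"])  # prints MAE and RMSE via predict()
lcpr.save()                             # persists model, GloVe index, and InferSent to LCP/lcpr_i.sav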
class InferSentEmbeddings(EmbeddingBaseClass, FlairDocumentEmbeddings):
    """
    Class to add InferSent embeddings to flair sentences.

    cf. `here <https://github.com/facebookresearch/InferSent>`_
    """

    def __init__(self, version=1):
        super().__init__()
        self.version = version
        if version == 1:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained', 'word_embeddings',
                                            'glove.840B.300d', 'glove.840B.300d.txt')
        if version == 2:
            self.PATH_TO_W2V = os.path.join(NLP_MODELS_PATH, 'pretrained', 'word_embeddings',
                                            'crawl-300d-2M', 'crawl-300d-2M.vec')
        self.MODEL_PATH = os.path.join(NLP_MODELS_PATH, 'pretrained', 'word_embeddings',
                                       'infersent%s' % version, 'infersent%s.pkl' % version)

        # Set up logger
        logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

        # Load InferSent model
        params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': version}
        self.model = InferSent(params_model)
        self.model.load_state_dict(torch.load(self.MODEL_PATH))
        self.model.set_w2v_path(self.PATH_TO_W2V)

        # The BiLSTM max-pooling encoder concatenates both directions,
        # so the sentence embedding has 2 * enc_lstm_dim = 4096 dimensions.
        self._embedding_length: int = 2 * params_model['enc_lstm_dim']
        self.name = f"{self.__class__.__name__}_v{self.version}"
        self.static_embeddings = True

    @property
    def embedding_length(self) -> int:
        return self._embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]):
        # Only re-encode when at least one sentence lacks this embedding.
        everything_embedded: bool = True
        infersent_sentences = []
        for sentence in sentences:
            if self.name not in sentence._embeddings.keys():
                everything_embedded = False
        if not everything_embedded:
            for sentence in sentences:
                infersent_sentences.append(sentence.to_tokenized_string())
            # Build the vocabulary for the current batch of sentences.
            self.model.build_vocab(infersent_sentences, tokenize=False)
            embeddings = self.model.encode(infersent_sentences, tokenize=False)
            for sentence, sentence_embedding in zip(sentences, embeddings):
                sentence.set_embedding(self.name, torch.tensor(sentence_embedding))
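# --- Hypothetical usage sketch for InferSentEmbeddings ---
# Illustrative only: assumes the pretrained GloVe and InferSent files exist
# under NLP_MODELS_PATH as laid out in __init__, and that flair is installed.
from flair.data import Sentence

embedder = InferSentEmbeddings(version=1)
sentence = Sentence("The quick brown fox jumps over the lazy dog .")
embedder.embed(sentence)  # flair calls _add_embeddings_internal under the hood

print(sentence.get_embedding().shape)  # expected: torch.Size([4096])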