def _get_embedder(self, model_name):
    """Load the pretrained sentence transformer model and cache it on the instance."""
    log = make_logger(__name__)
    if model_name is None:
        model_name = self.model_name
    log.info(f"Reading in pretrained model {model_name}")
    self.embedder = SentenceTransformer(model_name)
    return self.embedder
def get_cos_sim(
    text_a: List[str],
    text_b: List[str],
    st: ST = None,
    model_name: str = None,
    cache_dir: str = None,
) -> List[float]:
    """Calculate element-wise cosine similarity between two equal-length lists of text.

    Args:
        text_a (List[str]): First list of text to embed.
        text_b (List[str]): Second list of text to embed, compared pairwise with text_a.
        st (ST): Sentence transformer object. If None, a new one is created.
        model_name (str, optional): Model name used when creating a new ST. Defaults to None.
        cache_dir (str, optional): Cache directory for the model. Defaults to None.

    Returns:
        (List[float]): List of normalized similarity scores.

    Example:
        >>> get_cos_sim(text_a=["damaging effect", "positive effect"],
                        text_b=["detrimental effect", "detrimental effect"],
                        )
    """
    log = make_logger(__name__)
    # Sentence transformer:
    if st is None:
        st = ST(model_name=model_name)
    if len(text_a) != len(text_b):
        log.fatal(
            f"text_a {len(text_a)} must be the same len as text_b {len(text_b)}"
        )
    assert len(text_a) == len(text_b)
    log.info("Starting to get embeddings")
    embedded_a = st.encode(text_a)
    embedded_b = st.encode(text_b)
    log.info("Calculating cosine similarity scores")
    # TODO: This can be optimized if we want to do bulk scoring.
    sim_scores = [
        calc_cos_sim(a, embedded_b[i]) for i, a in enumerate(embedded_a)
    ]
    return sim_scores
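# NOTE: get_cos_sim above relies on a calc_cos_sim helper that is not shown in this
# section. A minimal sketch of what such a helper might look like (the name
# _calc_cos_sim_sketch is hypothetical and illustrative only), assuming each argument
# is a 1-D numpy embedding vector and the score is standard cosine similarity:
def _calc_cos_sim_sketch(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    # Cosine similarity: dot product divided by the product of the vector norms.
    return float(
        np.dot(vec_a, vec_b) /
        (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))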
def get_lesk(
        context_list: List[str],
        tar_word_list: List[str]) -> List[nltk.corpus.reader.wordnet.Synset]:
    """Given a list of sentences and a list of target words, use lesk to get the
    synset most closely aligned with each word in its context.

    Args:
        context_list (List[str]): Sentences providing the context for each target word.
        tar_word_list (List[str]): Target words to disambiguate, one per sentence.

    Returns:
        List[synset]: The synset lesk picks for each (context, target word) pair.

    Example:
        >>> get_lesk(context_list=["Steve Jobs founded Apple",
                                   "Apples are more tasty than oranges"],
                     tar_word_list=["apple", "apple"],
                     )
        >>> # Here we see that apple isn't disambiguated through get_lesk().
        >>> get_lesk(context_list=["I went to the bank to deposit my money",
                                   "I will bank my earnings"],
                     tar_word_list=["bank", "bank"],
                     )
        >>> # Here bank does get correctly identified.
    """
    log = make_logger(__name__)
    if len(context_list) != len(tar_word_list):
        log.fatal(
            f"context_list {len(context_list)} must be the same len as "
            f"tar_word_list {len(tar_word_list)}")
    assert len(context_list) == len(tar_word_list)

    new_synsets = []
    # For each sentence and target word, get what lesk thinks is the wordnet synset
    # that the target word represents.
    for i, context in enumerate(context_list):
        sent = context.split()
        tar_word: str = tar_word_list[i]
        new_synsets.append(lesk(sent, tar_word))
    return new_synsets
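# NOTE: get_lesk (and the wn.synsets lookups further down) assume the NLTK WordNet
# corpus has already been downloaded locally. A one-time setup sketch, run outside
# this module, in case it is missing:
#
#     import nltk
#     nltk.download("wordnet")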
def encode(self,
           text: List[str],
           show_progress=True,
           num_workers=8,
           save_file=None):
    """Embed a list of text, optionally reading from / writing to a saved .npy file."""
    log = make_logger(__name__)
    # Read in a presaved embedding if it exists.
    if save_file is not None and os.path.exists(save_file):
        log.info(f"Reading in pre-saved file {save_file}")
        return np.load(save_file)
    # Encoding.
    log.info(
        f"Starting to embed {len(text)} text with {self.model_name}, workers : {num_workers}"
    )
    corpus_embeddings = self.embedder.encode(
        text, show_progress_bar=show_progress, num_workers=num_workers)
    if save_file is not None:
        np.save(save_file, np.array(corpus_embeddings))
    return corpus_embeddings
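# NOTE: get_best_synset_bert below relies on a module-level POS_MAP that is not shown
# in this section. A minimal sketch of what it might contain, assuming it maps WordNet
# pos tags (synset.pos()) to the human-readable strings used for syn_pos/pos
# ("noun", "verb", ...); the exact values here are an assumption:
#
#     POS_MAP = {
#         "n": "noun",       # wn.NOUN
#         "v": "verb",       # wn.VERB
#         "a": "adjective",  # wn.ADJ
#         "s": "adjective",  # wn.ADJ_SAT (satellite adjective)
#         "r": "adverb",     # wn.ADV
#     }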
def get_best_synset_bert(
        context_list: List[str],
        tar_word_list: List[str],
        st: ST,
        pos: List[str]) -> List[nltk.corpus.reader.wordnet.Synset]:
    """Given a list of sentences and a list of target words, use BERT sentence
    embeddings to get the synset most closely aligned with each word in its context.

    Args:
        context_list (List[str]): Sentences providing the context for each target word.
        tar_word_list (List[str]): Target words to disambiguate, one per sentence.
        st (ST): Sentence transformer object.
        pos (List[str]): The pos of each target word.

    Returns:
        (best_synsets, all_def): The best-scoring synset for each (context, target word)
            pair, and a dataframe of all candidate definitions with their scores.

    Example:
        >>> # Load in bert model.
        >>> st = ST()
        >>> get_best_synset_bert(context_list=["Steve Jobs founded Apple",
                                               "Apples are more tasty than oranges"],
                                 tar_word_list=["apple", "apple"],
                                 st=st,
                                 pos=["noun", "noun"])
        >>> # Here we see that apple isn't disambiguated through bert either; this is
        >>> # because apple the company isn't defined in wordnet.
        >>> best_synsets, all_def = get_best_synset_bert(
                context_list=["I went to the bank to deposit my money",
                              "I will bank my earnings"],
                tar_word_list=["bank", "bank"],
                st=st,
                pos=["noun", "verb"])
        >>> # Here lesk actually does better, as the second bank doesn't get correctly matched.
    """
    log = make_logger(__name__)
    if len(context_list) != len(tar_word_list):
        log.fatal(
            f"context_list {len(context_list)} must be the same len as "
            f"tar_word_list {len(tar_word_list)}")
    assert len(context_list) == len(tar_word_list)

    all_def = pd.DataFrame()
    # For each sentence and target word, get what bert thinks is the wordnet synset
    # that the target word represents.
    for i, context in enumerate(context_list):
        tar_word: str = tar_word_list[i]
        tar_pos: str = pos[i]
        # Get all definitions for the target word.
        tmp_def = pd.DataFrame(dict(synsets=wn.synsets(tar_word)))
        # Only keep synsets which have the target pos.
        tmp_def["definition"] = tmp_def["synsets"].apply(
            lambda x: x.definition() if POS_MAP[x.pos()] == tar_pos else None)
        # Get word examples.
        tmp_def["example"] = tmp_def["synsets"].apply(
            lambda x: x.examples()[0]
            if POS_MAP[x.pos()] == tar_pos and len(x.examples()) > 0 else None)
        tmp_def = tmp_def.query("definition.notnull()")
        # Create an index to do a groupby on. This is because get_cos_sim is best run
        # once, vectorized.
        tmp_def["idx"] = i
        tmp_def["context"] = context
        all_def = all_def.append(tmp_def)

    # Compare the word in context with all definitions of that word in wordnet.
    all_def["def_score"] = get_cos_sim(text_a=list(all_def["definition"]),
                                       text_b=list(all_def["context"]),
                                       st=st)
    # Do a comparison with example sentences.
    is_example = all_def["example"].notnull()
    all_def.loc[is_example, "example_score"] = get_cos_sim(
        text_a=list(all_def[is_example]["example"]),
        text_b=list(all_def[is_example]["context"]),
        st=st,
    )
    # Average the definition and example scores; when the example score isn't
    # available, the mean falls back to the definition score alone.
    all_def["score"] = all_def[["def_score", "example_score"]].mean(axis=1)
    best_synsets = list(
        all_def.sort_values(
            "score", ascending=False).groupby("idx").head(1).reset_index(
                drop=True)["synsets"])
    return best_synsets, all_def
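# A small worked example of the score fallback above: pandas mean() skips NaN by
# default, so rows without an example_score fall back to def_score alone
# (illustrative values only):
#
#     >>> pd.DataFrame(dict(def_score=[0.80, 0.60],
#     ...                   example_score=[np.nan, 0.90]))[
#     ...     ["def_score", "example_score"]].mean(axis=1)
#     0    0.80
#     1    0.75
#     dtype: float64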
def get_replace_example(
    news_df: pd.DataFrame,
    word_syn_df: pd.DataFrame,
    seed: int = None,
    sample_size: int = 1,
    bert_model_name: str = None,
    spacy_model=en_core_web_sm,
    num_sentences: int = 1,
    run_lesk_wsd: bool = True,
    run_bert_wsd: bool = True,
) -> List[dict]:
    """For each news article, find sentences where a synonym of a target vocabulary
    word can be replaced with that word, and return the highlighted replacements.

    Args:
        news_df (pd.DataFrame): Dataframe of news articles with an `article` column.
        word_syn_df (pd.DataFrame): [word : str, syn : str, syn_pos : str]
            A dataframe mapping target vocabulary words to synonyms.
        seed (int, optional): Random seed used when sampling news_df. Defaults to None.
        sample_size (int, optional): Number of articles to sample when seed is set.
            Defaults to 1.
        bert_model_name (str, optional): Sentence transformer model name. Defaults to None.
        spacy_model (optional): Spacy model module used to get POS.
            Defaults to en_core_web_sm.
        num_sentences (int, optional): How many sentences of context to include around a
            replacement. Defaults to 1.
        run_lesk_wsd (bool, optional): Filter replacements with lesk word sense
            disambiguation. Defaults to True.
        run_bert_wsd (bool, optional): Filter replacements with BERT word sense
            disambiguation. Defaults to True.

    Returns:
        List[dict]: [{"article" : str, "highlights_df" : pd.DataFrame, "new_article" : str}]

    Example:
        >>> news_df = pd.DataFrame(dict(article=[
                ("This is an example news article which "
                 "shows the damaging effects of being "
                 "unable to learn vocabulary."),
                ("Learning vocabulary by rote memorization has a passing "
                 "effect on long term vocabulary understanding.")
            ]))
        >>> # Get the word to synonym df.
        >>> gre_syn_obj = SynWords(raw_data=gre_df)
        >>> gre_syn = gre_syn_obj.get_synonyms()
        >>> ubi_vocab: List[dict] = get_replace_example(news_df=news_df,
                                                        word_syn_df=gre_syn)
    """
    log = make_logger(__name__)
    if seed is not None:
        tmp = news_df.sample(sample_size, random_state=seed)
        log.info(f"sampling down news_df to {tmp.shape} using seed {seed}")
    else:
        tmp = news_df

    # Load Spacy and BERT models.
    log.info("Loading in spacy and sentence transformer models.")
    # For some reason finding the model by string doesn't work well on binder.
    spcy = spacy_model.load()
    st = ST(model_name=bert_model_name)

    # For each news article,
    # highlight sentences within the article that have replaced words.
    rv: List[dict] = []
    log.info("Getting replaced sentences for each news article.")
    for row in tmp.itertuples():
        article: str = row.article
        # Get one article's worth of replaced sentences.
        highlights: pd.DataFrame = _get_sents_containing_word(
            all_text=article,
            word_syn_df=word_syn_df,
            num_sentences=num_sentences,
            st=st,
            spcy=spcy,
        )
        # Add the synset to results.
        highlights = highlights.merge(word_syn_df[["word", "syn", "synset"]],
                                      on=["word", "syn"],
                                      how="left")
        # Replacements should only occur when the pos is the same.
        good_replace_query = ["syn_pos_context == syn_pos"]

        if run_lesk_wsd:
            # Word sense disambiguation: compare how a word is used in a sentence with
            # the intended usage of the word (utilizing lesk/wordnet).
            log.info("Adding LESK results")
            highlights["lesk"] = get_lesk(context_list=highlights["mod_text"],
                                          tar_word_list=highlights["word"])
            # Only make replacements when the synset is the same as the lesk result.
            good_replace_query.append("synset == lesk")

        if run_bert_wsd:
            # Compare the word definition to the sentence containing the original word.
            # Return the best synset based on cosine similarity to the target word
            # definition and example sentences.
            best_synsets, _ = get_best_synset_bert(
                context_list=highlights["mod_text"],
                tar_word_list=highlights["word"],
                st=st,
                pos=highlights["syn_pos"],
            )
            highlights["bert_wsd"] = best_synsets
            # Note: the best results don't use the synset == bert_wsd comparison.
            good_replace_query.append("synset == bert_wsd")
        # TODO: want to compare to the average of all sentence use cases in SemCor.
        # Instead, rank the similarity with each synset dictionary definition
        # and choose the highest score.
        to_replace: pd.DataFrame = highlights.copy()
        for q in good_replace_query:
            to_replace = to_replace.query(q)
        to_replace = to_replace[["word", "syn"]]

        log.info(f"Replacing the entire article {to_replace.shape}")
        # \\b is necessary to match full words.
        # Too lazy to use re.sub, just use pandas for replacement.
        new_article: str = pd.Series(article).replace(
            {f"\\b{t.syn}\\b": t.word for t in to_replace.itertuples()},
            regex=True)[0]
        rv.append(
            dict(article=article,
                 highlights_df=highlights,
                 new_article=new_article))
    # pd.set_option('display.max_colwidth', None)
    # See how well scores perform here.
    # bart_rv = px.histogram(orig_highlights["sim_score"])
    return rv
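# A minimal, hedged usage sketch of get_replace_example with a hand-built word_syn_df
# (the docstring above builds it from SynWords/gre_df instead). Based on the code above,
# word_syn_df needs at least the columns word, syn, syn_pos and synset; the values below
# are illustrative assumptions, not real data:
#
#     >>> news_df = pd.DataFrame(dict(article=[
#     ...     "The storm had a damaging effect on the harvest."]))
#     >>> word_syn_df = pd.DataFrame(dict(
#     ...     word=["deleterious"],
#     ...     syn=["damaging"],
#     ...     syn_pos=["adjective"],
#     ...     synset=[wn.synsets("damaging")[0]],
#     ... ))
#     >>> results = get_replace_example(news_df=news_df, word_syn_df=word_syn_df,
#     ...                               run_lesk_wsd=False, run_bert_wsd=False)
#     >>> results[0]["new_article"]  # "damaging" replaced with "deleterious" when the pos matches.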
def _get_sents_containing_word(
    all_text: str,
    word_syn_df: pd.DataFrame,
    num_sentences: int = 1,
    st: ST = None,
    spcy=None,
) -> pd.DataFrame:
    """For a given news article/text, get (num_sentences) surrounding sentences around
    all synonyms listed in word_syn_df. Create a modified sentence which contains the
    original sentence(s) with the synonym of a target vocabulary word replaced with
    that vocabulary word.

    - Use Spacy to get POS values for the target word in the sentence in all_text.
    - Use BERT from the sentence transformer module (st) to calculate a similarity
      score between the original sentence(s) and the modified sentence.

    Args:
        all_text (str): A news article.
        word_syn_df (pd.DataFrame): [word : str, syn : str, syn_pos : str]
            A dataframe mapping target word to synonym.
        num_sentences (int, optional): How many sentences before/after the sentence
            containing a target word to include when highlighting a replacement.
            Defaults to 1.
        st (ST, optional): Sentence transformer object defined in transformer.
            Defaults to None.
        spcy ([type], optional): Spacy model used to get POS. Defaults to None.

    Returns:
        pd.DataFrame:
            word : str = target vocabulary word,
            syn : str = synonym of vocabulary word found in original text,
            orig_text : str = num_sentences surrounding the syn in all_text,
            mod_text : str = orig_text with the syn replaced by word,
            sim_score : float = cosine similarity score comparing vectorized orig_text
                and mod_text,
            syn_pos_context : str = pos of the syn in orig_text,
            syn_pos : str = pos of the intended syn. If this doesn't match
                syn_pos_context, this would be a bad replacement.
    """
    log = make_logger(__name__)
    if st is None:
        # st = sentence transformer object; fall back to the default model.
        st = ST()

    # word_syn_df contains a word and syn column.
    sentences: List[str] = all_text.split(".")
    # If we want more than one sentence surrounding the word,
    # grab num_sentences//2 before and after the word.
    more_sent = num_sentences // 2
    total_sent = len(sentences) - 1
    all_replaced_context = []

    # TODO: this can be much more efficient.
    (all_orig_text, all_mod_text, all_words, all_syn, all_context_pos,
     all_syn_pos) = (
         [],
         [],
         [],
         [],
         [],
         [],
     )
    # For each synonym, go through each sentence to see if it can be replaced.
    for tup in word_syn_df.itertuples():
        word: str = tup.word
        syn: str = tup.syn
        syn_pos: str = tup.syn_pos
        # A syn can occur in multiple places within an article.
        curr_orig_text, curr_mod_text, curr_pos = [], [], []
        for sent_id, sent in enumerate(sentences):
            # Fill curr_orig_text and curr_mod_text with neighboring sentences.
            is_replaced = _add_replaced_surrounding_sents(
                syn=syn,
                word=word,
                sentences=sentences,
                sent_id=sent_id,
                more_sent=more_sent,
                total_sent=total_sent,
                curr_orig_text=curr_orig_text,
                curr_mod_text=curr_mod_text,
            )
            # Extract the pos of the target word in the given sentence.
            if is_replaced:
                curr_pos.append(
                    _get_word_in_sent_pos(context=sent, tar_word=syn,
                                          spcy=spcy))
        # There was a match, so add a replaced highlight.
        if len(curr_orig_text) > 0:
            # Append to the master lists.
            all_orig_text += curr_orig_text
            all_mod_text += curr_mod_text
            all_words += [word] * len(curr_orig_text)
            all_syn += [syn] * len(curr_orig_text)
            all_syn_pos += [syn_pos] * len(curr_orig_text)
            all_context_pos += curr_pos
            # TODO: consider removing.
            # all_replaced_context.append(ReplacedContext(vocab = word,
            #                                             syn = syn,
            #                                             orig_text = curr_orig_text,
            #                                             mod_text = curr_mod_text
            #                                             )
            #                             )

    # Calc cosine similarity - should be run in a more vectorized fashion.
    sim_scores = get_cos_sim(text_a=all_orig_text, text_b=all_mod_text, st=st)
    log.info(
        f"word: {len(all_words)}, syn : {len(all_syn)}, "
        f"orig_text : {len(all_orig_text)}, mod_text : {len(all_mod_text)}, "
        f"sim_score : {len(sim_scores)}")
    # TODO: either store replaced context as a list of ReplacedContext objects
    # or just as a dataframe.
    article_df = pd.DataFrame(
        dict(
            word=all_words,
            syn=all_syn,
            orig_text=all_orig_text,
            mod_text=all_mod_text,
            sim_score=sim_scores,
            syn_pos_context=all_context_pos,
            syn_pos=all_syn_pos,
        ))
    return article_df
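# NOTE: _get_sents_containing_word relies on two helpers that are not shown in this
# section: _add_replaced_surrounding_sents and _get_word_in_sent_pos. A minimal sketch
# of what the POS helper might look like, assuming spcy is a loaded spaCy pipeline and
# the returned tag is compared against syn_pos ("noun", "verb", ...); this body is an
# assumption, not the actual implementation:
#
#     def _get_word_in_sent_pos(context: str, tar_word: str, spcy) -> str:
#         for token in spcy(context):
#             if token.text.lower() == tar_word.lower():
#                 return token.pos_.lower()  # spaCy coarse tag, e.g. "noun", "verb".
#         return None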