Example #1
    def _get_embedder(self, model_name):
        log = make_logger(__name__)
        # Fall back to the instance's default model when no name is given.
        if model_name is None:
            model_name = self.model_name

        log.info(f"Reading in pretrained model {model_name}")
        self.embedder = SentenceTransformer(model_name)
        return self.embedder
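
For reference, the call this method wraps maps directly onto the sentence-transformers API. A minimal standalone sketch, where "all-MiniLM-L6-v2" is only an illustrative model name rather than the project's default:

# Minimal sketch of the underlying sentence-transformers call.
# "all-MiniLM-L6-v2" is an illustrative model name, not necessarily the project default.
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")
vectors = embedder.encode(["a short example sentence"])
print(vectors.shape)  # (1, embedding dimension)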
Example #2
def get_cos_sim(
    text_a: List[str],
    text_b: List[str],
    st: ST = None,
    model_name: str = None,
    cache_dir: str = None,
) -> List[float]:
    """[summary]

    Args:
        text_a (List[str]): [description]
        text_b (List[str]): [description]
        st (ST) : sentence transformer object.
        model_name (str, optional): [description]. Defaults to None.
        cache_dir (str, optional): [description]. Defaults to None.

    Returns:
        (List[float]): List of normalized similarity scores.
    
    Example:
    >>> get_cos_sim(text_a = ["damaging effect", "positive effect"],
                    text_b = ["detrimental effect", "detrimental effect"],
                    )

    """
    log = make_logger(__name__)
    # sentence transformer:
    if st is None:
        st = ST(model_name=model_name)

    if len(text_a) != len(text_b):
        log.fatal(
            f"text_a {len(text_a)} must be the same len as text_b {len(text_b)}"
        )
        assert len(text_a) == len(text_b)

    log.info(f"Starting to get embedding")
    embedded_a = st.encode(text_a)
    embedded_b = st.encode(text_b)

    log.info(f"Calculating cosine similarity scores")
    # TODO: This can be optimized if we want to do bulk scoring.
    sim_scores = [
        calc_cos_sim(a, embedded_b[i]) for i, a in enumerate(embedded_a)
    ]
    return sim_scores
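
The helper calc_cos_sim is not shown in this listing. A minimal stand-in that produces the same kind of score with numpy, assuming the embeddings are plain 1-D vectors (model name is illustrative):

# Hypothetical stand-in for calc_cos_sim: cosine similarity of two 1-D vectors.
import numpy as np
from sentence_transformers import SentenceTransformer

def calc_cos_sim(a: np.ndarray, b: np.ndarray) -> float:
    # Normalized dot product; assumes non-zero vectors.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

model = SentenceTransformer("all-MiniLM-L6-v2")
emb = model.encode(["damaging effect", "detrimental effect"])
print(calc_cos_sim(emb[0], emb[1]))  # near-synonymous phrases score high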
Example #3
def get_lesk(
        context_list: List[str],
        tar_word_list: List[str]) -> List[nltk.corpus.reader.wordnet.Synset]:
    """Given a list of sentences, and a list of target words,
     use lesk to get the synset most closely aligned with the word in that context.

    Args:
        context_list (List[str]): Sentences providing the context for each target word.
        tar_word_list (List[str]): Target words to disambiguate, one per context sentence.

    Returns:
        List[synset]: The synset lesk selects for each (context, target word) pair.

    Example:
    >>> get_lesk(context_list = ["Steve Jobs founded Apple", "Apples are more tasty than oranges"],
                    tar_word_list = ["apple", "apple"],
                    )
    >>> # Here we see that apple isn't disambiguated through get_lesk()
    >>> get_lesk(context_list = ["I went to the bank to deposit my money", "I will bank my earnings"],
                    tar_word_list = ["bank", "bank"],
                    )
    >>> # Here bank does get correctly identified.
    """
    log = make_logger(__name__)

    if len(context_list) != len(tar_word_list):
        log.fatal(
            f"context_list {len(context_list)} must be the same len as tar_word_list {len(tar_word_list)}"
        )
        assert len(context_list) == len(tar_word_list)

    new_synsets = []
    # For each sentence and target word, get what lesk thinks is the wordnet synset
    # that the target word represents.
    for i, context in enumerate(context_list):
        sent = context.split()
        tar_word: str = tar_word_list[i]
        new_synsets.append(lesk(sent, tar_word))

    return new_synsets
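
Each loop iteration boils down to a single NLTK call. A standalone sketch mirroring the docstring's "bank" example (assumes the WordNet corpus has been downloaded):

# Direct NLTK lesk call behind get_lesk.
# Requires the WordNet corpus, e.g. nltk.download("wordnet").
from nltk.wsd import lesk

context = "I went to the bank to deposit my money".split()
print(lesk(context, "bank"))  # the Synset lesk picks for "bank" in this context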
Example #4
    def encode(self,
               text: List[str],
               show_progress=True,
               num_workers=8,
               save_file=None):
        log = make_logger(__name__)
        # Read in presaved embedding
        if save_file is not None and os.path.exists(save_file):
            log.info(f"Reading in pre-saved file {save_file}")
            return np.load(save_file)

        # Encoding
        log.info(
            f"Starting to embed {len(text)} texts with {self.model_name}, workers: {num_workers}"
        )
        corpus_embeddings = self.embedder.encode(
            text, show_progress_bar=show_progress, num_workers=num_workers)

        if save_file is not None:
            np.save(save_file, np.array(corpus_embeddings))

        return corpus_embeddings
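
Stripped of the class wrapper, the caching pattern above is np.save/np.load around SentenceTransformer.encode. A minimal sketch; the file path and model name are illustrative:

# Encode-with-cache pattern (file path and model name are illustrative).
import os
import numpy as np
from sentence_transformers import SentenceTransformer

def encode_cached(texts, save_file="embeddings.npy", model_name="all-MiniLM-L6-v2"):
    if save_file is not None and os.path.exists(save_file):
        return np.load(save_file)  # reuse the pre-saved embeddings
    embeddings = SentenceTransformer(model_name).encode(texts, show_progress_bar=True)
    if save_file is not None:
        np.save(save_file, np.array(embeddings))
    return embeddings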
Example #5
def get_best_synset_bert(
        context_list: List[str], tar_word_list: List[str], st: ST,
        pos: List[str]) -> Tuple[List[nltk.corpus.reader.wordnet.Synset], pd.DataFrame]:
    """Given a list of sentences and a list of target words,
    use BERT sentence embeddings to get the synset most closely aligned
    with each word in its context.

    Args:
        context_list (List[str]): Sentences providing the context for each target word.
        tar_word_list (List[str]): Target words to disambiguate, one per context sentence.
        st (ST): Sentence transformer object.
        pos (List[str]): The part of speech of each target word.

    Returns:
        Tuple[List[synset], pd.DataFrame]: The best synset for each (context, word) pair,
        plus a DataFrame of all candidate definitions, examples, and similarity scores.

    Example:
    >>> # Load in bert model
    >>> st = ST()
    >>> get_best_synset_bert(context_list = ["Steve Jobs founded Apple", "Apples are more tasty than oranges"],
                    tar_word_list = ["apple", "apple"],
                    st = st,
                    pos = ["noun", "noun"]
                    )
    >>> # Here we see that apple isn't disambiguated through bert either; however, this is
    >>> # because apple the company isn't defined in wordnet.

    >>> best_synsets, all_def = get_best_synset_bert(context_list = ["I went to the bank to deposit my money", "I will bank my earnings"],
                    tar_word_list = ["bank", "bank"],
                    st = st,
                    pos = ["noun", "verb"]
                    )
    >>> # Here lesk actually does better as the second bank doesn't get correctly matched.
    """
    log = make_logger(__name__)

    if len(context_list) != len(tar_word_list):
        log.fatal(
            f"context_list {len(context_list)} must be the same len as tar_word_list {len(tar_word_list)}"
        )
        assert len(context_list) == len(tar_word_list)

    all_def = pd.DataFrame()
    # For each sentence and target word, get what bert thinks is the wordnet synset
    # that the target word represents.
    for i, context in enumerate(context_list):
        tar_word: str = tar_word_list[i]
        tar_pos: str = pos[i]
        # Get all definitions for the target word.
        tmp_def = pd.DataFrame(dict(synsets=wn.synsets(tar_word)))
        # Only keep synsets which have the target pos.
        tmp_def["definition"] = tmp_def["synsets"].apply(
            lambda x: x.definition() if POS_MAP[x.pos()] == tar_pos else None)
        # Get word examples.
        tmp_def["example"] = tmp_def["synsets"].apply(
            lambda x: x.examples()[0]
            if POS_MAP[x.pos()] == tar_pos and len(x.examples()) > 0 else None)
        tmp_def = tmp_def.query("definition.notnull()")
        # Create an index to do a groupby on. This is because get_cos_sim is best run once vectorized.
        tmp_def["idx"] = i
        tmp_def["context"] = context
        all_def = pd.concat([all_def, tmp_def])

    # Compare the word in context with all definitions of that word in wordnet.
    # Reuse the passed-in sentence transformer so the model isn't loaded twice.
    all_def["def_score"] = get_cos_sim(text_a=list(all_def["definition"]),
                                       text_b=list(all_def["context"]),
                                       st=st)

    # Do a comparison with example sentences.
    is_example = all_def["example"].notnull()
    all_def.loc[is_example, "example_score"] = get_cos_sim(
        text_a=list(all_def[is_example]["example"]),
        text_b=list(all_def[is_example]["context"]),
        st=st,
    )

    # Average the definition and example scores; when the example score is
    # missing, the row-wise mean falls back to the definition score alone.
    all_def["score"] = all_def[["def_score", "example_score"]].mean(axis=1)

    best_synsets = list(
        all_def.sort_values(
            "score", ascending=False).groupby("idx").head(1).reset_index(
                drop=True)["synsets"])

    return best_synsets, all_def
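
Stripped of the pandas bookkeeping, the core idea is to embed the context and every WordNet definition of the target word, then keep the definition with the highest cosine similarity. A minimal sketch for a single word (model name illustrative, noun senses only):

# BERT-based WSD sketch: pick the synset whose definition best matches the context.
import numpy as np
from nltk.corpus import wordnet as wn
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
context = "I went to the bank to deposit my money"
candidates = [s for s in wn.synsets("bank") if s.pos() == "n"]

# Embed the context once and every candidate definition, then score by cosine similarity.
ctx = model.encode([context])[0]
defs = model.encode([s.definition() for s in candidates])
scores = defs @ ctx / (np.linalg.norm(defs, axis=1) * np.linalg.norm(ctx))
best = candidates[int(np.argmax(scores))]
print(best, best.definition())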
Example #6
def get_replace_example(
    news_df: pd.DataFrame,
    word_syn_df: pd.DataFrame,
    seed: int = None,
    sample_size: int = 1,
    bert_model_name: str = None,
    spacy_model=en_core_web_sm,
    num_sentences: int = 1,
    run_lesk_wsd: bool = True,
    run_bert_wsd: bool = True,
) -> List[dict]:
    """For each news article, replace synonyms of target vocabulary words and
    highlight the sentences in which replacements were made.

    Args:
        news_df (pd.DataFrame): News articles with an `article` column.
        word_syn_df (pd.DataFrame): [word : str, syn : str, syn_pos: str]
        seed (int, optional): Random seed used when sampling news_df. Defaults to None.
        sample_size (int, optional): Number of articles to sample when seed is set. Defaults to 1.
        bert_model_name (str, optional): Sentence transformer model name. Defaults to None.
        spacy_model (module, optional): spaCy model used for POS tagging. Defaults to en_core_web_sm.
        num_sentences (int, optional): Sentences surrounding a target word to include. Defaults to 1.
        run_lesk_wsd (bool, optional): Filter replacements with lesk word sense disambiguation. Defaults to True.
        run_bert_wsd (bool, optional): Filter replacements with BERT word sense disambiguation. Defaults to True.

    Returns:
        List[dict]: [{"article" : str, "highlights_df" : pd.DataFrame, "new_article" : str}]

    Example:
    >>> news_df = pd.DataFrame(dict(article = [(f"This is an example news article which " 
                                        f"shows the damaging effects of being " 
                                        f"unable to learn vocabulary."),
                                        (f"Learning vocabulary by rote memorization has a passing "
                                        f"effect on long term vocabulary understanding.")
                                        ]
                            ))
    # Get the word to synonym df.
    >>> gre_syn_obj = SynWords(raw_data=gre_df)
    >>> gre_syn = gre_syn_obj.get_synonyms()

    >>> ubi_vocab : List[dict] = get_replace_example(news_df = news_df,
                            word_syn_df = gre_syn)
    """
    log = make_logger(__name__)

    if seed is not None:
        tmp = news_df.sample(sample_size, random_state=seed)
        log.info(f"sampling down news_df to {tmp.shape} using seed {seed}")
    else:
        tmp = news_df

    # Load Spacy and BERT models.
    log.info(f"Loading in spacy and sentence transformer models.")

    # For some reason finding the model by string doesn't work well on binder.

    spcy = spacy_model.load()

    st = ST(model_name=bert_model_name)

    # For each news article,
    # Highlight sentences within an article that have replaced words.
    rv: List[dict] = []
    log.info(f"Getting replaced sentences for each news article.")
    for row in tmp.itertuples():
        article: str = row.article

        # Get one article's worth of replaced sentences.
        highlights: pd.DataFrame = _get_sents_containing_word(
            all_text=article,
            word_syn_df=word_syn_df,
            num_sentences=num_sentences,
            st=st,
            spcy=spcy,
        )

        # Add the synset to results.
        highlights = highlights.merge(word_syn_df[["word", "syn", "synset"]],
                                      on=["word", "syn"],
                                      how="left")

        # Replacements should only occur when pos is the same.
        good_replace_query = ["syn_pos_context == syn_pos"]

        if run_lesk_wsd:
            # Word sense disambiguation comparing sentence used in a sentence with
            # the intended usage of a word (utilizing lesk/wordnet).
            log.info(f"Adding LESK results")
            highlights["lesk"] = get_lesk(context_list=highlights["mod_text"],
                                          tar_word_list=highlights["word"])

            # Only make replacements when the synset is the same as the lesk results.
            good_replace_query.append("synset == lesk")

        if run_bert_wsd:
            # Compare the target word's definitions to the modified sentence that contains it.
            # Return the best synset based on cosine similarity to the target word definition
            # and example sentences.

            best_synsets, _ = get_best_synset_bert(
                context_list=highlights["mod_text"],
                tar_word_list=highlights["word"],
                st=st,
                pos=highlights["syn_pos"],
            )
            highlights["bert_wsd"] = best_synsets
            # The best results don't use synset and bert_wsd.
            good_replace_query.append("synset == bert_wsd")
            # TODO: want to compare to the average sentences of all sentence use cases SEMCOR.
            # Instead, let's rank the similarity with each synset's dictionary definition
            # and choose the highest score.

        to_replace: pd.DataFrame = highlights.copy()
        for q in good_replace_query:
            to_replace = to_replace.query(q)
        to_replace = to_replace[["word", "syn"]]

        log.info(f"Replacing the entire article {to_replace.shape}")
        # \\b is necessary to match full words.
        # too lazy to use re.sub, just use pandas for replacement.
        new_article: str = pd.Series([article]).replace(
            {f"\\b{t.syn}\\b": t.word
             for t in to_replace.itertuples()},
            regex=True).iloc[0]

        rv.append(
            dict(article=article,
                 highlights_df=highlights,
                 new_article=new_article))

    # pd.set_option('display.max_colwidth', None)
    # see how well scores perform here.
    # bart_rv = px.histogram(orig_highlights["sim_score"])

    return rv
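
The final rewrite step relies on pandas' regex replacement with a dict of \b-bounded patterns, so only whole-word matches of each synonym are swapped back to the vocabulary word. A self-contained illustration with made-up text:

# Whole-word regex replacement, as used for the final article rewrite.
import pandas as pd

article = "The damaging effect was clear, but damaged goods were kept."
mapping = {r"\bdamaging\b": "detrimental"}  # \b keeps "damaged" untouched
print(pd.Series([article]).replace(mapping, regex=True).iloc[0])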
Example #7
def _get_sents_containing_word(
    all_text: str,
    word_syn_df: pd.DataFrame,
    num_sentences: int = 1,
    st: ST = None,
    spcy=None,
) -> pd.DataFrame:
    """For a given news article/text, get (num_sentences) surrounding
    sentences around all synonyms listed in word_syn_df. Create a modified sentence
    which contains the original sentence(s) with the synonym of a target vocabulary word
    replaced with that vocabulary word.

    - Use Spacy to get POS values for the target word in the sentence in all_text.
    - Use BERT from the sentence transformer module (st) to calculate similarity score
    between the original sentence(s) and the modified sentence.

    Args:
        all_text (str): a news article
        word_syn_df (pd.DataFrame): A dataframe mapping target word to synonym,
        with columns [word : str, syn : str, syn_pos : str].
        num_sentences (int, optional): How many sentences before/after the sentence
        containing a target word to include when highlighting a replacement. Defaults to 1.
        st (ST, optional): sentence transformer object defined in transformer. Defaults to None.
        spcy ([type], optional): Spacy model used to get POS. Defaults to None.

    Returns:
        pd.DataFrame: One row per replacement, with columns:
            word : str = target vocabulary word,
            syn : str = synonym of the vocabulary word found in the original text,
            orig_text : str = num_sentences surrounding the syn in all_text,
            mod_text : str = orig_text with the syn replaced by word,
            sim_score : float = cosine similarity between vectorized orig_text and mod_text,
            syn_pos_context : str = pos of the syn in orig_text,
            syn_pos : str = intended pos of the syn; a mismatch with syn_pos_context
            indicates a bad replacement.

    """
    log = make_logger(__name__)
    if st is None:
        # st = sentence transformer object; fall back to the default model.
        st = ST()

    # word_syn_df contains a word and syn column.

    sentences: List[str] = all_text.split(".")

    # if we want more than one sentence surrounding the word,
    # grab num_sentences//2 before and after the word.
    more_sent = num_sentences // 2
    total_sent = len(sentences) - 1

    all_replaced_context = []

    # TODO: this can be much more efficient.
    (all_orig_text, all_mod_text, all_words, all_syn, all_context_pos,
     all_syn_pos) = (
         [],
         [],
         [],
         [],
         [],
         [],
     )

    # For each synonym, go through each sentence to see if it can be replaced.
    for tup in word_syn_df.itertuples():
        word: str = tup.word
        syn: str = tup.syn
        syn_pos: str = tup.syn_pos

        # A syn can occur in multiple places within an article.
        curr_orig_text, curr_mod_text, curr_pos = [], [], []
        for sent_id, sent in enumerate(sentences):
            # Fill curr_orig_text and curr_mod_text with neighboring sentences.
            is_replaced = _add_replaced_surrounding_sents(
                syn=syn,
                word=word,
                sentences=sentences,
                sent_id=sent_id,
                more_sent=more_sent,
                total_sent=total_sent,
                curr_orig_text=curr_orig_text,
                curr_mod_text=curr_mod_text,
            )

            # Extract the pos of the target word in the given sentence.
            if is_replaced:
                curr_pos.append(
                    _get_word_in_sent_pos(context=sent,
                                          tar_word=syn,
                                          spcy=spcy))

        # There was a match, so add a replaced highlight.
        if len(curr_orig_text) > 0:

            # Append to master list.
            all_orig_text += curr_orig_text
            all_mod_text += curr_mod_text
            all_words += [word] * len(curr_orig_text)
            all_syn += [syn] * len(curr_orig_text)
            all_syn_pos += [syn_pos] * len(curr_orig_text)
            all_context_pos += curr_pos

            # TODO: consider removing.
            # all_replaced_context.append(ReplacedContext(vocab = word,
            #                                             syn = syn,
            #                                             orig_text = curr_orig_text,
            #                                             mod_text = curr_mod_text
            #                                             )

            #                                 )
    # Calc cosine similarity -  should be run in a more vectorized fashion.
    sim_scores = get_cos_sim(text_a=all_orig_text, text_b=all_mod_text, st=st)

    log.info(
        f"word: {len(all_words)}, syn : {len(all_syn)}, "
        f"orig_text : {len(all_orig_text)}, mod_text : {len(all_mod_text)}, "
        f"sim_score : {len(sim_scores)}")
    # TODO: either store replaced context as a list of replacedcontext obj
    # or just as a dataframe.
    article_df = pd.DataFrame(
        dict(
            word=all_words,
            syn=all_syn,
            orig_text=all_orig_text,
            mod_text=all_mod_text,
            sim_score=sim_scores,
            syn_pos_context=all_context_pos,
            syn_pos=all_syn_pos,
        ))

    return article_df
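
The helper _get_word_in_sent_pos is not part of this listing. A minimal stand-in using spaCy's coarse POS tags might look like the sketch below; the exact tag format it should return (e.g. "NOUN" vs. "noun") is an assumption:

# Hypothetical stand-in for _get_word_in_sent_pos: coarse POS of a target word in context.
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes the model package is installed

def get_word_in_sent_pos(context: str, tar_word: str) -> str:
    # Return the coarse POS tag of the first token matching tar_word.
    for token in nlp(context):
        if token.text.lower() == tar_word.lower():
            return token.pos_  # e.g. "NOUN", "VERB"
    return ""

print(get_word_in_sent_pos("I will bank my earnings", "bank"))  # likely "VERB" here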