import math
from collections import Counter
from itertools import groupby

import pandas as pd


def compute_lexical_richness(events,
                             by=["event_type"],
                             extent=["discontiguous_triggers"],
                             preproc=None):
    """
    Compute lexical richness measures of unit attributes.

    :param events: list of Event objects.
    :param by: attribute names to group the metric by, e.g. event_type or subtype.
    :param extent: extent of the text getter functions on Event. Default: the
        full event trigger, including discontiguous triggers.
    :param preproc: list of preprocessing functions that take a string of text
        as input.
    :return: DataFrame of lexical richness metrics, one row per attribute group.
    """
    from lexicalrichness import LexicalRichness

    print(
        f"Computing lexical richness of {str(extent).upper()} grouped by {str(by).upper()} with preprocessing: {str(preproc)}"
    )
    # collect text by attribute
    all_text = {}
    for attrib_values, g in groupby(
            events,
            key=lambda x: tuple(getattr(x, attrib_n) for attrib_n in by)):
        # the key must be a tuple (not a generator) so that equal attribute
        # values compare equal and group together
        attrib_name = ".".join(str(v) for v in attrib_values)

        for event in g:
            text = event.get_extent_text(extent=extent)
            if preproc:
                for preproc_func in preproc:
                    text = preproc_func(text)
            all_text.setdefault(attrib_name, []).append(text)

    # compute lexical diversity by attribute
    d = []
    for attrib_name, text in all_text.items():
        # Mean mention TTR, a variant of mean segmental TTR (Johnson 1944)
        # with annotation mentions instead of fixed-size segments, was
        # dropped because mention-level TTR is nearly always 1:
        # mention_ttr = [LexicalRichness(t).ttr for t in text]
        # mmttr = sum(mention_ttr) / len(text)

        # Lexical entropy over mentions: H = -sum(p * log2(p)), with p the
        # relative frequency of each distinct mention string
        p, lns = Counter(text), float(len(text))
        entropy = -sum(count / lns * math.log(count / lns, 2)
                       for count in p.values())

        # metrics on all mentions together
        text = " ".join(text)
        lr = LexicalRichness(text)

        d.append({
            "annotation_type": attrib_name,
            # "mean_mention_ttr": mmttr,  # dropped, see note above
            "cttr": lr.cttr,
            "entropy": entropy,
            "dugast": lr.Dugast,
            "type_count": lr.terms,
            "token_count": lr.words,
            "herdan": lr.Herdan,
            "summer": lr.Summer,
            "maas": lr.Maas,  # low sensitivity
            "ttr": lr.ttr,
            "rttr": lr.rttr,
            "mtld": lr.mtld(threshold=0.72),  # length-corrected, mid sensitivity
            "msttr": lr.msttr(segment_window=25),  # length-corrected, mid sensitivity
            "mattr": lr.mattr(window_size=25),  # length-corrected, mid sensitivity
            "hdd": lr.hdd(draws=42),  # length-corrected, high sensitivity
        })

    df_lr = pd.DataFrame(d)
    # invert Maas for plotting
    df_lr["maas_inv"] = df_lr["maas"] * -1.0

    rec_metrics = ["maas", "hdd",
                   "mtld"]  # recommended metrics in McCarthy 2010
    # rank
    df_lr = util.rank_dataframe_column(
        df_lr, ascending=False)  # add rank column for easy comparison
    df_lr["maas_rank"] = (df_lr["maas"].rank().astype(int)
                          )  # Maas is inverted, lower score is more richness
    df_lr = df_lr.drop(labels=["annotation_type_rank"],
                       axis=1)  # no need for index column ranking

    # nicer output
    df_lr = df_lr.sort_index(axis=1)  # sort columns alphabetically
    rank_cols = [c for c in df_lr if "_rank" in c and "_count" not in c]
    # sum every metric rank, then rank the totals
    df_lr["rank_all"] = df_lr[rank_cols].sum(axis=1).rank().astype(int)
    # combine the recommended metrics into a single rank
    df_lr["rank_maas_hdd_mtld"] = (
        df_lr[[m + "_rank" for m in rec_metrics]].sum(axis=1).rank().astype(int))
    df_lr = df_lr.set_index("annotation_type")
    # sort by the combination of metrics recommended in McCarthy & Jarvis (2010)
    df_lr = df_lr.sort_values(by="rank_maas_hdd_mtld")
    return df_lr
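
A minimal usage sketch. The Event class and the util helper below are
hypothetical stand-ins (neither is defined in this snippet), and the toy
texts are made long enough for the 25-token windows and 42 hdd draws used
above:

class Event:
    # toy stand-in for the project's annotation Event class
    def __init__(self, event_type, text):
        self.event_type = event_type
        self._text = text

    def get_extent_text(self, extent=None):
        return self._text

class util:
    # stub for the project-local ranking helper referenced in the function
    @staticmethod
    def rank_dataframe_column(df, ascending=False):
        ranks = df.rank(ascending=ascending).astype(int).add_suffix("_rank")
        return df.join(ranks)

events = [
    Event("merger", "the two firms agree to merge their retail units after "
                    "months of talks over price and over control of the "
                    "combined group"),
    Event("merger", "the boards approve the deal and the combined company "
                    "will trade under a new name once regulators sign off "
                    "on the agreement"),
    Event("lawsuit", "the supplier sues the retailer for breach of contract "
                     "and seeks damages plus interest in federal court later "
                     "this year"),
    Event("lawsuit", "the court dismisses two of the claims but allows the "
                     "fraud claim to proceed to a full jury trial early in "
                     "the autumn term"),
]
df = compute_lexical_richness(events, by=["event_type"], preproc=[str.lower])
print(df[["token_count", "type_count", "ttr"]])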
def calculate_lexical_richness_measure(self, text, window_size=200, threshold=0.72):
    # method of a class that accumulates results in its self.measures dict
    lex = LexicalRichness(text)
    self.measures['mattr'] = lex.mattr(window_size=window_size)  # moving-average TTR
    self.measures['mtld'] = lex.mtld(threshold=threshold)  # Measure of Textual Lexical Diversity
    return self.measures
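
For reference, the same two metrics computed directly with the
lexicalrichness package (note that window_size must not exceed the token
count of the text):

from lexicalrichness import LexicalRichness

lex = LexicalRichness("the quick brown fox jumps over the lazy dog "
                      "while the cat naps by the warm fire")
print(lex.mattr(window_size=10))  # moving-average TTR over 10-token windows
print(lex.mtld(threshold=0.72))   # MTLD with the conventional 0.72 TTR threshold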
Example 3
import re

import nltk
import numpy as np
import pandas as pd
from lexicalrichness import LexicalRichness
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def process_speech(transcribe_df, r_config):
    """
    Prepare speech features.

    Args:
        transcribe_df: transcribed dataframe
        r_config: raw config file object
    Returns:
        Dataframe of speech features
    """

    err_transcribe = transcribe_df[r_config.err_reason].iloc[0]
    transcribe = transcribe_df[r_config.nlp_transcribe].iloc[0]
    total_time = transcribe_df[r_config.nlp_totalTime].iloc[0]
    master_url = transcribe_df['dbm_master_url'].iloc[0]

    # return an empty feature frame when transcription failed
    if err_transcribe != 'Pass':
        df_speech = empty_speech(r_config, master_url, err_transcribe)
        return df_speech

    # clean transcript: drop commas, keep word tokens and sentence-final punctuation
    transcribe = transcribe.replace(",", "")
    transcribe = " ".join(re.findall(r"[\w']+|[.!?]", transcribe))

    speech_dict = {}
    nltk_download()  # project-local helper that fetches the required NLTK data

    sentences = nltk.tokenize.sent_tokenize(transcribe)
    words_all = nltk.tokenize.word_tokenize(transcribe)
    num_sentences = len(sentences)

    speech_dict[r_config.nlp_numSentences] = num_sentences

    # nlp_singPron: first-person singular pronouns, counted over tokens so
    # that substrings (e.g. the 'my' in 'myself') are not matched
    sing_count = (words_all.count('I') + words_all.count('me') +
                  words_all.count('my'))

    speech_dict[r_config.nlp_singPronPerAns] = sing_count if len(
        words_all) > 0 else np.nan
    speech_dict[r_config.nlp_singPronPerSen] = divide_var(
        speech_dict[r_config.nlp_singPronPerAns], num_sentences)

    tagged = nltk.pos_tag(transcribe.split())
    tagged_df = pd.DataFrame(tagged, columns=['word', 'pos_tag'])

    # Past tense per answer
    all_POSs = tagged_df['pos_tag'].tolist()
    pastTenseAns = all_POSs.count('VBD')
    speech_dict[r_config.nlp_pastTensePerAns] = (
        pastTenseAns if len(words_all) > 0 else np.nan)
    speech_dict[r_config.nlp_pastTensePerSen] = divide_var(
        speech_dict[r_config.nlp_pastTensePerAns], num_sentences)

    # Pronouns per answer (personal and possessive)
    pronounsPerAns = all_POSs.count('PRP') + all_POSs.count('PRP$')
    speech_dict[r_config.nlp_pronounsPerAns] = (
        pronounsPerAns if len(words_all) > 0 else np.nan)
    speech_dict[r_config.nlp_pronounsPerSen] = divide_var(
        speech_dict[r_config.nlp_pronounsPerAns], num_sentences)

    # Verbs per answer (all verb tags)
    verbPerAns = sum(all_POSs.count(tag)
                     for tag in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'))
    speech_dict[r_config.nlp_verbsPerAns] = (
        verbPerAns if len(words_all) > 0 else np.nan)
    speech_dict[r_config.nlp_verbsPerSen] = divide_var(
        speech_dict[r_config.nlp_verbsPerAns], num_sentences)

    # Adjectives per answer
    adjectivesAns = (all_POSs.count('JJ') + all_POSs.count('JJR') +
                     all_POSs.count('JJS'))
    speech_dict[r_config.nlp_adjectivesPerAns] = (
        adjectivesAns if len(words_all) > 0 else np.nan)
    speech_dict[r_config.nlp_adjectivesPerSen] = divide_var(
        speech_dict[r_config.nlp_adjectivesPerAns], num_sentences)

    # Nouns per answer
    nounsAns = (all_POSs.count('NN') + all_POSs.count('NNP') +
                all_POSs.count('NNS'))
    speech_dict[r_config.nlp_nounsPerAns] = (
        nounsAns if len(words_all) > 0 else np.nan)
    speech_dict[r_config.nlp_nounsPerSen] = divide_var(
        speech_dict[r_config.nlp_nounsPerAns], num_sentences)

    #Sentiment analysis
    vader = SentimentIntensityAnalyzer()
    sentence_valences = []

    for s in sentences:
        sentiment_dict = vader.polarity_scores(s)
        sentence_valences.append(sentiment_dict['compound'])

    speech_dict[r_config.nlp_sentiment_mean] = np.mean(
        sentence_valences) if len(sentence_valences) > 0 else np.nan
    non_punc = list(value for value in words_all
                    if value not in ['.', '!', '?'])

    non_punc_as_str = " ".join(non_punc)
    lex = LexicalRichness(non_punc_as_str)
    # a window spanning the whole text makes MATTR equal to plain TTR
    speech_dict[r_config.nlp_mattr] = lex.mattr(
        window_size=lex.words) if lex.words > 0 else np.nan

    # Number of words per minute (assuming total_time is measured in seconds)
    speech_dict[r_config.nlp_wordsPerMin] = divide_var(len(non_punc),
                                                       total_time) * 60
    speech_dict[r_config.nlp_totalTime] = total_time
    speech_dict['dbm_master_url'] = master_url

    df_speech = pd.DataFrame([speech_dict])
    return df_speech
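
The NLTK and VADER calls above can be exercised in isolation. A minimal
sketch (the required corpora are downloaded explicitly here, which the
project's nltk_download() helper presumably handles):

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

for pkg in ('punkt', 'averaged_perceptron_tagger', 'vader_lexicon'):
    nltk.download(pkg, quiet=True)

text = "I loved the film. The plot was weak but the acting impressed me!"
sentences = nltk.tokenize.sent_tokenize(text)
tagged = nltk.pos_tag(text.split())
vader = SentimentIntensityAnalyzer()

print([tag for _, tag in tagged])  # POS tags tallied per answer above
print([vader.polarity_scores(s)['compound']
       for s in sentences])  # per-sentence compound valence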