# readability
        readability.append(textstat.flesch_reading_ease(instance))

        # remove stopwords & lemmatize
        lemmatizer = WordNetLemmatizer()
        instance_no_stopwords = remove_stopwords(instance)
        new_instance = ' '.join([
            lemmatizer.lemmatize(w)
            for w in word_tokenize(instance_no_stopwords)
        ])

        # calculate TTR and MTLD
        lex = LexicalRichness(new_instance)
        ttr.append(lex.ttr)
        mtld.append(lex.mtld(threshold=0.72))

        inst_id = str(pod_id[i]) + '_' + str(speaker_id[i])
        instance_ids.append(inst_id)
        unique_words.append(lex.terms)

    # create dictionary with results
    measurements = {
        'instance': instance_ids,
        'mtld': mtld,
        'ttr': ttr,
        'readability': readability,
        'unique_words': unique_words,
        'avg sentence len': avg_sentence_length,
        'avg word len': avg_word_length
    }
def compute_lexical_richness(events,
                             by=["event_type"],
                             extent=["discontiguous_triggers"],
                             preproc=None):
    """
    Compute lexical richness measures of event text, grouped by attribute.

    The extent text of every event is collected per group. A group is
    identified by the values of the attributes named in ``by``, joined with
    ".". Metrics are then computed once per group over the concatenation of
    all texts collected for that group.

    :param events: iterable of Event objects. Each must expose the attributes
        named in ``by`` and a ``get_extent_text(extent=...)`` method.
    :param by: attribute names to group by (e.g. event type and/or subtype).
        The default list is only iterated here, never modified.
    :param extent: forwarded unchanged to ``Event.get_extent_text``.
        Default: the full event trigger including discontiguous parts.
    :param preproc: optional list of preprocessing functions, each taking and
        returning a string of text; applied in order to every event's text.
    :return: DataFrame indexed by ``annotation_type`` with one row per group,
        holding the metrics, their ranks and two combined rank columns, sorted
        by ``rank_maas_hdd_mtld``.
    """
    from lexicalrichness import LexicalRichness

    print(
        f"Computing lexical richness of {str(extent).upper()} grouped by {str(by).upper()} with preprocessing: {str(preproc)}"
    )

    # Collect text by attribute. The group label is built from the attribute
    # *values* of each event; accumulating into a dict means the events do not
    # have to arrive sorted by group.
    all_text = {}
    for event in events:
        attrib_name = ".".join(
            str(getattr(event, attrib_n)) for attrib_n in by)

        text = event.get_extent_text(extent=extent)
        if preproc:
            for preproc_func in preproc:
                text = preproc_func(text)
        all_text.setdefault(attrib_name, []).append(text)

    # compute lexical diversity by attribute
    d = []
    for attrib_name, mentions in all_text.items():
        # NOTE: a "mean mention TTR" (a variant of mean-segment TTR,
        # Johnson 1944, using annotation mentions as segments) was tried and
        # dropped: the TTR of a single mention is nearly always 1.

        # Lexical entropy: base-2 entropy of the distribution of distinct
        # mention strings within this group.
        mention_counts = Counter(mentions)
        n_mentions = float(len(mentions))
        entropy = -sum(count / n_mentions * math.log(count / n_mentions, 2)
                       for count in mention_counts.values())

        # metrics on all mentions together
        lr = LexicalRichness(" ".join(mentions))

        d.append({
            "annotation_type": attrib_name,
            "cttr": lr.cttr,
            "entropy": entropy,
            "dugast": lr.Dugast,
            "type_count": lr.terms,
            "token_count": lr.words,
            "herdan": lr.Herdan,
            "somers": lr.Summer,
            "maas": lr.Maas,  # low sensitivity
            "ttr": lr.ttr,
            "rttr": lr.rttr,
            "mtld": lr.mtld(threshold=0.72),  # length corrected, mid sensitivity
            "msttr":
            lr.msttr(segment_window=25),  # length corrected, mid sensitivity
            "mattr":
            lr.mattr(window_size=25),  # length corrected, mid sensitivity
            "hdd": lr.hdd(draws=42),  # length corrected, high sensitivity
        })

    df_lr = pd.DataFrame(d)
    # invert Maas for plotting
    df_lr["maas_inv"] = df_lr["maas"] * -1.0

    rec_metrics = ["maas", "hdd",
                   "mtld"]  # recommended metrics in McCarthy 2010

    # Rank every column for easy comparison.
    # NOTE(review): util.rank_dataframe_column is presumed to add one
    # "<column>_rank" column per existing column; the code below relies on
    # "annotation_type_rank" and "<metric>_rank" existing -- confirm in util.
    df_lr = util.rank_dataframe_column(df_lr, ascending=False)
    # Maas is inverted (a lower score means more richness), so its rank is
    # recomputed in ascending order.
    df_lr["maas_rank"] = df_lr["maas"].rank().astype(int)
    # no need for a ranking of the label column
    df_lr = df_lr.drop(labels=["annotation_type_rank"], axis=1)

    # nicer output
    df_lr = df_lr.sort_index(axis=1)  # sort columns alphabetically
    rank_cols = [c for c in df_lr if "_rank" in c and "_count" not in c]
    # sum every metric rank (count columns excluded) and rank the sums
    df_lr["rank_all"] = df_lr[rank_cols].sum(axis=1).rank().astype(int)
    # same, restricted to the recommended metrics
    recommended_rank_cols = [m + "_rank" for m in rec_metrics]
    df_lr["rank_maas_hdd_mtld"] = (
        df_lr[recommended_rank_cols].sum(axis=1).rank().astype(int))
    df_lr = df_lr.set_index("annotation_type")
    # sort rows by the combination of recommended metrics in McCarthy 2010
    df_lr = df_lr.sort_values(by="rank_maas_hdd_mtld")
    return df_lr
 def calculate_lexical_richness_measure(self, text, window_size=200, threshold=0.72):
     """Store the MATTR and MTLD of ``text`` in ``self.measures`` and return it.

     :param text: text handed to ``LexicalRichness``.
     :param window_size: window size passed to ``LexicalRichness.mattr``.
     :param threshold: threshold passed to ``LexicalRichness.mtld``.
     :return: ``self.measures`` after its 'mattr' and 'mtld' entries are set.
     """
     richness = LexicalRichness(text)

     # moving average type-token ratio
     moving_average_ttr = richness.mattr(window_size=window_size)
     self.measures['mattr'] = moving_average_ttr

     # measure of (textual) lexical diversity
     textual_diversity = richness.mtld(threshold=threshold)
     self.measures['mtld'] = textual_diversity

     return self.measures
    def process_dataset(self,
                        dataset,
                        remove_stop_words=False,
                        stem=False,
                        remove_punct=False,
                        n_gram=1,
                        tags=False,
                        pos=False,
                        dep=False,
                        alpha=False,
                        ent=False,
                        sentiment=False,
                        vectorizer='count',
                        lex=False,
                        normalize=False,
                        tag_ngram=False,
                        text_features=False):
        """Preprocess every text in ``dataset`` and build a feature matrix.

        Each text is run through ``self.proccess_text`` and the results are
        vectorized with a freshly created vectorizer, which is stored on
        ``self.vectorizer``.

        :param dataset: collection of raw texts. It is iterated more than
            once when ``lex`` is set, so it must be re-iterable.
        :param remove_stop_words, stem, remove_punct, n_gram, tags, pos, dep,
            alpha, ent, sentiment, tag_ngram: forwarded unchanged to
            ``self.proccess_text``.
        :param vectorizer: ``'tfidf'`` selects ``TfidfVectorizer``; any other
            value selects ``CountVectorizer``. ``None`` additionally skips
            vectorization and returns the processed corpus.
        :param lex: if truthy, append four columns per text: ttr, rttr, cttr
            and mtld (threshold 0.72) of the raw text. A metric that cannot
            be computed is recorded as 0.0.
        :param normalize: if truthy, apply ``preprocessing.normalize`` to the
            vectorized matrix and, separately, to the lexical columns. The
            text-feature columns are not normalized.
        :param text_features: if truthy, append six character/sentence
            statistics per text. Only honoured when ``lex`` is also truthy.
        :return: the list of processed texts when ``vectorizer`` is ``None``,
            otherwise a dense 2-D array with one row per text.
        """
        # Both vectorizers are built with library defaults; ``n_gram`` only
        # reaches ``proccess_text`` and does not configure the vectorizer.
        if vectorizer == 'tfidf':
            self.vectorizer = TfidfVectorizer()
        else:
            self.vectorizer = CountVectorizer()

        processed_corpus = [
            self.proccess_text(text,
                               remove_stop_words=remove_stop_words,
                               stem=stem,
                               remove_punct=remove_punct,
                               n_gram=n_gram,
                               tags=tags,
                               pos=pos,
                               dep=dep,
                               alpha=alpha,
                               ent=ent,
                               sentiment=sentiment,
                               tag_ngram=tag_ngram) for text in dataset
        ]
        if vectorizer is None:
            return processed_corpus

        X = self.vectorizer.fit_transform(processed_corpus)
        X = X.toarray()

        if normalize:
            X = preprocessing.normalize(X)

        if lex:

            def _metric_or_zero(compute):
                # Best effort: some metrics raise on degenerate input (for
                # example an empty text); fall back to 0.0 for that metric.
                try:
                    return compute()
                except Exception:
                    return 0.0

            lex_features = []
            for text in dataset:
                richness = LexicalRichness(text)
                lex_features.append([
                    _metric_or_zero(lambda: richness.ttr),
                    _metric_or_zero(lambda: richness.rttr),
                    _metric_or_zero(lambda: richness.cttr),
                    _metric_or_zero(lambda: richness.mtld(threshold=0.72)),
                ])
            lex_features = np.array(lex_features)
            if normalize:
                lex_features = preprocessing.normalize(lex_features)

            X = np.concatenate((X, lex_features), axis=1)

            # Append the text statistics only when they were requested:
            # concatenating an empty list to the 2-D matrix along axis 1
            # raises ValueError.
            if text_features:
                _text_features = [[
                    self.countChar(text),
                    self.countCharWithoutSpace(text),
                    self.countSentences(text),
                    self.avarageSentenceLength(text),
                    self.maxSentenceLength(text),
                    self.minSentenceLength(text),
                ] for text in dataset]
                X = np.concatenate((X, _text_features), axis=1)

        return X