# readability readability.append(textstat.flesch_reading_ease(instance)) # remove stopwords & lemmatize lemmatizer = WordNetLemmatizer() instance_no_stopwords = remove_stopwords(instance) new_instance = ' '.join([ lemmatizer.lemmatize(w) for w in word_tokenize(instance_no_stopwords) ]) # calculate TTR and MTLD lex = LexicalRichness(new_instance) ttr.append(lex.ttr) mtld.append(lex.mtld(threshold=0.72)) inst_id = str(pod_id[i]) + '_' + str(speaker_id[i]) instance_ids.append(inst_id) unique_words.append(lex.terms) # create dictionary with results measurements = { 'instance': instance_ids, 'mtld': mtld, 'ttr': ttr, 'readability': readability, 'unique_words': unique_words, 'avg sentence len': avg_sentence_length, 'avg word len': avg_word_length }
def compute_lexical_richness(events, by=None, extent=None, preproc=None):
    """
    Compute lexical richness measures of event text, grouped by event attributes.

    The extent text of every event is collected per group, where a group is the
    combination of the attribute values named in ``by``. Lexical diversity metrics
    are then computed on the pooled text of each group and the groups are ranked.

    :param events: iterable of Event objects that expose ``get_extent_text(extent=...)``
        and the attributes named in ``by``.
    :param by: attribute name, or list of attribute names, to group by (used for
        grouping by event_type or subtype). Defaults to ``["event_type"]``.
    :param extent: extent passed to the text getter on Event. Defaults to
        ``["discontiguous_triggers"]`` (full event trigger including discontiguous parts).
    :param preproc: optional list of preprocessing functions that take a string of
        text as input and return the processed text; applied in list order.
    :return: DataFrame indexed by ``annotation_type`` with one row per group, holding
        the metric columns and rank columns, sorted by ``rank_maas_hdd_mtld``.
        An empty DataFrame is returned when ``events`` yields nothing.
    """
    from lexicalrichness import LexicalRichness

    # None sentinels stand in for the list defaults so no mutable object is shared
    # between calls.
    if by is None:
        by = ["event_type"]
    if extent is None:
        extent = ["discontiguous_triggers"]

    print(
        f"Computing lexical richness of {str(extent).upper()} grouped by {str(by).upper()} with preprocessing: {str(preproc)}"
    )

    # A bare string would otherwise be iterated character by character.
    group_attribs = [by] if isinstance(by, str) else list(by)

    # collect text by attribute
    all_text = {}
    for event in events:
        # Build the group label from concrete attribute values. A generator used as
        # a grouping key compares by identity, so it never matches another event and
        # its repr is not a usable label. Accumulating in a dict also means the
        # events do not need to be pre-sorted by group.
        attrib_name = ".".join(
            str(getattr(event, attrib_n)) for attrib_n in group_attribs
        )
        text = event.get_extent_text(extent=extent)
        if preproc:
            for preproc_func in preproc:
                text = preproc_func(text)
        all_text.setdefault(attrib_name, []).append(text)

    if not all_text:
        # Nothing to measure or rank; the column lookups below would fail on an
        # empty frame.
        return pd.DataFrame()

    # compute lexical diversity by attribute
    d = []
    for attrib_name, text in all_text.items():
        # NOTE: a mean per-mention TTR was tried and dropped because mention TTR is
        # nearly always 1.

        # Lexical entropy over whole mention strings (not over tokens)
        p, lns = Counter(text), float(len(text))
        entropy = -sum(count / lns * math.log(count / lns, 2) for count in p.values())

        # metrics on all mentions together
        text = " ".join(text)
        lr = LexicalRichness(text)
        d.append({
            "annotation_type": attrib_name,
            "cttr": lr.cttr,
            "entropy": entropy,
            "dugast": lr.Dugast,
            "type_count": lr.terms,
            "token_count": lr.words,
            "herdan": lr.Herdan,
            "somers": lr.Summer,
            "maas": lr.Maas,  # low sensitivity
            "ttr": lr.ttr,
            "rttr": lr.rttr,
            "mtld": lr.mtld(threshold=0.72),  # length corrected, mid sensitivity
            "msttr": lr.msttr(segment_window=25),  # length corrected, mid sensitivity
            "mattr": lr.mattr(window_size=25),  # length corrected, mid sensitivity
            "hdd": lr.hdd(draws=42),  # length corrected, high sensitivity
        })

    df_lr = pd.DataFrame(d)

    # invert Maas for plotting
    df_lr["maas_inv"] = df_lr["maas"] * -1.0
    rec_metrics = ["maas", "hdd", "mtld"]  # recommended metrics in McCarthy 2010

    # rank: add rank columns for easy comparison
    # NOTE(review): presumably adds a "<column>_rank" column for every column; the
    # code below relies on "annotation_type_rank" and "<metric>_rank" existing.
    df_lr = util.rank_dataframe_column(df_lr, ascending=False)
    # Maas is inverted: a lower score means more richness, so rank ascending
    df_lr["maas_rank"] = df_lr["maas"].rank().astype(int)
    # no need for a ranking of the label column
    df_lr = df_lr.drop(labels=["annotation_type_rank"], axis=1)

    # nicer output
    df_lr = df_lr.sort_index(axis=1)  # sort columns alphabetically
    rank_cols = [c for c in df_lr if "_rank" in c and "_count" not in c]
    # sum every metric rank and rank the sums
    df_lr["rank_all"] = df_lr[rank_cols].sum(axis=1).rank().astype(int)
    # combine the recommended metrics only
    df_lr["rank_maas_hdd_mtld"] = (
        df_lr[[m + "_rank" for m in rec_metrics]].sum(axis=1).rank().astype(int)
    )
    df_lr = df_lr.set_index("annotation_type")
    # sort values by the combination of recommended metrics in McCarthy 2010
    df_lr = df_lr.sort_values(by="rank_maas_hdd_mtld")
    return df_lr
def calculate_lexical_richness_measure(self, text, window_size = 200, threshold = 0.72):
    """
    Store the MATTR and MTLD scores of ``text`` in ``self.measures``.

    :param text: text handed to ``LexicalRichness``.
    :param window_size: window size passed to ``LexicalRichness.mattr``.
    :param threshold: threshold passed to ``LexicalRichness.mtld``.
    :return: ``self.measures`` after the ``'mattr'`` and ``'mtld'`` entries are set.
    """
    richness = LexicalRichness(text)

    # moving average type-token ratio
    moving_average_ttr = richness.mattr(window_size=window_size)
    self.measures['mattr'] = moving_average_ttr

    # measure of lexical diversity
    textual_diversity = richness.mtld(threshold=threshold)
    self.measures['mtld'] = textual_diversity

    return self.measures
def process_dataset(self, dataset, remove_stop_words=False, stem=False, remove_punct=False, n_gram=1, tags=False, pos=False, dep=False, alpha=False, ent=False, sentiment=False, vectorizer='count', lex=False, normalize=False, tag_ngram=False, text_features=False):
    """
    Preprocess and vectorize a collection of texts, optionally appending extra features.

    Every text is first run through ``self.proccess_text``. With ``vectorizer=None``
    the processed texts are returned as they are. Otherwise they are vectorized into
    a dense matrix, to which lexical richness and surface text feature columns can
    be appended.

    :param dataset: iterable of raw texts. It is iterated more than once when
        ``lex`` or ``text_features`` is set.
    :param remove_stop_words, stem, remove_punct, n_gram, tags, pos, dep, alpha, ent,
        sentiment, tag_ngram: forwarded unchanged to ``self.proccess_text``. Note that
        ``n_gram`` is not passed to the vectorizer itself.
    :param vectorizer: ``'tfidf'`` selects ``TfidfVectorizer``, any other value
        selects ``CountVectorizer``. ``None`` returns the processed corpus without
        vectorizing.
    :param lex: when true, append four columns computed from the raw text (TTR, RTTR,
        CTTR, MTLD with threshold 0.72). A metric that fails for a text is recorded
        as 0.0.
    :param normalize: when true, apply ``preprocessing.normalize`` to the vectorized
        matrix and, separately, to the lexical richness columns.
    :param text_features: when true, append six columns produced by ``countChar``,
        ``countCharWithoutSpace``, ``countSentences``, ``avarageSentenceLength``,
        ``maxSentenceLength`` and ``minSentenceLength`` on the raw text.
    :return: the list of processed texts when ``vectorizer`` is ``None``, otherwise
        a 2-D array with one row per text.
    """
    # The vectorizer is stored on the instance, also when ``vectorizer`` is None.
    if vectorizer == 'tfidf':
        self.vectorizer = TfidfVectorizer()
    else:
        self.vectorizer = CountVectorizer()

    processed_corpus = [
        self.proccess_text(text, remove_stop_words=remove_stop_words, stem=stem,
                           remove_punct=remove_punct, n_gram=n_gram, tags=tags,
                           pos=pos, dep=dep, alpha=alpha, ent=ent,
                           sentiment=sentiment, tag_ngram=tag_ngram)
        for text in dataset
    ]
    if vectorizer is None:
        return processed_corpus

    X = self.vectorizer.fit_transform(processed_corpus)
    X = X.toarray()
    if normalize:
        X = preprocessing.normalize(X)

    if lex:
        # Each getter is attempted on its own so one failing metric does not
        # discard the others.
        metric_getters = (
            lambda richness: richness.ttr,
            lambda richness: richness.rttr,
            lambda richness: richness.cttr,
            lambda richness: richness.mtld(threshold=0.72),
        )
        lex_features = []
        for text in dataset:
            # Named ``richness`` so the ``lex`` flag parameter is not overwritten.
            richness = LexicalRichness(text)
            row = []
            for getter in metric_getters:
                try:
                    row.append(getter(richness))
                except Exception:
                    # Best effort: a metric that cannot be computed (e.g. on very
                    # short text) counts as 0.0. KeyboardInterrupt and SystemExit
                    # are no longer swallowed.
                    row.append(0.0)
            lex_features.append(row)
        lex_features = np.array(lex_features)
        if normalize:
            lex_features = preprocessing.normalize(lex_features)
        # Appended only when computed; ``lex_features`` does not exist otherwise.
        X = np.concatenate((X, lex_features), axis=1)

    if text_features:
        _text_features = [
            [
                self.countChar(text),
                self.countCharWithoutSpace(text),
                self.countSentences(text),
                self.avarageSentenceLength(text),
                self.maxSentenceLength(text),
                self.minSentenceLength(text),
            ]
            for text in dataset
        ]
        # Appended only when computed; an empty list cannot be concatenated with
        # the 2-D matrix.
        X = np.concatenate((X, _text_features), axis=1)

    return X