def _compute_lexicalRichness(text, lemmatized_text, word_count,
                             unique_words_cnt, lexical_diversity):
    lex = LexicalRichness(text)
    lex_lemmatized = LexicalRichness(lemmatized_text)
    # word count
    word_count.append(lex.words)
    # unique term count
    unique_words_cnt.append(lex.terms)
    # type-token ratio (unique terms / total words) of the lemmatized text
    lexical_diversity.append(
        float(lex_lemmatized.terms) / float(lex_lemmatized.words))
    return word_count, unique_words_cnt, lexical_diversity
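# A minimal usage sketch for the helper above; the (raw, lemmatized) pair is
# made up for illustration and is not part of the original code.
from lexicalrichness import LexicalRichness

word_count, unique_words_cnt, lexical_diversity = [], [], []
docs = [("the cat sat on the mat", "the cat sit on the mat")]
for raw, lemmatized in docs:
    word_count, unique_words_cnt, lexical_diversity = _compute_lexicalRichness(
        raw, lemmatized, word_count, unique_words_cnt, lexical_diversity)
print(word_count, unique_words_cnt, lexical_diversity)  # [6] [5] [0.8333...]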
def extractLexicalRichness():
    """Extract lexical richness of the text documents.

    The original (unprocessed) text is used here as well.
    """
    Path1 = 'Gutenberg_English_Fiction_1k'
    Path2 = 'Gutenberg_English_Fiction_1k'
    HTMLFilesPath = 'Gutenberg_19th_century_English_Fiction'
    lexScores = []
    badIndexes = []
    dataPath = os.path.join(os.getcwd(), Path1, Path2, HTMLFilesPath)
    data = pp.readIndexes()
    for i in range(len(data)):
        print(i)  # progress
        htmlFilePath = os.path.join(dataPath, data['book_id'][i])[:-5] + '-content.html'
        corpus = pp.readHTMLFile(htmlFilePath)
        if corpus:
            lex = LexicalRichness(corpus)
            # drop the raw text and word list so the pickle stays small;
            # only the computed statistics are needed downstream
            del lex.wordlist
            del lex.text
            lexScores.append(lex)
        else:
            badIndexes.append(i)
    with open(lexRichFile, 'wb') as f:
        pickle.dump(lexScores, f)
def type_token_ratio(row):
    text = row['reviewText']
    lex = LexicalRichness(text)
    try:
        return lex.ttr
    except ZeroDivisionError:
        # empty review: no words, so TTR is undefined
        return 0
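# A minimal usage sketch for the function above, applied row-wise with
# pandas; the sample reviews are made up for illustration.
import pandas as pd

reviews = pd.DataFrame({'reviewText': ["great great great product", ""]})
reviews['ttr'] = reviews.apply(type_token_ratio, axis=1)
print(reviews['ttr'].tolist())  # [0.5, 0] -- the empty review falls back to 0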
def __call__(self, doc):
    # check if the tagger component is part of the pipeline
    if not doc.is_tagged:
        warnings.warn(
            'The spaCy document was not tagged. Please add the TAGGER component, '
            'or else the lexical variation indices will be based on all tokens '
            'and not just lexical ones!', Warning)
        lex = LexicalRichness(doc.text)
    else:
        # remove all non-lexical words
        self.mark_lexical_token(doc)
        list_lexical_words = [str(token) for token in doc if token._.is_lexical]
        lexical_word_text = ' '.join(list_lexical_words)
        lex = LexicalRichness(lexical_word_text)

    doc._.features['LV_W'] = self.get_lv_w(lex)
    doc._.features['LV_WT'] = self.get_lv_wt(lex)
    doc._.features['LV_WT1'] = self.get_lv_wt1(lex)
    doc._.features['LV_TTR'] = self.get_lv_ttr(lex)
    doc._.features['LV_CTTR'] = self.get_lv_cttr(lex)
    doc._.features['LV_RTTR'] = self.get_lv_rttr(lex)
    doc._.features['LV_HDD'] = self.get_lv_hdd(lex)
    doc._.features['LV_DUGA'] = self.get_lv_duga(lex)
    doc._.features['LV_MAAS'] = self.get_lv_maas(lex)
    doc._.features['LV_SUMM'] = self.get_lv_summ(lex)
    doc._.features['LV_YULEK'] = self.get_lv_yulek(lex)
    doc._.features['LV_MTLD'] = self.get_lv_mtld(lex)
    doc._.features['LV_MSTTR'] = self.get_lv_msttr(lex)
    doc._.features['LV_MATTR'] = self.get_lv_mattr(lex)

    # deprecated! do not use doc._.features_lv anymore, use doc._.features instead!
    doc._.features_lv = [
        doc._.features['LV_W'], doc._.features['LV_WT'],
        doc._.features['LV_WT1'], doc._.features['LV_TTR'],
        doc._.features['LV_CTTR'], doc._.features['LV_RTTR'],
        doc._.features['LV_HDD'], doc._.features['LV_DUGA'],
        doc._.features['LV_MAAS'], doc._.features['LV_SUMM'],
        doc._.features['LV_YULEK'], doc._.features['LV_MTLD'],
        doc._.features['LV_MSTTR'], doc._.features['LV_MATTR']
    ]
    return doc
def lexical_richness(data):
    logging.info("Processing score: lexical richness")
    ttr = []
    for doc in data['text']:
        lex = LexicalRichness(doc)
        ttr.append(lex.ttr)
    data['lexical_richness'] = ttr
    return data
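# Quick sanity check of the function above on a made-up two-row DataFrame
# (assumes pandas; the column name 'text' matches the function's expectation).
import pandas as pd

data = pd.DataFrame({'text': ["to be or not to be", "all unique words here"]})
data = lexical_richness(data)
print(data['lexical_richness'].tolist())  # [0.666..., 1.0]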
def get_mtld(doc):
    # measure of textual lexical diversity (McCarthy 2005, McCarthy and Jarvis 2010)
    segment = doc.text
    if re.search("[a-zA-Z]", segment) is None:
        # no alphabetic characters at all, e.g. "2" or "223)."
        return 0
    else:
        return LexicalRichness(segment).mtld(threshold=0.72)
def get_lexical_richness(sentence: str):
    lex = LexicalRichness(sentence)
    try:
        return lex.ttr
    except ZeroDivisionError:
        # sentence contains no effective words, return 0 instead
        return 0
def calculate_lexical_richness_measure(self, text, window_size=200, threshold=0.72):
    lex = LexicalRichness(text)
    # moving-average type-token ratio over a sliding window
    self.measures['mattr'] = lex.mattr(window_size=window_size)
    # measure of textual lexical diversity
    self.measures['mtld'] = lex.mtld(threshold=threshold)
    return self.measures
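# Standalone sketch of the two measures used above, on made-up text. MATTR
# requires at least window_size words, hence the repetition.
from lexicalrichness import LexicalRichness

sample = "the quick brown fox jumps over the lazy dog " * 30  # 270 words
lex = LexicalRichness(sample)
print(lex.mattr(window_size=200))  # moving-average type-token ratio
print(lex.mtld(threshold=0.72))    # measure of textual lexical diversity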
def process_speech(transcribe_df, r_config):
    """Prepare speech features.

    Args:
        transcribe_df: Transcribed dataframe
        r_config: raw config file object

    Returns:
        Dataframe of speech features
    """
    err_transcribe = transcribe_df[r_config.err_reason].iloc[0]
    transcribe = transcribe_df[r_config.nlp_transcribe].iloc[0]
    total_time = transcribe_df[r_config.nlp_totalTime].iloc[0]
    master_url = transcribe_df['dbm_master_url'].iloc[0]

    # clean transcript: drop commas, keep words and sentence-final punctuation
    transcribe = transcribe.replace(",", "")
    transcribe = " ".join(re.findall(r"[\w']+|[.!?]", transcribe))

    if err_transcribe != 'Pass':
        df_speech = empty_speech(r_config, master_url, err_transcribe)
        return df_speech

    speech_dict = {}
    nltk_download()

    sentences = nltk.tokenize.sent_tokenize(transcribe)
    words_all = nltk.tokenize.word_tokenize(transcribe)
    num_sentences = len(sentences)
    speech_dict[r_config.nlp_numSentences] = num_sentences

    # singular pronouns per answer / per sentence
    i_s = transcribe.count('I')
    me_s = transcribe.count('me')
    my_s = transcribe.count('my')
    sing_count = i_s + me_s + my_s
    speech_dict[r_config.nlp_singPronPerAns] = sing_count if len(words_all) > 0 else np.nan
    speech_dict[r_config.nlp_singPronPerSen] = divide_var(
        speech_dict[r_config.nlp_singPronPerAns], num_sentences)

    tagged = nltk.pos_tag(transcribe.split())
    tagged_df = pd.DataFrame(tagged, columns=['word', 'pos_tag'])
    all_POSs = tagged_df['pos_tag'].tolist()

    # past tense per answer / per sentence
    speech_dict[r_config.nlp_pastTensePerAns] = all_POSs.count(
        'VBD') if len(words_all) > 0 else np.nan
    speech_dict[r_config.nlp_pastTensePerSen] = divide_var(
        speech_dict[r_config.nlp_pastTensePerAns], num_sentences)

    # pronouns per answer / per sentence
    pronounsPerAns = all_POSs.count('PRP') + all_POSs.count('PRP$')
    speech_dict[r_config.nlp_pronounsPerAns] = pronounsPerAns if len(
        words_all) > 0 else np.nan
    speech_dict[r_config.nlp_pronounsPerSen] = divide_var(
        speech_dict[r_config.nlp_pronounsPerAns], num_sentences)

    # verbs per answer / per sentence
    verbPerAns = (all_POSs.count('VB') + all_POSs.count('VBD') + all_POSs.count('VBG')
                  + all_POSs.count('VBN') + all_POSs.count('VBP') + all_POSs.count('VBZ'))
    speech_dict[r_config.nlp_verbsPerAns] = verbPerAns if len(words_all) > 0 else np.nan
    speech_dict[r_config.nlp_verbsPerSen] = divide_var(
        speech_dict[r_config.nlp_verbsPerAns], num_sentences)

    # adjectives per answer / per sentence
    adjectivesAns = all_POSs.count('JJ') + all_POSs.count('JJR') + all_POSs.count('JJS')
    speech_dict[r_config.nlp_adjectivesPerAns] = adjectivesAns if len(
        words_all) > 0 else np.nan
    speech_dict[r_config.nlp_adjectivesPerSen] = divide_var(
        speech_dict[r_config.nlp_adjectivesPerAns], num_sentences)

    # nouns per answer / per sentence
    nounsAns = all_POSs.count('NN') + all_POSs.count('NNP') + all_POSs.count('NNS')
    speech_dict[r_config.nlp_nounsPerAns] = nounsAns if len(words_all) > 0 else np.nan
    speech_dict[r_config.nlp_nounsPerSen] = divide_var(
        speech_dict[r_config.nlp_nounsPerAns], num_sentences)

    # sentiment analysis: mean compound valence over sentences
    vader = SentimentIntensityAnalyzer()
    sentence_valences = []
    for s in sentences:
        sentiment_dict = vader.polarity_scores(s)
        sentence_valences.append(sentiment_dict['compound'])
    speech_dict[r_config.nlp_sentiment_mean] = np.mean(
        sentence_valences) if len(sentence_valences) > 0 else np.nan

    # lexical richness (MATTR) over the transcript with punctuation removed
    non_punc = [value for value in words_all if value not in ['.', '!', '?']]
    non_punc_as_str = " ".join(non_punc)
    lex = LexicalRichness(non_punc_as_str)
    speech_dict[r_config.nlp_mattr] = lex.mattr(
        window_size=lex.words) if lex.words > 0 else np.nan

    # number of words per minute
    speech_dict[r_config.nlp_wordsPerMin] = divide_var(len(non_punc), total_time) * 60
    speech_dict[r_config.nlp_totalTime] = total_time
    speech_dict['dbm_master_url'] = master_url

    df_speech = pd.DataFrame([speech_dict])
    return df_speech
def process_dataset(self, dataset, remove_stop_words=False, stem=False,
                    remove_punct=False, n_gram=1, tags=False, pos=False,
                    dep=False, alpha=False, ent=False, sentiment=False,
                    vectorizer='count', lex=False, normalize=False,
                    tag_ngram=False, text_features=False):
    if vectorizer == 'tfidf':
        # self.vectorizer = TfidfVectorizer(ngram_range=(1, n_gram), max_df=0.5, min_df=2)
        self.vectorizer = TfidfVectorizer()
    else:
        self.vectorizer = CountVectorizer()

    processed_corpus = [
        self.proccess_text(text,
                           remove_stop_words=remove_stop_words,
                           stem=stem,
                           remove_punct=remove_punct,
                           n_gram=n_gram,
                           tags=tags,
                           pos=pos,
                           dep=dep,
                           alpha=alpha,
                           ent=ent,
                           sentiment=sentiment,
                           tag_ngram=tag_ngram) for text in dataset
    ]
    if vectorizer is None:
        return processed_corpus

    X = self.vectorizer.fit_transform(processed_corpus)
    X = X.toarray()
    if normalize:
        X = preprocessing.normalize(X)

    if lex:
        # four lexical richness scores per document; fall back to 0.0 when a
        # measure cannot be computed (e.g., empty text)
        lex_features = []
        for text in dataset:
            lr = LexicalRichness(text)
            li = []
            for measure in (lambda: lr.ttr, lambda: lr.rttr, lambda: lr.cttr,
                            lambda: lr.mtld(threshold=0.72)):
                try:
                    li.append(measure())
                except Exception:
                    li.append(0.0)
            lex_features.append(li)
        lex_features = np.array(lex_features)
        if normalize:
            lex_features = preprocessing.normalize(lex_features)
        X = np.concatenate((X, lex_features), axis=1)

    if text_features:
        # simple surface statistics per document
        _text_features = []
        for text in dataset:
            li = [
                self.countChar(text),
                self.countCharWithoutSpace(text),
                self.countSentences(text),
                self.avarageSentenceLength(text),
                self.maxSentenceLength(text),
                self.minSentenceLength(text),
            ]
            _text_features.append(li)
        X = np.concatenate((X, np.array(_text_features)), axis=1)

    # print(len(self.vectorizer.get_feature_names()), '- Vocabulary\n\n')
    return X
def compute_lexical_richness(events,
                             by=["event_type"],
                             extent=["discontiguous_triggers"],
                             preproc=None):
    """Compute lexical richness measures of unit attributes.

    event_type extracts the mention tokens.

    :param events: list of Event objects
    :param by: group metric by attribute name, e.g. by event_type or subtype
    :param extent: extent of the text getter functions on Event.
        Default: full event trigger including discontiguous spans.
    :param preproc: list of preprocessing functions that take a string of text as input.
    :return: DataFrame of lexical richness metrics per attribute group
    """
    from lexicalrichness import LexicalRichness

    print(f"Computing lexical richness of {str(extent).upper()} grouped by "
          f"{str(by).upper()} with preprocessing: {str(preproc)}")

    # collect text by attribute
    all_text = {}
    for attrib_name, g in groupby(
            events, key=lambda x: tuple(getattr(x, attrib_n) for attrib_n in by)):
        attrib_name = ".".join(str(a) for a in attrib_name)
        for event in g:
            text = event.get_extent_text(extent=extent)
            if preproc:
                for preproc_func in preproc:
                    text = preproc_func(text)
            all_text.setdefault(attrib_name, []).append(text)

    # compute lexical diversity by attribute
    d = []
    for attrib_name, text in all_text.items():
        # Mean mention type-token ratio (a variant of mean segment TTR,
        # Johnsson 1944, with mentions as segments) was dropped: mention-level
        # TTR is nearly always 1.

        # lexical entropy over mention strings
        p, lns = Counter(text), float(len(text))
        entropy = -sum(count / lns * math.log(count / lns, 2)
                       for count in p.values())

        # metrics on all mentions joined together
        text = " ".join(text)
        lr = LexicalRichness(text)
        d.append({
            "annotation_type": attrib_name,
            "cttr": lr.cttr,
            "entropy": entropy,
            "dugast": lr.Dugast,
            "type_count": lr.terms,
            "token_count": lr.words,
            "herdan": lr.Herdan,
            "somers": lr.Summer,
            "maas": lr.Maas,  # low sensitivity
            "ttr": lr.ttr,
            "rttr": lr.rttr,
            "mtld": lr.mtld(threshold=0.72),  # length-corrected, mid sensitivity
            "msttr": lr.msttr(segment_window=25),  # length-corrected, mid sensitivity
            "mattr": lr.mattr(window_size=25),  # length-corrected, mid sensitivity
            "hdd": lr.hdd(draws=42),  # length-corrected, high sensitivity
        })

    df_lr = pd.DataFrame(d)
    # invert Maas for plotting (lower Maas means more richness)
    df_lr["maas_inv"] = df_lr["maas"] * -1.0
    rec_metrics = ["maas", "hdd", "mtld"]  # metrics recommended in McCarthy 2010

    # add rank columns for easy comparison
    df_lr = util.rank_dataframe_column(df_lr, ascending=False)
    df_lr["maas_rank"] = df_lr["maas"].rank().astype(int)  # Maas is inverted: lower score means more richness
    df_lr = df_lr.drop(labels=["annotation_type_rank"], axis=1)  # no need to rank the index column

    # nicer output
    df_lr = df_lr.sort_index(axis=1)  # sort columns alphabetically
    rank_cols = [c for c in df_lr if "_rank" in c and "_count" not in c]
    df_lr["rank_all"] = df_lr[rank_cols].sum(axis=1).rank().astype(int)  # combined rank over every metric
    df_lr["rank_maas_hdd_mtld"] = (df_lr[[m + "_rank" for m in rec_metrics]]
                                   .sum(axis=1).rank().astype(int))  # combined rank over recommended metrics
    df_lr = df_lr.set_index("annotation_type")
    # sort by the combination of metrics recommended in McCarthy 2010
    df_lr = df_lr.sort_values(by="rank_maas_hdd_mtld")
    return df_lr
def preprocess(df_total):
    """Preprocess article text: average sentence length, tag frequencies, POS tagging."""
    # cleaning text
    df_total["text"] = df_total.text.apply(lambda x: x.lower())
    # table = str.maketrans('', '', string.punctuation)
    # df_total["text"] = df_total.text.apply(lambda x: x.translate(table))
    df_total["text"] = df_total.text.apply(lambda x: re.sub(r'\d+', 'num', x))
    # expanding "U.S." (the text is already lowercased, and the dots are
    # escaped so they do not match arbitrary characters)
    df_total["little_clean"] = df_total.text.apply(
        lambda x: re.sub(r"u\.s\.", "united states", x))
    # no-op translation table: punctuation is kept so the '.' tag can be counted below
    table_ = str.maketrans('', '')
    df_total['cleaned_text'] = df_total.text.str.translate(table_)

    # ******* SYNTACTIC FEATURES *******#
    # splitting articles into sentences
    df_total["sentences"] = df_total.little_clean.str.split(r"\. ")
    # number of sentences in each article
    df_total["num_of_sentences"] = df_total.sentences.apply(len)
    # average sentence length (in characters)
    df_total["avg_sentence_length"] = df_total.sentences.apply(
        lambda x: round(np.mean([len(item) for item in x])))
    # POS tagging
    df_total['POS_tags'] = df_total.cleaned_text.apply(
        lambda x: nltk.pos_tag(nltk.word_tokenize(x), tagset='universal'))
    # frequency of tags
    df_total["tag_fq"] = df_total.POS_tags.apply(
        lambda x: nltk.FreqDist(tag for (word, tag) in x))
    # count of each universal tag in each article
    tag_columns = {
        'Noun': 'NOUN', 'Verb': 'VERB', 'Punctuation': '.',
        'Adposition': 'ADP', 'Determiner': 'DET', 'Adjective': 'ADJ',
        'Particle': 'PRT', 'Adverb': 'ADV', 'Pronoun': 'PRON',
        'Conjunction': 'CONJ', 'Numeral': 'NUM', 'Other': 'X',
    }
    for col, tag in tag_columns.items():
        df_total[col] = df_total.tag_fq.apply(
            lambda x, t=tag: pos(x.most_common())[t])

    # ********* LEXICAL FEATURES **********#
    # character count
    df_total['characters_count'] = df_total.text.str.len()
    # keep only sufficiently long texts
    df_total = df_total.loc[df_total.characters_count >= 100]
    # average word length
    df_total['word_average'] = df_total['text'].apply(
        lambda x: np.mean([len(w) for w in x.split(' ')]))
    # lexical diversity
    df_total['lexical_diversity'] = df_total.text.apply(
        lambda x: ld.ttr([w for w in x.split(' ')]))
    # lexical richness
    df_total['lex_words'] = df_total.text.apply(lambda x: LexicalRichness(x).words)
    df_total['lex_uniquewords'] = df_total.text.apply(lambda x: LexicalRichness(x).terms)
    df_total['lex_ttr'] = df_total.text.apply(
        lambda x: LexicalRichness(x).ttr)  # type-token ratio

    # ********* PSYCHOLINGUISTIC FEATURES **********#
    # sentiment score
    analyser = SentimentIntensityAnalyzer()
    df_total['sentiment_score'] = df_total.text.apply(
        lambda x: analyser.polarity_scores(x)['compound'])
    return df_total
def unique_words(row):
    text = row['reviewText']
    lex = LexicalRichness(text)
    return lex.terms
text_data = np.array(df.iloc[:, 3])
pod_id = np.array(df.iloc[:, 0])
speaker_id = np.array(df.iloc[:, 2])

# measure complexity & record results in an Excel sheet
instance_ids = []
ttr = []
mtld = []
number_of_words = []
readability = []
unique_words = []
avg_sentence_length = []
avg_word_length = []

for i, instance in enumerate(text_data):
    # lexical richness of the entry before lemmatization and stopword removal
    lex_with_stopwords = LexicalRichness(instance)
    number_of_words.append(lex_with_stopwords.words)

    # mean sentence length
    mean_sentence_len = int(lex_with_stopwords.words / textstat.sentence_count(instance))
    avg_sentence_length.append(mean_sentence_len)

    # mean word length
    num_chars = sum([len(w) for w in tokenizer.tokenize(instance)])
    mean_word_len = round(num_chars / lex_with_stopwords.words, 1)
    avg_word_length.append(mean_word_len)

    # readability
    readability.append(textstat.flesch_reading_ease(instance))
def process_paper(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    name = file_path.split("/")[-1].split(".")[0]
    start = detect_start(text, conferences[name]["title"])
    end = detect_end(text)
    text = char_re.sub("", text[start:end])
    doc = nlp(text)
    lr = LexicalRichness(text)
    document = {
        "n_chars": len(text),
        "n_tokens": 0,
        "n_sentences": 0,
        "word_lengths": {},
        "sentence_lengths": {},
        "sentence_tokens": {},
        "punctuation": {},
        "function_words": {},
        "tokens": [],
        "tags": [],
        "pos": [],
        "metrics": {},
    }
    # attribute-style metrics first, then method-style metrics; 0 when undefined
    for metric in ["cttr", "rttr", "ttr", "Dugast", "Herdan", "Maas", "Summer"]:
        try:
            document["metrics"][metric] = getattr(lr, metric)
        except Exception:
            document["metrics"][metric] = 0
    for metric in ["hdd", "mattr", "msttr", "mtld"]:
        try:
            document["metrics"][metric] = getattr(lr, metric)()
        except Exception:
            document["metrics"][metric] = 0
    for sent in doc.sents:
        for token in sent:
            document["tokens"].append(token.lemma_)
            document["pos"].append(token.pos_)
            document["tags"].append(token.tag_)
            document["word_lengths"].setdefault(len(token), 0)
            document["word_lengths"][len(token)] += 1
            document["n_tokens"] += 1
            if token.lemma_ in function_words:
                document["function_words"].setdefault(token.lemma_, 0)
                document["function_words"][token.lemma_] += 1
        document["sentence_lengths"].setdefault(len(sent.text), 0)
        document["sentence_lengths"][len(sent.text)] += 1
        document["sentence_tokens"].setdefault(len(sent), 0)
        document["sentence_tokens"][len(sent)] += 1
        document["n_sentences"] += 1
    for char in string.punctuation:
        document["punctuation"][char] = text.count(char)
    with open(f"../data/pkls/{name}.pkl", "wb") as file:
        pickle.dump(document, file)
scores = {}
for result in cur.fetchall():
    # each result consists of an ideology name as its first element and an
    # array of words associated with that ideology as its second element:
    # ("communism", ["lots", "of", "communist", "words"])
    ideology = result[0]
    # truncate to smallest_wordlist_length so the STTR comparison is fair
    words = result[1][:smallest_wordlist_length]
    words_big_string = " ".join(words)
    scores[ideology] = LexicalRichness(words_big_string).ttr

# create chart and save to disk
# Michael is going to scold me for abusing comprehensions;
# I swear they are appropriate here
x_names = [ideology.capitalize() for ideology in scores]
y_values = [score for ideology, score in scores.items()]
y_pos = np.arange(len(x_names))

# plt.style.use('dark_background')
plt.bar(y_pos, y_values, align='center', color="#330033", alpha=0.7)
plt.ylim(bottom=.27, top=.285)
plt.xticks(y_pos, x_names)
plt.ylabel('Type Token Ratio')
plt.title('Lexical Richness by Ideology')
def __init__(self):
    self.measures = {}
    self.lex = LexicalRichness("")
def calc_ttr(text):
    try:
        return LexicalRichness(text).ttr
    except Exception:
        # empty or invalid text: TTR is undefined, fall back to 0
        return 0
def get_complexity(lyrics):
    # 1 - TTR: higher score means more repeated words in the lyrics
    score = LexicalRichness(lyrics)
    return 1 - score.ttr
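# Hedged example of the complexity score above (sample lyrics are made up):
# repetitive lyrics score higher because 1 - TTR grows with repetition.
from lexicalrichness import LexicalRichness

print(get_complexity("la la la la"))                   # 0.75 -- highly repetitive
print(get_complexity("every word here is different"))  # 0.0  -- no repetition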