def compute_mtld(sentences):
    ''' Computes the MTLD
    
        :param sentences: sentences
    
        :returns: The MTLD (float)
    '''

    ll = ' '.join(sentences)
    return ld.mtld(ll)
def build_aux_metrics(filename_series, doc_series):
	lex_vol = []; ttr = []; mtld = []; vocd = []  # lexical div measures
	neg_mean = []; neu_mean = []; pos_mean = []; compound_mean = []
	neg_std = []; neu_std = []; pos_std = []; compound_std = []    
	filename = []  # sentiment measures

	for i0 in range(len(doc_series)):

		filename0 = filename_series.iloc[i0]; filename0
		doc0 = doc_series.iloc[i0]; doc0
		doc0_list = nltk.sent_tokenize(doc0); doc0_list
		doc0_string = " ".join(doc0_list); doc0_string
		n1 = len(doc0_list); n1

		if n1 > 1:
			vs_list = []	
			for i1 in range(n1):
				sent0 = doc0_list[i1]
				vs0 = analyzer.polarity_scores(sent0); vs0
				vs_list.append(vs0)
	
			doc0_df = pd.DataFrame(vs_list); doc0_df	
			mean_list0 = [x for x in doc0_df.mean()]; mean_list0
			std_list0 = [x for x in doc0_df.std()]; std_list0

		else:
			mean_list0 = [float(0) for x in range(4)]; mean_list0
			std_list0 = [float(0) for x in range(4)]; std_list0

		neg_mean.append(mean_list0[0]); neu_mean.append(mean_list0[1])
		pos_mean.append(mean_list0[2]); compound_mean.append(mean_list0[3])                        		
		neg_std.append(std_list0[0]); neu_std.append(std_list0[1])
		pos_std.append(std_list0[2]); compound_std.append(std_list0[3])                        
		filename.append(filename0)

		flt = ld.flemmatize(doc0_string); flt
		lex_vol0 = len(flt)  # lexical volume measure
		ttr0 = ld.ttr(flt)  # basic Text-Type Ratio or TTR
		mtld0 = ld.mtld(flt) # Measure of Textual Lexical Diversity (MTLD) for lexical variability
		vocd0 = ld.hdd(flt) # vocd or Hypergeometric distribution D (HDD), as per McCarthy and Jarvis (2007, 2010)

		lex_vol.append(lex_vol0)
		ttr.append(ttr0)
		mtld.append(mtld0)
		vocd.append(vocd0)

		if i0%5000 == 0:
			print(i0)

	# save as df
	df1 = pd.DataFrame({'filename':filename, 
                     'senti_neg': neg_mean, 'senti_neu': neu_mean, 'senti_pos': pos_mean, 'senti_compound': compound_mean,
                     'senti_neg_std': neg_std, 'senti_neu_std': neu_std, 'senti_pos_std': pos_std, 'senti_compound_std': compound_std,
                      'lex_vol':lex_vol, 'ttr':ttr, 'mtld':mtld, 'vocd':vocd})
	return(df1)
def build_aux_metrics1(filename_series, doc_series):
	lex_vol = []; mtld = []; # lexical div measures
	compound_mean = []; compound_std = [] # sentiment measures    	
	filename = []; #hyp_relev_num =[]  

	for i0 in range(len(doc_series)):

		filename0 = filename_series.iloc[i0]; filename0
		doc0 = doc_series.iloc[i0]; doc0
		doc0_list = nltk.sent_tokenize(doc0); doc0_list
		doc0_string = " ".join(doc0_list); doc0_string
		n1 = len(doc0_list); n1

		if n1 > 1:
			vs_list = []	
			for i1 in range(n1):
				sent0 = doc0_list[i1]
				vs0 = analyzer.polarity_scores(sent0); vs0
				vs_list.append(vs0)
	
			doc0_df = pd.DataFrame(vs_list); doc0_df	
			mean_list0 = [x for x in doc0_df.mean()]; mean_list0
			std_list0 = [x for x in doc0_df.std()]; std_list0

		else:
			mean_list0 = [float(0) for x in range(4)]; mean_list0
			std_list0 = [float(0) for x in range(4)]; std_list0

		compound_mean.append(mean_list0[3]); compound_std.append(std_list0[3])                        		
		filename.append(filename0)

		flt = ld.flemmatize(str(doc0_string)); flt
		lex_vol0 = len(flt)  # lexical volume measure
		mtld0 = ld.mtld(flt) # Measure of Textual Lexical Diversity (MTLD) for lexical variability

		lex_vol.append(lex_vol0)
		mtld.append(mtld0)

		if i0%5000 == 0:
			print(i0)

	# save as df
	df1 = pd.DataFrame({'filename':filename, 'senti_compound': compound_mean, 'senti_compound_std': compound_std,
                      'lex_vol':lex_vol, 'mtld':mtld})
	return(df1)
 def lexdiv(self, text):
     self.lexical_diversity = float('{:.2f}'.format(
         ld.mtld(ld.flemmatize(text))))
     pass
def get_news_features(headline, text):

    nlp = es_core_news_md.load()

    ## headline ##
    headline = re.sub(r"http\S+", "", headline)
    headline = re.sub(r"http", "", headline)
    headline = re.sub(r"@\S+", "", headline)
    headline = re.sub("\n", " ", headline)
    headline = re.sub(r"(?<!\n)\n(?!\n)", " ", headline)
    headline = headline.replace(r"*NUMBER*", "número")
    headline = headline.replace(r"*PHONE*", "número")
    headline = headline.replace(r"*EMAIL*", "email")
    headline = headline.replace(r"*URL*", "url")
    headline_lower = headline.lower()
    doc_h = nlp(headline_lower)

    list_tokens_h = []
    list_tags_h = []

    for sentence_h in doc_h.sents:
        for token in sentence_h:
            list_tokens_h.append(token.text)

    fdist_h = FreqDist(list_tokens_h)
    syllables_h = get_nsyllables(headline)
    words_h = len(list_tokens_h)

    # headline complexity features
    avg_word_size_h = round(
        sum(len(word) for word in list_tokens_h) / words_h, 2)
    avg_syllables_word_h = round(syllables_h / words_h, 2)
    unique_words_h = round((len(fdist_h.hapaxes()) / words_h) * 100, 2)
    mltd_h = round(ld.mtld(list_tokens_h), 2)
    ttr_h = round(ld.ttr(list_tokens_h) * 100, 2)

    ## text content##
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"http", "", text)
    text = re.sub("\n", " ", text)
    text = text.replace(r"*NUMBER*", "número")
    text = text.replace(r"*PHONE*", "número")
    text = text.replace(r"*EMAIL*", "email")
    text = text.replace(r"*URL*", "url")

    # to later calculate upper case letters ratio
    alph = list(filter(str.isalpha, text))
    text_lower = text.lower()
    doc = nlp(text_lower)

    list_tokens = []
    list_pos = []
    list_tag = []
    list_entities = []
    sents = 0

    for entity in doc.ents:
        list_entities.append(entity.label_)

    for sentence in doc.sents:
        sents += 1
        for token in sentence:
            list_tokens.append(token.text)
            list_pos.append(token.pos_)
            list_tag.append(token.tag_)

    # Calculate entities, pos, tag, freq, syllables, words and quotes
    entities = len(list_entities)
    n_pos = nltk.Counter(list_pos)
    n_tag = nltk.Counter(list_tag)
    fdist = FreqDist(list_tokens)
    syllables = get_nsyllables(text)
    words = len(list_tokens)
    quotes = n_tag['PUNCT__PunctType=Quot']

    # complexity features
    avg_word_sentence = round(words / sents, 2)
    avg_word_size = round(sum(len(word) for word in list_tokens) / words, 2)
    avg_syllables_word = round(syllables / words, 2)
    unique_words = round((len(fdist.hapaxes()) / words) * 100, 2)
    ttr = round(ld.ttr(list_tokens) * 100, 2)

    # readability spanish test
    huerta_score = round(
        206.84 - (60 * avg_syllables_word) - (1.02 * avg_word_sentence), 2)
    szigriszt_score = round(
        206.835 - ((62.3 * syllables) / words) - (words / sents), 2)

    # stylometric features
    mltd = round(ld.mtld(list_tokens), 2)
    upper_case_ratio = round(sum(map(str.isupper, alph)) / len(alph) * 100, 2)
    entity_ratio = round((entities / words) * 100, 2)
    quotes_ratio = round((quotes / words) * 100, 2)
    propn_ratio = round((n_pos['PROPN'] / words) * 100, 2)
    noun_ratio = round((n_pos['NOUN'] / words) * 100, 2)
    pron_ratio = round((n_pos['PRON'] / words) * 100, 2)
    adp_ratio = round((n_pos['ADP'] / words) * 100, 2)
    det_ratio = round((n_pos['DET'] / words) * 100, 2)
    punct_ratio = round((n_pos['PUNCT'] / words) * 100, 2)
    verb_ratio = round((n_pos['VERB'] / words) * 100, 2)
    adv_ratio = round((n_pos['ADV'] / words) * 100, 2)
    sym_ratio = round((n_tag['SYM'] / words) * 100, 2)

    # create df_features
    df_features = pd.DataFrame({
        'text': text_lower,
        'headline': headline_lower,
        'words_h': words_h,
        'word_size_h': [avg_word_size_h],
        'avg_syllables_word_h': [avg_syllables_word_h],
        'unique_words_h': [unique_words_h],
        'ttr_h': ttr_h,
        'mltd_h': [mltd_h],
        'sents': sents,
        'words': words,
        'avg_words_sent': [avg_word_sentence],
        'avg_word_size': [avg_word_size],
        'avg_syllables_word': avg_syllables_word,
        'unique_words': [unique_words],
        'ttr': [ttr],
        'huerta_score': [huerta_score],
        'szigriszt_score': [szigriszt_score],
        'mltd': [mltd],
        'upper_case_ratio': [upper_case_ratio],
        'entity_ratio': [entity_ratio],
        'quotes': quotes,
        'quotes_ratio': [quotes_ratio],
        'propn_ratio': [propn_ratio],
        'noun_ratio': [noun_ratio],
        'pron_ratio': [pron_ratio],
        'adp_ratio': [adp_ratio],
        'det_ratio': [det_ratio],
        'punct_ratio': [punct_ratio],
        'verb_ratio': [verb_ratio],
        'adv_ratio': [adv_ratio],
        'sym_ratio': [sym_ratio]
    })

    return df_features
Ejemplo n.º 6
0
 def lexical_mtld(tokens):
     return lex_div.mtld(tokens)
Ejemplo n.º 7
0
 def mtld_ld(text):
     tokens = BasicMetrics.trueTokens(text)
     return ld.mtld(tokens)