Example No. 1
def LexicalDiversity(text):
    tok = ld.tokenize(text)
    basic = len(tok) / len(set(tok))  # mean tokens per type (the inverse of the simple TTR)
    SimpleTTR = ld.ttr(tok)
    RootTTR = ld.root_ttr(tok)  # lexical_diversity(text)[2] # in my view the best lexical diversity measure
    LogTTR = ld.log_ttr(tok)
    return basic, SimpleTTR, RootTTR, LogTTR
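A minimal usage sketch for the function above, assuming the lexical_diversity package's lex_div module is imported under the alias ld that the excerpt uses; the sample sentence is made up:

from lexical_diversity import lex_div as ld

sample = "the cat sat on the mat and the dog sat on the rug"
basic, simple_ttr, root_ttr, log_ttr = LexicalDiversity(sample)
print(round(basic, 3), round(simple_ttr, 3), round(root_ttr, 3), round(log_ttr, 3))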
def build_aux_metrics(filename_series, doc_series):
    # lexical diversity measures
    lex_vol = []; ttr = []; mtld = []; vocd = []
    # sentence-level sentiment measures (VADER)
    neg_mean = []; neu_mean = []; pos_mean = []; compound_mean = []
    neg_std = []; neu_std = []; pos_std = []; compound_std = []
    filename = []

    for i0 in range(len(doc_series)):

        filename0 = filename_series.iloc[i0]
        doc0 = doc_series.iloc[i0]
        doc0_list = nltk.sent_tokenize(doc0)
        doc0_string = " ".join(doc0_list)
        n1 = len(doc0_list)  # number of sentences

        if n1 > 1:
            vs_list = []
            for i1 in range(n1):
                sent0 = doc0_list[i1]
                vs0 = analyzer.polarity_scores(sent0)
                vs_list.append(vs0)

            doc0_df = pd.DataFrame(vs_list)
            mean_list0 = [x for x in doc0_df.mean()]
            std_list0 = [x for x in doc0_df.std()]

        else:
            # single-sentence documents get zero means and standard deviations
            mean_list0 = [float(0) for x in range(4)]
            std_list0 = [float(0) for x in range(4)]

        neg_mean.append(mean_list0[0]); neu_mean.append(mean_list0[1])
        pos_mean.append(mean_list0[2]); compound_mean.append(mean_list0[3])
        neg_std.append(std_list0[0]); neu_std.append(std_list0[1])
        pos_std.append(std_list0[2]); compound_std.append(std_list0[3])
        filename.append(filename0)

        flt = ld.flemmatize(doc0_string)
        lex_vol0 = len(flt)  # lexical volume measure
        ttr0 = ld.ttr(flt)  # basic type-token ratio (TTR)
        mtld0 = ld.mtld(flt)  # Measure of Textual Lexical Diversity (MTLD) for lexical variability
        vocd0 = ld.hdd(flt)  # vocd / hypergeometric distribution D (HD-D), per McCarthy and Jarvis (2007, 2010)

        lex_vol.append(lex_vol0)
        ttr.append(ttr0)
        mtld.append(mtld0)
        vocd.append(vocd0)

        if i0 % 5000 == 0:
            print(i0)

    # save as df
    df1 = pd.DataFrame({'filename': filename,
                        'senti_neg': neg_mean, 'senti_neu': neu_mean, 'senti_pos': pos_mean, 'senti_compound': compound_mean,
                        'senti_neg_std': neg_std, 'senti_neu_std': neu_std, 'senti_pos_std': pos_std, 'senti_compound_std': compound_std,
                        'lex_vol': lex_vol, 'ttr': ttr, 'mtld': mtld, 'vocd': vocd})
    return df1
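A usage sketch for build_aux_metrics, assuming the module-level names the excerpt relies on (nltk, pandas as pd, a VADER SentimentIntensityAnalyzer bound to analyzer, and lexical_diversity's lex_div as ld); the two-document series below are made up:

import nltk
import pandas as pd
from lexical_diversity import lex_div as ld
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nltk.download("punkt")                   # tokenizer model used by nltk.sent_tokenize
analyzer = SentimentIntensityAnalyzer()  # module-level name the function expects

docs = pd.DataFrame({
    "filename": ["a.txt", "b.txt"],
    "text": ["Great product. Works well. Highly recommended.",
             "Terrible support. The device failed after a week."],
})

metrics = build_aux_metrics(docs["filename"], docs["text"])
print(metrics[["filename", "senti_compound", "ttr", "mtld"]])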
Example No. 3
def extract_lexical_features(Authors):
    '''
    Extract the readability and type-token ratio (TTR) features.
    Takes a dictionary of authors as input and returns the modified version.
    '''
    # On raw text, get average grade level of the tweets
    for author in Authors.keys():
        Authors[author].readability = 0
        for tweet in Authors[author].tweets:
            Authors[author].readability += (textstat.text_standard(tweet, float_output=True) / len(Authors[author].tweets))  # running average of the tweets' grade level
    
    # On lemmatized text, get the TTR to determine the lexical diversity
    for author in Authors.keys():
        Authors[author].TTR = ld.ttr(Authors[author].clean)

    return Authors
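A usage sketch, assuming a minimal author record with the attributes the excerpt reads (tweets, clean) and writes (readability, TTR); the class and data below are hypothetical stand-ins:

import textstat
from lexical_diversity import lex_div as ld

class Author:
    # minimal stand-in for the project's author record (assumed shape)
    def __init__(self, tweets):
        self.tweets = tweets                          # raw tweet strings
        self.clean = ld.flemmatize(" ".join(tweets))  # lemmatized tokens, since ld.ttr expects a token list

Authors = {"user1": Author(["I love this coffee shop.", "Back again for more coffee!"])}
Authors = extract_lexical_features(Authors)
print(Authors["user1"].readability, Authors["user1"].TTR)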
Example No. 4
# per-author word-length range; the slice i*100:(i+1)*100 covers the full block of 100 tweets
len_word_rng_auth = [max(len_tw_word[i*100:(i+1)*100]) - min(len_tw_word[i*100:(i+1)*100]) for
                     i in range(int(len(len_tw_word)/100))]

len_char_mean_auth = [np.mean(len_tw_char[i*100:(i+1)*100]) for i in range(int(len(len_tw_char)/100))]
len_word_mean_auth = [np.mean(len_tw_word[i*100:(i+1)*100]) for i in range(int(len(len_tw_word)/100))]

##########
#
# vocab variety (TTR)
#

# one concatenated string of tweets per author (szerz = author)
tweets_szerz = [" ".join(list(es_data["Tweets"])[i*100:(i+1)*100]) for
                i in range(int(len(len_tw_char)/100))]


ttr_szerz = [ld.ttr(ld.flemmatize(i)) for i in tweets_szerz]


##########
#
# tags
#

#RT
rt_szerz = [np.sum([k == "RT" for k in i.split(" ")]) for i in tweets_szerz]

#URL
url_szerz = [np.sum([k == "#URL#" for k in i.split(" ")]) for i in tweets_szerz]

#hashtag
hsg_szerz = [np.sum([k == "#HASHTAG#" for k in i.split(" ")]) for i in tweets_szerz]
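The per-author statistics above slice flat per-tweet lists into blocks of 100 tweets per author. A sketch of how those inputs might look, with names and shapes assumed from the excerpt (es_data holding one tweet per row, 100 consecutive rows per author):

import numpy as np
import pandas as pd
from lexical_diversity import lex_div as ld

# hypothetical input: 2 authors x 100 tweets, one tweet per row
es_data = pd.DataFrame({"Tweets": ["RT ejemplo de tuit #HASHTAG# #URL#"] * 200})

len_tw_char = [len(t) for t in es_data["Tweets"]]             # characters per tweet
len_tw_word = [len(t.split(" ")) for t in es_data["Tweets"]]  # words per tweet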
def get_news_features(headline, text):

    nlp = es_core_news_md.load()

    ## headline ##
    headline = re.sub(r"http\S+", "", headline)
    headline = re.sub(r"http", "", headline)
    headline = re.sub(r"@\S+", "", headline)
    headline = re.sub("\n", " ", headline)
    headline = re.sub(r"(?<!\n)\n(?!\n)", " ", headline)
    headline = headline.replace(r"*NUMBER*", "número")
    headline = headline.replace(r"*PHONE*", "número")
    headline = headline.replace(r"*EMAIL*", "email")
    headline = headline.replace(r"*URL*", "url")
    headline_lower = headline.lower()
    doc_h = nlp(headline_lower)

    list_tokens_h = []

    for sentence_h in doc_h.sents:
        for token in sentence_h:
            list_tokens_h.append(token.text)

    fdist_h = FreqDist(list_tokens_h)
    syllables_h = get_nsyllables(headline)
    words_h = len(list_tokens_h)

    # headline complexity features
    avg_word_size_h = round(
        sum(len(word) for word in list_tokens_h) / words_h, 2)
    avg_syllables_word_h = round(syllables_h / words_h, 2)
    unique_words_h = round((len(fdist_h.hapaxes()) / words_h) * 100, 2)
    mltd_h = round(ld.mtld(list_tokens_h), 2)
    ttr_h = round(ld.ttr(list_tokens_h) * 100, 2)

    ## text content##
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"http", "", text)
    text = re.sub("\n", " ", text)
    text = text.replace(r"*NUMBER*", "número")
    text = text.replace(r"*PHONE*", "número")
    text = text.replace(r"*EMAIL*", "email")
    text = text.replace(r"*URL*", "url")

    # to later calculate upper case letters ratio
    alph = list(filter(str.isalpha, text))
    text_lower = text.lower()
    doc = nlp(text_lower)

    list_tokens = []
    list_pos = []
    list_tag = []
    list_entities = []
    sents = 0

    for entity in doc.ents:
        list_entities.append(entity.label_)

    for sentence in doc.sents:
        sents += 1
        for token in sentence:
            list_tokens.append(token.text)
            list_pos.append(token.pos_)
            list_tag.append(token.tag_)

    # Calculate entities, pos, tag, freq, syllables, words and quotes
    entities = len(list_entities)
    n_pos = nltk.Counter(list_pos)
    n_tag = nltk.Counter(list_tag)
    fdist = FreqDist(list_tokens)
    syllables = get_nsyllables(text)
    words = len(list_tokens)
    quotes = n_tag['PUNCT__PunctType=Quot']

    # complexity features
    avg_word_sentence = round(words / sents, 2)
    avg_word_size = round(sum(len(word) for word in list_tokens) / words, 2)
    avg_syllables_word = round(syllables / words, 2)
    unique_words = round((len(fdist.hapaxes()) / words) * 100, 2)
    ttr = round(ld.ttr(list_tokens) * 100, 2)

    # Spanish readability tests: Fernández Huerta and Szigriszt-Pazos scores
    huerta_score = round(
        206.84 - (60 * avg_syllables_word) - (1.02 * avg_word_sentence), 2)
    szigriszt_score = round(
        206.835 - ((62.3 * syllables) / words) - (words / sents), 2)

    # stylometric features
    mltd = round(ld.mtld(list_tokens), 2)
    upper_case_ratio = round(sum(map(str.isupper, alph)) / len(alph) * 100, 2)
    entity_ratio = round((entities / words) * 100, 2)
    quotes_ratio = round((quotes / words) * 100, 2)
    propn_ratio = round((n_pos['PROPN'] / words) * 100, 2)
    noun_ratio = round((n_pos['NOUN'] / words) * 100, 2)
    pron_ratio = round((n_pos['PRON'] / words) * 100, 2)
    adp_ratio = round((n_pos['ADP'] / words) * 100, 2)
    det_ratio = round((n_pos['DET'] / words) * 100, 2)
    punct_ratio = round((n_pos['PUNCT'] / words) * 100, 2)
    verb_ratio = round((n_pos['VERB'] / words) * 100, 2)
    adv_ratio = round((n_pos['ADV'] / words) * 100, 2)
    sym_ratio = round((n_tag['SYM'] / words) * 100, 2)

    # create df_features
    df_features = pd.DataFrame({
        'text': text_lower,
        'headline': headline_lower,
        'words_h': words_h,
        'word_size_h': [avg_word_size_h],
        'avg_syllables_word_h': [avg_syllables_word_h],
        'unique_words_h': [unique_words_h],
        'ttr_h': ttr_h,
        'mltd_h': [mltd_h],
        'sents': sents,
        'words': words,
        'avg_words_sent': [avg_word_sentence],
        'avg_word_size': [avg_word_size],
        'avg_syllables_word': avg_syllables_word,
        'unique_words': [unique_words],
        'ttr': [ttr],
        'huerta_score': [huerta_score],
        'szigriszt_score': [szigriszt_score],
        'mltd': [mltd],
        'upper_case_ratio': [upper_case_ratio],
        'entity_ratio': [entity_ratio],
        'quotes': quotes,
        'quotes_ratio': [quotes_ratio],
        'propn_ratio': [propn_ratio],
        'noun_ratio': [noun_ratio],
        'pron_ratio': [pron_ratio],
        'adp_ratio': [adp_ratio],
        'det_ratio': [det_ratio],
        'punct_ratio': [punct_ratio],
        'verb_ratio': [verb_ratio],
        'adv_ratio': [adv_ratio],
        'sym_ratio': [sym_ratio]
    })

    return df_features
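A usage sketch, assuming the module-level pieces the excerpt relies on (spaCy's es_core_news_md model, nltk, FreqDist, pandas as pd, lexical_diversity's lex_div as ld, and a project helper get_nsyllables for Spanish syllable counts, roughly approximated here); the headline and text are made up:

import re
import nltk
import pandas as pd
import es_core_news_md                     # python -m spacy download es_core_news_md
from nltk import FreqDist
from lexical_diversity import lex_div as ld

def get_nsyllables(text):
    # crude stand-in for the project's syllable counter: count Spanish vowel groups
    return len(re.findall(r"[aeiouáéíóúü]+", text.lower()))

headline = "El gobierno anuncia nuevas medidas económicas"
text = ("El gobierno anunció hoy un paquete de medidas económicas. "
        "Los expertos consideran que el impacto será limitado.")

features = get_news_features(headline, text)
print(features[["words", "ttr", "mltd", "huerta_score", "szigriszt_score"]])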
    def lex_div(self, text):

        token = ld.tokenize(text)
        return ld.ttr(token)
Example No. 7
def lexical_ttr(tokens):
    return lex_div.ttr(tokens)
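A quick usage sketch, assuming the un-aliased import from lexical_diversity that the one-liner refers to:

from lexical_diversity import lex_div

tokens = lex_div.flemmatize("the cats were chasing the other cats")
print(lexical_ttr(tokens))  # type-token ratio of the lemmatized tokens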
Example No. 8
def preprocess(df_total):
    """Preprocessing article text : avergae length of sentence,
     frequency of tags, POS tagging"""
    # Cleaning text
    df_total["text"] = df_total.text.apply(lambda x: x.lower())
    # table = str.maketrans('', '', string.punctuation)
    # df_total["text"] = df_total.text.apply(lambda x: x.translate(table))
    df_total["text"] = df_total.text.apply(lambda x: re.sub(r'\d+', 'num', x))

    # substituting "U.S."
    df_total["little_clean"] = df_total.text.apply(
        lambda x: re.sub("U.S.", "United States", x))

    # cleaning text (empty translation table: punctuation is kept, since the '.' tag counts are computed below)
    table_ = str.maketrans('', '')
    df_total['cleaned_text'] = df_total.text.str.translate(table_)

    # *******SYNTACTIC FEATURES *******#

    # splitting articles into sentences (naive split on ". ")
    df_total["sentences"] = df_total.little_clean.str.split(r"\. ")

    # calculating num of sentences in each article
    df_total["num_of_sentences"] = df_total.sentences.apply(lambda x: len(x))

    # average length of sentences
    df_total["avg_sentence_length"] = df_total.sentences.apply(
        lambda x: round(np.mean([len(item) for item in x])))

    # POS Tagging
    df_total['POS_tags'] = df_total.cleaned_text.apply(
        lambda x: nltk.pos_tag(nltk.word_tokenize(x), tagset='universal'))

    # frequency of tags
    df_total["tag_fq"] = df_total.POS_tags.apply(
        lambda x: nltk.FreqDist(tag for (word, tag) in x))

    # count of each tag in each article
    # (pos() is a project helper not shown here, presumably mapping the (tag, count) pairs to a dict of counts)
    df_total['Noun'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['NOUN'])
    df_total['Verb'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['VERB'])
    df_total['Punctuation'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['.'])
    df_total['Adposition'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['ADP'])
    df_total['Determiner'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['DET'])
    df_total['Adjective'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['ADJ'])
    df_total['Particle'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['PRT'])
    df_total['Adverb'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['ADV'])
    df_total['Pronoun'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['PRON'])
    df_total['Conjunction'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['CONJ'])
    df_total['Numeral'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['NUM'])
    df_total['Other'] = df_total.tag_fq.apply(
        lambda x: pos(x.most_common())['X'])

    # *********LEXICAL FEATURES **********#

    # word count
    df_total['characters_count'] = df_total.text.str.len()

    # Filtering only large texts
    df_total = df_total.loc[df_total.characters_count >= 100]

    # word average
    df_total['word_average'] = df_total['text'].apply(
        lambda x: np.mean([len(w) for w in x.split(' ')]))

    # lexical diversity
    df_total['lexical_diversity'] = df_total.text.apply(
        lambda x: ld.ttr([w for w in x.split(' ')]))

    # lexical richness
    df_total['lex_words'] = df_total.text.apply(
        lambda x: LexicalRichness(x).words)
    df_total['lex_uniquewords'] = df_total.text.apply(
        lambda x: LexicalRichness(x).terms)
    df_total['lex_ttr'] = df_total.text.apply(
        lambda x: LexicalRichness(x).ttr)  # type-token ratio: lexical richness

    # *********PSYCHOLINGUISTIC FEATURES **********#

    # Sentiment score
    analyser = SentimentIntensityAnalyzer()
    df_total['sentiment_score'] = df_total.text.apply(
        lambda x: analyser.polarity_scores(x)['compound'])

    return df_total
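A usage sketch for preprocess, assuming the module-level imports the excerpt relies on and a pos helper whose definition is not shown in the excerpt (the version below is a guess that maps the (tag, count) pairs to a dict defaulting to 0):

import re
import nltk
import numpy as np
import pandas as pd
from collections import defaultdict
from lexicalrichness import LexicalRichness
from lexical_diversity import lex_div as ld
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nltk.download("punkt"); nltk.download("averaged_perceptron_tagger"); nltk.download("universal_tagset")

def pos(tag_counts):
    # hypothetical helper: (tag, count) pairs -> counts dict returning 0 for missing tags
    return defaultdict(int, tag_counts)

df = pd.DataFrame({"text": [
    "The U.S. economy grew at a solid pace last quarter. "
    "Analysts expect further gains this year, although risks remain elevated."
]})
df = preprocess(df)
print(df[["num_of_sentences", "lexical_diversity", "lex_ttr", "sentiment_score"]])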
Example No. 9
from openpyxl import Workbook, load_workbook
from nltk.tokenize import word_tokenize
from lexical_diversity import lex_div as ld
from readcalc import readcalc  # ReadabilityCalculator package

wb = Workbook()
sheet = wb.active
wbin = load_workbook(filename="yale_tweets.xlsx")
wbinsh = wbin.active
sheet["A1"] = "Flesch Reading Ease"
sheet["B1"] = "Flesch-Kincaid Grade Level"
sheet["C1"] = "Coleman Liau Index"
sheet["D1"] = "Gunning Fog Index"
sheet["E1"] = "SMOG Index"
sheet["F1"] = "ARI Index"
sheet["G1"] = "LIX Index"
sheet["H1"] = "Dale-Chall Score"
sheet["I1"] = "TTR Simple"


for i in range(1, 144332):  # iterate over the input rows (hard-coded upper bound)
    if len(str(wbinsh["K"+str(i)].value).split(" "))> 15:
        calc = readcalc.ReadCalc(wbinsh["K"+str(i)].value)
        tokenized = word_tokenize(str(wbinsh["K"+str(i)].value))
        sheet["A"+str(i+1)] = calc.get_flesch_reading_ease()
        sheet["B"+str(i+1)] = calc.get_flesch_kincaid_grade_level()
        sheet["C"+str(i+1)] = calc.get_coleman_liau_index()
        sheet["D"+str(i+1)] = calc.get_gunning_fog_index()
        sheet["E"+str(i+1)] = calc.get_smog_index()
        sheet["F"+str(i+1)] = calc.get_ari_index()
        sheet["G"+str(i+1)] = calc.get_lix_index()
        sheet["H"+str(i+1)] = calc.get_dale_chall_score()
        sheet["I"+str(i+1)] = ld.ttr(tokenized)

wb.save(filename="yale_scores.xlsx")