Example #1
from lexical_diversity import lex_div as ld

def diversity_features(text):
    # Tokenize and lemmatize once; every measure below expects a token list,
    # not the raw string.
    flt = ld.flemmatize(text)
    funcs = [
        ld.ttr, ld.root_ttr, ld.log_ttr, ld.maas_ttr, ld.msttr, ld.mattr,
        ld.hdd, ld.mtld, ld.mtld_ma_wrap, ld.mtld_ma_bid
    ]
    return [f(flt) for f in funcs]
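A quick sanity check; the toy text below just repeats one sentence, so every diversity score should come out low:

sample = " ".join(["The quick brown fox jumps over the lazy dog."] * 10)
names = ["ttr", "root_ttr", "log_ttr", "maas_ttr", "msttr", "mattr",
         "hdd", "mtld", "mtld_ma_wrap", "mtld_ma_bid"]
for name, score in zip(names, diversity_features(sample)):
    print(f"{name}: {score:.3f}")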
def build_aux_metrics(filename_series, doc_series):
    """Per-document lexical-diversity and sentence-level VADER sentiment features."""
    lex_vol, ttr, mtld, vocd = [], [], [], []    # lexical diversity measures
    neg_mean, neu_mean, pos_mean, compound_mean = [], [], [], []
    neg_std, neu_std, pos_std, compound_std = [], [], [], []    # sentiment measures
    filename = []

    for i0 in range(len(doc_series)):
        filename0 = filename_series.iloc[i0]
        doc0 = doc_series.iloc[i0]
        doc0_list = nltk.sent_tokenize(doc0)
        doc0_string = " ".join(doc0_list)
        n1 = len(doc0_list)

        if n1 > 1:
            # Score each sentence, then aggregate across the document.
            vs_list = [analyzer.polarity_scores(sent0) for sent0 in doc0_list]
            doc0_df = pd.DataFrame(vs_list)   # columns: neg, neu, pos, compound
            mean_list0 = list(doc0_df.mean())
            std_list0 = list(doc0_df.std())
        else:
            # A single sentence has no spread; record zeros instead.
            mean_list0 = [0.0] * 4
            std_list0 = [0.0] * 4

        neg_mean.append(mean_list0[0]); neu_mean.append(mean_list0[1])
        pos_mean.append(mean_list0[2]); compound_mean.append(mean_list0[3])
        neg_std.append(std_list0[0]); neu_std.append(std_list0[1])
        pos_std.append(std_list0[2]); compound_std.append(std_list0[3])
        filename.append(filename0)

        flt = ld.flemmatize(doc0_string)
        lex_vol.append(len(flt))       # lexical volume (token count)
        ttr.append(ld.ttr(flt))        # basic type-token ratio (TTR)
        mtld.append(ld.mtld(flt))      # Measure of Textual Lexical Diversity (MTLD)
        vocd.append(ld.hdd(flt))       # vocd / hypergeometric distribution D (HD-D), per McCarthy and Jarvis (2007, 2010)

        if i0 % 5000 == 0:   # progress marker for long corpora
            print(i0)

    # save as df
    df1 = pd.DataFrame({'filename': filename,
                        'senti_neg': neg_mean, 'senti_neu': neu_mean,
                        'senti_pos': pos_mean, 'senti_compound': compound_mean,
                        'senti_neg_std': neg_std, 'senti_neu_std': neu_std,
                        'senti_pos_std': pos_std, 'senti_compound_std': compound_std,
                        'lex_vol': lex_vol, 'ttr': ttr, 'mtld': mtld, 'vocd': vocd})
    return df1
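A minimal driver for the function above, assuming NLTK's punkt tokenizer is available and that analyzer is a vaderSentiment SentimentIntensityAnalyzer (the snippet itself never shows these imports):

import nltk
import pandas as pd
from lexical_diversity import lex_div as ld
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nltk.download('punkt', quiet=True)
analyzer = SentimentIntensityAnalyzer()

names = pd.Series(["a.txt", "b.txt"])
docs = pd.Series([
    "I love this film. The plot is great. The ending is awful, though.",
    "The same words repeat and repeat in the same way.",  # one sentence: sentiment falls back to zeros
])
print(build_aux_metrics(names, docs))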
def build_aux_metrics1(filename_series, doc_series):
    """Slimmer variant: compound sentiment plus lexical volume and MTLD only."""
    lex_vol, mtld = [], []                    # lexical diversity measures
    compound_mean, compound_std = [], []      # sentiment measures
    filename = []

    for i0 in range(len(doc_series)):
        filename0 = filename_series.iloc[i0]
        doc0 = doc_series.iloc[i0]
        doc0_list = nltk.sent_tokenize(doc0)
        doc0_string = " ".join(doc0_list)
        n1 = len(doc0_list)

        if n1 > 1:
            vs_list = [analyzer.polarity_scores(sent0) for sent0 in doc0_list]
            doc0_df = pd.DataFrame(vs_list)   # columns: neg, neu, pos, compound
            mean_list0 = list(doc0_df.mean())
            std_list0 = list(doc0_df.std())
        else:
            mean_list0 = [0.0] * 4
            std_list0 = [0.0] * 4

        # index 3 is VADER's 'compound' column
        compound_mean.append(mean_list0[3]); compound_std.append(std_list0[3])
        filename.append(filename0)

        flt = ld.flemmatize(str(doc0_string))
        lex_vol.append(len(flt))      # lexical volume (token count)
        mtld.append(ld.mtld(flt))     # Measure of Textual Lexical Diversity (MTLD)

        if i0 % 5000 == 0:   # progress marker
            print(i0)

    # save as df
    df1 = pd.DataFrame({'filename': filename,
                        'senti_compound': compound_mean,
                        'senti_compound_std': compound_std,
                        'lex_vol': lex_vol, 'mtld': mtld})
    return df1
    def lexdiv(self, text):
        # MTLD over the lemmatized tokens, rounded to two decimal places.
        self.lexical_diversity = round(ld.mtld(ld.flemmatize(text)), 2)
Example #5
# Author-level features: the corpus holds 100 tweets per author, so slice
# the per-tweet lists into consecutive blocks of 100.
n_auth = len(len_tw_word) // 100

len_word_rng_auth = [max(len_tw_word[i*100:(i+1)*100]) - min(len_tw_word[i*100:(i+1)*100])
                     for i in range(n_auth)]

len_char_mean_auth = [np.mean(len_tw_char[i*100:(i+1)*100]) for i in range(n_auth)]
len_word_mean_auth = [np.mean(len_tw_word[i*100:(i+1)*100]) for i in range(n_auth)]

##########
#
# vocab variety (TTR)
#

# Join each author's tweets into one string, then score the type-token
# ratio of the lemmatized tokens.
tweets_szerz = [" ".join(list(es_data["Tweets"])[i*100:(i+1)*100])
                for i in range(n_auth)]

ttr_szerz = [ld.ttr(ld.flemmatize(i)) for i in tweets_szerz]

##########
#
# tags
#

# retweet markers
rt_szerz = [np.sum([k == "RT" for k in i.split(" ")]) for i in tweets_szerz]

# URL placeholders
url_szerz = [np.sum([k == "#URL#" for k in i.split(" ")]) for i in tweets_szerz]

# hashtag placeholders
hsg_szerz = [np.sum([k == "#HASHTAG#" for k in i.split(" ")]) for i in tweets_szerz]
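To use these author-level lists downstream, they can be gathered into one frame; the column names here are illustrative, and es_data, len_tw_word, and len_tw_char are assumed to come from earlier in the script:

import pandas as pd

auth_feats = pd.DataFrame({
    'word_len_range': len_word_rng_auth,
    'char_len_mean': len_char_mean_auth,
    'word_len_mean': len_word_mean_auth,
    'ttr': ttr_szerz,
    'rt_count': rt_szerz,
    'url_count': url_szerz,
    'hashtag_count': hsg_szerz,
})
print(auth_feats.describe())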
def analyze_album(album_id):
    # Collect every track in the album, following Spotify's pagination.
    tracks = []
    results = sp.album_tracks(album_id)
    tracks.extend(results['items'])
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    track_ids = [track['id'] for track in tracks]

    # Audio features for all tracks; drop any null responses before merging.
    analysis_json = list(filter(None, sp.audio_features(tracks=track_ids)))
    tracks_json = list(filter(None, tracks))
    analysis_df = json_normalize(analysis_json)
    tracks_df = json_normalize(tracks_json)
    df = analysis_df.merge(tracks_df, on='id', how='inner')
    album_json = sp.album(album_id)
    album_name = clean_lyrics(album_json["name"])
    release_date = album_json["release_date"]

    # Primary artist of the first track.
    artist = tracks[0]["artists"][0]["name"]

    keys = {
        0: 'C',
        1: 'C#',
        2: 'D',
        3: 'D#',
        4: 'E',
        5: 'F',
        6: 'F#',
        7: 'G',
        8: 'G#',
        9: 'A',
        10: 'A#',
        11: 'B'
    }

    df["key"] = df['key'].map(keys, na_action='ignore')

    mode = {0: 'Minor', 1: 'Major'}

    df["mode"] = df['mode'].map(mode, na_action='ignore')

    df["duration"] = (df["duration_ms_x"] / (1000 * 60)) % 60

    df['track'] = df['track_number']
    df = df.loc[df["disc_number"] == 1]
    df = df.set_index('track_number')
    df["album_id"] = album_id

    sent_score = []
    song_lyrics = []
    new_titles = []
    genius_url = []
    genius_songid = []
    keywords = []
    affect_freq = []
    msttr = []
    lexical_depth = []
    cliche_word_perc = []
    cliche_total_count = []
    df["metacritic"] = search_metacritic(artist, album_name)

    for title in df["name"]:
        try:
            title = title.split("- Remaster", 1)[0]
            title = title.split("[Remaster", 1)[0]
            title = title.split("(Remaster", 1)[0]
            title = title.split("- Mono", 1)[0]
            title = title.split("(Mono", 1)[0]
            title = title.split("[Mono", 1)[0]
            title = title.split("(with", 1)[0]
            title = title.split("[with", 1)[0]
            title = title.split("(featuring", 1)[0]
            title = title.split("- featuring", 1)[0]
            title = title.split("[featuring", 1)[0]
            new_titles.append(title)
            remote_song_info = request_song_info(title, artist)
            matching_artist = remote_song_info['result']['primary_artist'][
                'name']
            matching_artist = matching_artist.lower()
            ratio = levenshtein_ratio_and_distance(artist.lower(),
                                                   matching_artist,
                                                   ratio_calc=True)
            if ratio > .6:
                url = remote_song_info['result']['url']
                genius_url.append(url)
                genius_songid.append(str(remote_song_info['result']['id']))
                lyrics = get_lyrics(url)
                flt = ld.flemmatize(clean_lyrics(lyrics))
                clean_flt = [x for x in flt if x.lower() not in excluded_words]
                spacy_stopwords = list(spacy.lang.en.stop_words.STOP_WORDS)
                # Lexical depth counts content words (stopwords excluded).
                depth = sum(
                    1 for x in clean_flt if x.lower() not in spacy_stopwords)
                cliche_count = sum(
                    1 for x in clean_flt if x.lower() in cliche_words)
                if depth >= 5:
                    cliche_perc = cliche_count / depth
                    msttr.append(ld.msttr(clean_flt, window_length=100))
                    lexical_depth.append(depth)
                    cliche_word_perc.append(cliche_perc)
                    cliche_total_count.append(cliche_count)
                else:
                    msttr.append(None)
                    lexical_depth.append(None)
                    cliche_word_perc.append(None)
                    cliche_total_count.append(None)
                keywords.append(
                    return_keywords(preprocess(clean_lyrics(lyrics))))
                sent = sentiment_analyzer_scores(clean_lyrics(lyrics))
                sent = round((sent + 1) / 2, 3)
                sent_score.append(sent)
                text_object = NRCLex(lyrics)
                affect_freq.append(text_object.affect_frequencies)
                song_lyrics.append(clean_lyrics(lyrics))
            else:
                sent_score.append(None)
                song_lyrics.append(None)
                keywords.append(None)
                affect_freq.append(None)
                genius_url.append(None)
                genius_songid.append(None)
                msttr.append(None)
                lexical_depth.append(None)
                cliche_word_perc.append(None)
                cliche_total_count.append(None)
        except Exception:
            # Lookup or lyric processing failed; append None to every
            # per-track list so the column lengths stay aligned with df.
            sent_score.append(None)
            song_lyrics.append(None)
            keywords.append(None)
            affect_freq.append(None)
            genius_url.append(None)
            genius_songid.append(None)
            msttr.append(None)
            lexical_depth.append(None)
            cliche_word_perc.append(None)
            cliche_total_count.append(None)

    df['title'] = new_titles
    df["lyr_valence"] = sent_score
    df['mood'] = np.where(df['lyr_valence'].isnull(), df['valence'],
                          round((df["lyr_valence"] + df["valence"]) / 2, 3))
    df["mood_discrep"] = df["valence"] - df["lyr_valence"]
    df["lyrics"] = song_lyrics
    pos_neg(df, 'lyr_valence_des', 'lyr_valence')
    pos_neg(df, 'valence_des', 'valence')
    pos_neg(df, 'mood_des', 'mood')
    high_low(df, 'energy_des', 'energy')
    high_low(df, 'dance_des', 'danceability')
    df["artist"] = artist
    df["album_name"] = album_name
    df["release_date"] = release_date
    df["sp_id"] = df["id"]
    print(album_name)
    print(genius_songid)
    df["genius_songid"] = genius_songid
    df["url"] = genius_url
    df['keywords'] = keywords
    df['affect_freq'] = affect_freq
    df["lyr_valence"] = df["lyr_valence"].replace({np.nan: None})
    df["mood_discrep"] = df["mood_discrep"].replace({np.nan: None})
    df["lyr_valence_des"] = df["lyr_valence_des"].replace({'0': 'Not Found'})
    df['msttr'] = msttr
    df['lexical_depth'] = lexical_depth
    df['cliche_word_perc'] = cliche_word_perc
    df['cliche_total_words'] = cliche_total_count
    df["lexical_depth"] = df["lexical_depth"].replace({np.nan: None})
    df["msttr"] = df["msttr"].replace({np.nan: None})
    df["cliche_word_perc"] = df["cliche_word_perc"].replace({np.nan: None})
    df["cliche_total_words"] = df["cliche_total_words"].replace({np.nan: None})

    df = df.rename(columns={"valence": "mus_valence"})
    df = df.rename(columns={"external_urls.spotify": "external_urls_spotify"})

    energy_z = abs(stats.zscore(df["energy"]))
    mood_z = abs(stats.zscore(df["mood"]))
    mus_valence_z = abs(stats.zscore(df["mus_valence"]))
    dance_z = abs(stats.zscore(df["danceability"]))
    duration_z = abs(stats.zscore(df["duration"]))
    loudness_z = abs(stats.zscore(df["loudness"]))
    if None in df["msttr"].values:
        df["uniqueness"] = (energy_z + dance_z + duration_z + loudness_z +
                            mood_z) / 5
    else:
        lex_diversity = abs(stats.zscore(df["msttr"]))
        lyr_valence_z = abs(stats.zscore(df["lyr_valence"]))
        df["uniqueness"] = (energy_z + dance_z + duration_z + loudness_z +
                            lyr_valence_z + mus_valence_z + lex_diversity) / 7
    df = df[[
        "title", "energy", "mus_valence", "lyr_valence", "mood",
        "danceability", "loudness", "tempo", "key", "mode", "time_signature",
        "duration", "sp_id", "track", "lyrics", "speechiness", "acousticness",
        "instrumentalness", "liveness", "artist", "album_name", "disc_number",
        "explicit", "external_urls_spotify", "mood_discrep", "release_date",
        "uniqueness", "lyr_valence_des", "valence_des", "mood_des",
        "energy_des", "dance_des", "album_id", "url", "genius_songid",
        "keywords", "affect_freq", "metacritic", "msttr", "lexical_depth",
        "cliche_word_perc", "cliche_total_words"
    ]]

    # Return plain dicts (one per track) for easy serialization.
    return df.to_dict('records')
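A sketch of how the function might be called, assuming the surrounding module defines the globals it needs (sp, analyzer, excluded_words, cliche_words, and the helper functions); the credentials and album id below are placeholders:

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id="YOUR_CLIENT_ID", client_secret="YOUR_CLIENT_SECRET"))

records = analyze_album("YOUR_ALBUM_ID")
for track in records[:3]:
    print(track["title"], track["mood"], track["msttr"])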