from lexical_diversity import lex_div as ld


def diversity_features(text):
    # Lemmatize/tokenize once, then apply each lexical-diversity measure to the token list.
    flt = ld.flemmatize(text)
    funcs = [ld.ttr, ld.root_ttr, ld.log_ttr, ld.maas_ttr, ld.msttr, ld.mattr,
             ld.hdd, ld.mtld, ld.mtld_ma_wrap, ld.mtld_ma_bid]
    return [f(flt) for f in funcs]
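# A minimal usage sketch of diversity_features; the sample text below is illustrative
# only, and each value lines up with the corresponding function in funcs above.
measure_names = ["ttr", "root_ttr", "log_ttr", "maas_ttr", "msttr", "mattr",
                 "hdd", "mtld", "mtld_ma_wrap", "mtld_ma_bid"]
sample_text = " ".join(["the quick brown fox jumps over the lazy dog"] * 6)
print(dict(zip(measure_names, diversity_features(sample_text))))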
# Assumed setup for this snippet: nltk (with the "punkt" sentence tokenizer), pandas
# as pd, lexical_diversity as ld, and a sentence-level sentiment analyzer bound to
# `analyzer` whose polarity_scores() returns neg/neu/pos/compound scores
# (e.g. VADER's SentimentIntensityAnalyzer).
def build_aux_metrics(filename_series, doc_series):
    # lexical diversity measures
    lex_vol = []; ttr = []; mtld = []; vocd = []
    # sentence-level sentiment measures (means and standard deviations)
    neg_mean = []; neu_mean = []; pos_mean = []; compound_mean = []
    neg_std = []; neu_std = []; pos_std = []; compound_std = []
    filename = []

    for i0 in range(len(doc_series)):
        filename0 = filename_series.iloc[i0]
        doc0 = doc_series.iloc[i0]
        doc0_list = nltk.sent_tokenize(doc0)
        doc0_string = " ".join(doc0_list)
        n1 = len(doc0_list)

        if n1 > 1:
            vs_list = []
            for i1 in range(n1):
                sent0 = doc0_list[i1]
                vs0 = analyzer.polarity_scores(sent0)
                vs_list.append(vs0)
            doc0_df = pd.DataFrame(vs_list)
            mean_list0 = [x for x in doc0_df.mean()]
            std_list0 = [x for x in doc0_df.std()]
        else:
            mean_list0 = [float(0) for x in range(4)]
            std_list0 = [float(0) for x in range(4)]

        neg_mean.append(mean_list0[0]); neu_mean.append(mean_list0[1])
        pos_mean.append(mean_list0[2]); compound_mean.append(mean_list0[3])
        neg_std.append(std_list0[0]); neu_std.append(std_list0[1])
        pos_std.append(std_list0[2]); compound_std.append(std_list0[3])
        filename.append(filename0)

        flt = ld.flemmatize(doc0_string)
        lex_vol0 = len(flt)    # lexical volume measure
        ttr0 = ld.ttr(flt)     # basic type-token ratio (TTR)
        mtld0 = ld.mtld(flt)   # Measure of Textual Lexical Diversity (MTLD) for lexical variability
        vocd0 = ld.hdd(flt)    # vocd, i.e. hypergeometric distribution D (HD-D), per McCarthy and Jarvis (2007, 2010)
        lex_vol.append(lex_vol0)
        ttr.append(ttr0)
        mtld.append(mtld0)
        vocd.append(vocd0)

        if i0 % 5000 == 0:
            print(i0)

    # save as df
    df1 = pd.DataFrame({'filename': filename,
                        'senti_neg': neg_mean, 'senti_neu': neu_mean,
                        'senti_pos': pos_mean, 'senti_compound': compound_mean,
                        'senti_neg_std': neg_std, 'senti_neu_std': neu_std,
                        'senti_pos_std': pos_std, 'senti_compound_std': compound_std,
                        'lex_vol': lex_vol, 'ttr': ttr, 'mtld': mtld, 'vocd': vocd})
    return df1
# Lighter variant of build_aux_metrics: keeps only the compound sentiment statistics,
# lexical volume, and MTLD.
def build_aux_metrics1(filename_series, doc_series):
    lex_vol = []; mtld = []                 # lexical diversity measures
    compound_mean = []; compound_std = []   # sentiment measures
    filename = []
    # hyp_relev_num = []

    for i0 in range(len(doc_series)):
        filename0 = filename_series.iloc[i0]
        doc0 = doc_series.iloc[i0]
        doc0_list = nltk.sent_tokenize(doc0)
        doc0_string = " ".join(doc0_list)
        n1 = len(doc0_list)

        if n1 > 1:
            vs_list = []
            for i1 in range(n1):
                sent0 = doc0_list[i1]
                vs0 = analyzer.polarity_scores(sent0)
                vs_list.append(vs0)
            doc0_df = pd.DataFrame(vs_list)
            mean_list0 = [x for x in doc0_df.mean()]
            std_list0 = [x for x in doc0_df.std()]
        else:
            mean_list0 = [float(0) for x in range(4)]
            std_list0 = [float(0) for x in range(4)]

        compound_mean.append(mean_list0[3])
        compound_std.append(std_list0[3])
        filename.append(filename0)

        flt = ld.flemmatize(str(doc0_string))
        lex_vol0 = len(flt)    # lexical volume measure
        mtld0 = ld.mtld(flt)   # Measure of Textual Lexical Diversity (MTLD) for lexical variability
        lex_vol.append(lex_vol0)
        mtld.append(mtld0)

        if i0 % 5000 == 0:
            print(i0)

    # save as df
    df1 = pd.DataFrame({'filename': filename,
                        'senti_compound': compound_mean,
                        'senti_compound_std': compound_std,
                        'lex_vol': lex_vol, 'mtld': mtld})
    return df1
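# A minimal usage sketch for the builders above, on toy data. Assumed setup:
#   import nltk; import pandas as pd
#   from lexical_diversity import lex_div as ld
#   from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#   analyzer = SentimentIntensityAnalyzer()
#   nltk.download("punkt")
names = pd.Series(["doc_a.txt", "doc_b.txt"])
docs = pd.Series(["First sentence here. A second, rather different sentence!",
                  "A single-sentence document."])
aux_df = build_aux_metrics(names, docs)
print(aux_df[["filename", "senti_compound", "lex_vol", "ttr", "mtld", "vocd"]])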
def lexdiv(self, text):
    # Store the MTLD of the lemmatized text, rounded to two decimal places.
    self.lexical_diversity = float('{:.2f}'.format(ld.mtld(ld.flemmatize(text))))
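# A minimal sketch of how a method like lexdiv might be attached to a class; the
# Document class and sample text here are hypothetical, not from the original code.
class Document:
    lexdiv = lexdiv  # bind the standalone function above as a method

    def __init__(self, text):
        self.lexical_diversity = None
        self.lexdiv(text)


doc = Document("the quick brown fox jumps over the lazy dog " * 6)
print(doc.lexical_diversity)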
# Per-author aggregates, assuming 100 consecutive tweets per author.
len_word_rng_auth = [max(len_tw_word[i*100:(i+1)*100]) - min(len_tw_word[i*100:(i+1)*100])
                     for i in range(int(len(len_tw_word)/100))]
len_char_mean_auth = [np.mean(len_tw_char[i*100:(i+1)*100])
                      for i in range(int(len(len_tw_char)/100))]
len_word_mean_auth = [np.mean(len_tw_word[i*100:(i+1)*100])
                      for i in range(int(len(len_tw_word)/100))]

##########
#
# vocab variety (TTR)
#
tweets_szerz = [" ".join(list(es_data["Tweets"])[i*100:(i+1)*100])
                for i in range(int(len(len_tw_char)/100))]
ttr_szerz = [ld.ttr(ld.flemmatize(i)) for i in tweets_szerz]

##########
#
# tags
#
# RT
rt_szerz = [np.sum([k == "RT" for k in i.split(" ")]) for i in tweets_szerz]
# URL
url_szerz = [np.sum([k == "#URL#" for k in i.split(" ")]) for i in tweets_szerz]
# hashtag
hsg_szerz = [np.sum([k == "#HASHTAG#" for k in i.split(" ")]) for i in tweets_szerz]
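# A minimal sketch of the inputs the block above expects; the variable names come from
# the code, but the toy content and shapes here are assumptions. es_data holds one tweet
# per row with 100 consecutive rows per author, and len_tw_char / len_tw_word are
# per-tweet character and word counts.
import numpy as np
import pandas as pd
from lexical_diversity import lex_div as ld

es_data = pd.DataFrame({"Tweets": ["RT example tweet #HASHTAG#", "another tweet #URL#"] * 100})
len_tw_char = [len(t) for t in es_data["Tweets"]]             # characters per tweet
len_tw_word = [len(t.split(" ")) for t in es_data["Tweets"]]  # words per tweet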
def analyze_album(album_id):
    # Pull every track on the album (paging through Spotify's results).
    tracks = []
    track_ids = []
    results = sp.album_tracks(album_id)
    tracks.extend(results['items'])
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    for track in tracks:
        track_ids.append(track['id'])

    # Audio features plus track metadata, merged into one DataFrame.
    analysis_json = sp.audio_features(tracks=track_ids)
    analysis_json = list(filter(None, analysis_json))
    tracks_json = sp.album_tracks(album_id)["items"]
    tracks_json = list(filter(None, tracks_json))
    analysis_df = json_normalize(analysis_json)
    tracks_df = json_normalize(tracks_json)
    df = analysis_df.merge(tracks_df, on='id', how='inner')

    album_name = sp.album(album_id)["name"]
    album_name = clean_lyrics(album_name)
    release_date = sp.album(album_id)["release_date"]
    artist = json_normalize(sp.album_tracks(album_id)["items"][0]["artists"])["name"][0]

    # Map numeric pitch classes and modes to readable labels.
    keys = {0: 'C', 1: 'C#', 2: 'D', 3: 'D#', 4: 'E', 5: 'F', 6: 'F#',
            7: 'G', 8: 'G#', 9: 'A', 10: 'A#', 11: 'B'}
    df["key"] = df['key'].map(keys, na_action='ignore')
    mode = {0: 'Minor', 1: 'Major'}
    df["mode"] = df['mode'].map(mode, na_action='ignore')
    df["duration"] = (df["duration_ms_x"] / (1000 * 60)) % 60  # duration in minutes
    df['track'] = df['track_number']
    df = df.loc[df["disc_number"] == 1]
    df = df.set_index('track_number')
    df["album_id"] = album_id

    # Per-track containers for the lyric-based features.
    sent_score = []
    song_lyrics = []
    new_titles = []
    genius_url = []
    genius_songid = []
    keywords = []
    affect_freq = []
    msttr = []
    lexical_depth = []
    cliche_word_perc = []
    cliche_total_count = []
    df["metacritic"] = search_metacritic(artist, album_name)

    for title in df["name"]:
        try:
            # Strip remaster/mono/featuring suffixes before searching Genius.
            title = title.split("- Remaster", 1)[0]
            title = title.split("[Remaster", 1)[0]
            title = title.split("(Remaster", 1)[0]
            title = title.split("- Mono", 1)[0]
            title = title.split("(Mono", 1)[0]
            title = title.split("[Mono", 1)[0]
            title = title.split("(with", 1)[0]
            title = title.split("[with", 1)[0]
            title = title.split("(featuring", 1)[0]
            title = title.split("- featuring", 1)[0]
            title = title.split("[featuring", 1)[0]
            new_titles.append(title)

            remote_song_info = request_song_info(title, artist)
            matching_artist = remote_song_info['result']['primary_artist']['name']
            matching_artist = matching_artist.lower()
            ratio = levenshtein_ratio_and_distance(artist.lower(), matching_artist,
                                                   ratio_calc=True)
            if ratio > .6:
                # Close enough artist match: fetch lyrics and compute lyric features.
                url = remote_song_info['result']['url']
                genius_url.append(url)
                genius_songid.append(str(remote_song_info['result']['id']))
                lyrics = get_lyrics(url)
                flt = ld.flemmatize(clean_lyrics(lyrics))
                clean_flt = [x for x in flt if x.lower() not in excluded_words]
                spacy_stopwords = list(spacy.lang.en.stop_words.STOP_WORDS)
                depth = sum([1 for x in clean_flt if x.lower() not in spacy_stopwords])
                cliche_count = sum([1 for x in clean_flt if x.lower() in cliche_words])
                cliche_perc = cliche_count / depth
                if depth >= 5:
                    msttr.append(ld.msttr(clean_flt, window_length=100))
                    lexical_depth.append(depth)
                    cliche_word_perc.append(cliche_perc)
                    cliche_total_count.append(cliche_count)
                else:
                    msttr.append(None)
                    lexical_depth.append(None)
                    cliche_word_perc.append(None)
                    cliche_total_count.append(None)
                keywords.append(return_keywords(preprocess(clean_lyrics(lyrics))))
                sent = sentiment_analyzer_scores(clean_lyrics(lyrics))
                sent = round((sent + 1) / 2, 3)  # rescale from [-1, 1] to [0, 1]
                sent_score.append(sent)
                text_object = NRCLex(lyrics)
                affect_freq.append(text_object.affect_frequencies)
                song_lyrics.append(clean_lyrics(lyrics))
            else:
                # No confident Genius match: fill lyric features with None.
                sent_score.append(None)
                song_lyrics.append(None)
                keywords.append(None)
                affect_freq.append(None)
                genius_url.append(None)
                genius_songid.append(None)
                msttr.append(None)
                lexical_depth.append(None)
                cliche_word_perc.append(None)
                cliche_total_count.append(None)
        except:
            sent_score.append(None)
            song_lyrics.append(None)
            keywords.append(None)
            affect_freq.append(None)
            # genius_url.append(None)
            # genius_songid.append(None)
            msttr.append(None)
            lexical_depth.append(None)
            cliche_word_perc.append(None)
            cliche_total_count.append(None)

    # Attach lyric-based columns and derived descriptors.
    df['title'] = new_titles
    df["lyr_valence"] = sent_score
    df['mood'] = np.where(df['lyr_valence'].isnull(), df['valence'],
                          round((df["lyr_valence"] + df["valence"]) / 2, 3))
    df["mood_discrep"] = df["valence"] - df["lyr_valence"]
    df["lyrics"] = song_lyrics
    pos_neg(df, 'lyr_valence_des', 'lyr_valence')
    pos_neg(df, 'valence_des', 'valence')
    pos_neg(df, 'mood_des', 'mood')
    high_low(df, 'energy_des', 'energy')
    high_low(df, 'dance_des', 'danceability')
    df["artist"] = artist
    df["album_name"] = album_name
    df["release_date"] = release_date
    df["sp_id"] = df["id"]
    print(album_name)
    print(genius_songid)
    df["genius_songid"] = genius_songid
    df["url"] = genius_url
    df['keywords'] = keywords
    df['affect_freq'] = affect_freq
    df["lyr_valence"] = df["lyr_valence"].replace({np.nan: None})
    df["mood_discrep"] = df["mood_discrep"].replace({np.nan: None})
    df["lyr_valence_des"] = df["lyr_valence_des"].replace({'0': 'Not Found'})
    df['msttr'] = msttr
    df['lexical_depth'] = lexical_depth
    df['cliche_word_perc'] = cliche_word_perc
    df['cliche_total_words'] = cliche_total_count
    df["lexical_depth"] = df["lexical_depth"].replace({np.nan: None})
    df["msttr"] = df["msttr"].replace({np.nan: None})
    df["cliche_word_perc"] = df["cliche_word_perc"].replace({np.nan: None})
    df["cliche_total_words"] = df["cliche_total_words"].replace({np.nan: None})
    df = df.rename(columns={"valence": "mus_valence"})
    df = df.rename(columns={"external_urls.spotify": "external_urls_spotify"})

    # "Uniqueness": mean absolute z-score across audio (and, when available, lyric) features.
    energy_z = abs(stats.zscore(df["energy"]))
    mood_z = abs(stats.zscore(df["mood"]))
    mus_valence_z = abs(stats.zscore(df["mus_valence"]))
    dance_z = abs(stats.zscore(df["danceability"]))
    duration_z = abs(stats.zscore(df["duration"]))
    loudness_z = abs(stats.zscore(df["loudness"]))
    if None in df["msttr"].values:
        df["uniqueness"] = (energy_z + dance_z + duration_z + loudness_z + mood_z) / 5
    else:
        lex_diversity = abs(stats.zscore(df["msttr"]))
        lyr_valence_z = abs(stats.zscore(df["lyr_valence"]))
        df["uniqueness"] = (energy_z + dance_z + duration_z + loudness_z +
                            lyr_valence_z + mus_valence_z + lex_diversity) / 7
    df = df[[
        "title", "energy", "mus_valence", "lyr_valence", "mood", "danceability",
        "loudness", "tempo", "key", "mode", "time_signature", "duration", "sp_id",
        "track", "lyrics", "speechiness", "acousticness", "instrumentalness",
        "liveness", "artist", "album_name", "disc_number", "explicit",
        "external_urls_spotify", "mood_discrep", "release_date", "uniqueness",
        "lyr_valence_des", "valence_des", "mood_des", "energy_des", "dance_des",
        "album_id", "url", "genius_songid", "keywords", "affect_freq", "metacritic",
        "msttr", "lexical_depth", "cliche_word_perc", "cliche_total_words"
    ]]
    df = df.to_dict('records')
    return df
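# A minimal usage sketch (assumes sp is an authenticated spotipy.Spotify client and that
# the helper functions referenced above are defined; the album id is a placeholder).
records = analyze_album("<spotify_album_id>")
album_df = pd.DataFrame(records)
print(album_df[["title", "mood", "msttr", "uniqueness"]].head())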