def visualizeFeaturesFolder(folder, dimReductionMethod, priorKnowledge="none"):
    '''
    This function generates a chordial visualization for the recordings
    of the provided path.
    ARGUMENTS:
        - folder:             path of the folder that contains the WAV
                              files to be processed
        - dimReductionMethod: method used to reduce the dimension of the
                              initial feature space before computing the
                              similarity ("pca", otherwise an LDA-based
                              reduction of mid-term features is used)
        - priorKnowledge:     if this is set equal to "artist", artist
                              names (parsed from the filenames) are used
                              as class labels for the LDA step
    NOTE(review): this file contains a second, sklearn-based definition of
    the same function further down, which shadows this one at import time.
    '''
    if dimReductionMethod == "pca":
        allMtFeatures, wavFilesList = aF.dirWavFeatureExtraction(
            folder, 30.0, 30.0, 0.050, 0.050, computeBEAT=True)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return
        # filenames follow "<artist> --- <title>.wav"; the artist part is
        # the category label
        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList]
        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.concatenate(F)
        pca = mlpy.PCA(method='cov')  # pca (eigenvalue decomposition)
        pca.learn(F)
        coeff = pca.coeff()
        # the new PCA dimension can be at most the number of samples
        K1 = min(2, F.shape[0])
        K2 = min(10, F.shape[0])
        finalDims = pca.transform(F, k=K1)    # 2-D points for plotting
        finalDims2 = pca.transform(F, k=K2)   # richer space for similarity
    else:
        # long-term statistics cannot be applied in this context
        # (LDA needs mid-term features)
        allMtFeatures, Ys, wavFilesList = \
            aF.dirWavFeatureExtractionNoAveraging(
                folder, 20.0, 5.0, 0.040, 0.040)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return
        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList]
        ldaLabels = Ys
        if priorKnowledge == "artist":
            # relabel the mid-term windows so that all recordings by the
            # same artist share one LDA class
            uNamesCategoryToVisualize = list(set(namesCategoryToVisualize))
            YsNew = np.zeros(Ys.shape)
            for i, uname in enumerate(uNamesCategoryToVisualize):
                # for each unique artist name:
                indicesUCategories = [
                    j for j, x in enumerate(namesCategoryToVisualize)
                    if x == uname]
                for j in indicesUCategories:
                    indices = np.nonzero(Ys == j)
                    YsNew[indices] = i
            ldaLabels = YsNew
        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.array(F[0])
        clf = LDA(n_components=10)
        clf.fit(F, ldaLabels)
        reducedDims = clf.transform(F)
        pca = mlpy.PCA(method='cov')  # pca (eigenvalue decomposition)
        pca.learn(reducedDims)
        coeff = pca.coeff()
        reducedDims = pca.transform(reducedDims, k=2)
        # TODO: CHECK THIS ... SHOULD LDA USED IN SEMI-SUPERVISED ONLY????
        # uLabels must have as many labels as the number of wavFilesList
        # elements
        uLabels = np.sort(np.unique(Ys))
        reducedDimsAvg = np.zeros((uLabels.shape[0], reducedDims.shape[1]))
        finalDims = np.zeros((uLabels.shape[0], 2))
        for i, u in enumerate(uLabels):
            # average the per-window 2-D points of each recording
            indices = [j for j, x in enumerate(Ys) if x == u]
            f = reducedDims[indices, :]
            finalDims[i, :] = f.mean(axis=0)
        finalDims2 = reducedDims
    # scatter plot with one labeled point per recording
    for i in range(finalDims.shape[0]):
        plt.text(finalDims[i, 0], finalDims[i, 1],
                 ntpath.basename(wavFilesList[i].replace('.wav', '')),
                 horizontalalignment='center',
                 verticalalignment='center', fontsize=10)
        plt.plot(finalDims[i, 0], finalDims[i, 1], '*r')
    plt.xlim([1.2 * finalDims[:, 0].min(), 1.2 * finalDims[:, 0].max()])
    plt.ylim([1.2 * finalDims[:, 1].min(), 1.2 * finalDims[:, 1].max()])
    plt.show()
    # cosine-similarity matrix between recordings (self-similarity zeroed)
    SM = 1.0 - distance.squareform(distance.pdist(finalDims2, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualization", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)
    # the same, computed on the initial (un-reduced) feature space
    SM = 1.0 - distance.squareform(distance.pdist(F, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualizationInitial", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)
    # plot super-categories (i.e. artist names)
    # fix: the original called the undefined name `sort`; use sorted()
    uNamesCategoryToVisualize = sorted(list(set(namesCategoryToVisualize)))
    finalDimsGroup = np.zeros((len(uNamesCategoryToVisualize),
                               finalDims2.shape[1]))
    for i, uname in enumerate(uNamesCategoryToVisualize):
        indices = [j for j, x in enumerate(namesCategoryToVisualize)
                   if x == uname]
        f = finalDims2[indices, :]
        finalDimsGroup[i, :] = f.mean(axis=0)
    SMgroup = 1.0 - distance.squareform(
        distance.pdist(finalDimsGroup, 'cosine'))
    for i in range(SMgroup.shape[0]):
        SMgroup[i, i] = 0.0
    chordialDiagram("visualizationGroup", SMgroup, 0.50,
                    uNamesCategoryToVisualize, uNamesCategoryToVisualize)
def visualizeFeaturesFolder(folder, dimReductionMethod, priorKnowledge="none"):
    '''
    This function generates a chordial visualization for the recordings
    of the provided path.
    ARGUMENTS:
        - folder:             path of the folder that contains the WAV
                              files to be processed
        - dimReductionMethod: method used to reduce the dimension of the
                              initial feature space before computing the
                              similarity ("pca", otherwise an LDA-based
                              reduction of mid-term features is used)
        - priorKnowledge:     if this is set equal to "artist", artist
                              names (parsed from the filenames) are used
                              as class labels for the LDA step
    '''
    if dimReductionMethod == "pca":
        allMtFeatures, wavFilesList = aF.dirWavFeatureExtraction(
            folder, 30.0, 30.0, 0.050, 0.050, computeBEAT=True)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return
        # filenames follow "<artist> --- <title>.wav"; the artist part is
        # the category label
        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList]
        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.concatenate(F)
        # the new PCA dimension can be at most the number of samples
        K1 = min(2, F.shape[0])
        K2 = min(10, F.shape[0])
        pca1 = sklearn.decomposition.PCA(n_components=K1)
        pca1.fit(F)
        pca2 = sklearn.decomposition.PCA(n_components=K2)
        pca2.fit(F)
        finalDims = pca1.transform(F)   # 2-D points for plotting
        finalDims2 = pca2.transform(F)  # richer space for similarity
    else:
        # long-term statistics cannot be applied in this context
        # (LDA needs mid-term features)
        allMtFeatures, Ys, wavFilesList = \
            aF.dirWavFeatureExtractionNoAveraging(
                folder, 20.0, 5.0, 0.040, 0.040)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return
        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList]
        ldaLabels = Ys
        if priorKnowledge == "artist":
            # relabel the mid-term windows so that all recordings by the
            # same artist share one LDA class
            uNamesCategoryToVisualize = list(set(namesCategoryToVisualize))
            YsNew = np.zeros(Ys.shape)
            for i, uname in enumerate(uNamesCategoryToVisualize):
                # for each unique artist name:
                indicesUCategories = [
                    j for j, x in enumerate(namesCategoryToVisualize)
                    if x == uname]
                for j in indicesUCategories:
                    indices = np.nonzero(Ys == j)
                    YsNew[indices] = i
            ldaLabels = YsNew
        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.array(F[0])
        # fix: sklearn requires n_components <= min(n_features,
        # n_classes - 1); a hard-coded 10 raises for fewer classes
        nLdaComponents = min(10, F.shape[1],
                             len(np.unique(ldaLabels)) - 1)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=nLdaComponents)
        clf.fit(F, ldaLabels)
        reducedDims = clf.transform(F)
        pca = sklearn.decomposition.PCA(n_components=2)
        pca.fit(reducedDims)
        reducedDims = pca.transform(reducedDims)
        # TODO: CHECK THIS ... SHOULD LDA USED IN SEMI-SUPERVISED ONLY????
        # uLabels must have as many labels as the number of wavFilesList
        # elements
        uLabels = np.sort(np.unique(Ys))
        reducedDimsAvg = np.zeros((uLabels.shape[0], reducedDims.shape[1]))
        finalDims = np.zeros((uLabels.shape[0], 2))
        for i, u in enumerate(uLabels):
            # average the per-window 2-D points of each recording
            indices = [j for j, x in enumerate(Ys) if x == u]
            f = reducedDims[indices, :]
            finalDims[i, :] = f.mean(axis=0)
        finalDims2 = reducedDims
    # scatter plot with one labeled point per recording
    for i in range(finalDims.shape[0]):
        plt.text(finalDims[i, 0], finalDims[i, 1],
                 ntpath.basename(wavFilesList[i].replace('.wav', '')),
                 horizontalalignment='center',
                 verticalalignment='center', fontsize=10)
        plt.plot(finalDims[i, 0], finalDims[i, 1], '*r')
    plt.xlim([1.2 * finalDims[:, 0].min(), 1.2 * finalDims[:, 0].max()])
    plt.ylim([1.2 * finalDims[:, 1].min(), 1.2 * finalDims[:, 1].max()])
    plt.show()
    # cosine-similarity matrix between recordings (self-similarity zeroed)
    SM = 1.0 - distance.squareform(distance.pdist(finalDims2, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualization", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)
    # the same, computed on the initial (un-reduced) feature space
    SM = 1.0 - distance.squareform(distance.pdist(F, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualizationInitial", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)
    # plot super-categories (i.e. artist names)
    # fix: the original called the undefined name `sort`; use sorted()
    uNamesCategoryToVisualize = sorted(list(set(namesCategoryToVisualize)))
    finalDimsGroup = np.zeros((len(uNamesCategoryToVisualize),
                               finalDims2.shape[1]))
    for i, uname in enumerate(uNamesCategoryToVisualize):
        indices = [j for j, x in enumerate(namesCategoryToVisualize)
                   if x == uname]
        f = finalDims2[indices, :]
        finalDimsGroup[i, :] = f.mean(axis=0)
    SMgroup = 1.0 - distance.squareform(
        distance.pdist(finalDimsGroup, 'cosine'))
    for i in range(SMgroup.shape[0]):
        SMgroup[i, i] = 0.0
    chordialDiagram("visualizationGroup", SMgroup, 0.50,
                    uNamesCategoryToVisualize, uNamesCategoryToVisualize)
def _top_k_band_indices(scores, k):
    """Return the indices holding the k largest values of `scores`.

    Mirrors the original selection loops: there is deliberately no inner
    `break`, so ties among the top-k values may yield more than k indices.
    """
    picked = []
    for value in nlargest(k, scores):
        for index, score in enumerate(scores):
            if value == score and index not in picked:
                picked.append(index)
    return picked


def _min_max_normalize(scores):
    """Min-max normalize `scores` to [0, 1].

    Like the original inline code, this assumes the scores are not all
    equal (otherwise it divides by zero).
    """
    low = min(scores)
    high = max(scores)
    return [float(s - low) / float(high - low) for s in scores]


def main():
    """Build a random 10-song playlist from 5 bands of one random style,
    then recommend the 5 "most similar" bands by combining lyrics keyword
    overlap (TF-IDF + WordNet synonyms) with audio-feature similarity.

    Relies on helpers defined elsewhere in this file/project: parseINT,
    preprocess_text, term_freq, afe (audio feature extraction), dist,
    wordnet, plus local lyric files and precomputed feature/IDF files.
    """
    start_time = time.time()

    # all bands are grouped by style
    style_list = [
        ['arch_enemy', 'at_the_gates', 'children_of_bodom', 'wintersun',
         'in_flames', 'fleshgod_apocalypse', 'opeth', 'katatonia',
         'insomnium', 'swallow_the_sun', 'ghost', 'enslaved',
         'dark_tranquillity', 'dark_lunacy', 'trivium', 'paradise_lost',
         'behemoth'],
        ['while_she_sleeps', 'underoath', 'motionless_in_white',
         'memphis_may_fire', 'killswitch_engage', 'caliban',
         'bullet_for_my_valentine', 'bring_me_the_horizon', 'blessthefall',
         'black_veil_brides', 'august_burns_red'],
        ['whitechapel', 'veil_of_maya', 'thy_art_is_murder',
         'the_black_dahlia_murder', 'suicide_silence', 'rings_of_saturn',
         'job_for_a_cowboy', 'infant_annhilator', 'decapitated',
         'carnifex'],
        ['anthrax', 'iron_maiden', 'judas_priest', 'megadeth', 'metallica',
         'sepultura', 'slayer', 'death', 'testament'],
        ['leprous', 'ihsahn', 'between_the_buried_and_me', 'baroness',
         'gojira', 'code_orange', 'symphony_x', 'dream_theater'],
        ['acid_bath', 'isis', 'neurosis', 'mastodon', 'today_is_the_day',
         'torche', 'cult_of_luna'],
        ['the_faceless', 'protest_the_hero', 'meshuggah', 'born_of_osiris',
         'periphery'],
        ['xandria', 'within_temptation', 'epica', 'rhapsody_of_fire',
         'haggard', 'eluveitie'],
        ['amon_amarth', 'amorphis', 'blind_guardian', 'eluveitie',
         'finntroll', 'enslaved'],
        ['nile', 'napalm_death', 'deicide', 'carcass', 'cannibal_corpse'],
        ['venom', 'mayhem', 'immortal', 'darkthrone', 'burzum'],
    ]

    # randomly pick 5 bands from a random style
    random_style = int(uniform(0, 11))
    random_bands = sample(style_list[random_style], k=5)
    print('bands picked')
    print(random_bands)
    print('\n')

    # collect every mp3 filename of the selected bands
    # (alternate Windows root: C:\Users\Nathan\Desktop\WI project all data)
    all_songs = []
    for each_band in random_bands:
        path = ("/Users/sunjingxuan/Desktop/WI_project_all_data/"
                "COMP4075_PROJECT/project_songs/songs_" + each_band)
        all_songs.extend(
            f for f in os.listdir(path) if fnmatch.fnmatch(f, '*.mp3'))

    # preprocessing for song names: strip "(xxx)" / "[xxx]" groups, the
    # leading "band -" prefix and the ".mp3" extension, keeping the title
    processed_all_song_name = []
    for each_song_name in all_songs:
        tmp_song_name = ''
        inBracket = False
        inSquare = False
        for each_char in each_song_name:
            if each_char == '(':       # for removing (xxx)
                inBracket = True
            elif each_char == ')':
                inBracket = False
            if each_char == '[':       # for removing [xxx]
                inSquare = True
            elif each_char == ']':
                inSquare = False
            if not inBracket and not inSquare:
                if each_char == '.':   # extension reached -> name is done
                    break
                # fix: the original compared chars with `is not`, which
                # only works via CPython small-string interning
                if each_char not in ('-', ')', ']'):
                    tmp_song_name += each_char
                elif each_char == '-':
                    tmp_song_name = ''  # restart: drop the band-name part
        processed_all_song_name.append(tmp_song_name)

    # remove a single leading / trailing space (as the original did)
    for idx, each_name in enumerate(processed_all_song_name):
        if each_name.startswith(' '):
            processed_all_song_name[idx] = each_name[1:]
    for idx, each_name in enumerate(processed_all_song_name):
        if each_name.endswith(' '):
            processed_all_song_name[idx] = each_name[:-1]
    ################### finish preprocessing ###################

    each_song = []        # all candidate song titles parsed from lyric files
    all_lyrics = ''
    songs_have_lyrics = []

    # based on the mp3 list, keep only the songs whose lyrics can be found
    for each_band in random_bands:
        each_file = 'lyrics_' + each_band + '.txt'
        with open(each_file, 'r') as each_lyrics_file:
            lyrics = each_lyrics_file.readlines()
        for data_items in lyrics:                     # for each line
            for index, each_character in enumerate(data_items):
                # a digit followed by '.' marks a "<int>. <title>" header
                if parseINT(each_character):
                    if data_items[index + 1] == '.':
                        sep = data_items.split()
                        each_song_name = ''
                        for idx, each_part in enumerate(sep):
                            # skip the "N." token; join the title tokens
                            if idx != 0 and idx != len(sep) - 1:
                                each_song_name += each_part + ' '
                            elif idx == len(sep) - 1:
                                each_song_name += each_part
                        each_song.append(each_song_name)
                        for processed_song in processed_all_song_name:
                            if (processed_song in each_song_name and
                                    processed_song not in
                                    songs_have_lyrics):
                                songs_have_lyrics.append(processed_song)
                                break

    selected_playlist = sample(songs_have_lyrics, k=10)
    print('selected playlist')
    print(selected_playlist)
    print('\n')
    ################## finish picking songs ##################

    # second pass over the lyric files: copy every line that follows a
    # selected song's header, until the next non-selected header
    songs_picked = ''
    for each_band in random_bands:
        each_file = 'lyrics_' + each_band + '.txt'
        with open(each_file, 'r') as each_lyrics_file:
            lyrics = each_lyrics_file.readlines()
        start_copy = False
        for data_items in lyrics:                     # for each line
            for index, each_character in enumerate(data_items):
                if parseINT(each_character):
                    if data_items[index + 1] == '.':
                        sep = data_items.split()
                        each_song_name = ''
                        for idx, each_part in enumerate(sep):
                            if idx != 0 and idx != len(sep) - 1:
                                each_song_name += each_part + ' '
                            elif idx == len(sep) - 1:
                                each_song_name += each_part
                        for selected_song in selected_playlist:
                            if (selected_song in each_song_name and
                                    selected_song not in songs_picked):
                                songs_picked += (selected_song + " ")
                                start_copy = True
                                break
                            else:
                                start_copy = False
            if start_copy:
                all_lyrics += data_items
    ################## finish getting lyrics ##################

    preprocessed_lyrics = preprocess_text(all_lyrics, False)
    preprocessed_lyrics = preprocessed_lyrics.split(" ")

    # NOTE(review): two large commented-out (triple-quoted) one-off scripts
    # lived here; they precomputed 'tfidf_model.npy', 'all_words.txt',
    # 'all_top_k_words.txt' and 'idf_file.csv'. They were dead code and
    # have been removed — recover them from version control if needed.

    # load the precomputed IDF table and vocabulary
    # fix: py3 csv.reader needs text mode ('r'), not 'rb'
    with open('idf_file.csv', 'r') as csv_file:
        reader = csv.reader(csv_file)
        idf_list = dict(reader)
    with open('all_words.txt', 'r') as input_file:
        all_words = input_file.read().split(",")

    preprocessed_lyrics = set(preprocessed_lyrics)  # dedupe words (faster)
    tf_list = term_freq(preprocessed_lyrics)

    # drop words that the TFIDF vectorizer filtered out of the vocabulary
    removed_words = list(set(preprocessed_lyrics).difference(set(all_words)))
    preprocessed_lyrics = list(set(preprocessed_lyrics) - set(removed_words))

    # playlist TF-IDF vector, aligned with the all_words vocabulary
    playlist_vector = []
    for each_word in all_words:
        if each_word in preprocessed_lyrics:
            playlist_vector.append(
                float(tf_list[each_word]) * float(idf_list[each_word]))
        else:
            playlist_vector.append(0)

    # top-k playlist keywords (indices keep the all_words alignment)
    top_k_index = []
    for each_value in nlargest(50, playlist_vector):
        for index, each_cal in enumerate(playlist_vector):
            if each_value == each_cal and index not in top_k_index:
                top_k_index.append(index)
                break
    playlist_top_k = [all_words[i] for i in top_k_index]

    # expand the keywords with WordNet synonyms
    synonyms = []
    for each_word in playlist_top_k:
        for syn in wordnet.synsets(each_word):
            for lemma in syn.lemmas():
                # fix: decode back to str — under py3 the bare .encode()
                # produced bytes that could never intersect the str band
                # keywords below
                synonyms.append(
                    lemma.name().encode('ascii', 'ignore').decode('ascii'))
    playlist_top_k += synonyms
    playlist_top_k = list(set(playlist_top_k))  # remove duplicates
    print("playlist keywords length (including synonyms)")
    print(len(playlist_top_k))
    print("\n")

    # load in all bands' top-k words
    with open('all_top_k_words.txt', 'r') as top_k_input_file:
        all_bands_top_k = top_k_input_file.read().split(",")

    # keyword-overlap similarity between the playlist and every band
    playlist_keywords = set(playlist_top_k)
    all_lyrics_similarity = [
        len(playlist_keywords.intersection(set(each_string.split(" "))))
        for each_string in all_bands_top_k]

    # the full band catalogue, one single-element list per band, in the
    # same order as the rows of the precomputed feature/keyword files
    all_band_names = [
        ['acid_bath'], ['amon_amarth'], ['amorphis'], ['anthrax'],
        ['arch_enemy'], ['at_the_gates'], ['august_burns_red'], ['avatar'],
        ['avenged_sevenfold'], ['baroness'], ['behemoth'],
        ['between_the_buried_and_me'], ['black_sabbath'],
        ['black_veil_brides'], ['blessthefall'], ['blind_guardian'],
        ['born_of_osiris'], ['breakdown_of_sanity'],
        ['bring_me_the_horizon'], ['bullet_for_my_valentine'], ['burzum'],
        ['caliban'], ['cannibal_corpse'], ['carcass'], ['carnifex'],
        ['children_of_bodom'], ['code_orange'], ['converge'],
        ['cradle_of_filth'], ['cult_of_luna'], ['dark_lunacy'],
        ['dark_tranquillity'], ['darkthrone'], ['death'], ['decapitated'],
        ['deftones'], ['deicide'], ['dimmu_borgir'], ['dream_theater'],
        ['eluveitie'], ['enslaved'], ['epica'], ['fear_factory'],
        ['finntroll'], ['fleshgod_apocalypse'], ['ghost'], ['gojira'],
        ['haggard'], ['ihsahn'], ['immortal'], ['in_flames'],
        ['infant_annhilator'], ['insomnium'], ['iron_maiden'], ['isis'],
        ['job_for_a_cowboy'], ['judas_priest'], ['katatonia'],
        ['killswitch_engage'], ['leprous'], ['mastodon'], ['mayhem'],
        ['megadeth'], ['memphis_may_fire'], ['meshuggah'], ['metallica'],
        ['motionless_in_white'], ['motley_crue'], ['mr_bungle'],
        ['napalm_death'], ['neurosis'], ['nile'], ['opeth'],
        ['ozzy_osbourne'], ['paradise_lost'], ['periphery'],
        ['protest_the_hero'], ['rhapsody_of_fire'], ['rings_of_saturn'],
        ['sepultura'], ['slayer'], ['slipknot'], ['suicide_silence'],
        ['swallow_the_sun'], ['symphony_x'], ['testament'],
        ['the_black_dahlia_murder'], ['the_faceless'],
        ['thy_art_is_murder'], ['today_is_the_day'], ['torche'],
        ['trivium'], ['underoath'], ['veil_of_maya'], ['venom'],
        ['while_she_sleeps'], ['whitechapel'], ['wintersun'],
        ['within_temptation'], ['xandria'],
    ]

    # top-k most similar bands (lyrics)
    most_similar_band_by_lyrics = [
        all_band_names[i]
        for i in _top_k_band_indices(all_lyrics_similarity, 5)]
    print('top-5 bands for lyrics')
    print(most_similar_band_by_lyrics)
    print('\n')

    normalized_lyrics_similarity = _min_max_normalize(all_lyrics_similarity)

    # create a scratch folder with the playlist's mp3 files so the audio
    # feature extractor can run on them
    all_song_path_name = []
    for each_band in random_bands:
        each_dir = ("/Users/sunjingxuan/Desktop/WI_project_all_data/"
                    "COMP4075_PROJECT/project_songs/songs_" + each_band)
        for each_song in os.listdir(each_dir):
            all_song_path_name.append(each_dir + "/" + each_song)
    all_path_needed = []
    for each_selected_song in selected_playlist:
        for each_song_path in all_song_path_name:
            if each_selected_song in each_song_path:
                all_path_needed.append(each_song_path)
                break
    play_list_directory = "/Users/sunjingxuan/Desktop/user_playlist"
    if os.path.exists(play_list_directory):
        shutil.rmtree(play_list_directory)  # start from an empty folder
    os.makedirs(play_list_directory)
    for each_path in all_path_needed:
        shutil.copy(each_path, play_list_directory)
    ############ finish outputting songs to a folder ############

    # playlist audio features vs. pre-extracted per-band features
    [allMtFeatures, wavFilesList2] = afe.dirWavFeatureExtraction(
        play_list_directory, 1.0, 1.0, 0.050, 0.050, False)
    allMtFeatures = allMtFeatures.flatten()
    all_bands_audio_feature = numpy.load("audio_feature_new_sequence.npy")

    # NOTE(review): dist.cosine returns a *distance*, yet the largest
    # values are treated as "most similar" below — this looks inverted;
    # confirm the intent before changing the ranking direction.
    all_audio_similarity = []
    for one_band_audio_feature in all_bands_audio_feature:
        one_band_audio_feature = one_band_audio_feature.flatten()
        all_audio_similarity.append(
            dist.cosine(one_band_audio_feature, allMtFeatures))

    # top-k most similar bands (audio)
    most_similar_band_by_audio = [
        all_band_names[i]
        for i in _top_k_band_indices(all_audio_similarity, 5)]
    print('top-5 bands for audio')
    print(most_similar_band_by_audio)
    print('\n')

    normalized_audio_similarity = _min_max_normalize(all_audio_similarity)

    # merge the two modalities into one weighted similarity score
    weight = 0.7  # lyrics weight; audio gets (1 - weight)
    overall_weighted_similarity = [
        float(weight) * float(lyr) + float(1 - weight) * float(aud)
        for lyr, aud in zip(normalized_lyrics_similarity,
                            normalized_audio_similarity)]

    # get the final recommendation
    final_recommendation = [
        all_band_names[i]
        for i in _top_k_band_indices(overall_weighted_similarity, 5)]
    print('Final recommendation')
    print(final_recommendation)
    print('\n')

    # count how many recommendations fall in the originally drawn style
    num_correct = 0
    correct_list = []
    for each_recommendation in final_recommendation:
        for each_band in style_list[random_style]:
            if each_band == each_recommendation[0]:
                num_correct += 1
                correct_list.append(each_recommendation)
    print("number of correct recommendation")
    print(num_correct)
    print('\n')
    print('recomendation list')
    print(correct_list)
    print('time used')
    print(time.time() - start_time)
    print('\n')