def create_songdet(h5, sngidxfle): ''' Collects song details for all unique songs heard by 100 raters. Format of dictionary: { SongID : [ Att_0, Att_1, Att_2 ] } ''' import hdf5_getters sngdetfle = open("songdet.txt", "wb") sngdetdic = dict() sngidxfle = open(sngidxfle, "rb") sngidxdic = pickle.load(sngidxfle) for elem in sngidxdic: songidx = sngidxdic[elem] tempo = hdf5_getters.get_tempo(h5, songidx) loud = hdf5_getters.get_loudness(h5, songidx) year = hdf5_getters.get_year(h5, songidx) tmsig = hdf5_getters.get_time_signature(h5, songidx) key = hdf5_getters.get_key(h5, songidx) mode = hdf5_getters.get_mode(h5, songidx) duration = hdf5_getters.get_duration(h5, songidx) fadein = hdf5_getters.get_end_of_fade_in(h5, songidx) fadeout = hdf5_getters.get_start_of_fade_out(h5, songidx) artfam = hdf5_getters.get_artist_familiarity(h5, songidx) sngdetdic[elem] = [duration, tmsig, tempo, key, mode, fadein, fadeout, year, loud, artfam] pickle.dump(sngdetdic, sngdetfle) sngdetfle.close()
def load_non_time_data(): years = [] ten_features=[] num = 0 for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: h5 = getter.open_h5_file_read(f) num += 1 print(num) try: year = getter.get_year(h5) if year!=0: years.append(year) title_length = len(getter.get_title(h5)) terms_length = len(getter.get_artist_terms(h5)) tags_length = len(getter.get_artist_mbtags(h5)) hotness = getter.get_artist_hotttnesss(h5) duration = getter.get_duration(h5) loudness = getter.get_loudness(h5) mode = getter.get_mode(h5) release_length = len(getter.get_release(h5)) tempo = getter.get_tempo(h5) name_length = len(getter.get_artist_name(h5)) ten_feature = np.hstack([title_length,tags_length, hotness, duration, terms_length, loudness, mode, release_length, tempo, name_length]) ten_features.append(ten_feature) except: print(1) h5.close() return years,ten_features
def feat_from_file(path): feats = [] h5 = GETTERS.open_h5_file_read(path) feats.append( GETTERS.get_track_id(h5) ) feats.append( GETTERS.get_title(h5) ) feats.append( GETTERS.get_artist_name(h5) ) feats.append( GETTERS.get_year(h5) ) feats.append( GETTERS.get_loudness(h5) ) feats.append( GETTERS.get_tempo(h5) ) feats.append( GETTERS.get_time_signature(h5) ) feats.append( GETTERS.get_key(h5) ) feats.append( GETTERS.get_mode(h5) ) feats.append( GETTERS.get_duration(h5) ) #timbre timbre = GETTERS.get_segments_timbre(h5) avg_timbre = np.average(timbre, axis=0) for k in avg_timbre: feats.append(k) var_timbre = np.var(timbre, axis=0) for k in var_timbre: feats.append(k) h5.close() return feats
def getSongProperties(songCount = 3000, splitData = True): songDict = {} songIdDict = {} songIdCount = 0 for root, dirs, files in os.walk(msd_subset_data_path): files = glob.glob(os.path.join(root,'*.h5')) for f in files: h5 = GETTERS.open_h5_file_read(f) tempo = GETTERS.get_tempo(h5) danceability = GETTERS.get_danceability(h5) energy = GETTERS.get_energy(h5) loudness = GETTERS.get_loudness(h5) #print GETTERS.get_artist_terms(h5) timbre = GETTERS.get_segments_timbre(h5) artist_hotness = GETTERS.get_artist_hotttnesss(h5) song_key = GETTERS.get_key(h5) songIdDict[GETTERS.get_song_id(h5)] = songIdCount songDict[songIdCount] = [tempo,danceability,energy,loudness,artist_hotness,song_key] songIdCount += 1 h5.close() #if len(songDict) >2: # break #if len(songDict) >2: # break if songIdCount > songCount and splitData: break return songIdDict,songDict
def get_all_examples(basedir, genre_dict, ext='.h5'): """ From a base directory, goes through all subdirectories, and grabs all songs and their features and puts them into a pandas dataframe INPUT basedir - base directory of the dataset genre_dict - a dictionary mapping track id to genre based tagraum dataset ext - extension, .h5 by default RETURN dataframe containing all song examples """ features_vs_genre = pd.DataFrame() # iterate over all files in all subdirectories for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) # # count files # count += len(files) # apply function to all files for f in files: h5 = GETTERS.open_h5_file_read(f) song_id = GETTERS.get_track_id(h5).decode('utf-8') if (song_id in genre_dict): genre = genre_dict[song_id] year = GETTERS.get_year(h5) duration = GETTERS.get_duration(h5) end_of_fade_in = GETTERS.get_end_of_fade_in(h5) loudness = GETTERS.get_loudness(h5) song_hotttnesss = GETTERS.get_song_hotttnesss(h5) tempo = GETTERS.get_tempo(h5) key = GETTERS.get_key(h5) key_confidence = GETTERS.get_key_confidence(h5) mode = GETTERS.get_mode(h5) mode_confidence = GETTERS.get_mode_confidence(h5) time_signature = GETTERS.get_time_signature(h5) time_signature_confidence = GETTERS.get_time_signature_confidence( h5) artist_name = GETTERS.get_artist_name(h5) title = GETTERS.get_title(h5) # length of sections_start array gives us number of start num_sections = len(GETTERS.get_sections_start(h5)) num_segments = len(GETTERS.get_segments_confidence(h5)) example = pd.DataFrame( data=[ (artist_name, title, song_id, genre, year, key, key_confidence, mode, mode_confidence, time_signature, time_signature_confidence, duration, end_of_fade_in, loudness, song_hotttnesss, tempo, num_sections) ], columns=[ 'artist_name', 'title', 'song_id', 'genre', 'year', 'key', 'key_confidence', 'mode', 'mode_confidence', 'time_signature', 'time_signature_confidence', 'duration', 'end_of_fade_in', 'loudness', 'song_hotttnesss', 'tempo', 'num_segments' ]) features_vs_genre = features_vs_genre.append(example) h5.close() return features_vs_genre
def func_to_extract_features(filename): """ This function extracts all features: per-track, per-section and per-segment """ # - open the song file h5 = GETTERS.open_h5_file_read(filename) # - get per-track features and put them artist_id = GETTERS.get_artist_id(h5) song_id = GETTERS.get_song_id(h5) artist_familiarity = GETTERS.get_artist_familiarity(h5) artist_hotttnesss = GETTERS.get_artist_hotttnesss(h5) artist_latitude = GETTERS.get_artist_latitude(h5) artist_longitude = GETTERS.get_artist_longitude(h5) danceability = GETTERS.get_danceability(h5) energy = GETTERS.get_energy(h5) loudness = GETTERS.get_loudness(h5) song_hotttnesss = GETTERS.get_song_hotttnesss(h5) tempo = GETTERS.get_tempo(h5) year = GETTERS.get_year(h5) # artist_ids.add(artist_id) # features_tuple = (artist_id, artist_familiarity, artist_hotttnesss, artist_latitude, artist_longitude, danceability, energy, loudness, song_hotttnesss, tempo, year) features_tuple = (artist_id, artist_familiarity, artist_hotttnesss, loudness, song_hotttnesss, tempo, year) # print features_tuple features_tuples[song_id] = features_tuple # files_per_artist[artist_id] += 1 # - close the file h5.close()
def func_to_desired_song_data(filename): h5 = GETTERS.open_h5_file_read(filename) track_id = GETTERS.get_track_id(h5) for song in random_songs: if song[0] == track_id: print("FOUND ONE!") title = replace_characters(GETTERS.get_title(h5)) artist = replace_characters(GETTERS.get_artist_name(h5)) year = GETTERS.get_year(h5) tempo = GETTERS.get_tempo(h5) key = GETTERS.get_key(h5) loudness = GETTERS.get_loudness(h5) energy = GETTERS.get_energy(h5) danceability = GETTERS.get_danceability(h5) time_signature = GETTERS.get_time_signature(h5) mode = GETTERS.get_mode(h5) hotttness = GETTERS.get_song_hotttnesss(h5) song_data = { 'title': title, 'artist': artist, 'year': year, 'tempo': tempo, 'key': key, 'loudness': loudness, 'energy': energy, 'danceability': danceability, 'time_signature': time_signature, 'mode': mode, 'hotttness': hotttness } all_the_data.append(song_data) h5.close()
def get_all_data(dir,ext,outDir): songsDict = {} i = 0 for root, dirs, files in os.walk(dir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: h5 = get.open_h5_file_read(f) #songId = get.get_song_id(h5) pitchAr = (np.array(get.get_segments_pitches(h5))).mean(axis=0) timbreAr = (np.array(get.get_segments_timbre(h5))).mean(axis=0) loudness = get.get_loudness(h5) concatenatedArr = np.append([pitchAr,timbreAr],loudness) featureSet[i] = concatenatedArr #songsDict[songId] = featureSet[i] print i i += 1 h5.close() #with open(outDir, "a") as f: #for key in songsDict.keys(): #f.write("{0}|{1}".format(key,songsDict[key])) #f.write("\n") np.savetxt(outputDirPath,featureSet,delimiter = ',')
def get_all_data(dir, ext, outDir): songsDict = {} i = 0 for root, dirs, files in os.walk(dir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: h5 = get.open_h5_file_read(f) #songId = get.get_song_id(h5) pitchAr = (np.array(get.get_segments_pitches(h5))).mean(axis=0) timbreAr = (np.array(get.get_segments_timbre(h5))).mean(axis=0) loudness = get.get_loudness(h5) concatenatedArr = np.append([pitchAr, timbreAr], loudness) featureSet[i] = concatenatedArr #songsDict[songId] = featureSet[i] print i i += 1 h5.close() #with open(outDir, "a") as f: #for key in songsDict.keys(): #f.write("{0}|{1}".format(key,songsDict[key])) #f.write("\n") np.savetxt(outputDirPath, featureSet, delimiter=',')
def create_songdet(h5, sngidxfle): ''' Collects song details for all unique songs heard by 100 raters. Format of dictionary: { SongID : [ Att_0, Att_1, Att_2 ] } ''' import hdf5_getters sngdetfle = open("songdet.txt", "wb") sngdetdic = dict() sngidxfle = open(sngidxfle, "rb") sngidxdic = pickle.load(sngidxfle) for elem in sngidxdic: songidx = sngidxdic[elem] tempo = hdf5_getters.get_tempo(h5, songidx) loud = hdf5_getters.get_loudness(h5, songidx) year = hdf5_getters.get_year(h5, songidx) tmsig = hdf5_getters.get_time_signature(h5, songidx) key = hdf5_getters.get_key(h5, songidx) mode = hdf5_getters.get_mode(h5, songidx) duration = hdf5_getters.get_duration(h5, songidx) fadein = hdf5_getters.get_end_of_fade_in(h5, songidx) fadeout = hdf5_getters.get_start_of_fade_out(h5, songidx) artfam = hdf5_getters.get_artist_familiarity(h5, songidx) sngdetdic[elem] = [ duration, tmsig, tempo, key, mode, fadein, fadeout, year, loud, artfam ] pickle.dump(sngdetdic, sngdetfle) sngdetfle.close()
def feat_from_file(path): """ Extract a list of features in an array, already converted to string """ feats = [] h5 = GETTERS.open_h5_file_read(path) # basic info feats.append( GETTERS.get_track_id(h5) ) feats.append( GETTERS.get_artist_name(h5).replace(',','') ) feats.append( GETTERS.get_title(h5).replace(',','') ) feats.append( GETTERS.get_loudness(h5) ) feats.append( GETTERS.get_tempo(h5) ) feats.append( GETTERS.get_time_signature(h5) ) feats.append( GETTERS.get_key(h5) ) feats.append( GETTERS.get_mode(h5) ) feats.append( GETTERS.get_duration(h5) ) # timbre timbre = GETTERS.get_segments_timbre(h5) avg_timbre = np.average(timbre,axis=0) for k in avg_timbre: feats.append(k) var_timbre = np.var(timbre,axis=0) for k in var_timbre: feats.append(k) # done with h5 file h5.close() # makes sure we return strings feats = map(lambda x: str(x), feats) return feats
def extract_features(filename): h5 = hdf5_getters.open_h5_file_read(filename) f = [None] * len(features) f[features.index('track_id')] = hdf5_getters.get_track_id(h5, 0).item() f[features.index('song_id')] = hdf5_getters.get_song_id(h5, 0).item() f[features.index('hotttnesss')] = hdf5_getters.get_artist_hotttnesss( h5, 0).item() f[features.index('danceability')] = hdf5_getters.get_danceability( h5, 0).item() f[features.index('duration')] = hdf5_getters.get_duration(h5, 0).item() f[features.index('key')] = hdf5_getters.get_key(h5, 0).item() f[features.index('energy')] = hdf5_getters.get_energy(h5, 0).item() f[features.index('loudness')] = hdf5_getters.get_loudness(h5, 0).item() f[features.index('year')] = hdf5_getters.get_year(h5, 0).item() f[features.index('time_signature')] = hdf5_getters.get_time_signature( h5, 0).item() f[features.index('tempo')] = hdf5_getters.get_tempo(h5, 0).item() tags = '' for tag in hdf5_getters.get_artist_terms(h5): tags += ('%s|' % tag) # Remove trailing pipe. tags = tags[:len(tags) - 1] f[features.index('tags')] = tags h5.close() return f
def read(self, path): files = os.listdir(path) import csv with open('library_csv.csv', 'w') as library_csv: writer = csv.writer(library_csv) writer.writerow([ 'Loudness', 'Danceability', 'Energy', 'Tempo', 'timeSignature', 'Title' ]) # get params for filename in files: # hdf5path = filename hdf5path = "Data/" + filename # hdf5path.replace("'","",2) # sanity check if not os.path.isfile(hdf5path): print 'ERROR: file', hdf5path, 'does not exist.' sys.exit(0) h5 = hdf5_getters.open_h5_file_read(hdf5path) # get all getters loudness = hdf5.get_loudness(h5) dance = hdf5.get_danceability(h5) energy = hdf5.get_energy(h5) tempo = hdf5.get_tempo(h5) ts = hdf5.get_time_signature(h5) title = hdf5.get_title(h5) writer.writerow([loudness, dance, energy, tempo, ts, title]) # print them h5.close() library_csv.close()
def get_attribute(files): array = [] count = 0 for f in files: temp = [] count += 1 print(f) h5 = hdf5_getters.open_h5_file_read(f) temp.append(hdf5_getters.get_num_songs(h5)) temp.append(hdf5_getters.get_artist_familiarity(h5)) temp.append(hdf5_getters.get_artist_hotttnesss(h5)) temp.append(hdf5_getters.get_danceability(h5)) temp.append(hdf5_getters.get_energy(h5)) temp.append(hdf5_getters.get_key(h5)) temp.append(hdf5_getters.get_key_confidence(h5)) temp.append(hdf5_getters.get_loudness(h5)) temp.append(hdf5_getters.get_mode(h5)) temp.append(hdf5_getters.get_mode_confidence(h5)) temp.append(hdf5_getters.get_tempo(h5)) temp.append(hdf5_getters.get_time_signature(h5)) temp.append(hdf5_getters.get_time_signature_confidence(h5)) temp.append(hdf5_getters.get_title(h5)) temp.append(hdf5_getters.get_artist_name(h5)) temp = np.nan_to_num(temp) array.append(temp) # if count%100 ==0: # print(array[count-100:count-1]) # kmean.fit(array[count-100:count-1]) h5.close() return array
def feat_from_file(path): """ Extract a list of features in an array, already converted to string """ feats = [] h5 = GETTERS.open_h5_file_read(path) # basic info feats.append(GETTERS.get_track_id(h5)) feats.append(GETTERS.get_artist_name(h5).decode().replace(',', '')) feats.append(GETTERS.get_title(h5).decode().replace(',', '')) feats.append(GETTERS.get_loudness(h5)) feats.append(GETTERS.get_tempo(h5)) feats.append(GETTERS.get_time_signature(h5)) feats.append(GETTERS.get_key(h5)) feats.append(GETTERS.get_mode(h5)) feats.append(GETTERS.get_duration(h5)) # timbre timbre = GETTERS.get_segments_timbre(h5) avg_timbre = np.average(timbre, axis=0) for k in avg_timbre: feats.append(k) var_timbre = np.var(timbre, axis=0) for k in var_timbre: feats.append(k) # done with h5 file h5.close() # makes sure we return strings feats = map(lambda x: str(x), feats) return feats
def get_all_titles(basedir, ext='.h5'): titles = [] artist_names = [] terms = [] loudness = [] segments_loudness_max = [] for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: h5 = hdf5_getters.open_h5_file_read(f) titles.append(hdf5_getters.get_title(h5)) artist_names.append(hdf5_getters.get_artist_name(h5)) try: terms.append(hdf5_getters.get_artist_terms(h5)) except: pass loudness.append(hdf5_getters.get_loudness(h5)) try: segments_loudness_max.append( hdf5_getters.get_segments_loudness_max(h5)) except: pass h5.close() return titles, artist_names, terms, loudness, segments_loudness_max
def plots(track): f, axarr = plt.subplots(2, sharex=True) path = "../../msd_dense_subset/dense/"+track[2]+"/"+track[3]+"/"+track[4]+"/"+track+".h5" h5 = GETTERS.open_h5_file_read(path) segments = (GETTERS.get_segments_start(h5)) sections = (GETTERS.get_sections_start(h5)) max_loudness = (GETTERS.get_segments_loudness_max(h5)) loudness = (GETTERS.get_segments_loudness_start(h5)) average_loudness = (max_loudness + loudness) / 2 average_loudness_song = GETTERS.get_loudness(h5) start_fade_out = GETTERS.get_start_of_fade_out(h5) end_fade_in = GETTERS.get_end_of_fade_in(h5) pitches = GETTERS.get_segments_pitches(h5) h5.close() axarr[0].set_title('loudness curve for ' + ut.get_track_info(track)) axarr[0].plot(segments, average_loudness, label='Filtered') axarr[0].axhline(average_loudness_song, color='green') for section in enumerate(sections): axarr[0].axvline(section[1], color='red') axarr[0].axvline(start_fade_out, color='green') axarr[0].axvline(end_fade_in, color='green') idx = list() for section in sections: dif = segments - section posdif = np.where(dif >=0) idx.append(posdif[0][0]) axarr[1].set_title('Chroma values for ' + ut.get_track_info(track)) #axarr[1].set_xticks(idx,sections.astype(int)) extent =[0,segments.shape[0],0,12] axarr[1].imshow(pitches.transpose(),extent = extent,aspect = 'auto',interpolation='nearest',origin='lower') plt.show()
def get_all_titles(basedir,ext='.h5') : titles = [] artist_names = [] terms = [] loudness = [] segments_loudness_max = [] for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: h5 = hdf5_getters.open_h5_file_read(f) titles.append(hdf5_getters.get_title(h5)) artist_names.append(hdf5_getters.get_artist_name(h5)) try: terms.append(hdf5_getters.get_artist_terms(h5)) except: pass loudness.append(hdf5_getters.get_loudness(h5)) try: segments_loudness_max.append(hdf5_getters.get_segments_loudness_max(h5)) except: pass h5.close() return titles, artist_names, terms, loudness, segments_loudness_max
def process_song(h5_song_file): song = {} song['artist_familiarity'] = hdf5_getters.get_artist_familiarity(h5) song['artist_id'] = hdf5_getters.get_artist_id(h5) song['artist_name'] = hdf5_getters.get_artist_name(h5) song['artist_hotttnesss'] = hdf5_getters.get_artist_hotttnesss(h5); song['title'] = hdf5_getters.get_title(h5) terms = hdf5_getters.get_artist_terms(h5) terms_freq = hdf5_getters.get_artist_terms_freq(h5) terms_weight = hdf5_getters.get_artist_terms_weight(h5) terms_array = [] # Creating a array of [term, its frequency, its weight]. Doing this for all terms associated # with the artist for i in range(len(terms)): terms_array.append([terms[i], terms_freq[i], terms_weight[i]]) song['artist_terms'] = terms_array beats_start = hdf5_getters.get_beats_start(h5) song['beats_start_variance'] = variance(beats_start) #beats variance in yocto seconds(10^-24s) song['number_of_beats'] = len(beats_start) song['duration'] = hdf5_getters.get_duration(h5) song['loudness'] = hdf5_getters.get_loudness(h5) sections_start = hdf5_getters.get_sections_start(h5) song['sections_start_variance'] = variance(sections_start) song['number_of_sections'] = len(sections_start) segments_pitches = hdf5_getters.get_segments_pitches(h5) (a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) = split_segments(segments_pitches) song['segments_pitches_variance'] = [variance(a0), variance(a1), variance(a2), variance(a3), variance(a4), variance(a5), variance(a6), variance(a7), variance(a8), variance(a9), variance(a10), variance(a11)] song['segments_pitches_mean'] = [mean(a0), mean(a1), mean(a2), mean(a3), mean(a4), mean(a5), mean(a6), mean(a7), mean(a8), mean(a9), mean(a10), mean(a11)] segments_timbre = hdf5_getters.get_segments_timbre(h5) (a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) = split_segments(segments_timbre) song['segments_timbre_variance'] = [variance(a0), variance(a1), variance(a2), variance(a3), variance(a4), variance(a5), variance(a6), variance(a7), variance(a8), variance(a9), variance(a10), variance(a11)] song['segments_timbre_mean'] = [mean(a0), mean(a1), mean(a2), mean(a3), mean(a4), mean(a5), mean(a6), mean(a7), mean(a8), mean(a9), mean(a10), mean(a11)] song['tempo'] = hdf5_getters.get_tempo(h5) song['_id'] = hdf5_getters.get_song_id(h5) song['year'] = hdf5_getters.get_year(h5) return song
def get_loudness(track,h5=None): #returns #0: the average loudness of the track #1: the variance of the loudness over the track #2: the difference between the highest and lowest dB value between the end of fade in and start of fade out close = (h5== None) if h5 == None: path = "../../msd_dense_subset/dense/"+track[2]+"/"+track[3]+"/"+track[4]+"/"+track+".h5" h5 = GETTERS.open_h5_file_read(path) loudness_avg = GETTERS.get_loudness(h5) loudnesses_interval = GETTERS.get_segments_loudness_max(h5) start_segments = GETTERS.get_segments_start(h5) start_fade_out = GETTERS.get_start_of_fade_out(h5) end_fade_in = GETTERS.get_end_of_fade_in(h5) idx = np.where((start_segments > end_fade_in) & (start_segments < start_fade_out)) if close: h5.close() #return (loudness_avg, np.var(loudnesses_interval[idx]), min(loudnesses_interval[idx]), max(loudnesses_interval[idx])) return (loudness_avg, np.var(loudnesses_interval[idx]), abs(max(loudnesses_interval[idx]) - abs(min(loudnesses_interval[idx]))))
def load_raw_data(): years = [] ten_features=[] timbres = [] pitches = [] min_length = 10000 num = 0 for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: h5 = getter.open_h5_file_read(f) num += 1 print(num) try: year = getter.get_year(h5) if year!=0: timbre = getter.get_segments_timbre(h5) s = np.size(timbre,0) if s>=100: if s<min_length: min_length = s pitch = getter.get_segments_pitches(h5) years.append(year) timbres.append(timbre) pitches.append(pitch) title_length = len(getter.get_title(h5)) terms_length = len(getter.get_artist_terms(h5)) tags_length = len(getter.get_artist_mbtags(h5)) hotness = getter.get_artist_hotttnesss(h5) duration = getter.get_duration(h5) loudness = getter.get_loudness(h5) mode = getter.get_mode(h5) release_length = len(getter.get_release(h5)) tempo = getter.get_tempo(h5) name_length = len(getter.get_artist_name(h5)) ten_feature = np.hstack([title_length, hotness, duration, tags_length, terms_length,loudness, mode, release_length, tempo, name_length]) ten_features.append(ten_feature) except: print(1) h5.close() return years, timbres, pitches,min_length,ten_features
def h5_to_csv_fields(h5,song): '''Converts h5 format to text Inputs: h5, an h5 file object, usable with the wrapper code MSongsDB song, an integer, representing which song in the h5 file to take the info out of (h5 files contain many songs) Output: a string representing all the information of this song, as a single line of a csv file ''' rv=[] ##All these are regular getter functions from wrapper code rv.append(gt.get_artist_name(h5,song)) rv.append(gt.get_title(h5, song)) rv.append(gt.get_release(h5, song)) rv.append(gt.get_year(h5,song)) rv.append(gt.get_duration(h5,song)) rv.append(gt.get_artist_familiarity(h5,song)) rv.append(gt.get_artist_hotttnesss(h5,song)) rv.append(gt.get_song_hotttnesss(h5, song)) ##artist_terms, artist_terms_freq, and artist_terms_weight getter functions ##are all arrays, so we need to turn them into strings first. We used '_' as a separator rv.append(array_to_csv_field(list(gt.get_artist_terms(h5,song)))) rv.append(array_to_csv_field(list(gt.get_artist_terms_freq(h5,song)))) rv.append(array_to_csv_field(list(gt.get_artist_terms_weight(h5,song)))) rv.append(gt.get_mode(h5,song)) rv.append(gt.get_key(h5,song)) rv.append(gt.get_tempo(h5,song)) rv.append(gt.get_loudness(h5,song)) rv.append(gt.get_danceability(h5,song)) rv.append(gt.get_energy(h5,song)) rv.append(gt.get_time_signature(h5,song)) rv.append(array_to_csv_field(list(gt.get_segments_start(h5,song)))) ##These arrays have vectors (Arrays) as items, 12 dimensional each ##An array like [[1,2,3],[4,5,6]] will be written to csv as '1;2;3_4;5;6', i.e. there's two types of separators rv.append(double_Array_to_csv_field(list(gt.get_segments_timbre(h5,song)),'_',';')) rv.append(double_Array_to_csv_field(list(gt.get_segments_pitches(h5,song)),'_',';')) rv.append(array_to_csv_field(list(gt.get_segments_loudness_start(h5,song)))) rv.append(array_to_csv_field(list(gt.get_segments_loudness_max(h5,song)))) rv.append(array_to_csv_field(list(gt.get_segments_loudness_max_time(h5,song)))) rv.append(array_to_csv_field(list(gt.get_sections_start(h5,song)))) ##turn this list into a string with comma separators (i.e. a csv line) rv_string=array_to_csv_field(rv, ",") rv_string+="\n" return rv_string
def append_files(self, letter): path = '/Users/cole/eclipse-workspace/EC2 File Transfer/Data/' + letter + '/' files = os.listdir(path) import csv with open('library_csv.csv', 'a') as library_csv: writer = csv.writer(library_csv) for filename in files: hdf5path = path + filename h5 = hdf5_getters.open_h5_file_read(hdf5path) # get all getters loudness = hdf5.get_loudness(h5) key = hdf5.get_key(h5) mode = hdf5.get_mode(h5) tempo = hdf5.get_tempo(h5) ts = hdf5.get_time_signature(h5) title = hdf5.get_title(h5) writer.writerow([loudness, key, mode, tempo, ts, title]) # print them h5.close() library_csv.close()
def getTrackInfo(starting_num): my_list = [] f = hdf5_getters.open_h5_file_read(filepath) progress_bar = tqdm(range(tracks_per_thread)) for iteration in progress_bar: i = int(iteration) + (starting_num*tracks_per_thread) track_id = hdf5_getters.get_track_id(f, i).decode() if track_id not in lyric_track_ids_set: continue # skip it an go on artist_name = hdf5_getters.get_artist_name(f, i).decode() duration = hdf5_getters.get_duration(f, i) loudness = hdf5_getters.get_loudness(f, i) tempo = hdf5_getters.get_tempo(f, i) title = hdf5_getters.get_title(f, i).decode() year = hdf5_getters.get_year(f, i) long_list = [track_id, artist_name, duration, loudness, tempo, title, year] my_list.append(long_list) progress_bar.set_description("Iteration %d" % i) f.close() return my_list
def func_to_extract_loudness(filename): """ This function does 3 simple things: - open the song file - get artist ID and put it - close the file """ global cntnan global listloudness h5 = GETTERS.open_h5_file_read(filename) nanfound = 0 #Get target feature: song loudness song_loudness = GETTERS.get_loudness(h5) if math.isnan(song_loudness): nanfound = 1 cntnan = cntnan + 1 else: listloudness.append(song_loudness) h5.close()
def get_attribute(f): temp = [] count += 1 print(f) h5 = hdf5_getters.open_h5_file_read(f) temp.append(hdf5_getters.get_num_songs(h5)) temp.append(hdf5_getters.get_artist_familiarity(h5)) temp.append(hdf5_getters.get_artist_hotttnesss(h5)) temp.append(hdf5_getters.get_danceability(h5)) temp.append(hdf5_getters.get_energy(h5)) temp.append(hdf5_getters.get_key(h5)) temp.append(hdf5_getters.get_key_confidence(h5)) temp.append(hdf5_getters.get_loudness(h5)) temp.append(hdf5_getters.get_mode(h5)) temp.append(hdf5_getters.get_mode_confidence(h5)) temp.append(hdf5_getters.get_tempo(h5)) temp.append(hdf5_getters.get_time_signature(h5)) temp.append(hdf5_getters.get_time_signature_confidence(h5)) temp = np.nan_to_num(temp) array.append(temp) h5.close()
def get_all_data(target, basedir, ext='.h5') : # header target.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( "track_id", "song_id", "title", "artist_name", "artist_location", "artist_hotttnesss", "release", "year", "song_hotttnesss", "danceability", "duration", "loudness", "sample_rate", "tempo" )) count = 0 for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: for line in f: new_file = open("tmp.txt", 'w') new_file.write(line) h5 = hdf5_getters.open_h5_file_read(new_file) target.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( hdf5_getters.get_track_id(h5), hdf5_getters.get_song_id(h5), hdf5_getters.get_title(h5), hdf5_getters.get_artist_name(h5), hdf5_getters.get_artist_location(h5), hdf5_getters.get_artist_hotttnesss(h5), hdf5_getters.get_release(h5), hdf5_getters.get_year(h5), hdf5_getters.get_song_hotttnesss(h5), hdf5_getters.get_danceability(h5), hdf5_getters.get_duration(h5), hdf5_getters.get_loudness(h5), hdf5_getters.get_analysis_sample_rate(h5), hdf5_getters.get_tempo(h5) )) # show progress count += 1 print "%d/10000" % (count) h5.close()
def get_song_data(results): songs_data = [] for f in results: h5 = getter.open_h5_file_read(f) songs_data.append([ os.path.basename(f), getter.get_artist_name(h5), getter.get_title(h5), getter.get_time_signature(h5), getter.get_key(h5), getter.get_segments_loudness_max(h5), getter.get_mode(h5), getter.get_beats_confidence(h5), getter.get_duration(h5), getter.get_tempo(h5), getter.get_loudness(h5), getter.get_segments_timbre(h5), getter.get_segments_pitches(h5), getter.get_key_confidence(h5), ]) h5.close() return songs_data
def get_all_attributes(filename): """ This function does 3 simple things: - open the song file - get all required attributes - write it to a csv file - close the files """ with open('attributes.csv', 'a') as csvfile: try: # let's apply the previous function to all files csvwriter = csv.writer(csvfile, delimiter='\t') h5 = GETTERS.open_h5_file_read(filename) RESULTS = [] RESULTS.append(GETTERS.get_year(h5)) RESULTS.append(GETTERS.get_artist_id(h5)) RESULTS.append(GETTERS.get_artist_name(h5)) RESULTS.append(GETTERS.get_artist_mbid(h5)) RESULTS.append(convert_terms(GETTERS.get_artist_terms(h5))) RESULTS.append(GETTERS.get_artist_hotttnesss(h5)) RESULTS.append(GETTERS.get_artist_latitude(h5)) RESULTS.append(GETTERS.get_artist_longitude(h5)) RESULTS.append(GETTERS.get_artist_familiarity(h5)) RESULTS.append(GETTERS.get_danceability(h5)) RESULTS.append(GETTERS.get_duration(h5)) RESULTS.append(GETTERS.get_energy(h5)) RESULTS.append(GETTERS.get_loudness(h5)) RESULTS.append(GETTERS.get_song_hotttnesss(h5)) RESULTS.append(GETTERS.get_song_id(h5)) RESULTS.append(GETTERS.get_tempo(h5)) RESULTS.append(GETTERS.get_time_signature(h5)) RESULTS.append(GETTERS.get_title(h5)) RESULTS.append(GETTERS.get_track_id(h5)) RESULTS.append(GETTERS.get_release(h5)) csvwriter.writerow(RESULTS) h5.close() except AttributeError: pass
def plot_loudness_curve(track): path = "../../msd_dense_subset/mood/"+track[2]+"/"+track[3]+"/"+track[4]+"/"+track+".h5" h5 = GETTERS.open_h5_file_read(path) segments = (GETTERS.get_segments_start(h5)) sections = (GETTERS.get_sections_start(h5)) max_loudness = (GETTERS.get_segments_loudness_max(h5)) loudness = (GETTERS.get_segments_loudness_start(h5)) average_loudness = (max_loudness + loudness) / 2 average_loudness_song = GETTERS.get_loudness(h5) start_fade_out = GETTERS.get_start_of_fade_out(h5) end_fade_in = GETTERS.get_end_of_fade_in(h5) plt.title('loudness curve for ' + ut.get_track_info(track)) plt.ylabel('Loudness(dB)') plt.xlabel('Time(s)') plt.plot(segments, average_loudness, label='Filtered') plt.axhline(average_loudness_song, color='green',label='average') for section in enumerate(sections): plt.axvline(section[1], color='red') plt.axvline(start_fade_out, color='green') plt.axvline(end_fade_in, color='green') #plt.legend(('average')) h5.close() plt.show()
def feat_from_file(path): """ Extract a list of features in an array, already converted to string """ feats = [] h5 = GETTERS.open_h5_file_read(path) # basic info feats.append(GETTERS.get_track_id(h5)) #feats.append( GETTERS.get_artist_name(h5).replace(',','') ) #feats.append( GETTERS.get_title(h5).replace(',','') ) feats.append(GETTERS.get_loudness(h5)) feats.append(GETTERS.get_tempo(h5)) feats.append(GETTERS.get_time_signature(h5)) feats.append(GETTERS.get_key(h5)) feats.append(GETTERS.get_mode(h5)) feats.append(GETTERS.get_duration(h5)) feats.append(GETTERS.get_hotnesss(h5)) segments_loudness = np.asarray(GETTERS.get_segments_loudness_max(h5)) max_segment_indice = np.argmax(segments_loudness) # timbre timbre = GETTERS.get_segments_timbre(h5) max_segment_timbre = timbre[max_segment_indice, :] avg_timbre = np.average(timbre, axis=0) for k in avg_timbre: feats.append(k) var_timbre = np.var(timbre, axis=0) for k in var_timbre: feats.append(k) for k in max_segment_timbre: feats.append(k) # done with h5 file h5.close() # makes sure we return strings feats = [str(x) for x in feats] return feats
def root_retrieve(self, path): #r=root, d=directory, f=file import csv with open('library_csv.csv', 'w') as library_csv: writer = csv.writer(library_csv) writer.writerow( ['Loudness', 'Key', 'Mode', 'Tempo', 'timeSignature', 'Title']) for r, d, f in os.walk(path): for song in f: hdf5path = os.path.join(r, song) h5 = hdf5_getters.open_h5_file_read(hdf5path) # get all getters loudness = hdf5.get_loudness(h5) key = hdf5.get_key(h5) mode = hdf5.get_mode(h5) tempo = hdf5.get_tempo(h5) ts = hdf5.get_time_signature(h5) title = hdf5.get_title(h5) writer.writerow([loudness, key, mode, tempo, ts, title]) # print them h5.close() library_csv.close() print("done")
def get_loudness(track, h5=None): #returns #0: the average loudness of the track #1: the variance of the loudness over the track #2: the difference between the highest and lowest dB value between the end of fade in and start of fade out close = (h5 == None) if h5 == None: path = "../../msd_dense_subset/dense/" + track[2] + "/" + track[ 3] + "/" + track[4] + "/" + track + ".h5" h5 = GETTERS.open_h5_file_read(path) loudness_avg = GETTERS.get_loudness(h5) loudnesses_interval = GETTERS.get_segments_loudness_max(h5) start_segments = GETTERS.get_segments_start(h5) start_fade_out = GETTERS.get_start_of_fade_out(h5) end_fade_in = GETTERS.get_end_of_fade_in(h5) idx = np.where((start_segments > end_fade_in) & (start_segments < start_fade_out)) if close: h5.close() #return (loudness_avg, np.var(loudnesses_interval[idx]), min(loudnesses_interval[idx]), max(loudnesses_interval[idx])) return (loudness_avg, np.var(loudnesses_interval[idx]), abs( max(loudnesses_interval[idx]) - abs(min(loudnesses_interval[idx]))))
def get_feats(h5): f = [] f.append(hdf5_getters.get_artist_name(h5).decode('utf8').replace(',', '')) f.append(hdf5_getters.get_title(h5).decode('utf8').replace(',', '')) f.append(str(hdf5_getters.get_loudness(h5))) f.append(str(hdf5_getters.get_tempo(h5))) f.append(str(hdf5_getters.get_time_signature(h5))) f.append(str(hdf5_getters.get_key(h5))) f.append(str(hdf5_getters.get_mode(h5))) f.append(str(hdf5_getters.get_duration(h5))) f.extend(get_statistical_feats(hdf5_getters.get_segments_timbre(h5))) f.extend(get_statistical_feats(hdf5_getters.get_segments_pitches(h5))) f.extend(get_statistical_feats(hdf5_getters.get_segments_loudness_max(h5))) f.extend( get_statistical_feats(hdf5_getters.get_segments_loudness_max_time(h5))) f.extend( get_statistical_feats(hdf5_getters.get_segments_loudness_start(h5))) f.append(str(hdf5_getters.get_song_hotttnesss(h5))) f.append(str(hdf5_getters.get_danceability(h5))) f.append(str(hdf5_getters.get_end_of_fade_in(h5))) f.append(str(hdf5_getters.get_energy(h5))) f.append(str(hdf5_getters.get_start_of_fade_out(h5))) f.append(str(hdf5_getters.get_year(h5))) return f
def get_fields(files): tracks = [] counts = {} field_counts = [] for file in files: h5 = hdf5_getters.open_h5_file_read(file) t = {} t['artist_familiarity'] = hdf5_getters.get_artist_familiarity( h5) # estimation t['artist_hotttnesss'] = hdf5_getters.get_artist_hotttnesss( h5) # estimation t['artist_name'] = hdf5_getters.get_artist_name(h5) # artist name t['release'] = hdf5_getters.get_release(h5) # album name t['title'] = hdf5_getters.get_title(h5) # title t['len_similar_artists'] = len( hdf5_getters.get_similar_artists(h5)) # number of similar artists t['analysis_sample_rate'] = hdf5_getters.get_analysis_sample_rate( h5) # sample rate of the audio used ????????? t['duration'] = hdf5_getters.get_duration(h5) # seconds t['key'] = hdf5_getters.get_key(h5) # key the song is in t['key_confidence'] = hdf5_getters.get_key_confidence( h5) # confidence measure t['loudness'] = hdf5_getters.get_loudness(h5) # overall loudness in dB t['mode_confidence'] = hdf5_getters.get_mode_confidence( h5) # confidence measure t['start_of_fade_out'] = hdf5_getters.get_start_of_fade_out( h5) # time in sec t['tempo'] = hdf5_getters.get_tempo(h5) # estimated tempo in BPM t['time_signature'] = hdf5_getters.get_time_signature( h5) # estimate of number of beats per bar, e.g. 4 t['year'] = hdf5_getters.get_year( h5) # song release year from MusicBrainz or 0 timbre = hdf5_getters.get_segments_timbre( h5) # 2D float array, texture features (MFCC+PCA-like) t['segments_timbre'] = timbre t['timbre_avg'] = timbre.mean(axis=0) # list of 12 averages cov_mat_timbre = np.cov(timbre, rowvar=False) cov_timbre = [] for i in range(len(cov_mat_timbre)): for j in range(len(cov_mat_timbre) - i): cov_timbre.append(cov_mat_timbre[i][j]) t['timbre_cov'] = cov_timbre # list of 78 covariances pitch = hdf5_getters.get_segments_pitches( h5) # 2D float array, chroma feature, one value per note t['segments_pitch'] = pitch t['pitch_avg'] = pitch.mean(axis=0) # list of 12 averages cov_mat_pitch = np.cov(pitch, rowvar=False) cov_pitch = [] for i in range(len(cov_mat_pitch)): for j in range(len(cov_mat_pitch) - i): cov_pitch.append(cov_mat_timbre[i][j]) t['pitch_cov'] = cov_pitch # list of 78 covariances # seg_pitch = hdf5_getters.get_segments_pitches(h5) # 2D float array, chroma feature, one value per note # print(seg_pitch.shape) # t['artist_latitude'] = hdf5_getters.get_artist_latitude(h5) # float, ???????????????????????????????????????? # t['artist_longitude'] = hdf5_getters.get_artist_longitude(h5) # float, ?????????????????????????????????????? # t['artist_location'] = hdf5_getters.get_artist_location(h5) # location name # t['song_hotttnesss'] = hdf5_getters.get_song_hotttnesss(h5) # estimation # t['danceability'] = hdf5_getters.get_danceability(h5) # estimation # t['end_of_fade_in'] = hdf5_getters.get_end_of_fade_in(h5) # seconds at the beginning of the song # t['energy'] = hdf5_getters.get_energy(h5) # energy from listener point of view # t['mode'] = hdf5_getters.get_mode(h5) # major or minor # t['time_signature_confidence'] = hdf5_getters.get_time_signature_confidence(h5) # confidence measure # t['artist_mbtags_count'] = len(hdf5_getters.get_artist_mbtags_count(h5)) # array int, tag counts for musicbrainz tags # bad types or non arithmatic numbers ''' # t['audio_md5'] = hdf5_getters.get_audio_md5(h5) # hash code of the audio used for the analysis by The Echo Nest # t['artist_terms_weight'] = hdf5_getters.get_artist_terms_weight(h5) # array float, echonest tags weight ????? # t['artist_terms_freq'] = hdf5_getters.get_artist_terms_freq(h5) # array float, echonest tags freqs ?????????? # t['artist_terms'] = hdf5_getters.get_artist_terms(h5) # array string, echonest tags ????????????????????????? # t['artist_id'] = hdf5_getters.get_artist_id(h5) # echonest id # t['artist_mbid'] = hdf5_getters.get_artist_mbid(h5) # musicbrainz id # t['artist_playmeid'] = hdf5_getters.get_artist_playmeid(h5) # playme id # t['artist_7digitalid'] = hdf5_getters.get_artist_7digitalid(h5) # 7digital id # t['release_7digitalid'] = hdf5_getters.get_release_7digitalid(h5) # 7digital id # t['song_id'] = hdf5_getters.get_song_id(h5) # echonest id # t['track_7digitalid'] = hdf5_getters.get_track_7digitalid(h5) # 7digital id # t['similar_artists'] = hdf5_getters.get_similar_artists(h5) # string array of sim artist ids # t['track_id'] = hdf5_getters.get_track_id(h5) # echonest track id # t['segments_start'] = hdf5_getters.get_segments_start(h5) # array floats, musical events, ~ note onsets # t['segments_confidence'] = hdf5_getters.get_segments_confidence(h5) # array floats, confidence measure # t['segments_pitches'] = hdf5_getters.get_segments_pitches(h5) # 2D float array, chroma feature, one value per note # t['segments_timbre'] = hdf5_getters.get_segments_timbre(h5) # 2D float array, texture features (MFCC+PCA-like) # t['segments_loudness_max'] = hdf5_getters.get_segments_loudness_max(h5) # float array, max dB value # t['segments_loudness_max_time'] = hdf5_getters.get_segments_loudness_max_time(h5) # float array, time of max dB value, i.e. end of attack # t['segments_loudness_start'] = hdf5_getters.get_segments_loudness_start(h5) # array float, dB value at onset # t['sections_start'] = hdf5_getters.get_sections_start(h5) # array float, largest grouping in a song, e.g. verse # t['sections_confidence'] = hdf5_getters.get_sections_confidence(h5) # array float, confidence measure # t['beats_start'] = hdf5_getters.get_beats_start(h5) # array float, result of beat tracking # t['beats_confidence'] = hdf5_getters.get_beats_confidence(h5) # array float, confidence measure # t['bars_start'] = hdf5_getters.get_bars_start(h5) # array float, beginning of bars, usually on a beat # t['bars_confidence'] = hdf5_getters.get_bars_confidence(h5) # array float, confidence measure # t['tatums_start'] = hdf5_getters.get_tatums_start(h5) # array float, smallest rythmic element # t['tatums_confidence'] = hdf5_getters.get_tatums_confidence(h5) # array float, confidence measure # t['artist_mbtags'] = hdf5_getters.get_artist_mbtags(h5) # array string, tags from musicbrainz.org ''' h5.close() for key, value in t.items(): if isinstance(value, float) and math.isnan(value): pass if type(value) is np.ndarray: if key in counts.keys(): counts[key] += 1 else: counts[key] = 1 elif value: if key in counts.keys(): counts[key] += 1 else: counts[key] = 1 elif key not in counts.keys(): counts[key] = 0 count = 0 for key, value in t.items(): if isinstance(value, float) and math.isnan(value): pass elif type(value) is np.ndarray: count += 1 elif value: count += 1 field_counts.append(count) # progress bar if num_of_tracks >= 100: i = files.index(file) + 1 scale = num_of_tracks / 100 if i % math.ceil(len(files) * .05) == 0: sys.stdout.write('\r') # the exact output you're looking for: sys.stdout.write("Loading dataframe: [%-100s] %d%%" % ('=' * int(i // scale), 1 / scale * i)) sys.stdout.flush() time.sleep(.01) tracks.append(t) print() return tracks, counts, field_counts
def func_to_extract_features(filename): """ This function does 3 simple things: - open the song file - get artist ID and put it - close the file """ global cntnan global listfeatures cf = [] h5 = GETTERS.open_h5_file_read(filename) nanfound = 0 # Get target feature: song hotness # FEATURE 0 song_hotness = GETTERS.get_song_hotttnesss(h5) if math.isnan(song_hotness): nanfound = 1 cntnan = cntnan + 1 h5.close() return 0 elif song_hotness > 0.3 and song_hotness < 0.6: h5.close() return 0 else: cf.append(song_hotness) # FEATURE 1 # Get song loudness song_loudness = GETTERS.get_loudness(h5) if math.isnan(song_loudness): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_loudness) # FEATURE 2 # Get key of the song song_key = GETTERS.get_key(h5) if math.isnan(song_key): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_key) # FEATURE 3 # Get duration of the song song_duration = GETTERS.get_duration(h5) if math.isnan(song_duration): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_duration) # Feature 4 # Get song tempo song_tempo = GETTERS.get_tempo(h5) if math.isnan(song_tempo): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_tempo) # Feature 5: artist familarity artist_familiarity = GETTERS.get_artist_familiarity(h5) if math.isnan(artist_familiarity): nanfound = 1 cntnan = cntnan + 1 else: cf.append(artist_familiarity) # Feature 6: artist_hotness artist_hotness = GETTERS.get_artist_hotttnesss(h5) if math.isnan(artist_hotness): nanfound = 1 cntnan = cntnan + 1 else: cf.append(artist_hotness) # Feature 7 time signature time_signature = GETTERS.get_time_signature(h5) cf.append(time_signature) # Feature 8 # Loudness COV loudness_segments = np.array(GETTERS.get_segments_loudness_max(h5)) loudness_cov = abs(variation(loudness_segments)) if math.isnan(loudness_cov): nanfound = 1 cntnan = cntnan + 1 else: cf.append(loudness_cov) # Feature 9 # Beat COV beat_segments = np.array(GETTERS.get_beats_start(h5)) beat_cov = abs(variation(beat_segments)) if math.isnan(beat_cov): nanfound = 1 cntnan = cntnan + 1 else: cf.append(beat_cov) # Feature 10 # Year song_year = GETTERS.get_year(h5) if song_year == 0: nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_year) if nanfound == 0: strlist = list_to_csv(cf) listfeatures.append(strlist) strtitle = GETTERS.get_title(h5) listtitle.append(strtitle) h5.close()
def data_to_flat_file(basedir,ext='.h5') : """This function extract the information from the tables and creates the flat file.""" count = 0; #song counter list_to_write= [] row_to_write = "" writer = csv.writer(open("metadata.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: print f #the name of the file h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title= title.replace('"','') comma=title.find(',') #eliminating commas in the title if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album= album.replace('"','') #eliminating commas in the album comma=album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma=artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name= artist_name.replace('"','') #eliminating double quotes duration = hdf5_getters.get_duration(h5) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam=-1 artist_hotness= hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness=-1 artist_id = hdf5_getters.get_artist_id(h5) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat=-1 artist_loc = hdf5_getters.get_artist_location(h5) #checks artist_loc to see if it is a hyperlink if it is set as empty string artist_loc = artist_loc.replace(",", "\,"); if artist_loc.startswith("<a"): artist_loc = "" if len(artist_loc) > 100: artist_loc = "" artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon=-1 artist_mbid = hdf5_getters.get_artist_mbid(h5) artist_pmid = hdf5_getters.get_artist_playmeid(h5) audio_md5 = hdf5_getters.get_audio_md5(h5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability=-1 end_fade_in =hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in=-1 energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy=-1 song_key = hdf5_getters.get_key(h5) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c=-1 loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness=-1 mode = hdf5_getters.get_mode(h5) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf=-1 release_7digitalid = hdf5_getters.get_release_7digitalid(h5) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot=-1 song_id = hdf5_getters.get_song_id(h5) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo=-1 time_sig = hdf5_getters.get_time_signature(h5) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c=-1 track_id = hdf5_getters.get_track_id(h5) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) year = hdf5_getters.get_year(h5) bars_c = hdf5_getters.get_bars_confidence(h5) bars_c_avg= get_avg(bars_c) bars_c_max= get_max(bars_c) bars_c_min = get_min(bars_c) bars_c_stddev= get_stddev(bars_c) bars_c_count = get_count(bars_c) bars_c_sum = get_sum(bars_c) bars_start = hdf5_getters.get_bars_start(h5) bars_start_avg = get_avg(bars_start) bars_start_max= get_max(bars_start) bars_start_min = get_min(bars_start) bars_start_stddev= get_stddev(bars_start) bars_start_count = get_count(bars_start) bars_start_sum = get_sum(bars_start) beats_c = hdf5_getters.get_beats_confidence(h5) beats_c_avg= get_avg(beats_c) beats_c_max= get_max(beats_c) beats_c_min = get_min(beats_c) beats_c_stddev= get_stddev(beats_c) beats_c_count = get_count(beats_c) beats_c_sum = get_sum(beats_c) beats_start = hdf5_getters.get_beats_start(h5) beats_start_avg = get_avg(beats_start) beats_start_max= get_max(beats_start) beats_start_min = get_min(beats_start) beats_start_stddev= get_stddev(beats_start) beats_start_count = get_count(beats_start) beats_start_sum = get_sum(beats_start) sec_c = hdf5_getters.get_sections_confidence(h5) sec_c_avg= get_avg(sec_c) sec_c_max= get_max(sec_c) sec_c_min = get_min(sec_c) sec_c_stddev= get_stddev(sec_c) sec_c_count = get_count(sec_c) sec_c_sum = get_sum(sec_c) sec_start = hdf5_getters.get_sections_start(h5) sec_start_avg = get_avg(sec_start) sec_start_max= get_max(sec_start) sec_start_min = get_min(sec_start) sec_start_stddev= get_stddev(sec_start) sec_start_count = get_count(sec_start) sec_start_sum = get_sum(sec_start) seg_c = hdf5_getters.get_segments_confidence(h5) seg_c_avg= get_avg(seg_c) seg_c_max= get_max(seg_c) seg_c_min = get_min(seg_c) seg_c_stddev= get_stddev(seg_c) seg_c_count = get_count(seg_c) seg_c_sum = get_sum(seg_c) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) seg_loud_max_avg= get_avg(seg_loud_max) seg_loud_max_max= get_max(seg_loud_max) seg_loud_max_min = get_min(seg_loud_max) seg_loud_max_stddev= get_stddev(seg_loud_max) seg_loud_max_count = get_count(seg_loud_max) seg_loud_max_sum = get_sum(seg_loud_max) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) seg_loud_max_time_avg= get_avg(seg_loud_max_time) seg_loud_max_time_max= get_max(seg_loud_max_time) seg_loud_max_time_min = get_min(seg_loud_max_time) seg_loud_max_time_stddev= get_stddev(seg_loud_max_time) seg_loud_max_time_count = get_count(seg_loud_max_time) seg_loud_max_time_sum = get_sum(seg_loud_max_time) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) seg_loud_start_avg= get_avg(seg_loud_start) seg_loud_start_max= get_max(seg_loud_start) seg_loud_start_min = get_min(seg_loud_start) seg_loud_start_stddev= get_stddev(seg_loud_start) seg_loud_start_count = get_count(seg_loud_start) seg_loud_start_sum = get_sum(seg_loud_start) seg_pitch = hdf5_getters.get_segments_pitches(h5) pitch_size = len(seg_pitch) seg_start = hdf5_getters.get_segments_start(h5) seg_start_avg= get_avg(seg_start) seg_start_max= get_max(seg_start) seg_start_min = get_min(seg_start) seg_start_stddev= get_stddev(seg_start) seg_start_count = get_count(seg_start) seg_start_sum = get_sum(seg_start) seg_timbre = hdf5_getters.get_segments_timbre(h5) tatms_c = hdf5_getters.get_tatums_confidence(h5) tatms_c_avg= get_avg(tatms_c) tatms_c_max= get_max(tatms_c) tatms_c_min = get_min(tatms_c) tatms_c_stddev= get_stddev(tatms_c) tatms_c_count = get_count(tatms_c) tatms_c_sum = get_sum(tatms_c) tatms_start = hdf5_getters.get_tatums_start(h5) tatms_start_avg= get_avg(tatms_start) tatms_start_max= get_max(tatms_start) tatms_start_min = get_min(tatms_start) tatms_start_stddev= get_stddev(tatms_start) tatms_start_count = get_count(tatms_start) tatms_start_sum = get_sum(tatms_start) #Getting the genres genre_set = 0 #flag to see if the genre has been set or not art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq final_genre=[] genres_so_far=[] for i in range(len(genre_indexes)): genre_tmp=get_genre(art_trm,genre_indexes[i]) #genre that corresponds to the highest freq genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set=1 #genre was found in dictionary if genre_set == 1: col_num=[] for genre in final_genre: column=int(genre) #getting the column number of the genre col_num.append(column) genre_array=genre_columns(col_num) #genre array else: genre_array=genre_columns(-1) #the genre was not found in the dictionary transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg=[] seg_pitch_max=[] seg_pitch_min=[] seg_pitch_stddev=[] seg_pitch_count=[] seg_pitch_sum=[] i=0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i=i+1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg=[] seg_timbre_max=[] seg_timbre_min=[] seg_timbre_stddev=[] seg_timbre_count=[] seg_timbre_sum=[] i=0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i=i+1 #Writing to the flat file writer.writerow([title,album,artist_name,duration,samp_rt,artist_7digitalid,artist_fam,artist_hotness,artist_id,artist_lat,artist_loc,artist_lon,artist_mbid,genre_array[0],genre_array[1],genre_array[2], genre_array[3],genre_array[4],genre_array[5],genre_array[6],genre_array[7],genre_array[8],genre_array[9],genre_array[10],genre_array[11],genre_array[12],genre_array[13],genre_array[14],genre_array[15], genre_array[16],genre_array[17],genre_array[18],genre_array[19],genre_array[20],genre_array[21],genre_array[22],genre_array[23],genre_array[24],genre_array[25],genre_array[26], genre_array[27],genre_array[28],genre_array[29],genre_array[30],genre_array[31],genre_array[32],genre_array[33],genre_array[34],genre_array[35],genre_array[36],genre_array[37],genre_array[38], genre_array[39],genre_array[40],genre_array[41],genre_array[42],genre_array[43],genre_array[44],genre_array[45],genre_array[46],genre_array[47],genre_array[48],genre_array[49], genre_array[50],genre_array[51],genre_array[52],genre_array[53],genre_array[54],genre_array[55],genre_array[56],genre_array[57],genre_array[58],genre_array[59], genre_array[60],genre_array[61],genre_array[62],genre_array[63],genre_array[64],genre_array[65],genre_array[66],genre_array[67],genre_array[68],genre_array[69], genre_array[70],genre_array[71],genre_array[72],genre_array[73],genre_array[74],genre_array[75],genre_array[76],genre_array[77],genre_array[78],genre_array[79], genre_array[80],genre_array[81],genre_array[82],genre_array[83],genre_array[84],genre_array[85],genre_array[86],genre_array[87],genre_array[88],genre_array[89], genre_array[90],genre_array[91],genre_array[92],genre_array[93],genre_array[94],genre_array[95],genre_array[96],genre_array[97],genre_array[98],genre_array[99],genre_array[100],genre_array[101], genre_array[102],genre_array[103],genre_array[104],genre_array[105],genre_array[106],genre_array[107],genre_array[108],genre_array[109],genre_array[110],genre_array[111],genre_array[112], genre_array[113],genre_array[114],genre_array[115],genre_array[116],genre_array[117],genre_array[118],genre_array[119],genre_array[120],genre_array[121],genre_array[122],genre_array[123], genre_array[124],genre_array[125],genre_array[126],genre_array[127],genre_array[128],genre_array[129],genre_array[130],genre_array[131],genre_array[132], artist_pmid,audio_md5,danceability,end_fade_in,energy,song_key,key_c,loudness,mode,mode_conf,release_7digitalid,song_hot,song_id,start_fade_out,tempo,time_sig,time_sig_c,track_id,track_7digitalid,year,bars_c_avg,bars_c_max,bars_c_min,bars_c_stddev,bars_c_count,bars_c_sum,bars_start_avg,bars_start_max,bars_start_min,bars_start_stddev,bars_start_count,bars_start_sum,beats_c_avg,beats_c_max,beats_c_min,beats_c_stddev,beats_c_count,beats_c_sum,beats_start_avg,beats_start_max,beats_start_min, beats_start_stddev,beats_start_count,beats_start_sum, sec_c_avg,sec_c_max,sec_c_min,sec_c_stddev,sec_c_count,sec_c_sum,sec_start_avg,sec_start_max,sec_start_min,sec_start_stddev,sec_start_count,sec_start_sum,seg_c_avg,seg_c_max,seg_c_min,seg_c_stddev,seg_c_count,seg_c_sum,seg_loud_max_avg,seg_loud_max_max,seg_loud_max_min,seg_loud_max_stddev,seg_loud_max_count,seg_loud_max_sum,seg_loud_max_time_avg,seg_loud_max_time_max,seg_loud_max_time_min,seg_loud_max_time_stddev,seg_loud_max_time_count,seg_loud_max_time_sum,seg_loud_start_avg,seg_loud_start_max,seg_loud_start_min,seg_loud_start_stddev,seg_loud_start_count,seg_loud_start_sum,seg_pitch_avg[0],seg_pitch_max[0],seg_pitch_min[0],seg_pitch_stddev[0],seg_pitch_count[0],seg_pitch_sum[0],seg_pitch_avg[1],seg_pitch_max[1],seg_pitch_min[1],seg_pitch_stddev[1],seg_pitch_count[1],seg_pitch_sum[1],seg_pitch_avg[2],seg_pitch_max[2],seg_pitch_min[2],seg_pitch_stddev[2],seg_pitch_count[2],seg_pitch_sum[2],seg_pitch_avg[3],seg_pitch_max[3],seg_pitch_min[3],seg_pitch_stddev[3],seg_pitch_count[3],seg_pitch_sum[3],seg_pitch_avg[4],seg_pitch_max[4],seg_pitch_min[4],seg_pitch_stddev[4],seg_pitch_count[4],seg_pitch_sum[4],seg_pitch_avg[5],seg_pitch_max[5],seg_pitch_min[5],seg_pitch_stddev[5],seg_pitch_count[5],seg_pitch_sum[5],seg_pitch_avg[6],seg_pitch_max[6],seg_pitch_min[6],seg_pitch_stddev[6],seg_pitch_count[6],seg_pitch_sum[6],seg_pitch_avg[7],seg_pitch_max[7],seg_pitch_min[7],seg_pitch_stddev[7],seg_pitch_count[7],seg_pitch_sum[7],seg_pitch_avg[8],seg_pitch_max[8],seg_pitch_min[8],seg_pitch_stddev[8],seg_pitch_count[8],seg_pitch_sum[8],seg_pitch_avg[9],seg_pitch_max[9],seg_pitch_min[9],seg_pitch_stddev[9],seg_pitch_count[9],seg_pitch_sum[9],seg_pitch_avg[10],seg_pitch_max[10],seg_pitch_min[10],seg_pitch_stddev[10],seg_pitch_count[10],seg_pitch_sum[10],seg_pitch_avg[11],seg_pitch_max[11],seg_pitch_min[11], seg_pitch_stddev[11],seg_pitch_count[11],seg_pitch_sum[11],seg_start_avg,seg_start_max,seg_start_min,seg_start_stddev, seg_start_count,seg_start_sum,seg_timbre_avg[0],seg_timbre_max[0],seg_timbre_min[0],seg_timbre_stddev[0],seg_timbre_count[0], seg_timbre_sum[0],seg_timbre_avg[1],seg_timbre_max[1],seg_timbre_min[1],seg_timbre_stddev[1],seg_timbre_count[1], seg_timbre_sum[1],seg_timbre_avg[2],seg_timbre_max[2],seg_timbre_min[2],seg_timbre_stddev[2],seg_timbre_count[2], seg_timbre_sum[2],seg_timbre_avg[3],seg_timbre_max[3],seg_timbre_min[3],seg_timbre_stddev[3],seg_timbre_count[3], seg_timbre_sum[3],seg_timbre_avg[4],seg_timbre_max[4],seg_timbre_min[4],seg_timbre_stddev[4],seg_timbre_count[4], seg_timbre_sum[4],seg_timbre_avg[5],seg_timbre_max[5],seg_timbre_min[5],seg_timbre_stddev[5],seg_timbre_count[5], seg_timbre_sum[5],seg_timbre_avg[6],seg_timbre_max[6],seg_timbre_min[6],seg_timbre_stddev[6],seg_timbre_count[6], seg_timbre_sum[6],seg_timbre_avg[7],seg_timbre_max[7],seg_timbre_min[7],seg_timbre_stddev[7],seg_timbre_count[7], seg_timbre_sum[7],seg_timbre_avg[8],seg_timbre_max[8],seg_timbre_min[8],seg_timbre_stddev[8],seg_timbre_count[8], seg_timbre_sum[8],seg_timbre_avg[9],seg_timbre_max[9],seg_timbre_min[9],seg_timbre_stddev[9],seg_timbre_count[9], seg_timbre_sum[9],seg_timbre_avg[10],seg_timbre_max[10],seg_timbre_min[10],seg_timbre_stddev[10],seg_timbre_count[10], seg_timbre_sum[10],seg_timbre_avg[11],seg_timbre_max[11],seg_timbre_min[11],seg_timbre_stddev[11],seg_timbre_count[11], seg_timbre_sum[11],tatms_c_avg,tatms_c_max,tatms_c_min,tatms_c_stddev,tatms_c_count,tatms_c_sum,tatms_start_avg,tatms_start_max,tatms_start_min,tatms_start_stddev,tatms_start_count,tatms_start_sum]) h5.close() count=count+1; print count;
def func_to_extract_features(filename): """ This function does 3 simple things: - open the song file - get artist ID and put it - close the file """ global cntnan global cntdanceability global listfeatures cf = [] h5 = GETTERS.open_h5_file_read(filename) nanfound = 0 #Get target feature: song hotness song_hotness = GETTERS.get_song_hotttnesss(h5) if math.isnan(song_hotness): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_hotness) #Get danceablity # song_danceability = GETTERS.get_danceability(h5) # if song_danceability == 0: # nanfound = 1 # cntnan = cntnan + 1 # else: # cf.append(song_danceability) #Get song loudness song_loudness = GETTERS.get_loudness(h5) if math.isnan(song_loudness): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_loudness) #Get song energy # song_energy = GETTERS.get_energy(h5) # if song_energy == 0: # nanfound = 1 # cntnan = cntnan + 1 # else: # cf.append(song_energy) #Get key of the song song_key = GETTERS.get_key(h5) if math.isnan(song_key): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_key) #Get mode of the song song_mode = GETTERS.get_mode(h5) if math.isnan(song_mode): nanfound = 1 cntnan = cntnan + 1 elif song_mode == 0: nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_mode) #Get duration of the song song_duration = GETTERS.get_duration(h5) if math.isnan(song_duration): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_duration) #Get Average Pitch Class across all segments #Get the pitches (12 pitches histogram for each segment) pitches = GETTERS.get_segments_pitches(h5) M = np.mat(pitches) meanpitches = M.mean(axis=0) pitches_arr = np.asarray(meanpitches) pitches_list = [] for i in range(0,12): pitches_list.append(pitches_arr[0][i]) cf.append(pitches_list) #Get Average Timbre Class across all segments timbres = GETTERS.get_segments_timbre(h5) M = np.mat(timbres) meantimbres = M.mean(axis=0) timbre_arr = np.asarray(meantimbres) timbre_list = [] for i in range(0,12): timbre_list.append(timbre_arr[0][i]) cf.append(timbre_list) #Get song year song_year = GETTERS.get_year(h5) if song_year == 0: nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_year) if nanfound == 0: strlist = list_to_csv(cf) print strlist listfeatures.append(strlist) h5.close()
def getLoudness(h5): #Returns the loudness of the song return [hdf5_getters.get_loudness(h5)]
# Check existence of the HDF5 file if not os.path.isfile(hdf5path): continue #print('ERROR: file',hdf5path,'does not exist.') #sys.exit(0) h5 = hdf5_getters.open_h5_file_read(hdf5path) # Retrieve features from HDF5 danceability = hdf5_getters.get_danceability(h5) duration = hdf5_getters.get_duration(h5) time_of_fade_in = hdf5_getters.get_end_of_fade_in(h5) energy = hdf5_getters.get_energy(h5) key = hdf5_getters.get_key(h5) key_confidence = hdf5_getters.get_key_confidence(h5) loudness = hdf5_getters.get_loudness(h5) mode = hdf5_getters.get_mode(h5) mode_confidence = hdf5_getters.get_mode_confidence(h5) sections_start = hdf5_getters.get_sections_start(h5) num_sections = len(sections_start) if num_sections == 0: h5.close() continue segments_loudness_max = hdf5_getters.get_segments_loudness_max(h5) segments_loudness_start = hdf5_getters.get_segments_loudness_start(h5) num_segments = len(hdf5_getters.get_segments_start(h5)) num_tatums = len(hdf5_getters.get_tatums_start(h5)) time_of_fade_out = duration - hdf5_getters.get_start_of_fade_out(h5) tempo = hdf5_getters.get_tempo(h5) time_signature = hdf5_getters.get_time_signature(h5) time_signature_confidence = hdf5_getters.get_time_signature_confidence(h5)
def classify(h5): output_array={} # duration duration=hdf5_getters.get_duration(h5) output_array["duration"]=duration ### ADDED VALUE TO ARRAY # number of bars bars=hdf5_getters.get_bars_start(h5) num_bars=len(bars) output_array["num_bars"]=num_bars ### ADDED VALUE TO ARRAY # mean and variance in bar length bar_length=numpy.ediff1d(bars) variance_bar_length=numpy.var(bar_length) output_array["variance_bar_length"]=variance_bar_length ### ADDED VALUE TO ARRAY # number of beats beats=hdf5_getters.get_beats_start(h5) num_beats=len(beats) output_array["num_beats"]=num_beats ### ADDED VALUE TO ARRAY # mean and variance in beats length beats_length=numpy.ediff1d(beats) variance_beats_length=numpy.var(bar_length) output_array["variance_beats_length"]=variance_beats_length ### ADDED VALUE TO ARRAY # danceability danceability=hdf5_getters.get_danceability(h5) output_array["danceability"]=danceability ### ADDED VALUE TO ARRAY # end of fade in end_of_fade_in=hdf5_getters.get_end_of_fade_in(h5) output_array["end_of_fade_in"]=end_of_fade_in ### ADDED VALUE TO ARRAY # energy energy=hdf5_getters.get_energy(h5) output_array["energy"]=energy ### ADDED VALUE TO ARRAY # key key=hdf5_getters.get_key(h5) output_array["key"]=int(key) ### ADDED VALUE TO ARRAY # loudness loudness=hdf5_getters.get_loudness(h5) output_array["loudness"]=loudness ### ADDED VALUE TO ARRAY # mode mode=hdf5_getters.get_mode(h5) output_array["mode"]=int(mode) ### ADDED VALUE TO ARRAY # number sections sections=hdf5_getters.get_sections_start(h5) num_sections=len(sections) output_array["num_sections"]=num_sections ### ADDED VALUE TO ARRAY # mean and variance in sections length sections_length=numpy.ediff1d(sections) variance_sections_length=numpy.var(sections) output_array["variance_sections_length"]=variance_sections_length ### ADDED VALUE TO ARRAY # number segments segments=hdf5_getters.get_segments_start(h5) num_segments=len(segments) output_array["num_segments"]=num_segments ### ADDED VALUE TO ARRAY # mean and variance in segments length segments_length=numpy.ediff1d(segments) variance_segments_length=numpy.var(segments) output_array["variance_segments_length"]=variance_segments_length ### ADDED VALUE TO ARRAY # segment loudness max segment_loudness_max_array=hdf5_getters.get_segments_loudness_max(h5) segment_loudness_max_time_array=hdf5_getters.get_segments_loudness_max_time(h5) segment_loudness_max_index=0 for i in range(len(segment_loudness_max_array)): if segment_loudness_max_array[i]>segment_loudness_max_array[segment_loudness_max_index]: segment_loudness_max_index=i segment_loudness_max=segment_loudness_max_array[segment_loudness_max_index] segment_loudness_max_time=segment_loudness_max_time_array[segment_loudness_max_index] output_array["segment_loudness_max"]=segment_loudness_max ### ADDED VALUE TO ARRAY output_array["segment_loudness_time"]=segment_loudness_max_time ### ADDED VALUE TO ARRAY # POSSIBLE TODO: use average function instead and weight by segment length # segment loudness mean (start) segment_loudness_array=hdf5_getters.get_segments_loudness_start(h5) segment_loudness_mean=numpy.mean(segment_loudness_array) output_array["segment_loudness_mean"]=segment_loudness_mean ### ADDED VALUE TO ARRAY # segment loudness variance (start) segment_loudness_variance=numpy.var(segment_loudness_array) output_array["segment_loudness_variance"]=segment_loudness_variance ### ADDED VALUE TO ARRAY # segment pitches segment_pitches_array=hdf5_getters.get_segments_pitches(h5) segment_pitches_mean=numpy.mean(segment_pitches_array,axis=0).tolist() output_array["segment_pitches_mean"]=segment_pitches_mean # segment pitches variance (start) segment_pitches_variance=numpy.var(segment_pitches_array,axis=0).tolist() output_array["segment_pitches_variance"]=segment_pitches_variance # segment timbres segment_timbres_array=hdf5_getters.get_segments_timbre(h5) segment_timbres_mean=numpy.mean(segment_timbres_array,axis=0).tolist() output_array["segment_timbres_mean"]=segment_timbres_mean # segment timbres variance (start) segment_timbres_variance=numpy.var(segment_timbres_array,axis=0).tolist() output_array["segment_timbres_variance"]=segment_timbres_variance # hotttnesss hottness=hdf5_getters.get_song_hotttnesss(h5,0) output_array["hottness"]=hottness ### ADDED VALUE TO ARRAY # duration-start of fade out start_of_fade_out=hdf5_getters.get_start_of_fade_out(h5) fade_out=duration-start_of_fade_out output_array["fade_out"]=fade_out ### ADDED VALUE TO ARRAY # tatums tatums=hdf5_getters.get_tatums_start(h5) num_tatums=len(tatums) output_array["num_tatums"]=num_tatums ### ADDED VALUE TO ARRAY # mean and variance in tatums length tatums_length=numpy.ediff1d(tatums) variance_tatums_length=numpy.var(tatums_length) output_array["variance_tatums_length"]=variance_tatums_length ### ADDED VALUE TO ARRAY # tempo tempo=hdf5_getters.get_tempo(h5) output_array["tempo"]=tempo ### ADDED VALUE TO ARRAY # time signature time_signature=hdf5_getters.get_time_signature(h5) output_array["time_signature"]=int(time_signature) ### ADDED VALUE TO ARRAY # year year=hdf5_getters.get_year(h5) output_array["year"]=int(year) ### ADDED VALUE TO ARRAY # artist terms artist_terms=hdf5_getters.get_artist_terms(h5,0) output_array["artist_terms"]=artist_terms.tolist() artist_terms_freq=hdf5_getters.get_artist_terms_freq(h5,0) output_array["artist_terms_freq"]=artist_terms_freq.tolist() artist_name=hdf5_getters.get_artist_name(h5,0) output_array["artist_name"]=artist_name artist_id=hdf5_getters.get_artist_id(h5,0) output_array["artist_id"]=artist_id # title title=hdf5_getters.get_title(h5,0) output_array["title"]=title return output_array
def get_all_titles(basedir,ext='.h5') : global errorcount global count global cap global truecount features = [] decade = [] decadecount = defaultdict(int) timbre = None i = 0 for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: feature = [] try: h5 = hdf5_getters.open_h5_file_read(f) except HDF5ExtError as e: errorcount += 1 print "Unexpected error:", sys.exc_info()[0] print traceback.format_exc() continue year = hdf5_getters.get_year(h5) print i i+=1 if year == 0: h5.close() continue label = getbin(year) #label = (year/10)*10 truecount[label] += 1 if decadecount[label] > cap: flag = checkforcompletion(decadecount) h5.close() if flag: for dec in decadecount.keys(): print 'Decade : ' + str(dec) + ' Count : ' + str(decadecount[dec]) return features,decade continue # dec = (year/10)*10 # if dec < 1960: # h5.close() # continue # try: # # bttimbre = bt.get_bttimbre(h5) # timbres = bttimbre.argmax(axis = 0) + 1 # Is a vector of timbre values sutiable for training an HMM # for timbre in timbres: # timbredict[timbre] += 1 # for i in range(1,13): # feature.append(timbredict[i]) # except: # h5.close() # continue # clustercount = {} # for x in range(12): # clustercount[x] = 0 # try: # bttimbre = bt.get_bttimbre(h5) # btT = bttimbre.T # for x in btT: # timbre = x.argmax(axis = 0) # clustercount[timbre]+=1 # except: # h5.close() # continue # for y in range(12): # features.append(clustercount[y]) try: btchromas = bt.get_btchromas(h5) for chroma in btchromas: feature.append(mean(chroma)) covmat = get_covariance(btchromas) feature.extend(covmat) bttimbre = bt.get_bttimbre(h5) for timbre in bttimbre: feature.append(mean(timbre)) covmat = get_covariance(bttimbre) feature.extend(covmat) # btT = bttimbre.T # for x in btT: # timbre = x.argmax(axis = 0) # clustercount[timbre]+=1 # for y in range(12): # feature.append(clustercount[y]) except: errorcount += 1 h5.close() continue loudness = hdf5_getters.get_loudness(h5) feature.append(loudness) duration = hdf5_getters.get_duration(h5) feature.append(duration) features.append(feature) decade.append(label) decadecount[label] += 1 count += 1 h5.close() # title = hdf5_getters.get_title(h5) # segstarts = hdf5_getters.get_segments_start(h5) # segstarts = np.array(segstarts).flatten() # btstarts = hdf5_getters.get_beats_start(h5) # btstarts = np.array(btstarts).flatten() for dec in decadecount.keys(): print 'Decade : ' + str(dec) + ' Count : ' + str(decadecount[dec]) return features,decade
def get_loudness(self): if self.h5 == None: self.open() return hdf5_getters.get_loudness(self.h5)
filepaths = glob.glob(glob_path) for filepath in tqdm(filepaths): h5 = hdf5_getters.open_h5_file_read(filepath) n = hdf5_getters.get_num_songs(h5) for row in range(n): song_id = hdf5_getters.get_song_id(h5, songidx=row).decode('UTF-8') # artist = hdf5_getters.get_artist_name(h5,songidx=row).decode('UTF-8') # title= hdf5_getters.get_title(h5,songidx=row)#.decode('UTF-8') # artist = "".join(c for c in unicodedata.normalize('NFD', str(artist.decode("utf8"))) if unicodedata.category(c) != "Mn") # title = "".join(c for c in unicodedata.normalize('NFD', str(title.decode("utf8"))) if unicodedata.category(c) != "Mn") #single number features danceability = hdf5_getters.get_danceability(h5, songidx=row) duration = hdf5_getters.get_duration(h5, songidx=row) energy = hdf5_getters.get_energy(h5, songidx=row) loudness = hdf5_getters.get_loudness(h5, songidx=row) musicalKey = hdf5_getters.get_key(h5, songidx=row) mode = hdf5_getters.get_mode(h5, songidx=row) tempo = hdf5_getters.get_tempo(h5, songidx=row) time_signature = hdf5_getters.get_time_signature(h5, songidx=row) year = hdf5_getters.get_year(h5, songidx=row) song_hottness = hdf5_getters.get_song_hotttnesss(h5, songidx=row) end_of_fade_in = hdf5_getters.get_end_of_fade_in(h5, songidx=row) start_of_fade_out = hdf5_getters.get_start_of_fade_out(h5, songidx=row) #timestamp features #take last element and divide by length to get beats/unit time, segments/unit_time bars_start = hdf5_getters.get_bars_start(h5, songidx=row) beats_start = hdf5_getters.get_beats_start(h5, songidx=row) sections_start = hdf5_getters.get_sections_start(h5, songidx=row) tatums_start = hdf5_getters.get_tatums_start(h5, songidx=row)
def func_to_extract_features(filename): """ This function does 3 simple things: - open the song file - get artist ID and put it - close the file """ global cntnan global listfeatures cf = [] h5 = GETTERS.open_h5_file_read(filename) nanfound = 0 #Get target feature: song hotness #FEATURE 0 song_hotness = GETTERS.get_song_hotttnesss(h5) if math.isnan(song_hotness): nanfound = 1 cntnan = cntnan + 1 h5.close() return 0 elif song_hotness > 0.3 and song_hotness < 0.6: h5.close() return 0 else: if song_hotness <= 0.3: hotness_class = 0 elif song_hotness >= 0.6: hotness_class = 1 cf.append(hotness_class) #FEATURE 1 #Get song loudness song_loudness = GETTERS.get_loudness(h5) if math.isnan(song_loudness): nanfound = 1 cntnan = cntnan + 1 else: #cf.append(song_loudness) pass #FEATURE 2 #Get key of the song song_key = GETTERS.get_key(h5) if math.isnan(song_key): nanfound = 1 cntnan = cntnan + 1 else: # cf.append(song_key) pass #FEATURE 3 song_duration = GETTERS.get_duration(h5) if math.isnan(song_duration): nanfound = 1 cntnan = cntnan + 1 else: # cf.append(song_duration) pass #Feature 4 #Get song tempo song_tempo = GETTERS.get_tempo(h5) if math.isnan(song_tempo): nanfound = 1 cntnan = cntnan + 1 else: # cf.append(song_tempo) pass #Feature 5: artist familarity artist_familiarity = GETTERS.get_artist_familiarity(h5) if math.isnan(artist_familiarity): nanfound = 1 cntnan = cntnan + 1 else: # cf.append(artist_familiarity) pass #Feature 6: artist_hotness artist_hotness = GETTERS.get_artist_hotttnesss(h5) if math.isnan(artist_hotness): nanfound = 1 cntnan = cntnan + 1 else: # cf.append(artist_hotness) pass #Feature 7 time signature time_signature = GETTERS.get_time_signature(h5) # cf.append(time_signature) #Feature 8 #Loudness COV loudness_segments = np.array(GETTERS.get_segments_loudness_max(h5)) loudness_cov = abs(variation(loudness_segments)) if math.isnan(loudness_cov): nanfound = 1 cntnan = cntnan + 1 else: # cf.append(loudness_cov) pass #Feature 9 #Beat COV beat_segments = np.array(GETTERS.get_beats_start(h5)) beat_cov = abs(variation(beat_segments)) if math.isnan(beat_cov): nanfound = 1 cntnan = cntnan + 1 else: # cf.append(beat_cov) pass #Feature 10 #Year song_year = GETTERS.get_year(h5) if song_year == 0: nanfound = 1 cntnan = cntnan + 1 else: # cf.append(song_year) pass title = GETTERS.get_title(h5) if title in energydict: audio_summary = energydict[title] energy = audio_summary['energy'] danceability = audio_summary['danceability'] speechiness = audio_summary['speechiness'] liveness = audio_summary['liveness'] else: stitle = re.sub(r'\([^)]*\)','', title) if stitle in energydict: audio_summary = energydict[stitle] energy = audio_summary['energy'] danceability = audio_summary['danceability'] speechiness = audio_summary['speechiness'] liveness = audio_summary['liveness'] else: energy = 0.0 danceability = 0.0 speechiness = 0.0 liveness = 0.0 # Feature 11 cf.append(energy) # Feature 12 # cf.append(danceability) # Feature 13 # cf.append(speechiness) # Feature 14 # cf.append(liveness) if nanfound == 0: strlist = list_to_csv(cf) listfeatures.append(strlist) h5.close()
with open("fields.csv", "wb") as f: writer = csv.writer(f) # initialize the csv writer # for each track in the summary file, get the 11 fields and output to csv h5_file = hdf5_getters.open_h5_file_read("msd_summary_file.h5") for k in range(1000000): print "index!!!: ", k id = hdf5_getters.get_track_id(h5_file, k) # get track_id TRA13e39.. title = hdf5_getters.get_title(h5_file, k) # get song title artist_name = hdf5_getters.get_artist_name(h5_file, k) year = int(hdf5_getters.get_year(h5_file, k)) hotness = float(hdf5_getters.get_song_hotttnesss(h5_file, k)) artist_familiarity = float(hdf5_getters.get_artist_familiarity(h5_file, k)) f5 = int(hdf5_getters.get_key(h5_file, k)) # get key f2 = float(hdf5_getters.get_loudness(h5_file, k)) # get loudness f1 = float(hdf5_getters.get_tempo(h5_file, k)) # get tempo f4 = int(hdf5_getters.get_duration(h5_file, k)) # get duration f3 = float(hdf5_getters.get_time_signature(h5_file, k)) # get time signature # Get rid of missing info and change invalid numbers for meta data if not artist_name: artist_name = "unknown" if not artist_familiarity: artist_familiarity = 0.0 if not hotness: hotness = 0.0
def data_to_flat_file(basedir,ext='.h5') : """This function extract the information from the tables and creates the flat file.""" count = 0; #song counter list_to_write= [] row_to_write = "" writer = csv.writer(open("metadata_wholeA.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: print f #the name of the file h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title= title.replace('"','') comma=title.find(',') #eliminating commas in the title if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album= album.replace('"','') #eliminating commas in the album comma=album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma=artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name= artist_name.replace('"','') #eliminating double quotes duration = hdf5_getters.get_duration(h5) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam=-1 artist_hotness= hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness=-1 artist_id = hdf5_getters.get_artist_id(h5) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat=-1 artist_loc = hdf5_getters.get_artist_location(h5) #checks artist_loc to see if it is a hyperlink if it is set as empty string artist_loc = artist_loc.replace(",", "\,"); if artist_loc.startswith("<a"): artist_loc = "" if len(artist_loc) > 100: artist_loc = "" artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon=-1 artist_mbid = hdf5_getters.get_artist_mbid(h5) artist_pmid = hdf5_getters.get_artist_playmeid(h5) audio_md5 = hdf5_getters.get_audio_md5(h5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability=-1 end_fade_in =hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in=-1 energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy=-1 song_key = hdf5_getters.get_key(h5) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c=-1 loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness=-1 mode = hdf5_getters.get_mode(h5) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf=-1 release_7digitalid = hdf5_getters.get_release_7digitalid(h5) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot=-1 song_id = hdf5_getters.get_song_id(h5) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo=-1 time_sig = hdf5_getters.get_time_signature(h5) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c=-1 track_id = hdf5_getters.get_track_id(h5) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) year = hdf5_getters.get_year(h5) bars_c = hdf5_getters.get_bars_confidence(h5) bars_c_avg= get_avg(bars_c) bars_c_max= get_max(bars_c) bars_c_min = get_min(bars_c) bars_c_stddev= get_stddev(bars_c) bars_c_count = get_count(bars_c) bars_c_sum = get_sum(bars_c) bars_start = hdf5_getters.get_bars_start(h5) bars_start_avg = get_avg(bars_start) bars_start_max= get_max(bars_start) bars_start_min = get_min(bars_start) bars_start_stddev= get_stddev(bars_start) bars_start_count = get_count(bars_start) bars_start_sum = get_sum(bars_start) beats_c = hdf5_getters.get_beats_confidence(h5) beats_c_avg= get_avg(beats_c) beats_c_max= get_max(beats_c) beats_c_min = get_min(beats_c) beats_c_stddev= get_stddev(beats_c) beats_c_count = get_count(beats_c) beats_c_sum = get_sum(beats_c) beats_start = hdf5_getters.get_beats_start(h5) beats_start_avg = get_avg(beats_start) beats_start_max= get_max(beats_start) beats_start_min = get_min(beats_start) beats_start_stddev= get_stddev(beats_start) beats_start_count = get_count(beats_start) beats_start_sum = get_sum(beats_start) sec_c = hdf5_getters.get_sections_confidence(h5) sec_c_avg= get_avg(sec_c) sec_c_max= get_max(sec_c) sec_c_min = get_min(sec_c) sec_c_stddev= get_stddev(sec_c) sec_c_count = get_count(sec_c) sec_c_sum = get_sum(sec_c) sec_start = hdf5_getters.get_sections_start(h5) sec_start_avg = get_avg(sec_start) sec_start_max= get_max(sec_start) sec_start_min = get_min(sec_start) sec_start_stddev= get_stddev(sec_start) sec_start_count = get_count(sec_start) sec_start_sum = get_sum(sec_start) seg_c = hdf5_getters.get_segments_confidence(h5) seg_c_avg= get_avg(seg_c) seg_c_max= get_max(seg_c) seg_c_min = get_min(seg_c) seg_c_stddev= get_stddev(seg_c) seg_c_count = get_count(seg_c) seg_c_sum = get_sum(seg_c) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) seg_loud_max_avg= get_avg(seg_loud_max) seg_loud_max_max= get_max(seg_loud_max) seg_loud_max_min = get_min(seg_loud_max) seg_loud_max_stddev= get_stddev(seg_loud_max) seg_loud_max_count = get_count(seg_loud_max) seg_loud_max_sum = get_sum(seg_loud_max) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) seg_loud_max_time_avg= get_avg(seg_loud_max_time) seg_loud_max_time_max= get_max(seg_loud_max_time) seg_loud_max_time_min = get_min(seg_loud_max_time) seg_loud_max_time_stddev= get_stddev(seg_loud_max_time) seg_loud_max_time_count = get_count(seg_loud_max_time) seg_loud_max_time_sum = get_sum(seg_loud_max_time) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) seg_loud_start_avg= get_avg(seg_loud_start) seg_loud_start_max= get_max(seg_loud_start) seg_loud_start_min = get_min(seg_loud_start) seg_loud_start_stddev= get_stddev(seg_loud_start) seg_loud_start_count = get_count(seg_loud_start) seg_loud_start_sum = get_sum(seg_loud_start) seg_pitch = hdf5_getters.get_segments_pitches(h5) pitch_size = len(seg_pitch) seg_start = hdf5_getters.get_segments_start(h5) seg_start_avg= get_avg(seg_start) seg_start_max= get_max(seg_start) seg_start_min = get_min(seg_start) seg_start_stddev= get_stddev(seg_start) seg_start_count = get_count(seg_start) seg_start_sum = get_sum(seg_start) seg_timbre = hdf5_getters.get_segments_timbre(h5) tatms_c = hdf5_getters.get_tatums_confidence(h5) tatms_c_avg= get_avg(tatms_c) tatms_c_max= get_max(tatms_c) tatms_c_min = get_min(tatms_c) tatms_c_stddev= get_stddev(tatms_c) tatms_c_count = get_count(tatms_c) tatms_c_sum = get_sum(tatms_c) tatms_start = hdf5_getters.get_tatums_start(h5) tatms_start_avg= get_avg(tatms_start) tatms_start_max= get_max(tatms_start) tatms_start_min = get_min(tatms_start) tatms_start_stddev= get_stddev(tatms_start) tatms_start_count = get_count(tatms_start) tatms_start_sum = get_sum(tatms_start) #Getting the genres genre_set = 0 #flag to see if the genre has been set or not art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq final_genre=[] genres_so_far=[] for i in range(len(genre_indexes)): genre_tmp=get_genre(art_trm,genre_indexes[i]) #genre that corresponds to the highest freq genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set=1 #genre was found in dictionary if genre_set == 1: col_num=[] for genre in final_genre: column=int(genre) #getting the column number of the genre col_num.append(column) genre_array=genre_columns(col_num) #genre array else: genre_array=genre_columns(-1) #the genre was not found in the dictionary transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg=[] seg_pitch_max=[] seg_pitch_min=[] seg_pitch_stddev=[] seg_pitch_count=[] seg_pitch_sum=[] i=0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i=i+1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg=[] seg_timbre_max=[] seg_timbre_min=[] seg_timbre_stddev=[] seg_timbre_count=[] seg_timbre_sum=[] i=0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i=i+1 #Writing to the flat file writer.writerow([title,album,artist_name,year,duration,seg_start_count, tempo]) h5.close() count=count+1; print count;
def func_to_extract_features(filename): """ This function does 3 simple things: - open the song file - get artist ID and put it - close the file """ global cntnan global listfeatures cf = [] h5 = GETTERS.open_h5_file_read(filename) nanfound = 0 #Get target feature: song hotness #FEATURE 0 song_hotness = GETTERS.get_song_hotttnesss(h5) if math.isnan(song_hotness): nanfound = 1 cntnan = cntnan + 1 else: if song_hotness <= 0.2: song_hotness_class = 0 elif song_hotness <= 0.4: song_hotness_class = 1 elif song_hotness <= 0.6: song_hotness_class = 2 elif song_hotness <= 0.8: song_hotness_class = 3 else: song_hotness_class = 4 cf.append(song_hotness_class) #FEATURE 1 #Get song loudness song_loudness = GETTERS.get_loudness(h5) if math.isnan(song_loudness): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_loudness) #FEATURE 2 #Get song year song_year = GETTERS.get_year(h5) if song_year == 0: nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_year) #FEATURE 3 #Get song tempo song_tempo = GETTERS.get_tempo(h5) cf.append(song_tempo) #Feature 4 #Artist familarity artist_familiarity = GETTERS.get_artist_familiarity(h5) cf.append(artist_familiarity) #Feature 5 artist_hotness = GETTERS.get_artist_hotttnesss(h5) if math.isnan(artist_hotness): nanfound = 1 cntnan = cntnan + 1 else: cf.append(artist_hotness) if nanfound == 0: strlist = list_to_csv(cf) listfeatures.append(strlist) h5.close()
def get_all_data(dir,ext,outDir): for root, dirs, files in os.walk(dir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: h5 = get.open_h5_file_read(f) keys = {} keys['trackId'] = get.get_track_id(h5) #keys['analysisSampleRate'] = get.get_analysis_sample_rate(h5) #keys['artistMbTags'] = arrToStr(get.get_artist_mbtags(h5)) #keys['artistMbId'] = get.get_artist_mbid(h5) #keys['artistName'] = get.get_artist_name(h5) #keys['artistPlayMeId'] = get.get_artist_playmeid(h5) #keys['artistTermsWeightArr'] = arrToStr(get.get_artist_terms_weight(h5)) #array #keys['artistTermsFreqArr'] = arrToStr(get.get_artist_terms_freq(h5)) #array #keys['artistTermsArr'] = arrToStr(get.get_artist_terms(h5)) # array""" #keys['audioMd5'] = get.get_audio_md5(h5) #keys['barsConfidence'] = arrToStr(get.get_bars_confidence(h5)) # #keys['barsStart'] = arrToStr(get.get_bars_start(h5)) # #keys['beatsConfidence'] = arrToStr(get.get_beats_confidence(h5)) # #keys['beatsStart']= arrToStr(get.get_beats_start(h5)) # #keys['danceability'] = get.get_danceability(h5) #keys['duration'] = get.get_duration(h5) #keys['endofFadeIn'] = get.get_end_of_fade_in(h5) #keys['energy'] = get.get_energy(h5) #keys['key'] = get.get_key(h5) keys['loudness'] = get.get_loudness(h5) #keys['keyConfidence'] = get.get_key_confidence(h5) #keys['mode'] = get.get_mode(h5) #keys['release'] = get.get_release(h5) #keys['release7DigitalId'] = get.get_release_7digitalid(h5) #keys['modeConfidence'] = get.get_mode_confidence(h5) #keys['sectionsConfidenceArr'] = arrToStr(get.get_sections_confidence(h5)) # #keys['sectionsStartArr'] = arrToStr(get.get_sections_start(h5)) #Segments data begin keys['segmentsStartArr'] = arrToStr(get.get_segments_start(h5)) # #keys['segmentsConfidenceArr'] = arrToStr(get.get_segments_confidence(h5)) # #keys['segmentsLoudnessMaxArr'] = arrToStr(get.get_segments_loudness_max_time(h5)) # #keys['segmentsLoudnessMaxTimeArr'] = arrToStr(get.get_segments_loudness_max_time(h5)) # #keys['segmentsLoudnessStartArr'] = arrToStr(get.get_segments_loudness_start(h5)) # keys['segmentsPitchesArr'] = arrToStr((np.array(get.get_segments_pitches(h5))).flatten()) # keys['segmentsTimbreArr'] = arrToStr((np.array(get.get_segments_timbre(h5))).flatten()) #Timbre data #Segments data End #keys['similarArtists'] = arrToStr(get.get_similar_artists(h5)) #keys['songHotness'] = get.get_song_hotttnesss(h5) keys['songId'] = get.get_song_id(h5) #keys['startOfFadeOut'] = get.get_start_of_fade_out(h5) #keys['tatumsConfidence'] = arrToStr(get.get_tatums_confidence(h5)) # #keys['tatumsStart'] = arrToStr(get.get_tatums_start(h5)) # #keys['tempo'] = get.get_tempo(h5) #keys['timeSignature'] = get.get_time_signature(h5) #keys['timeSignatureConfidence'] = get.get_time_signature_confidence(h5) keys['title'] = get.get_title(h5) #keys['track7DigitalId'] = get.get_track_7digitalid(h5) #keys['year'] = get.get_year(h5) with open(outDir,'a') as f: for key in ordering: f.write('{0}\t'.format(keys[key])) f.write('\n') """ for key in keys: print key print "---> " print keys[key] """ h5.close()
def main(): dataset_dir = sys.argv[1] global feat Create_BoW(dataset_dir) Size_BoW = Index_BoW(Bag_Words) count = Frequency(Size_BoW, dataset_dir) Size_BoW = Prune(count) Lablify() print "Forming Dataset..." listing1 = os.listdir(dataset_dir) for a in listing1: listing2 = os.listdir(dataset_dir+a+'/') for b in listing2: listing3 = os.listdir(dataset_dir+a+'/'+b+'/') for c in listing3: listing4 = os.listdir(dataset_dir+a+'/'+b+'/'+c+'/') for d in listing4: h5 = hdf5_getters.open_h5_file_read(dataset_dir+a+'/'+b+'/'+c+'/'+d) feat = [] temp = hdf5_getters.get_artist_hotttnesss(h5) if (math.isnan(temp) or temp==0.0): h5.close() continue feat.append(temp) temp = hdf5_getters.get_artist_familiarity(h5) if (math.isnan(temp) or temp==0.0): h5.close() continue feat.append(temp) temp = hdf5_getters.get_bars_confidence(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_beats_confidence(h5) if temp.size == 0: h5.close() continue mm = np.mean(temp) vv = np.var(temp) if mm==0.0 and vv==0.0: h5.close() continue feat.append(mm) feat.append(vv) feat.append(hdf5_getters.get_duration(h5)) temp = hdf5_getters.get_end_of_fade_in(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) feat.append(hdf5_getters.get_key(h5)) temp = hdf5_getters.get_key_confidence(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) temp = hdf5_getters.get_loudness(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) feat.append(hdf5_getters.get_mode(h5)) temp = hdf5_getters.get_mode_confidence(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) temp = hdf5_getters.get_sections_confidence(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_segments_confidence(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_segments_loudness_max(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_segments_loudness_max_time(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_segments_pitches(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_segments_timbre(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_start_of_fade_out(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) temp = hdf5_getters.get_tatums_confidence(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_tempo(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) feat.append(hdf5_getters.get_time_signature(h5)) temp = hdf5_getters.get_time_signature_confidence(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) temp = hdf5_getters.get_year(h5) if temp == 0: h5.close() continue feat.append(temp) temp = hdf5_getters.get_artist_terms(h5) if temp.size == 0: h5.close() continue temp_ = hdf5_getters.get_artist_terms_weight(h5) if temp_.size == 0: continue for j in Final_BoW: if j in temp: x = np.where(temp==j) x = x[0][0] feat.append(temp_[x]) else: x = 0.0 feat.append(x) temp = hdf5_getters.get_song_hotttnesss(h5) if (math.isnan(temp) or temp==0.0): h5.close() continue hott = 0 if temp >=0.75: hott = 1 elif temp >=0.40 and temp <0.75: hott = 2 else: hott = 3 feat.append(hott) h5.close() count = 1 f=open('MSD_DATASET.txt', 'a') outstring='' cnt = 0 feat_size = len(feat) for i in feat: cnt+=1 outstring+=str(i) if (cnt!=feat_size): outstring+=',' outstring+='\n' f.write(outstring) f.close()
def main(): outputFile1 = open('SongCSVFinal.csv', 'w') csvRowString = "" ################################################# #if you want to prompt the user for the order of attributes in the csv, #leave the prompt boolean set to True #else, set 'prompt' to False and set the order of attributes in the 'else' #clause prompt = False ################################################# if prompt == True: while prompt: prompt = False csvAttributeString = raw_input( "\n\nIn what order would you like the colums of the CSV file?\n" + "Please delineate with commas. The options are: " + "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude," + " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," + " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" + "For example, you may write \"Title, Tempo, Duration\"...\n\n" + "...or exit by typing 'exit'.\n\n") csvAttributeList = re.split('\W+', csvAttributeString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += 'AlbumID' elif attribute == 'AlbumName'.lower(): csvRowString += 'AlbumName' elif attribute == 'ArtistID'.lower(): csvRowString += 'ArtistID' elif attribute == 'ArtistLatitude'.lower(): csvRowString += 'ArtistLatitude' elif attribute == 'ArtistLocation'.lower(): csvRowString += 'ArtistLocation' elif attribute == 'ArtistLongitude'.lower(): csvRowString += 'ArtistLongitude' elif attribute == 'ArtistName'.lower(): csvRowString += 'ArtistName' elif attribute == 'Danceability'.lower(): csvRowString += 'Danceability' elif attribute == 'Duration'.lower(): csvRowString += 'Duration' elif attribute == 'KeySignature'.lower(): csvRowString += 'KeySignature' elif attribute == 'KeySignatureConfidence'.lower(): csvRowString += 'KeySignatureConfidence' elif attribute == 'SongID'.lower(): csvRowString += "SongID" elif attribute == 'Tempo'.lower(): csvRowString += 'Tempo' elif attribute == 'TimeSignature'.lower(): csvRowString += 'TimeSignature' elif attribute == 'TimeSignatureConfidence'.lower(): csvRowString += 'TimeSignatureConfidence' elif attribute == 'Title'.lower(): csvRowString += 'Title' elif attribute == 'Year'.lower(): csvRowString += 'Year' elif attribute == 'Exit'.lower(): sys.exit() else: prompt = True print "==============" print "I believe there has been an error with the input." print "==============" break csvRowString += "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex - 1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" #else, if you want to hard code the order of the csv file and not prompt #the user, else: ################################################# #change the order of the csv file here #Default is to list all available attributes (in alphabetical order) csvRowString = ( "SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation," + "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature," + "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence," + "Title,Year,mbID,Energy,ArtistFamiliarity,Hotness,end_of_fade_in,key,keyConfidence,Loudness," + "mode,mode_confidence,start_of_fade_out") ################################################# csvAttributeList = re.split('\W+', csvRowString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() outputFile1.write("SongNumber,") outputFile1.write(csvRowString + "\n") csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate basedir = "." # "." As the default means the current directory ext = ".h5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #files = glob.glob(os.path.join(root,'*'+ext)) #FOR LOOP for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) print(files) for f in files: songH5File = hdf5_getters.open_h5_file_read(f) print('hello 1') print(songH5File) song = Song(str(hdf5_getters.get_song_id(songH5File))) testDanceability = hdf5_getters.get_danceability(songH5File) # print type(testDanceability) # print ("Here is the danceability: ") + str(testDanceability) song.artistID = str(hdf5_getters.get_artist_id(songH5File)) song.albumID = str(hdf5_getters.get_release_7digitalid(songH5File)) song.albumName = str(hdf5_getters.get_release(songH5File)) song.artistLatitude = str( hdf5_getters.get_artist_latitude(songH5File)) song.artistLocation = str( hdf5_getters.get_artist_location(songH5File)) song.artistLongitude = str( hdf5_getters.get_artist_longitude(songH5File)) song.artistName = str(hdf5_getters.get_artist_name(songH5File)) song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.duration = str(hdf5_getters.get_duration(songH5File)) # song.setGenreList() song.keySignature = str(hdf5_getters.get_key(songH5File)) song.keySignatureConfidence = str( hdf5_getters.get_key_confidence(songH5File)) # song.lyrics = None # song.popularity = None song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.timeSignature = str( hdf5_getters.get_time_signature(songH5File)) song.timeSignatureConfidence = str( hdf5_getters.get_time_signature_confidence(songH5File)) song.title = str(hdf5_getters.get_title(songH5File)) song.year = str(hdf5_getters.get_year(songH5File)) song.mbID = str(hdf5_getters.get_artist_mbid(songH5File)) song.energy = str(hdf5_getters.get_energy(songH5File)) #song.beatConfidence = str(hdf5_getters.get_beats_confidence(songH5File)) song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.loudness = str(hdf5_getters.get_loudness(songH5File)) song.key = str(hdf5_getters.get_key(songH5File)) song.artistFamiliarity = str( hdf5_getters.get_artist_familiarity(songH5File)) song.keyConfidence = str( hdf5_getters.get_key_confidence(songH5File)) song.hotness = str(hdf5_getters.get_artist_hotttnesss(songH5File)) #song.sampleRate= str(hdf5_getters.get_analysis_sample_rate(songH5File)) song.end_of_fade_in = str( hdf5_getters.get_end_of_fade_in(songH5File)) song.mode = str(hdf5_getters.get_mode(songH5File)) song.mode_confidence = str( hdf5_getters.get_mode_confidence(songH5File)) song.start_of_fade_out = str( hdf5_getters.get_start_of_fade_out(songH5File)) #print song count csvRowString += str(song.songCount) + "," for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += song.albumID elif attribute == 'AlbumName'.lower(): albumName = song.albumName albumName = albumName.replace(',', "") csvRowString += "\"" + albumName + "\"" elif attribute == 'ArtistID'.lower(): csvRowString += "\"" + song.artistID + "\"" elif attribute == 'ArtistLatitude'.lower(): latitude = song.artistLatitude if latitude == 'nan': latitude = '' csvRowString += latitude elif attribute == 'ArtistLocation'.lower(): location = song.artistLocation location = location.replace(',', '') csvRowString += "\"" + location + "\"" elif attribute == 'ArtistLongitude'.lower(): longitude = song.artistLongitude if longitude == 'nan': longitude = '' csvRowString += longitude elif attribute == 'ArtistName'.lower(): csvRowString += "\"" + song.artistName + "\"" elif attribute == 'Danceability'.lower(): csvRowString += song.danceability elif attribute == 'Duration'.lower(): csvRowString += song.duration elif attribute == 'KeySignature'.lower(): csvRowString += song.keySignature elif attribute == 'KeySignatureConfidence'.lower(): # print "key sig conf: " + song.timeSignatureConfidence csvRowString += song.keySignatureConfidence elif attribute == 'SongID'.lower(): csvRowString += "\"" + song.id + "\"" elif attribute == 'Tempo'.lower(): # print "Tempo: " + song.tempo csvRowString += song.tempo elif attribute == 'TimeSignature'.lower(): csvRowString += song.timeSignature elif attribute == 'TimeSignatureConfidence'.lower(): # print "time sig conf: " + song.timeSignatureConfidence csvRowString += song.timeSignatureConfidence elif attribute == 'Title'.lower(): csvRowString += "\"" + song.title + "\"" elif attribute == 'Year'.lower(): csvRowString += song.year elif attribute == 'mbID'.lower(): csvRowString += song.mbID elif attribute == 'Energy'.lower(): csvRowString += song.energy elif attribute == 'ArtistFamiliarity'.lower(): csvRowString += song.artistFamiliarity elif attribute == 'Hotness'.lower(): csvRowString += song.hotness #elif attribute == 'SampleRate'.lower(): #csvRowString += song.sampleRate elif attribute == 'end_of_fade_in'.lower(): csvRowString += song.end_of_fade_in elif attribute == 'key'.lower(): csvRowString += song.key elif attribute == 'keyConfidence'.lower(): csvRowString += song.keyConfidence elif attribute == 'Loudness'.lower(): csvRowString += song.loudness elif attribute == 'mode'.lower(): csvRowString += song.mode elif attribute == 'mode_confidence'.lower(): csvRowString += song.mode_confidence elif attribute == 'start_of_fade_out'.lower(): csvRowString += song.start_of_fade_out else: csvRowString += "Erm. This didn't work. Error. :( :(\n" csvRowString += "," #Remove the final comma from each row in the csv lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex - 1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" songH5File.close() outputFile1.close()
def data_to_flat_file(basedir,ext='.h5') : """ This function extracts the information from the tables and creates the flat file. """ count = 0; #song counter list_to_write= [] group_index=0 row_to_write = "" writer = csv.writer(open("complete.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: row=[] print f h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title= title.replace('"','') row.append(title) comma=title.find(',') if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album= album.replace('"','') row.append(album) comma=album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma=artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name= artist_name.replace('"','') row.append(artist_name) duration = hdf5_getters.get_duration(h5) row.append(duration) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) row.append(samp_rt) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) row.append(artist_7digitalid) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam=-1 row.append(artist_fam) artist_hotness= hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness=-1 row.append(artist_hotness) artist_id = hdf5_getters.get_artist_id(h5) row.append(artist_id) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat=-1 row.append(artist_lat) artist_loc = hdf5_getters.get_artist_location(h5) row.append(artist_loc) artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon=-1 row.append(artist_lon) artist_mbid = hdf5_getters.get_artist_mbid(h5) row.append(artist_mbid) #Getting the genre art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq genre_set=0 #flag to see if the genre has been set or not final_genre=[] genres_so_far=[] for i in range(len(genre_indexes)): genre_tmp=get_genre(art_trm,genre_indexes[i]) #genre that corresponds to the highest freq genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set=1 if genre_set == 1: col_num=[] for i in final_genre: column=int(i) #getting the column number of the genre col_num.append(column) genre_array=genre_columns(col_num) #genre array for i in range(len(genre_array)): #appending the genre_array to the row row.append(genre_array[i]) else: genre_array=genre_columns(-1) #when there is no genre matched, return an array of [0...0] for i in range(len(genre_array)): #appending the genre_array to the row row.append(genre_array[i]) artist_pmid = hdf5_getters.get_artist_playmeid(h5) row.append(artist_pmid) audio_md5 = hdf5_getters.get_audio_md5(h5) row.append(audio_md5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability=-1 row.append(danceability) end_fade_in =hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in=-1 row.append(end_fade_in) energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy=-1 row.append(energy) song_key = hdf5_getters.get_key(h5) row.append(song_key) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c=-1 row.append(key_c) loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness=-1 row.append(loudness) mode = hdf5_getters.get_mode(h5) row.append(mode) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf=-1 row.append(mode_conf) release_7digitalid = hdf5_getters.get_release_7digitalid(h5) row.append(release_7digitalid) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot=-1 row.append(song_hot) song_id = hdf5_getters.get_song_id(h5) row.append(song_id) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) row.append(start_fade_out) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo=-1 row.append(tempo) time_sig = hdf5_getters.get_time_signature(h5) row.append(time_sig) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c=-1 row.append(time_sig_c) track_id = hdf5_getters.get_track_id(h5) row.append(track_id) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) row.append(track_7digitalid) year = hdf5_getters.get_year(h5) row.append(year) bars_c = hdf5_getters.get_bars_confidence(h5) bars_start = hdf5_getters.get_bars_start(h5) row_bars_padding=padding(245) #this is the array that will be attached at the end of th row #--------------bars---------------" gral_info=[] gral_info=row[:] empty=[] for i,item in enumerate(bars_c): row.append(group_index) row.append(i) row.append(bars_c[i]) bars_c_avg= get_avg(bars_c) row.append(bars_c_avg) bars_c_max= get_max(bars_c) row.append(bars_c_max) bars_c_min = get_min(bars_c) row.append(bars_c_min) bars_c_stddev= get_stddev(bars_c) row.append(bars_c_stddev) bars_c_count = get_count(bars_c) row.append(bars_c_count) bars_c_sum = get_sum(bars_c) row.append(bars_c_sum) row.append(bars_start[i]) bars_start_avg = get_avg(bars_start) row.append(bars_start_avg) bars_start_max= get_max(bars_start) row.append(bars_start_max) bars_start_min = get_min(bars_start) row.append(bars_start_min) bars_start_stddev= get_stddev(bars_start) row.append(bars_start_stddev) bars_start_count = get_count(bars_start) row.append(bars_start_count) bars_start_sum = get_sum(bars_start) row.append(bars_start_sum) for i in row_bars_padding: row.append(i) writer.writerow(row) row=[] row=gral_info[:] #--------beats---------------" beats_c = hdf5_getters.get_beats_confidence(h5) group_index=1 row=[] row=gral_info[:] row_front=padding(14) #blanks left in front of the row(empty spaces for bars) row_beats_padding=padding(231) for i,item in enumerate(beats_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the beats row.append(index) row.append(beats_c[i]) beats_c_avg= get_avg(beats_c) row.append(beats_c_avg) beats_c_max= get_max(beats_c) row.append(beats_c_max) beats_c_min = get_min(beats_c) row.append(beats_c_min) beats_c_stddev= get_stddev(beats_c) row.append(beats_c_stddev) beats_c_count = get_count(beats_c) row.append(beats_c_count) beats_c_sum = get_sum(beats_c) row.append(beats_c_sum) beats_start = hdf5_getters.get_beats_start(h5) row.append(beats_start[i]) beats_start_avg = get_avg(beats_start) row.append(beats_start_avg) beats_start_max= get_max(beats_start) row.append(beats_start_max) beats_start_min = get_min(beats_start) row.append(beats_start_min) beats_start_stddev= get_stddev(beats_start) row.append(beats_start_stddev) beats_start_count = get_count(beats_start) row.append(beats_start_count) beats_start_sum = get_sum(beats_start) row.append(beats_start_sum) for i in row_beats_padding: row.append(i) writer.writerow(row) row=[] row=gral_info[:] # "--------sections---------------" row_sec_padding=padding(217) #blank spaces left at the end of the row sec_c = hdf5_getters.get_sections_confidence(h5) group_index=2 row=[] row=gral_info[:] row_front=padding(28) #blank spaces left in front(empty spaces for bars,beats) for i,item in enumerate(sec_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the sections row.append(index) row.append(sec_c[i]) sec_c_avg= get_avg(sec_c) row.append(sec_c_avg) sec_c_max= get_max(sec_c) row.append(sec_c_max) sec_c_min = get_min(sec_c) row.append(sec_c_min) sec_c_stddev= get_stddev(sec_c) row.append(sec_c_stddev) sec_c_count = get_count(sec_c) row.append(sec_c_count) sec_c_sum = get_sum(sec_c) row.append(sec_c_sum) sec_start = hdf5_getters.get_sections_start(h5) row.append(sec_start[i]) sec_start_avg = get_avg(sec_start) row.append(sec_start_avg) sec_start_max= get_max(sec_start) row.append(sec_start_max) sec_start_min = get_min(sec_start) row.append(sec_start_min) sec_start_stddev= get_stddev(sec_start) row.append(sec_start_stddev) sec_start_count = get_count(sec_start) row.append(sec_start_count) sec_start_sum = get_sum(sec_start) row.append(sec_start_sum) for i in row_sec_padding: #appending the blank spaces at the end of the row row.append(i) writer.writerow(row) row=[] row=gral_info[:] #--------segments---------------" row_seg_padding=padding(182) #blank spaces at the end of the row row_front=padding(42) #blank spaces left in front of segments seg_c = hdf5_getters.get_segments_confidence(h5) group_index=3 row=[] row=gral_info[:] for i,item in enumerate(seg_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the segments row.append(index) row.append(seg_c[i]) seg_c_avg= get_avg(seg_c) row.append(seg_c_avg) seg_c_max= get_max(seg_c) row.append(seg_c_max) seg_c_min = get_min(seg_c) row.append(seg_c_min) seg_c_stddev= get_stddev(seg_c) row.append(seg_c_stddev) seg_c_count = get_count(seg_c) row.append(seg_c_count) seg_c_sum = get_sum(seg_c) row.append(seg_c_sum) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) row.append(seg_loud_max[i]) seg_loud_max_avg= get_avg(seg_loud_max) row.append(seg_loud_max_avg) seg_loud_max_max= get_max(seg_loud_max) row.append(seg_loud_max_max) seg_loud_max_min = get_min(seg_loud_max) row.append(seg_loud_max_min) seg_loud_max_stddev= get_stddev(seg_loud_max) row.append(seg_loud_max_stddev) seg_loud_max_count = get_count(seg_loud_max) row.append(seg_loud_max_count) seg_loud_max_sum = get_sum(seg_loud_max) row.append(seg_loud_max_sum) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) row.append(seg_loud_max_time[i]) seg_loud_max_time_avg= get_avg(seg_loud_max_time) row.append(seg_loud_max_time_avg) seg_loud_max_time_max= get_max(seg_loud_max_time) row.append(seg_loud_max_time_max) seg_loud_max_time_min = get_min(seg_loud_max_time) row.append(seg_loud_max_time_min) seg_loud_max_time_stddev= get_stddev(seg_loud_max_time) row.append(seg_loud_max_time_stddev) seg_loud_max_time_count = get_count(seg_loud_max_time) row.append(seg_loud_max_time_count) seg_loud_max_time_sum = get_sum(seg_loud_max_time) row.append(seg_loud_max_time_sum) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) row.append(seg_loud_start[i]) seg_loud_start_avg= get_avg(seg_loud_start) row.append(seg_loud_start_avg) seg_loud_start_max= get_max(seg_loud_start) row.append(seg_loud_start_max) seg_loud_start_min = get_min(seg_loud_start) row.append(seg_loud_start_min) seg_loud_start_stddev= get_stddev(seg_loud_start) row.append(seg_loud_start_stddev) seg_loud_start_count = get_count(seg_loud_start) row.append(seg_loud_start_count) seg_loud_start_sum = get_sum(seg_loud_start) row.append(seg_loud_start_sum) seg_start = hdf5_getters.get_segments_start(h5) row.append(seg_start[i]) seg_start_avg= get_avg(seg_start) row.append(seg_start_avg) seg_start_max= get_max(seg_start) row.append(seg_start_max) seg_start_min = get_min(seg_start) row.append(seg_start_min) seg_start_stddev= get_stddev(seg_start) row.append(seg_start_stddev) seg_start_count = get_count(seg_start) row.append(seg_start_count) seg_start_sum = get_sum(seg_start) row.append(seg_start_sum) for i in row_seg_padding: #appending blank spaces at the end of the row row.append(i) writer.writerow(row) row=[] row=gral_info[:] #----------segments pitch and timbre---------------" row_seg2_padding=padding(14) #blank spaces left at the end of the row row_front=padding(77) #blank spaces left at the front of the segments and timbre seg_pitch = hdf5_getters.get_segments_pitches(h5) transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows group_index=4 row=[] row=gral_info[:] for i,item in enumerate(transpose_pitch[0]): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of segments and timbre row.append(index) row.append(transpose_pitch[0][i]) seg_pitch_avg= get_avg(transpose_pitch[0]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[0]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[0]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[0]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[0]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[0]) row.append(seg_pitch_sum) row.append(transpose_pitch[1][i]) seg_pitch_avg= get_avg(transpose_pitch[1]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[1]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[1]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[1]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[1]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[1]) row.append(seg_pitch_sum) row.append(transpose_pitch[2][i]) seg_pitch_avg= get_avg(transpose_pitch[2]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[2]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[2]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[2]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[2]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[2]) row.append(seg_pitch_sum) row.append(transpose_pitch[3][i]) seg_pitch_avg= get_avg(transpose_pitch[3]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[3]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[3]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[3]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[3]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[3]) row.append(seg_pitch_sum) row.append(transpose_pitch[4][i]) seg_pitch_avg= get_avg(transpose_pitch[4]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[4]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[4]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[4]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[4]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[4]) row.append(seg_pitch_sum) row.append(transpose_pitch[5][i]) seg_pitch_avg= get_avg(transpose_pitch[5]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[5]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[5]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[5]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[5]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[5]) row.append(seg_pitch_sum) row.append(transpose_pitch[6][i]) seg_pitch_avg= get_avg(transpose_pitch[6]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[6]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[6]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[6]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[6]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[6]) row.append(seg_pitch_sum) row.append(transpose_pitch[7][i]) seg_pitch_avg= get_avg(transpose_pitch[7]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[7]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[7]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[7]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[7]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[7]) row.append(seg_pitch_sum) row.append(transpose_pitch[8][i]) seg_pitch_avg= get_avg(transpose_pitch[8]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[8]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[8]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[8]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[8]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[8]) row.append(seg_pitch_sum) row.append(transpose_pitch[9][i]) seg_pitch_avg= get_avg(transpose_pitch[9]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[9]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[9]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[9]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[9]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[9]) row.append(seg_pitch_sum) row.append(transpose_pitch[10][i]) seg_pitch_avg= get_avg(transpose_pitch[10]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[10]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[10]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[10]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[10]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[10]) row.append(seg_pitch_sum) row.append(transpose_pitch[11][i]) seg_pitch_avg= get_avg(transpose_pitch[11]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[11]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[11]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[11]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[11]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[11]) row.append(seg_pitch_sum) #timbre arrays seg_timbre = hdf5_getters.get_segments_timbre(h5) transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows row.append(transpose_timbre[0][i]) seg_timbre_avg= get_avg(transpose_timbre[0]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[0]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[0]) row.append(seg_timbre_min) seg_timbre_stddev=get_stddev(transpose_timbre[0]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[0]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[0]) row.append(seg_timbre_sum) row.append(transpose_timbre[1][i]) seg_timbre_avg= get_avg(transpose_timbre[1]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[1]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[1]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[1]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[1]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[1]) row.append(seg_timbre_sum) row.append(transpose_timbre[2][i]) seg_timbre_avg= get_avg(transpose_timbre[2]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[2]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[2]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[2]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[2]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[2]) row.append(seg_timbre_sum) row.append(transpose_timbre[3][i]) seg_timbre_avg= get_avg(transpose_timbre[3]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[3]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[3]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[3]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[3]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[3]) row.append(seg_timbre_sum) row.append(transpose_timbre[4][i]) seg_timbre_avg= get_avg(transpose_timbre[4]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[4]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[4]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[4]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[4]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[4]) row.append(seg_timbre_sum) row.append(transpose_timbre[5][i]) seg_timbre_avg= get_avg(transpose_timbre[5]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[5]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[5]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[5]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[5]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[5]) row.append(seg_timbre_sum) row.append(transpose_timbre[6][i]) seg_timbre_avg= get_avg(transpose_timbre[6]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[6]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[6]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[6]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[6]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[6]) row.append(seg_timbre_sum) row.append(transpose_timbre[7][i]) seg_timbre_avg= get_avg(transpose_timbre[7]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[7]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[7]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[7]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[7]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[7]) row.append(seg_timbre_sum) row.append(transpose_timbre[8][i]) seg_timbre_avg= get_avg(transpose_timbre[8]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[8]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[8]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[8]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[8]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[8]) row.append(seg_timbre_sum) row.append(transpose_timbre[9][i]) seg_timbre_avg= get_avg(transpose_timbre[9]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[9]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[9]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[9]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[9]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[9]) row.append(seg_timbre_sum) row.append(transpose_timbre[10][i]) seg_timbre_avg= get_avg(transpose_timbre[10]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[10]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[10]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[10]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[10]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[10]) row.append(seg_timbre_sum) row.append(transpose_timbre[11][i]) seg_timbre_avg= get_avg(transpose_timbre[11]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[11]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[11]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[11]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[11]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[11]) row.append(seg_timbre_sum) for item in row_seg2_padding: row.append(item) writer.writerow(row) row=[] row=gral_info[:] # "--------tatums---------------" tatms_c = hdf5_getters.get_tatums_confidence(h5) group_index=5 row_front=padding(245) #blank spaces left in front of tatums row=[] row=gral_info[:] for i,item in enumerate(tatms_c): row.append(group_index) row.append(i) for item in row_front: #appending blank spaces at the front of the row row.append(item) row.append(tatms_c[i]) tatms_c_avg= get_avg(tatms_c) row.append(tatms_c_avg) tatms_c_max= get_max(tatms_c) row.append(tatms_c_max) tatms_c_min = get_min(tatms_c) row.append(tatms_c_min) tatms_c_stddev= get_stddev(tatms_c) row.append(tatms_c_stddev) tatms_c_count = get_count(tatms_c) row.append(tatms_c_count) tatms_c_sum = get_sum(tatms_c) row.append(tatms_c_sum) tatms_start = hdf5_getters.get_tatums_start(h5) row.append(tatms_start[i]) tatms_start_avg= get_avg(tatms_start) row.append(tatms_start_avg) tatms_start_max= get_max(tatms_start) row.append(tatms_start_max) tatms_start_min = get_min(tatms_start) row.append(tatms_start_min) tatms_start_stddev= get_stddev(tatms_start) row.append(tatms_start_stddev) tatms_start_count = get_count(tatms_start) row.append(tatms_start_count) tatms_start_sum = get_sum(tatms_start) row.append(tatms_start_sum) writer.writerow(row) row=[] row=gral_info[:] transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg=[] seg_pitch_max=[] seg_pitch_min=[] seg_pitch_stddev=[] seg_pitch_count=[] seg_pitch_sum=[] i=0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i=i+1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg=[] seg_timbre_max=[] seg_timbre_min=[] seg_timbre_stddev=[] seg_timbre_count=[] seg_timbre_sum=[] i=0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i=i+1 h5.close() count=count+1; print count;
def data_to_flat_file(basedir, ext='.h5'): """This function extract the information from the tables and creates the flat file.""" count = 0 #song counter list_to_write = [] row_to_write = "" writer = csv.writer(open("metadata_wholeA.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: print f #the name of the file h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title = title.replace('"', '') comma = title.find(',') #eliminating commas in the title if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album = album.replace('"', '') #eliminating commas in the album comma = album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma = artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name = artist_name.replace('"', '') #eliminating double quotes duration = hdf5_getters.get_duration(h5) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam = -1 artist_hotness = hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness = -1 artist_id = hdf5_getters.get_artist_id(h5) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat = -1 artist_loc = hdf5_getters.get_artist_location(h5) #checks artist_loc to see if it is a hyperlink if it is set as empty string artist_loc = artist_loc.replace(",", "\,") if artist_loc.startswith("<a"): artist_loc = "" if len(artist_loc) > 100: artist_loc = "" artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon = -1 artist_mbid = hdf5_getters.get_artist_mbid(h5) artist_pmid = hdf5_getters.get_artist_playmeid(h5) audio_md5 = hdf5_getters.get_audio_md5(h5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability = -1 end_fade_in = hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in = -1 energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy = -1 song_key = hdf5_getters.get_key(h5) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c = -1 loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness = -1 mode = hdf5_getters.get_mode(h5) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf = -1 release_7digitalid = hdf5_getters.get_release_7digitalid(h5) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot = -1 song_id = hdf5_getters.get_song_id(h5) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo = -1 time_sig = hdf5_getters.get_time_signature(h5) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c = -1 track_id = hdf5_getters.get_track_id(h5) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) year = hdf5_getters.get_year(h5) bars_c = hdf5_getters.get_bars_confidence(h5) bars_c_avg = get_avg(bars_c) bars_c_max = get_max(bars_c) bars_c_min = get_min(bars_c) bars_c_stddev = get_stddev(bars_c) bars_c_count = get_count(bars_c) bars_c_sum = get_sum(bars_c) bars_start = hdf5_getters.get_bars_start(h5) bars_start_avg = get_avg(bars_start) bars_start_max = get_max(bars_start) bars_start_min = get_min(bars_start) bars_start_stddev = get_stddev(bars_start) bars_start_count = get_count(bars_start) bars_start_sum = get_sum(bars_start) beats_c = hdf5_getters.get_beats_confidence(h5) beats_c_avg = get_avg(beats_c) beats_c_max = get_max(beats_c) beats_c_min = get_min(beats_c) beats_c_stddev = get_stddev(beats_c) beats_c_count = get_count(beats_c) beats_c_sum = get_sum(beats_c) beats_start = hdf5_getters.get_beats_start(h5) beats_start_avg = get_avg(beats_start) beats_start_max = get_max(beats_start) beats_start_min = get_min(beats_start) beats_start_stddev = get_stddev(beats_start) beats_start_count = get_count(beats_start) beats_start_sum = get_sum(beats_start) sec_c = hdf5_getters.get_sections_confidence(h5) sec_c_avg = get_avg(sec_c) sec_c_max = get_max(sec_c) sec_c_min = get_min(sec_c) sec_c_stddev = get_stddev(sec_c) sec_c_count = get_count(sec_c) sec_c_sum = get_sum(sec_c) sec_start = hdf5_getters.get_sections_start(h5) sec_start_avg = get_avg(sec_start) sec_start_max = get_max(sec_start) sec_start_min = get_min(sec_start) sec_start_stddev = get_stddev(sec_start) sec_start_count = get_count(sec_start) sec_start_sum = get_sum(sec_start) seg_c = hdf5_getters.get_segments_confidence(h5) seg_c_avg = get_avg(seg_c) seg_c_max = get_max(seg_c) seg_c_min = get_min(seg_c) seg_c_stddev = get_stddev(seg_c) seg_c_count = get_count(seg_c) seg_c_sum = get_sum(seg_c) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) seg_loud_max_avg = get_avg(seg_loud_max) seg_loud_max_max = get_max(seg_loud_max) seg_loud_max_min = get_min(seg_loud_max) seg_loud_max_stddev = get_stddev(seg_loud_max) seg_loud_max_count = get_count(seg_loud_max) seg_loud_max_sum = get_sum(seg_loud_max) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) seg_loud_max_time_avg = get_avg(seg_loud_max_time) seg_loud_max_time_max = get_max(seg_loud_max_time) seg_loud_max_time_min = get_min(seg_loud_max_time) seg_loud_max_time_stddev = get_stddev(seg_loud_max_time) seg_loud_max_time_count = get_count(seg_loud_max_time) seg_loud_max_time_sum = get_sum(seg_loud_max_time) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) seg_loud_start_avg = get_avg(seg_loud_start) seg_loud_start_max = get_max(seg_loud_start) seg_loud_start_min = get_min(seg_loud_start) seg_loud_start_stddev = get_stddev(seg_loud_start) seg_loud_start_count = get_count(seg_loud_start) seg_loud_start_sum = get_sum(seg_loud_start) seg_pitch = hdf5_getters.get_segments_pitches(h5) pitch_size = len(seg_pitch) seg_start = hdf5_getters.get_segments_start(h5) seg_start_avg = get_avg(seg_start) seg_start_max = get_max(seg_start) seg_start_min = get_min(seg_start) seg_start_stddev = get_stddev(seg_start) seg_start_count = get_count(seg_start) seg_start_sum = get_sum(seg_start) seg_timbre = hdf5_getters.get_segments_timbre(h5) tatms_c = hdf5_getters.get_tatums_confidence(h5) tatms_c_avg = get_avg(tatms_c) tatms_c_max = get_max(tatms_c) tatms_c_min = get_min(tatms_c) tatms_c_stddev = get_stddev(tatms_c) tatms_c_count = get_count(tatms_c) tatms_c_sum = get_sum(tatms_c) tatms_start = hdf5_getters.get_tatums_start(h5) tatms_start_avg = get_avg(tatms_start) tatms_start_max = get_max(tatms_start) tatms_start_min = get_min(tatms_start) tatms_start_stddev = get_stddev(tatms_start) tatms_start_count = get_count(tatms_start) tatms_start_sum = get_sum(tatms_start) #Getting the genres genre_set = 0 #flag to see if the genre has been set or not art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes = get_genre_indexes( trm_freq) #index of the highest freq final_genre = [] genres_so_far = [] for i in range(len(genre_indexes)): genre_tmp = get_genre( art_trm, genre_indexes[i] ) #genre that corresponds to the highest freq genres_so_far = genre_dict.get_genre_in_dict( genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set = 1 #genre was found in dictionary if genre_set == 1: col_num = [] for genre in final_genre: column = int( genre) #getting the column number of the genre col_num.append(column) genre_array = genre_columns(col_num) #genre array else: genre_array = genre_columns( -1) #the genre was not found in the dictionary transpose_pitch = seg_pitch.transpose( ) #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg = [] seg_pitch_max = [] seg_pitch_min = [] seg_pitch_stddev = [] seg_pitch_count = [] seg_pitch_sum = [] i = 0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i = i + 1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose( ) #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg = [] seg_timbre_max = [] seg_timbre_min = [] seg_timbre_stddev = [] seg_timbre_count = [] seg_timbre_sum = [] i = 0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i = i + 1 #Writing to the flat file writer.writerow([ title, album, artist_name, year, duration, seg_start_count, tempo ]) h5.close() count = count + 1 print count
def getData(starting_point): starting = starting_point * 10000 files = glob.glob('/mnt/snap/data/*/*/*/*.h5') file_one_round = files[starting:starting + 10000] artist_ids = [] song_beats_persecond = [] song_duration = [] song_end_fade_in = [] song_start_fade_out = [] song_key = [] song_loudness = [] song_segments_loudness_max = [] song_segments_loudness_min = [] song_segments_loudness_med = [] song_segments_loudness_time_max = [] song_segments_loudness_time_min = [] song_segments_loudness_time_med = [] song_mode = [] song_sections_start = [] song_pitches = [] song_timbre = [] song_tempo = [] song_time_signature = [] song_title = [] artist_name = [] year = [] idx = np.triu_indices(12) #count = 1 for f in file_one_round: h5 = HDF5.open_h5_file_read(f) songYear = g.get_year(h5) if songYear < 1990: continue artist_id = g.get_artist_id(h5) song_beat = (g.get_beats_start(h5)).tolist() songDuration = g.get_duration(h5) song_beat_persecond = float(len(song_beat)) / songDuration song_end_fadein = g.get_end_of_fade_in(h5) song_start_fadeout = g.get_start_of_fade_out(h5) songKey = g.get_key(h5) songLoudness = g.get_loudness(h5) song_loudness_max = (g.get_segments_loudness_max(h5)) // 10 song_loudness_antilog = np.power(10, song_loudness_max) song_segmentsLoudness_max = np.amax(song_loudness_antilog) song_segmentsLoudness_min = np.amin(song_loudness_antilog) song_segmentsLoudness_med = np.median(song_loudness_antilog) song_segmentsLoudness_max_time = ( g.get_segments_loudness_max_time(h5)).tolist() song_loudness_time = np.multiply(song_loudness_antilog, song_segmentsLoudness_max_time) song_segmentsLoudnessTime_max = np.amax(song_loudness_time) song_segmentsLoudnessTime_min = np.amin(song_loudness_time) song_segmentsLoudnessTime_med = np.median(song_loudness_time) songMode = g.get_mode(h5) song_sectionsStart = (g.get_sections_start(h5)).tolist() songPitches = g.get_segments_pitches(h5) songPitches_cov = np.cov(songPitches, rowvar=False) songPitches_mean = np.mean(songPitches, axis=0) #print(songPitches_cov.shape) songTimbre = g.get_segments_timbre(h5) songTimbre_cov = np.cov(songTimbre, rowvar=False) songTimbre_mean = np.mean(songTimbre, axis=0) #print(songTimbre_cov.shape) songTempo = g.get_tempo(h5) songTime_signature = g.get_time_signature(h5) songTitle = g.get_title(h5) artistName = g.get_artist_name(h5) artist_ids.append(artist_id) song_beats_persecond.append(song_beat_persecond) song_duration.append(songDuration) song_end_fade_in.append(song_end_fadein) song_start_fade_out.append(song_start_fadeout) song_key.append(songKey) song_loudness.append(songLoudness) song_segments_loudness_max.append(song_segmentsLoudness_max) song_segments_loudness_min.append(song_segmentsLoudness_min) song_segments_loudness_med.append(song_segmentsLoudness_med) song_segments_loudness_time_max.append(song_segmentsLoudnessTime_max) song_segments_loudness_time_min.append(song_segmentsLoudnessTime_min) song_segments_loudness_time_med.append(song_segmentsLoudnessTime_med) song_mode.append(songMode) song_sections_start.append(song_sectionsStart) pitches_mean_cov = (songPitches_cov[idx]).tolist() pitches_mean_cov.extend((songPitches_mean).tolist()) song_pitches.append(pitches_mean_cov) timbre_mean_cov = (songTimbre_cov[idx]).tolist() timbre_mean_cov.extend((songTimbre_mean).tolist()) song_timbre.append(timbre_mean_cov) song_tempo.append(songTempo) song_time_signature.append(songTime_signature) song_title.append(songTitle) artist_name.append(artistName) year.append(songYear) #print(count) #count = count + 1 h5.close() #def createDictsFrom2DArray(dictionary, colName, featureList): # for i in range(0,12): # dictionary[colName+str(i)] = featureList[i] #i = 1 #for t in itertools.izip_longest(*featureList): # dictionary[colName+str(i)] = t # i = i + 1 # return dictionary data = collections.OrderedDict() data['year'] = year data['artist_name'] = artist_name data['artist_id'] = artist_ids data['song_title'] = song_title data['song_beats_persecond'] = song_beats_persecond data['song_duration'] = song_duration data['song_end_fade_in'] = song_end_fade_in data['song_start_fade_out'] = song_start_fade_out data['song_key'] = song_key data['song_loudness'] = song_loudness data['song_loudness_max'] = song_segments_loudness_max data['song_loudness_min'] = song_segments_loudness_min data['song_loudness_med'] = song_segments_loudness_med data['song_loudness_time_max'] = song_segments_loudness_time_max data['song_loudness_time_min'] = song_segments_loudness_time_min data['song_loudness_time_med'] = song_segments_loudness_time_med data['song_mode'] = song_mode data['song_tempo'] = song_tempo data['song_time_signature'] = song_time_signature data = createDictsFrom1DArray(data, 'pitches', song_pitches) data = createDictsFrom1DArray(data, 'timbre', song_timbre) data = createDictsFrom1DArray(data, 'sections_start', song_sections_start) df = pd.DataFrame(data) print('before return ' + str(starting_point)) return df
# Get artist familiarity familiarity = hdf5_getters.get_artist_familiarity(h5) # Get danceability danceability = hdf5_getters.get_danceability(h5) # Get duration duration = hdf5_getters.get_duration(h5) # Get energy #*****useless... column is filled with 0's? energy = hdf5_getters.get_energy(h5) # Get loudness loudness = hdf5_getters.get_loudness(h5) # Get year year = hdf5_getters.get_year(h5) # Get tempo tempo = hdf5_getters.get_tempo(h5) ######################################################### # Get analysis sample rate analysis_rate = hdf5_getters.get_analysis_sample_rate(h5) # Get end of fade in end_of_fade_in = hdf5_getters.get_end_of_fade_in(h5)
def convert_to_csv(): i = 0 data = [] target = [] count = 0 with open('data_timbre.csv', 'w+') as f: with open('target_timbre.csv', 'w+') as f2: writer = csv.writer(f) target_writer = csv.writer(f2) for root, dirs, files in os.walk(msd_subset_data_path): files = glob.glob(os.path.join(root,'*.h5')) for f in sorted(files): try: # Opening is very prone to causing exceptions, we'll just skip file if exception is thrown h5 = getter.open_h5_file_read(f) year = getter.get_year(h5) if year: count +=1 analysis_file = open('current_analysis_status.txt','a') update = "Currently at file name: " + str(f) + " and at number " + str(count) + "\n" analysis_file.write(update) print update analysis_file.close() target.append([year]) row = [] timbre = getter.get_segments_timbre(h5) segstarts = getter.get_segments_start(h5) btstarts = getter.get_beats_start(h5) duration = getter.get_duration(h5) end_of_fade_in = getter.get_end_of_fade_in(h5) key = getter.get_key(h5) key_confidence = getter.get_key_confidence(h5) loudness = getter.get_loudness(h5) start_of_fade_out = getter.get_start_of_fade_out(h5) tempo = getter.get_tempo(h5) time_signature = getter.get_time_signature(h5) time_signature_confidence = getter.get_time_signature_confidence(h5) h5.close() # VERY IMPORTANT segstarts = np.array(segstarts).flatten() btstarts = np.array(btstarts).flatten() bttimbre = align_feats(timbre.T, segstarts, btstarts, duration, end_of_fade_in, key, key_confidence, loudness, start_of_fade_out, tempo, time_signature, time_signature_confidence) if bttimbre is None: continue # Skip this track, some features broken npicks, winsize, finaldim = 12, 12, 144 # Calculated by 12 * 12. 12 is fixed as number of dimensions. processed_feats = extract_and_compress(bttimbre, npicks, winsize, finaldim) n_p_feats = processed_feats.shape[0] if processed_feats is None: continue # Skip this track, some features broken row = processed_feats.flatten() if len(row) != 12*144: # 12 dimensions * 144 features per dimension continue # Not enough features year_row = np.array([year]) if row.any() and year_row.any(): writer.writerow(row) target_writer.writerow(year_row) i+=1 else: h5.close() except Exception: pass print 'Finished!' analysis_file = open('current_analysis_status.txt','a') analysis_file.write('Done!') analysis_file.close() return
def func_to_extract_features(filename): """ This function does 3 simple things: - open the song file - get artist ID and put it - close the file """ global cntnan global cntdanceability global listfeatures global listhotness global listyear global listloudness global listkey global listmode global listduration cf = [] h5 = GETTERS.open_h5_file_read(filename) nanfound = 0 #Get target feature: song hotness #FEATURE 0 song_hotness = GETTERS.get_song_hotttnesss(h5) if math.isnan(song_hotness): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_hotness) #FEATURE 1 #Get song loudness song_loudness = GETTERS.get_loudness(h5) if math.isnan(song_loudness): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_loudness) #FEATURE 2 #Get key of the song song_key = GETTERS.get_key(h5) if math.isnan(song_key): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_key) #FEATURE 3 #Get duration of the song song_duration = GETTERS.get_duration(h5) if math.isnan(song_duration): nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_duration) #FEATURE 4-15 #Get Average Pitch Class across all segments #Get the pitches (12 pitches histogram for each segment) pitches = GETTERS.get_segments_pitches(h5) M = np.mat(pitches) meanpitches = M.mean(axis=0) pitches_arr = np.asarray(meanpitches) pitches_list = [] for i in range(0,12): pitches_list.append(pitches_arr[0][i]) cf.append(pitches_list) #FEATURE 16, 27 #Get Average Timbre Class across all segments timbres = GETTERS.get_segments_timbre(h5) M = np.mat(timbres) meantimbres = M.mean(axis=0) timbre_arr = np.asarray(meantimbres) timbre_list = [] for i in range(0,12): timbre_list.append(timbre_arr[0][i]) cf.append(timbre_list) #FEATURE 28 #Get song year song_year = GETTERS.get_year(h5) if song_year == 0: nanfound = 1 cntnan = cntnan + 1 else: cf.append(song_year) #FEATURE 29 #Get song tempo song_tempo = GETTERS.get_tempo(h5) cf.append(song_tempo) #Feature 30 #Get max loudness for each segment max_loudness_arr = GETTERS.get_segments_loudness_max(h5) start_loudness_arr = GETTERS.get_segments_loudness_start(h5) if nanfound == 0: cf.append(max(max_loudness_arr)-min(start_loudness_arr)) #Feature 31 artist_familiarity = GETTERS.get_artist_familiarity(h5) cf.append(artist_familiarity) #Feature 32 song_title = GETTERS.get_title(h5) cf.append(song_title) #Featture 33 artist_name = GETTERS.get_artist_name(h5) cf.append(artist_name) #Feature 34 #location = GETTERS.get_artist_location(h5) #cf.append(location) #Tags artist_mbtags = GETTERS.get_artist_mbtags(h5) if not artist_mbtags.size: genre = "Unknown" else: artist_mbcount = np.array(GETTERS.get_artist_mbtags_count(h5)) index_max = artist_mbcount.argmax(axis=0) genre = artist_mbtags[index_max] if genre == 'espa\xc3\xb1ol': genre = "Unknown" cf.append(genre) if nanfound == 0: strlist = list_to_csv(cf) listfeatures.append(strlist) mydict.setdefault(artist_name,[]).append(song_hotness) h5.close()
def parse_aggregate_songs(file_name,file_name2,artist_map): """ Given an aggregate filename and artist_map in the format {artist_name: {data pertaining to artist}} """ """ TODO: -this function goes through each song, if artist not in there, add all data necesary and add first song info. else update any specific song info -song info is a map from attributename:[values] """ #artist_map = {} h5 = hdf5_getters.open_h5_file_read(file_name) numSongs = hdf5_getters.get_num_songs(h5) print 'Parsing song file...' for i in range(numSongs): artist_name = hdf5_getters.get_artist_name(h5,i) #Filter location longi = hdf5_getters.get_artist_longitude(h5,i) lat = hdf5_getters.get_artist_latitude(h5,i) loc = hdf5_getters.get_artist_location(h5,i) if math.isnan(lat) or math.isnan(longi): #skip if no location continue #filter year yr = hdf5_getters.get_year(h5,i) if yr == 0: #skip if no year continue #filter hotttness and familiarity familiarity = hdf5_getters.get_artist_familiarity(h5,i) hotttness = hdf5_getters.get_artist_hotttnesss(h5,i) if familiarity<=0.0 or hotttness<=0.0: #skip if no hotttness or familiarity computations continue #TODO:MAYBE filter on dance and energy timbre = hdf5_getters.get_segments_timbre(h5,i) #timbre[#] gives len 12 array so for each arr in timbre, add up to get segment and add to corresponding 12 features and avg across each if not artist_name in artist_map: #have not encountered the artist yet, so populate new map sub_map = {} sub_map['artist_familiarity'] = familiarity sub_map['artist_hotttnesss'] = hotttness sub_map['artist_id'] = hdf5_getters.get_artist_id(h5,i) #longi = hdf5_getters.get_artist_longitude(h5,i) #lat = hdf5_getters.get_artist_latitude(h5,i) #longi = None if math.isnan(longi) else longi #lat = None if math.isnan(lat) else lat sub_map['artist_latitude'] = lat sub_map['artist_longitude'] = longi sub_map['artist_location'] = loc sub_map['artist_terms'] = hdf5_getters.get_artist_terms(h5,i) #TODO:see if should weight by freq or weight for if the term matches one of the feature terms sub_map['artist_terms_freq'] = list(hdf5_getters.get_artist_terms_freq(h5,i)) sub_map['artist_terms_weight'] = list(hdf5_getters.get_artist_terms_weight(h5,i)) #song-sepcific data #TODO COMPUTE AN AVG TIMBRE FOR A SONG BY IDEA: #SUMMING DOWN EACH 12 VECTOR FOR EACH PT IN SONG AND AVG THIS ACROSS SONG dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy sub_map['danceability'] = [dance] sub_map['duration'] = [hdf5_getters.get_duration(h5,i)] sub_map['end_of_fade_in'] = [hdf5_getters.get_end_of_fade_in(h5,i)] sub_map['energy'] = [energy] #since each song has a key, ask if feature for keys should be num of songs that appear in that key or #just binary if any of their songs has that key or just be avg of songs with that key #same for mode, since its either major or minor...should it be count or avg.? sub_map['key'] = [hdf5_getters.get_key(h5,i)] sub_map['loudness'] = [hdf5_getters.get_loudness(h5,i)] sub_map['mode'] = [hdf5_getters.get_mode(h5,i)] #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot sub_map['song_hotttnesss'] = [s_hot] sub_map['start_of_fade_out'] = [hdf5_getters.get_start_of_fade_out(h5,i)] sub_map['tempo'] = [hdf5_getters.get_tempo(h5,i)] #should time signature be count as well? binary? sub_map['time_signature'] = [hdf5_getters.get_time_signature(h5,i)] sub_map['track_id'] = [hdf5_getters.get_track_id(h5,i)] #should year be binary since they can have many songs across years and should it be year:count sub_map['year'] = [yr] artist_map[artist_name] = sub_map else: #artist already exists, so get its map and update song fields dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy artist_map[artist_name]['danceability'].append(dance) artist_map[artist_name]['duration'].append(hdf5_getters.get_duration(h5,i)) artist_map[artist_name]['end_of_fade_in'].append(hdf5_getters.get_end_of_fade_in(h5,i)) artist_map[artist_name]['energy'].append(energy) artist_map[artist_name]['key'].append(hdf5_getters.get_key(h5,i)) artist_map[artist_name]['loudness'].append(hdf5_getters.get_loudness(h5,i)) artist_map[artist_name]['mode'].append(hdf5_getters.get_mode(h5,i)) #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot artist_map[artist_name]['song_hotttnesss'].append(s_hot) artist_map[artist_name]['start_of_fade_out'].append(hdf5_getters.get_start_of_fade_out(h5,i)) artist_map[artist_name]['tempo'].append(hdf5_getters.get_tempo(h5,i)) #should time signature be count as well? binary? artist_map[artist_name]['time_signature'].append(hdf5_getters.get_time_signature(h5,i)) artist_map[artist_name]['track_id'].append(hdf5_getters.get_track_id(h5,i)) #should year be binary since they can have many songs across years and should it be year:count artist_map[artist_name]['year'].append(yr) h5 = hdf5_getters.open_h5_file_read(file_name2) numSongs = hdf5_getters.get_num_songs(h5) print 'Parsing song file2...' for i in range(numSongs): song_id = hdf5_getters.get_track_id(h5,i) artist_name = hdf5_getters.get_artist_name(h5,i) if artist_name in artist_map and song_id in artist_map[artist_name]['track_id']: continue #Filter location longi = hdf5_getters.get_artist_longitude(h5,i) lat = hdf5_getters.get_artist_latitude(h5,i) loc = hdf5_getters.get_artist_location(h5,i) if math.isnan(lat) or math.isnan(longi): #skip if no location continue #filter year yr = hdf5_getters.get_year(h5,i) if yr == 0: #skip if no year continue #filter hotttness and familiarity familiarity = hdf5_getters.get_artist_familiarity(h5,i) hotttness = hdf5_getters.get_artist_hotttnesss(h5,i) if familiarity<=0.0 or hotttness<=0.0: #skip if no hotttness or familiarity computations continue #TODO:MAYBE filter on dance and energy timbre = hdf5_getters.get_segments_timbre(h5,i) #timbre[#] gives len 12 array so for each arr in timbre, add up to get segment and add to corresponding 12 features and avg across each if not artist_name in artist_map: #have not encountered the artist yet, so populate new map sub_map = {} sub_map['artist_familiarity'] = familiarity sub_map['artist_hotttnesss'] = hotttness sub_map['artist_id'] = hdf5_getters.get_artist_id(h5,i) #longi = hdf5_getters.get_artist_longitude(h5,i) #lat = hdf5_getters.get_artist_latitude(h5,i) #longi = None if math.isnan(longi) else longi #lat = None if math.isnan(lat) else lat sub_map['artist_latitude'] = lat sub_map['artist_longitude'] = longi sub_map['artist_location'] = loc sub_map['artist_terms'] = hdf5_getters.get_artist_terms(h5,i) #TODO:see if should weight by freq or weight for if the term matches one of the feature terms sub_map['artist_terms_freq'] = list(hdf5_getters.get_artist_terms_freq(h5,i)) sub_map['artist_terms_weight'] = list(hdf5_getters.get_artist_terms_weight(h5,i)) #song-sepcific data #TODO COMPUTE AN AVG TIMBRE FOR A SONG BY IDEA: #SUMMING DOWN EACH 12 VECTOR FOR EACH PT IN SONG AND AVG THIS ACROSS SONG dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy sub_map['danceability'] = [dance] sub_map['duration'] = [hdf5_getters.get_duration(h5,i)] sub_map['end_of_fade_in'] = [hdf5_getters.get_end_of_fade_in(h5,i)] sub_map['energy'] = [energy] #since each song has a key, ask if feature for keys should be num of songs that appear in that key or #just binary if any of their songs has that key or just be avg of songs with that key #same for mode, since its either major or minor...should it be count or avg.? sub_map['key'] = [hdf5_getters.get_key(h5,i)] sub_map['loudness'] = [hdf5_getters.get_loudness(h5,i)] sub_map['mode'] = [hdf5_getters.get_mode(h5,i)] #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot sub_map['song_hotttnesss'] = [s_hot] sub_map['start_of_fade_out'] = [hdf5_getters.get_start_of_fade_out(h5,i)] sub_map['tempo'] = [hdf5_getters.get_tempo(h5,i)] #should time signature be count as well? binary? sub_map['time_signature'] = [hdf5_getters.get_time_signature(h5,i)] sub_map['track_id'] = [hdf5_getters.get_track_id(h5,i)] #should year be binary since they can have many songs across years and should it be year:count sub_map['year'] = [yr] artist_map[artist_name] = sub_map else: #artist already exists, so get its map and update song fields dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy artist_map[artist_name]['danceability'].append(dance) artist_map[artist_name]['duration'].append(hdf5_getters.get_duration(h5,i)) artist_map[artist_name]['end_of_fade_in'].append(hdf5_getters.get_end_of_fade_in(h5,i)) artist_map[artist_name]['energy'].append(energy) artist_map[artist_name]['key'].append(hdf5_getters.get_key(h5,i)) artist_map[artist_name]['loudness'].append(hdf5_getters.get_loudness(h5,i)) artist_map[artist_name]['mode'].append(hdf5_getters.get_mode(h5,i)) #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot artist_map[artist_name]['song_hotttnesss'].append(s_hot) artist_map[artist_name]['start_of_fade_out'].append(hdf5_getters.get_start_of_fade_out(h5,i)) artist_map[artist_name]['tempo'].append(hdf5_getters.get_tempo(h5,i)) #should time signature be count as well? binary? artist_map[artist_name]['time_signature'].append(hdf5_getters.get_time_signature(h5,i)) artist_map[artist_name]['track_id'].append(hdf5_getters.get_track_id(h5,i)) #should year be binary since they can have many songs across years and should it be year:count artist_map[artist_name]['year'].append(yr)